* [PATCH 1/5] t/config.t: test PublicInbox::Git sharing between inboxes
2019-01-31 4:27 [PATCH 0/5] a few more solver fixups and improvements Eric Wong
@ 2019-01-31 4:27 ` Eric Wong
2019-01-31 4:27 ` [PATCH 2/5] inbox: perform cleanup of Git objects for coderepos Eric Wong
` (3 subsequent siblings)
4 siblings, 0 replies; 6+ messages in thread
From: Eric Wong @ 2019-01-31 4:27 UTC (permalink / raw)
To: meta
We need to ensure we don't introduce unnecessary processes
and memory usage for mapping multiple inboxes to the same
code repos.
---
t/config.t | 19 +++++++++++++++++++
1 file changed, 19 insertions(+)
diff --git a/t/config.t b/t/config.t
index 7531fd7..ad738bd 100644
--- a/t/config.t
+++ b/t/config.t
@@ -169,4 +169,23 @@ for my $s (@valid) {
is_deeply(\@result, \@expect);
}
+{
+ my $pfx1 = "publicinbox.test1";
+ my $pfx2 = "publicinbox.test2";
+ my $h = {
+ "$pfx1.address" => 'test@example.com',
+ "$pfx1.mainrepo" => '/path/to/non/existent',
+ "$pfx2.address" => 'foo@example.com',
+ "$pfx2.mainrepo" => '/path/to/foo',
+ "$pfx1.coderepo" => 'project',
+ "$pfx2.coderepo" => 'project',
+ "coderepo.project.dir" => '/path/to/project.git',
+ };
+ my $cfg = PublicInbox::Config->new($h);
+ my $t1 = $cfg->lookup_name('test1');
+ my $t2 = $cfg->lookup_name('test2');
+ is($t1->{-repo_objs}->[0], $t2->{-repo_objs}->[0],
+ 'inboxes share ::Git object');
+}
+
done_testing();
--
EW
^ permalink raw reply related [flat|nested] 6+ messages in thread
* [PATCH 2/5] inbox: perform cleanup of Git objects for coderepos
2019-01-31 4:27 [PATCH 0/5] a few more solver fixups and improvements Eric Wong
2019-01-31 4:27 ` [PATCH 1/5] t/config.t: test PublicInbox::Git sharing between inboxes Eric Wong
@ 2019-01-31 4:27 ` Eric Wong
2019-01-31 4:27 ` [PATCH 3/5] solvergit: allow searching on longer-than-needed OIDs Eric Wong
` (2 subsequent siblings)
4 siblings, 0 replies; 6+ messages in thread
From: Eric Wong @ 2019-01-31 4:27 UTC (permalink / raw)
To: meta
Otherwise, long-running but idle git processes may keep unlinked
packs around indefinitely and waste disk space.
---
lib/PublicInbox/Git.pm | 18 ++++++++++++++----
lib/PublicInbox/Inbox.pm | 17 +++++++++++++++--
t/git.t | 4 ++++
3 files changed, 33 insertions(+), 6 deletions(-)
diff --git a/lib/PublicInbox/Git.pm b/lib/PublicInbox/Git.pm
index e844884..a756684 100644
--- a/lib/PublicInbox/Git.pm
+++ b/lib/PublicInbox/Git.pm
@@ -206,7 +206,15 @@ sub check {
}
sub _destroy {
- my ($self, $in, $out, $pid) = @_;
+ my ($self, $in, $out, $pid, $expire) = @_;
+ my $rfh = $self->{$in} or return;
+ if (defined $expire) {
+ # at least FreeBSD 11.2 and Linux 4.20 update mtime of the
+ # read end of a pipe when the pipe is written to; dunno
+ # about other OSes.
+ my $mtime = (stat($rfh))[9];
+ return if $mtime > $expire;
+ }
my $p = delete $self->{$pid} or return;
foreach my $f ($in, $out) {
delete $self->{$f};
@@ -236,10 +244,12 @@ sub qx {
<$fh>
}
+# returns true if there are pending "git cat-file" processes
sub cleanup {
- my ($self) = @_;
- _destroy($self, qw(in out pid));
- _destroy($self, qw(in_c out_c pid_c));
+ my ($self, $expire) = @_;
+ _destroy($self, qw(in out pid), $expire);
+ _destroy($self, qw(in_c out_c pid_c), $expire);
+ !!($self->{pid} || $self->{pid_c});
}
# assuming a well-maintained repo, this should be a somewhat
diff --git a/lib/PublicInbox/Inbox.pm b/lib/PublicInbox/Inbox.pm
index d57e46d..6fe896f 100644
--- a/lib/PublicInbox/Inbox.pm
+++ b/lib/PublicInbox/Inbox.pm
@@ -22,12 +22,25 @@ my $cleanup_broken = $@;
my $CLEANUP = {}; # string(inbox) -> inbox
sub cleanup_task () {
$cleanup_timer = undef;
+ my $next = {};
for my $ibx (values %$CLEANUP) {
- foreach my $f (qw(git mm search)) {
+ my $again;
+ foreach my $f (qw(mm search)) {
delete $ibx->{$f} if SvREFCNT($ibx->{$f}) == 1;
}
+ my $expire = time - 60;
+ if (my $git = $ibx->{git}) {
+ $again = $git->cleanup($expire);
+ }
+ if (my $gits = $ibx->{-repo_objs}) {
+ foreach my $git (@$gits) {
+ $again = 1 if $git->cleanup($expire);
+ }
+ }
+ $again ||= !!($ibx->{mm} || $ibx->{search});
+ $next->{"$ibx"} = $ibx if $again;
}
- $CLEANUP = {};
+ $CLEANUP = $next;
}
sub _cleanup_later ($) {
diff --git a/t/git.t b/t/git.t
index 9c80fbb..d637e63 100644
--- a/t/git.t
+++ b/t/git.t
@@ -142,6 +142,10 @@ if ('alternates reloaded') {
open $fh, '<', "$alt/config" or die "open failed: $!\n";
my $config = eval { local $/; <$fh> };
is($$found, $config, 'alternates reloaded');
+
+ ok($gcf->cleanup(time - 30), 'cleanup did not expire');
+ ok(!$gcf->cleanup(time + 30), 'cleanup can expire');
+ ok(!$gcf->cleanup, 'cleanup idempotent');
}
use_ok 'PublicInbox::Git', qw(git_unquote git_quote);
--
EW
^ permalink raw reply related [flat|nested] 6+ messages in thread
* [PATCH 3/5] solvergit: allow searching on longer-than-needed OIDs
2019-01-31 4:27 [PATCH 0/5] a few more solver fixups and improvements Eric Wong
2019-01-31 4:27 ` [PATCH 1/5] t/config.t: test PublicInbox::Git sharing between inboxes Eric Wong
2019-01-31 4:27 ` [PATCH 2/5] inbox: perform cleanup of Git objects for coderepos Eric Wong
@ 2019-01-31 4:27 ` Eric Wong
2019-01-31 4:27 ` [PATCH 4/5] solvergit: allow shorter-than-necessary OIDs from user Eric Wong
2019-01-31 4:27 ` [PATCH 5/5] viewvcs: support streaming large blobs Eric Wong
4 siblings, 0 replies; 6+ messages in thread
From: Eric Wong @ 2019-01-31 4:27 UTC (permalink / raw)
To: meta
public-inbox can only index the abbreviated object_ids in
emails, not the full or even longer-than-necessary object_ids.
So retry failed object_ids if they're longer than 7 hex
characters.
---
lib/PublicInbox/SolverGit.pm | 17 ++++++++++++++++-
t/solver_git.t | 9 +++++++++
2 files changed, 25 insertions(+), 1 deletion(-)
diff --git a/lib/PublicInbox/SolverGit.pm b/lib/PublicInbox/SolverGit.pm
index c502526..97da795 100644
--- a/lib/PublicInbox/SolverGit.pm
+++ b/lib/PublicInbox/SolverGit.pm
@@ -23,6 +23,7 @@ use URI::Escape qw(uri_escape_utf8);
# headroom into this.
use POSIX qw(sysconf _SC_ARG_MAX);
my $ARG_SIZE_MAX = (sysconf(_SC_ARG_MAX) || 4096) - 2048;
+my $OID_MIN = 7;
# By default, "git format-patch" generates filenames with a four-digit
# prefix, so that means 9999 patch series are OK, right? :>
@@ -353,7 +354,13 @@ sub next_step ($) {
sub mark_found ($$$) {
my ($self, $oid, $found_info) = @_;
- $self->{found}->{$oid} = $found_info;
+ my $found = $self->{found};
+ $found->{$oid} = $found_info;
+ my $oid_cur = $found_info->[1];
+ while ($oid_cur ne $oid && length($oid_cur) > $OID_MIN) {
+ $found->{$oid_cur} = $found_info;
+ chop($oid_cur);
+ }
}
sub parse_ls_files ($$$$) {
@@ -485,6 +492,14 @@ sub resolve_patch ($$) {
}
return next_step($self); # onto the next todo item
}
+ if (length($cur_want) > $OID_MIN) {
+ chop($cur_want);
+ dbg($self, "retrying $want->{oid_b} as $cur_want");
+ $want->{oid_b} = $cur_want;
+ push @{$self->{todo}}, $want;
+ return next_step($self); # retry with shorter abbrev
+ }
+
dbg($self, "could not find $cur_want");
eval { delete($self->{user_cb})->(undef) }; # not found! :<
die "E: $@" if $@;
diff --git a/t/solver_git.t b/t/solver_git.t
index 197a003..66e6317 100644
--- a/t/solver_git.t
+++ b/t/solver_git.t
@@ -64,6 +64,15 @@ if (0) { # TODO: check this?
diag $z;
}
+my $oid = $expect;
+for my $i (1..2) {
+ my $more;
+ my $s = PublicInbox::SolverGit->new($ibx, sub { $more = $_[0] });
+ $s->solve($psgi_env, $log, $oid, {});
+ is($more->[1], $expect, 'resolved blob to long OID '.$i);
+ chop($oid);
+}
+
$solver = undef;
$res = undef;
my $wt_git_dir = $wt_git->{git_dir};
--
EW
^ permalink raw reply related [flat|nested] 6+ messages in thread
* [PATCH 4/5] solvergit: allow shorter-than-necessary OIDs from user
2019-01-31 4:27 [PATCH 0/5] a few more solver fixups and improvements Eric Wong
` (2 preceding siblings ...)
2019-01-31 4:27 ` [PATCH 3/5] solvergit: allow searching on longer-than-needed OIDs Eric Wong
@ 2019-01-31 4:27 ` Eric Wong
2019-01-31 4:27 ` [PATCH 5/5] viewvcs: support streaming large blobs Eric Wong
4 siblings, 0 replies; 6+ messages in thread
From: Eric Wong @ 2019-01-31 4:27 UTC (permalink / raw)
To: meta
We can rely on git to disambiguate here, because sometimes
shorter OIDs can be unambiguous even if we only resolved the
longer one.
---
lib/PublicInbox/SolverGit.pm | 24 ++++++++++++++++++++++--
1 file changed, 22 insertions(+), 2 deletions(-)
diff --git a/lib/PublicInbox/SolverGit.pm b/lib/PublicInbox/SolverGit.pm
index 97da795..a13ae9e 100644
--- a/lib/PublicInbox/SolverGit.pm
+++ b/lib/PublicInbox/SolverGit.pm
@@ -302,6 +302,26 @@ sub extract_old_mode ($) {
'100644';
}
+sub do_finish ($$) {
+ my ($self, $user_cb) = @_;
+ my $found = $self->{found};
+ my $oid_want = $self->{oid_want};
+ if (my $exists = $found->{$oid_want}) {
+ return $user_cb->($exists);
+ }
+
+ # let git disambiguate if oid_want was too short,
+ # but long enough to be unambiguous:
+ my $tmp_git = $self->{tmp_git};
+ if (my @res = $tmp_git->check($oid_want)) {
+ return $user_cb->($found->{$res[0]});
+ }
+ if (my $err = $tmp_git->last_check_err) {
+ dbg($self, $err);
+ }
+ $user_cb->(undef);
+}
+
sub do_step ($) {
my ($self) = @_;
eval {
@@ -323,8 +343,8 @@ sub do_step ($) {
# our result: (which may be undef)
# Other steps may call user_cb to terminate prematurely
# on error
- } elsif (my $ucb = delete($self->{user_cb})) {
- $ucb->($self->{found}->{$self->{oid_want}});
+ } elsif (my $user_cb = delete($self->{user_cb})) {
+ do_finish($self, $user_cb);
} else {
die 'about to call user_cb twice'; # Oops :x
}
--
EW
^ permalink raw reply related [flat|nested] 6+ messages in thread
* [PATCH 5/5] viewvcs: support streaming large blobs
2019-01-31 4:27 [PATCH 0/5] a few more solver fixups and improvements Eric Wong
` (3 preceding siblings ...)
2019-01-31 4:27 ` [PATCH 4/5] solvergit: allow shorter-than-necessary OIDs from user Eric Wong
@ 2019-01-31 4:27 ` Eric Wong
4 siblings, 0 replies; 6+ messages in thread
From: Eric Wong @ 2019-01-31 4:27 UTC (permalink / raw)
To: meta
Forking off git-cat-file here for streaming large blobs is
reasonably efficient, at least no worse than using
git-http-backend for serving clones. So let our limiter
framework deal with it.
git itself isn't great for large files, and AFAIK there are no
stable/widely-available mechanisms for reading smaller chunks
of giant blobs in git itself.
Tested with some giant GPU headers in the Linux kernel.
---
lib/PublicInbox/ViewVCS.pm | 37 +++++++++++++++++++++++++++++++++----
1 file changed, 33 insertions(+), 4 deletions(-)
diff --git a/lib/PublicInbox/ViewVCS.pm b/lib/PublicInbox/ViewVCS.pm
index 85edf22..63731e9 100644
--- a/lib/PublicInbox/ViewVCS.pm
+++ b/lib/PublicInbox/ViewVCS.pm
@@ -34,6 +34,7 @@ END { $hl = undef };
my %QP_MAP = ( A => 'oid_a', B => 'oid_b', a => 'path_a', b => 'path_b' );
my $max_size = 1024 * 1024; # TODO: configurable
my $enc_utf8 = find_encoding('UTF-8');
+my $BIN_DETECT = 8000; # same as git
sub html_page ($$$) {
my ($ctx, $code, $strref) = @_;
@@ -43,7 +44,33 @@ sub html_page ($$$) {
my ($nr, undef) = @_;
$nr == 1 ? $$strref : undef;
});
- $wcb->($res);
+ $wcb ? $wcb->($res) : $res;
+}
+
+sub stream_large_blob ($$$$) {
+ my ($ctx, $res, $logref, $fn) = @_;
+ my ($git, $oid, $type, $size, $di) = @$res;
+ my $cmd = ['git', "--git-dir=$git->{git_dir}", 'cat-file', $type, $oid];
+ my $qsp = PublicInbox::Qspawn->new($cmd);
+ my @cl = ('Content-Length', $size);
+ my $env = $ctx->{env};
+ $env->{'qspawn.response'} = delete $ctx->{-wcb};
+ $qsp->psgi_return($env, undef, sub {
+ my ($r, $bref) = @_;
+ if (!defined $r) { # error
+ html_page($ctx, 500, $logref);
+ } elsif (index($$bref, "\0") >= 0) {
+ my $ct = 'application/octet-stream';
+ [200, ['Content-Type', $ct, @cl ] ];
+ } else {
+ my $n = bytes::length($$bref);
+ if ($n >= $BIN_DETECT || $n == $size) {
+ my $ct = 'text/plain; charset=UTF-8';
+ return [200, ['Content-Type', $ct, @cl] ];
+ }
+ undef; # bref keeps growing
+ }
+ });
}
sub solve_result {
@@ -65,9 +92,13 @@ sub solve_result {
$ref eq 'ARRAY' or return html_page($ctx, 500, \$log);
my ($git, $oid, $type, $size, $di) = @$res;
+ my $path = to_filename($di->{path_b} || $hints->{path_b} || 'blob');
+ my $raw_link = "(<a\nhref=$path>raw</a>)";
if ($size > $max_size) {
+ return stream_large_blob($ctx, $res, \$log, $fn) if defined $fn;
# TODO: stream the raw file if it's gigantic, at least
- $log = '<pre><b>Too big to show</b></pre>' . $log;
+ $log = "<pre><b>Too big to show, download available</b>\n" .
+ "$oid $type $size bytes $raw_link</pre>" . $log;
return html_page($ctx, 500, \$log);
}
@@ -86,8 +117,6 @@ sub solve_result {
return delete($ctx->{-wcb})->([200, $h, [ $$blob ]]);
}
- my $path = to_filename($di->{path_b} || $hints->{path_b} || 'blob');
- my $raw_link = "(<a\nhref=$path>raw</a>)";
if ($binary) {
$log = "<pre>$oid $type $size bytes (binary)" .
" $raw_link</pre>" . $log;
--
EW
^ permalink raw reply related [flat|nested] 6+ messages in thread