From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 67F711FA18 for ; Sat, 19 Sep 2020 09:37:15 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 7/7] gcf2: wire up read-only daemons and rm -gcf2 script Date: Sat, 19 Sep 2020 09:37:14 +0000 Message-Id: <20200919093714.21776-8-e@80x24.org> In-Reply-To: <20200919093714.21776-1-e@80x24.org> References: <20200919093714.21776-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: It seems easiest to have a singleton Gcf2Client object per daemon worker for all inboxes to use. This reduces overall FD usage from pipes. The `public-inbox-gcf2' command + manpage are gone and a `$^X' one-liner is used instead. This saves inodes for internal commands and hopefully makes it easier to avoid mismatched PERL5LIB include paths (as noticed during development :x). We'll also make the existing cat-file process management infrastructure more resilient to BOFHs on process killing sprees (or in case our libgit2-based code fails on us). (Rare) PublicInbox::WWW PSGI users NOT using public-inbox-httpd won't automatically benefit from this change, and extra configuration will be required (to be documented later). 
--- Documentation/public-inbox-gcf2.pod | 63 ----------------------------- MANIFEST | 2 - Makefile.PL | 5 --- lib/PublicInbox/Daemon.pm | 11 +++++ lib/PublicInbox/Gcf2.pm | 36 ++++++++++++++++- lib/PublicInbox/Gcf2Client.pm | 59 +++++++++++++++++++++------ lib/PublicInbox/Git.pm | 31 +++++++++----- lib/PublicInbox/GitAsyncCat.pm | 55 +++++++++++++++++++++++-- lib/PublicInbox/IMAP.pm | 2 +- script/public-inbox-gcf2 | 31 -------------- script/public-inbox-httpd | 1 + t/gcf2_client.t | 14 ++++--- 12 files changed, 172 insertions(+), 138 deletions(-) delete mode 100644 Documentation/public-inbox-gcf2.pod delete mode 100755 script/public-inbox-gcf2 diff --git a/Documentation/public-inbox-gcf2.pod b/Documentation/public-inbox-gcf2.pod deleted file mode 100644 index 813fbe7f..00000000 --- a/Documentation/public-inbox-gcf2.pod +++ /dev/null @@ -1,63 +0,0 @@ -=head1 NAME - -public-inbox-gcf2 - internal libgit2-based blob retriever - -=head1 SYNOPSIS - - This is an internal command used by public-inbox. - It may change unrecognizably or cease to exist at some point - -=head1 DESCRIPTION - -public-inbox-gcf2 is an optional internal process used by -public-inbox daemons for read-only access to underlying git -repositories. - -Users are NOT expected to run public-inbox-gcf2 on their own. -It replaces multiple C processes by treating -any git repos it knows about as alternates. - -None of its behaviors are stable and it is ALL subject to change -at any time. - -Any lines written to its standard input prefixed with a C -are interpreted as a git directory. That git directory -will be suffixed with "/objects" and treated as an alternate. -It writes nothing to stdout in this case. - -Otherwise it behaves like C, but only accepts -unabbreviated hexadecimal object IDs in its standard input. -Its output format is identical to C. It -only works for L inboxes and v1 -inboxes indexed by L. 
- -=head1 OPTIONS - -=head1 ENVIRONMENT - -=over 8 - -=item PERL_INLINE_DIRECTORY - -This must be set unless C<~/.cache/public-inbox/inline-c> -exists. C uses L and libgit2 -and compiles a small shim on its first run. - -=back - -=head1 CONTACT - -Feedback welcome via plain-text mail to L - -The mail archives are hosted at L -and L - -=head1 COPYRIGHT - -Copyright 2020 all contributors L - -License: AGPL-3.0+ L - -=head1 SEE ALSO - -L diff --git a/MANIFEST b/MANIFEST index 91457dab..67615d4d 100644 --- a/MANIFEST +++ b/MANIFEST @@ -26,7 +26,6 @@ Documentation/public-inbox-config.pod Documentation/public-inbox-convert.pod Documentation/public-inbox-daemon.pod Documentation/public-inbox-edit.pod -Documentation/public-inbox-gcf2.pod Documentation/public-inbox-httpd.pod Documentation/public-inbox-imapd.pod Documentation/public-inbox-index.pod @@ -223,7 +222,6 @@ sa_config/user/.spamassassin/user_prefs script/public-inbox-compact script/public-inbox-convert script/public-inbox-edit -script/public-inbox-gcf2 script/public-inbox-httpd script/public-inbox-imapd script/public-inbox-index diff --git a/Makefile.PL b/Makefile.PL index 5a268362..3fe9acf8 100644 --- a/Makefile.PL +++ b/Makefile.PL @@ -71,11 +71,6 @@ $v->{gz_docs} = [ map { "$_.gz" } (@{$v->{docs}},@{$v->{docs_html}}) ]; $v->{rsync_docs} = [ @{$v->{gz_docs}}, @{$v->{docs}}, @{$v->{docs_html}}, qw(NEWS.atom NEWS.atom.gz)]; -# filter out public-inbox-gcf2 from the website, it's an internal command -for my $var (qw(gz_docs rsync_docs)) { - @{$v->{$var}} = grep(!/-gcf2/, @{$v->{$var}}); -} - # external manpages which we host ourselves, since some packages # (currently just Xapian) doesn't host manpages themselves. 
my @xman = qw(copydatabase.1 xapian-compact.1); diff --git a/lib/PublicInbox/Daemon.pm b/lib/PublicInbox/Daemon.pm index b929ec2a..1520f8f2 100644 --- a/lib/PublicInbox/Daemon.pm +++ b/lib/PublicInbox/Daemon.pm @@ -19,6 +19,7 @@ use PublicInbox::Syscall qw($SFD_NONBLOCK); require PublicInbox::Listener; use PublicInbox::EOFpipe; use PublicInbox::Sigfd; +use PublicInbox::GitAsyncCat; my @CMD; my ($set_user, $oldset); my (@cfg_listen, $stdout, $stderr, $group, $user, $pid_file, $daemonize); @@ -652,6 +653,16 @@ sub run ($$$;$) { daemon_prepare($default); my $af_default = $default =~ /:8080\z/ ? 'httpready' : undef; my $for_destroy = daemonize(); + + # this wastes a bit of memory for non-PublicInbox::WWW -httpd users + # oh well... + eval { + require PublicInbox::Gcf2; + require PublicInbox::Gcf2Client; + }; + local $PublicInbox::GitAsyncCat::GCF2C = + PublicInbox::Gcf2Client::new() if !$@; + daemon_loop($refresh, $post_accept, $tlsd, $af_default); PublicInbox::DS->Reset; # ->DESTROY runs when $for_destroy goes out-of-scope diff --git a/lib/PublicInbox/Gcf2.pm b/lib/PublicInbox/Gcf2.pm index fe76b1fd..7983c841 100644 --- a/lib/PublicInbox/Gcf2.pm +++ b/lib/PublicInbox/Gcf2.pm @@ -1,12 +1,13 @@ # Copyright (C) 2020 all contributors # License: AGPL-3.0+ -# backend for public-inbox-gcf2(1) (git-cat-file based on libgit2, -# other libgit2 stuff may go here, too) +# backend for a git-cat-file-workalike based on libgit2, +# other libgit2 stuff may go here, too. 
package PublicInbox::Gcf2; use strict; use PublicInbox::Spawn qw(which popen_rd); use Fcntl qw(LOCK_EX); +use IO::Handle; # autoflush my (%CFG, $c_src, $lockfh); BEGIN { # PublicInbox::Spawn will set PERL_INLINE_DIRECTORY @@ -54,4 +55,35 @@ use Inline C => $c_src; undef $c_src; undef %CFG; undef $lockfh; + +# Usage: $^X -MPublicInbox::Gcf2 -e 'PublicInbox::Gcf2::loop()' +# (see lib/PublicInbox/Gcf2Client.pm) +sub loop { + my $gcf2 = new(); + STDERR->autoflush(1); + STDOUT->autoflush(1); + + while () { + chomp; + my ($oid, $git_dir) = split(/ /, $_, 2); + $gcf2->add_alternate("$git_dir/objects"); + if (!$gcf2->cat_oid(1, $oid)) { + # retry once if missing. We only get unabbreviated OIDs + # from SQLite or Xapian DBs, here, so malicious clients + # can't trigger excessive retries: + warn "I: $$ $oid missing, retrying in $git_dir\n"; + + $gcf2 = new(); + $gcf2->add_alternate("$git_dir/objects"); + + if ($gcf2->cat_oid(1, $oid)) { + warn "I: $$ $oid found after retry\n"; + } else { + warn "W: $$ $oid missing after retry\n"; + print "$oid missing\n"; # mimic git-cat-file + } + } + } +} + 1; diff --git a/lib/PublicInbox/Gcf2Client.pm b/lib/PublicInbox/Gcf2Client.pm index 30f85c71..42ff1bf3 100644 --- a/lib/PublicInbox/Gcf2Client.pm +++ b/lib/PublicInbox/Gcf2Client.pm @@ -1,29 +1,62 @@ # Copyright (C) 2020 all contributors # License: AGPL-3.0+ -# connects public-inbox processes to public-inbox-gcf2(1) +# connects public-inbox processes to PublicInbox::Gcf2::loop() package PublicInbox::Gcf2Client; use strict; -use parent 'PublicInbox::Git'; +use parent qw(PublicInbox::DS); +use PublicInbox::Git; use PublicInbox::Spawn qw(popen_rd); use IO::Handle (); +use PublicInbox::Syscall qw(EPOLLONESHOT EPOLLOUT); +# fields: +# async_cat => GitAsyncCat ref (read-only pipe) +# sock => writable pipe to Gcf2::loop -sub new { - my ($rdr) = @_; - my $self = bless {}, __PACKAGE__; +sub new { bless($_[0] // {}, __PACKAGE__) } + +sub gcf2c_begin ($) { + my ($self) = @_; + # ensure the child 
process has the same @INC we do: + my $env = { PERL5LIB => join(':', @INC) }; my ($out_r, $out_w); - pipe($out_r, $out_w) or $self->fail("pipe failed: $!"); - $rdr //= {}; - $rdr->{0} = $out_r; - @$self{qw(in pid)} = popen_rd(['public-inbox-gcf2'], undef, $rdr); - $self->{inflight} = []; - $self->{out} = $out_w; + pipe($out_r, $out_w) or die "pipe failed: $!"; + my $rdr = { 0 => $out_r, 2 => $self->{2} }; + my $cmd = [$^X, qw[-MPublicInbox::Gcf2 -e PublicInbox::Gcf2::loop()]]; + @$self{qw(in pid)} = popen_rd($cmd, $env, $rdr); fcntl($out_w, 1031, 4096) if $^O eq 'linux'; # 1031: F_SETPIPE_SZ $out_w->autoflush(1); - $self; + $out_w->blocking(0); + $self->SUPER::new($out_w, 0); # EPOLL_CTL_ADD (a bit wasteful :x) + $self->{inflight} = []; +} + +sub fail { + my $self = shift; + $self->close; # PublicInbox::DS::close + PublicInbox::Git::fail($self, @_); +} + +sub cat_async ($$$;$) { + my ($self, $req, $cb, $arg) = @_; + my $inflight = $self->{inflight} // gcf2c_begin($self); + + # rare, I hope: + cat_async_step($self, $inflight) if $self->{wbuf}; + + $self->write(\"$req\n") or $self->fail("gcf2c write: $!"); + push @$inflight, $req, $cb, $arg; } -# always false, since -gcf2 retries internally +# ensure PublicInbox::Git::cat_async_step never calls cat_async_retry sub alternates_changed {} +no warnings 'once'; + +# this is the write-only end of a pipe, DS->EventLoop will call this +*event_step = \&PublicInbox::DS::flush_write; + +# used by GitAsyncCat +*cat_async_step = \&PublicInbox::Git::cat_async_step; + 1; diff --git a/lib/PublicInbox/Git.pm b/lib/PublicInbox/Git.pm index 6bb82b6b..2323cecc 100644 --- a/lib/PublicInbox/Git.pm +++ b/lib/PublicInbox/Git.pm @@ -185,11 +185,12 @@ sub cat_async_step ($$) { my $rbuf = delete($self->{cat_rbuf}) // \(my $new = ''); my ($bref, $oid, $type, $size); my $head = my_readline($self->{in}, $rbuf); + # ->fail may be called via Gcf2Client.pm if ($head =~ /^([0-9a-f]{40,}) (\S+) ([0-9]+)$/) { ($oid, $type, $size) = ($1, $2, $3 + 0); 
$bref = my_read($self->{in}, $rbuf, $size + 1) or - fail($self, defined($bref) ? 'read EOF' : "read: $!"); - chop($$bref) eq "\n" or fail($self, 'LF missing after blob'); + $self->fail(defined($bref) ? 'read EOF' : "read: $!"); + chop($$bref) eq "\n" or $self->fail('LF missing after blob'); } elsif ($head =~ s/ missing\n//s) { $oid = $head; # ref($req) indicates it's already been retried @@ -201,7 +202,7 @@ sub cat_async_step ($$) { $type = 'missing'; $oid = ref($req) ? $$req : $req if $oid eq ''; } else { - fail($self, "Unexpected result from async git cat-file: $head"); + $self->fail("Unexpected result from async git cat-file: $head"); } eval { $cb->($bref, $oid, $type, $size, $arg) }; $self->{cat_rbuf} = $rbuf if $$rbuf ne ''; @@ -304,10 +305,12 @@ sub check { sub _destroy { my ($self, $rbuf, $in, $out, $pid, $err) = @_; - my $p = delete $self->{$pid} or return; delete @$self{($rbuf, $in, $out)}; delete $self->{$err} if $err; # `err_c' + # GitAsyncCat::event_step may delete {pid} + my $p = delete $self->{$pid} or return; + # PublicInbox::DS may not be loaded eval { PublicInbox::DS::dwaitpid($p, undef, undef) }; waitpid($p, 0) if $@; # wait synchronously if not in event loop @@ -315,14 +318,21 @@ sub _destroy { sub cat_async_abort ($) { my ($self) = @_; - my $inflight = delete $self->{inflight} or die 'BUG: not in async'; + if (my $inflight = delete $self->{inflight}) { + while (@$inflight) { + my ($req, $cb, $arg) = splice(@$inflight, 0, 3); + $req =~ s/ .*//; # drop git_dir for Gcf2Client + eval { $cb->(undef, $req, undef, undef, $arg) }; + warn "E: $req: $@ (in abort)\n" if $@; + } + } cleanup($self); } sub fail { my ($self, $msg) = @_; - $self->{inflight} ? cat_async_abort($self) : cleanup($self); - croak("git $self->{git_dir}: $msg"); + cat_async_abort($self); + croak(ref($self) . ' ' . ($self->{git_dir} // '') . 
": $msg"); } sub popen { @@ -352,6 +362,7 @@ sub cleanup { !!($self->{pid} || $self->{pid_c}); } + # assuming a well-maintained repo, this should be a somewhat # accurate estimation of its size # TODO: show this in the WWW UI as a hint to potential cloners @@ -397,7 +408,7 @@ sub pub_urls { sub cat_async_begin { my ($self) = @_; cleanup($self) if $self->alternates_changed; - batch_prepare($self); + $self->batch_prepare; die 'BUG: already in async' if $self->{inflight}; $self->{inflight} = []; } @@ -413,11 +424,9 @@ sub cat_async ($$$;$) { push(@$inflight, $oid, $cb, $arg); } -# this is safe to call inside $cb, but not guaranteed to enqueue -# returns true if successful, undef if not. sub async_prefetch { my ($self, $oid, $cb, $arg) = @_; - if (defined($self->{async_cat}) && (my $inflight = $self->{inflight})) { + if (my $inflight = $self->{inflight}) { # we could use MAX_INFLIGHT here w/o the halving, # but lets not allow one client to monopolize a git process if (scalar(@$inflight) < int(MAX_INFLIGHT/2)) { diff --git a/lib/PublicInbox/GitAsyncCat.pm b/lib/PublicInbox/GitAsyncCat.pm index 8a54c608..b9dbe0cc 100644 --- a/lib/PublicInbox/GitAsyncCat.pm +++ b/lib/PublicInbox/GitAsyncCat.pm @@ -11,23 +11,49 @@ package PublicInbox::GitAsyncCat; use strict; use parent qw(PublicInbox::DS Exporter); +use POSIX qw(WNOHANG); use PublicInbox::Syscall qw(EPOLLIN EPOLLET); -our @EXPORT = qw(git_async_cat); +our @EXPORT = qw(git_async_cat git_async_prefetch); +use PublicInbox::Git (); + +our $GCF2C; # singleton PublicInbox::Gcf2Client + +sub close { + my ($self) = @_; + + if (my $gitish = delete $self->{gitish}) { + PublicInbox::Git::cat_async_abort($gitish); + } + $self->SUPER::close; # PublicInbox::DS::close +} sub event_step { my ($self) = @_; - my $gitish = $self->{gitish}; + my $gitish = $self->{gitish} or return; return $self->close if ($gitish->{in} // 0) != ($self->{sock} // 1); my $inflight = $gitish->{inflight}; if ($inflight && @$inflight) { 
$gitish->cat_async_step($inflight); - $self->requeue if @$inflight || exists $gitish->{cat_rbuf}; + + # child death? + if (($gitish->{in} // 0) != ($self->{sock} // 1)) { + $self->close; + } elsif (@$inflight || exists $gitish->{cat_rbuf}) { + # ok, more to do, requeue for fairness + $self->requeue; + } + } elsif ((my $pid = waitpid($gitish->{pid}, WNOHANG)) > 0) { + # May happen if the child process is killed by a BOFH + # (or segfaults) + delete $gitish->{pid}; + warn "E: gitish $pid exited with \$?=$?\n"; + $self->close; } } sub git_async_cat ($$$$) { my ($git, $oid, $cb, $arg) = @_; - my $gitish = $git->{gcf2c}; # PublicInbox::Gcf2Client + my $gitish = $GCF2C; if ($gitish) { $oid .= " $git->{git_dir}"; } else { @@ -41,4 +67,25 @@ sub git_async_cat ($$$$) { }; } +# this is safe to call inside $cb, but not guaranteed to enqueue +# returns true if successful, undef if not. +sub git_async_prefetch { + my ($git, $oid, $cb, $arg) = @_; + if ($GCF2C) { + if ($GCF2C->{async_cat} && !$GCF2C->{wbuf}) { + $oid .= " $git->{git_dir}"; + return $GCF2C->cat_async($oid, $cb, $arg); + } + } elsif ($git->{async_cat} && (my $inflight = $git->{inflight})) { + # we could use MAX_INFLIGHT here w/o the halving, + # but lets not allow one client to monopolize a git process + if (@$inflight < int(PublicInbox::Git::MAX_INFLIGHT/2)) { + print { $git->{out} } $oid, "\n" or + $git->fail("write error: $!"); + return push(@$inflight, $oid, $cb, $arg); + } + } + undef; +} + 1; diff --git a/lib/PublicInbox/IMAP.pm b/lib/PublicInbox/IMAP.pm index 47c08aea..a861282f 100644 --- a/lib/PublicInbox/IMAP.pm +++ b/lib/PublicInbox/IMAP.pm @@ -626,7 +626,7 @@ sub fetch_blob_cb { # called by git->cat_async via git_async_cat } my $pre; if (!$self->{wbuf} && (my $nxt = $msgs->[0])) { - $pre = $self->{ibx}->git->async_prefetch($nxt->{blob}, + $pre = git_async_prefetch($self->{ibx}->git, $nxt->{blob}, \&fetch_blob_cb, $fetch_arg); } fetch_run_ops($self, $smsg, $bref, $ops, $partial); diff --git 
a/script/public-inbox-gcf2 b/script/public-inbox-gcf2 deleted file mode 100755 index 4a44b654..00000000 --- a/script/public-inbox-gcf2 +++ /dev/null @@ -1,31 +0,0 @@ -#!perl -w -# Copyright (C) 2020 all contributors -# License: AGPL-3.0+ -eval { require PublicInbox::Gcf2 }; -die "libgit2 development package or Inline::C missing for $0: $@\n" if $@; -my $gcf2 = PublicInbox::Gcf2::new(); -use IO::Handle; # autoflush -STDERR->autoflush(1); -STDOUT->autoflush(1); - -while () { - chomp; - my ($oid, $git_dir) = split(/ /, $_, 2); - $gcf2->add_alternate("$git_dir/objects"); - if (!$gcf2->cat_oid(1, $oid)) { - # retry once if missing. We only get unabbreviated OIDs - # from SQLite or Xapian DBs, here, so malicious clients - # can't trigger excessive retries: - warn "I: $$ $oid missing, retrying in $git_dir...\n"; - - $gcf2 = PublicInbox::Gcf2::new(); - $gcf2->add_alternate("$git_dir/objects"); - - if ($gcf2->cat_oid(1, $oid)) { - warn "I: $$ $oid found after retry\n"; - } else { - warn "W: $$ $oid missing after retry\n"; - print "$oid missing\n"; # mimic git-cat-file - } - } -} diff --git a/script/public-inbox-httpd b/script/public-inbox-httpd index b8159f3a..3befdab8 100755 --- a/script/public-inbox-httpd +++ b/script/public-inbox-httpd @@ -13,6 +13,7 @@ BEGIN { require PublicInbox::HTTP; require PublicInbox::HTTPD; } + my %httpds; my $app; my $refresh = sub { diff --git a/t/gcf2_client.t b/t/gcf2_client.t index 19462379..f1302a54 100644 --- a/t/gcf2_client.t +++ b/t/gcf2_client.t @@ -6,6 +6,7 @@ use PublicInbox::TestCommon; use Test::More; use Cwd qw(getcwd); use PublicInbox::Import; +use PublicInbox::DS; require_mods('PublicInbox::Gcf2'); use_ok 'PublicInbox::Gcf2Client'; @@ -24,8 +25,8 @@ my $tree = 'fdbc43725f21f485051c17463b50185f4c3cf88c'; my $called = 0; my $err_f = "$tmpdir/err"; { - local $ENV{PATH} = getcwd()."/blib/script:$ENV{PATH}"; - open my $err, '>', $err_f or BAIL_OUT $!; + PublicInbox::DS->Reset; + open my $err, '>>', $err_f or BAIL_OUT $!; my $gcf2c = 
PublicInbox::Gcf2Client::new({ 2 => $err }); $gcf2c->cat_async("$tree $git_a", sub { my ($bref, $oid, $type, $size, $arg) = @_; @@ -36,7 +37,7 @@ my $err_f = "$tmpdir/err"; is($arg, 'hi', 'arg passed'); $called++; }, 'hi'); - $gcf2c->cat_async_wait; + $gcf2c->cat_async_step($gcf2c->{inflight}); open $err, '<', $err_f or BAIL_OUT $!; my $estr = do { local $/; <$err> }; @@ -52,13 +53,14 @@ my $err_f = "$tmpdir/err"; is($arg, 'bye', 'arg passed when missing'); $called++; }, 'bye'); - $gcf2c->cat_async_wait; + $gcf2c->cat_async_step($gcf2c->{inflight}); open $err, '<', $err_f or BAIL_OUT $!; $estr = do { local $/; <$err> }; like($estr, qr/retrying/, 'warned about retry'); # try failed alternates lookup + PublicInbox::DS->Reset; open $err, '>', $err_f or BAIL_OUT $!; $gcf2c = PublicInbox::Gcf2Client::new({ 2 => $err }); $gcf2c->cat_async("$tree $git_b", sub { @@ -66,7 +68,7 @@ my $err_f = "$tmpdir/err"; is(undef, $bref, 'missing bref from alt is undef'); $called++; }); - $gcf2c->cat_async_wait; + $gcf2c->cat_async_step($gcf2c->{inflight}); open $err, '<', $err_f or BAIL_OUT $!; $estr = do { local $/; <$err> }; like($estr, qr/retrying/, 'warned about retry before alt update'); @@ -82,7 +84,7 @@ my $err_f = "$tmpdir/err"; is($$bref, $expect, 'tree content matched'); $called++; }); - $gcf2c->cat_async_wait; + $gcf2c->cat_async_step($gcf2c->{inflight}); } is($called, 4, 'cat_async callbacks hit'); done_testing;