From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.2 required=3.0 tests=ALL_TRUSTED,AWL,BAYES_00, DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,DKIM_VALID_EF, T_SCC_BODY_TEXT_LINE shortcircuit=no autolearn=ham autolearn_force=no version=3.4.6 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 79E391F47C for ; Thu, 29 Aug 2024 23:26:04 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=80x24.org; s=selector1; t=1724973964; bh=sXWXA+7LlVT2a2elpdFOhNcJ18eo1ThgfgaUo88q460=; h=From:To:Subject:Date:In-Reply-To:References:From; b=0SR2TpX4HGLG94ylHd3uq59ATms6ubAT1nvHr89JXhKnPWhHX+DO5CWn7e/bTEf3t 0dX8udIE9TJSBj6OFca/0gs/6xLB+6/HVIQ7z1q3jCpcLWdVXt9EBvb1ulioW9ddCr rx2Pb7SuLh1qf7cb/8RTtac8L7PtDBQxG+lfpSg0= From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 1/3] solver: use async check (`info') for coderepo Date: Thu, 29 Aug 2024 23:26:01 +0000 Message-ID: <20240829232603.2120168-2-e@80x24.org> In-Reply-To: <20240829232603.2120168-1-e@80x24.org> References: <20240829232603.2120168-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: Async --batch-check or `info' batch commands allow our Perl process to handle other requests while git is busy waiting on slow storage or CPUs to retrieve blob information. This improves parallelism for SMP machines in addition to allowing the Perl process to service other HTTP/NNTP/IMAP/POP3 requests while waiting for disk seeks, zlib inflation, and delta resolution. Checking stderr for error hints is now potentially racy, but it's only a hint so overall performance under worst case scenarios is preferable to correctness. --- lib/PublicInbox/Git.pm | 2 +- lib/PublicInbox/SolverGit.pm | 134 +++++++++++++++++++---------------- 2 files changed, 75 insertions(+), 61 deletions(-) diff --git a/lib/PublicInbox/Git.pm b/lib/PublicInbox/Git.pm index dd0a14e9..c43f9863 100644 --- a/lib/PublicInbox/Git.pm +++ b/lib/PublicInbox/Git.pm @@ -664,7 +664,7 @@ sub watch_async ($) { sub close { my ($self) = @_; - delete @$self{qw(-bc err_c inflight)}; + delete @$self{qw(-bc err_c inflight -prev_oids)}; delete($self->{epwatch}) ? $self->SUPER::close : delete($self->{sock}); # gcf_drain may run from PublicInbox::IO::DESTROY } diff --git a/lib/PublicInbox/SolverGit.pm b/lib/PublicInbox/SolverGit.pm index 898ca72d..4d087eab 100644 --- a/lib/PublicInbox/SolverGit.pm +++ b/lib/PublicInbox/SolverGit.pm @@ -75,40 +75,84 @@ sub ERR ($$) { die $err; } -# look for existing objects already in git repos, returns arrayref -# if found, number of remaining git coderepos to try if not. -sub solve_existing ($$) { - my ($self, $want) = @_; - my $try = $want->{try_gits} //= [ @{$self->{gits}} ]; # array copy - my $git = shift @$try or die 'BUG {try_gits} empty'; - my $oid_b = $want->{oid_b}; - - # can't use async_check due to last_check_err :< - my ($oid_full, $type, $size) = $git->check($oid_b); - $git->schedule_cleanup if $self->{psgi_env}->{'pi-httpd.async'}; +sub ck_existing_cb { # async_check cb + my (undef, $oid_full, $type, $size, $arg) = @_; + my ($self, $want) = @$arg; + + # git <2.21 would show `dangling' (2.21+ shows `ambiguous') + # https://public-inbox.org/git/20190118033845.s2vlrb3wd3m2jfzu@dcvr/T/ + if ($type =~ /\A(?:missing|ambiguous)\z/ || + ($oid_full//'') eq 'dangling') { + undef $oid_full; + undef $type; + } + my $try = $want->{try_gits}; + my $git = shift @$try // die 'BUG: no {try_gits}'; - if ($oid_b eq ($oid_full // '') || (defined($type) && + if ($want->{oid_b} eq ($oid_full // '') || (defined($type) && (!$self->{have_hints} || $type eq 'blob'))) { delete $want->{try_gits}; - return [ $git, $oid_full, $type, int($size) ]; # done, success - } + dbg($self, "found $want->{oid_b} in " . + join(" ||\n\t", $git->pub_urls($self->{psgi_env}))); - # TODO: deal with 40-char "abbreviations" with future SHA-256 git - return scalar(@$try) if length($oid_b) >= 40; + my $existing = [ $git, $oid_full, $type, $size + 0 ]; + if ($want->{oid_b} eq $self->{oid_want} || $type ne 'blob') { + eval { done($self, $existing) }; + die "E: $@" if $@; + return; + } + mark_found($self, $want->{oid_b}, $existing); + return next_step($self); # onto patch application + } - # parse stderr of "git cat-file --batch-check" + # parse stderr of "git cat-file --batch-check", async means + # we may end up slurping output from other requests: my $err = $git->last_check_err; - my (@oids) = ($err =~ /\b([a-f0-9]{40,})\s+blob\b/g); - return scalar(@$try) unless scalar(@oids); - - # TODO: do something with the ambiguous array? - # push @ambiguous, [ $git, @oids ]; - - dbg($self, "`$oid_b' ambiguous in " . + my $prev_oids = $git->{-prev_oids} //= []; + my @any_oids = ($err =~ /\b([a-f0-9]{40,})\s+blob\b/g); + push @$prev_oids, @any_oids; + my $oid_b_re = qr/\A$want->{oid_b}/; + my @ambig_oids = grep /$oid_b_re/, @$prev_oids; + @$prev_oids = grep !/$oid_b_re/, @$prev_oids; + delete $git->{-prev_oids} unless @$prev_oids; + @ambig_oids and dbg($self, "`$want->{oid_b}' ambiguous in " . join("\n\t", $git->pub_urls($self->{psgi_env})) . "\n" . - join('', map { "$_ blob\n" } @oids)); - scalar(@$try); + join('', map { "$_ blob\n" } @ambig_oids)); + + return retry_current($self, $want) if @$try; + + # we may retry if inbox scan (below) fails + delete $want->{try_gits}; + + # scan through inboxes to look for emails which results in + # the oid we want: + my $ibx = shift(@{$want->{try_ibxs}}) or return done($self, undef); + if (my $msgs = find_smsgs($self, $ibx, $want)) { + $want->{try_smsgs} = $msgs; + $want->{cur_ibx} = $ibx; + $self->{tmp_diffs} = []; + return retry_current($self, $want); + } + try_harder($self, $want); +} + +# look for existing objects already in git repos, returns arrayref +# if found, number of remaining git coderepos to try if not. +sub solve_existing ($$) { + my ($self, $want) = @_; + my $try = $want->{try_gits} //= [ @{$self->{gits}} ]; # array copy + my $git = $try->[0] // die 'BUG {try_gits} empty'; + + if ($self->{psgi_env}->{'pi-httpd.async'}) { + async_check({ git => $git }, $want->{oid_b}, + \&ck_existing_cb, [ $self, $want ]); + $git->schedule_cleanup; + } else { + my ($oid_full, $type, $size) = $git->check($want->{oid_b}); + $type //= 'missing'; + ck_existing_cb(undef, $oid_full, $type, $size, [ $self, $want ]) + } } sub _tmp { @@ -574,7 +618,7 @@ sub extract_diffs_done { try_harder($self, $want); } -sub extract_diff_async { +sub extract_diff_async { # cat_async cb my ($bref, $oid, $type, $size, $x) = @_; my ($self, $want, $smsg) = @$x; if (defined($oid)) { @@ -590,7 +634,6 @@ sub extract_diff_async { sub resolve_patch ($$) { my ($self, $want) = @_; - my $cur_want = $want->{oid_b}; if (scalar(@{$self->{patches}}) > $MAX_PATCH) { die "Aborting, too many steps to $self->{oid_want}"; } @@ -613,40 +656,11 @@ sub resolve_patch ($$) { } # see if we can find the blob in an existing git repo: - if (!$want->{try_ibxs} && $self->{seen_oid}->{$cur_want}++) { - die "Loop detected solving $cur_want\n"; + if (!$want->{try_ibxs} && $self->{seen_oid}->{$want->{oid_b}}++) { + die "Loop detected solving $want->{oid_b}\n"; } $want->{try_ibxs} //= [ @{$self->{inboxes}} ]; # array copy - my $existing = solve_existing($self, $want); - if (ref $existing) { - my ($found_git, undef, $type, undef) = @$existing; - dbg($self, "found $cur_want in " . - join(" ||\n\t", - $found_git->pub_urls($self->{psgi_env}))); - - if ($cur_want eq $self->{oid_want} || $type ne 'blob') { - eval { done($self, $existing) }; - die "E: $@" if $@; - return; - } - mark_found($self, $cur_want, $existing); - return next_step($self); # onto patch application - } elsif ($existing > 0) { - return retry_current($self, $want); - } else { # $existing == 0: we may retry if inbox scan (below) fails - delete $want->{try_gits}; - } - - # scan through inboxes to look for emails which results in - # the oid we want: - my $ibx = shift(@{$want->{try_ibxs}}) or return done($self, undef); - if (my $msgs = find_smsgs($self, $ibx, $want)) { - $want->{try_smsgs} = $msgs; - $want->{cur_ibx} = $ibx; - $self->{tmp_diffs} = []; - return retry_current($self, $want); - } - try_harder($self, $want); + solve_existing($self, $want); } # this API is designed to avoid creating self-referential structures;