From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id AD7571F8DB for ; Fri, 24 Jul 2020 05:56:07 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 03/20] v2writable: introduce idx_stack Date: Fri, 24 Jul 2020 05:55:49 +0000 Message-Id: <20200724055606.27332-4-e@yhbt.net> In-Reply-To: <20200724055606.27332-1-e@yhbt.net> References: <20200724055606.27332-1-e@yhbt.net> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: This avoids pinning a potentially large chunk of memory from `git-log --reverse' into RAM (or triggering less predictable swap behavior). Instead it uses a contiguous temporary file with a fixed-size record for every blob we'll need to index. --- MANIFEST | 2 + lib/PublicInbox/IdxStack.pm | 52 ++++++++++++++++ lib/PublicInbox/V2Writable.pm | 114 ++++++++++++++++++++-------------- t/idx_stack.t | 56 +++++++++++++++++ 4 files changed, 176 insertions(+), 48 deletions(-) create mode 100644 lib/PublicInbox/IdxStack.pm create mode 100644 t/idx_stack.t diff --git a/MANIFEST b/MANIFEST index 9d90c8c23..f46a0776d 100644 --- a/MANIFEST +++ b/MANIFEST @@ -138,6 +138,7 @@ lib/PublicInbox/IMAPD.pm lib/PublicInbox/IMAPTracker.pm lib/PublicInbox/IMAPdeflate.pm lib/PublicInbox/IMAPsearchqp.pm +lib/PublicInbox/IdxStack.pm lib/PublicInbox/Import.pm lib/PublicInbox/In2Tie.pm lib/PublicInbox/Inbox.pm @@ -277,6 +278,7 @@ t/httpd-https.t t/httpd-unix.t t/httpd.t t/hval.t +t/idx_stack.t t/imap.t t/imap_searchqp.t t/imap_tracker.t diff --git a/lib/PublicInbox/IdxStack.pm b/lib/PublicInbox/IdxStack.pm new file mode 100644 index 000000000..b43b8064e --- /dev/null +++ b/lib/PublicInbox/IdxStack.pm @@ -0,0 +1,52 @@ +# Copyright (C) 2020 all contributors +# License: AGPL-3.0+ + +# temporary stack for public-inbox-index +package PublicInbox::IdxStack; +use v5.10.1; +use strict; +use Fcntl qw(:seek); +use constant FMT => eval { pack('Q', 1) } ? 'A1QQH*' : 'A1IIH*'; + +# start off in write-only mode +sub new { + open(my $io, '+>', undef) or die "open: $!"; + bless { wr => $io, latest_cmt => $_[1] }, __PACKAGE__ +} + +# file_char = [a|m] +sub push_rec { + my ($self, $file_char, $at, $ct, $blob_oid) = @_; + my $rec = pack(FMT, $file_char, $at, $ct, $blob_oid); + $self->{rec_size} //= length($rec); + print { $self->{wr} } $rec or die "print: $!"; + $self->{tot_size} += length($rec); +} + +sub num_records { + my ($self) = @_; + $self->{rec_size} ? $self->{tot_size} / $self->{rec_size} : 0; +} + +# switch into read-only mode and returns self +sub read_prepare { + my ($self) = @_; + my $io = $self->{rd} = delete($self->{wr}); + $io->flush or die "flush: $!"; + $self; +} + +sub pop_rec { + my ($self) = @_; + my $sz = $self->{rec_size} or return; + my $rec_pos = $self->{tot_size} -= $sz; + return if $rec_pos < 0; + my $io = $self->{rd}; + seek($io, $rec_pos, SEEK_SET) or die "seek: $!"; + my $r = read($io, my $buf, $sz); + defined($r) or die "read: $!"; + $r == $sz or die "read($r != $sz)"; + unpack(FMT, $buf); +} + +1; diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index c04ea5d77..04c91e5dd 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -8,6 +8,7 @@ use strict; use v5.10.1; use parent qw(PublicInbox::Lock); use PublicInbox::SearchIdxShard; +use PublicInbox::IdxStack; use PublicInbox::Eml; use PublicInbox::Git; use PublicInbox::Import; @@ -881,9 +882,6 @@ sub reindex_checkpoint ($$$) { sub reindex_oid ($$$$) { my ($self, $sync, $git, $oid) = @_; - if (my $D = $sync->{D}) { # don't waste I/O on deletes - return if $D->{pack('H*', $oid)}; - } return if PublicInbox::SearchIdx::too_big($self, $git, $oid); my ($num, $mid0, $len); my $msgref = $git->cat_file($oid, \$len); @@ -1035,20 +1033,45 @@ $range $range; } -# don't bump num_highwater on --reindex -sub mark_deleted ($$$) { +sub prepare_range_stack { my ($git, $sync, $range) = @_; - my $D = $sync->{D} //= {}; # pack("H*", $oid) => NR - my $fh = $git->popen(qw(log --raw --no-abbrev - --pretty=tformat:%H - --no-notes --no-color --no-renames - --diff-filter=AM), $range, '--', 'd'); + # Don't bump num_highwater on --reindex by using {D}. + # We intentionally do NOT use {D} in the non-reindex case because + # we want NNTP article number gaps from unindexed messages to + # show up in mirrors, too. + my $D = $sync->{D} //= $sync->{reindex} ? {} : undef; # OID_BIN => NR + + my $fh = $git->popen(qw(log --raw -r --pretty=tformat:%at-%ct-%H + --no-notes --no-color --no-renames --no-abbrev), + $range); + my ($at, $ct, $stk); while (<$fh>) { - if (/\A:\d{6} 100644 $x40 ($x40) [AM]\td$/o) { - $D->{pack('H*', $1)}++; + if (/\A([0-9]+)-([0-9]+)-($x40)$/o) { + ($at, $ct) = ($1 + 0, $2 + 0); + $stk //= PublicInbox::IdxStack->new($3); + } elsif (/\A:\d{6} 100644 $x40 ($x40) [AM]\td$/o) { + my $oid = $1; + if ($D) { # reindex case + $D->{pack('H*', $oid)}++; + } else { # non-reindex case: + $stk->push_rec('d', $at, $ct, $oid); + } + } elsif (/\A:\d{6} 100644 $x40 ($x40) [AM]\tm$/o) { + my $oid = $1; + if ($D) { + my $oid_bin = pack('H*', $oid); + my $nr = --$D->{$oid_bin}; + delete($D->{$oid_bin}) if $nr <= 0; + + # nr < 0 (-1) means it never existed + $stk->push_rec('m', $at, $ct, $oid) if $nr < 0; + } else { + $stk->push_rec('m', $at, $ct, $oid); + } } } close $fh or die "git log failed: \$?=$?"; + $stk ? $stk->read_prepare : undef; } sub sync_prepare ($$$) { @@ -1061,7 +1084,7 @@ sub sync_prepare ($$$) { # without {reindex} my $reindex_heads = last_commits($self, $epoch_max) if $sync->{reindex}; - for my $i (0..$epoch_max) { + for (my $i = $epoch_max; $i >= 0; $i--) { die 'BUG: already indexing!' if $self->{reindex_pipe}; my $git_dir = git_dir_n($self, $i); -d $git_dir or next; # missing epochs are fine @@ -1077,15 +1100,24 @@ sub sync_prepare ($$$) { # can't use 'rev-list --count' if we use --diff-filter $pr->("$i.git counting $range ... ") if $pr; - my $n = 0; - my $fh = $git->popen(qw(log --pretty=tformat:%H - --no-notes --no-color --no-renames - --diff-filter=AM), $range, '--', 'm'); - ++$n while <$fh>; - close $fh or die "git log failed: \$?=$?"; - $pr->("$n\n") if $pr; - $regen_max += $n; - mark_deleted($git, $sync, $range) if $sync->{reindex}; + my $stk = prepare_range_stack($git, $sync, $range); + my $nr = $stk ? $stk->num_records : 0; + $pr->("$nr\n") if $pr; + $sync->{stacks}->[$i] = $stk if $stk; + $regen_max += $nr; + } + + # XXX this should not happen unless somebody bypasses checks in + # our code and blindly injects "d" file history into git repos + if (my @leftovers = keys %{delete($sync->{D}) // {}}) { + warn('W: unindexing '.scalar(@leftovers)." leftovers\n"); + my $git = $self->{-inbox}->git; + for my $oid (@leftovers) { + $oid = unpack('H*', $oid); + $self->{current_info} = "leftover $oid"; + unindex_oid($self, $git, $oid); + } + $git->cleanup; } return 0 if (!$regen_max && !keys(%{$self->{unindex_range}})); @@ -1186,38 +1218,24 @@ sub index_epoch ($$$) { if (my $unindex_range = delete $sync->{unindex_range}->{$i}) { unindex($self, $sync, $git, $unindex_range); } - defined(my $range = $sync->{ranges}->[$i]) or return; + defined(my $stk = $sync->{stacks}->[$i]) or return; + $sync->{stacks}->[$i] = undef; + my $range = $sync->{ranges}->[$i]; if (my $pr = $sync->{-opt}->{-progress}) { $pr->("$i.git indexing $range\n"); } - my @cmd = qw(log --reverse --raw -r --pretty=tformat:%H.%at.%ct - --no-notes --no-color --no-abbrev --no-renames); - my $fh = $self->{reindex_pipe} = $git->popen(@cmd, $range); - my $cmt; - my $D = $sync->{D}; - while (<$fh>) { - chomp; - $self->{current_info} = "$i.git $_"; - if (/\A($x40)\.([0-9]+)\.([0-9]+)$/o) { - $cmt = $1; - $self->{autime} = $2; - $self->{cotime} = $3; - } elsif (/\A:\d{6} 100644 $x40 ($x40) [AM]\tm$/o) { - reindex_oid($self, $sync, $git, $1); - } elsif (/\A:\d{6} 100644 $x40 ($x40) [AM]\td$/o) { - # allow re-add if there was user error - my $oid = $1; - if ($D) { - my $oid_bin = pack('H*', $oid); - my $nr = --$D->{$oid_bin}; - delete($D->{$oid_bin}) if $nr <= 0; - } + while (my ($f, $at, $ct, $oid) = $stk->pop_rec) { + $self->{current_info} = "$i.git $oid"; + if ($f eq 'm') { + $self->{autime} = $at; + $self->{cotime} = $ct; + reindex_oid($self, $sync, $git, $oid); + } elsif ($f eq 'd') { unindex_oid($self, $git, $oid); } } - close $fh or die "git log failed: \$?=$?"; - delete @$self{qw(reindex_pipe autime cotime)}; - update_last_commit($self, $git, $i, $cmt) if defined $cmt; + delete @$self{qw(autime cotime)}; + update_last_commit($self, $git, $i, $stk->{latest_cmt}); } # public, called by public-inbox-index diff --git a/t/idx_stack.t b/t/idx_stack.t new file mode 100644 index 000000000..35aff37b7 --- /dev/null +++ b/t/idx_stack.t @@ -0,0 +1,56 @@ +#!perl -w +# Copyright (C) 2020 all contributors +# License: AGPL-3.0+ +use strict; +use Test::More; +use_ok 'PublicInbox::IdxStack'; +my $oid_a = '03c21563cf15c241687966b5b2a3f37cdc193316'; +my $oid_b = '963caad026055ab9bcbe3ee9550247f9d8840feb'; + +my $stk = PublicInbox::IdxStack->new; +is($stk->read_prepare, $stk, 'nothing'); +is($stk->num_records, 0, 'no records'); +is($stk->pop_rec, undef, 'undef on empty'); + +$stk = PublicInbox::IdxStack->new; +$stk->push_rec('m', 1234, 5678, $oid_a); +is($stk->read_prepare, $stk, 'read_prepare'); +is($stk->num_records, 1, 'num_records'); +is_deeply([$stk->pop_rec], ['m', 1234, 5678, $oid_a], 'pop once'); +is($stk->pop_rec, undef, 'undef on empty'); + +$stk = PublicInbox::IdxStack->new; +$stk->push_rec('m', 1234, 5678, $oid_a); +$stk->push_rec('d', 1234, 5678, $oid_b); +is($stk->read_prepare, $stk, 'read_prepare'); +is($stk->num_records, 2, 'num_records'); +is_deeply([$stk->pop_rec], ['d', 1234, 5678, $oid_b], 'pop'); +is_deeply([$stk->pop_rec], ['m', 1234, 5678, $oid_a], 'pop-pop'); +is($stk->pop_rec, undef, 'empty'); + +SKIP: { + $stk = undef; + my $nr = $ENV{TEST_GIT_LOG} or skip 'TEST_GIT_LOG unset', 3; + open my $fh, '-|', qw(git log --pretty=tformat:%at.%ct.%H), "-$nr" or + die "git log: $!"; + my @expect; + while (<$fh>) { + chomp; + my ($at, $ct, $H) = split(/\./); + $stk //= PublicInbox::IdxStack->new($H); + # not bothering to parse blobs here, just using commit OID + # as a blob OID since they're the same size + format + $stk->push_rec('m', $at + 0, $ct + 0, $H); + push(@expect, [ 'm', $at, $ct, $H ]); + } + $stk or skip('nothing from git log', 3); + is($stk->read_prepare, $stk, 'read_prepare'); + is($stk->num_records, scalar(@expect), 'num_records matches expected'); + my @result; + while (my @tmp = $stk->pop_rec) { + unshift @result, \@tmp; + } + is_deeply(\@result, \@expect, 'results match expected'); +} + +done_testing;