From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,AWL,BAYES_00, LOTS_OF_MONEY shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 79E4D1F91B for ; Fri, 24 Jul 2020 05:56:10 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 17/20] v2writable: share log2stack code with v1 Date: Fri, 24 Jul 2020 05:56:03 +0000 Message-Id: <20200724055606.27332-18-e@yhbt.net> In-Reply-To: <20200724055606.27332-1-e@yhbt.net> References: <20200724055606.27332-1-e@yhbt.net> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: Another step in making v1 and v2 more similar. --- lib/PublicInbox/SearchIdx.pm | 44 ++++++++++++++++++--------- lib/PublicInbox/V2Writable.pm | 57 ++++++----------------------------- 2 files changed, 38 insertions(+), 63 deletions(-) diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 764257432..4d2e0da92 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -9,7 +9,7 @@ package PublicInbox::SearchIdx; use strict; use v5.10.1; -use parent qw(PublicInbox::Search PublicInbox::Lock); +use parent qw(PublicInbox::Search PublicInbox::Lock Exporter); use PublicInbox::Eml; use PublicInbox::InboxWritable; use PublicInbox::MID qw(mid_mime mids_for_index mids); @@ -21,6 +21,7 @@ use PublicInbox::OverIdx; use PublicInbox::Spawn qw(spawn); use PublicInbox::Git qw(git_unquote); use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp); +our @EXPORT_OK = qw(too_big crlf_adjust log2stack is_ancestor); my $X = \%PublicInbox::Search::X; my ($DB_CREATE_OR_OPEN, $DB_OPEN); our $DB_NO_SYNC = 0; @@ -31,8 +32,6 @@ use constant DEBUG => !!$ENV{DEBUG}; my $xapianlevels = qr/\A(?:full|medium)\z/; my $hex = '[a-f0-9]'; my $OID = $hex .'{40,}'; -my $addmsg = qr!^:000000 100644 \S+ ($OID) A\t${hex}{2}/${hex}{38}$!; -my $delmsg = qr!^:100644 000000 ($OID) \S+ D\t${hex}{2}/${hex}{38}$!; sub new { my ($class, $ibx, $creat, $shard) = @_; @@ -600,17 +599,18 @@ sub process_stack { $batch_cb->($nr, $stk); } -sub prepare_stack ($$$) { - my ($self, $sync, $range) = @_; - my $git = $self->{ibx}->git; - - if (index($range, '..') < 0) { - # don't show annoying git errors to users who run -index - # on empty inboxes - $git->qx(qw(rev-parse -q --verify), "$range^0"); - return PublicInbox::IdxStack->new->read_prepare if $?; +sub log2stack ($$$$) { + my ($sync, $git, $range, $ibx) = @_; + my $D = $sync->{D}; # OID_BIN => NR (if reindexing, undef otherwise) + my ($add, $del); + if ($ibx->version == 1) { + my $path = $hex.'{2}/'.$hex.'{38}'; + $add = qr!\A:000000 100644 \S+ ($OID) A\t$path$!; + $del = qr!\A:100644 000000 ($OID) \S+ D\t$path$!; + } else { + $del = qr!\A:\d{6} 100644 $OID ($OID) [AM]\td$!; + $add = qr!\A:\d{6} 100644 $OID ($OID) [AM]\tm$!; } - my $D = $sync->{D} = $sync->{reindex} ? {} : undef; # OID_BIN => NR # Count the new files so they can be added newest to oldest # and still have numbers increasing from oldest to newest @@ -622,14 +622,14 @@ sub prepare_stack ($$$) { if (/\A([0-9]+)-([0-9]+)-($OID)$/o) { ($at, $ct) = ($1 + 0, $2 + 0); $stk //= PublicInbox::IdxStack->new($3); - } elsif (/$delmsg/) { + } elsif (/$del/) { my $oid = $1; if ($D) { # reindex case $D->{pack('H*', $oid)}++; } else { # non-reindex case: $stk->push_rec('d', $at, $ct, $oid); } - } elsif (/$addmsg/) { + } elsif (/$add/) { my $oid = $1; if ($D) { my $oid_bin = pack('H*', $oid); @@ -648,6 +648,20 @@ sub prepare_stack ($$$) { $stk->read_prepare; } +sub prepare_stack ($$$) { + my ($self, $sync, $range) = @_; + my $git = $self->{ibx}->git; + + if (index($range, '..') < 0) { + # don't show annoying git errors to users who run -index + # on empty inboxes + $git->qx(qw(rev-parse -q --verify), "$range^0"); + return PublicInbox::IdxStack->new->read_prepare if $?; + } + $sync->{D} = $sync->{reindex} ? {} : undef; # OID_BIN => NR + log2stack($sync, $git, $range, $self->{ibx}); +} + # --is-ancestor requires git 1.8.0+ sub is_ancestor ($$$) { my ($git, $cur, $tip) = @_; diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index 3dc200956..9a58a7a94 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -18,7 +18,7 @@ use PublicInbox::InboxWritable; use PublicInbox::OverIdx; use PublicInbox::Msgmap; use PublicInbox::Spawn qw(spawn popen_rd); -use PublicInbox::SearchIdx; +use PublicInbox::SearchIdx qw(too_big log2stack crlf_adjust is_ancestor); use IO::Handle; # ->autoflush use File::Temp qw(tempfile); @@ -156,8 +156,7 @@ sub add { # indexes a message, returns true if checkpointing is needed sub do_idx ($$$$) { my ($self, $msgref, $mime, $smsg) = @_; - $smsg->{bytes} = $smsg->{raw_bytes} + - PublicInbox::SearchIdx::crlf_adjust($$msgref); + $smsg->{bytes} = $smsg->{raw_bytes} + crlf_adjust($$msgref); $self->{over}->add_overview($mime, $smsg); my $idx = idx_shard($self, $smsg->{num} % $self->{shards}); $idx->index_raw($msgref, $mime, $smsg); @@ -878,7 +877,7 @@ sub reindex_checkpoint ($$) { sub reindex_oid ($$$) { my ($self, $sync, $oid) = @_; - return if PublicInbox::SearchIdx::too_big($self, $oid); + return if too_big($self, $oid); my ($num, $mid0, $len); my $msgref = $self->{ibx}->git->cat_file($oid, \$len); return if $len == 0; # purged @@ -976,8 +975,6 @@ sub last_commits ($$) { $heads; } -*is_ancestor = *PublicInbox::SearchIdx::is_ancestor; - # returns a revision range for git-log(1) sub log_range ($$$$$) { my ($self, $sync, $git, $i, $tip) = @_; @@ -1029,47 +1026,6 @@ $range $range; } -sub prepare_range_stack { - my ($git, $sync, $range) = @_; - # Don't bump num_highwater on --reindex by using {D}. - # We intentionally do NOT use {D} in the non-reindex case because - # we want NNTP article number gaps from unindexed messages to - # show up in mirrors, too. - my $D = $sync->{D} //= $sync->{reindex} ? {} : undef; # OID_BIN => NR - - my $fh = $git->popen(qw(log --raw -r --pretty=tformat:%at-%ct-%H - --no-notes --no-color --no-renames --no-abbrev), - $range); - my ($at, $ct, $stk); - while (<$fh>) { - if (/\A([0-9]+)-([0-9]+)-($OID)$/o) { - ($at, $ct) = ($1 + 0, $2 + 0); - $stk //= PublicInbox::IdxStack->new($3); - } elsif (/\A:\d{6} 100644 $OID ($OID) [AM]\td$/o) { - my $oid = $1; - if ($D) { # reindex case - $D->{pack('H*', $oid)}++; - } else { # non-reindex case: - $stk->push_rec('d', $at, $ct, $oid); - } - } elsif (/\A:\d{6} 100644 $OID ($OID) [AM]\tm$/o) { - my $oid = $1; - if ($D) { - my $oid_bin = pack('H*', $oid); - my $nr = --$D->{$oid_bin}; - delete($D->{$oid_bin}) if $nr <= 0; - - # nr < 0 (-1) means it never existed - $stk->push_rec('m', $at, $ct, $oid) if $nr < 0; - } else { - $stk->push_rec('m', $at, $ct, $oid); - } - } - } - close $fh or die "git log failed: \$?=$?"; - $stk ? $stk->read_prepare : undef; -} - sub sync_prepare ($$$) { my ($self, $sync, $epoch_max) = @_; my $pr = $sync->{-opt}->{-progress}; @@ -1093,7 +1049,12 @@ sub sync_prepare ($$$) { my $range = log_range($self, $sync, $git, $i, $tip) or next; # can't use 'rev-list --count' if we use --diff-filter $pr->("$i.git counting $range ... ") if $pr; - my $stk = prepare_range_stack($git, $sync, $range); + # Don't bump num_highwater on --reindex by using {D}. + # We intentionally do NOT use {D} in the non-reindex case + # because we want NNTP article number gaps from unindexed + # messages to show up in mirrors, too. + $sync->{D} //= $sync->{reindex} ? {} : undef; # OID_BIN => NR + my $stk = log2stack($sync, $git, $range, $self->{ibx}); my $nr = $stk ? $stk->num_records : 0; $pr->("$nr\n") if $pr; $sync->{stacks}->[$i] = $stk if $stk;