From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-Status: No, score=-3.0 required=3.0 tests=ALL_TRUSTED,AWL,BAYES_00, URIBL_SBL,URIBL_SBL_A shortcircuit=no autolearn=no autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 88DA91FF9D for ; Fri, 25 Dec 2020 10:21:15 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 4/7] index: do not attach inbox to extindex unless updated Date: Fri, 25 Dec 2020 10:21:12 +0000 Message-Id: <20201225102115.6745-5-e@80x24.org> In-Reply-To: <20201225102115.6745-1-e@80x24.org> References: <20201225102115.6745-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: We'll count the number of log changes (regardless of index or unindex) and only attach inboxes to ExtSearchIdx objects when they get new work. We'll also reduce lock bouncing and only update external indices after all per-inbox indexing is done. This also updates existing v2 indexing/unindexing callers to be more consistent and ensures unindex log entries update per-inbox last commit information. --- lib/PublicInbox/Admin.pm | 1 + lib/PublicInbox/SearchIdx.pm | 2 ++ lib/PublicInbox/V2Writable.pm | 26 +++++++++++++++++++------- script/public-inbox-index | 23 ++++++++++++++--------- 4 files changed, 36 insertions(+), 16 deletions(-) diff --git a/lib/PublicInbox/Admin.pm b/lib/PublicInbox/Admin.pm index 9a86d206..b468108e 100644 --- a/lib/PublicInbox/Admin.pm +++ b/lib/PublicInbox/Admin.pm @@ -271,6 +271,7 @@ EOM $idx = PublicInbox::SearchIdx->new($ibx, 1); } $idx->index_sync($opt); + $idx->{nidx} // 0; # returns number processed } sub progress_prepare ($) { diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index c8e309fc..b3361e05 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -615,6 +615,7 @@ sub index_both { # git->cat_async callback $smsg->{num} = index_mm($self, $eml, $oid, $sync) or die "E: could not generate NNTP article number for $oid"; add_message($self, $eml, $smsg, $sync); + ++$self->{nidx}; my $cur_cmt = $sync->{cur_cmt} // die 'BUG: {cur_cmt} missing'; ${$sync->{latest_cmt}} = $cur_cmt; } @@ -629,6 +630,7 @@ sub unindex_both { # git->cat_async callback if (defined(my $cur_cmt = $sync->{cur_cmt})) { ${$sync->{latest_cmt}} = $cur_cmt; } + ++$self->{nidx}; } sub with_umask { diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index 2b849ddf..ca52874b 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -891,12 +891,22 @@ sub reindex_checkpoint ($$) { $mm_tmp->atfork_parent if $mm_tmp; } +sub index_finalize ($$) { + my ($arg, $index) = @_; + ++$arg->{self}->{nidx}; + if (defined(my $cur = $arg->{cur_cmt})) { + ${$arg->{latest_cmt}} = $cur; + } elsif ($index) { + die 'BUG: {cur_cmt} missing'; + } # else { unindexing @leftovers doesn't set {cur_cmt} +} + sub index_oid { # cat_async callback my ($bref, $oid, $type, $size, $arg) = @_; - return if is_bad_blob($oid, $type, $size, $arg->{oid}); + is_bad_blob($oid, $type, $size, $arg->{oid}) and + return index_finalize($arg, 1); # size == 0 purged returns here my $self = $arg->{self}; local $self->{current_info} = "$self->{current_info} $oid"; - return if $size == 0; # purged my ($num, $mid0); my $eml = PublicInbox::Eml->new($$bref); my $mids = mids($eml); @@ -967,7 +977,7 @@ sub index_oid { # cat_async callback if (do_idx($self, $bref, $eml, $smsg)) { ${$arg->{need_checkpoint}} = 1; } - ${$arg->{latest_cmt}} = $arg->{cur_cmt} // die 'BUG: {cur_cmt} missing'; + index_finalize($arg, 1); } # only update last_commit for $i on reindex iff newer than current @@ -1157,11 +1167,12 @@ sub unindex_oid_aux ($$$) { } sub unindex_oid ($$;$) { # git->cat_async callback - my ($bref, $oid, $type, $size, $sync) = @_; - return if is_bad_blob($oid, $type, $size, $sync->{oid}); - my $self = $sync->{self}; + my ($bref, $oid, $type, $size, $arg) = @_; + is_bad_blob($oid, $type, $size, $arg->{oid}) and + return index_finalize($arg, 0); + my $self = $arg->{self}; local $self->{current_info} = "$self->{current_info} $oid"; - my $unindexed = $sync->{in_unindex} ? $sync->{unindexed} : undef; + my $unindexed = $arg->{in_unindex} ? $arg->{unindexed} : undef; my $mm = $self->{mm}; my $mids = mids(PublicInbox::Eml->new($bref)); undef $$bref; @@ -1186,6 +1197,7 @@ sub unindex_oid ($$;$) { # git->cat_async callback } unindex_oid_aux($self, $oid, $mid); } + index_finalize($arg, 0); } sub git { $_[0]->{ibx}->git } diff --git a/script/public-inbox-index b/script/public-inbox-index index 87893ef1..a17bf615 100755 --- a/script/public-inbox-index +++ b/script/public-inbox-index @@ -63,7 +63,7 @@ my @ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV, $opt, $cfg); PublicInbox::Admin::require_or_die('-index'); unless (@ibxs) { print STDERR $help; exit 1 } -my (@eidx_dir, %eidx_seen); +my (@eidx, %eidx_seen); my $update_extindex = $opt->{'update-extindex'}; if (!scalar(@$update_extindex) && (my $ALL = $cfg->ALL)) { # extindex and normal inboxes may have different owners @@ -84,7 +84,8 @@ for my $ei_name (@$update_extindex) { } else { die "extindex `$ei_name' not configured or found\n"; } - $eidx_seen{$topdir} //= push(@eidx_dir, $topdir); + $eidx_seen{$topdir} //= + push(@eidx, PublicInbox::ExtSearchIdx->new($topdir)); } my $mods = {}; my @eidx_unconfigured; @@ -95,7 +96,7 @@ foreach my $ibx (@ibxs) { $ibx->{indexlevel} //= $opt->{indexlevel} // ($opt->{xapian_only} ? 'full' : $detected); PublicInbox::Admin::scan_ibx_modules($mods, $ibx); - if (@eidx_dir && $ibx->{-unconfigured}) { + if (@eidx && $ibx->{-unconfigured}) { push @eidx_unconfigured, " $ibx->{inboxdir}\n"; } } @@ -128,18 +129,22 @@ publicInbox.$ibx->{name}.indexSequentialShard not boolean EOL $ibx_opt = { %$opt, sequential_shard => $v }; } - PublicInbox::Admin::index_inbox($ibx, undef, $ibx_opt); + my $nidx = PublicInbox::Admin::index_inbox($ibx, undef, $ibx_opt); last if $ibx_opt->{quit}; if (my $copt = $opt->{compact_opt}) { local $copt->{jobs} = 0 if $ibx_opt->{sequential_shard}; PublicInbox::Xapcmd::run($ibx, 'compact', $copt); } - next if $ibx->{-unconfigured}; last if $ibx_opt->{quit}; - for my $dir (@eidx_dir) { - my $eidx = PublicInbox::ExtSearchIdx->new($dir); + next if $ibx->{-unconfigured} || !$nidx; + for my $eidx (@eidx) { $eidx->attach_inbox($ibx); - $eidx->eidx_sync($ibx_opt); - last if $ibx_opt->{quit}; } } +$opt->{-no_fsync} = 1 if !$opt->{fsync}; +my $pr = $opt->{-progress}; +for my $eidx (@eidx) { + $pr->("indexing $eidx->{topdir} ...\n") if $pr; + $eidx->eidx_sync($opt); + last if $opt->{quit}; +}