From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id B94CE1FA18 for ; Tue, 15 Dec 2020 02:02:25 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 5/9] extsearchidx: reindex works on Xapian, too Date: Tue, 15 Dec 2020 02:02:20 +0000 Message-Id: <20201215020224.11739-6-e@80x24.org> In-Reply-To: <20201215020224.11739-1-e@80x24.org> References: <20201215020224.11739-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: Instead of just working on over.sqlite3, we need to work on the Xapian DBs as well. While no changes to our Xapian use have taken place recently, they could in the future and --reindex exists to account for that. --- lib/PublicInbox/ExtSearchIdx.pm | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm index c77fb197..f29a84e3 100644 --- a/lib/PublicInbox/ExtSearchIdx.pm +++ b/lib/PublicInbox/ExtSearchIdx.pm @@ -404,13 +404,18 @@ sub _reindex_finalize ($$$) { my $orig_smsg = $req->{orig_smsg} // die 'BUG: no {orig_smsg}'; my $docid = $smsg->{num} = $orig_smsg->{num}; $self->{oidx}->add_overview($eml, $smsg); # may rethread - return if $nr == 1; # likely, all good - + $self->{transact_bytes} += $smsg->{bytes}; + if ($nr == 1) { # likely, all good + $self->idx_shard($docid)->shard_reindex_docid($docid); + return; + } warn "W: #$docid split into $nr due to deduplication change\n"; my $chash0 = $smsg->{chash} // die "BUG: $smsg->{blob} no {chash}"; delete($by_chash->{$chash0}) // die "BUG: $smsg->{blob} chash missing"; + my @todo; for my $ary (values %$by_chash) { for my $x (reverse @$ary) { + warn "removing #$docid 
xref3 $x->{blob}\n"; my $n = $self->{oidx}->remove_xref3($docid, $x->{blob}); die "BUG: $x->{blob} invalidated #$docid" if $n == 0; } @@ -424,6 +429,12 @@ sub _reindex_finalize ($$$) { $e->{blob} eq $x->{blob} or die <<EOF; $x->{blob} != $e->{blob} (${\$ibx->eidx_key}:$e->{num}); EOF + push @todo, $ibx, $e; + } + $self->{oidx}->commit_lazy; # ensure shard workers can see xref removals + $self->{oidx}->begin_lazy; + $self->idx_shard($docid)->shard_reindex_docid($docid); + while (my ($ibx, $e) = splice(@todo, 0, 2)) { reindex_unseen($self, $sync, $ibx, $e); } } @@ -531,11 +542,12 @@ sub eidxq_process ($$) { # for reindexing # shards flush on their own, just don't queue up too many # deletes - if (($cur % 1000) == 0) { + if ($self->{transact_bytes} >= $self->{batch_bytes}) { $self->git->async_wait_all; $self->{oidx}->commit_lazy; $self->{oidx}->begin_lazy; $pr->("reindexed $cur/$tot\n") if $pr; + $self->{transact_bytes} = 0; } # this is only for SIGUSR1, shards do their own accounting: reindex_checkpoint($self, $sync) if ${$sync->{need_checkpoint}};