From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 816371F8C6; Wed, 1 Sep 2021 00:17:32 +0000 (UTC) Date: Wed, 1 Sep 2021 00:17:32 +0000 From: Eric Wong To: Konstantin Ryabitsev Cc: meta@public-inbox.org Subject: [PATCH] extindex: --gc removes messages from over, too Message-ID: <20210901001732.GA18457@dcvr> References: <20210830191748.6ugkcpccdeq2pzic@nitro.local> <20210830192732.GB25471@dcvr> <20210830201723.dehoul4y6gpqf2cp@nitro.local> MIME-Version: 1.0 Content-Type: text/plain; charset=utf-8 Content-Disposition: inline In-Reply-To: <20210830201723.dehoul4y6gpqf2cp@nitro.local> List-Id: Konstantin Ryabitsev wrote: > This works, though there are some interesting side-effects to it. For example, > I removed the /gitolite-transparency-log/* feed (it was too noisy and wasn't > useful for anything, really). After running the --gc, I see lots of messages > about things being removed, but /all/ still contains leftover entries from the > inbox: > > https://x-lore.kernel.org/all/?t=20210830182453 > > (look for "post-receive:" in the page) > > Clicking on the link returns a thread with blank messages (but not all of > them). > > This is not a huge problem, as the messages are for sure gone from search > results, but the behaviour is a bit odd. Thanks, that was a bug, but not a data-loss one (nor the biting ones that get worse every summer...): ----------8<--------- Subject: [PATCH] extindex: --gc removes messages from over, too While messages from removed inboxes were removed from Xapian search, --gc failed to remove messages from over.sqlite3 entirely. They no longer show up in the topic summary view. Reported-by: Konstantin Ryabitsev Link: https://public-inbox.org/20210830201723.dehoul4y6gpqf2cp@nitro.local/ --- lib/PublicInbox/ExtSearchIdx.pm | 19 +++++----- t/extsearch.t | 61 +++++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+), 8 deletions(-) diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm index cf61237c..8cdad23d 100644 --- a/lib/PublicInbox/ExtSearchIdx.pm +++ b/lib/PublicInbox/ExtSearchIdx.pm @@ -343,20 +343,18 @@ sub _sync_inbox ($$$) { sub gc_unref_doc ($$$$) { my ($self, $ibx_id, $eidx_key, $docid) = @_; - my $dbh = $self->{oidx}->dbh; - + my $remain = 0; # for debug/info purposes, oids may no longer be accessible + my $dbh = $self->{oidx}->dbh; my $sth = $dbh->prepare_cached(<<'', undef, 1); SELECT oidbin FROM xref3 WHERE docid = ? AND ibx_id = ? $sth->execute($docid, $ibx_id); my @oid = map { unpack('H*', $_->[0]) } @{$sth->fetchall_arrayref}; - - $dbh->prepare_cached(<<'')->execute($docid, $ibx_id); -DELETE FROM xref3 WHERE docid = ? AND ibx_id = ? - - my $remain = $self->{oidx}->get_xref3($docid); - if (scalar(@$remain)) { + for my $oid (@oid) { + $remain += $self->{oidx}->remove_xref3($docid, $oid, $eidx_key); + } + if ($remain) { $self->{oidx}->eidxq_add($docid); # enqueue for reindex for my $oid (@oid) { warn "I: unref #$docid $eidx_key $oid\n"; @@ -421,6 +419,11 @@ DELETE FROM xref3 WHERE docid NOT IN (SELECT num FROM over) warn "I: eliminated $nr stale xref3 entries\n" if $nr != 0; + # fixup from old bugs: + $nr = $dbh->do(<<''); +DELETE FROM over WHERE num NOT IN (SELECT docid FROM xref3) + + warn "I: eliminated $nr stale over entries\n" if $nr != 0; done($self); } diff --git a/t/extsearch.t b/t/extsearch.t index b03adc17..ad4f2c6d 100644 --- a/t/extsearch.t +++ b/t/extsearch.t @@ -466,4 +466,65 @@ SKIP: { '--gc works after compact'); } +{ # ensure --gc removes non-xposted messages + my $old_size = -s $cfg_path // xbail "stat $cfg_path $!"; + my $tmp_addr = 'v2tmp@example.com'; + run_script([qw(-init v2tmp --indexlevel basic + --newsgroup v2tmp.example), + "$home/v2tmp", 'http://example.com/v2tmp', $tmp_addr ]) + or xbail '-init'; + $env = { ORIGINAL_RECIPIENT => $tmp_addr }; + open $fh, '+>', undef or xbail "open $!"; + $fh->autoflush(1); + my $mid = 'tmpmsg@example.com'; + print $fh < +Subject: tmpmsg +Date: Tue, 19 Jan 2038 03:14:07 +0000 + +EOM + seek $fh, 0, SEEK_SET or xbail "seek $!"; + run_script([qw(-mda --no-precheck)], $env, {0 => $fh}) or xbail '-mda'; + ok(run_script([qw(-extindex --all), "$home/extindex"]), 'update'); + my $nr; + { + my $es = PublicInbox::ExtSearch->new("$home/extindex"); + my ($id, $prv); + my $smsg = $es->over->next_by_mid($mid, \$id, \$prv); + ok($smsg, 'tmpmsg indexed'); + my $mset = $es->search->mset("mid:$mid"); + is($mset->size, 1, 'new message found'); + $mset = $es->search->mset('z:0..'); + $nr = $mset->size; + } + truncate($cfg_path, $old_size) or xbail "truncate $!"; + my $rdr = { 2 => \(my $err) }; + ok(run_script([qw(-extindex --gc), "$home/extindex"], undef, $rdr), + 'gc to get rid of removed inbox'); + is_deeply([ grep(!/^(?:I:|#)/, split(/^/m, $err)) ], [], + 'no non-informational errors in stderr'); + + my $es = PublicInbox::ExtSearch->new("$home/extindex"); + my $mset = $es->search->mset("mid:$mid"); + is($mset->size, 0, 'tmpmsg gone from search'); + my ($id, $prv); + is($es->over->next_by_mid($mid, \$id, \$prv), undef, + 'tmpmsg gone from over'); + $id = $prv = undef; + is($es->over->next_by_mid('testmessage@example.com', \$id, \$prv), + undef, 'remaining message not indavderover'); + $mset = $es->search->mset('z:0..'); + is($mset->size, $nr - 1, 'existing messages not clobbered from search'); + my $o = $es->over->{dbh}->selectall_arrayref(<size, 'over row count matches Xapian'); + my $x = $es->over->{dbh}->selectall_arrayref(<