* [PATCH] extindex: --gc removes messages from over, too
2021-08-30 20:17 ` Konstantin Ryabitsev
2021-08-30 20:22 ` Eric Wong
@ 2021-09-01 0:17 ` Eric Wong
1 sibling, 0 replies; 5+ messages in thread
From: Eric Wong @ 2021-09-01 0:17 UTC (permalink / raw)
To: Konstantin Ryabitsev; +Cc: meta
Konstantin Ryabitsev <konstantin@linuxfoundation.org> wrote:
> This works, though there are some interesting side-effects to it. For example,
> I removed the /gitolite-transparency-log/* feed (it was too noisy and wasn't
> useful for anything, really). After running the --gc, I see lots of messages
> about things being removed, but /all/ still contains leftover entries from the
> inbox:
>
> https://x-lore.kernel.org/all/?t=20210830182453
>
> (look for "post-receive:" in the page)
>
> Clicking on the link returns a thread with blank messages (but not all of
> them).
>
> This is not a huge problem, as the messages are for sure gone from search
> results, but the behaviour is a bit odd.
Thanks, that was a bug, but not a data-loss one (nor the biting
ones that get worse every summer...):
----------8<---------
Subject: [PATCH] extindex: --gc removes messages from over, too
While messages from removed inboxes were removed from Xapian
search, --gc failed to remove messages from over.sqlite3
entirely. They no longer show up in the topic summary view.
Reported-by: Konstantin Ryabitsev <konstantin@linuxfoundation.org>
Link: https://public-inbox.org/20210830201723.dehoul4y6gpqf2cp@nitro.local/
---
lib/PublicInbox/ExtSearchIdx.pm | 19 +++++-----
t/extsearch.t | 61 +++++++++++++++++++++++++++++++++
2 files changed, 72 insertions(+), 8 deletions(-)
diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm
index cf61237c..8cdad23d 100644
--- a/lib/PublicInbox/ExtSearchIdx.pm
+++ b/lib/PublicInbox/ExtSearchIdx.pm
@@ -343,20 +343,18 @@ sub _sync_inbox ($$$) {
sub gc_unref_doc ($$$$) {
my ($self, $ibx_id, $eidx_key, $docid) = @_;
- my $dbh = $self->{oidx}->dbh;
-
+ my $remain = 0;
# for debug/info purposes, oids may no longer be accessible
+ my $dbh = $self->{oidx}->dbh;
my $sth = $dbh->prepare_cached(<<'', undef, 1);
SELECT oidbin FROM xref3 WHERE docid = ? AND ibx_id = ?
$sth->execute($docid, $ibx_id);
my @oid = map { unpack('H*', $_->[0]) } @{$sth->fetchall_arrayref};
-
- $dbh->prepare_cached(<<'')->execute($docid, $ibx_id);
-DELETE FROM xref3 WHERE docid = ? AND ibx_id = ?
-
- my $remain = $self->{oidx}->get_xref3($docid);
- if (scalar(@$remain)) {
+ for my $oid (@oid) {
+ $remain += $self->{oidx}->remove_xref3($docid, $oid, $eidx_key);
+ }
+ if ($remain) {
$self->{oidx}->eidxq_add($docid); # enqueue for reindex
for my $oid (@oid) {
warn "I: unref #$docid $eidx_key $oid\n";
@@ -421,6 +419,11 @@ DELETE FROM xref3 WHERE docid NOT IN (SELECT num FROM over)
warn "I: eliminated $nr stale xref3 entries\n" if $nr != 0;
+ # fixup from old bugs:
+ $nr = $dbh->do(<<'');
+DELETE FROM over WHERE num NOT IN (SELECT docid FROM xref3)
+
+ warn "I: eliminated $nr stale over entries\n" if $nr != 0;
done($self);
}
diff --git a/t/extsearch.t b/t/extsearch.t
index b03adc17..ad4f2c6d 100644
--- a/t/extsearch.t
+++ b/t/extsearch.t
@@ -466,4 +466,65 @@ SKIP: {
'--gc works after compact');
}
+{ # ensure --gc removes non-xposted messages
+ my $old_size = -s $cfg_path // xbail "stat $cfg_path $!";
+ my $tmp_addr = 'v2tmp@example.com';
+ run_script([qw(-init v2tmp --indexlevel basic
+ --newsgroup v2tmp.example),
+ "$home/v2tmp", 'http://example.com/v2tmp', $tmp_addr ])
+ or xbail '-init';
+ $env = { ORIGINAL_RECIPIENT => $tmp_addr };
+ open $fh, '+>', undef or xbail "open $!";
+ $fh->autoflush(1);
+ my $mid = 'tmpmsg@example.com';
+ print $fh <<EOM or xbail "print $!";
+From: b\@z
+To: b\@r
+Message-Id: <$mid>
+Subject: tmpmsg
+Date: Tue, 19 Jan 2038 03:14:07 +0000
+
+EOM
+ seek $fh, 0, SEEK_SET or xbail "seek $!";
+ run_script([qw(-mda --no-precheck)], $env, {0 => $fh}) or xbail '-mda';
+ ok(run_script([qw(-extindex --all), "$home/extindex"]), 'update');
+ my $nr;
+ {
+ my $es = PublicInbox::ExtSearch->new("$home/extindex");
+ my ($id, $prv);
+ my $smsg = $es->over->next_by_mid($mid, \$id, \$prv);
+ ok($smsg, 'tmpmsg indexed');
+ my $mset = $es->search->mset("mid:$mid");
+ is($mset->size, 1, 'new message found');
+ $mset = $es->search->mset('z:0..');
+ $nr = $mset->size;
+ }
+ truncate($cfg_path, $old_size) or xbail "truncate $!";
+ my $rdr = { 2 => \(my $err) };
+ ok(run_script([qw(-extindex --gc), "$home/extindex"], undef, $rdr),
+ 'gc to get rid of removed inbox');
+ is_deeply([ grep(!/^(?:I:|#)/, split(/^/m, $err)) ], [],
+ 'no non-informational errors in stderr');
+
+ my $es = PublicInbox::ExtSearch->new("$home/extindex");
+ my $mset = $es->search->mset("mid:$mid");
+ is($mset->size, 0, 'tmpmsg gone from search');
+ my ($id, $prv);
+ is($es->over->next_by_mid($mid, \$id, \$prv), undef,
+ 'tmpmsg gone from over');
+ $id = $prv = undef;
+ is($es->over->next_by_mid('testmessage@example.com', \$id, \$prv),
+ undef, 'remaining message not indavderover');
+ $mset = $es->search->mset('z:0..');
+ is($mset->size, $nr - 1, 'existing messages not clobbered from search');
+ my $o = $es->over->{dbh}->selectall_arrayref(<<EOM);
+SELECT num FROM over ORDER BY num
+EOM
+ is(scalar(@$o), $mset->size, 'over row count matches Xapian');
+ my $x = $es->over->{dbh}->selectall_arrayref(<<EOM);
+SELECT DISTINCT(docid) FROM xref3 ORDER BY docid
+EOM
+ is_deeply($x, $o, 'xref3 and over docids match');
+}
+
done_testing;
^ permalink raw reply related [flat|nested] 5+ messages in thread