* [PATCH] extindex: warn and hint about --gc on bad ibx_id
@ 2023-11-16 11:00 Eric Wong
0 siblings, 0 replies; only message in thread
From: Eric Wong @ 2023-11-16 11:00 UTC (permalink / raw)
To: meta
Stale entries from newsgroup name changes (including adding
a `publicinbox.<name>.newsgroup' entry when none existed
before) can wreak havoc during a --reindex. So hint to users
that running -extindex with --gc will clean up the stale
entries.
---
Documentation/public-inbox-extindex.pod | 5 +++--
lib/PublicInbox/ExtSearchIdx.pm | 29 +++++++++++++++++++++----
2 files changed, 28 insertions(+), 6 deletions(-)
diff --git a/Documentation/public-inbox-extindex.pod b/Documentation/public-inbox-extindex.pod
index fbb12fe9..be4ea4de 100644
--- a/Documentation/public-inbox-extindex.pod
+++ b/Documentation/public-inbox-extindex.pod
@@ -50,8 +50,9 @@ significant space savings on Xapian indices.
=item --gc
Perform garbage collection instead of indexing. Use this if
-inboxes are removed from the extindex, or if messages are
-purged or removed from some inboxes.
+inboxes are removed from the extindex, a newsgroup name is
+set or changed, or if messages are purged or removed from
+some inboxes.
=item --reindex
diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm
index 6856ae66..7b7436ea 100644
--- a/lib/PublicInbox/ExtSearchIdx.pm
+++ b/lib/PublicInbox/ExtSearchIdx.pm
@@ -113,11 +113,30 @@ sub check_batch_limit ($) {
${$req->{need_checkpoint}} = 1 if $n >= $self->{batch_bytes};
}
+sub bad_ibx_id ($$;$) {
+ my ($self, $ibx_id, $cb) = @_;
+ my $msg = "E: bad/stale ibx_id=#$ibx_id encountered";
+ my $ekey = $self->{oidx}->dbh->selectrow_array(<<EOM, undef, $ibx_id);
+SELECT eidx_key FROM inboxes WHERE ibx_id = ? LIMIT 1
+EOM
+ $msg .= " (formerly `$ekey')" if defined $ekey;
+ $cb //= \&carp;
+ $cb->($msg, "\nE: running $0 --gc may be required");
+}
+
+sub check_xr3 ($$$) {
+ my ($self, $id2pos, $xr3) = @_;
+ @$xr3 = grep {
+ defined($id2pos->{$_->[0]}) ? 1 : bad_ibx_id($self, $_->[0])
+ } @$xr3;
+}
+
sub apply_boost ($$) {
my ($req, $smsg) = @_;
my $id2pos = $req->{id2pos}; # index in ibx_sorted
my $xr3 = $req->{self}->{oidx}->get_xref3($smsg->{num}, 1);
- @$xr3 = sort {
+ check_xr3($req->{self}, $id2pos, $xr3);
+ @$xr3 = sort { # sort ascending
$id2pos->{$a->[0]} <=> $id2pos->{$b->[0]}
||
$a->[1] <=> $b->[1] # break ties with {xnum}
@@ -513,8 +532,9 @@ sub eidx_gc {
sub _ibx_for ($$$) {
my ($self, $sync, $smsg) = @_;
- my $ibx_id = delete($smsg->{ibx_id}) // die '{ibx_id} unset';
- my $pos = $sync->{id2pos}->{$ibx_id} // die "$ibx_id no pos";
+ my $ibx_id = delete($smsg->{ibx_id}) // die 'BUG: {ibx_id} unset';
+ my $pos = $sync->{id2pos}->{$ibx_id} //
+ bad_ibx_id($self, $ibx_id, \&croak);
$self->{-ibx_ary_known}->[$pos] //
die "BUG: ibx for $smsg->{blob} not mapped"
}
@@ -657,7 +677,8 @@ BUG? #$docid $smsg->{blob} is not referenced by inboxes during reindex
# hit the common case in _reindex_finalize without rereading
# from git (or holding multiple messages in memory).
my $id2pos = $sync->{id2pos}; # index in ibx_sorted
- @$xr3 = sort {
+ check_xr3($self, $id2pos, $xr3);
+ @$xr3 = sort { # sort descending
$id2pos->{$b->[0]} <=> $id2pos->{$a->[0]}
||
$b->[1] <=> $a->[1] # break ties with {xnum}
^ permalink raw reply related [flat|nested] only message in thread
only message in thread, other threads:[~2023-11-16 11:00 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-11-16 11:00 [PATCH] extindex: warn and hint about --gc on bad ibx_id Eric Wong
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).