From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <e@80x24.org>
X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on dcvr.yhbt.net
X-Spam-Level: 
X-Spam-ASN:  
X-Spam-Status: No, score=-4.2 required=3.0 tests=ALL_TRUSTED,AWL,BAYES_00,
	DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,DKIM_VALID_EF,
	T_SCC_BODY_TEXT_LINE shortcircuit=no autolearn=ham autolearn_force=no
	version=3.4.6
Received: from localhost (dcvr.yhbt.net [127.0.0.1])
	by dcvr.yhbt.net (Postfix) with ESMTP id E2C921F406
	for <meta@public-inbox.org>; Thu, 16 Nov 2023 11:00:20 +0000 (UTC)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=80x24.org;
	s=selector1; t=1700132421;
	bh=VRl/ebgFUQ44O3KWH9UB/WNfftvQLMdKkU8XXaoDHLk=;
	h=From:To:Subject:Date:From;
	b=uUvZqamI8LDWjBiW/Tb8whVuYPLtK6rOjBHg47j6J70g9FBgEg7gVJ4Ek/urbb3rg
	 9Rk8ecq3BVg+pInlJ1wn8HOREeWdrOdjmGYGYBdSAL8vNF4xthZBe/xPoMgeNrLEZF
	 l1d57jOEiJf20kXpQhmZf1FFq3Z1Z8979Z2/jrzU=
From: Eric Wong <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH] extindex: warn and hint about --gc on bad ibx_id
Date: Thu, 16 Nov 2023 11:00:20 +0000
Message-Id: <20231116110020.1224857-1-e@80x24.org>
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit
List-Id: <meta.public-inbox.org>

Stale entries from newsgroup name changes (including adding
a `publicinbox.<name>.newsgroup' entry when none existed
before) can wreak havoc during a --reindex.  So warn users and
hint that running -extindex with --gc may be required to clean
up stale entries.
---
 Documentation/public-inbox-extindex.pod |  5 +++--
 lib/PublicInbox/ExtSearchIdx.pm         | 29 +++++++++++++++++++++----
 2 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/Documentation/public-inbox-extindex.pod b/Documentation/public-inbox-extindex.pod
index fbb12fe9..be4ea4de 100644
--- a/Documentation/public-inbox-extindex.pod
+++ b/Documentation/public-inbox-extindex.pod
@@ -50,8 +50,9 @@ significant space savings on Xapian indices.
 =item --gc
 
 Perform garbage collection instead of indexing.  Use this if
-inboxes are removed from the extindex, or if messages are
-purged or removed from some inboxes.
+inboxes are removed from the extindex, a newsgroup name is
+set or changed, or if messages are purged or removed from
+some inboxes.
 
 =item --reindex
 
diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm
index 6856ae66..7b7436ea 100644
--- a/lib/PublicInbox/ExtSearchIdx.pm
+++ b/lib/PublicInbox/ExtSearchIdx.pm
@@ -113,11 +113,30 @@ sub check_batch_limit ($) {
 	${$req->{need_checkpoint}} = 1 if $n >= $self->{batch_bytes};
 }
 
+sub bad_ibx_id ($$;$) { # report an ibx_id lacking an id2pos slot via $cb
+	my ($self, $ibx_id, $cb) = @_;
+	my $msg = "E: bad/stale ibx_id=#$ibx_id encountered";
+	my $ekey = $self->{oidx}->dbh->selectrow_array(<<EOM, undef, $ibx_id);
+SELECT eidx_key FROM inboxes WHERE ibx_id = ? LIMIT 1
+EOM
+	$msg .= " (formerly `$ekey')" if defined $ekey; # row still exists
+	$cb //= \&carp; # warn by default; callers may pass \&croak to die
+	$cb->($msg, "\nE: running $0 --gc may be required");
+}
+
+sub check_xr3 ($$$) { # ($self, \%id2pos, \@xr3): prune @$xr3 in place
+	my ($self, $id2pos, $xr3) = @_;
+	# carp (via bad_ibx_id) returns true; `&& 0' lets grep drop the row
+	@$xr3 = grep { defined($id2pos->{$_->[0]}) ||
+		(bad_ibx_id($self, $_->[0]) && 0) } @$xr3;
+}
+
 sub apply_boost ($$) {
 	my ($req, $smsg) = @_;
 	my $id2pos = $req->{id2pos}; # index in ibx_sorted
 	my $xr3 = $req->{self}->{oidx}->get_xref3($smsg->{num}, 1);
-	@$xr3 = sort {
+	check_xr3($req->{self}, $id2pos, $xr3);
+	@$xr3 = sort { # sort ascending
 		$id2pos->{$a->[0]} <=> $id2pos->{$b->[0]}
 				||
 		$a->[1] <=> $b->[1] # break ties with {xnum}
@@ -513,8 +532,9 @@ sub eidx_gc {
 
 sub _ibx_for ($$$) {
 	my ($self, $sync, $smsg) = @_;
-	my $ibx_id = delete($smsg->{ibx_id}) // die '{ibx_id} unset';
-	my $pos = $sync->{id2pos}->{$ibx_id} // die "$ibx_id no pos";
+	my $ibx_id = delete($smsg->{ibx_id}) // die 'BUG: {ibx_id} unset';
+	my $pos = $sync->{id2pos}->{$ibx_id} //
+		bad_ibx_id($self, $ibx_id, \&croak); # dies w/ --gc hint
 	$self->{-ibx_ary_known}->[$pos] //
 		die "BUG: ibx for $smsg->{blob} not mapped"
 }
@@ -657,7 +677,8 @@ BUG? #$docid $smsg->{blob} is not referenced by inboxes during reindex
 	# hit the common case in _reindex_finalize without rereading
 	# from git (or holding multiple messages in memory).
 	my $id2pos = $sync->{id2pos}; # index in ibx_sorted
-	@$xr3 = sort {
+	check_xr3($self, $id2pos, $xr3);
+	@$xr3 = sort { # sort descending
 		$id2pos->{$b->[0]} <=> $id2pos->{$a->[0]}
 				||
 		$b->[1] <=> $a->[1] # break ties with {xnum}