unofficial mirror of meta@public-inbox.org
 help / color / mirror / Atom feed
From: Eric Wong <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH 11/11] extindex: support per-inbox indexheader+altid
Date: Sat, 10 Aug 2024 09:00:12 +0000	[thread overview]
Message-ID: <20240810090012.23269-12-e@80x24.org> (raw)
In-Reply-To: <20240810090012.23269-1-e@80x24.org>

This allows the venerable altid (e.g. gmane:1234) to finally
work for extindex users.  The newer indexheader directive works
here, too.  This allows a multi-inbox extindex to fully emulate
the capabilities of per-inbox Xapian indices.

For now, per-inbox indexheader and altid DO NOT work when
searching the extindex directly.  In other words, gmane:1234
might work on the /git/ inbox, but not the /all/ extindex
virtual inbox.  This may remain the case since altid is
typically per-inbox only, while headers like X-Archives-Hash
can be indexed globally across inboxes via extindex.*.indexheader.
---
 lib/PublicInbox/Config.pm       |  2 +-
 lib/PublicInbox/ExtSearchIdx.pm |  1 +
 lib/PublicInbox/Isearch.pm      | 14 ++++-
 lib/PublicInbox/SearchIdx.pm    | 20 +++++--
 t/extsearch.t                   | 98 +++++++++++++++++++++++++++++----
 5 files changed, 117 insertions(+), 18 deletions(-)

diff --git a/lib/PublicInbox/Config.pm b/lib/PublicInbox/Config.pm
index b40e96f1..cda3045e 100644
--- a/lib/PublicInbox/Config.pm
+++ b/lib/PublicInbox/Config.pm
@@ -571,7 +571,7 @@ sub _fill_ei ($$) {
 	}
 	return unless valid_foo_name($name, 'extindex');
 	$es->{name} = $name;
-	$es->load_extra_indexers($es);
+	$es->load_extra_indexers($es); # extindex.*.{altid,indexheader}
 	$es;
 }
 
diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm
index 094821a3..cead0f8a 100644
--- a/lib/PublicInbox/ExtSearchIdx.pm
+++ b/lib/PublicInbox/ExtSearchIdx.pm
@@ -21,6 +21,7 @@ use Carp qw(croak carp);
 use Scalar::Util qw(blessed);
 use Sys::Hostname qw(hostname);
 use File::Glob qw(bsd_glob GLOB_NOSORT);
+use PublicInbox::Isearch;
 use PublicInbox::MultiGit;
 use PublicInbox::Spawn ();
 use PublicInbox::Search;
diff --git a/lib/PublicInbox/Isearch.pm b/lib/PublicInbox/Isearch.pm
index 9566f710..5f22c2f2 100644
--- a/lib/PublicInbox/Isearch.pm
+++ b/lib/PublicInbox/Isearch.pm
@@ -11,7 +11,11 @@ use PublicInbox::Search;
 
 sub new {
 	my (undef, $ibx, $es) = @_;
-	bless { es => $es, eidx_key => $ibx->eidx_key }, __PACKAGE__;
+	my $self = bless { es => $es, eidx_key => $ibx->eidx_key }, __PACKAGE__;
+	# load publicinbox.*.{altid,indexheader}
+	PublicInbox::Search::load_extra_indexers($self, $ibx);
+	push @{$self->{-extra}}, @{$es->{-extra} // []} if $self->{-extra};
+	$self;
 }
 
 sub _ibx_id ($) {
@@ -55,14 +59,22 @@ SELECT MAX(docid) FROM xref3 WHERE ibx_id = ? AND xnum >= ? AND xnum <= ?
 	\%opt;
 }
 
+sub _isrch_qparse ($) {
+	my ($self) = @_;
+	local $self->{es}->{-extra} = $self->{-extra};
+	$self->{es}->qparse_new; # XXX worth memoizing?
+}
+
 sub mset {
 	my ($self, $str, $opt) = @_;
+	local $self->{es}->{qp} = _isrch_qparse($self) if $self->{-extra};
 	$self->{es}->mset($str, eidx_mset_prep $self, $opt);
 }
 
 sub async_mset {
 	my ($self, $str, $opt, $cb, @args) = @_;
 	$opt = eidx_mset_prep $self, $opt;
+	local $self->{es}->{-extra} = $self->{-extra} if $self->{-extra};
 	$self->{es}->async_mset($str, $opt, $cb, @args);
 }
 
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 53c16e55..7829c7d4 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -475,9 +475,8 @@ sub eml2doc ($$$;$) {
 	term_generator($self)->set_document($doc);
 	index_headers($self, $smsg);
 
-	if (defined(my $eidx_key = $smsg->{eidx_key})) {
-		$doc->add_boolean_term('O'.$eidx_key) if $eidx_key ne '.';
-	}
+	my $ekey = $smsg->{eidx_key};
+	$doc->add_boolean_term('O'.$ekey) if ($ekey // '.') ne '.';
 	msg_iter($eml, \&index_xapian, [ $self, $doc ]);
 	index_ids($self, $doc, $eml, $mids);
 
@@ -491,9 +490,10 @@ sub eml2doc ($$$;$) {
 		my $data = $smsg->to_doc_data;
 		$doc->set_data($data);
 	}
-
-	for my $extra (@{$self->{-extra} // []}) {
-		$extra->index_extra($self, $eml, $mids);
+	my $xtra = defined $ekey ? $self->{"-extra\t$ekey"} : undef;
+	$xtra //= $self->{-extra};
+	for my $e (@$xtra) {
+		$e->index_extra($self, $eml, $mids);
 	}
 	$doc;
 }
@@ -1170,6 +1170,14 @@ sub eidx_shard_new {
 	}, $class;
 	$self->{-set_indexlevel_once} = 1 if $self->{indexlevel} eq 'medium';
 	$self->load_extra_indexers($eidx);
+	require PublicInbox::Isearch;
+	my $all = $self->{-extra};
+	for my $ibx (@{$eidx->{ibx_active} // []}) {
+		my $isrch = PublicInbox::Isearch->new($ibx);
+		my $per_ibx = $isrch->{-extra} // next;
+		$self->{"-extra\t$isrch->{eidx_key}"} =
+					$all ? [ @$per_ibx, @$all ] : $per_ibx;
+	}
 	$self;
 }
 
diff --git a/t/extsearch.t b/t/extsearch.t
index 0ea5bc5b..28c43763 100644
--- a/t/extsearch.t
+++ b/t/extsearch.t
@@ -593,6 +593,7 @@ test_lei(sub {
 		'noted unindexed extindex is unsupported');
 });
 
+require PublicInbox::XhcMset;
 if ('indexheader support') {
 	xsys_e [qw(git config extindex.all.indexheader
 		boolean_term:xarchiveshash:X-Archives-Hash)],
@@ -608,20 +609,97 @@ if ('indexheader support') {
 	$es = PublicInbox::Config->new($cfg_path)->ALL;
 	my $mset = $es->mset('xarchiveshash:deadbeefcafe');
 	is $mset->size, 1, 'extindex.*.indexheader works';
-	local $PublicInbox::Search::XHC = eval {
-		require PublicInbox::XhcMset;
-		PublicInbox::XapClient::start_helper('-j0');
-	} or xbail "no XHC: $@";
+	local $PublicInbox::Search::XHC =
+			PublicInbox::XapClient::start_helper('-j0') or
+			xbail "no XHC: $@";
 	my @args;
 	$es->async_mset('xarchiveshash:deadbeefcafe', {} , sub { @args = @_ });
-	is scalar(@args), 2, 'no extra args on hit';
-	is $args[0]->size, 1, 'async mset hit works';
-	ok !$args[1], 'no error on hit';
+	is scalar(@args), 2, 'no extra args on xarchiveshash hit';
+	is $args[0]->size, 1, 'async mset xarchiveshash hit works';
+	ok !$args[1], 'no error on xarchiveshash hit';
 	@args = ();
 	$es->async_mset('xarchiveshash:cafebeefdead', {} , sub { @args = @_ });
-	is scalar(@args), 2, 'no extra args on miss';
-	is $args[0]->size, 0, 'async mset miss works';
-	ok !$args[1], 'no error on miss';
+	is scalar(@args), 2, 'no extra args on xarchiveshash miss';
+	is $args[0]->size, 0, 'async mset xarchivehash miss works';
+	ok !$args[1], 'no error on xarchiveshash miss';
+}
+
+if ('per-inbox altid w/ extindex') {
+	my $another = 'another-nntp.sqlite3';
+	my $altid = [ "serial:gmane:file=$another" ];
+	my $aibx = create_inbox 'v2', version => 2, indexlevel => 'basic',
+				altid => $altid, sub {
+		my ($im, $ibx) = @_;
+		my $mm = PublicInbox::Msgmap->new_file(
+					"$ibx->{inboxdir}/$another", 2);
+		$mm->mid_set(1234, 'a@example.com') == 1 or xbail 'mid_set';
+		$im->add(PublicInbox::Eml->new(<<'EOF')) or BAIL_OUT;
+From: a@example.com
+To: b@example.com
+Subject: boo!
+Message-ID: <a@example.com>
+X-Archives-Hash: dadfad
+Organization: felonious feline family
+
+hello world gmane:666
+EOF
+	};
+	PublicInbox::IO::write_file '>>', $cfg_path, <<EOF;
+[publicinbox "altid-test"]
+	inboxdir = $aibx->{inboxdir}
+	address = b\@example.com
+	altid = $altid->[0]
+	indexheader = phrase:organization:Organization
+EOF
+	ok run_script([qw(-extindex --all -vvv), $eidxdir]),
+		'extindex update w/ altid';
+	local $PublicInbox::Search::XHC =
+			PublicInbox::XapClient::start_helper('-j0') or
+			xbail "no XHC: $@";
+	my @args;
+	my $pi_cfg = PublicInbox::Config->new($cfg_path);
+	my $ibx = $pi_cfg->lookup('b@example.com');
+	my $mset = $ibx->isrch->mset('gmane:1234');
+
+	is $mset->size, 1, 'isrch->mset altid hit';
+	$ibx->isrch->async_mset('gmane:1234', {} , sub { @args = @_ });
+	is scalar(@args), 2, 'no extra args on altid hit';
+	is $args[0]->size, 1, 'isrch->async_mset altid hit';
+
+	$mset = $ibx->isrch->mset('organization:felonious');
+	is $mset->size, 1, 'isrch->mset indexheader hit';
+	@args = ();
+	$ibx->isrch->async_mset('organization:felonious', {} , sub { @args = @_ });
+	is scalar(@args), 2, 'no extra args on indexheader hit';
+	is $args[0]->size, 1, 'isrch->async_mset indexheader hit';
+
+	$mset = $ibx->isrch->mset('organization:world');
+	is $mset->size, 0, 'isrch->mset indexheader miss';
+	@args = ();
+	$ibx->isrch->async_mset('organization:world', {} , sub { @args = @_ });
+	is scalar(@args), 2, 'no extra args on indexheader miss';
+	is $args[0]->size, 0, 'isrch->async_mset indexheader miss';
+
+	$mset = $ibx->isrch->mset('xarchiveshash:deadbeefcafe');
+	is $mset->size, 0, 'isrch->mset does not cross inbox on indexheader';
+	$mset = $ibx->isrch->mset('xarchiveshash:dadfad');
+	is $mset->size, 1, 'isrch->mset hits global indexheader';
+
+	$es = $pi_cfg->ALL;
+	$mset = $es->mset('xarchiveshash:dadfad');
+	is $mset->size, 1, 'esrch->mset global indexheader hit';
+	$mset = $es->mset('gmane:1234');
+	is $mset->size, 1, '->mset altid hit works globally';
+
+	$mset = $es->mset('gmane:666');
+	is $mset->size, 0, 'global ->mset hits';
+	$mset = $ibx->isrch->mset('gmane:666');
+	is $mset->size, 0, 'isrch->mset altid miss works';
+
+	@args = ();
+	$ibx->isrch->async_mset('gmane:666', {} , sub { @args = @_ });
+	is scalar(@args), 2, 'no extra args on altid miss';
+	is $args[0]->size, 0, 'isrch->async_mset altid miss works';
 }
 
 done_testing;

  parent reply	other threads:[~2024-08-10  9:00 UTC|newest]

Thread overview: 13+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-08-10  9:00 [PATCH 00/11] indexheader + altid enhancements Eric Wong
2024-08-10  9:00 ` [PATCH 01/11] search: support per-inbox indexheader directive Eric Wong
2024-08-10  9:00 ` [PATCH 02/11] indexheader: deduplicate common values Eric Wong
2024-08-10  9:00 ` [PATCH 03/11] search: help: avoid ':' in user prefixes Eric Wong
2024-08-10  9:00 ` [PATCH 04/11] search: move QueryParser mappings to xh_args Eric Wong
2024-08-10  9:00 ` [PATCH 05/11] www_text: show indexheader contents in help Eric Wong
2024-08-10  9:00 ` [PATCH 06/11] www: don't memoize ->user_help contents Eric Wong
2024-08-10  9:00 ` [PATCH 07/11] extindex: avoid branch in ->index_eml Eric Wong
2024-08-10  9:00 ` [PATCH 08/11] t/extsearch: use autodie to detect chmod failures Eric Wong
2024-08-10  9:00 ` [PATCH 09/11] t/extsearch: use xsys_e to detect errors Eric Wong
2024-08-10  9:00 ` [PATCH 10/11] extindex: support extindex.*.indexheader Eric Wong
2024-08-10  9:00 ` Eric Wong [this message]
2024-08-12 13:55 ` [PATCH 00/11] indexheader + altid enhancements Konstantin Ryabitsev

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20240810090012.23269-12-e@80x24.org \
    --to=e@80x24.org \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

  Be sure your reply has a Subject: header at the top and a blank
  line before the message body.
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as for URLs of read-only IMAP folder(s) and NNTP newsgroup(s).