From: Eric Wong <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH 11/11] extindex: support per-inbox indexheader+altid
Date: Sat, 10 Aug 2024 09:00:12 +0000 [thread overview]
Message-ID: <20240810090012.23269-12-e@80x24.org> (raw)
In-Reply-To: <20240810090012.23269-1-e@80x24.org>
This allows the venerable altid (e.g. gmane:1234) to finally
work for extindex users. The newer indexheader directive works
here, too. This allows a multi-inbox extindex to fully emulate
the capabilities of per-inbox Xapian indices.
For now, per-inbox indexheader and altid DO NOT work when
searching the extindex directly. In other words, gmane:1234
might work on the /git/ inbox, but not the /all/ extindex
virtual inbox. This may remain the case, since altid is
typically meaningful only per-inbox, whereas headers such as
X-Archives-Hash can be indexed globally across inboxes
(via extindex.*.indexheader).
---
lib/PublicInbox/Config.pm | 2 +-
lib/PublicInbox/ExtSearchIdx.pm | 1 +
lib/PublicInbox/Isearch.pm | 14 ++++-
lib/PublicInbox/SearchIdx.pm | 20 +++++--
t/extsearch.t | 98 +++++++++++++++++++++++++++++----
5 files changed, 117 insertions(+), 18 deletions(-)
diff --git a/lib/PublicInbox/Config.pm b/lib/PublicInbox/Config.pm
index b40e96f1..cda3045e 100644
--- a/lib/PublicInbox/Config.pm
+++ b/lib/PublicInbox/Config.pm
@@ -571,7 +571,7 @@ sub _fill_ei ($$) {
}
return unless valid_foo_name($name, 'extindex');
$es->{name} = $name;
- $es->load_extra_indexers($es);
+ $es->load_extra_indexers($es); # extindex.*.{altid,indexheader}
$es;
}
diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm
index 094821a3..cead0f8a 100644
--- a/lib/PublicInbox/ExtSearchIdx.pm
+++ b/lib/PublicInbox/ExtSearchIdx.pm
@@ -21,6 +21,7 @@ use Carp qw(croak carp);
use Scalar::Util qw(blessed);
use Sys::Hostname qw(hostname);
use File::Glob qw(bsd_glob GLOB_NOSORT);
+use PublicInbox::Isearch;
use PublicInbox::MultiGit;
use PublicInbox::Spawn ();
use PublicInbox::Search;
diff --git a/lib/PublicInbox/Isearch.pm b/lib/PublicInbox/Isearch.pm
index 9566f710..5f22c2f2 100644
--- a/lib/PublicInbox/Isearch.pm
+++ b/lib/PublicInbox/Isearch.pm
@@ -11,7 +11,11 @@ use PublicInbox::Search;
sub new {
my (undef, $ibx, $es) = @_;
- bless { es => $es, eidx_key => $ibx->eidx_key }, __PACKAGE__;
+ my $self = bless { es => $es, eidx_key => $ibx->eidx_key }, __PACKAGE__;
+ # load publicinbox.*.{altid,indexheader}
+ PublicInbox::Search::load_extra_indexers($self, $ibx);
+ push @{$self->{-extra}}, @{$es->{-extra} // []} if $self->{-extra};
+ $self;
}
sub _ibx_id ($) {
@@ -55,14 +59,22 @@ SELECT MAX(docid) FROM xref3 WHERE ibx_id = ? AND xnum >= ? AND xnum <= ?
\%opt;
}
+sub _isrch_qparse ($) {
+ my ($self) = @_;
+ local $self->{es}->{-extra} = $self->{-extra};
+ $self->{es}->qparse_new; # XXX worth memoizing?
+}
+
sub mset {
my ($self, $str, $opt) = @_;
+ local $self->{es}->{qp} = _isrch_qparse($self) if $self->{-extra};
$self->{es}->mset($str, eidx_mset_prep $self, $opt);
}
sub async_mset {
my ($self, $str, $opt, $cb, @args) = @_;
$opt = eidx_mset_prep $self, $opt;
+ local $self->{es}->{-extra} = $self->{-extra} if $self->{-extra};
$self->{es}->async_mset($str, $opt, $cb, @args);
}
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 53c16e55..7829c7d4 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -475,9 +475,8 @@ sub eml2doc ($$$;$) {
term_generator($self)->set_document($doc);
index_headers($self, $smsg);
- if (defined(my $eidx_key = $smsg->{eidx_key})) {
- $doc->add_boolean_term('O'.$eidx_key) if $eidx_key ne '.';
- }
+ my $ekey = $smsg->{eidx_key};
+ $doc->add_boolean_term('O'.$ekey) if ($ekey // '.') ne '.';
msg_iter($eml, \&index_xapian, [ $self, $doc ]);
index_ids($self, $doc, $eml, $mids);
@@ -491,9 +490,10 @@ sub eml2doc ($$$;$) {
my $data = $smsg->to_doc_data;
$doc->set_data($data);
}
-
- for my $extra (@{$self->{-extra} // []}) {
- $extra->index_extra($self, $eml, $mids);
+ my $xtra = defined $ekey ? $self->{"-extra\t$ekey"} : undef;
+ $xtra //= $self->{-extra};
+ for my $e (@$xtra) {
+ $e->index_extra($self, $eml, $mids);
}
$doc;
}
@@ -1170,6 +1170,14 @@ sub eidx_shard_new {
}, $class;
$self->{-set_indexlevel_once} = 1 if $self->{indexlevel} eq 'medium';
$self->load_extra_indexers($eidx);
+ require PublicInbox::Isearch;
+ my $all = $self->{-extra};
+ for my $ibx (@{$eidx->{ibx_active} // []}) {
+ my $isrch = PublicInbox::Isearch->new($ibx);
+ my $per_ibx = $isrch->{-extra} // next;
+ $self->{"-extra\t$isrch->{eidx_key}"} =
+ $all ? [ @$per_ibx, @$all ] : $per_ibx;
+ }
$self;
}
diff --git a/t/extsearch.t b/t/extsearch.t
index 0ea5bc5b..28c43763 100644
--- a/t/extsearch.t
+++ b/t/extsearch.t
@@ -593,6 +593,7 @@ test_lei(sub {
'noted unindexed extindex is unsupported');
});
+require PublicInbox::XhcMset;
if ('indexheader support') {
xsys_e [qw(git config extindex.all.indexheader
boolean_term:xarchiveshash:X-Archives-Hash)],
@@ -608,20 +609,97 @@ if ('indexheader support') {
$es = PublicInbox::Config->new($cfg_path)->ALL;
my $mset = $es->mset('xarchiveshash:deadbeefcafe');
is $mset->size, 1, 'extindex.*.indexheader works';
- local $PublicInbox::Search::XHC = eval {
- require PublicInbox::XhcMset;
- PublicInbox::XapClient::start_helper('-j0');
- } or xbail "no XHC: $@";
+ local $PublicInbox::Search::XHC =
+ PublicInbox::XapClient::start_helper('-j0') or
+ xbail "no XHC: $@";
my @args;
$es->async_mset('xarchiveshash:deadbeefcafe', {} , sub { @args = @_ });
- is scalar(@args), 2, 'no extra args on hit';
- is $args[0]->size, 1, 'async mset hit works';
- ok !$args[1], 'no error on hit';
+ is scalar(@args), 2, 'no extra args on xarchiveshash hit';
+ is $args[0]->size, 1, 'async mset xarchiveshash hit works';
+ ok !$args[1], 'no error on xarchiveshash hit';
@args = ();
$es->async_mset('xarchiveshash:cafebeefdead', {} , sub { @args = @_ });
- is scalar(@args), 2, 'no extra args on miss';
- is $args[0]->size, 0, 'async mset miss works';
- ok !$args[1], 'no error on miss';
+ is scalar(@args), 2, 'no extra args on xarchiveshash miss';
+ is $args[0]->size, 0, 'async mset xarchivehash miss works';
+ ok !$args[1], 'no error on xarchiveshash miss';
+}
+
+if ('per-inbox altid w/ extindex') {
+ my $another = 'another-nntp.sqlite3';
+ my $altid = [ "serial:gmane:file=$another" ];
+ my $aibx = create_inbox 'v2', version => 2, indexlevel => 'basic',
+ altid => $altid, sub {
+ my ($im, $ibx) = @_;
+ my $mm = PublicInbox::Msgmap->new_file(
+ "$ibx->{inboxdir}/$another", 2);
+ $mm->mid_set(1234, 'a@example.com') == 1 or xbail 'mid_set';
+ $im->add(PublicInbox::Eml->new(<<'EOF')) or BAIL_OUT;
+From: a@example.com
+To: b@example.com
+Subject: boo!
+Message-ID: <a@example.com>
+X-Archives-Hash: dadfad
+Organization: felonious feline family
+
+hello world gmane:666
+EOF
+ };
+ PublicInbox::IO::write_file '>>', $cfg_path, <<EOF;
+[publicinbox "altid-test"]
+ inboxdir = $aibx->{inboxdir}
+ address = b\@example.com
+ altid = $altid->[0]
+ indexheader = phrase:organization:Organization
+EOF
+ ok run_script([qw(-extindex --all -vvv), $eidxdir]),
+ 'extindex update w/ altid';
+ local $PublicInbox::Search::XHC =
+ PublicInbox::XapClient::start_helper('-j0') or
+ xbail "no XHC: $@";
+ my @args;
+ my $pi_cfg = PublicInbox::Config->new($cfg_path);
+ my $ibx = $pi_cfg->lookup('b@example.com');
+ my $mset = $ibx->isrch->mset('gmane:1234');
+
+ is $mset->size, 1, 'isrch->mset altid hit';
+ $ibx->isrch->async_mset('gmane:1234', {} , sub { @args = @_ });
+ is scalar(@args), 2, 'no extra args on altid hit';
+ is $args[0]->size, 1, 'isrch->async_mset altid hit';
+
+ $mset = $ibx->isrch->mset('organization:felonious');
+ is $mset->size, 1, 'isrch->mset indexheader hit';
+ @args = ();
+ $ibx->isrch->async_mset('organization:felonious', {} , sub { @args = @_ });
+ is scalar(@args), 2, 'no extra args on indexheader hit';
+ is $args[0]->size, 1, 'isrch->async_mset indexheader hit';
+
+ $mset = $ibx->isrch->mset('organization:world');
+ is $mset->size, 0, 'isrch->mset indexheader miss';
+ @args = ();
+ $ibx->isrch->async_mset('organization:world', {} , sub { @args = @_ });
+ is scalar(@args), 2, 'no extra args on indexheader miss';
+ is $args[0]->size, 0, 'isrch->async_mset indexheader miss';
+
+ $mset = $ibx->isrch->mset('xarchiveshash:deadbeefcafe');
+ is $mset->size, 0, 'isrch->mset does not cross inbox on indexheader';
+ $mset = $ibx->isrch->mset('xarchiveshash:dadfad');
+ is $mset->size, 1, 'isrch->mset hits global indexheader';
+
+ $es = $pi_cfg->ALL;
+ $mset = $es->mset('xarchiveshash:dadfad');
+ is $mset->size, 1, 'esrch->mset global indexheader hit';
+ $mset = $es->mset('gmane:1234');
+ is $mset->size, 1, '->mset altid hit works globally';
+
+ $mset = $es->mset('gmane:666');
+ is $mset->size, 0, 'global ->mset hits';
+ $mset = $ibx->isrch->mset('gmane:666');
+ is $mset->size, 0, 'isrch->mset altid miss works';
+
+ @args = ();
+ $ibx->isrch->async_mset('gmane:666', {} , sub { @args = @_ });
+ is scalar(@args), 2, 'no extra args on altid miss';
+ is $args[0]->size, 0, 'isrch->async_mset altid miss works';
}
done_testing;
next prev parent reply other threads:[~2024-08-10 9:00 UTC|newest]
Thread overview: 13+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-08-10 9:00 [PATCH 00/11] indexheader + altid enhancements Eric Wong
2024-08-10 9:00 ` [PATCH 01/11] search: support per-inbox indexheader directive Eric Wong
2024-08-10 9:00 ` [PATCH 02/11] indexheader: deduplicate common values Eric Wong
2024-08-10 9:00 ` [PATCH 03/11] search: help: avoid ':' in user prefixes Eric Wong
2024-08-10 9:00 ` [PATCH 04/11] search: move QueryParser mappings to xh_args Eric Wong
2024-08-10 9:00 ` [PATCH 05/11] www_text: show indexheader contents in help Eric Wong
2024-08-10 9:00 ` [PATCH 06/11] www: don't memoize ->user_help contents Eric Wong
2024-08-10 9:00 ` [PATCH 07/11] extindex: avoid branch in ->index_eml Eric Wong
2024-08-10 9:00 ` [PATCH 08/11] t/extsearch: use autodie to detect chmod failures Eric Wong
2024-08-10 9:00 ` [PATCH 09/11] t/extsearch: use xsys_e to detect errors Eric Wong
2024-08-10 9:00 ` [PATCH 10/11] extindex: support extindex.*.indexheader Eric Wong
2024-08-10 9:00 ` Eric Wong [this message]
2024-08-12 13:55 ` [PATCH 00/11] indexheader + altid enhancements Konstantin Ryabitsev
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
List information: https://public-inbox.org/README
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20240810090012.23269-12-e@80x24.org \
--to=e@80x24.org \
--cc=meta@public-inbox.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as for the URLs of read-only IMAP folder(s) and NNTP newsgroup(s).