From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.2 required=3.0 tests=ALL_TRUSTED,AWL,BAYES_00, DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,DKIM_VALID_EF shortcircuit=no autolearn=ham autolearn_force=no version=3.4.6 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id C387E1FC61 for ; Sat, 10 Aug 2024 09:00:15 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=80x24.org; s=selector1; t=1723280415; bh=po1ONlKLtOUlTRHs0DVN+yvoR1vIVyrqJ6nPqUdCTsY=; h=From:To:Subject:Date:In-Reply-To:References:From; b=ujpJ8d6juoF/ozdgD1Wu+M3i32Z9wFnQTqkPBPCtPDBeGfP/AbUHXvj4hTAPqYtsD ++4drlfbix+42UbJYFfXgrpQ37qr9kaGB3U7mqt3jvAVqMzMynZR0srUo06AlYtnhu MZiP46UXDuPt5osnXeiFY2Lt6KOABskxti67ZhjE= From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 11/11] extindex: support per-inbox indexheader+altid Date: Sat, 10 Aug 2024 09:00:12 +0000 Message-ID: <20240810090012.23269-12-e@80x24.org> In-Reply-To: <20240810090012.23269-1-e@80x24.org> References: <20240810090012.23269-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: This allows the venerable altid (e.g. gmane:1234) to finally work for extindex users. The newer indexheader directive works here, too. This allows a multi-inbox extindex to fully emulate the capabilities of per-inbox Xapian indices. For now, per-inbox indexheader and altid DO NOT work when searching the extindex directly. In other words, gmane:1234 might work on the /git/ inbox, but not the /all/ extindex virtual inbox. This may remain the case since altid is typically per-inbox only, and stuff like X-Archives-Hash can be global across inboxes. --- lib/PublicInbox/Config.pm | 2 +- lib/PublicInbox/ExtSearchIdx.pm | 1 + lib/PublicInbox/Isearch.pm | 14 ++++- lib/PublicInbox/SearchIdx.pm | 20 +++++-- t/extsearch.t | 98 +++++++++++++++++++++++++++++---- 5 files changed, 117 insertions(+), 18 deletions(-) diff --git a/lib/PublicInbox/Config.pm b/lib/PublicInbox/Config.pm index b40e96f1..cda3045e 100644 --- a/lib/PublicInbox/Config.pm +++ b/lib/PublicInbox/Config.pm @@ -571,7 +571,7 @@ sub _fill_ei ($$) { } return unless valid_foo_name($name, 'extindex'); $es->{name} = $name; - $es->load_extra_indexers($es); + $es->load_extra_indexers($es); # extindex.*.{altid,indexheader} $es; } diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm index 094821a3..cead0f8a 100644 --- a/lib/PublicInbox/ExtSearchIdx.pm +++ b/lib/PublicInbox/ExtSearchIdx.pm @@ -21,6 +21,7 @@ use Carp qw(croak carp); use Scalar::Util qw(blessed); use Sys::Hostname qw(hostname); use File::Glob qw(bsd_glob GLOB_NOSORT); +use PublicInbox::Isearch; use PublicInbox::MultiGit; use PublicInbox::Spawn (); use PublicInbox::Search; diff --git a/lib/PublicInbox/Isearch.pm b/lib/PublicInbox/Isearch.pm index 9566f710..5f22c2f2 100644 --- a/lib/PublicInbox/Isearch.pm +++ b/lib/PublicInbox/Isearch.pm @@ -11,7 +11,11 @@ use PublicInbox::Search; sub new { my (undef, $ibx, $es) = @_; - bless { es => $es, eidx_key => $ibx->eidx_key }, __PACKAGE__; + my $self = bless { es => $es, eidx_key => $ibx->eidx_key }, __PACKAGE__; + # load publicinbox.*.{altid,indexheader} + PublicInbox::Search::load_extra_indexers($self, $ibx); + push @{$self->{-extra}}, @{$es->{-extra} // []} if $self->{-extra}; + $self; } sub _ibx_id ($) { @@ -55,14 +59,22 @@ SELECT MAX(docid) FROM xref3 WHERE ibx_id = ? AND xnum >= ? AND xnum <= ? \%opt; } +sub _isrch_qparse ($) { + my ($self) = @_; + local $self->{es}->{-extra} = $self->{-extra}; + $self->{es}->qparse_new; # XXX worth memoizing? +} + sub mset { my ($self, $str, $opt) = @_; + local $self->{es}->{qp} = _isrch_qparse($self) if $self->{-extra}; $self->{es}->mset($str, eidx_mset_prep $self, $opt); } sub async_mset { my ($self, $str, $opt, $cb, @args) = @_; $opt = eidx_mset_prep $self, $opt; + local $self->{es}->{-extra} = $self->{-extra} if $self->{-extra}; $self->{es}->async_mset($str, $opt, $cb, @args); } diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 53c16e55..7829c7d4 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -475,9 +475,8 @@ sub eml2doc ($$$;$) { term_generator($self)->set_document($doc); index_headers($self, $smsg); - if (defined(my $eidx_key = $smsg->{eidx_key})) { - $doc->add_boolean_term('O'.$eidx_key) if $eidx_key ne '.'; - } + my $ekey = $smsg->{eidx_key}; + $doc->add_boolean_term('O'.$ekey) if ($ekey // '.') ne '.'; msg_iter($eml, \&index_xapian, [ $self, $doc ]); index_ids($self, $doc, $eml, $mids); @@ -491,9 +490,10 @@ sub eml2doc ($$$;$) { my $data = $smsg->to_doc_data; $doc->set_data($data); } - - for my $extra (@{$self->{-extra} // []}) { - $extra->index_extra($self, $eml, $mids); + my $xtra = defined $ekey ? $self->{"-extra\t$ekey"} : undef; + $xtra //= $self->{-extra}; + for my $e (@$xtra) { + $e->index_extra($self, $eml, $mids); } $doc; } @@ -1170,6 +1170,14 @@ sub eidx_shard_new { }, $class; $self->{-set_indexlevel_once} = 1 if $self->{indexlevel} eq 'medium'; $self->load_extra_indexers($eidx); + require PublicInbox::Isearch; + my $all = $self->{-extra}; + for my $ibx (@{$eidx->{ibx_active} // []}) { + my $isrch = PublicInbox::Isearch->new($ibx); + my $per_ibx = $isrch->{-extra} // next; + $self->{"-extra\t$isrch->{eidx_key}"} = + $all ? [ @$per_ibx, @$all ] : $per_ibx; + } $self; } diff --git a/t/extsearch.t b/t/extsearch.t index 0ea5bc5b..28c43763 100644 --- a/t/extsearch.t +++ b/t/extsearch.t @@ -593,6 +593,7 @@ test_lei(sub { 'noted unindexed extindex is unsupported'); }); +require PublicInbox::XhcMset; if ('indexheader support') { xsys_e [qw(git config extindex.all.indexheader boolean_term:xarchiveshash:X-Archives-Hash)], @@ -608,20 +609,97 @@ if ('indexheader support') { $es = PublicInbox::Config->new($cfg_path)->ALL; my $mset = $es->mset('xarchiveshash:deadbeefcafe'); is $mset->size, 1, 'extindex.*.indexheader works'; - local $PublicInbox::Search::XHC = eval { - require PublicInbox::XhcMset; - PublicInbox::XapClient::start_helper('-j0'); - } or xbail "no XHC: $@"; + local $PublicInbox::Search::XHC = + PublicInbox::XapClient::start_helper('-j0') or + xbail "no XHC: $@"; my @args; $es->async_mset('xarchiveshash:deadbeefcafe', {} , sub { @args = @_ }); - is scalar(@args), 2, 'no extra args on hit'; - is $args[0]->size, 1, 'async mset hit works'; - ok !$args[1], 'no error on hit'; + is scalar(@args), 2, 'no extra args on xarchiveshash hit'; + is $args[0]->size, 1, 'async mset xarchiveshash hit works'; + ok !$args[1], 'no error on xarchiveshash hit'; @args = (); $es->async_mset('xarchiveshash:cafebeefdead', {} , sub { @args = @_ }); - is scalar(@args), 2, 'no extra args on miss'; - is $args[0]->size, 0, 'async mset miss works'; - ok !$args[1], 'no error on miss'; + is scalar(@args), 2, 'no extra args on xarchiveshash miss'; + is $args[0]->size, 0, 'async mset xarchivehash miss works'; + ok !$args[1], 'no error on xarchiveshash miss'; +} + +if ('per-inbox altid w/ extindex') { + my $another = 'another-nntp.sqlite3'; + my $altid = [ "serial:gmane:file=$another" ]; + my $aibx = create_inbox 'v2', version => 2, indexlevel => 'basic', + altid => $altid, sub { + my ($im, $ibx) = @_; + my $mm = PublicInbox::Msgmap->new_file( + "$ibx->{inboxdir}/$another", 2); + $mm->mid_set(1234, 'a@example.com') == 1 or xbail 'mid_set'; + $im->add(PublicInbox::Eml->new(<<'EOF')) or BAIL_OUT; +From: a@example.com +To: b@example.com +Subject: boo! +Message-ID: +X-Archives-Hash: dadfad +Organization: felonious feline family + +hello world gmane:666 +EOF + }; + PublicInbox::IO::write_file '>>', $cfg_path, <{inboxdir} + address = b\@example.com + altid = $altid->[0] + indexheader = phrase:organization:Organization +EOF + ok run_script([qw(-extindex --all -vvv), $eidxdir]), + 'extindex update w/ altid'; + local $PublicInbox::Search::XHC = + PublicInbox::XapClient::start_helper('-j0') or + xbail "no XHC: $@"; + my @args; + my $pi_cfg = PublicInbox::Config->new($cfg_path); + my $ibx = $pi_cfg->lookup('b@example.com'); + my $mset = $ibx->isrch->mset('gmane:1234'); + + is $mset->size, 1, 'isrch->mset altid hit'; + $ibx->isrch->async_mset('gmane:1234', {} , sub { @args = @_ }); + is scalar(@args), 2, 'no extra args on altid hit'; + is $args[0]->size, 1, 'isrch->async_mset altid hit'; + + $mset = $ibx->isrch->mset('organization:felonious'); + is $mset->size, 1, 'isrch->mset indexheader hit'; + @args = (); + $ibx->isrch->async_mset('organization:felonious', {} , sub { @args = @_ }); + is scalar(@args), 2, 'no extra args on indexheader hit'; + is $args[0]->size, 1, 'isrch->async_mset indexheader hit'; + + $mset = $ibx->isrch->mset('organization:world'); + is $mset->size, 0, 'isrch->mset indexheader miss'; + @args = (); + $ibx->isrch->async_mset('organization:world', {} , sub { @args = @_ }); + is scalar(@args), 2, 'no extra args on indexheader miss'; + is $args[0]->size, 0, 'isrch->async_mset indexheader miss'; + + $mset = $ibx->isrch->mset('xarchiveshash:deadbeefcafe'); + is $mset->size, 0, 'isrch->mset does not cross inbox on indexheader'; + $mset = $ibx->isrch->mset('xarchiveshash:dadfad'); + is $mset->size, 1, 'isrch->mset hits global indexheader'; + + $es = $pi_cfg->ALL; + $mset = $es->mset('xarchiveshash:dadfad'); + is $mset->size, 1, 'esrch->mset global indexheader hit'; + $mset = $es->mset('gmane:1234'); + is $mset->size, 1, '->mset altid hit works globally'; + + $mset = $es->mset('gmane:666'); + is $mset->size, 0, 'global ->mset hits'; + $mset = $ibx->isrch->mset('gmane:666'); + is $mset->size, 0, 'isrch->mset altid miss works'; + + @args = (); + $ibx->isrch->async_mset('gmane:666', {} , sub { @args = @_ }); + is scalar(@args), 2, 'no extra args on altid miss'; + is $args[0]->size, 0, 'isrch->async_mset altid miss works'; } done_testing;