From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 9F01E1FB0C for ; Sat, 5 Dec 2020 10:11:40 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 2/2] isearch: emulate per-inbox search with ->ALL Date: Sat, 5 Dec 2020 10:11:38 +0000 Message-Id: <20201205101138.11973-3-e@80x24.org> In-Reply-To: <20201205101138.11973-1-e@80x24.org> References: <20201205101138.11973-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: Using "eidx_key:" boolean prefix to limit results to a given inbox, we can use ->ALL to emulate and replace per-Inbox xap15/[0-9] search indices. With this change, the presence of "extindex.all.topdir" in the $PI_CONFIG will cause the WWW code to use that extindex and ignore per-inbox Xapian DBs in xap15/[0-9]. Unfortunately, IMAP search still requires old per-inbox indices, for now. Mapping extindex Xapian docids to per-Inbox UIDs and vice-versa is proving tricky. Fortunately, IMAP search is rarely used and optional. The RFCs don't specify expensive phrase search, either, so `indexlevel=medium' can be used in per-inbox Xapian indices to save space. For primarily WWW (and future JMAP) users, this should result in significant disk space, FD, and page cache footprint savings for large instances with many inboxes and many cross-posted messages. 
--- MANIFEST | 1 + lib/PublicInbox/Config.pm | 4 ++ lib/PublicInbox/DummyInbox.pm | 2 +- lib/PublicInbox/ExtMsg.pm | 2 +- lib/PublicInbox/ExtSearch.pm | 1 + lib/PublicInbox/Inbox.pm | 4 ++ lib/PublicInbox/Isearch.pm | 87 +++++++++++++++++++++++++++++++++++ lib/PublicInbox/Mbox.pm | 6 +-- lib/PublicInbox/Search.pm | 5 +- lib/PublicInbox/SearchView.pm | 10 ++-- lib/PublicInbox/SolverGit.pm | 2 +- lib/PublicInbox/WWW.pm | 2 +- lib/PublicInbox/WwwStream.pm | 2 +- lib/PublicInbox/WwwText.pm | 2 +- t/extsearch.t | 25 +++++++++- 15 files changed, 139 insertions(+), 16 deletions(-) create mode 100644 lib/PublicInbox/Isearch.pm diff --git a/MANIFEST b/MANIFEST index 946e4b8a..b39f63db 100644 --- a/MANIFEST +++ b/MANIFEST @@ -156,6 +156,7 @@ lib/PublicInbox/In2Tie.pm lib/PublicInbox/Inbox.pm lib/PublicInbox/InboxIdle.pm lib/PublicInbox/InboxWritable.pm +lib/PublicInbox/Isearch.pm lib/PublicInbox/KQNotify.pm lib/PublicInbox/Linkify.pm lib/PublicInbox/Listener.pm diff --git a/lib/PublicInbox/Config.pm b/lib/PublicInbox/Config.pm index ba0ead6e..1844f8b2 100644 --- a/lib/PublicInbox/Config.pm +++ b/lib/PublicInbox/Config.pm @@ -477,6 +477,10 @@ EOF push @$repo_objs, $repo if $repo; } } + if (my $es = ALL($self)) { + require PublicInbox::Isearch; + $ibx->{isrch} = PublicInbox::Isearch->new($ibx, $es); + } $self->{-by_eidx_key}->{$ibx->eidx_key} = $ibx; } diff --git a/lib/PublicInbox/DummyInbox.pm b/lib/PublicInbox/DummyInbox.pm index 02426f13..981043ce 100644 --- a/lib/PublicInbox/DummyInbox.pm +++ b/lib/PublicInbox/DummyInbox.pm @@ -16,7 +16,7 @@ no warnings 'once'; *max = \&uidvalidity; *query_xover = \&uid_range; *over = \&mm; -*search = *unsubscribe_unlock = +*isrch = *search = *unsubscribe_unlock = *get_art = *description = *base_url = \&subscribe_unlock; 1; diff --git a/lib/PublicInbox/ExtMsg.pm b/lib/PublicInbox/ExtMsg.pm index 2a0a3e46..2a579c1b 100644 --- a/lib/PublicInbox/ExtMsg.pm +++ b/lib/PublicInbox/ExtMsg.pm @@ -32,7 +32,7 @@ sub PARTIAL_MAX () { 100 } sub 
search_partial ($$) { my ($ibx, $mid) = @_; return if length($mid) < $MIN_PARTIAL_LEN; - my $srch = $ibx->search or return; + my $srch = $ibx->search or return; # NOT ->isrch, we already try ->ALL my $opt = { limit => PARTIAL_MAX, mset => 2 }; my @try = ("m:$mid*"); my $chop = $mid; diff --git a/lib/PublicInbox/ExtSearch.pm b/lib/PublicInbox/ExtSearch.pm index 80455d8d..2a560935 100644 --- a/lib/PublicInbox/ExtSearch.pm +++ b/lib/PublicInbox/ExtSearch.pm @@ -128,5 +128,6 @@ no warnings 'once'; *recent = \&PublicInbox::Inbox::recent; *max_git_epoch = *nntp_usable = *msg_by_path = \&mm; # undef +*isrch = *search; 1; diff --git a/lib/PublicInbox/Inbox.pm b/lib/PublicInbox/Inbox.pm index 58651687..52aece7c 100644 --- a/lib/PublicInbox/Inbox.pm +++ b/lib/PublicInbox/Inbox.pm @@ -201,6 +201,10 @@ sub search { (eval { $srch->xdb }) ? $srch : undef; } +# isrch is preferred for read-only interfaces if available since it +# reduces kernel cache and FD overhead +sub isrch { $_[0]->{isrch} // search($_[0]) } + sub over { $_[0]->{over} //= eval { my $srch = $_[0]->{search} //= eval { diff --git a/lib/PublicInbox/Isearch.pm b/lib/PublicInbox/Isearch.pm new file mode 100644 index 00000000..0ab3b19a --- /dev/null +++ b/lib/PublicInbox/Isearch.pm @@ -0,0 +1,87 @@ +# Copyright (C) 2020 all contributors +# License: AGPL-3.0+ + +# Provides everything the PublicInbox::Search object does; +# but uses global ExtSearch (->ALL) with an eidx_key query to +# emulate per-Inbox search using ->ALL. +package PublicInbox::Isearch; +use strict; +use v5.10.1; +use PublicInbox::ExtSearch; +use PublicInbox::Search; + +sub new { + my (undef, $ibx, $es) = @_; + bless { es => $es, eidx_key => $ibx->eidx_key }, __PACKAGE__; +} + +sub mset { + my ($self, $str, $opt) = @_; + $self->{es}->mset($str, { $opt ? 
%$opt : (), + eidx_key => $self->{eidx_key} }); +} + +sub _ibx_id ($) { + my ($self) = @_; + my $sth = $self->{es}->over->dbh->prepare_cached(<<'', undef, 1); +SELECT ibx_id FROM inboxes WHERE eidx_key = ? LIMIT 1 + + $sth->execute($self->{eidx_key}); + $sth->fetchrow_array // + die "E: `$self->{eidx_key}' not in $self->{es}->{topdir}\n"; +} + +sub mset_to_artnums { + my ($self, $mset) = @_; + my $docids = PublicInbox::Search::mset_to_artnums($self->{es}, $mset); + my $ibx_id = $self->{-ibx_id} //= _ibx_id($self); + my $qmarks = join(',', map { '?' } @$docids); + my $rows = $self->{es}->over->dbh-> + selectall_arrayref(<<"", undef, $ibx_id, @$docids); +SELECT docid,xnum FROM xref3 WHERE ibx_id = ? AND docid IN ($qmarks) + + my $i = -1; + my %order = map { $_ => ++$i } @$docids; + my @xnums; + for my $row (@$rows) { # @row = ($docid, $xnum) + my $idx = delete($order{$row->[0]}) // next; + $xnums[$idx] = $row->[1]; + } + if (scalar keys %order) { + warn "W: $self->{es}->{topdir} #", + join(', #', sort keys %order), + " not mapped to `$self->{eidx_key}'\n"; + warn "W: $self->{es}->{topdir} may need to be reindexed\n"; + @xnums = grep { defined } @xnums; + } + \@xnums; +} + +sub mset_to_smsg { + my ($self, $ibx, $mset) = @_; # $ibx is a real inbox, not eidx + my $xnums = mset_to_artnums($self, $mset); + my $i = -1; + my %order = map { $_ => ++$i } @$xnums; + my $unordered = $ibx->over->get_all(@$xnums); + my @msgs; + for my $smsg (@$unordered) { + my $idx = delete($order{$smsg->{num}}) // do { + warn "W: $ibx->{inboxdir} #$smsg->{num}\n"; + next; + }; + $msgs[$idx] = $smsg; + } + if (scalar keys %order) { + warn "W: $ibx->{inboxdir} #", + join(', #', sort keys %order), + " no longer valid\n"; + warn "W: $self->{es}->{topdir} may need to be reindexed\n"; + } + wantarray ? 
($mset->get_matches_estimated, \@msgs) : \@msgs; +} + +sub has_threadid { 1 } + +sub help { $_[0]->{es}->help } + +1; diff --git a/lib/PublicInbox/Mbox.pm b/lib/PublicInbox/Mbox.pm index 22516998..19459150 100644 --- a/lib/PublicInbox/Mbox.pm +++ b/lib/PublicInbox/Mbox.pm @@ -218,7 +218,7 @@ sub results_cb { return $smsg; } # refill result set - my $srch = $ctx->{-inbox}->search or return gone($ctx,'search'); + my $srch = $ctx->{-inbox}->isrch or return gone($ctx, 'search'); my $mset = $srch->mset($ctx->{query}, $ctx->{qopts}); my $size = $mset->size or return; $ctx->{qopts}->{offset} += $size; @@ -240,7 +240,7 @@ sub results_thread_cb { next if $over->expand_thread($ctx); # refill result set - my $srch = $ctx->{-inbox}->search or return gone($ctx,'search'); + my $srch = $ctx->{-inbox}->isrch or return gone($ctx, 'search'); my $mset = $srch->mset($ctx->{query}, $ctx->{qopts}); my $size = $mset->size or return; $ctx->{qopts}->{offset} += $size; @@ -253,7 +253,7 @@ sub mbox_all { my ($ctx, $q) = @_; my $q_string = $q->{'q'}; return mbox_all_ids($ctx) if $q_string !~ /\S/; - my $srch = $ctx->{-inbox}->search or + my $srch = $ctx->{-inbox}->isrch or return PublicInbox::WWW::need($ctx, 'Search'); my $over = $ctx->{-inbox}->over or return PublicInbox::WWW::need($ctx, 'Overview'); diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index 7e72913f..ba239255 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -55,7 +55,7 @@ use constant { use PublicInbox::Smsg; use PublicInbox::Over; our $QP_FLAGS; -our %X = map { $_ => 0 } qw(BoolWeight Database Enquire QueryParser Stem); +our %X = map { $_ => 0 } qw(BoolWeight Database Enquire QueryParser Stem Query); our $Xap; # 'Search::Xapian' or 'Xapian' our $NVRP; # '$Xap::'.('NumberValueRangeProcessor' or 'NumberRangeProcessor') our $ENQ_ASCENDING; @@ -331,6 +331,9 @@ sub has_threadid ($) { sub _enquire_once { # retry_reopen callback my ($self, $query, $opts) = @_; my $xdb = xdb($self); + if 
(defined(my $eidx_key = $opts->{eidx_key})) { + $query = $X{Query}->new(OP_FILTER(), $query, 'O'.$eidx_key); + } my $enquire = $X{Enquire}->new($xdb); $enquire->set_query($query); $opts ||= {}; diff --git a/lib/PublicInbox/SearchView.pm b/lib/PublicInbox/SearchView.pm index 26426c01..f3c96126 100644 --- a/lib/PublicInbox/SearchView.pm +++ b/lib/PublicInbox/SearchView.pm @@ -30,7 +30,7 @@ sub mbox_results { sub sres_top_html { my ($ctx) = @_; - my $srch = $ctx->{-inbox}->search or + my $srch = $ctx->{-inbox}->isrch or return PublicInbox::WWW::need($ctx, 'Search'); my $q = PublicInbox::SearchQuery->new($ctx->{qp}); my $x = $q->{x}; @@ -95,7 +95,7 @@ sub mset_summary { my $res = \($ctx->{-html_tip}); my $ibx = $ctx->{-inbox}; my $obfs_ibx = $ibx->{obfuscate} ? $ibx : undef; - my @nums = @{$ibx->search->mset_to_artnums($mset)}; + my @nums = @{$ibx->isrch->mset_to_artnums($mset)}; my %num2msg = map { $_->{num} => $_ } @{$ibx->over->get_all(@nums)}; my ($min, $max); @@ -201,7 +201,7 @@ sub search_nav_top { } my $A = $q->qs_html(x => 'A', r => undef); $rv .= qq{|Atom feed]}; - if ($ctx->{-inbox}->search->has_threadid) { + if ($ctx->{-inbox}->isrch->has_threadid) { $rv .= qq{\n\t\t\tdownload mbox.gz: } . # we set name=z w/o using it since it seems required for # lynx (but works fine for w3m). 
@@ -288,7 +288,7 @@ sub mset_thread { my ($ctx, $mset, $q) = @_; my $ibx = $ctx->{-inbox}; my @pct = map { get_pct($_) } $mset->items; - my $msgs = $ibx->search->mset_to_smsg($ibx, $mset); + my $msgs = $ibx->isrch->mset_to_smsg($ibx, $mset); my $i = 0; $_->{pct} = $pct[$i++] for @$msgs; my $r = $q->{r}; @@ -353,7 +353,7 @@ sub ctx_prepare { sub adump { my ($cb, $mset, $q, $ctx) = @_; - $ctx->{ids} = $ctx->{-inbox}->search->mset_to_artnums($mset); + $ctx->{ids} = $ctx->{-inbox}->isrch->mset_to_artnums($mset); $ctx->{search_query} = $q; # used by WwwAtomStream::atom_header PublicInbox::WwwAtomStream->response($ctx, 200, \&adump_i); } diff --git a/lib/PublicInbox/SolverGit.pm b/lib/PublicInbox/SolverGit.pm index 83f7a4ee..a53f28b1 100644 --- a/lib/PublicInbox/SolverGit.pm +++ b/lib/PublicInbox/SolverGit.pm @@ -216,7 +216,7 @@ sub filename_query ($) { sub find_smsgs ($$$) { my ($self, $ibx, $want) = @_; - my $srch = $ibx->search or return; + my $srch = $ibx->isrch or return; my $post = $want->{oid_b} or die 'BUG: no {oid_b}'; $post =~ /\A[a-f0-9]+\z/ or die "BUG: oid_b not hex: $post"; diff --git a/lib/PublicInbox/WWW.pm b/lib/PublicInbox/WWW.pm index cdbcff1e..fc208816 100644 --- a/lib/PublicInbox/WWW.pm +++ b/lib/PublicInbox/WWW.pm @@ -462,7 +462,7 @@ sub serve_git { sub mbox_results { my ($ctx) = @_; if ($ctx->{env}->{QUERY_STRING} =~ /(?:\A|[&;])q=/) { - $ctx->{-inbox}->search or return need($ctx, 'search'); + $ctx->{-inbox}->isrch or return need($ctx, 'search'); require PublicInbox::SearchView; return PublicInbox::SearchView::mbox_results($ctx); } diff --git a/lib/PublicInbox/WwwStream.pm b/lib/PublicInbox/WwwStream.pm index 638f4e27..2527b8ed 100644 --- a/lib/PublicInbox/WwwStream.pm +++ b/lib/PublicInbox/WwwStream.pm @@ -54,7 +54,7 @@ sub html_top ($) { qq(color / ). qq(mirror / ). 
qq(Atom feed); - if ($ibx->search) { + if ($ibx->isrch) { my $q_val = delete($ctx->{-q_value_html}) // ''; $q_val = qq(\nvalue="$q_val") if $q_val ne ''; # XXX gross, for SearchView.pm diff --git a/lib/PublicInbox/WwwText.pm b/lib/PublicInbox/WwwText.pm index 04c9b1c4..8cc818df 100644 --- a/lib/PublicInbox/WwwText.pm +++ b/lib/PublicInbox/WwwText.pm @@ -250,7 +250,7 @@ EOF # n.b. we use the Xapian DB for any regeneratable, # order-of-arrival-independent data. - my $srch = $ibx->search; + my $srch = $ibx->isrch; if ($srch) { $$txt .= <new("$home/extindex"); is($mset->size, 1, 'new message found'); $mset = $es->mset('b:"test message"'); is($mset->size, 1, 'old message found'); - delete @$es{qw(git over xdb)}; # fork preparation + my $pi_cfg = PublicInbox::Config->new; + $pi_cfg->fill_all; + is(scalar($pi_cfg->ALL->mset('s:Testing')->items), 2, + '2 results in ->ALL'); + my $res = {}; + my $nr = 0; + $pi_cfg->each_inbox(sub { + $nr++; + my ($ibx) = @_; + local $SIG{__WARN__} = sub {}; # FIXME support --reindex + my $mset = $ibx->isrch->mset('s:Testing'); + $res->{$ibx->eidx_key} = $ibx->isrch->mset_to_smsg($ibx, $mset); + }); + is($nr, 2, 'two inboxes'); + my $exp = {}; + for my $v (qw(v1 v2)) { + my $ibx = $pi_cfg->lookup_newsgroup("$v.example"); + my $smsg = $ibx->over->get_art(1); + $smsg->psgi_cull; + $exp->{"$v.example"} = [ $smsg ]; + } + is_deeply($res, $exp, 'isearch limited results'); + $pi_cfg = $res = $exp = undef; + open my $rmfh, '+>', undef or BAIL_OUT $!; $rmfh->autoflush(1); print $rmfh $eml2->as_string or BAIL_OUT $!;