From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id B5C931F4B4; Sun, 27 Dec 2020 20:02:51 +0000 (UTC) Date: Sun, 27 Dec 2020 20:02:51 +0000 From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 27/26] lei_xsearch: cross-(inbox|extindex) search Message-ID: <20201227200251.GA31406@dcvr> References: <20201218120950.23272-1-e@80x24.org> <20201218120950.23272-27-e@80x24.org> MIME-Version: 1.0 Content-Type: text/plain; charset=utf-8 Content-Disposition: inline In-Reply-To: <20201218120950.23272-27-e@80x24.org> List-Id: While a single extindex combines multiple inboxes into a single search index, extindex still requires up-front indexing of items before they can be searched. XSearch has no on-disk footprint itself and uses Xapian DBs of existing publicinbox and extindex ("extinbox") exclusively. XSearch still suffers from the multi-shard Xapian scalability problems which led to the creation of extindex, but I expect the number of shards to remain relatively low. I envision users hosting public-inbox instances on their workstations will only have two extindexes combined by this: one read-only extindex for serving public archives, and one read-write extindex managed by LeiStore for private mail. 
--- Note: this depends on {relevance} == -2 support at https://public-inbox.org/meta/20201227193829.9408-3-e@80x24.org/ MANIFEST | 2 + lib/PublicInbox/LeiSearch.pm | 14 +++---- lib/PublicInbox/LeiXSearch.pm | 72 ++++++++++++++++++++++++++++++++++ lib/PublicInbox/Search.pm | 19 ++++----- t/lei_xsearch.t | 73 +++++++++++++++++++++++++++++++++++ 5 files changed, 160 insertions(+), 20 deletions(-) create mode 100644 lib/PublicInbox/LeiXSearch.pm create mode 100644 t/lei_xsearch.t diff --git a/MANIFEST b/MANIFEST index 656c707e..a5ff81cf 100644 --- a/MANIFEST +++ b/MANIFEST @@ -165,6 +165,7 @@ lib/PublicInbox/LEI.pm lib/PublicInbox/LeiExtinbox.pm lib/PublicInbox/LeiSearch.pm lib/PublicInbox/LeiStore.pm +lib/PublicInbox/LeiXSearch.pm lib/PublicInbox/Linkify.pm lib/PublicInbox/Listener.pm lib/PublicInbox/Lock.pm @@ -327,6 +328,7 @@ t/kqnotify.t t/lei-oneshot.t t/lei.t t/lei_store.t +t/lei_xsearch.t t/linkify.t t/main-bin/spamc t/mda-mime.eml diff --git a/lib/PublicInbox/LeiSearch.pm b/lib/PublicInbox/LeiSearch.pm index 66c16e04..0b962b11 100644 --- a/lib/PublicInbox/LeiSearch.pm +++ b/lib/PublicInbox/LeiSearch.pm @@ -7,20 +7,18 @@ use v5.10.1; use parent qw(PublicInbox::ExtSearch); use PublicInbox::Search; -sub combined_docid ($$) { +# get combined docid from over.num: +# (not generic Xapian, only works with our sharding scheme) +sub num2docid ($$) { my ($self, $num) = @_; - ($num - 1) * $self->{nshard} + 1; + my $nshard = $self->{nshard}; + ($num - 1) * $nshard + $num % $nshard + 1; } sub msg_keywords { my ($self, $num) = @_; # num_or_mitem my $xdb = $self->xdb; # set {nshard}; - my $docid = ref($num) ? $num->get_docid : do { - # get combined docid from over.num: - # (not generic Xapian, only works with our sharding scheme) - my $nshard = $self->{nshard}; - ($num - 1) * $nshard + $num % $nshard + 1; - }; + my $docid = ref($num) ? 
$num->get_docid : num2docid($self, $num); my %kw; eval { my $end = $xdb->termlist_end($docid); diff --git a/lib/PublicInbox/LeiXSearch.pm b/lib/PublicInbox/LeiXSearch.pm new file mode 100644 index 00000000..1a81b14a --- /dev/null +++ b/lib/PublicInbox/LeiXSearch.pm @@ -0,0 +1,72 @@ +# Copyright (C) 2020 all contributors +# License: AGPL-3.0+ + +# Combine any combination of PublicInbox::Search, +# PublicInbox::ExtSearch, and PublicInbox::LeiSearch objects +# into one Xapian DB +package PublicInbox::LeiXSearch; +use strict; +use v5.10.1; +use parent qw(PublicInbox::LeiSearch); + +sub new { + my ($class) = @_; + PublicInbox::Search::load_xapian(); + bless { + qp_flags => $PublicInbox::Search::QP_FLAGS | + PublicInbox::Search::FLAG_PURE_NOT(), + }, $class +} + +sub attach_extinbox { + my ($self, $ibxish) = @_; # ibxish = ExtSearch or Inbox + if (!$ibxish->can('over')) { + push @{$self->{remotes}}, $ibxish + } + if (delete $self->{xdb}) { # XXX: do we need this? + # clobber existing {xdb} if amending + my $expect = delete $self->{nshard}; + my $shards = delete $self->{shards_flat}; + scalar(@$shards) == $expect or die + "BUG: {nshard}$expect != shards=".scalar(@$shards); + + my $prev = {}; + for my $old_ibxish (@{$self->{shard2ibx}}) { + next if $prev == $old_ibxish; + $prev = $old_ibxish; + my @shards = $old_ibxish->search->xdb_shards_flat; + push @{$self->{shards_flat}}, @shards; + } + my $nr = scalar(@{$self->{shards_flat}}); + $nr == $expect or die + "BUG: reloaded $nr shards, expected $expect" + } + my @shards = $ibxish->search->xdb_shards_flat; + push @{$self->{shards_flat}}, @shards; + push(@{$self->{shard2ibx}}, $ibxish) for (@shards); +} + +# called by PublicInbox::Search::xdb +sub xdb_shards_flat { @{$_[0]->{shards_flat}} } + +# like over->get_art +sub smsg_for { + my ($self, $mitem) = @_; + # cf. 
https://trac.xapian.org/wiki/FAQ/MultiDatabaseDocumentID + my $nshard = $self->{nshard}; + my $docid = $mitem->get_docid; + my $shard = ($docid - 1) % $nshard; + my $num = int(($docid - 1) / $nshard) + 1; + my $smsg = $self->{shard2ibx}->[$shard]->over->get_art($num); + $smsg->{docid} = $docid; + $smsg; +} + +sub recent { + my ($self, $qstr, $opt) = @_; + $opt //= {}; + $opt->{relevance} //= -2; + $self->mset($qstr //= 'bytes:1..', $opt); +} + +1; diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index 4fae0c66..58653c9e 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -195,6 +195,7 @@ sub xdb_shards_flat ($) { my ($self) = @_; my $xpfx = $self->{xpfx}; my (@xdb, $slow_phrase); + load_xapian(); if ($xpfx =~ m/xapian${\SCHEMA_VERSION}\z/) { @xdb = ($X{Database}->new($xpfx)); $self->{qp_flags} |= FLAG_PHRASE() if !-f "$xpfx/iamchert"; @@ -213,16 +214,6 @@ sub xdb_shards_flat ($) { @xdb; } -sub _xdb { - my ($self) = @_; - $self->{qp_flags} //= $QP_FLAGS; - my @xdb = xdb_shards_flat($self) or return; - $self->{nshard} = scalar(@xdb); - my $xdb = shift @xdb; - $xdb->add_database($_) for @xdb; - $xdb; -} - # v2 Xapian docids don't conflict, so they're identical to # NNTP article numbers and IMAP UIDs. 
# https://trac.xapian.org/wiki/FAQ/MultiDatabaseDocumentID @@ -241,8 +232,12 @@ sub mset_to_artnums { sub xdb ($) { my ($self) = @_; $self->{xdb} //= do { - load_xapian(); - $self->_xdb; + $self->{qp_flags} //= $QP_FLAGS; + my @xdb = $self->xdb_shards_flat or return; + $self->{nshard} = scalar(@xdb); + my $xdb = shift @xdb; + $xdb->add_database($_) for @xdb; + $xdb; }; } diff --git a/t/lei_xsearch.t b/t/lei_xsearch.t new file mode 100644 index 00000000..c41213bd --- /dev/null +++ b/t/lei_xsearch.t @@ -0,0 +1,73 @@ +#!perl -w +# Copyright (C) 2020 all contributors +# License: AGPL-3.0+ +use strict; +use v5.10.1; +use Test::More; +use List::Util qw(shuffle max); +use PublicInbox::TestCommon; +use PublicInbox::ExtSearchIdx; +use PublicInbox::Eml; +use PublicInbox::InboxWritable; +require_mods(qw(DBD::SQLite Search::Xapian)); +require_git 2.6; +require_ok 'PublicInbox::LeiXSearch'; +my ($home, $for_destroy) = tmpdir(); +my @ibx; +for my $V (1..2) { + for my $i (3..6) { + my $ibx = PublicInbox::InboxWritable->new({ + inboxdir => "$home/v$V-$i", + name => "test-v$V-$i", + version => $V, + indexlevel => 'medium', + -primary_address => "v$V-$i\@example.com", + }, { nproc => int(rand(8)) + 1 }); + push @ibx, $ibx; + my $im = $ibx->importer(0); + for my $j (0..9) { + my $eml = PublicInbox::Eml->new(<<EOF); +From: x\@example.com +To: $ibx->{-primary_address} +Date: Fri, 02 Oct 1993 0$V:0$i:0$j +0000 +Subject: v${V}i${i}j$j +Message-ID: <v${V}i${i}j$j\@example> + +${V}er ${i}on j$j +EOF + $im->add($eml); + } + $im->done; + } +} +my $first = shift @ibx; is($first->{name}, 'test-v1-3', 'first plucked'); +my $last = pop @ibx; is($last->{name}, 'test-v2-6', 'last plucked'); +my $eidx = PublicInbox::ExtSearchIdx->new("$home/eidx"); +$eidx->attach_inbox($first); +$eidx->attach_inbox($last); +$eidx->eidx_sync({fsync => 0}); +my $es = PublicInbox::ExtSearch->new("$home/eidx"); +my $lxs = PublicInbox::LeiXSearch->new; +for my $ibxish (shuffle($es, @ibx)) { + $lxs->attach_extinbox($ibxish); +} +my $nr = $lxs->xdb->get_doccount; +my $mset = 
$lxs->mset('d:19931002..19931003', { limit => $nr }); +is($mset->size, $nr, 'got all messages'); +my @msgs; +for my $mi ($mset->items) { + if (my $smsg = $lxs->smsg_for($mi)) { + push @msgs, $smsg; + } else { + diag "E: ${\$mi->get_docid} missing"; + } +} +is(scalar(@msgs), $nr, 'smsgs retrieved for all'); + +$mset = $lxs->recent(undef, { limit => 1 }); +is($mset->size, 1, 'one result'); +my $max = max(map { $_->{docid} } @msgs); +is($lxs->smsg_for(($mset->items)[0])->{docid}, $max, + 'got highest docid'); + +done_testing;