From: Eric Wong <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH 27/26] lei_xsearch: cross-(inbox|extindex) search
Date: Sun, 27 Dec 2020 20:02:51 +0000 [thread overview]
Message-ID: <20201227200251.GA31406@dcvr> (raw)
In-Reply-To: <20201218120950.23272-27-e@80x24.org>
While a single extindex combines multiple inboxes into a single
search index, extindex still requires up-front indexing of items
which can be searched. XSearch has no on-disk footprint itself
and uses Xapian DBs of existing publicinbox and extindex
("extinbox") exclusively.
XSearch still suffers from the multi-shard Xapian scalability
problems which led to the creation of extindex, but I expect the
number of shards to remain relatively low.
I envision users hosting public-inbox instances on their
workstations will only have two extindexes combined by this: one
read-only extindex for serving public archives, and one
read-write extindex managed by LeiStore for private mail.
---
Note: this depends on {relevance} == -2 support at
https://public-inbox.org/meta/20201227193829.9408-3-e@80x24.org/
MANIFEST | 2 +
lib/PublicInbox/LeiSearch.pm | 14 +++----
lib/PublicInbox/LeiXSearch.pm | 72 ++++++++++++++++++++++++++++++++++
lib/PublicInbox/Search.pm | 19 ++++-----
t/lei_xsearch.t | 73 +++++++++++++++++++++++++++++++++++
5 files changed, 160 insertions(+), 20 deletions(-)
create mode 100644 lib/PublicInbox/LeiXSearch.pm
create mode 100644 t/lei_xsearch.t
diff --git a/MANIFEST b/MANIFEST
index 656c707e..a5ff81cf 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -165,6 +165,7 @@ lib/PublicInbox/LEI.pm
lib/PublicInbox/LeiExtinbox.pm
lib/PublicInbox/LeiSearch.pm
lib/PublicInbox/LeiStore.pm
+lib/PublicInbox/LeiXSearch.pm
lib/PublicInbox/Linkify.pm
lib/PublicInbox/Listener.pm
lib/PublicInbox/Lock.pm
@@ -327,6 +328,7 @@ t/kqnotify.t
t/lei-oneshot.t
t/lei.t
t/lei_store.t
+t/lei_xsearch.t
t/linkify.t
t/main-bin/spamc
t/mda-mime.eml
diff --git a/lib/PublicInbox/LeiSearch.pm b/lib/PublicInbox/LeiSearch.pm
index 66c16e04..0b962b11 100644
--- a/lib/PublicInbox/LeiSearch.pm
+++ b/lib/PublicInbox/LeiSearch.pm
@@ -7,20 +7,18 @@ use v5.10.1;
use parent qw(PublicInbox::ExtSearch);
use PublicInbox::Search;
-sub combined_docid ($$) {
+# get combined docid from over.num:
+# (not generic Xapian, only works with our sharding scheme)
+sub num2docid ($$) {
my ($self, $num) = @_;
- ($num - 1) * $self->{nshard} + 1;
+ my $nshard = $self->{nshard};
+ ($num - 1) * $nshard + $num % $nshard + 1;
}
sub msg_keywords {
my ($self, $num) = @_; # num_or_mitem
my $xdb = $self->xdb; # set {nshard};
- my $docid = ref($num) ? $num->get_docid : do {
- # get combined docid from over.num:
- # (not generic Xapian, only works with our sharding scheme)
- my $nshard = $self->{nshard};
- ($num - 1) * $nshard + $num % $nshard + 1;
- };
+ my $docid = ref($num) ? $num->get_docid : num2docid($self, $num);
my %kw;
eval {
my $end = $xdb->termlist_end($docid);
diff --git a/lib/PublicInbox/LeiXSearch.pm b/lib/PublicInbox/LeiXSearch.pm
new file mode 100644
index 00000000..1a81b14a
--- /dev/null
+++ b/lib/PublicInbox/LeiXSearch.pm
@@ -0,0 +1,72 @@
+# Copyright (C) 2020 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+
+# Combine any combination of PublicInbox::Search,
+# PublicInbox::ExtSearch, and PublicInbox::LeiSearch objects
+# into one Xapian DB
+package PublicInbox::LeiXSearch;
+use strict;
+use v5.10.1;
+use parent qw(PublicInbox::LeiSearch);
+
+sub new {
+ my ($class) = @_;
+ PublicInbox::Search::load_xapian();
+ bless {
+ qp_flags => $PublicInbox::Search::QP_FLAGS |
+ PublicInbox::Search::FLAG_PURE_NOT(),
+ }, $class
+}
+
+sub attach_extinbox {
+ my ($self, $ibxish) = @_; # ibxish = ExtSearch or Inbox
+ if (!$ibxish->can('over')) {
+ push @{$self->{remotes}}, $ibxish
+ }
+ if (delete $self->{xdb}) { # XXX: do we need this?
+ # clobber existing {xdb} if amending
+ my $expect = delete $self->{nshard};
+ my $shards = delete $self->{shards_flat};
+ scalar(@$shards) == $expect or die
+ "BUG: {nshard}$expect != shards=".scalar(@$shards);
+
+ my $prev = {};
+ for my $old_ibxish (@{$self->{shard2ibx}}) {
+ next if $prev == $old_ibxish;
+ $prev = $old_ibxish;
+ my @shards = $old_ibxish->search->xdb_shards_flat;
+ push @{$self->{shards_flat}}, @shards;
+ }
+ my $nr = scalar(@{$self->{shards_flat}});
+ $nr == $expect or die
+ "BUG: reloaded $nr shards, expected $expect"
+ }
+ my @shards = $ibxish->search->xdb_shards_flat;
+ push @{$self->{shards_flat}}, @shards;
+ push(@{$self->{shard2ibx}}, $ibxish) for (@shards);
+}
+
+# called by PublicInbox::Search::xdb
+sub xdb_shards_flat { @{$_[0]->{shards_flat}} }
+
+# like over->get_art
+sub smsg_for {
+ my ($self, $mitem) = @_;
+ # cf. https://trac.xapian.org/wiki/FAQ/MultiDatabaseDocumentID
+ my $nshard = $self->{nshard};
+ my $docid = $mitem->get_docid;
+ my $shard = ($docid - 1) % $nshard;
+ my $num = int(($docid - 1) / $nshard) + 1;
+ my $smsg = $self->{shard2ibx}->[$shard]->over->get_art($num);
+ $smsg->{docid} = $docid;
+ $smsg;
+}
+
+sub recent {
+ my ($self, $qstr, $opt) = @_;
+ $opt //= {};
+ $opt->{relevance} //= -2;
+ $self->mset($qstr //= 'bytes:1..', $opt);
+}
+
+1;
diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index 4fae0c66..58653c9e 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -195,6 +195,7 @@ sub xdb_shards_flat ($) {
my ($self) = @_;
my $xpfx = $self->{xpfx};
my (@xdb, $slow_phrase);
+ load_xapian();
if ($xpfx =~ m/xapian${\SCHEMA_VERSION}\z/) {
@xdb = ($X{Database}->new($xpfx));
$self->{qp_flags} |= FLAG_PHRASE() if !-f "$xpfx/iamchert";
@@ -213,16 +214,6 @@ sub xdb_shards_flat ($) {
@xdb;
}
-sub _xdb {
- my ($self) = @_;
- $self->{qp_flags} //= $QP_FLAGS;
- my @xdb = xdb_shards_flat($self) or return;
- $self->{nshard} = scalar(@xdb);
- my $xdb = shift @xdb;
- $xdb->add_database($_) for @xdb;
- $xdb;
-}
-
# v2 Xapian docids don't conflict, so they're identical to
# NNTP article numbers and IMAP UIDs.
# https://trac.xapian.org/wiki/FAQ/MultiDatabaseDocumentID
@@ -241,8 +232,12 @@ sub mset_to_artnums {
sub xdb ($) {
my ($self) = @_;
$self->{xdb} //= do {
- load_xapian();
- $self->_xdb;
+ $self->{qp_flags} //= $QP_FLAGS;
+ my @xdb = $self->xdb_shards_flat or return;
+ $self->{nshard} = scalar(@xdb);
+ my $xdb = shift @xdb;
+ $xdb->add_database($_) for @xdb;
+ $xdb;
};
}
diff --git a/t/lei_xsearch.t b/t/lei_xsearch.t
new file mode 100644
index 00000000..c41213bd
--- /dev/null
+++ b/t/lei_xsearch.t
@@ -0,0 +1,73 @@
+#!perl -w
+# Copyright (C) 2020 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use strict;
+use v5.10.1;
+use Test::More;
+use List::Util qw(shuffle max);
+use PublicInbox::TestCommon;
+use PublicInbox::ExtSearchIdx;
+use PublicInbox::Eml;
+use PublicInbox::InboxWritable;
+require_mods(qw(DBD::SQLite Search::Xapian));
+require_git 2.6;
+require_ok 'PublicInbox::LeiXSearch';
+my ($home, $for_destroy) = tmpdir();
+my @ibx;
+for my $V (1..2) {
+ for my $i (3..6) {
+ my $ibx = PublicInbox::InboxWritable->new({
+ inboxdir => "$home/v$V-$i",
+ name => "test-v$V-$i",
+ version => $V,
+ indexlevel => 'medium',
+ -primary_address => "v$V-$i\@example.com",
+ }, { nproc => int(rand(8)) + 1 });
+ push @ibx, $ibx;
+ my $im = $ibx->importer(0);
+ for my $j (0..9) {
+ my $eml = PublicInbox::Eml->new(<<EOF);
+From: x\@example.com
+To: $ibx->{-primary_address}
+Date: Fri, 02 Oct 1993 0$V:0$i:0$j +0000
+Subject: v${V}i${i}j$j
+Message-ID: <v${V}i${i}j$j\@example>
+
+${V}er ${i}on j$j
+EOF
+ $im->add($eml);
+ }
+ $im->done;
+ }
+}
+my $first = shift @ibx; is($first->{name}, 'test-v1-3', 'first plucked');
+my $last = pop @ibx; is($last->{name}, 'test-v2-6', 'last plucked');
+my $eidx = PublicInbox::ExtSearchIdx->new("$home/eidx");
+$eidx->attach_inbox($first);
+$eidx->attach_inbox($last);
+$eidx->eidx_sync({fsync => 0});
+my $es = PublicInbox::ExtSearch->new("$home/eidx");
+my $lxs = PublicInbox::LeiXSearch->new;
+for my $ibxish (shuffle($es, @ibx)) {
+ $lxs->attach_extinbox($ibxish);
+}
+my $nr = $lxs->xdb->get_doccount;
+my $mset = $lxs->mset('d:19931002..19931003', { limit => $nr });
+is($mset->size, $nr, 'got all messages');
+my @msgs;
+for my $mi ($mset->items) {
+ if (my $smsg = $lxs->smsg_for($mi)) {
+ push @msgs, $smsg;
+ } else {
+ diag "E: ${\$mi->get_docid} missing";
+ }
+}
+is(scalar(@msgs), $nr, 'smsgs retrieved for all');
+
+$mset = $lxs->recent(undef, { limit => 1 });
+is($mset->size, 1, 'one result');
+my $max = max(map { $_->{docid} } @msgs);
+is($lxs->smsg_for(($mset->items)[0])->{docid}, $max,
+ 'got highest docid');
+
+done_testing;
prev parent reply other threads:[~2020-12-27 20:02 UTC|newest]
Thread overview: 30+ messages / expand[flat|nested] mbox.gz Atom feed top
2020-12-18 12:09 [PATCH 00/26] lei: basic UI + IPC work Eric Wong
2020-12-18 12:09 ` [PATCH 01/26] lei: FD-passing and IPC basics Eric Wong
2020-12-18 12:09 ` [PATCH 02/26] lei: proposed command-listing and options Eric Wong
2021-02-18 20:42 ` lei q --save-as=... requires too much thinking Eric Wong
2020-12-18 12:09 ` [PATCH 03/26] lei_store: local storage for Local Email Interface Eric Wong
2020-12-18 12:09 ` [PATCH 04/26] tests: more common JSON module loading Eric Wong
2020-12-18 12:09 ` [PATCH 05/26] lei: use spawn (vfork + execve) for lazy start Eric Wong
2020-12-18 12:09 ` [PATCH 06/26] lei: refine help/option parsing, implement "init" Eric Wong
2020-12-18 12:09 ` [PATCH 07/26] t/lei-oneshot: standalone oneshot (non-socket) test Eric Wong
2020-12-18 12:09 ` [PATCH 08/26] lei: ensure we run a restrictive umask Eric Wong
2020-12-18 12:09 ` [PATCH 09/26] lei: support `daemon-env' for modifying long-lived env Eric Wong
2020-12-18 12:09 ` [PATCH 10/26] lei_store: simplify git_epoch_max, slightly Eric Wong
2020-12-18 12:09 ` [PATCH 11/26] search: simplify initialization, add ->xdb_shards_flat Eric Wong
2020-12-18 12:09 ` [PATCH 12/26] rename LeiDaemon package to PublicInbox::LEI Eric Wong
2020-12-18 12:09 ` [PATCH 13/26] lei: support pass-through for `lei config' Eric Wong
2020-12-18 12:09 ` [PATCH 14/26] lei: help: show actual paths being operated on Eric Wong
2020-12-18 12:09 ` [PATCH 15/26] lei: rename $client => $self and bless Eric Wong
2020-12-18 12:09 ` [PATCH 16/26] lei: micro-optimize startup time Eric Wong
2020-12-18 12:09 ` [PATCH 17/26] lei_store: relax GIT_COMMITTER_IDENT check Eric Wong
2020-12-18 12:09 ` [PATCH 18/26] lei_store: keyword extraction from mbox and Maildir Eric Wong
2020-12-18 12:09 ` [PATCH 19/26] on_destroy: generic localized END Eric Wong
2020-12-18 12:09 ` [PATCH 20/26] lei: restore default __DIE__ handler for event loop Eric Wong
2020-12-18 12:09 ` [PATCH 21/26] lei: drop $SIG{__DIE__}, add oneshot fallbacks Eric Wong
2020-12-18 12:09 ` [PATCH 22/26] lei: start working on bash completion Eric Wong
2020-12-18 12:09 ` [PATCH 23/26] build: add lei.sh + "make symlink-install" target Eric Wong
2020-12-18 12:09 ` [PATCH 24/26] lei: support for -$DIGIT and -$SIG CLI switches Eric Wong
2020-12-18 12:09 ` [PATCH 25/26] lei: revise output routines Eric Wong
2020-12-18 12:09 ` [PATCH 26/26] lei: extinbox: start implementing in config file Eric Wong
2020-12-18 20:23 ` Eric Wong
2020-12-27 20:02 ` Eric Wong [this message]
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
List information: https://public-inbox.org/README
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20201227200251.GA31406@dcvr \
--to=e@80x24.org \
--cc=meta@public-inbox.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).