From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id D6D041FA13 for ; Mon, 23 Nov 2020 07:06:02 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 01/12] miscsearch: a new Xapian sub-DB for extindex Date: Mon, 23 Nov 2020 07:05:51 +0000 Message-Id: <20201123070602.9698-2-e@80x24.org> In-Reply-To: <20201123070602.9698-1-e@80x24.org> References: <20201123070602.9698-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: This will be used to index and search Inbox objects and perhaps individual git repositories/epochs for grokmirror manifest.js.gz generation. There is no sharding planned for this at the moment since inbox count should remain low (~100K to 1M) compared to message count. Folding this into the existing sharded DBs could be possible; but would likely increase query and maintenance costs, as well as development complexity. So we'll use a few more inodes and FDs at runtime, instead. 
--- MANIFEST | 3 + lib/PublicInbox/ExtSearch.pm | 6 ++ lib/PublicInbox/ExtSearchIdx.pm | 11 +++- lib/PublicInbox/MiscIdx.pm | 107 ++++++++++++++++++++++++++++++++ lib/PublicInbox/MiscSearch.pm | 79 +++++++++++++++++++++++ lib/PublicInbox/Search.pm | 8 +-- lib/PublicInbox/SearchIdx.pm | 7 ++- lib/PublicInbox/V2Writable.pm | 5 ++ t/extsearch.t | 3 + t/miscsearch.t | 54 ++++++++++++++++ 10 files changed, 275 insertions(+), 8 deletions(-) create mode 100644 lib/PublicInbox/MiscIdx.pm create mode 100644 lib/PublicInbox/MiscSearch.pm create mode 100644 t/miscsearch.t diff --git a/MANIFEST b/MANIFEST index fc79a134..544ec5f9 100644 --- a/MANIFEST +++ b/MANIFEST @@ -166,6 +166,8 @@ lib/PublicInbox/MIME.pm lib/PublicInbox/ManifestJsGz.pm lib/PublicInbox/Mbox.pm lib/PublicInbox/MboxGz.pm +lib/PublicInbox/MiscIdx.pm +lib/PublicInbox/MiscSearch.pm lib/PublicInbox/MsgIter.pm lib/PublicInbox/MsgTime.pm lib/PublicInbox/Msgmap.pm @@ -319,6 +321,7 @@ t/mda.t t/mda_filter_rubylang.t t/mid.t t/mime.t +t/miscsearch.t t/msg_iter-nested.eml t/msg_iter-order.eml t/msg_iter.t diff --git a/lib/PublicInbox/ExtSearch.pm b/lib/PublicInbox/ExtSearch.pm index eb665027..c41ae443 100644 --- a/lib/PublicInbox/ExtSearch.pm +++ b/lib/PublicInbox/ExtSearch.pm @@ -10,6 +10,7 @@ use v5.10.1; use PublicInbox::Over; use PublicInbox::Inbox; use File::Spec (); +use PublicInbox::MiscSearch; # for ->reopen, ->mset, ->mset_to_artnums use parent qw(PublicInbox::Search); @@ -24,6 +25,11 @@ sub new { }, __PACKAGE__; } +sub misc { + my ($self) = @_; + $self->{misc} //= PublicInbox::MiscSearch->new("$self->{xpfx}/misc"); +} + sub search { $_[0] } # self # overrides PublicInbox::Search::_xdb diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm index 91434b26..708f8a3e 100644 --- a/lib/PublicInbox/ExtSearchIdx.pm +++ b/lib/PublicInbox/ExtSearchIdx.pm @@ -21,6 +21,7 @@ use Carp qw(croak carp); use PublicInbox::Search; use PublicInbox::SearchIdx qw(crlf_adjust prepare_stack is_ancestor); use 
PublicInbox::OverIdx; +use PublicInbox::MiscIdx; use PublicInbox::MID qw(mids); use PublicInbox::V2Writable; use PublicInbox::InboxWritable; @@ -309,6 +310,7 @@ sub _sync_inbox ($$$) { return; } index_todo($self, $sync, $_) for @{delete($sync->{todo}) // []}; + $self->{midx}->index_ibx($ibx); } sub eidx_sync { # main entry point @@ -374,6 +376,12 @@ sub update_last_commit { # overrides V2Writable $self->{oidx}->eidx_meta($meta_key, $latest_cmt); } +sub _idx_init { # with_umask callback + my ($self, $opt) = @_; + PublicInbox::V2Writable::_idx_init($self, $opt); + $self->{midx} = PublicInbox::MiscIdx->new($self); +} + sub idx_init { # similar to V2Writable my ($self, $opt) = @_; return if $self->{idx_shards}; @@ -406,9 +414,10 @@ sub idx_init { # similar to V2Writable } $self->parallel_init($self->{indexlevel}); $self->umask_prepare; - $self->with_umask(\&PublicInbox::V2Writable::_idx_init, $self, $opt); + $self->with_umask(\&_idx_init, $self, $opt); $self->{oidx}->begin_lazy; $self->{oidx}->eidx_prep; + $self->{midx}->begin_txn; } no warnings 'once'; diff --git a/lib/PublicInbox/MiscIdx.pm b/lib/PublicInbox/MiscIdx.pm new file mode 100644 index 00000000..edc70f9b --- /dev/null +++ b/lib/PublicInbox/MiscIdx.pm @@ -0,0 +1,107 @@ +# Copyright (C) 2020 all contributors +# License: AGPL-3.0+ + +# like PublicInbox::SearchIdx, but for searching for non-mail messages. +# Things indexed include: +# * inboxes themselves +# * epoch information +# * (maybe) git code repository information +# Expect ~100K-1M documents with no parallelism opportunities, +# so no sharding, here. 
+#
+# See MiscSearch for read-only counterpart
+package PublicInbox::MiscIdx;
+use strict;
+use v5.10.1;
+use PublicInbox::InboxWritable;
+use PublicInbox::Search; # for SWIG Xapian and Search::Xapian compat
+use PublicInbox::SearchIdx qw(index_text term_generator add_val);
+use PublicInbox::Spawn qw(nodatacow_dir);
+use Carp qw(croak);
+use File::Path ();
+use PublicInbox::MiscSearch;
+
+# Constructor.  Creates the "$eidx->{xpfx}/misc" directory if needed and
+# records the Xapian open flags; the database itself is only opened by
+# begin_txn.  $eidx must provide {xpfx}; a true {-no_fsync} adds
+# DB_NO_SYNC to the flags.
+sub new {
+	my ($class, $eidx) = @_;
+	PublicInbox::SearchIdx::load_xapian_writable();
+	my $mi_dir = "$eidx->{xpfx}/misc";
+	File::Path::mkpath($mi_dir);
+	nodatacow_dir($mi_dir);
+	my $flags = $PublicInbox::SearchIdx::DB_CREATE_OR_OPEN;
+	$flags |= $PublicInbox::SearchIdx::DB_NO_SYNC if $eidx->{-no_fsync};
+	bless {
+		mi_dir => $mi_dir,
+		flags => $flags,
+		indexlevel => 'full', # small DB, no point in medium?
+	}, $class;
+}
+
+# Opens the writable database and starts a transaction on it.
+# Croaks if a transaction is already open or if opening the DB fails.
+sub begin_txn {
+	my ($self) = @_;
+	croak 'BUG: already in txn' if $self->{xdb}; # XXX make lazy?
+	my $wdb = $PublicInbox::Search::X{WritableDatabase};
+	my $xdb = eval { $wdb->new($self->{mi_dir}, $self->{flags}) };
+	croak "Failed opening $self->{mi_dir}: $@" if $@;
+	$self->{xdb} = $xdb;
+	$xdb->begin_transaction;
+}
+
+# Commits the open transaction and drops our reference to the database.
+# Croaks if begin_txn was not called first.
+sub commit_txn {
+	my ($self) = @_;
+	croak 'BUG: not in txn' unless $self->{xdb}; # XXX make lazy?
+	delete($self->{xdb})->commit_transaction;
+}
+
+# Adds or refreshes the document describing inbox $ibx.  Must be called
+# between begin_txn and commit_txn since it writes through $self->{xdb}.
+# The first document already carrying the "Q$eidx_key" term is replaced
+# in place; any further documents with that term are deleted.
+sub index_ibx {
+	my ($self, $ibx) = @_;
+	my $eidx_key = $ibx->eidx_key;
+	my $xdb = $self->{xdb};
+	# Q = uniQue in Xapian terminology
+	my $head = $xdb->postlist_begin('Q'.$eidx_key);
+	my $tail = $xdb->postlist_end('Q'.$eidx_key);
+	my ($docid, @drop);
+	for (; $head != $tail; $head++) {
+		if (defined $docid) {
+			my $i = $head->get_docid;
+			push @drop, $i;
+			# NOTE(review): this here-doc and the else branch
+			# below were lost to text extraction; the message
+			# wording is reconstructed, confirm against upstream
+			warn <<EOF;
+W: multiple inboxes keyed to `$eidx_key', deleting #$i
+EOF
+		} else {
+			# first match: remember it so it can be replaced
+			$docid = $head->get_docid;
+		}
+	}
+	$xdb->delete_document($_) for @drop; # just in case
+
+	my $doc = $PublicInbox::Search::X{Document}->new;
+
+	# allow sorting by modified
+	add_val($doc, $PublicInbox::MiscSearch::MODIFIED, $ibx->modified);
+
+	$doc->add_boolean_term('Q'.$eidx_key);
+	$doc->add_boolean_term('T'.'inbox');
+	term_generator($self)->set_document($doc);
+
+	# description = S/Subject (or title)
+	# address = A/Author
+	index_text($self, $ibx->description, 1, 'S');
+	my %map = (
+		address => 'A',
+		listid => 'XLISTID',
+		infourl => 'XINFOURL',
+		url => 'XURL'
+	);
+	while (my ($f, $pfx) = each %map) {
+		# these fields are array refs when set; absent ones are skipped
+		for my $v (@{$ibx->{$f} // []}) {
+			index_text($self, $v, 1, $pfx);
+		}
+	}
+	index_text($self, $ibx->{name}, 1, 'XNAME');
+	if (defined $docid) {
+		$xdb->replace_document($docid, $doc);
+	} else {
+		$xdb->add_document($doc);
+	}
+}
+
+1;
diff --git a/lib/PublicInbox/MiscSearch.pm b/lib/PublicInbox/MiscSearch.pm
new file mode 100644
index 00000000..8beb8349
--- /dev/null
+++ b/lib/PublicInbox/MiscSearch.pm
@@ -0,0 +1,79 @@
+# Copyright (C) 2020 all contributors
+# License: AGPL-3.0+
+
+# read-only counterpart to MiscIdx
+package PublicInbox::MiscSearch;
+use strict;
+use v5.10.1;
+use PublicInbox::Search qw(retry_reopen);
+
+# Xapian value columns:
+our $MODIFIED = 0;
+
+# avoid conflicting with message Search::prob_prefix for UI/UX reasons
+my %PROB_PREFIX = (
+	description => 'S', # $INBOX_DIR/description
+	address => 'A',
+	listid => 'XLISTID',
+	url => 'XURL',
+	infourl => 'XINFOURL',
+	name => 'XNAME',
+	'' => 'S A XLISTID XNAME XURL XINFOURL'
+);
+
+# Constructor: opens the Xapian database at $dir for reading.
+sub new {
+	my ($class, $dir) = @_;
+	# %PublicInbox::Search::X starts out as all zeros, so make sure
+	# Xapian has been loaded before calling ->new on X{Database}.
+	# NOTE(review): assumes load_xapian() fills in %X -- confirm
+	PublicInbox::Search::load_xapian();
+	bless {
+		xdb => $PublicInbox::Search::X{Database}->new($dir)
+	}, $class;
+}
+
+# read-only
+# Builds the QueryParser for $self->{xdb}: AND as the default operator,
+# STEM_SOME stemming, wildcard expansion capped at 100, one probabilistic
+# prefix per %PROB_PREFIX entry and a boolean "type:" prefix.
+sub mi_qp_new ($) {
+	my ($self) = @_;
+	my $xdb = $self->{xdb};
+	my $qp = $PublicInbox::Search::X{QueryParser}->new;
+	$qp->set_default_op(PublicInbox::Search::OP_AND());
+	$qp->set_database($xdb);
+	$qp->set_stemmer(PublicInbox::Search::stemmer($self));
+	$qp->set_stemming_strategy(PublicInbox::Search::STEM_SOME());
+	my $cb = $qp->can('set_max_wildcard_expansion') //
+		$qp->can('set_max_expansion'); # Xapian 1.5.0+
+	$cb->($qp, 100);
+	# keys() rather than each(): %PROB_PREFIX is shared, file-scoped
+	# state and an interrupted each() would leave its iterator midway
+	for my $name (sort keys %PROB_PREFIX) {
+		$qp->add_prefix($name, $_) for split(/ /, $PROB_PREFIX{$name});
+	}
+	$qp->add_boolean_prefix('type', 'T');
+	$qp;
+}
+
+# retry_reopen callback; takes a single array ref [ $self, $query, $opt ].
+# $opt->{relevance} selects the ordering:
+#   -1    => by docid ascending, using BoolWeight
+#   true  => by relevance, then by the MODIFIED value
+#   false => by the MODIFIED value, then by relevance
+# Value sorting is descending unless $opt->{asc} is true.
+# Returns the mset starting at $opt->{offset} (default 0) with at most
+# $opt->{limit} (default 200) items.
+sub misc_enquire_once { # retry_reopen callback
+	my ($self, $qr, $opt) = @{$_[0]};
+	my $eq = $PublicInbox::Search::X{Enquire}->new($self->{xdb});
+	$eq->set_query($qr);
+	my $desc = !$opt->{asc};
+	my $rel = $opt->{relevance} // 0;
+	if ($rel == -1) { # ORDER BY docid/UID
+		$eq->set_docid_order($PublicInbox::Search::ENQ_ASCENDING);
+		$eq->set_weighting_scheme($PublicInbox::Search::X{BoolWeight}->new);
+	} elsif ($rel) {
+		$eq->set_sort_by_relevance_then_value($MODIFIED, $desc);
+	} else {
+		$eq->set_sort_by_value_then_relevance($MODIFIED, $desc);
+	}
+	$eq->get_mset($opt->{offset} || 0, $opt->{limit} || 200);
+}
+
+# Runs query string $qs against the misc DB and returns the mset.
+# An undefined or empty $qs is treated as 'type:inbox'.  $opt is an
+# optional hash ref (see misc_enquire_once); relevance defaults to 1
+# when the caller did not supply that key.
+sub mset {
+	my ($self, $qs, $opt) = @_;
+	# work on a copy so the caller's hash is never modified
+	my %o = (relevance => 1, %{$opt || {}});
+	my $qp = $self->{qp} //= mi_qp_new($self);
+	$qs = 'type:inbox' if !defined($qs) || $qs eq '';
+	my $qr = $qp->parse_query($qs, $PublicInbox::Search::QP_FLAGS);
+	retry_reopen($self, \&misc_enquire_once, [ $self, $qr, \%o ]);
+}
+
+1;
diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index 71417d5e..05d5a133 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -6,7 +6,7 @@ package 
PublicInbox::Search; use strict; use parent qw(Exporter); -our @EXPORT_OK = qw(mdocid); +our @EXPORT_OK = qw(mdocid retry_reopen); use List::Util qw(max); # values for searching, changing the numeric value breaks @@ -54,11 +54,11 @@ use constant { use PublicInbox::Smsg; use PublicInbox::Over; -my $QP_FLAGS; +our $QP_FLAGS; our %X = map { $_ => 0 } qw(BoolWeight Database Enquire QueryParser Stem); our $Xap; # 'Search::Xapian' or 'Xapian' -my $NVRP; # '$Xap::'.('NumberValueRangeProcessor' or 'NumberRangeProcessor') -my $ENQ_ASCENDING; +our $NVRP; # '$Xap::'.('NumberValueRangeProcessor' or 'NumberRangeProcessor') +our $ENQ_ASCENDING; sub load_xapian () { return 1 if defined $Xap; diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 6ff2cf94..18390602 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -22,9 +22,10 @@ use PublicInbox::OverIdx; use PublicInbox::Spawn qw(spawn nodatacow_dir); use PublicInbox::Git qw(git_unquote); use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp); -our @EXPORT_OK = qw(crlf_adjust log2stack is_ancestor check_size prepare_stack); +our @EXPORT_OK = qw(crlf_adjust log2stack is_ancestor check_size prepare_stack + index_text term_generator add_val); my $X = \%PublicInbox::Search::X; -my ($DB_CREATE_OR_OPEN, $DB_OPEN); +our ($DB_CREATE_OR_OPEN, $DB_OPEN); our $DB_NO_SYNC = 0; our $BATCH_BYTES = $ENV{XAPIAN_FLUSH_THRESHOLD} ? 
0x7fffffff : 1_000_000; use constant DEBUG => !!$ENV{DEBUG}; @@ -154,7 +155,7 @@ sub term_generator ($) { # write-only $self->{term_generator} //= do { my $tg = $X->{TermGenerator}->new; - $tg->set_stemmer($self->stemmer); + $tg->set_stemmer(PublicInbox::Search::stemmer($self)); $tg; } } diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index ba7cef13..afba0220 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -631,6 +631,9 @@ sub checkpoint ($;$) { $_->shard_commit for @$shards; } + my $midx = $self->{midx}; # misc index + $midx->commit_txn if $midx; + # last_commit is special, don't commit these until # Xapian shards are done: $dbh->begin_work if $dbh; @@ -639,6 +642,7 @@ sub checkpoint ($;$) { $dbh->commit; $dbh->begin_work; } + $midx->begin_txn if $midx; } $self->{total_bytes} += $self->{transact_bytes}; $self->{transact_bytes} = 0; @@ -678,6 +682,7 @@ sub done { } eval { $self->{oidx}->dbh_close }; $err .= "over close: $@\n" if $@; + delete $self->{midx}; delete $self->{bnote}; my $nbytes = $self->{total_bytes}; $self->{total_bytes} = 0; diff --git a/t/extsearch.t b/t/extsearch.t index 8792fd9e..e28e2f71 100644 --- a/t/extsearch.t +++ b/t/extsearch.t @@ -72,4 +72,7 @@ my $es = PublicInbox::ExtSearch->new("$home/eindex"); isnt($x1->[0], $x2->[0], 'xref3 differs'); } +my $misc = $es->misc; +is(scalar($misc->mset('')->items), 2, 'two inboxes'); + done_testing; diff --git a/t/miscsearch.t b/t/miscsearch.t new file mode 100644 index 00000000..45a19da9 --- /dev/null +++ b/t/miscsearch.t @@ -0,0 +1,54 @@ +#!perl -w +# Copyright (C) 2020 all contributors +# License: AGPL-3.0+ +use strict; +use Test::More; +use PublicInbox::TestCommon; +use PublicInbox::InboxWritable; +require_mods(qw(Search::Xapian DBD::SQLite)); +use_ok 'PublicInbox::MiscSearch'; +use_ok 'PublicInbox::MiscIdx'; + +my ($tmp, $for_destroy) = tmpdir(); +my $eidx = { xpfx => "$tmp/eidx", -no_fsync => 1 }; # mock ExtSearchIdx +{ + mkdir "$tmp/v1" or 
BAIL_OUT "mkdir $!"; + open my $fh, '>', "$tmp/v1/description" or BAIL_OUT "open: $!"; + print $fh "Everything sucks this year\n" or BAIL_OUT "print $!"; + close $fh or BAIL_OUT "close $!"; +} +{ + my $v1 = PublicInbox::InboxWritable->new({ + inboxdir => "$tmp/v1", + name => 'hope', + address => [ 'nope@example.com' ], + indexlevel => 'basic', + version => 1, + }); + $v1->init_inbox; + my $mi = PublicInbox::MiscIdx->new($eidx); + $mi->begin_txn; + $mi->index_ibx($v1); + $mi->commit_txn; +} + +my $ms = PublicInbox::MiscSearch->new("$tmp/eidx/misc"); +my $mset = $ms->mset('"everything sucks today"'); +is(scalar($mset->items), 0, 'no match on description phrase'); + +$mset = $ms->mset('"everything sucks this year"'); +is(scalar($mset->items), 1, 'match phrase on description'); + +$mset = $ms->mset('everything sucks'); +is(scalar($mset->items), 1, 'match words in description'); + +$mset = $ms->mset('nope@example.com'); +is(scalar($mset->items), 1, 'match full address'); + +$mset = $ms->mset('nope'); +is(scalar($mset->items), 1, 'match partial address'); + +$mset = $ms->mset('hope'); +is(scalar($mset->items), 1, 'match name'); + +done_testing;