From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.2 required=3.0 tests=ALL_TRUSTED,AWL,BAYES_00, DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,DKIM_VALID_EF shortcircuit=no autolearn=ham autolearn_force=no version=3.4.6 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 629CF1F549 for ; Sat, 10 Aug 2024 09:00:13 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=80x24.org; s=selector1; t=1723280413; bh=0xQIecsmEshDhKxKG7DxhmbhIpl1f7KI1Ckm0Wz9kzs=; h=From:To:Subject:Date:In-Reply-To:References:From; b=XeuAnmQ88WDiT2No/JaRJ6AZPgifHtLQr9CgXPfg/37OWQpcPS6e1gilyzksSrfXo lI21HszaJ8Nmw19h0ANcmrj9KFjKp/C+dLOS9ftQ5vp7PuY/VIom7XjqVWoQ0nThlh WH30CgvD2PR54iZPWF/2/0+MvdYJQiEdFQYAX1Gw= From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 01/11] search: support per-inbox indexheader directive Date: Sat, 10 Aug 2024 09:00:02 +0000 Message-ID: <20240810090012.23269-2-e@80x24.org> In-Reply-To: <20240810090012.23269-1-e@80x24.org> References: <20240810090012.23269-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: This allows indexing arbitrary headers to allow filtering by boolean terms or existing text rules. Disabling RFC 2047 decoding is supported, as well. This also refactors AltId support to rely on the same mechanisms as the IndexHeader class for indexing, user help, and Xapian::QueryParser setup via both bindings and external XapHelper process to avoid adding complexity to Search.pm and SearchIdx.pm. We'll finally document altid support in public-inbox-config(5) since we're in the area, as it's been a stable feature for many years, now. 
--- Documentation/public-inbox-config.pod | 62 ++++++++++++++++++ MANIFEST | 2 + lib/PublicInbox/AltId.pm | 60 +++++++++-------- lib/PublicInbox/Config.pm | 2 +- lib/PublicInbox/IndexHeader.pm | 73 +++++++++++++++++++++ lib/PublicInbox/Search.pm | 43 +++++++------ lib/PublicInbox/SearchIdx.pm | 34 +++++----- t/watch_indexheader.t | 92 +++++++++++++++++++++++++++ 8 files changed, 306 insertions(+), 62 deletions(-) create mode 100644 lib/PublicInbox/IndexHeader.pm create mode 100644 t/watch_indexheader.t diff --git a/Documentation/public-inbox-config.pod b/Documentation/public-inbox-config.pod index b4a1d94d..50746b21 100644 --- a/Documentation/public-inbox-config.pod +++ b/Documentation/public-inbox-config.pod @@ -172,6 +172,68 @@ link to the line numbers of blobs. Default: none +=item publicinbox.<name>.altid + +Index by an alternative ID mechanism as a Xapian search prefix e.g. +C<gmane:12345>. This is useful to allow looking up legacy serial IDs +(e.g. gmane article numbers). + +It must be specified in the form of +C<serial:$USER_PREFIX:file=$SQLITE_FILENAME> where C<$USER_PREFIX> is a +lowercase prefix like C<gmane> for search queries, and +C<$SQLITE_FILENAME> points to an SQLite DB. C<$SQLITE_FILENAME> may +be an absolute path or a path relative to C<inboxdir> for v2 inboxes or +C<inboxdir/public-inbox> for v1 inboxes. + +The schema of C<$SQLITE_FILENAME> should be the same as a +C<msgmap.sqlite3>. See C<scripts/xhdr-num2mid> in the public-inbox +source tree for an example of how to generate such a mapping +via NNTP. + +This is a noop with C<indexlevel=basic> + +Default: none + +=item publicinbox.<name>.indexheader + +Supports indexing of arbitrary mail headers in Xapian. + +It must be specified in the form of +C<$TYPE:$USER_PREFIX:$MAIL_HEADER:$PARAMS> +where C<$TYPE> determines how it's indexed and queried; +C<$USER_PREFIX> is a lowercase prefix for search queries, +C<$MAIL_HEADER> is the header to index (e.g. C<X-Label>), +C<$PARAMS> is a URL-style query string for optional parameters.
+ +Valid C<$TYPE> values (in ascending order of storage cost) are as follows: + +* C<boolean_term> - index for simple filtering (not sortable by relevance) + +* C<text> - add frequency information to allow sorting by relevance + +* C<phrase> - add positional information to match sentences or phrases + +In other words: C<phrase> forces indexing of a particular header to +behave like it used C<indexlevel=full>; while C<text> indexes as if +that header used C<indexlevel=medium>. + +Valid keys in C<$PARAMS> include: + +* raw - do not perform RFC2047 decoding of headers + +Example: + + [publicinbox "foo"] + indexheader = boolean_term:xlabel:X-Label:raw=1 + +Support for other parameters is not finalized and subject to change. + +This is a noop with C<indexlevel=basic> + +New in public-inbox 2.0.0 (PENDING) + +Default: none + =item publicinbox.<name>.replyto May be used to control how reply instructions in the PSGI diff --git a/MANIFEST b/MANIFEST index af65a86e..34d3ef14 100644 --- a/MANIFEST +++ b/MANIFEST @@ -228,6 +228,7 @@ lib/PublicInbox/In3Watch.pm lib/PublicInbox/Inbox.pm lib/PublicInbox/InboxIdle.pm lib/PublicInbox/InboxWritable.pm +lib/PublicInbox/IndexHeader.pm lib/PublicInbox/Inotify.pm lib/PublicInbox/Inotify3.pm lib/PublicInbox/InputPipe.pm @@ -630,6 +631,7 @@ t/v2writable.t t/view.t t/watch_filter_rubylang.t t/watch_imap.t +t/watch_indexheader.t t/watch_maildir.t t/watch_maildir_v2.t t/watch_mh.t diff --git a/lib/PublicInbox/AltId.pm b/lib/PublicInbox/AltId.pm index 80757ceb..bd6cf973 100644 --- a/lib/PublicInbox/AltId.pm +++ b/lib/PublicInbox/AltId.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2016-2021 all contributors +# Copyright (C) all contributors # License: AGPL-3.0+ # Used for giving serial numbers to messages. This can be tied to @@ -10,25 +10,20 @@ # it leads to reliance on centralization. However, being able # to use existing serial numbers is beneficial.
package PublicInbox::AltId; -use strict; -use warnings; -use URI::Escape qw(uri_unescape); -use PublicInbox::Msgmap; +use v5.12; +use parent qw(PublicInbox::IndexHeader); # spec: TYPE:PREFIX:param1=value1&param2=value2&... # The PREFIX will be a searchable boolean prefix in Xapian # Example: serial:gmane:file=/path/to/altmsgmap.sqlite3 sub new { my ($class, $ibx, $spec, $writable) = @_; - my ($type, $prefix, $query) = split(/:/, $spec, 3); - $type eq 'serial' or die "non-serial not supported, yet\n"; - $prefix =~ /\A\w+\z/ or warn "non-word prefix not searchable\n"; - my %params = map { - my ($k, $v) = split(/=/, uri_unescape($_), 2); - $v = '' unless defined $v; - ($k, $v); - } split(/[&;]/, $query); - my $f = $params{file} or die "file: required for $type spec $spec\n"; + my ($type, $pfx, $query) = split /:/, $spec, 3; + $type eq 'serial' or die "E: non-serial not supported, yet ($spec)\n"; + my $self = bless {}, $class; + my $params = $self->extra_indexer_new_common($spec, $pfx, $query); + my $f = delete $params->{file} or + die "E: file= required for $type spec $spec\n"; unless (index($f, '/') == 0) { if ($ibx->version == 1) { $f = "$ibx->{inboxdir}/public-inbox/$f"; @@ -36,26 +31,37 @@ sub new { $f = "$ibx->{inboxdir}/$f"; } } - bless { - filename => $f, - writable => $writable, - prefix => $prefix, - xprefix => 'X'.uc($prefix), - }, $class; + my @k = keys %$params; + warn "W: unknown params in `$spec': ", join(', ', @k), "\n" if @k; + $self->{filename} = $f; + $self->{writable} = $writable if $writable; + $self; } -sub mm_alt { +sub mm_alt ($) { my ($self) = @_; $self->{mm_alt} ||= eval { - my $f = $self->{filename}; - my $writable = $self->{writable}; - PublicInbox::Msgmap->new_file($f, $writable); + require PublicInbox::Msgmap; + PublicInbox::Msgmap->new_file(@$self{qw(filename writable)}); }; } -sub mid2alt { - my ($self, $mid) = @_; - $self->mm_alt->num_for($mid); +sub index_extra { # for PublicInbox::SearchIdx + my ($self, $sidx, $eml, $mids) = @_; + for my
$mid (@$mids) { + my $id = mm_alt($self)->num_for($mid) // next; + $sidx->index_boolean_term($self->{xprefix}, $id); + } } +sub user_help { # for PublicInbox::Search + my ($self) = @_; + ("$self->{prefix}:", <{prefix}:12345 (boolean) +EOF +} + +# callback for PublicInbox::Search +sub query_parser_method { 'add_boolean_prefix' } + 1; diff --git a/lib/PublicInbox/Config.pm b/lib/PublicInbox/Config.pm index 998fc25e..3af5f23c 100644 --- a/lib/PublicInbox/Config.pm +++ b/lib/PublicInbox/Config.pm @@ -481,7 +481,7 @@ sub _fill_ibx { # more things to encourage decentralization for my $k (qw(address altid nntpmirror imapmirror coderepo hide listid url - infourl watchheader + infourl watchheader indexheader nntpserver imapserver pop3server)) { my $v = $self->{"$pfx.$k"} // next; $ibx->{$k} = _array($v); diff --git a/lib/PublicInbox/IndexHeader.pm b/lib/PublicInbox/IndexHeader.pm new file mode 100644 index 00000000..53e9373b --- /dev/null +++ b/lib/PublicInbox/IndexHeader.pm @@ -0,0 +1,73 @@ +# Copyright (C) all contributors +# License: AGPL-3.0+ + +# allow searching on arbitrary headers as text +package PublicInbox::IndexHeader; +use v5.12; +use URI::Escape qw(uri_unescape); + +my %T2IDX = ( # map to PublicInbox::SearchIdx methods + phrase => 'index_phrase1', + boolean_term => 'index_boolean_term', + text => 'index_text1', +); + +# also called by AltId->new +sub extra_indexer_new_common ($$$$) { + my ($self, $spec, $pfx, $query) = @_; + $pfx =~ /\A[a-z][a-z0-9]*\z/ or + warn "W: non-word prefix in `$spec' not searchable\n"; + $self->{prefix} = $pfx; + my %params = map { + my ($k, $v) = split /=/, uri_unescape($_), 2; + ($k, $v // ''); + } split /[&;]/, $query // ''; + my $xpfx = delete($params{index_prefix}) // "X\U$pfx"; + $xpfx =~ /\A[A-Z][A-Z0-9]*\z/ or die + die "E: `index_prefix' in `$spec' must be ALL CAPS\n"; + $self->{xprefix} = $xpfx; + \%params; +} + +sub new { + my ($cls, $ibx, $spec) = @_; + my ($type, $pfx, $header, $query) = split /:/, $spec, 4; + $pfx // die 
"E: `$spec' has no user prefix\n"; + $header // die "E: `$spec' has no mail header\n"; + my $self = bless { header => $header, type => $type }, $cls; + my $params = extra_indexer_new_common $self, $spec, $pfx, $query; + $self->{hdr_method} = delete $params->{raw} ? 'header_raw' : 'header'; + my @k = keys %$params; + warn "W: unknown params in `$spec': ", join(', ', @k), "\n" if @k; + $T2IDX{$type} // die + "E: `$type' not supported in $spec, must be one of: ", + join(', ', sort keys %T2IDX), "\n"; + $self; +} + +sub index_extra { # for PublicInbox::SearchIdx + my ($self, $sidx, $eml, $mids) = @_; + my $idx_method = $self->{-idx_method} //= $T2IDX{$self->{type}}; + my $hdr_method = $self->{hdr_method}; + for my $val ($eml->$hdr_method($self->{header})) { + $sidx->$idx_method($self->{xprefix}, $val); + } +} + +sub user_help { # for PublicInbox::Search + my ($self) = @_; + ("$self->{prefix}:", <{header}' mail header e.g. $self->{prefix}:stable +EOF +} + +my %TYPE_2_QPMETHOD = ( + phrase => 'add_prefix', + boolean_term => 'add_boolean_prefix', + text => 'add_prefix', +); + +# callback for PublicInbox::Search +sub query_parser_method { $TYPE_2_QPMETHOD{$_[0]->{type}} } + +1; diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index 649157be..6a0bdb0f 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -292,13 +292,25 @@ sub xdb ($) { }; } +sub load_extra_indexers ($$) { + my ($self, $ibx) = @_; + my @extra; + for my $f (qw(IndexHeader AltId)) { + my $specs = $ibx->{lc $f} // next; + my $cls = "PublicInbox::$f"; + eval "require $cls" or die $@; + push @extra, map { $cls->new($ibx, $_) } @$specs; + } + $self->{-extra} = \@extra if @extra; +} + sub new { my ($class, $ibx) = @_; ref $ibx or die "BUG: expected PublicInbox::Inbox object: $ibx"; my $xap = $ibx->version > 1 ? 
'xap' : 'public-inbox/xapian'; my $xpfx = "$ibx->{inboxdir}/$xap".SCHEMA_VERSION; my $self = bless { xpfx => $xpfx }, $class; - $self->{altid} = $ibx->{altid} if defined($ibx->{altid}); + $self->load_extra_indexers($ibx); $self; } @@ -439,6 +451,8 @@ sub xhc_start_maybe (@) { $xhc; } +my %QPMETHOD_2_SYM = (add_prefix => ':', add_boolean_prefix => '='); + sub xh_opt ($$) { my ($self, $opt) = @_; my $lim = $opt->{limit} || 50; @@ -464,9 +478,9 @@ sub xh_opt ($$) { push @ret, '-O', $opt->{eidx_key} if defined $opt->{eidx_key}; my $apfx = $self->{-alt_pfx} //= do { my @tmp; - for (grep /\Aserial:/, @{$self->{altid} // []}) { - my (undef, $pfx) = split /:/, $_; - push @tmp, '-Q', "$pfx=X\U$pfx"; + for my $x (@{$self->{-extra} // []}) { + my $sym = $QPMETHOD_2_SYM{$x->query_parser_method}; + push @tmp, '-Q', $x->{prefix}.$sym.$x->{xprefix}; } # TODO: arbitrary header indexing goes here \@tmp; @@ -593,21 +607,12 @@ sub qparse_new { $qp->add_boolean_prefix($name, $_) foreach split(/ /, $prefix); } - # we do not actually create AltId objects, - # just parse the spec to avoid the extra DB handles for now. 
- if (my $altid = $self->{altid}) { + if (my $extra = $self->{-extra}) { my $user_pfx = $self->{-user_pfx} = []; - for (@$altid) { - # $_ = 'serial:gmane:/path/to/gmane.msgmap.sqlite3' - # note: Xapian supports multibyte UTF-8, /^[0-9]+$/, - # and '_' with prefixes matching \w+ - /\Aserial:(\w+):/ or next; - my $pfx = $1; - push @$user_pfx, "$pfx:", < XGMANE - $qp->add_boolean_prefix($pfx, 'X'.uc($pfx)); + for my $x (@$extra) { + push @$user_pfx, $x->user_help; + my $m = $x->query_parser_method; + $qp->$m(@$x{qw(prefix xprefix)}); } chomp @$user_pfx; } @@ -654,7 +659,7 @@ EOM sub help { my ($self) = @_; - $self->{qp} // $self->qparse_new; # parse altids + $self->{qp} // $self->qparse_new; # parse altids + indexheaders my @ret = @HELP; if (my $user_pfx = $self->{-user_pfx}) { push @ret, @$user_pfx; diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 4fd493d9..b2576e52 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -52,11 +52,6 @@ sub new { my $inboxdir = $ibx->{inboxdir}; my $version = $ibx->version; my $indexlevel = 'full'; - my $altid = $ibx->{altid}; - if ($altid) { - require PublicInbox::AltId; - $altid = [ map { PublicInbox::AltId->new($ibx, $_); } @$altid ]; - } if ($ibx->{indexlevel}) { if ($ibx->{indexlevel} =~ $INDEXLEVELS) { $indexlevel = $ibx->{indexlevel}; @@ -69,7 +64,7 @@ sub new { my $self = PublicInbox::Search->new($ibx); bless $self, $class; $self->{ibx} = $ibx; - $self->{-altid} = $altid; + $self->load_extra_indexers($ibx); $self->{indexlevel} = $indexlevel; $self->{-set_indexlevel_once} = 1 if $indexlevel eq 'medium'; if ($ibx->{-skip_docdata}) { @@ -184,6 +179,22 @@ sub index_phrase ($$$$) { $self->{term_generator}->increase_termpos; } +sub index_phrase1 { # called by various ->index_extra + my ($self, $pfx, $text) = @_; + index_phrase $self, $text, 1, $pfx; +} + +sub index_text1 { # called by various ->index_extra + my ($self, $pfx, $text) = @_; + 
$self->{term_generator}->index_text_without_positions($text, 1, $pfx); +} + +sub index_boolean_term { # called by various ->index_extra + my ($self, $pfx, $term) = @_; + my $doc = $self->{term_generator}->get_document; + $doc->add_boolean_term($pfx.$term); +} + sub index_text ($$$$) { my ($self, $text, $wdf_inc, $prefix) = @_; @@ -481,15 +492,8 @@ sub eml2doc ($$$;$) { $doc->set_data($data); } - if (my $altid = $self->{-altid}) { - foreach my $alt (@$altid) { - my $pfx = $alt->{xprefix}; - foreach my $mid (@$mids) { - my $id = $alt->mid2alt($mid); - next unless defined $id; - $doc->add_boolean_term($pfx . $id); - } - } + for my $extra (@{$self->{-extra} // []}) { + $extra->index_extra($self, $eml, $mids); } $doc; } diff --git a/t/watch_indexheader.t b/t/watch_indexheader.t new file mode 100644 index 00000000..e815fca9 --- /dev/null +++ b/t/watch_indexheader.t @@ -0,0 +1,92 @@ +# Copyright (C) all contributors +# License: AGPL-3.0+ +use v5.12; +use autodie; +use PublicInbox::TestCommon; +use PublicInbox::Eml; +use PublicInbox::Emergency; +use PublicInbox::IO qw(write_file); +use PublicInbox::InboxIdle; +use PublicInbox::Inbox; +use PublicInbox::DS; +use PublicInbox::Config; +require_mods(qw(DBD::SQLite Xapian)); +my $tmpdir = tmpdir; +my $config = "$tmpdir/pi_config"; +local $ENV{PI_CONFIG} = $config; +delete local $ENV{PI_DIR}; +my @V = (1); +my @creat_opt = (indexlevel => 'medium', sub {}); +my $v1 = create_inbox 'v1', tmpdir => "$tmpdir/v1", @creat_opt; +my $fh = write_file '>', $config, <{inboxdir} + address = v1\@example.com + watch = maildir:$tmpdir/v1-md + indexheader = boolean_term:xarchiveshash:X-Archives-Hash +EOM + +SKIP: { + require_git(v2.6, 1); + push @V, 2; + my $v2 = create_inbox 'v2', tmpdir => "$tmpdir/v2", @creat_opt; + print $fh <new; +for my $v (@V) { for ('', qw(cur new tmp)) { mkdir "$tmpdir/v$v-md/$_" } } +my $wm = start_script([qw(-watch)]); +my $h1 = 'deadbeef' x 4; +my @em = map { + my $v = $_; + my $em = 
PublicInbox::Emergency->new("$tmpdir/v$v-md"); + $em->prepare(\(PublicInbox::Eml->new(<as_string)); +From: x\@example.com +Message-ID: +To: +Date: Sat, 02 Oct 2010 00:00:00 +0000 +X-Archives-Hash: $h1 + +EOM + $em; +} @V; + +my $delivered = 0; +my $cb = sub { + diag "message delivered to `$_[0]->{name}'"; + ++$delivered; +}; +PublicInbox::DS->Reset; +my $ii = PublicInbox::InboxIdle->new($cfg); +my $obj = bless \$cb, 'PublicInbox::TestCommon::InboxWakeup'; +$cfg->each_inbox(sub { $_[0]->subscribe_unlock('ident', $obj) }); +local @PublicInbox::DS::post_loop_do = (sub { $delivered != @V }); +$_->commit for @em; +diag 'waiting for -watch to import new message(s)'; +PublicInbox::DS::event_loop(); +$wm->join('TERM'); +$ii->close; + +$cfg->each_inbox(sub { + my ($ibx) = @_; + my $srch = $ibx->search; + my $mset = $srch->mset('xarchiveshash:miss'); + is($mset->size, 0, 'got xarchiveshash:miss non-result'); + $mset = $srch->mset("xarchiveshash:$h1"); + is($mset->size, 1, 'got xarchiveshash: hit result') or return; + my $num = $srch->mset_to_artnums($mset); + my $eml = $ibx->smsg_eml($ibx->over->get_art($num->[0])); + is($eml->header_raw('X-Archives-Hash'), $h1, + 'stored message with X-Archives-Hash'); + my @opt = $srch->xh_opt; + is $opt[-2], '-Q', 'xap_helper -Q switch'; + is $opt[-1], 'xarchiveshash=XXARCHIVESHASH', 'xap_helper -Q arg'; +}); + +done_testing;