From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 511A01F9FD for ; Sun, 21 Mar 2021 09:50:47 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 1/3] lei import: vivify external-only messages Date: Sun, 21 Mar 2021 15:50:45 +0600 Message-Id: <20210321095047.13855-2-e@80x24.org> In-Reply-To: <20210321095047.13855-1-e@80x24.org> References: <20210321095047.13855-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: Keyword storage for external-only messages was preventing messages from being explicitly imported. Teach lei_store to vivify keyword-only entries into fully-indexed messages on import. --- lib/PublicInbox/Import.pm | 14 ++++++++++- lib/PublicInbox/LeiImport.pm | 22 +++++++++++------ lib/PublicInbox/LeiSearch.pm | 5 +++- lib/PublicInbox/LeiStore.pm | 46 +++++++++++++++++++++++++++++++----- lib/PublicInbox/Over.pm | 2 +- lib/PublicInbox/SearchIdx.pm | 12 ++++++++-- t/lei-q-kw.t | 44 ++++++++++++++++++++++++++++++++++ 7 files changed, 127 insertions(+), 18 deletions(-) diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm index b8fa5c21..34738279 100644 --- a/lib/PublicInbox/Import.pm +++ b/lib/PublicInbox/Import.pm @@ -413,7 +413,19 @@ sub add { $smsg->{blob} = $self->get_mark(":$blob"); $smsg->set_bytes($raw_email, $n); if (my $oidx = delete $smsg->{-oidx}) { # used by LeiStore - return if $oidx->blob_exists($smsg->{blob}); + my @docids = $oidx->blob_exists($smsg->{blob}); + my @vivify_xvmd; + for my $id (@docids) { + if (my $cur = $oidx->get_art($id)) { + # already imported if bytes > 0 + return if $cur->{bytes} > 0; + push @vivify_xvmd, $id; + } else { + warn "W: $smsg->{blob} ", + "#$id gone 
(bug?)\n"; + } + } + $smsg->{-vivify_xvmd} = \@vivify_xvmd; } } my $ref = $self->{ref}; diff --git a/lib/PublicInbox/LeiImport.pm b/lib/PublicInbox/LeiImport.pm index 137c22fc..ae24a1fa 100644 --- a/lib/PublicInbox/LeiImport.pm +++ b/lib/PublicInbox/LeiImport.pm @@ -10,9 +10,14 @@ use PublicInbox::Eml; use PublicInbox::PktOp qw(pkt_do); sub _import_eml { # MboxReader callback - my ($eml, $sto, $set_kw) = @_; - $sto->ipc_do('set_eml', $eml, $set_kw ? - { kw => PublicInbox::MboxReader::mbox_keywords($eml) } : ()); + my ($eml, $lei, $mbox_keywords) = @_; + my $vmd; + if ($mbox_keywords) { + my $kw = $mbox_keywords->($eml); + $vmd = { kw => $kw } if scalar(@$kw); + } + my $xoids = $lei->{ale}->xoids_for($eml); + $lei->{sto}->ipc_do('set_eml', $eml, $vmd, $xoids); } sub import_done_wait { # dwaitpid callback @@ -41,6 +46,7 @@ sub net_merge_complete { # callback used by LeiAuth sub import_start { my ($lei) = @_; my $self = $lei->{imp}; + $lei->ale; my $j = $lei->{opt}->{jobs} // scalar(@{$self->{inputs}}) || 1; if (my $net = $lei->{net}) { # $j = $net->net_concurrency($j); TODO @@ -130,7 +136,8 @@ sub ipc_atfork_child { sub _import_fh { my ($lei, $fh, $input, $ifmt) = @_; - my $set_kw = $lei->{opt}->{kw}; + my $kw = $lei->{opt}->{kw} ? + PublicInbox::MboxReader->can('mbox_keywords') : undef; eval { if ($ifmt eq 'eml') { my $buf = do { local $/; <$fh> } // @@ -138,11 +145,11 @@ sub _import_fh { error reading $input: $! 
my $eml = PublicInbox::Eml->new(\$buf); - _import_eml($eml, $lei->{sto}, $set_kw); + _import_eml($eml, $lei, $kw); } else { # some mbox (->can already checked in call); my $cb = PublicInbox::MboxReader->can($ifmt) // die "BUG: bad fmt=$ifmt"; - $cb->(undef, $fh, \&_import_eml, $lei->{sto}, $set_kw); + $cb->(undef, $fh, \&_import_eml, $lei, $kw); } }; $lei->child_error(1 << 8, "$input: $@") if $@; @@ -193,7 +200,8 @@ EOM sub import_stdin { my ($self) = @_; my $lei = $self->{lei}; - _import_fh($lei, delete $self->{0}, '', $lei->{opt}->{'in-format'}); + my $in = delete $self->{0}; + _import_fh($lei, $in, '', $lei->{opt}->{'in-format'}); } no warnings 'once'; # the following works even when LeiAuth is lazy-loaded diff --git a/lib/PublicInbox/LeiSearch.pm b/lib/PublicInbox/LeiSearch.pm index 360a37e5..bbb00661 100644 --- a/lib/PublicInbox/LeiSearch.pm +++ b/lib/PublicInbox/LeiSearch.pm @@ -63,7 +63,10 @@ sub _cmp_1st { # git->cat_async callback } } -sub xoids_for { # returns { OID => docid } mapping for $eml matches +# returns { OID => num } mapping for $eml matches +# The `num' hash value only makes sense from LeiSearch itself +# and is nonsense from the PublicInbox::LeiALE subclass +sub xoids_for { my ($self, $eml, $min) = @_; my ($chash, $mids) = content_key($eml); my @overs = ($self->over // $self->overs_all); diff --git a/lib/PublicInbox/LeiStore.pm b/lib/PublicInbox/LeiStore.pm index c66d3dc2..b390b318 100644 --- a/lib/PublicInbox/LeiStore.pm +++ b/lib/PublicInbox/LeiStore.pm @@ -161,7 +161,7 @@ sub remove_eml_vmd { } sub add_eml { - my ($self, $eml, $vmd) = @_; + my ($self, $eml, $vmd, $xoids) = @_; my $im = $self->importer; # may create new epoch my $eidx = eidx_init($self); # writes ALL.git/objects/info/alternates my $oidx = $eidx->{oidx}; # PublicInbox::Import::add checks this @@ -169,7 +169,40 @@ sub add_eml { $im->add($eml, undef, $smsg) or return; # duplicate returns undef local $self->{current_info} = $smsg->{blob}; - if (my @docids = _docids_for($self, 
$eml)) { + my $vivify_xvmd = delete($smsg->{-vivify_xvmd}) // []; # exact matches + if ($xoids) { # fuzzy matches from externals in ale->xoids_for + delete $xoids->{$smsg->{blob}}; # added later + if (scalar keys %$xoids) { + my %docids = map { $_ => 1 } @$vivify_xvmd; + for my $oid (keys %$xoids) { + my @id = $oidx->blob_exists($oid); + @docids{@id} = @id; + } + @$vivify_xvmd = sort { $a <=> $b } keys(%docids); + } + } + if (@$vivify_xvmd) { + $xoids //= {}; + $xoids->{$smsg->{blob}} = 1; + for my $docid (@$vivify_xvmd) { + my $cur = $oidx->get_art($docid); + my $idx = $eidx->idx_shard($docid); + if (!$cur || $cur->{bytes} == 0) { # really vivifying + $smsg->{num} = $docid; + $oidx->add_overview($eml, $smsg); + $smsg->{-merge_vmd} = 1; + $idx->index_eml($eml, $smsg); + } else { # lse fuzzy hit off ale + $idx->ipc_do('add_eidx_info', $docid, '.', $eml); + } + for my $oid (keys %$xoids) { + $oidx->add_xref3($docid, -1, $oid, '.'); + } + $idx->ipc_do('add_vmd', $docid, $vmd) if $vmd; + } + $vivify_xvmd; + } elsif (my @docids = _docids_for($self, $eml)) { + # fuzzy match from within lei/store for my $docid (@docids) { my $idx = $eidx->idx_shard($docid); $oidx->add_xref3($docid, -1, $smsg->{blob}, '.'); @@ -178,20 +211,21 @@ sub add_eml { $idx->ipc_do('add_vmd', $docid, $vmd) if $vmd; } \@docids; - } else { + } else { # totally new message $smsg->{num} = $oidx->adj_counter('eidx_docid', '+'); $oidx->add_overview($eml, $smsg); $oidx->add_xref3($smsg->{num}, -1, $smsg->{blob}, '.'); my $idx = $eidx->idx_shard($smsg->{num}); $idx->index_eml($eml, $smsg); - $idx->ipc_do('add_vmd', $smsg->{num}, $vmd ) if $vmd; + $idx->ipc_do('add_vmd', $smsg->{num}, $vmd) if $vmd; $smsg; } } sub set_eml { - my ($self, $eml, $vmd) = @_; - add_eml($self, $eml, $vmd) // set_eml_vmd($self, $eml, $vmd); + my ($self, $eml, $vmd, $xoids) = @_; + add_eml($self, $eml, $vmd, $xoids) // + set_eml_vmd($self, $eml, $vmd); } # set or update keywords for external message, called via ipc_do diff --git 
a/lib/PublicInbox/Over.pm b/lib/PublicInbox/Over.pm index 587e0516..0e191c47 100644 --- a/lib/PublicInbox/Over.pm +++ b/lib/PublicInbox/Over.pm @@ -353,7 +353,7 @@ sub blob_exists { my ($self, $oidhex) = @_; if (wantarray) { my $sth = $self->dbh->prepare_cached(<<'', undef, 1); -SELECT docid FROM xref3 WHERE oidbin = ? +SELECT docid FROM xref3 WHERE oidbin = ? ORDER BY docid ASC $sth->bind_param(1, pack('H*', $oidhex), SQL_BLOB); $sth->execute; diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 3237aadc..3f933121 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -11,6 +11,7 @@ use strict; use v5.10.1; use parent qw(PublicInbox::Search PublicInbox::Lock Exporter); use PublicInbox::Eml; +use PublicInbox::Search qw(xap_terms); use PublicInbox::InboxWritable; use PublicInbox::MID qw(mids_for_index mids); use PublicInbox::MsgIter; @@ -34,6 +35,7 @@ use constant DEBUG => !!$ENV{DEBUG}; my $xapianlevels = qr/\A(?:full|medium)\z/; my $hex = '[a-f0-9]'; my $OID = $hex .'{40,}'; +my @VMD_MAP = (kw => 'K', label => 'L'); our $INDEXLEVELS = qr/\A(?:full|medium|basic)\z/; sub new { @@ -428,7 +430,15 @@ sub eml2doc ($$$;$) { sub add_xapian ($$$$) { my ($self, $eml, $smsg, $mids) = @_; begin_txn_lazy($self); + my $merge_vmd = delete $smsg->{-merge_vmd}; my $doc = eml2doc($self, $eml, $smsg, $mids); + if (my $old = $merge_vmd ? 
_get_doc($self, $smsg->{num}) : undef) { + my @x = @VMD_MAP; + while (my ($field, $pfx) = splice(@x, 0, 2)) { + my $vals = xap_terms($pfx, $old); + $doc->add_boolean_term($pfx.$_) for keys %$vals; + } + } $self->{xdb}->replace_document($smsg->{num}, $doc); } @@ -531,8 +541,6 @@ sub remove_eidx_info { $self->{xdb}->replace_document($docid, $doc); } -my @VMD_MAP = (kw => 'K', label => 'L'); - sub set_vmd { my ($self, $docid, $vmd) = @_; begin_txn_lazy($self); diff --git a/t/lei-q-kw.t b/t/lei-q-kw.t index b5e22e9b..4db27363 100644 --- a/t/lei-q-kw.t +++ b/t/lei-q-kw.t @@ -161,5 +161,49 @@ like($s, qr/^Status: O\nX-Status: AF\n/ms, lei_ok(qw(q --pretty), "m:$m", @inc); like($lei_out, qr/^ "kw": \["answered", "flagged"\],\n/sm, '--pretty JSON output shows kw: on one line'); + +# ensure import on previously external-only message works +lei_ok('q', "m:$m"); +is_deeply(json_utf8->decode($lei_out), [ undef ], + 'to-be-imported message non-existent'); +lei_ok(qw(import -F eml t/x-unknown-alpine.eml)); +is($lei_err, '', 'no errors importing previous external-only message'); +lei_ok('q', "m:$m"); +$res = json_utf8->decode($lei_out); +is($res->[1], undef, 'got one result'); +is_deeply($res->[0]->{kw}, [ qw(answered flagged) ], 'kw preserved on exact'); + +# ensure fuzzy match import works, too +$m = 'multipart@example.com'; +$o = "$ENV{HOME}/fuzz"; +lei_ok('q', '-o', $o, "m:$m", @inc); +@fn = glob("$o/cur/*"); +scalar(@fn) == 1 or BAIL_OUT "wrote multiple or zero files: ".explain(\@fn); +rename($fn[0], "$fn[0]S") or BAIL_OUT "rename $!"; +lei_ok('q', '-o', $o, "m:$m"); +is_deeply([glob("$o/cur/*")], [], 'clobbered output results'); +my $eml = eml_load('t/plack-2-txt-bodies.eml'); +$eml->header_set('List-Id', ''); +my $in = $eml->as_string; +lei_ok([qw(import -F eml --stdin)], undef, { 0 => \$in, %$lei_opt }); +is($lei_err, '', 'no errors from import'); +lei_ok(qw(q -f mboxrd), "m:$m"); +open $fh, '<', \$lei_out or BAIL_OUT $!; +my @res; +PublicInbox::MboxReader->mboxrd($fh, 
sub { push @res, shift }); +is($res[0]->header('Status'), 'RO', 'seen kw set'); +$res[0]->header_set('Status'); +is_deeply(\@res, [ $eml ], 'imported message matches w/ List-Id'); + +$eml->header_set('List-Id', '<another.example.com>'); +$in = $eml->as_string; +lei_ok([qw(import -F eml --stdin)], undef, { 0 => \$in, %$lei_opt }); +is($lei_err, '', 'no errors from 2nd import'); +lei_ok(qw(q -f mboxrd), "m:$m", 'l:another.example.com'); +my @another; +open $fh, '<', \$lei_out or BAIL_OUT $!; +PublicInbox::MboxReader->mboxrd($fh, sub { push @another, shift }); +is($another[0]->header('Status'), 'RO', 'seen kw set'); + }); # test_lei done_testing;