From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id A01291FA13 for ; Thu, 21 Jan 2021 19:46:24 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 02/12] lei q: retrieve keywords for local, non-external messages Date: Thu, 21 Jan 2021 19:46:14 +0000 Message-Id: <20210121194624.32002-3-e@80x24.org> In-Reply-To: <20210121194624.32002-1-e@80x24.org> References: <20210121194624.32002-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: This isn't tested for now, so maybe it works. --- lib/PublicInbox/LeiOverview.pm | 8 +++----- lib/PublicInbox/LeiSearch.pm | 16 +++------------- lib/PublicInbox/LeiXSearch.pm | 14 ++++++++++---- lib/PublicInbox/Search.pm | 20 +++++++++++++++++++- 4 files changed, 35 insertions(+), 23 deletions(-) diff --git a/lib/PublicInbox/LeiOverview.pm b/lib/PublicInbox/LeiOverview.pm index 8799f1cc..47d9eb31 100644 --- a/lib/PublicInbox/LeiOverview.pm +++ b/lib/PublicInbox/LeiOverview.pm @@ -224,9 +224,8 @@ sub ovv_each_smsg_cb { # runs in wq worker usually my $git_dir = $git->{git_dir}; sub { my ($smsg, $mitem) = @_; - my $kw = []; # TODO get from mitem $l2m->wq_do('write_mail', \@io, $git_dir, - $smsg->{blob}, $lei_ipc, $kw) + $smsg->{blob}, $lei_ipc, $smsg->{kw}); } } elsif ($l2m) { my $wcb = $l2m->write_cb($lei); @@ -235,8 +234,8 @@ sub ovv_each_smsg_cb { # runs in wq worker usually my $g2m = $l2m->can('git_to_mail'); sub { my ($smsg, $mitem) = @_; - my $kw = []; # TODO get from mitem - $git->cat_async($smsg->{blob}, $g2m, [ $wcb, $kw ]); + $git->cat_async($smsg->{blob}, $g2m, + [ $wcb, $smsg->{kw} ]); }; } elsif ($self->{fmt} =~ /\A(concat)?json\z/ && $lei->{opt}->{pretty}) { my $EOR = ($1//'') eq 'concat' ? "\n}" : "\n},"; @@ -266,7 +265,6 @@ sub ovv_each_smsg_cb { # runs in wq worker usually $lei->{ovv_buf} = \(my $buf = ''); sub { my ($smsg, $mitem) = @_; - delete @$smsg{qw(tid num)}; $buf .= $json->encode(_unbless_smsg(@_)) . $ORS; if (length($buf) > 65536) { my $lk = $self->lock_for_scope; diff --git a/lib/PublicInbox/LeiSearch.pm b/lib/PublicInbox/LeiSearch.pm index b7e337de..440bacf5 100644 --- a/lib/PublicInbox/LeiSearch.pm +++ b/lib/PublicInbox/LeiSearch.pm @@ -5,7 +5,7 @@ package PublicInbox::LeiSearch; use strict; use v5.10.1; use parent qw(PublicInbox::ExtSearch); -use PublicInbox::Search; +use PublicInbox::Search qw(xap_terms); # get combined docid from over.num: # (not generic Xapian, only works with our sharding scheme) @@ -19,19 +19,9 @@ sub msg_keywords { my ($self, $num) = @_; # num_or_mitem my $xdb = $self->xdb; # set {nshard}; my $docid = ref($num) ? $num->get_docid : num2docid($self, $num); - my %kw; - eval { - my $end = $xdb->termlist_end($docid); - my $cur = $xdb->termlist_begin($docid); - for (; $cur != $end; $cur++) { - $cur->skip_to('K'); - last if $cur == $end; - my $kw = $cur->get_termname; - $kw =~ s/\AK//s and $kw{$kw} = undef; - } - }; + my $kw = xap_terms('K', $xdb, $docid); warn "E: #$docid ($num): $@\n" if $@; - wantarray ? sort(keys(%kw)) : \%kw; + wantarray ? sort(keys(%$kw)) : $kw; } 1; diff --git a/lib/PublicInbox/LeiXSearch.pm b/lib/PublicInbox/LeiXSearch.pm index a6d827de..d7688ede 100644 --- a/lib/PublicInbox/LeiXSearch.pm +++ b/lib/PublicInbox/LeiXSearch.pm @@ -13,6 +13,7 @@ use PublicInbox::OpPipe; use PublicInbox::Import; use File::Temp 0.19 (); # 0.19 for ->newdir use File::Spec (); +use PublicInbox::Search qw(xap_terms); sub new { my ($class) = @_; @@ -74,7 +75,12 @@ sub smsg_for { my $docid = $mitem->get_docid; my $shard = ($docid - 1) % $nshard; my $num = int(($docid - 1) / $nshard) + 1; - my $smsg = $self->{shard2ibx}->[$shard]->over->get_art($num); + my $ibx = $self->{shard2ibx}->[$shard]; + my $smsg = $ibx->over->get_art($num); + if (ref($ibx->can('msg_keywords'))) { + my $kw = xap_terms('K', $mitem->get_document); + $smsg->{kw} = [ sort keys %$kw ]; + } $smsg->{docid} = $docid; $smsg; } @@ -153,11 +159,11 @@ sub query_mset { # non-parallel for non-"--thread" users $dedupe->prepare_dedupe; do { $mset = $self->mset($mo->{qstr}, $mo); - for my $it ($mset->items) { - my $smsg = smsg_for($self, $it) or next; + for my $mitem ($mset->items) { + my $smsg = smsg_for($self, $mitem) or next; wait_startq($startq) if $startq; next if $dedupe->is_smsg_dup($smsg); - $each_smsg->($smsg, $it); + $each_smsg->($smsg, $mitem); } } while (_mset_more($mset, $mo)); undef $each_smsg; # drops @io for l2m->{each_smsg_done} diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index a4b40f94..7c6a16be 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -6,7 +6,7 @@ package PublicInbox::Search; use strict; use parent qw(Exporter); -our @EXPORT_OK = qw(retry_reopen int_val get_pct); +our @EXPORT_OK = qw(retry_reopen int_val get_pct xap_terms); use List::Util qw(max); # values for searching, changing the numeric value breaks @@ -432,4 +432,22 @@ sub get_pct ($) { # mset item $n > 99 ? 99 : $n; } +sub xap_terms ($$;@) { + my ($pfx, $xdb_or_doc, @docid) = @_; # @docid may be empty () + my %ret; + eval { + my $end = $xdb_or_doc->termlist_end(@docid); + my $cur = $xdb_or_doc->termlist_begin(@docid); + for (; $cur != $end; $cur++) { + $cur->skip_to($pfx); + last if $cur == $end; + my $tn = $cur->get_termname; + if (index($tn, $pfx) == 0) { + $ret{substr($tn, length($pfx))} = undef; + } + } + }; + \%ret; +} + 1;