From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.2 required=3.0 tests=ALL_TRUSTED,BAYES_00, DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,DKIM_VALID_EF shortcircuit=no autolearn=ham autolearn_force=no version=3.4.6 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 189D41F62B for ; Wed, 24 Apr 2024 06:44:48 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=80x24.org; s=selector1; t=1713941088; bh=6T+NCfqbblLnF0ba3xGh6qj3I2v2HsyKmpM4rvTAbE0=; h=From:To:Subject:Date:In-Reply-To:References:From; b=WvJStf0f5F9w8IzjwuM8Z04Ee+ubrgQXyuQ0+RFS5WmDp3EUeNcxR/K9SK1xvIjK6 art/TDkd8Z9dqZdz+hjjWLmbc73afhS0+n7aucqviZYAjp9TDQEC43XAIKqQvg//DI 9DKsyniF6Zx0nGMxYuPJ+0CBBb3bUL3yJOaMCn5o= From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 5/5] www: wire up search to use async xap_helper Date: Wed, 24 Apr 2024 06:44:47 +0000 Message-ID: <20240424064447.523799-6-e@80x24.org> In-Reply-To: <20240424064447.523799-1-e@80x24.org> References: <20240424064447.523799-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: The C++ version of xap_helper will allow more complex and expensive queries. Both the Perl and C++-only versions will allow offloading search into a separate process which can be killed via ITIMER_REAL or RLIMIT_CPU in the face of overload. The xap_helper `mset' command wrapper is simplified to unconditionally return rank, percentage, and estimated matches information. This may slightly penalize mbox retrievals and lei users, but perhaps that can be a different command entirely. 
--- MANIFEST | 2 + lib/PublicInbox/Isearch.pm | 50 ++++++++----- lib/PublicInbox/Mbox.pm | 115 +++++++++++++++++------------ lib/PublicInbox/MboxGz.pm | 4 +- lib/PublicInbox/Search.pm | 52 ++++++++++++- lib/PublicInbox/SearchView.pm | 56 ++++++++------ lib/PublicInbox/XapClient.pm | 1 + lib/PublicInbox/XapHelper.pm | 11 +-- lib/PublicInbox/XhcMset.pm | 51 +++++++++++++ lib/PublicInbox/XhcMsetIterator.pm | 20 +++++ lib/PublicInbox/xap_helper.h | 9 ++- lib/PublicInbox/xh_mset.h | 33 ++------- t/cindex.t | 8 +- t/xap_helper.t | 14 ++-- 14 files changed, 287 insertions(+), 139 deletions(-) create mode 100644 lib/PublicInbox/XhcMset.pm create mode 100644 lib/PublicInbox/XhcMsetIterator.pm diff --git a/MANIFEST b/MANIFEST index 4c974338..fb175e5f 100644 --- a/MANIFEST +++ b/MANIFEST @@ -382,6 +382,8 @@ lib/PublicInbox/XapClient.pm lib/PublicInbox/XapHelper.pm lib/PublicInbox/XapHelperCxx.pm lib/PublicInbox/Xapcmd.pm +lib/PublicInbox/XhcMset.pm +lib/PublicInbox/XhcMsetIterator.pm lib/PublicInbox/gcf2_libgit2.h lib/PublicInbox/xap_helper.h lib/PublicInbox/xh_cidx.h diff --git a/lib/PublicInbox/Isearch.pm b/lib/PublicInbox/Isearch.pm index 62112171..20808d6d 100644 --- a/lib/PublicInbox/Isearch.pm +++ b/lib/PublicInbox/Isearch.pm @@ -26,34 +26,44 @@ SELECT ibx_id FROM inboxes WHERE eidx_key = ? LIMIT 1 sub query_approxidate { $_[0]->{es}->query_approxidate($_[1], $_[2]) } -sub mset { - my ($self, $str, $opt) = @_; +sub eidx_mset_prep ($$) { + my ($self, $opt) = @_; my %opt = $opt ? 
%$opt : (); $opt{eidx_key} = $self->{eidx_key}; - if (my $uid_range = $opt{uid_range}) { - my ($beg, $end) = @$uid_range; - my $ibx_id = $self->{-ibx_id} //= _ibx_id($self); - my $dbh = $self->{es}->over->dbh; - my $sth = $dbh->prepare_cached(<<'', undef, 1); + my $uid_range = $opt{uid_range} or return \%opt; + my ($beg, $end) = @$uid_range; + my $ibx_id = $self->{-ibx_id} //= _ibx_id($self); + my $dbh = $self->{es}->over->dbh; + my $sth = $dbh->prepare_cached(<<'', undef, 1); SELECT MIN(docid) FROM xref3 WHERE ibx_id = ? AND xnum >= ? AND xnum <= ? - $sth->execute($ibx_id, $beg, $end); - my @r = ($sth->fetchrow_array); + $sth->execute($ibx_id, $beg, $end); + my @r = ($sth->fetchrow_array); - $sth = $dbh->prepare_cached(<<'', undef, 1); + $sth = $dbh->prepare_cached(<<'', undef, 1); SELECT MAX(docid) FROM xref3 WHERE ibx_id = ? AND xnum >= ? AND xnum <= ? - $sth->execute($ibx_id, $beg, $end); - $r[1] = $sth->fetchrow_array; - if (defined($r[1]) && defined($r[0])) { - $opt{limit} = $r[1] - $r[0] + 1; - } else { - $r[1] //= $self->{es}->xdb->get_lastdocid; - $r[0] //= 0; - } - $opt{uid_range} = \@r; # these are fed to Xapian and SQLite + $sth->execute($ibx_id, $beg, $end); + $r[1] = $sth->fetchrow_array; + if (defined($r[1]) && defined($r[0])) { + $opt{limit} = $r[1] - $r[0] + 1; + } else { + $r[1] //= $self->{es}->xdb->get_lastdocid; + $r[0] //= 0; } - $self->{es}->mset($str, \%opt); + $opt{uid_range} = \@r; # these are fed to Xapian and SQLite + \%opt; +} + +sub mset { + my ($self, $str, $opt) = @_; + $self->{es}->mset($str, eidx_mset_prep $self, $opt); +} + +sub async_mset { + my ($self, $str, $opt, $cb, @args) = @_; + $opt = eidx_mset_prep $self, $opt; + $self->{es}->async_mset($str, $opt, $cb, @args); } sub mset_to_artnums { diff --git a/lib/PublicInbox/Mbox.pm b/lib/PublicInbox/Mbox.pm index ac565df9..82fba5c6 100644 --- a/lib/PublicInbox/Mbox.pm +++ b/lib/PublicInbox/Mbox.pm @@ -31,8 +31,8 @@ sub async_next { my ($http) = @_; # PublicInbox::HTTP my $ctx = 
$http->{forward} or return; # client aborted eval { - my $smsg = $ctx->{smsg} or return $ctx->close; - $ctx->smsg_blob($smsg); + my $smsg = $ctx->{smsg} // return $ctx->close; + $ctx->smsg_blob($smsg) if $smsg; }; warn "E: $@" if $@; } @@ -159,6 +159,7 @@ sub all_ids_cb { } $ctx->{ids} = $ids = $over->ids_after(\($ctx->{prev})); } while (@$ids); + undef; } sub mbox_all_ids { @@ -175,52 +176,79 @@ sub mbox_all_ids { PublicInbox::MboxGz::mbox_gz($ctx, \&all_ids_cb, 'all'); } -sub refill_result_ids ($) { - my ($ctx) = @_; +my $refill_ids_cb = sub { # async_mset cb + my ($ctx, $http, $mset, $err) = @_; + $http = undef unless $ctx->{-really_async}; + if ($err) { + warn "E: $err"; + $ctx->close if $http; # our async httpd + return; + } # refill result set, deprioritize since there's many results - my $srch = $ctx->{ibx}->isrch or return $ctx->gone('search'); - my $mset = $srch->mset($ctx->{query}, $ctx->{qopts}); - my $size = $mset->size or return; + my $size = $mset->size or do { + $ctx->close if $http; + $ctx->{-mbox_done} = 1; + return; + }; $ctx->{qopts}->{offset} += $size; - $ctx->{ids} = $srch->mset_to_artnums($mset, $ctx->{qopts}); + $ctx->{ids} = $ctx->{srch}->mset_to_artnums($mset, $ctx->{qopts}); $ctx->{-low_prio} = 1; # true -} + return if !$http; + eval { + my $smsg = results_cb($ctx) // return $ctx->close; + return if !$smsg; # '' wait for async_mset + $ctx->smsg_blob($ctx->{smsg} = $smsg); + }; + warn "E: $@" if $@; +}; -sub results_cb { - my ($ctx) = @_; +sub results_cb { # async_next or MboxGz->getline cb + my ($ctx, $http) = @_; my $over = $ctx->{ibx}->over or return $ctx->gone('over'); while (1) { - while (defined(my $num = shift(@{$ctx->{ids}}))) { + my $ids = $ctx->{xids} // $ctx->{ids}; + while (defined(my $num = shift(@$ids))) { my $smsg = $over->get_art($num) or next; return $smsg; } - refill_result_ids($ctx) or return; # refill ctx->{ids} + next if $ctx->{xids} && $over->expand_thread($ctx); + return '' if $ctx->{srch}->async_mset(@$ctx{qw(query 
qopts)}, + $refill_ids_cb, $ctx, $http); + return if $ctx->{-mbox_done}; } } -sub results_thread_cb { - my ($ctx) = @_; - - my $over = $ctx->{ibx}->over or return $ctx->gone('over'); - while (1) { - while (defined(my $num = shift(@{$ctx->{xids}}))) { - my $smsg = $over->get_art($num) or next; - return $smsg; - } - next if $over->expand_thread($ctx); # refills ctx->{xids} - - refill_result_ids($ctx) or return; # refill ctx->{ids} +sub mbox_qry_cb { # async_mset cb + my ($ctx, $q, $mset, $err) = @_; + my $wcb = delete $ctx->{wcb}; + if ($err) { + warn "E: $err"; + return $wcb->([500, [qw(Content-Type text/plain)], + [ "Internal server error\n" ]]) } + $ctx->{qopts}->{offset} = $mset->size or + return $wcb->([404, [qw(Content-Type text/plain)], + ["No results found\n"]]); + $ctx->{ids} = $ctx->{srch}->mset_to_artnums($mset, $ctx->{qopts}); + my $fn; + if ($q->{t} && $ctx->{srch}->has_threadid) { + $ctx->{xids} = []; # triggers over->expand_thread + $fn = "results-thread-$ctx->{query}"; + } else { + $fn = "results-$ctx->{query}"; + } + require PublicInbox::MboxGz; + my $res = PublicInbox::MboxGz::mbox_gz($ctx, \&results_cb, $fn); + ref($res) eq 'CODE' ? 
$res->($wcb) : $wcb->($res); } sub mbox_all { my ($ctx, $q) = @_; - my $q_string = $q->{'q'}; - return mbox_all_ids($ctx) if $q_string !~ /\S/; - my $srch = $ctx->{ibx}->isrch or + my $qstr = $q->{'q'}; + return mbox_all_ids($ctx) if $qstr !~ /\S/; + my $srch = $ctx->{srch} = $ctx->{ibx}->isrch or return PublicInbox::WWW::need($ctx, 'Search'); - - my $qopts = $ctx->{qopts} = { relevance => -2 }; # ORDER BY docid DESC + my $opt = $ctx->{qopts} = { relevance => -2 }; # ORDER BY docid DESC # {threadid} limits results to a given thread # {threads} collapses results from messages in the same thread, @@ -230,25 +258,16 @@ sub mbox_all { $ctx->{ibx}->{isrch}->{es}->over : $ctx->{ibx}->over) or return PublicInbox::WWW::need($ctx, 'Overview'); - $qopts->{threadid} = $over->mid2tid($ctx->{mid}); - } - $qopts->{threads} = 1 if $q->{t}; - $srch->query_approxidate($ctx->{ibx}->git, $q_string); - my $mset = $srch->mset($q_string, $qopts); - $qopts->{offset} = $mset->size or - return [404, [qw(Content-Type text/plain)], - ["No results found\n"]]; - $ctx->{query} = $q_string; - $ctx->{ids} = $srch->mset_to_artnums($mset, $qopts); - require PublicInbox::MboxGz; - my $fn; - if ($q->{t} && $srch->has_threadid) { - $fn = 'results-thread-'.$q_string; - PublicInbox::MboxGz::mbox_gz($ctx, \&results_thread_cb, $fn); - } else { - $fn = 'results-'.$q_string; - PublicInbox::MboxGz::mbox_gz($ctx, \&results_cb, $fn); + $opt->{threadid} = $over->mid2tid($ctx->{mid}); } + $opt->{threads} = 1 if $q->{t}; + $srch->query_approxidate($ctx->{ibx}->git, $qstr); + $ctx->{query} = $qstr; + sub { # called by PSGI server + $ctx->{wcb} = $_[0]; # PSGI server supplied write cb + $srch->async_mset($qstr, $opt, \&mbox_qry_cb, $ctx, $q) and + $ctx->{-really_async} = 1; + }; } 1; diff --git a/lib/PublicInbox/MboxGz.pm b/lib/PublicInbox/MboxGz.pm index 533d2ff1..864d701e 100644 --- a/lib/PublicInbox/MboxGz.pm +++ b/lib/PublicInbox/MboxGz.pm @@ -13,8 +13,8 @@ sub async_next ($) { my ($http) = @_; # 
PublicInbox::HTTP my $ctx = $http->{forward} or return; eval { - $ctx->{smsg} = $ctx->{cb}->($ctx) or return $ctx->close; - $ctx->smsg_blob($ctx->{smsg}); + my $smsg = $ctx->{cb}->($ctx, $http) // return $ctx->close; + $smsg and $ctx->smsg_blob($ctx->{smsg} = $smsg); }; warn "E: $@" if $@; } diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index 0196dd45..60d12dbf 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -11,6 +11,7 @@ our @EXPORT_OK = qw(retry_reopen int_val get_pct xap_terms); use List::Util qw(max); use POSIX qw(strftime); use Carp (); +our $XHC; # values for searching, changing the numeric value breaks # compatibility with old indices (so don't change them it) @@ -85,7 +86,6 @@ our @XH_SPEC = ( 'k=i', # sort column (like sort(1)) 'm=i', # maximum number of results 'o=i', # offset - 'p', # show percent 'r', # 1=relevance then column 't', # collapse threads 'A=s@', # prefixes @@ -428,6 +428,56 @@ sub mset { do_enquire($self, $qry, $opt, TS); } +sub xhc_start_maybe () { + require PublicInbox::XapClient; + my $xhc = PublicInbox::XapClient::start_helper(); + require PublicInbox::XhcMset if $xhc; + $xhc; +} + +sub xh_opt ($) { + my ($opt) = @_; + my $lim = $opt->{limit} || 50; + my @ret; + push @ret, '-o', $opt->{offset} if $opt->{offset}; + push @ret, '-m', $lim; + my $rel = $opt->{relevance} // 0; + if ($rel == -2) { # ORDER BY docid/UID (highest first) + push @ret, '-k', '-1'; + } elsif ($rel == -1) { # ORDER BY docid/UID (lowest first) + push @ret, '-k', '-1'; + push @ret, '-a'; + } elsif ($rel == 0) { + push @ret, '-k', $opt->{sort_col} // TS; + push @ret, '-a' if $opt->{asc}; + } else { # rel > 0 + push @ret, '-r'; + push @ret, '-k', $opt->{sort_col} // TS; + push @ret, '-a' if $opt->{asc}; + } + push @ret, '-t' if $opt->{threads}; + push @ret, '-T', $opt->{threadid} if defined $opt->{threadid}; + push @ret, '-O', $opt->{eidx_key} if defined $opt->{eidx_key}; + @ret; +} + +# returns a true value if actually 
handled asynchronously, +# and a falsy value if handled synchronously +sub async_mset { + my ($self, $qry_str, $opt, $cb, @args) = @_; + $XHC //= xhc_start_maybe; + if ($XHC) { # unconditionally retrieving pct + rank for now + xdb($self); # populate {nshards} + my @margs = ($self->xh_args, xh_opt($opt)); + my $rd = $XHC->mkreq(undef, 'mset', @margs, $qry_str); + PublicInbox::XhcMset->maybe_new($rd, $self, $cb, @args); + } else { # synchronous + my $mset = $self->mset($qry_str, $opt); + $cb->(@args, $mset); + undef; + } +} + sub do_enquire { # shared with CodeSearch my ($self, $qry, $opt, $col) = @_; my $enq = $X{Enquire}->new(xdb($self)); diff --git a/lib/PublicInbox/SearchView.pm b/lib/PublicInbox/SearchView.pm index 4016ddeb..9919e25c 100644 --- a/lib/PublicInbox/SearchView.pm +++ b/lib/PublicInbox/SearchView.pm @@ -30,10 +30,9 @@ sub mbox_results { sub sres_top_html { my ($ctx) = @_; - my $srch = $ctx->{ibx}->isrch or + my $srch = $ctx->{srch} = $ctx->{ibx}->isrch or return PublicInbox::WWW::need($ctx, 'Search'); my $q = PublicInbox::SearchQuery->new($ctx->{qp}); - my $x = $q->{x}; my $o = $q->{o} // 0; my $asc; if ($o < 0) { @@ -41,48 +40,57 @@ sub sres_top_html { $o = -($o + 1); # so [-1] is the last element, like Perl lists } - my $code = 200; # double the limit for expanded views: - my $opts = { + my $opt = { limit => $q->{l}, offset => $o, relevance => $q->{r}, threads => $q->{t}, asc => $asc, }; - my ($mset, $total, $err, $html); -retry: - eval { - my $query = $q->{'q'}; - $srch->query_approxidate($ctx->{ibx}->git, $query); - $mset = $srch->mset($query, $opts); - $total = $mset->get_matches_estimated; - }; - $err = $@; + my $qs = $q->{'q'}; + $srch->query_approxidate($ctx->{ibx}->git, $qs); + sub { + $ctx->{wcb} = $_[0]; # PSGI server supplied write cb + $srch->async_mset($qs, $opt, \&sres_html_cb, $ctx, $opt, $q); + } +} + +sub sres_html_cb { # async_mset cb + my ($ctx, $opt, $q, $mset, $err) = @_; + my $code = 200; + my $total = $mset ? 
$mset->get_matches_estimated : undef; ctx_prepare($q, $ctx); + my ($res, $html); if ($err) { $code = 400; $html = '
'.err_txt($ctx, $err).'

'; } elsif ($total == 0) { - if (defined($ctx->{-uxs_retried})) { - # undo retry damage: + if (defined($ctx->{-uxs_retried})) { # undo retry damage: $q->{'q'} = $ctx->{-uxs_retried}; - } elsif (index($q->{'q'}, '%') >= 0) { + } elsif (index($q->{'q'}, '%') >= 0) { # retry unescaped $ctx->{-uxs_retried} = $q->{'q'}; - $q->{'q'} = uri_unescape($q->{'q'}); - goto retry; + my $qs = $q->{'q'} = uri_unescape($q->{'q'}); + $ctx->{srch}->query_approxidate($ctx->{ibx}->git, $qs); + return $ctx->{srch}->async_mset($qs, $opt, + \&sres_html_cb, $ctx, $opt, $q); } $code = 404; $html = "
\n[No results found]

"; + } elsif ($q->{x} eq 'A') { + $res = adump($mset, $q, $ctx); } else { - return adump($mset, $q, $ctx) if $x eq 'A'; - $ctx->{-html_tip} = search_nav_top($mset, $q, $ctx); - return mset_thread($ctx, $mset, $q) if $x eq 't'; - mset_summary($ctx, $mset, $q); # appends to {-html_tip} - $html = ''; + if ($q->{x} eq 't') { + $res = mset_thread($ctx, $mset, $q); + } else { + mset_summary($ctx, $mset, $q); # appends to {-html_tip} + $html = ''; + } } - html_oneshot($ctx, $code, $html); + $res //= html_oneshot($ctx, $code, $html); + my $wcb = delete $ctx->{wcb}; + ref($res) eq 'CODE' ? $res->($wcb) : $wcb->($res); } # display non-nested search results similar to what users expect from diff --git a/lib/PublicInbox/XapClient.pm b/lib/PublicInbox/XapClient.pm index 98034130..f0270091 100644 --- a/lib/PublicInbox/XapClient.pm +++ b/lib/PublicInbox/XapClient.pm @@ -26,6 +26,7 @@ sub mkreq { } sub start_helper { + $PublicInbox::IPC::send_cmd or return; # can't work w/o SCM_RIGHTS my @argv = @_; socketpair(my $sock, my $in, AF_UNIX, SOCK_SEQPACKET, 0); my $cls = 'PublicInbox::XapHelperCxx'; diff --git a/lib/PublicInbox/XapHelper.pm b/lib/PublicInbox/XapHelper.pm index 27d98ba1..c55a72ce 100644 --- a/lib/PublicInbox/XapHelper.pm +++ b/lib/PublicInbox/XapHelper.pm @@ -147,12 +147,8 @@ sub cmd_dump_roots { sub mset_iter ($$) { my ($req, $it) = @_; - eval { - my $buf = $it->get_docid; - $buf .= "\0".$it->get_percent if $req->{p}; - say { $req->{0} } $buf; - }; - $@ ? 
iter_retry_check($req) : 0; + say { $req->{0} } $it->get_docid, "\0", + $it->get_percent, "\0", $it->get_rank; } sub cmd_mset { # to be used by WWW + IMAP @@ -165,7 +161,8 @@ sub cmd_mset { # to be used by WWW + IMAP $opt->{eidx_key} = $req->{O} if defined $req->{O}; $opt->{threadid} = $req->{T} if defined $req->{T}; my $mset = $req->{srch}->mset($qry_str, $opt); - say { $req->{0} } 'mset.size=', $mset->size; + say { $req->{0} } 'mset.size=', $mset->size, + ' .get_matches_estimated=', $mset->get_matches_estimated; for my $it ($mset->items) { for (my $t = 10; $t > 0; --$t) { $t = mset_iter($req, $it) // $t; diff --git a/lib/PublicInbox/XhcMset.pm b/lib/PublicInbox/XhcMset.pm new file mode 100644 index 00000000..ac25eece --- /dev/null +++ b/lib/PublicInbox/XhcMset.pm @@ -0,0 +1,51 @@ +# Copyright (C) all contributors +# License: AGPL-3.0+ + +# mocks Xapian::Mset and keeps slow queries from blocking the event loop +package PublicInbox::XhcMset; +use v5.12; +use parent qw(PublicInbox::DS); +use PublicInbox::XhcMsetIterator; +use PublicInbox::Syscall qw(EPOLLIN EPOLLONESHOT); + +sub event_step { + my ($self) = @_; + my ($cb, @args) = @{delete $self->{cb_args} // return}; + my $rd = $self->{sock}; + eval { + my $hdr = <$rd> // die "E: reading mset header: $!"; + for (split /\s+/, $hdr) { # read mset.size + estimated_matches + my ($k, $v) = split /=/, $_, 2; + $k =~ s/\A[^\.]*\.//; # s/(mset)?\./ + $self->{$k} = $v; + } + my $size = $self->{size} // die "E: bad xhc header: `$hdr'"; + my @it = map { PublicInbox::XhcMsetIterator::make($_) } <$rd>; + $self->{items} = \@it; + scalar(@it) == $size or die + 'E: got ',scalar(@it),", expected mset.size=$size"; + }; + my $err = $@; + $self->close; + eval { $cb->(@args, $self, $err) }; + warn "E: $@\n" if $@; +} + +sub maybe_new { + my (undef, $rd, $srch, @cb_args) = @_; + my $self = bless { cb_args => \@cb_args, srch => $srch }, __PACKAGE__; + if ($PublicInbox::DS::in_loop) { # async + $self->SUPER::new($rd, 
EPOLLIN|EPOLLONESHOT); + } else { # synchronous + $self->{sock} = $rd; + event_step($self); + undef; + } +} + +eval(join('', map { "sub $_ { \$_[0]->{$_} }\n" } qw(size + get_matches_estimated))); + +sub items { @{$_[0]->{items}} } + +1; diff --git a/lib/PublicInbox/XhcMsetIterator.pm b/lib/PublicInbox/XhcMsetIterator.pm new file mode 100644 index 00000000..dcfc61e4 --- /dev/null +++ b/lib/PublicInbox/XhcMsetIterator.pm @@ -0,0 +1,20 @@ +# Copyright (C) all contributors +# License: AGPL-3.0+ + +# mocks Xapian::MsetIterator, there's many of these allocated at once +package PublicInbox::XhcMsetIterator; +use v5.12; + +sub make ($) { + chomp($_[0]); + my @self = map { $_ + 0 } split /\0/, $_[0]; # docid, pct, rank + # we don't store $xdb in self[4] since we avoid $it->get_document + # in favor of $xdb->get_document($it->get_docid) + bless \@self, __PACKAGE__; +} + +sub get_docid { $_[0]->[0] } +sub get_percent { $_[0]->[1] } +sub get_rank { $_[0]->[2] } + +1; diff --git a/lib/PublicInbox/xap_helper.h b/lib/PublicInbox/xap_helper.h index 872f063d..5a89544a 100644 --- a/lib/PublicInbox/xap_helper.h +++ b/lib/PublicInbox/xap_helper.h @@ -141,7 +141,6 @@ struct req { // argv and pfxv point into global rbuf bool collapse_threads; bool code_search; bool relevance; // sort by relevance before column - bool emit_percent; bool asc; // ascending sort }; @@ -225,6 +224,13 @@ static Xapian::MSet mail_mset(struct req *req, const char *qry_str) qry = Xapian::Query(Xapian::Query::OP_FILTER, qry, Xapian::Query(req->Oeidx_key)); } + // TODO: uid_range + if (req->threadid != ULLONG_MAX) { + std::string tid = Xapian::sortable_serialise(req->threadid); + qry = Xapian::Query(Xapian::Query::OP_FILTER, qry, + Xapian::Query(Xapian::Query::OP_VALUE_RANGE, THREADID, + tid, tid)); + } Xapian::Enquire enq = prep_enquire(req); enq.set_query(qry); // THREADID is a CPP macro defined on CLI (see) XapHelperCxx.pm @@ -632,7 +638,6 @@ static void dispatch(struct req *req) if (*end || req->off == 
ULLONG_MAX) ABORT("-o %s", optarg); break; - case 'p': req->emit_percent = true; break; case 'r': req->relevance = true; break; case 't': req->collapse_threads = true; break; case 'A': diff --git a/lib/PublicInbox/xh_mset.h b/lib/PublicInbox/xh_mset.h index 3727a932..db2692c9 100644 --- a/lib/PublicInbox/xh_mset.h +++ b/lib/PublicInbox/xh_mset.h @@ -3,25 +3,6 @@ // This file is only intended to be included by xap_helper.h // it implements pieces used by WWW, IMAP and lei -static enum exc_iter mset_iter(const struct req *req, FILE *fp, off_t off, - Xapian::MSetIterator *i) -{ - try { - fprintf(fp, "%llu", (unsigned long long)(*(*i))); // get_docid - if (req->emit_percent) - fprintf(fp, "%c%d", 0, i->get_percent()); - fputc('\n', fp); - } catch (const Xapian::DatabaseModifiedError & e) { - req->srch->db->reopen(); - if (fseeko(fp, off, SEEK_SET) < 0) EABORT("fseeko"); - return ITER_RETRY; - } catch (const Xapian::DocNotFoundError & e) { // oh well... - warnx("doc not found: %s", e.get_description().c_str()); - if (fseeko(fp, off, SEEK_SET) < 0) EABORT("fseeko"); - } - return ITER_OK; -} - #ifndef WBUF_FLUSH_THRESHOLD # define WBUF_FLUSH_THRESHOLD (BUFSIZ - 1000) #endif @@ -39,7 +20,9 @@ static bool cmd_mset(struct req *req) Xapian::MSet mset = req->code_search ? 
commit_mset(req, qry_str) : mail_mset(req, qry_str); fbuf_init(&wbuf); - fprintf(wbuf.fp, "mset.size=%llu\n", (unsigned long long)mset.size()); + fprintf(wbuf.fp, "mset.size=%llu .get_matches_estimated=%llu\n", + (unsigned long long)mset.size(), + (unsigned long long)mset.get_matches_estimated()); int fd = fileno(req->fp[0]); for (Xapian::MSetIterator i = mset.begin(); i != mset.end(); i++) { off_t off = ftello(wbuf.fp); @@ -58,12 +41,10 @@ static bool cmd_mset(struct req *req) if (fseeko(wbuf.fp, 0, SEEK_SET)) EABORT("fseeko"); off = 0; } - for (int t = 10; t > 0; --t) - switch (mset_iter(req, wbuf.fp, off, &i)) { - case ITER_OK: t = 0; break; // leave inner loop - case ITER_RETRY: break; // continue for-loop - case ITER_ABORT: return false; // error - } + fprintf(wbuf.fp, "%llu" "%c" "%d" "%c" "%llu\n", + (unsigned long long)(*i), // get_docid + 0, i.get_percent(), + 0, (unsigned long long)i.get_rank()); } off_t off = ftello(wbuf.fp); if (off < 0) EABORT("ftello"); diff --git a/t/cindex.t b/t/cindex.t index acd74a5d..90236287 100644 --- a/t/cindex.t +++ b/t/cindex.t @@ -152,10 +152,12 @@ my $test_xhc = sub { my ($r, @l); $r = $xhc->mkreq([], qw(mset -c -g), $zp_git, @xh_args, 'NUL'); chomp(@l = <$r>); - is(shift(@l), 'mset.size=2', "got expected header $impl"); + like shift(@l), qr/\bmset\.size=2\b/, "got expected header $impl"; my %docid2data; my @got = sort map { - my ($docid, @extra) = split /\0/; + my ($docid, $pct, $rank, @extra) = split /\0/; + ok $pct >= 0 && $pct <= 100, 'pct in range'; + ok $rank >= 0 && $rank <= 100000, 'rank ok'; is scalar(@extra), 0, 'no extra fields'; $docid2data{$docid} = $csrch->xdb->get_document($docid)->get_data; @@ -164,7 +166,7 @@ my $test_xhc = sub { $r = $xhc->mkreq([], qw(mset -c -g), "$tmp/wt0/.git", @xh_args, 'NUL'); chomp(@l = <$r>); - is(shift(@l), 'mset.size=0', "got miss in wrong dir $impl"); + like shift(@l), qr/\bmset.size=0\b/, "got miss in wrong dir $impl"; is_deeply(\@l, [], "no extra lines $impl"); while (my 
($did, $expect) = each %docid2data) { diff --git a/t/xap_helper.t b/t/xap_helper.t index 70c634ac..effe8bc5 100644 --- a/t/xap_helper.t +++ b/t/xap_helper.t @@ -204,10 +204,11 @@ for my $n (@NO_CXX) { $err = do { local $/; <$err_r> }; is $err, "mset.size=6 nr_out=5\n", "got expected status ($xhc->{impl})"; - $r = $xhc->mkreq([], qw(mset -p), @ibx_shard_args, + $r = $xhc->mkreq([], qw(mset), @ibx_shard_args, 'dfn:lib/PublicInbox/Search.pm'); chomp((my $hdr, @res) = readline($r)); - is $hdr, 'mset.size=1', "got expected header via mset ($xhc->{impl}"; + like $hdr, qr/\bmset\.size=1\b/, + "got expected header via mset ($xhc->{impl}"; is scalar(@res), 1, 'got one result'; @res = split /\0/, $res[0]; { @@ -217,19 +218,20 @@ for my $n (@NO_CXX) { is_deeply \@q, [ $mid ], 'docid usable'; } ok $res[1] > 0 && $res[1] <= 100, 'pct > 0 && <= 100'; - is scalar(@res), 2, 'only 2 columns in result'; + is scalar(@res), 3, 'only 3 columns in result'; - $r = $xhc->mkreq([], qw(mset -p), @ibx_shard_args, + $r = $xhc->mkreq([], qw(mset), @ibx_shard_args, 'dt:19700101'.'000000..'); chomp(($hdr, @res) = readline($r)); - is $hdr, 'mset.size=6', + like $hdr, qr/\bmset\.size=6\b/, "got expected header via multi-result mset ($xhc->{impl}"; is(scalar(@res), 6, 'got 6 rows'); for my $r (@res) { - my ($docid, $pct, @rest) = split /\0/, $r; + my ($docid, $pct, $rank, @rest) = split /\0/, $r; my $doc = $v2->search->xdb->get_document($docid); ok $pct > 0 && $pct <= 100, "pct > 0 && <= 100 #$docid ($xhc->{impl})"; + like $rank, qr/\A\d+\z/, 'rank is a digit'; is scalar(@rest), 0, 'no extra rows returned'; } my $nr;