From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.2 required=3.0 tests=ALL_TRUSTED,BAYES_00, DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,DKIM_VALID_EF shortcircuit=no autolearn=ham autolearn_force=no version=3.4.6 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id BCA981F51B for ; Wed, 24 Apr 2024 06:44:47 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=80x24.org; s=selector1; t=1713941087; bh=82MBYDSM5z5q6uTsxmBdkvN1695sfb8Y1f+NqOz7CqY=; h=From:To:Subject:Date:In-Reply-To:References:From; b=eAovj5+OP58uCZ7Wi3y7ZKHvZMsMFM0j4K33TkC07YuaLQefysFTgYJGoiSKIuSiR 27rrHxT/xj5iDiZcIdtJocmEO6GcKN1XPLN6PLxGHlw/dT1ytYcpCsaf9/AhyAYuYf kkEv11I6Y3U/Ru57ohv7pGBI1c4QEWhNpw+HMfQ8= From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 3/5] xap_helper: drop terms+data from `mset' command Date: Wed, 24 Apr 2024 06:44:45 +0000 Message-ID: <20240424064447.523799-4-e@80x24.org> In-Reply-To: <20240424064447.523799-1-e@80x24.org> References: <20240424064447.523799-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: Retrieving Xapian document terms, data (and possibly values) and transferring to the Perl side would be an increase in complexity and I/O on both the Perl and C++ sides. It would require more I/O in C++ and transient memory use on the Perl side where slow mset iteration gives an opportunity to dictate memory release rate. So let's ignore the document-related stuff here for now for ease-of-development. We can reconsider this change if dropping Xapian Perl bindings entirely and relying on JAOT C++ ever becomes a possibility. 
--- lib/PublicInbox/Search.pm | 1 - lib/PublicInbox/XapHelper.pm | 5 ----- lib/PublicInbox/xap_helper.h | 2 -- lib/PublicInbox/xh_mset.h | 24 ------------------------ t/cindex.t | 17 +++++++++-------- t/xap_helper.t | 21 +++++---------------- 6 files changed, 14 insertions(+), 56 deletions(-) diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index 678c8c5d..0196dd45 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -89,7 +89,6 @@ our @XH_SPEC = ( 'r', # 1=relevance then column 't', # collapse threads 'A=s@', # prefixes - 'D', # emit docdata 'K=i', # timeout kill after i seconds 'O=s', # eidx_key 'T=i', # threadid diff --git a/lib/PublicInbox/XapHelper.pm b/lib/PublicInbox/XapHelper.pm index 8c7732f5..27d98ba1 100644 --- a/lib/PublicInbox/XapHelper.pm +++ b/lib/PublicInbox/XapHelper.pm @@ -150,11 +150,6 @@ sub mset_iter ($$) { eval { my $buf = $it->get_docid; $buf .= "\0".$it->get_percent if $req->{p}; - my $doc = ($req->{A} || $req->{D}) ? $it->get_document : undef; - for my $p (@{$req->{A}}) { - $buf .= "\0".$p.$_ for xap_terms($p, $doc); - } - $buf .= "\0".$doc->get_data if $req->{D}; say { $req->{0} } $buf; }; $@ ? 
iter_retry_check($req) : 0; diff --git a/lib/PublicInbox/xap_helper.h b/lib/PublicInbox/xap_helper.h index 0e6805b3..872f063d 100644 --- a/lib/PublicInbox/xap_helper.h +++ b/lib/PublicInbox/xap_helper.h @@ -142,7 +142,6 @@ struct req { // argv and pfxv point into global rbuf bool code_search; bool relevance; // sort by relevance before column bool emit_percent; - bool emit_docdata; bool asc; // ascending sort }; @@ -641,7 +640,6 @@ static void dispatch(struct req *req) if (MY_ARG_MAX == req->pfxc) ABORT("too many -A"); break; - case 'D': req->emit_docdata = true; break; case 'K': req->timeout_sec = strtoul(optarg, &end, 10); if (*end || req->timeout_sec == ULONG_MAX) diff --git a/lib/PublicInbox/xh_mset.h b/lib/PublicInbox/xh_mset.h index 4e97a284..3727a932 100644 --- a/lib/PublicInbox/xh_mset.h +++ b/lib/PublicInbox/xh_mset.h @@ -3,20 +3,6 @@ // This file is only intended to be included by xap_helper.h // it implements pieces used by WWW, IMAP and lei -static void emit_doc_term(FILE *fp, const char *pfx, Xapian::Document *doc) -{ - Xapian::TermIterator cur = doc->termlist_begin(); - Xapian::TermIterator end = doc->termlist_end(); - size_t pfx_len = strlen(pfx); - - for (cur.skip_to(pfx); cur != end; cur++) { - std::string tn = *cur; - if (!starts_with(&tn, pfx, pfx_len)) break; - fputc(0, fp); - fwrite(tn.data(), tn.size(), 1, fp); - } -} - static enum exc_iter mset_iter(const struct req *req, FILE *fp, off_t off, Xapian::MSetIterator *i) { @@ -24,16 +10,6 @@ static enum exc_iter mset_iter(const struct req *req, FILE *fp, off_t off, fprintf(fp, "%llu", (unsigned long long)(*(*i))); // get_docid if (req->emit_percent) fprintf(fp, "%c%d", 0, i->get_percent()); - if (req->pfxc || req->emit_docdata) { - Xapian::Document doc = i->get_document(); - for (int p = 0; p < req->pfxc; p++) - emit_doc_term(fp, req->pfxv[p], &doc); - if (req->emit_docdata) { - std::string d = doc.get_data(); - fputc(0, fp); - fwrite(d.data(), d.size(), 1, fp); - } - } fputc('\n', fp); } catch 
(const Xapian::DatabaseModifiedError & e) { req->srch->db->reopen(); diff --git a/t/cindex.t b/t/cindex.t index e5f26ec3..acd74a5d 100644 --- a/t/cindex.t +++ b/t/cindex.t @@ -147,17 +147,18 @@ if ('multi-repo search') { my $test_xhc = sub { my ($xhc) = @_; + my $csrch = PublicInbox::CodeSearch->new("$tmp/ext"); my $impl = $xhc->{impl}; my ($r, @l); - $r = $xhc->mkreq([], qw(mset -D -c -g), $zp_git, @xh_args, 'NUL'); + $r = $xhc->mkreq([], qw(mset -c -g), $zp_git, @xh_args, 'NUL'); chomp(@l = <$r>); is(shift(@l), 'mset.size=2', "got expected header $impl"); my %docid2data; my @got = sort map { - my @f = split /\0/; - is scalar(@f), 2, 'got 2 entries'; - $docid2data{$f[0]} = $f[1]; - $f[1]; + my ($docid, @extra) = split /\0/; + is scalar(@extra), 0, 'no extra fields'; + $docid2data{$docid} = + $csrch->xdb->get_document($docid)->get_data; } @l; is_deeply(\@got, $exp, "expected doc_data $impl"); @@ -166,7 +167,6 @@ my $test_xhc = sub { is(shift(@l), 'mset.size=0', "got miss in wrong dir $impl"); is_deeply(\@l, [], "no extra lines $impl"); - my $csrch = PublicInbox::CodeSearch->new("$tmp/ext"); while (my ($did, $expect) = each %docid2data) { is_deeply($csrch->xdb->get_document($did)->get_data, $expect, "docid=$did data matches"); @@ -179,14 +179,15 @@ SKIP: { require_mods('+SCM_RIGHTS', 1); require PublicInbox::XapClient; my $xhc = PublicInbox::XapClient::start_helper('-j0'); - $test_xhc->($xhc); + my $csrch = PublicInbox::CodeSearch->new("$tmp/ext"); + $test_xhc->($xhc, $csrch); skip 'PI_NO_CXX set', 1 if $ENV{PI_NO_CXX}; $xhc->{impl} =~ /Cxx/ or skip 'C++ compiler or xapian development libs missing', 1; skip 'TEST_XH_CXX_ONLY set', 1 if $ENV{TEST_XH_CXX_ONLY}; local $ENV{PI_NO_CXX} = 1; # force XS or SWIG binding test $xhc = PublicInbox::XapClient::start_helper('-j0'); - $test_xhc->($xhc); + $test_xhc->($xhc, $csrch); } if ('--update') { diff --git a/t/xap_helper.t b/t/xap_helper.t index 0f474608..70c634ac 100644 --- a/t/xap_helper.t +++ b/t/xap_helper.t @@ -204,7 
+204,7 @@ for my $n (@NO_CXX) { $err = do { local $/; <$err_r> }; is $err, "mset.size=6 nr_out=5\n", "got expected status ($xhc->{impl})"; - $r = $xhc->mkreq([], qw(mset -p -A XDFID -A Q), @ibx_shard_args, + $r = $xhc->mkreq([], qw(mset -p), @ibx_shard_args, 'dfn:lib/PublicInbox/Search.pm'); chomp((my $hdr, @res) = readline($r)); is $hdr, 'mset.size=1', "got expected header via mset ($xhc->{impl}"; @@ -212,15 +212,14 @@ for my $n (@NO_CXX) { @res = split /\0/, $res[0]; { my $doc = $v2->search->xdb->get_document($res[0]); + ok $doc, 'valid document retrieved'; my @q = PublicInbox::Search::xap_terms('Q', $doc); is_deeply \@q, [ $mid ], 'docid usable'; } ok $res[1] > 0 && $res[1] <= 100, 'pct > 0 && <= 100'; - is $res[2], 'XDFID'.$dfid, 'XDFID result matches'; - is $res[3], 'Q'.$mid, 'Q (msgid) mset result matches'; - is scalar(@res), 4, 'only 4 columns in result'; + is scalar(@res), 2, 'only 2 columns in result'; - $r = $xhc->mkreq([], qw(mset -p -A XDFID -A Q), @ibx_shard_args, + $r = $xhc->mkreq([], qw(mset -p), @ibx_shard_args, 'dt:19700101'.'000000..'); chomp(($hdr, @res) = readline($r)); is $hdr, 'mset.size=6', @@ -231,17 +230,7 @@ for my $n (@NO_CXX) { my $doc = $v2->search->xdb->get_document($docid); ok $pct > 0 && $pct <= 100, "pct > 0 && <= 100 #$docid ($xhc->{impl})"; - my %terms; - for (@rest) { - s/\A([A-Z]+)// or xbail 'no prefix=', \@rest; - push @{$terms{$1}}, $_; - } - while (my ($pfx, $vals) = each %terms) { - @$vals = sort @$vals; - my @q = PublicInbox::Search::xap_terms($pfx, $doc); - is_deeply $vals, \@q, - "#$docid $pfx as expected ($xhc->{impl})"; - } + is scalar(@rest), 0, 'no extra rows returned'; } my $nr; for my $i (7, 8, 39, 40) {