From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: <e@80x24.org> X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.2 required=3.0 tests=ALL_TRUSTED,AWL,BAYES_00, DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,DKIM_VALID_EF, T_SCC_BODY_TEXT_LINE shortcircuit=no autolearn=ham autolearn_force=no version=3.4.6 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 7C3B41F518 for <meta@public-inbox.org>; Fri, 8 Dec 2023 03:54:39 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=80x24.org; s=selector1; t=1702007679; bh=3ax0TY25rniRs4wxsVz3wimEFGmFWNQKfsYlxfa8skU=; h=From:To:Subject:Date:In-Reply-To:References:From; b=gcnxs5hZcckvylN4En5Htr5c11GzZNaqVZ2lBxLty7+P5SVbMV3FbWaX9X0FYZO2/ 8n/7AJ3vLESlIQ7LuEFIiEyFE43AYy3UVKqC78OhTdN6QRS02cvf/foEWJTQZYX+3w NO+M8EA6i4Qkox26mHBsIooO4OnSgXuhr++NgwYY= From: Eric Wong <e@80x24.org> To: meta@public-inbox.org Subject: [PATCH 5/6] xap_helper: support term length limit Date: Fri, 8 Dec 2023 03:54:37 +0000 Message-ID: <20231208035438.3710696-6-e@80x24.org> In-Reply-To: <20231208035438.3710696-1-e@80x24.org> References: <20231208035438.3710696-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: <meta.public-inbox.org> This will allow us to use p2q-compatible specifications such as "dfpost7" to only capture blob OIDs which are 7 characters in length (the indexer will always index down to 7 characters) --- lib/PublicInbox/XapHelper.pm | 24 +++++++++++++++--- lib/PublicInbox/xap_helper.h | 11 ++++++++- lib/PublicInbox/xh_cidx.h | 48 ++++++++++++++++++++++++++++++++---- t/xap_helper.t | 33 +++++++++++++++++++++++++ 4 files changed, 106 insertions(+), 10 deletions(-) diff --git a/lib/PublicInbox/XapHelper.pm b/lib/PublicInbox/XapHelper.pm index b21e70a2..ed11a2f8 100644 --- a/lib/PublicInbox/XapHelper.pm +++ b/lib/PublicInbox/XapHelper.pm @@ -39,13 +39,24 @@ sub iter_retry_check ($) { } } +sub term_length_extract ($) { + my ($req) = @_; + @{$req->{A_len}} = map { + my $len = s/([0-9]+)\z// ? ($1 + 0) : undef; + [ $_, $len ]; + } @{$req->{A}}; +} + sub dump_ibx_iter ($$$) { my ($req, $ibx_id, $it) = @_; my $out = $req->{0}; eval { my $doc = $it->get_document; - for my $p (@{$req->{A}}) { - for (xap_terms($p, $doc)) { + for my $pair (@{$req->{A_len}}) { + my ($pfx, $len) = @$pair; + my @t = xap_terms($pfx, $doc); + @t = grep { length == $len } @t if defined($len); + for (@t) { print $out "$_ $ibx_id\n" or die "print: $!"; ++$req->{nr_out}; } @@ -64,6 +75,7 @@ sub cmd_dump_ibx { my ($req, $ibx_id, $qry_str) = @_; $qry_str // die 'usage: dump_ibx [OPTIONS] IBX_ID QRY_STR'; $req->{A} or die 'dump_ibx requires -A PREFIX'; + term_length_extract $req; my $max = $req->{'m'} // $req->{srch}->{xdb}->get_doccount; my $opt = { relevance => -1, limit => $max, offset => $req->{o} // 0 }; $opt->{eidx_key} = $req->{O} if defined $req->{O}; @@ -82,8 +94,11 @@ sub dump_roots_iter ($$$) { eval { my $doc = $it->get_document; my $G = join(' ', map { $root2off->{$_} } xap_terms('G', $doc)); - for my $p (@{$req->{A}}) { - for (xap_terms($p, $doc)) { + for my $pair (@{$req->{A_len}}) { + my ($pfx, $len) = @$pair; + my @t = xap_terms($pfx, $doc); + @t = grep { length == $len } @t if defined($len); + for (@t) { $req->{wbuf} .= "$_ $G\n"; ++$req->{nr_out}; } @@ -106,6 +121,7 @@ sub cmd_dump_roots { my ($req, $root2off_file, $qry_str) = @_; $qry_str // die 'usage: dump_roots [OPTIONS] ROOT2ID_FILE QRY_STR'; $req->{A} or die 'dump_roots requires -A PREFIX'; + term_length_extract $req; open my $fh, '<', $root2off_file; my $root2off; # record format: $OIDHEX "\0" uint32_t my @x = split(/\0/, read_all $fh); diff --git a/lib/PublicInbox/xap_helper.h b/lib/PublicInbox/xap_helper.h index 1f8c426b..3456910b 100644 --- a/lib/PublicInbox/xap_helper.h +++ b/lib/PublicInbox/xap_helper.h @@ -123,6 +123,7 @@ typedef bool (*cmd)(struct req *); struct req { // argv and pfxv point into global rbuf char *argv[MY_ARG_MAX]; char *pfxv[MY_ARG_MAX]; // -A <prefix> + size_t *lenv; // -A <prefix>LENGTH struct srch *srch; char *Pgit_dir; char *Oeidx_key; @@ -727,6 +728,13 @@ static void sigw(int sig) // SIGTERM handler for worker sock_fd = -1; // break out of recv_loop } +#define CLEANUP_REQ __attribute__((__cleanup__(req_cleanup))) +static void req_cleanup(void *ptr) +{ + struct req *req = (struct req *)ptr; + free(req->lenv); +} + static void recv_loop(void) // worker process loop { static char rbuf[4096 * 33]; // per-process @@ -737,7 +745,8 @@ static void recv_loop(void) // worker process loop while (sock_fd == 0) { size_t len = sizeof(rbuf); - struct req req = {}; + CLEANUP_REQ struct req req = {}; + if (!recv_req(&req, rbuf, &len)) continue; if (req.fp[1]) diff --git a/lib/PublicInbox/xh_cidx.h b/lib/PublicInbox/xh_cidx.h index 2803b3a4..311ca05f 100644 --- a/lib/PublicInbox/xh_cidx.h +++ b/lib/PublicInbox/xh_cidx.h @@ -3,16 +3,49 @@ // This file is only intended to be included by xap_helper.h // it implements pieces used by CodeSearchIdx.pm -static void dump_ibx_term(struct req *req, const char *pfx, +static void term_length_extract(struct req *req) +{ + req->lenv = (size_t *)calloc(req->pfxc, sizeof(size_t)); + if (!req->lenv) + EABORT("lenv = calloc(%d %zu)", req->pfxc, sizeof(size_t)); + for (int i = 0; i < req->pfxc; i++) { + char *pfx = req->pfxv[i]; + // extract trailing digits as length: + // $len = s/([0-9]+)\z// ? ($1+0) : 0 + for (size_t j = 0; pfx[j]; j++) { + if (pfx[j] < '0' || pfx[j] > '9') + continue; + if (j == 0) { + warnx("W: `%s' not a valid prefix", pfx); + continue; + } + char *end; + unsigned long long tmp = strtoull(pfx + j, &end, 10); + if (*end || tmp >= (unsigned long long)SIZE_MAX) { + warnx("W: `%s' not recognized", pfx); + } else { + req->lenv[i] = (size_t)tmp; + pfx[j] = 0; + break; + } + } + } +} + +static void dump_ibx_term(struct req *req, int p, Xapian::Document *doc, const char *ibx_id) { Xapian::TermIterator cur = doc->termlist_begin(); Xapian::TermIterator end = doc->termlist_end(); + const char *pfx = req->pfxv[p]; size_t pfx_len = strlen(pfx); + size_t term_len = req->lenv[p]; for (cur.skip_to(pfx); cur != end; cur++) { std::string tn = *cur; if (!starts_with(&tn, pfx, pfx_len)) break; + if (term_len > 0 && (tn.length() - pfx_len) != term_len) + continue; fprintf(req->fp[0], "%s %s\n", tn.c_str() + pfx_len, ibx_id); ++req->nr_out; } @@ -24,7 +57,7 @@ static enum exc_iter dump_ibx_iter(struct req *req, const char *ibx_id, try { Xapian::Document doc = i->get_document(); for (int p = 0; p < req->pfxc; p++) - dump_ibx_term(req, req->pfxv[p], &doc, ibx_id); + dump_ibx_term(req, p, &doc, ibx_id); } catch (const Xapian::DatabaseModifiedError & e) { req->srch->db->reopen(); return ITER_RETRY; @@ -46,6 +79,7 @@ static bool cmd_dump_ibx(struct req *req) EABORT("setlinebuf(fp[0])"); // WTF? req->asc = true; req->sort_col = -1; + term_length_extract(req); Xapian::MSet mset = mail_mset(req, req->argv[optind + 1]); // @UNIQ_FOLD in CodeSearchIdx.pm can handle duplicate lines fine @@ -110,18 +144,22 @@ static bool root2offs_str(struct fbuf *root_offs, Xapian::Document *doc) // writes term values matching @pfx for a given @doc, ending the line // with the contents of @root_offs -static void dump_roots_term(struct req *req, const char *pfx, +static void dump_roots_term(struct req *req, int p, struct dump_roots_tmp *drt, struct fbuf *root_offs, Xapian::Document *doc) { Xapian::TermIterator cur = doc->termlist_begin(); Xapian::TermIterator end = doc->termlist_end(); + const char *pfx = req->pfxv[p]; size_t pfx_len = strlen(pfx); + size_t term_len = req->lenv[p]; for (cur.skip_to(pfx); cur != end; cur++) { std::string tn = *cur; if (!starts_with(&tn, pfx, pfx_len)) break; + if (term_len > 0 && (tn.length() - pfx_len) != term_len) + continue; fputs(tn.c_str() + pfx_len, drt->wbuf.fp); fwrite(root_offs->ptr, root_offs->len, 1, drt->wbuf.fp); ++req->nr_out; @@ -163,8 +201,7 @@ static enum exc_iter dump_roots_iter(struct req *req, if (!root2offs_str(&root_offs, &doc)) return ITER_ABORT; // bad request, abort for (int p = 0; p < req->pfxc; p++) - dump_roots_term(req, req->pfxv[p], drt, - &root_offs, &doc); + dump_roots_term(req, p, drt, &root_offs, &doc); } catch (const Xapian::DatabaseModifiedError & e) { req->srch->db->reopen(); return ITER_RETRY; @@ -217,6 +254,7 @@ static bool cmd_dump_roots(struct req *req) req->asc = true; req->sort_col = -1; Xapian::MSet mset = commit_mset(req, req->argv[optind + 1]); + term_length_extract(req); fbuf_init(&drt.wbuf); diff --git a/t/xap_helper.t b/t/xap_helper.t index ec78998c..be010c75 100644 --- a/t/xap_helper.t +++ b/t/xap_helper.t @@ -241,6 +241,39 @@ for my $n (@NO_CXX) { "#$docid $pfx as expected ($xhc->{impl})"; } } + my $nr; + for my $i (7, 8, 39, 40) { + pipe($err_r, $err_w); + $r = $xhc->mkreq([ undef, $err_w ], qw(dump_roots -c -A), + "XDFPOST$i", (map { ('-d', $_) } @int), + $root2id_file, 'dt:19700101'.'000000..'); + close $err_w; + @res = <$r>; + my @err = <$err_r>; + if (defined $nr) { + is scalar(@res), $nr, + "got expected results ($xhc->{impl})"; + } else { + $nr //= scalar @res; + ok $nr, "got initial results ($xhc->{impl})"; + } + my @oids = (join('', @res) =~ /^([a-f0-9]+) /gms); + is_deeply [grep { length == $i } @oids], \@oids, + "all OIDs match expected length ($xhc->{impl})"; + my ($nr_out) = ("@err" =~ /nr_out=(\d+)/); + is $nr_out, scalar(@oids), "output count matches $xhc->{impl}" + or diag explain(\@res, \@err); + } + pipe($err_r, $err_w); + $r = $xhc->mkreq([ undef, $err_w ], qw(dump_ibx -A XDFPOST7), + @ibx_shard_args, qw(13 rt:0..)); + close $err_w; + @res = <$r>; + my @err = <$err_r>; + my ($nr_out) = ("@err" =~ /nr_out=(\d+)/); + my @oids = (join('', @res) =~ /^([a-f0-9]{7}) /gms); + is $nr_out, scalar(@oids), "output count matches $xhc->{impl}" or + diag explain(\@res, \@err); } done_testing;