From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.2 required=3.0 tests=ALL_TRUSTED,AWL,BAYES_00, DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,DKIM_VALID_EF, T_SCC_BODY_TEXT_LINE shortcircuit=no autolearn=ham autolearn_force=no version=3.4.6 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id F30721F51A for ; Tue, 28 Nov 2023 14:56:28 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=80x24.org; s=selector1; t=1701183389; bh=AiYmKmvGnzO/phgzYS/cuyIAOj9o37GOKYkx+JdTSCg=; h=From:To:Subject:Date:In-Reply-To:References:From; b=K4tsNCaGNk/uZD8P2UdwOqCWuBN1k1qpUb4U3NfkktuY5f5/WtBO2Dbh0A3rz7RTc NkG2Jriee8UfAfJTopxR4Q95HKxLzkFnLVBtzTrOzwa41zCX1t549pPVCjOq5gZRzd Sd9IoWYv8Q597q+5ASd2hLMGb+E2CJW+H96vQyVs= From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 06/14] xap_helper: implement mset endpoint for WWW, IMAP, etc... Date: Tue, 28 Nov 2023 14:56:19 +0000 Message-ID: <20231128145628.1455176-7-e@80x24.org> In-Reply-To: <20231128145628.1455176-1-e@80x24.org> References: <20231128145628.1455176-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: The C++ version will allow us to take full advantage of Xapian's APIs for better queries, and the Perl bindings version can still be advantageous in the future since we'll be able to support timeouts effectively. --- MANIFEST | 1 + Makefile.PL | 8 ++- lib/PublicInbox/Search.pm | 25 ++++++++ lib/PublicInbox/XapHelper.pm | 51 ++++++++++----- lib/PublicInbox/XapHelperCxx.pm | 6 +- lib/PublicInbox/xap_helper.h | 110 ++++++++++++++++++++++++++------ lib/PublicInbox/xh_cidx.h | 37 ++++------- lib/PublicInbox/xh_mset.h | 96 ++++++++++++++++++++++++++++ t/cindex.t | 52 ++++++++++++++- t/xap_helper.t | 49 ++++++++++++-- 10 files changed, 363 insertions(+), 72 deletions(-) create mode 100644 lib/PublicInbox/xh_mset.h diff --git a/MANIFEST b/MANIFEST index bbbe0b91..7b6178f9 100644 --- a/MANIFEST +++ b/MANIFEST @@ -379,6 +379,7 @@ lib/PublicInbox/Xapcmd.pm lib/PublicInbox/gcf2_libgit2.h lib/PublicInbox/xap_helper.h lib/PublicInbox/xh_cidx.h +lib/PublicInbox/xh_mset.h sa_config/Makefile sa_config/README sa_config/root/etc/spamassassin/public-inbox.pre diff --git a/Makefile.PL b/Makefile.PL index 38e030f5..28f8263e 100644 --- a/Makefile.PL +++ b/Makefile.PL @@ -273,14 +273,16 @@ pm_to_blib : lib/PublicInbox.pm lib/PublicInbox.pm : FORCE VERSION=\$(VERSION) \$(PERL) -w ./version-gen.perl +XH_TESTS = t/xap_helper.t t/cindex.t + test-asan : pure_all - TEST_XH_CXX_ONLY=1 CXXFLAGS='-O0 -Wall -ggdb3 -fsanitize=address' \\ - prove -bvw t/xap_helper.t + TEST_XH_CXX_ONLY=1 CXXFLAGS='-Wall -ggdb3 -fsanitize=address' \\ + prove -bvw \$(XH_TESTS) VG_OPT = -v --trace-children=yes --track-fds=yes VG_OPT += --leak-check=yes --track-origins=yes test-valgrind : pure_all TEST_XH_CXX_ONLY=1 VALGRIND="valgrind \$(VG_OPT)" \\ - prove -bvw t/xap_helper.t + prove -bvw \$(XH_TESTS) EOF } diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index 477f77dc..6145b027 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -76,6 +76,25 @@ our @MAIL_VMAP = ( ); our @MAIL_NRP; +# Getopt::Long spec, only short options for portability in C++ implementation +our @XH_SPEC = ( + 'a', # ascending sort + 'c', # code search + 'd=s@', # shard dirs + 'g=s', # git dir (with -c) + 'k=i', # sort column (like sort(1)) + 'm=i', # maximum number of results + 'o=i', # offset + 'p', # show percent + 'r', # 1=relevance then column + 't', # collapse threads + 'A=s@', # prefixes + 'D', # emit docdata + 'K=i', # timeout kill after i seconds + 'O=s', # eidx_key + 'T=i', # threadid +); + sub load_xapian () { return 1 if defined $Xap; # n.b. PI_XAPIAN is intended for development use only @@ -247,6 +266,12 @@ sub mdocid { int(($docid - 1) / $nshard) + 1; } +sub docids_to_artnums { + my $nshard = shift->{nshard}; + # XXX does array vs arrayref make a difference in modern Perls? + map { int(($_ - 1) / $nshard) + 1 } @_; +} + sub mset_to_artnums { my ($self, $mset) = @_; my $nshard = $self->{nshard}; diff --git a/lib/PublicInbox/XapHelper.pm b/lib/PublicInbox/XapHelper.pm index fe831b8f..b21e70a2 100644 --- a/lib/PublicInbox/XapHelper.pm +++ b/lib/PublicInbox/XapHelper.pm @@ -21,21 +21,6 @@ my $X = \%PublicInbox::Search::X; our (%SRCH, %WORKERS, $nworker, $workerset, $in); our $stderr = \*STDERR; -# only short options for portability in C++ implementation -our @SPEC = ( - 'a', # ascending sort - 'c', # code search - 'd=s@', # shard dirs - 'k=i', # sort column (like sort(1)) - 'm=i', # maximum number of results - 'o=i', # offset - 'r', # 1=relevance then column - 't', # collapse threads - 'A=s@', # prefixes - 'O=s', # eidx_key - 'T=i', # timeout in seconds -); - sub cmd_test_inspect { my ($req) = @_; print { $req->{0} } "pid=$$ has_threadid=", @@ -144,10 +129,44 @@ sub cmd_dump_roots { emit_mset_stats($req, $mset); } +sub mset_iter ($$) { + my ($req, $it) = @_; + eval { + my $buf = $it->get_docid; + $buf .= "\0".$it->get_percent if $req->{p}; + my $doc = ($req->{A} || $req->{D}) ? $it->get_document : undef; + for my $p (@{$req->{A}}) { + $buf .= "\0".$p.$_ for xap_terms($p, $doc); + } + $buf .= "\0".$doc->get_data if $req->{D}; + say { $req->{0} } $buf; + }; + $@ ? iter_retry_check($req) : 0; +} + +sub cmd_mset { # to be used by WWW + IMAP + my ($req, $qry_str) = @_; + $qry_str // die 'usage: mset [OPTIONS] QRY_STR'; + my $opt = { limit => $req->{'m'}, offset => $req->{o} // 0 }; + $opt->{relevance} = 1 if $req->{r}; + $opt->{threads} = 1 if defined $req->{t}; + $opt->{git_dir} = $req->{g} if defined $req->{g}; + $opt->{eidx_key} = $req->{O} if defined $req->{O}; + $opt->{threadid} = $req->{T} if defined $req->{T}; + my $mset = $req->{srch}->mset($qry_str, $opt); + say { $req->{0} } 'mset.size=', $mset->size; + for my $it ($mset->items) { + for (my $t = 10; $t > 0; --$t) { + $t = mset_iter($req, $it) // $t; + } + } +} + sub dispatch { my ($req, $cmd, @argv) = @_; my $fn = $req->can("cmd_$cmd") or return; - $GLP->getoptionsfromarray(\@argv, $req, @SPEC) or return; + $GLP->getoptionsfromarray(\@argv, $req, @PublicInbox::Search::XH_SPEC) + or return; my $dirs = delete $req->{d} or die 'no -d args'; my $key = join("\0", @$dirs); $req->{srch} = $SRCH{$key} //= do { diff --git a/lib/PublicInbox/XapHelperCxx.pm b/lib/PublicInbox/XapHelperCxx.pm index 8a66fdcd..1aa75f2a 100644 --- a/lib/PublicInbox/XapHelperCxx.pm +++ b/lib/PublicInbox/XapHelperCxx.pm @@ -20,13 +20,15 @@ $ENV{PERL_INLINE_DIRECTORY} // die('BUG: PERL_INLINE_DIRECTORY unset'); substr($dir, 0, 0) = "$ENV{PERL_INLINE_DIRECTORY}/"; my $bin = "$dir/xap_helper"; my ($srcpfx) = (__FILE__ =~ m!\A(.+/)[^/]+\z!); -my @srcs = map { $srcpfx.$_ } qw(xap_helper.h xh_cidx.h); +my @srcs = map { $srcpfx.$_ } qw(xh_mset.h xh_cidx.h xap_helper.h); my @pm_dep = map { $srcpfx.$_ } qw(Search.pm CodeSearch.pm); my $ldflags = '-Wl,-O1'; $ldflags .= ' -Wl,--compress-debug-sections=zlib' if $^O ne 'openbsd'; my $xflags = ($ENV{CXXFLAGS} // '-Wall -ggdb3 -pipe') . ' ' . ' -DTHREADID=' . PublicInbox::Search::THREADID . - ' ' . ($ENV{LDFLAGS} // $ldflags); + ' -DXH_SPEC="'.join('', + map { s/=.*/:/; $_ } @PublicInbox::Search::XH_SPEC) . '" ' . + ($ENV{LDFLAGS} // $ldflags); my $xap_modversion; sub xap_cfg (@) { diff --git a/lib/PublicInbox/xap_helper.h b/lib/PublicInbox/xap_helper.h index 89d151d9..18665567 100644 --- a/lib/PublicInbox/xap_helper.h +++ b/lib/PublicInbox/xap_helper.h @@ -124,10 +124,12 @@ struct req { // argv and pfxv point into global rbuf char *argv[MY_ARG_MAX]; char *pfxv[MY_ARG_MAX]; // -A struct srch *srch; + char *Pgit_dir; char *Oeidx_key; cmd fn; unsigned long long max; unsigned long long off; + unsigned long long threadid; unsigned long timeout_sec; size_t nr_out; long sort_col; // value column, negative means BoolWeight @@ -138,6 +140,8 @@ struct req { // argv and pfxv point into global rbuf bool collapse_threads; bool code_search; bool relevance; // sort by relevance before column + bool emit_percent; + bool emit_docdata; bool asc; // ascending sort }; @@ -230,12 +234,53 @@ static Xapian::MSet mail_mset(struct req *req, const char *qry_str) return enquire_mset(req, &enq); } +static bool starts_with(const std::string *s, const char *pfx, size_t pfx_len) +{ + return s->size() >= pfx_len && !memcmp(pfx, s->c_str(), pfx_len); +} + +static void apply_roots_filter(struct req *req, Xapian::Query *qry) +{ + if (!req->Pgit_dir) return; + req->Pgit_dir[0] = 'P'; // modifies static rbuf + Xapian::Database *xdb = req->srch->db; + for (int i = 0; i < 9; i++) { + try { + std::string P = req->Pgit_dir; + Xapian::PostingIterator p = xdb->postlist_begin(P); + if (p == xdb->postlist_end(P)) { + warnx("W: %s not indexed?", req->Pgit_dir + 1); + return; + } + Xapian::TermIterator cur = xdb->termlist_begin(*p); + Xapian::TermIterator end = xdb->termlist_end(*p); + cur.skip_to("G"); + if (cur == end) { + warnx("W: %s has no root commits?", + req->Pgit_dir + 1); + return; + } + Xapian::Query f = Xapian::Query(*cur); + for (++cur; cur != end; ++cur) { + std::string tn = *cur; + if (!starts_with(&tn, "G", 1)) + continue; + f = Xapian::Query(Xapian::Query::OP_OR, f, tn); + } + *qry = Xapian::Query(Xapian::Query::OP_FILTER, *qry, f); + return; + } catch (const Xapian::DatabaseModifiedError & e) { + xdb->reopen(); + } + } +} + // for cindex static Xapian::MSet commit_mset(struct req *req, const char *qry_str) { struct srch *srch = req->srch; Xapian::Query qry = srch->qp->parse_query(qry_str, srch->qp_flags); - // TODO: git_dir + roots_filter + apply_roots_filter(req, &qry); // we only want commits: qry = Xapian::Query(Xapian::Query::OP_FILTER, qry, @@ -254,11 +299,6 @@ static void emit_mset_stats(struct req *req, const Xapian::MSet *mset) ABORT("BUG: %s caller only passed 1 FD", req->argv[0]); } -static bool starts_with(const std::string *s, const char *pfx, size_t pfx_len) -{ - return s->size() >= pfx_len && !memcmp(pfx, s->c_str(), pfx_len); -} - static int my_setlinebuf(FILE *fp) // glibc setlinebuf(3) can't report errors { return setvbuf(fp, NULL, _IOLBF, 0); @@ -284,6 +324,32 @@ static void fbuf_init(struct fbuf *fbuf) if (!fbuf->fp) err(EXIT_FAILURE, "open_memstream(fbuf)"); } +static bool write_all(int fd, const struct fbuf *wbuf, size_t len) +{ + const char *p = wbuf->ptr; + assert(wbuf->len >= len); + do { // write to client FD + ssize_t n = write(fd, p, len); + if (n > 0) { + len -= n; + p += n; + } else { + perror(n ? "write" : "write (zero bytes)"); + return false; + } + } while (len); + return true; +} + +#define ERR_FLUSH(f) do { \ + if (ferror(f) | fflush(f)) err(EXIT_FAILURE, "ferror|fflush "#f); \ +} while (0) + +#define ERR_CLOSE(f, e) do { \ + if (ferror(f) | fclose(f)) \ + e ? err(e, "ferror|fclose "#f) : perror("ferror|fclose "#f); \ +} while (0) + static void xclose(int fd) { if (close(fd) < 0 && errno != EINTR) @@ -339,6 +405,7 @@ static bool cmd_test_inspect(struct req *req) return false; } +#include "xh_mset.h" // read-only (WWW, IMAP, lei) stuff #include "xh_cidx.h" // CodeSearchIdx.pm stuff #define CMD(n) { .fn_len = sizeof(#n) - 1, .fn_name = #n, .fn = cmd_##n } @@ -348,6 +415,7 @@ static const struct cmd_entry { cmd fn; } cmds[] = { // should be small enough to not need bsearch || gperf // most common commands first + CMD(mset), // WWW and IMAP requests CMD(dump_ibx), // many inboxes CMD(dump_roots), // per-cidx shard CMD(test_inspect), // least common commands last @@ -520,7 +588,7 @@ static void dispatch(struct req *req) char *end; FILE *kfp; struct srch **s; - req->fn = NULL; + req->threadid = ULLONG_MAX; for (c = 0; c < (int)MY_ARRAY_SIZE(cmds); c++) { if (cmds[c].fn_len == size && !memcmp(cmds[c].fn_name, req->argv[0], size)) { @@ -540,12 +608,13 @@ static void dispatch(struct req *req) optarg = NULL; MY_DO_OPTRESET(); - // keep sync with @PublicInbox::XapHelper::SPEC - while ((c = getopt(req->argc, req->argv, "acd:k:m:o:rtA:O:T:")) != -1) { + // XH_SPEC is generated from @PublicInbox::Search::XH_SPEC + while ((c = getopt(req->argc, req->argv, XH_SPEC)) != -1) { switch (c) { case 'a': req->asc = true; break; case 'c': req->code_search = true; break; case 'd': fwrite(optarg, strlen(optarg) + 1, 1, kfp); break; + case 'g': req->Pgit_dir = optarg - 1; break; // pad "P" prefix case 'k': req->sort_col = strtol(optarg, &end, 10); if (*end) ABORT("-k %s", optarg); @@ -563,6 +632,7 @@ static void dispatch(struct req *req) if (*end || req->off == ULLONG_MAX) ABORT("-o %s", optarg); break; + case 'p': req->emit_percent = true; break; case 'r': req->relevance = true; break; case 't': req->collapse_threads = true; break; case 'A': @@ -570,17 +640,22 @@ static void dispatch(struct req *req) if (MY_ARG_MAX == req->pfxc) ABORT("too many -A"); break; - case 'O': req->Oeidx_key = optarg - 1; break; // pad "O" prefix - case 'T': + case 'D': req->emit_docdata = true; break; + case 'K': req->timeout_sec = strtoul(optarg, &end, 10); if (*end || req->timeout_sec == ULONG_MAX) + ABORT("-K %s", optarg); + break; + case 'O': req->Oeidx_key = optarg - 1; break; // pad "O" prefix + case 'T': + req->threadid = strtoull(optarg, &end, 10); + if (*end || req->threadid == ULLONG_MAX) ABORT("-T %s", optarg); break; default: ABORT("bad switch `-%c'", c); } } - if (ferror(kfp) | fclose(kfp)) /* sets kbuf.srch */ - err(EXIT_FAILURE, "ferror|fclose"); // likely ENOMEM + ERR_CLOSE(kfp, EXIT_FAILURE); // may ENOMEM, sets kbuf.srch kbuf.srch->db = NULL; kbuf.srch->qp = NULL; kbuf.srch->paths_len = size - offsetof(struct srch, paths); @@ -639,8 +714,7 @@ static void stderr_restore(FILE *tmp_err) stderr = orig_err; return; #endif - if (ferror(stderr) | fflush(stderr)) - err(EXIT_FAILURE, "ferror|fflush stderr"); + ERR_CLOSE(stderr, EXIT_FAILURE); while (dup2(orig_err_fd, STDERR_FILENO) < 0) { if (errno != EINTR) err(EXIT_FAILURE, "dup2(%d => 2)", orig_err_fd); @@ -670,12 +744,10 @@ static void recv_loop(void) // worker process loop stderr_set(req.fp[1]); req.argc = (int)SPLIT2ARGV(req.argv, rbuf, len); dispatch(&req); - if (ferror(req.fp[0]) | fclose(req.fp[0])) - perror("ferror|fclose fp[0]"); + ERR_CLOSE(req.fp[0], 0); if (req.fp[1]) { stderr_restore(req.fp[1]); - if (ferror(req.fp[1]) | fclose(req.fp[1])) - perror("ferror|fclose fp[1]"); + ERR_CLOSE(req.fp[1], 0); } } } diff --git a/lib/PublicInbox/xh_cidx.h b/lib/PublicInbox/xh_cidx.h index c2d94162..1980f9f6 100644 --- a/lib/PublicInbox/xh_cidx.h +++ b/lib/PublicInbox/xh_cidx.h @@ -107,8 +107,7 @@ static bool root2offs_str(struct fbuf *root_offs, Xapian::Document *doc) fputs((const char *)ep->data, root_offs->fp); } fputc('\n', root_offs->fp); - if (ferror(root_offs->fp) | fclose(root_offs->fp)) - err(EXIT_FAILURE, "ferror|fclose(root_offs)"); // ENOMEM + ERR_CLOSE(root_offs->fp, EXIT_FAILURE); // ENOMEM root_offs->fp = NULL; return true; } @@ -138,38 +137,24 @@ static void dump_roots_term(struct req *req, const char *pfx, // buffering and rely on flock(2), here static bool dump_roots_flush(struct req *req, struct dump_roots_tmp *drt) { - char *p; - int fd = fileno(req->fp[0]); bool ok = true; + off_t off = ftello(drt->wbuf.fp); + if (off < 0) EABORT("ftello"); + if (!off) return ok; + + ERR_FLUSH(drt->wbuf.fp); // ENOMEM + int fd = fileno(req->fp[0]); - if (!drt->wbuf.fp) return true; - if (fd < 0) EABORT("BUG: fileno"); - if (ferror(drt->wbuf.fp) | fclose(drt->wbuf.fp)) // ENOMEM? - err(EXIT_FAILURE, "ferror|fclose(drt->wbuf.fp)"); - drt->wbuf.fp = NULL; - if (!drt->wbuf.len) goto done_free; while (flock(drt->root2off_fd, LOCK_EX)) { if (errno == EINTR) continue; err(EXIT_FAILURE, "LOCK_EX"); // ENOLCK? } - p = drt->wbuf.ptr; - do { // write to client FD - ssize_t n = write(fd, p, drt->wbuf.len); - if (n > 0) { - drt->wbuf.len -= n; - p += n; - } else { - perror(n ? "write" : "write (zero bytes)"); - return false; - } - } while (drt->wbuf.len); + ok = write_all(fd, &drt->wbuf, (size_t)off); while (flock(drt->root2off_fd, LOCK_UN)) { if (errno == EINTR) continue; err(EXIT_FAILURE, "LOCK_UN"); // ENOLCK? } -done_free: // OK to skip on errors, dump_roots_ensure calls fbuf_ensure - free(drt->wbuf.ptr); - drt->wbuf.ptr = NULL; + if (fseeko(drt->wbuf.fp, 0, SEEK_SET)) EABORT("fseeko"); return ok; } @@ -238,11 +223,11 @@ static bool cmd_dump_roots(struct req *req) req->sort_col = -1; Xapian::MSet mset = commit_mset(req, req->argv[optind + 1]); + fbuf_init(&drt.wbuf); + // @UNIQ_FOLD in CodeSearchIdx.pm can handle duplicate lines fine // in case we need to retry on DB reopens for (Xapian::MSetIterator i = mset.begin(); i != mset.end(); i++) { - if (!drt.wbuf.fp) - fbuf_init(&drt.wbuf); for (int t = 10; t > 0; --t) switch (dump_roots_iter(req, &drt, &i)) { case ITER_OK: t = 0; break; // leave inner loop diff --git a/lib/PublicInbox/xh_mset.h b/lib/PublicInbox/xh_mset.h new file mode 100644 index 00000000..056fe22b --- /dev/null +++ b/lib/PublicInbox/xh_mset.h @@ -0,0 +1,96 @@ +// Copyright (C) all contributors +// License: GPL-2.0+ +// This file is only intended to be included by xap_helper.h +// it implements pieces used by WWW, IMAP and lei + +static void emit_doc_term(FILE *fp, const char *pfx, Xapian::Document *doc) +{ + Xapian::TermIterator cur = doc->termlist_begin(); + Xapian::TermIterator end = doc->termlist_end(); + size_t pfx_len = strlen(pfx); + + for (cur.skip_to(pfx); cur != end; cur++) { + std::string tn = *cur; + if (!starts_with(&tn, pfx, pfx_len)) continue; + fputc(0, fp); + fwrite(tn.data(), tn.size(), 1, fp); + } +} + +static enum exc_iter mset_iter(const struct req *req, FILE *fp, off_t off, + Xapian::MSetIterator *i) +{ + try { + fprintf(fp, "%llu", (unsigned long long)(*(*i))); // get_docid + if (req->emit_percent) + fprintf(fp, "%c%d", 0, i->get_percent()); + if (req->pfxc || req->emit_docdata) { + Xapian::Document doc = i->get_document(); + for (int p = 0; p < req->pfxc; p++) + emit_doc_term(fp, req->pfxv[p], &doc); + if (req->emit_docdata) { + std::string d = doc.get_data(); + fputc(0, fp); + fwrite(d.data(), d.size(), 1, fp); + } + } + fputc('\n', fp); + } catch (const Xapian::DatabaseModifiedError & e) { + req->srch->db->reopen(); + if (fseeko(fp, off, SEEK_SET) < 0) EABORT("fseeko"); + return ITER_RETRY; + } catch (const Xapian::DocNotFoundError & e) { // oh well... + warnx("doc not found: %s", e.get_description().c_str()); + if (fseeko(fp, off, SEEK_SET) < 0) EABORT("fseeko"); + } + return ITER_OK; +} + +#ifndef WBUF_FLUSH_THRESHOLD +# define WBUF_FLUSH_THRESHOLD (BUFSIZ - 1000) +#endif +#if WBUF_FLUSH_THRESHOLD < 0 +# undef WBUF_FLUSH_THRESHOLD +# define WBUF_FLUSH_THRESHOLD BUFSIZ +#endif + +static bool cmd_mset(struct req *req) +{ + if (optind >= req->argc) ABORT("usage: mset [OPTIONS] WANT QRY_STR"); + if (req->fp[1]) ABORT("mset only accepts 1 FD"); + const char *qry_str = req->argv[optind]; + CLEANUP_FBUF struct fbuf wbuf = {}; + Xapian::MSet mset = req->code_search ? commit_mset(req, qry_str) : + mail_mset(req, qry_str); + fbuf_init(&wbuf); + fprintf(wbuf.fp, "mset.size=%llu\n", (unsigned long long)mset.size()); + int fd = fileno(req->fp[0]); + for (Xapian::MSetIterator i = mset.begin(); i != mset.end(); i++) { + off_t off = ftello(wbuf.fp); + if (off < 0) EABORT("ftello"); + /* + * TODO verify our fflush + fseeko use isn't affected by a + * glibc <2.25 bug: + * https://sourceware.org/bugzilla/show_bug.cgi?id=20181 + * CentOS 7.x only has glibc 2.17. In any case, bug #20181 + * shouldn't affect us since our use of fseeko is used to + * effectively discard data. + */ + if (off > WBUF_FLUSH_THRESHOLD) { + ERR_FLUSH(wbuf.fp); + if (!write_all(fd, &wbuf, (size_t)off)) return false; + if (fseeko(wbuf.fp, 0, SEEK_SET)) EABORT("fseeko"); + off = 0; + } + for (int t = 10; t > 0; --t) + switch (mset_iter(req, wbuf.fp, off, &i)) { + case ITER_OK: t = 0; break; // leave inner loop + case ITER_RETRY: break; // continue for-loop + case ITER_ABORT: return false; // error + } + } + off_t off = ftello(wbuf.fp); + if (off < 0) EABORT("ftello"); + ERR_FLUSH(wbuf.fp); + return off > 0 ? write_all(fd, &wbuf, (size_t)off) : true; +} diff --git a/t/cindex.t b/t/cindex.t index 261945bf..a9075092 100644 --- a/t/cindex.t +++ b/t/cindex.t @@ -121,22 +121,70 @@ my $no_metadata_set = sub { use_ok 'PublicInbox::CodeSearch'; + +my @xh_args; +my $exp = [ 'initial with NUL character', 'remove NUL character' ]; +my $zp_git = abs_path("$zp/.git"); if ('multi-repo search') { my $csrch = PublicInbox::CodeSearch->new("$tmp/ext"); my $mset = $csrch->mset('NUL'); is(scalar($mset->items), 2, 'got results'); - my $exp = [ 'initial with NUL character', 'remove NUL character' ]; my @have = sort(map { $_->get_document->get_data } $mset->items); is_xdeeply(\@have, $exp, 'got expected subjects'); $mset = $csrch->mset('NUL', { git_dir => "$tmp/wt0/.git" }); is(scalar($mset->items), 0, 'no results with other GIT_DIR'); - $mset = $csrch->mset('NUL', { git_dir => abs_path("$zp/.git") }); + $mset = $csrch->mset('NUL', { git_dir => $zp_git }); @have = sort(map { $_->get_document->get_data } $mset->items); is_xdeeply(\@have, $exp, 'got expected subjects w/ GIT_DIR filter'); my @xdb = $csrch->xdb_shards_flat; $no_metadata_set->(0, ['indexlevel'], \@xdb); + @xh_args = $csrch->xh_args; +} + +my $test_xhc = sub { + my ($xhc) = @_; + my $impl = $xhc->{impl}; + my ($r, @l); + $r = $xhc->mkreq([], qw(mset -D -c -g), $zp_git, @xh_args, 'NUL'); + chomp(@l = <$r>); + is(shift(@l), 'mset.size=2', "got expected header $impl"); + my %docid2data; + my @got = sort map { + my @f = split /\0/; + is scalar(@f), 2, 'got 2 entries'; + $docid2data{$f[0]} = $f[1]; + $f[1]; + } @l; + is_deeply(\@got, $exp, "expected doc_data $impl"); + + $r = $xhc->mkreq([], qw(mset -c -g), "$tmp/wt0/.git", @xh_args, 'NUL'); + chomp(@l = <$r>); + is(shift(@l), 'mset.size=0', "got miss in wrong dir $impl"); + is_deeply(\@l, [], "no extra lines $impl"); + + my $csrch = PublicInbox::CodeSearch->new("$tmp/ext"); + while (my ($did, $expect) = each %docid2data) { + is_deeply($csrch->xdb->get_document($did)->get_data, + $expect, "docid=$did data matches"); + } + ok(!$xhc->{io}->close, "$impl close"); + is($?, 66 << 8, "got EX_NOINPUT from $impl exit"); +}; + +SKIP: { + require_mods('+SCM_RIGHTS', 1); + require PublicInbox::XapClient; + my $xhc = PublicInbox::XapClient::start_helper('-j0'); + $test_xhc->($xhc); + skip 'PI_NO_CXX set', 1 if $ENV{PI_NO_CXX}; + $xhc->{impl} =~ /Cxx/ or + skip 'C++ compiler or xapian development libs missing', 1; + skip 'TEST_XH_CXX_ONLY set', 1 if $ENV{TEST_XH_CXX_ONLY}; + local $ENV{PI_NO_CXX} = 1; # force XS or SWIG binding test + $xhc = PublicInbox::XapClient::start_helper('-j0'); + $test_xhc->($xhc); } if ('--update') { diff --git a/t/xap_helper.t b/t/xap_helper.t index e3abeded..ee25b2dc 100644 --- a/t/xap_helper.t +++ b/t/xap_helper.t @@ -40,6 +40,7 @@ my $v2 = create_inbox 'v2', indexlevel => 'medium', version => 2, }; my @ibx_idx = glob("$v2->{inboxdir}/xap*/?"); +my @ibx_shard_args = map { ('-d', $_) } @ibx_idx; my (@int) = glob("$crepo/public-inbox-cindex/cidx*/?"); my (@ext) = glob("$crepo/cidx-ext/cidx*/?"); is(scalar(@ext), 2, 'have 2 external shards') or diag explain(\@ext); @@ -76,8 +77,7 @@ my $test = sub { is($cinfo{has_threadid}, '0', 'has_threadid false for cindex'); is($cinfo{pid}, $info{pid}, 'PID unchanged for cindex'); - my @dump = (qw(dump_ibx -A XDFID), (map { ('-d', $_) } @ibx_idx), - qw(13 rt:0..)); + my @dump = (qw(dump_ibx -A XDFID), @ibx_shard_args, qw(13 rt:0..)); $r = $doreq->($s, @dump); my @res; while (sysread($r, my $buf, 512) != 0) { push @res, $buf } @@ -89,7 +89,8 @@ my $test = sub { my $res = do { local $/; <$r> }; is(join('', @res), $res, 'got identical response w/ error pipe'); my $stats = do { local $/; <$err_rd> }; - is($stats, "mset.size=6 nr_out=6\n", 'mset.size reported'); + is($stats, "mset.size=6 nr_out=6\n", 'mset.size reported') or + diag "res=$res"; return wantarray ? ($ar, $s) : $ar if $cinfo{pid} == $pid; @@ -198,7 +199,47 @@ for my $n (@NO_CXX) { is(scalar(@res), scalar(grep(/\A[0-9a-f]{40,} [0-9]+\n\z/, @res)), 'entries match format'); $err = do { local $/; <$err_r> }; - is($err, "mset.size=6 nr_out=5\n", "got expected status ($xhc->{impl})"); + is $err, "mset.size=6 nr_out=5\n", "got expected status ($xhc->{impl})"; + + $r = $xhc->mkreq([], qw(mset -p -A XDFID -A Q), @ibx_shard_args, + 'dfn:lib/PublicInbox/Search.pm'); + chomp((my $hdr, @res) = readline($r)); + is $hdr, 'mset.size=1', "got expected header via mset ($xhc->{impl}"; + is scalar(@res), 1, 'got one result'; + @res = split /\0/, $res[0]; + { + my $doc = $v2->search->xdb->get_document($res[0]); + my @q = PublicInbox::Search::xap_terms('Q', $doc); + is_deeply \@q, [ $mid ], 'docid usable'; + } + ok $res[1] > 0 && $res[1] <= 100, 'pct > 0 && <= 100'; + is $res[2], 'XDFID'.$dfid, 'XDFID result matches'; + is $res[3], 'Q'.$mid, 'Q (msgid) mset result matches'; + is scalar(@res), 4, 'only 4 columns in result'; + + $r = $xhc->mkreq([], qw(mset -p -A XDFID -A Q), @ibx_shard_args, + 'dt:19700101'.'000000..'); + chomp(($hdr, @res) = readline($r)); + is $hdr, 'mset.size=6', + "got expected header via multi-result mset ($xhc->{impl}"; + is(scalar(@res), 6, 'got 6 rows'); + for my $r (@res) { + my ($docid, $pct, @rest) = split /\0/, $r; + my $doc = $v2->search->xdb->get_document($docid); + ok $pct > 0 && $pct <= 100, + "pct > 0 && <= 100 #$docid ($xhc->{impl})"; + my %terms; + for (@rest) { + s/\A([A-Z]+)// or xbail 'no prefix=', \@rest; + push @{$terms{$1}}, $_; + } + while (my ($pfx, $vals) = each %terms) { + @$vals = sort @$vals; + my @q = PublicInbox::Search::xap_terms($pfx, $doc); + is_deeply $vals, \@q, + "#$docid $pfx as expected ($xhc->{impl})"; + } + } } done_testing;