From: Eric Wong <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH 3/2] imap: support isearch and reduce Xapian queries
Date: Sat, 5 Dec 2020 11:10:45 +0000 [thread overview]
Message-ID: <20201205111045.GA27365@dcvr> (raw)
In-Reply-To: <20201205101138.11973-1-e@80x24.org>
Since IMAP search (either with Isearch or traditional per-Inbox
search) only returns UIDs, we can safely set the limit to the
UID slice size(*). With isearch, we can also trust the Xapian
result to fit any docid range we specify.
Limiting Xapian results to 1000 was making the ->ALL docid <=>
per-Inbox UID mapping impossible, since results could overlap
between ranges unpredictably.
Finally, we can map the ->ALL docids into per-Inbox UIDs and
show them to the client in the UID order of the Inbox, not the
docid order of the ->ALL extindex.
This also lets us get rid of the "uid:" query parser prefix
and use the Xapian::Query API directly to reduce our search
prefix footprint.
For mbox.gz downloads in WWW, we'll also make a best effort to
preserve the order from the Inbox, not the order of extindex;
though it's possible large result sets can have non-overlapping
windows.
(*) by definition, UID slice size is a "safe" value which
shouldn't OOM either the server or clients.
---
lib/PublicInbox/IMAP.pm | 41 +++++++----------------------
lib/PublicInbox/Isearch.pm | 54 +++++++++++++++++++++++++++++++++-----
lib/PublicInbox/Mbox.pm | 6 ++---
lib/PublicInbox/Search.pm | 8 +++++-
4 files changed, 67 insertions(+), 42 deletions(-)
diff --git a/lib/PublicInbox/IMAP.pm b/lib/PublicInbox/IMAP.pm
index 9599f494..f123eb01 100644
--- a/lib/PublicInbox/IMAP.pm
+++ b/lib/PublicInbox/IMAP.pm
@@ -1122,33 +1122,6 @@ sub parse_query ($$) {
$q;
}
-sub refill_xap ($$$$) {
- my ($self, $uids, $range_info, $q) = @_;
- my ($beg, $end) = @$range_info;
- my $srch = $self->{ibx}->search;
- my $opt = { mset => 2, limit => 1000 };
- my $mset = $srch->mset("$q uid:$beg..$end", $opt);
- @$uids = @{$srch->mset_to_artnums($mset)};
- if (@$uids) {
- $range_info->[0] = $uids->[-1] + 1; # update $beg
- return; # possibly more
- }
- 0; # all done
-}
-
-sub search_xap_range { # long_response
- my ($self, $tag, $q, $range_info, $want_msn) = @_;
- my $uids = [];
- if (defined(my $err = refill_xap($self, $uids, $range_info, $q))) {
- $err ||= 'OK Search done';
- $self->write("\r\n$tag $err\r\n");
- return;
- }
- msn_convert($self, $uids) if $want_msn;
- $self->msg_more(join(' ', '', @$uids));
- 1; # more
-}
-
sub search_common {
my ($self, $tag, $query, $want_msn) = @_;
my $ibx = $self->{ibx} or return "$tag BAD No mailbox selected\r\n";
@@ -1160,11 +1133,17 @@ sub search_common {
long_response($self, \&search_uid_range,
$tag, $sql, $range_info, $want_msn);
} elsif ($q = $q->{xap}) {
- $self->{ibx}->search or
+ my $srch = $self->{ibx}->isrch or
return "$tag BAD search not available for mailbox\r\n";
- $self->msg_more('* SEARCH');
- long_response($self, \&search_xap_range,
- $tag, $q, $range_info, $want_msn);
+ my $opt = {
+ mset => 2,
+ limit => UID_SLICE,
+ uid_range => $range_info
+ };
+ my $mset = $srch->mset($q, $opt);
+ my $uids = $srch->mset_to_artnums($mset, $opt);
+ msn_convert($self, $uids) if $want_msn;
+ "* SEARCH @$uids\r\n$tag OK Search done\r\n";
} else {
"$tag BAD Error\r\n";
}
diff --git a/lib/PublicInbox/Isearch.pm b/lib/PublicInbox/Isearch.pm
index 0ab3b19a..8a1f257a 100644
--- a/lib/PublicInbox/Isearch.pm
+++ b/lib/PublicInbox/Isearch.pm
@@ -15,12 +15,6 @@ sub new {
bless { es => $es, eidx_key => $ibx->eidx_key }, __PACKAGE__;
}
-sub mset {
- my ($self, $str, $opt) = @_;
- $self->{es}->mset($str, { $opt ? %$opt : (),
- eidx_key => $self->{eidx_key} });
-}
-
sub _ibx_id ($) {
my ($self) = @_;
my $sth = $self->{es}->over->dbh->prepare_cached(<<'', undef, 1);
@@ -31,11 +25,57 @@ SELECT ibx_id FROM inboxes WHERE eidx_key = ? LIMIT 1
die "E: `$self->{eidx_key}' not in $self->{es}->{topdir}\n";
}
+
+sub mset {
+ my ($self, $str, $opt) = @_;
+ my %opt = $opt ? %$opt : ();
+ $opt{eidx_key} = $self->{eidx_key};
+ if (my $uid_range = $opt{uid_range}) {
+ my ($beg, $end) = @$uid_range;
+ my $ibx_id = $self->{-ibx_id} //= _ibx_id($self);
+ my $dbh = $self->{es}->{over}->dbh;
+ my $sth = $dbh->prepare_cached(<<'', undef, 1);
+SELECT MIN(docid) FROM xref3 WHERE ibx_id = ? AND xnum >= ? AND xnum <= ?
+
+ $sth->execute($ibx_id, $beg, $end);
+ my @r = ($sth->fetchrow_array);
+
+ $sth = $dbh->prepare_cached(<<'', undef, 1);
+SELECT MAX(docid) FROM xref3 WHERE ibx_id = ? AND xnum >= ? AND xnum <= ?
+
+ $sth->execute($ibx_id, $beg, $end);
+ $r[1] = $sth->fetchrow_array;
+ if (defined($r[1]) && defined($r[0])) {
+ $opt{limit} = $r[1] - $r[0] + 1;
+ } else {
+ $r[1] //= 0xffffffff;
+ $r[0] //= 0;
+ }
+ $opt{uid_range} = \@r;
+ }
+ $self->{es}->mset($str, \%opt);
+}
+
sub mset_to_artnums {
- my ($self, $mset) = @_;
+ my ($self, $mset, $opt) = @_;
my $docids = PublicInbox::Search::mset_to_artnums($self->{es}, $mset);
my $ibx_id = $self->{-ibx_id} //= _ibx_id($self);
my $qmarks = join(',', map { '?' } @$docids);
+ if ($opt && ($opt->{mset} // 0) == 2) { # opt->{mset} = 2 was used
+ my $range = '';
+ my @r;
+ if (my $r = $opt->{uid_range}) {
+ $range = 'AND xnum >= ? AND xnum <= ?';
+ @r = @$r;
+ }
+ my $rows = $self->{es}->over->dbh->
+ selectall_arrayref(<<"", undef, $ibx_id, @$docids, @r);
+SELECT xnum FROM xref3 WHERE ibx_id = ? AND docid IN ($qmarks) $range
+ORDER BY xnum ASC
+
+ return [ map { $_->[0] } @$rows ];
+ }
+
my $rows = $self->{es}->over->dbh->
selectall_arrayref(<<"", undef, $ibx_id, @$docids);
SELECT docid,xnum FROM xref3 WHERE ibx_id = ? AND docid IN ($qmarks)
diff --git a/lib/PublicInbox/Mbox.pm b/lib/PublicInbox/Mbox.pm
index 19459150..0df31e7f 100644
--- a/lib/PublicInbox/Mbox.pm
+++ b/lib/PublicInbox/Mbox.pm
@@ -222,7 +222,7 @@ sub results_cb {
my $mset = $srch->mset($ctx->{query}, $ctx->{qopts});
my $size = $mset->size or return;
$ctx->{qopts}->{offset} += $size;
- $ctx->{ids} = $srch->mset_to_artnums($mset);
+ $ctx->{ids} = $srch->mset_to_artnums($mset, $ctx->{qopts});
}
}
@@ -244,7 +244,7 @@ sub results_thread_cb {
my $mset = $srch->mset($ctx->{query}, $ctx->{qopts});
my $size = $mset->size or return;
$ctx->{qopts}->{offset} += $size;
- $ctx->{ids} = $srch->mset_to_artnums($mset);
+ $ctx->{ids} = $srch->mset_to_artnums($mset, $ctx->{qopts});
}
}
@@ -265,7 +265,7 @@ sub mbox_all {
return [404, [qw(Content-Type text/plain)],
["No results found\n"]];
$ctx->{query} = $q_string;
- $ctx->{ids} = $srch->mset_to_artnums($mset);
+ $ctx->{ids} = $srch->mset_to_artnums($mset, $qopts);
require PublicInbox::MboxGz;
my $fn;
if ($q->{t} && $srch->has_threadid) {
diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index ba239255..7785d483 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -90,6 +90,7 @@ sub load_xapian () {
$ENQ_ASCENDING = $x eq 'Xapian' ?
1 : Search::Xapian::ENQ_ASCENDING();
+ *sortable_serialise = $x.'::sortable_serialise';
# n.b. FLAG_PURE_NOT is expensive not suitable for a public
# website as it could become a denial-of-service vector
# FLAG_PHRASE also seems to cause performance problems chert
@@ -334,6 +335,12 @@ sub _enquire_once { # retry_reopen callback
if (defined(my $eidx_key = $opts->{eidx_key})) {
$query = $X{Query}->new(OP_FILTER(), $query, 'O'.$eidx_key);
}
+ if (defined(my $uid_range = $opts->{uid_range})) {
+ my $range = $X{Query}->new(OP_VALUE_RANGE(), UID,
+ sortable_serialise($uid_range->[0]),
+ sortable_serialise($uid_range->[1]));
+ $query = $X{Query}->new(OP_FILTER(), $query, $range);
+ }
my $enquire = $X{Enquire}->new($xdb);
$enquire->set_query($query);
$opts ||= {};
@@ -389,7 +396,6 @@ sub qparse_new ($) {
# for IMAP, undocumented for WWW and may be split off go away
$cb->($qp, $NVRP->new(BYTES, 'bytes:'));
$cb->($qp, $NVRP->new(TS, 'ts:'));
- $cb->($qp, $NVRP->new(UID, 'uid:'));
while (my ($name, $prefix) = each %bool_pfx_external) {
$qp->add_boolean_prefix($name, $_) foreach split(/ /, $prefix);
next prev parent reply other threads:[~2020-12-05 11:10 UTC|newest]
Thread overview: 6+ messages / expand[flat|nested] mbox.gz Atom feed top
2020-12-05 10:11 [PATCH 0/2] isearch: per-inbox search using ->ALL Eric Wong
2020-12-05 10:11 ` [PATCH 1/2] inbox: simplify ->search and callers Eric Wong
2020-12-05 10:11 ` [PATCH 2/2] isearch: emulate per-inbox search with ->ALL Eric Wong
2020-12-05 11:10 ` Eric Wong [this message]
2020-12-05 22:26 ` [PATCH 4/2] search: reinstate "uid:" internal search prefix Eric Wong
2020-12-10 0:41 ` One, All; Some? [was: isearch: per-inbox search using ->ALL] Eric Wong
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
List information: https://public-inbox.org/README
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20201205111045.GA27365@dcvr \
--to=e@80x24.org \
--cc=meta@public-inbox.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox; see mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).