unofficial mirror of meta@public-inbox.org
 help / color / mirror / Atom feed
From: Eric Wong <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH 3/2] imap: support isearch and reduce Xapian queries
Date: Sat, 5 Dec 2020 11:10:45 +0000	[thread overview]
Message-ID: <20201205111045.GA27365@dcvr> (raw)
In-Reply-To: <20201205101138.11973-1-e@80x24.org>

Since IMAP search (either with Isearch or traditional per-Inbox
search) only returns UIDs, we can safely set the limit to the
UID slice size(*).  With isearch, we can also trust the Xapian
result to fit any docid range we specify.

Limiting Xapian results to 1000 was making the ->ALL docid <=>
per-Inbox UID mapping impossible since results could overlap
between ranges unpredictably.

Finally, we can map the ->ALL docids into per-Inbox UIDs and
show them to the client in the UID order of the Inbox, not the
docid order of the ->ALL extindex.

This also lets us get rid of the "uid:" query parser prefix
and use the Xapian::Query API directly to reduce our search
prefix footprint.

For mbox.gz downloads in WWW, we'll also make a best effort to
preserve the order from the Inbox, not the order of extindex;
though it's possible large result sets can have non-overlapping
windows.

(*) by definition, UID slice size is a "safe" value which
    shouldn't OOM either the server or clients.
---
 lib/PublicInbox/IMAP.pm    | 41 +++++++----------------------
 lib/PublicInbox/Isearch.pm | 54 +++++++++++++++++++++++++++++++++-----
 lib/PublicInbox/Mbox.pm    |  6 ++---
 lib/PublicInbox/Search.pm  |  8 +++++-
 4 files changed, 67 insertions(+), 42 deletions(-)

diff --git a/lib/PublicInbox/IMAP.pm b/lib/PublicInbox/IMAP.pm
index 9599f494..f123eb01 100644
--- a/lib/PublicInbox/IMAP.pm
+++ b/lib/PublicInbox/IMAP.pm
@@ -1122,33 +1122,6 @@ sub parse_query ($$) {
 	$q;
 }
 
-sub refill_xap ($$$$) {
-	my ($self, $uids, $range_info, $q) = @_;
-	my ($beg, $end) = @$range_info;
-	my $srch = $self->{ibx}->search;
-	my $opt = { mset => 2, limit => 1000 };
-	my $mset = $srch->mset("$q uid:$beg..$end", $opt);
-	@$uids = @{$srch->mset_to_artnums($mset)};
-	if (@$uids) {
-		$range_info->[0] = $uids->[-1] + 1; # update $beg
-		return; # possibly more
-	}
-	0; # all done
-}
-
-sub search_xap_range { # long_response
-	my ($self, $tag, $q, $range_info, $want_msn) = @_;
-	my $uids = [];
-	if (defined(my $err = refill_xap($self, $uids, $range_info, $q))) {
-		$err ||= 'OK Search done';
-		$self->write("\r\n$tag $err\r\n");
-		return;
-	}
-	msn_convert($self, $uids) if $want_msn;
-	$self->msg_more(join(' ', '', @$uids));
-	1; # more
-}
-
 sub search_common {
 	my ($self, $tag, $query, $want_msn) = @_;
 	my $ibx = $self->{ibx} or return "$tag BAD No mailbox selected\r\n";
@@ -1160,11 +1133,17 @@ sub search_common {
 		long_response($self, \&search_uid_range,
 				$tag, $sql, $range_info, $want_msn);
 	} elsif ($q = $q->{xap}) {
-		$self->{ibx}->search or
+		my $srch = $self->{ibx}->isrch or
 			return "$tag BAD search not available for mailbox\r\n";
-		$self->msg_more('* SEARCH');
-		long_response($self, \&search_xap_range,
-				$tag, $q, $range_info, $want_msn);
+		my $opt = {
+			mset => 2,
+			limit => UID_SLICE,
+			uid_range => $range_info
+		};
+		my $mset = $srch->mset($q, $opt);
+		my $uids = $srch->mset_to_artnums($mset, $opt);
+		msn_convert($self, $uids) if $want_msn;
+		"* SEARCH @$uids\r\n$tag OK Search done\r\n";
 	} else {
 		"$tag BAD Error\r\n";
 	}
diff --git a/lib/PublicInbox/Isearch.pm b/lib/PublicInbox/Isearch.pm
index 0ab3b19a..8a1f257a 100644
--- a/lib/PublicInbox/Isearch.pm
+++ b/lib/PublicInbox/Isearch.pm
@@ -15,12 +15,6 @@ sub new {
 	bless { es => $es, eidx_key => $ibx->eidx_key }, __PACKAGE__;
 }
 
-sub mset {
-	my ($self, $str, $opt) = @_;
-	$self->{es}->mset($str, { $opt ? %$opt : (),
-				eidx_key => $self->{eidx_key} });
-}
-
 sub _ibx_id ($) {
 	my ($self) = @_;
 	my $sth = $self->{es}->over->dbh->prepare_cached(<<'', undef, 1);
@@ -31,11 +25,57 @@ SELECT ibx_id FROM inboxes WHERE eidx_key = ? LIMIT 1
 		die "E: `$self->{eidx_key}' not in $self->{es}->{topdir}\n";
 }
 
+
+sub mset {
+	my ($self, $str, $opt) = @_;
+	my %opt = $opt ? %$opt : ();
+	$opt{eidx_key} = $self->{eidx_key};
+	if (my $uid_range = $opt{uid_range}) {
+		my ($beg, $end) = @$uid_range;
+		my $ibx_id = $self->{-ibx_id} //= _ibx_id($self);
+		my $dbh = $self->{es}->{over}->dbh;
+		my $sth = $dbh->prepare_cached(<<'', undef, 1);
+SELECT MIN(docid) FROM xref3 WHERE ibx_id = ? AND xnum >= ? AND xnum <= ?
+
+		$sth->execute($ibx_id, $beg, $end);
+		my @r = ($sth->fetchrow_array);
+
+		$sth = $dbh->prepare_cached(<<'', undef, 1);
+SELECT MAX(docid) FROM xref3 WHERE ibx_id = ? AND xnum >= ? AND xnum <= ?
+
+		$sth->execute($ibx_id, $beg, $end);
+		$r[1] = $sth->fetchrow_array;
+		if (defined($r[1]) && defined($r[0])) {
+			$opt{limit} = $r[1] - $r[0] + 1;
+		} else {
+			$r[1] //= 0xffffffff;
+			$r[0] //= 0;
+		}
+		$opt{uid_range} = \@r;
+	}
+	$self->{es}->mset($str, \%opt);
+}
+
 sub mset_to_artnums {
-	my ($self, $mset) = @_;
+	my ($self, $mset, $opt) = @_;
 	my $docids = PublicInbox::Search::mset_to_artnums($self->{es}, $mset);
 	my $ibx_id = $self->{-ibx_id} //= _ibx_id($self);
 	my $qmarks = join(',', map { '?' } @$docids);
+	if ($opt && ($opt->{mset} // 0) == 2) { # opt->{mset} = 2 was used
+		my $range = '';
+		my @r;
+		if (my $r = $opt->{uid_range}) {
+			$range = 'AND xnum >= ? AND xnum <= ?';
+			@r = @$r;
+		}
+		my $rows = $self->{es}->over->dbh->
+			selectall_arrayref(<<"", undef, $ibx_id, @$docids, @r);
+SELECT xnum FROM xref3 WHERE ibx_id = ? AND docid IN ($qmarks) $range
+ORDER BY xnum ASC
+
+		return [ map { $_->[0] } @$rows ];
+	}
+
 	my $rows = $self->{es}->over->dbh->
 			selectall_arrayref(<<"", undef, $ibx_id, @$docids);
 SELECT docid,xnum FROM xref3 WHERE ibx_id = ? AND docid IN ($qmarks)
diff --git a/lib/PublicInbox/Mbox.pm b/lib/PublicInbox/Mbox.pm
index 19459150..0df31e7f 100644
--- a/lib/PublicInbox/Mbox.pm
+++ b/lib/PublicInbox/Mbox.pm
@@ -222,7 +222,7 @@ sub results_cb {
 		my $mset = $srch->mset($ctx->{query}, $ctx->{qopts});
 		my $size = $mset->size or return;
 		$ctx->{qopts}->{offset} += $size;
-		$ctx->{ids} = $srch->mset_to_artnums($mset);
+		$ctx->{ids} = $srch->mset_to_artnums($mset, $ctx->{qopts});
 	}
 }
 
@@ -244,7 +244,7 @@ sub results_thread_cb {
 		my $mset = $srch->mset($ctx->{query}, $ctx->{qopts});
 		my $size = $mset->size or return;
 		$ctx->{qopts}->{offset} += $size;
-		$ctx->{ids} = $srch->mset_to_artnums($mset);
+		$ctx->{ids} = $srch->mset_to_artnums($mset, $ctx->{qopts});
 	}
 
 }
@@ -265,7 +265,7 @@ sub mbox_all {
 			return [404, [qw(Content-Type text/plain)],
 				["No results found\n"]];
 	$ctx->{query} = $q_string;
-	$ctx->{ids} = $srch->mset_to_artnums($mset);
+	$ctx->{ids} = $srch->mset_to_artnums($mset, $qopts);
 	require PublicInbox::MboxGz;
 	my $fn;
 	if ($q->{t} && $srch->has_threadid) {
diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index ba239255..7785d483 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -90,6 +90,7 @@ sub load_xapian () {
 		$ENQ_ASCENDING = $x eq 'Xapian' ?
 				1 : Search::Xapian::ENQ_ASCENDING();
 
+		*sortable_serialise = $x.'::sortable_serialise';
 		# n.b. FLAG_PURE_NOT is expensive not suitable for a public
 		# website as it could become a denial-of-service vector
 		# FLAG_PHRASE also seems to cause performance problems chert
@@ -334,6 +335,12 @@ sub _enquire_once { # retry_reopen callback
 	if (defined(my $eidx_key = $opts->{eidx_key})) {
 		$query = $X{Query}->new(OP_FILTER(), $query, 'O'.$eidx_key);
 	}
+	if (defined(my $uid_range = $opts->{uid_range})) {
+		my $range = $X{Query}->new(OP_VALUE_RANGE(), UID,
+					sortable_serialise($uid_range->[0]),
+					sortable_serialise($uid_range->[1]));
+		$query = $X{Query}->new(OP_FILTER(), $query, $range);
+	}
 	my $enquire = $X{Enquire}->new($xdb);
 	$enquire->set_query($query);
 	$opts ||= {};
@@ -389,7 +396,6 @@ sub qparse_new ($) {
 	# for IMAP, undocumented for WWW and may be split off go away
 	$cb->($qp, $NVRP->new(BYTES, 'bytes:'));
 	$cb->($qp, $NVRP->new(TS, 'ts:'));
-	$cb->($qp, $NVRP->new(UID, 'uid:'));
 
 	while (my ($name, $prefix) = each %bool_pfx_external) {
 		$qp->add_boolean_prefix($name, $_) foreach split(/ /, $prefix);

  parent reply	other threads:[~2020-12-05 11:10 UTC|newest]

Thread overview: 6+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-12-05 10:11 [PATCH 0/2] isearch: per-inbox search using ->ALL Eric Wong
2020-12-05 10:11 ` [PATCH 1/2] inbox: simplify ->search and callers Eric Wong
2020-12-05 10:11 ` [PATCH 2/2] isearch: emulate per-inbox search with ->ALL Eric Wong
2020-12-05 11:10 ` Eric Wong [this message]
2020-12-05 22:26   ` [PATCH 4/2] search: reinstate "uid:" internal search prefix Eric Wong
2020-12-10  0:41 ` One, All; Some? [was: isearch: per-inbox search using ->ALL] Eric Wong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20201205111045.GA27365@dcvr \
    --to=e@80x24.org \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).