From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id C802F1FB0A; Sat, 5 Dec 2020 11:10:46 +0000 (UTC) Date: Sat, 5 Dec 2020 11:10:45 +0000 From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 3/2] imap: support isearch and reduce Xapian queries Message-ID: <20201205111045.GA27365@dcvr> References: <20201205101138.11973-1-e@80x24.org> MIME-Version: 1.0 Content-Type: text/plain; charset=utf-8 Content-Disposition: inline In-Reply-To: <20201205101138.11973-1-e@80x24.org> List-Id: Since IMAP search (either with Isearch or traditional per-Inbox search) only returns UIDs, we can safely set the limit to the UID slice size(*). With isearch, we can also trust the Xapian result to fit any docid range we specify. Limiting Xapian results to 1000 was making ->ALL docid <=> per-Inbox UID impossible since results could overlap between ranges unpredictably. Finally, we can map the ->ALL docids into per-Inbox UIDs and show them to the client in the UID order of the Inbox, not the docid order of the ->ALL extindex. This also lets us get rid of the "uid:" query parser prefix and use the Xapian::Query API directly to reduce our search prefix footprint. For mbox.gz downloads in WWW, we'll also make a best effort to preserve the order from the Inbox, not the order of extindex; though it's possible large result sets can have non-overlapping windows. (*) by definition, UID slice size is a "safe" value which shouldn't OOM either the server or clients. --- lib/PublicInbox/IMAP.pm | 41 +++++++---------------------- lib/PublicInbox/Isearch.pm | 54 +++++++++++++++++++++++++++++++++----- lib/PublicInbox/Mbox.pm | 6 ++--- lib/PublicInbox/Search.pm | 8 +++++- 4 files changed, 67 insertions(+), 42 deletions(-) diff --git a/lib/PublicInbox/IMAP.pm b/lib/PublicInbox/IMAP.pm index 9599f494..f123eb01 100644 --- a/lib/PublicInbox/IMAP.pm +++ b/lib/PublicInbox/IMAP.pm @@ -1122,33 +1122,6 @@ sub parse_query ($$) { $q; } -sub refill_xap ($$$$) { - my ($self, $uids, $range_info, $q) = @_; - my ($beg, $end) = @$range_info; - my $srch = $self->{ibx}->search; - my $opt = { mset => 2, limit => 1000 }; - my $mset = $srch->mset("$q uid:$beg..$end", $opt); - @$uids = @{$srch->mset_to_artnums($mset)}; - if (@$uids) { - $range_info->[0] = $uids->[-1] + 1; # update $beg - return; # possibly more - } - 0; # all done -} - -sub search_xap_range { # long_response - my ($self, $tag, $q, $range_info, $want_msn) = @_; - my $uids = []; - if (defined(my $err = refill_xap($self, $uids, $range_info, $q))) { - $err ||= 'OK Search done'; - $self->write("\r\n$tag $err\r\n"); - return; - } - msn_convert($self, $uids) if $want_msn; - $self->msg_more(join(' ', '', @$uids)); - 1; # more -} - sub search_common { my ($self, $tag, $query, $want_msn) = @_; my $ibx = $self->{ibx} or return "$tag BAD No mailbox selected\r\n"; @@ -1160,11 +1133,17 @@ sub search_common { long_response($self, \&search_uid_range, $tag, $sql, $range_info, $want_msn); } elsif ($q = $q->{xap}) { - $self->{ibx}->search or + my $srch = $self->{ibx}->isrch or return "$tag BAD search not available for mailbox\r\n"; - $self->msg_more('* SEARCH'); - long_response($self, \&search_xap_range, - $tag, $q, $range_info, $want_msn); + my $opt = { + mset => 2, + limit => UID_SLICE, + uid_range => $range_info + }; + my $mset = $srch->mset($q, $opt); + my $uids = $srch->mset_to_artnums($mset, $opt); + msn_convert($self, $uids) if $want_msn; + "* SEARCH @$uids\r\n$tag OK Search done\r\n"; } else { "$tag BAD Error\r\n"; } diff --git a/lib/PublicInbox/Isearch.pm b/lib/PublicInbox/Isearch.pm index 0ab3b19a..8a1f257a 100644 --- a/lib/PublicInbox/Isearch.pm +++ b/lib/PublicInbox/Isearch.pm @@ -15,12 +15,6 @@ sub new { bless { es => $es, eidx_key => $ibx->eidx_key }, __PACKAGE__; } -sub mset { - my ($self, $str, $opt) = @_; - $self->{es}->mset($str, { $opt ? %$opt : (), - eidx_key => $self->{eidx_key} }); -} - sub _ibx_id ($) { my ($self) = @_; my $sth = $self->{es}->over->dbh->prepare_cached(<<'', undef, 1); @@ -31,11 +25,57 @@ SELECT ibx_id FROM inboxes WHERE eidx_key = ? LIMIT 1 die "E: `$self->{eidx_key}' not in $self->{es}->{topdir}\n"; } + +sub mset { + my ($self, $str, $opt) = @_; + my %opt = $opt ? %$opt : (); + $opt{eidx_key} = $self->{eidx_key}; + if (my $uid_range = $opt{uid_range}) { + my ($beg, $end) = @$uid_range; + my $ibx_id = $self->{-ibx_id} //= _ibx_id($self); + my $dbh = $self->{es}->{over}->dbh; + my $sth = $dbh->prepare_cached(<<'', undef, 1); +SELECT MIN(docid) FROM xref3 WHERE ibx_id = ? AND xnum >= ? AND xnum <= ? + + $sth->execute($ibx_id, $beg, $end); + my @r = ($sth->fetchrow_array); + + $sth = $dbh->prepare_cached(<<'', undef, 1); +SELECT MAX(docid) FROM xref3 WHERE ibx_id = ? AND xnum >= ? AND xnum <= ? + + $sth->execute($ibx_id, $beg, $end); + $r[1] = $sth->fetchrow_array; + if (defined($r[1]) && defined($r[0])) { + $opt{limit} = $r[1] - $r[0] + 1; + } else { + $r[1] //= 0xffffffff; + $r[0] //= 0; + } + $opt{uid_range} = \@r; + } + $self->{es}->mset($str, \%opt); +} + sub mset_to_artnums { - my ($self, $mset) = @_; + my ($self, $mset, $opt) = @_; my $docids = PublicInbox::Search::mset_to_artnums($self->{es}, $mset); my $ibx_id = $self->{-ibx_id} //= _ibx_id($self); my $qmarks = join(',', map { '?' } @$docids); + if ($opt && ($opt->{mset} // 0) == 2) { # opt->{mset} = 2 was used + my $range = ''; + my @r; + if (my $r = $opt->{uid_range}) { + $range = 'AND xnum >= ? AND xnum <= ?'; + @r = @$r; + } + my $rows = $self->{es}->over->dbh-> + selectall_arrayref(<<"", undef, $ibx_id, @$docids, @r); +SELECT xnum FROM xref3 WHERE ibx_id = ? AND docid IN ($qmarks) $range +ORDER BY xnum ASC + + return [ map { $_->[0] } @$rows ]; + } + my $rows = $self->{es}->over->dbh-> selectall_arrayref(<<"", undef, $ibx_id, @$docids); SELECT docid,xnum FROM xref3 WHERE ibx_id = ? AND docid IN ($qmarks) diff --git a/lib/PublicInbox/Mbox.pm b/lib/PublicInbox/Mbox.pm index 19459150..0df31e7f 100644 --- a/lib/PublicInbox/Mbox.pm +++ b/lib/PublicInbox/Mbox.pm @@ -222,7 +222,7 @@ sub results_cb { my $mset = $srch->mset($ctx->{query}, $ctx->{qopts}); my $size = $mset->size or return; $ctx->{qopts}->{offset} += $size; - $ctx->{ids} = $srch->mset_to_artnums($mset); + $ctx->{ids} = $srch->mset_to_artnums($mset, $ctx->{qopts}); } } @@ -244,7 +244,7 @@ sub results_thread_cb { my $mset = $srch->mset($ctx->{query}, $ctx->{qopts}); my $size = $mset->size or return; $ctx->{qopts}->{offset} += $size; - $ctx->{ids} = $srch->mset_to_artnums($mset); + $ctx->{ids} = $srch->mset_to_artnums($mset, $ctx->{qopts}); } } @@ -265,7 +265,7 @@ sub mbox_all { return [404, [qw(Content-Type text/plain)], ["No results found\n"]]; $ctx->{query} = $q_string; - $ctx->{ids} = $srch->mset_to_artnums($mset); + $ctx->{ids} = $srch->mset_to_artnums($mset, $qopts); require PublicInbox::MboxGz; my $fn; if ($q->{t} && $srch->has_threadid) { diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index ba239255..7785d483 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -90,6 +90,7 @@ sub load_xapian () { $ENQ_ASCENDING = $x eq 'Xapian' ? 1 : Search::Xapian::ENQ_ASCENDING(); + *sortable_serialise = $x.'::sortable_serialise'; # n.b. FLAG_PURE_NOT is expensive not suitable for a public # website as it could become a denial-of-service vector # FLAG_PHRASE also seems to cause performance problems chert @@ -334,6 +335,12 @@ sub _enquire_once { # retry_reopen callback if (defined(my $eidx_key = $opts->{eidx_key})) { $query = $X{Query}->new(OP_FILTER(), $query, 'O'.$eidx_key); } + if (defined(my $uid_range = $opts->{uid_range})) { + my $range = $X{Query}->new(OP_VALUE_RANGE(), UID, + sortable_serialise($uid_range->[0]), + sortable_serialise($uid_range->[1])); + $query = $X{Query}->new(OP_FILTER(), $query, $range); + } my $enquire = $X{Enquire}->new($xdb); $enquire->set_query($query); $opts ||= {}; @@ -389,7 +396,6 @@ sub qparse_new ($) { # for IMAP, undocumented for WWW and may be split off go away $cb->($qp, $NVRP->new(BYTES, 'bytes:')); $cb->($qp, $NVRP->new(TS, 'ts:')); - $cb->($qp, $NVRP->new(UID, 'uid:')); while (my ($name, $prefix) = each %bool_pfx_external) { $qp->add_boolean_prefix($name, $_) foreach split(/ /, $prefix);