From: Eric Wong <e@80x24.org>
To: meta@public-inbox.org
Cc: Eric Wong <e@80x24.org>
Subject: [PATCH 2/2] searchmsg: remove ensure_metadata
Date: Tue, 20 Dec 2016 03:03:57 +0000 [thread overview]
Message-ID: <20161220030357.26350-3-e@80x24.org> (raw)
In-Reply-To: <20161220030357.26350-1-e@80x24.org>
Instead, only preload the ->mid field for threading,
as we only need ->thread and ->path once in Search->get_thread
(but we will need the ->mid field repeatedly).
This more than doubles View->load_results performance,
according to thread-all on an inbox with over 300K messages.
---
lib/PublicInbox/Search.pm | 6 ------
lib/PublicInbox/SearchMsg.pm | 39 ++++++++++++---------------------------
lib/PublicInbox/View.pm | 2 +-
t/search.t | 2 --
4 files changed, 13 insertions(+), 36 deletions(-)
diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index 24cb266..d4f6f77 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -108,12 +108,6 @@ my %all_pfx = (%bool_pfx_internal, %bool_pfx_external, %prob_prefix);
sub xpfx { $all_pfx{$_[0]} }
-our %PFX2TERM_RMAP;
-my %meta_pfx = (mid => 1, thread => 1, path => 1);
-while (my ($k, $v) = each %all_pfx) {
- $PFX2TERM_RMAP{$v} = $k if $meta_pfx{$k};
-}
-
my $mail_query = Search::Xapian::Query->new(xpfx('type') . 'mail');
sub xdir {
diff --git a/lib/PublicInbox/SearchMsg.pm b/lib/PublicInbox/SearchMsg.pm
index d62f02c..96406c6 100644
--- a/lib/PublicInbox/SearchMsg.pm
+++ b/lib/PublicInbox/SearchMsg.pm
@@ -10,7 +10,6 @@ use Search::Xapian;
use Date::Parse qw/str2time/;
use PublicInbox::MID qw/mid_clean/;
use PublicInbox::Address;
-our $PFX2TERM_RE = undef;
sub new {
my ($class, $mime) = @_;
@@ -121,29 +120,17 @@ sub references {
defined $x ? $x : '';
}
-sub ensure_metadata {
- my ($self) = @_;
+sub _get_term_val ($$$) {
+ my ($self, $pfx, $re) = @_;
my $doc = $self->{doc};
my $end = $doc->termlist_end;
-
- unless (defined $PFX2TERM_RE) {
- my $or = join('|', keys %PublicInbox::Search::PFX2TERM_RMAP);
- $PFX2TERM_RE = qr/\A($or)/;
- }
-
- while (my ($pfx, $field) = each %PublicInbox::Search::PFX2TERM_RMAP) {
- # ideally we'd move this out of the loop:
- my $i = $doc->termlist_begin;
-
- $i->skip_to($pfx);
- if ($i != $end) {
- my $val = $i->get_termname;
-
- if ($val =~ s/$PFX2TERM_RE//o) {
- $self->{$field} = $val;
- }
- }
+ my $i = $doc->termlist_begin;
+ $i->skip_to($pfx);
+ if ($i != $end) {
+ my $val = $i->get_termname;
+ $val =~ s/$re// and return $val;
}
+ undef;
}
sub mid ($;$) {
@@ -154,8 +141,8 @@ sub mid ($;$) {
} elsif (my $rv = $self->{mid}) {
$rv;
} else {
- $self->ensure_metadata; # needed for ghosts
- $self->{mid} ||= $self->_extract_mid;
+ $self->{mid} = _get_term_val($self, 'Q', qr/\AQ/) ||
+ $self->_extract_mid;
}
}
@@ -194,16 +181,14 @@ sub thread_id {
my ($self) = @_;
my $tid = $self->{thread};
return $tid if defined $tid;
- $self->ensure_metadata;
- $self->{thread};
+ $self->{thread} = _get_term_val($self, 'G', qr/\AG/); # *G*roup
}
sub path {
my ($self) = @_;
my $path = $self->{path};
return $path if defined $path;
- $self->ensure_metadata;
- $self->{path};
+ $self->{path} = _get_term_val($self, 'XPATH', qr/\AXPATH/); # path
}
1;
diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm
index fa47a16..a50cb64 100644
--- a/lib/PublicInbox/View.pm
+++ b/lib/PublicInbox/View.pm
@@ -737,7 +737,7 @@ sub indent_for {
sub load_results {
my ($srch, $sres) = @_;
my $msgs = delete $sres->{msgs};
- $srch->retry_reopen(sub { [ map { $_->ensure_metadata; $_ } @$msgs ] });
+ $srch->retry_reopen(sub { [ map { $_->mid; $_ } @$msgs ] });
}
sub msg_timestamp {
diff --git a/t/search.t b/t/search.t
index eed9c9b..c16811d 100644
--- a/t/search.t
+++ b/t/search.t
@@ -109,7 +109,6 @@ sub filter_mids {
my $found = $ro->lookup_message('<root@s>');
ok($found, "message found");
is($root_id, $found->{doc_id}, 'doc_id set correctly');
- $found->ensure_metadata;
is($found->mid, 'root@s', 'mid set correctly');
ok(int($found->thread_id) > 0, 'thread_id is an integer');
@@ -290,7 +289,6 @@ sub filter_mids {
body => "LOOP!\n"));
ok($doc_id > 0, "doc_id defined with circular reference");
my $smsg = $rw->lookup_message('circle@a');
- $smsg->ensure_metadata;
is($smsg->references, '', "no references created");
my $msg = PublicInbox::SearchMsg->load_doc($smsg->{doc});
is($s, $msg->subject, 'long subject not rewritten');
--
EW
prev parent reply other threads:[~2016-12-20 3:04 UTC|newest]
Thread overview: 3+ messages / expand[flat|nested] mbox.gz Atom feed top
2016-12-20 3:03 [PATCH 0/2] improve threading performance Eric Wong
2016-12-20 3:03 ` [PATCH 1/2] tests: add thread-all testing for benchmarking Eric Wong
2016-12-20 3:03 ` Eric Wong [this message]
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
List information: https://public-inbox.org/README
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20161220030357.26350-3-e@80x24.org \
--to=e@80x24.org \
--cc=meta@public-inbox.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).