* [PATCH 1/3] search: remove unnecessary abstractions and functionality
2017-06-14 0:14 [PATCH 0/3] search improvements Eric Wong
@ 2017-06-14 0:14 ` Eric Wong
2017-06-15 23:11 ` [PATCH 4/3] searchidx: remove messages correctly from Xapian index Eric Wong
2017-06-14 0:14 ` [PATCH 2/3] searchidx: switch to accounting by message bytes Eric Wong
2017-06-14 0:14 ` [PATCH 3/3] search: allow searching within mail diffs Eric Wong
2 siblings, 1 reply; 5+ messages in thread
From: Eric Wong @ 2017-06-14 0:14 UTC (permalink / raw)
To: meta
This simplifies the code a bit and reduces the translation
overhead for looking directly at data from tools shipped
with Xapian.
While we're at it, fix thread-all.t :)
---
lib/PublicInbox/Search.pm | 31 +++++++++----------------------
lib/PublicInbox/SearchIdx.pm | 20 +++++++++-----------
lib/PublicInbox/SearchMsg.pm | 2 +-
t/search.t | 9 +--------
4 files changed, 20 insertions(+), 42 deletions(-)
diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index 82a6e54..67837f4 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -56,8 +56,6 @@ my %bool_pfx_internal = (
);
my %bool_pfx_external = (
- # do we still need these? probably not..
- path => 'XPATH',
mid => 'Q', # uniQue id (Message-ID)
);
@@ -107,11 +105,7 @@ chomp @HELP;
# da (diff a/ removed lines)
# db (diff b/ added lines)
-my %all_pfx = (%bool_pfx_internal, %bool_pfx_external, %prob_prefix);
-
-sub xpfx { $all_pfx{$_[0]} }
-
-my $mail_query = Search::Xapian::Query->new(xpfx('type') . 'mail');
+my $mail_query = Search::Xapian::Query->new('T' . 'mail');
sub xdir {
my (undef, $git_dir) = @_;
@@ -146,11 +140,11 @@ sub get_thread {
my $smsg = eval { $self->lookup_message($mid) };
return { total => 0, msgs => [] } unless $smsg;
- my $qtid = Search::Xapian::Query->new(xpfx('thread').$smsg->thread_id);
+ my $qtid = Search::Xapian::Query->new('G' . $smsg->thread_id);
my $path = $smsg->path;
if (defined $path && $path ne '') {
my $path = id_compress($smsg->path);
- my $qsub = Search::Xapian::Query->new(xpfx('path').$path);
+ my $qsub = Search::Xapian::Query->new('XPATH' . $path);
$qtid = Search::Xapian::Query->new(OP_OR, $qtid, $qsub);
}
$opts ||= {};
@@ -279,7 +273,7 @@ sub lookup_message {
my ($self, $mid) = @_;
$mid = mid_clean($mid);
- my $doc_id = $self->find_unique_doc_id('mid', $mid);
+ my $doc_id = $self->find_unique_doc_id('Q' . $mid);
my $smsg;
if (defined $doc_id) {
# raises on error:
@@ -299,9 +293,9 @@ sub lookup_mail { # no ghosts!
}
sub find_unique_doc_id {
- my ($self, $term, $value) = @_;
+ my ($self, $termval) = @_;
- my ($begin, $end) = $self->find_doc_ids($term, $value);
+ my ($begin, $end) = $self->find_doc_ids($termval);
return undef if $begin->equal($end); # not found
@@ -309,23 +303,16 @@ sub find_unique_doc_id {
# sanity check
$begin->inc;
- $begin->equal($end) or die "Term '$term:$value' is not unique\n";
+ $begin->equal($end) or die "Term '$termval' is not unique\n";
$rv;
}
# returns begin and end PostingIterator
sub find_doc_ids {
- my ($self, $term, $value) = @_;
-
- $self->find_doc_ids_for_term(xpfx($term) . $value);
-}
-
-# returns begin and end PostingIterator
-sub find_doc_ids_for_term {
- my ($self, $term) = @_;
+ my ($self, $termval) = @_;
my $db = $self->{xdb};
- ($db->postlist_begin($term), $db->postlist_end($term));
+ ($db->postlist_begin($termval), $db->postlist_end($termval));
}
# normalize subjects so they are suitable as pathnames for URLs
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index fd0d320..316111b 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -19,7 +19,6 @@ use PublicInbox::MsgIter;
use Carp qw(croak);
use POSIX qw(strftime);
require PublicInbox::Git;
-*xpfx = *PublicInbox::Search::xpfx;
use constant MAX_MID_SIZE => 244; # max term size - 1 in Xapian
use constant {
@@ -160,12 +159,12 @@ sub add_message {
}
$smsg = PublicInbox::SearchMsg->new($mime);
my $doc = $smsg->{doc};
- $doc->add_term(xpfx('mid') . $mid);
+ $doc->add_term('Q' . $mid);
my $subj = $smsg->subject;
if ($subj ne '') {
my $path = $self->subject_path($subj);
- $doc->add_term(xpfx('path') . id_compress($path));
+ $doc->add_term('XPATH' . id_compress($path));
}
add_values($smsg, $bytes, $num);
@@ -332,7 +331,7 @@ sub link_message {
} else {
$tid = defined $old_tid ? $old_tid : $self->next_thread_id;
}
- $doc->add_term(xpfx('thread') . $tid);
+ $doc->add_term('G' . $tid);
}
sub index_blob {
@@ -542,9 +541,9 @@ sub create_ghost {
my $tid = $self->next_thread_id;
my $doc = Search::Xapian::Document->new;
- $doc->add_term(xpfx('mid') . $mid);
- $doc->add_term(xpfx('thread') . $tid);
- $doc->add_term(xpfx('type') . 'ghost');
+ $doc->add_term('Q' . $mid);
+ $doc->add_term('G' . $tid);
+ $doc->add_term('T' . 'ghost');
my $smsg = PublicInbox::SearchMsg->wrap($doc, $mid);
$self->{xdb}->add_document($doc);
@@ -555,15 +554,14 @@ sub create_ghost {
sub merge_threads {
my ($self, $winner_tid, $loser_tid) = @_;
return if $winner_tid == $loser_tid;
- my ($head, $tail) = $self->find_doc_ids('thread', $loser_tid);
- my $thread_pfx = xpfx('thread');
+ my ($head, $tail) = $self->find_doc_ids('G' . $loser_tid);
my $db = $self->{xdb};
for (; $head != $tail; $head->inc) {
my $docid = $head->get_docid;
my $doc = $db->get_document($docid);
- $doc->remove_term($thread_pfx . $loser_tid);
- $doc->add_term($thread_pfx . $winner_tid);
+ $doc->remove_term('G' . $loser_tid);
+ $doc->add_term('G' . $winner_tid);
$db->replace_document($docid, $doc);
}
}
diff --git a/lib/PublicInbox/SearchMsg.pm b/lib/PublicInbox/SearchMsg.pm
index b8eee66..a19d45d 100644
--- a/lib/PublicInbox/SearchMsg.pm
+++ b/lib/PublicInbox/SearchMsg.pm
@@ -14,7 +14,7 @@ use PublicInbox::Address;
sub new {
my ($class, $mime) = @_;
my $doc = Search::Xapian::Document->new;
- $doc->add_term(PublicInbox::Search::xpfx('type') . 'mail');
+ $doc->add_term('T' . 'mail');
bless { type => 'mail', doc => $doc, mime => $mime }, $class;
}
diff --git a/t/search.t b/t/search.t
index c9c4e34..a75dc9b 100644
--- a/t/search.t
+++ b/t/search.t
@@ -95,15 +95,8 @@ sub filter_mids {
is($found->mid, 'root@s', 'mid set correctly');
ok(int($found->thread_id) > 0, 'thread_id is an integer');
+ my ($res, @res);
my @exp = sort qw(root@s last@s);
- my $res = $ro->query("path:hello_world");
- my @res = filter_mids($res);
- is_deeply(\@res, \@exp, 'got expected results for path: match');
-
- foreach my $p (qw(hello hello_ hello_world2 hello_world_)) {
- $res = $ro->query("path:$p");
- is($res->{total}, 0, "path variant `$p' does not match");
- }
$res = $ro->query('s:(Hello world)');
@res = filter_mids($res);
--
EW
^ permalink raw reply related [flat|nested] 5+ messages in thread
* [PATCH 2/3] searchidx: switch to accounting by message bytes
2017-06-14 0:14 [PATCH 0/3] search improvements Eric Wong
2017-06-14 0:14 ` [PATCH 1/3] search: remove unnecessary abstractions and functionality Eric Wong
@ 2017-06-14 0:14 ` Eric Wong
2017-06-14 0:14 ` [PATCH 3/3] search: allow searching within mail diffs Eric Wong
2 siblings, 0 replies; 5+ messages in thread
From: Eric Wong @ 2017-06-14 0:14 UTC (permalink / raw)
To: meta
Xapian memory usage is tied to the size of the indexed
text, so take the raw message size into account when
deciding when to flush Xapian data.
More importantly, we now flush Xapian before we have it
buffer beyond our maximum; and we do it unconditionally
to prevent even high priority processes from OOM-ing.
---
lib/PublicInbox/SearchIdx.pm | 23 +++++++++++++++--------
1 file changed, 15 insertions(+), 8 deletions(-)
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 316111b..30d3fe9 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -20,13 +20,14 @@ use Carp qw(croak);
use POSIX qw(strftime);
require PublicInbox::Git;
-use constant MAX_MID_SIZE => 244; # max term size - 1 in Xapian
use constant {
+ MAX_MID_SIZE => 244, # max term size - 1 in Xapian
PERM_UMASK => 0,
OLD_PERM_GROUP => 1,
OLD_PERM_EVERYBODY => 2,
PERM_GROUP => 0660,
PERM_EVERYBODY => 0664,
+ BATCH_BYTES => 1_000_000,
};
sub new {
@@ -71,7 +72,6 @@ sub _xdb_acquire {
require File::Path;
_lock_acquire($self);
File::Path::mkpath($dir);
- $self->{batch_size} = 100;
$flag = Search::Xapian::DB_CREATE_OR_OPEN;
}
$self->{xdb} = Search::Xapian::WritableDatabase->new($dir, $flag);
@@ -395,6 +395,15 @@ sub index_sync {
with_umask($self, sub { $self->_index_sync($opts) });
}
+sub batch_adjust ($$$$) {
+ my ($max, $bytes, $batch_cb, $latest) = @_;
+ $$max -= $bytes;
+ if ($$max <= 0) {
+ $$max = BATCH_BYTES;
+ $batch_cb->($latest, 1);
+ }
+}
+
sub rlog {
my ($self, $log, $add_cb, $del_cb, $batch_cb) = @_;
my $hex = '[a-f0-9]';
@@ -404,23 +413,21 @@ sub rlog {
my $git = $self->{git};
my $latest;
my $bytes;
- my $max = $self->{batch_size}; # may be undef
+ my $max = BATCH_BYTES;
local $/ = "\n";
my $line;
while (defined($line = <$log>)) {
if ($line =~ /$addmsg/o) {
my $blob = $1;
my $mime = do_cat_mail($git, $blob, \$bytes) or next;
+ batch_adjust(\$max, $bytes, $batch_cb, $latest);
$add_cb->($self, $mime, $bytes, $blob);
} elsif ($line =~ /$delmsg/o) {
my $blob = $1;
- my $mime = do_cat_mail($git, $blob) or next;
+ my $mime = do_cat_mail($git, $blob, \$bytes) or next;
+ batch_adjust(\$max, $bytes, $batch_cb, $latest);
$del_cb->($self, $mime);
} elsif ($line =~ /^commit ($h40)/o) {
- if (defined $max && --$max <= 0) {
- $max = $self->{batch_size};
- $batch_cb->($latest, 1);
- }
$latest = $1;
}
}
--
EW
^ permalink raw reply related [flat|nested] 5+ messages in thread
* [PATCH 3/3] search: allow searching within mail diffs
2017-06-14 0:14 [PATCH 0/3] search improvements Eric Wong
2017-06-14 0:14 ` [PATCH 1/3] search: remove unnecessary abstractions and functionality Eric Wong
2017-06-14 0:14 ` [PATCH 2/3] searchidx: switch to accounting by message bytes Eric Wong
@ 2017-06-14 0:14 ` Eric Wong
2 siblings, 0 replies; 5+ messages in thread
From: Eric Wong @ 2017-06-14 0:14 UTC (permalink / raw)
To: meta
This can be tied into a repository browser to browse
in-flight topics on a mailing list.
---
lib/PublicInbox/Search.pm | 20 +++++--
lib/PublicInbox/SearchIdx.pm | 137 +++++++++++++++++++++++++++++++++++++++++--
2 files changed, 148 insertions(+), 9 deletions(-)
diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index 67837f4..c7c5455 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -75,6 +75,14 @@ my %prob_prefix = (
q => 'XQUOT',
nq => 'XNQ',
+ dfn => 'XDFN',
+ dfa => 'XDFA',
+ dfb => 'XDFB',
+ dfhh => 'XDFHH',
+ dfctx => 'XDFCTX',
+ dfpre => 'XDFPRE',
+ dfpost => 'XDFPOST',
+ dfblob => 'XDFPRE XDFPOST',
# default:
'' => 'XMID S A XNQ XQUOT XFN',
@@ -98,12 +106,16 @@ EOF
'a:' => 'match within the To, Cc, and From headers',
'tc:' => 'match within the To and Cc headers',
'bs:' => 'match within the Subject and body',
+ 'dfn:' => 'match filename from diff',
+ 'dfa:' => 'match diff removed (-) lines',
+ 'dfb:' => 'match diff added (+) lines',
+ 'dfhh:' => 'match diff hunk header context (usually a function name)',
+ 'dfctx:' => 'match diff context lines',
+ 'dfpre:' => 'match pre-image git blob ID',
+ 'dfpost:' => 'match post-image git blob ID',
+ 'dfblob:' => 'match either pre or post-image git blob ID',
);
chomp @HELP;
-# TODO:
-# df (filenames from diff)
-# da (diff a/ removed lines)
-# db (diff b/ added lines)
my $mail_query = Search::Xapian::Query->new('T' . 'mail');
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 30d3fe9..9ba9437 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -28,8 +28,28 @@ use constant {
PERM_GROUP => 0660,
PERM_EVERYBODY => 0664,
BATCH_BYTES => 1_000_000,
+ DEBUG => !!$ENV{DEBUG},
};
+my %GIT_ESC = (
+ a => "\a",
+ b => "\b",
+ f => "\f",
+ n => "\n",
+ r => "\r",
+ t => "\t",
+ v => "\013",
+);
+
+sub git_unquote ($) {
+ my ($s) = @_;
+ return $s unless ($s =~ /\A"(.*)"\z/);
+ $s = $1;
+ $s =~ s/\\([abfnrtv])/$GIT_ESC{$1}/g;
+ $s =~ s/\\([0-7]{1,3})/chr(oct($1))/ge;
+ $s;
+}
+
sub new {
my ($class, $inbox, $creat) = @_;
my $git_dir = $inbox;
@@ -134,11 +154,108 @@ sub index_users ($$) {
$tg->increase_termpos;
}
+sub index_text_inc ($$$) {
+ my ($tg, $text, $pfx) = @_;
+ $tg->index_text($text, 1, $pfx);
+ $tg->increase_termpos;
+}
+
+sub index_old_diff_fn {
+ my ($tg, $seen, $fa, $fb) = @_;
+
+ # no renames or space support for traditional diffs,
+ # find the number of leading common paths to strip:
+ my @fa = split('/', $fa);
+ my @fb = split('/', $fb);
+ while (scalar(@fa) && scalar(@fb)) {
+ $fa = join('/', @fa);
+ $fb = join('/', @fb);
+ if ($fa eq $fb) {
+ index_text_inc($tg, $fa,'XDFN') unless $seen->{$fa}++;
+ return 1;
+ }
+ shift @fa;
+ shift @fb;
+ }
+ 0;
+}
+
+sub index_diff ($$$) {
+ my ($tg, $lines, $doc) = @_;
+ my %seen;
+ my $in_diff;
+ foreach (@$lines) {
+ if ($in_diff && s/^ //) { # diff context
+ index_text_inc($tg, $_, 'XDFCTX');
+ } elsif (/^-- $/) { # email signature begins
+ $in_diff = undef;
+ } elsif (m!^diff --git ("?a/.+) ("?b/.+)\z!) {
+ my ($fa, $fb) = ($1, $2);
+ my $fn = (split('/', git_unquote($fa), 2))[1];
+ index_text_inc($tg, $fn, 'XDFN') unless $seen{$fn}++;
+ $fn = (split('/', git_unquote($fb), 2))[1];
+ index_text_inc($tg, $fn, 'XDFN') unless $seen{$fn}++;
+ $in_diff = 1;
+ # traditional diff:
+ } elsif (m/^diff -(.+) (\S+) (\S+)$/) {
+ my ($opt, $fa, $fb) = ($1, $2, $3);
+ # only support unified:
+ next unless $opt =~ /[uU]/;
+ $in_diff = index_old_diff_fn($tg, \%seen, $fa, $fb);
+ } elsif (m!^--- ("?a/.+)!) {
+ my $fn = (split('/', git_unquote($1), 2))[1];
+ index_text_inc($tg, $fn, 'XDFN') unless $seen{$fn}++;
+ $in_diff = 1;
+ } elsif (m!^\+\+\+ ("?b/.+)!) {
+ my $fn = (split('/', git_unquote($1), 2))[1];
+ index_text_inc($tg, $fn, 'XDFN') unless $seen{$fn}++;
+ $in_diff = 1;
+ } elsif (/^--- (\S+)/) {
+ $in_diff = $1;
+ } elsif (defined $in_diff && /^\+\+\+ (\S+)/) {
+ $in_diff = index_old_diff_fn($tg, \%seen, $in_diff, $1);
+ } elsif ($in_diff && s/^\+//) { # diff added
+ index_text_inc($tg, $_, 'XDFB');
+ } elsif ($in_diff && s/^-//) { # diff removed
+ index_text_inc($tg, $_, 'XDFA');
+ } elsif (m!^index ([a-f0-9]+)\.\.([a-f0-9]+)!) {
+ my ($ba, $bb) = ($1, $2);
+ index_git_blob_id($doc, 'XDFPRE', $ba);
+ index_git_blob_id($doc, 'XDFPOST', $bb);
+ $in_diff = 1;
+ } elsif (/^@@ (?:\S+) (?:\S+) @@\s*$/) {
+ # traditional diff w/o -p
+ } elsif (/^@@ (?:\S+) (?:\S+) @@\s*(\S+.*)$/) {
+ # hunk header context
+ index_text_inc($tg, $1, 'XDFHH');
+ # ignore the following lines:
+ } elsif (/^(?:dis)similarity index/) {
+ } elsif (/^(?:old|new) mode/) {
+ } elsif (/^(?:deleted|new) file mode/) {
+ } elsif (/^(?:copy|rename) (?:from|to) /) {
+ } elsif (/^(?:dis)?similarity index /) {
+ } elsif (/^\\ No newline at end of file/) {
+ } elsif (/^Binary files .* differ/) {
+ } elsif ($_ eq '') {
+ $in_diff = undef;
+ } else {
+ warn "non-diff line: $_\n" if DEBUG && $_ ne '';
+ $in_diff = undef;
+ }
+ }
+}
+
sub index_body ($$$) {
- my ($tg, $lines, $inc) = @_;
- $tg->index_text(join("\n", @$lines), $inc, $inc ? 'XNQ' : 'XQUOT');
- @$lines = ();
+ my ($tg, $lines, $doc) = @_;
+ my $txt = join("\n", @$lines);
+ $tg->index_text($txt, !!$doc, $doc ? 'XNQ' : 'XQUOT');
$tg->increase_termpos;
+ # does it look like a diff?
+ if ($doc && $txt =~ /^(?:diff|---|\+\+\+) /ms) {
+ $txt = undef;
+ index_diff($tg, $lines, $doc);
+ }
+ @$lines = ();
}
sub add_message {
@@ -205,7 +322,7 @@ sub add_message {
my @lines = split(/\n/, $body);
while (defined(my $l = shift @lines)) {
if ($l =~ /^>/) {
- index_body($tg, \@orig, 1) if @orig;
+ index_body($tg, \@orig, $doc) if @orig;
push @quot, $l;
} else {
index_body($tg, \@quot, 0) if @quot;
@@ -213,7 +330,7 @@ sub add_message {
}
}
index_body($tg, \@quot, 0) if @quot;
- index_body($tg, \@orig, 1) if @orig;
+ index_body($tg, \@orig, $doc) if @orig;
});
link_message($self, $smsg, $old_tid);
@@ -339,6 +456,16 @@ sub index_blob {
$self->add_message($mime, $bytes, $num, $blob);
}
+sub index_git_blob_id {
+ my ($doc, $pfx, $objid) = @_;
+
+ my $len = length($objid);
+ for (my $len = length($objid); $len >= 7; ) {
+ $doc->add_term($pfx.$objid);
+ $objid = substr($objid, 0, --$len);
+ }
+}
+
sub unindex_blob {
my ($self, $mime) = @_;
my $mid = eval { mid_clean(mid_mime($mime)) };
--
EW
^ permalink raw reply related [flat|nested] 5+ messages in thread