From: "Eric Wong (Contractor, The Linux Foundation)" <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH 16/34] searchidx: avoid excessive XNQ indexing with diffs
Date: Tue, 6 Mar 2018 08:42:24 +0000 [thread overview]
Message-ID: <20180306084242.19988-17-e@80x24.org> (raw)
In-Reply-To: <20180306084242.19988-1-e@80x24.org>
When indexing diffs, we can avoid indexing the diff parts under
XNQ and instead combine the parts in the read-only search
interface. This results in better indexing performance and
10-15% smaller Xapian indices.
---
lib/PublicInbox/Search.pm | 9 +++---
lib/PublicInbox/SearchIdx.pm | 77 ++++++++++++++++++++++++++++----------------
2 files changed, 55 insertions(+), 31 deletions(-)
diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index fb7a126..a1c423c 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -59,6 +59,7 @@ my %bool_pfx_external = (
mid => 'Q', # Message-ID (full/exact), this is mostly uniQue
);
+my $non_quoted_body = 'XNQ XDFN XDFA XDFB XDFHH XDFCTX XDFPRE XDFPOST';
my %prob_prefix = (
# for mairix compatibility
s => 'S',
@@ -69,12 +70,12 @@ my %prob_prefix = (
c => 'XCC',
tcf => 'XTO XCC A',
a => 'XTO XCC A',
- b => 'XNQ XQUOT',
- bs => 'XNQ XQUOT S',
+ b => $non_quoted_body . ' XQUOT',
+ bs => $non_quoted_body . ' XQUOT S',
n => 'XFN',
q => 'XQUOT',
- nq => 'XNQ',
+ nq => $non_quoted_body,
dfn => 'XDFN',
dfa => 'XDFA',
dfb => 'XDFB',
@@ -85,7 +86,7 @@ my %prob_prefix = (
dfblob => 'XDFPRE XDFPOST',
# default:
- '' => 'XM S A XNQ XQUOT XFN',
+ '' => 'XM S A XQUOT XFN ' . $non_quoted_body,
);
# not documenting m: and mid: for now, the using the URLs works w/o Xapian
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 1c10728..1bca3a6 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -175,14 +175,19 @@ sub index_users ($$) {
$tg->increase_termpos;
}
-sub index_text_inc ($$$) {
- my ($tg, $text, $pfx) = @_;
+sub index_diff_inc ($$$$) {
+ my ($tg, $text, $pfx, $xnq) = @_;
+ if (@$xnq) {
+ $tg->index_text(join("\n", @$xnq), 1, 'XNQ');
+ $tg->increase_termpos;
+ @$xnq = ();
+ }
$tg->index_text($text, 1, $pfx);
$tg->increase_termpos;
}
sub index_old_diff_fn {
- my ($tg, $seen, $fa, $fb) = @_;
+ my ($tg, $seen, $fa, $fb, $xnq) = @_;
# no renames or space support for traditional diffs,
# find the number of leading common paths to strip:
@@ -192,7 +197,9 @@ sub index_old_diff_fn {
$fa = join('/', @fa);
$fb = join('/', @fb);
if ($fa eq $fb) {
- index_text_inc($tg, $fa,'XDFN') unless $seen->{$fa}++;
+ unless ($seen->{$fa}++) {
+ index_diff_inc($tg, $fa, 'XDFN', $xnq);
+ }
return 1;
}
shift @fa;
@@ -205,40 +212,46 @@ sub index_diff ($$$) {
my ($tg, $lines, $doc) = @_;
my %seen;
my $in_diff;
+ my @xnq;
+ my $xnq = \@xnq;
foreach (@$lines) {
if ($in_diff && s/^ //) { # diff context
- index_text_inc($tg, $_, 'XDFCTX');
+ index_diff_inc($tg, $_, 'XDFCTX', $xnq);
} elsif (/^-- $/) { # email signature begins
$in_diff = undef;
} elsif (m!^diff --git ("?a/.+) ("?b/.+)\z!) {
my ($fa, $fb) = ($1, $2);
my $fn = (split('/', git_unquote($fa), 2))[1];
- index_text_inc($tg, $fn, 'XDFN') unless $seen{$fn}++;
+ $seen{$fn}++ or index_diff_inc($tg, $fn, 'XDFN', $xnq);
$fn = (split('/', git_unquote($fb), 2))[1];
- index_text_inc($tg, $fn, 'XDFN') unless $seen{$fn}++;
+ $seen{$fn}++ or index_diff_inc($tg, $fn, 'XDFN', $xnq);
$in_diff = 1;
# traditional diff:
} elsif (m/^diff -(.+) (\S+) (\S+)$/) {
my ($opt, $fa, $fb) = ($1, $2, $3);
+ push @xnq, $_;
# only support unified:
next unless $opt =~ /[uU]/;
- $in_diff = index_old_diff_fn($tg, \%seen, $fa, $fb);
+ $in_diff = index_old_diff_fn($tg, \%seen, $fa, $fb,
+ $xnq);
} elsif (m!^--- ("?a/.+)!) {
my $fn = (split('/', git_unquote($1), 2))[1];
- index_text_inc($tg, $fn, 'XDFN') unless $seen{$fn}++;
+ $seen{$fn}++ or index_diff_inc($tg, $fn, 'XDFN', $xnq);
$in_diff = 1;
} elsif (m!^\+\+\+ ("?b/.+)!) {
my $fn = (split('/', git_unquote($1), 2))[1];
- index_text_inc($tg, $fn, 'XDFN') unless $seen{$fn}++;
+ $seen{$fn}++ or index_diff_inc($tg, $fn, 'XDFN', $xnq);
$in_diff = 1;
} elsif (/^--- (\S+)/) {
$in_diff = $1;
+ push @xnq, $_;
} elsif (defined $in_diff && /^\+\+\+ (\S+)/) {
- $in_diff = index_old_diff_fn($tg, \%seen, $in_diff, $1);
+ $in_diff = index_old_diff_fn($tg, \%seen, $in_diff, $1,
+ $xnq);
} elsif ($in_diff && s/^\+//) { # diff added
- index_text_inc($tg, $_, 'XDFB');
+ index_diff_inc($tg, $_, 'XDFB', $xnq);
} elsif ($in_diff && s/^-//) { # diff removed
- index_text_inc($tg, $_, 'XDFA');
+ index_diff_inc($tg, $_, 'XDFA', $xnq);
} elsif (m!^index ([a-f0-9]+)\.\.([a-f0-9]+)!) {
my ($ba, $bb) = ($1, $2);
index_git_blob_id($doc, 'XDFPRE', $ba);
@@ -248,34 +261,44 @@ sub index_diff ($$$) {
# traditional diff w/o -p
} elsif (/^@@ (?:\S+) (?:\S+) @@\s*(\S+.*)$/) {
# hunk header context
- index_text_inc($tg, $1, 'XDFHH');
+ index_diff_inc($tg, $1, 'XDFHH', $xnq);
# ignore the following lines:
- } elsif (/^(?:dis)similarity index/) {
- } elsif (/^(?:old|new) mode/) {
- } elsif (/^(?:deleted|new) file mode/) {
- } elsif (/^(?:copy|rename) (?:from|to) /) {
- } elsif (/^(?:dis)?similarity index /) {
- } elsif (/^\\ No newline at end of file/) {
- } elsif (/^Binary files .* differ/) {
+ } elsif (/^(?:dis)similarity index/ ||
+ /^(?:old|new) mode/ ||
+ /^(?:deleted|new) file mode/ ||
+ /^(?:copy|rename) (?:from|to) / ||
+ /^(?:dis)?similarity index / ||
+ /^\\ No newline at end of file/ ||
+ /^Binary files .* differ/) {
+ push @xnq, $_;
} elsif ($_ eq '') {
$in_diff = undef;
} else {
+ push @xnq, $_;
warn "non-diff line: $_\n" if DEBUG && $_ ne '';
$in_diff = undef;
}
}
+
+ $tg->index_text(join("\n", @xnq), 1, 'XNQ');
+ $tg->increase_termpos;
}
sub index_body ($$$) {
my ($tg, $lines, $doc) = @_;
my $txt = join("\n", @$lines);
- $tg->index_text($txt, !!$doc, $doc ? 'XNQ' : 'XQUOT');
- $tg->increase_termpos;
- # does it look like a diff?
- if ($doc && $txt =~ /^(?:diff|---|\+\+\+) /ms) {
- $txt = undef;
- index_diff($tg, $lines, $doc);
+ if ($doc) {
+ # does it look like a diff?
+ if ($txt =~ /^(?:diff|---|\+\+\+) /ms) {
+ $txt = undef;
+ index_diff($tg, $lines, $doc);
+ } else {
+ $tg->index_text($txt, 1, 'XNQ');
+ }
+ } else {
+ $tg->index_text($txt, 0, 'XQUOT');
}
+ $tg->increase_termpos;
@$lines = ();
}
--
EW
next prev parent reply other threads:[~2018-03-06 8:42 UTC|newest]
Thread overview: 36+ messages / expand[flat|nested] mbox.gz Atom feed top
2018-03-06 8:42 [v2 PATCH 00/34] duplicate handling, smaller Xapian DBs, date fixes Eric Wong (Contractor, The Linux Foundation)
2018-03-06 8:42 ` [PATCH 01/34] v2writable: delete ::Import obj when ->done Eric Wong (Contractor, The Linux Foundation)
2018-03-06 8:42 ` [PATCH 02/34] search: remove informational "warning" message Eric Wong (Contractor, The Linux Foundation)
2018-03-06 8:42 ` [PATCH 03/34] searchidx: add PID to error message when die-ing Eric Wong (Contractor, The Linux Foundation)
2018-03-06 8:42 ` [PATCH 04/34] content_id: special treatment for Message-Id headers Eric Wong (Contractor, The Linux Foundation)
2018-03-06 8:42 ` [PATCH 05/34] evcleanup: disable outside of daemon Eric Wong (Contractor, The Linux Foundation)
2018-03-06 8:42 ` [PATCH 06/34] v2writable: deduplicate detection on add Eric Wong (Contractor, The Linux Foundation)
2018-03-06 8:42 ` [PATCH 07/34] evcleanup: do not create event loop if nothing was registered Eric Wong (Contractor, The Linux Foundation)
2018-03-06 8:42 ` [PATCH 08/34] mid: add `mids' and `references' methods for extraction Eric Wong (Contractor, The Linux Foundation)
2018-03-06 8:42 ` [PATCH 09/34] content_id: use `mids' and `references' for MID extraction Eric Wong (Contractor, The Linux Foundation)
2018-03-06 8:42 ` [PATCH 10/34] searchidx: use new `references' method for parsing References Eric Wong (Contractor, The Linux Foundation)
2018-03-06 8:42 ` [PATCH 11/34] content_id: no need to be human-friendly Eric Wong (Contractor, The Linux Foundation)
2018-03-06 8:42 ` [PATCH 12/34] v2writable: inject new Message-IDs on true duplicates Eric Wong (Contractor, The Linux Foundation)
2018-03-06 8:42 ` [PATCH 13/34] search: revert to using 'Q' as a uniQue id per-Xapian conventions Eric Wong (Contractor, The Linux Foundation)
2018-03-06 8:42 ` [PATCH 14/34] searchidx: support indexing multiple MIDs Eric Wong (Contractor, The Linux Foundation)
2018-03-06 8:42 ` [PATCH 15/34] mid: be strict with References, but loose on Message-Id Eric Wong (Contractor, The Linux Foundation)
2018-03-06 8:42 ` Eric Wong (Contractor, The Linux Foundation) [this message]
2018-03-06 8:42 ` [PATCH 17/34] searchidxskeleton: add a note about locking Eric Wong (Contractor, The Linux Foundation)
2018-03-06 8:42 ` [PATCH 18/34] v2writable: generated Message-ID goes first Eric Wong (Contractor, The Linux Foundation)
2018-03-06 8:42 ` [PATCH 19/34] searchidx: use add_boolean_term for internal terms Eric Wong (Contractor, The Linux Foundation)
2018-03-06 8:42 ` [PATCH 20/34] searchidx: add NNTP article number as a searchable term Eric Wong (Contractor, The Linux Foundation)
2018-03-06 8:42 ` [PATCH 21/34] mid: truncate excessively long MIDs early Eric Wong (Contractor, The Linux Foundation)
2018-03-06 8:42 ` [PATCH 22/34] nntp: use NNTP article numbers for lookups Eric Wong (Contractor, The Linux Foundation)
2018-03-06 8:42 ` [PATCH 23/34] nntp: fix NEWNEWS command Eric Wong (Contractor, The Linux Foundation)
2018-03-06 8:42 ` [PATCH 24/34] searchidx: store the primary MID in doc data for NNTP Eric Wong (Contractor, The Linux Foundation)
2018-03-06 8:42 ` [PATCH 25/34] import: consolidate object info for v2 imports Eric Wong (Contractor, The Linux Foundation)
2018-03-06 8:42 ` [PATCH 26/34] v2: avoid redundant/repeated configs for git partition repos Eric Wong (Contractor, The Linux Foundation)
2018-03-06 8:42 ` [PATCH 27/34] INSTALL: document more optional dependencies Eric Wong (Contractor, The Linux Foundation)
2018-03-06 8:42 ` [PATCH 28/34] search: favor skeleton DB for lookup_mail Eric Wong (Contractor, The Linux Foundation)
2018-03-06 8:42 ` [PATCH 29/34] search: each_smsg_by_mid uses skeleton if available Eric Wong (Contractor, The Linux Foundation)
2018-03-06 8:42 ` [PATCH 30/34] v2writable: remove unnecessary skeleton commit Eric Wong (Contractor, The Linux Foundation)
2018-03-06 8:42 ` [PATCH 31/34] favor Received: date over Date: header globally Eric Wong (Contractor, The Linux Foundation)
2018-03-06 8:42 ` [PATCH 32/34] import: fall back to Sender for extracting name and email Eric Wong (Contractor, The Linux Foundation)
2018-03-06 8:42 ` [PATCH 33/34] scripts/import_vger_from_mbox: perform mboxrd or mboxo escaping Eric Wong (Contractor, The Linux Foundation)
2018-03-06 8:42 ` [PATCH 34/34] v2writable: detect and use previous partition count Eric Wong (Contractor, The Linux Foundation)
2018-03-06 8:53 ` [v2 PATCH 00/34] duplicate handling, smaller Xapian DBs, date fixes Eric Wong
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
List information: https://public-inbox.org/README
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20180306084242.19988-17-e@80x24.org \
--to=e@80x24.org \
--cc=meta@public-inbox.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).