* [PATCH 1/3] searchidx: use regexp as first arg for `split' op
2022-06-20 19:27 [PATCH 0/3] search indexing improvements Eric Wong
@ 2022-06-20 19:27 ` Eric Wong
2022-06-20 19:27 ` [PATCH 2/3] search: support "patchid:" prefix (git patch-id --stable) Eric Wong
2022-06-20 19:27 ` [PATCH 3/3] search: do not index base-85 binary patches Eric Wong
2 siblings, 0 replies; 7+ messages in thread
From: Eric Wong @ 2022-06-20 19:27 UTC (permalink / raw)
To: meta
Current implementations of Perl5 don't have optimizations for
single-character string field separators (unlike another non-Perl5
VM I'm familiar with), so pass a regexp as the first arg to `split'.
---
lib/PublicInbox/SearchIdx.pm | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 85fae4ad..50e26050 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -236,8 +236,8 @@ sub index_old_diff_fn {
# no renames or space support for traditional diffs,
# find the number of leading common paths to strip:
- my @fa = split('/', $fa);
- my @fb = split('/', $fb);
+ my @fa = split(m'/', $fa);
+ my @fb = split(m'/', $fb);
while (scalar(@fa) && scalar(@fb)) {
$fa = join('/', @fa);
$fb = join('/', @fb);
@@ -278,12 +278,12 @@ sub index_diff ($$$) {
$xnq);
} elsif (m!^--- ("?[^/]+/.+)!) {
my $fn = $1;
- $fn = (split('/', git_unquote($fn), 2))[1];
+ $fn = (split(m'/', git_unquote($fn), 2))[1];
$seen{$fn}++ or index_diff_inc($self, $fn, 'XDFN', $xnq);
$in_diff = 1;
} elsif (m!^\+\+\+ ("?[^/]+/.+)!) {
my $fn = $1;
- $fn = (split('/', git_unquote($fn), 2))[1];
+ $fn = (split(m'/', git_unquote($fn), 2))[1];
$seen{$fn}++ or index_diff_inc($self, $fn, 'XDFN', $xnq);
$in_diff = 1;
} elsif (/^--- (\S+)/) {
^ permalink raw reply related [flat|nested] 7+ messages in thread
* [PATCH 2/3] search: support "patchid:" prefix (git patch-id --stable)
2022-06-20 19:27 [PATCH 0/3] search indexing improvements Eric Wong
2022-06-20 19:27 ` [PATCH 1/3] searchidx: use regexp as first arg for `split' op Eric Wong
@ 2022-06-20 19:27 ` Eric Wong
2022-06-20 20:01 ` Kyle Meyer
2022-06-20 19:27 ` [PATCH 3/3] search: do not index base-85 binary patches Eric Wong
2 siblings, 1 reply; 7+ messages in thread
From: Eric Wong @ 2022-06-20 19:27 UTC (permalink / raw)
To: meta
This allows easy searching via patch-id from a git commit.
Currently, abbreviations are not supported, and it seems
needless to support them since, AFAIK, git neither generates
nor resolves abbreviated patch-ids anywhere.
---
TODO | 3 ---
lib/PublicInbox/Search.pm | 5 +++--
lib/PublicInbox/SearchIdx.pm | 15 +++++++++++++++
t/extsearch.t | 7 ++++++-
t/v2mda.t | 10 ++++++++--
5 files changed, 32 insertions(+), 8 deletions(-)
diff --git a/TODO b/TODO
index 5be4b5e3..43eee063 100644
--- a/TODO
+++ b/TODO
@@ -137,9 +137,6 @@ all need to be considered for everything we introduce)
* make "git cat-file --batch" detect unlinked packfiles so we don't
have to restart processes (very long-term)
-* support searching based on `git-patch-id --stable` to improve
- bidirectional mapping of commits <=> emails
-
* linter to check validity of config file
* linter option and WWW endpoint to graph relationships and flows
diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index 523003b3..6f9fdde1 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -1,4 +1,4 @@
-# Copyright (C) 2015-2021 all contributors <meta@public-inbox.org>
+# Copyright (C) all contributors <meta@public-inbox.org>
# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
# based on notmuch, but with no concept of folders, files or flags
#
@@ -118,9 +118,10 @@ my %bool_pfx_external = (
dfpre => 'XDFPRE',
dfpost => 'XDFPOST',
dfblob => 'XDFPRE XDFPOST',
+ patchid => 'XDFID',
);
-my $non_quoted_body = 'XNQ XDFN XDFA XDFB XDFHH XDFCTX XDFPRE XDFPOST';
+my $non_quoted_body = 'XNQ XDFN XDFA XDFB XDFHH XDFCTX XDFPRE XDFPOST XDFID';
my %prob_prefix = (
# for mairix compatibility
s => 'S',
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 50e26050..53ec23a5 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -18,6 +18,7 @@ use PublicInbox::MsgIter;
use PublicInbox::IdxStack;
use Carp qw(croak carp);
use POSIX qw(strftime);
+use Fcntl qw(SEEK_SET);
use Time::Local qw(timegm);
use PublicInbox::OverIdx;
use PublicInbox::Spawn qw(spawn);
@@ -349,6 +350,20 @@ sub index_xapian { # msg_iter callback
defined $s or return;
$_[0]->[0] = $part = undef; # free memory
+ if ($s =~ /^(?:diff|---|\+\+\+) /ms) {
+ open(my $fh, '+>:utf8', undef) or die "open: $!";
+ open(my $eh, '+>', undef) or die "open: $!";
+ $fh->autoflush(1);
+ print $fh $s or die "print: $!";
+ sysseek($fh, 0, SEEK_SET) or die "sysseek: $!";
+ my $id = ($self->{ibx} // $self->{eidx})->git->qx(
+ [qw(patch-id --stable)],
+ {}, { 0 => $fh, 2 => $eh });
+ $id =~ /\A([a-f0-9]{40,})/ and $doc->add_term('XDFID'.$1);
+ seek($eh, 0, SEEK_SET) or die "seek: $!";
+ while (<$eh>) { warn $_ }
+ }
+
# split off quoted and unquoted blocks:
my @sections = PublicInbox::MsgIter::split_quotes($s);
undef $s; # free memory
diff --git a/t/extsearch.t b/t/extsearch.t
index 09cbdabe..2d7375d6 100644
--- a/t/extsearch.t
+++ b/t/extsearch.t
@@ -314,7 +314,12 @@ if ('reindex catches missed messages') {
is($new->{subject}, $eml->header('Subject'), 'new message added');
$es->{xdb}->reopen;
- my $mset = $es->mset("mid:$new->{mid}");
+ # git patch-id --stable <t/data/0001.patch | awk '{print $1}'
+ my $patchid = '91ee6b761fc7f47cad9f2b09b10489f313eb5b71';
+ my $mset = $es->search->mset("patchid:$patchid");
+ is($mset->size, 1, 'patchid search works');
+
+ $mset = $es->mset("mid:$new->{mid}");
is($mset->size, 1, 'previously unseen, now indexed in Xapian');
ok($im->remove($eml), 'remove new message from v2 inbox');
diff --git a/t/v2mda.t b/t/v2mda.t
index 3dfc569e..8f2f335d 100644
--- a/t/v2mda.t
+++ b/t/v2mda.t
@@ -1,7 +1,8 @@
-# Copyright (C) 2018-2021 all contributors <meta@public-inbox.org>
+#!perl -w
+# Copyright (C) all contributors <meta@public-inbox.org>
# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use v5.10.1;
use strict;
-use warnings;
use Test::More;
use Fcntl qw(SEEK_SET);
use Cwd;
@@ -88,6 +89,11 @@ is($eml->as_string, $mime->as_string, 'injected message');
$pre = $ibx->search->mset_to_smsg($ibx, $pre);
$post = $ibx->search->mset_to_smsg($ibx, $post);
is($post->[0]->{blob}, $pre->[0]->{blob}, 'same message in both cases');
+
+ # git patch-id --stable <t/data/0001.patch | awk '{print $1}'
+ my $patchid = '91ee6b761fc7f47cad9f2b09b10489f313eb5b71';
+ my $mset = $ibx->search->mset("patchid:$patchid");
+ is($mset->size, 1, 'patchid search works');
}
done_testing();
^ permalink raw reply related [flat|nested] 7+ messages in thread
* [PATCH 3/3] search: do not index base-85 binary patches
2022-06-20 19:27 [PATCH 0/3] search indexing improvements Eric Wong
2022-06-20 19:27 ` [PATCH 1/3] searchidx: use regexp as first arg for `split' op Eric Wong
2022-06-20 19:27 ` [PATCH 2/3] search: support "patchid:" prefix (git patch-id --stable) Eric Wong
@ 2022-06-20 19:27 ` Eric Wong
2 siblings, 0 replies; 7+ messages in thread
From: Eric Wong @ 2022-06-20 19:27 UTC (permalink / raw)
To: meta
Base-85 binary patches generated by git lead to many false
positives, so skip over gibberish words which may occur in them.
To avoid regressions in search results, continue to allow
searching for exact size matches (via "literal $SIZE") and for
the phrase "GIT binary patch" to detect the mere presence of a
binary patch.
---
MANIFEST | 1 +
TODO | 2 --
lib/PublicInbox/SearchIdx.pm | 52 +++++++++++++++++++++++++-----------
t/data/binary.patch | 20 ++++++++++++++
t/search.t | 15 +++++++++++
5 files changed, 72 insertions(+), 18 deletions(-)
create mode 100644 t/data/binary.patch
diff --git a/MANIFEST b/MANIFEST
index ce2cf4a5..607a4c5b 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -397,6 +397,7 @@ t/content_hash.t
t/convert-compact.t
t/data-gen/.gitignore
t/data/0001.patch
+t/data/binary.patch
t/data/message_embed.eml
t/dir_idle.t
t/ds-kqxs.t
diff --git a/TODO b/TODO
index 43eee063..7a27fdd2 100644
--- a/TODO
+++ b/TODO
@@ -153,8 +153,6 @@ all need to be considered for everything we introduce)
* support UUCP addresses for legacy archives
-* decode (skip indexing of) base-85 binary patches to avoid false-positives
-
* support pipelining as an IMAP/NNTP client for -watch + lei
* auto-detect and reload on TLS cert+key changes in daemons
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 53ec23a5..cbfe7816 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -36,9 +36,8 @@ our $BATCH_BYTES = $ENV{XAPIAN_FLUSH_THRESHOLD} ? 0x7fffffff :
# assume a typical 64-bit system has 8x more RAM than a
# typical 32-bit system:
(($Config{ptrsize} >= 8 ? 8192 : 1024) * 1024);
-
use constant DEBUG => !!$ENV{DEBUG};
-
+my $BASE85 = qr/\A[a-zA-Z0-9\!\#\$\%\&\(\)\*\+\-;<=>\?\@\^_`\{\|\}\~]+\z/;
my $xapianlevels = qr/\A(?:full|medium)\z/;
my $hex = '[a-f0-9]';
my $OID = $hex .'{40,}';
@@ -258,21 +257,42 @@ sub index_diff ($$$) {
my ($self, $txt, $doc) = @_;
my %seen;
my $in_diff;
- my @xnq;
- my $xnq = \@xnq;
- foreach (split(/\n/, $txt)) {
- if ($in_diff && s/^ //) { # diff context
+ my $xnq = [];
+ my @l = split(/\n/, $$txt);
+ undef $$txt;
+ while (defined($_ = shift @l)) {
+ if ($in_diff && /^GIT binary patch/) {
+ push @$xnq, $_;
+ while (@l && $l[0] =~ /^literal /) {
+ # TODO allow searching by size range?
+ # allows searching by exact size via:
+ # "literal $SIZE"
+ push @$xnq, shift(@l);
+
+ # skip base85 and empty lines
+ while (@l && ($l[0] =~ /$BASE85/o ||
+ $l[0] !~ /\S/)) {
+ shift @l;
+ }
+ # loop hits trailing "literal 0\nHcmV?d00001\n"
+ }
+ } elsif ($in_diff && s/^ //) { # diff context
index_diff_inc($self, $_, 'XDFCTX', $xnq);
} elsif (/^-- $/) { # email signature begins
$in_diff = undef;
- } elsif (m!^diff --git "?[^/]+/.+ "?[^/]+/.+\z!) {
- # wait until "---" and "+++" to capture filenames
+ } elsif (m!^diff --git ("?[^/]+/.+) ("?[^/]+/.+)\z!) {
+ # capture filenames here for binary diffs:
+ my ($fa, $fb) = ($1, $2);
+ push @$xnq, $_;
$in_diff = 1;
- push @xnq, $_;
+ $fa = (split(m'/', git_unquote($fa), 2))[1];
+ $fb = (split(m'/', git_unquote($fb), 2))[1];
+ $seen{$fa}++ or index_diff_inc($self, $fa, 'XDFN', $xnq);
+ $seen{$fb}++ or index_diff_inc($self, $fb, 'XDFN', $xnq);
# traditional diff:
} elsif (m/^diff -(.+) (\S+) (\S+)$/) {
my ($opt, $fa, $fb) = ($1, $2, $3);
- push @xnq, $_;
+ push @$xnq, $_;
# only support unified:
next unless $opt =~ /[uU]/;
$in_diff = index_old_diff_fn($self, \%seen, $fa, $fb,
@@ -288,8 +308,8 @@ sub index_diff ($$$) {
$seen{$fn}++ or index_diff_inc($self, $fn, 'XDFN', $xnq);
$in_diff = 1;
} elsif (/^--- (\S+)/) {
- $in_diff = $1;
- push @xnq, $_;
+ $in_diff = $1; # old diff filename
+ push @$xnq, $_;
} elsif (defined $in_diff && /^\+\+\+ (\S+)/) {
$in_diff = index_old_diff_fn($self, \%seen, $in_diff,
$1, $xnq);
@@ -315,19 +335,19 @@ sub index_diff ($$$) {
/^(?:dis)?similarity index / ||
/^\\ No newline at end of file/ ||
/^Binary files .* differ/) {
- push @xnq, $_;
+ push @$xnq, $_;
} elsif ($_ eq '') {
# possible to be in diff context, some mail may be
# stripped by MUA or even GNU diff(1). "git apply"
# treats a bare "\n" as diff context, too
} else {
- push @xnq, $_;
+ push @$xnq, $_;
warn "non-diff line: $_\n" if DEBUG && $_ ne '';
$in_diff = undef;
}
}
- index_text($self, join("\n", @xnq), 1, 'XNQ');
+ index_text($self, join("\n", @$xnq), 1, 'XNQ');
}
sub index_xapian { # msg_iter callback
@@ -373,7 +393,7 @@ sub index_xapian { # msg_iter callback
} else {
# does it look like a diff?
if ($txt =~ /^(?:diff|---|\+\+\+) /ms) {
- index_diff($self, $txt, $doc);
+ index_diff($self, \$txt, $doc);
} else {
index_text($self, $txt, 1, 'XNQ');
}
diff --git a/t/data/binary.patch b/t/data/binary.patch
new file mode 100644
index 00000000..58717abe
--- /dev/null
+++ b/t/data/binary.patch
@@ -0,0 +1,20 @@
+From 7a1921ba7bd99c63ad6dc6ec0791691ee80e279a Mon Sep 17 00:00:00 2001
+From: BOFH <bofh@example.com>
+Date: Fri, 13 May 2022 23:04:14 +0000
+Subject: [PATCH] binary patch test
+Message-ID: <binary-patch-test@example>
+
+---
+ zero | Bin 0 -> 1 bytes
+ 1 file changed, 0 insertions(+), 0 deletions(-)
+ create mode 100644 zero
+
+diff --git a/zero b/zero
+new file mode 100644
+index 0000000000000000000000000000000000000000..f76dd238ade08917e6712764a16a22005a50573d
+GIT binary patch
+literal 1
+IcmZPo000310RR91
+
+literal 0
+HcmV?d00001
diff --git a/t/search.t b/t/search.t
index 47a67f7f..13210ff5 100644
--- a/t/search.t
+++ b/t/search.t
@@ -533,6 +533,21 @@ $ibx->with_umask(sub {
is($query->('s:"mail header experiments"')->[0]->{mid},
'20200418222508.GA13918@dcvr',
'Subject search reaches inside message/rfc822');
+
+ $doc_id = $rw->add_message(eml_load('t/data/binary.patch'));
+ $rw->commit_txn_lazy;
+ $ibx->search->reopen;
+ my $res = $query->('HcmV');
+ is_deeply($res, [], 'no results against trailer');
+ $res = $query->('IcmZPo000310RR91');
+ is_deeply($res, [], 'no results against 1-byte binary patch');
+ $res = $query->('"GIT binary patch"');
+ is(scalar(@$res), 1, 'got binary result from "GIT binary patch"');
+ is($res->[0]->{mid}, 'binary-patch-test@example', 'msgid for binary');
+ my $s = $query->('"literal 1"');
+ is_deeply($s, $res, 'got binary result from exact literal size');
+ $s = $query->('"literal 2"');
+ is_deeply($s, [], 'no results for wrong size');
});
SKIP: {
^ permalink raw reply related [flat|nested] 7+ messages in thread