From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.2 required=3.0 tests=ALL_TRUSTED,AWL,BAYES_00, DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,DKIM_VALID_EF shortcircuit=no autolearn=ham autolearn_force=no version=3.4.6 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id B26D51F626 for ; Mon, 20 Feb 2023 09:21:50 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=80x24.org; s=selector1; t=1676884910; bh=2f8nvKAuWHREtgo7a3tCjkzVrTS8QqSSsqblOz1W1zU=; h=From:To:Subject:Date:From; b=knV3HliNrcjGi4hgVohJybBM0oWPqpMM7DeIWNzx21GAR2nbTymvpthCv03FoLvD8 tqrkR8s3Q/bnjZ50PQUTsljH4mr2/DMxHLRmBzV63K8nc/IDj3lXjq56jJvOkj9l+t xK15UipEz1o/4GOTdS1SGerLqTYHN3hUX9J7qu98= From: Eric Wong To: meta@public-inbox.org Subject: [PATCH] searchidx: do not index quoted Base-85 patches Date: Mon, 20 Feb 2023 09:21:50 +0000 Message-Id: <20230220092150.379964-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: Base-85 binary patches were a source of false-positives in results and we've filtered them out of non-quoted text since July 2022. Unfortunately, people were quoting binary patch contents in replies (*sigh*) and triggering false positives in search results. So we must filter out base-85-looking contents from quoted text, too. Followup-to: 8fda04081acde705 (search: do not index base-85 binary patches, 2022-06-20) Followup-to: 840785917bc74c8e (searchidx: skip "delta $N" sections for base-85, 2022-07-19) --- lib/PublicInbox/SearchIdx.pm | 10 ++++++++-- t/search.t | 13 +++++++++++-- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 257b83a5..fc464383 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -37,7 +37,7 @@ our $BATCH_BYTES = $ENV{XAPIAN_FLUSH_THRESHOLD} ? 
0x7fffffff : # typical 32-bit system: (($Config{ptrsize} >= 8 ? 8192 : 1024) * 1024); use constant DEBUG => !!$ENV{DEBUG}; -my $BASE85 = qr/\A[a-zA-Z0-9\!\#\$\%\&\(\)\*\+\-;<=>\?\@\^_`\{\|\}\~]+\z/; +my $BASE85 = qr/[a-zA-Z0-9\!\#\$\%\&\(\)\*\+\-;<=>\?\@\^_`\{\|\}\~]+/; my $xapianlevels = qr/\A(?:full|medium)\z/; my $hex = '[a-f0-9]'; my $OID = $hex .'{40,}'; @@ -270,7 +270,7 @@ sub index_diff ($$$) { push @$xnq, shift(@l); # skip base85 and empty lines - while (@l && ($l[0] =~ /$BASE85/o || + while (@l && ($l[0] =~ /\A$BASE85\h*\z/o || $l[0] !~ /\S/)) { shift @l; } @@ -389,6 +389,12 @@ sub index_xapian { # msg_iter callback undef $s; # free memory for my $txt (@sections) { if ($txt =~ /\A>/) { + if ($txt =~ /^[>\t ]+GIT binary patch\r?/sm) { + # get rid of Base-85 noise + $txt =~ s/^([>\h]+(?:literal|delta) + \x20[0-9]+\r?\n) + (?:[>\h]+$BASE85\h*\r?\n)+/$1/gsmx; + } index_text($self, $txt, 0, 'XQUOT'); } else { # does it look like a diff? diff --git a/t/search.t b/t/search.t index dded6c40..cf639a6d 100644 --- a/t/search.t +++ b/t/search.t @@ -534,7 +534,15 @@ $ibx->with_umask(sub { '20200418222508.GA13918@dcvr', 'Subject search reaches inside message/rfc822'); - $doc_id = $rw->add_message(eml_load('t/data/binary.patch')); + my $eml = eml_load('t/data/binary.patch'); + my $body = $eml->body; + $rw->add_message($eml); + + $body =~ s/^/> /gsm; + $eml = PublicInbox::Eml->new($eml->header_obj->as_string."\n".$body); + $eml->header_set('Message-ID', ''); + $rw->add_message($eml); + $rw->commit_txn_lazy; $ibx->search->reopen; my $res = $query->('HcmV'); @@ -542,8 +550,9 @@ $ibx->with_umask(sub { $res = $query->('IcmZPo000310RR91'); is_deeply($res, [], 'no results against 1-byte binary patch'); $res = $query->('"GIT binary patch"'); - is(scalar(@$res), 1, 'got binary result from "GIT binary patch"'); + is(scalar(@$res), 2, 'got binary results from "GIT binary patch"'); is($res->[0]->{mid}, 'binary-patch-test@example', 'msgid for binary'); + is($res->[1]->{mid}, 
'binary-patch-reply@example', 'msgid for reply'); my $s = $query->('"literal 1"'); is_deeply($s, $res, 'got binary result from exact literal size'); $s = $query->('"literal 2"');