From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id B675C1F934 for ; Fri, 23 Apr 2021 01:45:14 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 1/2] lei: saved searches support --dedupe= Date: Fri, 23 Apr 2021 01:45:12 +0000 Message-Id: <20210423014513.73103-2-e@80x24.org> In-Reply-To: <20210423014513.73103-1-e@80x24.org> References: <20210423014513.73103-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: This is less surprising in case users are used to using --dedupe= without --save. --- lib/PublicInbox/LeiSavedSearch.pm | 27 ++++++++++++++++++++-- lib/PublicInbox/LeiUp.pm | 4 ++++ t/lei-q-save.t | 37 +++++++++++++++++++++++++++++++ 3 files changed, 66 insertions(+), 2 deletions(-) diff --git a/lib/PublicInbox/LeiSavedSearch.pm b/lib/PublicInbox/LeiSavedSearch.pm index cd9effce..ed217cf2 100644 --- a/lib/PublicInbox/LeiSavedSearch.pm +++ b/lib/PublicInbox/LeiSavedSearch.pm @@ -11,6 +11,7 @@ use PublicInbox::LeiSearch; use PublicInbox::Config; use PublicInbox::Spawn qw(run_die); use PublicInbox::ContentHash qw(git_sha); +use PublicInbox::MID qw(mids_for_index); use Digest::SHA qw(sha256_hex); # move this to PublicInbox::Config if other things use it: @@ -65,6 +66,14 @@ sub list { } @$out } +sub translate_dedupe ($$$) { + my ($self, $lei, $dd) = @_; + $dd //= 'content'; + return 1 if $dd eq 'content'; # the default + return $self->{"-dedupe_$dd"} = 1 if ($dd eq 'oid' || $dd eq 'mid'); + $lei->fail("--dedupe=$dd unsupported with --save"); +} + sub up { # updating existing saved search via "lei up" my ($cls, $lei, $dst) = @_; my $f; @@ -89,6 +98,8 @@ sub new { # new saved search "lei q --save" File::Path::make_path($dir); # raises on error $self->{-cfg} = {}; my $f = $self->{'-f'} = "$dir/lei.saved-search"; + my $dd = $lei->{opt}->{dedupe}; + translate_dedupe($self, $lei, $dd) or return; open my $fh, '>', $f or return $lei->fail("open $f: $!"); my $sq_dst = PublicInbox::Config::squote_maybe($dst); my $q = $lei->{mset_opt}->{q_raw} // die 'BUG: {q_raw} missing'; @@ -105,6 +116,7 @@ sub new { # new saved search "lei q --save" [lei "q"] output = $dst EOM + print $fh "\tdedupe = $dd\n" if $dd; for my $k (ARRAY_FIELDS) { my $ary = $lei->{opt}->{$k} // next; for my $x (@$ary) { @@ -134,14 +146,25 @@ sub is_dup { my ($self, $eml, $smsg) = @_; my $oidx = $self->{oidx} // die 'BUG: no {oidx}'; my $blob = $smsg ? $smsg->{blob} : undef; - return 1 if $blob && $oidx->blob_exists($blob); my $lk = $self->lock_for_scope_fast; + return 1 if $blob && $oidx->blob_exists($blob); + if ($self->{-dedupe_mid}) { + for my $mid (@{mids_for_index($eml)}) { + my ($id, $prv); + return 1 if $oidx->next_by_mid($mid, \$id, \$prv); + } + } if (my $xoids = PublicInbox::LeiSearch::xoids_for($self, $eml, 1)) { for my $docid (values %$xoids) { $oidx->add_xref3($docid, -1, $blob, '.'); } $oidx->commit_lazy; - 1; + if ($self->{-dedupe_oid}) { + $smsg->{blob} //= git_sha(1, $eml)->hexdigest; + exists $xoids->{$smsg->{blob}} ? 1 : undef; + } else { + 1; + } } else { # n.b. above xoids_for fills out eml->{-lei_fake_mid} if needed unless ($smsg) { diff --git a/lib/PublicInbox/LeiUp.pm b/lib/PublicInbox/LeiUp.pm index 0fb9698b..f4ff070b 100644 --- a/lib/PublicInbox/LeiUp.pm +++ b/lib/PublicInbox/LeiUp.pm @@ -25,6 +25,10 @@ sub up1 ($$) { my $o = $lei->{opt}->{output} = $lss->{-cfg}->{'lei.q.output'} // return $lei->fail("lei.q.output unset in $f"); ref($o) and return $lei->fail("multiple values of lei.q.output in $f"); + if (defined(my $dd = $lss->{-cfg}->{'lei.q.dedupe'})) { + $lss->translate_dedupe($lei, $dd) or return; + $lei->{opt}->{dedupe} = $dd; + } for my $k (qw(only include exclude)) { my $v = $lss->{-cfg}->get_all("lei.q.$k") // next; $lei->{opt}->{$k} = $v; diff --git a/t/lei-q-save.t b/t/lei-q-save.t index 5a2f7fff..26ea5cb8 100644 --- a/t/lei-q-save.t +++ b/t/lei-q-save.t @@ -121,5 +121,42 @@ test_lei(sub { unlike($lei_out, qr/mbrd-aug/, 'forget-search completion cleared after forget'); ok(!lei('up', "$home/mbrd-aug"), 'lei up fails after forget'); + + # dedupe=mid + my $o = "$home/dd-mid"; + $in = $doc2->as_string . "\n-------\nappended list sig\n"; + lei_ok [qw(import -q -F eml -)], undef, { 0 => \$in, %$lei_opt }; + lei_ok(qw(q --dedupe=mid --save m:testmessage@example.com -o), $o); + my @m = glob("$o/cur/*"); + is(scalar(@m), 1, '--dedupe=mid w/ --save'); + $in = $doc2->as_string . "\n-------\nanother list sig\n"; + lei_ok [qw(import -q -F eml -)], undef, { 0 => \$in, %$lei_opt }; + lei_ok 'up', $o; + is_deeply([glob("$o/cur/*")], \@m, 'lei up dedupe=mid works'); + + for my $dd (qw(content)) { + $o = "$home/dd-$dd"; + lei_ok(qw(q --save m:testmessage@example.com -o), $o, + "--dedupe=$dd"); + @m = glob("$o/cur/*"); + is(scalar(@m), 3, 'all 3 matches with dedupe='.$dd); + } + + # dedupe=oid + $o = "$home/dd-oid"; + my $ibx = create_inbox 'ibx', indexlevel => 'medium', + tmpdir => "$home/v1", sub {}; + lei_ok(qw(q --save --dedupe=oid m:qp@example.com -o), $o, + '-I', $ibx->{inboxdir}); + @m = glob("$o/cur/*"); + is(scalar(@m), 1, 'got first result'); + + my $im = $ibx->importer(0); + my $diff = "X-Insignificant-Header: x\n".$doc1->as_string; + $im->add(PublicInbox::Eml->new($diff)); + $im->done; + lei_ok('up', $o); + @m = glob("$o/cur/*"); + is(scalar(@m), 2, 'got 2nd result due to different OID'); }); done_testing;