From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.1 required=3.0 tests=ALL_TRUSTED,AWL,BAYES_00, DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,DKIM_VALID_EF, T_SCC_BODY_TEXT_LINE shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 2E3261F727; Thu, 7 Jul 2022 09:40:31 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=80x24.org; s=selector1; t=1657186831; bh=KoKDFmd5EU9WtpwGXD/uyKqa6TZFkdaH2bFpBkqWykE=; h=From:To:Subject:Date:In-Reply-To:References:From; b=sLQAIbIn7L1FcD2wRn82CHatlq9K03FXTBtlMrqnVA+S19u8Ro1ZcfKnqzN1Xflqc +Zt9Hahex8vu5hWmkUuMvrUbApHHweo1LjcKjXZtJpZ/0VBkMLOcrCc0W3C9pKQ0Kn 3AzGkGdlwnz6JT5QxW6lAXUdkLu/ZIZuAffkjNpA= From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 2/2] lei: track seen messages to note duplicates Date: Thu, 7 Jul 2022 09:40:30 +0000 Message-Id: <20220707094030.1185793-3-e@80x24.org> In-Reply-To: <20220707094030.1185793-1-e@80x24.org> References: <20220707094030.1185793-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: This may help track down deduplication or other bugs in lei which lead to occasionally missing messages. Link: https://public-inbox.org/meta/CAL_JsqJH8xx_2NyZffNsRXbGXiv3kjmCETvKXt3Yfb0uToLm9Q@mail.gmail.com/ --- lib/PublicInbox/LeiConvert.pm | 8 +++++--- lib/PublicInbox/LeiToMail.pm | 13 ++++++++++--- lib/PublicInbox/LeiXSearch.pm | 20 ++++++++++++-------- 3 files changed, 27 insertions(+), 14 deletions(-) diff --git a/lib/PublicInbox/LeiConvert.pm b/lib/PublicInbox/LeiConvert.pm index 906f3026..59af40de 100644 --- a/lib/PublicInbox/LeiConvert.pm +++ b/lib/PublicInbox/LeiConvert.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2021 all contributors +# Copyright (C) all contributors # License: AGPL-3.0+ # front-end for the "lei convert" sub-command @@ -35,8 +35,10 @@ sub process_inputs { # via wq_do my $lei = $self->{lei}; delete $lei->{1}; delete $self->{wcb}; # commit - my $nr = delete($lei->{-nr_write}) // 0; - $lei->qerr("# converted $nr messages"); + my $nr_w = delete($lei->{-nr_write}) // 0; + my $d = (delete($lei->{-nr_seen}) // 0) - $nr_w; + $d = $d ? " ($d duplicates)" : ''; + $lei->qerr("# converted $nr_w messages$d"); } sub lei_convert { # the main "lei convert" method diff --git a/lib/PublicInbox/LeiToMail.pm b/lib/PublicInbox/LeiToMail.pm index 3c5e7e59..2aa3977e 100644 --- a/lib/PublicInbox/LeiToMail.pm +++ b/lib/PublicInbox/LeiToMail.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2020-2021 all contributors +# Copyright (C) all contributors # License: AGPL-3.0+ # Writes PublicInbox::Eml objects atomically to a mbox variant or Maildir @@ -197,6 +197,7 @@ sub _mbox_write_cb ($$) { sub { # for git_to_mail my ($buf, $smsg, $eml) = @_; $eml //= PublicInbox::Eml->new($buf); + ++$lei->{-nr_seen}; return if $dedupe->is_dup($eml, $smsg); $lse->xsmsg_vmd($smsg) if $lse; $smsg->{-recent} = 1 if $set_recent; @@ -291,6 +292,8 @@ sub _maildir_write_cb ($$) { sub { # for git_to_mail my ($bref, $smsg, $eml) = @_; $dst // return $lei->fail; # dst may be undef-ed in last run + + ++$lei->{-nr_seen}; return if $dedupe && $dedupe->is_dup($eml // PublicInbox::Eml->new($$bref), $smsg); @@ -317,6 +320,8 @@ sub _imap_write_cb ($$) { sub { # for git_to_mail my ($bref, $smsg, $eml) = @_; $mic // return $lei->fail; # mic may be undef-ed in last run + + ++$lei->{-nr_seen}; return if $dedupe && $dedupe->is_dup($eml // PublicInbox::Eml->new($$bref), $smsg); @@ -360,6 +365,7 @@ sub _v2_write_cb ($$) { sub { # for git_to_mail my ($bref, $smsg, $eml) = @_; $eml //= PublicInbox::Eml->new($bref); + ++$lei->{-nr_seen}; return if $dedupe && $dedupe->is_dup($eml, $smsg); $lei->{v2w}->wq_do('add', $eml); # V2Writable->add ++$lei->{-nr_write}; @@ -792,9 +798,10 @@ sub wq_atexit_child { my $lei = $self->{lei}; delete $self->{wcb}; $lei->{ale}->git->async_wait_all; - my $nr = delete($lei->{-nr_write}) or return; + my ($nr_w, $nr_s) = delete(@$lei{qw(-nr_write -nr_seen)}); + $nr_s or return; return if $lei->{early_mua} || !$lei->{-progress} || !$lei->{pkt_op_p}; - $lei->{pkt_op_p}->pkt_do('l2m_progress', $nr); + $lei->{pkt_op_p}->pkt_do('l2m_progress', $nr_w, $nr_s); } # runs on a 1s timer in lei-daemon diff --git a/lib/PublicInbox/LeiXSearch.pm b/lib/PublicInbox/LeiXSearch.pm index 41e79856..6f877019 100644 --- a/lib/PublicInbox/LeiXSearch.pm +++ b/lib/PublicInbox/LeiXSearch.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2020-2021 all contributors +# Copyright (C) all contributors # License: AGPL-3.0+ # Combine any combination of PublicInbox::Search, @@ -163,8 +163,9 @@ sub mset_progress { } sub l2m_progress { - my ($lei, $nr) = @_; - $lei->{-nr_write} += $nr; + my ($lei, $nr_write, $nr_seen) = @_; + $lei->{-nr_write} += $nr_write; + $lei->{-nr_seen} += $nr_seen; } sub query_one_mset { # for --threads and l2m w/o sort @@ -447,13 +448,16 @@ Error closing $lei->{ovv}->{dst}: \$!=$! \$?=$? } if ($lei->{-progress}) { my $tot = $lei->{-mset_total} // 0; - my $nr = $lei->{-nr_write} // 0; + my $nr_w = $lei->{-nr_write} // 0; + my $d = ($lei->{-nr_seen} // 0) - $nr_w; + my $x = "$tot matches"; + $x .= ", $d duplicates" if $d; if ($l2m) { - my $m = "# $nr written to " . - "$lei->{ovv}->{dst} ($tot matches)"; - $nr ? $lei->qfin($m) : $lei->qerr($m); + my $m = "# $nr_w written to " . + "$lei->{ovv}->{dst} ($x)"; + $nr_w ? $lei->qfin($m) : $lei->qerr($m); } else { - $lei->qerr("# $tot matches"); + $lei->qerr("# $x"); } } $lei->start_mua if $l2m && !$l2m->lock_free;