From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id E94BF1F4B4 for ; Sat, 30 Jan 2021 05:41:09 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH] content_hash: skip Sender for cross posted messages Date: Fri, 29 Jan 2021 23:41:09 -0600 Message-Id: <20210130054109.24815-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: This fixes a regression introduced long ago and matches the behavior originally specified in the comments. It makes a noticeable improvement with search results using -extindex ("all") and lei results with multiple inboxes. Update some style bits at the top of the test case while we're at it. Fixes: f0ef0a56a8957d6f ("v2: improve deduplication checks") --- lib/PublicInbox/ContentHash.pm | 7 +++---- t/content_hash.t | 14 +++++++++++++- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/lib/PublicInbox/ContentHash.pm b/lib/PublicInbox/ContentHash.pm index 838fdd6f..4dbe7b50 100644 --- a/lib/PublicInbox/ContentHash.pm +++ b/lib/PublicInbox/ContentHash.pm @@ -68,10 +68,9 @@ sub content_digest ($) { # Only use Sender: if From is not present foreach my $h (qw(From Sender)) { - my @v = $eml->header($h); - if (@v) { - digest_addr($dig, $h, $_) foreach @v; - } + my @v = $eml->header($h) or next; + digest_addr($dig, $h, $_) foreach @v; + last; } foreach my $h (qw(Subject Date)) { my @v = $eml->header($h); diff --git a/t/content_hash.t b/t/content_hash.t index 3f02b1b3..060665f6 100644 --- a/t/content_hash.t +++ b/t/content_hash.t @@ -1,7 +1,8 @@ +#!perl -w # Copyright (C) 2018-2021 all contributors # License: AGPL-3.0+ use strict; -use warnings; +use v5.10.1; use Test::More; use PublicInbox::ContentHash 
qw(content_hash); use PublicInbox::Eml; @@ -19,6 +20,17 @@ EOF my $orig = content_hash($mime); my $reload = content_hash(PublicInbox::Eml->new($mime->as_string)); is($orig, $reload, 'content_hash matches after serialization'); +{ + my $s1 = PublicInbox::Eml->new($mime->as_string); + $s1->header_set('Sender', 's@example.com'); + is(content_hash($s1), $orig, "Sender ignored when 'From' present"); + my $s2 = PublicInbox::Eml->new($s1->as_string); + $s1->header_set('Sender', 'sender@example.com'); + is(content_hash($s2), $orig, "Sender really ignored 'From'"); + $_->header_set('From') for ($s1, $s2); + isnt(content_hash($s1), content_hash($s2), + 'sender accounted when From missing'); +} foreach my $h (qw(From To Cc)) { my $n = q("Quoted N'Ame" );