From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.0 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 49BEA1FAF1 for ; Tue, 6 Mar 2018 08:42:43 +0000 (UTC) From: "Eric Wong (Contractor, The Linux Foundation)" To: meta@public-inbox.org Subject: [PATCH 15/34] mid: be strict with References, but loose on Message-Id Date: Tue, 6 Mar 2018 08:42:23 +0000 Message-Id: <20180306084242.19988-16-e@80x24.org> In-Reply-To: <20180306084242.19988-1-e@80x24.org> References: <20180306084242.19988-1-e@80x24.org> List-Id: Traditionally we've been more lax on parsing Message-Id and allow it without the angle brackets. We've always been strict on References and can't have it be pointlessly large when some MUA decides to use HTML-escaped angle brackets ("&lt;", "&gt;"). --- lib/PublicInbox/MID.pm | 45 +++++++++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 14 deletions(-) diff --git a/lib/PublicInbox/MID.pm b/lib/PublicInbox/MID.pm index 4ccb704..9608539 100644 --- a/lib/PublicInbox/MID.pm +++ b/lib/PublicInbox/MID.pm @@ -49,16 +49,39 @@ sub mid2path { sub mid_mime ($) { $_[0]->header_obj->header_raw('Message-ID') } -sub uniq_mids { - my ($hdr, @fields) = @_; - my %seen; - my @raw; - foreach my $f (@fields) { - push @raw, $hdr->header_raw($f); +sub mids ($) { + my ($hdr) = @_; + my @mids; + my @v = $hdr->header_raw('Message-Id'); + foreach my $v (@v) { + my @cur = ($v =~ /<([^>]+)>/sg); + if (@cur) { + push(@mids, @cur); + } else { + push(@mids, $v); + } } - my @mids = (join(' ', @raw) =~ /<([^>]+)>/g); - my $mids = scalar(@mids) == 0 ? 
\@raw: \@mids; + uniq_mids(\@mids); +} + +# last References should be IRT, but some mail clients do things +# out of order, so trust IRT over References iff IRT exists +sub references ($) { + my ($hdr) = @_; + my @mids; + foreach my $f (qw(References In-Reply-To)) { + my @v = $hdr->header_raw($f); + foreach my $v (@v) { + push(@mids, ($v =~ /<([^>]+)>/sg)); + } + } + uniq_mids(\@mids); +} + +sub uniq_mids ($) { + my ($mids) = @_; my @ret; + my %seen; foreach (@$mids) { next if $seen{$_}; push @ret, $_; @@ -67,12 +90,6 @@ sub uniq_mids { \@ret; } -sub mids { uniq_mids($_[0], 'Message-Id') } - -# last References should be IRT, but some mail clients do things -# out of order, so trust IRT over References iff IRT exists -sub references { uniq_mids($_[0], 'References', 'In-Reply-To') } - # RFC3986, section 3.3: sub MID_ESC () { '^A-Za-z0-9\-\._~!\$\&\';\(\)\*\+,;=:@' } sub mid_escape ($) { uri_escape_utf8($_[0], MID_ESC) } -- EW