From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id D92D61F770 for ; Sun, 30 Dec 2018 12:41:25 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH] handle "multipart/mixed" messages which are not multipart Date: Sun, 30 Dec 2018 12:41:25 +0000 Message-Id: <20181230124125.28778-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: I've found two examples on https://lore.kernel.org/lkml/ where the messages declared themselves to be "multipart/mixed" but were actually plain text: <87llgalspt.fsf@free.fr> <200308111450.h7BEoOu20077@mail.osdl.org> With the mboxrd downloaded, mutt is able to view them without difficulty. Note: this change requires reindexing Xapian to take effect. However, only two ancient messages are affected: the first was resent by the original sender, and the second is too old to be relevant. 
--- MANIFEST | 1 + lib/PublicInbox/ContentId.pm | 9 +---- lib/PublicInbox/MsgIter.pm | 25 +++++++++++++- lib/PublicInbox/SearchIdx.pm | 14 +------- lib/PublicInbox/View.pm | 25 ++------------ t/psgi_multipart_not.t | 65 ++++++++++++++++++++++++++++++++++++ 6 files changed, 95 insertions(+), 44 deletions(-) create mode 100644 t/psgi_multipart_not.t diff --git a/MANIFEST b/MANIFEST index dc85167..f25a580 100644 --- a/MANIFEST +++ b/MANIFEST @@ -189,6 +189,7 @@ t/precheck.t t/psgi_attach.t t/psgi_bad_mids.t t/psgi_mount.t +t/psgi_multipart_not.t t/psgi_search.t t/psgi_text.t t/psgi_v2.t diff --git a/lib/PublicInbox/ContentId.pm b/lib/PublicInbox/ContentId.pm index b1d27eb..dd3155b 100644 --- a/lib/PublicInbox/ContentId.pm +++ b/lib/PublicInbox/ContentId.pm @@ -75,14 +75,7 @@ sub content_digest ($) { } $dig->add("b\0"); my $ct = $part->content_type || 'text/plain'; - my $s = eval { $part->body_str }; - if ($@ && $ct =~ m!\btext/plain\b!i) { - # Try to assume UTF-8 because Alpine - # seems to do wacky things and set - # charset=X-UNKNOWN - $part->charset_set('UTF-8'); - $s = eval { $part->body_str }; - } + my ($s, undef) = msg_part_text($part, $ct); if (defined $s) { $s =~ s/\r\n/\n/gs; $s =~ s/\s*\z//s; diff --git a/lib/PublicInbox/MsgIter.pm b/lib/PublicInbox/MsgIter.pm index a795f61..9e2d797 100644 --- a/lib/PublicInbox/MsgIter.pm +++ b/lib/PublicInbox/MsgIter.pm @@ -5,7 +5,7 @@ package PublicInbox::MsgIter; use strict; use warnings; use base qw(Exporter); -our @EXPORT = qw(msg_iter); +our @EXPORT = qw(msg_iter msg_part_text); use PublicInbox::MIME; # Like Email::MIME::walk_parts, but this is: @@ -34,4 +34,27 @@ sub msg_iter ($$) { } } +sub msg_part_text ($$) { + my ($part, $ct) = @_; + + my $s = eval { $part->body_str }; + my $err = $@; + + # text/plain is the default, multipart/mixed happened a few + # times when it should not have been: + # <87llgalspt.fsf@free.fr> + # <200308111450.h7BEoOu20077@mail.osdl.org> + if ($ct =~ m!\btext/plain\b!i || $ct =~ 
m!\bmultipart/mixed\b!i) { + # Try to assume UTF-8 because Alpine seems to + # do wacky things and set charset=X-UNKNOWN + $part->charset_set('UTF-8'); + $s = eval { $part->body_str }; + + # If forcing charset=UTF-8 failed, + # caller will warn further down... + $s = $part->body if $@; + } + ($s, $err); +} + 1; diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index ca832ad..76f3f33 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -305,19 +305,7 @@ sub add_xapian ($$$$$) { $self->index_text($fn, 1, 'XFN'); } - return if $ct =~ m!\btext/x?html\b!i; - - my $s = eval { $part->body_str }; - if ($@) { - if ($ct =~ m!\btext/plain\b!i) { - # Try to assume UTF-8 because Alpine - # seems to do wacky things and set - # charset=X-UNKNOWN - $part->charset_set('UTF-8'); - $s = eval { $part->body_str }; - $s = $part->body if $@; - } - } + my ($s, undef) = msg_part_text($part, $ct); defined $s or return; my (@orig, @quot); diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm index 86acd82..bb49c03 100644 --- a/lib/PublicInbox/View.pm +++ b/lib/PublicInbox/View.pm @@ -543,33 +543,14 @@ sub add_text_body { my ($part, $depth) = @$p; # attachment @idx is unused my $ct = $part->content_type || 'text/plain'; my $fn = $part->filename; + my ($s, $err) = msg_part_text($part, $ct); - if ($ct =~ m!\btext/x?html\b!i) { - return attach_link($upfx, $ct, $p, $fn); - } - - my $s = eval { $part->body_str }; - - # badly-encoded message? tell the world about it! - my $err = $@; - if ($err) { - if ($ct =~ m!\btext/plain\b!i) { - # Try to assume UTF-8 because Alpine seems to - # do wacky things and set charset=X-UNKNOWN - $part->charset_set('UTF-8'); - $s = eval { $part->body_str }; - - # If forcing charset=UTF-8 failed, - # attach_link will warn further down... 
- $s = $part->body if $@; - } else { - return attach_link($upfx, $ct, $p, $fn); - } - } + return attach_link($upfx, $ct, $p, $fn) unless defined $s; my @lines = split(/^/m, $s); $s = ''; if (defined($fn) || $depth > 0 || $err) { + # badly-encoded message with $err? tell the world about it! $s .= attach_link($upfx, $ct, $p, $fn, $err); $s .= "\n"; } diff --git a/t/psgi_multipart_not.t b/t/psgi_multipart_not.t new file mode 100644 index 0000000..4c9fa57 --- /dev/null +++ b/t/psgi_multipart_not.t @@ -0,0 +1,65 @@ +# Copyright (C) 2018 all contributors +# License: AGPL-3.0+ +use strict; +use warnings; +use Test::More; +use File::Temp qw/tempdir/; +use Email::MIME; +use PublicInbox::Config; +use PublicInbox::WWW; +my @mods = qw(DBD::SQLite Search::Xapian HTTP::Request::Common + Plack::Test URI::Escape Plack::Builder Plack::Test); +foreach my $mod (@mods) { + eval "require $mod"; + plan skip_all => "$mod missing for psgi_multipart_not.t" if $@; +} +use_ok($_) for @mods; +use_ok 'PublicInbox::V2Writable'; +my $repo = tempdir('pi-psgi-multipart-not.XXXXXX', TMPDIR => 1, CLEANUP => 1); +my $ibx = PublicInbox::Inbox->new({ + mainrepo => $repo, + name => 'multipart-not', + version => 2, + -primary_address => 'test@example.com', +}); +my $im = PublicInbox::V2Writable->new($ibx, 1); +$im->{parallel} = 0; + +my $mime = PublicInbox::MIME->new(<<'EOF'); +Message-Id: <200308111450.h7BEoOu20077@mail.osdl.org> +To: linux-kernel@vger.kernel.org +Subject: [OSDL] linux-2.6.0-test3 reaim results +Mime-Version: 1.0 +Content-Type: multipart/mixed ; + boundary="==_Exmh_120757360" +Date: Mon, 11 Aug 2003 07:50:24 -0700 +From: exmh user + +Freed^Wmultipart ain't what it used to be +EOF + +ok($im->add($mime), 'added broken multipart message'); +$im->done; + +my $cfgpfx = "publicinbox.v2test"; +my $cfg = { + "$cfgpfx.address" => $ibx->{-primary_address}, + "$cfgpfx.mainrepo" => $repo, +}; +my $config = PublicInbox::Config->new($cfg); +my $www = PublicInbox::WWW->new($config); + +my ($res, 
$raw); +test_psgi(sub { $www->call(@_) }, sub { + my ($cb) = @_; + for my $u ('/v2test/?q=%22ain\'t what it used to be%22&x=t', + '/v2test/new.atom', '/v2test/new.html') { + $res = $cb->(GET($u)); + $raw = $res->content; + ok(index($raw, 'Freed^Wmultipart') >= 0, $u); + ok(index($raw, 'Warning: decoded text') >= 0, $u.' warns'); + } +}); + +done_testing(); +1; -- EW