From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 689561F9FD for ; Thu, 11 Mar 2021 01:45:39 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH] msg_part_text: discover text in application/octet-stream Date: Wed, 10 Mar 2021 19:45:39 -0600 Message-Id: <20210311014539.19756-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: Some poorly-configured MUAs will send application/octet-stream even for text-only attachments. We can't expect all MUAs to be configured with proper MIME types, and there is plenty of historical mail that falls into this unfortunate category. --- MANIFEST | 1 + lib/PublicInbox/MsgIter.pm | 12 ++++++ t/msg_iter.t | 64 ++++++++++++++++++++++++++++--- xt/eml_octet-stream.t | 77 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 149 insertions(+), 5 deletions(-) create mode 100644 xt/eml_octet-stream.t diff --git a/MANIFEST b/MANIFEST index 8c9c86a0..4757b4fc 100644 --- a/MANIFEST +++ b/MANIFEST @@ -471,6 +471,7 @@ xt/cmp-msgstr.t xt/cmp-msgview.t xt/create-many-inboxes.t xt/eml_check_limits.t +xt/eml_octet-stream.t xt/git-http-backend.t xt/git_async_cmp.t xt/httpd-async-stream.t diff --git a/lib/PublicInbox/MsgIter.pm b/lib/PublicInbox/MsgIter.pm index c503eb98..e2819523 100644 --- a/lib/PublicInbox/MsgIter.pm +++ b/lib/PublicInbox/MsgIter.pm @@ -84,6 +84,18 @@ sub msg_part_text ($$) { # If forcing charset=UTF-8 failed, # caller will warn further down... $s = $part->body if $@; + } elsif ($err && $ct =~ m!\bapplication/octet-stream\b!i) { + # Some unconfigured/poorly-configured MUAs will set + # application/octet-stream even for all text attachments. 
+ # Try to see if it's printable text that we can index + # and display: + $s = $part->body; + if ($s =~ /[^\p{XPosixPrint}\s]/s) { + utf8::decode($s); + $s =~ /[^\p{XPosixPrint}\s]/s ? undef($s) : undef($err); + } else { + undef($err); + } } ($s, $err); } diff --git a/t/msg_iter.t b/t/msg_iter.t index e46d515c..6c52eec8 100644 --- a/t/msg_iter.t +++ b/t/msg_iter.t @@ -1,10 +1,8 @@ # Copyright (C) 2016-2021 all contributors # License: AGPL-3.0+ -use strict; -use warnings; -use Test::More; -use PublicInbox::TestCommon; +use strict; use v5.10.1; use PublicInbox::TestCommon; use PublicInbox::Hval qw(ascii_html); +use MIME::QuotedPrint 3.05 qw(encode_qp); use_ok('PublicInbox::MsgIter'); { @@ -88,5 +86,61 @@ use_ok('PublicInbox::MsgIter'); is($check[1], $nq, 'long quoted section matches'); } +{ + open my $fh, '<', 't/utf8.eml' or BAIL_OUT $!; + my $expect = do { local $/; <$fh> }; + my $qp_patch = encode_qp($expect, "\r\n"); + my $common = <new(<each_part(sub { + my ($part, $level, @ex) = @{$_[0]}; + my ($s, $err) = msg_part_text($part, $part->content_type); + push @parts, $s; + }); + $expect =~ s/\n/\r\n/sg; + is_deeply(\@parts, [ "blah\r\n", $expect ], + 'fallback to application/octet-stream as UTF-8 text'); + + my $qp_binary = encode_qp("Binary\0crap", "\r\n"); + $eml = PublicInbox::Eml->new(<each_part(sub { + my ($part, $level, @ex) = @{$_[0]}; + my ($s, $err) = msg_part_text($part, $part->content_type); + push @parts, $s; + push @err, $err; + }); + is_deeply(\@parts, [ "blah\r\n", undef ], + 'non-text ignored in octet-stream'); + ok($err[1], 'got error for second element'); +} + done_testing(); -1; diff --git a/xt/eml_octet-stream.t b/xt/eml_octet-stream.t new file mode 100644 index 00000000..8173aec2 --- /dev/null +++ b/xt/eml_octet-stream.t @@ -0,0 +1,77 @@ +#!perl -w +# Copyright (C) 2021 all contributors +# License: AGPL-3.0+ +use strict; use v5.10.1; use PublicInbox::TestCommon; +use PublicInbox::Git; +use PublicInbox::Eml; +use PublicInbox::MsgIter 
qw(msg_part_text); +use PublicInbox::LeiToMail; +my $eml2mboxcl2 = PublicInbox::LeiToMail->can('eml2mboxcl2'); +my $git_dir = $ENV{GIANT_GIT_DIR}; +plan 'skip_all' => "GIANT_GIT_DIR not defined for $0" unless defined($git_dir); +use Data::Dumper; +$Data::Dumper::Useqq = 1; +my $mboxfh; +if (my $out = $ENV{DEBUG_MBOXCL2}) { + BAIL_OUT("$out exists") if -s $out; + open $mboxfh, '>', $out or BAIL_OUT "open $out: $!"; +} else { + diag "DEBUG_MBOXCL2 unset, not saving debug output"; +} + +my $git = PublicInbox::Git->new($git_dir); +my @cat = qw(cat-file --buffer --batch-check --batch-all-objects); +if (require_git(2.19, 1)) { + push @cat, '--unordered'; +} else { + warn "git <2.19, cat-file lacks --unordered, locality suffers\n"; +} +my ($errs, $ok, $tot); +$errs = $ok = $tot = 0; +my $ep = sub { # eml->each_part callback + my ($part, $level, @ex) = @{$_[0]}; + ++$tot; + my $ct = $part->content_type // return; + $ct =~ m!\bapplication/octet-stream\b!i or return; + my ($s, $err) = msg_part_text($part, $ct); + if (defined $s) { + ++$ok; + } else { + warn "binary $err\n"; + ++$errs; + my $x = eval { $part->body }; + if ($@) { + warn "decode totally failed: $@"; + } else { + my ($bad) = ($x =~ m/([\p{XPosixPrint}\s]{0,10} + [^\p{XPosixPrint}\s]+ + [\p{XPosixPrint}\s]{0,10})/sx); + warn Dumper([$bad]); + } + + push @{$_[1]}, $err; # $fail + } +}; + +my $cb = sub { + my ($bref, $oid) = @_; + my $eml = PublicInbox::Eml->new($bref); + local $SIG{__WARN__} = sub { diag("$oid ", @_) }; + $eml->each_part($ep, my $fail = []); + if (@$fail && $mboxfh) { + diag "@$fail"; + print $mboxfh ${$eml2mboxcl2->($eml, { blob => $oid })} or + BAIL_OUT "print: $!"; + } +}; +my $cat = $git->popen(@cat); +while (<$cat>) { + my ($oid, $type, $size) = split(/ /); + $git->cat_async($oid, $cb) if $size && $type eq 'blob'; +} +$git->cat_async_wait; +note "$errs errors"; +note "$ok/$tot messages had text as application/octet-stream"; +ok 1; + +done_testing;