From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 4958E1F8C6; Sat, 4 Sep 2021 21:36:58 +0000 (UTC) Date: Sat, 4 Sep 2021 21:36:58 +0000 From: Eric Wong To: Konstantin Ryabitsev Cc: meta@public-inbox.org Subject: [PATCH] lei_to_mail+mbox_reader: fix handling of empty/bogus emails Message-ID: <20210904213658.GA27941@dcvr> References: <20210902211225.pmnykwcwcxeaunt5@meerkat.local> <20210902215850.GA5063@dcvr> <20210903151500.h72mzcpqixgtytjs@meerkat.local> MIME-Version: 1.0 Content-Type: text/plain; charset=utf-8 Content-Disposition: inline In-Reply-To: <20210903151500.h72mzcpqixgtytjs@meerkat.local> List-Id: Konstantin Ryabitsev wrote: > Yep, that seems to work fine. Question -- I noticed that lei just issues a > regular query, retrieves results with curl and then parses the output. Is > there a danger of potentially running into issues with parsing the regular > HTML output if it changes in the future? It's actually parsing gzipped mboxrd (&x=m). But you're right we could use stronger safeguards in case we see gzipped HTML or something else... ----------8<--------- Subject: [PATCH] lei_to_mail+mbox_reader: fix handling of empty/bogus emails We may be handling invalid mboxes, so just return no objects in that case. While "lei q" on HTTP(S) externals expects a gzipped mboxrd, there's always a chance something else gzipped can be sent to us. There's also changes to lei_to_mail to better handle emails which lack a body and/or headers (e.g. t/solve/bare.patch) Link: https://public-inbox.org/meta/20210903151500.h72mzcpqixgtytjs@meerkat.local/ --- lib/PublicInbox/Eml.pm | 8 ++++++++ lib/PublicInbox/LeiToMail.pm | 21 +++++++-------------- lib/PublicInbox/MboxReader.pm | 3 ++- t/mbox_reader.t | 23 +++++++++++++++++++++++ 4 files changed, 40 insertions(+), 15 deletions(-) diff --git a/lib/PublicInbox/Eml.pm b/lib/PublicInbox/Eml.pm index 955d6a96..0867a016 100644 --- a/lib/PublicInbox/Eml.pm +++ b/lib/PublicInbox/Eml.pm @@ -480,6 +480,14 @@ sub charset_set { sub crlf { $_[0]->{crlf} // "\n" } +sub raw_size { + my ($self) = @_; + my $len = length(${$self->{hdr}}); + defined($self->{bdy}) and + $len += length(${$self->{bdy}}) + length($self->{crlf}); + $len; +} + # warnings to ignore when handling spam mailboxes and maybe other places sub warn_ignore { my $s = "@_"; diff --git a/lib/PublicInbox/LeiToMail.pm b/lib/PublicInbox/LeiToMail.pm index 6e102a1d..1221d3c7 100644 --- a/lib/PublicInbox/LeiToMail.pm +++ b/lib/PublicInbox/LeiToMail.pm @@ -109,32 +109,25 @@ sub _mboxcl_common ($$$) { $$buf .= 'Content-Length: '.length($$bdy).$crlf. 'Lines: '.$lines.$crlf.$crlf; substr($$bdy, 0, 0, $$buf); # prepend header - $_[0] = $bdy; + $$bdy .= $crlf; + $bdy; } # mboxcl still escapes "From " lines sub eml2mboxcl { my ($eml, $smsg) = @_; my $buf = _mbox_hdr_buf($eml, 'mboxcl', $smsg); - my $crlf = $eml->{crlf}; - if (my $bdy = delete $eml->{bdy}) { - $$bdy =~ s/^From />From /gm; - _mboxcl_common($buf, $bdy, $crlf); - } - $$buf .= $crlf; - $buf; + my $bdy = delete($eml->{bdy}) // \(my $empty = ''); + $$bdy =~ s/^From />From /gm; + _mboxcl_common($buf, $bdy, $eml->{crlf}); } # mboxcl2 has no "From " escaping sub eml2mboxcl2 { my ($eml, $smsg) = @_; my $buf = _mbox_hdr_buf($eml, 'mboxcl2', $smsg); - my $crlf = $eml->{crlf}; - if (my $bdy = delete $eml->{bdy}) { - _mboxcl_common($buf, $bdy, $crlf); - } - $$buf .= $crlf; - $buf; + my $bdy = delete($eml->{bdy}) // \(my $empty = ''); + _mboxcl_common($buf, $bdy, $eml->{crlf}); } sub git_to_mail { # git->cat_async callback diff --git a/lib/PublicInbox/MboxReader.pm b/lib/PublicInbox/MboxReader.pm index 9291f00b..5a754cb8 100644 --- a/lib/PublicInbox/MboxReader.pm +++ b/lib/PublicInbox/MboxReader.pm @@ -41,7 +41,7 @@ sub _mbox_from { $raw =~ s/^\r?\n\z//ms; $raw =~ s/$from_re/$1/gms; my $eml = PublicInbox::Eml->new(\$raw); - $eml_cb->($eml, @arg); + $eml_cb->($eml, @arg) if $eml->raw_size; } return if $r == 0; # EOF } @@ -96,6 +96,7 @@ sub _mbox_cl ($$$;@) { $$hdr =~ s/\A[\r\n]*From [^\n]*\n//s or die "E: no 'From ' line in:\n", Dumper($hdr); my $eml = PublicInbox::Eml->new($hdr); + next unless $eml->raw_size; my @cl = $eml->header_raw('Content-Length'); my $n = scalar(@cl); $n == 0 and die "E: Content-Length missing in:\n", diff --git a/t/mbox_reader.t b/t/mbox_reader.t index da0ce7f1..e5f57d7b 100644 --- a/t/mbox_reader.t +++ b/t/mbox_reader.t @@ -71,6 +71,12 @@ my $check_fmt = sub { "Content-Length is correct $fmt $cur"); # clobber for ->as_string comparison below $eml->header_set('Content-Length'); + + # special case for t/solve/bare.patch, not sure if we + # should even handle it... + if ($cl[0] eq '0' && ${$eml->{hdr}} eq '') { + delete $eml->{bdy}; + } } else { is(scalar(@cl), 0, "Content-Length unset $fmt $cur"); } @@ -121,4 +127,21 @@ exit 1 is(scalar(grep(/Final/, @x)), 0, 'no incomplete bit'); } +{ + my $html = <hi,how are you +EOM + for my $m (qw(mboxrd mboxcl mboxcl2 mboxo)) { + my (@w, @x); + local $SIG{__WARN__} = sub { push @w, @_ }; + open my $fh, '<', \$html or xbail 'PerlIO::scalar'; + PublicInbox::MboxReader->$m($fh, sub { + push @x, $_[0]->as_string + }); + is_deeply(\@x, [], "messages in invalid $m"); + is_deeply([grep(!/^W: leftover/, @w)], [], + "no extra warnings besides leftover ($m)"); + } +} + done_testing;