* [PATCH 1/2] view: account for filter bugs which leak HTML into the repo
@ 2014-11-13 21:53 Eric Wong
2014-11-13 21:53 ` [PATCH 2/2] -learn: nuke HTML portions when training as ham Eric Wong
0 siblings, 1 reply; 2+ messages in thread
From: Eric Wong @ 2014-11-13 21:53 UTC (permalink / raw)
To: meta; +Cc: Eric Wong
Ugh, apparently there's a (yet-to-be-fixed) bug in the Filter
code which caused an HTML message portion of a multipart message
to be displayed on the web UI. Account for that and nuke it.
---
lib/PublicInbox/View.pm | 14 ++++++++++++--
1 file changed, 12 insertions(+), 2 deletions(-)
diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm
index e76d904..b09c3ba 100644
--- a/lib/PublicInbox/View.pm
+++ b/lib/PublicInbox/View.pm
@@ -104,7 +104,12 @@ sub index_entry {
$mime->walk_parts(sub {
my ($part) = @_;
return if $part->subparts; # walk_parts already recurses
- my $enc = enc_for($part->content_type) || $enc_msg || $enc_utf8;
+ my $ct = $part->content_type;
+
+ # account for filter bugs...
+ return if defined $ct && $ct =~ m!\btext/[xh]+tml\b!i;
+
+ my $enc = enc_for($ct) || $enc_msg || $enc_utf8;
if ($part_nr > 0) {
my $fn = $part->filename;
@@ -178,7 +183,12 @@ sub multipart_text_as_html {
$mime->walk_parts(sub {
my ($part) = @_;
return if $part->subparts; # walk_parts already recurses
- my $enc = enc_for($part->content_type) || $enc_msg || $enc_utf8;
+ my $ct = $part->content_type;
+
+ # account for filter bugs...
+ return if defined $ct && $ct =~ m!\btext/[xh]+tml\b!i;
+
+ my $enc = enc_for($ct) || $enc_msg || $enc_utf8;
if ($part_nr > 0) {
my $fn = $part->filename;
--
EW
^ permalink raw reply related [flat|nested] 2+ messages in thread
* [PATCH 2/2] -learn: nuke HTML portions when training as ham
2014-11-13 21:53 [PATCH 1/2] view: account for filter bugs which leak HTML into the repo Eric Wong
@ 2014-11-13 21:53 ` Eric Wong
0 siblings, 0 replies; 2+ messages in thread
From: Eric Wong @ 2014-11-13 21:53 UTC (permalink / raw)
To: meta; +Cc: Eric Wong
Sometimes people send HTML email and I forget to fix it up in my
MUA during moderation. Automatically strip out HTML portions
instead.
---
public-inbox-learn | 19 ++++++++++---------
t/mda.t | 41 +++++++++++++++++++++++++++++++++++++++++
2 files changed, 51 insertions(+), 9 deletions(-)
diff --git a/public-inbox-learn b/public-inbox-learn
index 13b75b7..db0a1bb 100755
--- a/public-inbox-learn
+++ b/public-inbox-learn
@@ -24,9 +24,16 @@ foreach my $h (qw(Cc To)) {
}
}
-my $in = $mime->as_string;
-$mime->body_set('');
+my ($name, $email, $date);
+
+if ($train eq "ham") {
+ require PublicInbox::MDA;
+ require PublicInbox::Filter;
+ PublicInbox::Filter->run($mime);
+ ($name, $email, $date) = PublicInbox::MDA->author_info($mime);
+}
+my $in = $mime->as_string;
my $err = 0;
my @output = qw(> /dev/null > /dev/null);
@@ -50,16 +57,10 @@ foreach my $recipient (keys %dests) {
}
}
} else { # $train eq "ham"
- require PublicInbox::MDA;
- require PublicInbox::Filter;
-
- # no checking for errors here, we assume the message has
+ # no checking for spam here, we assume the message has
# been reviewed by a human at this point:
- PublicInbox::Filter->run($mime);
PublicInbox::MDA->set_list_headers($mime, $dst);
- my ($name, $email, $date) =
- PublicInbox::MDA->author_info($mime);
local $ENV{GIT_AUTHOR_NAME} = $name;
local $ENV{GIT_AUTHOR_EMAIL} = $email;
local $ENV{GIT_AUTHOR_DATE} = $date;
diff --git a/t/mda.t b/t/mda.t
index fad96e5..53712a5 100644
--- a/t/mda.t
+++ b/t/mda.t
@@ -205,14 +205,55 @@ EOF
my $in = $simple->as_string;
# now train it
+ # these should be overridden
local $ENV{GIT_AUTHOR_EMAIL} = 'trainer@example.com';
local $ENV{GIT_COMMITTER_EMAIL} = 'trainer@example.com';
+
run([$learn, "ham"], \$in);
is($?, 0, "learned ham without failure");
my $msg = `ssoma cat $mid $maindir`;
like($msg, qr/\Q$mid\E/, "ham message delivered");
run([$learn, "ham"], \$in);
is($?, 0, "learned ham idempotently ");
+
+ # ensure trained email is filtered, too
+ my $html_body = "<html><body>hi</body></html>";
+ my $parts = [
+ Email::MIME->create(
+ attributes => {
+ content_type => 'text/html; charset=UTF-8',
+ encoding => 'base64',
+ },
+ body => $html_body,
+ ),
+ Email::MIME->create(
+ attributes => {
+ content_type => 'text/plain',
+ encoding => 'quoted-printable',
+ },
+ body => 'hi = "bye"',
+ )
+ ];
+ $mid = 'multipart-html-sucks@11';
+ my $mime = Email::MIME->create(
+ header_str => [
+ From => 'a@example.com',
+ Subject => 'blah',
+ Cc => $addr,
+ 'Message-ID' => "<$mid>",
+ 'Content-Type' => 'multipart/alternative',
+ ],
+ parts => $parts,
+ );
+
+ {
+ $in = $mime->as_string;
+ run([$learn, "ham"], \$in);
+ is($?, 0, "learned ham without failure");
+ $msg = `ssoma cat $mid $maindir`;
+ like($msg, qr/<\Q$mid\E>/, "ham message delivered");
+ unlike($msg, qr/<html>/i, '<html> filtered');
+ }
}
# faildir - emergency destination is maildir
--
EW
^ permalink raw reply related [flat|nested] 2+ messages in thread
end of thread, other threads:[~2014-11-13 21:53 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz, follow: Atom feed)
-- links below jump to the message on this page --
2014-11-13 21:53 [PATCH 1/2] view: account for filter bugs which leak HTML into the repo Eric Wong
2014-11-13 21:53 ` [PATCH 2/2] -learn: nuke HTML portions when training as ham Eric Wong
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).