From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.0 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 6EC042018E for ; Wed, 15 Jun 2016 00:37:44 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 7/9] filter: begin work on a new filter API Date: Wed, 15 Jun 2016 00:37:40 +0000 Message-Id: <20160615003742.22538-8-e@80x24.org> In-Reply-To: <20160615003742.22538-1-e@80x24.org> References: <20160615003742.22538-1-e@80x24.org> List-Id: This filter API should be independent of Email::Filter and hopefully less intrusive to long running processes. --- lib/PublicInbox/Filter/Base.pm | 100 +++++++++++++++++++++++++++++++++++++++ lib/PublicInbox/Filter/Mirror.pm | 12 +++++ lib/PublicInbox/Filter/Vger.pm | 33 +++++++++++++ t/filter_base.t | 81 +++++++++++++++++++++++++++++++ t/filter_mirror.t | 40 ++++++++++++++++ t/filter_vger.t | 46 ++++++++++++++++++ 6 files changed, 312 insertions(+) create mode 100644 lib/PublicInbox/Filter/Base.pm create mode 100644 lib/PublicInbox/Filter/Mirror.pm create mode 100644 lib/PublicInbox/Filter/Vger.pm create mode 100644 t/filter_base.t create mode 100644 t/filter_mirror.t create mode 100644 t/filter_vger.t diff --git a/lib/PublicInbox/Filter/Base.pm b/lib/PublicInbox/Filter/Base.pm new file mode 100644 index 0000000..0991e87 --- /dev/null +++ b/lib/PublicInbox/Filter/Base.pm @@ -0,0 +1,100 @@ +# Copyright (C) 2016 all contributors +# License: AGPL-3.0+ +# +# base class for creating per-list or per-project filters +package PublicInbox::Filter::Base; +use strict; +use warnings; +use PublicInbox::MsgIter; +use constant MAX_MID_SIZE => 244; # max term size - 1 in Xapian + +my $NO_HTML = '*** We only accept plain-text mail, no HTML ***'; +our %DEFAULTS = ( + reject_suffix => [ qw(exe bat cmd com pif scr vbs cpl zip) ], + reject_type => [ "text/html:$NO_HTML", "text/xhtml:$NO_HTML", + 'application/vnd.ms-*:No proprietary data formats' ], +); +our $INVALID_FN = qr/\0/; + +sub REJECT () { 100 } +sub ACCEPT { scalar @_ > 1 ? $_[1] : 1 } +sub IGNORE () { 0 } + +my %patmap = ('*' => '.*', '?' => '.', '[' => '[', ']' => ']'); +sub glob2pat { + my ($glob) = @_; + $glob =~ s!(.)!$patmap{$1} || "\Q$1"!ge; + $glob; +} + +sub new { + my ($class, %opts) = @_; + my $self = bless { err => '', %opts }, $class; + foreach my $f (qw(reject_suffix reject_type)) { + # allow undef: + $self->{$f} = $DEFAULTS{$f} unless exists $self->{$f}; + } + if (defined $self->{reject_suffix}) { + my $tmp = $self->{reject_suffix}; + $tmp = join('|', map { glob2pat($_) } @$tmp); + $self->{reject_suffix} = qr/\.($tmp)\s*\z/i; + } + my $rt = []; + if (defined $self->{reject_type}) { + my $tmp = $self->{reject_type}; + @$rt = map { + my ($type, $msg) = split(':', $_, 2); + $type = lc $type; + $msg ||= "Unacceptable Content-Type: $type"; + my $re = glob2pat($type); + [ qr/\b$re\b/i, $msg ]; + } @$tmp; + } + $self->{reject_type} = $rt; + $self; +} + +sub reject ($$) { + my ($self, $reason) = @_; + $self->{err} = $reason; + REJECT; +} + +sub err ($) { $_[0]->{err} } + +# for MDA +sub delivery { + my ($self, $mime) = @_; + + my $rt = $self->{reject_type}; + my $reject_suffix = $self->{reject_suffix} || $INVALID_FN; + my (%sfx, %type); + + msg_iter($mime, sub { + my ($part, $depth, @idx) = @{$_[0]}; + + my $ct = $part->content_type || 'text/plain'; + foreach my $p (@$rt) { + if ($ct =~ $p->[0]) { + $type{$p->[1]} = 1; + } + } + + my $fn = $part->filename; + if (defined($fn) && $fn =~ $reject_suffix) { + $sfx{$1} = 1; + } + }); + + my @r; + if (keys %type) { + push @r, sort keys %type; + } + if (keys %sfx) { + push @r, 'Rejected suffixes(s): '.join(', ', sort keys %sfx); + } + + @r ? $self->reject(join("\n", @r)) : $self->ACCEPT; +} + +1; diff --git a/lib/PublicInbox/Filter/Mirror.pm b/lib/PublicInbox/Filter/Mirror.pm new file mode 100644 index 0000000..d994088 --- /dev/null +++ b/lib/PublicInbox/Filter/Mirror.pm @@ -0,0 +1,12 @@ +# Copyright (C) 2016 all contributors +# License: AGPL-3.0+ + +# Dumb filter for blindly accepting everything +package PublicInbox::Filter::Mirror; +use base qw(PublicInbox::Filter::Base); +use strict; +use warnings; + +sub delivery { $_[0]->ACCEPT }; + +1; diff --git a/lib/PublicInbox/Filter/Vger.pm b/lib/PublicInbox/Filter/Vger.pm new file mode 100644 index 0000000..9498081 --- /dev/null +++ b/lib/PublicInbox/Filter/Vger.pm @@ -0,0 +1,33 @@ +# Copyright (C) 2016 all contributors +# License: AGPL-3.0+ + +# Filter for vger.kernel.org list trailer +package PublicInbox::Filter::Vger; +use base qw(PublicInbox::Filter::Base); +use strict; +use warnings; + +my $l0 = qr/-+/; # older messages only had one '-' +my $l1 = + qr/To unsubscribe from this list: send the line "unsubscribe [\w-]+" in/; +my $l2 = qr/the body of a message to majordomo\@vger\.kernel\.org/; +my $l3 = + qr!More majordomo info at +http://vger\.kernel\.org/majordomo-info\.html!; + +# only LKML had this, and LKML nowadays has no list trailer since Jan 2016 +my $l4 = qr!Please read the FAQ at +http://www\.tux\.org/lkml/!; + +sub delivery { + my ($self, $mime) = @_; + my $s = $mime->as_string; + + # the vger appender seems to only work on the raw string, + # so in multipart (e.g. GPG-signed) messages, the list trailer + # becomes invisible to MIME-aware email clients. + if ($s =~ s/$l0\n$l1\n$l2\n$l3\n($l4\n)?\z//os) { + $mime = Email::MIME->new(\$s); + } + $self->ACCEPT($mime); +} + +1; diff --git a/t/filter_base.t b/t/filter_base.t new file mode 100644 index 0000000..ee5c730 --- /dev/null +++ b/t/filter_base.t @@ -0,0 +1,81 @@ +# Copyright (C) 2016 all contributors +# License: AGPL-3.0+ +use strict; +use warnings; +use Test::More; +use Email::MIME; +use_ok 'PublicInbox::Filter::Base'; + +{ + my $f = PublicInbox::Filter::Base->new; + ok($f, 'created stock object'); + ok(defined $f->{reject_suffix}, 'rejected suffix redefined'); + is(ref($f->{reject_suffix}), 'Regexp', 'reject_suffix should be a RE'); +} + +{ + my $f = PublicInbox::Filter::Base->new(reject_suffix => undef); + ok($f, 'created base object q/o reject_suffix'); + ok(!defined $f->{reject_suffix}, 'reject_suffix not defined'); +} + +{ + my $f = PublicInbox::Filter::Base->new; + my $html_body = "hi"; + my $parts = [ + Email::MIME->create( + attributes => { + content_type => 'text/xhtml; charset=UTF-8', + encoding => 'base64', + }, + body => $html_body, + ), + Email::MIME->create( + attributes => { + content_type => 'text/plain', + encoding => 'quoted-printable', + }, + body => 'hi = "bye"', + ) + ]; + my $email = Email::MIME->create( + header_str => [ + From => 'a@example.com', + Subject => 'blah', + 'Content-Type' => 'multipart/alternative' + ], + parts => $parts, + ); + is($f->delivery($email), 100, "xhtml rejected"); +} + +{ + my $f = PublicInbox::Filter::Base->new; + my $parts = [ + Email::MIME->create( + attributes => { + content_type => 'application/vnd.ms-excel', + encoding => 'base64', + }, + body => 'junk', + ), + Email::MIME->create( + attributes => { + content_type => 'text/plain', + encoding => 'quoted-printable', + }, + body => 'junk', + ) + ]; + my $email = Email::MIME->create( + header_str => [ + From => 'a@example.com', + Subject => 'blah', + 'Content-Type' => 'multipart/mixed' + ], + parts => $parts, + ); + is($f->delivery($email), 100, 'proprietary format rejected on glob'); +} + +done_testing(); diff --git a/t/filter_mirror.t b/t/filter_mirror.t new file mode 100644 index 0000000..01be282 --- /dev/null +++ b/t/filter_mirror.t @@ -0,0 +1,40 @@ +# Copyright (C) 2016 all contributors +# License: AGPL-3.0+ +use strict; +use warnings; +use Test::More; +use Email::MIME; +use_ok 'PublicInbox::Filter::Mirror'; + +my $f = PublicInbox::Filter::Mirror->new; +ok($f, 'created PublicInbox::Filter::Mirror object'); +{ + my $html_body = "hi"; + my $parts = [ + Email::MIME->create( + attributes => { + content_type => 'text/html; charset=UTF-8', + encoding => 'base64', + }, + body => $html_body, + ), + Email::MIME->create( + attributes => { + content_type => 'text/plain', + encoding => 'quoted-printable', + }, + body => 'hi = "bye"', + ) + ]; + my $email = Email::MIME->create( + header_str => [ + From => 'a@example.com', + Subject => 'blah', + 'Content-Type' => 'multipart/alternative' + ], + parts => $parts, + ); + is($f->ACCEPT, $f->delivery($email), 'accept any trash that comes'); +} + +done_testing(); diff --git a/t/filter_vger.t b/t/filter_vger.t new file mode 100644 index 0000000..83a4c9e --- /dev/null +++ b/t/filter_vger.t @@ -0,0 +1,46 @@ +# Copyright (C) 2016 all contributors +# License: AGPL-3.0+ +use strict; +use warnings; +use Test::More; +use Email::MIME; +use_ok 'PublicInbox::Filter::Vger'; + +my $f = PublicInbox::Filter::Vger->new; +ok($f, 'created PublicInbox::Filter::Vger object'); +{ + my $lkml = <<'EOF'; +From: foo@example.com +Subject: test + +keep this +-- +To unsubscribe from this list: send the line "unsubscribe linux-kernel" in +the body of a message to majordomo@vger.kernel.org +More majordomo info at http://vger.kernel.org/majordomo-info.html +Please read the FAQ at http://www.tux.org/lkml/ +EOF + + my $mime = Email::MIME->new($lkml); + $mime = $f->delivery($mime); + is("keep this\n", $mime->body, 'normal message filtered OK'); +} + +{ + my $no_nl = <<'EOF'; +From: foo@example.com +Subject: test + +OSX users :P-- +To unsubscribe from this list: send the line "unsubscribe git" in +the body of a message to majordomo@vger.kernel.org +More majordomo info at http://vger.kernel.org/majordomo-info.html +EOF + + my $mime = Email::MIME->new($no_nl); + $mime = $f->delivery($mime); + is('OSX users :P', $mime->body, 'missing trailing LF in original OK'); +} + + +done_testing();