From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-3.7 required=3.0 tests=ALL_TRUSTED,AWL,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.0 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id E131020191 for ; Fri, 24 Jun 2016 20:47:18 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 5/6] watch_maildir: implement optional spam checking Date: Fri, 24 Jun 2016 20:47:17 +0000 Message-Id: <20160624204718.27540-5-e@80x24.org> In-Reply-To: <20160624204718.27540-1-e@80x24.org> References: <20160624204718.27540-1-e@80x24.org> List-Id: Mailing lists I watch and mirror may not have the best spam filtering, and an extra layer should not hurt. --- lib/PublicInbox/Import.pm | 6 +++++- lib/PublicInbox/WatchMaildir.pm | 34 ++++++++++++++++++++++++++++++++-- t/import.t | 6 +++++- t/watch_maildir.t | 34 ++++++++++++++++++++++++++++++++++ 4 files changed, 76 insertions(+), 4 deletions(-) diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm index 5ffc26e..27f36a7 100644 --- a/lib/PublicInbox/Import.pm +++ b/lib/PublicInbox/Import.pm @@ -140,7 +140,7 @@ sub remove { # returns undef on duplicate sub add { - my ($self, $mime) = @_; # mime = Email::MIME + my ($self, $mime, $check_cb) = @_; # mime = Email::MIME my $from = $mime->header('From'); my ($email) = ($from =~ /([^<\s]+\@[^>\s]+)/g); @@ -170,6 +170,10 @@ sub add { # kill potentially confusing/misleading headers $mime->header_set($_) for qw(bytes lines content-length status); + if ($check_cb) { + $mime = $check_cb->($mime) or return; + } + $mime = $mime->as_string; my $blob = $self->{mark}++; print $w "blob\nmark :$blob\ndata ", length($mime), "\n" or wfail; diff --git a/lib/PublicInbox/WatchMaildir.pm b/lib/PublicInbox/WatchMaildir.pm index c1fe81e..72bd3d0 100644 --- a/lib/PublicInbox/WatchMaildir.pm +++ b/lib/PublicInbox/WatchMaildir.pm @@ -13,7 +13,9 @@ use PublicInbox::Spawn qw(spawn); sub new { my ($class, $config) = @_; - my (%mdmap, @mdir); + my (%mdmap, @mdir, $spamc); + + # XXX is "publicinboxlearn" really a good namespace for this? my $k = 'publicinboxlearn.watchspam'; if (my $spamdir = $config->{$k}) { if ($spamdir =~ s/\Amaildir://) { @@ -26,6 +28,21 @@ sub new { warn "unsupported $k=$spamdir\n"; } } + + $k = 'publicinboxwatch.spamcheck'; + my $spamcheck = $config->{$k}; + if ($spamcheck) { + if ($spamcheck eq 'spamc') { + $spamcheck = 'PublicInbox::Spamcheck::Spamc'; + } + if ($spamcheck =~ /::/) { + eval "require $spamcheck"; + $spamcheck = _spamcheck_cb($spamcheck->new); + } else { + warn "unsupported $k=$spamcheck\n"; + $spamcheck = undef; + } + } foreach $k (keys %$config) { $k =~ /\Apublicinbox\.([^\.]+)\.watch\z/ or next; my $name = $1; @@ -52,6 +69,7 @@ sub new { my $mdre = join('|', map { quotemeta($_) } @mdir); $mdre = qr!\A($mdre)/!; bless { + spamcheck => $spamcheck, mdmap => \%mdmap, mdir => \@mdir, mdre => $mdre, @@ -136,7 +154,7 @@ sub _try_path { } _force_mid($mime); - $im->add($mime); + $im->add($mime, $self->{spamcheck}); } sub watch { @@ -208,4 +226,16 @@ sub _scrubber_for { undef; } +sub _spamcheck_cb { + my ($sc) = @_; + sub { + my ($mime) = @_; + my $tmp = ''; + if ($sc->spamcheck($mime, \$tmp)) { + return Email::MIME->new(\$tmp); + } + undef; + } +} + 1; diff --git a/t/import.t b/t/import.t index 09c0036..73f92ad 100644 --- a/t/import.t +++ b/t/import.t @@ -30,7 +30,7 @@ is(scalar @revs, 1, 'one revision created'); $mime->header_set('Message-ID', ''); $mime->header_set('Subject', 'msg2'); -like($im->add($mime), qr/\A:\d+\z/, 'added 2nd message'); +like($im->add($mime, sub { $mime }), qr/\A:\d+\z/, 'added 2nd message'); $im->done; @revs = $git->qx(qw(rev-list HEAD)); is(scalar @revs, 2, '2 revisions exist'); @@ -61,5 +61,9 @@ is($mark, 'MISMATCH', 'mark == MISMATCH on mismatch'); is($msg->header('Message-ID'), '', 'Message-ID matches'); isnt($msg->header('Subject'), $mime->header('Subject'), 'subject mismatch'); +$mime->header_set('Message-Id', ''); +is($im->add($mime, sub { undef }), undef, 'check callback fails'); +is($im->remove($mime), undef, 'message not added, so not removed'); + $im->done; done_testing(); diff --git a/t/watch_maildir.t b/t/watch_maildir.t index be1a312..2138963 100644 --- a/t/watch_maildir.t +++ b/t/watch_maildir.t @@ -3,6 +3,7 @@ use Test::More; use File::Temp qw/tempdir/; use Email::MIME; +use Cwd; use PublicInbox::Config; my @mods = qw(Filesys::Notify::Simple); foreach my $mod (@mods) { @@ -86,4 +87,37 @@ More majordomo info at http://vger.kernel.org/majordomo-info.html\n); is(scalar @list, 4, 'four revisions in rev-list'); } +{ + my $fail_bin = getcwd()."/t/fail-bin"; + ok(-x "$fail_bin/spamc", "mock spamc exists"); + my $fail_path = "$fail_bin:$ENV{PATH}"; # for spamc ham mock + local $ENV{PATH} = $fail_path; + PublicInbox::Emergency->new($maildir)->prepare(\$msg); + $config->{'publicinboxwatch.spamcheck'} = 'spamc'; + PublicInbox::WatchMaildir->new($config)->scan; + @list = $git->qx(qw(ls-tree -r --name-only refs/heads/master)); + is(scalar @list, 0, 'tree has no files spamc checked'); + is(unlink(glob("$maildir/new/*")), 1); +} + +{ + my $main_bin = getcwd()."/t/main-bin"; + ok(-x "$main_bin/spamc", "mock spamc exists"); + my $main_path = "$main_bin:$ENV{PATH}"; # for spamc ham mock + local $ENV{PATH} = $main_path; + PublicInbox::Emergency->new($maildir)->prepare(\$msg); + $config->{'publicinboxwatch.spamcheck'} = 'spamc'; + @list = $git->qx(qw(ls-tree -r --name-only refs/heads/master)); + PublicInbox::WatchMaildir->new($config)->scan; + @list = $git->qx(qw(ls-tree -r --name-only refs/heads/master)); + is(scalar @list, 1, 'tree has one file after spamc checked'); + + # XXX: workaround some weird caching/memoization in cat-file, + # shouldn't be an issue in real-world use, though... + $git = PublicInbox::Git->new($git_dir); + + my $mref = $git->cat_file('refs/heads/master:'.$list[0]); + like($$mref, qr/something\n\z/s, 'message scrubbed on import'); +} + done_testing;