* [PATCH] watchmaildir: spam checking and learning improvements
@ 2019-01-10 9:02 Eric Wong
2019-01-10 9:19 ` [REJECT] " Eric Wong
0 siblings, 1 reply; 2+ messages in thread
From: Eric Wong @ 2019-01-10 9:02 UTC (permalink / raw)
To: meta
It turns out spam training ("spamc -L spam") was never implemented
for -watch. So implement that, and take advantage of an
existing feature to pass FDs directly to spamc(1) instead of
reconverting the MIME object into a string.
While we're at it, the spam-checking call is now only performed
once per message when a Maildir supports multiple inboxes.
This changes PublicInbox::Import::add (and V2Writable
equivalent) API to no longer perform spam-checking.
While ->add is one of the few documented API functions in POD,
the spamcheck callback was never documented, so it's gone. I'm
not sure what I was thinking when I made the callback in ->add
instead of just checking spam before it was called...
---
MANIFEST | 1 +
lib/PublicInbox/Import.pm | 7 +--
lib/PublicInbox/V2Writable.pm | 13 ++----
lib/PublicInbox/WatchMaildir.pm | 76 ++++++++++++++++++++++-----------
t/import.t | 3 +-
t/log-bin/spamc | 6 +++
t/watch_maildir.t | 31 +++++++++++++-
7 files changed, 92 insertions(+), 45 deletions(-)
create mode 100755 t/log-bin/spamc
diff --git a/MANIFEST b/MANIFEST
index 73d1047..cabcf7b 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -173,6 +173,7 @@ t/import.t
t/inbox.t
t/init.t
t/linkify.t
+t/log-bin/spamc
t/main-bin/spamc
t/mda.t
t/mda_filter_rubylang.t
diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm
index fd4255c..65a7e8c 100644
--- a/lib/PublicInbox/Import.pm
+++ b/lib/PublicInbox/Import.pm
@@ -356,7 +356,7 @@ ($$$)
# returns undef on duplicate
# returns the :MARK of the most recent commit
sub add {
- my ($self, $mime, $check_cb) = @_; # mime = Email::MIME
+ my ($self, $mime) = @_; # mime = Email::MIME
my ($name, $email) = extract_author_info($mime);
my $hdr = $mime->header_obj;
@@ -383,11 +383,6 @@ sub add {
drop_unwanted_headers($mime);
- # spam check:
- if ($check_cb) {
- $mime = $check_cb->($mime) or return;
- }
-
my $blob = $self->{mark}++;
my $str = $mime->as_string;
my $n = length($str);
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 222df5c..f4f0cfe 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -99,19 +99,12 @@ sub init_inbox {
# returns undef on duplicate or spam
# mimics Import::add and wraps it for v2
sub add {
- my ($self, $mime, $check_cb) = @_;
- $self->{-inbox}->with_umask(sub {
- _add($self, $mime, $check_cb)
- });
+ my ($self, $mime) = @_;
+ $self->{-inbox}->with_umask(sub { _add($self, $mime) });
}
sub _add {
- my ($self, $mime, $check_cb) = @_;
-
- # spam check:
- if ($check_cb) {
- $mime = $check_cb->($mime) or return;
- }
+ my ($self, $mime) = @_;
# All pipes (> $^F) known to Perl 5.6+ have FD_CLOEXEC set,
# as does SQLite 3.4.1+ (released in 2007-07-20), and
diff --git a/lib/PublicInbox/WatchMaildir.pm b/lib/PublicInbox/WatchMaildir.pm
index 2d4c6f4..17ace92 100644
--- a/lib/PublicInbox/WatchMaildir.pm
+++ b/lib/PublicInbox/WatchMaildir.pm
@@ -16,6 +16,7 @@ package PublicInbox::WatchMaildir;
use PublicInbox::Filter::Base;
use PublicInbox::Spamcheck;
*REJECT = *PublicInbox::Filter::Base::REJECT;
+use Fcntl qw(SEEK_SET);
sub new {
my ($class, $config) = @_;
@@ -52,7 +53,6 @@ sub new {
my $k = 'publicinboxwatch.spamcheck';
my $default = undef;
my $spamcheck = PublicInbox::Spamcheck::get($config, $k, $default);
- $spamcheck = _spamcheck_cb($spamcheck) if $spamcheck;
$config->each_inbox(sub {
# need to make all inboxes writable for spam removal:
@@ -115,7 +115,8 @@ sub _remove_spam {
my ($self, $path) = @_;
# path must be marked as (S)een
$path =~ /:2,[A-R]*S[T-Za-z]*\z/ or return;
- my $mime = _path_to_mime($path) or return;
+ my $fh;
+ my $mime = _path_to_mime($path, \$fh) or return;
$self->{config}->each_inbox(sub {
my ($ibx) = @_;
eval {
@@ -132,7 +133,22 @@ sub _remove_spam {
warn "error removing spam at: ", $path,
" from ", $ibx->{name}, ': ', $@, "\n";
}
- })
+ });
+
+ # This ignores the scrubbed version if it differs.
+ # Probably not worth training both...
+ my $spamcheck = $self->{spamcheck};
+ $mime = undef;
+ if ($spamcheck && $spamcheck->can('spamlearn')) {
+ sysseek($fh, 0, SEEK_SET) or die "sysseek failed: $!";
+ $spamcheck->spamlearn($fh);
+ }
+}
+
+sub spam_fail ($$) {
+ my ($mime, $path) = @_;
+ warn $mime->header('Message-ID')." failed spam check\n",
+ 'path: ', $path, "\n";
}
sub _try_path {
@@ -150,22 +166,37 @@ sub _try_path {
if (!ref($inboxes) && $inboxes eq 'watchspam') {
return _remove_spam($self, $path);
}
+
+ my $spamcheck = $self->{spamcheck};
+ my $fh;
foreach my $ibx (@$inboxes) {
- my $mime = _path_to_mime($path) or next;
- my $im = _importer_for($self, $ibx);
+ my $mime = $fh ? _fh_to_mime($fh, 0) :
+ _path_to_mime($path, \$fh) or next;
+ my $im = _importer_for($self, $ibx);
my $wm = $ibx->{-watchheader};
if ($wm) {
my $v = $mime->header_obj->header_raw($wm->[0]);
next unless ($v && $v =~ $wm->[1]);
}
+ # we only check each message once:
+ if ($spamcheck) {
+ my $dst = '';
+ sysseek($fh, 0, SEEK_SET) or die "sysseek failed: $!";
+ $spamcheck->spamcheck($mime, \$dst) or
+ return spam_fail($mime, $path);
+
+ $mime = PublicInbox::MIME->new(\$dst);
+ $spamcheck = undef; # no need to check again this loop
+ }
+
if (my $scrub = $ibx->filter($im)) {
my $ret = $scrub->scrub($mime) or next;
$ret == REJECT() and next;
$mime = $ret;
}
- $im->add($mime, $self->{spamcheck});
+ $im->add($mime);
}
}
@@ -237,18 +268,24 @@ sub scan {
trigger_scan($self, 'cont') if keys %$opendirs;
}
+sub _fh_to_mime {
+ my ($fh, $off) = @_;
+ if (defined $off) {
+ seek($fh, $off, SEEK_SET) or die "seek failed: $!";
+ }
+ defined(my $str = do { local $/; <$fh> }) or return;
+ PublicInbox::MIME->new(\$str);
+}
+
sub _path_to_mime {
- my ($path) = @_;
+ my ($path, $fhref) = @_;
if (open my $fh, '<', $path) {
- local $/;
- my $str = <$fh>;
- $str or return;
- return PublicInbox::MIME->new(\$str);
+ $$fhref = $fh;
+ _fh_to_mime($fh);
} elsif ($!{ENOENT}) {
- return;
+ # race with another process, just ignore the missing message
} else {
warn "failed to open $path: $!\n";
- return;
}
}
@@ -264,19 +301,6 @@ sub _importer_for {
$importers->{"$ibx"} = $im;
}
-sub _spamcheck_cb {
- my ($sc) = @_;
- sub {
- my ($mime) = @_;
- my $tmp = '';
- if ($sc->spamcheck($mime, \$tmp)) {
- return PublicInbox::MIME->new(\$tmp);
- }
- warn $mime->header('Message-ID')." failed spam check\n";
- undef;
- }
-}
-
sub is_maildir {
$_[0] =~ s!\Amaildir:!! or return;
$_[0] =~ tr!/!/!s;
diff --git a/t/import.t b/t/import.t
index eee4744..1016ee8 100644
--- a/t/import.t
+++ b/t/import.t
@@ -56,7 +56,7 @@ is(scalar @revs, 1, 'one revision created');
$mime->header_set('Message-ID', '<b@example.com>');
$mime->header_set('Subject', 'msg2');
-like($im->add($mime, sub { $mime }), qr/\A:\d+\z/, 'added 2nd message');
+like($im->add($mime), qr/\A:\d+\z/, 'added 2nd message');
$im->done;
@revs = $git->qx(qw(rev-list HEAD));
is(scalar @revs, 2, '2 revisions exist');
@@ -88,7 +88,6 @@ is($msg->header('Message-ID'), '<a@example.com>', 'Message-ID matches');
isnt($msg->header('Subject'), $mime->header('Subject'), 'subject mismatch');
$mime->header_set('Message-Id', '<failcheck@example.com>');
-is($im->add($mime, sub { undef }), undef, 'check callback fails');
is($im->remove($mime), undef, 'message not added, so not removed');
is(undef, $im->checkpoint, 'checkpoint works before ->done');
$im->done;
diff --git a/t/log-bin/spamc b/t/log-bin/spamc
new file mode 100755
index 0000000..9a1551f
--- /dev/null
+++ b/t/log-bin/spamc
@@ -0,0 +1,6 @@
+#!/bin/sh
+# mock spamc which writes its args to LOG_PATH
+if test -n "$LOG_PATH"
+then
+ echo >>"$LOG_PATH" "ARGS: $@"
+fi
diff --git a/t/watch_maildir.t b/t/watch_maildir.t
index b85ddc5..ee84069 100644
--- a/t/watch_maildir.t
+++ b/t/watch_maildir.t
@@ -95,10 +95,14 @@ More majordomo info at http://vger.kernel.org/majordomo-info.html\n);
local $ENV{PATH} = $fail_path;
PublicInbox::Emergency->new($maildir)->prepare(\$msg);
$config->{'publicinboxwatch.spamcheck'} = 'spamc';
+ my @warn;
{
- local $SIG{__WARN__} = sub {}; # quiet spam check warning
+ local $SIG{__WARN__} = sub { push @warn, @_ };
PublicInbox::WatchMaildir->new($config)->scan('full');
}
+ my $warn = join("\n", @warn);
+ like($warn, qr/failed spam check/, 'warned about spam');
+ like($warn, qr/^path: /m, 'shows failing path');
@list = $git->qx(qw(ls-tree -r --name-only refs/heads/master));
is(scalar @list, 0, 'tree has no files spamc checked');
is(unlink(glob("$maildir/new/*")), 1);
@@ -124,6 +128,31 @@ More majordomo info at http://vger.kernel.org/majordomo-info.html\n);
like($$mref, qr/something\n\z/s, 'message scrubbed on import');
}
+my $write_spam = sub {
+ is(scalar glob("$spamdir/new/*"), undef, 'no spam existing');
+ PublicInbox::Emergency->new($spamdir)->prepare(\$msg);
+ my @new = glob("$spamdir/new/*");
+ is(scalar @new, 1);
+ my @p = split(m!/+!, $new[0]);
+ ok(link($new[0], "$spamdir/cur/".$p[-1].":2,S"));
+ is(unlink($new[0]), 1);
+};
+
+{
+ my $log_bin = getcwd()."/t/log-bin";
+ ok(-x "$log_bin/spamc", "mock spamc exists for logging");
+ my $log_path = "$log_bin:$ENV{PATH}"; # for spamc ham mock
+ local $ENV{PATH} = $log_path;
+ my $log_path = "$tmpdir/spamc.log";
+ local $ENV{LOG_PATH} = $log_path;
+ ok(unlink(glob("$maildir/*/*")));
+ ok(unlink(glob("$spamdir/*/*")));
+ $write_spam->(\$msg);
+ PublicInbox::WatchMaildir->new($config)->scan('full');
+ ok(open(my $fh, '<', $log_path), 'opened spamc.log');
+ like(<$fh>, qr/-L spam/, '"spamc -L spam" called');
+}
+
sub is_maildir {
my ($dir) = @_;
PublicInbox::WatchMaildir::is_maildir($dir);
--
EW
^ permalink raw reply related [flat|nested] 2+ messages in thread
* [REJECT] watchmaildir: spam checking and learning improvements
2019-01-10 9:02 [PATCH] watchmaildir: spam checking and learning improvements Eric Wong
@ 2019-01-10 9:19 ` Eric Wong
0 siblings, 0 replies; 2+ messages in thread
From: Eric Wong @ 2019-01-10 9:19 UTC (permalink / raw)
To: meta
Eric Wong <e@80x24.org> wrote:
> It turns out spam training ("spamc -L spam") was never implemented
> for -watch. So implement that, and take advantage of an
> existing feature to pass FDs directly to spamc(1) instead of
> reconverting the MIME object into a string.
Will keep that...
> While we're at it, the spam-checking call is now only performed
> once per message when a Maildir supports multiple inboxes.
And that.
> This changes PublicInbox::Import::add (and V2Writable
> equivalent) API to no longer perform spam-checking.
But nope, we need to do spam-checking inside ->add;
because we don't want to waste cycles re-spam-checking
messages which are already in our archives on -watch
restarts.
And the V2Writable->add spam-checking is in the wrong place.
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2019-01-10 9:19 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2019-01-10 9:02 [PATCH] watchmaildir: spam checking and learning improvements Eric Wong
2019-01-10 9:19 ` [REJECT] " Eric Wong
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).