unofficial mirror of meta@public-inbox.org
 help / color / mirror / Atom feed
* [PATCH 0/2] watch-related fixes for filter users
@ 2024-12-30 22:28 Eric Wong
  2024-12-30 22:28 ` [PATCH 1/2] watch|mda|purge: Filter::*->scrub is destructive Eric Wong
  2024-12-30 22:28 ` [PATCH 2/2] watch: import_eml: avoid Eml dup for non-scrub case Eric Wong
  0 siblings, 2 replies; 4+ messages in thread
From: Eric Wong @ 2024-12-30 22:28 UTC (permalink / raw)
  To: meta

1/2 is a bugfix, and 2/2 is a small optimization for common setups
(probably immeasurable).

Eric Wong (2):
  watch|mda|purge: Filter::*->scrub is destructive
  watch: import_eml: avoid Eml dup for non-scrub case

 lib/PublicInbox/Filter/Vger.pm   | 15 +++++-----
 lib/PublicInbox/InboxWritable.pm |  2 --
 lib/PublicInbox/Watch.pm         | 50 ++++++++++++--------------------
 script/public-inbox-mda          | 19 +++++-------
 script/public-inbox-purge        | 12 ++++----
 5 files changed, 39 insertions(+), 59 deletions(-)

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [PATCH 1/2] watch|mda|purge: Filter::*->scrub is destructive
  2024-12-30 22:28 [PATCH 0/2] watch-related fixes for filter users Eric Wong
@ 2024-12-30 22:28 ` Eric Wong
  2024-12-30 22:28 ` [PATCH 2/2] watch: import_eml: avoid Eml dup for non-scrub case Eric Wong
  1 sibling, 0 replies; 4+ messages in thread
From: Eric Wong @ 2024-12-30 22:28 UTC (permalink / raw)
  To: meta

-watch was failing to properly account for inboxes having
different filters and ->scrub behavior during message removal
(but not adds).  Explicitly duplicate and simplify the -watch
code for handling message removal and update the rest of the
callers to reflect the standardized destructive behavior of
->scrub.  While most filters were destructive anyways, the Vger
filter was a notable exception and updated to be destructive.

We'll also update some variables from `$mime' uses to `$eml' to
reflect the switch from (Email|PublicInbox)::MIME to
PublicInbox::Eml years ago.

Finally, the Vger filter includes updated comments and uses
v5.10.1+ features, now.
---
 lib/PublicInbox/Filter/Vger.pm   | 15 ++++++-------
 lib/PublicInbox/InboxWritable.pm |  2 --
 lib/PublicInbox/Watch.pm         | 36 +++++++++++---------------------
 script/public-inbox-mda          | 19 +++++++----------
 script/public-inbox-purge        | 12 +++++------
 5 files changed, 33 insertions(+), 51 deletions(-)

diff --git a/lib/PublicInbox/Filter/Vger.pm b/lib/PublicInbox/Filter/Vger.pm
index 5b3c0277..2202a217 100644
--- a/lib/PublicInbox/Filter/Vger.pm
+++ b/lib/PublicInbox/Filter/Vger.pm
@@ -1,9 +1,10 @@
-# Copyright (C) 2016-2021 all contributors <meta@public-inbox.org>
+# Copyright (C) all contributors <meta@public-inbox.org>
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
 
 # Filter for vger.kernel.org list trailer
 package PublicInbox::Filter::Vger;
-use base qw(PublicInbox::Filter::Base);
+use v5.10.1; # check regexps before v5.12
+use parent qw(PublicInbox::Filter::Base);
 use strict;
 use PublicInbox::Eml;
 
@@ -18,16 +19,16 @@ my $l3 =
 my $l4 = qr!Please read the FAQ at +http://www\.tux\.org/lkml/!;
 
 sub scrub {
-	my ($self, $mime) = @_;
-	my $s = $mime->as_string;
+	my ($self, $eml) = @_;
+	my $s = $eml->as_string;
 
-	# the vger appender seems to only work on the raw string,
+	# the old vger appender seemed to only work on the raw string,
 	# so in multipart (e.g. GPG-signed) messages, the list trailer
 	# becomes invisible to MIME-aware email clients.
 	if ($s =~ s/$l0\n$l1\n$l2\n$l3\n(?:$l4\n)?\n*\z//os) {
-		$mime = PublicInbox::Eml->new(\$s);
+		$_[1] = $eml= PublicInbox::Eml->new(\$s);
 	}
-	$self->ACCEPT($mime);
+	$self->ACCEPT($eml);
 }
 
 sub delivery {
diff --git a/lib/PublicInbox/InboxWritable.pm b/lib/PublicInbox/InboxWritable.pm
index b995a8ad..18412cc3 100644
--- a/lib/PublicInbox/InboxWritable.pm
+++ b/lib/PublicInbox/InboxWritable.pm
@@ -129,7 +129,6 @@ sub _each_maildir_eml {
 	if ($self && (my $filter = $self->filter($im))) {
 		my $ret = $filter->scrub($eml) or return;
 		return if $ret == REJECT();
-		$eml = $ret;
 	}
 	$im->add($eml);
 }
@@ -153,7 +152,6 @@ sub _mbox_eml_cb { # MboxReader->mbox* callback
 	if ($filter) {
 		my $ret = $filter->scrub($eml) or return;
 		return if $ret == REJECT();
-		$eml = $ret;
 	}
 	$im->add($eml);
 }
diff --git a/lib/PublicInbox/Watch.pm b/lib/PublicInbox/Watch.pm
index 5cde1d80..c270f587 100644
--- a/lib/PublicInbox/Watch.pm
+++ b/lib/PublicInbox/Watch.pm
@@ -176,30 +176,19 @@ sub remove_eml_i { # each_inbox callback
 	eval {
 		# try to avoid taking a lock or unnecessary spawning
 		my $im = $self->{importers}->{"$ibx"};
-		my $scrubbed;
+		my @eml = ($eml);
+		if (my $filter = $ibx->filter($im)) {
+			my $tmp = PublicInbox::Eml->new(\($eml->as_string));
+			my $ret = $filter->scrub($tmp, 1);
+			push @eml, $tmp if $ret && $ret != REJECT;
+		}
+
 		if ((!$im || !$im->active) && $ibx->over) {
-			if (content_exists($ibx, $eml)) {
-				# continue
-			} elsif (my $scrub = $ibx->filter($im)) {
-				$scrubbed = $scrub->scrub($eml, 1);
-				if ($scrubbed && $scrubbed != REJECT &&
-					  !content_exists($ibx, $scrubbed)) {
-					return;
-				}
-			} else {
-				return;
-			}
+			@eml = grep { content_exists($ibx, $_) } @eml or return;
 		}
 
 		$im //= _importer_for($self, $ibx); # may spawn fast-import
-		$im->remove($eml, 'spam');
-		$scrubbed //= do {
-			my $scrub = $ibx->filter($im);
-			$scrub ? $scrub->scrub($eml, 1) : undef;
-		};
-		if ($scrubbed && $scrubbed != REJECT) {
-			$im->remove($scrubbed, 'spam');
-		}
+		$im->remove($_, 'spam') for @eml;
 	};
 	if ($@) {
 		warn "error removing spam at: $loc from $ibx->{name}: $@\n";
@@ -232,10 +221,9 @@ sub import_eml ($$$) {
 	}
 	eval {
 		my $im = _importer_for($self, $ibx);
-		if (my $scrub = $ibx->filter($im)) {
-			my $scrubbed = $scrub->scrub($eml) or return;
-			$scrubbed == REJECT and return;
-			$eml = $scrubbed;
+		if (my $filter = $ibx->filter($im)) {
+			my $ret = $filter->scrub($eml) or return;
+			$ret == REJECT and return;
 		}
 		$im->add($eml, $self->{spamcheck});
 	};
diff --git a/script/public-inbox-mda b/script/public-inbox-mda
index 74202912..9d68ea35 100755
--- a/script/public-inbox-mda
+++ b/script/public-inbox-mda
@@ -120,26 +120,21 @@ my @rejects;
 for my $ibx (@$dests) {
 	mda_filter_adjust($ibx);
 	my $filter = $ibx->filter;
-	my $mime = PublicInbox::Eml->new($str);
-	my $ret = $filter->delivery($mime);
-	if (ref($ret) && ($ret->isa('PublicInbox::Eml') ||
-			$ret->isa('Email::MIME'))) { # filter altered message
-		$mime = $ret;
-	} elsif ($ret == PublicInbox::Filter::Base::IGNORE) {
-		next; # nothing, keep looping
-	} elsif ($ret == PublicInbox::Filter::Base::REJECT) {
+	my $eml = PublicInbox::Eml->new($str);
+	my $ret = $filter->delivery($eml) or next; # $ret == IGNORE
+	if ($ret == PublicInbox::Filter::Base::REJECT) {
 		push @rejects, $filter->err;
 		next;
-	}
+	} # else success
 
-	PublicInbox::MDA->set_list_headers($mime, $ibx);
+	PublicInbox::MDA->set_list_headers($eml, $ibx);
 	my $im = $ibx->importer(0);
-	if (defined $im->add($mime)) {
+	if (defined $im->add($eml)) {
 		# ->abort is idempotent, no emergency if a single
 		# destination succeeds
 		$emm->abort;
 	} else { # v1-only
-		my $mid = $mime->header_raw('Message-ID');
+		my $mid = $eml->header_raw('Message-ID');
 		# this message is similar to what ssoma-mda shows:
 		print STDERR "CONFLICT: Message-ID: $mid exists\n";
 	}
diff --git a/script/public-inbox-purge b/script/public-inbox-purge
index 618cfec4..381f58cd 100755
--- a/script/public-inbox-purge
+++ b/script/public-inbox-purge
@@ -38,16 +38,16 @@ PublicInbox::Eml::strip_from($data);
 my $n_purged = 0;
 
 foreach my $ibx (@ibxs) {
-	my $mime = PublicInbox::Eml->new($data);
+	my $eml = PublicInbox::Eml->new($data);
 	my $v2w = PublicInbox::V2Writable->new($ibx, 0);
 
-	my $commits = $v2w->purge($mime) || [];
+	my $commits = $v2w->purge($eml) || [];
 
-	if (my $scrub = $ibx->filter($v2w)) {
-		my $scrubbed = $scrub->scrub($mime, 1);
+	if (my $filter = $ibx->filter($v2w)) {
+		my $ret = $filter->scrub($eml, 1); # destructive
 
-		if ($scrubbed && $scrubbed != REJECT()) {
-			my $scrub_commits = $v2w->purge($scrubbed);
+		if ($ret && $ret != REJECT) {
+			my $scrub_commits = $v2w->purge($eml);
 			push @$commits, @$scrub_commits if $scrub_commits;
 		}
 	}

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* [PATCH 2/2] watch: import_eml: avoid Eml dup for non-scrub case
  2024-12-30 22:28 [PATCH 0/2] watch-related fixes for filter users Eric Wong
  2024-12-30 22:28 ` [PATCH 1/2] watch|mda|purge: Filter::*->scrub is destructive Eric Wong
@ 2024-12-30 22:28 ` Eric Wong
  2025-01-01 23:11   ` [PATCH 3/2] watch: don't propagate header changes to other inboxes Eric Wong
  1 sibling, 1 reply; 4+ messages in thread
From: Eric Wong @ 2024-12-30 22:28 UTC (permalink / raw)
  To: meta

Most inboxes don't use filters, so the destructive ->scrub won't
be called and we can avoid duplicating the Eml object in the
common case.  We'll also drop the last inbox optimization for
net_cb since it's likely unnecessary given that filters are not
very common.

This may be more noticeable for users who map multiple inboxes
to a single Maildir|IMAP|NNTP source.
---
 lib/PublicInbox/Watch.pm | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/lib/PublicInbox/Watch.pm b/lib/PublicInbox/Watch.pm
index c270f587..a37038b5 100644
--- a/lib/PublicInbox/Watch.pm
+++ b/lib/PublicInbox/Watch.pm
@@ -222,8 +222,10 @@ sub import_eml ($$$) {
 	eval {
 		my $im = _importer_for($self, $ibx);
 		if (my $filter = $ibx->filter($im)) {
-			my $ret = $filter->scrub($eml) or return;
+			my $tmp = PublicInbox::Eml->new(\($eml->as_string));
+			my $ret = $filter->scrub($tmp) or return;
 			$ret == REJECT and return;
+			$eml = $tmp;
 		}
 		$im->add($eml, $self->{spamcheck});
 	};
@@ -263,9 +265,9 @@ sub _try_path ($$) {
 		$warn_cb->($pfx, "path: $path\n", @_);
 	};
 	return _remove_spam($self, $path) if $inboxes eq 'watchspam';
+	my $eml = eml_from_path($path) or return;
 	for my $ibx (@$inboxes) {
-		my $eml = eml_from_path($path) or next;
-		import_eml($self, $ibx, $eml); # $eml may be scrubbed
+		import_eml($self, $ibx, $eml);
 	}
 	1;
 }
@@ -323,13 +325,9 @@ sub net_cb { # NetReader::(nntp|imap)_each callback
 	return if grep(/\Adraft\z/, @$kw);
 	local $self->{cur_uid} = $art; # IMAP UID or NNTP article
 	if (ref($inboxes)) {
-		my @ibx = @$inboxes;
-		my $last = pop @ibx;
-		for my $ibx (@ibx) {
-			my $tmp = PublicInbox::Eml->new(\($eml->as_string));
-			import_eml($self, $ibx, $tmp);
+		for my $ibx (@$inboxes) {
+			import_eml($self, $ibx, $eml);
 		}
-		import_eml($self, $last, $eml);
 	} elsif ($inboxes eq 'watchspam') {
 		if ($uri->scheme =~ /\Aimaps?\z/ && !grep(/\Aseen\z/, @$kw)) {
 			return;

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* [PATCH 3/2] watch: don't propagate header changes to other inboxes
  2024-12-30 22:28 ` [PATCH 2/2] watch: import_eml: avoid Eml dup for non-scrub case Eric Wong
@ 2025-01-01 23:11   ` Eric Wong
  0 siblings, 0 replies; 4+ messages in thread
From: Eric Wong @ 2025-01-01 23:11 UTC (permalink / raw)
  To: meta

Message-IDs may be appended by v2 inboxes for messages with with
conflicting Message-IDs or SpamAssassin (or another spam filter)
can change headers of both v1 and v2 messages.  So we rely on
`local' to reach into Eml internals to localize ->header_set
modifications and rely on modern Perl having copy-on-write
scalars to minimize memory traffic in the common case.
---
 MANIFEST                      |  1 +
 lib/PublicInbox/Watch.pm      |  4 ++
 t/eml.t                       |  9 +++++
 t/watch_v1_v2_mix_no_modify.t | 74 +++++++++++++++++++++++++++++++++++
 4 files changed, 88 insertions(+)
 create mode 100644 t/watch_v1_v2_mix_no_modify.t

diff --git a/MANIFEST b/MANIFEST
index b0b4f71c..30adab80 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -641,6 +641,7 @@ t/watch_maildir.t
 t/watch_maildir_v2.t
 t/watch_mh.t
 t/watch_multiple_headers.t
+t/watch_v1_v2_mix_no_modify.t
 t/www_altid.t
 t/www_listing.t
 t/www_static.t
diff --git a/lib/PublicInbox/Watch.pm b/lib/PublicInbox/Watch.pm
index a37038b5..eb6eb85f 100644
--- a/lib/PublicInbox/Watch.pm
+++ b/lib/PublicInbox/Watch.pm
@@ -210,6 +210,10 @@ sub _remove_spam {
 sub import_eml ($$$) {
 	my ($self, $ibx, $eml) = @_;
 
+	# v2 may add a new Message-ID header on conflicts w/ different content
+	# and SpamAssassin may alter headers.  $copy is CoW in newer Perls.
+	local $eml->{hdr} = \(my $copy = ${$eml->{hdr}});
+
 	# any header match means it's eligible for the inbox:
 	if (my $watch_hdrs = $ibx->{-watchheaders}) {
 		my $ok;
diff --git a/t/eml.t b/t/eml.t
index 690ada57..d63f7a3b 100644
--- a/t/eml.t
+++ b/t/eml.t
@@ -28,6 +28,15 @@ sub mime_load ($) {
 	is($eml->as_string, "a: b\n\nhi\n", '->as_string');
 	my $empty = PublicInbox::Eml->new("\n\n");
 	is($empty->as_string, "\n\n", 'empty message');
+
+	# we use this pattern in -watch for writing an eml to multiple inboxes
+	my $in = do { # n.b. `my' must be used to assign a local var
+		local $eml->{hdr} = \(my $copy = ${$eml->{hdr}});
+		$eml->header_set(qw(Message-ID <a@example> <b@example>));
+		${$eml->{hdr}};
+	};
+	like $in, qr/<a\@example>.*?<b\@example>/s, 'Message-ID set in copy';
+	is $eml->header_raw('Message-ID'), undef, 'local $eml->{hdr} works';
 }
 
 for my $cls (@classes) {
diff --git a/t/watch_v1_v2_mix_no_modify.t b/t/watch_v1_v2_mix_no_modify.t
new file mode 100644
index 00000000..759fd0bb
--- /dev/null
+++ b/t/watch_v1_v2_mix_no_modify.t
@@ -0,0 +1,74 @@
+#!perl -w
+# Copyright (C) all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use v5.12;
+use autodie;
+use PublicInbox::TestCommon;
+require_mods 'v2';
+use PublicInbox::InboxIdle;
+use PublicInbox::DS qw(now);
+use PublicInbox::IO qw(write_file);
+use PublicInbox::Eml;
+my $tmpdir = tmpdir;
+local $ENV{HOME} = "$tmpdir";
+local $ENV{PI_CONFIG} = "$tmpdir/.public-inbox/config";
+my $msg = <<'EOM';
+From: a@example.com
+To: v1@example.com, v2@example.com
+Subject: test
+Message-ID: <a@example>
+
+hi
+EOM
+
+for my $v (2, 1) {
+	run_script(['-init', "-V$v", "v$v", '-Lbasic', "$tmpdir/v$v",
+		"https://example.com/v$v", "v$v\@example.com" ]) or
+			xbail "init -V$v";
+	write_file '>>', $ENV{PI_CONFIG}, "watch = maildir:$tmpdir/md$v";
+	mkdir "$tmpdir/md$v/$_" for ('', qw(cur tmp new));
+}
+my $orig = "$tmpdir/md2/cur/initial:2,S";
+write_file '>', $orig, $msg;
+
+my $delivered = 0;
+my $cb = sub {
+	my ($ibx) = @_;
+	diag "message delivered to `$ibx->{name}'";
+	$delivered++;
+};
+my $ii = PublicInbox::InboxIdle->new(my $cfg = PublicInbox::Config->new);
+my $obj = bless \$cb, 'PublicInbox::TestCommon::InboxWakeup';
+$cfg->each_inbox(sub { $_[0]->subscribe_unlock('ident', $obj) });
+my $exp = now + 10;
+local @PublicInbox::DS::post_loop_do = (sub { $delivered < 1 || now > $exp});
+
+my $rdr;
+open $rdr->{2}, '>>', undef;
+my $w = start_script(['-watch'], undef, $rdr);
+diag 'waiting for -watch to import 2 existing messages..';
+PublicInbox::DS::add_timer 10, \&PublicInbox::Config::noop;
+PublicInbox::DS::event_loop;
+is $delivered, 1, 'delivered initial message';
+
+$exp = now + 10;
+PublicInbox::DS::add_timer 10, \&PublicInbox::Config::noop;
+local @PublicInbox::DS::post_loop_do = (sub { $delivered < 3 || now > $exp });
+my $tmpmsg = "$tmpdir/md2/tmp/conflict:2,S";
+write_file '>', $tmpmsg, $msg, "hi\n";
+link $tmpmsg, "$tmpdir/md1/cur/conflict:2,S";
+rename $tmpmsg, "$tmpdir/md2/cur/conflict:2,S";
+PublicInbox::DS::event_loop;
+is $delivered, 3, 'delivered new conflicting message to v2 and new to v1';
+
+my $v1ibx = $cfg->lookup('v1@example.com');
+my $v1msgs = $v1ibx->over->recent;
+is scalar(@$v1msgs), 1, 'only one message in v1';
+my $v1eml = PublicInbox::Eml->new($v1ibx->git->cat_file($v1msgs->[0]->{blob}));
+my @mid = $v1eml->header_raw('Message-ID');
+is_deeply \@mid, [ qw[<a@example>] ], 'only one Message-ID in v1 message';
+
+my $v2msgs = $cfg->lookup('v2@example.com')->over->recent;
+is scalar(@$v2msgs), 2, 'both messages in v2';
+
+done_testing;

^ permalink raw reply related	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2025-01-01 23:11 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-12-30 22:28 [PATCH 0/2] watch-related fixes for filter users Eric Wong
2024-12-30 22:28 ` [PATCH 1/2] watch|mda|purge: Filter::*->scrub is destructive Eric Wong
2024-12-30 22:28 ` [PATCH 2/2] watch: import_eml: avoid Eml dup for non-scrub case Eric Wong
2025-01-01 23:11   ` [PATCH 3/2] watch: don't propagate header changes to other inboxes Eric Wong

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).