From: Eric Wong <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH] watch: don't count invalid paths against batch limit
Date: Thu, 26 Dec 2024 21:48:51 +0000 [thread overview]
Message-ID: <20241226214851.2078117-1-e@80x24.org> (raw)
Invalid paths such as `.', `..', `.mh_sequences', and perhaps
other implementation-specific files may throw off the count and
cause premature commits. While the premature commit isn't too
harmful in the common case, it's possible a pathological case of
having too many non-mail entries in a directory can cause
noticeable slowdowns and storage wear.
So have _try_path() and _remove_spam() return a true value if
a file was actually read. We'll also simplify the $inboxes
check to rely solely on `eq'; the `ref' check is unnecessary
because `eq' against a ref will never match the "watchspam"
literal.
---
lib/PublicInbox/Watch.pm | 29 +++++++++++++++++------------
1 file changed, 17 insertions(+), 12 deletions(-)
diff --git a/lib/PublicInbox/Watch.pm b/lib/PublicInbox/Watch.pm
index 0520967f..5cde1d80 100644
--- a/lib/PublicInbox/Watch.pm
+++ b/lib/PublicInbox/Watch.pm
@@ -207,6 +207,7 @@ sub remove_eml_i { # each_inbox callback
}
}
+# returns true if a file was read
sub _remove_spam {
my ($self, $path) = @_;
# path must be marked as (S)een
@@ -214,6 +215,7 @@ sub _remove_spam {
my $eml = eml_from_path($path) or return;
local $SIG{__WARN__} = PublicInbox::Eml::warn_ignore_cb();
$self->{pi_cfg}->each_inbox(\&remove_eml_i, $self, $eml, $path);
+ 1;
}
sub import_eml ($$$) {
@@ -243,13 +245,18 @@ sub import_eml ($$$) {
}
}
-sub _try_path {
+# returns true if a file was read
+sub _try_path ($$) {
my ($self, $path) = @_;
- $path =~ $self->{d_re} or
- return warn("BUG? unrecognized path: $path\n");
+ if ($path !~ $self->{d_re}) {
+ warn "BUG? unrecognized path: $path\n";
+ return;
+ }
my $dir = $1;
- my $inboxes = $self->{d_map}->{$dir} //
- return warn("W: unmappable dir: $dir\n");
+ my $inboxes = $self->{d_map}->{$dir} // do {
+ warn "W: unmappable dir: $dir\n";
+ return;
+ };
my ($md_fl, $mh_seq);
if ($self->{d_type}->{$dir} & D_MH) {
$path =~ m!/([0-9]+)\z! ? ($mh_seq = $1) : return;
@@ -267,13 +274,12 @@ sub _try_path {
my $pfx = ($_[0] // '') =~ /^([A-Z]: )/g ? $1 : '';
$warn_cb->($pfx, "path: $path\n", @_);
};
- if (!ref($inboxes) && $inboxes eq 'watchspam') {
- return _remove_spam($self, $path);
- }
- foreach my $ibx (@$inboxes) {
+ return _remove_spam($self, $path) if $inboxes eq 'watchspam';
+ for my $ibx (@$inboxes) {
my $eml = eml_from_path($path) or next;
- import_eml($self, $ibx, $eml);
+ import_eml($self, $ibx, $eml); # $eml may be scrubbed
}
+ 1;
}
sub quit_done ($) {
@@ -598,8 +604,7 @@ sub fs_scan_step {
}
my $n = $self->{max_batch};
while (my $fn = readdir($dh)) {
- _try_path($self, "$dir/$fn");
- last if --$n < 0;
+ last if _try_path($self, "$dir/$fn") and --$n < 0;
}
if ($n < 0) {
unshift @{$self->{scan_q}}, [ $dir, $dh ];
reply other threads:[~2024-12-26 21:48 UTC|newest]
Thread overview: [no followups] expand[flat|nested] mbox.gz Atom feed
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
List information: https://public-inbox.org/README
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20241226214851.2078117-1-e@80x24.org \
--to=e@80x24.org \
--cc=meta@public-inbox.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).