From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id B7CC41F9F2 for ; Mon, 5 Apr 2021 10:27:52 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 3/5] lei: maildir: move shard support to MdirReader Date: Mon, 5 Apr 2021 10:27:50 +0000 Message-Id: <20210405102752.6249-4-e@80x24.org> In-Reply-To: <20210405102752.6249-1-e@80x24.org> References: <20210405102752.6249-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: We'll eventually want lei_input users like "lei import" and "lei tag" to support parallel reads. --- lib/PublicInbox/InboxWritable.pm | 4 ++-- lib/PublicInbox/LeiInput.pm | 2 +- lib/PublicInbox/LeiToMail.pm | 29 +++++++++-------------------- lib/PublicInbox/MdirReader.pm | 25 +++++++++++++++++++++---- t/lei-convert.t | 2 +- t/lei_to_mail.t | 8 ++++---- 6 files changed, 38 insertions(+), 32 deletions(-) diff --git a/lib/PublicInbox/InboxWritable.pm b/lib/PublicInbox/InboxWritable.pm index eeebc485..45d8cdc7 100644 --- a/lib/PublicInbox/InboxWritable.pm +++ b/lib/PublicInbox/InboxWritable.pm @@ -154,8 +154,8 @@ sub import_maildir { my $im = $self->importer(1); my @self = $self->filter($im) ? 
($self) : (); require PublicInbox::MdirReader; - PublicInbox::MdirReader::maildir_each_file(\&_each_maildir_fn, - $im, @self); + PublicInbox::MdirReader->new->maildir_each_file(\&_each_maildir_fn, + $im, @self); $im->done; } diff --git a/lib/PublicInbox/LeiInput.pm b/lib/PublicInbox/LeiInput.pm index 40d71f9e..e416d3ed 100644 --- a/lib/PublicInbox/LeiInput.pm +++ b/lib/PublicInbox/LeiInput.pm @@ -88,7 +88,7 @@ sub input_path_url { return $lei->fail(<<EOM) if $ifmt && $ifmt ne 'maildir'; $input appears to a be a maildir, not $ifmt EOM - PublicInbox::MdirReader::maildir_each_eml($input, + PublicInbox::MdirReader->new->maildir_each_eml($input, $self->can('input_maildir_cb'), $self, @args); } else { diff --git a/lib/PublicInbox/LeiToMail.pm b/lib/PublicInbox/LeiToMail.pm index 76a11b0e..2e736070 100644 --- a/lib/PublicInbox/LeiToMail.pm +++ b/lib/PublicInbox/LeiToMail.pm @@ -14,7 +14,6 @@ use PublicInbox::PktOp qw(pkt_do); use Symbol qw(gensym); use IO::Handle; # ->autoflush use Fcntl qw(SEEK_SET SEEK_END O_CREAT O_EXCL O_WRONLY); -use Digest::SHA qw(sha256_hex); my %kw2char = ( # Maildir characters draft => 'D', @@ -234,17 +233,9 @@ sub update_kw_maybe ($$$$) { } } -sub _augment_or_unlink { # maildir_each_eml cb - my ($f, $kw, $eml, $lei, $lse, $mod, $shard, $unlink) = @_; - if ($mod) { - # can't get dirent.d_ino w/ pure Perl readdir, so we extract - # the OID if it looks like one instead of doing stat(2) - my $hex = $f =~ m!\b([a-f0-9]{40,})[^/]*\z! ? - $1 : sha256_hex($f); - my $recno = hex(substr($hex, 0, 8)); - return if ($recno % $mod) != $shard; - update_kw_maybe($lei, $lse, $eml, $kw); - } +sub _md_update { # maildir_each_eml cb + my ($f, $kw, $eml, $lei, $lse, $unlink) = @_; + update_kw_maybe($lei, $lse, $eml, $kw); $unlink ? unlink($f) : _augment($eml, $lei); } @@ -392,21 +383,19 @@ sub _do_augment_maildir { my ($self, $lei) = @_; my $dst = $lei->{ovv}->{dst}; my $lse = $lei->{opt}->{'import-before'} ? 
$lei->{lse} : undef; - my ($mod, $shard) = @{$self->{shard_info} // []}; + my $mdr = PublicInbox::MdirReader->new; if ($lei->{opt}->{augment}) { my $dedupe = $lei->{dedupe}; if ($dedupe && $dedupe->prepare_dedupe) { - PublicInbox::MdirReader::maildir_each_eml($dst, - \&_augment_or_unlink, - $lei, $lse, $mod, $shard); + $mdr->{shard_info} = $self->{shard_info}; + $mdr->maildir_each_eml($dst, \&_md_update, $lei, $lse); $dedupe->pause_dedupe; } } elsif ($lse) { - PublicInbox::MdirReader::maildir_each_eml($dst, - \&_augment_or_unlink, - $lei, $lse, $mod, $shard, 1); + $mdr->{shard_info} = $self->{shard_info}; + $mdr->maildir_each_eml($dst, \&_md_update, $lei, $lse, 1); } else {# clobber existing Maildir - PublicInbox::MdirReader::maildir_each_file($dst, \&_unlink); + $mdr->maildir_each_file($dst, \&_unlink); } } diff --git a/lib/PublicInbox/MdirReader.pm b/lib/PublicInbox/MdirReader.pm index 1685e4d8..b49c8ceb 100644 --- a/lib/PublicInbox/MdirReader.pm +++ b/lib/PublicInbox/MdirReader.pm @@ -8,6 +8,7 @@ package PublicInbox::MdirReader; use strict; use v5.10.1; use PublicInbox::InboxWritable qw(eml_from_path); +use Digest::SHA qw(sha256_hex); # returns Maildir flags from a basename ('' for no flags, undef for invalid) sub maildir_basename_flags { @@ -24,14 +25,25 @@ sub maildir_path_flags { $i >= 0 ? maildir_basename_flags(substr($f, $i + 1)) : undef; } -sub maildir_each_file ($$;@) { - my ($dir, $cb, @arg) = @_; +sub shard_ok ($$$) { + my ($bn, $mod, $shard) = @_; + # can't get dirent.d_ino w/ pure Perl readdir, so we extract + # the OID if it looks like one instead of doing stat(2) + my $hex = $bn =~ m!\A([a-f0-9]{40,})! ? 
$1 : sha256_hex($bn); + my $recno = hex(substr($hex, 0, 8)); + ($recno % $mod) == $shard; +} + +sub maildir_each_file { + my ($self, $dir, $cb, @arg) = @_; $dir .= '/' unless substr($dir, -1) eq '/'; + my ($mod, $shard) = @{$self->{shard_info} // []}; for my $d (qw(new/ cur/)) { my $pfx = $dir.$d; opendir my $dh, $pfx or next; while (defined(my $bn = readdir($dh))) { maildir_basename_flags($bn) // next; + next if defined($mod) && !shard_ok($bn, $mod, $shard); $cb->($pfx.$bn, @arg); } } @@ -40,15 +52,17 @@ sub maildir_each_file ($$;@) { my %c2kw = ('D' => 'draft', F => 'flagged', P => 'forwarded', R => 'answered', S => 'seen'); -sub maildir_each_eml ($$;@) { - my ($dir, $cb, @arg) = @_; +sub maildir_each_eml { + my ($self, $dir, $cb, @arg) = @_; $dir .= '/' unless substr($dir, -1) eq '/'; + my ($mod, $shard) = @{$self->{shard_info} // []}; my $pfx = $dir . 'new/'; if (opendir(my $dh, $pfx)) { while (defined(my $bn = readdir($dh))) { next if substr($bn, 0, 1) eq '.'; my @f = split(/:/, $bn, -1); next if scalar(@f) != 1; + next if defined($mod) && !shard_ok($bn, $mod, $shard); my $f = $pfx.$bn; my $eml = eml_from_path($f) or next; $cb->($f, [], $eml, @arg); @@ -59,6 +73,7 @@ sub maildir_each_eml ($$;@) { while (defined(my $bn = readdir($dh))) { my $fl = maildir_basename_flags($bn) // next; next if index($fl, 'T') >= 0; + next if defined($mod) && !shard_ok($bn, $mod, $shard); my $f = $pfx.$bn; my $eml = eml_from_path($f) or next; my @kw = sort(map { $c2kw{$_} // () } split(//, $fl)); @@ -66,4 +81,6 @@ sub maildir_each_eml ($$;@) { } } +sub new { bless {}, __PACKAGE__ } + 1; diff --git a/t/lei-convert.t b/t/lei-convert.t index dc53b82c..0ea860c8 100644 --- a/t/lei-convert.t +++ b/t/lei-convert.t @@ -57,7 +57,7 @@ test_lei({ tmpdir => $tmpdir }, sub { lei_ok('convert', '-o', "$d/md", "mboxrd:$d/foo.mboxrd"); ok(-d "$d/md", 'Maildir created'); my @md; - PublicInbox::MdirReader::maildir_each_eml("$d/md", sub { + PublicInbox::MdirReader->new->maildir_each_eml("$d/md", sub { 
push @md, $_[2]; }); is(scalar(@md), scalar(@mboxrd), 'got expected emails in Maildir') or diff --git a/t/lei_to_mail.t b/t/lei_to_mail.t index 75314add..51357257 100644 --- a/t/lei_to_mail.t +++ b/t/lei_to_mail.t @@ -253,7 +253,7 @@ SKIP: { # FIFO support } { # Maildir support - my $each_file = PublicInbox::MdirReader->can('maildir_each_file'); + my $mdr = PublicInbox::MdirReader->new; my $md = "$tmpdir/maildir/"; my $wcb = $wcb_get->('maildir', $md); is(ref($wcb), 'CODE', 'got Maildir callback'); @@ -261,7 +261,7 @@ SKIP: { # FIFO support $wcb->(\(my $x = $buf), $b4dc0ffee); my @f; - $each_file->($md, sub { push @f, shift }); + $mdr->maildir_each_file($md, sub { push @f, shift }); open my $fh, $f[0] or BAIL_OUT $!; is(do { local $/; <$fh> }, $buf, 'wrote to Maildir'); @@ -270,7 +270,7 @@ SKIP: { # FIFO support $wcb->(\($x = $buf."\nx\n"), $deadcafe); my @x = (); - $each_file->($md, sub { push @x, shift }); + $mdr->maildir_each_file($md, sub { push @x, shift }); is(scalar(@x), 1, 'wrote one new file'); ok(!-f $f[0], 'old file clobbered'); open $fh, $x[0] or BAIL_OUT $!; @@ -281,7 +281,7 @@ SKIP: { # FIFO support $wcb->(\($x = $buf."\ny\n"), $deadcafe); $wcb->(\($x = $buf."\ny\n"), $b4dc0ffee); # skipped by dedupe @f = (); - $each_file->($md, sub { push @f, shift }); + $mdr->maildir_each_file($md, sub { push @f, shift }); is(scalar grep(/\A\Q$x[0]\E\z/, @f), 1, 'old file still there'); my @new = grep(!/\A\Q$x[0]\E\z/, @f); is(scalar @new, 1, '1 new file written (b4dc0ffee skipped)');