From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.0 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 5BE541FAE7 for ; Thu, 22 Mar 2018 09:40:16 +0000 (UTC) From: "Eric Wong (Contractor, The Linux Foundation)" To: meta@public-inbox.org Subject: [PATCH 04/13] InboxWritable: add mbox/maildir parsing + import logic Date: Thu, 22 Mar 2018 09:40:06 +0000 Message-Id: <20180322094015.14422-5-e@80x24.org> In-Reply-To: <20180322094015.14422-1-e@80x24.org> References: <20180322094015.14422-1-e@80x24.org> List-Id: This will make it easier to write import tools, as well as supporting future Filter API users. It allows simplifying our ad-hoc import_vger_from_mbox script. --- lib/PublicInbox/InboxWritable.pm | 103 +++++++++++++++++++++++++++++++++++++++ lib/PublicInbox/V2Writable.pm | 8 +++ lib/PublicInbox/WatchMaildir.pm | 20 +++----- script/public-inbox-init | 6 +-- scripts/import_vger_from_mbox | 51 +++---------------- 5 files changed, 126 insertions(+), 62 deletions(-) diff --git a/lib/PublicInbox/InboxWritable.pm b/lib/PublicInbox/InboxWritable.pm index 0a976ea..82834f0 100644 --- a/lib/PublicInbox/InboxWritable.pm +++ b/lib/PublicInbox/InboxWritable.pm @@ -7,6 +7,8 @@ use strict; use warnings; use base qw(PublicInbox::Inbox); use PublicInbox::Import; +use PublicInbox::Filter::Base; +*REJECT = *PublicInbox::Filter::Base::REJECT; sub new { my ($class, $ibx) = @_; @@ -54,4 +56,105 @@ sub filter { undef; } +sub is_maildir_basename ($) { + my ($bn) = @_; + return 0 if $bn !~ /\A[a-zA-Z0-9][\-\w:,=\.]+\z/; + if ($bn =~ /:2,([A-Z]+)\z/i) { + my $flags = $1; + return 0 if $flags =~ /[DT]/; # no [D]rafts or [T]rashed mail + } + 1; +} + +sub is_maildir_path ($) { + my ($path) = @_; + my @p = split(m!/+!, $path); + 
(is_maildir_basename($p[-1]) && -f $path) ? 1 : 0; +} + +sub maildir_path_load ($) { + my ($path) = @_; + if (open my $fh, '<', $path) { + local $/; + my $str = <$fh>; + $str or return; + return PublicInbox::MIME->new(\$str); + } elsif ($!{ENOENT}) { + # common with Maildir + return; + } else { + warn "failed to open $path: $!\n"; + return; + } +} + +sub import_maildir { + my ($self, $dir) = @_; + my $im = $self->importer(1); + my $filter = $self->filter; + foreach my $sub (qw(cur new tmp)) { + -d "$dir/$sub" or die "$dir is not a Maildir (missing $sub)\n"; + } + foreach my $sub (qw(cur new)) { + opendir my $dh, "$dir/$sub" or die "opendir $dir/$sub: $!\n"; + while (defined(my $fn = readdir($dh))) { + next unless is_maildir_basename($fn); + my $mime = maildir_path_load("$dir/$sub/$fn") or next; # readdir() names are relative to $dir/$sub + if ($filter) { + my $ret = $filter->scrub($mime) or next; # skip this message only; keep importing so $im->done still runs + next if $ret == REJECT(); + $mime = $ret; + } + $im->add($mime); + } + } + $im->done; +} + +# asctime: From example@example.com Fri Jun 23 02:56:55 2000 +my $from_strict = qr/^From \S+ +\S+ \S+ +\S+ [^:]+:[^:]+:[^:]+ [^:]+/; + +sub mb_add ($$$$) { + my ($im, $variant, $filter, $msg) = @_; + $$msg =~ s/(\r?\n)+\z/$1/s; + my $mime = PublicInbox::MIME->new($msg); + if ($variant eq 'mboxrd') { + $$msg =~ s/^>(>*From )/$1/gsm; # /g: unescape every quoted From line, not just the first + } elsif ($variant eq 'mboxo') { + $$msg =~ s/^>From /From /gsm; # /g: unescape every quoted From line, not just the first + } + if ($filter) { + my $ret = $filter->scrub($mime) or return; + return if $ret == REJECT(); + $mime = $ret; + } + $im->add($mime) +} + +sub import_mbox { + my ($self, $fh, $variant) = @_; + if ($variant !~ /\A(?:mboxrd|mboxo)\z/) { + die "variant must be 'mboxrd' or 'mboxo'\n"; + } + my $im = $self->importer(1); + my $prev = undef; + my $msg = ''; + my $filter = $self->filter; + while (defined(my $l = <$fh>)) { + if ($l =~ /$from_strict/o) { + if (!defined($prev) || $prev =~ /^\r?$/) { + mb_add($im, $variant, $filter, \$msg) if $msg; + $msg = ''; + $prev = $l; + next; + } + warn "W[$.] 
$l\n"; + } + $prev = $l; + $msg .= $l; + } + mb_add($im, $variant, $filter, \$msg) if $msg; + $im->done; +} + 1; diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index dc96b87..46bfebb 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -65,6 +65,14 @@ sub new { bless $self, $class; } +sub init_inbox { + my ($self, $parallel) = @_; + $self->{parallel} = $parallel; + $self->idx_init; + $self->git_init(0); + $self->done; +} + # returns undef on duplicate or spam # mimics Import::add and wraps it for v2 sub add { diff --git a/lib/PublicInbox/WatchMaildir.pm b/lib/PublicInbox/WatchMaildir.pm index d3ca2a1..7ee29da 100644 --- a/lib/PublicInbox/WatchMaildir.pm +++ b/lib/PublicInbox/WatchMaildir.pm @@ -13,6 +13,8 @@ use PublicInbox::MDA; use PublicInbox::Spawn qw(spawn); use PublicInbox::InboxWritable; use File::Temp qw//; +use PublicInbox::Filter::Base; +*REJECT = *PublicInbox::Filter::Base::REJECT; sub new { my ($class, $config) = @_; @@ -125,7 +127,7 @@ sub _remove_spam { $im->remove($mime, 'spam'); if (my $scrub = $ibx->filter) { my $scrubbed = $scrub->scrub($mime) or return; - $scrubbed == 100 and return; + $scrubbed == REJECT() and return; $im->remove($scrubbed, 'spam'); } }; @@ -138,13 +140,7 @@ sub _remove_spam { sub _try_path { my ($self, $path) = @_; - my @p = split(m!/+!, $path); - return if $p[-1] !~ /\A[a-zA-Z0-9][\-\w:,=\.]+\z/; - if ($p[-1] =~ /:2,([A-Z]+)\z/i) { - my $flags = $1; - return if $flags =~ /[DT]/; # no [D]rafts or [T]rashed mail - } - return unless -f $path; + return unless PublicInbox::InboxWritable::is_maildir_path($path); if ($path !~ $self->{mdre}) { warn "unrecognized path: $path\n"; return; @@ -166,7 +162,7 @@ sub _try_path { } if (my $scrub = $inbox->filter) { my $ret = $scrub->scrub($mime) or return; - $ret == 100 and return; + $ret == REJECT() and return; $mime = $ret; } @@ -258,14 +254,14 @@ sub _path_to_mime { sub _importer_for { my ($self, $ibx) = @_; - my $im = 
$ibx->importer(0); my $importers = $self->{importers}; + my $im = $importers->{"$ibx"} ||= $ibx->importer(0); if (scalar(keys(%$importers)) > 2) { - delete $importers->{"$im"}; + delete $importers->{"$ibx"}; _done_for_now($self); } - $importers->{"$im"} = $im; + $importers->{"$ibx"} = $im; } sub _spamcheck_cb { diff --git a/script/public-inbox-init b/script/public-inbox-init index fdad136..86cf8b5 100755 --- a/script/public-inbox-init +++ b/script/public-inbox-init @@ -82,11 +82,7 @@ if ($version >= 2) { -primary_address => $address[0], }; $ibx = PublicInbox::Inbox->new($ibx); - my $v2w = PublicInbox::V2Writable->new($ibx, 1); - $v2w->{parallel} = 0; - $v2w->idx_init; - $v2w->git_init(0); - $v2w->done; + PublicInbox::V2Writable->new($ibx, 1)->init_inbox(0); } elsif ($version == 1) { x(qw(git init -q --bare), $mainrepo); diff --git a/scripts/import_vger_from_mbox b/scripts/import_vger_from_mbox index 1edb987..369dac7 100644 --- a/scripts/import_vger_from_mbox +++ b/scripts/import_vger_from_mbox @@ -5,7 +5,7 @@ use strict; use warnings; use Getopt::Long qw/:config gnu_getopt no_ignore_case auto_abbrev/; use PublicInbox::MIME; -use PublicInbox::Inbox; +use PublicInbox::InboxWritable; use PublicInbox::V2Writable; use PublicInbox::Import; use PublicInbox::MDA; @@ -30,55 +30,16 @@ my $ibx = { name => $name, version => $version, -primary_address => $email, + filter => 'PublicInbox::Filter::Vger', }; $ibx = PublicInbox::Inbox->new($ibx); -my $im; unless ($dry_run) { if ($version >= 2) { - $im = PublicInbox::V2Writable->new($ibx, 1); + PublicInbox::V2Writable->new($ibx, 1)->init_inbox(0); } else { - system(qw(git init --bare -q), $mainrepo); - my $git = PublicInbox::Git->new($mainrepo); - $im = PublicInbox::Import->new($git, $name, $email, $ibx); + system(qw(git init --bare -q), $mainrepo) == 0 or die; } } +$ibx = PublicInbox::InboxWritable->new($ibx); binmode STDIN; -my $msg = ''; -use PublicInbox::Filter::Vger; -my $vger = PublicInbox::Filter::Vger->new; - -sub do_add 
($$) { - my ($im, $msg) = @_; - $$msg =~ s/(\r?\n)+\z/$1/s; - my $mime = PublicInbox::MIME->new($msg); - if ($variant eq 'mboxrd') { - $$msg =~ s/^>(>*From )/$1/sm; - } elsif ($variant eq 'mboxo') { - $$msg =~ s/^>From /From /sm; - } - $mime = $vger->scrub($mime); - return unless $im; - $mime->header_set($_) foreach @PublicInbox::MDA::BAD_HEADERS; - $im->add($mime) or - warn "duplicate: ", - $mime->header_obj->header_raw('Message-ID'), "\n"; -} - -# asctime: From example@example.com Fri Jun 23 02:56:55 2000 -my $from_strict = qr/^From \S+ +\S+ \S+ +\S+ [^:]+:[^:]+:[^:]+ [^:]+/; -my $prev = undef; -while (defined(my $l = <STDIN>)) { - if ($l =~ /$from_strict/o) { - if (!defined($prev) || $prev =~ /^\r?$/) { - do_add($im, \$msg) if $msg; - $msg = ''; - $prev = $l; - next; - } - warn "W[$.] $l\n"; - } - $prev = $l; - $msg .= $l; -} -do_add($im, \$msg) if $msg; -$im->done if $im; +$ibx->import_mbox(\*STDIN, $variant); -- EW