From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.2 required=3.0 tests=ALL_TRUSTED,AWL,BAYES_00, DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,DKIM_VALID_EF, T_SCC_BODY_TEXT_LINE shortcircuit=no autolearn=ham autolearn_force=no version=3.4.6 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 7BE701F406 for ; Fri, 29 Dec 2023 18:05:14 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=80x24.org; s=selector1; t=1703873114; bh=dN/g4vXRUfIjhMg4eOr6tWVCPrnnFMmKs+Pi8ZUV1SQ=; h=Date:From:To:Subject:References:In-Reply-To:From; b=atcP13KkObrNEHj7ceB5EwySRERS6+QyvyC2rYRQc2n+/f5b5tqR/92JBNtXTrcgR w1shE4zY4n1MtH9SB3K3SPnz3zbk1qfMs7tTPEWC2DpfRHTVVAfz9RP6Am7/IbyqRX PK32thUJ6hCYD7YmMMcxbGTncv72lmBEgmFUHjKY= Date: Fri, 29 Dec 2023 18:05:14 +0000 From: Eric Wong To: meta@public-inbox.org Subject: [PATCH v2] lei: support reading MH for convert+import+index Message-ID: <20231229180514.M393557@dcvr> References: <20231216130932.479628-1-e@80x24.org> MIME-Version: 1.0 Content-Type: text/plain; charset=utf-8 Content-Disposition: inline In-Reply-To: <20231216130932.479628-1-e@80x24.org> List-Id: The MH format is widely supported and used by various MUAs such as mutt and sylpheed, and an MH-like format is used by mlmmj for archives as well. Locking implementations for writes are inconsistent, so this commit doesn't support writes yet. inotify|EVFILT_VNODE watches aren't supported yet, but they will have to come since MH allows packing unused integers and renaming files. 
--- v2 fixes: * uses Perl REGEXP match via DBD::SQLite for folder filtering * unconditionally verify blob contents * eliminate unused $tmpdir in test diff -u b/lib/PublicInbox/LeiMailSync.pm b/lib/PublicInbox/LeiMailSync.pm --- b/lib/PublicInbox/LeiMailSync.pm +++ b/lib/PublicInbox/LeiMailSync.pm @@ -471,19 +471,20 @@ } } + # MH, except `uid' is not always unique (can be packed) $b2n = $dbh->prepare(<<''); SELECT f.loc,b.uid FROM blob2num b LEFT JOIN folders f ON b.fid = f.fid -WHERE b.oidbin = ? /* AND f.loc LIKE 'mh:/%' */ +WHERE b.oidbin = ? AND f.loc REGEXP '^mh:/' $b2n->bind_param(1, $oidbin, SQL_BLOB); $b2n->execute; - while (my ($d, $n) = $b2n->fetchrow_array) { - substr($d, 0, length('mh:')) = ''; - my $f = "$d/$n"; + while (my ($f, $n) = $b2n->fetchrow_array) { + $f =~ s/\Amh://s or die "BUG: not MH: $f"; + $f .= "/$n"; open my $fh, '<', $f or next; my $raw = read_all($fh, -s $fh // next); - next if $vrfy && blob_mismatch $f, $oidhex, \$raw; + next if blob_mismatch $f, $oidhex, \$raw; return \$raw; } undef; diff -u b/t/mh_reader.t b/t/mh_reader.t --- b/t/mh_reader.t +++ b/t/mh_reader.t @@ -10,7 +10,6 @@ use autodie; opendir my $cwdfh, '.'; -my $tmpdir = tmpdir; my $normal = create_dir 'normal', sub { write_file '>', 3, "Subject: replied a\n\n"; write_file '>', 4, "Subject: replied b\n\n"; MANIFEST | 3 + lib/PublicInbox/LEI.pm | 13 ++-- lib/PublicInbox/LeiConvert.pm | 5 ++ lib/PublicInbox/LeiImport.pm | 23 +++++++ lib/PublicInbox/LeiImportKw.pm | 2 +- lib/PublicInbox/LeiIndex.pm | 2 +- lib/PublicInbox/LeiInput.pm | 52 +++++++++++++--- lib/PublicInbox/LeiMailSync.pm | 40 ++++++++---- lib/PublicInbox/LeiToMail.pm | 5 ++ lib/PublicInbox/MHreader.pm | 103 +++++++++++++++++++++++++++++++ lib/PublicInbox/MdirReader.pm | 2 +- lib/PublicInbox/MdirSort.pm | 46 ++++++++++++++ lib/PublicInbox/TestCommon.pm | 22 ++++--- t/mh_reader.t | 107 +++++++++++++++++++++++++++++++++ 14 files changed, 392 insertions(+), 33 deletions(-) create mode 100644 
lib/PublicInbox/MHreader.pm create mode 100644 lib/PublicInbox/MdirSort.pm create mode 100644 t/mh_reader.t diff --git a/MANIFEST b/MANIFEST index 109ce88a..051cd6f9 100644 --- a/MANIFEST +++ b/MANIFEST @@ -296,6 +296,7 @@ lib/PublicInbox/Linkify.pm lib/PublicInbox/Listener.pm lib/PublicInbox/Lock.pm lib/PublicInbox/MDA.pm +lib/PublicInbox/MHreader.pm lib/PublicInbox/MID.pm lib/PublicInbox/MIME.pm lib/PublicInbox/MailDiff.pm @@ -305,6 +306,7 @@ lib/PublicInbox/MboxGz.pm lib/PublicInbox/MboxLock.pm lib/PublicInbox/MboxReader.pm lib/PublicInbox/MdirReader.pm +lib/PublicInbox/MdirSort.pm lib/PublicInbox/MiscIdx.pm lib/PublicInbox/MiscSearch.pm lib/PublicInbox/MsgIter.pm @@ -547,6 +549,7 @@ t/mda-mime.eml t/mda.t t/mda_filter_rubylang.t t/mdir_reader.t +t/mh_reader.t t/mid.t t/mime.t t/miscsearch.t diff --git a/lib/PublicInbox/LEI.pm b/lib/PublicInbox/LEI.pm index 17431518..e0cfd55a 100644 --- a/lib/PublicInbox/LEI.pm +++ b/lib/PublicInbox/LEI.pm @@ -267,7 +267,7 @@ import => [ 'LOCATION...|--stdin [LABELS...]', 'one-time import/update from URL or filesystem', qw(stdin| offset=i recursive|r exclude=s include|I=s new-only lock=s@ in-format|F=s kw! verbose|v+ incremental! mail-sync! - commit-delay=i), + commit-delay=i sort|s:s@), @net_opt, @c_opt ], 'forget-mail-sync' => [ 'LOCATION...', 'forget sync information for a mail folder', @c_opt ], @@ -280,7 +280,7 @@ import => [ 'LOCATION...|--stdin [LABELS...]', 'convert' => [ 'LOCATION...|--stdin', 'one-time conversion from URL or filesystem to another format', qw(stdin| in-format|F=s out-format|f=s output|mfolder|o=s lock=s@ kw! 
- rsyncable), + rsyncable sort|s:s@), @net_opt, @c_opt ], 'p2q' => [ 'LOCATION_OR_COMMIT...|--stdin', "use a patch to generate a query for `lei q --stdin'", @@ -321,6 +321,9 @@ import => [ 'LOCATION...|--stdin [LABELS...]', my $stdin_formats = [ 'MAIL_FORMAT|eml|mboxrd|mboxcl2|mboxcl|mboxo', 'specify message input format' ]; my $ls_format = [ 'OUT|plain|json|null', 'listing output format' ]; +my $sort_out = [ 'VAL|received|relevance|docid', + "order of results is `--output'-dependent"]; +my $sort_in = [ 'sequence|mtime|size', 'sort input (format-dependent)' ]; # we use \x{a0} (non-breaking SP) to avoid wrapping in PublicInbox::LeiHelp my %OPTDESC = ( @@ -428,8 +431,10 @@ my %OPTDESC = ( 'limit|n=i@' => ['NUM', 'limit on number of matches (default: 10000)' ], 'offset=i' => ['OFF', 'search result offset (default: 0)'], -'sort|s=s' => [ 'VAL|received|relevance|docid', - "order of results is `--output'-dependent"], +'sort|s=s q' => $sort_out, +'sort|s=s lcat' => $sort_out, +'sort|s:s@ convert' => $sort_in, +'sort|s:s@ import' => $sort_in, 'reverse|r' => 'reverse search results', # like sort(1) 'boost=i' => 'increase/decrease priority of results (default: 0)', diff --git a/lib/PublicInbox/LeiConvert.pm b/lib/PublicInbox/LeiConvert.pm index 8f628562..17a952f2 100644 --- a/lib/PublicInbox/LeiConvert.pm +++ b/lib/PublicInbox/LeiConvert.pm @@ -28,6 +28,11 @@ sub input_maildir_cb { $self->{wcb}->(undef, { kw => $kw }, $eml); } +sub input_mh_cb { + my ($dn, $bn, $kw, $eml, $self) = @_; + $self->{wcb}->(undef, { kw => $kw }, $eml); +} + sub process_inputs { # via wq_do my ($self) = @_; local $PublicInbox::DS::in_loop = 0; # force synchronous awaitpid diff --git a/lib/PublicInbox/LeiImport.pm b/lib/PublicInbox/LeiImport.pm index c2552bf0..5521188c 100644 --- a/lib/PublicInbox/LeiImport.pm +++ b/lib/PublicInbox/LeiImport.pm @@ -53,6 +53,29 @@ sub pmdir_cb { # called via wq_io_do from LeiPmdir->each_mdir_fn } } +sub input_mh_cb { + my ($mhdir, $n, $kw, $eml, $self) = @_; + 
substr($mhdir, 0, 0) = 'mh:'; # add prefix + my $lse = $self->{lse} //= $self->{lei}->{sto}->search; + my $lms = $self->{-lms_rw} //= $self->{lei}->lms; # may be 0 or undef + my @oidbin = $lms ? $lms->num_oidbin($mhdir, $n) : (); + @oidbin > 1 and warn("W: $mhdir/$n not unique:\n", + map { "\t".unpack('H*', $_)."\n" } @oidbin); + my @docids = sort { $a <=> $b } uniqstr + map { $lse->over->oidbin_exists($_) } @oidbin; + if (scalar @docids) { + $lse->kw_changed(undef, $kw, \@docids) or return; + } + if (defined $eml) { + my $vmd = $self->{-import_kw} ? { kw => $kw } : undef; + $vmd->{sync_info} = [ $mhdir, $n + 0 ] if $self->{-mail_sync}; + $self->input_eml_cb($eml, $vmd); + } + # TODO: + # elsif (my $ikw = $self->{lei}->{ikw}) { # old message, kw only + # $ikw->wq_io_do('ck_update_kw', [], "mh:$dir", $uid, $kw); +} + sub input_net_cb { # imap_each / nntp_each my ($uri, $uid, $kw, $eml, $self) = @_; if (defined $eml) { diff --git a/lib/PublicInbox/LeiImportKw.pm b/lib/PublicInbox/LeiImportKw.pm index 4b8e69fb..765e23cd 100644 --- a/lib/PublicInbox/LeiImportKw.pm +++ b/lib/PublicInbox/LeiImportKw.pm @@ -36,7 +36,7 @@ sub ipc_atfork_child { sub ck_update_kw { # via wq_io_do my ($self, $url, $uid, $kw) = @_; my @oidbin = $self->{-lms_rw}->num_oidbin($url, $uid); - my $uid_url = "$url/;UID=$uid"; + my $uid_url = index($url, 'mh:') == 0 ? 
$url.$uid : "$url/;UID=$uid"; @oidbin > 1 and warn("W: $uid_url not unique:\n", map { "\t".unpack('H*', $_)."\n" } @oidbin); my @docids = sort { $a <=> $b } uniqstr diff --git a/lib/PublicInbox/LeiIndex.pm b/lib/PublicInbox/LeiIndex.pm index b3f3e1a0..0e329e58 100644 --- a/lib/PublicInbox/LeiIndex.pm +++ b/lib/PublicInbox/LeiIndex.pm @@ -35,7 +35,7 @@ sub lei_index { no warnings 'once'; no strict 'refs'; -for my $m (qw(pmdir_cb input_net_cb)) { +for my $m (qw(pmdir_cb input_net_cb input_mh_cb)) { *$m = PublicInbox::LeiImport->can($m); } diff --git a/lib/PublicInbox/LeiInput.pm b/lib/PublicInbox/LeiInput.pm index daba9a8e..947a7a79 100644 --- a/lib/PublicInbox/LeiInput.pm +++ b/lib/PublicInbox/LeiInput.pm @@ -69,6 +69,11 @@ sub input_maildir_cb { $self->input_eml_cb($eml); } +sub input_mh_cb { + my ($dn, $n, $kw, $eml, $self) = @_; + $self->input_eml_cb($eml); +} + sub input_net_cb { # imap_each, nntp_each cb my ($url, $uid, $kw, $eml, $self) = @_; $self->input_eml_cb($eml); @@ -190,7 +195,7 @@ sub input_path_url { $ifmt = lc($1); } elsif ($input =~ /\.(?:patch|eml)\z/i) { $ifmt = 'eml'; - } elsif (-f $input && $input =~ m{\A(?:.+)/(?:new|cur)/([^/]+)\z}) { + } elsif ($input =~ m{\A(?:.+)/(?:new|cur)/([^/]+)\z} && -f $input) { my $bn = $1; my $fl = PublicInbox::MdirReader::maildir_basename_flags($bn); return if index($fl, 'T') >= 0; @@ -204,6 +209,10 @@ sub input_path_url { my $devfd = $lei->path_to_fd($input) // return; if ($devfd >= 0) { $self->input_fh($ifmt, $lei->{$devfd}, $input, @args); + } elsif ($devfd < 0 && $input =~ m{\A(.+/)([0-9]+)\z} && -f $input) { + my ($dn, $n) = ($1, $2); + my $mhr = PublicInbox::MHreader->new($dn, $lei->{3}); + $mhr->mh_read_one($n, $self->can('input_mh_cb'), $self); } elsif (-f $input && $ifmt eq 'eml') { open my $fh, '<', $input or return $lei->fail("open($input): $!"); @@ -231,6 +240,10 @@ sub input_path_url { $self->can('input_maildir_cb'), $self, @args); } + } elsif (-d _ && $ifmt eq 'mh') { + my $mhr = 
PublicInbox::MHreader->new($input.'/', $lei->{3}); + $mhr->{sort} = $lei->{opt}->{sort}; + $mhr->mh_each_eml($self->can('input_mh_cb'), $self, @args); } elsif (-d _ && $ifmt =~ /\A(?:v1|v2)\z/) { my $ibx = PublicInbox::Inbox->new({inboxdir => $input}); each_ibx_eml($self, $ibx, @args); @@ -354,13 +367,15 @@ sub prepare_inputs { # returns undef on error PublicInbox::MboxReader->reads($ifmt) or return $lei->fail("$ifmt not supported"); } elsif (-d $input_path) { # TODO extindex - $ifmt =~ /\A(?:maildir|v1|v2|extindex)\z/ or + $ifmt =~ /\A(?:maildir|mh|v1|v2|extindex)\z/ or return$lei->fail("$ifmt not supported"); $input = $input_path; add_dir $lei, $istate, $ifmt, \$input; - } elsif ($self->{missing_ok} && !-e _) { + } elsif ($self->{missing_ok} && + $ifmt =~ /\A(?:maildir|mh)\z/ && + !-e $input_path) { # for "lei rm-watch" on missing Maildir - $may_sync and $input = 'maildir:'. + $may_sync and $input = "$ifmt:". $lei->abs_path($input_path); } else { my $m = "Unable to handle $input"; @@ -373,7 +388,7 @@ sub prepare_inputs { # returns undef on error $input is `eml', not --in-format=$in_fmt push @{$sync->{no}}, $input if $sync; - } elsif (-f $input && $input =~ m{\A(.+)/(new|cur)/([^/]+)\z}) { + } elsif ($input =~ m{\A(.+)/(new|cur)/([^/]+)\z} && -f $input) { # single file in a Maildir my ($mdir, $nc, $bn) = ($1, $2, $3); my $other = $mdir . ($nc eq 'new' ? '/cur' : '/new'); @@ -385,12 +400,24 @@ $input is `eml', not --in-format=$in_fmt if ($sync) { $input = $lei->abs_path($mdir) . 
"/$nc/$bn"; - push @{$sync->{ok}}, $input if $sync; + push @{$sync->{ok}}, $input; } require PublicInbox::MdirReader; } else { my $devfd = $lei->path_to_fd($input) // return; - if ($devfd >= 0 || -f $input || -p _) { + if ($devfd < 0 && $input =~ m{\A(.+)/([0-9]+)\z} && + -f $input) { # single file in MH dir + my ($mh, $n) = ($1, $2); + lc($in_fmt//'eml') eq 'eml' or + return $lei->fail(<<""); +$input is `eml', not --in-format=$in_fmt + + if ($sync) { + $input = $lei->abs_path($mh)."/$n"; + push @{$sync->{ok}}, $input; + } + require PublicInbox::MHreader; + } elsif ($devfd >= 0 || -f $input || -p _) { push @{$sync->{no}}, $input if $sync; push @f, $input; } elsif (-d "$input/new" && -d "$input/cur") { @@ -401,10 +428,13 @@ $input is `eml', not --in-format=$in_fmt add_dir $lei, $istate, 'v1', \$input; } elsif (-e "$input/ei.lock") { add_dir $lei, $istate, 'extindex', \$input; + } elsif (-f "$input/.mh_sequences") { + add_dir $lei, $istate, 'mh', \$input; } elsif ($self->{missing_ok} && !-e $input) { if ($lei->{cmd} eq 'p2q') { # will run "git format-patch" } elsif ($may_sync) { # for lei rm-watch + # FIXME: support MH, here $input = 'maildir:'. 
$lei->abs_path($input); } @@ -446,6 +476,14 @@ $input is `eml', not --in-format=$in_fmt $lei->refresh_watches; } } + if (my $mh = $istate->{mh}) { + require PublicInbox::MHreader; + grep(!m!\Amh:!i, @$mh) and die "BUG: @$mh (no pfx)"; + if ($may_sync && $lei->{sto}) { + $lei->lms(1)->lms_write_prepare->add_folders(@$mh); + # $lei->refresh_watches; TODO + } + } require PublicInbox::ExtSearch if $istate->{extindex}; $self->{inputs} = $inputs; } diff --git a/lib/PublicInbox/LeiMailSync.pm b/lib/PublicInbox/LeiMailSync.pm index 17254a82..593715dc 100644 --- a/lib/PublicInbox/LeiMailSync.pm +++ b/lib/PublicInbox/LeiMailSync.pm @@ -435,15 +435,24 @@ sub folders { map { $_->[0] } @{$sth->fetchall_arrayref}; } +sub blob_mismatch ($$$) { + my ($f, $oidhex, $rawref) = @_; + my $sha = $HEXLEN2SHA{length($oidhex)}; + my $got = git_sha($sha, $rawref)->hexdigest; + $got eq $oidhex ? undef : warn("$f changed $oidhex => $got\n"); +} + sub local_blob { my ($self, $oidhex, $vrfy) = @_; my $dbh = $self->{dbh} //= dbh_new($self); + my $oidbin = pack('H*', $oidhex); + my $b2n = $dbh->prepare(<<''); SELECT f.loc,b.name FROM blob2name b LEFT JOIN folders f ON b.fid = f.fid WHERE b.oidbin = ? - $b2n->bind_param(1, pack('H*', $oidhex), SQL_BLOB); + $b2n->bind_param(1, $oidbin, SQL_BLOB); $b2n->execute; while (my ($d, $n) = $b2n->fetchrow_array) { substr($d, 0, length('maildir:')) = ''; @@ -456,19 +465,28 @@ WHERE b.oidbin = ? 
my $f = "$d/$x/$n"; open my $fh, '<', $f or next; # some (buggy) Maildir writers are non-atomic: - next unless -s $fh; - my $raw = read_all($fh, -s _); - if ($vrfy) { - my $sha = $HEXLEN2SHA{length($oidhex)}; - my $got = git_sha($sha, \$raw)->hexdigest; - if ($got ne $oidhex) { - warn "$f changed $oidhex => $got\n"; - next; - } - } + my $raw = read_all($fh, -s $fh // next); + next if $vrfy && blob_mismatch $f, $oidhex, \$raw; return \$raw; } } + + # MH, except `uid' is not always unique (can be packed) + $b2n = $dbh->prepare(<<''); +SELECT f.loc,b.uid FROM blob2num b +LEFT JOIN folders f ON b.fid = f.fid +WHERE b.oidbin = ? AND f.loc REGEXP '^mh:/' + + $b2n->bind_param(1, $oidbin, SQL_BLOB); + $b2n->execute; + while (my ($f, $n) = $b2n->fetchrow_array) { + $f =~ s/\Amh://s or die "BUG: not MH: $f"; + $f .= "/$n"; + open my $fh, '<', $f or next; + my $raw = read_all($fh, -s $fh // next); + next if blob_mismatch $f, $oidhex, \$raw; + return \$raw; + } undef; } diff --git a/lib/PublicInbox/LeiToMail.pm b/lib/PublicInbox/LeiToMail.pm index 071ba113..de75e99e 100644 --- a/lib/PublicInbox/LeiToMail.pm +++ b/lib/PublicInbox/LeiToMail.pm @@ -400,6 +400,11 @@ sub new { "$dst exists and is not a directory\n"; $lei->{ovv}->{dst} = $dst .= '/' if substr($dst, -1) ne '/'; $lei->{opt}->{save} //= \1 if $lei->{cmd} eq 'q'; + } elsif ($fmt eq 'mh') { + -e $dst && !-d _ and die + "$dst exists and is not a directory\n"; + $lei->{ovv}->{dst} = $dst .= '/' if substr($dst, -1) ne '/'; + $lei->{opt}->{save} //= \1 if $lei->{cmd} eq 'q'; } elsif (substr($fmt, 0, 4) eq 'mbox') { require PublicInbox::MboxReader; $self->can("eml2$fmt") or die "bad mbox format: $fmt\n"; diff --git a/lib/PublicInbox/MHreader.pm b/lib/PublicInbox/MHreader.pm new file mode 100644 index 00000000..673e3e06 --- /dev/null +++ b/lib/PublicInbox/MHreader.pm @@ -0,0 +1,103 @@ +# Copyright (C) all contributors +# License: AGPL-3.0+ + +# MH reader, based on Lib/mailbox.py in cpython source +package 
PublicInbox::MHreader; +use v5.12; +use PublicInbox::InboxWritable qw(eml_from_path); +use PublicInbox::OnDestroy; +use PublicInbox::IO qw(try_cat); +use PublicInbox::MdirSort; +use Carp qw(carp); +use autodie qw(chdir closedir opendir); + +my %FL2OFF = ( # mh_sequences key => our keyword + replied => 0, + flagged => 1, + unseen => 2, # negate +); +my @OFF2KW = qw(answered flagged); # [2] => unseen (negated) + +sub new { + my ($cls, $dir, $cwdfh) = @_; + if (substr($dir, -1) ne '/') { # TODO: do this earlier + carp "W: appending `/' to `$dir' (fix caller)\n"; + $dir .= '/'; + } + bless { dir => $dir, cwdfh => $cwdfh }, $cls; +} + +sub read_mh_sequences ($) { # caller must chdir($self->{dir}) + my ($self) = @_; + my ($fl, $off, @n); + my @seq = ('', '', ''); + for (split /\n+/s, try_cat('.mh_sequences')) { + ($fl, @n) = split /[: \t]+/; + $off = $FL2OFF{$fl} // do { warn <{dir}.mh_sequences (ignoring) +EOM + next; + }; + @n = grep /\A[0-9]+\z/s, @n; # don't stat, yet + if (@n) { + @n = sort { $b <=> $a } @n; # to avoid resize + my $buf = ''; + vec($buf, $_, 1) = 1 for @n; + $seq[$off] = $buf; + } + } + \@seq; +} + +sub mh_each_file { + my ($self, $efcb, @arg) = @_; + opendir(my $dh, my $dir = $self->{dir}); + my $restore = PublicInbox::OnDestroy->new($$, \&chdir, $self->{cwdfh}); + chdir($dh); + if (defined(my $sort = $self->{sort})) { + my @sort = map { + my @tmp = $_ eq '' ? ('sequence') : split(/[, ]/); + # sorting by name alphabetically makes no sense for MH: + for my $k (@tmp) { + s/\A(\-|\+|)(?:name|)\z/$1sequence/; + } + @tmp; + } @$sort; + my @n = grep /\A[0-9]+\z/s, readdir $dh; + mdir_sort \@n, \@sort; + $efcb->($dir, $_, $self, @arg) for @n; + } else { + while (readdir $dh) { # perl v5.12+ to set $_ on readdir + $efcb->($dir, $_, $self, @arg) if /\A[0-9]+\z/s; + } + } + closedir $dh; # may die +} + +sub kw_for ($$) { + my ($self, $n) = @_; + my $seq = $self->{mh_seq} //= read_mh_sequences($self); + my @kw = map { vec($seq->[$_], $n, 1) ? 
$OFF2KW[$_] : () } (0, 1); + vec($seq->[2], $n, 1) or push @kw, 'seen'; + \@kw; +} + +sub _file2eml { # mh_each_file cb + my ($dir, $n, $self, $ucb, @arg) = @_; + my $eml = eml_from_path($n); + $ucb->($dir, $n, kw_for($self, $n), $eml, @arg) if $eml; +} + +sub mh_each_eml { + my ($self, $ucb, @arg) = @_; + mh_each_file($self, \&_file2eml, $ucb, @arg); +} + +sub mh_read_one { + my ($self, $n, $ucb, @arg) = @_; + my $restore = PublicInbox::OnDestroy->new($$, \&chdir, $self->{cwdfh}); + chdir(my $dir = $self->{dir}); + _file2eml($dir, $n, $self, $ucb, @arg); +} + +1; diff --git a/lib/PublicInbox/MdirReader.pm b/lib/PublicInbox/MdirReader.pm index db5f4545..2981b058 100644 --- a/lib/PublicInbox/MdirReader.pm +++ b/lib/PublicInbox/MdirReader.pm @@ -1,7 +1,7 @@ # Copyright (C) all contributors # License: AGPL-3.0+ -# Maildirs for now, MH eventually +# Maildirs only (PublicInbox::MHreader exists, now) # ref: https://cr.yp.to/proto/maildir.html # https://wiki2.dovecot.org/MailboxFormat/Maildir package PublicInbox::MdirReader; diff --git a/lib/PublicInbox/MdirSort.pm b/lib/PublicInbox/MdirSort.pm new file mode 100644 index 00000000..6bd9fb6c --- /dev/null +++ b/lib/PublicInbox/MdirSort.pm @@ -0,0 +1,46 @@ +# Copyright (C) all contributors +# License: AGPL-3.0+ + +# used for sorting MH (and (TODO) Maildir) names +# TODO: consider sort(1) to parallelize sorting of gigantic directories +package PublicInbox::MdirSort; +use v5.12; +use Time::HiRes (); +use parent qw(Exporter); +use Fcntl qw(S_ISREG); +our @EXPORT = qw(mdir_sort); +my %ST = (sequence => 0, size => 1, atime => 2, mtime => 3, ctime => 4); + +sub mdir_sort ($$;$) { + my ($ent, $sort, $max) = @_; + my @st; + my @ent = map { + @st = Time::HiRes::stat $_; + # name, size, {a,m,c}time + S_ISREG($st[2]) ? 
[ $_, @st[7..10] ] : (); + } @$ent; + @ent = grep { $_->[1] <= $max } @ent if $max; + use sort 'stable'; + for my $s (@$sort) { + if ($s =~ /\A(\-|\+|)name\z/) { + if ($1 eq '-') { + @ent = sort { $b->[0] cmp $a->[0] } @ent; + } else { + @ent = sort { $a->[0] cmp $b->[0] } @ent; + } + } elsif ($s =~ /\A(\-|\+|) + (sequence|size|ctime|mtime|atime)\z/x) { + my $key = $ST{$2}; + if ($1 eq '-') { + @ent = sort { $b->[$key] <=> $a->[$key] } @ent; + } else { + @ent = sort { $a->[$key] <=> $b->[$key] } @ent; + } + } else { + die "E: unrecognized sort parameter: `$s'"; + } + } + @$ent = map { $_->[0] } @ent; +} + +1; diff --git a/lib/PublicInbox/TestCommon.pm b/lib/PublicInbox/TestCommon.pm index b0f28e16..d20bff28 100644 --- a/lib/PublicInbox/TestCommon.pm +++ b/lib/PublicInbox/TestCommon.pm @@ -24,6 +24,7 @@ BEGIN { @EXPORT = qw(tmpdir tcp_server tcp_connect require_git require_mods run_script start_script key2sub xsys xsys_e xqx eml_load tick have_xapian_compact json_utf8 setup_public_inboxes create_inbox + create_dir create_coderepo require_bsd kernel_version check_broken_tmpfs quit_waiter_pipe wait_for_eof require_git_http_backend tcp_host_port test_lei lei lei_ok $lei_out $lei_err $lei_opt @@ -843,26 +844,24 @@ sub my_sum { substr PublicInbox::SHA::sha256_hex(join('', @l)), 0, 8; } -sub create_coderepo ($$;@) { - my $ident = shift; - my $cb = pop; +sub create_dir (@) { + my ($ident, $cb) = (shift, pop); my %opt = @_; require PublicInbox::Lock; require PublicInbox::Import; - my ($base) = ($0 =~ m!\b([^/]+)\.[^\.]+\z!); - my ($db) = (PublicInbox::Import::default_branch() =~ m!([^/]+)\z!); my $tmpdir = delete $opt{tmpdir}; - my $dir = "t/data-gen/$base.$ident-".my_sum($db, $cb, \%opt); + my ($base) = ($0 =~ m!\b([^/]+)\.[^\.]+\z!); + my $dir = "t/data-gen/$base.$ident-".my_sum($cb, \%opt); require File::Path; my $new = File::Path::make_path($dir); my $lk = PublicInbox::Lock->new("$dir/creat.lock"); my $scope = $lk->lock_for_scope; if (!-f "$dir/creat.stamp") { - 
opendir(my $dfh, '.'); + opendir(my $cwd, '.'); chdir($dir); local %ENV = (%ENV, %COMMIT_ENV); $cb->($dir); - chdir($dfh); + chdir($cwd); # some $cb chdir around open my $s, '>', "$dir/creat.stamp"; } return $dir if !defined($tmpdir); @@ -870,6 +869,13 @@ sub create_coderepo ($$;@) { $tmpdir; } +sub create_coderepo (@) { + my $ident = shift; + require PublicInbox::Import; + my ($db) = (PublicInbox::Import::default_branch() =~ m!([^/]+)\z!); + create_dir "$ident-$db", @_; +} + sub create_inbox ($;@) { my $ident = shift; my $cb = pop; diff --git a/t/mh_reader.t b/t/mh_reader.t new file mode 100644 index 00000000..e8f69fa8 --- /dev/null +++ b/t/mh_reader.t @@ -0,0 +1,107 @@ +#!perl -w +# Copyright (C) all contributors +# License: AGPL-3.0+ +use PublicInbox::TestCommon; +require_ok 'PublicInbox::MHreader'; +use PublicInbox::IO qw(write_file); +use PublicInbox::Lock; +use PublicInbox::OnDestroy; +use PublicInbox::Eml; +use autodie; +opendir my $cwdfh, '.'; + +my $normal = create_dir 'normal', sub { + write_file '>', 3, "Subject: replied a\n\n"; + write_file '>', 4, "Subject: replied b\n\n"; + write_file '>', 1, "Subject: unseen\n\n"; + write_file '>', 2, "Subject: unseen flagged\n\n"; + write_file '>', '.mh_sequences', <', $name, "Subject: ".($_ x $_)."\n\n"; + } +}; + +my $stale = create_dir 'stale', sub { + write_file '>', 4, "Subject: msg 4\n\n"; + write_file '>', '.mh_sequences', <new("$normal/", $cwdfh); + $mhr->{sort} = [ '' ]; + my @res; + $mhr->mh_each_eml(sub { push @res, \@_; }, [ 'bogus' ]); + is scalar(@res), 4, 'got 4 messages' or diag explain(\@res); + is_deeply [map { $_->[1] } @res], [1, 2, 3, 4], + 'got messages in expected order'; + is scalar(grep { $_->[4]->[0] eq 'bogus' } @res), scalar(@res), + 'cb arg passed to all messages' or diag explain(\@res); + + $mhr = PublicInbox::MHreader->new("$stale/", $cwdfh); + @res = (); + $mhr->mh_each_eml(sub { push @res, \@_; }); + is scalar(@res), 1, 'ignored stale messages'; +} + +test_lei(sub { + lei_ok 
qw(convert -f mboxrd), $normal; + my @msgs = grep /\S/s, split /^From .[^\n]+\n/sm, $lei_out; + my @eml = map { PublicInbox::Eml->new($_) } @msgs; + my $h = 'Subject'; + @eml = sort { $a->header_raw($h) cmp $b->header_raw($h) } @eml; + my @has = map { scalar $_->header_raw($h) } @eml; + is_xdeeply \@has, + [ 'replied a', 'replied b', 'unseen', 'unseen flagged' ], + 'subjects sorted'; + $h = 'X-Status'; + @has = map { scalar $_->header_raw($h) } @eml; + is_xdeeply \@has, [ 'A', 'A', undef, 'F' ], 'answered and flagged kw'; + $h = 'Status'; + @has = map { scalar $_->header_raw($h) } @eml; + is_xdeeply \@has, ['RO', 'RO', 'O', 'O'], 'read and old'; + lei_ok qw(import +L:normal), $normal; + lei_ok qw(q L:normal -f mboxrd); + @msgs = grep /\S/s, split /^From .[^\n]+\n/sm, $lei_out; + my @eml2 = map { PublicInbox::Eml->new($_) } @msgs; + $h = 'Subject'; + @eml2 = sort { $a->header_raw($h) cmp $b->header_raw($h) } @eml2; + is_xdeeply \@eml2, \@eml, 'import preserved kw'; + + lei_ok 'ls-mail-sync'; + is $lei_out, 'mh:'.File::Spec->rel2abs($normal)."\n", + 'mail sync stored'; + + lei_ok qw(convert -s size -f mboxrd), "mh:$for_sort"; + chomp(my @s = grep /^Subject:/, split(/^/sm, $lei_out)); + s/^Subject: // for @s; + is_xdeeply \@s, [ 1, 22, 333 ], 'sorted by size'; + + for my $s ([], [ 'name' ], [ 'sequence' ]) { + lei_ok qw(convert -f mboxrd), "mh:$for_sort", '-s', @$s; + chomp(@s = grep /^Subject:/, split(/^/sm, $lei_out)); + s/^Subject: // for @s; + my $desc = "@$s" || '(default)'; + is_xdeeply \@s, [ 333, 22, 1 ], "sorted by: $desc"; + } + + lei_ok qw(import +L:sorttest), "MH:$for_sort"; + lei_ok 'ls-mail-sync', $for_sort; + is $lei_out, 'mh:'.File::Spec->rel2abs($for_sort)."\n", + "mail sync stored with `MH' normalized to `mh'"; + lei_ok qw(index), 'mh:'.$stale; + lei qw(q -f mboxrd), 's:msg 4'; + like $lei_out, qr/^Subject: msg 4\nStatus: RO\n\n\n/ms, + "message retrieved after `lei index'" +}); + +done_testing;