From: Eric Wong <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH v2] lei: support reading MH for convert+import+index
Date: Fri, 29 Dec 2023 18:05:14 +0000 [thread overview]
Message-ID: <20231229180514.M393557@dcvr> (raw)
In-Reply-To: <20231216130932.479628-1-e@80x24.org>
The MH format is widely supported and used by various MUAs such
as mutt and sylpheed, and an MH-like format is used by mlmmj for
archives, as well. Locking implementations for writes are
inconsistent, so this commit doesn't support writes, yet.
inotify|EVFILT_VNODE watches aren't supported, yet, but that'll
have to come since MH allows packing unused integers and
renaming files.
---
v2 fixes:
* use Perl REGEXP match via DBD::SQLite for folder filtering
* unconditionally verify blob contents
* eliminate unused $tmpdir in test
diff -u b/lib/PublicInbox/LeiMailSync.pm b/lib/PublicInbox/LeiMailSync.pm
--- b/lib/PublicInbox/LeiMailSync.pm
+++ b/lib/PublicInbox/LeiMailSync.pm
@@ -471,19 +471,20 @@
}
}
+ # MH, except `uid' is not always unique (can be packed)
$b2n = $dbh->prepare(<<'');
SELECT f.loc,b.uid FROM blob2num b
LEFT JOIN folders f ON b.fid = f.fid
-WHERE b.oidbin = ? /* AND f.loc LIKE 'mh:/%' */
+WHERE b.oidbin = ? AND f.loc REGEXP '^mh:/'
$b2n->bind_param(1, $oidbin, SQL_BLOB);
$b2n->execute;
- while (my ($d, $n) = $b2n->fetchrow_array) {
- substr($d, 0, length('mh:')) = '';
- my $f = "$d/$n";
+ while (my ($f, $n) = $b2n->fetchrow_array) {
+ $f =~ s/\Amh://s or die "BUG: not MH: $f";
+ $f .= "/$n";
open my $fh, '<', $f or next;
my $raw = read_all($fh, -s $fh // next);
- next if $vrfy && blob_mismatch $f, $oidhex, \$raw;
+ next if blob_mismatch $f, $oidhex, \$raw;
return \$raw;
}
undef;
diff -u b/t/mh_reader.t b/t/mh_reader.t
--- b/t/mh_reader.t
+++ b/t/mh_reader.t
@@ -10,7 +10,6 @@
use autodie;
opendir my $cwdfh, '.';
-my $tmpdir = tmpdir;
my $normal = create_dir 'normal', sub {
write_file '>', 3, "Subject: replied a\n\n";
write_file '>', 4, "Subject: replied b\n\n";
MANIFEST | 3 +
lib/PublicInbox/LEI.pm | 13 ++--
lib/PublicInbox/LeiConvert.pm | 5 ++
lib/PublicInbox/LeiImport.pm | 23 +++++++
lib/PublicInbox/LeiImportKw.pm | 2 +-
lib/PublicInbox/LeiIndex.pm | 2 +-
lib/PublicInbox/LeiInput.pm | 52 +++++++++++++---
lib/PublicInbox/LeiMailSync.pm | 40 ++++++++----
lib/PublicInbox/LeiToMail.pm | 5 ++
lib/PublicInbox/MHreader.pm | 103 +++++++++++++++++++++++++++++++
lib/PublicInbox/MdirReader.pm | 2 +-
lib/PublicInbox/MdirSort.pm | 46 ++++++++++++++
lib/PublicInbox/TestCommon.pm | 22 ++++---
t/mh_reader.t | 107 +++++++++++++++++++++++++++++++++
14 files changed, 392 insertions(+), 33 deletions(-)
create mode 100644 lib/PublicInbox/MHreader.pm
create mode 100644 lib/PublicInbox/MdirSort.pm
create mode 100644 t/mh_reader.t
diff --git a/MANIFEST b/MANIFEST
index 109ce88a..051cd6f9 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -296,6 +296,7 @@ lib/PublicInbox/Linkify.pm
lib/PublicInbox/Listener.pm
lib/PublicInbox/Lock.pm
lib/PublicInbox/MDA.pm
+lib/PublicInbox/MHreader.pm
lib/PublicInbox/MID.pm
lib/PublicInbox/MIME.pm
lib/PublicInbox/MailDiff.pm
@@ -305,6 +306,7 @@ lib/PublicInbox/MboxGz.pm
lib/PublicInbox/MboxLock.pm
lib/PublicInbox/MboxReader.pm
lib/PublicInbox/MdirReader.pm
+lib/PublicInbox/MdirSort.pm
lib/PublicInbox/MiscIdx.pm
lib/PublicInbox/MiscSearch.pm
lib/PublicInbox/MsgIter.pm
@@ -547,6 +549,7 @@ t/mda-mime.eml
t/mda.t
t/mda_filter_rubylang.t
t/mdir_reader.t
+t/mh_reader.t
t/mid.t
t/mime.t
t/miscsearch.t
diff --git a/lib/PublicInbox/LEI.pm b/lib/PublicInbox/LEI.pm
index 17431518..e0cfd55a 100644
--- a/lib/PublicInbox/LEI.pm
+++ b/lib/PublicInbox/LEI.pm
@@ -267,7 +267,7 @@ import => [ 'LOCATION...|--stdin [LABELS...]',
'one-time import/update from URL or filesystem',
qw(stdin| offset=i recursive|r exclude=s include|I=s new-only
lock=s@ in-format|F=s kw! verbose|v+ incremental! mail-sync!
- commit-delay=i),
+ commit-delay=i sort|s:s@),
@net_opt, @c_opt ],
'forget-mail-sync' => [ 'LOCATION...',
'forget sync information for a mail folder', @c_opt ],
@@ -280,7 +280,7 @@ import => [ 'LOCATION...|--stdin [LABELS...]',
'convert' => [ 'LOCATION...|--stdin',
'one-time conversion from URL or filesystem to another format',
qw(stdin| in-format|F=s out-format|f=s output|mfolder|o=s lock=s@ kw!
- rsyncable),
+ rsyncable sort|s:s@),
@net_opt, @c_opt ],
'p2q' => [ 'LOCATION_OR_COMMIT...|--stdin',
"use a patch to generate a query for `lei q --stdin'",
@@ -321,6 +321,9 @@ import => [ 'LOCATION...|--stdin [LABELS...]',
my $stdin_formats = [ 'MAIL_FORMAT|eml|mboxrd|mboxcl2|mboxcl|mboxo',
'specify message input format' ];
my $ls_format = [ 'OUT|plain|json|null', 'listing output format' ];
+my $sort_out = [ 'VAL|received|relevance|docid',
+ "order of results is `--output'-dependent"];
+my $sort_in = [ 'sequence|mtime|size', 'sort input (format-dependent)' ];
# we use \x{a0} (non-breaking SP) to avoid wrapping in PublicInbox::LeiHelp
my %OPTDESC = (
@@ -428,8 +431,10 @@ my %OPTDESC = (
'limit|n=i@' => ['NUM', 'limit on number of matches (default: 10000)' ],
'offset=i' => ['OFF', 'search result offset (default: 0)'],
-'sort|s=s' => [ 'VAL|received|relevance|docid',
- "order of results is `--output'-dependent"],
+'sort|s=s q' => $sort_out,
+'sort|s=s lcat' => $sort_out,
+'sort|s:s@ convert' => $sort_in,
+'sort|s:s@ import' => $sort_in,
'reverse|r' => 'reverse search results', # like sort(1)
'boost=i' => 'increase/decrease priority of results (default: 0)',
diff --git a/lib/PublicInbox/LeiConvert.pm b/lib/PublicInbox/LeiConvert.pm
index 8f628562..17a952f2 100644
--- a/lib/PublicInbox/LeiConvert.pm
+++ b/lib/PublicInbox/LeiConvert.pm
@@ -28,6 +28,11 @@ sub input_maildir_cb {
$self->{wcb}->(undef, { kw => $kw }, $eml);
}
+sub input_mh_cb {
+ my ($dn, $bn, $kw, $eml, $self) = @_;
+ $self->{wcb}->(undef, { kw => $kw }, $eml);
+}
+
sub process_inputs { # via wq_do
my ($self) = @_;
local $PublicInbox::DS::in_loop = 0; # force synchronous awaitpid
diff --git a/lib/PublicInbox/LeiImport.pm b/lib/PublicInbox/LeiImport.pm
index c2552bf0..5521188c 100644
--- a/lib/PublicInbox/LeiImport.pm
+++ b/lib/PublicInbox/LeiImport.pm
@@ -53,6 +53,29 @@ sub pmdir_cb { # called via wq_io_do from LeiPmdir->each_mdir_fn
}
}
+sub input_mh_cb {
+ my ($mhdir, $n, $kw, $eml, $self) = @_;
+ substr($mhdir, 0, 0) = 'mh:'; # add prefix
+ my $lse = $self->{lse} //= $self->{lei}->{sto}->search;
+ my $lms = $self->{-lms_rw} //= $self->{lei}->lms; # may be 0 or undef
+ my @oidbin = $lms ? $lms->num_oidbin($mhdir, $n) : ();
+ @oidbin > 1 and warn("W: $mhdir/$n not unique:\n",
+ map { "\t".unpack('H*', $_)."\n" } @oidbin);
+ my @docids = sort { $a <=> $b } uniqstr
+ map { $lse->over->oidbin_exists($_) } @oidbin;
+ if (scalar @docids) {
+ $lse->kw_changed(undef, $kw, \@docids) or return;
+ }
+ if (defined $eml) {
+ my $vmd = $self->{-import_kw} ? { kw => $kw } : undef;
+ $vmd->{sync_info} = [ $mhdir, $n + 0 ] if $self->{-mail_sync};
+ $self->input_eml_cb($eml, $vmd);
+ }
+ # TODO:
+ # elsif (my $ikw = $self->{lei}->{ikw}) { # old message, kw only
+ # $ikw->wq_io_do('ck_update_kw', [], "mh:$dir", $uid, $kw);
+}
+
sub input_net_cb { # imap_each / nntp_each
my ($uri, $uid, $kw, $eml, $self) = @_;
if (defined $eml) {
diff --git a/lib/PublicInbox/LeiImportKw.pm b/lib/PublicInbox/LeiImportKw.pm
index 4b8e69fb..765e23cd 100644
--- a/lib/PublicInbox/LeiImportKw.pm
+++ b/lib/PublicInbox/LeiImportKw.pm
@@ -36,7 +36,7 @@ sub ipc_atfork_child {
sub ck_update_kw { # via wq_io_do
my ($self, $url, $uid, $kw) = @_;
my @oidbin = $self->{-lms_rw}->num_oidbin($url, $uid);
- my $uid_url = "$url/;UID=$uid";
+ my $uid_url = index($url, 'mh:') == 0 ? $url.$uid : "$url/;UID=$uid";
@oidbin > 1 and warn("W: $uid_url not unique:\n",
map { "\t".unpack('H*', $_)."\n" } @oidbin);
my @docids = sort { $a <=> $b } uniqstr
diff --git a/lib/PublicInbox/LeiIndex.pm b/lib/PublicInbox/LeiIndex.pm
index b3f3e1a0..0e329e58 100644
--- a/lib/PublicInbox/LeiIndex.pm
+++ b/lib/PublicInbox/LeiIndex.pm
@@ -35,7 +35,7 @@ sub lei_index {
no warnings 'once';
no strict 'refs';
-for my $m (qw(pmdir_cb input_net_cb)) {
+for my $m (qw(pmdir_cb input_net_cb input_mh_cb)) {
*$m = PublicInbox::LeiImport->can($m);
}
diff --git a/lib/PublicInbox/LeiInput.pm b/lib/PublicInbox/LeiInput.pm
index daba9a8e..947a7a79 100644
--- a/lib/PublicInbox/LeiInput.pm
+++ b/lib/PublicInbox/LeiInput.pm
@@ -69,6 +69,11 @@ sub input_maildir_cb {
$self->input_eml_cb($eml);
}
+sub input_mh_cb {
+ my ($dn, $n, $kw, $eml, $self) = @_;
+ $self->input_eml_cb($eml);
+}
+
sub input_net_cb { # imap_each, nntp_each cb
my ($url, $uid, $kw, $eml, $self) = @_;
$self->input_eml_cb($eml);
@@ -190,7 +195,7 @@ sub input_path_url {
$ifmt = lc($1);
} elsif ($input =~ /\.(?:patch|eml)\z/i) {
$ifmt = 'eml';
- } elsif (-f $input && $input =~ m{\A(?:.+)/(?:new|cur)/([^/]+)\z}) {
+ } elsif ($input =~ m{\A(?:.+)/(?:new|cur)/([^/]+)\z} && -f $input) {
my $bn = $1;
my $fl = PublicInbox::MdirReader::maildir_basename_flags($bn);
return if index($fl, 'T') >= 0;
@@ -204,6 +209,10 @@ sub input_path_url {
my $devfd = $lei->path_to_fd($input) // return;
if ($devfd >= 0) {
$self->input_fh($ifmt, $lei->{$devfd}, $input, @args);
+ } elsif ($devfd < 0 && $input =~ m{\A(.+/)([0-9]+)\z} && -f $input) {
+ my ($dn, $n) = ($1, $2);
+ my $mhr = PublicInbox::MHreader->new($dn, $lei->{3});
+ $mhr->mh_read_one($n, $self->can('input_mh_cb'), $self);
} elsif (-f $input && $ifmt eq 'eml') {
open my $fh, '<', $input or
return $lei->fail("open($input): $!");
@@ -231,6 +240,10 @@ sub input_path_url {
$self->can('input_maildir_cb'),
$self, @args);
}
+ } elsif (-d _ && $ifmt eq 'mh') {
+ my $mhr = PublicInbox::MHreader->new($input.'/', $lei->{3});
+ $mhr->{sort} = $lei->{opt}->{sort};
+ $mhr->mh_each_eml($self->can('input_mh_cb'), $self, @args);
} elsif (-d _ && $ifmt =~ /\A(?:v1|v2)\z/) {
my $ibx = PublicInbox::Inbox->new({inboxdir => $input});
each_ibx_eml($self, $ibx, @args);
@@ -354,13 +367,15 @@ sub prepare_inputs { # returns undef on error
PublicInbox::MboxReader->reads($ifmt) or return
$lei->fail("$ifmt not supported");
} elsif (-d $input_path) { # TODO extindex
- $ifmt =~ /\A(?:maildir|v1|v2|extindex)\z/ or
+ $ifmt =~ /\A(?:maildir|mh|v1|v2|extindex)\z/ or
return$lei->fail("$ifmt not supported");
$input = $input_path;
add_dir $lei, $istate, $ifmt, \$input;
- } elsif ($self->{missing_ok} && !-e _) {
+ } elsif ($self->{missing_ok} &&
+ $ifmt =~ /\A(?:maildir|mh)\z/ &&
+ !-e $input_path) {
# for "lei rm-watch" on missing Maildir
- $may_sync and $input = 'maildir:'.
+ $may_sync and $input = "$ifmt:".
$lei->abs_path($input_path);
} else {
my $m = "Unable to handle $input";
@@ -373,7 +388,7 @@ sub prepare_inputs { # returns undef on error
$input is `eml', not --in-format=$in_fmt
push @{$sync->{no}}, $input if $sync;
- } elsif (-f $input && $input =~ m{\A(.+)/(new|cur)/([^/]+)\z}) {
+ } elsif ($input =~ m{\A(.+)/(new|cur)/([^/]+)\z} && -f $input) {
# single file in a Maildir
my ($mdir, $nc, $bn) = ($1, $2, $3);
my $other = $mdir . ($nc eq 'new' ? '/cur' : '/new');
@@ -385,12 +400,24 @@ $input is `eml', not --in-format=$in_fmt
if ($sync) {
$input = $lei->abs_path($mdir) . "/$nc/$bn";
- push @{$sync->{ok}}, $input if $sync;
+ push @{$sync->{ok}}, $input;
}
require PublicInbox::MdirReader;
} else {
my $devfd = $lei->path_to_fd($input) // return;
- if ($devfd >= 0 || -f $input || -p _) {
+ if ($devfd < 0 && $input =~ m{\A(.+)/([0-9]+)\z} &&
+ -f $input) { # single file in MH dir
+ my ($mh, $n) = ($1, $2);
+ lc($in_fmt//'eml') eq 'eml' or
+ return $lei->fail(<<"");
+$input is `eml', not --in-format=$in_fmt
+
+ if ($sync) {
+ $input = $lei->abs_path($mh)."/$n";
+ push @{$sync->{ok}}, $input;
+ }
+ require PublicInbox::MHreader;
+ } elsif ($devfd >= 0 || -f $input || -p _) {
push @{$sync->{no}}, $input if $sync;
push @f, $input;
} elsif (-d "$input/new" && -d "$input/cur") {
@@ -401,10 +428,13 @@ $input is `eml', not --in-format=$in_fmt
add_dir $lei, $istate, 'v1', \$input;
} elsif (-e "$input/ei.lock") {
add_dir $lei, $istate, 'extindex', \$input;
+ } elsif (-f "$input/.mh_sequences") {
+ add_dir $lei, $istate, 'mh', \$input;
} elsif ($self->{missing_ok} && !-e $input) {
if ($lei->{cmd} eq 'p2q') {
# will run "git format-patch"
} elsif ($may_sync) { # for lei rm-watch
+ # FIXME: support MH, here
$input = 'maildir:'.
$lei->abs_path($input);
}
@@ -446,6 +476,14 @@ $input is `eml', not --in-format=$in_fmt
$lei->refresh_watches;
}
}
+ if (my $mh = $istate->{mh}) {
+ require PublicInbox::MHreader;
+ grep(!m!\Amh:!i, @$mh) and die "BUG: @$mh (no pfx)";
+ if ($may_sync && $lei->{sto}) {
+ $lei->lms(1)->lms_write_prepare->add_folders(@$mh);
+ # $lei->refresh_watches; TODO
+ }
+ }
require PublicInbox::ExtSearch if $istate->{extindex};
$self->{inputs} = $inputs;
}
diff --git a/lib/PublicInbox/LeiMailSync.pm b/lib/PublicInbox/LeiMailSync.pm
index 17254a82..593715dc 100644
--- a/lib/PublicInbox/LeiMailSync.pm
+++ b/lib/PublicInbox/LeiMailSync.pm
@@ -435,15 +435,24 @@ sub folders {
map { $_->[0] } @{$sth->fetchall_arrayref};
}
+sub blob_mismatch ($$$) {
+ my ($f, $oidhex, $rawref) = @_;
+ my $sha = $HEXLEN2SHA{length($oidhex)};
+ my $got = git_sha($sha, $rawref)->hexdigest;
+ $got eq $oidhex ? undef : warn("$f changed $oidhex => $got\n");
+}
+
sub local_blob {
my ($self, $oidhex, $vrfy) = @_;
my $dbh = $self->{dbh} //= dbh_new($self);
+ my $oidbin = pack('H*', $oidhex);
+
my $b2n = $dbh->prepare(<<'');
SELECT f.loc,b.name FROM blob2name b
LEFT JOIN folders f ON b.fid = f.fid
WHERE b.oidbin = ?
- $b2n->bind_param(1, pack('H*', $oidhex), SQL_BLOB);
+ $b2n->bind_param(1, $oidbin, SQL_BLOB);
$b2n->execute;
while (my ($d, $n) = $b2n->fetchrow_array) {
substr($d, 0, length('maildir:')) = '';
@@ -456,19 +465,28 @@ WHERE b.oidbin = ?
my $f = "$d/$x/$n";
open my $fh, '<', $f or next;
# some (buggy) Maildir writers are non-atomic:
- next unless -s $fh;
- my $raw = read_all($fh, -s _);
- if ($vrfy) {
- my $sha = $HEXLEN2SHA{length($oidhex)};
- my $got = git_sha($sha, \$raw)->hexdigest;
- if ($got ne $oidhex) {
- warn "$f changed $oidhex => $got\n";
- next;
- }
- }
+ my $raw = read_all($fh, -s $fh // next);
+ next if $vrfy && blob_mismatch $f, $oidhex, \$raw;
return \$raw;
}
}
+
+ # MH, except `uid' is not always unique (can be packed)
+ $b2n = $dbh->prepare(<<'');
+SELECT f.loc,b.uid FROM blob2num b
+LEFT JOIN folders f ON b.fid = f.fid
+WHERE b.oidbin = ? AND f.loc REGEXP '^mh:/'
+
+ $b2n->bind_param(1, $oidbin, SQL_BLOB);
+ $b2n->execute;
+ while (my ($f, $n) = $b2n->fetchrow_array) {
+ $f =~ s/\Amh://s or die "BUG: not MH: $f";
+ $f .= "/$n";
+ open my $fh, '<', $f or next;
+ my $raw = read_all($fh, -s $fh // next);
+ next if blob_mismatch $f, $oidhex, \$raw;
+ return \$raw;
+ }
undef;
}
diff --git a/lib/PublicInbox/LeiToMail.pm b/lib/PublicInbox/LeiToMail.pm
index 071ba113..de75e99e 100644
--- a/lib/PublicInbox/LeiToMail.pm
+++ b/lib/PublicInbox/LeiToMail.pm
@@ -400,6 +400,11 @@ sub new {
"$dst exists and is not a directory\n";
$lei->{ovv}->{dst} = $dst .= '/' if substr($dst, -1) ne '/';
$lei->{opt}->{save} //= \1 if $lei->{cmd} eq 'q';
+ } elsif ($fmt eq 'mh') {
+ -e $dst && !-d _ and die
+ "$dst exists and is not a directory\n";
+ $lei->{ovv}->{dst} = $dst .= '/' if substr($dst, -1) ne '/';
+ $lei->{opt}->{save} //= \1 if $lei->{cmd} eq 'q';
} elsif (substr($fmt, 0, 4) eq 'mbox') {
require PublicInbox::MboxReader;
$self->can("eml2$fmt") or die "bad mbox format: $fmt\n";
diff --git a/lib/PublicInbox/MHreader.pm b/lib/PublicInbox/MHreader.pm
new file mode 100644
index 00000000..673e3e06
--- /dev/null
+++ b/lib/PublicInbox/MHreader.pm
@@ -0,0 +1,103 @@
+# Copyright (C) all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+
+# MH reader, based on Lib/mailbox.py in cpython source
+package PublicInbox::MHreader;
+use v5.12;
+use PublicInbox::InboxWritable qw(eml_from_path);
+use PublicInbox::OnDestroy;
+use PublicInbox::IO qw(try_cat);
+use PublicInbox::MdirSort;
+use Carp qw(carp);
+use autodie qw(chdir closedir opendir);
+
+my %FL2OFF = ( # mh_sequences key => our keyword
+ replied => 0,
+ flagged => 1,
+ unseen => 2, # negate
+);
+my @OFF2KW = qw(answered flagged); # [2] => unseen (negated)
+
+sub new {
+ my ($cls, $dir, $cwdfh) = @_;
+ if (substr($dir, -1) ne '/') { # TODO: do this earlier
+ carp "W: appending `/' to `$dir' (fix caller)\n";
+ $dir .= '/';
+ }
+ bless { dir => $dir, cwdfh => $cwdfh }, $cls;
+}
+
+sub read_mh_sequences ($) { # caller must chdir($self->{dir})
+ my ($self) = @_;
+ my ($fl, $off, @n);
+ my @seq = ('', '', '');
+ for (split /\n+/s, try_cat('.mh_sequences')) {
+ ($fl, @n) = split /[: \t]+/;
+ $off = $FL2OFF{$fl} // do { warn <<EOM;
+W: unknown `$fl' in $self->{dir}.mh_sequences (ignoring)
+EOM
+ next;
+ };
+ @n = grep /\A[0-9]+\z/s, @n; # don't stat, yet
+ if (@n) {
+ @n = sort { $b <=> $a } @n; # to avoid resize
+ my $buf = '';
+ vec($buf, $_, 1) = 1 for @n;
+ $seq[$off] = $buf;
+ }
+ }
+ \@seq;
+}
+
+sub mh_each_file {
+ my ($self, $efcb, @arg) = @_;
+ opendir(my $dh, my $dir = $self->{dir});
+ my $restore = PublicInbox::OnDestroy->new($$, \&chdir, $self->{cwdfh});
+ chdir($dh);
+ if (defined(my $sort = $self->{sort})) {
+ my @sort = map {
+ my @tmp = $_ eq '' ? ('sequence') : split(/[, ]/);
+ # sorting by name alphabetically makes no sense for MH:
+ for my $k (@tmp) {
+ s/\A(\-|\+|)(?:name|)\z/$1sequence/;
+ }
+ @tmp;
+ } @$sort;
+ my @n = grep /\A[0-9]+\z/s, readdir $dh;
+ mdir_sort \@n, \@sort;
+ $efcb->($dir, $_, $self, @arg) for @n;
+ } else {
+ while (readdir $dh) { # perl v5.12+ to set $_ on readdir
+ $efcb->($dir, $_, $self, @arg) if /\A[0-9]+\z/s;
+ }
+ }
+ closedir $dh; # may die
+}
+
+sub kw_for ($$) {
+ my ($self, $n) = @_;
+ my $seq = $self->{mh_seq} //= read_mh_sequences($self);
+ my @kw = map { vec($seq->[$_], $n, 1) ? $OFF2KW[$_] : () } (0, 1);
+ vec($seq->[2], $n, 1) or push @kw, 'seen';
+ \@kw;
+}
+
+sub _file2eml { # mh_each_file cb
+ my ($dir, $n, $self, $ucb, @arg) = @_;
+ my $eml = eml_from_path($n);
+ $ucb->($dir, $n, kw_for($self, $n), $eml, @arg) if $eml;
+}
+
+sub mh_each_eml {
+ my ($self, $ucb, @arg) = @_;
+ mh_each_file($self, \&_file2eml, $ucb, @arg);
+}
+
+sub mh_read_one {
+ my ($self, $n, $ucb, @arg) = @_;
+ my $restore = PublicInbox::OnDestroy->new($$, \&chdir, $self->{cwdfh});
+ chdir(my $dir = $self->{dir});
+ _file2eml($dir, $n, $self, $ucb, @arg);
+}
+
+1;
diff --git a/lib/PublicInbox/MdirReader.pm b/lib/PublicInbox/MdirReader.pm
index db5f4545..2981b058 100644
--- a/lib/PublicInbox/MdirReader.pm
+++ b/lib/PublicInbox/MdirReader.pm
@@ -1,7 +1,7 @@
# Copyright (C) all contributors <meta@public-inbox.org>
# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
-# Maildirs for now, MH eventually
+# Maildirs only (PublicInbox::MHreader exists, now)
# ref: https://cr.yp.to/proto/maildir.html
# https://wiki2.dovecot.org/MailboxFormat/Maildir
package PublicInbox::MdirReader;
diff --git a/lib/PublicInbox/MdirSort.pm b/lib/PublicInbox/MdirSort.pm
new file mode 100644
index 00000000..6bd9fb6c
--- /dev/null
+++ b/lib/PublicInbox/MdirSort.pm
@@ -0,0 +1,46 @@
+# Copyright (C) all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+
+# used for sorting MH (and (TODO) Maildir) names
+# TODO: consider sort(1) to parallelize sorting of gigantic directories
+package PublicInbox::MdirSort;
+use v5.12;
+use Time::HiRes ();
+use parent qw(Exporter);
+use Fcntl qw(S_ISREG);
+our @EXPORT = qw(mdir_sort);
+my %ST = (sequence => 0, size => 1, atime => 2, mtime => 3, ctime => 4);
+
+sub mdir_sort ($$;$) {
+ my ($ent, $sort, $max) = @_;
+ my @st;
+ my @ent = map {
+ @st = Time::HiRes::stat $_;
+ # name, size, {a,m,c}time
+ S_ISREG($st[2]) ? [ $_, @st[7..10] ] : ();
+ } @$ent;
+ @ent = grep { $_->[1] <= $max } @ent if $max;
+ use sort 'stable';
+ for my $s (@$sort) {
+ if ($s =~ /\A(\-|\+|)name\z/) {
+ if ($1 eq '-') {
+ @ent = sort { $b->[0] cmp $a->[0] } @ent;
+ } else {
+ @ent = sort { $a->[0] cmp $b->[0] } @ent;
+ }
+ } elsif ($s =~ /\A(\-|\+|)
+ (sequence|size|ctime|mtime|atime)\z/x) {
+ my $key = $ST{$2};
+ if ($1 eq '-') {
+ @ent = sort { $b->[$key] <=> $a->[$key] } @ent;
+ } else {
+ @ent = sort { $a->[$key] <=> $b->[$key] } @ent;
+ }
+ } else {
+ die "E: unrecognized sort parameter: `$s'";
+ }
+ }
+ @$ent = map { $_->[0] } @ent;
+}
+
+1;
diff --git a/lib/PublicInbox/TestCommon.pm b/lib/PublicInbox/TestCommon.pm
index b0f28e16..d20bff28 100644
--- a/lib/PublicInbox/TestCommon.pm
+++ b/lib/PublicInbox/TestCommon.pm
@@ -24,6 +24,7 @@ BEGIN {
@EXPORT = qw(tmpdir tcp_server tcp_connect require_git require_mods
run_script start_script key2sub xsys xsys_e xqx eml_load tick
have_xapian_compact json_utf8 setup_public_inboxes create_inbox
+ create_dir
create_coderepo require_bsd kernel_version check_broken_tmpfs
quit_waiter_pipe wait_for_eof require_git_http_backend
tcp_host_port test_lei lei lei_ok $lei_out $lei_err $lei_opt
@@ -843,26 +844,24 @@ sub my_sum {
substr PublicInbox::SHA::sha256_hex(join('', @l)), 0, 8;
}
-sub create_coderepo ($$;@) {
- my $ident = shift;
- my $cb = pop;
+sub create_dir (@) {
+ my ($ident, $cb) = (shift, pop);
my %opt = @_;
require PublicInbox::Lock;
require PublicInbox::Import;
- my ($base) = ($0 =~ m!\b([^/]+)\.[^\.]+\z!);
- my ($db) = (PublicInbox::Import::default_branch() =~ m!([^/]+)\z!);
my $tmpdir = delete $opt{tmpdir};
- my $dir = "t/data-gen/$base.$ident-".my_sum($db, $cb, \%opt);
+ my ($base) = ($0 =~ m!\b([^/]+)\.[^\.]+\z!);
+ my $dir = "t/data-gen/$base.$ident-".my_sum($cb, \%opt);
require File::Path;
my $new = File::Path::make_path($dir);
my $lk = PublicInbox::Lock->new("$dir/creat.lock");
my $scope = $lk->lock_for_scope;
if (!-f "$dir/creat.stamp") {
- opendir(my $dfh, '.');
+ opendir(my $cwd, '.');
chdir($dir);
local %ENV = (%ENV, %COMMIT_ENV);
$cb->($dir);
- chdir($dfh);
+ chdir($cwd); # some $cb chdir around
open my $s, '>', "$dir/creat.stamp";
}
return $dir if !defined($tmpdir);
@@ -870,6 +869,13 @@ sub create_coderepo ($$;@) {
$tmpdir;
}
+sub create_coderepo (@) {
+ my $ident = shift;
+ require PublicInbox::Import;
+ my ($db) = (PublicInbox::Import::default_branch() =~ m!([^/]+)\z!);
+ create_dir "$ident-$db", @_;
+}
+
sub create_inbox ($;@) {
my $ident = shift;
my $cb = pop;
diff --git a/t/mh_reader.t b/t/mh_reader.t
new file mode 100644
index 00000000..e8f69fa8
--- /dev/null
+++ b/t/mh_reader.t
@@ -0,0 +1,107 @@
+#!perl -w
+# Copyright (C) all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use PublicInbox::TestCommon;
+require_ok 'PublicInbox::MHreader';
+use PublicInbox::IO qw(write_file);
+use PublicInbox::Lock;
+use PublicInbox::OnDestroy;
+use PublicInbox::Eml;
+use autodie;
+opendir my $cwdfh, '.';
+
+my $normal = create_dir 'normal', sub {
+ write_file '>', 3, "Subject: replied a\n\n";
+ write_file '>', 4, "Subject: replied b\n\n";
+ write_file '>', 1, "Subject: unseen\n\n";
+ write_file '>', 2, "Subject: unseen flagged\n\n";
+ write_file '>', '.mh_sequences', <<EOM;
+unseen: 1 2
+flagged: 2
+replied: 3 4
+EOM
+};
+
+my $for_sort = create_dir 'size', sub {
+ for (1..3) {
+ my $name = 10 - $_;
+ write_file '>', $name, "Subject: ".($_ x $_)."\n\n";
+ }
+};
+
+my $stale = create_dir 'stale', sub {
+ write_file '>', 4, "Subject: msg 4\n\n";
+ write_file '>', '.mh_sequences', <<EOM;
+unseen: 1 2
+EOM
+};
+
+{
+ my $mhr = PublicInbox::MHreader->new("$normal/", $cwdfh);
+ $mhr->{sort} = [ '' ];
+ my @res;
+ $mhr->mh_each_eml(sub { push @res, \@_; }, [ 'bogus' ]);
+ is scalar(@res), 4, 'got 4 messages' or diag explain(\@res);
+ is_deeply [map { $_->[1] } @res], [1, 2, 3, 4],
+ 'got messages in expected order';
+ is scalar(grep { $_->[4]->[0] eq 'bogus' } @res), scalar(@res),
+ 'cb arg passed to all messages' or diag explain(\@res);
+
+ $mhr = PublicInbox::MHreader->new("$stale/", $cwdfh);
+ @res = ();
+ $mhr->mh_each_eml(sub { push @res, \@_; });
+ is scalar(@res), 1, 'ignored stale messages';
+}
+
+test_lei(sub {
+ lei_ok qw(convert -f mboxrd), $normal;
+ my @msgs = grep /\S/s, split /^From .[^\n]+\n/sm, $lei_out;
+ my @eml = map { PublicInbox::Eml->new($_) } @msgs;
+ my $h = 'Subject';
+ @eml = sort { $a->header_raw($h) cmp $b->header_raw($h) } @eml;
+ my @has = map { scalar $_->header_raw($h) } @eml;
+ is_xdeeply \@has,
+ [ 'replied a', 'replied b', 'unseen', 'unseen flagged' ],
+ 'subjects sorted';
+ $h = 'X-Status';
+ @has = map { scalar $_->header_raw($h) } @eml;
+ is_xdeeply \@has, [ 'A', 'A', undef, 'F' ], 'answered and flagged kw';
+ $h = 'Status';
+ @has = map { scalar $_->header_raw($h) } @eml;
+ is_xdeeply \@has, ['RO', 'RO', 'O', 'O'], 'read and old';
+ lei_ok qw(import +L:normal), $normal;
+ lei_ok qw(q L:normal -f mboxrd);
+ @msgs = grep /\S/s, split /^From .[^\n]+\n/sm, $lei_out;
+ my @eml2 = map { PublicInbox::Eml->new($_) } @msgs;
+ $h = 'Subject';
+ @eml2 = sort { $a->header_raw($h) cmp $b->header_raw($h) } @eml2;
+ is_xdeeply \@eml2, \@eml, 'import preserved kw';
+
+ lei_ok 'ls-mail-sync';
+ is $lei_out, 'mh:'.File::Spec->rel2abs($normal)."\n",
+ 'mail sync stored';
+
+ lei_ok qw(convert -s size -f mboxrd), "mh:$for_sort";
+ chomp(my @s = grep /^Subject:/, split(/^/sm, $lei_out));
+ s/^Subject: // for @s;
+ is_xdeeply \@s, [ 1, 22, 333 ], 'sorted by size';
+
+ for my $s ([], [ 'name' ], [ 'sequence' ]) {
+ lei_ok qw(convert -f mboxrd), "mh:$for_sort", '-s', @$s;
+ chomp(@s = grep /^Subject:/, split(/^/sm, $lei_out));
+ s/^Subject: // for @s;
+ my $desc = "@$s" || '(default)';
+ is_xdeeply \@s, [ 333, 22, 1 ], "sorted by: $desc";
+ }
+
+ lei_ok qw(import +L:sorttest), "MH:$for_sort";
+ lei_ok 'ls-mail-sync', $for_sort;
+ is $lei_out, 'mh:'.File::Spec->rel2abs($for_sort)."\n",
+ "mail sync stored with `MH' normalized to `mh'";
+ lei_ok qw(index), 'mh:'.$stale;
+ lei qw(q -f mboxrd), 's:msg 4';
+ like $lei_out, qr/^Subject: msg 4\nStatus: RO\n\n\n/ms,
+ "message retrieved after `lei index'"
+});
+
+done_testing;
prev parent reply other threads:[~2023-12-29 18:05 UTC|newest]
Thread overview: 5+ messages / expand[flat|nested] mbox.gz Atom feed top
2023-12-16 13:09 [PATCH] lei: support reading MH for convert+import+index Eric Wong
2023-12-16 16:15 ` Konstantin Ryabitsev
2023-12-16 18:17 ` Eric Wong
2023-12-17 7:59 ` Eric Wong
2023-12-29 18:05 ` Eric Wong [this message]
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
List information: https://public-inbox.org/README
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20231229180514.M393557@dcvr \
--to=e@80x24.org \
--cc=meta@public-inbox.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).