unofficial mirror of meta@public-inbox.org
 help / color / mirror / Atom feed
* [PATCH] lei: support reading MH for convert+import+index
@ 2023-12-16 13:09 Eric Wong
  2023-12-16 16:15 ` Konstantin Ryabitsev
  2023-12-29 18:05 ` [PATCH v2] " Eric Wong
  0 siblings, 2 replies; 5+ messages in thread
From: Eric Wong @ 2023-12-16 13:09 UTC (permalink / raw)
  To: meta

The MH format is widely-supported and used by various MUAs such
as mutt and sylpheed, and a MH-like format is used by mlmmj for
archives, as well.  Locking implementations for writes are
inconsistent, so this commit doesn't support writes, yet.

inotify|EVFILT_VNODE watches aren't supported, yet, either.
---
 MANIFEST                       |   3 +
 lib/PublicInbox/LEI.pm         |  13 ++--
 lib/PublicInbox/LeiConvert.pm  |   5 ++
 lib/PublicInbox/LeiImport.pm   |  23 +++++++
 lib/PublicInbox/LeiImportKw.pm |   2 +-
 lib/PublicInbox/LeiIndex.pm    |   2 +-
 lib/PublicInbox/LeiInput.pm    |  52 +++++++++++++---
 lib/PublicInbox/LeiMailSync.pm |  39 ++++++++----
 lib/PublicInbox/LeiToMail.pm   |   5 ++
 lib/PublicInbox/MHreader.pm    | 103 +++++++++++++++++++++++++++++++
 lib/PublicInbox/MdirReader.pm  |   2 +-
 lib/PublicInbox/MdirSort.pm    |  46 ++++++++++++++
 lib/PublicInbox/TestCommon.pm  |  22 ++++---
 t/mh_reader.t                  | 108 +++++++++++++++++++++++++++++++++
 14 files changed, 392 insertions(+), 33 deletions(-)
 create mode 100644 lib/PublicInbox/MHreader.pm
 create mode 100644 lib/PublicInbox/MdirSort.pm
 create mode 100644 t/mh_reader.t

diff --git a/MANIFEST b/MANIFEST
index e22674b7..8bcc3179 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -293,6 +293,7 @@ lib/PublicInbox/Linkify.pm
 lib/PublicInbox/Listener.pm
 lib/PublicInbox/Lock.pm
 lib/PublicInbox/MDA.pm
+lib/PublicInbox/MHreader.pm
 lib/PublicInbox/MID.pm
 lib/PublicInbox/MIME.pm
 lib/PublicInbox/MailDiff.pm
@@ -302,6 +303,7 @@ lib/PublicInbox/MboxGz.pm
 lib/PublicInbox/MboxLock.pm
 lib/PublicInbox/MboxReader.pm
 lib/PublicInbox/MdirReader.pm
+lib/PublicInbox/MdirSort.pm
 lib/PublicInbox/MiscIdx.pm
 lib/PublicInbox/MiscSearch.pm
 lib/PublicInbox/MsgIter.pm
@@ -543,6 +545,7 @@ t/mda-mime.eml
 t/mda.t
 t/mda_filter_rubylang.t
 t/mdir_reader.t
+t/mh_reader.t
 t/mid.t
 t/mime.t
 t/miscsearch.t
diff --git a/lib/PublicInbox/LEI.pm b/lib/PublicInbox/LEI.pm
index 17431518..e0cfd55a 100644
--- a/lib/PublicInbox/LEI.pm
+++ b/lib/PublicInbox/LEI.pm
@@ -267,7 +267,7 @@ import => [ 'LOCATION...|--stdin [LABELS...]',
 	'one-time import/update from URL or filesystem',
 	qw(stdin| offset=i recursive|r exclude=s include|I=s new-only
 	lock=s@ in-format|F=s kw! verbose|v+ incremental! mail-sync!
-	commit-delay=i),
+	commit-delay=i sort|s:s@),
 	@net_opt, @c_opt ],
 'forget-mail-sync' => [ 'LOCATION...',
 	'forget sync information for a mail folder', @c_opt ],
@@ -280,7 +280,7 @@ import => [ 'LOCATION...|--stdin [LABELS...]',
 'convert' => [ 'LOCATION...|--stdin',
 	'one-time conversion from URL or filesystem to another format',
 	qw(stdin| in-format|F=s out-format|f=s output|mfolder|o=s lock=s@ kw!
-		rsyncable),
+		rsyncable sort|s:s@),
 	@net_opt, @c_opt ],
 'p2q' => [ 'LOCATION_OR_COMMIT...|--stdin',
 	"use a patch to generate a query for `lei q --stdin'",
@@ -321,6 +321,9 @@ import => [ 'LOCATION...|--stdin [LABELS...]',
 my $stdin_formats = [ 'MAIL_FORMAT|eml|mboxrd|mboxcl2|mboxcl|mboxo',
 			'specify message input format' ];
 my $ls_format = [ 'OUT|plain|json|null', 'listing output format' ];
+my $sort_out = [ 'VAL|received|relevance|docid',
+		"order of results is `--output'-dependent"];
+my $sort_in = [ 'sequence|mtime|size', 'sort input (format-dependent)' ];
 
 # we use \x{a0} (non-breaking SP) to avoid wrapping in PublicInbox::LeiHelp
 my %OPTDESC = (
@@ -428,8 +431,10 @@ my %OPTDESC = (
 'limit|n=i@' => ['NUM', 'limit on number of matches (default: 10000)' ],
 'offset=i' => ['OFF', 'search result offset (default: 0)'],
 
-'sort|s=s' => [ 'VAL|received|relevance|docid',
-		"order of results is `--output'-dependent"],
+'sort|s=s	q' => $sort_out,
+'sort|s=s	lcat' => $sort_out,
+'sort|s:s@	convert' => $sort_in,
+'sort|s:s@	import' => $sort_in,
 'reverse|r' => 'reverse search results', # like sort(1)
 
 'boost=i' => 'increase/decrease priority of results (default: 0)',
diff --git a/lib/PublicInbox/LeiConvert.pm b/lib/PublicInbox/LeiConvert.pm
index 8f628562..17a952f2 100644
--- a/lib/PublicInbox/LeiConvert.pm
+++ b/lib/PublicInbox/LeiConvert.pm
@@ -28,6 +28,11 @@ sub input_maildir_cb {
 	$self->{wcb}->(undef, { kw => $kw }, $eml);
 }
 
+sub input_mh_cb { # args match what PublicInbox::MHreader::_file2eml passes
+	my ($dn, $bn, $kw, $eml, $self) = @_; # $dn and $bn are not used here
+	$self->{wcb}->(undef, { kw => $kw }, $eml); # hand keywords and message to the {wcb} coderef
+}
+
 sub process_inputs { # via wq_do
 	my ($self) = @_;
 	local $PublicInbox::DS::in_loop = 0; # force synchronous awaitpid
diff --git a/lib/PublicInbox/LeiImport.pm b/lib/PublicInbox/LeiImport.pm
index c2552bf0..5521188c 100644
--- a/lib/PublicInbox/LeiImport.pm
+++ b/lib/PublicInbox/LeiImport.pm
@@ -53,6 +53,29 @@ sub pmdir_cb { # called via wq_io_do from LeiPmdir->each_mdir_fn
 	}
 }
 
+sub input_mh_cb { # args match what PublicInbox::MHreader::_file2eml passes
+	my ($mhdir, $n, $kw, $eml, $self) = @_;
+	substr($mhdir, 0, 0) = 'mh:'; # add prefix; NOTE(review): MHreader->new forces a trailing slash on its dir, so $mhdir/$n below likely has a double slash -- confirm
+	my $lse = $self->{lse} //= $self->{lei}->{sto}->search;
+	my $lms = $self->{-lms_rw} //= $self->{lei}->lms; # may be 0 or undef
+	my @oidbin = $lms ? $lms->num_oidbin($mhdir, $n) : (); # empty without $lms; presumably OIDs already recorded for ($mhdir, $n)
+	@oidbin > 1 and warn("W: $mhdir/$n not unique:\n",
+				map { "\t".unpack('H*', $_)."\n" } @oidbin);
+	my @docids = sort { $a <=> $b } uniqstr
+			map { $lse->over->oidbin_exists($_) } @oidbin;
+	if (scalar @docids) { # known message: stop here unless kw_changed returns true
+		$lse->kw_changed(undef, $kw, \@docids) or return;
+	}
+	if (defined $eml) {
+		my $vmd = $self->{-import_kw} ? { kw => $kw } : undef; # keywords only travel when {-import_kw} is set
+		$vmd->{sync_info} = [ $mhdir, $n + 0 ] if $self->{-mail_sync}; # autovivifies $vmd if undef; + 0 numifies $n
+		$self->input_eml_cb($eml, $vmd);
+	}
+	# TODO:
+	# elsif (my $ikw = $self->{lei}->{ikw}) { # old message, kw only
+	#	$ikw->wq_io_do('ck_update_kw', [], "mh:$dir", $uid, $kw);
+}
+
 sub input_net_cb { # imap_each / nntp_each
 	my ($uri, $uid, $kw, $eml, $self) = @_;
 	if (defined $eml) {
diff --git a/lib/PublicInbox/LeiImportKw.pm b/lib/PublicInbox/LeiImportKw.pm
index 4b8e69fb..765e23cd 100644
--- a/lib/PublicInbox/LeiImportKw.pm
+++ b/lib/PublicInbox/LeiImportKw.pm
@@ -36,7 +36,7 @@ sub ipc_atfork_child {
 sub ck_update_kw { # via wq_io_do
 	my ($self, $url, $uid, $kw) = @_;
 	my @oidbin = $self->{-lms_rw}->num_oidbin($url, $uid);
-	my $uid_url = "$url/;UID=$uid";
+	my $uid_url = index($url, 'mh:') == 0 ? $url.$uid : "$url/;UID=$uid";
 	@oidbin > 1 and warn("W: $uid_url not unique:\n",
 				map { "\t".unpack('H*', $_)."\n" } @oidbin);
 	my @docids = sort { $a <=> $b } uniqstr
diff --git a/lib/PublicInbox/LeiIndex.pm b/lib/PublicInbox/LeiIndex.pm
index b3f3e1a0..0e329e58 100644
--- a/lib/PublicInbox/LeiIndex.pm
+++ b/lib/PublicInbox/LeiIndex.pm
@@ -35,7 +35,7 @@ sub lei_index {
 
 no warnings 'once';
 no strict 'refs';
-for my $m (qw(pmdir_cb input_net_cb)) {
+for my $m (qw(pmdir_cb input_net_cb input_mh_cb)) {
 	*$m = PublicInbox::LeiImport->can($m);
 }
 
diff --git a/lib/PublicInbox/LeiInput.pm b/lib/PublicInbox/LeiInput.pm
index daba9a8e..947a7a79 100644
--- a/lib/PublicInbox/LeiInput.pm
+++ b/lib/PublicInbox/LeiInput.pm
@@ -69,6 +69,11 @@ sub input_maildir_cb {
 	$self->input_eml_cb($eml);
 }
 
+sub input_mh_cb { # args match what PublicInbox::MHreader::_file2eml passes
+	my ($dn, $n, $kw, $eml, $self) = @_; # directory, number and keywords are ignored here
+	$self->input_eml_cb($eml); # only the message itself is forwarded
+}
+
 sub input_net_cb { # imap_each, nntp_each cb
 	my ($url, $uid, $kw, $eml, $self) = @_;
 	$self->input_eml_cb($eml);
@@ -190,7 +195,7 @@ sub input_path_url {
 		$ifmt = lc($1);
 	} elsif ($input =~ /\.(?:patch|eml)\z/i) {
 		$ifmt = 'eml';
-	} elsif (-f $input && $input =~ m{\A(?:.+)/(?:new|cur)/([^/]+)\z}) {
+	} elsif ($input =~ m{\A(?:.+)/(?:new|cur)/([^/]+)\z} && -f $input) {
 		my $bn = $1;
 		my $fl = PublicInbox::MdirReader::maildir_basename_flags($bn);
 		return if index($fl, 'T') >= 0;
@@ -204,6 +209,10 @@ sub input_path_url {
 	my $devfd = $lei->path_to_fd($input) // return;
 	if ($devfd >= 0) {
 		$self->input_fh($ifmt, $lei->{$devfd}, $input, @args);
+	} elsif ($devfd < 0 && $input =~ m{\A(.+/)([0-9]+)\z} && -f $input) {
+		my ($dn, $n) = ($1, $2);
+		my $mhr = PublicInbox::MHreader->new($dn, $lei->{3});
+		$mhr->mh_read_one($n, $self->can('input_mh_cb'), $self);
 	} elsif (-f $input && $ifmt eq 'eml') {
 		open my $fh, '<', $input or
 					return $lei->fail("open($input): $!");
@@ -231,6 +240,10 @@ sub input_path_url {
 						$self->can('input_maildir_cb'),
 						$self, @args);
 		}
+	} elsif (-d _ && $ifmt eq 'mh') {
+		my $mhr = PublicInbox::MHreader->new($input.'/', $lei->{3});
+		$mhr->{sort} = $lei->{opt}->{sort};
+		$mhr->mh_each_eml($self->can('input_mh_cb'), $self, @args);
 	} elsif (-d _ && $ifmt =~ /\A(?:v1|v2)\z/) {
 		my $ibx = PublicInbox::Inbox->new({inboxdir => $input});
 		each_ibx_eml($self, $ibx, @args);
@@ -354,13 +367,15 @@ sub prepare_inputs { # returns undef on error
 				PublicInbox::MboxReader->reads($ifmt) or return
 					$lei->fail("$ifmt not supported");
 			} elsif (-d $input_path) { # TODO extindex
-				$ifmt =~ /\A(?:maildir|v1|v2|extindex)\z/ or
+				$ifmt =~ /\A(?:maildir|mh|v1|v2|extindex)\z/ or
 					return$lei->fail("$ifmt not supported");
 				$input = $input_path;
 				add_dir $lei, $istate, $ifmt, \$input;
-			} elsif ($self->{missing_ok} && !-e _) {
+			} elsif ($self->{missing_ok} &&
+					$ifmt =~ /\A(?:maildir|mh)\z/ &&
+					!-e $input_path) {
 				# for "lei rm-watch" on missing Maildir
-				$may_sync and $input = 'maildir:'.
+				$may_sync and $input = "$ifmt:".
 						$lei->abs_path($input_path);
 			} else {
 				my $m = "Unable to handle $input";
@@ -373,7 +388,7 @@ sub prepare_inputs { # returns undef on error
 $input is `eml', not --in-format=$in_fmt
 
 			push @{$sync->{no}}, $input if $sync;
-		} elsif (-f $input && $input =~ m{\A(.+)/(new|cur)/([^/]+)\z}) {
+		} elsif ($input =~ m{\A(.+)/(new|cur)/([^/]+)\z} && -f $input) {
 			# single file in a Maildir
 			my ($mdir, $nc, $bn) = ($1, $2, $3);
 			my $other = $mdir . ($nc eq 'new' ? '/cur' : '/new');
@@ -385,12 +400,24 @@ $input is `eml', not --in-format=$in_fmt
 
 			if ($sync) {
 				$input = $lei->abs_path($mdir) . "/$nc/$bn";
-				push @{$sync->{ok}}, $input if $sync;
+				push @{$sync->{ok}}, $input;
 			}
 			require PublicInbox::MdirReader;
 		} else {
 			my $devfd = $lei->path_to_fd($input) // return;
-			if ($devfd >= 0 || -f $input || -p _) {
+			if ($devfd < 0 && $input =~ m{\A(.+)/([0-9]+)\z} &&
+					-f $input) { # single file in MH dir
+				my ($mh, $n) = ($1, $2);
+				lc($in_fmt//'eml') eq 'eml' or
+						return $lei->fail(<<"");
+$input is `eml', not --in-format=$in_fmt
+
+				if ($sync) {
+					$input = $lei->abs_path($mh)."/$n";
+					push @{$sync->{ok}}, $input;
+				}
+				require PublicInbox::MHreader;
+			} elsif ($devfd >= 0 || -f $input || -p _) {
 				push @{$sync->{no}}, $input if $sync;
 				push @f, $input;
 			} elsif (-d "$input/new" && -d "$input/cur") {
@@ -401,10 +428,13 @@ $input is `eml', not --in-format=$in_fmt
 				add_dir $lei, $istate, 'v1', \$input;
 			} elsif (-e "$input/ei.lock") {
 				add_dir $lei, $istate, 'extindex', \$input;
+			} elsif (-f "$input/.mh_sequences") {
+				add_dir $lei, $istate, 'mh', \$input;
 			} elsif ($self->{missing_ok} && !-e $input) {
 				if ($lei->{cmd} eq 'p2q') {
 					# will run "git format-patch"
 				} elsif ($may_sync) { # for lei rm-watch
+					# FIXME: support MH, here
 					$input = 'maildir:'.
 						$lei->abs_path($input);
 				}
@@ -446,6 +476,14 @@ $input is `eml', not --in-format=$in_fmt
 			$lei->refresh_watches;
 		}
 	}
+	if (my $mh = $istate->{mh}) {
+		require PublicInbox::MHreader;
+		grep(!m!\Amh:!i, @$mh) and die "BUG: @$mh (no pfx)";
+		if ($may_sync && $lei->{sto}) {
+			$lei->lms(1)->lms_write_prepare->add_folders(@$mh);
+			# $lei->refresh_watches; TODO
+		}
+	}
 	require PublicInbox::ExtSearch if $istate->{extindex};
 	$self->{inputs} = $inputs;
 }
diff --git a/lib/PublicInbox/LeiMailSync.pm b/lib/PublicInbox/LeiMailSync.pm
index 17254a82..8d00d1fa 100644
--- a/lib/PublicInbox/LeiMailSync.pm
+++ b/lib/PublicInbox/LeiMailSync.pm
@@ -435,15 +435,24 @@ sub folders {
 	map { $_->[0] } @{$sth->fetchall_arrayref};
 }
 
+sub blob_mismatch ($$$) { # undef if the data hashes to $oidhex, else warns and returns the result of warn
+	my ($f, $oidhex, $rawref) = @_; # $f is only used in the warning text
+	my $sha = $HEXLEN2SHA{length($oidhex)}; # lookup keyed by the length of the hex OID
+	my $got = git_sha($sha, $rawref)->hexdigest;
+	$got eq $oidhex ? undef : warn("$f changed $oidhex => $got\n"); # warn returns true, so callers can use this as a boolean
+}
+
 sub local_blob {
 	my ($self, $oidhex, $vrfy) = @_;
 	my $dbh = $self->{dbh} //= dbh_new($self);
+	my $oidbin = pack('H*', $oidhex);
+
 	my $b2n = $dbh->prepare(<<'');
 SELECT f.loc,b.name FROM blob2name b
 LEFT JOIN folders f ON b.fid = f.fid
 WHERE b.oidbin = ?
 
-	$b2n->bind_param(1, pack('H*', $oidhex), SQL_BLOB);
+	$b2n->bind_param(1, $oidbin, SQL_BLOB);
 	$b2n->execute;
 	while (my ($d, $n) = $b2n->fetchrow_array) {
 		substr($d, 0, length('maildir:')) = '';
@@ -456,19 +465,27 @@ WHERE b.oidbin = ?
 			my $f = "$d/$x/$n";
 			open my $fh, '<', $f or next;
 			# some (buggy) Maildir writers are non-atomic:
-			next unless -s $fh;
-			my $raw = read_all($fh, -s _);
-			if ($vrfy) {
-				my $sha = $HEXLEN2SHA{length($oidhex)};
-				my $got = git_sha($sha, \$raw)->hexdigest;
-				if ($got ne $oidhex) {
-					warn "$f changed $oidhex => $got\n";
-					next;
-				}
-			}
+			my $raw = read_all($fh, -s $fh // next);
+			next if $vrfy && blob_mismatch $f, $oidhex, \$raw;
 			return \$raw;
 		}
 	}
+
+	$b2n = $dbh->prepare(<<'');
+SELECT f.loc,b.uid FROM blob2num b
+LEFT JOIN folders f ON b.fid = f.fid
+WHERE b.oidbin = ? /* AND f.loc LIKE 'mh:/%' */
+
+	$b2n->bind_param(1, $oidbin, SQL_BLOB);
+	$b2n->execute;
+	while (my ($d, $n) = $b2n->fetchrow_array) {
+		substr($d, 0, length('mh:')) = '';
+		my $f = "$d/$n";
+		open my $fh, '<', $f or next;
+		my $raw = read_all($fh, -s $fh // next);
+		next if $vrfy && blob_mismatch $f, $oidhex, \$raw;
+		return \$raw;
+	}
 	undef;
 }
 
diff --git a/lib/PublicInbox/LeiToMail.pm b/lib/PublicInbox/LeiToMail.pm
index 071ba113..de75e99e 100644
--- a/lib/PublicInbox/LeiToMail.pm
+++ b/lib/PublicInbox/LeiToMail.pm
@@ -400,6 +400,11 @@ sub new {
 				"$dst exists and is not a directory\n";
 		$lei->{ovv}->{dst} = $dst .= '/' if substr($dst, -1) ne '/';
 		$lei->{opt}->{save} //= \1 if $lei->{cmd} eq 'q';
+	} elsif ($fmt eq 'mh') {
+		-e $dst && !-d _ and die
+				"$dst exists and is not a directory\n";
+		$lei->{ovv}->{dst} = $dst .= '/' if substr($dst, -1) ne '/';
+		$lei->{opt}->{save} //= \1 if $lei->{cmd} eq 'q';
 	} elsif (substr($fmt, 0, 4) eq 'mbox') {
 		require PublicInbox::MboxReader;
 		$self->can("eml2$fmt") or die "bad mbox format: $fmt\n";
diff --git a/lib/PublicInbox/MHreader.pm b/lib/PublicInbox/MHreader.pm
new file mode 100644
index 00000000..673e3e06
--- /dev/null
+++ b/lib/PublicInbox/MHreader.pm
@@ -0,0 +1,103 @@
+# Copyright (C) all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+
+# MH reader, based on Lib/mailbox.py in cpython source
+package PublicInbox::MHreader;
+use v5.12;
+use PublicInbox::InboxWritable qw(eml_from_path);
+use PublicInbox::OnDestroy;
+use PublicInbox::IO qw(try_cat);
+use PublicInbox::MdirSort;
+use Carp qw(carp);
+use autodie qw(chdir closedir opendir);
+
+my %FL2OFF = ( # mh_sequences key => our keyword
+	replied => 0,
+	flagged => 1,
+	unseen => 2, # negate
+);
+my @OFF2KW = qw(answered flagged); # [2] => unseen (negated)
+
+sub new { # $dir: MH directory, expected to end in a slash; $cwdfh: stored for the chdir registered in mh_each_file and mh_read_one
+	my ($cls, $dir, $cwdfh) = @_;
+	if (substr($dir, -1) ne '/') { # TODO: do this earlier
+		carp "W: appending `/' to `$dir' (fix caller)\n";
+		$dir .= '/'; # later code concatenates {dir} with names directly
+	}
+	bless { dir => $dir, cwdfh => $cwdfh }, $cls;
+}
+
+sub read_mh_sequences ($) { # caller must chdir($self->{dir})
+	my ($self) = @_;
+	my ($fl, $off, @n);
+	my @seq = ('', '', ''); # one bit vector per %FL2OFF offset
+	for (split /\n+/s, try_cat('.mh_sequences')) {
+		($fl, @n) = split /[: \t]+/; # first field is the sequence name, the rest are its members
+		$off = $FL2OFF{$fl} // do { warn <<EOM;
+W: unknown `$fl' in $self->{dir}.mh_sequences (ignoring)
+EOM
+			next;
+		};
+		@n = grep /\A[0-9]+\z/s, @n; # don't stat, yet; NOTE(review): tokens that are not all digits (e.g. a range like 1-3) are dropped here
+		if (@n) {
+			@n = sort { $b <=> $a } @n; # to avoid resize
+			my $buf = '';
+			vec($buf, $_, 1) = 1 for @n; # bit N set means message N is in this sequence
+			$seq[$off] = $buf; # a later line with the same name replaces the earlier one
+		}
+	}
+	\@seq; # arrayref of bit vectors, indexed as in %FL2OFF
+}
+
+sub mh_each_file { # calls $efcb->($dir, $name, $self, @arg) for every all-digit name in {dir}
+	my ($self, $efcb, @arg) = @_;
+	opendir(my $dh, my $dir = $self->{dir}); # dies on failure (autodie)
+	my $restore = PublicInbox::OnDestroy->new($$, \&chdir, $self->{cwdfh}); # presumably chdirs to {cwdfh} once $restore is destroyed
+	chdir($dh); # callbacks may use bare names as relative paths
+	if (defined(my $sort = $self->{sort})) {
+		my @sort = map { # {sort} is an arrayref; each element may hold several keys
+			my @tmp = $_ eq '' ? ('sequence') : split(/[, ]/);
+			# sorting by name alphabetically makes no sense for MH:
+			for my $k (@tmp) { # bind to $k: a bare s/// would act on the $_ of map, which aliases the caller's array
+				$k =~ s/\A(\-|\+|)(?:name|)\z/${1}sequence/;
+			}
+			@tmp;
+		} @$sort;
+		my @n = grep /\A[0-9]+\z/s, readdir $dh;
+		mdir_sort \@n, \@sort; # reorders @n in place
+		$efcb->($dir, $_, $self, @arg) for @n;
+	} else { # unsorted: stream entries in readdir order
+		while (readdir $dh) { # perl v5.12+ to set $_ on readdir
+			$efcb->($dir, $_, $self, @arg) if /\A[0-9]+\z/s;
+		}
+	}
+	closedir $dh; # may die
+}
+
+sub kw_for ($$) { # returns an arrayref of keywords for message number $n
+	my ($self, $n) = @_;
+	my $seq = $self->{mh_seq} //= read_mh_sequences($self); # parsed on first use, then reused
+	my @kw = map { vec($seq->[$_], $n, 1) ? $OFF2KW[$_] : () } (0, 1); # offsets 0 and 1 map straight to @OFF2KW
+	vec($seq->[2], $n, 1) or push @kw, 'seen'; # offset 2 is negated: not in unseen means seen
+	\@kw;
+}
+
+sub _file2eml { # mh_each_file cb
+	my ($dir, $n, $self, $ucb, @arg) = @_;
+	my $eml = eml_from_path($n); # $n is a bare name: both callers chdir into {dir} first
+	$ucb->($dir, $n, kw_for($self, $n), $eml, @arg) if $eml; # $ucb is not called when $eml is false
+}
+
+sub mh_each_eml { # calls $ucb->($dir, $n, $kw, $eml, @arg) for each message found by mh_each_file
+	my ($self, $ucb, @arg) = @_;
+	mh_each_file($self, \&_file2eml, $ucb, @arg); # _file2eml loads each file and looks up its keywords
+}
+
+sub mh_read_one { # like mh_each_eml, but for the single message number $n
+	my ($self, $n, $ucb, @arg) = @_;
+	my $restore = PublicInbox::OnDestroy->new($$, \&chdir, $self->{cwdfh}); # presumably chdirs to {cwdfh} once $restore is destroyed
+	chdir(my $dir = $self->{dir}); # dies on failure (autodie)
+	_file2eml($dir, $n, $self, $ucb, @arg);
+}
+
+1;
diff --git a/lib/PublicInbox/MdirReader.pm b/lib/PublicInbox/MdirReader.pm
index db5f4545..2981b058 100644
--- a/lib/PublicInbox/MdirReader.pm
+++ b/lib/PublicInbox/MdirReader.pm
@@ -1,7 +1,7 @@
 # Copyright (C) all contributors <meta@public-inbox.org>
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
 
-# Maildirs for now, MH eventually
+# Maildirs only (PublicInbox::MHreader exists, now)
 # ref: https://cr.yp.to/proto/maildir.html
 #	https://wiki2.dovecot.org/MailboxFormat/Maildir
 package PublicInbox::MdirReader;
diff --git a/lib/PublicInbox/MdirSort.pm b/lib/PublicInbox/MdirSort.pm
new file mode 100644
index 00000000..6bd9fb6c
--- /dev/null
+++ b/lib/PublicInbox/MdirSort.pm
@@ -0,0 +1,46 @@
+# Copyright (C) all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+
+# used for sorting MH (and (TODO) Maildir) names
+# TODO: consider sort(1) to parallelize sorting of gigantic directories
+package PublicInbox::MdirSort;
+use v5.12;
+use Time::HiRes ();
+use parent qw(Exporter);
+use Fcntl qw(S_ISREG);
+our @EXPORT = qw(mdir_sort);
+my %ST = (sequence => 0, size => 1, atime => 2, mtime => 3, ctime => 4);
+
+sub mdir_sort ($$;$) { # sorts the names in @$ent in place, dropping anything that is not a regular file
+	my ($ent, $sort, $max) = @_; # $sort: arrayref of keys with optional +/- prefix; $max: optional size limit
+	my @st;
+	my @ent = map {
+		@st = Time::HiRes::stat $_; # empty list when stat fails, e.g. the file vanished
+		# [ name, size, atime, mtime, ctime ]: indices match %ST
+		@st && S_ISREG($st[2]) ? [ $_, @st[7..10] ] : ();
+	} @$ent;
+	@ent = grep { $_->[1] <= $max } @ent if $max; # a false $max (0 or undef) means no limit
+	use sort 'stable';
+	for my $s (@$sort) { # keys are applied in order, so the last key given dominates
+		if ($s =~ /\A(\-|\+|)name\z/) { # string comparison of names
+			if ($1 eq '-') {
+				@ent = sort { $b->[0] cmp $a->[0] } @ent;
+			} else {
+				@ent = sort { $a->[0] cmp $b->[0] } @ent;
+			}
+		} elsif ($s =~ /\A(\-|\+|)
+				(sequence|size|ctime|mtime|atime)\z/x) {
+			my $key = $ST{$2}; # sequence compares the name itself numerically
+			if ($1 eq '-') {
+				@ent = sort { $b->[$key] <=> $a->[$key] } @ent;
+			} else {
+				@ent = sort { $a->[$key] <=> $b->[$key] } @ent;
+			}
+		} else {
+			die "E: unrecognized sort parameter: `$s'";
+		}
+	}
+	@$ent = map { $_->[0] } @ent; # write the surviving names back to the caller's array
+}
+
+1;
diff --git a/lib/PublicInbox/TestCommon.pm b/lib/PublicInbox/TestCommon.pm
index 22c50675..64fe09fa 100644
--- a/lib/PublicInbox/TestCommon.pm
+++ b/lib/PublicInbox/TestCommon.pm
@@ -24,6 +24,7 @@ BEGIN {
 	@EXPORT = qw(tmpdir tcp_server tcp_connect require_git require_mods
 		run_script start_script key2sub xsys xsys_e xqx eml_load tick
 		have_xapian_compact json_utf8 setup_public_inboxes create_inbox
+		create_dir
 		create_coderepo require_bsd kernel_version check_broken_tmpfs
 		quit_waiter_pipe wait_for_eof require_git_http_backend
 		tcp_host_port test_lei lei lei_ok $lei_out $lei_err $lei_opt
@@ -843,26 +844,24 @@ sub my_sum {
 	substr PublicInbox::SHA::sha256_hex(join('', @l)), 0, 8;
 }
 
-sub create_coderepo ($$;@) {
-	my $ident = shift;
-	my $cb = pop;
+sub create_dir (@) {
+	my ($ident, $cb) = (shift, pop);
 	my %opt = @_;
 	require PublicInbox::Lock;
 	require PublicInbox::Import;
-	my ($base) = ($0 =~ m!\b([^/]+)\.[^\.]+\z!);
-	my ($db) = (PublicInbox::Import::default_branch() =~ m!([^/]+)\z!);
 	my $tmpdir = delete $opt{tmpdir};
-	my $dir = "t/data-gen/$base.$ident-".my_sum($db, $cb, \%opt);
+	my ($base) = ($0 =~ m!\b([^/]+)\.[^\.]+\z!);
+	my $dir = "t/data-gen/$base.$ident-".my_sum($cb, \%opt);
 	require File::Path;
 	my $new = File::Path::make_path($dir);
 	my $lk = PublicInbox::Lock->new("$dir/creat.lock");
 	my $scope = $lk->lock_for_scope;
 	if (!-f "$dir/creat.stamp") {
-		opendir(my $dfh, '.');
+		opendir(my $cwd, '.');
 		chdir($dir);
 		local %ENV = (%ENV, %COMMIT_ENV);
 		$cb->($dir);
-		chdir($dfh);
+		chdir($cwd); # some $cb chdir around
 		open my $s, '>', "$dir/creat.stamp";
 	}
 	return $dir if !defined($tmpdir);
@@ -870,6 +869,13 @@ sub create_coderepo ($$;@) {
 	$tmpdir;
 }
 
+sub create_coderepo (@) { # create_dir wrapper: everything after $ident is passed through
+	my $ident = shift;
+	require PublicInbox::Import;
+	my ($db) = (PublicInbox::Import::default_branch() =~ m!([^/]+)\z!); # last path component of the default branch
+	create_dir "$ident-$db", @_; # branch name becomes part of the identifier handed to create_dir
+}
+
 sub create_inbox ($;@) {
 	my $ident = shift;
 	my $cb = pop;
diff --git a/t/mh_reader.t b/t/mh_reader.t
new file mode 100644
index 00000000..4bc77c1e
--- /dev/null
+++ b/t/mh_reader.t
@@ -0,0 +1,108 @@
+#!perl -w
+# Copyright (C) all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use PublicInbox::TestCommon;
+require_ok 'PublicInbox::MHreader';
+use PublicInbox::IO qw(write_file);
+use PublicInbox::Lock;
+use PublicInbox::OnDestroy;
+use PublicInbox::Eml;
+use autodie;
+opendir my $cwdfh, '.';
+
+my $tmpdir = tmpdir;
+my $normal = create_dir 'normal', sub { # four messages plus a .mh_sequences naming all of them
+	write_file '>', 3, "Subject: replied a\n\n";
+	write_file '>', 4, "Subject: replied b\n\n";
+	write_file '>', 1, "Subject: unseen\n\n";
+	write_file '>', 2, "Subject: unseen flagged\n\n";
+	write_file '>', '.mh_sequences', <<EOM;
+unseen: 1 2
+flagged: 2
+replied: 3 4
+EOM
+};
+
+my $for_sort = create_dir 'size', sub { # names 9, 8, 7 hold subjects 1, 22, 333: size grows as the name shrinks
+	for (1..3) {
+		my $name = 10 - $_;
+		write_file '>', $name, "Subject: ".($_ x $_)."\n\n";
+	}
+};
+
+my $stale = create_dir 'stale', sub { # .mh_sequences names 1 and 2, but only message 4 exists
+	write_file '>', 4, "Subject: msg 4\n\n";
+	write_file '>', '.mh_sequences', <<EOM;
+unseen: 1 2
+EOM
+};
+
+{ # exercise PublicInbox::MHreader directly, without going through lei
+	my $mhr = PublicInbox::MHreader->new("$normal/", $cwdfh);
+	$mhr->{sort} = [ '' ]; # an empty key selects the sequence order in mh_each_file
+	my @res;
+	$mhr->mh_each_eml(sub { push @res, \@_; }, [ 'bogus' ]); # each @res entry is [ $dir, $n, $kw, $eml, @arg ]
+	is scalar(@res), 4, 'got 4 messages' or diag explain(\@res);
+	is_deeply [map { $_->[1] } @res], [1, 2, 3, 4],
+		'got messages in expected order';
+	is scalar(grep { $_->[4]->[0] eq 'bogus' } @res), scalar(@res),
+		'cb arg passed to all messages' or diag explain(\@res);
+
+	$mhr = PublicInbox::MHreader->new("$stale/", $cwdfh);
+	@res = ();
+	$mhr->mh_each_eml(sub { push @res, \@_; }); # no {sort} here: takes the unsorted readdir path
+	is scalar(@res), 1, 'ignored stale messages';
+}
+
+test_lei(sub { # end-to-end checks of convert, import, q, ls-mail-sync and index on MH input
+	lei_ok qw(convert -f mboxrd), $normal;
+	my @msgs = grep /\S/s, split /^From .[^\n]+\n/sm, $lei_out; # split the mboxrd output on From lines
+	my @eml = map { PublicInbox::Eml->new($_) } @msgs;
+	my $h = 'Subject';
+	@eml = sort { $a->header_raw($h) cmp $b->header_raw($h) } @eml; # sort by Subject so results do not depend on read order
+	my @has = map { scalar $_->header_raw($h) } @eml;
+	is_xdeeply \@has,
+		[ 'replied a', 'replied b', 'unseen', 'unseen flagged' ],
+		'subjects sorted';
+	$h = 'X-Status';
+	@has = map { scalar $_->header_raw($h) } @eml;
+	is_xdeeply \@has, [ 'A', 'A', undef, 'F' ], 'answered and flagged kw';
+	$h = 'Status';
+	@has = map { scalar $_->header_raw($h) } @eml;
+	is_xdeeply \@has, ['RO', 'RO', 'O', 'O'], 'read and old';
+	lei_ok qw(import +L:normal), $normal;
+	lei_ok qw(q L:normal -f mboxrd);
+	@msgs = grep /\S/s, split /^From .[^\n]+\n/sm, $lei_out;
+	my @eml2 = map { PublicInbox::Eml->new($_) } @msgs;
+	$h = 'Subject';
+	@eml2 = sort { $a->header_raw($h) cmp $b->header_raw($h) } @eml2;
+	is_xdeeply \@eml2, \@eml, 'import preserved kw'; # compares against the convert output from above
+
+	lei_ok 'ls-mail-sync';
+	is $lei_out, 'mh:'.File::Spec->rel2abs($normal)."\n",
+		'mail sync stored';
+
+	lei_ok qw(convert -s size -f mboxrd), "mh:$for_sort";
+	chomp(my @s = grep /^Subject:/, split(/^/sm, $lei_out));
+	s/^Subject: // for @s;
+	is_xdeeply \@s, [ 1, 22, 333 ], 'sorted by size';
+
+	for my $s ([], [ 'name' ], [ 'sequence' ]) { # NOTE(review): names 7..9 are single digits, so alphabetic and numeric order agree here
+		lei_ok qw(convert -f mboxrd), "mh:$for_sort", '-s', @$s;
+		chomp(@s = grep /^Subject:/, split(/^/sm, $lei_out));
+		s/^Subject: // for @s;
+		my $desc = "@$s" || '(default)';
+		is_xdeeply \@s, [ 333, 22, 1 ], "sorted by: $desc";
+	}
+
+	lei_ok qw(import +L:sorttest), "MH:$for_sort"; # upper-case prefix on purpose, see the check below
+	lei_ok 'ls-mail-sync', $for_sort;
+	is $lei_out, 'mh:'.File::Spec->rel2abs($for_sort)."\n",
+		"mail sync stored with `MH' normalized to `mh'";
+	lei_ok qw(index), 'mh:'.$stale;
+	lei qw(q -f mboxrd), 's:msg 4'; # plain lei, not lei_ok: only the output below is checked
+	like $lei_out, qr/^Subject: msg 4\nStatus: RO\n\n\n/ms,
+		"message retrieved after `lei index'"
+});
+
+done_testing;

^ permalink raw reply related	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2023-12-29 18:05 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2023-12-16 13:09 [PATCH] lei: support reading MH for convert+import+index Eric Wong
2023-12-16 16:15 ` Konstantin Ryabitsev
2023-12-16 18:17   ` Eric Wong
2023-12-17  7:59     ` Eric Wong
2023-12-29 18:05 ` [PATCH v2] " Eric Wong

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).