unofficial mirror of meta@public-inbox.org
 help / color / mirror / Atom feed
* [PATCH 0/2] lei: support reading inboxes & extindex w/o search
@ 2023-09-30  0:36 Eric Wong
  2023-09-30  0:36 ` [PATCH 1/2] lei_input: always prefix `maildir:' internally Eric Wong
  2023-09-30  0:36 ` [PATCH 2/2] lei convert: support reading from v1, v2, and extindex Eric Wong
  0 siblings, 2 replies; 3+ messages in thread
From: Eric Wong @ 2023-09-30  0:36 UTC (permalink / raw)
  To: meta

This works on completely unindexed inboxes, even, as long as the
inbox.lock (or ssoma.lock) file exists.

Eric Wong (2):
  lei_input: always prefix `maildir:' internally
  lei convert: support reading from v1, v2, and extindex

 lib/PublicInbox/ExtSearch.pm |   6 +-
 lib/PublicInbox/LeiInput.pm  | 113 ++++++++++++++++++++++++++---------
 t/extsearch.t                |  24 ++++++++
 t/lei-convert.t              |  40 +++++++++++++
 4 files changed, 153 insertions(+), 30 deletions(-)


^ permalink raw reply	[flat|nested] 3+ messages in thread

* [PATCH 1/2] lei_input: always prefix `maildir:' internally
  2023-09-30  0:36 [PATCH 0/2] lei: support reading inboxes & extindex w/o search Eric Wong
@ 2023-09-30  0:36 ` Eric Wong
  2023-09-30  0:36 ` [PATCH 2/2] lei convert: support reading from v1, v2, and extindex Eric Wong
  1 sibling, 0 replies; 3+ messages in thread
From: Eric Wong @ 2023-09-30  0:36 UTC (permalink / raw)
  To: meta

This allows us to reduce stat(2) calls on the `new' and `cur' subdirs
of the Maildir and will also make it easier for us to support
MH, v2, v1, and extindex directories as inputs.
---
 lib/PublicInbox/LeiInput.pm | 57 ++++++++++++++++++++-----------------
 1 file changed, 31 insertions(+), 26 deletions(-)

diff --git a/lib/PublicInbox/LeiInput.pm b/lib/PublicInbox/LeiInput.pm
index b6c2b6bb..f88c5374 100644
--- a/lib/PublicInbox/LeiInput.pm
+++ b/lib/PublicInbox/LeiInput.pm
@@ -1,10 +1,9 @@
-# Copyright (C) 2021 all contributors <meta@public-inbox.org>
+# Copyright (C) all contributors <meta@public-inbox.org>
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
 
 # parent class for LeiImport, LeiConvert, LeiIndex
 package PublicInbox::LeiInput;
-use strict;
-use v5.10.1;
+use v5.12;
 use PublicInbox::DS;
 use PublicInbox::Spawn qw(which popen_rd);
 use PublicInbox::InboxWritable qw(eml_from_path);
@@ -181,10 +180,7 @@ sub input_path_url {
 		}
 		local $PublicInbox::DS::in_loop = 0 if $zsfx; # awaitpid
 		$self->input_fh($ifmt, $mbl->{fh}, $input, @args);
-	} elsif (-d _ && (-d "$input/cur" || -d "$input/new")) {
-		return $lei->fail(<<EOM) if $ifmt && $ifmt ne 'maildir';
-$input appears to be a maildir, not $ifmt
-EOM
+	} elsif (-d _ && $ifmt eq 'maildir') {
 		my $mdr = PublicInbox::MdirReader->new;
 		if (my $pmd = $self->{pmd}) {
 			$mdr->maildir_each_file($input,
@@ -259,6 +255,17 @@ sub prepare_http_input ($$$) {
 	$self->{"-curl-$url"} = [ @curl_opt, $uri ]; # for handle_http_input
 }
 
+sub add_dir ($$$$) { # prefix $$input with "$ifmt:" and record it in $istate
+	my ($lei, $istate, $ifmt, $input) = @_; # $input is a scalar ref
+	if ($istate->{-may_sync}) { # -may_sync set: store the absolute path
+		$$input = "$ifmt:".$lei->abs_path($$input);
+		push @{$istate->{-sync}->{ok}}, $$input if $istate->{-sync};
+	} else {
+		substr($$input, 0, 0) = "$ifmt:"; # prefix in place, path as given
+	}
+	push @{$istate->{$ifmt}}, $$input; # inputs are grouped by format
+}
+
 sub prepare_inputs { # returns undef on error
 	my ($self, $lei, $inputs) = @_;
 	my $in_fmt = $lei->{opt}->{'in-format'};
@@ -272,7 +279,8 @@ sub prepare_inputs { # returns undef on error
 		push @{$sync->{no}}, '/dev/stdin' if $sync;
 	}
 	my $net = $lei->{net}; # NetWriter may be created by l2m
-	my (@f, @md);
+	my @f;
+	my $istate = { -sync => $sync, -may_sync => $may_sync };
 	# e.g. Maildir:/home/user/Mail/ or imaps://example.com/INBOX
 	for my $input (@$inputs) {
 		my $input_path = $input;
@@ -292,11 +300,8 @@ sub prepare_inputs { # returns undef on error
 --in-format=$in_fmt and `$ifmt:' conflict
 
 			}
-			if ($ifmt =~ /\A(?:maildir|mh)\z/i) {
-				push @{$sync->{ok}}, $input if $sync;
-			} else {
-				push @{$sync->{no}}, $input if $sync;
-			}
+			($sync && $ifmt !~ /\A(?:maildir|mh)\z/i) and
+				push(@{$sync->{no}}, $input);
 			my $devfd = $lei->path_to_fd($input_path) // return;
 			if ($devfd >= 0 || (-f $input_path || -p _)) {
 				require PublicInbox::MboxLock;
@@ -304,11 +309,10 @@ sub prepare_inputs { # returns undef on error
 				PublicInbox::MboxReader->reads($ifmt) or return
 					$lei->fail("$ifmt not supported");
 			} elsif (-d $input_path) {
-				$ifmt eq 'maildir' or return
+				$ifmt eq 'maildir' or return # TODO v1/v2/ei
 					$lei->fail("$ifmt not supported");
-				$may_sync and $input = 'maildir:'.
-						$lei->abs_path($input_path);
-				push @md, $input;
+				$input = $input_path;
+				add_dir $lei, $istate, $ifmt, \$input;
 			} elsif ($self->{missing_ok} && !-e _) {
 				# for "lei rm-watch" on missing Maildir
 				$may_sync and $input = 'maildir:'.
@@ -345,12 +349,13 @@ $input is `eml', not --in-format=$in_fmt
 				push @{$sync->{no}}, $input if $sync;
 				push @f, $input;
 			} elsif (-d "$input/new" && -d "$input/cur") {
-				if ($may_sync) {
-					$input = 'maildir:'.
-						$lei->abs_path($input);
-					push @{$sync->{ok}}, $input if $sync;
-				}
-				push @md, $input;
+				add_dir $lei, $istate, 'maildir', \$input;
+			} elsif (-e "$input/inbox.lock") { # TODO
+				$lei->fail('v2 inputs not yet supported (TODO)');
+				#add_dir $lei, $istate, 'v2', \$input;
+			} elsif (-e "$input/ssoma.lock") { # TODO
+				$lei->fail('v1 inputs not yet supported (TODO)');
+				#add_dir $lei, $istate, 'v1', \$input;
 			} elsif ($self->{missing_ok} && !-e $input) {
 				if ($lei->{cmd} eq 'p2q') {
 					# will run "git format-patch"
@@ -382,17 +387,17 @@ $input is `eml', not --in-format=$in_fmt
 		$lei->{auth} //= PublicInbox::LeiAuth->new;
 		$lei->{net} //= $net;
 	}
-	if (scalar(@md)) {
+	if (my $md = $istate->{maildir}) {
 		require PublicInbox::MdirReader;
 		if ($self->can('pmdir_cb')) {
 			require PublicInbox::LeiPmdir;
 			$self->{pmd} = PublicInbox::LeiPmdir->new($lei, $self);
 		}
+		grep(!m!\Amaildir:/!i, @$md) and die "BUG: @$md (no pfx)";
 
 		# start watching Maildirs ASAP
 		if ($may_sync && $lei->{sto}) {
-			grep(!m!\Amaildir:/!i, @md) and die "BUG: @md (no pfx)";
-			$lei->lms(1)->lms_write_prepare->add_folders(@md);
+			$lei->lms(1)->lms_write_prepare->add_folders(@$md);
 			$lei->refresh_watches;
 		}
 	}

^ permalink raw reply related	[flat|nested] 3+ messages in thread

* [PATCH 2/2] lei convert: support reading from v1, v2, and extindex
  2023-09-30  0:36 [PATCH 0/2] lei: support reading inboxes & extindex w/o search Eric Wong
  2023-09-30  0:36 ` [PATCH 1/2] lei_input: always prefix `maildir:' internally Eric Wong
@ 2023-09-30  0:36 ` Eric Wong
  1 sibling, 0 replies; 3+ messages in thread
From: Eric Wong @ 2023-09-30  0:36 UTC (permalink / raw)
  To: meta

We should be able to dump all public-inbox and extindex directories
to Maildir/mbox* or IMAP folders.  Even unindexed inboxes can be
dumped as long as inbox.lock (or ssoma.lock) exists.

This change likely works for `lei tag' and other lei_input-using
things, as well, but that's untested at the moment.  I mainly
want to be able to use `lei convert' to benchmark some upcoming
changes...
---
 lib/PublicInbox/ExtSearch.pm |  6 ++--
 lib/PublicInbox/LeiInput.pm  | 70 +++++++++++++++++++++++++++++++-----
 t/extsearch.t                | 24 +++++++++++++
 t/lei-convert.t              | 40 +++++++++++++++++++++
 4 files changed, 129 insertions(+), 11 deletions(-)

diff --git a/lib/PublicInbox/ExtSearch.pm b/lib/PublicInbox/ExtSearch.pm
index fa49a1d0..d43c23e6 100644
--- a/lib/PublicInbox/ExtSearch.pm
+++ b/lib/PublicInbox/ExtSearch.pm
@@ -33,9 +33,11 @@ sub misc {
 # same as per-inbox ->over, for now...
 sub over {
 	my ($self) = @_;
-	$self->{over} //= do {
+	$self->{over} // eval { # a die in here makes over() return false, uncached
 		PublicInbox::Inbox::_cleanup_later($self);
-		PublicInbox::Over->new("$self->{xpfx}/over.sqlite3");
+		my $over = PublicInbox::Over->new("$self->{xpfx}/over.sqlite3");
+		$over->dbh; # may die
+		$self->{over} = $over; # cache only after ->dbh succeeded
 	};
 }
 
diff --git a/lib/PublicInbox/LeiInput.pm b/lib/PublicInbox/LeiInput.pm
index f88c5374..58069b0a 100644
--- a/lib/PublicInbox/LeiInput.pm
+++ b/lib/PublicInbox/LeiInput.pm
@@ -125,6 +125,51 @@ sub handle_http_input ($$@) {
 	$lei->child_error($?, "@$cmd failed: @err") if @err;
 }
 
+sub oid2eml { # git->cat_async cb: passes each blob to ->input_eml_cb as an Eml
+	my ($bref, $oid, $type, $size, $self) = @_;
+	if ($type eq 'blob') {
+		$self->input_eml_cb(PublicInbox::Eml->new($bref));
+	} else { # anything that is not a blob is only warned about
+		warn "W: $oid is type=$type\n";
+	}
+}
+
+sub each_ibx_eml_unindexed { # fallback of each_ibx_eml when ->over is false
+	my ($self, $ibx, @args) = @_; # NOTE(review): @args is not used below
+	$ibx->isa('PublicInbox::Inbox') or return $self->{lei}->fail(<<EOM);
+unindexed extindex $ibx->{topdir} not supported
+EOM
+	require PublicInbox::SearchIdx;
+	my $n = $ibx->max_git_epoch; # undef: use the lone ->git, not epochs
+	my @g = defined($n) ? map { $ibx->git_epoch($_) } (0..$n) : ($ibx->git);
+	my $sync = { D => {}, ibx => $ibx }; # D => {} filters out deletes
+	my ($f, $at, $ct, $oid, $cmt); # only $f and $oid are read below
+	for my $git (grep defined, @g) { # undef entries of @g are skipped
+		my $s = PublicInbox::SearchIdx::log2stack($sync, $git, 'HEAD');
+		while (($f, $at, $ct, $oid, $cmt) = $s->pop_rec) {
+			$git->cat_async($oid, \&oid2eml, $self) if $f eq 'm';
+		}
+		$git->cleanup; # wait all
+	}
+}
+
+sub each_ibx_eml { # queue oid2eml for every article listed by $ibx->over
+	my ($self, $ibx, @args) = @_; # TODO: is @args used at all?
+	my $over = $ibx->over or return each_ibx_eml_unindexed(@_);
+	my $git = $ibx->git;
+	my $prev = 0; # passed by ref; presumably advanced by ->ids_after
+	my $smsg;
+	my $ids = $over->ids_after(\$prev);
+	while (@$ids) { # an empty batch ends the walk
+		for (@$ids) {
+			$smsg = $over->get_art($_) // next; # undef article: skip
+			$git->cat_async($smsg->{blob}, \&oid2eml, $self);
+		}
+		$ids = $over->ids_after(\$prev);
+	}
+	$git->cat_async_wait; # NOTE(review): presumably drains queued cat_async
+}
+
 sub input_path_url {
 	my ($self, $input, @args) = @_;
 	my $lei = $self->{lei};
@@ -191,6 +236,12 @@ sub input_path_url {
 						$self->can('input_maildir_cb'),
 						$self, @args);
 		}
+	} elsif (-d _ && $ifmt =~ /\A(?:v1|v2)\z/) {
+		my $ibx = PublicInbox::Inbox->new({inboxdir => $input});
+		each_ibx_eml($self, $ibx, @args);
+	} elsif (-d _ && $ifmt eq 'extindex') {
+		my $esrch = PublicInbox::ExtSearch->new($input);
+		each_ibx_eml($self, $esrch, @args);
 	} elsif ($self->{missing_ok} && !-e $input) { # don't ->fail
 		if ($lei->{cmd} eq 'p2q') {
 			my $fp = [ qw(git format-patch --stdout -1), $input ];
@@ -308,9 +359,9 @@ sub prepare_inputs { # returns undef on error
 				require PublicInbox::MboxReader;
 				PublicInbox::MboxReader->reads($ifmt) or return
 					$lei->fail("$ifmt not supported");
-			} elsif (-d $input_path) {
-				$ifmt eq 'maildir' or return # TODO v1/v2/ei
-					$lei->fail("$ifmt not supported");
+			} elsif (-d $input_path) { # TODO extindex
+				$ifmt =~ /\A(?:maildir|v1|v2|extindex)\z/ or
+					return$lei->fail("$ifmt not supported");
 				$input = $input_path;
 				add_dir $lei, $istate, $ifmt, \$input;
 			} elsif ($self->{missing_ok} && !-e _) {
@@ -350,12 +401,12 @@ $input is `eml', not --in-format=$in_fmt
 				push @f, $input;
 			} elsif (-d "$input/new" && -d "$input/cur") {
 				add_dir $lei, $istate, 'maildir', \$input;
-			} elsif (-e "$input/inbox.lock") { # TODO
-				$lei->fail('v2 inputs not yet supported (TODO)');
-				#add_dir $lei, $istate, 'v2', \$input;
-			} elsif (-e "$input/ssoma.lock") { # TODO
-				$lei->fail('v1 inputs not yet supported (TODO)');
-				#add_dir $lei, $istate, 'v1', \$input;
+			} elsif (-e "$input/inbox.lock") {
+				add_dir $lei, $istate, 'v2', \$input;
+			} elsif (-e "$input/ssoma.lock") {
+				add_dir $lei, $istate, 'v1', \$input;
+			} elsif (-e "$input/ei.lock") {
+				add_dir $lei, $istate, 'extindex', \$input;
 			} elsif ($self->{missing_ok} && !-e $input) {
 				if ($lei->{cmd} eq 'p2q') {
 					# will run "git format-patch"
@@ -401,6 +452,7 @@ $input is `eml', not --in-format=$in_fmt
 			$lei->refresh_watches;
 		}
 	}
+	require PublicInbox::ExtSearch if $istate->{extindex};
 	$self->{inputs} = $inputs;
 }
 
diff --git a/t/extsearch.t b/t/extsearch.t
index 8ded3382..19eaf3b5 100644
--- a/t/extsearch.t
+++ b/t/extsearch.t
@@ -581,4 +581,28 @@ EOM
 	}
 }
 
+test_lei(sub { # `lei convert' reading from an extindex directory
+	my $d = "$home/extindex";
+	lei_ok('convert', '-o', "$home/md1", $d); # bare directory
+	lei_ok('convert', '-o', "$home/md2", "extindex:$d"); # explicit prefix
+	my $dst = [];
+	my $cb = sub { push @$dst, $_[2]->as_string }; # fills the current $dst
+	require PublicInbox::MdirReader;
+	PublicInbox::MdirReader->new->maildir_each_eml("$home/md1", $cb);
+	my @md1 = sort { $a cmp $b } @$dst; # sorted so the two dumps compare
+	ok(scalar(@md1), 'dumped messages to md1');
+	$dst = []; # $cb now pushes into this fresh array
+	PublicInbox::MdirReader->new->maildir_each_eml("$home/md2", $cb);
+	@$dst = sort { $a cmp $b } @$dst;
+	is_deeply($dst, \@md1,
+		"convert from extindex w/ or w/o `extindex' prefix");
+
+	use autodie qw(unlink);
+	my @o = glob "$home/extindex/ei*/over.sqlite*";
+	unlink(@o); # without over.sqlite* the convert below must fail
+	ok(!lei('convert', '-o', "$home/fail", "extindex:$d"));
+	like($lei_err, qr/unindexed .*?not supported/,
+		'noted unindexed extindex is unsupported');
+});
+
 done_testing;
diff --git a/t/lei-convert.t b/t/lei-convert.t
index 115e7ed0..d75110cb 100644
--- a/t/lei-convert.t
+++ b/t/lei-convert.t
@@ -7,6 +7,8 @@ use PublicInbox::MdirReader;
 use PublicInbox::NetReader;
 use PublicInbox::Eml;
 use IO::Uncompress::Gunzip;
+use File::Path qw(remove_tree);
+use PublicInbox::Spawn qw(which);
 use autodie qw(open);
 require_mods(qw(lei -imapd -nntpd Mail::IMAPClient Net::NNTP));
 my ($tmpdir, $for_destroy) = tmpdir;
@@ -148,5 +150,43 @@ test_lei({ tmpdir => $tmpdir }, sub {
 		});
 		is_deeply(\@tmp, \@bar, 'read rsyncable-gzipped mboxcl2');
 	}
+	my $cp = which('cp') or xbail 'cp(1) not available (WTF?)';
+	for my $v (1, 2) { # same checks for the v1 and the v2 inbox
+		my $ibx_dir = "$ro_home/t$v";
+		lei_ok qw(convert -f mboxrd), $ibx_dir,
+				\"dump v$v inbox to mboxrd";
+		my $out = $lei_out;
+		lei_ok qw(convert -f mboxrd), "v$v:$ibx_dir",
+				\"dump v$v inbox to mboxrd w/ v$v: prefix";
+		is $out, $lei_out, "v$v: prefix accepted";
+		open my $fh, '<', \$out;
+		my (@mb, @md, @md2);
+		PublicInbox::MboxReader->mboxrd($fh, sub {
+			$_[0]->header_set('Status'); # presumably drops Status:
+			push @mb, $_[0]->as_string;
+		});
+		undef $out;
+		ok(scalar(@mb), 'got messages output');
+		my $mdir = "$d/v$v-mdir";
+		lei_ok qw(convert -o), $mdir, $ibx_dir,
+			\"dump v$v inbox to Maildir";
+		PublicInbox::MdirReader->new->maildir_each_eml($mdir, sub {
+			push @md, $_[2]->as_string;
+		});
+		@md = sort { $a cmp $b } @md;
+		@mb = sort { $a cmp $b } @mb;
+		is_deeply(\@mb, \@md, 'got matching inboxes');
+		xsys_e([$cp, '-Rp', $ibx_dir, "$d/tv$v" ]); # private copy
+		remove_tree($mdir, "$d/tv$v/public-inbox",
+				glob("$d/tv$v/xap*"));
+
+		lei_ok qw(convert -o), $mdir, "$d/tv$v",
+			\"dump unindexed v$v inbox to Maildir";
+		PublicInbox::MdirReader->new->maildir_each_eml($mdir, sub {
+			push @md2, $_[2]->as_string;
+		});
+		@md2 = sort { $a cmp $b } @md2;
+		is_deeply(\@md, \@md2, 'got matching inboxes even unindexed');
+	}
 });
 done_testing;

^ permalink raw reply related	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2023-09-30  0:36 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-09-30  0:36 [PATCH 0/2] lei: support reading inboxes & extindex w/o search Eric Wong
2023-09-30  0:36 ` [PATCH 1/2] lei_input: always prefix `maildir:' internally Eric Wong
2023-09-30  0:36 ` [PATCH 2/2] lei convert: support reading from v1, v2, and extindex Eric Wong

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).