From: Eric Wong <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH 2/2] lei convert: support reading from v1, v2, and extindex
Date: Sat, 30 Sep 2023 00:36:16 +0000 [thread overview]
Message-ID: <20230930003616.3224311-3-e@80x24.org> (raw)
In-Reply-To: <20230930003616.3224311-1-e@80x24.org>
We should be able to dump all public-inbox (v1 and v2) and extindex
directories to Maildir/mbox* or IMAP folders. Even unindexed v1 and v2
inboxes can be dumped as long as inbox.lock (v2) or ssoma.lock (v1)
exists; unindexed extindex directories are not supported.
This change likely works for `lei tag' and other commands which read
their inputs via LeiInput as well, but that's untested at the moment.
I mainly want to be able to use `lei convert' to benchmark some
upcoming changes...
---
lib/PublicInbox/ExtSearch.pm | 6 ++--
lib/PublicInbox/LeiInput.pm | 70 +++++++++++++++++++++++++++++++-----
t/extsearch.t | 24 +++++++++++++
t/lei-convert.t | 40 +++++++++++++++++++++
4 files changed, 129 insertions(+), 11 deletions(-)
diff --git a/lib/PublicInbox/ExtSearch.pm b/lib/PublicInbox/ExtSearch.pm
index fa49a1d0..d43c23e6 100644
--- a/lib/PublicInbox/ExtSearch.pm
+++ b/lib/PublicInbox/ExtSearch.pm
@@ -33,9 +33,11 @@ sub misc {
# same as per-inbox ->over, for now...
sub over {
my ($self) = @_;
- $self->{over} //= do {
+ $self->{over} // eval {
PublicInbox::Inbox::_cleanup_later($self);
- PublicInbox::Over->new("$self->{xpfx}/over.sqlite3");
+ my $over = PublicInbox::Over->new("$self->{xpfx}/over.sqlite3");
+ $over->dbh; # may die
+ $self->{over} = $over;
};
}
diff --git a/lib/PublicInbox/LeiInput.pm b/lib/PublicInbox/LeiInput.pm
index f88c5374..58069b0a 100644
--- a/lib/PublicInbox/LeiInput.pm
+++ b/lib/PublicInbox/LeiInput.pm
@@ -125,6 +125,51 @@ sub handle_http_input ($$@) {
$lei->child_error($?, "@$cmd failed: @err") if @err;
}
+sub oid2eml { # git->cat_async cb
+ my ($bref, $oid, $type, $size, $self) = @_;
+ if ($type eq 'blob') {
+ $self->input_eml_cb(PublicInbox::Eml->new($bref));
+ } else {
+ warn "W: $oid is type=$type\n";
+ }
+}
+
+sub each_ibx_eml_unindexed {
+ my ($self, $ibx, @args) = @_;
+ $ibx->isa('PublicInbox::Inbox') or return $self->{lei}->fail(<<EOM);
+unindexed extindex $ibx->{topdir} not supported
+EOM
+ require PublicInbox::SearchIdx;
+ my $n = $ibx->max_git_epoch;
+ my @g = defined($n) ? map { $ibx->git_epoch($_) } (0..$n) : ($ibx->git);
+ my $sync = { D => {}, ibx => $ibx }; # D => {} filters out deletes
+ my ($f, $at, $ct, $oid, $cmt);
+ for my $git (grep defined, @g) {
+ my $s = PublicInbox::SearchIdx::log2stack($sync, $git, 'HEAD');
+ while (($f, $at, $ct, $oid, $cmt) = $s->pop_rec) {
+ $git->cat_async($oid, \&oid2eml, $self) if $f eq 'm';
+ }
+ $git->cleanup; # wait all
+ }
+}
+
+sub each_ibx_eml {
+ my ($self, $ibx, @args) = @_; # TODO: is @args used at all?
+ my $over = $ibx->over or return each_ibx_eml_unindexed(@_);
+ my $git = $ibx->git;
+ my $prev = 0;
+ my $smsg;
+ my $ids = $over->ids_after(\$prev);
+ while (@$ids) {
+ for (@$ids) {
+ $smsg = $over->get_art($_) // next;
+ $git->cat_async($smsg->{blob}, \&oid2eml, $self);
+ }
+ $ids = $over->ids_after(\$prev);
+ }
+ $git->cat_async_wait;
+}
+
sub input_path_url {
my ($self, $input, @args) = @_;
my $lei = $self->{lei};
@@ -191,6 +236,12 @@ sub input_path_url {
$self->can('input_maildir_cb'),
$self, @args);
}
+ } elsif (-d _ && $ifmt =~ /\A(?:v1|v2)\z/) {
+ my $ibx = PublicInbox::Inbox->new({inboxdir => $input});
+ each_ibx_eml($self, $ibx, @args);
+ } elsif (-d _ && $ifmt eq 'extindex') {
+ my $esrch = PublicInbox::ExtSearch->new($input);
+ each_ibx_eml($self, $esrch, @args);
} elsif ($self->{missing_ok} && !-e $input) { # don't ->fail
if ($lei->{cmd} eq 'p2q') {
my $fp = [ qw(git format-patch --stdout -1), $input ];
@@ -308,9 +359,9 @@ sub prepare_inputs { # returns undef on error
require PublicInbox::MboxReader;
PublicInbox::MboxReader->reads($ifmt) or return
$lei->fail("$ifmt not supported");
- } elsif (-d $input_path) {
- $ifmt eq 'maildir' or return # TODO v1/v2/ei
- $lei->fail("$ifmt not supported");
+ } elsif (-d $input_path) { # TODO extindex
+ $ifmt =~ /\A(?:maildir|v1|v2|extindex)\z/ or
+ return$lei->fail("$ifmt not supported");
$input = $input_path;
add_dir $lei, $istate, $ifmt, \$input;
} elsif ($self->{missing_ok} && !-e _) {
@@ -350,12 +401,12 @@ $input is `eml', not --in-format=$in_fmt
push @f, $input;
} elsif (-d "$input/new" && -d "$input/cur") {
add_dir $lei, $istate, 'maildir', \$input;
- } elsif (-e "$input/inbox.lock") { # TODO
- $lei->fail('v2 inputs not yet supported (TODO)');
- #add_dir $lei, $istate, 'v2', \$input;
- } elsif (-e "$input/ssoma.lock") { # TODO
- $lei->fail('v1 inputs not yet supported (TODO)');
- #add_dir $lei, $istate, 'v1', \$input;
+ } elsif (-e "$input/inbox.lock") {
+ add_dir $lei, $istate, 'v2', \$input;
+ } elsif (-e "$input/ssoma.lock") {
+ add_dir $lei, $istate, 'v1', \$input;
+ } elsif (-e "$input/ei.lock") {
+ add_dir $lei, $istate, 'extindex', \$input;
} elsif ($self->{missing_ok} && !-e $input) {
if ($lei->{cmd} eq 'p2q') {
# will run "git format-patch"
@@ -401,6 +452,7 @@ $input is `eml', not --in-format=$in_fmt
$lei->refresh_watches;
}
}
+ require PublicInbox::ExtSearch if $istate->{extindex};
$self->{inputs} = $inputs;
}
diff --git a/t/extsearch.t b/t/extsearch.t
index 8ded3382..19eaf3b5 100644
--- a/t/extsearch.t
+++ b/t/extsearch.t
@@ -581,4 +581,28 @@ EOM
}
}
+test_lei(sub {
+ my $d = "$home/extindex";
+ lei_ok('convert', '-o', "$home/md1", $d);
+ lei_ok('convert', '-o', "$home/md2", "extindex:$d");
+ my $dst = [];
+ my $cb = sub { push @$dst, $_[2]->as_string };
+ require PublicInbox::MdirReader;
+ PublicInbox::MdirReader->new->maildir_each_eml("$home/md1", $cb);
+ my @md1 = sort { $a cmp $b } @$dst;
+ ok(scalar(@md1), 'dumped messages to md1');
+ $dst = [];
+ PublicInbox::MdirReader->new->maildir_each_eml("$home/md2", $cb);
+ @$dst = sort { $a cmp $b } @$dst;
+ is_deeply($dst, \@md1,
+ "convert from extindex w/ or w/o `extindex' prefix");
+
+ use autodie qw(unlink);
+ my @o = glob "$home/extindex/ei*/over.sqlite*";
+ unlink(@o);
+ ok(!lei('convert', '-o', "$home/fail", "extindex:$d"));
+ like($lei_err, qr/unindexed .*?not supported/,
+ 'noted unindexed extindex is unsupported');
+});
+
done_testing;
diff --git a/t/lei-convert.t b/t/lei-convert.t
index 115e7ed0..d75110cb 100644
--- a/t/lei-convert.t
+++ b/t/lei-convert.t
@@ -7,6 +7,8 @@ use PublicInbox::MdirReader;
use PublicInbox::NetReader;
use PublicInbox::Eml;
use IO::Uncompress::Gunzip;
+use File::Path qw(remove_tree);
+use PublicInbox::Spawn qw(which);
use autodie qw(open);
require_mods(qw(lei -imapd -nntpd Mail::IMAPClient Net::NNTP));
my ($tmpdir, $for_destroy) = tmpdir;
@@ -148,5 +150,43 @@ test_lei({ tmpdir => $tmpdir }, sub {
});
is_deeply(\@tmp, \@bar, 'read rsyncable-gzipped mboxcl2');
}
+ my $cp = which('cp') or xbail 'cp(1) not available (WTF?)';
+ for my $v (1, 2) {
+ my $ibx_dir = "$ro_home/t$v";
+ lei_ok qw(convert -f mboxrd), $ibx_dir,
+ \"dump v$v inbox to mboxrd";
+ my $out = $lei_out;
+ lei_ok qw(convert -f mboxrd), "v$v:$ibx_dir",
+ \"dump v$v inbox to mboxrd w/ v$v:// prefix";
+ is $out, $lei_out, "v$v:// prefix accepted";
+ open my $fh, '<', \$out;
+ my (@mb, @md, @md2);
+ PublicInbox::MboxReader->mboxrd($fh, sub {
+ $_[0]->header_set('Status');
+ push @mb, $_[0]->as_string;
+ });
+ undef $out;
+ ok(scalar(@mb), 'got messages output');
+ my $mdir = "$d/v$v-mdir";
+ lei_ok qw(convert -o), $mdir, $ibx_dir,
+ \"dump v$v inbox to Maildir";
+ PublicInbox::MdirReader->new->maildir_each_eml($mdir, sub {
+ push @md, $_[2]->as_string;
+ });
+ @md = sort { $a cmp $b } @md;
+ @mb = sort { $a cmp $b } @mb;
+ is_deeply(\@mb, \@md, 'got matching inboxes');
+ xsys_e([$cp, '-Rp', $ibx_dir, "$d/tv$v" ]);
+ remove_tree($mdir, "$d/tv$v/public-inbox",
+ glob("$d/tv$v/xap*"));
+
+ lei_ok qw(convert -o), $mdir, "$d/tv$v",
+ \"dump u indexed v$v inbox to Maildir";
+ PublicInbox::MdirReader->new->maildir_each_eml($mdir, sub {
+ push @md2, $_[2]->as_string;
+ });
+ @md2 = sort { $a cmp $b } @md2;
+ is_deeply(\@md, \@md2, 'got matching inboxes even unindexed');
+ }
});
done_testing;
prev parent reply other threads:[~2023-09-30 0:36 UTC|newest]
Thread overview: 3+ messages / expand[flat|nested] mbox.gz Atom feed top
2023-09-30 0:36 [PATCH 0/2] lei: support reading inboxes & extindex w/o search Eric Wong
2023-09-30 0:36 ` [PATCH 1/2] lei_input: always prefix `maildir:' internally Eric Wong
2023-09-30 0:36 ` Eric Wong [this message]
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
List information: https://public-inbox.org/README
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20230930003616.3224311-3-e@80x24.org \
--to=e@80x24.org \
--cc=meta@public-inbox.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as for URLs of the read-only IMAP folder(s) and NNTP newsgroup(s).