From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.2 required=3.0 tests=ALL_TRUSTED,AWL,BAYES_00, DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,DKIM_VALID_EF shortcircuit=no autolearn=ham autolearn_force=no version=3.4.6 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id DD08B1F565 for ; Sat, 30 Sep 2023 00:36:16 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=80x24.org; s=selector1; t=1696034176; bh=ICcSG6GpitWuI8LjJ53t50u/OqJf6SHK8lyEK1u6InU=; h=From:To:Subject:Date:In-Reply-To:References:From; b=QKiDxvZWLAJMIznAeb4cYBXzCEEcyc4Wi47QIz9BVn8y7aQ1a7OndfpzSy0QDChXz 5uYQKT5ucAgW727o4pwzYoVpsEyxmvbK9T5MX5xNyV1IHzpS42UiTMMPgJ12VNqAcl VwIEV9XdJbunBM+GwXGaedp3lHb1ArVX4FjYWtSc= From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 2/2] lei convert: support reading from v1, v2, and extindex Date: Sat, 30 Sep 2023 00:36:16 +0000 Message-Id: <20230930003616.3224311-3-e@80x24.org> In-Reply-To: <20230930003616.3224311-1-e@80x24.org> References: <20230930003616.3224311-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: We should be able to dump all public-inbox and extindex directories to Maildir/mbox* or IMAP folders. Even unindexed inboxes can be dumped as long as inbox.lock (or ssoma.lock) exists. This change likely works for `lei tag' and other lei_input-using things, as well, but that's untested at the moment. I mainly want to be able to use `lei convert' to benchmark some upcoming changes... 
--- lib/PublicInbox/ExtSearch.pm | 6 ++-- lib/PublicInbox/LeiInput.pm | 70 +++++++++++++++++++++++++++++++----- t/extsearch.t | 24 +++++++++++++ t/lei-convert.t | 40 +++++++++++++++++++++ 4 files changed, 129 insertions(+), 11 deletions(-) diff --git a/lib/PublicInbox/ExtSearch.pm b/lib/PublicInbox/ExtSearch.pm index fa49a1d0..d43c23e6 100644 --- a/lib/PublicInbox/ExtSearch.pm +++ b/lib/PublicInbox/ExtSearch.pm @@ -33,9 +33,11 @@ sub misc { # same as per-inbox ->over, for now... sub over { my ($self) = @_; - $self->{over} //= do { + $self->{over} // eval { PublicInbox::Inbox::_cleanup_later($self); - PublicInbox::Over->new("$self->{xpfx}/over.sqlite3"); + my $over = PublicInbox::Over->new("$self->{xpfx}/over.sqlite3"); + $over->dbh; # may die + $self->{over} = $over; }; } diff --git a/lib/PublicInbox/LeiInput.pm b/lib/PublicInbox/LeiInput.pm index f88c5374..58069b0a 100644 --- a/lib/PublicInbox/LeiInput.pm +++ b/lib/PublicInbox/LeiInput.pm @@ -125,6 +125,51 @@ sub handle_http_input ($$@) { $lei->child_error($?, "@$cmd failed: @err") if @err; } +sub oid2eml { # git->cat_async cb + my ($bref, $oid, $type, $size, $self) = @_; + if ($type eq 'blob') { + $self->input_eml_cb(PublicInbox::Eml->new($bref)); + } else { + warn "W: $oid is type=$type\n"; + } +} + +sub each_ibx_eml_unindexed { + my ($self, $ibx, @args) = @_; + $ibx->isa('PublicInbox::Inbox') or return $self->{lei}->fail(<<EOM); +unindexed extindex $ibx->{topdir} not supported +EOM + require PublicInbox::SearchIdx; + my $n = $ibx->max_git_epoch; + my @g = defined($n) ? 
map { $ibx->git_epoch($_) } (0..$n) : ($ibx->git); + my $sync = { D => {}, ibx => $ibx }; # D => {} filters out deletes + my ($f, $at, $ct, $oid, $cmt); + for my $git (grep defined, @g) { + my $s = PublicInbox::SearchIdx::log2stack($sync, $git, 'HEAD'); + while (($f, $at, $ct, $oid, $cmt) = $s->pop_rec) { + $git->cat_async($oid, \&oid2eml, $self) if $f eq 'm'; + } + $git->cleanup; # wait all + } +} + +sub each_ibx_eml { + my ($self, $ibx, @args) = @_; # TODO: is @args used at all? + my $over = $ibx->over or return each_ibx_eml_unindexed(@_); + my $git = $ibx->git; + my $prev = 0; + my $smsg; + my $ids = $over->ids_after(\$prev); + while (@$ids) { + for (@$ids) { + $smsg = $over->get_art($_) // next; + $git->cat_async($smsg->{blob}, \&oid2eml, $self); + } + $ids = $over->ids_after(\$prev); + } + $git->cat_async_wait; +} + sub input_path_url { my ($self, $input, @args) = @_; my $lei = $self->{lei}; @@ -191,6 +236,12 @@ sub input_path_url { $self->can('input_maildir_cb'), $self, @args); } + } elsif (-d _ && $ifmt =~ /\A(?:v1|v2)\z/) { + my $ibx = PublicInbox::Inbox->new({inboxdir => $input}); + each_ibx_eml($self, $ibx, @args); + } elsif (-d _ && $ifmt eq 'extindex') { + my $esrch = PublicInbox::ExtSearch->new($input); + each_ibx_eml($self, $esrch, @args); } elsif ($self->{missing_ok} && !-e $input) { # don't ->fail if ($lei->{cmd} eq 'p2q') { my $fp = [ qw(git format-patch --stdout -1), $input ]; @@ -308,9 +359,9 @@ sub prepare_inputs { # returns undef on error require PublicInbox::MboxReader; PublicInbox::MboxReader->reads($ifmt) or return $lei->fail("$ifmt not supported"); - } elsif (-d $input_path) { - $ifmt eq 'maildir' or return # TODO v1/v2/ei - $lei->fail("$ifmt not supported"); + } elsif (-d $input_path) { # TODO extindex + $ifmt =~ /\A(?:maildir|v1|v2|extindex)\z/ or + return$lei->fail("$ifmt not supported"); $input = $input_path; add_dir $lei, $istate, $ifmt, \$input; } elsif ($self->{missing_ok} && !-e _) { @@ -350,12 +401,12 @@ $input is `eml', not 
--in-format=$in_fmt push @f, $input; } elsif (-d "$input/new" && -d "$input/cur") { add_dir $lei, $istate, 'maildir', \$input; - } elsif (-e "$input/inbox.lock") { # TODO - $lei->fail('v2 inputs not yet supported (TODO)'); - #add_dir $lei, $istate, 'v2', \$input; - } elsif (-e "$input/ssoma.lock") { # TODO - $lei->fail('v1 inputs not yet supported (TODO)'); - #add_dir $lei, $istate, 'v1', \$input; + } elsif (-e "$input/inbox.lock") { + add_dir $lei, $istate, 'v2', \$input; + } elsif (-e "$input/ssoma.lock") { + add_dir $lei, $istate, 'v1', \$input; + } elsif (-e "$input/ei.lock") { + add_dir $lei, $istate, 'extindex', \$input; } elsif ($self->{missing_ok} && !-e $input) { if ($lei->{cmd} eq 'p2q') { # will run "git format-patch" @@ -401,6 +452,7 @@ $input is `eml', not --in-format=$in_fmt $lei->refresh_watches; } } + require PublicInbox::ExtSearch if $istate->{extindex}; $self->{inputs} = $inputs; } diff --git a/t/extsearch.t b/t/extsearch.t index 8ded3382..19eaf3b5 100644 --- a/t/extsearch.t +++ b/t/extsearch.t @@ -581,4 +581,28 @@ EOM } } +test_lei(sub { + my $d = "$home/extindex"; + lei_ok('convert', '-o', "$home/md1", $d); + lei_ok('convert', '-o', "$home/md2", "extindex:$d"); + my $dst = []; + my $cb = sub { push @$dst, $_[2]->as_string }; + require PublicInbox::MdirReader; + PublicInbox::MdirReader->new->maildir_each_eml("$home/md1", $cb); + my @md1 = sort { $a cmp $b } @$dst; + ok(scalar(@md1), 'dumped messages to md1'); + $dst = []; + PublicInbox::MdirReader->new->maildir_each_eml("$home/md2", $cb); + @$dst = sort { $a cmp $b } @$dst; + is_deeply($dst, \@md1, + "convert from extindex w/ or w/o `extindex' prefix"); + + use autodie qw(unlink); + my @o = glob "$home/extindex/ei*/over.sqlite*"; + unlink(@o); + ok(!lei('convert', '-o', "$home/fail", "extindex:$d")); + like($lei_err, qr/unindexed .*?not supported/, + 'noted unindexed extindex is unsupported'); +}); + done_testing; diff --git a/t/lei-convert.t b/t/lei-convert.t index 115e7ed0..d75110cb 100644 --- 
a/t/lei-convert.t +++ b/t/lei-convert.t @@ -7,6 +7,8 @@ use PublicInbox::MdirReader; use PublicInbox::NetReader; use PublicInbox::Eml; use IO::Uncompress::Gunzip; +use File::Path qw(remove_tree); +use PublicInbox::Spawn qw(which); use autodie qw(open); require_mods(qw(lei -imapd -nntpd Mail::IMAPClient Net::NNTP)); my ($tmpdir, $for_destroy) = tmpdir; @@ -148,5 +150,43 @@ test_lei({ tmpdir => $tmpdir }, sub { }); is_deeply(\@tmp, \@bar, 'read rsyncable-gzipped mboxcl2'); } + my $cp = which('cp') or xbail 'cp(1) not available (WTF?)'; + for my $v (1, 2) { + my $ibx_dir = "$ro_home/t$v"; + lei_ok qw(convert -f mboxrd), $ibx_dir, + \"dump v$v inbox to mboxrd"; + my $out = $lei_out; + lei_ok qw(convert -f mboxrd), "v$v:$ibx_dir", + \"dump v$v inbox to mboxrd w/ v$v:// prefix"; + is $out, $lei_out, "v$v:// prefix accepted"; + open my $fh, '<', \$out; + my (@mb, @md, @md2); + PublicInbox::MboxReader->mboxrd($fh, sub { + $_[0]->header_set('Status'); + push @mb, $_[0]->as_string; + }); + undef $out; + ok(scalar(@mb), 'got messages output'); + my $mdir = "$d/v$v-mdir"; + lei_ok qw(convert -o), $mdir, $ibx_dir, + \"dump v$v inbox to Maildir"; + PublicInbox::MdirReader->new->maildir_each_eml($mdir, sub { + push @md, $_[2]->as_string; + }); + @md = sort { $a cmp $b } @md; + @mb = sort { $a cmp $b } @mb; + is_deeply(\@mb, \@md, 'got matching inboxes'); + xsys_e([$cp, '-Rp', $ibx_dir, "$d/tv$v" ]); + remove_tree($mdir, "$d/tv$v/public-inbox", + glob("$d/tv$v/xap*")); + + lei_ok qw(convert -o), $mdir, "$d/tv$v", + \"dump u indexed v$v inbox to Maildir"; + PublicInbox::MdirReader->new->maildir_each_eml($mdir, sub { + push @md2, $_[2]->as_string; + }); + @md2 = sort { $a cmp $b } @md2; + is_deeply(\@md, \@md2, 'got matching inboxes even unindexed'); + } }); done_testing;