From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id D820B1FA18 for ; Thu, 21 Jan 2021 19:46:24 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 04/12] lei: show {pct} and {oid} in From_ lines and filenames Date: Thu, 21 Jan 2021 19:46:16 +0000 Message-Id: <20210121194624.32002-5-e@80x24.org> In-Reply-To: <20210121194624.32002-1-e@80x24.org> References: <20210121194624.32002-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: From_ lines are shown when mbox* variants are output to stdout, making {oid} and {pct} information visible without the risk of it being propagated to other importer processes, as it could be if it were carried in lei-specific X-* headers. Maildirs already had OIDs in the filename; now they also gain Xapian {pct} in case anybody cares.
--- lib/PublicInbox/LeiOverview.pm | 9 ++--- lib/PublicInbox/LeiToMail.pm | 60 +++++++++++++++++++--------------- t/lei_to_mail.t | 41 +++++++++++++---------- 3 files changed, 61 insertions(+), 49 deletions(-) diff --git a/lib/PublicInbox/LeiOverview.pm b/lib/PublicInbox/LeiOverview.pm index 47d9eb31..7a4fa857 100644 --- a/lib/PublicInbox/LeiOverview.pm +++ b/lib/PublicInbox/LeiOverview.pm @@ -224,8 +224,9 @@ sub ovv_each_smsg_cb { # runs in wq worker usually my $git_dir = $git->{git_dir}; sub { my ($smsg, $mitem) = @_; - $l2m->wq_do('write_mail', \@io, $git_dir, - $smsg->{blob}, $lei_ipc, $smsg->{kw}); + $smsg->{pct} = get_pct($mitem) if $mitem; + $l2m->wq_do('write_mail', \@io, $git_dir, $smsg, + $lei_ipc); } } elsif ($l2m) { my $wcb = $l2m->write_cb($lei); @@ -234,8 +235,8 @@ sub ovv_each_smsg_cb { # runs in wq worker usually my $g2m = $l2m->can('git_to_mail'); sub { my ($smsg, $mitem) = @_; - $git->cat_async($smsg->{blob}, $g2m, - [ $wcb, $smsg->{kw} ]); + $smsg->{pct} = get_pct($mitem) if $mitem; + $git->cat_async($smsg->{blob}, $g2m, [ $wcb, $smsg ]); }; } elsif ($self->{fmt} =~ /\A(concat)?json\z/ && $lei->{opt}->{pretty}) { my $EOR = ($1//'') eq 'concat' ? "\n}" : "\n},"; diff --git a/lib/PublicInbox/LeiToMail.pm b/lib/PublicInbox/LeiToMail.pm index 1be0b09c..3dcce9e7 100644 --- a/lib/PublicInbox/LeiToMail.pm +++ b/lib/PublicInbox/LeiToMail.pm @@ -32,14 +32,14 @@ my %kw2status = ( ); sub _mbox_hdr_buf ($$$) { - my ($eml, $type, $kw) = @_; + my ($eml, $type, $smsg) = @_; $eml->header_set($_) for (qw(Lines Bytes Content-Length)); # Messages are always 'O' (non-\Recent in IMAP), it saves # MUAs the trouble of rewriting the mbox if no other # changes are made my %hdr = (Status => [ 'O' ]); # set Status, X-Status - for my $k (@$kw) { + for my $k (@{$smsg->{kw} // []}) { if (my $ent = $kw2status{$k}) { push @{$hdr{$ent->[0]}}, $ent->[1]; } else { # X-Label? 
@@ -53,9 +53,11 @@ sub _mbox_hdr_buf ($$$) { # fixup old bug from import (pre-a0c07cba0e5d8b6a) $$buf =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s; + my $ident = $smsg->{blob} // 'lei'; + if (defined(my $pct = $smsg->{pct})) { $ident .= "=$pct" } substr($$buf, 0, 0, # prepend From line - "From lei\@$type Thu Jan 1 00:00:00 1970$eml->{crlf}"); + "From $ident\@$type Thu Jan 1 00:00:00 1970$eml->{crlf}"); $buf; } @@ -71,8 +73,8 @@ sub _print_full { } sub eml2mboxrd ($;$) { - my ($eml, $kw) = @_; - my $buf = _mbox_hdr_buf($eml, 'mboxrd', $kw); + my ($eml, $smsg) = @_; + my $buf = _mbox_hdr_buf($eml, 'mboxrd', $smsg); if (my $bdy = delete $eml->{bdy}) { $$bdy =~ s/^(>*From )/>$1/gm; $$buf .= $eml->{crlf}; @@ -84,8 +86,8 @@ sub eml2mboxrd ($;$) { } sub eml2mboxo { - my ($eml, $kw) = @_; - my $buf = _mbox_hdr_buf($eml, 'mboxo', $kw); + my ($eml, $smsg) = @_; + my $buf = _mbox_hdr_buf($eml, 'mboxo', $smsg); if (my $bdy = delete $eml->{bdy}) { $$bdy =~ s/^From />From /gm; $$buf .= $eml->{crlf}; @@ -108,8 +110,8 @@ sub _mboxcl_common ($$$) { # mboxcl still escapes "From " lines sub eml2mboxcl { - my ($eml, $kw) = @_; - my $buf = _mbox_hdr_buf($eml, 'mboxcl', $kw); + my ($eml, $smsg) = @_; + my $buf = _mbox_hdr_buf($eml, 'mboxcl', $smsg); my $crlf = $eml->{crlf}; if (my $bdy = delete $eml->{bdy}) { $$bdy =~ s/^From />From /gm; @@ -121,8 +123,8 @@ sub eml2mboxcl { # mboxcl2 has no "From " escaping sub eml2mboxcl2 { - my ($eml, $kw) = @_; - my $buf = _mbox_hdr_buf($eml, 'mboxcl2', $kw); + my ($eml, $smsg) = @_; + my $buf = _mbox_hdr_buf($eml, 'mboxcl2', $smsg); my $crlf = $eml->{crlf}; if (my $bdy = delete $eml->{bdy}) { _mboxcl_common($buf, $bdy, $crlf); @@ -140,10 +142,11 @@ sub git_to_mail { # git->cat_async callback warn "unexpected type=$type for $oid\n"; } } - if ($size > 0) { - my ($write_cb, $kw) = @$arg; - $write_cb->($bref, $oid, $kw); + my ($write_cb, $smsg) = @$arg; + if ($smsg->{blob} ne $oid) { + die "BUG: expected=$smsg->{blob} got=$oid"; } + $write_cb->($bref, $smsg) if 
$size > 0; } sub reap_compress { # dwaitpid callback @@ -247,11 +250,11 @@ sub _mbox_write_cb ($$) { my $dedupe = $lei->{dedupe}; $dedupe->prepare_dedupe; sub { # for git_to_mail - my ($buf, $oid, $kw) = @_; + my ($buf, $smsg) = @_; return unless $out; my $eml = PublicInbox::Eml->new($buf); - if (!$dedupe->is_dup($eml, $oid)) { - $buf = $eml2mbox->($eml, $kw); + if (!$dedupe->is_dup($eml, $smsg->{blob})) { + $buf = $eml2mbox->($eml, $smsg); my $lk = $ovv->lock_for_scope; eval { $write->($out, $buf) }; if ($@) { @@ -283,12 +286,15 @@ sub _augment_file { # _maildir_each_file cb sub _unlink { unlink($_[0]) } sub _buf2maildir { - my ($dst, $buf, $oid, $kw) = @_; + my ($dst, $buf, $smsg) = @_; + my $kw = $smsg->{kw} // []; my $sfx = join('', sort(map { $kw2char{$_} // () } @$kw)); my $rand = ''; # chosen by die roll :P my ($tmp, $fh, $final); + my $common = $smsg->{blob}; + if (defined(my $pct = $smsg->{pct})) { $common .= "=$pct" } do { - $tmp = $dst.'tmp/'.$rand."oid=$oid"; + $tmp = $dst.'tmp/'.$rand.$common; } while (!sysopen($fh, $tmp, O_CREAT|O_EXCL|O_WRONLY) && $! == EEXIST && ($rand = int(rand 0x7fffffff).',')); if (print $fh $$buf and close($fh)) { @@ -299,14 +305,14 @@ sub _buf2maildir { $dst .= 'cur/'; $rand = ''; do { - $final = $dst.$rand."oid=$oid:2,$sfx"; + $final = $dst.$rand.$common.':2,'.$sfx; } while (!link($tmp, $final) && $! 
== EEXIST && ($rand = int(rand 0x7fffffff).',')); unlink($tmp) or warn "W: failed to unlink $tmp: $!\n"; } else { my $err = $!; unlink($tmp); - die "Error writing $oid to $dst: $err"; + die "Error writing $smsg->{blob} to $dst: $err"; } } @@ -316,12 +322,12 @@ sub _maildir_write_cb ($$) { $dedupe->prepare_dedupe; my $dst = $lei->{ovv}->{dst}; sub { # for git_to_mail - my ($buf, $oid, $kw) = @_; - return _buf2maildir($dst, $buf, $oid, $kw) if !$dedupe; + my ($buf, $smsg) = @_; + return _buf2maildir($dst, $buf, $smsg) if !$dedupe; my $eml = PublicInbox::Eml->new($$buf); # copy buf - return if $dedupe->is_dup($eml, $oid); + return if $dedupe->is_dup($eml, $smsg->{blob}); undef $eml; - _buf2maildir($dst, $buf, $oid, $kw); + _buf2maildir($dst, $buf, $smsg); } } @@ -447,7 +453,7 @@ sub post_augment { # fast (spawn compressor or mkdir), runs in main daemon } sub write_mail { # via ->wq_do - my ($self, $git_dir, $oid, $lei, $kw) = @_; + my ($self, $git_dir, $smsg, $lei) = @_; my $not_done = delete $self->{4}; # write end of {each_smsg_done} my $wcb = $self->{wcb} //= do { # first message my %sig = $lei->atfork_child_wq($self); @@ -456,7 +462,7 @@ sub write_mail { # via ->wq_do $self->write_cb($lei); }; my $git = $self->{"$$\0$git_dir"} //= PublicInbox::Git->new($git_dir); - $git->cat_async($oid, \&git_to_mail, [ $wcb, $kw, $not_done ]); + $git->cat_async($smsg->{blob}, \&git_to_mail, [$wcb, $smsg, $not_done]); } sub ipc_atfork_prepare { diff --git a/t/lei_to_mail.t b/t/lei_to_mail.t index 6673d9a6..47c0e3d4 100644 --- a/t/lei_to_mail.t +++ b/t/lei_to_mail.t @@ -18,11 +18,12 @@ my $noeol = "Subject: x\n\nFrom hell"; my $crlf = $noeol; $crlf =~ s/\n/\r\n/g; my $kw = [qw(seen answered flagged)]; +my $smsg = { kw => $kw, blob => '0'x40 }; my @MBOX = qw(mboxcl2 mboxrd mboxcl mboxo); for my $mbox (@MBOX) { my $m = "eml2$mbox"; my $cb = PublicInbox::LeiToMail->can($m); - my $s = $cb->(PublicInbox::Eml->new($from), $kw); + my $s = $cb->(PublicInbox::Eml->new($from), $smsg); 
is(substr($$s, -1, 1), "\n", "trailing LF in normal $mbox"); my $eml = PublicInbox::Eml->new($s); is($eml->header('Status'), 'OR', "Status: set by $m"); @@ -40,7 +41,7 @@ for my $mbox (@MBOX) { } else { is(scalar(@cl), 0, "$m clobbered Content-Length"); } - $s = $cb->(PublicInbox::Eml->new($noeol), $kw); + $s = $cb->(PublicInbox::Eml->new($noeol), $smsg); is(substr($$s, -1, 1), "\n", "trailing LF added by $m when original lacks EOL"); $eml = PublicInbox::Eml->new($s); @@ -49,7 +50,7 @@ for my $mbox (@MBOX) { } else { is($eml->body_raw, ">From hell\n", "From escaped once by $m"); } - $s = $cb->(PublicInbox::Eml->new($crlf), $kw); + $s = $cb->(PublicInbox::Eml->new($crlf), $smsg); is(substr($$s, -2, 2), "\r\n", "trailing CRLF added $m by original lacks EOL"); $eml = PublicInbox::Eml->new($s); @@ -62,7 +63,7 @@ for my $mbox (@MBOX) { is($eml->header('Content-Length') + length("\r\n"), length($eml->body_raw), "$m Content-Length matches"); } elsif ($mbox eq 'mboxrd') { - $s = $cb->($eml, $kw); + $s = $cb->($eml, $smsg); $eml = PublicInbox::Eml->new($s); is($eml->body_raw, ">>From hell\r\n\r\n", "From escaped again by $m"); @@ -102,11 +103,12 @@ my $wcb_get = sub { $cb; }; +my $deadbeef = { blob => 'deadbeef', kw => [ qw(seen) ] }; my $orig = do { my $wcb = $wcb_get->($mbox, $fn); is(ref $wcb, 'CODE', 'write_cb returned callback'); ok(-f $fn && !-s _, 'empty file created'); - $wcb->(\(my $dup = $buf), 'deadbeef', [ qw(seen) ]); + $wcb->(\(my $dup = $buf), $deadbeef); undef $wcb; open my $fh, '<', $fn or BAIL_OUT $!; my $raw = do { local $/; <$fh> }; @@ -116,7 +118,7 @@ my $orig = do { local $lei->{opt} = { jobs => 2 }; $wcb = $wcb_get->($mbox, $fn); ok(-f $fn && !-s _, 'truncated mbox destination'); - $wcb->(\($dup = $buf), 'deadbeef', [ qw(seen) ]); + $wcb->(\($dup = $buf), $deadbeef); undef $wcb; open $fh, '<', $fn or BAIL_OUT $!; is(do { local $/; <$fh> }, $raw, 'jobs > 1'); @@ -131,7 +133,7 @@ for my $zsfx (qw(gz bz2 xz)) { # XXX should we support zst, zz, lzo, lzma? 
ok($dc_cmd, "decompressor for .$zsfx"); my $f = "$fn.$zsfx"; my $wcb = $wcb_get->($mbox, $f); - $wcb->(\(my $dup = $buf), 'deadbeef', [ qw(seen) ]); + $wcb->(\(my $dup = $buf), $deadbeef); undef $wcb; my $uncompressed = xqx([@$dc_cmd, $f]); is($uncompressed, $orig, "$zsfx works unlocked"); @@ -139,13 +141,13 @@ for my $zsfx (qw(gz bz2 xz)) { # XXX should we support zst, zz, lzo, lzma? local $lei->{opt} = { jobs => 2 }; # for atomic writes unlink $f or BAIL_OUT "unlink $!"; $wcb = $wcb_get->($mbox, $f); - $wcb->(\($dup = $buf), 'deadbeef', [ qw(seen) ]); + $wcb->(\($dup = $buf), $deadbeef); undef $wcb; is(xqx([@$dc_cmd, $f]), $orig, "$zsfx matches with lock"); local $lei->{opt} = { augment => 1 }; $wcb = $wcb_get->($mbox, $f); - $wcb->(\($dup = $buf . "\nx\n"), 'deadbeef', [ qw(seen) ]); + $wcb->(\($dup = $buf . "\nx\n"), $deadbeef); undef $wcb; # commit my $cat = popen_rd([@$dc_cmd, $f]); @@ -157,7 +159,7 @@ for my $zsfx (qw(gz bz2 xz)) { # XXX should we support zst, zz, lzo, lzma? local $lei->{opt} = { augment => 1, jobs => 2 }; $wcb = $wcb_get->($mbox, $f); - $wcb->(\($dup = $buf . "\ny\n"), 'deadbeef', [ qw(seen) ]); + $wcb->(\($dup = $buf . "\ny\n"), $deadbeef); undef $wcb; # commit my @raw3; @@ -179,7 +181,8 @@ my $as_orig = sub { unlink $fn or BAIL_OUT $!; if ('default deduplication uses content_hash') { my $wcb = $wcb_get->('mboxo', $fn); - $wcb->(\(my $x = $buf), 'deadbeef', []) for (1..2); + $deadbeef->{kw} = []; + $wcb->(\(my $x = $buf), $deadbeef) for (1..2); undef $wcb; # undef to commit changes my $cmp = ''; open my $fh, '<', $fn or BAIL_OUT $!; @@ -188,7 +191,7 @@ if ('default deduplication uses content_hash') { local $lei->{opt} = { augment => 1 }; $wcb = $wcb_get->('mboxo', $fn); - $wcb->(\($x = $buf . "\nx\n"), 'deadbeef', []) for (1..2); + $wcb->(\($x = $buf . 
"\nx\n"), $deadbeef) for (1..2); undef $wcb; # undef to commit changes open $fh, '<', $fn or BAIL_OUT $!; my @x; @@ -202,7 +205,7 @@ if ('default deduplication uses content_hash') { open my $tmp, '+>', undef or BAIL_OUT $!; local $lei->{1} = $tmp; my $wcb = $wcb_get->('mboxrd', '/dev/stdout'); - $wcb->(\(my $x = $buf), 'deadbeef', []); + $wcb->(\(my $x = $buf), $deadbeef); undef $wcb; # commit seek($tmp, 0, SEEK_SET) or BAIL_OUT $!; my $cmp = ''; @@ -216,7 +219,7 @@ SKIP: { # FIFO support mkfifo($fn, 0600) or skip("mkfifo not supported: $!", 1); my $cat = popen_rd([which('cat'), $fn]); my $wcb = $wcb_get->('mboxo', $fn); - $wcb->(\(my $x = $buf), 'deadbeef', []); + $wcb->(\(my $x = $buf), $deadbeef); undef $wcb; # commit my $cmp = ''; PublicInbox::MboxReader->mboxo($cat, sub { $cmp .= $as_orig->(@_) }); @@ -227,7 +230,8 @@ SKIP: { # FIFO support my $md = "$tmpdir/maildir/"; my $wcb = $wcb_get->('maildir', $md); is(ref($wcb), 'CODE', 'got Maildir callback'); - $wcb->(\(my $x = $buf), 'badc0ffee', []); + my $b4dc0ffee = { blob => 'badc0ffee', kw => [] }; + $wcb->(\(my $x = $buf), $b4dc0ffee); my @f; PublicInbox::LeiToMail::_maildir_each_file($md, sub { push @f, shift }); @@ -235,7 +239,8 @@ SKIP: { # FIFO support is(do { local $/; <$fh> }, $buf, 'wrote to Maildir'); $wcb = $wcb_get->('maildir', $md); - $wcb->(\($x = $buf."\nx\n"), 'deadcafe', []); + my $deadcafe = { blob => 'deadcafe', kw => [] }; + $wcb->(\($x = $buf."\nx\n"), $deadcafe); my @x = (); PublicInbox::LeiToMail::_maildir_each_file($md, sub { push @x, shift }); @@ -246,8 +251,8 @@ SKIP: { # FIFO support local $lei->{opt}->{augment} = 1; $wcb = $wcb_get->('maildir', $md); - $wcb->(\($x = $buf."\ny\n"), 'deadcafe', []); - $wcb->(\($x = $buf."\ny\n"), 'b4dc0ffee', []); # skipped by dedupe + $wcb->(\($x = $buf."\ny\n"), $deadcafe); + $wcb->(\($x = $buf."\ny\n"), $b4dc0ffee); # skipped by dedupe @f = (); PublicInbox::LeiToMail::_maildir_each_file($md, sub { push @f, shift }); is(scalar grep(/\A\Q$x[0]\E\z/, 
@f), 1, 'old file still there');