From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 5AD001FBCA for ; Mon, 1 Jun 2020 10:06:58 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 05/13] smsg: introduce ->populate method Date: Mon, 1 Jun 2020 10:06:49 +0000 Message-Id: <20200601100657.14700-6-e@yhbt.net> In-Reply-To: <20200601100657.14700-1-e@yhbt.net> References: <20200601100657.14700-1-e@yhbt.net> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: This will eventually replace the __hdr() calling methods and eradicate {mime} usage from Smsg. For now, we can eliminate PublicInbox::Smsg->new since most callers already rely on an open `bless' to avoid the old {mime} arg. --- lib/PublicInbox/Import.pm | 40 ++++++++++++++++---------------- lib/PublicInbox/SearchIdx.pm | 31 +++++++++++-------------- lib/PublicInbox/Smsg.pm | 43 +++++++++++++++++++++++++++-------- lib/PublicInbox/V2Writable.pm | 9 ++++---- t/import.t | 3 ++- 5 files changed, 73 insertions(+), 53 deletions(-) diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm index 1a7ed9ce878..ab75aa00dc2 100644 --- a/lib/PublicInbox/Import.pm +++ b/lib/PublicInbox/Import.pm @@ -12,7 +12,8 @@ use v5.10.1; use PublicInbox::Spawn qw(spawn popen_rd); use PublicInbox::MID qw(mids mid2path); use PublicInbox::Address; -use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp); +use PublicInbox::Smsg; +use PublicInbox::MsgTime qw(msg_datestamp); use PublicInbox::ContentHash qw(content_digest); use PublicInbox::MDA; use PublicInbox::Eml; @@ -269,8 +270,8 @@ sub remove { (($self->{tip} = ":$commit"), $cur); } -sub git_timestamp { - my ($ts, $zone) = @_; +sub git_timestamp ($) { + my ($ts, $zone) = @{$_[0]}; $ts = 0 if $ts < 0; # git uses unsigned times "$ts $zone"; } @@ -278,10 +279,13 @@ sub git_timestamp { sub extract_cmt_info ($;$) { my ($mime, $smsg) = @_; # $mime is PublicInbox::Eml, but remains Email::MIME-compatible + $smsg //= bless {}, 'PublicInbox::Smsg'; - my $sender = ''; my $hdr = $mime->header_obj; - my $from = $hdr->header('From') // ''; + $smsg->populate($hdr); + + my $sender = ''; + my $from = delete($smsg->{From}) // ''; my ($email) = PublicInbox::Address::emails($from); my ($name) = PublicInbox::Address::names($from); if (!defined($name) || !defined($email)) { @@ -313,17 +317,11 @@ sub extract_cmt_info ($;$) { warn "no name in From: $from or Sender: $sender\n"; } - my $subject = $hdr->header('Subject') // '(no subject)'; - # MIME decoding can create nulls replace them with spaces to protect git - $subject =~ tr/\0/ /; + my $subject = delete($smsg->{Subject}) // '(no subject)'; utf8::encode($subject); - my $at = git_timestamp(my @at = msg_datestamp($hdr)); - my $ct = git_timestamp(my @ct = msg_timestamp($hdr)); - if ($smsg) { - $smsg->{ds} = $at[0]; - $smsg->{ts} = $ct[0]; - } - ($name, $email, $at, $ct, $subject); + my $at = git_timestamp(delete $smsg->{-ds}); + my $ct = git_timestamp(delete $smsg->{-ts}); + ("$name <$email>", $at, $ct, $subject); } # kill potentially confusing/misleading headers @@ -370,7 +368,7 @@ sub clean_tree_v2 ($$$) { sub add { my ($self, $mime, $check_cb, $smsg) = @_; - my ($name, $email, $at, $ct, $subject) = extract_cmt_info($mime, $smsg); + my ($author, $at, $ct, $subject) = extract_cmt_info($mime, $smsg); my $path_type = $self->{path_type}; my $path; if ($path_type eq '2/38') { @@ -414,7 +412,7 @@ sub add { } print $w "commit $ref\nmark :$commit\n", - "author $name <$email> $at\n", + "author $author $at\n", "committer $self->{ident} $ct\n" or wfail; print $w "data ", (length($subject) + 1), "\n", $subject, "\n\n" or wfail; @@ -502,11 +500,11 @@ sub digest2mid ($$) { sub rewrite_commit ($$$$) { my ($self, $oids, $buf, $mime) = @_; - my ($name, $email, $at, $ct, $subject); + my ($author, $at, $ct, $subject); if ($mime) { - ($name, $email, $at, $ct, $subject) = extract_cmt_info($mime); + ($author, $at, $ct, $subject) = extract_cmt_info($mime); } else { - $name = $email = ''; + $author = '<>'; $subject = 'purged '.join(' ', @$oids); } @$oids = (); @@ -515,7 +513,7 @@ sub rewrite_commit ($$$$) { my $l = $buf->[$i]; if ($l =~ /^author .* ([0-9]+ [\+-]?[0-9]+)$/) { $at //= $1; - $buf->[$i] = "author $name <$email> $at\n"; + $buf->[$i] = "author $author $at\n"; } elsif ($l =~ /^committer .* ([0-9]+ [\+-]?[0-9]+)$/) { $ct //= $1; $buf->[$i] = "committer $self->{ident} $ct\n"; diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index b4088933dbf..eb228e6bba7 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -156,16 +156,14 @@ sub index_text ($$$$) { } } -sub index_users ($$) { +sub index_headers ($$) { my ($self, $smsg) = @_; - - my $from = $smsg->from; - my $to = $smsg->to; - my $cc = $smsg->cc; - - index_text($self, $from, 1, 'A'); # A - author - index_text($self, $to, 1, 'XTO') if $to ne ''; - index_text($self, $cc, 1, 'XCC') if $cc ne ''; + my @x = (from => 'A', # Author + subject => 'S', to => 'XTO', cc => 'XCC'); + while (my ($field, $pfx) = splice(@x, 0, 2)) { + my $val = $smsg->{$field}; + index_text($self, $val, 1, $pfx) if $val ne ''; + } } sub index_diff_inc ($$$$) { @@ -285,9 +283,9 @@ sub index_xapian { # msg_iter callback if ($part->{is_submsg}) { my $mids = mids_for_index($part); index_ids($self, $doc, $part, $mids); - my $smsg = PublicInbox::Smsg->new($part); - index_users($self, $smsg); - index_text($self, $smsg->subject, 1, 'S') if $smsg->subject; + my $smsg = bless {}, 'PublicInbox::Smsg'; + $smsg->populate($part); + index_headers($self, $smsg); } my ($s, undef) = msg_part_text($part, $ct); @@ -335,10 +333,8 @@ sub index_ids ($$$$) { sub add_xapian ($$$$) { my ($self, $mime, $smsg, $mids) = @_; - $smsg->{mime} = $mime; # XXX dangerous my $hdr = $mime->header_obj; my $doc = $X->{Document}->new; - my $subj = $smsg->subject; add_val($doc, PublicInbox::Search::TS(), $smsg->{ts}); my @ds = gmtime($smsg->{ds}); my $yyyymmdd = strftime('%Y%m%d', @ds); @@ -348,8 +344,7 @@ sub add_xapian ($$$$) { my $tg = term_generator($self); $tg->set_document($doc); - index_text($self, $subj, 1, 'S') if $subj; - index_users($self, $smsg); + index_headers($self, $smsg); msg_iter($mime, \&index_xapian, [ $self, $doc ]); index_ids($self, $doc, $hdr, $mids); @@ -392,8 +387,7 @@ sub add_message { }; # v1 and tests only: - $smsg->{ds} //= msg_datestamp($hdr, $self->{autime}); - $smsg->{ts} //= msg_timestamp($hdr, $self->{cotime}); + $smsg->populate($hdr, $self); eval { # order matters, overview stores every possible piece of @@ -649,6 +643,7 @@ sub read_log { my $mime = do_cat_mail($git, $blob, \$bytes); $del_cb->($self, $mime); } + delete @$self{qw(autime cotime)}; $batch_cb->($nr, $latest, $newest); } diff --git a/lib/PublicInbox/Smsg.pm b/lib/PublicInbox/Smsg.pm index 7a2766d8ff8..8e2771274a1 100644 --- a/lib/PublicInbox/Smsg.pm +++ b/lib/PublicInbox/Smsg.pm @@ -17,11 +17,6 @@ use PublicInbox::Address; use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp); use Time::Local qw(timegm); -sub new { - my ($class, $mime) = @_; - bless { mime => $mime }, $class; -} - sub wrap { my ($class, $mid) = @_; bless { mid => $mid }, $class; @@ -36,11 +31,11 @@ sub get_val ($$) { sub to_doc_data { my ($self) = @_; join("\n", - $self->subject, - $self->from, + $self->{subject}, + $self->{from}, $self->references, - $self->to, - $self->cc, + $self->{to}, + $self->{cc}, $self->{blob}, $self->{mid}, $self->{bytes} // '', @@ -115,6 +110,36 @@ sub __hdr ($$) { }; } +# for Import and v1 WWW code paths +sub populate { + my ($self, $hdr, $v2w) = @_; + for my $f (qw(From To Cc Subject)) { + my @all = $hdr->header($f); + my $val = join(', ', @all); + $val =~ tr/\r//d; + # MIME decoding can create NULs, replace them with spaces + # to protect git and NNTP clients + $val =~ tr/\0\t\n/ /; + + # lower-case fields for read-only stuff + $self->{lc($f)} = $val; + + # Capitalized From/Subject for git-fast-import + next if $f eq 'To' || $f eq 'Cc'; + if (scalar(@all) > 1) { + $val = $all[0]; + $val =~ tr/\r//d; + $val =~ tr/\0\t\n/ /; + } + $self->{$f} = $val if $val ne ''; + } + $v2w //= {}; + $self->{-ds} = [ my @ds = msg_datestamp($hdr, $v2w->{autime}) ]; + $self->{-ts} = [ my @ts = msg_timestamp($hdr, $v2w->{cotime}) ]; + $self->{ds} //= $ds[0]; # no zone + $self->{ts} //= $ts[0]; +} + sub subject ($) { __hdr($_[0], 'Subject') } sub to ($) { __hdr($_[0], 'To') } sub cc ($) { __hdr($_[0], 'Cc') } diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index 1a824531f3c..79bee7f9f3d 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -19,7 +19,6 @@ use PublicInbox::OverIdx; use PublicInbox::Msgmap; use PublicInbox::Spawn qw(spawn popen_rd); use PublicInbox::SearchIdx; -use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp); use PublicInbox::MultiMidQueue; use IO::Handle; # ->autoflush use File::Temp qw(tempfile); @@ -156,8 +155,6 @@ sub add { # indexes a message, returns true if checkpointing is needed sub do_idx ($$$$) { my ($self, $msgref, $mime, $smsg) = @_; - $smsg->{ds} //= msg_datestamp($mime->header_obj, $self->{autime}); - $smsg->{ts} //= msg_timestamp($mime->header_obj, $self->{cotime}); $self->{over}->add_overview($mime, $smsg); my $idx = idx_shard($self, $smsg->{num} % $self->{shards}); $idx->index_raw($msgref, $mime, $smsg); @@ -575,6 +572,8 @@ W: $list num => $smsg->{num}, mid => $smsg->{mid}, }, 'PublicInbox::Smsg'; + my $v2w = { autime => $smsg->{ds}, cotime => $smsg->{ts} }; + $new_smsg->populate($new_mime, $v2w); do_idx($self, \$raw, $new_mime, $new_smsg); } $rewritten->{rewrites}; @@ -968,6 +967,7 @@ sub reindex_oid_m ($$$$;$) { blob => $oid, mid => $mid0, }, 'PublicInbox::Smsg'; + $smsg->populate($mime, $self); if (do_idx($self, $msgref, $mime, $smsg)) { reindex_checkpoint($self, $sync, $git); } @@ -1059,6 +1059,7 @@ sub reindex_oid ($$$$) { blob => $oid, mid => $mid0, }, 'PublicInbox::Smsg'; + $smsg->populate($mime, $self); if (do_idx($self, $msgref, $mime, $smsg)) { reindex_checkpoint($self, $sync, $git); } @@ -1298,7 +1299,7 @@ sub index_epoch ($$$) { } } close $fh or die "git log failed: \$?=$?"; - delete $self->{reindex_pipe}; + delete @$self{qw(reindex_pipe autime cotime)}; update_last_commit($self, $git, $i, $cmt) if defined $cmt; } diff --git a/t/import.t b/t/import.t index 3f308299148..f987b1141f7 100644 --- a/t/import.t +++ b/t/import.t @@ -4,6 +4,7 @@ use strict; use warnings; use Test::More; use PublicInbox::Eml; +use PublicInbox::Smsg; use PublicInbox::Git; use PublicInbox::Import; use PublicInbox::Spawn qw(spawn); @@ -26,7 +27,7 @@ hello world EOF my $v2 = require_git(2.6, 1); -my $smsg = {} if $v2; +my $smsg = bless {}, 'PublicInbox::Smsg' if $v2; like($im->add($mime, undef, $smsg), qr/\A:[0-9]+\z/, 'added one message'); if ($v2) {