From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 60EB51FA18 for ; Sun, 3 Jan 2021 02:06:18 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 5/7] use Eml (or MIME) objects for all indexing paths Date: Sun, 3 Jan 2021 02:06:15 +0000 Message-Id: <20210103020617.15719-6-e@80x24.org> In-Reply-To: <20210103020617.15719-1-e@80x24.org> References: <20210103020617.15719-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: We don't need to be keeping the raw message around after it hits git. Shard work now relies on Storable (or Sereal) and all of the indexing code relies on the Email::MIME-like API of Eml to access interesting parts of the message. Similarly, smsg->{raw_bytes} is no longer carried around and we do the CRLF adjustment when setting smsg->{bytes}. There's also a small simplification to t/import.t while we're in the area to use xqx instead of spawn/popen_rd. --- lib/PublicInbox/ExtSearchIdx.pm | 11 ++++------- lib/PublicInbox/Import.pm | 4 +--- lib/PublicInbox/LeiStore.pm | 4 ---- lib/PublicInbox/SearchIdx.pm | 17 +++-------------- lib/PublicInbox/Smsg.pm | 13 +++++++++++++ lib/PublicInbox/V2Writable.pm | 23 ++++++++++------------- t/import.t | 12 ++---------- t/search.t | 2 +- 8 files changed, 34 insertions(+), 52 deletions(-) diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm index d55d3db9..e6c21866 100644 --- a/lib/PublicInbox/ExtSearchIdx.pm +++ b/lib/PublicInbox/ExtSearchIdx.pm @@ -21,8 +21,7 @@ use Carp qw(croak carp); use Sys::Hostname qw(hostname); use POSIX qw(strftime); use PublicInbox::Search; -use PublicInbox::SearchIdx qw(crlf_adjust prepare_stack is_ancestor - is_bad_blob); +use PublicInbox::SearchIdx qw(prepare_stack is_ancestor is_bad_blob); use PublicInbox::OverIdx; use PublicInbox::MiscIdx; use PublicInbox::MID qw(mids); @@ -82,8 +81,6 @@ sub check_batch_limit ($) { my ($req) = @_; my $self = $req->{self}; my $new_smsg = $req->{new_smsg}; - - # {raw_bytes} may be unset, so just use {bytes} my $n = $self->{transact_bytes} += $new_smsg->{bytes}; # set flag for PublicInbox::V2Writable::index_todo: @@ -239,7 +236,7 @@ sub index_oid { # git->cat_async callback for 'm' my $new_smsg = $req->{new_smsg} = bless { blob => $oid, }, 'PublicInbox::Smsg'; - $new_smsg->{bytes} = $size + crlf_adjust($$bref); + $new_smsg->set_bytes($$bref, $size); defined($req->{xnum} = cur_ibx_xnum($req, $bref)) or return; ++${$req->{nr}}; do_step($req); @@ -496,7 +493,7 @@ sub _reindex_oid { # git->cat_async callback my $ci = $self->{current_info}; local $self->{current_info} = "$ci #$docid $oid"; my $re_smsg = bless { blob => $oid }, 'PublicInbox::Smsg'; - $re_smsg->{bytes} = $size + crlf_adjust($$bref); + $re_smsg->set_bytes($$bref, $size); my $eml = PublicInbox::Eml->new($bref); $re_smsg->populate($eml, { autime => $orig_smsg->{ds}, cotime => $orig_smsg->{ts} }); @@ -676,7 +673,7 @@ sub _reindex_unseen { # git->cat_async callback my $self = $req->{self} // die 'BUG: {self} unset'; local $self->{current_info} = "$self->{current_info} $oid"; my $new_smsg = bless { blob => $oid, }, 'PublicInbox::Smsg'; - $new_smsg->{bytes} = $size + crlf_adjust($$bref); + $new_smsg->set_bytes($$bref, $size); my $eml = $req->{eml} = PublicInbox::Eml->new($bref); $req->{new_smsg} = $new_smsg; $req->{chash} = content_hash($eml); diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm index 052b145b..8a06a661 100644 --- a/lib/PublicInbox/Import.pm +++ b/lib/PublicInbox/Import.pm @@ -412,12 +412,10 @@ sub add { # v2: we need this for Xapian if ($smsg) { $smsg->{blob} = $self->get_mark(":$blob"); - $smsg->{raw_bytes} = $n; + $smsg->set_bytes($raw_email, $n); if (my $oidx = delete $smsg->{-oidx}) { # used by LeiStore return if $oidx->blob_exists($smsg->{blob}); } - # XXX do we need this? it's in git at this point - $smsg->{-raw_email} = \$raw_email; } my $ref = $self->{ref}; my $commit = $self->{mark}++; diff --git a/lib/PublicInbox/LeiStore.pm b/lib/PublicInbox/LeiStore.pm index 4f77e8fa..7cda7e44 100644 --- a/lib/PublicInbox/LeiStore.pm +++ b/lib/PublicInbox/LeiStore.pm @@ -10,7 +10,6 @@ package PublicInbox::LeiStore; use strict; use v5.10.1; use parent qw(PublicInbox::Lock PublicInbox::IPC); -use PublicInbox::SearchIdx qw(crlf_adjust); use PublicInbox::ExtSearchIdx; use PublicInbox::Import; use PublicInbox::InboxWritable; @@ -197,9 +196,6 @@ sub add_eml { my $smsg = bless { -oidx => $oidx }, 'PublicInbox::Smsg'; my $im = $self->importer; $im->add($eml, undef, $smsg) or return; # duplicate returns undef - my $msgref = delete $smsg->{-raw_email}; - $smsg->{bytes} = $smsg->{raw_bytes} + crlf_adjust($$msgref); - undef $msgref; local $self->{current_info} = $smsg->{blob}; if (my @docids = _docids_for($self, $eml)) { diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index da3ac2e3..a7005051 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -22,7 +22,7 @@ use PublicInbox::OverIdx; use PublicInbox::Spawn qw(spawn nodatacow_dir); use PublicInbox::Git qw(git_unquote); use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp); -our @EXPORT_OK = qw(crlf_adjust log2stack is_ancestor check_size prepare_stack +our @EXPORT_OK = qw(log2stack is_ancestor check_size prepare_stack index_text term_generator add_val is_bad_blob); my $X = \%PublicInbox::Search::X; our ($DB_CREATE_OR_OPEN, $DB_OPEN); @@ -613,17 +613,6 @@ sub index_mm { } } -# returns the number of bytes to add if given a non-CRLF arg -sub crlf_adjust ($) { - if (index($_[0], "\r\n") < 0) { - # common case is LF-only, every \n needs an \r; - # so favor a cheap tr// over an expensive m//g - $_[0] =~ tr/\n/\n/; - } else { # count number of '\n' w/o '\r', expensive: - scalar(my @n = ($_[0] =~ m/(?cat_async callback my ($nr, $max) = @$sync{qw(nr max)}; ++$$nr; $$max -= $size; - $size += crlf_adjust($$bref); - my $smsg = bless { bytes => $size, blob => $oid }, 'PublicInbox::Smsg'; + my $smsg = bless { blob => $oid }, 'PublicInbox::Smsg'; + $smsg->set_bytes($$bref, $size); my $self = $sync->{sidx}; local $self->{current_info} = "$self->{current_info}: $oid"; my $eml = PublicInbox::Eml->new($bref); diff --git a/lib/PublicInbox/Smsg.pm b/lib/PublicInbox/Smsg.pm index 571cbb6f..c6ff7f52 100644 --- a/lib/PublicInbox/Smsg.pm +++ b/lib/PublicInbox/Smsg.pm @@ -135,4 +135,17 @@ sub subject_normalized ($) { $subj; } +# returns the number of bytes to add if given a non-CRLF arg +sub crlf_adjust ($) { + if (index($_[0], "\r\n") < 0) { + # common case is LF-only, every \n needs an \r; + # so favor a cheap tr// over an expensive m//g + $_[0] =~ tr/\n/\n/; + } else { # count number of '\n' w/o '\r', expensive: + scalar(my @n = ($_[0] =~ m/(?{bytes} = $_[2] + crlf_adjust($_[1]) } + 1; diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index 7b6b93a0..c4efbdd2 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -17,8 +17,7 @@ use PublicInbox::InboxWritable; use PublicInbox::OverIdx; use PublicInbox::Msgmap; use PublicInbox::Spawn qw(spawn popen_rd run_die); -use PublicInbox::SearchIdx qw(log2stack crlf_adjust is_ancestor check_size - is_bad_blob); +use PublicInbox::SearchIdx qw(log2stack is_ancestor check_size is_bad_blob); use IO::Handle; # ->autoflush use File::Temp (); @@ -139,13 +138,12 @@ sub idx_shard ($$) { } # indexes a message, returns true if checkpointing is needed -sub do_idx ($$$$) { - my ($self, $msgref, $eml, $smsg) = @_; - $smsg->{bytes} = $smsg->{raw_bytes} + crlf_adjust($$msgref); +sub do_idx ($$$) { + my ($self, $eml, $smsg) = @_; $self->{oidx}->add_overview($eml, $smsg); my $idx = idx_shard($self, $smsg->{num}); $idx->index_eml($eml, $smsg); - my $n = $self->{transact_bytes} += $smsg->{raw_bytes}; + my $n = $self->{transact_bytes} += $smsg->{bytes}; $n >= $self->{batch_bytes}; } @@ -173,7 +171,7 @@ sub _add { $cmt = $im->get_mark($cmt); $self->{last_commit}->[$self->{epoch_max}] = $cmt; - if (do_idx($self, delete $smsg->{-raw_email}, $mime, $smsg)) { + if (do_idx($self, $mime, $smsg)) { $self->checkpoint; } @@ -536,13 +534,13 @@ W: $list for my $smsg (@$need_reindex) { my $new_smsg = bless { blob => $blob, - raw_bytes => $bytes, num => $smsg->{num}, mid => $smsg->{mid}, }, 'PublicInbox::Smsg'; my $sync = { autime => $smsg->{ds}, cotime => $smsg->{ts} }; $new_smsg->populate($new_mime, $sync); - do_idx($self, \$raw, $new_mime, $new_smsg); + $new_smsg->set_bytes($raw, $bytes); + do_idx($self, $new_mime, $new_smsg); } $rewritten->{rewrites}; } @@ -937,13 +935,13 @@ sub index_oid { # cat_async callback } ++${$arg->{nr}}; my $smsg = bless { - raw_bytes => $size, num => $num, blob => $oid, mid => $mid0, }, 'PublicInbox::Smsg'; $smsg->populate($eml, $arg); - if (do_idx($self, $bref, $eml, $smsg)) { + $smsg->set_bytes($$bref, $size); + if (do_idx($self, $eml, $smsg)) { ${$arg->{need_checkpoint}} = 1; } index_finalize($arg, 1); @@ -1217,9 +1215,8 @@ sub index_xap_only { # git->cat_async callback my ($bref, $oid, $type, $size, $smsg) = @_; my $self = $smsg->{self}; my $idx = idx_shard($self, $smsg->{num}); - $smsg->{raw_bytes} = $size; $idx->index_eml(PublicInbox::Eml->new($bref), $smsg); - $self->{transact_bytes} += $size; + $self->{transact_bytes} += $smsg->{bytes}; } sub index_xap_step ($$$;$) { diff --git a/t/import.t b/t/import.t index 855b5d7c..ae76858b 100644 --- a/t/import.t +++ b/t/import.t @@ -7,7 +7,6 @@ use PublicInbox::Eml; use PublicInbox::Smsg; use PublicInbox::Git; use PublicInbox::Import; -use PublicInbox::Spawn qw(spawn); use Fcntl qw(:DEFAULT SEEK_SET); use PublicInbox::TestCommon; use MIME::Base64 3.05; # Perl 5.10.0 / 5.9.2 @@ -32,20 +31,13 @@ like($im->add($mime, undef, $smsg), qr/\A:[0-9]+\z/, 'added one message'); if ($v2) { like($smsg->{blob}, qr/\A[a-f0-9]{40}\z/, 'got last object_id'); - my $raw_email = $smsg->{-raw_email}; - is($mime->as_string, $$raw_email, 'string matches'); - is($smsg->{raw_bytes}, length($$raw_email), 'length matches'); my @cmd = ('git', "--git-dir=$git->{git_dir}", qw(hash-object --stdin)); open my $in, '+<', undef or BAIL_OUT "open(+<): $!"; print $in $mime->as_string or die "write failed: $!"; $in->flush or die "flush failed: $!"; - seek($in, 0, SEEK_SET); - open my $out, '+<', undef or BAIL_OUT "open(+<): $!"; - my $pid = spawn(\@cmd, {}, { 0 => $in, 1 => $out }); - is(waitpid($pid, 0), $pid, 'waitpid succeeds on hash-object'); + seek($in, 0, SEEK_SET) or die "seek: $!"; + chomp(my $hashed_obj = xqx(\@cmd, undef, { 0 => $in })); is($?, 0, 'hash-object'); - seek($out, 0, SEEK_SET); - chomp(my $hashed_obj = <$out>); is($hashed_obj, $smsg->{blob}, "blob object_id matches exp"); } diff --git a/t/search.t b/t/search.t index 7495233e..b2958c00 100644 --- a/t/search.t +++ b/t/search.t @@ -60,7 +60,7 @@ sub oct_is ($$$) { } { - my $crlf_adjust = \&PublicInbox::SearchIdx::crlf_adjust; + my $crlf_adjust = \&PublicInbox::Smsg::crlf_adjust; is($crlf_adjust->("hi\r\nworld\r\n"), 0, 'no adjustment needed'); is($crlf_adjust->("hi\nworld\n"), 2, 'LF-only counts two CR'); is($crlf_adjust->("hi\r\nworld\n"), 1, 'CRLF/LF-mix 1 counts 1 CR');