From: Eric Wong <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH 5/7] use Eml (or MIME) objects for all indexing paths
Date: Sun, 3 Jan 2021 02:06:15 +0000 [thread overview]
Message-ID: <20210103020617.15719-6-e@80x24.org> (raw)
In-Reply-To: <20210103020617.15719-1-e@80x24.org>
We don't need to be keeping the raw message around after it hits
git. Shard work now relies on Storable (or Sereal) and all of
the indexing code relies on the Email::MIME-like API of Eml to
access interesting parts of the message.
Similarly, smsg->{raw_bytes} is no longer carried around and we
do the CRLF adjustment when setting smsg->{bytes}.
There's also a small simplification to t/import.t while
we're in the area to use xqx instead of spawn/popen_rd.
---
lib/PublicInbox/ExtSearchIdx.pm | 11 ++++-------
lib/PublicInbox/Import.pm | 4 +---
lib/PublicInbox/LeiStore.pm | 4 ----
lib/PublicInbox/SearchIdx.pm | 17 +++--------------
lib/PublicInbox/Smsg.pm | 13 +++++++++++++
lib/PublicInbox/V2Writable.pm | 23 ++++++++++-------------
t/import.t | 12 ++----------
t/search.t | 2 +-
8 files changed, 34 insertions(+), 52 deletions(-)
diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm
index d55d3db9..e6c21866 100644
--- a/lib/PublicInbox/ExtSearchIdx.pm
+++ b/lib/PublicInbox/ExtSearchIdx.pm
@@ -21,8 +21,7 @@ use Carp qw(croak carp);
use Sys::Hostname qw(hostname);
use POSIX qw(strftime);
use PublicInbox::Search;
-use PublicInbox::SearchIdx qw(crlf_adjust prepare_stack is_ancestor
- is_bad_blob);
+use PublicInbox::SearchIdx qw(prepare_stack is_ancestor is_bad_blob);
use PublicInbox::OverIdx;
use PublicInbox::MiscIdx;
use PublicInbox::MID qw(mids);
@@ -82,8 +81,6 @@ sub check_batch_limit ($) {
my ($req) = @_;
my $self = $req->{self};
my $new_smsg = $req->{new_smsg};
-
- # {raw_bytes} may be unset, so just use {bytes}
my $n = $self->{transact_bytes} += $new_smsg->{bytes};
# set flag for PublicInbox::V2Writable::index_todo:
@@ -239,7 +236,7 @@ sub index_oid { # git->cat_async callback for 'm'
my $new_smsg = $req->{new_smsg} = bless {
blob => $oid,
}, 'PublicInbox::Smsg';
- $new_smsg->{bytes} = $size + crlf_adjust($$bref);
+ $new_smsg->set_bytes($$bref, $size);
defined($req->{xnum} = cur_ibx_xnum($req, $bref)) or return;
++${$req->{nr}};
do_step($req);
@@ -496,7 +493,7 @@ sub _reindex_oid { # git->cat_async callback
my $ci = $self->{current_info};
local $self->{current_info} = "$ci #$docid $oid";
my $re_smsg = bless { blob => $oid }, 'PublicInbox::Smsg';
- $re_smsg->{bytes} = $size + crlf_adjust($$bref);
+ $re_smsg->set_bytes($$bref, $size);
my $eml = PublicInbox::Eml->new($bref);
$re_smsg->populate($eml, { autime => $orig_smsg->{ds},
cotime => $orig_smsg->{ts} });
@@ -676,7 +673,7 @@ sub _reindex_unseen { # git->cat_async callback
my $self = $req->{self} // die 'BUG: {self} unset';
local $self->{current_info} = "$self->{current_info} $oid";
my $new_smsg = bless { blob => $oid, }, 'PublicInbox::Smsg';
- $new_smsg->{bytes} = $size + crlf_adjust($$bref);
+ $new_smsg->set_bytes($$bref, $size);
my $eml = $req->{eml} = PublicInbox::Eml->new($bref);
$req->{new_smsg} = $new_smsg;
$req->{chash} = content_hash($eml);
diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm
index 052b145b..8a06a661 100644
--- a/lib/PublicInbox/Import.pm
+++ b/lib/PublicInbox/Import.pm
@@ -412,12 +412,10 @@ sub add {
# v2: we need this for Xapian
if ($smsg) {
$smsg->{blob} = $self->get_mark(":$blob");
- $smsg->{raw_bytes} = $n;
+ $smsg->set_bytes($raw_email, $n);
if (my $oidx = delete $smsg->{-oidx}) { # used by LeiStore
return if $oidx->blob_exists($smsg->{blob});
}
- # XXX do we need this? it's in git at this point
- $smsg->{-raw_email} = \$raw_email;
}
my $ref = $self->{ref};
my $commit = $self->{mark}++;
diff --git a/lib/PublicInbox/LeiStore.pm b/lib/PublicInbox/LeiStore.pm
index 4f77e8fa..7cda7e44 100644
--- a/lib/PublicInbox/LeiStore.pm
+++ b/lib/PublicInbox/LeiStore.pm
@@ -10,7 +10,6 @@ package PublicInbox::LeiStore;
use strict;
use v5.10.1;
use parent qw(PublicInbox::Lock PublicInbox::IPC);
-use PublicInbox::SearchIdx qw(crlf_adjust);
use PublicInbox::ExtSearchIdx;
use PublicInbox::Import;
use PublicInbox::InboxWritable;
@@ -197,9 +196,6 @@ sub add_eml {
my $smsg = bless { -oidx => $oidx }, 'PublicInbox::Smsg';
my $im = $self->importer;
$im->add($eml, undef, $smsg) or return; # duplicate returns undef
- my $msgref = delete $smsg->{-raw_email};
- $smsg->{bytes} = $smsg->{raw_bytes} + crlf_adjust($$msgref);
- undef $msgref;
local $self->{current_info} = $smsg->{blob};
if (my @docids = _docids_for($self, $eml)) {
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index da3ac2e3..a7005051 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -22,7 +22,7 @@ use PublicInbox::OverIdx;
use PublicInbox::Spawn qw(spawn nodatacow_dir);
use PublicInbox::Git qw(git_unquote);
use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp);
-our @EXPORT_OK = qw(crlf_adjust log2stack is_ancestor check_size prepare_stack
+our @EXPORT_OK = qw(log2stack is_ancestor check_size prepare_stack
index_text term_generator add_val is_bad_blob);
my $X = \%PublicInbox::Search::X;
our ($DB_CREATE_OR_OPEN, $DB_OPEN);
@@ -613,17 +613,6 @@ sub index_mm {
}
}
-# returns the number of bytes to add if given a non-CRLF arg
-sub crlf_adjust ($) {
- if (index($_[0], "\r\n") < 0) {
- # common case is LF-only, every \n needs an \r;
- # so favor a cheap tr// over an expensive m//g
- $_[0] =~ tr/\n/\n/;
- } else { # count number of '\n' w/o '\r', expensive:
- scalar(my @n = ($_[0] =~ m/(?<!\r)\n/g));
- }
-}
-
sub is_bad_blob ($$$$) {
my ($oid, $type, $size, $expect_oid) = @_;
if ($type ne 'blob') {
@@ -640,8 +629,8 @@ sub index_both { # git->cat_async callback
my ($nr, $max) = @$sync{qw(nr max)};
++$$nr;
$$max -= $size;
- $size += crlf_adjust($$bref);
- my $smsg = bless { bytes => $size, blob => $oid }, 'PublicInbox::Smsg';
+ my $smsg = bless { blob => $oid }, 'PublicInbox::Smsg';
+ $smsg->set_bytes($$bref, $size);
my $self = $sync->{sidx};
local $self->{current_info} = "$self->{current_info}: $oid";
my $eml = PublicInbox::Eml->new($bref);
diff --git a/lib/PublicInbox/Smsg.pm b/lib/PublicInbox/Smsg.pm
index 571cbb6f..c6ff7f52 100644
--- a/lib/PublicInbox/Smsg.pm
+++ b/lib/PublicInbox/Smsg.pm
@@ -135,4 +135,17 @@ sub subject_normalized ($) {
$subj;
}
+# returns the number of bytes to add if given a non-CRLF arg
+sub crlf_adjust ($) {
+ if (index($_[0], "\r\n") < 0) {
+ # common case is LF-only, every \n needs an \r;
+ # so favor a cheap tr// over an expensive m//g
+ $_[0] =~ tr/\n/\n/;
+ } else { # count number of '\n' w/o '\r', expensive:
+ scalar(my @n = ($_[0] =~ m/(?<!\r)\n/g));
+ }
+}
+
+sub set_bytes { $_[0]->{bytes} = $_[2] + crlf_adjust($_[1]) }
+
1;
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 7b6b93a0..c4efbdd2 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -17,8 +17,7 @@ use PublicInbox::InboxWritable;
use PublicInbox::OverIdx;
use PublicInbox::Msgmap;
use PublicInbox::Spawn qw(spawn popen_rd run_die);
-use PublicInbox::SearchIdx qw(log2stack crlf_adjust is_ancestor check_size
- is_bad_blob);
+use PublicInbox::SearchIdx qw(log2stack is_ancestor check_size is_bad_blob);
use IO::Handle; # ->autoflush
use File::Temp ();
@@ -139,13 +138,12 @@ sub idx_shard ($$) {
}
# indexes a message, returns true if checkpointing is needed
-sub do_idx ($$$$) {
- my ($self, $msgref, $eml, $smsg) = @_;
- $smsg->{bytes} = $smsg->{raw_bytes} + crlf_adjust($$msgref);
+sub do_idx ($$$) {
+ my ($self, $eml, $smsg) = @_;
$self->{oidx}->add_overview($eml, $smsg);
my $idx = idx_shard($self, $smsg->{num});
$idx->index_eml($eml, $smsg);
- my $n = $self->{transact_bytes} += $smsg->{raw_bytes};
+ my $n = $self->{transact_bytes} += $smsg->{bytes};
$n >= $self->{batch_bytes};
}
@@ -173,7 +171,7 @@ sub _add {
$cmt = $im->get_mark($cmt);
$self->{last_commit}->[$self->{epoch_max}] = $cmt;
- if (do_idx($self, delete $smsg->{-raw_email}, $mime, $smsg)) {
+ if (do_idx($self, $mime, $smsg)) {
$self->checkpoint;
}
@@ -536,13 +534,13 @@ W: $list
for my $smsg (@$need_reindex) {
my $new_smsg = bless {
blob => $blob,
- raw_bytes => $bytes,
num => $smsg->{num},
mid => $smsg->{mid},
}, 'PublicInbox::Smsg';
my $sync = { autime => $smsg->{ds}, cotime => $smsg->{ts} };
$new_smsg->populate($new_mime, $sync);
- do_idx($self, \$raw, $new_mime, $new_smsg);
+ $new_smsg->set_bytes($raw, $bytes);
+ do_idx($self, $new_mime, $new_smsg);
}
$rewritten->{rewrites};
}
@@ -937,13 +935,13 @@ sub index_oid { # cat_async callback
}
++${$arg->{nr}};
my $smsg = bless {
- raw_bytes => $size,
num => $num,
blob => $oid,
mid => $mid0,
}, 'PublicInbox::Smsg';
$smsg->populate($eml, $arg);
- if (do_idx($self, $bref, $eml, $smsg)) {
+ $smsg->set_bytes($$bref, $size);
+ if (do_idx($self, $eml, $smsg)) {
${$arg->{need_checkpoint}} = 1;
}
index_finalize($arg, 1);
@@ -1217,9 +1215,8 @@ sub index_xap_only { # git->cat_async callback
my ($bref, $oid, $type, $size, $smsg) = @_;
my $self = $smsg->{self};
my $idx = idx_shard($self, $smsg->{num});
- $smsg->{raw_bytes} = $size;
$idx->index_eml(PublicInbox::Eml->new($bref), $smsg);
- $self->{transact_bytes} += $size;
+ $self->{transact_bytes} += $smsg->{bytes};
}
sub index_xap_step ($$$;$) {
diff --git a/t/import.t b/t/import.t
index 855b5d7c..ae76858b 100644
--- a/t/import.t
+++ b/t/import.t
@@ -7,7 +7,6 @@ use PublicInbox::Eml;
use PublicInbox::Smsg;
use PublicInbox::Git;
use PublicInbox::Import;
-use PublicInbox::Spawn qw(spawn);
use Fcntl qw(:DEFAULT SEEK_SET);
use PublicInbox::TestCommon;
use MIME::Base64 3.05; # Perl 5.10.0 / 5.9.2
@@ -32,20 +31,13 @@ like($im->add($mime, undef, $smsg), qr/\A:[0-9]+\z/, 'added one message');
if ($v2) {
like($smsg->{blob}, qr/\A[a-f0-9]{40}\z/, 'got last object_id');
- my $raw_email = $smsg->{-raw_email};
- is($mime->as_string, $$raw_email, 'string matches');
- is($smsg->{raw_bytes}, length($$raw_email), 'length matches');
my @cmd = ('git', "--git-dir=$git->{git_dir}", qw(hash-object --stdin));
open my $in, '+<', undef or BAIL_OUT "open(+<): $!";
print $in $mime->as_string or die "write failed: $!";
$in->flush or die "flush failed: $!";
- seek($in, 0, SEEK_SET);
- open my $out, '+<', undef or BAIL_OUT "open(+<): $!";
- my $pid = spawn(\@cmd, {}, { 0 => $in, 1 => $out });
- is(waitpid($pid, 0), $pid, 'waitpid succeeds on hash-object');
+ seek($in, 0, SEEK_SET) or die "seek: $!";
+ chomp(my $hashed_obj = xqx(\@cmd, undef, { 0 => $in }));
is($?, 0, 'hash-object');
- seek($out, 0, SEEK_SET);
- chomp(my $hashed_obj = <$out>);
is($hashed_obj, $smsg->{blob}, "blob object_id matches exp");
}
diff --git a/t/search.t b/t/search.t
index 7495233e..b2958c00 100644
--- a/t/search.t
+++ b/t/search.t
@@ -60,7 +60,7 @@ sub oct_is ($$$) {
}
{
- my $crlf_adjust = \&PublicInbox::SearchIdx::crlf_adjust;
+ my $crlf_adjust = \&PublicInbox::Smsg::crlf_adjust;
is($crlf_adjust->("hi\r\nworld\r\n"), 0, 'no adjustment needed');
is($crlf_adjust->("hi\nworld\n"), 2, 'LF-only counts two CR');
is($crlf_adjust->("hi\r\nworld\n"), 1, 'CRLF/LF-mix 1 counts 1 CR');
next prev parent reply other threads:[~2021-01-03 2:06 UTC|newest]
Thread overview: 8+ messages / expand[flat|nested] mbox.gz Atom feed top
2021-01-03 2:06 [PATCH 0/7] v2: swap in new IPC package Eric Wong
2021-01-03 2:06 ` [PATCH 1/7] ipc: some documentation comments Eric Wong
2021-01-03 2:06 ` [PATCH 2/7] searchidxshard: use PublicInbox::IPC to kill lots of code Eric Wong
2021-01-03 2:06 ` [PATCH 3/7] searchidxshard: IPC conversion, part 2 Eric Wong
2021-01-03 2:06 ` [PATCH 4/7] searchidxshard: replace index_raw with index_eml Eric Wong
2021-01-03 2:06 ` Eric Wong [this message]
2021-01-03 2:06 ` [PATCH 6/7] ipc: switch to one-way pipes Eric Wong
2021-01-03 2:06 ` [PATCH 7/7] searchidxshard: use add_xapian directly for v2 Eric Wong
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
List information: https://public-inbox.org/README
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20210103020617.15719-6-e@80x24.org \
--to=e@80x24.org \
--cc=meta@public-inbox.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).