unofficial mirror of meta@public-inbox.org
 help / color / mirror / Atom feed
* [PATCH] speedup $EXTRACT_DIFFS callers by 1%
@ 2024-06-25 18:49 Eric Wong
  0 siblings, 0 replies; only message in thread
From: Eric Wong @ 2024-06-25 18:49 UTC (permalink / raw)
  To: meta

While Perl docs recommend against using //o, we know the regexp
won't change at runtime and there's a measurable improvement to
be found.  The included perf test on a packed mirror of
meta@public-inbox.org shows a consistent ~1% improvement on my
system.
---
 MANIFEST                       |  1 +
 lib/PublicInbox/LeiRediff.pm   |  2 +-
 lib/PublicInbox/LeiViewText.pm |  2 +-
 lib/PublicInbox/ViewDiff.pm    |  2 +-
 xt/perf-extract-diffs.t        | 80 ++++++++++++++++++++++++++++++++++
 5 files changed, 84 insertions(+), 3 deletions(-)
 create mode 100644 xt/perf-extract-diffs.t

diff --git a/MANIFEST b/MANIFEST
index 3fc01a43..af65a86e 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -662,6 +662,7 @@ xt/net_nntp_socks.t
 xt/net_writer-imap.t
 xt/nntpd-validate.t
 xt/over-fsck.perl
+xt/perf-extract-diffs.t
 xt/perf-msgview.t
 xt/perf-nntpd.t
 xt/perf-threading.t
diff --git a/lib/PublicInbox/LeiRediff.pm b/lib/PublicInbox/LeiRediff.pm
index 66359dd4..59fee3f6 100644
--- a/lib/PublicInbox/LeiRediff.pm
+++ b/lib/PublicInbox/LeiRediff.pm
@@ -158,7 +158,7 @@ sub extract_oids { # Eml each_part callback
 	$self->{dqre} && $s =~ s/$self->{dqre}//g && $lei->{opt}->{drq} and
 		local $lei->{1} = requote($lei, $1);
 
-	my @top = split($PublicInbox::ViewDiff::EXTRACT_DIFFS, $s);
+	my @top = split(/$PublicInbox::ViewDiff::EXTRACT_DIFFS/o, $s);
 	undef $s;
 	my $blobs = $self->{blobs}; # blobs to resolve
 	my $ctxq;
diff --git a/lib/PublicInbox/LeiViewText.pm b/lib/PublicInbox/LeiViewText.pm
index 6510b19e..fa608ca2 100644
--- a/lib/PublicInbox/LeiViewText.pm
+++ b/lib/PublicInbox/LeiViewText.pm
@@ -187,7 +187,7 @@ EOF
 
 sub flush_text_diff ($$) {
 	my ($self, $cur) = @_;
-	my @top = split($PublicInbox::ViewDiff::EXTRACT_DIFFS, $$cur);
+	my @top = split(/$PublicInbox::ViewDiff::EXTRACT_DIFFS/o, $$cur);
 	undef $$cur; # free memory
 	my $dctx;
 	my $obuf = $self->{obuf};
diff --git a/lib/PublicInbox/ViewDiff.pm b/lib/PublicInbox/ViewDiff.pm
index 7a6d9a2b..9ddf8515 100644
--- a/lib/PublicInbox/ViewDiff.pm
+++ b/lib/PublicInbox/ViewDiff.pm
@@ -183,7 +183,7 @@ sub flush_diff ($$) {
 	my ($ctx, $cur) = @_;
 
 	my ($subj) = ($$cur =~ /^Subject:\s*\[[^\]]+\]\s*(.+?)$/sm);
-	my @top = split($EXTRACT_DIFFS, $$cur);
+	my @top = split(/$EXTRACT_DIFFS/o, $$cur);
 	undef $$cur; # free memory
 	$ctx->{-qry_subj} = $subj if $subj;
 	my $lnk = $ctx->{-linkify};
diff --git a/xt/perf-extract-diffs.t b/xt/perf-extract-diffs.t
new file mode 100644
index 00000000..72c39ed1
--- /dev/null
+++ b/xt/perf-extract-diffs.t
@@ -0,0 +1,80 @@
+#!perl -w
+# Copyright (C) all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use v5.12;
+use PublicInbox::TestCommon;
+use Benchmark qw(:all :hireswallclock);
+use PublicInbox::Inbox;
+use PublicInbox::ViewDiff;
+use PublicInbox::MsgIter qw(msg_part_text);
+my $nr = $ENV{NR} // 5;
+my $inboxdir = $ENV{GIANT_INBOX_DIR};
+plan skip_all => "GIANT_INBOX_DIR not defined for $0" unless $inboxdir;
+
+my @cat = qw(cat-file --buffer --batch-check --batch-all-objects);
+if (require_git(v2.19, 1)) {
+	push @cat, '--unordered';
+} else {
+	warn
+"git <2.19, cat-file lacks --unordered, locality suffers\n";
+}
+my $ibx = PublicInbox::Inbox->new({ inboxdir => $inboxdir, name => 'name' });
+my $git = $ibx->git;
+my ($eml, $res, $oid, $type, $n, $m);
+my ($part, $s, $err, @top);
+sub text_part {
+	$part = $_[0]->[0];
+	($s, $err) = msg_part_text($part, $part->content_type || 'text/plain');
+	$s // return;
+	$s =~ s/\r+\n/\n/sg;
+}
+
+my %extract_cb = (
+	var => sub { # callback for Eml->each_part
+		text_part(@_) // return;
+		my @top = split($PublicInbox::ViewDiff::EXTRACT_DIFFS, $s);
+	},
+	slash => sub { # callback for Eml->each_part
+		text_part(@_) // return;
+		my @top = split(/$PublicInbox::ViewDiff::EXTRACT_DIFFS/, $s);
+	},
+	slash_o => sub { # callback for Eml->each_part
+		text_part(@_) // return;
+		my @top = split(/$PublicInbox::ViewDiff::EXTRACT_DIFFS/o, $s);
+	},
+);
+
+my $oid_cb = sub {
+	my ($bref, undef, undef, undef, $cb) = @_;
+	++$m;
+	$eml = PublicInbox::Eml->new($bref);
+	$eml->each_part($cb);
+};
+
+# ensure all --batch-check processes are ready
+my @cats = map {
+	my $fh = $git->popen(@cat);
+	vec(my $vec = '', fileno($fh), 1) = 1;
+	select($vec, undef, undef, 60) or
+		xbail 'timed out waiting for --batch-check';
+	$fh
+} (1..((scalar keys %extract_cb) * $nr));
+
+my $time;
+while (my ($name, $eml_cb) = each %extract_cb) {
+	$time->{$name} = sub {
+		my $fh = shift @cats // xbail "no --batch-check for $name";
+		$n = $m = 0;
+		while (<$fh>) {
+			($oid, $type) = split / /;
+			next if $type ne 'blob';
+			++$n;
+			$git->cat_async($oid, $oid_cb, $eml_cb);
+		}
+		$git->async_wait_all;
+		is $n, $m, "$n of $m messages scanned ($name)";
+	};
+}
+
+timethese($nr, $time);
+done_testing;

^ permalink raw reply related	[flat|nested] only message in thread

only message in thread, other threads:[~2024-06-25 18:49 UTC | newest]

Thread overview: (only message) (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-06-25 18:49 [PATCH] speedup $EXTRACT_DIFFS callers by 1% Eric Wong

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).