unofficial mirror of meta@public-inbox.org
 help / color / mirror / Atom feed
* [PATCH 0/5] various indexing cleanups and quieting
@ 2020-12-16 23:19 Eric Wong
  2020-12-16 23:19 ` [PATCH 1/5] inboxwritable: warn_ignore: "Bad UTF7 data escape" Eric Wong
                   ` (4 more replies)
  0 siblings, 5 replies; 6+ messages in thread
From: Eric Wong @ 2020-12-16 23:19 UTC (permalink / raw)
  To: meta

Less code, less warnings, more better

Eric Wong (5):
  inboxwritable: warn_ignore: skip "Bad UTF7 data escape"
  index: ignore data warnings, set {current_info} for v1
  inbox: simplify v2 epoch count, reuse in InboxWritable
  inboxwritable: drop git_dir_n sub
  extsearchidx: no need to use InboxWritable

 lib/PublicInbox/Admin.pm         | 23 ++++++++++++++---------
 lib/PublicInbox/ExtSearchIdx.pm  |  6 +-----
 lib/PublicInbox/Inbox.pm         | 18 +++++++-----------
 lib/PublicInbox/InboxWritable.pm | 22 +++++-----------------
 lib/PublicInbox/Search.pm        |  3 +--
 lib/PublicInbox/SearchIdx.pm     |  6 +++++-
 lib/PublicInbox/V2Writable.pm    | 19 ++++++-------------
 lib/PublicInbox/Xapcmd.pm        |  3 +--
 8 files changed, 40 insertions(+), 60 deletions(-)

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [PATCH 1/5] inboxwritable: warn_ignore: "Bad UTF7 data escape"
  2020-12-16 23:19 [PATCH 0/5] various indexing cleanups and quieting Eric Wong
@ 2020-12-16 23:19 ` Eric Wong
  2020-12-16 23:19 ` [PATCH 2/5] index: ignore some warnings, set {current_info} for v1 Eric Wong
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 6+ messages in thread
From: Eric Wong @ 2020-12-16 23:19 UTC (permalink / raw)
  To: meta

As with the other messages in this callback, there's
nothing we can do about invalid messages ending up in
our Maildirs for -watch.
---
 lib/PublicInbox/InboxWritable.pm | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lib/PublicInbox/InboxWritable.pm b/lib/PublicInbox/InboxWritable.pm
index 49809045..bdfae2f8 100644
--- a/lib/PublicInbox/InboxWritable.pm
+++ b/lib/PublicInbox/InboxWritable.pm
@@ -287,6 +287,8 @@ sub warn_ignore {
 	# PublicInbox::MsgTime
 	|| $s =~ /^bogus TZ offset: .+?, ignoring and assuming \+0000/
 	|| $s =~ /^bad Date: .+? in /
+	# Encode::Unicode::UTF7
+	|| $s =~ /^Bad UTF7 data escape at /
 }
 
 # this expects to be RHS in this assignment: "local $SIG{__WARN__} = ..."

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 2/5] index: ignore some warnings, set {current_info} for v1
  2020-12-16 23:19 [PATCH 0/5] various indexing cleanups and quieting Eric Wong
  2020-12-16 23:19 ` [PATCH 1/5] inboxwritable: warn_ignore: "Bad UTF7 data escape" Eric Wong
@ 2020-12-16 23:19 ` Eric Wong
  2020-12-16 23:19 ` [PATCH 3/5] inbox: simplify v2 epoch counting Eric Wong
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 6+ messages in thread
From: Eric Wong @ 2020-12-16 23:19 UTC (permalink / raw)
  To: meta

-index runs on data that's already frozen in git, so there's
no point in warning users about it.

While we're at it, set the {current_info} prefix for v1 as
we do in v2 inboxes in case new problems show up.
---
 lib/PublicInbox/Admin.pm     | 23 ++++++++++++++---------
 lib/PublicInbox/SearchIdx.pm |  6 +++++-
 2 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/lib/PublicInbox/Admin.pm b/lib/PublicInbox/Admin.pm
index eeef2f63..3977d812 100644
--- a/lib/PublicInbox/Admin.pm
+++ b/lib/PublicInbox/Admin.pm
@@ -209,12 +209,20 @@ sub index_terminate {
 
 sub index_inbox {
 	my ($ibx, $im, $opt) = @_;
+	require PublicInbox::InboxWritable;
 	my $jobs = delete $opt->{jobs} if $opt;
 	if (my $pr = $opt->{-progress}) {
 		$pr->("indexing $ibx->{inboxdir} ...\n");
 	}
 	local %SIG = %SIG;
 	setup_signals(\&index_terminate, $ibx);
+	my $warn_cb = $SIG{__WARN__} // sub { print STDERR @_ };
+	my $idx = { current_info => $ibx->{inboxdir} };
+	my $warn_ignore = PublicInbox::InboxWritable->can('warn_ignore');
+	local $SIG{__WARN__} = sub {
+		return if $warn_ignore->(@_);
+		$warn_cb->($idx->{current_info}, ': ', @_);
+	};
 	if (ref($ibx) && $ibx->version == 2) {
 		eval { require PublicInbox::V2Writable };
 		die "v2 requirements not met: $@\n" if $@;
@@ -226,21 +234,18 @@ sub index_inbox {
 			} else {
 				my $n = $v2w->{shards};
 				if ($jobs < ($n + 1) && !$opt->{reshard}) {
-					warn
-"Unable to respect --jobs=$jobs on index, inbox was created with $n shards\n";
+					warn <<EOM;
+Unable to respect --jobs=$jobs on index, inbox was created with $n shards
+EOM
 				}
 			}
 		}
-		my $warn_cb = $SIG{__WARN__} || sub { print STDERR @_ };
-		local $SIG{__WARN__} = sub {
-			$warn_cb->($v2w->{current_info}, ': ', @_);
-		};
-		$v2w->index_sync($opt);
+		$idx = $v2w;
 	} else {
 		require PublicInbox::SearchIdx;
-		my $s = PublicInbox::SearchIdx->new($ibx, 1);
-		$s->index_sync($opt);
+		$idx = PublicInbox::SearchIdx->new($ibx, 1);
 	}
+	$idx->index_sync($opt);
 }
 
 sub progress_prepare ($) {
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index c6d2a0e8..b731f698 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -617,6 +617,7 @@ sub index_both { # git->cat_async callback
 	$size += crlf_adjust($$bref);
 	my $smsg = bless { bytes => $size, blob => $oid }, 'PublicInbox::Smsg';
 	my $self = $sync->{sidx};
+	local $self->{current_info} = "$self->{current_info}: $oid";
 	my $eml = PublicInbox::Eml->new($bref);
 	$smsg->{num} = index_mm($self, $eml, $oid, $sync) or
 		die "E: could not generate NNTP article number for $oid";
@@ -628,7 +629,9 @@ sub index_both { # git->cat_async callback
 sub unindex_both { # git->cat_async callback
 	my ($bref, $oid, $type, $size, $sync) = @_;
 	return if is_bad_blob($oid, $type, $size, $sync->{oid});
-	unindex_eml($sync->{sidx}, $oid, PublicInbox::Eml->new($bref));
+	my $self = $sync->{sidx};
+	local $self->{current_info} = "$self->{current_info}: $oid";
+	unindex_eml($self, $oid, PublicInbox::Eml->new($bref));
 	# may be undef if leftover
 	if (defined(my $cur_cmt = $sync->{cur_cmt})) {
 		${$sync->{latest_cmt}} = $cur_cmt;
@@ -872,6 +875,7 @@ sub _index_sync {
 	my ($self, $opt) = @_;
 	my $tip = $opt->{ref} || 'HEAD';
 	my $ibx = $self->{ibx};
+	local $self->{current_info} = "$ibx->{inboxdir}";
 	$self->{batch_bytes} = $opt->{batch_size} // $BATCH_BYTES;
 	$ibx->git->batch_prepare;
 	my $pr = $opt->{-progress};

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 3/5] inbox: simplify v2 epoch counting
  2020-12-16 23:19 [PATCH 0/5] various indexing cleanups and quieting Eric Wong
  2020-12-16 23:19 ` [PATCH 1/5] inboxwritable: warn_ignore: "Bad UTF7 data escape" Eric Wong
  2020-12-16 23:19 ` [PATCH 2/5] index: ignore some warnings, set {current_info} for v1 Eric Wong
@ 2020-12-16 23:19 ` Eric Wong
  2020-12-16 23:19 ` [PATCH 4/5] inboxwritable: drop git_dir_n sub Eric Wong
  2020-12-16 23:19 ` [PATCH 5/5] extsearchidx: no need to make InboxWritable Eric Wong
  4 siblings, 0 replies; 6+ messages in thread
From: Eric Wong @ 2020-12-16 23:19 UTC (permalink / raw)
  To: meta

Perl readdir detects list context and can return an array
suitable for the grep op.  From there, we can rely on
substr to remove the ".git" suffix and integerize the value
to save a few bytes before letting List::Util::max return
the value.

This is how we detect Xapian shards nowadays, too, and
we'll also use defined-or (//) to simplify the return
value there.

We'll also simplify InboxWritable->git_dir_latest,
remove some callers, and consider removing it entirely.
---
 lib/PublicInbox/ExtSearchIdx.pm  |  4 +---
 lib/PublicInbox/Inbox.pm         | 18 +++++++-----------
 lib/PublicInbox/InboxWritable.pm | 17 +++--------------
 lib/PublicInbox/Search.pm        |  3 +--
 lib/PublicInbox/V2Writable.pm    | 16 ++++------------
 lib/PublicInbox/Xapcmd.pm        |  3 +--
 6 files changed, 17 insertions(+), 44 deletions(-)

diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm
index f492734a..3764612c 100644
--- a/lib/PublicInbox/ExtSearchIdx.pm
+++ b/lib/PublicInbox/ExtSearchIdx.pm
@@ -290,9 +290,7 @@ sub _sync_inbox ($$$) {
 	my $v = $ibx->version;
 	my $ekey = $ibx->eidx_key;
 	if ($v == 2) {
-		my $epoch_max;
-		defined($ibx->git_dir_latest(\$epoch_max)) or return;
-		$sync->{epoch_max} = $epoch_max;
+		$sync->{epoch_max} = $ibx->max_git_epoch // return;
 		sync_prepare($self, $sync); # or return # TODO: once MiscIdx is stable
 	} elsif ($v == 1) {
 		my $uv = $ibx->uidvalidity;
diff --git a/lib/PublicInbox/Inbox.pm b/lib/PublicInbox/Inbox.pm
index 4e3c23f3..0973bf94 100644
--- a/lib/PublicInbox/Inbox.pm
+++ b/lib/PublicInbox/Inbox.pm
@@ -4,10 +4,10 @@
 # Represents a public-inbox (which may have multiple mailing addresses)
 package PublicInbox::Inbox;
 use strict;
-use warnings;
 use PublicInbox::Git;
 use PublicInbox::MID qw(mid2path);
 use PublicInbox::Eml;
+use List::Util qw(max);
 
 # Long-running "git-cat-file --batch" processes won't notice
 # unlinked packs, so we need to restart those processes occasionally.
@@ -155,19 +155,15 @@ sub max_git_epoch {
 	my ($self) = @_;
 	return if $self->version < 2;
 	my $cur = $self->{-max_git_epoch};
-	my $changed = git($self)->alternates_changed;
-	if (!defined($cur) || $changed) {
+	my $changed;
+	if (!defined($cur) || ($changed = git($self)->alternates_changed)) {
 		git_cleanup($self) if $changed;
 		my $gits = "$self->{inboxdir}/git";
 		if (opendir my $dh, $gits) {
-			my $max = -1;
-			while (defined(my $git_dir = readdir($dh))) {
-				$git_dir =~ m!\A([0-9]+)\.git\z! or next;
-				$max = $1 if $1 > $max;
-			}
-			$cur = $self->{-max_git_epoch} = $max if $max >= 0;
-		} else {
-			warn "opendir $gits failed: $!\n";
+			my $max = max(map {
+				substr($_, 0, -4) + 0; # drop ".git" suffix
+			} grep(/\A[0-9]+\.git\z/, readdir($dh))) // return;
+			$cur = $self->{-max_git_epoch} = $max;
 		}
 	}
 	$cur;
diff --git a/lib/PublicInbox/InboxWritable.pm b/lib/PublicInbox/InboxWritable.pm
index bdfae2f8..48d2267f 100644
--- a/lib/PublicInbox/InboxWritable.pm
+++ b/lib/PublicInbox/InboxWritable.pm
@@ -303,22 +303,11 @@ sub warn_ignore_cb {
 # v2+ only
 sub git_dir_n { "$_[0]->{inboxdir}/git/$_[1].git" }
 
-# v2+ only
+# v2+ only, XXX: maybe we can just rely on ->max_git_epoch and remove
 sub git_dir_latest {
 	my ($self, $max) = @_;
-	$$max = -1;
-	my $pfx = "$self->{inboxdir}/git";
-	return unless -d $pfx;
-	my $latest;
-	opendir my $dh, $pfx or die "opendir $pfx: $!\n";
-	while (defined(my $git_dir = readdir($dh))) {
-		$git_dir =~ m!\A([0-9]+)\.git\z! or next;
-		if ($1 > $$max) {
-			$$max = $1;
-			$latest = "$pfx/$git_dir";
-		}
-	}
-	$latest;
+	defined($$max = $self->max_git_epoch) ?
+		"$self->{inboxdir}/git/$$max.git" : undef;
 }
 
 1;
diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index 803914b0..b1d38fb9 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -197,8 +197,7 @@ sub xdb_sharded {
 
 	# We need numeric sorting so shard[0] is first for reading
 	# Xapian metadata, if needed
-	my $last = max(grep(/\A[0-9]+\z/, readdir($dh)));
-	return if !defined($last);
+	my $last = max(grep(/\A[0-9]+\z/, readdir($dh))) // return;
 	my (@xdb, $slow_phrase);
 	for (0..$last) {
 		my $shard_dir = "$self->{xpfx}/$_";
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 992305c5..7b8b5abf 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -128,12 +128,9 @@ sub init_inbox {
 	}
 	$self->idx_init;
 	$self->{mm}->skip_artnum($skip_artnum) if defined $skip_artnum;
-	my $epoch_max = -1;
-	$self->{ibx}->git_dir_latest(\$epoch_max);
-	if (defined $skip_epoch && $epoch_max == -1) {
-		$epoch_max = $skip_epoch;
-	}
-	$self->git_init($epoch_max >= 0 ? $epoch_max : 0);
+	my $max = $self->{ibx}->max_git_epoch;
+	$max = $skip_epoch if (defined($skip_epoch) && !defined($max));
+	$self->git_init($max // 0);
 	$self->done;
 }
 
@@ -336,12 +333,7 @@ sub _replace_oids ($$$) {
 	my $ibx = $self->{ibx};
 	my $pfx = "$ibx->{inboxdir}/git";
 	my $rewrites = []; # epoch => commit
-	my $max = $self->{epoch_max};
-
-	unless (defined($max)) {
-		defined(my $latest = $ibx->git_dir_latest(\$max)) or return;
-		$self->{epoch_max} = $max;
-	}
+	my $max = $self->{epoch_max} //= $ibx->max_git_epoch // return;
 
 	foreach my $i (0..$max) {
 		my $git_dir = "$pfx/$i.git";
diff --git a/lib/PublicInbox/Xapcmd.pm b/lib/PublicInbox/Xapcmd.pm
index 4332943c..4f77ef25 100644
--- a/lib/PublicInbox/Xapcmd.pm
+++ b/lib/PublicInbox/Xapcmd.pm
@@ -109,8 +109,7 @@ sub prepare_reindex ($$$) {
 			$opt->{reindex}->{from} = $lc;
 		}
 	} else { # v2
-		my $max;
-		$ibx->git_dir_latest(\$max) or return;
+		my $max = $ibx->max_git_epoch // return;
 		my $from = $opt->{reindex}->{from};
 		my $mm = $ibx->mm;
 		my $v = PublicInbox::Search::SCHEMA_VERSION();

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 4/5] inboxwritable: drop git_dir_n sub
  2020-12-16 23:19 [PATCH 0/5] various indexing cleanups and quieting Eric Wong
                   ` (2 preceding siblings ...)
  2020-12-16 23:19 ` [PATCH 3/5] inbox: simplify v2 epoch counting Eric Wong
@ 2020-12-16 23:19 ` Eric Wong
  2020-12-16 23:19 ` [PATCH 5/5] extsearchidx: no need to make InboxWritable Eric Wong
  4 siblings, 0 replies; 6+ messages in thread
From: Eric Wong @ 2020-12-16 23:19 UTC (permalink / raw)
  To: meta

There's only one caller, unlikely to be any more, and
should be harmless to open code.
---
 lib/PublicInbox/InboxWritable.pm | 3 ---
 lib/PublicInbox/V2Writable.pm    | 3 ++-
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/lib/PublicInbox/InboxWritable.pm b/lib/PublicInbox/InboxWritable.pm
index 48d2267f..c0e88f3d 100644
--- a/lib/PublicInbox/InboxWritable.pm
+++ b/lib/PublicInbox/InboxWritable.pm
@@ -300,9 +300,6 @@ sub warn_ignore_cb {
 	}
 }
 
-# v2+ only
-sub git_dir_n { "$_[0]->{inboxdir}/git/$_[1].git" }
-
 # v2+ only, XXX: maybe we can just rely on ->max_git_epoch and remove
 sub git_dir_latest {
 	my ($self, $max) = @_;
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 7b8b5abf..3e3b275f 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -1091,8 +1091,9 @@ sub sync_prepare ($$) {
 	if ($sync->{max_size} = $sync->{-opt}->{max_size}) {
 		$sync->{index_oid} = $self->can('index_oid');
 	}
+	my $git_pfx = "$sync->{ibx}->{inboxdir}/git";
 	for (my $i = $sync->{epoch_max}; $i >= 0; $i--) {
-		my $git_dir = $sync->{ibx}->git_dir_n($i);
+		my $git_dir = "$git_pfx/$i.git";
 		-d $git_dir or next; # missing epochs are fine
 		my $git = PublicInbox::Git->new($git_dir);
 		my $unit = { git => $git, epoch => $i };

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 5/5] extsearchidx: no need to make InboxWritable
  2020-12-16 23:19 [PATCH 0/5] various indexing cleanups and quieting Eric Wong
                   ` (3 preceding siblings ...)
  2020-12-16 23:19 ` [PATCH 4/5] inboxwritable: drop git_dir_n sub Eric Wong
@ 2020-12-16 23:19 ` Eric Wong
  4 siblings, 0 replies; 6+ messages in thread
From: Eric Wong @ 2020-12-16 23:19 UTC (permalink / raw)
  To: meta

extindex treats v1/v2 public inboxes as read-only, so there's
no need to scare people by using the InboxWritable package
now that ->git_dir_n is gone and we can use ->max_git_epoch
instead of ->git_dir_latest.
---
 lib/PublicInbox/ExtSearchIdx.pm | 2 --
 1 file changed, 2 deletions(-)

diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm
index 3764612c..c6fb398b 100644
--- a/lib/PublicInbox/ExtSearchIdx.pm
+++ b/lib/PublicInbox/ExtSearchIdx.pm
@@ -63,7 +63,6 @@ sub new {
 
 sub attach_inbox {
 	my ($self, $ibx) = @_;
-	$ibx = PublicInbox::InboxWritable->new($ibx);
 	my $key = $ibx->eidx_key;
 	if (!$ibx->over || !$ibx->mm) {
 		warn "W: skipping $key (unindexed)\n";
@@ -78,7 +77,6 @@ sub attach_inbox {
 		warn "W: `$ibx->{inboxdir}' canonicalized to `$ibxdir'\n";
 		$ibx->{inboxdir} = $ibxdir;
 	}
-	$ibx = PublicInbox::InboxWritable->new($ibx);
 	$self->{ibx_map}->{$key} //= do {
 		push @{$self->{ibx_list}}, $ibx;
 		$ibx;

^ permalink raw reply related	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2020-12-16 23:19 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-12-16 23:19 [PATCH 0/5] various indexing cleanups and quieting Eric Wong
2020-12-16 23:19 ` [PATCH 1/5] inboxwritable: warn_ignore: "Bad UTF7 data escape" Eric Wong
2020-12-16 23:19 ` [PATCH 2/5] index: ignore some warnings, set {current_info} for v1 Eric Wong
2020-12-16 23:19 ` [PATCH 3/5] inbox: simplify v2 epoch counting Eric Wong
2020-12-16 23:19 ` [PATCH 4/5] inboxwritable: drop git_dir_n sub Eric Wong
2020-12-16 23:19 ` [PATCH 5/5] extsearchidx: no need to make InboxWritable Eric Wong

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).