* [PATCH 1/5] inbox: delay ->version detection
2020-12-21 7:51 [PATCH 0/5] extindex->misc prep stuff Eric Wong
@ 2020-12-21 7:51 ` Eric Wong
2020-12-21 18:54 ` Eric Wong
2020-12-21 7:51 ` [PATCH 2/5] isearch: use numeric sort for article numbers Eric Wong
` (3 subsequent siblings)
4 siblings, 1 reply; 7+ messages in thread
From: Eric Wong @ 2020-12-21 7:51 UTC (permalink / raw)
To: meta
Our read-only code won't need to know the version until an inbox
is accessed. This is a small step towards eliminating many
stat() calls on read-only daemon startup.
---
lib/PublicInbox/Inbox.pm | 8 +++-----
1 file changed, 3 insertions(+), 5 deletions(-)
diff --git a/lib/PublicInbox/Inbox.pm b/lib/PublicInbox/Inbox.pm
index 8a3a0194..863a5de4 100644
--- a/lib/PublicInbox/Inbox.pm
+++ b/lib/PublicInbox/Inbox.pm
@@ -109,10 +109,6 @@ sub new {
delete $opts->{feedmax};
}
$opts->{nntpserver} ||= $pi_cfg->{'publicinbox.nntpserver'};
- my $dir = $opts->{inboxdir};
- if (defined $dir && -f "$dir/inbox.lock") {
- $opts->{version} = 2;
- }
# allow any combination of multi-line or comma-delimited hide entries
my $hide = {};
@@ -125,7 +121,9 @@ sub new {
bless $opts, $class;
}
-sub version { $_[0]->{version} // 1 }
+sub version {
+ $_[0]->{version} //= -f "$_[0]->{inboxdir}/inbox.lock" ? 2 : 1
+}
sub git_epoch {
my ($self, $epoch) = @_; # v2-only, callers always supply $epoch
^ permalink raw reply related [flat|nested] 7+ messages in thread
* [PATCH 2/5] isearch: use numeric sort for article numbers
2020-12-21 7:51 [PATCH 0/5] extindex->misc prep stuff Eric Wong
2020-12-21 7:51 ` [PATCH 1/5] inbox: delay ->version detection Eric Wong
@ 2020-12-21 7:51 ` Eric Wong
2020-12-21 7:51 ` [PATCH 3/5] use rel2abs_collapsed when loading Inbox objects Eric Wong
` (2 subsequent siblings)
4 siblings, 0 replies; 7+ messages in thread
From: Eric Wong @ 2020-12-21 7:51 UTC (permalink / raw)
To: meta
Perl sort is alphabetical by default and Xapian uses numeric
document IDs, so sort must be told explicitly to use numeric
comparisons even if the scalars are integer values (IV)
internally.
And eliminate extra hash marks ("#") since they're probably too
noisy if there are many IDs.
Note: I haven't seen this warning message in syslog, yet :>
---
lib/PublicInbox/Isearch.pm | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/lib/PublicInbox/Isearch.pm b/lib/PublicInbox/Isearch.pm
index 8a1f257a..e362c80a 100644
--- a/lib/PublicInbox/Isearch.pm
+++ b/lib/PublicInbox/Isearch.pm
@@ -89,7 +89,7 @@ SELECT docid,xnum FROM xref3 WHERE ibx_id = ? AND docid IN ($qmarks)
}
if (scalar keys %order) {
warn "W: $self->{es}->{topdir} #",
- join(', #', sort keys %order),
+ join(', ', sort { $a <=> $b } keys %order),
" not mapped to `$self->{eidx_key}'\n";
warn "W: $self->{es}->{topdir} may need to be reindexed\n";
@xnums = grep { defined } @xnums;
@@ -113,7 +113,7 @@ sub mset_to_smsg {
}
if (scalar keys %order) {
warn "W: $ibx->{inboxdir} #",
- join(', #', sort keys %order),
+ join(', ', sort { $a <=> $b } keys %order),
" no longer valid\n";
warn "W: $self->{es}->{topdir} may need to be reindexed\n";
}
^ permalink raw reply related [flat|nested] 7+ messages in thread
* [PATCH 3/5] use rel2abs_collapsed when loading Inbox objects
2020-12-21 7:51 [PATCH 0/5] extindex->misc prep stuff Eric Wong
2020-12-21 7:51 ` [PATCH 1/5] inbox: delay ->version detection Eric Wong
2020-12-21 7:51 ` [PATCH 2/5] isearch: use numeric sort for article numbers Eric Wong
@ 2020-12-21 7:51 ` Eric Wong
2020-12-21 7:51 ` [PATCH 4/5] searchidx: rename get_val to int_val and return IV Eric Wong
2020-12-21 7:51 ` [PATCH 5/5] extsearch*: drop unnecessary path canonicalization Eric Wong
4 siblings, 0 replies; 7+ messages in thread
From: Eric Wong @ 2020-12-21 7:51 UTC (permalink / raw)
To: meta
We need to canonicalize paths for inboxes which do not have
a newsgroup defined, otherwise ->eidx_key matches can fail
in unexpected ways.
---
lib/PublicInbox/Admin.pm | 11 +----------
lib/PublicInbox/Config.pm | 28 +++++++++++++++++++++++-----
lib/PublicInbox/ExtSearchIdx.pm | 5 -----
script/public-inbox-convert | 2 +-
script/public-inbox-init | 2 +-
5 files changed, 26 insertions(+), 22 deletions(-)
diff --git a/lib/PublicInbox/Admin.pm b/lib/PublicInbox/Admin.pm
index ea82133a..c972fb68 100644
--- a/lib/PublicInbox/Admin.pm
+++ b/lib/PublicInbox/Admin.pm
@@ -10,7 +10,7 @@ our @EXPORT_OK = qw(setup_signals);
use PublicInbox::Config;
use PublicInbox::Inbox;
use PublicInbox::Spawn qw(popen_rd);
-use File::Spec ();
+*rel2abs_collapsed = \&PublicInbox::Config::rel2abs_collapsed;
sub setup_signals {
my ($cb, $arg) = @_; # optional
@@ -27,15 +27,6 @@ sub setup_signals {
};
}
-# abs_path resolves symlinks, so we want to avoid it if rel2abs
-# is sufficient and doesn't leave "/.." or "/../"
-sub rel2abs_collapsed ($) {
- my $p = File::Spec->rel2abs($_[0]);
- return $p if substr($p, -3, 3) ne '/..' && index($p, '/../') < 0; # likely
- require Cwd;
- Cwd::abs_path($p);
-}
-
sub resolve_inboxdir {
my ($cd, $ver) = @_;
my $try = $cd // '.';
diff --git a/lib/PublicInbox/Config.pm b/lib/PublicInbox/Config.pm
index 2f5c83cd..577337dc 100644
--- a/lib/PublicInbox/Config.pm
+++ b/lib/PublicInbox/Config.pm
@@ -368,6 +368,16 @@ sub git_bool {
}
}
+# abs_path resolves symlinks, so we want to avoid it if rel2abs
+# is sufficient and doesn't leave "/.." or "/../"
+sub rel2abs_collapsed {
+ require File::Spec;
+ my $p = File::Spec->rel2abs($_[-1]);
+ return $p if substr($p, -3, 3) ne '/..' && index($p, '/../') < 0;
+ require Cwd;
+ Cwd::abs_path($p);
+}
+
sub _fill {
my ($self, $pfx) = @_;
my $ibx = {};
@@ -391,9 +401,9 @@ EOF
}
# "mainrepo" is backwards compatibility:
- $ibx->{inboxdir} //= $self->{"$pfx.mainrepo"} // return;
- if ($ibx->{inboxdir} =~ /\n/s) {
- warn "E: `$ibx->{inboxdir}' must not contain `\\n'\n";
+ my $dir = $ibx->{inboxdir} //= $self->{"$pfx.mainrepo"} // return;
+ if (index($dir, "\n") >= 0) {
+ warn "E: `$dir' must not contain `\\n'\n";
return;
}
foreach my $k (qw(obfuscate)) {
@@ -436,7 +446,7 @@ EOF
$self->{-by_list_id}->{lc($list_id)} = $ibx;
}
}
- if (my $ngname = $ibx->{newsgroup}) {
+ if (defined(my $ngname = $ibx->{newsgroup})) {
if (ref($ngname)) {
delete $ibx->{newsgroup};
warn 'multiple newsgroups not supported: '.
@@ -445,7 +455,8 @@ EOF
# wildmat-exact and RFC 3501 (IMAP) ATOM-CHAR.
# Leave out a few chars likely to cause problems or conflicts:
# '|', '<', '>', ';', '#', '$', '&',
- } elsif ($ngname =~ m![^A-Za-z0-9/_\.\-\~\@\+\=:]!) {
+ } elsif ($ngname =~ m![^A-Za-z0-9/_\.\-\~\@\+\=:]! ||
+ $ngname eq '') {
delete $ibx->{newsgroup};
warn "newsgroup name invalid: `$ngname'\n";
} else {
@@ -454,6 +465,13 @@ EOF
$self->{-by_newsgroup}->{$ngname} = $ibx;
}
}
+ unless (defined $ibx->{newsgroup}) { # for ->eidx_key
+ my $abs = rel2abs_collapsed($dir);
+ if ($abs ne $dir) {
+ warn "W: `$dir' canonicalized to `$abs'\n";
+ $ibx->{inboxdir} = $abs;
+ }
+ }
$self->{-by_name}->{$name} = $ibx;
if ($ibx->{obfuscate}) {
$ibx->{-no_obfuscate} = $self->{-no_obfuscate};
diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm
index b82d0546..c4b429df 100644
--- a/lib/PublicInbox/ExtSearchIdx.pm
+++ b/lib/PublicInbox/ExtSearchIdx.pm
@@ -72,11 +72,6 @@ sub attach_inbox {
warn "W: skipping $key (no UIDVALIDITY)\n";
return;
}
- my $ibxdir = File::Spec->canonpath($ibx->{inboxdir});
- if ($ibxdir ne $ibx->{inboxdir}) {
- warn "W: `$ibx->{inboxdir}' canonicalized to `$ibxdir'\n";
- $ibx->{inboxdir} = $ibxdir;
- }
$self->{ibx_map}->{$key} //= do {
push @{$self->{ibx_list}}, $ibx;
$ibx;
diff --git a/script/public-inbox-convert b/script/public-inbox-convert
index fbd527a6..800c364c 100755
--- a/script/public-inbox-convert
+++ b/script/public-inbox-convert
@@ -75,7 +75,7 @@ if ($opt->{'index'}) {
}
local %ENV = (%$env, %ENV) if $env;
my $new = { %$old };
-$new->{inboxdir} = PublicInbox::Admin::rel2abs_collapsed($new_dir);
+$new->{inboxdir} = $cfg->rel2abs_collapsed($new_dir);
$new->{version} = 2;
$new = PublicInbox::InboxWritable->new($new, { nproc => $opt->{jobs} });
$new->{-no_fsync} = 1 if !$opt->{fsync};
diff --git a/script/public-inbox-init b/script/public-inbox-init
index eb605a51..afaa4c12 100755
--- a/script/public-inbox-init
+++ b/script/public-inbox-init
@@ -138,7 +138,7 @@ close($fh) or die "failed to close $pi_config_tmp: $!\n";
my $pfx = "publicinbox.$name";
my @x = (qw/git config/, "--file=$pi_config_tmp");
-PublicInbox::Admin::rel2abs_collapsed($inboxdir);
+PublicInbox::Config::rel2abs_collapsed($inboxdir);
die "`\\n' not allowed in `$inboxdir'\n" if index($inboxdir, "\n") >= 0;
if (-f "$inboxdir/inbox.lock") {
^ permalink raw reply related [flat|nested] 7+ messages in thread
* [PATCH 4/5] searchidx: rename get_val to int_val and return IV
2020-12-21 7:51 [PATCH 0/5] extindex->misc prep stuff Eric Wong
` (2 preceding siblings ...)
2020-12-21 7:51 ` [PATCH 3/5] use rel2abs_collapsed when loading Inbox objects Eric Wong
@ 2020-12-21 7:51 ` Eric Wong
2020-12-21 7:51 ` [PATCH 5/5] extsearch*: drop unnecessary path canonicalization Eric Wong
4 siblings, 0 replies; 7+ messages in thread
From: Eric Wong @ 2020-12-21 7:51 UTC (permalink / raw)
To: meta
Values can be strings in Xapian, although we currently use
integer values exclusively. Give the wrapper a more appropriate
name in case we start using string columns.
For future-proofing, we'll now return `undef' on missing columns
and coerce the return value to an IV (integer value) to save
memory, as sortable_unserialise returns a PV (pointer value)
scalar despite it existing to support numeric values.
---
lib/PublicInbox/SearchIdx.pm | 9 +++++----
t/search.t | 4 ++--
2 files changed, 7 insertions(+), 6 deletions(-)
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index b731f698..cf2c2c55 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -501,17 +501,18 @@ sub remove_eidx_info {
$self->{xdb}->replace_document($docid, $doc);
}
-sub get_val ($$) {
+sub int_val ($$) {
my ($doc, $col) = @_;
- sortable_unserialise($doc->get_value($col));
+ my $val = $doc->get_value($col) or return; # undefined is '' in Xapian
+ sortable_unserialise($val) + 0; # PV => IV conversion
}
sub smsg_from_doc ($) {
my ($doc) = @_;
my $data = $doc->get_data or return;
my $smsg = bless {}, 'PublicInbox::Smsg';
- $smsg->{ts} = get_val($doc, PublicInbox::Search::TS());
- my $dt = get_val($doc, PublicInbox::Search::DT());
+ $smsg->{ts} = int_val($doc, PublicInbox::Search::TS());
+ my $dt = int_val($doc, PublicInbox::Search::DT());
my ($yyyy, $mon, $dd, $hh, $mm, $ss) = unpack('A4A2A2A2A2A2', $dt);
$smsg->{ds} = timegm($ss, $mm, $hh, $dd, $mon - 1, $yyyy);
$smsg->load_from_data($data);
diff --git a/t/search.t b/t/search.t
index da9acb07..11143204 100644
--- a/t/search.t
+++ b/t/search.t
@@ -332,13 +332,13 @@ $ibx->with_umask(sub {
like($smsg->{to}, qr/\blist\@example\.com\b/, 'to appears');
my $doc = $m->get_document;
my $col = PublicInbox::Search::BYTES();
- my $bytes = PublicInbox::SearchIdx::get_val($doc, $col);
+ my $bytes = PublicInbox::SearchIdx::int_val($doc, $col);
like($bytes, qr/\A[0-9]+\z/, '$bytes stored as digit');
ok($bytes > 0, '$bytes is > 0');
is($bytes, $smsg->{bytes}, 'bytes Xapian value matches Over');
$col = PublicInbox::Search::UID();
- my $uid = PublicInbox::SearchIdx::get_val($doc, $col);
+ my $uid = PublicInbox::SearchIdx::int_val($doc, $col);
is($uid, $smsg->{num}, 'UID column matches {num}');
is($uid, $m->get_docid, 'UID column matches docid');
}
^ permalink raw reply related [flat|nested] 7+ messages in thread
* [PATCH 5/5] extsearch*: drop unnecessary path canonicalization
2020-12-21 7:51 [PATCH 0/5] extindex->misc prep stuff Eric Wong
` (3 preceding siblings ...)
2020-12-21 7:51 ` [PATCH 4/5] searchidx: rename get_val to int_val and return IV Eric Wong
@ 2020-12-21 7:51 ` Eric Wong
4 siblings, 0 replies; 7+ messages in thread
From: Eric Wong @ 2020-12-21 7:51 UTC (permalink / raw)
To: meta
Unlike inboxdir, the canonical-ness of -extindex paths is not
relevant at the moment, and may never be relevant at all. So
don't mislead others into thinking these paths being
canonicalized matters.
---
lib/PublicInbox/ExtSearch.pm | 2 --
lib/PublicInbox/ExtSearchIdx.pm | 2 --
2 files changed, 4 deletions(-)
diff --git a/lib/PublicInbox/ExtSearch.pm b/lib/PublicInbox/ExtSearch.pm
index 2a560935..a2b97798 100644
--- a/lib/PublicInbox/ExtSearch.pm
+++ b/lib/PublicInbox/ExtSearch.pm
@@ -9,7 +9,6 @@ use strict;
use v5.10.1;
use PublicInbox::Over;
use PublicInbox::Inbox;
-use File::Spec ();
use PublicInbox::MiscSearch;
use DBI qw(:sql_types); # SQL_BLOB
@@ -18,7 +17,6 @@ use parent qw(PublicInbox::Search);
sub new {
my (undef, $topdir) = @_;
- $topdir = File::Spec->canonpath($topdir);
bless {
topdir => $topdir,
# xpfx => 'ei15'
diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm
index c4b429df..f04e0443 100644
--- a/lib/PublicInbox/ExtSearchIdx.pm
+++ b/lib/PublicInbox/ExtSearchIdx.pm
@@ -30,13 +30,11 @@ use PublicInbox::V2Writable;
use PublicInbox::InboxWritable;
use PublicInbox::ContentHash qw(content_hash);
use PublicInbox::Eml;
-use File::Spec;
use PublicInbox::DS qw(now);
use DBI qw(:sql_types); # SQL_BLOB
sub new {
my (undef, $dir, $opt) = @_;
- $dir = File::Spec->canonpath($dir);
my $l = $opt->{indexlevel} // 'full';
$l !~ $PublicInbox::SearchIdx::INDEXLEVELS and
die "invalid indexlevel=$l\n";
^ permalink raw reply related [flat|nested] 7+ messages in thread