* [PATCH 01/13] content_id: do not take Message-Id into account
2018-03-22 9:40 [PATCH 00/13] reindexing, feeds, date fixes Eric Wong (Contractor, The Linux Foundation)
@ 2018-03-22 9:40 ` Eric Wong (Contractor, The Linux Foundation)
2018-03-22 9:40 ` [PATCH 02/13] introduce InboxWritable class Eric Wong (Contractor, The Linux Foundation)
` (11 subsequent siblings)
12 siblings, 0 replies; 15+ messages in thread
From: Eric Wong (Contractor, The Linux Foundation) @ 2018-03-22 9:40 UTC (permalink / raw)
To: meta
If we need to use content_id, we've already lost hope
in relying on Message-Id as a differentiator. This
prevents duplicates from showing up repeatedly with
-watch when Message-Ids are reused and we generate
new Message-Ids to disambiguate.
---
lib/PublicInbox/ContentId.pm | 3 ++-
t/v2writable.t | 10 +++++++---
2 files changed, 9 insertions(+), 4 deletions(-)
diff --git a/lib/PublicInbox/ContentId.pm b/lib/PublicInbox/ContentId.pm
index 9082b76..279eec0 100644
--- a/lib/PublicInbox/ContentId.pm
+++ b/lib/PublicInbox/ContentId.pm
@@ -21,7 +21,8 @@ sub content_digest ($) {
# in SearchIdx, so treat them the same for this:
my %seen;
foreach my $mid (@{mids($hdr)}) {
- $dig->add('mid: '.$mid);
+ # do NOT consider the Message-ID as part of the content_id
+ # if we got here, we've already got Message-ID reuse
$seen{$mid} = 1;
}
foreach my $mid (@{references($hdr)}) {
diff --git a/t/v2writable.t b/t/v2writable.t
index 85b48d2..6cabf0d 100644
--- a/t/v2writable.t
+++ b/t/v2writable.t
@@ -61,11 +61,15 @@ if ('ensure git configs are correct') {
@warn = ();
$mime->header_set('Message-Id', '<a-mid@b>', '<c@d>');
- ok($im->add($mime), 'secondary MID used');
+ is($im->add($mime), undef, 'secondary MID ignored if first matches');
+ my $sec = PublicInbox::MIME->new($mime->as_string);
+ $sec->header_set('Date');
+ $sec->header_set('Message-Id', '<a-mid@b>', '<c@d>');
+ ok($im->add($sec), 'secondary MID used if data is different');
like(join(' ', @warn), qr/mismatched/, 'warned about mismatch');
like(join(' ', @warn), qr/alternative/, 'warned about alternative');
is_deeply([ '<a-mid@b>', '<c@d>' ],
- [ $mime->header_obj->header_raw('Message-Id') ],
+ [ $sec->header_obj->header_raw('Message-Id') ],
'no new Message-Id added');
my $sane_mid = qr/\A<[\w\-]+\@localhost>\z/;
@@ -85,7 +89,7 @@ if ('ensure git configs are correct') {
my $gen = PublicInbox::Import::digest2mid(content_digest($mime));
unlike($gen, qr![\+/=]!, 'no URL-unfriendly chars in Message-Id');
my $fake = PublicInbox::MIME->new($mime->as_string);
- $fake->header_set('Message-Id', $gen);
+ $fake->header_set('Message-Id', "<$gen>");
ok($im->add($fake), 'fake added easily');
is_deeply(\@warn, [], 'no warnings from a faker');
ok($im->add($mime), 'random MID made');
--
EW
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH 02/13] introduce InboxWritable class
2018-03-22 9:40 [PATCH 00/13] reindexing, feeds, date fixes Eric Wong (Contractor, The Linux Foundation)
2018-03-22 9:40 ` [PATCH 01/13] content_id: do not take Message-Id into account Eric Wong (Contractor, The Linux Foundation)
@ 2018-03-22 9:40 ` Eric Wong (Contractor, The Linux Foundation)
2018-03-22 9:40 ` [PATCH 03/13] import: discard all the same headers as MDA Eric Wong (Contractor, The Linux Foundation)
` (10 subsequent siblings)
12 siblings, 0 replies; 15+ messages in thread
From: Eric Wong (Contractor, The Linux Foundation) @ 2018-03-22 9:40 UTC (permalink / raw)
To: meta
This code will be shared with future mass-import tools.
---
MANIFEST | 1 +
lib/PublicInbox/InboxWritable.pm | 57 ++++++++++++++++++++++++++++++++++++++++
lib/PublicInbox/WatchMaildir.pm | 50 +++++++----------------------------
3 files changed, 67 insertions(+), 41 deletions(-)
create mode 100644 lib/PublicInbox/InboxWritable.pm
diff --git a/MANIFEST b/MANIFEST
index 4346cd9..567148a 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -66,6 +66,7 @@ lib/PublicInbox/HTTPD/Async.pm
lib/PublicInbox/Hval.pm
lib/PublicInbox/Import.pm
lib/PublicInbox/Inbox.pm
+lib/PublicInbox/InboxWritable.pm
lib/PublicInbox/Linkify.pm
lib/PublicInbox/Listener.pm
lib/PublicInbox/Lock.pm
diff --git a/lib/PublicInbox/InboxWritable.pm b/lib/PublicInbox/InboxWritable.pm
new file mode 100644
index 0000000..0a976ea
--- /dev/null
+++ b/lib/PublicInbox/InboxWritable.pm
@@ -0,0 +1,57 @@
+# Copyright (C) 2018 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+
+# Extends read-only Inbox for writing
+package PublicInbox::InboxWritable;
+use strict;
+use warnings;
+use base qw(PublicInbox::Inbox);
+use PublicInbox::Import;
+
+sub new {
+ my ($class, $ibx) = @_;
+ bless $ibx, $class;
+}
+
+sub importer {
+ my ($self, $parallel) = @_;
+ $self->{-importer} ||= eval {
+ my $v = $self->{version} || 1;
+ if ($v == 2) {
+ eval { require PublicInbox::V2Writable };
+ die "v2 not supported: $@\n" if $@;
+ my $v2w = PublicInbox::V2Writable->new($self);
+ $v2w->{parallel} = $parallel;
+ $v2w;
+ } elsif ($v == 1) {
+ my $git = $self->git;
+ my $name = $self->{name};
+ my $addr = $self->{-primary_address};
+ PublicInbox::Import->new($git, $name, $addr, $self);
+ } else {
+ die "unsupported inbox version: $v\n";
+ }
+ }
+}
+
+sub filter {
+ my ($self) = @_;
+ my $f = $self->{filter};
+ if ($f && $f =~ /::/) {
+ my @args = (-inbox => $self);
+ # basic line splitting, only
+ # Perhaps we can have proper quote splitting one day...
+ ($f, @args) = split(/\s+/, $f) if $f =~ /\s+/;
+
+ eval "require $f";
+ if ($@) {
+ warn $@;
+ } else {
+ # e.g: PublicInbox::Filter::Vger->new(@args)
+ return $f->new(@args);
+ }
+ }
+ undef;
+}
+
+1;
diff --git a/lib/PublicInbox/WatchMaildir.pm b/lib/PublicInbox/WatchMaildir.pm
index e28e602..b165a60 100644
--- a/lib/PublicInbox/WatchMaildir.pm
+++ b/lib/PublicInbox/WatchMaildir.pm
@@ -11,6 +11,7 @@ use PublicInbox::Git;
use PublicInbox::Import;
use PublicInbox::MDA;
use PublicInbox::Spawn qw(spawn);
+use PublicInbox::InboxWritable;
use File::Temp qw//;
sub new {
@@ -50,6 +51,10 @@ sub new {
$spamcheck = undef;
}
}
+
+ # need to make all inboxes writable for spam removal:
+ $config->each_inbox(sub { PublicInbox::InboxWritable->new($_[0]) });
+
foreach $k (keys %$config) {
$k =~ /\Apublicinbox\.([^\.]+)\.watch\z/ or next;
my $name = $1;
@@ -118,7 +123,7 @@ sub _remove_spam {
eval {
my $im = _importer_for($self, $ibx);
$im->remove($mime, 'spam');
- if (my $scrub = _scrubber_for($ibx)) {
+ if (my $scrub = $ibx->filter) {
my $scrubbed = $scrub->scrub($mime) or return;
$scrubbed == 100 and return;
$im->remove($scrubbed, 'spam');
@@ -160,7 +165,7 @@ sub _try_path {
my $v = $mime->header_obj->header_raw($wm->[0]);
return unless ($v && $v =~ $wm->[1]);
}
- if (my $scrub = _scrubber_for($inbox)) {
+ if (my $scrub = $inbox->filter) {
my $ret = $scrub->scrub($mime) or return;
$ret == 100 and return;
$mime = $ret;
@@ -253,25 +258,8 @@ sub _path_to_mime {
}
sub _importer_for {
- my ($self, $inbox) = @_;
- my $im = $inbox->{-import} ||= eval {
- my $v = $inbox->{version} || 1;
- if ($v == 2) {
- eval { require PublicInbox::V2Writable };
- die "v2 not supported: $@\n" if $@;
- my $v2w = PublicInbox::V2Writable->new($inbox);
- $v2w->{parallel} = 0;
- $v2w;
- } elsif ($v == 1) {
- my $git = $inbox->git;
- my $name = $inbox->{name};
- my $addr = $inbox->{-primary_address};
- PublicInbox::Import->new($git, $name, $addr, $inbox);
- } else {
- die "unsupported inbox version: $v\n";
- }
- };
-
+ my ($self, $ibx) = @_;
+ my $im = $ibx->importer(0);
my $importers = $self->{importers};
if (scalar(keys(%$importers)) > 2) {
delete $importers->{"$im"};
@@ -281,26 +269,6 @@ sub _importer_for {
$importers->{"$im"} = $im;
}
-sub _scrubber_for {
- my ($inbox) = @_;
- my $f = $inbox->{filter};
- if ($f && $f =~ /::/) {
- my @args = (-inbox => $inbox);
- # basic line splitting, only
- # Perhaps we can have proper quote splitting one day...
- ($f, @args) = split(/\s+/, $f) if $f =~ /\s+/;
-
- eval "require $f";
- if ($@) {
- warn $@;
- } else {
- # e.g: PublicInbox::Filter::Vger->new(@args)
- return $f->new(@args);
- }
- }
- undef;
-}
-
sub _spamcheck_cb {
my ($sc) = @_;
sub {
--
EW
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH 03/13] import: discard all the same headers as MDA
2018-03-22 9:40 [PATCH 00/13] reindexing, feeds, date fixes Eric Wong (Contractor, The Linux Foundation)
2018-03-22 9:40 ` [PATCH 01/13] content_id: do not take Message-Id into account Eric Wong (Contractor, The Linux Foundation)
2018-03-22 9:40 ` [PATCH 02/13] introduce InboxWritable class Eric Wong (Contractor, The Linux Foundation)
@ 2018-03-22 9:40 ` Eric Wong (Contractor, The Linux Foundation)
2018-03-22 9:40 ` [PATCH 04/13] InboxWritable: add mbox/maildir parsing + import logic Eric Wong (Contractor, The Linux Foundation)
` (9 subsequent siblings)
12 siblings, 0 replies; 15+ messages in thread
From: Eric Wong (Contractor, The Linux Foundation) @ 2018-03-22 9:40 UTC (permalink / raw)
To: meta
Reduce the places where we have duplicate logic for discarding
unwanted headers.
---
lib/PublicInbox/Import.pm | 2 ++
lib/PublicInbox/MDA.pm | 2 --
lib/PublicInbox/WatchMaildir.pm | 1 -
3 files changed, 2 insertions(+), 3 deletions(-)
diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm
index fc740fa..e50f115 100644
--- a/lib/PublicInbox/Import.pm
+++ b/lib/PublicInbox/Import.pm
@@ -13,6 +13,7 @@ use PublicInbox::MID qw(mids mid_mime mid2path);
use PublicInbox::Address;
use PublicInbox::MsgTime qw(msg_timestamp);
use PublicInbox::ContentId qw(content_digest);
+use PublicInbox::MDA;
sub new {
my ($class, $git, $name, $email, $ibx) = @_;
@@ -319,6 +320,7 @@ sub add {
# kill potentially confusing/misleading headers
$mime->header_set($_) for qw(bytes lines content-length status);
+ $mime->header_set($_) for @PublicInbox::MDA::BAD_HEADERS;
# spam check:
if ($check_cb) {
diff --git a/lib/PublicInbox/MDA.pm b/lib/PublicInbox/MDA.pm
index d5af8f9..637404e 100644
--- a/lib/PublicInbox/MDA.pm
+++ b/lib/PublicInbox/MDA.pm
@@ -81,8 +81,6 @@ sub set_list_headers {
$pa =~ tr/@/./; # RFC2919
$simple->header_set("List-Id", "<$pa>");
}
-
- $simple->header_set($_) foreach @BAD_HEADERS;
}
1;
diff --git a/lib/PublicInbox/WatchMaildir.pm b/lib/PublicInbox/WatchMaildir.pm
index b165a60..d3ca2a1 100644
--- a/lib/PublicInbox/WatchMaildir.pm
+++ b/lib/PublicInbox/WatchMaildir.pm
@@ -159,7 +159,6 @@ sub _try_path {
}
my $im = _importer_for($self, $inbox);
my $mime = _path_to_mime($path) or return;
- $mime->header_set($_) foreach @PublicInbox::MDA::BAD_HEADERS;
my $wm = $inbox->{-watchheader};
if ($wm) {
my $v = $mime->header_obj->header_raw($wm->[0]);
--
EW
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH 04/13] InboxWritable: add mbox/maildir parsing + import logic
2018-03-22 9:40 [PATCH 00/13] reindexing, feeds, date fixes Eric Wong (Contractor, The Linux Foundation)
` (2 preceding siblings ...)
2018-03-22 9:40 ` [PATCH 03/13] import: discard all the same headers as MDA Eric Wong (Contractor, The Linux Foundation)
@ 2018-03-22 9:40 ` Eric Wong (Contractor, The Linux Foundation)
2018-03-22 9:40 ` [PATCH 05/13] use both Date: and Received: times Eric Wong (Contractor, The Linux Foundation)
` (8 subsequent siblings)
12 siblings, 0 replies; 15+ messages in thread
From: Eric Wong (Contractor, The Linux Foundation) @ 2018-03-22 9:40 UTC (permalink / raw)
To: meta
This will make it easier to as well as supporting future
Filter API users. It allows simplifying our ad-hoc
import_vger_from_mbox script.
---
lib/PublicInbox/InboxWritable.pm | 103 +++++++++++++++++++++++++++++++++++++++
lib/PublicInbox/V2Writable.pm | 8 +++
lib/PublicInbox/WatchMaildir.pm | 20 +++-----
script/public-inbox-init | 6 +--
scripts/import_vger_from_mbox | 51 +++----------------
5 files changed, 126 insertions(+), 62 deletions(-)
diff --git a/lib/PublicInbox/InboxWritable.pm b/lib/PublicInbox/InboxWritable.pm
index 0a976ea..82834f0 100644
--- a/lib/PublicInbox/InboxWritable.pm
+++ b/lib/PublicInbox/InboxWritable.pm
@@ -7,6 +7,8 @@ use strict;
use warnings;
use base qw(PublicInbox::Inbox);
use PublicInbox::Import;
+use PublicInbox::Filter::Base;
+*REJECT = *PublicInbox::Filter::Base::REJECT;
sub new {
my ($class, $ibx) = @_;
@@ -54,4 +56,105 @@ sub filter {
undef;
}
+sub is_maildir_basename ($) {
+ my ($bn) = @_;
+ return 0 if $bn !~ /\A[a-zA-Z0-9][\-\w:,=\.]+\z/;
+ if ($bn =~ /:2,([A-Z]+)\z/i) {
+ my $flags = $1;
+ return 0 if $flags =~ /[DT]/; # no [D]rafts or [T]rashed mail
+ }
+ 1;
+}
+
+sub is_maildir_path ($) {
+ my ($path) = @_;
+ my @p = split(m!/+!, $path);
+ (is_maildir_basename($p[-1]) && -f $path) ? 1 : 0;
+}
+
+sub maildir_path_load ($) {
+ my ($path) = @_;
+ if (open my $fh, '<', $path) {
+ local $/;
+ my $str = <$fh>;
+ $str or return;
+ return PublicInbox::MIME->new(\$str);
+ } elsif ($!{ENOENT}) {
+ # common with Maildir
+ return;
+ } else {
+ warn "failed to open $path: $!\n";
+ return;
+ }
+}
+
+sub import_maildir {
+ my ($self, $dir) = @_;
+ my $im = $self->importer(1);
+ my $filter = $self->filter;
+ foreach my $sub (qw(cur new tmp)) {
+ -d "$dir/$sub" or die "$dir is not a Maildir (missing $sub)\n";
+ }
+ foreach my $sub (qw(cur new)) {
+ opendir my $dh, "$dir/$sub" or die "opendir $dir/$sub: $!\n";
+ while (defined(my $fn = readdir($dh))) {
+ next unless is_maildir_basename($fn);
+ my $mime = maildir_file_load("$dir/$fn") or next;
+ if ($filter) {
+ my $ret = $filter->scrub($mime) or return;
+ return if $ret == REJECT();
+ $mime = $ret;
+ }
+ $im->add($mime);
+ }
+ }
+ $im->done;
+}
+
+# asctime: From example@example.com Fri Jun 23 02:56:55 2000
+my $from_strict = qr/^From \S+ +\S+ \S+ +\S+ [^:]+:[^:]+:[^:]+ [^:]+/;
+
+sub mb_add ($$$$) {
+ my ($im, $variant, $filter, $msg) = @_;
+ $$msg =~ s/(\r?\n)+\z/$1/s;
+ my $mime = PublicInbox::MIME->new($msg);
+ if ($variant eq 'mboxrd') {
+ $$msg =~ s/^>(>*From )/$1/sm;
+ } elsif ($variant eq 'mboxo') {
+ $$msg =~ s/^>From /From /sm;
+ }
+ if ($filter) {
+ my $ret = $filter->scrub($mime) or return;
+ return if $ret == REJECT();
+ $mime = $ret;
+ }
+ $im->add($mime)
+}
+
+sub import_mbox {
+ my ($self, $fh, $variant) = @_;
+ if ($variant !~ /\A(?:mboxrd|mboxo)\z/) {
+ die "variant must be 'mboxrd' or 'mboxo'\n";
+ }
+ my $im = $self->importer(1);
+ my $prev = undef;
+ my $msg = '';
+ my $filter = $self->filter;
+ while (defined(my $l = <$fh>)) {
+ if ($l =~ /$from_strict/o) {
+ if (!defined($prev) || $prev =~ /^\r?$/) {
+ mb_add($im, $variant, $filter, \$msg) if $msg;
+ $msg = '';
+ $prev = $l;
+ next;
+ }
+ warn "W[$.] $l\n";
+ }
+ $prev = $l;
+ $msg .= $l;
+ }
+ mb_add($im, $variant, $filter, \$msg) if $msg;
+ $im->done;
+}
+
1;
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index dc96b87..46bfebb 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -65,6 +65,14 @@ sub new {
bless $self, $class;
}
+sub init_inbox {
+ my ($self, $parallel) = @_;
+ $self->{parallel} = $parallel;
+ $self->idx_init;
+ $self->git_init(0);
+ $self->done;
+}
+
# returns undef on duplicate or spam
# mimics Import::add and wraps it for v2
sub add {
diff --git a/lib/PublicInbox/WatchMaildir.pm b/lib/PublicInbox/WatchMaildir.pm
index d3ca2a1..7ee29da 100644
--- a/lib/PublicInbox/WatchMaildir.pm
+++ b/lib/PublicInbox/WatchMaildir.pm
@@ -13,6 +13,8 @@ use PublicInbox::MDA;
use PublicInbox::Spawn qw(spawn);
use PublicInbox::InboxWritable;
use File::Temp qw//;
+use PublicInbox::Filter::Base;
+*REJECT = *PublicInbox::Filter::Base::REJECT;
sub new {
my ($class, $config) = @_;
@@ -125,7 +127,7 @@ sub _remove_spam {
$im->remove($mime, 'spam');
if (my $scrub = $ibx->filter) {
my $scrubbed = $scrub->scrub($mime) or return;
- $scrubbed == 100 and return;
+ $scrubbed == REJECT() and return;
$im->remove($scrubbed, 'spam');
}
};
@@ -138,13 +140,7 @@ sub _remove_spam {
sub _try_path {
my ($self, $path) = @_;
- my @p = split(m!/+!, $path);
- return if $p[-1] !~ /\A[a-zA-Z0-9][\-\w:,=\.]+\z/;
- if ($p[-1] =~ /:2,([A-Z]+)\z/i) {
- my $flags = $1;
- return if $flags =~ /[DT]/; # no [D]rafts or [T]rashed mail
- }
- return unless -f $path;
+ return unless PublicInbox::InboxWritable::is_maildir_path($path);
if ($path !~ $self->{mdre}) {
warn "unrecognized path: $path\n";
return;
@@ -166,7 +162,7 @@ sub _try_path {
}
if (my $scrub = $inbox->filter) {
my $ret = $scrub->scrub($mime) or return;
- $ret == 100 and return;
+ $ret == REJECT() and return;
$mime = $ret;
}
@@ -258,14 +254,14 @@ sub _path_to_mime {
sub _importer_for {
my ($self, $ibx) = @_;
- my $im = $ibx->importer(0);
my $importers = $self->{importers};
+ my $im = $importers->{"$ibx"} ||= $ibx->importer(0);
if (scalar(keys(%$importers)) > 2) {
- delete $importers->{"$im"};
+ delete $importers->{"$ibx"};
_done_for_now($self);
}
- $importers->{"$im"} = $im;
+ $importers->{"$ibx"} = $im;
}
sub _spamcheck_cb {
diff --git a/script/public-inbox-init b/script/public-inbox-init
index fdad136..86cf8b5 100755
--- a/script/public-inbox-init
+++ b/script/public-inbox-init
@@ -82,11 +82,7 @@ if ($version >= 2) {
-primary_address => $address[0],
};
$ibx = PublicInbox::Inbox->new($ibx);
- my $v2w = PublicInbox::V2Writable->new($ibx, 1);
- $v2w->{parallel} = 0;
- $v2w->idx_init;
- $v2w->git_init(0);
- $v2w->done;
+ PublicInbox::V2Writable->new($ibx, 1)->init_inbox(0);
} elsif ($version == 1) {
x(qw(git init -q --bare), $mainrepo);
diff --git a/scripts/import_vger_from_mbox b/scripts/import_vger_from_mbox
index 1edb987..369dac7 100644
--- a/scripts/import_vger_from_mbox
+++ b/scripts/import_vger_from_mbox
@@ -5,7 +5,7 @@ use strict;
use warnings;
use Getopt::Long qw/:config gnu_getopt no_ignore_case auto_abbrev/;
use PublicInbox::MIME;
-use PublicInbox::Inbox;
+use PublicInbox::InboxWritable;
use PublicInbox::V2Writable;
use PublicInbox::Import;
use PublicInbox::MDA;
@@ -30,55 +30,16 @@ my $ibx = {
name => $name,
version => $version,
-primary_address => $email,
+ filter => 'PublicInbox::Filter::Vger',
};
$ibx = PublicInbox::Inbox->new($ibx);
-my $im;
unless ($dry_run) {
if ($version >= 2) {
- $im = PublicInbox::V2Writable->new($ibx, 1);
+ PublicInbox::V2Writable->new($ibx, 1)->init_inbox(0);
} else {
- system(qw(git init --bare -q), $mainrepo);
- my $git = PublicInbox::Git->new($mainrepo);
- $im = PublicInbox::Import->new($git, $name, $email, $ibx);
+ system(qw(git init --bare -q), $mainrepo) == 0 or die;
}
}
+$ibx = PublicInbox::InboxWritable->new($ibx);
binmode STDIN;
-my $msg = '';
-use PublicInbox::Filter::Vger;
-my $vger = PublicInbox::Filter::Vger->new;
-
-sub do_add ($$) {
- my ($im, $msg) = @_;
- $$msg =~ s/(\r?\n)+\z/$1/s;
- my $mime = PublicInbox::MIME->new($msg);
- if ($variant eq 'mboxrd') {
- $$msg =~ s/^>(>*From )/$1/sm;
- } elsif ($variant eq 'mboxo') {
- $$msg =~ s/^>From /From /sm;
- }
- $mime = $vger->scrub($mime);
- return unless $im;
- $mime->header_set($_) foreach @PublicInbox::MDA::BAD_HEADERS;
- $im->add($mime) or
- warn "duplicate: ",
- $mime->header_obj->header_raw('Message-ID'), "\n";
-}
-
-# asctime: From example@example.com Fri Jun 23 02:56:55 2000
-my $from_strict = qr/^From \S+ +\S+ \S+ +\S+ [^:]+:[^:]+:[^:]+ [^:]+/;
-my $prev = undef;
-while (defined(my $l = <STDIN>)) {
- if ($l =~ /$from_strict/o) {
- if (!defined($prev) || $prev =~ /^\r?$/) {
- do_add($im, \$msg) if $msg;
- $msg = '';
- $prev = $l;
- next;
- }
- warn "W[$.] $l\n";
- }
- $prev = $l;
- $msg .= $l;
-}
-do_add($im, \$msg) if $msg;
-$im->done if $im;
+$ibx->import_mbox(\*STDIN, $variant);
--
EW
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH 05/13] use both Date: and Received: times
2018-03-22 9:40 [PATCH 00/13] reindexing, feeds, date fixes Eric Wong (Contractor, The Linux Foundation)
` (3 preceding siblings ...)
2018-03-22 9:40 ` [PATCH 04/13] InboxWritable: add mbox/maildir parsing + import logic Eric Wong (Contractor, The Linux Foundation)
@ 2018-03-22 9:40 ` Eric Wong (Contractor, The Linux Foundation)
2018-03-22 9:40 ` [PATCH 06/13] msgmap: add tmp_clone to create an anonymous copy Eric Wong (Contractor, The Linux Foundation)
` (7 subsequent siblings)
12 siblings, 0 replies; 15+ messages in thread
From: Eric Wong (Contractor, The Linux Foundation) @ 2018-03-22 9:40 UTC (permalink / raw)
To: meta
We want to rely on Date: to sort messages within individual
threads since it keeps messages from git-send-email(1) sorted.
However, since developers occasionally have the clock set
wrong on their machines, sort overall messages by the newest
date in a Received: header so the landing page isn't forever
polluted by messages from the future.
This also gives us determinism for commit times in most cases,
as we'll used the Received: timestamp there, as well.
---
lib/PublicInbox/Import.pm | 17 ++++----
lib/PublicInbox/MsgTime.pm | 80 ++++++++++++++++++++++++------------
lib/PublicInbox/Search.pm | 5 ++-
lib/PublicInbox/SearchIdx.pm | 6 ++-
lib/PublicInbox/SearchIdxSkeleton.pm | 4 +-
lib/PublicInbox/SearchMsg.pm | 26 ++++++------
lib/PublicInbox/SearchView.pm | 6 +--
lib/PublicInbox/View.pm | 30 +++++++-------
8 files changed, 103 insertions(+), 71 deletions(-)
diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm
index e50f115..d69934b 100644
--- a/lib/PublicInbox/Import.pm
+++ b/lib/PublicInbox/Import.pm
@@ -11,7 +11,7 @@ use base qw(PublicInbox::Lock);
use PublicInbox::Spawn qw(spawn);
use PublicInbox::MID qw(mids mid_mime mid2path);
use PublicInbox::Address;
-use PublicInbox::MsgTime qw(msg_timestamp);
+use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp);
use PublicInbox::ContentId qw(content_digest);
use PublicInbox::MDA;
@@ -244,9 +244,8 @@ sub remove {
(($self->{tip} = ":$commit"), $cur);
}
-sub parse_date ($) {
- my ($mime) = @_;
- my ($ts, $zone) = msg_timestamp($mime->header_obj);
+sub git_timestamp {
+ my ($ts, $zone) = @_;
$ts = 0 if $ts < 0; # git uses unsigned times
"$ts $zone";
}
@@ -295,7 +294,11 @@ sub add {
my ($self, $mime, $check_cb) = @_; # mime = Email::MIME
my ($name, $email) = extract_author_info($mime);
- my $date_raw = parse_date($mime);
+ my $hdr = $mime->header_obj;
+ my @at = msg_datestamp($hdr);
+ my @ct = msg_timestamp($hdr);
+ my $author_time_raw = git_timestamp(@at);
+ my $commit_time_raw = git_timestamp(@ct);
my $subject = $mime->header('Subject');
$subject = '(no subject)' unless defined $subject;
my $path_type = $self->{path_type};
@@ -349,8 +352,8 @@ sub add {
utf8::encode($subject);
print $w "commit $ref\nmark :$commit\n",
- "author $name <$email> $date_raw\n",
- "committer $self->{ident} ", now_raw(), "\n" or wfail;
+ "author $name <$email> $author_time_raw\n",
+ "committer $self->{ident} $commit_time_raw\n" or wfail;
print $w "data ", (length($subject) + 1), "\n",
$subject, "\n\n" or wfail;
if ($tip ne '') {
diff --git a/lib/PublicInbox/MsgTime.pm b/lib/PublicInbox/MsgTime.pm
index 87664f4..4295e87 100644
--- a/lib/PublicInbox/MsgTime.pm
+++ b/lib/PublicInbox/MsgTime.pm
@@ -4,48 +4,76 @@ package PublicInbox::MsgTime;
use strict;
use warnings;
use base qw(Exporter);
-our @EXPORT_OK = qw(msg_timestamp);
+our @EXPORT_OK = qw(msg_timestamp msg_datestamp);
use Date::Parse qw(str2time);
use Time::Zone qw(tz_offset);
-sub msg_timestamp ($) {
+sub zone_clamp ($) {
+ my ($zone) = @_;
+ $zone ||= '+0000';
+ # "-1200" is the furthest westermost zone offset,
+ # but git fast-import is liberal so we use "-1400"
+ if ($zone >= 1400 || $zone <= -1400) {
+ warn "bogus TZ offset: $zone, ignoring and assuming +0000\n";
+ $zone = '+0000';
+ }
+ $zone;
+}
+
+sub time_response ($) {
+ my ($ret) = @_;
+ wantarray ? @$ret : $ret->[0];
+}
+
+sub msg_received_at ($) {
my ($hdr) = @_; # Email::MIME::Header
- my ($ts, $zone);
- my $mid;
my @recvd = $hdr->header_raw('Received');
+ my ($ts, $zone);
foreach my $r (@recvd) {
$zone = undef;
$r =~ /\s*(\d+\s+[[:alpha:]]+\s+\d{2,4}\s+
\d+\D\d+(?:\D\d+)\s+([\+\-]\d+))/sx or next;
$zone = $2;
$ts = eval { str2time($1) } and last;
- $mid ||= $hdr->header_raw('Message-ID');
+ my $mid = $hdr->header_raw('Message-ID');
warn "no date in $mid Received: $r\n";
}
- unless (defined $ts) {
- my @date = $hdr->header_raw('Date');
- foreach my $d (@date) {
- $zone = undef;
- $ts = eval { str2time($d) };
- if ($@) {
- $mid ||= $hdr->header_raw('Message-ID');
- warn "bad Date: $d in $mid: $@\n";
- } elsif ($d =~ /\s+([\+\-]\d+)\s*\z/) {
- $zone = $1;
- }
+ defined $ts ? [ $ts, zone_clamp($zone) ] : undef;
+}
+
+sub msg_date_only ($) {
+ my ($hdr) = @_; # Email::MIME::Header
+ my @date = $hdr->header_raw('Date');
+ my ($ts, $zone);
+ foreach my $d (@date) {
+ $zone = undef;
+ $ts = eval { str2time($d) };
+ if ($@) {
+ my $mid = $hdr->header_raw('Message-ID');
+ warn "bad Date: $d in $mid: $@\n";
+ } elsif ($d =~ /\s+([\+\-]\d+)\s*\z/) {
+ $zone = $1;
}
}
- $ts = time unless defined $ts;
- return $ts unless wantarray;
+ defined $ts ? [ $ts, zone_clamp($zone) ] : undef;
+}
- $zone ||= '+0000';
- # "-1200" is the furthest westermost zone offset,
- # but git fast-import is liberal so we use "-1400"
- if ($zone >= 1400 || $zone <= -1400) {
- warn "bogus TZ offset: $zone, ignoring and assuming +0000\n";
- $zone = '+0000';
- }
- ($ts, $zone);
+# Favors Received header for sorting globally
+sub msg_timestamp ($) {
+ my ($hdr) = @_; # Email::MIME::Header
+ my $ret;
+ $ret = msg_received_at($hdr) and return time_response($ret);
+ $ret = msg_date_only($hdr) and return time_response($ret);
+ wantarray ? (time, '+0000') : time;
+}
+
+# Favors the Date: header for display and sorting within a thread
+sub msg_datestamp ($) {
+ my ($hdr) = @_; # Email::MIME::Header
+ my $ret;
+ $ret = msg_date_only($hdr) and return time_response($ret);
+ $ret = msg_received_at($hdr) and return time_response($ret);
+ wantarray ? (time, '+0000') : time;
}
1;
diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index 7e7c989..f08b987 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -8,11 +8,12 @@ use strict;
use warnings;
# values for searching
-use constant TS => 0; # timestamp
+use constant DS => 0; # Date: header in Unix time
use constant NUM => 1; # NNTP article number
use constant BYTES => 2; # :bytes as defined in RFC 3977
use constant LINES => 3; # :lines as defined in RFC 3977
-use constant YYYYMMDD => 4; # for searching in the WWW UI
+use constant TS => 4; # Received: header in Unix time
+use constant YYYYMMDD => 5; # for searching in the WWW UI
use Search::Xapian qw/:standard/;
use PublicInbox::SearchMsg;
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 3d80b00..ef723a4 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -134,7 +134,9 @@ sub add_values ($$) {
my $lines = $values->[PublicInbox::Search::LINES];
add_val($doc, PublicInbox::Search::LINES, $lines);
- my $yyyymmdd = strftime('%Y%m%d', gmtime($ts));
+ my $ds = $values->[PublicInbox::Search::DS];
+ add_val($doc, PublicInbox::Search::DS, $ds);
+ my $yyyymmdd = strftime('%Y%m%d', gmtime($ds));
add_val($doc, PublicInbox::Search::YYYYMMDD, $yyyymmdd);
}
@@ -298,7 +300,7 @@ sub add_message {
}
my $lines = $mime->body_raw =~ tr!\n!\n!;
- my @values = ($smsg->ts, $num, $bytes, $lines);
+ my @values = ($smsg->ds, $num, $bytes, $lines, $smsg->ts);
add_values($doc, \@values);
my $tg = $self->term_generator;
diff --git a/lib/PublicInbox/SearchIdxSkeleton.pm b/lib/PublicInbox/SearchIdxSkeleton.pm
index ba43969..78a1730 100644
--- a/lib/PublicInbox/SearchIdxSkeleton.pm
+++ b/lib/PublicInbox/SearchIdxSkeleton.pm
@@ -121,18 +121,16 @@ sub remote_remove {
die $err if $err;
}
-# values: [ TS, NUM, BYTES, LINES, MID, XPATH, doc_data ]
+# values: [ DS, NUM, BYTES, LINES, TS, MIDS, XPATH, doc_data ]
sub index_skeleton_real ($$) {
my ($self, $values) = @_;
my $doc_data = pop @$values;
my $xpath = pop @$values;
my $mids = pop @$values;
- my $ts = $values->[PublicInbox::Search::TS];
my $smsg = PublicInbox::SearchMsg->new(undef);
my $doc = $smsg->{doc};
PublicInbox::SearchIdx::add_values($doc, $values);
$doc->set_data($doc_data);
- $smsg->{ts} = $ts;
$smsg->load_from_data($doc_data);
my $num = $values->[PublicInbox::Search::NUM];
my @refs = ($smsg->references =~ /<([^>]+)>/g);
diff --git a/lib/PublicInbox/SearchMsg.pm b/lib/PublicInbox/SearchMsg.pm
index a1cd0c2..de1fd13 100644
--- a/lib/PublicInbox/SearchMsg.pm
+++ b/lib/PublicInbox/SearchMsg.pm
@@ -9,7 +9,7 @@ use warnings;
use Search::Xapian;
use PublicInbox::MID qw/mid_clean mid_mime/;
use PublicInbox::Address;
-use PublicInbox::MsgTime qw(msg_timestamp);
+use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp);
sub new {
my ($class, $mime) = @_;
@@ -46,6 +46,7 @@ sub load_expand {
my $doc = $self->{doc};
my $data = $doc->get_data or return;
$self->{ts} = get_val($doc, &PublicInbox::Search::TS);
+ $self->{ds} = get_val($doc, &PublicInbox::Search::DS);
utf8::decode($data);
load_from_data($self, $data);
$self;
@@ -53,12 +54,8 @@ sub load_expand {
sub load_doc {
my ($class, $doc) = @_;
- my $data = $doc->get_data or return;
- my $ts = get_val($doc, &PublicInbox::Search::TS);
- utf8::decode($data);
- my $self = bless { doc => $doc, ts => $ts }, $class;
- load_from_data($self, $data);
- $self
+ my $self = bless { doc => $doc }, $class;
+ $self->load_expand;
}
# :bytes and :lines metadata in RFC 3977
@@ -91,9 +88,9 @@ my @MoY = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);
sub date ($) {
my ($self) = @_;
- my $ts = $self->{ts};
- return unless defined $ts;
- my ($sec, $min, $hour, $mday, $mon, $year, $wday) = gmtime($ts);
+ my $ds = $self->{ds};
+ return unless defined $ds;
+ my ($sec, $min, $hour, $mday, $mon, $year, $wday) = gmtime($ds);
"$DoW[$wday], " . sprintf("%02d $MoY[$mon] %04d %02d:%02d:%02d +0000",
$mday, $year+1900, $hour, $min, $sec);
@@ -119,9 +116,12 @@ sub from_name {
sub ts {
my ($self) = @_;
- $self->{ts} ||= eval {
- msg_timestamp($self->{mime}->header_obj);
- } || 0;
+ $self->{ts} ||= eval { msg_timestamp($self->{mime}->header_obj) } || 0;
+}
+
+sub ds {
+ my ($self) = @_;
+ $self->{ds} ||= eval { msg_datestamp($self->{mime}->header_obj); } || 0;
}
sub to_doc_data {
diff --git a/lib/PublicInbox/SearchView.pm b/lib/PublicInbox/SearchView.pm
index 53b88c3..55c588c 100644
--- a/lib/PublicInbox/SearchView.pm
+++ b/lib/PublicInbox/SearchView.pm
@@ -117,11 +117,11 @@ sub mset_summary {
obfuscate_addrs($obfs_ibx, $s);
obfuscate_addrs($obfs_ibx, $f);
}
- my $ts = PublicInbox::View::fmt_ts($smsg->ts);
+ my $date = PublicInbox::View::fmt_ts($smsg->ds);
my $mid = PublicInbox::Hval->new_msgid($smsg->mid)->{href};
$$res .= qq{$rank. <b><a\nhref="$mid/">}.
$s . "</a></b>\n";
- $$res .= "$pfx - by $f @ $ts UTC [$pct%]\n\n";
+ $$res .= "$pfx - by $f @ $date UTC [$pct%]\n\n";
}
$$res .= search_nav_bot($mset, $q);
*noop;
@@ -227,7 +227,7 @@ sub mset_thread {
} ($mset->items) ]});
my $r = $q->{r};
my $rootset = PublicInbox::SearchThread::thread($msgs,
- $r ? sort_relevance(\%pct) : *PublicInbox::View::sort_ts,
+ $r ? sort_relevance(\%pct) : *PublicInbox::View::sort_ds,
$srch);
my $skel = search_nav_bot($mset, $q). "<pre>";
my $inbox = $ctx->{-inbox};
diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm
index f811f4f..18882af 100644
--- a/lib/PublicInbox/View.pm
+++ b/lib/PublicInbox/View.pm
@@ -6,7 +6,7 @@
package PublicInbox::View;
use strict;
use warnings;
-use PublicInbox::MsgTime qw(msg_timestamp);
+use PublicInbox::MsgTime qw(msg_datestamp);
use PublicInbox::Hval qw/ascii_html obfuscate_addrs/;
use PublicInbox::Linkify;
use PublicInbox::MID qw/mid_clean id_compress mid_mime mid_escape/;
@@ -735,7 +735,7 @@ sub load_results {
sub thread_results {
my ($msgs, $srch) = @_;
require PublicInbox::SearchThread;
- PublicInbox::SearchThread::thread($msgs, *sort_ts, $srch);
+ PublicInbox::SearchThread::thread($msgs, *sort_ds, $srch);
}
sub missing_thread {
@@ -746,7 +746,7 @@ sub missing_thread {
sub _msg_date {
my ($hdr) = @_;
- fmt_ts(msg_timestamp($hdr));
+ fmt_ts(msg_datestamp($hdr));
}
sub fmt_ts { POSIX::strftime('%Y-%m-%d %k:%M', gmtime($_[0])) }
@@ -782,7 +782,7 @@ sub skel_dump {
my $obfs_ibx = $ctx->{-obfs_ibx};
obfuscate_addrs($obfs_ibx, $f) if $obfs_ibx;
- my $d = fmt_ts($smsg->{ts}) . ' ' . indent_for($level) . th_pfx($level);
+ my $d = fmt_ts($smsg->{ds}) . ' ' . indent_for($level) . th_pfx($level);
my $attr = $f;
$ctx->{first_level} ||= $level;
@@ -863,10 +863,10 @@ sub _skel_ghost {
$$dst .= $d;
}
-sub sort_ts {
+sub sort_ds {
[ sort {
- (eval { $a->topmost->{smsg}->ts } || 0) <=>
- (eval { $b->topmost->{smsg}->ts } || 0)
+ (eval { $a->topmost->{smsg}->ds } || 0) <=>
+ (eval { $b->topmost->{smsg}->ds } || 0)
} @{$_[0]} ];
}
@@ -877,21 +877,21 @@ sub acc_topic {
my $srch = $ctx->{srch};
my $mid = $node->{id};
my $x = $node->{smsg} || $srch->lookup_mail($mid);
- my ($subj, $ts);
+ my ($subj, $ds);
my $topic;
if ($x) {
$subj = $x->subject;
$subj = $srch->subject_normalized($subj);
- $ts = $x->ts;
+ $ds = $x->ds;
if ($level == 0) {
- $topic = [ $ts, 1, { $subj => $mid }, $subj ];
+ $topic = [ $ds, 1, { $subj => $mid }, $subj ];
$ctx->{-cur_topic} = $topic;
push @{$ctx->{order}}, $topic;
return;
}
$topic = $ctx->{-cur_topic}; # should never be undef
- $topic->[0] = $ts if $ts > $topic->[0];
+ $topic->[0] = $ds if $ds > $topic->[0];
$topic->[1]++;
my $seen = $topic->[2];
if (scalar(@$topic) == 3) { # parent was a ghost
@@ -910,7 +910,7 @@ sub acc_topic {
sub dump_topics {
my ($ctx) = @_;
- my $order = delete $ctx->{order}; # [ ts, subj1, subj2, subj3, ... ]
+ my $order = delete $ctx->{order}; # [ ds, subj1, subj2, subj3, ... ]
if (!@$order) {
$ctx->{-html_tip} = '<pre>[No topics in range]</pre>';
return 404;
@@ -923,14 +923,14 @@ sub dump_topics {
# sort by recency, this allows new posts to "bump" old topics...
foreach my $topic (sort { $b->[0] <=> $a->[0] } @$order) {
- my ($ts, $n, $seen, $top, @ex) = @$topic;
+ my ($ds, $n, $seen, $top, @ex) = @$topic;
@$topic = ();
next unless defined $top; # ghost topic
my $mid = delete $seen->{$top};
my $href = mid_escape($mid);
my $prev_subj = [ split(/ /, $top) ];
$top = PublicInbox::Hval->new($top)->as_html;
- $ts = fmt_ts($ts);
+ $ds = fmt_ts($ds);
# $n isn't the total number of posts on the topic,
# just the number of posts in the current results window
@@ -946,7 +946,7 @@ sub dump_topics {
my $mbox = qq(<a\nhref="$href/t.mbox.gz">mbox.gz</a>);
my $atom = qq(<a\nhref="$href/t.atom">Atom</a>);
my $s = "<a\nhref=\"$href/T/$anchor\"><b>$top</b></a>\n" .
- " $ts UTC $n - $mbox / $atom\n";
+ " $ds UTC $n - $mbox / $atom\n";
for (my $i = 0; $i < scalar(@ex); $i += 2) {
my $level = $ex[$i];
my $subj = $ex[$i + 1];
--
EW
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH 06/13] msgmap: add tmp_clone to create an anonymous copy
2018-03-22 9:40 [PATCH 00/13] reindexing, feeds, date fixes Eric Wong (Contractor, The Linux Foundation)
` (4 preceding siblings ...)
2018-03-22 9:40 ` [PATCH 05/13] use both Date: and Received: times Eric Wong (Contractor, The Linux Foundation)
@ 2018-03-22 9:40 ` Eric Wong (Contractor, The Linux Foundation)
2018-03-22 9:40 ` [PATCH 07/13] fix syntax warnings Eric Wong (Contractor, The Linux Foundation)
` (6 subsequent siblings)
12 siblings, 0 replies; 15+ messages in thread
From: Eric Wong (Contractor, The Linux Foundation) @ 2018-03-22 9:40 UTC (permalink / raw)
To: meta
This will be used to keep track of Message-ID <-> NNTP Article
numbers to prevent article number reuse when reindexing.
---
lib/PublicInbox/Inbox.pm | 2 +-
lib/PublicInbox/Msgmap.pm | 19 +++++++++++++++++++
t/msgmap.t | 4 ++++
3 files changed, 24 insertions(+), 1 deletion(-)
diff --git a/lib/PublicInbox/Inbox.pm b/lib/PublicInbox/Inbox.pm
index b9cd4c4..f307038 100644
--- a/lib/PublicInbox/Inbox.pm
+++ b/lib/PublicInbox/Inbox.pm
@@ -75,7 +75,7 @@ sub new {
_set_uint($opts, 'feedmax', 25);
$opts->{nntpserver} ||= $pi_config->{'publicinbox.nntpserver'};
my $dir = $opts->{mainrepo};
- if (defined $dir && -f "$dir/msgmap.sqlite3") { # XXX DIRTY
+ if (defined $dir && -f "$dir/inbox.lock") {
$opts->{version} = 2;
}
bless $opts, $class;
diff --git a/lib/PublicInbox/Msgmap.pm b/lib/PublicInbox/Msgmap.pm
index 8e81fba..78922d3 100644
--- a/lib/PublicInbox/Msgmap.pm
+++ b/lib/PublicInbox/Msgmap.pm
@@ -12,6 +12,7 @@ use strict;
use warnings;
use DBI;
use DBD::SQLite;
+use File::Temp qw(tempfile);
sub new {
my ($class, $git_dir, $writable) = @_;
@@ -45,6 +46,18 @@ sub new_file {
$self;
}
+# used to keep track of used numeric mappings for v2 reindex
+sub tmp_clone {
+ my ($self) = @_;
+ my ($fh, $fn) = tempfile(EXLOCK => 0);
+ $self->{dbh}->sqlite_backup_to_file($fn);
+ my $tmp = ref($self)->new_file($fn, 1);
+ $tmp->{dbh}->do('PRAGMA synchronous = OFF');
+ $tmp->{tmp_name} = $fn; # SQLite won't work if unlinked, apparently
+ $fh = undef;
+ $tmp;
+}
+
# n.b. invoked directly by scripts/xhdr-num2mid
sub meta_accessor {
my ($self, $key, $value) = @_;
@@ -189,4 +202,10 @@ sub mid_set {
$sth->execute($num, $mid);
}
+sub DESTROY {
+ my ($self) = @_;
+ delete $self->{dbh};
+ unlink $self->{tmp_name} if defined $self->{tmp_name};
+}
+
1;
diff --git a/t/msgmap.t b/t/msgmap.t
index bc22137..dce98f4 100644
--- a/t/msgmap.t
+++ b/t/msgmap.t
@@ -65,4 +65,8 @@ my $orig = $d->mid_insert('spam@1');
$d->mid_delete('spam@1');
is($d->mid_insert('spam@2'), 1 + $orig, "last number not recycled");
+my $tmp = $d->tmp_clone;
+is_deeply([$d->minmax], [$tmp->minmax], 'Cloned temporary DB matches');
+ok($tmp->mid_delete('spam@2'), 'temporary DB is writable');
+
done_testing();
--
EW
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH 07/13] fix syntax warnings
2018-03-22 9:40 [PATCH 00/13] reindexing, feeds, date fixes Eric Wong (Contractor, The Linux Foundation)
` (5 preceding siblings ...)
2018-03-22 9:40 ` [PATCH 06/13] msgmap: add tmp_clone to create an anonymous copy Eric Wong (Contractor, The Linux Foundation)
@ 2018-03-22 9:40 ` Eric Wong (Contractor, The Linux Foundation)
2018-03-22 9:40 ` [PATCH 08/13] v2writable: support reindexing Xapian Eric Wong (Contractor, The Linux Foundation)
` (5 subsequent siblings)
12 siblings, 0 replies; 15+ messages in thread
From: Eric Wong (Contractor, The Linux Foundation) @ 2018-03-22 9:40 UTC (permalink / raw)
To: meta
I keep forgetting to run "make syntax"
---
lib/PublicInbox/Daemon.pm | 2 +-
lib/PublicInbox/EvCleanup.pm | 4 +++-
lib/PublicInbox/Inbox.pm | 2 +-
lib/PublicInbox/SearchMsg.pm | 2 +-
4 files changed, 6 insertions(+), 4 deletions(-)
diff --git a/lib/PublicInbox/Daemon.pm b/lib/PublicInbox/Daemon.pm
index 9d27d2b..4629aad 100644
--- a/lib/PublicInbox/Daemon.pm
+++ b/lib/PublicInbox/Daemon.pm
@@ -460,7 +460,7 @@ sub daemon_loop ($$) {
@listeners = map {
PublicInbox::Listener->new($_, $post_accept)
} @listeners;
- $PublicInbox::EvCleanup::ENABLED = 1;
+ PublicInbox::EvCleanup::enable();
Danga::Socket->EventLoop;
$parent_pipe = undef;
}
diff --git a/lib/PublicInbox/EvCleanup.pm b/lib/PublicInbox/EvCleanup.pm
index 384efd3..1a3a3d5 100644
--- a/lib/PublicInbox/EvCleanup.pm
+++ b/lib/PublicInbox/EvCleanup.pm
@@ -8,7 +8,9 @@ use warnings;
use base qw(Danga::Socket);
use fields qw(rd);
-our $ENABLED;
+my $ENABLED;
+sub enabled { $ENABLED }
+sub enable { $ENABLED = 1 }
my $singleton;
my $asapq = [ [], undef ];
my $nextq = [ [], undef ];
diff --git a/lib/PublicInbox/Inbox.pm b/lib/PublicInbox/Inbox.pm
index f307038..666c81d 100644
--- a/lib/PublicInbox/Inbox.pm
+++ b/lib/PublicInbox/Inbox.pm
@@ -29,7 +29,7 @@ sub cleanup_task () {
sub _cleanup_later ($) {
my ($self) = @_;
- return unless $PublicInbox::EvCleanup::ENABLED;
+ return unless PublicInbox::EvCleanup::enabled();
$cleanup_timer ||= PublicInbox::EvCleanup::later(*cleanup_task);
$CLEANUP->{"$self"} = $self;
}
diff --git a/lib/PublicInbox/SearchMsg.pm b/lib/PublicInbox/SearchMsg.pm
index de1fd13..dd3d58d 100644
--- a/lib/PublicInbox/SearchMsg.pm
+++ b/lib/PublicInbox/SearchMsg.pm
@@ -62,7 +62,7 @@ sub load_doc {
sub bytes ($) { get_val($_[0]->{doc}, &PublicInbox::Search::BYTES) }
sub lines ($) { get_val($_[0]->{doc}, &PublicInbox::Search::LINES) }
sub num ($) {
- $_[0]->{num} ||= get_val($_[0]->{doc}, PublicInbox::Search::NUM)
+ $_[0]->{num} ||= get_val($_[0]->{doc}, PublicInbox::Search::NUM())
}
sub __hdr ($$) {
--
EW
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH 08/13] v2writable: support reindexing Xapian
2018-03-22 9:40 [PATCH 00/13] reindexing, feeds, date fixes Eric Wong (Contractor, The Linux Foundation)
` (6 preceding siblings ...)
2018-03-22 9:40 ` [PATCH 07/13] fix syntax warnings Eric Wong (Contractor, The Linux Foundation)
@ 2018-03-22 9:40 ` Eric Wong (Contractor, The Linux Foundation)
2018-03-26 20:08 ` Eric Wong
2018-03-22 9:40 ` [PATCH 09/13] t/altid.t: extra tests for mid_set Eric Wong (Contractor, The Linux Foundation)
` (4 subsequent siblings)
12 siblings, 1 reply; 15+ messages in thread
From: Eric Wong (Contractor, The Linux Foundation) @ 2018-03-22 9:40 UTC (permalink / raw)
To: meta
This still requires a msgmap.sqlite3 file to exist, but
it allows us to tweak Xapian indexing rules and reindex
the Xapian database online while -watch is running.
---
lib/PublicInbox/Msgmap.pm | 40 +++++++++--
lib/PublicInbox/SearchIdx.pm | 14 +++-
lib/PublicInbox/SearchIdxSkeleton.pm | 1 +
lib/PublicInbox/V2Writable.pm | 134 +++++++++++++++++++++++++++++++----
script/public-inbox-index | 25 +++++--
5 files changed, 189 insertions(+), 25 deletions(-)
diff --git a/lib/PublicInbox/Msgmap.pm b/lib/PublicInbox/Msgmap.pm
index 78922d3..1283305 100644
--- a/lib/PublicInbox/Msgmap.pm
+++ b/lib/PublicInbox/Msgmap.pm
@@ -24,9 +24,8 @@ sub new {
new_file($class, "$d/msgmap.sqlite3", $writable);
}
-sub new_file {
- my ($class, $f, $writable) = @_;
-
+sub dbh_new {
+ my ($f, $writable) = @_;
my $dbh = DBI->connect("dbi:SQLite:dbname=$f",'','', {
AutoCommit => 1,
RaiseError => 1,
@@ -35,6 +34,13 @@ sub new_file {
sqlite_use_immediate_transaction => 1,
});
$dbh->do('PRAGMA case_sensitive_like = ON');
+ $dbh;
+}
+
+sub new_file {
+ my ($class, $f, $writable) = @_;
+
+ my $dbh = dbh_new($f, $writable);
my $self = bless { dbh => $dbh }, $class;
if ($writable) {
@@ -49,12 +55,13 @@ sub new_file {
# used to keep track of used numeric mappings for v2 reindex
sub tmp_clone {
my ($self) = @_;
- my ($fh, $fn) = tempfile(EXLOCK => 0);
+ my ($fh, $fn) = tempfile('msgmap-XXXXXXXX', EXLOCK => 0, TMPDIR => 1);
$self->{dbh}->sqlite_backup_to_file($fn);
my $tmp = ref($self)->new_file($fn, 1);
$tmp->{dbh}->do('PRAGMA synchronous = OFF');
$tmp->{tmp_name} = $fn; # SQLite won't work if unlinked, apparently
- $fh = undef;
+ $tmp->{pid} = $$;
+ close $fh or die "failed to close $fn: $!";
$tmp;
}
@@ -205,7 +212,28 @@ sub mid_set {
sub DESTROY {
my ($self) = @_;
delete $self->{dbh};
- unlink $self->{tmp_name} if defined $self->{tmp_name};
+ my $f = delete $self->{tmp_name};
+ if (defined $f && $self->{pid} == $$) {
+ unlink $f or warn "failed to unlink $f: $!\n";
+ }
+}
+
+sub atfork_parent {
+ my ($self) = @_;
+ my $f = $self->{tmp_name} or die "not a temporary clone\n";
+ delete $self->{dbh} and die "tmp_clone dbh not prepared for parent";
+ my $dbh = $self->{dbh} = dbh_new($f, 1);
+ $dbh->do('PRAGMA synchronous = OFF');
+}
+
+sub atfork_prepare {
+ my ($self) = @_;
+ my $f = $self->{tmp_name} or die "not a temporary clone\n";
+ $self->{pid} == $$ or
+ die "BUG: atfork_prepare not called from $self->{pid}\n";
+ $self->{dbh} or die "temporary clone not open\n";
+ # must clobber prepared statements
+ %$self = (tmp_name => $f, pid => $$);
}
1;
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index ef723a4..7ac16ec 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -352,7 +352,7 @@ sub add_message {
# populates smsg->references for smsg->to_doc_data
my $refs = parse_references($smsg);
- $mid0 = $mids->[0] unless defined $mid0;
+ $mid0 = $mids->[0] unless defined $mid0; # v1 compatibility
my $data = $smsg->to_doc_data($oid, $mid0);
foreach my $mid (@$mids) {
$tg->index_text($mid, 1, 'XM');
@@ -369,10 +369,12 @@ sub add_message {
}
}
+ $self->delete_article($num) if defined $num; # for reindexing
if ($skel) {
push @values, $mids, $xpath, $data;
$skel->index_skeleton(\@values);
$doc->add_boolean_term('Q' . $_) foreach @$mids;
+ $doc->add_boolean_term('XNUM' . $num) if defined $num;
$doc_id = $self->{xdb}->add_document($doc);
} else {
$doc_id = link_and_save($self, $doc, $mids, $refs,
@@ -421,6 +423,16 @@ sub remove_message {
}
}
+sub delete_article {
+ my ($self, $num) = @_;
+ my $ndel = 0;
+ batch_do($self, 'XNUM' . $num, sub {
+ my ($ids) = @_;
+ $ndel += scalar @$ids;
+ $self->{xdb}->delete_document($_) for @$ids;
+ });
+}
+
# MID is a hint in V2
sub remove_by_oid {
my ($self, $oid, $mid) = @_;
diff --git a/lib/PublicInbox/SearchIdxSkeleton.pm b/lib/PublicInbox/SearchIdxSkeleton.pm
index 78a1730..4f15816 100644
--- a/lib/PublicInbox/SearchIdxSkeleton.pm
+++ b/lib/PublicInbox/SearchIdxSkeleton.pm
@@ -134,6 +134,7 @@ sub index_skeleton_real ($$) {
$smsg->load_from_data($doc_data);
my $num = $values->[PublicInbox::Search::NUM];
my @refs = ($smsg->references =~ /<([^>]+)>/g);
+ $self->delete_article($num) if defined $num; # for reindexing
$self->link_and_save($doc, $mids, \@refs, $num, $xpath);
}
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 46bfebb..550a74d 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -359,6 +359,23 @@ sub git_init {
$git_dir
}
+sub git_dir_latest {
+ my ($self, $max) = @_;
+ $$max = -1;
+ my $pfx = "$self->{-inbox}->{mainrepo}/git";
+ return unless -d $pfx;
+ my $latest;
+ opendir my $dh, $pfx or die "opendir $pfx: $!\n";
+ while (defined(my $git_dir = readdir($dh))) {
+ $git_dir =~ m!\A(\d+)\.git\z! or next;
+ if ($1 > $$max) {
+ $$max = $1;
+ $latest = "$pfx/$git_dir";
+ }
+ }
+ $latest;
+}
+
sub importer {
my ($self) = @_;
my $im = $self->{im};
@@ -375,20 +392,9 @@ sub importer {
return $self->import_init($git, 0);
}
}
- my $latest;
- my $max = -1;
my $new = 0;
- my $pfx = "$self->{-inbox}->{mainrepo}/git";
- if (-d $pfx) {
- foreach my $git_dir (glob("$pfx/*.git")) {
- $git_dir =~ m!/(\d+)\.git\z! or next;
- my $n = $1;
- if ($n > $max) {
- $max = $n;
- $latest = $git_dir;
- }
- }
- }
+ my $max;
+ my $latest = git_dir_latest($self, \$max);
if (defined $latest) {
my $git = PublicInbox::Git->new($latest);
my $packed_bytes = $git->packed_bytes;
@@ -466,6 +472,8 @@ sub lookup_content {
sub atfork_child {
my ($self) = @_;
+ my $fh = delete $self->{reindex_pipe};
+ close $fh if $fh;
if (my $parts = $self->{idx_parts}) {
$_->atfork_child foreach @$parts;
}
@@ -474,4 +482,104 @@ sub atfork_child {
}
}
+sub mark_deleted {
+ my ($self, $D, $git, $oid) = @_;
+ my $msgref = $git->cat_file($oid);
+ my $mime = PublicInbox::MIME->new($$msgref);
+ my $mids = mids($mime->header_obj);
+ my $cid = content_id($mime);
+ foreach my $mid (@$mids) {
+ $D->{$mid.$cid} = 1;
+ }
+}
+
+sub reindex_oid {
+ my ($self, $mm_tmp, $D, $git, $oid) = @_;
+ my $len;
+ my $msgref = $git->cat_file($oid, \$len);
+ my $mime = PublicInbox::MIME->new($$msgref);
+ my $mids = mids($mime->header_obj);
+ my $cid = content_id($mime);
+
+ # get the NNTP article number we used before, highest number wins
+ # and gets deleted from mm_tmp;
+ my $mid0;
+ my $num = -1;
+ my $del = 0;
+ foreach my $mid (@$mids) {
+ $del += (delete $D->{$mid.$cid} || 0);
+ my $n = $mm_tmp->num_for($mid);
+ if (defined $n && $n > $num) {
+ $mid0 = $mid;
+ $num = $n;
+ }
+ }
+ if (!defined($mid0) || $del) {
+ return if (!defined($mid0) && $del); # expected for deletes
+
+ my $id = '<' . join('> <', @$mids) . '>';
+ defined($mid0) or
+ warn "Skipping $id, no article number found\n";
+ if ($del && defined($mid0)) {
+ warn "$id was deleted $del " .
+ "time(s) but mapped to article #$num\n";
+ }
+ return;
+
+ }
+ $mm_tmp->mid_delete($mid0) or
+ die "failed to delete <$mid0> for article #$num\n";
+
+ my $nparts = $self->{partitions};
+ my $part = $num % $nparts;
+ my $idx = $self->idx_part($part);
+ $idx->index_raw($len, $msgref, $num, $oid, $mid0, $mime);
+ my $n = $self->{transact_bytes} += $len;
+ if ($n > (PublicInbox::SearchIdx::BATCH_BYTES * $nparts)) {
+ $git->cleanup;
+ $mm_tmp->atfork_prepare;
+ $self->done; # release lock
+ # allow -watch or -mda to write...
+ $self->idx_init; # reacquire lock
+ $mm_tmp->atfork_parent;
+ }
+}
+
+sub reindex {
+ my ($self) = @_;
+ my $ibx = $self->{-inbox};
+ my $pfx = "$ibx->{mainrepo}/git";
+ my $max_git;
+ my $latest = git_dir_latest($self, \$max_git);
+ return unless defined $latest;
+ my @cmd = qw(log --raw -r --pretty=tformat:%h
+ --no-notes --no-color --no-abbrev);
+ my $head = $ibx->{ref_head} || 'refs/heads/master';
+ $self->idx_init; # acquire lock
+ my $x40 = qr/[a-f0-9]{40}/;
+ my $mm_tmp = $self->{skel}->{mm}->tmp_clone;
+ my $D = {};
+
+ # work backwards through history
+ for (my $cur = $max_git; $cur >= 0; $cur--) {
+ die "already reindexing!\n" if delete $self->{reindex_pipe};
+ my $cmt;
+ my $git_dir = "$pfx/$cur.git";
+ my $git = PublicInbox::Git->new($git_dir);
+ my $fh = $self->{reindex_pipe} = $git->popen(@cmd, $head);
+ while (<$fh>) {
+ if (/\A$x40$/o) {
+ chomp($cmt = $_);
+ } elsif (/\A:\d{6} 100644 $x40 ($x40) [AM]\tm$/o) {
+ $self->reindex_oid($mm_tmp, $D, $git, $1);
+ } elsif (m!\A:\d{6} 100644 $x40 ($x40) [AM]\t_/D$!o) {
+ $self->mark_deleted($D, $git, $1);
+ }
+ }
+ delete $self->{reindex_pipe};
+ }
+ my ($min, $max) = $mm_tmp->minmax;
+ defined $max and die "leftover article numbers at $min..$max\n";
+}
+
1;
diff --git a/script/public-inbox-index b/script/public-inbox-index
index 1debbaa..cea3573 100755
--- a/script/public-inbox-index
+++ b/script/public-inbox-index
@@ -31,6 +31,9 @@ my @dirs;
sub resolve_repo_dir {
my ($cd) = @_;
my $prefix = defined $cd ? $cd : './';
+ if (-d $prefix && -f "$prefix/inbox.lock") { # v2
+ return abs_path($prefix);
+ }
my @cmd = qw(git rev-parse --git-dir);
my $cmd = join(' ', @cmd);
@@ -75,14 +78,26 @@ foreach my $k (keys %$config) {
}
foreach my $dir (@dirs) {
+ if (!ref($dir) && -f "$dir/inbox.lock") { # v2
+ my $ibx = { mainrepo => $dir, name => 'unnamed' };
+ $dir = PublicInbox::Inbox->new($ibx);
+ }
index_dir($dir);
}
sub index_dir {
- my ($git_dir) = @_;
- if (!ref $git_dir && ! -d $git_dir) {
- die "$git_dir does not appear to be a git repository\n";
+ my ($repo) = @_;
+ if (!ref $repo && ! -d $repo) {
+ die "$repo does not appear to be an inbox repository\n";
+ }
+ if (ref($repo) && ($repo->{version} || 1) == 2) {
+ eval { require PublicInbox::V2Writable };
+ die "v2 requirements not met: $@\n" if $@;
+ my $v2w = PublicInbox::V2Writable->new($repo);
+ $v2w->reindex;
+ $v2w->done;
+ } else {
+ my $s = PublicInbox::SearchIdx->new($repo, 1);
+ $s->index_sync({ reindex => $reindex });
}
- my $s = PublicInbox::SearchIdx->new($git_dir, 1);
- $s->index_sync({ reindex => $reindex });
}
--
EW
^ permalink raw reply related [flat|nested] 15+ messages in thread
* Re: [PATCH 08/13] v2writable: support reindexing Xapian
2018-03-22 9:40 ` [PATCH 08/13] v2writable: support reindexing Xapian Eric Wong (Contractor, The Linux Foundation)
@ 2018-03-26 20:08 ` Eric Wong
0 siblings, 0 replies; 15+ messages in thread
From: Eric Wong @ 2018-03-26 20:08 UTC (permalink / raw)
To: meta
<e@80x24.org> wrote:
> --- a/lib/PublicInbox/SearchIdx.pm
> +++ b/lib/PublicInbox/SearchIdx.pm
> @@ -369,10 +369,12 @@ sub add_message {
> }
> }
>
> + $self->delete_article($num) if defined $num; # for reindexing
> if ($skel) {
> push @values, $mids, $xpath, $data;
> $skel->index_skeleton(\@values);
> $doc->add_boolean_term('Q' . $_) foreach @$mids;
> + $doc->add_boolean_term('XNUM' . $num) if defined $num;
> $doc_id = $self->{xdb}->add_document($doc);
> } else {
> $doc_id = link_and_save($self, $doc, $mids, $refs,
> @@ -421,6 +423,16 @@ sub remove_message {
> }
> }
>
> +sub delete_article {
> + my ($self, $num) = @_;
> + my $ndel = 0;
> + batch_do($self, 'XNUM' . $num, sub {
> + my ($ids) = @_;
> + $ndel += scalar @$ids;
> + $self->{xdb}->delete_document($_) for @$ids;
> + });
> +}
I will need to do some further investigation, but I must be
missing something and there's increases in Xapian DB size
which doesn't seem to get recovered on xapian-compact...
> diff --git a/lib/PublicInbox/SearchIdxSkeleton.pm b/lib/PublicInbox/SearchIdxSkeleton.pm
> index 78a1730..4f15816 100644
> --- a/lib/PublicInbox/SearchIdxSkeleton.pm
> +++ b/lib/PublicInbox/SearchIdxSkeleton.pm
> @@ -134,6 +134,7 @@ sub index_skeleton_real ($$) {
> $smsg->load_from_data($doc_data);
> my $num = $values->[PublicInbox::Search::NUM];
> my @refs = ($smsg->references =~ /<([^>]+)>/g);
> + $self->delete_article($num) if defined $num; # for reindexing
> $self->link_and_save($doc, $mids, \@refs, $num, $xpath);
> }
^ permalink raw reply [flat|nested] 15+ messages in thread
* [PATCH 09/13] t/altid.t: extra tests for mid_set
2018-03-22 9:40 [PATCH 00/13] reindexing, feeds, date fixes Eric Wong (Contractor, The Linux Foundation)
` (7 preceding siblings ...)
2018-03-22 9:40 ` [PATCH 08/13] v2writable: support reindexing Xapian Eric Wong (Contractor, The Linux Foundation)
@ 2018-03-22 9:40 ` Eric Wong (Contractor, The Linux Foundation)
2018-03-22 9:40 ` [PATCH 10/13] v2writable: add NNTP article number regeneration support Eric Wong (Contractor, The Linux Foundation)
` (3 subsequent siblings)
12 siblings, 0 replies; 15+ messages in thread
From: Eric Wong (Contractor, The Linux Foundation) @ 2018-03-22 9:40 UTC (permalink / raw)
To: meta
I'll be relying on some of this behavior for regenerating NNTP
article numbers off fresh clones.
---
t/altid.t | 11 ++++++++++-
1 file changed, 10 insertions(+), 1 deletion(-)
diff --git a/t/altid.t b/t/altid.t
index 7759bd6..0f3b86c 100644
--- a/t/altid.t
+++ b/t/altid.t
@@ -20,7 +20,9 @@ my $altid = [ "serial:gmane:file=$alt_file" ];
{
my $mm = PublicInbox::Msgmap->new_file($alt_file, 1);
- $mm->mid_set(1234, 'a@example.com');
+ is($mm->mid_set(1234, 'a@example.com'), 1, 'mid_set once OK');
+ ok(0 == $mm->mid_set(1234, 'a@example.com'), 'mid_set not idempotent');
+ ok(0 == $mm->mid_set(1, 'a@example.com'), 'mid_set fails with dup MID');
}
{
@@ -56,6 +58,13 @@ my $altid = [ "serial:gmane:file=$alt_file" ];
is($res->{total}, 0, 'body did NOT match');
};
+{
+ my $mm = PublicInbox::Msgmap->new_file($alt_file, 1);
+ my ($min, $max) = $mm->minmax;
+ my $num = $mm->mid_insert('b@example.com');
+ ok($num > $max, 'auto-increment goes beyond mid_set');
+}
+
done_testing();
1;
--
EW
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH 10/13] v2writable: add NNTP article number regeneration support
2018-03-22 9:40 [PATCH 00/13] reindexing, feeds, date fixes Eric Wong (Contractor, The Linux Foundation)
` (8 preceding siblings ...)
2018-03-22 9:40 ` [PATCH 09/13] t/altid.t: extra tests for mid_set Eric Wong (Contractor, The Linux Foundation)
@ 2018-03-22 9:40 ` Eric Wong (Contractor, The Linux Foundation)
2018-03-22 9:40 ` [PATCH 11/13] v2writable: clarify header cleanups Eric Wong (Contractor, The Linux Foundation)
` (2 subsequent siblings)
12 siblings, 0 replies; 15+ messages in thread
From: Eric Wong (Contractor, The Linux Foundation) @ 2018-03-22 9:40 UTC (permalink / raw)
To: meta
Allow best-effort regeneration of NNTP article numbers from
cloned git repositories in addition to indexing Xapian Article
numbers will not remain consistent when we add purge support,
though.
---
MANIFEST | 1 +
lib/PublicInbox/V2Writable.pm | 61 +++++++++++++++++++++++----
script/public-inbox-index | 35 ++++++++++++++--
t/v2reindex.t | 98 +++++++++++++++++++++++++++++++++++++++++++
4 files changed, 185 insertions(+), 10 deletions(-)
create mode 100644 t/v2reindex.t
diff --git a/MANIFEST b/MANIFEST
index 567148a..0f88995 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -180,6 +180,7 @@ t/spawn.t
t/thread-all.t
t/thread-cycle.t
t/utf8.mbox
+t/v2reindex.t
t/v2writable.t
t/view.t
t/watch_maildir.t
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 550a74d..605f688 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -494,7 +494,7 @@ sub mark_deleted {
}
sub reindex_oid {
- my ($self, $mm_tmp, $D, $git, $oid) = @_;
+ my ($self, $mm_tmp, $D, $git, $oid, $regen) = @_;
my $len;
my $msgref = $git->cat_file($oid, \$len);
my $mime = PublicInbox::MIME->new($$msgref);
@@ -514,8 +514,27 @@ sub reindex_oid {
$num = $n;
}
}
+ if (!defined($mid0) && $regen && !$del) {
+ $num = $$regen--;
+ die "BUG: ran out of article numbers\n" if $num <= 0;
+ my $mm = $self->{skel}->{mm};
+ foreach my $mid (@$mids) {
+ if ($mm->mid_set($num, $mid) == 1) {
+ $mid0 = $mid;
+ last;
+ }
+ }
+ if (!defined($mid0)) {
+ my $id = '<' . join('> <', @$mids) . '>';
+ warn "Message-Id $id unusable for $num\n";
+ }
+ }
+
if (!defined($mid0) || $del) {
- return if (!defined($mid0) && $del); # expected for deletes
+ if (!defined($mid0) && $del) { # expected for deletes
+ $$regen--;
+ return
+ }
my $id = '<' . join('> <', @$mids) . '>';
defined($mid0) or
@@ -546,19 +565,45 @@ sub reindex_oid {
}
sub reindex {
- my ($self) = @_;
+ my ($self, $regen) = @_;
my $ibx = $self->{-inbox};
my $pfx = "$ibx->{mainrepo}/git";
my $max_git;
my $latest = git_dir_latest($self, \$max_git);
return unless defined $latest;
- my @cmd = qw(log --raw -r --pretty=tformat:%h
- --no-notes --no-color --no-abbrev);
my $head = $ibx->{ref_head} || 'refs/heads/master';
$self->idx_init; # acquire lock
my $x40 = qr/[a-f0-9]{40}/;
my $mm_tmp = $self->{skel}->{mm}->tmp_clone;
+ if (!$regen) {
+ my (undef, $max) = $mm_tmp->minmax;
+ unless (defined $max) {
+ $regen = 1;
+ warn
+"empty msgmap.sqlite3, regenerating article numbers\n";
+ }
+ }
+ my $tip; # latest commit out of all git repos
+ if ($regen) {
+ my $regen_max = 0;
+ for (my $cur = $max_git; $cur >= 0; $cur--) {
+ die "already reindexing!\n" if $self->{reindex_pipe};
+ my $git = PublicInbox::Git->new("$pfx/$cur.git");
+ chomp($tip = $git->qx('rev-parse', $head)) unless $tip;
+ my $h = $cur == $max_git ? $tip : $head;
+ my @count = ('rev-list', '--count', $h, '--', 'm');
+ $regen_max += $git->qx(@count);
+ }
+ die "No messages found in $pfx/*.git, bug?\n" unless $regen_max;
+ $regen = \$regen_max;
+ }
my $D = {};
+ my @cmd = qw(log --raw -r --pretty=tformat:%h
+ --no-notes --no-color --no-abbrev);
+
+ # if we are regenerating, we must not use a newer tip commit than what
+ # the regeneration counter used:
+ $tip ||= $head;
# work backwards through history
for (my $cur = $max_git; $cur >= 0; $cur--) {
@@ -566,12 +611,14 @@ sub reindex {
my $cmt;
my $git_dir = "$pfx/$cur.git";
my $git = PublicInbox::Git->new($git_dir);
- my $fh = $self->{reindex_pipe} = $git->popen(@cmd, $head);
+ my $h = $cur == $max_git ? $tip : $head;
+ my $fh = $self->{reindex_pipe} = $git->popen(@cmd, $h);
while (<$fh>) {
if (/\A$x40$/o) {
chomp($cmt = $_);
} elsif (/\A:\d{6} 100644 $x40 ($x40) [AM]\tm$/o) {
- $self->reindex_oid($mm_tmp, $D, $git, $1);
+ $self->reindex_oid($mm_tmp, $D, $git, $1,
+ $regen);
} elsif (m!\A:\d{6} 100644 $x40 ($x40) [AM]\t_/D$!o) {
$self->mark_deleted($D, $git, $1);
}
diff --git a/script/public-inbox-index b/script/public-inbox-index
index cea3573..52d6ba7 100755
--- a/script/public-inbox-index
+++ b/script/public-inbox-index
@@ -23,8 +23,15 @@ if ($@) {
}
my $reindex;
-my %opts = ( '--reindex' => \$reindex );
+my $regen;
+my $jobs = undef;
+my %opts = (
+ '--reindex' => \$reindex,
+ '--regenerate' => \$regen,
+ '--jobs|j=i' => \$jobs,
+);
GetOptions(%opts) or die "bad command-line args\n$usage";
+die "--jobs must be positive\n" if defined $jobs && $jobs <= 0;
my @dirs;
@@ -93,8 +100,30 @@ sub index_dir {
if (ref($repo) && ($repo->{version} || 1) == 2) {
eval { require PublicInbox::V2Writable };
die "v2 requirements not met: $@\n" if $@;
- my $v2w = PublicInbox::V2Writable->new($repo);
- $v2w->reindex;
+ my $v2w = eval {
+ local $ENV{NPROC} = $jobs;
+ PublicInbox::V2Writable->new($repo);
+ };
+ if (defined $jobs) {
+ if ($jobs == 1) {
+ $v2w->{parallel} = 0;
+ } else {
+ my $n = $v2w->{partitions};
+ if ($jobs != $n) {
+ warn
+"Unable to respect --jobs=$jobs, inbox was created with $n partitions\n";
+ }
+ }
+ }
+ my $mm = $repo->mm;
+ my (undef, $max) = $mm->minmax if $mm;
+ if (defined($max) && !$reindex && !$regen) {
+ die
+"v2 inboxes may only use --reindex and/or --regenerate once\n".
+"msgmap.sqlite3 is initialized\n";
+ }
+
+ $v2w->reindex($regen);
$v2w->done;
} else {
my $s = PublicInbox::SearchIdx->new($repo, 1);
diff --git a/t/v2reindex.t b/t/v2reindex.t
new file mode 100644
index 0000000..b9540e4
--- /dev/null
+++ b/t/v2reindex.t
@@ -0,0 +1,98 @@
+# Copyright (C) 2018 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use strict;
+use warnings;
+use Test::More;
+use PublicInbox::MIME;
+use PublicInbox::ContentId qw(content_digest);
+use File::Temp qw/tempdir/;
+use File::Path qw(remove_tree);
+
+foreach my $mod (qw(DBD::SQLite Search::Xapian)) {
+ eval "require $mod";
+ plan skip_all => "$mod missing for v2reindex.t" if $@;
+}
+use_ok 'PublicInbox::V2Writable';
+my $mainrepo = tempdir('pi-v2reindex-XXXXXX', TMPDIR => 1, CLEANUP => 1);
+my $ibx = {
+ mainrepo => $mainrepo,
+ name => 'test-v2writable',
+ version => 2,
+ -primary_address => 'test@example.com',
+};
+$ibx = PublicInbox::Inbox->new($ibx);
+my $mime = PublicInbox::MIME->create(
+ header => [
+ From => 'a@example.com',
+ To => 'test@example.com',
+ Subject => 'this is a subject',
+ Date => 'Fri, 02 Oct 1993 00:00:00 +0000',
+ ],
+ body => "hello world\n",
+);
+
+my $im = PublicInbox::V2Writable->new($ibx, 1);
+$im->{parallel} = 0;
+foreach my $i (1..10) {
+ $mime->header_set('Message-Id', "<$i\@example.com>");
+ ok($im->add($mime), "message $i added");
+ if ($i == 4) {
+ $im->remove($mime);
+ }
+}
+
+if ('test remove later') {
+ $mime->header_set('Message-Id', "<5\@example.com>");
+ $im->remove($mime);
+}
+
+$im->done;
+my $minmax = [ $ibx->mm->minmax ];
+ok(defined $minmax->[0] && defined $minmax->[1], 'minmax defined');
+
+eval { $im->reindex };
+is($@, '', 'no error from reindexing');
+$im->done;
+
+my $xap = "$mainrepo/xap".PublicInbox::Search::SCHEMA_VERSION();
+remove_tree($xap);
+ok(!-d $xap, 'Xapian directories removed');
+eval { $im->reindex };
+is($@, '', 'no error from reindexing');
+$im->done;
+ok(-d $xap, 'Xapian directories recreated');
+
+delete $ibx->{mm};
+is_deeply($minmax, [ $ibx->mm->minmax ], 'minmax unchanged');
+
+ok(unlink "$mainrepo/msgmap.sqlite3", 'remove msgmap');
+remove_tree($xap);
+ok(!-d $xap, 'Xapian directories removed again');
+{
+ my @warn;
+ local $SIG{__WARN__} = sub { push @warn, @_ };
+ eval { $im->reindex };
+ is($@, '', 'no error from reindexing without msgmap');
+ like(join(' ', @warn), qr/regenerat/, 'warned about regenerating');
+ $im->done;
+ ok(-d $xap, 'Xapian directories recreated');
+ delete $ibx->{mm};
+ is_deeply($minmax, [ $ibx->mm->minmax ], 'minmax unchanged');
+}
+
+ok(unlink "$mainrepo/msgmap.sqlite3", 'remove msgmap');
+remove_tree($xap);
+ok(!-d $xap, 'Xapian directories removed again');
+{
+ my @warn;
+ local $SIG{__WARN__} = sub { push @warn, @_ };
+ eval { $im->reindex(my $regen = 1) };
+ is($@, '', 'no error from reindexing without msgmap');
+ is_deeply(\@warn, [], 'no warnings');
+ $im->done;
+ ok(-d $xap, 'Xapian directories recreated');
+ delete $ibx->{mm};
+ is_deeply($minmax, [ $ibx->mm->minmax ], 'minmax unchanged');
+}
+
+done_testing();
--
EW
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH 11/13] v2writable: clarify header cleanups
2018-03-22 9:40 [PATCH 00/13] reindexing, feeds, date fixes Eric Wong (Contractor, The Linux Foundation)
` (9 preceding siblings ...)
2018-03-22 9:40 ` [PATCH 10/13] v2writable: add NNTP article number regeneration support Eric Wong (Contractor, The Linux Foundation)
@ 2018-03-22 9:40 ` Eric Wong (Contractor, The Linux Foundation)
2018-03-22 9:40 ` [PATCH 12/13] v2writable: DEBUG_DIFF respects $TMPDIR Eric Wong (Contractor, The Linux Foundation)
2018-03-22 9:40 ` [PATCH 13/13] feed: $INBOX/new.atom endpoint supports v2 inboxes Eric Wong (Contractor, The Linux Foundation)
12 siblings, 0 replies; 15+ messages in thread
From: Eric Wong (Contractor, The Linux Foundation) @ 2018-03-22 9:40 UTC (permalink / raw)
To: meta
We want to make it clear to the code and DEBUG_DIFF users
that we do not introduce messages with unsuitable headers
into public archives.
---
lib/PublicInbox/Import.pm | 12 +++++++++---
lib/PublicInbox/V2Writable.pm | 7 +++++++
2 files changed, 16 insertions(+), 3 deletions(-)
diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm
index d69934b..5d116a1 100644
--- a/lib/PublicInbox/Import.pm
+++ b/lib/PublicInbox/Import.pm
@@ -288,6 +288,14 @@ sub extract_author_info ($) {
($name, $email);
}
+# kill potentially confusing/misleading headers
+sub drop_unwanted_headers ($) {
+ my ($mime) = @_;
+
+ $mime->header_set($_) for qw(bytes lines content-length status);
+ $mime->header_set($_) for @PublicInbox::MDA::BAD_HEADERS;
+}
+
# returns undef on duplicate
# returns the :MARK of the most recent commit
sub add {
@@ -321,9 +329,7 @@ sub add {
_check_path($r, $w, $tip, $path) and return;
}
- # kill potentially confusing/misleading headers
- $mime->header_set($_) for qw(bytes lines content-length status);
- $mime->header_set($_) for @PublicInbox::MDA::BAD_HEADERS;
+ drop_unwanted_headers($mime);
# spam check:
if ($check_cb) {
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 605f688..44b5528 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -223,6 +223,12 @@ sub remove {
my $mm = $skel->{mm};
my $removed;
my $mids = mids($mime->header_obj);
+
+ # We avoid introducing new blobs into git since the raw content
+ # can be slightly different, so we do not need the user-supplied
+ # message now that we have the mids and content_id
+ $mime = undef;
+
foreach my $mid (@$mids) {
$srch->reopen->each_smsg_by_mid($mid, sub {
my ($smsg) = @_;
@@ -430,6 +436,7 @@ sub diff ($$$) {
print $ah $cur->as_string or die "print: $!";
close $ah or die "close: $!";
my ($bh, $bn) = tempfile('email-new-XXXXXXXX');
+ PublicInbox::Import::drop_unwanted_headers($new);
print $bh $new->as_string or die "print: $!";
close $bh or die "close: $!";
my $cmd = [ qw(diff -u), $an, $bn ];
--
EW
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH 12/13] v2writable: DEBUG_DIFF respects $TMPDIR
2018-03-22 9:40 [PATCH 00/13] reindexing, feeds, date fixes Eric Wong (Contractor, The Linux Foundation)
` (10 preceding siblings ...)
2018-03-22 9:40 ` [PATCH 11/13] v2writable: clarify header cleanups Eric Wong (Contractor, The Linux Foundation)
@ 2018-03-22 9:40 ` Eric Wong (Contractor, The Linux Foundation)
2018-03-22 9:40 ` [PATCH 13/13] feed: $INBOX/new.atom endpoint supports v2 inboxes Eric Wong (Contractor, The Linux Foundation)
12 siblings, 0 replies; 15+ messages in thread
From: Eric Wong (Contractor, The Linux Foundation) @ 2018-03-22 9:40 UTC (permalink / raw)
To: meta
The File::Temp API is a bit tricky and needs TMPDIR explicitly
enabled if a template is given.
---
lib/PublicInbox/V2Writable.pm | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 44b5528..20c2736 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -432,10 +432,10 @@ sub diff ($$$) {
use File::Temp qw(tempfile);
use PublicInbox::Spawn qw(spawn);
- my ($ah, $an) = tempfile('email-cur-XXXXXXXX');
+ my ($ah, $an) = tempfile('email-cur-XXXXXXXX', TMPDIR => 1);
print $ah $cur->as_string or die "print: $!";
close $ah or die "close: $!";
- my ($bh, $bn) = tempfile('email-new-XXXXXXXX');
+ my ($bh, $bn) = tempfile('email-new-XXXXXXXX', TMPDIR => 1);
PublicInbox::Import::drop_unwanted_headers($new);
print $bh $new->as_string or die "print: $!";
close $bh or die "close: $!";
--
EW
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH 13/13] feed: $INBOX/new.atom endpoint supports v2 inboxes
2018-03-22 9:40 [PATCH 00/13] reindexing, feeds, date fixes Eric Wong (Contractor, The Linux Foundation)
` (11 preceding siblings ...)
2018-03-22 9:40 ` [PATCH 12/13] v2writable: DEBUG_DIFF respects $TMPDIR Eric Wong (Contractor, The Linux Foundation)
@ 2018-03-22 9:40 ` Eric Wong (Contractor, The Linux Foundation)
12 siblings, 0 replies; 15+ messages in thread
From: Eric Wong (Contractor, The Linux Foundation) @ 2018-03-22 9:40 UTC (permalink / raw)
To: meta
We can no longer rely on tree name lookups for v2. This also
optimizes v1 by relying on git blob object_id lookups while
avoiding process spawning overhead for "git log".
---
lib/PublicInbox/Feed.pm | 66 +++++++++++++++++++++++++++++--------------------
1 file changed, 39 insertions(+), 27 deletions(-)
diff --git a/lib/PublicInbox/Feed.pm b/lib/PublicInbox/Feed.pm
index c32e7bd..3277b09 100644
--- a/lib/PublicInbox/Feed.pm
+++ b/lib/PublicInbox/Feed.pm
@@ -12,15 +12,15 @@ use PublicInbox::WwwAtomStream;
# main function
sub generate {
my ($ctx) = @_;
- my @paths;
- each_recent_blob($ctx, sub { push @paths, $_[0] });
- return _no_thread() unless @paths;
+ my @oids;
+ each_recent_blob($ctx, sub { push @oids, $_[0] });
+ return _no_thread() unless @oids;
- my $ibx = $ctx->{-inbox};
+ my $git = $ctx->{-inbox}->git;
PublicInbox::WwwAtomStream->response($ctx, 200, sub {
- while (my $path = shift @paths) {
- my $mime = do_cat_mail($ibx, $path) or next;
- return $mime;
+ while (my $oid = shift @oids) {
+ my $msg = $git->cat_file($oid) or next;
+ return PublicInbox::MIME->new($msg);
}
});
}
@@ -63,25 +63,27 @@ sub generate_html_index {
sub new_html {
my ($ctx) = @_;
- my @paths;
+ die "BUG: new_html is not used with search" if $ctx->{srch};
+ my @oids;
my (undef, $last) = each_recent_blob($ctx, sub {
- my ($path, $commit, $ts, $u, $subj) = @_;
+ my ($oid, $commit, $ts, $u, $subj) = @_;
$ctx->{first} ||= $commit;
- push @paths, $path;
+ push @oids, $oid;
});
- if (!@paths) {
+ if (!@oids) {
return [404, ['Content-Type', 'text/plain'],
["No messages, yet\n"] ];
}
$ctx->{-html_tip} = '<pre>';
$ctx->{-upfx} = '';
$ctx->{-hr} = 1;
+ my $git = $ctx->{-inbox}->git;
PublicInbox::WwwStream->response($ctx, 200, sub {
- while (my $path = shift @paths) {
- my $m = do_cat_mail($ctx->{-inbox}, $path) or next;
- my $more = scalar @paths;
- my $s = PublicInbox::View::index_entry($m, $ctx, $more);
- return $s;
+ while (my $oid = shift @oids) {
+ my $msg = $git->cat_file($oid) or next;
+ my $m = PublicInbox::MIME->new($msg);
+ my $more = scalar @oids;
+ return PublicInbox::View::index_entry($m, $ctx, $more);
}
new_html_footer($ctx, $last);
});
@@ -111,10 +113,26 @@ sub new_html_footer {
sub each_recent_blob {
my ($ctx, $cb) = @_;
- my $max = $ctx->{-inbox}->{feedmax};
+ my $ibx = $ctx->{-inbox};
+ my $max = $ibx->{feedmax};
+ my $v = $ibx->{version} || 1;
+ if ($v == 2) {
+ wantarray and die "each_recent_blob return ignored for v2";
+ } elsif ($v != 1) {
+ die "BUG: unsupported inbox version: $v\n";
+ }
+ if (my $srch = $ibx->search) {
+ my $res = $srch->query('', { limit => $max });
+ foreach my $smsg (@{$res->{msgs}}) {
+ # search-enabled callers do not need author/date/subject
+ $cb->($smsg->{blob});
+ }
+ return;
+ }
+
my $hex = '[a-f0-9]';
- my $addmsg = qr!^:000000 100644 \S+ \S+ A\t(${hex}{2}/${hex}{38})$!;
- my $delmsg = qr!^:100644 000000 \S+ \S+ D\t(${hex}{2}/${hex}{38})$!;
+ my $addmsg = qr!^:000000 100644 \S+ (\S+) A\t${hex}{2}/${hex}{38}$!;
+ my $delmsg = qr!^:100644 000000 (\S+) \S+ D\t(${hex}{2}/${hex}{38})$!;
my $refhex = qr/(?:HEAD|${hex}{4,40})(?:~\d+)?/;
my $qp = $ctx->{qp};
@@ -128,9 +146,9 @@ sub each_recent_blob {
# get recent messages
# we could use git log -z, but, we already know ssoma will not
# leave us with filenames with spaces in them..
- my $log = $ctx->{-inbox}->git->popen(qw/log
+ my $log = $ibx->git->popen(qw/log
--no-notes --no-color --raw -r
- --abbrev=16 --abbrev-commit/,
+ --no-abbrev --abbrev-commit/,
"--format=%h%x00%ct%x00%an%x00%s%x00",
$range);
my %deleted; # only an optimization at this point
@@ -172,10 +190,4 @@ sub each_recent_blob {
($first_commit, $last_commit);
}
-sub do_cat_mail {
- my ($ibx, $path) = @_;
- my $mime = eval { $ibx->msg_by_path($path) } or return;
- PublicInbox::MIME->new($mime);
-}
-
1;
--
EW
^ permalink raw reply related [flat|nested] 15+ messages in thread