From: "Eric Wong (Contractor, The Linux Foundation)" <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH 07/27] v2writable: implement remove correctly
Date: Mon, 19 Mar 2018 08:14:39 +0000 [thread overview]
Message-ID: <20180319081459.10645-8-e@80x24.org> (raw)
In-Reply-To: <20180319081459.10645-1-e@80x24.org>
We need to hide removals from anybody hitting the search engine.
---
lib/PublicInbox/Msgmap.pm | 8 ++++++
lib/PublicInbox/SearchIdx.pm | 32 ++++++++++++++++++++++
lib/PublicInbox/SearchIdxPart.pm | 8 ++++++
lib/PublicInbox/SearchIdxSkeleton.pm | 18 +++++++++++++
lib/PublicInbox/SearchMsg.pm | 4 ++-
lib/PublicInbox/V2Writable.pm | 51 ++++++++++++++++++++++++++++--------
t/v2writable.t | 18 ++++++++++++-
7 files changed, 126 insertions(+), 13 deletions(-)
diff --git a/lib/PublicInbox/Msgmap.pm b/lib/PublicInbox/Msgmap.pm
index a147b9f..8e81fba 100644
--- a/lib/PublicInbox/Msgmap.pm
+++ b/lib/PublicInbox/Msgmap.pm
@@ -140,6 +140,14 @@ sub mid_delete {
$sth->execute;
}
+sub num_delete {
+ my ($self, $num) = @_;
+ my $dbh = $self->{dbh};
+ my $sth = $dbh->prepare('DELETE FROM msgmap WHERE num = ?');
+ $sth->bind_param(1, $num);
+ $sth->execute;
+}
+
sub create_tables {
my ($dbh) = @_;
my $e;
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index ccec018..ae2544d 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -440,6 +440,31 @@ sub remove_message {
}
}
+# MID is a hint in V2
+sub remove_by_oid {
+ my ($self, $oid, $mid) = @_;
+ my $db = $self->{xdb};
+
+ # XXX careful, we cannot use batch_do here since we conditionally
+ # delete documents based on other factors, so we cannot call
+ # find_doc_ids twice.
+ my ($head, $tail) = $self->find_doc_ids('Q' . $mid);
+ return if $head == $tail;
+
+ # there is only ONE element in @delete unless we
+ # have bugs in our v2writable deduplication check
+ my @delete;
+ for (; $head != $tail; $head->inc) {
+ my $docid = $head->get_docid;
+ my $doc = $db->get_document($docid);
+ my $smsg = PublicInbox::SearchMsg->wrap($doc, $mid);
+ $smsg->load_expand;
+ push(@delete, $docid) if $smsg->{blob} eq $oid;
+ }
+ $db->delete_document($_) foreach @delete;
+ scalar(@delete);
+}
+
sub term_generator { # write-only
my ($self) = @_;
@@ -896,4 +921,11 @@ sub remote_close {
$? == 0 or die ref($self)." pid:$pid exited with: $?";
}
+# triggers remove_by_oid in partition or skeleton
+sub remote_remove {
+ my ($self, $oid, $mid) = @_;
+ print { $self->{w} } "D $oid $mid\n" or
+ die "failed to write remove $!";
+}
+
1;
diff --git a/lib/PublicInbox/SearchIdxPart.pm b/lib/PublicInbox/SearchIdxPart.pm
index dd7ace6..c166078 100644
--- a/lib/PublicInbox/SearchIdxPart.pm
+++ b/lib/PublicInbox/SearchIdxPart.pm
@@ -54,6 +54,14 @@ sub partition_worker_loop ($$$) {
$txn = undef;
print { $self->{skeleton}->{w} } "barrier $part\n" or
die "write failed to skeleton: $!\n";
+ } elsif ($line =~ /\AD ([a-f0-9]{40,}) (.+)\n\z/s) {
+ my ($oid, $mid) = ($1, $2);
+ $xdb ||= $self->_xdb_acquire;
+ if (!$txn) {
+ $xdb->begin_transaction;
+ $txn = 1;
+ }
+ $self->remove_by_oid($oid, $mid);
} else {
chomp $line;
my ($len, $artnum, $oid, $mid0) = split(/ /, $line);
diff --git a/lib/PublicInbox/SearchIdxSkeleton.pm b/lib/PublicInbox/SearchIdxSkeleton.pm
index 4cb10f5..beb17b9 100644
--- a/lib/PublicInbox/SearchIdxSkeleton.pm
+++ b/lib/PublicInbox/SearchIdxSkeleton.pm
@@ -73,6 +73,14 @@ sub skeleton_worker_loop {
print $barrier_note "barrier_done\n" or die
"print failed to barrier note: $!";
}
+ } elsif ($line =~ /\AD ([a-f0-9]{40,}) (.*)\n\z/s) {
+ my ($oid, $mid) = ($1, $2);
+ $xdb ||= $self->_xdb_acquire;
+ if (!$txn) {
+ $xdb->begin_transaction;
+ $txn = 1;
+ }
+ $self->remove_by_oid($oid, $mid);
} else {
my $len = int($line);
my $n = read($r, my $msg, $len) or die "read: $!\n";
@@ -110,6 +118,16 @@ sub index_skeleton {
die "print failed: $err\n" if $err;
}
+sub remote_remove {
+ my ($self, $oid, $mid) = @_;
+ my $err;
+ $self->_lock_acquire;
+ eval { $self->SUPER::remote_remove($oid, $mid) };
+ $err = $@;
+ $self->_lock_release;
+ die $err if $err;
+}
+
# values: [ TS, NUM, BYTES, LINES, MID, XPATH, doc_data ]
sub index_skeleton_real ($$) {
my ($self, $values) = @_;
diff --git a/lib/PublicInbox/SearchMsg.pm b/lib/PublicInbox/SearchMsg.pm
index 23478a2..a1cd0c2 100644
--- a/lib/PublicInbox/SearchMsg.pm
+++ b/lib/PublicInbox/SearchMsg.pm
@@ -64,7 +64,9 @@ sub load_doc {
# :bytes and :lines metadata in RFC 3977
sub bytes ($) { get_val($_[0]->{doc}, &PublicInbox::Search::BYTES) }
sub lines ($) { get_val($_[0]->{doc}, &PublicInbox::Search::LINES) }
-sub num ($) { get_val($_[0]->{doc}, &PublicInbox::Search::NUM) }
+sub num ($) {
+ $_[0]->{num} ||= get_val($_[0]->{doc}, PublicInbox::Search::NUM)
+}
sub __hdr ($$) {
my ($self, $field) = @_;
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index e673c25..656f069 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -199,18 +199,47 @@ sub idx_init {
}
sub remove {
- my ($self, $mime, $msg) = @_;
- my $existing = $self->lookup_content($mime) or return;
-
- # don't touch ghosts or already junked messages
- return unless $existing->type eq 'mail';
-
- # always write removals to the current (latest) git repo since
- # we process chronologically
+ my ($self, $mime, $cmt_msg) = @_;
+ $self->barrier;
+ $self->idx_init;
my $im = $self->importer;
- my ($cmt, undef) = $im->remove($mime, $msg);
- $cmt = $im->get_mark($cmt);
- $self->unindex_msg($existing, $cmt);
+ my $ibx = $self->{-inbox};
+ my $srch = $ibx->search;
+ my $cid = content_id($mime);
+ my $skel = $self->{skel};
+ my $parts = $self->{idx_parts};
+ my $mm = $skel->{mm};
+ my $removed;
+ my $mids = mids($mime->header_obj);
+ foreach my $mid (@$mids) {
+ $srch->reopen->each_smsg_by_mid($mid, sub {
+ my ($smsg) = @_;
+ $smsg->load_expand;
+ my $msg = $ibx->msg_by_smsg($smsg);
+ if (!defined($msg)) {
+ warn "broken smsg for $mid\n";
+ return 1; # continue
+ }
+ my $cur = PublicInbox::MIME->new($msg);
+ if (content_id($cur) eq $cid) {
+ $mm->num_delete($smsg->num);
+ # $removed should only be set once assuming
+ # no bugs in our deduplication code:
+ $removed = $smsg;
+ $removed->{mime} = $cur;
+ $im->remove($cur, $cmt_msg);
+ $removed->num; # memoize this for callers
+
+ my $oid = $smsg->{blob};
+ foreach my $idx (@$parts, $skel) {
+ $idx->remote_remove($oid, $mid);
+ }
+ }
+ 1; # continue
+ });
+ $self->barrier;
+ }
+ $removed;
}
sub done {
diff --git a/t/v2writable.t b/t/v2writable.t
index 7d276da..6e37b72 100644
--- a/t/v2writable.t
+++ b/t/v2writable.t
@@ -36,12 +36,13 @@ my $im = eval {
};
is($im->{partitions}, 1, 'one partition when forced');
ok($im->add($mime), 'ordinary message added');
+my $git0;
if ('ensure git configs are correct') {
my @cmd = (qw(git config), "--file=$mainrepo/all.git/config",
qw(core.sharedRepository 0644));
is(system(@cmd), 0, "set sharedRepository in all.git");
- my $git0 = PublicInbox::Git->new("$mainrepo/git/0.git");
+ $git0 = PublicInbox::Git->new("$mainrepo/git/0.git");
my $fh = $git0->popen(qw(config core.sharedRepository));
my $v = eval { local $/; <$fh> };
chomp $v;
@@ -189,8 +190,23 @@ EOF
};
{
local $ENV{NPROC} = 2;
+ my @before = $git0->qx(qw(log --pretty=oneline));
$im = PublicInbox::V2Writable->new($ibx, 1);
is($im->{partitions}, 1, 'detected single partition from previous');
+ my $smsg = $im->remove($mime, 'test removal');
+ my @after = $git0->qx(qw(log --pretty=oneline));
+ $im->done;
+ my $tip = shift @after;
+ like($tip, qr/\A[a-f0-9]+ test removal\n\z/s,
+ 'commit message propaged to git');
+ is_deeply(\@after, \@before, 'only one commit written to git');
+ is($ibx->mm->num_for($smsg->mid), undef, 'no longer in Msgmap by mid');
+ like($smsg->num, qr/\A\d+\z/, 'numeric number in return message');
+ is($ibx->mm->mid_for($smsg->num), undef, 'no longer in Msgmap by num');
+ my $srch = $ibx->search->reopen;
+ my @found = ();
+ $srch->each_smsg_by_mid($smsg->mid, sub { push @found, @_; 1 });
+ is(scalar(@found), 0, 'no longer found in Xapian skeleton');
}
done_testing();
--
EW
next prev parent reply other threads:[~2018-03-19 8:15 UTC|newest]
Thread overview: 29+ messages / expand[flat|nested] mbox.gz Atom feed top
2018-03-19 8:14 [PATCH 00/27] v2 public-inbox-watch support Eric Wong (Contractor, The Linux Foundation)
2018-03-19 8:14 ` [PATCH 01/27] content_id: use Sender header if From is not available Eric Wong (Contractor, The Linux Foundation)
2018-03-19 8:14 ` [PATCH 02/27] v2writable: support "barrier" operation to avoid reforking Eric Wong (Contractor, The Linux Foundation)
2018-03-19 8:14 ` [PATCH 03/27] use string ref for Email::Simple->new Eric Wong (Contractor, The Linux Foundation)
2018-03-19 8:14 ` [PATCH 04/27] v2writable: remove unnecessary idx_init call Eric Wong (Contractor, The Linux Foundation)
2018-03-19 8:14 ` [PATCH 05/27] searchidx: do not delete documents while iterating Eric Wong (Contractor, The Linux Foundation)
2018-03-19 8:14 ` [PATCH 06/27] search: allow ->reopen to be chainable Eric Wong (Contractor, The Linux Foundation)
2018-03-19 8:14 ` Eric Wong (Contractor, The Linux Foundation) [this message]
2018-03-19 8:14 ` [PATCH 08/27] skeleton: barrier init requires a lock Eric Wong (Contractor, The Linux Foundation)
2018-03-19 8:14 ` [PATCH 09/27] import: (v2) delete writes the blob into history in subdir Eric Wong (Contractor, The Linux Foundation)
2018-03-19 8:14 ` [PATCH 10/27] import: (v2): write deletes to a separate '_' subdirectory Eric Wong (Contractor, The Linux Foundation)
2018-03-19 8:14 ` [PATCH 11/27] import: implement barrier operation for v1 repos Eric Wong (Contractor, The Linux Foundation)
2018-03-19 8:14 ` [PATCH 12/27] mid: mid_mime uses v2-compatible mids function Eric Wong (Contractor, The Linux Foundation)
2018-03-19 8:14 ` [PATCH 13/27] watchmaildir: use content_digest to generate Message-Id Eric Wong (Contractor, The Linux Foundation)
2018-03-19 8:14 ` [PATCH 14/27] import: force Message-ID generation for v1 here Eric Wong (Contractor, The Linux Foundation)
2018-03-19 8:14 ` [PATCH 15/27] import: switch to URL-safe Base64 for Message-IDs Eric Wong (Contractor, The Linux Foundation)
2018-03-19 8:14 ` [PATCH 16/27] v2writable: test for idempotent removals Eric Wong (Contractor, The Linux Foundation)
2018-03-19 8:14 ` [PATCH 17/27] import: enable locking under v2 Eric Wong (Contractor, The Linux Foundation)
2018-03-19 8:14 ` [PATCH 18/27] index: s/GIT_DIR/REPO_DIR/ Eric Wong (Contractor, The Linux Foundation)
2018-03-19 8:14 ` [PATCH 19/27] Lock: new base class for writable lockers Eric Wong (Contractor, The Linux Foundation)
2018-03-19 8:14 ` [PATCH 20/27] t/watch_maildir: note the reason for FIFO creation Eric Wong (Contractor, The Linux Foundation)
2018-03-19 8:14 ` [PATCH 21/27] v2writable: ensure ->done is idempotent Eric Wong (Contractor, The Linux Foundation)
2018-03-19 8:14 ` [PATCH 22/27] watchmaildir: support v2 repositories Eric Wong (Contractor, The Linux Foundation)
2018-03-19 8:14 ` [PATCH 23/27] searchidxpart: s/barrier/remote_barrier/ Eric Wong (Contractor, The Linux Foundation)
2018-03-19 8:14 ` [PATCH 24/27] v2writable: allow disabling parallelization Eric Wong (Contractor, The Linux Foundation)
2018-03-19 8:14 ` [PATCH 25/27] scripts/import_vger_from_mbox: filter out same headers as MDA Eric Wong (Contractor, The Linux Foundation)
2018-03-19 8:14 ` [PATCH 26/27] v2writable: add DEBUG_DIFF env support Eric Wong (Contractor, The Linux Foundation)
2018-03-19 8:14 ` [PATCH 27/27] v2writable: remove "resent" message for duplicate Message-IDs Eric Wong (Contractor, The Linux Foundation)
2018-03-19 8:18 ` SQUASH: " Eric Wong
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
List information: https://public-inbox.org/README
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20180319081459.10645-8-e@80x24.org \
--to=e@80x24.org \
--cc=meta@public-inbox.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).