unofficial mirror of meta@public-inbox.org
 help / color / mirror / Atom feed
From: "Eric Wong (Contractor, The Linux Foundation)" <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH 07/27] v2writable: implement remove correctly
Date: Mon, 19 Mar 2018 08:14:39 +0000	[thread overview]
Message-ID: <20180319081459.10645-8-e@80x24.org> (raw)
In-Reply-To: <20180319081459.10645-1-e@80x24.org>

We need to hide removals from anybody hitting the search engine.
---
 lib/PublicInbox/Msgmap.pm            |  8 ++++++
 lib/PublicInbox/SearchIdx.pm         | 32 ++++++++++++++++++++++
 lib/PublicInbox/SearchIdxPart.pm     |  8 ++++++
 lib/PublicInbox/SearchIdxSkeleton.pm | 18 +++++++++++++
 lib/PublicInbox/SearchMsg.pm         |  4 ++-
 lib/PublicInbox/V2Writable.pm        | 51 ++++++++++++++++++++++++++++--------
 t/v2writable.t                       | 18 ++++++++++++-
 7 files changed, 126 insertions(+), 13 deletions(-)

diff --git a/lib/PublicInbox/Msgmap.pm b/lib/PublicInbox/Msgmap.pm
index a147b9f..8e81fba 100644
--- a/lib/PublicInbox/Msgmap.pm
+++ b/lib/PublicInbox/Msgmap.pm
@@ -140,6 +140,14 @@ sub mid_delete {
 	$sth->execute;
 }
 
+sub num_delete {
+	my ($self, $num) = @_;
+	my $dbh = $self->{dbh};
+	my $sth = $dbh->prepare('DELETE FROM msgmap WHERE num = ?');
+	$sth->bind_param(1, $num);
+	$sth->execute;
+}
+
 sub create_tables {
 	my ($dbh) = @_;
 	my $e;
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index ccec018..ae2544d 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -440,6 +440,31 @@ sub remove_message {
 	}
 }
 
+# MID is a hint in V2
+sub remove_by_oid {
+	my ($self, $oid, $mid) = @_;
+	my $db = $self->{xdb};
+
+	# XXX careful, we cannot use batch_do here since we conditionally
+	# delete documents based on other factors, so we cannot call
+	# find_doc_ids twice.
+	my ($head, $tail) = $self->find_doc_ids('Q' . $mid);
+	return if $head == $tail;
+
+	# there is only ONE element in @delete unless we
+	# have bugs in our v2writable deduplication check
+	my @delete;
+	for (; $head != $tail; $head->inc) {
+		my $docid = $head->get_docid;
+		my $doc = $db->get_document($docid);
+		my $smsg = PublicInbox::SearchMsg->wrap($doc, $mid);
+		$smsg->load_expand;
+		push(@delete, $docid) if $smsg->{blob} eq $oid;
+	}
+	$db->delete_document($_) foreach @delete;
+	scalar(@delete);
+}
+
 sub term_generator { # write-only
 	my ($self) = @_;
 
@@ -896,4 +921,11 @@ sub remote_close {
 	$? == 0 or die ref($self)." pid:$pid exited with: $?";
 }
 
+# triggers remove_by_oid in partition or skeleton
+sub remote_remove {
+	my ($self, $oid, $mid) = @_;
+	print { $self->{w} } "D $oid $mid\n" or
+			die "failed to write remove $!";
+}
+
 1;
diff --git a/lib/PublicInbox/SearchIdxPart.pm b/lib/PublicInbox/SearchIdxPart.pm
index dd7ace6..c166078 100644
--- a/lib/PublicInbox/SearchIdxPart.pm
+++ b/lib/PublicInbox/SearchIdxPart.pm
@@ -54,6 +54,14 @@ sub partition_worker_loop ($$$) {
 			$txn = undef;
 			print { $self->{skeleton}->{w} } "barrier $part\n" or
 					die "write failed to skeleton: $!\n";
+		} elsif ($line =~ /\AD ([a-f0-9]{40,}) (.+)\n\z/s) {
+			my ($oid, $mid) = ($1, $2);
+			$xdb ||= $self->_xdb_acquire;
+			if (!$txn) {
+				$xdb->begin_transaction;
+				$txn = 1;
+			}
+			$self->remove_by_oid($oid, $mid);
 		} else {
 			chomp $line;
 			my ($len, $artnum, $oid, $mid0) = split(/ /, $line);
diff --git a/lib/PublicInbox/SearchIdxSkeleton.pm b/lib/PublicInbox/SearchIdxSkeleton.pm
index 4cb10f5..beb17b9 100644
--- a/lib/PublicInbox/SearchIdxSkeleton.pm
+++ b/lib/PublicInbox/SearchIdxSkeleton.pm
@@ -73,6 +73,14 @@ sub skeleton_worker_loop {
 				print $barrier_note "barrier_done\n" or die
 					"print failed to barrier note: $!";
 			}
+		} elsif ($line =~ /\AD ([a-f0-9]{40,}) (.*)\n\z/s) {
+			my ($oid, $mid) = ($1, $2);
+			$xdb ||= $self->_xdb_acquire;
+			if (!$txn) {
+				$xdb->begin_transaction;
+				$txn = 1;
+			}
+			$self->remove_by_oid($oid, $mid);
 		} else {
 			my $len = int($line);
 			my $n = read($r, my $msg, $len) or die "read: $!\n";
@@ -110,6 +118,16 @@ sub index_skeleton {
 	die "print failed: $err\n" if $err;
 }
 
+sub remote_remove {
+	my ($self, $oid, $mid) = @_;
+	my $err;
+	$self->_lock_acquire;
+	eval { $self->SUPER::remote_remove($oid, $mid) };
+	$err = $@;
+	$self->_lock_release;
+	die $err if $err;
+}
+
 # values: [ TS, NUM, BYTES, LINES, MID, XPATH, doc_data ]
 sub index_skeleton_real ($$) {
 	my ($self, $values) = @_;
diff --git a/lib/PublicInbox/SearchMsg.pm b/lib/PublicInbox/SearchMsg.pm
index 23478a2..a1cd0c2 100644
--- a/lib/PublicInbox/SearchMsg.pm
+++ b/lib/PublicInbox/SearchMsg.pm
@@ -64,7 +64,9 @@ sub load_doc {
 # :bytes and :lines metadata in RFC 3977
 sub bytes ($) { get_val($_[0]->{doc}, &PublicInbox::Search::BYTES) }
 sub lines ($) { get_val($_[0]->{doc}, &PublicInbox::Search::LINES) }
-sub num ($) { get_val($_[0]->{doc}, &PublicInbox::Search::NUM) }
+sub num ($) {
+	$_[0]->{num} ||= get_val($_[0]->{doc}, PublicInbox::Search::NUM)
+}
 
 sub __hdr ($$) {
 	my ($self, $field) = @_;
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index e673c25..656f069 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -199,18 +199,47 @@ sub idx_init {
 }
 
 sub remove {
-	my ($self, $mime, $msg) = @_;
-	my $existing = $self->lookup_content($mime) or return;
-
-	# don't touch ghosts or already junked messages
-	return unless $existing->type eq 'mail';
-
-	# always write removals to the current (latest) git repo since
-	# we process chronologically
+	my ($self, $mime, $cmt_msg) = @_;
+	$self->barrier;
+	$self->idx_init;
 	my $im = $self->importer;
-	my ($cmt, undef) = $im->remove($mime, $msg);
-	$cmt = $im->get_mark($cmt);
-	$self->unindex_msg($existing, $cmt);
+	my $ibx = $self->{-inbox};
+	my $srch = $ibx->search;
+	my $cid = content_id($mime);
+	my $skel = $self->{skel};
+	my $parts = $self->{idx_parts};
+	my $mm = $skel->{mm};
+	my $removed;
+	my $mids = mids($mime->header_obj);
+	foreach my $mid (@$mids) {
+		$srch->reopen->each_smsg_by_mid($mid, sub {
+			my ($smsg) = @_;
+			$smsg->load_expand;
+			my $msg = $ibx->msg_by_smsg($smsg);
+			if (!defined($msg)) {
+				warn "broken smsg for $mid\n";
+				return 1; # continue
+			}
+			my $cur = PublicInbox::MIME->new($msg);
+			if (content_id($cur) eq $cid) {
+				$mm->num_delete($smsg->num);
+				# $removed should only be set once assuming
+				# no bugs in our deduplication code:
+				$removed = $smsg;
+				$removed->{mime} = $cur;
+				$im->remove($cur, $cmt_msg);
+				$removed->num; # memoize this for callers
+
+				my $oid = $smsg->{blob};
+				foreach my $idx (@$parts, $skel) {
+					$idx->remote_remove($oid, $mid);
+				}
+			}
+			1; # continue
+		});
+		$self->barrier;
+	}
+	$removed;
 }
 
 sub done {
diff --git a/t/v2writable.t b/t/v2writable.t
index 7d276da..6e37b72 100644
--- a/t/v2writable.t
+++ b/t/v2writable.t
@@ -36,12 +36,13 @@ my $im = eval {
 };
 is($im->{partitions}, 1, 'one partition when forced');
 ok($im->add($mime), 'ordinary message added');
+my $git0;
 
 if ('ensure git configs are correct') {
 	my @cmd = (qw(git config), "--file=$mainrepo/all.git/config",
 		qw(core.sharedRepository 0644));
 	is(system(@cmd), 0, "set sharedRepository in all.git");
-	my $git0 = PublicInbox::Git->new("$mainrepo/git/0.git");
+	$git0 = PublicInbox::Git->new("$mainrepo/git/0.git");
 	my $fh = $git0->popen(qw(config core.sharedRepository));
 	my $v = eval { local $/; <$fh> };
 	chomp $v;
@@ -189,8 +190,23 @@ EOF
 };
 {
 	local $ENV{NPROC} = 2;
+	my @before = $git0->qx(qw(log --pretty=oneline));
 	$im = PublicInbox::V2Writable->new($ibx, 1);
 	is($im->{partitions}, 1, 'detected single partition from previous');
+	my $smsg = $im->remove($mime, 'test removal');
+	my @after = $git0->qx(qw(log --pretty=oneline));
+	$im->done;
+	my $tip = shift @after;
+	like($tip, qr/\A[a-f0-9]+ test removal\n\z/s,
+		'commit message propaged to git');
+	is_deeply(\@after, \@before, 'only one commit written to git');
+	is($ibx->mm->num_for($smsg->mid), undef, 'no longer in Msgmap by mid');
+	like($smsg->num, qr/\A\d+\z/, 'numeric number in return message');
+	is($ibx->mm->mid_for($smsg->num), undef, 'no longer in Msgmap by num');
+	my $srch = $ibx->search->reopen;
+	my @found = ();
+	$srch->each_smsg_by_mid($smsg->mid, sub { push @found, @_; 1 });
+	is(scalar(@found), 0, 'no longer found in Xapian skeleton');
 }
 
 done_testing();
-- 
EW


  parent reply	other threads:[~2018-03-19  8:15 UTC|newest]

Thread overview: 29+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-03-19  8:14 [PATCH 00/27] v2 public-inbox-watch support Eric Wong (Contractor, The Linux Foundation)
2018-03-19  8:14 ` [PATCH 01/27] content_id: use Sender header if From is not available Eric Wong (Contractor, The Linux Foundation)
2018-03-19  8:14 ` [PATCH 02/27] v2writable: support "barrier" operation to avoid reforking Eric Wong (Contractor, The Linux Foundation)
2018-03-19  8:14 ` [PATCH 03/27] use string ref for Email::Simple->new Eric Wong (Contractor, The Linux Foundation)
2018-03-19  8:14 ` [PATCH 04/27] v2writable: remove unnecessary idx_init call Eric Wong (Contractor, The Linux Foundation)
2018-03-19  8:14 ` [PATCH 05/27] searchidx: do not delete documents while iterating Eric Wong (Contractor, The Linux Foundation)
2018-03-19  8:14 ` [PATCH 06/27] search: allow ->reopen to be chainable Eric Wong (Contractor, The Linux Foundation)
2018-03-19  8:14 ` Eric Wong (Contractor, The Linux Foundation) [this message]
2018-03-19  8:14 ` [PATCH 08/27] skeleton: barrier init requires a lock Eric Wong (Contractor, The Linux Foundation)
2018-03-19  8:14 ` [PATCH 09/27] import: (v2) delete writes the blob into history in subdir Eric Wong (Contractor, The Linux Foundation)
2018-03-19  8:14 ` [PATCH 10/27] import: (v2): write deletes to a separate '_' subdirectory Eric Wong (Contractor, The Linux Foundation)
2018-03-19  8:14 ` [PATCH 11/27] import: implement barrier operation for v1 repos Eric Wong (Contractor, The Linux Foundation)
2018-03-19  8:14 ` [PATCH 12/27] mid: mid_mime uses v2-compatible mids function Eric Wong (Contractor, The Linux Foundation)
2018-03-19  8:14 ` [PATCH 13/27] watchmaildir: use content_digest to generate Message-Id Eric Wong (Contractor, The Linux Foundation)
2018-03-19  8:14 ` [PATCH 14/27] import: force Message-ID generation for v1 here Eric Wong (Contractor, The Linux Foundation)
2018-03-19  8:14 ` [PATCH 15/27] import: switch to URL-safe Base64 for Message-IDs Eric Wong (Contractor, The Linux Foundation)
2018-03-19  8:14 ` [PATCH 16/27] v2writable: test for idempotent removals Eric Wong (Contractor, The Linux Foundation)
2018-03-19  8:14 ` [PATCH 17/27] import: enable locking under v2 Eric Wong (Contractor, The Linux Foundation)
2018-03-19  8:14 ` [PATCH 18/27] index: s/GIT_DIR/REPO_DIR/ Eric Wong (Contractor, The Linux Foundation)
2018-03-19  8:14 ` [PATCH 19/27] Lock: new base class for writable lockers Eric Wong (Contractor, The Linux Foundation)
2018-03-19  8:14 ` [PATCH 20/27] t/watch_maildir: note the reason for FIFO creation Eric Wong (Contractor, The Linux Foundation)
2018-03-19  8:14 ` [PATCH 21/27] v2writable: ensure ->done is idempotent Eric Wong (Contractor, The Linux Foundation)
2018-03-19  8:14 ` [PATCH 22/27] watchmaildir: support v2 repositories Eric Wong (Contractor, The Linux Foundation)
2018-03-19  8:14 ` [PATCH 23/27] searchidxpart: s/barrier/remote_barrier/ Eric Wong (Contractor, The Linux Foundation)
2018-03-19  8:14 ` [PATCH 24/27] v2writable: allow disabling parallelization Eric Wong (Contractor, The Linux Foundation)
2018-03-19  8:14 ` [PATCH 25/27] scripts/import_vger_from_mbox: filter out same headers as MDA Eric Wong (Contractor, The Linux Foundation)
2018-03-19  8:14 ` [PATCH 26/27] v2writable: add DEBUG_DIFF env support Eric Wong (Contractor, The Linux Foundation)
2018-03-19  8:14 ` [PATCH 27/27] v2writable: remove "resent" message for duplicate Message-IDs Eric Wong (Contractor, The Linux Foundation)
2018-03-19  8:18   ` SQUASH: " Eric Wong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20180319081459.10645-8-e@80x24.org \
    --to=e@80x24.org \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.

Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as the URLs for read-only IMAP folder(s) and NNTP newsgroup(s).