From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.1 (2015-04-28) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.1 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 379F41F597; Sun, 5 Aug 2018 08:19:25 +0000 (UTC) Date: Sun, 5 Aug 2018 08:19:25 +0000 From: Eric Wong To: meta@public-inbox.org Cc: Konstantin Ryabitsev Subject: [RFC] overidx: preserve `tid' column on re-indexing Message-ID: <20180805081925.ypej6lcxtswdtdow@dcvr> References: <20180803182647.GA28438@chatter> <20180803192056.5swqcf67bsdxbpg6@dcvr> <20180805060440.fhl7zvyis246e3ym@dcvr> MIME-Version: 1.0 Content-Type: text/plain; charset=utf-8 Content-Disposition: inline In-Reply-To: <20180805060440.fhl7zvyis246e3ym@dcvr> List-Id: Eric Wong wrote: > While working on this, I noticed the backwards --reindex walk > breaks `tid' on v1 repositories, at least. That bug was hidden > by the Subject: match logic and not discovered until now. It > will be fixed separately. Lightly tested, but seems to make sense... Reindexing http://czquwvybam4bgbro.onion/git/ now... -------8<------- Subject: [RFC] overidx: preserve `tid' column on re-indexing Otherwise, walking backwards through history could cause the root message in a thread to lose its `tid', which prevents the thread's messages from being looked up by that `tid'. This bug was hidden by the fact that `sid' matches were often good enough to link threads together. 
--- lib/PublicInbox/OverIdx.pm | 11 +++++++++-- t/search-thr-index.t | 40 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 2 deletions(-) diff --git a/lib/PublicInbox/OverIdx.pm b/lib/PublicInbox/OverIdx.pm index 62fec0d..cc9bd7d 100644 --- a/lib/PublicInbox/OverIdx.pm +++ b/lib/PublicInbox/OverIdx.pm @@ -79,8 +79,15 @@ sub mid2id { } sub delete_by_num { - my ($self, $num) = @_; + my ($self, $num, $tid_ref) = @_; my $dbh = $self->{dbh}; + if ($tid_ref) { + my $sth = $dbh->prepare_cached(<<'', undef, 1); +SELECT tid FROM over WHERE num = ? LIMIT 1 + + $sth->execute($num); + $$tid_ref = $sth->fetchrow_array; # may be undef + } foreach (qw(over id2num)) { $dbh->prepare_cached(<<"")->execute($num); DELETE FROM $_ WHERE num = ? @@ -262,7 +269,7 @@ sub add_over { my $vivified = 0; $self->begin_lazy; - $self->delete_by_num($num); + $self->delete_by_num($num, \$old_tid); foreach my $mid (@$mids) { my $v = 0; each_by_mid($self, $mid, ['tid'], sub { diff --git a/t/search-thr-index.t b/t/search-thr-index.t index 2aa97bf..ab6d1b0 100644 --- a/t/search-thr-index.t +++ b/t/search-thr-index.t @@ -48,9 +48,49 @@ foreach (reverse split(/\n\n/, $data)) { } my $prev; +my %tids; +my $dbh = $rw->{over}->connect; foreach my $mid (@mids) { my $msgs = $rw->{over}->get_thread($mid); is(3, scalar(@$msgs), "got all messages from $mid"); + foreach my $m (@$msgs) { + my $tid = $dbh->selectrow_array(<<'', undef, $m->{num}); +SELECT tid FROM over WHERE num = ? 
LIMIT 1 + + $tids{$tid}++; + } +} + +is(scalar keys %tids, 1, 'all messages have the same tid'); + +$rw->commit_txn_lazy; + +$xdb = $rw->begin_txn_lazy; +{ + my $mime = Email::MIME->new(<<''); +Subject: [RFC 00/14] +Message-Id: <1-bw@g> +From: bw@g +To: git@vger.kernel.org + + my $dbh = $rw->{over}->connect; + my ($id, $prev); + my $reidx = $rw->{over}->next_by_mid('1-bw@g', \$id, \$prev); + ok(defined $reidx); + my $num = $reidx->{num}; + my $tid0 = $dbh->selectrow_array(<<'', undef, $num); +SELECT tid FROM over WHERE num = ? LIMIT 1 + + my $bytes = bytes::length($mime->as_string); + my $mid = mids($mime->header_obj)->[0]; + my $doc_id = $rw->add_message($mime, $bytes, $num, 'ignored', $mid); + ok($doc_id, 'message reindexed'. $mid); + is($doc_id, $num, "article number unchanged: $num"); + + my $tid1 = $dbh->selectrow_array(<<'', undef, $num); +SELECT tid FROM over WHERE num = ? LIMIT 1 + + is($tid1, $tid0, 'tid unchanged on reindex'); } $rw->commit_txn_lazy; -- EW