From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id EBD291F4B4; Wed, 16 Dec 2020 23:04:53 +0000 (UTC) Date: Wed, 16 Dec 2020 23:04:53 +0000 From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 10/9] extsearchidx: lock eidxq on full --reindex Message-ID: <20201216230453.GA12030@dcvr> References: <20201211033740.24568-1-e@80x24.org> <20201215020224.11739-1-e@80x24.org> <20201216064038.GA4161@dcvr> MIME-Version: 1.0 Content-Type: text/plain; charset=utf-8 Content-Disposition: inline In-Reply-To: <20201216064038.GA4161@dcvr> List-Id: Eric Wong wrote: > Putting eidxq in over.sqlite3 was a bad idea when multiple > -extindex processes may run :x Nothing fatal or leading to > index corruption, just some stuff delayed or work needlessly > repeated. It's still nice to be able to resume interrupted runs, however. Final patch in this series from burning SSD hell, I hope... ------------8<------------ Subject: [PATCH] extsearchidx: lock eidxq on full --reindex Incremental indexing can use the `eidxq' reindexing queue for handling deletes and resuming interrupted indexing. Ensure those incremental -extindex invocations do not steal (and prematurely perform) work that an "-extindex --reindex" invocation is handling. --- lib/PublicInbox/ExtSearchIdx.pm | 86 +++++++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm index b5024823..f492734a 100644 --- a/lib/PublicInbox/ExtSearchIdx.pm +++ b/lib/PublicInbox/ExtSearchIdx.pm @@ -18,6 +18,8 @@ use strict; use v5.10.1; use parent qw(PublicInbox::ExtSearch PublicInbox::Lock); use Carp qw(croak carp); +use Sys::Hostname qw(hostname); +use POSIX qw(strftime); use PublicInbox::Search; use PublicInbox::SearchIdx qw(crlf_adjust prepare_stack is_ancestor is_bad_blob); @@ -524,9 +526,86 @@ sub checkpoint_due ($) { ${$sync->{need_checkpoint}} || (now() > $sync->{next_check}); } +sub host_ident () { + # I've copied FS images and only changed the hostname before, + # so prepend hostname. Use `state' since these a BOFH can change + # these while this process is running and we always want to be + # able to release locks taken by this process. + state $retval = hostname . '-' . do { + my $m; # machine-id(5) is systemd + if (open(my $fh, '<', '/etc/machine-id')) { $m = <$fh> } + # hostid(1) is in GNU coreutils, kern.hostid is FreeBSD + chomp($m ||= `hostid` || `sysctl -n kern.hostid`); + $m; + }; +} + +sub eidxq_release { + my ($self) = @_; + my $expect = delete($self->{-eidxq_locked}) or return; + my ($owner_pid, undef) = split(/-/, $expect); + return if $owner_pid != $$; # shards may fork + my $oidx = $self->{oidx}; + $oidx->begin_lazy; + my $cur = $oidx->eidx_meta('eidxq_lock') // ''; + if ($cur eq $expect) { + $oidx->eidx_meta('eidxq_lock', ''); + return 1; + } elsif ($cur ne '') { + warn "E: eidxq_lock($expect) stolen by $cur\n"; + } else { + warn "E: eidxq_lock($expect) released by another process\n"; + } + undef; +} + +sub DESTROY { + my ($self) = @_; + eidxq_release($self) and $self->{oidx}->commit_lazy; +} + +sub _eidxq_take ($) { + my ($self) = @_; + my $val = "$$-${\time}-$>-".host_ident; + $self->{oidx}->eidx_meta('eidxq_lock', $val); + $self->{-eidxq_locked} = $val; +} + +sub eidxq_lock_acquire ($) { + my ($self) = @_; + my $oidx = $self->{oidx}; + $oidx->begin_lazy; + my $cur = $oidx->eidx_meta('eidxq_lock') || return _eidxq_take($self); + if (my $locked = $self->{-eidxq_locked}) { # be lazy + return $locked if $locked eq $cur; + } + my ($pid, $time, $euid, $ident) = split(/-/, $cur, 4); + my $t = strftime('%Y-%m-%d %k:%M:%S', gmtime($time)); + if ($euid == $> && $ident eq host_ident) { + if (kill(0, $pid)) { + warn <dbh->sqlite_db_filename; + warn <{oidx}->dbh; my $tot = $dbh->selectrow_array('SELECT COUNT(*) FROM eidxq') or return; ${$sync->{nr}} = 0; @@ -719,6 +798,12 @@ sub _reindex_inbox ($$$) { sub eidx_reindex { my ($self, $sync) = @_; + # acquire eidxq_lock early because full reindex takes forever + # and incremental -extindex processes can run during our checkpoints + if (!eidxq_lock_acquire($self)) { + warn "E: aborting --reindex\n"; + return; + } for my $ibx (@{$self->{ibx_list}}) { _reindex_inbox($self, $sync, $ibx); last if $sync->{quit}; @@ -769,6 +854,7 @@ sub eidx_sync { # main entry point $self->{oidx}->rethread_done($opt) unless $sync->{quit}; eidxq_process($self, $sync) unless $sync->{quit}; + eidxq_release($self); PublicInbox::V2Writable::done($self); }