* [PATCH 1/3] lei inspect: learn "num:" and "docid:" prefixes
2021-06-17 22:00 [PATCH 0/3] lei: internal bug fixups Eric Wong
@ 2021-06-17 22:00 ` Eric Wong
2021-06-17 22:00 ` [PATCH 2/3] lei_input: prefix bare Maildir paths w/ "maildir:" Eric Wong
2021-06-17 22:00 ` [PATCH 3/3] lei/store: cull redundant docids based on blob OID Eric Wong
2 siblings, 0 replies; 4+ messages in thread
From: Eric Wong @ 2021-06-17 22:00 UTC (permalink / raw)
To: meta
"num:" is useful for inspecting Inbox-ish directories, while
"docid:" can be used for any Xapian DB (not just stuff managed
by our code).
---
lib/PublicInbox/LeiInspect.pm | 73 +++++++++++++++++++++++++++++++++++
1 file changed, 73 insertions(+)
diff --git a/lib/PublicInbox/LeiInspect.pm b/lib/PublicInbox/LeiInspect.pm
index eb2634b4..30714764 100644
--- a/lib/PublicInbox/LeiInspect.pm
+++ b/lib/PublicInbox/LeiInspect.pm
@@ -57,6 +57,75 @@ sub inspect_sync_folder ($$) {
$ent
}
+# Dump one Xapian document into a plain hashref for "lei inspect docid:N".
+#   $lei   - lei command object; {opt}->{dir} and {lse} are consulted
+#   $docid - Xapian document ID to fetch
+#   $ent   - optional hashref to fill in; an {xdb} entry (placed there by
+#            inspect_num) is removed from it and used as the DB handle
+# Returns $ent, or the result of $lei->fail when no Xapian DB is available.
+# $xdb->get_document raises if the document cannot be fetched.
+sub inspect_docid ($$;$) {
+	my ($lei, $docid, $ent) = @_;
+	require PublicInbox::Search;
+	$ent //= {};
+	my $xdb;
+	if ($xdb = delete $ent->{xdb}) { # from inspect_num
+	} elsif (defined(my $dir = $lei->{opt}->{dir})) {
+		no warnings 'once';
+		$xdb = $PublicInbox::Search::X{Database}->new($dir);
+	} else {
+		$xdb = $lei->{lse}->xdb;
+	}
+	$xdb or return $lei->fail('no Xapian DB');
+	my $doc = $xdb->get_document($docid); # raises
+	my $data = $doc->get_data;
+	$ent->{docid} = $docid;
+	$ent->{data_length} = length($data);
+	$ent->{description} = $doc->get_description;
+	$ent->{$_} = $doc->$_ for (qw(termlist_count values_count));
+
+	# group terms by their leading uppercase prefix
+	my $cur = $doc->termlist_begin;
+	my $end = $doc->termlist_end;
+	for (; $cur != $end; $cur++) {
+		my $tn = $cur->get_termname;
+		my $pfx = '';
+		# only trust $1 when the substitution matched; a failed
+		# match leaves $1 holding an earlier, unrelated capture
+		if ($tn =~ s/\A([A-Z]+)//) {
+			$pfx = $1;
+		} else {
+			warn "$tn no prefix! (???)";
+		}
+		push @{$ent->{terms}->{$pfx}}, $tn;
+	}
+	@$_ = sort(@$_) for values %{$ent->{terms} // {}};
+
+	# value slots, keyed by slot number
+	$cur = $doc->values_begin;
+	$end = $doc->values_end;
+	for (; $cur != $end; $cur++) {
+		my $n = $cur->get_valueno;
+		my $v = $cur->get_value;
+		my $iv = PublicInbox::Search::sortable_unserialise($v);
+		# prefer the decoded number when unserialise yields one
+		$v = $iv + 0 if defined $iv;
+		# not using ->[$n] since we may have large gaps in $n
+		$ent->{'values'}->{$n} = $v;
+	}
+	$ent;
+}
+
+# Resolve an article number to a Xapian docid and inspect that document
+# for "lei inspect num:N".
+#   $lei - lei command object; {opt}->{dir} selects an on-disk extindex
+#          or inbox, otherwise $lei->{lse} is used
+#   $num - article number
+# Returns the hashref built by inspect_docid (with {num} and, when the
+# over index has the article, {smsg} added), or the result of $lei->fail
+# when $dir is not a recognized layout or has no Xapian DB.
+sub inspect_num ($$) {
+	my ($lei, $num) = @_;
+	my ($docid, $ibx);
+	my $ent = { num => $num };
+	if (defined(my $dir = $lei->{opt}->{dir})) {
+		# borrow the num2docid implementation; invoked on $ibx below
+		my $num2docid = $lei->{lse}->can('num2docid');
+		if (-f "$dir/ei.lock") {
+			require PublicInbox::ExtSearch;
+			$ibx = PublicInbox::ExtSearch->new($dir);
+		} elsif (-f "$dir/inbox.lock" || -d "$dir/public-inbox") {
+			require PublicInbox::Inbox; # v2, v1
+			$ibx = bless { inboxdir => $dir }, 'PublicInbox::Inbox';
+		} else {
+			# neither layout matched: fail cleanly instead of
+			# dying on an undefined $ibx below
+			return $lei->fail("no inbox or extindex at $dir");
+		}
+		$ent->{xdb} = $ibx->xdb //
+			return $lei->fail("no Xapian DB for $dir");
+		$docid = $num2docid->($ibx, $num);
+	} else {
+		$ibx = $lei->{lse};
+		$lei->{lse}->xdb; # set {nshard} for num2docid
+		$docid = $lei->{lse}->num2docid($num);
+	}
+	if ($ibx && $ibx->over) {
+		my $smsg = $ibx->over->get_art($num);
+		# shallow copy so the result is a plain (unblessed) hash
+		$ent->{smsg} = { %$smsg } if $smsg;
+	}
+	inspect_docid($lei, $docid, $ent);
+}
+
sub inspect1 ($$$) {
my ($lei, $item, $more) = @_;
my $ent;
@@ -72,6 +141,10 @@ sub inspect1 ($$$) {
}
} elsif ($item =~ m!\A(?:maildir|mh):!i || -d $item) {
$ent = inspect_sync_folder($lei, $item);
+ } elsif ($item =~ m!\Adocid:([0-9]+)\z!) {
+ $ent = inspect_docid($lei, $1 + 0);
+ } elsif ($item =~ m!\Anum:([0-9]+)\z!) {
+ $ent = inspect_num($lei, $1 + 0);
} else { # TODO: more things
return $lei->fail("$item not understood");
}
^ permalink raw reply related [flat|nested] 4+ messages in thread
* [PATCH 3/3] lei/store: cull redundant docids based on blob OID
2021-06-17 22:00 [PATCH 0/3] lei: internal bug fixups Eric Wong
2021-06-17 22:00 ` [PATCH 1/3] lei inspect: learn "num:" and "docid:" prefixes Eric Wong
2021-06-17 22:00 ` [PATCH 2/3] lei_input: prefix bare Maildir paths w/ "maildir:" Eric Wong
@ 2021-06-17 22:00 ` Eric Wong
2 siblings, 0 replies; 4+ messages in thread
From: Eric Wong @ 2021-06-17 22:00 UTC (permalink / raw)
To: meta
I'm not sure how this happened (only once for me in March), but
it should not happen... In any case, we'll operate on the
lowest numbered docid and cull redundant index entries when
lei/store is open for read-write.
This also fixes the normal lei/store removal path to clean up
the xref3 table (since it's not done automatically for
public-facing -eidx due to the multi-list nature of it).
---
lib/PublicInbox/LeiStore.pm | 54 +++++++++++++++++++++++-------------
lib/PublicInbox/SearchIdx.pm | 2 +-
2 files changed, 36 insertions(+), 20 deletions(-)
diff --git a/lib/PublicInbox/LeiStore.pm b/lib/PublicInbox/LeiStore.pm
index f978288a..4ba1e647 100644
--- a/lib/PublicInbox/LeiStore.pm
+++ b/lib/PublicInbox/LeiStore.pm
@@ -226,6 +226,18 @@ sub _remove_if_local { # git->cat_async arg
$self->{im}->remove($bref) if $bref;
}
+# Remove the given docids from the Xapian shard, the over index and the
+# xref3 table.
+#   $self   - LeiStore object
+#   @docids - docids to remove; an empty list is a no-op after eidx_init
+sub remove_docids ($;@) {
+	my ($self, @docids) = @_;
+	my $eidx = eidx_init($self);
+	# reach the over index through $eidx, as the other subs here do
+	my $oidx = $eidx->{oidx};
+	for my $docid (@docids) {
+		$eidx->idx_shard($docid)->ipc_do('xdb_remove', $docid);
+		$oidx->delete_by_num($docid);
+		$oidx->{dbh}->do(<<EOF, undef, $docid);
+DELETE FROM xref3 WHERE docid = ?
+EOF
+	}
+}
+
# remove the entire message from the index, does not touch mail_sync.sqlite3
sub remove_eml {
my ($self, $eml) = @_;
@@ -241,13 +253,25 @@ sub remove_eml {
my $oidhex = unpack('H*', $oidbin);
$git->cat_async($oidhex, \&_remove_if_local, $self);
}
- $eidx->idx_shard($docid)->ipc_do('xdb_remove', $docid);
- $oidx->delete_by_num($docid);
}
$git->cat_async_wait;
+ remove_docids($self, @docids);
\@docids;
}
+# Map a blob OID to its docid, culling any redundant index entries.
+#   $self - LeiStore object
+#   $oid  - blob OID, passed to $eidx->{oidx}->blob_exists
+# The first docid returned by blob_exists is kept; any further docids
+# are reported via warn and removed with remove_docids.
+# Returns the kept docid.  When the blob is not indexed, returns undef in
+# scalar context and an empty list in list context, so list-context
+# callers never receive an undef element.
+sub oid2docid ($$) {
+	my ($self, $oid) = @_;
+	my $eidx = eidx_init($self);
+	my ($docid, @cull) = $eidx->{oidx}->blob_exists($oid);
+	if (@cull) { # fixup old bugs...
+		warn <<EOF;
+W: $oid indexed as multiple docids: $docid @cull, culling to fixup old bugs
+EOF
+		remove_docids($self, @cull);
+	}
+	return $docid unless wantarray;
+	defined($docid) ? ($docid) : ();
+}
+
sub add_eml {
my ($self, $eml, $vmd, $xoids) = @_;
my $im = $self->{-fake_im} // $self->importer; # may create new epoch
@@ -268,7 +292,7 @@ sub add_eml {
if (scalar keys %$xoids) {
my %docids = map { $_ => 1 } @$vivify_xvmd;
for my $oid (keys %$xoids) {
- my @id = $oidx->blob_exists($oid);
+ my @id = oid2docid($self, $oid);
@docids{@id} = @id;
}
@$vivify_xvmd = sort { $a <=> $b } keys(%docids);
@@ -356,15 +380,11 @@ sub update_xvmd {
my $oidx = $eidx->{oidx};
my %seen;
for my $oid (keys %$xoids) {
- my @docids = $oidx->blob_exists($oid) or next;
- scalar(@docids) > 1 and
- warn "W: $oid indexed as multiple docids: @docids\n";
- for my $docid (@docids) {
- next if $seen{$docid}++;
- my $idx = $eidx->idx_shard($docid);
- $idx->ipc_do('update_vmd', $docid, $vmd_mod);
- }
+ my $docid = oid2docid($self, $oid) // next;
delete $xoids->{$oid};
+ next if $seen{$docid}++;
+ my $idx = $eidx->idx_shard($docid);
+ $idx->ipc_do('update_vmd', $docid, $vmd_mod);
}
return unless scalar(keys(%$xoids));
@@ -395,15 +415,11 @@ sub set_xvmd {
# see if we can just update existing docs
for my $oid (keys %$xoids) {
- my @docids = $oidx->blob_exists($oid) or next;
- scalar(@docids) > 1 and
- warn "W: $oid indexed as multiple docids: @docids\n";
- for my $docid (@docids) {
- next if $seen{$docid}++;
- my $idx = $eidx->idx_shard($docid);
- $idx->ipc_do('set_vmd', $docid, $vmd);
- }
+ my $docid = oid2docid($self, $oid) // next;
delete $xoids->{$oid}; # all done with this oid
+ next if $seen{$docid}++;
+ my $idx = $eidx->idx_shard($docid);
+ $idx->ipc_do('set_vmd', $docid, $vmd);
}
return unless scalar(keys(%$xoids));
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index f066cc92..f553eda6 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -572,7 +572,7 @@ sub apply_vmd_mod ($$) {
my $updated = 0;
my @x = @VMD_MAP;
while (my ($field, $pfx) = splice(@x, 0, 2)) {
- # field: "label" or "kw"
+ # field: "L" or "kw"
for my $val (@{$vmd_mod->{"-$field"} // []}) {
eval {
$doc->remove_term($pfx . $val);
^ permalink raw reply related [flat|nested] 4+ messages in thread