From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 5FF821F9FC for ; Wed, 17 Mar 2021 09:39:22 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH] lei_store: keywords => vmd (volatile metadata), prepare for labels Date: Wed, 17 Mar 2021 15:39:22 +0600 Message-Id: <20210317093922.2125-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: Since keywords and mailboxes (AKA labels) are separate things in JMAP, and only keywords can map reliably to Maildir and mbox, we'll keep them separate in our internal data representations, too. I initially wanted to call this just "meta" for "metadata", but that might be confused with our mailing list name. "metadata" is also already used in Xapian's own API, which would add another layer of confusion. "tags" was also considered, but would probably be confusing to notmuch users since our "labels" are analogous to "tags" in notmuch, and notmuch doesn't seem to cover "keywords" separately... So "vmd" it is, since we haven't used this particular three-letter abbreviation anywhere before, and "volatile" seems like a good description of this metadata since everything else up to this point has been mostly WORM (write-once, read-many). 
--- Documentation/public-inbox-glossary.pod | 13 ++++- lib/PublicInbox/LeiImport.pm | 6 +-- lib/PublicInbox/LeiStore.pm | 30 ++++++------ lib/PublicInbox/LeiToMail.pm | 2 +- lib/PublicInbox/SearchIdx.pm | 65 ++++++++++++++++--------- t/lei_store.t | 28 +++++------ 6 files changed, 85 insertions(+), 59 deletions(-) diff --git a/Documentation/public-inbox-glossary.pod b/Documentation/public-inbox-glossary.pod index 61e1e9f8..10b3f9d6 100644 --- a/Documentation/public-inbox-glossary.pod +++ b/Documentation/public-inbox-glossary.pod @@ -69,8 +69,8 @@ L or L Private, per-message keywords or flags as described in RFC 8621 section 10.4. These are conveyed in the C and -C headers for L, as IMAP FLAGS (RFC 3501 section 2.3.2), -or Maildir info flags. +C headers for L, as system IMAP FLAGS +(RFC 3501 section 2.3.2), or Maildir info flags. L ignores drafts and trashed (deleted) messages. L ignores trashed (deleted) messages, @@ -83,6 +83,15 @@ the same email into one or more virtual folders for ease-of-filtering. This is NOT tied to public-inbox names, as messages stored by lei may not be public. +These are similar in spirit to arbitrary freeform "tags" +in mail software such as L and non-system IMAP FLAGS. + +=item volatile metadata (VMD) + +For L users only, this refers to the combination of +keywords and labels which are subject to frequent change +independently of immutable message content. + =item IMAP INTERNALDATE, JMAP receivedAt, rt: search prefix The first valid timestamp value of Received: headers (top first). diff --git a/lib/PublicInbox/LeiImport.pm b/lib/PublicInbox/LeiImport.pm index 65e37371..137c22fc 100644 --- a/lib/PublicInbox/LeiImport.pm +++ b/lib/PublicInbox/LeiImport.pm @@ -12,7 +12,7 @@ use PublicInbox::PktOp qw(pkt_do); sub _import_eml { # MboxReader callback my ($eml, $sto, $set_kw) = @_; $sto->ipc_do('set_eml', $eml, $set_kw ? 
- @{PublicInbox::MboxReader::mbox_keywords($eml)} : ()); + { kw => PublicInbox::MboxReader::mbox_keywords($eml) } : ()); } sub import_done_wait { # dwaitpid callback @@ -150,12 +150,12 @@ error reading $input: $! sub _import_maildir { # maildir_each_eml cb my ($f, $kw, $eml, $sto, $set_kw) = @_; - $sto->ipc_do('set_eml', $eml, $set_kw ? @$kw : ()); + $sto->ipc_do('set_eml', $eml, $set_kw ? { kw => $kw }: ()); } sub _import_net { # imap_each, nntp_each cb my ($url, $uid, $kw, $eml, $sto, $set_kw) = @_; - $sto->ipc_do('set_eml', $eml, $set_kw ? @$kw : ()); + $sto->ipc_do('set_eml', $eml, $set_kw ? { kw => $kw } : ()); } sub import_path_url { diff --git a/lib/PublicInbox/LeiStore.pm b/lib/PublicInbox/LeiStore.pm index 771443db..ae263914 100644 --- a/lib/PublicInbox/LeiStore.pm +++ b/lib/PublicInbox/LeiStore.pm @@ -129,38 +129,38 @@ sub _docids_for ($$) { sort { $a <=> $b } values %docids; } -sub set_eml_keywords { - my ($self, $eml, @kw) = @_; +sub set_eml_vmd { + my ($self, $eml, $vmd) = @_; my $eidx = eidx_init($self); my @docids = _docids_for($self, $eml); for my $docid (@docids) { - $eidx->idx_shard($docid)->ipc_do('set_keywords', $docid, @kw); + $eidx->idx_shard($docid)->ipc_do('set_vmd', $docid, $vmd); } \@docids; } -sub add_eml_keywords { - my ($self, $eml, @kw) = @_; +sub add_eml_vmd { + my ($self, $eml, $vmd) = @_; my $eidx = eidx_init($self); my @docids = _docids_for($self, $eml); for my $docid (@docids) { - $eidx->idx_shard($docid)->ipc_do('add_keywords', $docid, @kw); + $eidx->idx_shard($docid)->ipc_do('add_vmd', $docid, $vmd); } \@docids; } -sub remove_eml_keywords { - my ($self, $eml, @kw) = @_; +sub remove_eml_vmd { + my ($self, $eml, $vmd) = @_; my $eidx = eidx_init($self); my @docids = _docids_for($self, $eml); for my $docid (@docids) { - $eidx->idx_shard($docid)->ipc_do('remove_keywords', $docid, @kw) + $eidx->idx_shard($docid)->ipc_do('remove_vmd', $docid, $vmd); } \@docids; } sub add_eml { - my ($self, $eml, @kw) = @_; + my ($self, $eml, $vmd) = 
@_; my $im = $self->importer; # may create new epoch my $eidx = eidx_init($self); # writes ALL.git/objects/info/alternates my $oidx = $eidx->{oidx}; @@ -174,7 +174,7 @@ sub add_eml { $oidx->add_xref3($docid, -1, $smsg->{blob}, '.'); # add_eidx_info for List-Id $idx->ipc_do('add_eidx_info', $docid, '.', $eml); - $idx->ipc_do('add_keywords', $docid, @kw) if @kw; + $idx->ipc_do('add_vmd', $docid, $vmd) if $vmd; } \@docids; } else { @@ -183,14 +183,14 @@ sub add_eml { $oidx->add_xref3($smsg->{num}, -1, $smsg->{blob}, '.'); my $idx = $eidx->idx_shard($smsg->{num}); $idx->index_eml($eml, $smsg); - $idx->ipc_do('add_keywords', $smsg->{num}, @kw) if @kw; + $idx->ipc_do('add_vmd', $smsg->{num}, $vmd ) if $vmd; $smsg; } } sub set_eml { - my ($self, $eml, @kw) = @_; - add_eml($self, $eml, @kw) // set_eml_keywords($self, $eml, @kw); + my ($self, $eml, $vmd) = @_; + add_eml($self, $eml, $vmd) // set_eml_vmd($self, $eml, $vmd); } sub add_eml_maybe { @@ -207,7 +207,7 @@ sub set_xkw { if ($lxs->xids_for($eml, 1)) { # is it in a local external? 
# TODO: index keywords only } else { - set_eml($self, $eml, @$kw); + set_eml($self, $eml, { kw => $kw }); } } diff --git a/lib/PublicInbox/LeiToMail.pm b/lib/PublicInbox/LeiToMail.pm index 27e1338f..5cea73e1 100644 --- a/lib/PublicInbox/LeiToMail.pm +++ b/lib/PublicInbox/LeiToMail.pm @@ -277,7 +277,7 @@ sub update_kw_maybe ($$$$) { return unless $lse; my $x = $lse->kw_changed($eml, $kw); if ($x) { - $lei->{sto}->ipc_do('set_eml', $eml, @$kw); + $lei->{sto}->ipc_do('set_eml', $eml, { kw => $kw }); } elsif (!defined($x)) { $lei->{sto}->ipc_do('set_xkw', $eml, $kw); } diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 772f5a64..e2a1a678 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -528,44 +528,61 @@ sub remove_eidx_info { $self->{xdb}->replace_document($docid, $doc); } -sub set_keywords { - my ($self, $docid, @kw) = @_; +my @VMD_MAP = (kw => 'K', label => 'L'); + +sub set_vmd { + my ($self, $docid, $vmd) = @_; begin_txn_lazy($self); my $doc = _get_doc($self, $docid) or return; - my %keep = map { $_ => 1 } @kw; - my %add = %keep; - my @rm; - my $end = $doc->termlist_end; - for (my $cur = $doc->termlist_begin; $cur != $end; $cur++) { - $cur->skip_to('K'); - last if $cur == $end; - my $kw = $cur->get_termname; - $kw =~ s/\AK//s or next; - $keep{$kw} ? delete($add{$kw}) : push(@rm, $kw); + my ($end, @rm, @add); + my @x = @VMD_MAP; + while (my ($field, $pfx) = splice(@x, 0, 2)) { + my $set = $vmd->{$field} // next; + my %keep = map { $_ => 1 } @$set; + my %add = %keep; + $end //= $doc->termlist_end; + for (my $cur = $doc->termlist_begin; $cur != $end; $cur++) { + $cur->skip_to($pfx); + last if $cur == $end; + my $v = $cur->get_termname; + $v =~ s/\A$pfx//s or next; + $keep{$v} ? 
delete($add{$v}) : push(@rm, $pfx.$v); + } + push(@add, map { $pfx.$_ } keys %add); } - return unless (scalar(@rm) + scalar(keys %add)); - $doc->remove_term('K'.$_) for @rm; - $doc->add_boolean_term('K'.$_) for (keys %add); + return unless scalar(@rm) || scalar(@add); + $doc->remove_term($_) for @rm; + $doc->add_boolean_term($_) for @add; $self->{xdb}->replace_document($docid, $doc); } -sub add_keywords { - my ($self, $docid, @kw) = @_; +sub add_vmd { + my ($self, $docid, $vmd) = @_; begin_txn_lazy($self); my $doc = _get_doc($self, $docid) or return; - $doc->add_boolean_term('K'.$_) for @kw; + my @x = @VMD_MAP; + while (my ($field, $pfx) = splice(@x, 0, 2)) { + my $add = $vmd->{$field} // next; + $doc->add_boolean_term($pfx . $_) for @$add; + } $self->{xdb}->replace_document($docid, $doc); } -sub remove_keywords { - my ($self, $docid, @kw) = @_; +sub remove_vmd { + my ($self, $docid, $vmd) = @_; begin_txn_lazy($self); my $doc = _get_doc($self, $docid) or return; my $replace; - eval { - $doc->remove_term('K'.$_); - $replace = 1 - } for @kw; + my @x = @VMD_MAP; + while (my ($field, $pfx) = splice(@x, 0, 2)) { + my $rm = $vmd->{$field} // next; + for (@$rm) { + eval { + $doc->remove_term($pfx . 
$_); + $replace = 1; + }; + } + } $self->{xdb}->replace_document($docid, $doc) if $replace; } diff --git a/t/lei_store.t b/t/lei_store.t index d270e1f6..024ff527 100644 --- a/t/lei_store.t +++ b/t/lei_store.t @@ -36,37 +36,37 @@ $sto->done; for my $parallel (0, 1) { $sto->{priv_eidx}->{parallel} = $parallel; - my $docids = $sto->set_eml_keywords($eml, qw(seen draft)); + my $docids = $sto->set_eml_vmd($eml, { kw => [ qw(seen draft) ] }); is(scalar @$docids, 1, 'set keywords on one doc'); $sto->done; my @kw = $sto->search->msg_keywords($docids->[0]); is_deeply(\@kw, [qw(draft seen)], 'kw matches'); - $docids = $sto->add_eml_keywords($eml, qw(seen draft)); + $docids = $sto->add_eml_vmd($eml, {kw => [qw(seen draft)]}); $sto->done; is(scalar @$docids, 1, 'idempotently added keywords to doc'); @kw = $sto->search->msg_keywords($docids->[0]); is_deeply(\@kw, [qw(draft seen)], 'kw matches after noop'); - $docids = $sto->remove_eml_keywords($eml, qw(seen draft)); + $docids = $sto->remove_eml_vmd($eml, {kw => [qw(seen draft)]}); is(scalar @$docids, 1, 'removed from one doc'); $sto->done; @kw = $sto->search->msg_keywords($docids->[0]); is_deeply(\@kw, [], 'kw matches after remove'); - $docids = $sto->remove_eml_keywords($eml, qw(answered)); + $docids = $sto->remove_eml_vmd($eml, {kw=> [qw(answered)]}); is(scalar @$docids, 1, 'removed from one doc (idempotently)'); $sto->done; @kw = $sto->search->msg_keywords($docids->[0]); is_deeply(\@kw, [], 'kw matches after remove (idempotent)'); - $docids = $sto->add_eml_keywords($eml, qw(answered)); + $docids = $sto->add_eml_vmd($eml, {kw => [qw(answered)]}); is(scalar @$docids, 1, 'added to empty doc'); $sto->done; @kw = $sto->search->msg_keywords($docids->[0]); is_deeply(\@kw, ['answered'], 'kw matches after add'); - $docids = $sto->set_eml_keywords($eml); + $docids = $sto->set_eml_vmd($eml, { kw => [] }); is(scalar @$docids, 1, 'set to clobber'); $sto->done; @kw = $sto->search->msg_keywords($docids->[0]); @@ -74,11 +74,11 @@ for my 
$parallel (0, 1) { my $set = eml_load('t/plack-qp.eml'); $set->header_set('Message-ID', ""); - my $ret = $sto->set_eml($set, 'seen'); + my $ret = $sto->set_eml($set, { kw => [ 'seen' ] }); is(ref $ret, 'PublicInbox::Smsg', 'initial returns smsg'); - my $ids = $sto->set_eml($set, qw(seen)); + my $ids = $sto->set_eml($set, { kw => [ 'seen' ] }); is_deeply($ids, [ $ret->{num} ], 'set_eml idempotent'); - $ids = $sto->set_eml($set, qw(seen answered)); + $ids = $sto->set_eml($set, { kw => [ qw(seen answered) ] }); is_deeply($ids, [ $ret->{num} ], 'set_eml to change kw'); $sto->done; @kw = $sto->search->msg_keywords($ids->[0]); @@ -91,23 +91,23 @@ SKIP: { $eml->header_set('Message-ID', ''); my $pid = $sto->ipc_worker_spawn('lei-store'); ok($pid > 0, 'got a worker'); - my $smsg = $sto->ipc_do('set_eml', $eml, qw(seen)); + my $smsg = $sto->ipc_do('set_eml', $eml, { kw => [ qw(seen) ] }); is(ref($smsg), 'PublicInbox::Smsg', 'set_eml works over ipc'); - my $ids = $sto->ipc_do('set_eml', $eml, qw(seen)); + my $ids = $sto->ipc_do('set_eml', $eml, { kw => [ qw(seen) ] }); is_deeply($ids, [ $smsg->{num} ], 'docid returned'); $eml->header_set('Message-ID'); - my $no_mid = $sto->ipc_do('set_eml', $eml, qw(seen)); + my $no_mid = $sto->ipc_do('set_eml', $eml, { kw => [ qw(seen) ] }); my $wait = $sto->ipc_do('done'); my @kw = $sto->search->msg_keywords($no_mid->{num}); is_deeply(\@kw, [qw(seen)], 'ipc set changed kw'); is(ref($smsg), 'PublicInbox::Smsg', 'no mid works ipc'); - $ids = $sto->ipc_do('set_eml', $eml, qw(seen)); + $ids = $sto->ipc_do('set_eml', $eml, { kw => [ qw(seen) ] }); is_deeply($ids, [ $no_mid->{num} ], 'docid returned w/o mid w/ ipc'); $sto->ipc_do('done'); $sto->ipc_worker_stop; - $ids = $sto->ipc_do('set_eml', $eml, qw(seen answered)); + $ids = $sto->ipc_do('set_eml', $eml, { kw => [ qw(seen answered) ] }); is_deeply($ids, [ $no_mid->{num} ], 'docid returned w/o mid w/o ipc'); $wait = $sto->ipc_do('done');