unofficial mirror of notmuch@notmuchmail.org
 help / color / mirror / code / Atom feed
* [PATCH] Add configurable changed tag to messages that have been changed on disk
@ 2014-04-06 16:11 Gaute Hope
  2014-04-06 20:19 ` David Mazieres
  2014-07-03 10:42 ` David Bremner
  0 siblings, 2 replies; 27+ messages in thread
From: Gaute Hope @ 2014-04-06 16:11 UTC (permalink / raw)
  To: notmuch

When one of the source files for a message is changed on disk, renamed,
deleted or a new source file is added, a configurable changed tag is
added. The tag can be configured under the option 'changed_tags' in
the [new] section, the default is none. Tests have been updated to
accept the new config option.

notmuch-setup now asks for a changed tag after the new tags question.

This could be useful for, for example, 'afew' to detect remote changes in
IMAP folders and update the FolderNameFilter to also add tags or remove
tags when an _existing_ message has been added to or removed from a
maildir.
---
 notmuch-client.h    |  8 ++++++++
 notmuch-config.c    | 42 ++++++++++++++++++++++++++++++++++++++----
 notmuch-new.c       | 34 +++++++++++++++++++++++++++++-----
 notmuch-setup.c     | 17 +++++++++++++++++
 test/T030-config.sh |  1 +
 test/T040-setup.sh  |  2 ++
 6 files changed, 95 insertions(+), 9 deletions(-)

diff --git a/notmuch-client.h b/notmuch-client.h
index d110648..8b60645 100644
--- a/notmuch-client.h
+++ b/notmuch-client.h
@@ -314,6 +314,14 @@ notmuch_config_set_new_ignore (notmuch_config_t *config,
 			       const char *new_ignore[],
 			       size_t length);
 
+const char **
+notmuch_config_get_changed_tags (notmuch_config_t *config,
+			     size_t *length);
+void
+notmuch_config_set_changed_tags (notmuch_config_t *config,
+			     const char *changed_tags[],
+			     size_t length);
+
 notmuch_bool_t
 notmuch_config_get_maildir_synchronize_flags (notmuch_config_t *config);
 
diff --git a/notmuch-config.c b/notmuch-config.c
index 8d28653..5ea0635 100644
--- a/notmuch-config.c
+++ b/notmuch-config.c
@@ -51,7 +51,11 @@ static const char new_config_comment[] =
     "\n"
     "\t	NOTE: *Every* file/directory that goes by one of those\n"
     "\t	names will be ignored, independent of its depth/location\n"
-    "\t	in the mail store.\n";
+    "\t	in the mail store.\n"
+    "\n"
+    "\tchanged_tags A list (separated by ';') of tags that will be\n"
+    "\t added to messages where a one or more of its source files have\n"
+    "\t have been changed (added, renamed or deleted).\n";
 
 static const char user_config_comment[] =
     " User configuration\n"
@@ -111,10 +115,15 @@ struct _notmuch_config {
     char *user_primary_email;
     const char **user_other_email;
     size_t user_other_email_length;
+
     const char **new_tags;
     size_t new_tags_length;
     const char **new_ignore;
     size_t new_ignore_length;
+
+    const char **changed_tags;
+    size_t changed_tags_length;
+
     notmuch_bool_t maildir_synchronize_flags;
     const char **search_exclude_tags;
     size_t search_exclude_tags_length;
@@ -249,7 +258,7 @@ notmuch_config_open (void *ctx,
 	fprintf (stderr, "Out of memory.\n");
 	return NULL;
     }
-    
+
     talloc_set_destructor (config, notmuch_config_destructor);
 
     if (filename) {
@@ -273,6 +282,8 @@ notmuch_config_open (void *ctx,
     config->new_tags_length = 0;
     config->new_ignore = NULL;
     config->new_ignore_length = 0;
+    config->changed_tags = NULL;
+    config->changed_tags_length = 0;
     config->maildir_synchronize_flags = TRUE;
     config->search_exclude_tags = NULL;
     config->search_exclude_tags_length = 0;
@@ -373,6 +384,10 @@ notmuch_config_open (void *ctx,
 	notmuch_config_set_new_ignore (config, NULL, 0);
     }
 
+    if (notmuch_config_get_changed_tags (config, &tmp) == NULL) {
+      notmuch_config_set_changed_tags (config, NULL, 0);
+    }
+
     if (notmuch_config_get_search_exclude_tags (config, &tmp) == NULL) {
 	if (config->is_new) {
 	    const char *tags[] = { "deleted", "spam" };
@@ -422,7 +437,7 @@ notmuch_config_open (void *ctx,
 }
 
 /* Close the given notmuch_config_t object, freeing all resources.
- * 
+ *
  * Note: Any changes made to the configuration are *not* saved by this
  * function. To save changes, call notmuch_config_save before
  * notmuch_config_close.
@@ -631,6 +646,14 @@ notmuch_config_get_new_ignore (notmuch_config_t *config, size_t *length)
 			     &(config->new_ignore_length), length);
 }
 
+const char **
+notmuch_config_get_changed_tags (notmuch_config_t *config,   size_t *length)
+{
+    return _config_get_list (config, "new", "changed_tags",
+			     &(config->changed_tags),
+			     &(config->changed_tags_length), length);
+}
+
 void
 notmuch_config_set_user_other_email (notmuch_config_t *config,
 				     const char *list[],
@@ -658,6 +681,15 @@ notmuch_config_set_new_ignore (notmuch_config_t *config,
 		     &(config->new_ignore));
 }
 
+void
+notmuch_config_set_changed_tags (notmuch_config_t *config,
+				     const char *list[],
+				     size_t length)
+{
+    _config_set_list (config, "new", "changed_tags", list, length,
+		     &(config->changed_tags));
+}
+
 const char **
 notmuch_config_get_search_exclude_tags (notmuch_config_t *config, size_t *length)
 {
@@ -714,17 +746,19 @@ notmuch_config_command_get (notmuch_config_t *config, char *item)
     } else if (strcmp(item, "user.other_email") == 0) {
 	const char **other_email;
 	size_t i, length;
-	
+
 	other_email = notmuch_config_get_user_other_email (config, &length);
 	for (i = 0; i < length; i++)
 	    printf ("%s\n", other_email[i]);
     } else if (strcmp(item, "new.tags") == 0) {
+
 	const char **tags;
 	size_t i, length;
 
 	tags = notmuch_config_get_new_tags (config, &length);
 	for (i = 0; i < length; i++)
 	    printf ("%s\n", tags[i]);
+
     } else {
 	char **value;
 	size_t i, length;
diff --git a/notmuch-new.c b/notmuch-new.c
index 82acf69..db52961 100644
--- a/notmuch-new.c
+++ b/notmuch-new.c
@@ -50,6 +50,9 @@ typedef struct {
     const char **new_ignore;
     size_t new_ignore_length;
 
+    const char **changed_tags;
+    size_t changed_tags_length;
+
     int total_files;
     int processed_files;
     int added_messages, removed_messages, renamed_messages;
@@ -274,8 +277,12 @@ add_file (notmuch_database_t *notmuch, const char *filename,
 	break;
     /* Non-fatal issues (go on to next file). */
     case NOTMUCH_STATUS_DUPLICATE_MESSAGE_ID:
+	notmuch_message_freeze (message);
+	for (tag = state->changed_tags; *tag != NULL; tag++)
+	    notmuch_message_add_tag (message, *tag);
 	if (state->synchronize_flags)
 	    notmuch_message_maildir_flags_to_tags (message);
+	notmuch_message_thaw (message);
 	break;
     case NOTMUCH_STATUS_FILE_NOT_EMAIL:
 	fprintf (stderr, "Note: Ignoring non-mail file: %s\n", filename);
@@ -809,13 +816,25 @@ remove_filename (notmuch_database_t *notmuch,
 
     status = notmuch_database_remove_message (notmuch, path);
     if (status == NOTMUCH_STATUS_DUPLICATE_MESSAGE_ID) {
-	add_files_state->renamed_messages++;
-	if (add_files_state->synchronize_flags == TRUE)
-	    notmuch_message_maildir_flags_to_tags (message);
-	status = NOTMUCH_STATUS_SUCCESS;
+
+    add_files_state->renamed_messages++;
+
+    /* add changed tags */
+    const char **tag;
+    for (tag = add_files_state->changed_tags; *tag != NULL; tag++)
+        notmuch_message_add_tag (message, *tag);
+
+    if (add_files_state->synchronize_flags == TRUE)
+        notmuch_message_maildir_flags_to_tags (message);
+
+    status = NOTMUCH_STATUS_SUCCESS;
+
     } else if (status == NOTMUCH_STATUS_SUCCESS) {
-	add_files_state->removed_messages++;
+
+      add_files_state->removed_messages++;
+
     }
+
     notmuch_message_destroy (message);
 
   DONE:
@@ -946,8 +965,13 @@ notmuch_new_command (notmuch_config_t *config, int argc, char *argv[])
     else if (verbose)
 	add_files_state.verbosity = VERBOSITY_VERBOSE;
 
+    /* tags for added files */
     add_files_state.new_tags = notmuch_config_get_new_tags (config, &add_files_state.new_tags_length);
     add_files_state.new_ignore = notmuch_config_get_new_ignore (config, &add_files_state.new_ignore_length);
+
+    /* tags for changed files */
+    add_files_state.changed_tags = notmuch_config_get_changed_tags (config, &add_files_state.changed_tags_length);
+
     add_files_state.synchronize_flags = notmuch_config_get_maildir_synchronize_flags (config);
     db_path = notmuch_config_get_database_path (config);
 
diff --git a/notmuch-setup.c b/notmuch-setup.c
index 36a6171..cf99ee0 100644
--- a/notmuch-setup.c
+++ b/notmuch-setup.c
@@ -131,6 +131,8 @@ notmuch_setup_command (notmuch_config_t *config,
     unsigned int i;
     const char **new_tags;
     size_t new_tags_len;
+    const char **changed_tags;
+    size_t changed_tags_len;
     const char **search_exclude_tags;
     size_t search_exclude_tags_len;
 
@@ -206,6 +208,21 @@ notmuch_setup_command (notmuch_config_t *config,
 	g_ptr_array_free (tags, TRUE);
     }
 
+    changed_tags = notmuch_config_get_changed_tags (config, &changed_tags_len);
+
+    printf ("Tags to apply to all changed messages (separated by spaces) [");
+    print_tag_list (changed_tags, changed_tags_len);
+    prompt ("]: ");
+
+    if (strlen (response)) {
+	GPtrArray *tags = parse_tag_list (config, response);
+
+	notmuch_config_set_changed_tags (config, (const char **) tags->pdata,
+				     tags->len);
+
+	g_ptr_array_free (tags, TRUE);
+    }
+
 
     search_exclude_tags = notmuch_config_get_search_exclude_tags (config, &search_exclude_tags_len);
 
diff --git a/test/T030-config.sh b/test/T030-config.sh
index ca4cf33..d1e095b 100755
--- a/test/T030-config.sh
+++ b/test/T030-config.sh
@@ -52,6 +52,7 @@ user.primary_email=test_suite@notmuchmail.org
 user.other_email=test_suite_other@notmuchmail.org;test_suite@otherdomain.org
 new.tags=unread;inbox;
 new.ignore=
+new.changed_tags=
 search.exclude_tags=
 maildir.synchronize_flags=true
 foo.string=this is another string value
diff --git a/test/T040-setup.sh b/test/T040-setup.sh
index 124ef1c..7e37ea1 100755
--- a/test/T040-setup.sh
+++ b/test/T040-setup.sh
@@ -11,6 +11,7 @@ another.suite@example.com
 
 /path/to/maildir
 foo bar
+chaz
 baz
 EOF
 output=$(notmuch --config=new-notmuch-config config list)
@@ -21,6 +22,7 @@ user.primary_email=test.suite@example.com
 user.other_email=another.suite@example.com;
 new.tags=foo;bar;
 new.ignore=
+new.changed_tags=chaz;
 search.exclude_tags=baz;
 maildir.synchronize_flags=true"
 
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* Re: [PATCH] Add configurable changed tag to messages that have been changed on disk
  2014-04-06 16:11 [PATCH] Add configurable changed tag to messages that have been changed on disk Gaute Hope
@ 2014-04-06 20:19 ` David Mazieres
  2014-04-10 14:43   ` Gaute Hope
  2014-04-23 21:28   ` Austin Clements
  2014-07-03 10:42 ` David Bremner
  1 sibling, 2 replies; 27+ messages in thread
From: David Mazieres @ 2014-04-06 20:19 UTC (permalink / raw)
  To: Gaute Hope, notmuch

Gaute Hope <eg@gaute.vetsj.com> writes:

> When one of the source files for a message is changed on disk, renamed,
> deleted or a new source file is added. A configurable changed tag is
> is added. The tag can be configured under the option 'changed_tags' in
> the [new] section, the default is none. Tests have been updated to
> accept the new config option.
>
> notmuch-setup now asks for a changed tag after the new tags question.
>
> This could be useful for for example 'afew' to detect remote changes in
> IMAP folders and update the FolderNameFilter to also add tags or remove
> tags when a _existing_ message has been added to or removed from a
> maildir.

I think this is the wrong way to achieve such functionality, because
then the change tag A) is expensive to remove, B) is easy to misuse
(remember to call fsync everywhere before deleting the change tag), and
C) can be used by only one application.

A better approach would be to add a new "modtime" xapian value that is
updated whenever the tags or any other terms (such as XFDIRENTRY) are
added to or deleted from a docid.  If it's a Xapian value, rather than a
term, then modtime will be queriable just like date, allowing multiple
applications to query all docids modified since the last time they ran.

I currently have multiple applications that could significantly benefit
from such a modtime.  An obvious one is proper incremental backups with
notmuch-dump.

Another example is a tool I have that synchronizes maildirs and notmuch
tags across machines.  With the current interface, there is no way to do
this without scanning the entire database, because any message, even a
very old one, may have changed tags or links.  Moreover, something like
notmuch-dump is way, way too slow to run every time you want to check
for new mail.  notmuch-dump costs 5-10 seconds on my 110,000-message
maildir!  In fact, any approach that gathers tags associated with each
individual docid is a complete non-starter, forcing me to violate
abstraction and examine the postlists associated with each tag and
XFDIRENTRY term.  Even my highly optimized implementation takes about
250 msec (1400 msec on a 32-bit machine), which adds perceptible latency
to synchronizing my clients' notmuch maildirs with my server's when I
poll for new mail.

Yet another application is something like nottoomuch-addresses, which
currently uses an occasionally incorrect heuristic to detect new
messages based on the Date header.

Let me make a stronger statement, which is that not only are
modification times an incredibly useful and general primitive, but lack
of modification times is the single thing that kept me away from notmuch
despite years of wanting to switch.  In the end, I invested months
developing a highly-optimized change detector that efficiently diffs
Xapian's Btrees against a mysql database with a snapshot of the same
information.  My solution works, and I now enjoy a replicated notmuch
setup synchronized across three machines, including offline access on my
laptop.  But my 4,000-line C++ program might have been a 400-line shell
script if only notmuch supported docid mod times.

Also, to put this in perspective, how long does it take to remove the
changed tags from a bunch of messages?  If it's longer than 300 msec on
a 64-bit machine, then even with a single application you'd be better
off using my crazy on-the-side mysql version vector scheme.

David

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] Add configurable changed tag to messages that have been changed on disk
  2014-04-06 20:19 ` David Mazieres
@ 2014-04-10 14:43   ` Gaute Hope
  2014-04-10 15:31     ` dm-list-email-notmuch
  2014-04-23 21:28   ` Austin Clements
  1 sibling, 1 reply; 27+ messages in thread
From: Gaute Hope @ 2014-04-10 14:43 UTC (permalink / raw)
  To: David Mazieres expires 2014-07-05 CEST; +Cc: notmuch

Excerpts from David Mazieres's message of 2014-04-06 22:19:19 +0200:
> Gaute Hope <eg@gaute.vetsj.com> writes:
>
> > When one of the source files for a message is changed on disk, renamed,
> > deleted or a new source file is added. A configurable changed tag is
> > is added. The tag can be configured under the option 'changed_tags' in
> > the [new] section, the default is none. Tests have been updated to
> > accept the new config option.
> >
> > notmuch-setup now asks for a changed tag after the new tags question.
> >
> > This could be useful for for example 'afew' to detect remote changes in
> > IMAP folders and update the FolderNameFilter to also add tags or remove
> > tags when a _existing_ message has been added to or removed from a
> > maildir.
>
> I think this is the wrong way to achieve such functionality, because
> then the change tag A) is expensive to remove, B) is easy to misuse
> (remember to call fsync everywhere before deleting the change tag), and
> C) can be used by only one application.
>
> A better approach would be to add a new "modtime" xapian value that is
> updated whenever the tags or any other terms (such as XFDIRENTRY) are
> added to or deleted from a docid.  If it's a Xapian value, rather than a
> term, then modtime will be queriable just like date, allowing multiple
> applications to query all docids modified since the last time they ran.
>
> [... snip]

This could also solve it, and probably have more uses. I don't quite see
how the opposite problem (for my use case) can be solved by this without
using a 'localchange' tag. That is tag-to-maildir sync: when a
new tag has been added (by e.g. a user interaction in a client) it needs
to be copied to the maildir; if it is not done in the same go, a
different application won't know whether the change was local or remote.
How did you solve this?

I would suggest using a Xapian- or Index-time which gets a tick
every time a modification is made to the index. Atomic operations could
operate on the same time in case this distinction turns out to be
useful. Perhaps something like this already exists in Xapian? This way
clock skew, clock resolution (lots of operations happening in the same
second, msec or nanosec) problems won't be an issue. The crux will be to
make sure all write-operations trigger a tick on the indextime.

Regards, Gaute

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] Add configurable changed tag to messages that have been changed on disk
  2014-04-10 14:43   ` Gaute Hope
@ 2014-04-10 15:31     ` dm-list-email-notmuch
  2014-04-10 21:10       ` Gaute Hope
  2014-04-11 11:08       ` David Bremner
  0 siblings, 2 replies; 27+ messages in thread
From: dm-list-email-notmuch @ 2014-04-10 15:31 UTC (permalink / raw)
  To: Gaute Hope; +Cc: notmuch

Gaute Hope <eg@gaute.vetsj.com> writes:

>> A better approach would be to add a new "modtime" xapian value that is
>> updated whenever the tags or any other terms (such as XFDIRENTRY) are
>> added to or deleted from a docid.  If it's a Xapian value, rather than a
>> term, then modtime will be queriable just like date, allowing multiple
>> applications to query all docids modified since the last time they ran.
>>
>> [... snip]
>
> This could also solve it, and probably have more uses. I don't quite see
> how the opposite problem (for my use case) can be solved by this without
> using a 'localchange' tag. This is to sync tag to maildir sync, when a
> new tag has been added (by e.g. a user interaction in a client) it needs
> to be copied to the maildir, if it is not done in the same go a
> different application won't know whether the change was local or remote.
> How did you solve this?

Why don't you just set maildir.synchronize_flags=true?  When I
synchronize mail across machines, I start by concurrently running
"notmuch new" on both the local and remote machines, which picks up all
the changed maildir flags.  Then I synchronize the mail and the tags
between the two maildirs.  If maildir.synchronize_flags=true, then atomically
with setting the new tags I call notmuch_message_tags_to_maildir_flags()
to sync the new tags to the maildir.

The maildir flags question seems kind of independent of what we are
talking about, which is just having an incremental way of examining the
database.  Right now, I have to scan everything to find tags that have
changed since the last synchronization event.  If I had modtime (or
really it should be called "ctime", like inode change time), then I
could look at only the few messages that changed, and it would probably
shave 250msec off polling new mail for a 100,000-message maildir.

Note you can't use the file system ctime/mtime because the file system
may have changed since the last time you ran notmuch new.

> I would suggest using a Xapian- or Index-time which gets a tick
> everytime a modification is made to the index.

Exactly.  It could be a tick, or just the current time of day if your
clock does not go backwards.  (I'd be willing to do a full scan if the
clock ever goes backwards.)  The advantage of time is that you don't
have to synchronously update some counter.

> Atomic operations could operate on the same time in case this
> distinction turns out to be useful. Perhaps something like this
> already exists in Xapian?

I don't think it's important for atomic operations to have the same
timestamp.  All that's important is that you be able to diff the
database between the last time you scanned it.

> This way clock skew, clock resolution (lots of operations happening in
> the same second, msec or nanosec) problems won't be an issue. The crux
> will be to make sure all write-operations trigger a tick on the
> indextime.

Clock skew is not really an issue.  It takes years to amass hundreds of
thousands of email messages.  So adding 5 minutes of slop is not a big
deal--you'll just scan a few messages needlessly.

Making sure the write-operations update the time should be easy.  Most
or all of the changes are probably funneled through
_notmuch_message_sync.  Worst case, there are only 9 places in the
source code that make use of a Xapian::WritableDatabase, so I'm pretty
confident total changes wouldn't be much more than 50 lines of code.

I would do it myself if there were any kind of indication that such a
change could be upstreamed.  I brought this up in January, 2011, and
didn't get a huge amount of interest in the ctime idea.  But I was also
a lot less focused on what I needed.  Now that I have a working
distributed setup and am actually using notmuch for my mail, I have a
much better understanding of what is needed.

David

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] Add configurable changed tag to messages that have been changed on disk
  2014-04-10 15:31     ` dm-list-email-notmuch
@ 2014-04-10 21:10       ` Gaute Hope
  2014-04-22 22:05         ` David Bremner
  2014-04-11 11:08       ` David Bremner
  1 sibling, 1 reply; 27+ messages in thread
From: Gaute Hope @ 2014-04-10 21:10 UTC (permalink / raw)
  To: David Mazieres expires 2014-07-09 PDT; +Cc: notmuch

Excerpts from dm-list-email-notmuch's message of 2014-04-10 17:31:04 +0200:
> Gaute Hope <eg@gaute.vetsj.com> writes:
>
> >> A better approach would be to add a new "modtime" xapian value that is
> >> updated whenever the tags or any other terms (such as XFDIRENTRY) are
> >> added to or deleted from a docid.  If it's a Xapian value, rather than a
> >> term, then modtime will be queriable just like date, allowing multiple
> >> applications to query all docids modified since the last time they ran.
> >>
> >> [... snip]
> >
> > This could also solve it, and probably have more uses. I don't quite see
> > how the opposite problem (for my use case) can be solved by this without
> > using a 'localchange' tag. This is to sync tag to maildir sync, when a
> > new tag has been added (by e.g. a user interaction in a client) it needs
> > to be copied to the maildir, if it is not done in the same go a
> > different application won't know whether the change was local or remote.
> > How did you solve this?
>
> Why don't you just set maildir.synchronize_flags=true?  When I
> synchronize mail across machines, I start by concurrently running
> "notmuch new" on both the local and remote machines, which picks up all
> the changed maildir flags.  Then I synchronize the mail and the tags
> between the two maildirs.  If maildir.synchronize=true, then atomically
> with setting the new tags I call notmuch_message_tags_to_maildir_flags()
> to sync the new tags to the maildir.

I am talking about syncing tags to a maildir _folder_, not flags. It
could be implemented as maildir.synchronize is now, but it would be a
larger feature which could work in a lot of different ways.

> The maildir flags question seems kind of independent of what we are
> talking about, which is just having an incremental way of examining the
> database.  Right now, I have to scan everything to find tags that have
> changed since the last synchronization event.  If I had modtime (or
> really it should be called "ctime", like inode change time), then I
> could look at only the few messages that changed, and it would probably
> shave 250msec off polling new mail for a 100,000-message maildir.
>
> Note you can't use the file system ctime/mtime because the file system
> may have changed since the last time you ran notmuch new.

If you have an unreliable clock or use a badly configured system you
could risk missing changes in the case where the application time stamp
is set in the future and a mod time is set now. Then the app won't know
there has been a change. The same could happen if the clock is in the
past and the modtime is set: the clock is then updated and the app won't
know there has been a change.

The only way to know is to do a full scan of the entire db. This could
be very expensive, and comparable to initial indexing, for some actions.

You would not necessarily, or reliably, be able to detect this.

With an internal tick this wouldn't be an issue.

> > I would suggest using a Xapian- or Index-time which gets a tick
> > everytime a modification is made to the index.
>
> Exactly.  It could be a tick, or just the current time of day if your
> clock does not go backwards.  (I'd be willing to do a full scan if the
> clock ever goes backwards.)  The advantage of time is that you don't
> have to synchronously update some counter.
>
> > Atomic operations could operate on the same time in case this
> > distinction turns out to be useful. Perhaps something like this
> > already exists in Xapian?
>
> I don't think it's important for atomic operations to have the same
> timestamp.  All that's important is that you be able to diff the
> database between the last time you scanned it.

Yeah, it is not necessary for anything I am planning on doing, but it
would be a way for other apps to know that a set of changes were done at
the same time.

> > This way clock skew, clock resolution (lots of operations happening in
> > the same second, msec or nanosec) problems won't be an issue. The crux
> > will be to make sure all write-operations trigger a tick on the
> > indextime.
>
> Clock skew is not really an issue.  It takes years to amass hundreds of
> thousands of email messages.  So adding 5 minutes of slop is not a big
> deal--you'll just scan a few messages needlessly.

Yes, but you risk missing changes without knowing. That is an issue for
my use case.


> Making sure the write-operations update the time should be easy.  Most
> or all of the changes are probably funneled through
> _notmuch_message_sync.  Worst case, there are only 9 places in the
> source code that make use of a Xapian:WritableDatabase, so I'm pretty
> confident total changes wouldn't be much more than 50 lines of code.

Yes :)

> I would do it myself if there were any kind of indication that such a
> change could be upstreamed.  I brought this up in January, 2011, and
> didn't get a huge amount of interest in the ctime idea.  But I was also
> a lot less focused on what I needed.  Now that I have a working
> distributed setup and am actually using notmuch for my mail, I have a
> much better understanding of what is needed.

Would be great if it could be included.. I guess a comment from
one/some of the notmuch-gurus could clarify?


- gaute

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] Add configurable changed tag to messages that have been changed on disk
  2014-04-10 15:31     ` dm-list-email-notmuch
  2014-04-10 21:10       ` Gaute Hope
@ 2014-04-11 11:08       ` David Bremner
  2014-04-11 16:03         ` dm-list-email-notmuch
  1 sibling, 1 reply; 27+ messages in thread
From: David Bremner @ 2014-04-11 11:08 UTC (permalink / raw)
  To: David Mazieres expires 2014-07-09 PDT, Gaute Hope; +Cc: notmuch

dm-list-email-notmuch@scs.stanford.edu writes:

> Gaute Hope <eg@gaute.vetsj.com> writes:

> Exactly.  It could be a tick, or just the current time of day if your
> clock does not go backwards.  (I'd be willing to do a full scan if the
> clock ever goes backwards.)  The advantage of time is that you don't
> have to synchronously update some counter.

I think I'd lean towards global time so that one could use it to resolve
conflicts between changes to multiple copies of the database.

> Making sure the write-operations update the time should be easy.  Most
> or all of the changes are probably funneled through
> _notmuch_message_sync.  Worst case, there are only 9 places in the
> source code that make use of a Xapian:WritableDatabase, so I'm pretty
> confident total changes wouldn't be much more than 50 lines of code.

Maybe. Don't forget upgrading the database, updating the test suite, and
presumably some changes to the CLI so the new mtime can actually be
used. Not to be discouraging ;).

> I would do it myself if there were any kind of indication that such a
> change could be upstreamed.  I brought this up in January, 2011, and
> didn't get a huge amount of interest in the ctime idea.  But I was also
> a lot less focused on what I needed.  Now that I have a working
> distributed setup and am actually using notmuch for my mail, I have a
> much better understanding of what is needed.

In the ensuing time, nothing better has developed for tag
synchronization (my pet use case) so maybe it's time to pursue this
again.  It would be good to have some preliminary idea about the time
and space costs of adding document mtimes.  I guess database bloat
should not be too bad, since it's only 64bits (?) per mail message.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] Add configurable changed tag to messages that have been changed on disk
  2014-04-11 11:08       ` David Bremner
@ 2014-04-11 16:03         ` dm-list-email-notmuch
  2014-04-12 15:58           ` David Bremner
  0 siblings, 1 reply; 27+ messages in thread
From: dm-list-email-notmuch @ 2014-04-11 16:03 UTC (permalink / raw)
  To: David Bremner, Gaute Hope; +Cc: notmuch

David Bremner <david@tethera.net> writes:

>> Exactly.  It could be a tick, or just the current time of day if your
>> clock does not go backwards.  (I'd be willing to do a full scan if the
>> clock ever goes backwards.)  The advantage of time is that you don't
>> have to synchronously update some counter.
>
> I think I'd lean towards global time so that one could use it to resolve
> conflicts between changes to multiple copies of the database.

I, too, would prefer to use time.  However, I'm doubtful it would help
resolve conflicts.  On the plus side, I'm not sure it is even needed to
resolve conflicts.  My mail synchronizer has an algorithm for resolving
conflicts that always works without human intervention and in my limited
experience does exactly what I want:

   * If there's a conflict between two replicas, ensure that each
     maildir ends up with the maximum of the number of copies of the
     message in each of the two databases being reconciled.  [Example:
     If replica A deletes a message and replica B moves it from folder
     INBOX to folder SPAM, you end up with a copy in spam.  If replica A
     moves a message to folder IMPORTANT and replica B moves it to SPAM,
     then you get two hard links to the same file, one in IMPORTANT and
     one in SPAM.]

   * If there's a conflict and two replicas have different tags on the
     same message, then the tags in notmuch's new.tags directive get
     logically ANDed, while all other tags get logically ORed.

Granted, I've only been using this system for a week.  On the other
hand, all I was doing was starting to test something I had written, yet
it ended up being so much better than my old system that I couldn't go
back and ended up using my system in production far earlier than
anticipated...

>> Making sure the write-operations update the time should be easy.  Most
>> or all of the changes are probably funneled through
>> _notmuch_message_sync.  Worst case, there are only 9 places in the
>> source code that make use of a Xapian:WritableDatabase, so I'm pretty
>> confident total changes wouldn't be much more than 50 lines of code.
>
> Maybe. Don't forget upgrading the database, updating the test suite, and
> presumably some changes to the CLI so the new mtime can actually be
> used. Not to be discouraging ;).

The CLI is trivial.  We'll just add another search keyword ctime
analogous to date.

As far as updating the test suite, etc., it's almost certain that the
core notmuch developers would be unsatisfied with whatever I've done,
since the code base is very clean and has a very uniform style.  So when
I say I'd want some "indication that such a change could be upstreamed,"
I mean more specifically that someone would be willing to shepherd the
process of getting the code into shape.

> In the ensuing time, nothing better has developed for tag
> synchronization (my pet use case) so maybe it's time to pursue this
> again.

I do have something pretty good for tag synchronization.  It requires a
full database scan each time to detect changes, but I've heavily
optimized it to be very fast by skipping over the notmuch library and
directly scanning the underlying Xapian Btrees.  Currently my bottleneck
is indexing messages (e.g., running notmuch new or calling
notmuch_database_add_message), which are painfully slow on 32-bit
machines.  (Unfortunately my mail server is a 32-bit machine.)

To give you an idea, on a 32 bit machine, if I get a handful of new mail
(e.g., 6 messages), running "notmuch new" takes 19 seconds, while
scanning the database to check for renames and changed tags adds another
1.4 seconds.  On a 64-bit machine, "notmuch new" might take 1 second,
while scanning the database adds 350 msec.

So full database scans might not be the end of the world.  The biggest
performance bottleneck at this point is notmuch's painful indexing
performance.  It kills me that it takes 10 minutes to index 100,000 mail
messages on a 16-core machine with 48 GiB of RAM.  But the library is
non-reentrant and allocates thread IDs in such a way that it's hard to
create parallel databases and later merge them.  Basically I can't
figure out how to make productive use of more than one CPU core even
when synchronizing across 1GB Ethernet!

It's pretty beta, but my intention is to open-source my code, so glad
for beta testers if you are interested in testing tag synchronization.

> It would be good to have some preliminary idea about the time
> and space costs of adding document mtimes.  I guess database bloat
> should not be too bad, since it's only 64bits (?) per mail message.

Plus a Btree to index it, so figure at least 24 bytes per message.
Another issue is that values are always brought into memory with a
document, so it will consume more RAM.  But yeah, I don't think it
should be that bad.

David

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] Add configurable changed tag to messages that have been changed on disk
  2014-04-11 16:03         ` dm-list-email-notmuch
@ 2014-04-12 15:58           ` David Bremner
  2014-05-03 14:01             ` Jani Nikula
  0 siblings, 1 reply; 27+ messages in thread
From: David Bremner @ 2014-04-12 15:58 UTC (permalink / raw)
  To: David Mazieres expires 2014-07-10 PDT, Gaute Hope; +Cc: notmuch

dm-list-email-notmuch@scs.stanford.edu writes:

>
> As far as updating the test suite, etc., it's almost certain that the
> core notmuch developers would be unsatisfied with whatever I've done,
> since the code base is very clean and has a very uniform style.  So when
> I say I'd want some "indication that such a change could be upstreamed,"
> I mean more specifically that someone would be willing to shepherd the
> process of getting the code into shape.
>

I think it depends on your definition of shepherd. If it means review
your patches and tell you what needs to be done, then we can probably
handle that. Things tend to stall if the patch proposer loses interest
after the first review, so we're pretty far from "fire and forget".

I'd like to hear one or two more opinions from regular contributors
before I say "yes, this is something we want to do."

cheers,

d

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] Add configurable changed tag to messages that have been changed on disk
  2014-04-10 21:10       ` Gaute Hope
@ 2014-04-22 22:05         ` David Bremner
  2014-04-23  7:24           ` Gaute Hope
  0 siblings, 1 reply; 27+ messages in thread
From: David Bremner @ 2014-04-22 22:05 UTC (permalink / raw)
  To: Gaute Hope; +Cc: notmuch

Gaute Hope <eg@gaute.vetsj.com> writes:

>
> I am talking about syncing tags to a maildir _folder_, not flags. It
> could be implemented as maildir.synchronize is now, but it would be a
> larger feature which could work in a lot of different ways.
>

So to try and clarify the use case, this could be used to add a tag
"changed" to each message-id that had one or more files
moved/added/deleted on disk.  You would then retag that message using
something like the output of notmuch search --output=files so that a set
of tags corresponds to a set of folders containing the message. Is this
correct?   I guess the proposed ctime information could be used for this
as well, if it also tracked those non-tag related changes. I guess this
would make it worse for David M's purposes (although presumably still
better than nothing).

d

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] Add configurable changed tag to messages that have been changed on disk
  2014-04-22 22:05         ` David Bremner
@ 2014-04-23  7:24           ` Gaute Hope
  2014-04-23  9:00             ` David Mazieres
  0 siblings, 1 reply; 27+ messages in thread
From: Gaute Hope @ 2014-04-23  7:24 UTC (permalink / raw)
  To: David Bremner, notmuch

Excerpts from David Bremner's message of 2014-04-23 00:05:02 +0200:
> Gaute Hope <eg@gaute.vetsj.com> writes:
>
> >
> > I am talking about syncing tags to a maildir _folder_, not flags. It
> > could be implemented as maildir.synchronize is now, but it would be a
> > larger feature which could work in a lot of different ways.
> >
>
> So to try and clarify the use case, this could be used to add a tag
> "changed" to each message-id that had one or more files
> moved/added/deleted on disk.  You would then retag that message using
> something like the output of notmuch search --output=files so that a set
> of tags corresponds to a set of folders containing the message. Is this
> correct?   I guess the proposed ctime information could be used for this
> as well, if it also tracked those non-tag related changes. I guess this
> > would make it worse for David M's purposes (although presumably still
> better than nothing).

Yes, I would not know what has changed, but I would know which messages
to check for changes and then decide whether and how to re-tag it. For
the opposite case, when a message has been changed locally by a client
and I would want to decide whether I need to copy/move/delete the
message based on the tags, a tag could be added by the application.

In response to the issue of cost of this operation: I don't think it
will differ much from how 'new' is handled at the moment.

One extension perhaps worth considering is to have ctimes on each source
file as well as the db entry, but it might be overkill.

I still strongly favor an internal db-tick over ctime - or store both,
the application iterating over the 'changed' tag (or messages changed
since last time) would have to store the time of last check as well. A
whole bunch of stuff could result in this time being inaccurate,
especially if these run on different machines.

A db-tick or a _good_ ctime solution can as far as I can see solve both
David M's (correct me if I am wrong) and my purposes, as well as
probably have more use cases in the future. It would even be an
interesting direct search: show me everything that changed lately,
sorted.

As noted before, my use case could also be solved by implementing it in
a similar fashion as sync_flags are now, is it possible to hook into
this stage in some way? So that it does not need to be included in
core notmuch.

- gaute

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] Add configurable changed tag to messages that have been changed on disk
  2014-04-23  7:24           ` Gaute Hope
@ 2014-04-23  9:00             ` David Mazieres
  2014-04-23 11:53               ` Gaute Hope
  2014-04-23 20:59               ` Austin Clements
  0 siblings, 2 replies; 27+ messages in thread
From: David Mazieres @ 2014-04-23  9:00 UTC (permalink / raw)
  To: Gaute Hope, David Bremner, notmuch

Gaute Hope <eg@gaute.vetsj.com> writes:

> A db-tick or a _good_ ctime solution can as far as I can see solve both
> David M's (correct me if I am wrong) and my purposes, as well as
> probably have more use cases in the future. It would even be an
> interesting direct search: show me everything that changed lately,
> sorted.

I could live with a db-tick scheme.  I would prefer a ctime scheme,
since then I can answer questions such as "what has changed in the last
five minutes"?  I mean all kinds of other stuff starts to break if your
clock goes backwards on a mail server machine, not the least of which is
that incremental backups will fail silently, so you risk losing your
mail.

A middle ground might be to use the maximum of two values: 1) the
time-of-day at which notmuch started executing, and 2) the highest ctime
in the database plus 100 microseconds (leaving plenty of slop to store
timestamps as IEEE doubles with 52 significant bits).  Since the values
will be Btree-indexed, computing the max plus one will be cheap.

Incidentally, if you are really this paranoid about time stamps, it
should bother you that notmuch's directory timestamps only have one
second granularity.  It's not that hard to get a new message delivered
in the same second that notmuch new finished running.  In my
synchronizer, I convert st_mtim (a struct timespec) into a double and
keep that plus size in the database to decide if I need to re-hash
files.  But for directories, I'm stuck with NOTMUCH_VALUE_TIMESTAMP,
which are quantized to the second.  (Ironically, I think
Xapian::sortable_serialize converts time_ts to doubles anyway, so
avoiding st_mtim is not really helping performance.)

David

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] Add configurable changed tag to messages that have been changed on disk
  2014-04-23  9:00             ` David Mazieres
@ 2014-04-23 11:53               ` Gaute Hope
  2014-04-23 20:59               ` Austin Clements
  1 sibling, 0 replies; 27+ messages in thread
From: Gaute Hope @ 2014-04-23 11:53 UTC (permalink / raw)
  To: David Mazieres expires 2014-07-22 PDT; +Cc: notmuch

Excerpts from David Mazieres's message of 2014-04-23 11:00:10 +0200:
> Gaute Hope <eg@gaute.vetsj.com> writes:
>
> > A db-tick or a _good_ ctime solution can as far as I can see solve both
> > David M's (correct me if I am wrong) and my purposes, as well as
> > probably have more use cases in the future. It would even be an
> > interesting direct search: show me everything that changed lately,
> > sorted.
>
> I could live with a db-tick scheme.  I would prefer a ctime scheme,
> since then I can answer questions such as "what has changed in the last
> five minutes"?  I mean all kinds of other stuff starts to break if your
> clock goes backwards on a mail server machine, not the least of which is
> that incremental backups will fail silently, so you risk losing your
> mail.
>
> A middle ground might be to use the maximum of two values: 1) the
> time-of-day at which notmuch started executing, and 2) the highest ctime
> in the database plus 100 microseconds (leaving plenty of slop to store
> timestamps as IEEE doubles with 52 significant bits).  Since the values
> will be Btree-indexed, computing the max plus one will be cheap.
>
> Incidentally, if you are really this paranoid about time stamps, it
> should bother you that notmuch's directory timestamps only have one
> second granularity.  It's not that hard to get a new message delivered
> in the same second that notmuch new finished running.  In my
> synchronizer, I convert st_mtim (a struct timespec) into a double and
> keep that plus size in the database to decide if I need to re-hash
> files.  But for directories, I'm stuck with NOTMUCH_VALUE_TIMESTAMP,
> which are quantized to the second.  (Ironically, I think
> Xapian::sortable_serialize converts time_ts to doubles anyway, so
> avoiding st_mtim is not really helping performance.)

Agreed, it probably won't be the end of the world.. I will have to
handle conflicts anyway. With an inclusion of ctime my 'changed'-tag
patches are unnecessary.

By the way, muchsync looks very promising!

Cheers, gaute

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] Add configurable changed tag to messages that have been changed on disk
  2014-04-23  9:00             ` David Mazieres
  2014-04-23 11:53               ` Gaute Hope
@ 2014-04-23 20:59               ` Austin Clements
  2014-04-23 22:31                 ` dm-list-email-notmuch
  1 sibling, 1 reply; 27+ messages in thread
From: Austin Clements @ 2014-04-23 20:59 UTC (permalink / raw)
  To: David Mazieres expires 2014-07-22 PDT; +Cc: notmuch

Hi Dave!

Quoth David Mazieres on Apr 23 at  2:00 am:
> Gaute Hope <eg@gaute.vetsj.com> writes:
> 
> > A db-tick or a _good_ ctime solution can as far as I can see solve both
> > David M's (correct me if I am wrong) and my purposes, as well as
> > probably have more use cases in the future. It would even be an
> > interesting direct search: show me everything that changed lately,
> > sorted.
> 
> I could live with a db-tick scheme.  I would prefer a ctime scheme,
> since then I can answer questions such as "what has changed in the last
> five minutes"?  I mean all kinds of other stuff starts to break if your
> clock goes backwards on a mail server machine, not the least of which is
> that incremental backups will fail silently, so you risk losing your
> mail.
> 
> A middle ground might be to use the maximum of two values: 1) the
> time-of-day at which notmuch started executing, and 2) the highest ctime
> in the database plus 100 microseconds (leaving plenty of slop to store
> timestamps as IEEE doubles with 52 significant bits).  Since the values
> will be Btree-indexed, computing the max plus one will be cheap.

This makes me curious if you've considered how to fit this in to
Xapian.  The Xapian query syntax supports range queries over document
"values", but within the Xapian B-tree, values are stored in docid
order, not value order, so Xapian's range query operator is actually a
full scan in implementation.  I assume it does this so it doesn't have
to store both forward and inverse indexes of values.  (I spent some
time figuring out the layout of the Xapian database and have fairly
detailed notes if anyone's curious.)

This is still reasonably fast in practice because it's a sequential
scan and only requires a few bytes per message, but it's probably not
what you'd expect.  That said, Xapian does track per-value statistics
that would suffice for the particular problem of monotonic time stamps
(e.g., Database::get_value_upper_bound).

In principle it would be possible to use user metadata or even
document terms to support true B-tree range scans by ctime order, but
I don't think it's possible to express queries over this using
Xapian's query parser.  I've written about 90% of a (new) custom query
parser for Notmuch that would enable this, but little things like my
looming thesis deadline have interfered with me finishing it.

> Incidentally, if you are really this paranoid about time stamps, it
> should bother you that notmuch's directory timestamps only have one
> second granularity.  It's not that hard to get a new message delivered
> in the same second that notmuch new finished running.  In my
> synchronizer, I convert st_mtim (a struct timespec) into a double and
> keep that plus size in the database to decide if I need to re-hash
> files.  But for directories, I'm stuck with NOTMUCH_VALUE_TIMESTAMP,
> which are quantized to the second.  (Ironically, I think
> Xapian::sortable_serialize converts time_ts to doubles anyway, so
> avoiding st_mtim is not really helping performance.)

This is historical (and, I agree, unfortunate).  But nobody's
complained, so it hasn't been worth changing the libnotmuch interface
to support sub-second directory mtimes.  However, notmuch new does
correctly handle deliveries in the same second it runs.  If the
wall-clock time when it starts is the same as the on-disk directory
mtime, it skips updating the in-database directory mtime at the end.
Hence, on the next run, it will still consider the directory
out-of-date.  It's a bit of a hack, but it's a hack that would be
necessary for supporting older file systems even if we did support
sub-second timestamps.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] Add configurable changed tag to messages that have been changed on disk
  2014-04-06 20:19 ` David Mazieres
  2014-04-10 14:43   ` Gaute Hope
@ 2014-04-23 21:28   ` Austin Clements
  2014-04-23 22:40     ` David Mazieres expires 2014-07-22 PDT
  1 sibling, 1 reply; 27+ messages in thread
From: Austin Clements @ 2014-04-23 21:28 UTC (permalink / raw)
  To: David Mazieres expires 2014-07-05 CEST; +Cc: notmuch

Quoth David Mazieres on Apr 06 at 10:19 pm:
> Gaute Hope <eg@gaute.vetsj.com> writes:
> 
> > When one of the source files for a message is changed on disk, renamed,
> > deleted or a new source file is added. A configurable changed tag is
> > is added. The tag can be configured under the option 'changed_tags' in
> > the [new] section, the default is none. Tests have been updated to
> > accept the new config option.
> >
> > notmuch-setup now asks for a changed tag after the new tags question.
> >
> > This could be useful for for example 'afew' to detect remote changes in
> > IMAP folders and update the FolderNameFilter to also add tags or remove
> > tags when a _existing_ message has been added to or removed from a
> > maildir.
> 
> I think this is the wrong way to achieve such functionality, because
> then the change tag A) is expensive to remove, B) is easy to misuse
> (remember to call fsync everywhere before deleting the change tag), and
> C) can be used by only one application.
> 
> A better approach would be to add a new "modtime" xapian value that is
> updated whenever the tags or any other terms (such as XFDIRENTRY) are
> added to or deleted from a docid.  If it's a Xapian value, rather than a
> term, then modtime will be queriable just like date, allowing multiple
> applications to query all docids modified since the last time they ran.

I'd like to have efficient change detection, too.  In my case, I'd
like to use it to support efficient live search and show updates.  The
design I'd sketched out for that used a log rather than ctimes, and
I'm curious if you have thoughts on the relative merits and
suitability for tag sync of these approaches.

I'd been leaning toward logging because it can capture changes to
things that aren't represented as documents in the database, such as
thread membership.  This probably doesn't matter for synchronization,
but it makes it much easier to figure out which threads in thread
search results need to be refreshed.  A log can also capture message
deletion easily, while ctimes would require tombstones (which may be a
good idea for other reasons [1]).

On the other hand, logging is obviously more mechanism than ctimes,
and probably requires some garbage collection story.

[1] id:20140421162058.GE25817@mit.edu

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] Add configurable changed tag to messages that have been changed on disk
  2014-04-23 20:59               ` Austin Clements
@ 2014-04-23 22:31                 ` dm-list-email-notmuch
  0 siblings, 0 replies; 27+ messages in thread
From: dm-list-email-notmuch @ 2014-04-23 22:31 UTC (permalink / raw)
  To: Austin Clements; +Cc: notmuch

Austin Clements <amdragon@MIT.EDU> writes:

>> A middle ground might be to use the maximum of two values: 1) the
>> time-of-day at which notmuch started executing, and 2) the highest ctime
>> in the database plus 100 microseconds (leaving plenty of slop to store
>> timestamps as IEEE doubles with 52 significant bits).  Since the values
>> will be Btree-indexed, computing the max plus one will be cheap.
>
> This makes me curious if you've considered how to fit this in to
> Xapian.  The Xapian query syntax supports range queries over document
> "values", but within the Xapian B-tree, values are stored in docid
> order, not value order, so Xapian's range query operator is actually a
> full scan in implementation.  I assume it does this so it doesn't have
> to store both forward and inverse indexes of values.  (I spent some
> time figuring out the layout of the Xapian database and have fairly
> detailed notes if anyone's curious.)

Aside from finding the previous max time, everything else should work
identically to the date query operator and NOTMUCH_VALUE_TIMESTAMP.

Though I believe you, I'm a little surprised the values aren't indexed.
An alternative design might use terms like XCTIMExxxxxxxxxxxxxxxx where
the x's are hex digits.  But this seems a bit clunky and not using
Xapian the way it is intended to be used.

When I do a query with a giant result set ordered by date (notmuch
search --sort=oldest-first "*"), the first few results come back pretty
quickly, so I guess the full database scan is not an issue, at least for
~10^5 messages.

> This is still reasonably fast in practice because it's a sequential
> scan and only requires a few bytes per message, but it's probably not
> what you'd expect.  That said, Xapian does track per-value statistics
> that would suffice for the particular problem of monotonic time stamps
> (e.g., Database::get_value_upper_bound).

Oh, well in that case there is no issue.  That max is the only statistic
we need.  Everything that requires a full database scan, like get me all
messages whose properties have changed since time X, is something that
you can't do at all right now.  And in fact I'm already scanning all
100,000 message IDs AND diffing the results against a separate sqlite
database to detect changes in only 0.09 seconds (Linux) or 1.2 seconds
(32-bit OpenBSD).  This will only make that faster, and additionally
allow other people to do what I'm doing without writing a bunch of C++
code.

> In principle it would be possible to use user metadata or even
> document terms to support true B-tree range scans by ctime order, but
> I don't think it's possible to express queries over this using
> Xapian's query parser.  I've written about 90% of a (new) custom query
> parser for Notmuch that would enable this, but little things like my
> looming thesis deadline have interfered with me finishing it.

Yeah, I've been avoiding the query parser and just scanning terms and
postlists directly.  Since the lack of ctime forced me to scan the whole
database anyway, I found it much faster to scan each tag's posting list
and dump the results into sqlite than to extract tag terms on a
per-document basis the way notmuch dump does.

>> Incidentally, if you are really this paranoid about time stamps, it
>> should bother you that notmuch's directory timestamps only have one
>> second granularity.
>
> This is historical (and, I agree, unfortunate).  But nobody's
> complained, so it hasn't been worth changing the libnotmuch interface
> to support sub-second directory mtimes.  However, notmuch new does
> correctly handle deliveries in the same second it runs.  If the
> wall-clock time when it starts is the same as the on-disk directory
> mtime, it skips updating the in-database directory mtime at the end.
> Hence, on the next run, it will still consider the directory
> out-of-date.  It's a bit of a hack, but it's a hack that would be
> necessary for supporting older file systems even if we did support
> sub-second timestamps.

Yeah, this is kind of a problem for me.  I currently scan the XFDIRENTRY terms
belonging to a directory only if the directory's notmuch mtime has
changed since the last time I examined Xapian's state.  I used to scan
the actual directories, which was fine, but not so useful because I
don't actually want to deal with messages that notmuch has not yet
indexed.  Conversely, if a directory has not changed since the last time
muchsync ran, but notmuch's idea of the directory has changed (because
someone ran notmuch new), then I do care about scanning for new/deleted
XFDIRENTRY terms.

But couldn't notmuch fix the sub-second problem in a fully backwards
compatible way?  After all, the database is already storing these mtimes
as doubles.  Even for OSes that don't support st_mtim, notmuch could
just add 0.00001 seconds to the previous timestamp of a modified
directory.

David

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] Add configurable changed tag to messages that have been changed on disk
  2014-04-23 21:28   ` Austin Clements
@ 2014-04-23 22:40     ` David Mazieres expires 2014-07-22 PDT
  0 siblings, 0 replies; 27+ messages in thread
From: David Mazieres expires 2014-07-22 PDT @ 2014-04-23 22:40 UTC (permalink / raw)
  To: Austin Clements; +Cc: notmuch

Austin Clements <amdragon@MIT.EDU> writes:

> I'd like to have efficient change detection, too.  In my case, I'd
> like to use it to support efficient live search and show updates.  The
> design I'd sketched out for that used a log rather than ctimes, and
> I'm curious if you have thoughts on the relative merits and
> suitability for tag sync of these approaches.

Both logging and ctime are very general mechanisms that can solve many
problems.  For example, is there still an issue that pressing "*" in
emacs notmuch-search mode can affect messages that are not visible if
someone ran notmuch new in a different window?  ctimes would be one way
to solve this.  (Conservatively exempt any messages that have changed
since the displayed search was run.)

> I'd been leaning toward logging because it can capture changes to
> things that aren't represented as documents in the database, such as
> thread membership.  This probably doesn't matter for synchronization,
> but it makes it much easier to figure out which threads in thread
> search results need to be refreshed.  A log can also capture message
> deletion easily, while ctimes would require tombstones (which may be a
> good idea for other reasons [1]).
>
> On the other hand, logging is obviously more mechanism than ctimes,
> and probably requires some garbage collection story.

The advantage of ctime over logging is that the interface is super
simple and intuitive.  How would the interface to the log work?

In terms of implementation, have you thought about leveraging the
XAPIAN_MAX_CHANGESETS mechanism to produce your logs?

David

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] Add configurable changed tag to messages that have been changed on disk
  2014-04-12 15:58           ` David Bremner
@ 2014-05-03 14:01             ` Jani Nikula
  0 siblings, 0 replies; 27+ messages in thread
From: Jani Nikula @ 2014-05-03 14:01 UTC (permalink / raw)
  To: David Bremner, David Mazieres expires 2014-07-10 PDT, Gaute Hope; +Cc: notmuch

On Sat, 12 Apr 2014, David Bremner <david@tethera.net> wrote:
> dm-list-email-notmuch@scs.stanford.edu writes:
>
>>
>> As far as updating the test suite, etc., it's almost certain that the
>> core notmuch developers would be unsatisfied with whatever I've done,
>> since the code base is very clean and has a very uniform style.  So when
>> I say I'd want some "indication that such a change could be upstreamed,"
>> I mean more specifically that someone would be willing to shepherd the
>> process of getting the code into shape.
>>
>
> I think it depends on your definition of shepherd. If it means review
> your patches and tell you what needs to be done, then we can probably
> handle that. Things tend to stall if the patch proposer loses interest
> after the first review, so we're pretty far from "fire and forget".
>
> I'd like to hear one or two more opinions from regular contributors
> before I say "yes, this is something we want to do."

FYI everyone, message modification times have been proposed before [1].

BR,
Jani.


[1] id:1323796305-28789-1-git-send-email-schnouki@schnouki.net
http://mid.gmane.org/1323796305-28789-1-git-send-email-schnouki@schnouki.net

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] Add configurable changed tag to messages that have been changed on disk
  2014-04-06 16:11 [PATCH] Add configurable changed tag to messages that have been changed on disk Gaute Hope
  2014-04-06 20:19 ` David Mazieres
@ 2014-07-03 10:42 ` David Bremner
  2014-07-28 14:37   ` Gaute Hope
  1 sibling, 1 reply; 27+ messages in thread
From: David Bremner @ 2014-07-03 10:42 UTC (permalink / raw)
  To: Gaute Hope, notmuch

Gaute Hope <eg@gaute.vetsj.com> writes:

> When one of the source files for a message is changed on disk, renamed,
> deleted or a new source file is added. A configurable changed tag is
> is added. The tag can be configured under the option 'changed_tags' in
> the [new] section, the default is none. Tests have been updated to
> accept the new config option.
>
> notmuch-setup now asks for a changed tag after the new tags question.
>
> This could be useful for for example 'afew' to detect remote changes in
> IMAP folders and update the FolderNameFilter to also add tags or remove
> tags when a _existing_ message has been added to or removed from a
> maildir.

The discussion on this proposal seems to have died out without reaching
a conclusion. David M expressed a strong preference for some kind of
modification time field in the database.  Gaute agreed with some caveats
that such an approach could solve his problems as well. On the other
hand, nobody seems to be actually working on such an approach at the
moment.  Gaute and/or David, do you have any interest in revisiting the
series id:1323796305-28789-1-git-send-email-schnouki@schnouki.net and
seeing if it can be reworked into mergeable shape? I suspect in
particular something needs to be added with respect to message deletion.
Thomas, are you still running some variant of these patches?

d

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] Add configurable changed tag to messages that have been changed on disk
  2014-07-03 10:42 ` David Bremner
@ 2014-07-28 14:37   ` Gaute Hope
  2014-08-01 18:55     ` Austin Clements
  0 siblings, 1 reply; 27+ messages in thread
From: Gaute Hope @ 2014-07-28 14:37 UTC (permalink / raw)
  To: David Bremner; +Cc: notmuch

[-- Attachment #1: Type: text/plain, Size: 1649 bytes --]

On Thu, Jul 3, 2014 at 12:42 PM, David Bremner <david@tethera.net> wrote:

> Gaute Hope <eg@gaute.vetsj.com> writes:
>
> > When one of the source files for a message is changed on disk, renamed,
> > deleted or a new source file is added. A configurable changed tag is
> > is added. The tag can be configured under the option 'changed_tags' in
> > the [new] section, the default is none. Tests have been updated to
> > accept the new config option.
> >
> > notmuch-setup now asks for a changed tag after the new tags question.
> >
> > This could be useful for for example 'afew' to detect remote changes in
> > IMAP folders and update the FolderNameFilter to also add tags or remove
> > tags when a _existing_ message has been added to or removed from a
> > maildir.
>
> The discussion on this proposal seems to have died out without reaching
> a conclusion. David M expressed a strong preference for some kind of
> modification time field in the database.  Gaute agreed with some caveats
> that such an approach could solve his problems as well. On the other
> hand, nobody seems to be actually working on such an approach at the
> moment.  Gaute and or David do you have any interest in revisiting the
> series id:1323796305-28789-1-git-send-email-schnouki@schnouki.net and
> seeing if it can be reworked into mergeable shape? I suspect in
> particular something needs to be added with respect to message deletion
> Thomas, are you still running some variant of these patches?
>
> d
>

I am afraid I don't have the chance to put in any consistent effort on this
at the moment.

I agree, message deletion needs to be solved somehow.

Regards, Gaute

[-- Attachment #2: Type: text/html, Size: 2430 bytes --]

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] Add configurable changed tag to messages that have been changed on disk
  2014-07-28 14:37   ` Gaute Hope
@ 2014-08-01 18:55     ` Austin Clements
  2014-08-02  0:49       ` Austin Clements
                         ` (2 more replies)
  0 siblings, 3 replies; 27+ messages in thread
From: Austin Clements @ 2014-08-01 18:55 UTC (permalink / raw)
  To: notmuch

I have a prototype implementation of message modification times on my
lastmod-v1 branch at

  https://github.com/aclements/notmuch/tree/lastmod-v1

It builds on my database features series that's currently awaiting
review [1].

The series uses a monotonic revision number, rather than wall-clock
time, for reasons related to Xapian's concurrency control and detailed
in the main commit's commit message.  The implementation isn't quite
useful from the CLI yet because I haven't added any way to query the
database's current revision number.  (I'm still thinking about how I
want to do this, since search/show don't have a good way to deliver
"additional" information right now.  I might just add the last
modification for each individual message/max of all messages in a
thread, similar to what Thomas Jost's patch did long ago.)

[1] id:1406859003-11561-1-git-send-email-amdragon@mit.edu

Quoth Gaute Hope on Jul 28 at  4:37 pm:
>    On Thu, Jul 3, 2014 at 12:42 PM, David Bremner <[1]david@tethera.net>
>    wrote:
> 
>      Gaute Hope <[2]eg@gaute.vetsj.com> writes:
> 
>      > When one of the source files for a message is changed on disk,
>      renamed,
>      > deleted or a new source file is added. A configurable changed tag is
>      > is added. The tag can be configured under the option 'changed_tags' in
>      > the [new] section, the default is none. Tests have been updated to
>      > accept the new config option.
>      >
>      > notmuch-setup now asks for a changed tag after the new tags question.
>      >
>      > This could be useful for for example 'afew' to detect remote changes
>      in
>      > IMAP folders and update the FolderNameFilter to also add tags or
>      remove
>      > tags when a _existing_ message has been added to or removed from a
>      > maildir.
> 
>      The discussion on this proposal seems to have died out without reaching
>      a conclusion. David M expressed a strong preference for some kind of
>      modification time field in the database.  Gaute agreed with some caveats
>      that such an approach could solve his problems as well. On the other
>      hand, nobody seems to be actually working on such an approach at the
>      moment.  Gaute and or David do you have any interest in revisiting the
>      series [3]id:1323796305-28789-1-git-send-email-schnouki@schnouki.net and
>      seeing if it can be reworked into mergeable shape? I suspect in
>      particular something needs to be added with respect to message deletion
>      Thomas, are you still running some variant of these patches?
>      d
> 
>    I am afraid I don't have the chance to put in any consistent effort on
>    this at the moment.
> 
>    I agree, message deletion needs to be solved somehow.
>    Regards, Gaute

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] Add configurable changed tag to messages that have been changed on disk
  2014-08-01 18:55     ` Austin Clements
@ 2014-08-02  0:49       ` Austin Clements
  2014-08-06  9:02       ` Gaute Hope
       [not found]       ` <1407313144-astroid-0-vyhth1tcrd-3835@strange>
  2 siblings, 0 replies; 27+ messages in thread
From: Austin Clements @ 2014-08-02  0:49 UTC (permalink / raw)
  To: notmuch

I should add that this code shouldn't be considered stable yet.  The
on-disk format may (and probably will) change, so don't try it on your
main notmuch database.

Quoth myself on Aug 01 at  2:55 pm:
> I have a prototype implementation of message modification times on my
> lastmod-v1 branch at
> 
>   https://github.com/aclements/notmuch/tree/lastmod-v1
> 
> It builds on my database features series that's currently awaiting
> review [1].
> 
> The series uses a monotonic revision number, rather than wall-clock
> time, for reasons related to Xapian's concurrent control and detailed
> in the main commit's commit message.  The implementation isn't quite
> useful from the CLI yet because I haven't added any way to query the
> database's current revision number.  (I'm still thinking about how I
> want to do this, since search/show don't have a good way to deliver
> "additional" information right now.  I might just add the last
> modification for each individual message/max of all messages in a
> thread, similar to what Thomas Jost's patch did long ago.)
> 
> [1] id:1406859003-11561-1-git-send-email-amdragon@mit.edu
> 
> Quoth Gaute Hope on Jul 28 at  4:37 pm:
> >    On Thu, Jul 3, 2014 at 12:42 PM, David Bremner <[1]david@tethera.net>
> >    wrote:
> > 
> >      Gaute Hope <[2]eg@gaute.vetsj.com> writes:
> > 
> >      > When one of the source files for a message is changed on disk,
> >      renamed,
> >      > deleted or a new source file is added. A configurable changed tag is
> >      > is added. The tag can be configured under the option 'changed_tags' in
> >      > the [new] section, the default is none. Tests have been updated to
> >      > accept the new config option.
> >      >
> >      > notmuch-setup now asks for a changed tag after the new tags question.
> >      >
> >      > This could be useful for for example 'afew' to detect remote changes
> >      in
> >      > IMAP folders and update the FolderNameFilter to also add tags or
> >      remove
> >      > tags when a _existing_ message has been added to or removed from a
> >      > maildir.
> > 
> >      The discussion on this proposal seems to have died out without reaching
> >      a conclusion. David M expressed a strong preference for some kind of
> >      modification time field in the database.  Gaute agreed with some caveats
> >      that such an approach could solve his problems as well. On the other
> >      hand, nobody seems to be actually working on such an approach at the
> >      moment.  Gaute and or David do you have any interest in revisiting the
> >      series [3]id:1323796305-28789-1-git-send-email-schnouki@schnouki.net and
> >      seeing if it can be reworked into mergeable shape? I suspect in
> >      particular something needs to be added with respect to message deletion
> >      Thomas, are you still running some variant of these patches?
> >      d
> > 
> >    I am afraid I don't have the chance to put in any consistent effort on
> >    this at the moment.
> > 
> >    I agree, message deletion needs to be solved somehow.
> >    Regards, Gaute

-- 
Austin Clements                               MIT/CSAIL/SB '06/PhD '14
amdragon@mit.edu                           http://web.mit.edu/amdragon
       Somewhere in the dream we call reality you will find me,
              searching for the reality we call dreams.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] Add configurable changed tag to messages that have been changed on disk
  2014-08-01 18:55     ` Austin Clements
  2014-08-02  0:49       ` Austin Clements
@ 2014-08-06  9:02       ` Gaute Hope
  2014-08-06 17:06         ` Austin Clements
       [not found]       ` <1407313144-astroid-0-vyhth1tcrd-3835@strange>
  2 siblings, 1 reply; 27+ messages in thread
From: Gaute Hope @ 2014-08-06  9:02 UTC (permalink / raw)
  To: Austin Clements; +Cc: notmuch

On Fri, Aug 1, 2014 at 8:55 PM, Austin Clements <amdragon@mit.edu> wrote:
> I have a prototype implementation of message modification times on my
> lastmod-v1 branch at
>
>   https://github.com/aclements/notmuch/tree/lastmod-v1
>
> It builds on my database features series that's currently awaiting
> review [1].
>
> The series uses a monotonic revision number, rather than wall-clock
> time, for reasons related to Xapian's concurrent control and detailed
> in the main commit's commit message.  The implementation isn't quite
> useful from the CLI yet because I haven't added any way to query the
> database's current revision number.  (I'm still thinking about how I
> want to do this, since search/show don't have a good way to deliver
> "additional" information right now.  I might just add the last
> modification for each individual message/max of all messages in a
> thread, similar to what Thomas Jost's patch did long ago.)
>
> [1] id:1406859003-11561-1-git-send-email-amdragon@mit.edu


Hi,

this should allow me to do what I wish to accomplish. The message
deletion is still a problem though, I can see two options at the moment:

a)  during notmuch new, output the deleted files to a hook or to a list
   somewhere.
   if list: notmuch will not handle this list, only append to it, and
   the user must purge it when it is safe to do so.

   if hook: for my purposes I would just create a hook appending to the
   list. as a minimum I think thread_id, message_id and revision number
   should be included.


b)  maintain a full list of deleted / dead messages. a user initiated
   purge can clean this from the database. a tag could be used for this,
   so that clients can ignore unlinked/deleted/dead messages. this
   differs from a 'deleted' message (IMAP/Maildir context) that has not
   yet been expunged so there is confusion to be avoided.

   a garbage collection function and interface must also be set up, but
   this one is probably simple.


in most cases I think a) would be sufficient, and probably much easier
to do. it might be slow in cases where large amounts of messages have been
deleted, but this is seldom the case for me at least.

cheers, gaute

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] Add configurable changed tag to messages that have been changed on disk
  2014-08-06  9:02       ` Gaute Hope
@ 2014-08-06 17:06         ` Austin Clements
  0 siblings, 0 replies; 27+ messages in thread
From: Austin Clements @ 2014-08-06 17:06 UTC (permalink / raw)
  To: Gaute Hope; +Cc: notmuch

Quoth Gaute Hope on Aug 06 at 11:02 am:
> On Fri, Aug 1, 2014 at 8:55 PM, Austin Clements <amdragon@mit.edu> wrote:
> > I have a prototype implementation of message modification times on my
> > lastmod-v1 branch at
> >
> >   https://github.com/aclements/notmuch/tree/lastmod-v1
> >
> > It builds on my database features series that's currently awaiting
> > review [1].
> >
> > The series uses a monotonic revision number, rather than wall-clock
> > time, for reasons related to Xapian's concurrent control and detailed
> > in the main commit's commit message.  The implementation isn't quite
> > useful from the CLI yet because I haven't added any way to query the
> > database's current revision number.  (I'm still thinking about how I
> > want to do this, since search/show don't have a good way to deliver
> > "additional" information right now.  I might just add the last
> > modification for each individual message/max of all messages in a
> > thread, similar to what Thomas Jost's patch did long ago.)
> >
> > [1] id:1406859003-11561-1-git-send-email-amdragon@mit.edu
> 
> 
> Hi,
> 
> this should allow me to do what I wish to accomplish. The message
> deletion is still a problem though, I can see two options at the moment:
> 
> a)  output during notmuch new to a hook or a list somewhere deleted files.
>    if list: notmuch will not handle this list, only append to it and
> the user must
>    purge it when it is safe to do so.
> 
>    if hook: for my purposes I would just create a hook appending to the
>    list. as a minimum I think thread_id, message_id and revision number
>    should be included.
> 
> 
> b)  maintain a full list of deleted / dead messages. a user initiated
>    purge can clean this from the database. a tag could be used for this,
>    so that clients can ignore unlinked/deleted/dead messages. this
>    differs from a 'deleted' message (IMAP/Maildir context) that has not
>    yet been expunged so there is confusion to be avoided.
> 
>    a garbage collection function and interface must also be set up, but
>    this one is probably simple.
> 
> 
> in most cases I think a) would be sufficient, and probably much easier
> to do. it might be slow in cases where large amounts of messages have been
> deleted, but this is seldom the case for me at least.

I have a separate branch (also sitting on top of the features branch)
that implements "ghost" messages.  The main intent is to fix a bug we
currently have in threading, but it puts us in a good position to
maintain state for messages we don't have the content of, including
last modification times for deleted messages and pre-seeded tags for
undelivered messages (useful for pre-tagging sent messages as sent,
nmbug, notmuch insert, etc.)

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] Add configurable changed tag to messages that have been changed on disk
       [not found]       ` <1407313144-astroid-0-vyhth1tcrd-3835@strange>
@ 2014-09-22 12:06         ` Gaute Hope
  2014-09-22 15:33           ` Tomi Ollila
  2014-09-22 15:40           ` Austin Clements
  0 siblings, 2 replies; 27+ messages in thread
From: Gaute Hope @ 2014-09-22 12:06 UTC (permalink / raw)
  To: notmuch

Excerpts from Gaute Hope's message of August 6, 2014 10:29:
> Austin Clements <amdragon@MIT.EDU> wrote on Fri, 01 Aug 2014 14:55:05 -0400:
>> I have a prototype implementation of message modification times on my
>> lastmod-v1 branch at
>> 
>>   https://github.com/aclements/notmuch/tree/lastmod-v1
>> 
>> It builds on my database features series that's currently awaiting
>> review [1].
>> 
>> The series uses a monotonic revision number, rather than wall-clock
>> time, for reasons related to Xapian's concurrent control and detailed
>> in the main commit's commit message.  The implementation isn't quite
>> useful from the CLI yet because I haven't added any way to query the
>> database's current revision number.  (I'm still thinking about how I
>> want to do this, since search/show don't have a good way to deliver
>> "additional" information right now.  I might just add the last
>> modification for each individual message/max of all messages in a
>> thread, similar to what Thomas Jost's patch did long ago.)
>> 
>> [1] id:1406859003-11561-1-git-send-email-amdragon@mit.edu
 
> this should allow me to do what I wish to accomplish. The message
> deletion is still a problem though, I can see two options at the moment:

Hi list,

While exploring the possibility of syncing maildir/X-keywords with tags
I had some thoughts about lastmod and message modification:

As briefly discussed on #notmuch, I noticed that it seems that 'notmuch
new' does not detect that a message source has been changed, unless the
file is also re-named.

This means that for instance if the X-Keywords fields have been updated
in a message (from GMail with offlineimap, synclabels = yes) the lastmod
field will remain unchanged, and a source modification will be
undetectable to a client program using this value.

Would it not make sense that if a message has a more recent mtime than
at index time it is re-indexed?

Also, for the lastmod branch I would wish for a notmuch_message_touch()
method where the lastmod time is updated to the last. As well as a
notmuch_database_reindex_message () - possibly defined/documented
behaviour for notmuch_database_add_message () when the filename is
already added (in which case I would expect notmuch to re-index the
message).

Doing notmuch_database_remove_message followed by _add_message could
risk deleting the entry if this file is the only on-disk-representation.

Cheers, Gaute


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] Add configurable changed tag to messages that have been changed on disk
  2014-09-22 12:06         ` Gaute Hope
@ 2014-09-22 15:33           ` Tomi Ollila
  2014-09-22 15:40           ` Austin Clements
  1 sibling, 0 replies; 27+ messages in thread
From: Tomi Ollila @ 2014-09-22 15:33 UTC (permalink / raw)
  To: Gaute Hope, notmuch

On Mon, Sep 22 2014, Gaute Hope <eg@gaute.vetsj.com> wrote:

> Excerpts from Gaute Hope's message of August 6, 2014 10:29:
>> Austin Clements <amdragon@MIT.EDU> wrote on Fri, 01 Aug 2014 14:55:05 -0400:
>>> I have a prototype implementation of message modification times on my
>>> lastmod-v1 branch at
>>> 
>>>   https://github.com/aclements/notmuch/tree/lastmod-v1
>>> 
>>> It builds on my database features series that's currently awaiting
>>> review [1].
>>> 
>>> The series uses a monotonic revision number, rather than wall-clock
>>> time, for reasons related to Xapian's concurrent control and detailed
>>> in the main commit's commit message.  The implementation isn't quite
>>> useful from the CLI yet because I haven't added any way to query the
>>> database's current revision number.  (I'm still thinking about how I
>>> want to do this, since search/show don't have a good way to deliver
>>> "additional" information right now.  I might just add the last
>>> modification for each individual message/max of all messages in a
>>> thread, similar to what Thomas Jost's patch did long ago.)
>>> 
>>> [1] id:1406859003-11561-1-git-send-email-amdragon@mit.edu
>  
>> this should allow me to do what I wish to accomplish. The message
>> deletion is still a problem though, I can see two options at the moment:
>
> Hi list,
>
> While exploring the possibility of syncing maildir/X-keywords with tags
> I had some thoughts about lastmod and message modification:
>
> As briefly discussed on #notmuch, I noticed that it seems that 'notmuch
> new' does not detect that a message source has been changed, unless the
> file is also re-named.
>
> This means that for instance if the X-Keywords fields have been updated
> in a message (from GMail with offlineimap, synclabels = yes) the lastmod
> field will remain unchanged, and a source modification will be
> undetectable to a client program using this value.
>
> Would it not make sense that if a message has a more recent mtime than
> at index time it is re-indexed?

That would require notmuch to scan the contents of a directory for changed
mtimes of the files -- now notmuch skips looking for the files unless
directory mtime has changed. Directory mtime changes when files are
added/deleted/renamed (as that is what the directory needs to know) -- file
mtime changes are stored in the file information, and therefore a change there
does not propagate to the parent directory (and, if it did, to its parent
and so on...)

That would mean the scanning would be slower than it is now.

Tomi

>
> Also, for the lastmod branch I would wish for a notmuch_message_touch()
> method where the lastmod time is updated to the last. As well as a
> notmuch_database_reindex_message () - possibly defined/documented
> behaviour for notmuch_database_add_message () when the filename is
> already added (in which case I would expect notmuch to re-index the
> message).
>
> Doing notmuch_database_remove_message followed by _add_message could
> risk deleting the entry if this file is the only on-disk-representation.
>
> Cheers, Gaute

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] Add configurable changed tag to messages that have been changed on disk
  2014-09-22 12:06         ` Gaute Hope
  2014-09-22 15:33           ` Tomi Ollila
@ 2014-09-22 15:40           ` Austin Clements
  2014-09-23  6:57             ` Gaute Hope
  1 sibling, 1 reply; 27+ messages in thread
From: Austin Clements @ 2014-09-22 15:40 UTC (permalink / raw)
  To: Gaute Hope, notmuch

On Mon, 22 Sep 2014, Gaute Hope <eg@gaute.vetsj.com> wrote:
> Excerpts from Gaute Hope's message of August 6, 2014 10:29:
>> Austin Clements <amdragon@MIT.EDU> wrote on Fri, 01 Aug 2014 14:55:05 -0400:
>>> I have a prototype implementation of message modification times on my
>>> lastmod-v1 branch at
>>> 
>>>   https://github.com/aclements/notmuch/tree/lastmod-v1
>>> 
>>> It builds on my database features series that's currently awaiting
>>> review [1].
>>> 
>>> The series uses a monotonic revision number, rather than wall-clock
>>> time, for reasons related to Xapian's concurrent control and detailed
>>> in the main commit's commit message.  The implementation isn't quite
>>> useful from the CLI yet because I haven't added any way to query the
>>> database's current revision number.  (I'm still thinking about how I
>>> want to do this, since search/show don't have a good way to deliver
>>> "additional" information right now.  I might just add the last
>>> modification for each individual message/max of all messages in a
>>> thread, similar to what Thomas Jost's patch did long ago.)
>>> 
>>> [1] id:1406859003-11561-1-git-send-email-amdragon@mit.edu
>  
>> this should allow me to do what I wish to accomplish. The message
>> deletion is still a problem though, I can see two options at the moment:
>
> Hi list,
>
> While exploring the possibility of syncing maildir/X-keywords with tags
> I had some thoughts about lastmod and message modification:
>
> As briefly discussed on #notmuch, I noticed that it seems that 'notmuch
> new' does not detect that a message source has been changed, unless the
> file is also re-named.
>
> This means that for instance if the X-Keywords fields have been updated
> in a message (from GMail with offlineimap, synclabels = yes) the lastmod
> field will remain unchanged, and a source modification will be
> undetectable to a client program using this value.
>
> Would it not make sense that if a message has a more recent mtime than
> at index time it is re-indexed?

This has the potential to make notmuch new substantially more expensive.
Currently, if there are no changes, it only has to stat each directory
in your maildir (in fact, some restructuring of new would let us
eliminate almost all database access during a no-op notmuch new as
well).  Checking for changes to individual messages would require
stat'ing every single message file as well as accessing the database to
check the paths and mtimes of every message, increasing the number of
stat calls and disk accesses by several orders of magnitude.

It may be that this is fast enough that it's okay, but it would be good
to gather some evidence first.  That includes hot and cold caches, and
maildir over NFS.

With respect to X-Keywords specifically, note that it's a fairly basic
design decision that notmuch never modifies message files.  This gives
us strong robustness guarantees we would be loath to part with.

It has puzzled me ever since offlineimap added X-Keywords why they
didn't just translate these keywords into folders and create hard links
of message files.  Anything could interact smoothly with that.

> Also, for the lastmod branch I would wish for a notmuch_message_touch()
> method where the lastmod time is updated to the last. As well as a
> notmuch_database_reindex_message () - possibly defined/documented
> behaviour for notmuch_database_add_message () when the filename is
> already added (in which case I would expect notmuch to re-index the
> message).

What's the use case for these?

> Doing notmuch_database_remove_message followed by _add_message could
> risk deleting the entry if this file is the only on-disk-representation.
>
> Cheers, Gaute

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] Add configurable changed tag to messages that have been changed on disk
  2014-09-22 15:40           ` Austin Clements
@ 2014-09-23  6:57             ` Gaute Hope
  0 siblings, 0 replies; 27+ messages in thread
From: Gaute Hope @ 2014-09-23  6:57 UTC (permalink / raw)
  To: Austin Clements; +Cc: notmuch

[-- Attachment #1: Type: text/plain, Size: 4447 bytes --]

22. sep. 2014 17:40 skrev "Austin Clements" <aclements@csail.mit.edu>
følgende:
>
> On Mon, 22 Sep 2014, Gaute Hope <eg@gaute.vetsj.com> wrote:
> > Excerpts from Gaute Hope's message of August 6, 2014 10:29:
> >> Austin Clements <amdragon@MIT.EDU> wrote on Fri, 01 Aug 2014 14:55:05 -0400:
> >>> I have a prototype implementation of message modification times on my
> >>> lastmod-v1 branch at
> >>>
> >>>   https://github.com/aclements/notmuch/tree/lastmod-v1
> >>>
> >>> It builds on my database features series that's currently awaiting
> >>> review [1].
> >>>
> >>> The series uses a monotonic revision number, rather than wall-clock
> >>> time, for reasons related to Xapian's concurrency control and detailed
> >>> in the main commit's commit message.  The implementation isn't quite
> >>> useful from the CLI yet because I haven't added any way to query the
> >>> database's current revision number.  (I'm still thinking about how I
> >>> want to do this, since search/show don't have a good way to deliver
> >>> "additional" information right now.  I might just add the last
> >>> modification for each individual message/max of all messages in a
> >>> thread, similar to what Thomas Jost's patch did long ago.)
> >>>
> >>> [1] id:1406859003-11561-1-git-send-email-amdragon@mit.edu
> >
> >> this should allow me to do what I wish to accomplish. The message
> >> deletion is still a problem though, I can see two options at the
> >> moment:
> >
> > Hi list,
> >
> > While exploring the possibility of syncing maildir/X-keywords with tags
> > I had some thoughts about lastmod and message modification:
> >
> > As briefly discussed on #notmuch, I noticed that it seems that 'notmuch
> > new' does not detect that a message source has been changed, unless the
> > file is also re-named.
> >
> > This means that for instance if the X-Keywords fields have been updated
> > in a message (from GMail with offlineimap, synclabels = yes) the lastmod
> > field will remain unchanged, and a source modification will be
> > undetectable to a client program using this value.
> >
> > Would it not make sense that if a message has a more recent mtime than
> > at index time it is re-indexed?
>
> This has the potential to make notmuch new substantially more expensive.
> Currently, if there are no changes, it only has to stat each directory
> in your maildir (in fact, some restructuring of new would let us
> eliminate almost all database access during a no-op notmuch new as
> well).  Checking for changes to individual messages would require
> stat'ing every single message file as well as accessing the database to
> check the paths and mtimes of every message, increasing the number of
> stat calls and disk accesses by several orders of magnitude.
>
> It may be that this is fast enough that it's okay, but it would be good
> to gather some evidence first.  That includes hot and cold caches, and
> maildir over NFS.
>
> With respect to X-Keywords specifically, note that it's a fairly basic
> design decision that notmuch never modifies message files.  This gives
> us strong robustness guarantees we would be loath to part with.
>
> It has puzzled me ever since offlineimap added X-Keywords why they
> didn't just translate these keywords into folders and create hard links
> of message files.  Anything could interact smoothly with that.

The information follows the message file. But, yeah, working directly on
the message source is hairy. Anyway, email is a mess in general. I
consider it user input.

>
> > Also, for the lastmod branch I would wish for a notmuch_message_touch()
> > method where the lastmod time is updated to the last. As well as a
> > notmuch_database_reindex_message () - possibly defined/documented
> > behaviour for notmuch_database_add_message () when the filename is
> > already added (in which case I would expect notmuch to re-index the
> > message).
>
> What's the use case for these?

These are for when you make a change to the message source and want it to
be reindexed: say, you edited a draft or changed a header field. I am not
asking that notmuch modify the message source.

_touch is for when, without making an actual change to the message, you
wish to indicate that it has been updated or synced at the current time,
for instance after a reindex that did not make any actual change. Perhaps
not strictly necessary.

Cheers, Gaute

[-- Attachment #2: Type: text/html, Size: 5692 bytes --]

^ permalink raw reply	[flat|nested] 27+ messages in thread

end of thread, other threads:[~2014-09-23  6:57 UTC | newest]

Thread overview: 27+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-04-06 16:11 [PATCH] Add configurable changed tag to messages that have been changed on disk Gaute Hope
2014-04-06 20:19 ` David Mazieres
2014-04-10 14:43   ` Gaute Hope
2014-04-10 15:31     ` dm-list-email-notmuch
2014-04-10 21:10       ` Gaute Hope
2014-04-22 22:05         ` David Bremner
2014-04-23  7:24           ` Gaute Hope
2014-04-23  9:00             ` David Mazieres
2014-04-23 11:53               ` Gaute Hope
2014-04-23 20:59               ` Austin Clements
2014-04-23 22:31                 ` dm-list-email-notmuch
2014-04-11 11:08       ` David Bremner
2014-04-11 16:03         ` dm-list-email-notmuch
2014-04-12 15:58           ` David Bremner
2014-05-03 14:01             ` Jani Nikula
2014-04-23 21:28   ` Austin Clements
2014-04-23 22:40     ` David Mazieres expires 2014-07-22 PDT
2014-07-03 10:42 ` David Bremner
2014-07-28 14:37   ` Gaute Hope
2014-08-01 18:55     ` Austin Clements
2014-08-02  0:49       ` Austin Clements
2014-08-06  9:02       ` Gaute Hope
2014-08-06 17:06         ` Austin Clements
     [not found]       ` <1407313144-astroid-0-vyhth1tcrd-3835@strange>
2014-09-22 12:06         ` Gaute Hope
2014-09-22 15:33           ` Tomi Ollila
2014-09-22 15:40           ` Austin Clements
2014-09-23  6:57             ` Gaute Hope

Code repositories for project(s) associated with this public inbox

	https://yhetil.org/notmuch.git/

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).