unofficial mirror of notmuch@notmuchmail.org
 help / color / mirror / code / Atom feed
From: Jani Nikula <jani@nikula.org>
To: notmuch@notmuchmail.org
Subject: [RFC PATCH 4/5] cli: change the data structure for notmuch address deduplication
Date: Sat, 29 Aug 2015 17:56:35 +0300	[thread overview]
Message-ID: <ffe5975fd070d3e8e22602b01ceb8bf0d7d47ae0.1440859765.git.jani@nikula.org> (raw)
In-Reply-To: <cover.1440859765.git.jani@nikula.org>
In-Reply-To: <cover.1440859765.git.jani@nikula.org>

Currently we key the address hash table with the case sensitive "name
<address>". Switch to case insensitive keying with just address, and
store the case sensitive name and address in linked lists. This will
be helpful in adding support different deduplication schemes in the
future. There will be a slight performance penalty for the current
full case sensitive name + address deduplication, but this is simpler
as a whole when other deduplication schemes are added, and I expect
the schemes to be added to become more popular than the current
default.
---
 notmuch-client.h |  1 +
 notmuch-search.c | 91 ++++++++++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 76 insertions(+), 16 deletions(-)

diff --git a/notmuch-client.h b/notmuch-client.h
index 882aa30563df..97d68d1158ac 100644
--- a/notmuch-client.h
+++ b/notmuch-client.h
@@ -48,6 +48,7 @@ typedef GMimeCryptoContext notmuch_crypto_context_t;
 #include <dirent.h>
 #include <errno.h>
 #include <signal.h>
+#include <ctype.h>
 
 #include "talloc-extra.h"
 
diff --git a/notmuch-search.c b/notmuch-search.c
index be8afcc0187b..60311393198d 100644
--- a/notmuch-search.c
+++ b/notmuch-search.c
@@ -258,30 +258,79 @@ static mailbox_t *new_mailbox (void *ctx, const char *name, const char *addr)
     return mailbox;
 }
 
+static int
+strcase_equal (const void *a, const void *b)
+{
+    return strcasecmp (a, b) == 0;
+}
+
+static unsigned int
+strcase_hash (const void *ptr)
+{
+    const char *s = ptr;
+
+    /* This is the djb2 hash. */
+    unsigned int hash = 5381;
+    while (s && *s) {
+	hash = ((hash << 5) + hash) + tolower (*s);
+	s++;
+    }
+
+    return hash;
+}
+
+static int mailbox_compare (const void *v1, const void *v2)
+{
+    const mailbox_t *m1 = v1, *m2 = v2;
+    int v;
+
+    if (m1->name && m2->name)
+	v = strcmp (m1->name, m2->name);
+    else
+	v = !!m1->name - !!m2->name;
+
+    if (! v)
+	v = strcmp (m1->addr, m2->addr);
+
+    return v;
+}
+
 /* Returns TRUE iff name and addr is duplicate. If not, stores the
  * name/addr pair in order to detect subsequent duplicates. */
 static notmuch_bool_t
 is_duplicate (const search_context_t *ctx, const char *name, const char *addr)
 {
     char *key;
+    GList *list, *l;
     mailbox_t *mailbox;
 
-    key = talloc_asprintf (ctx->format, "%s <%s>", name, addr);
-    if (! key)
+    mailbox = new_mailbox (ctx->format, name, addr);
+    if (! mailbox)
 	return FALSE;
 
-    mailbox = g_hash_table_lookup (ctx->addresses, key);
-    if (mailbox) {
-	mailbox->count++;
-	talloc_free (key);
-	return TRUE;
+    list = g_hash_table_lookup (ctx->addresses, addr);
+    if (list) {
+	l = g_list_find_custom (list, mailbox, mailbox_compare);
+	if (l) {
+	    talloc_free (mailbox);
+	    mailbox = l->data;
+	    mailbox->count++;
+	    return TRUE;
+	}
+
+	g_list_append (list, mailbox);
+	return FALSE;
     }
 
-    mailbox = new_mailbox (ctx->format, name, addr);
-    if (! mailbox)
+    key = talloc_strdup (ctx->format, addr);
+    if (! key)
 	return FALSE;
 
-    g_hash_table_insert (ctx->addresses, key, mailbox);
+    list = g_list_append (NULL, mailbox);
+    if (! list)
+	return FALSE;
+
+    g_hash_table_insert (ctx->addresses, key, list);
 
     return FALSE;
 }
@@ -393,12 +442,21 @@ _talloc_free_for_g_hash (void *ptr)
 }
 
 static void
-print_hash_value (unused (gpointer key), gpointer value, gpointer user_data)
+_list_free_for_g_hash (void *ptr)
+{
+    g_list_free_full (ptr, _talloc_free_for_g_hash);
+}
+
+static void
+print_list_value (void *mailbox, void *context)
 {
-    const mailbox_t *mailbox = value;
-    search_context_t *ctx = user_data;
+    print_mailbox (context, mailbox);
+}
 
-    print_mailbox (ctx, mailbox);
+static void
+print_hash_value (unused (void *key), void *list, void *context)
+{
+    g_list_foreach (list, print_list_value, context);
 }
 
 static int
@@ -778,8 +836,9 @@ notmuch_address_command (notmuch_config_t *config, int argc, char *argv[])
 				 argc - opt_index, argv + opt_index))
 	return EXIT_FAILURE;
 
-    ctx->addresses = g_hash_table_new_full (g_str_hash, g_str_equal,
-					    _talloc_free_for_g_hash, _talloc_free_for_g_hash);
+    ctx->addresses = g_hash_table_new_full (strcase_hash, strcase_equal,
+					    _talloc_free_for_g_hash,
+					    _list_free_for_g_hash);
 
     ret = do_search_messages (ctx);
 
-- 
2.1.4

  parent reply	other threads:[~2015-08-29 14:56 UTC|newest]

Thread overview: 15+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-08-29 14:56 [PATCH 0/5] cli: alternative address deduplication Jani Nikula
2015-08-29 14:56 ` [RFC PATCH 1/5] cli: g_hash_table_lookup_extended is overkill Jani Nikula
2015-08-29 14:56 ` [RFC PATCH 2/5] cli: abstract new mailbox creation Jani Nikula
2015-08-29 14:56 ` [RFC PATCH 3/5] cli: add support for not deduplicating notmuch address results Jani Nikula
2015-08-30  1:29   ` David Bremner
2015-08-30  7:33     ` Jani Nikula
2015-08-29 14:56 ` Jani Nikula [this message]
2015-08-30  1:48   ` [RFC PATCH 4/5] cli: change the data structure for notmuch address deduplication David Bremner
2015-08-30  7:45     ` Jani Nikula
2015-08-30 11:46       ` David Bremner
2015-08-30  7:47     ` Jani Nikula
2015-08-29 14:56 ` [RFC PATCH 5/5] cli: add support for deduplicating based on case insensitive address Jani Nikula
2015-08-30 12:06   ` David Bremner
2015-08-31 10:52     ` Tomi Ollila
2015-08-31 11:15       ` David Bremner

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://notmuchmail.org/

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=ffe5975fd070d3e8e22602b01ceb8bf0d7d47ae0.1440859765.git.jani@nikula.org \
    --to=jani@nikula.org \
    --cc=notmuch@notmuchmail.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://yhetil.org/notmuch.git/

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).