* [PATCH v3 1/6] cli: sanitize tabs and newlines to spaces in notmuch search
2014-02-03 19:51 [PATCH v3 0/6] lib: replace the message header parser with gmime Jani Nikula
@ 2014-02-03 19:51 ` Jani Nikula
2014-02-03 19:51 ` [PATCH v3 2/6] cli: refactor reply from guessing Jani Nikula
` (4 subsequent siblings)
5 siblings, 0 replies; 14+ messages in thread
From: Jani Nikula @ 2014-02-03 19:51 UTC (permalink / raw)
To: notmuch
Sanitize tabs and newlines to spaces rather than question marks in
--output=summary --format=text output.
This will also hide any difference in unfolding a header that has been
folded with a tab. Our own header parser replaces tabs with spaces,
while gmime would retain the tab.
---
notmuch-search.c | 4 +++-
test/T090-search-output.sh | 2 +-
2 files changed, 4 insertions(+), 2 deletions(-)
diff --git a/notmuch-search.c b/notmuch-search.c
index 91b5d10..0262eb3 100644
--- a/notmuch-search.c
+++ b/notmuch-search.c
@@ -41,7 +41,9 @@ sanitize_string (const void *ctx, const char *str)
loop = out = talloc_strdup (ctx, str);
for (; *loop; loop++) {
- if ((unsigned char)(*loop) < 32)
+ if (*loop == '\t' || *loop == '\n')
+ *loop = ' ';
+ else if ((unsigned char)(*loop) < 32)
*loop = '?';
}
return out;
diff --git a/test/T090-search-output.sh b/test/T090-search-output.sh
index 5ccfeaf..86544ac 100755
--- a/test/T090-search-output.sh
+++ b/test/T090-search-output.sh
@@ -388,7 +388,7 @@ add_message "[subject]='two =?ISO-8859-1?Q?line=0A_subject?=
headers'"
notmuch search id:"$gen_msg_id" | notmuch_search_sanitize >OUTPUT
cat <<EOF >EXPECTED
-thread:XXX 2001-01-05 [1/1] Notmuch Test Suite; two line? subject headers (inbox unread)
+thread:XXX 2001-01-05 [1/1] Notmuch Test Suite; two line subject headers (inbox unread)
EOF
test_expect_equal_file OUTPUT EXPECTED
--
1.8.5.2
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH v3 2/6] cli: refactor reply from guessing
2014-02-03 19:51 [PATCH v3 0/6] lib: replace the message header parser with gmime Jani Nikula
2014-02-03 19:51 ` [PATCH v3 1/6] cli: sanitize tabs and newlines to spaces in notmuch search Jani Nikula
@ 2014-02-03 19:51 ` Jani Nikula
2014-02-03 19:51 ` [PATCH v3 3/6] util: make sanitize string available in string util for reuse Jani Nikula
` (3 subsequent siblings)
5 siblings, 0 replies; 14+ messages in thread
From: Jani Nikula @ 2014-02-03 19:51 UTC (permalink / raw)
To: notmuch
The guess_from_received_header() function had grown quite big. Chop it
up into smaller functions.
No functional changes.
---
notmuch-reply.c | 198 ++++++++++++++++++++++++++++++++++----------------------
1 file changed, 122 insertions(+), 76 deletions(-)
diff --git a/notmuch-reply.c b/notmuch-reply.c
index 79cdc83..47993d2 100644
--- a/notmuch-reply.c
+++ b/notmuch-reply.c
@@ -369,78 +369,44 @@ add_recipients_from_message (GMimeMessage *reply,
return from_addr;
}
+/*
+ * Look for the user's address in " for <email@add.res>" in the
+ * received headers.
+ *
+ * Return the address that was found, if any, and NULL otherwise.
+ */
static const char *
-guess_from_received_header (notmuch_config_t *config, notmuch_message_t *message)
+guess_from_in_received_for (notmuch_config_t *config, const char *received)
{
- const char *addr, *received, *by;
- char *mta,*ptr,*token;
- char *domain=NULL;
- char *tld=NULL;
- const char *delim=". \t";
- size_t i;
-
- const char *to_headers[] = {
- "Envelope-to",
- "X-Original-To",
- "Delivered-To",
- };
-
- /* sadly, there is no standard way to find out to which email
- * address a mail was delivered - what is in the headers depends
- * on the MTAs used along the way. So we are trying a number of
- * heuristics which hopefully will answer this question.
-
- * We only got here if none of the users email addresses are in
- * the To: or Cc: header. From here we try the following in order:
- * 1) check for an Envelope-to: header
- * 2) check for an X-Original-To: header
- * 3) check for a Delivered-To: header
- * 4) check for a (for <email@add.res>) clause in Received: headers
- * 5) check for the domain part of known email addresses in the
- * 'by' part of Received headers
- * If none of these work, we give up and return NULL
- */
- for (i = 0; i < ARRAY_SIZE (to_headers); i++) {
- const char *tohdr = notmuch_message_get_header (message, to_headers[i]);
-
- /* Note: tohdr potentially contains a list of email addresses. */
- addr = user_address_in_string (tohdr, config);
- if (addr)
- return addr;
- }
+ const char *ptr;
- /* We get the concatenated Received: headers and search from the
- * front (last Received: header added) and try to extract from
- * them indications to which email address this message was
- * delivered.
- * The Received: header is special in our get_header function
- * and is always concatenated.
- */
- received = notmuch_message_get_header (message, "received");
- if (received == NULL)
+ ptr = strstr (received, " for ");
+ if (! ptr)
return NULL;
- /* First we look for a " for <email@add.res>" in the received
- * header
- */
- ptr = strstr (received, " for ");
+ return user_address_in_string (ptr, config);
+}
- /* Note: ptr potentially contains a list of email addresses. */
- addr = user_address_in_string (ptr, config);
- if (addr)
- return addr;
-
- /* Finally, we parse all the " by MTA ..." headers to guess the
- * email address that this was originally delivered to.
- * We extract just the MTA here by removing leading whitespace and
- * assuming that the MTA name ends at the next whitespace.
- * We test for *(by+4) to be non-'\0' to make sure there's
- * something there at all - and then assume that the first
- * whitespace delimited token that follows is the receiving
- * system in this step of the receive chain
- */
- by = received;
- while((by = strstr (by, " by ")) != NULL) {
+/*
+ * Parse all the " by MTA ..." parts in received headers to guess the
+ * email address that this was originally delivered to.
+ *
+ * Extract just the MTA here by removing leading whitespace and
+ * assuming that the MTA name ends at the next whitespace. Test for
+ * *(by+4) to be non-'\0' to make sure there's something there at all
+ * - and then assume that the first whitespace delimited token that
+ * follows is the receiving system in this step of the receive chain.
+ *
+ * Return the address that was found, if any, and NULL otherwise.
+ */
+static const char *
+guess_from_in_received_by (notmuch_config_t *config, const char *received)
+{
+ const char *addr;
+ const char *by = received;
+ char *domain, *tld, *mta, *ptr, *token;
+
+ while ((by = strstr (by, " by ")) != NULL) {
by += 4;
if (*by == '\0')
break;
@@ -450,11 +416,12 @@ guess_from_received_header (notmuch_config_t *config, notmuch_message_t *message
free (mta);
break;
}
- /* Now extract the last two components of the MTA host name
- * as domain and tld.
+ /*
+ * Now extract the last two components of the MTA host name as
+ * domain and tld.
*/
domain = tld = NULL;
- while ((ptr = strsep (&token, delim)) != NULL) {
+ while ((ptr = strsep (&token, ". \t")) != NULL) {
if (*ptr == '\0')
continue;
domain = tld;
@@ -462,13 +429,14 @@ guess_from_received_header (notmuch_config_t *config, notmuch_message_t *message
}
if (domain) {
- /* Recombine domain and tld and look for it among the configured
- * email addresses.
- * This time we have a known domain name and nothing else - so
- * the test is the other way around: we check if this is a
- * substring of one of the email addresses.
+ /*
+ * Recombine domain and tld and look for it among the
+ * configured email addresses. This time we have a known
+ * domain name and nothing else - so the test is the other
+ * way around: we check if this is a substring of one of
+ * the email addresses.
*/
- *(tld-1) = '.';
+ *(tld - 1) = '.';
addr = string_in_user_address (domain, config);
if (addr) {
@@ -482,6 +450,63 @@ guess_from_received_header (notmuch_config_t *config, notmuch_message_t *message
return NULL;
}
+/*
+ * Get the concatenated Received: headers and search from the front
+ * (last Received: header added) and try to extract from them
+ * indications to which email address this message was delivered.
+ *
+ * The Received: header is special in our get_header function and is
+ * always concatenated.
+ *
+ * Return the address that was found, if any, and NULL otherwise.
+ */
+static const char *
+guess_from_in_received_headers (notmuch_config_t *config,
+ notmuch_message_t *message)
+{
+ const char *received, *addr;
+
+ received = notmuch_message_get_header (message, "received");
+ if (! received)
+ return NULL;
+
+ addr = guess_from_in_received_for (config, received);
+ if (! addr)
+ addr = guess_from_in_received_by (config, received);
+
+ return addr;
+}
+
+/*
+ * Try to find user's email address in one of the extra To-like
+ * headers: Envelope-To, X-Original-To, and Delivered-To (searched in
+ * that order).
+ *
+ * Return the address that was found, if any, and NULL otherwise.
+ */
+static const char *
+get_from_in_to_headers (notmuch_config_t *config, notmuch_message_t *message)
+{
+ size_t i;
+ const char *tohdr, *addr;
+ const char *to_headers[] = {
+ "Envelope-to",
+ "X-Original-To",
+ "Delivered-To",
+ };
+
+ for (i = 0; i < ARRAY_SIZE (to_headers); i++) {
+ tohdr = notmuch_message_get_header (message, to_headers[i]);
+
+ /* Note: tohdr potentially contains a list of email addresses. */
+ addr = user_address_in_string (tohdr, config);
+ if (addr)
+ return addr;
+ }
+
+ return NULL;
+}
+
static GMimeMessage *
create_reply_message(void *ctx,
notmuch_config_t *config,
@@ -508,9 +533,30 @@ create_reply_message(void *ctx,
from_addr = add_recipients_from_message (reply, config,
message, reply_all);
+ /*
+ * Sadly, there is no standard way to find out to which email
+ * address a mail was delivered - what is in the headers depends
+ * on the MTAs used along the way.
+ *
+ * If none of the user's email addresses are in the To: or Cc:
+ * headers, we try a number of heuristics which hopefully will
+ * answer this question.
+ *
+ * First, check for Envelope-To:, X-Original-To:, and
+ * Delivered-To: headers.
+ */
+ if (from_addr == NULL)
+ from_addr = get_from_in_to_headers (config, message);
+
+ /*
+ * Check for a (for <email@add.res>) clause in Received: headers,
+ * and the domain part of known email addresses in the 'by' part
+ * of Received: headers
+ */
if (from_addr == NULL)
- from_addr = guess_from_received_header (config, message);
+ from_addr = guess_from_in_received_headers (config, message);
+ /* Default to user's primary address. */
if (from_addr == NULL)
from_addr = notmuch_config_get_user_primary_email (config);
--
1.8.5.2
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH v3 3/6] util: make sanitize string available in string util for reuse
2014-02-03 19:51 [PATCH v3 0/6] lib: replace the message header parser with gmime Jani Nikula
2014-02-03 19:51 ` [PATCH v3 1/6] cli: sanitize tabs and newlines to spaces in notmuch search Jani Nikula
2014-02-03 19:51 ` [PATCH v3 2/6] cli: refactor reply from guessing Jani Nikula
@ 2014-02-03 19:51 ` Jani Nikula
2014-03-08 11:32 ` David Bremner
2014-02-03 19:51 ` [PATCH v3 4/6] cli: sanitize the received header before scanning for replies Jani Nikula
` (2 subsequent siblings)
5 siblings, 1 reply; 14+ messages in thread
From: Jani Nikula @ 2014-02-03 19:51 UTC (permalink / raw)
To: notmuch
No functional changes.
---
notmuch-search.c | 19 -------------------
util/string-util.c | 22 ++++++++++++++++++++++
util/string-util.h | 7 +++++++
3 files changed, 29 insertions(+), 19 deletions(-)
diff --git a/notmuch-search.c b/notmuch-search.c
index 0262eb3..bc9be45 100644
--- a/notmuch-search.c
+++ b/notmuch-search.c
@@ -30,25 +30,6 @@ typedef enum {
OUTPUT_TAGS
} output_t;
-static char *
-sanitize_string (const void *ctx, const char *str)
-{
- char *out, *loop;
-
- if (NULL == str)
- return NULL;
-
- loop = out = talloc_strdup (ctx, str);
-
- for (; *loop; loop++) {
- if (*loop == '\t' || *loop == '\n')
- *loop = ' ';
- else if ((unsigned char)(*loop) < 32)
- *loop = '?';
- }
- return out;
-}
-
/* Return two stable query strings that identify exactly the matched
* and unmatched messages currently in thread. If there are no
* matched or unmatched messages, the returned buffers will be
diff --git a/util/string-util.c b/util/string-util.c
index a5622d7..9e2f728 100644
--- a/util/string-util.c
+++ b/util/string-util.c
@@ -37,6 +37,28 @@ strtok_len (char *s, const char *delim, size_t *len)
return *len ? s : NULL;
}
+char *
+sanitize_string (const void *ctx, const char *str)
+{
+ char *out, *loop;
+
+ if (! str)
+ return NULL;
+
+ out = talloc_strdup (ctx, str);
+ if (! out)
+ return NULL;
+
+ for (loop = out; *loop; loop++) {
+ if (*loop == '\t' || *loop == '\n')
+ *loop = ' ';
+ else if ((unsigned char)(*loop) < 32)
+ *loop = '?';
+ }
+
+ return out;
+}
+
static int
is_unquoted_terminator (unsigned char c)
{
diff --git a/util/string-util.h b/util/string-util.h
index 0194607..8a3ad19 100644
--- a/util/string-util.h
+++ b/util/string-util.h
@@ -19,6 +19,13 @@
char *strtok_len (char *s, const char *delim, size_t *len);
+/* Return a talloced string with str sanitized.
+ *
+ * Whitespace characters (tabs and newlines) are replaced with spaces,
+ * non-printable characters with question marks.
+ */
+char *sanitize_string (const void *ctx, const char *str);
+
/* Construct a boolean term query with the specified prefix (e.g.,
* "id") and search term, quoting term as necessary. Specifically, if
* term contains any non-printable ASCII characters, non-ASCII
--
1.8.5.2
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH v3 4/6] cli: sanitize the received header before scanning for replies
2014-02-03 19:51 [PATCH v3 0/6] lib: replace the message header parser with gmime Jani Nikula
` (2 preceding siblings ...)
2014-02-03 19:51 ` [PATCH v3 3/6] util: make sanitize string available in string util for reuse Jani Nikula
@ 2014-02-03 19:51 ` Jani Nikula
2014-02-03 20:46 ` Austin Clements
2014-02-03 19:51 ` [PATCH v3 5/6] lib: replace the header parser with gmime Jani Nikula
2014-02-03 19:51 ` [PATCH v3 6/6] lib: parse messages only once Jani Nikula
5 siblings, 1 reply; 14+ messages in thread
From: Jani Nikula @ 2014-02-03 19:51 UTC (permalink / raw)
To: notmuch
This makes the from guessing agnostic to header folding by spaces or
tabs.
---
notmuch-reply.c | 12 ++++++++++--
1 file changed, 10 insertions(+), 2 deletions(-)
diff --git a/notmuch-reply.c b/notmuch-reply.c
index 47993d2..3f7021e 100644
--- a/notmuch-reply.c
+++ b/notmuch-reply.c
@@ -21,6 +21,7 @@
*/
#include "notmuch-client.h"
+#include "string-util.h"
#include "sprinter.h"
static void
@@ -465,14 +466,21 @@ guess_from_in_received_headers (notmuch_config_t *config,
notmuch_message_t *message)
{
const char *received, *addr;
+ char *sanitized;
received = notmuch_message_get_header (message, "received");
if (! received)
return NULL;
- addr = guess_from_in_received_for (config, received);
+ sanitized = sanitize_string (config, received);
+ if (! sanitized)
+ return NULL;
+
+ addr = guess_from_in_received_for (config, sanitized);
if (! addr)
- addr = guess_from_in_received_by (config, received);
+ addr = guess_from_in_received_by (config, sanitized);
+
+ talloc_free (sanitized);
return addr;
}
--
1.8.5.2
^ permalink raw reply related [flat|nested] 14+ messages in thread
* Re: [PATCH v3 4/6] cli: sanitize the received header before scanning for replies
2014-02-03 19:51 ` [PATCH v3 4/6] cli: sanitize the received header before scanning for replies Jani Nikula
@ 2014-02-03 20:46 ` Austin Clements
2014-03-19 16:44 ` [PATCH] " Jani Nikula
0 siblings, 1 reply; 14+ messages in thread
From: Austin Clements @ 2014-02-03 20:46 UTC (permalink / raw)
To: Jani Nikula; +Cc: notmuch
Quoth Jani Nikula on Feb 03 at 9:51 pm:
> This makes the from guessing agnostic to header folding by spaces or
> tabs.
> ---
> notmuch-reply.c | 12 ++++++++++--
> 1 file changed, 10 insertions(+), 2 deletions(-)
>
> diff --git a/notmuch-reply.c b/notmuch-reply.c
> index 47993d2..3f7021e 100644
> --- a/notmuch-reply.c
> +++ b/notmuch-reply.c
> @@ -21,6 +21,7 @@
> */
>
> #include "notmuch-client.h"
> +#include "string-util.h"
> #include "sprinter.h"
>
> static void
> @@ -465,14 +466,21 @@ guess_from_in_received_headers (notmuch_config_t *config,
> notmuch_message_t *message)
> {
> const char *received, *addr;
> + char *sanitized;
>
> received = notmuch_message_get_header (message, "received");
> if (! received)
> return NULL;
>
> - addr = guess_from_in_received_for (config, received);
> + sanitized = sanitize_string (config, received);
Did you mean to pass "config" as the talloc context for
sanitize_string? It seems like a better context would be "message" or
possibly even NULL, given that you explicitly talloc_free the string.
> + if (! sanitized)
> + return NULL;
> +
> + addr = guess_from_in_received_for (config, sanitized);
> if (! addr)
> - addr = guess_from_in_received_by (config, received);
> + addr = guess_from_in_received_by (config, sanitized);
> +
> + talloc_free (sanitized);
>
> return addr;
> }
^ permalink raw reply [flat|nested] 14+ messages in thread
* [PATCH] cli: sanitize the received header before scanning for replies
2014-02-03 20:46 ` Austin Clements
@ 2014-03-19 16:44 ` Jani Nikula
2014-03-26 0:33 ` David Bremner
0 siblings, 1 reply; 14+ messages in thread
From: Jani Nikula @ 2014-03-19 16:44 UTC (permalink / raw)
To: notmuch
This makes the from guessing agnostic to header folding by spaces or
tabs.
---
I haven't had the time to update the rest of the series, but get the
prep patch out of the way.
---
notmuch-reply.c | 12 ++++++++++--
1 file changed, 10 insertions(+), 2 deletions(-)
diff --git a/notmuch-reply.c b/notmuch-reply.c
index 47993d223090..7c1c80959ed6 100644
--- a/notmuch-reply.c
+++ b/notmuch-reply.c
@@ -21,6 +21,7 @@
*/
#include "notmuch-client.h"
+#include "string-util.h"
#include "sprinter.h"
static void
@@ -465,14 +466,21 @@ guess_from_in_received_headers (notmuch_config_t *config,
notmuch_message_t *message)
{
const char *received, *addr;
+ char *sanitized;
received = notmuch_message_get_header (message, "received");
if (! received)
return NULL;
- addr = guess_from_in_received_for (config, received);
+ sanitized = sanitize_string (NULL, received);
+ if (! sanitized)
+ return NULL;
+
+ addr = guess_from_in_received_for (config, sanitized);
if (! addr)
- addr = guess_from_in_received_by (config, received);
+ addr = guess_from_in_received_by (config, sanitized);
+
+ talloc_free (sanitized);
return addr;
}
--
1.9.0
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH v3 5/6] lib: replace the header parser with gmime
2014-02-03 19:51 [PATCH v3 0/6] lib: replace the message header parser with gmime Jani Nikula
` (3 preceding siblings ...)
2014-02-03 19:51 ` [PATCH v3 4/6] cli: sanitize the received header before scanning for replies Jani Nikula
@ 2014-02-03 19:51 ` Jani Nikula
2014-02-03 21:31 ` Austin Clements
2014-02-03 19:51 ` [PATCH v3 6/6] lib: parse messages only once Jani Nikula
5 siblings, 1 reply; 14+ messages in thread
From: Jani Nikula @ 2014-02-03 19:51 UTC (permalink / raw)
To: notmuch
The notmuch library includes a full blown message header parser. Yet
the same message headers are parsed by gmime during indexing. Switch
to gmime parsing completely.
These are the main differences between the parsers:
* Gmime stops header parsing at the first invalid header, and presumes
the message body starts from there. The current parser is quite
liberal in accepting broken headers. The change means we will be
much pickier about accepting invalid messages. (Note that the
headers after the invalid header would not have been indexed before
this change anyway, as we use gmime for that.)
* The current parser converts tabs used in header folding to
spaces. Gmime preserve the tabs. Due to a broken python library used
in mailman, there are plenty of mailing lists that produce headers
with tabs in header folding, and we'll see plenty of tabs. (This
change has been mitigated in preparatory patches.)
* For pure header parsing, the current parser is likely faster than
gmime, which parses the whole message rather than just the
headers. Since we use gmime for indexing anyway, we can drop an
extra header parsing round (this is done in a follow-up patch).
At this step, we only switch the header parsing to gmime.
---
lib/database.cc | 4 +
lib/index.cc | 11 --
lib/message-file.c | 349 ++++++++++++++++----------------------------------
lib/message.cc | 6 +
lib/notmuch-private.h | 4 +
5 files changed, 125 insertions(+), 249 deletions(-)
diff --git a/lib/database.cc b/lib/database.cc
index f395061..d1bea88 100644
--- a/lib/database.cc
+++ b/lib/database.cc
@@ -1940,6 +1940,10 @@ notmuch_database_add_message (notmuch_database_t *notmuch,
"to",
(char *) NULL);
+ ret = notmuch_message_file_parse (message_file);
+ if (ret)
+ goto DONE;
+
try {
/* Before we do any real work, (especially before doing a
* potential SHA-1 computation on the entire file's contents),
diff --git a/lib/index.cc b/lib/index.cc
index 78c18cf..976e49f 100644
--- a/lib/index.cc
+++ b/lib/index.cc
@@ -437,7 +437,6 @@ _notmuch_message_index_file (notmuch_message_t *message,
static int initialized = 0;
char from_buf[5];
bool is_mbox = false;
- static bool mbox_warning = false;
if (! initialized) {
g_mime_init (GMIME_ENABLE_RFC2047_WORKAROUNDS);
@@ -471,16 +470,6 @@ _notmuch_message_index_file (notmuch_message_t *message,
ret = NOTMUCH_STATUS_FILE_NOT_EMAIL;
goto DONE;
}
- /* For historical reasons, we support single-message mboxes,
- * but this behavior is likely to change in the future, so
- * warn. */
- if (!mbox_warning) {
- mbox_warning = true;
- fprintf (stderr, "\
-Warning: %s is an mbox containing a single message,\n\
-likely caused by misconfigured mail delivery. Support for single-message\n\
-mboxes is deprecated and may be removed in the future.\n", filename);
- }
}
from = g_mime_message_get_sender (mime_message);
diff --git a/lib/message-file.c b/lib/message-file.c
index a2850c2..33f6468 100644
--- a/lib/message-file.c
+++ b/lib/message-file.c
@@ -26,30 +26,19 @@
#include <glib.h> /* GHashTable */
-typedef struct {
- char *str;
- size_t size;
- size_t len;
-} header_value_closure_t;
-
struct _notmuch_message_file {
/* File object */
FILE *file;
+ char *filename;
/* Header storage */
int restrict_headers;
GHashTable *headers;
- int broken_headers;
- int good_headers;
- size_t header_size; /* Length of full message header in bytes. */
-
- /* Parsing state */
- char *line;
- size_t line_size;
- header_value_closure_t value;
- int parsing_started;
- int parsing_finished;
+ GMimeStream *stream;
+ GMimeParser *parser;
+ GMimeMessage *message;
+ notmuch_bool_t parsed;
};
static int
@@ -76,16 +65,18 @@ strcase_hash (const void *ptr)
static int
_notmuch_message_file_destructor (notmuch_message_file_t *message)
{
- if (message->line)
- free (message->line);
-
- if (message->value.size)
- free (message->value.str);
-
if (message->headers)
g_hash_table_destroy (message->headers);
- if (message->file)
+ if (message->message)
+ g_object_unref (message->message);
+
+ if (message->parser)
+ g_object_unref (message->parser);
+
+ if (message->stream)
+ g_object_unref (message->stream);
+ else if (message->file) /* GMimeStream takes over the FILE* */
fclose (message->file);
return 0;
@@ -102,19 +93,21 @@ _notmuch_message_file_open_ctx (void *ctx, const char *filename)
if (unlikely (message == NULL))
return NULL;
+ /* only needed for error messages during parsing */
+ message->filename = talloc_strdup (message, filename);
+ if (message->filename == NULL)
+ goto FAIL;
+
talloc_set_destructor (message, _notmuch_message_file_destructor);
message->file = fopen (filename, "r");
if (message->file == NULL)
goto FAIL;
- message->headers = g_hash_table_new_full (strcase_hash,
- strcase_equal,
- free,
- g_free);
-
- message->parsing_started = 0;
- message->parsing_finished = 0;
+ message->headers = g_hash_table_new_full (strcase_hash, strcase_equal,
+ free, g_free);
+ if (message->headers == NULL)
+ goto FAIL;
return message;
@@ -143,7 +136,7 @@ notmuch_message_file_restrict_headersv (notmuch_message_file_t *message,
{
char *header;
- if (message->parsing_started)
+ if (message->parsed)
INTERNAL_ERROR ("notmuch_message_file_restrict_headers called after parsing has started");
while (1) {
@@ -167,234 +160,114 @@ notmuch_message_file_restrict_headers (notmuch_message_file_t *message, ...)
notmuch_message_file_restrict_headersv (message, va_headers);
}
-static void
-copy_header_unfolding (header_value_closure_t *value,
- const char *chunk)
+/*
+ * gmime does not provide access to all Received: headers the way we
+ * want, so we'll to use the parser header callback to gather them
+ * into a hash table.
+ */
+static void header_cb (unused(GMimeParser *parser), const char *header,
+ const char *value, unused(gint64 offset),
+ gpointer user_data)
{
- char *last;
+ notmuch_message_file_t *message = (notmuch_message_file_t *) user_data;
+ char *existing = NULL;
+ notmuch_bool_t found;
- if (chunk == NULL)
+ found = g_hash_table_lookup_extended (message->headers, header,
+ NULL, (gpointer *) &existing);
+ if (! found && message->restrict_headers)
return;
- while (*chunk == ' ' || *chunk == '\t')
- chunk++;
-
- if (value->len + 1 + strlen (chunk) + 1 > value->size) {
- unsigned int new_size = value->size;
- if (value->size == 0)
- new_size = strlen (chunk) + 1;
- else
- while (value->len + 1 + strlen (chunk) + 1 > new_size)
- new_size *= 2;
- value->str = xrealloc (value->str, new_size);
- value->size = new_size;
- }
-
- last = value->str + value->len;
- if (value->len) {
- *last = ' ';
- last++;
- value->len++;
- }
-
- strcpy (last, chunk);
- value->len += strlen (chunk);
-
- last = value->str + value->len - 1;
- if (*last == '\n') {
- *last = '\0';
- value->len--;
+ if (existing == NULL) {
+ /* No value, add one */
+ char *decoded = g_mime_utils_header_decode_text (value);
+ g_hash_table_insert (message->headers, xstrdup (header), decoded);
+ } else if (strcasecmp (header, "received") == 0) {
+ /* Concat all of the Received: headers we encounter. */
+ char *combined, *decoded;
+ size_t combined_size;
+
+ decoded = g_mime_utils_header_decode_text (value);
+
+ combined_size = strlen(existing) + strlen(decoded) + 2;
+ combined = g_malloc (combined_size);
+ snprintf (combined, combined_size, "%s %s", existing, decoded);
+ g_free (decoded);
+ g_hash_table_insert (message->headers, xstrdup (header), combined);
}
}
-/* As a special-case, a value of NULL for header_desired will force
- * the entire header to be parsed if it is not parsed already. This is
- * used by the _notmuch_message_file_get_headers_end function.
- * Another special case is the Received: header. For this header we
- * want to concatenate all instances of the header instead of just
- * hashing the first instance as we use this when analyzing the path
- * the mail has taken from sender to recipient.
- */
-const char *
-notmuch_message_file_get_header (notmuch_message_file_t *message,
- const char *header_desired)
+notmuch_status_t
+notmuch_message_file_parse (notmuch_message_file_t *message)
{
- int contains;
- char *header, *decoded_value, *header_sofar, *combined_header;
- const char *s, *colon;
- int match, newhdr, hdrsofar, is_received;
static int initialized = 0;
-
- is_received = (strcmp(header_desired,"received") == 0);
+ char from_buf[5];
+ notmuch_bool_t is_mbox = FALSE;
+ static notmuch_bool_t mbox_warning = FALSE;
if (! initialized) {
g_mime_init (GMIME_ENABLE_RFC2047_WORKAROUNDS);
initialized = 1;
}
- message->parsing_started = 1;
-
- if (header_desired == NULL)
- contains = 0;
- else
- contains = g_hash_table_lookup_extended (message->headers,
- header_desired, NULL,
- (gpointer *) &decoded_value);
-
- if (contains && decoded_value)
- return decoded_value;
-
- if (message->parsing_finished)
- return "";
-
-#define NEXT_HEADER_LINE(closure) \
- while (1) { \
- ssize_t bytes_read = getline (&message->line, \
- &message->line_size, \
- message->file); \
- if (bytes_read == -1) { \
- message->parsing_finished = 1; \
- break; \
- } \
- if (*message->line == '\n') { \
- message->parsing_finished = 1; \
- break; \
- } \
- if (closure && \
- (*message->line == ' ' || *message->line == '\t')) \
- { \
- copy_header_unfolding ((closure), message->line); \
- } \
- if (*message->line == ' ' || *message->line == '\t') \
- message->header_size += strlen (message->line); \
- else \
- break; \
- }
-
- if (message->line == NULL)
- NEXT_HEADER_LINE (NULL);
-
- while (1) {
-
- if (message->parsing_finished)
- break;
-
- colon = strchr (message->line, ':');
-
- if (colon == NULL) {
- message->broken_headers++;
- /* A simple heuristic for giving up on things that just
- * don't look like mail messages. */
- if (message->broken_headers >= 10 &&
- message->good_headers < 5)
- {
- message->parsing_finished = 1;
- break;
- }
- NEXT_HEADER_LINE (NULL);
- continue;
+ /* Is this mbox? */
+ if (fread (from_buf, sizeof (from_buf), 1, message->file) == 1 &&
+ strncmp (from_buf, "From ", 5) == 0)
+ is_mbox = TRUE;
+ rewind (message->file);
+
+ message->stream = g_mime_stream_file_new (message->file);
+ message->parser = g_mime_parser_new_with_stream (message->stream);
+ g_mime_parser_set_scan_from (message->parser, is_mbox);
+ g_mime_parser_set_header_regex (message->parser, ".*", header_cb,
+ (gpointer) message);
+
+ message->message = g_mime_parser_construct_message (message->parser);
+ if (! message->message)
+ return NOTMUCH_STATUS_FILE_NOT_EMAIL;
+
+ if (is_mbox) {
+ if (! g_mime_parser_eos (message->parser)) {
+ /* This is a multi-message mbox. */
+ return NOTMUCH_STATUS_FILE_NOT_EMAIL;
}
-
- message->header_size += strlen (message->line);
-
- message->good_headers++;
-
- header = xstrndup (message->line, colon - message->line);
-
- if (message->restrict_headers &&
- ! g_hash_table_lookup_extended (message->headers,
- header, NULL, NULL))
- {
- free (header);
- NEXT_HEADER_LINE (NULL);
- continue;
- }
-
- s = colon + 1;
- while (*s == ' ' || *s == '\t')
- s++;
-
- message->value.len = 0;
- copy_header_unfolding (&message->value, s);
-
- NEXT_HEADER_LINE (&message->value);
-
- if (header_desired == NULL)
- match = 0;
- else
- match = (strcasecmp (header, header_desired) == 0);
-
- decoded_value = g_mime_utils_header_decode_text (message->value.str);
- header_sofar = (char *)g_hash_table_lookup (message->headers, header);
- /* we treat the Received: header special - we want to concat ALL of
- * the Received: headers we encounter.
- * for everything else we return the first instance of a header */
- if (strcasecmp(header, "received") == 0) {
- if (header_sofar == NULL) {
- /* first Received: header we encountered; just add it */
- g_hash_table_insert (message->headers, header, decoded_value);
- } else {
- /* we need to add the header to those we already collected */
- newhdr = strlen(decoded_value);
- hdrsofar = strlen(header_sofar);
- combined_header = g_malloc(hdrsofar + newhdr + 2);
- strncpy(combined_header,header_sofar,hdrsofar);
- *(combined_header+hdrsofar) = ' ';
- strncpy(combined_header+hdrsofar+1,decoded_value,newhdr+1);
- g_free (decoded_value);
- g_hash_table_insert (message->headers, header, combined_header);
- }
- } else {
- if (header_sofar == NULL) {
- /* Only insert if we don't have a value for this header, yet. */
- g_hash_table_insert (message->headers, header, decoded_value);
- } else {
- free (header);
- g_free (decoded_value);
- decoded_value = header_sofar;
- }
+ /*
+ * For historical reasons, we support single-message mboxes,
+ * but this behavior is likely to change in the future, so
+ * warn.
+ */
+ if (! mbox_warning) {
+ mbox_warning = TRUE;
+ fprintf (stderr, "\
+Warning: %s is an mbox containing a single message,\n\
+likely caused by misconfigured mail delivery. Support for single-message\n\
+mboxes is deprecated and may be removed in the future.\n", message->filename);
}
- /* if we found a match we can bail - unless of course we are
- * collecting all the Received: headers */
- if (match && !is_received)
- return decoded_value;
}
- if (message->parsing_finished) {
- fclose (message->file);
- message->file = NULL;
- }
+ message->parsed = TRUE;
- if (message->line)
- free (message->line);
- message->line = NULL;
+ return NOTMUCH_STATUS_SUCCESS;
+}
- if (message->value.size) {
- free (message->value.str);
- message->value.str = NULL;
- message->value.size = 0;
- message->value.len = 0;
- }
+/* return NULL on errors, empty string for non-existing headers */
+const char *
+notmuch_message_file_get_header (notmuch_message_file_t *message,
+ const char *header)
+{
+ const char *value = NULL;
+ notmuch_bool_t found;
- /* For the Received: header we actually might end up here even
- * though we found the header (as we force continued parsing
- * in that case). So let's check if that's the header we were
- * looking for and return the value that we found (if any)
- */
- if (is_received)
- return (char *)g_hash_table_lookup (message->headers, "received");
-
- /* We've parsed all headers and never found the one we're looking
- * for. It's probably just not there, but let's check that we
- * didn't make a mistake preventing us from seeing it. */
- if (message->restrict_headers && header_desired &&
- ! g_hash_table_lookup_extended (message->headers,
- header_desired, NULL, NULL))
- {
- INTERNAL_ERROR ("Attempt to get header \"%s\" which was not\n"
- "included in call to notmuch_message_file_restrict_headers\n",
- header_desired);
- }
+ /* the caller shouldn't ask for headers before parsing */
+ if (! message->parsed)
+ return NULL;
+
+ found = g_hash_table_lookup_extended (message->headers, header,
+ NULL, (gpointer *) &value);
+
+ /* the caller shouldn't ask for non-restricted headers */
+ if (! found && message->restrict_headers)
+ return NULL;
- return "";
+ return value ? value : "";
}
diff --git a/lib/message.cc b/lib/message.cc
index c91f3a5..9a22d36 100644
--- a/lib/message.cc
+++ b/lib/message.cc
@@ -407,6 +407,12 @@ _notmuch_message_ensure_message_file (notmuch_message_t *message)
return;
message->message_file = _notmuch_message_file_open_ctx (message, filename);
+
+ /* XXX: better return value handling */
+ if (notmuch_message_file_parse (message->message_file)) {
+ notmuch_message_file_close (message->message_file);
+ message->message_file = NULL;
+ }
}
const char *
diff --git a/lib/notmuch-private.h b/lib/notmuch-private.h
index af185c7..7277df1 100644
--- a/lib/notmuch-private.h
+++ b/lib/notmuch-private.h
@@ -345,6 +345,10 @@ notmuch_message_file_open (const char *filename);
notmuch_message_file_t *
_notmuch_message_file_open_ctx (void *ctx, const char *filename);
+/* Parse the message */
+notmuch_status_t
+notmuch_message_file_parse (notmuch_message_file_t *message);
+
/* Close a notmuch message previously opened with notmuch_message_open. */
void
notmuch_message_file_close (notmuch_message_file_t *message);
--
1.8.5.2
^ permalink raw reply related [flat|nested] 14+ messages in thread
* Re: [PATCH v3 5/6] lib: replace the header parser with gmime
2014-02-03 19:51 ` [PATCH v3 5/6] lib: replace the header parser with gmime Jani Nikula
@ 2014-02-03 21:31 ` Austin Clements
0 siblings, 0 replies; 14+ messages in thread
From: Austin Clements @ 2014-02-03 21:31 UTC (permalink / raw)
To: Jani Nikula; +Cc: notmuch
Quoth Jani Nikula on Feb 03 at 9:51 pm:
> The notmuch library includes a full blown message header parser. Yet
> the same message headers are parsed by gmime during indexing. Switch
> to gmime parsing completely.
>
> These are the main differences between the parsers:
>
> * Gmime stops header parsing at the first invalid header, and presumes
> the message body starts from there. The current parser is quite
> liberal in accepting broken headers. The change means we will be
> much pickier about accepting invalid messages. (Note that the
> headers after the invalid header would not have been indexed before
> this change anyway, as we use gmime for that.)
>
> * The current parser converts tabs used in header folding to
> spaces. Gmime preserve the tabs. Due to a broken python library used
> in mailman, there are plenty of mailing lists that produce headers
> with tabs in header folding, and we'll see plenty of tabs. (This
> change has been mitigated in preparatory patches.)
>
> * For pure header parsing, the current parser is likely faster than
> gmime, which parses the whole message rather than just the
> headers. Since we use gmime for indexing anyway, we can drop an
> extra header parsing round (this is done in a follow-up patch).
>
> At this step, we only switch the header parsing to gmime.
> ---
> lib/database.cc | 4 +
> lib/index.cc | 11 --
> lib/message-file.c | 349 ++++++++++++++++----------------------------------
> lib/message.cc | 6 +
> lib/notmuch-private.h | 4 +
> 5 files changed, 125 insertions(+), 249 deletions(-)
>
> diff --git a/lib/database.cc b/lib/database.cc
> index f395061..d1bea88 100644
> --- a/lib/database.cc
> +++ b/lib/database.cc
> @@ -1940,6 +1940,10 @@ notmuch_database_add_message (notmuch_database_t *notmuch,
> "to",
> (char *) NULL);
>
> + ret = notmuch_message_file_parse (message_file);
> + if (ret)
> + goto DONE;
> +
> try {
> /* Before we do any real work, (especially before doing a
> * potential SHA-1 computation on the entire file's contents),
> diff --git a/lib/index.cc b/lib/index.cc
> index 78c18cf..976e49f 100644
> --- a/lib/index.cc
> +++ b/lib/index.cc
> @@ -437,7 +437,6 @@ _notmuch_message_index_file (notmuch_message_t *message,
> static int initialized = 0;
> char from_buf[5];
> bool is_mbox = false;
> - static bool mbox_warning = false;
>
> if (! initialized) {
> g_mime_init (GMIME_ENABLE_RFC2047_WORKAROUNDS);
> @@ -471,16 +470,6 @@ _notmuch_message_index_file (notmuch_message_t *message,
> ret = NOTMUCH_STATUS_FILE_NOT_EMAIL;
> goto DONE;
> }
> - /* For historical reasons, we support single-message mboxes,
> - * but this behavior is likely to change in the future, so
> - * warn. */
> - if (!mbox_warning) {
> - mbox_warning = true;
> - fprintf (stderr, "\
> -Warning: %s is an mbox containing a single message,\n\
> -likely caused by misconfigured mail delivery. Support for single-message\n\
> -mboxes is deprecated and may be removed in the future.\n", filename);
> - }
> }
>
> from = g_mime_message_get_sender (mime_message);
> diff --git a/lib/message-file.c b/lib/message-file.c
> index a2850c2..33f6468 100644
> --- a/lib/message-file.c
> +++ b/lib/message-file.c
> @@ -26,30 +26,19 @@
>
> #include <glib.h> /* GHashTable */
>
> -typedef struct {
> - char *str;
> - size_t size;
> - size_t len;
> -} header_value_closure_t;
> -
> struct _notmuch_message_file {
> /* File object */
> FILE *file;
> + char *filename;
>
> /* Header storage */
> int restrict_headers;
> GHashTable *headers;
> - int broken_headers;
> - int good_headers;
> - size_t header_size; /* Length of full message header in bytes. */
> -
> - /* Parsing state */
> - char *line;
> - size_t line_size;
> - header_value_closure_t value;
>
> - int parsing_started;
> - int parsing_finished;
> + GMimeStream *stream;
> + GMimeParser *parser;
> + GMimeMessage *message;
> + notmuch_bool_t parsed;
> };
>
> static int
> @@ -76,16 +65,18 @@ strcase_hash (const void *ptr)
> static int
> _notmuch_message_file_destructor (notmuch_message_file_t *message)
> {
> - if (message->line)
> - free (message->line);
> -
> - if (message->value.size)
> - free (message->value.str);
> -
> if (message->headers)
> g_hash_table_destroy (message->headers);
>
> - if (message->file)
> + if (message->message)
> + g_object_unref (message->message);
> +
> + if (message->parser)
> + g_object_unref (message->parser);
> +
> + if (message->stream)
> + g_object_unref (message->stream);
> + else if (message->file) /* GMimeStream takes over the FILE* */
> fclose (message->file);
>
> return 0;
> @@ -102,19 +93,21 @@ _notmuch_message_file_open_ctx (void *ctx, const char *filename)
> if (unlikely (message == NULL))
> return NULL;
>
> + /* only needed for error messages during parsing */
> + message->filename = talloc_strdup (message, filename);
> + if (message->filename == NULL)
> + goto FAIL;
> +
> talloc_set_destructor (message, _notmuch_message_file_destructor);
>
> message->file = fopen (filename, "r");
> if (message->file == NULL)
> goto FAIL;
>
> - message->headers = g_hash_table_new_full (strcase_hash,
> - strcase_equal,
> - free,
> - g_free);
> -
> - message->parsing_started = 0;
> - message->parsing_finished = 0;
> + message->headers = g_hash_table_new_full (strcase_hash, strcase_equal,
> + free, g_free);
> + if (message->headers == NULL)
> + goto FAIL;
>
> return message;
>
> @@ -143,7 +136,7 @@ notmuch_message_file_restrict_headersv (notmuch_message_file_t *message,
> {
> char *header;
>
> - if (message->parsing_started)
> + if (message->parsed)
> INTERNAL_ERROR ("notmuch_message_file_restrict_headers called after parsing has started");
>
> while (1) {
> @@ -167,234 +160,114 @@ notmuch_message_file_restrict_headers (notmuch_message_file_t *message, ...)
> notmuch_message_file_restrict_headersv (message, va_headers);
> }
>
> -static void
> -copy_header_unfolding (header_value_closure_t *value,
> - const char *chunk)
> +/*
> + * gmime does not provide access to all Received: headers the way we
> + * want, so we'll to use the parser header callback to gather them
> + * into a hash table.
> + */
> +static void header_cb (unused(GMimeParser *parser), const char *header,
> + const char *value, unused(gint64 offset),
> + gpointer user_data)
> {
> - char *last;
> + notmuch_message_file_t *message = (notmuch_message_file_t *) user_data;
> + char *existing = NULL;
> + notmuch_bool_t found;
>
> - if (chunk == NULL)
> + found = g_hash_table_lookup_extended (message->headers, header,
> + NULL, (gpointer *) &existing);
> + if (! found && message->restrict_headers)
> return;
>
> - while (*chunk == ' ' || *chunk == '\t')
> - chunk++;
> -
> - if (value->len + 1 + strlen (chunk) + 1 > value->size) {
> - unsigned int new_size = value->size;
> - if (value->size == 0)
> - new_size = strlen (chunk) + 1;
> - else
> - while (value->len + 1 + strlen (chunk) + 1 > new_size)
> - new_size *= 2;
> - value->str = xrealloc (value->str, new_size);
> - value->size = new_size;
> - }
> -
> - last = value->str + value->len;
> - if (value->len) {
> - *last = ' ';
> - last++;
> - value->len++;
> - }
> -
> - strcpy (last, chunk);
> - value->len += strlen (chunk);
> -
> - last = value->str + value->len - 1;
> - if (*last == '\n') {
> - *last = '\0';
> - value->len--;
> + if (existing == NULL) {
> + /* No value, add one */
> + char *decoded = g_mime_utils_header_decode_text (value);
> + g_hash_table_insert (message->headers, xstrdup (header), decoded);
> + } else if (strcasecmp (header, "received") == 0) {
> + /* Concat all of the Received: headers we encounter. */
> + char *combined, *decoded;
> + size_t combined_size;
> +
> + decoded = g_mime_utils_header_decode_text (value);
> +
> + combined_size = strlen(existing) + strlen(decoded) + 2;
> + combined = g_malloc (combined_size);
> + snprintf (combined, combined_size, "%s %s", existing, decoded);
> + g_free (decoded);
> + g_hash_table_insert (message->headers, xstrdup (header), combined);
I spent a while staring at this line trying to figure out whether
header or its duplicate was leaked. It's clear enough that this line
is correct from the documentation of g_hash_table_insert, but it might
be worth adding a comment here mentioning that g_hash_table_insert
will free this temporary copy of the key, to save future readers the
eye strain.
> }
> }
>
> -/* As a special-case, a value of NULL for header_desired will force
> - * the entire header to be parsed if it is not parsed already. This is
> - * used by the _notmuch_message_file_get_headers_end function.
> - * Another special case is the Received: header. For this header we
> - * want to concatenate all instances of the header instead of just
> - * hashing the first instance as we use this when analyzing the path
> - * the mail has taken from sender to recipient.
> - */
> -const char *
> -notmuch_message_file_get_header (notmuch_message_file_t *message,
> - const char *header_desired)
> +notmuch_status_t
> +notmuch_message_file_parse (notmuch_message_file_t *message)
> {
> - int contains;
> - char *header, *decoded_value, *header_sofar, *combined_header;
> - const char *s, *colon;
> - int match, newhdr, hdrsofar, is_received;
> static int initialized = 0;
> -
> - is_received = (strcmp(header_desired,"received") == 0);
> + char from_buf[5];
> + notmuch_bool_t is_mbox = FALSE;
> + static notmuch_bool_t mbox_warning = FALSE;
>
> if (! initialized) {
> g_mime_init (GMIME_ENABLE_RFC2047_WORKAROUNDS);
> initialized = 1;
> }
>
> - message->parsing_started = 1;
> -
> - if (header_desired == NULL)
> - contains = 0;
> - else
> - contains = g_hash_table_lookup_extended (message->headers,
> - header_desired, NULL,
> - (gpointer *) &decoded_value);
> -
> - if (contains && decoded_value)
> - return decoded_value;
> -
> - if (message->parsing_finished)
> - return "";
> -
> -#define NEXT_HEADER_LINE(closure) \
> - while (1) { \
> - ssize_t bytes_read = getline (&message->line, \
> - &message->line_size, \
> - message->file); \
> - if (bytes_read == -1) { \
> - message->parsing_finished = 1; \
> - break; \
> - } \
> - if (*message->line == '\n') { \
> - message->parsing_finished = 1; \
> - break; \
> - } \
> - if (closure && \
> - (*message->line == ' ' || *message->line == '\t')) \
> - { \
> - copy_header_unfolding ((closure), message->line); \
> - } \
> - if (*message->line == ' ' || *message->line == '\t') \
> - message->header_size += strlen (message->line); \
> - else \
> - break; \
> - }
> -
> - if (message->line == NULL)
> - NEXT_HEADER_LINE (NULL);
> -
> - while (1) {
> -
> - if (message->parsing_finished)
> - break;
> -
> - colon = strchr (message->line, ':');
> -
> - if (colon == NULL) {
> - message->broken_headers++;
> - /* A simple heuristic for giving up on things that just
> - * don't look like mail messages. */
> - if (message->broken_headers >= 10 &&
> - message->good_headers < 5)
> - {
> - message->parsing_finished = 1;
> - break;
> - }
> - NEXT_HEADER_LINE (NULL);
> - continue;
> + /* Is this mbox? */
> + if (fread (from_buf, sizeof (from_buf), 1, message->file) == 1 &&
> + strncmp (from_buf, "From ", 5) == 0)
> + is_mbox = TRUE;
> + rewind (message->file);
> +
> + message->stream = g_mime_stream_file_new (message->file);
Given that GMimeStream steals message->file, would it make sense to
set it to NULL here? That would also simplify
_notmuch_message_file_destructor a bit and move the knowledge of this
stealing behavior to where it happens.
> + message->parser = g_mime_parser_new_with_stream (message->stream);
> + g_mime_parser_set_scan_from (message->parser, is_mbox);
> + g_mime_parser_set_header_regex (message->parser, ".*", header_cb,
> + (gpointer) message);
Interesting.
> +
> + message->message = g_mime_parser_construct_message (message->parser);
> + if (! message->message)
> + return NOTMUCH_STATUS_FILE_NOT_EMAIL;
> +
> + if (is_mbox) {
> + if (! g_mime_parser_eos (message->parser)) {
> + /* This is a multi-message mbox. */
> + return NOTMUCH_STATUS_FILE_NOT_EMAIL;
> }
> -
> - message->header_size += strlen (message->line);
> -
> - message->good_headers++;
> -
> - header = xstrndup (message->line, colon - message->line);
> -
> - if (message->restrict_headers &&
> - ! g_hash_table_lookup_extended (message->headers,
> - header, NULL, NULL))
> - {
> - free (header);
> - NEXT_HEADER_LINE (NULL);
> - continue;
> - }
> -
> - s = colon + 1;
> - while (*s == ' ' || *s == '\t')
> - s++;
> -
> - message->value.len = 0;
> - copy_header_unfolding (&message->value, s);
> -
> - NEXT_HEADER_LINE (&message->value);
> -
> - if (header_desired == NULL)
> - match = 0;
> - else
> - match = (strcasecmp (header, header_desired) == 0);
> -
> - decoded_value = g_mime_utils_header_decode_text (message->value.str);
> - header_sofar = (char *)g_hash_table_lookup (message->headers, header);
> - /* we treat the Received: header special - we want to concat ALL of
> - * the Received: headers we encounter.
> - * for everything else we return the first instance of a header */
> - if (strcasecmp(header, "received") == 0) {
> - if (header_sofar == NULL) {
> - /* first Received: header we encountered; just add it */
> - g_hash_table_insert (message->headers, header, decoded_value);
> - } else {
> - /* we need to add the header to those we already collected */
> - newhdr = strlen(decoded_value);
> - hdrsofar = strlen(header_sofar);
> - combined_header = g_malloc(hdrsofar + newhdr + 2);
> - strncpy(combined_header,header_sofar,hdrsofar);
> - *(combined_header+hdrsofar) = ' ';
> - strncpy(combined_header+hdrsofar+1,decoded_value,newhdr+1);
> - g_free (decoded_value);
> - g_hash_table_insert (message->headers, header, combined_header);
> - }
> - } else {
> - if (header_sofar == NULL) {
> - /* Only insert if we don't have a value for this header, yet. */
> - g_hash_table_insert (message->headers, header, decoded_value);
> - } else {
> - free (header);
> - g_free (decoded_value);
> - decoded_value = header_sofar;
> - }
> + /*
> + * For historical reasons, we support single-message mboxes,
> + * but this behavior is likely to change in the future, so
> + * warn.
> + */
> + if (! mbox_warning) {
> + mbox_warning = TRUE;
> + fprintf (stderr, "\
> +Warning: %s is an mbox containing a single message,\n\
> +likely caused by misconfigured mail delivery. Support for single-message\n\
> +mboxes is deprecated and may be removed in the future.\n", message->filename);
> }
Why move this warning here? Currently we obviously only print this
once, at index time. Will this cause us to print the warning in more
situations?
> - /* if we found a match we can bail - unless of course we are
> - * collecting all the Received: headers */
> - if (match && !is_received)
> - return decoded_value;
> }
>
> - if (message->parsing_finished) {
> - fclose (message->file);
> - message->file = NULL;
> - }
> + message->parsed = TRUE;
>
> - if (message->line)
> - free (message->line);
> - message->line = NULL;
> + return NOTMUCH_STATUS_SUCCESS;
> +}
>
> - if (message->value.size) {
> - free (message->value.str);
> - message->value.str = NULL;
> - message->value.size = 0;
> - message->value.len = 0;
> - }
> +/* return NULL on errors, empty string for non-existing headers */
For consistency, the comment should be capitalized.
Also, this comment used to explain how Received headers were handled.
It seems like we lost that.
> +const char *
> +notmuch_message_file_get_header (notmuch_message_file_t *message,
> + const char *header)
> +{
> + const char *value = NULL;
> + notmuch_bool_t found;
>
> - /* For the Received: header we actually might end up here even
> - * though we found the header (as we force continued parsing
> - * in that case). So let's check if that's the header we were
> - * looking for and return the value that we found (if any)
> - */
> - if (is_received)
> - return (char *)g_hash_table_lookup (message->headers, "received");
> -
> - /* We've parsed all headers and never found the one we're looking
> - * for. It's probably just not there, but let's check that we
> - * didn't make a mistake preventing us from seeing it. */
> - if (message->restrict_headers && header_desired &&
> - ! g_hash_table_lookup_extended (message->headers,
> - header_desired, NULL, NULL))
> - {
> - INTERNAL_ERROR ("Attempt to get header \"%s\" which was not\n"
> - "included in call to notmuch_message_file_restrict_headers\n",
> - header_desired);
> - }
> + /* the caller shouldn't ask for headers before parsing */
> + if (! message->parsed)
> + return NULL;
Previously, calling notmuch_message_file_get_header triggered parsing
of the message, which ensured it didn't happen unless necessary. Is
there a reason to change that behavior?
If it were still done lazily, you wouldn't need the addition below to
_notmuch_message_ensure_message_file, or the new prototype in
notmuch-private.h (and notmuch_message_file_parse could be static).
(Ignore this comment if it's justified by the next patch; I haven't
gotten there yet.)
> +
> + found = g_hash_table_lookup_extended (message->headers, header,
> + NULL, (gpointer *) &value);
> +
> + /* the caller shouldn't ask for non-restricted headers */
> + if (! found && message->restrict_headers)
> + return NULL;
>
> - return "";
> + return value ? value : "";
> }
> diff --git a/lib/message.cc b/lib/message.cc
> index c91f3a5..9a22d36 100644
> --- a/lib/message.cc
> +++ b/lib/message.cc
> @@ -407,6 +407,12 @@ _notmuch_message_ensure_message_file (notmuch_message_t *message)
> return;
>
> message->message_file = _notmuch_message_file_open_ctx (message, filename);
> +
> + /* XXX: better return value handling */
> + if (notmuch_message_file_parse (message->message_file)) {
> + notmuch_message_file_close (message->message_file);
> + message->message_file = NULL;
> + }
> }
>
> const char *
> diff --git a/lib/notmuch-private.h b/lib/notmuch-private.h
> index af185c7..7277df1 100644
> --- a/lib/notmuch-private.h
> +++ b/lib/notmuch-private.h
> @@ -345,6 +345,10 @@ notmuch_message_file_open (const char *filename);
> notmuch_message_file_t *
> _notmuch_message_file_open_ctx (void *ctx, const char *filename);
>
> +/* Parse the message */
> +notmuch_status_t
> +notmuch_message_file_parse (notmuch_message_file_t *message);
> +
> /* Close a notmuch message previously opened with notmuch_message_open. */
> void
> notmuch_message_file_close (notmuch_message_file_t *message);
^ permalink raw reply [flat|nested] 14+ messages in thread
* [PATCH v3 6/6] lib: parse messages only once
2014-02-03 19:51 [PATCH v3 0/6] lib: replace the message header parser with gmime Jani Nikula
` (4 preceding siblings ...)
2014-02-03 19:51 ` [PATCH v3 5/6] lib: replace the header parser with gmime Jani Nikula
@ 2014-02-03 19:51 ` Jani Nikula
2014-02-03 21:40 ` Austin Clements
5 siblings, 1 reply; 14+ messages in thread
From: Jani Nikula @ 2014-02-03 19:51 UTC (permalink / raw)
To: notmuch
Use the previously parsed gmime message for indexing instead of
running an extra parsing pass.
After this change, we'll only do unnecessary parsing of the message
body for duplicates and non-messages. For regular non-duplicate
messages, we have now shaved off an extra header parsing round during
indexing.
---
lib/database.cc | 2 +-
lib/index.cc | 59 ++++++---------------------------------------------
lib/message-file.c | 9 ++++++++
lib/notmuch-private.h | 16 ++++++++++++--
4 files changed, 30 insertions(+), 56 deletions(-)
diff --git a/lib/database.cc b/lib/database.cc
index d1bea88..3a29fe7 100644
--- a/lib/database.cc
+++ b/lib/database.cc
@@ -2029,7 +2029,7 @@ notmuch_database_add_message (notmuch_database_t *notmuch,
date = notmuch_message_file_get_header (message_file, "date");
_notmuch_message_set_header_values (message, date, from, subject);
- ret = _notmuch_message_index_file (message, filename);
+ ret = _notmuch_message_index_file (message, message_file);
if (ret)
goto DONE;
} else {
diff --git a/lib/index.cc b/lib/index.cc
index 976e49f..71397da 100644
--- a/lib/index.cc
+++ b/lib/index.cc
@@ -425,52 +425,15 @@ _index_mime_part (notmuch_message_t *message,
notmuch_status_t
_notmuch_message_index_file (notmuch_message_t *message,
- const char *filename)
+ notmuch_message_file_t *message_file)
{
- GMimeStream *stream = NULL;
- GMimeParser *parser = NULL;
- GMimeMessage *mime_message = NULL;
+ GMimeMessage *mime_message;
InternetAddressList *addresses;
- FILE *file = NULL;
const char *from, *subject;
- notmuch_status_t ret = NOTMUCH_STATUS_SUCCESS;
- static int initialized = 0;
- char from_buf[5];
- bool is_mbox = false;
-
- if (! initialized) {
- g_mime_init (GMIME_ENABLE_RFC2047_WORKAROUNDS);
- initialized = 1;
- }
-
- file = fopen (filename, "r");
- if (! file) {
- fprintf (stderr, "Error opening %s: %s\n", filename, strerror (errno));
- ret = NOTMUCH_STATUS_FILE_ERROR;
- goto DONE;
- }
-
- /* Is this mbox? */
- if (fread (from_buf, sizeof (from_buf), 1, file) == 1 &&
- strncmp (from_buf, "From ", 5) == 0)
- is_mbox = true;
- rewind (file);
-
- /* Evil GMime steals my FILE* here so I won't fclose it. */
- stream = g_mime_stream_file_new (file);
-
- parser = g_mime_parser_new_with_stream (stream);
- g_mime_parser_set_scan_from (parser, is_mbox);
- mime_message = g_mime_parser_construct_message (parser);
-
- if (is_mbox) {
- if (!g_mime_parser_eos (parser)) {
- /* This is a multi-message mbox. */
- ret = NOTMUCH_STATUS_FILE_NOT_EMAIL;
- goto DONE;
- }
- }
+ mime_message = notmuch_message_file_get_mime_message (message_file);
+ if (! mime_message)
+ return NOTMUCH_STATUS_FILE_NOT_EMAIL; /* more like internal error */
from = g_mime_message_get_sender (mime_message);
@@ -491,15 +454,5 @@ _notmuch_message_index_file (notmuch_message_t *message,
_index_mime_part (message, g_mime_message_get_mime_part (mime_message));
- DONE:
- if (mime_message)
- g_object_unref (mime_message);
-
- if (parser)
- g_object_unref (parser);
-
- if (stream)
- g_object_unref (stream);
-
- return ret;
+ return NOTMUCH_STATUS_SUCCESS;
}
diff --git a/lib/message-file.c b/lib/message-file.c
index 33f6468..99e1dc8 100644
--- a/lib/message-file.c
+++ b/lib/message-file.c
@@ -250,6 +250,15 @@ mboxes is deprecated and may be removed in the future.\n", message->filename);
return NOTMUCH_STATUS_SUCCESS;
}
+GMimeMessage *
+notmuch_message_file_get_mime_message (notmuch_message_file_t *message)
+{
+ if (! message->parsed)
+ return NULL;
+
+ return message->message;
+}
+
/* return NULL on errors, empty string for non-existing headers */
const char *
notmuch_message_file_get_header (notmuch_message_file_t *message,
diff --git a/lib/notmuch-private.h b/lib/notmuch-private.h
index 7277df1..7559521 100644
--- a/lib/notmuch-private.h
+++ b/lib/notmuch-private.h
@@ -46,6 +46,8 @@ NOTMUCH_BEGIN_DECLS
#include <talloc.h>
+#include <gmime/gmime.h>
+
#include "xutil.h"
#include "error_util.h"
@@ -320,9 +322,11 @@ notmuch_message_get_author (notmuch_message_t *message);
/* index.cc */
+typedef struct _notmuch_message_file notmuch_message_file_t;
+
notmuch_status_t
_notmuch_message_index_file (notmuch_message_t *message,
- const char *filename);
+ notmuch_message_file_t *message_file);
/* message-file.c */
@@ -330,7 +334,6 @@ _notmuch_message_index_file (notmuch_message_t *message,
* into the public interface in notmuch.h
*/
-typedef struct _notmuch_message_file notmuch_message_file_t;
/* Open a file containing a single email message.
*
@@ -377,6 +380,15 @@ void
notmuch_message_file_restrict_headersv (notmuch_message_file_t *message,
va_list va_headers);
+/* Get the gmime message of a parsed message file.
+ *
+ * Returns NULL if the message file has not been parsed.
+ *
+ * XXX: Would be nice to not have to expose GMimeMessage here.
+ */
+GMimeMessage *
+notmuch_message_file_get_mime_message (notmuch_message_file_t *message);
+
/* Get the value of the specified header from the message as a UTF-8 string.
*
* The header name is case insensitive.
--
1.8.5.2
^ permalink raw reply related [flat|nested] 14+ messages in thread
* Re: [PATCH v3 6/6] lib: parse messages only once
2014-02-03 19:51 ` [PATCH v3 6/6] lib: parse messages only once Jani Nikula
@ 2014-02-03 21:40 ` Austin Clements
0 siblings, 0 replies; 14+ messages in thread
From: Austin Clements @ 2014-02-03 21:40 UTC (permalink / raw)
To: Jani Nikula; +Cc: notmuch
Quoth Jani Nikula on Feb 03 at 9:51 pm:
> Use the previously parsed gmime message for indexing instead of
> running an extra parsing pass.
>
> After this change, we'll only do unnecessary parsing of the message
> body for duplicates and non-messages. For regular non-duplicate
> messages, we have now shaved off an extra header parsing round during
> indexing.
> ---
> lib/database.cc | 2 +-
> lib/index.cc | 59 ++++++---------------------------------------------
> lib/message-file.c | 9 ++++++++
> lib/notmuch-private.h | 16 ++++++++++++--
> 4 files changed, 30 insertions(+), 56 deletions(-)
>
> diff --git a/lib/database.cc b/lib/database.cc
> index d1bea88..3a29fe7 100644
> --- a/lib/database.cc
> +++ b/lib/database.cc
> @@ -2029,7 +2029,7 @@ notmuch_database_add_message (notmuch_database_t *notmuch,
> date = notmuch_message_file_get_header (message_file, "date");
> _notmuch_message_set_header_values (message, date, from, subject);
>
> - ret = _notmuch_message_index_file (message, filename);
> + ret = _notmuch_message_index_file (message, message_file);
> if (ret)
> goto DONE;
> } else {
> diff --git a/lib/index.cc b/lib/index.cc
> index 976e49f..71397da 100644
> --- a/lib/index.cc
> +++ b/lib/index.cc
> @@ -425,52 +425,15 @@ _index_mime_part (notmuch_message_t *message,
>
> notmuch_status_t
> _notmuch_message_index_file (notmuch_message_t *message,
> - const char *filename)
> + notmuch_message_file_t *message_file)
> {
> - GMimeStream *stream = NULL;
> - GMimeParser *parser = NULL;
> - GMimeMessage *mime_message = NULL;
> + GMimeMessage *mime_message;
> InternetAddressList *addresses;
> - FILE *file = NULL;
> const char *from, *subject;
> - notmuch_status_t ret = NOTMUCH_STATUS_SUCCESS;
> - static int initialized = 0;
> - char from_buf[5];
> - bool is_mbox = false;
> -
> - if (! initialized) {
> - g_mime_init (GMIME_ENABLE_RFC2047_WORKAROUNDS);
> - initialized = 1;
> - }
> -
> - file = fopen (filename, "r");
> - if (! file) {
> - fprintf (stderr, "Error opening %s: %s\n", filename, strerror (errno));
> - ret = NOTMUCH_STATUS_FILE_ERROR;
> - goto DONE;
> - }
> -
> - /* Is this mbox? */
> - if (fread (from_buf, sizeof (from_buf), 1, file) == 1 &&
> - strncmp (from_buf, "From ", 5) == 0)
> - is_mbox = true;
> - rewind (file);
> -
> - /* Evil GMime steals my FILE* here so I won't fclose it. */
> - stream = g_mime_stream_file_new (file);
> -
> - parser = g_mime_parser_new_with_stream (stream);
> - g_mime_parser_set_scan_from (parser, is_mbox);
>
> - mime_message = g_mime_parser_construct_message (parser);
> -
> - if (is_mbox) {
> - if (!g_mime_parser_eos (parser)) {
> - /* This is a multi-message mbox. */
> - ret = NOTMUCH_STATUS_FILE_NOT_EMAIL;
> - goto DONE;
> - }
> - }
> + mime_message = notmuch_message_file_get_mime_message (message_file);
> + if (! mime_message)
> + return NOTMUCH_STATUS_FILE_NOT_EMAIL; /* more like internal error */
Are there situations other than forgetting to call
notmuch_message_file_parse that could cause this? (Speaking of which,
where is notmuch_message_file_parse called?)
>
> from = g_mime_message_get_sender (mime_message);
>
> @@ -491,15 +454,5 @@ _notmuch_message_index_file (notmuch_message_t *message,
>
> _index_mime_part (message, g_mime_message_get_mime_part (mime_message));
>
> - DONE:
> - if (mime_message)
> - g_object_unref (mime_message);
> -
> - if (parser)
> - g_object_unref (parser);
> -
> - if (stream)
> - g_object_unref (stream);
> -
> - return ret;
> + return NOTMUCH_STATUS_SUCCESS;
> }
> diff --git a/lib/message-file.c b/lib/message-file.c
> index 33f6468..99e1dc8 100644
> --- a/lib/message-file.c
> +++ b/lib/message-file.c
> @@ -250,6 +250,15 @@ mboxes is deprecated and may be removed in the future.\n", message->filename);
> return NOTMUCH_STATUS_SUCCESS;
> }
>
> +GMimeMessage *
> +notmuch_message_file_get_mime_message (notmuch_message_file_t *message)
> +{
> + if (! message->parsed)
> + return NULL;
This seems like another good opportunity to call the parser lazily and
hide notmuch_message_file_parse from the caller, rather than requiring
the caller to implement a particular call sequence (which I wasn't
even able to find above). This might also clean up the error handling
in the call to notmuch_message_file_get_mime_message above.
> +
> + return message->message;
> +}
> +
> /* return NULL on errors, empty string for non-existing headers */
> const char *
> notmuch_message_file_get_header (notmuch_message_file_t *message,
> diff --git a/lib/notmuch-private.h b/lib/notmuch-private.h
> index 7277df1..7559521 100644
> --- a/lib/notmuch-private.h
> +++ b/lib/notmuch-private.h
> @@ -46,6 +46,8 @@ NOTMUCH_BEGIN_DECLS
>
> #include <talloc.h>
>
> +#include <gmime/gmime.h>
> +
> #include "xutil.h"
> #include "error_util.h"
>
> @@ -320,9 +322,11 @@ notmuch_message_get_author (notmuch_message_t *message);
>
> /* index.cc */
>
> +typedef struct _notmuch_message_file notmuch_message_file_t;
> +
> notmuch_status_t
> _notmuch_message_index_file (notmuch_message_t *message,
> - const char *filename);
> + notmuch_message_file_t *message_file);
>
> /* message-file.c */
>
> @@ -330,7 +334,6 @@ _notmuch_message_index_file (notmuch_message_t *message,
> * into the public interface in notmuch.h
> */
>
> -typedef struct _notmuch_message_file notmuch_message_file_t;
>
> /* Open a file containing a single email message.
> *
> @@ -377,6 +380,15 @@ void
> notmuch_message_file_restrict_headersv (notmuch_message_file_t *message,
> va_list va_headers);
>
> +/* Get the gmime message of a parsed message file.
> + *
> + * Returns NULL if the message file has not been parsed.
> + *
> + * XXX: Would be nice to not have to expose GMimeMessage here.
Maybe just forward-declare struct GMimeMessage? Then you also
wouldn't need to add the gmime #include.
> + */
> +GMimeMessage *
> +notmuch_message_file_get_mime_message (notmuch_message_file_t *message);
> +
> /* Get the value of the specified header from the message as a UTF-8 string.
> *
> * The header name is case insensitive.
^ permalink raw reply [flat|nested] 14+ messages in thread