* [PATCH v4 1/5] util: Factor out boolean term quoting routine
2012-12-31 6:42 [PATCH v4 0/5] Use Xapian query syntax for batch-tag dump/restore Austin Clements
@ 2012-12-31 6:42 ` Austin Clements
2013-01-03 16:48 ` Jani Nikula
2012-12-31 6:42 ` [PATCH v4 2/5] util: Function to parse boolean term queries Austin Clements
` (4 subsequent siblings)
5 siblings, 1 reply; 14+ messages in thread
From: Austin Clements @ 2012-12-31 6:42 UTC (permalink / raw)
To: notmuch; +Cc: tomi.ollila
From: Austin Clements <amdragon@MIT.EDU>
This is now a generic boolean term quoting function. It performs
minimal quoting to produce user-friendly queries.
This could live in tag-util as well, but it is really nothing specific
to tags (although the conventions are specific to Xapian).
The API is changed from "caller-allocates" to "readline-like". The
scan for max tag length is pushed down into the quoting routine.
Furthermore, this now combines the term prefix with the quoted term;
arguably this is just as easy to do in the caller, but this will
nicely parallel the boolean term parsing function to be introduced
shortly.
This is an amalgamation of code written by David Bremner and myself.
---
notmuch-tag.c | 48 ++++++++++++---------------------------
util/string-util.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++
util/string-util.h | 14 ++++++++++++
3 files changed, 92 insertions(+), 34 deletions(-)
diff --git a/notmuch-tag.c b/notmuch-tag.c
index 88d559b..fc9d43a 100644
--- a/notmuch-tag.c
+++ b/notmuch-tag.c
@@ -19,6 +19,7 @@
*/
#include "notmuch-client.h"
+#include "string-util.h"
static volatile sig_atomic_t interrupted;
@@ -35,25 +36,6 @@ handle_sigint (unused (int sig))
interrupted = 1;
}
-static char *
-_escape_tag (char *buf, const char *tag)
-{
- const char *in = tag;
- char *out = buf;
-
- /* Boolean terms surrounded by double quotes can contain any
- * character. Double quotes are quoted by doubling them. */
- *out++ = '"';
- while (*in) {
- if (*in == '"')
- *out++ = '"';
- *out++ = *in++;
- }
- *out++ = '"';
- *out = 0;
- return buf;
-}
-
typedef struct {
const char *tag;
notmuch_bool_t remove;
@@ -71,25 +53,16 @@ _optimize_tag_query (void *ctx, const char *orig_query_string,
* parenthesize and the exclusion part of the query must not use
* the '-' operator (though the NOT operator is fine). */
- char *escaped, *query_string;
+ char *escaped = NULL;
+ size_t escaped_len = 0;
+ char *query_string;
const char *join = "";
- int i;
- unsigned int max_tag_len = 0;
+ size_t i;
/* Don't optimize if there are no tag changes. */
if (tag_ops[0].tag == NULL)
return talloc_strdup (ctx, orig_query_string);
- /* Allocate a buffer for escaping tags. This is large enough to
- * hold a fully escaped tag with every character doubled plus
- * enclosing quotes and a NUL. */
- for (i = 0; tag_ops[i].tag; i++)
- if (strlen (tag_ops[i].tag) > max_tag_len)
- max_tag_len = strlen (tag_ops[i].tag);
- escaped = talloc_array (ctx, char, max_tag_len * 2 + 3);
- if (! escaped)
- return NULL;
-
/* Build the new query string */
if (strcmp (orig_query_string, "*") == 0)
query_string = talloc_strdup (ctx, "(");
@@ -97,10 +70,17 @@ _optimize_tag_query (void *ctx, const char *orig_query_string,
query_string = talloc_asprintf (ctx, "( %s ) and (", orig_query_string);
for (i = 0; tag_ops[i].tag && query_string; i++) {
+ /* XXX in case of OOM, query_string will be deallocated when
+ * ctx is, which might be at shutdown */
+ if (make_boolean_term (ctx,
+ "tag", tag_ops[i].tag,
+ &escaped, &escaped_len))
+ return NULL;
+
query_string = talloc_asprintf_append_buffer (
- query_string, "%s%stag:%s", join,
+ query_string, "%s%s%s", join,
tag_ops[i].remove ? "" : "not ",
- _escape_tag (escaped, tag_ops[i].tag));
+ escaped);
join = " or ";
}
diff --git a/util/string-util.c b/util/string-util.c
index 44f8cd3..e4bea21 100644
--- a/util/string-util.c
+++ b/util/string-util.c
@@ -20,6 +20,7 @@
#include "string-util.h"
+#include "talloc.h"
char *
strtok_len (char *s, const char *delim, size_t *len)
@@ -32,3 +33,66 @@ strtok_len (char *s, const char *delim, size_t *len)
return *len ? s : NULL;
}
+
+int
+make_boolean_term (void *ctx, const char *prefix, const char *term,
+ char **buf, size_t *len)
+{
+ const char *in;
+ char *out;
+ size_t needed = 3;
+ int need_quoting = 0;
+
+ /* Do we need quoting? To be paranoid, we quote anything
+ * containing a quote, even though it only matters at the
+ * beginning, and anything containing non-ASCII text. */
+ for (in = term; *in && !need_quoting; in++)
+ if (*in <= ' ' || *in == ')' || *in == '"' || (unsigned char)*in > 127)
+ need_quoting = 1;
+
+ if (need_quoting)
+ for (in = term; *in; in++)
+ needed += (*in == '"') ? 2 : 1;
+ else
+ needed = strlen (term) + 1;
+
+ /* Reserve space for the prefix */
+ if (prefix)
+ needed += strlen (prefix) + 1;
+
+ if ((*buf == NULL) || (needed > *len)) {
+ *len = 2 * needed;
+ *buf = talloc_realloc (ctx, *buf, char, *len);
+ }
+
+ if (! *buf)
+ return 1;
+
+ out = *buf;
+
+ /* Copy in the prefix */
+ if (prefix) {
+ strcpy (out, prefix);
+ out += strlen (prefix);
+ *out++ = ':';
+ }
+
+ if (! need_quoting) {
+ strcpy (out, term);
+ return 0;
+ }
+
+ /* Quote term by enclosing it in double quotes and doubling any
+ * internal double quotes. */
+ *out++ = '"';
+ in = term;
+ while (*in) {
+ if (*in == '"')
+ *out++ = '"';
+ *out++ = *in++;
+ }
+ *out++ = '"';
+ *out = '\0';
+
+ return 0;
+}
diff --git a/util/string-util.h b/util/string-util.h
index ac7676c..b8844a3 100644
--- a/util/string-util.h
+++ b/util/string-util.h
@@ -19,4 +19,18 @@
char *strtok_len (char *s, const char *delim, size_t *len);
+/* Construct a boolean term query with the specified prefix (e.g.,
+ * "id") and search term, quoting term as necessary. Specifically, if
+ * term contains any non-printable ASCII characters, non-ASCII
+ * characters, close parenthesis or double quotes, it will be enclosed
+ * in double quotes and any internal double quotes will be doubled
+ * (e.g. a"b -> "a""b"). The result will be a valid notmuch query and
+ * can be parsed by parse_boolean_term.
+ *
+ * Output is into buf; it may be talloc_realloced.
+ * Return: 0 on success, non-zero on memory allocation failure.
+ */
+int make_boolean_term (void *talloc_ctx, const char *prefix, const char *term,
+ char **buf, size_t *len);
+
#endif
--
1.7.10.4
^ permalink raw reply related [flat|nested] 14+ messages in thread
* Re: [PATCH v4 1/5] util: Factor out boolean term quoting routine
2012-12-31 6:42 ` [PATCH v4 1/5] util: Factor out boolean term quoting routine Austin Clements
@ 2013-01-03 16:48 ` Jani Nikula
2013-01-04 7:26 ` Austin Clements
0 siblings, 1 reply; 14+ messages in thread
From: Jani Nikula @ 2013-01-03 16:48 UTC (permalink / raw)
To: Austin Clements, notmuch; +Cc: tomi.ollila
On Mon, 31 Dec 2012, Austin Clements <amdragon@MIT.EDU> wrote:
> From: Austin Clements <amdragon@MIT.EDU>
>
> This is now a generic boolean term quoting function. It performs
> minimal quoting to produce user-friendly queries.
>
> This could live in tag-util as well, but it is really nothing specific
> to tags (although the conventions are specific to Xapian).
>
> The API is changed from "caller-allocates" to "readline-like". The
> scan for max tag length is pushed down into the quoting routine.
> Furthermore, this now combines the term prefix with the quoted term;
> arguably this is just as easy to do in the caller, but this will
> nicely parallel the boolean term parsing function to be introduced
> shortly.
>
> This is an amalgamation of code written by David Bremner and myself.
> ---
> notmuch-tag.c | 48 ++++++++++++---------------------------
> util/string-util.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++
> util/string-util.h | 14 ++++++++++++
> 3 files changed, 92 insertions(+), 34 deletions(-)
>
> diff --git a/notmuch-tag.c b/notmuch-tag.c
> index 88d559b..fc9d43a 100644
> --- a/notmuch-tag.c
> +++ b/notmuch-tag.c
> @@ -19,6 +19,7 @@
> */
>
> #include "notmuch-client.h"
> +#include "string-util.h"
>
> static volatile sig_atomic_t interrupted;
>
> @@ -35,25 +36,6 @@ handle_sigint (unused (int sig))
> interrupted = 1;
> }
>
> -static char *
> -_escape_tag (char *buf, const char *tag)
> -{
> - const char *in = tag;
> - char *out = buf;
> -
> - /* Boolean terms surrounded by double quotes can contain any
> - * character. Double quotes are quoted by doubling them. */
> - *out++ = '"';
> - while (*in) {
> - if (*in == '"')
> - *out++ = '"';
> - *out++ = *in++;
> - }
> - *out++ = '"';
> - *out = 0;
> - return buf;
> -}
> -
> typedef struct {
> const char *tag;
> notmuch_bool_t remove;
> @@ -71,25 +53,16 @@ _optimize_tag_query (void *ctx, const char *orig_query_string,
> * parenthesize and the exclusion part of the query must not use
> * the '-' operator (though the NOT operator is fine). */
>
> - char *escaped, *query_string;
> + char *escaped = NULL;
> + size_t escaped_len = 0;
> + char *query_string;
> const char *join = "";
> - int i;
> - unsigned int max_tag_len = 0;
> + size_t i;
>
> /* Don't optimize if there are no tag changes. */
> if (tag_ops[0].tag == NULL)
> return talloc_strdup (ctx, orig_query_string);
>
> - /* Allocate a buffer for escaping tags. This is large enough to
> - * hold a fully escaped tag with every character doubled plus
> - * enclosing quotes and a NUL. */
> - for (i = 0; tag_ops[i].tag; i++)
> - if (strlen (tag_ops[i].tag) > max_tag_len)
> - max_tag_len = strlen (tag_ops[i].tag);
> - escaped = talloc_array (ctx, char, max_tag_len * 2 + 3);
> - if (! escaped)
> - return NULL;
> -
> /* Build the new query string */
> if (strcmp (orig_query_string, "*") == 0)
> query_string = talloc_strdup (ctx, "(");
> @@ -97,10 +70,17 @@ _optimize_tag_query (void *ctx, const char *orig_query_string,
> query_string = talloc_asprintf (ctx, "( %s ) and (", orig_query_string);
>
> for (i = 0; tag_ops[i].tag && query_string; i++) {
> + /* XXX in case of OOM, query_string will be deallocated when
> + * ctx is, which might be at shutdown */
> + if (make_boolean_term (ctx,
> + "tag", tag_ops[i].tag,
> + &escaped, &escaped_len))
> + return NULL;
> +
> query_string = talloc_asprintf_append_buffer (
> - query_string, "%s%stag:%s", join,
> + query_string, "%s%s%s", join,
> tag_ops[i].remove ? "" : "not ",
> - _escape_tag (escaped, tag_ops[i].tag));
> + escaped);
> join = " or ";
> }
>
> diff --git a/util/string-util.c b/util/string-util.c
> index 44f8cd3..e4bea21 100644
> --- a/util/string-util.c
> +++ b/util/string-util.c
> @@ -20,6 +20,7 @@
>
>
> #include "string-util.h"
> +#include "talloc.h"
>
> char *
> strtok_len (char *s, const char *delim, size_t *len)
> @@ -32,3 +33,66 @@ strtok_len (char *s, const char *delim, size_t *len)
>
> return *len ? s : NULL;
> }
> +
> +int
> +make_boolean_term (void *ctx, const char *prefix, const char *term,
> + char **buf, size_t *len)
> +{
> + const char *in;
> + char *out;
> + size_t needed = 3;
> + int need_quoting = 0;
> +
> + /* Do we need quoting? To be paranoid, we quote anything
> + * containing a quote, even though it only matters at the
> + * beginning, and anything containing non-ASCII text. */
> + for (in = term; *in && !need_quoting; in++)
> + if (*in <= ' ' || *in == ')' || *in == '"' || (unsigned char)*in > 127)
Should that be *in >= 127?
Otherwise LGTM.
Jani.
> + need_quoting = 1;
> +
> + if (need_quoting)
> + for (in = term; *in; in++)
> + needed += (*in == '"') ? 2 : 1;
> + else
> + needed = strlen (term) + 1;
> +
> + /* Reserve space for the prefix */
> + if (prefix)
> + needed += strlen (prefix) + 1;
> +
> + if ((*buf == NULL) || (needed > *len)) {
> + *len = 2 * needed;
> + *buf = talloc_realloc (ctx, *buf, char, *len);
> + }
> +
> + if (! *buf)
> + return 1;
> +
> + out = *buf;
> +
> + /* Copy in the prefix */
> + if (prefix) {
> + strcpy (out, prefix);
> + out += strlen (prefix);
> + *out++ = ':';
> + }
> +
> + if (! need_quoting) {
> + strcpy (out, term);
> + return 0;
> + }
> +
> + /* Quote term by enclosing it in double quotes and doubling any
> + * internal double quotes. */
> + *out++ = '"';
> + in = term;
> + while (*in) {
> + if (*in == '"')
> + *out++ = '"';
> + *out++ = *in++;
> + }
> + *out++ = '"';
> + *out = '\0';
> +
> + return 0;
> +}
> diff --git a/util/string-util.h b/util/string-util.h
> index ac7676c..b8844a3 100644
> --- a/util/string-util.h
> +++ b/util/string-util.h
> @@ -19,4 +19,18 @@
>
> char *strtok_len (char *s, const char *delim, size_t *len);
>
> +/* Construct a boolean term query with the specified prefix (e.g.,
> + * "id") and search term, quoting term as necessary. Specifically, if
> + * term contains any non-printable ASCII characters, non-ASCII
> + * characters, close parenthesis or double quotes, it will be enclosed
> + * in double quotes and any internal double quotes will be doubled
> + * (e.g. a"b -> "a""b"). The result will be a valid notmuch query and
> + * can be parsed by parse_boolean_term.
> + *
> + * Output is into buf; it may be talloc_realloced.
> + * Return: 0 on success, non-zero on memory allocation failure.
> + */
> +int make_boolean_term (void *talloc_ctx, const char *prefix, const char *term,
> + char **buf, size_t *len);
> +
> #endif
> --
> 1.7.10.4
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH v4 1/5] util: Factor out boolean term quoting routine
2013-01-03 16:48 ` Jani Nikula
@ 2013-01-04 7:26 ` Austin Clements
0 siblings, 0 replies; 14+ messages in thread
From: Austin Clements @ 2013-01-04 7:26 UTC (permalink / raw)
To: Jani Nikula; +Cc: tomi.ollila, notmuch
Quoth Jani Nikula on Jan 03 at 5:48 pm:
> On Mon, 31 Dec 2012, Austin Clements <amdragon@MIT.EDU> wrote:
> > From: Austin Clements <amdragon@MIT.EDU>
> >
> > This is now a generic boolean term quoting function. It performs
> > minimal quoting to produce user-friendly queries.
> >
> > This could live in tag-util as well, but it is really nothing specific
> > to tags (although the conventions are specific to Xapian).
> >
> > The API is changed from "caller-allocates" to "readline-like". The
> > scan for max tag length is pushed down into the quoting routine.
> > Furthermore, this now combines the term prefix with the quoted term;
> > arguably this is just as easy to do in the caller, but this will
> > nicely parallel the boolean term parsing function to be introduced
> > shortly.
> >
> > This is an amalgamation of code written by David Bremner and myself.
> > ---
> > notmuch-tag.c | 48 ++++++++++++---------------------------
> > util/string-util.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++
> > util/string-util.h | 14 ++++++++++++
> > 3 files changed, 92 insertions(+), 34 deletions(-)
> >
> > diff --git a/notmuch-tag.c b/notmuch-tag.c
> > index 88d559b..fc9d43a 100644
> > --- a/notmuch-tag.c
> > +++ b/notmuch-tag.c
> > @@ -19,6 +19,7 @@
> > */
> >
> > #include "notmuch-client.h"
> > +#include "string-util.h"
> >
> > static volatile sig_atomic_t interrupted;
> >
> > @@ -35,25 +36,6 @@ handle_sigint (unused (int sig))
> > interrupted = 1;
> > }
> >
> > -static char *
> > -_escape_tag (char *buf, const char *tag)
> > -{
> > - const char *in = tag;
> > - char *out = buf;
> > -
> > - /* Boolean terms surrounded by double quotes can contain any
> > - * character. Double quotes are quoted by doubling them. */
> > - *out++ = '"';
> > - while (*in) {
> > - if (*in == '"')
> > - *out++ = '"';
> > - *out++ = *in++;
> > - }
> > - *out++ = '"';
> > - *out = 0;
> > - return buf;
> > -}
> > -
> > typedef struct {
> > const char *tag;
> > notmuch_bool_t remove;
> > @@ -71,25 +53,16 @@ _optimize_tag_query (void *ctx, const char *orig_query_string,
> > * parenthesize and the exclusion part of the query must not use
> > * the '-' operator (though the NOT operator is fine). */
> >
> > - char *escaped, *query_string;
> > + char *escaped = NULL;
> > + size_t escaped_len = 0;
> > + char *query_string;
> > const char *join = "";
> > - int i;
> > - unsigned int max_tag_len = 0;
> > + size_t i;
> >
> > /* Don't optimize if there are no tag changes. */
> > if (tag_ops[0].tag == NULL)
> > return talloc_strdup (ctx, orig_query_string);
> >
> > - /* Allocate a buffer for escaping tags. This is large enough to
> > - * hold a fully escaped tag with every character doubled plus
> > - * enclosing quotes and a NUL. */
> > - for (i = 0; tag_ops[i].tag; i++)
> > - if (strlen (tag_ops[i].tag) > max_tag_len)
> > - max_tag_len = strlen (tag_ops[i].tag);
> > - escaped = talloc_array (ctx, char, max_tag_len * 2 + 3);
> > - if (! escaped)
> > - return NULL;
> > -
> > /* Build the new query string */
> > if (strcmp (orig_query_string, "*") == 0)
> > query_string = talloc_strdup (ctx, "(");
> > @@ -97,10 +70,17 @@ _optimize_tag_query (void *ctx, const char *orig_query_string,
> > query_string = talloc_asprintf (ctx, "( %s ) and (", orig_query_string);
> >
> > for (i = 0; tag_ops[i].tag && query_string; i++) {
> > + /* XXX in case of OOM, query_string will be deallocated when
> > + * ctx is, which might be at shutdown */
> > + if (make_boolean_term (ctx,
> > + "tag", tag_ops[i].tag,
> > + &escaped, &escaped_len))
> > + return NULL;
> > +
> > query_string = talloc_asprintf_append_buffer (
> > - query_string, "%s%stag:%s", join,
> > + query_string, "%s%s%s", join,
> > tag_ops[i].remove ? "" : "not ",
> > - _escape_tag (escaped, tag_ops[i].tag));
> > + escaped);
> > join = " or ";
> > }
> >
> > diff --git a/util/string-util.c b/util/string-util.c
> > index 44f8cd3..e4bea21 100644
> > --- a/util/string-util.c
> > +++ b/util/string-util.c
> > @@ -20,6 +20,7 @@
> >
> >
> > #include "string-util.h"
> > +#include "talloc.h"
> >
> > char *
> > strtok_len (char *s, const char *delim, size_t *len)
> > @@ -32,3 +33,66 @@ strtok_len (char *s, const char *delim, size_t *len)
> >
> > return *len ? s : NULL;
> > }
> > +
> > +int
> > +make_boolean_term (void *ctx, const char *prefix, const char *term,
> > + char **buf, size_t *len)
> > +{
> > + const char *in;
> > + char *out;
> > + size_t needed = 3;
> > + int need_quoting = 0;
> > +
> > + /* Do we need quoting? To be paranoid, we quote anything
> > + * containing a quote, even though it only matters at the
> > + * beginning, and anything containing non-ASCII text. */
> > + for (in = term; *in && !need_quoting; in++)
> > + if (*in <= ' ' || *in == ')' || *in == '"' || (unsigned char)*in > 127)
>
> Should that be *in >= 127?
Nope. Character 127 is fine (and ASCII). Technically the only
non-ASCII characters that require quoting are 0x201c and 0x201d, but
rather than decoding UTF-8 to find those characters, it's much easier
to just quote if there are any non-ASCII UTF-8 bytes. (Extra
technically, we would be in real trouble if a tag contained 8-bit
bytes but wasn't valid UTF-8; however, I think this would be the least
of our worries.)
> Otherwise LGTM.
>
> Jani.
>
> > + need_quoting = 1;
> > +
> > + if (need_quoting)
> > + for (in = term; *in; in++)
> > + needed += (*in == '"') ? 2 : 1;
> > + else
> > + needed = strlen (term) + 1;
> > +
> > + /* Reserve space for the prefix */
> > + if (prefix)
> > + needed += strlen (prefix) + 1;
> > +
> > + if ((*buf == NULL) || (needed > *len)) {
> > + *len = 2 * needed;
> > + *buf = talloc_realloc (ctx, *buf, char, *len);
> > + }
> > +
> > + if (! *buf)
> > + return 1;
> > +
> > + out = *buf;
> > +
> > + /* Copy in the prefix */
> > + if (prefix) {
> > + strcpy (out, prefix);
> > + out += strlen (prefix);
> > + *out++ = ':';
> > + }
> > +
> > + if (! need_quoting) {
> > + strcpy (out, term);
> > + return 0;
> > + }
> > +
> > + /* Quote term by enclosing it in double quotes and doubling any
> > + * internal double quotes. */
> > + *out++ = '"';
> > + in = term;
> > + while (*in) {
> > + if (*in == '"')
> > + *out++ = '"';
> > + *out++ = *in++;
> > + }
> > + *out++ = '"';
> > + *out = '\0';
> > +
> > + return 0;
> > +}
> > diff --git a/util/string-util.h b/util/string-util.h
> > index ac7676c..b8844a3 100644
> > --- a/util/string-util.h
> > +++ b/util/string-util.h
> > @@ -19,4 +19,18 @@
> >
> > char *strtok_len (char *s, const char *delim, size_t *len);
> >
> > +/* Construct a boolean term query with the specified prefix (e.g.,
> > + * "id") and search term, quoting term as necessary. Specifically, if
> > + * term contains any non-printable ASCII characters, non-ASCII
> > + * characters, close parenthesis or double quotes, it will be enclosed
> > + * in double quotes and any internal double quotes will be doubled
> > + * (e.g. a"b -> "a""b"). The result will be a valid notmuch query and
> > + * can be parsed by parse_boolean_term.
> > + *
> > + * Output is into buf; it may be talloc_realloced.
> > + * Return: 0 on success, non-zero on memory allocation failure.
> > + */
> > +int make_boolean_term (void *talloc_ctx, const char *prefix, const char *term,
> > + char **buf, size_t *len);
> > +
> > #endif
--
Austin Clements MIT/'06/PhD/CSAIL
amdragon@mit.edu http://web.mit.edu/amdragon
Somewhere in the dream we call reality you will find me,
searching for the reality we call dreams.
^ permalink raw reply [flat|nested] 14+ messages in thread
* [PATCH v4 2/5] util: Function to parse boolean term queries
2012-12-31 6:42 [PATCH v4 0/5] Use Xapian query syntax for batch-tag dump/restore Austin Clements
2012-12-31 6:42 ` [PATCH v4 1/5] util: Factor out boolean term quoting routine Austin Clements
@ 2012-12-31 6:42 ` Austin Clements
2012-12-31 12:01 ` Mark Walters
2013-01-03 17:09 ` Jani Nikula
2012-12-31 6:42 ` [PATCH v4 3/5] dump: Disallow \n in message IDs Austin Clements
` (3 subsequent siblings)
5 siblings, 2 replies; 14+ messages in thread
From: Austin Clements @ 2012-12-31 6:42 UTC (permalink / raw)
To: notmuch; +Cc: tomi.ollila
This parses the subset of Xapian's boolean term quoting rules that are
used by make_boolean_term. This is provided as a generic string
utility, but will be used shortly in notmuch restore to parse and
optimize for ID queries.
---
util/string-util.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++++
util/string-util.h | 15 ++++++++++++
2 files changed, 82 insertions(+)
diff --git a/util/string-util.c b/util/string-util.c
index e4bea21..52c7781 100644
--- a/util/string-util.c
+++ b/util/string-util.c
@@ -22,6 +22,8 @@
#include "string-util.h"
#include "talloc.h"
+#include <ctype.h>
+
char *
strtok_len (char *s, const char *delim, size_t *len)
{
@@ -96,3 +98,68 @@ make_boolean_term (void *ctx, const char *prefix, const char *term,
return 0;
}
+
+static const char*
+skip_space (const char *str)
+{
+ while (*str && isspace (*str))
+ ++str;
+ return str;
+}
+
+int
+parse_boolean_term (void *ctx, const char *str,
+ char **prefix_out, char **term_out)
+{
+ *prefix_out = *term_out = NULL;
+
+ /* Parse prefix */
+ str = skip_space (str);
+ const char *pos = strchr (str, ':');
+ if (! pos)
+ goto FAIL;
+ *prefix_out = talloc_strndup (ctx, str, pos - str);
+ ++pos;
+
+ /* Implement de-quoting compatible with make_boolean_term. */
+ if (*pos == '"') {
+ char *out = talloc_array (ctx, char, strlen (pos));
+ int closed = 0;
+ *term_out = out;
+ /* Skip the opening quote, find the closing quote, and
+ * un-double doubled internal quotes. */
+ for (++pos; *pos; ) {
+ if (*pos == '"') {
+ ++pos;
+ if (*pos != '"') {
+ /* Found the closing quote. */
+ closed = 1;
+ pos = skip_space (pos);
+ break;
+ }
+ }
+ *out++ = *pos++;
+ }
+ /* Did the term terminate without a closing quote or is there
+ * trailing text after the closing quote? */
+ if (!closed || *pos)
+ goto FAIL;
+ *out = '\0';
+ } else {
+ const char *start = pos;
+ /* Check for text after the boolean term. */
+ while (*pos > ' ' && *pos != ')')
+ ++pos;
+ if (*skip_space (pos))
+ goto FAIL;
+ /* No trailing text; dup the string so the caller can free
+ * it. */
+ *term_out = talloc_strndup (ctx, start, pos - start);
+ }
+ return 0;
+
+ FAIL:
+ talloc_free (*prefix_out);
+ talloc_free (*term_out);
+ return 1;
+}
diff --git a/util/string-util.h b/util/string-util.h
index b8844a3..8b9fe50 100644
--- a/util/string-util.h
+++ b/util/string-util.h
@@ -33,4 +33,19 @@ char *strtok_len (char *s, const char *delim, size_t *len);
int make_boolean_term (void *talloc_ctx, const char *prefix, const char *term,
char **buf, size_t *len);
+/* Parse a boolean term query consisting of a prefix, a colon, and a
+ * term that may be quoted as described for make_boolean_term. If the
+ * term is not quoted, then it ends at the first whitespace or close
+ * parenthesis. str may containing leading or trailing whitespace,
+ * but anything else is considered a parse error. This is compatible
+ * with anything produced by make_boolean_term, and supports a subset
+ * of the quoting styles supported by Xapian (and hence notmuch).
+ * *prefix_out and *term_out will be talloc'd with context ctx.
+ *
+ * Return: 0 on success, non-zero on parse error.
+ */
+int
+parse_boolean_term (void *ctx, const char *str,
+ char **prefix_out, char **term_out);
+
#endif
--
1.7.10.4
^ permalink raw reply related [flat|nested] 14+ messages in thread
* Re: [PATCH v4 2/5] util: Function to parse boolean term queries
2012-12-31 6:42 ` [PATCH v4 2/5] util: Function to parse boolean term queries Austin Clements
@ 2012-12-31 12:01 ` Mark Walters
2013-01-03 17:09 ` Jani Nikula
1 sibling, 0 replies; 14+ messages in thread
From: Mark Walters @ 2012-12-31 12:01 UTC (permalink / raw)
To: Austin Clements, notmuch; +Cc: tomi.ollila
On Mon, 31 Dec 2012, Austin Clements <amdragon@MIT.EDU> wrote:
> This parses the subset of Xapian's boolean term quoting rules that are
> used by make_boolean_term. This is provided as a generic string
> utility, but will be used shortly in notmuch restore to parse and
> optimize for ID queries.
This looks good to me with one concern. Do you need to check that the
three talloc allocations in parse_boolean_term succeed? I think they
could fail in an OOM type situation which I think would cause a seg fault.
I guess failures in restore are much less important than in dump, and I
don't know how careful notmuch is in general.
Best wishes
Mark
> util/string-util.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++++
> util/string-util.h | 15 ++++++++++++
> 2 files changed, 82 insertions(+)
>
> diff --git a/util/string-util.c b/util/string-util.c
> index e4bea21..52c7781 100644
> --- a/util/string-util.c
> +++ b/util/string-util.c
> @@ -22,6 +22,8 @@
> #include "string-util.h"
> #include "talloc.h"
>
> +#include <ctype.h>
> +
> char *
> strtok_len (char *s, const char *delim, size_t *len)
> {
> @@ -96,3 +98,68 @@ make_boolean_term (void *ctx, const char *prefix, const char *term,
>
> return 0;
> }
> +
> +static const char*
> +skip_space (const char *str)
> +{
> + while (*str && isspace (*str))
> + ++str;
> + return str;
> +}
> +
> +int
> +parse_boolean_term (void *ctx, const char *str,
> + char **prefix_out, char **term_out)
> +{
> + *prefix_out = *term_out = NULL;
> +
> + /* Parse prefix */
> + str = skip_space (str);
> + const char *pos = strchr (str, ':');
> + if (! pos)
> + goto FAIL;
> + *prefix_out = talloc_strndup (ctx, str, pos - str);
> + ++pos;
> +
> + /* Implement de-quoting compatible with make_boolean_term. */
> + if (*pos == '"') {
> + char *out = talloc_array (ctx, char, strlen (pos));
> + int closed = 0;
> + *term_out = out;
> + /* Skip the opening quote, find the closing quote, and
> + * un-double doubled internal quotes. */
> + for (++pos; *pos; ) {
> + if (*pos == '"') {
> + ++pos;
> + if (*pos != '"') {
> + /* Found the closing quote. */
> + closed = 1;
> + pos = skip_space (pos);
> + break;
> + }
> + }
> + *out++ = *pos++;
> + }
> + /* Did the term terminate without a closing quote or is there
> + * trailing text after the closing quote? */
> + if (!closed || *pos)
> + goto FAIL;
> + *out = '\0';
> + } else {
> + const char *start = pos;
> + /* Check for text after the boolean term. */
> + while (*pos > ' ' && *pos != ')')
> + ++pos;
> + if (*skip_space (pos))
> + goto FAIL;
> + /* No trailing text; dup the string so the caller can free
> + * it. */
> + *term_out = talloc_strndup (ctx, start, pos - start);
> + }
> + return 0;
> +
> + FAIL:
> + talloc_free (*prefix_out);
> + talloc_free (*term_out);
> + return 1;
> +}
> diff --git a/util/string-util.h b/util/string-util.h
> index b8844a3..8b9fe50 100644
> --- a/util/string-util.h
> +++ b/util/string-util.h
> @@ -33,4 +33,19 @@ char *strtok_len (char *s, const char *delim, size_t *len);
> int make_boolean_term (void *talloc_ctx, const char *prefix, const char *term,
> char **buf, size_t *len);
>
> +/* Parse a boolean term query consisting of a prefix, a colon, and a
> + * term that may be quoted as described for make_boolean_term. If the
> + * term is not quoted, then it ends at the first whitespace or close
> + * parenthesis. str may containing leading or trailing whitespace,
> + * but anything else is considered a parse error. This is compatible
> + * with anything produced by make_boolean_term, and supports a subset
> + * of the quoting styles supported by Xapian (and hence notmuch).
> + * *prefix_out and *term_out will be talloc'd with context ctx.
> + *
> + * Return: 0 on success, non-zero on parse error.
> + */
> +int
> +parse_boolean_term (void *ctx, const char *str,
> + char **prefix_out, char **term_out);
> +
> #endif
> --
> 1.7.10.4
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH v4 2/5] util: Function to parse boolean term queries
2012-12-31 6:42 ` [PATCH v4 2/5] util: Function to parse boolean term queries Austin Clements
2012-12-31 12:01 ` Mark Walters
@ 2013-01-03 17:09 ` Jani Nikula
1 sibling, 0 replies; 14+ messages in thread
From: Jani Nikula @ 2013-01-03 17:09 UTC (permalink / raw)
To: Austin Clements, notmuch; +Cc: tomi.ollila
On Mon, 31 Dec 2012, Austin Clements <amdragon@MIT.EDU> wrote:
> This parses the subset of Xapian's boolean term quoting rules that are
> used by make_boolean_term. This is provided as a generic string
> utility, but will be used shortly in notmuch restore to parse and
> optimize for ID queries.
> ---
> util/string-util.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++++
> util/string-util.h | 15 ++++++++++++
> 2 files changed, 82 insertions(+)
>
> diff --git a/util/string-util.c b/util/string-util.c
> index e4bea21..52c7781 100644
> --- a/util/string-util.c
> +++ b/util/string-util.c
> @@ -22,6 +22,8 @@
> #include "string-util.h"
> #include "talloc.h"
>
> +#include <ctype.h>
> +
> char *
> strtok_len (char *s, const char *delim, size_t *len)
> {
> @@ -96,3 +98,68 @@ make_boolean_term (void *ctx, const char *prefix, const char *term,
>
> return 0;
> }
> +
> +static const char*
> +skip_space (const char *str)
> +{
> + while (*str && isspace (*str))
Pedantic: isspace ((unsigned char) *str)
> + ++str;
> + return str;
> +}
> +
> +int
> +parse_boolean_term (void *ctx, const char *str,
> + char **prefix_out, char **term_out)
> +{
> + *prefix_out = *term_out = NULL;
> +
> + /* Parse prefix */
> + str = skip_space (str);
> + const char *pos = strchr (str, ':');
> + if (! pos)
if (! pos || pos == str) ?
> + goto FAIL;
Could just return 1 here.
> + *prefix_out = talloc_strndup (ctx, str, pos - str);
> + ++pos;
> +
> + /* Implement de-quoting compatible with make_boolean_term. */
> + if (*pos == '"') {
> + char *out = talloc_array (ctx, char, strlen (pos));
> + int closed = 0;
> + *term_out = out;
> + /* Skip the opening quote, find the closing quote, and
> + * un-double doubled internal quotes. */
> + for (++pos; *pos; ) {
> + if (*pos == '"') {
> + ++pos;
> + if (*pos != '"') {
> + /* Found the closing quote. */
> + closed = 1;
> + pos = skip_space (pos);
Is it necessary to accept trailing space?
> + break;
> + }
> + }
> + *out++ = *pos++;
> + }
> + /* Did the term terminate without a closing quote or is there
> + * trailing text after the closing quote? */
> + if (!closed || *pos)
> + goto FAIL;
> + *out = '\0';
> + } else {
> + const char *start = pos;
> + /* Check for text after the boolean term. */
> + while (*pos > ' ' && *pos != ')')
The condition could have *pos there too for clarity, though not strictly
necessary. Would be neat to have a ctype style helper that could be
shared between this and make_boolean_term.
> + ++pos;
> + if (*skip_space (pos))
Is it necessary to accept trailing space?
> + goto FAIL;
> + /* No trailing text; dup the string so the caller can free
> + * it. */
> + *term_out = talloc_strndup (ctx, start, pos - start);
> + }
> + return 0;
> +
> + FAIL:
> + talloc_free (*prefix_out);
> + talloc_free (*term_out);
> + return 1;
> +}
> diff --git a/util/string-util.h b/util/string-util.h
> index b8844a3..8b9fe50 100644
> --- a/util/string-util.h
> +++ b/util/string-util.h
> @@ -33,4 +33,19 @@ char *strtok_len (char *s, const char *delim, size_t *len);
> int make_boolean_term (void *talloc_ctx, const char *prefix, const char *term,
> char **buf, size_t *len);
>
> +/* Parse a boolean term query consisting of a prefix, a colon, and a
> + * term that may be quoted as described for make_boolean_term. If the
> + * term is not quoted, then it ends at the first whitespace or close
> + * parenthesis. str may containing leading or trailing whitespace,
> + * but anything else is considered a parse error. This is compatible
> + * with anything produced by make_boolean_term, and supports a subset
> + * of the quoting styles supported by Xapian (and hence notmuch).
> + * *prefix_out and *term_out will be talloc'd with context ctx.
> + *
> + * Return: 0 on success, non-zero on parse error.
> + */
> +int
> +parse_boolean_term (void *ctx, const char *str,
> + char **prefix_out, char **term_out);
> +
> #endif
> --
> 1.7.10.4
^ permalink raw reply [flat|nested] 14+ messages in thread
* [PATCH v4 3/5] dump: Disallow \n in message IDs
2012-12-31 6:42 [PATCH v4 0/5] Use Xapian query syntax for batch-tag dump/restore Austin Clements
2012-12-31 6:42 ` [PATCH v4 1/5] util: Factor out boolean term quoting routine Austin Clements
2012-12-31 6:42 ` [PATCH v4 2/5] util: Function to parse boolean term queries Austin Clements
@ 2012-12-31 6:42 ` Austin Clements
2013-01-03 17:19 ` Jani Nikula
2012-12-31 6:42 ` [PATCH v4 4/5] dump/restore: Use Xapian queries for batch-tag format Austin Clements
` (2 subsequent siblings)
5 siblings, 1 reply; 14+ messages in thread
From: Austin Clements @ 2012-12-31 6:42 UTC (permalink / raw)
To: notmuch; +Cc: tomi.ollila
When we switch to using regular Xapian queries in the dump format, \n
will cause problems, so we disallow it. Specially, while Xapian can
quote and parse queries containing \n without difficultly, quoted
queries containing \n still span multiple lines, which breaks the
line-orientedness of the dump format. Strictly speaking, we could
still round-trip these, but it would significantly complicate restore
as well as scripts that deal with tag dumps. This complexity would
come at absolutely no benefit: because of the RFC 2822 unfolding
rules, no amount of standards negligence can produce a message with a
message ID containing a line break (not even Outlook can do it!).
Hence, we simply disallow it.
---
notmuch-dump.c | 9 +++++++++
test/random-corpus.c | 4 +++-
2 files changed, 12 insertions(+), 1 deletion(-)
diff --git a/notmuch-dump.c b/notmuch-dump.c
index d2dad40..29d79da 100644
--- a/notmuch-dump.c
+++ b/notmuch-dump.c
@@ -132,6 +132,15 @@ notmuch_dump_command (unused (void *ctx), int argc, char *argv[])
if (output_format == DUMP_FORMAT_SUP) {
fputs (")\n", output);
} else {
+ if (strchr (message_id, '\n')) {
+ /* This will produce a line break in the output, which
+ * would be difficult to handle in tools. However,
+ * it's also impossible to produce an email containing
+ * a line break in a message ID because of unfolding,
+ * so we can safely disallow it. */
+ fprintf (stderr, "Error: cannot dump message id containing line break: %s\n", message_id);
+ return 1;
+ }
if (hex_encode (notmuch, message_id,
&buffer, &buffer_size) != HEX_SUCCESS) {
fprintf (stderr, "Error: failed to hex-encode msg-id %s\n",
diff --git a/test/random-corpus.c b/test/random-corpus.c
index f354d4b..8b7748e 100644
--- a/test/random-corpus.c
+++ b/test/random-corpus.c
@@ -96,7 +96,9 @@ random_utf8_string (void *ctx, size_t char_count)
buf = talloc_realloc (ctx, buf, gchar, buf_size);
}
- randomchar = random_unichar ();
+ do {
+ randomchar = random_unichar ();
+ } while (randomchar == '\n');
written = g_unichar_to_utf8 (randomchar, buf + offset);
--
1.7.10.4
^ permalink raw reply related [flat|nested] 14+ messages in thread
* Re: [PATCH v4 3/5] dump: Disallow \n in message IDs
2012-12-31 6:42 ` [PATCH v4 3/5] dump: Disallow \n in message IDs Austin Clements
@ 2013-01-03 17:19 ` Jani Nikula
0 siblings, 0 replies; 14+ messages in thread
From: Jani Nikula @ 2013-01-03 17:19 UTC (permalink / raw)
To: Austin Clements, notmuch; +Cc: tomi.ollila
On Mon, 31 Dec 2012, Austin Clements <amdragon@MIT.EDU> wrote:
> When we switch to using regular Xapian queries in the dump format, \n
> will cause problems, so we disallow it. Specially, while Xapian can
> quote and parse queries containing \n without difficultly, quoted
> queries containing \n still span multiple lines, which breaks the
> line-orientedness of the dump format. Strictly speaking, we could
> still round-trip these, but it would significantly complicate restore
> as well as scripts that deal with tag dumps. This complexity would
> come at absolutely no benefit: because of the RFC 2822 unfolding
> rules, no amount of standards negligence can produce a message with a
> message ID containing a line break (not even Outlook can do it!).
>
> Hence, we simply disallow it.
> ---
> notmuch-dump.c | 9 +++++++++
> test/random-corpus.c | 4 +++-
> 2 files changed, 12 insertions(+), 1 deletion(-)
>
> diff --git a/notmuch-dump.c b/notmuch-dump.c
> index d2dad40..29d79da 100644
> --- a/notmuch-dump.c
> +++ b/notmuch-dump.c
> @@ -132,6 +132,15 @@ notmuch_dump_command (unused (void *ctx), int argc, char *argv[])
> if (output_format == DUMP_FORMAT_SUP) {
> fputs (")\n", output);
> } else {
> + if (strchr (message_id, '\n')) {
> + /* This will produce a line break in the output, which
> + * would be difficult to handle in tools. However,
> + * it's also impossible to produce an email containing
> + * a line break in a message ID because of unfolding,
> + * so we can safely disallow it. */
> + fprintf (stderr, "Error: cannot dump message id containing line break: %s\n", message_id);
> + return 1;
How about just skipping the message in the dump, with a warning, instead
of bailing out? If the user is desperate to do a backup for whatever
reason, I don't think it's a good idea to require deleting the message
from the db before dump can succeed. The fs holding the db might be
remounted ro and all that.
And perhaps the message id in the error message should be wrapped in
quotes, because it will span multiple lines due to having a
newline... ;)
Otherwise, LGTM.
Jani.
> + }
> if (hex_encode (notmuch, message_id,
> &buffer, &buffer_size) != HEX_SUCCESS) {
> fprintf (stderr, "Error: failed to hex-encode msg-id %s\n",
> diff --git a/test/random-corpus.c b/test/random-corpus.c
> index f354d4b..8b7748e 100644
> --- a/test/random-corpus.c
> +++ b/test/random-corpus.c
> @@ -96,7 +96,9 @@ random_utf8_string (void *ctx, size_t char_count)
> buf = talloc_realloc (ctx, buf, gchar, buf_size);
> }
>
> - randomchar = random_unichar ();
> + do {
> + randomchar = random_unichar ();
> + } while (randomchar == '\n');
>
> written = g_unichar_to_utf8 (randomchar, buf + offset);
>
> --
> 1.7.10.4
^ permalink raw reply [flat|nested] 14+ messages in thread
* [PATCH v4 4/5] dump/restore: Use Xapian queries for batch-tag format
2012-12-31 6:42 [PATCH v4 0/5] Use Xapian query syntax for batch-tag dump/restore Austin Clements
` (2 preceding siblings ...)
2012-12-31 6:42 ` [PATCH v4 3/5] dump: Disallow \n in message IDs Austin Clements
@ 2012-12-31 6:42 ` Austin Clements
2012-12-31 12:41 ` Mark Walters
2012-12-31 6:42 ` [PATCH v4 5/5] man: Update notmuch-dump(1) and notmuch-restore(1) Austin Clements
2012-12-31 22:14 ` [PATCH v4 0/5] Use Xapian query syntax for batch-tag dump/restore Tomi Ollila
5 siblings, 1 reply; 14+ messages in thread
From: Austin Clements @ 2012-12-31 6:42 UTC (permalink / raw)
To: notmuch; +Cc: tomi.ollila
This switches the new batch-tag format away from using a home-grown
hex-encoding scheme for message IDs in the dump to simply using Xapian
queries with Xapian quoting syntax.
This has a variety of advantages beyond presenting a cleaner and more
consistent interface. Foremost is that it will dramatically simplify
the quoting for batch tagging, which shares the same input format.
While the hex-encoding is no better or worse for the simple ID queries
used by dump/restore, it becomes onerous for general-purpose queries
used in batch tagging. It also better handles strange cases like
"id:foo and bar", since this is no longer syntactically valid.
---
notmuch-dump.c | 9 +++++----
notmuch-restore.c | 22 ++++++++++------------
tag-util.c | 6 ------
test/dump-restore | 14 ++++++++------
4 files changed, 23 insertions(+), 28 deletions(-)
diff --git a/notmuch-dump.c b/notmuch-dump.c
index 29d79da..bf01a39 100644
--- a/notmuch-dump.c
+++ b/notmuch-dump.c
@@ -20,6 +20,7 @@
#include "notmuch-client.h"
#include "dump-restore-private.h"
+#include "string-util.h"
int
notmuch_dump_command (unused (void *ctx), int argc, char *argv[])
@@ -141,13 +142,13 @@ notmuch_dump_command (unused (void *ctx), int argc, char *argv[])
fprintf (stderr, "Error: cannot dump message id containing line break: %s\n", message_id);
return 1;
}
- if (hex_encode (notmuch, message_id,
- &buffer, &buffer_size) != HEX_SUCCESS) {
- fprintf (stderr, "Error: failed to hex-encode msg-id %s\n",
+ if (make_boolean_term (notmuch, "id", message_id,
+ &buffer, &buffer_size)) {
+ fprintf (stderr, "Error: failed to quote message id %s\n",
message_id);
return 1;
}
- fprintf (output, " -- id:%s\n", buffer);
+ fprintf (output, " -- %s\n", buffer);
}
notmuch_message_destroy (message);
diff --git a/notmuch-restore.c b/notmuch-restore.c
index 9ed9b51..77a4c27 100644
--- a/notmuch-restore.c
+++ b/notmuch-restore.c
@@ -207,7 +207,7 @@ notmuch_restore_command (unused (void *ctx), int argc, char *argv[])
INTERNAL_ERROR ("compile time constant regex failed.");
do {
- char *query_string;
+ char *query_string, *prefix, *term;
if (line_ctx != NULL)
talloc_free (line_ctx);
@@ -220,19 +220,17 @@ notmuch_restore_command (unused (void *ctx), int argc, char *argv[])
&query_string, tag_ops);
if (ret == 0) {
- if (strncmp ("id:", query_string, 3) != 0) {
- fprintf (stderr, "Warning: unsupported query: %s\n", query_string);
+ ret = parse_boolean_term (line_ctx, query_string,
+ &prefix, &term);
+ if (ret) {
+ fprintf (stderr, "Warning: cannot parse query: %s\n",
+ query_string);
+ continue;
+ } else if (strcmp ("id", prefix) != 0) {
+ fprintf (stderr, "Warning: not an id query: %s\n", query_string);
continue;
}
- /* delete id: from front of string; tag_message
- * expects a raw message-id.
- *
- * XXX: Note that query string id:foo and bar will be
- * interpreted as a message id "foo and bar". This
- * should eventually be fixed to give a better error
- * message.
- */
- query_string = query_string + 3;
+ query_string = term;
}
}
diff --git a/tag-util.c b/tag-util.c
index 705b7ba..e4e5dda 100644
--- a/tag-util.c
+++ b/tag-util.c
@@ -124,12 +124,6 @@ parse_tag_line (void *ctx, char *line,
}
/* tok now points to the query string */
- if (hex_decode_inplace (tok) != HEX_SUCCESS) {
- ret = line_error (TAG_PARSE_INVALID, line_for_error,
- "hex decoding of query %s failed", tok);
- goto DONE;
- }
-
*query_string = tok;
DONE:
diff --git a/test/dump-restore b/test/dump-restore
index 6a989b6..f9ae5b3 100755
--- a/test/dump-restore
+++ b/test/dump-restore
@@ -195,23 +195,25 @@ a
# the previous line was blank; also no yelling please
+%zz -- id:whatever
-+e +f id:%yy
++e +f id:"
++e +f tag:abc
# the next non-comment line should report an an empty tag error for
# batch tagging, but not for restore
+ +e -- id:20091117232137.GA7669@griffis1.net
-# highlight the sketchy id parsing; this should be last
-+g -- id:foo and bar
+# valid id, but warning about missing message
++e id:missing_message_id
EOF
cat <<EOF > EXPECTED
-Warning: unsupported query: a
+Warning: cannot parse query: a
Warning: no query string [+0]
Warning: no query string [+a +b]
Warning: missing query string [+a +b ]
Warning: no query string after -- [+c +d --]
Warning: hex decoding of tag %zz failed [+%zz -- id:whatever]
-Warning: hex decoding of query id:%yy failed [+e +f id:%yy]
-Warning: cannot apply tags to missing message: foo and bar
+Warning: cannot parse query: id:"
+Warning: not an id query: tag:abc
+Warning: cannot apply tags to missing message: missing_message_id
EOF
test_expect_equal_file EXPECTED OUTPUT
--
1.7.10.4
^ permalink raw reply related [flat|nested] 14+ messages in thread
* Re: [PATCH v4 4/5] dump/restore: Use Xapian queries for batch-tag format
2012-12-31 6:42 ` [PATCH v4 4/5] dump/restore: Use Xapian queries for batch-tag format Austin Clements
@ 2012-12-31 12:41 ` Mark Walters
2012-12-31 16:52 ` David Bremner
0 siblings, 1 reply; 14+ messages in thread
From: Mark Walters @ 2012-12-31 12:41 UTC (permalink / raw)
To: Austin Clements, notmuch; +Cc: tomi.ollila
On Mon, 31 Dec 2012, Austin Clements <amdragon@MIT.EDU> wrote:
> This switches the new batch-tag format away from using a home-grown
> hex-encoding scheme for message IDs in the dump to simply using Xapian
> queries with Xapian quoting syntax.
>
> This has a variety of advantages beyond presenting a cleaner and more
> consistent interface. Foremost is that it will dramatically simplify
> the quoting for batch tagging, which shares the same input format.
> While the hex-encoding is no better or worse for the simple ID queries
> used by dump/restore, it becomes onerous for general-purpose queries
> used in batch tagging. It also better handles strange cases like
> "id:foo and bar", since this is no longer syntactically valid.
This series as a whole looks good to me modulo the allocation query for
parse_boolean_term and a couple of trivial points below.
> ---
> notmuch-dump.c | 9 +++++----
> notmuch-restore.c | 22 ++++++++++------------
> tag-util.c | 6 ------
> test/dump-restore | 14 ++++++++------
> 4 files changed, 23 insertions(+), 28 deletions(-)
>
> diff --git a/notmuch-dump.c b/notmuch-dump.c
> index 29d79da..bf01a39 100644
> --- a/notmuch-dump.c
> +++ b/notmuch-dump.c
> @@ -20,6 +20,7 @@
>
> #include "notmuch-client.h"
> #include "dump-restore-private.h"
> +#include "string-util.h"
>
> int
> notmuch_dump_command (unused (void *ctx), int argc, char *argv[])
> @@ -141,13 +142,13 @@ notmuch_dump_command (unused (void *ctx), int argc, char *argv[])
> fprintf (stderr, "Error: cannot dump message id containing line break: %s\n", message_id);
> return 1;
> }
> - if (hex_encode (notmuch, message_id,
> - &buffer, &buffer_size) != HEX_SUCCESS) {
> - fprintf (stderr, "Error: failed to hex-encode msg-id %s\n",
> + if (make_boolean_term (notmuch, "id", message_id,
> + &buffer, &buffer_size)) {
> + fprintf (stderr, "Error: failed to quote message id %s\n",
> message_id);
> return 1;
> }
> - fprintf (output, " -- id:%s\n", buffer);
> + fprintf (output, " -- %s\n", buffer);
> }
>
> notmuch_message_destroy (message);
> diff --git a/notmuch-restore.c b/notmuch-restore.c
> index 9ed9b51..77a4c27 100644
> --- a/notmuch-restore.c
> +++ b/notmuch-restore.c
> @@ -207,7 +207,7 @@ notmuch_restore_command (unused (void *ctx), int argc, char *argv[])
> INTERNAL_ERROR ("compile time constant regex failed.");
>
> do {
> - char *query_string;
> + char *query_string, *prefix, *term;
>
> if (line_ctx != NULL)
> talloc_free (line_ctx);
> @@ -220,19 +220,17 @@ notmuch_restore_command (unused (void *ctx), int argc, char *argv[])
> &query_string, tag_ops);
>
> if (ret == 0) {
> - if (strncmp ("id:", query_string, 3) != 0) {
> - fprintf (stderr, "Warning: unsupported query: %s\n", query_string);
> + ret = parse_boolean_term (line_ctx, query_string,
> + &prefix, &term);
> + if (ret) {
> + fprintf (stderr, "Warning: cannot parse query: %s\n",
> + query_string);
> + continue;
> + } else if (strcmp ("id", prefix) != 0) {
> + fprintf (stderr, "Warning: not an id query: %s\n", query_string);
> continue;
I think it would be worth adding "skipping" or something similar to this
fprintf as it may not be obvious whether we warn but tag anyway or warn
and skip. Perhaps also add it to the previous one but there it is
obvious we can't do anything but skip.
> }
> - /* delete id: from front of string; tag_message
> - * expects a raw message-id.
> - *
> - * XXX: Note that query string id:foo and bar will be
> - * interpreted as a message id "foo and bar". This
> - * should eventually be fixed to give a better error
> - * message.
> - */
> - query_string = query_string + 3;
> + query_string = term;
> }
> }
>
> diff --git a/tag-util.c b/tag-util.c
> index 705b7ba..e4e5dda 100644
> --- a/tag-util.c
> +++ b/tag-util.c
> @@ -124,12 +124,6 @@ parse_tag_line (void *ctx, char *line,
> }
>
> /* tok now points to the query string */
> - if (hex_decode_inplace (tok) != HEX_SUCCESS) {
> - ret = line_error (TAG_PARSE_INVALID, line_for_error,
> - "hex decoding of query %s failed", tok);
> - goto DONE;
> - }
> -
> *query_string = tok;
>
> DONE:
> diff --git a/test/dump-restore b/test/dump-restore
> index 6a989b6..f9ae5b3 100755
> --- a/test/dump-restore
> +++ b/test/dump-restore
> @@ -195,23 +195,25 @@ a
>
> # the previous line was blank; also no yelling please
> +%zz -- id:whatever
> -+e +f id:%yy
> ++e +f id:"
> ++e +f tag:abc
It might be worth adding some more test lines here to test the various
paths in parse_boolean_term: along the lines of
+e -- id:some)stuff
+e -- id:some"stuff
+e -- id:some stuff
+e -- id:"a_message_id_with""_a_quote"
+e -- id:"a message id with spaces"
One other thing that is noticeable from the errors is that most of the
rest of the errors are very informative but the parse_boolean_term one
is relatively uninformative: it just says we cannot parse the id even
though we know rather more about what the error is (trailing text, no
closing quote, illegal character in an unquoted id etc). I am happy with
it how it is but perhaps David Bremner might like to comment?
Best wishes
Mark
> # the next non-comment line should report an an empty tag error for
> # batch tagging, but not for restore
> + +e -- id:20091117232137.GA7669@griffis1.net
> -# highlight the sketchy id parsing; this should be last
> -+g -- id:foo and bar
> +# valid id, but warning about missing message
> ++e id:missing_message_id
> EOF
>
> cat <<EOF > EXPECTED
> -Warning: unsupported query: a
> +Warning: cannot parse query: a
> Warning: no query string [+0]
> Warning: no query string [+a +b]
> Warning: missing query string [+a +b ]
> Warning: no query string after -- [+c +d --]
> Warning: hex decoding of tag %zz failed [+%zz -- id:whatever]
> -Warning: hex decoding of query id:%yy failed [+e +f id:%yy]
> -Warning: cannot apply tags to missing message: foo and bar
> +Warning: cannot parse query: id:"
> +Warning: not an id query: tag:abc
> +Warning: cannot apply tags to missing message: missing_message_id
> EOF
>
> test_expect_equal_file EXPECTED OUTPUT
> --
> 1.7.10.4
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH v4 4/5] dump/restore: Use Xapian queries for batch-tag format
2012-12-31 12:41 ` Mark Walters
@ 2012-12-31 16:52 ` David Bremner
0 siblings, 0 replies; 14+ messages in thread
From: David Bremner @ 2012-12-31 16:52 UTC (permalink / raw)
To: Mark Walters, Austin Clements, notmuch; +Cc: tomi.ollila
Mark Walters <markwalters1009@gmail.com> writes:
> One other thing that is noticeable from the errors is that most of the
> rest of the errors are very informative but the parse_boolean_term one
> is relatively uninformative: it just says we cannot parse the id even
> though we know rather more about what the error is (trailing text, no
> closing quote, illegal character in an unquoted id etc). I am happy with
> it how it is but perhaps David Bremner might like to comment?
I don't see a simple way to report better errors without adding more
status values to parse_boolean_term; in the other cases we report the
error as it is detected, in the cli code. I also don't think it's
correct for routines in ./util to directly print error messages.
d
^ permalink raw reply [flat|nested] 14+ messages in thread
* [PATCH v4 5/5] man: Update notmuch-dump(1) and notmuch-restore(1)
2012-12-31 6:42 [PATCH v4 0/5] Use Xapian query syntax for batch-tag dump/restore Austin Clements
` (3 preceding siblings ...)
2012-12-31 6:42 ` [PATCH v4 4/5] dump/restore: Use Xapian queries for batch-tag format Austin Clements
@ 2012-12-31 6:42 ` Austin Clements
2012-12-31 22:14 ` [PATCH v4 0/5] Use Xapian query syntax for batch-tag dump/restore Tomi Ollila
5 siblings, 0 replies; 14+ messages in thread
From: Austin Clements @ 2012-12-31 6:42 UTC (permalink / raw)
To: notmuch; +Cc: tomi.ollila
Describe the new batch-tag format. For notmuch-restore, rather than
half-heartedly duplicating the description, we now cite notmuch-dump.
---
man/man1/notmuch-dump.1 | 11 +++++++----
man/man1/notmuch-restore.1 | 6 ++----
2 files changed, 9 insertions(+), 8 deletions(-)
diff --git a/man/man1/notmuch-dump.1 b/man/man1/notmuch-dump.1
index 770b00f..7bd6def 100644
--- a/man/man1/notmuch-dump.1
+++ b/man/man1/notmuch-dump.1
@@ -64,13 +64,16 @@ and tags containing whitespace or non-\fBascii\fR(7) characters.
Each line has the form
.RS 4
-.RI "+<" "encoded-tag" "> " "" "+<" "encoded-tag" "> ... -- " "" " id:<" encoded-message-id >
+.RI "+<" "encoded-tag" "> " "" "+<" "encoded-tag" "> ... -- " "" " id:<" quoted-message-id >
-where encoded means that every byte not matching the regex
+Tags are hex-encoded by replacing every byte not matching the regex
.B [A-Za-z0-9@=.,_+-]
-is replace by
+with
.B %nn
-where nn is the two digit hex encoding.
+where nn is the two digit hex encoding. The message ID is a valid Xapian
+query, quoted using Xapian boolean term quoting rules: if the ID contains
+whitespace or a close paren or starts with a double quote, it must be
+enclosed in double quotes and double quotes inside the ID must be doubled.
The astute reader will notice this is a special case of the batch input
format for \fBnotmuch-tag\fR(1); note that the single message-id query is
mandatory for \fBnotmuch-restore\fR(1).
diff --git a/man/man1/notmuch-restore.1 b/man/man1/notmuch-restore.1
index 6bba628..78fef52 100644
--- a/man/man1/notmuch-restore.1
+++ b/man/man1/notmuch-restore.1
@@ -57,10 +57,8 @@ sup calls them).
The
.B batch-tag
dump format is intended to more robust against malformed message-ids
-and tags containing whitespace or non-\fBascii\fR(7) characters. This
-format hex-escapes all characters those outside of a small character
-set, intended to be suitable for e.g. pathnames in most UNIX-like
-systems.
+and tags containing whitespace or non-\fBascii\fR(7) characters. See
+\fBnotmuch-dump\fR(1) for details on this format.
.B "notmuch restore"
updates the maildir flags according to tag changes if the
--
1.7.10.4
^ permalink raw reply related [flat|nested] 14+ messages in thread
* Re: [PATCH v4 0/5] Use Xapian query syntax for batch-tag dump/restore
2012-12-31 6:42 [PATCH v4 0/5] Use Xapian query syntax for batch-tag dump/restore Austin Clements
` (4 preceding siblings ...)
2012-12-31 6:42 ` [PATCH v4 5/5] man: Update notmuch-dump(1) and notmuch-restore(1) Austin Clements
@ 2012-12-31 22:14 ` Tomi Ollila
5 siblings, 0 replies; 14+ messages in thread
From: Tomi Ollila @ 2012-12-31 22:14 UTC (permalink / raw)
To: Austin Clements, notmuch
On Mon, Dec 31 2012, Austin Clements <amdragon@MIT.EDU> wrote:
> This obsoletes
>
> id:1356719189-2837-1-git-send-email-amdragon@mit.edu
>
> This version accepts whitespace before and after boolean terms in
> parse_boolean_term and expands its documentation comment to describe
> what it accepts and how that relates to Xapian and make_boolean_term.
>
> The diff from v3 is below.
With these changes this looks good to me...
Tomi
>
> diff --git a/util/string-util.c b/util/string-util.c
> index 83b4953..52c7781 100644
> --- a/util/string-util.c
> +++ b/util/string-util.c
> @@ -22,6 +22,8 @@
> #include "string-util.h"
> #include "talloc.h"
>
> +#include <ctype.h>
> +
> char *
> strtok_len (char *s, const char *delim, size_t *len)
> {
> @@ -97,6 +99,14 @@ make_boolean_term (void *ctx, const char *prefix, const char *term,
> return 0;
> }
>
> +static const char*
> +skip_space (const char *str)
> +{
> + while (*str && isspace (*str))
> + ++str;
> + return str;
> +}
> +
> int
> parse_boolean_term (void *ctx, const char *str,
> char **prefix_out, char **term_out)
> @@ -104,6 +114,7 @@ parse_boolean_term (void *ctx, const char *str,
> *prefix_out = *term_out = NULL;
>
> /* Parse prefix */
> + str = skip_space (str);
> const char *pos = strchr (str, ':');
> if (! pos)
> goto FAIL;
> @@ -123,6 +134,7 @@ parse_boolean_term (void *ctx, const char *str,
> if (*pos != '"') {
> /* Found the closing quote. */
> closed = 1;
> + pos = skip_space (pos);
> break;
> }
> }
> @@ -138,11 +150,11 @@ parse_boolean_term (void *ctx, const char *str,
> /* Check for text after the boolean term. */
> while (*pos > ' ' && *pos != ')')
> ++pos;
> - if (*pos)
> + if (*skip_space (pos))
> goto FAIL;
> /* No trailing text; dup the string so the caller can free
> * it. */
> - *term_out = talloc_strdup (ctx, start);
> + *term_out = talloc_strndup (ctx, start, pos - start);
> }
> return 0;
>
> diff --git a/util/string-util.h b/util/string-util.h
> index 43d49d0..8b9fe50 100644
> --- a/util/string-util.h
> +++ b/util/string-util.h
> @@ -33,12 +33,16 @@ char *strtok_len (char *s, const char *delim, size_t *len);
> int make_boolean_term (void *talloc_ctx, const char *prefix, const char *term,
> char **buf, size_t *len);
>
> -/* Parse a boolean term query produced by make_boolean_term, returning
> - * the prefix in *prefix_out and the term in *term_out. *prefix_out
> - * and *term_out will be talloc'd with context ctx.
> +/* Parse a boolean term query consisting of a prefix, a colon, and a
> + * term that may be quoted as described for make_boolean_term. If the
> + * term is not quoted, then it ends at the first whitespace or close
> + * parenthesis. str may containing leading or trailing whitespace,
> + * but anything else is considered a parse error. This is compatible
> + * with anything produced by make_boolean_term, and supports a subset
> + * of the quoting styles supported by Xapian (and hence notmuch).
> + * *prefix_out and *term_out will be talloc'd with context ctx.
> *
> - * Return: 0 on success, non-zero on parse error (including trailing
> - * data in str).
> + * Return: 0 on success, non-zero on parse error.
> */
> int
> parse_boolean_term (void *ctx, const char *str,
>
>
> _______________________________________________
> notmuch mailing list
> notmuch@notmuchmail.org
> http://notmuchmail.org/mailman/listinfo/notmuch
^ permalink raw reply [flat|nested] 14+ messages in thread