unofficial mirror of notmuch@notmuchmail.org
 help / color / mirror / code / Atom feed
* [PATCH] util/hex-escape.[ch]: encoding/decoding strings into restricted character set
@ 2011-12-11 16:19 David Bremner
  2011-12-11 17:56 ` Dmitry Kurochkin
  2011-12-12 11:29 ` Tomi Ollila
  0 siblings, 2 replies; 4+ messages in thread
From: David Bremner @ 2011-12-11 16:19 UTC (permalink / raw)
  To: notmuch; +Cc: David Bremner

From: David Bremner <bremner@debian.org>

The character set is chosen to be suitable for pathnames, and the same
as that used by contrib/nmbug. The new encoded/decoded strings are
allocated using talloc.
---
This isn't urgent, but it is useful for a couple projects I have
brewing (nmbug compatible dump/restore and tag logging), so I thought
I would get some feedback on it.


 util/Makefile.local |    4 +-
 util/hex-escape.c   |  110 +++++++++++++++++++++++++++++++++++++++++++++++++++
 util/hex-escape.h   |   10 +++++
 3 files changed, 122 insertions(+), 2 deletions(-)
 create mode 100644 util/hex-escape.c
 create mode 100644 util/hex-escape.h

diff --git a/util/Makefile.local b/util/Makefile.local
index 0340899..2e63932 100644
--- a/util/Makefile.local
+++ b/util/Makefile.local
@@ -3,11 +3,11 @@
 dir := util
 extra_cflags += -I$(srcdir)/$(dir)
 
-libutil_c_srcs := $(dir)/xutil.c $(dir)/error_util.c
+libutil_c_srcs := $(dir)/xutil.c $(dir)/error_util.c $(dir)/hex-escape.c
 
 libutil_modules := $(libutil_c_srcs:.c=.o)
 
 $(dir)/libutil.a: $(libutil_modules)
 	$(call quiet,AR) rcs $@ $^
 
-CLEAN := $(CLEAN) $(dir)/xutil.o $(dir)/error_util.o $(dir)/libutil.a
+CLEAN := $(CLEAN) $(libutil_modules) $(dir)/libutil.a
diff --git a/util/hex-escape.c b/util/hex-escape.c
new file mode 100644
index 0000000..c294bb5
--- /dev/null
+++ b/util/hex-escape.c
@@ -0,0 +1,110 @@
+/* hex-escape.c -  Manage encoding and decoding of byte strings into
+ *		   a restricted character set.
+ *
+ * Copyright (c) 2011 David Bremner
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see http://www.gnu.org/licenses/ .
+ *
+ * Author: David Bremner <david@tethera.net>
+ */
+
+#include <string.h>
+#include <talloc.h>
+#include "error_util.h"
+#include "hex-escape.h"
+
+static int
+escapes_needed (const char *str){
+    int escapes = 0;
+
+    while (*str) {
+	if (index (HEX_NO_ESCAPE, *str) == NULL)
+	    escapes++;
+	str++;
+    }
+
+   return escapes;
+}
+
+char *
+hex_encode (void *ctx, const char *str) {
+    char *newstr = talloc_size (ctx, strlen (str)+3*escapes_needed (str)+1);
+
+    char *out = newstr;
+
+    while (*str) {
+	if (index (HEX_NO_ESCAPE, *str)) {
+	    *out++ = *str++;
+	} else {
+	    sprintf (out, "%%%02x", *str);
+	    str++;
+	    out += 3;
+	}
+    }
+    *out = 0;
+    return newstr;
+}
+
+inline static int
+_digit (char c) {
+    if ('0' <= c && c <= '9')
+	return c - '0';
+
+    if ('A' <= c && c <= 'F')
+	return c - 'A';
+
+    if ('a' <= c && c <= 'f')
+	return c - 'a';
+
+    INTERNAL_ERROR ("Illegal hex digit %c", c);
+    /*NOTREACHED*/
+    return 0;
+}
+
+char *hex_decode (void *ctx, const char *str) {
+
+    int len = strlen(str);
+
+    const char *p;
+    char *q;
+    char *newstr;
+    int escapes = 0;
+
+    for (p = str; *p; p++)
+	escapes += (*p == HEX_ESCAPE_CHAR);
+
+    newstr = talloc_size (ctx, len - escapes*2 + 1);
+
+    p = str;
+    q = newstr;
+
+    while (*p) {
+
+	if (*p == HEX_ESCAPE_CHAR) {
+
+	    if (len < 3) INTERNAL_ERROR ("Syntax error decoding %s", str);
+
+	    *q = _digit(p[1]) * 16;
+	    *q += _digit(p[2]);
+
+	    len -= 3;
+	    p += 3;
+	    q++;
+	} else {
+	    *q++ = *p++;
+	}
+    }
+
+    return newstr;
+}
diff --git a/util/hex-escape.h b/util/hex-escape.h
new file mode 100644
index 0000000..7caff15
--- /dev/null
+++ b/util/hex-escape.h
@@ -0,0 +1,10 @@
+#ifndef _HEX_ESCAPE_H
+#define _HEX_ESCAPE_H
+
+#define HEX_ESCAPE_CHAR '%'
+#define HEX_NO_ESCAPE 	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" \
+			"0123456789+-_@=.:,"
+
+char *hex_encode (void *talloc_ctx, const char *string);
+char *hex_decode (void *talloc_ctx, const char *hex);
+#endif
-- 
1.7.7.3

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH] util/hex-escape.[ch]: encoding/decoding strings into restricted character set
  2011-12-11 16:19 [PATCH] util/hex-escape.[ch]: encoding/decoding strings into restricted character set David Bremner
@ 2011-12-11 17:56 ` Dmitry Kurochkin
  2011-12-12 18:23   ` David Bremner
  2011-12-12 11:29 ` Tomi Ollila
  1 sibling, 1 reply; 4+ messages in thread
From: Dmitry Kurochkin @ 2011-12-11 17:56 UTC (permalink / raw)
  To: David Bremner, notmuch; +Cc: David Bremner

On Sun, 11 Dec 2011 12:19:44 -0400, David Bremner <david@tethera.net> wrote:
> From: David Bremner <bremner@debian.org>
> 
> The character set is chosen to be suitable for pathnames, and the same
> as that used by contrib/nmbug. The new encoded/decoded strings are
> allocated using talloc.
> ---
> This isn't urgent, but it is useful for a couple projects I have
> brewing (nmbug compatible dump/restore and tag logging), so I thought
> I would get some feedback on it.
> 

I have some free time to spend on notmuch reviews today.  So here it is
comes :)

> 
>  util/Makefile.local |    4 +-
>  util/hex-escape.c   |  110 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  util/hex-escape.h   |   10 +++++
>  3 files changed, 122 insertions(+), 2 deletions(-)
>  create mode 100644 util/hex-escape.c
>  create mode 100644 util/hex-escape.h
> 
> diff --git a/util/Makefile.local b/util/Makefile.local
> index 0340899..2e63932 100644
> --- a/util/Makefile.local
> +++ b/util/Makefile.local
> @@ -3,11 +3,11 @@
>  dir := util
>  extra_cflags += -I$(srcdir)/$(dir)
>  
> -libutil_c_srcs := $(dir)/xutil.c $(dir)/error_util.c
> +libutil_c_srcs := $(dir)/xutil.c $(dir)/error_util.c $(dir)/hex-escape.c
>  
>  libutil_modules := $(libutil_c_srcs:.c=.o)
>  
>  $(dir)/libutil.a: $(libutil_modules)
>  	$(call quiet,AR) rcs $@ $^
>  
> -CLEAN := $(CLEAN) $(dir)/xutil.o $(dir)/error_util.o $(dir)/libutil.a
> +CLEAN := $(CLEAN) $(libutil_modules) $(dir)/libutil.a

IMO this should be pushed as a separate patch (that does not need a
review :)).

> diff --git a/util/hex-escape.c b/util/hex-escape.c
> new file mode 100644
> index 0000000..c294bb5
> --- /dev/null
> +++ b/util/hex-escape.c
> @@ -0,0 +1,110 @@
> +/* hex-escape.c -  Manage encoding and decoding of byte strings into
> + *		   a restricted character set.
> + *
> + * Copyright (c) 2011 David Bremner
> + *
> + * This program is free software: you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation, either version 3 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program.  If not, see http://www.gnu.org/licenses/ .
> + *
> + * Author: David Bremner <david@tethera.net>
> + */
> +
> +#include <string.h>
> +#include <talloc.h>
> +#include "error_util.h"
> +#include "hex-escape.h"
> +
> +static int
> +escapes_needed (const char *str){

The name suggests that the function returns a boolean (needed or not
needed).  Consider renaming to escapes_count, count_escapes or similar.

Also there is a space missing before the {.

> +    int escapes = 0;
> +
> +    while (*str) {
> +	if (index (HEX_NO_ESCAPE, *str) == NULL)

Consider adding a function (bool escape_needed(const char c)) (similar
to isalpha() and friends) which would call index() and use it here and
in hex_encode.  (This comment would not be actual if you decide to
introduce a general char counting function.)

> +	    escapes++;
> +	str++;
> +    }
> +
> +   return escapes;

Can we replace this function with a two-line for loop similar to the one
in hex_decode?

I think we should either use inline loops for counting chars in both
hex_encode and hex_decode, or introduce a more general function that
counts the number of given characters and use it in both hex_encode and
hex_decode.

> +}
> +
> +char *
> +hex_encode (void *ctx, const char *str) {
> +    char *newstr = talloc_size (ctx, strlen (str)+3*escapes_needed (str)+1);

Do we need only strlen(str) + 2*escaped_count + 1 here?

IMO it is very weird that we have spaces after a function name, but do
not have spaces around +...

> +
> +    char *out = newstr;
> +

Why do we need both out and newstr variables?

> +    while (*str) {
> +	if (index (HEX_NO_ESCAPE, *str)) {
> +	    *out++ = *str++;
> +	} else {
> +	    sprintf (out, "%%%02x", *str);
> +	    str++;

Use *str++ as sprintf() parameter instead of doing it on a separate
line?

> +	    out += 3;
> +	}
> +    }
> +    *out = 0;

I would prefer '\0' here.

> +    return newstr;
> +}
> +
> +inline static int
> +_digit (char c) {

Perhaps rename to hex_digit?  Other static function names do not start
with underscore.  Let's be consistent.

> +    if ('0' <= c && c <= '9')
> +	return c - '0';
> +
> +    if ('A' <= c && c <= 'F')
> +	return c - 'A';
> +
> +    if ('a' <= c && c <= 'f')
> +	return c - 'a';
> +

Does this really work as expected?  'B' - 'A' would be 1, while it seems
that we expect 10.  Perhaps we should use sscanf(3) instead?  That may
make the code simpler and allow to convert both chars at once.

> +    INTERNAL_ERROR ("Illegal hex digit %c", c);
> +    /*NOTREACHED*/
> +    return 0;
> +}
> +
> +char *hex_decode (void *ctx, const char *str) {
> +
> +    int len = strlen(str);
> +
> +    const char *p;
> +    char *q;
> +    char *newstr;

If you decide to use "out" variable in hex_encode() instead of "newstr",
please rename it here as well.

> +    int escapes = 0;
> +
> +    for (p = str; *p; p++)
> +	escapes += (*p == HEX_ESCAPE_CHAR);
> +
> +    newstr = talloc_size (ctx, len - escapes*2 + 1);
> +
> +    p = str;
> +    q = newstr;
> +
> +    while (*p) {
> +
> +	if (*p == HEX_ESCAPE_CHAR) {
> +
> +	    if (len < 3) INTERNAL_ERROR ("Syntax error decoding %s", str);
> +
> +	    *q = _digit(p[1]) * 16;
> +	    *q += _digit(p[2]);
> +
> +	    len -= 3;
> +	    p += 3;
> +	    q++;
> +	} else {
> +	    *q++ = *p++;
> +	}
> +    }
> +
> +    return newstr;
> +}
> diff --git a/util/hex-escape.h b/util/hex-escape.h
> new file mode 100644
> index 0000000..7caff15
> --- /dev/null
> +++ b/util/hex-escape.h
> @@ -0,0 +1,10 @@
> +#ifndef _HEX_ESCAPE_H
> +#define _HEX_ESCAPE_H
> +
> +#define HEX_ESCAPE_CHAR '%'
> +#define HEX_NO_ESCAPE 	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" \
> +			"0123456789+-_@=.:,"
> +

Can we make these proper constants?

Are these macros supposed to be used elsewhere?  If no, we should move
them inside hex-escape.c and probably even make local to functions that
need them.

Regards,
  Dmitry

> +char *hex_encode (void *talloc_ctx, const char *string);
> +char *hex_decode (void *talloc_ctx, const char *hex);
> +#endif
> -- 
> 1.7.7.3
> 
> _______________________________________________
> notmuch mailing list
> notmuch@notmuchmail.org
> http://notmuchmail.org/mailman/listinfo/notmuch

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] util/hex-escape.[ch]: encoding/decoding strings into restricted character set
  2011-12-11 16:19 [PATCH] util/hex-escape.[ch]: encoding/decoding strings into restricted character set David Bremner
  2011-12-11 17:56 ` Dmitry Kurochkin
@ 2011-12-12 11:29 ` Tomi Ollila
  1 sibling, 0 replies; 4+ messages in thread
From: Tomi Ollila @ 2011-12-12 11:29 UTC (permalink / raw)
  To: David Bremner, notmuch; +Cc: David Bremner

On Sun, 11 Dec 2011 12:19:44 -0400, David Bremner <david@tethera.net> wrote:
> From: David Bremner <bremner@debian.org>
> 
> The character set is chosen to be suitable for pathnames, and the same
> as that used by contrib/nmbug. The new encoded/decoded strings are
> allocated using talloc.
> ---
> This isn't urgent, but it is useful for a couple projects I have
> brewing (nmbug compatible dump/restore and tag logging), so I thought
> I would get some feedback on it.
> 
> 
>  util/Makefile.local |    4 +-
>  util/hex-escape.c   |  110 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  util/hex-escape.h   |   10 +++++
>  3 files changed, 122 insertions(+), 2 deletions(-)
>  create mode 100644 util/hex-escape.c
>  create mode 100644 util/hex-escape.h

Like Dmitry mentioned, Makefile.local change in separate patch after
hex-escape additions.

> diff --git a/util/hex-escape.c b/util/hex-escape.c
> new file mode 100644
> index 0000000..c294bb5
> --- /dev/null
> +++ b/util/hex-escape.c

[ ... snip ... ]
> +
> +static int
> +escapes_needed (const char *str){

Opening { in separate line, like in all other source files.

> +    int escapes = 0;
> +
> +    while (*str) {
> +	if (index (HEX_NO_ESCAPE, *str) == NULL)

strchr() instead of index()

And, like Dmitry mentioned, static const char _hex_no_escape[] = "...";

> +	    escapes++;
> +	str++;
> +    }
> +
> +   return escapes;
> +}
> +
> +char *
> +hex_encode (void *ctx, const char *str) {
> +    char *newstr = talloc_size (ctx, strlen (str)+3*escapes_needed (str)+1);

Consistent spacing, like Dmitry mentioned (I compared with
_optimize_tag_query () in notmuch-tag.c ).

> +
> +    char *out = newstr;
> +
> +    while (*str) {
> +	if (index (HEX_NO_ESCAPE, *str)) {

... if (strchr ( _hex_no_escape, *str) != NULL) { 

[ ... snip ... ]
> +
> +inline static int
> +_digit (char c) {

Maybe _hexdigit () ?

> +    if ('0' <= c && c <= '9')
> +	return c - '0';
> +
> +    if ('A' <= c && c <= 'F')
> +	return c - 'A';
> +
> +    if ('a' <= c && c <= 'f')
> +	return c - 'a';

Fix this (or change to sscanf) like Dmitry mentioned
(c - 'A' + 10 and c - 'a' + 10)

> +
> +    INTERNAL_ERROR ("Illegal hex digit %c", c);

Is this too heavy ? -- but there may not be alternative.

> +    /*NOTREACHED*/
> +    return 0;
> +}
[ ... snip ... ]

Tomi

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] util/hex-escape.[ch]: encoding/decoding strings into restricted character set
  2011-12-11 17:56 ` Dmitry Kurochkin
@ 2011-12-12 18:23   ` David Bremner
  0 siblings, 0 replies; 4+ messages in thread
From: David Bremner @ 2011-12-12 18:23 UTC (permalink / raw)
  To: Dmitry Kurochkin, notmuch

On Sun, 11 Dec 2011 21:56:36 +0400, Dmitry Kurochkin <dmitry.kurochkin@gmail.com> wrote:
> >  
> > -CLEAN := $(CLEAN) $(dir)/xutil.o $(dir)/error_util.o $(dir)/libutil.a
> > +CLEAN := $(CLEAN) $(libutil_modules) $(dir)/libutil.a
> 
> IMO this should be pushed as a separate patch (that does not need a
> review :)).

Pushed that two line patch.

d

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2011-12-12 18:23 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-12-11 16:19 [PATCH] util/hex-escape.[ch]: encoding/decoding strings into restricted character set David Bremner
2011-12-11 17:56 ` Dmitry Kurochkin
2011-12-12 18:23   ` David Bremner
2011-12-12 11:29 ` Tomi Ollila

Code repositories for project(s) associated with this public inbox

	https://yhetil.org/notmuch.git/

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).