From: Philipp Stephani <p.stephani2@gmail.com>
To: emacs-devel@gnu.org
Cc: Philipp Stephani <phst@google.com>
Subject: [PATCH] Improve error reporting when serializing non-Unicode strings to JSON
Date: Sat, 23 Dec 2017 17:58:57 +0100 [thread overview]
Message-ID: <20171223165857.25743-1-phst@google.com> (raw)
In-Reply-To: <CAArVCkR9fPzcAki3OJjM_og7vt1PewVGwRGomi7ff_Z5=LE5tg@mail.gmail.com>
* src/coding.h (EOL_SEEN_NONE, EOL_SEEN_LF, EOL_SEEN_CR)
(EOL_SEEN_CRLF): Move from coding.c.
* src/coding.c (check_utf_8): Make extern.
* src/json.c (json_check_utf8): New helper function.
(lisp_to_json_toplevel_1, lisp_to_json): Use it. To save a bit of
time, check for invalid UTF-8 strings only after encountering an
error, since Jansson already rejects them.
* test/src/json-tests.el (json-serialize/invalid-unicode): Adapt
expected error symbol.
---
src/coding.c | 10 +---------
src/coding.h | 8 ++++++++
src/json.c | 42 ++++++++++++++++++++++++++++++++++++------
test/src/json-tests.el | 10 ++++------
4 files changed, 49 insertions(+), 21 deletions(-)
diff --git a/src/coding.c b/src/coding.c
index 1705838ffa..b5cdafee4b 100644
--- a/src/coding.c
+++ b/src/coding.c
@@ -1114,14 +1114,6 @@ alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
*buf++ = id; \
} while (0)
-
-/* Bitmasks for coding->eol_seen. */
-
-#define EOL_SEEN_NONE 0
-#define EOL_SEEN_LF 1
-#define EOL_SEEN_CR 2
-#define EOL_SEEN_CRLF 4
-
\f
/*** 2. Emacs' internal format (emacs-utf-8) ***/
@@ -6266,7 +6258,7 @@ check_ascii (struct coding_system *coding)
the value is reliable only when all the source bytes are valid
UTF-8. */
-static ptrdiff_t
+ptrdiff_t
check_utf_8 (struct coding_system *coding)
{
const unsigned char *src, *end;
diff --git a/src/coding.h b/src/coding.h
index 66d125b07e..314d044def 100644
--- a/src/coding.h
+++ b/src/coding.h
@@ -662,9 +662,17 @@ struct coding_system
/* Note that this encodes utf-8, not utf-8-emacs, so it's not a no-op. */
#define ENCODE_UTF_8(str) code_convert_string_norecord (str, Qutf_8, true)
+/* Bitmasks for coding->eol_seen. */
+
+#define EOL_SEEN_NONE 0
+#define EOL_SEEN_LF 1
+#define EOL_SEEN_CR 2
+#define EOL_SEEN_CRLF 4
+
/* Extern declarations. */
extern Lisp_Object code_conversion_save (bool, bool);
extern bool encode_coding_utf_8 (struct coding_system *);
+extern ptrdiff_t check_utf_8 (struct coding_system *);
extern void setup_coding_system (Lisp_Object, struct coding_system *);
extern Lisp_Object coding_charset_list (struct coding_system *);
extern Lisp_Object coding_system_charset_list (Lisp_Object);
diff --git a/src/json.c b/src/json.c
index 689f6ac510..fc2265a793 100644
--- a/src/json.c
+++ b/src/json.c
@@ -313,6 +313,26 @@ json_check (json_t *object)
return object;
}
+/* If STRING is not a valid UTF-8 string, signal an error of type
+ `wrong-type-argument'. STRING must be a unibyte string. */
+
+static void
+json_check_utf8 (Lisp_Object string)
+{
+ eassert (!STRING_MULTIBYTE (string));
+ struct coding_system coding;
+ setup_coding_system (Qutf_8_unix, &coding);
+ /* We initialize only the fields that check_utf_8 accesses. */
+ coding.head_ascii = -1;
+ coding.src_pos = 0;
+ coding.src_pos_byte = 0;
+ coding.src_chars = SCHARS (string);
+ coding.src_bytes = SBYTES (string);
+ coding.src_object = string;
+ coding.eol_seen = EOL_SEEN_NONE;
+ CHECK_TYPE (check_utf_8 (&coding) != -1, Qutf_8_string_p, string);
+}
+
static json_t *lisp_to_json (Lisp_Object);
/* Convert a Lisp object to a toplevel JSON object (array or object).
@@ -355,9 +375,12 @@ lisp_to_json_toplevel_1 (Lisp_Object lisp, json_t **json)
int status = json_object_set_new (*json, SSDATA (key),
lisp_to_json (HASH_VALUE (h, i)));
if (status == -1)
- /* FIXME: A failure here might also indicate that the
- key is not a valid Unicode string. */
- json_out_of_memory ();
+ {
+ /* A failure can be caused either by an invalid key or
+ by low memory. */
+ json_check_utf8 (key);
+ json_out_of_memory ();
+ }
}
clear_unwind_protect (count);
return unbind_to (count, Qnil);
@@ -403,9 +426,15 @@ lisp_to_json (Lisp_Object lisp)
else if (STRINGP (lisp))
{
Lisp_Object encoded = json_encode (lisp);
- /* FIXME: We might throw an out-of-memory error here if the
- string is not valid Unicode. */
- return json_check (json_stringn (SSDATA (encoded), SBYTES (encoded)));
+ json_t *json = json_stringn (SSDATA (encoded), SBYTES (encoded));
+ if (json == NULL)
+ {
+ /* A failure can be caused either by an invalid string or by
+ low memory. */
+ json_check_utf8 (encoded);
+ json_out_of_memory ();
+ }
+ return json;
}
/* LISP now must be a vector or hashtable. */
@@ -818,6 +847,7 @@ syms_of_json (void)
DEFSYM (Qstring_without_embedded_nulls_p, "string-without-embedded-nulls-p");
DEFSYM (Qjson_value_p, "json-value-p");
+ DEFSYM (Qutf_8_string_p, "utf-8-string-p");
DEFSYM (Qutf_8_unix, "utf-8-unix");
diff --git a/test/src/json-tests.el b/test/src/json-tests.el
index 9884e9a2d5..9bdb639423 100644
--- a/test/src/json-tests.el
+++ b/test/src/json-tests.el
@@ -84,12 +84,10 @@
(ert-deftest json-serialize/invalid-unicode ()
(skip-unless (fboundp 'json-serialize))
- ;; FIXME: "out of memory" is the wrong error signal, but we don't
- ;; currently distinguish between error types when serializing.
- (should-error (json-serialize ["a\uDBBBb"]) :type 'json-out-of-memory)
- (should-error (json-serialize ["u\x110000v"]) :type 'json-out-of-memory)
- (should-error (json-serialize ["u\x3FFFFFv"]) :type 'json-out-of-memory)
- (should-error (json-serialize ["u\xCCv"]) :type 'json-out-of-memory))
+ (should-error (json-serialize ["a\uDBBBb"]) :type 'wrong-type-argument)
+ (should-error (json-serialize ["u\x110000v"]) :type 'wrong-type-argument)
+ (should-error (json-serialize ["u\x3FFFFFv"]) :type 'wrong-type-argument)
+ (should-error (json-serialize ["u\xCCv"]) :type 'wrong-type-argument))
(ert-deftest json-parse-string/null ()
(skip-unless (fboundp 'json-parse-string))
--
2.15.1
next prev parent reply other threads:[~2017-12-23 16:58 UTC|newest]
Thread overview: 16+ messages / expand[flat|nested] mbox.gz Atom feed top
2017-12-22 21:00 [PATCH] Improve error reporting when serializing non-Unicode strings to JSON Philipp Stephani
2017-12-23 8:28 ` Eli Zaretskii
2017-12-23 12:33 ` Philipp Stephani
2017-12-23 13:44 ` Eli Zaretskii
2017-12-23 14:29 ` Philipp Stephani
2017-12-23 14:52 ` Eli Zaretskii
2017-12-23 15:00 ` Eli Zaretskii
2017-12-23 15:07 ` Philipp Stephani
2017-12-23 15:19 ` Philipp Stephani
2017-12-23 15:34 ` Eli Zaretskii
2017-12-23 16:20 ` Philipp Stephani
2017-12-23 16:36 ` Eli Zaretskii
2017-12-23 16:58 ` Philipp Stephani
2017-12-23 16:58 ` Philipp Stephani [this message]
2017-12-30 22:20 ` Philipp Stephani
2017-12-31 15:52 ` Eli Zaretskii
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
List information: https://www.gnu.org/software/emacs/
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20171223165857.25743-1-phst@google.com \
--to=p.stephani2@gmail.com \
--cc=emacs-devel@gnu.org \
--cc=phst@google.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this public inbox
https://git.savannah.gnu.org/cgit/emacs.git
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).