From mboxrd@z Thu Jan 1 00:00:00 1970
Path: news.gmane.org!.POSTED!not-for-mail
From: Philipp Stephani
Newsgroups: gmane.emacs.devel
Subject: [PATCH] Improve error reporting when serializing non-Unicode strings
to JSON
Date: Sat, 23 Dec 2017 17:58:57 +0100
Message-ID: <20171223165857.25743-1-phst@google.com>
References:
NNTP-Posting-Host: blaine.gmane.org
X-Trace: blaine.gmane.org 1514048246 12874 195.159.176.226 (23 Dec 2017 16:57:26 GMT)
X-Complaints-To: usenet@blaine.gmane.org
NNTP-Posting-Date: Sat, 23 Dec 2017 16:57:26 +0000 (UTC)
Cc: Philipp Stephani
To: emacs-devel@gnu.org
Original-X-From: emacs-devel-bounces+ged-emacs-devel=m.gmane.org@gnu.org Sat Dec 23 17:57:22 2017
Return-path:
Envelope-to: ged-emacs-devel@m.gmane.org
Original-Received: from lists.gnu.org ([208.118.235.17])
by blaine.gmane.org with esmtp (Exim 4.84_2)
(envelope-from )
id 1eSn6v-00030h-0I
for ged-emacs-devel@m.gmane.org; Sat, 23 Dec 2017 17:57:21 +0100
Original-Received: from localhost ([::1]:53765 helo=lists.gnu.org)
by lists.gnu.org with esmtp (Exim 4.71)
(envelope-from )
id 1eSn8t-0005oS-Fa
for ged-emacs-devel@m.gmane.org; Sat, 23 Dec 2017 11:59:23 -0500
Original-Received: from eggs.gnu.org ([2001:4830:134:3::10]:58077)
by lists.gnu.org with esmtp (Exim 4.71)
(envelope-from ) id 1eSn8j-0005ma-HQ
for emacs-devel@gnu.org; Sat, 23 Dec 2017 11:59:14 -0500
Original-Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71)
(envelope-from ) id 1eSn8g-0005O0-C4
for emacs-devel@gnu.org; Sat, 23 Dec 2017 11:59:13 -0500
Original-Received: from mail-wm0-x232.google.com ([2a00:1450:400c:c09::232]:38655)
by eggs.gnu.org with esmtps (TLS1.0:RSA_AES_128_CBC_SHA1:16)
(Exim 4.71) (envelope-from )
id 1eSn8g-0005N5-1t
for emacs-devel@gnu.org; Sat, 23 Dec 2017 11:59:10 -0500
Original-Received: by mail-wm0-x232.google.com with SMTP id 64so26436736wme.3
for ; Sat, 23 Dec 2017 08:59:09 -0800 (PST)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20161025;
h=from:to:cc:subject:date:message-id:in-reply-to:references;
bh=EVuJAgfhMKJOww5DuoGsApjZN86qf5i14oYJ3bfFYdM=;
b=Do7Aqv5upBhfsx/fsMapy0DksNEfTFR5WjIit0wJ7rwnhjty1XQ/TS73ORbEOzvc/E
WTe+vI3EDB457H7mo5e4DEJLqRmfY2JRuVKVpSMVEPWpQTnU0qC7fCs/KVrMFmKDPU3g
449y5O4r2+/ayF2Vq+w/DIib0WN/CennMJhdC+lmMPf8SCS5AnibKQnoqrhxU61zGlEI
lW78/bVXwdcuP9I7USjDgMxWIcYBUUvBfB9CXCkq6V2BQg3XbkJIfzLcO9YO8/psyn/Y
5qVqRjV7RcvZek5s0eDFWLSWFzmdnW0lfUscGlb+vZlvKlE2wUkKd0nnmmUssdzZl3a1
hTjA==
X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=1e100.net; s=20161025;
h=x-gm-message-state:from:to:cc:subject:date:message-id:in-reply-to
:references;
bh=EVuJAgfhMKJOww5DuoGsApjZN86qf5i14oYJ3bfFYdM=;
b=j7Rg+WiQXJZ1VV+MnEcj1IyoJe4/M3iRXu6o+iJtfyYrsxjKxQ6D4bc6Zny7IsNpkh
qPlgvLo99YK/RcNJD09EWJtX8TgLFwusVu0h27WtbCkfKYbqwE8YgGjb8QIeF2x57ux5
D1Hv9aVta06+xcpwEsP848Gh1kQPPDWpiCGgUmezNKuAS5OXUH9r7wReXcPjiO2tMvva
9DNHwXwWNYCcAbN7Wep+OkfnIh+la4EWjIq+tmmRD39ag8X27wgvY1RiG3uNsQNps8Pj
9J3nqos3pTj+gEKmfBkExTV30AXdgArd4YzxM6VvuR2aucYkYZ15d/CJEQ9E+66P9Ucs
zQrQ==
X-Gm-Message-State: AKGB3mIZ01vg5cnH7Pyf3oC2349+0kC9QWlISf4QtCuyZZqUjv7LkzoC
ajkkPiKwqwk5jDKStho93Cszknmq
X-Google-Smtp-Source: ACJfBovtRpXXLtOQY78DEzZRFBcky4Tp2yakYKno+zH1sx+zzKQP/yXv6Bx+p9HQOsPDrpUb9EzA2g==
X-Received: by 10.28.13.77 with SMTP id 74mr16038282wmn.51.1514048348298;
Sat, 23 Dec 2017 08:59:08 -0800 (PST)
Original-Received: from p.fritz.box (p5B13F8C4.dip0.t-ipconnect.de. [91.19.248.196])
by smtp.gmail.com with ESMTPSA id
k42sm3775762wrf.37.2017.12.23.08.59.07
(version=TLS1_2 cipher=ECDHE-RSA-AES128-SHA bits=128/128);
Sat, 23 Dec 2017 08:59:07 -0800 (PST)
X-Google-Original-From: Philipp Stephani
X-Mailer: git-send-email 2.15.1
In-Reply-To:
X-detected-operating-system: by eggs.gnu.org: Genre and OS details not
recognized.
X-Received-From: 2a00:1450:400c:c09::232
X-BeenThere: emacs-devel@gnu.org
X-Mailman-Version: 2.1.21
Precedence: list
List-Id: "Emacs development discussions."
List-Unsubscribe: ,
List-Archive:
List-Post:
List-Help:
List-Subscribe: ,
Errors-To: emacs-devel-bounces+ged-emacs-devel=m.gmane.org@gnu.org
Original-Sender: "Emacs-devel"
Xref: news.gmane.org gmane.emacs.devel:221391
Archived-At:
* src/coding.h (EOL_SEEN_NONE, EOL_SEEN_LF, EOL_SEEN_CR)
(EOL_SEEN_CRLF): Move from coding.c.
* src/coding.c (check_utf_8): Make extern.
* src/json.c (json_check_utf8): New helper function.
(lisp_to_json_toplevel_1, lisp_to_json): Use it. To save a bit of
time, check for invalid UTF-8 strings only after encountering an
error, since Jansson already rejects them.
* test/src/json-tests.el (json-serialize/invalid-unicode): Adapt
expected error symbol.
---
src/coding.c | 10 +---------
src/coding.h | 8 ++++++++
src/json.c | 42 ++++++++++++++++++++++++++++++++++++------
test/src/json-tests.el | 10 ++++------
4 files changed, 49 insertions(+), 21 deletions(-)
diff --git a/src/coding.c b/src/coding.c
index 1705838ffa..b5cdafee4b 100644
--- a/src/coding.c
+++ b/src/coding.c
@@ -1114,14 +1114,6 @@ alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
*buf++ = id; \
} while (0)
-
-/* Bitmasks for coding->eol_seen. */
-
-#define EOL_SEEN_NONE 0
-#define EOL_SEEN_LF 1
-#define EOL_SEEN_CR 2
-#define EOL_SEEN_CRLF 4
-
/*** 2. Emacs' internal format (emacs-utf-8) ***/
@@ -6266,7 +6258,7 @@ check_ascii (struct coding_system *coding)
the value is reliable only when all the source bytes are valid
UTF-8. */
-static ptrdiff_t
+ptrdiff_t
check_utf_8 (struct coding_system *coding)
{
const unsigned char *src, *end;
diff --git a/src/coding.h b/src/coding.h
index 66d125b07e..314d044def 100644
--- a/src/coding.h
+++ b/src/coding.h
@@ -662,9 +662,17 @@ struct coding_system
/* Note that this encodes utf-8, not utf-8-emacs, so it's not a no-op. */
#define ENCODE_UTF_8(str) code_convert_string_norecord (str, Qutf_8, true)
+/* Bitmasks for coding->eol_seen. */
+
+#define EOL_SEEN_NONE 0
+#define EOL_SEEN_LF 1
+#define EOL_SEEN_CR 2
+#define EOL_SEEN_CRLF 4
+
/* Extern declarations. */
extern Lisp_Object code_conversion_save (bool, bool);
extern bool encode_coding_utf_8 (struct coding_system *);
+extern ptrdiff_t check_utf_8 (struct coding_system *);
extern void setup_coding_system (Lisp_Object, struct coding_system *);
extern Lisp_Object coding_charset_list (struct coding_system *);
extern Lisp_Object coding_system_charset_list (Lisp_Object);
diff --git a/src/json.c b/src/json.c
index 689f6ac510..fc2265a793 100644
--- a/src/json.c
+++ b/src/json.c
@@ -313,6 +313,26 @@ json_check (json_t *object)
return object;
}
+/* If STRING is not a valid UTF-8 string, signal an error of type
+ `wrong-type-argument'. STRING must be a unibyte string. */
+
+static void
+json_check_utf8 (Lisp_Object string)
+{
+ eassert (!STRING_MULTIBYTE (string));
+ struct coding_system coding;
+ setup_coding_system (Qutf_8_unix, &coding);
+ /* We initialize only the fields that check_utf_8 accesses. */
+ coding.head_ascii = -1;
+ coding.src_pos = 0;
+ coding.src_pos_byte = 0;
+ coding.src_chars = SCHARS (string);
+ coding.src_bytes = SBYTES (string);
+ coding.src_object = string;
+ coding.eol_seen = EOL_SEEN_NONE;
+ CHECK_TYPE (check_utf_8 (&coding) != -1, Qutf_8_string_p, string);
+}
+
static json_t *lisp_to_json (Lisp_Object);
/* Convert a Lisp object to a toplevel JSON object (array or object).
@@ -355,9 +375,12 @@ lisp_to_json_toplevel_1 (Lisp_Object lisp, json_t **json)
int status = json_object_set_new (*json, SSDATA (key),
lisp_to_json (HASH_VALUE (h, i)));
if (status == -1)
- /* FIXME: A failure here might also indicate that the
- key is not a valid Unicode string. */
- json_out_of_memory ();
+ {
+ /* A failure can be caused either by an invalid key or
+ by low memory. */
+ json_check_utf8 (key);
+ json_out_of_memory ();
+ }
}
clear_unwind_protect (count);
return unbind_to (count, Qnil);
@@ -403,9 +426,15 @@ lisp_to_json (Lisp_Object lisp)
else if (STRINGP (lisp))
{
Lisp_Object encoded = json_encode (lisp);
- /* FIXME: We might throw an out-of-memory error here if the
- string is not valid Unicode. */
- return json_check (json_stringn (SSDATA (encoded), SBYTES (encoded)));
+ json_t *json = json_stringn (SSDATA (encoded), SBYTES (encoded));
+ if (json == NULL)
+ {
+ /* A failure can be caused either by an invalid string or by
+ low memory. */
+ json_check_utf8 (encoded);
+ json_out_of_memory ();
+ }
+ return json;
}
/* LISP now must be a vector or hashtable. */
@@ -818,6 +847,7 @@ syms_of_json (void)
DEFSYM (Qstring_without_embedded_nulls_p, "string-without-embedded-nulls-p");
DEFSYM (Qjson_value_p, "json-value-p");
+ DEFSYM (Qutf_8_string_p, "utf-8-string-p");
DEFSYM (Qutf_8_unix, "utf-8-unix");
diff --git a/test/src/json-tests.el b/test/src/json-tests.el
index 9884e9a2d5..9bdb639423 100644
--- a/test/src/json-tests.el
+++ b/test/src/json-tests.el
@@ -84,12 +84,10 @@
(ert-deftest json-serialize/invalid-unicode ()
(skip-unless (fboundp 'json-serialize))
- ;; FIXME: "out of memory" is the wrong error signal, but we don't
- ;; currently distinguish between error types when serializing.
- (should-error (json-serialize ["a\uDBBBb"]) :type 'json-out-of-memory)
- (should-error (json-serialize ["u\x110000v"]) :type 'json-out-of-memory)
- (should-error (json-serialize ["u\x3FFFFFv"]) :type 'json-out-of-memory)
- (should-error (json-serialize ["u\xCCv"]) :type 'json-out-of-memory))
+ (should-error (json-serialize ["a\uDBBBb"]) :type 'wrong-type-argument)
+ (should-error (json-serialize ["u\x110000v"]) :type 'wrong-type-argument)
+ (should-error (json-serialize ["u\x3FFFFFv"]) :type 'wrong-type-argument)
+ (should-error (json-serialize ["u\xCCv"]) :type 'wrong-type-argument))
(ert-deftest json-parse-string/null ()
(skip-unless (fboundp 'json-parse-string))
--
2.15.1