From: "Herman, Géza" <geza.herman@gmail.com>
To: emacs-devel@gnu.org
Subject: [PATCH] Implement fast verisons of json-parse functions
Date: Tue, 19 Mar 2024 19:23:36 +0100 [thread overview]
Message-ID: <87h6h2rsgn.fsf@gmail.com> (raw)
[-- Attachment #1: Type: text/plain, Size: 247 bytes --]
Tags: patch
Here's the latest version of my JSON parser. It is the same as before,
the only modification I did is that this patch doesn't replace the
original parser, but keeps them, and adds to two additional functions
with the -fast suffix.
[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 0001-Implement-fast-verisons-of-json-parse-functions.patch --]
[-- Type: text/patch, Size: 34292 bytes --]
From 48399572efbc16887f49fc5a0f20bed3b16b2115 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?G=C3=A9za=20Herman?= <geza.herman@gmail.com>
Date: Tue, 19 Mar 2024 18:59:09 +0100
Subject: [PATCH] Implement fast verisons of json-parse functions
---
src/json.c | 1108 ++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 1108 insertions(+)
diff --git a/src/json.c b/src/json.c
index e849ccaf722..c08d9d4a4f7 100644
--- a/src/json.c
+++ b/src/json.c
@@ -23,6 +23,7 @@ Copyright (C) 2017-2024 Free Software Foundation, Inc.
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
+#include <math.h>
#include <jansson.h>
@@ -1087,6 +1088,1099 @@ DEFUN ("json-parse-buffer", Fjson_parse_buffer, Sjson_parse_buffer,
return unbind_to (count, lisp);
}
+struct json_parser
+{
+ /* Because of a possible gap in the input (an emacs buffer can have
+ a gap), the input is described by [input_begin;input_end) and
+ [secondary_input_begin;secondary_input_end). If the input is
+ continuous, then secondary_input_begin and secondary_input_end
+ should be NULL */
+ const unsigned char *input_current;
+ const unsigned char *input_begin;
+ const unsigned char *input_end;
+
+ const unsigned char *secondary_input_begin;
+ const unsigned char *secondary_input_end;
+
+ int current_line;
+ int current_column;
+
+ /* The parser has a maximum allowed depth. available_depth
+ decreases at each object/array begin. If reaches zero, then an
+ error is generated */
+ int available_depth;
+
+ struct json_configuration conf;
+
+ size_t additional_bytes_count;
+
+ /* Lisp_Objects are collected in this area during object/array
+ parsing */
+ Lisp_Object object_workspace;
+ size_t object_workspace_size;
+ size_t object_workspace_current;
+
+ /* String and number parsing uses this workspace */
+ unsigned char *byte_workspace;
+ unsigned char *byte_workspace_end;
+ unsigned char *byte_workspace_current;
+};
+
+static AVOID
+json_signal_error (struct json_parser *parser, Lisp_Object error)
+{
+ xsignal2 (error, INT_TO_INTEGER (parser->current_line),
+ INT_TO_INTEGER (parser->current_column));
+}
+
+static void
+json_parser_init (struct json_parser *parser,
+ struct json_configuration conf,
+ const unsigned char *input,
+ const unsigned char *input_end,
+ const unsigned char *secondary_input,
+ const unsigned char *secondary_input_end)
+{
+ const int initial_object_workspace_size = 64;
+ const int initial_string_workspace_size = 512;
+
+ if (secondary_input >= secondary_input_end)
+ {
+ secondary_input = NULL;
+ secondary_input_end = NULL;
+ }
+
+ if (input < input_end)
+ {
+ parser->input_begin = input;
+ parser->input_end = input_end;
+
+ parser->secondary_input_begin = secondary_input;
+ parser->secondary_input_end = secondary_input_end;
+ }
+ else
+ {
+ parser->input_begin = secondary_input;
+ parser->input_end = secondary_input_end;
+
+ parser->secondary_input_begin = NULL;
+ parser->secondary_input_end = NULL;
+ }
+
+ parser->input_current = parser->input_begin;
+
+ parser->current_line = 1;
+ parser->current_column = 0;
+ parser->available_depth = 10000;
+ parser->conf = conf;
+
+ parser->additional_bytes_count = 0;
+
+ parser->object_workspace
+ = make_vector (initial_object_workspace_size, Qnil);
+ parser->object_workspace_size = initial_object_workspace_size;
+ parser->object_workspace_current = 0;
+
+ parser->byte_workspace = xmalloc (initial_string_workspace_size);
+ parser->byte_workspace_end
+ = parser->byte_workspace + initial_string_workspace_size;
+}
+
+static void
+json_parser_done (void *parser)
+{
+ struct json_parser *p = (struct json_parser *) parser;
+ xfree (p->byte_workspace);
+}
+
+/* Makes sure that the object_workspace has 'size' available space for
+ Lisp_Objects */
+NO_INLINE static void
+json_make_object_workspace_for_slow_path (struct json_parser *parser,
+ size_t size)
+{
+ size_t needed_workspace_size
+ = (parser->object_workspace_current + size);
+ size_t new_workspace_size = parser->object_workspace_size;
+ while (new_workspace_size < needed_workspace_size)
+ {
+ if (ckd_mul (&new_workspace_size, new_workspace_size, 2))
+ {
+ json_signal_error (parser, Qjson_out_of_memory);
+ }
+ }
+ Lisp_Object new_workspace = make_vector (new_workspace_size, Qnil);
+ for (size_t i = 0; i < parser->object_workspace_current; i++)
+ {
+ ASET (new_workspace, i, AREF (parser->object_workspace, i));
+ }
+ parser->object_workspace = new_workspace;
+ parser->object_workspace_size = new_workspace_size;
+}
+
+INLINE void
+json_make_object_workspace_for (struct json_parser *parser,
+ size_t size)
+{
+ if (parser->object_workspace_size - parser->object_workspace_current
+ < size)
+ {
+ json_make_object_workspace_for_slow_path (parser, size);
+ }
+}
+
+static void
+json_byte_workspace_reset (struct json_parser *parser)
+{
+ parser->byte_workspace_current = parser->byte_workspace;
+}
+
+/* Puts 'value' into the byte_workspace. If there is no space
+ available, it allocates space */
+NO_INLINE static void
+json_byte_workspace_put_slow_path (struct json_parser *parser,
+ unsigned char value)
+{
+ size_t new_workspace_size
+ = parser->byte_workspace_end - parser->byte_workspace;
+ if (ckd_mul (&new_workspace_size, new_workspace_size, 2))
+ {
+ json_signal_error (parser, Qjson_out_of_memory);
+ }
+
+ size_t offset
+ = parser->byte_workspace_current - parser->byte_workspace;
+ parser->byte_workspace
+ = xrealloc (parser->byte_workspace, new_workspace_size);
+ parser->byte_workspace_end
+ = parser->byte_workspace + new_workspace_size;
+ parser->byte_workspace_current = parser->byte_workspace + offset;
+ *parser->byte_workspace_current++ = value;
+}
+
+INLINE void
+json_byte_workspace_put (struct json_parser *parser,
+ unsigned char value)
+{
+ if (parser->byte_workspace_current < parser->byte_workspace_end)
+ {
+ *parser->byte_workspace_current++ = value;
+ }
+ else
+ {
+ json_byte_workspace_put_slow_path (parser, value);
+ }
+}
+
+static bool
+json_input_at_eof (struct json_parser *parser)
+{
+ if (parser->input_current < parser->input_end)
+ return false;
+ return parser->secondary_input_end == NULL;
+}
+
+/* If there is a secondary buffer, it switches to it */
+static int
+json_input_switch_to_secondary (struct json_parser *parser)
+{
+ if (parser->secondary_input_begin < parser->secondary_input_end)
+ {
+ parser->additional_bytes_count
+ = parser->input_end - parser->input_begin;
+ parser->input_begin = parser->secondary_input_begin;
+ parser->input_end = parser->secondary_input_end;
+ parser->input_current = parser->secondary_input_begin;
+ parser->secondary_input_begin = NULL;
+ parser->secondary_input_end = NULL;
+ return 0;
+ }
+ else
+ return -1;
+}
+
+/* Reads a byte from the JSON input stream */
+static unsigned char
+json_input_get (struct json_parser *parser)
+{
+ if (parser->input_current >= parser->input_end
+ && json_input_switch_to_secondary (parser) < 0)
+ json_signal_error (parser, Qjson_end_of_file);
+ return *parser->input_current++;
+}
+
+/* Reads a byte from the JSON input stream, if the stream is not at
+ * eof. At eof, returns -1 */
+static int
+json_input_get_if_possible (struct json_parser *parser)
+{
+ if (parser->input_current >= parser->input_end
+ && json_input_switch_to_secondary (parser) < 0)
+ return -1;
+ return *parser->input_current++;
+}
+
+/* Puts back the last read input byte. Only one byte can be put back,
+ because otherwise this code would need to handle switching from
+ the secondary buffer to the initial */
+static void
+json_input_put_back (struct json_parser *parser)
+{
+ parser->input_current--;
+}
+
+static bool
+json_skip_whitespace_internal (struct json_parser *parser, int c)
+{
+ parser->current_column++;
+ if (c == 0x20 || c == 0x09 || c == 0x0d)
+ return false;
+ else if (c == 0x0a)
+ {
+ parser->current_line++;
+ parser->current_column = 0;
+ return false;
+ }
+ else
+ return true;
+}
+
+/* Skips JSON whitespace, and returns with the first non-whitespace
+ * character */
+static int
+json_skip_whitespace (struct json_parser *parser)
+{
+ for (;;)
+ {
+ int c = json_input_get (parser);
+ if (json_skip_whitespace_internal (parser, c))
+ return c;
+ }
+}
+
+/* Skips JSON whitespace, and returns with the first non-whitespace
+ * character, if possible. If there is no non-whitespace character
+ * (because we reached the end), it returns -1 */
+static int
+json_skip_whitespace_if_possible (struct json_parser *parser)
+{
+ for (;;)
+ {
+ int c = json_input_get_if_possible (parser);
+ if (c < 0)
+ return c;
+ if (json_skip_whitespace_internal (parser, c))
+ return c;
+ }
+}
+
+static int
+json_hex_value (int c)
+{
+ if (c >= '0' && c <= '9')
+ return c - '0';
+ else if (c >= 'A' && c <= 'F')
+ return c - 'A' + 10;
+ else if (c >= 'a' && c <= 'f')
+ return c - 'a' + 10;
+ else
+ return -1;
+}
+
+/* Parses the CCCC part of the unicode escape sequence \uCCCC */
+static int
+json_parse_unicode (struct json_parser *parser)
+{
+ unsigned char v[4];
+ for (int i = 0; i < 4; i++)
+ {
+ int c = json_hex_value (json_input_get (parser));
+ parser->current_column++;
+ if (c < 0)
+ json_signal_error (parser, Qjson_escape_sequence_error);
+ v[i] = c;
+ }
+
+ return v[0] << 12 | v[1] << 8 | v[2] << 4 | v[3];
+}
+
+/* Parses an utf-8 code-point encoding (except the first byte), and
+ returns the numeric value of the code-point (without considering
+ the first byte) */
+static int
+json_handle_utf8_tail_bytes (struct json_parser *parser, int n)
+{
+ int v = 0;
+ for (int i = 0; i < n; i++)
+ {
+ int c = json_input_get (parser);
+ json_byte_workspace_put (parser, c);
+ if ((c & 0xc0) != 0x80)
+ json_signal_error (parser, Qjson_utf8_decode_error);
+ v = (v << 6) | (c & 0x3f);
+ }
+ return v;
+}
+
+/* Reads a JSON string, and puts the result into the byte workspace */
+static void
+json_parse_string (struct json_parser *parser)
+{
+ /* a single_uninteresting byte can be simply copied from the input
+ to output, it doesn't need any extra care. This means all the
+ characters between [0x20;0x7f], except the double quote and
+ the backslash */
+ static const char is_single_uninteresting[256] = {
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+ /* 0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 1 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 2 */ 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ /* 3 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ /* 4 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ /* 5 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
+ /* 6 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ /* 7 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ /* 8 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 9 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* a */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* b */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* c */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* d */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* e */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ };
+
+ for (;;)
+ {
+ /* This if is only here for a possible speedup. If there are 4
+ bytes available, and all of them are single_uninteresting,
+ then we can just copy these 4 bytes to output */
+ if (parser->input_end - parser->input_current >= 4)
+ {
+ int c0 = parser->input_current[0];
+ int c1 = parser->input_current[1];
+ int c2 = parser->input_current[2];
+ int c3 = parser->input_current[3];
+ bool v0 = is_single_uninteresting[c0];
+ bool v1 = is_single_uninteresting[c1];
+ bool v2 = is_single_uninteresting[c2];
+ bool v3 = is_single_uninteresting[c3];
+ if (v0 && v1 && v2 && v3)
+ {
+ json_byte_workspace_put (parser, c0);
+ json_byte_workspace_put (parser, c1);
+ json_byte_workspace_put (parser, c2);
+ json_byte_workspace_put (parser, c3);
+ parser->input_current += 4;
+ parser->current_column += 4;
+ continue;
+ }
+ }
+
+ int c = json_input_get (parser);
+ parser->current_column++;
+ if (is_single_uninteresting[c])
+ {
+ json_byte_workspace_put (parser, c);
+ continue;
+ }
+
+ if (c == '"')
+ return;
+ else if (c & 0x80)
+ {
+ /* Handle utf-8 encoding */
+ json_byte_workspace_put (parser, c);
+ if (c < 0xc0)
+ json_signal_error (parser, Qjson_utf8_decode_error);
+ else if (c < 0xe0)
+ {
+ int n = ((c & 0x1f) << 6
+ | json_handle_utf8_tail_bytes (parser, 1));
+ if (n < 0x80)
+ json_signal_error (parser, Qjson_utf8_decode_error);
+ }
+ else if (c < 0xf0)
+ {
+ int n = ((c & 0xf) << 12
+ | json_handle_utf8_tail_bytes (parser, 2));
+ if (n < 0x800 || (n >= 0xd800 && n < 0xe000))
+ json_signal_error (parser, Qjson_utf8_decode_error);
+ }
+ else if (c < 0xf8)
+ {
+ int n = ((c & 0x7) << 18
+ | json_handle_utf8_tail_bytes (parser, 3));
+ if (n < 0x10000 || n > 0x10ffff)
+ json_signal_error (parser, Qjson_utf8_decode_error);
+ }
+ else
+ json_signal_error (parser, Qjson_utf8_decode_error);
+ }
+ else if (c == '\\')
+ {
+ /* Handle escape sequences */
+ c = json_input_get (parser);
+ parser->current_column++;
+ if (c == '"')
+ json_byte_workspace_put (parser, '"');
+ else if (c == '\\')
+ json_byte_workspace_put (parser, '\\');
+ else if (c == '/')
+ json_byte_workspace_put (parser, '/');
+ else if (c == 'b')
+ json_byte_workspace_put (parser, '\b');
+ else if (c == 'f')
+ json_byte_workspace_put (parser, '\f');
+ else if (c == 'n')
+ json_byte_workspace_put (parser, '\n');
+ else if (c == 'r')
+ json_byte_workspace_put (parser, '\r');
+ else if (c == 't')
+ json_byte_workspace_put (parser, '\t');
+ else if (c == 'u')
+ {
+ int num = json_parse_unicode (parser);
+ /* is the first half of the surrogate pair */
+ if (num >= 0xd800 && num < 0xdc00)
+ {
+ parser->current_column++;
+ if (json_input_get (parser) != '\\')
+ json_signal_error (parser,
+ Qjson_invalid_surrogate_error);
+ parser->current_column++;
+ if (json_input_get (parser) != 'u')
+ json_signal_error (parser,
+ Qjson_invalid_surrogate_error);
+ int num2 = json_parse_unicode (parser);
+ if (num2 < 0xdc00 || num2 >= 0xe000)
+ json_signal_error (parser,
+ Qjson_invalid_surrogate_error);
+ num = (0x10000
+ + ((num - 0xd800) << 10 | (num2 - 0xdc00)));
+ }
+ else if (num >= 0xdc00 && num < 0xe000)
+ /* is the second half of the surrogate pair without
+ the first half */
+ json_signal_error (parser,
+ Qjson_invalid_surrogate_error);
+
+ /* utf-8 encode the code-point */
+ if (num < 0x80)
+ json_byte_workspace_put (parser, num);
+ else if (num < 0x800)
+ {
+ json_byte_workspace_put (parser, 0xc0 | num >> 6);
+ json_byte_workspace_put (parser,
+ 0x80 | (num & 0x3f));
+ }
+ else if (num < 0x10000)
+ {
+ json_byte_workspace_put (parser, 0xe0 | num >> 12);
+ json_byte_workspace_put (parser,
+ (0x80
+ | ((num >> 6) & 0x3f)));
+ json_byte_workspace_put (parser,
+ 0x80 | (num & 0x3f));
+ }
+ else
+ {
+ json_byte_workspace_put (parser, 0xf0 | num >> 18);
+ json_byte_workspace_put (parser,
+ (0x80
+ | ((num >> 12) & 0x3f)));
+ json_byte_workspace_put (parser,
+ (0x80
+ | ((num >> 6) & 0x3f)));
+ json_byte_workspace_put (parser,
+ 0x80 | (num & 0x3f));
+ }
+ }
+ else
+ json_signal_error (parser, Qjson_escape_sequence_error);
+ }
+ else
+ json_signal_error (parser, Qjson_parse_error);
+ }
+}
+
+/* If there was no integer overflow during parsing the integer, this
+ puts 'value' to the output. Otherwise this calls string_to_number
+ to parse integer on the byte workspace. This could just always
+ call string_to_number, but for performance reasons, during parsing
+ the code tries to calculate the value, so in most cases, we can
+ save call of string_to_number */
+static Lisp_Object
+json_create_integer (struct json_parser *parser,
+ bool integer_overflow, bool negative,
+ EMACS_UINT value)
+{
+ if (!integer_overflow)
+ {
+ if (negative)
+ {
+ uintmax_t v = value;
+ if (v <= (uintmax_t) INTMAX_MAX + 1)
+ return INT_TO_INTEGER ((intmax_t) -v);
+ }
+ else
+ return INT_TO_INTEGER (value);
+ }
+
+ json_byte_workspace_put (parser, 0);
+ ptrdiff_t len;
+ Lisp_Object result
+ = string_to_number ((const char *) parser->byte_workspace, 10,
+ &len);
+ if (len
+ != parser->byte_workspace_current - parser->byte_workspace - 1)
+ json_signal_error (parser, Qjson_error);
+ return result;
+}
+
+/* Parses a float using the byte workspace */
+static Lisp_Object
+json_create_float (struct json_parser *parser)
+{
+ json_byte_workspace_put (parser, 0);
+ errno = 0;
+ char *e;
+ double value = strtod ((const char *) parser->byte_workspace, &e);
+ bool out_of_range
+ = (errno != 0 && (value == HUGE_VAL || value == -HUGE_VAL));
+ if (out_of_range)
+ json_signal_error (parser, Qjson_number_out_of_range);
+ else if ((const unsigned char *) e
+ != parser->byte_workspace_current - 1)
+ json_signal_error (parser, Qjson_error);
+ else
+ return make_float (value);
+}
+
+/* Parses a number. The first character is the input parameter 'c'.
+ */
+static Lisp_Object
+json_parse_number (struct json_parser *parser, int c)
+{
+ json_byte_workspace_reset (parser);
+ json_byte_workspace_put (parser, c);
+
+ bool negative = false;
+ if (c == '-')
+ {
+ negative = true;
+ c = json_input_get (parser);
+ json_byte_workspace_put (parser, c);
+ parser->current_column++;
+ }
+ if (c < '0' || c > '9')
+ json_signal_error (parser, Qjson_parse_error);
+
+ /* The idea is that during finding the last character of the
+ number, the for loop below also tries to calculate the value. If
+ the parsed number is an integer which fits into unsigned long,
+ then the parser can use the value of 'integer' right away,
+ instead of having to re-parse the byte workspace later.
+ Ideally, this integer should have the same size as a CPU general
+ purpose register. */
+ EMACS_UINT integer = c - '0';
+ bool integer_overflow = false;
+
+ if (integer == 0)
+ {
+ if (json_input_at_eof (parser))
+ return INT_TO_INTEGER (0);
+ c = json_input_get (parser);
+ }
+ else
+ {
+ for (;;)
+ {
+ if (json_input_at_eof (parser))
+ return json_create_integer (parser, integer_overflow,
+ negative, integer);
+ c = json_input_get (parser);
+ if (c < '0' || c > '9')
+ break;
+ json_byte_workspace_put (parser, c);
+ parser->current_column++;
+
+ integer_overflow |= ckd_mul (&integer, integer, 10);
+ integer_overflow |= ckd_add (&integer, integer, c - '0');
+ }
+ }
+
+ bool is_float = false;
+ if (c == '.')
+ {
+ json_byte_workspace_put (parser, c);
+ parser->current_column++;
+
+ is_float = true;
+ c = json_input_get (parser);
+ json_byte_workspace_put (parser, c);
+ parser->current_column++;
+ if (c < '0' || c > '9')
+ json_signal_error (parser, Qjson_parse_error);
+ for (;;)
+ {
+ if (json_input_at_eof (parser))
+ return json_create_float (parser);
+ c = json_input_get (parser);
+ if (c < '0' || c > '9')
+ break;
+ json_byte_workspace_put (parser, c);
+ parser->current_column++;
+ }
+ }
+ if (c == 'e' || c == 'E')
+ {
+ json_byte_workspace_put (parser, c);
+ parser->current_column++;
+
+ is_float = true;
+ c = json_input_get (parser);
+ json_byte_workspace_put (parser, c);
+ parser->current_column++;
+ if (c == '-' || c == '+')
+ {
+ c = json_input_get (parser);
+ json_byte_workspace_put (parser, c);
+ parser->current_column++;
+ }
+ if (c < '0' || c > '9')
+ json_signal_error (parser, Qjson_parse_error);
+ for (;;)
+ {
+ if (json_input_at_eof (parser))
+ return json_create_float (parser);
+ c = json_input_get (parser);
+ if (c < '0' || c > '9')
+ break;
+ json_byte_workspace_put (parser, c);
+ parser->current_column++;
+ }
+ }
+
+ /* 'c' contains a character which is not part of the number,
+ so it is need to be put back */
+ json_input_put_back (parser);
+
+ if (is_float)
+ return json_create_float (parser);
+ else
+ return json_create_integer (parser, integer_overflow, negative,
+ integer);
+}
+
+static Lisp_Object json_parse_value (struct json_parser *parser,
+ int c);
+
+/* Parses a JSON array. */
+static Lisp_Object
+json_parse_array (struct json_parser *parser)
+{
+ int c = json_skip_whitespace (parser);
+
+ const size_t first = parser->object_workspace_current;
+
+ if (c != ']')
+ {
+ parser->available_depth--;
+ if (parser->available_depth < 0)
+ json_signal_error (parser, Qjson_object_too_deep);
+
+ size_t number_of_elements = 0;
+ /* This loop collects the array elements in the object workspace
+ */
+ for (;;)
+ {
+ Lisp_Object element = json_parse_value (parser, c);
+ json_make_object_workspace_for (parser, 1);
+ ASET (parser->object_workspace,
+ parser->object_workspace_current, element);
+ parser->object_workspace_current++;
+
+ c = json_skip_whitespace (parser);
+
+ number_of_elements++;
+ if (c == ']')
+ {
+ parser->available_depth++;
+ break;
+ }
+
+ if (c != ',')
+ json_signal_error (parser, Qjson_parse_error);
+
+ c = json_skip_whitespace (parser);
+ }
+ }
+
+ Lisp_Object result;
+ size_t number_of_elements
+ = parser->object_workspace_current - first;
+
+ switch (parser->conf.array_type)
+ {
+ case json_array_array:
+ result = make_vector (number_of_elements, Qnil);
+ for (size_t i = 0; i < number_of_elements; i++)
+ {
+ rarely_quit (i);
+ ASET (result, i,
+ AREF (parser->object_workspace, first + i));
+ }
+ break;
+ case json_array_list:
+ result = Qnil;
+ for (size_t i = 0; i < number_of_elements; ++i)
+ {
+ rarely_quit (i);
+ result
+ = Fcons (AREF (parser->object_workspace,
+ parser->object_workspace_current - i - 1),
+ result);
+ }
+ break;
+ default:
+ emacs_abort ();
+ }
+
+ parser->object_workspace_current = first;
+
+ return result;
+}
+
+/* Parses a JSON object. */
+static Lisp_Object
+json_parse_object (struct json_parser *parser)
+{
+ int c = json_skip_whitespace (parser);
+
+ const size_t first = parser->object_workspace_current;
+
+ if (c != '}')
+ {
+ parser->available_depth--;
+ if (parser->available_depth < 0)
+ json_signal_error (parser, Qjson_object_too_deep);
+
+ /* This loop collects the object members (key/value pairs) in
+ * the object workspace */
+ for (;;)
+ {
+ if (c != '"')
+ json_signal_error (parser, Qjson_parse_error);
+
+ Lisp_Object key;
+ json_byte_workspace_reset (parser);
+ switch (parser->conf.object_type)
+ {
+ case json_object_hashtable:
+ {
+ json_parse_string (parser);
+ key
+ = make_string_from_utf8 ((char *)
+ parser->byte_workspace,
+ (parser->byte_workspace_current
+ - parser->byte_workspace));
+ break;
+ }
+ case json_object_alist:
+ {
+ json_parse_string (parser);
+ key = Fintern (make_string_from_utf8 (
+ (char *) parser->byte_workspace,
+ (parser->byte_workspace_current
+ - parser->byte_workspace)),
+ Qnil);
+ break;
+ }
+ case json_object_plist:
+ {
+ json_byte_workspace_put (parser, ':');
+ json_parse_string (parser);
+ key = intern_1 ((char *) parser->byte_workspace,
+ (parser->byte_workspace_current
+ - parser->byte_workspace));
+ break;
+ }
+ default:
+ emacs_abort ();
+ }
+
+ c = json_skip_whitespace (parser);
+ if (c != ':')
+ json_signal_error (parser, Qjson_parse_error);
+
+ c = json_skip_whitespace (parser);
+
+ Lisp_Object value = json_parse_value (parser, c);
+
+ json_make_object_workspace_for (parser, 2);
+ ASET (parser->object_workspace,
+ parser->object_workspace_current, key);
+ parser->object_workspace_current++;
+ ASET (parser->object_workspace,
+ parser->object_workspace_current, value);
+ parser->object_workspace_current++;
+
+ c = json_skip_whitespace (parser);
+
+ if (c == '}')
+ {
+ parser->available_depth++;
+ break;
+ }
+
+ if (c != ',')
+ json_signal_error (parser, Qjson_parse_error);
+
+ c = json_skip_whitespace (parser);
+ }
+ }
+
+ Lisp_Object result;
+ switch (parser->conf.object_type)
+ {
+ case json_object_hashtable:
+ {
+ result
+ = CALLN (Fmake_hash_table, QCtest, Qequal, QCsize,
+ make_fixed_natnum (
+ (parser->object_workspace_current - first) / 2));
+ struct Lisp_Hash_Table *h = XHASH_TABLE (result);
+ for (size_t i = first; i < parser->object_workspace_current;
+ i += 2)
+ {
+ hash_hash_t hash;
+ Lisp_Object key = AREF (parser->object_workspace, i);
+ Lisp_Object value
+ = AREF (parser->object_workspace, i + 1);
+ ptrdiff_t i = hash_lookup_get_hash (h, key, &hash);
+ if (i < 0)
+ hash_put (h, key, value, hash);
+ else
+ set_hash_value_slot (h, i, value);
+ }
+ break;
+ }
+ case json_object_alist:
+ {
+ result = Qnil;
+ for (size_t i = parser->object_workspace_current; i > first;
+ i -= 2)
+ {
+ Lisp_Object key = AREF (parser->object_workspace, i - 2);
+ Lisp_Object value
+ = AREF (parser->object_workspace, i - 1);
+ result = Fcons (Fcons (key, value), result);
+ }
+ break;
+ }
+ case json_object_plist:
+ {
+ result = Qnil;
+ for (size_t i = parser->object_workspace_current; i > first;
+ i -= 2)
+ {
+ Lisp_Object key = AREF (parser->object_workspace, i - 2);
+ Lisp_Object value
+ = AREF (parser->object_workspace, i - 1);
+ result = Fcons (value, result);
+ result = Fcons (key, result);
+ }
+ break;
+ }
+ default:
+ emacs_abort ();
+ }
+
+ parser->object_workspace_current = first;
+
+ return result;
+}
+
+/* Token-char is not a JSON terminology. When parsing
+ null/false/true, this function tells the character set that is need
+ to be considered as part of a token. For example, if the input is
+ "truesomething", then the parser shouldn't consider it as "true",
+ and an additional later "something" token. An additional example:
+ if the input is "truetrue", then calling (json-parse-buffer) twice
+ shouldn't produce two successful calls which return t, but a
+ parsing error */
+static bool
+json_is_token_char (int c)
+{
+ return ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
+ || (c >= '0' && c <= '9') || (c == '-'));
+}
+
+/* This is the entry point to the value parser, this parses a JSON
+ * value */
+Lisp_Object
+json_parse_value (struct json_parser *parser, int c)
+{
+ if (c == '{')
+ return json_parse_object (parser);
+ else if (c == '[')
+ return json_parse_array (parser);
+ else if (c == '"')
+ {
+ json_byte_workspace_reset (parser);
+ json_parse_string (parser);
+ Lisp_Object result
+ = make_string_from_utf8 ((const char *)
+ parser->byte_workspace,
+ (parser->byte_workspace_current
+ - parser->byte_workspace));
+ return result;
+ }
+ else if ((c >= '0' && c <= '9') || (c == '-'))
+ return json_parse_number (parser, c);
+ else
+ {
+ int c2 = json_input_get (parser);
+ int c3 = json_input_get (parser);
+ int c4 = json_input_get (parser);
+ int c5 = json_input_get_if_possible (parser);
+
+ if (c == 't' && c2 == 'r' && c3 == 'u' && c4 == 'e'
+ && (c5 < 0 || !json_is_token_char (c5)))
+ {
+ if (c5 >= 0)
+ json_input_put_back (parser);
+ parser->current_column += 4;
+ return Qt;
+ }
+ if (c == 'n' && c2 == 'u' && c3 == 'l' && c4 == 'l'
+ && (c5 < 0 || !json_is_token_char (c5)))
+ {
+ if (c5 >= 0)
+ json_input_put_back (parser);
+ parser->current_column += 4;
+ return parser->conf.null_object;
+ }
+ if (c == 'f' && c2 == 'a' && c3 == 'l' && c4 == 's'
+ && c5 == 'e')
+ {
+ int c6 = json_input_get_if_possible (parser);
+ if (c6 < 0 || !json_is_token_char (c6))
+ {
+ if (c6 >= 0)
+ json_input_put_back (parser);
+ parser->current_column += 5;
+ return parser->conf.false_object;
+ }
+ }
+
+ json_signal_error (parser, Qjson_parse_error);
+ }
+}
+
+enum ParseEndBehavior
+ {
+ PARSEENDBEHAVIOR_CheckForGarbage,
+ PARSEENDBEHAVIOR_MovePoint
+ };
+
+static Lisp_Object
+json_parse (struct json_parser *parser,
+ enum ParseEndBehavior parse_end_behavior)
+{
+ int c = json_skip_whitespace (parser);
+
+ Lisp_Object result = json_parse_value (parser, c);
+
+ switch (parse_end_behavior)
+ {
+ case PARSEENDBEHAVIOR_CheckForGarbage:
+ c = json_skip_whitespace_if_possible (parser);
+ if (c >= 0)
+ json_signal_error (parser, Qjson_trailing_content);
+ break;
+ case PARSEENDBEHAVIOR_MovePoint:
+ {
+ ptrdiff_t point
+ = (PT_BYTE + parser->input_current - parser->input_begin
+ + parser->additional_bytes_count);
+ SET_PT_BOTH (BYTE_TO_CHAR (point), point);
+ break;
+ }
+ }
+
+ return result;
+}
+
+DEFUN ("json-parse-string-fast", Fjson_parse_string_fast,
+ Sjson_parse_string_fast, 1, MANY, NULL,
+ doc: /* Parse the JSON STRING into a Lisp object.
+This is an experimental function, it is a faster
+implementation of json-parse-string. The functionality is the
+same, with the only exception how this version handles duplicate keys:
+If :object-type is `alist' or `plist', json-parse-string-fast
+doesn't filter duplicate keys. */)
+(ptrdiff_t nargs, Lisp_Object *args)
+{
+ specpdl_ref count = SPECPDL_INDEX ();
+
+ Lisp_Object string = args[0];
+ CHECK_STRING (string);
+ Lisp_Object encoded = json_encode (string);
+ struct json_configuration conf
+ = { json_object_hashtable, json_array_array, QCnull, QCfalse };
+ json_parse_args (nargs - 1, args + 1, &conf, true);
+
+ struct json_parser p;
+ const unsigned char *begin
+ = (const unsigned char *) SSDATA (encoded);
+ json_parser_init (&p, conf, begin, begin + SBYTES (encoded), NULL,
+ NULL);
+ record_unwind_protect_ptr (json_parser_done, &p);
+
+ return unbind_to (count,
+ json_parse (&p,
+ PARSEENDBEHAVIOR_CheckForGarbage));
+}
+
+DEFUN ("json-parse-buffer-fast", Fjson_parse_buffer_fast,
+ Sjson_parse_buffer_fast, 0, MANY, NULL,
+ doc: /* Read JSON object from current buffer starting at point.
+This is an experimental function, it is a faster
+implementation of json-parse-buffer. The functionality is the
+same, with the only exception how this version handles duplicate keys:
+If :object-type is `alist' or `plist', json-parse-buffer-fast
+doesn't filter duplicate keys. */)
+(ptrdiff_t nargs, Lisp_Object *args)
+{
+ specpdl_ref count = SPECPDL_INDEX ();
+
+ struct json_configuration conf
+ = { json_object_hashtable, json_array_array, QCnull, QCfalse };
+ json_parse_args (nargs, args, &conf, true);
+
+ struct json_parser p;
+ unsigned char *begin = PT_ADDR;
+ unsigned char *end = GPT_ADDR;
+ unsigned char *secondary_begin = NULL;
+ unsigned char *secondary_end = NULL;
+ if (GPT_ADDR < Z_ADDR)
+ {
+ secondary_begin = GAP_END_ADDR;
+ if (secondary_begin < PT_ADDR)
+ secondary_begin = PT_ADDR;
+ secondary_end = Z_ADDR;
+ }
+
+ json_parser_init (&p, conf, begin, end, secondary_begin,
+ secondary_end);
+ record_unwind_protect_ptr (json_parser_done, &p);
+
+ return unbind_to (count,
+ json_parse (&p, PARSEENDBEHAVIOR_MovePoint));
+}
+
void
syms_of_json (void)
{
@@ -1102,6 +2196,10 @@ syms_of_json (void)
DEFSYM (Qjson_end_of_file, "json-end-of-file");
DEFSYM (Qjson_trailing_content, "json-trailing-content");
DEFSYM (Qjson_object_too_deep, "json-object-too-deep");
+ DEFSYM (Qjson_utf8_decode_error, "json-utf8-decode-error")
+ DEFSYM (Qjson_invalid_surrogate_error, "json-invalid-surrogate-error")
+ DEFSYM (Qjson_number_out_of_range, "json-number-out-of-range-error")
+ DEFSYM (Qjson_escape_sequence_error, "json-escape-sequence-error")
DEFSYM (Qjson_unavailable, "json-unavailable");
define_error (Qjson_error, "generic JSON error", Qerror);
define_error (Qjson_out_of_memory,
@@ -1113,6 +2211,14 @@ syms_of_json (void)
Qjson_parse_error);
define_error (Qjson_object_too_deep,
"object cyclic or Lisp evaluation too deep", Qjson_error);
+ define_error (Qjson_utf8_decode_error,
+ "invalid utf-8 encoding", Qjson_error);
+ define_error (Qjson_invalid_surrogate_error,
+ "invalid surrogate pair", Qjson_error);
+ define_error (Qjson_number_out_of_range,
+ "number out of range", Qjson_error);
+ define_error (Qjson_escape_sequence_error,
+ "invalid escape sequence", Qjson_parse_error);
DEFSYM (Qpure, "pure");
DEFSYM (Qside_effect_free, "side-effect-free");
@@ -1137,4 +2243,6 @@ syms_of_json (void)
defsubr (&Sjson_insert);
defsubr (&Sjson_parse_string);
defsubr (&Sjson_parse_buffer);
+ defsubr (&Sjson_parse_string_fast);
+ defsubr (&Sjson_parse_buffer_fast);
}
--
2.42.0
next reply other threads:[~2024-03-19 18:23 UTC|newest]
Thread overview: 21+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-03-19 18:23 Herman, Géza [this message]
2024-03-19 18:47 ` [PATCH] Implement fast verisons of json-parse functions Eli Zaretskii
2024-03-19 18:50 ` Herman, Géza
2024-03-19 19:28 ` Eli Zaretskii
2024-03-19 19:50 ` Herman, Géza
2024-03-20 3:37 ` Eli Zaretskii
2024-03-23 10:40 ` Herman, Géza
2024-03-23 11:11 ` Eli Zaretskii
2024-03-30 7:43 ` Eli Zaretskii
2024-03-30 10:50 ` Herman, Géza
2024-03-30 13:00 ` Eli Zaretskii
2024-03-30 18:36 ` Herman, Géza
2024-03-30 20:18 ` [External] : " Drew Adams
2024-03-30 20:51 ` Herman, Géza
2024-03-30 23:28 ` Drew Adams
2024-03-31 5:08 ` Eli Zaretskii
2024-03-31 14:51 ` Mattias Engdegård
2024-03-31 15:44 ` Eli Zaretskii
2024-04-01 8:48 ` Mattias Engdegård
2024-04-01 16:54 ` Herman, Géza
2024-03-31 17:48 ` Adam Porter
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=87h6h2rsgn.fsf@gmail.com \
--to=geza.herman@gmail.com \
--cc=emacs-devel@gnu.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this external index
https://git.savannah.gnu.org/cgit/emacs.git
https://git.savannah.gnu.org/cgit/emacs/org-mode.git
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.