[PATCH] Implement fast versions of json-parse functions

unofficial mirror of emacs-devel@gnu.org 
 help / color / mirror / code / Atom feed

From: "Herman, Géza" <geza.herman@gmail.com>
To: emacs-devel@gnu.org
Subject: [PATCH] Implement fast versions of json-parse functions
Date: Tue, 19 Mar 2024 19:23:36 +0100	[thread overview]
Message-ID: <87h6h2rsgn.fsf@gmail.com> (raw)

[-- Attachment #1: Type: text/plain, Size: 247 bytes --]

Tags: patch

Here's the latest version of my JSON parser.  It is the same as before;
the only modification I made is that this patch doesn't replace the
original parser functions, but keeps them, and adds two additional
functions with the -fast suffix.



[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 0001-Implement-fast-verisons-of-json-parse-functions.patch --]
[-- Type: text/patch, Size: 34292 bytes --]

From 48399572efbc16887f49fc5a0f20bed3b16b2115 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?G=C3=A9za=20Herman?= <geza.herman@gmail.com>
Date: Tue, 19 Mar 2024 18:59:09 +0100
Subject: [PATCH] Implement fast versions of json-parse functions

---
 src/json.c | 1108 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 1108 insertions(+)

diff --git a/src/json.c b/src/json.c
index e849ccaf722..c08d9d4a4f7 100644
--- a/src/json.c
+++ b/src/json.c
@@ -23,6 +23,7 @@ Copyright (C) 2017-2024 Free Software Foundation, Inc.
 #include <stddef.h>
 #include <stdint.h>
 #include <stdlib.h>
+#include <math.h>
 
 #include <jansson.h>
 
@@ -1087,6 +1088,1099 @@ DEFUN ("json-parse-buffer", Fjson_parse_buffer, Sjson_parse_buffer,
   return unbind_to (count, lisp);
 }
 
+/* State of one run of the fast JSON parser.  */
+struct json_parser
+{
+  /* Because of a possible gap in the input (an Emacs buffer can have
+     a gap), the input is described by [input_begin;input_end) and
+     [secondary_input_begin;secondary_input_end).  If the input is
+     contiguous, then secondary_input_begin and secondary_input_end
+     are NULL.  input_current points at the next byte to be read.  */
+  const unsigned char *input_current;
+  const unsigned char *input_begin;
+  const unsigned char *input_end;
+
+  const unsigned char *secondary_input_begin;
+  const unsigned char *secondary_input_end;
+
+  /* Position passed along with signaled errors.  The line starts at
+     1; the column is reset to 0 after each newline.  */
+  int current_line;
+  int current_column;
+
+  /* The parser has a maximum allowed depth.  available_depth is
+     decremented when a non-empty object/array is entered and
+     restored when it is closed.  If it drops below zero,
+     json-object-too-deep is signaled.  */
+  int available_depth;
+
+  struct json_configuration conf;
+
+  /* Length of the primary input range, recorded when the parser
+     switches to the secondary range (0 before that).  It is needed
+     to compute the final buffer position.  */
+  size_t additional_bytes_count;
+
+  /* Lisp_Objects are collected in this vector during object/array
+     parsing.  object_workspace_size is its capacity,
+     object_workspace_current the number of slots in use.  */
+  Lisp_Object object_workspace;
+  size_t object_workspace_size;
+  size_t object_workspace_current;
+
+  /* String and number parsing uses this workspace.  The collected
+     bytes are [byte_workspace;byte_workspace_current), and
+     byte_workspace_end marks the end of the allocation.  */
+  unsigned char *byte_workspace;
+  unsigned char *byte_workspace_end;
+  unsigned char *byte_workspace_current;
+};
+
+/* Signal ERROR, with the parser's current line and column as the
+   error data.  This function does not return.  */
+static AVOID
+json_signal_error (struct json_parser *parser, Lisp_Object error)
+{
+  Lisp_Object line = INT_TO_INTEGER (parser->current_line);
+  Lisp_Object column = INT_TO_INTEGER (parser->current_column);
+  xsignal2 (error, line, column);
+}
+
+/* Initialize PARSER to read JSON from [INPUT;INPUT_END), optionally
+   continued by [SECONDARY_INPUT;SECONDARY_INPUT_END).  An empty
+   secondary range is treated as absent; if the primary range is
+   empty, the secondary range takes its place.  CONF is copied into
+   the parser.  The byte workspace is allocated with xmalloc, and it
+   is released by json_parser_done.  */
+static void
+json_parser_init (struct json_parser *parser,
+		  struct json_configuration conf,
+		  const unsigned char *input,
+		  const unsigned char *input_end,
+		  const unsigned char *secondary_input,
+		  const unsigned char *secondary_input_end)
+{
+  const int initial_object_workspace_size = 64;
+  const int initial_string_workspace_size = 512;
+
+  /* Normalize an empty secondary range to "no secondary range".  */
+  if (secondary_input >= secondary_input_end)
+    {
+      secondary_input = NULL;
+      secondary_input_end = NULL;
+    }
+
+  if (input < input_end)
+    {
+      parser->input_begin = input;
+      parser->input_end = input_end;
+
+      parser->secondary_input_begin = secondary_input;
+      parser->secondary_input_end = secondary_input_end;
+    }
+  else
+    {
+      /* The primary range is empty: start directly with the
+	 secondary one.  */
+      parser->input_begin = secondary_input;
+      parser->input_end = secondary_input_end;
+
+      parser->secondary_input_begin = NULL;
+      parser->secondary_input_end = NULL;
+    }
+
+  parser->input_current = parser->input_begin;
+
+  parser->current_line = 1;
+  parser->current_column = 0;
+  parser->available_depth = 10000;
+  parser->conf = conf;
+
+  parser->additional_bytes_count = 0;
+
+  parser->object_workspace
+    = make_vector (initial_object_workspace_size, Qnil);
+  parser->object_workspace_size = initial_object_workspace_size;
+  parser->object_workspace_current = 0;
+
+  parser->byte_workspace = xmalloc (initial_string_workspace_size);
+  parser->byte_workspace_end
+    = parser->byte_workspace + initial_string_workspace_size;
+  /* Start with an empty byte workspace, instead of leaving this
+     pointer indeterminate until the first reset.  */
+  parser->byte_workspace_current = parser->byte_workspace;
+}
+
+/* Free the byte workspace owned by PARSER, which must point to a
+   struct json_parser.  The argument is a void pointer so that this
+   function can be registered with record_unwind_protect_ptr.  */
+static void
+json_parser_done (void *parser)
+{
+  struct json_parser *p = parser;
+  xfree (p->byte_workspace);
+}
+
+/* Grows the object workspace so that SIZE more Lisp_Objects fit
+   after the ones already stored.  The capacity is doubled until it
+   suffices, and the stored objects are copied into a newly made
+   vector.  Signals Qjson_out_of_memory if doubling the capacity
+   overflows.  */
+NO_INLINE static void
+json_make_object_workspace_for_slow_path (struct json_parser *parser,
+					  size_t size)
+{
+  size_t used = parser->object_workspace_current;
+  size_t required = used + size;
+  size_t capacity = parser->object_workspace_size;
+
+  while (capacity < required)
+    if (ckd_mul (&capacity, capacity, 2))
+      json_signal_error (parser, Qjson_out_of_memory);
+
+  Lisp_Object grown = make_vector (capacity, Qnil);
+  for (size_t slot = 0; slot < used; slot++)
+    ASET (grown, slot, AREF (parser->object_workspace, slot));
+
+  parser->object_workspace = grown;
+  parser->object_workspace_size = capacity;
+}
+
+/* Fast path of the object workspace growth: does nothing if SIZE
+   more Lisp_Objects already fit, otherwise defers to the slow
+   path, which reallocates.  */
+INLINE void
+json_make_object_workspace_for (struct json_parser *parser,
+				size_t size)
+{
+  size_t free_slots = (parser->object_workspace_size
+		       - parser->object_workspace_current);
+  if (size > free_slots)
+    json_make_object_workspace_for_slow_path (parser, size);
+}
+
+/* Empties the byte workspace by moving its write position back to
+   the start; the allocation itself is kept for reuse.  */
+static void
+json_byte_workspace_reset (struct json_parser *parser)
+{
+  parser->byte_workspace_current = parser->byte_workspace;
+}
+
+/* Appends 'value' to the byte_workspace after doubling the size of
+   its allocation.  Signals Qjson_out_of_memory if the doubled size
+   overflows.  */
+NO_INLINE static void
+json_byte_workspace_put_slow_path (struct json_parser *parser,
+				   unsigned char value)
+{
+  size_t capacity
+    = parser->byte_workspace_end - parser->byte_workspace;
+  if (ckd_mul (&capacity, capacity, 2))
+    json_signal_error (parser, Qjson_out_of_memory);
+
+  /* The block may move, so remember the write position as an
+     offset rather than as a pointer.  */
+  size_t used
+    = parser->byte_workspace_current - parser->byte_workspace;
+  unsigned char *base = xrealloc (parser->byte_workspace, capacity);
+
+  parser->byte_workspace = base;
+  parser->byte_workspace_end = base + capacity;
+  base[used] = value;
+  parser->byte_workspace_current = base + used + 1;
+}
+
+/* Appends 'value' to the byte_workspace.  If the workspace is full,
+   the slow path enlarges it first.  */
+INLINE void
+json_byte_workspace_put (struct json_parser *parser,
+			 unsigned char value)
+{
+  if (parser->byte_workspace_current >= parser->byte_workspace_end)
+    {
+      json_byte_workspace_put_slow_path (parser, value);
+      return;
+    }
+
+  *parser->byte_workspace_current++ = value;
+}
+
+/* Returns true if no more input bytes are available: the current
+   range is exhausted and there is no secondary range left to
+   switch to.  */
+static bool
+json_input_at_eof (struct json_parser *parser)
+{
+  bool current_exhausted
+    = parser->input_current >= parser->input_end;
+  return current_exhausted && parser->secondary_input_end == NULL;
+}
+
+/* If there is a non-empty secondary buffer, makes it the current
+   input range and returns 0; the length of the range being left is
+   stored in additional_bytes_count.  Otherwise returns -1 and
+   changes nothing.  */
+static int
+json_input_switch_to_secondary (struct json_parser *parser)
+{
+  const unsigned char *next_begin = parser->secondary_input_begin;
+  const unsigned char *next_end = parser->secondary_input_end;
+
+  if (next_begin >= next_end)
+    return -1;
+
+  parser->additional_bytes_count
+    = parser->input_end - parser->input_begin;
+  parser->input_begin = next_begin;
+  parser->input_end = next_end;
+  parser->input_current = next_begin;
+  parser->secondary_input_begin = NULL;
+  parser->secondary_input_end = NULL;
+  return 0;
+}
+
+/* Reads a byte from the JSON input stream and advances past it.
+   Signals Qjson_end_of_file if the stream is exhausted.  */
+static unsigned char
+json_input_get (struct json_parser *parser)
+{
+  if (parser->input_current >= parser->input_end)
+    {
+      if (json_input_switch_to_secondary (parser) < 0)
+	json_signal_error (parser, Qjson_end_of_file);
+    }
+
+  unsigned char byte = *parser->input_current;
+  parser->input_current++;
+  return byte;
+}
+
+/* Reads a byte from the JSON input stream and advances past it, if
+   the stream is not at eof.  At eof, returns -1 instead of
+   signaling an error.  */
+static int
+json_input_get_if_possible (struct json_parser *parser)
+{
+  if (parser->input_current >= parser->input_end)
+    {
+      if (json_input_switch_to_secondary (parser) < 0)
+	return -1;
+    }
+
+  int byte = *parser->input_current;
+  parser->input_current++;
+  return byte;
+}
+
+/* Puts back the last read input byte, by moving input_current one
+   byte backwards.  Only one byte can be put back, because otherwise
+   this code would need to handle switching from the secondary
+   buffer back to the initial one.  */
+static void
+json_input_put_back (struct json_parser *parser)
+{
+  parser->input_current--;
+}
+
+/* Accounts for the just-read character C in the line/column
+   position, and returns true if C is not JSON whitespace (space,
+   tab, carriage return or line feed).  A line feed starts a new
+   line and resets the column to 0.  */
+static bool
+json_skip_whitespace_internal (struct json_parser *parser, int c)
+{
+  parser->current_column++;
+  switch (c)
+    {
+    case 0x0a:
+      parser->current_line++;
+      parser->current_column = 0;
+      return false;
+    case 0x20:
+    case 0x09:
+    case 0x0d:
+      return false;
+    default:
+      return true;
+    }
+}
+
+/* Skips JSON whitespace, and returns the first non-whitespace
+   character.  Reaching the end of the input while skipping is an
+   error, signaled by json_input_get.  */
+static int
+json_skip_whitespace (struct json_parser *parser)
+{
+  int c;
+  do
+    c = json_input_get (parser);
+  while (!json_skip_whitespace_internal (parser, c));
+  return c;
+}
+
+/* Skips JSON whitespace, and returns the first non-whitespace
+   character, if there is one.  If the end of the input is reached
+   first, returns -1.  */
+static int
+json_skip_whitespace_if_possible (struct json_parser *parser)
+{
+  int c;
+  do
+    c = json_input_get_if_possible (parser);
+  while (c >= 0 && !json_skip_whitespace_internal (parser, c));
+  return c;
+}
+
+/* Returns the value (0..15) of the hexadecimal digit C, accepting
+   both letter cases, or -1 if C is not a hexadecimal digit.  */
+static int
+json_hex_value (int c)
+{
+  return (c >= '0' && c <= '9' ? c - '0'
+	  : c >= 'a' && c <= 'f' ? c - 'a' + 10
+	  : c >= 'A' && c <= 'F' ? c - 'A' + 10
+	  : -1);
+}
+
+/* Parses the CCCC part of the unicode escape sequence \uCCCC, and
+   returns its 16-bit value.  Signals Qjson_escape_sequence_error
+   at the first character which is not a hexadecimal digit.  */
+static int
+json_parse_unicode (struct json_parser *parser)
+{
+  int code = 0;
+  for (int remaining = 4; remaining > 0; remaining--)
+    {
+      int digit = json_hex_value (json_input_get (parser));
+      parser->current_column++;
+      if (digit < 0)
+	json_signal_error (parser, Qjson_escape_sequence_error);
+      code = code << 4 | digit;
+    }
+
+  return code;
+}
+
+/* Reads the N continuation bytes of a UTF-8 encoded code-point,
+   copies them to the byte workspace, and returns the payload bits
+   they carry (the bits of the first byte are not included).
+   Signals Qjson_utf8_decode_error if a byte is not of the form
+   10xxxxxx.  */
+static int
+json_handle_utf8_tail_bytes (struct json_parser *parser, int n)
+{
+  int bits = 0;
+  int remaining = n;
+  while (remaining-- > 0)
+    {
+      int byte = json_input_get (parser);
+      json_byte_workspace_put (parser, byte);
+      bool is_continuation = (byte & 0xc0) == 0x80;
+      if (!is_continuation)
+	json_signal_error (parser, Qjson_utf8_decode_error);
+      bits = bits << 6 | (byte & 0x3f);
+    }
+  return bits;
+}
+
+/* Reads a JSON string, and puts the result into the byte workspace.
+   The opening double quote must have been read already; reading
+   stops after the closing one.  The bytes are appended to the
+   workspace (it is not reset here), escape sequences are replaced
+   by the UTF-8 encoding of the character they denote, and raw
+   UTF-8 sequences are validated while they are copied.  */
+static void
+json_parse_string (struct json_parser *parser)
+{
+  /* a single_uninteresting byte can be simply copied from the input
+     to output, it doesn't need any extra care.  This means all the
+     characters between [0x20;0x7f], except the double quote and
+     the backslash */
+  static const char is_single_uninteresting[256] = {
+    /*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
+    /* 0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    /* 1 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    /* 2 */ 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    /* 3 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    /* 4 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    /* 5 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
+    /* 6 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    /* 7 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    /* 8 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    /* 9 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    /* a */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    /* b */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    /* c */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    /* d */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    /* e */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    /* f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  };
+
+  for (;;)
+    {
+      /* This if is only here for a possible speedup.  If there are 4
+	 bytes available, and all of them are single_uninteresting,
+	 then we can just copy these 4 bytes to output.  Only the
+	 current input range is inspected, so this never looks
+	 across the boundary to the secondary range.  */
+      if (parser->input_end - parser->input_current >= 4)
+	{
+	  int c0 = parser->input_current[0];
+	  int c1 = parser->input_current[1];
+	  int c2 = parser->input_current[2];
+	  int c3 = parser->input_current[3];
+	  bool v0 = is_single_uninteresting[c0];
+	  bool v1 = is_single_uninteresting[c1];
+	  bool v2 = is_single_uninteresting[c2];
+	  bool v3 = is_single_uninteresting[c3];
+	  if (v0 && v1 && v2 && v3)
+	    {
+	      json_byte_workspace_put (parser, c0);
+	      json_byte_workspace_put (parser, c1);
+	      json_byte_workspace_put (parser, c2);
+	      json_byte_workspace_put (parser, c3);
+	      parser->input_current += 4;
+	      parser->current_column += 4;
+	      continue;
+	    }
+	}
+
+      /* Generic path: handle the input one byte at a time.  */
+      int c = json_input_get (parser);
+      parser->current_column++;
+      if (is_single_uninteresting[c])
+	{
+	  json_byte_workspace_put (parser, c);
+	  continue;
+	}
+
+      if (c == '"')
+	/* The closing quote: the string is complete.  */
+	return;
+      else if (c & 0x80)
+	{
+	  /* Handle utf-8 encoding.  The lead byte decides how many
+	     tail bytes follow; overlong encodings, surrogates and
+	     values above 0x10ffff are rejected.  */
+	  json_byte_workspace_put (parser, c);
+	  if (c < 0xc0)
+	    /* A continuation byte cannot start a sequence.  */
+	    json_signal_error (parser, Qjson_utf8_decode_error);
+	  else if (c < 0xe0)
+	    {
+	      int n = ((c & 0x1f) << 6
+		       | json_handle_utf8_tail_bytes (parser, 1));
+	      if (n < 0x80)
+		json_signal_error (parser, Qjson_utf8_decode_error);
+	    }
+	  else if (c < 0xf0)
+	    {
+	      int n = ((c & 0xf) << 12
+		       | json_handle_utf8_tail_bytes (parser, 2));
+	      if (n < 0x800 || (n >= 0xd800 && n < 0xe000))
+		json_signal_error (parser, Qjson_utf8_decode_error);
+	    }
+	  else if (c < 0xf8)
+	    {
+	      int n = ((c & 0x7) << 18
+		       | json_handle_utf8_tail_bytes (parser, 3));
+	      if (n < 0x10000 || n > 0x10ffff)
+		json_signal_error (parser, Qjson_utf8_decode_error);
+	    }
+	  else
+	    json_signal_error (parser, Qjson_utf8_decode_error);
+	}
+      else if (c == '\\')
+	{
+	  /* Handle escape sequences */
+	  c = json_input_get (parser);
+	  parser->current_column++;
+	  if (c == '"')
+	    json_byte_workspace_put (parser, '"');
+	  else if (c == '\\')
+	    json_byte_workspace_put (parser, '\\');
+	  else if (c == '/')
+	    json_byte_workspace_put (parser, '/');
+	  else if (c == 'b')
+	    json_byte_workspace_put (parser, '\b');
+	  else if (c == 'f')
+	    json_byte_workspace_put (parser, '\f');
+	  else if (c == 'n')
+	    json_byte_workspace_put (parser, '\n');
+	  else if (c == 'r')
+	    json_byte_workspace_put (parser, '\r');
+	  else if (c == 't')
+	    json_byte_workspace_put (parser, '\t');
+	  else if (c == 'u')
+	    {
+	      int num = json_parse_unicode (parser);
+	      /* is the first half of the surrogate pair */
+	      if (num >= 0xd800 && num < 0xdc00)
+		{
+		  /* The second half must follow immediately, as
+		     another \uCCCC escape.  */
+		  parser->current_column++;
+		  if (json_input_get (parser) != '\\')
+		    json_signal_error (parser,
+				       Qjson_invalid_surrogate_error);
+		  parser->current_column++;
+		  if (json_input_get (parser) != 'u')
+		    json_signal_error (parser,
+				       Qjson_invalid_surrogate_error);
+		  int num2 = json_parse_unicode (parser);
+		  if (num2 < 0xdc00 || num2 >= 0xe000)
+		    json_signal_error (parser,
+				       Qjson_invalid_surrogate_error);
+		  /* Combine the two halves into one code-point.  */
+		  num = (0x10000
+			 + ((num - 0xd800) << 10 | (num2 - 0xdc00)));
+		}
+	      else if (num >= 0xdc00 && num < 0xe000)
+		/* is the second half of the surrogate pair without
+		   the first half */
+		json_signal_error (parser,
+				   Qjson_invalid_surrogate_error);
+
+	      /* utf-8 encode the code-point */
+	      if (num < 0x80)
+		json_byte_workspace_put (parser, num);
+	      else if (num < 0x800)
+		{
+		  json_byte_workspace_put (parser, 0xc0 | num >> 6);
+		  json_byte_workspace_put (parser,
+					   0x80 | (num & 0x3f));
+		}
+	      else if (num < 0x10000)
+		{
+		  json_byte_workspace_put (parser, 0xe0 | num >> 12);
+		  json_byte_workspace_put (parser,
+					   (0x80
+					    | ((num >> 6) & 0x3f)));
+		  json_byte_workspace_put (parser,
+					   0x80 | (num & 0x3f));
+		}
+	      else
+		{
+		  json_byte_workspace_put (parser, 0xf0 | num >> 18);
+		  json_byte_workspace_put (parser,
+					   (0x80
+					    | ((num >> 12) & 0x3f)));
+		  json_byte_workspace_put (parser,
+					   (0x80
+					    | ((num >> 6) & 0x3f)));
+		  json_byte_workspace_put (parser,
+					   0x80 | (num & 0x3f));
+		}
+	    }
+	  else
+	    json_signal_error (parser, Qjson_escape_sequence_error);
+	}
+      else
+	/* What remains are the bytes below 0x20, which are not
+	   allowed unescaped inside a string.  */
+	json_signal_error (parser, Qjson_parse_error);
+    }
+}
+
+/* Creates the Lisp integer for the number which has just been
+   parsed.  VALUE is the magnitude calculated during parsing, and it
+   is used directly unless INTEGER_OVERFLOW is set or the negated
+   value (if NEGATIVE) does not fit into intmax_t.  In those cases
+   the text in the byte workspace is converted by string_to_number
+   instead, which is slower; Qjson_error is signaled if that does
+   not consume the whole text.  */
+static Lisp_Object
+json_create_integer (struct json_parser *parser,
+		     bool integer_overflow, bool negative,
+		     EMACS_UINT value)
+{
+  if (!integer_overflow)
+    {
+      if (!negative)
+	return INT_TO_INTEGER (value);
+
+      uintmax_t magnitude = value;
+      if (magnitude <= (uintmax_t) INTMAX_MAX + 1)
+	return INT_TO_INTEGER ((intmax_t) -magnitude);
+    }
+
+  /* Fall back to parsing the NUL-terminated text.  */
+  json_byte_workspace_put (parser, 0);
+  ptrdiff_t text_length
+    = parser->byte_workspace_current - parser->byte_workspace - 1;
+  ptrdiff_t parsed_length;
+  Lisp_Object result
+    = string_to_number ((const char *) parser->byte_workspace, 10,
+			&parsed_length);
+  if (parsed_length != text_length)
+    json_signal_error (parser, Qjson_error);
+  return result;
+}
+
+/* Converts the number text in the byte workspace to a Lisp float
+   with strtod.  Signals Qjson_number_out_of_range if strtod reports
+   an overflow, and Qjson_error if it does not consume the whole
+   text.  */
+static Lisp_Object
+json_create_float (struct json_parser *parser)
+{
+  /* strtod needs a NUL-terminated string.  */
+  json_byte_workspace_put (parser, 0);
+
+  char *parse_end;
+  errno = 0;
+  double value
+    = strtod ((const char *) parser->byte_workspace, &parse_end);
+
+  if (errno != 0 && (value == HUGE_VAL || value == -HUGE_VAL))
+    json_signal_error (parser, Qjson_number_out_of_range);
+
+  /* byte_workspace_current - 1 is the terminating NUL.  */
+  if ((const unsigned char *) parse_end
+      != parser->byte_workspace_current - 1)
+    json_signal_error (parser, Qjson_error);
+
+  return make_float (value);
+}
+
+/* Parses a number.  The first character is the input parameter 'c'.
+   The text of the number is collected in the byte workspace, which
+   is reset first.  Returns a Lisp integer if the number has neither
+   a fraction nor an exponent, otherwise a Lisp float.  Signals
+   Qjson_parse_error if a digit is missing where one is required.
+ */
+static Lisp_Object
+json_parse_number (struct json_parser *parser, int c)
+{
+  json_byte_workspace_reset (parser);
+  json_byte_workspace_put (parser, c);
+
+  bool negative = false;
+  if (c == '-')
+    {
+      negative = true;
+      c = json_input_get (parser);
+      json_byte_workspace_put (parser, c);
+      parser->current_column++;
+    }
+  if (c < '0' || c > '9')
+    json_signal_error (parser, Qjson_parse_error);
+
+  /* The idea is that during finding the last character of the
+     number, the for loop below also tries to calculate the value.  If
+     the parsed number is an integer which fits into EMACS_UINT,
+     then the parser can use the value of 'integer' right away,
+     instead of having to re-parse the byte workspace later.
+     Ideally, this integer should have the same size as a CPU general
+     purpose register. */
+  EMACS_UINT integer = c - '0';
+  bool integer_overflow = false;
+
+  if (integer == 0)
+    {
+      /* A leading zero cannot be followed by further digits of the
+	 integer part, so only one more character is read here.  */
+      if (json_input_at_eof (parser))
+	return INT_TO_INTEGER (0);
+      c = json_input_get (parser);
+    }
+  else
+    {
+      for (;;)
+	{
+	  if (json_input_at_eof (parser))
+	    return json_create_integer (parser, integer_overflow,
+					negative, integer);
+	  c = json_input_get (parser);
+	  if (c < '0' || c > '9')
+	    break;
+	  json_byte_workspace_put (parser, c);
+	  parser->current_column++;
+
+	  /* Keep accumulating after an overflow as well; the flag
+	     tells json_create_integer to ignore 'integer'.  */
+	  integer_overflow |= ckd_mul (&integer, integer, 10);
+	  integer_overflow |= ckd_add (&integer, integer, c - '0');
+	}
+    }
+
+  /* Optional fraction: '.' followed by at least one digit.  */
+  bool is_float = false;
+  if (c == '.')
+    {
+      json_byte_workspace_put (parser, c);
+      parser->current_column++;
+
+      is_float = true;
+      c = json_input_get (parser);
+      json_byte_workspace_put (parser, c);
+      parser->current_column++;
+      if (c < '0' || c > '9')
+	json_signal_error (parser, Qjson_parse_error);
+      for (;;)
+	{
+	  if (json_input_at_eof (parser))
+	    return json_create_float (parser);
+	  c = json_input_get (parser);
+	  if (c < '0' || c > '9')
+	    break;
+	  json_byte_workspace_put (parser, c);
+	  parser->current_column++;
+	}
+    }
+  /* Optional exponent: 'e' or 'E', an optional sign, and at least
+     one digit.  */
+  if (c == 'e' || c == 'E')
+    {
+      json_byte_workspace_put (parser, c);
+      parser->current_column++;
+
+      is_float = true;
+      c = json_input_get (parser);
+      json_byte_workspace_put (parser, c);
+      parser->current_column++;
+      if (c == '-' || c == '+')
+	{
+	  c = json_input_get (parser);
+	  json_byte_workspace_put (parser, c);
+	  parser->current_column++;
+	}
+      if (c < '0' || c > '9')
+	json_signal_error (parser, Qjson_parse_error);
+      for (;;)
+	{
+	  if (json_input_at_eof (parser))
+	    return json_create_float (parser);
+	  c = json_input_get (parser);
+	  if (c < '0' || c > '9')
+	    break;
+	  json_byte_workspace_put (parser, c);
+	  parser->current_column++;
+	}
+    }
+
+  /* 'c' contains a character which is not part of the number,
+     so it needs to be put back */
+  json_input_put_back (parser);
+
+  if (is_float)
+    return json_create_float (parser);
+  else
+    return json_create_integer (parser, integer_overflow, negative,
+				integer);
+}
+
+static Lisp_Object json_parse_value (struct json_parser *parser,
+				     int c);
+
+/* Parses a JSON array whose opening '[' has already been read.  The
+   elements are first collected in the object workspace, and then
+   they are copied into a vector or a list, depending on
+   conf.array_type.  Signals Qjson_object_too_deep if the nesting
+   budget is exhausted, and Qjson_parse_error if an element is
+   followed by something other than ',' or ']'.  */
+static Lisp_Object
+json_parse_array (struct json_parser *parser)
+{
+  int c = json_skip_whitespace (parser);
+
+  /* The elements of this array are stored in the workspace from
+     index FIRST on; the slots before it belong to the enclosing
+     arrays/objects.  */
+  const size_t first = parser->object_workspace_current;
+
+  if (c != ']')
+    {
+      parser->available_depth--;
+      if (parser->available_depth < 0)
+	json_signal_error (parser, Qjson_object_too_deep);
+
+      /* This loop collects the array elements in the object
+	 workspace.  */
+      for (;;)
+	{
+	  Lisp_Object element = json_parse_value (parser, c);
+	  json_make_object_workspace_for (parser, 1);
+	  ASET (parser->object_workspace,
+		parser->object_workspace_current, element);
+	  parser->object_workspace_current++;
+
+	  c = json_skip_whitespace (parser);
+	  if (c == ']')
+	    {
+	      parser->available_depth++;
+	      break;
+	    }
+
+	  if (c != ',')
+	    json_signal_error (parser, Qjson_parse_error);
+
+	  c = json_skip_whitespace (parser);
+	}
+    }
+
+  Lisp_Object result;
+  size_t number_of_elements
+    = parser->object_workspace_current - first;
+
+  switch (parser->conf.array_type)
+    {
+    case json_array_array:
+      result = make_vector (number_of_elements, Qnil);
+      for (size_t i = 0; i < number_of_elements; i++)
+	{
+	  rarely_quit (i);
+	  ASET (result, i,
+		AREF (parser->object_workspace, first + i));
+	}
+      break;
+    case json_array_list:
+      /* Cons from the last element backwards, so that the list ends
+	 up in input order.  */
+      result = Qnil;
+      for (size_t i = 0; i < number_of_elements; ++i)
+	{
+	  rarely_quit (i);
+	  result
+	    = Fcons (AREF (parser->object_workspace,
+			   parser->object_workspace_current - i - 1),
+		     result);
+	}
+      break;
+    default:
+      emacs_abort ();
+    }
+
+  /* Release the workspace slots used by this array.  */
+  parser->object_workspace_current = first;
+
+  return result;
+}
+
+/* Parses a JSON object whose opening '{' has already been read.
+   The members are first collected in the object workspace as
+   alternating key/value slots, and then they are converted to a
+   hash table, an alist or a plist, depending on conf.object_type.
+   Keys are strings for hash tables, and interned symbols for alists
+   and plists (with a ':' prefix in the plist case).  Signals
+   Qjson_object_too_deep if the nesting budget is exhausted, and
+   Qjson_parse_error on malformed syntax.  */
+static Lisp_Object
+json_parse_object (struct json_parser *parser)
+{
+  int c = json_skip_whitespace (parser);
+
+  /* The members of this object are stored in the workspace from
+     index FIRST on.  */
+  const size_t first = parser->object_workspace_current;
+
+  if (c != '}')
+    {
+      parser->available_depth--;
+      if (parser->available_depth < 0)
+	json_signal_error (parser, Qjson_object_too_deep);
+
+      /* This loop collects the object members (key/value pairs) in
+	 the object workspace.  */
+      for (;;)
+	{
+	  if (c != '"')
+	    json_signal_error (parser, Qjson_parse_error);
+
+	  Lisp_Object key;
+	  json_byte_workspace_reset (parser);
+	  switch (parser->conf.object_type)
+	    {
+	    case json_object_hashtable:
+	      {
+		json_parse_string (parser);
+		key
+		  = make_string_from_utf8 ((char *)
+					   parser->byte_workspace,
+					   (parser->byte_workspace_current
+					    - parser->byte_workspace));
+		break;
+	      }
+	    case json_object_alist:
+	      {
+		json_parse_string (parser);
+		key
+		  = Fintern (make_string_from_utf8 ((char *)
+						    parser->byte_workspace,
+						    (parser->byte_workspace_current
+						     - parser->byte_workspace)),
+			     Qnil);
+		break;
+	      }
+	    case json_object_plist:
+	      {
+		/* Put the ':' in front of the name before the
+		   string is parsed, to get a keyword symbol.  */
+		json_byte_workspace_put (parser, ':');
+		json_parse_string (parser);
+		key = intern_1 ((char *) parser->byte_workspace,
+				(parser->byte_workspace_current
+				 - parser->byte_workspace));
+		break;
+	      }
+	    default:
+	      emacs_abort ();
+	    }
+
+	  c = json_skip_whitespace (parser);
+	  if (c != ':')
+	    json_signal_error (parser, Qjson_parse_error);
+
+	  c = json_skip_whitespace (parser);
+
+	  Lisp_Object value = json_parse_value (parser, c);
+
+	  json_make_object_workspace_for (parser, 2);
+	  ASET (parser->object_workspace,
+		parser->object_workspace_current, key);
+	  parser->object_workspace_current++;
+	  ASET (parser->object_workspace,
+		parser->object_workspace_current, value);
+	  parser->object_workspace_current++;
+
+	  c = json_skip_whitespace (parser);
+
+	  if (c == '}')
+	    {
+	      parser->available_depth++;
+	      break;
+	    }
+
+	  if (c != ',')
+	    json_signal_error (parser, Qjson_parse_error);
+
+	  c = json_skip_whitespace (parser);
+	}
+    }
+
+  Lisp_Object result;
+  switch (parser->conf.object_type)
+    {
+    case json_object_hashtable:
+      {
+	size_t member_count
+	  = (parser->object_workspace_current - first) / 2;
+	result
+	  = CALLN (Fmake_hash_table, QCtest, Qequal, QCsize,
+		   make_fixed_natnum (member_count));
+	struct Lisp_Hash_Table *h = XHASH_TABLE (result);
+	for (size_t i = first; i < parser->object_workspace_current;
+	     i += 2)
+	  {
+	    hash_hash_t hash;
+	    Lisp_Object key = AREF (parser->object_workspace, i);
+	    Lisp_Object value
+	      = AREF (parser->object_workspace, i + 1);
+	    /* For a duplicate key, the later value replaces the
+	       earlier one.  The table index has its own name, so
+	       that it does not shadow the loop index.  */
+	    ptrdiff_t entry = hash_lookup_get_hash (h, key, &hash);
+	    if (entry < 0)
+	      hash_put (h, key, value, hash);
+	    else
+	      set_hash_value_slot (h, entry, value);
+	  }
+	break;
+      }
+    case json_object_alist:
+      {
+	/* Cons from the last member backwards, so that the result is
+	   in input order.  */
+	result = Qnil;
+	for (size_t i = parser->object_workspace_current; i > first;
+	     i -= 2)
+	  {
+	    Lisp_Object key = AREF (parser->object_workspace, i - 2);
+	    Lisp_Object value
+	      = AREF (parser->object_workspace, i - 1);
+	    result = Fcons (Fcons (key, value), result);
+	  }
+	break;
+      }
+    case json_object_plist:
+      {
+	result = Qnil;
+	for (size_t i = parser->object_workspace_current; i > first;
+	     i -= 2)
+	  {
+	    Lisp_Object key = AREF (parser->object_workspace, i - 2);
+	    Lisp_Object value
+	      = AREF (parser->object_workspace, i - 1);
+	    result = Fcons (value, result);
+	    result = Fcons (key, result);
+	  }
+	break;
+      }
+    default:
+      emacs_abort ();
+    }
+
+  /* Release the workspace slots used by this object.  */
+  parser->object_workspace_current = first;
+
+  return result;
+}
+
+/* Token-char is not JSON terminology.  When parsing
+   null/false/true, this function defines the set of characters
+   (ASCII letters, digits and '-') which are considered to continue
+   a token.  For example, if the input is "truesomething", then the
+   parser shouldn't consider it as "true" followed by a separate
+   "something" token.  Likewise, if the input is "truetrue", then
+   calling (json-parse-buffer) twice shouldn't produce two
+   successful calls which return t, but a parsing error.  */
+static bool
+json_is_token_char (int c)
+{
+  if (c == '-')
+    return true;
+  if ('0' <= c && c <= '9')
+    return true;
+  if ('a' <= c && c <= 'z')
+    return true;
+  return 'A' <= c && c <= 'Z';
+}
+
+/* This is the entry point to the value parser: it parses one JSON
+   value whose first character C has already been read (and already
+   counted in current_column), and returns the value as a Lisp
+   object.  Objects, arrays, strings and numbers are dispatched on
+   C; anything else has to be one of the literals true, null and
+   false, which must not be followed by a token character.  Signals
+   Qjson_parse_error otherwise.  */
+Lisp_Object
+json_parse_value (struct json_parser *parser, int c)
+{
+  if (c == '{')
+    return json_parse_object (parser);
+  else if (c == '[')
+    return json_parse_array (parser);
+  else if (c == '"')
+    {
+      json_byte_workspace_reset (parser);
+      json_parse_string (parser);
+      Lisp_Object result
+	= make_string_from_utf8 ((const char *)
+				 parser->byte_workspace,
+				 (parser->byte_workspace_current
+				  - parser->byte_workspace));
+      return result;
+    }
+  else if ((c >= '0' && c <= '9') || (c == '-'))
+    return json_parse_number (parser, c);
+  else
+    {
+      /* The shortest literals have four characters, so three more
+	 of them are required; the fifth one may be missing if the
+	 input ends right after the literal.  */
+      int c2 = json_input_get (parser);
+      int c3 = json_input_get (parser);
+      int c4 = json_input_get (parser);
+      int c5 = json_input_get_if_possible (parser);
+
+      if (c == 't' && c2 == 'r' && c3 == 'u' && c4 == 'e'
+	  && (c5 < 0 || !json_is_token_char (c5)))
+	{
+	  if (c5 >= 0)
+	    json_input_put_back (parser);
+	  /* The first character has been counted by the caller, so
+	     only the remaining three are added here.  */
+	  parser->current_column += 3;
+	  return Qt;
+	}
+      if (c == 'n' && c2 == 'u' && c3 == 'l' && c4 == 'l'
+	  && (c5 < 0 || !json_is_token_char (c5)))
+	{
+	  if (c5 >= 0)
+	    json_input_put_back (parser);
+	  parser->current_column += 3;
+	  return parser->conf.null_object;
+	}
+      if (c == 'f' && c2 == 'a' && c3 == 'l' && c4 == 's'
+	  && c5 == 'e')
+	{
+	  int c6 = json_input_get_if_possible (parser);
+	  if (c6 < 0 || !json_is_token_char (c6))
+	    {
+	      if (c6 >= 0)
+		json_input_put_back (parser);
+	      /* As above: four characters besides the first.  */
+	      parser->current_column += 4;
+	      return parser->conf.false_object;
+	    }
+	}
+
+      json_signal_error (parser, Qjson_parse_error);
+    }
+}
+
+/* What json_parse does after the top-level value has been parsed.  */
+enum ParseEndBehavior
+  {
+    /* Signal json-trailing-content if anything other than
+       whitespace follows the value.  */
+    PARSEENDBEHAVIOR_CheckForGarbage,
+    /* Set point of the current buffer according to how far the
+       input has been read.  */
+    PARSEENDBEHAVIOR_MovePoint
+  };
+
+/* Parses one top-level JSON value with PARSER and returns it.
+   Afterwards, depending on PARSE_END_BEHAVIOR, either signals
+   Qjson_trailing_content if the rest of the input is not just
+   whitespace, or moves point to where reading stopped.  */
+static Lisp_Object
+json_parse (struct json_parser *parser,
+	    enum ParseEndBehavior parse_end_behavior)
+{
+  int first_char = json_skip_whitespace (parser);
+  Lisp_Object result = json_parse_value (parser, first_char);
+
+  if (parse_end_behavior == PARSEENDBEHAVIOR_CheckForGarbage)
+    {
+      if (json_skip_whitespace_if_possible (parser) >= 0)
+	json_signal_error (parser, Qjson_trailing_content);
+    }
+  else if (parse_end_behavior == PARSEENDBEHAVIOR_MovePoint)
+    {
+      /* Bytes read from the current range, plus the length of the
+	 primary range if the parser has moved on from it.  */
+      ptrdiff_t consumed
+	= parser->input_current - parser->input_begin;
+      ptrdiff_t point
+	= PT_BYTE + consumed + parser->additional_bytes_count;
+      SET_PT_BOTH (BYTE_TO_CHAR (point), point);
+    }
+
+  return result;
+}
+
+/* Lisp entry point of the fast parser for strings: ARGS[0] is the
+   string to parse, and the remaining arguments are passed on to
+   json_parse_args.  The whole string has to be consumed; trailing
+   non-whitespace content is an error.  */
+DEFUN ("json-parse-string-fast", Fjson_parse_string_fast,
+       Sjson_parse_string_fast, 1, MANY, NULL,
+       doc: /* Parse the JSON STRING into a Lisp object.
+This is an experimental function, it is a faster
+implementation of json-parse-string.  The functionality is the
+same, with the only exception how this version handles duplicate keys:
+If :object-type is `alist' or `plist', json-parse-string-fast
+doesn't filter duplicate keys. */)
+(ptrdiff_t nargs, Lisp_Object *args)
+{
+  specpdl_ref count = SPECPDL_INDEX ();
+
+  Lisp_Object string = args[0];
+  CHECK_STRING (string);
+  Lisp_Object encoded = json_encode (string);
+  /* Default configuration; presumably json_parse_args updates it
+     from the remaining arguments.  */
+  struct json_configuration conf
+    = { json_object_hashtable, json_array_array, QCnull, QCfalse };
+  json_parse_args (nargs - 1, args + 1, &conf, true);
+
+  /* The input is one contiguous range, so there is no secondary
+     range.  */
+  struct json_parser p;
+  const unsigned char *begin
+    = (const unsigned char *) SSDATA (encoded);
+  json_parser_init (&p, conf, begin, begin + SBYTES (encoded), NULL,
+		    NULL);
+  /* Registered so that json_parser_done frees the byte workspace
+     when unbind_to unwinds to COUNT.  */
+  record_unwind_protect_ptr (json_parser_done, &p);
+
+  return unbind_to (count,
+		    json_parse (&p,
+				PARSEENDBEHAVIOR_CheckForGarbage));
+}
+
+/* Lisp entry point of the fast parser for buffers: parses one JSON
+   value from the current buffer starting at point, and moves point
+   to where reading stopped.  All the arguments are passed on to
+   json_parse_args.  Only the accessible portion of the buffer is
+   read, so the parser never looks beyond a narrowing.  */
+DEFUN ("json-parse-buffer-fast", Fjson_parse_buffer_fast,
+       Sjson_parse_buffer_fast, 0, MANY, NULL,
+       doc: /* Read JSON object from current buffer starting at point.
+This is an experimental function, it is a faster
+implementation of json-parse-buffer.  The functionality is the
+same, with the only exception how this version handles duplicate keys:
+If :object-type is `alist' or `plist', json-parse-buffer-fast
+doesn't filter duplicate keys. */)
+(ptrdiff_t nargs, Lisp_Object *args)
+{
+  specpdl_ref count = SPECPDL_INDEX ();
+
+  struct json_configuration conf
+    = { json_object_hashtable, json_array_array, QCnull, QCfalse };
+  json_parse_args (nargs, args, &conf, true);
+
+  /* Describe the text between point and the end of the accessible
+     portion (ZV rather than Z, so that narrowing is respected) as
+     at most two contiguous ranges, split by the buffer gap.  When
+     the gap is exactly at ZV, ZV_ADDR is the address after the gap,
+     so the range has to end at GPT_ADDR instead.  */
+  struct json_parser p;
+  unsigned char *begin = PT_ADDR;
+  unsigned char *end = (GPT == ZV) ? GPT_ADDR : ZV_ADDR;
+  unsigned char *secondary_begin = NULL;
+  unsigned char *secondary_end = NULL;
+  if (PT_ADDR < GPT_ADDR && GPT < ZV)
+    {
+      /* The gap lies strictly inside the text to parse.  */
+      end = GPT_ADDR;
+      secondary_begin = GAP_END_ADDR;
+      secondary_end = ZV_ADDR;
+    }
+
+  json_parser_init (&p, conf, begin, end, secondary_begin,
+		    secondary_end);
+  /* Registered so that json_parser_done frees the byte workspace
+     when unbind_to unwinds to COUNT.  */
+  record_unwind_protect_ptr (json_parser_done, &p);
+
+  return unbind_to (count,
+		    json_parse (&p, PARSEENDBEHAVIOR_MovePoint));
+}
+
 void
 syms_of_json (void)
 {
@@ -1102,6 +2196,10 @@ syms_of_json (void)
   DEFSYM (Qjson_end_of_file, "json-end-of-file");
   DEFSYM (Qjson_trailing_content, "json-trailing-content");
   DEFSYM (Qjson_object_too_deep, "json-object-too-deep");
+  DEFSYM (Qjson_utf8_decode_error, "json-utf8-decode-error")
+  DEFSYM (Qjson_invalid_surrogate_error, "json-invalid-surrogate-error")
+  DEFSYM (Qjson_number_out_of_range, "json-number-out-of-range-error")
+  DEFSYM (Qjson_escape_sequence_error, "json-escape-sequence-error")
   DEFSYM (Qjson_unavailable, "json-unavailable");
   define_error (Qjson_error, "generic JSON error", Qerror);
   define_error (Qjson_out_of_memory,
@@ -1113,6 +2211,14 @@ syms_of_json (void)
                 Qjson_parse_error);
   define_error (Qjson_object_too_deep,
                 "object cyclic or Lisp evaluation too deep", Qjson_error);
+  define_error (Qjson_utf8_decode_error,
+                "invalid utf-8 encoding", Qjson_error);
+  define_error (Qjson_invalid_surrogate_error,
+                "invalid surrogate pair", Qjson_error);
+  define_error (Qjson_number_out_of_range,
+                "number out of range", Qjson_error);
+  define_error (Qjson_escape_sequence_error,
+                "invalid escape sequence", Qjson_parse_error);
 
   DEFSYM (Qpure, "pure");
   DEFSYM (Qside_effect_free, "side-effect-free");
@@ -1137,4 +2243,6 @@ syms_of_json (void)
   defsubr (&Sjson_insert);
   defsubr (&Sjson_parse_string);
   defsubr (&Sjson_parse_buffer);
+  defsubr (&Sjson_parse_string_fast);
+  defsubr (&Sjson_parse_buffer_fast);
 }
-- 
2.42.0

next             reply	other threads:[~2024-03-19 18:23 UTC|newest]

Thread overview: 21+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-03-19 18:23 Herman, Géza [this message]
2024-03-19 18:47 ` [PATCH] Implement fast verisons of json-parse functions Eli Zaretskii
2024-03-19 18:50   ` Herman, Géza
2024-03-19 19:28     ` Eli Zaretskii
2024-03-19 19:50       ` Herman, Géza
2024-03-20  3:37         ` Eli Zaretskii
2024-03-23 10:40           ` Herman, Géza
2024-03-23 11:11             ` Eli Zaretskii
2024-03-30  7:43             ` Eli Zaretskii
2024-03-30 10:50               ` Herman, Géza
2024-03-30 13:00                 ` Eli Zaretskii
2024-03-30 18:36                   ` Herman, Géza
2024-03-30 20:18                     ` [External] : " Drew Adams
2024-03-30 20:51                       ` Herman, Géza
2024-03-30 23:28                         ` Drew Adams
2024-03-31  5:08                     ` Eli Zaretskii
2024-03-31 14:51                       ` Mattias Engdegård
2024-03-31 15:44                         ` Eli Zaretskii
2024-04-01  8:48                           ` Mattias Engdegård
2024-04-01 16:54                             ` Herman, Géza
2024-03-31 17:48                       ` Adam Porter

find likely ancestor, descendant, or conflicting patches for this message:
dfblob:e849ccaf72 dfblob:c08d9d4a4f
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://www.gnu.org/software/emacs/

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=87h6h2rsgn.fsf@gmail.com \
    --to=geza.herman@gmail.com \
    --cc=emacs-devel@gnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

Code repositories for project(s) associated with this public inbox

	https://git.savannah.gnu.org/cgit/emacs.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).