Re: I created a faster JSON parser

all messages for Emacs-related lists mirrored at yhetil.org
 help / color / mirror / code / Atom feed

From: "Herman, Géza" <geza.herman@gmail.com>
To: Christopher Wellons <wellons@nullprogram.com>
Cc: "Herman, Géza" <geza.herman@gmail.com>,
	"emacs-devel@gnu.org" <emacs-devel@gnu.org>
Subject: Re: I created a faster JSON parser
Date: Sun, 10 Mar 2024 21:41:57 +0100	[thread overview]
Message-ID: <87sf0xu7qd.fsf@gmail.com> (raw)
In-Reply-To: <20240310165413.35pszp3b37m3y2kh@nullprogram.com>

[-- Attachment #1: Type: text/plain, Size: 1124 bytes --]


Christopher Wellons <wellons@nullprogram.com> writes:

>> I'd be glad if you could give some advice: which fuzz-testing 
>> framework
>> to use, which introductory material is worth reading, etc.
>
> I'm partial to AFL++, and it's what I reach for first. It also 
> works
> with GCC. It has two modes, with persistent mode preferred:

Thanks so much for the description!  I created a standalone 
version of my parser (I attached it), and used "afl-clang-fast -o 
json json.c -fsanitize=address,undefined" and afl-fuzz to test it. 
It has been running for an hour, and the fuzzer hasn't found any 
problems yet.

I discovered a funny clang bug: it incorrectly optimizes around 
setjmp in do_test(): when json_parser_init runs, it stores the 
workspace pointer in a register.  And if there is an error during 
JSON parsing, it will always free the pointer which is in that 
register.  But in the meantime (I mean, after json_parser_init, 
and before the error is thrown), the parser could have updated 
it. So free() will be called on an already freed block.  I had to 
add a dummy printf("free!\n"); to circumvent this optimization.


[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: json.c --]
[-- Type: text/x-csrc, Size: 29853 bytes --]

#include <stddef.h>
#include <stdlib.h>
#include <errno.h>
#include <stdint.h>
#include <string.h>
#include <math.h>
#include <setjmp.h>
#include <stdio.h>
#include <unistd.h>

/* Overflow-checked arithmetic built on the GCC/Clang overflow builtins:
   the result of A op B is stored through the pointer R, and the macro
   evaluates to nonzero when the exact result did not fit in *R. */
#define ckd_add(R, A, B) __builtin_add_overflow((A), (B), (R))
#define ckd_mul(R, A, B) __builtin_mul_overflow((A), (B), (R))

/* Local boolean spelling, used instead of including <stdbool.h>. */
#define bool _Bool
#define true 1
#define false 0

/* Parser configuration.  It has no members in this standalone version;
   json_parser_init only copies it into struct json_parser. */
struct json_configuration {
};

/* Stand-in for a parsed value: a single size_t payload.  The parser
   functions in this file store small integer tags, numeric values and
   malloc'ed pointers cast to size_t in it. */
typedef struct {
    size_t value;
} Lisp_Object;

/* Complete state of one parse: the input cursor, line/column counters,
   the remaining nesting budget and two growable scratch buffers. */
struct json_parser {
    /* Because of a possible gap in the input (an Emacs buffer can have
       a gap), the input is described by [input_begin;input_end) and
       [secondary_input_begin;secondary_input_end).  If the input is
       contiguous, then secondary_input_begin and secondary_input_end
       should be NULL.  input_current points at the next byte to read
       from [input_begin;input_end). */
    const unsigned char *input_current;
    const unsigned char *input_begin;
    const unsigned char *input_end;

    const unsigned char *secondary_input_begin;
    const unsigned char *secondary_input_end;

    /* Position counters; json_parser_init starts them at line 1,
       column 0. */
    int current_line;
    int current_column;

    /* The parser has a maximum allowed depth.  available_depth is
       decremented when a non-empty object/array begins and incremented
       again when it ends.  If it drops below zero, an error is
       signaled. */
    int available_depth;

    struct json_configuration conf;

    /* Set by json_input_switch_to_secondary to the length of the range
       that was current before the switch. */
    size_t additional_bytes_count;

    /* Lisp_Objects are collected in this area during object/array
       parsing.  [object_workspace;object_workspace_end) is the
       allocation, object_workspace_current is the first free slot. */
    Lisp_Object *object_workspace;
    Lisp_Object *object_workspace_end;
    Lisp_Object *object_workspace_current;

    /* String and number parsing uses this workspace.
       [byte_workspace;byte_workspace_end) is the allocation,
       byte_workspace_current is the first free byte. */
    unsigned char *byte_workspace;
    unsigned char *byte_workspace_end;
    unsigned char *byte_workspace_current;
};

/* Jump target used by json_signal_error; do_test arms it with setjmp. */
jmp_buf signal_env;

/* Error symbols passed to json_signal_error.  Nothing in this file
   assigns them, and json_signal_error does not look at which one it
   receives, so they only name the error condition at the call site. */
Lisp_Object Qjson_out_of_memory;
Lisp_Object Qjson_end_of_file;
Lisp_Object Qjson_escape_sequence_error;
Lisp_Object Qjson_utf8_decode_error;
Lisp_Object Qjson_invalid_surrogate_error;
Lisp_Object Qjson_parse_error;
Lisp_Object Qjson_error;
Lisp_Object Qjson_number_out_of_range;
Lisp_Object Qjson_object_too_deep;
Lisp_Object Qjson_trailing_content;

/* Aborts the current parse by jumping back to the setjmp on signal_env.
   Both arguments are ignored in this standalone build; the original
   code carried a commented-out xsignal2 call that would have reported
   'error' together with the parser's current line and column. */
static _Noreturn void json_signal_error(struct json_parser *parser, Lisp_Object error) {
    (void)parser;
    (void)error;
    longjmp(signal_env, 1);
}

/* Initializes 'parser' to read the JSON text from [input;input_end)
   followed by [secondary_input;secondary_input_end), and allocates the
   two scratch workspaces.  An empty (or NULL) secondary range is
   normalized to NULL/NULL; if the primary range is empty, the secondary
   range becomes the primary one.  'conf' is stored by value.

   The workspaces must later be released with json_parser_done.  If
   either allocation fails the process is aborted: do_test calls this
   function before its setjmp, so json_signal_error has no valid jump
   target yet. */
static void json_parser_init(struct json_parser *parser, struct json_configuration conf, const unsigned char *input,
                             const unsigned char *input_end, const unsigned char *secondary_input,
                             const unsigned char *secondary_input_end) {
    const int initial_workspace_size = 64;
    const int initial_string_workspace_size = 512;

    if (secondary_input >= secondary_input_end) {
        secondary_input = NULL;
        secondary_input_end = NULL;
    }

    if (input < input_end) {
        parser->input_begin = input;
        parser->input_end = input_end;

        parser->secondary_input_begin = secondary_input;
        parser->secondary_input_end = secondary_input_end;
    } else {
        parser->input_begin = secondary_input;
        parser->input_end = secondary_input_end;

        parser->secondary_input_begin = NULL;
        parser->secondary_input_end = NULL;
    }

    parser->input_current = parser->input_begin;

    parser->current_line = 1;
    parser->current_column = 0;
    parser->available_depth = 8;
    parser->conf = conf;

    parser->additional_bytes_count = 0;

    parser->object_workspace = malloc(initial_workspace_size * sizeof *parser->object_workspace);
    parser->byte_workspace = malloc(initial_string_workspace_size);
    if (parser->object_workspace == NULL || parser->byte_workspace == NULL) {
        /* Computing the *_end pointers from a NULL base would be
           undefined behavior, so stop here instead. */
        abort();
    }

    parser->object_workspace_end = parser->object_workspace + initial_workspace_size;
    parser->object_workspace_current = parser->object_workspace;

    parser->byte_workspace_end = parser->byte_workspace + initial_string_workspace_size;
    /* Previously left uninitialized until the first
       json_byte_workspace_reset call. */
    parser->byte_workspace_current = parser->byte_workspace;
}

/* Releases the two workspaces owned by the parser.  'parser' must point
   to a struct json_parser; it is taken as void * and converted here. */
static void json_parser_done(void *parser) {
    struct json_parser *const state = parser;

    free(state->byte_workspace);
    free(state->object_workspace);
}

/* Makes sure that the object_workspace has 'size' available space for
   Lisp_Objects */
static void json_make_object_workspace_for(struct json_parser *parser, size_t size) {
    size_t available_size = parser->object_workspace_end - parser->object_workspace_current;
    if (available_size >= size) {
        return;
    }
    size_t needed_workspace_size = (parser->object_workspace_current - parser->object_workspace + size);
    size_t new_workspace_size = parser->object_workspace_end - parser->object_workspace;
    while (new_workspace_size < needed_workspace_size) {
        if (ckd_mul(&new_workspace_size, new_workspace_size, 2)) {
            json_signal_error(parser, Qjson_out_of_memory);
        }
    }
    size_t offset = parser->object_workspace_current - parser->object_workspace;
    parser->object_workspace = realloc(parser->object_workspace, new_workspace_size * sizeof(Lisp_Object));
    parser->object_workspace_end = parser->object_workspace + new_workspace_size;
    parser->object_workspace_current = parser->object_workspace + offset;
}

/* Empties the byte workspace by moving the write position back to its
   start.  The allocation itself is kept. */
static void json_byte_workspace_reset(struct json_parser *parser) {
    parser->byte_workspace_current = parser->byte_workspace;
}

/* Appends 'value' to the byte_workspace.  If the workspace is full, its
   size is doubled first and the workspace pointers are re-based on the
   new allocation.

   Signals Qjson_out_of_memory if the doubled size overflows size_t or
   if realloc fails.  In the failure case the old workspace is left
   untouched, so json_parser_done can still free it. */
static void json_byte_workspace_put(struct json_parser *parser, unsigned char value) {
    if (parser->byte_workspace_current < parser->byte_workspace_end) {
        *parser->byte_workspace_current++ = value;
        return;
    }

    const size_t old_size = parser->byte_workspace_end - parser->byte_workspace;
    const size_t offset = parser->byte_workspace_current - parser->byte_workspace;

    /* An empty workspace grows to one byte; doubling zero would leave
       no room for 'value'. */
    size_t new_size = 1;
    if (old_size != 0 && ckd_mul(&new_size, old_size, 2)) {
        json_signal_error(parser, Qjson_out_of_memory);
    }

    /* Keep the old pointer until realloc is known to have succeeded. */
    unsigned char *grown = realloc(parser->byte_workspace, new_size);
    if (grown == NULL) {
        json_signal_error(parser, Qjson_out_of_memory);
    }

    parser->byte_workspace = grown;
    parser->byte_workspace_end = grown + new_size;
    parser->byte_workspace_current = grown + offset;
    *parser->byte_workspace_current++ = value;
}

/* Returns true when no more input can be read: the current range is
   used up and there is no secondary range left to switch to. */
static bool json_input_at_eof(struct json_parser *parser) {
    const bool current_range_exhausted = parser->input_current >= parser->input_end;

    return current_range_exhausted && parser->secondary_input_end == NULL;
}

/* Makes the secondary range the current one, if a non-empty secondary
   range exists.  The length of the range being left is recorded in
   additional_bytes_count and the secondary pointers are cleared, so the
   switch happens at most once.  Returns 0 after switching, -1 if there
   was nothing to switch to. */
static int json_input_switch_to_secondary(struct json_parser *parser) {
    const unsigned char *const next_begin = parser->secondary_input_begin;
    const unsigned char *const next_end = parser->secondary_input_end;

    if (!(next_begin < next_end))
        return -1;

    parser->additional_bytes_count = parser->input_end - parser->input_begin;

    parser->input_begin = next_begin;
    parser->input_current = next_begin;
    parser->input_end = next_end;

    parser->secondary_input_begin = NULL;
    parser->secondary_input_end = NULL;
    return 0;
}

/* Reads the next byte of the JSON input and advances past it, moving on
   to the secondary range when the current one is used up.  Signals
   Qjson_end_of_file when there is no input left. */
static unsigned char json_input_get(struct json_parser *parser) {
    if (parser->input_current >= parser->input_end) {
        if (json_input_switch_to_secondary(parser) < 0)
            json_signal_error(parser, Qjson_end_of_file);
    }

    return *parser->input_current++;
}

/* Non-signaling variant of json_input_get: returns the next input byte
   (0..255) and advances past it, or returns -1 when the input is
   exhausted. */
static int json_input_get_if_possible(struct json_parser *parser) {
    const bool current_range_exhausted = parser->input_current >= parser->input_end;

    if (current_range_exhausted && json_input_switch_to_secondary(parser) < 0)
        return -1;

    const unsigned char byte = *parser->input_current;
    parser->input_current++;
    return byte;
}

/* Puts back the last read input byte by stepping input_current back by
   one.  Only one byte can be put back, because otherwise this code
   would need to handle switching from the secondary buffer back to the
   initial one.  It must only be called after a successful read. */
static void json_input_put_back(struct json_parser *parser) {
    parser->input_current--;
}

/* Accounts for one consumed character 'c' in the line/column counters
   and classifies it.  Returns false if 'c' is JSON whitespace (space,
   tab, carriage return or line feed), true otherwise.  A line feed
   starts a new line and resets the column to 0. */
static bool json_skip_whitespace_internal(struct json_parser *parser, int c) {
    parser->current_column++;

    switch (c) {
    case 0x0a:
        parser->current_line++;
        parser->current_column = 0;
        return false;
    case 0x20:
    case 0x09:
    case 0x0d:
        return false;
    default:
        return true;
    }
}

/* Consumes input up to and including the first non-whitespace
   character and returns that character.  Reaching the end of the input
   first makes json_input_get signal Qjson_end_of_file. */
static int json_skip_whitespace(struct json_parser *parser) {
    int c;

    do {
        c = json_input_get(parser);
    } while (!json_skip_whitespace_internal(parser, c));

    return c;
}

/* Like json_skip_whitespace, but does not signal at the end of the
   input: returns the first non-whitespace character, or -1 if only
   whitespace (or nothing) was left. */
static int json_skip_whitespace_if_possible(struct json_parser *parser) {
    int c;

    while ((c = json_input_get_if_possible(parser)) >= 0) {
        if (json_skip_whitespace_internal(parser, c))
            break;
    }

    return c;
}

/* Returns the numeric value (0..15) of the hexadecimal digit 'c', which
   may be upper or lower case, or -1 if 'c' is not a hexadecimal
   digit. */
static int json_hex_value(int c) {
    int value = -1;

    if (c >= '0' && c <= '9')
        value = c - '0';
    else if (c >= 'a' && c <= 'f')
        value = 10 + (c - 'a');
    else if (c >= 'A' && c <= 'F')
        value = 10 + (c - 'A');

    return value;
}

/* Parses the CCCC part of the unicode escape sequence \uCCCC: reads
   exactly four hexadecimal digits and returns their 16-bit value, most
   significant digit first.  Signals Qjson_escape_sequence_error on the
   first character that is not a hexadecimal digit. */
static int json_parse_unicode(struct json_parser *parser) {
    int code_unit = 0;

    for (int remaining = 4; remaining > 0; remaining--) {
        const int digit = json_hex_value(json_input_get(parser));
        parser->current_column++;
        if (digit < 0)
            json_signal_error(parser, Qjson_escape_sequence_error);
        code_unit = (code_unit << 4) | digit;
    }

    return code_unit;
}

/* Reads the 'n' continuation bytes of a UTF-8 sequence whose first byte
   has already been consumed.  Each byte is copied into the byte
   workspace and must have the form 10xxxxxx, otherwise
   Qjson_utf8_decode_error is signaled.  Returns the payload bits of the
   continuation bytes combined (6 bits per byte); the bits contributed
   by the first byte are not included. */
static int json_handle_utf8_tail_bytes(struct json_parser *parser, int n) {
    int payload = 0;

    for (int left = n; left > 0; left--) {
        const unsigned char byte = json_input_get(parser);
        json_byte_workspace_put(parser, byte);
        if ((byte & 0xc0) != 0x80)
            json_signal_error(parser, Qjson_utf8_decode_error);
        payload = (payload << 6) | (byte & 0x3f);
    }

    return payload;
}

/* Reads a JSON string, and puts the result into the byte workspace.
   The callers consume the opening '"' before calling this; the
   function consumes everything up to and including the closing '"'.
   The bytes appended to the workspace are the decoded string content
   (escape sequences replaced by their UTF-8 encoding); no terminator is
   added and the workspace is not reset here.

   Signals an error for invalid UTF-8, invalid escape sequences,
   unpaired surrogates, unescaped bytes below 0x20, and (through
   json_input_get) for input that ends before the closing quote. */
static void json_parse_string(struct json_parser *parser) {
    /* a single_uninteresting byte can be simply copied from the input
       to output, it doesn't need any extra care.  These are the bytes
       0x20..0x7f except '"' (0x22) and '\\' (0x5c). */
    static const char is_single_uninteresting[256] = {
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    };

    for (;;) {
        /* This if is only here for a possible speedup.  If there are 4
           bytes available, and all of them are single_uninteresting,
           then we can just copy these 4 bytes to output.  Only the
           current input range is inspected, so close to its end the
           byte-at-a-time path below takes over. */
        if (parser->input_end - parser->input_current >= 4) {
            int c0 = parser->input_current[0];
            int c1 = parser->input_current[1];
            int c2 = parser->input_current[2];
            int c3 = parser->input_current[3];
            bool v0 = is_single_uninteresting[c0];
            bool v1 = is_single_uninteresting[c1];
            bool v2 = is_single_uninteresting[c2];
            bool v3 = is_single_uninteresting[c3];
            if (v0 && v1 && v2 && v3) {
                json_byte_workspace_put(parser, c0);
                json_byte_workspace_put(parser, c1);
                json_byte_workspace_put(parser, c2);
                json_byte_workspace_put(parser, c3);
                parser->input_current += 4;
                parser->current_column += 4;
                continue;
            }
        }

        /* Slow path: handle one byte (or one multi-byte sequence / one
           escape sequence) at a time. */
        int c = json_input_get(parser);
        parser->current_column++;
        if (is_single_uninteresting[c]) {
            json_byte_workspace_put(parser, c);
            continue;
        }

        if (c == '"')
            /* Closing quote: the string is complete. */
            return;
        else if (c & 0x80) {
            /* Handle utf-8 encoding.  The bytes are copied to the
               workspace unchanged; the code point 'n' is decoded only
               to reject overlong encodings, surrogates and values above
               0x10ffff. */
            json_byte_workspace_put(parser, c);
            if (c < 0xc0)
                /* A continuation byte cannot start a sequence. */
                json_signal_error(parser, Qjson_utf8_decode_error);
            else if (c < 0xe0) {
                /* 2-byte sequence */
                int n = ((c & 0x1f) << 6 | json_handle_utf8_tail_bytes(parser, 1));
                if (n < 0x80)
                    json_signal_error(parser, Qjson_utf8_decode_error);
            } else if (c < 0xf0) {
                /* 3-byte sequence */
                int n = ((c & 0xf) << 12 | json_handle_utf8_tail_bytes(parser, 2));
                if (n < 0x800 || (n >= 0xd800 && n < 0xe000))
                    json_signal_error(parser, Qjson_utf8_decode_error);
            } else if (c < 0xf8) {
                /* 4-byte sequence */
                int n = ((c & 0x7) << 18 | json_handle_utf8_tail_bytes(parser, 3));
                if (n < 0x10000 || n > 0x10ffff)
                    json_signal_error(parser, Qjson_utf8_decode_error);
            } else
                json_signal_error(parser, Qjson_utf8_decode_error);
        } else if (c == '\\') {
            /* Handle escape sequences */
            c = json_input_get(parser);
            parser->current_column++;
            if (c == '"')
                json_byte_workspace_put(parser, '"');
            else if (c == '\\')
                json_byte_workspace_put(parser, '\\');
            else if (c == '/')
                json_byte_workspace_put(parser, '/');
            else if (c == 'b')
                json_byte_workspace_put(parser, '\b');
            else if (c == 'f')
                json_byte_workspace_put(parser, '\f');
            else if (c == 'n')
                json_byte_workspace_put(parser, '\n');
            else if (c == 'r')
                json_byte_workspace_put(parser, '\r');
            else if (c == 't')
                json_byte_workspace_put(parser, '\t');
            else if (c == 'u') {
                int num = json_parse_unicode(parser);
                /* is the first half of the surrogate pair: it must be
                   followed by a second \uCCCC escape holding the second
                   half */
                if (num >= 0xd800 && num < 0xdc00) {
                    parser->current_column++;
                    if (json_input_get(parser) != '\\')
                        json_signal_error(parser, Qjson_invalid_surrogate_error);
                    parser->current_column++;
                    if (json_input_get(parser) != 'u')
                        json_signal_error(parser, Qjson_invalid_surrogate_error);
                    int num2 = json_parse_unicode(parser);
                    if (num2 < 0xdc00 || num2 >= 0xe000)
                        json_signal_error(parser, Qjson_invalid_surrogate_error);
                    /* combine the two halves into one code point */
                    num = (0x10000 + ((num - 0xd800) << 10 | (num2 - 0xdc00)));
                } else if (num >= 0xdc00 && num < 0xe000)
                    /* is the second half of the surrogate pair without
                       the first half */
                    json_signal_error(parser, Qjson_invalid_surrogate_error);

                /* utf-8 encode the code-point */
                if (num < 0x80)
                    json_byte_workspace_put(parser, num);
                else if (num < 0x800) {
                    json_byte_workspace_put(parser, 0xc0 | num >> 6);
                    json_byte_workspace_put(parser, 0x80 | (num & 0x3f));
                } else if (num < 0x10000) {
                    json_byte_workspace_put(parser, 0xe0 | num >> 12);
                    json_byte_workspace_put(parser, (0x80 | ((num >> 6) & 0x3f)));
                    json_byte_workspace_put(parser, 0x80 | (num & 0x3f));
                } else {
                    json_byte_workspace_put(parser, 0xf0 | num >> 18);
                    json_byte_workspace_put(parser, (0x80 | ((num >> 12) & 0x3f)));
                    json_byte_workspace_put(parser, (0x80 | ((num >> 6) & 0x3f)));
                    json_byte_workspace_put(parser, 0x80 | (num & 0x3f));
                }
            } else
                json_signal_error(parser, Qjson_escape_sequence_error);
        } else
            /* Remaining case: an unescaped byte below 0x20. */
            json_signal_error(parser, Qjson_parse_error);
    }
}

/* If there was no integer overflow during parsing the integer, this
   puts 'value' to the output. Otherwise this calls string_to_number
   to parse integer on the byte workspace.  This could just always
   call string_to_number, but for performance reasons, during parsing
   the code tries to calculate the value, so in most cases, we can
   save call of string_to_number */
static Lisp_Object json_create_integer(struct json_parser *parser, bool integer_overflow, bool negative,
                                       ulong value) {
    Lisp_Object o;
    o.value = value;
    if (!integer_overflow) {
        if (negative) {
            uintmax_t v = value;
            if (v <= (uintmax_t)INTMAX_MAX + 1)
                return o;
        } else {
            return o;
        }
    }

    json_byte_workspace_put(parser, 0);
    ptrdiff_t len = strlen((const char *)parser->byte_workspace);
    if (len != parser->byte_workspace_current - parser->byte_workspace - 1)
        json_signal_error(parser, Qjson_error);
    return o;
}

/* Parses a float using the byte workspace: NUL-terminates the collected
   number text and converts it with strtod.

   Signals Qjson_number_out_of_range if strtod reports an overflow
   (errno set and the result is +-HUGE_VAL), and Qjson_error if strtod
   did not consume the whole text.

   The returned stand-in object holds the value truncated to size_t.
   Converting a double that is negative or too large for size_t is
   undefined behavior, so such values are stored as 0 instead. */
static Lisp_Object json_create_float(struct json_parser *parser) {
    json_byte_workspace_put(parser, 0);

    errno = 0;
    char *e;
    double value = strtod((const char *)parser->byte_workspace, &e);

    bool out_of_range = (errno != 0 && (value == HUGE_VAL || value == -HUGE_VAL));
    if (out_of_range)
        json_signal_error(parser, Qjson_number_out_of_range);

    /* 'e' must point at the terminator written above. */
    if ((const unsigned char *)e != parser->byte_workspace_current - 1)
        json_signal_error(parser, Qjson_error);

    Lisp_Object o;
    if (value > -1.0 && value < (double)SIZE_MAX)
        o.value = (size_t)value;
    else
        o.value = 0;
    return o;
}

/* Parses a number.  The first character is the input parameter 'c'
   (a digit or '-'); it has already been consumed by the caller.

   The text of the number is collected in the byte workspace.  A number
   with a fraction or an exponent is converted by json_create_float,
   anything else by json_create_integer.  The character that ends the
   number is put back into the input.  Signals Qjson_parse_error where a
   digit is required but missing. */
static Lisp_Object json_parse_number(struct json_parser *parser, int c) {
    json_byte_workspace_reset(parser);
    json_byte_workspace_put(parser, c);

    Lisp_Object o;
    o.value = 0;

    bool negative = false;
    if (c == '-') {
        negative = true;
        c = json_input_get(parser);
        json_byte_workspace_put(parser, c);
        parser->current_column++;
    }
    if (c < '0' || c > '9')
        json_signal_error(parser, Qjson_parse_error);

    /* The idea is that during finding the last character of the
       number, the for loop below also tries to calculate the value.  If
       the parsed number is an integer which fits into unsigned long,
       then the parser can use the value of 'integer' right away,
       instead of having to re-parse the byte workspace later.
       Ideally, this integer should have the same size as a CPU general
       purpose register. */
    unsigned long integer = c - '0';
    bool integer_overflow = false;

    if (integer == 0) {
        /* Leading zero: no further integer digits are collected.  At
           the end of the input the zero object is returned directly;
           otherwise only a fraction or an exponent may follow. */
        if (json_input_at_eof(parser))
            return o;
        c = json_input_get(parser);
    } else {
        /* Collect the remaining integer digits, accumulating the value
           with overflow detection. */
        for (;;) {
            if (json_input_at_eof(parser))
                return json_create_integer(parser, integer_overflow, negative, integer);
            c = json_input_get(parser);
            if (c < '0' || c > '9')
                break;
            json_byte_workspace_put(parser, c);
            parser->current_column++;

            integer_overflow |= ckd_mul(&integer, integer, 10);
            integer_overflow |= ckd_add(&integer, integer, c - '0');
        }
    }

    /* Here 'c' is the first character after the integer part; it has
       been read but not yet stored or counted. */
    bool is_float = false;
    if (c == '.') {
        /* Fraction: '.' followed by at least one digit. */
        json_byte_workspace_put(parser, c);
        parser->current_column++;

        is_float = true;
        c = json_input_get(parser);
        json_byte_workspace_put(parser, c);
        parser->current_column++;
        if (c < '0' || c > '9')
            json_signal_error(parser, Qjson_parse_error);
        for (;;) {
            if (json_input_at_eof(parser))
                return json_create_float(parser);
            c = json_input_get(parser);
            if (c < '0' || c > '9')
                break;
            json_byte_workspace_put(parser, c);
            parser->current_column++;
        }
    }
    if (c == 'e' || c == 'E') {
        /* Exponent: 'e' or 'E', an optional sign, then at least one
           digit. */
        json_byte_workspace_put(parser, c);
        parser->current_column++;

        is_float = true;
        c = json_input_get(parser);
        json_byte_workspace_put(parser, c);
        parser->current_column++;
        if (c == '-' || c == '+') {
            c = json_input_get(parser);
            json_byte_workspace_put(parser, c);
            parser->current_column++;
        }
        if (c < '0' || c > '9')
            json_signal_error(parser, Qjson_parse_error);
        for (;;) {
            if (json_input_at_eof(parser))
                return json_create_float(parser);
            c = json_input_get(parser);
            if (c < '0' || c > '9')
                break;
            json_byte_workspace_put(parser, c);
            parser->current_column++;
        }
    }

    /* 'c' contains a character which is not part of the number,
       so it needs to be put back */
    json_input_put_back(parser);

    if (is_float)
        return json_create_float(parser);
    else
        return json_create_integer(parser, integer_overflow, negative, integer);
}

/* Forward declaration: the array and object parsers below call
   json_parse_value recursively for their elements. */
static Lisp_Object json_parse_value(struct json_parser *parser, int c);

/* Parses a JSON array. */
static Lisp_Object json_parse_array(struct json_parser *parser) {
    int c = json_skip_whitespace(parser);

    const size_t begin_offset = parser->object_workspace_current - parser->object_workspace;

    if (c != ']') {
        parser->available_depth--;
        if (parser->available_depth < 0)
            json_signal_error(parser, Qjson_object_too_deep);

        size_t number_of_elements = 0;
        /* This loop collects the array elements in the object workspace
         */
        for (;;) {
            Lisp_Object element = json_parse_value(parser, c);
            json_make_object_workspace_for(parser, 1);
            *parser->object_workspace_current++ = element;

            c = json_skip_whitespace(parser);

            number_of_elements++;
            if (c == ']') {
                parser->available_depth++;
                break;
            }

            if (c != ',')
                json_signal_error(parser, Qjson_parse_error);

            c = json_skip_whitespace(parser);
        }
    }

    Lisp_Object result;
    const Lisp_Object *b = parser->object_workspace + begin_offset;
    size_t number_of_elements = parser->object_workspace_current - b;
    void *array = malloc(number_of_elements*sizeof(Lisp_Object));
    memcpy(array, b, number_of_elements*sizeof(Lisp_Object));

    result.value = (size_t)array;

    parser->object_workspace_current = parser->object_workspace + begin_offset;

    return result;
}

/* Parses a JSON object. */
static Lisp_Object json_parse_object(struct json_parser *parser) {
    int c = json_skip_whitespace(parser);

    const size_t begin_offset = parser->object_workspace_current - parser->object_workspace;

    if (c != '}') {
        parser->available_depth--;
        if (parser->available_depth < 0)
            json_signal_error(parser, Qjson_object_too_deep);

        /* This loop collects the object members (key/value pairs) in
         * the object workspace */
        for (;;) {
            if (c != '"')
                json_signal_error(parser, Qjson_parse_error);

            Lisp_Object key;
            json_byte_workspace_reset(parser);
            json_parse_string(parser);
            if (parser->byte_workspace_current > parser->byte_workspace) {
                key.value = parser->byte_workspace[0] + parser->byte_workspace_current[-1];
            } else {
                key.value = 0;
            }

            c = json_skip_whitespace(parser);
            if (c != ':')
                json_signal_error(parser, Qjson_parse_error);

            c = json_skip_whitespace(parser);

            Lisp_Object value = json_parse_value(parser, c);

            json_make_object_workspace_for(parser, 2);
            *parser->object_workspace_current++ = key;
            *parser->object_workspace_current++ = value;

            c = json_skip_whitespace(parser);

            if (c == '}') {
                parser->available_depth++;
                break;
            }

            if (c != ',')
                json_signal_error(parser, Qjson_parse_error);

            c = json_skip_whitespace(parser);
        }
    }

    Lisp_Object result;
    Lisp_Object *end = parser->object_workspace_current;
    Lisp_Object *member = parser->object_workspace + begin_offset;
    Lisp_Object *array = malloc((end - member)*sizeof(Lisp_Object));
    while (member < end) {
        *array++ = member[0];
        *array++ = member[1];

        member += 2;
    }
    result.value = (size_t)array;

    parser->object_workspace_current = parser->object_workspace + begin_offset;

    return result;
}

/* "Token char" is not JSON terminology.  When parsing null/false/true,
   this function defines the set of characters that count as part of
   the same token: ASCII letters, ASCII digits and '-'.  For example,
   the input "truesomething" must not be read as "true" followed by a
   later "something" token, and with the input "truetrue", calling
   (json-parse-buffer) twice must not produce two successful calls that
   return t, but a parsing error. */
static bool json_is_token_char(int c) {
    if (c == '-')
        return true;
    if (c >= '0' && c <= '9')
        return true;
    if (c >= 'a' && c <= 'z')
        return true;
    return c >= 'A' && c <= 'Z';
}

/* Entry point of the value parser: parses one JSON value whose first
   character 'c' has already been consumed by the caller.

   Objects, arrays and numbers are delegated to their own parsers.  A
   string is read into the byte workspace and represented by a stand-in
   value computed from its first and last byte (0 for the empty string).
   The keywords true, null and false yield the values 0, 1 and 2; a
   keyword must not be directly followed by a token character.  Anything
   else signals Qjson_parse_error. */
Lisp_Object json_parse_value(struct json_parser *parser, int c) {
    switch (c) {
    case '{':
        return json_parse_object(parser);
    case '[':
        return json_parse_array(parser);
    case '"': {
        Lisp_Object string_value;
        json_byte_workspace_reset(parser);
        json_parse_string(parser);
        string_value.value = 0;
        if (parser->byte_workspace_current > parser->byte_workspace)
            string_value.value = parser->byte_workspace[0] + parser->byte_workspace_current[-1];
        return string_value;
    }
    default:
        break;
    }

    if (c == '-' || (c >= '0' && c <= '9'))
        return json_parse_number(parser, c);

    /* Keyword candidates.  Three more characters are mandatory (the
       shortest keywords have four letters); the fifth one may be
       missing at the end of the input. */
    const int c2 = json_input_get(parser);
    const int c3 = json_input_get(parser);
    const int c4 = json_input_get(parser);
    const int c5 = json_input_get_if_possible(parser);
    const bool c5_ends_token = c5 < 0 || !json_is_token_char(c5);

    const bool spells_true = c == 't' && c2 == 'r' && c3 == 'u' && c4 == 'e';
    const bool spells_null = c == 'n' && c2 == 'u' && c3 == 'l' && c4 == 'l';

    if ((spells_true || spells_null) && c5_ends_token) {
        /* c5 belongs to whatever follows the keyword. */
        if (c5 >= 0)
            json_input_put_back(parser);
        parser->current_column += 4;

        Lisp_Object keyword;
        keyword.value = spells_true ? 0 : 1;
        return keyword;
    }

    if (c == 'f' && c2 == 'a' && c3 == 'l' && c4 == 's' && c5 == 'e') {
        const int c6 = json_input_get_if_possible(parser);
        if (c6 < 0 || !json_is_token_char(c6)) {
            if (c6 >= 0)
                json_input_put_back(parser);
            parser->current_column += 5;

            Lisp_Object keyword;
            keyword.value = 2;
            return keyword;
        }
    }

    json_signal_error(parser, Qjson_parse_error);
}

/* What json_parse does after the top-level value has been parsed:
   CheckForGarbage signals an error if anything but whitespace follows,
   MovePoint performs no check in this file. */
enum ParseEndBehavior { PARSEENDBEHAVIOR_CheckForGarbage, PARSEENDBEHAVIOR_MovePoint };

/* Parses one complete JSON value from the parser's input, skipping
   leading whitespace, and returns it.  With
   PARSEENDBEHAVIOR_CheckForGarbage, any non-whitespace input after the
   value signals Qjson_trailing_content; with PARSEENDBEHAVIOR_MovePoint
   the rest of the input is left unread. */
static Lisp_Object json_parse(struct json_parser *parser, enum ParseEndBehavior parse_end_behavior) {
    const int first = json_skip_whitespace(parser);
    const Lisp_Object result = json_parse_value(parser, first);

    if (parse_end_behavior == PARSEENDBEHAVIOR_CheckForGarbage) {
        if (json_skip_whitespace_if_possible(parser) >= 0)
            json_signal_error(parser, Qjson_trailing_content);
    }

    return result;
}

/* Runs the parser once over buffer[0..length) and releases the parser's
   workspaces afterwards, whether parsing succeeded or an error was
   signaled.  The parsed value is discarded.

   The parser state has static storage duration on purpose.  The parser
   modifies it after setjmp (the workspace pointers change whenever a
   workspace is reallocated), and a non-volatile automatic object that
   is modified between setjmp and longjmp has an indeterminate value
   after the jump.  With an automatic 'p' the compiler was therefore
   allowed to free a stale, already reallocated workspace pointer; the
   dummy printf that used to hide this effect is no longer needed.
   setjmp is also used directly as the operand of a comparison in the
   'if' condition, which is one of the contexts the C standard permits
   for it. */
void do_test(unsigned char *buffer, int length) {
    static struct json_parser p;
    static struct json_configuration conf;

    json_parser_init(&p, conf, buffer, buffer + length, NULL, NULL);

    if (setjmp(signal_env) == 0) {
        json_parse(&p, PARSEENDBEHAVIOR_CheckForGarbage);
    }
    /* Reached both after a normal return and after json_signal_error
       jumped back to the setjmp above. */
    json_parser_done(&p);
}

/* AFL++ persistent-mode harness: every test case delivered by the
   fuzzer is passed to do_test. */
__AFL_FUZZ_INIT();

int main(void) {
#ifdef __AFL_HAVE_MANUAL_CONTROL
    __AFL_INIT();
#endif

    /* The test case buffer must be fetched after __AFL_INIT and before
       the first __AFL_LOOP. */
    unsigned char *buf = __AFL_FUZZ_TESTCASE_BUF;

    while (__AFL_LOOP(10000)) {
        /* Copy the length into a local first: the macro must not be
           used directly in a call. */
        const int len = __AFL_FUZZ_TESTCASE_LEN;

        /* Test cases shorter than 8 bytes are skipped. */
        if (len >= 8) {
            do_test(buf, len);
        }
    }

    return 0;
}

next prev parent reply	other threads:[~2024-03-10 20:41 UTC|newest]

Thread overview: 51+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-03-08 10:27 I created a faster JSON parser Herman, Géza
2024-03-08 11:41 ` Philip Kaludercic
2024-03-08 12:34   ` Herman, Géza
2024-03-08 12:03 ` Eli Zaretskii
2024-03-08 12:38   ` Herman, Géza
2024-03-08 12:59     ` Eli Zaretskii
2024-03-08 13:12       ` Herman, Géza
2024-03-08 14:10         ` Eli Zaretskii
2024-03-08 14:24           ` Collin Funk
2024-03-08 15:20           ` Herman, Géza
2024-03-08 16:22             ` Eli Zaretskii
2024-03-08 18:34               ` Herman, Géza
2024-03-08 19:57                 ` Eli Zaretskii
2024-03-08 20:22                   ` Herman, Géza
2024-03-09  6:52                     ` Eli Zaretskii
2024-03-09 11:08                       ` Herman, Géza
2024-03-09 12:23                         ` Lynn Winebarger
2024-03-09 12:58                         ` Po Lu
2024-03-09 13:13                         ` Eli Zaretskii
2024-03-09 14:00                           ` Herman, Géza
2024-03-09 14:21                             ` Eli Zaretskii
2024-03-08 13:28 ` Po Lu
2024-03-08 16:14   ` Herman, Géza
2024-03-09  1:55     ` Po Lu
2024-03-09 20:37 ` Christopher Wellons
2024-03-10  6:31   ` Eli Zaretskii
2024-03-10 21:39     ` Philip Kaludercic
2024-03-11 13:29       ` Eli Zaretskii
2024-03-11 14:05         ` Mattias Engdegård
2024-03-11 14:35           ` Herman, Géza
2024-03-12  9:26             ` Mattias Engdegård
2024-03-12 10:20               ` Gerd Möllmann
2024-03-12 11:14                 ` Mattias Engdegård
2024-03-12 11:33                   ` Gerd Möllmann
2024-03-15 13:35                 ` Herman, Géza
2024-03-15 14:56                   ` Gerd Möllmann
2024-03-19 18:49                   ` Mattias Engdegård
2024-03-19 19:05                     ` Herman, Géza
2024-03-19 19:18                       ` Gerd Möllmann
2024-03-19 19:13                     ` Gerd Möllmann
2024-03-12 10:58               ` Herman, Géza
2024-03-12 13:11                 ` Mattias Engdegård
2024-03-12 13:42                   ` Mattias Engdegård
2024-03-12 15:23                   ` Herman, Géza
2024-03-12 15:39                     ` Gerd Möllmann
2024-03-10  6:58   ` Herman, Géza
2024-03-10 16:54     ` Christopher Wellons
2024-03-10 20:41       ` Herman, Géza [this message]
2024-03-10 23:22         ` Christopher Wellons
2024-03-11  9:34           ` Herman, Géza
2024-03-11 13:47             ` Christopher Wellons

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=87sf0xu7qd.fsf@gmail.com \
    --to=geza.herman@gmail.com \
    --cc=emacs-devel@gnu.org \
    --cc=wellons@nullprogram.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

Code repositories for project(s) associated with this external index

	https://git.savannah.gnu.org/cgit/emacs.git
	https://git.savannah.gnu.org/cgit/emacs/org-mode.git

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.