From 19e40c54691d23ea264db558ef6b5af986e8d7fe Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Tue, 30 Jul 2024 03:34:01 -0400 Subject: [PATCH 1/3] add compiled regexp primitive lisp object - add PVEC_REGEXP - add syms_of_regexp() - add make-regexp and regexpp - add syntax_table and more to the regexp struct - support pdumping of Lisp_Regexp - modify methods in search.c to accept a Lisp_Regexp - look up translate from the environment if set to t - make regexps purecopyable and pdumpable - replace fastmap size magic number with define - make more subroutines accept regexps and allocate registers - add match objects - make search methods accept a struct regexp_match_info * - track allocated registers in Lisp_Match - add match extraction for marks - add match-set-{starts,ends} methods - make noverlay perf tests work with non-root build dir - add regexp-perf stub - move perf.el to its own dir - make regexp-perf do its own benchmarking!! - parameterize some testing with macros - fix search_regs_1 lack of reentrancy - add string search benchmarks - comment out the marker extraction methods - propagate regex_match_info everywhere - set initialized_regs in regex match method - add the perf tests back so we can show off perf --- configure.ac | 1 + etc/emacs_lldb.py | 2 + lisp/image.el | 59 ++- src/Makefile.in | 3 +- src/alloc.c | 94 ++++ src/data.c | 4 + src/emacs.c | 1 + src/lisp.h | 70 ++- src/pdumper.c | 172 +++++- src/print.c | 48 ++ src/regex-emacs.c | 767 ++++++++++++++++++++++++++- src/regex-emacs.h | 38 +- src/search.c | 742 +++++++++++++++++--------- src/treesit.c | 4 +- test/manual/noverlay/Makefile.in | 19 +- test/manual/noverlay/overlay-perf.el | 407 -------------- test/manual/perf/perf.el | 442 +++++++++++++++ test/manual/regexp/Makefile.in | 38 ++ test/manual/regexp/regexp-perf.el | 232 ++++++++ 19 files changed, 2419 insertions(+), 724 deletions(-) create mode 100644 test/manual/perf/perf.el create mode 100644 test/manual/regexp/Makefile.in create mode 100644 test/manual/regexp/regexp-perf.el diff --git a/configure.ac b/configure.ac index 67da852667d..53b85ed4a9f 100644 --- a/configure.ac +++ b/configure.ac @@ -7900,6 +7900,7 @@ AC_DEFUN dnl ", [], [opt_makefile='$opt_makefile']" and it should work. ARCH_INDEPENDENT_CONFIG_FILES([test/Makefile]) ARCH_INDEPENDENT_CONFIG_FILES([test/manual/noverlay/Makefile]) + ARCH_INDEPENDENT_CONFIG_FILES([test/manual/regexp/Makefile]) fi opt_makefile=test/infra/Makefile if test -f "$srcdir/$opt_makefile.in"; then diff --git a/etc/emacs_lldb.py b/etc/emacs_lldb.py index ba80d3431f3..ccc096280ba 100644 --- a/etc/emacs_lldb.py +++ b/etc/emacs_lldb.py @@ -69,6 +69,8 @@ class Lisp_Object: "PVEC_MODULE_FUNCTION": "struct Lisp_Module_Function", "PVEC_NATIVE_COMP_UNIT": "struct Lisp_Native_Comp_Unit", "PVEC_SQLITE": "struct Lisp_Sqlite", + "PVEC_REGEXP": "struct Lisp_Regexp", + "PVEC_MATCH": "struct Lisp_Match", "PVEC_COMPILED": "struct Lisp_Vector", "PVEC_CHAR_TABLE": "struct Lisp_Vector", "PVEC_SUB_CHAR_TABLE": "void", diff --git a/lisp/image.el b/lisp/image.el index 3d60b485c6b..5c72181b94c 100644 --- a/lisp/image.el +++ b/lisp/image.el @@ -36,30 +36,31 @@ image (&optional filter animation-cache)) (defconst image-type-header-regexps - `(("\\`/[\t\n\r ]*\\*.*XPM.\\*/" . xpm) - ("\\`P[1-6]\\(?:\ + `((,(make-regexp "\\`/[\t\n\r ]*\\*.*XPM.\\*/") . xpm) + (,(make-regexp "\\`P[1-6]\\(?:\ \\(?:\\(?:#[^\r\n]*[\r\n]\\)*[ \t\r\n]\\)+\ \\(?:\\(?:#[^\r\n]*[\r\n]\\)*[0-9]\\)+\ -\\)\\{2\\}" . pbm) - ("\\`GIF8[79]a" . gif) - ("\\`\x89PNG\r\n\x1a\n" . png) - ("\\`[\t\n\r ]*#define \\([a-z0-9_]+\\)_width [0-9]+\n\ +\\)\\{2\\}") . pbm) + (,(make-regexp "\\`GIF8[79]a") . gif) + (,(make-regexp "\\`\x89PNG\r\n\x1a\n") . png) + (,(make-regexp "\\`[\t\n\r ]*#define \\([a-z0-9_]+\\)_width [0-9]+\n\ #define \\1_height [0-9]+\n\\(\ #define \\1_x_hot [0-9]+\n\ #define \\1_y_hot [0-9]+\n\\)?\ -static \\(unsigned \\)?char \\1_bits" . xbm) - ("\\`\\(?:MM\0\\*\\|II\\*\0\\)" . tiff) - ("\\`[\t\n\r ]*%!PS" . postscript) - ("\\`\xff\xd8" . jpeg) ; used to be (image-jpeg-p . jpeg) - ("\\`RIFF[^z-a][^z-a][^z-a][^z-a]WEBPVP8" . webp) +static \\(unsigned \\)?char \\1_bits") . xbm) + (,(make-regexp "\\`\\(?:MM\0\\*\\|II\\*\0\\)") . tiff) + (,(make-regexp "\\`[\t\n\r ]*%!PS") . postscript) + (,(make-regexp "\\`\xff\xd8") . jpeg) ; used to be (image-jpeg-p . jpeg) + (,(make-regexp "\\`RIFF[^z-a][^z-a][^z-a][^z-a]WEBPVP8") . webp) (,(let* ((incomment-re "\\(?:[^-]\\|-[^-]\\)") - (comment-re (concat "\\(?:!--" incomment-re "*-->[ \t\r\n]*<\\)"))) - (concat "\\(?:<\\?xml[ \t\r\n]+[^>]*>\\)?[ \t\r\n]*<" - comment-re "*" - "\\(?:!DOCTYPE[ \t\r\n]+[^>]*>[ \t\r\n]*<[ \t\r\n]*" comment-re "*\\)?" - "[Ss][Vv][Gg]")) + (comment-re (concat "\\(?:!--" incomment-re "*-->[ \t\r\n]*<\\)"))) + (make-regexp + (concat "\\(?:<\\?xml[ \t\r\n]+[^>]*>\\)?[ \t\r\n]*<" + comment-re "*" + "\\(?:!DOCTYPE[ \t\r\n]+[^>]*>[ \t\r\n]*<[ \t\r\n]*" comment-re "*\\)?" + "[Ss][Vv][Gg]"))) . svg) - ("\\`....ftyp\\(heic\\|heix\\|hevc\\|heim\\|heis\\|hevm\\|hevs\\|mif1\\|msf1\\)" . heic)) + (,(make-regexp "\\`....ftyp\\(heic\\|heix\\|hevc\\|heim\\|heis\\|hevm\\|hevs\\|mif1\\|msf1\\)") . heic)) "Alist of (REGEXP . IMAGE-TYPE) pairs used to auto-detect image types. When the first bytes of an image file match REGEXP, it is assumed to be of image type IMAGE-TYPE if IMAGE-TYPE is a symbol. If not a symbol, @@ -68,18 +69,18 @@ image-type-header-regexps a non-nil value, TYPE is the image's type.") (defvar image-type-file-name-regexps - '(("\\.png\\'" . png) - ("\\.gif\\'" . gif) - ("\\.jpe?g\\'" . jpeg) - ("\\.webp\\'" . webp) - ("\\.bmp\\'" . bmp) - ("\\.xpm\\'" . xpm) - ("\\.pbm\\'" . pbm) - ("\\.xbm\\'" . xbm) - ("\\.ps\\'" . postscript) - ("\\.tiff?\\'" . tiff) - ("\\.svgz?\\'" . svg) - ("\\.hei[cf]s?\\'" . heic)) + `((,(make-regexp "\\.png\\'") . png) + (,(make-regexp "\\.gif\\'") . gif) + (,(make-regexp "\\.jpe?g\\'") . jpeg) + (,(make-regexp "\\.webp\\'") . webp) + (,(make-regexp "\\.bmp\\'") . bmp) + (,(make-regexp "\\.xpm\\'") . xpm) + (,(make-regexp "\\.pbm\\'") . pbm) + (,(make-regexp "\\.xbm\\'") . xbm) + (,(make-regexp "\\.ps\\'") . postscript) + (,(make-regexp "\\.tiff?\\'") . tiff) + (,(make-regexp "\\.svgz?\\'") . svg) + (,(make-regexp "\\.hei[cf]s?\\'") . heic)) "Alist of (REGEXP . IMAGE-TYPE) pairs used to identify image files. When the name of an image file match REGEXP, it is assumed to be of image type IMAGE-TYPE.") diff --git a/src/Makefile.in b/src/Makefile.in index c278924ef94..e6ce159c051 100644 --- a/src/Makefile.in +++ b/src/Makefile.in @@ -537,7 +537,8 @@ all: .PHONY: all dmpstruct_headers=$(srcdir)/lisp.h $(srcdir)/buffer.h $(srcdir)/itree.h \ - $(srcdir)/intervals.h $(srcdir)/charset.h $(srcdir)/bignum.h + $(srcdir)/intervals.h $(srcdir)/charset.h $(srcdir)/bignum.h \ + $(srcdir)/regex-emacs.h ifeq ($(CHECK_STRUCTS),true) pdumper.o: dmpstruct.h endif diff --git a/src/alloc.c b/src/alloc.c index 06fe12cff3d..de7154e7575 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -3467,6 +3467,25 @@ cleanup_vector (struct Lisp_Vector *vector) } } break; + case PVEC_REGEXP: + { + struct Lisp_Regexp *r = PSEUDOVEC_STRUCT (vector, Lisp_Regexp); + eassert (r->buffer->allocated > 0); + eassert (r->buffer->used <= r->buffer->allocated); + xfree (r->buffer->buffer); + xfree (r->buffer->fastmap); + xfree (r->buffer); + } + break; + case PVEC_MATCH: + { + struct Lisp_Match *m = PSEUDOVEC_STRUCT (vector, Lisp_Match); + eassert (m->regs->num_regs > 0); + xfree (m->regs->start); + xfree (m->regs->end); + xfree (m->regs); + } + break; case PVEC_OBARRAY: { struct Lisp_Obarray *o = PSEUDOVEC_STRUCT (vector, Lisp_Obarray); @@ -5881,6 +5900,64 @@ make_pure_c_string (const char *data, ptrdiff_t nchars) static Lisp_Object purecopy (Lisp_Object obj); +static struct re_pattern_buffer * +make_pure_re_pattern_buffer (const struct re_pattern_buffer *bufp) +{ + struct re_pattern_buffer *pure = pure_alloc (sizeof *pure, -1); + *pure = *bufp; + + pure->buffer = pure_alloc (bufp->used, -1); + memcpy (pure->buffer, bufp->buffer, bufp->used); + pure->allocated = bufp->used; + pure->fastmap = pure_alloc (FASTMAP_SIZE, -1); + memcpy (pure->fastmap, bufp->fastmap, FASTMAP_SIZE); + pure->translate = purecopy (bufp->translate); + + return pure; +} + +static Lisp_Object +make_pure_regexp (const struct Lisp_Regexp *r) +{ + struct Lisp_Regexp *pure = pure_alloc (sizeof *pure, Lisp_Vectorlike); + *pure = *r; + + pure->pattern = purecopy (r->pattern); + pure->whitespace_regexp = purecopy (r->whitespace_regexp); + pure->syntax_table = purecopy (r->syntax_table); + pure->default_match_target = purecopy (r->default_match_target); + pure->buffer = make_pure_re_pattern_buffer (r->buffer); + + return make_lisp_ptr (pure, Lisp_Vectorlike); +} + +static struct re_registers * +make_pure_re_registers (const struct re_registers *regs) +{ + struct re_registers *pure = pure_alloc (sizeof *pure, -1); + *pure = *regs; + + ptrdiff_t reg_size = regs->num_regs * sizeof (ptrdiff_t); + pure->start = pure_alloc (reg_size, -1); + memcpy (pure->start, regs->start, reg_size); + pure->end = pure_alloc (reg_size, -1); + memcpy (pure->end, regs->end, reg_size); + + return pure; +} + +static Lisp_Object +make_pure_match (const struct Lisp_Match *m) +{ + struct Lisp_Match *pure = pure_alloc (sizeof *pure, Lisp_Vectorlike); + *pure = *m; + + pure->haystack = purecopy (m->haystack); + pure->regs = make_pure_re_registers (m->regs); + + return make_lisp_ptr (pure, Lisp_Vectorlike); +} + /* Return a cons allocated from pure space. Give it pure copies of CAR as car and CDR as cdr. */ @@ -6038,6 +6115,10 @@ purecopy (Lisp_Object obj) obj = make_pure_string (SSDATA (obj), SCHARS (obj), SBYTES (obj), STRING_MULTIBYTE (obj)); + else if (REGEXP_P (obj)) + obj = make_pure_regexp (XREGEXP (obj)); + else if (MATCH_P (obj)) + obj = make_pure_match (XMATCH (obj)); else if (HASH_TABLE_P (obj)) { struct Lisp_Hash_Table *table = XHASH_TABLE (obj); @@ -7311,6 +7392,19 @@ #define CHECK_ALLOCATED_AND_LIVE_SYMBOL() ((void) 0) break; } + case PVEC_REGEXP: + { + ptrdiff_t size = ptr->header.size; + if (size & PSEUDOVECTOR_FLAG) + size &= PSEUDOVECTOR_SIZE_MASK; + set_vector_marked (ptr); + mark_stack_push_values (ptr->contents, size); + + struct Lisp_Regexp *r = (struct Lisp_Regexp *)ptr; + mark_stack_push_value (r->buffer->translate); + break; + } + case PVEC_CHAR_TABLE: case PVEC_SUB_CHAR_TABLE: mark_char_table (ptr, (enum pvec_type) pvectype); diff --git a/src/data.c b/src/data.c index d947d200870..f52732741cf 100644 --- a/src/data.c +++ b/src/data.c @@ -290,6 +290,10 @@ DEFUN ("cl-type-of", Fcl_type_of, Scl_type_of, 1, 1, 0, return Qsqlite; case PVEC_SUB_CHAR_TABLE: return Qsub_char_table; + case PVEC_REGEXP: + return Qregexp; + case PVEC_MATCH: + return Qmatch; /* "Impossible" cases. */ case PVEC_MISC_PTR: case PVEC_OTHER: diff --git a/src/emacs.c b/src/emacs.c index 37c8b28fc2c..560843f9c56 100644 --- a/src/emacs.c +++ b/src/emacs.c @@ -2349,6 +2349,7 @@ main (int argc, char **argv) syms_of_window (); syms_of_xdisp (); syms_of_sqlite (); + syms_of_regexp (); syms_of_font (); #ifdef HAVE_WINDOW_SYSTEM syms_of_fringe (); diff --git a/src/lisp.h b/src/lisp.h index 8ac65ca429c..8f7e5fa1daa 100644 --- a/src/lisp.h +++ b/src/lisp.h @@ -1048,6 +1048,8 @@ DEFINE_GDB_SYMBOL_END (PSEUDOVECTOR_FLAG) PVEC_TS_NODE, PVEC_TS_COMPILED_QUERY, PVEC_SQLITE, + PVEC_REGEXP, + PVEC_MATCH, /* These should be last, for internal_equal and sxhash_obj. */ PVEC_CLOSURE, @@ -1421,6 +1423,8 @@ #define XSETWINDOW(a, b) XSETPSEUDOVECTOR (a, b, PVEC_WINDOW) #define XSETTERMINAL(a, b) XSETPSEUDOVECTOR (a, b, PVEC_TERMINAL) #define XSETSUBR(a, b) XSETPSEUDOVECTOR (a, b, PVEC_SUBR) #define XSETBUFFER(a, b) XSETPSEUDOVECTOR (a, b, PVEC_BUFFER) +#define XSETREGEXP(a, b) XSETPSEUDOVECTOR (a, b, PVEC_REGEXP) +#define XSETMATCH(a, b) XSETPSEUDOVECTOR (a, b, PVEC_MATCH) #define XSETCHAR_TABLE(a, b) XSETPSEUDOVECTOR (a, b, PVEC_CHAR_TABLE) #define XSETBOOL_VECTOR(a, b) XSETPSEUDOVECTOR (a, b, PVEC_BOOL_VECTOR) #define XSETSUB_CHAR_TABLE(a, b) XSETPSEUDOVECTOR (a, b, PVEC_SUB_CHAR_TABLE) @@ -2706,6 +2710,47 @@ XHASH_TABLE (Lisp_Object a) return XUNTAG (a, Lisp_Vectorlike, struct Lisp_Hash_Table); } +INLINE bool +REGEXP_P (Lisp_Object a) +{ + return PSEUDOVECTORP (a, PVEC_REGEXP); +} + +INLINE struct Lisp_Regexp * +XREGEXP (Lisp_Object a) +{ + eassert (REGEXP_P (a)); + return XUNTAG (a, Lisp_Vectorlike, struct Lisp_Regexp); +} + +INLINE void +CHECK_REGEXP (Lisp_Object x) +{ + CHECK_TYPE (REGEXP_P (x), Qregexpp, x); +} + +INLINE bool +MATCH_P (Lisp_Object a) +{ + return PSEUDOVECTORP (a, PVEC_MATCH); +} + +INLINE struct Lisp_Match * +XMATCH (Lisp_Object a) +{ + eassert (MATCH_P (a)); + return XUNTAG (a, Lisp_Vectorlike, struct Lisp_Match); +} + +INLINE void +CHECK_MATCH (Lisp_Object x) +{ + CHECK_TYPE (MATCH_P (x), Qmatchp, x); +} + +/* Defined in regex-emacs.c. */ +Lisp_Object allocate_match (ptrdiff_t re_nsub); + INLINE Lisp_Object make_lisp_hash_table (struct Lisp_Hash_Table *h) { @@ -2955,6 +3000,25 @@ xmint_pointer (Lisp_Object a) bool is_statement; } GCALIGNED_STRUCT; +struct Lisp_Regexp +{ + union vectorlike_header header; + Lisp_Object pattern; + Lisp_Object whitespace_regexp; + Lisp_Object syntax_table; + Lisp_Object default_match_target; + bool posix; + struct re_pattern_buffer *buffer; +} GCALIGNED_STRUCT; + +struct Lisp_Match +{ + union vectorlike_header header; + Lisp_Object haystack; + ptrdiff_t initialized_regs; + struct re_registers *regs; +} GCALIGNED_STRUCT; + struct Lisp_User_Ptr { union vectorlike_header header; @@ -4470,6 +4534,9 @@ verify (FLT_RADIX == 2 || FLT_RADIX == 16); /* Defined in sqlite.c. */ extern void syms_of_sqlite (void); +/* Defined in regex-emacs.c. */ +extern void syms_of_regexp (void); + /* Defined in xsettings.c. */ extern void syms_of_xsettings (void); @@ -5113,9 +5180,6 @@ fast_c_string_match_ignore_case (Lisp_Object regexp, ptrdiff_t, ptrdiff_t *); extern ptrdiff_t find_before_next_newline (ptrdiff_t, ptrdiff_t, ptrdiff_t, ptrdiff_t *); -extern EMACS_INT search_buffer (Lisp_Object, ptrdiff_t, ptrdiff_t, - ptrdiff_t, ptrdiff_t, EMACS_INT, - bool, Lisp_Object, Lisp_Object, bool); extern void syms_of_search (void); extern void clear_regexp_cache (void); diff --git a/src/pdumper.c b/src/pdumper.c index 53bddf91f04..3c70e590d54 100644 --- a/src/pdumper.c +++ b/src/pdumper.c @@ -1528,6 +1528,36 @@ dump_emacs_reloc_copy_from_dump (struct dump_context *ctx, dump_off dump_offset, dump_off_to_lisp (size))); } +static void +dump_remember_fixup_ptr_raw (struct dump_context *ctx, + dump_off dump_offset, + dump_off new_dump_offset); + +/* Add a byte range relocation that copies arbitrary bytes from the dump. + + When the dump is loaded, Emacs copies SIZE bytes into DUMP_OFFSET from + dump. Dump bytes are loaded from SOURCE. */ +static void +dump_cold_bytes (struct dump_context *ctx, dump_off dump_offset, + void* source, dump_off size) +{ + eassert (size >= 0); + eassert (size < (1 << EMACS_RELOC_LENGTH_BITS)); + + if (!ctx->flags.dump_object_contents) + return; + + if (size == 0) + { + eassert (source == NULL); + return; + } + eassert (source != NULL); + + dump_remember_fixup_ptr_raw (ctx, dump_offset, ctx->offset); + dump_write (ctx, source, size); +} + /* Add an Emacs relocation that sets values to arbitrary bytes. When the dump is loaded, Emacs copies SIZE bytes from the @@ -2125,6 +2155,121 @@ dump_marker (struct dump_context *ctx, const struct Lisp_Marker *marker) return finish_dump_pvec (ctx, &out->header); } +static dump_off +dump_re_pattern_buffer (struct dump_context *ctx, const struct re_pattern_buffer *bufp) +{ +#if CHECK_STRUCTS && !defined (HASH_re_pattern_buffer_36714DF24A) +# error "re_pattern_buffer changed. See CHECK_STRUCTS comment in config.h." +#endif + struct re_pattern_buffer out; + dump_object_start (ctx, &out, sizeof (out)); + if (bufp->buffer) + dump_field_fixup_later (ctx, &out, bufp, &bufp->buffer); + DUMP_FIELD_COPY (&out, bufp, allocated); + DUMP_FIELD_COPY (&out, bufp, used); + DUMP_FIELD_COPY (&out, bufp, charset_unibyte); + if (bufp->fastmap) + dump_field_fixup_later (ctx, &out, bufp, &bufp->fastmap); + dump_field_lv (ctx, &out, bufp, &bufp->translate, WEIGHT_NORMAL); + DUMP_FIELD_COPY (&out, bufp, re_nsub); + DUMP_FIELD_COPY (&out, bufp, can_be_null); + DUMP_FIELD_COPY (&out, bufp, regs_allocated); + DUMP_FIELD_COPY (&out, bufp, fastmap_accurate); + DUMP_FIELD_COPY (&out, bufp, used_syntax); + DUMP_FIELD_COPY (&out, bufp, multibyte); + DUMP_FIELD_COPY (&out, bufp, target_multibyte); + dump_off offset = dump_object_finish (ctx, &out, sizeof (out)); + if (bufp->buffer) + { + eassert (bufp->allocated > 0); + if (bufp->allocated > DUMP_OFF_MAX - 1) + error ("regex pattern buffer too large"); + dump_off total_size = ptrdiff_t_to_dump_off (bufp->allocated); + eassert (total_size > 0); + dump_cold_bytes + (ctx, + offset + dump_offsetof (struct re_pattern_buffer, buffer), + bufp->buffer, + total_size); + } + if (bufp->fastmap) + { + eassert (FASTMAP_SIZE <= DUMP_OFF_MAX - 1); + dump_off fastmap_size = FASTMAP_SIZE; + dump_cold_bytes + (ctx, + offset + dump_offsetof (struct re_pattern_buffer, fastmap), + bufp->fastmap, + fastmap_size); + } + return offset; +} + +static dump_off +dump_re_registers (struct dump_context *ctx, const struct re_registers *regs) +{ +#if CHECK_STRUCTS && !defined (HASH_re_registers_B4A76DA5D5) +# error "re_registers changed. See CHECK_STRUCTS comment in config.h." +#endif + struct re_registers out; + dump_object_start (ctx, &out, sizeof (out)); + eassert (regs->num_regs > 0); + eassert (regs->start); + eassert (regs->end); + DUMP_FIELD_COPY (&out, regs, num_regs); + dump_field_fixup_later (ctx, &out, regs, ®s->start); + dump_field_fixup_later (ctx, &out, regs, ®s->end); + dump_off offset = dump_object_finish (ctx, &out, sizeof (out)); + dump_off total_size = ptrdiff_t_to_dump_off (regs->num_regs * sizeof (ptrdiff_t)); + eassert (total_size > 0); + dump_cold_bytes + (ctx, + offset + dump_offsetof (struct re_registers, start), + regs->start, + total_size); + dump_cold_bytes + (ctx, + offset + dump_offsetof (struct re_registers, end), + regs->end, + total_size); + return offset; +} + +static dump_off +dump_match (struct dump_context *ctx, const struct Lisp_Match *match) +{ +#if CHECK_STRUCTS && !defined (HASH_Lisp_Match_EE9D54EA09) +# error "Lisp_Match changed. See CHECK_STRUCTS comment in config.h." +#endif + START_DUMP_PVEC (ctx, &match->header, struct Lisp_Match, out); + dump_pseudovector_lisp_fields (ctx, &out->header, &match->header); + dump_field_fixup_later (ctx, &out, match, &match->regs); + dump_off offset = finish_dump_pvec (ctx, &out->header); + dump_remember_fixup_ptr_raw + (ctx, + offset + dump_offsetof (struct Lisp_Match, regs), + dump_re_registers (ctx, match->regs)); + return offset; +} + +static dump_off +dump_regexp (struct dump_context *ctx, const struct Lisp_Regexp *regexp) +{ +#if CHECK_STRUCTS && !defined (HASH_Lisp_Regexp_29DF51A9AC) +# error "Lisp_Regexp changed. See CHECK_STRUCTS comment in config.h." +#endif + START_DUMP_PVEC (ctx, ®exp->header, struct Lisp_Regexp, out); + dump_pseudovector_lisp_fields (ctx, &out->header, ®exp->header); + DUMP_FIELD_COPY (out, regexp, posix); + dump_field_fixup_later (ctx, &out, regexp, ®exp->buffer); + dump_off offset = finish_dump_pvec (ctx, &out->header); + dump_remember_fixup_ptr_raw + (ctx, + offset + dump_offsetof (struct Lisp_Regexp, buffer), + dump_re_pattern_buffer (ctx, regexp->buffer)); + return offset; +} + static dump_off dump_interval_node (struct dump_context *ctx, struct itree_node *node) { @@ -2135,6 +2280,7 @@ dump_interval_node (struct dump_context *ctx, struct itree_node *node) dump_object_start (ctx, &out, sizeof (out)); if (node->parent) dump_field_fixup_later (ctx, &out, node, &node->parent); + /* FIXME: should these both be &node->{left,right} instead of &node->parent? */ if (node->left) dump_field_fixup_later (ctx, &out, node, &node->parent); if (node->right) @@ -3065,7 +3211,7 @@ dump_vectorlike (struct dump_context *ctx, Lisp_Object lv, dump_off offset) { -#if CHECK_STRUCTS && !defined HASH_pvec_type_99104541E2 +#if CHECK_STRUCTS && !defined HASH_pvec_type_A379ED4BDA # error "pvec_type changed. See CHECK_STRUCTS comment in config.h." #endif const struct Lisp_Vector *v = XVECTOR (lv); @@ -3127,6 +3273,10 @@ dump_vectorlike (struct dump_context *ctx, #ifdef HAVE_TREE_SITTER return dump_treesit_compiled_query (ctx, XTS_COMPILED_QUERY (lv)); #endif + case PVEC_REGEXP: + return dump_regexp (ctx, XREGEXP (lv)); + case PVEC_MATCH: + return dump_match (ctx, XMATCH (lv)); case PVEC_WINDOW_CONFIGURATION: case PVEC_OTHER: case PVEC_XWIDGET: @@ -3462,11 +3612,11 @@ dump_cold_string (struct dump_context *ctx, Lisp_Object string) error ("string too large"); dump_off total_size = ptrdiff_t_to_dump_off (SBYTES (string) + 1); eassert (total_size > 0); - dump_remember_fixup_ptr_raw + dump_cold_bytes (ctx, string_offset + dump_offsetof (struct Lisp_String, u.s.data), - ctx->offset); - dump_write (ctx, XSTRING (string)->u.s.data, total_size); + XSTRING (string)->u.s.data, + total_size); } static void @@ -3475,12 +3625,12 @@ dump_cold_charset (struct dump_context *ctx, Lisp_Object data) /* Dump charset lookup tables. */ int cs_i = XFIXNUM (XCAR (data)); dump_off cs_dump_offset = dump_off_from_lisp (XCDR (data)); - dump_remember_fixup_ptr_raw + struct charset *cs = charset_table + cs_i; + dump_cold_bytes (ctx, cs_dump_offset + dump_offsetof (struct charset, code_space_mask), - ctx->offset); - struct charset *cs = charset_table + cs_i; - dump_write (ctx, cs->code_space_mask, 256); + cs->code_space_mask, + 256); } static void @@ -3501,11 +3651,11 @@ dump_cold_buffer (struct dump_context *ctx, Lisp_Object data) + 1; if (nbytes > DUMP_OFF_MAX) error ("buffer too large"); - dump_remember_fixup_ptr_raw + dump_cold_bytes (ctx, buffer_offset + dump_offsetof (struct buffer, own_text.beg), - ctx->offset); - dump_write (ctx, b->own_text.beg, ptrdiff_t_to_dump_off (nbytes)); + b->own_text.beg, + ptrdiff_t_to_dump_off (nbytes)); } static void diff --git a/src/print.c b/src/print.c index 8f28b14e8b6..c87630640c3 100644 --- a/src/print.c +++ b/src/print.c @@ -2084,6 +2084,54 @@ print_vectorlike_unreadable (Lisp_Object obj, Lisp_Object printcharfun, } return; + case PVEC_REGEXP: + { + struct Lisp_Regexp *r = XREGEXP (obj); + print_c_string ("#buffer->re_nsub); + strout (buf, i, i, printcharfun); + print_c_string (" translate=", printcharfun); + print_object (r->buffer->translate, printcharfun, escapeflag); + print_c_string (" whitespace=", printcharfun); + print_object (r->whitespace_regexp, printcharfun, escapeflag); + print_c_string (" syntax_table=", printcharfun); + print_object (r->syntax_table, printcharfun, escapeflag); + if (r->posix) + { + print_c_string (" posix=true", printcharfun); + } + else + { + print_c_string (" posix=false", printcharfun); + } + print_c_string (">", printcharfun); + } + return; + + case PVEC_MATCH: + { + struct Lisp_Match *m = XMATCH (obj); + ptrdiff_t num_regs = m->regs->num_regs; + ptrdiff_t initialized_regs = m->initialized_regs; + print_c_string ("#regs->start[reg_index], + m->regs->end[reg_index]); + strout (buf, i, i, printcharfun); + } + print_c_string ("]>", printcharfun); + } + return; + case PVEC_OBARRAY: { struct Lisp_Obarray *o = XOBARRAY (obj); diff --git a/src/regex-emacs.c b/src/regex-emacs.c index 92dbdbecbf1..8fe76a70ba9 100644 --- a/src/regex-emacs.c +++ b/src/regex-emacs.c @@ -32,6 +32,7 @@ #include "character.h" #include "buffer.h" #include "syntax.h" +#include "charset.h" #include "category.h" #include "dispextern.h" @@ -191,7 +192,7 @@ #define BYTEWIDTH 8 /* In bits. */ re_char *string1, ptrdiff_t size1, re_char *string2, ptrdiff_t size2, ptrdiff_t pos, - struct re_registers *regs, + struct regexp_match_info *info, ptrdiff_t stop); /* These are the command codes that appear in compiled regular @@ -468,6 +469,8 @@ print_fastmap (FILE *dest, char *fastmap) bool was_a_range = false; int i = 0; + /* FIXME: unify "1 << BYTEWIDTH" and "FASTMAP_SIZE" defines (both are + the same value = 256). */ while (i < (1 << BYTEWIDTH)) { if (fastmap[i++]) @@ -3373,10 +3376,11 @@ re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs, ptrdiff_t re_search (struct re_pattern_buffer *bufp, const char *string, ptrdiff_t size, - ptrdiff_t startpos, ptrdiff_t range, struct re_registers *regs) + ptrdiff_t startpos, ptrdiff_t range, + struct regexp_match_info *info) { return re_search_2 (bufp, NULL, 0, string, size, startpos, range, - regs, size); + info, size); } /* Address of POS in the concatenation of virtual string. */ @@ -3408,7 +3412,7 @@ #define POS_ADDR_VSTRING(POS) \ re_search_2 (struct re_pattern_buffer *bufp, const char *str1, ptrdiff_t size1, const char *str2, ptrdiff_t size2, ptrdiff_t startpos, ptrdiff_t range, - struct re_registers *regs, ptrdiff_t stop) + struct regexp_match_info *info, ptrdiff_t stop) { ptrdiff_t val; re_char *string1 = (re_char *) str1; @@ -3577,7 +3581,7 @@ re_search_2 (struct re_pattern_buffer *bufp, const char *str1, ptrdiff_t size1, return -1; val = re_match_2_internal (bufp, string1, size1, string2, size2, - startpos, regs, stop); + startpos, info, stop); if (val >= 0) return startpos; @@ -4047,15 +4051,18 @@ mutually_exclusive_p (struct re_pattern_buffer *bufp, re_char *p1, re_match_2 (struct re_pattern_buffer *bufp, char const *string1, ptrdiff_t size1, char const *string2, ptrdiff_t size2, - ptrdiff_t pos, struct re_registers *regs, ptrdiff_t stop) + ptrdiff_t pos, struct regexp_match_info *info, + ptrdiff_t stop) { ptrdiff_t result; + eassert (info != NULL); - RE_SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, pos); + if (!info->match) + RE_SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, pos); result = re_match_2_internal (bufp, (re_char *) string1, size1, (re_char *) string2, size2, - pos, regs, stop); + pos, info, stop); return result; } @@ -4072,11 +4079,14 @@ unwind_re_match (void *ptr) re_match_2_internal (struct re_pattern_buffer *bufp, re_char *string1, ptrdiff_t size1, re_char *string2, ptrdiff_t size2, - ptrdiff_t pos, struct re_registers *regs, ptrdiff_t stop) + ptrdiff_t pos, struct regexp_match_info *info, + ptrdiff_t stop) { eassume (0 <= size1); eassume (0 <= size2); eassume (0 <= pos && pos <= stop && stop <= size1 + size2); + eassert (info != NULL); + struct re_registers *regs = info->regs; /* General temporaries. */ int mcnt; @@ -4350,6 +4360,13 @@ re_match_2_internal (struct re_pattern_buffer *bufp, /* If caller wants register contents data back, do it. */ if (regs) { + if (info->match) + { + eassert (bufp->regs_allocated == REGS_FIXED); + eassert (num_regs <= info->regs->num_regs); + eassert (regs == info->regs); + } + /* Have the register data arrays been allocated? */ if (bufp->regs_allocated == REGS_UNALLOCATED) { /* No. So allocate them with malloc. */ @@ -4398,10 +4415,17 @@ re_match_2_internal (struct re_pattern_buffer *bufp, } } - /* If the regs structure we return has more elements than - were in the pattern, set the extra elements to -1. */ - for (ptrdiff_t reg = num_regs; reg < regs->num_regs; reg++) - regs->start[reg] = regs->end[reg] = -1; + if (info->match) + /* If the match info is a match object, don't + overwrite anything, and just reduce the matched + registers to the number of groups actually + matched. */ + info->match->initialized_regs = num_regs; + else + /* If the regs structure we return has more elements than + were in the pattern, set the extra elements to -1. */ + for (ptrdiff_t reg = num_regs; reg < regs->num_regs; reg++) + regs->start[reg] = regs->end[reg] = -1; } DEBUG_PRINT ("%td failure points pushed, %td popped (%td remain).\n", @@ -5353,3 +5377,720 @@ re_compile_pattern (const char *pattern, ptrdiff_t length, return NULL; return re_error_msgid[ret]; } + +DEFUN ("make-regexp", Fmake_regexp, Smake_regexp, 1, 3, 0, + doc: /* Compile a regexp object from string PATTERN. + +POSIX is non-nil if we want full backtracking (POSIX style) for +this pattern. +TRANSLATE is a translation table for ignoring case, or t to look up from +the current buffer at compile time if `case-fold-search' is on, or nil +for none. + +The value of `search-spaces-regexp' is looked up in order to translate +literal space characters in PATTERN. */) + (Lisp_Object pattern, Lisp_Object posix, Lisp_Object translate) +{ + const char *whitespace_regexp = NULL; + char *val = NULL; + bool is_posix = !NILP (posix); + struct Lisp_Regexp *p = NULL; + struct re_pattern_buffer *bufp = NULL; + + if (!NILP (Vsearch_spaces_regexp)) + { + CHECK_STRING (Vsearch_spaces_regexp); + whitespace_regexp = SSDATA (Vsearch_spaces_regexp); + } + + if (EQ(translate, Qt)) + translate = (!NILP (Vcase_fold_search) + ? BVAR (current_buffer, case_canon_table) + : Qnil); + + CHECK_STRING (pattern); + + bufp = xzalloc (sizeof (*bufp)); + + bufp->fastmap = xzalloc (FASTMAP_SIZE); + bufp->translate = translate; + bufp->multibyte = STRING_MULTIBYTE (pattern); + bufp->charset_unibyte = charset_unibyte; + + val = (char *) re_compile_pattern (SSDATA (pattern), SBYTES (pattern), + is_posix, whitespace_regexp, bufp); + + if (val) + { + xfree (bufp->buffer); + xfree (bufp->fastmap); + xfree (bufp); + xsignal1 (Qinvalid_regexp, build_string (val)); + } + + p = ALLOCATE_PSEUDOVECTOR (struct Lisp_Regexp, default_match_target, + PVEC_REGEXP); + p->pattern = pattern; + p->whitespace_regexp = Vsearch_spaces_regexp; + /* If the compiled pattern hard codes some of the contents of the + syntax-table, it can only be reused with *this* syntax table. */ + p->syntax_table = bufp->used_syntax ? BVAR (current_buffer, syntax_table) : Qt; + p->posix = is_posix; + + /* Allocate the match data implicitly stored in this regexp. */ + p->default_match_target = allocate_match (bufp->re_nsub); + /* Tell regex matching routines they do not need to allocate any + further memory, since we have allocated it here in advance. */ + bufp->regs_allocated = REGS_FIXED; + /* Fully initialize all fields. */ + p->buffer = bufp; + + return make_lisp_ptr (p, Lisp_Vectorlike); +} + +DEFUN ("regexpp", Fregexpp, Sregexpp, 1, 1, 0, + doc: /* Say whether OBJECT is a compiled regexp object. */) + (Lisp_Object object) +{ + return REGEXP_P (object)? Qt: Qnil; +} + +DEFUN ("regexp-get-pattern-string", Fregexp_get_pattern_string, + Sregexp_get_pattern_string, 1, 1, 0, + doc: /* Get the original pattern used to compile REGEXP. */) + (Lisp_Object regexp) +{ + CHECK_REGEXP (regexp); + return XREGEXP (regexp)->pattern; +} + +DEFUN ("regexp-posix-p", Fregexp_posix_p, Sregexp_posix_p, 1, 1, 0, + doc: /* Get whether REGEXP was compiled with posix behavior. */) + (Lisp_Object regexp) +{ + CHECK_REGEXP (regexp); + if (XREGEXP (regexp)->posix) + return Qt; + return Qnil; +} + +DEFUN ("regexp-get-whitespace-pattern", Fregexp_get_whitespace_pattern, + Sregexp_get_whitespace_pattern, 1, 1, 0, + doc: /* Get the value of `search-spaces-regexp' used to compile REGEXP. */) + (Lisp_Object regexp) +{ + CHECK_REGEXP (regexp); + return XREGEXP (regexp)->whitespace_regexp; +} + +DEFUN ("regexp-get-translation-table", Fregexp_get_translation_table, + Sregexp_get_translation_table, 1, 1, 0, + doc: /* Get the translation table for case folding used to compile REGEXP. */) + (Lisp_Object regexp) +{ + CHECK_REGEXP (regexp); + return XREGEXP (regexp)->buffer->translate; +} + +DEFUN ("regexp-get-num-subexps", Fregexp_get_num_subexps, + Sregexp_get_num_subexps, 1, 1, 0, + doc: /* Get the number of capturing groups (sub-expressions) for REGEXP. */) + (Lisp_Object regexp) +{ + CHECK_REGEXP (regexp); + return make_fixnum (XREGEXP (regexp)->buffer->re_nsub); +} + +DEFUN ("regexp-get-default-match-data", Fregexp_get_default_match_data, + Sregexp_get_default_match_data, 1, 1, 0, + doc: /* Retrieve the internal match object from REGEXP. + +This match object was created with `make-match-data' upon construction +of REGEXP with `make-regexp', and records the most recent match applied +to REGEXP unless an explicit match object was provided via an +'inhibit-modify' arg to a regexp search method. */) + (Lisp_Object regexp) +{ + CHECK_REGEXP (regexp); + return XREGEXP (regexp)->default_match_target; +} + +static void +reallocate_match_registers (ptrdiff_t re_nsub, struct Lisp_Match *m) +{ + ptrdiff_t needed_regs = re_nsub + 1; + eassert (needed_regs > 0); + struct re_registers *regs = m->regs; + /* If we need more elements than were already allocated, reallocate them. + If we need fewer, just leave it alone. */ + if (regs->num_regs < needed_regs) + { + regs->start = + xnrealloc (regs->start, needed_regs, sizeof *regs->start); + regs->end = + xnrealloc (regs->end, needed_regs, sizeof *regs->end); + + /* Clear any register data past what the current regexp can read. */ + for (ptrdiff_t i = regs->num_regs; i < needed_regs; ++i) + regs->start[i] = regs->end[i] = RE_MATCH_EXP_UNSET; + + /* Ensure we rewrite the allocation length. */ + regs->num_regs = needed_regs; + } +} + +DEFUN ("regexp-set-default-match-data", Fregexp_set_default_match_data, + Sregexp_set_default_match_data, 2, 2, 0, + doc: /* Overwrite the internal match object for REGEXP with MATCH. + +MATCH will be reallocated as necessary to contain enough match +registers for the sub-expressions in REGEXP. */) + (Lisp_Object regexp, Lisp_Object match) +{ + CHECK_REGEXP (regexp); + CHECK_MATCH (match); + struct Lisp_Regexp *r = XREGEXP (regexp); + struct re_pattern_buffer *bufp = r->buffer; + + /* This should always be true for any compiled regexp. */ + eassert (bufp->regs_allocated == REGS_FIXED); + + /* Overwrite the default target. */ + r->default_match_target = match; + /* Return nil */ + return Qnil; +} + +Lisp_Object +allocate_match (ptrdiff_t re_nsub) +{ + eassert (re_nsub >= 0); + /* Number of match registers always includes 0 for whole match. */ + ptrdiff_t num_regs = re_nsub + 1; + + struct re_registers *regs = xzalloc (sizeof (*regs)); + regs->num_regs = num_regs; + regs->start = xnmalloc (num_regs, sizeof (*regs->start)); + regs->end = xnmalloc (num_regs, sizeof (*regs->end)); + + /* Construct lisp match object. */ + struct Lisp_Match *m = ALLOCATE_PSEUDOVECTOR + (struct Lisp_Match, haystack, PVEC_MATCH); + + /* Initialize the "haystack" linked match target to nil before any + searches are performed against this match object. */ + m->haystack = Qnil; + /* Initialize all values to -1 for "unset". */ + for (ptrdiff_t reg_index = 0; reg_index < num_regs; ++reg_index) + regs->start[reg_index] = regs->end[reg_index] = RE_MATCH_EXP_UNSET; + /* No successful match has occurred yet, so nothing is initialized. */ + m->initialized_regs = 0; + + m->regs = regs; + return make_lisp_ptr (m, Lisp_Vectorlike); +} + +static ptrdiff_t +extract_re_nsub_arg (Lisp_Object regexp_or_num_registers) +{ + ptrdiff_t re_nsub; + if (NILP (regexp_or_num_registers)) + re_nsub = 0; + else if (REGEXP_P (regexp_or_num_registers)) + re_nsub = XREGEXP (regexp_or_num_registers)->buffer->re_nsub; + else + { + CHECK_FIXNAT (regexp_or_num_registers); + EMACS_INT n = XFIXNUM (regexp_or_num_registers); + if (n == 0) + error ("match data must allocate at least 1 register"); + re_nsub = (ptrdiff_t) n - 1; + } + return re_nsub; +} + +DEFUN ("make-match-data", Fmake_match_data, Smake_match_data, 0, 1, 0, + doc: /* Allocate match data for REGEXP-OR-NUM-REGISTERS. + +If REGEXP-OR-NUM-REGISTERS is a compiled regexp object, the result will +allocate as many match registers as its sub-expressions, plus 1 for the +0th group. If REGEXP-OR-NUM-REGISTERS is nil, then 1 register will be +allocated. Otherwise, REGEXP-OR-NUM-REGISTERS must be a fixnum > 0. */) + (Lisp_Object regexp_or_num_registers) +{ + return allocate_match (extract_re_nsub_arg (regexp_or_num_registers)); +} + +DEFUN ("reallocate-match-data", Freallocate_match_data, Sreallocate_match_data, + 2, 2, 0, + doc: /* Allocate REGEXP-OR-NUM-REGISTERS registers into MATCH. + +REGEXP-OR-NUM-REGISTERS is interpreted as in `make-match-data'. + +Returns MATCH. */) + (Lisp_Object match, Lisp_Object regexp_or_num_registers) +{ + CHECK_MATCH (match); + + struct Lisp_Match *m = XMATCH (match); + ptrdiff_t re_nsub = extract_re_nsub_arg (regexp_or_num_registers); + + /* Reallocate match register buffers as needed. */ + reallocate_match_registers (re_nsub, m); + + return match; +} + +static void +clear_match_data (struct Lisp_Match *m) +{ + /* Clear all registers. */ + m->initialized_regs = 0; + /* Unset the last-matched haystack. */ + m->haystack = Qnil; +} + +static struct Lisp_Match * +extract_regexp_or_match (Lisp_Object regexp_or_match) +{ + if (REGEXP_P (regexp_or_match)) + return XMATCH(XREGEXP (regexp_or_match)->default_match_target); + + CHECK_MATCH (regexp_or_match); + return XMATCH (regexp_or_match); +} + +DEFUN ("clear-match-data", Fclear_match_data, Sclear_match_data, 1, 1, 0, + doc: /* Unset all match data for REGEXP-OR-MATCH. + +If REGEXP-OR-MATCH was previously used to search against, its match +data will be reset to the default value. */) + (Lisp_Object regexp_or_match) +{ + clear_match_data (extract_regexp_or_match (regexp_or_match)); + return Qnil; +} + +DEFUN ("matchp", Fmatchp, Smatchp, 1, 1, 0, + doc: /* Say whether OBJECT is an allocated regexp match object. */) + (Lisp_Object object) +{ + return MATCH_P (object)? Qt: Qnil; +} + +DEFUN ("match-get-haystack", Fmatch_get_haystack, Smatch_get_haystack, + 1, 1, 0, + doc: /* Get the last search target from REGEXP-OR-MATCH. + +REGEXP-OR-MATCH is either a compiled regexp object which was last +searched against without providing a match object via 'inhibit-modify', +or a match object provided via 'inhibit-modify' to a search method. + +If this match object has not yet been used in a search, this will be nil. +If a search was initiated but failed, this object is unchanged. */) + (Lisp_Object regexp_or_match) +{ + return extract_regexp_or_match (regexp_or_match)->haystack; +} + +DEFUN ("match-set-haystack", Fmatch_set_haystack, Smatch_set_haystack, + 2, 2, 0, + doc: /* Set the last search target HAYSTACK into REGEXP-OR-MATCH. + +REGEXP-OR-MATCH is either a compiled regexp object which was last +searched against without providing a match object via 'inhibit-modify', +or a match object provided via 'inhibit-modify' to a search method. + +Returns the match object which was modified. */) + (Lisp_Object regexp_or_match, Lisp_Object haystack) +{ + struct Lisp_Match *match = extract_regexp_or_match (regexp_or_match); + match->haystack = haystack; + Lisp_Object ret; + XSETMATCH (ret, match); + return ret; +} + +DEFUN ("match-num-registers", Fmatch_num_registers, + Smatch_num_registers, 1, 1, 0, + doc: /* Get the number of initialized match registers for +REGEXP-OR-MATCH. + +This match object will be able to store match data for up to this +number of groups, including the 0th group for the entire match. */) + (Lisp_Object regexp_or_match) +{ + struct Lisp_Match *match = + extract_regexp_or_match (regexp_or_match); + return make_fixed_natnum (match->initialized_regs); +} + +DEFUN ("match-allocated-registers", Fmatch_allocated_registers, + Smatch_allocated_registers, 1, 1, 0, + doc: /* Get the number of allocated registers for REGEXP-OR-MATCH. + +If this value is less than the result of +`regexp-get-num-subexps' + 1 for some compiled regexp, then +`reallocate-match-data' will have to reallocate. + +This value will always be at least as large as the result of +`match-num-registers'. */) + (Lisp_Object regexp_or_match) +{ + struct Lisp_Match *match = + extract_regexp_or_match (regexp_or_match); + return make_fixed_natnum (match->regs->num_regs); +} + +static ptrdiff_t +extract_group_index (Lisp_Object group) +{ + if (NILP (group)) + return 0; + + CHECK_FIXNAT (group); + return XFIXNAT (group); +} + +static ptrdiff_t +index_into_registers (struct Lisp_Match *m, ptrdiff_t group_index, + bool beginningp) +{ + if (group_index >= m->initialized_regs) + error ("group %ld was out of bounds for match data with %ld registers", + group_index, m->initialized_regs); + return ((beginningp) + ? m->regs->start[group_index] + : m->regs->end[group_index]); +} + +DEFUN ("match-register-start", Fmatch_register_start, Smatch_register_start, + 1, 2, 0, + doc: /* Return position of start of text matched by last search. + +REGEXP-OR-MATCH is either a compiled regexp object which was last +searched against without providing a match object via 'inhibit-modify', +or a match object provided via 'inhibit-modify' to a search method. + +GROUP, a number, specifies the parenthesized subexpression in the last + regexp for which to return the start position. +Value is nil if GROUPth subexpression didn't match, or there were fewer + than GROUP subexpressions. +GROUP zero or nil means the entire text matched by the whole regexp or whole + string. + +Return value is undefined if the last search failed, as a failed search +does not update match data. */) + (Lisp_Object regexp_or_match, Lisp_Object group) +{ + struct Lisp_Match *m = extract_regexp_or_match (regexp_or_match); + ptrdiff_t group_index = extract_group_index (group); + + ptrdiff_t input_index = index_into_registers (m, group_index, true); + + if (input_index < 0) + return Qnil; + return make_fixed_natnum (input_index); +} + +DEFUN ("match-register-end", Fmatch_register_end, Smatch_register_end, + 1, 2, 0, + doc: /* Return position of end of text matched by last search. + +REGEXP-OR-MATCH is either a compiled regexp object which was last +searched against without providing a match object via 'inhibit-modify', +or a match object provided via 'inhibit-modify' to a search method. + +GROUP, a number, specifies the parenthesized subexpression in the last + regexp for which to return the end position. +Value is nil if GROUPth subexpression didn't match, or there were fewer + than GROUP subexpressions. +GROUP zero or nil means the entire text matched by the whole regexp or whole + string. + +Return value is undefined if the last search failed, as a failed search +does not update match data. */) + (Lisp_Object regexp_or_match, Lisp_Object group) +{ + struct Lisp_Match *m = extract_regexp_or_match (regexp_or_match); + ptrdiff_t group_index = extract_group_index (group); + + ptrdiff_t input_index = index_into_registers (m, group_index, false); + + if (input_index < 0) + return Qnil; + return make_fixed_natnum (input_index); +} + +static ptrdiff_t +extract_required_vector_length (struct Lisp_Match *m, Lisp_Object max_group) +{ + if (NILP (max_group)) + return m->initialized_regs; + + CHECK_FIXNAT (max_group); + ptrdiff_t max_group_index = XFIXNAT (max_group); + + if (max_group_index >= m->initialized_regs) + error ("max group %ld was out of bounds for match data with %ld registers", + max_group_index, m->initialized_regs); + + return max_group_index + 1; +} + +static Lisp_Object +ensure_match_result_vector (ptrdiff_t result_length, Lisp_Object out) +{ + if (NILP (out)) + return make_vector (result_length, Qnil); + + CHECK_VECTOR (out); + if (ASIZE (out) < result_length) + error ("needed at least %ld entries in out vector, but got %ld", + result_length, ASIZE (out)); + return out; +} + +DEFUN ("match-allocate-results", Fmatch_allocate_results, + Smatch_allocate_results, 1, 2, 0, + doc: /* Allocate a vector in OUT large enough for REGEXP-OR-MATCH. + +The result will be large enough to use in e.g. `match-extract-starts' +without having to allocate a new vector. + +Returns OUT. */) + (Lisp_Object regexp_or_match, Lisp_Object out) +{ + ptrdiff_t result_length; + if (REGEXP_P (regexp_or_match)) + result_length = 1 + XREGEXP (regexp_or_match)->buffer->re_nsub; + else + { + CHECK_MATCH (regexp_or_match); + result_length = XMATCH (regexp_or_match)->regs->num_regs; + } + out = ensure_match_result_vector (result_length, out); + + return out; +} + +static void +write_positions_to_vector (ptrdiff_t result_length, ptrdiff_t *positions, + Lisp_Object out) +{ + for (ptrdiff_t i = 0; i < result_length; ++i) + { + ptrdiff_t cur_pos = positions[i]; + if (cur_pos < 0) + ASET (out, i, Qnil); + else + ASET (out, i, make_fixed_natnum (cur_pos)); + } +} + +DEFUN ("match-extract-starts", Fmatch_extract_starts, Smatch_extract_starts, + 1, 3, 0, + doc: /* Write match starts to a vector. + +REGEXP-OR-MATCH is either a compiled regexp object which was last +searched against without providing a match object via 'inhibit-modify', +or a match object provided via 'inhibit-modify' to a search method. + +OUT may be a preallocated vector with `make-vector', which must have at +least MAX-GROUP + 1 elements. If nil, a new vector is created. + +MAX-GROUP, a number, specifies the maximum match group index to +write to the output. If nil, write all matches. + +Returns OUT. */) + (Lisp_Object regexp_or_match, Lisp_Object out, Lisp_Object max_group) +{ + struct Lisp_Match *m = extract_regexp_or_match (regexp_or_match); + ptrdiff_t result_length = + extract_required_vector_length (m, max_group); + out = ensure_match_result_vector (result_length, out); + + write_positions_to_vector (result_length, m->regs->start, out); + + return out; +} + +DEFUN ("match-extract-ends", Fmatch_extract_ends, Smatch_extract_ends, + 1, 3, 0, + doc: /* Write match ends to a vector. + +REGEXP-OR-MATCH is either a compiled regexp object which was last +searched against without providing a match object via 'inhibit-modify', +or a match object provided via 'inhibit-modify' to a search method. + +OUT may be a preallocated vector with `make-vector', which must have at +least MAX-GROUP + 1 elements. If nil, a new vector is created. + +MAX-GROUP, a number, specifies the maximum match group index to +write to the output. If nil, write all matches. + +Returns OUT. */) + (Lisp_Object regexp_or_match, Lisp_Object out, Lisp_Object max_group) +{ + struct Lisp_Match *m = extract_regexp_or_match (regexp_or_match); + ptrdiff_t result_length = + extract_required_vector_length (m, max_group); + out = ensure_match_result_vector (result_length, out); + + write_positions_to_vector (result_length, m->regs->end, out); + + return out; +} + +static void +reset_all_marks (Lisp_Object out) +{ + CHECK_VECTOR (out); + for (ptrdiff_t i = 0; i < ASIZE (out); ++i) + { + Lisp_Object maybe_mark = AREF (out, i); + if (MARKERP (maybe_mark)) + { + unchain_marker (XMARKER (maybe_mark)); + ASET (out, i, Qnil); + } + } +} + +static void +write_marks_to_vector (ptrdiff_t result_length, ptrdiff_t *positions, + Lisp_Object buffer, Lisp_Object out) +{ + for (ptrdiff_t i = 0; i < result_length; ++i) + { + ptrdiff_t cur_pos = positions[i]; + if (cur_pos < 0) + ASET (out, i, Qnil); + else + { + Lisp_Object new_marker = Fmake_marker (); + Fset_marker (new_marker, + make_fixed_natnum (cur_pos), + buffer); + ASET (out, i, new_marker); + } + } +} + +DEFUN ("match-extract-start-marks", Fmatch_extract_start_marks, + Smatch_extract_start_marks, 1, 4, 0, + doc: /* Write match starts to a vector. + +REGEXP-OR-MATCH is either a compiled regexp object which was last +searched against without providing a match object via 'inhibit-modify', +or a match object provided via 'inhibit-modify' to a search method. The +last search must have been performed against a buffer. + +OUT may be a preallocated vector with `make-vector', which must have at +least MAX-GROUP + 1 elements. If nil, a new vector is created. + +MAX-GROUP, a number, specifies the maximum match group index to +write to the output. If nil, write all matches. + +If RESEAT is non-nil, any previous markers on OUT will be modified to +point to nowhere. + +Returns OUT. */) + (Lisp_Object regexp_or_match, Lisp_Object out, Lisp_Object max_group, + Lisp_Object reseat) +{ + if (!NILP (reseat)) + { + if (!EQ (reseat, Qt)) + error ("RESEAT must be t or nil"); + reset_all_marks (out); + } + + struct Lisp_Match *m = extract_regexp_or_match (regexp_or_match); + Lisp_Object buffer = m->haystack; + CHECK_BUFFER (buffer); + + ptrdiff_t result_length = + extract_required_vector_length (m, max_group); + out = ensure_match_result_vector (result_length, out); + + write_marks_to_vector (result_length, m->regs->start, buffer, out); + + return out; +} + +DEFUN ("match-extract-end-marks", Fmatch_extract_end_marks, + Smatch_extract_end_marks, 1, 4, 0, + doc: /* Write match ends to a vector. + +REGEXP-OR-MATCH is either a compiled regexp object which was last +searched against without providing a match object via 'inhibit-modify', +or a match object provided via 'inhibit-modify' to a search method. The +last search must have been performed against a buffer. + +OUT may be a preallocated vector with `make-vector', which must have at +least MAX-GROUP + 1 elements. If nil, a new vector is created. + +MAX-GROUP, a number, specifies the maximum match group index to +write to the output. If nil, write all matches. + +If RESEAT is non-nil, any previous markers on OUT will be modified to +point to nowhere. + +Returns OUT. */) + (Lisp_Object regexp_or_match, Lisp_Object out, Lisp_Object max_group, + Lisp_Object reseat) +{ + if (!NILP (reseat)) + { + if (!EQ (reseat, Qt)) + error ("RESEAT must be t or nil"); + reset_all_marks (out); + } + + struct Lisp_Match *m = extract_regexp_or_match (regexp_or_match); + Lisp_Object buffer = m->haystack; + CHECK_BUFFER (buffer); + + ptrdiff_t result_length = + extract_required_vector_length (m, max_group); + out = ensure_match_result_vector (result_length, out); + + write_marks_to_vector (result_length, m->regs->end, buffer, out); + + return out; +} + +void syms_of_regexp (void) +{ + defsubr (&Smake_regexp); + defsubr (&Sregexpp); + defsubr (&Sregexp_get_pattern_string); + defsubr (&Sregexp_posix_p); + defsubr (&Sregexp_get_whitespace_pattern); + defsubr (&Sregexp_get_translation_table); + defsubr (&Sregexp_get_num_subexps); + defsubr (&Sregexp_get_default_match_data); + defsubr (&Sregexp_set_default_match_data); + defsubr (&Smake_match_data); + defsubr (&Sreallocate_match_data); + defsubr (&Sclear_match_data); + defsubr (&Smatchp); + defsubr (&Smatch_get_haystack); + defsubr (&Smatch_set_haystack); + defsubr (&Smatch_num_registers); + defsubr (&Smatch_allocated_registers); + defsubr (&Smatch_register_start); + defsubr (&Smatch_register_end); + defsubr (&Smatch_allocate_results); + defsubr (&Smatch_extract_starts); + defsubr (&Smatch_extract_ends); + defsubr (&Smatch_extract_start_marks); + defsubr (&Smatch_extract_end_marks); + + /* New symbols necessary for cl-type checking. */ + DEFSYM (Qregexp, "regexp"); + DEFSYM (Qregexpp, "regexpp"); + DEFSYM (Qmatch, "match"); + DEFSYM (Qmatchp, "matchp"); +} diff --git a/src/regex-emacs.h b/src/regex-emacs.h index 2402e539e64..00beeaf1867 100644 --- a/src/regex-emacs.h +++ b/src/regex-emacs.h @@ -21,6 +21,8 @@ #define EMACS_REGEX_H 1 #include +#define RE_MATCH_EXP_UNSET (-1) + /* This is the structure we store register match data in. Declare this before including lisp.h, since lisp.h (via thread.h) uses struct re_registers. */ @@ -33,6 +35,33 @@ #define EMACS_REGEX_H 1 #include "lisp.h" +struct regexp_match_info +{ + struct re_registers *regs; + struct Lisp_Match *match; +}; + +INLINE_HEADER_BEGIN +INLINE struct regexp_match_info +empty_regexp_match_info (void) +{ + struct regexp_match_info ret = { .regs = NULL, .match = NULL }; + return ret; +} + +INLINE struct regexp_match_info +make_regs_only_match_info (struct re_registers* regs) +{ + struct regexp_match_info ret = { .regs = regs, .match = NULL }; + return ret; +} +INLINE_HEADER_END + +/* Defined in search.c. */ +extern EMACS_INT search_buffer (Lisp_Object, ptrdiff_t, ptrdiff_t, + ptrdiff_t, ptrdiff_t, EMACS_INT, + bool, bool, struct regexp_match_info*); + /* The string or buffer being matched. It is used for looking up syntax properties. @@ -52,6 +81,9 @@ #define EMACS_REGEX_H 1 /* Roughly the maximum number of failure points on the stack. */ extern ptrdiff_t emacs_re_max_failures; +/* The size of an allocation for a fastmap. */ +#define FASTMAP_SIZE 0400 + /* Amount of memory that we can safely stack allocate. */ extern ptrdiff_t emacs_re_safe_alloca; @@ -139,7 +171,7 @@ #define EMACS_REGEX_H 1 extern ptrdiff_t re_search (struct re_pattern_buffer *buffer, const char *string, ptrdiff_t length, ptrdiff_t start, ptrdiff_t range, - struct re_registers *regs); + struct regexp_match_info* info); /* Like 're_search', but search in the concatenation of STRING1 and @@ -148,7 +180,7 @@ #define EMACS_REGEX_H 1 const char *string1, ptrdiff_t length1, const char *string2, ptrdiff_t length2, ptrdiff_t start, ptrdiff_t range, - struct re_registers *regs, + struct regexp_match_info* info, ptrdiff_t stop); @@ -157,7 +189,7 @@ #define EMACS_REGEX_H 1 extern ptrdiff_t re_match_2 (struct re_pattern_buffer *buffer, const char *string1, ptrdiff_t length1, const char *string2, ptrdiff_t length2, - ptrdiff_t start, struct re_registers *regs, + ptrdiff_t start, struct regexp_match_info* info, ptrdiff_t stop); diff --git a/src/search.c b/src/search.c index 2ff8b0599c4..cee245ab0b2 100644 --- a/src/search.c +++ b/src/search.c @@ -50,7 +50,7 @@ #define REGEXP_CACHE_SIZE 20 for any syntax-table. */ Lisp_Object syntax_table; struct re_pattern_buffer buf; - char fastmap[0400]; + char fastmap[FASTMAP_SIZE]; /* True means regexp was compiled to do full POSIX backtracking. */ bool posix; /* True means we're inside a buffer match. */ @@ -63,6 +63,8 @@ #define REGEXP_CACHE_SIZE 20 /* The head of the linked list; points to the most recently used buffer. */ static struct regexp_cache *searchbuf_head; +static void +set_regs (ptrdiff_t beg_byte, ptrdiff_t nbytes, struct re_registers *regs); static void set_search_regs (ptrdiff_t, ptrdiff_t); static void save_search_regs (void); static EMACS_INT simple_search (EMACS_INT, unsigned char *, ptrdiff_t, @@ -181,6 +183,8 @@ freeze_pattern (struct regexp_cache *searchbuf) { eassert (!searchbuf->busy); record_unwind_protect_ptr (unfreeze_pattern, searchbuf); + eassert (searchbuf != NULL); + assume (searchbuf); searchbuf->busy = true; } @@ -260,37 +264,179 @@ compile_pattern (Lisp_Object pattern, struct re_registers *regp, } -static Lisp_Object -looking_at_1 (Lisp_Object string, bool posix, bool modify_data) +static struct regexp_match_info +resolve_match_info (Lisp_Object regexp, Lisp_Object match) { - Lisp_Object val; - unsigned char *p1, *p2; - ptrdiff_t s1, s2; - register ptrdiff_t i; + /* If inhibited, neither read nor write anything, and immediately return. */ + if (!NILP (Vinhibit_changing_match_data)) + return empty_regexp_match_info (); + /* If a compiled regexp, we don't touch any thread-local state. */ + if (REGEXP_P (regexp)) + { + struct Lisp_Regexp *r = XREGEXP (regexp); + struct Lisp_Match *m = NULL; + /* If a match object was provided, use it. */ + if (MATCH_P (match)) + m = XMATCH (match); + else + { + eassert (NILP (match)); + /* Otherwise, use the built-in match target we constructed + when we compiled the regexp. */ + m = XMATCH (r->default_match_target); + } + struct regexp_match_info ret = { + .regs = m->regs, + .match = m, + }; + return ret; + } + + /* If just a string, do the old complex logic to access thread-locals + and save state as needed. Incompatible with providing a preallocated + object. */ + CHECK_STRING (regexp); + eassert (NILP (match)); if (running_asynch_code) save_search_regs (); + return make_regs_only_match_info (&search_regs); +} + +/* Patch up byte- vs char- indices, and set pointers to the last thing + searched. */ +static void +record_string_haystack (struct regexp_match_info *info, + Lisp_Object haystack) +{ + eassert (info != NULL); + CHECK_STRING (haystack); + + struct re_registers *regs = info->regs; + eassert (regs != NULL); + + ptrdiff_t num_matched_regs + = ((info->match) ? info->match->initialized_regs + : regs->num_regs); + /* The number of matched groups is at least 1, but may be less than + the total allocated space for match groups. */ + eassert (num_matched_regs > 0); + eassert (num_matched_regs <= regs->num_regs); + + /* Patch up the byte indices from the regex engine to refer to + utf-8/multibyte char indices. */ + for (ptrdiff_t i = 0; i < num_matched_regs; ++i) + { + ptrdiff_t start_byte = regs->start[i]; + ptrdiff_t end_byte = regs->end[i]; + if (start_byte < 0) + /* NB: Ignore this: this is a failed match. */ + eassert (end_byte < 0); + else + { + /* TODO: can we trust the output of the regex engine somehow + to avoid relying on the string char decoding cache here? */ + regs->start[i] = string_byte_to_char (haystack, start_byte); + regs->end[i] = string_byte_to_char (haystack, end_byte); + } + } + + if (info->match) + info->match->haystack = haystack; + else + last_thing_searched = Qt; +} + +static void +record_current_buffer_haystack (struct regexp_match_info *info) +{ + eassert (info != NULL); + + struct re_registers *regs = info->regs; + eassert (regs != NULL); + + ptrdiff_t num_matched_regs + = ((info->match) ? info->match->initialized_regs + : regs->num_regs); + /* The number of matched groups is at least 1, but may be less than + the total allocated space for match groups. */ + eassert (num_matched_regs > 0); + eassert (num_matched_regs <= regs->num_regs); + + /* Patch up the byte indices from the regex engine to refer to + utf-8/multibyte char indices. */ + for (ptrdiff_t i = 0; i < num_matched_regs; ++i) + { + ptrdiff_t start_byte = regs->start[i]; + ptrdiff_t end_byte = regs->end[i]; + if (start_byte < 0) + /* NB: Ignore this: this is a failed match. */ + eassert (end_byte < 0); + else + { + regs->start[i] = BYTE_TO_CHAR (start_byte + BEGV_BYTE); + regs->end[i] = BYTE_TO_CHAR (end_byte + BEGV_BYTE); + } + } + + if (info->match) + XSETBUFFER (info->match->haystack, current_buffer); + else + XSETBUFFER (last_thing_searched, current_buffer); +} + +static struct re_pattern_buffer * +resolve_explicit_compiled_regexp (Lisp_Object regexp, bool posix, + struct re_registers *regs, + Lisp_Object translate, bool multibyte) +{ + /* If the regexp is precompiled, then immediately return its compiled form. */ + if (REGEXP_P (regexp)) + return XREGEXP (regexp)->buffer; + + /* Otherwise, this is a string, and we have to compile it via the cache. */ + CHECK_STRING (regexp); + + /* Compile this string into a regexp via the cache. */ + struct regexp_cache *cache_entry = compile_pattern ( + regexp, regs, translate, posix, multibyte); + + /* Do a pending quit right away, to avoid paradoxical behavior */ + maybe_quit (); + + /* Mark the compiled pattern as busy. */ + freeze_pattern (cache_entry); + return &cache_entry->buf; +} + +static struct re_pattern_buffer * +resolve_compiled_regexp (Lisp_Object regexp, bool posix) +{ /* This is so set_image_of_range_1 in regex-emacs.c can find the EQV table. */ set_char_table_extras (BVAR (current_buffer, case_canon_table), 2, BVAR (current_buffer, case_eqv_table)); - CHECK_STRING (string); + Lisp_Object translate = (!NILP (Vcase_fold_search) + ? BVAR (current_buffer, case_canon_table) + : Qnil); + bool multibyte = !NILP (BVAR (current_buffer, enable_multibyte_characters)); - /* Snapshot in case Lisp changes the value. */ - bool modify_match_data = NILP (Vinhibit_changing_match_data) && modify_data; + return resolve_explicit_compiled_regexp + (regexp, posix, &search_regs, translate, multibyte); +} - struct regexp_cache *cache_entry = compile_pattern ( - string, - modify_match_data ? &search_regs : NULL, - (!NILP (Vcase_fold_search) - ? BVAR (current_buffer, case_canon_table) : Qnil), - posix, - !NILP (BVAR (current_buffer, enable_multibyte_characters))); +static Lisp_Object +looking_at_1 (Lisp_Object regexp, bool posix, struct regexp_match_info *info) +{ + Lisp_Object val; + unsigned char *p1, *p2; + ptrdiff_t s1, s2; + register ptrdiff_t i; - /* Do a pending quit right away, to avoid paradoxical behavior */ - maybe_quit (); + eassert (info != NULL); + struct re_registers *regs = info->regs; /* Get pointers and sizes of the two strings that make up the visible portion of the buffer. */ @@ -312,12 +458,14 @@ looking_at_1 (Lisp_Object string, bool posix, bool modify_data) } specpdl_ref count = SPECPDL_INDEX (); + struct re_pattern_buffer *bufp = + resolve_compiled_regexp (regexp, posix); freeze_buffer_relocation (); - freeze_pattern (cache_entry); + re_match_object = Qnil; - i = re_match_2 (&cache_entry->buf, (char *) p1, s1, (char *) p2, s2, + i = re_match_2 (bufp, (char *) p1, s1, (char *) p2, s2, PT_BYTE - BEGV_BYTE, - modify_match_data ? &search_regs : NULL, + info, ZV_BYTE - BEGV_BYTE); if (i == -2) @@ -326,32 +474,37 @@ looking_at_1 (Lisp_Object string, bool posix, bool modify_data) matcher_overflow (); } - val = (i >= 0 ? Qt : Qnil); - if (modify_match_data && i >= 0) - { - for (i = 0; i < search_regs.num_regs; i++) - if (search_regs.start[i] >= 0) - { - search_regs.start[i] - = BYTE_TO_CHAR (search_regs.start[i] + BEGV_BYTE); - search_regs.end[i] - = BYTE_TO_CHAR (search_regs.end[i] + BEGV_BYTE); - } - /* Set last_thing_searched only when match data is changed. */ - XSETBUFFER (last_thing_searched, current_buffer); - } + /* Set last_thing_searched only when match data is changed. */ + if (regs && i >= 0) + record_current_buffer_haystack (info); + val = (i >= 0 ? Qt : Qnil); return unbind_to (count, val); } +static struct regexp_match_info +resolve_buffer_match_info (Lisp_Object regexp, Lisp_Object inhibit_modify) +{ + if (EQ (inhibit_modify, Qt)) + return empty_regexp_match_info (); + + return resolve_match_info (regexp, inhibit_modify); +} + DEFUN ("looking-at", Flooking_at, Slooking_at, 1, 2, 0, doc: /* Return t if text after point matches regular expression REGEXP. By default, this function modifies the match data that `match-beginning', `match-end' and `match-data' access. If -INHIBIT-MODIFY is non-nil, don't modify the match data. */) +INHIBIT-MODIFY is t, don't modify the match data. + +If REGEXP is a compiled regexp and INHIBIT-MODIFY is a match object, +then write match data into INHIBIT-MODIFY. Otherwise, INHIBIT-MODIFY +must be nil, and the default match target within REGEXP is used. */) (Lisp_Object regexp, Lisp_Object inhibit_modify) { - return looking_at_1 (regexp, 0, NILP (inhibit_modify)); + struct regexp_match_info info = resolve_buffer_match_info + (regexp, inhibit_modify); + return looking_at_1 (regexp, 0, &info); } DEFUN ("posix-looking-at", Fposix_looking_at, Sposix_looking_at, 1, 2, 0, @@ -360,25 +513,30 @@ DEFUN ("posix-looking-at", Fposix_looking_at, Sposix_looking_at, 1, 2, 0, By default, this function modifies the match data that `match-beginning', `match-end' and `match-data' access. If -INHIBIT-MODIFY is non-nil, don't modify the match data. */) +INHIBIT-MODIFY is t, don't modify the match data. + +If REGEXP is a compiled regexp and INHIBIT-MODIFY is a match object, +then write match data into INHIBIT-MODIFY. Otherwise, INHIBIT-MODIFY +must be nil, and the default match target within REGEXP is used. */) (Lisp_Object regexp, Lisp_Object inhibit_modify) { - return looking_at_1 (regexp, 1, NILP (inhibit_modify)); + struct regexp_match_info info = resolve_buffer_match_info + (regexp, inhibit_modify); + return looking_at_1 (regexp, 1, &info); } static Lisp_Object string_match_1 (Lisp_Object regexp, Lisp_Object string, Lisp_Object start, - bool posix, bool modify_data) + bool posix, struct regexp_match_info *info) { ptrdiff_t val; EMACS_INT pos; - ptrdiff_t pos_byte, i; - bool modify_match_data = NILP (Vinhibit_changing_match_data) && modify_data; + ptrdiff_t pos_byte; - if (running_asynch_code) - save_search_regs (); + eassert (info != NULL); + struct re_registers *regs = info->regs; + struct re_pattern_buffer *bufp = NULL; - CHECK_STRING (regexp); CHECK_STRING (string); if (NILP (start)) @@ -396,49 +554,36 @@ string_match_1 (Lisp_Object regexp, Lisp_Object string, Lisp_Object start, pos_byte = string_char_to_byte (string, pos); } - /* This is so set_image_of_range_1 in regex-emacs.c can find the EQV - table. */ - set_char_table_extras (BVAR (current_buffer, case_canon_table), 2, - BVAR (current_buffer, case_eqv_table)); - specpdl_ref count = SPECPDL_INDEX (); - struct regexp_cache *cache_entry - = compile_pattern (regexp, - modify_match_data ? &search_regs : NULL, - (!NILP (Vcase_fold_search) - ? BVAR (current_buffer, case_canon_table) - : Qnil), - posix, - STRING_MULTIBYTE (string)); - freeze_pattern (cache_entry); + bufp = resolve_compiled_regexp (regexp, posix); + re_match_object = string; - val = re_search (&cache_entry->buf, SSDATA (string), + val = re_search (bufp, SSDATA (string), SBYTES (string), pos_byte, SBYTES (string) - pos_byte, - (modify_match_data ? &search_regs : NULL)); + info); unbind_to (count, Qnil); - /* Set last_thing_searched only when match data is changed. */ - if (modify_match_data) - last_thing_searched = Qt; - if (val == -2) matcher_overflow (); if (val < 0) return Qnil; - if (modify_match_data) - for (i = 0; i < search_regs.num_regs; i++) - if (search_regs.start[i] >= 0) - { - search_regs.start[i] - = string_byte_to_char (string, search_regs.start[i]); - search_regs.end[i] - = string_byte_to_char (string, search_regs.end[i]); - } + /* Set last_thing_searched only when match data is changed. */ + if (regs) + record_string_haystack (info, string); return make_fixnum (string_byte_to_char (string, val)); } +static struct regexp_match_info +resolve_string_match_info (Lisp_Object regexp, Lisp_Object inhibit_modify) +{ + if (EQ (inhibit_modify, Qt)) + return empty_regexp_match_info (); + + return resolve_match_info (regexp, inhibit_modify); +} + DEFUN ("string-match", Fstring_match, Sstring_match, 2, 4, 0, doc: /* Return index of start of first match for REGEXP in STRING, or nil. Matching ignores case if `case-fold-search' is non-nil. @@ -455,7 +600,9 @@ DEFUN ("string-match", Fstring_match, Sstring_match, 2, 4, 0, (Lisp_Object regexp, Lisp_Object string, Lisp_Object start, Lisp_Object inhibit_modify) { - return string_match_1 (regexp, string, start, 0, NILP (inhibit_modify)); + struct regexp_match_info info = resolve_string_match_info + (regexp, inhibit_modify); + return string_match_1 (regexp, string, start, 0, &info); } DEFUN ("posix-string-match", Fposix_string_match, Sposix_string_match, 2, 4, 0, @@ -474,7 +621,9 @@ DEFUN ("posix-string-match", Fposix_string_match, Sposix_string_match, 2, 4, 0, (Lisp_Object regexp, Lisp_Object string, Lisp_Object start, Lisp_Object inhibit_modify) { - return string_match_1 (regexp, string, start, 1, NILP (inhibit_modify)); + struct regexp_match_info info = resolve_string_match_info + (regexp, inhibit_modify); + return string_match_1 (regexp, string, start, 1, &info); } /* Match REGEXP against STRING using translation table TABLE, @@ -487,12 +636,14 @@ fast_string_match_internal (Lisp_Object regexp, Lisp_Object string, { re_match_object = string; specpdl_ref count = SPECPDL_INDEX (); - struct regexp_cache *cache_entry - = compile_pattern (regexp, 0, table, 0, STRING_MULTIBYTE (string)); - freeze_pattern (cache_entry); - ptrdiff_t val = re_search (&cache_entry->buf, SSDATA (string), + + struct regexp_match_info empty_info = empty_regexp_match_info (); + struct re_pattern_buffer *bufp = resolve_explicit_compiled_regexp + (regexp, false, empty_info.regs, table, STRING_MULTIBYTE (string)); + + ptrdiff_t val = re_search (bufp, SSDATA (string), SBYTES (string), 0, - SBYTES (string), 0); + SBYTES (string), &empty_info); unbind_to (count, Qnil); return val; } @@ -509,18 +660,24 @@ fast_c_string_match_internal (Lisp_Object regexp, const char *string, ptrdiff_t len, Lisp_Object table) { - /* FIXME: This is expensive and not obviously correct when it makes - a difference. I.e., no longer "fast", and may hide bugs. - Something should be done about this. */ - regexp = string_make_unibyte (regexp); /* Record specpdl index because freeze_pattern pushes an unwind-protect on the specpdl. */ specpdl_ref count = SPECPDL_INDEX (); - struct regexp_cache *cache_entry - = compile_pattern (regexp, 0, table, 0, 0); - freeze_pattern (cache_entry); + + if (REGEXP_P (regexp)) + eassert (!XREGEXP (regexp)->buffer->target_multibyte); + /* FIXME: This is expensive and not obviously correct when it makes + a difference. I.e., no longer "fast", and may hide bugs. + Something should be done about this. */ + if (STRINGP (regexp)) + regexp = string_make_unibyte (regexp); + + struct regexp_match_info empty_info = empty_regexp_match_info (); + struct re_pattern_buffer *bufp = resolve_explicit_compiled_regexp + (regexp, false, empty_info.regs, table, false); + re_match_object = Qt; - ptrdiff_t val = re_search (&cache_entry->buf, string, len, 0, len, 0); + ptrdiff_t val = re_search (bufp, string, len, 0, len, &empty_info); unbind_to (count, Qnil); return val; } @@ -578,14 +735,17 @@ fast_looking_at (Lisp_Object regexp, ptrdiff_t pos, ptrdiff_t pos_byte, multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters)); } - struct regexp_cache *cache_entry = - compile_pattern (regexp, 0, Qnil, 0, multibyte); + specpdl_ref count = SPECPDL_INDEX (); + + struct regexp_match_info empty_info = empty_regexp_match_info (); + struct re_pattern_buffer *bufp = resolve_explicit_compiled_regexp + (regexp, false, empty_info.regs, Qnil, multibyte); freeze_buffer_relocation (); - freeze_pattern (cache_entry); + re_match_object = STRINGP (string) ? string : Qnil; - len = re_match_2 (&cache_entry->buf, (char *) p1, s1, (char *) p2, s2, - pos_byte, NULL, limit_byte); + len = re_match_2 (bufp, (char *) p1, s1, (char *) p2, s2, + pos_byte, &empty_info, limit_byte); unbind_to (count, Qnil); return len; @@ -1030,8 +1190,9 @@ find_before_next_newline (ptrdiff_t from, ptrdiff_t to, /* Subroutines of Lisp buffer search functions. */ static Lisp_Object -search_command (Lisp_Object string, Lisp_Object bound, Lisp_Object noerror, - Lisp_Object count, int direction, bool RE, bool posix) +search_command (Lisp_Object regexp, Lisp_Object bound, Lisp_Object noerror, + Lisp_Object count, int direction, bool RE, bool posix, + struct regexp_match_info *info) { EMACS_INT np; EMACS_INT lim; @@ -1044,7 +1205,6 @@ search_command (Lisp_Object string, Lisp_Object bound, Lisp_Object noerror, n *= XFIXNUM (count); } - CHECK_STRING (string); if (NILP (bound)) { if (n > 0) @@ -1065,23 +1225,11 @@ search_command (Lisp_Object string, Lisp_Object bound, Lisp_Object noerror, lim_byte = CHAR_TO_BYTE (lim); } - /* This is so set_image_of_range_1 in regex-emacs.c can find the EQV - table. */ - set_char_table_extras (BVAR (current_buffer, case_canon_table), 2, - BVAR (current_buffer, case_eqv_table)); - - np = search_buffer (string, PT, PT_BYTE, lim, lim_byte, n, RE, - (!NILP (Vcase_fold_search) - ? BVAR (current_buffer, case_canon_table) - : Qnil), - (!NILP (Vcase_fold_search) - ? BVAR (current_buffer, case_eqv_table) - : Qnil), - posix); + np = search_buffer (regexp, PT, PT_BYTE, lim, lim_byte, n, RE, posix, info); if (np <= 0) { if (NILP (noerror)) - xsignal1 (Qsearch_failed, string); + xsignal1 (Qsearch_failed, regexp); if (!EQ (noerror, Qt)) { @@ -1154,28 +1302,46 @@ while (0) /* Only used in search_buffer, to record the end position of the match when searching regexps and SEARCH_REGS should not be changed (i.e. Vinhibit_changing_match_data is non-nil). */ -static struct re_registers search_regs_1; +static __thread struct re_registers search_regs_1 = { + .num_regs = 0, + .start = NULL, + .end = NULL, +}; + +/* For some reason, search_buffer_re() segfaults if this memory hasn't + been allocated yet. */ +static struct re_registers * +ensure_backup_search_regs_allocated (void) +{ + eassert (search_regs_1.num_regs >= 0); + if (search_regs_1.num_regs == 0) + { + eassert (search_regs_1.start == NULL); + eassert (search_regs_1.end == NULL); + search_regs_1.start = xnmalloc (1, sizeof *search_regs_1.start); + search_regs_1.end = xnmalloc (1, sizeof *search_regs_1.end); + search_regs_1.num_regs = 1; + } + return &search_regs_1; +} static EMACS_INT -search_buffer_re (Lisp_Object string, ptrdiff_t pos, ptrdiff_t pos_byte, - ptrdiff_t lim, ptrdiff_t lim_byte, EMACS_INT n, - Lisp_Object trt, Lisp_Object inverse_trt, bool posix) +search_buffer_re (Lisp_Object regexp, ptrdiff_t pos, ptrdiff_t pos_byte, + ptrdiff_t lim, ptrdiff_t lim_byte, EMACS_INT n, bool posix, + struct regexp_match_info *info) { unsigned char *p1, *p2; ptrdiff_t s1, s2; - /* Snapshot in case Lisp changes the value. */ - bool preserve_match_data = NILP (Vinhibit_changing_match_data); - - struct regexp_cache *cache_entry = - compile_pattern (string, - preserve_match_data ? &search_regs : &search_regs_1, - trt, posix, - !NILP (BVAR (current_buffer, enable_multibyte_characters))); - struct re_pattern_buffer *bufp = &cache_entry->buf; + eassert (info != NULL); + struct re_registers *regs = info->regs; + bool is_writing_output_match_data = true; + if (!regs) + { + is_writing_output_match_data = false; + regs = ensure_backup_search_regs_allocated (); + } - maybe_quit (); /* Do a pending quit right away, - to avoid paradoxical behavior */ /* Get pointers and sizes of the two strings that make up the visible portion of the buffer. */ @@ -1196,8 +1362,8 @@ search_buffer_re (Lisp_Object string, ptrdiff_t pos, ptrdiff_t pos_byte, } specpdl_ref count = SPECPDL_INDEX (); + struct re_pattern_buffer *bufp = resolve_compiled_regexp (regexp, posix); freeze_buffer_relocation (); - freeze_pattern (cache_entry); while (n < 0) { @@ -1205,44 +1371,36 @@ search_buffer_re (Lisp_Object string, ptrdiff_t pos, ptrdiff_t pos_byte, re_match_object = Qnil; val = re_search_2 (bufp, (char *) p1, s1, (char *) p2, s2, - pos_byte - BEGV_BYTE, lim_byte - pos_byte, - preserve_match_data ? &search_regs : &search_regs_1, - /* Don't allow match past current point */ - pos_byte - BEGV_BYTE); + pos_byte - BEGV_BYTE, lim_byte - pos_byte, + info, + /* Don't allow match past current point */ + pos_byte - BEGV_BYTE); if (val == -2) - { - unbind_to (count, Qnil); - matcher_overflow (); - } + { + unbind_to (count, Qnil); + matcher_overflow (); + } if (val >= 0) - { - if (preserve_match_data) - { - pos_byte = search_regs.start[0] + BEGV_BYTE; - for (ptrdiff_t i = 0; i < search_regs.num_regs; i++) - if (search_regs.start[i] >= 0) - { - search_regs.start[i] - = BYTE_TO_CHAR (search_regs.start[i] + BEGV_BYTE); - search_regs.end[i] - = BYTE_TO_CHAR (search_regs.end[i] + BEGV_BYTE); - } - XSETBUFFER (last_thing_searched, current_buffer); - /* Set pos to the new position. */ - pos = search_regs.start[0]; - } - else - { - pos_byte = search_regs_1.start[0] + BEGV_BYTE; - /* Set pos to the new position. */ - pos = BYTE_TO_CHAR (search_regs_1.start[0] + BEGV_BYTE); - } - } + { + if (is_writing_output_match_data) + { + pos_byte = regs->start[0] + BEGV_BYTE; + record_current_buffer_haystack (info); + /* Set pos to the new position. */ + pos = regs->start[0]; + } + else + { + pos_byte = regs->start[0] + BEGV_BYTE; + /* Set pos to the new position. */ + pos = BYTE_TO_CHAR (regs->start[0] + BEGV_BYTE); + } + } else - { - unbind_to (count, Qnil); - return (n); - } + { + unbind_to (count, Qnil); + return (n); + } n++; maybe_quit (); } @@ -1252,41 +1410,33 @@ search_buffer_re (Lisp_Object string, ptrdiff_t pos, ptrdiff_t pos_byte, re_match_object = Qnil; val = re_search_2 (bufp, (char *) p1, s1, (char *) p2, s2, - pos_byte - BEGV_BYTE, lim_byte - pos_byte, - preserve_match_data ? &search_regs : &search_regs_1, - lim_byte - BEGV_BYTE); + pos_byte - BEGV_BYTE, lim_byte - pos_byte, + info, + lim_byte - BEGV_BYTE); if (val == -2) - { - unbind_to (count, Qnil); - matcher_overflow (); - } + { + unbind_to (count, Qnil); + matcher_overflow (); + } if (val >= 0) - { - if (preserve_match_data) - { - pos_byte = search_regs.end[0] + BEGV_BYTE; - for (ptrdiff_t i = 0; i < search_regs.num_regs; i++) - if (search_regs.start[i] >= 0) - { - search_regs.start[i] - = BYTE_TO_CHAR (search_regs.start[i] + BEGV_BYTE); - search_regs.end[i] - = BYTE_TO_CHAR (search_regs.end[i] + BEGV_BYTE); - } - XSETBUFFER (last_thing_searched, current_buffer); - pos = search_regs.end[0]; - } - else - { - pos_byte = search_regs_1.end[0] + BEGV_BYTE; - pos = BYTE_TO_CHAR (search_regs_1.end[0] + BEGV_BYTE); - } - } + { + if (is_writing_output_match_data) + { + pos_byte = regs->end[0] + BEGV_BYTE; + record_current_buffer_haystack (info); + pos = regs->end[0]; + } + else + { + pos_byte = regs->end[0] + BEGV_BYTE; + pos = BYTE_TO_CHAR (regs->end[0] + BEGV_BYTE); + } + } else - { - unbind_to (count, Qnil); - return (0 - n); - } + { + unbind_to (count, Qnil); + return (0 - n); + } n--; maybe_quit (); } @@ -1296,9 +1446,9 @@ search_buffer_re (Lisp_Object string, ptrdiff_t pos, ptrdiff_t pos_byte, static EMACS_INT search_buffer_non_re (Lisp_Object string, ptrdiff_t pos, - ptrdiff_t pos_byte, ptrdiff_t lim, ptrdiff_t lim_byte, - EMACS_INT n, bool RE, Lisp_Object trt, Lisp_Object inverse_trt, - bool posix) + ptrdiff_t pos_byte, ptrdiff_t lim, ptrdiff_t lim_byte, + EMACS_INT n, bool RE, Lisp_Object trt, Lisp_Object inverse_trt, + bool posix, struct regexp_match_info *info) { unsigned char *raw_pattern, *pat; ptrdiff_t raw_pattern_size; @@ -1496,6 +1646,15 @@ search_buffer_non_re (Lisp_Object string, ptrdiff_t pos, return result; } +static Lisp_Object +extract_regexp_pattern (Lisp_Object regexp) +{ + if (REGEXP_P (regexp)) + return XREGEXP (regexp)->pattern; + CHECK_STRING (regexp); + return regexp; +} + /* Search for the Nth occurrence of STRING in the current buffer, from buffer position POS/POS_BYTE until LIM/LIM_BYTE. @@ -1515,27 +1674,66 @@ search_buffer_non_re (Lisp_Object string, ptrdiff_t pos, Use TRT and INVERSE_TRT as character translation tables. */ EMACS_INT -search_buffer (Lisp_Object string, ptrdiff_t pos, ptrdiff_t pos_byte, +search_buffer (Lisp_Object regexp, ptrdiff_t pos, ptrdiff_t pos_byte, ptrdiff_t lim, ptrdiff_t lim_byte, EMACS_INT n, - bool RE, Lisp_Object trt, Lisp_Object inverse_trt, bool posix) + bool RE, bool posix, struct regexp_match_info *info) { - if (running_asynch_code) - save_search_regs (); - + eassert (info != NULL); + Lisp_Object pattern = extract_regexp_pattern (regexp); /* Searching 0 times means don't move. */ /* Null string is found at starting position. */ - if (n == 0 || SCHARS (string) == 0) + if (n == 0 || SCHARS (pattern) == 0) { - set_search_regs (pos_byte, 0); + /* FIXME: make a helper function to do this!!!!!! */ + struct re_registers *regs = info->regs; + if (regs) + { + ptrdiff_t char_pos = BYTE_TO_CHAR (pos); + if (info->match) + { + eassert (regs->num_regs > 0); + info->match->initialized_regs = 1; + regs->start[0] = regs->end[0] = char_pos; + Lisp_Object buf; + XSETBUFFER (buf, current_buffer); + info->match->haystack = buf; + } + else + { + /* Make sure we have registers in which to store + the match position. */ + if (regs->num_regs == 0) + { + regs->start = xnmalloc (2, sizeof *regs->start); + regs->end = xnmalloc (2, sizeof *regs->end); + regs->num_regs = 2; + } + eassert (regs->num_regs > 0); + /* Write the empty match into the registers. */ + regs->start[0] = regs->end[0] = char_pos; + /* Clear out the other registers. */ + for (ptrdiff_t i = 1; i < regs->num_regs; ++i) + regs->start[i] = regs->end[i] = RE_MATCH_EXP_UNSET; + XSETBUFFER (last_thing_searched, current_buffer); + } + } return pos; } - if (RE && !(trivial_regexp_p (string) && NILP (Vsearch_spaces_regexp))) - pos = search_buffer_re (string, pos, pos_byte, lim, lim_byte, - n, trt, inverse_trt, posix); + if (RE && !(trivial_regexp_p (pattern) && NILP (Vsearch_spaces_regexp))) + pos = search_buffer_re (regexp, pos, pos_byte, lim, lim_byte, + n, posix, info); else - pos = search_buffer_non_re (string, pos, pos_byte, lim, lim_byte, - n, RE, trt, inverse_trt, posix); + { + Lisp_Object trt = (!NILP (Vcase_fold_search) + ? BVAR (current_buffer, case_canon_table) + : Qnil); + Lisp_Object inverse_trt = (!NILP (Vcase_fold_search) + ? BVAR (current_buffer, case_eqv_table) + : Qnil); + pos = search_buffer_non_re (pattern, pos, pos_byte, lim, lim_byte, + n, RE, trt, inverse_trt, posix, info); + } return pos; } @@ -2170,12 +2368,8 @@ boyer_moore (EMACS_INT n, unsigned char *base_pat, return BYTE_TO_CHAR (pos_byte); } -/* Record beginning BEG_BYTE and end BEG_BYTE + NBYTES - for the overall match just found in the current buffer. - Also clear out the match data for registers 1 and up. */ - static void -set_search_regs (ptrdiff_t beg_byte, ptrdiff_t nbytes) +set_regs (ptrdiff_t beg_byte, ptrdiff_t nbytes, struct re_registers *regs) { ptrdiff_t i; @@ -2184,26 +2378,37 @@ set_search_regs (ptrdiff_t beg_byte, ptrdiff_t nbytes) /* Make sure we have registers in which to store the match position. */ - if (search_regs.num_regs == 0) + if (regs->num_regs == 0) { - search_regs.start = xmalloc (2 * sizeof *search_regs.start); - search_regs.end = xmalloc (2 * sizeof *search_regs.end); - search_regs.num_regs = 2; + regs->start = xmalloc (2 * sizeof *regs->start); + regs->end = xmalloc (2 * sizeof *regs->end); + regs->num_regs = 2; } /* Clear out the other registers. */ - for (i = 1; i < search_regs.num_regs; i++) + for (i = 1; i < regs->num_regs; i++) { - search_regs.start[i] = -1; - search_regs.end[i] = -1; + regs->start[i] = RE_MATCH_EXP_UNSET; + regs->end[i] = RE_MATCH_EXP_UNSET; } - search_regs.start[0] = BYTE_TO_CHAR (beg_byte); - search_regs.end[0] = BYTE_TO_CHAR (beg_byte + nbytes); + regs->start[0] = BYTE_TO_CHAR (beg_byte); + regs->end[0] = BYTE_TO_CHAR (beg_byte + nbytes); +} + +/* Record beginning BEG_BYTE and end BEG_BYTE + NBYTES + for the overall match just found in the current buffer. + Also clear out the match data for registers 1 and up. */ + +static void +set_search_regs (ptrdiff_t beg_byte, ptrdiff_t nbytes) +{ + set_regs (beg_byte, nbytes, &search_regs); XSETBUFFER (last_thing_searched, current_buffer); } + -DEFUN ("search-backward", Fsearch_backward, Ssearch_backward, 1, 4, +DEFUN ("search-backward", Fsearch_backward, Ssearch_backward, 1, 5, "MSearch backward: ", doc: /* Search backward from point for STRING. Set point to the beginning of the occurrence found, and return point. @@ -2219,17 +2424,24 @@ DEFUN ("search-backward", Fsearch_backward, Ssearch_backward, 1, 4, With COUNT positive, the match found is the COUNTth to last one (or last, if COUNT is 1 or nil) in the buffer located entirely before the origin of the search; correspondingly with COUNT negative. +Optional sixth argument INHIBIT-MODIFY provides a match object to write into. + A value of nil means to use the default match object, and a value of + t means to discard the match data. Search case-sensitivity is determined by the value of the variable `case-fold-search', which see. See also the functions `match-beginning', `match-end' and `replace-match'. */) - (Lisp_Object string, Lisp_Object bound, Lisp_Object noerror, Lisp_Object count) + (Lisp_Object string, Lisp_Object bound, Lisp_Object noerror, Lisp_Object count, + Lisp_Object inhibit_modify) { - return search_command (string, bound, noerror, count, -1, false, false); + CHECK_STRING (string); + struct regexp_match_info info = resolve_buffer_match_info + (string, inhibit_modify); + return search_command (string, bound, noerror, count, -1, false, false, &info); } -DEFUN ("search-forward", Fsearch_forward, Ssearch_forward, 1, 4, "MSearch: ", +DEFUN ("search-forward", Fsearch_forward, Ssearch_forward, 1, 5, "MSearch: ", doc: /* Search forward from point for STRING. Set point to the end of the occurrence found, and return point. An optional second argument bounds the search; it is a buffer position. @@ -2244,17 +2456,24 @@ DEFUN ("search-forward", Fsearch_forward, Ssearch_forward, 1, 4, "MSearch: ", With COUNT positive, the match found is the COUNTth one (or first, if COUNT is 1 or nil) in the buffer located entirely after the origin of the search; correspondingly with COUNT negative. +Optional sixth argument INHIBIT-MODIFY provides a match object to write into. + A value of nil means to use the default match object, and a value of + t means to discard the match data. Search case-sensitivity is determined by the value of the variable `case-fold-search', which see. See also the functions `match-beginning', `match-end' and `replace-match'. */) - (Lisp_Object string, Lisp_Object bound, Lisp_Object noerror, Lisp_Object count) + (Lisp_Object string, Lisp_Object bound, Lisp_Object noerror, Lisp_Object count, + Lisp_Object inhibit_modify) { - return search_command (string, bound, noerror, count, 1, false, false); + CHECK_STRING (string); + struct regexp_match_info info = resolve_buffer_match_info + (string, inhibit_modify); + return search_command (string, bound, noerror, count, 1, false, false, &info); } -DEFUN ("re-search-backward", Fre_search_backward, Sre_search_backward, 1, 4, +DEFUN ("re-search-backward", Fre_search_backward, Sre_search_backward, 1, 5, "sRE search backward: ", doc: /* Search backward from point for regular expression REGEXP. This function is almost identical to `re-search-forward', except that @@ -2265,12 +2484,15 @@ DEFUN ("re-search-backward", Fre_search_backward, Sre_search_backward, 1, 4, Note that searching backwards may give a shorter match than expected, because REGEXP is still matched in the forward direction. See Info anchor `(elisp) re-search-backward' for details. */) - (Lisp_Object regexp, Lisp_Object bound, Lisp_Object noerror, Lisp_Object count) + (Lisp_Object regexp, Lisp_Object bound, Lisp_Object noerror, Lisp_Object count, + Lisp_Object inhibit_modify) { - return search_command (regexp, bound, noerror, count, -1, true, false); + struct regexp_match_info info = resolve_buffer_match_info + (regexp, inhibit_modify); + return search_command (regexp, bound, noerror, count, -1, true, false, &info); } -DEFUN ("re-search-forward", Fre_search_forward, Sre_search_forward, 1, 4, +DEFUN ("re-search-forward", Fre_search_forward, Sre_search_forward, 1, 5, "sRE search: ", doc: /* Search forward from point for regular expression REGEXP. Set point to the end of the occurrence found, and return point. @@ -2290,18 +2512,24 @@ DEFUN ("re-search-forward", Fre_search_forward, Sre_search_forward, 1, 4, With COUNT positive/negative, the match found is the COUNTth/-COUNTth one in the buffer located entirely after/before the origin of the search. +Optional sixth argument INHIBIT-MODIFY provides a match object to write into. + A value of nil means to use the default match object, and a value of + t means to discard the match data. Search case-sensitivity is determined by the value of the variable `case-fold-search', which see. See also the functions `match-beginning', `match-end', `match-string', and `replace-match'. */) - (Lisp_Object regexp, Lisp_Object bound, Lisp_Object noerror, Lisp_Object count) + (Lisp_Object regexp, Lisp_Object bound, Lisp_Object noerror, Lisp_Object count, + Lisp_Object inhibit_modify) { - return search_command (regexp, bound, noerror, count, 1, true, false); + struct regexp_match_info info = resolve_buffer_match_info + (regexp, inhibit_modify); + return search_command (regexp, bound, noerror, count, 1, true, false, &info); } -DEFUN ("posix-search-backward", Fposix_search_backward, Sposix_search_backward, 1, 4, +DEFUN ("posix-search-backward", Fposix_search_backward, Sposix_search_backward, 1, 5, "sPosix search backward: ", doc: /* Search backward from point for match for REGEXP according to Posix rules. Find the longest match in accord with Posix regular expression rules. @@ -2318,18 +2546,24 @@ DEFUN ("posix-search-backward", Fposix_search_backward, Sposix_search_backward, With COUNT positive, the match found is the COUNTth to last one (or last, if COUNT is 1 or nil) in the buffer located entirely before the origin of the search; correspondingly with COUNT negative. +Optional sixth argument INHIBIT-MODIFY provides a match object to write into. + A value of nil means to use the default match object, and a value of + t means to discard the match data. Search case-sensitivity is determined by the value of the variable `case-fold-search', which see. See also the functions `match-beginning', `match-end', `match-string', and `replace-match'. */) - (Lisp_Object regexp, Lisp_Object bound, Lisp_Object noerror, Lisp_Object count) + (Lisp_Object regexp, Lisp_Object bound, Lisp_Object noerror, Lisp_Object count, + Lisp_Object inhibit_modify) { - return search_command (regexp, bound, noerror, count, -1, true, true); + struct regexp_match_info info = resolve_buffer_match_info + (regexp, inhibit_modify); + return search_command (regexp, bound, noerror, count, -1, true, true, &info); } -DEFUN ("posix-search-forward", Fposix_search_forward, Sposix_search_forward, 1, 4, +DEFUN ("posix-search-forward", Fposix_search_forward, Sposix_search_forward, 1, 5, "sPosix search: ", doc: /* Search forward from point for REGEXP according to Posix rules. Find the longest match in accord with Posix regular expression rules. @@ -2346,15 +2580,21 @@ DEFUN ("posix-search-forward", Fposix_search_forward, Sposix_search_forward, 1, With COUNT positive, the match found is the COUNTth one (or first, if COUNT is 1 or nil) in the buffer located entirely after the origin of the search; correspondingly with COUNT negative. +Optional sixth argument INHIBIT-MODIFY provides a match object to write into. + A value of nil means to use the default match object, and a value of + t means to discard the match data. Search case-sensitivity is determined by the value of the variable `case-fold-search', which see. See also the functions `match-beginning', `match-end', `match-string', and `replace-match'. */) - (Lisp_Object regexp, Lisp_Object bound, Lisp_Object noerror, Lisp_Object count) + (Lisp_Object regexp, Lisp_Object bound, Lisp_Object noerror, Lisp_Object count, + Lisp_Object inhibit_modify) { - return search_command (regexp, bound, noerror, count, 1, true, true); + struct regexp_match_info info = resolve_buffer_match_info + (regexp, inhibit_modify); + return search_command (regexp, bound, noerror, count, 1, true, true, &info); } DEFUN ("replace-match", Freplace_match, Sreplace_match, 1, 5, 0, @@ -3405,16 +3645,10 @@ DEFUN ("re--describe-compiled", Fre__describe_compiled, Sre__describe_compiled, If RAW is non-nil, just return the actual bytecode. */) (Lisp_Object regexp, Lisp_Object raw) { - struct regexp_cache *cache_entry - = compile_pattern (regexp, NULL, - (!NILP (Vcase_fold_search) - ? BVAR (current_buffer, case_canon_table) : Qnil), - false, - !NILP (BVAR (current_buffer, - enable_multibyte_characters))); + struct re_pattern_buffer *bufp = resolve_compiled_regexp (regexp, false); + if (!NILP (raw)) - return make_unibyte_string ((char *) cache_entry->buf.buffer, - cache_entry->buf.used); + return make_unibyte_string ((char *) bufp->buffer, bufp->used); else { /* FIXME: Why ENABLE_CHECKING? */ #if !defined ENABLE_CHECKING @@ -3424,8 +3658,8 @@ DEFUN ("re--describe-compiled", Fre__describe_compiled, Sre__describe_compiled, size_t size = 0; FILE* f = open_memstream (&buffer, &size); if (!f) - report_file_error ("open_memstream failed", regexp); - print_compiled_pattern (f, &cache_entry->buf); + report_file_error ("open_memstream failed", regexp); + print_compiled_pattern (f, bufp); fclose (f); if (!buffer) return Qnil; @@ -3433,7 +3667,7 @@ DEFUN ("re--describe-compiled", Fre__describe_compiled, Sre__describe_compiled, free (buffer); return description; #else /* ENABLE_CHECKING && !HAVE_OPEN_MEMSTREAM */ - print_compiled_pattern (stderr, &cache_entry->buf); + print_compiled_pattern (stderr, bufp); return build_string ("Description was sent to standard error"); #endif /* !ENABLE_CHECKING */ } diff --git a/src/treesit.c b/src/treesit.c index 27779692923..b3b229a69a2 100644 --- a/src/treesit.c +++ b/src/treesit.c @@ -26,6 +26,8 @@ Copyright (C) 2021-2024 Free Software Foundation, Inc. #include "treesit.h" #if HAVE_TREE_SITTER +/* For search_buffer(). */ +# include "regex-emacs.h" /* Dynamic loading of libtree-sitter. */ @@ -2735,7 +2737,7 @@ treesit_predicate_match (Lisp_Object args, struct capture_range captures, ZV_BYTE = end_byte; ptrdiff_t val = search_buffer (regexp, start_pos, start_byte, - end_pos, end_byte, 1, true, Qnil, Qnil, false); + end_pos, end_byte, 1, true, false, NULL); BEGV = old_begv; BEGV_BYTE = old_begv_byte; diff --git a/test/manual/noverlay/Makefile.in b/test/manual/noverlay/Makefile.in index 280230b569a..c00ec9ccb4e 100644 --- a/test/manual/noverlay/Makefile.in +++ b/test/manual/noverlay/Makefile.in @@ -27,6 +27,9 @@ LDLIBS += OBJECTS = itree-tests.o CC = gcc EMACS ?= $(top_builddir)/src/emacs +MANUAL_TEST_SRC = $(top_srcdir)/test/manual +PERF_TEST_SRC = $(MANUAL_TEST_SRC)/perf +NOVERLAY_TEST_SRC = $(MANUAL_TEST_SRC)/noverlay .PHONY: all check clean distclean perf @@ -35,10 +38,22 @@ all: check: $(PROGRAM) ./check-sanitize.sh ./$(PROGRAM) +emacs-compat.h: $(NOVERLAY_TEST_SRC)/emacs-compat.h + cp -v $< $@ + +perf.el: $(PERF_TEST_SRC)/perf.el + cp -v $< $@ + +overlay-perf.el: $(NOVERLAY_TEST_SRC)/overlay-perf.el + cp -v $< $@ + +many-errors.py: $(NOVERLAY_TEST_SRC)/many-errors.py + cp -v $< $@ + itree-tests.o: emacs-compat.h $(top_srcdir)/src/itree.c $(top_srcdir)/src/itree.h -perf: - -$(EMACS) -Q -l ./overlay-perf.el -f perf-run-batch +perf: perf.el overlay-perf.el many-errors.py + -$(EMACS) -Q -l ./perf.el -l ./overlay-perf.el -f perf-run-batch clean: rm -f -- $(OBJECTS) $(PROGRAM) diff --git a/test/manual/noverlay/overlay-perf.el b/test/manual/noverlay/overlay-perf.el index 56cb72fc308..34cc6abd310 100644 --- a/test/manual/noverlay/overlay-perf.el +++ b/test/manual/noverlay/overlay-perf.el @@ -19,396 +19,6 @@ ;;; Code: -(require 'cl-lib) -(require 'subr-x) -(require 'seq) -(require 'hi-lock) - - -;; +===================================================================================+ -;; | Framework -;; +===================================================================================+ - -(defmacro perf-define-constant-test (name &optional doc &rest body) - (declare (indent 1) (debug (symbol &optional string &rest form))) - `(progn - (put ',name 'perf-constant-test t) - (defun ,name nil ,doc ,@body))) - -(defmacro perf-define-variable-test (name args &optional doc &rest body) - (declare (indent 2) (debug defun)) - (unless (and (consp args) - (= (length args) 1)) - (error "Function %s should accept exactly one argument." name)) - `(progn - (put ',name 'perf-variable-test t) - (defun ,name ,args ,doc ,@body))) - -(defmacro perf-define-test-suite (name &rest tests) - (declare (indent 1)) - `(put ',name 'perf-test-suite - ,(cons 'list tests))) - -(defun perf-constant-test-p (test) - (get test 'perf-constant-test)) - -(defun perf-variable-test-p (test) - (get test 'perf-variable-test)) - -(defun perf-test-suite-p (suite) - (not (null (perf-test-suite-elements suite)))) - -(defun perf-test-suite-elements (suite) - (get suite 'perf-test-suite)) - -(defun perf-expand-suites (test-and-suites) - (apply #' append (mapcar (lambda (elt) - (if (perf-test-suite-p elt) - (perf-test-suite-elements elt) - (list elt))) - test-and-suites))) -(defun perf-test-p (symbol) - (or (perf-variable-test-p symbol) - (perf-constant-test-p symbol))) - -(defun perf-all-tests () - (let (result) - (mapatoms (lambda (symbol) - (when (and (fboundp symbol) - (perf-test-p symbol)) - (push symbol result)))) - (sort result #'string-lessp))) - -(defvar perf-default-test-argument 4096) - -(defun perf-run-1 (&optional k n &rest tests) - "Run TESTS K times using N as argument for non-constant ones. - -Return test-total elapsed time." - (random "") - (when (and n (not (numberp n))) - (push k tests) - (push n tests) - (setq n nil k nil)) - (when (and k (not (numberp k))) - (push k tests) - (setq k nil)) - (let* ((k (or k 1)) - (n (or n perf-default-test-argument)) - (tests (perf-expand-suites (or tests - (perf-all-tests)))) - (variable-tests (seq-filter #'perf-variable-test-p tests)) - (constant-tests (seq-filter #'perf-constant-test-p tests)) - (max-test-string-width (perf-max-symbol-length tests))) - (unless (seq-every-p #'perf-test-p tests) - (error "Some of these are not tests: %s" tests)) - (cl-labels ((format-result (result) - (cond - ((numberp result) (format "%.2f" result)) - ((stringp result) result) - ((null result) "N/A"))) - (format-test (fn) - (concat (symbol-name fn) - (make-string - (+ (- max-test-string-width - (length (symbol-name fn))) - 1) - ?\s))) - (format-summary (results _total) - (let ((min (apply #'min results)) - (max (apply #'max results)) - (avg (/ (apply #'+ results) (float (length results))))) - (format "n=%d min=%.2f avg=%.2f max=%.2f" (length results) min avg max))) - (run-test (fn) - (let ((total 0) results) - (dotimes (_ (max 0 k)) - (garbage-collect) - (princ (concat " " (format-test fn))) - (let ((result (condition-case-unless-debug err - (cond - ((perf-variable-test-p fn) - (random "") (car (funcall fn n))) - ((perf-constant-test-p fn) - (random "") (car (funcall fn))) - (t "skip")) - (error (error-message-string err))))) - (when (numberp result) - (cl-incf total result) - (push result results)) - (princ (format-result result)) - (terpri))) - (when (> (length results) 1) - (princ (concat "#" (format-test fn) - (format-summary results total))) - (terpri))))) - (when variable-tests - (terpri) - (dolist (fn variable-tests) - (run-test fn) - (terpri))) - (when constant-tests - (dolist (fn constant-tests) - (run-test fn) - (terpri)))))) - -(defun perf-run (&optional k n &rest tests) - (interactive - (let* ((n (if current-prefix-arg - (prefix-numeric-value current-prefix-arg) - perf-default-test-argument)) - (tests (mapcar #'intern - (completing-read-multiple - (format "Run tests (n=%d): " n) - (perf-all-tests) nil t nil 'perf-test-history)))) - (cons 1 (cons n tests)))) - (with-current-buffer (get-buffer-create "*perf-results*") - (let ((inhibit-read-only t) - (standard-output (current-buffer))) - (erase-buffer) - (apply #'perf-run-1 k n tests) - (display-buffer (current-buffer))))) - - -(defun perf-batch-parse-command-line (args) - (let ((k 1) - (n perf-default-test-argument) - tests) - (while args - (cond ((string-match-p "\\`-[cn]\\'" (car args)) - (unless (and (cdr args) - (string-match-p "\\`[0-9]+\\'" (cadr args))) - (error "%s expects a natnum argument" (car args))) - (if (equal (car args) "-c") - (setq k (string-to-number (cadr args))) - (setq n (string-to-number (cadr args)))) - (setq args (cddr args))) - (t (push (intern (pop args)) tests)))) - (list k n tests))) - - -(defun perf-run-batch () - "Runs tests from `command-line-args-left' and kill emacs." - (let ((standard-output #'external-debugging-output)) - (condition-case err - (cl-destructuring-bind (k n tests) - (perf-batch-parse-command-line command-line-args-left) - (apply #'perf-run-1 k n tests) - (save-buffers-kill-emacs)) - (error - (princ (error-message-string err)) - (save-buffers-kill-emacs))))) - -(defconst perf-number-of-columns 70) - -(defun perf-insert-lines (n) - "Insert N lines into the current buffer." - (dotimes (i n) - (insert (make-string 70 (if (= (% i 2) 0) - ?. - ?O)) - ?\n))) - -(defun perf-switch-to-buffer-scroll-random (n &optional buffer) - (interactive) - (set-window-buffer nil (or buffer (current-buffer))) - (goto-char (point-min)) - (redisplay t) - (dotimes (_ n) - (goto-char (random (point-max))) - (recenter) - (redisplay t))) - -(defun perf-insert-overlays (n &optional create-callback random-p) - (if random-p - (perf-insert-overlays-random n create-callback) - (perf-insert-overlays-sequential n create-callback))) - -(defun perf-insert-overlays-sequential (n &optional create-callback) - "Insert an overlay every Nth line." - (declare (indent 1)) - (let ((i 0) - (create-callback (or create-callback #'ignore))) - (save-excursion - (goto-char (point-min)) - (while (not (eobp)) - (when (= 0 (% i n)) - (let ((ov (make-overlay (point-at-bol) (point-at-eol)))) - (funcall create-callback ov) - (overlay-put ov 'priority (random (buffer-size))))) - (cl-incf i) - (forward-line))))) - -(defun perf-insert-overlays-random (n &optional create-callback) - "Insert an overlay every Nth line." - (declare (indent 1)) - (let ((create-callback (or create-callback #'ignore))) - (save-excursion - (while (>= (cl-decf n) 0) - (let* ((beg (1+ (random (point-max)))) - (ov (make-overlay beg (+ beg (random 70))))) - (funcall create-callback ov) - (overlay-put ov 'priority (random (buffer-size)))))))) - -(defun perf-insert-overlays-hierarchical (n &optional create-callback) - (let ((create-callback (or create-callback #'ignore))) - (save-excursion - (goto-char (point-min)) - (let ((spacing (floor (/ (/ (count-lines (point-min) (point-max)) - (float 3)) - n)))) - (when (< spacing 1) - (error "Hierarchical overlay overflow !!")) - (dotimes (i n) - (funcall create-callback - (make-overlay (point) - (save-excursion - (goto-char (point-max)) - (forward-line (- (* spacing i))) - (point)))) - - (when (eobp) - (error "End of buffer in hierarchical overlays")) - (forward-line spacing)))))) - -(defun perf-overlay-ascii-chart (&optional buffer width) - (interactive) - (save-current-buffer - (when buffer (set-buffer buffer)) - (unless width (setq width 100)) - (let* ((ovl (sort (overlays-in (point-min) (point-max)) - (lambda (ov1 ov2) - (or (<= (overlay-start ov1) - (overlay-start ov2)) - (and - (= (overlay-start ov1) - (overlay-start ov2)) - (< (overlay-end ov1) - (overlay-end ov2))))))) - (ov-width (apply #'max (mapcar (lambda (ov) - (- (overlay-end ov) - (overlay-start ov))) - ovl))) - (ov-min (apply #'min (mapcar #'overlay-start ovl))) - (ov-max (apply #'max (mapcar #'overlay-end ovl))) - (scale (/ (float width) (+ ov-min ov-width)))) - (with-current-buffer (get-buffer-create "*overlay-ascii-chart*") - (let ((inhibit-read-only t)) - (erase-buffer) - (buffer-disable-undo) - (insert (format "%06d%s%06d\n" ov-min (make-string (- width 12) ?\s) ov-max)) - (dolist (ov ovl) - (let ((length (round (* scale (- (overlay-end ov) - (overlay-start ov)))))) - (insert (make-string (round (* scale (overlay-start ov))) ?\s)) - (cl-case length - (0 (insert "O")) - (1 (insert "|")) - (t (insert (format "|%s|" (make-string (- length 2) ?-))))) - (insert "\n"))) - (goto-char (point-min))) - (read-only-mode 1) - (pop-to-buffer (current-buffer)))))) - -(defconst perf-overlay-faces (mapcar #'intern (seq-take hi-lock-face-defaults 3))) - -(defun perf-overlay-face-callback (ov) - (overlay-put ov 'face (nth (random (length perf-overlay-faces)) - perf-overlay-faces))) - -(defun perf-overlay-invisible-callback (ov) - (overlay-put ov 'invisble (= 1 (random 2)))) - -(defun perf-overlay-display-callback (ov) - (overlay-put ov 'display (make-string 70 ?*))) - -(defmacro perf-define-display-test (overlay-type property-type scroll-type) - (let ((name (intern (format "perf-display-%s/%s/%s" - overlay-type property-type scroll-type))) - (arg (make-symbol "n"))) - - `(perf-define-variable-test ,name (,arg) - (with-temp-buffer - (perf-insert-lines ,arg) - (overlay-recenter (point-max)) - ,@(perf-define-display-test-1 arg overlay-type property-type scroll-type))))) - -(defun perf-define-display-test-1 (arg overlay-type property-type scroll-type) - (list (append (cl-case overlay-type - (sequential - (list 'perf-insert-overlays-sequential 2)) - (hierarchical - `(perf-insert-overlays-hierarchical (/ ,arg 10))) - (random - `(perf-insert-overlays-random (/ ,arg 2))) - (t (error "Invalid insert type: %s" overlay-type))) - (list - (cl-case property-type - (display '#'perf-overlay-display-callback) - (face '#'perf-overlay-face-callback) - (invisible '#'perf-overlay-invisible-callback) - (t (error "Invalid overlay type: %s" overlay-type))))) - (list 'benchmark-run 1 - (cl-case scroll-type - (scroll '(perf-switch-to-buffer-scroll-up-and-down)) - (random `(perf-switch-to-buffer-scroll-random (/ ,arg 50))) - (t (error "Invalid scroll type: %s" overlay-type)))))) - -(defun perf-max-symbol-length (symbols) - "Return the longest symbol in SYMBOLS, or -1 if symbols is nil." - (if (null symbols) - -1 - (apply #'max (mapcar - (lambda (elt) - (length (symbol-name elt))) - symbols)))) - -(defun perf-insert-text (n) - "Insert N character into the current buffer." - (let ((ncols 68) - (char ?.)) - (dotimes (_ (/ n ncols)) - (insert (make-string (1- ncols) char) ?\n)) - (when (> (% n ncols) 0) - (insert (make-string (1- (% n ncols)) char) ?\n)))) - -(defconst perf-insert-overlays-default-length 24) - -(defun perf-insert-overlays-scattered (n &optional length) - "Insert N overlays of max length 24 randomly." - (dotimes (_ n) - (let ((begin (random (1+ (point-max))))) - (make-overlay - begin (+ begin (random (1+ (or length perf-insert-overlays-default-length 0)))))))) - -(defvar perf-marker-gc-protection nil) - -(defun perf-insert-marker-scattered (n) - "Insert N marker randomly." - (setq perf-marker-gc-protection nil) - (dotimes (_ n) - (push (copy-marker (random (1+ (point-max)))) - perf-marker-gc-protection))) - -(defun perf-switch-to-buffer-scroll-up-and-down (&optional buffer) - (interactive) - (set-window-buffer nil (or buffer (current-buffer))) - (goto-char (point-min)) - (redisplay t) - (while (condition-case nil - (progn (scroll-up) t) - (end-of-buffer nil)) - (redisplay t)) - (while (condition-case nil - (progn (scroll-down) t) - (beginning-of-buffer nil)) - (redisplay t))) - -(defun perf-emacs-lisp-setup () - (add-to-list 'imenu-generic-expression - '(nil "^\\s-*(perf-define\\(?:\\w\\|\\s_\\)*\\s-*\\(\\(?:\\w\\|\\s_\\)+\\)" 1))) - -(add-hook 'emacs-lisp-mode 'perf-emacs-lisp-setup) - ;; +===================================================================================+ ;; | Basic performance tests @@ -506,23 +116,6 @@ perf-delete-scatter-empty (let ((perf-insert-overlays-default-length 0)) (perf-delete-scatter n))) -(defmacro perf-define-marker-test (type where) - (let ((name (intern (format "perf-%s-%s-marker" type where)))) - `(perf-define-variable-test ,name (n) - (with-temp-buffer - (perf-insert-text n) - (perf-insert-marker-scattered n) - (goto-char ,(cl-case where - (after (list 'point-max)) - (t (list 'point-min)))) - (benchmark-run 1 - (dotimes (_ (/ n 2)) - ,@(when (eq where 'scatter) - (list '(goto-char (max 1 (random (point-max)))))) - ,(cl-case type - (insert (list 'insert ?X)) - (delete (list 'delete-char (if (eq where 'after) -1 1)))))))))) - (perf-define-test-suite perf-marker-suite (perf-define-marker-test insert before) (perf-define-marker-test insert after) diff --git a/test/manual/perf/perf.el b/test/manual/perf/perf.el new file mode 100644 index 00000000000..c0f4e806b20 --- /dev/null +++ b/test/manual/perf/perf.el @@ -0,0 +1,442 @@ +;; -*- lexical-binding:t -*- + +;; Copyright (C) 2015-2024 Free Software Foundation, Inc. + +;; This file is part of GNU Emacs. + +;; GNU Emacs is free software: you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation, either version 3 of the License, or +;; (at your option) any later version. + +;; GNU Emacs is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. + +;; You should have received a copy of the GNU General Public License +;; along with GNU Emacs. If not, see . + +;;; Code: + +(require 'cl-lib) +(require 'subr-x) +(require 'seq) +(require 'hi-lock) + + +;; +===================================================================================+ +;; | Framework +;; +===================================================================================+ + +(defmacro perf-define-constant-test (name &optional doc &rest body) + (declare (indent 1) (debug (symbol &optional string &rest form))) + `(progn + (put ',name 'perf-constant-test t) + (defun ,name nil ,doc ,@body))) + +(defmacro perf-define-variable-test (name args &optional doc &rest body) + (declare (indent 2) (debug defun)) + (unless (and (consp args) + (= (length args) 1)) + (error "Function %s should accept exactly one argument." name)) + `(progn + (put ',name 'perf-variable-test t) + (defun ,name ,args ,doc ,@body))) + +(defmacro perf-define-test-suite (name &rest tests) + (declare (indent 1)) + `(put ',name 'perf-test-suite + ,(cons 'list tests))) + +(defun perf-constant-test-p (test) + (get test 'perf-constant-test)) + +(defun perf-variable-test-p (test) + (get test 'perf-variable-test)) + +(defun perf-test-suite-p (suite) + (not (null (perf-test-suite-elements suite)))) + +(defun perf-test-suite-elements (suite) + (get suite 'perf-test-suite)) + +(defun perf-expand-suites (test-and-suites) + (apply #' append (mapcar (lambda (elt) + (if (perf-test-suite-p elt) + (perf-test-suite-elements elt) + (list elt))) + test-and-suites))) +(defun perf-test-p (symbol) + (or (perf-variable-test-p symbol) + (perf-constant-test-p symbol))) + +(defun perf-all-tests () + (let (result) + (mapatoms (lambda (symbol) + (when (and (fboundp symbol) + (perf-test-p symbol)) + (push symbol result)))) + (sort result #'string-lessp))) + +(defvar perf-default-test-argument 4096) + +(defun perf-run-1 (&optional k n &rest tests) + "Run TESTS K times using N as argument for non-constant ones. + +Return test-total elapsed time." + (random "") + (when (and n (not (numberp n))) + (push k tests) + (push n tests) + (setq n nil k nil)) + (when (and k (not (numberp k))) + (push k tests) + (setq k nil)) + (let* ((k (or k 1)) + (n (or n perf-default-test-argument)) + (tests (perf-expand-suites (or tests + (perf-all-tests)))) + (variable-tests (seq-filter #'perf-variable-test-p tests)) + (constant-tests (seq-filter #'perf-constant-test-p tests)) + (max-test-string-width (perf-max-symbol-length tests))) + (unless (seq-every-p #'perf-test-p tests) + (error "Some of these are not tests: %s" tests)) + (cl-labels ((format-result (result) + (cond + ((numberp result) (format "%.2f" result)) + ((stringp result) result) + ((null result) "N/A"))) + (format-test (fn) + (concat (symbol-name fn) + (make-string + (+ (- max-test-string-width + (length (symbol-name fn))) + 1) + ?\s))) + (format-summary (results _total) + (let ((min (apply #'min results)) + (max (apply #'max results)) + (avg (/ (apply #'+ results) (float (length results))))) + (format "n=%d min=%.2f avg=%.2f max=%.2f" (length results) min avg max))) + (run-test (fn) + (let ((total 0) results) + (dotimes (_ (max 0 k)) + (garbage-collect) + (princ (concat " " (format-test fn))) + (let ((result (condition-case-unless-debug err + (cond + ((perf-variable-test-p fn) + (random "") (car (funcall fn n))) + ((perf-constant-test-p fn) + (random "") (car (funcall fn))) + (t "skip")) + (error (error-message-string err))))) + (when (numberp result) + (cl-incf total result) + (push result results)) + (princ (format-result result)) + (terpri))) + (when (> (length results) 1) + (princ (concat "#" (format-test fn) + (format-summary results total))) + (terpri))))) + (when variable-tests + (terpri) + (dolist (fn variable-tests) + (run-test fn) + (terpri))) + (when constant-tests + (dolist (fn constant-tests) + (run-test fn) + (terpri)))))) + +(defun perf-run (&optional k n &rest tests) + (interactive + (let* ((n (if current-prefix-arg + (prefix-numeric-value current-prefix-arg) + perf-default-test-argument)) + (tests (mapcar #'intern + (completing-read-multiple + (format "Run tests (n=%d): " n) + (perf-all-tests) nil t nil 'perf-test-history)))) + (cons 1 (cons n tests)))) + (with-current-buffer (get-buffer-create "*perf-results*") + (let ((inhibit-read-only t) + (standard-output (current-buffer))) + (erase-buffer) + (apply #'perf-run-1 k n tests) + (display-buffer (current-buffer))))) + + +(defun perf-batch-parse-command-line (args) + (let ((k 1) + (n perf-default-test-argument) + tests) + (while args + (cond ((string-match-p "\\`-[cn]\\'" (car args)) + (unless (and (cdr args) + (string-match-p "\\`[0-9]+\\'" (cadr args))) + (error "%s expects a natnum argument" (car args))) + (if (equal (car args) "-c") + (setq k (string-to-number (cadr args))) + (setq n (string-to-number (cadr args)))) + (setq args (cddr args))) + (t (push (intern (pop args)) tests)))) + (list k n tests))) + + +(defun perf-run-batch () + "Runs tests from `command-line-args-left' and kill emacs." + (let ((standard-output #'external-debugging-output)) + (condition-case err + (cl-destructuring-bind (k n tests) + (perf-batch-parse-command-line command-line-args-left) + (apply #'perf-run-1 k n tests) + (save-buffers-kill-emacs)) + (error + (princ (error-message-string err)) + (save-buffers-kill-emacs))))) + +(defconst perf-number-of-columns 70) + +(defun perf-insert-lines (n) + "Insert N lines into the current buffer." + (dotimes (i n) + (insert (make-string perf-number-of-columns + (if (= (% i 2) 0) + ?. + ?O)) + ?\n))) + +(defun perf-random-string (n) + "Create a string of N random characters." + (cl-loop with v = (make-vector n 0) + for i upfrom 0 below n + ;; This generates printable ASCII characters. + for c = (+ ?! (random (- ?~ ?!))) + do (aset v i c) + finally return (concat v))) + +(defun perf-insert-random (n) + "Insert N random characters into the current buffer." + (insert (perf-random-string n))) + +(defun perf-switch-to-buffer-scroll-random (n &optional buffer) + (interactive) + (set-window-buffer nil (or buffer (current-buffer))) + (goto-char (point-min)) + (redisplay t) + (dotimes (_ n) + (goto-char (random (point-max))) + (recenter) + (redisplay t))) + +(defun perf-insert-overlays (n &optional create-callback random-p) + (if random-p + (perf-insert-overlays-random n create-callback) + (perf-insert-overlays-sequential n create-callback))) + +(defun perf-insert-overlays-sequential (n &optional create-callback) + "Insert an overlay every Nth line." + (declare (indent 1)) + (let ((i 0) + (create-callback (or create-callback #'ignore))) + (save-excursion + (goto-char (point-min)) + (while (not (eobp)) + (when (= 0 (% i n)) + (let ((ov (make-overlay (point-at-bol) (point-at-eol)))) + (funcall create-callback ov) + (overlay-put ov 'priority (random (buffer-size))))) + (cl-incf i) + (forward-line))))) + +(defun perf-insert-overlays-random (n &optional create-callback) + "Insert an overlay every Nth line." + (declare (indent 1)) + (let ((create-callback (or create-callback #'ignore))) + (save-excursion + (while (>= (cl-decf n) 0) + (let* ((beg (1+ (random (point-max)))) + (ov (make-overlay + beg (+ beg (random perf-number-of-columns))))) + (funcall create-callback ov) + (overlay-put ov 'priority (random (buffer-size)))))))) + +(defun perf-insert-overlays-hierarchical (n &optional create-callback) + (let ((create-callback (or create-callback #'ignore))) + (save-excursion + (goto-char (point-min)) + (let ((spacing (floor (/ (/ (count-lines (point-min) (point-max)) + (float 3)) + n)))) + (when (< spacing 1) + (error "Hierarchical overlay overflow !!")) + (dotimes (i n) + (funcall create-callback + (make-overlay (point) + (save-excursion + (goto-char (point-max)) + (forward-line (- (* spacing i))) + (point)))) + + (when (eobp) + (error "End of buffer in hierarchical overlays")) + (forward-line spacing)))))) + +(defun perf-overlay-ascii-chart (&optional buffer width) + (interactive) + (save-current-buffer + (when buffer (set-buffer buffer)) + (unless width (setq width 100)) + (let* ((ovl (sort (overlays-in (point-min) (point-max)) + (lambda (ov1 ov2) + (or (<= (overlay-start ov1) + (overlay-start ov2)) + (and + (= (overlay-start ov1) + (overlay-start ov2)) + (< (overlay-end ov1) + (overlay-end ov2))))))) + (ov-width (apply #'max (mapcar (lambda (ov) + (- (overlay-end ov) + (overlay-start ov))) + ovl))) + (ov-min (apply #'min (mapcar #'overlay-start ovl))) + (ov-max (apply #'max (mapcar #'overlay-end ovl))) + (scale (/ (float width) (+ ov-min ov-width)))) + (with-current-buffer (get-buffer-create "*overlay-ascii-chart*") + (let ((inhibit-read-only t)) + (erase-buffer) + (buffer-disable-undo) + (insert (format "%06d%s%06d\n" ov-min (make-string (- width 12) ?\s) ov-max)) + (dolist (ov ovl) + (let ((length (round (* scale (- (overlay-end ov) + (overlay-start ov)))))) + (insert (make-string (round (* scale (overlay-start ov))) ?\s)) + (cl-case length + (0 (insert "O")) + (1 (insert "|")) + (t (insert (format "|%s|" (make-string (- length 2) ?-))))) + (insert "\n"))) + (goto-char (point-min))) + (read-only-mode 1) + (pop-to-buffer (current-buffer)))))) + +(defconst perf-overlay-faces (mapcar #'intern (seq-take hi-lock-face-defaults 3))) + +(defun perf-overlay-face-callback (ov) + (overlay-put ov 'face (nth (random (length perf-overlay-faces)) + perf-overlay-faces))) + +(defun perf-overlay-invisible-callback (ov) + (overlay-put ov 'invisble (= 1 (random 2)))) + +(defun perf-overlay-display-callback (ov) + (overlay-put ov 'display (make-string perf-number-of-columns ?*))) + +(defmacro perf-define-display-test (overlay-type property-type scroll-type) + (let ((name (intern (format "perf-display-%s/%s/%s" + overlay-type property-type scroll-type))) + (arg (make-symbol "n"))) + + `(perf-define-variable-test ,name (,arg) + (with-temp-buffer + (perf-insert-lines ,arg) + (overlay-recenter (point-max)) + ,@(perf-define-display-test-1 arg overlay-type property-type scroll-type))))) + +(defun perf-define-display-test-1 (arg overlay-type property-type scroll-type) + (list (append (cl-case overlay-type + (sequential + (list 'perf-insert-overlays-sequential 2)) + (hierarchical + `(perf-insert-overlays-hierarchical (/ ,arg 10))) + (random + `(perf-insert-overlays-random (/ ,arg 2))) + (t (error "Invalid insert type: %s" overlay-type))) + (list + (cl-case property-type + (display '#'perf-overlay-display-callback) + (face '#'perf-overlay-face-callback) + (invisible '#'perf-overlay-invisible-callback) + (t (error "Invalid overlay type: %s" overlay-type))))) + (list 'benchmark-run 1 + (cl-case scroll-type + (scroll '(perf-switch-to-buffer-scroll-up-and-down)) + (random `(perf-switch-to-buffer-scroll-random (/ ,arg 50))) + (t (error "Invalid scroll type: %s" overlay-type)))))) + +(defun perf-max-symbol-length (symbols) + "Return the longest symbol in SYMBOLS, or -1 if symbols is nil." + (if (null symbols) + -1 + (apply #'max (mapcar + (lambda (elt) + (length (symbol-name elt))) + symbols)))) + +(defun perf-insert-text (n) + "Insert N character into the current buffer." + (let ((ncols 68) + (char ?.)) + (dotimes (_ (/ n ncols)) + (insert (make-string (1- ncols) char) ?\n)) + (when (> (% n ncols) 0) + (insert (make-string (1- (% n ncols)) char) ?\n)))) + +(defconst perf-insert-overlays-default-length 24) + +(defun perf-insert-overlays-scattered (n &optional length) + "Insert N overlays of max length 24 randomly." + (dotimes (_ n) + (let ((begin (random (1+ (point-max))))) + (make-overlay + begin (+ begin (random (1+ (or length perf-insert-overlays-default-length 0)))))))) + +(defvar perf-marker-gc-protection nil) + +(defun perf-insert-marker-scattered (n) + "Insert N marker randomly." + (setq perf-marker-gc-protection nil) + (dotimes (_ n) + (push (copy-marker (random (1+ (point-max)))) + perf-marker-gc-protection))) + +(defun perf-switch-to-buffer-scroll-up-and-down (&optional buffer) + (interactive) + (set-window-buffer nil (or buffer (current-buffer))) + (goto-char (point-min)) + (redisplay t) + (while (condition-case nil + (progn (scroll-up) t) + (end-of-buffer nil)) + (redisplay t)) + (while (condition-case nil + (progn (scroll-down) t) + (beginning-of-buffer nil)) + (redisplay t))) + +(defmacro perf-define-marker-test (type where) + (let ((name (intern (format "perf-%s-%s-marker" type where)))) + `(perf-define-variable-test ,name (n) + (with-temp-buffer + (perf-insert-text n) + (perf-insert-marker-scattered n) + (goto-char ,(cl-case where + (after (list 'point-max)) + (t (list 'point-min)))) + (benchmark-run 1 + (dotimes (_ (/ n 2)) + ,@(when (eq where 'scatter) + (list '(goto-char (max 1 (random (point-max)))))) + ,(cl-case type + (insert (list 'insert ?X)) + (delete (list 'delete-char (if (eq where 'after) -1 1)))))))))) + +(defun perf-emacs-lisp-setup () + (add-to-list 'imenu-generic-expression + '(nil "^\\s-*(perf-define\\(?:\\w\\|\\s_\\)*\\s-*\\(\\(?:\\w\\|\\s_\\)+\\)" 1))) + +(add-hook 'emacs-lisp-mode 'perf-emacs-lisp-setup) diff --git a/test/manual/regexp/Makefile.in b/test/manual/regexp/Makefile.in new file mode 100644 index 00000000000..75e9c5cbbcd --- /dev/null +++ b/test/manual/regexp/Makefile.in @@ -0,0 +1,38 @@ +### @configure_input@ + +# Copyright (C) 2017-2024 Free Software Foundation, Inc. + +# This file is part of GNU Emacs. + +# GNU Emacs is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# GNU Emacs is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with GNU Emacs. If not, see . + +top_srcdir = @top_srcdir@ +top_builddir = @top_builddir@ +EMACS ?= $(top_builddir)/src/emacs +MANUAL_TEST_SRC = $(top_srcdir)/test/manual +PERF_TEST_SRC = $(MANUAL_TEST_SRC)/perf +REGEXP_TEST_SRC = $(MANUAL_TEST_SRC)/regexp + +.PHONY: all perf + +all: perf + +perf.el: $(PERF_TEST_SRC)/perf.el + cp -v $< $@ + +regexp-perf.el: $(REGEXP_TEST_SRC)/regexp-perf.el + cp -v $< $@ + +perf: perf.el regexp-perf.el + -$(EMACS) -Q -l ./perf.el -l ./regexp-perf.el -f perf-run-batch diff --git a/test/manual/regexp/regexp-perf.el b/test/manual/regexp/regexp-perf.el new file mode 100644 index 00000000000..51848fbae8c --- /dev/null +++ b/test/manual/regexp/regexp-perf.el @@ -0,0 +1,232 @@ +;; -*- lexical-binding:t -*- + +;; Copyright (C) 2024 Free Software Foundation, Inc. + +;; This file is part of GNU Emacs. + +;; GNU Emacs is free software: you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation, either version 3 of the License, or +;; (at your option) any later version. + +;; GNU Emacs is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. + +;; You should have received a copy of the GNU General Public License +;; along with GNU Emacs. If not, see . + +;;; Code: +(require 'rx) + + +;;; helpers +(defconst perf-buffer-base-len 100 + "Length multiple of temp buffer to generate for search tests.") + +(defconst perf-string-base-len 100 + "Length multiple of string to generate for search tests.") + +(defmacro perf-with-random-buffer (size &rest body) + "Generate a temp buffer with SIZE random ASCII chars, executing BODY." + (declare (indent 1) (debug (fixnum &rest form))) + `(with-temp-buffer + (perf-insert-random ,size) + (goto-char (point-min)) + ,@body)) + +(defconst perf-regexp-short-base-len 2 + "Length of components for short regexps to generate for search tests.") + +(defconst perf-regexp-long-base-len 40 + "Length of components for long regexps to generate for search tests.") + +(defun perf-generate-random-alternation-pattern (case-len) + (rx-to-string + `(| + (literal ,(perf-random-string case-len)) + (literal ,(perf-random-string case-len))) + t)) + +(defun perf-generate-random-grouped-pattern (component-len) + (rx-to-string + `(: + (literal ,(perf-random-string component-len)) + (? (group (| (literal ,(perf-random-string component-len)) + (literal ,(perf-random-string component-len))))) + (literal ,(perf-random-string component-len))) + t)) + +(defun perf-generate-regexp-strings (n base-len kind) + (cl-loop + with v = (make-vector n nil) + with pattern-fun = + (cl-ecase kind + (alternation #'perf-generate-random-alternation-pattern) + (grouped #'perf-generate-random-grouped-pattern)) + for el across-ref v + for r = (funcall pattern-fun base-len) + do (setf el r) + finally return v)) + +(defun perf-compile-regexps (regexp-strings) + (cl-loop with v = (make-vector (length regexp-strings) nil) + for el across-ref v + for r across regexp-strings + do (setf el (make-regexp r)) + finally return v)) + +(defconst perf-num-few-regexp-patterns-to-generate 4 + "A small number of regexp patterns to try one after another.") + +(defconst perf-num-many-regexp-patterns-to-generate 30 + "A large number of regexp patterns to try one after another.") + +(defconst perf-num-regexp-match-loops 60 + "Number of times to try matching each regexp in a search test.") + +(defmacro perf-define-parameterized-regexp-strings-test + (base-name args &optional doc &rest body) + "Define a set of test cases with varying types of generated +regexp patterns." + (declare (indent 2) (debug defun)) + (unless (and (consp args) + (= (length args) 2)) + (error "Base function %s should accept exactly two arguments." + base-name)) + (let ((all-variants + '((short-patterns/few-patterns/alternation perf-regexp-short-base-len perf-num-few-regexp-patterns-to-generate 'alternation) + (short-patterns/few-patterns/grouped perf-regexp-short-base-len perf-num-few-regexp-patterns-to-generate 'grouped) + (short-patterns/many-patterns/alternation perf-regexp-short-base-len perf-num-many-regexp-patterns-to-generate 'alternation) + (short-patterns/many-patterns/grouped perf-regexp-short-base-len perf-num-many-regexp-patterns-to-generate 'grouped) + (long-patterns/few-patterns/alternation perf-regexp-long-base-len perf-num-few-regexp-patterns-to-generate 'alternation) + (long-patterns/few-patterns/grouped perf-regexp-long-base-len perf-num-few-regexp-patterns-to-generate 'grouped) + (long-patterns/many-patterns/alternation perf-regexp-long-base-len perf-num-many-regexp-patterns-to-generate 'alternation) + (long-patterns/many-patterns/grouped perf-regexp-long-base-len perf-num-many-regexp-patterns-to-generate 'grouped)))) + `(progn + ,@(cl-loop + with base-str = (symbol-name base-name) + with (rx-str-arg n-arg) = args + for (ext-name regexp-len-sym num-patterns-sym kind) in all-variants + for ext-str = (symbol-name ext-name) + for full-name = (intern (format "%s/%s" base-str ext-str)) + collect `(perf-define-variable-test + ,full-name (,n-arg) ,doc + (let ((,rx-str-arg + (perf-generate-regexp-strings + ,num-patterns-sym + ,regexp-len-sym + ,kind))) + ,@body)))))) + +(defmacro perf-define-parameterized-compile-regexp-test + (base-name args &optional doc &rest body) + "Define a pair of test cases with pre-compiled regexp patterns as well +as raw strings (which get compiled and cached on the fly). + +NB: compilation time via `perf-compile-regexps' ('compile-fun' in the +implementation) is *not* tracked in these generated benchmark tests, +while any just-in-time regex compilation from pattern strings *is* +tracked in these benchmark timings. This is intentional." + (declare (indent 2) (debug defun)) + (unless (and (consp args) + (= (length args) 2)) + (error "Base function %s should accept exactly two arguments." + base-name)) + (let ((all-variants + '((compiled . perf-compile-regexps) + (no-compile . nil)))) + `(progn + ,@(cl-loop + with base-str = (symbol-name base-name) + with (rx-str-arg n-arg) = args + for (ext-name . maybe-compile-fun) in all-variants + for ext-str = (symbol-name ext-name) + for full-name = (intern (format "%s/%s" base-str ext-str)) + collect `(perf-define-parameterized-regexp-strings-test + ,full-name (,rx-str-arg ,n-arg) ,doc + ,@(if maybe-compile-fun + `((let ((,rx-str-arg + (,maybe-compile-fun ,rx-str-arg))) + ,@body)) + body)))))) + + +;; +============================================================+ +;; | Matching performance tests without recording any match data. +;; +============================================================+ + +(perf-define-parameterized-compile-regexp-test + perf-match/no-match-data/buffer (regexps n) + "Generate random regexps and repeatedly regex search a random buffer." + (perf-with-random-buffer (* n perf-buffer-base-len) + (benchmark-run perf-num-regexp-match-loops + (cl-loop for r across regexps + do (save-excursion + (re-search-forward r nil t nil t)))))) + +(perf-define-parameterized-compile-regexp-test + perf-match/no-match-data/string (regexps n) + "Generate random regexps and repeatedly regex search a random string." + (let ((haystack (perf-random-string (* n perf-string-base-len)))) + (benchmark-run perf-num-regexp-match-loops + (cl-loop for r across regexps + do (string-match r haystack nil t))))) + + +;; +============================================================+ +;; | Match data manipulation. +;; +============================================================+ +(defconst perf-num-match-data-loops 600 + "Number of times to extract and reset match data in a test.") + +(defun perf-generate-simple-consecutive-groups-pattern (num-groups) + "Create a regex pattern with NUM-GROUPS subexpressions, each matching +a single '.' (any character except newline)." + (rx-to-string + (cl-loop for i from 1 upto num-groups + collecting '(group not-newline) into r + finally return `(: ,@r)) + t)) + +(perf-define-variable-test perf-match/match-data/string/legacy (n) + (let* ((haystack (perf-random-string (* n perf-string-base-len))) + (num-groups (max n 4)) + (r (perf-generate-simple-consecutive-groups-pattern num-groups)) + (m '(0 0))) + (benchmark-run perf-num-match-data-loops + (cl-assert (string-match r haystack nil nil)) + (match-data t m nil) + (cl-assert (and (= (cl-first m) 0) + (= (cl-second m) (1- (length haystack))) + (= (cl-third m) 0) + (= (cl-fourth m) 1))) + (cl-assert (string-match "" haystack)) + (match-data t m nil) + (cl-assert (and (= (cl-first m) 0) + (= (cl-second m) 0) + (null (cl-third m))))))) + +(perf-define-variable-test perf-match/match-data/string/match-vectors (n) + (let* ((haystack (perf-random-string (* n perf-string-base-len))) + (num-groups (max n 4)) + (r (make-regexp + (perf-generate-simple-consecutive-groups-pattern num-groups))) + (r-blank (make-regexp "")) + (starts (match-allocate-results r)) + (ends (match-allocate-results r))) + (benchmark-run perf-num-match-data-loops + (cl-assert (string-match r haystack nil nil)) + (match-extract-starts r starts) + (match-extract-ends r ends) + (cl-assert (= (length starts) (match-num-registers r))) + (cl-assert (= (length ends) (match-num-registers r))) + (cl-assert (and (= (aref starts 0) 0) + (> (aref ends 0) 0) + (> (aref starts 1) (aref ends 0)))) + (cl-assert (string-match r-blank haystack nil nil)) + (match-extract-starts r-blank starts) + (match-extract-ends r-blank ends) + (cl-assert (and (= (aref starts 0) 0) + (= (aref ends 0) 0)))))) -- 2.45.2 From fa3bbeeca09f404de965f3e5f0c49ce6269ccb37 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Mon, 5 Aug 2024 03:06:46 -0400 Subject: [PATCH 2/3] configure gdbinit for regexp and match objects --- src/.gdbinit | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/src/.gdbinit b/src/.gdbinit index 0f55cc18699..434dc909987 100644 --- a/src/.gdbinit +++ b/src/.gdbinit @@ -926,6 +926,54 @@ Set $ as a hash table pointer. This command assumes that $ is an Emacs Lisp hash table value. end +# TODO: figure out how to print re_pattern_buffer! +define xregexp + xgetptr $ + print (struct Lisp_Regexp *) $ptr + xgetptr $->pattern + printf "pattern = %s\n", $ptr ? (char *) ((struct Lisp_String *) $ptr)->u.s.data : "DEAD" + printf "posix = %d\n", $->posix +end +document xregexp +Set $ as a regexp pointer and print the pattern it was compiled from. +This command assumes $ is an Emacs Lisp compiled regexp value. +end + +# TODO: figure out how to print re_registers! +define xmatch + xgetptr $ + print (struct Lisp_Match *) $ptr + set $m = $ + xgettype $m->haystack + xgetptr $m->haystack + set $h = $ptr + xgetptr Qnil + set $nil = $ptr + if $type == Lisp_String + print (struct Lisp_String *) $h + printf "haystack = \"%s\"\n", $h ? (char *) ((struct Lisp_String *) $h)->u.s.data : "DEAD" + end + if $type == Lisp_Vectorlike + set $size = ((struct Lisp_Vector *) $h)->header.size + if ($size & PSEUDOVECTOR_FLAG) + set $vec = (enum pvec_type) (($size & PVEC_TYPE_MASK) >> PSEUDOVECTOR_AREA_BITS) + if $vec == PVEC_BUFFER + print (struct buffer *) $h + xgetptr $->name_ + printf "haystack = \n", $ptr ? (char *) ((struct Lisp_String *) $ptr)->u.s.data : "DEAD" + end + end + end + if $h == $nil + printf "haystack = nil\n" + end + printf "initialized_regs = %ld\n", $m->initialized_regs +end +document xmatch +Set $ as a regexp match pointer. +This command assumes $ is an Emacs Lisp regexp match value. +end + define xtsparser xgetptr $ print (struct Lisp_TS_Parser *) $ptr @@ -1099,6 +1147,12 @@ define xpr if $vec == PVEC_HASH_TABLE xhashtable end + if $vec == PVEC_REGEXP + xregexp + end + if $vec == PVEC_MATCH + xmatch + end if $vec == PVEC_TS_PARSER xtsparser end -- 2.45.2 From e2207e7f2d9724bb4471ad0fe4a78e2c49209189 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Tue, 6 Aug 2024 07:42:10 -0400 Subject: [PATCH 3/3] convert Lisp_Regexp->buffer and Lisp_Match->regs to inline over pointers --- src/alloc.c | 67 ++++++++++++--------- src/lisp.h | 9 ++- src/pdumper.c | 83 +++++++++++++++----------- src/print.c | 10 ++-- src/regex-emacs.c | 82 ++++++++++++++------------ src/regex-emacs.h | 147 +++++++++++++++++++++++++--------------------- src/search.c | 19 +++--- src/thread.h | 16 ++++- 8 files changed, 250 insertions(+), 183 deletions(-) diff --git a/src/alloc.c b/src/alloc.c index de7154e7575..7cf711da0dc 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -3470,20 +3470,15 @@ cleanup_vector (struct Lisp_Vector *vector) case PVEC_REGEXP: { struct Lisp_Regexp *r = PSEUDOVEC_STRUCT (vector, Lisp_Regexp); - eassert (r->buffer->allocated > 0); - eassert (r->buffer->used <= r->buffer->allocated); - xfree (r->buffer->buffer); - xfree (r->buffer->fastmap); - xfree (r->buffer); + xfree (r->buffer.buffer); + xfree (r->buffer.fastmap); } break; case PVEC_MATCH: { struct Lisp_Match *m = PSEUDOVEC_STRUCT (vector, Lisp_Match); - eassert (m->regs->num_regs > 0); - xfree (m->regs->start); - xfree (m->regs->end); - xfree (m->regs); + xfree (m->regs.start); + xfree (m->regs.end); } break; case PVEC_OBARRAY: @@ -5900,18 +5895,28 @@ make_pure_c_string (const char *data, ptrdiff_t nchars) static Lisp_Object purecopy (Lisp_Object obj); -static struct re_pattern_buffer * +static struct re_pattern_buffer make_pure_re_pattern_buffer (const struct re_pattern_buffer *bufp) { - struct re_pattern_buffer *pure = pure_alloc (sizeof *pure, -1); - *pure = *bufp; + struct re_pattern_buffer pure = *bufp; - pure->buffer = pure_alloc (bufp->used, -1); - memcpy (pure->buffer, bufp->buffer, bufp->used); - pure->allocated = bufp->used; - pure->fastmap = pure_alloc (FASTMAP_SIZE, -1); - memcpy (pure->fastmap, bufp->fastmap, FASTMAP_SIZE); - pure->translate = purecopy (bufp->translate); + pure.buffer = pure_alloc (bufp->used, -1); + memcpy (pure.buffer, bufp->buffer, bufp->used); + pure.allocated = bufp->used; + /* The fastmap is sometimes unset when using the `regexp_cache'. */ + if (pure.fastmap) + { + pure.fastmap = pure_alloc (FASTMAP_SIZE, -1); + memcpy (pure.fastmap, bufp->fastmap, FASTMAP_SIZE); + } + /* `translate' being NULL produces an invalid object from + `make_lisp_ptr()'. */ + if (pure.translate) + { + const Lisp_Object translate = + make_lisp_ptr (bufp->translate, Lisp_Vectorlike); + pure.translate = XLP (purecopy (translate)); + } return pure; } @@ -5926,22 +5931,21 @@ make_pure_regexp (const struct Lisp_Regexp *r) pure->whitespace_regexp = purecopy (r->whitespace_regexp); pure->syntax_table = purecopy (r->syntax_table); pure->default_match_target = purecopy (r->default_match_target); - pure->buffer = make_pure_re_pattern_buffer (r->buffer); + pure->buffer = make_pure_re_pattern_buffer (&r->buffer); return make_lisp_ptr (pure, Lisp_Vectorlike); } -static struct re_registers * +static struct re_registers make_pure_re_registers (const struct re_registers *regs) { - struct re_registers *pure = pure_alloc (sizeof *pure, -1); - *pure = *regs; + struct re_registers pure = *regs; ptrdiff_t reg_size = regs->num_regs * sizeof (ptrdiff_t); - pure->start = pure_alloc (reg_size, -1); - memcpy (pure->start, regs->start, reg_size); - pure->end = pure_alloc (reg_size, -1); - memcpy (pure->end, regs->end, reg_size); + pure.start = pure_alloc (reg_size, -1); + memcpy (pure.start, regs->start, reg_size); + pure.end = pure_alloc (reg_size, -1); + memcpy (pure.end, regs->end, reg_size); return pure; } @@ -5953,7 +5957,7 @@ make_pure_match (const struct Lisp_Match *m) *pure = *m; pure->haystack = purecopy (m->haystack); - pure->regs = make_pure_re_registers (m->regs); + pure->regs = make_pure_re_registers (&m->regs); return make_lisp_ptr (pure, Lisp_Vectorlike); } @@ -7401,7 +7405,14 @@ #define CHECK_ALLOCATED_AND_LIVE_SYMBOL() ((void) 0) mark_stack_push_values (ptr->contents, size); struct Lisp_Regexp *r = (struct Lisp_Regexp *)ptr; - mark_stack_push_value (r->buffer->translate); + /* `translate' being NULL produces an invalid object from + `make_lisp_ptr()'. */ + if (r->buffer.translate) + { + Lisp_Object translate = make_lisp_ptr + (r->buffer.translate, Lisp_Vectorlike); + mark_stack_push_value (translate); + } break; } diff --git a/src/lisp.h b/src/lisp.h index 8f7e5fa1daa..b4d477c29a8 100644 --- a/src/lisp.h +++ b/src/lisp.h @@ -1368,7 +1368,10 @@ clip_to_bounds (intmax_t lower, intmax_t num, intmax_t upper) } /* Construct a Lisp_Object from a value or address. */ - +/* NB: this produces an invalid Lisp_Object which causes segfaults in + gc, pdumping, and purecopying if invoked with a null reference and + Lisp_Vectorlike as the type. Adding a check for NULL and returning + Qnil produces a different segfault when allocating vectors. */ INLINE Lisp_Object make_lisp_ptr (void *ptr, enum Lisp_Type type) { @@ -3008,7 +3011,7 @@ xmint_pointer (Lisp_Object a) Lisp_Object syntax_table; Lisp_Object default_match_target; bool posix; - struct re_pattern_buffer *buffer; + struct re_pattern_buffer buffer; } GCALIGNED_STRUCT; struct Lisp_Match @@ -3016,7 +3019,7 @@ xmint_pointer (Lisp_Object a) union vectorlike_header header; Lisp_Object haystack; ptrdiff_t initialized_regs; - struct re_registers *regs; + struct re_registers regs; } GCALIGNED_STRUCT; struct Lisp_User_Ptr diff --git a/src/pdumper.c b/src/pdumper.c index 3c70e590d54..2891b2b3d6f 100644 --- a/src/pdumper.c +++ b/src/pdumper.c @@ -1935,7 +1935,11 @@ dump_field_fixup_later (struct dump_context *ctx, const void *in_start, const void *in_field) { - /* TODO: more error checking. */ + /* TODO: more error checking. Possibly adding a counter to + `dump_context' to ensure as many fields are actually dumped after + `dump_object_finish()' as "declared" with this method? Maybe some + sort of check for field size making of the known-valid allocation + from the input field? */ (void) field_relpos (in_start, in_field); } @@ -2158,19 +2162,32 @@ dump_marker (struct dump_context *ctx, const struct Lisp_Marker *marker) static dump_off dump_re_pattern_buffer (struct dump_context *ctx, const struct re_pattern_buffer *bufp) { -#if CHECK_STRUCTS && !defined (HASH_re_pattern_buffer_36714DF24A) +#if CHECK_STRUCTS && !defined (HASH_re_pattern_buffer_31B62CF6F6) # error "re_pattern_buffer changed. See CHECK_STRUCTS comment in config.h." #endif struct re_pattern_buffer out; dump_object_start (ctx, &out, sizeof (out)); - if (bufp->buffer) - dump_field_fixup_later (ctx, &out, bufp, &bufp->buffer); + + eassert (bufp->buffer != NULL); + dump_field_fixup_later (ctx, &out, bufp, &bufp->buffer); + DUMP_FIELD_COPY (&out, bufp, allocated); DUMP_FIELD_COPY (&out, bufp, used); DUMP_FIELD_COPY (&out, bufp, charset_unibyte); + + /* `fastmap' is sometimes unset when used with the `regexp_cache'. */ if (bufp->fastmap) dump_field_fixup_later (ctx, &out, bufp, &bufp->fastmap); - dump_field_lv (ctx, &out, bufp, &bufp->translate, WEIGHT_NORMAL); + + /* `translate' being NULL produces an invalid object from + `make_lisp_ptr()'. */ + if (bufp->translate) + { + const Lisp_Object translate = + make_lisp_ptr ((void *) &bufp->translate, Lisp_Vectorlike); + dump_field_lv (ctx, &out, bufp, &translate, WEIGHT_NORMAL); + } + DUMP_FIELD_COPY (&out, bufp, re_nsub); DUMP_FIELD_COPY (&out, bufp, can_be_null); DUMP_FIELD_COPY (&out, bufp, regs_allocated); @@ -2178,30 +2195,29 @@ dump_re_pattern_buffer (struct dump_context *ctx, const struct re_pattern_buffer DUMP_FIELD_COPY (&out, bufp, used_syntax); DUMP_FIELD_COPY (&out, bufp, multibyte); DUMP_FIELD_COPY (&out, bufp, target_multibyte); + dump_off offset = dump_object_finish (ctx, &out, sizeof (out)); - if (bufp->buffer) - { - eassert (bufp->allocated > 0); - if (bufp->allocated > DUMP_OFF_MAX - 1) - error ("regex pattern buffer too large"); - dump_off total_size = ptrdiff_t_to_dump_off (bufp->allocated); - eassert (total_size > 0); - dump_cold_bytes - (ctx, - offset + dump_offsetof (struct re_pattern_buffer, buffer), - bufp->buffer, - total_size); - } + + /* Now scan the dynamically-allocated sections. */ + eassert (bufp->allocated > 0); + if (bufp->allocated > DUMP_OFF_MAX - 1) + error ("regex pattern buffer too large"); + dump_off compiled_matcher_size = + ptrdiff_t_to_dump_off (bufp->allocated); + eassert (compiled_matcher_size > 0); + dump_cold_bytes + (ctx, + offset + dump_offsetof (struct re_pattern_buffer, buffer), + bufp->buffer, + compiled_matcher_size); + if (bufp->fastmap) - { - eassert (FASTMAP_SIZE <= DUMP_OFF_MAX - 1); - dump_off fastmap_size = FASTMAP_SIZE; - dump_cold_bytes - (ctx, - offset + dump_offsetof (struct re_pattern_buffer, fastmap), - bufp->fastmap, - fastmap_size); - } + dump_cold_bytes + (ctx, + offset + dump_offsetof (struct re_pattern_buffer, fastmap), + bufp->fastmap, + FASTMAP_SIZE); + return offset; } @@ -2238,7 +2254,7 @@ dump_re_registers (struct dump_context *ctx, const struct re_registers *regs) static dump_off dump_match (struct dump_context *ctx, const struct Lisp_Match *match) { -#if CHECK_STRUCTS && !defined (HASH_Lisp_Match_EE9D54EA09) +#if CHECK_STRUCTS && !defined (HASH_Lisp_Match_6EDCF0675B) # error "Lisp_Match changed. See CHECK_STRUCTS comment in config.h." #endif START_DUMP_PVEC (ctx, &match->header, struct Lisp_Match, out); @@ -2248,14 +2264,14 @@ dump_match (struct dump_context *ctx, const struct Lisp_Match *match) dump_remember_fixup_ptr_raw (ctx, offset + dump_offsetof (struct Lisp_Match, regs), - dump_re_registers (ctx, match->regs)); + dump_re_registers (ctx, &match->regs)); return offset; } static dump_off dump_regexp (struct dump_context *ctx, const struct Lisp_Regexp *regexp) { -#if CHECK_STRUCTS && !defined (HASH_Lisp_Regexp_29DF51A9AC) +#if CHECK_STRUCTS && !defined (HASH_Lisp_Regexp_C700C2DF42) # error "Lisp_Regexp changed. See CHECK_STRUCTS comment in config.h." #endif START_DUMP_PVEC (ctx, ®exp->header, struct Lisp_Regexp, out); @@ -2266,7 +2282,7 @@ dump_regexp (struct dump_context *ctx, const struct Lisp_Regexp *regexp) dump_remember_fixup_ptr_raw (ctx, offset + dump_offsetof (struct Lisp_Regexp, buffer), - dump_re_pattern_buffer (ctx, regexp->buffer)); + dump_re_pattern_buffer (ctx, ®exp->buffer)); return offset; } @@ -2280,11 +2296,10 @@ dump_interval_node (struct dump_context *ctx, struct itree_node *node) dump_object_start (ctx, &out, sizeof (out)); if (node->parent) dump_field_fixup_later (ctx, &out, node, &node->parent); - /* FIXME: should these both be &node->{left,right} instead of &node->parent? */ if (node->left) - dump_field_fixup_later (ctx, &out, node, &node->parent); + dump_field_fixup_later (ctx, &out, node, &node->left); if (node->right) - dump_field_fixup_later (ctx, &out, node, &node->parent); + dump_field_fixup_later (ctx, &out, node, &node->right); DUMP_FIELD_COPY (&out, node, begin); DUMP_FIELD_COPY (&out, node, end); DUMP_FIELD_COPY (&out, node, limit); diff --git a/src/print.c b/src/print.c index c87630640c3..337974a1b07 100644 --- a/src/print.c +++ b/src/print.c @@ -2089,10 +2089,10 @@ print_vectorlike_unreadable (Lisp_Object obj, Lisp_Object printcharfun, struct Lisp_Regexp *r = XREGEXP (obj); print_c_string ("#buffer->re_nsub); + int i = sprintf (buf, " nsub=%ld", r->buffer.re_nsub); strout (buf, i, i, printcharfun); print_c_string (" translate=", printcharfun); - print_object (r->buffer->translate, printcharfun, escapeflag); + print_object (r->buffer.translate, printcharfun, escapeflag); print_c_string (" whitespace=", printcharfun); print_object (r->whitespace_regexp, printcharfun, escapeflag); print_c_string (" syntax_table=", printcharfun); @@ -2112,7 +2112,7 @@ print_vectorlike_unreadable (Lisp_Object obj, Lisp_Object printcharfun, case PVEC_MATCH: { struct Lisp_Match *m = XMATCH (obj); - ptrdiff_t num_regs = m->regs->num_regs; + ptrdiff_t num_regs = m->regs.num_regs; ptrdiff_t initialized_regs = m->initialized_regs; print_c_string ("#regs->start[reg_index], - m->regs->end[reg_index]); + m->regs.start[reg_index], + m->regs.end[reg_index]); strout (buf, i, i, printcharfun); } print_c_string ("]>", printcharfun); diff --git a/src/regex-emacs.c b/src/regex-emacs.c index 8fe76a70ba9..1f1a5d2e249 100644 --- a/src/regex-emacs.c +++ b/src/regex-emacs.c @@ -36,6 +36,17 @@ #include "category.h" #include "dispextern.h" +/* See comment in header about compiler error from attempting to define + this inline. */ +struct regexp_match_info +make_full_match_info (struct Lisp_Match *match) +{ + /* Ensure we're not using NULL to mean an optional value. */ + eassert (match != NULL); + struct regexp_match_info ret = { .regs = &match->regs, .match = match }; + return ret; +} + /* Maximum number of duplicates an interval can allow. Some systems define this in other header files, but we want our value, so remove any previous define. Repeat counts are stored in opcodes as 2-byte @@ -5395,7 +5406,7 @@ DEFUN ("make-regexp", Fmake_regexp, Smake_regexp, 1, 3, 0, char *val = NULL; bool is_posix = !NILP (posix); struct Lisp_Regexp *p = NULL; - struct re_pattern_buffer *bufp = NULL; + struct re_pattern_buffer re_buf = { 0 }; if (!NILP (Vsearch_spaces_regexp)) { @@ -5410,21 +5421,18 @@ DEFUN ("make-regexp", Fmake_regexp, Smake_regexp, 1, 3, 0, CHECK_STRING (pattern); - bufp = xzalloc (sizeof (*bufp)); - - bufp->fastmap = xzalloc (FASTMAP_SIZE); - bufp->translate = translate; - bufp->multibyte = STRING_MULTIBYTE (pattern); - bufp->charset_unibyte = charset_unibyte; + re_buf.fastmap = xzalloc (FASTMAP_SIZE); + re_buf.translate = XLP (translate); + re_buf.multibyte = STRING_MULTIBYTE (pattern); + re_buf.charset_unibyte = charset_unibyte; val = (char *) re_compile_pattern (SSDATA (pattern), SBYTES (pattern), - is_posix, whitespace_regexp, bufp); + is_posix, whitespace_regexp, &re_buf); if (val) { - xfree (bufp->buffer); - xfree (bufp->fastmap); - xfree (bufp); + xfree (re_buf.buffer); + xfree (re_buf.fastmap); xsignal1 (Qinvalid_regexp, build_string (val)); } @@ -5434,16 +5442,16 @@ DEFUN ("make-regexp", Fmake_regexp, Smake_regexp, 1, 3, 0, p->whitespace_regexp = Vsearch_spaces_regexp; /* If the compiled pattern hard codes some of the contents of the syntax-table, it can only be reused with *this* syntax table. */ - p->syntax_table = bufp->used_syntax ? BVAR (current_buffer, syntax_table) : Qt; + p->syntax_table = re_buf.used_syntax ? BVAR (current_buffer, syntax_table) : Qt; p->posix = is_posix; /* Allocate the match data implicitly stored in this regexp. */ - p->default_match_target = allocate_match (bufp->re_nsub); + p->default_match_target = allocate_match (re_buf.re_nsub); /* Tell regex matching routines they do not need to allocate any further memory, since we have allocated it here in advance. */ - bufp->regs_allocated = REGS_FIXED; + re_buf.regs_allocated = REGS_FIXED; /* Fully initialize all fields. */ - p->buffer = bufp; + p->buffer = re_buf; return make_lisp_ptr (p, Lisp_Vectorlike); } @@ -5489,7 +5497,7 @@ DEFUN ("regexp-get-translation-table", Fregexp_get_translation_table, (Lisp_Object regexp) { CHECK_REGEXP (regexp); - return XREGEXP (regexp)->buffer->translate; + return make_lisp_ptr (XREGEXP (regexp)->buffer.translate, Lisp_Vectorlike); } DEFUN ("regexp-get-num-subexps", Fregexp_get_num_subexps, @@ -5498,7 +5506,7 @@ DEFUN ("regexp-get-num-subexps", Fregexp_get_num_subexps, (Lisp_Object regexp) { CHECK_REGEXP (regexp); - return make_fixnum (XREGEXP (regexp)->buffer->re_nsub); + return make_fixnum (XREGEXP (regexp)->buffer.re_nsub); } DEFUN ("regexp-get-default-match-data", Fregexp_get_default_match_data, @@ -5520,7 +5528,7 @@ reallocate_match_registers (ptrdiff_t re_nsub, struct Lisp_Match *m) { ptrdiff_t needed_regs = re_nsub + 1; eassert (needed_regs > 0); - struct re_registers *regs = m->regs; + struct re_registers *regs = &m->regs; /* If we need more elements than were already allocated, reallocate them. If we need fewer, just leave it alone. */ if (regs->num_regs < needed_regs) @@ -5550,10 +5558,9 @@ DEFUN ("regexp-set-default-match-data", Fregexp_set_default_match_data, CHECK_REGEXP (regexp); CHECK_MATCH (match); struct Lisp_Regexp *r = XREGEXP (regexp); - struct re_pattern_buffer *bufp = r->buffer; /* This should always be true for any compiled regexp. */ - eassert (bufp->regs_allocated == REGS_FIXED); + eassert (r->buffer.regs_allocated == REGS_FIXED); /* Overwrite the default target. */ r->default_match_target = match; @@ -5568,10 +5575,11 @@ allocate_match (ptrdiff_t re_nsub) /* Number of match registers always includes 0 for whole match. */ ptrdiff_t num_regs = re_nsub + 1; - struct re_registers *regs = xzalloc (sizeof (*regs)); - regs->num_regs = num_regs; - regs->start = xnmalloc (num_regs, sizeof (*regs->start)); - regs->end = xnmalloc (num_regs, sizeof (*regs->end)); + struct re_registers regs = { + .num_regs = num_regs, + .start = xnmalloc (num_regs, sizeof (*regs.start)), + .end = xnmalloc (num_regs, sizeof (*regs.end)), + }; /* Construct lisp match object. */ struct Lisp_Match *m = ALLOCATE_PSEUDOVECTOR @@ -5582,7 +5590,7 @@ allocate_match (ptrdiff_t re_nsub) m->haystack = Qnil; /* Initialize all values to -1 for "unset". */ for (ptrdiff_t reg_index = 0; reg_index < num_regs; ++reg_index) - regs->start[reg_index] = regs->end[reg_index] = RE_MATCH_EXP_UNSET; + regs.start[reg_index] = regs.end[reg_index] = RE_MATCH_EXP_UNSET; /* No successful match has occurred yet, so nothing is initialized. */ m->initialized_regs = 0; @@ -5597,7 +5605,7 @@ extract_re_nsub_arg (Lisp_Object regexp_or_num_registers) if (NILP (regexp_or_num_registers)) re_nsub = 0; else if (REGEXP_P (regexp_or_num_registers)) - re_nsub = XREGEXP (regexp_or_num_registers)->buffer->re_nsub; + re_nsub = XREGEXP (regexp_or_num_registers)->buffer.re_nsub; else { CHECK_FIXNAT (regexp_or_num_registers); @@ -5739,7 +5747,7 @@ DEFUN ("match-allocated-registers", Fmatch_allocated_registers, { struct Lisp_Match *match = extract_regexp_or_match (regexp_or_match); - return make_fixed_natnum (match->regs->num_regs); + return make_fixed_natnum (match->regs.num_regs); } static ptrdiff_t @@ -5760,8 +5768,8 @@ index_into_registers (struct Lisp_Match *m, ptrdiff_t group_index, error ("group %ld was out of bounds for match data with %ld registers", group_index, m->initialized_regs); return ((beginningp) - ? m->regs->start[group_index] - : m->regs->end[group_index]); + ? m->regs.start[group_index] + : m->regs.end[group_index]); } DEFUN ("match-register-start", Fmatch_register_start, Smatch_register_start, @@ -5772,8 +5780,8 @@ DEFUN ("match-register-start", Fmatch_register_start, Smatch_register_start, searched against without providing a match object via 'inhibit-modify', or a match object provided via 'inhibit-modify' to a search method. -GROUP, a number, specifies the parenthesized subexpression in the last - regexp for which to return the start position. +GROUP, a number, specifies the parenthesized subexpression in the regexp + last for which to return the start position. Value is nil if GROUPth subexpression didn't match, or there were fewer than GROUP subexpressions. GROUP zero or nil means the entire text matched by the whole regexp or whole @@ -5863,11 +5871,11 @@ DEFUN ("match-allocate-results", Fmatch_allocate_results, { ptrdiff_t result_length; if (REGEXP_P (regexp_or_match)) - result_length = 1 + XREGEXP (regexp_or_match)->buffer->re_nsub; + result_length = 1 + XREGEXP (regexp_or_match)->buffer.re_nsub; else { CHECK_MATCH (regexp_or_match); - result_length = XMATCH (regexp_or_match)->regs->num_regs; + result_length = XMATCH (regexp_or_match)->regs.num_regs; } out = ensure_match_result_vector (result_length, out); @@ -5910,7 +5918,7 @@ DEFUN ("match-extract-starts", Fmatch_extract_starts, Smatch_extract_starts, extract_required_vector_length (m, max_group); out = ensure_match_result_vector (result_length, out); - write_positions_to_vector (result_length, m->regs->start, out); + write_positions_to_vector (result_length, m->regs.start, out); return out; } @@ -5937,7 +5945,7 @@ DEFUN ("match-extract-ends", Fmatch_extract_ends, Smatch_extract_ends, extract_required_vector_length (m, max_group); out = ensure_match_result_vector (result_length, out); - write_positions_to_vector (result_length, m->regs->end, out); + write_positions_to_vector (result_length, m->regs.end, out); return out; } @@ -6014,7 +6022,7 @@ DEFUN ("match-extract-start-marks", Fmatch_extract_start_marks, extract_required_vector_length (m, max_group); out = ensure_match_result_vector (result_length, out); - write_marks_to_vector (result_length, m->regs->start, buffer, out); + write_marks_to_vector (result_length, m->regs.start, buffer, out); return out; } @@ -6056,7 +6064,7 @@ DEFUN ("match-extract-end-marks", Fmatch_extract_end_marks, extract_required_vector_length (m, max_group); out = ensure_match_result_vector (result_length, out); - write_marks_to_vector (result_length, m->regs->end, buffer, out); + write_marks_to_vector (result_length, m->regs.end, buffer, out); return out; } diff --git a/src/regex-emacs.h b/src/regex-emacs.h index 00beeaf1867..294643d6f2c 100644 --- a/src/regex-emacs.h +++ b/src/regex-emacs.h @@ -33,6 +33,77 @@ #define RE_MATCH_EXP_UNSET (-1) ptrdiff_t *end; }; + +/* This data structure represents a compiled pattern. Before calling + the pattern compiler, the fields 'buffer', 'allocated', 'fastmap', + and 'translate' can be set. After the pattern has been + compiled, the 're_nsub' field is available. All other fields are + private to the regex routines. */ +/* re_pattern_buffer is also used in lisp.h, and therefore needs to be + fully declared before the include. */ + +struct re_pattern_buffer +{ + /* Space that holds the compiled pattern. It is declared as + 'unsigned char *' because its elements are + sometimes used as array indexes. */ + unsigned char *buffer; + + /* Number of bytes to which 'buffer' points. */ + ptrdiff_t allocated; + + /* Number of bytes actually used in 'buffer'. */ + ptrdiff_t used; + + /* Charset of unibyte characters at compiling time. */ + int charset_unibyte; + + /* Pointer to a fastmap, if any, otherwise zero. re_search uses + the fastmap, if there is one, to skip over impossible + starting points for matches. */ + char *fastmap; + + /* Either a translate table to apply to all characters before + comparing them, or zero for no translation. The translation + applies to a pattern when it is compiled and to a string + when it is matched. */ + /* In order to use this struct in Lisp_Regexp in lisp.h, we + need to pretend it's a void pointer instead of + a Lisp_Object. */ + void *translate; + + /* Number of subexpressions found by the compiler. */ + ptrdiff_t re_nsub; + + /* True if and only if this pattern can match the empty string. + Well, in truth it's used only in 're_search_2', to see + whether or not we should use the fastmap, so we don't set + this absolutely perfectly; see 're_compile_fastmap'. */ + bool_bf can_be_null : 1; + + /* If REGS_UNALLOCATED, allocate space in the 'regs' structure + for at least (re_nsub + 1) groups. + If REGS_REALLOCATE, reallocate space if necessary. + If REGS_FIXED, use what's there. */ + unsigned regs_allocated : 2; + + /* Set to false when 'regex_compile' compiles a pattern; set to true + by 're_compile_fastmap' if it updates the fastmap. */ + bool_bf fastmap_accurate : 1; + + /* If true, the compilation of the pattern had to look up the syntax table, + so the compiled pattern is valid for the current syntax table only. */ + bool_bf used_syntax : 1; + + /* If true, multi-byte form in the regexp pattern should be + recognized as a multibyte character. */ + bool_bf multibyte : 1; + + /* If true, multi-byte form in the target of match should be + recognized as a multibyte character. */ + bool_bf target_multibyte : 1; +}; + #include "lisp.h" struct regexp_match_info @@ -50,13 +121,22 @@ empty_regexp_match_info (void) } INLINE struct regexp_match_info -make_regs_only_match_info (struct re_registers* regs) +make_regs_only_match_info (struct re_registers *regs) { + /* Ensure we're not using NULL to mean an optional value. */ + eassert (regs != NULL); struct regexp_match_info ret = { .regs = regs, .match = NULL }; return ret; } INLINE_HEADER_END +/* There's some ridiculous error "invalid use of undefined type + ‘struct Lisp_Match’" even though we just included lisp.h above here + where `struct Lisp_Match' is defined, sigh. Defined in + regex-emacs.c, luckily it doesn't matter for performance whether + this is inlined. */ +struct regexp_match_info make_full_match_info (struct Lisp_Match *match); + /* Defined in search.c. */ extern EMACS_INT search_buffer (Lisp_Object, ptrdiff_t, ptrdiff_t, ptrdiff_t, ptrdiff_t, EMACS_INT, @@ -87,71 +167,6 @@ #define FASTMAP_SIZE 0400 /* Amount of memory that we can safely stack allocate. */ extern ptrdiff_t emacs_re_safe_alloca; -/* This data structure represents a compiled pattern. Before calling - the pattern compiler, the fields 'buffer', 'allocated', 'fastmap', - and 'translate' can be set. After the pattern has been - compiled, the 're_nsub' field is available. All other fields are - private to the regex routines. */ - -struct re_pattern_buffer -{ - /* Space that holds the compiled pattern. It is declared as - 'unsigned char *' because its elements are - sometimes used as array indexes. */ - unsigned char *buffer; - - /* Number of bytes to which 'buffer' points. */ - ptrdiff_t allocated; - - /* Number of bytes actually used in 'buffer'. */ - ptrdiff_t used; - - /* Charset of unibyte characters at compiling time. */ - int charset_unibyte; - - /* Pointer to a fastmap, if any, otherwise zero. re_search uses - the fastmap, if there is one, to skip over impossible - starting points for matches. */ - char *fastmap; - - /* Either a translate table to apply to all characters before - comparing them, or zero for no translation. The translation - applies to a pattern when it is compiled and to a string - when it is matched. */ - Lisp_Object translate; - - /* Number of subexpressions found by the compiler. */ - ptrdiff_t re_nsub; - - /* True if and only if this pattern can match the empty string. - Well, in truth it's used only in 're_search_2', to see - whether or not we should use the fastmap, so we don't set - this absolutely perfectly; see 're_compile_fastmap'. */ - bool_bf can_be_null : 1; - - /* If REGS_UNALLOCATED, allocate space in the 'regs' structure - for at least (re_nsub + 1) groups. - If REGS_REALLOCATE, reallocate space if necessary. - If REGS_FIXED, use what's there. */ - unsigned regs_allocated : 2; - - /* Set to false when 'regex_compile' compiles a pattern; set to true - by 're_compile_fastmap' if it updates the fastmap. */ - bool_bf fastmap_accurate : 1; - - /* If true, the compilation of the pattern had to look up the syntax table, - so the compiled pattern is valid for the current syntax table only. */ - bool_bf used_syntax : 1; - - /* If true, multi-byte form in the regexp pattern should be - recognized as a multibyte character. */ - bool_bf multibyte : 1; - - /* If true, multi-byte form in the target of match should be - recognized as a multibyte character. */ - bool_bf target_multibyte : 1; -}; - /* Declarations for routines. */ /* Compile the regular expression PATTERN, with length LENGTH diff --git a/src/search.c b/src/search.c index cee245ab0b2..e54dc72e34f 100644 --- a/src/search.c +++ b/src/search.c @@ -286,11 +286,7 @@ resolve_match_info (Lisp_Object regexp, Lisp_Object match) when we compiled the regexp. */ m = XMATCH (r->default_match_target); } - struct regexp_match_info ret = { - .regs = m->regs, - .match = m, - }; - return ret; + return make_full_match_info (m); } /* If just a string, do the old complex logic to access thread-locals @@ -301,6 +297,15 @@ resolve_match_info (Lisp_Object regexp, Lisp_Object match) if (running_asynch_code) save_search_regs (); + /* NB: this returns the current copy of the thread-local variable, + which is saved in a (thread-local) `thread_state' struct + (`search_regs' expands to `current_thread->m_search_regs'). This + shouldn't change any behavior, as the same thread accessing + `search_regs' multiple times in sequence should always access + the same virtual memory address anyway, but is worth noting as + adding support for precompiled regexps generally required changing + many parts of matching logic which previously had to assume they + could be pre-empted and invalidated at any time. */ return make_regs_only_match_info (&search_regs); } @@ -393,7 +398,7 @@ resolve_explicit_compiled_regexp (Lisp_Object regexp, bool posix, { /* If the regexp is precompiled, then immediately return its compiled form. */ if (REGEXP_P (regexp)) - return XREGEXP (regexp)->buffer; + return &XREGEXP (regexp)->buffer; /* Otherwise, this is a string, and we have to compile it via the cache. */ CHECK_STRING (regexp); @@ -665,7 +670,7 @@ fast_c_string_match_internal (Lisp_Object regexp, specpdl_ref count = SPECPDL_INDEX (); if (REGEXP_P (regexp)) - eassert (!XREGEXP (regexp)->buffer->target_multibyte); + eassert (!XREGEXP (regexp)->buffer.target_multibyte); /* FIXME: This is expensive and not obviously correct when it makes a difference. I.e., no longer "fast", and may hide bugs. Something should be done about this. */ diff --git a/src/thread.h b/src/thread.h index eaa7b265168..942edf4d770 100644 --- a/src/thread.h +++ b/src/thread.h @@ -146,11 +146,19 @@ #define current_buffer (current_thread->m_current_buffer) /* Every call to re_search, etc., must pass &search_regs as the regs argument unless you can show it is unnecessary (i.e., if re_search is certainly going to be called again before region-around-match - can be called). + can be called). However, this is unnecessary if using precompiled + regexp and match objects with `make-regexp' and `make-match-data'. Since the registers are now dynamically allocated, we need to make sure not to refer to the Nth register before checking that it has - been allocated by checking search_regs.num_regs. + been allocated by checking search_regs.num_regs. This is *slightly* + less necessary with precompiled regexp and match objects, since: + (1) precompiled regexps know how many subexpressions they need, so + they can set this to REGS_FIXED, + (2) further DEFUNs in regex-emacs.c allow explicit lisp-level + control over allocating match object size, + but we still generally rely on the search methods checking + .num_regs, just not specifically from the search_regs thread-local. The regex code keeps track of whether it has allocated the search buffer using bits in the re_pattern_buffer. This means that whenever @@ -159,7 +167,9 @@ #define current_buffer (current_thread->m_current_buffer) time you call a searching or matching function. Therefore, we need to call re_set_registers after compiling a new pattern or after setting the match registers, so that the regex functions will be - able to free or re-allocate it properly. */ + able to free or re-allocate it properly. This is not an issue for + precompiled regexp objects, which set REGS_FIXED and can therefore + avoid dynamic allocation of match objects during matching. */ struct re_registers m_search_regs; #define search_regs (current_thread->m_search_regs) -- 2.45.2