diff --git a/src/regex-emacs.c b/src/regex-emacs.c index 746779490ad..f75f805cd9c 100644 --- a/src/regex-emacs.c +++ b/src/regex-emacs.c @@ -3979,7 +3979,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp, /* Prevent shrinking and relocation of buffer text if GC happens while we are inside this function. The calls to - UPDATE_SYNTAX_TABLE_* macros can call Lisp (via + RE_UPDATE_SYNTAX_TABLE_* macros can call Lisp (via `internal--syntax-propertize`); these calls are careful to defend against buffer modifications, but even with no modifications, the buffer text may be relocated during GC by `compact_buffer` which would invalidate @@ -4792,12 +4792,11 @@ re_match_2_internal (struct re_pattern_buffer *bufp, int s1, s2; int dummy; ptrdiff_t offset = POINTER_TO_OFFSET (d); - ptrdiff_t charpos = RE_SYNTAX_TABLE_BYTE_TO_CHAR (offset) - 1; - UPDATE_SYNTAX_TABLE (charpos); + RE_UPDATE_SYNTAX_TABLE_BEFORE (offset); GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2); nchars++; s1 = SYNTAX (c1); - UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1); + RE_UPDATE_SYNTAX_TABLE_FORWARD (offset); PREFETCH_NOLIMIT (); GET_CHAR_AFTER (c2, d, dummy); nchars++; @@ -4832,8 +4831,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp, int s1, s2; int dummy; ptrdiff_t offset = POINTER_TO_OFFSET (d); - ptrdiff_t charpos = RE_SYNTAX_TABLE_BYTE_TO_CHAR (offset); - UPDATE_SYNTAX_TABLE (charpos); + RE_UPDATE_SYNTAX_TABLE (offset); PREFETCH (); GET_CHAR_AFTER (c2, d, dummy); nchars++; @@ -4848,7 +4846,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp, { GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2); nchars++; - UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1); + RE_UPDATE_SYNTAX_TABLE_BACKWARD_BEFORE (offset); s1 = SYNTAX (c1); /* ... and S1 is Sword, and WORD_BOUNDARY_P (C1, C2) @@ -4875,8 +4873,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp, int s1, s2; int dummy; ptrdiff_t offset = POINTER_TO_OFFSET (d); - ptrdiff_t charpos = RE_SYNTAX_TABLE_BYTE_TO_CHAR (offset) - 1; - UPDATE_SYNTAX_TABLE (charpos); + RE_UPDATE_SYNTAX_TABLE_BEFORE (offset); GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2); nchars++; s1 = SYNTAX (c1); @@ -4891,13 +4888,13 @@ re_match_2_internal (struct re_pattern_buffer *bufp, PREFETCH_NOLIMIT (); GET_CHAR_AFTER (c2, d, dummy); nchars++; - UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1); + RE_UPDATE_SYNTAX_TABLE_FORWARD (offset); s2 = SYNTAX (c2); /* ... and S2 is Sword, and WORD_BOUNDARY_P (C1, C2) returns 0. */ if ((s2 == Sword) && !WORD_BOUNDARY_P (c1, c2)) - goto fail; + goto fail; } } break; @@ -4917,8 +4914,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp, int c1, c2; int s1, s2; ptrdiff_t offset = POINTER_TO_OFFSET (d); - ptrdiff_t charpos = RE_SYNTAX_TABLE_BYTE_TO_CHAR (offset); - UPDATE_SYNTAX_TABLE (charpos); + RE_UPDATE_SYNTAX_TABLE (offset); PREFETCH (); c2 = RE_STRING_CHAR (d, target_multibyte); nchars++; @@ -4933,7 +4929,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp, { GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2); nchars++; - UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1); + RE_UPDATE_SYNTAX_TABLE_BACKWARD_BEFORE (offset); s1 = SYNTAX (c1); /* ... and S1 is Sword or Ssymbol. */ @@ -4958,8 +4954,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp, int c1, c2; int s1, s2; ptrdiff_t offset = POINTER_TO_OFFSET (d); - ptrdiff_t charpos = RE_SYNTAX_TABLE_BYTE_TO_CHAR (offset) - 1; - UPDATE_SYNTAX_TABLE (charpos); + RE_UPDATE_SYNTAX_TABLE_BEFORE (offset); GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2); nchars++; s1 = SYNTAX (c1); @@ -4974,7 +4969,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp, PREFETCH_NOLIMIT (); c2 = RE_STRING_CHAR (d, target_multibyte); nchars++; - UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1); + RE_UPDATE_SYNTAX_TABLE_FORWARD (offset); s2 = SYNTAX (c2); /* ... and S2 is Sword or Ssymbol. */ @@ -4994,8 +4989,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp, PREFETCH (); { ptrdiff_t offset = POINTER_TO_OFFSET (d); - ptrdiff_t pos1 = RE_SYNTAX_TABLE_BYTE_TO_CHAR (offset); - UPDATE_SYNTAX_TABLE (pos1); + RE_UPDATE_SYNTAX_TABLE (offset); } { int len; diff --git a/src/syntax.c b/src/syntax.c index e9e04e2d638..0245038dc2d 100644 --- a/src/syntax.c +++ b/src/syntax.c @@ -250,6 +250,8 @@ SETUP_SYNTAX_TABLE (ptrdiff_t from, ptrdiff_t count) gl_state.b_property = BEGV; gl_state.e_property = ZV + 1; gl_state.object = Qnil; + gl_state.b_re_byte = -1; + gl_state.e_re_byte = -1; if (parse_sexp_lookup_properties) { if (count > 0) @@ -268,11 +270,12 @@ SETUP_SYNTAX_TABLE (ptrdiff_t from, ptrdiff_t count) FROMBYTE is an regexp-byteoffset. */ void -RE_SETUP_SYNTAX_TABLE_FOR_OBJECT (Lisp_Object object, - ptrdiff_t frombyte) +RE_SETUP_SYNTAX_TABLE_FOR_OBJECT (Lisp_Object object, ptrdiff_t frombyte) { SETUP_BUFFER_SYNTAX_TABLE (); gl_state.object = object; + gl_state.b_re_byte = -1; + gl_state.e_re_byte = -1; if (BUFFERP (gl_state.object)) { struct buffer *buf = XBUFFER (gl_state.object); @@ -282,7 +285,7 @@ RE_SETUP_SYNTAX_TABLE_FOR_OBJECT (Lisp_Object object, else if (NILP (gl_state.object)) { gl_state.b_property = BEG; - gl_state.e_property = ZV; /* FIXME: Why not +1 like in SETUP_SYNTAX_TABLE? */ + gl_state.e_property = ZV; } else if (EQ (gl_state.object, Qt)) { @@ -295,8 +298,11 @@ RE_SETUP_SYNTAX_TABLE_FOR_OBJECT (Lisp_Object object, gl_state.e_property = 1 + SCHARS (gl_state.object); } if (parse_sexp_lookup_properties) - update_syntax_table (RE_SYNTAX_TABLE_BYTE_TO_CHAR (frombyte), - 1, 1, gl_state.object); + { + update_syntax_table (RE_SYNTAX_TABLE_BYTE_TO_CHAR (frombyte), + 1, 1, gl_state.object); + re_update_byteoffsets (); + } } /* Update gl_state to an appropriate interval which contains CHARPOS. The diff --git a/src/syntax.h b/src/syntax.h index 01982be25a0..3e20952053b 100644 --- a/src/syntax.h +++ b/src/syntax.h @@ -66,7 +66,7 @@ #define Vstandard_syntax_table BVAR (&buffer_defaults, syntax_table) struct gl_state_s { Lisp_Object object; /* The object we are scanning. */ - ptrdiff_t start; /* Where to stop. */ + ptrdiff_t start; /* Where to stop(?FIXME?). */ ptrdiff_t stop; /* Where to stop. */ bool use_global; /* Whether to use global_code or c_s_t. */ @@ -85,6 +85,11 @@ #define Vstandard_syntax_table BVAR (&buffer_defaults, syntax_table) and possibly at the intervals too, depending on: */ + /* The regexp engine prefers byteoffsets over char positions, so + store those to try and reduce the number of byte<->char conversions. + This is only kept uptodate when used from the regexp engine. */ + ptrdiff_t b_re_byte; /* First byteoffset where c_s_t is valid. */ + ptrdiff_t e_re_byte; /* First byteoffset where c_s_t is not valid. */ }; extern struct gl_state_s gl_state; @@ -145,19 +150,14 @@ SYNTAX (int c) extern unsigned char const syntax_spec_code[0400]; -/* Convert the regexp's BYTEOFFSET into a character position, - for the object recorded in gl_state with RE_SETUP_SYNTAX_TABLE_FOR_OBJECT. - - The value is meant for use in code that does nothing when - parse_sexp_lookup_properties is false, so return 0 in that case, - for speed. */ +/* Convert the regexp's BYTEOFFSET into a character position, for + the object recorded in gl_state with RE_SETUP_SYNTAX_TABLE_FOR_OBJECT. */ INLINE ptrdiff_t RE_SYNTAX_TABLE_BYTE_TO_CHAR (ptrdiff_t byteoffset) { - return (! parse_sexp_lookup_properties - ? 0 - : STRINGP (gl_state.object) + eassert (parse_sexp_lookup_properties); + return (STRINGP (gl_state.object) ? string_byte_to_char (gl_state.object, byteoffset) : BUFFERP (gl_state.object) ? ((buf_bytepos_to_charpos @@ -168,6 +168,44 @@ RE_SYNTAX_TABLE_BYTE_TO_CHAR (ptrdiff_t byteoffset) : byteoffset); } +INLINE ptrdiff_t +RE_SYNTAX_TABLE_CHAR_TO_BYTE (ptrdiff_t charpos) +{ + eassert (parse_sexp_lookup_properties); + return (STRINGP (gl_state.object) + ? string_char_to_byte (gl_state.object, charpos) + : BUFFERP (gl_state.object) + ? ((buf_charpos_to_bytepos + (XBUFFER (gl_state.object), charpos) + - BUF_BEGV_BYTE (XBUFFER (gl_state.object)))) + : NILP (gl_state.object) + ? CHAR_TO_BYTE (charpos) - BEGV_BYTE + : charpos); +} + +static void re_update_byteoffsets (void) +{ + gl_state.b_re_byte = RE_SYNTAX_TABLE_CHAR_TO_BYTE (gl_state.b_property); + eassert (gl_state.b_property + == RE_SYNTAX_TABLE_BYTE_TO_CHAR (gl_state.b_re_byte)); + /* `e_property` is often set to EOB+1 (or to some value + much further than `stop` in narrowed buffers). */ + gl_state.e_re_byte + = gl_state.e_property > gl_state.stop + ? 1 + RE_SYNTAX_TABLE_CHAR_TO_BYTE (gl_state.stop) + : RE_SYNTAX_TABLE_CHAR_TO_BYTE (gl_state.e_property); + eassert (gl_state.e_property > gl_state.stop + ? gl_state.e_property + >= 1 + RE_SYNTAX_TABLE_BYTE_TO_CHAR (gl_state.e_re_byte - 1) + : gl_state.e_property + == RE_SYNTAX_TABLE_BYTE_TO_CHAR (gl_state.e_re_byte)); +} + +/* The regexp-engine doesn't keep track of char positions, but instead + uses byteoffsets, so `syntax.c` uses `UPDATE_SYNTAX_TABLE_*` functions, + passing them `charpos`s whereas `regexp.c` uses `RE_UPDATE_SYNTAX_TABLE_*` + functions, passing them byteoffsets. */ + /* Make syntax table state (gl_state) good for CHARPOS, assuming it is currently good for a position before CHARPOS. */ @@ -178,6 +216,36 @@ UPDATE_SYNTAX_TABLE_FORWARD (ptrdiff_t charpos) update_syntax_table_forward (charpos, false, gl_state.object); } +INLINE void +RE_UPDATE_SYNTAX_TABLE_FORWARD (ptrdiff_t byteoffset) +{ /* Performs just-in-time syntax-propertization. */ + if (!parse_sexp_lookup_properties) + return; + eassert (gl_state.e_re_byte >= 0); /* gl_state.b_re_byte can be negative. */ + if (byteoffset >= gl_state.e_re_byte) + { + ptrdiff_t charpos = RE_SYNTAX_TABLE_BYTE_TO_CHAR (byteoffset); + eassert (charpos >= gl_state.e_property); + UPDATE_SYNTAX_TABLE_FORWARD (charpos); + re_update_byteoffsets (); + } +} + +INLINE void +RE_UPDATE_SYNTAX_TABLE_FORWARD_BEFORE (ptrdiff_t byteoffset) +{ /* Performs just-in-time syntax-propertization. */ + if (!parse_sexp_lookup_properties) + return; + eassert (gl_state.e_re_byte >= 0); /* gl_state.b_re_byte can be negative. */ + if (byteoffset > gl_state.e_re_byte) + { + ptrdiff_t charpos = RE_SYNTAX_TABLE_BYTE_TO_CHAR (byteoffset) - 1; + eassert (charpos >= gl_state.e_property); + UPDATE_SYNTAX_TABLE_FORWARD (charpos); + re_update_byteoffsets (); + } +} + /* Make syntax table state (gl_state) good for CHARPOS, assuming it is currently good for a position after CHARPOS. */ @@ -188,6 +256,36 @@ UPDATE_SYNTAX_TABLE_BACKWARD (ptrdiff_t charpos) update_syntax_table (charpos, -1, false, gl_state.object); } +INLINE void +RE_UPDATE_SYNTAX_TABLE_BACKWARD (ptrdiff_t byteoffset) +{ + if (!parse_sexp_lookup_properties) + return; + eassert (gl_state.e_re_byte >= 0); /* gl_state.b_re_byte can be negative. */ + if (byteoffset < gl_state.b_re_byte) + { + ptrdiff_t charpos = RE_SYNTAX_TABLE_BYTE_TO_CHAR (byteoffset); + eassert (charpos < gl_state.b_property); + UPDATE_SYNTAX_TABLE_FORWARD (charpos); + re_update_byteoffsets (); + } +} + +INLINE void +RE_UPDATE_SYNTAX_TABLE_BACKWARD_BEFORE (ptrdiff_t byteoffset) +{ + if (!parse_sexp_lookup_properties) + return; + eassert (gl_state.e_re_byte >= 0); /* gl_state.b_re_byte can be negative. */ + if (byteoffset <= gl_state.b_re_byte) + { + ptrdiff_t charpos = RE_SYNTAX_TABLE_BYTE_TO_CHAR (byteoffset); + eassert (charpos <= gl_state.b_property); + UPDATE_SYNTAX_TABLE_FORWARD (charpos - 1); + re_update_byteoffsets (); + } +} + /* Make syntax table good for CHARPOS. */ INLINE void @@ -197,6 +295,20 @@ UPDATE_SYNTAX_TABLE (ptrdiff_t charpos) UPDATE_SYNTAX_TABLE_FORWARD (charpos); } +INLINE void +RE_UPDATE_SYNTAX_TABLE (ptrdiff_t byteoffset) +{ + RE_UPDATE_SYNTAX_TABLE_BACKWARD (byteoffset); + RE_UPDATE_SYNTAX_TABLE_FORWARD (byteoffset); +} + +INLINE void +RE_UPDATE_SYNTAX_TABLE_BEFORE (ptrdiff_t byteoffset) +{ + RE_UPDATE_SYNTAX_TABLE_BACKWARD_BEFORE (byteoffset); + RE_UPDATE_SYNTAX_TABLE_FORWARD_BEFORE (byteoffset); +} + /* Set up the buffer-global syntax table. */ INLINE void