From: Stefan Monnier via "Bug reports for GNU Emacs, the Swiss army knife of text editors" <bug-gnu-emacs@gnu.org>
To: "Mattias Engdegård" <mattias.engdegard@gmail.com>
Cc: martin rudalics <rudalics@gmx.at>, Eli Zaretskii <eliz@gnu.org>,
65726@debbugs.gnu.org
Subject: bug#65726: 29.1.50; Crash in regexp engine
Date: Sat, 09 Sep 2023 11:55:28 -0400 [thread overview]
Message-ID: <jwvpm2r8htj.fsf-monnier+emacs@gnu.org> (raw)
In-Reply-To: <EE408460-BDB2-416A-A76E-F4A28887E7B0@gmail.com> ("Mattias Engdegård"'s message of "Wed, 6 Sep 2023 14:03:01 +0200")
>> Adding an `assert (p2 >= done)` confirms that this does happen,
>
> Can you give an example of such a regexp? I ran `make check` with your patch
> applied and checking enabled but got no assertion failure.
My patch didn't include the assertion :-)
The patch below does and hits an assertion failure during the dump:
Loading format...
Loading bindings...
WOW1!
Error: error ("WOW1!")
mapbacktrace(#[1028 "\1\4\203\24\0\301\302!\210\300\4!\210\301\303!\210\202\35\0\301\304!\210\3\3B\262\1\211\2035\0\300\1@!\210\211A\211\262\2\2035\0\301\305!\210\202!\0\301\306!\207" [prin1 princ " " "(" " (" " " ")\n"] 7 "\n\n(fn EVALD FUNC ARGS FLAGS)"])
debug-early-backtrace()
debug-early(error (error "WOW1!"))
string-match("\\`[^ ]+\\( [^ ]+\\)*\\'" "h r" nil t)
key-valid-p("h r")
keymap--check("h r")
keymap-set((keymap (27 keymap (119 . eww-search-words)) (111 . occur)) "h r" highlight-regexp)
define-keymap("o" occur "M-w" eww-search-words "h r" highlight-regexp "h p" highlight-phrase "h l" highlight-lines-matching-regexp "h ." highlight-symbol-at-point "h u" unhighlight-regexp "h f" hi-lock-find-patterns "h w" hi-lock-write-interactive-patterns)
(defvar search-map (define-keymap "o" 'occur "M-w" 'eww-search-words "h r" 'highlight-regexp "h p" 'highlight-phrase "h l" 'highlight-lines-matching-regexp "h ." 'highlight-symbol-at-point "h u" 'unhighlight-regexp "h f" 'hi-lock-find-patterns "h w" 'hi-lock-write-interactive-patterns) ("/home/monnier/src/emacs/main/lisp/bindings.elc" . 40677))
load("bindings")
load("loadup.el")
-- Stefan
diff --git a/src/regex-emacs.c b/src/regex-emacs.c
index 394ba22e9b0..b47cdfbfef8 100644
--- a/src/regex-emacs.c
+++ b/src/regex-emacs.c
@@ -3643,19 +3643,125 @@ execute_charset (re_char **pp, int c, int corig, bool unibyte,
return not;
}
+/* Case where `p2` points to an `exactn`. */
+static bool
+mutually_exclusive_exactn (struct re_pattern_buffer *bufp, re_char *p1,
+ re_char *p2)
+{
+ bool multibyte = RE_MULTIBYTE_P (bufp);
+ int c
+ = (re_opcode_t) *p2 == endline ? '\n'
+ : RE_STRING_CHAR (p2 + 2, multibyte);
+
+ if ((re_opcode_t) *p1 == exactn)
+ {
+ if (c != RE_STRING_CHAR (p1 + 2, multibyte))
+ {
+ DEBUG_PRINT (" '%c' != '%c' => fast loop.\n", c, p1[2]);
+ return true;
+ }
+ }
+
+ else if ((re_opcode_t) *p1 == charset
+ || (re_opcode_t) *p1 == charset_not)
+ {
+ if (!execute_charset (&p1, c, c, !multibyte || ASCII_CHAR_P (c),
+ Qnil))
+ {
+ DEBUG_PRINT (" No match => fast loop.\n");
+ return true;
+ }
+ }
+ else if ((re_opcode_t) *p1 == anychar
+ && c == '\n')
+ {
+ DEBUG_PRINT (" . != \\n => fast loop.\n");
+ return true;
+ }
+ return false;
+}
+
+/* Case where `p2` points to an `charset`. */
+static bool
+mutually_exclusive_charset (struct re_pattern_buffer *bufp, re_char *p1,
+ re_char *p2)
+{
+ /* It is hard to list up all the character in charset
+ P2 if it includes multibyte character. Give up in
+ such case. */
+ if (!RE_MULTIBYTE_P (bufp) || !CHARSET_RANGE_TABLE_EXISTS_P (p2))
+ {
+ /* Now, we are sure that P2 has no range table.
+ So, for the size of bitmap in P2, 'p2[1]' is
+ enough. But P1 may have range table, so the
+ size of bitmap table of P1 is extracted by
+ using macro 'CHARSET_BITMAP_SIZE'.
+
+ In a multibyte case, we know that all the character
+ listed in P2 is ASCII. In a unibyte case, P1 has only a
+ bitmap table. So, in both cases, it is enough to test
+ only the bitmap table of P1. */
+
+ if ((re_opcode_t) *p1 == charset)
+ {
+ int idx;
+ /* We win if the charset inside the loop
+ has no overlap with the one after the loop. */
+ for (idx = 0;
+ (idx < (int) p2[1]
+ && idx < CHARSET_BITMAP_SIZE (p1));
+ idx++)
+ if ((p2[2 + idx] & p1[2 + idx]) != 0)
+ break;
+
+ if (idx == p2[1]
+ || idx == CHARSET_BITMAP_SIZE (p1))
+ {
+ DEBUG_PRINT (" No match => fast loop.\n");
+ return true;
+ }
+ }
+ else if ((re_opcode_t) *p1 == charset_not)
+ {
+ int idx;
+ /* We win if the charset_not inside the loop lists
+ every character listed in the charset after. */
+ for (idx = 0; idx < (int) p2[1]; idx++)
+ if (! (p2[2 + idx] == 0
+ || (idx < CHARSET_BITMAP_SIZE (p1)
+ && ((p2[2 + idx] & ~ p1[2 + idx]) == 0))))
+ break;
+
+ if (idx == p2[1])
+ {
+ DEBUG_PRINT (" No match => fast loop.\n");
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
/* True if "p1 matches something" implies "p2 fails". */
+/* Avoiding inf-loops:
+ We're trying to follow all paths reachable from `p2`, but since some
+ loops can match the empty string, this can loop back to `p2`.
+ To avoid inf-looping, we rely on a lexicographic ordering using `done`
+ and `p2`: at every recursion, either `done` is larger, or it
+ is unchanged and `p2` is larger. */
static bool
-mutually_exclusive_p (struct re_pattern_buffer *bufp, re_char *p1,
- re_char *p2)
+mutually_exclusive_aux (struct re_pattern_buffer *bufp, re_char *p1,
+ re_char *p2, re_char *done)
{
re_opcode_t op2;
- bool multibyte = RE_MULTIBYTE_P (bufp);
unsigned char *pend = bufp->buffer + bufp->used;
re_char *p2_orig = p2;
eassert (p1 >= bufp->buffer && p1 < pend
&& p2 >= bufp->buffer && p2 <= pend);
+ eassert (p2 >= done);
+
/* Skip over open/close-group commands.
If what follows this loop is a ...+ construct,
look at what begins its body, since we will have to
@@ -3684,98 +3790,14 @@ mutually_exclusive_p (struct re_pattern_buffer *bufp, re_char *p1,
case endline:
case exactn:
- {
- int c
- = (re_opcode_t) *p2 == endline ? '\n'
- : RE_STRING_CHAR (p2 + 2, multibyte);
-
- if ((re_opcode_t) *p1 == exactn)
- {
- if (c != RE_STRING_CHAR (p1 + 2, multibyte))
- {
- DEBUG_PRINT (" '%c' != '%c' => fast loop.\n", c, p1[2]);
- return true;
- }
- }
-
- else if ((re_opcode_t) *p1 == charset
- || (re_opcode_t) *p1 == charset_not)
- {
- if (!execute_charset (&p1, c, c, !multibyte || ASCII_CHAR_P (c),
- Qnil))
- {
- DEBUG_PRINT (" No match => fast loop.\n");
- return true;
- }
- }
- else if ((re_opcode_t) *p1 == anychar
- && c == '\n')
- {
- DEBUG_PRINT (" . != \\n => fast loop.\n");
- return true;
- }
- }
- break;
+ return mutually_exclusive_exactn (bufp, p1, p2);
case charset:
{
if ((re_opcode_t) *p1 == exactn)
- /* Reuse the code above. */
- return mutually_exclusive_p (bufp, p2, p1);
-
- /* It is hard to list up all the character in charset
- P2 if it includes multibyte character. Give up in
- such case. */
- else if (!multibyte || !CHARSET_RANGE_TABLE_EXISTS_P (p2))
- {
- /* Now, we are sure that P2 has no range table.
- So, for the size of bitmap in P2, 'p2[1]' is
- enough. But P1 may have range table, so the
- size of bitmap table of P1 is extracted by
- using macro 'CHARSET_BITMAP_SIZE'.
-
- In a multibyte case, we know that all the character
- listed in P2 is ASCII. In a unibyte case, P1 has only a
- bitmap table. So, in both cases, it is enough to test
- only the bitmap table of P1. */
-
- if ((re_opcode_t) *p1 == charset)
- {
- int idx;
- /* We win if the charset inside the loop
- has no overlap with the one after the loop. */
- for (idx = 0;
- (idx < (int) p2[1]
- && idx < CHARSET_BITMAP_SIZE (p1));
- idx++)
- if ((p2[2 + idx] & p1[2 + idx]) != 0)
- break;
-
- if (idx == p2[1]
- || idx == CHARSET_BITMAP_SIZE (p1))
- {
- DEBUG_PRINT (" No match => fast loop.\n");
- return true;
- }
- }
- else if ((re_opcode_t) *p1 == charset_not)
- {
- int idx;
- /* We win if the charset_not inside the loop lists
- every character listed in the charset after. */
- for (idx = 0; idx < (int) p2[1]; idx++)
- if (! (p2[2 + idx] == 0
- || (idx < CHARSET_BITMAP_SIZE (p1)
- && ((p2[2 + idx] & ~ p1[2 + idx]) == 0))))
- break;
-
- if (idx == p2[1])
- {
- DEBUG_PRINT (" No match => fast loop.\n");
- return true;
- }
- }
- }
+ return mutually_exclusive_exactn (bufp, p2, p1);
+ else
+ return mutually_exclusive_charset (bufp, p1, p2);
}
break;
@@ -3783,9 +3805,9 @@ mutually_exclusive_p (struct re_pattern_buffer *bufp, re_char *p1,
switch (*p1)
{
case exactn:
+ return mutually_exclusive_exactn (bufp, p2, p1);
case charset:
- /* Reuse the code above. */
- return mutually_exclusive_p (bufp, p2, p1);
+ return mutually_exclusive_charset (bufp, p2, p1);
case charset_not:
/* When we have two charset_not, it's very unlikely that
they don't overlap. The union of the two sets of excluded
@@ -3830,11 +3852,25 @@ mutually_exclusive_p (struct re_pattern_buffer *bufp, re_char *p1,
int mcnt;
p2++;
EXTRACT_NUMBER_AND_INCR (mcnt, p2);
- /* Don't just test `mcnt > 0` because non-greedy loops have
- their test at the end with an unconditional jump at the start. */
- if (p2 > p2_orig && mcnt >= 0) /* Ensure forward progress. */
- return (mutually_exclusive_p (bufp, p1, p2)
- && mutually_exclusive_p (bufp, p1, p2 + mcnt));
+ re_char *p2_other = p2 + mcnt;
+ /* Presumably, any position up to `done` should be a position we
+ already considered elsewhere (because our state machines are not
+ arbitrary: they consists of syntaxically nested loops with limited
+ control flow), so we can consider those back jumps as mutually
+ exclusive here. */
+ if (p2 < done)
+ error ("WOW1!");
+ if (p2_other < done)
+ error ("WOW2!");
+ return ((p2 < done ? true
+ : p2 == done ? true
+ : mutually_exclusive_aux (bufp, p1, p2,
+ p2 > p2_orig ? done : p2))
+ && (p2_other < done ? true
+ : p2_other == done ? true
+ : mutually_exclusive_aux (bufp, p1, p2_other,
+ p2_other > p2_orig
+ ? done : p2_other)));
break;
}
@@ -3846,6 +3882,12 @@ mutually_exclusive_p (struct re_pattern_buffer *bufp, re_char *p1,
return false;
}
+static bool
+mutually_exclusive_p (struct re_pattern_buffer *bufp, re_char *p1,
+ re_char *p2)
+{
+ return mutually_exclusive_aux (bufp, p1, p2, min (p2, p1));
+}
\f
/* Matching routines. */
next prev parent reply other threads:[~2023-09-09 15:55 UTC|newest]
Thread overview: 36+ messages / expand[flat|nested] mbox.gz Atom feed top
2023-09-04 7:46 bug#65726: 29.1.50; Crash in regexp engine martin rudalics
2023-09-04 8:44 ` Mattias Engdegård
2023-09-04 12:12 ` Eli Zaretskii
2023-09-04 13:18 ` Mattias Engdegård
2023-09-04 13:26 ` Mattias Engdegård
2023-09-04 13:28 ` Eli Zaretskii
2023-09-04 15:47 ` Mattias Engdegård
2023-09-05 12:23 ` Mattias Engdegård
2023-09-05 13:08 ` Stefan Monnier via Bug reports for GNU Emacs, the Swiss army knife of text editors
2023-09-05 13:50 ` Mattias Engdegård
2023-09-05 15:33 ` Stefan Monnier via Bug reports for GNU Emacs, the Swiss army knife of text editors
2023-09-06 12:03 ` Mattias Engdegård
2023-09-09 15:55 ` Stefan Monnier via Bug reports for GNU Emacs, the Swiss army knife of text editors [this message]
2023-09-09 16:34 ` Mattias Engdegård
2023-09-14 14:41 ` Stefan Monnier via Bug reports for GNU Emacs, the Swiss army knife of text editors
2023-09-15 20:03 ` Mattias Engdegård
2023-09-15 22:20 ` Stefan Monnier via Bug reports for GNU Emacs, the Swiss army knife of text editors
2023-09-16 3:45 ` Stefan Monnier via Bug reports for GNU Emacs, the Swiss army knife of text editors
2023-09-16 10:49 ` Mattias Engdegård
2023-09-16 15:48 ` Stefan Monnier via Bug reports for GNU Emacs, the Swiss army knife of text editors
2023-09-18 2:14 ` Stefan Monnier via Bug reports for GNU Emacs, the Swiss army knife of text editors
2023-09-18 3:59 ` Stefan Monnier via Bug reports for GNU Emacs, the Swiss army knife of text editors
2023-09-18 12:32 ` Stefan Monnier via Bug reports for GNU Emacs, the Swiss army knife of text editors
2023-09-21 17:23 ` Mattias Engdegård
2023-09-21 18:08 ` Stefan Monnier via Bug reports for GNU Emacs, the Swiss army knife of text editors
2023-09-23 11:56 ` Mattias Engdegård
2023-09-04 14:32 ` Stefan Monnier via Bug reports for GNU Emacs, the Swiss army knife of text editors
2023-09-04 15:57 ` Eli Zaretskii
2023-09-04 17:12 ` Stefan Monnier via Bug reports for GNU Emacs, the Swiss army knife of text editors
2023-09-10 7:50 ` Stefan Kangas
2023-09-10 7:55 ` Eli Zaretskii
2023-09-10 23:09 ` Stefan Monnier via Bug reports for GNU Emacs, the Swiss army knife of text editors
2023-09-11 14:46 ` Stefan Kangas
2023-09-05 7:14 ` martin rudalics
2023-09-11 8:10 ` Mattias Engdegård
2023-09-11 13:41 ` Stefan Monnier via Bug reports for GNU Emacs, the Swiss army knife of text editors
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=jwvpm2r8htj.fsf-monnier+emacs@gnu.org \
--to=bug-gnu-emacs@gnu.org \
--cc=65726@debbugs.gnu.org \
--cc=eliz@gnu.org \
--cc=mattias.engdegard@gmail.com \
--cc=monnier@iro.umontreal.ca \
--cc=rudalics@gmx.at \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this external index
https://git.savannah.gnu.org/cgit/emacs.git
https://git.savannah.gnu.org/cgit/emacs/org-mode.git
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.