all messages for Emacs-related lists mirrored at yhetil.org
 help / color / mirror / code / Atom feed
From: Stefan Monnier via "Bug reports for GNU Emacs, the Swiss army knife of text editors" <bug-gnu-emacs@gnu.org>
To: "Mattias Engdegård" <mattias.engdegard@gmail.com>
Cc: martin rudalics <rudalics@gmx.at>, Eli Zaretskii <eliz@gnu.org>,
	65726@debbugs.gnu.org
Subject: bug#65726: 29.1.50; Crash in regexp engine
Date: Sat, 09 Sep 2023 11:55:28 -0400	[thread overview]
Message-ID: <jwvpm2r8htj.fsf-monnier+emacs@gnu.org> (raw)
In-Reply-To: <EE408460-BDB2-416A-A76E-F4A28887E7B0@gmail.com> ("Mattias Engdegård"'s message of "Wed, 6 Sep 2023 14:03:01 +0200")

>> Adding an `assert (p2 >= done)` confirms that this does happen,
>
> Can you give an example of such a regexp? I ran `make check` with your patch
> applied and checking enabled but got no assertion failure.

My patch didn't include the assertion :-)
The patch below does and hits an assertion failure during the dump:

    Loading format...
    Loading bindings...
    WOW1!
    
    Error: error ("WOW1!")
      mapbacktrace(#[1028 "\1\4\203\24\0\301\302!\210\300\4!\210\301\303!\210\202\35\0\301\304!\210\3\3B\262\1\211\2035\0\300\1@!\210\211A\211\262\2\2035\0\301\305!\210\202!\0\301\306!\207" [prin1 princ "  " "(" "  (" " " ")\n"] 7 "\n\n(fn EVALD FUNC ARGS FLAGS)"])
      debug-early-backtrace()
      debug-early(error (error "WOW1!"))
      string-match("\\`[^ ]+\\( [^ ]+\\)*\\'" "h r" nil t)
      key-valid-p("h r")
      keymap--check("h r")
      keymap-set((keymap (27 keymap (119 . eww-search-words)) (111 . occur)) "h r" highlight-regexp)
      define-keymap("o" occur "M-w" eww-search-words "h r" highlight-regexp "h p" highlight-phrase "h l" highlight-lines-matching-regexp "h ." highlight-symbol-at-point "h u" unhighlight-regexp "h f" hi-lock-find-patterns "h w" hi-lock-write-interactive-patterns)
      (defvar search-map (define-keymap "o" 'occur "M-w" 'eww-search-words "h r" 'highlight-regexp "h p" 'highlight-phrase "h l" 'highlight-lines-matching-regexp "h ." 'highlight-symbol-at-point "h u" 'unhighlight-regexp "h f" 'hi-lock-find-patterns "h w" 'hi-lock-write-interactive-patterns) ("/home/monnier/src/emacs/main/lisp/bindings.elc" . 40677))
      load("bindings")
      load("loadup.el")


-- Stefan


diff --git a/src/regex-emacs.c b/src/regex-emacs.c
index 394ba22e9b0..b47cdfbfef8 100644
--- a/src/regex-emacs.c
+++ b/src/regex-emacs.c
@@ -3643,19 +3643,125 @@ execute_charset (re_char **pp, int c, int corig, bool unibyte,
   return not;
 }
 
+/* Case where `p2` points to an `exactn`.  */
+static bool
+mutually_exclusive_exactn (struct re_pattern_buffer *bufp, re_char *p1,
+		           re_char *p2)
+{
+  bool multibyte = RE_MULTIBYTE_P (bufp);
+  int c
+    = (re_opcode_t) *p2 == endline ? '\n'
+      : RE_STRING_CHAR (p2 + 2, multibyte);
+
+  if ((re_opcode_t) *p1 == exactn)
+    {
+      if (c != RE_STRING_CHAR (p1 + 2, multibyte))
+	{
+	  DEBUG_PRINT ("  '%c' != '%c' => fast loop.\n", c, p1[2]);
+	  return true;
+	}
+    }
+
+  else if ((re_opcode_t) *p1 == charset
+	   || (re_opcode_t) *p1 == charset_not)
+    {
+      if (!execute_charset (&p1, c, c, !multibyte || ASCII_CHAR_P (c),
+                            Qnil))
+	{
+	  DEBUG_PRINT ("	 No match => fast loop.\n");
+	  return true;
+	}
+    }
+  else if ((re_opcode_t) *p1 == anychar
+	   && c == '\n')
+    {
+      DEBUG_PRINT ("   . != \\n => fast loop.\n");
+      return true;
+    }
+  return false;
+}
+
+/* Case where `p2` points to an `charset`.  */
+static bool
+mutually_exclusive_charset (struct re_pattern_buffer *bufp, re_char *p1,
+		            re_char *p2)
+{
+  /* It is hard to list up all the character in charset
+     P2 if it includes multibyte character.  Give up in
+     such case.  */
+  if (!RE_MULTIBYTE_P (bufp) || !CHARSET_RANGE_TABLE_EXISTS_P (p2))
+    {
+      /* Now, we are sure that P2 has no range table.
+	     So, for the size of bitmap in P2, 'p2[1]' is
+	     enough.  But P1 may have range table, so the
+	     size of bitmap table of P1 is extracted by
+	     using macro 'CHARSET_BITMAP_SIZE'.
+
+	     In a multibyte case, we know that all the character
+	     listed in P2 is ASCII.  In a unibyte case, P1 has only a
+	     bitmap table.  So, in both cases, it is enough to test
+	     only the bitmap table of P1.  */
+
+      if ((re_opcode_t) *p1 == charset)
+	{
+	  int idx;
+	  /* We win if the charset inside the loop
+		 has no overlap with the one after the loop.  */
+	  for (idx = 0;
+		(idx < (int) p2[1]
+		 && idx < CHARSET_BITMAP_SIZE (p1));
+		idx++)
+	    if ((p2[2 + idx] & p1[2 + idx]) != 0)
+	      break;
+
+	  if (idx == p2[1]
+	      || idx == CHARSET_BITMAP_SIZE (p1))
+	    {
+	      DEBUG_PRINT ("	 No match => fast loop.\n");
+	      return true;
+	    }
+	}
+      else if ((re_opcode_t) *p1 == charset_not)
+	{
+	  int idx;
+	  /* We win if the charset_not inside the loop lists
+		 every character listed in the charset after.  */
+	  for (idx = 0; idx < (int) p2[1]; idx++)
+	    if (! (p2[2 + idx] == 0
+		   || (idx < CHARSET_BITMAP_SIZE (p1)
+		       && ((p2[2 + idx] & ~ p1[2 + idx]) == 0))))
+	      break;
+
+	  if (idx == p2[1])
+	    {
+	      DEBUG_PRINT ("	 No match => fast loop.\n");
+	      return true;
+	    }
+	}
+    }
+  return false;
+}
+
 /* True if "p1 matches something" implies "p2 fails".  */
+/* Avoiding inf-loops:
+   We're trying to follow all paths reachable from `p2`, but since some
+   loops can match the empty string, this can loop back to `p2`.
+   To avoid inf-looping, we rely on a lexicographic ordering using `done`
+   and `p2`: at every recursion, either `done` is larger, or it
+   is unchanged and `p2` is larger.  */
 static bool
-mutually_exclusive_p (struct re_pattern_buffer *bufp, re_char *p1,
-		      re_char *p2)
+mutually_exclusive_aux (struct re_pattern_buffer *bufp, re_char *p1,
+		        re_char *p2, re_char *done)
 {
   re_opcode_t op2;
-  bool multibyte = RE_MULTIBYTE_P (bufp);
   unsigned char *pend = bufp->buffer + bufp->used;
   re_char *p2_orig = p2;
 
   eassert (p1 >= bufp->buffer && p1 < pend
 	   && p2 >= bufp->buffer && p2 <= pend);
 
+  eassert (p2 >= done);
+
   /* Skip over open/close-group commands.
      If what follows this loop is a ...+ construct,
      look at what begins its body, since we will have to
@@ -3684,98 +3790,14 @@ mutually_exclusive_p (struct re_pattern_buffer *bufp, re_char *p1,
 
     case endline:
     case exactn:
-      {
-	int c
-	  = (re_opcode_t) *p2 == endline ? '\n'
-	  : RE_STRING_CHAR (p2 + 2, multibyte);
-
-	if ((re_opcode_t) *p1 == exactn)
-	  {
-	    if (c != RE_STRING_CHAR (p1 + 2, multibyte))
-	      {
-		DEBUG_PRINT ("  '%c' != '%c' => fast loop.\n", c, p1[2]);
-		return true;
-	      }
-	  }
-
-	else if ((re_opcode_t) *p1 == charset
-		 || (re_opcode_t) *p1 == charset_not)
-	  {
-	    if (!execute_charset (&p1, c, c, !multibyte || ASCII_CHAR_P (c),
-                                  Qnil))
-	      {
-		DEBUG_PRINT ("	 No match => fast loop.\n");
-		return true;
-	      }
-	  }
-	else if ((re_opcode_t) *p1 == anychar
-		 && c == '\n')
-	  {
-	    DEBUG_PRINT ("   . != \\n => fast loop.\n");
-	    return true;
-	  }
-      }
-      break;
+      return mutually_exclusive_exactn (bufp, p1, p2);
 
     case charset:
       {
 	if ((re_opcode_t) *p1 == exactn)
-	  /* Reuse the code above.  */
-	  return mutually_exclusive_p (bufp, p2, p1);
-
-      /* It is hard to list up all the character in charset
-	 P2 if it includes multibyte character.  Give up in
-	 such case.  */
-      else if (!multibyte || !CHARSET_RANGE_TABLE_EXISTS_P (p2))
-	{
-	  /* Now, we are sure that P2 has no range table.
-	     So, for the size of bitmap in P2, 'p2[1]' is
-	     enough.  But P1 may have range table, so the
-	     size of bitmap table of P1 is extracted by
-	     using macro 'CHARSET_BITMAP_SIZE'.
-
-	     In a multibyte case, we know that all the character
-	     listed in P2 is ASCII.  In a unibyte case, P1 has only a
-	     bitmap table.  So, in both cases, it is enough to test
-	     only the bitmap table of P1.  */
-
-	  if ((re_opcode_t) *p1 == charset)
-	    {
-	      int idx;
-	      /* We win if the charset inside the loop
-		 has no overlap with the one after the loop.  */
-	      for (idx = 0;
-		   (idx < (int) p2[1]
-		    && idx < CHARSET_BITMAP_SIZE (p1));
-		   idx++)
-		if ((p2[2 + idx] & p1[2 + idx]) != 0)
-		  break;
-
-	      if (idx == p2[1]
-		  || idx == CHARSET_BITMAP_SIZE (p1))
-		{
-		  DEBUG_PRINT ("	 No match => fast loop.\n");
-		  return true;
-		}
-	    }
-	  else if ((re_opcode_t) *p1 == charset_not)
-	    {
-	      int idx;
-	      /* We win if the charset_not inside the loop lists
-		 every character listed in the charset after.  */
-	      for (idx = 0; idx < (int) p2[1]; idx++)
-		if (! (p2[2 + idx] == 0
-		       || (idx < CHARSET_BITMAP_SIZE (p1)
-			   && ((p2[2 + idx] & ~ p1[2 + idx]) == 0))))
-		  break;
-
-	      if (idx == p2[1])
-		{
-		  DEBUG_PRINT ("	 No match => fast loop.\n");
-		  return true;
-		}
-	      }
-	  }
+	  return mutually_exclusive_exactn (bufp, p2, p1);
+	else
+	  return mutually_exclusive_charset (bufp, p1, p2);
       }
       break;
 
@@ -3783,9 +3805,9 @@ mutually_exclusive_p (struct re_pattern_buffer *bufp, re_char *p1,
       switch (*p1)
 	{
 	case exactn:
+	  return mutually_exclusive_exactn (bufp, p2, p1);
 	case charset:
-	  /* Reuse the code above.  */
-	  return mutually_exclusive_p (bufp, p2, p1);
+	  return mutually_exclusive_charset (bufp, p2, p1);
 	case charset_not:
 	  /* When we have two charset_not, it's very unlikely that
 	     they don't overlap.  The union of the two sets of excluded
@@ -3830,11 +3852,25 @@ mutually_exclusive_p (struct re_pattern_buffer *bufp, re_char *p1,
         int mcnt;
 	p2++;
 	EXTRACT_NUMBER_AND_INCR (mcnt, p2);
-	/* Don't just test `mcnt > 0` because non-greedy loops have
-	   their test at the end with an unconditional jump at the start.  */
-	if (p2 > p2_orig && mcnt >= 0) /* Ensure forward progress.  */
-	  return (mutually_exclusive_p (bufp, p1, p2)
-		  && mutually_exclusive_p (bufp, p1, p2 + mcnt));
+	re_char *p2_other = p2 + mcnt;
+	/* Presumably, any position up to `done` should be a position we
+	   already considered elsewhere (because our state machines are not
+	   arbitrary: they consists of syntaxically nested loops with limited
+	   control flow), so we can consider those back jumps as mutually
+	   exclusive here.  */
+	if (p2 < done)
+	  error ("WOW1!");
+	if (p2_other < done)
+	  error ("WOW2!");
+	return ((p2 < done ? true
+		 : p2 == done ? true
+		 : mutually_exclusive_aux (bufp, p1, p2,
+		                           p2 > p2_orig ? done : p2))
+		&& (p2_other < done ? true
+		    : p2_other == done ? true
+		    : mutually_exclusive_aux (bufp, p1, p2_other,
+		                              p2_other > p2_orig
+		                              ? done : p2_other)));
 	break;
       }
 
@@ -3846,6 +3882,12 @@ mutually_exclusive_p (struct re_pattern_buffer *bufp, re_char *p1,
   return false;
 }
 
+static bool
+mutually_exclusive_p (struct re_pattern_buffer *bufp, re_char *p1,
+		      re_char *p2)
+{
+  return mutually_exclusive_aux (bufp, p1, p2, min (p2, p1));
+}
 \f
 /* Matching routines.  */
 






  reply	other threads:[~2023-09-09 15:55 UTC|newest]

Thread overview: 36+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-09-04  7:46 bug#65726: 29.1.50; Crash in regexp engine martin rudalics
2023-09-04  8:44 ` Mattias Engdegård
2023-09-04 12:12   ` Eli Zaretskii
2023-09-04 13:18     ` Mattias Engdegård
2023-09-04 13:26       ` Mattias Engdegård
2023-09-04 13:28       ` Eli Zaretskii
2023-09-04 15:47         ` Mattias Engdegård
2023-09-05 12:23           ` Mattias Engdegård
2023-09-05 13:08             ` Stefan Monnier via Bug reports for GNU Emacs, the Swiss army knife of text editors
2023-09-05 13:50               ` Mattias Engdegård
2023-09-05 15:33                 ` Stefan Monnier via Bug reports for GNU Emacs, the Swiss army knife of text editors
2023-09-06 12:03                   ` Mattias Engdegård
2023-09-09 15:55                     ` Stefan Monnier via Bug reports for GNU Emacs, the Swiss army knife of text editors [this message]
2023-09-09 16:34                       ` Mattias Engdegård
2023-09-14 14:41                         ` Stefan Monnier via Bug reports for GNU Emacs, the Swiss army knife of text editors
2023-09-15 20:03                           ` Mattias Engdegård
2023-09-15 22:20                             ` Stefan Monnier via Bug reports for GNU Emacs, the Swiss army knife of text editors
2023-09-16  3:45                           ` Stefan Monnier via Bug reports for GNU Emacs, the Swiss army knife of text editors
2023-09-16 10:49                             ` Mattias Engdegård
2023-09-16 15:48                               ` Stefan Monnier via Bug reports for GNU Emacs, the Swiss army knife of text editors
2023-09-18  2:14                                 ` Stefan Monnier via Bug reports for GNU Emacs, the Swiss army knife of text editors
2023-09-18  3:59                                   ` Stefan Monnier via Bug reports for GNU Emacs, the Swiss army knife of text editors
2023-09-18 12:32                                     ` Stefan Monnier via Bug reports for GNU Emacs, the Swiss army knife of text editors
2023-09-21 17:23                                       ` Mattias Engdegård
2023-09-21 18:08                                         ` Stefan Monnier via Bug reports for GNU Emacs, the Swiss army knife of text editors
2023-09-23 11:56                                           ` Mattias Engdegård
2023-09-04 14:32 ` Stefan Monnier via Bug reports for GNU Emacs, the Swiss army knife of text editors
2023-09-04 15:57   ` Eli Zaretskii
2023-09-04 17:12     ` Stefan Monnier via Bug reports for GNU Emacs, the Swiss army knife of text editors
2023-09-10  7:50       ` Stefan Kangas
2023-09-10  7:55         ` Eli Zaretskii
2023-09-10 23:09           ` Stefan Monnier via Bug reports for GNU Emacs, the Swiss army knife of text editors
2023-09-11 14:46             ` Stefan Kangas
2023-09-05  7:14   ` martin rudalics
2023-09-11  8:10 ` Mattias Engdegård
2023-09-11 13:41   ` Stefan Monnier via Bug reports for GNU Emacs, the Swiss army knife of text editors

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=jwvpm2r8htj.fsf-monnier+emacs@gnu.org \
    --to=bug-gnu-emacs@gnu.org \
    --cc=65726@debbugs.gnu.org \
    --cc=eliz@gnu.org \
    --cc=mattias.engdegard@gmail.com \
    --cc=monnier@iro.umontreal.ca \
    --cc=rudalics@gmx.at \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this external index

	https://git.savannah.gnu.org/cgit/emacs.git
	https://git.savannah.gnu.org/cgit/emacs/org-mode.git

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.