all messages for Emacs-related lists mirrored at yhetil.org
 help / color / mirror / code / Atom feed
* Regexp matching errors
@ 2006-09-22 17:34 Stefan Monnier
  0 siblings, 0 replies; only message in thread
From: Stefan Monnier @ 2006-09-22 17:34 UTC (permalink / raw)


The handling of the fastmap optimization in regex.c had some bugs w.r.t.
eight-bit-* chars in multibyte buffers/strings when case-fold is not in use:

   src/emacs --batch -Q --eval \
             '(let ((case-fold-search nil))
                (message "%s" (list (string-match "\xa1\\|a" "éf\x81g")
                                    (string-match "\x81\\|a" "éf\x81g")
                                    (string-match "[\xa1]" "éf\x81g")
                                    (string-match "[\x81]" "éf\x81g"))))'

returned

   (2 nil 2 nil)

I've installed the patch below to hopefully fix them.
I believe this patch to be perfectly safe, but you never know, so if you
notice anything fishy about regexp-matching, please tell me.


        Stefan



2006-09-22  Stefan Monnier  <monnier@iro.umontreal.ca>

	* regex.c (analyse_first): For eight-bit-control chars, mark both the
	char's value and its leading byte in the fastmap.
	(re_search_2): When fast-scanning without translation, be careful to
	check that we only match the leading byte of a multibyte char.

	* charset.h (PREV_CHAR_BOUNDARY): Make it work from within a char's
	byte sequence.
	(AT_CHAR_BOUNDARY_P): New macro.

Index: src/charset.h
===================================================================
RCS file: /sources/emacs/emacs/src/charset.h,v
retrieving revision 1.83
diff -u -r1.83 charset.h
--- src/charset.h	29 May 2006 06:19:09 -0000	1.83
+++ src/charset.h	22 Sep 2006 17:29:43 -0000
@@ -658,22 +658,34 @@
   } while (0)
 
 
-/* If P is after LIMIT, advance P to the previous character boundary.
-   It assumes that P is already at a character boundary of the sane
-   mulitbyte form whose beginning address is LIMIT.  */
+/* If P is after LIMIT, advance P to the previous character boundary.  */
 
 #define PREV_CHAR_BOUNDARY(p, limit)					\
   do {									\
     if ((p) > (limit))							\
       {									\
 	const unsigned char *p0 = (p);					\
+	const unsigned char *p_limit = max (limit, p0 - MAX_MULTIBYTE_LENGTH);\
 	do {								\
 	  p0--;								\
-	} while (p0 >= limit && ! CHAR_HEAD_P (*p0));			\
-	(p) = (BYTES_BY_CHAR_HEAD (*p0) == (p) - p0) ? p0 : (p) - 1;	\
+	} while (p0 >= p_limit && ! CHAR_HEAD_P (*p0));			\
+	/* If BBCH(*p0) > p-p0, it means we were not on a boundary.  */	\
+	(p) = (BYTES_BY_CHAR_HEAD (*p0) >= (p) - p0) ? p0 : (p) - 1;	\
       }									\
   } while (0)
 
+#define AT_CHAR_BOUNDARY_P(result, p, limit)	\
+  do {						\
+    if (CHAR_HEAD_P (*(p)) || (p) <= limit)	\
+      /* Optimization for the common case. */	\
+      (result) = 1;				\
+    else					\
+      {						\
+	const unsigned char *p_aux = (p)+1;	\
+	PREV_CHAR_BOUNDARY (p_aux, limit);	\
+	(result) = (p_aux == (p));		\
+      }						\
+} while (0)
 
 #ifdef emacs
 
Index: src/regex.c
===================================================================
RCS file: /sources/emacs/emacs/src/regex.c,v
retrieving revision 1.212
diff -u -r1.212 regex.c
--- src/regex.c	16 Sep 2006 15:28:47 -0000	1.212
+++ src/regex.c	22 Sep 2006 17:29:43 -0000
@@ -3877,11 +3877,13 @@
 	  if (fastmap)
 	    {
 	      int c = RE_STRING_CHAR (p + 1, pend - p);
-
+	      /* When fast-scanning, the fastmap can be indexed either with
+		 a char (smaller than 256) or with the first byte of
+		 a char's byte sequence.  So we have to conservatively add
+		 both to the table.  */
 	      if (SINGLE_BYTE_CHAR_P (c))
 		fastmap[c] = 1;
-	      else
-		fastmap[p[1]] = 1;
+	      fastmap[p[1]] = 1;
 	    }
 	  break;
 
@@ -3899,6 +3901,10 @@
 	     So any that are not listed in the charset
 	     are possible matches, even in multibyte buffers.  */
 	  if (!fastmap) break;
+	  /* We don't need to mark LEADING_CODE_8_BIT_CONTROL specially
+	     because it will automatically be set when needed by virtue of
+	     being larger than the highest char of its charset (0xbf) but
+	     smaller than (1<<BYTEWIDTH).  */
 	  for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
 	       j < (1 << BYTEWIDTH); j++)
 	    fastmap[j] = 1;
@@ -3909,7 +3915,13 @@
 	  for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++;
 	       j >= 0; j--)
 	    if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
-	      fastmap[j] = 1;
+	      {
+		fastmap[j] = 1;
+#ifdef emacs
+		if (j >= 0x80 && j < 0xa0)
+		  fastmap[LEADING_CODE_8_BIT_CONTROL] = 1;
+#endif
+	      }
 
 	  if ((not && multibyte)
 	      /* Any character set can possibly contain a character
@@ -4352,11 +4364,33 @@
 		    }
 		}
 	      else
-		while (range > lim && !fastmap[*d])
+		do
 		  {
-		    d++;
-		    range--;
-		  }
+		    re_char *d_start = d;
+		    while (range > lim && !fastmap[*d])
+		      {
+			d++;
+			range--;
+		      }
+#ifdef emacs
+		    if (multibyte && range > lim)
+		      {
+			/* Check that we are at the beginning of a char.  */
+			int at_boundary;
+			AT_CHAR_BOUNDARY_P (at_boundary, d, d_start);
+			if (at_boundary)
+			  break;
+			else
+			  { /* We have matched an internal byte of a char
+			       rather than the leading byte, so it's a false
+			       positive: we should keep scanning.  */
+			    d++; range--;
+			  }
+		      }
+		    else
+#endif
+		      break;
+		  } while (1);
 
 	      startpos += irange - range;
 	    }

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2006-09-22 17:34 UTC | newest]

Thread overview: (only message) (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2006-09-22 17:34 Regexp matching errors Stefan Monnier

Code repositories for project(s) associated with this external index

	https://git.savannah.gnu.org/cgit/emacs.git
	https://git.savannah.gnu.org/cgit/emacs/org-mode.git

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.