bug#3687: 23.1.50; inconsistency in multibyte eight-bit regexps [PATCH]

unofficial mirror of bug-gnu-emacs@gnu.org 
 help / color / mirror / code / Atom feed

From: "Mattias Engdegård" <mattiase@acm.org>
To: mituharu@math.s.chiba-u.ac.jp,
	Stefan Monnier <monnier@iro.umontreal.ca>,
	Eli Zaretskii <eliz@gnu.org>
Cc: 3687@debbugs.gnu.org
Subject: bug#3687: 23.1.50; inconsistency in multibyte eight-bit regexps [PATCH]
Date: Fri, 28 Jun 2019 14:41:51 +0200	[thread overview]
Message-ID: <E668D31C-1511-404B-AE6C-BBFB807B293E@acm.org> (raw)
In-Reply-To: <200906260956.n5Q9uo917123@church.math.s.chiba-u.ac.jp>

[-- Attachment #1: Type: text/plain, Size: 559 bytes --]

Let's assume the following semantics as desirable:

1. All characters and raw bytes (up to regexp syntax) match themselves no matter whether they are given as literals or in character alternatives.
2. All raw bytes C match themselves and nothing else no matter whether the pattern or target string/buffer are unibyte or multibyte.
3. Ranges from ASCII to raw bytes work as expected and do not contain Unicode characters above U+007F.
4. Ranges from non-ASCII Unicode characters to raw bytes make no sense and are treated as empty.

Here is a patch.


[-- Attachment #2: 0001-Correct-regexp-matching-of-raw-bytes.patch --]
[-- Type: application/octet-stream, Size: 9348 bytes --]

From 6683077bf5d9509abbae050e1aa4c3dddae1bba9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mattias=20Engdeg=C3=A5rd?= <mattiase@acm.org>
Date: Fri, 28 Jun 2019 10:20:55 +0200
Subject: [PATCH] Correct regexp matching of raw bytes

Make regexp matching of raw bytes work in all combination of unibyte
and multibyte patterns and targets, as exact strings and in character
alternatives (bug#3687).

* src/regex-emacs.c (analyze_first):
Include raw byte in fastmap when pattern is a multibyte exact string.
Include leading byte in fastmap for raw bytes in character alternatives.
(re_match_2_internal):
Decrement the byte count by the number of bytes in the pattern character,
not 1.
* test/src/regex-emacs-tests.el (regexp-unibyte-unibyte)
(regexp-multibyte-unibyte, regexp-unibyte-mutibyte)
(regexp-multibyte-multibyte): New tests.
---
 src/regex-emacs.c             |  24 +++++--
 test/src/regex-emacs-tests.el | 120 ++++++++++++++++++++++++++++++++++
 2 files changed, 140 insertions(+), 4 deletions(-)

diff --git a/src/regex-emacs.c b/src/regex-emacs.c
index c353a78fb4..5887eaa30c 100644
--- a/src/regex-emacs.c
+++ b/src/regex-emacs.c
@@ -2794,6 +2794,7 @@ static int
 analyze_first (re_char *p, re_char *pend, char *fastmap, bool multibyte)
 {
   int j, k;
+  int nbits;
   bool not;
 
   /* If all elements for base leading-codes in fastmap is set, this
@@ -2854,7 +2855,14 @@ analyze_first (re_char *p, re_char *pend, char *fastmap, bool multibyte)
 		 each byte is a character.  Thus, this works in both
 		 cases. */
 	      fastmap[p[1]] = 1;
-	      if (! multibyte)
+	      if (multibyte)
+		{
+		  /* Cover the case of matching a raw char in a
+		     multibyte regexp against unibyte.	*/
+		  if (CHAR_BYTE8_HEAD_P (p[1]))
+		    fastmap[CHAR_TO_BYTE8 (STRING_CHAR (p + 1))] = 1;
+		}
+	      else
 		{
 		  /* For the case of matching this unibyte regex
 		     against multibyte, we must set a leading code of
@@ -2886,11 +2894,18 @@ analyze_first (re_char *p, re_char *pend, char *fastmap, bool multibyte)
 	case charset:
 	  if (!fastmap) break;
 	  not = (re_opcode_t) *(p - 1) == charset_not;
-	  for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++;
-	       j >= 0; j--)
+	  nbits = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
+	  p++;
+	  for (j = 0; j < nbits; j++)
 	    if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
 	      fastmap[j] = 1;
 
+	  /* To match raw bytes (in the 80..ff range) against multibyte
+	     strings, add their leading bytes to the fastmap.  */
+	  for (j = 0x80; j < nbits; j++)
+	    if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
+	      fastmap[CHAR_LEADING_CODE (BYTE8_TO_CHAR (j))] = 1;
+
 	  if (/* Any leading code can possibly start a character
 		 which doesn't match the specified set of characters.  */
 	      not
@@ -4251,8 +4266,9 @@ re_match_2_internal (struct re_pattern_buffer *bufp,
 		  }
 		p += pat_charlen;
 		d++;
+		mcnt -= pat_charlen;
 	      }
-	    while (--mcnt);
+	    while (mcnt > 0);
 
 	  break;
 
diff --git a/test/src/regex-emacs-tests.el b/test/src/regex-emacs-tests.el
index 0ae50c94d4..50ed3e870a 100644
--- a/test/src/regex-emacs-tests.el
+++ b/test/src/regex-emacs-tests.el
@@ -683,4 +683,124 @@ regex-tests-TESTS
   (should-not (string-match "\\`x\\{65535\\}" (make-string 65534 ?x)))
   (should-error (string-match "\\`x\\{65536\\}" "X") :type 'invalid-regexp))
 
+(ert-deftest regexp-unibyte-unibyte ()
+  "Test matching a unibyte regexp against a unibyte string."
+  ;; Sanity check
+  (should-not (multibyte-string-p "ab"))
+  (should-not (multibyte-string-p "\xff"))
+  ;; ASCII
+  (should (string-match "a[b]" "ab"))
+  ;; Raw
+  (should (string-match "\xf1" "\xf1"))
+  (should-not (string-match "\xf1" "\xc1\xb1"))
+  ;; Raw, char alt
+  (should (string-match "[\xf1]" "\xf1"))
+  (should-not (string-match "[\xf1]" "\xc1\xb1"))
+  ;; Raw range
+  (should (string-match "[\x82-\xd3]" "\xbb"))
+  (should-not (string-match "[\x82-\xd3]" "a"))
+  (should-not (string-match "[\x82-\xd3]" "\x81"))
+  (should-not (string-match "[\x82-\xd3]" "\xd4"))
+  ;; ASCII-raw range
+  (should (string-match "[f-\xd3]" "q"))
+  (should (string-match "[f-\xd3]" "\xbb"))
+  (should-not (string-match "[f-\xd3]" "e"))
+  (should-not (string-match "[f-\xd3]" "\xd4")))
+
+(ert-deftest regexp-multibyte-multibyte ()
+  "Test matching a multibyte regexp against a multibyte string."
+  ;; Sanity check
+  (should (multibyte-string-p "åü"))
+  ;; ASCII
+  (should (string-match (string-to-multibyte "a[b]")
+                        (string-to-multibyte "ab")))
+  ;; Unicode
+  (should (string-match "å[ü]z" "åüz"))
+  (should-not (string-match "ü" (string-to-multibyte "\xc3\xbc")))
+  ;; Raw
+  (should (string-match (string-to-multibyte "\xf1")
+                        (string-to-multibyte "\xf1")))
+  (should-not (string-match (string-to-multibyte "\xf1")
+                            (string-to-multibyte "\xc1\xb1")))
+  (should-not (string-match (string-to-multibyte "\xc1\xb1")
+                            (string-to-multibyte "\xf1")))
+  ;; Raw, char alt
+  (should (string-match (string-to-multibyte "[\xf1]")
+                        (string-to-multibyte "\xf1")))
+  ;; Raw range
+  (should (string-match (string-to-multibyte "[\x82-\xd3]")
+                        (string-to-multibyte "\xbb")))
+  (should-not (string-match (string-to-multibyte "[\x82-\xd3]") "a"))
+  (should-not (string-match (string-to-multibyte "[\x82-\xd3]") "Å"))
+  (should-not (string-match (string-to-multibyte "[\x82-\xd3]") "ü"))
+  (should-not (string-match (string-to-multibyte "[\x82-\xd3]") "\x81"))
+  (should-not (string-match (string-to-multibyte "[\x82-\xd3]") "\xd4"))
+  ;; ASCII-raw range: should exclude U+0100..U+10FFFF
+  (should (string-match (string-to-multibyte "[f-\xd3]")
+                        (string-to-multibyte "q")))
+  (should (string-match (string-to-multibyte "[f-\xd3]")
+                        (string-to-multibyte "\xbb")))
+  (should-not (string-match (string-to-multibyte "[f-\xd3]") "e"))
+  (should-not (string-match (string-to-multibyte "[f-\xd3]") "Å"))
+  (should-not (string-match (string-to-multibyte "[f-\xd3]") "ü"))
+  (should-not (string-match (string-to-multibyte "[f-\xd3]") "\xd4"))
+  ;; Unicode-raw range: should be empty
+  (should-not (string-match "[å-\xd3]" "å"))
+  (should-not (string-match "[å-\xd3]" (string-to-multibyte "\xd3")))
+  (should-not (string-match "[å-\xd3]" (string-to-multibyte "\xbb")))
+  (should-not (string-match "[å-\xd3]" "ü"))
+  ;; No equivalence between raw bytes and latin-1
+  (should-not (string-match "å" (string-to-multibyte "\xe5")))
+  (should-not (string-match "[å]" (string-to-multibyte "\xe5")))
+  (should-not (string-match "\xe5" "å"))
+  (should-not (string-match "[\xe5]" "å")))
+
+(ert-deftest regexp-unibyte-multibyte ()
+  "Test matching a unibyte regexp against a multibyte string."
+  ;; ASCII
+  (should (string-match "a[b]" (string-to-multibyte "ab")))
+  ;; Unicode
+  (should (string-match "a.[^b]c" (string-to-multibyte "aåüc")))
+  ;; Raw
+  (should (string-match "\xf1" (string-to-multibyte "\xf1")))
+  (should-not (string-match "\xc1\xb1" (string-to-multibyte "\xf1")))
+  ;; Raw, char alt
+  (should (string-match "[\xf1]" (string-to-multibyte "\xf1")))
+  (should-not (string-match "[\xc1][\xb1]" (string-to-multibyte "\xf1")))
+  ;; ASCII-raw range: should exclude U+0100..U+10FFFF
+  (should (string-match "[f-\xd3]" (string-to-multibyte "q")))
+  (should (string-match "[f-\xd3]" (string-to-multibyte "\xbb")))
+  (should-not (string-match "[f-\xd3]" "e"))
+  (should-not (string-match "[f-\xd3]" "Å"))
+  (should-not (string-match "[f-\xd3]" "ü"))
+  (should-not (string-match "[f-\xd3]" "\xd4"))
+  ;; No equivalence between raw bytes and latin-1
+  (should-not (string-match "\xe5" "å"))
+  (should-not (string-match "[\xe5]" "å")))
+
+(ert-deftest regexp-multibyte-unibyte ()
+  "Test matching a multibyte regexp against a unibyte string."
+  ;; ASCII
+  (should (string-match (string-to-multibyte "a[b]") "ab"))
+  ;; Unicode
+  (should (string-match "a[^ü]c" "abc"))
+  (should-not (string-match "ü" "\xc3\xbc"))
+  ;; Raw
+  (should (string-match (string-to-multibyte "\xf1") "\xf1"))
+  (should-not (string-match (string-to-multibyte "\xf1") "\xc1\xb1"))
+  ;; Raw, char alt
+  (should (string-match (string-to-multibyte "[\xf1]") "\xf1"))
+  (should-not (string-match (string-to-multibyte "[\xf1]") "\xc1\xb1"))
+  ;; ASCII-raw range: should exclude U+0100..U+10FFFF
+  (should (string-match (string-to-multibyte "[f-\xd3]") "q"))
+  (should (string-match (string-to-multibyte "[f-\xd3]") "\xbb"))
+  (should-not (string-match (string-to-multibyte "[f-\xd3]") "e"))
+  (should-not (string-match (string-to-multibyte "[f-\xd3]") "\xd4"))
+  ;; Unicode-raw range: should be empty
+  (should-not (string-match "[å-\xd3]" "\xd3"))
+  (should-not (string-match "[å-\xd3]" "\xbb"))
+  ;; No equivalence between raw bytes and latin-1
+  (should-not (string-match "å" "\xe5"))
+  (should-not (string-match "[å]" "\xe5")))
+
 ;;; regex-emacs-tests.el ends here
-- 
2.20.1 (Apple Git-117)

next prev parent reply	other threads:[~2019-06-28 12:41 UTC|newest]

Thread overview: 17+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2009-06-26  9:56 bug#3687: 23.1.50; inconsistency in multibyte eight-bit regexps YAMAMOTO Mitsuharu
2009-06-26 13:43 ` Eli Zaretskii
2009-06-27  1:30   ` YAMAMOTO Mitsuharu
2009-06-27  9:36     ` Eli Zaretskii
2009-06-29  3:02       ` YAMAMOTO Mitsuharu
2009-06-29  8:47         ` Stefan Monnier
2009-07-24  1:08           ` YAMAMOTO Mitsuharu
2019-06-28 12:41 ` Mattias Engdegård [this message]
2019-06-28 13:03   ` bug#3687: 23.1.50; inconsistency in multibyte eight-bit regexps [PATCH] Eli Zaretskii
2019-06-28 14:05     ` Mattias Engdegård
2019-06-28 14:40       ` Eli Zaretskii
2019-06-28 15:00         ` Mattias Engdegård
2019-06-28 16:20           ` Eli Zaretskii
2019-06-28 16:47             ` Mattias Engdegård
2019-06-28 14:56       ` Eli Zaretskii
2019-06-28 15:18         ` Stefan Monnier
2019-06-28 15:34           ` Mattias Engdegård

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:c353a78fb dfblob:5887eaa30 dfblob:0ae50c94d dfblob:50ed3e870 )
 OR (
bs:"Correct regexp matching of raw bytes" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://www.gnu.org/software/emacs/

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=E668D31C-1511-404B-AE6C-BBFB807B293E@acm.org \
    --to=mattiase@acm.org \
    --cc=3687@debbugs.gnu.org \
    --cc=eliz@gnu.org \
    --cc=mituharu@math.s.chiba-u.ac.jp \
    --cc=monnier@iro.umontreal.ca \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

Code repositories for project(s) associated with this public inbox

	https://git.savannah.gnu.org/cgit/emacs.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).