unofficial mirror of bug-gnu-emacs@gnu.org 
 help / color / mirror / code / Atom feed
From: "Mattias Engdegård" <mattiase@acm.org>
To: 55738@debbugs.gnu.org
Subject: bug#55738: character escape bugs in the reader
Date: Wed, 1 Jun 2022 12:00:21 +0200	[thread overview]
Message-ID: <84CE3A6F-E487-4823-96CB-208C79EB235C@acm.org> (raw)
In-Reply-To: <20BC6F3C-1C72-4469-946D-8B9583C73024@acm.org>

[-- Attachment #1: Type: text/plain, Size: 68 bytes --]

Suggested patch. It does not address the third bug above (\C-SPC).


[-- Attachment #2: 0001-Fix-reader-char-escape-bugs-bug-55738.patch --]
[-- Type: application/octet-stream, Size: 10899 bytes --]

From 801c6789d753eb1d94adec95af76e6c28dc27e21 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mattias=20Engdeg=C3=A5rd?= <mattiase@acm.org>
Date: Wed, 1 Jun 2022 11:39:44 +0200
Subject: [PATCH] Fix reader char escape bugs (bug#55738)

Make the character literal ?\LF (linefeed) generate 10, not -1.

Ensure that Control escape sequences in character literals are
idempotent: ?\C-\C-a and ?\^\^a mean the same thing as ?\C-a and ?\^a,
generating the control character with value 1.  "\C-\C-a" no longer
signals an error.

* src/lread.c (read_escape): Make nonrecursive and only combine
the base char with modifiers at the end, creating control chars
if applicable.  Remove the `stringp` argument; assume character
literal syntax.  Never return -1.
(read_string_literal): Handle string-specific escape semantics here
and simplify.
* test/src/lread-tests.el (lread-misc-2): New test.
---
 src/lread.c             | 201 ++++++++++++++++++++--------------------
 test/src/lread-tests.el |  10 ++
 2 files changed, 112 insertions(+), 99 deletions(-)

diff --git a/src/lread.c b/src/lread.c
index a1045184d9..670413efc0 100644
--- a/src/lread.c
+++ b/src/lread.c
@@ -2631,93 +2631,88 @@ character_name_to_code (char const *name, ptrdiff_t name_len,
 enum { UNICODE_CHARACTER_NAME_LENGTH_BOUND = 200 };
 
 /* Read a \-escape sequence, assuming we already read the `\'.
+   When there is a difference between string and character literal \-sequences,
+   the latter is assumed.
    If the escape sequence forces unibyte, return eight-bit char.  */
 
 static int
-read_escape (Lisp_Object readcharfun, bool stringp)
+read_escape (Lisp_Object readcharfun)
 {
+  int modifiers = 0;
+ again: ;
   int c = READCHAR;
-  /* \u allows up to four hex digits, \U up to eight.  Default to the
-     behavior for \u, and change this value in the case that \U is seen.  */
-  int unicode_hex_count = 4;
+  int unicode_hex_count;
 
   switch (c)
     {
     case -1:
       end_of_file_error ();
 
-    case 'a':
-      return '\007';
-    case 'b':
-      return '\b';
-    case 'd':
-      return 0177;
-    case 'e':
-      return 033;
-    case 'f':
-      return '\f';
-    case 'n':
-      return '\n';
-    case 'r':
-      return '\r';
-    case 't':
-      return '\t';
-    case 'v':
-      return '\v';
-    case '\n':
-      return -1;
-    case ' ':
-      if (stringp)
-	return -1;
-      return ' ';
+    case 'a': c = '\a'; break;
+    case 'b': c = '\b'; break;
+    case 'd': c = 127; break;
+    case 'e': c = 27; break;
+    case 'f': c = '\f'; break;
+    case 'n': c = '\n'; break;
+    case 'r': c = '\r'; break;
+    case 't': c = '\t'; break;
+    case 'v': c = '\v'; break;
 
     case 'M':
       c = READCHAR;
       if (c != '-')
 	error ("Invalid escape character syntax");
+      modifiers |= meta_modifier;
       c = READCHAR;
       if (c == '\\')
-	c = read_escape (readcharfun, 0);
-      return c | meta_modifier;
+	goto again;
+      break;
 
     case 'S':
       c = READCHAR;
       if (c != '-')
 	error ("Invalid escape character syntax");
+      modifiers |= shift_modifier;
       c = READCHAR;
       if (c == '\\')
-	c = read_escape (readcharfun, 0);
-      return c | shift_modifier;
+	goto again;
+      break;
 
     case 'H':
       c = READCHAR;
       if (c != '-')
 	error ("Invalid escape character syntax");
+      modifiers |= hyper_modifier;
       c = READCHAR;
       if (c == '\\')
-	c = read_escape (readcharfun, 0);
-      return c | hyper_modifier;
+	goto again;
+      break;
 
     case 'A':
       c = READCHAR;
       if (c != '-')
 	error ("Invalid escape character syntax");
+      modifiers |= alt_modifier;
       c = READCHAR;
       if (c == '\\')
-	c = read_escape (readcharfun, 0);
-      return c | alt_modifier;
+	goto again;
+      break;
 
     case 's':
       c = READCHAR;
-      if (stringp || c != '-')
+      if (c == '-')
+	{
+	  modifiers |= super_modifier;
+	  c = READCHAR;
+	  if (c == '\\')
+	    goto again;
+	}
+      else
 	{
 	  UNREAD (c);
-	  return ' ';
+	  c = ' ';
 	}
-      c = READCHAR;
-      if (c == '\\')
-	c = read_escape (readcharfun, 0);
-      return c | super_modifier;
+      break;
 
     case 'C':
       c = READCHAR;
@@ -2725,21 +2720,11 @@ read_escape (Lisp_Object readcharfun, bool stringp)
 	error ("Invalid escape character syntax");
       FALLTHROUGH;
     case '^':
+      modifiers |= ctrl_modifier;
       c = READCHAR;
       if (c == '\\')
-	c = read_escape (readcharfun, 0);
-      if ((c & ~CHAR_MODIFIER_MASK) == '?')
-	return 0177 | (c & CHAR_MODIFIER_MASK);
-      else if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
-	return c | ctrl_modifier;
-      /* ASCII control chars are made from letters (both cases),
-	 as well as the non-letters within 0100...0137.  */
-      else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
-	return (c & (037 | ~0177));
-      else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
-	return (c & (037 | ~0177));
-      else
-	return c | ctrl_modifier;
+	goto again;
+      break;
 
     case '0':
     case '1':
@@ -2749,31 +2734,30 @@ read_escape (Lisp_Object readcharfun, bool stringp)
     case '5':
     case '6':
     case '7':
-      /* An octal escape, as in ANSI C.  */
+      /* 1-3 octal digits.  */
       {
-	register int i = c - '0';
-	register int count = 0;
+	int i = c - '0';
+	int count = 0;
 	while (++count < 3)
 	  {
-	    if ((c = READCHAR) >= '0' && c <= '7')
-	      {
-		i *= 8;
-		i += c - '0';
-	      }
-	    else
+	    c = READCHAR;
+	    if (c < '0' || c > '7')
 	      {
 		UNREAD (c);
 		break;
 	      }
+	    i *= 8;
+	    i += c - '0';
 	  }
 
 	if (i >= 0x80 && i < 0x100)
 	  i = BYTE8_TO_CHAR (i);
-	return i;
+	c = i;
+	break;
       }
 
     case 'x':
-      /* A hex escape, as in ANSI C.  */
+      /* One or more hex digits.  */
       {
 	unsigned int i = 0;
 	int count = 0;
@@ -2795,16 +2779,18 @@ read_escape (Lisp_Object readcharfun, bool stringp)
 	  }
 
 	if (count < 3 && i >= 0x80)
-	  return BYTE8_TO_CHAR (i);
-	return i;
+	  i = BYTE8_TO_CHAR (i);
+	c = i;
+	break;
       }
 
-    case 'U':
-      /* Post-Unicode-2.0: Up to eight hex chars.  */
+    case 'U':			/* Eight hex digits.  */
       unicode_hex_count = 8;
-      FALLTHROUGH;
-    case 'u':
+      goto unicode;
 
+    case 'u':			/* Four hex digits.  */
+      unicode_hex_count = 4;
+    unicode:
       /* A Unicode escape.  We only permit them in strings and characters,
 	 not arbitrarily in the source code, as in some other languages.  */
       {
@@ -2815,12 +2801,8 @@ read_escape (Lisp_Object readcharfun, bool stringp)
 	  {
 	    c = READCHAR;
 	    if (c < 0)
-	      {
-		if (unicode_hex_count > 4)
-		  error ("Malformed Unicode escape: \\U%x", i);
-		else
-		  error ("Malformed Unicode escape: \\u%x", i);
-	      }
+	      error ("Malformed Unicode escape: \\%c%x",
+		     unicode_hex_count == 4 ? 'u' : 'U', i);
 	    /* `isdigit' and `isalpha' may be locale-specific, which we don't
 	       want.  */
 	    int digit = char_hexdigit (c);
@@ -2831,7 +2813,8 @@ read_escape (Lisp_Object readcharfun, bool stringp)
 	  }
 	if (i > 0x10FFFF)
 	  error ("Non-Unicode character: 0x%x", i);
-	return i;
+	c = i;
+	break;
       }
 
     case 'N':
@@ -2880,12 +2863,31 @@ read_escape (Lisp_Object readcharfun, bool stringp)
 
 	/* character_name_to_code can invoke read0, recursively.
 	   This is why read0's buffer is not static.  */
-	return character_name_to_code (name, length, readcharfun);
+	c = character_name_to_code (name, length, readcharfun);
+	break;
       }
+    }
 
-    default:
-      return c;
+  c |= modifiers;
+  if (c & ctrl_modifier)
+    {
+      int b = c & ~CHAR_MODIFIER_MASK;
+      /* If the base char is in the 0x3f..0x5f range or a lower case
+	 letter, drop the ctrl_modifier bit and generate a C0 control
+	 character instead.  */
+      if ((b >= 0x3f && b <= 0x5f) || (b >= 'a' && b <= 'z'))
+	{
+	  c &= ~ctrl_modifier;
+	  if (b == '?')
+	    /* Special case: ^? is DEL.  */
+	    b = 127;
+	  else
+	    /* Make a C0 control in 0..31 by clearing bits 5 and 6.  */
+	    b &= 0x1f;
+	}
+      c = b | (c & CHAR_MODIFIER_MASK);
     }
+  return c;
 }
 
 /* Return the digit that CHARACTER stands for in the given BASE.
@@ -3012,7 +3014,7 @@ read_char_literal (Lisp_Object readcharfun)
     }
 
   if (ch == '\\')
-    ch = read_escape (readcharfun, 0);
+    ch = read_escape (readcharfun);
 
   int modifiers = ch & CHAR_MODIFIER_MASK;
   ch &= ~CHAR_MODIFIER_MASK;
@@ -3066,14 +3068,21 @@ read_string_literal (char stackbuf[VLA_ELEMS (stackbufsize)],
 
       if (ch == '\\')
 	{
-	  ch = read_escape (readcharfun, 1);
-
-	  /* CH is -1 if \ newline or \ space has just been seen.  */
-	  if (ch == -1)
+	  ch = READCHAR;
+	  switch (ch)
 	    {
+	    case 's':
+	      ch = ' ';
+	      break;
+	    case ' ':
+	    case '\n':
 	      if (p == read_buffer)
 		cancel = true;
 	      continue;
+	    default:
+	      UNREAD (ch);
+	      ch = read_escape (readcharfun);
+	      break;
 	    }
 
 	  int modifiers = ch & CHAR_MODIFIER_MASK;
@@ -3085,19 +3094,13 @@ read_string_literal (char stackbuf[VLA_ELEMS (stackbufsize)],
 	    force_multibyte = true;
 	  else		/* I.e. ASCII_CHAR_P (ch).  */
 	    {
-	      /* Allow `\C- ' and `\C-?'.  */
-	      if (modifiers == CHAR_CTL)
+	      /* Allow `\C-SPC' and `\^SPC'.  This is done here because
+		 the literals ?\C-SPC and ?\^SPC (rather inconsistently)
+		 yield (' ' | CHAR_CTL); see bug#55738.  */
+	      if (modifiers == CHAR_CTL && ch == ' ')
 		{
-		  if (ch == ' ')
-		    {
-		      ch = 0;
-		      modifiers = 0;
-		    }
-		  else if (ch == '?')
-		    {
-		      ch = 127;
-		      modifiers = 0;
-		    }
+		  ch = 0;
+		  modifiers = 0;
 		}
 	      if (modifiers & CHAR_SHIFT)
 		{
diff --git a/test/src/lread-tests.el b/test/src/lread-tests.el
index 47351c1d11..59d5ca076f 100644
--- a/test/src/lread-tests.el
+++ b/test/src/lread-tests.el
@@ -317,4 +317,14 @@ lread-misc
   (should (equal (read-from-string "#_")
                  '(## . 2))))
 
+(ert-deftest lread-misc-2 ()
+  ;; ?\LF should produce LF (only inside string literals do we ignore \LF).
+  (should (equal (read-from-string "?\\\n") '(?\n . 3)))
+  (should (equal (read-from-string "\"a\\\nb\"") '("ab" . 6)))
+  ;; The Control modifier constructs should be idempotent.
+  (should (equal ?\C-\C-x ?\C-x))
+  (should (equal ?\^\^x ?\C-x))
+  (should (equal ?\C-\^x ?\C-x))
+  (should (equal ?\^\C-x ?\C-x)))
+
 ;;; lread-tests.el ends here
-- 
2.32.0 (Apple Git-132)


  reply	other threads:[~2022-06-01 10:00 UTC|newest]

Thread overview: 12+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-05-31 11:33 bug#55738: character escape bugs in the reader Mattias Engdegård
2022-06-01 10:00 ` Mattias Engdegård [this message]
2022-06-01 13:42   ` Lars Ingebrigtsen
2022-06-01 17:56     ` Mattias Engdegård
2022-06-01 20:48       ` Basil L. Contovounesios via Bug reports for GNU Emacs, the Swiss army knife of text editors
2022-06-01 20:53         ` Mattias Engdegård
2022-06-01 21:05           ` Basil L. Contovounesios via Bug reports for GNU Emacs, the Swiss army knife of text editors
2022-06-02 15:12             ` Mattias Engdegård
2022-06-03 11:07               ` Mattias Engdegård
2022-06-18  6:54               ` Stefan Kangas
2022-06-18  9:34                 ` Mattias Engdegård
2023-09-06  1:56                   ` Stefan Kangas

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://www.gnu.org/software/emacs/

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=84CE3A6F-E487-4823-96CB-208C79EB235C@acm.org \
    --to=mattiase@acm.org \
    --cc=55738@debbugs.gnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://git.savannah.gnu.org/cgit/emacs.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).