unofficial mirror of emacs-devel@gnu.org 
 help / color / mirror / code / Atom feed
* "Raw" string literals for elisp
@ 2021-09-08  1:49 Anna Glasgall
  2021-09-08  7:10 ` Po Lu
                   ` (5 more replies)
  0 siblings, 6 replies; 120+ messages in thread
From: Anna Glasgall @ 2021-09-08  1:49 UTC (permalink / raw)
  To: emacs-devel; +Cc: Anna Glasgall

[-- Attachment #1: Type: text/plain, Size: 2419 bytes --]

[My previous message appears to have been eaten, or at least it's not
showing up in the archive; resending from a different From: address.
Apologies for any duplication]

Hello Emacs developers,

I've long been annoyed by the number of backslashes needed when using
string literals in elisp for certain things (regexes, UNC paths, etc),
so I started work on a patch (WIP attached) to implement support for
"raw" string literals, a la Python r-strings. These are string literals
that work exactly like normal string literals, with the exception that
backslash escapes (except for \") are not processed; \ may freely
appear in the string without need to escape. I've made good progress,
but unfortunately I've run into a roadblock and am not sure what to do
next.

I've successfully taught the elisp reader (read1 in lread.c) how to
read r-strings. I thought I had managed to make lisp-mode/elisp-mode
happy by allowing "r" to be a prefix character (C-x C-e and the
underlying forward-sexp/backward-sexp seemed to work fine at first),
but realized that I ran into trouble with strings containing the
sequence of characters '\\"'.

The reader correctly reads r"a\\"" as a string containing the sequence
of characters 'a', '\', '"', and M-: works. Unfortunately, if I try
sexp-based navigation or e.g. C-x C-e, it falls apart. The parser in
syntax.c, which afaict is what lisp-mode is using to try and find sexps
in buffer text, doesn't seem to know what to do with this expression.
I've spent some time staring at syntax.c, but I must confess that I'm
entirely defeated in terms of what changes need to be made here to
teach this other parser about prefixed strings in which the prefix has
a meaning that affects the interpretation of the characters between
the string fences.

I've attached a copy of my WIP patch; it's definitely not near final
code quality and doesn't have documentation yet, all of which I would
take care of before submitting for inclusion. I also haven't filled out
the copyright assignment paperwork yet, but should this work reach a
point where it was likely to be accepted, I'd be happy to do that.


I'd very much appreciate some pointers on what to try next here, or
some explanation of how syntax.c/syntax.el works beyond what's in the
reference manual. If this is a fool's errand I'm tilting at here, I'd
also appreciate being told that before I sink more time into it :)

thanks,

Anna Glasgall



[-- Attachment #2: rstrings.patch --]
[-- Type: text/x-patch, Size: 3044 bytes --]

diff --git a/lisp/progmodes/elisp-mode.el b/lisp/progmodes/elisp-mode.el
index 7ed2d3d08c..e91d81de6d 100644
--- a/lisp/progmodes/elisp-mode.el
+++ b/lisp/progmodes/elisp-mode.el
@@ -39,6 +39,7 @@ 'emacs-lisp-mode-abbrev-table
 
 (defvar emacs-lisp-mode-syntax-table
   (let ((table (make-syntax-table lisp-data-mode-syntax-table)))
+    (modify-syntax-entry ?r "_ p" table)
     ;; These are redundant, now.
     ;;(modify-syntax-entry ?\[ "(]  " table)
     ;;(modify-syntax-entry ?\] ")[  " table)
diff --git a/src/lread.c b/src/lread.c
index a6c2db5d99..8222c17d0b 100644
--- a/src/lread.c
+++ b/src/lread.c
@@ -2970,10 +2970,11 @@ read1 (Lisp_Object readcharfun, int *pch, bool first_in_list)
   bool multibyte;
   char stackbuf[stackbufsize];
   current_thread->stack_top = stackbuf;
-
+  bool raw_literal = false;
   *pch = 0;
 
  retry:
+  raw_literal = false;
 
   c = READCHAR_REPORT_MULTIBYTE (&multibyte);
   if (c < 0)
@@ -3564,7 +3565,23 @@ read1 (Lisp_Object readcharfun, int *pch, bool first_in_list)
 
 	invalid_syntax ("?", readcharfun);
       }
-
+      /* "raw" string literal syntax, a la Python; "raw" literals do
+	 not process escapes except for \" */
+    case 'r':
+      {
+	int next_ch;
+	next_ch = READCHAR;
+	if (next_ch == '\"')
+	  {
+	    raw_literal = true;
+	    /* fall through to string reading */
+	  }
+	else
+	  {
+	    UNREAD (next_ch);
+	    goto read_symbol;
+	  }
+      }
     case '"':
       {
 	ptrdiff_t count = SPECPDL_INDEX ();
@@ -3599,7 +3616,21 @@ read1 (Lisp_Object readcharfun, int *pch, bool first_in_list)
 	    if (ch == '\\')
 	      {
 		int modifiers;
-
+		if (raw_literal)
+		  {
+		    /* still have to handle backslash followed by
+		       double quote even in a raw literal */
+		    int next_ch = READCHAR;
+		    if (next_ch == '\"')
+		      {
+			ch = next_ch;
+		      }
+		    else
+		      {
+			UNREAD(next_ch);
+		      }
+		    goto read_normal_char;
+		  }
 		ch = read_escape (readcharfun, 1);
 
 		/* CH is -1 if \ newline or \ space has just been seen.  */
@@ -3653,6 +3684,7 @@ read1 (Lisp_Object readcharfun, int *pch, bool first_in_list)
 	      }
 	    else
 	      {
+	      read_normal_char:
 		p += CHAR_STRING (ch, (unsigned char *) p);
 		if (CHAR_BYTE8_P (ch))
 		  force_singlebyte = true;
diff --git a/test/src/lread-tests.el b/test/src/lread-tests.el
index dac8f95bc4..964f3da91b 100644
--- a/test/src/lread-tests.el
+++ b/test/src/lread-tests.el
@@ -262,5 +262,15 @@ lread-float
   (should (equal (read "-0.e-5") -0.0))
   )
 
+(ert-deftest lread-string-raw-syntax ()
+  ;; syntax r"a\bc" => string composed of ?a, ?\\, ?b, ?c
+  (should (equal (read "r\"a\\bc\"") "a\\bc"))
+  ;; syntax "a\bc" => string composed of ?a, ?\b, ?c 
+  (should (equal (read "\"a\\bc\"") "a\C-hc"))
+  ;; syntax r"a\"b\"a" => string composed of ?a, ?\", ?b, ?\", ?a
+  (should (equal (read "r\"a\\\"b\\\"a\"") "a\"b\"a"))
+  ;; syntax r"a\\b" => string composed of ?a, ?\\, ?\\, ?b
+  (should (equal (read "r\"a\\\\b\"") "a\\\\b"))
+  )
 
 ;;; lread-tests.el ends here

^ permalink raw reply related	[flat|nested] 120+ messages in thread

end of thread, other threads:[~2021-10-14  4:05 UTC | newest]

Thread overview: 120+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-09-08  1:49 "Raw" string literals for elisp Anna Glasgall
2021-09-08  7:10 ` Po Lu
2021-09-08 14:19   ` Anna Glasgall
2021-09-08  7:12 ` Lars Ingebrigtsen
2021-09-08 14:20   ` Anna Glasgall
2021-09-08 11:30 ` Alan Mackenzie
2021-09-08 14:27   ` Anna Glasgall
2021-09-08 11:34 ` Adam Porter
2021-09-08 13:59   ` Clément Pit-Claudel
2021-09-08 14:12     ` Adam Porter
2021-09-09  3:09   ` Richard Stallman
2021-09-08 13:10 ` Stefan Monnier
2021-09-08 14:31   ` Anna Glasgall
2021-09-08 15:27     ` Mattias Engdegård
2021-09-08 15:41       ` Stefan Kangas
2021-09-08 16:45         ` Mattias Engdegård
2021-09-08 16:01       ` Alan Mackenzie
2021-09-08 18:24         ` Mattias Engdegård
2021-09-08 19:00           ` Alan Mackenzie
2021-09-08 19:22         ` Philip Kaludercic
2021-09-08 19:36           ` Alan Mackenzie
2021-09-08 21:11           ` Stefan Kangas
2021-09-08 21:24             ` Philip Kaludercic
2021-09-09  6:52             ` tomas
2021-09-08 15:54     ` Stefan Kangas
2021-09-08 16:05     ` tomas
2021-09-08 16:42       ` Lars Ingebrigtsen
2021-09-08 20:08         ` Stefan Monnier
2021-09-08 20:18       ` Stefan Monnier
2021-09-09  7:04         ` tomas
2021-09-09 10:30         ` Mattias Engdegård
2021-09-09 11:36           ` Stefan Kangas
2021-09-09 13:33             ` Mattias Engdegård
2021-09-09 14:32               ` tomas
2021-09-14 10:43               ` Augusto Stoffel
2021-09-14 11:42                 ` Ihor Radchenko
2021-09-14 13:18                   ` Stefan Monnier
2021-09-14 13:22                     ` Stefan Kangas
2021-09-14 14:01                       ` Ihor Radchenko
2021-09-14 14:39                       ` Clément Pit-Claudel
2021-09-14 15:33                         ` Amin Bandali
2021-09-14 16:05                         ` Eli Zaretskii
2021-09-14 17:49                   ` Jose E. Marchesi
2021-09-08 20:40 ` Anna Glasgall
2021-09-08 21:28   ` Alan Mackenzie
2021-10-02 21:03   ` Daniel Brooks
2021-10-04  0:13     ` Richard Stallman
2021-10-04  0:36       ` Daniel Brooks
2021-10-04 12:00         ` Eli Zaretskii
2021-10-04 15:36           ` character sets as they relate to “Raw” " Daniel Brooks
2021-10-04 16:34             ` Stefan Monnier
2021-10-04 20:49               ` Daniel Brooks
2021-10-04 21:19                 ` Alan Mackenzie
2021-10-04 22:19                   ` Daniel Brooks
2021-10-05 11:20                     ` Alan Mackenzie
2021-10-05 17:08                       ` Daniel Brooks
2021-10-06 20:54                         ` Richard Stallman
2021-10-07  7:01                           ` Eli Zaretskii
2021-10-05  8:55                 ` Yuri Khan
2021-10-05 16:25                   ` Juri Linkov
2021-10-05 17:15                     ` Eli Zaretskii
2021-10-05 18:40                       ` [External] : " Drew Adams
2021-10-06 20:54                       ` Richard Stallman
2021-10-07  6:54                         ` Eli Zaretskii
2021-10-07 13:14                           ` Stefan Kangas
2021-10-07 13:34                             ` Eli Zaretskii
2021-10-07 14:48                               ` Stefan Kangas
2021-10-07 16:00                                 ` Eli Zaretskii
2021-10-08  0:37                                   ` Stefan Kangas
2021-10-08  6:53                                     ` Eli Zaretskii
2021-10-08 15:09                                       ` Display of em dashes in our documentation Stefan Kangas
2021-10-08 16:12                                         ` Eli Zaretskii
2021-10-08 17:17                                           ` Stefan Kangas
2021-10-10  8:00                                             ` Juri Linkov
2021-10-08 17:27                                           ` Daniel Brooks
2021-10-08 18:26                                           ` [External] : " Drew Adams
2021-10-08 17:17                                       ` character sets as they relate to “Raw” string literals for elisp Alan Mackenzie
2021-10-08 17:42                                         ` Eli Zaretskii
2021-10-08 18:47                                           ` Eli Zaretskii
2021-10-08 20:01                                             ` Alan Mackenzie
2021-10-09  6:18                                               ` Eli Zaretskii
2021-10-09 10:57                                                 ` Alan Mackenzie
2021-10-09 11:49                                                   ` Eli Zaretskii
2021-10-09 13:08                                                     ` Alan Mackenzie
2021-10-09 13:15                                                       ` Eli Zaretskii
2021-10-09 15:07                                                         ` Alan Mackenzie
2021-10-11  0:45                                                           ` linux console limitations Daniel Brooks
2021-10-12 10:18                                                             ` Alan Mackenzie
2021-10-14  4:05                                                               ` Daniel Brooks
2021-10-10  8:03                                                   ` character sets as they relate to “Raw” string literals for elisp Juri Linkov
2021-10-05 18:23                     ` [External] : " Drew Adams
2021-10-05 19:13                       ` Stefan Kangas
2021-10-05 19:20                         ` Drew Adams
2021-10-05 17:13                   ` Daniel Brooks
2021-10-05 12:04                 ` Eli Zaretskii
2021-10-05 21:20                 ` Richard Stallman
2021-10-05 22:13                   ` Daniel Brooks
2021-10-06 12:13                     ` Eli Zaretskii
2021-10-06 18:57                       ` Daniel Brooks
2021-10-07  4:23                         ` Eli Zaretskii
2021-10-07 22:27                         ` Richard Stallman
2021-10-08 10:37                         ` Po Lu
2021-10-08 10:53                           ` Basil L. Contovounesios
2021-10-08 11:27                             ` tomas
2021-10-05 22:25                   ` character sets as they relate to “Raw” " Stefan Kangas
2021-10-06  6:21                     ` Daniel Brooks
2021-10-07 22:20                       ` Richard Stallman
2021-10-06 12:29                     ` Eli Zaretskii
2021-10-06 12:52                       ` Stefan Kangas
2021-10-06 13:10                         ` Jean-Christophe Helary
2021-10-06 11:53                   ` character sets as they relate to “Raw” " Eli Zaretskii
2021-10-04 18:57             ` Eli Zaretskii
2021-10-04 19:14               ` Yuri Khan
2021-10-05 21:20                 ` Richard Stallman
2021-10-06  3:48                   ` character sets as they relate to “Raw” " Matthew Carter
2021-10-04 22:29         ` "Raw" " Richard Stallman
2021-10-05  5:39           ` Daniel Brooks
2021-10-05  5:43             ` Jean-Christophe Helary
2021-10-05  8:24               ` Richard Stallman
2021-10-05 12:23               ` Eli Zaretskii

Code repositories for project(s) associated with this public inbox

	https://git.savannah.gnu.org/cgit/emacs.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).