unofficial mirror of bug-gnu-emacs@gnu.org 
 help / color / mirror / code / Atom feed
* bug#57004: [PATCH] Fontify Escape Sequences in Python String and Byte Literals
@ 2022-08-05 13:06 Laurence Warne
  2022-08-06 13:02 ` Lars Ingebrigtsen
  0 siblings, 1 reply; 4+ messages in thread
From: Laurence Warne @ 2022-08-05 13:06 UTC (permalink / raw)
  To: 57004


[-- Attachment #1.1: Type: text/plain, Size: 796 bytes --]

Hi, this patch implements escape code syntax highlighting within string and
bytes literals for python-mode (described at
https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals)
similar to how they are highlighted by https://pygments.org.

I've written a few tests (which should probably be expanded on (: ).  Here is
an example based on one of the test samples:

b'example 1: \n \\ \u1234 \U00010348 \N{Plus-Minus Sign}'
'example 2:  \n \\ \u1234 \U00010348 \N{Plus-Minus Sign}'

E.g. in the above we see "\n" and "\\" highlighted on the first line, but not
"\u1234", "\U00010348", or "\N{Plus-Minus Sign}", since these items aren't
valid escape sequences in bytes literals.  The second line, however, is a
string literal, so all of the preceding items are highlighted.

Thanks, Laurence

[-- Attachment #1.2: Type: text/html, Size: 1152 bytes --]

[-- Attachment #2: 0001-Fontify-python-escape-sequences-in-literals.patch --]
[-- Type: text/x-patch, Size: 7571 bytes --]

From 18584736ab6aa4802accf68a0f5ca8d12666b891 Mon Sep 17 00:00:00 2001
From: lWarne <laurencewarne@gmail.com>
Date: Thu, 4 Aug 2022 16:13:23 +0100
Subject: [PATCH] Fontify python escape sequences in literals

* lisp/progmodes/python.el (python-rx): Add regular expressions
matching escape codes in string and byte literals
(python--string-bytes-literal-matcher): new function
(python--not-raw-bytes-literal-start-regexp): new constant
(python--not-raw-string-literal-start-regexp): new constant
* test/lisp/progmodes/python-tests.el: Add tests for new
fontification
---
 lisp/progmodes/python.el            | 53 +++++++++++++++++++-
 test/lisp/progmodes/python-tests.el | 77 +++++++++++++++++++++++++++++
 2 files changed, 128 insertions(+), 2 deletions(-)

diff --git a/lisp/progmodes/python.el b/lisp/progmodes/python.el
index b8fc7d4c54..4c387756f7 100644
--- a/lisp/progmodes/python.el
+++ b/lisp/progmodes/python.el
@@ -427,7 +427,17 @@ python-rx
                                  (: "vim:" (* space) "set" (+ space)
                                     "fileencoding" (* space) ?= (* space)
                                     (group-n 1 (+ (or word ?-)))
-                                    (* space) ":")))))
+                                    (* space) ":"))))
+            (bytes-escape-sequence (seq (not "\\")
+                                        (group (or "\\\\" "\\'" "\\a" "\\b" "\\f"
+                                                   "\\n" "\\r" "\\t" "\\v"
+                                                   (seq "\\" (= 3 (in "0-7")))
+                                                   (seq "\\x" hex hex)))))
+            (string-escape-sequence (or bytes-escape-sequence
+                                        (seq (not "\\")
+                                             (or (group-n 1 "\\u" (= 4 hex))
+                                                 (group-n 1 "\\U" (= 8 hex))
+                                                 (group-n 1 "\\N{" (*? anychar) "}"))))))
      (rx ,@regexps)))
 
 \f
@@ -539,6 +549,28 @@ python--font-lock-f-strings
         (goto-char (min limit (1+ send)))
         (setq ppss (syntax-ppss))))))
 
+(defconst python--not-raw-bytes-literal-start-regexp
+  (rx (or bos (not alnum)) (or "b" "B") (or "\"" "\"\"\"" "'" "'''") eos)
+  "A regular expression matching the start of a not-raw bytes literal.")
+
+(defconst python--not-raw-string-literal-start-regexp
+  (rx (or bos (not alnum)) (? (or "u" "U" "F" "f")) (or "\"" "\"\"\"" "'" "'''") eos)
+  "A regular expression matching the start of a not-raw string literal.")
+
+(defun python--string-bytes-literal-matcher (regexp start-regexp)
+  "Match REGEXP within a string or bytes literal whose start matches START-REGEXP."
+  (lambda (limit)
+    (cl-loop for result = (re-search-forward regexp limit t)
+             for result-valid = (and result
+                                     (let* ((pos (nth 8 (syntax-ppss)))
+                                            (before-quote
+                                             (buffer-substring-no-properties
+                                              (max (- pos 5) (point-min))
+                                              (min (+ pos 1) (point-max)))))
+                                       (string-match-p start-regexp before-quote)))
+             until (or (not result) result-valid)
+             finally return (and result-valid result))))
+
 (defvar python-font-lock-keywords-level-1
   `((,(python-rx symbol-start "def" (1+ space) (group symbol-name))
      (1 font-lock-function-name-face))
@@ -716,7 +748,24 @@ python-font-lock-keywords-maximum-decoration
                   grouped-assignment-target (* space)
                   (or ")" "]") (* space)
                   assignment-operator))
-     (1 font-lock-variable-name-face)))
+     (1 font-lock-variable-name-face))
+    ;; escape sequences within bytes literals
+    ;;   "\\" "\'" "\a" "\b" "\f" "\n" "\r" "\t" "\v"
+    ;;   "\ooo" character with octal value ooo
+    ;;   "\xhh" character with hex value hh
+    (,(python--string-bytes-literal-matcher
+       (python-rx bytes-escape-sequence)
+       python--not-raw-bytes-literal-start-regexp)
+     (1 font-lock-constant-face t))
+    ;; escape sequences within string literals, the same as appear in bytes
+    ;; literals in addition to:
+    ;;   "\uxxxx" Character with 16-bit hex value xxxx
+    ;;   "\Uxxxxxxxx" Character with 32-bit hex value xxxxxxxx
+    ;;   "\N{name}" Character named name in the Unicode database
+    (,(python--string-bytes-literal-matcher
+       (python-rx string-escape-sequence)
+       python--not-raw-string-literal-start-regexp)
+     (1 'font-lock-constant-face t)))
   "Font lock keywords to use in `python-mode' for maximum decoration.
 
 This decoration level includes everything in
diff --git a/test/lisp/progmodes/python-tests.el b/test/lisp/progmodes/python-tests.el
index 6f2ad87f81..07f2c4f09a 100644
--- a/test/lisp/progmodes/python-tests.el
+++ b/test/lisp/progmodes/python-tests.el
@@ -380,6 +380,83 @@ python-font-lock-assignment-statement-18
      (128 . font-lock-builtin-face) (131)
      (144 . font-lock-keyword-face) (150))))
 
+(ert-deftest python-font-lock-escape-sequence-string-newline ()
+  (python-tests-assert-faces
+   "'\\n'
+\"\\n\"
+f'\\n'
+f\"\\n\"
+u'\\n'
+u\"\\n\""
+   '((1 . font-lock-doc-face)
+     (2 . font-lock-constant-face)
+     (4 . font-lock-doc-face) (5)
+     (6 . font-lock-doc-face)
+     (7 . font-lock-constant-face)
+     (9 . font-lock-doc-face) (10)
+     (12 . font-lock-string-face)
+     (13 . font-lock-constant-face)
+     (15 . font-lock-string-face) (16)
+     (18 . font-lock-string-face)
+     (19 . font-lock-constant-face)
+     (21 . font-lock-string-face) (22)
+     (24 . font-lock-string-face)
+     (25 . font-lock-constant-face)
+     (27 . font-lock-string-face) (28)
+     (30 . font-lock-string-face)
+     (31 . font-lock-constant-face)
+     (33 . font-lock-string-face))))
+
+(ert-deftest python-font-lock-escape-sequence-bytes-newline ()
+  (python-tests-assert-faces
+   "b'\\n'
+b\"\\n\""
+   '((1)
+     (2 . font-lock-doc-face)
+     (3 . font-lock-constant-face)
+     (5 . font-lock-doc-face) (6)
+     (8 . font-lock-doc-face)
+     (9 . font-lock-constant-face)
+     (11 . font-lock-doc-face))))
+
+(ert-deftest python-font-lock-escape-sequence-hex-octal ()
+  (python-tests-assert-faces
+   "b'\\x12 \\777'
+'\\x12 \\777'"
+   '((1)
+     (2 . font-lock-doc-face)
+     (3 . font-lock-constant-face)
+     (7 . font-lock-doc-face)
+     (8 . font-lock-constant-face)
+     (12 . font-lock-doc-face) (13)
+     (14 . font-lock-doc-face)
+     (15 . font-lock-constant-face)
+     (19 . font-lock-doc-face)
+     (20 . font-lock-constant-face)
+     (24 . font-lock-doc-face))))
+
+(ert-deftest python-font-lock-escape-sequence-unicode ()
+  (python-tests-assert-faces
+   "b'\\u1234 \\U00010348 \\N{Plus-Minus Sign}'
+'\\u1234 \\U00010348 \\N{Plus-Minus Sign}'"
+   '((1)
+     (2 . font-lock-doc-face) (41)
+     (42 . font-lock-doc-face)
+     (43 . font-lock-constant-face)
+     (49 . font-lock-doc-face)
+     (50 . font-lock-constant-face)
+     (60 . font-lock-doc-face)
+     (61 . font-lock-constant-face)
+     (80 . font-lock-doc-face))))
+
+(ert-deftest python-font-lock-raw-escape-sequence ()
+  (python-tests-assert-faces
+   "rb'\\x12 \123 \\n'
+r'\\x12 \123 \\n \\u1234 \\U00010348 \\N{Plus-Minus Sign}'"
+   '((1)
+     (3 . font-lock-doc-face) (14)
+     (16 . font-lock-doc-face))))
+
 \f
 ;;; Indentation
 
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* bug#57004: [PATCH] Fontify Escape Sequences in Python String and Byte Literals
  2022-08-05 13:06 bug#57004: [PATCH] Fontify Escape Sequences in Python String and Byte Literals Laurence Warne
@ 2022-08-06 13:02 ` Lars Ingebrigtsen
  2022-08-09  8:42   ` Laurence Warne
  0 siblings, 1 reply; 4+ messages in thread
From: Lars Ingebrigtsen @ 2022-08-06 13:02 UTC (permalink / raw)
  To: Laurence Warne; +Cc: 57004

Laurence Warne <laurencewarne@gmail.com> writes:

> Hi, this patch implements escape code syntax highlighting within string an bytes
> literals for python-mode (described at
> https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals)
> similar to how they are highlighted by https://pygments.org.

Thanks; pushed to Emacs 29.






^ permalink raw reply	[flat|nested] 4+ messages in thread

* bug#57004: [PATCH] Fontify Escape Sequences in Python String and Byte Literals
  2022-08-06 13:02 ` Lars Ingebrigtsen
@ 2022-08-09  8:42   ` Laurence Warne
  2022-08-09 15:30     ` Lars Ingebrigtsen
  0 siblings, 1 reply; 4+ messages in thread
From: Laurence Warne @ 2022-08-09  8:42 UTC (permalink / raw)
  To: Lars Ingebrigtsen; +Cc: 57004


[-- Attachment #1.1: Type: text/plain, Size: 1143 bytes --]

Great, thanks.  I saw a few bugs playing around a bit more:

1) Consecutive escape codes not fontified:
b'\x12\x23'  # Here only "\x12" would be fontified
2) Multi-line bytes literals fontified as string literals:
b'''\x12 \777 \1\23 \u1234'''  # Here "\u1234" would be (incorrectly)
fontified
3) Octal escape codes may be one to three octal digits long instead of always
three (
https://docs.python.org/3/reference/lexical_analysis.html#escape-sequences):
"\1 \12 \123"  # Here only "\123" would be fontified

I've attached a patch which fixes the above and adds a new test.  The new
test (tests different combinations of multi-line literals) makes up the
majority of the diff.

Thanks, Laurence

On Sat, Aug 6, 2022 at 2:02 PM Lars Ingebrigtsen <larsi@gnus.org> wrote:

> Laurence Warne <laurencewarne@gmail.com> writes:
>
> > Hi, this patch implements escape code syntax highlighting within string
> an bytes
> > literals for python-mode (described at
> >
> https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals
> )
> > similar to how they are highlighted by https://pygments.org.
>
> Thanks; pushed to Emacs 29.
>
>

[-- Attachment #1.2: Type: text/html, Size: 2033 bytes --]

[-- Attachment #2: 0001-Fix-python-escape-code-fontification-for-multi-line-.patch --]
[-- Type: text/x-patch, Size: 7904 bytes --]

From 95cf4580d238148070f7e80a2078e169079064ab Mon Sep 17 00:00:00 2001
From: Laurence Warne <laurencewarne@gmail.com>
Date: Tue, 9 Aug 2022 08:33:18 +0100
Subject: [PATCH] Fix python escape code fontification for multi-line literals

* lisp/progmodes/python.el (python--string-bytes-literal-matcher): Go
backward one char after a match so that consecutive escape codes are
highlighted
(python--not-raw-string-literal-start-regexp): Make regular expression
more comprehensive, so multi-line bytes literals are not caught
(python-rx): Accept one to three octal digits in octal escape codes
instead of always three
---
 lisp/progmodes/python.el            | 22 ++++---
 test/lisp/progmodes/python-tests.el | 95 ++++++++++++++++++++++++++---
 2 files changed, 102 insertions(+), 15 deletions(-)

diff --git a/lisp/progmodes/python.el b/lisp/progmodes/python.el
index 5edd6e7df5..96f9d14832 100644
--- a/lisp/progmodes/python.el
+++ b/lisp/progmodes/python.el
@@ -432,7 +432,7 @@ python-rx
              (seq (not "\\")
                   (group (or "\\\\" "\\'" "\\a" "\\b" "\\f"
                              "\\n" "\\r" "\\t" "\\v"
-                             (seq "\\" (= 3 (in "0-7")))
+                             (seq "\\" (** 1 3 (in "0-7")))
                              (seq "\\x" hex hex)))))
             (string-escape-sequence
              (or bytes-escape-sequence
@@ -556,7 +556,14 @@ python--not-raw-bytes-literal-start-regexp
   "A regular expression matching the start of a not-raw bytes literal.")
 
 (defconst python--not-raw-string-literal-start-regexp
-  (rx (or bos (not alnum)) (? (or "u" "U" "F" "f")) (or "\"" "\"\"\"" "'" "'''") eos)
+  (rx bos (or
+           ;; Multi-line string literals
+           (seq (? (? (not alnum)) (or "u" "U" "F" "f")) (or "\"\"\"" "'''"))
+           (seq (? anychar) (not alnum) (or "\"\"\"" "'''"))
+           ;; Single line string literals
+           (seq (? (** 0 2 anychar) (not alnum)) (or "u" "U" "F" "f") (or "'" "\""))
+           (seq (? (** 0 3 anychar) (not (any "'\"" alnum))) (or "'" "\"")))
+      eos)
   "A regular expression matching the start of a not-raw string literal.")
 
 (defun python--string-bytes-literal-matcher (regexp start-regexp)
@@ -565,11 +572,12 @@ python--string-bytes-literal-matcher
     (cl-loop for result = (re-search-forward regexp limit t)
              for result-valid = (and
                                  result
-                                 (let* ((pos (nth 8 (syntax-ppss)))
-                                        (before-quote
-                                         (buffer-substring-no-properties
-                                          (max (- pos 5) (point-min))
-                                          (min (+ pos 1) (point-max)))))
+                                 (when-let* ((pos (nth 8 (syntax-ppss)))
+                                             (before-quote
+                                              (buffer-substring-no-properties
+                                               (max (- pos 4) (point-min))
+                                               (min (+ pos 1) (point-max)))))
+                                   (backward-char)
                                    (string-match-p start-regexp before-quote)))
              until (or (not result) result-valid)
              finally return (and result-valid result))))
diff --git a/test/lisp/progmodes/python-tests.el b/test/lisp/progmodes/python-tests.el
index e3c8d5554a..d303050fad 100644
--- a/test/lisp/progmodes/python-tests.el
+++ b/test/lisp/progmodes/python-tests.el
@@ -407,6 +407,81 @@ python-font-lock-escape-sequence-string-newline
      (31 . font-lock-constant-face)
      (33 . font-lock-string-face))))
 
+(ert-deftest python-font-lock-escape-sequence-multiline-string ()
+  (python-tests-assert-faces
+   (let ((escape-sequences "\\x12 \123 \\n \\u1234 \\U00010348 \\N{Plus-Minus Sign}"))
+     (cl-loop for string-prefix in '("" "f" "rf" "fr" "r" "rb" "br" "b")
+              concat (cl-loop for quote-string in '("\"\"\"" "'''")
+                              concat (concat string-prefix
+                                             quote-string
+                                             escape-sequences
+                                             quote-string
+                                             "\n"))))
+   '((1 . font-lock-doc-face)
+     (4 . font-lock-constant-face)
+     (8 . font-lock-doc-face)
+     (11 . font-lock-constant-face)
+     (13 . font-lock-doc-face)
+     (14 . font-lock-constant-face)
+     (20 . font-lock-doc-face)
+     (21 . font-lock-constant-face)
+     (31 . font-lock-doc-face)
+     (32 . font-lock-constant-face)
+     (51 . font-lock-doc-face) (54)
+     (55 . font-lock-doc-face)
+     (58 . font-lock-constant-face)
+     (62 . font-lock-doc-face)
+     (65 . font-lock-constant-face)
+     (67 . font-lock-doc-face)
+     (68 . font-lock-constant-face)
+     (74 . font-lock-doc-face)
+     (75 . font-lock-constant-face)
+     (85 . font-lock-doc-face)
+     (86 . font-lock-constant-face)
+     (105 . font-lock-doc-face) (108)
+     (110 . font-lock-string-face)
+     (113 . font-lock-constant-face)
+     (117 . font-lock-string-face)
+     (120 . font-lock-constant-face)
+     (122 . font-lock-string-face)
+     (123 . font-lock-constant-face)
+     (129 . font-lock-string-face)
+     (130 . font-lock-constant-face)
+     (140 . font-lock-string-face)
+     (141 . font-lock-constant-face)
+     (160 . font-lock-string-face) (163)
+     (165 . font-lock-string-face)
+     (168 . font-lock-constant-face)
+     (172 . font-lock-string-face)
+     (175 . font-lock-constant-face)
+     (177 . font-lock-string-face)
+     (178 . font-lock-constant-face)
+     (184 . font-lock-string-face)
+     (185 . font-lock-constant-face)
+     (195 . font-lock-string-face)
+     (196 . font-lock-constant-face)
+     (215 . font-lock-string-face) (218)
+     (221 . font-lock-string-face) (274)
+     (277 . font-lock-string-face) (330)
+     (333 . font-lock-string-face) (386)
+     (389 . font-lock-string-face) (442)
+     (444 . font-lock-string-face) (497)
+     (499 . font-lock-string-face) (552)
+     (555 . font-lock-string-face) (608)
+     (611 . font-lock-string-face) (664)
+     (667 . font-lock-string-face) (720)
+     (723 . font-lock-string-face) (776)
+     (778 . font-lock-string-face)
+     (781 . font-lock-constant-face)
+     (785 . font-lock-string-face)
+     (788 . font-lock-constant-face)
+     (790 . font-lock-string-face) (831)
+     (833 . font-lock-string-face)
+     (836 . font-lock-constant-face)
+     (840 . font-lock-string-face)
+     (843 . font-lock-constant-face)
+     (845 . font-lock-string-face) (886))))
+
 (ert-deftest python-font-lock-escape-sequence-bytes-newline ()
   (python-tests-assert-faces
    "b'\\n'
@@ -421,19 +496,23 @@ python-font-lock-escape-sequence-bytes-newline
 
 (ert-deftest python-font-lock-escape-sequence-hex-octal ()
   (python-tests-assert-faces
-   "b'\\x12 \\777'
-'\\x12 \\777'"
+   "b'\\x12 \\777 \\1\\23'
+'\\x12 \\777 \\1\\23'"
    '((1)
      (2 . font-lock-doc-face)
      (3 . font-lock-constant-face)
      (7 . font-lock-doc-face)
      (8 . font-lock-constant-face)
-     (12 . font-lock-doc-face) (13)
-     (14 . font-lock-doc-face)
-     (15 . font-lock-constant-face)
-     (19 . font-lock-doc-face)
-     (20 . font-lock-constant-face)
-     (24 . font-lock-doc-face))))
+     (12 . font-lock-doc-face)
+     (13 . font-lock-constant-face)
+     (18 . font-lock-doc-face) (19)
+     (20 . font-lock-doc-face)
+     (21 . font-lock-constant-face)
+     (25 . font-lock-doc-face)
+     (26 . font-lock-constant-face)
+     (30 . font-lock-doc-face)
+     (31 . font-lock-constant-face)
+     (36 . font-lock-doc-face))))
 
 (ert-deftest python-font-lock-escape-sequence-unicode ()
   (python-tests-assert-faces
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* bug#57004: [PATCH] Fontify Escape Sequences in Python String and Byte Literals
  2022-08-09  8:42   ` Laurence Warne
@ 2022-08-09 15:30     ` Lars Ingebrigtsen
  0 siblings, 0 replies; 4+ messages in thread
From: Lars Ingebrigtsen @ 2022-08-09 15:30 UTC (permalink / raw)
  To: Laurence Warne; +Cc: 57004

Laurence Warne <laurencewarne@gmail.com> writes:

> I've attached a patch which fixes the above and adds a new test.  The
> new test (tests different combinations of multi-line literals) makes
> up the majority of the diff.

Thanks; pushed to Emacs 29.






^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2022-08-09 15:30 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-08-05 13:06 bug#57004: [PATCH] Fontify Escape Sequences in Python String and Byte Literals Laurence Warne
2022-08-06 13:02 ` Lars Ingebrigtsen
2022-08-09  8:42   ` Laurence Warne
2022-08-09 15:30     ` Lars Ingebrigtsen

Code repositories for project(s) associated with this public inbox

	https://git.savannah.gnu.org/cgit/emacs.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).