[-- Attachment #1.1: Type: text/plain, Size: 796 bytes --] Hi, this patch implements escape code syntax highlighting within string an bytes literals for python-mode (described at https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals) similar to how they are highlighted by https://pygments.org. I've written a few tests (which should probably be expanded on (: ), an example based on one of the test samples: b'example 1: \n \\ \u1234 \U00010348 \N{Plus-Minus Sign}' 'example 2: \n \\ \u1234 \U00010348 \N{Plus-Minus Sign}' e.g. in the above we see "\n", "\\" highlighted on the first line, but not "\u1234", "\U00010348" "\N{Plus-Minus Sign}" since these items aren't valid escape sequences in byte literals. The second line however is a string literal, so all of the preceding items are highlighted. Thanks, Laurence [-- Attachment #1.2: Type: text/html, Size: 1152 bytes --] [-- Attachment #2: 0001-Fontify-python-escape-sequences-in-literals.patch --] [-- Type: text/x-patch, Size: 7571 bytes --] From 18584736ab6aa4802accf68a0f5ca8d12666b891 Mon Sep 17 00:00:00 2001 From: lWarne <laurencewarne@gmail.com> Date: Thu, 4 Aug 2022 16:13:23 +0100 Subject: [PATCH] Fontify python escape sequences in literals * lisp/progmodes/python.el (python-rx): Add regular expressions matching escape codes in string and byte literals (python--string-bytes-literal-matcher): new function (python--not-raw-bytes-literal-start-regexp): new constant (python--not-raw-string-literal-start-regexp): new constant * test/lisp/progmodes/python-tests.el: Add tests for new fontification --- lisp/progmodes/python.el | 53 +++++++++++++++++++- test/lisp/progmodes/python-tests.el | 77 +++++++++++++++++++++++++++++ 2 files changed, 128 insertions(+), 2 deletions(-) diff --git a/lisp/progmodes/python.el b/lisp/progmodes/python.el index b8fc7d4c54..4c387756f7 100644 --- a/lisp/progmodes/python.el +++ b/lisp/progmodes/python.el @@ -427,7 +427,17 @@ python-rx (: "vim:" (* space) "set" (+ 
space) "fileencoding" (* space) ?= (* space) (group-n 1 (+ (or word ?-))) - (* space) ":"))))) + (* space) ":")))) + (bytes-escape-sequence (seq (not "\\") + (group (or "\\\\" "\\'" "\\a" "\\b" "\\f" + "\\n" "\\r" "\\t" "\\v" + (seq "\\" (= 3 (in "0-7"))) + (seq "\\x" hex hex))))) + (string-escape-sequence (or bytes-escape-sequence + (seq (not "\\") + (or (group-n 1 "\\u" (= 4 hex)) + (group-n 1 "\\U" (= 8 hex)) + (group-n 1 "\\N{" (*? anychar) "}")))))) (rx ,@regexps))) \f @@ -539,6 +549,28 @@ python--font-lock-f-strings (goto-char (min limit (1+ send))) (setq ppss (syntax-ppss)))))) +(defconst python--not-raw-bytes-literal-start-regexp + (rx (or bos (not alnum)) (or "b" "B") (or "\"" "\"\"\"" "'" "'''") eos) + "A regular expression matching the start of a not-raw bytes literal.") + +(defconst python--not-raw-string-literal-start-regexp + (rx (or bos (not alnum)) (? (or "u" "U" "F" "f")) (or "\"" "\"\"\"" "'" "'''") eos) + "A regular expression matching the start of a not-raw string literal.") + +(defun python--string-bytes-literal-matcher (regexp start-regexp) + "Match REGEXP within a string or bytes literal whose start matches START-REGEXP." 
+ (lambda (limit) + (cl-loop for result = (re-search-forward regexp limit t) + for result-valid = (and result + (let* ((pos (nth 8 (syntax-ppss))) + (before-quote + (buffer-substring-no-properties + (max (- pos 5) (point-min)) + (min (+ pos 1) (point-max))))) + (string-match-p start-regexp before-quote))) + until (or (not result) result-valid) + finally return (and result-valid result)))) + (defvar python-font-lock-keywords-level-1 `((,(python-rx symbol-start "def" (1+ space) (group symbol-name)) (1 font-lock-function-name-face)) @@ -716,7 +748,24 @@ python-font-lock-keywords-maximum-decoration grouped-assignment-target (* space) (or ")" "]") (* space) assignment-operator)) - (1 font-lock-variable-name-face))) + (1 font-lock-variable-name-face)) + ;; escape sequences within bytes literals + ;; "\\" "\'" "\a" "\b" "\f" "\n" "\r" "\t" "\v" + ;; "\ooo" character with octal value ooo + ;; "\xhh" character with hex value hh + (,(python--string-bytes-literal-matcher + (python-rx bytes-escape-sequence) + python--not-raw-bytes-literal-start-regexp) + (1 font-lock-constant-face t)) + ;; escape sequences within string literals, the same as appear in bytes + ;; literals in addition to: + ;; "\uxxxx" Character with 16-bit hex value xxxx + ;; "\Uxxxxxxxx" Character with 32-bit hex value xxxxxxxx + ;; "\N{name}" Character named name in the Unicode database + (,(python--string-bytes-literal-matcher + (python-rx string-escape-sequence) + python--not-raw-string-literal-start-regexp) + (1 'font-lock-constant-face t))) "Font lock keywords to use in `python-mode' for maximum decoration. This decoration level includes everything in diff --git a/test/lisp/progmodes/python-tests.el b/test/lisp/progmodes/python-tests.el index 6f2ad87f81..07f2c4f09a 100644 --- a/test/lisp/progmodes/python-tests.el +++ b/test/lisp/progmodes/python-tests.el @@ -380,6 +380,83 @@ python-font-lock-assignment-statement-18 (128 . font-lock-builtin-face) (131) (144 . 
font-lock-keyword-face) (150)))) +(ert-deftest python-font-lock-escape-sequence-string-newline () + (python-tests-assert-faces + "'\\n' +\"\\n\" +f'\\n' +f\"\\n\" +u'\\n' +u\"\\n\"" + '((1 . font-lock-doc-face) + (2 . font-lock-constant-face) + (4 . font-lock-doc-face) (5) + (6 . font-lock-doc-face) + (7 . font-lock-constant-face) + (9 . font-lock-doc-face) (10) + (12 . font-lock-string-face) + (13 . font-lock-constant-face) + (15 . font-lock-string-face) (16) + (18 . font-lock-string-face) + (19 . font-lock-constant-face) + (21 . font-lock-string-face) (22) + (24 . font-lock-string-face) + (25 . font-lock-constant-face) + (27 . font-lock-string-face) (28) + (30 . font-lock-string-face) + (31 . font-lock-constant-face) + (33 . font-lock-string-face)))) + +(ert-deftest python-font-lock-escape-sequence-bytes-newline () + (python-tests-assert-faces + "b'\\n' +b\"\\n\"" + '((1) + (2 . font-lock-doc-face) + (3 . font-lock-constant-face) + (5 . font-lock-doc-face) (6) + (8 . font-lock-doc-face) + (9 . font-lock-constant-face) + (11 . font-lock-doc-face)))) + +(ert-deftest python-font-lock-escape-sequence-hex-octal () + (python-tests-assert-faces + "b'\\x12 \\777' +'\\x12 \\777'" + '((1) + (2 . font-lock-doc-face) + (3 . font-lock-constant-face) + (7 . font-lock-doc-face) + (8 . font-lock-constant-face) + (12 . font-lock-doc-face) (13) + (14 . font-lock-doc-face) + (15 . font-lock-constant-face) + (19 . font-lock-doc-face) + (20 . font-lock-constant-face) + (24 . font-lock-doc-face)))) + +(ert-deftest python-font-lock-escape-sequence-unicode () + (python-tests-assert-faces + "b'\\u1234 \\U00010348 \\N{Plus-Minus Sign}' +'\\u1234 \\U00010348 \\N{Plus-Minus Sign}'" + '((1) + (2 . font-lock-doc-face) (41) + (42 . font-lock-doc-face) + (43 . font-lock-constant-face) + (49 . font-lock-doc-face) + (50 . font-lock-constant-face) + (60 . font-lock-doc-face) + (61 . font-lock-constant-face) + (80 . 
font-lock-doc-face)))) + +(ert-deftest python-font-lock-raw-escape-sequence () + (python-tests-assert-faces + "rb'\\x12 \123 \\n' +r'\\x12 \123 \\n \\u1234 \\U00010348 \\N{Plus-Minus Sign}'" + '((1) + (3 . font-lock-doc-face) (14) + (16 . font-lock-doc-face)))) + \f ;;; Indentation -- 2.30.2
Laurence Warne <laurencewarne@gmail.com> writes:
> Hi, this patch implements escape code syntax highlighting within string and bytes
> literals for python-mode (described at
> https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals)
> similar to how they are highlighted by https://pygments.org.
Thanks; pushed to Emacs 29.
[-- Attachment #1.1: Type: text/plain, Size: 1143 bytes --] Great, thanks. I saw a few bugs playing around a bit more: 1) Consecutive escape codes not fontified: b'\x12\x23' # Here only "\x12" would be fontified 2) Multi-line bytes literals fontified as string literals: b'''\x12 \777 \1\23 \u1234''' # Here "\u1234" would be (incorrectly) fontified 3) Octal escape codes may be one to three digits long instead of always three ( https://docs.python.org/3/reference/lexical_analysis.html#escape-sequences): "\1 \12 \123" # Here only "\123" would be fontified I've attached a patch which fixes the above and adds a new test. The new test (which covers different combinations of multi-line literals) makes up the majority of the diff. Thanks, Laurence On Sat, Aug 6, 2022 at 2:02 PM Lars Ingebrigtsen <larsi@gnus.org> wrote: > Laurence Warne <laurencewarne@gmail.com> writes: > > > Hi, this patch implements escape code syntax highlighting within string > and bytes > > literals for python-mode (described at > > > https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals > ) > > similar to how they are highlighted by https://pygments.org. > > Thanks; pushed to Emacs 29.
> > [-- Attachment #1.2: Type: text/html, Size: 2033 bytes --] [-- Attachment #2: 0001-Fix-python-escape-code-fontification-for-multi-line-.patch --] [-- Type: text/x-patch, Size: 7904 bytes --] From 95cf4580d238148070f7e80a2078e169079064ab Mon Sep 17 00:00:00 2001 From: Laurence Warne <laurencewarne@gmail.com> Date: Tue, 9 Aug 2022 08:33:18 +0100 Subject: [PATCH] Fix python escape code fontification for multi-line literals * lisp/progmodes/python.el (python--string-bytes-literal-matcher): Go backward one char after a match so that consecutive escape codes are highlighted (python--not-raw-string-literal-start-regexp): Make regular expression more comprehensive, so multi-line bytes literals are not caught (python-rx): Accept one to three octal digits in octal escape codes instead of always three --- lisp/progmodes/python.el | 22 ++++--- test/lisp/progmodes/python-tests.el | 95 ++++++++++++++++++++++++++--- 2 files changed, 102 insertions(+), 15 deletions(-) diff --git a/lisp/progmodes/python.el b/lisp/progmodes/python.el index 5edd6e7df5..96f9d14832 100644 --- a/lisp/progmodes/python.el +++ b/lisp/progmodes/python.el @@ -432,7 +432,7 @@ python-rx (seq (not "\\") (group (or "\\\\" "\\'" "\\a" "\\b" "\\f" "\\n" "\\r" "\\t" "\\v" - (seq "\\" (= 3 (in "0-7"))) + (seq "\\" (** 1 3 (in "0-7"))) (seq "\\x" hex hex))))) (string-escape-sequence (or bytes-escape-sequence @@ -556,7 +556,14 @@ python--not-raw-bytes-literal-start-regexp "A regular expression matching the start of a not-raw bytes literal.") (defconst python--not-raw-string-literal-start-regexp - (rx (or bos (not alnum)) (? (or "u" "U" "F" "f")) (or "\"" "\"\"\"" "'" "'''") eos) + (rx bos (or + ;; Multi-line string literals + (seq (? (? (not alnum)) (or "u" "U" "F" "f")) (or "\"\"\"" "'''")) + (seq (? anychar) (not alnum) (or "\"\"\"" "'''")) + ;; Single line string literals + (seq (? (** 0 2 anychar) (not alnum)) (or "u" "U" "F" "f") (or "'" "\"")) + (seq (? 
(** 0 3 anychar) (not (any "'\"" alnum))) (or "'" "\""))) + eos) "A regular expression matching the start of a not-raw string literal.") (defun python--string-bytes-literal-matcher (regexp start-regexp) @@ -565,11 +572,12 @@ python--string-bytes-literal-matcher (cl-loop for result = (re-search-forward regexp limit t) for result-valid = (and result - (let* ((pos (nth 8 (syntax-ppss))) - (before-quote - (buffer-substring-no-properties - (max (- pos 5) (point-min)) - (min (+ pos 1) (point-max))))) + (when-let* ((pos (nth 8 (syntax-ppss))) + (before-quote + (buffer-substring-no-properties + (max (- pos 4) (point-min)) + (min (+ pos 1) (point-max))))) + (backward-char) (string-match-p start-regexp before-quote))) until (or (not result) result-valid) finally return (and result-valid result)))) diff --git a/test/lisp/progmodes/python-tests.el b/test/lisp/progmodes/python-tests.el index e3c8d5554a..d303050fad 100644 --- a/test/lisp/progmodes/python-tests.el +++ b/test/lisp/progmodes/python-tests.el @@ -407,6 +407,81 @@ python-font-lock-escape-sequence-string-newline (31 . font-lock-constant-face) (33 . font-lock-string-face)))) +(ert-deftest python-font-lock-escape-sequence-multiline-string () + (python-tests-assert-faces + (let ((escape-sequences "\\x12 \123 \\n \\u1234 \\U00010348 \\N{Plus-Minus Sign}")) + (cl-loop for string-prefix in '("" "f" "rf" "fr" "r" "rb" "br" "b") + concat (cl-loop for quote-string in '("\"\"\"" "'''") + concat (concat string-prefix + quote-string + escape-sequences + quote-string + "\n")))) + '((1 . font-lock-doc-face) + (4 . font-lock-constant-face) + (8 . font-lock-doc-face) + (11 . font-lock-constant-face) + (13 . font-lock-doc-face) + (14 . font-lock-constant-face) + (20 . font-lock-doc-face) + (21 . font-lock-constant-face) + (31 . font-lock-doc-face) + (32 . font-lock-constant-face) + (51 . font-lock-doc-face) (54) + (55 . font-lock-doc-face) + (58 . font-lock-constant-face) + (62 . font-lock-doc-face) + (65 . 
font-lock-constant-face) + (67 . font-lock-doc-face) + (68 . font-lock-constant-face) + (74 . font-lock-doc-face) + (75 . font-lock-constant-face) + (85 . font-lock-doc-face) + (86 . font-lock-constant-face) + (105 . font-lock-doc-face) (108) + (110 . font-lock-string-face) + (113 . font-lock-constant-face) + (117 . font-lock-string-face) + (120 . font-lock-constant-face) + (122 . font-lock-string-face) + (123 . font-lock-constant-face) + (129 . font-lock-string-face) + (130 . font-lock-constant-face) + (140 . font-lock-string-face) + (141 . font-lock-constant-face) + (160 . font-lock-string-face) (163) + (165 . font-lock-string-face) + (168 . font-lock-constant-face) + (172 . font-lock-string-face) + (175 . font-lock-constant-face) + (177 . font-lock-string-face) + (178 . font-lock-constant-face) + (184 . font-lock-string-face) + (185 . font-lock-constant-face) + (195 . font-lock-string-face) + (196 . font-lock-constant-face) + (215 . font-lock-string-face) (218) + (221 . font-lock-string-face) (274) + (277 . font-lock-string-face) (330) + (333 . font-lock-string-face) (386) + (389 . font-lock-string-face) (442) + (444 . font-lock-string-face) (497) + (499 . font-lock-string-face) (552) + (555 . font-lock-string-face) (608) + (611 . font-lock-string-face) (664) + (667 . font-lock-string-face) (720) + (723 . font-lock-string-face) (776) + (778 . font-lock-string-face) + (781 . font-lock-constant-face) + (785 . font-lock-string-face) + (788 . font-lock-constant-face) + (790 . font-lock-string-face) (831) + (833 . font-lock-string-face) + (836 . font-lock-constant-face) + (840 . font-lock-string-face) + (843 . font-lock-constant-face) + (845 . 
font-lock-string-face) (886)))) + (ert-deftest python-font-lock-escape-sequence-bytes-newline () (python-tests-assert-faces "b'\\n' @@ -421,19 +496,23 @@ python-font-lock-escape-sequence-bytes-newline (ert-deftest python-font-lock-escape-sequence-hex-octal () (python-tests-assert-faces - "b'\\x12 \\777' -'\\x12 \\777'" + "b'\\x12 \\777 \\1\\23' +'\\x12 \\777 \\1\\23'" '((1) (2 . font-lock-doc-face) (3 . font-lock-constant-face) (7 . font-lock-doc-face) (8 . font-lock-constant-face) - (12 . font-lock-doc-face) (13) - (14 . font-lock-doc-face) - (15 . font-lock-constant-face) - (19 . font-lock-doc-face) - (20 . font-lock-constant-face) - (24 . font-lock-doc-face)))) + (12 . font-lock-doc-face) + (13 . font-lock-constant-face) + (18 . font-lock-doc-face) (19) + (20 . font-lock-doc-face) + (21 . font-lock-constant-face) + (25 . font-lock-doc-face) + (26 . font-lock-constant-face) + (30 . font-lock-doc-face) + (31 . font-lock-constant-face) + (36 . font-lock-doc-face)))) (ert-deftest python-font-lock-escape-sequence-unicode () (python-tests-assert-faces -- 2.30.2
Laurence Warne <laurencewarne@gmail.com> writes:
> I've attached a patch which fixes the above and adds a new test. The
> new test (tests different combinations of multi-line literals) makes
> up the majority of the diff.
Thanks; pushed to Emacs 29.