From 7941589941fe7a0fb6f71b8681ccb1d976946912 Mon Sep 17 00:00:00 2001 From: Ekaitz Zarraga Date: Fri, 11 Oct 2024 14:24:30 +0200 Subject: [PATCH v4 2/2] PEG: Add support for `not-in-range` and [^...] Modern PEG supports inversed class like `[^a-z]` that would get any character not in the `a-z` range. This commit adds support for that and also for a new `not-in-range` PEG pattern for scheme. * module/ice-9/peg/codegen.scm (cg-not-in-range): New function. * module/ice-9/peg/string-peg.scm: Add support for `[^...]` * test-suite/tests/peg.test: Test it. * doc/ref/api-peg.texi: Document accordingly. --- NEWS | 3 ++- doc/ref/api-peg.texi | 8 +++++++ module/ice-9/peg/codegen.scm | 22 +++++++++++++++++++ module/ice-9/peg/string-peg.scm | 38 ++++++++++++++++++++++++++++++--- test-suite/tests/peg.test | 6 +++++- 5 files changed, 72 insertions(+), 5 deletions(-) diff --git a/NEWS b/NEWS index df43f3754..17ef560b1 100644 --- a/NEWS +++ b/NEWS @@ -32,7 +32,8 @@ Changes in 3.0.11 (since 3.0.10) ** PEG parser PEG parser has been rewritten to cover all the functionality defined in -. +. Also added the `not-in-range` pattern +to `(ice-9 peg)` that is also available from PEG strings via `[^...]`. Changes in 3.0.10 (since 3.0.9) diff --git a/doc/ref/api-peg.texi b/doc/ref/api-peg.texi index 84a9e6c6b..edb090b20 100644 --- a/doc/ref/api-peg.texi +++ b/doc/ref/api-peg.texi @@ -147,6 +147,14 @@ Parses any character falling between @var{a} and @var{z}. @code{(range #\a #\z)} @end deftp +@deftp {PEG Pattern} {inverse range of characters} a z +Parses any character not falling between @var{a} and @var{z}. + +@code{"[^a-z]"} + +@code{(not-in-range #\a #\z)} +@end deftp + Example: @example diff --git a/module/ice-9/peg/codegen.scm b/module/ice-9/peg/codegen.scm index d80c3e849..82367ef55 100644 --- a/module/ice-9/peg/codegen.scm +++ b/module/ice-9/peg/codegen.scm @@ -140,6 +140,27 @@ return EXP." ((none) #`(list (1+ pos) '())) (else (error "bad accum" accum)))))))))) +;; Generates code for matching a range of characters not between start and end. +;; E.g.: (cg-not-in-range syntax #\a #\z 'body) +(define (cg-not-in-range pat accum) + (syntax-case pat () + ((start end) + (if (not (and (char? (syntax->datum #'start)) + (char? (syntax->datum #'end)))) + (error "range PEG should have characters after it; instead got" + #'start #'end)) + #`(lambda (str len pos) + (and (< pos len) + (let ((c (string-ref str pos))) + (and (or (char? c end)) + #,(case accum + ((all) #`(list (1+ pos) + (list 'cg-not-in-range (string c)))) + ((name) #`(list (1+ pos) 'cg-not-in-range)) + ((body) #`(list (1+ pos) (string c))) + ((none) #`(list (1+ pos) '())) + (else (error "bad accum" accum)))))))))) + ;; Generate code to match a pattern and do nothing with the result (define (cg-ignore pat accum) (syntax-case pat () @@ -304,6 +325,7 @@ return EXP." (assq-set! peg-compiler-alist symbol function))) (add-peg-compiler! 'range cg-range) +(add-peg-compiler! 'not-in-range cg-not-in-range) (add-peg-compiler! 'ignore cg-ignore) (add-peg-compiler! 'capture cg-capture) (add-peg-compiler! 'and cg-and) diff --git a/module/ice-9/peg/string-peg.scm b/module/ice-9/peg/string-peg.scm index 4b923220a..0026f8930 100644 --- a/module/ice-9/peg/string-peg.scm +++ b/module/ice-9/peg/string-peg.scm @@ -54,7 +54,7 @@ Prefix <-- (AND / NOT)? Suffix Suffix <-- Primary (QUESTION / STAR / PLUS)? Primary <-- Identifier !LEFTARROW / OPEN Expression CLOSE - / Literal / Class / DOT + / Literal / Class / NotInClass / DOT # Lexical syntax Identifier <-- IdentStart IdentCont* Spacing @@ -64,6 +64,7 @@ IdentCont <- IdentStart / [0-9] Literal <-- SQUOTE (!SQUOTE Char)* SQUOTE Spacing / DQUOTE (!DQUOTE Char)* DQUOTE Spacing +NotInClass <-- OPENBRACKET NOTIN (!CLOSEBRACKET Range)* CLOSEBRACKET Spacing Class <-- OPENBRACKET !NOTIN (!CLOSEBRACKET Range)* CLOSEBRACKET Spacing Range <-- Char DASH Char / Char Char <-- '\\\\' [nrt'\"\\[\\]\\\\] @@ -78,6 +79,7 @@ DQUOTE < [\"] DASH < '-' OPENBRACKET < '[' CLOSEBRACKET < ']' +NOTIN < '^' SLASH < '/' Spacing AND <-- '&' Spacing NOT <-- '!' Spacing @@ -122,6 +124,7 @@ EndOfFile < !. (and OPEN Expression CLOSE) Literal Class + NotInClass DOT)) (define-sexp-parser Identifier all (and IdentStart (* IdentCont) Spacing)) @@ -133,7 +136,11 @@ EndOfFile < !. (or (and SQUOTE (* (and (not-followed-by SQUOTE) Char)) SQUOTE Spacing) (and DQUOTE (* (and (not-followed-by DQUOTE) Char)) DQUOTE Spacing))) (define-sexp-parser Class all - (and OPENBRACKET (* (and (not-followed-by CLOSEBRACKET) Range)) CLOSEBRACKET Spacing)) + (and OPENBRACKET (not-followed-by NOTIN) + (* (and (not-followed-by CLOSEBRACKET) Range)) CLOSEBRACKET Spacing)) +(define-sexp-parser NotInClass all + (and OPENBRACKET NOTIN + (* (and (not-followed-by CLOSEBRACKET) Range)) CLOSEBRACKET Spacing)) (define-sexp-parser Range all (or (and Char DASH Char) Char)) (define-sexp-parser Char all @@ -143,6 +150,8 @@ EndOfFile < !. (and (not-followed-by "\\") peg-any))) (define-sexp-parser LEFTARROW body (and (or "<--" "<-" "<") Spacing)) ; NOTE: <-- and < are extensions +(define-sexp-parser NOTIN none + (and "^")) (define-sexp-parser SLASH none (and "/" Spacing)) (define-sexp-parser AND all @@ -279,6 +288,7 @@ EndOfFile < !. ('Identifier (Identifier->defn value for-syntax)) ('Expression (Expression->defn value for-syntax)) ('Literal (Literal->defn value for-syntax)) + ('NotInClass (NotInClass->defn value for-syntax)) ('Class (Class->defn value for-syntax))))) ;; (Identifier "hello") @@ -291,13 +301,35 @@ EndOfFile < !. (define (Literal->defn lst for-syntax) (apply string (map (lambda (x) (Char->defn x for-syntax)) (cdr lst)))) -;; TODO: empty Class can happen: `[]`, but what does it represent? +;; (NotInClass ...) +;; `-> (and ...) +(define (NotInClass->defn lst for-syntax) + #`(and #,@(map (lambda (x) (NotInRange->defn x for-syntax)) + (cdr lst)))) + ;; (Class ...) ;; `-> (or ...) (define (Class->defn lst for-syntax) #`(or #,@(map (lambda (x) (Range->defn x for-syntax)) (cdr lst)))) +;; NOTE: It's coming from NotInClass. +;; For one character: +;; (Range (Char "a")) +;; `-> (not-in-range #\a #\a) +;; Or for a range: +;; (Range (Char "a") (Char "b")) +;; `-> (not-in-range #\a #\b) +(define (NotInRange->defn lst for-syntax) + (match lst + (('Range c) + (let ((ch (Char->defn c for-syntax))) + #`(not-in-range #,ch #,ch))) + (('Range range-beginning range-end) + #`(not-in-range + #,(Char->defn range-beginning for-syntax) + #,(Char->defn range-end for-syntax))))) + ;; For one character: ;; (Range (Char "a")) ;; `-> "a" diff --git a/test-suite/tests/peg.test b/test-suite/tests/peg.test index 1136c03f1..d9e3e1b22 100644 --- a/test-suite/tests/peg.test +++ b/test-suite/tests/peg.test @@ -38,6 +38,7 @@ (Identifier Identifier) (Literal Literal) (Class Class) + (NotInClass NotInClass) (Range Range) (Char Char) (LEFTARROW LEFTARROW) @@ -85,7 +86,7 @@ End <-- '*)' C <- Begin N* End N <- C / (!Begin !End Z) -Z <- .") +Z <- [^X-Z]") ;; Forbid some characters to test not-in-range ;; A short /etc/passwd file. (define *etc-passwd* @@ -125,6 +126,9 @@ SLASH < '/'") (match-pattern C "(*blah*)") (make-prec 0 8 "(*blah*)" '((Begin "(*") "blah" (End "*)"))))) + (pass-if + "simple comment with forbidden char" + (not (match-pattern C "(*blYh*)"))) (pass-if "simple comment padded" (equal? -- 2.46.0