From 64a17be08581465d11185b4a0ca636354d2f944c Mon Sep 17 00:00:00 2001 From: Ekaitz Zarraga Date: Fri, 11 Oct 2024 14:24:30 +0200 Subject: [PATCH v3 2/2] PEG: Add support for `not-in-range` and [^...] Modern PEG supports inversed class like `[^a-z]` that would get any character not in the `a-z` range. This commit adds support for that and also for a new `not-in-range` PEG pattern for scheme. * module/ice-9/peg/codegen.scm (cg-not-in-range): New function. * module/ice-9/peg/string-peg.scm: Add support for `[^...]` * test-suite/tests/peg.test: Test it. * doc/ref/api-peg.texi: Document accordingly. --- doc/ref/api-peg.texi | 8 +++++++ module/ice-9/peg/codegen.scm | 22 +++++++++++++++++++ module/ice-9/peg/string-peg.scm | 39 +++++++++++++++++++++++++++++---- test-suite/tests/peg.test | 6 ++++- 4 files changed, 70 insertions(+), 5 deletions(-) diff --git a/doc/ref/api-peg.texi b/doc/ref/api-peg.texi index 84a9e6c6b..edb090b20 100644 --- a/doc/ref/api-peg.texi +++ b/doc/ref/api-peg.texi @@ -147,6 +147,14 @@ Parses any character falling between @var{a} and @var{z}. @code{(range #\a #\z)} @end deftp +@deftp {PEG Pattern} {inverse range of characters} a z +Parses any character not falling between @var{a} and @var{z}. + +@code{"[^a-z]"} + +@code{(not-in-range #\a #\z)} +@end deftp + Example: @example diff --git a/module/ice-9/peg/codegen.scm b/module/ice-9/peg/codegen.scm index d80c3e849..82367ef55 100644 --- a/module/ice-9/peg/codegen.scm +++ b/module/ice-9/peg/codegen.scm @@ -140,6 +140,27 @@ return EXP." ((none) #`(list (1+ pos) '())) (else (error "bad accum" accum)))))))))) +;; Generates code for matching a range of characters not between start and end. +;; E.g.: (cg-not-in-range syntax #\a #\z 'body) +(define (cg-not-in-range pat accum) + (syntax-case pat () + ((start end) + (if (not (and (char? (syntax->datum #'start)) + (char? (syntax->datum #'end)))) + (error "range PEG should have characters after it; instead got" + #'start #'end)) + #`(lambda (str len pos) + (and (< pos len) + (let ((c (string-ref str pos))) + (and (or (char? c end)) + #,(case accum + ((all) #`(list (1+ pos) + (list 'cg-not-in-range (string c)))) + ((name) #`(list (1+ pos) 'cg-not-in-range)) + ((body) #`(list (1+ pos) (string c))) + ((none) #`(list (1+ pos) '())) + (else (error "bad accum" accum)))))))))) + ;; Generate code to match a pattern and do nothing with the result (define (cg-ignore pat accum) (syntax-case pat () @@ -304,6 +325,7 @@ return EXP." (assq-set! peg-compiler-alist symbol function))) (add-peg-compiler! 'range cg-range) +(add-peg-compiler! 'not-in-range cg-not-in-range) (add-peg-compiler! 'ignore cg-ignore) (add-peg-compiler! 'capture cg-capture) (add-peg-compiler! 'and cg-and) diff --git a/module/ice-9/peg/string-peg.scm b/module/ice-9/peg/string-peg.scm index f688653ef..2d2f972ff 100644 --- a/module/ice-9/peg/string-peg.scm +++ b/module/ice-9/peg/string-peg.scm @@ -53,7 +53,7 @@ Prefix <-- (AND / NOT)? Suffix Suffix <-- Primary (QUESTION / STAR / PLUS)? Primary <-- Identifier !LEFTARROW / OPEN Expression CLOSE - / Literal / Class / DOT + / Literal / Class / NotInClass / DOT # Lexical syntax Identifier <-- IdentStart IdentCont* Spacing @@ -63,7 +63,8 @@ IdentCont <- IdentStart / [0-9] Literal <-- SQUOTE (!SQUOTE Char)* SQUOTE Spacing / DQUOTE (!DQUOTE Char)* DQUOTE Spacing -Class <-- '[' (!']' Range)* ']' Spacing +NotInClass <-- '[' NOTIN (!']' Range)* ']' Spacing +Class <-- '[' !NOTIN (!']' Range)* ']' Spacing Range <-- Char '-' Char / Char Char <-- '\\\\' [nrt'\"\\[\\]\\\\] / '\\\\' [0-7][0-7][0-7] @@ -76,6 +77,7 @@ SQUOTE <-- ['] DQUOTE <-- [\"] OPENBRACKET < '[' CLOSEBRACKET < ']' +NOTIN < '^' SLASH < '/' Spacing AND <-- '&' Spacing NOT <-- '!' Spacing @@ -120,6 +122,7 @@ EndOfFile < !. (and OPEN Expression CLOSE) Literal Class + NotInClass DOT)) (define-sexp-parser Identifier all (and IdentStart (* IdentCont) Spacing)) @@ -131,7 +134,11 @@ EndOfFile < !. (or (and SQUOTE (* (and (not-followed-by SQUOTE) Char)) SQUOTE Spacing) (and DQUOTE (* (and (not-followed-by DQUOTE) Char)) DQUOTE Spacing))) (define-sexp-parser Class all - (and OPENBRACKET (* (and (not-followed-by CLOSEBRACKET) Range)) CLOSEBRACKET Spacing)) + (and OPENBRACKET (not-followed-by NOTIN) + (* (and (not-followed-by CLOSEBRACKET) Range)) CLOSEBRACKET Spacing)) +(define-sexp-parser NotInClass all + (and OPENBRACKET NOTIN + (* (and (not-followed-by CLOSEBRACKET) Range)) CLOSEBRACKET Spacing)) (define-sexp-parser Range all (or (and Char DASH Char) Char)) (define-sexp-parser Char all @@ -141,6 +148,8 @@ EndOfFile < !. (and (not-followed-by "\\") peg-any))) (define-sexp-parser LEFTARROW body (and (or "<--" "<-" "<") Spacing)) ; NOTE: <-- and < are extensions +(define-sexp-parser NOTIN none + (and "^")) (define-sexp-parser SLASH none (and "/" Spacing)) (define-sexp-parser AND all @@ -275,6 +284,7 @@ EndOfFile < !. ('Identifier (Identifier->defn value for-syntax)) ('Expression (Expression->defn value for-syntax)) ('Literal (Literal->defn value for-syntax)) + ('NotInClass (NotInClass->defn value for-syntax)) ('Class (Class->defn value for-syntax))))) ;; (Identifier "hello") @@ -287,13 +297,34 @@ EndOfFile < !. (define (Literal->defn lst for-syntax) (apply string (map (lambda (x) (Char->defn x for-syntax)) (cdr lst)))) -;; TODO: empty Class can happen: `[]`, but what does it represent? +;; (NotInClass ...) +;; `-> (and ...) +(define (NotInClass->defn lst for-syntax) + #`(and #,@(map (lambda (x) (NotInRange->defn x for-syntax)) + (cdr lst)))) + ;; (Class ...) ;; `-> (or ...) (define (Class->defn lst for-syntax) #`(or #,@(map (lambda (x) (Range->defn x for-syntax)) (cdr lst)))) +;; For one character: +;; (NotInRange (Char "a")) +;; `-> (not-in-range #\a #\a) +;; Or for a range: +;; (NotInRange (Char "a") (Char "b")) +;; `-> (not-in-range #\a #\b) +(define (NotInRange->defn lst for-syntax) + (cond + ((= 2 (length lst)) + (let ((ch (Char->defn (second lst) for-syntax))) + #`(not-in-range #,ch #,ch))) + ((= 3 (length lst)) + #`(not-in-range + #,(Char->defn (second lst) for-syntax) + #,(Char->defn (third lst) for-syntax))))) + ;; For one character: ;; (Range (Char "a")) ;; `-> "a" diff --git a/test-suite/tests/peg.test b/test-suite/tests/peg.test index 556145e72..965e1c12f 100644 --- a/test-suite/tests/peg.test +++ b/test-suite/tests/peg.test @@ -38,6 +38,7 @@ (Identifier Identifier) (Literal Literal) (Class Class) + (NotInClass NotInClass) (Range Range) (Char Char) (LEFTARROW LEFTARROW) @@ -85,7 +86,7 @@ End <-- '*)' C <- Begin N* End N <- C / (!Begin !End Z) -Z <- .") +Z <- [^X-Z]") ;; Forbid some characters to test not-in-range ;; A short /etc/passwd file. (define *etc-passwd* @@ -125,6 +126,9 @@ SLASH < '/'") (match-pattern C "(*blah*)") (make-prec 0 8 "(*blah*)" '((Begin "(*") "blah" (End "*)"))))) + (pass-if + "simple comment with forbidden char" + (not (match-pattern C "(*blYh*)"))) (pass-if "simple comment padded" (equal? -- 2.46.0