From d894303ed01de72703b815061f826da97dd303f0 Mon Sep 17 00:00:00 2001 From: Ekaitz Zarraga Date: Fri, 11 Oct 2024 14:24:30 +0200 Subject: [PATCH v5 2/3] PEG: Add support for `not-in-range` and [^...] Modern PEG supports inversed class like `[^a-z]` that would get any character not in the `a-z` range. This commit adds support for that and also for a new `not-in-range` PEG pattern for scheme. * module/ice-9/peg/codegen.scm (cg-not-in-range): New function. * module/ice-9/peg/string-peg.scm: Add support for `[^...]` * test-suite/tests/peg.test: Add NotInClass to grammar-mapping. * doc/ref/api-peg.texi: Document accordingly. --- NEWS | 3 ++- doc/ref/api-peg.texi | 8 ++++++ module/ice-9/peg/codegen.scm | 22 +++++++++++++++ module/ice-9/peg/string-peg.scm | 48 ++++++++++++++++++++++++++++++--- test-suite/tests/peg.test | 2 +- 5 files changed, 77 insertions(+), 6 deletions(-) diff --git a/NEWS b/NEWS index df43f3754..17ef560b1 100644 --- a/NEWS +++ b/NEWS @@ -32,7 +32,8 @@ Changes in 3.0.11 (since 3.0.10) ** PEG parser PEG parser has been rewritten to cover all the functionality defined in -. +. Also added the `not-in-range` pattern +to `(ice-9 peg)` that is also available from PEG strings via `[^...]`. Changes in 3.0.10 (since 3.0.9) diff --git a/doc/ref/api-peg.texi b/doc/ref/api-peg.texi index 84a9e6c6b..edb090b20 100644 --- a/doc/ref/api-peg.texi +++ b/doc/ref/api-peg.texi @@ -147,6 +147,14 @@ Parses any character falling between @var{a} and @var{z}. @code{(range #\a #\z)} @end deftp +@deftp {PEG Pattern} {inverse range of characters} a z +Parses any character not falling between @var{a} and @var{z}. + +@code{"[^a-z]"} + +@code{(not-in-range #\a #\z)} +@end deftp + Example: @example diff --git a/module/ice-9/peg/codegen.scm b/module/ice-9/peg/codegen.scm index d80c3e849..82367ef55 100644 --- a/module/ice-9/peg/codegen.scm +++ b/module/ice-9/peg/codegen.scm @@ -140,6 +140,27 @@ return EXP." ((none) #`(list (1+ pos) '())) (else (error "bad accum" accum)))))))))) +;; Generates code for matching a range of characters not between start and end. +;; E.g.: (cg-not-in-range syntax #\a #\z 'body) +(define (cg-not-in-range pat accum) + (syntax-case pat () + ((start end) + (if (not (and (char? (syntax->datum #'start)) + (char? (syntax->datum #'end)))) + (error "range PEG should have characters after it; instead got" + #'start #'end)) + #`(lambda (str len pos) + (and (< pos len) + (let ((c (string-ref str pos))) + (and (or (char? c end)) + #,(case accum + ((all) #`(list (1+ pos) + (list 'cg-not-in-range (string c)))) + ((name) #`(list (1+ pos) 'cg-not-in-range)) + ((body) #`(list (1+ pos) (string c))) + ((none) #`(list (1+ pos) '())) + (else (error "bad accum" accum)))))))))) + ;; Generate code to match a pattern and do nothing with the result (define (cg-ignore pat accum) (syntax-case pat () @@ -304,6 +325,7 @@ return EXP." (assq-set! peg-compiler-alist symbol function))) (add-peg-compiler! 'range cg-range) +(add-peg-compiler! 'not-in-range cg-not-in-range) (add-peg-compiler! 'ignore cg-ignore) (add-peg-compiler! 'capture cg-capture) (add-peg-compiler! 'and cg-and) diff --git a/module/ice-9/peg/string-peg.scm b/module/ice-9/peg/string-peg.scm index ede24181c..4b92b393c 100644 --- a/module/ice-9/peg/string-peg.scm +++ b/module/ice-9/peg/string-peg.scm @@ -54,7 +54,7 @@ Prefix <-- (AND / NOT)? Suffix Suffix <-- Primary (QUESTION / STAR / PLUS)? Primary <-- Identifier !LEFTARROW / OPEN Expression CLOSE - / Literal / Class / DOT + / Literal / Class / NotInClass / DOT # Lexical syntax Identifier <-- IdentStart IdentCont* Spacing @@ -64,7 +64,8 @@ IdentCont <- IdentStart / [0-9] Literal <-- SQUOTE (!SQUOTE Char)* SQUOTE Spacing / DQUOTE (!DQUOTE Char)* DQUOTE Spacing -Class <-- OPENBRACKET (!CLOSEBRACKET Range)* CLOSEBRACKET Spacing +NotInClass <-- OPENBRACKET NOTIN (!CLOSEBRACKET Range)* CLOSEBRACKET Spacing +Class <-- OPENBRACKET !NOTIN (!CLOSEBRACKET Range)* CLOSEBRACKET Spacing Range <-- Char DASH Char / Char Char <-- '\\\\' [nrtf'\"\\[\\]\\\\] / '\\\\' [0-7][0-7][0-7] @@ -80,6 +81,7 @@ DASH < '-' OPENBRACKET < '[' CLOSEBRACKET < ']' HEX <- [0-9a-fA-F] +NOTIN < '^' SLASH < '/' Spacing AND <-- '&' Spacing NOT <-- '!' Spacing @@ -124,6 +126,7 @@ EndOfFile < !. (and OPEN Expression CLOSE) Literal Class + NotInClass DOT)) (define-sexp-parser Identifier all (and IdentStart (* IdentCont) Spacing)) @@ -135,7 +138,11 @@ EndOfFile < !. (or (and SQUOTE (* (and (not-followed-by SQUOTE) Char)) SQUOTE Spacing) (and DQUOTE (* (and (not-followed-by DQUOTE) Char)) DQUOTE Spacing))) (define-sexp-parser Class all - (and OPENBRACKET (* (and (not-followed-by CLOSEBRACKET) Range)) CLOSEBRACKET Spacing)) + (and OPENBRACKET (not-followed-by NOTIN) + (* (and (not-followed-by CLOSEBRACKET) Range)) CLOSEBRACKET Spacing)) +(define-sexp-parser NotInClass all + (and OPENBRACKET NOTIN + (* (and (not-followed-by CLOSEBRACKET) Range)) CLOSEBRACKET Spacing)) (define-sexp-parser Range all (or (and Char DASH Char) Char)) (define-sexp-parser Char all @@ -148,6 +155,8 @@ EndOfFile < !. (and (or "<--" "<-" "<") Spacing)) ; NOTE: <-- and < are extensions (define-sexp-parser HEX body (or (range #\0 #\9) (range #\a #\f) (range #\A #\F))) +(define-sexp-parser NOTIN none + (and "^")) (define-sexp-parser SLASH none (and "/" Spacing)) (define-sexp-parser AND all @@ -284,6 +293,7 @@ EndOfFile < !. ('Identifier (Identifier->defn value for-syntax)) ('Expression (Expression->defn value for-syntax)) ('Literal (Literal->defn value for-syntax)) + ('NotInClass (NotInClass->defn value for-syntax)) ('Class (Class->defn value for-syntax))))) ;; (Identifier "hello") @@ -296,13 +306,43 @@ EndOfFile < !. (define (Literal->defn lst for-syntax) (apply string (map (lambda (x) (Char->defn x for-syntax)) (cdr lst)))) -;; TODO: empty Class can happen: `[]`, but what does it represent? +;; (NotInClass (Range ...) (Range ...)) +;; `-> (and (followed-by (not-in-range ...)) +;; (followed-by (not-in-range ...)) +;; ... +;; (not-in-range ...)) +;; NOTE: the order doesn't matter, because all `not-in-range`s will always +;; parse exactly one character, but all the elements but the last need not to +;; consume the input. +(define (NotInClass->defn lst for-syntax) + #`(and + #,@(map (lambda (x) #`(followed-by #,(NotInRange->defn x for-syntax))) + (cddr lst)) + #,(NotInRange->defn (cadr lst) for-syntax))) + ;; (Class ...) ;; `-> (or ...) (define (Class->defn lst for-syntax) #`(or #,@(map (lambda (x) (Range->defn x for-syntax)) (cdr lst)))) +;; For one character: +;; (Range (Char "a")) +;; `-> (not-in-range #\a #\a) +;; Or for a range: +;; (Range (Char "a") (Char "b")) +;; `-> (not-in-range #\a #\b) +;; NOTE: It's coming from NotInClass. +(define (NotInRange->defn lst for-syntax) + (match lst + (('Range c) + (let ((ch (Char->defn c for-syntax))) + #`(not-in-range #,ch #,ch))) + (('Range range-beginning range-end) + #`(not-in-range + #,(Char->defn range-beginning for-syntax) + #,(Char->defn range-end for-syntax))))) + ;; For one character: ;; (Range (Char "a")) ;; `-> "a" diff --git a/test-suite/tests/peg.test b/test-suite/tests/peg.test index 1136c03f1..5570fbfa8 100644 --- a/test-suite/tests/peg.test +++ b/test-suite/tests/peg.test @@ -38,6 +38,7 @@ (Identifier Identifier) (Literal Literal) (Class Class) + (NotInClass NotInClass) (Range Range) (Char Char) (LEFTARROW LEFTARROW) @@ -283,4 +284,3 @@ number <-- [0-9]+") "1+1/2*3+(1+1)/2" (equal? (eq-parse "1+1/2*3+(1+1)/2") '(+ (+ 1 (* (/ 1 2) 3)) (/ (+ 1 1) 2))))) - -- 2.46.0