From 81944c2037e0d6d8c972def2ceb1aa4c51a61431 Mon Sep 17 00:00:00 2001 From: Ekaitz Zarraga Date: Wed, 11 Sep 2024 21:19:26 +0200 Subject: [PATCH v3 1/2] PEG: Add full support for PEG + some extensions This commit adds support for PEG as described in: It adds support for the missing features (comments, underscores in identifiers and escaping) while keeping the extensions (dashes in identifiers, < and <--). The naming system tries to be as close as possible to the one proposed in the paper. * module/ice-9/peg/string-peg.scm: Rewrite PEG parser. * test-suite/tests/peg.test: Fix import --- doc/ref/api-peg.texi | 8 +- module/ice-9/peg/string-peg.scm | 442 ++++++++++++++++++++------------ test-suite/tests/peg.test | 32 ++- 3 files changed, 302 insertions(+), 180 deletions(-) diff --git a/doc/ref/api-peg.texi b/doc/ref/api-peg.texi index d34ddc64c..84a9e6c6b 100644 --- a/doc/ref/api-peg.texi +++ b/doc/ref/api-peg.texi @@ -17,6 +17,10 @@ Wikipedia has a clear and concise introduction to PEGs if you want to familiarize yourself with the syntax: @url{http://en.wikipedia.org/wiki/Parsing_expression_grammar}. +The paper that introduced PEG contains a more detailed description of how PEG +works, and describes its syntax in detail: +@url{https://bford.info/pub/lang/peg.pdf} + The @code{(ice-9 peg)} module works by compiling PEGs down to lambda expressions. These can either be stored in variables at compile-time by the define macros (@code{define-peg-pattern} and @@ -216,8 +220,8 @@ should propagate up the parse tree. The normal @code{<-} propagates the matched text up the parse tree, @code{<--} propagates the matched text up the parse tree tagged with the name of the nonterminal, and @code{<} discards that matched text and propagates nothing up the parse tree. -Also, nonterminals may consist of any alphanumeric character or a ``-'' -character (in normal PEGs nonterminals can only be alphabetic). +Also, nonterminals may include ``-'' character, while in normal PEG it is not +allowed. For example, if we: @lisp diff --git a/module/ice-9/peg/string-peg.scm b/module/ice-9/peg/string-peg.scm index 45ed14bb1..f688653ef 100644 --- a/module/ice-9/peg/string-peg.scm +++ b/module/ice-9/peg/string-peg.scm @@ -1,6 +1,7 @@ ;;;; string-peg.scm --- representing PEG grammars as strings ;;;; -;;;; Copyright (C) 2010, 2011 Free Software Foundation, Inc. +;;;; Copyright (C) 2010, 2011, Free Software Foundation, Inc. +;;;; Copyright (C) 2024 Ekaitz Zarraga ;;;; ;;;; This library is free software; you can redistribute it and/or ;;;; modify it under the terms of the GNU Lesser General Public @@ -22,9 +23,13 @@ define-peg-string-patterns peg-grammar) #:use-module (ice-9 peg using-parsers) + #:use-module (srfi srfi-1) #:use-module (ice-9 peg codegen) #:use-module (ice-9 peg simplify-tree)) +;; This module provides support for PEG as described in: +;; + ;; Gets the left-hand depth of a list. (define (depth lst) (if (or (not (list? lst)) (null? lst)) @@ -38,22 +43,57 @@ ;; Grammar for PEGs in PEG grammar. (define peg-as-peg -"grammar <-- (nonterminal ('<--' / '<-' / '<') sp pattern)+ -pattern <-- alternative (SLASH sp alternative)* -alternative <-- ([!&]? sp suffix)+ -suffix <-- primary ([*+?] sp)* -primary <-- '(' sp pattern ')' sp / '.' sp / literal / charclass / nonterminal !'<' -literal <-- ['] (!['] .)* ['] sp -charclass <-- LB (!']' (CCrange / CCsingle))* RB sp -CCrange <-- . '-' . -CCsingle <-- . -nonterminal <-- [a-zA-Z0-9-]+ sp -sp < [ \t\n]* -SLASH < '/' -LB < '[' -RB < ']' +"# Hierarchical syntax +Grammar <-- Spacing Definition+ EndOfFile +Definition <-- Identifier LEFTARROW Expression + +Expression <-- Sequence (SLASH Sequence)* +Sequence <-- Prefix* +Prefix <-- (AND / NOT)? Suffix +Suffix <-- Primary (QUESTION / STAR / PLUS)? +Primary <-- Identifier !LEFTARROW + / OPEN Expression CLOSE + / Literal / Class / DOT + +# Lexical syntax +Identifier <-- IdentStart IdentCont* Spacing +# NOTE: `-` is an extension +IdentStart <- [a-zA-Z_] / '-' +IdentCont <- IdentStart / [0-9] + +Literal <-- SQUOTE (!SQUOTE Char)* SQUOTE Spacing + / DQUOTE (!DQUOTE Char)* DQUOTE Spacing +Class <-- '[' (!']' Range)* ']' Spacing +Range <-- Char '-' Char / Char +Char <-- '\\\\' [nrt'\"\\[\\]\\\\] + / '\\\\' [0-7][0-7][0-7] + / '\\\\' [0-7][0-7]? + / !'\\\\' . + +# NOTE: `<--` and `<` are extensions +LEFTARROW <- ('<--' / '<-' / '<') Spacing +SQUOTE <-- ['] +DQUOTE <-- [\"] +OPENBRACKET < '[' +CLOSEBRACKET < ']' +SLASH < '/' Spacing +AND <-- '&' Spacing +NOT <-- '!' Spacing +QUESTION <-- '?' Spacing +STAR <-- '*' Spacing +PLUS <-- '+' Spacing +OPEN < '(' Spacing +CLOSE < ')' Spacing +DOT <-- '.' Spacing + +Spacing < (Space / Comment)* +Comment < '#' (!EndOfLine .)* EndOfLine +Space < ' ' / '\t' / EndOfLine +EndOfLine < '\\r\\n' / '\\n' / '\\r' +EndOfFile < !. ") + (define-syntax define-sexp-parser (lambda (x) (syntax-case x () @@ -63,35 +103,78 @@ RB < ']' (syn (wrap-parser-for-users x matchf accumsym #'sym))) #`(define sym #,syn)))))) -(define-sexp-parser peg-grammar all - (+ (and peg-nonterminal (or "<--" "<-" "<") peg-sp peg-pattern))) -(define-sexp-parser peg-pattern all - (and peg-alternative - (* (and (ignore "/") peg-sp peg-alternative)))) -(define-sexp-parser peg-alternative all - (+ (and (? (or "!" "&")) peg-sp peg-suffix))) -(define-sexp-parser peg-suffix all - (and peg-primary (* (and (or "*" "+" "?") peg-sp)))) -(define-sexp-parser peg-primary all - (or (and "(" peg-sp peg-pattern ")" peg-sp) - (and "." peg-sp) - peg-literal - peg-charclass - (and peg-nonterminal (not-followed-by "<")))) -(define-sexp-parser peg-literal all - (and "'" (* (and (not-followed-by "'") peg-any)) "'" peg-sp)) -(define-sexp-parser peg-charclass all - (and (ignore "[") - (* (and (not-followed-by "]") - (or charclass-range charclass-single))) - (ignore "]") - peg-sp)) -(define-sexp-parser charclass-range all (and peg-any "-" peg-any)) -(define-sexp-parser charclass-single all peg-any) -(define-sexp-parser peg-nonterminal all - (and (+ (or (range #\a #\z) (range #\A #\Z) (range #\0 #\9) "-")) peg-sp)) -(define-sexp-parser peg-sp none - (* (or " " "\t" "\n"))) +(define-sexp-parser Grammar all + (and Spacing (+ Definition) EndOfFile)) +(define-sexp-parser Definition all + (and Identifier LEFTARROW Expression)) +(define-sexp-parser Expression all + (and Sequence (* (and SLASH Sequence)))) +(define-sexp-parser Sequence all + (* Prefix)) +(define-sexp-parser Prefix all + (and (? (or AND NOT)) Suffix)) +(define-sexp-parser Suffix all + (and Primary (? (or QUESTION STAR PLUS)))) +(define-sexp-parser Primary all + (or (and Identifier (not-followed-by LEFTARROW)) + (and OPEN Expression CLOSE) + Literal + Class + DOT)) +(define-sexp-parser Identifier all + (and IdentStart (* IdentCont) Spacing)) +(define-sexp-parser IdentStart body + (or (range #\a #\z) (range #\A #\Z) "_" "-")) ; NOTE: - is an extension +(define-sexp-parser IdentCont body + (or IdentStart (range #\0 #\9))) +(define-sexp-parser Literal all + (or (and SQUOTE (* (and (not-followed-by SQUOTE) Char)) SQUOTE Spacing) + (and DQUOTE (* (and (not-followed-by DQUOTE) Char)) DQUOTE Spacing))) +(define-sexp-parser Class all + (and OPENBRACKET (* (and (not-followed-by CLOSEBRACKET) Range)) CLOSEBRACKET Spacing)) +(define-sexp-parser Range all + (or (and Char DASH Char) Char)) +(define-sexp-parser Char all + (or (and "\\" (or "n" "r" "t" "'" "[" "]" "\\")) + (and "\\" (range #\0 #\7) (range #\0 #\7) (range #\0 #\7)) + (and "\\" (range #\0 #\7) (? (range #\0 #\7))) + (and (not-followed-by "\\") peg-any))) +(define-sexp-parser LEFTARROW body + (and (or "<--" "<-" "<") Spacing)) ; NOTE: <-- and < are extensions +(define-sexp-parser SLASH none + (and "/" Spacing)) +(define-sexp-parser AND all + (and "&" Spacing)) +(define-sexp-parser NOT all + (and "!" Spacing)) +(define-sexp-parser QUESTION all + (and "?" Spacing)) +(define-sexp-parser STAR all + (and "*" Spacing)) +(define-sexp-parser PLUS all + (and "+" Spacing)) +(define-sexp-parser OPEN none + (and "(" Spacing)) +(define-sexp-parser CLOSE none + (and ")" Spacing)) +(define-sexp-parser DOT all + (and "." Spacing)) +(define-sexp-parser SQUOTE none "'") +(define-sexp-parser DQUOTE none "\"") +(define-sexp-parser OPENBRACKET none "[") +(define-sexp-parser CLOSEBRACKET none "]") +(define-sexp-parser DASH none "-") +(define-sexp-parser Spacing none + (* (or Space Comment))) +(define-sexp-parser Comment none + (and "#" (* (and (not-followed-by EndOfLine) peg-any)) EndOfLine)) +(define-sexp-parser Space none + (or " " "\t" EndOfLine)) +(define-sexp-parser EndOfLine none + (or "\r\n" "\n" "\r")) +(define-sexp-parser EndOfFile none + (not-followed-by peg-any)) + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;; PARSE STRING PEGS @@ -101,7 +184,7 @@ RB < ']' ;; will define all of the nonterminals in the grammar with equivalent ;; PEG s-expressions. (define (peg-parser str for-syntax) - (let ((parsed (match-pattern peg-grammar str))) + (let ((parsed (match-pattern Grammar str))) (if (not parsed) (begin ;; (display "Invalid PEG grammar!\n") @@ -110,132 +193,160 @@ RB < ']' (cond ((or (not (list? lst)) (null? lst)) lst) - ((eq? (car lst) 'peg-grammar) - #`(begin - #,@(map (lambda (x) (peg-nonterm->defn x for-syntax)) - (context-flatten (lambda (lst) (<= (depth lst) 2)) - (cdr lst)))))))))) + ((eq? (car lst) 'Grammar) + (Grammar->defn lst for-syntax))))))) -;; Macro wrapper for PEG-PARSER. Parses PEG grammars expressed as strings and -;; defines all the appropriate nonterminals. -(define-syntax define-peg-string-patterns - (lambda (x) - (syntax-case x () - ((_ str) - (peg-parser (syntax->datum #'str) x))))) +;; (Grammar (Definition ...) (Definition ...)) +(define (Grammar->defn lst for-syntax) + #`(begin + #,@(map (lambda (x) (Definition->defn x for-syntax)) + (context-flatten (lambda (lst) (<= (depth lst) 1)) + (cdr lst))))) -;; lst has format (nonterm grabber pattern), where -;; nonterm is a symbol (the name of the nonterminal), -;; grabber is a string (either "<", "<-" or "<--"), and -;; pattern is the parse of a PEG pattern expressed as as string. -(define (peg-nonterm->defn lst for-syntax) - (let* ((nonterm (car lst)) - (grabber (cadr lst)) - (pattern (caddr lst)) - (nonterm-name (datum->syntax for-syntax - (string->symbol (cadr nonterm))))) - #`(define-peg-pattern #,nonterm-name +;; (Definition (Identifier "Something") "<-" (Expression ...)) +;; `-> (define-peg-pattern Something 'all ...) +(define (Definition->defn lst for-syntax) + (let ((identifier (second (second lst))) + (grabber (third lst)) + (expression (fourth lst))) + #`(define-peg-pattern #,(datum->syntax for-syntax + (string->symbol identifier)) #,(cond ((string=? grabber "<--") (datum->syntax for-syntax 'all)) ((string=? grabber "<-") (datum->syntax for-syntax 'body)) (else (datum->syntax for-syntax 'none))) - #,(compressor (peg-pattern->defn pattern for-syntax) for-syntax)))) - -;; lst has format ('peg-pattern ...). -;; After the context-flatten, (cdr lst) has format -;; (('peg-alternative ...) ...), where the outer list is a collection -;; of elements from a '/' alternative. -(define (peg-pattern->defn lst for-syntax) - #`(or #,@(map (lambda (x) (peg-alternative->defn x for-syntax)) - (context-flatten (lambda (x) (eq? (car x) 'peg-alternative)) - (cdr lst))))) - -;; lst has format ('peg-alternative ...). -;; After the context-flatten, (cdr lst) has the format -;; (item ...), where each item has format either ("!" ...), ("&" ...), -;; or ('peg-suffix ...). -(define (peg-alternative->defn lst for-syntax) - #`(and #,@(map (lambda (x) (peg-body->defn x for-syntax)) - (context-flatten (lambda (x) (or (string? (car x)) - (eq? (car x) 'peg-suffix))) - (cdr lst))))) - -;; lst has the format either -;; ("!" ('peg-suffix ...)), ("&" ('peg-suffix ...)), or -;; ('peg-suffix ...). -(define (peg-body->defn lst for-syntax) - (cond - ((equal? (car lst) "&") - #`(followed-by #,(peg-suffix->defn (cadr lst) for-syntax))) - ((equal? (car lst) "!") - #`(not-followed-by #,(peg-suffix->defn (cadr lst) for-syntax))) - ((eq? (car lst) 'peg-suffix) - (peg-suffix->defn lst for-syntax)) - (else `(peg-parse-body-fail ,lst)))) - -;; lst has format ('peg-suffix (? (/ "*" "?" "+"))) -(define (peg-suffix->defn lst for-syntax) - (let ((inner-defn (peg-primary->defn (cadr lst) for-syntax))) - (cond - ((null? (cddr lst)) - inner-defn) - ((equal? (caddr lst) "*") - #`(* #,inner-defn)) - ((equal? (caddr lst) "?") - #`(? #,inner-defn)) - ((equal? (caddr lst) "+") - #`(+ #,inner-defn))))) - -;; Parse a primary. -(define (peg-primary->defn lst for-syntax) - (let ((el (cadr lst))) + #,(compressor (Expression->defn expression for-syntax) for-syntax)))) + +;; (Expression (Sequence X)) +;; `-> (X) +;; (Expression (Sequence X) (Sequence Y)) +;; `-> (or X Y) +;; (Expression (Sequence X) ((Sequence Y) (Sequence Z) ...)) +;; `-> (or X Y Z ...) +(define (Expression->defn lst for-syntax) + (let ((first-sequence (second lst)) + (rest (cddr lst))) + #`(or #,(Sequence->defn first-sequence for-syntax) + #,@(map (lambda (x) + (Sequence->defn x for-syntax)) + (keyword-flatten '(Sequence) rest))))) + + +(define (Sequence->defn lst for-syntax) + #`(and #,@(map (lambda (x) (Prefix->defn x for-syntax)) (cdr lst)))) + + +;; (Prefix (Suffix ...)) +;; `-> (...) +;; (Prefix (NOT "!") (Suffix ...)) +;; `-> (not-followed-by ...) +;; (Prefix (AND "&") (Suffix ...)) +;; `-> (followed-by ...) +(define (Prefix->defn lst for-syntax) + (let ((suffix (second lst))) + (case (car suffix) + ('AND #`(followed-by #,(Suffix->defn (third lst) for-syntax))) + ('NOT #`(not-followed-by #,(Suffix->defn (third lst) for-syntax))) + (else (Suffix->defn suffix for-syntax))))) + +;; (Suffix (Primary ...)) +;; `-> (...) +;; (Suffix (Primary ...) (STAR "*")) +;; `-> (* ...) +;; (Suffix (Primary ...) (QUESTION "?")) +;; `-> (? ...) +;; (Suffix (Primary ...) (PLUS "+")) +;; `-> (+ ...) +(define (Suffix->defn lst for-syntax) + (let* ((primary (second lst)) + (out (Primary->defn primary for-syntax)) + (extra (cddr lst))) + (if (null? extra) + out + (case (caar extra) + ('QUESTION #`(? #,out)) + ('STAR #`(* #,out)) + ('PLUS #`(+ #,out)))))) + +(define (Primary->defn lst for-syntax) + (let ((value (second lst))) + (case (car value) + ('DOT #'peg-any) + ('Identifier (Identifier->defn value for-syntax)) + ('Expression (Expression->defn value for-syntax)) + ('Literal (Literal->defn value for-syntax)) + ('Class (Class->defn value for-syntax))))) + +;; (Identifier "hello") +;; `-> hello +(define (Identifier->defn lst for-syntax) + (datum->syntax for-syntax (string->symbol (second lst)))) + +;; (Literal (Char "a") (Char "b") (Char "c")) +;; `-> "abc" +(define (Literal->defn lst for-syntax) + (apply string (map (lambda (x) (Char->defn x for-syntax)) (cdr lst)))) + +;; TODO: empty Class can happen: `[]`, but what does it represent? +;; (Class ...) +;; `-> (or ...) +(define (Class->defn lst for-syntax) + #`(or #,@(map (lambda (x) (Range->defn x for-syntax)) + (cdr lst)))) + +;; For one character: +;; (Range (Char "a")) +;; `-> "a" +;; Or for a range: +;; (Range (Char "a") (Char "b")) +;; `-> (range #\a #\b) +(define (Range->defn lst for-syntax) (cond - ((list? el) - (cond - ((eq? (car el) 'peg-literal) - (peg-literal->defn el for-syntax)) - ((eq? (car el) 'peg-charclass) - (peg-charclass->defn el for-syntax)) - ((eq? (car el) 'peg-nonterminal) - (datum->syntax for-syntax (string->symbol (cadr el)))))) - ((string? el) + ((= 2 (length lst)) + (second (second lst))) + ((= 3 (length lst)) + #`(range + #,(Char->defn (second lst) for-syntax) + #,(Char->defn (third lst) for-syntax))))) + +;; (Char "a") +;; `-> #\a +;; (Char "\\n") +;; `-> #\newline +;; (Char "\\135") +;; `-> #\] +(define (Char->defn lst for-syntax) + (let* ((charstr (second lst)) + (first (string-ref charstr 0))) (cond - ((equal? el "(") - (peg-pattern->defn (caddr lst) for-syntax)) - ((equal? el ".") - (datum->syntax for-syntax 'peg-any)) - (else (datum->syntax for-syntax - `(peg-parse-any unknown-string ,lst))))) - (else (datum->syntax for-syntax - `(peg-parse-any unknown-el ,lst)))))) - -;; Trims characters off the front and end of STR. -;; (trim-1chars "'ab'") -> "ab" -(define (trim-1chars str) (substring str 1 (- (string-length str) 1))) - -;; Parses a literal. -(define (peg-literal->defn lst for-syntax) - (datum->syntax for-syntax (trim-1chars (cadr lst)))) - -;; Parses a charclass. -(define (peg-charclass->defn lst for-syntax) - #`(or - #,@(map - (lambda (cc) - (cond - ((eq? (car cc) 'charclass-range) - #`(range #,(datum->syntax - for-syntax - (string-ref (cadr cc) 0)) - #,(datum->syntax - for-syntax - (string-ref (cadr cc) 2)))) - ((eq? (car cc) 'charclass-single) - (datum->syntax for-syntax (cadr cc))))) - (context-flatten - (lambda (x) (or (eq? (car x) 'charclass-range) - (eq? (car x) 'charclass-single))) - (cdr lst))))) + ((= 1 (string-length charstr)) first) + ((char-numeric? (string-ref charstr 1)) + (integer->char + (reduce + 0 + (map + (lambda (x y) + (* (- (char->integer x) (char->integer #\0)) y)) + (reverse (string->list charstr 1)) + '(1 8 64))))) + (else + (case (string-ref charstr 1) + ((#\n) #\newline) + ((#\r) #\return) + ((#\t) #\tab) + ((#\') #\') + ((#\]) #\]) + ((#\\) #\\) + ((#\[) #\[)))))) + +(define peg-grammar Grammar) + +;; Macro wrapper for PEG-PARSER. Parses PEG grammars expressed as strings and +;; defines all the appropriate nonterminals. +(define-syntax define-peg-string-patterns + (lambda (x) + (syntax-case x () + ((_ str) + (peg-parser (syntax->datum #'str) x))))) ;; Compresses a list to save the optimizer work. ;; e.g. (or (and a)) -> a @@ -263,11 +374,10 @@ RB < ']' (let ((string (syntax->datum #'str-stx))) (compile-peg-pattern (compressor - (peg-pattern->defn - (peg:tree (match-pattern peg-pattern string)) #'str-stx) + (Expression->defn + (peg:tree (match-pattern Expression string)) #'str-stx) #'str-stx) (if (eq? accum 'all) 'body accum)))) (else (error "Bad embedded PEG string" args)))) (add-peg-compiler! 'peg peg-string-compile) - diff --git a/test-suite/tests/peg.test b/test-suite/tests/peg.test index f516571e8..556145e72 100644 --- a/test-suite/tests/peg.test +++ b/test-suite/tests/peg.test @@ -28,17 +28,25 @@ ;; the nonterminals defined in the PEG parser written with ;; S-expressions. (define grammar-mapping - '((grammar peg-grammar) - (pattern peg-pattern) - (alternative peg-alternative) - (suffix peg-suffix) - (primary peg-primary) - (literal peg-literal) - (charclass peg-charclass) - (CCrange charclass-range) - (CCsingle charclass-single) - (nonterminal peg-nonterminal) - (sp peg-sp))) + '((Grammar Grammar) + (Definition Definition) + (Expression Expression) + (Sequence Sequence) + (Prefix Prefix) + (Suffix Suffix) + (Primary Primary) + (Identifier Identifier) + (Literal Literal) + (Class Class) + (Range Range) + (Char Char) + (LEFTARROW LEFTARROW) + (AND AND) + (NOT NOT) + (QUESTION QUESTION) + (STAR STAR) + (PLUS PLUS) + (DOT DOT))) ;; Transforms the nonterminals defined in the PEG parser written as a PEG to the nonterminals defined in the PEG parser written with S-expressions. (define (grammar-transform x) @@ -69,7 +77,7 @@ (peg:tree (match-pattern (@@ (ice-9 peg) peg-grammar) (@@ (ice-9 peg) peg-as-peg))) (tree-map grammar-transform - (peg:tree (match-pattern grammar (@@ (ice-9 peg) peg-as-peg))))))) + (peg:tree (match-pattern (@@ (ice-9 peg) peg-grammar) (@@ (ice-9 peg) peg-as-peg))))))) ;; A grammar for pascal-style comments from Wikipedia. (define comment-grammar -- 2.46.0