unofficial mirror of bug-guile@gnu.org 
 help / color / mirror / Atom feed
From: Ekaitz Zarraga <ekaitz@elenq.tech>
To: 73188@debbugs.gnu.org
Cc: Ekaitz Zarraga <ekaitz@elenq.tech>
Subject: bug#73188: [PATCH] PEG: Add support for `not-in-range` and [^...]
Date: Fri, 11 Oct 2024 14:31:40 +0200	[thread overview]
Message-ID: <20241011123145.7228-1-ekaitz@elenq.tech> (raw)
In-Reply-To: <78a81bc5-cd0d-0506-185b-c733c66e96ae@elenq.tech>

Modern PEG supports inversed class like `[^a-z]` that would get any
character not in the `a-z` range. This commit adds support for that and
also for a new `not-in-range` PEG pattern for scheme.

* module/ice-9/peg/codegen.scm (cg-not-in-range): New function.
* module/ice-9/peg/string-peg.scm: Add support for `[^...]`
* test-suite/tests/peg.test: Test it.
* doc/ref/api-peg.texi: Document accordingly.
---
 doc/ref/api-peg.texi            |  8 +++++++
 module/ice-9/peg/codegen.scm    | 22 +++++++++++++++++++
 module/ice-9/peg/string-peg.scm | 39 +++++++++++++++++++++++++++++----
 test-suite/tests/peg.test       |  6 ++++-
 4 files changed, 70 insertions(+), 5 deletions(-)

diff --git a/doc/ref/api-peg.texi b/doc/ref/api-peg.texi
index d34ddc64c..f2707442c 100644
--- a/doc/ref/api-peg.texi
+++ b/doc/ref/api-peg.texi
@@ -143,6 +143,14 @@ Parses any character falling between @var{a} and @var{z}.
 @code{(range #\a #\z)}
 @end deftp
 
+@deftp {PEG Pattern} {inversed range of characters} a z
+Parses any character not falling between @var{a} and @var{z}.
+
+@code{"[^a-z]"}
+
+@code{(not-in-range #\a #\z)}
+@end deftp
+
 Example:
 
 @example
diff --git a/module/ice-9/peg/codegen.scm b/module/ice-9/peg/codegen.scm
index d80c3e849..82367ef55 100644
--- a/module/ice-9/peg/codegen.scm
+++ b/module/ice-9/peg/codegen.scm
@@ -140,6 +140,27 @@ return EXP."
                          ((none) #`(list (1+ pos) '()))
                          (else (error "bad accum" accum))))))))))
 
+;; Generates code for matching a range of characters not between start and end.
+;; E.g.: (cg-not-in-range syntax #\a #\z 'body)
+(define (cg-not-in-range pat accum)
+  (syntax-case pat ()
+    ((start end)
+     (if (not (and (char? (syntax->datum #'start))
+                   (char? (syntax->datum #'end))))
+         (error "range PEG should have characters after it; instead got"
+                #'start #'end))
+     #`(lambda (str len pos)
+         (and (< pos len)
+              (let ((c (string-ref str pos)))
+                (and (or (char<? c start) (char>? c end))
+                     #,(case accum
+                         ((all) #`(list (1+ pos)
+                                        (list 'cg-not-in-range (string c))))
+                         ((name) #`(list (1+ pos) 'cg-not-in-range))
+                         ((body) #`(list (1+ pos) (string c)))
+                         ((none) #`(list (1+ pos) '()))
+                         (else (error "bad accum" accum))))))))))
+
 ;; Generate code to match a pattern and do nothing with the result
 (define (cg-ignore pat accum)
   (syntax-case pat ()
@@ -304,6 +325,7 @@ return EXP."
         (assq-set! peg-compiler-alist symbol function)))
 
 (add-peg-compiler! 'range cg-range)
+(add-peg-compiler! 'not-in-range cg-not-in-range)
 (add-peg-compiler! 'ignore cg-ignore)
 (add-peg-compiler! 'capture cg-capture)
 (add-peg-compiler! 'and cg-and)
diff --git a/module/ice-9/peg/string-peg.scm b/module/ice-9/peg/string-peg.scm
index 05755693a..f88d2f7d8 100644
--- a/module/ice-9/peg/string-peg.scm
+++ b/module/ice-9/peg/string-peg.scm
@@ -49,7 +49,7 @@ Prefix <-- (AND / NOT)? Suffix
 Suffix <-- Primary (QUESTION / STAR / PLUS)?
 Primary <-- Identifier !LEFTARROW
            / OPEN Expression CLOSE
-           / Literal / Class / DOT
+           / Literal / Class / NotInClass / DOT
 
 # Lexical syntax
 Identifier <-- IdentStart IdentCont* Spacing
@@ -59,7 +59,8 @@ IdentCont <- IdentStart / [0-9]
 
 Literal <-- SQUOTE (!SQUOTE Char)* SQUOTE Spacing
         / DQUOTE (!DQUOTE Char)* DQUOTE Spacing
-Class <-- '[' (!']' Range)* ']' Spacing
+NotInClass <-- '[' NOTIN  (!']' Range)* ']' Spacing
+Class <-- '[' !NOTIN (!']' Range)* ']' Spacing
 Range <-- Char '-' Char / Char
 Char <-- '\\\\' [nrt'\"\\[\\]\\\\]
        / '\\\\' [0-7][0-7][0-7]
@@ -72,6 +73,7 @@ SQUOTE <-- [']
 DQUOTE <-- [\"]
 OPENBRACKET < '['
 CLOSEBRACKET < ']'
+NOTIN < '^'
 SLASH < '/' Spacing
 AND <-- '&' Spacing
 NOT <-- '!' Spacing
@@ -116,6 +118,7 @@ EndOfFile < !.
       (and OPEN Expression CLOSE)
       Literal
       Class
+      NotInClass
       DOT))
 (define-sexp-parser Identifier all
   (and IdentStart (* IdentCont) Spacing))
@@ -127,7 +130,11 @@ EndOfFile < !.
   (or (and SQUOTE (* (and (not-followed-by SQUOTE) Char)) SQUOTE Spacing)
       (and DQUOTE (* (and (not-followed-by DQUOTE) Char)) DQUOTE Spacing)))
 (define-sexp-parser Class all
-  (and OPENBRACKET (* (and (not-followed-by CLOSEBRACKET) Range)) CLOSEBRACKET Spacing))
+  (and OPENBRACKET (not-followed-by NOTIN)
+       (* (and (not-followed-by CLOSEBRACKET) Range)) CLOSEBRACKET Spacing))
+(define-sexp-parser NotInClass all
+  (and OPENBRACKET NOTIN
+       (* (and (not-followed-by CLOSEBRACKET) Range)) CLOSEBRACKET Spacing))
 (define-sexp-parser Range all
   (or (and Char DASH Char) Char))
 (define-sexp-parser Char all
@@ -137,6 +144,8 @@ EndOfFile < !.
       (and (not-followed-by "\\") peg-any)))
 (define-sexp-parser LEFTARROW body
   (and (or "<--" "<-" "<") Spacing)) ; NOTE: <-- and < are extensions
+(define-sexp-parser NOTIN none
+  (and "^"))
 (define-sexp-parser SLASH none
   (and "/" Spacing))
 (define-sexp-parser AND all
@@ -271,6 +280,7 @@ EndOfFile < !.
       ('Identifier (Identifier->defn value for-syntax))
       ('Expression (Expression->defn value for-syntax))
       ('Literal    (Literal->defn value for-syntax))
+      ('NotInClass (NotInClass->defn value for-syntax))
       ('Class      (Class->defn value for-syntax)))))
 
 ;; (Identifier "hello")
@@ -283,13 +293,34 @@ EndOfFile < !.
 (define (Literal->defn lst for-syntax)
   (apply string (map (lambda (x) (Char->defn x for-syntax)) (cdr lst))))
 
-;; TODO: empty Class can happen: `[]`, but what does it represent?
+;; (NotInClass ...)
+;;  `-> (and ...)
+(define (NotInClass->defn lst for-syntax)
+  #`(and #,@(map (lambda (x) (NotInRange->defn x for-syntax))
+                 (cdr lst))))
+
 ;; (Class ...)
 ;;  `-> (or ...)
 (define (Class->defn lst for-syntax)
   #`(or #,@(map (lambda (x) (Range->defn x for-syntax))
                 (cdr lst))))
 
+;; For one character:
+;; (NotInRange (Char "a"))
+;;  `-> (not-in-range #\a #\a)
+;; Or for a range:
+;; (NotInRange (Char "a") (Char "b"))
+;;  `-> (not-in-range #\a #\b)
+(define (NotInRange->defn lst for-syntax)
+  (cond
+    ((= 2 (length lst))
+     (let ((ch (Char->defn (second lst) for-syntax)))
+       #`(not-in-range #,ch #,ch)))
+    ((= 3 (length lst))
+     #`(not-in-range
+         #,(Char->defn (second lst) for-syntax)
+         #,(Char->defn (third lst) for-syntax)))))
+
 ;; For one character:
 ;; (Range (Char "a"))
 ;;  `-> "a"
diff --git a/test-suite/tests/peg.test b/test-suite/tests/peg.test
index 556145e72..965e1c12f 100644
--- a/test-suite/tests/peg.test
+++ b/test-suite/tests/peg.test
@@ -38,6 +38,7 @@
     (Identifier Identifier)
     (Literal Literal)
     (Class Class)
+    (NotInClass NotInClass)
     (Range Range)
     (Char Char)
     (LEFTARROW LEFTARROW)
@@ -85,7 +86,7 @@
 End <-- '*)'
 C <- Begin N* End
 N <- C / (!Begin !End Z)
-Z <- .")
+Z <- [^X-Z]") ;; Forbid some characters to test not-in-range
 
 ;; A short /etc/passwd file.
 (define *etc-passwd*
@@ -125,6 +126,9 @@ SLASH < '/'")
     (match-pattern C "(*blah*)")
     (make-prec 0 8 "(*blah*)"
 	       '((Begin "(*") "blah" (End "*)")))))
+  (pass-if
+   "simple comment with forbidden char"
+   (not (match-pattern C "(*blYh*)")))
   (pass-if
    "simple comment padded"
    (equal?
-- 
2.46.0






      parent reply	other threads:[~2024-10-11 12:31 UTC|newest]

Thread overview: 7+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-09-11 22:03 bug#73188: PEG parser does not support full PEG grammar Ekaitz Zarraga
2024-09-12 20:57 ` bug#73188: [PATCH v2] PEG: Add full support for PEG + some extensions Ekaitz Zarraga
2024-10-13 20:29   ` bug#73188: PEG parser does not support full PEG grammar Ludovic Courtès
2024-10-13 20:59     ` Ekaitz Zarraga
2024-10-14 11:56       ` Ludovic Courtès
2024-10-14 14:00         ` Ekaitz Zarraga
2024-10-11 12:31 ` Ekaitz Zarraga [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://www.gnu.org/software/guile/

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20241011123145.7228-1-ekaitz@elenq.tech \
    --to=ekaitz@elenq.tech \
    --cc=73188@debbugs.gnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).