unofficial mirror of guile-devel@gnu.org 
 help / color / mirror / Atom feed
* PEG Patches
@ 2011-03-06  5:25 Noah Lavine
  2011-03-07  1:28 ` Noah Lavine
  2011-03-25 18:06 ` Andy Wingo
  0 siblings, 2 replies; 10+ messages in thread
From: Noah Lavine @ 2011-03-06  5:25 UTC (permalink / raw)
  To: guile-devel

[-- Attachment #1: Type: text/plain, Size: 1208 bytes --]

Hello all,

Attached is a series of patches I've made for the wip-mlucy branch. It
splits the PEG code into several little modules which go in
module/ice-9/peg/. The original peg source file becomes very little.
At the end it finally loses its big eval-when wrapper.

There's one part of this that I'm not satisfied with, which is that
define-nonterminal goes into module/ice-9/peg/string-peg.scm, which is
supposed to be solely for pegs-as-strings. The reason for this is that
I got compiler errors if I didn't do this, and I couldn't figure out
how to stop them. I would appreciate it if someone would take a look
and try to find what I missed.

Also, a note about future ideas - the current PEG code can only parse
strings. However, there is almost nothing string-specific about the
parsing code - just a few calls to string-ref and substring in
codegen.scm. I'd like to see this extended to parse vectors filled
with arbitrary objects. This would let you use a tokenizer with it,
which is the easiest way to implement C correctly, and also probably
the easiest way to store line number information with tokens, which is
necessary for ultimately giving good error messages from PEG parsers.

Thanks,
Noah

[-- Attachment #2: 0001-Split-peg.scm.patch --]
[-- Type: application/octet-stream, Size: 19991 bytes --]

From a73cfb546fc6c044af71f0d48a63f061bca2f740 Mon Sep 17 00:00:00 2001
From: Noah Lavine <nlavine@haverford.edu>
Date: Sat, 5 Mar 2011 15:23:59 -0500
Subject: [PATCH 1/7] Split peg.scm

 * module/ice-9/peg.scm: move code generators to new module
 * module/ice-9/peg/codegen.scm: new module for PEG code generators
---
 module/ice-9/peg.scm         |  223 +-------------------------------------
 module/ice-9/peg/codegen.scm |  245 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 250 insertions(+), 218 deletions(-)
 create mode 100644 module/ice-9/peg/codegen.scm

diff --git a/module/ice-9/peg.scm b/module/ice-9/peg.scm
index d91a74e..0acc459 100644
--- a/module/ice-9/peg.scm
+++ b/module/ice-9/peg.scm
@@ -18,8 +18,7 @@
 ;;;;
 
 (define-module (ice-9 peg)
-  #:export (peg-sexp-compile
-            peg-string-compile
+  #:export (peg-string-compile
             context-flatten
             peg-parse
             define-nonterm
@@ -34,8 +33,9 @@
             peg:substring
             peg-record?
             keyword-flatten)
-  #:use-module (system base pmatch)
-  #:use-module (ice-9 pretty-print))
+  #:use-module (ice-9 peg codegen)
+  #:re-export (peg-sexp-compile)
+  #:use-module (system base pmatch))
 
 ;;;
 ;;; Helper Macros
@@ -58,222 +58,9 @@ execute the STMTs and try again."
        ((_) #t)
        (else #f)))))
 
-(define-syntax push!
-  (syntax-rules ()
-    "Push an object onto a list."
-    ((_ lst obj)
-     (set! lst (cons obj lst)))))
-
-(define-syntax single-filter
-  (syntax-rules ()
-    "If EXP is a list of one element, return the element.  Otherwise
-return EXP."
-    ((_ exp)
-     (pmatch exp
-       ((,elt) elt)
-       (,elts elts)))))
-
-(define-syntax push-not-null!
-  (syntax-rules ()
-    "If OBJ is non-null, push it onto LST, otherwise do nothing."
-    ((_ lst obj)
-     (if (not (null? obj))
-         (push! lst obj)))))
-
 (eval-when (compile load eval)
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;;;; CODE GENERATORS
-;; These functions generate scheme code for parsing PEGs.
-;; Conventions:
-;;   accum: (all name body none)
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-;; Code we generate will have a certain return structure depending on how we're
-;; accumulating (the ACCUM variable).
-(define (cg-generic-ret accum name body-uneval at)
-  ;; name, body-uneval and at are syntax
-  #`(let ((body #,body-uneval))
-     #,(cond
-        ((and (eq? accum 'all) name)
-         #`(list #,at
-                 (cond
-                  ((not (list? body)) (list '#,name body))
-                  ((null? body) '#,name)
-                  ((symbol? (car body)) (list '#,name body))
-                  (else (cons '#,name body)))))
-        ((eq? accum 'name)
-         #`(list #,at '#,name))
-        ((eq? accum 'body)
-         #`(list #,at
-                 (cond
-                  ((single? body) (car body))
-                  (else body))))
-        ((eq? accum 'none)
-         #`(list #,at '()))
-        (else
-         (begin
-           (pretty-print `(cg-generic-ret-error ,accum ,name ,body-uneval ,at))
-           (pretty-print "Defaulting to accum of none.\n")
-           #`(list #,at '()))))))
-
-;; The short name makes the formatting below much easier to read.
-(define cggr cg-generic-ret)
-
-;; Generates code that matches a particular string.
-;; E.g.: (cg-string syntax "abc" 'body)
-(define (cg-string pat accum)
-  (let ((plen (string-length pat)))
-    #`(lambda (str len pos)
-        (let ((end (+ pos #,plen)))
-          (and (<= end len)
-               (string= str #,pat pos end)
-               #,(case accum
-                   ((all) #`(list end (list 'cg-string #,pat)))
-                   ((name) #`(list end 'cg-string))
-                   ((body) #`(list end #,pat))
-                   ((none) #`(list end '()))
-                   (else (error "bad accum" accum))))))))
-
-;; Generates code for matching any character.
-;; E.g.: (cg-peg-any syntax 'body)
-(define (cg-peg-any accum)
-  #`(lambda (str len pos)
-      (and (< pos len)
-           #,(case accum
-               ((all) #`(list (1+ pos)
-                              (list 'cg-peg-any (substring str pos (1+ pos)))))
-               ((name) #`(list (1+ pos) 'cg-peg-any))
-               ((body) #`(list (1+ pos) (substring str pos (1+ pos))))
-               ((none) #`(list (1+ pos) '()))
-               (else (error "bad accum" accum))))))
-
-;; Generates code for matching a range of characters between start and end.
-;; E.g.: (cg-range syntax #\a #\z 'body)
-(define (cg-range start end accum)
-  #`(lambda (str len pos)
-      (and (< pos len)
-           (let ((c (string-ref str pos)))
-             (and (char>=? c #,start)
-                  (char<=? c #,end)
-                  #,(case accum
-                      ((all) #`(list (1+ pos) (list 'cg-range (string c))))
-                      ((name) #`(list (1+ pos) 'cg-range))
-                      ((body) #`(list (1+ pos) (string c)))
-                      ((none) #`(list (1+ pos) '()))
-                      (else (error "bad accum" accum))))))))
-
-;; Filters the accum argument to peg-sexp-compile for buildings like string
-;; literals (since we don't want to tag them with their name if we're doing an
-;; "all" accum).
-(define (builtin-accum-filter accum)
-  (cond
-   ((eq? accum 'all) 'body)
-   ((eq? accum 'name) 'name)
-   ((eq? accum 'body) 'body)
-   ((eq? accum 'none) 'none)))
-(define baf builtin-accum-filter)
-
-;; Takes an arbitrary expressions and accumulation variable, then parses it.
-;; E.g.: (peg-sexp-compile syntax '(and "abc" (or "-" (range #\a #\z))) 'all)
-(define (peg-sexp-compile pat accum)
-  (syntax-case pat (peg-any range ignore capture peg and or body)
-    (peg-any
-     (cg-peg-any (baf accum)))
-    (sym (identifier? #'sym) ;; nonterminal
-     #'sym)
-    (str (string? (syntax->datum #'str)) ;; literal string
-     (cg-string (syntax->datum #'str) (baf accum)))
-    ((range start end) ;; range of characters (e.g. [a-z])
-     (and (char? (syntax->datum #'start)) (char? (syntax->datum #'end)))
-     (cg-range (syntax->datum #'start) (syntax->datum #'end) (baf accum)))
-    ((ignore pat) ;; match but don't parse
-     (peg-sexp-compile #'pat 'none))
-    ((capture pat) ;; parse
-     (peg-sexp-compile #'pat 'body))
-    ((peg pat)  ;; embedded PEG string
-     (string? (syntax->datum #'pat))
-     (peg-string-compile #'pat (baf accum)))
-    ((and pat ...)
-     (cg-and #'(pat ...) (baf accum)))
-    ((or pat ...)
-     (cg-or #'(pat ...) (baf accum)))
-    ((body type pat num)
-     (cg-body (baf accum) #'type #'pat #'num))))
-
-;; Top-level function builder for AND.  Reduces to a call to CG-AND-INT.
-(define (cg-and clauses accum)
-  #`(lambda (str len pos)
-      (let ((body '()))
-        #,(cg-and-int clauses accum #'str #'len #'pos #'body))))
-
-;; Internal function builder for AND (calls itself).
-(define (cg-and-int clauses accum str strlen at body)
-  (syntax-case clauses ()
-    (()
-     (cggr accum 'cg-and #`(reverse #,body) at))
-    ((first rest ...)
-     #`(let ((res (#,(peg-sexp-compile #'first accum) #,str #,strlen #,at)))
-         (and res 
-              ;; update AT and BODY then recurse
-              (let ((newat (car res))
-                    (newbody (cadr res)))
-                (set! #,at newat)
-                (push-not-null! #,body (single-filter newbody))
-                #,(cg-and-int #'(rest ...) accum str strlen at body)))))))
-
-;; Top-level function builder for OR.  Reduces to a call to CG-OR-INT.
-(define (cg-or clauses accum)
-  #`(lambda (str len pos)
-      #,(cg-or-int clauses accum #'str #'len #'pos)))
-
-;; Internal function builder for OR (calls itself).
-(define (cg-or-int clauses accum str strlen at)
-  (syntax-case clauses ()
-    (()
-     #f)
-    ((first rest ...)
-     #`(or (#,(peg-sexp-compile #'first accum) #,str #,strlen #,at)
-           #,(cg-or-int #'(rest ...) accum str strlen at)))))
-
-;; Returns a function that parses a BODY element.
-(define (cg-body accum type pat num)
-  #`(lambda (str strlen at)
-      (let ((body '()))
-        (let lp ((end at) (count 0))
-          (let* ((match (#,(peg-sexp-compile pat accum) str strlen end))
-                 (new-end (if match (car match) end))
-                 (count (if (> new-end end) (1+ count) count)))
-            (if (> new-end end)
-                (push-not-null! body (single-filter (cadr match))))
-            (if (and (> new-end end)
-                     #,(syntax-case num (+ * ?)
-                         (n (number? (syntax->datum #'n))
-                            #'(< count n))
-                         (+ #t)
-                         (* #t)
-                         (? #'(< count 1))))
-                (lp new-end count)
-                (let ((success #,(syntax-case num (+ * ?)
-                                   (n (number? (syntax->datum #'n))
-                                      #'(= count n))
-                                   (+ #'(>= count 1))
-                                   (* #t)
-                                   (? #t))))
-                  #,(syntax-case type (! & lit)
-                      (!
-                       #`(if success
-                             #f
-                             #,(cggr accum 'cg-body #''() #'at)))
-                      (&
-                       #`(and success
-                              #,(cggr accum 'cg-body #''() #'at)))
-                      (lit
-                       #`(and success
-                              #,(cggr accum 'cg-body #'(reverse body) #'new-end)))))))))))
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;;;; FOR DEFINING AND USING NONTERMINALS
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
@@ -511,7 +298,7 @@ RB < ']'
   (let ((parsed (peg-parse peg-grammar str)))
     (if (not parsed)
         (begin
-          ;; (pretty-print "Invalid PEG grammar!\n")
+          ;; (display "Invalid PEG grammar!\n")
           #f)
         (let ((lst (peg:tree parsed)))
           (cond
diff --git a/module/ice-9/peg/codegen.scm b/module/ice-9/peg/codegen.scm
new file mode 100644
index 0000000..43f44cc
--- /dev/null
+++ b/module/ice-9/peg/codegen.scm
@@ -0,0 +1,245 @@
+;;;; codegen.scm --- code generation for composable parsers
+;;;;
+;;;; 	Copyright (C) 2011 Free Software Foundation, Inc.
+;;;;
+;;;; This library is free software; you can redistribute it and/or
+;;;; modify it under the terms of the GNU Lesser General Public
+;;;; License as published by the Free Software Foundation; either
+;;;; version 3 of the License, or (at your option) any later version.
+;;;; 
+;;;; This library is distributed in the hope that it will be useful,
+;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;;;; Lesser General Public License for more details.
+;;;; 
+;;;; You should have received a copy of the GNU Lesser General Public
+;;;; License along with this library; if not, write to the Free Software
+;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;;;;
+
+(define-module (ice-9 peg codegen)
+  #:export (peg-sexp-compile)
+  #:use-module (ice-9 peg)
+  #:use-module (ice-9 pretty-print)
+  #:use-module (system base pmatch))
+
+(define-syntax single?
+  (syntax-rules ()
+    "Return #t if X is a list of one element."
+    ((_ x)
+     (pmatch x
+       ((_) #t)
+       (else #f)))))
+
+(define-syntax single-filter
+  (syntax-rules ()
+    "If EXP is a list of one element, return the element.  Otherwise
+return EXP."
+    ((_ exp)
+     (pmatch exp
+       ((,elt) elt)
+       (,elts elts)))))
+
+(define-syntax push-not-null!
+  (syntax-rules ()
+    "If OBJ is non-null, push it onto LST, otherwise do nothing."
+    ((_ lst obj)
+     (if (not (null? obj))
+         (push! lst obj)))))
+
+(define-syntax push!
+  (syntax-rules ()
+    "Push an object onto a list."
+    ((_ lst obj)
+     (set! lst (cons obj lst)))))
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;; CODE GENERATORS
+;; These functions generate scheme code for parsing PEGs.
+;; Conventions:
+;;   accum: (all name body none)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Code we generate will have a certain return structure depending on how we're
+;; accumulating (the ACCUM variable).
+(define (cg-generic-ret accum name body-uneval at)
+  ;; name, body-uneval and at are syntax
+  #`(let ((body #,body-uneval))
+     #,(cond
+        ((and (eq? accum 'all) name)
+         #`(list #,at
+                 (cond
+                  ((not (list? body)) (list '#,name body))
+                  ((null? body) '#,name)
+                  ((symbol? (car body)) (list '#,name body))
+                  (else (cons '#,name body)))))
+        ((eq? accum 'name)
+         #`(list #,at '#,name))
+        ((eq? accum 'body)
+         #`(list #,at
+                 (cond
+                  ((single? body) (car body))
+                  (else body))))
+        ((eq? accum 'none)
+         #`(list #,at '()))
+        (else
+         (begin
+           (pretty-print `(cg-generic-ret-error ,accum ,name ,body-uneval ,at))
+           (pretty-print "Defaulting to accum of none.\n")
+           #`(list #,at '()))))))
+
+;; The short name makes the formatting below much easier to read.
+(define cggr cg-generic-ret)
+
+;; Generates code that matches a particular string.
+;; E.g.: (cg-string syntax "abc" 'body)
+(define (cg-string pat accum)
+  (let ((plen (string-length pat)))
+    #`(lambda (str len pos)
+        (let ((end (+ pos #,plen)))
+          (and (<= end len)
+               (string= str #,pat pos end)
+               #,(case accum
+                   ((all) #`(list end (list 'cg-string #,pat)))
+                   ((name) #`(list end 'cg-string))
+                   ((body) #`(list end #,pat))
+                   ((none) #`(list end '()))
+                   (else (error "bad accum" accum))))))))
+
+;; Generates code for matching any character.
+;; E.g.: (cg-peg-any syntax 'body)
+(define (cg-peg-any accum)
+  #`(lambda (str len pos)
+      (and (< pos len)
+           #,(case accum
+               ((all) #`(list (1+ pos)
+                              (list 'cg-peg-any (substring str pos (1+ pos)))))
+               ((name) #`(list (1+ pos) 'cg-peg-any))
+               ((body) #`(list (1+ pos) (substring str pos (1+ pos))))
+               ((none) #`(list (1+ pos) '()))
+               (else (error "bad accum" accum))))))
+
+;; Generates code for matching a range of characters between start and end.
+;; E.g.: (cg-range syntax #\a #\z 'body)
+(define (cg-range start end accum)
+  #`(lambda (str len pos)
+      (and (< pos len)
+           (let ((c (string-ref str pos)))
+             (and (char>=? c #,start)
+                  (char<=? c #,end)
+                  #,(case accum
+                      ((all) #`(list (1+ pos) (list 'cg-range (string c))))
+                      ((name) #`(list (1+ pos) 'cg-range))
+                      ((body) #`(list (1+ pos) (string c)))
+                      ((none) #`(list (1+ pos) '()))
+                      (else (error "bad accum" accum))))))))
+
+;; Filters the accum argument to peg-sexp-compile for buildings like string
+;; literals (since we don't want to tag them with their name if we're doing an
+;; "all" accum).
+(define (builtin-accum-filter accum)
+  (cond
+   ((eq? accum 'all) 'body)
+   ((eq? accum 'name) 'name)
+   ((eq? accum 'body) 'body)
+   ((eq? accum 'none) 'none)))
+(define baf builtin-accum-filter)
+
+;; Takes an arbitrary expressions and accumulation variable, then parses it.
+;; E.g.: (peg-sexp-compile syntax '(and "abc" (or "-" (range #\a #\z))) 'all)
+(define (peg-sexp-compile pat accum)
+  (syntax-case pat (peg-any range ignore capture peg and or body)
+    (peg-any
+     (cg-peg-any (baf accum)))
+    (sym (identifier? #'sym) ;; nonterminal
+     #'sym)
+    (str (string? (syntax->datum #'str)) ;; literal string
+     (cg-string (syntax->datum #'str) (baf accum)))
+    ((range start end) ;; range of characters (e.g. [a-z])
+     (and (char? (syntax->datum #'start)) (char? (syntax->datum #'end)))
+     (cg-range (syntax->datum #'start) (syntax->datum #'end) (baf accum)))
+    ((ignore pat) ;; match but don't parse
+     (peg-sexp-compile #'pat 'none))
+    ((capture pat) ;; parse
+     (peg-sexp-compile #'pat 'body))
+    ((peg pat)  ;; embedded PEG string
+     (string? (syntax->datum #'pat))
+     (peg-string-compile #'pat (baf accum)))
+    ((and pat ...)
+     (cg-and #'(pat ...) (baf accum)))
+    ((or pat ...)
+     (cg-or #'(pat ...) (baf accum)))
+    ((body type pat num)
+     (cg-body (baf accum) #'type #'pat #'num))))
+
+;; Top-level function builder for AND.  Reduces to a call to CG-AND-INT.
+(define (cg-and clauses accum)
+  #`(lambda (str len pos)
+      (let ((body '()))
+        #,(cg-and-int clauses accum #'str #'len #'pos #'body))))
+
+;; Internal function builder for AND (calls itself).
+(define (cg-and-int clauses accum str strlen at body)
+  (syntax-case clauses ()
+    (()
+     (cggr accum 'cg-and #`(reverse #,body) at))
+    ((first rest ...)
+     #`(let ((res (#,(peg-sexp-compile #'first accum) #,str #,strlen #,at)))
+         (and res 
+              ;; update AT and BODY then recurse
+              (let ((newat (car res))
+                    (newbody (cadr res)))
+                (set! #,at newat)
+                (push-not-null! #,body (single-filter newbody))
+                #,(cg-and-int #'(rest ...) accum str strlen at body)))))))
+
+;; Top-level function builder for OR.  Reduces to a call to CG-OR-INT.
+(define (cg-or clauses accum)
+  #`(lambda (str len pos)
+      #,(cg-or-int clauses accum #'str #'len #'pos)))
+
+;; Internal function builder for OR (calls itself).
+(define (cg-or-int clauses accum str strlen at)
+  (syntax-case clauses ()
+    (()
+     #f)
+    ((first rest ...)
+     #`(or (#,(peg-sexp-compile #'first accum) #,str #,strlen #,at)
+           #,(cg-or-int #'(rest ...) accum str strlen at)))))
+
+;; Returns a function that parses a BODY element.
+(define (cg-body accum type pat num)
+  #`(lambda (str strlen at)
+      (let ((body '()))
+        (let lp ((end at) (count 0))
+          (let* ((match (#,(peg-sexp-compile pat accum) str strlen end))
+                 (new-end (if match (car match) end))
+                 (count (if (> new-end end) (1+ count) count)))
+            (if (> new-end end)
+                (push-not-null! body (single-filter (cadr match))))
+            (if (and (> new-end end)
+                     #,(syntax-case num (+ * ?)
+                         (n (number? (syntax->datum #'n))
+                            #'(< count n))
+                         (+ #t)
+                         (* #t)
+                         (? #'(< count 1))))
+                (lp new-end count)
+                (let ((success #,(syntax-case num (+ * ?)
+                                   (n (number? (syntax->datum #'n))
+                                      #'(= count n))
+                                   (+ #'(>= count 1))
+                                   (* #t)
+                                   (? #t))))
+                  #,(syntax-case type (! & lit)
+                      (!
+                       #`(if success
+                             #f
+                             #,(cggr accum 'cg-body #''() #'at)))
+                      (&
+                       #`(and success
+                              #,(cggr accum 'cg-body #''() #'at)))
+                      (lit
+                       #`(and success
+                              #,(cggr accum 'cg-body #'(reverse body) #'new-end)))))))))))
-- 
1.7.4.1


[-- Attachment #3: 0002-Rename-in-peg.scm.patch --]
[-- Type: application/octet-stream, Size: 4011 bytes --]

From 3f76e95b6a8b7cfe72fd6a328bb94f621b9e248e Mon Sep 17 00:00:00 2001
From: Noah Lavine <nlavine@haverford.edu>
Date: Sat, 5 Mar 2011 16:23:05 -0500
Subject: [PATCH 2/7] Rename in peg.scm

 * module/ice-9/peg.scm: rename peg-parse-* functions to avoid confusion
    with what PEGs do.
---
 module/ice-9/peg.scm |   36 ++++++++++++++++++------------------
 1 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/module/ice-9/peg.scm b/module/ice-9/peg.scm
index 0acc459..e256c2d 100644
--- a/module/ice-9/peg.scm
+++ b/module/ice-9/peg.scm
@@ -305,7 +305,7 @@ RB < ']'
            ((or (not (list? lst)) (null? lst))
             lst)
            ((eq? (car lst) 'peg-grammar)
-            (cons 'begin (map (lambda (x) (peg-parse-nonterm x))
+            (cons 'begin (map (lambda (x) (peg-nonterm->defn x))
                               (context-flatten (lambda (lst) (<= (depth lst) 2))
                                           (cdr lst))))))))))
 
@@ -319,7 +319,7 @@ RB < ']'
 (define define-grammar-f peg-parser)
 
 ;; Parse a nonterminal and pattern listed in LST.
-(define (peg-parse-nonterm lst)
+(define (peg-nonterm->defn lst)
   (let ((nonterm (car lst))
         (grabber (cadr lst))
         (pattern (caddr lst)))
@@ -328,23 +328,23 @@ RB < ']'
          ((string=? grabber "<--") 'all)
          ((string=? grabber "<-") 'body)
          (else 'none))
-       ,(compressor (peg-parse-pattern pattern)))))
+       ,(compressor (peg-pattern->defn pattern)))))
 
 ;; Parse a pattern.
-(define (peg-parse-pattern lst)
-  (cons 'or (map peg-parse-alternative
+(define (peg-pattern->defn lst)
+  (cons 'or (map peg-alternative->defn
                  (context-flatten (lambda (x) (eq? (car x) 'peg-alternative))
                              (cdr lst)))))
 
 ;; Parse an alternative.
-(define (peg-parse-alternative lst)
-  (cons 'and (map peg-parse-body
+(define (peg-alternative->defn lst)
+  (cons 'and (map peg-body->defn
                   (context-flatten (lambda (x) (or (string? (car x))
                                               (eq? (car x) 'peg-suffix)))
                               (cdr lst)))))
 
 ;; Parse a body.
-(define (peg-parse-body lst)
+(define (peg-body->defn lst)
   (let ((suffix '())
         (front 'lit))
     (cond
@@ -354,41 +354,41 @@ RB < ']'
       (begin (set! front (string->symbol (car lst)))
              (set! suffix (cadr lst))))
      (else `(peg-parse-body-fail ,lst)))
-    `(body ,front ,@(peg-parse-suffix suffix))))
+    `(body ,front ,@(peg-suffix->defn suffix))))
 
 ;; Parse a suffix.
-(define (peg-parse-suffix lst)
-  (list (peg-parse-primary (cadr lst))
+(define (peg-suffix->defn lst)
+  (list (peg-primary->defn (cadr lst))
         (if (null? (cddr lst))
             1
             (string->symbol (caddr lst)))))
 
 ;; Parse a primary.
-(define (peg-parse-primary lst)
+(define (peg-primary->defn lst)
   (let ((el (cadr lst)))
   (cond
    ((list? el)
     (cond
      ((eq? (car el) 'peg-literal)
-      (peg-parse-literal el))
+      (peg-literal->defn el))
      ((eq? (car el) 'peg-charclass)
-      (peg-parse-charclass el))
+      (peg-charclass->defn el))
      ((eq? (car el) 'peg-nonterminal)
       (string->symbol (cadr el)))))
    ((string? el)
     (cond
      ((equal? el "(")
-      (peg-parse-pattern (caddr lst)))
+      (peg-pattern->defn (caddr lst)))
      ((equal? el ".")
       'peg-any)
      (else `(peg-parse-any unknown-string ,lst))))
    (else `(peg-parse-any unknown-el ,lst)))))
 
 ;; Parses a literal.
-(define (peg-parse-literal lst) (trim-1chars (cadr lst)))
+(define (peg-literal->defn lst) (trim-1chars (cadr lst)))
 
 ;; Parses a charclass.
-(define (peg-parse-charclass lst)
+(define (peg-charclass->defn lst)
   (cons 'or
         (map
          (lambda (cc)
@@ -423,7 +423,7 @@ RB < ']'
    (datum->syntax
     str-stx
     (compressor
-     (peg-parse-pattern
+     (peg-pattern->defn
       (peg:tree (peg-parse peg-pattern (syntax->datum str-stx))))))
    accum))
 
-- 
1.7.4.1


[-- Attachment #4: 0003-Make-Macros-Hygienic.patch --]
[-- Type: application/octet-stream, Size: 8354 bytes --]

From 6a8c63da95728dc33ca5984328d9747fbd067e26 Mon Sep 17 00:00:00 2001
From: Noah Lavine <nlavine@haverford.edu>
Date: Sat, 5 Mar 2011 22:37:11 -0500
Subject: [PATCH 3/7] Make Macros Hygienic

 * modules/ice-9/peg.scm: convert the unhygienic macros that generate code
    for string PEGs to use hygiene.
---
 module/ice-9/peg.scm |  129 ++++++++++++++++++++++++++++----------------------
 1 files changed, 73 insertions(+), 56 deletions(-)

diff --git a/module/ice-9/peg.scm b/module/ice-9/peg.scm
index e256c2d..9bf152c 100644
--- a/module/ice-9/peg.scm
+++ b/module/ice-9/peg.scm
@@ -294,7 +294,7 @@ RB < ']'
 
 ;; Pakes a string representing a PEG grammar and defines all the nonterminals in
 ;; it as the associated PEGs.
-(define (peg-parser str)
+(define (peg-parser str for-syntax)
   (let ((parsed (peg-parse peg-grammar str)))
     (if (not parsed)
         (begin
@@ -305,9 +305,10 @@ RB < ']'
            ((or (not (list? lst)) (null? lst))
             lst)
            ((eq? (car lst) 'peg-grammar)
-            (cons 'begin (map (lambda (x) (peg-nonterm->defn x))
-                              (context-flatten (lambda (lst) (<= (depth lst) 2))
-                                          (cdr lst))))))))))
+            #`(begin
+                #,@(map (lambda (x) (peg-nonterm->defn x for-syntax))
+                        (context-flatten (lambda (lst) (<= (depth lst) 2))
+                                         (cdr lst))))))))))
 
 ;; Macro wrapper for PEG-PARSER.  Parses PEG grammars expressed as strings and
 ;; defines all the appropriate nonterminals.
@@ -315,88 +316,101 @@ RB < ']'
   (lambda (x)
     (syntax-case x ()
       ((_ str)
-       (datum->syntax x (peg-parser (syntax->datum #'str)))))))
+       (peg-parser (syntax->datum #'str) x)))))
 (define define-grammar-f peg-parser)
 
 ;; Parse a nonterminal and pattern listed in LST.
-(define (peg-nonterm->defn lst)
-  (let ((nonterm (car lst))
-        (grabber (cadr lst))
-        (pattern (caddr lst)))
-    `(define-nonterm ,(string->symbol (cadr nonterm))
-       ,(cond
-         ((string=? grabber "<--") 'all)
-         ((string=? grabber "<-") 'body)
-         (else 'none))
-       ,(compressor (peg-pattern->defn pattern)))))
+(define (peg-nonterm->defn lst for-syntax)
+  (let* ((nonterm (car lst))
+         (grabber (cadr lst))
+         (pattern (caddr lst))
+         (nonterm-name (datum->syntax for-syntax
+                                      (string->symbol (cadr nonterm)))))
+    #`(define-nonterm #,nonterm-name
+       #,(cond
+          ((string=? grabber "<--") (datum->syntax for-syntax 'all))
+          ((string=? grabber "<-") (datum->syntax for-syntax 'body))
+          (else (datum->syntax for-syntax 'none)))
+       #,(compressor (peg-pattern->defn pattern for-syntax) for-syntax))))
 
 ;; Parse a pattern.
-(define (peg-pattern->defn lst)
-  (cons 'or (map peg-alternative->defn
-                 (context-flatten (lambda (x) (eq? (car x) 'peg-alternative))
-                             (cdr lst)))))
+(define (peg-pattern->defn lst for-syntax)
+  #`(or #,@(map (lambda (x) (peg-alternative->defn x for-syntax))
+                (context-flatten (lambda (x) (eq? (car x) 'peg-alternative))
+                                 (cdr lst)))))
 
 ;; Parse an alternative.
-(define (peg-alternative->defn lst)
-  (cons 'and (map peg-body->defn
-                  (context-flatten (lambda (x) (or (string? (car x))
-                                              (eq? (car x) 'peg-suffix)))
-                              (cdr lst)))))
+(define (peg-alternative->defn lst for-syntax)
+  #`(and #,@(map (lambda (x) (peg-body->defn x for-syntax))
+                 (context-flatten (lambda (x) (or (string? (car x))
+                                             (eq? (car x) 'peg-suffix)))
+                                  (cdr lst)))))
 
 ;; Parse a body.
-(define (peg-body->defn lst)
+(define (peg-body->defn lst for-syntax)
   (let ((suffix '())
-        (front 'lit))
+        (front (datum->syntax for-syntax 'lit)))
     (cond
      ((eq? (car lst) 'peg-suffix)
       (set! suffix lst))
      ((string? (car lst))
-      (begin (set! front (string->symbol (car lst)))
+      (begin (set! front (datum->syntax for-syntax
+                                        (string->symbol (car lst))))
              (set! suffix (cadr lst))))
      (else `(peg-parse-body-fail ,lst)))
-    `(body ,front ,@(peg-suffix->defn suffix))))
+    #`(body #,front #,@(peg-suffix->defn
+                        suffix
+                        for-syntax))))
 
 ;; Parse a suffix.
-(define (peg-suffix->defn lst)
-  (list (peg-primary->defn (cadr lst))
-        (if (null? (cddr lst))
-            1
-            (string->symbol (caddr lst)))))
+(define (peg-suffix->defn lst for-syntax)
+  #`(#,(peg-primary->defn (cadr lst) for-syntax)
+     #,(if (null? (cddr lst))
+           1
+           (datum->syntax for-syntax (string->symbol (caddr lst))))))
 
 ;; Parse a primary.
-(define (peg-primary->defn lst)
+(define (peg-primary->defn lst for-syntax)
   (let ((el (cadr lst)))
   (cond
    ((list? el)
     (cond
      ((eq? (car el) 'peg-literal)
-      (peg-literal->defn el))
+      (peg-literal->defn el for-syntax))
      ((eq? (car el) 'peg-charclass)
-      (peg-charclass->defn el))
+      (peg-charclass->defn el for-syntax))
      ((eq? (car el) 'peg-nonterminal)
-      (string->symbol (cadr el)))))
+      (datum->syntax for-syntax (string->symbol (cadr el))))))
    ((string? el)
     (cond
      ((equal? el "(")
-      (peg-pattern->defn (caddr lst)))
+      (peg-pattern->defn (caddr lst) for-syntax))
      ((equal? el ".")
-      'peg-any)
-     (else `(peg-parse-any unknown-string ,lst))))
-   (else `(peg-parse-any unknown-el ,lst)))))
+      (datum->syntax for-syntax 'peg-any))
+     (else (datum->syntax for-syntax
+                          `(peg-parse-any unknown-string ,lst)))))
+   (else (datum->syntax for-syntax
+                        `(peg-parse-any unknown-el ,lst))))))
 
 ;; Parses a literal.
-(define (peg-literal->defn lst) (trim-1chars (cadr lst)))
+(define (peg-literal->defn lst for-syntax)
+  (datum->syntax for-syntax (trim-1chars (cadr lst))))
 
 ;; Parses a charclass.
-(define (peg-charclass->defn lst)
-  (cons 'or
-        (map
+(define (peg-charclass->defn lst for-syntax)
+  #`(or
+     #,@(map
          (lambda (cc)
            (cond
             ((eq? (car cc) 'charclass-range)
-             `(range ,(string-ref (cadr cc) 0) ,(string-ref (cadr cc) 2)))
+             #`(range #,(datum->syntax
+                         for-syntax
+                         (string-ref (cadr cc) 0))
+                      #,(datum->syntax
+                         for-syntax
+                         (string-ref (cadr cc) 2))))
             ((eq? (car cc) 'charclass-single)
-             (cadr cc))))
+             (datum->syntax for-syntax (cadr cc)))))
          (context-flatten
           (lambda (x) (or (eq? (car x) 'charclass-range)
                           (eq? (car x) 'charclass-single)))
@@ -404,27 +418,30 @@ RB < ']'
 
 ;; Compresses a list to save the optimizer work.
 ;; e.g. (or (and a)) -> a
-(define (compressor lst)
+(define (compressor-core lst)
   (if (or (not (list? lst)) (null? lst))
       lst
       (cond
        ((and (or (eq? (car lst) 'or) (eq? (car lst) 'and))
              (null? (cddr lst)))
-        (compressor (cadr lst)))
+        (compressor-core (cadr lst)))
        ((and (eq? (car lst) 'body)
              (eq? (cadr lst) 'lit)
              (eq? (cadddr lst) 1))
-        (compressor (caddr lst)))
-       (else (map compressor lst)))))
+        (compressor-core (caddr lst)))
+       (else (map compressor-core lst)))))
+
+(define (compressor syn for-syntax)
+  (datum->syntax for-syntax
+                 (compressor-core (syntax->datum syn))))
 
 ;; Builds a lambda-expressions for the pattern STR using accum.
 (define (peg-string-compile str-stx accum)
   (peg-sexp-compile
-   (datum->syntax
-    str-stx
-    (compressor
-     (peg-pattern->defn
-      (peg:tree (peg-parse peg-pattern (syntax->datum str-stx))))))
+   (compressor
+    (peg-pattern->defn
+     (peg:tree (peg-parse peg-pattern (syntax->datum str-stx))) str-stx)
+    str-stx)
    accum))
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-- 
1.7.4.1


[-- Attachment #5: 0004-Separate-PEG-Strings.patch --]
[-- Type: application/octet-stream, Size: 25441 bytes --]

From 0ce194f300136dc857fd06040ed451d00a992503 Mon Sep 17 00:00:00 2001
From: Noah Lavine <nlavine@haverford.edu>
Date: Sat, 5 Mar 2011 23:54:50 -0500
Subject: [PATCH 4/7] Separate PEG Strings

 * module/ice-9/peg.scm: remove functions dealing with PEGs as strings
 * module/ice-9/peg/string-peg.scm: and put them here
---
 module/ice-9/peg.scm            |  291 ++------------------------------------
 module/ice-9/peg/codegen.scm    |    1 +
 module/ice-9/peg/string-peg.scm |  305 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 315 insertions(+), 282 deletions(-)
 create mode 100644 module/ice-9/peg/string-peg.scm

diff --git a/module/ice-9/peg.scm b/module/ice-9/peg.scm
index 9bf152c..b96104a 100644
--- a/module/ice-9/peg.scm
+++ b/module/ice-9/peg.scm
@@ -18,14 +18,11 @@
 ;;;;
 
 (define-module (ice-9 peg)
-  #:export (peg-string-compile
-            context-flatten
+  #:export (context-flatten
             peg-parse
-            define-nonterm
-            define-nonterm-f
+;            define-nonterm
+;            define-nonterm-f
             peg-match
-            define-grammar
-            define-grammar-f
             peg:start
             peg:end
             peg:string
@@ -33,8 +30,13 @@
             peg:substring
             peg-record?
             keyword-flatten)
+;  #:export-syntax (define-nonterm)
   #:use-module (ice-9 peg codegen)
-  #:re-export (peg-sexp-compile)
+  #:use-module (ice-9 peg string-peg)
+  #:re-export (peg-sexp-compile
+               define-grammar
+               define-grammar-f
+               define-nonterm)
   #:use-module (system base pmatch))
 
 ;;;
@@ -64,62 +66,6 @@ execute the STMTs and try again."
 ;;;;; FOR DEFINING AND USING NONTERMINALS
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
-;; The results of parsing using a nonterminal are cached.  Think of it like a
-;; hash with no conflict resolution.  Process for deciding on the cache size
-;; wasn't very scientific; just ran the benchmarks and stopped a little after
-;; the point of diminishing returns on my box.
-(define *cache-size* 512)
-
-(define (syntax-for-non-cache-case for-syntax matchf-syn accumsym s-syn)
-;  (let ((matchf-syn (datum->syntax for-syntax matchf)))
-   #`(lambda (str strlen at)
-      (let ((res (#,matchf-syn str strlen at)))
-        ;; Try to match the nonterminal.
-        (if res
-            ;; If we matched, do some post-processing to figure out
-            ;; what data to propagate upward.
-            (let ((at (car res))
-                  (body (cadr res)))
-              #,(cond
-                 ((eq? accumsym 'name)
-                  #`(list at '#,s-syn))
-                 ((eq? accumsym 'all)
-                  #`(list (car res)
-                          (cond
-                           ((not (list? body))
-                            (list '#,s-syn body))
-                           ((null? body) '#,s-syn)
-                           ((symbol? (car body))
-                            (list '#,s-syn body))
-                           (else (cons '#,s-syn body)))))
-                 ((eq? accumsym 'none) #`(list (car res) '()))
-                 (else #`(begin res))))
-            ;; If we didn't match, just return false.
-            #f))))
-
-;; Defines a new nonterminal symbol accumulating with ACCUM.
-(define-syntax define-nonterm
-  (lambda (x)
-    (syntax-case x ()
-      ((_ sym accum pat)
-       (let ((matchf (peg-sexp-compile #'pat (syntax->datum #'accum)))
-             (accumsym (syntax->datum #'accum))
-             (c (datum->syntax x (gensym))));; the cache
-         ;; CODE is the code to parse the string if the result isn't cached.
-         (let ((syn (syntax-for-non-cache-case x matchf accumsym #'sym)))
-           #`(begin
-               (define #,c (make-vector *cache-size* #f));; the cache
-               (define (sym str strlen at)
-                 (let* ((vref (vector-ref #,c (modulo at *cache-size*))))
-                   ;; Check to see whether the value is cached.
-                   (if (and vref (eq? (car vref) str) (= (cadr vref) at))
-                       (caddr vref);; If it is return it.
-                       (let ((fres ;; Else calculate it and cache it.
-                              (#,syn str strlen at)))
-                         (vector-set! #,c (modulo at *cache-size*)
-                                      (list str at fres))
-                         fres)))))))))))
-
 ;; Parses STRING using NONTERM
 (define (peg-parse nonterm string)
   ;; We copy the string before using it because it might have been modified
@@ -225,225 +171,6 @@ execute the STMTs and try again."
          (member (car x) keyword-lst)))
    lst))
 
-;; Gets the left-hand depth of a list.
-(define (depth lst)
-  (if (or (not (list? lst)) (null? lst))
-      0
-      (+ 1 (depth (car lst)))))
-
-;; Trims characters off the front and end of STR.
-;; (trim-1chars "'ab'") -> "ab"
-(define (trim-1chars str) (substring str 1 (- (string-length str) 1)))
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;;;; Parse string PEGs using sexp PEGs.
-;; See the variable PEG-AS-PEG for an easier-to-read syntax.
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-;; Grammar for PEGs in PEG grammar.
-(define peg-as-peg
-"grammar <-- (nonterminal ('<--' / '<-' / '<') sp pattern)+
-pattern <-- alternative (SLASH sp alternative)*
-alternative <-- ([!&]? sp suffix)+
-suffix <-- primary ([*+?] sp)*
-primary <-- '(' sp pattern ')' sp / '.' sp / literal / charclass / nonterminal !'<'
-literal <-- ['] (!['] .)* ['] sp
-charclass <-- LB (!']' (CCrange / CCsingle))* RB sp
-CCrange <-- . '-' .
-CCsingle <-- .
-nonterminal <-- [a-zA-Z0-9-]+ sp
-sp < [ \t\n]*
-SLASH < '/'
-LB < '['
-RB < ']'
-")
-
-(define-nonterm peg-grammar all
-  (body lit (and peg-nonterminal (or "<--" "<-" "<") peg-sp peg-pattern) +))
-(define-nonterm peg-pattern all
-  (and peg-alternative
-       (body lit (and (ignore "/") peg-sp peg-alternative) *)))
-(define-nonterm peg-alternative all
-  (body lit (and (body lit (or "!" "&") ?) peg-sp peg-suffix) +))
-(define-nonterm peg-suffix all
-  (and peg-primary (body lit (and (or "*" "+" "?") peg-sp) *)))
-(define-nonterm peg-primary all
-  (or (and "(" peg-sp peg-pattern ")" peg-sp)
-      (and "." peg-sp)
-      peg-literal
-      peg-charclass
-      (and peg-nonterminal (body ! "<" 1))))
-(define-nonterm peg-literal all
-  (and "'" (body lit (and (body ! "'" 1) peg-any) *) "'" peg-sp))
-(define-nonterm peg-charclass all
-  (and (ignore "[")
-       (body lit (and (body ! "]" 1)
-                      (or charclass-range charclass-single)) *)
-       (ignore "]")
-       peg-sp))
-(define-nonterm charclass-range all (and peg-any "-" peg-any))
-(define-nonterm charclass-single all peg-any)
-(define-nonterm peg-nonterminal all
-  (and (body lit (or (range #\a #\z) (range #\A #\Z) (range #\0 #\9) "-") +) peg-sp))
-(define-nonterm peg-sp none
-  (body lit (or " " "\t" "\n") *))
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;;;; PARSE STRING PEGS
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-;; Pakes a string representing a PEG grammar and defines all the nonterminals in
-;; it as the associated PEGs.
-(define (peg-parser str for-syntax)
-  (let ((parsed (peg-parse peg-grammar str)))
-    (if (not parsed)
-        (begin
-          ;; (display "Invalid PEG grammar!\n")
-          #f)
-        (let ((lst (peg:tree parsed)))
-          (cond
-           ((or (not (list? lst)) (null? lst))
-            lst)
-           ((eq? (car lst) 'peg-grammar)
-            #`(begin
-                #,@(map (lambda (x) (peg-nonterm->defn x for-syntax))
-                        (context-flatten (lambda (lst) (<= (depth lst) 2))
-                                         (cdr lst))))))))))
-
-;; Macro wrapper for PEG-PARSER.  Parses PEG grammars expressed as strings and
-;; defines all the appropriate nonterminals.
-(define-syntax define-grammar
-  (lambda (x)
-    (syntax-case x ()
-      ((_ str)
-       (peg-parser (syntax->datum #'str) x)))))
-(define define-grammar-f peg-parser)
-
-;; Parse a nonterminal and pattern listed in LST.
-(define (peg-nonterm->defn lst for-syntax)
-  (let* ((nonterm (car lst))
-         (grabber (cadr lst))
-         (pattern (caddr lst))
-         (nonterm-name (datum->syntax for-syntax
-                                      (string->symbol (cadr nonterm)))))
-    #`(define-nonterm #,nonterm-name
-       #,(cond
-          ((string=? grabber "<--") (datum->syntax for-syntax 'all))
-          ((string=? grabber "<-") (datum->syntax for-syntax 'body))
-          (else (datum->syntax for-syntax 'none)))
-       #,(compressor (peg-pattern->defn pattern for-syntax) for-syntax))))
-
-;; Parse a pattern.
-(define (peg-pattern->defn lst for-syntax)
-  #`(or #,@(map (lambda (x) (peg-alternative->defn x for-syntax))
-                (context-flatten (lambda (x) (eq? (car x) 'peg-alternative))
-                                 (cdr lst)))))
-
-;; Parse an alternative.
-(define (peg-alternative->defn lst for-syntax)
-  #`(and #,@(map (lambda (x) (peg-body->defn x for-syntax))
-                 (context-flatten (lambda (x) (or (string? (car x))
-                                             (eq? (car x) 'peg-suffix)))
-                                  (cdr lst)))))
-
-;; Parse a body.
-(define (peg-body->defn lst for-syntax)
-  (let ((suffix '())
-        (front (datum->syntax for-syntax 'lit)))
-    (cond
-     ((eq? (car lst) 'peg-suffix)
-      (set! suffix lst))
-     ((string? (car lst))
-      (begin (set! front (datum->syntax for-syntax
-                                        (string->symbol (car lst))))
-             (set! suffix (cadr lst))))
-     (else `(peg-parse-body-fail ,lst)))
-    #`(body #,front #,@(peg-suffix->defn
-                        suffix
-                        for-syntax))))
-
-;; Parse a suffix.
-(define (peg-suffix->defn lst for-syntax)
-  #`(#,(peg-primary->defn (cadr lst) for-syntax)
-     #,(if (null? (cddr lst))
-           1
-           (datum->syntax for-syntax (string->symbol (caddr lst))))))
-
-;; Parse a primary.
-(define (peg-primary->defn lst for-syntax)
-  (let ((el (cadr lst)))
-  (cond
-   ((list? el)
-    (cond
-     ((eq? (car el) 'peg-literal)
-      (peg-literal->defn el for-syntax))
-     ((eq? (car el) 'peg-charclass)
-      (peg-charclass->defn el for-syntax))
-     ((eq? (car el) 'peg-nonterminal)
-      (datum->syntax for-syntax (string->symbol (cadr el))))))
-   ((string? el)
-    (cond
-     ((equal? el "(")
-      (peg-pattern->defn (caddr lst) for-syntax))
-     ((equal? el ".")
-      (datum->syntax for-syntax 'peg-any))
-     (else (datum->syntax for-syntax
-                          `(peg-parse-any unknown-string ,lst)))))
-   (else (datum->syntax for-syntax
-                        `(peg-parse-any unknown-el ,lst))))))
-
-;; Parses a literal.
-(define (peg-literal->defn lst for-syntax)
-  (datum->syntax for-syntax (trim-1chars (cadr lst))))
-
-;; Parses a charclass.
-(define (peg-charclass->defn lst for-syntax)
-  #`(or
-     #,@(map
-         (lambda (cc)
-           (cond
-            ((eq? (car cc) 'charclass-range)
-             #`(range #,(datum->syntax
-                         for-syntax
-                         (string-ref (cadr cc) 0))
-                      #,(datum->syntax
-                         for-syntax
-                         (string-ref (cadr cc) 2))))
-            ((eq? (car cc) 'charclass-single)
-             (datum->syntax for-syntax (cadr cc)))))
-         (context-flatten
-          (lambda (x) (or (eq? (car x) 'charclass-range)
-                          (eq? (car x) 'charclass-single)))
-          (cdr lst)))))
-
-;; Compresses a list to save the optimizer work.
-;; e.g. (or (and a)) -> a
-(define (compressor-core lst)
-  (if (or (not (list? lst)) (null? lst))
-      lst
-      (cond
-       ((and (or (eq? (car lst) 'or) (eq? (car lst) 'and))
-             (null? (cddr lst)))
-        (compressor-core (cadr lst)))
-       ((and (eq? (car lst) 'body)
-             (eq? (cadr lst) 'lit)
-             (eq? (cadddr lst) 1))
-        (compressor-core (caddr lst)))
-       (else (map compressor-core lst)))))
-
-(define (compressor syn for-syntax)
-  (datum->syntax for-syntax
-                 (compressor-core (syntax->datum syn))))
-
-;; Builds a lambda-expressions for the pattern STR using accum.
-(define (peg-string-compile str-stx accum)
-  (peg-sexp-compile
-   (compressor
-    (peg-pattern->defn
-     (peg:tree (peg-parse peg-pattern (syntax->datum str-stx))) str-stx)
-    str-stx)
-   accum))
-
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;;;; PMATCH STRUCTURE MUNGING
 ;; Pretty self-explanatory.
diff --git a/module/ice-9/peg/codegen.scm b/module/ice-9/peg/codegen.scm
index 43f44cc..2c85ccc 100644
--- a/module/ice-9/peg/codegen.scm
+++ b/module/ice-9/peg/codegen.scm
@@ -20,6 +20,7 @@
 (define-module (ice-9 peg codegen)
   #:export (peg-sexp-compile)
   #:use-module (ice-9 peg)
+  #:use-module (ice-9 peg string-peg)
   #:use-module (ice-9 pretty-print)
   #:use-module (system base pmatch))
 
diff --git a/module/ice-9/peg/string-peg.scm b/module/ice-9/peg/string-peg.scm
new file mode 100644
index 0000000..f7e21f6
--- /dev/null
+++ b/module/ice-9/peg/string-peg.scm
@@ -0,0 +1,305 @@
+;;;; string-peg.scm --- representing PEG grammars as strings
+;;;;
+;;;; 	Copyright (C) 2010, 2011 Free Software Foundation, Inc.
+;;;;
+;;;; This library is free software; you can redistribute it and/or
+;;;; modify it under the terms of the GNU Lesser General Public
+;;;; License as published by the Free Software Foundation; either
+;;;; version 3 of the License, or (at your option) any later version.
+;;;; 
+;;;; This library is distributed in the hope that it will be useful,
+;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;;;; Lesser General Public License for more details.
+;;;; 
+;;;; You should have received a copy of the GNU Lesser General Public
+;;;; License along with this library; if not, write to the Free Software
+;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;;;;
+
+(define-module (ice-9 peg string-peg)
+  #:export (peg-string-compile
+            peg-as-peg
+            define-grammar
+            define-grammar-f
+            define-nonterm
+            peg-grammar)
+  #:use-module (ice-9 peg)
+  #:use-module (ice-9 peg codegen))
+
+;; The results of parsing using a nonterminal are cached.  Think of it like a
+;; hash with no conflict resolution.  Process for deciding on the cache size
+;; wasn't very scientific; just ran the benchmarks and stopped a little after
+;; the point of diminishing returns on my box.
+(define *cache-size* 512)
+
+;; Gets the left-hand depth of a list.
+(define (depth lst)
+  (if (or (not (list? lst)) (null? lst))
+      0
+      (+ 1 (depth (car lst)))))
+
+(eval-when (compile load eval)
+(define (syntax-for-non-cache-case for-syntax matchf-syn accumsym s-syn)
+;  (let ((matchf-syn (datum->syntax for-syntax matchf)))
+   #`(lambda (str strlen at)
+      (let ((res (#,matchf-syn str strlen at)))
+        ;; Try to match the nonterminal.
+        (if res
+            ;; If we matched, do some post-processing to figure out
+            ;; what data to propagate upward.
+            (let ((at (car res))
+                  (body (cadr res)))
+              #,(cond
+                 ((eq? accumsym 'name)
+                  #`(list at '#,s-syn))
+                 ((eq? accumsym 'all)
+                  #`(list (car res)
+                          (cond
+                           ((not (list? body))
+                            (list '#,s-syn body))
+                           ((null? body) '#,s-syn)
+                           ((symbol? (car body))
+                            (list '#,s-syn body))
+                           (else (cons '#,s-syn body)))))
+                 ((eq? accumsym 'none) #`(list (car res) '()))
+                 (else #`(begin res))))
+            ;; If we didn't match, just return false.
+            #f))))
+)
+
+;; Defines a new nonterminal symbol accumulating with ACCUM.
+(define-syntax define-nonterm
+  (lambda (x)
+    (syntax-case x ()
+      ((_ sym accum pat)
+       (let ((matchf (peg-sexp-compile #'pat (syntax->datum #'accum)))
+             (accumsym (syntax->datum #'accum))
+             (c (datum->syntax x (gensym))));; the cache
+         ;; CODE is the code to parse the string if the result isn't cached.
+         (let ((syn (syntax-for-non-cache-case x matchf accumsym #'sym)))
+           #`(begin
+               (define #,c (make-vector *cache-size* #f));; the cache
+               (define (sym str strlen at)
+                 (let* ((vref (vector-ref #,c (modulo at *cache-size*))))
+                   ;; Check to see whether the value is cached.
+                   (if (and vref (eq? (car vref) str) (= (cadr vref) at))
+                       (caddr vref);; If it is return it.
+                       (let ((fres ;; Else calculate it and cache it.
+                              (#,syn str strlen at)))
+                         (vector-set! #,c (modulo at *cache-size*)
+                                      (list str at fres))
+                         fres)))))))))))
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;; Parse string PEGs using sexp PEGs.
+;; See the variable PEG-AS-PEG for an easier-to-read syntax.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Grammar for PEGs in PEG grammar.
+(define peg-as-peg
+"grammar <-- (nonterminal ('<--' / '<-' / '<') sp pattern)+
+pattern <-- alternative (SLASH sp alternative)*
+alternative <-- ([!&]? sp suffix)+
+suffix <-- primary ([*+?] sp)*
+primary <-- '(' sp pattern ')' sp / '.' sp / literal / charclass / nonterminal !'<'
+literal <-- ['] (!['] .)* ['] sp
+charclass <-- LB (!']' (CCrange / CCsingle))* RB sp
+CCrange <-- . '-' .
+CCsingle <-- .
+nonterminal <-- [a-zA-Z0-9-]+ sp
+sp < [ \t\n]*
+SLASH < '/'
+LB < '['
+RB < ']'
+")
+
+(define-nonterm peg-grammar all
+  (body lit (and peg-nonterminal (or "<--" "<-" "<") peg-sp peg-pattern) +))
+(define-nonterm peg-pattern all
+  (and peg-alternative
+       (body lit (and (ignore "/") peg-sp peg-alternative) *)))
+(define-nonterm peg-alternative all
+  (body lit (and (body lit (or "!" "&") ?) peg-sp peg-suffix) +))
+(define-nonterm peg-suffix all
+  (and peg-primary (body lit (and (or "*" "+" "?") peg-sp) *)))
+(define-nonterm peg-primary all
+  (or (and "(" peg-sp peg-pattern ")" peg-sp)
+      (and "." peg-sp)
+      peg-literal
+      peg-charclass
+      (and peg-nonterminal (body ! "<" 1))))
+(define-nonterm peg-literal all
+  (and "'" (body lit (and (body ! "'" 1) peg-any) *) "'" peg-sp))
+(define-nonterm peg-charclass all
+  (and (ignore "[")
+       (body lit (and (body ! "]" 1)
+                      (or charclass-range charclass-single)) *)
+       (ignore "]")
+       peg-sp))
+(define-nonterm charclass-range all (and peg-any "-" peg-any))
+(define-nonterm charclass-single all peg-any)
+(define-nonterm peg-nonterminal all
+  (and (body lit (or (range #\a #\z) (range #\A #\Z) (range #\0 #\9) "-") +) peg-sp))
+(define-nonterm peg-sp none
+  (body lit (or " " "\t" "\n") *))
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;; PARSE STRING PEGS
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Pakes a string representing a PEG grammar and defines all the nonterminals in
+;; it as the associated PEGs.
+(define (peg-parser str for-syntax)
+  (let ((parsed (peg-parse peg-grammar str)))
+    (if (not parsed)
+        (begin
+          ;; (display "Invalid PEG grammar!\n")
+          #f)
+        (let ((lst (peg:tree parsed)))
+          (cond
+           ((or (not (list? lst)) (null? lst))
+            lst)
+           ((eq? (car lst) 'peg-grammar)
+            #`(begin
+                #,@(map (lambda (x) (peg-nonterm->defn x for-syntax))
+                        (context-flatten (lambda (lst) (<= (depth lst) 2))
+                                         (cdr lst))))))))))
+
+;; Macro wrapper for PEG-PARSER.  Parses PEG grammars expressed as strings and
+;; defines all the appropriate nonterminals.
+(define-syntax define-grammar
+  (lambda (x)
+    (syntax-case x ()
+      ((_ str)
+       (peg-parser (syntax->datum #'str) x)))))
+(define define-grammar-f peg-parser)
+
+;; Parse a nonterminal and pattern listed in LST.
+(define (peg-nonterm->defn lst for-syntax)
+  (let* ((nonterm (car lst))
+         (grabber (cadr lst))
+         (pattern (caddr lst))
+         (nonterm-name (datum->syntax for-syntax
+                                      (string->symbol (cadr nonterm)))))
+    #`(define-nonterm #,nonterm-name
+       #,(cond
+          ((string=? grabber "<--") (datum->syntax for-syntax 'all))
+          ((string=? grabber "<-") (datum->syntax for-syntax 'body))
+          (else (datum->syntax for-syntax 'none)))
+       #,(compressor (peg-pattern->defn pattern for-syntax) for-syntax))))
+
+;; Parse a pattern.
+(define (peg-pattern->defn lst for-syntax)
+  #`(or #,@(map (lambda (x) (peg-alternative->defn x for-syntax))
+                (context-flatten (lambda (x) (eq? (car x) 'peg-alternative))
+                                 (cdr lst)))))
+
+;; Parse an alternative.
+(define (peg-alternative->defn lst for-syntax)
+  #`(and #,@(map (lambda (x) (peg-body->defn x for-syntax))
+                 (context-flatten (lambda (x) (or (string? (car x))
+                                             (eq? (car x) 'peg-suffix)))
+                                  (cdr lst)))))
+
+;; Parse a body.
+(define (peg-body->defn lst for-syntax)
+  (let ((suffix '())
+        (front (datum->syntax for-syntax 'lit)))
+    (cond
+     ((eq? (car lst) 'peg-suffix)
+      (set! suffix lst))
+     ((string? (car lst))
+      (begin (set! front (datum->syntax for-syntax
+                                        (string->symbol (car lst))))
+             (set! suffix (cadr lst))))
+     (else `(peg-parse-body-fail ,lst)))
+    #`(body #,front #,@(peg-suffix->defn
+                        suffix
+                        for-syntax))))
+
+;; Parse a suffix.
+(define (peg-suffix->defn lst for-syntax)
+  #`(#,(peg-primary->defn (cadr lst) for-syntax)
+     #,(if (null? (cddr lst))
+           1
+           (datum->syntax for-syntax (string->symbol (caddr lst))))))
+
+;; Parse a primary.
+(define (peg-primary->defn lst for-syntax)
+  (let ((el (cadr lst)))
+  (cond
+   ((list? el)
+    (cond
+     ((eq? (car el) 'peg-literal)
+      (peg-literal->defn el for-syntax))
+     ((eq? (car el) 'peg-charclass)
+      (peg-charclass->defn el for-syntax))
+     ((eq? (car el) 'peg-nonterminal)
+      (datum->syntax for-syntax (string->symbol (cadr el))))))
+   ((string? el)
+    (cond
+     ((equal? el "(")
+      (peg-pattern->defn (caddr lst) for-syntax))
+     ((equal? el ".")
+      (datum->syntax for-syntax 'peg-any))
+     (else (datum->syntax for-syntax
+                          `(peg-parse-any unknown-string ,lst)))))
+   (else (datum->syntax for-syntax
+                        `(peg-parse-any unknown-el ,lst))))))
+
+;; Trims characters off the front and end of STR.
+;; (trim-1chars "'ab'") -> "ab"
+(define (trim-1chars str) (substring str 1 (- (string-length str) 1)))
+
+;; Parses a literal.
+(define (peg-literal->defn lst for-syntax)
+  (datum->syntax for-syntax (trim-1chars (cadr lst))))
+
+;; Parses a charclass.
+(define (peg-charclass->defn lst for-syntax)
+  #`(or
+     #,@(map
+         (lambda (cc)
+           (cond
+            ((eq? (car cc) 'charclass-range)
+             #`(range #,(datum->syntax
+                         for-syntax
+                         (string-ref (cadr cc) 0))
+                      #,(datum->syntax
+                         for-syntax
+                         (string-ref (cadr cc) 2))))
+            ((eq? (car cc) 'charclass-single)
+             (datum->syntax for-syntax (cadr cc)))))
+         (context-flatten
+          (lambda (x) (or (eq? (car x) 'charclass-range)
+                          (eq? (car x) 'charclass-single)))
+          (cdr lst)))))
+
+;; Compresses a list to save the optimizer work.
+;; e.g. (or (and a)) -> a
+(define (compressor-core lst)
+  (if (or (not (list? lst)) (null? lst))
+      lst
+      (cond
+       ((and (or (eq? (car lst) 'or) (eq? (car lst) 'and))
+             (null? (cddr lst)))
+        (compressor-core (cadr lst)))
+       ((and (eq? (car lst) 'body)
+             (eq? (cadr lst) 'lit)
+             (eq? (cadddr lst) 1))
+        (compressor-core (caddr lst)))
+       (else (map compressor-core lst)))))
+
+(define (compressor syn for-syntax)
+  (datum->syntax for-syntax
+                 (compressor-core (syntax->datum syn))))
+
+;; Builds a lambda-expressions for the pattern STR using accum.
+(define (peg-string-compile str-stx accum)
+  (peg-sexp-compile
+   (compressor
+    (peg-pattern->defn
+     (peg:tree (peg-parse peg-pattern (syntax->datum str-stx))) str-stx)
+    str-stx)
+   accum))
-- 
1.7.4.1


[-- Attachment #6: 0005-Factor-PEG-Functions.patch --]
[-- Type: application/octet-stream, Size: 8366 bytes --]

From 58d7f11462917061349e023c84293f1ff065753f Mon Sep 17 00:00:00 2001
From: Noah Lavine <nlavine@haverford.edu>
Date: Sun, 6 Mar 2011 00:02:27 -0500
Subject: [PATCH 5/7] Factor PEG Functions

 * module/ice-9/peg.scm: take out the functions for simplifying trees
 * module/ice-9/peg/simplify-tree.scm: and put them here
---
 module/ice-9/peg.scm               |   87 ++------------------------------
 module/ice-9/peg/simplify-tree.scm |   97 ++++++++++++++++++++++++++++++++++++
 2 files changed, 103 insertions(+), 81 deletions(-)
 create mode 100644 module/ice-9/peg/simplify-tree.scm

diff --git a/module/ice-9/peg.scm b/module/ice-9/peg.scm
index b96104a..cb79c60 100644
--- a/module/ice-9/peg.scm
+++ b/module/ice-9/peg.scm
@@ -18,8 +18,7 @@
 ;;;;
 
 (define-module (ice-9 peg)
-  #:export (context-flatten
-            peg-parse
+  #:export (peg-parse
 ;            define-nonterm
 ;            define-nonterm-f
             peg-match
@@ -28,16 +27,17 @@
             peg:string
             peg:tree
             peg:substring
-            peg-record?
-            keyword-flatten)
+            peg-record?)
 ;  #:export-syntax (define-nonterm)
   #:use-module (ice-9 peg codegen)
   #:use-module (ice-9 peg string-peg)
+  #:use-module (ice-9 peg simplify-tree)
   #:re-export (peg-sexp-compile
                define-grammar
                define-grammar-f
-               define-nonterm)
-  #:use-module (system base pmatch))
+               define-nonterm
+               keyword-flatten
+               context-flatten))
 
 ;;;
 ;;; Helper Macros
@@ -52,14 +52,6 @@ execute the STMTs and try again."
        (or test
            (begin stmt stmt* ... (lp)))))))
 
-(define-syntax single?
-  (syntax-rules ()
-    "Return #t if X is a list of one element."
-    ((_ x)
-     (pmatch x
-       ((_) #t)
-       (else #f)))))
-
 (eval-when (compile load eval)
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -105,73 +97,6 @@ execute the STMTs and try again."
                         (string-collapse match))))))))))))
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;;;; POST-PROCESSING FUNCTIONS (TO CANONICALIZE MATCH TREES)
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-;; Is everything in LST true?
-(define (andlst lst)
-  (or (null? lst)
-      (and (car lst) (andlst (cdr lst)))))
-
-;; Is LST a list of strings?
-(define (string-list? lst)
-  (and (list? lst) (not (null? lst))
-       (andlst (map string? lst))))
-
-;; Groups all strings that are next to each other in LST.  Used in
-;; STRING-COLLAPSE.
-(define (string-group lst)
-  (if (not (list? lst))
-      lst
-      (if (null? lst)
-          '()
-          (let ((next (string-group (cdr lst))))
-            (if (not (string? (car lst)))
-                (cons (car lst) next)
-                (if (and (not (null? next))
-                         (list? (car next))
-                         (string? (caar next)))
-                    (cons (cons (car lst) (car next)) (cdr next))
-                    (cons (list (car lst)) next)))))))
-
-
-;; Collapses all the string in LST.
-;; ("a" "b" (c d) "e" "f") -> ("ab" (c d) "ef")
-(define (string-collapse lst)
-  (if (list? lst)
-      (let ((res (map (lambda (x) (if (string-list? x)
-                                      (apply string-append x)
-                                      x))
-                      (string-group (map string-collapse lst)))))
-        (if (single? res) (car res) res))
-      lst))
-
-;; If LST is an atom, return (list LST), else return LST.
-(define (mklst lst)
-  (if (not (list? lst)) (list lst) lst))
-
-;; Takes a list and "flattens" it, using the predicate TST to know when to stop
-;; instead of terminating on atoms (see tutorial).
-(define (context-flatten tst lst)
-  (if (or (not (list? lst)) (null? lst))
-      lst
-      (if (tst lst)
-          (list lst)
-          (apply append
-                 (map (lambda (x) (mklst (context-flatten tst x)))
-                      lst)))))
-
-;; Takes a list and "flattens" it, using the list of keywords KEYWORD-LST to
-;; know when to stop at (see tutorial).
-(define (keyword-flatten keyword-lst lst)
-  (context-flatten
-   (lambda (x)
-     (if (or (not (list? x)) (null? x))
-         #t
-         (member (car x) keyword-lst)))
-   lst))
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;;;; PMATCH STRUCTURE MUNGING
 ;; Pretty self-explanatory.
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
diff --git a/module/ice-9/peg/simplify-tree.scm b/module/ice-9/peg/simplify-tree.scm
new file mode 100644
index 0000000..4c781a1
--- /dev/null
+++ b/module/ice-9/peg/simplify-tree.scm
@@ -0,0 +1,97 @@
+;;;; simplify-tree.scm --- utility functions for the PEG parser
+;;;;
+;;;; 	Copyright (C) 2010, 2011 Free Software Foundation, Inc.
+;;;;
+;;;; This library is free software; you can redistribute it and/or
+;;;; modify it under the terms of the GNU Lesser General Public
+;;;; License as published by the Free Software Foundation; either
+;;;; version 3 of the License, or (at your option) any later version.
+;;;; 
+;;;; This library is distributed in the hope that it will be useful,
+;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;;;; Lesser General Public License for more details.
+;;;; 
+;;;; You should have received a copy of the GNU Lesser General Public
+;;;; License along with this library; if not, write to the Free Software
+;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;;;;
+
+(define-module (ice-9 peg simplify-tree)
+  #:export (keyword-flatten context-flatten string-collapse)
+  #:use-module (system base pmatch))
+
+(define-syntax single?
+  (syntax-rules ()
+    "Return #t if X is a list of one element."
+    ((_ x)
+     (pmatch x
+       ((_) #t)
+       (else #f)))))
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;; POST-PROCESSING FUNCTIONS (TO CANONICALIZE MATCH TREES)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Is everything in LST true?
+(define (andlst lst)
+  (or (null? lst)
+      (and (car lst) (andlst (cdr lst)))))
+
+;; Is LST a list of strings?
+(define (string-list? lst)
+  (and (list? lst) (not (null? lst))
+       (andlst (map string? lst))))
+
+;; Groups all strings that are next to each other in LST.  Used in
+;; STRING-COLLAPSE.
+(define (string-group lst)
+  (if (not (list? lst))
+      lst
+      (if (null? lst)
+          '()
+          (let ((next (string-group (cdr lst))))
+            (if (not (string? (car lst)))
+                (cons (car lst) next)
+                (if (and (not (null? next))
+                         (list? (car next))
+                         (string? (caar next)))
+                    (cons (cons (car lst) (car next)) (cdr next))
+                    (cons (list (car lst)) next)))))))
+
+
+;; Collapses all the string in LST.
+;; ("a" "b" (c d) "e" "f") -> ("ab" (c d) "ef")
+(define (string-collapse lst)
+  (if (list? lst)
+      (let ((res (map (lambda (x) (if (string-list? x)
+                                      (apply string-append x)
+                                      x))
+                      (string-group (map string-collapse lst)))))
+        (if (single? res) (car res) res))
+      lst))
+
+;; If LST is an atom, return (list LST), else return LST.
+(define (mklst lst)
+  (if (not (list? lst)) (list lst) lst))
+
+;; Takes a list and "flattens" it, using the predicate TST to know when to stop
+;; instead of terminating on atoms (see tutorial).
+(define (context-flatten tst lst)
+  (if (or (not (list? lst)) (null? lst))
+      lst
+      (if (tst lst)
+          (list lst)
+          (apply append
+                 (map (lambda (x) (mklst (context-flatten tst x)))
+                      lst)))))
+
+;; Takes a list and "flattens" it, using the list of keywords KEYWORD-LST to
+;; know when to stop at (see tutorial).
+(define (keyword-flatten keyword-lst lst)
+  (context-flatten
+   (lambda (x)
+     (if (or (not (list? x)) (null? x))
+         #t
+         (member (car x) keyword-lst)))
+   lst))
-- 
1.7.4.1


[-- Attachment #7: 0006-Factor-PEG-Structure.patch --]
[-- Type: application/octet-stream, Size: 4687 bytes --]

From e5b7b22b9e916b395e6070c0ee52b75ac237fc2f Mon Sep 17 00:00:00 2001
From: Noah Lavine <nlavine@haverford.edu>
Date: Sun, 6 Mar 2011 00:09:16 -0500
Subject: [PATCH 6/7] Factor PEG Structure

 * modules/ice-9/peg.scm: remove the part that defines a match structure
 * modules/ice-9/peg/match-record.scm: and put it here
---
 module/ice-9/peg.scm              |   38 +++++++-------------------------
 module/ice-9/peg/match-record.scm |   43 +++++++++++++++++++++++++++++++++++++
 2 files changed, 52 insertions(+), 29 deletions(-)
 create mode 100644 module/ice-9/peg/match-record.scm

diff --git a/module/ice-9/peg.scm b/module/ice-9/peg.scm
index cb79c60..4269f9b 100644
--- a/module/ice-9/peg.scm
+++ b/module/ice-9/peg.scm
@@ -21,23 +21,24 @@
   #:export (peg-parse
 ;            define-nonterm
 ;            define-nonterm-f
-            peg-match
-            peg:start
-            peg:end
-            peg:string
-            peg:tree
-            peg:substring
-            peg-record?)
+            peg-match)
 ;  #:export-syntax (define-nonterm)
   #:use-module (ice-9 peg codegen)
   #:use-module (ice-9 peg string-peg)
   #:use-module (ice-9 peg simplify-tree)
+  #:use-module (ice-9 peg match-record)
   #:re-export (peg-sexp-compile
                define-grammar
                define-grammar-f
                define-nonterm
                keyword-flatten
-               context-flatten))
+               context-flatten
+               peg:start
+               peg:end
+               peg:string
+               peg:tree
+               peg:substring
+               peg-record?))
 
 ;;;
 ;;; Helper Macros
@@ -96,26 +97,5 @@ execute the STMTs and try again."
                         at end string
                         (string-collapse match))))))))))))
 
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;;;; PMATCH STRUCTURE MUNGING
-;; Pretty self-explanatory.
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-(define prec
-  (make-record-type "peg" '(start end string tree)))
-(define make-prec
-  (record-constructor prec '(start end string tree)))
-(define (peg:start pm)
-  (if pm ((record-accessor prec 'start) pm) #f))
-(define (peg:end pm)
-  (if pm ((record-accessor prec 'end) pm) #f))
-(define (peg:string pm)
-  (if pm ((record-accessor prec 'string) pm) #f))
-(define (peg:tree pm)
-  (if pm ((record-accessor prec 'tree) pm) #f))
-(define (peg:substring pm)
-  (if pm (substring (peg:string pm) (peg:start pm) (peg:end pm)) #f))
-(define peg-record? (record-predicate prec))
-
 )
 
diff --git a/module/ice-9/peg/match-record.scm b/module/ice-9/peg/match-record.scm
new file mode 100644
index 0000000..87785a5
--- /dev/null
+++ b/module/ice-9/peg/match-record.scm
@@ -0,0 +1,43 @@
+;;;; match-record.scm --- records to hold PEG parser results
+;;;;
+;;;; 	Copyright (C) 2010, 2011 Free Software Foundation, Inc.
+;;;;
+;;;; This library is free software; you can redistribute it and/or
+;;;; modify it under the terms of the GNU Lesser General Public
+;;;; License as published by the Free Software Foundation; either
+;;;; version 3 of the License, or (at your option) any later version.
+;;;; 
+;;;; This library is distributed in the hope that it will be useful,
+;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;;;; Lesser General Public License for more details.
+;;;; 
+;;;; You should have received a copy of the GNU Lesser General Public
+;;;; License along with this library; if not, write to the Free Software
+;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;;;;
+
+(define-module (ice-9 peg match-record)
+  #:export (prec make-prec peg:start peg:end peg:string
+            peg:tree peg:substring peg-record?))
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;; PMATCH STRUCTURE MUNGING
+;; Pretty self-explanatory.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define prec
+  (make-record-type "peg" '(start end string tree)))
+(define make-prec
+  (record-constructor prec '(start end string tree)))
+(define (peg:start pm)
+  (if pm ((record-accessor prec 'start) pm) #f))
+(define (peg:end pm)
+  (if pm ((record-accessor prec 'end) pm) #f))
+(define (peg:string pm)
+  (if pm ((record-accessor prec 'string) pm) #f))
+(define (peg:tree pm)
+  (if pm ((record-accessor prec 'tree) pm) #f))
+(define (peg:substring pm)
+  (if pm (substring (peg:string pm) (peg:start pm) (peg:end pm)) #f))
+(define peg-record? (record-predicate prec))
-- 
1.7.4.1


[-- Attachment #8: 0007-Remove-eval-when.patch --]
[-- Type: application/octet-stream, Size: 1039 bytes --]

From d3394d374c6d7124ae3c25212340772285d9290a Mon Sep 17 00:00:00 2001
From: Noah Lavine <nlavine@haverford.edu>
Date: Sun, 6 Mar 2011 00:12:37 -0500
Subject: [PATCH 7/7] Remove eval-when

 * module/ice-9/peg.scm: remove the eval-when statement
---
 module/ice-9/peg.scm |    5 -----
 1 files changed, 0 insertions(+), 5 deletions(-)

diff --git a/module/ice-9/peg.scm b/module/ice-9/peg.scm
index 4269f9b..644af6d 100644
--- a/module/ice-9/peg.scm
+++ b/module/ice-9/peg.scm
@@ -53,8 +53,6 @@ execute the STMTs and try again."
        (or test
            (begin stmt stmt* ... (lp)))))))
 
-(eval-when (compile load eval)
-
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;;;; FOR DEFINING AND USING NONTERMINALS
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -96,6 +94,3 @@ execute the STMTs and try again."
                        (make-prec
                         at end string
                         (string-collapse match))))))))))))
-
-)
-
-- 
1.7.4.1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: PEG Patches
  2011-03-06  5:25 PEG Patches Noah Lavine
@ 2011-03-07  1:28 ` Noah Lavine
  2011-03-25 18:06 ` Andy Wingo
  1 sibling, 0 replies; 10+ messages in thread
From: Noah Lavine @ 2011-03-07  1:28 UTC (permalink / raw)
  To: guile-devel

[-- Attachment #1: Type: text/plain, Size: 1567 bytes --]

Here's another patch, which in retrospect may be the most useful of
the series. It adds a section called "PEG Internals" to the manual,
and begins documenting how PEG actually works. This should make
hacking PEG a lot easier.

Noah

On Sun, Mar 6, 2011 at 12:25 AM, Noah Lavine <noah.b.lavine@gmail.com> wrote:
> Hello all,
>
> Attached is a series of patches I've made for the wip-mlucy branch. It
> splits the PEG code into several little modules which go in
> module/ice-9/peg/. The original peg source file becomes very little.
> At the end it finally loses its big eval-when wrapper.
>
> There's one part of this that I'm not satisfied with, which is that
> define-nonterminal goes into module/ice-9/peg/string-peg.scm, which is
> supposed to be solely for pegs-as-strings. The reason for this is that
> I got compiler errors if I didn't do this, and I couldn't figure out
> how to stop them. I would appreciate it if someone would take a look
> and try to find what I missed.
>
> Also, a note about future ideas - the current PEG code can only parse
> strings. However, there is almost nothing string-specific about the
> parsing code - just a few calls to string-ref and substring in
> codegen.scm. I'd like to see this extended to parse vectors filled
> with arbitrary objects. This would let you use a tokenizer with it,
> which is the easiest way to implement C correctly, and also probably
> the easiest way to store line number information with tokens, which is
> necessary for ultimately giving good error messages from PEG parsers.
>
> Thanks,
> Noah
>

[-- Attachment #2: 0001-Document-PEG-Internals.patch --]
[-- Type: application/octet-stream, Size: 3799 bytes --]

From ab9739f2855cb4051ff24029d84cf59631f205a3 Mon Sep 17 00:00:00 2001
From: Noah Lavine <nlavine@haverford.edu>
Date: Sun, 6 Mar 2011 20:24:13 -0500
Subject: [PATCH] Document PEG Internals

 * doc/ref/api-peg.texi: add a manual section about the PEG internals.
---
 doc/ref/api-peg.texi |   71 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 71 insertions(+), 0 deletions(-)

diff --git a/doc/ref/api-peg.texi b/doc/ref/api-peg.texi
index b53139d..0c83365 100644
--- a/doc/ref/api-peg.texi
+++ b/doc/ref/api-peg.texi
@@ -35,6 +35,7 @@ reference, and a tutorial.
 * PEG Syntax Reference::
 * PEG API Reference::
 * PEG Tutorial::
+* PEG Internals::
 @end menu
 
 @node PEG Syntax Reference
@@ -921,3 +922,73 @@ instantiation of a common pattern for matching syntactically irrelevant
 information.  Since it's tagged with @code{<} and ends with @code{*} it
 won't clutter up the parse trees (all the empty lists will be discarded
 during the compression step) and it will never cause parsing to fail.
+
+@node PEG Internals
+@subsection PEG Internals
+
+A PEG parser takes a string as input and attempts to parse it as a given
+nonterminal. The key idea of the PEG implementation is that every
+nonterminal is just a function that takes a string as an argument and
+attempts to parse that string as its nonterminal. The functions always
+start from the beginning, but a parse is considered successful if there
+is material left over at the end.
+
+This makes it easy to model different PEG parsing operations. For
+instance, consider the PEG grammar @code{"ab"}, which could also be
+written @code{(and "a" "b")}. It matches the string ``ab''. Here's how
+that might be implemented in the PEG style:
+
+@lisp
+(define (match-and-a-b str)
+  (match-a str)
+  (match-b str))
+@end lisp
+
+As you can see, the use of functions provides an easy way to model
+sequencing. In a similar way, one could model @code{(or a b)} with
+something like the following:
+
+@lisp
+(define (match-or-a-b str)
+  (or (match-a str) (match-b str)))
+@end lisp
+
+Here the semantics of a PEG @code{or} expression map naturally onto
+Scheme's @code{or} operator. This function will attempt to run
+@code{(match-a str)}, and return its result if it succeeds. Otherwise it
+will run @code{(match-b str)}.
+
+Of course, the code above wouldn't quite work. We need some way for the
+parsing functions to communicate. The actual interface used is below.
+
+@subsubheading Parsing Function Interface
+
+A parsing function takes three arguments - a string, the length of that
+string, and the position in that string it should start parsing at. In
+effect, the parsing functions pass around substrings in pieces - the
+first argument is a buffer of characters, and the second two give a
+range within that buffer that the parsing function should look at.
+
+Parsing functions return either #f, if they failed to match their
+nonterminal, or a list whose first element must be an integer
+representing the final position in the string they matched and whose cdr
+can be any other data the function wishes to return, or '() if it
+doesn't have any more data.
+
+The one caveat is that if the extra data it returns is a list, any
+adjacent strings in that list will be appended by @code{peg-parse}. For
+instance, if a parsing function returns @code{(13 ("a" "b" "c"))},
+@code{peg-parse} will take @code{(13 ("abc"))} as its value.
+
+For example, here is a function to match ``ab'' using the actual
+interface.
+
+@lisp
+(define (match-a-b str len pos)
+   (and (<= (+ pos 2) len)
+        (string= str "ab" pos (+ pos 2))
+        (list (+ pos 2) '()))) ; we return no extra information
+@end lisp
+
+The above function can be used to match a string by running
+@code{(peg-parse match-a-b "ab")}.
-- 
1.7.4.1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: PEG Patches
  2011-03-06  5:25 PEG Patches Noah Lavine
  2011-03-07  1:28 ` Noah Lavine
@ 2011-03-25 18:06 ` Andy Wingo
  2011-03-28 20:44   ` Noah Lavine
  1 sibling, 1 reply; 10+ messages in thread
From: Andy Wingo @ 2011-03-25 18:06 UTC (permalink / raw)
  To: Noah Lavine; +Cc: guile-devel

On Sun 06 Mar 2011 06:25, Noah Lavine <noah.b.lavine@gmail.com> writes:

> Attached is a series of patches I've made for the wip-mlucy branch. It
> splits the PEG code into several little modules which go in
> module/ice-9/peg/. The original peg source file becomes very little.
> At the end it finally loses its big eval-when wrapper.

Cool!  These cleanups are great.

However... when you added the files, you did not add them to
Makefile.am, so they don't get built.  I went back to add them, but they
don't compile, and it's because of the circularity we have discussed in
other threads.

I think the solution is to confront the circularity directly.  It exists
because the PEG s-exp grammar also deals with the string grammar, which
needs an already-build PEG parser.

Let's break it instead into layers without cycles: removing the string
grammar from the s-exp code generator.  If we want a layer with both, we
build it on top of the two lower layers.

What do you think?

Andy
-- 
http://wingolog.org/



^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: PEG Patches
  2011-03-25 18:06 ` Andy Wingo
@ 2011-03-28 20:44   ` Noah Lavine
  2011-03-28 20:46     ` Noah Lavine
                       ` (2 more replies)
  0 siblings, 3 replies; 10+ messages in thread
From: Noah Lavine @ 2011-03-28 20:44 UTC (permalink / raw)
  To: Andy Wingo; +Cc: guile-devel

[-- Attachment #1: Type: text/plain, Size: 3312 bytes --]

Hi,

> I think the solution is to confront the circularity directly.  It exists
> because the PEG s-exp grammar also deals with the string grammar, which
> needs an already-build PEG parser.
>
> Let's break it instead into layers without cycles: removing the string
> grammar from the s-exp code generator.  If we want a layer with both, we
> build it on top of the two lower layers.
>
> What do you think?

I've been working on that. The attached two patches break the
circularity. The code still isn't organized brilliantly, but after
applying these I think we would only want pretty minor cleanups before
merging PEG into the main branch.

However, there's an interesting issue which I am not sure how to
confront. Here it is:

Currently, peg-sexp-compile is defined as a big case statement:

(define (peg-sexp-compile pattern accum)
  (syntax-case pattern (....)
    <lots of cases here>))

What these patches do is take out the case for embedded PEG strings,
so the case statement has one fewer case. Then they add a new function
peg-extended-compile, defined by

(define (peg-extended-compile pattern accum)
  (syntax-case pattern (peg)
    ((peg str)
     (string? (syntax->datum #'str))
     (peg-string-compile #'str (if (eq? accum 'all) 'body accum)))
    (else (peg-sexp-compile pattern accum))))

peg-string-compile takes a string, parses it, and then calls
peg-sexp-compile on the result, so this is noncircular.

Unfortunately, this sacrifices a feature. The trouble is that the
cases in peg-sexp-compile call peg-sexp-compile on parts of
themselves, because PEG expressions are recursive. Those inner PEG
expressions can never contain embedded string PEGs with this
definition, because those calls never go through peg-extended-compile.

I see a few options:
 - say that string PEGs can only occur at the top level of a PEG
expression. The peg module has never been released, so no one uses
this feature now anyway.
 - instead of defining a new function peg-extended-compile, redefine
peg-sexp-compile via set! once we have string pegs.
 - write peg-extended-compile as its own big case statement, basically
duplicating peg-sexp-compile.
 - adopt some interface that allows people to extend the cases in
peg-sexp-compile. We would start with just s-expression PEGs, then use
this interface to add string PEGs later in the load sequence.

The second and third options seem hackish to me. The third option is
especially bad because I think some of the calls to peg-sexp-compile
are in helper functions that peg-sexp-compile calls, so we might have
to duplicate most of codegen.scm to make this work.

The fourth option seems elegant, but I'm not sure what a good
interface for that is. Is there anything in Guile now that can
idiomatically be used for an extensible list of cases? It seems almost
like something GOOPS would do, but not quite. I am also a bit
concerned about the fourth option because it could become an interface
that is only ever used once, and might just add unnecessary
complexity.

I think the first option is the best one for now, because it doesn't
require much work and it would allow a smooth transition if we ever
enable non-top-level PEG strings in the future. What do other people
think?

Noah

[-- Attachment #2: 0001-Move-define-nonterm.patch --]
[-- Type: application/octet-stream, Size: 11103 bytes --]

From 296a31d0ecf3a6f758871f9c3dc2b6937592b25d Mon Sep 17 00:00:00 2001
From: Noah Lavine <nlavine@haverford.edu>
Date: Mon, 28 Mar 2011 15:13:35 -0400
Subject: [PATCH 1/2] Move define-nonterm

* module/ice-9/peg/string-peg.scm: remove define-nonterm and make a simpler
   macro called `define-sexp-parser' to make the PEG grammar
* module/ice-9/peg.scm: move define-nonterm macro to this file
* module/ice-9/peg/codegen.scm: move code to wrap a parser result nicely to
   this file, under name `wrap-parser-for-users'
---
 module/ice-9/peg.scm            |   33 +++++++++++-
 module/ice-9/peg/codegen.scm    |   29 ++++++++++-
 module/ice-9/peg/string-peg.scm |  107 +++++++++++----------------------------
 3 files changed, 89 insertions(+), 80 deletions(-)

diff --git a/module/ice-9/peg.scm b/module/ice-9/peg.scm
index 644af6d..4f4bbf8 100644
--- a/module/ice-9/peg.scm
+++ b/module/ice-9/peg.scm
@@ -19,7 +19,7 @@
 
 (define-module (ice-9 peg)
   #:export (peg-parse
-;            define-nonterm
+            define-nonterm
 ;            define-nonterm-f
             peg-match)
 ;  #:export-syntax (define-nonterm)
@@ -30,7 +30,7 @@
   #:re-export (peg-sexp-compile
                define-grammar
                define-grammar-f
-               define-nonterm
+;               define-nonterm
                keyword-flatten
                context-flatten
                peg:start
@@ -67,6 +67,35 @@ execute the STMTs and try again."
         #f
         (make-prec 0 (car res) string (string-collapse (cadr res))))))
 
+;; The results of parsing using a nonterminal are cached.  Think of it like a
+;; hash with no conflict resolution.  Process for deciding on the cache size
+;; wasn't very scientific; just ran the benchmarks and stopped a little after
+;; the point of diminishing returns on my box.
+(define *cache-size* 512)
+
+;; Defines a new nonterminal symbol accumulating with ACCUM.
+(define-syntax define-nonterm
+  (lambda (x)
+    (syntax-case x ()
+      ((_ sym accum pat)
+       (let ((matchf (peg-sexp-compile #'pat (syntax->datum #'accum)))
+             (accumsym (syntax->datum #'accum))
+             (c (datum->syntax x (gensym))));; the cache
+         ;; CODE is the code to parse the string if the result isn't cached.
+         (let ((syn (wrap-parser-for-users x matchf accumsym #'sym)))
+           #`(begin
+               (define #,c (make-vector *cache-size* #f));; the cache
+               (define (sym str strlen at)
+                 (let* ((vref (vector-ref #,c (modulo at *cache-size*))))
+                   ;; Check to see whether the value is cached.
+                   (if (and vref (eq? (car vref) str) (= (cadr vref) at))
+                       (caddr vref);; If it is return it.
+                       (let ((fres ;; Else calculate it and cache it.
+                              (#,syn str strlen at)))
+                         (vector-set! #,c (modulo at *cache-size*)
+                                      (list str at fres))
+                         fres)))))))))))
+
 ;; Searches through STRING for something that parses to PEG-MATCHER.  Think
 ;; regexp search.
 (define-syntax peg-match
diff --git a/module/ice-9/peg/codegen.scm b/module/ice-9/peg/codegen.scm
index 2c85ccc..0804d1e 100644
--- a/module/ice-9/peg/codegen.scm
+++ b/module/ice-9/peg/codegen.scm
@@ -18,7 +18,7 @@
 ;;;;
 
 (define-module (ice-9 peg codegen)
-  #:export (peg-sexp-compile)
+  #:export (peg-sexp-compile wrap-parser-for-users)
   #:use-module (ice-9 peg)
   #:use-module (ice-9 peg string-peg)
   #:use-module (ice-9 pretty-print)
@@ -244,3 +244,30 @@ return EXP."
                       (lit
                        #`(and success
                               #,(cggr accum 'cg-body #'(reverse body) #'new-end)))))))))))
+
+;; Packages the results of a parser
+(define (wrap-parser-for-users for-syntax parser accumsym s-syn)
+   #`(lambda (str strlen at)
+      (let ((res (#,parser str strlen at)))
+        ;; Try to match the nonterminal.
+        (if res
+            ;; If we matched, do some post-processing to figure out
+            ;; what data to propagate upward.
+            (let ((at (car res))
+                  (body (cadr res)))
+              #,(cond
+                 ((eq? accumsym 'name)
+                  #`(list at '#,s-syn))
+                 ((eq? accumsym 'all)
+                  #`(list (car res)
+                          (cond
+                           ((not (list? body))
+                            (list '#,s-syn body))
+                           ((null? body) '#,s-syn)
+                           ((symbol? (car body))
+                            (list '#,s-syn body))
+                           (else (cons '#,s-syn body)))))
+                 ((eq? accumsym 'none) #`(list (car res) '()))
+                 (else #`(begin res))))
+            ;; If we didn't match, just return false.
+            #f))))
diff --git a/module/ice-9/peg/string-peg.scm b/module/ice-9/peg/string-peg.scm
index f7e21f6..a899727 100644
--- a/module/ice-9/peg/string-peg.scm
+++ b/module/ice-9/peg/string-peg.scm
@@ -22,16 +22,11 @@
             peg-as-peg
             define-grammar
             define-grammar-f
-            define-nonterm
             peg-grammar)
   #:use-module (ice-9 peg)
-  #:use-module (ice-9 peg codegen))
-
-;; The results of parsing using a nonterminal are cached.  Think of it like a
-;; hash with no conflict resolution.  Process for deciding on the cache size
-;; wasn't very scientific; just ran the benchmarks and stopped a little after
-;; the point of diminishing returns on my box.
-(define *cache-size* 512)
+  #:use-module (ice-9 peg codegen)
+  #:use-module (ice-9 peg match-record)
+  #:use-module (ice-9 peg simplify-tree))
 
 ;; Gets the left-hand depth of a list.
 (define (depth lst)
@@ -39,58 +34,6 @@
       0
       (+ 1 (depth (car lst)))))
 
-(eval-when (compile load eval)
-(define (syntax-for-non-cache-case for-syntax matchf-syn accumsym s-syn)
-;  (let ((matchf-syn (datum->syntax for-syntax matchf)))
-   #`(lambda (str strlen at)
-      (let ((res (#,matchf-syn str strlen at)))
-        ;; Try to match the nonterminal.
-        (if res
-            ;; If we matched, do some post-processing to figure out
-            ;; what data to propagate upward.
-            (let ((at (car res))
-                  (body (cadr res)))
-              #,(cond
-                 ((eq? accumsym 'name)
-                  #`(list at '#,s-syn))
-                 ((eq? accumsym 'all)
-                  #`(list (car res)
-                          (cond
-                           ((not (list? body))
-                            (list '#,s-syn body))
-                           ((null? body) '#,s-syn)
-                           ((symbol? (car body))
-                            (list '#,s-syn body))
-                           (else (cons '#,s-syn body)))))
-                 ((eq? accumsym 'none) #`(list (car res) '()))
-                 (else #`(begin res))))
-            ;; If we didn't match, just return false.
-            #f))))
-)
-
-;; Defines a new nonterminal symbol accumulating with ACCUM.
-(define-syntax define-nonterm
-  (lambda (x)
-    (syntax-case x ()
-      ((_ sym accum pat)
-       (let ((matchf (peg-sexp-compile #'pat (syntax->datum #'accum)))
-             (accumsym (syntax->datum #'accum))
-             (c (datum->syntax x (gensym))));; the cache
-         ;; CODE is the code to parse the string if the result isn't cached.
-         (let ((syn (syntax-for-non-cache-case x matchf accumsym #'sym)))
-           #`(begin
-               (define #,c (make-vector *cache-size* #f));; the cache
-               (define (sym str strlen at)
-                 (let* ((vref (vector-ref #,c (modulo at *cache-size*))))
-                   ;; Check to see whether the value is cached.
-                   (if (and vref (eq? (car vref) str) (= (cadr vref) at))
-                       (caddr vref);; If it is return it.
-                       (let ((fres ;; Else calculate it and cache it.
-                              (#,syn str strlen at)))
-                         (vector-set! #,c (modulo at *cache-size*)
-                                      (list str at fres))
-                         fres)))))))))))
-
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;;;; Parse string PEGs using sexp PEGs.
 ;; See the variable PEG-AS-PEG for an easier-to-read syntax.
@@ -114,34 +57,43 @@ LB < '['
 RB < ']'
 ")
 
-(define-nonterm peg-grammar all
+(define-syntax define-sexp-parser
+  (lambda (x)
+    (syntax-case x ()
+      ((_ sym accum pat)
+       (let* ((matchf (peg-sexp-compile #'pat (syntax->datum #'accum)))
+              (accumsym (syntax->datum #'accum))
+              (syn (wrap-parser-for-users x matchf accumsym #'sym)))
+           #`(define sym #,syn))))))
+
+(define-sexp-parser peg-grammar all
   (body lit (and peg-nonterminal (or "<--" "<-" "<") peg-sp peg-pattern) +))
-(define-nonterm peg-pattern all
+(define-sexp-parser peg-pattern all
   (and peg-alternative
        (body lit (and (ignore "/") peg-sp peg-alternative) *)))
-(define-nonterm peg-alternative all
+(define-sexp-parser peg-alternative all
   (body lit (and (body lit (or "!" "&") ?) peg-sp peg-suffix) +))
-(define-nonterm peg-suffix all
+(define-sexp-parser peg-suffix all
   (and peg-primary (body lit (and (or "*" "+" "?") peg-sp) *)))
-(define-nonterm peg-primary all
+(define-sexp-parser peg-primary all
   (or (and "(" peg-sp peg-pattern ")" peg-sp)
       (and "." peg-sp)
       peg-literal
       peg-charclass
       (and peg-nonterminal (body ! "<" 1))))
-(define-nonterm peg-literal all
+(define-sexp-parser peg-literal all
   (and "'" (body lit (and (body ! "'" 1) peg-any) *) "'" peg-sp))
-(define-nonterm peg-charclass all
+(define-sexp-parser peg-charclass all
   (and (ignore "[")
        (body lit (and (body ! "]" 1)
                       (or charclass-range charclass-single)) *)
        (ignore "]")
        peg-sp))
-(define-nonterm charclass-range all (and peg-any "-" peg-any))
-(define-nonterm charclass-single all peg-any)
-(define-nonterm peg-nonterminal all
+(define-sexp-parser charclass-range all (and peg-any "-" peg-any))
+(define-sexp-parser charclass-single all peg-any)
+(define-sexp-parser peg-nonterminal all
   (and (body lit (or (range #\a #\z) (range #\A #\Z) (range #\0 #\9) "-") +) peg-sp))
-(define-nonterm peg-sp none
+(define-sexp-parser peg-sp none
   (body lit (or " " "\t" "\n") *))
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -297,9 +249,10 @@ RB < ']'
 
 ;; Builds a lambda-expressions for the pattern STR using accum.
 (define (peg-string-compile str-stx accum)
-  (peg-sexp-compile
-   (compressor
-    (peg-pattern->defn
-     (peg:tree (peg-parse peg-pattern (syntax->datum str-stx))) str-stx)
-    str-stx)
-   accum))
+  (let ((string (syntax->datum str-stx)))
+    (peg-sexp-compile
+     (compressor
+      (peg-pattern->defn
+       (peg:tree (peg-parse peg-pattern string)) str-stx)
+      str-stx)
+     accum)))
-- 
1.7.4.1


[-- Attachment #3: 0002-Separate-PEG-Concerns.patch --]
[-- Type: application/octet-stream, Size: 2314 bytes --]

From 3febf116a67af979ff4f692693a509b317354ff7 Mon Sep 17 00:00:00 2001
From: Noah Lavine <nlavine@haverford.edu>
Date: Mon, 28 Mar 2011 15:18:27 -0400
Subject: [PATCH 2/2] Separate PEG Concerns

* module/ice-9/peg/codegen.scm: peg-sexp-compile no longer knows about
   string PEGs
* module/ice-9/peg.scm: add a new function peg-extended-compile that
   calls peg-sexp-compile or peg-string-compile on its argument as
   appropriate
---
 module/ice-9/peg.scm         |    9 ++++++++-
 module/ice-9/peg/codegen.scm |    3 ---
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/module/ice-9/peg.scm b/module/ice-9/peg.scm
index 4f4bbf8..58e35ce 100644
--- a/module/ice-9/peg.scm
+++ b/module/ice-9/peg.scm
@@ -67,6 +67,13 @@ execute the STMTs and try again."
         #f
         (make-prec 0 (car res) string (string-collapse (cadr res))))))
 
+(define (peg-extended-compile pattern accum)
+  (syntax-case pattern (peg)
+    ((peg str)
+     (string? (syntax->datum #'str))
+     (peg-string-compile #'str (if (eq? accum 'all) 'body accum)))
+    (else (peg-sexp-compile pattern accum))))
+
 ;; The results of parsing using a nonterminal are cached.  Think of it like a
 ;; hash with no conflict resolution.  Process for deciding on the cache size
 ;; wasn't very scientific; just ran the benchmarks and stopped a little after
@@ -78,7 +85,7 @@ execute the STMTs and try again."
   (lambda (x)
     (syntax-case x ()
       ((_ sym accum pat)
-       (let ((matchf (peg-sexp-compile #'pat (syntax->datum #'accum)))
+       (let ((matchf (peg-extended-compile #'pat (syntax->datum #'accum)))
              (accumsym (syntax->datum #'accum))
              (c (datum->syntax x (gensym))));; the cache
          ;; CODE is the code to parse the string if the result isn't cached.
diff --git a/module/ice-9/peg/codegen.scm b/module/ice-9/peg/codegen.scm
index 0804d1e..8dd507c 100644
--- a/module/ice-9/peg/codegen.scm
+++ b/module/ice-9/peg/codegen.scm
@@ -164,9 +164,6 @@ return EXP."
      (peg-sexp-compile #'pat 'none))
     ((capture pat) ;; parse
      (peg-sexp-compile #'pat 'body))
-    ((peg pat)  ;; embedded PEG string
-     (string? (syntax->datum #'pat))
-     (peg-string-compile #'pat (baf accum)))
     ((and pat ...)
      (cg-and #'(pat ...) (baf accum)))
     ((or pat ...)
-- 
1.7.4.1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: PEG Patches
  2011-03-28 20:44   ` Noah Lavine
@ 2011-03-28 20:46     ` Noah Lavine
  2011-03-28 22:17     ` Michael Lucy
  2011-03-29  8:00     ` Andy Wingo
  2 siblings, 0 replies; 10+ messages in thread
From: Noah Lavine @ 2011-03-28 20:46 UTC (permalink / raw)
  To: Andy Wingo; +Cc: guile-devel

> I've been working on that. The attached two patches break the
> circularity. The code still isn't organized brilliantly, but after
> applying these I think we would only want pretty minor cleanups before
> merging PEG into the main branch.

Actually, forget this bit. I wrote it before I remembered that the
s-expression language will probably need to be changed.



^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: PEG Patches
  2011-03-28 20:44   ` Noah Lavine
  2011-03-28 20:46     ` Noah Lavine
@ 2011-03-28 22:17     ` Michael Lucy
  2011-03-29  8:00     ` Andy Wingo
  2 siblings, 0 replies; 10+ messages in thread
From: Michael Lucy @ 2011-03-28 22:17 UTC (permalink / raw)
  To: Noah Lavine; +Cc: Andy Wingo, guile-devel

A variant on the second option would be first defining
peg-string-compile to just throw an error, then redefining it later to
actually compile the string.  That seems a little less hackish, at
least to me.

A fifth option would be to make peg-sexp-compile take an optional
argument FUN-RECUR that it will call instead of recursing into itself
(so in your example FUN-RECUR would be peg-extended-compile).  This
involves more rewriting than the other options to pass the optional
argument around, but it's pretty clean and would allow users to write
other parsing layers on top of peg-sexp-compile should they wish
(achieving similar results to the fourth option).

On Mon, Mar 28, 2011 at 3:44 PM, Noah Lavine <noah.b.lavine@gmail.com> wrote:
> Hi,
>
>> I think the solution is to confront the circularity directly.  It exists
>> because the PEG s-exp grammar also deals with the string grammar, which
>> needs an already-build PEG parser.
>>
>> Let's break it instead into layers without cycles: removing the string
>> grammar from the s-exp code generator.  If we want a layer with both, we
>> build it on top of the two lower layers.
>>
>> What do you think?
>
> I've been working on that. The attached two patches break the
> circularity. The code still isn't organized brilliantly, but after
> applying these I think we would only want pretty minor cleanups before
> merging PEG into the main branch.
>
> However, there's an interesting issue which I am not sure how to
> confront. Here it is:
>
> Currently, peg-sexp-compile is defined as a big case statement:
>
> (define (peg-sexp-compile pattern accum)
>  (syntax-case pattern (....)
>    <lots of cases here>))
>
> What these patches do is take out the case for embedded PEG strings,
> so the case statement has one fewer case. Then they add a new function
> peg-extended-compile, defined by
>
> (define (peg-extended-compile pattern accum)
>  (syntax-case pattern (peg)
>    ((peg str)
>     (string? (syntax->datum #'str))
>     (peg-string-compile #'str (if (eq? accum 'all) 'body accum)))
>    (else (peg-sexp-compile pattern accum))))
>
> peg-string-compile takes a string, parses it, and then calls
> peg-sexp-compile on the result, so this is noncircular.
>
> Unfortunately, this sacrifices a feature. The trouble is that the
> cases in peg-sexp-compile call peg-sexp-compile on parts of
> themselves, because PEG expressions are recursive. Those inner PEG
> expressions can never contain embedded string PEGs with this
> definition, because those calls never go through peg-extended-compile.
>
> I see a few options:
>  - say that string PEGs can only occur at the top level of a PEG
> expression. The peg module has never been released, so no one uses
> this feature now anyway.
>  - instead of defining a new function peg-extended-compile, redefine
> peg-sexp-compile via set! once we have string pegs.
>  - write peg-extended-compile as its own big case statement, basically
> duplicating peg-sexp-compile.
>  - adopt some interface that allows people to extend the cases in
> peg-sexp-compile. We would start with just s-expression PEGs, then use
> this interface to add string PEGs later in the load sequence.
>
> The second and third options seem hackish to me. The third option is
> especially bad because I think some of the calls to peg-sexp-compile
> are in helper functions that peg-sexp-compile calls, so we might have
> to duplicate most of codegen.scm to make this work.
>
> The fourth option seems elegant, but I'm not sure what a good
> interface for that is. Is there anything in Guile now that can
> idiomatically be used for an extensible list of cases? It seems almost
> like something GOOPS would do, but not quite. I am also a bit
> concerned about the fourth option because it could become an interface
> that is only ever used once, and might just add unnecessary
> complexity.
>
> I think the first option is the best one for now, because it doesn't
> require much work and it would allow a smooth transition if we ever
> enable non-top-level PEG strings in the future. What do other people
> think?
>
> Noah
>



^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: PEG Patches
  2011-03-28 20:44   ` Noah Lavine
  2011-03-28 20:46     ` Noah Lavine
  2011-03-28 22:17     ` Michael Lucy
@ 2011-03-29  8:00     ` Andy Wingo
  2011-03-29 12:47       ` Noah Lavine
  2 siblings, 1 reply; 10+ messages in thread
From: Andy Wingo @ 2011-03-29  8:00 UTC (permalink / raw)
  To: Noah Lavine; +Cc: guile-devel

On Mon 28 Mar 2011 22:44, Noah Lavine <noah.b.lavine@gmail.com> writes:

>  - say that string PEGs can only occur at the top level of a PEG
> expression. The peg module has never been released, so no one uses
> this feature now anyway.
>  - instead of defining a new function peg-extended-compile, redefine
> peg-sexp-compile via set! once we have string pegs.
>  - write peg-extended-compile as its own big case statement, basically
> duplicating peg-sexp-compile.
>  - adopt some interface that allows people to extend the cases in
> peg-sexp-compile. We would start with just s-expression PEGs, then use
> this interface to add string PEGs later in the load sequence.

This last is the best.  What if we define a module that serves as a
registry of PEG match behaviors, like `(ice-9 peg matchers)'.  Then we
define `define-peg-matcher' or something, so that we can:

(define-peg-matcher and cg-and)

where define-peg-matcher is

(define-syntax define-peg-matcher
  (syntax-rules ()
    ((_ name binding)
     (module-define! (resolve-module '(ice-9 peg matchers))
                      'name
                      binding))))

Then instead of defining separate cases for ignore, range, etc the
peg-sexp-compile macro does:

  ((matcher arg ...) (identifier? #'matcher)
   ((module-ref (resolve-module '(ice-9 peg matchers))
                (syntax->datum #'matcher))
    #'(arg ...)
    mode))

Then the peg-string module registers a matcher for `peg'.

Dunno.  WDYT?

Andy
-- 
http://wingolog.org/



^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: PEG Patches
  2011-03-29  8:00     ` Andy Wingo
@ 2011-03-29 12:47       ` Noah Lavine
  2011-03-29 13:20         ` Andy Wingo
  0 siblings, 1 reply; 10+ messages in thread
From: Noah Lavine @ 2011-03-29 12:47 UTC (permalink / raw)
  To: Andy Wingo; +Cc: guile-devel

> This last is the best.  What if we define a module that serves as a
> registry of PEG match behaviors, like `(ice-9 peg matchers)'.  Then we
> define `define-peg-matcher' or something, so that we can:
>
> (define-peg-matcher and cg-and)
>
> where define-peg-matcher is
>
> (define-syntax define-peg-matcher
>  (syntax-rules ()
>    ((_ name binding)
>     (module-define! (resolve-module '(ice-9 peg matchers))
>                      'name
>                      binding))))
>
> Then instead of defining separate cases for ignore, range, etc the
> peg-sexp-compile macro does:
>
>  ((matcher arg ...) (identifier? #'matcher)
>   ((module-ref (resolve-module '(ice-9 peg matchers))
>                (syntax->datum #'matcher))
>    #'(arg ...)
>    mode))
>
> Then the peg-string module registers a matcher for `peg'.
>
> Dunno.  WDYT?

That's doable. But if we're going to choose what to do entirely based
on the first element of the list, then we could also just not define
peg-sexp-compile at all and make each of the code generation functions
into macros.

How does that sound?



^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: PEG Patches
  2011-03-29 12:47       ` Noah Lavine
@ 2011-03-29 13:20         ` Andy Wingo
  2011-03-31 21:48           ` Noah Lavine
  0 siblings, 1 reply; 10+ messages in thread
From: Andy Wingo @ 2011-03-29 13:20 UTC (permalink / raw)
  To: Noah Lavine; +Cc: guile-devel

On Tue 29 Mar 2011 14:47, Noah Lavine <noah.b.lavine@gmail.com> writes:

>> (define-peg-matcher and cg-and)
>
> That's doable. But if we're going to choose what to do entirely based
> on the first element of the list, then we could also just not define
> peg-sexp-compile at all and make each of the code generation functions
> into macros.
>
> How does that sound?

Good idea.  Sounds great to me!

Andy
-- 
http://wingolog.org/



^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: PEG Patches
  2011-03-29 13:20         ` Andy Wingo
@ 2011-03-31 21:48           ` Noah Lavine
  0 siblings, 0 replies; 10+ messages in thread
From: Noah Lavine @ 2011-03-31 21:48 UTC (permalink / raw)
  To: Andy Wingo; +Cc: guile-devel

[-- Attachment #1: Type: text/plain, Size: 1426 bytes --]

Hello again,

I was about to do this, and then I discovered that it wouldn't work,
because there are a few special case PEGs that don't make sense as
macros. Specifically, in the context of a PEG, we interpret strings as
matching themselves, and those can't be made into macros.

So I went ahead and implemented a simple way to extend
peg-sexp-compile. It turned out to be much less difficult than I was
afraid of.

The first attached patch adds the interface to (ice-9 peg codegen) and
changes most of the functions there to use it, and also adds some
documentation in the PEG Internals section. The second one updates
(ice-9 peg string-peg) to use it as well, and gets rid of
peg-extended-compile from peg.scm since it's no longer needed.

I wrote the patches on top of the last two that I sent, because those
included some cleanups that I wanted to keep.

Noah

On Tue, Mar 29, 2011 at 9:20 AM, Andy Wingo <wingo@pobox.com> wrote:
> On Tue 29 Mar 2011 14:47, Noah Lavine <noah.b.lavine@gmail.com> writes:
>
>>> (define-peg-matcher and cg-and)
>>
>> That's doable. But if we're going to choose what to do entirely based
>> on the first element of the list, then we could also just not define
>> peg-sexp-compile at all and make each of the code generation functions
>> into macros.
>>
>> How does that sound?
>
> Good idea.  Sounds great to me!
>
> Andy
> --
> http://wingolog.org/
>

[-- Attachment #2: 0001-Extensible-PEG-Syntax.patch --]
[-- Type: application/octet-stream, Size: 11741 bytes --]

From 0f28602bf3fde35e7bfc6fa38b7608b97af3c017 Mon Sep 17 00:00:00 2001
From: Noah Lavine <nlavine@haverford.edu>
Date: Thu, 31 Mar 2011 17:04:06 -0400
Subject: [PATCH 1/2] Extensible PEG Syntax

* module/ice-9/peg/codegen.scm: Make the PEG syntax extensible, and
    move most of the current code generators to the new interface
* doc/ref/api-peg.texi: Document PEG extensions in the PEG Internals
    section of the manual
---
 doc/ref/api-peg.texi         |   32 ++++++++
 module/ice-9/peg/codegen.scm |  176 ++++++++++++++++++++++++------------------
 2 files changed, 133 insertions(+), 75 deletions(-)

diff --git a/doc/ref/api-peg.texi b/doc/ref/api-peg.texi
index 0c83365..6d0a346 100644
--- a/doc/ref/api-peg.texi
+++ b/doc/ref/api-peg.texi
@@ -992,3 +992,35 @@ interface.
 
 The above function can be used to match a string by running
 @code{(peg-parse match-a-b "ab")}.
+
+@subsubheading Code Generators and Extensible Syntax
+
+PEG expressions, such as those in a @code{define-nonterm} form, are
+interpreted internally in two steps.
+
+First, any string PEG is expanded into an s-expression PEG by the code
+in the @code{(ice-9 peg string-peg)} module.
+
+Then, then s-expression PEG that results is compiled into a parsing
+function by the @code{(ice-9 peg codegen)} module. In particular, the
+function @code{peg-sexp-compile} is called on the s-expression. It then
+decides what to do based on the form it is passed.
+
+The PEG syntax can be expanded by providing @code{peg-sexp-compile} more
+options for what to do with its forms. The extended syntax will be
+associated with a symbol, for instance @code{my-parsing-form}, and will
+be called on all PEG expressions of the form
+@lisp
+(my-parsing-form ...)
+@end lisp
+
+The parsing function should take two arguments. The first will be a
+syntax object containing a list with all of the arguments to the form
+(but not the form's name), and the second will be the
+@code{capture-type} argument that is passed to @code{define-nonterm}.
+
+New functions can be registered by calling @code{(add-peg-compiler!
+symbol function)}, where @code{symbol} is the symbol that will indicate
+a form of this type and @code{function} is the code generating function
+described above. The function @code{add-peg-compiler!} is exported from
+the @code{(ice-9 peg codegen)} module.
diff --git a/module/ice-9/peg/codegen.scm b/module/ice-9/peg/codegen.scm
index 8dd507c..597ead9 100644
--- a/module/ice-9/peg/codegen.scm
+++ b/module/ice-9/peg/codegen.scm
@@ -18,9 +18,7 @@
 ;;;;
 
 (define-module (ice-9 peg codegen)
-  #:export (peg-sexp-compile wrap-parser-for-users)
-  #:use-module (ice-9 peg)
-  #:use-module (ice-9 peg string-peg)
+  #:export (peg-sexp-compile wrap-parser-for-users add-peg-compiler!)
   #:use-module (ice-9 pretty-print)
   #:use-module (system base pmatch))
 
@@ -123,18 +121,35 @@ return EXP."
 
 ;; Generates code for matching a range of characters between start and end.
 ;; E.g.: (cg-range syntax #\a #\z 'body)
-(define (cg-range start end accum)
-  #`(lambda (str len pos)
-      (and (< pos len)
-           (let ((c (string-ref str pos)))
-             (and (char>=? c #,start)
-                  (char<=? c #,end)
-                  #,(case accum
-                      ((all) #`(list (1+ pos) (list 'cg-range (string c))))
-                      ((name) #`(list (1+ pos) 'cg-range))
-                      ((body) #`(list (1+ pos) (string c)))
-                      ((none) #`(list (1+ pos) '()))
-                      (else (error "bad accum" accum))))))))
+(define (cg-range pat accum)
+  (syntax-case pat ()
+    ((start end)
+     (if (not (and (char? (syntax->datum #'start))
+                   (char? (syntax->datum #'end))))
+         (error "range PEG should have characters after it; instead got"
+                #'start #'end))
+     #`(lambda (str len pos)
+         (and (< pos len)
+              (let ((c (string-ref str pos)))
+                (and (char>=? c start)
+                     (char<=? c end)
+                     #,(case accum
+                         ((all) #`(list (1+ pos) (list 'cg-range (string c))))
+                         ((name) #`(list (1+ pos) 'cg-range))
+                         ((body) #`(list (1+ pos) (string c)))
+                         ((none) #`(list (1+ pos) '()))
+                         (else (error "bad accum" accum))))))))))
+
+;; Generate code to match a pattern and do nothing with the result
+(define (cg-ignore pat accum)
+  (syntax-case pat ()
+    ((inner)
+     (peg-sexp-compile #'inner 'none))))
+
+(define (cg-capture pat accum)
+  (syntax-case pat ()
+    ((inner)
+     (peg-sexp-compile #'inner 'body))))
 
 ;; Filters the accum argument to peg-sexp-compile for buildings like string
 ;; literals (since we don't want to tag them with their name if we're doing an
@@ -147,35 +162,11 @@ return EXP."
    ((eq? accum 'none) 'none)))
 (define baf builtin-accum-filter)
 
-;; Takes an arbitrary expressions and accumulation variable, then parses it.
-;; E.g.: (peg-sexp-compile syntax '(and "abc" (or "-" (range #\a #\z))) 'all)
-(define (peg-sexp-compile pat accum)
-  (syntax-case pat (peg-any range ignore capture peg and or body)
-    (peg-any
-     (cg-peg-any (baf accum)))
-    (sym (identifier? #'sym) ;; nonterminal
-     #'sym)
-    (str (string? (syntax->datum #'str)) ;; literal string
-     (cg-string (syntax->datum #'str) (baf accum)))
-    ((range start end) ;; range of characters (e.g. [a-z])
-     (and (char? (syntax->datum #'start)) (char? (syntax->datum #'end)))
-     (cg-range (syntax->datum #'start) (syntax->datum #'end) (baf accum)))
-    ((ignore pat) ;; match but don't parse
-     (peg-sexp-compile #'pat 'none))
-    ((capture pat) ;; parse
-     (peg-sexp-compile #'pat 'body))
-    ((and pat ...)
-     (cg-and #'(pat ...) (baf accum)))
-    ((or pat ...)
-     (cg-or #'(pat ...) (baf accum)))
-    ((body type pat num)
-     (cg-body (baf accum) #'type #'pat #'num))))
-
 ;; Top-level function builder for AND.  Reduces to a call to CG-AND-INT.
 (define (cg-and clauses accum)
   #`(lambda (str len pos)
       (let ((body '()))
-        #,(cg-and-int clauses accum #'str #'len #'pos #'body))))
+        #,(cg-and-int clauses (baf accum) #'str #'len #'pos #'body))))
 
 ;; Internal function builder for AND (calls itself).
 (define (cg-and-int clauses accum str strlen at body)
@@ -195,7 +186,7 @@ return EXP."
 ;; Top-level function builder for OR.  Reduces to a call to CG-OR-INT.
 (define (cg-or clauses accum)
   #`(lambda (str len pos)
-      #,(cg-or-int clauses accum #'str #'len #'pos)))
+      #,(cg-or-int clauses (baf accum) #'str #'len #'pos)))
 
 ;; Internal function builder for OR (calls itself).
 (define (cg-or-int clauses accum str strlen at)
@@ -207,40 +198,75 @@ return EXP."
            #,(cg-or-int #'(rest ...) accum str strlen at)))))
 
 ;; Returns a function that parses a BODY element.
-(define (cg-body accum type pat num)
-  #`(lambda (str strlen at)
-      (let ((body '()))
-        (let lp ((end at) (count 0))
-          (let* ((match (#,(peg-sexp-compile pat accum) str strlen end))
-                 (new-end (if match (car match) end))
-                 (count (if (> new-end end) (1+ count) count)))
-            (if (> new-end end)
-                (push-not-null! body (single-filter (cadr match))))
-            (if (and (> new-end end)
-                     #,(syntax-case num (+ * ?)
-                         (n (number? (syntax->datum #'n))
-                            #'(< count n))
-                         (+ #t)
-                         (* #t)
-                         (? #'(< count 1))))
-                (lp new-end count)
-                (let ((success #,(syntax-case num (+ * ?)
-                                   (n (number? (syntax->datum #'n))
-                                      #'(= count n))
-                                   (+ #'(>= count 1))
-                                   (* #t)
-                                   (? #t))))
-                  #,(syntax-case type (! & lit)
-                      (!
-                       #`(if success
-                             #f
-                             #,(cggr accum 'cg-body #''() #'at)))
-                      (&
-                       #`(and success
-                              #,(cggr accum 'cg-body #''() #'at)))
-                      (lit
-                       #`(and success
-                              #,(cggr accum 'cg-body #'(reverse body) #'new-end)))))))))))
+(define (cg-body args accum)
+  (syntax-case args ()
+    ((type pat num)
+     #`(lambda (str strlen at)
+         (let ((body '()))
+           (let lp ((end at) (count 0))
+             (let* ((match (#,(peg-sexp-compile #'pat (baf accum))
+                            str strlen end))
+                    (new-end (if match (car match) end))
+                    (count (if (> new-end end) (1+ count) count)))
+               (if (> new-end end)
+                   (push-not-null! body (single-filter (cadr match))))
+               (if (and (> new-end end)
+                        #,(syntax-case #'num (+ * ?)
+                            (n (number? (syntax->datum #'n))
+                               #'(< count n))
+                            (+ #t)
+                            (* #t)
+                            (? #'(< count 1))))
+                   (lp new-end count)
+                   (let ((success #,(syntax-case #'num (+ * ?)
+                                      (n (number? (syntax->datum #'n))
+                                         #'(= count n))
+                                      (+ #'(>= count 1))
+                                      (* #t)
+                                      (? #t))))
+                     #,(syntax-case #'type (! & lit)
+                         (!
+                          #`(if success
+                                #f
+                                #,(cggr (baf accum) 'cg-body #''() #'at)))
+                         (&
+                          #`(and success
+                                 #,(cggr (baf accum) 'cg-body #''() #'at)))
+                         (lit
+                          #`(and success
+                                 #,(cggr (baf accum) 'cg-body
+                                         #'(reverse body) #'new-end)))))))))))))
+
+;; Association list of functions to handle different expressions as PEGs
+(define peg-compiler-alist '())
+
+(define (add-peg-compiler! symbol function)
+  (set! peg-compiler-alist
+        (assq-set! peg-compiler-alist symbol function)))
+
+(add-peg-compiler! 'range cg-range)
+(add-peg-compiler! 'ignore cg-ignore)
+(add-peg-compiler! 'capture cg-capture)
+(add-peg-compiler! 'and cg-and)
+(add-peg-compiler! 'or cg-or)
+(add-peg-compiler! 'body cg-body)
+
+;; Takes an arbitrary expressions and accumulation variable, then parses it.
+;; E.g.: (peg-sexp-compile syntax '(and "abc" (or "-" (range #\a #\z))) 'all)
+(define (peg-sexp-compile pat accum)
+  (syntax-case pat (peg-any range ignore capture peg and or body)
+    (peg-any
+     (cg-peg-any (baf accum)))
+    (sym (identifier? #'sym) ;; nonterminal
+     #'sym)
+    (str (string? (syntax->datum #'str)) ;; literal string
+     (cg-string (syntax->datum #'str) (baf accum)))
+    ((name . args) (let* ((nm (syntax->datum #'name))
+                          (entry (assq-ref peg-compiler-alist nm)))
+                     (if entry
+                         (entry #'args accum)
+                         (error "Bad peg form" nm #'args
+                                "Not one of" (map car peg-compiler-alist)))))))
 
 ;; Packages the results of a parser
 (define (wrap-parser-for-users for-syntax parser accumsym s-syn)
-- 
1.7.4.1


[-- Attachment #3: 0002-Update-String-PEGs.patch --]
[-- Type: application/octet-stream, Size: 4595 bytes --]

From a34f30694462ed7965cb885781dcfe6c45b04646 Mon Sep 17 00:00:00 2001
From: Noah Lavine <nlavine@haverford.edu>
Date: Thu, 31 Mar 2011 17:42:36 -0400
Subject: [PATCH 2/2] Update String PEGs

* module/ice-9/peg/string-peg.scm: use new interface for extending PEG
   syntax
* module/ice-9/peg.scm: remove peg-extended-compile
---
 module/ice-9/peg.scm            |   20 +++++++-------------
 module/ice-9/peg/string-peg.scm |   25 +++++++++++++++----------
 2 files changed, 22 insertions(+), 23 deletions(-)

diff --git a/module/ice-9/peg.scm b/module/ice-9/peg.scm
index 58e35ce..730e048 100644
--- a/module/ice-9/peg.scm
+++ b/module/ice-9/peg.scm
@@ -22,7 +22,6 @@
             define-nonterm
 ;            define-nonterm-f
             peg-match)
-;  #:export-syntax (define-nonterm)
   #:use-module (ice-9 peg codegen)
   #:use-module (ice-9 peg string-peg)
   #:use-module (ice-9 peg simplify-tree)
@@ -30,7 +29,6 @@
   #:re-export (peg-sexp-compile
                define-grammar
                define-grammar-f
-;               define-nonterm
                keyword-flatten
                context-flatten
                peg:start
@@ -67,13 +65,6 @@ execute the STMTs and try again."
         #f
         (make-prec 0 (car res) string (string-collapse (cadr res))))))
 
-(define (peg-extended-compile pattern accum)
-  (syntax-case pattern (peg)
-    ((peg str)
-     (string? (syntax->datum #'str))
-     (peg-string-compile #'str (if (eq? accum 'all) 'body accum)))
-    (else (peg-sexp-compile pattern accum))))
-
 ;; The results of parsing using a nonterminal are cached.  Think of it like a
 ;; hash with no conflict resolution.  Process for deciding on the cache size
 ;; wasn't very scientific; just ran the benchmarks and stopped a little after
@@ -85,7 +76,7 @@ execute the STMTs and try again."
   (lambda (x)
     (syntax-case x ()
       ((_ sym accum pat)
-       (let ((matchf (peg-extended-compile #'pat (syntax->datum #'accum)))
+       (let ((matchf (peg-sexp-compile #'pat (syntax->datum #'accum)))
              (accumsym (syntax->datum #'accum))
              (c (datum->syntax x (gensym))));; the cache
          ;; CODE is the code to parse the string if the result isn't cached.
@@ -103,6 +94,11 @@ execute the STMTs and try again."
                                       (list str at fres))
                          fres)))))))))))
 
+(define (peg-like->peg pat)
+  (syntax-case pat ()
+    (str (string? (syntax->datum #'str)) #'(peg str))
+    (else pat)))
+
 ;; Searches through STRING for something that parses to PEG-MATCHER.  Think
 ;; regexp search.
 (define-syntax peg-match
@@ -110,9 +106,7 @@ execute the STMTs and try again."
     (syntax-case x ()
       ((_ pattern string-uncopied)
        (let ((pmsym (syntax->datum #'pattern)))
-         (let ((matcher (if (string? (syntax->datum #'pattern))
-                            (peg-string-compile #'pattern 'body)
-                            (peg-sexp-compile #'pattern 'body))))
+         (let ((matcher (peg-sexp-compile (peg-like->peg #'pattern) 'body)))
            ;; We copy the string before using it because it might have been
            ;; modified in-place since the last time it was parsed, which would
            ;; invalidate the cache.  Guile uses copy-on-write for strings, so
diff --git a/module/ice-9/peg/string-peg.scm b/module/ice-9/peg/string-peg.scm
index a899727..181ec05 100644
--- a/module/ice-9/peg/string-peg.scm
+++ b/module/ice-9/peg/string-peg.scm
@@ -18,8 +18,7 @@
 ;;;;
 
 (define-module (ice-9 peg string-peg)
-  #:export (peg-string-compile
-            peg-as-peg
+  #:export (peg-as-peg
             define-grammar
             define-grammar-f
             peg-grammar)
@@ -248,11 +247,17 @@ RB < ']'
                  (compressor-core (syntax->datum syn))))
 
 ;; Builds a lambda-expressions for the pattern STR using accum.
-(define (peg-string-compile str-stx accum)
-  (let ((string (syntax->datum str-stx)))
-    (peg-sexp-compile
-     (compressor
-      (peg-pattern->defn
-       (peg:tree (peg-parse peg-pattern string)) str-stx)
-      str-stx)
-     accum)))
+(define (peg-string-compile args accum)
+  (syntax-case args ()
+    ((str-stx) (string? (syntax->datum #'str-stx))
+     (let ((string (syntax->datum #'str-stx)))
+       (peg-sexp-compile
+        (compressor
+         (peg-pattern->defn
+          (peg:tree (peg-parse peg-pattern string)) #'str-stx)
+         #'str-stx)
+        (if (eq? accum 'all) 'body accum))))
+     (else (error "Bad embedded PEG string" args))))
+
+(add-peg-compiler! 'peg peg-string-compile)
+
-- 
1.7.4.1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2011-03-31 21:48 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2011-03-06  5:25 PEG Patches Noah Lavine
2011-03-07  1:28 ` Noah Lavine
2011-03-25 18:06 ` Andy Wingo
2011-03-28 20:44   ` Noah Lavine
2011-03-28 20:46     ` Noah Lavine
2011-03-28 22:17     ` Michael Lucy
2011-03-29  8:00     ` Andy Wingo
2011-03-29 12:47       ` Noah Lavine
2011-03-29 13:20         ` Andy Wingo
2011-03-31 21:48           ` Noah Lavine

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).