unofficial mirror of bug-guile@gnu.org 
 help / color / mirror / Atom feed
From: Ekaitz Zarraga <ekaitz@elenq.tech>
To: 73188@debbugs.gnu.org
Cc: ludo@gnu.org, Ekaitz Zarraga <ekaitz@elenq.tech>
Subject: bug#73188: [PATCH 3/3] PEG: add large string-peg patch
Date: Sun, 22 Dec 2024 21:01:08 +0100	[thread overview]
Message-ID: <20241222200128.13782-3-ekaitz@elenq.tech> (raw)
In-Reply-To: <20241222200128.13782-1-ekaitz@elenq.tech>

---
 test-suite/tests/peg.test | 117 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 113 insertions(+), 4 deletions(-)

diff --git a/test-suite/tests/peg.test b/test-suite/tests/peg.test
index d9e3e1b22..d8d047288 100644
--- a/test-suite/tests/peg.test
+++ b/test-suite/tests/peg.test
@@ -86,7 +86,7 @@
 End <-- '*)'
 C <- Begin N* End
 N <- C / (!Begin !End Z)
-Z <- [^X-Z]") ;; Forbid some characters to test not-in-range
+Z <- .")
 
 ;; A short /etc/passwd file.
 (define *etc-passwd*
@@ -126,9 +126,6 @@ SLASH < '/'")
     (match-pattern C "(*blah*)")
     (make-prec 0 8 "(*blah*)"
 	       '((Begin "(*") "blah" (End "*)")))))
-  (pass-if
-   "simple comment with forbidden char"
-   (not (match-pattern C "(*blYh*)")))
   (pass-if
    "simple comment padded"
    (equal?
@@ -288,3 +285,115 @@ number <-- [0-9]+")
    (equal? (eq-parse "1+1/2*3+(1+1)/2")
 	   '(+ (+ 1 (* (/ 1 2) 3)) (/ (+ 1 1) 2)))))
 
+
+(define html-grammar
+"
+# Based on code from https://github.com/Fantom-Factory/afHtmlParser
+# 2014-2023 Steve Eynon. This code was originally released under the following
+# terms:
+#
+#      Permission to use, copy, modify, and/or distribute this software for any
+#      purpose with or without fee is hereby granted, provided that the above
+#      copyright notice and this permission notice appear in all copies.
+#
+#      THE SOFTWARE IS PROVIDED \"AS IS\" AND THE AUTHOR DISCLAIMS ALL
+#      WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES
+#      OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE
+#      FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY
+#      DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
+#      IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+#      OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+# PEG Rules for parsing well formed HTML 5 documents
+# https://html.spec.whatwg.org/multipage/syntax.html
+
+html        <-- bom? blurb* doctype? blurb* xmlProlog? blurb* elem blurb*
+bom         <-- \"\\uFEFF\"
+xmlProlog   <-- \"<?xml\" (!\"?>\" .)+ \"?>\"
+
+# ---- Doctype ----
+
+doctype           <-- \"<!DOCTYPE\" [ \\t\\n\\f\\r]+ [a-zA-Z0-9]+ (doctypePublicId / doctypeSystemId)* [ \\t\\n\\f\\r]* \">\"
+doctypePublicId   <-- [ \\t\\n\\f\\r]+  \"PUBLIC\" [ \\t\\n\\f\\r]+   ((\"\\\"\" [^\"]* \"\\\"\") / (\"'\" [^']* \"'\"))
+doctypeSystemId   <-- [ \\t\\n\\f\\r]+ (\"SYSTEM\" [ \\t\\n\\f\\r]+)? ((\"\\\"\" [^\"]* \"\\\"\") / (\"'\" [^']* \"'\"))
+
+# ---- Elems ----
+
+elem              <-- voidElem / rawTextElem / escRawTextElem / selfClosingElem / normalElem
+voidElem          <-- \"<\"  voidElemName       attributes  \">\"
+rawTextElem       <-- \"<\"  rawTextElemName    attributes  \">\" rawTextContent endElem
+escRawTextElem    <-- \"<\"  escRawTextElemName attributes  \">\" escRawTextContent endElem
+selfClosingElem   <-- \"<\"  elemName           attributes \"/>\"
+normalElem        <-- \"<\"  elemName           attributes  \">\" normalContent? endElem?
+endElem           <-- \"</\" elemName                       \">\"
+
+elemName            <-- [a-zA-Z] [^\\t\\n\\f />]*
+voidElemName        <-- \"area\" / \"base\" / \"br\" / \"col\" / \"embed\" /
+                      \"hr\" / \"img\" / \"input\" / \"keygen\" / \"link\" /
+                      \"meta\" / \"param\" / \"source\" / \"track\" / \"wbr\"
+rawTextElemName     <-- \"script\" / \"style\"
+escRawTextElemName  <-- \"textarea\" / \"title\"
+
+rawTextContent      <-- (!(\"</script>\" / \"</style>\") .)+
+escRawTextContent   <-- ((!(\"</textarea>\" / \"</title>\" / \"&\") .)+ / charRef)*
+normalContent       <-- !\"</\" (([^<&]+ / charRef) / comment / cdata / elem)*
+
+# ---- Attributes ----
+
+attributes        <-- (&[^/>] ([ \\t]+ / doubleQuoteAttr / singleQuoteAttr / unquotedAttr / emptyAttr))*
+attrName          <-- [^ \\t\\n\\r\\f\"'>/=]+
+emptyAttr         <-- attrName+
+unquotedAttr      <-- attrName [ \\t]* \"=\" [ \\t]*      (charRef / [^ \\t\\n\\r\\f\"'=<>`&]+)+
+singleQuoteAttr   <-- attrName [ \\t]* \"=\" [ \\t]* \"'\"  (charRef / [^'&]+)* \"'\"
+doubleQuoteAttr   <-- attrName [ \\t]* \"=\" [ \\t]* \"\\\"\" (charRef / [^\"&]+)* \"\\\"\"
+
+# ---- Character References ----
+
+charRef         <-- &\"&\" (decNumCharRef / hexNumCharRef / namedCharRef / borkedRef)
+namedCharRef    <-- \"&\"   [^;>]+ \";\"
+decNumCharRef   <-- \"&#\"  [0-9]+ \";\"
+hexNumCharRef   <-- \"&#x\" [a-fA-F0-9]+ \";\"
+borkedRef       <-- \"&\"  &[ \\t]
+
+# ---- Misc ----
+
+cdata       <-- \"<![CDATA[\" (!\"]]>\" .)+ \"]]>\"
+comment     <-- \"<!--\" (!\"--\" .)+ \"-->\"
+blurb       <-- [ \\t\\n\\f\\r]+ / comment")
+
+(define html-example "
+<!DOCTYPE html>
+<html>
+<head>
+    <title>Example Domain</title>
+    <meta charset=\"utf-8\" />
+    <meta http-equiv=\"Content-type\" content=\"text/html; charset=utf-8\" />
+    <meta name=\"viewport\" content=\"width=device-width, initial-scale=1\" />
+    <style type=\"text/css\">
+    body {
+        background-color: #f0f0f2;
+        margin: 0;
+        padding: 0;
+    }
+    </style>
+</head>
+
+<body>
+<div>
+    <h1>Example Domain</h1>
+    <p>This domain is for use in illustrative examples in documents. You may
+    use this domain in literature without prior coordination or asking for
+    permission.</p> <p><a href=\"https://www.iana.org/domains/example\">More
+    information...</a></p>
+</div>
+</body>
+</html>
+")
+
+(with-test-prefix "Parsing with complex grammars"
+  (eeval `(define-peg-string-patterns ,html-grammar))
+  (pass-if
+    "HTML parsing"
+    (equal?
+      (peg:tree (match-pattern html html-example))
+      '(html (blurb "\n") (doctype "<!DOCTYPE html>") (blurb "\n") (elem (normalElem "<" (elemName "html") attributes ">" (normalContent "\n" (elem (normalElem "<" (elemName "head") attributes ">" (normalContent "\n    " (elem (escRawTextElem "<" (escRawTextElemName "title") attributes ">" (escRawTextContent "Example Domain") (endElem "</" (elemName "title") ">"))) "\n    " (elem (selfClosingElem "<" (elemName "meta") (attributes " " (doubleQuoteAttr (attrName "charset") "=\"utf-8\"") " ") "/>")) "\n    " (elem (selfClosingElem "<" (elemName "meta") (attributes " " (doubleQuoteAttr (attrName "http-equiv") "=\"Content-type\"") " " (doubleQuoteAttr (attrName "content") "=\"text/html; charset=utf-8\"") " ") "/>")) "\n    " (elem (selfClosingElem "<" (elemName "meta") (attributes " " (doubleQuoteAttr (attrName "name") "=\"viewport\"") " " (doubleQuoteAttr (attrName "content") "=\"width=device-width, initial-scale=1\"") " ") "/>")) "\n    " (elem (rawTextElem "<" (rawTextElemName "style") (attributes " " (doubleQuoteAttr (attrName "type") "=\"text/css\"")) ">" (rawTextContent "\n    body {\n        background-color: #f0f0f2;\n        margin: 0;\n        padding: 0;\n    }\n    ") (endElem "</" (elemName "style") ">"))) "\n") (endElem "</" (elemName "head") ">"))) "\n\n" (elem (normalElem "<" (elemName "body") attributes ">" (normalContent "\n" (elem (normalElem "<" (elemName "div") attributes ">" (normalContent "\n    " (elem (normalElem "<" (elemName "h1") attributes ">" (normalContent "Example Domain") (endElem "</" (elemName "h1") ">"))) "\n    " (elem (normalElem "<" (elemName "p") attributes ">" (normalContent "This domain is for use in illustrative examples in documents. You may\n    use this domain in literature without prior coordination or asking for\n    permission.") (endElem "</" (elemName "p") ">"))) " " (elem (normalElem "<" (elemName "p") attributes ">" (normalContent (elem (normalElem "<" (elemName "a") (attributes " " (doubleQuoteAttr (attrName "href") "=\"https://www.iana.org/domains/example\"")) ">" (normalContent "More\n    information...") (endElem "</" (elemName "a") ">")))) (endElem "</" (elemName "p") ">"))) "\n") (endElem "</" (elemName "div") ">"))) "\n") (endElem "</" (elemName "body") ">"))) "\n") (endElem "</" (elemName "html") ">"))) (blurb "\n")))))
-- 
2.46.0






  parent reply	other threads:[~2024-12-22 20:01 UTC|newest]

Thread overview: 21+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-09-11 22:03 bug#73188: PEG parser does not support full PEG grammar Ekaitz Zarraga
2024-09-12 20:57 ` bug#73188: [PATCH v2] PEG: Add full support for PEG + some extensions Ekaitz Zarraga
2024-10-13 20:29   ` bug#73188: PEG parser does not support full PEG grammar Ludovic Courtès
2024-10-13 20:59     ` Ekaitz Zarraga
2024-10-14 11:56       ` Ludovic Courtès
2024-10-14 14:00         ` Ekaitz Zarraga
2024-10-20 10:10           ` Ludovic Courtès
2024-10-20 20:18             ` Ekaitz Zarraga
2024-12-09 17:23               ` Ludovic Courtès
2024-10-11 12:31 ` bug#73188: [PATCH] PEG: Add support for `not-in-range` and [^...] Ekaitz Zarraga
2024-10-30 19:04 ` bug#73188: PEG: Fix bugs and add complex PEG for testing Ekaitz Zarraga
2024-12-22 17:45 ` bug#73188: PEG parser does not support full PEG grammar Ekaitz Zarraga
2024-12-22 20:09   ` Ekaitz Zarraga
2024-12-22 20:01 ` bug#73188: [PATCH 1/3] PEG: fix [^...] Ekaitz Zarraga
2024-12-22 20:01   ` bug#73188: [PATCH 2/3] PEG: string-peg: better support for escaping Ekaitz Zarraga
2024-12-22 20:01   ` Ekaitz Zarraga [this message]
2024-12-22 21:22   ` bug#73188: PEG parser does not support full PEG grammar Ludovic Courtès
2024-12-23 22:04     ` bokr
2024-12-23 22:13       ` Ekaitz Zarraga
2024-12-28 20:30         ` Bengt Richter
2024-12-28 20:44           ` Ekaitz Zarraga

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://www.gnu.org/software/guile/

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20241222200128.13782-3-ekaitz@elenq.tech \
    --to=ekaitz@elenq.tech \
    --cc=73188@debbugs.gnu.org \
    --cc=ludo@gnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).