unofficial mirror of guile-devel@gnu.org 
 help / color / mirror / Atom feed
* [PATCH] Fix of upstream parsing of CDATA
@ 2020-01-16 12:00 Linus Björnstam
  2020-03-12  8:35 ` Linus Björnstam
  0 siblings, 1 reply; 2+ messages in thread
From: Linus Björnstam @ 2020-01-16 12:00 UTC (permalink / raw)
  To: guile-devel

[-- Attachment #1: Type: text/plain, Size: 1286 bytes --]

Hello Guilers!

RhodiumToad found an error in sxml where it would not properly parse CDATA: &gt; would be converted to > inside CDATA blocks. This is probably due to a misreading of the XML spec:

    "Within a CDATA section, only the CDEnd string is recognized as markup, so that left angle brackets and ampersands may occur in their literal form; they need not (and cannot) be escaped using ' < ' and ' & '.".

Notice that it says only CDEnd is recognized as markup, but omits &gt; from the enumeration of things that need not (and cannot) be escaped.

No other XML libraries behave this way. Take for example Python's ElementTree:

Python 2.7.17 (default, Dec 23 2019, 21:25:33)
>>> import xml.etree.ElementTree as ET
>>> root = ET.fromstring("<e><![CDATA[&gt;]]></e>")
>>> root.text
'&gt;'

The same input with the un-patched (sxml ssax) (or rather (sxml simple)) gives a different result:

(xml->sxml "<e><![CDATA[&gt;]]></e>")
;; => (*TOP* (e ">"))

The question is whether this patch should be sent upstream. Since there has been very little activity there, I suspect it is a lost cause.

Failing tests have been looked through, verified and fixed. No unexpected errors were encountered. All SXML tests pass after this patch.

Best regards
  Linus Björnstam

[-- Attachment #2: 0001-module-sxml-upstream-SSAX.scm-Fix-improper-handling-.patch --]
[-- Type: application/octet-stream, Size: 4803 bytes --]

From 47c1c8bc125d78d72b644331f4607442ad2bd627 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Linus=20Bj=C3=B6rnstam?= <linus.bjornstam@fastmail.se>
Date: Thu, 16 Jan 2020 12:26:32 +0100
Subject: [PATCH]  * module/sxml/upstream/SSAX.scm: Fix improper handling of
 &gt; in CDATA    and correct tests.

---
 configure.ac                  |  4 ++--
 module/sxml/upstream/SSAX.scm | 30 ++++++++----------------------
 2 files changed, 10 insertions(+), 24 deletions(-)

diff --git a/configure.ac b/configure.ac
index bb9a9281f..6198c7e6e 100644
--- a/configure.ac
+++ b/configure.ac
@@ -3,7 +3,7 @@ dnl   Process this file with autoconf to produce configure.
 dnl
 
 define(GUILE_CONFIGURE_COPYRIGHT,[[
-Copyright 1998-2019 Free Software Foundation, Inc.
+Copyright 1998-2020 Free Software Foundation, Inc.
 
 This file is part of Guile.
 
@@ -25,7 +25,7 @@ License along with Guile.  If not, see
 AC_PREREQ(2.61)
 
 AC_INIT([GNU Guile],
-        m4_esyscmd([build-aux/git-version-gen --match v2.9.\* .tarball-version]),
+        m4_esyscmd([build-aux/git-version-gen --match v3.0.\* .tarball-version]),
         [bug-guile@gnu.org])
 AC_CONFIG_AUX_DIR([build-aux])
 AC_CONFIG_MACRO_DIR([m4])
diff --git a/module/sxml/upstream/SSAX.scm b/module/sxml/upstream/SSAX.scm
index d2b8fd925..4bfaa9bcb 100644
--- a/module/sxml/upstream/SSAX.scm
+++ b/module/sxml/upstream/SSAX.scm
@@ -950,11 +950,10 @@
 ;	CR, LF, and CRLF are treated as line delimiters, and passed
 ;	as a single #\newline to the STR-HANDLER
 ;	"]]>" combination is the end of the CDATA section.
-;	&gt; is treated as an embedded #\> character
-; Note, &lt; and &amp; are not specially recognized (and are not expanded)!
+; Note, &lt;, &gt; and &amp; are not specially recognized (and are not expanded)!
 
 (define ssax:read-cdata-body 
-  (let ((cdata-delimiters (list char-return #\newline #\] #\&)))
+  (let ((cdata-delimiters (list char-return #\newline #\])))
 
     (lambda (port str-handler seed)
       (let loop ((seed seed))
@@ -974,18 +973,6 @@
 		((#\]) (check-after-second-braket
 			(str-handler "]" "" seed)))
 		(else (loop (str-handler "]]" "" seed)))))))
-       ((#\&)		; Note that #\& within CDATA may stand for itself
-	(let ((ent-ref 	; it does not have to start an entity ref
-               (next-token-of (lambda (c) 
-		 (and (not (eof-object? c)) (char-alphabetic? c) c)) port)))
-	  (cond		; "&gt;" is to be replaced with #\>
-	   ((and (string=? "gt" ent-ref) (eqv? (peek-char port) #\;))
-	    (read-char port)
-	    (loop (str-handler fragment ">" seed)))
-	   (else
-	    (loop 
-	     (str-handler ent-ref ""
-			  (str-handler fragment "&" seed)))))))
        (else		; Must be CR: if the next char is #\newline, skip it
          (if (eqv? (peek-char port) #\newline) (read-char port))
          (loop (str-handler fragment nl seed)))
@@ -1004,7 +991,7 @@
 		     (call-with-input-string (unesc-string str)
 		       (lambda (port) (ssax:read-cdata-body port consumer '()))
 		       ))))
-	     (write result)
+             (write result)
 	     (assert (equal? result expected-result)))))
    )
   (test "]]>" '())
@@ -1017,10 +1004,9 @@
   (test "%r%n%r%n]]>" '("" " NL" "" " NL"))
   (test "%r%n%r%na]]>" '("" " NL" "" " NL" "a" ""))
   (test "%r%r%r%na]]>" '("" " NL" "" " NL" "" " NL" "a" ""))
-  (test "abc&!!!]]>" '("abc" "&" "" "" "!!!" ""))
+  (test "abc&!!!]]>" '("abc&!!!" ""))
   (test "abc]]&gt;&gt&amp;]]]&gt;and]]>"
-    '("abc" "" "]]" "" "" ">" "" "&" "gt" "" "" "&" "amp" "" ";" "" "]" ""
-      "]]" "" "" ">" "and" ""))
+        '("abc" "" "]]" "" "&gt;&gt&amp;" "" "]" "" "]]" "" "&gt;and" ""))
 ))
 
             
@@ -2596,10 +2582,10 @@
 	 `(('"itemize" ('"item" "This   is item 1 ")
 	    ,(unesc-string "%n") ('"item" "Item 2") ,(unesc-string "%n "))))
   (test " <P><![CDATA[<BR>%n<![CDATA[<BR>]]&gt;]]></P>"
-	dummy-doctype-fn  `(('"P" "<BR>" ,nl "<![CDATA[<BR>" "]]" "" ">")))
+  	dummy-doctype-fn  `(('"P" "<BR>" ,nl  "<![CDATA[<BR>" "]]" "&gt;")))
 
   (test " <P><![CDATA[<BR>%r<![CDATA[<BR>]]&gt;]]></P>"
-	dummy-doctype-fn `(('"P" "<BR>" ,nl "<![CDATA[<BR>" "]]" "" ">")))
+  	dummy-doctype-fn `(('"P" "<BR>" ,nl "<![CDATA[<BR>" "]]" "&gt;")))
 
   (test "<?xml version='1.0'?>%n%n<Reports TStamp='1'></Reports>"
 	dummy-doctype-fn '(('"Reports" (@ ('"TStamp" "1")))))
@@ -2964,7 +2950,7 @@
 	  `(*TOP* (P ,(unesc-string "some text <1%n\"")
 		      (B "strong") ,(unesc-string "\"%n"))))
     (test " <P><![CDATA[<BR>%n<![CDATA[<BR>]]&gt;]]></P>" '()
-	  `(*TOP* (P ,(unesc-string "<BR>%n<![CDATA[<BR>]]>"))))
+     `(*TOP* (P ,(unesc-string "<BR>%n<![CDATA[<BR>]]&gt;"))))
 ;    (test "<T1><T2>it&apos;s%r%nand   that%n</T2>%r%n%r%n%n</T1>" '()
 ;	  '(*TOP* (T1 (T2 "it's%nand   that%n") "%n%n%n")))
     (test "<T1><T2>it&apos;s%r%nand   that%n</T2>%r%n%r%n%n</T1>" '()
-- 
2.24.1


^ permalink raw reply related	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2020-03-12  8:35 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-01-16 12:00 [PATCH] Fix of upstream parsing of CDATA Linus Björnstam
2020-03-12  8:35 ` Linus Björnstam

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).