From 5c9fdd9df32b87be9f81e037336332984bc3b16c Mon Sep 17 00:00:00 2001
Message-ID: <5c9fdd9df32b87be9f81e037336332984bc3b16c.1735049605.git.yantar92@posteo.net>
From: Ihor Radchenko <yantar92@posteo.net>
Date: Tue, 24 Dec 2024 15:11:22 +0100
Subject: [PATCH] ox-odt: Avoid putting forbidden characters into ODT xml

* lisp/ox-odt.el (org-odt-forbidden-char-re):
(org-odt-discouraged-char-re): New constants codifying characters that
are prohibited in XML spec.
(org-odt--remove-forbidden): New function removing the prohibited
characters.
(org-odt--encode-plain-text): Remove the prohibited characters.
(org-odt-plain-text): Update comment.

Reported-by: Joseph Turner
Link: https://orgmode.org/list/87o711l4u4.fsf@christianmoe.com
---
 lisp/ox-odt.el | 38 +++++++++++++++++++++++++++++++++++---
 1 file changed, 35 insertions(+), 3 deletions(-)

diff --git a/lisp/ox-odt.el b/lisp/ox-odt.el
index ec81637ef0..61c8d4ec75 100644
--- a/lisp/ox-odt.el
+++ b/lisp/ox-odt.el
@@ -170,6 +170,28 @@ (defconst org-odt-special-string-regexps
     ("\\.\\.\\." . "&#x2026;"))  ; hellip
   "Regular expressions for special string conversion.")
 
+(defconst org-odt-forbidden-char-re
+  (rx (not (in ?\N{U+9} ?\N{U+A} ?\N{U+D}
+               (?\N{U+20} . ?\N{U+D7FF})
+               (?\N{U+E000} . ?\N{U+FFFD})
+               (?\N{U+10000} . ?\N{U+10FFFF}))))
+  "Regexp matching forbidden XML1.0 characters.
+https://www.w3.org/TR/REC-xml/#charsets")
+
+(defconst org-odt-discouraged-char-re
+  (rx (in (?\N{U+7F} . ?\N{U+84}) (?\N{U+86} . ?\N{U+9F})
+          (?\N{U+FDD0} . ?\N{U+FDEF}) (?\N{U+1FFFE} . ?\N{U+1FFFF})
+          (?\N{U+2FFFE} . ?\N{U+2FFFF}) (?\N{U+3FFFE} . ?\N{U+3FFFF})
+          (?\N{U+4FFFE} . ?\N{U+4FFFF}) (?\N{U+5FFFE} . ?\N{U+5FFFF})
+          (?\N{U+6FFFE} . ?\N{U+6FFFF}) (?\N{U+7FFFE} . ?\N{U+7FFFF})
+          (?\N{U+8FFFE} . ?\N{U+8FFFF}) (?\N{U+9FFFE} . ?\N{U+9FFFF})
+          (?\N{U+AFFFE} . ?\N{U+AFFFF}) (?\N{U+BFFFE} . ?\N{U+BFFFF})
+          (?\N{U+CFFFE} . ?\N{U+CFFFF}) (?\N{U+DFFFE} . ?\N{U+DFFFF})
+          (?\N{U+EFFFE} . ?\N{U+EFFFF}) (?\N{U+FFFFE} . ?\N{U+FFFFF})
+          (?\N{U+10FFFE} . ?\N{U+10FFFF})))
+  "Regexp matching discouraged XML1.0 characters.
+https://www.w3.org/TR/REC-xml/#charsets")
+
 (defconst org-odt-schema-dir-list
   (list (expand-file-name "./schema/" org-odt-data-dir))
   "List of directories to search for OpenDocument schema files.
@@ -2892,18 +2914,28 @@ (defun org-odt--encode-tabs-and-spaces (line)
        (format " <text:s text:c=\"%d\"/>" (1- (length s)))))
    line))
 
+(defun org-odt--remove-forbidden (text)
+  "Remove forbidden and discouraged characters from TEXT.
+https://www.w3.org/TR/REC-xml/#charsets"
+  (replace-regexp-in-string
+   org-odt-forbidden-char-re ""
+   (replace-regexp-in-string
+    org-odt-discouraged-char-re ""
+    text)))
+
 (defun org-odt--encode-plain-text (text &optional no-whitespace-filling)
   (dolist (pair '(("&" . "&amp;") ("<" . "&lt;") (">" . "&gt;")))
     (setq text (replace-regexp-in-string (car pair) (cdr pair) text t t)))
-  (if no-whitespace-filling text
-    (org-odt--encode-tabs-and-spaces text)))
+  (org-odt--remove-forbidden
+   (if no-whitespace-filling text
+     (org-odt--encode-tabs-and-spaces text))))
 
 (defun org-odt-plain-text (text info)
   "Transcode a TEXT string from Org to ODT.
 TEXT is the string to transcode.  INFO is a plist holding
 contextual information."
   (let ((output text))
-    ;; Protect &, < and >.
+    ;; Protect &, < and >, and remove forbidden characters.
     (setq output (org-odt--encode-plain-text output t))
     ;; Handle smart quotes.  Be sure to provide original string since
     ;; OUTPUT may have been modified.
-- 
2.47.1