all messages for Emacs-related lists mirrored at yhetil.org
 help / color / mirror / code / Atom feed
From: "Mattias Engdegård" <mattiase@acm.org>
To: Eli Zaretskii <eliz@gnu.org>
Cc: 33205@debbugs.gnu.org
Subject: bug#33205: 26.1; unibyte/multibyte missing in rx.el
Date: Mon, 19 Nov 2018 21:07:39 +0100	[thread overview]
Message-ID: <5203F729-8090-4453-80CC-1249DB064631@acm.org> (raw)
In-Reply-To: <834lcsd7qu.fsf@gnu.org>

[-- Attachment #1: Type: text/plain, Size: 652 bytes --]

I tried using rx to match raw bytes. (rx (any (?\200 . ?\377))) doesn't work, since that is translated to the corresponding Unicode range; (any (#x3fff80 . #x3fffff)) must be used instead. Maybe that is evident, or would it merit a mention in the doc string?

The alternative formulation (rx (any "\200-\377")) doesn't work either, and this seems to be a bug. Looking at rx-check-any-string, a second bug is revealed: the code uses the regex ".-." to pick out ranges, and since "." does not match a newline, \n cannot be a range endpoint.

Perhaps you want me to open a new bug for the above? I'm attaching a patch all the same, but you may prefer doing it differently.

[-- Attachment #2: rx-any-raw-bytes.patch --]
[-- Type: application/octet-stream, Size: 3680 bytes --]

 lisp/emacs-lisp/rx.el            | 49 ++++++++++++++++++++++------------------
 test/lisp/emacs-lisp/rx-tests.el | 22 ++++++++++++++++++
 2 files changed, 49 insertions(+), 22 deletions(-)

diff --git a/lisp/emacs-lisp/rx.el b/lisp/emacs-lisp/rx.el
index 1230df4f15..04069e8d50 100644
--- a/lisp/emacs-lisp/rx.el
+++ b/lisp/emacs-lisp/rx.el
@@ -449,28 +449,33 @@ Only both edges of each range is checked."
 
 
 (defun rx-check-any-string (str)
-  "Check string argument STR for Rx `any'."
-  (let ((i 0)
-	c1 c2 l)
-    (if (= 0 (length str))
-	(error "String arg for Rx `any' must not be empty"))
-    (while (string-match ".-." str i)
-      ;; string before range: convert it to characters
-      (if (< i (match-beginning 0))
-	  (setq l (nconc
-		   l
-		   (append (substring str i (match-beginning 0)) nil))))
-      ;; range
-      (setq i (match-end 0)
-	    c1 (aref str (match-beginning 0))
-	    c2 (aref str (1- i)))
-      (cond
-       ((< c1 c2) (setq l (nconc l (list (cons c1 c2)))))
-       ((= c1 c2) (setq l (nconc l (list c1))))))
-    ;; rest?
-    (if (< i (length str))
-	(setq l (nconc l (append (substring str i) nil))))
-    l))
+  "Turn a string argument to `any' into a list of characters and, representing
+ranges, dotted pairs of characters. The original order is not preserved."
+  (let ((decode-char
+         ;; Make sure raw bytes are decoded as such, to avoid confusion with
+         ;; U+0080..U+00FF.
+         (if (multibyte-string-p str)
+             #'identity
+           (lambda (c) (if (and (>= c #x80) (<= c #xff))
+                           (+ c #x3fff00)
+                         c))))
+        (len (length str))
+        (i 0)
+        (ret nil))
+    (while (< i len)
+      (cond ((and (< i (- len 2))
+                  (= (aref str (+ i 1)) ?-))
+             ;; Range.
+             (let ((start (funcall decode-char (aref str i)))
+                   (end   (funcall decode-char (aref str (+ i 2)))))
+               (cond ((< start end) (push (cons start end) ret))
+                     ((= start end) (push start ret)))
+               (setq i (+ i 3))))
+            (t
+             ;; Single character.
+             (push (funcall decode-char (aref str i)) ret)
+             (setq i (+ i 1)))))
+    ret))
 
 
 (defun rx-check-any (arg)
diff --git a/test/lisp/emacs-lisp/rx-tests.el b/test/lisp/emacs-lisp/rx-tests.el
index d15e3d7719..fb268c58f9 100644
--- a/test/lisp/emacs-lisp/rx-tests.el
+++ b/test/lisp/emacs-lisp/rx-tests.el
@@ -33,6 +33,28 @@
                                   (number-sequence ?< ?\])
                                   (number-sequence ?- ?:))))))
 
+(ert-deftest rx-char-any-range-nl ()
+  "Character alternatives with \n as a range endpoint."
+  (should (equal (rx (any "\n-\r"))
+                 "[\n-\r]"))
+  (should (equal (rx (any "\a-\n"))
+                 "[\a-\n]")))
+
+(ert-deftest rx-char-any-raw-byte ()
+  "Raw bytes in character alternatives."
+  ;; Separate raw characters.
+  (should (equal (string-match-p (rx (any "\326A\333B"))
+                                 "X\326\333")
+                 1))
+  ;; Range of raw characters, unibyte.
+  (should (equal (string-match-p (rx (any "\200-\377"))
+                                 "ÿA\310B")
+                 2))
+  ;; Range of raw characters, multibyte.
+  (should (equal (string-match-p (rx (any "Å\211\326-\377\177"))
+                                 "XY\355\177\327")
+                 2)))
+
 (ert-deftest rx-pcase ()
   (should (equal (pcase "a 1 2 3 1 1 b"
                    ((rx (let u (+ digit)) space

  parent reply	other threads:[~2018-11-19 20:07 UTC|newest]

Thread overview: 17+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-10-30 15:03 bug#33205: 26.1; unibyte/multibyte missing in rx.el Mattias Engdegård
2018-10-30 17:27 ` Eli Zaretskii
2018-10-31 15:27   ` Mattias Engdegård
2018-10-31 15:55     ` Eli Zaretskii
2018-11-05 16:49       ` Eli Zaretskii
2018-11-07 18:08         ` Mattias Engdegård
2018-11-07 19:10           ` Eli Zaretskii
2018-11-07 20:19             ` Mattias Engdegård
2018-11-19 20:07             ` Mattias Engdegård [this message]
2018-12-08  8:56               ` Eli Zaretskii
2018-12-08  9:23                 ` Mattias Engdegård
2018-12-08 11:11                   ` Eli Zaretskii
2018-12-28 18:17                 ` Mattias Engdegård
2018-12-29  9:24                   ` Eli Zaretskii
2018-12-29  9:23               ` Eli Zaretskii
2018-12-29 10:43                 ` Mattias Engdegård
2018-12-29 14:55                   ` Eli Zaretskii

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=5203F729-8090-4453-80CC-1249DB064631@acm.org \
    --to=mattiase@acm.org \
    --cc=33205@debbugs.gnu.org \
    --cc=eliz@gnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.

Be sure your reply has a Subject: header at the top and a blank line before the message body.

Code repositories for project(s) associated with this external index:

	https://git.savannah.gnu.org/cgit/emacs.git
	https://git.savannah.gnu.org/cgit/emacs/org-mode.git

This is an external index of several public inboxes;
see the mirroring instructions for how to clone and mirror
all data and code used by this external index.