unofficial mirror of bug-gnu-emacs@gnu.org 
 help / color / mirror / code / Atom feed
From: "Mattias Engdegård" <mattiase@acm.org>
To: Lars Ingebrigtsen <larsi@gnus.org>
Cc: 43598@debbugs.gnu.org
Subject: bug#43598: replace-in-string: finishing touches
Date: Sun, 27 Sep 2020 13:12:04 +0200	[thread overview]
Message-ID: <9EAF49DE-5B51-4F8D-B5F1-50D853FCA4B9@acm.org> (raw)
In-Reply-To: <878scwaxw6.fsf@gnus.org>

[-- Attachment #1: Type: text/plain, Size: 505 bytes --]

27 sep. 2020 kl. 02.03 skrev Lars Ingebrigtsen <larsi@gnus.org>:

> *phew*

Not bad! This seems to work all right.
Here are some minor optimisations:

- Do the fast all-ASCII test (bytes == chars) before iterating through the bytes to check for non-ASCII chars.
- Faster check for non-ASCII non-raw bytes (no need for the complex code in string_char_advance).

It is tempting to vectorise the all-ASCII loop. Maybe another day...

The patch also adds some more test cases for completeness.


[-- Attachment #2: string-search-tweaks.diff --]
[-- Type: application/octet-stream, Size: 5808 bytes --]

diff --git a/src/fns.c b/src/fns.c
index 0f76871154..f626fe11b2 100644
--- a/src/fns.c
+++ b/src/fns.c
@@ -5457,16 +5457,11 @@ DEFUN ("buffer-hash", Fbuffer_hash, Sbuffer_hash, 0, 1, 0,
 static bool
 string_ascii_p (Lisp_Object string)
 {
-  if (STRING_MULTIBYTE (string))
-    return SBYTES (string) == SCHARS (string);
-  else
-    {
-      ptrdiff_t nbytes = SBYTES (string);
-      for (ptrdiff_t i = 0; i < nbytes; i++)
-	if (SREF (string, i) > 127)
-	  return false;
-      return true;
-    }
+  ptrdiff_t nbytes = SBYTES (string);
+  for (ptrdiff_t i = 0; i < nbytes; i++)
+    if (SREF (string, i) > 127)
+      return false;
+  return true;
 }
 
 DEFUN ("string-search", Fstring_search, Sstring_search, 2, 3, 0,
@@ -5505,9 +5500,14 @@ DEFUN ("string-search", Fstring_search, Sstring_search, 2, 3, 0,
   haystart = SSDATA (haystack) + start_byte;
   haybytes = SBYTES (haystack) - start_byte;
 
-  if (STRING_MULTIBYTE (haystack) == STRING_MULTIBYTE (needle)
-      || string_ascii_p (needle)
-      || string_ascii_p (haystack))
+  /* We can do a direct byte-string search if both strings have the
+     same multibyteness, or if at least one of them consists of ASCII
+     characters only.  */
+  if (STRING_MULTIBYTE (haystack)
+      ? (STRING_MULTIBYTE (needle)
+         || SCHARS (haystack) == SBYTES (haystack) || string_ascii_p (needle))
+      : (!STRING_MULTIBYTE (needle)
+         || SCHARS (needle) == SBYTES (needle) || string_ascii_p (haystack)))
     res = memmem (haystart, haybytes,
 		  SSDATA (needle), SBYTES (needle));
   else if (STRING_MULTIBYTE (haystack))  /* unibyte needle */
@@ -5521,26 +5521,21 @@ DEFUN ("string-search", Fstring_search, Sstring_search, 2, 3, 0,
       /* The only possible way we can find the multibyte needle in the
 	 unibyte stack (since we know that neither are pure-ASCII) is
 	 if they contain "raw bytes" (and no other non-ASCII chars.)  */
-      ptrdiff_t chars = SCHARS (needle);
-      const unsigned char *src = SDATA (needle);
-
-      for (ptrdiff_t i = 0; i < chars; i++)
-	{
-	  int c = string_char_advance (&src);
-
-	  if (!CHAR_BYTE8_P (c)
-	      && !ASCII_CHAR_P (c))
-	    /* Found a char that can't be in the haystack.  */
-	    return Qnil;
-	}
+      ptrdiff_t nbytes = SBYTES (needle);
+      for (ptrdiff_t i = 0; i < nbytes; i++)
+        {
+          int c = SREF (needle, i);
+          if (CHAR_BYTE8_HEAD_P (c))
+            i++;                /* Skip raw byte.  */
+          else if (!ASCII_CHAR_P (c))
+            return Qnil;  /* Found a char that can't be in the haystack.  */
+        }
 
-      {
-	/* "Raw bytes" (aka eighth-bit) are represented differently in
-	   multibyte and unibyte strings.  */
-	Lisp_Object uni_needle = Fstring_to_unibyte (needle);
-	res = memmem (haystart, haybytes,
-		      SSDATA (uni_needle), SBYTES (uni_needle));
-      }
+      /* "Raw bytes" (aka eighth-bit) are represented differently in
+         multibyte and unibyte strings.  */
+      Lisp_Object uni_needle = Fstring_to_unibyte (needle);
+      res = memmem (haystart, haybytes,
+                    SSDATA (uni_needle), SBYTES (uni_needle));
     }
 
   if (! res)
diff --git a/test/src/fns-tests.el b/test/src/fns-tests.el
index 41969f2af2..d3c22f966e 100644
--- a/test/src/fns-tests.el
+++ b/test/src/fns-tests.el
@@ -913,6 +913,7 @@ string-search
   (should (equal (string-search "ab\0" "ab") nil))
   (should (equal (string-search "ab" "abababab" 3) 4))
   (should (equal (string-search "ab" "ababac" 3) nil))
+  (should (equal (string-search "aaa" "aa") nil))
   (let ((case-fold-search t))
     (should (equal (string-search "ab" "AB") nil)))
 
@@ -936,14 +937,16 @@ string-search
   (should (equal (string-search (string-to-multibyte "\377") "ab\377c") 2))
   (should (equal (string-search "\303" "aøb") nil))
   (should (equal (string-search "\270" "aøb") nil))
-  ;; This test currently fails, but it shouldn't!
-  ;;(should (equal (string-search "ø" "\303\270") nil))
+  (should (equal (string-search "ø" "\303\270") nil))
+
+  (should (equal (string-search "a\U00010f98z" "a\U00010f98a\U00010f98z") 2))
 
   (should-error (string-search "a" "abc" -1))
   (should-error (string-search "a" "abc" 4))
   (should-error (string-search "a" "abc" 100000000000))
 
   (should (equal (string-search "a" "aaa" 3) nil))
+  (should (equal (string-search "aa" "aa" 1) nil))
   (should (equal (string-search "\0" "") nil))
 
   (should (equal (string-search "" "") 0))
@@ -955,6 +958,21 @@ string-search
   (should-error (string-search "" "abc" -1))
 
   (should-not (string-search "ø" "foo\303\270"))
+  (should-not (string-search "\303\270" "ø"))
+  (should-not (string-search "\370" "ø"))
+  (should-not (string-search (string-to-multibyte "\370") "ø"))
+  (should-not (string-search "ø" "\370"))
+  (should-not (string-search "ø" (string-to-multibyte "\370")))
+  (should-not (string-search "\303\270" "\370"))
+  (should-not (string-search (string-to-multibyte "\303\270") "\370"))
+  (should-not (string-search "\303\270" (string-to-multibyte "\370")))
+  (should-not (string-search (string-to-multibyte "\303\270")
+                             (string-to-multibyte "\370")))
+  (should-not (string-search "\370" "\303\270"))
+  (should-not (string-search (string-to-multibyte "\370") "\303\270"))
+  (should-not (string-search "\370" (string-to-multibyte "\303\270")))
+  (should-not (string-search (string-to-multibyte "\370")
+                             (string-to-multibyte "\303\270")))
   (should (equal (string-search (string-to-multibyte "o\303\270") "foo\303\270")
                  2))
   (should (equal (string-search "\303\270" "foo\303\270") 3)))

  parent reply	other threads:[~2020-09-27 11:12 UTC|newest]

Thread overview: 28+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-09-24 20:52 bug#43598: replace-in-string: finishing touches Mattias Engdegård
2020-09-24 21:12 ` Lars Ingebrigtsen
2020-09-24 21:19 ` Lars Ingebrigtsen
2020-09-24 23:18   ` Lars Ingebrigtsen
2020-09-25  9:21     ` Eli Zaretskii
2020-09-25 10:09       ` Lars Ingebrigtsen
2020-09-24 23:54 ` Lars Ingebrigtsen
2020-09-25 10:42   ` Mattias Engdegård
2020-09-25 11:11     ` Lars Ingebrigtsen
2020-09-25 11:22       ` Mattias Engdegård
2020-09-25 11:32         ` Lars Ingebrigtsen
2020-09-27  0:03         ` Lars Ingebrigtsen
2020-09-27  0:34           ` Lars Ingebrigtsen
2020-09-27  8:45             ` Mattias Engdegård
2020-09-28  3:41             ` Richard Stallman
2020-09-28  9:40               ` Mattias Engdegård
2020-09-29  3:29                 ` Richard Stallman
2020-09-29  4:12                   ` Eli Zaretskii
2020-09-27 11:12           ` Mattias Engdegård [this message]
2020-09-27 11:48             ` Lars Ingebrigtsen
2020-09-27 11:57               ` Mattias Engdegård
2020-09-27 12:02                 ` Lars Ingebrigtsen
2020-09-27 16:14                   ` Mattias Engdegård
2020-09-27 16:19                     ` Eli Zaretskii
2020-09-27 16:41                       ` Lars Ingebrigtsen
2020-09-27 16:48                         ` Eli Zaretskii
2020-09-26 22:44     ` Lars Ingebrigtsen
2020-09-26 22:25 ` Lars Ingebrigtsen

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://www.gnu.org/software/emacs/

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=9EAF49DE-5B51-4F8D-B5F1-50D853FCA4B9@acm.org \
    --to=mattiase@acm.org \
    --cc=43598@debbugs.gnu.org \
    --cc=larsi@gnus.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://git.savannah.gnu.org/cgit/emacs.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).