unofficial mirror of guile-devel@gnu.org 
 help / color / mirror / Atom feed
From: Daniel Hartwig <mandyke@gmail.com>
To: Mark H Weaver <mhw@netris.org>
Cc: guile-devel@gnu.org
Subject: Re: [PATCH] In string-split, add support for character sets and predicates.
Date: Fri, 12 Oct 2012 14:38:11 +0800	[thread overview]
Message-ID: <CAN3veRfnkFuw+iCdA_6dS603ziP8z=Tz2sPiCkmhchJK4FQcOw@mail.gmail.com> (raw)
In-Reply-To: <87a9vulpv0.fsf@tines.lan>

[-- Attachment #1: Type: text/plain, Size: 31 bytes --]

Patch with .texi updated also.

[-- Attachment #2: 0001-In-string-split-add-support-for-character-sets-and-p.patch --]
[-- Type: application/octet-stream, Size: 9659 bytes --]

From 8923e727d3d77933aa3446f2f72f27fc40b46b1d Mon Sep 17 00:00:00 2001
From: Daniel Hartwig <mandyke@gmail.com>
Date: Mon, 8 Oct 2012 18:35:00 +0800
Subject: [PATCH] In string-split, add support for character sets and
 predicates.

* libguile/srfi-13.c (string-split): Add support for splitting on
  character sets and predicates, like string-index and others.
* test-suite/tests/strings.test (string-split): Add tests covering
  the new argument types.
* doc/ref/api-data.texi (string-split): Update.
---
 doc/ref/api-data.texi         |   22 +++++++--
 libguile/srfi-13.c            |   97 ++++++++++++++++++++++++++++-------------
 libguile/srfi-13.h            |    2 +-
 test-suite/tests/strings.test |   62 ++++++++++++++++++++++++++-
 4 files changed, 146 insertions(+), 37 deletions(-)

diff --git a/doc/ref/api-data.texi b/doc/ref/api-data.texi
index 39c9790..6d8de2b 100644
--- a/doc/ref/api-data.texi
+++ b/doc/ref/api-data.texi
@@ -3152,12 +3152,24 @@ These procedures are useful for similar tasks.
 Convert the string @var{str} into a list of characters.
 @end deffn
 
-@deffn {Scheme Procedure} string-split str chr
-@deffnx {C Function} scm_string_split (str, chr)
+@deffn {Scheme Procedure} string-split str char_pred
+@deffnx {C Function} scm_string_split (str, char_pred)
 Split the string @var{str} into a list of substrings delimited
-by appearances of the character @var{chr}.  Note that an empty substring
-between separator characters will result in an empty string in the
-result list.
+by appearances of characters that
+
+@itemize @bullet
+@item
+equal @var{char_pred}, if it is a character,
+
+@item
+satisfy the predicate @var{char_pred}, if it is a procedure,
+
+@item
+are in the set @var{char_pred}, if it is a character set.
+@end itemize
+
+Note that an empty substring between separator characters will result in
+an empty string in the result list.
 
 @lisp
 (string-split "root:x:0:0:root:/root:/bin/bash" #\:)
diff --git a/libguile/srfi-13.c b/libguile/srfi-13.c
index 2834553..97c5a1d 100644
--- a/libguile/srfi-13.c
+++ b/libguile/srfi-13.c
@@ -2993,11 +2993,22 @@ SCM_DEFINE (scm_string_tokenize, "string-tokenize", 1, 3, 0,
 #undef FUNC_NAME
 
 SCM_DEFINE (scm_string_split, "string-split", 2, 0, 0,
-	    (SCM str, SCM chr),
+	    (SCM str, SCM char_pred),
 	    "Split the string @var{str} into a list of the substrings delimited\n"
-	    "by appearances of the character @var{chr}.  Note that an empty substring\n"
-	    "between separator characters will result in an empty string in the\n"
-	    "result list.\n"
+            "by appearances of characters that\n"
+            "\n"
+            "@itemize @bullet\n"
+            "@item\n"
+            "equal @var{char_pred}, if it is a character,\n"
+            "\n"
+            "@item\n"
+            "satisfy the predicate @var{char_pred}, if it is a procedure,\n"
+            "\n"
+            "@item\n"
+            "are in the set @var{char_pred}, if it is a character set.\n"
+            "@end itemize\n\n"
+            "Note that an empty substring between separator characters\n"
+            "will result in an empty string in the result list.\n"
 	    "\n"
 	    "@lisp\n"
 	    "(string-split \"root:x:0:0:root:/root:/bin/bash\" #\\:)\n"
@@ -3014,47 +3025,73 @@ SCM_DEFINE (scm_string_split, "string-split", 2, 0, 0,
 	    "@end lisp")
 #define FUNC_NAME s_scm_string_split
 {
-  long idx, last_idx;
-  int narrow;
   SCM res = SCM_EOL;
 
   SCM_VALIDATE_STRING (1, str);
-  SCM_VALIDATE_CHAR (2, chr);
   
-  /* This is explicit wide/narrow logic (instead of using
-     scm_i_string_ref) is a speed optimization.  */
-  idx = scm_i_string_length (str);
-  narrow = scm_i_is_narrow_string (str);
-  if (narrow)
+  if (SCM_CHARP (char_pred))
     {
-      const char *buf = scm_i_string_chars (str);
-      while (idx >= 0)
+      long idx, last_idx;
+      int narrow;
+
+      /* This is explicit wide/narrow logic (instead of using
+         scm_i_string_ref) is a speed optimization.  */
+      idx = scm_i_string_length (str);
+      narrow = scm_i_is_narrow_string (str);
+      if (narrow)
+        {
+          const char *buf = scm_i_string_chars (str);
+          while (idx >= 0)
+            {
+              last_idx = idx;
+              while (idx > 0 && buf[idx-1] != (char) SCM_CHAR(char_pred))
+                idx--;
+              if (idx >= 0)
+                {
+                  res = scm_cons (scm_i_substring (str, idx, last_idx), res);
+                  idx--;
+                }
+            }
+        }
+      else
         {
-          last_idx = idx;
-          while (idx > 0 && buf[idx-1] != (char) SCM_CHAR(chr))
-            idx--;
-          if (idx >= 0)
+          const scm_t_wchar *buf = scm_i_string_wide_chars (str);
+          while (idx >= 0)
             {
-              res = scm_cons (scm_i_substring (str, idx, last_idx), res);
-              idx--;
+              last_idx = idx;
+              while (idx > 0 && buf[idx-1] != SCM_CHAR(char_pred))
+                idx--;
+              if (idx >= 0)
+                {
+                  res = scm_cons (scm_i_substring (str, idx, last_idx), res);
+                  idx--;
+                }
             }
         }
     }
   else
     {
-      const scm_t_wchar *buf = scm_i_string_wide_chars (str);
-      while (idx >= 0)
+      SCM sidx, slast_idx;
+
+      if (!SCM_CHARSETP (char_pred))
+        SCM_ASSERT (scm_is_true (scm_procedure_p (char_pred)),
+                    char_pred, SCM_ARG2, FUNC_NAME);
+
+      /* Supporting predicates and character sets involves handling SCM
+         values so there is less chance to optimize. */
+      slast_idx = scm_string_length (str);
+      for (;;)
         {
-          last_idx = idx;
-          while (idx > 0 && buf[idx-1] != SCM_CHAR(chr))
-            idx--;
-          if (idx >= 0)
-            {
-              res = scm_cons (scm_i_substring (str, idx, last_idx), res);
-              idx--;
-            }
+          sidx = scm_string_index_right (str, char_pred, SCM_INUM0, slast_idx);
+          if (scm_is_false (sidx))
+            break;
+          res = scm_cons (scm_substring (str, scm_oneplus (sidx), slast_idx), res);
+          slast_idx = sidx;
         }
+
+      res = scm_cons (scm_substring (str, SCM_INUM0, slast_idx), res);
     }
+
   scm_remember_upto_here_1 (str);
   return res;
 }
diff --git a/libguile/srfi-13.h b/libguile/srfi-13.h
index f63239a..325e222 100644
--- a/libguile/srfi-13.h
+++ b/libguile/srfi-13.h
@@ -110,7 +110,7 @@ SCM_API SCM scm_xsubstring (SCM s, SCM from, SCM to, SCM start, SCM end);
 SCM_API SCM scm_string_xcopy_x (SCM target, SCM tstart, SCM s, SCM sfrom, SCM sto, SCM start, SCM end);
 SCM_API SCM scm_string_replace (SCM s1, SCM s2, SCM start1, SCM end1, SCM start2, SCM end2);
 SCM_API SCM scm_string_tokenize (SCM s, SCM token_char, SCM start, SCM end);
-SCM_API SCM scm_string_split (SCM s, SCM chr);
+SCM_API SCM scm_string_split (SCM s, SCM char_pred);
 SCM_API SCM scm_string_filter (SCM char_pred, SCM s, SCM start, SCM end);
 SCM_API SCM scm_string_delete (SCM char_pred, SCM s, SCM start, SCM end);
 
diff --git a/test-suite/tests/strings.test b/test-suite/tests/strings.test
index d892b70..679e173 100644
--- a/test-suite/tests/strings.test
+++ b/test-suite/tests/strings.test
@@ -557,7 +557,67 @@
   (pass-if "char 255"
     (equal? '("a" "b")
 	    (string-split (string #\a (integer->char 255) #\b)
-			  (integer->char 255)))))
+			  (integer->char 255))))
+
+  (pass-if "empty string - char"
+    (equal? '("")
+            (string-split "" #\:)))
+
+  (pass-if "non-empty - char - no delimiters"
+    (equal? '("foobarfrob")
+            (string-split "foobarfrob" #\:)))
+
+  (pass-if "non-empty - char - delimiters"
+    (equal? '("foo" "bar" "frob")
+            (string-split "foo:bar:frob" #\:)))
+
+  (pass-if "non-empty - char - leading delimiters"
+    (equal? '("" "" "foo" "bar" "frob")
+            (string-split "::foo:bar:frob" #\:)))
+
+  (pass-if "non-empty - char - trailing delimiters"
+    (equal? '("foo" "bar" "frob" "" "")
+            (string-split "foo:bar:frob::" #\:)))
+
+  (pass-if "empty string - charset"
+    (equal? '("")
+            (string-split "" (char-set #\:))))
+
+  (pass-if "non-empty - charset - no delimiters"
+    (equal? '("foobarfrob")
+            (string-split "foobarfrob" (char-set #\:))))
+
+  (pass-if "non-empty - charset - delimiters"
+    (equal? '("foo" "bar" "frob")
+            (string-split "foo:bar:frob" (char-set #\:))))
+
+  (pass-if "non-empty - charset - leading delimiters"
+    (equal? '("" "" "foo" "bar" "frob")
+            (string-split "::foo:bar:frob" (char-set #\:))))
+
+  (pass-if "non-empty - charset - trailing delimiters"
+    (equal? '("foo" "bar" "frob" "" "")
+            (string-split "foo:bar:frob::" (char-set #\:))))
+
+  (pass-if "empty string - pred"
+    (equal? '("")
+            (string-split "" (negate char-alphabetic?))))
+
+  (pass-if "non-empty - pred - no delimiters"
+    (equal? '("foobarfrob")
+            (string-split "foobarfrob" (negate char-alphabetic?))))
+
+  (pass-if "non-empty - pred - delimiters"
+    (equal? '("foo" "bar" "frob")
+            (string-split "foo:bar:frob" (negate char-alphabetic?))))
+
+  (pass-if "non-empty - pred - leading delimiters"
+    (equal? '("" "" "foo" "bar" "frob")
+            (string-split "::foo:bar:frob" (negate char-alphabetic?))))
+
+  (pass-if "non-empty - pred - trailing delimiters"
+    (equal? '("foo" "bar" "frob" "" "")
+            (string-split "foo:bar:frob::" (negate char-alphabetic?)))))
 
 (with-test-prefix "substring-move!"
 
-- 
1.7.9


  parent reply	other threads:[~2012-10-12  6:38 UTC|newest]

Thread overview: 14+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2012-10-08 11:23 [PATCH] In string-split, add support for character sets and predicates Daniel Hartwig
2012-10-08 15:40 ` Mark H Weaver
2012-10-09  3:34   ` Daniel Hartwig
2012-10-09 17:48     ` Mark H Weaver
2012-10-10  1:37       ` Daniel Hartwig
2012-10-10  2:14         ` Mark H Weaver
2012-10-10  3:15           ` Daniel Hartwig
2012-10-10  3:25             ` Mark H Weaver
2012-10-10  3:28               ` Daniel Hartwig
2012-10-10  7:59                 ` Mark H Weaver
2012-10-10 20:44                   ` Ludovic Courtès
2012-10-12  6:38                   ` Daniel Hartwig [this message]
2012-10-12 12:23                     ` Mark H Weaver
2012-10-10 20:42               ` Ludovic Courtès

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://www.gnu.org/software/guile/

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to='CAN3veRfnkFuw+iCdA_6dS603ziP8z=Tz2sPiCkmhchJK4FQcOw@mail.gmail.com' \
    --to=mandyke@gmail.com \
    --cc=guile-devel@gnu.org \
    --cc=mhw@netris.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).