From: Daniel Hartwig <mandyke@gmail.com>
To: Mark H Weaver <mhw@netris.org>
Cc: guile-devel@gnu.org
Subject: Re: [PATCH] In string-split, add support for character sets and predicates.
Date: Tue, 9 Oct 2012 11:34:46 +0800 [thread overview]
Message-ID: <CAN3veRccaDPbctZP8g7MYdhY5PLdCNwVb9BxNnM2TFU4d+Eprw@mail.gmail.com> (raw)
In-Reply-To: <87sj9pm0oz.fsf@tines.lan>
[-- Attachment #1: Type: text/plain, Size: 924 bytes --]
Hi Mark
Thanks for the speedy response.
General ACK on all your comments. Some additional notes:
On 8 October 2012 23:40, Mark H Weaver <mhw@netris.org> wrote:
>> [which vs. that]
This was straight out of the doc string for string-index. Changed.
>> + if (SCM_CHARP (char_pred))
>> + {
>> + goto split_char;
>
> I'd prefer to avoid the use of 'goto' here, and instead use nested 'if's
> here. (I admit that 'goto's occasionally make code simpler and more
> readable, but not in this case IMO).
I had done this only to avoid diff noise on the original code path
since I wasn't changing it. I agree that it adds nothing to the
structure of the code. Changed.
> Can you please put the code between 'split_char' and 'done' within this
> 'if', and all the code from here to 'split_char' within the 'else'?
Done. Also adjusted the scope of the branch-specific variables.
Updated version attached.
Regards
[-- Attachment #2: 0001-In-string-split-add-support-for-character-sets-and-p.patch --]
[-- Type: application/octet-stream, Size: 8472 bytes --]
From 46178db9eecc9ca402d9571c3ee520074f15ef5a Mon Sep 17 00:00:00 2001
From: Daniel Hartwig <mandyke@gmail.com>
Date: Mon, 8 Oct 2012 18:35:00 +0800
Subject: [PATCH] In string-split, add support for character sets and
predicates.
* libguile/srfi-13.c (string-split): Add support for splitting on
character sets and predicates, like string-index and others.
* test-suite/tests/strings.test (string-split): Add tests covering
the new argument types.
---
libguile/srfi-13.c | 102 +++++++++++++++++++++++++++++------------
libguile/srfi-13.h | 2 +-
test-suite/tests/strings.test | 62 ++++++++++++++++++++++++-
3 files changed, 134 insertions(+), 32 deletions(-)
diff --git a/libguile/srfi-13.c b/libguile/srfi-13.c
index 2834553..605ba21 100644
--- a/libguile/srfi-13.c
+++ b/libguile/srfi-13.c
@@ -2993,11 +2993,22 @@ SCM_DEFINE (scm_string_tokenize, "string-tokenize", 1, 3, 0,
#undef FUNC_NAME
SCM_DEFINE (scm_string_split, "string-split", 2, 0, 0,
- (SCM str, SCM chr),
+ (SCM str, SCM char_pred),
"Split the string @var{str} into a list of the substrings delimited\n"
- "by appearances of the character @var{chr}. Note that an empty substring\n"
- "between separator characters will result in an empty string in the\n"
- "result list.\n"
+ "by appearances of characters that\n"
+ "\n"
+ "@itemize @bullet\n"
+ "@item\n"
+ "equal @var{char_pred}, if it is a character,\n"
+ "\n"
+ "@item\n"
+ "satisfy the predicate @var{char_pred}, if it is a procedure,\n"
+ "\n"
+ "@item\n"
+ "are in the set @var{char_pred}, if it is a character set.\n"
+ "@end itemize\n\n"
+ "Note that an empty substring between separator characters\n"
+ "will result in an empty string in the result list.\n"
"\n"
"@lisp\n"
"(string-split \"root:x:0:0:root:/root:/bin/bash\" #\\:)\n"
@@ -3014,47 +3025,78 @@ SCM_DEFINE (scm_string_split, "string-split", 2, 0, 0,
"@end lisp")
#define FUNC_NAME s_scm_string_split
{
- long idx, last_idx;
- int narrow;
SCM res = SCM_EOL;
SCM_VALIDATE_STRING (1, str);
- SCM_VALIDATE_CHAR (2, chr);
- /* This is explicit wide/narrow logic (instead of using
- scm_i_string_ref) is a speed optimization. */
- idx = scm_i_string_length (str);
- narrow = scm_i_is_narrow_string (str);
- if (narrow)
+ if (SCM_CHARP (char_pred))
{
- const char *buf = scm_i_string_chars (str);
- while (idx >= 0)
+ long idx, last_idx;
+ int narrow;
+
+ /* This is explicit wide/narrow logic (instead of using
+ scm_i_string_ref) is a speed optimization. */
+ idx = scm_i_string_length (str);
+ narrow = scm_i_is_narrow_string (str);
+ if (narrow)
{
- last_idx = idx;
- while (idx > 0 && buf[idx-1] != (char) SCM_CHAR(chr))
- idx--;
- if (idx >= 0)
+ const char *buf = scm_i_string_chars (str);
+ while (idx >= 0)
{
- res = scm_cons (scm_i_substring (str, idx, last_idx), res);
- idx--;
+ last_idx = idx;
+ while (idx > 0 && buf[idx-1] != (char) SCM_CHAR(char_pred))
+ idx--;
+ if (idx >= 0)
+ {
+ res = scm_cons (scm_i_substring (str, idx, last_idx), res);
+ idx--;
+ }
}
}
+ else
+ {
+ const scm_t_wchar *buf = scm_i_string_wide_chars (str);
+ while (idx >= 0)
+ {
+ last_idx = idx;
+ while (idx > 0 && buf[idx-1] != SCM_CHAR(char_pred))
+ idx--;
+ if (idx >= 0)
+ {
+ res = scm_cons (scm_i_substring (str, idx, last_idx), res);
+ idx--;
+ }
+ }
+ }
+ goto done;
}
else
{
- const scm_t_wchar *buf = scm_i_string_wide_chars (str);
- while (idx >= 0)
+ SCM sidx, slast_idx;
+
+ if (!SCM_CHARSETP (char_pred))
{
- last_idx = idx;
- while (idx > 0 && buf[idx-1] != SCM_CHAR(chr))
- idx--;
- if (idx >= 0)
- {
- res = scm_cons (scm_i_substring (str, idx, last_idx), res);
- idx--;
- }
+ SCM_ASSERT (scm_is_true (scm_procedure_p (char_pred)),
+ char_pred, SCM_ARG2, FUNC_NAME);
+ }
+
+ /* Supporting predicates and character sets involves handling SCM
+ values so there is less chance to optimize. */
+ slast_idx = scm_string_length (str);
+ for (;;)
+ {
+ sidx = scm_string_index_right (str, char_pred, SCM_INUM0, slast_idx);
+ if (scm_is_false (sidx))
+ break;
+ res = scm_cons (scm_substring (str, scm_oneplus (sidx), slast_idx), res);
+ slast_idx = sidx;
}
+
+ res = scm_cons (scm_substring (str, SCM_INUM0, slast_idx), res);
+ goto done;
}
+
+ done:
scm_remember_upto_here_1 (str);
return res;
}
diff --git a/libguile/srfi-13.h b/libguile/srfi-13.h
index f63239a..325e222 100644
--- a/libguile/srfi-13.h
+++ b/libguile/srfi-13.h
@@ -110,7 +110,7 @@ SCM_API SCM scm_xsubstring (SCM s, SCM from, SCM to, SCM start, SCM end);
SCM_API SCM scm_string_xcopy_x (SCM target, SCM tstart, SCM s, SCM sfrom, SCM sto, SCM start, SCM end);
SCM_API SCM scm_string_replace (SCM s1, SCM s2, SCM start1, SCM end1, SCM start2, SCM end2);
SCM_API SCM scm_string_tokenize (SCM s, SCM token_char, SCM start, SCM end);
-SCM_API SCM scm_string_split (SCM s, SCM chr);
+SCM_API SCM scm_string_split (SCM s, SCM char_pred);
SCM_API SCM scm_string_filter (SCM char_pred, SCM s, SCM start, SCM end);
SCM_API SCM scm_string_delete (SCM char_pred, SCM s, SCM start, SCM end);
diff --git a/test-suite/tests/strings.test b/test-suite/tests/strings.test
index d892b70..679e173 100644
--- a/test-suite/tests/strings.test
+++ b/test-suite/tests/strings.test
@@ -557,7 +557,67 @@
(pass-if "char 255"
(equal? '("a" "b")
(string-split (string #\a (integer->char 255) #\b)
- (integer->char 255)))))
+ (integer->char 255))))
+
+ (pass-if "empty string - char"
+ (equal? '("")
+ (string-split "" #\:)))
+
+ (pass-if "non-empty - char - no delimiters"
+ (equal? '("foobarfrob")
+ (string-split "foobarfrob" #\:)))
+
+ (pass-if "non-empty - char - delimiters"
+ (equal? '("foo" "bar" "frob")
+ (string-split "foo:bar:frob" #\:)))
+
+ (pass-if "non-empty - char - leading delimiters"
+ (equal? '("" "" "foo" "bar" "frob")
+ (string-split "::foo:bar:frob" #\:)))
+
+ (pass-if "non-empty - char - trailing delimiters"
+ (equal? '("foo" "bar" "frob" "" "")
+ (string-split "foo:bar:frob::" #\:)))
+
+ (pass-if "empty string - charset"
+ (equal? '("")
+ (string-split "" (char-set #\:))))
+
+ (pass-if "non-empty - charset - no delimiters"
+ (equal? '("foobarfrob")
+ (string-split "foobarfrob" (char-set #\:))))
+
+ (pass-if "non-empty - charset - delimiters"
+ (equal? '("foo" "bar" "frob")
+ (string-split "foo:bar:frob" (char-set #\:))))
+
+ (pass-if "non-empty - charset - leading delimiters"
+ (equal? '("" "" "foo" "bar" "frob")
+ (string-split "::foo:bar:frob" (char-set #\:))))
+
+ (pass-if "non-empty - charset - trailing delimiters"
+ (equal? '("foo" "bar" "frob" "" "")
+ (string-split "foo:bar:frob::" (char-set #\:))))
+
+ (pass-if "empty string - pred"
+ (equal? '("")
+ (string-split "" (negate char-alphabetic?))))
+
+ (pass-if "non-empty - pred - no delimiters"
+ (equal? '("foobarfrob")
+ (string-split "foobarfrob" (negate char-alphabetic?))))
+
+ (pass-if "non-empty - pred - delimiters"
+ (equal? '("foo" "bar" "frob")
+ (string-split "foo:bar:frob" (negate char-alphabetic?))))
+
+ (pass-if "non-empty - pred - leading delimiters"
+ (equal? '("" "" "foo" "bar" "frob")
+ (string-split "::foo:bar:frob" (negate char-alphabetic?))))
+
+ (pass-if "non-empty - pred - trailing delimiters"
+ (equal? '("foo" "bar" "frob" "" "")
+ (string-split "foo:bar:frob::" (negate char-alphabetic?)))))
(with-test-prefix "substring-move!"
--
1.7.9
next prev parent reply other threads:[~2012-10-09 3:34 UTC|newest]
Thread overview: 14+ messages / expand[flat|nested] mbox.gz Atom feed top
2012-10-08 11:23 [PATCH] In string-split, add support for character sets and predicates Daniel Hartwig
2012-10-08 15:40 ` Mark H Weaver
2012-10-09 3:34 ` Daniel Hartwig [this message]
2012-10-09 17:48 ` Mark H Weaver
2012-10-10 1:37 ` Daniel Hartwig
2012-10-10 2:14 ` Mark H Weaver
2012-10-10 3:15 ` Daniel Hartwig
2012-10-10 3:25 ` Mark H Weaver
2012-10-10 3:28 ` Daniel Hartwig
2012-10-10 7:59 ` Mark H Weaver
2012-10-10 20:44 ` Ludovic Courtès
2012-10-12 6:38 ` Daniel Hartwig
2012-10-12 12:23 ` Mark H Weaver
2012-10-10 20:42 ` Ludovic Courtès
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
List information: https://www.gnu.org/software/guile/
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=CAN3veRccaDPbctZP8g7MYdhY5PLdCNwVb9BxNnM2TFU4d+Eprw@mail.gmail.com \
--to=mandyke@gmail.com \
--cc=guile-devel@gnu.org \
--cc=mhw@netris.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).