[PATCH] Enable utf8->string to take a range

unofficial mirror of guile-devel@gnu.org 
 help / color / mirror / Atom feed

* [PATCH] Enable utf8->string to take a range
@ 2022-01-21  3:23 Vijay Marupudi
  2022-01-21 16:53 ` Maxime Devos
                   ` (3 more replies)
  0 siblings, 4 replies; 14+ messages in thread
From: Vijay Marupudi @ 2022-01-21  3:23 UTC (permalink / raw)
  To: guile-devel

[-- Attachment #1: Type: text/plain, Size: 1288 bytes --]

Hello,

Attached is a patch that allows `utf8->string' to take
additional parameters indicating the start and end indices of the
bytevector that it is converting.

This preserves backwards compatibility by adding the new functionality
to an `scm_utf8_to_string_range' function (open to any alternative name
suggestions) and letting the old `scm_utf8_to_string' function call it.

For my work, I am currently handling bytevectors with large strings
embedded as part of the bytevector. This patch would reduce the need for
spurious allocations and copying to convert part of a bytevector to a
string using pure Scheme. It would also make R7RS compatibility easier,
since the current compatibility module involves copies to a fresh
bytevector.

For example, from module/scheme/base.scm

> (define (%subbytevector bv start end)
>   (define mlen (- end start))
>   (define out (make-bytevector mlen))
>   (bytevector-copy! bv start out 0 mlen)
>   out)
>
> (define (%subbytevector1 bv start)
>   (%subbytevector bv start (bytevector-length bv)))
>
> (define r7:utf8->string
>   (case-lambda*
>     ((bv) (utf8->string bv))
>     ((bv start #:optional (end (bytevector-length bv)))
>      (utf8->string (%subbytevector bv start end)))))


Would appreciate any thoughts and feedback.

~ Vijay


[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 0001-Enable-utf8-string-to-take-a-range.patch --]
[-- Type: text/x-patch, Size: 3635 bytes --]

From 695c2a6189458a292819df8fba659ea488dc0b4e Mon Sep 17 00:00:00 2001
From: Vijay Marupudi <vijay@vijaymarupudi.com>
Date: Thu, 20 Jan 2022 22:19:25 -0500
Subject: [PATCH] Enable utf8->string to take a range

Additionally, adds a scm_utf8_to_string_range function for access from
C.
---
 doc/ref/api-data.texi  |  3 ++-
 libguile/bytevectors.c | 48 +++++++++++++++++++++++++++++++++++-------
 libguile/bytevectors.h |  1 +
 3 files changed, 43 insertions(+), 9 deletions(-)

diff --git a/doc/ref/api-data.texi b/doc/ref/api-data.texi
index b6c2c4d61..1bdd1f7ed 100644
--- a/doc/ref/api-data.texi
+++ b/doc/ref/api-data.texi
@@ -7139,10 +7139,11 @@ UTF-32 (aka. UCS-4) encoding of @var{str}.  For UTF-16 and UTF-32,
 it defaults to big endian.
 @end deffn
 
-@deffn {Scheme Procedure} utf8->string utf
+@deffn {Scheme Procedure} utf8->string utf [start [end]]
 @deffnx {Scheme Procedure} utf16->string utf [endianness]
 @deffnx {Scheme Procedure} utf32->string utf [endianness]
 @deffnx {C Function} scm_utf8_to_string (utf)
+@deffnx {C Function} scm_utf8_to_string_range (utf, start, end)
 @deffnx {C Function} scm_utf16_to_string (utf, endianness)
 @deffnx {C Function} scm_utf32_to_string (utf, endianness)
 Return a newly allocated string that contains from the UTF-8-, UTF-16-,
diff --git a/libguile/bytevectors.c b/libguile/bytevectors.c
index f42fbb427..44a062257 100644
--- a/libguile/bytevectors.c
+++ b/libguile/bytevectors.c
@@ -2094,27 +2094,59 @@ SCM_DEFINE (scm_string_to_utf32, "string->utf32",
   return (str);
 
 
-SCM_DEFINE (scm_utf8_to_string, "utf8->string",
-	    1, 0, 0,
-	    (SCM utf),
+SCM_DEFINE (scm_utf8_to_string_range, "utf8->string",
+	    1, 2, 0,
+	    (SCM utf, SCM start, SCM end),
 	    "Return a newly allocate string that contains from the UTF-8-"
 	    "encoded contents of bytevector @var{utf}.")
-#define FUNC_NAME s_scm_utf8_to_string
+#define FUNC_NAME s_scm_utf8_to_string_range
 {
   SCM str;
   const char *c_utf;
-  size_t c_utf_len = 0;
+  size_t c_start = 0;
+  size_t c_end;
+  size_t c_len;
 
   SCM_VALIDATE_BYTEVECTOR (1, utf);
-
-  c_utf_len = SCM_BYTEVECTOR_LENGTH (utf);
   c_utf = (char *) SCM_BYTEVECTOR_CONTENTS (utf);
-  str = scm_from_utf8_stringn (c_utf, c_utf_len);
+  c_len = SCM_BYTEVECTOR_LENGTH(utf);
+  c_end = c_len;
+
+  if (!scm_is_eq (start, SCM_UNDEFINED))
+    {
+      c_start = scm_to_size_t (start);
+      if (SCM_UNLIKELY (c_start >= c_len))
+        {
+          scm_out_of_range (FUNC_NAME, start);
+        }
+
+      if (!scm_is_eq (end, SCM_UNDEFINED))
+	{
+	  c_end = scm_to_size_t (end);
+	  if (SCM_UNLIKELY (c_end > c_len))
+	    scm_out_of_range (FUNC_NAME, end);
+	}
+    }
+
+  if (SCM_UNLIKELY(c_end < c_start)) {
+    scm_out_of_range (FUNC_NAME, end);
+  }
+
+  str = scm_from_utf8_stringn (c_utf + c_start, c_end - c_start);
 
   return (str);
 }
 #undef FUNC_NAME
 
+SCM
+scm_utf8_to_string(SCM utf)
+#define FUNC_NAME s_scm_utf8_to_string
+{
+  return scm_utf8_to_string_range(utf, SCM_UNDEFINED, SCM_UNDEFINED);
+}
+#undef FUNC_NAME
+
+
 SCM_DEFINE (scm_utf16_to_string, "utf16->string",
 	    1, 1, 0,
 	    (SCM utf, SCM endianness),
diff --git a/libguile/bytevectors.h b/libguile/bytevectors.h
index 980d6e267..82a66ee5e 100644
--- a/libguile/bytevectors.h
+++ b/libguile/bytevectors.h
@@ -113,6 +113,7 @@ SCM_API SCM scm_string_to_utf8 (SCM);
 SCM_API SCM scm_string_to_utf16 (SCM, SCM);
 SCM_API SCM scm_string_to_utf32 (SCM, SCM);
 SCM_API SCM scm_utf8_to_string (SCM);
+SCM_API SCM scm_utf8_to_string_range (SCM, SCM, SCM);
 SCM_API SCM scm_utf16_to_string (SCM, SCM);
 SCM_API SCM scm_utf32_to_string (SCM, SCM);
 
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH] Enable utf8->string to take a range
  2022-01-21  3:23 [PATCH] Enable utf8->string to take a range Vijay Marupudi
@ 2022-01-21 16:53 ` Maxime Devos
  2022-01-21 16:54 ` Maxime Devos
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 14+ messages in thread
From: Maxime Devos @ 2022-01-21 16:53 UTC (permalink / raw)
  To: Vijay Marupudi, guile-devel

[-- Attachment #1: Type: text/plain, Size: 738 bytes --]

Vijay Marupudi schreef op do 20-01-2022 om 22:23 [-0500]:
> +      c_start = scm_to_size_t (start);
> +      if (SCM_UNLIKELY (c_start >= c_len))
> +        {
> +          scm_out_of_range (FUNC_NAME, start);
> +        }
> +
> +      if (!scm_is_eq (end, SCM_UNDEFINED))
> +	{
> +	  c_end = scm_to_size_t (end);
> +	  if (SCM_UNLIKELY (c_end > c_len))
> +	    scm_out_of_range (FUNC_NAME, end);

IIUC, this will cause an out-of-range error for the following:

(utf8->string "" 0 0)

However, the following works:

(substring "" 0 0) ; -> empty string

There seems to be an inconsistency here.  Can (c_start >= c_len) be
relaxed to c_start > c_len?

Greetings,
Maxime.

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 260 bytes --]

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] Enable utf8->string to take a range
  2022-01-21  3:23 [PATCH] Enable utf8->string to take a range Vijay Marupudi
  2022-01-21 16:53 ` Maxime Devos
@ 2022-01-21 16:54 ` Maxime Devos
  2022-01-21 16:55 ` Maxime Devos
  2022-01-21 17:04 ` Maxime Devos
  3 siblings, 0 replies; 14+ messages in thread
From: Maxime Devos @ 2022-01-21 16:54 UTC (permalink / raw)
  To: Vijay Marupudi, guile-devel

[-- Attachment #1: Type: text/plain, Size: 1151 bytes --]

Vijay Marupudi schreef op do 20-01-2022 om 22:23 [-0500]:
> +@deffn {Scheme Procedure} utf8->string utf [start [end]]
>  @deffnx {Scheme Procedure} utf16->string utf [endianness]
>  @deffnx {Scheme Procedure} utf32->string utf [endianness]
>  @deffnx {C Function} scm_utf8_to_string (utf)
> +@deffnx {C Function} scm_utf8_to_string_range (utf, start, end)

It would be nice to document if it's an open, closed or half-
open/closed range.  E.g. see the documentation of 'substring':

 -- Scheme Procedure: substring str start [end]
 -- C Function: scm_substring (str, start, end)
     Return a new string formed from the characters of STR beginning
     with index START (inclusive) and ending with index END
(exclusive).
     STR must be a string, START and END must be exact integers
     satisfying:

     0 <= START <= END <= ‘(string-length STR)’.

     The returned string shares storage with STR initially, but it is
     copied as soon as one of the two strings is modified.

It seems a bit weird to support [start] and [end] for utf8->string but
not for utf16->string and utf32->string.

Greetings,
Maxime. 

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 260 bytes --]

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] Enable utf8->string to take a range
  2022-01-21  3:23 [PATCH] Enable utf8->string to take a range Vijay Marupudi
  2022-01-21 16:53 ` Maxime Devos
  2022-01-21 16:54 ` Maxime Devos
@ 2022-01-21 16:55 ` Maxime Devos
  2022-01-21 17:04 ` Maxime Devos
  3 siblings, 0 replies; 14+ messages in thread
From: Maxime Devos @ 2022-01-21 16:55 UTC (permalink / raw)
  To: Vijay Marupudi, guile-devel

[-- Attachment #1: Type: text/plain, Size: 232 bytes --]

Vijay Marupudi schreef op do 20-01-2022 om 22:23 [-0500]:
> --- a/libguile/bytevectors.c
> +++ b/libguile/bytevectors.c
> [...]

Boundary conditions can be tricky, I would recommend writing some
tests.

Greetings,
Maxime.

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 260 bytes --]

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] Enable utf8->string to take a range
  2022-01-21  3:23 [PATCH] Enable utf8->string to take a range Vijay Marupudi
                   ` (2 preceding siblings ...)
  2022-01-21 16:55 ` Maxime Devos
@ 2022-01-21 17:04 ` Maxime Devos
  2022-01-21 20:20   ` Vijay Marupudi
  3 siblings, 1 reply; 14+ messages in thread
From: Maxime Devos @ 2022-01-21 17:04 UTC (permalink / raw)
  To: Vijay Marupudi, guile-devel

[-- Attachment #1: Type: text/plain, Size: 288 bytes --]

Vijay Marupudi schreef op do 20-01-2022 om 22:23 [-0500]:
> +      c_start = scm_to_size_t (start);

This seems suboptimal because if start > SIZE_MAX,
then this will throw an 'out-of-range' exception without attributing
it to 'utf8->string' (untested).

Greetings,
Maxime.

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 260 bytes --]

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] Enable utf8->string to take a range
  2022-01-21 17:04 ` Maxime Devos
@ 2022-01-21 20:20   ` Vijay Marupudi
  2022-01-21 22:08     ` Maxime Devos
  0 siblings, 1 reply; 14+ messages in thread
From: Vijay Marupudi @ 2022-01-21 20:20 UTC (permalink / raw)
  To: Maxime Devos, guile-devel

[-- Attachment #1: Type: text/plain, Size: 1043 bytes --]

> There seems to be an inconsistency here.  Can (c_start >= c_len) be
> relaxed to c_start > c_len?

Done. `substring' was a useful reference.
 
> It would be nice to document if it's an open, closed or half-
> open/closed range.  E.g. see the documentation of 'substring':

Done.

> It seems a bit weird to support [start] and [end] for utf8->string but
> not for utf16->string and utf32->string.

I will gladly implement those too. I wanted feedback on the code before
I implemented all of them, which you provided, thank you. I still want
thoughts on the name of the new C function. If that is okay, then I can
implement the rest following the same name pattern.

>> +      c_start = scm_to_size_t (start);
>
> This seems suboptimal because if start > SIZE_MAX,
> then this will throw an 'out-of-range' exception without attributing
> it to 'utf8->string' (untested).

Switched to scm_to_unsigned_integer, that does bounds checks. This is
what `substring' does.

The updated patch is attached.

~ Vijay


[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 0001-Enable-utf8-string-to-take-a-range.patch --]
[-- Type: text/x-patch, Size: 6329 bytes --]

From 61b4b444eec1a8825d54604cbcb5a68bcfa9cef5 Mon Sep 17 00:00:00 2001
From: Vijay Marupudi <vijay@vijaymarupudi.com>
Date: Thu, 20 Jan 2022 22:19:25 -0500
Subject: [PATCH] Enable utf8->string to take a range

Additionally, adds a scm_utf8_to_string_range function for access from
C. Behaves like substring.

* doc/ref/api-data.texi: Updated documentation to reflect new function
  and range constraints
* libguile/bytevectors.c: Added new function.
* libguile/bytevectors.h: Added new function declaration.
* test-suite/tests/bytevectors.test: Added tests for exceptions and
  behavior for edge cases
---
 doc/ref/api-data.texi             |  8 +++-
 libguile/bytevectors.c            | 66 ++++++++++++++++++++++++++-----
 libguile/bytevectors.h            |  1 +
 test-suite/tests/bytevectors.test | 27 +++++++++++++
 4 files changed, 91 insertions(+), 11 deletions(-)

diff --git a/doc/ref/api-data.texi b/doc/ref/api-data.texi
index b6c2c4d61..0206435d3 100644
--- a/doc/ref/api-data.texi
+++ b/doc/ref/api-data.texi
@@ -7139,16 +7139,22 @@ UTF-32 (aka. UCS-4) encoding of @var{str}.  For UTF-16 and UTF-32,
 it defaults to big endian.
 @end deffn
 
-@deffn {Scheme Procedure} utf8->string utf
+@deffn {Scheme Procedure} utf8->string utf [start [end]]
 @deffnx {Scheme Procedure} utf16->string utf [endianness]
 @deffnx {Scheme Procedure} utf32->string utf [endianness]
 @deffnx {C Function} scm_utf8_to_string (utf)
+@deffnx {C Function} scm_utf8_to_string_range (utf, start, end)
 @deffnx {C Function} scm_utf16_to_string (utf, endianness)
 @deffnx {C Function} scm_utf32_to_string (utf, endianness)
 Return a newly allocated string that contains from the UTF-8-, UTF-16-,
 or UTF-32-decoded contents of bytevector @var{utf}.  For UTF-16 and UTF-32,
 @var{endianness} should be the symbol @code{big} or @code{little}; when omitted,
 it defaults to big endian.
+
+@var{start} and @var{end}, when provided, must be exact integers
+satisfying:
+
+0 <= @var{start} <= @var{end} <= @code{(bytevector-length @var{utf})}.
 @end deffn
 
 @node Bytevectors as Arrays
diff --git a/libguile/bytevectors.c b/libguile/bytevectors.c
index f42fbb427..3e128e667 100644
--- a/libguile/bytevectors.c
+++ b/libguile/bytevectors.c
@@ -2094,27 +2094,73 @@ SCM_DEFINE (scm_string_to_utf32, "string->utf32",
   return (str);
 
 
-SCM_DEFINE (scm_utf8_to_string, "utf8->string",
-	    1, 0, 0,
-	    (SCM utf),
-	    "Return a newly allocate string that contains from the UTF-8-"
-	    "encoded contents of bytevector @var{utf}.")
-#define FUNC_NAME s_scm_utf8_to_string
+static inline void
+validate_bytevector_range(const char* function_name, size_t len, size_t start, size_t end) {
+  if (SCM_UNLIKELY (start > len))
+    {
+      scm_out_of_range (function_name, scm_from_size_t(start));
+    }
+  if (SCM_UNLIKELY (end > len))
+    {
+      scm_out_of_range (function_name, scm_from_size_t(end));
+    }
+  if (SCM_UNLIKELY(end < start))
+    {
+      scm_out_of_range (function_name, scm_from_size_t(end));
+    }
+}
+
+
+SCM_DEFINE (scm_utf8_to_string_range, "utf8->string",
+            1, 2, 0,
+            (SCM utf, SCM start, SCM end),
+            "Return a newly allocate string that contains from the UTF-8-"
+            "encoded contents of bytevector @var{utf}.")
+#define FUNC_NAME s_scm_utf8_to_string_range
 {
   SCM str;
   const char *c_utf;
-  size_t c_utf_len = 0;
+  size_t c_start;
+  size_t c_end;
+  size_t c_len;
 
   SCM_VALIDATE_BYTEVECTOR (1, utf);
-
-  c_utf_len = SCM_BYTEVECTOR_LENGTH (utf);
   c_utf = (char *) SCM_BYTEVECTOR_CONTENTS (utf);
-  str = scm_from_utf8_stringn (c_utf, c_utf_len);
+  c_len = SCM_BYTEVECTOR_LENGTH(utf);
 
+  if (!scm_is_eq (start, SCM_UNDEFINED))
+    {
+      c_start = scm_to_unsigned_integer (start, 0, c_len);
+    }
+  else
+    {
+      c_start = 0;
+    }
+
+  if (!scm_is_eq (end, SCM_UNDEFINED))
+    {
+      c_end = scm_to_unsigned_integer (end, 0, c_len);
+    }
+  else
+    {
+      c_end = c_len;
+    }
+
+  validate_bytevector_range(FUNC_NAME, c_len, c_start, c_end);
+  str = scm_from_utf8_stringn (c_utf + c_start, c_end - c_start);
   return (str);
 }
 #undef FUNC_NAME
 
+SCM
+scm_utf8_to_string(SCM utf)
+#define FUNC_NAME s_scm_utf8_to_string
+{
+  return scm_utf8_to_string_range(utf, SCM_UNDEFINED, SCM_UNDEFINED);
+}
+#undef FUNC_NAME
+
+
 SCM_DEFINE (scm_utf16_to_string, "utf16->string",
 	    1, 1, 0,
 	    (SCM utf, SCM endianness),
diff --git a/libguile/bytevectors.h b/libguile/bytevectors.h
index 980d6e267..82a66ee5e 100644
--- a/libguile/bytevectors.h
+++ b/libguile/bytevectors.h
@@ -113,6 +113,7 @@ SCM_API SCM scm_string_to_utf8 (SCM);
 SCM_API SCM scm_string_to_utf16 (SCM, SCM);
 SCM_API SCM scm_string_to_utf32 (SCM, SCM);
 SCM_API SCM scm_utf8_to_string (SCM);
+SCM_API SCM scm_utf8_to_string_range (SCM, SCM, SCM);
 SCM_API SCM scm_utf16_to_string (SCM, SCM);
 SCM_API SCM scm_utf32_to_string (SCM, SCM);
 
diff --git a/test-suite/tests/bytevectors.test b/test-suite/tests/bytevectors.test
index 732aadb3e..08719703a 100644
--- a/test-suite/tests/bytevectors.test
+++ b/test-suite/tests/bytevectors.test
@@ -558,6 +558,33 @@
       exception:decoding-error
     (utf8->string #vu8(104 105 239 191 50)))
 
+  (pass-if "utf8->string range: start provided"
+    (let* ((utf8 (string->utf8 "gnu guile"))
+           (str (utf8->string utf8 4)))
+      (string=? str "guile")))
+
+  (pass-if "utf8->string range: start and end provided"
+    (let* ((utf8 (string->utf8 "gnu guile"))
+           (str (utf8->string utf8 4 7)))
+      (string=? str "gui")))
+
+  (pass-if "utf8->string range: start = end = 0"
+    (let* ((utf8 (string->utf8 "gnu guile"))
+           (str (utf8->string utf8 0 0)))
+      (string=? str "")))
+
+  (pass-if-exception "utf8->string range: start > len"
+      exception:out-of-range
+    (let* ((utf8 (string->utf8 "four")))
+      ;; 4 as start is expected to return an empty string, in congruence
+      ;; with `substring'.
+      (utf8->string utf8 5)))
+
+  (pass-if-exception "utf8->string range: end < start"
+      exception:out-of-range
+      (let* ((utf8 (string->utf8 "gnu guile")))
+        (utf8->string utf8 1 0)))
+
   (pass-if "utf16->string"
     (let* ((utf16  (uint-list->bytevector (map char->integer
                                                (string->list "hello, world"))
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH] Enable utf8->string to take a range
  2022-01-21 20:20   ` Vijay Marupudi
@ 2022-01-21 22:08     ` Maxime Devos
  2022-01-22  1:21       ` Vijay Marupudi
  0 siblings, 1 reply; 14+ messages in thread
From: Maxime Devos @ 2022-01-21 22:08 UTC (permalink / raw)
  To: Vijay Marupudi, guile-devel

[-- Attachment #1.1: Type: text/plain, Size: 1169 bytes --]

Vijay Marupudi schreef op vr 21-01-2022 om 15:20 [-0500]:
+  (pass-if-exception "utf8->string range: end < start"
+      exception:out-of-range
+      (let* ((utf8 (string->utf8 "gnu guile")))
+        (utf8->string utf8 1 0)))
+  [other tests]

It would be nice to check multibyte characters as well,
to verify that byte indices and not character indices are used.

E.g., (utf8->string #vu8(195 169) 0 2) should return "é".

Another nice test: (utf8->string #vu8(195 169) 0 1) should raise
a 'decoding-error', even though #vu8(195 169) is valid UTF-8.

And (utf8->string #vu8(0 32 196) 0 2) should return "\x00 " even
though #vu8(0 32 196) is invalid UTF-8 -- and as a bonus, it checks
that the nul character is supported -- which can be easily forgotten
because Guile is implemented in C which usually terminates strings
by zero instead of using a length field.

Overall, the patch you sent seems a reasonable approach to me, though
I didn't verify the details.  I find myself at times copying a part
of a bytevector to a new bytevector because some procedure doesn't
allow specifying byte ranges ...

Greetings,
Maxime

[-- Attachment #1.2: Type: text/html, Size: 1597 bytes --]

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 260 bytes --]

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] Enable utf8->string to take a range
  2022-01-21 22:08     ` Maxime Devos
@ 2022-01-22  1:21       ` Vijay Marupudi
  2022-03-09 13:20         ` Maxime Devos
                           ` (2 more replies)
  0 siblings, 3 replies; 14+ messages in thread
From: Vijay Marupudi @ 2022-01-22  1:21 UTC (permalink / raw)
  To: Maxime Devos, guile-devel

[-- Attachment #1: Type: text/plain, Size: 1828 bytes --]

> It would be nice to check multibyte characters as well,
> to verify that byte indices and not character indices are used.
>
> E.g., (utf8->string #vu8(195 169) 0 2) should return "é".
>
> Another nice test: (utf8->string #vu8(195 169) 0 1) should raise
> a 'decoding-error', even though #vu8(195 169) is valid UTF-8.
>
> And (utf8->string #vu8(0 32 196) 0 2) should return "\x00 " even
> though #vu8(0 32 196) is invalid UTF-8 -- and as a bonus, it checks
> that the nul character is supported -- which can be easily forgotten
> because Guile is implemented in C which usually terminates strings
> by zero instead of using a length field.

Thank you for the suggestions. I have added all the tests you suggested
to the test suite, and they all pass.

> Overall, the patch you sent seems a reasonable approach to me, though
> I didn't verify the details. I find myself at times copying a part of
> a bytevector to a new bytevector because some procedure doesn't allow
> specifying byte ranges ...

I'm glad it will be useful for you!

In addition to those tests, I have added the range functionality to both
utf16->string, and utf32->string. I have updated the documentation, and
the tests pass. I have also changed the name of the functions to
emphasize that they are a range on the bytevector (not the string). The
new C functions are the following.

SCM scm_utf8_range_to_string (SCM, SCM, SCM);
SCM scm_utf16_range_to_string (SCM, SCM, SCM, SCM);
SCM scm_utf32_range_to_string (SCM, SCM, SCM, SCM);

In a separate patch, I have removed the wrapper function for R7RS
compatibility and have exported the new changed utf8->string function. I
have removed a function that was not being used anywhere in the process.

I have attached the edited patch, and the new R7RS patch.

~ Vijay


[-- Attachment #2: 0001-Allow-utf8-string-utf16-string-utf32-string-to-take-.patch --]
[-- Type: text/x-patch, Size: 12490 bytes --]

From c6be127b4818d43a0244592c18a52de113d3ff08 Mon Sep 17 00:00:00 2001
From: Vijay Marupudi <vijay@vijaymarupudi.com>
Date: Thu, 20 Jan 2022 22:19:25 -0500
Subject: [PATCH 1/2] Allow utf8->string, utf16->string, utf32->string to take
 ranges

Added the following new functions, that behave like substring, but for
bytevector to string conversion.

scm_utf8_range_to_string (SCM, SCM, SCM);
scm_utf16_range_to_string (SCM, SCM, SCM, SCM);
scm_utf32_range_to_string (SCM, SCM, SCM, SCM);

* doc/ref/api-data.texi: Updated documentation to reflect new function
  and range constraints
* libguile/bytevectors.c: Added new function.
* libguile/bytevectors.h: Added new function declaration.
* test-suite/tests/bytevectors.test: Added tests for exceptions and
  behavior for edge cases
---
 doc/ref/api-data.texi             |  15 +++-
 libguile/bytevectors.c            | 144 +++++++++++++++++++++++-------
 libguile/bytevectors.h            |   3 +
 test-suite/tests/bytevectors.test |  37 ++++++++
 4 files changed, 164 insertions(+), 35 deletions(-)

diff --git a/doc/ref/api-data.texi b/doc/ref/api-data.texi
index b6c2c4d61..44b64454f 100644
--- a/doc/ref/api-data.texi
+++ b/doc/ref/api-data.texi
@@ -7139,16 +7139,25 @@ UTF-32 (aka. UCS-4) encoding of @var{str}.  For UTF-16 and UTF-32,
 it defaults to big endian.
 @end deffn
 
-@deffn {Scheme Procedure} utf8->string utf
-@deffnx {Scheme Procedure} utf16->string utf [endianness]
-@deffnx {Scheme Procedure} utf32->string utf [endianness]
+@deffn {Scheme Procedure} utf8->string utf [start [end]]
+@deffnx {Scheme Procedure} utf16->string utf [endianness [start [end]]]
+@deffnx {Scheme Procedure} utf32->string utf [endianness [start [end]]]
 @deffnx {C Function} scm_utf8_to_string (utf)
+@deffnx {C Function} scm_utf8_range_to_string (utf, start, end)
 @deffnx {C Function} scm_utf16_to_string (utf, endianness)
+@deffnx {C Function} scm_utf16_range_to_string (utf, endianness, start, end)
 @deffnx {C Function} scm_utf32_to_string (utf, endianness)
+@deffnx {C Function} scm_utf32_range_to_string (utf, endianness, start, end)
+
 Return a newly allocated string that contains from the UTF-8-, UTF-16-,
 or UTF-32-decoded contents of bytevector @var{utf}.  For UTF-16 and UTF-32,
 @var{endianness} should be the symbol @code{big} or @code{little}; when omitted,
 it defaults to big endian.
+
+@var{start} and @var{end}, when provided, must be exact integers
+satisfying:
+
+0 <= @var{start} <= @var{end} <= @code{(bytevector-length @var{utf})}.
 @end deffn
 
 @node Bytevectors as Arrays
diff --git a/libguile/bytevectors.c b/libguile/bytevectors.c
index f42fbb427..12d299042 100644
--- a/libguile/bytevectors.c
+++ b/libguile/bytevectors.c
@@ -2061,25 +2061,46 @@ SCM_DEFINE (scm_string_to_utf32, "string->utf32",
 
 /* Produce the body of a function that converts a UTF-encoded bytevector to a
    string.  */
-#define UTF_TO_STRING(_utf_width)					\
+#define UTF_TO_STRING(_utf_width, utf, endianness, start, end)          \
   SCM str = SCM_BOOL_F;							\
   int err;								\
   char *c_str = NULL;                                                   \
   char c_utf_name[MAX_UTF_ENCODING_NAME_LEN];				\
   char *c_utf;                                                          \
-  size_t c_strlen = 0, c_utf_len = 0;					\
+  size_t c_strlen = 0, c_utf_len, c_start, c_end;                       \
 									\
-  SCM_VALIDATE_BYTEVECTOR (1, utf);					\
-  if (scm_is_eq (endianness, SCM_UNDEFINED))                            \
-    endianness = sym_big;						\
+  SCM_VALIDATE_BYTEVECTOR (1, (utf));					\
+  if (scm_is_eq ((endianness), SCM_UNDEFINED))                          \
+    (endianness) = sym_big;						\
   else									\
-    SCM_VALIDATE_SYMBOL (2, endianness);				\
+    SCM_VALIDATE_SYMBOL (2, (endianness));				\
 									\
-  c_utf_len = SCM_BYTEVECTOR_LENGTH (utf);				\
-  c_utf = (char *) SCM_BYTEVECTOR_CONTENTS (utf);			\
-  utf_encoding_name (c_utf_name, (_utf_width), endianness);		\
+  c_utf_len = SCM_BYTEVECTOR_LENGTH ((utf));				\
+  c_utf = (char *) SCM_BYTEVECTOR_CONTENTS ((utf));			\
+  utf_encoding_name (c_utf_name, (_utf_width), (endianness));		\
+                                                                        \
+  if (!scm_is_eq ((start), SCM_UNDEFINED))                              \
+    {                                                                   \
+      c_start = scm_to_unsigned_integer ((start), 0, c_utf_len);        \
+    }                                                                   \
+  else                                                                  \
+    {                                                                   \
+      c_start = 0;                                                      \
+    }                                                                   \
+                                                                        \
+  if (!scm_is_eq ((end), SCM_UNDEFINED))                                \
+    {                                                                   \
+      c_end = scm_to_unsigned_integer ((end), 0, c_utf_len);            \
+    }                                                                   \
+  else                                                                  \
+    {                                                                   \
+      c_end = c_utf_len;                                                \
+    }                                                                   \
+                                                                        \
+  validate_bytevector_range(FUNC_NAME, c_utf_len, c_start, c_end);      \
+                                                                        \
 									\
-  err = mem_iconveh (c_utf, c_utf_len,					\
+  err = mem_iconveh (c_utf + c_start, c_end - c_start,                  \
 		     c_utf_name, "UTF-8",				\
 		     iconveh_question_mark, NULL,			\
 		     &c_str, &c_strlen);				\
@@ -2094,46 +2115,105 @@ SCM_DEFINE (scm_string_to_utf32, "string->utf32",
   return (str);
 
 
-SCM_DEFINE (scm_utf8_to_string, "utf8->string",
-	    1, 0, 0,
-	    (SCM utf),
-	    "Return a newly allocate string that contains from the UTF-8-"
-	    "encoded contents of bytevector @var{utf}.")
-#define FUNC_NAME s_scm_utf8_to_string
+static inline void
+validate_bytevector_range(const char* function_name, size_t len, size_t start, size_t end) {
+  if (SCM_UNLIKELY (start > len))
+    {
+      scm_out_of_range (function_name, scm_from_size_t(start));
+    }
+  if (SCM_UNLIKELY (end > len))
+    {
+      scm_out_of_range (function_name, scm_from_size_t(end));
+    }
+  if (SCM_UNLIKELY(end < start))
+    {
+      scm_out_of_range (function_name, scm_from_size_t(end));
+    }
+}
+
+
+SCM_DEFINE (scm_utf8_range_to_string, "utf8->string",
+            1, 2, 0,
+            (SCM utf, SCM start, SCM end),
+            "Return a newly allocate string that contains from the UTF-8-"
+            "encoded contents of bytevector @var{utf}.")
+#define FUNC_NAME s_scm_utf8_range_to_string
 {
   SCM str;
   const char *c_utf;
-  size_t c_utf_len = 0;
+  size_t c_start;
+  size_t c_end;
+  size_t c_len;
 
   SCM_VALIDATE_BYTEVECTOR (1, utf);
-
-  c_utf_len = SCM_BYTEVECTOR_LENGTH (utf);
   c_utf = (char *) SCM_BYTEVECTOR_CONTENTS (utf);
-  str = scm_from_utf8_stringn (c_utf, c_utf_len);
+  c_len = SCM_BYTEVECTOR_LENGTH(utf);
+
+  if (!scm_is_eq (start, SCM_UNDEFINED))
+    {
+      c_start = scm_to_unsigned_integer (start, 0, c_len);
+    }
+  else
+    {
+      c_start = 0;
+    }
 
+  if (!scm_is_eq (end, SCM_UNDEFINED))
+    {
+      c_end = scm_to_unsigned_integer (end, 0, c_len);
+    }
+  else
+    {
+      c_end = c_len;
+    }
+
+  validate_bytevector_range(FUNC_NAME, c_len, c_start, c_end);
+  str = scm_from_utf8_stringn (c_utf + c_start, c_end - c_start);
   return (str);
 }
 #undef FUNC_NAME
 
-SCM_DEFINE (scm_utf16_to_string, "utf16->string",
-	    1, 1, 0,
-	    (SCM utf, SCM endianness),
-	    "Return a newly allocate string that contains from the UTF-16-"
-	    "encoded contents of bytevector @var{utf}.")
+SCM
+scm_utf8_to_string(SCM utf)
+#define FUNC_NAME s_scm_utf8_to_string
+{
+  return scm_utf8_range_to_string(utf, SCM_UNDEFINED, SCM_UNDEFINED);
+}
+#undef FUNC_NAME
+
+SCM_DEFINE (scm_utf16_range_to_string, "utf16->string",
+            1, 3, 0,
+            (SCM utf, SCM endianness, SCM start, SCM end),
+            "Return a newly allocate string that contains from the UTF-8-"
+            "encoded contents of bytevector @var{utf}.")
+#define FUNC_NAME s_scm_utf16_range_to_string
+{
+  UTF_TO_STRING(16, utf, endianness, start, end);
+}
+#undef FUNC_NAME
+
+SCM scm_utf16_to_string (SCM utf, SCM endianness)
 #define FUNC_NAME s_scm_utf16_to_string
 {
-  UTF_TO_STRING (16);
+  return scm_utf16_range_to_string(utf, endianness, SCM_UNDEFINED, SCM_UNDEFINED);
 }
 #undef FUNC_NAME
 
-SCM_DEFINE (scm_utf32_to_string, "utf32->string",
-	    1, 1, 0,
-	    (SCM utf, SCM endianness),
-	    "Return a newly allocate string that contains from the UTF-32-"
-	    "encoded contents of bytevector @var{utf}.")
+SCM_DEFINE (scm_utf32_range_to_string, "utf32->string",
+            1, 3, 0,
+            (SCM utf, SCM endianness, SCM start, SCM end),
+            "Return a newly allocate string that contains from the UTF-8-"
+            "encoded contents of bytevector @var{utf}.")
+#define FUNC_NAME s_scm_utf32_range_to_string
+{
+  UTF_TO_STRING(32, utf, endianness, start, end);
+}
+#undef FUNC_NAME
+
+SCM scm_utf32_to_string (SCM utf, SCM endianness)
 #define FUNC_NAME s_scm_utf32_to_string
 {
-  UTF_TO_STRING (32);
+  return scm_utf32_range_to_string(utf, endianness, SCM_UNDEFINED, SCM_UNDEFINED);
 }
 #undef FUNC_NAME
 
diff --git a/libguile/bytevectors.h b/libguile/bytevectors.h
index 980d6e267..63d8e3119 100644
--- a/libguile/bytevectors.h
+++ b/libguile/bytevectors.h
@@ -113,8 +113,11 @@ SCM_API SCM scm_string_to_utf8 (SCM);
 SCM_API SCM scm_string_to_utf16 (SCM, SCM);
 SCM_API SCM scm_string_to_utf32 (SCM, SCM);
 SCM_API SCM scm_utf8_to_string (SCM);
+SCM_API SCM scm_utf8_range_to_string (SCM, SCM, SCM);
 SCM_API SCM scm_utf16_to_string (SCM, SCM);
+SCM_API SCM scm_utf16_range_to_string (SCM, SCM, SCM, SCM);
 SCM_API SCM scm_utf32_to_string (SCM, SCM);
+SCM_API SCM scm_utf32_range_to_string (SCM, SCM, SCM, SCM);
 
 
 \f
diff --git a/test-suite/tests/bytevectors.test b/test-suite/tests/bytevectors.test
index 732aadb3e..f8c6a8df1 100644
--- a/test-suite/tests/bytevectors.test
+++ b/test-suite/tests/bytevectors.test
@@ -558,6 +558,43 @@
       exception:decoding-error
     (utf8->string #vu8(104 105 239 191 50)))
 
+  (pass-if "utf8->string range: start provided"
+    (let* ((utf8 (string->utf8 "gnu guile"))
+           (str (utf8->string utf8 4)))
+      (string=? str "guile")))
+
+  (pass-if "utf8->string range: start and end provided"
+    (let* ((utf8 (string->utf8 "gnu guile"))
+           (str (utf8->string utf8 4 7)))
+      (string=? str "gui")))
+
+  (pass-if "utf8->string range: start = end = 0"
+    (let* ((utf8 (string->utf8 "gnu guile"))
+           (str (utf8->string utf8 0 0)))
+      (string=? str "")))
+
+  (pass-if-exception "utf8->string range: start > len"
+      exception:out-of-range
+    (let* ((utf8 (string->utf8 "four")))
+      ;; 4 as start is expected to return an empty string, in congruence
+      ;; with `substring'.
+      (utf8->string utf8 5)))
+
+  (pass-if-exception "utf8->string range: end < start"
+      exception:out-of-range
+      (let* ((utf8 (string->utf8 "gnu guile")))
+        (utf8->string utf8 1 0)))
+
+  (pass-if "utf8->string range: multibyte characters"
+    (string=? (utf8->string #vu8(195 169 67) 0 2) "é"))
+
+  (pass-if-exception "utf8->string range: decoding error for invalid range"
+      exception:decoding-error
+    (utf8->string #vu8(195 169) 0 1))
+
+  (pass-if "utf8->string range: null byte non-termination"
+    (string=? (utf8->string #vu8(0 32 196) 0 2) "\x00 "))
+
   (pass-if "utf16->string"
     (let* ((utf16  (uint-list->bytevector (map char->integer
                                                (string->list "hello, world"))
-- 
2.34.1


[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #3: 0002-Re-export-utf8-string-instead-of-wrapper-with-extra-.patch --]
[-- Type: text/x-patch, Size: 1951 bytes --]

From 94bbaaf6dc4760eb22bb4b2594648b1f8dbf83a5 Mon Sep 17 00:00:00 2001
From: Vijay Marupudi <vijay@vijaymarupudi.com>
Date: Fri, 21 Jan 2022 20:17:42 -0500
Subject: [PATCH 2/2] Re-export utf8->string instead of wrapper with extra
 allocation.

In addition, remove the redundant function that was dead code.

module/scheme/base.scm: Deleted the wrapper and re-exported the default
utf8->string.
---
 module/scheme/base.scm | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/module/scheme/base.scm b/module/scheme/base.scm
index c6a73c092..28adb2e32 100644
--- a/module/scheme/base.scm
+++ b/module/scheme/base.scm
@@ -60,7 +60,6 @@
             vector-append vector-for-each vector-map
             (r7:bytevector-copy . bytevector-copy)
             (r7:bytevector-copy! . bytevector-copy!)
-            (r7:utf8->string . utf8->string)
             square
             (r7:expt . expt)
             boolean=? symbol=?
@@ -109,7 +108,7 @@
    string-copy string-copy! string-fill! string-for-each
    string-length string-ref string-set! string<=? string<?
    string=? string>=? string>? string? substring symbol->string
-   symbol? syntax-error syntax-rules truncate
+   symbol? utf8->string syntax-error syntax-rules truncate
    truncate-quotient truncate-remainder truncate/
    (char-ready? . u8-ready?)
    unless
@@ -494,9 +493,6 @@
   (bytevector-copy! bv start out 0 mlen)
   out)
 
-(define (%subbytevector1 bv start)
-  (%subbytevector bv start (bytevector-length bv)))
-
 (define r7:bytevector-copy!
   (case-lambda*
    ((to at from #:optional
@@ -512,12 +508,6 @@
     ((bv start #:optional (end (bytevector-length bv)))
      (%subbytevector bv start end))))
 
-(define r7:utf8->string
-  (case-lambda*
-    ((bv) (utf8->string bv))
-    ((bv start #:optional (end (bytevector-length bv)))
-     (utf8->string (%subbytevector bv start end)))))
-
 (define (square x) (* x x))
 
 (define (r7:expt x y)
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH] Enable utf8->string to take a range
  2022-01-22  1:21       ` Vijay Marupudi
@ 2022-03-09 13:20         ` Maxime Devos
  2022-03-09 13:20         ` Maxime Devos
  2022-03-09 13:24         ` Maxime Devos
  2 siblings, 0 replies; 14+ messages in thread
From: Maxime Devos @ 2022-03-09 13:20 UTC (permalink / raw)
  To: Vijay Marupudi, guile-devel

[-- Attachment #1: Type: text/plain, Size: 409 bytes --]

Vijay Marupudi schreef op vr 21-01-2022 om 20:21 [-0500]:
> +SCM_DEFINE (scm_utf16_range_to_string, "utf16->string",
> +            1, 3, 0,
> +            (SCM utf, SCM endianness, SCM start, SCM end),
> +            "Return a newly allocate string that contains from the
> UTF-8-"
> +            "encoded contents of bytevector @var{utf}.")

Did you mean UTF-16?

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 260 bytes --]

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] Enable utf8->string to take a range
  2022-01-22  1:21       ` Vijay Marupudi
  2022-03-09 13:20         ` Maxime Devos
@ 2022-03-09 13:20         ` Maxime Devos
  2022-03-09 13:24         ` Maxime Devos
  2 siblings, 0 replies; 14+ messages in thread
From: Maxime Devos @ 2022-03-09 13:20 UTC (permalink / raw)
  To: Vijay Marupudi, guile-devel

Vijay Marupudi schreef op vr 21-01-2022 om 20:21 [-0500]:
> +SCM_DEFINE (scm_utf8_range_to_string, "utf8->string",
> +            1, 2, 0,
> +            (SCM utf, SCM start, SCM end),
> +            "Return a newly allocate string that contains from the
> UTF-8-"
> +            "encoded contents of bytevector @var{utf}.")





^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] Enable utf8->string to take a range
  2022-01-22  1:21       ` Vijay Marupudi
  2022-03-09 13:20         ` Maxime Devos
  2022-03-09 13:20         ` Maxime Devos
@ 2022-03-09 13:24         ` Maxime Devos
  2022-03-09 13:27           ` Maxime Devos
  2 siblings, 1 reply; 14+ messages in thread
From: Maxime Devos @ 2022-03-09 13:24 UTC (permalink / raw)
  To: Vijay Marupudi, guile-devel

[-- Attachment #1: Type: text/plain, Size: 901 bytes --]

Vijay Marupudi schreef op vr 21-01-2022 om 20:21 [-0500]:
> +SCM_DEFINE (scm_utf8_range_to_string, "utf8->string",
> +            1, 2, 0,
> +            (SCM utf, SCM start, SCM end),
> +            "Return a newly allocate string that contains from the
> UTF-8-"
> +            "encoded contents of bytevector @var{utf}.")

This is incorrect, since the nul character is encoded even though UTF-8
proper does not allow encoding the nul character -- UTF-8 with an
encoding of the nul character is sometimes called ‘modified UTF-8’.

The distinction is sometimes relevant, e.g. the GNS specification asks
for labels to be encoded in UTF-8, and according to the spec writers,
that implies that nul characters are forbidden.

As such, I cannot rely on 'utf8->string' to verify that there aren't
any nul characters.

Greetings,
Maxime.

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 260 bytes --]

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] Enable utf8->string to take a range
  2022-03-09 13:24         ` Maxime Devos
@ 2022-03-09 13:27           ` Maxime Devos
  2022-03-09 13:35             ` Maxime Devos
  0 siblings, 1 reply; 14+ messages in thread
From: Maxime Devos @ 2022-03-09 13:27 UTC (permalink / raw)
  To: Vijay Marupudi, guile-devel

[-- Attachment #1: Type: text/plain, Size: 359 bytes --]

Maxime Devos schreef op wo 09-03-2022 om 14:24 [+0100]:
> This is incorrect, since the nul character is encoded even though
> UTF-
> proper does not allow encoding the nul character -- UTF-8 with an
> encoding of the nul character is sometimes called ‘modified UTF-8’.

That's not quite correct, seems like Guile uses another encoding, but
still.

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 260 bytes --]

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] Enable utf8->string to take a range
  2022-03-09 13:27           ` Maxime Devos
@ 2022-03-09 13:35             ` Maxime Devos
  2022-03-09 14:50               ` Vijay Marupudi
  0 siblings, 1 reply; 14+ messages in thread
From: Maxime Devos @ 2022-03-09 13:35 UTC (permalink / raw)
  To: Vijay Marupudi, guile-devel

[-- Attachment #1: Type: text/plain, Size: 304 bytes --]

Maxime Devos schreef op wo 09-03-2022 om 14:27 [+0100]:
> That's not quite correct, seems like Guile uses another encoding, but
> still.

Never mind, it seems I misinterpreted a comment and #vu8(97 0 98) is
valid UTF-8 after all; it's just not possible to encode it as a zero-
terminated string.

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 260 bytes --]

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] Enable utf8->string to take a range
  2022-03-09 13:35             ` Maxime Devos
@ 2022-03-09 14:50               ` Vijay Marupudi
  0 siblings, 0 replies; 14+ messages in thread
From: Vijay Marupudi @ 2022-03-09 14:50 UTC (permalink / raw)
  To: Maxime Devos, guile-devel

[-- Attachment #1: Type: text/plain, Size: 449 bytes --]

Maxime Devos <maximedevos@telenet.be> writes:

> Nevermind, seems like a misinterpreded a comment and #vu8(97 0 98) is
> valid UTF-8 after all, it's just not possible to encode it as a zero-
> terminated string.

Thanks for catching the typo in the docstrings. I've attached updated
versions of the patches that fix that typo. I'm assuming that no
changes are required in response to the comments about null bytes.

Hoping this can get merged soon!

~ Vijay



[-- Attachment #2: 0001-Allow-utf8-string-utf16-string-utf32-string-to-take-.patch --]
[-- Type: text/x-patch, Size: 12492 bytes --]

From 6f287255456dbefb75a1c2242904c8f0046ad5bb Mon Sep 17 00:00:00 2001
From: Vijay Marupudi <vijay@vijaymarupudi.com>
Date: Thu, 20 Jan 2022 22:19:25 -0500
Subject: [PATCH 1/2] Allow utf8->string, utf16->string, utf32->string to take
 ranges

Added the following new functions, that behave like substring, but for
bytevector to string conversion.

scm_utf8_range_to_string (SCM, SCM, SCM);
scm_utf16_range_to_string (SCM, SCM, SCM, SCM);
scm_utf32_range_to_string (SCM, SCM, SCM, SCM);

* doc/ref/api-data.texi: Updated documentation to reflect new function
  and range constraints
* libguile/bytevectors.c: Added new function.
* libguile/bytevectors.h: Added new function declaration.
* test-suite/tests/bytevectors.test: Added tests for exceptions and
  behavior for edge cases
---
 doc/ref/api-data.texi             |  15 +++-
 libguile/bytevectors.c            | 144 +++++++++++++++++++++++-------
 libguile/bytevectors.h            |   3 +
 test-suite/tests/bytevectors.test |  37 ++++++++
 4 files changed, 164 insertions(+), 35 deletions(-)

diff --git a/doc/ref/api-data.texi b/doc/ref/api-data.texi
index b6c2c4d61..44b64454f 100644
--- a/doc/ref/api-data.texi
+++ b/doc/ref/api-data.texi
@@ -7139,16 +7139,25 @@ UTF-32 (aka. UCS-4) encoding of @var{str}.  For UTF-16 and UTF-32,
 it defaults to big endian.
 @end deffn
 
-@deffn {Scheme Procedure} utf8->string utf
-@deffnx {Scheme Procedure} utf16->string utf [endianness]
-@deffnx {Scheme Procedure} utf32->string utf [endianness]
+@deffn {Scheme Procedure} utf8->string utf [start [end]]
+@deffnx {Scheme Procedure} utf16->string utf [endianness [start [end]]]
+@deffnx {Scheme Procedure} utf32->string utf [endianness [start [end]]]
 @deffnx {C Function} scm_utf8_to_string (utf)
+@deffnx {C Function} scm_utf8_range_to_string (utf, start, end)
 @deffnx {C Function} scm_utf16_to_string (utf, endianness)
+@deffnx {C Function} scm_utf16_range_to_string (utf, endianness, start, end)
 @deffnx {C Function} scm_utf32_to_string (utf, endianness)
+@deffnx {C Function} scm_utf32_range_to_string (utf, endianness, start, end)
+
 Return a newly allocated string that contains from the UTF-8-, UTF-16-,
 or UTF-32-decoded contents of bytevector @var{utf}.  For UTF-16 and UTF-32,
 @var{endianness} should be the symbol @code{big} or @code{little}; when omitted,
 it defaults to big endian.
+
+@var{start} and @var{end}, when provided, must be exact integers
+satisfying:
+
+0 <= @var{start} <= @var{end} <= @code{(bytevector-length @var{utf})}.
 @end deffn
 
 @node Bytevectors as Arrays
diff --git a/libguile/bytevectors.c b/libguile/bytevectors.c
index f42fbb427..4c1f4ce42 100644
--- a/libguile/bytevectors.c
+++ b/libguile/bytevectors.c
@@ -2061,25 +2061,46 @@ SCM_DEFINE (scm_string_to_utf32, "string->utf32",
 
 /* Produce the body of a function that converts a UTF-encoded bytevector to a
    string.  */
-#define UTF_TO_STRING(_utf_width)					\
+#define UTF_TO_STRING(_utf_width, utf, endianness, start, end)          \
   SCM str = SCM_BOOL_F;							\
   int err;								\
   char *c_str = NULL;                                                   \
   char c_utf_name[MAX_UTF_ENCODING_NAME_LEN];				\
   char *c_utf;                                                          \
-  size_t c_strlen = 0, c_utf_len = 0;					\
+  size_t c_strlen = 0, c_utf_len, c_start, c_end;                       \
 									\
-  SCM_VALIDATE_BYTEVECTOR (1, utf);					\
-  if (scm_is_eq (endianness, SCM_UNDEFINED))                            \
-    endianness = sym_big;						\
+  SCM_VALIDATE_BYTEVECTOR (1, (utf));					\
+  if (scm_is_eq ((endianness), SCM_UNDEFINED))                          \
+    (endianness) = sym_big;						\
   else									\
-    SCM_VALIDATE_SYMBOL (2, endianness);				\
+    SCM_VALIDATE_SYMBOL (2, (endianness));				\
 									\
-  c_utf_len = SCM_BYTEVECTOR_LENGTH (utf);				\
-  c_utf = (char *) SCM_BYTEVECTOR_CONTENTS (utf);			\
-  utf_encoding_name (c_utf_name, (_utf_width), endianness);		\
+  c_utf_len = SCM_BYTEVECTOR_LENGTH ((utf));				\
+  c_utf = (char *) SCM_BYTEVECTOR_CONTENTS ((utf));			\
+  utf_encoding_name (c_utf_name, (_utf_width), (endianness));		\
+                                                                        \
+  if (!scm_is_eq ((start), SCM_UNDEFINED))                              \
+    {                                                                   \
+      c_start = scm_to_unsigned_integer ((start), 0, c_utf_len);        \
+    }                                                                   \
+  else                                                                  \
+    {                                                                   \
+      c_start = 0;                                                      \
+    }                                                                   \
+                                                                        \
+  if (!scm_is_eq ((end), SCM_UNDEFINED))                                \
+    {                                                                   \
+      c_end = scm_to_unsigned_integer ((end), 0, c_utf_len);            \
+    }                                                                   \
+  else                                                                  \
+    {                                                                   \
+      c_end = c_utf_len;                                                \
+    }                                                                   \
+                                                                        \
+  validate_bytevector_range(FUNC_NAME, c_utf_len, c_start, c_end);      \
+                                                                        \
 									\
-  err = mem_iconveh (c_utf, c_utf_len,					\
+  err = mem_iconveh (c_utf + c_start, c_end - c_start,                  \
 		     c_utf_name, "UTF-8",				\
 		     iconveh_question_mark, NULL,			\
 		     &c_str, &c_strlen);				\
@@ -2094,46 +2115,105 @@ SCM_DEFINE (scm_string_to_utf32, "string->utf32",
   return (str);
 
 
-SCM_DEFINE (scm_utf8_to_string, "utf8->string",
-	    1, 0, 0,
-	    (SCM utf),
-	    "Return a newly allocate string that contains from the UTF-8-"
-	    "encoded contents of bytevector @var{utf}.")
-#define FUNC_NAME s_scm_utf8_to_string
+static inline void
+validate_bytevector_range(const char* function_name, size_t len, size_t start, size_t end) {
+  if (SCM_UNLIKELY (start > len))
+    {
+      scm_out_of_range (function_name, scm_from_size_t(start));
+    }
+  if (SCM_UNLIKELY (end > len))
+    {
+      scm_out_of_range (function_name, scm_from_size_t(end));
+    }
+  if (SCM_UNLIKELY(end < start))
+    {
+      scm_out_of_range (function_name, scm_from_size_t(end));
+    }
+}
+
+
+SCM_DEFINE (scm_utf8_range_to_string, "utf8->string",
+            1, 2, 0,
+            (SCM utf, SCM start, SCM end),
+            "Return a newly allocate string that contains from the UTF-8-"
+            "encoded contents of bytevector @var{utf}.")
+#define FUNC_NAME s_scm_utf8_range_to_string
 {
   SCM str;
   const char *c_utf;
-  size_t c_utf_len = 0;
+  size_t c_start;
+  size_t c_end;
+  size_t c_len;
 
   SCM_VALIDATE_BYTEVECTOR (1, utf);
-
-  c_utf_len = SCM_BYTEVECTOR_LENGTH (utf);
   c_utf = (char *) SCM_BYTEVECTOR_CONTENTS (utf);
-  str = scm_from_utf8_stringn (c_utf, c_utf_len);
+  c_len = SCM_BYTEVECTOR_LENGTH(utf);
+
+  if (!scm_is_eq (start, SCM_UNDEFINED))
+    {
+      c_start = scm_to_unsigned_integer (start, 0, c_len);
+    }
+  else
+    {
+      c_start = 0;
+    }
 
+  if (!scm_is_eq (end, SCM_UNDEFINED))
+    {
+      c_end = scm_to_unsigned_integer (end, 0, c_len);
+    }
+  else
+    {
+      c_end = c_len;
+    }
+
+  validate_bytevector_range(FUNC_NAME, c_len, c_start, c_end);
+  str = scm_from_utf8_stringn (c_utf + c_start, c_end - c_start);
   return (str);
 }
 #undef FUNC_NAME
 
-SCM_DEFINE (scm_utf16_to_string, "utf16->string",
-	    1, 1, 0,
-	    (SCM utf, SCM endianness),
-	    "Return a newly allocate string that contains from the UTF-16-"
-	    "encoded contents of bytevector @var{utf}.")
+SCM
+scm_utf8_to_string(SCM utf)
+#define FUNC_NAME s_scm_utf8_to_string
+{
+  return scm_utf8_range_to_string(utf, SCM_UNDEFINED, SCM_UNDEFINED);
+}
+#undef FUNC_NAME
+
+SCM_DEFINE (scm_utf16_range_to_string, "utf16->string",
+            1, 3, 0,
+            (SCM utf, SCM endianness, SCM start, SCM end),
+            "Return a newly allocate string that contains from the UTF-16-"
+            "encoded contents of bytevector @var{utf}.")
+#define FUNC_NAME s_scm_utf16_range_to_string
+{
+  UTF_TO_STRING(16, utf, endianness, start, end);
+}
+#undef FUNC_NAME
+
+SCM scm_utf16_to_string (SCM utf, SCM endianness)
 #define FUNC_NAME s_scm_utf16_to_string
 {
-  UTF_TO_STRING (16);
+  return scm_utf16_range_to_string(utf, endianness, SCM_UNDEFINED, SCM_UNDEFINED);
 }
 #undef FUNC_NAME
 
-SCM_DEFINE (scm_utf32_to_string, "utf32->string",
-	    1, 1, 0,
-	    (SCM utf, SCM endianness),
-	    "Return a newly allocate string that contains from the UTF-32-"
-	    "encoded contents of bytevector @var{utf}.")
+SCM_DEFINE (scm_utf32_range_to_string, "utf32->string",
+            1, 3, 0,
+            (SCM utf, SCM endianness, SCM start, SCM end),
+            "Return a newly allocate string that contains from the UTF-32-"
+            "encoded contents of bytevector @var{utf}.")
+#define FUNC_NAME s_scm_utf32_range_to_string
+{
+  UTF_TO_STRING(32, utf, endianness, start, end);
+}
+#undef FUNC_NAME
+
+SCM scm_utf32_to_string (SCM utf, SCM endianness)
 #define FUNC_NAME s_scm_utf32_to_string
 {
-  UTF_TO_STRING (32);
+  return scm_utf32_range_to_string(utf, endianness, SCM_UNDEFINED, SCM_UNDEFINED);
 }
 #undef FUNC_NAME
 
diff --git a/libguile/bytevectors.h b/libguile/bytevectors.h
index 980d6e267..63d8e3119 100644
--- a/libguile/bytevectors.h
+++ b/libguile/bytevectors.h
@@ -113,8 +113,11 @@ SCM_API SCM scm_string_to_utf8 (SCM);
 SCM_API SCM scm_string_to_utf16 (SCM, SCM);
 SCM_API SCM scm_string_to_utf32 (SCM, SCM);
 SCM_API SCM scm_utf8_to_string (SCM);
+SCM_API SCM scm_utf8_range_to_string (SCM, SCM, SCM);
 SCM_API SCM scm_utf16_to_string (SCM, SCM);
+SCM_API SCM scm_utf16_range_to_string (SCM, SCM, SCM, SCM);
 SCM_API SCM scm_utf32_to_string (SCM, SCM);
+SCM_API SCM scm_utf32_range_to_string (SCM, SCM, SCM, SCM);
 
 
 \f
diff --git a/test-suite/tests/bytevectors.test b/test-suite/tests/bytevectors.test
index 732aadb3e..f8c6a8df1 100644
--- a/test-suite/tests/bytevectors.test
+++ b/test-suite/tests/bytevectors.test
@@ -558,6 +558,43 @@
       exception:decoding-error
     (utf8->string #vu8(104 105 239 191 50)))
 
+  (pass-if "utf8->string range: start provided"
+    (let* ((utf8 (string->utf8 "gnu guile"))
+           (str (utf8->string utf8 4)))
+      (string=? str "guile")))
+
+  (pass-if "utf8->string range: start and end provided"
+    (let* ((utf8 (string->utf8 "gnu guile"))
+           (str (utf8->string utf8 4 7)))
+      (string=? str "gui")))
+
+  (pass-if "utf8->string range: start = end = 0"
+    (let* ((utf8 (string->utf8 "gnu guile"))
+           (str (utf8->string utf8 0 0)))
+      (string=? str "")))
+
+  (pass-if-exception "utf8->string range: start > len"
+      exception:out-of-range
+    (let* ((utf8 (string->utf8 "four")))
+      ;; 4 as start is expected to return an empty string, in congruence
+      ;; with `substring'.
+      (utf8->string utf8 5)))
+
+  (pass-if-exception "utf8->string range: end < start"
+      exception:out-of-range
+      (let* ((utf8 (string->utf8 "gnu guile")))
+        (utf8->string utf8 1 0)))
+
+  (pass-if "utf8->string range: multibyte characters"
+    (string=? (utf8->string #vu8(195 169 67) 0 2) "é"))
+
+  (pass-if-exception "utf8->string range: decoding error for invalid range"
+      exception:decoding-error
+    (utf8->string #vu8(195 169) 0 1))
+
+  (pass-if "utf8->string range: null byte non-termination"
+    (string=? (utf8->string #vu8(0 32 196) 0 2) "\x00 "))
+
   (pass-if "utf16->string"
     (let* ((utf16  (uint-list->bytevector (map char->integer
                                                (string->list "hello, world"))
-- 
2.35.1


[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #3: 0002-Re-export-utf8-string-instead-of-wrapper-with-extra-.patch --]
[-- Type: text/x-patch, Size: 1951 bytes --]

From febfaed2c5fc681fe014805c901026bdab8ea7cd Mon Sep 17 00:00:00 2001
From: Vijay Marupudi <vijay@vijaymarupudi.com>
Date: Fri, 21 Jan 2022 20:17:42 -0500
Subject: [PATCH 2/2] Re-export utf8->string instead of wrapper with extra
 allocation.

In addition, remove the redundant function that was dead code.

module/scheme/base.scm: Deleted the wrapper and re-exported the default
utf8->string.
---
 module/scheme/base.scm | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/module/scheme/base.scm b/module/scheme/base.scm
index c6a73c092..28adb2e32 100644
--- a/module/scheme/base.scm
+++ b/module/scheme/base.scm
@@ -60,7 +60,6 @@
             vector-append vector-for-each vector-map
             (r7:bytevector-copy . bytevector-copy)
             (r7:bytevector-copy! . bytevector-copy!)
-            (r7:utf8->string . utf8->string)
             square
             (r7:expt . expt)
             boolean=? symbol=?
@@ -109,7 +108,7 @@
    string-copy string-copy! string-fill! string-for-each
    string-length string-ref string-set! string<=? string<?
    string=? string>=? string>? string? substring symbol->string
-   symbol? syntax-error syntax-rules truncate
+   symbol? utf8->string syntax-error syntax-rules truncate
    truncate-quotient truncate-remainder truncate/
    (char-ready? . u8-ready?)
    unless
@@ -494,9 +493,6 @@
   (bytevector-copy! bv start out 0 mlen)
   out)
 
-(define (%subbytevector1 bv start)
-  (%subbytevector bv start (bytevector-length bv)))
-
 (define r7:bytevector-copy!
   (case-lambda*
    ((to at from #:optional
@@ -512,12 +508,6 @@
     ((bv start #:optional (end (bytevector-length bv)))
      (%subbytevector bv start end))))
 
-(define r7:utf8->string
-  (case-lambda*
-    ((bv) (utf8->string bv))
-    ((bv start #:optional (end (bytevector-length bv)))
-     (utf8->string (%subbytevector bv start end)))))
-
 (define (square x) (* x x))
 
 (define (r7:expt x y)
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2022-03-09 14:50 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-01-21  3:23 [PATCH] Enable utf8->string to take a range Vijay Marupudi
2022-01-21 16:53 ` Maxime Devos
2022-01-21 16:54 ` Maxime Devos
2022-01-21 16:55 ` Maxime Devos
2022-01-21 17:04 ` Maxime Devos
2022-01-21 20:20   ` Vijay Marupudi
2022-01-21 22:08     ` Maxime Devos
2022-01-22  1:21       ` Vijay Marupudi
2022-03-09 13:20         ` Maxime Devos
2022-03-09 13:20         ` Maxime Devos
2022-03-09 13:24         ` Maxime Devos
2022-03-09 13:27           ` Maxime Devos
2022-03-09 13:35             ` Maxime Devos
2022-03-09 14:50               ` Vijay Marupudi

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).