From c6be127b4818d43a0244592c18a52de113d3ff08 Mon Sep 17 00:00:00 2001 From: Vijay Marupudi Date: Thu, 20 Jan 2022 22:19:25 -0500 Subject: [PATCH 1/2] Allow utf8->string, utf16->string, utf32->string to take ranges Added the following new functions, that behave like substring, but for bytevector to string conversion. scm_utf8_range_to_string (SCM, SCM, SCM); scm_utf16_range_to_string (SCM, SCM, SCM, SCM); scm_utf32_range_to_string (SCM, SCM, SCM, SCM); * doc/ref/api-data.texi: Updated documentation to reflect new function and range constraints * libguile/bytevectors.c: Added new function. * libguile/bytevectors.h: Added new function declaration. * test-suite/tests/bytevectors.test: Added tests for exceptions and behavior for edge cases --- doc/ref/api-data.texi | 15 +++- libguile/bytevectors.c | 144 +++++++++++++++++++++++------- libguile/bytevectors.h | 3 + test-suite/tests/bytevectors.test | 37 ++++++++ 4 files changed, 164 insertions(+), 35 deletions(-) diff --git a/doc/ref/api-data.texi b/doc/ref/api-data.texi index b6c2c4d61..44b64454f 100644 --- a/doc/ref/api-data.texi +++ b/doc/ref/api-data.texi @@ -7139,16 +7139,25 @@ UTF-32 (aka. UCS-4) encoding of @var{str}. For UTF-16 and UTF-32, it defaults to big endian. @end deffn -@deffn {Scheme Procedure} utf8->string utf -@deffnx {Scheme Procedure} utf16->string utf [endianness] -@deffnx {Scheme Procedure} utf32->string utf [endianness] +@deffn {Scheme Procedure} utf8->string utf [start [end]] +@deffnx {Scheme Procedure} utf16->string utf [endianness [start [end]]] +@deffnx {Scheme Procedure} utf32->string utf [endianness [start [end]]] @deffnx {C Function} scm_utf8_to_string (utf) +@deffnx {C Function} scm_utf8_range_to_string (utf, start, end) @deffnx {C Function} scm_utf16_to_string (utf, endianness) +@deffnx {C Function} scm_utf16_range_to_string (utf, endianness, start, end) @deffnx {C Function} scm_utf32_to_string (utf, endianness) +@deffnx {C Function} scm_utf32_range_to_string (utf, endianness, start, end) + Return a newly allocated string that contains from the UTF-8-, UTF-16-, or UTF-32-decoded contents of bytevector @var{utf}. For UTF-16 and UTF-32, @var{endianness} should be the symbol @code{big} or @code{little}; when omitted, it defaults to big endian. + +@var{start} and @var{end}, when provided, must be exact integers +satisfying: + +0 <= @var{start} <= @var{end} <= @code{(bytevector-length @var{utf})}. @end deffn @node Bytevectors as Arrays diff --git a/libguile/bytevectors.c b/libguile/bytevectors.c index f42fbb427..12d299042 100644 --- a/libguile/bytevectors.c +++ b/libguile/bytevectors.c @@ -2061,25 +2061,46 @@ SCM_DEFINE (scm_string_to_utf32, "string->utf32", /* Produce the body of a function that converts a UTF-encoded bytevector to a string. */ -#define UTF_TO_STRING(_utf_width) \ +#define UTF_TO_STRING(_utf_width, utf, endianness, start, end) \ SCM str = SCM_BOOL_F; \ int err; \ char *c_str = NULL; \ char c_utf_name[MAX_UTF_ENCODING_NAME_LEN]; \ char *c_utf; \ - size_t c_strlen = 0, c_utf_len = 0; \ + size_t c_strlen = 0, c_utf_len, c_start, c_end; \ \ - SCM_VALIDATE_BYTEVECTOR (1, utf); \ - if (scm_is_eq (endianness, SCM_UNDEFINED)) \ - endianness = sym_big; \ + SCM_VALIDATE_BYTEVECTOR (1, (utf)); \ + if (scm_is_eq ((endianness), SCM_UNDEFINED)) \ + (endianness) = sym_big; \ else \ - SCM_VALIDATE_SYMBOL (2, endianness); \ + SCM_VALIDATE_SYMBOL (2, (endianness)); \ \ - c_utf_len = SCM_BYTEVECTOR_LENGTH (utf); \ - c_utf = (char *) SCM_BYTEVECTOR_CONTENTS (utf); \ - utf_encoding_name (c_utf_name, (_utf_width), endianness); \ + c_utf_len = SCM_BYTEVECTOR_LENGTH ((utf)); \ + c_utf = (char *) SCM_BYTEVECTOR_CONTENTS ((utf)); \ + utf_encoding_name (c_utf_name, (_utf_width), (endianness)); \ + \ + if (!scm_is_eq ((start), SCM_UNDEFINED)) \ + { \ + c_start = scm_to_unsigned_integer ((start), 0, c_utf_len); \ + } \ + else \ + { \ + c_start = 0; \ + } \ + \ + if (!scm_is_eq ((end), SCM_UNDEFINED)) \ + { \ + c_end = scm_to_unsigned_integer ((end), 0, c_utf_len); \ + } \ + else \ + { \ + c_end = c_utf_len; \ + } \ + \ + validate_bytevector_range(FUNC_NAME, c_utf_len, c_start, c_end); \ + \ \ - err = mem_iconveh (c_utf, c_utf_len, \ + err = mem_iconveh (c_utf + c_start, c_end - c_start, \ c_utf_name, "UTF-8", \ iconveh_question_mark, NULL, \ &c_str, &c_strlen); \ @@ -2094,46 +2115,105 @@ SCM_DEFINE (scm_string_to_utf32, "string->utf32", return (str); -SCM_DEFINE (scm_utf8_to_string, "utf8->string", - 1, 0, 0, - (SCM utf), - "Return a newly allocate string that contains from the UTF-8-" - "encoded contents of bytevector @var{utf}.") -#define FUNC_NAME s_scm_utf8_to_string +static inline void +validate_bytevector_range(const char* function_name, size_t len, size_t start, size_t end) { + if (SCM_UNLIKELY (start > len)) + { + scm_out_of_range (function_name, scm_from_size_t(start)); + } + if (SCM_UNLIKELY (end > len)) + { + scm_out_of_range (function_name, scm_from_size_t(end)); + } + if (SCM_UNLIKELY(end < start)) + { + scm_out_of_range (function_name, scm_from_size_t(end)); + } +} + + +SCM_DEFINE (scm_utf8_range_to_string, "utf8->string", + 1, 2, 0, + (SCM utf, SCM start, SCM end), + "Return a newly allocate string that contains from the UTF-8-" + "encoded contents of bytevector @var{utf}.") +#define FUNC_NAME s_scm_utf8_range_to_string { SCM str; const char *c_utf; - size_t c_utf_len = 0; + size_t c_start; + size_t c_end; + size_t c_len; SCM_VALIDATE_BYTEVECTOR (1, utf); - - c_utf_len = SCM_BYTEVECTOR_LENGTH (utf); c_utf = (char *) SCM_BYTEVECTOR_CONTENTS (utf); - str = scm_from_utf8_stringn (c_utf, c_utf_len); + c_len = SCM_BYTEVECTOR_LENGTH(utf); + + if (!scm_is_eq (start, SCM_UNDEFINED)) + { + c_start = scm_to_unsigned_integer (start, 0, c_len); + } + else + { + c_start = 0; + } + if (!scm_is_eq (end, SCM_UNDEFINED)) + { + c_end = scm_to_unsigned_integer (end, 0, c_len); + } + else + { + c_end = c_len; + } + + validate_bytevector_range(FUNC_NAME, c_len, c_start, c_end); + str = scm_from_utf8_stringn (c_utf + c_start, c_end - c_start); return (str); } #undef FUNC_NAME -SCM_DEFINE (scm_utf16_to_string, "utf16->string", - 1, 1, 0, - (SCM utf, SCM endianness), - "Return a newly allocate string that contains from the UTF-16-" - "encoded contents of bytevector @var{utf}.") +SCM +scm_utf8_to_string(SCM utf) +#define FUNC_NAME s_scm_utf8_to_string +{ + return scm_utf8_range_to_string(utf, SCM_UNDEFINED, SCM_UNDEFINED); +} +#undef FUNC_NAME + +SCM_DEFINE (scm_utf16_range_to_string, "utf16->string", + 1, 3, 0, + (SCM utf, SCM endianness, SCM start, SCM end), + "Return a newly allocate string that contains from the UTF-8-" + "encoded contents of bytevector @var{utf}.") +#define FUNC_NAME s_scm_utf16_range_to_string +{ + UTF_TO_STRING(16, utf, endianness, start, end); +} +#undef FUNC_NAME + +SCM scm_utf16_to_string (SCM utf, SCM endianness) #define FUNC_NAME s_scm_utf16_to_string { - UTF_TO_STRING (16); + return scm_utf16_range_to_string(utf, endianness, SCM_UNDEFINED, SCM_UNDEFINED); } #undef FUNC_NAME -SCM_DEFINE (scm_utf32_to_string, "utf32->string", - 1, 1, 0, - (SCM utf, SCM endianness), - "Return a newly allocate string that contains from the UTF-32-" - "encoded contents of bytevector @var{utf}.") +SCM_DEFINE (scm_utf32_range_to_string, "utf32->string", + 1, 3, 0, + (SCM utf, SCM endianness, SCM start, SCM end), + "Return a newly allocate string that contains from the UTF-8-" + "encoded contents of bytevector @var{utf}.") +#define FUNC_NAME s_scm_utf32_range_to_string +{ + UTF_TO_STRING(32, utf, endianness, start, end); +} +#undef FUNC_NAME + +SCM scm_utf32_to_string (SCM utf, SCM endianness) #define FUNC_NAME s_scm_utf32_to_string { - UTF_TO_STRING (32); + return scm_utf32_range_to_string(utf, endianness, SCM_UNDEFINED, SCM_UNDEFINED); } #undef FUNC_NAME diff --git a/libguile/bytevectors.h b/libguile/bytevectors.h index 980d6e267..63d8e3119 100644 --- a/libguile/bytevectors.h +++ b/libguile/bytevectors.h @@ -113,8 +113,11 @@ SCM_API SCM scm_string_to_utf8 (SCM); SCM_API SCM scm_string_to_utf16 (SCM, SCM); SCM_API SCM scm_string_to_utf32 (SCM, SCM); SCM_API SCM scm_utf8_to_string (SCM); +SCM_API SCM scm_utf8_range_to_string (SCM, SCM, SCM); SCM_API SCM scm_utf16_to_string (SCM, SCM); +SCM_API SCM scm_utf16_range_to_string (SCM, SCM, SCM, SCM); SCM_API SCM scm_utf32_to_string (SCM, SCM); +SCM_API SCM scm_utf32_range_to_string (SCM, SCM, SCM, SCM); diff --git a/test-suite/tests/bytevectors.test b/test-suite/tests/bytevectors.test index 732aadb3e..f8c6a8df1 100644 --- a/test-suite/tests/bytevectors.test +++ b/test-suite/tests/bytevectors.test @@ -558,6 +558,43 @@ exception:decoding-error (utf8->string #vu8(104 105 239 191 50))) + (pass-if "utf8->string range: start provided" + (let* ((utf8 (string->utf8 "gnu guile")) + (str (utf8->string utf8 4))) + (string=? str "guile"))) + + (pass-if "utf8->string range: start and end provided" + (let* ((utf8 (string->utf8 "gnu guile")) + (str (utf8->string utf8 4 7))) + (string=? str "gui"))) + + (pass-if "utf8->string range: start = end = 0" + (let* ((utf8 (string->utf8 "gnu guile")) + (str (utf8->string utf8 0 0))) + (string=? str ""))) + + (pass-if-exception "utf8->string range: start > len" + exception:out-of-range + (let* ((utf8 (string->utf8 "four"))) + ;; 4 as start is expected to return an empty string, in congruence + ;; with `substring'. + (utf8->string utf8 5))) + + (pass-if-exception "utf8->string range: end < start" + exception:out-of-range + (let* ((utf8 (string->utf8 "gnu guile"))) + (utf8->string utf8 1 0))) + + (pass-if "utf8->string range: multibyte characters" + (string=? (utf8->string #vu8(195 169 67) 0 2) "é")) + + (pass-if-exception "utf8->string range: decoding error for invalid range" + exception:decoding-error + (utf8->string #vu8(195 169) 0 1)) + + (pass-if "utf8->string range: null byte non-termination" + (string=? (utf8->string #vu8(0 32 196) 0 2) "\x00 ")) + (pass-if "utf16->string" (let* ((utf16 (uint-list->bytevector (map char->integer (string->list "hello, world")) -- 2.34.1