unofficial mirror of bug-guile@gnu.org 
 help / color / mirror / Atom feed
* bug#56413: [PATCH 1/1] scm_i_utf8_string_hash: compute u8 chars not bytes
@ 2022-07-06  1:23 Rob Browning
  2022-07-06  3:04 ` Rob Browning
  2022-11-05 22:18 ` Ludovic Courtès
  0 siblings, 2 replies; 14+ messages in thread
From: Rob Browning @ 2022-07-06  1:23 UTC (permalink / raw)
  To: 56413

Noticed while investigating a migration to utf-8 strings.  After making
changes that routed non-ascii symbol hashing through this function,
encoding-iso88597.test began intermittently failing because it would
traverse trailing garbage when u8_strnlen reported 8 chars instead of 4.

Change the scm_i_str2symbol internal hash type to unsigned long to
explicitly match the hashing result type.
---

 Proposed for at least main.

 libguile/hash.c                      |  2 +-
 libguile/symbols.c                   |  2 +-
 test-suite/standalone/Makefile.am    |  7 ++++
 test-suite/standalone/test-hashing.c | 61 ++++++++++++++++++++++++++++
 4 files changed, 70 insertions(+), 2 deletions(-)
 create mode 100644 test-suite/standalone/test-hashing.c

diff --git a/libguile/hash.c b/libguile/hash.c
index 93431102f..0740b2645 100644
--- a/libguile/hash.c
+++ b/libguile/hash.c
@@ -188,7 +188,7 @@ scm_i_utf8_string_hash (const char *str, size_t len)
     /* Invalid UTF-8; punt.  */
     return scm_i_string_hash (scm_from_utf8_stringn (str, len));
 
-  length = u8_strnlen (ustr, len);
+  length = u8_mbsnlen (ustr, len);
 
   /* Set up the internal state.  */
   a = b = c = 0xdeadbeef + ((uint32_t)(length<<2)) + 47;
diff --git a/libguile/symbols.c b/libguile/symbols.c
index ad5f22f57..cd9cda3de 100644
--- a/libguile/symbols.c
+++ b/libguile/symbols.c
@@ -239,7 +239,7 @@ static SCM
 scm_i_str2symbol (SCM str)
 {
   SCM symbol;
-  size_t raw_hash = scm_i_string_hash (str);
+  unsigned long raw_hash = scm_i_string_hash (str);
 
   symbol = lookup_interned_symbol (str, raw_hash);
   if (scm_is_true (symbol))
diff --git a/test-suite/standalone/Makefile.am b/test-suite/standalone/Makefile.am
index e87100c96..ca1b3131b 100644
--- a/test-suite/standalone/Makefile.am
+++ b/test-suite/standalone/Makefile.am
@@ -167,6 +167,13 @@ test_conversion_LDADD = $(LIBGUILE_LDADD) $(top_builddir)/lib/libgnu.la
 check_PROGRAMS += test-conversion
 TESTS += test-conversion
 
+# test-hashing
+test_hashing_SOURCES = test-hashing.c
+test_hashing_CFLAGS = ${test_cflags}
+test_hashing_LDADD = $(LIBGUILE_LDADD) $(top_builddir)/lib/libgnu.la
+check_PROGRAMS += test-hashing
+TESTS += test-hashing
+
 # test-loose-ends
 test_loose_ends_SOURCES = test-loose-ends.c
 test_loose_ends_CFLAGS = ${test_cflags}
diff --git a/test-suite/standalone/test-hashing.c b/test-suite/standalone/test-hashing.c
new file mode 100644
index 000000000..476181fe2
--- /dev/null
+++ b/test-suite/standalone/test-hashing.c
@@ -0,0 +1,61 @@
+/* Copyright 2022
+     Free Software Foundation, Inc.
+
+   This file is part of Guile.
+
+   Guile is free software: you can redistribute it and/or modify it
+   under the terms of the GNU Lesser General Public License as published
+   by the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   Guile is distributed in the hope that it will be useful, but WITHOUT
+   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+   License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with Guile.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#if HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+#include <libguile.h>
+
+#include <stdio.h>
+
+static void
+test_hashing ()
+{
+  // Make sure a utf-8 symbol has the expected hash.  In addition to
+  // catching algorithmic regressions, this would have caught a
+  // long-standing buffer overflow.
+
+  // περί
+  char about_u8[] = {0xce, 0xa0, 0xce, 0xb5, 0xcf, 0x81, 0xce, 0xaf, 0};
+  SCM sym = scm_from_utf8_symbol (about_u8);
+
+  const unsigned long expect = 4029223418961680680;
+  const unsigned long actual = scm_to_ulong (scm_symbol_hash (sym));
+
+  if (actual != expect)
+    {
+      fprintf (stderr, "fail: unexpected utf-8 symbol hash (%lu != %lu)\n",
+               actual, expect);
+      exit (EXIT_FAILURE);
+    }
+}
+
+static void
+tests (void *data, int argc, char **argv)
+{
+  test_hashing ();
+}
+
+int
+main (int argc, char *argv[])
+{
+  scm_boot_guile (argc, argv, tests, NULL);
+  return 0;
+}
-- 
2.30.2






^ permalink raw reply related	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2023-03-13 11:29 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-07-06  1:23 bug#56413: [PATCH 1/1] scm_i_utf8_string_hash: compute u8 chars not bytes Rob Browning
2022-07-06  3:04 ` Rob Browning
2022-11-05 22:18 ` Ludovic Courtès
2022-11-06 16:44   ` Rob Browning
2022-11-06 17:45     ` Rob Browning
2022-11-07 13:06     ` Ludovic Courtès
2022-11-06 19:46   ` Rob Browning
2022-11-07 13:07     ` Ludovic Courtès
2022-11-08  5:05     ` Rob Browning
2022-11-08 10:09       ` Ludovic Courtès
2023-03-05 22:21         ` bug#56413: [PATCH v2 " Rob Browning
2023-03-06 16:39           ` Ludovic Courtès
2023-03-12 19:30             ` bug#56413: [PATCH v3 " Rob Browning
2023-03-13 11:29               ` Ludovic Courtès

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).