From 33cd595b883ab5e27ab410648bac89fab0459078 Mon Sep 17 00:00:00 2001 From: Mark H Weaver Date: Mon, 5 Mar 2012 10:35:06 -0500 Subject: [PATCH 3/3] Efficient gensym hack: generate gensym names lazily * libguile/strings.c (scm_i_symbol_to_string_no_lazy_gensym_check, scm_i_stringbuf_from_string, scm_i_string_from_stringbuf): New internal functions needed by symbols.c. (symbol_stringbuf): New internal static function to replace most uses of SYMBOL_STRINGBUF. Handles forcing lazy gensyms. (scm_i_symbol_length, scm_c_symbol_length, scm_i_is_narrow_symbol, scm_i_symbol_chars, scm_i_symbol_wide_chars, scm_i_symbol_substring, scm_sys_symbol_dump): Use symbol_stringbuf instead of SYMBOL_STRINGBUF. * libguile/strings.h (scm_i_symbol_to_string_no_lazy_gensym_check, scm_i_stringbuf_from_string, scm_i_string_from_stringbuf): Add prototypes. * libguile/symbols.c (scm_i_symbol_hash): New internal function to replace macro of the same name. Handles forcing lazy gensyms. (scm_gensym): Don't construct the name or even increment the gensym_counter here. Just return a new symbol with the SCM_I_F_SYMBOL_LAZY_GENSYM flag set, with hash value 0, and with a stringbuf containing only the prefix. (scm_i_force_lazy_gensym): New internal procedure used when a lazy gensym is queried for its name or hash value. (symbol_lookup_hash_fn, symbol_lookup_assoc_fn): Avoid lazy gensym checks. * libguile/symbols.h (scm_i_symbol_hash): Remove macro, and replace it with a prototype for the new internal function of the same name. (scm_i_force_lazy_gensym): Add prototype. (scm_i_symbol_is_lazy_gensym): New macro. (SCM_I_F_SYMBOL_LAZY_GENSYM): New flag. * doc/ref/api-data.texi (Symbol Primitives): Update documentation. * test-suite/tests/symbols.test (gensym): Add tests. --- doc/ref/api-data.texi | 4 +- libguile/strings.c | 65 +++++++++++++++++++++++++--- libguile/strings.h | 3 + libguile/symbols.c | 96 ++++++++++++++++++++++++++++++---------- libguile/symbols.h | 6 ++- test-suite/tests/symbols.test | 36 +++++++++++++++- 6 files changed, 175 insertions(+), 35 deletions(-) diff --git a/doc/ref/api-data.texi b/doc/ref/api-data.texi index 39c9790..a1203f0 100644 --- a/doc/ref/api-data.texi +++ b/doc/ref/api-data.texi @@ -5293,8 +5293,8 @@ code. The @code{gensym} primitive meets this need: @deffnx {C Function} scm_gensym (prefix) Create a new symbol with a name constructed from a prefix and a counter value. The string @var{prefix} can be specified as an optional -argument. Default prefix is @samp{@w{ g}}. The counter is increased by 1 -at each call. There is no provision for resetting the counter. +argument. Default prefix is @samp{@w{ g}}. The name is constructed +lazily, when the name or hash of the symbol is first requested. @end deffn The symbols generated by @code{gensym} are @emph{likely} to be unique, diff --git a/libguile/strings.c b/libguile/strings.c index 35757f0..cc49c7f 100644 --- a/libguile/strings.c +++ b/libguile/strings.c @@ -334,6 +334,39 @@ get_str_buf_start (SCM *str, SCM *buf, size_t *start) *buf = STRING_STRINGBUF (*str); } +/* This is needed by the lazy gensym code in symbols.c. + It produces a shared stringbuf (so it will not be mutated) + containing exactly the characters in 'str'. If possible, + it uses 'str's stringbuf. However, if 'str' refers to only + part of its stringbuf, the stringbuf must be copied. */ +SCM +scm_i_stringbuf_from_string (SCM str) +{ + SCM inner_str, buf; + size_t len, start; + + len = STRING_LENGTH (str); + inner_str = str; + get_str_buf_start (&inner_str, &buf, &start); + if (STRINGBUF_LENGTH (buf) == len) + set_stringbuf_shared (buf); + else + { + SCM new_str = scm_i_substring_copy (str, 0, len); + buf = STRING_STRINGBUF (new_str); + } + return buf; +} + +/* This is needed by the lazy gensym code in symbols.c. */ +SCM +scm_i_string_from_stringbuf (SCM buf) +{ + size_t len = STRINGBUF_LENGTH (buf); + return scm_double_cell (STRING_TAG, SCM_UNPACK (buf), + (scm_t_bits) 0, (scm_t_bits) len); +} + SCM scm_i_substring (SCM str, size_t start, size_t end) { @@ -734,6 +767,14 @@ scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr) #define SYMBOL_STRINGBUF SCM_CELL_OBJECT_1 +static SCM +symbol_stringbuf (SCM symbol) +{ + if (SCM_UNLIKELY (scm_i_symbol_is_lazy_gensym (symbol))) + scm_i_force_lazy_gensym (symbol); + return SYMBOL_STRINGBUF (symbol); +} + SCM scm_i_make_symbol (SCM name, scm_t_bits flags, unsigned long hash, SCM props) @@ -793,7 +834,7 @@ scm_i_c_make_symbol (const char *name, size_t len, size_t scm_i_symbol_length (SCM sym) { - return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym)); + return STRINGBUF_LENGTH (symbol_stringbuf (sym)); } size_t @@ -802,7 +843,7 @@ scm_c_symbol_length (SCM sym) { SCM_VALIDATE_SYMBOL (1, sym); - return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym)); + return STRINGBUF_LENGTH (symbol_stringbuf (sym)); } #undef FUNC_NAME @@ -813,7 +854,7 @@ scm_i_is_narrow_symbol (SCM sym) { SCM buf; - buf = SYMBOL_STRINGBUF (sym); + buf = symbol_stringbuf (sym); return !STRINGBUF_WIDE (buf); } @@ -824,7 +865,7 @@ scm_i_symbol_chars (SCM sym) { SCM buf; - buf = SYMBOL_STRINGBUF (sym); + buf = symbol_stringbuf (sym); if (!STRINGBUF_WIDE (buf)) return (const char *) STRINGBUF_CHARS (buf); else @@ -839,7 +880,7 @@ scm_i_symbol_wide_chars (SCM sym) { SCM buf; - buf = SYMBOL_STRINGBUF (sym); + buf = symbol_stringbuf (sym); if (STRINGBUF_WIDE (buf)) return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf); else @@ -850,12 +891,22 @@ scm_i_symbol_wide_chars (SCM sym) SCM scm_i_symbol_substring (SCM sym, size_t start, size_t end) { - SCM buf = SYMBOL_STRINGBUF (sym); + SCM buf = symbol_stringbuf (sym); set_stringbuf_shared (buf); return scm_double_cell (RO_STRING_TAG, SCM_UNPACK (buf), (scm_t_bits)start, (scm_t_bits) end - start); } +SCM +scm_i_symbol_to_string_no_lazy_gensym_check (SCM sym) +{ + SCM buf = SYMBOL_STRINGBUF (sym); + size_t len = STRINGBUF_LENGTH (buf); + set_stringbuf_shared (buf); + return scm_double_cell (RO_STRING_TAG, SCM_UNPACK (buf), + (scm_t_bits) 0, (scm_t_bits) len); +} + /* Returns the Xth character of symbol SYM as a UCS-4 codepoint. */ scm_t_wchar scm_i_symbol_ref (SCM sym, size_t x) @@ -1000,7 +1051,7 @@ SCM_DEFINE (scm_sys_symbol_dump, "%symbol-dump", 1, 0, 0, (SCM sym), scm_from_ulong (scm_i_symbol_hash (sym))); e3 = scm_cons (scm_from_latin1_symbol ("interned"), scm_symbol_interned_p (sym)); - buf = SYMBOL_STRINGBUF (sym); + buf = symbol_stringbuf (sym); /* Stringbuf info */ if (!STRINGBUF_WIDE (buf)) diff --git a/libguile/strings.h b/libguile/strings.h index 9735913..5c51980 100644 --- a/libguile/strings.h +++ b/libguile/strings.h @@ -200,6 +200,8 @@ SCM_INTERNAL scm_t_wchar scm_i_string_ref (SCM str, size_t x); SCM_INTERNAL int scm_i_string_contains_char (SCM str, char c); SCM_INTERNAL int scm_i_string_strcmp (SCM sstr, size_t start_x, const char *cstr); SCM_INTERNAL void scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr); +SCM_INTERNAL SCM scm_i_stringbuf_from_string (SCM str); +SCM_INTERNAL SCM scm_i_string_from_stringbuf (SCM buf); /* internal functions related to symbols. */ @@ -213,6 +215,7 @@ SCM_INTERNAL const scm_t_wchar *scm_i_symbol_wide_chars (SCM sym); SCM_INTERNAL size_t scm_i_symbol_length (SCM sym); SCM_INTERNAL int scm_i_is_narrow_symbol (SCM str); SCM_INTERNAL SCM scm_i_symbol_substring (SCM sym, size_t start, size_t end); +SCM_INTERNAL SCM scm_i_symbol_to_string_no_lazy_gensym_check (SCM sym); SCM_INTERNAL scm_t_wchar scm_i_symbol_ref (SCM sym, size_t x); SCM_INTERNAL void scm_encoding_error (const char *subr, int err, const char *message, SCM port, SCM chr); diff --git a/libguile/symbols.c b/libguile/symbols.c index 08512a6..31aa3a2 100644 --- a/libguile/symbols.c +++ b/libguile/symbols.c @@ -70,6 +70,16 @@ SCM_DEFINE (scm_sys_symbols, "%symbols", 0, 0, 0, /* {Symbols} */ +#define SYMBOL_HASH(x) ((unsigned long) SCM_CELL_WORD_2 (x)) + +unsigned long +scm_i_symbol_hash (SCM symbol) +{ + if (SCM_UNLIKELY (scm_i_symbol_is_lazy_gensym (symbol))) + scm_i_force_lazy_gensym (symbol); + return SYMBOL_HASH (symbol); +} + unsigned long scm_i_hash_symbol (SCM obj, unsigned long n, void *closure) { @@ -165,7 +175,10 @@ lookup_interned_latin1_symbol (const char *str, size_t len, static unsigned long symbol_lookup_hash_fn (SCM obj, unsigned long max, void *closure) { - return scm_i_symbol_hash (obj) % max; + /* We must avoid forcing lazy gensyms here, because + scm_i_force_lazy_gensym needs to intern its symbol before clearing + the lazy gensym flag. */ + return SYMBOL_HASH (obj) % max; } static SCM @@ -175,9 +188,13 @@ symbol_lookup_assoc_fn (SCM obj, SCM alist, void *closure) { SCM sym = SCM_CAAR (alist); - if (scm_i_symbol_hash (sym) == scm_i_symbol_hash (obj) - && scm_is_true (scm_string_equal_p (scm_symbol_to_string (sym), - scm_symbol_to_string (obj)))) + /* We must avoid forcing lazy gensyms here, because + scm_i_force_lazy_gensym needs to intern its symbol before + clearing the lazy gensym flag. */ + if (SYMBOL_HASH (sym) == SYMBOL_HASH (obj) + && scm_is_true (scm_string_equal_p + (scm_i_symbol_to_string_no_lazy_gensym_check (sym), + scm_i_symbol_to_string_no_lazy_gensym_check (obj)))) return SCM_CAR (alist); } @@ -340,38 +357,69 @@ SCM_DEFINE (scm_string_ci_to_symbol, "string-ci->symbol", 1, 0, 0, /* The default prefix for `gensym'd symbols. */ static SCM default_gensym_prefix; -#define MAX_PREFIX_LENGTH 30 - SCM_DEFINE (scm_gensym, "gensym", 0, 1, 0, (SCM prefix), "Create a new symbol with a name constructed from a prefix and\n" - "a counter value. The string @var{prefix} can be specified as\n" - "an optional argument. Default prefix is @code{ g}. The counter\n" - "is increased by 1 at each call. There is no provision for\n" - "resetting the counter.") + "a counter value. The string @var{prefix} can be specified as\n" + "an optional argument. Default prefix is @code{ g}. The name\n" + "is constructed lazily, when the name or hash of the symbol is\n" + "first requested.") #define FUNC_NAME s_scm_gensym { - static int gensym_counter = 0; - - SCM suffix, name; - int n, n_digits; - char buf[SCM_INTBUFLEN]; + SCM prefix_stringbuf; if (SCM_UNBNDP (prefix)) prefix = default_gensym_prefix; + else + SCM_VALIDATE_STRING (1, prefix); - /* mutex in case another thread looks and incs at the exact same moment */ - scm_i_scm_pthread_mutex_lock (&scm_i_misc_mutex); - n = gensym_counter++; - scm_i_pthread_mutex_unlock (&scm_i_misc_mutex); - - n_digits = scm_iint2str (n, 10, buf); - suffix = scm_from_latin1_stringn (buf, n_digits); - name = scm_string_append (scm_list_2 (prefix, suffix)); - return scm_string_to_symbol (name); + prefix_stringbuf = scm_i_stringbuf_from_string (prefix); + return scm_double_cell (scm_tc7_symbol | SCM_I_F_SYMBOL_LAZY_GENSYM, + SCM_UNPACK (prefix_stringbuf), (scm_t_bits) 0, + SCM_UNPACK (scm_cons (SCM_BOOL_F, SCM_EOL))); } #undef FUNC_NAME +void +scm_i_force_lazy_gensym (SCM sym) +{ + static int gensym_counter = 0; + + SCM prefix, suffix, name, handle; + int n, n_digits; + char buf[SCM_INTBUFLEN]; + + /* mutex in case another thread forces a gensym (possibly this one) */ + scm_i_pthread_mutex_lock (&symbols_lock); + if (SCM_LIKELY (scm_i_symbol_is_lazy_gensym (sym))) + { + prefix = scm_i_string_from_stringbuf (SCM_CELL_OBJECT_1 (sym)); + do + { + n = gensym_counter++; + + n_digits = scm_iint2str (n, 10, buf); + suffix = scm_from_latin1_stringn (buf, n_digits); + name = scm_string_append (scm_list_2 (prefix, suffix)); + + SCM_SET_CELL_OBJECT_1 (sym, scm_i_stringbuf_from_string (name)); + SCM_SET_CELL_WORD_2 (sym, scm_i_string_hash (name)); + handle = scm_hash_fn_create_handle_x (symbols, sym, SCM_UNDEFINED, + symbol_lookup_hash_fn, + symbol_lookup_assoc_fn, + NULL); + } while (SCM_UNLIKELY (!scm_is_eq (sym, SCM_CAR (handle)))); + + /* We must not clear the lazy gensym flag until we've found a name + that has not been previously interned, and all other cell words + contain their final values. The lock does not save us here, + because symbols can be accessed without locking. */ + SCM_SET_CELL_WORD_0 (sym, (SCM_CELL_WORD_0 (sym) + & ~SCM_I_F_SYMBOL_LAZY_GENSYM)); + } + scm_i_pthread_mutex_unlock (&symbols_lock); +} + SCM_DEFINE (scm_symbol_hash, "symbol-hash", 1, 0, 0, (SCM symbol), "Return a hash value for @var{symbol}.") diff --git a/libguile/symbols.h b/libguile/symbols.h index 6106f9e..b8fe997 100644 --- a/libguile/symbols.h +++ b/libguile/symbols.h @@ -28,11 +28,13 @@ #define scm_is_symbol(x) (!SCM_IMP (x) \ && (SCM_TYP7 (x) == scm_tc7_symbol)) -#define scm_i_symbol_hash(x) ((unsigned long) SCM_CELL_WORD_2 (x)) #define scm_i_symbol_is_interned(x) \ (!(SCM_CELL_WORD_0 (x) & SCM_I_F_SYMBOL_UNINTERNED)) +#define scm_i_symbol_is_lazy_gensym(x) \ + (SCM_CELL_WORD_0 (x) & SCM_I_F_SYMBOL_LAZY_GENSYM) #define SCM_I_F_SYMBOL_UNINTERNED 0x100 +#define SCM_I_F_SYMBOL_LAZY_GENSYM 0x200 @@ -90,8 +92,10 @@ SCM_API SCM scm_take_utf8_symboln (char *sym, size_t len); /* internal functions. */ +SCM_INTERNAL unsigned long scm_i_symbol_hash (SCM symbol); SCM_INTERNAL unsigned long scm_i_hash_symbol (SCM obj, unsigned long n, void *closure); +SCM_INTERNAL void scm_i_force_lazy_gensym (SCM sym); SCM_INTERNAL void scm_symbols_prehistory (void); SCM_INTERNAL void scm_init_symbols (void); diff --git a/test-suite/tests/symbols.test b/test-suite/tests/symbols.test index 6fbc6be..0dbb121 100644 --- a/test-suite/tests/symbols.test +++ b/test-suite/tests/symbols.test @@ -149,7 +149,41 @@ (symbol? (gensym (make-string 4000 #\!)))) (pass-if "accepts embedded NULs" - (> (string-length (symbol->string (gensym "foo\0bar\0braz\0foo\0bar\0braz\0foo\0bar\0braz\0foo\0bar\0braz\0foo\0bar\0braz\0foo\0bar\0braz\0"))) 6))) + (> (string-length (symbol->string (gensym "foo\0bar\0braz\0foo\0bar\0braz\0foo\0bar\0braz\0foo\0bar\0braz\0foo\0bar\0braz\0foo\0bar\0braz\0"))) 6)) + + (pass-if "accepts substring prefixes" + (let* ((prefix (substring "foobar" 1 4)) + (symbol (gensym prefix)) + (name (symbol->string symbol))) + (string= "oob" (substring name 0 3)))) + + (pass-if "accepts shared substring prefixes" + (let* ((prefix (substring/shared (string-copy "foobar") + 1 4)) + (symbol (gensym prefix)) + (name (symbol->string symbol))) + (string= "oob" (substring name 0 3)))) + + (pass-if "counter incremented lazily" + (let* ((s1 (gensym "")) + (s2 (gensym "")) + (s3 (gensym "")) + (s4 (gensym "")) + (s4-counter (string->number (symbol->string s4))) + (s1-counter (string->number (symbol->string s1)))) + (= s1-counter (1+ s4-counter)))) + + (pass-if "unaffected by mutation of prefix" + (let* ((prefix (string-copy "foo")) + (symbol (gensym prefix))) + (string-set! prefix 0 #\g) + (string= "foo" (substring (symbol->string symbol) 0 3)))) + + (pass-if "avoids existing interned symbols" + (let* ((n (1+ (string->number (symbol->string (gensym ""))))) + (colliding-symbol (string->symbol (number->string n))) + (symbol (gensym ""))) + (< n (string->number (symbol->string symbol)))))) (with-test-prefix "extended read syntax" (pass-if (equal? "#{}#" (object->string (string->symbol "")))) -- 1.7.5.4