From 7eff2e5ee0230b11a1ad38b4fd1cf4a470a9b3bc Mon Sep 17 00:00:00 2001 From: Mark H Weaver Date: Mon, 5 Mar 2012 10:35:06 -0500 Subject: [PATCH 3/3] Efficient gensym hack: generate gensym names lazily * libguile/strings.c (scm_i_stringbuf_from_string, scm_i_string_from_stringbuf): New internal functions. (symbol_stringbuf): New internal static function to replace most uses of SYMBOL_STRINGBUF. Handles forcing lazy gensyms. (scm_i_symbol_length, scm_c_symbol_length, scm_i_is_narrow_symbol, scm_i_symbol_chars, scm_i_symbol_wide_chars, scm_i_symbol_substring, scm_sys_symbol_dump): Use symbol_stringbuf instead of SYMBOL_STRINGBUF. * libguile/strings.h (scm_i_stringbuf_from_string, scm_i_string_from_stringbuf): Add prototypes. * libguile/symbols.c (SYMBOL_STRINGBUF): New internal macro. (scm_i_symbol_hash): New internal function to replace the macro of the same name. Handles forcing lazy gensyms. (scm_gensym): Don't construct the name or even increment the gensym_counter here. Just return a special symbol with the SCM_I_F_SYMBOL_LAZY_GENSYM flag set, with hash value 0, and with a stringbuf containing only the prefix. (scm_i_force_lazy_gensym): New internal procedure used when a lazy gensym is queried for its name or hash value. (symbol_lookup_hash_fn, symbol_lookup_assoc_fn): Avoid lazy gensym checks. * libguile/symbols.h (scm_i_symbol_hash): Remove macro, and replace it with a prototype for the new internal function of the same name. (scm_i_force_lazy_gensym): Add prototype. (scm_i_symbol_is_lazy_gensym): New macro. (SCM_I_F_SYMBOL_LAZY_GENSYM): New flag. * doc/ref/api-data.texi (Symbol Primitives): Update documentation. * test-suite/tests/symbols.test (gensym): Add tests. --- doc/ref/api-data.texi | 4 +- libguile/strings.c | 58 +++++++++++++++-- libguile/strings.h | 2 + libguile/symbols.c | 140 ++++++++++++++++++++++++++++++++++------- libguile/symbols.h | 6 ++- test-suite/tests/symbols.test | 36 ++++++++++- 6 files changed, 211 insertions(+), 35 deletions(-) diff --git a/doc/ref/api-data.texi b/doc/ref/api-data.texi index 39c9790..a1203f0 100644 --- a/doc/ref/api-data.texi +++ b/doc/ref/api-data.texi @@ -5293,8 +5293,8 @@ code. The @code{gensym} primitive meets this need: @deffnx {C Function} scm_gensym (prefix) Create a new symbol with a name constructed from a prefix and a counter value. The string @var{prefix} can be specified as an optional -argument. Default prefix is @samp{@w{ g}}. The counter is increased by 1 -at each call. There is no provision for resetting the counter. +argument. Default prefix is @samp{@w{ g}}. The name is constructed +lazily, when the name or hash of the symbol is first requested. @end deffn The symbols generated by @code{gensym} are @emph{likely} to be unique, diff --git a/libguile/strings.c b/libguile/strings.c index 35757f0..b4f42d4 100644 --- a/libguile/strings.c +++ b/libguile/strings.c @@ -334,6 +334,41 @@ get_str_buf_start (SCM *str, SCM *buf, size_t *start) *buf = STRING_STRINGBUF (*str); } +/* scm_i_stringbuf_from_string returns a stringbuf containing exactly + the characters in 'str'. If possible, it returns 'str's stringbuf + (marking it shared). However, if 'str' refers to only part of its + stringbuf, the relevant portion is copied into a fresh stringbuf. + + This is needed by the lazy gensym code in symbols.c. */ +SCM +scm_i_stringbuf_from_string (SCM str) +{ + SCM inner_str, buf; + size_t len, start; + + len = STRING_LENGTH (str); + inner_str = str; + get_str_buf_start (&inner_str, &buf, &start); + if (STRINGBUF_LENGTH (buf) == len) + set_stringbuf_shared (buf); + else + { + SCM new_str = scm_i_substring_copy (str, 0, len); + buf = STRING_STRINGBUF (new_str); + } + return buf; +} + +/* Needed by the lazy gensym code in symbols.c. */ +SCM +scm_i_string_from_stringbuf (SCM buf) +{ + size_t len = STRINGBUF_LENGTH (buf); + set_stringbuf_shared (buf); + return scm_double_cell (RO_STRING_TAG, SCM_UNPACK (buf), + (scm_t_bits) 0, (scm_t_bits) len); +} + SCM scm_i_substring (SCM str, size_t start, size_t end) { @@ -732,8 +767,17 @@ scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr) internals of strings and string-like objects confined to this file. */ +/* Must be kept in sync with the matching definition in symbols.c */ #define SYMBOL_STRINGBUF SCM_CELL_OBJECT_1 +static SCM +symbol_stringbuf (SCM symbol) +{ + if (SCM_UNLIKELY (scm_i_symbol_is_lazy_gensym (symbol))) + scm_i_force_lazy_gensym (symbol); + return SYMBOL_STRINGBUF (symbol); +} + SCM scm_i_make_symbol (SCM name, scm_t_bits flags, unsigned long hash, SCM props) @@ -793,7 +837,7 @@ scm_i_c_make_symbol (const char *name, size_t len, size_t scm_i_symbol_length (SCM sym) { - return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym)); + return STRINGBUF_LENGTH (symbol_stringbuf (sym)); } size_t @@ -802,7 +846,7 @@ scm_c_symbol_length (SCM sym) { SCM_VALIDATE_SYMBOL (1, sym); - return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym)); + return STRINGBUF_LENGTH (symbol_stringbuf (sym)); } #undef FUNC_NAME @@ -813,7 +857,7 @@ scm_i_is_narrow_symbol (SCM sym) { SCM buf; - buf = SYMBOL_STRINGBUF (sym); + buf = symbol_stringbuf (sym); return !STRINGBUF_WIDE (buf); } @@ -824,7 +868,7 @@ scm_i_symbol_chars (SCM sym) { SCM buf; - buf = SYMBOL_STRINGBUF (sym); + buf = symbol_stringbuf (sym); if (!STRINGBUF_WIDE (buf)) return (const char *) STRINGBUF_CHARS (buf); else @@ -839,7 +883,7 @@ scm_i_symbol_wide_chars (SCM sym) { SCM buf; - buf = SYMBOL_STRINGBUF (sym); + buf = symbol_stringbuf (sym); if (STRINGBUF_WIDE (buf)) return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf); else @@ -850,7 +894,7 @@ scm_i_symbol_wide_chars (SCM sym) SCM scm_i_symbol_substring (SCM sym, size_t start, size_t end) { - SCM buf = SYMBOL_STRINGBUF (sym); + SCM buf = symbol_stringbuf (sym); set_stringbuf_shared (buf); return scm_double_cell (RO_STRING_TAG, SCM_UNPACK (buf), (scm_t_bits)start, (scm_t_bits) end - start); @@ -1000,7 +1044,7 @@ SCM_DEFINE (scm_sys_symbol_dump, "%symbol-dump", 1, 0, 0, (SCM sym), scm_from_ulong (scm_i_symbol_hash (sym))); e3 = scm_cons (scm_from_latin1_symbol ("interned"), scm_symbol_interned_p (sym)); - buf = SYMBOL_STRINGBUF (sym); + buf = symbol_stringbuf (sym); /* Stringbuf info */ if (!STRINGBUF_WIDE (buf)) diff --git a/libguile/strings.h b/libguile/strings.h index 9735913..afb5a53 100644 --- a/libguile/strings.h +++ b/libguile/strings.h @@ -200,6 +200,8 @@ SCM_INTERNAL scm_t_wchar scm_i_string_ref (SCM str, size_t x); SCM_INTERNAL int scm_i_string_contains_char (SCM str, char c); SCM_INTERNAL int scm_i_string_strcmp (SCM sstr, size_t start_x, const char *cstr); SCM_INTERNAL void scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr); +SCM_INTERNAL SCM scm_i_stringbuf_from_string (SCM str); +SCM_INTERNAL SCM scm_i_string_from_stringbuf (SCM buf); /* internal functions related to symbols. */ diff --git a/libguile/symbols.c b/libguile/symbols.c index 08512a6..07556fa 100644 --- a/libguile/symbols.c +++ b/libguile/symbols.c @@ -70,6 +70,19 @@ SCM_DEFINE (scm_sys_symbols, "%symbols", 0, 0, 0, /* {Symbols} */ +/* Must be kept in sync with the matching definition in strings.c */ +#define SYMBOL_STRINGBUF SCM_CELL_OBJECT_1 + +#define SYMBOL_HASH(x) ((unsigned long) SCM_CELL_WORD_2 (x)) + +unsigned long +scm_i_symbol_hash (SCM symbol) +{ + if (SCM_UNLIKELY (scm_i_symbol_is_lazy_gensym (symbol))) + scm_i_force_lazy_gensym (symbol); + return SYMBOL_HASH (symbol); +} + unsigned long scm_i_hash_symbol (SCM obj, unsigned long n, void *closure) { @@ -165,7 +178,16 @@ lookup_interned_latin1_symbol (const char *str, size_t len, static unsigned long symbol_lookup_hash_fn (SCM obj, unsigned long max, void *closure) { - return scm_i_symbol_hash (obj) % max; + /* We must avoid forcing lazy gensyms here, because + scm_i_force_lazy_gensym needs to intern its symbol before clearing + the lazy gensym flag. */ + return SYMBOL_HASH (obj) % max; +} + +static SCM +symbol_to_string_no_lazy_gensym_check (SCM sym) +{ + return scm_i_string_from_stringbuf (SYMBOL_STRINGBUF (sym)); } static SCM @@ -175,9 +197,13 @@ symbol_lookup_assoc_fn (SCM obj, SCM alist, void *closure) { SCM sym = SCM_CAAR (alist); - if (scm_i_symbol_hash (sym) == scm_i_symbol_hash (obj) - && scm_is_true (scm_string_equal_p (scm_symbol_to_string (sym), - scm_symbol_to_string (obj)))) + /* We must avoid forcing lazy gensyms here, because + scm_i_force_lazy_gensym needs to intern its symbol before + clearing the lazy gensym flag. */ + if (SYMBOL_HASH (sym) == SYMBOL_HASH (obj) + && scm_is_true (scm_string_equal_p + (symbol_to_string_no_lazy_gensym_check (sym), + symbol_to_string_no_lazy_gensym_check (obj)))) return SCM_CAR (alist); } @@ -340,38 +366,104 @@ SCM_DEFINE (scm_string_ci_to_symbol, "string-ci->symbol", 1, 0, 0, /* The default prefix for `gensym'd symbols. */ static SCM default_gensym_prefix; -#define MAX_PREFIX_LENGTH 30 - SCM_DEFINE (scm_gensym, "gensym", 0, 1, 0, (SCM prefix), "Create a new symbol with a name constructed from a prefix and\n" - "a counter value. The string @var{prefix} can be specified as\n" - "an optional argument. Default prefix is @code{ g}. The counter\n" - "is increased by 1 at each call. There is no provision for\n" - "resetting the counter.") + "a counter value. The string @var{prefix} can be specified as\n" + "an optional argument. Default prefix is @code{ g}. The name\n" + "is constructed lazily, when the name or hash of the symbol is\n" + "first requested.") #define FUNC_NAME s_scm_gensym { - static int gensym_counter = 0; - - SCM suffix, name; - int n, n_digits; - char buf[SCM_INTBUFLEN]; + SCM prefix_stringbuf; if (SCM_UNBNDP (prefix)) prefix = default_gensym_prefix; + else + SCM_VALIDATE_STRING (1, prefix); - /* mutex in case another thread looks and incs at the exact same moment */ - scm_i_scm_pthread_mutex_lock (&scm_i_misc_mutex); - n = gensym_counter++; - scm_i_pthread_mutex_unlock (&scm_i_misc_mutex); + prefix_stringbuf = scm_i_stringbuf_from_string (prefix); - n_digits = scm_iint2str (n, 10, buf); - suffix = scm_from_latin1_stringn (buf, n_digits); - name = scm_string_append (scm_list_2 (prefix, suffix)); - return scm_string_to_symbol (name); + /* Allocate a special symbol with the lazy gensym flag set. Except + for a few special exceptions, all code must check this flag before + accessing the name or hash fields of symbols. When the gensym is + forced, it will set the name and hash fields to their final values, + and then clear the lazy gensym flag. For now, we store the gensym + prefix as the symbol name, and 0 as the hash value. */ + return scm_double_cell (scm_tc7_symbol | SCM_I_F_SYMBOL_LAZY_GENSYM, + SCM_UNPACK (prefix_stringbuf), (scm_t_bits) 0, + SCM_UNPACK (scm_cons (SCM_BOOL_F, SCM_EOL))); } #undef FUNC_NAME +/* + * Forcing lazy gensyms + * + * Here we must choose a name for our gensym and set its 'equal?' hash + * value to match its name. In most cases, we will simply append the + * current gensym counter to the prefix to form the name, increment the + * counter, and intern the symbol. However, there are some + * complications. + * + * The name we ultimately assign to the gensym _must_ not already be + * interned. To understand why, consider this scenario: Suppose the + * user asks for a lazy gensym with prefix "foo", and we assign it the + * number 6. Now suppose sometime later, but before the gensym is + * forced, the symbol 'foo6' is independently interned. Now we have two + * distinct symbols (in the sense of 'eq?'), both semantically interned, + * with the same name. This is a violation of the most fundamental + * property of symbols. + * + * Therefore, if the first counter value we try yields a name that has + * already been interned, we try the next counter value, and repeat + * until we successfully intern our symbol. Only then can we clear the + * lazy gensym flag and thereby allow the name and 'equal?' hash value + * to be accessed. + */ +void +scm_i_force_lazy_gensym (SCM sym) +{ + static int gensym_counter = 0; + + SCM prefix, suffix, name, handle; + int n, n_digits; + char buf[SCM_INTBUFLEN]; + + /* mutex in case another thread forces a gensym (possibly this one) */ + scm_i_pthread_mutex_lock (&symbols_lock); + if (SCM_LIKELY (scm_i_symbol_is_lazy_gensym (sym))) + { + prefix = scm_i_string_from_stringbuf (SYMBOL_STRINGBUF (sym)); + + do + { + n = gensym_counter++; + + n_digits = scm_iint2str (n, 10, buf); + suffix = scm_from_latin1_stringn (buf, n_digits); + name = scm_string_append (scm_list_2 (prefix, suffix)); + + /* Set the name and hash to their candidate values. */ + SCM_SET_CELL_OBJECT_1 (sym, scm_i_stringbuf_from_string (name)); + SCM_SET_CELL_WORD_2 (sym, scm_i_string_hash (name)); + + /* Attempt to intern the symbol */ + handle = scm_hash_fn_create_handle_x (symbols, sym, SCM_UNDEFINED, + symbol_lookup_hash_fn, + symbol_lookup_assoc_fn, + NULL); + } while (SCM_UNLIKELY (!scm_is_eq (sym, SCM_CAR (handle)))); + + /* We must not clear the lazy gensym flag until our symbol has + been interned. The lock does not save us here, because another + thread could retrieve our gensym's name or hash outside of any + lock. */ + SCM_SET_CELL_WORD_0 (sym, (SCM_CELL_WORD_0 (sym) + & ~SCM_I_F_SYMBOL_LAZY_GENSYM)); + } + scm_i_pthread_mutex_unlock (&symbols_lock); +} + SCM_DEFINE (scm_symbol_hash, "symbol-hash", 1, 0, 0, (SCM symbol), "Return a hash value for @var{symbol}.") diff --git a/libguile/symbols.h b/libguile/symbols.h index 6106f9e..b8fe997 100644 --- a/libguile/symbols.h +++ b/libguile/symbols.h @@ -28,11 +28,13 @@ #define scm_is_symbol(x) (!SCM_IMP (x) \ && (SCM_TYP7 (x) == scm_tc7_symbol)) -#define scm_i_symbol_hash(x) ((unsigned long) SCM_CELL_WORD_2 (x)) #define scm_i_symbol_is_interned(x) \ (!(SCM_CELL_WORD_0 (x) & SCM_I_F_SYMBOL_UNINTERNED)) +#define scm_i_symbol_is_lazy_gensym(x) \ + (SCM_CELL_WORD_0 (x) & SCM_I_F_SYMBOL_LAZY_GENSYM) #define SCM_I_F_SYMBOL_UNINTERNED 0x100 +#define SCM_I_F_SYMBOL_LAZY_GENSYM 0x200 @@ -90,8 +92,10 @@ SCM_API SCM scm_take_utf8_symboln (char *sym, size_t len); /* internal functions. */ +SCM_INTERNAL unsigned long scm_i_symbol_hash (SCM symbol); SCM_INTERNAL unsigned long scm_i_hash_symbol (SCM obj, unsigned long n, void *closure); +SCM_INTERNAL void scm_i_force_lazy_gensym (SCM sym); SCM_INTERNAL void scm_symbols_prehistory (void); SCM_INTERNAL void scm_init_symbols (void); diff --git a/test-suite/tests/symbols.test b/test-suite/tests/symbols.test index 6fbc6be..0dbb121 100644 --- a/test-suite/tests/symbols.test +++ b/test-suite/tests/symbols.test @@ -149,7 +149,41 @@ (symbol? (gensym (make-string 4000 #\!)))) (pass-if "accepts embedded NULs" - (> (string-length (symbol->string (gensym "foo\0bar\0braz\0foo\0bar\0braz\0foo\0bar\0braz\0foo\0bar\0braz\0foo\0bar\0braz\0foo\0bar\0braz\0"))) 6))) + (> (string-length (symbol->string (gensym "foo\0bar\0braz\0foo\0bar\0braz\0foo\0bar\0braz\0foo\0bar\0braz\0foo\0bar\0braz\0foo\0bar\0braz\0"))) 6)) + + (pass-if "accepts substring prefixes" + (let* ((prefix (substring "foobar" 1 4)) + (symbol (gensym prefix)) + (name (symbol->string symbol))) + (string= "oob" (substring name 0 3)))) + + (pass-if "accepts shared substring prefixes" + (let* ((prefix (substring/shared (string-copy "foobar") + 1 4)) + (symbol (gensym prefix)) + (name (symbol->string symbol))) + (string= "oob" (substring name 0 3)))) + + (pass-if "counter incremented lazily" + (let* ((s1 (gensym "")) + (s2 (gensym "")) + (s3 (gensym "")) + (s4 (gensym "")) + (s4-counter (string->number (symbol->string s4))) + (s1-counter (string->number (symbol->string s1)))) + (= s1-counter (1+ s4-counter)))) + + (pass-if "unaffected by mutation of prefix" + (let* ((prefix (string-copy "foo")) + (symbol (gensym prefix))) + (string-set! prefix 0 #\g) + (string= "foo" (substring (symbol->string symbol) 0 3)))) + + (pass-if "avoids existing interned symbols" + (let* ((n (1+ (string->number (symbol->string (gensym ""))))) + (colliding-symbol (string->symbol (number->string n))) + (symbol (gensym ""))) + (< n (string->number (symbol->string symbol)))))) (with-test-prefix "extended read syntax" (pass-if (equal? "#{}#" (object->string (string->symbol "")))) -- 1.7.5.4