From: Dmitry Antipov <antipov@mvista.com>
To: emacs-devel@gnu.org
Subject: immediate strings
Date: Sat, 26 Nov 2011 06:15:24 +0400 [thread overview]
Message-ID: <4ED04BBC.70803@yandex.ru> (raw)
[-- Attachment #1: Type: text/plain, Size: 456 bytes --]
Small strings (up to 3 bytes on 32-bit, up to 7 bytes on 64-bit) can be represented
as 'immediates' of struct Lisp_String, without allocating data separately. I observed
two usage scenarios (editing with a few tens of buffers and long byte-force-recompile
runs), and in both cases ~10% of live strings are less than 4 bytes long, and ~30% of
live strings are less than 8 bytes long. So I think it's worth playing with such a
little complication.
Dmitry
[-- Attachment #2: immstr.patch --]
[-- Type: text/plain, Size: 9262 bytes --]
=== modified file 'src/alloc.c'
--- src/alloc.c 2011-11-20 03:07:02 +0000
+++ src/alloc.c 2011-11-26 02:01:23 +0000
@@ -136,9 +136,9 @@
/* Mark, unmark, query mark bit of a Lisp string. S must be a pointer
to a struct Lisp_String. */
-#define MARK_STRING(S) ((S)->size |= ARRAY_MARK_FLAG)
-#define UNMARK_STRING(S) ((S)->size &= ~ARRAY_MARK_FLAG)
-#define STRING_MARKED_P(S) (((S)->size & ARRAY_MARK_FLAG) != 0)
+#define MARK_STRING(S) ((S)->gcmarkbit = 1)
+#define UNMARK_STRING(S) ((S)->gcmarkbit = 0)
+#define STRING_MARKED_P(S) ((S)->gcmarkbit)
#define VECTOR_MARK(V) ((V)->header.size |= ARRAY_MARK_FLAG)
#define VECTOR_UNMARK(V) ((V)->header.size &= ~ARRAY_MARK_FLAG)
@@ -1733,7 +1733,8 @@
a pointer to the `u.data' member of its sdata structure; the
structure starts at a constant offset in front of that. */
-#define SDATA_OF_STRING(S) ((struct sdata *) ((S)->data - SDATA_DATA_OFFSET))
+#define SDATA_OF_STRING(S) ((S)->immbit ? (abort (), (struct sdata *) NULL) \
+ : ((struct sdata *) ((S)->u.ptrdata - SDATA_DATA_OFFSET)))
#ifdef GC_CHECK_STRING_OVERRUN
@@ -1824,12 +1825,25 @@
string_bytes (struct Lisp_String *s)
{
EMACS_INT nbytes =
- (s->size_byte < 0 ? s->size & ~ARRAY_MARK_FLAG : s->size_byte);
+ (s->size_byte < 0 ? s->size : s->size_byte);
- if (!PURE_POINTER_P (s)
- && s->data
- && nbytes != SDATA_NBYTES (SDATA_OF_STRING (s)))
- abort ();
+ if (s->immbit)
+ {
+ if (nbytes >= STRING_IMMEDIATE_MAX)
+ /* Impossible immediate string. */
+ abort ();
+ }
+ else
+ {
+ if (nbytes < STRING_IMMEDIATE_MAX)
+ /* Impossible normal string. */
+ abort ();
+ if (!PURE_POINTER_P (s) &&
+ s->u.ptrdata &&
+ nbytes != SDATA_NBYTES (SDATA_OF_STRING (s)))
+ /* Normal non-pure string with size mismatch. */
+ abort ();
+ }
return nbytes;
}
@@ -2000,7 +2014,7 @@
/* Determine the number of bytes needed to store NBYTES bytes
of string data. */
needed = SDATA_SIZE (nbytes);
- old_data = s->data ? SDATA_OF_STRING (s) : NULL;
+ old_data = s->u.ptrdata ? SDATA_OF_STRING (s) : NULL;
old_nbytes = GC_STRING_BYTES (s);
MALLOC_BLOCK_INPUT;
@@ -2060,13 +2074,11 @@
MALLOC_UNBLOCK_INPUT;
data->string = s;
- s->data = SDATA_DATA (data);
+ s->u.ptrdata = SDATA_DATA (data);
#ifdef GC_CHECK_STRING_BYTES
SDATA_NBYTES (data) = nbytes;
#endif
- s->size = nchars;
- s->size_byte = nbytes;
- s->data[nbytes] = '\0';
+ s->u.ptrdata[nbytes] = '\0';
#ifdef GC_CHECK_STRING_OVERRUN
memcpy ((char *) data + needed, string_overrun_cookie,
GC_STRING_OVERRUN_COOKIE_SIZE);
@@ -2109,21 +2121,20 @@
{
struct Lisp_String *s = b->strings + i;
- if (s->data)
+ if (STRING_MARKED_P (s))
+ {
+ /* String is live; unmark it and its intervals. */
+ UNMARK_STRING (s);
+
+ if (!NULL_INTERVAL_P (s->intervals))
+ UNMARK_BALANCE_INTERVALS (s->intervals);
+
+ ++total_strings;
+ total_string_size += STRING_BYTES (s);
+ }
+ else
{
- /* String was not on free-list before. */
- if (STRING_MARKED_P (s))
- {
- /* String is live; unmark it and its intervals. */
- UNMARK_STRING (s);
-
- if (!NULL_INTERVAL_P (s->intervals))
- UNMARK_BALANCE_INTERVALS (s->intervals);
-
- ++total_strings;
- total_string_size += STRING_BYTES (s);
- }
- else
+ if (!s->immbit && s->u.ptrdata)
{
/* String is dead. Put it on the free-list. */
struct sdata *data = SDATA_OF_STRING (s);
@@ -2138,20 +2149,13 @@
data->u.nbytes = GC_STRING_BYTES (s);
#endif
data->string = NULL;
-
- /* Reset the strings's `data' member so that we
- know it's free. */
- s->data = NULL;
-
- /* Put the string on the free-list. */
- NEXT_FREE_LISP_STRING (s) = string_free_list;
- string_free_list = s;
- ++nfree;
}
- }
- else
- {
- /* S was on the free-list before. Put it there again. */
+
+ /* Reset the string's `data' member so that we know it's
+ free. This also zeroes an immediate string's storage. */
+ s->u.ptrdata = NULL;
+
+ /* Put the string on the free-list. */
NEXT_FREE_LISP_STRING (s) = string_free_list;
string_free_list = s;
++nfree;
@@ -2284,7 +2288,7 @@
{
xassert (tb != b || to < from);
memmove (to, from, nbytes + GC_STRING_EXTRA);
- to->string->data = SDATA_DATA (to);
+ to->string->u.ptrdata = SDATA_DATA (to);
}
/* Advance past the sdata we copied to. */
@@ -2533,7 +2537,15 @@
return empty_multibyte_string;
s = allocate_string ();
- allocate_string_data (s, nchars, nbytes);
+ s->size = nchars;
+ s->size_byte = nbytes;
+ if (nbytes < STRING_IMMEDIATE_MAX)
+ s->immbit = 1;
+ else
+ {
+ s->immbit = 0;
+ allocate_string_data (s, nchars, nbytes);
+ }
XSETSTRING (string, s);
string_chars_consed += nbytes;
return string;
@@ -3901,7 +3913,9 @@
return (offset >= 0
&& offset % sizeof b->strings[0] == 0
&& offset < (STRING_BLOCK_SIZE * sizeof b->strings[0])
- && ((struct Lisp_String *) p)->data != NULL);
+ /* Live immediate string always has at least one
+ non-zero byte, so this check will be passed. */
+ && ((struct Lisp_String *) p)->u.ptrdata != NULL);
}
else
return 0;
@@ -4801,13 +4815,25 @@
struct Lisp_String *s;
s = (struct Lisp_String *) pure_alloc (sizeof *s, Lisp_String);
- s->data = (unsigned char *) find_string_data_in_pure (data, nbytes);
- if (s->data == NULL)
- {
- s->data = (unsigned char *) pure_alloc (nbytes + 1, -1);
- memcpy (s->data, data, nbytes);
- s->data[nbytes] = '\0';
- }
+
+ if (nbytes < STRING_IMMEDIATE_MAX)
+ {
+ memcpy (s->u.immdata, data, nbytes);
+ s->u.immdata[nbytes] = '\0';
+ s->immbit = 1;
+ }
+ else
+ {
+ s->u.ptrdata = (unsigned char *) find_string_data_in_pure (data, nbytes);
+ if (s->u.ptrdata == NULL)
+ {
+ s->u.ptrdata = (unsigned char *) pure_alloc (nbytes + 1, -1);
+ memcpy (s->u.ptrdata, data, nbytes);
+ s->u.ptrdata[nbytes] = '\0';
+ }
+ s->immbit = 0;
+ }
+
s->size = nchars;
s->size_byte = multibyte ? nbytes : -1;
s->intervals = NULL_INTERVAL;
@@ -4828,7 +4854,19 @@
s = (struct Lisp_String *) pure_alloc (sizeof *s, Lisp_String);
s->size = nchars;
s->size_byte = -1;
- s->data = (unsigned char *) data;
+
+ if (nchars < STRING_IMMEDIATE_MAX)
+ {
+ memcpy (s->u.immdata, data, nchars);
+ s->u.immdata[nchars] = '\0';
+ s->immbit = 1;
+ }
+ else
+ {
+ s->u.ptrdata = (unsigned char *) data;
+ s->immbit = 0;
+ }
+
s->intervals = NULL_INTERVAL;
XSETSTRING (string, s);
return string;
=== modified file 'src/fns.c'
--- src/fns.c 2011-11-19 09:18:31 +0000
+++ src/fns.c 2011-11-25 10:42:04 +0000
@@ -2176,8 +2176,8 @@
int len = CHAR_STRING (charval, str);
EMACS_INT size_byte = SBYTES (array);
- if (INT_MULTIPLY_OVERFLOW (SCHARS (array), len)
- || SCHARS (array) * len != size_byte)
+ if (INT_MULTIPLY_OVERFLOW (size, len)
+ || size * len != size_byte)
error ("Attempt to change byte length of a string");
for (idx = 0; idx < size_byte; idx++)
*p++ = str[idx % len];
=== modified file 'src/lisp.h'
--- src/lisp.h 2011-11-20 03:07:02 +0000
+++ src/lisp.h 2011-11-25 15:15:57 +0000
@@ -691,7 +691,9 @@
/* Convenience macros for dealing with Lisp strings. */
-#define SDATA(string) (XSTRING (string)->data + 0)
+#define SDATA(string) (XSTRING (string)->immbit ? \
+ (XSTRING (string)->u.immdata + 0) : \
+ (XSTRING (string)->u.ptrdata + 0))
#define SREF(string, index) (SDATA (string)[index] + 0)
#define SSET(string, index, new) (SDATA (string)[index] = (new))
#define SCHARS(string) (XSTRING (string)->size + 0)
@@ -824,6 +826,9 @@
#define STRING_BYTES_BOUND \
min (MOST_POSITIVE_FIXNUM, (ptrdiff_t) min (SIZE_MAX, PTRDIFF_MAX) - 1)
+/* Maximum amount of bytes, including '\0', in an immediate string. */
+#define STRING_IMMEDIATE_MAX (sizeof (unsigned char *))
+
/* Mark STR as a unibyte string. */
#define STRING_SET_UNIBYTE(STR) \
do { if (EQ (STR, empty_multibyte_string)) \
@@ -843,14 +848,24 @@
/* Set text properties. */
#define STRING_SET_INTERVALS(STR, INT) (XSTRING (STR)->intervals = (INT))
-/* In a string or vector, the sign bit of the `size' is the gc mark bit */
-
struct Lisp_String
{
- EMACS_INT size;
- EMACS_INT size_byte;
- INTERVAL intervals; /* text properties in this string */
- unsigned char *data;
+ EMACS_INT size : BITS_PER_EMACS_INT - 1;
+ EMACS_INT size_byte : BITS_PER_EMACS_INT - 1;
+ unsigned gcmarkbit : 1;
+
+ /* 1 if this is a small immediate string with data stored within IMMDATA.
+ Otherwise this is a normal string with PTRDATA pointing to actual data. */
+ unsigned immbit : 1;
+
+ /* Text properties in this string. */
+ INTERVAL intervals;
+
+ /* Actual string data, depending on the value of IMMBIT. */
+ union {
+ unsigned char *ptrdata;
+ unsigned char immdata[STRING_IMMEDIATE_MAX];
+ } u;
};
/* Header of vector-like objects. This documents the layout constraints on
next reply other threads:[~2011-11-26 2:15 UTC|newest]
Thread overview: 15+ messages / expand[flat|nested] mbox.gz Atom feed top
2011-11-26 2:15 Dmitry Antipov [this message]
2011-11-26 7:56 ` immediate strings Andreas Schwab
2011-11-26 8:05 ` Paul Eggert
2011-11-26 10:03 ` Eli Zaretskii
2011-11-26 14:28 ` Stefan Monnier
2011-11-26 14:30 ` Dmitry Antipov
2011-11-26 21:13 ` Paul Eggert
2011-11-28 4:19 ` --with-wide-int Stefan Monnier
2011-11-28 7:56 ` --with-wide-int Paul Eggert
2011-11-28 16:52 ` --with-wide-int Stefan Monnier
2011-11-28 18:04 ` --with-wide-int Dan Nicolaescu
2011-11-28 18:34 ` --with-wide-int Eli Zaretskii
2011-11-28 19:35 ` --with-wide-int Stefan Monnier
2011-11-29 4:26 ` --with-wide-int Stephen J. Turnbull
2011-11-29 7:35 ` --with-wide-int Paul Eggert
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
List information: https://www.gnu.org/software/emacs/
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=4ED04BBC.70803@yandex.ru \
--to=antipov@mvista.com \
--cc=emacs-devel@gnu.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this public inbox
https://git.savannah.gnu.org/cgit/emacs.git
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).