unofficial mirror of emacs-devel@gnu.org 
 help / color / mirror / code / Atom feed
* immediate strings
@ 2011-11-26  2:15 Dmitry Antipov
  2011-11-26  7:56 ` Andreas Schwab
  2011-11-26  8:05 ` Paul Eggert
  0 siblings, 2 replies; 15+ messages in thread
From: Dmitry Antipov @ 2011-11-26  2:15 UTC (permalink / raw)
  To: emacs-devel

[-- Attachment #1: Type: text/plain, Size: 456 bytes --]

Small strings (up to 3 bytes on 32-bit, up to 7 bytes on 64-bin) can be represented
as 'immediates' of struct Lisp_String, without allocating data separately. I observed
two usage scenarios (editing with a few tens of buffers and long byte-force-recompile
runs), and in both cases ~10% of live strings are less than 4 bytes long, and ~30% of
live strings are less than 8 bytes long. So I think it's worth playing with such a
little complication.

Dmitry



[-- Attachment #2: immstr.patch --]
[-- Type: text/plain, Size: 9262 bytes --]

=== modified file 'src/alloc.c'
--- src/alloc.c	2011-11-20 03:07:02 +0000
+++ src/alloc.c	2011-11-26 02:01:23 +0000
@@ -136,9 +136,9 @@
 /* Mark, unmark, query mark bit of a Lisp string.  S must be a pointer
    to a struct Lisp_String.  */
 
-#define MARK_STRING(S)		((S)->size |= ARRAY_MARK_FLAG)
-#define UNMARK_STRING(S)	((S)->size &= ~ARRAY_MARK_FLAG)
-#define STRING_MARKED_P(S)	(((S)->size & ARRAY_MARK_FLAG) != 0)
+#define MARK_STRING(S)		((S)->gcmarkbit = 1)
+#define UNMARK_STRING(S)	((S)->gcmarkbit = 0)
+#define STRING_MARKED_P(S)	((S)->gcmarkbit)
 
 #define VECTOR_MARK(V)		((V)->header.size |= ARRAY_MARK_FLAG)
 #define VECTOR_UNMARK(V)	((V)->header.size &= ~ARRAY_MARK_FLAG)
@@ -1733,7 +1733,8 @@
    a pointer to the `u.data' member of its sdata structure; the
    structure starts at a constant offset in front of that.  */
 
-#define SDATA_OF_STRING(S) ((struct sdata *) ((S)->data - SDATA_DATA_OFFSET))
+#define SDATA_OF_STRING(S) ((S)->immbit ? (abort (), (struct sdata *) NULL) \
+  : ((struct sdata *) ((S)->u.ptrdata - SDATA_DATA_OFFSET)))
 
 
 #ifdef GC_CHECK_STRING_OVERRUN
@@ -1824,12 +1825,25 @@
 string_bytes (struct Lisp_String *s)
 {
   EMACS_INT nbytes =
-    (s->size_byte < 0 ? s->size & ~ARRAY_MARK_FLAG : s->size_byte);
+    (s->size_byte < 0 ? s->size : s->size_byte);
 
-  if (!PURE_POINTER_P (s)
-      && s->data
-      && nbytes != SDATA_NBYTES (SDATA_OF_STRING (s)))
-    abort ();
+  if (s->immbit)
+    {
+      if (nbytes >= STRING_IMMEDIATE_MAX)
+	/* Impossible immediate string.  */
+	abort ();
+    }
+  else
+    {
+      if (nbytes < STRING_IMMEDIATE_MAX)
+	/* Impossible normal string.  */
+	abort ();
+      if (!PURE_POINTER_P (s) &&
+	  s->u.ptrdata &&
+	  nbytes != SDATA_NBYTES (SDATA_OF_STRING (s)))
+	/* Normal non-pure string with size mismatch.  */
+	abort ();
+    }
   return nbytes;
 }
 
@@ -2000,7 +2014,7 @@
   /* Determine the number of bytes needed to store NBYTES bytes
      of string data.  */
   needed = SDATA_SIZE (nbytes);
-  old_data = s->data ? SDATA_OF_STRING (s) : NULL;
+  old_data = s->u.ptrdata ? SDATA_OF_STRING (s) : NULL;
   old_nbytes = GC_STRING_BYTES (s);
 
   MALLOC_BLOCK_INPUT;
@@ -2060,13 +2074,11 @@
   MALLOC_UNBLOCK_INPUT;
 
   data->string = s;
-  s->data = SDATA_DATA (data);
+  s->u.ptrdata = SDATA_DATA (data);
 #ifdef GC_CHECK_STRING_BYTES
   SDATA_NBYTES (data) = nbytes;
 #endif
-  s->size = nchars;
-  s->size_byte = nbytes;
-  s->data[nbytes] = '\0';
+  s->u.ptrdata[nbytes] = '\0';
 #ifdef GC_CHECK_STRING_OVERRUN
   memcpy ((char *) data + needed, string_overrun_cookie,
 	  GC_STRING_OVERRUN_COOKIE_SIZE);
@@ -2109,21 +2121,20 @@
 	{
 	  struct Lisp_String *s = b->strings + i;
 
-	  if (s->data)
+	  if (STRING_MARKED_P (s))
+	    {	      
+	      /* String is live; unmark it and its intervals.  */
+	      UNMARK_STRING (s);
+
+	      if (!NULL_INTERVAL_P (s->intervals))
+		UNMARK_BALANCE_INTERVALS (s->intervals);
+
+	      ++total_strings;
+	      total_string_size += STRING_BYTES (s);
+	    }
+	  else
 	    {
-	      /* String was not on free-list before.  */
-	      if (STRING_MARKED_P (s))
-		{
-		  /* String is live; unmark it and its intervals.  */
-		  UNMARK_STRING (s);
-
-		  if (!NULL_INTERVAL_P (s->intervals))
-		    UNMARK_BALANCE_INTERVALS (s->intervals);
-
-		  ++total_strings;
-		  total_string_size += STRING_BYTES (s);
-		}
-	      else
+	      if (!s->immbit && s->u.ptrdata)
 		{
 		  /* String is dead.  Put it on the free-list.  */
 		  struct sdata *data = SDATA_OF_STRING (s);
@@ -2138,20 +2149,13 @@
 		  data->u.nbytes = GC_STRING_BYTES (s);
 #endif
 		  data->string = NULL;
-
-		  /* Reset the strings's `data' member so that we
-		     know it's free.  */
-		  s->data = NULL;
-
-		  /* Put the string on the free-list.  */
-		  NEXT_FREE_LISP_STRING (s) = string_free_list;
-		  string_free_list = s;
-		  ++nfree;
 		}
-	    }
-	  else
-	    {
-	      /* S was on the free-list before.  Put it there again.  */
+	      
+	      /* Reset the strings's `data' member so that we know it's
+		 free.  This also zeroes an immediate string storage.  */
+	      s->u.ptrdata = NULL;
+
+	      /* Put the string on the free-list.  */
 	      NEXT_FREE_LISP_STRING (s) = string_free_list;
 	      string_free_list = s;
 	      ++nfree;
@@ -2284,7 +2288,7 @@
 		{
 		  xassert (tb != b || to < from);
 		  memmove (to, from, nbytes + GC_STRING_EXTRA);
-		  to->string->data = SDATA_DATA (to);
+		  to->string->u.ptrdata = SDATA_DATA (to);
 		}
 
 	      /* Advance past the sdata we copied to.  */
@@ -2533,7 +2537,15 @@
     return empty_multibyte_string;
 
   s = allocate_string ();
-  allocate_string_data (s, nchars, nbytes);
+  s->size = nchars;
+  s->size_byte = nbytes;
+  if (nbytes < STRING_IMMEDIATE_MAX)
+    s->immbit = 1;
+  else
+    {
+      s->immbit = 0;
+      allocate_string_data (s, nchars, nbytes);
+    }
   XSETSTRING (string, s);
   string_chars_consed += nbytes;
   return string;
@@ -3901,7 +3913,9 @@
       return (offset >= 0
 	      && offset % sizeof b->strings[0] == 0
 	      && offset < (STRING_BLOCK_SIZE * sizeof b->strings[0])
-	      && ((struct Lisp_String *) p)->data != NULL);
+	      /* Live immediate string always has at least one
+		 non-zero byte, so this check will be passed.  */
+	      && ((struct Lisp_String *) p)->u.ptrdata != NULL);
     }
   else
     return 0;
@@ -4801,13 +4815,25 @@
   struct Lisp_String *s;
 
   s = (struct Lisp_String *) pure_alloc (sizeof *s, Lisp_String);
-  s->data = (unsigned char *) find_string_data_in_pure (data, nbytes);
-  if (s->data == NULL)
-    {
-      s->data = (unsigned char *) pure_alloc (nbytes + 1, -1);
-      memcpy (s->data, data, nbytes);
-      s->data[nbytes] = '\0';
-    }
+
+  if (nbytes < STRING_IMMEDIATE_MAX)
+    {
+      memcpy (s->u.immdata, data, nbytes);
+      s->u.immdata[nbytes] = '\0';
+      s->immbit = 1;
+    }
+  else
+    {
+      s->u.ptrdata = (unsigned char *) find_string_data_in_pure (data, nbytes);
+      if (s->u.ptrdata == NULL)
+	{
+	  s->u.ptrdata = (unsigned char *) pure_alloc (nbytes + 1, -1);
+	  memcpy (s->u.ptrdata, data, nbytes);
+	  s->u.ptrdata[nbytes] = '\0';
+	}
+      s->immbit = 0;
+    }
+
   s->size = nchars;
   s->size_byte = multibyte ? nbytes : -1;
   s->intervals = NULL_INTERVAL;
@@ -4828,7 +4854,19 @@
   s = (struct Lisp_String *) pure_alloc (sizeof *s, Lisp_String);
   s->size = nchars;
   s->size_byte = -1;
-  s->data = (unsigned char *) data;
+
+  if (nchars < STRING_IMMEDIATE_MAX)
+    {
+      memcpy (s->u.immdata, data, nchars);
+      s->u.immdata[nchars] = '\0';
+      s->immbit = 1;
+    }
+  else
+    {
+      s->u.ptrdata = (unsigned char *) data;
+      s->immbit = 0;
+    }
+
   s->intervals = NULL_INTERVAL;
   XSETSTRING (string, s);
   return string;

=== modified file 'src/fns.c'
--- src/fns.c	2011-11-19 09:18:31 +0000
+++ src/fns.c	2011-11-25 10:42:04 +0000
@@ -2176,8 +2176,8 @@
 	  int len = CHAR_STRING (charval, str);
 	  EMACS_INT size_byte = SBYTES (array);
 
-	  if (INT_MULTIPLY_OVERFLOW (SCHARS (array), len)
-	      || SCHARS (array) * len != size_byte)
+	  if (INT_MULTIPLY_OVERFLOW (size, len)
+	      || size * len != size_byte)
 	    error ("Attempt to change byte length of a string");
 	  for (idx = 0; idx < size_byte; idx++)
 	    *p++ = str[idx % len];

=== modified file 'src/lisp.h'
--- src/lisp.h	2011-11-20 03:07:02 +0000
+++ src/lisp.h	2011-11-25 15:15:57 +0000
@@ -691,7 +691,9 @@
 
 /* Convenience macros for dealing with Lisp strings.  */
 
-#define SDATA(string)		(XSTRING (string)->data + 0)
+#define SDATA(string)		(XSTRING (string)->immbit ? \
+				 (XSTRING (string)->u.immdata + 0) : \
+				 (XSTRING (string)->u.ptrdata + 0))
 #define SREF(string, index)	(SDATA (string)[index] + 0)
 #define SSET(string, index, new) (SDATA (string)[index] = (new))
 #define SCHARS(string)		(XSTRING (string)->size + 0)
@@ -824,6 +826,9 @@
 #define STRING_BYTES_BOUND  \
   min (MOST_POSITIVE_FIXNUM, (ptrdiff_t) min (SIZE_MAX, PTRDIFF_MAX) - 1)
 
+/* Maximum amount of bytes, including '\0', in an immediate string.  */
+#define STRING_IMMEDIATE_MAX (sizeof (unsigned char *))
+
 /* Mark STR as a unibyte string.  */
 #define STRING_SET_UNIBYTE(STR)  \
   do { if (EQ (STR, empty_multibyte_string))  \
@@ -843,14 +848,24 @@
 /* Set text properties.  */
 #define STRING_SET_INTERVALS(STR, INT) (XSTRING (STR)->intervals = (INT))
 
-/* In a string or vector, the sign bit of the `size' is the gc mark bit */
-
 struct Lisp_String
   {
-    EMACS_INT size;
-    EMACS_INT size_byte;
-    INTERVAL intervals;		/* text properties in this string */
-    unsigned char *data;
+    EMACS_INT size : BITS_PER_EMACS_INT - 1;
+    EMACS_INT size_byte : BITS_PER_EMACS_INT - 1;
+    unsigned gcmarkbit : 1;
+
+    /* 1 if this is a small immediate string with data stored within IMMDATA.
+       Otherwise this is a normal string with PTRDATA points to actual data.  */
+    unsigned immbit : 1;
+
+    /* Text properties in this string.  */
+    INTERVAL intervals;
+
+    /* Actual string data, depending on the value of IMMBIT.  */
+    union {
+      unsigned char *ptrdata;
+      unsigned char immdata[STRING_IMMEDIATE_MAX];
+    } u;
   };
 
 /* Header of vector-like objects.  This documents the layout constraints on


^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2011-11-29  7:35 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2011-11-26  2:15 immediate strings Dmitry Antipov
2011-11-26  7:56 ` Andreas Schwab
2011-11-26  8:05 ` Paul Eggert
2011-11-26 10:03   ` Eli Zaretskii
2011-11-26 14:28     ` Stefan Monnier
2011-11-26 14:30   ` Dmitry Antipov
2011-11-26 21:13     ` Paul Eggert
2011-11-28  4:19       ` --with-wide-int Stefan Monnier
2011-11-28  7:56         ` --with-wide-int Paul Eggert
2011-11-28 16:52           ` --with-wide-int Stefan Monnier
2011-11-28 18:04             ` --with-wide-int Dan Nicolaescu
2011-11-28 18:34               ` --with-wide-int Eli Zaretskii
2011-11-28 19:35               ` --with-wide-int Stefan Monnier
2011-11-29  4:26                 ` --with-wide-int Stephen J. Turnbull
2011-11-29  7:35             ` --with-wide-int Paul Eggert

Code repositories for project(s) associated with this public inbox

	https://git.savannah.gnu.org/cgit/emacs.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).