Proposal: immediate strings

unofficial mirror of emacs-devel@gnu.org 
 help / color / mirror / code / Atom feed

* Proposal: immediate strings
@ 2012-05-22  8:44 Dmitry Antipov
  2012-05-22 20:51 ` Miles Bader
  2012-05-24  5:17 ` Stefan Monnier
  0 siblings, 2 replies; 30+ messages in thread
From: Dmitry Antipov @ 2012-05-22  8:44 UTC (permalink / raw)
  To: emacs-devel

[-- Attachment #1: Type: text/plain, Size: 238 bytes --]

This is an old attempt (updated to current bzr trunk) to represent small strings
as "immediates" of struct Lisp_String without allocating data separately. It was
already discussed some time ago, and I would like to bump it again.

Dmitry

[-- Attachment #2: immstr.patch --]
[-- Type: text/plain, Size: 19418 bytes --]

=== modified file 'configure.in'
--- configure.in	2012-05-21 15:36:54 +0000
+++ configure.in	2012-05-22 08:32:37 +0000
@@ -3658,6 +3658,12 @@
 #define NO_INLINE
 #endif
 
+#if __GNUC__ > 1 /* Any GCC still in use should support this.  */
+#define PACKED __attribute__((packed))
+#else
+#define PACKED
+#endif
+
 #if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1))
 #define EXTERNALLY_VISIBLE __attribute__((externally_visible))
 #else

=== modified file 'src/alloc.c'
--- src/alloc.c	2012-05-21 15:36:54 +0000
+++ src/alloc.c	2012-05-22 08:29:06 +0000
@@ -147,20 +147,14 @@
 /* Mark, unmark, query mark bit of a Lisp string.  S must be a pointer
    to a struct Lisp_String.  */
 
-#define MARK_STRING(S)		((S)->size |= ARRAY_MARK_FLAG)
-#define UNMARK_STRING(S)	((S)->size &= ~ARRAY_MARK_FLAG)
-#define STRING_MARKED_P(S)	(((S)->size & ARRAY_MARK_FLAG) != 0)
+#define MARK_STRING(S)		((S)->u.imm.gcmarkbit = 1)
+#define UNMARK_STRING(S)	((S)->u.imm.gcmarkbit = 0)
+#define STRING_MARKED_P(S)	((S)->u.imm.gcmarkbit)
 
 #define VECTOR_MARK(V)		((V)->header.size |= ARRAY_MARK_FLAG)
 #define VECTOR_UNMARK(V)	((V)->header.size &= ~ARRAY_MARK_FLAG)
 #define VECTOR_MARKED_P(V)	(((V)->header.size & ARRAY_MARK_FLAG) != 0)
 
-/* Value is the number of bytes of S, a pointer to a struct Lisp_String.
-   Be careful during GC, because S->size contains the mark bit for
-   strings.  */
-
-#define GC_STRING_BYTES(S)	(STRING_BYTES (S))
-
 /* Global variables.  */
 struct emacs_globals globals;
 
@@ -392,6 +386,7 @@
 static void mark_stack (void);
 static int live_vector_p (struct mem_node *, void *);
 static int live_buffer_p (struct mem_node *, void *);
+static int live_string_data_p (struct Lisp_String *);
 static int live_string_p (struct mem_node *, void *);
 static int live_cons_p (struct mem_node *, void *);
 static int live_symbol_p (struct mem_node *, void *);
@@ -1761,7 +1756,8 @@
    a pointer to the `u.data' member of its sdata structure; the
    structure starts at a constant offset in front of that.  */
 
-#define SDATA_OF_STRING(S) ((struct sdata *) ((S)->data - SDATA_DATA_OFFSET))
+#define SDATA_OF_STRING(S) ((S)->u.imm.immbit ? (struct sdata *) NULL \
+  : ((struct sdata *) ((S)->u.dat.data - SDATA_DATA_OFFSET)))
 
 
 #ifdef GC_CHECK_STRING_OVERRUN
@@ -1843,21 +1839,34 @@
 
 static int check_string_bytes_count;
 
-#define CHECK_STRING_BYTES(S)	STRING_BYTES (S)
-
-
-/* Like GC_STRING_BYTES, but with debugging check.  */
+#define CHECK_STRING_BYTES(S) string_bytes (S)
 
 EMACS_INT
 string_bytes (struct Lisp_String *s)
 {
-  EMACS_INT nbytes =
-    (s->size_byte < 0 ? s->size & ~ARRAY_MARK_FLAG : s->size_byte);
+  EMACS_INT nbytes;
 
-  if (!PURE_POINTER_P (s)
-      && s->data
-      && nbytes != SDATA_NBYTES (SDATA_OF_STRING (s)))
-    abort ();
+  if (s->u.imm.immbit)
+    {
+      nbytes = s->u.imm.size_byte < 0 ?
+	s->u.imm.size : s->u.imm.size_byte;
+      if (nbytes >= STRING_IMM_MAX)
+	/* Impossible immediate string.  */
+	abort ();
+    }
+  else
+    {
+      nbytes = s->u.dat.size_byte < 0 ?
+	s->u.dat.size : s->u.dat.size_byte;
+      if (nbytes < STRING_IMM_MAX)
+	/* Impossible normal string.  */
+	abort ();
+      if (!PURE_POINTER_P (s) &&
+	  s->u.dat.data &&
+	  nbytes != SDATA_NBYTES (SDATA_OF_STRING (s)))
+	/* Normal non-pure string with size mismatch.  */
+	abort ();
+    }
   return nbytes;
 }
 
@@ -1882,7 +1891,7 @@
 	CHECK_STRING_BYTES (from->string);
 
       if (from->string)
-	nbytes = GC_STRING_BYTES (from->string);
+	nbytes = string_bytes (from->string);
       else
 	nbytes = SDATA_NBYTES (from);
 
@@ -2028,8 +2037,8 @@
   /* Determine the number of bytes needed to store NBYTES bytes
      of string data.  */
   needed = SDATA_SIZE (nbytes);
-  old_data = s->data ? SDATA_OF_STRING (s) : NULL;
-  old_nbytes = GC_STRING_BYTES (s);
+  old_data = s->u.dat.data ? SDATA_OF_STRING (s) : NULL;
+  old_nbytes = string_bytes (s);
 
   MALLOC_BLOCK_INPUT;
 
@@ -2088,13 +2097,11 @@
   MALLOC_UNBLOCK_INPUT;
 
   data->string = s;
-  s->data = SDATA_DATA (data);
+  s->u.dat.data = SDATA_DATA (data);
 #ifdef GC_CHECK_STRING_BYTES
   SDATA_NBYTES (data) = nbytes;
 #endif
-  s->size = nchars;
-  s->size_byte = nbytes;
-  s->data[nbytes] = '\0';
+  s->u.dat.data[nbytes] = '\0';
 #ifdef GC_CHECK_STRING_OVERRUN
   memcpy ((char *) data + needed, string_overrun_cookie,
 	  GC_STRING_OVERRUN_COOKIE_SIZE);
@@ -2112,6 +2119,12 @@
   consing_since_gc += needed;
 }
 
+#ifdef GC_STRING_STATS
+
+static EMACS_INT total_imm_strings, total_dat_strings, total_imm_intervals;
+static EMACS_INT total_imm_bytes, total_dat_bytes, total_dat_intervals;
+
+#endif
 
 /* Sweep and compact strings.  */
 
@@ -2125,6 +2138,12 @@
   total_strings = total_free_strings = 0;
   total_string_size = 0;
 
+#ifdef GC_STRING_STATS
+  total_imm_strings = total_dat_strings = 0;
+  total_imm_bytes = total_dat_bytes = 0;
+  total_imm_intervals = total_dat_intervals = 0;
+#endif
+
   /* Scan strings_blocks, free Lisp_Strings that aren't marked.  */
   for (b = string_blocks; b; b = next)
     {
@@ -2137,49 +2156,64 @@
 	{
 	  struct Lisp_String *s = b->strings + i;
 
-	  if (s->data)
+	  if (STRING_MARKED_P (s))
+	    {	      
+	      /* String is live; unmark it and its intervals.  */
+	      UNMARK_STRING (s);
+
+	      if (!NULL_INTERVAL_P (s->intervals))
+		UNMARK_BALANCE_INTERVALS (s->intervals);
+
+	      ++total_strings;
+	      total_string_size += string_bytes (s);
+#ifdef GC_STRING_STATS
+	      if (s->u.imm.immbit)
+		{
+		  total_imm_strings++;
+		  total_imm_bytes += string_bytes (s);
+		  if (!NULL_INTERVAL_P (s->intervals))
+		    total_imm_intervals++;
+		}
+	      else
+		{
+		  total_dat_strings++;
+		  total_dat_bytes += string_bytes (s);
+		  if (!NULL_INTERVAL_P (s->intervals))
+		    total_dat_intervals++;
+		}
+#endif /* GC_STRING_STATS */
+	    }
+	  else
 	    {
-	      /* String was not on free-list before.  */
-	      if (STRING_MARKED_P (s))
-		{
-		  /* String is live; unmark it and its intervals.  */
-		  UNMARK_STRING (s);
-
-		  if (!NULL_INTERVAL_P (s->intervals))
-		    UNMARK_BALANCE_INTERVALS (s->intervals);
-
-		  ++total_strings;
-		  total_string_size += STRING_BYTES (s);
-		}
+	      if (s->u.imm.immbit)
+		/* Fill data with special pattern. Used by
+		   GC to find dead immediate strings.  */
+		memset (s->u.imm.data, 0xff, STRING_IMM_MAX);
 	      else
 		{
-		  /* String is dead.  Put it on the free-list.  */
-		  struct sdata *data = SDATA_OF_STRING (s);
+		  if (s->u.dat.data)
+		    {
+		      /* String is dead.  Put it on the free-list.  */
+		      struct sdata *data = SDATA_OF_STRING (s);
 
-		  /* Save the size of S in its sdata so that we know
-		     how large that is.  Reset the sdata's string
-		     back-pointer so that we know it's free.  */
+		      /* Save the size of S in its sdata so that we know
+			 how large that is.  Reset the sdata's string
+			 back-pointer so that we know it's free.  */
 #ifdef GC_CHECK_STRING_BYTES
-		  if (GC_STRING_BYTES (s) != SDATA_NBYTES (data))
-		    abort ();
+		      if (string_bytes (s) != SDATA_NBYTES (data))
+			abort ();
 #else
-		  data->u.nbytes = GC_STRING_BYTES (s);
+		      data->u.nbytes = string_bytes (s);
 #endif
-		  data->string = NULL;
-
-		  /* Reset the strings's `data' member so that we
-		     know it's free.  */
-		  s->data = NULL;
-
-		  /* Put the string on the free-list.  */
-		  NEXT_FREE_LISP_STRING (s) = string_free_list;
-		  string_free_list = s;
-		  ++nfree;
+		      data->string = NULL;
+
+		      /* Reset the strings's `data' member so that we
+			 know it's free.  */
+		      s->u.dat.data = NULL;
+		    }
 		}
-	    }
-	  else
-	    {
-	      /* S was on the free-list before.  Put it there again.  */
+
+	      /* Put the string on the free-list.  */
 	      NEXT_FREE_LISP_STRING (s) = string_free_list;
 	      string_free_list = s;
 	      ++nfree;
@@ -2271,12 +2305,12 @@
 	  /* Check that the string size recorded in the string is the
 	     same as the one recorded in the sdata structure. */
 	  if (from->string
-	      && GC_STRING_BYTES (from->string) != SDATA_NBYTES (from))
+	      && string_bytes (from->string) != SDATA_NBYTES (from))
 	    abort ();
 #endif /* GC_CHECK_STRING_BYTES */
 
 	  if (from->string)
-	    nbytes = GC_STRING_BYTES (from->string);
+	    nbytes = string_bytes (from->string);
 	  else
 	    nbytes = SDATA_NBYTES (from);
 
@@ -2312,7 +2346,7 @@
 		{
 		  xassert (tb != b || to < from);
 		  memmove (to, from, nbytes + GC_STRING_EXTRA);
-		  to->string->data = SDATA_DATA (to);
+		  to->string->u.dat.data = SDATA_DATA (to);
 		}
 
 	      /* Advance past the sdata we copied to.  */
@@ -2561,13 +2595,24 @@
     return empty_multibyte_string;
 
   s = allocate_string ();
-  allocate_string_data (s, nchars, nbytes);
+  if (nbytes < STRING_IMM_MAX)
+    {
+      s->u.imm.immbit = 1;
+      s->u.imm.size = nchars;
+      s->u.imm.size_byte = nbytes;
+    }
+  else
+    {
+      s->u.imm.immbit = 0;
+      s->u.dat.size = nchars;
+      s->u.dat.size_byte = nbytes;
+      allocate_string_data (s, nchars, nbytes);
+    }
   XSETSTRING (string, s);
   string_chars_consed += nbytes;
   return string;
 }
 
-
 \f
 /***********************************************************************
 			   Float Allocation
@@ -3936,6 +3981,22 @@
   x->color = MEM_BLACK;
 }
 
+/* Non-zero if data of S is valid.  */
+
+static inline int
+live_string_data_p (struct Lisp_String *s)
+{
+  if (s->u.imm.immbit)
+    {
+      unsigned char *p;
+
+      for (p = s->u.imm.data; p < s->u.imm.data + STRING_IMM_MAX; p++)
+	if (*p != 0xff)
+	  return 1;
+      return 0;
+    }
+  return s->u.dat.data != NULL;
+}
 
 /* Value is non-zero if P is a pointer to a live Lisp string on
    the heap.  M is a pointer to the mem_block for P.  */
@@ -3953,7 +4014,7 @@
       return (offset >= 0
 	      && offset % sizeof b->strings[0] == 0
 	      && offset < (STRING_BLOCK_SIZE * sizeof b->strings[0])
-	      && ((struct Lisp_String *) p)->data != NULL);
+	      && live_string_data_p ((struct Lisp_String *) p));
     }
   else
     return 0;
@@ -4868,15 +4929,29 @@
   struct Lisp_String *s;
 
   s = (struct Lisp_String *) pure_alloc (sizeof *s, Lisp_String);
-  s->data = (unsigned char *) find_string_data_in_pure (data, nbytes);
-  if (s->data == NULL)
-    {
-      s->data = (unsigned char *) pure_alloc (nbytes + 1, -1);
-      memcpy (s->data, data, nbytes);
-      s->data[nbytes] = '\0';
-    }
-  s->size = nchars;
-  s->size_byte = multibyte ? nbytes : -1;
+
+  if (nbytes < STRING_IMM_MAX)
+    {
+      memcpy (s->u.imm.data, data, nbytes);
+      s->u.imm.data[nbytes] = '\0';
+      s->u.imm.immbit = 1;
+      s->u.imm.size = nchars;
+      s->u.imm.size_byte = multibyte ? nbytes : -1;
+    }
+  else
+    {
+      s->u.dat.data = (unsigned char *) find_string_data_in_pure (data, nbytes);
+      if (s->u.dat.data == NULL)
+	{
+	  s->u.dat.data = (unsigned char *) pure_alloc (nbytes + 1, -1);
+	  memcpy (s->u.dat.data, data, nbytes);
+	  s->u.dat.data[nbytes] = '\0';
+	}
+      s->u.imm.immbit = 0;
+      s->u.dat.size = nchars;
+      s->u.dat.size_byte = multibyte ? nbytes : -1;
+    }
+
   s->intervals = NULL_INTERVAL;
   XSETSTRING (string, s);
   return string;
@@ -4893,9 +4968,23 @@
   EMACS_INT nchars = strlen (data);
 
   s = (struct Lisp_String *) pure_alloc (sizeof *s, Lisp_String);
-  s->size = nchars;
-  s->size_byte = -1;
-  s->data = (unsigned char *) data;
+
+  if (nchars < STRING_IMM_MAX)
+    {
+      memcpy (s->u.imm.data, data, nchars);
+      s->u.imm.data[nchars] = '\0';
+      s->u.imm.immbit = 1;
+      s->u.imm.size = nchars;
+      s->u.imm.size_byte = -1;
+    }
+  else
+    {
+      s->u.dat.data = (unsigned char *) data;
+      s->u.imm.immbit = 0;
+      s->u.dat.size = nchars;
+      s->u.dat.size_byte = -1;
+    }
+
   s->intervals = NULL_INTERVAL;
   XSETSTRING (string, s);
   return string;
@@ -6318,6 +6407,34 @@
   return Flist (8, consed);
 }
 
+#ifdef GC_STRING_STATS
+
+DEFUN ("string-stats", Fstring_stats, Sstring_stats, 0, 0, 0,
+       doc: /* Return a list of counters that measures how much
+strings of a particular internal structure are alive after last
+garbage collection, and how many bytes are in them.
+The elements of the value are are as follows:
+  (IMM-STRINGS IMM-BYTES IMM-INTERVALS DAT-STRINGS DAT-BYTES DAT-INTERVALS)
+where IMM-STRINGS is the number of immediate strings, IMM-BYTES is the total
+number of bytes in them, and IMM-INTERVALS is the number of immediate string
+with non-nil text properties. The rest three numbers has the same meaning
+for normal strings, respectively.  */)
+  (void)
+{
+  Lisp_Object data[6];
+
+  data[0] = make_number (min (MOST_POSITIVE_FIXNUM, total_imm_strings));
+  data[1] = make_number (min (MOST_POSITIVE_FIXNUM, total_imm_bytes));
+  data[2] = make_number (min (MOST_POSITIVE_FIXNUM, total_imm_intervals));
+  data[3] = make_number (min (MOST_POSITIVE_FIXNUM, total_dat_strings));
+  data[4] = make_number (min (MOST_POSITIVE_FIXNUM, total_dat_bytes));
+  data[5] = make_number (min (MOST_POSITIVE_FIXNUM, total_dat_intervals));
+
+  return Flist (6, data);
+}
+
+#endif /* GC_STRING_STATS */
+
 /* Find at most FIND_MAX symbols which have OBJ as their value or
    function.  This is used in gdbinit's `xwhichsymbols' command.  */
 
@@ -6546,7 +6663,9 @@
   defsubr (&Sgarbage_collect);
   defsubr (&Smemory_limit);
   defsubr (&Smemory_use_counts);
-
+#ifdef GC_STRING_STATS
+  defsubr (&Sstring_stats);
+#endif
 #if GC_MARK_STACK == GC_USE_GCPROS_CHECK_ZOMBIES
   defsubr (&Sgc_status);
 #endif

=== modified file 'src/fns.c'
--- src/fns.c	2012-05-21 15:36:54 +0000
+++ src/fns.c	2012-05-22 08:29:06 +0000
@@ -2172,8 +2172,8 @@
 	  int len = CHAR_STRING (charval, str);
 	  EMACS_INT size_byte = SBYTES (array);
 
-	  if (INT_MULTIPLY_OVERFLOW (SCHARS (array), len)
-	      || SCHARS (array) * len != size_byte)
+	  if (INT_MULTIPLY_OVERFLOW (size, len)
+	      || size * len != size_byte)
 	    error ("Attempt to change byte length of a string");
 	  for (idx = 0; idx < size_byte; idx++)
 	    *p++ = str[idx % len];

=== modified file 'src/lisp.h'
--- src/lisp.h	2012-05-21 15:36:54 +0000
+++ src/lisp.h	2012-05-22 08:29:06 +0000
@@ -727,17 +727,23 @@
 
 /* Convenience macros for dealing with Lisp strings.  */
 
-#define SDATA(string)		(XSTRING (string)->data + 0)
+#define SDATA(string)		(XSTRING (string)->u.imm.immbit ? \
+				 (XSTRING (string)->u.imm.data + 0) : \
+				 (XSTRING (string)->u.dat.data + 0))
 #define SREF(string, index)	(SDATA (string)[index] + 0)
 #define SSET(string, index, new) (SDATA (string)[index] = (new))
-#define SCHARS(string)		(XSTRING (string)->size + 0)
-#define SBYTES(string)		(STRING_BYTES (XSTRING (string)) + 0)
+#define SCHARS(string)		(XSTRING (string)->u.imm.immbit ? \
+				 (XSTRING (string)->u.imm.size + 0) : \
+				 (XSTRING (string)->u.dat.size + 0))
+#define SBYTES(string)		(string_bytes (XSTRING (string)) + 0)
 
 /* Avoid "differ in sign" warnings.  */
 #define SSDATA(x)  ((char *) SDATA (x))
 
 #define STRING_SET_CHARS(string, newsize) \
-    (XSTRING (string)->size = (newsize))
+  (XSTRING (string)->u.imm.immbit ? \
+   (XSTRING (string)->u.imm.size = (newsize)) : \
+   (XSTRING (string)->u.dat.size = (newsize)))
 
 #define STRING_COPYIN(string, index, new, count) \
     memcpy (SDATA (string) + index, new, count)
@@ -827,24 +833,12 @@
 #define CDR_SAFE(c)				\
   (CONSP ((c)) ? XCDR ((c)) : Qnil)
 
+#define STRING_SIZE_BYTE(string) (XSTRING (string)->u.imm.immbit ? \
+				  XSTRING (string)->u.imm.size_byte : \
+				  XSTRING (string)->u.dat.size_byte)
+
 /* Nonzero if STR is a multibyte string.  */
-#define STRING_MULTIBYTE(STR)  \
-  (XSTRING (STR)->size_byte >= 0)
-
-/* Return the length in bytes of STR.  */
-
-#ifdef GC_CHECK_STRING_BYTES
-
-struct Lisp_String;
-extern EMACS_INT string_bytes (struct Lisp_String *);
-#define STRING_BYTES(S) string_bytes ((S))
-
-#else /* not GC_CHECK_STRING_BYTES */
-
-#define STRING_BYTES(STR)  \
-  ((STR)->size_byte < 0 ? (STR)->size : (STR)->size_byte)
-
-#endif /* not GC_CHECK_STRING_BYTES */
+#define STRING_MULTIBYTE(string) (STRING_SIZE_BYTE (string) > 0)
 
 /* An upper bound on the number of bytes in a Lisp string, not
    counting the terminating null.  This a tight enough bound to
@@ -860,18 +854,28 @@
 #define STRING_BYTES_BOUND  \
   min (MOST_POSITIVE_FIXNUM, (ptrdiff_t) min (SIZE_MAX, PTRDIFF_MAX) - 1)
 
+/* Maximum amount of bytes, including '\0', in an immediate string.
+   This assumes that sizeof (EMACS_INT) is equal to sizeof (void * ).  */
+#define STRING_IMM_MAX (3 * sizeof (EMACS_INT) - 2)
+
 /* Mark STR as a unibyte string.  */
 #define STRING_SET_UNIBYTE(STR)  \
   do { if (EQ (STR, empty_multibyte_string))  \
       (STR) = empty_unibyte_string;  \
-    else XSTRING (STR)->size_byte = -1; } while (0)
+    else if (XSTRING (STR)->u.imm.immbit) \
+      XSTRING (STR)->u.imm.size_byte = -1; \
+    else \
+      XSTRING (STR)->u.dat.size_byte = -1; } while (0)
 
 /* Mark STR as a multibyte string.  Assure that STR contains only
    ASCII characters in advance.  */
 #define STRING_SET_MULTIBYTE(STR)  \
   do { if (EQ (STR, empty_unibyte_string))  \
       (STR) = empty_multibyte_string;  \
-    else XSTRING (STR)->size_byte = XSTRING (STR)->size; } while (0)
+    else if (XSTRING (STR)->u.imm.immbit) \
+      XSTRING (STR)->u.imm.size_byte = XSTRING (STR)->u.imm.size; \
+    else \
+      XSTRING (STR)->u.dat.size_byte = XSTRING (STR)->u.dat.size; } while (0)
 
 /* Get text properties.  */
 #define STRING_INTERVALS(STR)  (XSTRING (STR)->intervals + 0)
@@ -879,16 +883,58 @@
 /* Set text properties.  */
 #define STRING_SET_INTERVALS(STR, INT) (XSTRING (STR)->intervals = (INT))
 
-/* In a string or vector, the sign bit of the `size' is the gc mark bit */
-
 struct Lisp_String
   {
-    EMACS_INT size;
-    EMACS_INT size_byte;
-    INTERVAL intervals;		/* text properties in this string */
-    unsigned char *data;
+    /* Text properties in this string.  */
+    INTERVAL intervals;
+
+    union {
+
+      /* GC mark bit and subtype bit are in IMM just by convention - when
+	 IMMBIT is 0, the DAT field is used except it's UNUSED field.  */
+
+      struct {
+	unsigned gcmarkbit : 1;
+	unsigned immbit : 1;
+	EMACS_INT size : 7;
+	EMACS_INT size_byte : 7;
+	unsigned char data[STRING_IMM_MAX];
+      } PACKED imm;
+      
+      struct {
+	unsigned unused : 2; /* Do not access this placeholder.  */
+	EMACS_INT size : BITS_PER_EMACS_INT - 1;
+	EMACS_INT size_byte : BITS_PER_EMACS_INT - 1;
+	unsigned char *data;
+      } PACKED dat;
+
+    } u;
   };
 
+/* Return the length in bytes of STR.  */
+
+#ifdef GC_CHECK_STRING_BYTES
+
+/* Slower version with debugging check.  */
+
+extern EMACS_INT string_bytes (struct Lisp_String *);
+
+#else /* not GC_CHECK_STRING_BYTES */
+
+static inline
+EMACS_INT string_bytes (struct Lisp_String *s)
+{
+  EMACS_INT size, size_byte;
+
+  if (s->u.imm.immbit)
+    size = s->u.imm.size, size_byte = s->u.imm.size_byte;
+  else
+    size = s->u.dat.size, size_byte = s->u.dat.size_byte;
+  return size_byte < 0 ? size : size_byte;
+}
+
+#endif /* GC_CHECK_STRING_BYTES */
+
 /* Header of vector-like objects.  This documents the layout constraints on
    vectors and pseudovectors other than struct Lisp_Subr.  It also prevents
    compilers from being fooled by Emacs's type punning: the XSETPSEUDOVECTOR


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: Proposal: immediate strings
  2012-05-22  8:44 Proposal: immediate strings Dmitry Antipov
@ 2012-05-22 20:51 ` Miles Bader
  2012-05-22 22:13   ` Paul Eggert
  2012-05-24  5:17 ` Stefan Monnier
  1 sibling, 1 reply; 30+ messages in thread
From: Miles Bader @ 2012-05-22 20:51 UTC (permalink / raw)
  To: Dmitry Antipov; +Cc: emacs-devel

Dmitry Antipov <dmantipov@yandex.ru> writes:
> +/* Maximum amount of bytes, including '\0', in an immediate string.
> +   This assumes that sizeof (EMACS_INT) is equal to sizeof (void * ).  */
> +#define STRING_IMM_MAX (3 * sizeof (EMACS_INT) - 2)

Not a good assumption (especially given that it's easy to do it right)...

-miles

-- 
values of β will give rise to dom!



^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: Proposal: immediate strings
  2012-05-22 20:51 ` Miles Bader
@ 2012-05-22 22:13   ` Paul Eggert
  0 siblings, 0 replies; 30+ messages in thread
From: Paul Eggert @ 2012-05-22 22:13 UTC (permalink / raw)
  To: Miles Bader; +Cc: Dmitry Antipov, emacs-devel

On 05/22/2012 01:51 PM, Miles Bader wrote:
> Dmitry Antipov <dmantipov@yandex.ru> writes:
>> +   This assumes that sizeof (EMACS_INT) is equal to sizeof (void * ).  */
> 
> Not a good assumption (especially given that it's easy to do it right)...

Agreed, especially since the assumption is false on x86
when configured --with-wide-int.  Another comment:

  #if __GNUC__ > 1 /* Any GCC still in use should support this.  */
  #define PACKED __attribute__((packed))
  #else
  #define PACKED
  #endif

Shouldn't this also support lcc / MSVC / Sun Studio (#pragma pack(1)),
and IRIX cc (#pragma pack 1).  Better yet, have 'configure' check
for the packing syntax.  Or do these other compilers not support
the right kind of packing?

Come to think of it, why use packing at all?
Why not use a layout like the following
in struct Lisp_String?

    union {

      /* If IMMBIT is 1, use IMM; otherwise use DAT.  */

      struct {
	unsigned immbit : 1;
	unsigned size : 7;
	unsigned gcmarkbit : 1;
	unsigned size_byte : 7;
	unsigned char data[STRING_IMM_MAX];
      } PACKED imm;

      struct {
        unsigned immbit : 1;
	EMACS_UINT size : BITS_PER_EMACS_INT - 1;
	unsigned gcmarkbit : 1;
	EMACS_UINT size_byte : BITS_PER_EMACS_INT - 1;
	unsigned char *data;
      } PACKED dat;

  } u;

That way, we don't need to worry about "packed".

There are other possibilities too, I expect.
The idea is that "packed" is not simply a portability
problem, it's a performance issue, and we're better off
avoiding it if we can.

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: Proposal: immediate strings
  2012-05-22  8:44 Proposal: immediate strings Dmitry Antipov
  2012-05-22 20:51 ` Miles Bader
@ 2012-05-24  5:17 ` Stefan Monnier
  2012-05-24  5:41   ` Ken Raeburn
                     ` (2 more replies)
  1 sibling, 3 replies; 30+ messages in thread
From: Stefan Monnier @ 2012-05-24  5:17 UTC (permalink / raw)
  To: Dmitry Antipov; +Cc: emacs-devel

> +#if __GNUC__ > 1 /* Any GCC still in use should support this.  */
> +#define PACKED __attribute__((packed))
> +#else
> +#define PACKED
> +#endif

Let's not use "packed" structures: too much trouble.

> +      if (nbytes >= STRING_IMM_MAX)
> +	/* Impossible immediate string.  */

Actually, we can allocate all the pure strings as "immediate strings"
(within the limits of imm.nbytes, of course), in which case we'd have
strings with more bytes than STRING_IMM_MAX, so while this assertion may
have been useful for debugging, we'll want to get rid of it.

> +      if (nbytes < STRING_IMM_MAX)
> +	/* Impossible normal string.  */
> +	abort ();

Please use "eassert (nbytes >= STRING_IMM_MAX);" instead.

> +#ifdef GC_STRING_STATS
> +  total_imm_strings = total_dat_strings = 0;
> +  total_imm_bytes = total_dat_bytes = 0;
> +  total_imm_intervals = total_dat_intervals = 0;
> +#endif

Have you looked at some of those stats?  I'd be interested to know which
proportion of the strings are "immediate and using intervals",
v.s. "just slightly too big to be immediate, but not using intervals".

> +		/* Fill data with special pattern. Used by
> +		   GC to find dead immediate strings.  */
> +		memset (s->u.imm.data, 0xff, STRING_IMM_MAX);

Is that really 100% reliable?  Why not use the `intervals' field with
a 0xdeadbeef pointer instead?

> +#define SDATA(string)		(XSTRING (string)->u.imm.immbit ? \
> +				 (XSTRING (string)->u.imm.data + 0) : \
> +				 (XSTRING (string)->u.dat.data + 0))

IIUC the "+ 0" are unneeded here, because using a "..?..:.." already
makes sure we have an rvalue.  Same for SCHARS.

> +   This assumes that sizeof (EMACS_INT) is equal to sizeof (void * ).  */
> +#define STRING_IMM_MAX (3 * sizeof (EMACS_INT) - 2)

Miles already pointed out that this is not a valid assumption.

>    do { if (EQ (STR, empty_multibyte_string))  \
>        (STR) = empty_unibyte_string;  \
> -    else XSTRING (STR)->size_byte = -1; } while (0)
> +    else if (XSTRING (STR)->u.imm.immbit) \
> +      XSTRING (STR)->u.imm.size_byte = -1; \
> +    else \
> +      XSTRING (STR)->u.dat.size_byte = -1; } while (0)

BTW, TAB in cc-mode will not only properly indent the code, but also
nicely align the \ at the end of lines.  ;-)

> +    union {
> +
> +      /* GC mark bit and subtype bit are in IMM just by convention - when
> +	 IMMBIT is 0, the DAT field is used except it's UNUSED field.  */
> +
> +      struct {
> +	unsigned gcmarkbit : 1;
> +	unsigned immbit : 1;
> +	EMACS_INT size : 7;
> +	EMACS_INT size_byte : 7;
> +	unsigned char data[STRING_IMM_MAX];
> +      } PACKED imm;
> +      
> +      struct {
> +	unsigned unused : 2; /* Do not access this placeholder.  */
> +	EMACS_INT size : BITS_PER_EMACS_INT - 1;
> +	EMACS_INT size_byte : BITS_PER_EMACS_INT - 1;
> +	unsigned char *data;
> +      } PACKED dat;
> +
> +    } u;
>    };

Access to your `size' field is inefficient because it will straddle
two words.  We could move the gcmarkbit as Paul suggests (at the cost of
having to check immbit before knowing where gcmarkbit is located), or
maybe we can move both the mark bit and the immbit into the `intervals'
fields (after all, we know those are aligned on a multiple of at least
4 on all architectures on which Emacs is known to run, so we have
2 bits free for (ab)use there).

        Stefan

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: Proposal: immediate strings
  2012-05-24  5:17 ` Stefan Monnier
@ 2012-05-24  5:41   ` Ken Raeburn
  2012-05-24  5:50     ` Miles Bader
  2012-05-24  6:08   ` Paul Eggert
  2012-05-29  6:55   ` Dmitry Antipov
  2 siblings, 1 reply; 30+ messages in thread
From: Ken Raeburn @ 2012-05-24  5:41 UTC (permalink / raw)
  To: Emacs Dev

On May 24, 2012, at 01:17, Stefan Monnier wrote:
>> +#define SDATA(string)		(XSTRING (string)->u.imm.immbit ? \
>> +				 (XSTRING (string)->u.imm.data + 0) : \
>> +				 (XSTRING (string)->u.dat.data + 0))
> 
> IIUC the "+ 0" are unneeded here, because using a "..?..:.." already
> makes sure we have an rvalue.  Same for SCHARS.

FWIW, there used to be a GCC extension that caused the ternary operator to be an lvalue if both the second and third operands were lvalues.  I'm not sure when it was removed; possible in 4.0, which might be old enough that we can ignore it...

Ken


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: Proposal: immediate strings
  2012-05-24  5:41   ` Ken Raeburn
@ 2012-05-24  5:50     ` Miles Bader
  0 siblings, 0 replies; 30+ messages in thread
From: Miles Bader @ 2012-05-24  5:50 UTC (permalink / raw)
  To: Ken Raeburn; +Cc: Emacs Dev

Ken Raeburn <raeburn@raeburn.org> writes:
> FWIW, there used to be a GCC extension that caused the ternary
> operator to be an lvalue if both the second and third operands were
> lvalues.  I'm not sure when it was removed; possible in 4.0, which
> might be old enough that we can ignore it...

It can be ignored, I think -- after all, the presence of that
extension won't cause any miscompiles, but will merely result in a
failure to diagnose a potential problem should the user change some
code that uses those macros.  So as long as enough active Emacs
developers use non-ancient versions of gcc, there's really no
problem...

-miles

-- 
... reality itself is blind unintelligent force, and it is only a fluke,
it is only as a result of pure chances, that resulting from the
exuberance of this energy there are people, with values, with reason,
with languages, with cultures, ... and with love.  Just a fluke.
[Alan Watts, "The Ceramic and the Fully Automatic"]



^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: Proposal: immediate strings
  2012-05-24  5:17 ` Stefan Monnier
  2012-05-24  5:41   ` Ken Raeburn
@ 2012-05-24  6:08   ` Paul Eggert
  2012-05-24  7:14     ` Stefan Monnier
  2012-05-29  6:55   ` Dmitry Antipov
  2 siblings, 1 reply; 30+ messages in thread
From: Paul Eggert @ 2012-05-24  6:08 UTC (permalink / raw)
  To: Stefan Monnier; +Cc: Dmitry Antipov, emacs-devel

On 05/23/2012 10:17 PM, Stefan Monnier wrote:
> maybe we can move both the mark bit and the immbit into the `intervals'
> fields (after all, we know those are aligned on a multiple of at least
> 4 on all architectures on which Emacs is known to run, so we have
> 2 bits free for (ab)use there).

That might work.  But this raises another idea: assuming most tiny strings
don't have text properties, won't it improve performance overall if
any string with text properties is forced to be an ordinary string, so
that immediate strings can reuse the rarely-used 'intervals' member
for data?

Another thought that comes to mind, is that we could leave
the mark bit where it is (the most significant bit of 'size'),
and reserve 'size' values >= (EMACS_INT / 2 & ~0xffff)
to represent immediate strings, with the size and size_bytes
values packed into the low-order 16 bits of 'size'.
Something like this:

   struct Lisp_String
     {
       EMACS_INT size;
       union
         {
	   struct Ordinary_Lisp_String_Component
	     {
	       EMACS_INT size_byte;
	       INTERVAL intervals;
	       unsigned char *data;
	     } ordinary;
           unsigned char data[sizeof (struct Ordinary_Lisp_String_Component)];
       } u;
     };

   #define IMMEDIATE_STRING(s) (EMACS_INT_MAX & ~0xffff <= (s)->size)
   #define SDATA(s) (IMMEDIATE_STRING (XSTRING (s)) \
                     ? XSTRING (s)->data \
                     : XSTRING (s)->ordinary.data)
   #define SCHARS(s) (IMMEDIATE_STRING (XSTRING (s)) \
                      ? XSTRING (s)->size & 0xff
		      : XSTRING (s)->size)
   #define SBYTES(s) (IMMEDIATE_STRING (XSTRING (s)) \
                      ? (XSTRING (s)->size >> 8) & 0xff
		      : STRING_BYTES (XSTRING (s)))

etc.




^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: Proposal: immediate strings
  2012-05-24  6:08   ` Paul Eggert
@ 2012-05-24  7:14     ` Stefan Monnier
  2012-05-24  7:52       ` Paul Eggert
  2012-05-28 11:32       ` Dmitry Antipov
  0 siblings, 2 replies; 30+ messages in thread
From: Stefan Monnier @ 2012-05-24  7:14 UTC (permalink / raw)
  To: Paul Eggert; +Cc: Dmitry Antipov, emacs-devel

> That might work.  But this raises another idea: assuming most tiny strings
> don't have text properties, won't it improve performance overall if
> any string with text properties is forced to be an ordinary string, so
> that immediate strings can reuse the rarely-used 'intervals' member
> for data?

That's part of the reason why I'm interested in his statistics about
which strings have text properties.

But even if there's a potential gain here, there's still the issue that
we may then have to convert an immediate string into an ordinary string
when text-properties are added, which will imply basically relocating
the string data, with the usual risk that we might happen to do that
right in the middle of a loop keeping a pointer to the inside of the
string's data.

> Another thought that comes to mind, is that we could leave
> the mark bit where it is (the most significant bit of 'size'),
> and reserve 'size' values >= (EMACS_INT / 2 & ~0xffff)

No need to be that careful: the `size' field has 2 free bits (since the
size is limited by what integer can fit in a Lisp_Object), and only 1 is
used (for the markbit), so we could just grab a second bit from `size'
for the `immbit'.

        Stefan

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: Proposal: immediate strings
  2012-05-24  7:14     ` Stefan Monnier
@ 2012-05-24  7:52       ` Paul Eggert
  2012-05-24 12:51         ` Stefan Monnier
  2012-05-28 11:32       ` Dmitry Antipov
  1 sibling, 1 reply; 30+ messages in thread
From: Paul Eggert @ 2012-05-24  7:52 UTC (permalink / raw)
  To: Stefan Monnier; +Cc: Dmitry Antipov, emacs-devel

On 05/24/2012 12:14 AM, Stefan Monnier wrote:
> we could just grab a second bit from `size'
> for the `immbit'.

Only if MOST_POSITIVE_FIXNUM <= min (SIZE_MAX, PTRDIFF_MAX) / 2.
This inequality is commonly true, but it's false when EMACS_INT
is wider than both size_t and ptrdiff_t.  The method I suggested
should work even on these platforms.

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: Proposal: immediate strings
  2012-05-24  7:52       ` Paul Eggert
@ 2012-05-24 12:51         ` Stefan Monnier
  2012-05-24 16:35           ` Paul Eggert
  0 siblings, 1 reply; 30+ messages in thread
From: Stefan Monnier @ 2012-05-24 12:51 UTC (permalink / raw)
  To: Paul Eggert; +Cc: Dmitry Antipov, emacs-devel

>> we could just grab a second bit from `size'
>> for the `immbit'.
> Only if MOST_POSITIVE_FIXNUM <= min (SIZE_MAX, PTRDIFF_MAX) / 2.

Why?
This would be true if `size' were changed to ptrdiff_t (which would make
sense, indeed), but it's currently EMACS_INT, so it should always have
2 free bits.


        Stefan



^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: Proposal: immediate strings
  2012-05-24 12:51         ` Stefan Monnier
@ 2012-05-24 16:35           ` Paul Eggert
  2012-05-25  6:43             ` Dmitry Antipov
  0 siblings, 1 reply; 30+ messages in thread
From: Paul Eggert @ 2012-05-24 16:35 UTC (permalink / raw)
  To: Stefan Monnier; +Cc: Dmitry Antipov, emacs-devel

On 05/24/2012 05:51 AM, Stefan Monnier wrote:
>>> we could just grab a second bit from `size'
>>> >> for the `immbit'.
>> > Only if MOST_POSITIVE_FIXNUM <= min (SIZE_MAX, PTRDIFF_MAX) / 2.
> Why?
> This would be true if `size' were changed to ptrdiff_t (which would make
> sense, indeed), but it's currently EMACS_INT,

Ah, sorry, I was assuming the integer-overflow patch
(for bug#9874).  That patch indeed changes 'size'
to ptrdiff_t.  Any changes for immediate strings
should assume that this patch has been applied.
I haven't applied that patch only because the trunk
has been frozen against big changes since God
knows when.

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: Proposal: immediate strings
  2012-05-24 16:35           ` Paul Eggert
@ 2012-05-25  6:43             ` Dmitry Antipov
  2012-05-25  7:30               ` Paul Eggert
  0 siblings, 1 reply; 30+ messages in thread
From: Dmitry Antipov @ 2012-05-25  6:43 UTC (permalink / raw)
  To: Paul Eggert; +Cc: Stefan Monnier, emacs-devel

On 05/24/2012 08:35 PM, Paul Eggert wrote:

> Ah, sorry, I was assuming the integer-overflow patch
> (for bug#9874).  That patch indeed changes 'size'
> to ptrdiff_t.  Any changes for immediate strings
> should assume that this patch has been applied.
> I haven't applied that patch only because the trunk
> has been frozen against big changes since God
> knows when.

Will it be pushed in some near future?
It's hard to sync your own bits with stuff like that.

Dmitry






^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: Proposal: immediate strings
  2012-05-25  6:43             ` Dmitry Antipov
@ 2012-05-25  7:30               ` Paul Eggert
  0 siblings, 0 replies; 30+ messages in thread
From: Paul Eggert @ 2012-05-25  7:30 UTC (permalink / raw)
  To: Dmitry Antipov; +Cc: Stefan Monnier, emacs-devel

On 05/24/2012 11:43 PM, Dmitry Antipov wrote:
>> I haven't applied that patch only because the trunk
>> has been frozen against big changes since God
>> knows when.
> 
> Will it be pushed in some near future?

I'd like to push the Bug#9874 patch now.
The advantage in permitting further trunk development
would outweigh the disadvantage in complicating the
merge of future emacs-24 patches, in my view.  But
this is Stefan's and/or Chong's call.



^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: Proposal: immediate strings
  2012-05-24  7:14     ` Stefan Monnier
  2012-05-24  7:52       ` Paul Eggert
@ 2012-05-28 11:32       ` Dmitry Antipov
  2012-05-28 14:25         ` Stefan Monnier
  1 sibling, 1 reply; 30+ messages in thread
From: Dmitry Antipov @ 2012-05-28 11:32 UTC (permalink / raw)
  To: Stefan Monnier; +Cc: Paul Eggert, emacs-devel

[-- Attachment #1: Type: text/plain, Size: 1058 bytes --]

On 05/24/2012 11:14 AM, Stefan Monnier wrote:

>> That might work.  But this raises another idea: assuming most tiny strings
>> don't have text properties, won't it improve performance overall if
>> any string with text properties is forced to be an ordinary string, so
>> that immediate strings can reuse the rarely-used 'intervals' member
>> for data?
>
> That's part of the reason why I'm interested in his statistics about
> which strings have text properties.

YMMV since it depends on what happens. For a long byte-compile runs,
~0.05% - 0.1% of live strings may have text properties. On the other
side, some editing operations may cause this percentage to grow; for
example, (indent-region) in C mode buffers for a huge regions (a size
of xdisp.c :-) gives ~15% of live strings with properties, and almost
all of them are small (less than 16 bytes).

BTW, is it possible to attach a properties to a string used to represent
a symbol name? If not, we can drop some bits from mark_object at the
cost of having some precautions in Fmake_symbol.

Dmitry


[-- Attachment #2: symbol_name_props.patch --]
[-- Type: text/plain, Size: 646 bytes --]

=== modified file 'src/alloc.c'
--- src/alloc.c	2012-05-25 18:19:24 +0000
+++ src/alloc.c	2012-05-28 11:18:45 +0000
@@ -3211,6 +3211,9 @@
 
   MALLOC_BLOCK_INPUT;
 
+  if (!NULL_INTERVAL_P (STRING_INTERVALS (name)))
+    name = build_string (SDATA (name));
+
   if (symbol_free_list)
     {
       XSETSYMBOL (val, symbol_free_list);
@@ -5691,7 +5694,9 @@
 	  }
 	if (!PURE_POINTER_P (XSTRING (ptr->xname)))
 	  MARK_STRING (XSTRING (ptr->xname));
-	MARK_INTERVAL_TREE (STRING_INTERVALS (ptr->xname));
+
+	/* Symbol name should have no properties.  */
+	eassert (NULL_INTERVAL_P (STRING_INTERVALS (ptr->xname)));
 
 	ptr = ptr->next;
 	if (ptr)


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: Proposal: immediate strings
  2012-05-28 11:32       ` Dmitry Antipov
@ 2012-05-28 14:25         ` Stefan Monnier
  0 siblings, 0 replies; 30+ messages in thread
From: Stefan Monnier @ 2012-05-28 14:25 UTC (permalink / raw)
  To: Dmitry Antipov; +Cc: Paul Eggert, emacs-devel

> YMMV since it depends on what happens. For a long byte-compile runs,
> ~0.05% - 0.1% of live strings may have text properties. On the other
> side, some editing operations may cause this percentage to grow; for
> example, (indent-region) in C mode buffers for a huge regions (a size
> of xdisp.c :-) gives ~15% of live strings with properties, and almost
> all of them are small (less than 16 bytes).

Thanks.  I think we can keep the `intervals' field for now.

> BTW, is it possible to attach a properties to a string used to represent
> a symbol name? If not, we can drop some bits from mark_object at the
> cost of having some precautions in Fmake_symbol.

Currently, `symbol-name' returns the actual string passed to
make-symbol/intern, so those can have text properties and you can even
change those properties later on.  Hell, you can even get weird results
with things like (aset (symbol-name <yoursymbol>) ?a).


        Stefan



^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: Proposal: immediate strings
  2012-05-24  5:17 ` Stefan Monnier
  2012-05-24  5:41   ` Ken Raeburn
  2012-05-24  6:08   ` Paul Eggert
@ 2012-05-29  6:55   ` Dmitry Antipov
  2012-05-29  7:38     ` Paul Eggert
  2012-05-29  7:38     ` Proposal: immediate strings Andreas Schwab
  2 siblings, 2 replies; 30+ messages in thread
From: Dmitry Antipov @ 2012-05-29  6:55 UTC (permalink / raw)
  To: emacs-devel; +Cc: Stefan Monnier

[-- Attachment #1: Type: text/plain, Size: 1120 bytes --]

On 05/24/2012 09:17 AM, Stefan Monnier wrote:

>> +#if __GNUC__>  1 /* Any GCC still in use should support this.  */
>> +#define PACKED __attribute__((packed))
>> +#else
>> +#define PACKED
>> +#endif
>
> Let's not use "packed" structures: too much trouble.

To avoid using 'packed' attribute and speedup an inefficient access to u.dat.size
field, it's possible to use 'ptrdiff_t X : BITS_PER_PTRDIFF_T - 2'
bitfields - it's still wide enough to hold the values up to STRING_BYTES_BOUND.

> Actually, we can allocate all the pure strings as "immediate strings"
> (within the limits of imm.nbytes, of course), in which case we'd have
> strings with more bytes than STRING_IMM_MAX, so while this assertion may
> have been useful for debugging, we'll want to get rid of it.

I'm not sure it's worth making string code even more complicated with this.

> Is that really 100% reliable?  Why not use the `intervals' field with
> a 0xdeadbeef pointer instead?

Better solutions are welcome, never seen a crash with this; 'intervals'
field can't be used because NEXT_FREE_LISP_STRING (X) = Y will overwrite
X->intervals.

Dmitry

[-- Attachment #2: immstr.patch --]
[-- Type: text/plain, Size: 19456 bytes --]

=== modified file 'src/alloc.c'
--- src/alloc.c	2012-05-25 18:19:24 +0000
+++ src/alloc.c	2012-05-29 05:51:10 +0000
@@ -147,20 +147,14 @@
 /* Mark, unmark, query mark bit of a Lisp string.  S must be a pointer
    to a struct Lisp_String.  */
 
-#define MARK_STRING(S)		((S)->size |= ARRAY_MARK_FLAG)
-#define UNMARK_STRING(S)	((S)->size &= ~ARRAY_MARK_FLAG)
-#define STRING_MARKED_P(S)	(((S)->size & ARRAY_MARK_FLAG) != 0)
+#define MARK_STRING(S)		((S)->u.imm.gcmarkbit = 1)
+#define UNMARK_STRING(S)	((S)->u.imm.gcmarkbit = 0)
+#define STRING_MARKED_P(S)	((S)->u.imm.gcmarkbit)
 
 #define VECTOR_MARK(V)		((V)->header.size |= ARRAY_MARK_FLAG)
 #define VECTOR_UNMARK(V)	((V)->header.size &= ~ARRAY_MARK_FLAG)
 #define VECTOR_MARKED_P(V)	(((V)->header.size & ARRAY_MARK_FLAG) != 0)
 
-/* Value is the number of bytes of S, a pointer to a struct Lisp_String.
-   Be careful during GC, because S->size contains the mark bit for
-   strings.  */
-
-#define GC_STRING_BYTES(S)	(STRING_BYTES (S))
-
 /* Global variables.  */
 struct emacs_globals globals;
 
@@ -392,6 +386,7 @@
 static void mark_stack (void);
 static int live_vector_p (struct mem_node *, void *);
 static int live_buffer_p (struct mem_node *, void *);
+static int live_string_data_p (struct Lisp_String *);
 static int live_string_p (struct mem_node *, void *);
 static int live_cons_p (struct mem_node *, void *);
 static int live_symbol_p (struct mem_node *, void *);
@@ -1761,7 +1756,8 @@
    a pointer to the `u.data' member of its sdata structure; the
    structure starts at a constant offset in front of that.  */
 
-#define SDATA_OF_STRING(S) ((struct sdata *) ((S)->data - SDATA_DATA_OFFSET))
+#define SDATA_OF_STRING(S) ((S)->u.imm.immbit ? (struct sdata *) NULL \
+  : ((struct sdata *) ((S)->u.dat.data - SDATA_DATA_OFFSET)))
 
 
 #ifdef GC_CHECK_STRING_OVERRUN
@@ -1843,21 +1839,28 @@
 
 static int check_string_bytes_count;
 
-#define CHECK_STRING_BYTES(S)	STRING_BYTES (S)
-
-
-/* Like GC_STRING_BYTES, but with debugging check.  */
+#define CHECK_STRING_BYTES(S) string_bytes (S)
 
 ptrdiff_t
 string_bytes (struct Lisp_String *s)
 {
-  ptrdiff_t nbytes =
-    (s->size_byte < 0 ? s->size & ~ARRAY_MARK_FLAG : s->size_byte);
-
-  if (!PURE_POINTER_P (s)
-      && s->data
-      && nbytes != SDATA_NBYTES (SDATA_OF_STRING (s)))
-    abort ();
+  ptrdiff_t nbytes;
+
+  if (s->u.imm.immbit)
+    {
+      nbytes = s->u.imm.size_byte < 0 ? s->u.imm.size : s->u.imm.size_byte;
+      eassert (nbytes < STRING_IMM_MAX);
+    }
+  else
+    {
+      nbytes = s->u.dat.size_byte < 0 ? s->u.dat.size : s->u.dat.size_byte;
+      eassert (nbytes >= STRING_IMM_MAX);
+      if (!PURE_POINTER_P (s) && s->u.dat.data &&
+	  nbytes != SDATA_NBYTES (SDATA_OF_STRING (s)))
+	/* Normal non-pure string with size mismatch.  */
+	abort ();
+    }
+
   return nbytes;
 }
 
@@ -1882,7 +1885,7 @@
 	CHECK_STRING_BYTES (from->string);
 
       if (from->string)
-	nbytes = GC_STRING_BYTES (from->string);
+	nbytes = string_bytes (from->string);
       else
 	nbytes = SDATA_NBYTES (from);
 
@@ -2028,8 +2031,8 @@
   /* Determine the number of bytes needed to store NBYTES bytes
      of string data.  */
   needed = SDATA_SIZE (nbytes);
-  old_data = s->data ? SDATA_OF_STRING (s) : NULL;
-  old_nbytes = GC_STRING_BYTES (s);
+  old_data = s->u.dat.data ? SDATA_OF_STRING (s) : NULL;
+  old_nbytes = string_bytes (s);
 
   MALLOC_BLOCK_INPUT;
 
@@ -2088,13 +2091,11 @@
   MALLOC_UNBLOCK_INPUT;
 
   data->string = s;
-  s->data = SDATA_DATA (data);
+  s->u.dat.data = SDATA_DATA (data);
 #ifdef GC_CHECK_STRING_BYTES
   SDATA_NBYTES (data) = nbytes;
 #endif
-  s->size = nchars;
-  s->size_byte = nbytes;
-  s->data[nbytes] = '\0';
+  s->u.dat.data[nbytes] = '\0';
 #ifdef GC_CHECK_STRING_OVERRUN
   memcpy ((char *) data + needed, string_overrun_cookie,
 	  GC_STRING_OVERRUN_COOKIE_SIZE);
@@ -2112,6 +2113,12 @@
   consing_since_gc += needed;
 }
 
+#ifdef GC_STRING_STATS
+
+static EMACS_INT total_imm_strings, total_dat_strings, total_imm_intervals;
+static EMACS_INT total_imm_bytes, total_dat_bytes, total_dat_intervals;
+
+#endif
 
 /* Sweep and compact strings.  */
 
@@ -2125,6 +2132,12 @@
   total_strings = total_free_strings = 0;
   total_string_size = 0;
 
+#ifdef GC_STRING_STATS
+  total_imm_strings = total_dat_strings = 0;
+  total_imm_bytes = total_dat_bytes = 0;
+  total_imm_intervals = total_dat_intervals = 0;
+#endif
+
   /* Scan strings_blocks, free Lisp_Strings that aren't marked.  */
   for (b = string_blocks; b; b = next)
     {
@@ -2137,49 +2150,64 @@
 	{
 	  struct Lisp_String *s = b->strings + i;
 
-	  if (s->data)
+	  if (STRING_MARKED_P (s))
+	    {	      
+	      /* String is live; unmark it and its intervals.  */
+	      UNMARK_STRING (s);
+
+	      if (!NULL_INTERVAL_P (s->intervals))
+		UNMARK_BALANCE_INTERVALS (s->intervals);
+
+	      ++total_strings;
+	      total_string_size += string_bytes (s);
+#ifdef GC_STRING_STATS
+	      if (s->u.imm.immbit)
+		{
+		  total_imm_strings++;
+		  total_imm_bytes += string_bytes (s);
+		  if (!NULL_INTERVAL_P (s->intervals))
+		    total_imm_intervals++;
+		}
+	      else
+		{
+		  total_dat_strings++;
+		  total_dat_bytes += string_bytes (s);
+		  if (!NULL_INTERVAL_P (s->intervals))
+		    total_dat_intervals++;
+		}
+#endif /* GC_STRING_STATS */
+	    }
+	  else
 	    {
-	      /* String was not on free-list before.  */
-	      if (STRING_MARKED_P (s))
-		{
-		  /* String is live; unmark it and its intervals.  */
-		  UNMARK_STRING (s);
-
-		  if (!NULL_INTERVAL_P (s->intervals))
-		    UNMARK_BALANCE_INTERVALS (s->intervals);
-
-		  ++total_strings;
-		  total_string_size += STRING_BYTES (s);
-		}
+	      if (s->u.imm.immbit)
+		/* Fill data with special pattern. Used by
+		   GC to find dead immediate strings.  */
+		memset (s->u.imm.data, 0xff, STRING_IMM_MAX);
 	      else
 		{
-		  /* String is dead.  Put it on the free-list.  */
-		  struct sdata *data = SDATA_OF_STRING (s);
+		  if (s->u.dat.data)
+		    {
+		      /* String is dead.  Put it on the free-list.  */
+		      struct sdata *data = SDATA_OF_STRING (s);
 
-		  /* Save the size of S in its sdata so that we know
-		     how large that is.  Reset the sdata's string
-		     back-pointer so that we know it's free.  */
+		      /* Save the size of S in its sdata so that we know
+			 how large that is.  Reset the sdata's string
+			 back-pointer so that we know it's free.  */
 #ifdef GC_CHECK_STRING_BYTES
-		  if (GC_STRING_BYTES (s) != SDATA_NBYTES (data))
-		    abort ();
+		      if (string_bytes (s) != SDATA_NBYTES (data))
+			abort ();
 #else
-		  data->u.nbytes = GC_STRING_BYTES (s);
+		      data->u.nbytes = string_bytes (s);
 #endif
-		  data->string = NULL;
-
-		  /* Reset the strings's `data' member so that we
-		     know it's free.  */
-		  s->data = NULL;
-
-		  /* Put the string on the free-list.  */
-		  NEXT_FREE_LISP_STRING (s) = string_free_list;
-		  string_free_list = s;
-		  ++nfree;
+		      data->string = NULL;
+
+		      /* Reset the strings's `data' member so that we
+			 know it's free.  */
+		      s->u.dat.data = NULL;
+		    }
 		}
-	    }
-	  else
-	    {
-	      /* S was on the free-list before.  Put it there again.  */
+
+	      /* Put the string on the free-list.  */
 	      NEXT_FREE_LISP_STRING (s) = string_free_list;
 	      string_free_list = s;
 	      ++nfree;
@@ -2271,12 +2299,12 @@
 	  /* Check that the string size recorded in the string is the
 	     same as the one recorded in the sdata structure. */
 	  if (from->string
-	      && GC_STRING_BYTES (from->string) != SDATA_NBYTES (from))
+	      && string_bytes (from->string) != SDATA_NBYTES (from))
 	    abort ();
 #endif /* GC_CHECK_STRING_BYTES */
 
 	  if (from->string)
-	    nbytes = GC_STRING_BYTES (from->string);
+	    nbytes = string_bytes (from->string);
 	  else
 	    nbytes = SDATA_NBYTES (from);
 
@@ -2312,7 +2340,7 @@
 		{
 		  xassert (tb != b || to < from);
 		  memmove (to, from, nbytes + GC_STRING_EXTRA);
-		  to->string->data = SDATA_DATA (to);
+		  to->string->u.dat.data = SDATA_DATA (to);
 		}
 
 	      /* Advance past the sdata we copied to.  */
@@ -2562,13 +2590,24 @@
     return empty_multibyte_string;
 
   s = allocate_string ();
-  allocate_string_data (s, nchars, nbytes);
+  if (nbytes < STRING_IMM_MAX)
+    {
+      s->u.imm.immbit = 1;
+      s->u.imm.size = nchars;
+      s->u.imm.size_byte = nbytes;
+    }
+  else
+    {
+      s->u.imm.immbit = 0;
+      s->u.dat.size = nchars;
+      s->u.dat.size_byte = nbytes;
+      allocate_string_data (s, nchars, nbytes);
+    }
   XSETSTRING (string, s);
   string_chars_consed += nbytes;
   return string;
 }
 
-
 \f
 /***********************************************************************
 			   Float Allocation
@@ -3937,6 +3976,22 @@
   x->color = MEM_BLACK;
 }
 
+/* Non-zero if data of S is valid.  */
+
+static inline int
+live_string_data_p (struct Lisp_String *s)
+{
+  if (s->u.imm.immbit)
+    {
+      unsigned char *p;
+
+      for (p = s->u.imm.data; p < s->u.imm.data + STRING_IMM_MAX; p++)
+	if (*p != 0xff)
+	  return 1;
+      return 0;
+    }
+  return s->u.dat.data != NULL;
+}
 
 /* Value is non-zero if P is a pointer to a live Lisp string on
    the heap.  M is a pointer to the mem_block for P.  */
@@ -3954,7 +4009,7 @@
       return (offset >= 0
 	      && offset % sizeof b->strings[0] == 0
 	      && offset < (STRING_BLOCK_SIZE * sizeof b->strings[0])
-	      && ((struct Lisp_String *) p)->data != NULL);
+	      && live_string_data_p ((struct Lisp_String *) p));
     }
   else
     return 0;
@@ -4869,15 +4924,29 @@
   struct Lisp_String *s;
 
   s = (struct Lisp_String *) pure_alloc (sizeof *s, Lisp_String);
-  s->data = (unsigned char *) find_string_data_in_pure (data, nbytes);
-  if (s->data == NULL)
-    {
-      s->data = (unsigned char *) pure_alloc (nbytes + 1, -1);
-      memcpy (s->data, data, nbytes);
-      s->data[nbytes] = '\0';
-    }
-  s->size = nchars;
-  s->size_byte = multibyte ? nbytes : -1;
+
+  if (nbytes < STRING_IMM_MAX)
+    {
+      memcpy (s->u.imm.data, data, nbytes);
+      s->u.imm.data[nbytes] = '\0';
+      s->u.imm.immbit = 1;
+      s->u.imm.size = nchars;
+      s->u.imm.size_byte = multibyte ? nbytes : -1;
+    }
+  else
+    {
+      s->u.dat.data = (unsigned char *) find_string_data_in_pure (data, nbytes);
+      if (s->u.dat.data == NULL)
+	{
+	  s->u.dat.data = (unsigned char *) pure_alloc (nbytes + 1, -1);
+	  memcpy (s->u.dat.data, data, nbytes);
+	  s->u.dat.data[nbytes] = '\0';
+	}
+      s->u.imm.immbit = 0;
+      s->u.dat.size = nchars;
+      s->u.dat.size_byte = multibyte ? nbytes : -1;
+    }
+
   s->intervals = NULL_INTERVAL;
   XSETSTRING (string, s);
   return string;
@@ -4894,9 +4963,23 @@
   ptrdiff_t nchars = strlen (data);
 
   s = (struct Lisp_String *) pure_alloc (sizeof *s, Lisp_String);
-  s->size = nchars;
-  s->size_byte = -1;
-  s->data = (unsigned char *) data;
+
+  if (nchars < STRING_IMM_MAX)
+    {
+      memcpy (s->u.imm.data, data, nchars);
+      s->u.imm.data[nchars] = '\0';
+      s->u.imm.immbit = 1;
+      s->u.imm.size = nchars;
+      s->u.imm.size_byte = -1;
+    }
+  else
+    {
+      s->u.dat.data = (unsigned char *) data;
+      s->u.imm.immbit = 0;
+      s->u.dat.size = nchars;
+      s->u.dat.size_byte = -1;
+    }
+
   s->intervals = NULL_INTERVAL;
   XSETSTRING (string, s);
   return string;
@@ -6319,6 +6402,34 @@
   return Flist (8, consed);
 }
 
+#ifdef GC_STRING_STATS
+
+DEFUN ("string-stats", Fstring_stats, Sstring_stats, 0, 0, 0,
+       doc: /* Return a list of counters that measures how much
+strings of a particular internal structure are alive after last
+garbage collection, and how many bytes are in them.
+The elements of the value are are as follows:
+  (IMM-STRINGS IMM-BYTES IMM-INTERVALS DAT-STRINGS DAT-BYTES DAT-INTERVALS)
+where IMM-STRINGS is the number of immediate strings, IMM-BYTES is the total
+number of bytes in them, and IMM-INTERVALS is the number of immediate string
+with non-nil text properties. The rest three numbers has the same meaning
+for normal strings, respectively.  */)
+  (void)
+{
+  Lisp_Object data[6];
+
+  data[0] = make_number (min (MOST_POSITIVE_FIXNUM, total_imm_strings));
+  data[1] = make_number (min (MOST_POSITIVE_FIXNUM, total_imm_bytes));
+  data[2] = make_number (min (MOST_POSITIVE_FIXNUM, total_imm_intervals));
+  data[3] = make_number (min (MOST_POSITIVE_FIXNUM, total_dat_strings));
+  data[4] = make_number (min (MOST_POSITIVE_FIXNUM, total_dat_bytes));
+  data[5] = make_number (min (MOST_POSITIVE_FIXNUM, total_dat_intervals));
+
+  return Flist (6, data);
+}
+
+#endif /* GC_STRING_STATS */
+
 /* Find at most FIND_MAX symbols which have OBJ as their value or
    function.  This is used in gdbinit's `xwhichsymbols' command.  */
 
@@ -6547,7 +6658,9 @@
   defsubr (&Sgarbage_collect);
   defsubr (&Smemory_limit);
   defsubr (&Smemory_use_counts);
-
+#ifdef GC_STRING_STATS
+  defsubr (&Sstring_stats);
+#endif
 #if GC_MARK_STACK == GC_USE_GCPROS_CHECK_ZOMBIES
   defsubr (&Sgc_status);
 #endif

=== modified file 'src/fns.c'
--- src/fns.c	2012-05-25 18:19:24 +0000
+++ src/fns.c	2012-05-28 15:03:58 +0000
@@ -2166,8 +2166,8 @@
 	  int len = CHAR_STRING (charval, str);
 	  ptrdiff_t size_byte = SBYTES (array);
 
-	  if (INT_MULTIPLY_OVERFLOW (SCHARS (array), len)
-	      || SCHARS (array) * len != size_byte)
+	  if (INT_MULTIPLY_OVERFLOW (size, len)
+	      || size * len != size_byte)
 	    error ("Attempt to change byte length of a string");
 	  for (idx = 0; idx < size_byte; idx++)
 	    *p++ = str[idx % len];

=== modified file 'src/lisp.h'
--- src/lisp.h	2012-05-27 07:51:09 +0000
+++ src/lisp.h	2012-05-29 05:54:03 +0000
@@ -69,7 +69,8 @@
     BITS_PER_SHORT     = CHAR_BIT * sizeof (short),
     BITS_PER_INT       = CHAR_BIT * sizeof (int),
     BITS_PER_LONG      = CHAR_BIT * sizeof (long int),
-    BITS_PER_EMACS_INT = CHAR_BIT * sizeof (EMACS_INT)
+    BITS_PER_EMACS_INT = CHAR_BIT * sizeof (EMACS_INT),
+    BITS_PER_PTRDIFF_T = CHAR_BIT * sizeof (ptrdiff_t)
   };
 
 /* printmax_t and uprintmax_t are types for printing large integers.
@@ -743,17 +744,23 @@
 
 /* Convenience macros for dealing with Lisp strings.  */
 
-#define SDATA(string)		(XSTRING (string)->data + 0)
+#define SDATA(string)		(XSTRING (string)->u.imm.immbit ? \
+				 (XSTRING (string)->u.imm.data) : \
+				 (XSTRING (string)->u.dat.data))
 #define SREF(string, index)	(SDATA (string)[index] + 0)
 #define SSET(string, index, new) (SDATA (string)[index] = (new))
-#define SCHARS(string)		(XSTRING (string)->size + 0)
-#define SBYTES(string)		(STRING_BYTES (XSTRING (string)) + 0)
+#define SCHARS(string)		(XSTRING (string)->u.imm.immbit ? \
+				 (XSTRING (string)->u.imm.size) : \
+				 (XSTRING (string)->u.dat.size))
+#define SBYTES(string)		(string_bytes (XSTRING (string)))
 
 /* Avoid "differ in sign" warnings.  */
 #define SSDATA(x)  ((char *) SDATA (x))
 
-#define STRING_SET_CHARS(string, newsize) \
-    (XSTRING (string)->size = (newsize))
+#define STRING_SET_CHARS(string, newsize)	\
+  (XSTRING (string)->u.imm.immbit ?		\
+   (XSTRING (string)->u.imm.size = (newsize)) : \
+   (XSTRING (string)->u.dat.size = (newsize)))
 
 #define STRING_COPYIN(string, index, new, count) \
     memcpy (SDATA (string) + index, new, count)
@@ -843,24 +850,12 @@
 #define CDR_SAFE(c)				\
   (CONSP ((c)) ? XCDR ((c)) : Qnil)
 
+#define STRING_SIZE_BYTE(string) (XSTRING (string)->u.imm.immbit ?    \
+				  XSTRING (string)->u.imm.size_byte : \
+				  XSTRING (string)->u.dat.size_byte)
+
 /* Nonzero if STR is a multibyte string.  */
-#define STRING_MULTIBYTE(STR)  \
-  (XSTRING (STR)->size_byte >= 0)
-
-/* Return the length in bytes of STR.  */
-
-#ifdef GC_CHECK_STRING_BYTES
-
-struct Lisp_String;
-extern ptrdiff_t string_bytes (struct Lisp_String *);
-#define STRING_BYTES(S) string_bytes ((S))
-
-#else /* not GC_CHECK_STRING_BYTES */
-
-#define STRING_BYTES(STR)  \
-  ((STR)->size_byte < 0 ? (STR)->size : (STR)->size_byte)
-
-#endif /* not GC_CHECK_STRING_BYTES */
+#define STRING_MULTIBYTE(string) (STRING_SIZE_BYTE (string) > 0)
 
 /* An upper bound on the number of bytes in a Lisp string, not
    counting the terminating null.  This a tight enough bound to
@@ -876,18 +871,28 @@
 #define STRING_BYTES_BOUND  \
   min (MOST_POSITIVE_FIXNUM, (ptrdiff_t) min (SIZE_MAX, PTRDIFF_MAX) - 1)
 
+/* Maximum amount of bytes, including '\0', in an immediate string.  */
+#define STRING_IMM_MAX (3 * sizeof (ptrdiff_t) - 2)
+
 /* Mark STR as a unibyte string.  */
 #define STRING_SET_UNIBYTE(STR)  \
-  do { if (EQ (STR, empty_multibyte_string))  \
-      (STR) = empty_unibyte_string;  \
-    else XSTRING (STR)->size_byte = -1; } while (0)
+  do { if (EQ (STR, empty_multibyte_string))		\
+      (STR) = empty_unibyte_string;			\
+    else if (XSTRING (STR)->u.imm.immbit)		\
+      XSTRING (STR)->u.imm.size_byte = -1;		\
+    else						\
+      XSTRING (STR)->u.dat.size_byte = -1; } while (0)
 
 /* Mark STR as a multibyte string.  Assure that STR contains only
    ASCII characters in advance.  */
-#define STRING_SET_MULTIBYTE(STR)  \
-  do { if (EQ (STR, empty_unibyte_string))  \
-      (STR) = empty_multibyte_string;  \
-    else XSTRING (STR)->size_byte = XSTRING (STR)->size; } while (0)
+#define STRING_SET_MULTIBYTE(STR)				  \
+  do { if (EQ (STR, empty_unibyte_string))			  \
+      (STR) = empty_multibyte_string;				  \
+    else if (XSTRING (STR)->u.imm.immbit)			  \
+      XSTRING (STR)->u.imm.size_byte = XSTRING (STR)->u.imm.size; \
+    else							  \
+      XSTRING (STR)->u.dat.size_byte = XSTRING (STR)->u.dat.size; \
+  } while (0)
 
 /* Get text properties.  */
 #define STRING_INTERVALS(STR)  (XSTRING (STR)->intervals + 0)
@@ -898,12 +903,55 @@
 /* In a string or vector, the sign bit of the `size' is the gc mark bit */
 
 struct Lisp_String
-  {
-    ptrdiff_t size;
-    ptrdiff_t size_byte;
-    INTERVAL intervals;		/* text properties in this string */
-    unsigned char *data;
-  };
+{
+  /* Text properties in this string.  Should be the first
+     member since NEXT_FREE_LISP_STRING from alloc.c uses it.  */
+  INTERVAL intervals;
+
+  union {
+    /* GC mark bit and subtype bit are in IMM just by convention - when
+       IMMBIT is 0, the DAT field is used except it's UNUSED field.  */
+    struct {
+      unsigned gcmarkbit : 1;
+      unsigned immbit : 1;
+      ptrdiff_t size : 7;
+      ptrdiff_t size_byte : 7;
+      unsigned char data[STRING_IMM_MAX];
+    } imm;
+
+    struct {
+      unsigned unused : 2; /* Do not access this placeholder.  */
+      ptrdiff_t size : BITS_PER_PTRDIFF_T - 2;
+      ptrdiff_t size_byte : BITS_PER_PTRDIFF_T - 2;
+      unsigned char *data;
+    } dat;
+  } u;
+};
+
+/* Return the length in bytes of STR.  */
+
+#ifdef GC_CHECK_STRING_BYTES
+
+struct Lisp_String;
+extern ptrdiff_t string_bytes (struct Lisp_String *);
+#define STRING_BYTES(S) string_bytes ((S))
+
+#else /* not GC_CHECK_STRING_BYTES */
+
+static inline
+ptrdiff_t string_bytes (struct Lisp_String *s)
+{
+  ptrdiff_t size, size_byte;
+
+  if (s->u.imm.immbit)
+    size = s->u.imm.size, size_byte = s->u.imm.size_byte;
+  else
+    size = s->u.dat.size, size_byte = s->u.dat.size_byte;
+
+  return size_byte < 0 ? size : size_byte;
+}
+
+#endif /* not GC_CHECK_STRING_BYTES */
 
 /* Header of vector-like objects.  This documents the layout constraints on
    vectors and pseudovectors other than struct Lisp_Subr.  It also prevents


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: Proposal: immediate strings
  2012-05-29  6:55   ` Dmitry Antipov
@ 2012-05-29  7:38     ` Paul Eggert
  2012-05-29 13:33       ` Dmitry Antipov
  2012-05-29  7:38     ` Proposal: immediate strings Andreas Schwab
  1 sibling, 1 reply; 30+ messages in thread
From: Paul Eggert @ 2012-05-29  7:38 UTC (permalink / raw)
  To: Dmitry Antipov; +Cc: Stefan Monnier, emacs-devel

On 05/28/2012 11:55 PM, Dmitry Antipov wrote:
> To avoid using 'packed' attribute and speedup an inefficient access to u.dat.size
> field, it's possible to use 'ptrdiff_t X : BITS_PER_PTRDIFF_T - 2'
> bitfields - it's still wide enough to hold the values up to STRING_BYTES_BOUND.

That's not true on 32-bit hosts configured --with-wide-int.
On such hosts, a string can contain up to PTRDIFF_MAX - 1 bytes.
This proposal would lower the limit to PTRDIFF_MAX / 4 bytes.
In contrast, the proposal in
<http://lists.gnu.org/archive/html/emacs-devel/2012-05/msg00460.html>
would merely lower it to PTRDIFF_MAX - 65536 bytes.

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: Proposal: immediate strings
  2012-05-29  6:55   ` Dmitry Antipov
  2012-05-29  7:38     ` Paul Eggert
@ 2012-05-29  7:38     ` Andreas Schwab
  1 sibling, 0 replies; 30+ messages in thread
From: Andreas Schwab @ 2012-05-29  7:38 UTC (permalink / raw)
  To: Dmitry Antipov; +Cc: Stefan Monnier, emacs-devel

Dmitry Antipov <dmantipov@yandex.ru> writes:

> +    struct {
> +      unsigned unused : 2; /* Do not access this placeholder.  */
> +      ptrdiff_t size : BITS_PER_PTRDIFF_T - 2;
> +      ptrdiff_t size_byte : BITS_PER_PTRDIFF_T - 2;

The latter could be full-width.

Andreas.

-- 
Andreas Schwab, schwab@linux-m68k.org
GPG Key fingerprint = 58CA 54C7 6D53 942B 1756  01D3 44D5 214B 8276 4ED5
"And now for something completely different."



^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: Proposal: immediate strings
  2012-05-29  7:38     ` Paul Eggert
@ 2012-05-29 13:33       ` Dmitry Antipov
  2012-05-29 15:24         ` Paul Eggert
  0 siblings, 1 reply; 30+ messages in thread
From: Dmitry Antipov @ 2012-05-29 13:33 UTC (permalink / raw)
  To: emacs-devel; +Cc: Paul Eggert, Stefan Monnier

[-- Attachment #1: Type: text/plain, Size: 906 bytes --]

On 05/29/2012 11:38 AM, Paul Eggert wrote:

> On 05/28/2012 11:55 PM, Dmitry Antipov wrote:
>> To avoid using 'packed' attribute and speedup an inefficient access to u.dat.size
>> field, it's possible to use 'ptrdiff_t X : BITS_PER_PTRDIFF_T - 2'
>> bitfields - it's still wide enough to hold the values up to STRING_BYTES_BOUND.
>
> That's not true on 32-bit hosts configured --with-wide-int.
> On such hosts, a string can contain up to PTRDIFF_MAX - 1 bytes.
> This proposal would lower the limit to PTRDIFF_MAX / 4 bytes.
> In contrast, the proposal in
> <http://lists.gnu.org/archive/html/emacs-devel/2012-05/msg00460.html>
> would merely lower it to PTRDIFF_MAX - 65536 bytes.

OK, this allows

min (MOST_POSITIVE_FIXNUM, (ptrdiff_t) min (SIZE_MAX, PTRDIFF_MAX - 1) - 1)

in all configurations (so PTRDIFF_MAX - 2 for --with-wide-int), eventually
uses _all_ bits by eliminating signed values.

Dmitry

[-- Attachment #2: immstr-dev.patch --]
[-- Type: text/plain, Size: 19720 bytes --]

=== modified file 'src/alloc.c'
--- src/alloc.c	2012-05-25 18:19:24 +0000
+++ src/alloc.c	2012-05-29 13:28:14 +0000
@@ -147,20 +147,14 @@
 /* Mark, unmark, query mark bit of a Lisp string.  S must be a pointer
    to a struct Lisp_String.  */
 
-#define MARK_STRING(S)		((S)->size |= ARRAY_MARK_FLAG)
-#define UNMARK_STRING(S)	((S)->size &= ~ARRAY_MARK_FLAG)
-#define STRING_MARKED_P(S)	(((S)->size & ARRAY_MARK_FLAG) != 0)
+#define MARK_STRING(S)		((S)->u.imm.gcmarkbit = 1)
+#define UNMARK_STRING(S)	((S)->u.imm.gcmarkbit = 0)
+#define STRING_MARKED_P(S)	((S)->u.imm.gcmarkbit)
 
 #define VECTOR_MARK(V)		((V)->header.size |= ARRAY_MARK_FLAG)
 #define VECTOR_UNMARK(V)	((V)->header.size &= ~ARRAY_MARK_FLAG)
 #define VECTOR_MARKED_P(V)	(((V)->header.size & ARRAY_MARK_FLAG) != 0)
 
-/* Value is the number of bytes of S, a pointer to a struct Lisp_String.
-   Be careful during GC, because S->size contains the mark bit for
-   strings.  */
-
-#define GC_STRING_BYTES(S)	(STRING_BYTES (S))
-
 /* Global variables.  */
 struct emacs_globals globals;
 
@@ -392,6 +386,7 @@
 static void mark_stack (void);
 static int live_vector_p (struct mem_node *, void *);
 static int live_buffer_p (struct mem_node *, void *);
+static int live_string_data_p (struct Lisp_String *);
 static int live_string_p (struct mem_node *, void *);
 static int live_cons_p (struct mem_node *, void *);
 static int live_symbol_p (struct mem_node *, void *);
@@ -1761,7 +1756,8 @@
    a pointer to the `u.data' member of its sdata structure; the
    structure starts at a constant offset in front of that.  */
 
-#define SDATA_OF_STRING(S) ((struct sdata *) ((S)->data - SDATA_DATA_OFFSET))
+#define SDATA_OF_STRING(S) ((S)->u.imm.immbit ? (struct sdata *) NULL \
+  : ((struct sdata *) ((S)->u.dat.data - SDATA_DATA_OFFSET)))
 
 
 #ifdef GC_CHECK_STRING_OVERRUN
@@ -1843,21 +1839,30 @@
 
 static int check_string_bytes_count;
 
-#define CHECK_STRING_BYTES(S)	STRING_BYTES (S)
-
-
-/* Like GC_STRING_BYTES, but with debugging check.  */
+#define CHECK_STRING_BYTES(S) string_bytes (S)
 
 ptrdiff_t
 string_bytes (struct Lisp_String *s)
 {
-  ptrdiff_t nbytes =
-    (s->size_byte < 0 ? s->size & ~ARRAY_MARK_FLAG : s->size_byte);
-
-  if (!PURE_POINTER_P (s)
-      && s->data
-      && nbytes != SDATA_NBYTES (SDATA_OF_STRING (s)))
-    abort ();
+  ptrdiff_t nbytes;
+
+  if (s->u.imm.immbit)
+    {
+      nbytes = s->u.imm.size_byte == STRING_UNIBYTE_IMM_MAX ?
+	s->u.imm.size : s->u.imm.size_byte;
+      eassert (nbytes < STRING_IMM_SIZE);
+    }
+  else
+    {
+      nbytes = s->u.dat.size_byte == STRING_UNIBYTE_DAT_MAX ?
+	s->u.dat.size : s->u.dat.size_byte;
+      eassert (nbytes >= STRING_IMM_SIZE);
+      if (!PURE_POINTER_P (s) && s->u.dat.data &&
+	  nbytes != SDATA_NBYTES (SDATA_OF_STRING (s)))
+	/* Normal non-pure string with size mismatch.  */
+	abort ();
+    }
+
   return nbytes;
 }
 
@@ -1882,7 +1887,7 @@
 	CHECK_STRING_BYTES (from->string);
 
       if (from->string)
-	nbytes = GC_STRING_BYTES (from->string);
+	nbytes = string_bytes (from->string);
       else
 	nbytes = SDATA_NBYTES (from);
 
@@ -2028,8 +2033,8 @@
   /* Determine the number of bytes needed to store NBYTES bytes
      of string data.  */
   needed = SDATA_SIZE (nbytes);
-  old_data = s->data ? SDATA_OF_STRING (s) : NULL;
-  old_nbytes = GC_STRING_BYTES (s);
+  old_data = s->u.dat.data ? SDATA_OF_STRING (s) : NULL;
+  old_nbytes = string_bytes (s);
 
   MALLOC_BLOCK_INPUT;
 
@@ -2088,13 +2093,11 @@
   MALLOC_UNBLOCK_INPUT;
 
   data->string = s;
-  s->data = SDATA_DATA (data);
+  s->u.dat.data = SDATA_DATA (data);
 #ifdef GC_CHECK_STRING_BYTES
   SDATA_NBYTES (data) = nbytes;
 #endif
-  s->size = nchars;
-  s->size_byte = nbytes;
-  s->data[nbytes] = '\0';
+  s->u.dat.data[nbytes] = '\0';
 #ifdef GC_CHECK_STRING_OVERRUN
   memcpy ((char *) data + needed, string_overrun_cookie,
 	  GC_STRING_OVERRUN_COOKIE_SIZE);
@@ -2112,6 +2115,12 @@
   consing_since_gc += needed;
 }
 
+#ifdef GC_STRING_STATS
+
+static EMACS_INT total_imm_strings, total_dat_strings, total_imm_intervals;
+static EMACS_INT total_imm_bytes, total_dat_bytes, total_dat_intervals;
+
+#endif
 
 /* Sweep and compact strings.  */
 
@@ -2125,6 +2134,12 @@
   total_strings = total_free_strings = 0;
   total_string_size = 0;
 
+#ifdef GC_STRING_STATS
+  total_imm_strings = total_dat_strings = 0;
+  total_imm_bytes = total_dat_bytes = 0;
+  total_imm_intervals = total_dat_intervals = 0;
+#endif
+
   /* Scan strings_blocks, free Lisp_Strings that aren't marked.  */
   for (b = string_blocks; b; b = next)
     {
@@ -2137,49 +2152,64 @@
 	{
 	  struct Lisp_String *s = b->strings + i;
 
-	  if (s->data)
+	  if (STRING_MARKED_P (s))
+	    {	      
+	      /* String is live; unmark it and its intervals.  */
+	      UNMARK_STRING (s);
+
+	      if (!NULL_INTERVAL_P (s->intervals))
+		UNMARK_BALANCE_INTERVALS (s->intervals);
+
+	      ++total_strings;
+	      total_string_size += string_bytes (s);
+#ifdef GC_STRING_STATS
+	      if (s->u.imm.immbit)
+		{
+		  total_imm_strings++;
+		  total_imm_bytes += string_bytes (s);
+		  if (!NULL_INTERVAL_P (s->intervals))
+		    total_imm_intervals++;
+		}
+	      else
+		{
+		  total_dat_strings++;
+		  total_dat_bytes += string_bytes (s);
+		  if (!NULL_INTERVAL_P (s->intervals))
+		    total_dat_intervals++;
+		}
+#endif /* GC_STRING_STATS */
+	    }
+	  else
 	    {
-	      /* String was not on free-list before.  */
-	      if (STRING_MARKED_P (s))
-		{
-		  /* String is live; unmark it and its intervals.  */
-		  UNMARK_STRING (s);
-
-		  if (!NULL_INTERVAL_P (s->intervals))
-		    UNMARK_BALANCE_INTERVALS (s->intervals);
-
-		  ++total_strings;
-		  total_string_size += STRING_BYTES (s);
-		}
+	      if (s->u.imm.immbit)
+		/* Fill data with special pattern. Used by
+		   GC to find dead immediate strings.  */
+		memset (s->u.imm.data, 0xff, STRING_IMM_SIZE);
 	      else
 		{
-		  /* String is dead.  Put it on the free-list.  */
-		  struct sdata *data = SDATA_OF_STRING (s);
+		  if (s->u.dat.data)
+		    {
+		      /* String is dead.  Put it on the free-list.  */
+		      struct sdata *data = SDATA_OF_STRING (s);
 
-		  /* Save the size of S in its sdata so that we know
-		     how large that is.  Reset the sdata's string
-		     back-pointer so that we know it's free.  */
+		      /* Save the size of S in its sdata so that we know
+			 how large that is.  Reset the sdata's string
+			 back-pointer so that we know it's free.  */
 #ifdef GC_CHECK_STRING_BYTES
-		  if (GC_STRING_BYTES (s) != SDATA_NBYTES (data))
-		    abort ();
+		      if (string_bytes (s) != SDATA_NBYTES (data))
+			abort ();
 #else
-		  data->u.nbytes = GC_STRING_BYTES (s);
+		      data->u.nbytes = string_bytes (s);
 #endif
-		  data->string = NULL;
-
-		  /* Reset the strings's `data' member so that we
-		     know it's free.  */
-		  s->data = NULL;
-
-		  /* Put the string on the free-list.  */
-		  NEXT_FREE_LISP_STRING (s) = string_free_list;
-		  string_free_list = s;
-		  ++nfree;
+		      data->string = NULL;
+
+		      /* Reset the strings's `data' member so that we
+			 know it's free.  */
+		      s->u.dat.data = NULL;
+		    }
 		}
-	    }
-	  else
-	    {
-	      /* S was on the free-list before.  Put it there again.  */
+
+	      /* Put the string on the free-list.  */
 	      NEXT_FREE_LISP_STRING (s) = string_free_list;
 	      string_free_list = s;
 	      ++nfree;
@@ -2271,12 +2301,12 @@
 	  /* Check that the string size recorded in the string is the
 	     same as the one recorded in the sdata structure. */
 	  if (from->string
-	      && GC_STRING_BYTES (from->string) != SDATA_NBYTES (from))
+	      && string_bytes (from->string) != SDATA_NBYTES (from))
 	    abort ();
 #endif /* GC_CHECK_STRING_BYTES */
 
 	  if (from->string)
-	    nbytes = GC_STRING_BYTES (from->string);
+	    nbytes = string_bytes (from->string);
 	  else
 	    nbytes = SDATA_NBYTES (from);
 
@@ -2312,7 +2342,7 @@
 		{
 		  xassert (tb != b || to < from);
 		  memmove (to, from, nbytes + GC_STRING_EXTRA);
-		  to->string->data = SDATA_DATA (to);
+		  to->string->u.dat.data = SDATA_DATA (to);
 		}
 
 	      /* Advance past the sdata we copied to.  */
@@ -2562,13 +2592,24 @@
     return empty_multibyte_string;
 
   s = allocate_string ();
-  allocate_string_data (s, nchars, nbytes);
+  if (nbytes < STRING_IMM_SIZE)
+    {
+      s->u.imm.immbit = 1;
+      s->u.imm.size = nchars;
+      s->u.imm.size_byte = nbytes;
+    }
+  else
+    {
+      s->u.imm.immbit = 0;
+      s->u.dat.size = nchars;
+      s->u.dat.size_byte = nbytes;
+      allocate_string_data (s, nchars, nbytes);
+    }
   XSETSTRING (string, s);
   string_chars_consed += nbytes;
   return string;
 }
 
-
 \f
 /***********************************************************************
 			   Float Allocation
@@ -3937,6 +3978,22 @@
   x->color = MEM_BLACK;
 }
 
+/* Non-zero if data of S is valid.  */
+
+static inline int
+live_string_data_p (struct Lisp_String *s)
+{
+  if (s->u.imm.immbit)
+    {
+      unsigned char *p;
+
+      for (p = s->u.imm.data; p < s->u.imm.data + STRING_IMM_SIZE; p++)
+	if (*p != 0xff)
+	  return 1;
+      return 0;
+    }
+  return s->u.dat.data != NULL;
+}
 
 /* Value is non-zero if P is a pointer to a live Lisp string on
    the heap.  M is a pointer to the mem_block for P.  */
@@ -3954,7 +4011,7 @@
       return (offset >= 0
 	      && offset % sizeof b->strings[0] == 0
 	      && offset < (STRING_BLOCK_SIZE * sizeof b->strings[0])
-	      && ((struct Lisp_String *) p)->data != NULL);
+	      && live_string_data_p ((struct Lisp_String *) p));
     }
   else
     return 0;
@@ -4869,15 +4926,29 @@
   struct Lisp_String *s;
 
   s = (struct Lisp_String *) pure_alloc (sizeof *s, Lisp_String);
-  s->data = (unsigned char *) find_string_data_in_pure (data, nbytes);
-  if (s->data == NULL)
-    {
-      s->data = (unsigned char *) pure_alloc (nbytes + 1, -1);
-      memcpy (s->data, data, nbytes);
-      s->data[nbytes] = '\0';
-    }
-  s->size = nchars;
-  s->size_byte = multibyte ? nbytes : -1;
+
+  if (nbytes < STRING_IMM_SIZE)
+    {
+      memcpy (s->u.imm.data, data, nbytes);
+      s->u.imm.data[nbytes] = '\0';
+      s->u.imm.immbit = 1;
+      s->u.imm.size = nchars;
+      s->u.imm.size_byte = multibyte ? nbytes : STRING_UNIBYTE_IMM_MAX;
+    }
+  else
+    {
+      s->u.dat.data = (unsigned char *) find_string_data_in_pure (data, nbytes);
+      if (s->u.dat.data == NULL)
+	{
+	  s->u.dat.data = (unsigned char *) pure_alloc (nbytes + 1, -1);
+	  memcpy (s->u.dat.data, data, nbytes);
+	  s->u.dat.data[nbytes] = '\0';
+	}
+      s->u.imm.immbit = 0;
+      s->u.dat.size = nchars;
+      s->u.dat.size_byte = multibyte ? nbytes : STRING_UNIBYTE_DAT_MAX;
+    }
+
   s->intervals = NULL_INTERVAL;
   XSETSTRING (string, s);
   return string;
@@ -4894,9 +4965,23 @@
   ptrdiff_t nchars = strlen (data);
 
   s = (struct Lisp_String *) pure_alloc (sizeof *s, Lisp_String);
-  s->size = nchars;
-  s->size_byte = -1;
-  s->data = (unsigned char *) data;
+
+  if (nchars < STRING_IMM_SIZE)
+    {
+      memcpy (s->u.imm.data, data, nchars);
+      s->u.imm.data[nchars] = '\0';
+      s->u.imm.immbit = 1;
+      s->u.imm.size = nchars;
+      s->u.imm.size_byte = STRING_UNIBYTE_IMM_MAX;
+    }
+  else
+    {
+      s->u.dat.data = (unsigned char *) data;
+      s->u.imm.immbit = 0;
+      s->u.dat.size = nchars;
+      s->u.dat.size_byte = STRING_UNIBYTE_DAT_MAX;
+    }
+
   s->intervals = NULL_INTERVAL;
   XSETSTRING (string, s);
   return string;
@@ -6319,6 +6404,34 @@
   return Flist (8, consed);
 }
 
+#ifdef GC_STRING_STATS
+
+DEFUN ("string-stats", Fstring_stats, Sstring_stats, 0, 0, 0,
+       doc: /* Return a list of counters that measures how much
+strings of a particular internal structure are alive after last
+garbage collection, and how many bytes are in them.
+The elements of the value are are as follows:
+  (IMM-STRINGS IMM-BYTES IMM-INTERVALS DAT-STRINGS DAT-BYTES DAT-INTERVALS)
+where IMM-STRINGS is the number of immediate strings, IMM-BYTES is the total
+number of bytes in them, and IMM-INTERVALS is the number of immediate string
+with non-nil text properties. The rest three numbers has the same meaning
+for normal strings, respectively.  */)
+  (void)
+{
+  Lisp_Object data[6];
+
+  data[0] = make_number (min (MOST_POSITIVE_FIXNUM, total_imm_strings));
+  data[1] = make_number (min (MOST_POSITIVE_FIXNUM, total_imm_bytes));
+  data[2] = make_number (min (MOST_POSITIVE_FIXNUM, total_imm_intervals));
+  data[3] = make_number (min (MOST_POSITIVE_FIXNUM, total_dat_strings));
+  data[4] = make_number (min (MOST_POSITIVE_FIXNUM, total_dat_bytes));
+  data[5] = make_number (min (MOST_POSITIVE_FIXNUM, total_dat_intervals));
+
+  return Flist (6, data);
+}
+
+#endif /* GC_STRING_STATS */
+
 /* Find at most FIND_MAX symbols which have OBJ as their value or
    function.  This is used in gdbinit's `xwhichsymbols' command.  */
 
@@ -6547,7 +6660,9 @@
   defsubr (&Sgarbage_collect);
   defsubr (&Smemory_limit);
   defsubr (&Smemory_use_counts);
-
+#ifdef GC_STRING_STATS
+  defsubr (&Sstring_stats);
+#endif
 #if GC_MARK_STACK == GC_USE_GCPROS_CHECK_ZOMBIES
   defsubr (&Sgc_status);
 #endif

=== modified file 'src/fns.c'
--- src/fns.c	2012-05-25 18:19:24 +0000
+++ src/fns.c	2012-05-29 10:00:53 +0000
@@ -2166,8 +2166,8 @@
 	  int len = CHAR_STRING (charval, str);
 	  ptrdiff_t size_byte = SBYTES (array);
 
-	  if (INT_MULTIPLY_OVERFLOW (SCHARS (array), len)
-	      || SCHARS (array) * len != size_byte)
+	  if (INT_MULTIPLY_OVERFLOW (size, len)
+	      || size * len != size_byte)
 	    error ("Attempt to change byte length of a string");
 	  for (idx = 0; idx < size_byte; idx++)
 	    *p++ = str[idx % len];

=== modified file 'src/lisp.h'
--- src/lisp.h	2012-05-27 07:51:09 +0000
+++ src/lisp.h	2012-05-29 13:27:57 +0000
@@ -743,17 +743,23 @@
 
 /* Convenience macros for dealing with Lisp strings.  */
 
-#define SDATA(string)		(XSTRING (string)->data + 0)
+#define SDATA(string)		(XSTRING (string)->u.imm.immbit ? \
+				 (XSTRING (string)->u.imm.data) : \
+				 (XSTRING (string)->u.dat.data))
 #define SREF(string, index)	(SDATA (string)[index] + 0)
 #define SSET(string, index, new) (SDATA (string)[index] = (new))
-#define SCHARS(string)		(XSTRING (string)->size + 0)
-#define SBYTES(string)		(STRING_BYTES (XSTRING (string)) + 0)
+#define SCHARS(string)		(XSTRING (string)->u.imm.immbit ? \
+				 (XSTRING (string)->u.imm.size) : \
+				 (XSTRING (string)->u.dat.size))
+#define SBYTES(string)		(string_bytes (XSTRING (string)))
 
 /* Avoid "differ in sign" warnings.  */
 #define SSDATA(x)  ((char *) SDATA (x))
 
-#define STRING_SET_CHARS(string, newsize) \
-    (XSTRING (string)->size = (newsize))
+#define STRING_SET_CHARS(string, newsize)	\
+  (XSTRING (string)->u.imm.immbit ?		\
+   (XSTRING (string)->u.imm.size = (newsize)) : \
+   (XSTRING (string)->u.dat.size = (newsize)))
 
 #define STRING_COPYIN(string, index, new, count) \
     memcpy (SDATA (string) + index, new, count)
@@ -843,24 +849,17 @@
 #define CDR_SAFE(c)				\
   (CONSP ((c)) ? XCDR ((c)) : Qnil)
 
+/* For unibyte immediate string, SIZE_BYTE field is always set to this.  */
+#define STRING_UNIBYTE_IMM_MAX ((1 << (BITS_PER_CHAR - 1)) - 1)
+
+/* For unibyte normal string, SIZE_BYTE field is always set to this.  */
+#define STRING_UNIBYTE_DAT_MAX ((1UL << (BITS_PER_LONG - 1)) - 1)
+
 /* Nonzero if STR is a multibyte string.  */
-#define STRING_MULTIBYTE(STR)  \
-  (XSTRING (STR)->size_byte >= 0)
-
-/* Return the length in bytes of STR.  */
-
-#ifdef GC_CHECK_STRING_BYTES
-
-struct Lisp_String;
-extern ptrdiff_t string_bytes (struct Lisp_String *);
-#define STRING_BYTES(S) string_bytes ((S))
-
-#else /* not GC_CHECK_STRING_BYTES */
-
-#define STRING_BYTES(STR)  \
-  ((STR)->size_byte < 0 ? (STR)->size : (STR)->size_byte)
-
-#endif /* not GC_CHECK_STRING_BYTES */
+#define STRING_MULTIBYTE(string)					\
+  (XSTRING (string)->u.imm.immbit ?					\
+   (XSTRING (string)->u.imm.size_byte != STRING_UNIBYTE_IMM_MAX) :	\
+   (XSTRING (string)->u.dat.size_byte != STRING_UNIBYTE_DAT_MAX))
 
 /* An upper bound on the number of bytes in a Lisp string, not
    counting the terminating null.  This a tight enough bound to
@@ -874,20 +873,32 @@
    private.  The cast to ptrdiff_t ensures that STRING_BYTES_BOUND is
    signed.  */
 #define STRING_BYTES_BOUND  \
-  min (MOST_POSITIVE_FIXNUM, (ptrdiff_t) min (SIZE_MAX, PTRDIFF_MAX) - 1)
+  min (MOST_POSITIVE_FIXNUM, \
+       (ptrdiff_t) min (SIZE_MAX, STRING_UNIBYTE_DAT_MAX - 1) - 1)
+
+/* Maximum amount of bytes, including '\0', in an immediate string.  */
+#define STRING_IMM_SIZE (sizeof (void *) + 2 * sizeof (unsigned long) - 2)
 
 /* Mark STR as a unibyte string.  */
 #define STRING_SET_UNIBYTE(STR)  \
-  do { if (EQ (STR, empty_multibyte_string))  \
-      (STR) = empty_unibyte_string;  \
-    else XSTRING (STR)->size_byte = -1; } while (0)
+  do { if (EQ (STR, empty_multibyte_string))			\
+      (STR) = empty_unibyte_string;				\
+    else if (XSTRING (STR)->u.imm.immbit)			\
+      XSTRING (STR)->u.imm.size_byte = STRING_UNIBYTE_IMM_MAX;	\
+    else							\
+      XSTRING (STR)->u.dat.size_byte = STRING_UNIBYTE_DAT_MAX;	\
+  } while (0)
 
 /* Mark STR as a multibyte string.  Assure that STR contains only
    ASCII characters in advance.  */
-#define STRING_SET_MULTIBYTE(STR)  \
-  do { if (EQ (STR, empty_unibyte_string))  \
-      (STR) = empty_multibyte_string;  \
-    else XSTRING (STR)->size_byte = XSTRING (STR)->size; } while (0)
+#define STRING_SET_MULTIBYTE(STR)				  \
+  do { if (EQ (STR, empty_unibyte_string))			  \
+      (STR) = empty_multibyte_string;				  \
+    else if (XSTRING (STR)->u.imm.immbit)			  \
+      XSTRING (STR)->u.imm.size_byte = XSTRING (STR)->u.imm.size; \
+    else							  \
+      XSTRING (STR)->u.dat.size_byte = XSTRING (STR)->u.dat.size; \
+  } while (0)
 
 /* Get text properties.  */
 #define STRING_INTERVALS(STR)  (XSTRING (STR)->intervals + 0)
@@ -898,12 +909,53 @@
 /* In a string or vector, the sign bit of the `size' is the gc mark bit */
 
 struct Lisp_String
-  {
-    ptrdiff_t size;
-    ptrdiff_t size_byte;
-    INTERVAL intervals;		/* text properties in this string */
-    unsigned char *data;
-  };
+{
+  /* Text properties in this string.  Should be the first
+     member since NEXT_FREE_LISP_STRING from alloc.c uses it.  */
+  INTERVAL intervals;
+
+  union {
+    /* GC mark bit and subtype bit are in IMM just by convention - when
+       IMMBIT is 0, the DAT field is used except it's UNUSED field.  */
+    struct {
+      unsigned immbit : 1;
+      unsigned size : BITS_PER_CHAR - 1;
+      unsigned char data[STRING_IMM_SIZE];
+      unsigned size_byte : BITS_PER_CHAR - 1;
+      unsigned gcmarkbit : 1;
+    } imm;
+
+    struct {
+      unsigned immbit : 1;
+      unsigned long size : BITS_PER_LONG - 1;
+      unsigned char *data;
+      unsigned long size_byte : BITS_PER_LONG - 1;
+      unsigned gcmarkbit : 1;
+    } dat;
+  } u;
+};
+
+/* Return the length in bytes of STR.  */
+
+#ifdef GC_CHECK_STRING_BYTES
+
+struct Lisp_String;
+extern ptrdiff_t string_bytes (struct Lisp_String *);
+#define STRING_BYTES(S) string_bytes ((S))
+
+#else /* not GC_CHECK_STRING_BYTES */
+
+static inline
+ptrdiff_t string_bytes (struct Lisp_String *s)
+{
+  if (s->u.imm.immbit)
+    return s->u.imm.size_byte == STRING_UNIBYTE_IMM_MAX ?
+      s->u.imm.size : s->u.imm.size_byte;
+  return s->u.dat.size_byte == STRING_UNIBYTE_DAT_MAX ?
+    s->u.dat.size : s->u.dat.size_byte;
+}
+
+#endif /* not GC_CHECK_STRING_BYTES */
 
 /* Header of vector-like objects.  This documents the layout constraints on
    vectors and pseudovectors other than struct Lisp_Subr.  It also prevents


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: Proposal: immediate strings
  2012-05-29 13:33       ` Dmitry Antipov
@ 2012-05-29 15:24         ` Paul Eggert
  2012-05-31  9:28           ` Dmitry Antipov
  0 siblings, 1 reply; 30+ messages in thread
From: Paul Eggert @ 2012-05-29 15:24 UTC (permalink / raw)
  To: Dmitry Antipov; +Cc: Stefan Monnier, emacs-devel

On 05/29/2012 06:33 AM, Dmitry Antipov wrote:
> On 05/29/2012 11:38 AM, Paul Eggert wrote:
> this allows
> 
> min (MOST_POSITIVE_FIXNUM, (ptrdiff_t) min (SIZE_MAX, PTRDIFF_MAX - 1) - 1)
> 
> in all configurations (so PTRDIFF_MAX - 2 for --with-wide-int), eventually
> uses _all_ bits by eliminating signed values.

That one's clever, but doesn't it grow the size of "struct Lisp_String"
by a word, due to the padding bytes after the gcmarkbit member?

Why does gcmarkbit have to be in the same place for the two kinds
of strings?  If you could put gcmarkbit in different places in the
different union members, you wouldn't need that padding.

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: Proposal: immediate strings
  2012-05-29 15:24         ` Paul Eggert
@ 2012-05-31  9:28           ` Dmitry Antipov
  2012-05-31 16:34             ` Paul Eggert
  0 siblings, 1 reply; 30+ messages in thread
From: Dmitry Antipov @ 2012-05-31  9:28 UTC (permalink / raw)
  To: Paul Eggert; +Cc: Stefan Monnier, emacs-devel

On 05/29/2012 07:24 PM, Paul Eggert wrote:

> That one's clever, but doesn't it grow the size of "struct Lisp_String"
> by a word, due to the padding bytes after the gcmarkbit member?
>
> Why does gcmarkbit have to be in the same place for the two kinds
> of strings?  If you could put gcmarkbit in different places in the
> different union members, you wouldn't need that padding.

IIUC this is the only possible layout which: 1) uses bitfields 2) not requires
__attribute__((packed)) and 3) provides 16-bytes Lisp_String with 10 bytes
for immediate data on 32-bit and 32-bytes Lisp_String with 22 bytes
for immediate data on 64-bit.

Dmitry



^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: Proposal: immediate strings
  2012-05-31  9:28           ` Dmitry Antipov
@ 2012-05-31 16:34             ` Paul Eggert
  2012-06-06  6:14               ` Dmitry Antipov
  0 siblings, 1 reply; 30+ messages in thread
From: Paul Eggert @ 2012-05-31 16:34 UTC (permalink / raw)
  To: Dmitry Antipov; +Cc: Stefan Monnier, emacs-devel

On 05/31/2012 02:28 AM, Dmitry Antipov wrote:
> IIUC this is the only possible layout which: 1) uses bitfields 2) not requires
> __attribute__((packed)) and 3) provides 16-bytes Lisp_String with 10 bytes
> for immediate data on 32-bit and 32-bytes Lisp_String with 22 bytes
> for immediate data on 64-bit.

Ah, sorry, I misunderstood
<http://lists.gnu.org/archive/html/emacs-devel/2012-05/msg00509.html>'s
proposal.  I thought that it grew struct Lisp_String.

Part of my misunderstanding was due to that the code confuses 'unsigned long'
with 'ptrdiff_t'.  The two types need not be the same width.
The code should use 'sizeof (ptrdiff_t)' where it currently
uses 'sizeof (unsigned long)', and we need a BITS_PER_PTRDIFF_T
enum to size the fields correctly.

What are the performance results for this version?

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: Proposal: immediate strings
  2012-05-31 16:34             ` Paul Eggert
@ 2012-06-06  6:14               ` Dmitry Antipov
  2012-06-06  6:41                 ` Paul Eggert
  0 siblings, 1 reply; 30+ messages in thread
From: Dmitry Antipov @ 2012-06-06  6:14 UTC (permalink / raw)
  To: Paul Eggert; +Cc: Stefan Monnier, emacs-devel

On 05/31/2012 08:34 PM, Paul Eggert wrote:

> Part of my misunderstanding was due to that the code confuses 'unsigned long'
> with 'ptrdiff_t'.  The two types need not be the same width.
> The code should use 'sizeof (ptrdiff_t)' where it currently
> uses 'sizeof (unsigned long)', and we need a BITS_PER_PTRDIFF_T
> enum to size the fields correctly.

The problem with ptrdiff_t is that it's signed (and has no unsigned version),
and the code assumes that the bitfields are unsigned.

This raises the question which I wanted to raise long time ago: why ptrdiff_t
(read: signed difference between pointers) is used for size values, although
there is a special type ssize_t (and it's unsigned counterpart size_t)?

Dmitry

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: Proposal: immediate strings
  2012-06-06  6:14               ` Dmitry Antipov
@ 2012-06-06  6:41                 ` Paul Eggert
  2012-06-06  7:29                   ` Dmitry Antipov
  0 siblings, 1 reply; 30+ messages in thread
From: Paul Eggert @ 2012-06-06  6:41 UTC (permalink / raw)
  To: Dmitry Antipov; +Cc: Stefan Monnier, emacs-devel

On 06/05/2012 11:14 PM, Dmitry Antipov wrote:
> The problem with ptrdiff_t is that it's signed (and has no unsigned version),
> and the code assumes that the bitfields are unsigned.

OK, but can't we then simply say 'unsigned int foo : N' where N is the
proper width?

> why ptrdiff_t
> (read: signed difference between pointers) is used for size values, although
> there is a special type ssize_t (and it's unsigned counterpart size_t)?

ssize_t is there mostly for historical reasons; it need not be as
wide as size_t (and on some ancient hosts is not).  It's best used
only when the POSIX API uses it.

Generally speaking we prefer signed types in Emacs source code, for
several reasons.  For example, comparisons work better (you don't get
strange results like zero is less than -1).  Another example: you can
trap signed integer overflow when debugging, and catch some bugs that way.
This is why we generally prefer ptrdiff_t to size_t.

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: Proposal: immediate strings
  2012-06-06  6:41                 ` Paul Eggert
@ 2012-06-06  7:29                   ` Dmitry Antipov
  2012-06-06 15:14                     ` Eli Zaretskii
  0 siblings, 1 reply; 30+ messages in thread
From: Dmitry Antipov @ 2012-06-06  7:29 UTC (permalink / raw)
  To: Paul Eggert; +Cc: emacs-devel

On 06/06/2012 10:41 AM, Paul Eggert wrote:

> OK, but can't we then simply say 'unsigned int foo : N' where N is the
> proper width?

struct foo {
   unsigned int x : BITS_PER_PTRDIFF_T - 1;
   unsigned y : 1;
};

will work only if sizeof (ptrdiff_t) == sizeof (int); if
sizeof (ptrdiff_t) == sizeof (long) and sizeof (long) > sizeof (int),
(the most of 64-bit environments), compilers will complain about
an invalid bitfield size. For the particular case of gcc, it's
'width of ‘x’ exceeds its type' error.

Note 'unsigned long' bitfield type:

struct foo {
   unsigned long x : BITS_PER_PTRDIFF_T - 1;
   unsigned y : 1;
}

works even if BITS_PER_PTRDIFF_T < BITS_PER_LONG.

Dmitry

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: Proposal: immediate strings
  2012-06-06  7:29                   ` Dmitry Antipov
@ 2012-06-06 15:14                     ` Eli Zaretskii
  2012-06-06 21:44                       ` Paul Eggert
  2012-07-04  8:27                       ` Old topic(s) again [was: Re: Proposal: immediate strings] Dmitry Antipov
  0 siblings, 2 replies; 30+ messages in thread
From: Eli Zaretskii @ 2012-06-06 15:14 UTC (permalink / raw)
  To: Dmitry Antipov; +Cc: eggert, emacs-devel

> Date: Wed, 06 Jun 2012 11:29:47 +0400
> From: Dmitry Antipov <dmantipov@yandex.ru>
> Cc: emacs-devel@gnu.org
> 
> Note 'unsigned long' bitfield type:
> 
> struct foo {
>    unsigned long x : BITS_PER_PTRDIFF_T - 1;
>    unsigned y : 1;
> }
> 
> works even if BITS_PER_PTRDIFF_T < BITS_PER_LONG.

But only on LP64 hosts, right?



^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: Proposal: immediate strings
  2012-06-06 15:14                     ` Eli Zaretskii
@ 2012-06-06 21:44                       ` Paul Eggert
  2012-07-04  8:27                       ` Old topic(s) again [was: Re: Proposal: immediate strings] Dmitry Antipov
  1 sibling, 0 replies; 30+ messages in thread
From: Paul Eggert @ 2012-06-06 21:44 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: Dmitry Antipov, emacs-devel

On 06/06/2012 08:14 AM, Eli Zaretskii wrote:
>> >    unsigned long x : BITS_PER_PTRDIFF_T - 1;
>> >    unsigned y : 1;
>> > }
>> > 
>> > works even if BITS_PER_PTRDIFF_T < BITS_PER_LONG.
> But only on LP64 hosts, right?

Yes, I suppose the bitfield type would be
unsigned long long on hosts where ptrdiff_t
is too wide for unsigned long; this is something
that be determined via something like
"#if ULONG_MAX < PTRDIFF_MAX".

Or perhaps it'd be simpler to say this:

   size_t x : BITS_PER_PTRDIFF_T - 1;
   unsigned y : 1;

if that's portable in practice.



^ permalink raw reply	[flat|nested] 30+ messages in thread

* Old topic(s) again [was: Re: Proposal: immediate strings]
  2012-06-06 15:14                     ` Eli Zaretskii
  2012-06-06 21:44                       ` Paul Eggert
@ 2012-07-04  8:27                       ` Dmitry Antipov
  2012-07-04 13:08                         ` Stefan Monnier
  1 sibling, 1 reply; 30+ messages in thread
From: Dmitry Antipov @ 2012-07-04  8:27 UTC (permalink / raw)
  To: emacs-devel

[-- Attachment #1: Type: text/plain, Size: 89 bytes --]

This feature is not thrown away, so I would like to get more comments around it.

Dmitry

[-- Attachment #2: immstr.patch --]
[-- Type: text/plain, Size: 22868 bytes --]

=== modified file 'configure.in'
--- configure.in	2012-07-04 08:07:26 +0000
+++ configure.in	2012-07-04 08:09:13 +0000
@@ -3080,6 +3080,29 @@
      declarations.  Define as empty for no equivalent.])
 fi
 
+dnl Determine the basic type of ptrdiff_t.
+AC_CHECK_SIZEOF([int])
+AC_CHECK_SIZEOF([long])
+AC_CHECK_SIZEOF([long long])
+AC_CHECK_SIZEOF([ptrdiff_t])
+AC_MSG_CHECKING([for the basic type of ptrdiff_t])
+if test $ac_cv_sizeof_int = $ac_cv_sizeof_ptrdiff_t; then
+   emacs_cv_type_ptrdiff_t="int"
+elif test $ac_cv_sizeof_long = $ac_cv_sizeof_ptrdiff_t; then
+   emacs_cv_type_ptrdiff_t="long"
+elif test $ac_cv_sizeof_long_long = $ac_cv_sizeof_ptrdiff_t; then
+   emacs_cv_type_ptrdiff_t="long long"
+else
+   emacs_cv_type_ptrdiff_t="unknown"
+fi
+AC_MSG_RESULT([$emacs_cv_type_ptrdiff_t])
+if test $emacs_cv_type_ptrdiff_t != "unknown"; then
+   AC_DEFINE_UNQUOTED([TYPE_PTRDIFF_T], [$emacs_cv_type_ptrdiff_t],
+                      [Define to the basic type of ptrdiff_t])
+else
+   AC_MSG_ERROR([Unable to find the basic type of ptrdiff_t.])
+fi
+
 dnl Fixme: AC_SYS_POSIX_TERMIOS should probably be used, but it's not clear
 dnl how the tty code is related to POSIX and/or other versions of termios.
 dnl The following looks like a useful start.

=== modified file 'src/alloc.c'
--- src/alloc.c	2012-07-03 16:35:53 +0000
+++ src/alloc.c	2012-07-04 08:09:13 +0000
@@ -147,20 +147,14 @@
 /* Mark, unmark, query mark bit of a Lisp string.  S must be a pointer
    to a struct Lisp_String.  */
 
-#define MARK_STRING(S)		((S)->size |= ARRAY_MARK_FLAG)
-#define UNMARK_STRING(S)	((S)->size &= ~ARRAY_MARK_FLAG)
-#define STRING_MARKED_P(S)	(((S)->size & ARRAY_MARK_FLAG) != 0)
+#define MARK_STRING(S)		((S)->u.imm.gcmarkbit = 1)
+#define UNMARK_STRING(S)	((S)->u.imm.gcmarkbit = 0)
+#define STRING_MARKED_P(S)	((S)->u.imm.gcmarkbit)
 
 #define VECTOR_MARK(V)		((V)->header.size |= ARRAY_MARK_FLAG)
 #define VECTOR_UNMARK(V)	((V)->header.size &= ~ARRAY_MARK_FLAG)
 #define VECTOR_MARKED_P(V)	(((V)->header.size & ARRAY_MARK_FLAG) != 0)
 
-/* Value is the number of bytes of S, a pointer to a struct Lisp_String.
-   Be careful during GC, because S->size contains the mark bit for
-   strings.  */
-
-#define GC_STRING_BYTES(S)	(STRING_BYTES (S))
-
 /* Global variables.  */
 struct emacs_globals globals;
 
@@ -394,6 +388,7 @@
 static void mark_stack (void);
 static int live_vector_p (struct mem_node *, void *);
 static int live_buffer_p (struct mem_node *, void *);
+static int live_string_data_p (struct Lisp_String *);
 static int live_string_p (struct mem_node *, void *);
 static int live_cons_p (struct mem_node *, void *);
 static int live_symbol_p (struct mem_node *, void *);
@@ -1709,6 +1704,37 @@
 
 static EMACS_INT total_string_size;
 
+#ifdef GC_STRING_STATS
+
+/* All of these counters are invalid between GCs
+   because they are updated at the sweep phase.  */
+
+/* Number of live immediate strings.  */
+
+static EMACS_INT total_imm_strings;
+
+/* Amount of data bytes used by them.  */
+
+static EMACS_INT total_imm_bytes;
+
+/* Number of intervals attached to them.  */
+
+static EMACS_INT total_imm_intervals;
+
+/* Number of live normal strings.  */
+
+static EMACS_INT total_dat_strings;
+
+/* Amount of data bytes used by them.  */
+
+static EMACS_INT total_dat_bytes;
+
+/* Number of intervals attached to them.  */
+
+static EMACS_INT total_dat_intervals;
+
+#endif /* GC_STRING_STATS */
+
 /* Given a pointer to a Lisp_String S which is on the free-list
    string_free_list, return a pointer to its successor in the
    free-list.  */
@@ -1720,7 +1746,8 @@
    a pointer to the `u.data' member of its sdata structure; the
    structure starts at a constant offset in front of that.  */
 
-#define SDATA_OF_STRING(S) ((struct sdata *) ((S)->data - SDATA_DATA_OFFSET))
+#define SDATA_OF_STRING(S) ((S)->u.imm.immbit ? (struct sdata *) NULL \
+  : ((struct sdata *) ((S)->u.dat.data - SDATA_DATA_OFFSET)))
 
 
 #ifdef GC_CHECK_STRING_OVERRUN
@@ -1797,26 +1824,32 @@
   empty_multibyte_string = make_pure_string ("", 0, 0, 1);
 }
 
-
 #ifdef GC_CHECK_STRING_BYTES
 
 static int check_string_bytes_count;
 
-#define CHECK_STRING_BYTES(S)	STRING_BYTES (S)
-
-
-/* Like GC_STRING_BYTES, but with debugging check.  */
+#define CHECK_STRING_BYTES(S) string_bytes (S)
 
 ptrdiff_t
 string_bytes (struct Lisp_String *s)
 {
-  ptrdiff_t nbytes =
-    (s->size_byte < 0 ? s->size & ~ARRAY_MARK_FLAG : s->size_byte);
-
-  if (!PURE_POINTER_P (s)
-      && s->data
-      && nbytes != SDATA_NBYTES (SDATA_OF_STRING (s)))
-    abort ();
+  ptrdiff_t nbytes;
+
+  if (s->u.imm.immbit)
+    {
+      nbytes = s->u.imm.size_byte == STRING_UNIBYTE_IMM_MARK ?
+	s->u.imm.size : s->u.imm.size_byte;
+      eassert (nbytes < STRING_IMM_SIZE);
+    }
+  else
+    {
+      nbytes = s->u.dat.size_byte == STRING_UNIBYTE_DAT_MARK ?
+	s->u.dat.size : s->u.dat.size_byte;
+      eassert (nbytes >= STRING_IMM_SIZE);
+      if (!PURE_POINTER_P (s) && s->u.dat.data)
+	eassert (nbytes == SDATA_NBYTES (SDATA_OF_STRING (s)));
+    }
+
   return nbytes;
 }
 
@@ -1841,7 +1874,7 @@
 	CHECK_STRING_BYTES (from->string);
 
       if (from->string)
-	nbytes = GC_STRING_BYTES (from->string);
+	nbytes = string_bytes (from->string);
       else
 	nbytes = SDATA_NBYTES (from);
 
@@ -1926,8 +1959,9 @@
       for (i = STRING_BLOCK_SIZE - 1; i >= 0; --i)
 	{
 	  s = b->strings + i;
-	  /* Every string on a free list should have NULL data pointer.  */
-	  s->data = NULL;
+	  /* Every string on a free list is immediate.  */
+	  s->u.imm.immbit = 1;
+	  s->u.imm.gcmarkbit = 0;
 	  NEXT_FREE_LISP_STRING (s) = string_free_list;
 	  string_free_list = s;
 	}
@@ -2043,13 +2077,11 @@
   MALLOC_UNBLOCK_INPUT;
 
   data->string = s;
-  s->data = SDATA_DATA (data);
+  s->u.dat.data = SDATA_DATA (data);
 #ifdef GC_CHECK_STRING_BYTES
   SDATA_NBYTES (data) = nbytes;
 #endif
-  s->size = nchars;
-  s->size_byte = nbytes;
-  s->data[nbytes] = '\0';
+  s->u.dat.data[nbytes] = '\0';
 #ifdef GC_CHECK_STRING_OVERRUN
   memcpy ((char *) data + needed, string_overrun_cookie,
 	  GC_STRING_OVERRUN_COOKIE_SIZE);
@@ -2070,6 +2102,12 @@
   total_strings = total_free_strings = 0;
   total_string_size = 0;
 
+#ifdef GC_STRING_STATS
+  total_imm_strings = total_dat_strings = 0;
+  total_imm_bytes = total_dat_bytes = 0;
+  total_imm_intervals = total_dat_intervals = 0;
+#endif
+
   /* Scan strings_blocks, free Lisp_Strings that aren't marked.  */
   for (b = string_blocks; b; b = next)
     {
@@ -2082,49 +2120,64 @@
 	{
 	  struct Lisp_String *s = b->strings + i;
 
-	  if (s->data)
+	  if (STRING_MARKED_P (s))
+	    {	      
+	      /* String is live; unmark it and its intervals.  */
+	      UNMARK_STRING (s);
+
+	      if (!NULL_INTERVAL_P (s->intervals))
+		UNMARK_BALANCE_INTERVALS (s->intervals);
+
+	      ++total_strings;
+	      total_string_size += string_bytes (s);
+#ifdef GC_STRING_STATS
+	      if (s->u.imm.immbit)
+		{
+		  total_imm_strings++;
+		  total_imm_bytes += string_bytes (s);
+		  if (!NULL_INTERVAL_P (s->intervals))
+		    total_imm_intervals++;
+		}
+	      else
+		{
+		  total_dat_strings++;
+		  total_dat_bytes += string_bytes (s);
+		  if (!NULL_INTERVAL_P (s->intervals))
+		    total_dat_intervals++;
+		}
+#endif /* GC_STRING_STATS */
+	    }
+	  else
 	    {
-	      /* String was not on free-list before.  */
-	      if (STRING_MARKED_P (s))
-		{
-		  /* String is live; unmark it and its intervals.  */
-		  UNMARK_STRING (s);
-
-		  if (!NULL_INTERVAL_P (s->intervals))
-		    UNMARK_BALANCE_INTERVALS (s->intervals);
-
-		  ++total_strings;
-		  total_string_size += STRING_BYTES (s);
-		}
+	      if (s->u.imm.immbit)
+		/* Fill data with special pattern. Used
+		   by GC to find dead immediate strings.  */
+		memset (s->u.imm.data, 0xff, STRING_IMM_SIZE);
 	      else
 		{
-		  /* String is dead.  Put it on the free-list.  */
-		  struct sdata *data = SDATA_OF_STRING (s);
+		  if (s->u.dat.data)
+		    {
+		      /* String is dead.  Put it on the free-list.  */
+		      struct sdata *data = SDATA_OF_STRING (s);
 
-		  /* Save the size of S in its sdata so that we know
-		     how large that is.  Reset the sdata's string
-		     back-pointer so that we know it's free.  */
+		      /* Save the size of S in its sdata so that we know
+			 how large that is.  Reset the sdata's string
+			 back-pointer so that we know it's free.  */
 #ifdef GC_CHECK_STRING_BYTES
-		  if (GC_STRING_BYTES (s) != SDATA_NBYTES (data))
-		    abort ();
+		      if (string_bytes (s) != SDATA_NBYTES (data))
+			abort ();
 #else
-		  data->u.nbytes = GC_STRING_BYTES (s);
+		      data->u.nbytes = string_bytes (s);
 #endif
-		  data->string = NULL;
-
-		  /* Reset the strings's `data' member so that we
-		     know it's free.  */
-		  s->data = NULL;
-
-		  /* Put the string on the free-list.  */
-		  NEXT_FREE_LISP_STRING (s) = string_free_list;
-		  string_free_list = s;
-		  ++nfree;
+		      data->string = NULL;
+
+		      /* Reset the strings's `data' member so that we
+			 know it's free.  */
+		      s->u.dat.data = NULL;
+		    }
 		}
-	    }
-	  else
-	    {
-	      /* S was on the free-list before.  Put it there again.  */
+
+	      /* Put the string on the free-list.  */
 	      NEXT_FREE_LISP_STRING (s) = string_free_list;
 	      string_free_list = s;
 	      ++nfree;
@@ -2216,12 +2269,12 @@
 	  /* Check that the string size recorded in the string is the
 	     same as the one recorded in the sdata structure. */
 	  if (from->string
-	      && GC_STRING_BYTES (from->string) != SDATA_NBYTES (from))
+	      && string_bytes (from->string) != SDATA_NBYTES (from))
 	    abort ();
 #endif /* GC_CHECK_STRING_BYTES */
 
 	  if (from->string)
-	    nbytes = GC_STRING_BYTES (from->string);
+	    nbytes = string_bytes (from->string);
 	  else
 	    nbytes = SDATA_NBYTES (from);
 
@@ -2257,7 +2310,7 @@
 		{
 		  eassert (tb != b || to < from);
 		  memmove (to, from, nbytes + GC_STRING_EXTRA);
-		  to->string->data = SDATA_DATA (to);
+		  to->string->u.dat.data = SDATA_DATA (to);
 		}
 
 	      /* Advance past the sdata we copied to.  */
@@ -2497,8 +2550,25 @@
     return empty_multibyte_string;
 
   s = allocate_string ();
+  s->u.imm.gcmarkbit = 0;
+
+  if (nbytes < STRING_IMM_SIZE)
+    {
+      s->u.imm.immbit = 1;
+      s->u.imm.data[nbytes] = '\0';
+      s->u.imm.size = nchars;
+      s->u.imm.size_byte = nbytes;
+    }
+  else
+    {
+      s->u.imm.immbit = 0;
+      s->u.dat.data = NULL;
+      s->u.dat.size = nchars;
+      s->u.dat.size_byte = nbytes;
+      allocate_string_data (s, nchars, nbytes);
+    }
+
   s->intervals = NULL_INTERVAL;
-  allocate_string_data (s, nchars, nbytes);
   XSETSTRING (string, s);
   string_chars_consed += nbytes;
   return string;
@@ -4196,6 +4266,22 @@
   x->color = MEM_BLACK;
 }
 
+/* Non-zero if data of S is valid.  */
+
+static inline int
+live_string_data_p (struct Lisp_String *s)
+{
+  if (s->u.imm.immbit)
+    {
+      unsigned char *p;
+
+      for (p = s->u.imm.data; p < s->u.imm.data + STRING_IMM_SIZE; p++)
+	if (*p != 0xff)
+	  return 1;
+      return 0;
+    }
+  return s->u.dat.data != NULL;
+}
 
 /* Value is non-zero if P is a pointer to a live Lisp string on
    the heap.  M is a pointer to the mem_block for P.  */
@@ -4213,7 +4299,7 @@
       return (offset >= 0
 	      && offset % sizeof b->strings[0] == 0
 	      && offset < (STRING_BLOCK_SIZE * sizeof b->strings[0])
-	      && ((struct Lisp_String *) p)->data != NULL);
+	      && live_string_data_p ((struct Lisp_String *) p));
     }
   else
     return 0;
@@ -5159,15 +5245,29 @@
   struct Lisp_String *s;
 
   s = (struct Lisp_String *) pure_alloc (sizeof *s, Lisp_String);
-  s->data = (unsigned char *) find_string_data_in_pure (data, nbytes);
-  if (s->data == NULL)
-    {
-      s->data = (unsigned char *) pure_alloc (nbytes + 1, -1);
-      memcpy (s->data, data, nbytes);
-      s->data[nbytes] = '\0';
-    }
-  s->size = nchars;
-  s->size_byte = multibyte ? nbytes : -1;
+
+  if (nbytes < STRING_IMM_SIZE)
+    {
+      memcpy (s->u.imm.data, data, nbytes);
+      s->u.imm.data[nbytes] = '\0';
+      s->u.imm.immbit = 1;
+      s->u.imm.size = nchars;
+      s->u.imm.size_byte = multibyte ? nbytes : STRING_UNIBYTE_IMM_MARK;
+    }
+  else
+    {
+      s->u.dat.data = (unsigned char *) find_string_data_in_pure (data, nbytes);
+      if (s->u.dat.data == NULL)
+	{
+	  s->u.dat.data = (unsigned char *) pure_alloc (nbytes + 1, -1);
+	  memcpy (s->u.dat.data, data, nbytes);
+	  s->u.dat.data[nbytes] = '\0';
+	}
+      s->u.imm.immbit = 0;
+      s->u.dat.size = nchars;
+      s->u.dat.size_byte = multibyte ? nbytes : STRING_UNIBYTE_DAT_MARK;
+    }
+
   s->intervals = NULL_INTERVAL;
   XSETSTRING (string, s);
   return string;
@@ -5184,9 +5284,23 @@
   ptrdiff_t nchars = strlen (data);
 
   s = (struct Lisp_String *) pure_alloc (sizeof *s, Lisp_String);
-  s->size = nchars;
-  s->size_byte = -1;
-  s->data = (unsigned char *) data;
+
+  if (nchars < STRING_IMM_SIZE)
+    {
+      memcpy (s->u.imm.data, data, nchars);
+      s->u.imm.data[nchars] = '\0';
+      s->u.imm.immbit = 1;
+      s->u.imm.size = nchars;
+      s->u.imm.size_byte = STRING_UNIBYTE_IMM_MARK;
+    }
+  else
+    {
+      s->u.dat.data = (unsigned char *) data;
+      s->u.imm.immbit = 0;
+      s->u.dat.size = nchars;
+      s->u.dat.size_byte = STRING_UNIBYTE_DAT_MARK;
+    }
+
   s->intervals = NULL_INTERVAL;
   XSETSTRING (string, s);
   return string;
@@ -6587,6 +6701,34 @@
   return Flist (8, consed);
 }
 
+#ifdef GC_STRING_STATS
+
+DEFUN ("string-stats", Fstring_stats, Sstring_stats, 0, 0, 0,
+       doc: /* Return a list of counters that measures how much
+strings of a particular internal structure are alive after last
+garbage collection, and how many bytes are in them.
+The elements of the value are are as follows:
+  (IMM-STRINGS IMM-BYTES IMM-INTERVALS DAT-STRINGS DAT-BYTES DAT-INTERVALS)
+where IMM-STRINGS is the number of immediate strings, IMM-BYTES is the total
+number of bytes in them, and IMM-INTERVALS is the number of immediate string
+with non-nil text properties. The rest three numbers has the same meaning
+for normal strings, respectively.  */)
+  (void)
+{
+  Lisp_Object data[6];
+
+  data[0] = make_number (min (MOST_POSITIVE_FIXNUM, total_imm_strings));
+  data[1] = make_number (min (MOST_POSITIVE_FIXNUM, total_imm_bytes));
+  data[2] = make_number (min (MOST_POSITIVE_FIXNUM, total_imm_intervals));
+  data[3] = make_number (min (MOST_POSITIVE_FIXNUM, total_dat_strings));
+  data[4] = make_number (min (MOST_POSITIVE_FIXNUM, total_dat_bytes));
+  data[5] = make_number (min (MOST_POSITIVE_FIXNUM, total_dat_intervals));
+
+  return Flist (6, data);
+}
+
+#endif /* GC_STRING_STATS */
+
 /* Find at most FIND_MAX symbols which have OBJ as their value or
    function.  This is used in gdbinit's `xwhichsymbols' command.  */
 
@@ -6815,7 +6957,9 @@
   defsubr (&Sgarbage_collect);
   defsubr (&Smemory_limit);
   defsubr (&Smemory_use_counts);
-
+#ifdef GC_STRING_STATS
+  defsubr (&Sstring_stats);
+#endif
 #if GC_MARK_STACK == GC_USE_GCPROS_CHECK_ZOMBIES
   defsubr (&Sgc_status);
 #endif

=== modified file 'src/fns.c'
--- src/fns.c	2012-06-28 07:50:27 +0000
+++ src/fns.c	2012-07-04 08:09:13 +0000
@@ -2166,8 +2166,8 @@
 	  int len = CHAR_STRING (charval, str);
 	  ptrdiff_t size_byte = SBYTES (array);
 
-	  if (INT_MULTIPLY_OVERFLOW (SCHARS (array), len)
-	      || SCHARS (array) * len != size_byte)
+	  if (INT_MULTIPLY_OVERFLOW (size, len)
+	      || size * len != size_byte)
 	    error ("Attempt to change byte length of a string");
 	  for (idx = 0; idx < size_byte; idx++)
 	    *p++ = str[idx % len];

=== modified file 'src/lisp.h'
--- src/lisp.h	2012-07-03 20:34:47 +0000
+++ src/lisp.h	2012-07-04 08:09:13 +0000
@@ -69,7 +69,8 @@
     BITS_PER_SHORT     = CHAR_BIT * sizeof (short),
     BITS_PER_INT       = CHAR_BIT * sizeof (int),
     BITS_PER_LONG      = CHAR_BIT * sizeof (long int),
-    BITS_PER_EMACS_INT = CHAR_BIT * sizeof (EMACS_INT)
+    BITS_PER_EMACS_INT = CHAR_BIT * sizeof (EMACS_INT),
+    BITS_PER_PTRDIFF_T = CHAR_BIT * sizeof (ptrdiff_t)
   };
 
 /* printmax_t and uprintmax_t are types for printing large integers.
@@ -576,23 +577,6 @@
    eassert ((IDX) >= 0 && (IDX) < ASIZE (ARRAY)),	\
    AREF ((ARRAY), (IDX)) = (VAL))
 
-/* Convenience macros for dealing with Lisp strings.  */
-
-#define SDATA(string)		(XSTRING (string)->data + 0)
-#define SREF(string, index)	(SDATA (string)[index] + 0)
-#define SSET(string, index, new) (SDATA (string)[index] = (new))
-#define SCHARS(string)		(XSTRING (string)->size + 0)
-#define SBYTES(string)		(STRING_BYTES (XSTRING (string)) + 0)
-
-/* Avoid "differ in sign" warnings.  */
-#define SSDATA(x)  ((char *) SDATA (x))
-
-#define STRING_SET_CHARS(string, newsize) \
-    (XSTRING (string)->size = (newsize))
-
-#define STRING_COPYIN(string, index, new, count) \
-    memcpy (SDATA (string) + index, new, count)
-
 /* Type checking.  */
 
 #define CHECK_TYPE(ok, Qxxxp, x) \
@@ -678,24 +662,40 @@
 #define CDR_SAFE(c)				\
   (CONSP ((c)) ? XCDR ((c)) : Qnil)
 
+/* Convenience macros for dealing with Lisp strings.  */
+
+#define SDATA(string)		(XSTRING (string)->u.imm.immbit ? \
+				 (XSTRING (string)->u.imm.data) : \
+				 (XSTRING (string)->u.dat.data))
+#define SREF(string, index)	(SDATA (string)[index] + 0)
+#define SSET(string, index, new) (SDATA (string)[index] = (new))
+#define SCHARS(string)		(XSTRING (string)->u.imm.immbit ? \
+				 (XSTRING (string)->u.imm.size) : \
+				 (XSTRING (string)->u.dat.size))
+#define SBYTES(string)		(string_bytes (XSTRING (string)))
+
+/* Avoid "differ in sign" warnings.  */
+#define SSDATA(x)  ((char *) SDATA (x))
+
+#define STRING_SET_CHARS(string, newsize)	\
+  (XSTRING (string)->u.imm.immbit ?		\
+   (XSTRING (string)->u.imm.size = (newsize)) : \
+   (XSTRING (string)->u.dat.size = (newsize)))
+
+#define STRING_COPYIN(string, index, new, count)	\
+  memcpy (SDATA (string) + index, new, count)
+
+/* For unibyte immediate string, SIZE_BYTE field is always set to this.  */
+#define STRING_UNIBYTE_IMM_MARK ((1 << (BITS_PER_CHAR - 1)) - 1)
+
+/* For unibyte normal string, SIZE_BYTE field is always set to this.  */
+#define STRING_UNIBYTE_DAT_MARK ((1UL << (BITS_PER_PTRDIFF_T - 1)) - 1)
+
 /* Nonzero if STR is a multibyte string.  */
-#define STRING_MULTIBYTE(STR)  \
-  (XSTRING (STR)->size_byte >= 0)
-
-/* Return the length in bytes of STR.  */
-
-#ifdef GC_CHECK_STRING_BYTES
-
-struct Lisp_String;
-extern ptrdiff_t string_bytes (struct Lisp_String *);
-#define STRING_BYTES(S) string_bytes ((S))
-
-#else /* not GC_CHECK_STRING_BYTES */
-
-#define STRING_BYTES(STR)  \
-  ((STR)->size_byte < 0 ? (STR)->size : (STR)->size_byte)
-
-#endif /* not GC_CHECK_STRING_BYTES */
+#define STRING_MULTIBYTE(string)					\
+  (XSTRING (string)->u.imm.immbit ?					\
+   (XSTRING (string)->u.imm.size_byte != STRING_UNIBYTE_IMM_MARK) :	\
+   (XSTRING (string)->u.dat.size_byte != STRING_UNIBYTE_DAT_MARK))
 
 /* An upper bound on the number of bytes in a Lisp string, not
    counting the terminating null.  This a tight enough bound to
@@ -708,21 +708,33 @@
    would expose alloc.c internal details that we'd rather keep
    private.  The cast to ptrdiff_t ensures that STRING_BYTES_BOUND is
    signed.  */
-#define STRING_BYTES_BOUND  \
-  min (MOST_POSITIVE_FIXNUM, (ptrdiff_t) min (SIZE_MAX, PTRDIFF_MAX) - 1)
+#define STRING_BYTES_BOUND						\
+  min (MOST_POSITIVE_FIXNUM,						\
+       (ptrdiff_t) min (SIZE_MAX, STRING_UNIBYTE_DAT_MARK - 1) - 1)
+
+/* Maximum amount of bytes, including '\0', in an immediate string.  */
+#define STRING_IMM_SIZE (sizeof (void *) + 2 * sizeof (TYPE_PTRDIFF_T) - 2)
 
 /* Mark STR as a unibyte string.  */
 #define STRING_SET_UNIBYTE(STR)  \
-  do { if (EQ (STR, empty_multibyte_string))  \
-      (STR) = empty_unibyte_string;  \
-    else XSTRING (STR)->size_byte = -1; } while (0)
+  do { if (EQ (STR, empty_multibyte_string))			\
+      (STR) = empty_unibyte_string;				\
+    else if (XSTRING (STR)->u.imm.immbit)			\
+      XSTRING (STR)->u.imm.size_byte = STRING_UNIBYTE_IMM_MARK;	\
+    else							\
+      XSTRING (STR)->u.dat.size_byte = STRING_UNIBYTE_DAT_MARK;	\
+  } while (0)
 
 /* Mark STR as a multibyte string.  Assure that STR contains only
    ASCII characters in advance.  */
-#define STRING_SET_MULTIBYTE(STR)  \
-  do { if (EQ (STR, empty_unibyte_string))  \
-      (STR) = empty_multibyte_string;  \
-    else XSTRING (STR)->size_byte = XSTRING (STR)->size; } while (0)
+#define STRING_SET_MULTIBYTE(STR)				  \
+  do { if (EQ (STR, empty_unibyte_string))			  \
+      (STR) = empty_multibyte_string;				  \
+    else if (XSTRING (STR)->u.imm.immbit)			  \
+      XSTRING (STR)->u.imm.size_byte = XSTRING (STR)->u.imm.size; \
+    else							  \
+      XSTRING (STR)->u.dat.size_byte = XSTRING (STR)->u.dat.size; \
+  } while (0)
 
 /* Get text properties.  */
 #define STRING_INTERVALS(STR)  (XSTRING (STR)->intervals + 0)
@@ -730,15 +742,54 @@
 /* Set text properties.  */
 #define STRING_SET_INTERVALS(STR, INT) (XSTRING (STR)->intervals = (INT))
 
-/* In a string or vector, the sign bit of the `size' is the gc mark bit */
-
 struct Lisp_String
-  {
-    ptrdiff_t size;
-    ptrdiff_t size_byte;
-    INTERVAL intervals;		/* text properties in this string */
-    unsigned char *data;
-  };
+{
+  /* Text properties in this string.  Should be the first
+     member since NEXT_FREE_LISP_STRING from alloc.c uses it.  */
+  INTERVAL intervals;
+
+  union {
+    /* GC mark bit and subtype bit are in IMM just by convention - when
+       IMMBIT is 0, the DAT field is used except it's UNUSED field.  */
+    struct {
+      unsigned immbit : 1;
+      unsigned size : BITS_PER_CHAR - 1;
+      unsigned char data[STRING_IMM_SIZE];
+      unsigned size_byte : BITS_PER_CHAR - 1;
+      unsigned gcmarkbit : 1;
+    } imm;
+
+    struct {
+      unsigned immbit : 1;
+      unsigned TYPE_PTRDIFF_T size : BITS_PER_PTRDIFF_T - 1;
+      unsigned char *data;
+      unsigned TYPE_PTRDIFF_T size_byte : BITS_PER_PTRDIFF_T - 1;
+      unsigned gcmarkbit : 1;
+    } dat;
+  } u;
+};
+
+/* Return the length in bytes of STR.  */
+
+#ifdef GC_CHECK_STRING_BYTES
+
+struct Lisp_String;
+extern ptrdiff_t string_bytes (struct Lisp_String *);
+#define STRING_BYTES(S) string_bytes ((S))
+
+#else /* not GC_CHECK_STRING_BYTES */
+
+static inline
+ptrdiff_t string_bytes (struct Lisp_String *s)
+{
+  if (s->u.imm.immbit)
+    return s->u.imm.size_byte == STRING_UNIBYTE_IMM_MARK ?
+      s->u.imm.size : s->u.imm.size_byte;
+  return s->u.dat.size_byte == STRING_UNIBYTE_DAT_MARK ?
+    s->u.dat.size : s->u.dat.size_byte;
+}
+
+#endif /* not GC_CHECK_STRING_BYTES */
 
 /* Header of vector-like objects.  This documents the layout constraints on
    vectors and pseudovectors other than struct Lisp_Subr.  It also prevents


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: Old topic(s) again [was: Re: Proposal: immediate strings]
  2012-07-04  8:27                       ` Old topic(s) again [was: Re: Proposal: immediate strings] Dmitry Antipov
@ 2012-07-04 13:08                         ` Stefan Monnier
  2012-07-04 19:32                           ` Paul Eggert
  0 siblings, 1 reply; 30+ messages in thread
From: Stefan Monnier @ 2012-07-04 13:08 UTC (permalink / raw)
  To: Dmitry Antipov; +Cc: emacs-devel

> This feature is not thrown away, so I would like to get more comments
> around it.

I'm still not sure it's worth the trouble (although I've been running
here with a similar patch for the last several years).

A few comments below.

> +dnl Determine the basic type of ptrdiff_t.
> +AC_CHECK_SIZEOF([int])
> +AC_CHECK_SIZEOF([long])
> +AC_CHECK_SIZEOF([long long])
> +AC_CHECK_SIZEOF([ptrdiff_t])
> +AC_MSG_CHECKING([for the basic type of ptrdiff_t])
> +if test $ac_cv_sizeof_int = $ac_cv_sizeof_ptrdiff_t; then
> +   emacs_cv_type_ptrdiff_t="int"
> +elif test $ac_cv_sizeof_long = $ac_cv_sizeof_ptrdiff_t; then
> +   emacs_cv_type_ptrdiff_t="long"
> +elif test $ac_cv_sizeof_long_long = $ac_cv_sizeof_ptrdiff_t; then
> +   emacs_cv_type_ptrdiff_t="long long"
> +else
> +   emacs_cv_type_ptrdiff_t="unknown"
> +fi
> +AC_MSG_RESULT([$emacs_cv_type_ptrdiff_t])
> +if test $emacs_cv_type_ptrdiff_t != "unknown"; then
> +   AC_DEFINE_UNQUOTED([TYPE_PTRDIFF_T], [$emacs_cv_type_ptrdiff_t],
> +                      [Define to the basic type of ptrdiff_t])
> +else
> +   AC_MSG_ERROR([Unable to find the basic type of ptrdiff_t.])
> +fi

I really would much rather avoid such things.  Why do we need it?

> -	      && ((struct Lisp_String *) p)->data != NULL);
> +	      && live_string_data_p ((struct Lisp_String *) p));

Why can't we keep using the same simple test (combined with testing
immbit, obviously)?

> -/* Convenience macros for dealing with Lisp strings.  */

Why do you move this block of definitions?

> +{
> +  /* Text properties in this string.  Should be the first
> +     member since NEXT_FREE_LISP_STRING from alloc.c uses it.  */
> +  INTERVAL intervals;

I don't understand this comment (you haven't changed
NEXT_FREE_LISP_STRING AFAICT, but the field is not first in the current
code).

> +  union {
> +    /* GC mark bit and subtype bit are in IMM just by convention - when
> +       IMMBIT is 0, the DAT field is used except it's UNUSED field.  */

This comment seems out of date: gcmarkbit seems to be in both `imm' and
`dat', and I don't see any "subtype" bit, especially not one that's only
in `imm'.  Finally, I can't find any `unused' field either.

> +    struct {
> +      unsigned immbit : 1;
> +      unsigned size : BITS_PER_CHAR - 1;
> +      unsigned char data[STRING_IMM_SIZE];
> +      unsigned size_byte : BITS_PER_CHAR - 1;
> +      unsigned gcmarkbit : 1;
> +    } imm;
> +
> +    struct {
> +      unsigned immbit : 1;
> +      unsigned TYPE_PTRDIFF_T size : BITS_PER_PTRDIFF_T - 1;
> +      unsigned char *data;
> +      unsigned TYPE_PTRDIFF_T size_byte : BITS_PER_PTRDIFF_T - 1;
> +      unsigned gcmarkbit : 1;
> +    } dat;
> +  } u;
> +};

The comment should explain here the use of STRING_UNIBYTE_IMM/DAT_MARK.

One more thing: while I'm quite willing to believe that this placement
of gcmarkbit at the end of both structs (placed after differently-sized
bit fields) works fine in practice, I'd be interested to know to what
extent the C language guarantees that it will work.


        Stefan



^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: Old topic(s) again [was: Re: Proposal: immediate strings]
  2012-07-04 13:08                         ` Stefan Monnier
@ 2012-07-04 19:32                           ` Paul Eggert
  0 siblings, 0 replies; 30+ messages in thread
From: Paul Eggert @ 2012-07-04 19:32 UTC (permalink / raw)
  To: Stefan Monnier; +Cc: Dmitry Antipov, emacs-devel

On 07/04/2012 06:08 AM, Stefan Monnier wrote:
>> +dnl Determine the basic type of ptrdiff_t.
>> +AC_CHECK_SIZEOF([int])
>> +...
> 
> I really would much rather avoid such things.  Why do we need it?

We shouldn't need it.  Instead of sizeof (TYPE_PTRDIFF_T), the code
can use sizeof (ptrdiff_t).  And instead of a member declaration:

      unsigned TYPE_PTRDIFF_T size : BITS_PER_PTRDIFF_T - 1;

the code should be able to use size_t:

      size_t size : BITS_PER_PTRDIFF_T - 1;

The C standard doesn't guarantee this to work, but compilers generally
support it and if we run into any exceptions we can deal with them the
same way that ENUM_BF deals with enum bitfields.

> while I'm quite willing to believe that this placement
> of gcmarkbit at the end of both structs (placed after differently-sized
> bit fields) works fine in practice, I'd be interested to know to what
> extent the C language guarantees that it will work.

There's no guarantee.  The only guarantees are that members are laid
out in increasing address order, that they're aligned, and that the
first member is at offset zero.  There's no guarantee about where
later members are placed: there can be padding pretty much anywhere
after the first member.

We can easily verify at compile-time that the two layouts have the
same overall size and alignment; that should catch many potential
errors in this area, though not all.

I did not review the meat of the patch carefully, but
I noticed some minor Emacs style indenting issues, e.g.:

      nbytes = s->u.imm.size_byte == STRING_UNIBYTE_IMM_MARK ?
	s->u.imm.size : s->u.imm.size_byte;

should be:

      nbytes = (s->u.imm.size_byte == STRING_UNIBYTE_IMM_MARK
                ? s->u.imm.size : s->u.imm.size_byte);

^ permalink raw reply	[flat|nested] 30+ messages in thread

end of thread, other threads:[~2012-07-04 19:32 UTC | newest]

Thread overview: 30+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-05-22  8:44 Proposal: immediate strings Dmitry Antipov
2012-05-22 20:51 ` Miles Bader
2012-05-22 22:13   ` Paul Eggert
2012-05-24  5:17 ` Stefan Monnier
2012-05-24  5:41   ` Ken Raeburn
2012-05-24  5:50     ` Miles Bader
2012-05-24  6:08   ` Paul Eggert
2012-05-24  7:14     ` Stefan Monnier
2012-05-24  7:52       ` Paul Eggert
2012-05-24 12:51         ` Stefan Monnier
2012-05-24 16:35           ` Paul Eggert
2012-05-25  6:43             ` Dmitry Antipov
2012-05-25  7:30               ` Paul Eggert
2012-05-28 11:32       ` Dmitry Antipov
2012-05-28 14:25         ` Stefan Monnier
2012-05-29  6:55   ` Dmitry Antipov
2012-05-29  7:38     ` Paul Eggert
2012-05-29 13:33       ` Dmitry Antipov
2012-05-29 15:24         ` Paul Eggert
2012-05-31  9:28           ` Dmitry Antipov
2012-05-31 16:34             ` Paul Eggert
2012-06-06  6:14               ` Dmitry Antipov
2012-06-06  6:41                 ` Paul Eggert
2012-06-06  7:29                   ` Dmitry Antipov
2012-06-06 15:14                     ` Eli Zaretskii
2012-06-06 21:44                       ` Paul Eggert
2012-07-04  8:27                       ` Old topic(s) again [was: Re: Proposal: immediate strings] Dmitry Antipov
2012-07-04 13:08                         ` Stefan Monnier
2012-07-04 19:32                           ` Paul Eggert
2012-05-29  7:38     ` Proposal: immediate strings Andreas Schwab

Code repositories for project(s) associated with this public inbox

	https://git.savannah.gnu.org/cgit/emacs.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).