all messages for Emacs-related lists mirrored at yhetil.org
 help / color / mirror / code / Atom feed
* Untagging by subtraction instead of masking on USE_LSB_TAG
@ 2008-01-28  2:07 YAMAMOTO Mitsuharu
  2008-01-28  3:52 ` Thien-Thi Nguyen
  2008-01-28 15:07 ` Stefan Monnier
  0 siblings, 2 replies; 6+ messages in thread
From: YAMAMOTO Mitsuharu @ 2008-01-28  2:07 UTC (permalink / raw)
  To: emacs-devel

Sorry if this has already been discussed before.

On USE_LSB_TAG environments, untagging of Lisp_Object can be performed
by subtracting a small number instead of bit masking.  The subtraction
enables us to optimize the typical sequence of untagging ->
dereference -> struct-member-access into a single instruction.  Below
is the result of gcc -Os in PowerPC with masking (left) and
subtraction (right).

  _cons_to_long:                  _cons_to_long:
	  andi. r0,r3,7                   andi. r0,r3,7
	  srawi r0,r3,3                   srawi r0,r3,3
	  beq cr0,L592                    beq cr0,L592
	  rlwinm r2,r3,0,0,28
	  lwz r9,4(r2)                    lwz r9,-1(r3)
	  lwz r3,0(r2)                    lwz r3,-5(r3)
	  rlwinm r0,r9,0,29,31            rlwinm r0,r9,0,29,31
	  cmpwi cr7,r0,5                  cmpwi cr7,r0,5
	  bne cr7,L593                    bne cr7,L593
	  rlwinm r2,r9,0,0,28
	  lwz r9,0(r2)                    lwz r9,-5(r9)
  L593:                           L593:
	  rlwinm r2,r3,13,0,15            rlwinm r2,r3,13,0,15
	  srawi r0,r9,3                   srawi r0,r9,3
	  or r0,r2,r0                     or r0,r2,r0
  L592:                           L592:
	  mr r3,r0                        mr r3,r0
	  blr                             blr

This would make sense if the latency of load/store does not depend on
its displacement (I'm not sure if that is the case in general).
Comments?

				     YAMAMOTO Mitsuharu
				mituharu@math.s.chiba-u.ac.jp

Index: src/lisp.h
===================================================================
RCS file: /cvsroot/emacs/emacs/src/lisp.h,v
retrieving revision 1.605
diff -c -r1.605 lisp.h
*** src/lisp.h	13 Jan 2008 22:08:06 -0000	1.605
--- src/lisp.h	28 Jan 2008 00:18:18 -0000
***************
*** 397,402 ****
--- 397,403 ----
  #define XSETFASTINT(a, b) ((a) = make_number (b))
  
  #define XPNTR(a) ((EMACS_INT) ((a) & ~TYPEMASK))
+ #define XUNTAG(a, type) ((EMACS_UINT) (a) - (type))
  
  #else  /* not USE_LSB_TAG */
  
***************
*** 509,514 ****
--- 510,516 ----
  #define XPNTR(a) ((EMACS_INT) XUINT (a))
  #endif
  #endif /* not HAVE_SHM */
+ #define XUNTAG(a, type) XPNTR (a)
  #endif /* no XPNTR */
  
  /* Largest and smallest representable fixnum values.  These are the C
***************
*** 528,542 ****
  
  /* Extract a value or address from a Lisp_Object.  */
  
! #define XCONS(a) (eassert (GC_CONSP(a)),(struct Lisp_Cons *) XPNTR(a))
! #define XVECTOR(a) (eassert (GC_VECTORLIKEP(a)),(struct Lisp_Vector *) XPNTR(a))
! #define XSTRING(a) (eassert (GC_STRINGP(a)),(struct Lisp_String *) XPNTR(a))
! #define XSYMBOL(a) (eassert (GC_SYMBOLP(a)),(struct Lisp_Symbol *) XPNTR(a))
! #define XFLOAT(a) (eassert (GC_FLOATP(a)),(struct Lisp_Float *) XPNTR(a))
  
  /* Misc types.  */
  
! #define XMISC(a)   ((union Lisp_Misc *) XPNTR(a))
  #define XMISCANY(a)	(eassert (MISCP (a)), &(XMISC(a)->u_any))
  #define XMISCTYPE(a)   (XMISCANY (a)->type)
  #define XMARKER(a)	(eassert (MARKERP (a)), &(XMISC(a)->u_marker))
--- 530,549 ----
  
  /* Extract a value or address from a Lisp_Object.  */
  
! #define XCONS(a) (eassert (GC_CONSP(a)), \
! 		  (struct Lisp_Cons *) XUNTAG(a, Lisp_Cons))
! #define XVECTOR(a) (eassert (GC_VECTORLIKEP(a)), \
! 		    (struct Lisp_Vector *) XUNTAG(a, Lisp_Vectorlike))
! #define XSTRING(a) (eassert (GC_STRINGP(a)), \
! 		    (struct Lisp_String *) XUNTAG(a, Lisp_String))
! #define XSYMBOL(a) (eassert (GC_SYMBOLP(a)), \
! 		    (struct Lisp_Symbol *) XUNTAG(a, Lisp_Symbol))
! #define XFLOAT(a) (eassert (GC_FLOATP(a)), \
! 		   (struct Lisp_Float *) XUNTAG(a, Lisp_Float))
  
  /* Misc types.  */
  
! #define XMISC(a)   ((union Lisp_Misc *) XUNTAG(a, Lisp_Misc))
  #define XMISCANY(a)	(eassert (MISCP (a)), &(XMISC(a)->u_any))
  #define XMISCTYPE(a)   (XMISCANY (a)->type)
  #define XMARKER(a)	(eassert (MARKERP (a)), &(XMISC(a)->u_marker))
***************
*** 554,566 ****
  
  /* Pseudovector types.  */
  
! #define XPROCESS(a) (eassert (GC_PROCESSP(a)),(struct Lisp_Process *) XPNTR(a))
! #define XWINDOW(a) (eassert (GC_WINDOWP(a)),(struct window *) XPNTR(a))
! #define XTERMINAL(a) (eassert (GC_TERMINALP(a)),(struct terminal *) XPNTR(a))
! #define XSUBR(a) (eassert (GC_SUBRP(a)),(struct Lisp_Subr *) XPNTR(a))
! #define XBUFFER(a) (eassert (GC_BUFFERP(a)),(struct buffer *) XPNTR(a))
! #define XCHAR_TABLE(a) (eassert (GC_CHAR_TABLE_P (a)), (struct Lisp_Char_Table *) XPNTR(a))
! #define XBOOL_VECTOR(a) (eassert (GC_BOOL_VECTOR_P (a)), (struct Lisp_Bool_Vector *) XPNTR(a))
  
  /* Construct a Lisp_Object from a value or address.  */
  
--- 561,580 ----
  
  /* Pseudovector types.  */
  
! #define XPROCESS(a) (eassert (GC_PROCESSP(a)), \
! 		     (struct Lisp_Process *) XUNTAG(a, Lisp_Vectorlike))
! #define XWINDOW(a) (eassert (GC_WINDOWP(a)), \
! 		    (struct window *) XUNTAG(a, Lisp_Vectorlike))
! #define XTERMINAL(a) (eassert (GC_TERMINALP(a)), \
! 		      (struct terminal *) XUNTAG(a, Lisp_Vectorlike))
! #define XSUBR(a) (eassert (GC_SUBRP(a)), \
! 		  (struct Lisp_Subr *) XUNTAG(a, Lisp_Vectorlike))
! #define XBUFFER(a) (eassert (GC_BUFFERP(a)), \
! 		    (struct buffer *) XUNTAG(a, Lisp_Vectorlike))
! #define XCHAR_TABLE(a) (eassert (GC_CHAR_TABLE_P (a)), \
! 			(struct Lisp_Char_Table *) XUNTAG(a, Lisp_Vectorlike))
! #define XBOOL_VECTOR(a) (eassert (GC_BOOL_VECTOR_P (a)), \
! 			 (struct Lisp_Bool_Vector *) XUNTAG(a, Lisp_Vectorlike))
  
  /* Construct a Lisp_Object from a value or address.  */
  
***************
*** 1089,1095 ****
  
  
  #define XHASH_TABLE(OBJ) \
!      ((struct Lisp_Hash_Table *) XPNTR (OBJ))
  
  #define XSET_HASH_TABLE(VAR, PTR) \
       (XSETPSEUDOVECTOR (VAR, PTR, PVEC_HASH_TABLE))
--- 1103,1109 ----
  
  
  #define XHASH_TABLE(OBJ) \
!      ((struct Lisp_Hash_Table *) XUNTAG(OBJ, Lisp_Vectorlike))
  
  #define XSET_HASH_TABLE(VAR, PTR) \
       (XSETPSEUDOVECTOR (VAR, PTR, PVEC_HASH_TABLE))
Index: src/frame.h
===================================================================
RCS file: /cvsroot/emacs/emacs/src/frame.h,v
retrieving revision 1.131
diff -c -r1.131 frame.h
*** src/frame.h	8 Jan 2008 20:44:17 -0000	1.131
--- src/frame.h	28 Jan 2008 00:18:18 -0000
***************
*** 475,481 ****
  
  typedef struct frame *FRAME_PTR;
  
! #define XFRAME(p) (eassert (GC_FRAMEP(p)),(struct frame *) XPNTR (p))
  #define XSETFRAME(a, b) (XSETPSEUDOVECTOR (a, b, PVEC_FRAME))
  
  /* Given a window, return its frame as a Lisp_Object.  */
--- 475,482 ----
  
  typedef struct frame *FRAME_PTR;
  
! #define XFRAME(p) (eassert (GC_FRAMEP(p)), \
! 		   (struct frame *) XUNTAG(p, Lisp_Vectorlike))
  #define XSETFRAME(a, b) (XSETPSEUDOVECTOR (a, b, PVEC_FRAME))
  
  /* Given a window, return its frame as a Lisp_Object.  */

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: Untagging by subtraction instead of masking on USE_LSB_TAG
  2008-01-28  2:07 Untagging by subtraction instead of masking on USE_LSB_TAG YAMAMOTO Mitsuharu
@ 2008-01-28  3:52 ` Thien-Thi Nguyen
  2008-01-28  4:22   ` Miles Bader
  2008-01-28 15:07 ` Stefan Monnier
  1 sibling, 1 reply; 6+ messages in thread
From: Thien-Thi Nguyen @ 2008-01-28  3:52 UTC (permalink / raw)
  To: YAMAMOTO Mitsuharu; +Cc: emacs-devel

() YAMAMOTO Mitsuharu <mituharu@math.s.chiba-u.ac.jp>
() Mon, 28 Jan 2008 11:07:28 +0900

     _cons_to_long:                  _cons_to_long:
             andi. r0,r3,7                   andi. r0,r3,7
             srawi r0,r3,3                   srawi r0,r3,3
             beq cr0,L592                    beq cr0,L592
             rlwinm r2,r3,0,0,28
A            lwz r9,4(r2)                    lwz r9,-1(r3)
B            lwz r3,0(r2)                    lwz r3,-5(r3)
             rlwinm r0,r9,0,29,31            rlwinm r0,r9,0,29,31
             cmpwi cr7,r0,5                  cmpwi cr7,r0,5
             bne cr7,L593                    bne cr7,L593
             rlwinm r2,r9,0,0,28
C            lwz r9,0(r2)                    lwz r9,-5(r9)
     L593:                           L593:
             rlwinm r2,r3,13,0,15            rlwinm r2,r3,13,0,15
             srawi r0,r9,3                   srawi r0,r9,3
             or r0,r2,r0                     or r0,r2,r0
     L592:                           L592:
             mr r3,r0                        mr r3,r0
             blr                             blr

   This would make sense if the latency of load/store does not
   depend on its displacement (I'm not sure if that is the case in
   general).  Comments?

For masking, i see offsets (lwz) of 4,0,0 (lines A,B,C).
For subtraction, -1,-5,-5.

It's very possible that the machine can handle 4,0,0 more
efficiently; those all are even (0, modulo 2) and in two cases
"nothing"!  Furthermore, the maximum absolute offset for the
subtraction method is 5, which is larger (faaarther away) than 4.

Anyway, here is an excerpt from p.532 of "PowerPC 405, Embedded
Processor Core, User's Manual":

| C.2.6     Alignment in Scalar Load and Store Instructions
| 
| The PPC405 requires an extra cycle to execute scalar loads and
| stores having unaligned big or little endian data (except for
| lwarx and stwcx., which require word-aligned operands). If the
| target data is not operand aligned, and the sum of the least two
| significant bits of the effective address (EA) and the byte count
| is greater than four, the PPC405 decomposes a load or store scalar
| into two load or store operations. That is, the PPC405 never
| presents the DCU with a request for a transfer that crosses a word
| boundary. For example, a lwz with an EA of 0b11 causes the PPC405
| to decompose the lwz into two load operations. The first load
| operation is for a byte at the starting effective address; the
| second load operation is for three bytes, starting at the next
| word address.

But don't heed my (mostly) ignorant gut feelings!  Esperience sez:
isolate the variable; build two versions; compare on "typical"
workload; if (dis)advantage is under some "wow!"  threshold, write
down your findings in the notebook (for Emacs, comments would be
fine), but prioritize maintainability (i.e, refrain from
implementing).

I am interested in how you define "typical" and "wow!".

Seasons change, pipelines change.  Keep in mind that sometimes
optimization now translates to pessimization down the road.

thi

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: Untagging by subtraction instead of masking on USE_LSB_TAG
  2008-01-28  3:52 ` Thien-Thi Nguyen
@ 2008-01-28  4:22   ` Miles Bader
  2008-01-28  4:25     ` Miles Bader
  0 siblings, 1 reply; 6+ messages in thread
From: Miles Bader @ 2008-01-28  4:22 UTC (permalink / raw)
  To: Thien-Thi Nguyen; +Cc: YAMAMOTO Mitsuharu, emacs-devel

Thien-Thi Nguyen <ttn@gnuvola.org> writes:
> For masking, i see offsets (lwz) of 4,0,0 (lines A,B,C).
> For subtraction, -1,-5,-5.
>
> It's very possible that the machine can handle 4,0,0 more
> efficiently; those all are even (0, modulo 2) and in two cases
> "nothing"!  Furthermore, the maximum absolute offset for the
> subtraction method is 5, which is larger (faaarther away) than 4.
>
> Anyway, here is an excerpt from p.532 of "PowerPC 405, Embedded
> Processor Core, User's Manual":

Note that the important value here is the EA, not the offset.  Typically
in a LSB tag scheme, the tags are arranged so that after adding these
"funny" offsets, the resulting EA is aligned properly.

-Miles

-- 
Apologize, v. To lay the foundation for a future offense.

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: Untagging by subtraction instead of masking on USE_LSB_TAG
  2008-01-28  4:22   ` Miles Bader
@ 2008-01-28  4:25     ` Miles Bader
  2008-01-28  5:02       ` YAMAMOTO Mitsuharu
  0 siblings, 1 reply; 6+ messages in thread
From: Miles Bader @ 2008-01-28  4:25 UTC (permalink / raw)
  To: Thien-Thi Nguyen; +Cc: YAMAMOTO Mitsuharu, emacs-devel

Miles Bader <miles.bader@necel.com> writes:
> Note that the important value here is the EA, not the offset.  Typically
> in a LSB tag scheme, the tags are arranged so that after adding these
> "funny" offsets, the resulting EA is aligned properly.

BTW, such a method won't work on architectures where offsets are
inherently aligned -- for instance, doesn't the x86 implicitly scale the
offset by the size of the value being loaded?  Does anybody know what is
usually done on such an architecture?

Thanks,

-Miles

-- 
Year, n. A period of three hundred and sixty-five disappointments.

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: Untagging by subtraction instead of masking on USE_LSB_TAG
  2008-01-28  4:25     ` Miles Bader
@ 2008-01-28  5:02       ` YAMAMOTO Mitsuharu
  0 siblings, 0 replies; 6+ messages in thread
From: YAMAMOTO Mitsuharu @ 2008-01-28  5:02 UTC (permalink / raw)
  To: Miles Bader; +Cc: Thien-Thi Nguyen, emacs-devel

>>>>> On Mon, 28 Jan 2008 13:25:30 +0900, Miles Bader <miles.bader@necel.com> said:

> Miles Bader <miles.bader@necel.com> writes:
>> Note that the important value here is the EA, not the offset.
>> Typically in a LSB tag scheme, the tags are arranged so that after
>> adding these "funny" offsets, the resulting EA is aligned properly.

> BTW, such a method won't work on architectures where offsets are
> inherently aligned -- for instance, doesn't the x86 implicitly scale
> the offset by the size of the value being loaded?  Does anybody know
> what is usually done on such an architecture?

I'm not sure, but there might be.  At least, the result on intel Mac
with gcc -Os was similar to that on PowerPC.

  _cons_to_long:                  _cons_to_long:
	  pushl   %ebp                    pushl   %ebp
	  movl    %esp, %ebp              movl    %esp, %ebp
	  movl    8(%ebp), %eax           movl    8(%ebp), %eax
	  testb   $7, %al                 testb   $7, %al
	  jne     L617                    jne     L617
	  sarl    $3, %eax                sarl    $3, %eax
	  jmp     L619                    jmp     L619
  L617:                           L617:
	  andl    $-8, %eax
	  movl    (%eax), %ecx            movl    -5(%eax), %ecx
	  movl    4(%eax), %edx           movl    -1(%eax), %edx
	  movl    %edx, %eax              movl    %edx, %eax
	  andl    $7, %eax                andl    $7, %eax
	  cmpl    $5, %eax                cmpl    $5, %eax
	  jne     L620                    jne     L620
	  movl    %edx, %eax
	  andl    $-8, %eax
	  movl    (%eax), %edx            movl    -5(%edx), %edx
  L620:                           L620:
	  movl    %ecx, %eax              movl    %ecx, %eax
	  sarl    $3, %eax                sarl    $3, %eax
	  sall    $16, %eax               sall    $16, %eax
	  sarl    $3, %edx                sarl    $3, %edx
	  orl     %edx, %eax              orl     %edx, %eax
  L619:                           L619:
	  popl    %ebp                    popl    %ebp
	  ret                             ret

And Solaris/SPARC with gcc -Os:

  cons_to_long:                   cons_to_long:
	  !#PROLOGUE# 0                   !#PROLOGUE# 0
	  !#PROLOGUE# 1                   !#PROLOGUE# 1
	  and     %o0, -8, %o4            mov     %o0, %o4
	  andcc   %o0, 7, %g0             andcc   %o0, 7, %g0
	  be      .LL952                  be      .LL949
	  sra     %o0, 3, %o0             sra     %o0, 3, %o0
	  ld      [%o4+4], %o5            ld      [%o4-1], %o5
	  and     %o5, 7, %g1             and     %o5, 7, %g1
	  cmp     %g1, 5                  cmp     %g1, 5
	  ld      [%o4], %g1              ld      [%o4-5], %g1
	  sra     %g1, 3, %g1             sra     %g1, 3, %g1
	  and     %o5, -8, %o3
	  bne     .LL954                  bne     .LL951
	  sll     %g1, 16, %o0            sll     %g1, 16, %o0
	  ld      [%o3], %o5              ld      [%o5-5], %o5
  .LL954:                         .LL951:
	  sra     %o5, 3, %g1             sra     %o5, 3, %g1
	  or      %o0, %g1, %o0           or      %o0, %g1, %o0
  .LL952:                         .LL949:
	  retl                            retl
	  nop                             nop

				     YAMAMOTO Mitsuharu
				mituharu@math.s.chiba-u.ac.jp

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: Untagging by subtraction instead of masking on USE_LSB_TAG
  2008-01-28  2:07 Untagging by subtraction instead of masking on USE_LSB_TAG YAMAMOTO Mitsuharu
  2008-01-28  3:52 ` Thien-Thi Nguyen
@ 2008-01-28 15:07 ` Stefan Monnier
  1 sibling, 0 replies; 6+ messages in thread
From: Stefan Monnier @ 2008-01-28 15:07 UTC (permalink / raw)
  To: YAMAMOTO Mitsuharu; +Cc: emacs-devel

> This would make sense if the latency of load/store does not depend on
> its displacement (I'm not sure if that is the case in general).
> Comments?

Yes, all RISC processors I can think of (other than ADM29K, maybe)
provide a "small" offset for free: a 0 offset is no faster than
a -5 offset.

Additionally, masking the lower 3 bits may sometimes be surprisingly
costly (the obvious solution of ANDing with ~7 requires a 32bit
constant, which tends to be more costly than a smaller constant, and
the rsh+lsh can be even worse).

So, yes, your patch looks like a very good idea in general.
It's also what is typically used in Lisp/Smalltalk/younameit compilers.


        Stefan


PS: This trick has another advantage: on some hardware (e.g. SPARC) if
after adding/subtracting the offset, the resulting address is not
aligned, you can get an exception, which means you get tag-checking for
free.  This is no coincidence, since the SPARC architecture benefitted
from the experience of the SOAR (Smalltalk On A RISC) project.

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2008-01-28 15:07 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-01-28  2:07 Untagging by subtraction instead of masking on USE_LSB_TAG YAMAMOTO Mitsuharu
2008-01-28  3:52 ` Thien-Thi Nguyen
2008-01-28  4:22   ` Miles Bader
2008-01-28  4:25     ` Miles Bader
2008-01-28  5:02       ` YAMAMOTO Mitsuharu
2008-01-28 15:07 ` Stefan Monnier

Code repositories for project(s) associated with this external index

	https://git.savannah.gnu.org/cgit/emacs.git
	https://git.savannah.gnu.org/cgit/emacs/org-mode.git

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.