* Untagging by subtraction instead of masking on USE_LSB_TAG
@ 2008-01-28 2:07 YAMAMOTO Mitsuharu
2008-01-28 3:52 ` Thien-Thi Nguyen
2008-01-28 15:07 ` Stefan Monnier
0 siblings, 2 replies; 6+ messages in thread
From: YAMAMOTO Mitsuharu @ 2008-01-28 2:07 UTC (permalink / raw)
To: emacs-devel
Sorry if this has already been discussed before.
On USE_LSB_TAG environments, untagging of Lisp_Object can be performed
by subtracting a small number instead of bit masking. The subtraction
enables us to optimize the typical sequence of untagging ->
dereference -> struct-member-access into a single instruction. Below
is the result of gcc -Os in PowerPC with masking (left) and
subtraction (right).
_cons_to_long: _cons_to_long:
andi. r0,r3,7 andi. r0,r3,7
srawi r0,r3,3 srawi r0,r3,3
beq cr0,L592 beq cr0,L592
rlwinm r2,r3,0,0,28
lwz r9,4(r2) lwz r9,-1(r3)
lwz r3,0(r2) lwz r3,-5(r3)
rlwinm r0,r9,0,29,31 rlwinm r0,r9,0,29,31
cmpwi cr7,r0,5 cmpwi cr7,r0,5
bne cr7,L593 bne cr7,L593
rlwinm r2,r9,0,0,28
lwz r9,0(r2) lwz r9,-5(r9)
L593: L593:
rlwinm r2,r3,13,0,15 rlwinm r2,r3,13,0,15
srawi r0,r9,3 srawi r0,r9,3
or r0,r2,r0 or r0,r2,r0
L592: L592:
mr r3,r0 mr r3,r0
blr blr
This would make sense if the latency of load/store does not depend on
its displacement (I'm not sure if that is the case in general).
Comments?
YAMAMOTO Mitsuharu
mituharu@math.s.chiba-u.ac.jp
Index: src/lisp.h
===================================================================
RCS file: /cvsroot/emacs/emacs/src/lisp.h,v
retrieving revision 1.605
diff -c -r1.605 lisp.h
*** src/lisp.h 13 Jan 2008 22:08:06 -0000 1.605
--- src/lisp.h 28 Jan 2008 00:18:18 -0000
***************
*** 397,402 ****
--- 397,403 ----
#define XSETFASTINT(a, b) ((a) = make_number (b))
#define XPNTR(a) ((EMACS_INT) ((a) & ~TYPEMASK))
+ #define XUNTAG(a, type) ((EMACS_UINT) (a) - (type))
#else /* not USE_LSB_TAG */
***************
*** 509,514 ****
--- 510,516 ----
#define XPNTR(a) ((EMACS_INT) XUINT (a))
#endif
#endif /* not HAVE_SHM */
+ #define XUNTAG(a, type) XPNTR (a)
#endif /* no XPNTR */
/* Largest and smallest representable fixnum values. These are the C
***************
*** 528,542 ****
/* Extract a value or address from a Lisp_Object. */
! #define XCONS(a) (eassert (GC_CONSP(a)),(struct Lisp_Cons *) XPNTR(a))
! #define XVECTOR(a) (eassert (GC_VECTORLIKEP(a)),(struct Lisp_Vector *) XPNTR(a))
! #define XSTRING(a) (eassert (GC_STRINGP(a)),(struct Lisp_String *) XPNTR(a))
! #define XSYMBOL(a) (eassert (GC_SYMBOLP(a)),(struct Lisp_Symbol *) XPNTR(a))
! #define XFLOAT(a) (eassert (GC_FLOATP(a)),(struct Lisp_Float *) XPNTR(a))
/* Misc types. */
! #define XMISC(a) ((union Lisp_Misc *) XPNTR(a))
#define XMISCANY(a) (eassert (MISCP (a)), &(XMISC(a)->u_any))
#define XMISCTYPE(a) (XMISCANY (a)->type)
#define XMARKER(a) (eassert (MARKERP (a)), &(XMISC(a)->u_marker))
--- 530,549 ----
/* Extract a value or address from a Lisp_Object. */
! #define XCONS(a) (eassert (GC_CONSP(a)), \
! (struct Lisp_Cons *) XUNTAG(a, Lisp_Cons))
! #define XVECTOR(a) (eassert (GC_VECTORLIKEP(a)), \
! (struct Lisp_Vector *) XUNTAG(a, Lisp_Vectorlike))
! #define XSTRING(a) (eassert (GC_STRINGP(a)), \
! (struct Lisp_String *) XUNTAG(a, Lisp_String))
! #define XSYMBOL(a) (eassert (GC_SYMBOLP(a)), \
! (struct Lisp_Symbol *) XUNTAG(a, Lisp_Symbol))
! #define XFLOAT(a) (eassert (GC_FLOATP(a)), \
! (struct Lisp_Float *) XUNTAG(a, Lisp_Float))
/* Misc types. */
! #define XMISC(a) ((union Lisp_Misc *) XUNTAG(a, Lisp_Misc))
#define XMISCANY(a) (eassert (MISCP (a)), &(XMISC(a)->u_any))
#define XMISCTYPE(a) (XMISCANY (a)->type)
#define XMARKER(a) (eassert (MARKERP (a)), &(XMISC(a)->u_marker))
***************
*** 554,566 ****
/* Pseudovector types. */
! #define XPROCESS(a) (eassert (GC_PROCESSP(a)),(struct Lisp_Process *) XPNTR(a))
! #define XWINDOW(a) (eassert (GC_WINDOWP(a)),(struct window *) XPNTR(a))
! #define XTERMINAL(a) (eassert (GC_TERMINALP(a)),(struct terminal *) XPNTR(a))
! #define XSUBR(a) (eassert (GC_SUBRP(a)),(struct Lisp_Subr *) XPNTR(a))
! #define XBUFFER(a) (eassert (GC_BUFFERP(a)),(struct buffer *) XPNTR(a))
! #define XCHAR_TABLE(a) (eassert (GC_CHAR_TABLE_P (a)), (struct Lisp_Char_Table *) XPNTR(a))
! #define XBOOL_VECTOR(a) (eassert (GC_BOOL_VECTOR_P (a)), (struct Lisp_Bool_Vector *) XPNTR(a))
/* Construct a Lisp_Object from a value or address. */
--- 561,580 ----
/* Pseudovector types. */
! #define XPROCESS(a) (eassert (GC_PROCESSP(a)), \
! (struct Lisp_Process *) XUNTAG(a, Lisp_Vectorlike))
! #define XWINDOW(a) (eassert (GC_WINDOWP(a)), \
! (struct window *) XUNTAG(a, Lisp_Vectorlike))
! #define XTERMINAL(a) (eassert (GC_TERMINALP(a)), \
! (struct terminal *) XUNTAG(a, Lisp_Vectorlike))
! #define XSUBR(a) (eassert (GC_SUBRP(a)), \
! (struct Lisp_Subr *) XUNTAG(a, Lisp_Vectorlike))
! #define XBUFFER(a) (eassert (GC_BUFFERP(a)), \
! (struct buffer *) XUNTAG(a, Lisp_Vectorlike))
! #define XCHAR_TABLE(a) (eassert (GC_CHAR_TABLE_P (a)), \
! (struct Lisp_Char_Table *) XUNTAG(a, Lisp_Vectorlike))
! #define XBOOL_VECTOR(a) (eassert (GC_BOOL_VECTOR_P (a)), \
! (struct Lisp_Bool_Vector *) XUNTAG(a, Lisp_Vectorlike))
/* Construct a Lisp_Object from a value or address. */
***************
*** 1089,1095 ****
#define XHASH_TABLE(OBJ) \
! ((struct Lisp_Hash_Table *) XPNTR (OBJ))
#define XSET_HASH_TABLE(VAR, PTR) \
(XSETPSEUDOVECTOR (VAR, PTR, PVEC_HASH_TABLE))
--- 1103,1109 ----
#define XHASH_TABLE(OBJ) \
! ((struct Lisp_Hash_Table *) XUNTAG(OBJ, Lisp_Vectorlike))
#define XSET_HASH_TABLE(VAR, PTR) \
(XSETPSEUDOVECTOR (VAR, PTR, PVEC_HASH_TABLE))
Index: src/frame.h
===================================================================
RCS file: /cvsroot/emacs/emacs/src/frame.h,v
retrieving revision 1.131
diff -c -r1.131 frame.h
*** src/frame.h 8 Jan 2008 20:44:17 -0000 1.131
--- src/frame.h 28 Jan 2008 00:18:18 -0000
***************
*** 475,481 ****
typedef struct frame *FRAME_PTR;
! #define XFRAME(p) (eassert (GC_FRAMEP(p)),(struct frame *) XPNTR (p))
#define XSETFRAME(a, b) (XSETPSEUDOVECTOR (a, b, PVEC_FRAME))
/* Given a window, return its frame as a Lisp_Object. */
--- 475,482 ----
typedef struct frame *FRAME_PTR;
! #define XFRAME(p) (eassert (GC_FRAMEP(p)), \
! (struct frame *) XUNTAG(p, Lisp_Vectorlike))
#define XSETFRAME(a, b) (XSETPSEUDOVECTOR (a, b, PVEC_FRAME))
/* Given a window, return its frame as a Lisp_Object. */
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: Untagging by subtraction instead of masking on USE_LSB_TAG
2008-01-28 2:07 Untagging by subtraction instead of masking on USE_LSB_TAG YAMAMOTO Mitsuharu
@ 2008-01-28 3:52 ` Thien-Thi Nguyen
2008-01-28 4:22 ` Miles Bader
2008-01-28 15:07 ` Stefan Monnier
1 sibling, 1 reply; 6+ messages in thread
From: Thien-Thi Nguyen @ 2008-01-28 3:52 UTC (permalink / raw)
To: YAMAMOTO Mitsuharu; +Cc: emacs-devel
() YAMAMOTO Mitsuharu <mituharu@math.s.chiba-u.ac.jp>
() Mon, 28 Jan 2008 11:07:28 +0900
_cons_to_long: _cons_to_long:
andi. r0,r3,7 andi. r0,r3,7
srawi r0,r3,3 srawi r0,r3,3
beq cr0,L592 beq cr0,L592
rlwinm r2,r3,0,0,28
A lwz r9,4(r2) lwz r9,-1(r3)
B lwz r3,0(r2) lwz r3,-5(r3)
rlwinm r0,r9,0,29,31 rlwinm r0,r9,0,29,31
cmpwi cr7,r0,5 cmpwi cr7,r0,5
bne cr7,L593 bne cr7,L593
rlwinm r2,r9,0,0,28
C lwz r9,0(r2) lwz r9,-5(r9)
L593: L593:
rlwinm r2,r3,13,0,15 rlwinm r2,r3,13,0,15
srawi r0,r9,3 srawi r0,r9,3
or r0,r2,r0 or r0,r2,r0
L592: L592:
mr r3,r0 mr r3,r0
blr blr
This would make sense if the latency of load/store does not
depend on its displacement (I'm not sure if that is the case in
general). Comments?
For masking, i see offsets (lwz) of 4,0,0 (lines A,B,C).
For subtraction, -1,-5,-5.
It's very possible that the machine can handle 4,0,0 more
efficiently; those all are even (0, modulo 2) and in two cases
"nothing"! Furthermore, the maximum absolute offset for the
subtraction method is 5, which is larger (faaarther away) than 4.
Anyway, here is an excerpt from p.532 of "PowerPC 405, Embedded
Processor Core, User's Manual":
| C.2.6 Alignment in Scalar Load and Store Instructions
|
| The PPC405 requires an extra cycle to execute scalar loads and
| stores having unaligned big or little endian data (except for
| lwarx and stwcx., which require word-aligned operands). If the
| target data is not operand aligned, and the sum of the least two
| significant bits of the effective address (EA) and the byte count
| is greater than four, the PPC405 decomposes a load or store scalar
| into two load or store operations. That is, the PPC405 never
| presents the DCU with a request for a transfer that crosses a word
| boundary. For example, a lwz with an EA of 0b11 causes the PPC405
| to decompose the lwz into two load operations. The first load
| operation is for a byte at the starting effective address; the
| second load operation is for three bytes, starting at the next
| word address.
But don't heed my (mostly) ignorant gut feelings! Esperience sez:
isolate the variable; build two versions; compare on "typical"
workload; if (dis)advantage is under some "wow!" threshold, write
down your findings in the notebook (for Emacs, comments would be
fine), but prioritize maintainability (i.e, refrain from
implementing).
I am interested in how you define "typical" and "wow!".
Seasons change, pipelines change. Keep in mind that sometimes
optimization now translates to pessimization down the road.
thi
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: Untagging by subtraction instead of masking on USE_LSB_TAG
2008-01-28 3:52 ` Thien-Thi Nguyen
@ 2008-01-28 4:22 ` Miles Bader
2008-01-28 4:25 ` Miles Bader
0 siblings, 1 reply; 6+ messages in thread
From: Miles Bader @ 2008-01-28 4:22 UTC (permalink / raw)
To: Thien-Thi Nguyen; +Cc: YAMAMOTO Mitsuharu, emacs-devel
Thien-Thi Nguyen <ttn@gnuvola.org> writes:
> For masking, i see offsets (lwz) of 4,0,0 (lines A,B,C).
> For subtraction, -1,-5,-5.
>
> It's very possible that the machine can handle 4,0,0 more
> efficiently; those all are even (0, modulo 2) and in two cases
> "nothing"! Furthermore, the maximum absolute offset for the
> subtraction method is 5, which is larger (faaarther away) than 4.
>
> Anyway, here is an excerpt from p.532 of "PowerPC 405, Embedded
> Processor Core, User's Manual":
Note that the important value here is the EA, not the offset. Typically
in a LSB tag scheme, the tags are arranged so that after adding these
"funny" offsets, the resulting EA is aligned properly.
-Miles
--
Apologize, v. To lay the foundation for a future offense.
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: Untagging by subtraction instead of masking on USE_LSB_TAG
2008-01-28 4:22 ` Miles Bader
@ 2008-01-28 4:25 ` Miles Bader
2008-01-28 5:02 ` YAMAMOTO Mitsuharu
0 siblings, 1 reply; 6+ messages in thread
From: Miles Bader @ 2008-01-28 4:25 UTC (permalink / raw)
To: Thien-Thi Nguyen; +Cc: YAMAMOTO Mitsuharu, emacs-devel
Miles Bader <miles.bader@necel.com> writes:
> Note that the important value here is the EA, not the offset. Typically
> in a LSB tag scheme, the tags are arranged so that after adding these
> "funny" offsets, the resulting EA is aligned properly.
BTW, such a method won't work on architectures where offsets are
inherently aligned -- for instance, doesn't the x86 implicitly scale the
offset by the size of the value being loaded? Does anybody know what is
usually done on such an architecture?
Thanks,
-Miles
--
Year, n. A period of three hundred and sixty-five disappointments.
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: Untagging by subtraction instead of masking on USE_LSB_TAG
2008-01-28 4:25 ` Miles Bader
@ 2008-01-28 5:02 ` YAMAMOTO Mitsuharu
0 siblings, 0 replies; 6+ messages in thread
From: YAMAMOTO Mitsuharu @ 2008-01-28 5:02 UTC (permalink / raw)
To: Miles Bader; +Cc: Thien-Thi Nguyen, emacs-devel
>>>>> On Mon, 28 Jan 2008 13:25:30 +0900, Miles Bader <miles.bader@necel.com> said:
> Miles Bader <miles.bader@necel.com> writes:
>> Note that the important value here is the EA, not the offset.
>> Typically in a LSB tag scheme, the tags are arranged so that after
>> adding these "funny" offsets, the resulting EA is aligned properly.
> BTW, such a method won't work on architectures where offsets are
> inherently aligned -- for instance, doesn't the x86 implicitly scale
> the offset by the size of the value being loaded? Does anybody know
> what is usually done on such an architecture?
I'm not sure, but there might be. At least, the result on intel Mac
with gcc -Os was similar to that on PowerPC.
_cons_to_long: _cons_to_long:
pushl %ebp pushl %ebp
movl %esp, %ebp movl %esp, %ebp
movl 8(%ebp), %eax movl 8(%ebp), %eax
testb $7, %al testb $7, %al
jne L617 jne L617
sarl $3, %eax sarl $3, %eax
jmp L619 jmp L619
L617: L617:
andl $-8, %eax
movl (%eax), %ecx movl -5(%eax), %ecx
movl 4(%eax), %edx movl -1(%eax), %edx
movl %edx, %eax movl %edx, %eax
andl $7, %eax andl $7, %eax
cmpl $5, %eax cmpl $5, %eax
jne L620 jne L620
movl %edx, %eax
andl $-8, %eax
movl (%eax), %edx movl -5(%edx), %edx
L620: L620:
movl %ecx, %eax movl %ecx, %eax
sarl $3, %eax sarl $3, %eax
sall $16, %eax sall $16, %eax
sarl $3, %edx sarl $3, %edx
orl %edx, %eax orl %edx, %eax
L619: L619:
popl %ebp popl %ebp
ret ret
And Solaris/SPARC with gcc -Os:
cons_to_long: cons_to_long:
!#PROLOGUE# 0 !#PROLOGUE# 0
!#PROLOGUE# 1 !#PROLOGUE# 1
and %o0, -8, %o4 mov %o0, %o4
andcc %o0, 7, %g0 andcc %o0, 7, %g0
be .LL952 be .LL949
sra %o0, 3, %o0 sra %o0, 3, %o0
ld [%o4+4], %o5 ld [%o4-1], %o5
and %o5, 7, %g1 and %o5, 7, %g1
cmp %g1, 5 cmp %g1, 5
ld [%o4], %g1 ld [%o4-5], %g1
sra %g1, 3, %g1 sra %g1, 3, %g1
and %o5, -8, %o3
bne .LL954 bne .LL951
sll %g1, 16, %o0 sll %g1, 16, %o0
ld [%o3], %o5 ld [%o5-5], %o5
.LL954: .LL951:
sra %o5, 3, %g1 sra %o5, 3, %g1
or %o0, %g1, %o0 or %o0, %g1, %o0
.LL952: .LL949:
retl retl
nop nop
YAMAMOTO Mitsuharu
mituharu@math.s.chiba-u.ac.jp
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: Untagging by subtraction instead of masking on USE_LSB_TAG
2008-01-28 2:07 Untagging by subtraction instead of masking on USE_LSB_TAG YAMAMOTO Mitsuharu
2008-01-28 3:52 ` Thien-Thi Nguyen
@ 2008-01-28 15:07 ` Stefan Monnier
1 sibling, 0 replies; 6+ messages in thread
From: Stefan Monnier @ 2008-01-28 15:07 UTC (permalink / raw)
To: YAMAMOTO Mitsuharu; +Cc: emacs-devel
> This would make sense if the latency of load/store does not depend on
> its displacement (I'm not sure if that is the case in general).
> Comments?
Yes, all RISC processors I can think of (other than ADM29K, maybe)
provide a "small" offset for free: a 0 offset is no faster than
a -5 offset.
Additionally, masking the lower 3 bits may sometimes be surprisingly
costly (the obvious solution of ANDing with ~7 requires a 32bit
constant, which tends to be more costly than a smaller constant, and
the rsh+lsh can be even worse).
So, yes, your patch looks like a very good idea in general.
It's also what is typically used in Lisp/Smalltalk/younameit compilers.
Stefan
PS: This trick has another advantage: on some hardware (e.g. SPARC) if
after adding/subtracting the offset, the resulting address is not
aligned, you can get an exception, which means you get tag-checking for
free. This is no coincidence, since the SPARC architecture benefitted
from the experience of the SOAR (Smalltalk On A RISC) project.
^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2008-01-28 15:07 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-01-28 2:07 Untagging by subtraction instead of masking on USE_LSB_TAG YAMAMOTO Mitsuharu
2008-01-28 3:52 ` Thien-Thi Nguyen
2008-01-28 4:22 ` Miles Bader
2008-01-28 4:25 ` Miles Bader
2008-01-28 5:02 ` YAMAMOTO Mitsuharu
2008-01-28 15:07 ` Stefan Monnier
Code repositories for project(s) associated with this public inbox
https://git.savannah.gnu.org/cgit/emacs.git
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).