unofficial mirror of guix-patches@gnu.org

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
 From 799859f6635d68487ea2472bd79d96a7639a1ab1 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Sun, 6 Aug 2017 10:44:30 -0700
Subject: [PATCH 04/90] x86-64: Use _dl_runtime_resolve_opt only with AVX512F
 [BZ #21871]

On AVX machines with XGETBV (ECX == 1) like Skylake processors,

(gdb) disass _dl_runtime_resolve_avx_opt
Dump of assembler code for function _dl_runtime_resolve_avx_opt:
   0x0000000000015890 <+0>:	push   %rax
   0x0000000000015891 <+1>:	push   %rcx
   0x0000000000015892 <+2>:	push   %rdx
   0x0000000000015893 <+3>:	mov    $0x1,%ecx
   0x0000000000015898 <+8>:	xgetbv
   0x000000000001589b <+11>:	mov    %eax,%r11d
   0x000000000001589e <+14>:	pop    %rdx
   0x000000000001589f <+15>:	pop    %rcx
   0x00000000000158a0 <+16>:	pop    %rax
   0x00000000000158a1 <+17>:	and    $0x4,%r11d
   0x00000000000158a5 <+21>:	bnd je 0x16200 <_dl_runtime_resolve_sse_vex>
End of assembler dump.

is slower than:

(gdb) disass _dl_runtime_resolve_avx_slow
Dump of assembler code for function _dl_runtime_resolve_avx_slow:
   0x0000000000015850 <+0>:	vorpd  %ymm0,%ymm1,%ymm8
   0x0000000000015854 <+4>:	vorpd  %ymm2,%ymm3,%ymm9
   0x0000000000015858 <+8>:	vorpd  %ymm4,%ymm5,%ymm10
   0x000000000001585c <+12>:	vorpd  %ymm6,%ymm7,%ymm11
   0x0000000000015860 <+16>:	vorpd  %ymm8,%ymm9,%ymm9
   0x0000000000015865 <+21>:	vorpd  %ymm10,%ymm11,%ymm10
   0x000000000001586a <+26>:	vpcmpeqd %xmm8,%xmm8,%xmm8
   0x000000000001586f <+31>:	vorpd  %ymm9,%ymm10,%ymm10
   0x0000000000015874 <+36>:	vptest %ymm10,%ymm8
   0x0000000000015879 <+41>:	bnd jae 0x158b0 <_dl_runtime_resolve_avx>
   0x000000000001587c <+44>:	vzeroupper
   0x000000000001587f <+47>:	bnd jmpq 0x16200 <_dl_runtime_resolve_sse_vex>
End of assembler dump.
(gdb)

since xgetbv takes much more cycles than single cycle operations like
vpord/vvpcmpeq/ptest.  _dl_runtime_resolve_opt should be used only with
AVX512 where AVX512 instructions lead to lower CPU frequency on Skylake
server.

	[BZ #21871]
	* sysdeps/x86/cpu-features.c (init_cpu_features): Set
	bit_arch_Use_dl_runtime_resolve_opt only with AVX512F.

(cherry picked from commit d2cf37c0a2a375cf2fde69f1afbcc49e45368fc4)

diff --git a/ChangeLog b/ChangeLog
index 4357ad1eb8..764c827161 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+2017-08-06  H.J. Lu  <hongjiu.lu@intel.com>
+
+	[BZ #21871]
+	* sysdeps/x86/cpu-features.c (init_cpu_features): Set
+	bit_arch_Use_dl_runtime_resolve_opt only with AVX512F.
+
 2017-08-03  Aurelien Jarno  <aurelien@aurel32.net>
 
 	* stdlib/getentropy.c (getentropy): Change return type to int.
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index 1d087ea732..6f900840d4 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -244,10 +244,13 @@ init_cpu_features (struct cpu_features *cpu_features)
 	  |= bit_arch_Prefer_No_AVX512;
 
       /* To avoid SSE transition penalty, use _dl_runtime_resolve_slow.
-         If XGETBV suports ECX == 1, use _dl_runtime_resolve_opt.  */
+         If XGETBV suports ECX == 1, use _dl_runtime_resolve_opt.
+	 Use _dl_runtime_resolve_opt only with AVX512F since it is
+	 slower than _dl_runtime_resolve_slow with AVX.  */
       cpu_features->feature[index_arch_Use_dl_runtime_resolve_slow]
 	|= bit_arch_Use_dl_runtime_resolve_slow;
-      if (cpu_features->max_cpuid >= 0xd)
+      if (CPU_FEATURES_ARCH_P (cpu_features, AVX512F_Usable)
+	  && cpu_features->max_cpuid >= 0xd)
 	{
 	  unsigned int eax;