This is the mail archive of the glibc-cvs@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

GNU C Library master sources branch hjl/x86/xgetbv created. glibc-2.24-88-gd963b83


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".

The branch, hjl/x86/xgetbv has been created
        at  d963b835c1e0fe430a88168a81c4c69dcd9ad00c (commit)

- Log -----------------------------------------------------------------
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=d963b835c1e0fe430a88168a81c4c69dcd9ad00c

commit d963b835c1e0fe430a88168a81c4c69dcd9ad00c
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Tue Aug 23 09:09:32 2016 -0700

    X86-64: Add _dl_runtime_resolve_avx[512]_opt [BZ #20508]
    
    There is transition penalty when SSE instructions are mixed with 256-bit
    AVX or 512-bit AVX512 load instructions.  Since _dl_runtime_resolve_avx
    and _dl_runtime_profile_avx512 save/restore 256-bit YMM/512-bit ZMM
    registers, there is transition penalty when SSE instructions are used
    with lazy binding on AVX and AVX512 processors.
    
    For AVX and AVX512 processors which support XGETBV with ECX == 1, we can
    use XGETBV with ECX == 1 to check if the upper 128 bits of YMM registers
    or the upper 256 bits of ZMM registers are zero.  We can restore only the
    non-zero portion of vector registers with AVX/AVX512 load instructions
    which will zero-extend upper bits of vector registers.
    
    This patch adds _dl_runtime_resolve_sse_vex which saves and restores
    XMM registers with 128-bit AVX store/load instructions.  It is used to
    preserve YMM/ZMM registers when only the lower 128 bits are non-zero.
    _dl_runtime_resolve_avx_opt and _dl_runtime_resolve_avx512_opt are added
    and used on AVX/AVX512 processors supporting XGETBV with ECX == 1 so
    that we store and load only the non-zero portion of vector registers.
    This avoids SSE transition penalty caused by _dl_runtime_resolve_avx and
    _dl_runtime_profile_avx512 when only the lower 128 bits of vector
    registers are used.
    
    	[BZ #20508]
    	* sysdeps/x86/cpu-features.c (init_cpu_features): Set
    	Use_dl_runtime_resolve_opt if XGETBV suports ECX == 1.
    	* sysdeps/x86/cpu-features.h (bit_arch_Use_dl_runtime_resolve_opt):
    	New.
    	(index_arch_Use_dl_runtime_resolve_opt): Likewise.
    	* sysdeps/x86_64/dl-machine.h (elf_machine_runtime_setup): Use
    	_dl_runtime_resolve_avx512_opt and _dl_runtime_resolve_avx_opt
    	if Use_dl_runtime_resolve_opt is set.
    	* sysdeps/x86_64/dl-trampoline.S: Include <cpu-features.h>.
    	(_dl_runtime_resolve_opt): New.  Defined for AVX and AVX512.
    	(_dl_runtime_resolve): Add one for _dl_runtime_resolve_sse_vex.
    	* sysdeps/x86_64/dl-trampoline.h (_dl_runtime_resolve_opt): New.
    	(_dl_runtime_profile): Define only if _dl_runtime_profile is
    	defined.

diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index 9ce4b49..49ce3c6 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -205,6 +205,17 @@ init_cpu_features (struct cpu_features *cpu_features)
       if (CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable))
 	cpu_features->feature[index_arch_AVX_Fast_Unaligned_Load]
 	  |= bit_arch_AVX_Fast_Unaligned_Load;
+
+      if (cpu_features->max_cpuid >= 0xd)
+	{
+	  unsigned int eax;
+
+	  __cpuid_count (0xd, 1, eax, ebx, ecx, edx);
+	  /* Use _dl_runtime_resolve_opt if XGETBV suports ECX == 1.  */
+	  if ((eax & (1 << 2)) != 0)
+	    cpu_features->feature[index_arch_Use_dl_runtime_resolve_opt]
+	      |= bit_arch_Use_dl_runtime_resolve_opt;
+	}
     }
   /* This spells out "AuthenticAMD".  */
   else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
diff --git a/sysdeps/x86/cpu-features.h b/sysdeps/x86/cpu-features.h
index e891036..8ee3e8c 100644
--- a/sysdeps/x86/cpu-features.h
+++ b/sysdeps/x86/cpu-features.h
@@ -37,6 +37,7 @@
 #define bit_arch_Prefer_No_VZEROUPPER		(1 << 17)
 #define bit_arch_Fast_Unaligned_Copy		(1 << 18)
 #define bit_arch_Prefer_ERMS			(1 << 19)
+#define bit_arch_Use_dl_runtime_resolve_opt	(1 << 20)
 
 /* CPUID Feature flags.  */
 
@@ -107,6 +108,7 @@
 # define index_arch_Prefer_No_VZEROUPPER FEATURE_INDEX_1*FEATURE_SIZE
 # define index_arch_Fast_Unaligned_Copy	FEATURE_INDEX_1*FEATURE_SIZE
 # define index_arch_Prefer_ERMS		FEATURE_INDEX_1*FEATURE_SIZE
+# define index_arch_Use_dl_runtime_resolve_opt FEATURE_INDEX_1*FEATURE_SIZE
 
 
 # if defined (_LIBC) && !IS_IN (nonlib)
@@ -277,6 +279,7 @@ extern const struct cpu_features *__get_cpu_features (void)
 # define index_arch_Prefer_No_VZEROUPPER FEATURE_INDEX_1
 # define index_arch_Fast_Unaligned_Copy	FEATURE_INDEX_1
 # define index_arch_Prefer_ERMS		FEATURE_INDEX_1
+# define index_arch_Use_dl_runtime_resolve_opt FEATURE_INDEX_1
 
 #endif	/* !__ASSEMBLER__ */
 
diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h
index ed0c1a8..26b3b1a 100644
--- a/sysdeps/x86_64/dl-machine.h
+++ b/sysdeps/x86_64/dl-machine.h
@@ -68,7 +68,9 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
   Elf64_Addr *got;
   extern void _dl_runtime_resolve_sse (ElfW(Word)) attribute_hidden;
   extern void _dl_runtime_resolve_avx (ElfW(Word)) attribute_hidden;
+  extern void _dl_runtime_resolve_avx_opt (ElfW(Word)) attribute_hidden;
   extern void _dl_runtime_resolve_avx512 (ElfW(Word)) attribute_hidden;
+  extern void _dl_runtime_resolve_avx512_opt (ElfW(Word)) attribute_hidden;
   extern void _dl_runtime_profile_sse (ElfW(Word)) attribute_hidden;
   extern void _dl_runtime_profile_avx (ElfW(Word)) attribute_hidden;
   extern void _dl_runtime_profile_avx512 (ElfW(Word)) attribute_hidden;
@@ -118,9 +120,23 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
 	     indicated by the offset on the stack, and then jump to
 	     the resolved address.  */
 	  if (HAS_ARCH_FEATURE (AVX512F_Usable))
-	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_resolve_avx512;
+	    {
+	      if (HAS_ARCH_FEATURE (Use_dl_runtime_resolve_opt))
+		*(ElfW(Addr) *) (got + 2)
+		  = (ElfW(Addr)) &_dl_runtime_resolve_avx512_opt;
+	      else
+		*(ElfW(Addr) *) (got + 2)
+		  = (ElfW(Addr)) &_dl_runtime_resolve_avx512;
+	    }
 	  else if (HAS_ARCH_FEATURE (AVX_Usable))
-	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_resolve_avx;
+	    {
+	      if (HAS_ARCH_FEATURE (Use_dl_runtime_resolve_opt))
+		*(ElfW(Addr) *) (got + 2)
+		  = (ElfW(Addr)) &_dl_runtime_resolve_avx_opt;
+	      else
+		*(ElfW(Addr) *) (got + 2)
+		  = (ElfW(Addr)) &_dl_runtime_resolve_avx;
+	    }
 	  else
 	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_resolve_sse;
 	}
diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
index 12f1a5c..39f595e 100644
--- a/sysdeps/x86_64/dl-trampoline.S
+++ b/sysdeps/x86_64/dl-trampoline.S
@@ -18,6 +18,7 @@
 
 #include <config.h>
 #include <sysdep.h>
+#include <cpu-features.h>
 #include <link-defines.h>
 
 #ifndef DL_STACK_ALIGNMENT
@@ -86,9 +87,11 @@
 #endif
 #define VEC(i)			zmm##i
 #define _dl_runtime_resolve	_dl_runtime_resolve_avx512
+#define _dl_runtime_resolve_opt	_dl_runtime_resolve_avx512_opt
 #define _dl_runtime_profile	_dl_runtime_profile_avx512
 #include "dl-trampoline.h"
 #undef _dl_runtime_resolve
+#undef _dl_runtime_resolve_opt
 #undef _dl_runtime_profile
 #undef VEC
 #undef VMOV
@@ -104,9 +107,11 @@
 #endif
 #define VEC(i)			ymm##i
 #define _dl_runtime_resolve	_dl_runtime_resolve_avx
+#define _dl_runtime_resolve_opt	_dl_runtime_resolve_avx_opt
 #define _dl_runtime_profile	_dl_runtime_profile_avx
 #include "dl-trampoline.h"
 #undef _dl_runtime_resolve
+#undef _dl_runtime_resolve_opt
 #undef _dl_runtime_profile
 #undef VEC
 #undef VMOV
@@ -126,3 +131,18 @@
 #define _dl_runtime_profile	_dl_runtime_profile_sse
 #undef RESTORE_AVX
 #include "dl-trampoline.h"
+#undef _dl_runtime_resolve
+#undef _dl_runtime_profile
+#undef VMOV
+#undef VMOVA
+
+/* Used by _dl_runtime_resolve_avx_opt/_dl_runtime_resolve_avx512_opt
+   to preserve the full vector registers with zero upper bits.  */
+#define VMOVA			vmovdqa
+#if DL_RUNTIME_RESOLVE_REALIGN_STACK || VEC_SIZE <= DL_STACK_ALIGNMENT
+# define VMOV			vmovdqa
+#else
+# define VMOV			vmovdqu
+#endif
+#define _dl_runtime_resolve	_dl_runtime_resolve_sse_vex
+#include "dl-trampoline.h"
diff --git a/sysdeps/x86_64/dl-trampoline.h b/sysdeps/x86_64/dl-trampoline.h
index 8161f96..2094b3a 100644
--- a/sysdeps/x86_64/dl-trampoline.h
+++ b/sysdeps/x86_64/dl-trampoline.h
@@ -50,6 +50,61 @@
 #endif
 
 	.text
+#ifdef _dl_runtime_resolve_opt
+/* Use the smallest vector registers to preserve the full YMM/ZMM
+   registers to avoid SSE transition penalty.  */
+	.globl _dl_runtime_resolve_opt
+	.hidden _dl_runtime_resolve_opt
+	.type _dl_runtime_resolve_opt, @function
+	.align 16
+_dl_runtime_resolve_opt:
+	cfi_startproc
+	cfi_adjust_cfa_offset(16) # Incorporate PLT
+	pushq %rax
+	cfi_adjust_cfa_offset(8)
+	cfi_rel_offset(%rax, 0)
+	pushq %rcx
+	cfi_adjust_cfa_offset(8)
+	cfi_rel_offset(%rcx, 0)
+	pushq %rdx
+	cfi_adjust_cfa_offset(8)
+	cfi_rel_offset(%rdx, 0)
+	movl $1, %ecx
+	xgetbv
+	movl %eax, %r11d
+	popq %rdx
+	cfi_adjust_cfa_offset(-8)
+	cfi_restore (%rdx)
+	popq %rcx
+	cfi_adjust_cfa_offset(-8)
+	cfi_restore (%rcx)
+	popq %rax
+	cfi_adjust_cfa_offset(-8)
+	cfi_restore (%rax)
+# if VEC_SIZE == 32
+	# For YMM registers, check if YMM state is in use.
+	andl $bit_YMM_state, %r11d
+	# Only preserve %xmm0 - %xmm7 registers with zero upper bits if
+	# YMM state isn't in use.
+	jz _dl_runtime_resolve_sse_vex
+# elif VEC_SIZE == 64
+	# For ZMM registers, check if YMM state and ZMM state are in
+	# use.
+	andl $(bit_YMM_state | bit_ZMM0_15_state), %r11d
+	cmpl $bit_YMM_state, %r11d
+	# Only preserve %xmm0 - %xmm7 registers with zero upper bits if
+	# neither YMM state nor ZMM state are in use.
+	jl _dl_runtime_resolve_sse_vex
+	# Only preserve %ymm0 - %ymm7 registers with zero upper bits if
+	# ZMM state isn't in use.
+	je _dl_runtime_resolve_avx
+# else
+#  error Unsupported VEC_SIZE!
+# endif
+	cfi_adjust_cfa_offset(-16) # Restore PLT adjustment
+	cfi_endproc
+	.size _dl_runtime_resolve_opt, .-_dl_runtime_resolve_opt
+#endif
 	.globl _dl_runtime_resolve
 	.hidden _dl_runtime_resolve
 	.type _dl_runtime_resolve, @function
@@ -164,7 +219,10 @@ _dl_runtime_resolve:
 	.size _dl_runtime_resolve, .-_dl_runtime_resolve
 
 
-#ifndef PROF
+/* To preserve %xmm0 - %xmm7 registers, dl-trampoline.h is included
+   twice, for _dl_runtime_resolve_sse and _dl_runtime_resolve_sse_vex.
+   But we don't need another _dl_runtime_profile for XMM registers.  */
+#if !defined PROF && defined _dl_runtime_profile
 # if (LR_VECTOR_OFFSET % VEC_SIZE) != 0
 #  error LR_VECTOR_OFFSET must be multples of VEC_SIZE
 # endif

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=d4e9985c033d90c310d7798f2d1f0634a64cedff

commit d4e9985c033d90c310d7798f2d1f0634a64cedff
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Thu Aug 18 14:52:42 2016 -0700

    X86-64: Correct CFA in _dl_runtime_resolve
    
    When stack is re-aligned in _dl_runtime_resolve, there is no need to
    adjust CFA when allocating register save area on stack.
    
    	* sysdeps/x86_64/dl-trampoline.h (_dl_runtime_resolve): Don't
    	adjust CFA when allocating register save area on re-aligned
    	stack.

diff --git a/sysdeps/x86_64/dl-trampoline.h b/sysdeps/x86_64/dl-trampoline.h
index b90836a..8161f96 100644
--- a/sysdeps/x86_64/dl-trampoline.h
+++ b/sysdeps/x86_64/dl-trampoline.h
@@ -69,7 +69,9 @@ _dl_runtime_resolve:
 	and $-VEC_SIZE, %RSP_LP
 #endif
 	sub $REGISTER_SAVE_AREA, %RSP_LP
+#if !DL_RUNTIME_RESOLVE_REALIGN_STACK
 	cfi_adjust_cfa_offset(REGISTER_SAVE_AREA)
+#endif
 	# Preserve registers otherwise clobbered.
 	movq %rax, REGISTER_SAVE_RAX(%rsp)
 	movq %rcx, REGISTER_SAVE_RCX(%rsp)

-----------------------------------------------------------------------


hooks/post-receive
-- 
GNU C Library master sources


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]