This is the mail archive of the glibc-cvs@sourceware.org mailing list for the glibc project.



GNU C Library master sources branch hjl/plt/2.21 created. glibc-2.21-31-g22ce180


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".

The branch, hjl/plt/2.21, has been created
        at  22ce18019f83b1f9826c32aa2ee56dc0df3fbd49 (commit)

- Log -----------------------------------------------------------------
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=22ce18019f83b1f9826c32aa2ee56dc0df3fbd49

commit 22ce18019f83b1f9826c32aa2ee56dc0df3fbd49
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Sun Aug 2 22:27:47 2015 -0700

    Don't run tst-getpid2 with LD_BIND_NOW=1
    
    Since _dl_x86_64_save_sse and _dl_x86_64_restore_sse have now been
    removed, we no longer need to run tst-getpid2 with LD_BIND_NOW=1.
    
    	* nptl/Makefile (tst-getpid2-ENV): Removed.

diff --git a/nptl/Makefile b/nptl/Makefile
index 43d8510..1e8f566 100644
--- a/nptl/Makefile
+++ b/nptl/Makefile
@@ -463,11 +463,6 @@ tst-cancel7-ARGS = --command "exec $(host-test-program-cmd)"
 tst-cancelx7-ARGS = $(tst-cancel7-ARGS)
 tst-umask1-ARGS = $(objpfx)tst-umask1.temp
 
-# In this test, we create a CLONE_VM "thread" that shares TLS storage
-# with the original thread. Both threads then race in ld.so with lazy PLT
-# resolution. Avoid this race by disabling lazy binding. BZ #11214.
-tst-getpid2-ENV = LD_BIND_NOW=1
-
 $(objpfx)tst-atfork2: $(libdl) $(shared-thread-library)
 LDFLAGS-tst-atfork2 = -rdynamic
 tst-atfork2-ENV = MALLOC_TRACE=$(objpfx)tst-atfork2.mtrace
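
For context, the workaround being deleted dates back to BZ #11214: the
test creates a CLONE_VM "thread" that shares the parent's memory and
TLS, so two flows of control could enter ld.so's lazy PLT resolver at
the same time, and LD_BIND_NOW=1 sidestepped the race by resolving
every PLT entry up front.  A minimal C sketch of the problematic
pattern (illustrative only; this is not the test's actual code):

#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdio.h>

/* The child shares the parent's address space via CLONE_VM.  If puts
   is still an unresolved lazy PLT entry, parent and child may enter
   the resolver concurrently.  */
static char child_stack[64 * 1024];

static int
child_fn (void *arg)
{
  (void) arg;
  puts ("child");	/* first call may go through the PLT resolver */
  return 0;
}

int
main (void)
{
  if (clone (child_fn, child_stack + sizeof child_stack,
	     CLONE_VM | SIGCHLD, NULL) == -1)
    return 1;
  puts ("parent");	/* may race with the child in ld.so */
  return 0;
}

With _dl_x86_64_save_sse and _dl_x86_64_restore_sse gone from the lazy
resolver, the environment override is no longer needed.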

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=f0f347fa1c5ac0bb367f1ad85fe69fbdc1d7df41

commit f0f347fa1c5ac0bb367f1ad85fe69fbdc1d7df41
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Wed Jul 29 04:49:38 2015 -0700

    Use SSE optimized strcmp in x86-64 ld.so
    
    Since ld.so preserves vector registers now, we can use the
    SSE-optimized strcmp in x86-64 ld.so.
    
    	* sysdeps/x86_64/strcmp.S: Remove "#if !IS_IN (libc)".

diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S
index 1329649..1624b5d 100644
--- a/sysdeps/x86_64/strcmp.S
+++ b/sysdeps/x86_64/strcmp.S
@@ -29,13 +29,6 @@
 #endif
 
 #ifdef USE_AS_STRNCMP
-/* The simplified code below is not set up to handle strncmp() so far.
-   Should this become necessary it has to be implemented.  For now
-   just report the problem.  */
-# if !IS_IN (libc)
-#  error "strncmp not implemented so far"
-# endif
-
 /* Since the counter, %r11, is unsigned, we branch to strcmp_exitz
    if the new counter > the old one or is 0.  */
 # define UPDATE_STRNCMP_COUNTER				\
@@ -50,20 +43,10 @@
 #elif defined USE_AS_STRCASECMP_L
 # include "locale-defines.h"
 
-/* No support for strcasecmp outside libc so far since it is not needed.  */
-# if !IS_IN (libc)
-#  error "strcasecmp_l not implemented so far"
-# endif
-
 # define UPDATE_STRNCMP_COUNTER
 #elif defined USE_AS_STRNCASECMP_L
 # include "locale-defines.h"
 
-/* No support for strncasecmp outside libc so far since it is not needed.  */
-# if !IS_IN (libc)
-#  error "strncasecmp_l not implemented so far"
-# endif
-
 # define UPDATE_STRNCMP_COUNTER				\
 	/* calculate left number to compare */		\
 	lea	-16(%rcx, %r11), %r9;			\
@@ -126,63 +109,44 @@ libc_hidden_def (__strncasecmp)
 #endif
 
 ENTRY (STRCMP)
-#if !IS_IN (libc)
-/* Simple version since we can't use SSE registers in ld.so.  */
-L(oop):	movb	(%rdi), %al
-	cmpb	(%rsi), %al
-	jne	L(neq)
-	incq	%rdi
-	incq	%rsi
-	testb	%al, %al
-	jnz	L(oop)
-
-	xorl	%eax, %eax
-	ret
-
-L(neq):	movl	$1, %eax
-	movl	$-1, %ecx
-	cmovbl	%ecx, %eax
-	ret
-END (STRCMP)
-#else	/* !IS_IN (libc) */
-# ifdef USE_AS_STRCASECMP_L
+#ifdef USE_AS_STRCASECMP_L
 	/* We have to fall back on the C implementation for locales
 	   with encodings not matching ASCII for single bytes.  */
-#  if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
 	mov	LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rdx), %RAX_LP
-#  else
+# else
 	mov	(%rdx), %RAX_LP
-#  endif
+# endif
 	testl	$1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
 	jne	__strcasecmp_l_nonascii
-# elif defined USE_AS_STRNCASECMP_L
+#elif defined USE_AS_STRNCASECMP_L
 	/* We have to fall back on the C implementation for locales
 	   with encodings not matching ASCII for single bytes.  */
-#  if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
 	mov	LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rcx), %RAX_LP
-#  else
+# else
 	mov	(%rcx), %RAX_LP
-#  endif
+# endif
 	testl	$1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
 	jne	__strncasecmp_l_nonascii
-# endif
+#endif
 
 /*
  * This implementation uses SSE to compare up to 16 bytes at a time.
  */
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	test	%rdx, %rdx
 	je	LABEL(strcmp_exitz)
 	cmp	$1, %rdx
 	je	LABEL(Byte0)
 	mov	%rdx, %r11
-# endif
+#endif
 	mov	%esi, %ecx
 	mov	%edi, %eax
 /* Use 64bit AND here to avoid long NOP padding.  */
 	and	$0x3f, %rcx		/* rsi alignment in cache line */
 	and	$0x3f, %rax		/* rdi alignment in cache line */
-# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 	.section .rodata.cst16,"aM",@progbits,16
 	.align 16
 .Lbelowupper:
@@ -196,12 +160,12 @@ END (STRCMP)
 	.quad	0x2020202020202020
 	.previous
 	movdqa	.Lbelowupper(%rip), %xmm5
-#  define UCLOW_reg %xmm5
+# define UCLOW_reg %xmm5
 	movdqa	.Ltopupper(%rip), %xmm6
-#  define UCHIGH_reg %xmm6
+# define UCHIGH_reg %xmm6
 	movdqa	.Ltouppermask(%rip), %xmm7
-#  define LCQWORD_reg %xmm7
-# endif
+# define LCQWORD_reg %xmm7
+#endif
 	cmp	$0x30, %ecx
 	ja	LABEL(crosscache)	/* rsi: 16-byte load will cross cache line */
 	cmp	$0x30, %eax
@@ -210,8 +174,8 @@ END (STRCMP)
 	movlpd	(%rsi), %xmm2
 	movhpd	8(%rdi), %xmm1
 	movhpd	8(%rsi), %xmm2
-# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
-#  define TOLOWER(reg1, reg2) \
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# define TOLOWER(reg1, reg2) \
 	movdqa	reg1, %xmm8;					\
 	movdqa	UCHIGH_reg, %xmm9;				\
 	movdqa	reg2, %xmm10;					\
@@ -227,9 +191,9 @@ END (STRCMP)
 	por	%xmm8, reg1;					\
 	por	%xmm10, reg2
 	TOLOWER (%xmm1, %xmm2)
-# else
-#  define TOLOWER(reg1, reg2)
-# endif
+#else
+# define TOLOWER(reg1, reg2)
+#endif
 	pxor	%xmm0, %xmm0		/* clear %xmm0 for null char checks */
 	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
 	pcmpeqb	%xmm2, %xmm1		/* compare first 16 bytes for equality */
@@ -237,10 +201,10 @@ END (STRCMP)
 	pmovmskb %xmm1, %edx
 	sub	$0xffff, %edx		/* if first 16 bytes are same, edx == 0xffff */
 	jnz	LABEL(less16bytes)	/* If not, find different value or null char */
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)	/* finish comparison */
-# endif
+#endif
 	add	$16, %rsi		/* prepare to search next 16 bytes */
 	add	$16, %rdi		/* prepare to search next 16 bytes */
 
@@ -282,13 +246,13 @@ LABEL(ashr_0):
 	movdqa	(%rsi), %xmm1
 	pxor	%xmm0, %xmm0			/* clear %xmm0 for null char check */
 	pcmpeqb	%xmm1, %xmm0			/* Any null chars? */
-# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpeqb	(%rdi), %xmm1			/* compare 16 bytes for equality */
-# else
+#else
 	movdqa	(%rdi), %xmm2
 	TOLOWER (%xmm1, %xmm2)
 	pcmpeqb	%xmm2, %xmm1			/* compare 16 bytes for equality */
-# endif
+#endif
 	psubb	%xmm0, %xmm1			/* packed sub of comparison results*/
 	pmovmskb %xmm1, %r9d
 	shr	%cl, %edx			/* adjust 0xffff for offset */
@@ -321,10 +285,10 @@ LABEL(loop_ashr_0):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)		/* mismatch or null char seen */
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 	add	$16, %rcx
 	movdqa	(%rsi, %rcx), %xmm1
 	movdqa	(%rdi, %rcx), %xmm2
@@ -336,10 +300,10 @@ LABEL(loop_ashr_0):
 	pmovmskb %xmm1, %edx
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 	add	$16, %rcx
 	jmp	LABEL(loop_ashr_0)
 
@@ -388,13 +352,13 @@ LABEL(gobble_ashr_1):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4		 /* store for next cycle */
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$1, %xmm3
 	pslldq	$15, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$1, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -404,10 +368,10 @@ LABEL(gobble_ashr_1):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
 
@@ -418,13 +382,13 @@ LABEL(gobble_ashr_1):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4		/* store for next cycle */
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$1, %xmm3
 	pslldq	$15, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$1, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -434,10 +398,10 @@ LABEL(gobble_ashr_1):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
 	jmp	LABEL(loop_ashr_1)
@@ -453,10 +417,10 @@ LABEL(nibble_ashr_1):
 	test	$0xfffe, %edx
 	jnz	LABEL(ashr_1_exittail)	/* find null char*/
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	$15, %r11
 	jbe	LABEL(ashr_1_exittail)
-# endif
+#endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10		/* subtract 4K from %r10 */
@@ -518,13 +482,13 @@ LABEL(gobble_ashr_2):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$2, %xmm3
 	pslldq	$14, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$2, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -534,10 +498,10 @@ LABEL(gobble_ashr_2):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -549,13 +513,13 @@ LABEL(gobble_ashr_2):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$2, %xmm3
 	pslldq	$14, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$2, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -565,10 +529,10 @@ LABEL(gobble_ashr_2):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -581,10 +545,10 @@ LABEL(nibble_ashr_2):
 	test	$0xfffc, %edx
 	jnz	LABEL(ashr_2_exittail)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	$14, %r11
 	jbe	LABEL(ashr_2_exittail)
-# endif
+#endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -643,13 +607,13 @@ LABEL(gobble_ashr_3):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$3, %xmm3
 	pslldq	$13, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$3, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -659,10 +623,10 @@ LABEL(gobble_ashr_3):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -674,13 +638,13 @@ LABEL(gobble_ashr_3):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$3, %xmm3
 	pslldq	$13, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$3, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -690,10 +654,10 @@ LABEL(gobble_ashr_3):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -706,10 +670,10 @@ LABEL(nibble_ashr_3):
 	test	$0xfff8, %edx
 	jnz	LABEL(ashr_3_exittail)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	$13, %r11
 	jbe	LABEL(ashr_3_exittail)
-# endif
+#endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -768,13 +732,13 @@ LABEL(gobble_ashr_4):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$4, %xmm3
 	pslldq	$12, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$4, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -784,10 +748,10 @@ LABEL(gobble_ashr_4):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -799,13 +763,13 @@ LABEL(gobble_ashr_4):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$4, %xmm3
 	pslldq	$12, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$4, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -815,10 +779,10 @@ LABEL(gobble_ashr_4):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -831,10 +795,10 @@ LABEL(nibble_ashr_4):
 	test	$0xfff0, %edx
 	jnz	LABEL(ashr_4_exittail)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	$12, %r11
 	jbe	LABEL(ashr_4_exittail)
-# endif
+#endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -893,13 +857,13 @@ LABEL(gobble_ashr_5):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$5, %xmm3
 	pslldq	$11, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$5, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -909,10 +873,10 @@ LABEL(gobble_ashr_5):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -924,13 +888,13 @@ LABEL(gobble_ashr_5):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$5, %xmm3
 	pslldq	$11, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$5, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -940,10 +904,10 @@ LABEL(gobble_ashr_5):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -956,10 +920,10 @@ LABEL(nibble_ashr_5):
 	test	$0xffe0, %edx
 	jnz	LABEL(ashr_5_exittail)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	$11, %r11
 	jbe	LABEL(ashr_5_exittail)
-# endif
+#endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -1018,13 +982,13 @@ LABEL(gobble_ashr_6):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$6, %xmm3
 	pslldq	$10, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$6, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1034,10 +998,10 @@ LABEL(gobble_ashr_6):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1049,13 +1013,13 @@ LABEL(gobble_ashr_6):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$6, %xmm3
 	pslldq	$10, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$6, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1065,10 +1029,10 @@ LABEL(gobble_ashr_6):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1081,10 +1045,10 @@ LABEL(nibble_ashr_6):
 	test	$0xffc0, %edx
 	jnz	LABEL(ashr_6_exittail)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	$10, %r11
 	jbe	LABEL(ashr_6_exittail)
-# endif
+#endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -1143,13 +1107,13 @@ LABEL(gobble_ashr_7):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$7, %xmm3
 	pslldq	$9, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$7, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1159,10 +1123,10 @@ LABEL(gobble_ashr_7):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1174,13 +1138,13 @@ LABEL(gobble_ashr_7):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$7, %xmm3
 	pslldq	$9, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$7, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1190,10 +1154,10 @@ LABEL(gobble_ashr_7):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1206,10 +1170,10 @@ LABEL(nibble_ashr_7):
 	test	$0xff80, %edx
 	jnz	LABEL(ashr_7_exittail)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	$9, %r11
 	jbe	LABEL(ashr_7_exittail)
-# endif
+#endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -1268,13 +1232,13 @@ LABEL(gobble_ashr_8):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$8, %xmm3
 	pslldq	$8, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$8, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1284,10 +1248,10 @@ LABEL(gobble_ashr_8):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1299,13 +1263,13 @@ LABEL(gobble_ashr_8):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$8, %xmm3
 	pslldq	$8, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$8, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1315,10 +1279,10 @@ LABEL(gobble_ashr_8):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1331,10 +1295,10 @@ LABEL(nibble_ashr_8):
 	test	$0xff00, %edx
 	jnz	LABEL(ashr_8_exittail)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	$8, %r11
 	jbe	LABEL(ashr_8_exittail)
-# endif
+#endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -1393,13 +1357,13 @@ LABEL(gobble_ashr_9):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$9, %xmm3
 	pslldq	$7, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$9, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1409,10 +1373,10 @@ LABEL(gobble_ashr_9):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1424,13 +1388,13 @@ LABEL(gobble_ashr_9):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$9, %xmm3
 	pslldq	$7, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$9, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1440,10 +1404,10 @@ LABEL(gobble_ashr_9):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3		/* store for next cycle */
@@ -1456,10 +1420,10 @@ LABEL(nibble_ashr_9):
 	test	$0xfe00, %edx
 	jnz	LABEL(ashr_9_exittail)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	$7, %r11
 	jbe	LABEL(ashr_9_exittail)
-# endif
+#endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -1518,13 +1482,13 @@ LABEL(gobble_ashr_10):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$10, %xmm3
 	pslldq	$6, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$10, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1534,10 +1498,10 @@ LABEL(gobble_ashr_10):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1549,13 +1513,13 @@ LABEL(gobble_ashr_10):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$10, %xmm3
 	pslldq	$6, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$10, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1565,10 +1529,10 @@ LABEL(gobble_ashr_10):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1581,10 +1545,10 @@ LABEL(nibble_ashr_10):
 	test	$0xfc00, %edx
 	jnz	LABEL(ashr_10_exittail)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	$6, %r11
 	jbe	LABEL(ashr_10_exittail)
-# endif
+#endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -1643,13 +1607,13 @@ LABEL(gobble_ashr_11):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$11, %xmm3
 	pslldq	$5, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$11, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1659,10 +1623,10 @@ LABEL(gobble_ashr_11):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1674,13 +1638,13 @@ LABEL(gobble_ashr_11):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$11, %xmm3
 	pslldq	$5, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$11, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1690,10 +1654,10 @@ LABEL(gobble_ashr_11):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1706,10 +1670,10 @@ LABEL(nibble_ashr_11):
 	test	$0xf800, %edx
 	jnz	LABEL(ashr_11_exittail)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	$5, %r11
 	jbe	LABEL(ashr_11_exittail)
-# endif
+#endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -1768,13 +1732,13 @@ LABEL(gobble_ashr_12):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$12, %xmm3
 	pslldq	$4, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$12, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1784,10 +1748,10 @@ LABEL(gobble_ashr_12):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1799,13 +1763,13 @@ LABEL(gobble_ashr_12):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$12, %xmm3
 	pslldq	$4, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$12, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1815,10 +1779,10 @@ LABEL(gobble_ashr_12):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1831,10 +1795,10 @@ LABEL(nibble_ashr_12):
 	test	$0xf000, %edx
 	jnz	LABEL(ashr_12_exittail)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	$4, %r11
 	jbe	LABEL(ashr_12_exittail)
-# endif
+#endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -1893,13 +1857,13 @@ LABEL(gobble_ashr_13):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$13, %xmm3
 	pslldq	$3, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$13, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1909,10 +1873,10 @@ LABEL(gobble_ashr_13):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1924,13 +1888,13 @@ LABEL(gobble_ashr_13):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$13, %xmm3
 	pslldq	$3, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$13, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1940,10 +1904,10 @@ LABEL(gobble_ashr_13):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1956,10 +1920,10 @@ LABEL(nibble_ashr_13):
 	test	$0xe000, %edx
 	jnz	LABEL(ashr_13_exittail)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	$3, %r11
 	jbe	LABEL(ashr_13_exittail)
-# endif
+#endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -2018,13 +1982,13 @@ LABEL(gobble_ashr_14):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$14, %xmm3
 	pslldq	$2, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$14, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -2034,10 +1998,10 @@ LABEL(gobble_ashr_14):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -2049,13 +2013,13 @@ LABEL(gobble_ashr_14):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$14, %xmm3
 	pslldq	$2, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$14, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -2065,10 +2029,10 @@ LABEL(gobble_ashr_14):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP | defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP | defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -2081,10 +2045,10 @@ LABEL(nibble_ashr_14):
 	test	$0xc000, %edx
 	jnz	LABEL(ashr_14_exittail)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	$2, %r11
 	jbe	LABEL(ashr_14_exittail)
-# endif
+#endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -2145,13 +2109,13 @@ LABEL(gobble_ashr_15):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$15, %xmm3
 	pslldq	$1, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$15, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -2161,10 +2125,10 @@ LABEL(gobble_ashr_15):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -2176,13 +2140,13 @@ LABEL(gobble_ashr_15):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$15, %xmm3
 	pslldq	$1, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$15, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -2192,10 +2156,10 @@ LABEL(gobble_ashr_15):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -2208,10 +2172,10 @@ LABEL(nibble_ashr_15):
 	test	$0x8000, %edx
 	jnz	LABEL(ashr_15_exittail)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmpq	$1, %r11
 	jbe	LABEL(ashr_15_exittail)
-# endif
+#endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -2246,18 +2210,18 @@ LABEL(ret):
 LABEL(less16bytes):
 	bsf	%rdx, %rdx		/* find and store bit index in %rdx */
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	%rdx, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 	movzbl	(%rsi, %rdx), %ecx
 	movzbl	(%rdi, %rdx), %eax
 
-# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 	leaq	_nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
 	movl	(%rdx,%rcx,4), %ecx
 	movl	(%rdx,%rax,4), %eax
-# endif
+#endif
 
 	sub	%ecx, %eax
 	ret
@@ -2271,11 +2235,11 @@ LABEL(Byte0):
 	movzx	(%rsi), %ecx
 	movzx	(%rdi), %eax
 
-# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 	leaq	_nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
 	movl	(%rdx,%rcx,4), %ecx
 	movl	(%rdx,%rax,4), %eax
-# endif
+#endif
 
 	sub	%ecx, %eax
 	ret
@@ -2300,5 +2264,4 @@ LABEL(unaligned_table):
 	.int	LABEL(ashr_14) - LABEL(unaligned_table)
 	.int	LABEL(ashr_15) - LABEL(unaligned_table)
 	.int	LABEL(ashr_0) - LABEL(unaligned_table)
-#endif /* !IS_IN (libc) */
 libc_hidden_builtin_def (STRCMP)
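
The fast path this commit enables in ld.so works as the comment above
says: compare 16 bytes at a time.  pcmpeqb against a zeroed register
marks NUL bytes, pcmpeqb between the two strings marks equal bytes,
and psubb combines the masks so pmovmskb yields 0xffff exactly when
all 16 bytes are equal and non-NUL (the "sub $0xffff, %edx; jnz"
test).  A rough C intrinsics rendering of the aligned case (a sketch
only, not glibc's code; it assumes both pointers are 16-byte aligned):

#include <emmintrin.h>

static int
strcmp_sse2_aligned (const char *p1, const char *p2)
{
  for (;;)
    {
      __m128i a = _mm_load_si128 ((const __m128i *) p1);
      __m128i b = _mm_load_si128 ((const __m128i *) p2);
      __m128i nul = _mm_cmpeq_epi8 (a, _mm_setzero_si128 ());
      __m128i eq = _mm_cmpeq_epi8 (a, b);
      /* A byte keeps its high bit only if it is equal and non-NUL,
	 mirroring the asm's pcmpeqb/pcmpeqb/psubb sequence.  */
      int mask = _mm_movemask_epi8 (_mm_sub_epi8 (eq, nul));
      if (mask != 0xffff)
	break;			/* mismatch or NUL in this block */
      p1 += 16;
      p2 += 16;
    }
  /* Resolve the final block byte by byte, as the asm's less16bytes
     path does with bsf.  */
  while (*p1 != '\0' && *p1 == *p2)
    p1++, p2++;
  return (unsigned char) *p1 - (unsigned char) *p2;
}

The real implementation adds the crosscache/ashr_N paths for
misaligned inputs, the 4K page-crossing guards, and the unsigned %r11
counter trick for the strncmp variants; the sketch skips all of that.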

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=969ca0cbde44e2c531f4fb4358da354e45a16895

commit 969ca0cbde44e2c531f4fb4358da354e45a16895
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Wed Jul 29 03:56:14 2015 -0700

    Remove x86-64 rtld-xxx.c and rtld-xxx.S
    
    Since ld.so preserves vector registers now, we can use the regular,
    non-ifunc string and memory functions in ld.so.
    
    	* sysdeps/x86_64/rtld-memcmp.c: Removed.
    	* sysdeps/x86_64/rtld-memset.S: Likewise.
    	* sysdeps/x86_64/rtld-strchr.S: Likewise.
    	* sysdeps/x86_64/rtld-strlen.S: Likewise.
    	* sysdeps/x86_64/multiarch/rtld-memcmp.c: Likewise.
    	* sysdeps/x86_64/multiarch/rtld-memset.S: Likewise.

diff --git a/sysdeps/x86_64/multiarch/rtld-memcmp.c b/sysdeps/x86_64/multiarch/rtld-memcmp.c
deleted file mode 100644
index 0f27135..0000000
--- a/sysdeps/x86_64/multiarch/rtld-memcmp.c
+++ /dev/null
@@ -1 +0,0 @@
-#include "../rtld-memcmp.c"
diff --git a/sysdeps/x86_64/multiarch/rtld-memset.S b/sysdeps/x86_64/multiarch/rtld-memset.S
deleted file mode 100644
index 8092aa0..0000000
--- a/sysdeps/x86_64/multiarch/rtld-memset.S
+++ /dev/null
@@ -1 +0,0 @@
-#include "../rtld-memset.S"
diff --git a/sysdeps/x86_64/rtld-memcmp.c b/sysdeps/x86_64/rtld-memcmp.c
deleted file mode 100644
index 2ee4032..0000000
--- a/sysdeps/x86_64/rtld-memcmp.c
+++ /dev/null
@@ -1 +0,0 @@
-#include <string/memcmp.c>
diff --git a/sysdeps/x86_64/rtld-memset.S b/sysdeps/x86_64/rtld-memset.S
deleted file mode 100644
index f8df333..0000000
--- a/sysdeps/x86_64/rtld-memset.S
+++ /dev/null
@@ -1,37 +0,0 @@
-/* memset implementation for the dynamic linker.  This is separate from the
-   libc implementation to avoid writing to SSE registers.
-   Copyright (C) 2013-2015 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-#include "asm-syntax.h"
-
-
-	.text
-/* void *memset (void *dest, char c, size_t count)
-   dest	 => %rdi
-   c	 => %rsi
-   count => %rdx  */
-ENTRY (memset)
-	mov	%rdx, %rcx
-	movzbl	%sil, %eax
-	mov	%rdi, %rdx
-	rep	stosb
-	mov	%rdx, %rax
-	ret
-END (memset)
-libc_hidden_builtin_def (memset)
diff --git a/sysdeps/x86_64/rtld-strchr.S b/sysdeps/x86_64/rtld-strchr.S
deleted file mode 100644
index cc694d7..0000000
--- a/sysdeps/x86_64/rtld-strchr.S
+++ /dev/null
@@ -1,288 +0,0 @@
-/* strchr (str, ch) -- Return pointer to first occurrence of CH in STR.
-   For AMD x86-64.
-   Copyright (C) 2002-2015 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-#include "asm-syntax.h"
-
-
-	.text
-ENTRY (strchr)
-
-	/* Before we start with the main loop we process single bytes
-	   until the source pointer is aligned.  This has two reasons:
-	   1. aligned 64-bit memory access is faster
-	   and (more important)
-	   2. we process in the main loop 64 bit in one step although
-	      we don't know the end of the string.  But accessing at
-	      8-byte alignment guarantees that we never access illegal
-	      memory if this would not also be done by the trivial
-	      implementation (this is because all processor inherent
-	      boundaries are multiples of 8).  */
-
-	movq	%rdi, %rdx
-	andl	$7, %edx	/* Mask alignment bits  */
-	movq	%rdi, %rax	/* duplicate destination.  */
-	jz	1f		/* aligned => start loop */
-	neg	%edx
-	addl	$8, %edx	/* Align to 8 bytes.  */
-
-	/* Search the first bytes directly.  */
-0:	movb	(%rax), %cl	/* load byte  */
-	cmpb	%cl,%sil	/* compare byte.  */
-	je	6f		/* target found */
-	testb	%cl,%cl		/* is byte NUL? */
-	je	7f		/* yes => return NULL */
-	incq	%rax		/* increment pointer */
-	decl	%edx
-	jnz	0b
-
-
-1:
-	/* At the moment %rsi contains C.  What we need for the
-	   algorithm is C in all bytes of the register.  Avoid
-	   operations on 16 bit words because these require an
-	   prefix byte (and one more cycle).  */
-	/* Populate 8 bit data to full 64-bit.  */
-	movabs	$0x0101010101010101,%r9
-	movzbl	%sil,%edx
-	imul	%rdx,%r9
-
-	movq $0xfefefefefefefeff, %r8 /* Save magic.  */
-
-      /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to
-	 change any of the hole bits of LONGWORD.
-
-	 1) Is this safe?  Will it catch all the zero bytes?
-	 Suppose there is a byte with all zeros.  Any carry bits
-	 propagating from its left will fall into the hole at its
-	 least significant bit and stop.  Since there will be no
-	 carry from its most significant bit, the LSB of the
-	 byte to the left will be unchanged, and the zero will be
-	 detected.
-
-	 2) Is this worthwhile?  Will it ignore everything except
-	 zero bytes?  Suppose every byte of QUADWORD has a bit set
-	 somewhere.  There will be a carry into bit 8.	If bit 8
-	 is set, this will carry into bit 16.  If bit 8 is clear,
-	 one of bits 9-15 must be set, so there will be a carry
-	 into bit 16.  Similarly, there will be a carry into bit
-	 24, etc.  If one of bits 54-63 is set, there will be a carry
-	 into bit 64 (=carry flag), so all of the hole bits will
-	 be changed.
-
-	 3) But wait!  Aren't we looking for C, not zero?
-	 Good point.  So what we do is XOR LONGWORD with a longword,
-	 each of whose bytes is C.  This turns each byte that is C
-	 into a zero.  */
-
-	.p2align 4
-4:
-	/* Main Loop is unrolled 4 times.  */
-	/* First unroll.  */
-	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
-	addq $8,%rax		/* adjust pointer for next word */
-	movq %r8, %rdx		/* magic value */
-	xorq %r9, %rcx		/* XOR with qword c|...|c => bytes of str == c
-				   are now 0 */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 3f			/* highest byte is NUL => return pointer */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jnz 3f			/* found c => return pointer */
-
-	/* The quadword we looked at does not contain the value we're looking
-	   for.  Let's search now whether we have reached the end of the
-	   string.  */
-	xorq %r9, %rcx		/* restore original dword without reload */
-	movq %r8, %rdx		/* magic value */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 7f			/* highest byte is NUL => return NULL */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jnz 7f			/* found NUL => return NULL */
-
-	/* Second unroll.  */
-	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
-	addq $8,%rax		/* adjust pointer for next word */
-	movq %r8, %rdx		/* magic value */
-	xorq %r9, %rcx		/* XOR with qword c|...|c => bytes of str == c
-				   are now 0 */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 3f			/* highest byte is NUL => return pointer */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jnz 3f			/* found c => return pointer */
-
-	/* The quadword we looked at does not contain the value we're looking
-	   for.  Let's search now whether we have reached the end of the
-	   string.  */
-	xorq %r9, %rcx		/* restore original dword without reload */
-	movq %r8, %rdx		/* magic value */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 7f			/* highest byte is NUL => return NULL */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jnz 7f			/* found NUL => return NULL */
-	/* Third unroll.  */
-	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
-	addq $8,%rax		/* adjust pointer for next word */
-	movq %r8, %rdx		/* magic value */
-	xorq %r9, %rcx		/* XOR with qword c|...|c => bytes of str == c
-				   are now 0 */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 3f			/* highest byte is NUL => return pointer */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jnz 3f			/* found c => return pointer */
-
-	/* The quadword we looked at does not contain the value we're looking
-	   for.  Let's search now whether we have reached the end of the
-	   string.  */
-	xorq %r9, %rcx		/* restore original dword without reload */
-	movq %r8, %rdx		/* magic value */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 7f			/* highest byte is NUL => return NULL */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jnz 7f			/* found NUL => return NULL */
-	/* Fourth unroll.  */
-	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
-	addq $8,%rax		/* adjust pointer for next word */
-	movq %r8, %rdx		/* magic value */
-	xorq %r9, %rcx		/* XOR with qword c|...|c => bytes of str == c
-				   are now 0 */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 3f			/* highest byte is NUL => return pointer */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jnz 3f			/* found c => return pointer */
-
-	/* The quadword we looked at does not contain the value we're looking
-	   for.  Let's search now whether we have reached the end of the
-	   string.  */
-	xorq %r9, %rcx		/* restore original dword without reload */
-	movq %r8, %rdx		/* magic value */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 7f			/* highest byte is NUL => return NULL */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jz 4b			/* no NUL found => restart loop */
-
-
-7:	/* Return NULL.  */
-	xorl %eax, %eax
-	retq
-
-
-	/* We now scan for the byte in which the character was matched.
-	   But we have to take care of the case that a NUL char is
-	   found before this in the dword.  Note that we XORed %rcx
-	   with the byte we're looking for, therefore the tests below look
-	   reversed.  */
-
-
-	.p2align 4		/* Align, it's a jump target.  */
-3:	movq	%r9,%rdx	/* move to %rdx so that we can access bytes */
-	subq	$8,%rax		/* correct pointer increment.  */
-	testb %cl, %cl		/* is first byte C? */
-	jz 6f			/* yes => return pointer */
-	cmpb %dl, %cl		/* is first byte NUL? */
-	je 7b			/* yes => return NULL */
-	incq %rax		/* increment pointer */
-
-	testb %ch, %ch		/* is second byte C? */
-	jz 6f			/* yes => return pointer */
-	cmpb %dl, %ch		/* is second byte NUL? */
-	je 7b			/* yes => return NULL? */
-	incq %rax		/* increment pointer */
-
-	shrq $16, %rcx		/* make upper bytes accessible */
-	testb %cl, %cl		/* is third byte C? */
-	jz 6f			/* yes => return pointer */
-	cmpb %dl, %cl		/* is third byte NUL? */
-	je 7b			/* yes => return NULL */
-	incq %rax		/* increment pointer */
-
-	testb %ch, %ch		/* is fourth byte C? */
-	jz 6f			/* yes => return pointer */
-	cmpb %dl, %ch		/* is fourth byte NUL? */
-	je 7b			/* yes => return NULL? */
-	incq %rax		/* increment pointer */
-
-	shrq $16, %rcx		/* make upper bytes accessible */
-	testb %cl, %cl		/* is fifth byte C? */
-	jz 6f			/* yes => return pointer */
-	cmpb %dl, %cl		/* is fifth byte NUL? */
-	je 7b			/* yes => return NULL */
-	incq %rax		/* increment pointer */
-
-	testb %ch, %ch		/* is sixth byte C? */
-	jz 6f			/* yes => return pointer */
-	cmpb %dl, %ch		/* is sixth byte NUL? */
-	je 7b			/* yes => return NULL? */
-	incq %rax		/* increment pointer */
-
-	shrq $16, %rcx		/* make upper bytes accessible */
-	testb %cl, %cl		/* is seventh byte C? */
-	jz 6f			/* yes => return pointer */
-	cmpb %dl, %cl		/* is seventh byte NUL? */
-	je 7b			/* yes => return NULL */
-
-	/* It must be in the eighth byte and it cannot be NUL.  */
-	incq %rax
-
-6:
-	nop
-	retq
-END (strchr)
-
-weak_alias (strchr, index)
-libc_hidden_builtin_def (strchr)
diff --git a/sysdeps/x86_64/rtld-strlen.S b/sysdeps/x86_64/rtld-strlen.S
deleted file mode 100644
index 1328652..0000000
--- a/sysdeps/x86_64/rtld-strlen.S
+++ /dev/null
@@ -1,136 +0,0 @@
-/* strlen(str) -- determine the length of the string STR.
-   Copyright (C) 2002-2015 Free Software Foundation, Inc.
-   Based on i486 version contributed by Ulrich Drepper <drepper@redhat.com>.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-#include "asm-syntax.h"
-
-
-	.text
-ENTRY (strlen)
-	movq %rdi, %rcx		/* Duplicate source pointer. */
-	andl $7, %ecx		/* mask alignment bits */
-	movq %rdi, %rax		/* duplicate destination.  */
-	jz 1f			/* aligned => start loop */
-
-	neg %ecx		/* We need to align to 8 bytes.  */
-	addl $8,%ecx
-	/* Search the first bytes directly.  */
-0:	cmpb $0x0,(%rax)	/* is byte NUL? */
-	je 2f			/* yes => return */
-	incq %rax		/* increment pointer */
-	decl %ecx
-	jnz 0b
-
-1:	movq $0xfefefefefefefeff,%r8 /* Save magic.  */
-
-	.p2align 4		/* Align loop.  */
-4:	/* Main Loop is unrolled 4 times.  */
-	/* First unroll.  */
-	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
-	addq $8,%rax		/* adjust pointer for next word */
-	movq %r8, %rdx		/* magic value */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 3f			/* highest byte is NUL => return pointer */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jnz 3f			/* found NUL => return pointer */
-
-	/* Second unroll.  */
-	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
-	addq $8,%rax		/* adjust pointer for next word */
-	movq %r8, %rdx		/* magic value */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 3f			/* highest byte is NUL => return pointer */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jnz 3f			/* found NUL => return pointer */
-
-	/* Third unroll.  */
-	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
-	addq $8,%rax		/* adjust pointer for next word */
-	movq %r8, %rdx		/* magic value */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 3f			/* highest byte is NUL => return pointer */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jnz 3f			/* found NUL => return pointer */
-
-	/* Fourth unroll.  */
-	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
-	addq $8,%rax		/* adjust pointer for next word */
-	movq %r8, %rdx		/* magic value */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 3f			/* highest byte is NUL => return pointer */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jz 4b			/* no NUL found => continue loop */
-
-	.p2align 4		/* Align, it's a jump target.  */
-3:	subq $8,%rax		/* correct pointer increment.  */
-
-	testb %cl, %cl		/* is first byte NUL? */
-	jz 2f			/* yes => return */
-	incq %rax		/* increment pointer */
-
-	testb %ch, %ch		/* is second byte NUL? */
-	jz 2f			/* yes => return */
-	incq %rax		/* increment pointer */
-
-	testl $0x00ff0000, %ecx /* is third byte NUL? */
-	jz 2f			/* yes => return pointer */
-	incq %rax		/* increment pointer */
-
-	testl $0xff000000, %ecx /* is fourth byte NUL? */
-	jz 2f			/* yes => return pointer */
-	incq %rax		/* increment pointer */
-
-	shrq $32, %rcx		/* look at other half.  */
-
-	testb %cl, %cl		/* is first byte NUL? */
-	jz 2f			/* yes => return */
-	incq %rax		/* increment pointer */
-
-	testb %ch, %ch		/* is second byte NUL? */
-	jz 2f			/* yes => return */
-	incq %rax		/* increment pointer */
-
-	testl $0xff0000, %ecx	/* is third byte NUL? */
-	jz 2f			/* yes => return pointer */
-	incq %rax		/* increment pointer */
-2:
-	subq %rdi, %rax		/* compute difference to string start */
-	ret
-END (strlen)
-libc_hidden_builtin_def (strlen)
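
Both removed scalar implementations rely on the carry-propagation
trick documented in their comments: adding the magic constant
0xfefefefefefefeff makes every non-zero byte carry into the "hole"
bit above it, so a hole bit that did not flip pinpoints a zero byte.
A C sketch of the test (illustrative; the deleted assembly above is
the authoritative version):

#include <stdint.h>

/* Non-zero iff some byte of WORD is 0.  strchr first XORs WORD with
   the search character replicated into every byte, turning matching
   bytes into zeros, then applies the same test.  */
static inline int
has_zero_byte (uint64_t word)
{
  const uint64_t magic = 0xfefefefefefefeffULL;
  uint64_t sum = word + magic;

  if (sum >= word)	/* no carry out of bit 63 (the asm's jnc) */
    return 1;
  /* ORing with MAGIC sets every non-hole bit; adding 1 then gives 0
     only if every hole bit carried, i.e. no byte was 0.  */
  return (((sum ^ word) | magic) + 1) != 0;
}

With ld.so now preserving vector registers, these byte- and word-wise
fallbacks can go, and the dynamic linker uses the regular SSE
implementations instead.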

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=c245d87cf4754a8553b5284206da1d15f1fbb7dc

commit c245d87cf4754a8553b5284206da1d15f1fbb7dc
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Wed Jul 29 03:47:54 2015 -0700

    Replace %xmm8 with %xmm0
    
    Since ld.so preserves vector registers now, we can use %xmm0 to avoid
    the REX prefix.
    
    	* sysdeps/x86_64/memset.S: Replace %xmm8 with %xmm0.

diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
index e496254..3855cc8 100644
--- a/sysdeps/x86_64/memset.S
+++ b/sysdeps/x86_64/memset.S
@@ -24,7 +24,7 @@
 ENTRY(__bzero)
 	movq	%rdi, %rax /* Set return value.  */
 	movq	%rsi, %rdx /* Set n.  */
-	pxor	%xmm8, %xmm8
+	pxor	%xmm0, %xmm0
 	jmp	L(entry_from_bzero)
 END(__bzero)
 weak_alias (__bzero, bzero)
@@ -33,10 +33,10 @@ weak_alias (__bzero, bzero)
 ENTRY(__memset_tail)
 	movq	%rcx, %rax /* Set return value.  */
 
-	movd	%esi, %xmm8
-	punpcklbw	%xmm8, %xmm8
-	punpcklwd	%xmm8, %xmm8
-	pshufd	$0, %xmm8, %xmm8
+	movd	%esi, %xmm0
+	punpcklbw	%xmm0, %xmm0
+	punpcklwd	%xmm0, %xmm0
+	pshufd	$0, %xmm0, %xmm0
 
 	jmp	L(entry_from_bzero)
 END(__memset_tail)
@@ -50,57 +50,57 @@ END_CHK (__memset_chk)
 #endif
 
 ENTRY (memset)
-	movd	%esi, %xmm8
+	movd	%esi, %xmm0
 	movq	%rdi, %rax
-	punpcklbw	%xmm8, %xmm8
-	punpcklwd	%xmm8, %xmm8
-	pshufd	$0, %xmm8, %xmm8
+	punpcklbw	%xmm0, %xmm0
+	punpcklwd	%xmm0, %xmm0
+	pshufd	$0, %xmm0, %xmm0
 L(entry_from_bzero):
 	cmpq	$64, %rdx
 	ja	L(loop_start)
 	cmpq	$16, %rdx
 	jbe	L(less_16_bytes)
 	cmpq	$32, %rdx
-	movdqu	%xmm8, (%rdi)
-	movdqu	%xmm8, -16(%rdi,%rdx)
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm0, -16(%rdi,%rdx)
 	ja	L(between_32_64_bytes)
 L(return):
 	rep
 	ret
 	.p2align 4
 L(between_32_64_bytes):
-	movdqu	%xmm8, 16(%rdi)
-	movdqu	%xmm8, -32(%rdi,%rdx)
+	movdqu	%xmm0, 16(%rdi)
+	movdqu	%xmm0, -32(%rdi,%rdx)
 	ret
 	.p2align 4
 L(loop_start):
 	leaq	64(%rdi), %rcx
-	movdqu	%xmm8, (%rdi)
+	movdqu	%xmm0, (%rdi)
 	andq	$-64, %rcx
-	movdqu	%xmm8, -16(%rdi,%rdx)
-	movdqu	%xmm8, 16(%rdi)
-	movdqu	%xmm8, -32(%rdi,%rdx)
-	movdqu	%xmm8, 32(%rdi)
-	movdqu	%xmm8, -48(%rdi,%rdx)
-	movdqu	%xmm8, 48(%rdi)
-	movdqu	%xmm8, -64(%rdi,%rdx)
+	movdqu	%xmm0, -16(%rdi,%rdx)
+	movdqu	%xmm0, 16(%rdi)
+	movdqu	%xmm0, -32(%rdi,%rdx)
+	movdqu	%xmm0, 32(%rdi)
+	movdqu	%xmm0, -48(%rdi,%rdx)
+	movdqu	%xmm0, 48(%rdi)
+	movdqu	%xmm0, -64(%rdi,%rdx)
 	addq	%rdi, %rdx
 	andq	$-64, %rdx
 	cmpq	%rdx, %rcx
 	je	L(return)
 	.p2align 4
 L(loop):
-	movdqa	%xmm8, (%rcx)
-	movdqa	%xmm8, 16(%rcx)
-	movdqa	%xmm8, 32(%rcx)
-	movdqa	%xmm8, 48(%rcx)
+	movdqa	%xmm0, (%rcx)
+	movdqa	%xmm0, 16(%rcx)
+	movdqa	%xmm0, 32(%rcx)
+	movdqa	%xmm0, 48(%rcx)
 	addq	$64, %rcx
 	cmpq	%rcx, %rdx
 	jne	L(loop)
 	rep
 	ret
 L(less_16_bytes):
-	movq %xmm8, %rcx
+	movq %xmm0, %rcx
 	testb	$24, %dl
 	jne	L(between8_16bytes)
 	testb	$4, %dl

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=1cb03e05847b1e8e68a06473f0fedf79ecf49005

commit 1cb03e05847b1e8e68a06473f0fedf79ecf49005
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Wed Jul 29 03:44:39 2015 -0700

    Replace %xmm[8-12] with %xmm[0-4]
    
    Since ld.so preserves vector registers now, we can use %xmm[0-4] to
    avoid the REX prefix.
    
    	* sysdeps/x86_64/strlen.S: Replace %xmm[8-12] with %xmm[0-4].
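
The FIND_ZERO macro in this file is the pcmpeqb/pmovmskb idiom, which maps one-to-one onto SSE2 intrinsics.  A minimal C sketch of a single 16-byte step (a hypothetical helper, for illustration):

  #include <emmintrin.h>

  /* Compare 16 bytes against zero and collapse the result into a
     16-bit mask, exactly what one pcmpeqb/pmovmskb pair does.  A
     nonzero return means the block contains a NUL; counting the
     trailing zero bits of the mask gives its offset, like the bsf
     instructions in the surrounding code.  */
  static int
  zero_byte_mask (const void *p)
  {
    __m128i zero = _mm_setzero_si128 ();               /* pxor     */
    __m128i v = _mm_load_si128 ((const __m128i *) p);  /* movdqa   */
    __m128i eq = _mm_cmpeq_epi8 (v, zero);             /* pcmpeqb  */
    return _mm_movemask_epi8 (eq);                     /* pmovmskb */
  }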

diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
index c382c8d..0725333 100644
--- a/sysdeps/x86_64/strlen.S
+++ b/sysdeps/x86_64/strlen.S
@@ -20,7 +20,7 @@
 
 /* Long lived register in strlen(s), strnlen(s, n) are:
 
-	%xmm11 - zero
+	%xmm3 - zero
 	%rdi   - s
 	%r10  (s+n) & (~(64-1))
 	%r11   s+n
@@ -32,14 +32,14 @@ ENTRY(strlen)
 
 /* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx.  */
 #define FIND_ZERO	\
-	pcmpeqb	(%rax), %xmm8;	\
-	pcmpeqb	16(%rax), %xmm9;	\
-	pcmpeqb	32(%rax), %xmm10;	\
-	pcmpeqb	48(%rax), %xmm11;	\
-	pmovmskb	%xmm8, %esi;	\
-	pmovmskb	%xmm9, %edx;	\
-	pmovmskb	%xmm10, %r8d;	\
-	pmovmskb	%xmm11, %ecx;	\
+	pcmpeqb	(%rax), %xmm0;	\
+	pcmpeqb	16(%rax), %xmm1;	\
+	pcmpeqb	32(%rax), %xmm2;	\
+	pcmpeqb	48(%rax), %xmm3;	\
+	pmovmskb	%xmm0, %esi;	\
+	pmovmskb	%xmm1, %edx;	\
+	pmovmskb	%xmm2, %r8d;	\
+	pmovmskb	%xmm3, %ecx;	\
 	salq	$16, %rdx;	\
 	salq	$16, %rcx;	\
 	orq	%rsi, %rdx;	\
@@ -63,10 +63,10 @@ L(n_nonzero):
 	mov	%rsi, %r11
 #endif
 
-	pxor	%xmm8, %xmm8
-	pxor	%xmm9, %xmm9
-	pxor	%xmm10, %xmm10
-	pxor	%xmm11, %xmm11
+	pxor	%xmm0, %xmm0
+	pxor	%xmm1, %xmm1
+	pxor	%xmm2, %xmm2
+	pxor	%xmm3, %xmm3
 	movq	%rdi, %rax
 	movq	%rdi, %rcx
 	andq	$4095, %rcx
@@ -103,9 +103,9 @@ L(n_nonzero):
 	FIND_ZERO
 #else
 	/* Test first 16 bytes unaligned.  */
-	movdqu	(%rax), %xmm12
-	pcmpeqb	%xmm8, %xmm12
-	pmovmskb	%xmm12, %edx
+	movdqu	(%rax), %xmm4
+	pcmpeqb	%xmm0, %xmm4
+	pmovmskb	%xmm4, %edx
 	test	%edx, %edx
 	je 	L(next48_bytes)
 	bsf	%edx, %eax /* If eax is zeroed 16bit bsf can be used.  */
@@ -114,12 +114,12 @@ L(n_nonzero):
 L(next48_bytes):
 /* Same as FIND_ZERO except we do not check first 16 bytes.  */
 	andq	$-16, %rax
-	pcmpeqb 16(%rax), %xmm9
-	pcmpeqb 32(%rax), %xmm10
-	pcmpeqb 48(%rax), %xmm11
-	pmovmskb	%xmm9, %edx
-	pmovmskb	%xmm10, %r8d
-	pmovmskb	%xmm11, %ecx
+	pcmpeqb 16(%rax), %xmm1
+	pcmpeqb 32(%rax), %xmm2
+	pcmpeqb 48(%rax), %xmm3
+	pmovmskb	%xmm1, %edx
+	pmovmskb	%xmm2, %r8d
+	pmovmskb	%xmm3, %ecx
 	salq	$16, %rdx
 	salq	$16, %rcx
 	orq	%r8, %rcx
@@ -127,7 +127,7 @@ L(next48_bytes):
 	orq	%rcx, %rdx
 #endif
 
-	/* When no zero byte is found xmm9-11 are zero so we do not have to
+	/* When no zero byte is found xmm1-3 are zero so we do not have to
 	   zero them.  */
 	PROLOG(loop)
 
@@ -149,9 +149,9 @@ L(strnlen_ret):
 #endif
 	.p2align 4
 L(loop_init):
-	pxor	%xmm9, %xmm9
-	pxor	%xmm10, %xmm10
-	pxor	%xmm11, %xmm11
+	pxor	%xmm1, %xmm1
+	pxor	%xmm2, %xmm2
+	pxor	%xmm3, %xmm3
 #ifdef AS_STRNLEN
 	.p2align 4
 L(loop):
@@ -160,12 +160,12 @@ L(loop):
 	cmpq	%rax, %r10
 	je	L(exit_end)
 
-	movdqa	(%rax), %xmm8
-	pminub	16(%rax), %xmm8
-	pminub	32(%rax), %xmm8
-	pminub	48(%rax), %xmm8
-	pcmpeqb	%xmm11, %xmm8
-	pmovmskb	%xmm8, %edx
+	movdqa	(%rax), %xmm0
+	pminub	16(%rax), %xmm0
+	pminub	32(%rax), %xmm0
+	pminub	48(%rax), %xmm0
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb	%xmm0, %edx
 	testl	%edx, %edx
 	jne	L(exit)
 	jmp	L(loop)
@@ -174,7 +174,7 @@ L(loop):
 L(exit_end):
 	cmp	%rax, %r11
 	je	L(first) /* Do not read when end is at page boundary.  */
-	pxor	%xmm8, %xmm8
+	pxor	%xmm0, %xmm0
 	FIND_ZERO
 
 L(first):
@@ -186,7 +186,7 @@ L(first):
 
 	.p2align 4
 L(exit):
-	pxor	%xmm8, %xmm8
+	pxor	%xmm0, %xmm0
 	FIND_ZERO
 
 	bsfq	%rdx, %rdx
@@ -200,23 +200,23 @@ L(exit):
 	.p2align 4
 L(loop):
 
-	movdqa	64(%rax), %xmm8
-	pminub	80(%rax), %xmm8
-	pminub	96(%rax), %xmm8
-	pminub	112(%rax), %xmm8
-	pcmpeqb	%xmm11, %xmm8
-	pmovmskb	%xmm8, %edx
+	movdqa	64(%rax), %xmm0
+	pminub	80(%rax), %xmm0
+	pminub	96(%rax), %xmm0
+	pminub	112(%rax), %xmm0
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb	%xmm0, %edx
 	testl	%edx, %edx
 	jne	L(exit64)
 
 	subq	$-128, %rax
 
-	movdqa	(%rax), %xmm8
-	pminub	16(%rax), %xmm8
-	pminub	32(%rax), %xmm8
-	pminub	48(%rax), %xmm8
-	pcmpeqb	%xmm11, %xmm8
-	pmovmskb	%xmm8, %edx
+	movdqa	(%rax), %xmm0
+	pminub	16(%rax), %xmm0
+	pminub	32(%rax), %xmm0
+	pminub	48(%rax), %xmm0
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb	%xmm0, %edx
 	testl	%edx, %edx
 	jne	L(exit0)
 	jmp	L(loop)
@@ -225,7 +225,7 @@ L(loop):
 L(exit64):
 	addq	$64, %rax
 L(exit0):
-	pxor	%xmm8, %xmm8
+	pxor	%xmm0, %xmm0
 	FIND_ZERO
 
 	bsfq	%rdx, %rdx

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=0b92f51d8303a5148aa99dc101d1e73244199a61

commit 0b92f51d8303a5148aa99dc101d1e73244199a61
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Tue Jul 28 18:56:18 2015 -0700

    Don't disable SSE in x86-64 ld.so
    
    Since ld.so preserves vector registers now, we can use SSE in ld.so.
    
    	* sysdeps/i386/Makefile [$(subdir) == elf] (CFLAGS-.os): Add
    	-mno-sse -mno-mmx for $(all-rtld-routines).
    	[$(subdir) == elf] (tests-special): Add
    	$(objpfx)tst-ld-sse-use.out.
    	[$(subdir) == elf] ($(objpfx)tst-ld-sse-use.out): New rule.
    	* sysdeps/x86/Makefile [$(subdir) == elf] (CFLAGS-.os): Removed.
    	[$(subdir) == elf] (tests-special): Likewise.
    	[$(subdir) == elf] ($(objpfx)tst-ld-sse-use.out): Likewise.
    	* sysdeps/x86_64/Makefile [$(subdir) == elf] (CFLAGS-.os): Add
    	-mno-mmx for $(all-rtld-routines).
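
The flag exists because the compiler will otherwise use vector registers for ordinary C.  Before the trampolines preserved vector registers, any such use inside ld.so could clobber xmm state that was live in the application during lazy PLT resolution, which is why the guard stays on i386 while x86_64 now only needs -mno-mmx.  A sketch of the kind of code that makes this visible (illustration only):

  /* On x86-64 this compiles to SSE code by default (and -O3
     typically vectorizes the loop as well); under -mno-sse the
     compiler must fall back to x87, and functions passing or
     returning double are rejected outright.  */
  static void
  add_one (double *x, unsigned long n)
  {
    for (unsigned long i = 0; i < n; i++)
      x[i] += 1.0;
  }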

diff --git a/sysdeps/i386/Makefile b/sysdeps/i386/Makefile
index 9f12190..53e8ab9 100644
--- a/sysdeps/i386/Makefile
+++ b/sysdeps/i386/Makefile
@@ -81,3 +81,14 @@ endif
 ifeq ($(subdir),csu)
 gen-as-const-headers += tlsdesc.sym
 endif
+
+ifeq ($(subdir),elf)
+CFLAGS-.os += $(if $(filter $(@F),$(patsubst %,%.os,$(all-rtld-routines))),\
+		   -mno-sse -mno-mmx)
+
+tests-special += $(objpfx)tst-ld-sse-use.out
+$(objpfx)tst-ld-sse-use.out: ../sysdeps/x86/tst-ld-sse-use.sh $(objpfx)ld.so
+	@echo "Checking ld.so for SSE register use.  This will take a few seconds..."
+	$(BASH) $< $(objpfx) '$(NM)' '$(OBJDUMP)' '$(READELF)' > $@; \
+	$(evaluate-test)
+endif
diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
index 74d2b17..0fabdae 100644
--- a/sysdeps/x86/Makefile
+++ b/sysdeps/x86/Makefile
@@ -1,14 +1,3 @@
-ifeq ($(subdir),elf)
-CFLAGS-.os += $(if $(filter $(@F),$(patsubst %,%.os,$(all-rtld-routines))),\
-		   -mno-sse -mno-mmx)
-
-tests-special += $(objpfx)tst-ld-sse-use.out
-$(objpfx)tst-ld-sse-use.out: ../sysdeps/x86/tst-ld-sse-use.sh $(objpfx)ld.so
-	@echo "Checking ld.so for SSE register use.  This will take a few seconds..."
-	$(BASH) $< $(objpfx) '$(NM)' '$(OBJDUMP)' '$(READELF)' > $@; \
-	$(evaluate-test)
-endif
-
 ifeq ($(subdir),csu)
 gen-as-const-headers += cpu-features-offsets.sym rtld-global-offsets.sym
 endif
diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile
index 5cf79e6..859e6a9 100644
--- a/sysdeps/x86_64/Makefile
+++ b/sysdeps/x86_64/Makefile
@@ -19,6 +19,9 @@ gen-as-const-headers += locale-defines.sym
 endif
 
 ifeq ($(subdir),elf)
+CFLAGS-.os += $(if $(filter $(@F),$(patsubst %,%.os,$(all-rtld-routines))),\
+		   -mno-mmx)
+
 sysdep-dl-routines += tlsdesc dl-tlsdesc
 sysdep_routines += tlsdesc dl-tlsdesc
 sysdep-rtld-routines += tlsdesc dl-tlsdesc

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=00c7c14c2131849983307800f5917c32b58d29d9

commit 00c7c14c2131849983307800f5917c32b58d29d9
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Sat Jul 11 13:25:25 2015 -0700

    Save and restore vector registers in x86-64 ld.so
    
    This patch initializes GLRO(dl_x86_xstate) in dl_platform_init to
    indicate if the processor supports SSE, AVX or AVX512.  It uses
    this information to properly save and restore vector registers in
    ld.so.  Now we can use SSE in ld.so and delete FOREIGN_CALL macros.
    
    	[BZ #15128]
    	* sysdeps/x86_64/Makefile [$(subdir) == elf] (tests): Add
    	ifuncmain8.
    	(modules-names): Add ifuncmod8.
    	($(objpfx)ifuncmain8): New rule.
    	* sysdeps/x86_64/dl-machine.h: Include <dl-procinfo.h> and
    	<cpuid.h>.
    	(elf_machine_runtime_setup): Use _dl_runtime_resolve_sse,
    	_dl_runtime_resolve_avx, or _dl_runtime_resolve_avx512,
    	_dl_runtime_profile_sse, _dl_runtime_profile_avx, or
    	_dl_runtime_profile_avx512, based on HAS_ARCH_FEATURE.
    	* sysdeps/x86_64/dl-trampoline.S: Rewrite.
    	* sysdeps/x86_64/dl-trampoline.h: Likewise.
    	* sysdeps/x86_64/ifuncmain8.c: New file.
    	* sysdeps/x86_64/ifuncmod8.c: Likewise.
    	* sysdeps/x86_64/nptl/tcb-offsets.sym (RTLD_SAVESPACE_SSE):
    	Removed.
    	* sysdeps/x86_64/nptl/tls.h (__128bits): Removed.
    	(tcbhead_t): Change rtld_must_xmm_save to __glibc_unused1.
    	Change rtld_savespace_sse to __glibc_unused2.
    	(RTLD_CHECK_FOREIGN_CALL): Removed.
    	(RTLD_ENABLE_FOREIGN_CALL): Likewise.
    	(RTLD_PREPARE_FOREIGN_CALL): Likewise.
    	(RTLD_FINALIZE_FOREIGN_CALL): Likewise.
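
A hedged C sketch of the dispatch this commit adds to elf_machine_runtime_setup, using GCC's __builtin_cpu_supports in place of glibc's HAS_ARCH_FEATURE and with the trampoline prototypes simplified (the real AVX512F_Usable/AVX_Usable bits also verify that the OS enabled the state through XCR0):

  #include <elf.h>

  extern void _dl_runtime_resolve_sse (void);
  extern void _dl_runtime_resolve_avx (void);
  extern void _dl_runtime_resolve_avx512 (void);

  /* Pick the trampoline that saves the widest vector state the
     processor actually has, so lazy binding clobbers nothing.  */
  static Elf64_Addr
  select_resolver (void)
  {
    if (__builtin_cpu_supports ("avx512f"))
      return (Elf64_Addr) &_dl_runtime_resolve_avx512;
    if (__builtin_cpu_supports ("avx"))
      return (Elf64_Addr) &_dl_runtime_resolve_avx;
    return (Elf64_Addr) &_dl_runtime_resolve_sse;
  }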

diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile
index 32b36d5..5cf79e6 100644
--- a/sysdeps/x86_64/Makefile
+++ b/sysdeps/x86_64/Makefile
@@ -23,6 +23,11 @@ sysdep-dl-routines += tlsdesc dl-tlsdesc
 sysdep_routines += tlsdesc dl-tlsdesc
 sysdep-rtld-routines += tlsdesc dl-tlsdesc
 
+tests += ifuncmain8
+modules-names += ifuncmod8
+
+$(objpfx)ifuncmain8: $(objpfx)ifuncmod8.so
+
 tests += tst-quad1 tst-quad2
 modules-names += tst-quadmod1 tst-quadmod2
 
diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h
index 3fa5e61..03a6625 100644
--- a/sysdeps/x86_64/dl-machine.h
+++ b/sysdeps/x86_64/dl-machine.h
@@ -66,8 +66,12 @@ static inline int __attribute__ ((unused, always_inline))
 elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
 {
   Elf64_Addr *got;
-  extern void _dl_runtime_resolve (ElfW(Word)) attribute_hidden;
-  extern void _dl_runtime_profile (ElfW(Word)) attribute_hidden;
+  extern void _dl_runtime_resolve_sse (ElfW(Word)) attribute_hidden;
+  extern void _dl_runtime_resolve_avx (ElfW(Word)) attribute_hidden;
+  extern void _dl_runtime_resolve_avx512 (ElfW(Word)) attribute_hidden;
+  extern void _dl_runtime_profile_sse (ElfW(Word)) attribute_hidden;
+  extern void _dl_runtime_profile_avx (ElfW(Word)) attribute_hidden;
+  extern void _dl_runtime_profile_avx512 (ElfW(Word)) attribute_hidden;
 
   if (l->l_info[DT_JMPREL] && lazy)
     {
@@ -95,7 +99,12 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
 	 end in this function.  */
       if (__glibc_unlikely (profile))
 	{
-	  *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile;
+	  if (HAS_ARCH_FEATURE (AVX512F_Usable))
+	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx512;
+	  else if (HAS_ARCH_FEATURE (AVX_Usable))
+	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx;
+	  else
+	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_sse;
 
 	  if (GLRO(dl_profile) != NULL
 	      && _dl_name_match_p (GLRO(dl_profile), l))
@@ -104,9 +113,17 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
 	    GL(dl_profile_map) = l;
 	}
       else
-	/* This function will get called to fix up the GOT entry indicated by
-	   the offset on the stack, and then jump to the resolved address.  */
-	*(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_resolve;
+	{
+	  /* This function will get called to fix up the GOT entry
+	     indicated by the offset on the stack, and then jump to
+	     the resolved address.  */
+	  if (HAS_ARCH_FEATURE (AVX512F_Usable))
+	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_resolve_avx512;
+	  else if (HAS_ARCH_FEATURE (AVX_Usable))
+	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_resolve_avx;
+	  else
+	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_resolve_sse;
+	}
     }
 
   if (l->l_info[ADDRIDX (DT_TLSDESC_GOT)] && lazy)
diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
index 678c57f..8475d26 100644
--- a/sysdeps/x86_64/dl-trampoline.S
+++ b/sysdeps/x86_64/dl-trampoline.S
@@ -20,23 +20,40 @@
 #include <sysdep.h>
 #include <link-defines.h>
 
-#if (RTLD_SAVESPACE_SSE % 32) != 0
-# error RTLD_SAVESPACE_SSE must be aligned to 32 bytes
+#ifndef DL_STACK_ALIGNMENT
+/* Due to GCC bug:
+
+   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
+
+   __tls_get_addr may be called with 8-byte stack alignment.  Although
+   this bug has been fixed in GCC 4.9.4, 5.3 and 6, we can't assume
+   that the stack will always be aligned at 16 bytes.  We use unaligned
+   16-byte moves to load and store SSE registers, which have no penalty
+   on modern processors if the stack is 16-byte aligned.  */
+# define DL_STACK_ALIGNMENT 8
+#endif
+
+#ifndef DL_RUNIME_UNALIGNED_VEC_SIZE
+/* The maximum size of unaligned vector load and store.  */
+# define DL_RUNIME_UNALIGNED_VEC_SIZE 16
 #endif
 
+/* True if _dl_runtime_resolve should align stack to VEC_SIZE bytes.  */
+#define DL_RUNIME_RESOLVE_REALIGN_STACK \
+  (VEC_SIZE > DL_STACK_ALIGNMENT \
+   && VEC_SIZE > DL_RUNIME_UNALIGNED_VEC_SIZE)
+
+/* Align vector register save area to 16 bytes.  */
+#define REGISTER_SAVE_VEC_OFF	0
+
 /* Area on stack to save and restore registers used for parameter
    passing when calling _dl_fixup.  */
 #ifdef __ILP32__
-/* X32 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX.  */
-# define REGISTER_SAVE_AREA	(8 * 7)
-# define REGISTER_SAVE_RAX	0
+# define REGISTER_SAVE_RAX	(REGISTER_SAVE_VEC_OFF + VEC_SIZE * 8)
 # define PRESERVE_BND_REGS_PREFIX
 #else
-/* X86-64 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as BND0,
-   BND1, BND2, BND3.  */
-# define REGISTER_SAVE_AREA	(8 * 7 + 16 * 4)
 /* Align bound register save area to 16 bytes.  */
-# define REGISTER_SAVE_BND0	0
+# define REGISTER_SAVE_BND0	(REGISTER_SAVE_VEC_OFF + VEC_SIZE * 8)
 # define REGISTER_SAVE_BND1	(REGISTER_SAVE_BND0 + 16)
 # define REGISTER_SAVE_BND2	(REGISTER_SAVE_BND1 + 16)
 # define REGISTER_SAVE_BND3	(REGISTER_SAVE_BND2 + 16)
@@ -54,386 +71,53 @@
 #define REGISTER_SAVE_R8	(REGISTER_SAVE_RDI + 8)
 #define REGISTER_SAVE_R9	(REGISTER_SAVE_R8 + 8)
 
-	.text
-	.globl _dl_runtime_resolve
-	.type _dl_runtime_resolve, @function
-	.align 16
-	cfi_startproc
-_dl_runtime_resolve:
-	cfi_adjust_cfa_offset(16) # Incorporate PLT
-	subq $REGISTER_SAVE_AREA,%rsp
-	cfi_adjust_cfa_offset(REGISTER_SAVE_AREA)
-	# Preserve registers otherwise clobbered.
-	movq %rax, REGISTER_SAVE_RAX(%rsp)
-	movq %rcx, REGISTER_SAVE_RCX(%rsp)
-	movq %rdx, REGISTER_SAVE_RDX(%rsp)
-	movq %rsi, REGISTER_SAVE_RSI(%rsp)
-	movq %rdi, REGISTER_SAVE_RDI(%rsp)
-	movq %r8, REGISTER_SAVE_R8(%rsp)
-	movq %r9, REGISTER_SAVE_R9(%rsp)
-#ifndef __ILP32__
-	# We also have to preserve bound registers.  These are nops if
-	# Intel MPX isn't available or disabled.
-# ifdef HAVE_MPX_SUPPORT
-	bndmov %bnd0, REGISTER_SAVE_BND0(%rsp)
-	bndmov %bnd1, REGISTER_SAVE_BND1(%rsp)
-	bndmov %bnd2, REGISTER_SAVE_BND2(%rsp)
-	bndmov %bnd3, REGISTER_SAVE_BND3(%rsp)
-# else
-#  if REGISTER_SAVE_BND0 == 0
-	.byte 0x66,0x0f,0x1b,0x04,0x24
-#  else
-	.byte 0x66,0x0f,0x1b,0x44,0x24,REGISTER_SAVE_BND0
-#  endif
-	.byte 0x66,0x0f,0x1b,0x4c,0x24,REGISTER_SAVE_BND1
-	.byte 0x66,0x0f,0x1b,0x54,0x24,REGISTER_SAVE_BND2
-	.byte 0x66,0x0f,0x1b,0x5c,0x24,REGISTER_SAVE_BND3
-# endif
-#endif
-	# Copy args pushed by PLT in register.
-	# %rdi: link_map, %rsi: reloc_index
-	movq (REGISTER_SAVE_AREA + 8)(%rsp), %rsi
-	movq REGISTER_SAVE_AREA(%rsp), %rdi
-	call _dl_fixup		# Call resolver.
-	movq %rax, %r11		# Save return value
-#ifndef __ILP32__
-	# Restore bound registers.  These are nops if Intel MPX isn't
-	# avaiable or disabled.
-# ifdef HAVE_MPX_SUPPORT
-	bndmov REGISTER_SAVE_BND3(%rsp), %bnd3
-	bndmov REGISTER_SAVE_BND2(%rsp), %bnd2
-	bndmov REGISTER_SAVE_BND1(%rsp), %bnd1
-	bndmov REGISTER_SAVE_BND0(%rsp), %bnd0
-# else
-	.byte 0x66,0x0f,0x1a,0x5c,0x24,REGISTER_SAVE_BND3
-	.byte 0x66,0x0f,0x1a,0x54,0x24,REGISTER_SAVE_BND2
-	.byte 0x66,0x0f,0x1a,0x4c,0x24,REGISTER_SAVE_BND1
-#  if REGISTER_SAVE_BND0 == 0
-	.byte 0x66,0x0f,0x1a,0x04,0x24
-#  else
-	.byte 0x66,0x0f,0x1a,0x44,0x24,REGISTER_SAVE_BND0
-#  endif
-# endif
+#define VEC_SIZE		64
+#define VMOVA			vmovdqa64
+#if DL_RUNIME_RESOLVE_REALIGN_STACK || VEC_SIZE <= DL_STACK_ALIGNMENT
+# define VMOV			vmovdqa64
+#else
+# define VMOV			vmovdqu64
 #endif
-	# Get register content back.
-	movq REGISTER_SAVE_R9(%rsp), %r9
-	movq REGISTER_SAVE_R8(%rsp), %r8
-	movq REGISTER_SAVE_RDI(%rsp), %rdi
-	movq REGISTER_SAVE_RSI(%rsp), %rsi
-	movq REGISTER_SAVE_RDX(%rsp), %rdx
-	movq REGISTER_SAVE_RCX(%rsp), %rcx
-	movq REGISTER_SAVE_RAX(%rsp), %rax
-	# Adjust stack(PLT did 2 pushes)
-	addq $(REGISTER_SAVE_AREA + 16), %rsp
-	cfi_adjust_cfa_offset(-(REGISTER_SAVE_AREA + 16))
-	# Preserve bound registers.
-	PRESERVE_BND_REGS_PREFIX
-	jmp *%r11		# Jump to function address.
-	cfi_endproc
-	.size _dl_runtime_resolve, .-_dl_runtime_resolve
-
-
-#ifndef PROF
-	.globl _dl_runtime_profile
-	.type _dl_runtime_profile, @function
-	.align 16
-	cfi_startproc
-
-_dl_runtime_profile:
-	cfi_adjust_cfa_offset(16) # Incorporate PLT
-	/* The La_x86_64_regs data structure pointed to by the
-	   fourth paramater must be 16-byte aligned.  This must
-	   be explicitly enforced.  We have the set up a dynamically
-	   sized stack frame.  %rbx points to the top half which
-	   has a fixed size and preserves the original stack pointer.  */
-
-	subq $32, %rsp		# Allocate the local storage.
-	cfi_adjust_cfa_offset(32)
-	movq %rbx, (%rsp)
-	cfi_rel_offset(%rbx, 0)
-
-	/* On the stack:
-		56(%rbx)	parameter #1
-		48(%rbx)	return address
-
-		40(%rbx)	reloc index
-		32(%rbx)	link_map
-
-		24(%rbx)	La_x86_64_regs pointer
-		16(%rbx)	framesize
-		 8(%rbx)	rax
-		  (%rbx)	rbx
-	*/
-
-	movq %rax, 8(%rsp)
-	movq %rsp, %rbx
-	cfi_def_cfa_register(%rbx)
-
-	/* Actively align the La_x86_64_regs structure.  */
-	andq $0xfffffffffffffff0, %rsp
-# if defined HAVE_AVX_SUPPORT || defined HAVE_AVX512_ASM_SUPPORT
-	/* sizeof(La_x86_64_regs).  Need extra space for 8 SSE registers
-	   to detect if any xmm0-xmm7 registers are changed by audit
-	   module.  */
-	subq $(LR_SIZE + XMM_SIZE*8), %rsp
-# else
-	subq $LR_SIZE, %rsp		# sizeof(La_x86_64_regs)
-# endif
-	movq %rsp, 24(%rbx)
-
-	/* Fill the La_x86_64_regs structure.  */
-	movq %rdx, LR_RDX_OFFSET(%rsp)
-	movq %r8,  LR_R8_OFFSET(%rsp)
-	movq %r9,  LR_R9_OFFSET(%rsp)
-	movq %rcx, LR_RCX_OFFSET(%rsp)
-	movq %rsi, LR_RSI_OFFSET(%rsp)
-	movq %rdi, LR_RDI_OFFSET(%rsp)
-	movq %rbp, LR_RBP_OFFSET(%rsp)
-
-	leaq 48(%rbx), %rax
-	movq %rax, LR_RSP_OFFSET(%rsp)
-
-	/* We always store the XMM registers even if AVX is available.
-	   This is to provide backward binary compatibility for existing
-	   audit modules.  */
-	movaps %xmm0,		   (LR_XMM_OFFSET)(%rsp)
-	movaps %xmm1, (LR_XMM_OFFSET +   XMM_SIZE)(%rsp)
-	movaps %xmm2, (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp)
-	movaps %xmm3, (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp)
-	movaps %xmm4, (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp)
-	movaps %xmm5, (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp)
-	movaps %xmm6, (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp)
-	movaps %xmm7, (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp)
-
-# ifndef __ILP32__
-#  ifdef HAVE_MPX_SUPPORT
-	bndmov %bnd0, 		   (LR_BND_OFFSET)(%rsp)  # Preserve bound
-	bndmov %bnd1, (LR_BND_OFFSET +   BND_SIZE)(%rsp)  # registers. Nops if
-	bndmov %bnd2, (LR_BND_OFFSET + BND_SIZE*2)(%rsp)  # MPX not available
-	bndmov %bnd3, (LR_BND_OFFSET + BND_SIZE*3)(%rsp)  # or disabled.
-#  else
-	.byte 0x66,0x0f,0x1b,0x84,0x24;.long (LR_BND_OFFSET)
-	.byte 0x66,0x0f,0x1b,0x8c,0x24;.long (LR_BND_OFFSET + BND_SIZE)
-	.byte 0x66,0x0f,0x1b,0x94,0x24;.long (LR_BND_OFFSET + BND_SIZE*2)
-	.byte 0x66,0x0f,0x1b,0x9c,0x24;.long (LR_BND_OFFSET + BND_SIZE*3)
-#  endif
-# endif
-
-# if defined HAVE_AVX_SUPPORT || defined HAVE_AVX512_ASM_SUPPORT
-	.data
-L(have_avx):
-	.zero 4
-	.size L(have_avx), 4
-	.previous
-
-	cmpl	$0, L(have_avx)(%rip)
-	jne	L(defined)
-	movq	%rbx, %r11		# Save rbx
-	movl	$1, %eax
-	cpuid
-	movq	%r11,%rbx		# Restore rbx
-	xorl	%eax, %eax
-	// AVX and XSAVE supported?
-	andl	$((1 << 28) | (1 << 27)), %ecx
-	cmpl	$((1 << 28) | (1 << 27)), %ecx
-	jne	10f
-#  ifdef HAVE_AVX512_ASM_SUPPORT
-	// AVX512 supported in processor?
-	movq	%rbx, %r11		# Save rbx
-	xorl	%ecx, %ecx
-	mov	$0x7, %eax
-	cpuid
-	andl	$(1 << 16), %ebx
-#  endif
-	xorl	%ecx, %ecx
-	// Get XFEATURE_ENABLED_MASK
-	xgetbv
-#  ifdef HAVE_AVX512_ASM_SUPPORT
-	test	%ebx, %ebx
-	movq	%r11, %rbx		# Restore rbx
-	je	20f
-	// Verify that XCR0[7:5] = '111b' and
-	// XCR0[2:1] = '11b' which means
-	// that zmm state is enabled
-	andl	$0xe6, %eax
-	cmpl	$0xe6, %eax
-	jne	20f
-	movl	%eax, L(have_avx)(%rip)
-L(avx512):
-#   define RESTORE_AVX
-#   define VMOV    vmovdqu64
-#   define VEC(i)  zmm##i
-#   define MORE_CODE
-#   include "dl-trampoline.h"
-#   undef VMOV
-#   undef VEC
-#   undef RESTORE_AVX
-#  endif
-20:	andl	$0x6, %eax
-10:	subl	$0x5, %eax
-	movl	%eax, L(have_avx)(%rip)
-	cmpl	$0, %eax
-
-L(defined):
-	js	L(no_avx)
-#  ifdef HAVE_AVX512_ASM_SUPPORT
-	cmpl	$0xe6, L(have_avx)(%rip)
-	je	L(avx512)
-#  endif
-
-#  define RESTORE_AVX
-#  define VMOV    vmovdqu
-#  define VEC(i)  ymm##i
-#  define MORE_CODE
-#  include "dl-trampoline.h"
-
-	.align 16
-L(no_avx):
-# endif
-
-# undef RESTORE_AVX
-# include "dl-trampoline.h"
-
-	cfi_endproc
-	.size _dl_runtime_profile, .-_dl_runtime_profile
+#define VEC(i)			zmm##i
+#define _dl_runtime_resolve	_dl_runtime_resolve_avx512
+#define _dl_runtime_profile	_dl_runtime_profile_avx512
+#define RESTORE_AVX
+#include "dl-trampoline.h"
+#undef _dl_runtime_resolve
+#undef _dl_runtime_profile
+#undef VEC
+#undef VMOV
+#undef VMOVA
+#undef VEC_SIZE
+
+#define VEC_SIZE		32
+#define VMOVA			vmovdqa
+#if DL_RUNIME_RESOLVE_REALIGN_STACK || VEC_SIZE <= DL_STACK_ALIGNMENT
+# define VMOV			vmovdqa
+#else
+# define VMOV			vmovdqu
 #endif
-
-
-#ifdef SHARED
-	.globl _dl_x86_64_save_sse
-	.type _dl_x86_64_save_sse, @function
-	.align 16
-	cfi_startproc
-_dl_x86_64_save_sse:
-# if defined HAVE_AVX_SUPPORT || defined HAVE_AVX512_ASM_SUPPORT
-	cmpl	$0, L(have_avx)(%rip)
-	jne	L(defined_5)
-	movq	%rbx, %r11		# Save rbx
-	movl	$1, %eax
-	cpuid
-	movq	%r11,%rbx		# Restore rbx
-	xorl	%eax, %eax
-	// AVX and XSAVE supported?
-	andl	$((1 << 28) | (1 << 27)), %ecx
-	cmpl	$((1 << 28) | (1 << 27)), %ecx
-	jne	1f
-#  ifdef HAVE_AVX512_ASM_SUPPORT
-	// AVX512 supported in a processor?
-	movq	%rbx, %r11              # Save rbx
-	xorl	%ecx,%ecx
-	mov	$0x7,%eax
-	cpuid
-	andl	$(1 << 16), %ebx
-#  endif
-	xorl	%ecx, %ecx
-	// Get XFEATURE_ENABLED_MASK
-	xgetbv
-#  ifdef HAVE_AVX512_ASM_SUPPORT
-	test	%ebx, %ebx
-	movq	%r11, %rbx		# Restore rbx
-	je	2f
-	// Verify that XCR0[7:5] = '111b' and
-	// XCR0[2:1] = '11b' which means
-	// that zmm state is enabled
-	andl	$0xe6, %eax
-	movl	%eax, L(have_avx)(%rip)
-	cmpl	$0xe6, %eax
-	je	L(avx512_5)
-#  endif
-
-2:	andl	$0x6, %eax
-1:	subl	$0x5, %eax
-	movl	%eax, L(have_avx)(%rip)
-	cmpl	$0, %eax
-
-L(defined_5):
-	js	L(no_avx5)
-#  ifdef HAVE_AVX512_ASM_SUPPORT
-	cmpl	$0xe6, L(have_avx)(%rip)
-	je	L(avx512_5)
-#  endif
-
-	vmovdqa %ymm0, %fs:RTLD_SAVESPACE_SSE+0*YMM_SIZE
-	vmovdqa %ymm1, %fs:RTLD_SAVESPACE_SSE+1*YMM_SIZE
-	vmovdqa %ymm2, %fs:RTLD_SAVESPACE_SSE+2*YMM_SIZE
-	vmovdqa %ymm3, %fs:RTLD_SAVESPACE_SSE+3*YMM_SIZE
-	vmovdqa %ymm4, %fs:RTLD_SAVESPACE_SSE+4*YMM_SIZE
-	vmovdqa %ymm5, %fs:RTLD_SAVESPACE_SSE+5*YMM_SIZE
-	vmovdqa %ymm6, %fs:RTLD_SAVESPACE_SSE+6*YMM_SIZE
-	vmovdqa %ymm7, %fs:RTLD_SAVESPACE_SSE+7*YMM_SIZE
-	ret
-#  ifdef HAVE_AVX512_ASM_SUPPORT
-L(avx512_5):
-	vmovdqu64 %zmm0, %fs:RTLD_SAVESPACE_SSE+0*ZMM_SIZE
-	vmovdqu64 %zmm1, %fs:RTLD_SAVESPACE_SSE+1*ZMM_SIZE
-	vmovdqu64 %zmm2, %fs:RTLD_SAVESPACE_SSE+2*ZMM_SIZE
-	vmovdqu64 %zmm3, %fs:RTLD_SAVESPACE_SSE+3*ZMM_SIZE
-	vmovdqu64 %zmm4, %fs:RTLD_SAVESPACE_SSE+4*ZMM_SIZE
-	vmovdqu64 %zmm5, %fs:RTLD_SAVESPACE_SSE+5*ZMM_SIZE
-	vmovdqu64 %zmm6, %fs:RTLD_SAVESPACE_SSE+6*ZMM_SIZE
-	vmovdqu64 %zmm7, %fs:RTLD_SAVESPACE_SSE+7*ZMM_SIZE
-	ret
-#  endif
-L(no_avx5):
-# endif
-	movdqa	%xmm0, %fs:RTLD_SAVESPACE_SSE+0*XMM_SIZE
-	movdqa	%xmm1, %fs:RTLD_SAVESPACE_SSE+1*XMM_SIZE
-	movdqa	%xmm2, %fs:RTLD_SAVESPACE_SSE+2*XMM_SIZE
-	movdqa	%xmm3, %fs:RTLD_SAVESPACE_SSE+3*XMM_SIZE
-	movdqa	%xmm4, %fs:RTLD_SAVESPACE_SSE+4*XMM_SIZE
-	movdqa	%xmm5, %fs:RTLD_SAVESPACE_SSE+5*XMM_SIZE
-	movdqa	%xmm6, %fs:RTLD_SAVESPACE_SSE+6*XMM_SIZE
-	movdqa	%xmm7, %fs:RTLD_SAVESPACE_SSE+7*XMM_SIZE
-	ret
-	cfi_endproc
-	.size _dl_x86_64_save_sse, .-_dl_x86_64_save_sse
-
-
-	.globl _dl_x86_64_restore_sse
-	.type _dl_x86_64_restore_sse, @function
-	.align 16
-	cfi_startproc
-_dl_x86_64_restore_sse:
-# if defined HAVE_AVX_SUPPORT || defined HAVE_AVX512_ASM_SUPPORT
-	cmpl	$0, L(have_avx)(%rip)
-	js	L(no_avx6)
-#  ifdef HAVE_AVX512_ASM_SUPPORT
-	cmpl	$0xe6, L(have_avx)(%rip)
-	je	L(avx512_6)
-#  endif
-
-	vmovdqa %fs:RTLD_SAVESPACE_SSE+0*YMM_SIZE, %ymm0
-	vmovdqa %fs:RTLD_SAVESPACE_SSE+1*YMM_SIZE, %ymm1
-	vmovdqa %fs:RTLD_SAVESPACE_SSE+2*YMM_SIZE, %ymm2
-	vmovdqa %fs:RTLD_SAVESPACE_SSE+3*YMM_SIZE, %ymm3
-	vmovdqa %fs:RTLD_SAVESPACE_SSE+4*YMM_SIZE, %ymm4
-	vmovdqa %fs:RTLD_SAVESPACE_SSE+5*YMM_SIZE, %ymm5
-	vmovdqa %fs:RTLD_SAVESPACE_SSE+6*YMM_SIZE, %ymm6
-	vmovdqa %fs:RTLD_SAVESPACE_SSE+7*YMM_SIZE, %ymm7
-	ret
-#  ifdef HAVE_AVX512_ASM_SUPPORT
-L(avx512_6):
-	vmovdqu64 %fs:RTLD_SAVESPACE_SSE+0*ZMM_SIZE, %zmm0
-	vmovdqu64 %fs:RTLD_SAVESPACE_SSE+1*ZMM_SIZE, %zmm1
-	vmovdqu64 %fs:RTLD_SAVESPACE_SSE+2*ZMM_SIZE, %zmm2
-	vmovdqu64 %fs:RTLD_SAVESPACE_SSE+3*ZMM_SIZE, %zmm3
-	vmovdqu64 %fs:RTLD_SAVESPACE_SSE+4*ZMM_SIZE, %zmm4
-	vmovdqu64 %fs:RTLD_SAVESPACE_SSE+5*ZMM_SIZE, %zmm5
-	vmovdqu64 %fs:RTLD_SAVESPACE_SSE+6*ZMM_SIZE, %zmm6
-	vmovdqu64 %fs:RTLD_SAVESPACE_SSE+7*ZMM_SIZE, %zmm7
-	ret
-#  endif
-L(no_avx6):
-# endif
-	movdqa	%fs:RTLD_SAVESPACE_SSE+0*XMM_SIZE, %xmm0
-	movdqa	%fs:RTLD_SAVESPACE_SSE+1*XMM_SIZE, %xmm1
-	movdqa	%fs:RTLD_SAVESPACE_SSE+2*XMM_SIZE, %xmm2
-	movdqa	%fs:RTLD_SAVESPACE_SSE+3*XMM_SIZE, %xmm3
-	movdqa	%fs:RTLD_SAVESPACE_SSE+4*XMM_SIZE, %xmm4
-	movdqa	%fs:RTLD_SAVESPACE_SSE+5*XMM_SIZE, %xmm5
-	movdqa	%fs:RTLD_SAVESPACE_SSE+6*XMM_SIZE, %xmm6
-	movdqa	%fs:RTLD_SAVESPACE_SSE+7*XMM_SIZE, %xmm7
-	ret
-	cfi_endproc
-	.size _dl_x86_64_restore_sse, .-_dl_x86_64_restore_sse
+#define VEC(i)			ymm##i
+#define _dl_runtime_resolve	_dl_runtime_resolve_avx
+#define _dl_runtime_profile	_dl_runtime_profile_avx
+#include "dl-trampoline.h"
+#undef _dl_runtime_resolve
+#undef _dl_runtime_profile
+#undef VEC
+#undef VMOV
+#undef VMOVA
+#undef VEC_SIZE
+
+/* movaps/movups are 1 byte shorter than movdqa/movdqu.  */
+#define VEC_SIZE		16
+#define VMOVA			movaps
+#if DL_RUNIME_RESOLVE_REALIGN_STACK || VEC_SIZE <= DL_STACK_ALIGNMENT
+# define VMOV			movaps
+#else
+# define VMOV			movups
 #endif
+#define VEC(i)			xmm##i
+#define _dl_runtime_resolve	_dl_runtime_resolve_sse
+#define _dl_runtime_profile	_dl_runtime_profile_sse
+#undef RESTORE_AVX
+#include "dl-trampoline.h"
diff --git a/sysdeps/x86_64/dl-trampoline.h b/sysdeps/x86_64/dl-trampoline.h
index d542428..dd6d7c7 100644
--- a/sysdeps/x86_64/dl-trampoline.h
+++ b/sysdeps/x86_64/dl-trampoline.h
@@ -1,5 +1,4 @@
-/* Partial PLT profile trampoline to save and restore x86-64 vector
-   registers.
+/* PLT trampolines.  x86-64 version.
    Copyright (C) 2009-2015 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -17,16 +16,252 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#ifdef RESTORE_AVX
+#undef REGISTER_SAVE_AREA_RAW
+#ifdef __ILP32__
+/* X32 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as VEC0 to
+   VEC7.  */
+# define REGISTER_SAVE_AREA_RAW	(8 * 7 + VEC_SIZE * 8)
+#else
+/* X86-64 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as
+   BND0, BND1, BND2, BND3 and VEC0 to VEC7. */
+# define REGISTER_SAVE_AREA_RAW	(8 * 7 + 16 * 4 + VEC_SIZE * 8)
+#endif
+
+#undef REGISTER_SAVE_AREA
+#undef LOCAL_STORAGE_AREA
+#undef BASE
+#if DL_RUNIME_RESOLVE_REALIGN_STACK
+# define REGISTER_SAVE_AREA	(REGISTER_SAVE_AREA_RAW + 8)
+/* Local stack area before jumping to function address: RBX.  */
+# define LOCAL_STORAGE_AREA	8
+# define BASE			rbx
+# if (REGISTER_SAVE_AREA % VEC_SIZE) != 0
+#  error REGISTER_SAVE_AREA must be a multiple of VEC_SIZE
+# endif
+#else
+# define REGISTER_SAVE_AREA	REGISTER_SAVE_AREA_RAW
+/* Local stack area before jumping to function address:  All saved
+   registers.  */
+# define LOCAL_STORAGE_AREA	REGISTER_SAVE_AREA
+# define BASE			rsp
+# if (REGISTER_SAVE_AREA % 16) != 8
+#  error REGISTER_SAVE_AREA must be an odd multiple of 8
+# endif
+#endif
+
+	.text
+	.globl _dl_runtime_resolve
+	.hidden _dl_runtime_resolve
+	.type _dl_runtime_resolve, @function
+	.align 16
+	cfi_startproc
+_dl_runtime_resolve:
+	cfi_adjust_cfa_offset(16) # Incorporate PLT
+#if DL_RUNIME_RESOLVE_REALIGN_STACK
+# if LOCAL_STORAGE_AREA != 8
+#  error LOCAL_STORAGE_AREA must be 8
+# endif
+	pushq %rbx			# push subtracts 8 from %rsp.
+	cfi_adjust_cfa_offset(8)
+	cfi_rel_offset(%rbx, 0)
+	mov %RSP_LP, %RBX_LP
+	cfi_def_cfa_register(%rbx)
+	and $-VEC_SIZE, %RSP_LP
+#endif
+	sub $REGISTER_SAVE_AREA, %RSP_LP
+	cfi_adjust_cfa_offset(REGISTER_SAVE_AREA)
+	# Preserve registers otherwise clobbered.
+	movq %rax, REGISTER_SAVE_RAX(%rsp)
+	movq %rcx, REGISTER_SAVE_RCX(%rsp)
+	movq %rdx, REGISTER_SAVE_RDX(%rsp)
+	movq %rsi, REGISTER_SAVE_RSI(%rsp)
+	movq %rdi, REGISTER_SAVE_RDI(%rsp)
+	movq %r8, REGISTER_SAVE_R8(%rsp)
+	movq %r9, REGISTER_SAVE_R9(%rsp)
+	VMOV %VEC(0), (REGISTER_SAVE_VEC_OFF)(%rsp)
+	VMOV %VEC(1), (REGISTER_SAVE_VEC_OFF + VEC_SIZE)(%rsp)
+	VMOV %VEC(2), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 2)(%rsp)
+	VMOV %VEC(3), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 3)(%rsp)
+	VMOV %VEC(4), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 4)(%rsp)
+	VMOV %VEC(5), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 5)(%rsp)
+	VMOV %VEC(6), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 6)(%rsp)
+	VMOV %VEC(7), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 7)(%rsp)
+#ifndef __ILP32__
+	# We also have to preserve bound registers.  These are nops if
+	# Intel MPX isn't available or disabled.
+# ifdef HAVE_MPX_SUPPORT
+	bndmov %bnd0, REGISTER_SAVE_BND0(%rsp)
+	bndmov %bnd1, REGISTER_SAVE_BND1(%rsp)
+	bndmov %bnd2, REGISTER_SAVE_BND2(%rsp)
+	bndmov %bnd3, REGISTER_SAVE_BND3(%rsp)
+# else
+#  if REGISTER_SAVE_BND0 == 0
+	.byte 0x66,0x0f,0x1b,0x04,0x24
+#  else
+	.byte 0x66,0x0f,0x1b,0x44,0x24,REGISTER_SAVE_BND0
+#  endif
+	.byte 0x66,0x0f,0x1b,0x4c,0x24,REGISTER_SAVE_BND1
+	.byte 0x66,0x0f,0x1b,0x54,0x24,REGISTER_SAVE_BND2
+	.byte 0x66,0x0f,0x1b,0x5c,0x24,REGISTER_SAVE_BND3
+# endif
+#endif
+	# Copy args pushed by PLT in register.
+	# %rdi: link_map, %rsi: reloc_index
+	mov (LOCAL_STORAGE_AREA + 8)(%BASE), %RSI_LP
+	mov LOCAL_STORAGE_AREA(%BASE), %RDI_LP
+	call _dl_fixup		# Call resolver.
+	mov %RAX_LP, %R11_LP	# Save return value
+#ifndef __ILP32__
+	# Restore bound registers.  These are nops if Intel MPX isn't
+	# available or disabled.
+# ifdef HAVE_MPX_SUPPORT
+	bndmov REGISTER_SAVE_BND3(%rsp), %bnd3
+	bndmov REGISTER_SAVE_BND2(%rsp), %bnd2
+	bndmov REGISTER_SAVE_BND1(%rsp), %bnd1
+	bndmov REGISTER_SAVE_BND0(%rsp), %bnd0
+# else
+	.byte 0x66,0x0f,0x1a,0x5c,0x24,REGISTER_SAVE_BND3
+	.byte 0x66,0x0f,0x1a,0x54,0x24,REGISTER_SAVE_BND2
+	.byte 0x66,0x0f,0x1a,0x4c,0x24,REGISTER_SAVE_BND1
+#  if REGISTER_SAVE_BND0 == 0
+	.byte 0x66,0x0f,0x1a,0x04,0x24
+#  else
+	.byte 0x66,0x0f,0x1a,0x44,0x24,REGISTER_SAVE_BND0
+#  endif
+# endif
+#endif
+	# Get register content back.
+	movq REGISTER_SAVE_R9(%rsp), %r9
+	movq REGISTER_SAVE_R8(%rsp), %r8
+	movq REGISTER_SAVE_RDI(%rsp), %rdi
+	movq REGISTER_SAVE_RSI(%rsp), %rsi
+	movq REGISTER_SAVE_RDX(%rsp), %rdx
+	movq REGISTER_SAVE_RCX(%rsp), %rcx
+	movq REGISTER_SAVE_RAX(%rsp), %rax
+	VMOV (REGISTER_SAVE_VEC_OFF)(%rsp), %VEC(0)
+	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE)(%rsp), %VEC(1)
+	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 2)(%rsp), %VEC(2)
+	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 3)(%rsp), %VEC(3)
+	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 4)(%rsp), %VEC(4)
+	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 5)(%rsp), %VEC(5)
+	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 6)(%rsp), %VEC(6)
+	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 7)(%rsp), %VEC(7)
+#if DL_RUNIME_RESOLVE_REALIGN_STACK
+	mov %RBX_LP, %RSP_LP
+	cfi_def_cfa_register(%rsp)
+	movq (%rsp), %rbx
+	cfi_restore(%rbx)
+#endif
+	# Adjust stack(PLT did 2 pushes)
+	add $(LOCAL_STORAGE_AREA + 16), %RSP_LP
+	cfi_adjust_cfa_offset(-(LOCAL_STORAGE_AREA + 16))
+	# Preserve bound registers.
+	PRESERVE_BND_REGS_PREFIX
+	jmp *%r11		# Jump to function address.
+	cfi_endproc
+	.size _dl_runtime_resolve, .-_dl_runtime_resolve
+
+
+#ifndef PROF
+# if (LR_VECTOR_OFFSET % VEC_SIZE) != 0
+#  error LR_VECTOR_OFFSET must be a multiple of VEC_SIZE
+# endif
+
+	.globl _dl_runtime_profile
+	.hidden _dl_runtime_profile
+	.type _dl_runtime_profile, @function
+	.align 16
+_dl_runtime_profile:
+	cfi_startproc
+	cfi_adjust_cfa_offset(16) # Incorporate PLT
+	/* The La_x86_64_regs data structure pointed to by the
+	   fourth parameter must be VEC_SIZE-byte aligned.  This must
+	   be explicitly enforced.  We have to set up a dynamically
+	   sized stack frame.  %rbx points to the top half which
+	   has a fixed size and preserves the original stack pointer.  */
+
+	sub $32, %RSP_LP	# Allocate the local storage.
+	cfi_adjust_cfa_offset(32)
+	movq %rbx, (%rsp)
+	cfi_rel_offset(%rbx, 0)
+
+	/* On the stack:
+		56(%rbx)	parameter #1
+		48(%rbx)	return address
+
+		40(%rbx)	reloc index
+		32(%rbx)	link_map
+
+		24(%rbx)	La_x86_64_regs pointer
+		16(%rbx)	framesize
+		 8(%rbx)	rax
+		  (%rbx)	rbx
+	*/
+
+	movq %rax, 8(%rsp)
+	mov %RSP_LP, %RBX_LP
+	cfi_def_cfa_register(%rbx)
+
+	/* Actively align the La_x86_64_regs structure.  */
+	and $-VEC_SIZE, %RSP_LP
+# if defined HAVE_AVX_SUPPORT || defined HAVE_AVX512_ASM_SUPPORT
+	/* sizeof(La_x86_64_regs).  Need extra space for 8 SSE registers
+	   to detect if any xmm0-xmm7 registers are changed by audit
+	   module.  */
+	sub $(LR_SIZE + XMM_SIZE*8), %RSP_LP
+# else
+	sub $LR_SIZE, %RSP_LP		# sizeof(La_x86_64_regs)
+# endif
+	movq %rsp, 24(%rbx)
+
+	/* Fill the La_x86_64_regs structure.  */
+	movq %rdx, LR_RDX_OFFSET(%rsp)
+	movq %r8,  LR_R8_OFFSET(%rsp)
+	movq %r9,  LR_R9_OFFSET(%rsp)
+	movq %rcx, LR_RCX_OFFSET(%rsp)
+	movq %rsi, LR_RSI_OFFSET(%rsp)
+	movq %rdi, LR_RDI_OFFSET(%rsp)
+	movq %rbp, LR_RBP_OFFSET(%rsp)
+
+	lea 48(%rbx), %RAX_LP
+	movq %rax, LR_RSP_OFFSET(%rsp)
+
+	/* We always store the XMM registers even if AVX is available.
+	   This is to provide backward binary compatibility for existing
+	   audit modules.  */
+	movaps %xmm0,		   (LR_XMM_OFFSET)(%rsp)
+	movaps %xmm1, (LR_XMM_OFFSET +   XMM_SIZE)(%rsp)
+	movaps %xmm2, (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp)
+	movaps %xmm3, (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp)
+	movaps %xmm4, (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp)
+	movaps %xmm5, (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp)
+	movaps %xmm6, (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp)
+	movaps %xmm7, (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp)
+
+# ifndef __ILP32__
+#  ifdef HAVE_MPX_SUPPORT
+	bndmov %bnd0, 		   (LR_BND_OFFSET)(%rsp)  # Preserve bound
+	bndmov %bnd1, (LR_BND_OFFSET +   BND_SIZE)(%rsp)  # registers. Nops if
+	bndmov %bnd2, (LR_BND_OFFSET + BND_SIZE*2)(%rsp)  # MPX not available
+	bndmov %bnd3, (LR_BND_OFFSET + BND_SIZE*3)(%rsp)  # or disabled.
+#  else
+	.byte 0x66,0x0f,0x1b,0x84,0x24;.long (LR_BND_OFFSET)
+	.byte 0x66,0x0f,0x1b,0x8c,0x24;.long (LR_BND_OFFSET + BND_SIZE)
+	.byte 0x66,0x0f,0x1b,0x94,0x24;.long (LR_BND_OFFSET + BND_SIZE*2)
+	.byte 0x66,0x0f,0x1b,0x9c,0x24;.long (LR_BND_OFFSET + BND_SIZE*3)
+#  endif
+# endif
+
+# ifdef RESTORE_AVX
 	/* This is to support AVX audit modules.  */
-	VMOV %VEC(0),		      (LR_VECTOR_OFFSET)(%rsp)
-	VMOV %VEC(1), (LR_VECTOR_OFFSET +   VECTOR_SIZE)(%rsp)
-	VMOV %VEC(2), (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp)
-	VMOV %VEC(3), (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp)
-	VMOV %VEC(4), (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp)
-	VMOV %VEC(5), (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp)
-	VMOV %VEC(6), (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp)
-	VMOV %VEC(7), (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp)
+	VMOVA %VEC(0),		      (LR_VECTOR_OFFSET)(%rsp)
+	VMOVA %VEC(1), (LR_VECTOR_OFFSET +   VECTOR_SIZE)(%rsp)
+	VMOVA %VEC(2), (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp)
+	VMOVA %VEC(3), (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp)
+	VMOVA %VEC(4), (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp)
+	VMOVA %VEC(5), (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp)
+	VMOVA %VEC(6), (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp)
+	VMOVA %VEC(7), (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp)
 
 	/* Save xmm0-xmm7 registers to detect if any of them are
 	   changed by audit module.  */
@@ -38,7 +273,7 @@
 	vmovdqa %xmm5, (LR_SIZE + XMM_SIZE*5)(%rsp)
 	vmovdqa %xmm6, (LR_SIZE + XMM_SIZE*6)(%rsp)
 	vmovdqa %xmm7, (LR_SIZE + XMM_SIZE*7)(%rsp)
-#endif
+# endif
 
 	mov %RSP_LP, %RCX_LP	# La_x86_64_regs pointer to %rcx.
 	mov 48(%rbx), %RDX_LP	# Load return address if needed.
@@ -63,7 +298,7 @@
 	movaps (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp), %xmm6
 	movaps (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp), %xmm7
 
-#ifdef RESTORE_AVX
+# ifdef RESTORE_AVX
 	/* Check if any xmm0-xmm7 registers are changed by audit
 	   module.  */
 	vpcmpeqq (LR_SIZE)(%rsp), %xmm0, %xmm8
@@ -72,7 +307,7 @@
 	je 2f
 	vmovdqa	%xmm0, (LR_VECTOR_OFFSET)(%rsp)
 	jmp 1f
-2:	VMOV (LR_VECTOR_OFFSET)(%rsp), %VEC(0)
+2:	VMOVA (LR_VECTOR_OFFSET)(%rsp), %VEC(0)
 	vmovdqa	%xmm0, (LR_XMM_OFFSET)(%rsp)
 
 1:	vpcmpeqq (LR_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm8
@@ -81,7 +316,7 @@
 	je 2f
 	vmovdqa	%xmm1, (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp)
 	jmp 1f
-2:	VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp), %VEC(1)
+2:	VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp), %VEC(1)
 	vmovdqa	%xmm1, (LR_XMM_OFFSET + XMM_SIZE)(%rsp)
 
 1:	vpcmpeqq (LR_SIZE + XMM_SIZE*2)(%rsp), %xmm2, %xmm8
@@ -90,7 +325,7 @@
 	je 2f
 	vmovdqa	%xmm2, (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp)
 	jmp 1f
-2:	VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp), %VEC(2)
+2:	VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp), %VEC(2)
 	vmovdqa	%xmm2, (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp)
 
 1:	vpcmpeqq (LR_SIZE + XMM_SIZE*3)(%rsp), %xmm3, %xmm8
@@ -99,7 +334,7 @@
 	je 2f
 	vmovdqa	%xmm3, (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp)
 	jmp 1f
-2:	VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp), %VEC(3)
+2:	VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp), %VEC(3)
 	vmovdqa	%xmm3, (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp)
 
 1:	vpcmpeqq (LR_SIZE + XMM_SIZE*4)(%rsp), %xmm4, %xmm8
@@ -108,7 +343,7 @@
 	je 2f
 	vmovdqa	%xmm4, (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp)
 	jmp 1f
-2:	VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp), %VEC(4)
+2:	VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp), %VEC(4)
 	vmovdqa	%xmm4, (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp)
 
 1:	vpcmpeqq (LR_SIZE + XMM_SIZE*5)(%rsp), %xmm5, %xmm8
@@ -117,7 +352,7 @@
 	je 2f
 	vmovdqa	%xmm5, (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp)
 	jmp 1f
-2:	VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp), %VEC(5)
+2:	VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp), %VEC(5)
 	vmovdqa	%xmm5, (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp)
 
 1:	vpcmpeqq (LR_SIZE + XMM_SIZE*6)(%rsp), %xmm6, %xmm8
@@ -126,7 +361,7 @@
 	je 2f
 	vmovdqa	%xmm6, (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp)
 	jmp 1f
-2:	VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp), %VEC(6)
+2:	VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp), %VEC(6)
 	vmovdqa	%xmm6, (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp)
 
 1:	vpcmpeqq (LR_SIZE + XMM_SIZE*7)(%rsp), %xmm7, %xmm8
@@ -135,25 +370,25 @@
 	je 2f
 	vmovdqa	%xmm7, (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp)
 	jmp 1f
-2:	VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp), %VEC(7)
+2:	VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp), %VEC(7)
 	vmovdqa	%xmm7, (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp)
 
 1:
-#endif
+# endif
 
-#ifndef __ILP32__
-# ifdef HAVE_MPX_SUPPORT
+# ifndef __ILP32__
+#  ifdef HAVE_MPX_SUPPORT
 	bndmov              (LR_BND_OFFSET)(%rsp), %bnd0  # Restore bound
 	bndmov (LR_BND_OFFSET +   BND_SIZE)(%rsp), %bnd1  # registers.
 	bndmov (LR_BND_OFFSET + BND_SIZE*2)(%rsp), %bnd2
 	bndmov (LR_BND_OFFSET + BND_SIZE*3)(%rsp), %bnd3
-# else
+#  else
 	.byte 0x66,0x0f,0x1a,0x84,0x24;.long (LR_BND_OFFSET)
 	.byte 0x66,0x0f,0x1a,0x8c,0x24;.long (LR_BND_OFFSET + BND_SIZE)
 	.byte 0x66,0x0f,0x1a,0x94,0x24;.long (LR_BND_OFFSET + BND_SIZE*2)
 	.byte 0x66,0x0f,0x1a,0x9c,0x24;.long (LR_BND_OFFSET + BND_SIZE*3)
+#  endif
 # endif
-#endif
 
 	mov  16(%rbx), %R10_LP	# Anything in framesize?
 	test %R10_LP, %R10_LP
@@ -168,12 +403,12 @@
 	movq LR_RSI_OFFSET(%rsp), %rsi
 	movq LR_RDI_OFFSET(%rsp), %rdi
 
-	movq %rbx, %rsp
+	mov %RBX_LP, %RSP_LP
 	movq (%rsp), %rbx
-	cfi_restore(rbx)
+	cfi_restore(%rbx)
 	cfi_def_cfa_register(%rsp)
 
-	addq $48, %rsp		# Adjust the stack to the return value
+	add $48, %RSP_LP	# Adjust the stack to the return value
 				# (eats the reloc index and link_map)
 	cfi_adjust_cfa_offset(-48)
 	PRESERVE_BND_REGS_PREFIX
@@ -189,13 +424,13 @@
 	   temporary buffer of the size specified by the 'framesize'
 	   returned from _dl_profile_fixup */
 
-	leaq LR_RSP_OFFSET(%rbx), %rsi	# stack
-	addq $8, %r10
-	andq $0xfffffffffffffff0, %r10
-	movq %r10, %rcx
-	subq %r10, %rsp
-	movq %rsp, %rdi
-	shrq $3, %rcx
+	lea LR_RSP_OFFSET(%rbx), %RSI_LP # stack
+	add $8, %R10_LP
+	and $-16, %R10_LP
+	mov %R10_LP, %RCX_LP
+	sub %R10_LP, %RSP_LP
+	mov %RSP_LP, %RDI_LP
+	shr $3, %RCX_LP
 	rep
 	movsq
 
@@ -206,21 +441,21 @@
 	PRESERVE_BND_REGS_PREFIX
 	call *%r11
 
-	mov 24(%rbx), %rsp	# Drop the copied stack content
+	mov 24(%rbx), %RSP_LP	# Drop the copied stack content
 
 	/* Now we have to prepare the La_x86_64_retval structure for the
 	   _dl_call_pltexit.  The La_x86_64_regs is being pointed by rsp now,
 	   so we just need to allocate the sizeof(La_x86_64_retval) space on
 	   the stack, since the alignment has already been taken care of. */
-#ifdef RESTORE_AVX
+# ifdef RESTORE_AVX
 	/* sizeof(La_x86_64_retval).  Need extra space for 2 SSE
 	   registers to detect if xmm0/xmm1 registers are changed
 	   by audit module.  */
-	subq $(LRV_SIZE + XMM_SIZE*2), %rsp
-#else
-	subq $LRV_SIZE, %rsp	# sizeof(La_x86_64_retval)
-#endif
-	movq %rsp, %rcx		# La_x86_64_retval argument to %rcx.
+	sub $(LRV_SIZE + XMM_SIZE*2), %RSP_LP
+# else
+	sub $LRV_SIZE, %RSP_LP	# sizeof(La_x86_64_retval)
+# endif
+	mov %RSP_LP, %RCX_LP	# La_x86_64_retval argument to %rcx.
 
 	/* Fill in the La_x86_64_retval structure.  */
 	movq %rax, LRV_RAX_OFFSET(%rcx)
@@ -229,26 +464,26 @@
 	movaps %xmm0, LRV_XMM0_OFFSET(%rcx)
 	movaps %xmm1, LRV_XMM1_OFFSET(%rcx)
 
-#ifdef RESTORE_AVX
+# ifdef RESTORE_AVX
 	/* This is to support AVX audit modules.  */
-	VMOV %VEC(0), LRV_VECTOR0_OFFSET(%rcx)
-	VMOV %VEC(1), LRV_VECTOR1_OFFSET(%rcx)
+	VMOVA %VEC(0), LRV_VECTOR0_OFFSET(%rcx)
+	VMOVA %VEC(1), LRV_VECTOR1_OFFSET(%rcx)
 
 	/* Save xmm0/xmm1 registers to detect if they are changed
 	   by audit module.  */
 	vmovdqa %xmm0,		  (LRV_SIZE)(%rcx)
 	vmovdqa %xmm1, (LRV_SIZE + XMM_SIZE)(%rcx)
-#endif
+# endif
 
-#ifndef __ILP32__
-# ifdef HAVE_MPX_SUPPORT
+# ifndef __ILP32__
+#  ifdef HAVE_MPX_SUPPORT
 	bndmov %bnd0, LRV_BND0_OFFSET(%rcx)  # Preserve returned bounds.
 	bndmov %bnd1, LRV_BND1_OFFSET(%rcx)
-# else
+#  else
 	.byte  0x66,0x0f,0x1b,0x81;.long (LRV_BND0_OFFSET)
 	.byte  0x66,0x0f,0x1b,0x89;.long (LRV_BND1_OFFSET)
+#  endif
 # endif
-#endif
 
 	fstpt LRV_ST0_OFFSET(%rcx)
 	fstpt LRV_ST1_OFFSET(%rcx)
@@ -265,50 +500,47 @@
 	movaps LRV_XMM0_OFFSET(%rsp), %xmm0
 	movaps LRV_XMM1_OFFSET(%rsp), %xmm1
 
-#ifdef RESTORE_AVX
+# ifdef RESTORE_AVX
 	/* Check if xmm0/xmm1 registers are changed by audit module.  */
 	vpcmpeqq (LRV_SIZE)(%rsp), %xmm0, %xmm2
 	vpmovmskb %xmm2, %esi
 	cmpl $0xffff, %esi
 	jne 1f
-	VMOV LRV_VECTOR0_OFFSET(%rsp), %VEC(0)
+	VMOVA LRV_VECTOR0_OFFSET(%rsp), %VEC(0)
 
 1:	vpcmpeqq (LRV_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm2
 	vpmovmskb %xmm2, %esi
 	cmpl $0xffff, %esi
 	jne 1f
-	VMOV LRV_VECTOR1_OFFSET(%rsp), %VEC(1)
+	VMOVA LRV_VECTOR1_OFFSET(%rsp), %VEC(1)
 
 1:
-#endif
+# endif
 
-#ifndef __ILP32__
-# ifdef HAVE_MPX_SUPPORT
+# ifndef __ILP32__
+#  ifdef HAVE_MPX_SUPPORT
 	bndmov LRV_BND0_OFFSET(%rsp), %bnd0  # Restore bound registers.
 	bndmov LRV_BND1_OFFSET(%rsp), %bnd1
-# else
+#  else
 	.byte  0x66,0x0f,0x1a,0x84,0x24;.long (LRV_BND0_OFFSET)
 	.byte  0x66,0x0f,0x1a,0x8c,0x24;.long (LRV_BND1_OFFSET)
+#  endif
 # endif
-#endif
 
 	fldt LRV_ST1_OFFSET(%rsp)
 	fldt LRV_ST0_OFFSET(%rsp)
 
-	movq %rbx, %rsp
+	mov %RBX_LP, %RSP_LP
 	movq (%rsp), %rbx
-	cfi_restore(rbx)
+	cfi_restore(%rbx)
 	cfi_def_cfa_register(%rsp)
 
-	addq $48, %rsp		# Adjust the stack to the return value
+	add $48, %RSP_LP	# Adjust the stack to the return value
 				# (eats the reloc index and link_map)
 	cfi_adjust_cfa_offset(-48)
 	PRESERVE_BND_REGS_PREFIX
 	retq
 
-#ifdef MORE_CODE
-	cfi_adjust_cfa_offset(48)
-	cfi_rel_offset(%rbx, 0)
-	cfi_def_cfa_register(%rbx)
-# undef MORE_CODE
+	cfi_endproc
+	.size _dl_runtime_profile, .-_dl_runtime_profile
 #endif
diff --git a/sysdeps/x86_64/ifuncmain8.c b/sysdeps/x86_64/ifuncmain8.c
new file mode 100644
index 0000000..f75771d
--- /dev/null
+++ b/sysdeps/x86_64/ifuncmain8.c
@@ -0,0 +1,32 @@
+/* Test IFUNC selector with floating-point parameters.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <stdlib.h>
+
+extern float foo (float);
+
+static int
+do_test (void)
+{
+  if (foo (2) != 3)
+    abort ();
+  return 0;
+}
+
+#define TEST_FUNCTION do_test ()
+#include "../test-skeleton.c"
diff --git a/sysdeps/x86_64/ifuncmod8.c b/sysdeps/x86_64/ifuncmod8.c
new file mode 100644
index 0000000..741aa13
--- /dev/null
+++ b/sysdeps/x86_64/ifuncmod8.c
@@ -0,0 +1,36 @@
+/* Test IFUNC selector with floating-point parameters.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <emmintrin.h>
+
+void * foo_ifunc (void) __asm__ ("foo");
+__asm__(".type foo, %gnu_indirect_function");
+
+static float
+foo_impl (float x)
+{
+  return x + 1;
+}
+
+void *
+foo_ifunc (void)
+{
+  __m128i xmm = _mm_set1_epi32 (-1);
+  asm volatile ("movdqa %0, %%xmm0" : : "x" (xmm) : "xmm0" );
+  return foo_impl;
+}
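
Together the two new files exercise exactly the path this commit fixes: foo is bound lazily, so the first call to foo (2) enters the resolve trampoline with the float argument live in %xmm0, and the IFUNC selector deliberately dirties %xmm0 on the way to picking foo_impl.  Unless the trampoline saves and restores the vector registers, the argument is corrupted and the foo (2) != 3 check aborts.
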
diff --git a/sysdeps/x86_64/nptl/tcb-offsets.sym b/sysdeps/x86_64/nptl/tcb-offsets.sym
index 729d1da..aeb7526 100644
--- a/sysdeps/x86_64/nptl/tcb-offsets.sym
+++ b/sysdeps/x86_64/nptl/tcb-offsets.sym
@@ -16,7 +16,6 @@ VGETCPU_CACHE_OFFSET	offsetof (tcbhead_t, vgetcpu_cache)
 #ifndef __ASSUME_PRIVATE_FUTEX
 PRIVATE_FUTEX		offsetof (tcbhead_t, private_futex)
 #endif
-RTLD_SAVESPACE_SSE	offsetof (tcbhead_t, rtld_savespace_sse)
 
 -- Not strictly offsets, but these values are also used in the TCB.
 TCB_CANCELSTATE_BITMASK	 CANCELSTATE_BITMASK
diff --git a/sysdeps/x86_64/nptl/tls.h b/sysdeps/x86_64/nptl/tls.h
index b947d57..457c7dd 100644
--- a/sysdeps/x86_64/nptl/tls.h
+++ b/sysdeps/x86_64/nptl/tls.h
@@ -67,14 +67,15 @@ typedef struct
 # else
   int __glibc_reserved1;
 # endif
-  int rtld_must_xmm_save;
+  int __glibc_unused1;
   /* Reservation of some values for the TM ABI.  */
   void *__private_tm[4];
   /* GCC split stack support.  */
   void *__private_ss;
   long int __glibc_reserved2;
-  /* Have space for the post-AVX register size.  */
-  __128bits rtld_savespace_sse[8][4] __attribute__ ((aligned (32)));
+  /* Must be kept even if it is no longer used by glibc, since programs
+     like AddressSanitizer depend on the size of tcbhead_t.  */
+  __128bits __glibc_unused2[8][4] __attribute__ ((aligned (32)));
 
   void *__padding[8];
 } tcbhead_t;
@@ -384,41 +385,6 @@ typedef struct
 # define THREAD_GSCOPE_WAIT() \
   GL(dl_wait_lookup_done) ()
 
-
-# ifdef SHARED
-/* Defined in dl-trampoline.S.  */
-extern void _dl_x86_64_save_sse (void);
-extern void _dl_x86_64_restore_sse (void);
-
-# define RTLD_CHECK_FOREIGN_CALL \
-  (THREAD_GETMEM (THREAD_SELF, header.rtld_must_xmm_save) != 0)
-
-/* NB: Don't use the xchg operation because that would imply a lock
-   prefix which is expensive and unnecessary.  The cache line is also
-   not contested at all.  */
-#  define RTLD_ENABLE_FOREIGN_CALL \
-  int old_rtld_must_xmm_save = THREAD_GETMEM (THREAD_SELF,		      \
-					      header.rtld_must_xmm_save);     \
-  THREAD_SETMEM (THREAD_SELF, header.rtld_must_xmm_save, 1)
-
-#  define RTLD_PREPARE_FOREIGN_CALL \
-  do if (THREAD_GETMEM (THREAD_SELF, header.rtld_must_xmm_save))	      \
-    {									      \
-      _dl_x86_64_save_sse ();						      \
-      THREAD_SETMEM (THREAD_SELF, header.rtld_must_xmm_save, 0);	      \
-    }									      \
-  while (0)
-
-#  define RTLD_FINALIZE_FOREIGN_CALL \
-  do {									      \
-    if (THREAD_GETMEM (THREAD_SELF, header.rtld_must_xmm_save) == 0)	      \
-      _dl_x86_64_restore_sse ();					      \
-    THREAD_SETMEM (THREAD_SELF, header.rtld_must_xmm_save,		      \
-		   old_rtld_must_xmm_save);				      \
-  } while (0)
-# endif
-
-
 #endif /* __ASSEMBLER__ */
 
 #endif	/* tls.h */

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=b4bf7d64c8f7597cc03abd608b50064f7cca4f40

commit b4bf7d64c8f7597cc03abd608b50064f7cca4f40
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Sun Jul 12 14:41:20 2015 -0700

    Align stack when calling __errno_location
    
    We should align the stack to 16 bytes when calling __errno_location.
    
    	[BZ #18661]
    	* sysdeps/x86_64/fpu/s_cosf.S (__cosf): Align stack to 16 bytes
    	when calling __errno_location.
    	* sysdeps/x86_64/fpu/s_sincosf.S (__sincosf): Likewise.
    	* sysdeps/x86_64/fpu/s_sinf.S (__sinf): Likewise.
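
The ABI rule behind all three fixes: %rsp must be a multiple of 16 at every call instruction, and the call that entered these routines pushed an 8-byte return address, so on entry %rsp % 16 == 8.  One extra 8-byte adjustment restores the alignment that a compiled callee such as __errno_location may rely on.  A small C sketch of the guarantee on the callee side (illustration only):

  #include <assert.h>
  #include <stdint.h>

  /* With a conforming caller, a 16-byte-aligned stack object really
     is aligned, so compiler-generated SSE code (movaps and friends)
     can touch it without faulting.  */
  static void
  callee (void)
  {
    _Alignas (16) char buf[16];
    assert (((uintptr_t) buf & 15) == 0);
  }

  int
  main (void)
  {
    callee ();
    return 0;
  }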

diff --git a/sysdeps/x86_64/fpu/s_cosf.S b/sysdeps/x86_64/fpu/s_cosf.S
index b7868ce..bea10ef 100644
--- a/sysdeps/x86_64/fpu/s_cosf.S
+++ b/sysdeps/x86_64/fpu/s_cosf.S
@@ -310,8 +310,14 @@ L(arg_inf_or_nan):
 	/* Here if |x| is Inf or NAN */
 	jne	L(skip_errno_setting)	/* in case of x is NaN */
 
+	/* Align stack to 16 bytes.  */
+	subq	$8, %rsp
+	cfi_adjust_cfa_offset (8)
 	/* Here if x is Inf. Set errno to EDOM.  */
 	call	JUMPTARGET(__errno_location)
+	addq	$8, %rsp
+	cfi_adjust_cfa_offset (-8)
+
 	movl	$EDOM, (%rax)
 
 	.p2align	4
diff --git a/sysdeps/x86_64/fpu/s_sincosf.S b/sysdeps/x86_64/fpu/s_sincosf.S
index 21db70a..a2f3133 100644
--- a/sysdeps/x86_64/fpu/s_sincosf.S
+++ b/sysdeps/x86_64/fpu/s_sincosf.S
@@ -354,8 +354,14 @@ L(arg_inf_or_nan):
 	/* Here if |x| is Inf or NAN */
 	jne	L(skip_errno_setting)	/* in case of x is NaN */
 
+	/* Align stack to 16 bytes.  */
+	subq	$8, %rsp
+	cfi_adjust_cfa_offset (8)
 	/* Here if x is Inf. Set errno to EDOM.  */
 	call	JUMPTARGET(__errno_location)
+	addq	$8, %rsp
+	cfi_adjust_cfa_offset (-8)
+
 	movl	$EDOM, (%rax)
 
 	.p2align	4
diff --git a/sysdeps/x86_64/fpu/s_sinf.S b/sysdeps/x86_64/fpu/s_sinf.S
index dc92164..90afbe8 100644
--- a/sysdeps/x86_64/fpu/s_sinf.S
+++ b/sysdeps/x86_64/fpu/s_sinf.S
@@ -336,8 +336,14 @@ L(arg_inf_or_nan):
 	/* Here if |x| is Inf or NAN */
 	jne	L(skip_errno_setting)	/* in case of x is NaN */
 
+	/* Align stack to 16 bytes.  */
+	subq	$8, %rsp
+	cfi_adjust_cfa_offset (8)
 	/* Here if x is Inf. Set errno to EDOM.  */
 	call	JUMPTARGET(__errno_location)
+	addq	$8, %rsp
+	cfi_adjust_cfa_offset (-8)
+
 	movl	$EDOM, (%rax)
 
 	.p2align	4

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=1c8f0fb6a4ebef1080e254753e6a4e6c07a66a4f

commit 1c8f0fb6a4ebef1080e254753e6a4e6c07a66a4f
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Sun Jul 12 14:40:25 2015 -0700

    Align stack to 16 bytes when calling __gettimeofday
    
    Subtract 24 bytes from the stack instead of 16 so that the stack is
    aligned to 16 bytes when calling __gettimeofday.
    
    	[BZ #18661]
    	* sysdeps/unix/sysv/linux/x86_64/lowlevellock.S
    	(__lll_timedwait_tid): Align stack to 16 bytes when calling
    	__gettimeofday.
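
Why 24 rather than 16: the function's two earlier pushes (matched by the popq sequence at the end of the hunk) leave %rsp % 16 == 8, and only an odd multiple of 8 brings it back to 0.  The arithmetic, checked in C (illustration only):

  #include <stdio.h>

  int
  main (void)
  {
    int rsp = 8;            /* %rsp mod 16 at function entry */
    rsp = (rsp - 8) & 15;   /* first pushq  */
    rsp = (rsp - 8) & 15;   /* second pushq */
    printf ("subq $16 -> %d\n", (rsp - 16) & 15); /* 8: misaligned */
    printf ("subq $24 -> %d\n", (rsp - 24) & 15); /* 0: aligned    */
    return 0;
  }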

diff --git a/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S b/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S
index 0935db5..8e1a39d 100644
--- a/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S
+++ b/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S
@@ -394,8 +394,9 @@ __lll_timedwait_tid:
 	movq	%rdi, %r12
 	movq	%rsi, %r13
 
-	subq	$16, %rsp
-	cfi_adjust_cfa_offset(16)
+	/* Align stack to 16 bytes when calling __gettimeofday.  */
+	subq	$24, %rsp
+	cfi_adjust_cfa_offset(24)
 
 	/* Get current time.  */
 2:	movq	%rsp, %rdi
@@ -441,8 +442,8 @@ __lll_timedwait_tid:
 	jne	1f
 4:	xorl	%eax, %eax
 
-8:	addq	$16, %rsp
-	cfi_adjust_cfa_offset(-16)
+8:	addq	$24, %rsp
+	cfi_adjust_cfa_offset(-24)
 	popq	%r13
 	cfi_adjust_cfa_offset(-8)
 	cfi_restore(%r13)

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=08094d48c130d3ae6182296ef3c9561ca1066b39

commit 08094d48c130d3ae6182296ef3c9561ca1066b39
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Sun Jul 12 14:38:58 2015 -0700

    Align stack to 16 bytes when calling __setcontext
    
    Don't use pop to restore %rdi, so that the stack is aligned to 16
    bytes when calling __setcontext.
    
    	[BZ #18661]
    	* sysdeps/unix/sysv/linux/x86_64/__start_context.S
	(__start_context): Don't use pop to restore %rdi so that the
	stack is aligned to 16 bytes when calling __setcontext.
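
The same invariant drives this fix: popq would move %rsp by 8 and leave it misaligned at the call, while a plain load keeps it where it is.  In C terms (a sketch, not glibc code):

  #include <stdint.h>

  static uint64_t
  peek (uint64_t **sp)      /* movq (%rsp), %rdi: %rsp unchanged */
  {
    return **sp;
  }

  static uint64_t
  pop (uint64_t **sp)       /* popq %rdi: %rsp moves up by 8 */
  {
    return *(*sp)++;
  }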

diff --git a/sysdeps/unix/sysv/linux/x86_64/__start_context.S b/sysdeps/unix/sysv/linux/x86_64/__start_context.S
index 52a5afa..96366e0 100644
--- a/sysdeps/unix/sysv/linux/x86_64/__start_context.S
+++ b/sysdeps/unix/sysv/linux/x86_64/__start_context.S
@@ -31,8 +31,8 @@ ENTRY(__start_context)
 	   on the stack pointer for the next context.  */
 	movq	%rbx, %rsp
 
-	popq	%rdi			/* This is the next context.  */
-	cfi_adjust_cfa_offset(-8)
+	/* Don't use pop here so that the stack stays aligned to 16 bytes.  */
+	movq	(%rsp), %rdi		/* This is the next context.  */
 	testq	%rdi, %rdi
 	je	2f			/* If it is zero exit.  */
 

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=723a5bd465082390bd9369d4f95ec11b32cf47dd

commit 723a5bd465082390bd9369d4f95ec11b32cf47dd
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Wed Jul 29 03:41:58 2015 -0700

    Compile {memcpy,strcmp}-sse2-unaligned.S only for libc
    
    {memcpy,strcmp}-sse2-unaligned.S aren't needed in ld.so.
    
    	* sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S: Compile
    	only for libc.
    	* sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S: Likewise.

diff --git a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
index c5450af..5693ba7 100644
--- a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
@@ -16,6 +16,8 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
+#if IS_IN (libc)
+
 #include <sysdep.h>
 
 #include "asm-syntax.h"
@@ -169,3 +171,5 @@ L(between_5_8):
 	movl	%eax, -4(%rdi,%rdx)
 	jmp	L(return)
 END(__memcpy_sse2_unaligned)
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
index 20b65fa..c6606b4 100644
--- a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
@@ -16,6 +16,8 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
+#if IS_IN (libc)
+
 #include "sysdep.h"
 
 ENTRY ( __strcmp_sse2_unaligned)
@@ -207,3 +209,5 @@ L(different):
 	subl	%ecx, %eax
 	ret
 END (__strcmp_sse2_unaligned)
+
+#endif

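An IS_IN (libc) guard around a whole file means the same source is still
compiled for every build target (libc, ld.so, and so on) but produces an
empty object everywhere except libc. A reduced sketch of the mechanism,
with mock definitions standing in for glibc's internal headers and the
per-target -D flags its build system passes:

    #include <stdio.h>

    /* Mock module machinery; glibc's real IS_IN is driven by internal
       headers and per-target compiler flags, simplified here.  */
    #define MODULE_libc 1
    #define MODULE_rtld 2
    #define IN_MODULE MODULE_libc   /* pretend this is the libc build */
    #define IS_IN(lib) (IN_MODULE == MODULE_##lib)

    #if IS_IN (libc)
    static const char *variant = "libc build: optimized routine compiled in";
    #else
    static const char *variant = "other builds: file compiles to nothing";
    #endif

    int
    main (void)
    {
      puts (variant);
      return 0;
    }
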
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=3d497ce3d1ab11a363c543b42737a0696329ed40

commit 3d497ce3d1ab11a363c543b42737a0696329ed40
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Sat Aug 1 07:47:16 2015 -0700

    Update x86 elision-conf.c for <cpu-features.h>
    
    This patch updates x86 elision-conf.c to use the newly defined
    HAS_CPU_FEATURE from <cpu-features.h>.
    
    	* sysdeps/unix/sysv/linux/x86/elision-conf.c (elision_init):
    	Replace HAS_RTM with HAS_CPU_FEATURE (RTM).

diff --git a/sysdeps/unix/sysv/linux/x86/elision-conf.c b/sysdeps/unix/sysv/linux/x86/elision-conf.c
index 84902ac..4a73382 100644
--- a/sysdeps/unix/sysv/linux/x86/elision-conf.c
+++ b/sysdeps/unix/sysv/linux/x86/elision-conf.c
@@ -62,11 +62,11 @@ elision_init (int argc __attribute__ ((unused)),
 	      char **argv  __attribute__ ((unused)),
 	      char **environ)
 {
-  __elision_available = HAS_RTM;
+  __elision_available = HAS_CPU_FEATURE (RTM);
 #ifdef ENABLE_LOCK_ELISION
   __pthread_force_elision = __libc_enable_secure ? 0 : __elision_available;
 #endif
-  if (!HAS_RTM)
+  if (!HAS_CPU_FEATURE (RTM))
     __elision_aconf.retry_try_xbegin = 0; /* Disable elision on rwlocks */
 }
 

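HAS_CPU_FEATURE (RTM) reads a bit that glibc cached from CPUID during
startup. A stand-alone approximation of the same test, querying CPUID
directly (leaf 7, subleaf 0, EBX bit 11 is RTM per the Intel SDM; GCC's
<cpuid.h> helpers are assumed):

    #include <cpuid.h>
    #include <stdio.h>

    int
    main (void)
    {
      unsigned int eax, ebx, ecx, edx;

      /* Check that leaf 7 exists before querying it.  */
      if (__get_cpuid (0, &eax, &ebx, &ecx, &edx) && eax >= 7)
        {
          __cpuid_count (7, 0, eax, ebx, ecx, edx);
          printf ("RTM lock elision %savailable\n",
                  (ebx & (1u << 11)) ? "" : "un");
        }
      return 0;
    }
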
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=3dbbb788b8b59f0c88ac8eeae341f491dd739949

commit 3dbbb788b8b59f0c88ac8eeae341f491dd739949
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Fri Jul 31 16:52:19 2015 -0700

    Update i686 multiarch functions for <cpu-features.h>
    
    This patch updates i686 multiarch functions to use the newly defined
    HAS_CPU_FEATURE, HAS_ARCH_FEATURE, LOAD_GOT_AND_RTLD_GLOBAL_RO and
    LOAD_FUNC_GOT_EAX from <cpu-features.h>.
    
    	* sysdeps/i386/i686/fpu/multiarch/e_expf.c: Replace HAS_XXX
    	with HAS_CPU_FEATURE/HAS_ARCH_FEATURE (XXX).
    	* sysdeps/i386/i686/fpu/multiarch/s_cosf.c: Likewise.
    	* sysdeps/i386/i686/fpu/multiarch/s_sincosf.c: Likewise.
    	* sysdeps/i386/i686/fpu/multiarch/s_sinf.c: Likewise.
    	* sysdeps/i386/i686/multiarch/ifunc-impl-list.c: Likewise.
    	* sysdeps/i386/i686/multiarch/s_fma.c: Likewise.
    	* sysdeps/i386/i686/multiarch/s_fmaf.c: Likewise.
    	* sysdeps/i386/i686/multiarch/bcopy.S: Remove __init_cpu_features
    	call.  Merge SHARED and !SHARED.  Add LOAD_GOT_AND_RTLD_GLOBAL_RO.
    	Use LOAD_FUNC_GOT_EAX to load function address.  Replace HAS_XXX
    	with HAS_CPU_FEATURE/HAS_ARCH_FEATURE (XXX).
    	* sysdeps/i386/i686/multiarch/bzero.S: Likewise.
    	* sysdeps/i386/i686/multiarch/memchr.S: Likewise.
    	* sysdeps/i386/i686/multiarch/memcmp.S: Likewise.
    	* sysdeps/i386/i686/multiarch/memcpy.S: Likewise.
    	* sysdeps/i386/i686/multiarch/memcpy_chk.S: Likewise.
    	* sysdeps/i386/i686/multiarch/memmove.S: Likewise.
    	* sysdeps/i386/i686/multiarch/memmove_chk.S: Likewise.
    	* sysdeps/i386/i686/multiarch/mempcpy.S: Likewise.
    	* sysdeps/i386/i686/multiarch/mempcpy_chk.S: Likewise.
    	* sysdeps/i386/i686/multiarch/memrchr.S: Likewise.
    	* sysdeps/i386/i686/multiarch/memset.S: Likewise.
    	* sysdeps/i386/i686/multiarch/memset_chk.S: Likewise.
    	* sysdeps/i386/i686/multiarch/rawmemchr.S: Likewise.
    	* sysdeps/i386/i686/multiarch/strcasecmp.S: Likewise.
    	* sysdeps/i386/i686/multiarch/strcat.S: Likewise.
    	* sysdeps/i386/i686/multiarch/strchr.S: Likewise.
    	* sysdeps/i386/i686/multiarch/strcmp.S: Likewise.
    	* sysdeps/i386/i686/multiarch/strcpy.S: Likewise.
    	* sysdeps/i386/i686/multiarch/strcspn.S: Likewise.
    	* sysdeps/i386/i686/multiarch/strlen.S: Likewise.
    	* sysdeps/i386/i686/multiarch/strncase.S: Likewise.
    	* sysdeps/i386/i686/multiarch/strnlen.S: Likewise.
    	* sysdeps/i386/i686/multiarch/strrchr.S: Likewise.
    	* sysdeps/i386/i686/multiarch/strspn.S: Likewise.
    	* sysdeps/i386/i686/multiarch/wcschr.S: Likewise.
    	* sysdeps/i386/i686/multiarch/wcscmp.S: Likewise.
    	* sysdeps/i386/i686/multiarch/wcscpy.S: Likewise.
    	* sysdeps/i386/i686/multiarch/wcslen.S: Likewise.
    	* sysdeps/i386/i686/multiarch/wcsrchr.S: Likewise.
    	* sysdeps/i386/i686/multiarch/wmemcmp.S: Likewise.

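Each assembly block below is a GNU indirect function (ifunc) resolver: the
dynamic linker calls it once, at relocation time, and binds the symbol to
whatever implementation address comes back in %eax. The same selection
written at C level, with a mock feature test standing in for
HAS_CPU_FEATURE (illustration only; GCC's ifunc attribute on ELF targets
is assumed):

    #include <stdio.h>

    static int add_ia32 (int a, int b) { return a + b; }
    static int add_sse2 (int a, int b) { return a + b; }

    /* Stand-in for HAS_CPU_FEATURE (SSE2); the real macro tests a bit
       in the cpu feature data that ld.so filled in at startup.  */
    static int
    has_sse2 (void)
    {
      return 1;
    }

    /* The resolver: runs during relocation and returns the chosen
       implementation, which is what LOAD_FUNC_GOT_EAX plus the feature
       tests accomplish in the assembly versions below.  */
    static void *
    resolve_add (void)
    {
      return has_sse2 () ? (void *) add_sse2 : (void *) add_ia32;
    }

    int add (int a, int b) __attribute__ ((ifunc ("resolve_add")));

    int
    main (void)
    {
      printf ("%d\n", add (2, 3));   /* dispatched through the ifunc */
      return 0;
    }
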
diff --git a/sysdeps/i386/i686/fpu/multiarch/e_expf.c b/sysdeps/i386/i686/fpu/multiarch/e_expf.c
index 5102dae..6978883 100644
--- a/sysdeps/i386/i686/fpu/multiarch/e_expf.c
+++ b/sysdeps/i386/i686/fpu/multiarch/e_expf.c
@@ -23,11 +23,15 @@ extern double __ieee754_expf_ia32 (double);
 
 double __ieee754_expf (double);
 libm_ifunc (__ieee754_expf,
-	    HAS_SSE2 ? __ieee754_expf_sse2 : __ieee754_expf_ia32);
+	    HAS_CPU_FEATURE (SSE2)
+	    ? __ieee754_expf_sse2
+	    : __ieee754_expf_ia32);
 
 extern double __expf_finite_sse2 (double);
 extern double __expf_finite_ia32 (double);
 
 double __expf_finite (double);
 libm_ifunc (__expf_finite,
-	    HAS_SSE2 ? __expf_finite_sse2 : __expf_finite_ia32);
+	    HAS_CPU_FEATURE (SSE2)
+	    ? __expf_finite_sse2
+	    : __expf_finite_ia32);
diff --git a/sysdeps/i386/i686/fpu/multiarch/s_cosf.c b/sysdeps/i386/i686/fpu/multiarch/s_cosf.c
index 0799dca..e32b2f4 100644
--- a/sysdeps/i386/i686/fpu/multiarch/s_cosf.c
+++ b/sysdeps/i386/i686/fpu/multiarch/s_cosf.c
@@ -22,7 +22,7 @@ extern float __cosf_sse2 (float);
 extern float __cosf_ia32 (float);
 float __cosf (float);
 
-libm_ifunc (__cosf, HAS_SSE2 ? __cosf_sse2 : __cosf_ia32);
+libm_ifunc (__cosf, HAS_CPU_FEATURE (SSE2) ? __cosf_sse2 : __cosf_ia32);
 weak_alias (__cosf, cosf);
 
 #define COSF __cosf_ia32
diff --git a/sysdeps/i386/i686/fpu/multiarch/s_sincosf.c b/sysdeps/i386/i686/fpu/multiarch/s_sincosf.c
index 384d844..0d827e0 100644
--- a/sysdeps/i386/i686/fpu/multiarch/s_sincosf.c
+++ b/sysdeps/i386/i686/fpu/multiarch/s_sincosf.c
@@ -22,7 +22,8 @@ extern void __sincosf_sse2 (float, float *, float *);
 extern void __sincosf_ia32 (float, float *, float *);
 void __sincosf (float, float *, float *);
 
-libm_ifunc (__sincosf, HAS_SSE2 ? __sincosf_sse2 : __sincosf_ia32);
+libm_ifunc (__sincosf,
+	    HAS_CPU_FEATURE (SSE2) ? __sincosf_sse2 : __sincosf_ia32);
 weak_alias (__sincosf, sincosf);
 
 #define SINCOSF __sincosf_ia32
diff --git a/sysdeps/i386/i686/fpu/multiarch/s_sinf.c b/sysdeps/i386/i686/fpu/multiarch/s_sinf.c
index 6b62772..18afaa2 100644
--- a/sysdeps/i386/i686/fpu/multiarch/s_sinf.c
+++ b/sysdeps/i386/i686/fpu/multiarch/s_sinf.c
@@ -22,7 +22,7 @@ extern float __sinf_sse2 (float);
 extern float __sinf_ia32 (float);
 float __sinf (float);
 
-libm_ifunc (__sinf, HAS_SSE2 ? __sinf_sse2 : __sinf_ia32);
+libm_ifunc (__sinf, HAS_CPU_FEATURE (SSE2) ? __sinf_sse2 : __sinf_ia32);
 weak_alias (__sinf, sinf);
 #define SINF __sinf_ia32
 #include <sysdeps/ieee754/flt-32/s_sinf.c>
diff --git a/sysdeps/i386/i686/multiarch/bcopy.S b/sysdeps/i386/i686/multiarch/bcopy.S
index e767d97..3fc95dc 100644
--- a/sysdeps/i386/i686/multiarch/bcopy.S
+++ b/sysdeps/i386/i686/multiarch/bcopy.S
@@ -23,51 +23,24 @@
 
 /* Define multiple versions only for the definition in lib.  */
 #if IS_IN (libc)
-# ifdef SHARED
 	.text
 ENTRY(bcopy)
 	.type	bcopy, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	LOAD_PIC_REG(bx)
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__bcopy_ia32@GOTOFF(%ebx), %eax
-	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__bcopy_ia32)
+	HAS_CPU_FEATURE (SSE2)
 	jz	2f
-	leal	__bcopy_sse2_unaligned@GOTOFF(%ebx), %eax
-	testl	$bit_Fast_Unaligned_Load, FEATURE_OFFSET+index_Fast_Unaligned_Load+__cpu_features@GOTOFF(%ebx)
+	LOAD_FUNC_GOT_EAX (__bcopy_sse2_unaligned)
+	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
 	jnz	2f
-	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
+	HAS_CPU_FEATURE (SSSE3)
 	jz	2f
-	leal	__bcopy_ssse3@GOTOFF(%ebx), %eax
-	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx)
+	LOAD_FUNC_GOT_EAX (__bcopy_ssse3)
+	HAS_CPU_FEATURE (Fast_Rep_String)
 	jz	2f
-	leal	__bcopy_ssse3_rep@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4)
-	cfi_restore (ebx)
-	ret
-END(bcopy)
-# else
-	.text
-ENTRY(bcopy)
-	.type	bcopy, @gnu_indirect_function
-	cmpl	$0, KIND_OFFSET+__cpu_features
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__bcopy_ia32, %eax
-	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features
-	jz	2f
-	leal	__bcopy_ssse3, %eax
-	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features
-	jz	2f
-	leal	__bcopy_ssse3_rep, %eax
+	LOAD_FUNC_GOT_EAX (__bcopy_ssse3_rep)
 2:	ret
 END(bcopy)
-# endif
 
 # undef ENTRY
 # define ENTRY(name) \
diff --git a/sysdeps/i386/i686/multiarch/bzero.S b/sysdeps/i386/i686/multiarch/bzero.S
index e8dc85f..95c96a8 100644
--- a/sysdeps/i386/i686/multiarch/bzero.S
+++ b/sysdeps/i386/i686/multiarch/bzero.S
@@ -23,46 +23,19 @@
 
 /* Define multiple versions only for the definition in lib.  */
 #if IS_IN (libc)
-# ifdef SHARED
-	.text
-ENTRY(__bzero)
-	.type	__bzero, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	LOAD_PIC_REG(bx)
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__bzero_ia32@GOTOFF(%ebx), %eax
-	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
-	jz	2f
-	leal	__bzero_sse2@GOTOFF(%ebx), %eax
-	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx)
-	jz	2f
-	leal	__bzero_sse2_rep@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4)
-	cfi_restore (ebx)
-	ret
-END(__bzero)
-# else
 	.text
 ENTRY(__bzero)
 	.type	__bzero, @gnu_indirect_function
-	cmpl	$0, KIND_OFFSET+__cpu_features
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__bzero_ia32, %eax
-	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__bzero_ia32)
+	HAS_CPU_FEATURE (SSE2)
 	jz	2f
-	leal	__bzero_sse2, %eax
-	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features
+	LOAD_FUNC_GOT_EAX ( __bzero_sse2)
+	HAS_CPU_FEATURE (Fast_Rep_String)
 	jz	2f
-	leal	__bzero_sse2_rep, %eax
+	LOAD_FUNC_GOT_EAX (__bzero_sse2_rep)
 2:	ret
 END(__bzero)
-# endif
 
 # undef ENTRY
 # define ENTRY(name) \
diff --git a/sysdeps/i386/i686/multiarch/ifunc-impl-list.c b/sysdeps/i386/i686/multiarch/ifunc-impl-list.c
index 92366a7..a6735a8 100644
--- a/sysdeps/i386/i686/multiarch/ifunc-impl-list.c
+++ b/sysdeps/i386/i686/multiarch/ifunc-impl-list.c
@@ -38,152 +38,179 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/i386/i686/multiarch/bcopy.S.  */
   IFUNC_IMPL (i, name, bcopy,
-	      IFUNC_IMPL_ADD (array, i, bcopy, HAS_SSSE3,
+	      IFUNC_IMPL_ADD (array, i, bcopy, HAS_CPU_FEATURE (SSSE3),
 			      __bcopy_ssse3_rep)
-	      IFUNC_IMPL_ADD (array, i, bcopy, HAS_SSSE3, __bcopy_ssse3)
-	      IFUNC_IMPL_ADD (array, i, bcopy, HAS_SSE2,
+	      IFUNC_IMPL_ADD (array, i, bcopy, HAS_CPU_FEATURE (SSSE3),
+			      __bcopy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, bcopy, HAS_CPU_FEATURE (SSE2),
 			      __bcopy_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, bcopy, 1, __bcopy_ia32))
 
   /* Support sysdeps/i386/i686/multiarch/bzero.S.  */
   IFUNC_IMPL (i, name, bzero,
-	      IFUNC_IMPL_ADD (array, i, bzero, HAS_SSE2, __bzero_sse2_rep)
-	      IFUNC_IMPL_ADD (array, i, bzero, HAS_SSE2, __bzero_sse2)
+	      IFUNC_IMPL_ADD (array, i, bzero, HAS_CPU_FEATURE (SSE2),
+			      __bzero_sse2_rep)
+	      IFUNC_IMPL_ADD (array, i, bzero, HAS_CPU_FEATURE (SSE2),
+			      __bzero_sse2)
 	      IFUNC_IMPL_ADD (array, i, bzero, 1, __bzero_ia32))
 
   /* Support sysdeps/i386/i686/multiarch/memchr.S.  */
   IFUNC_IMPL (i, name, memchr,
-	      IFUNC_IMPL_ADD (array, i, memchr, HAS_SSE2,
+	      IFUNC_IMPL_ADD (array, i, memchr, HAS_CPU_FEATURE (SSE2),
 			      __memchr_sse2_bsf)
-	      IFUNC_IMPL_ADD (array, i, memchr, HAS_SSE2, __memchr_sse2)
+	      IFUNC_IMPL_ADD (array, i, memchr, HAS_CPU_FEATURE (SSE2),
+			      __memchr_sse2)
 	      IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_ia32))
 
   /* Support sysdeps/i386/i686/multiarch/memcmp.S.  */
   IFUNC_IMPL (i, name, memcmp,
-	      IFUNC_IMPL_ADD (array, i, memcmp, HAS_SSE4_2,
+	      IFUNC_IMPL_ADD (array, i, memcmp, HAS_CPU_FEATURE (SSE4_2),
 			      __memcmp_sse4_2)
-	      IFUNC_IMPL_ADD (array, i, memcmp, HAS_SSSE3, __memcmp_ssse3)
+	      IFUNC_IMPL_ADD (array, i, memcmp, HAS_CPU_FEATURE (SSSE3),
+			      __memcmp_ssse3)
 	      IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_ia32))
 
   /* Support sysdeps/i386/i686/multiarch/memmove_chk.S.  */
   IFUNC_IMPL (i, name, __memmove_chk,
-	      IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_SSSE3,
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+			      HAS_CPU_FEATURE (SSSE3),
 			      __memmove_chk_ssse3_rep)
-	      IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_SSSE3,
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+			      HAS_CPU_FEATURE (SSSE3),
 			      __memmove_chk_ssse3)
-	      IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_SSE2,
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+			      HAS_CPU_FEATURE (SSE2),
 			      __memmove_chk_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
 			      __memmove_chk_ia32))
 
   /* Support sysdeps/i386/i686/multiarch/memmove.S.  */
   IFUNC_IMPL (i, name, memmove,
-	      IFUNC_IMPL_ADD (array, i, memmove, HAS_SSSE3,
+	      IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3),
 			      __memmove_ssse3_rep)
-	      IFUNC_IMPL_ADD (array, i, memmove, HAS_SSSE3,
+	      IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3),
 			      __memmove_ssse3)
-	      IFUNC_IMPL_ADD (array, i, memmove, HAS_SSE2,
+	      IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSE2),
 			      __memmove_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_ia32))
 
   /* Support sysdeps/i386/i686/multiarch/memrchr.S.  */
   IFUNC_IMPL (i, name, memrchr,
-	      IFUNC_IMPL_ADD (array, i, memrchr, HAS_SSE2,
+	      IFUNC_IMPL_ADD (array, i, memrchr, HAS_CPU_FEATURE (SSE2),
 			      __memrchr_sse2_bsf)
-	      IFUNC_IMPL_ADD (array, i, memrchr, HAS_SSE2, __memrchr_sse2)
+	      IFUNC_IMPL_ADD (array, i, memrchr, HAS_CPU_FEATURE (SSE2),
+			      __memrchr_sse2)
 	      IFUNC_IMPL_ADD (array, i, memrchr, 1, __memrchr_ia32))
 
   /* Support sysdeps/i386/i686/multiarch/memset_chk.S.  */
   IFUNC_IMPL (i, name, __memset_chk,
-	      IFUNC_IMPL_ADD (array, i, __memset_chk, HAS_SSE2,
+	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+			      HAS_CPU_FEATURE (SSE2),
 			      __memset_chk_sse2_rep)
-	      IFUNC_IMPL_ADD (array, i, __memset_chk, HAS_SSE2,
+	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+			      HAS_CPU_FEATURE (SSE2),
 			      __memset_chk_sse2)
 	      IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
 			      __memset_chk_ia32))
 
   /* Support sysdeps/i386/i686/multiarch/memset.S.  */
   IFUNC_IMPL (i, name, memset,
-	      IFUNC_IMPL_ADD (array, i, memset, HAS_SSE2,
+	      IFUNC_IMPL_ADD (array, i, memset, HAS_CPU_FEATURE (SSE2),
 			      __memset_sse2_rep)
-	      IFUNC_IMPL_ADD (array, i, memset, HAS_SSE2, __memset_sse2)
+	      IFUNC_IMPL_ADD (array, i, memset, HAS_CPU_FEATURE (SSE2),
+			      __memset_sse2)
 	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_ia32))
 
   /* Support sysdeps/i386/i686/multiarch/rawmemchr.S.  */
   IFUNC_IMPL (i, name, rawmemchr,
-	      IFUNC_IMPL_ADD (array, i, rawmemchr, HAS_SSE2,
+	      IFUNC_IMPL_ADD (array, i, rawmemchr, HAS_CPU_FEATURE (SSE2),
 			      __rawmemchr_sse2_bsf)
-	      IFUNC_IMPL_ADD (array, i, rawmemchr, HAS_SSE2,
+	      IFUNC_IMPL_ADD (array, i, rawmemchr, HAS_CPU_FEATURE (SSE2),
 			      __rawmemchr_sse2)
 	      IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_ia32))
 
   /* Support sysdeps/i386/i686/multiarch/stpncpy.S.  */
   IFUNC_IMPL (i, name, stpncpy,
-	      IFUNC_IMPL_ADD (array, i, stpncpy, HAS_SSSE3,
+	      IFUNC_IMPL_ADD (array, i, stpncpy, HAS_CPU_FEATURE (SSSE3),
 			      __stpncpy_ssse3)
-	      IFUNC_IMPL_ADD (array, i, stpncpy, HAS_SSE2, __stpncpy_sse2)
+	      IFUNC_IMPL_ADD (array, i, stpncpy, HAS_CPU_FEATURE (SSE2),
+			      __stpncpy_sse2)
 	      IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_ia32))
 
   /* Support sysdeps/i386/i686/multiarch/stpcpy.S.  */
   IFUNC_IMPL (i, name, stpcpy,
-	      IFUNC_IMPL_ADD (array, i, stpcpy, HAS_SSSE3, __stpcpy_ssse3)
-	      IFUNC_IMPL_ADD (array, i, stpcpy, HAS_SSE2, __stpcpy_sse2)
+	      IFUNC_IMPL_ADD (array, i, stpcpy, HAS_CPU_FEATURE (SSSE3),
+			      __stpcpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, stpcpy, HAS_CPU_FEATURE (SSE2),
+			      __stpcpy_sse2)
 	      IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_ia32))
 
   /* Support sysdeps/i386/i686/multiarch/strcasecmp.S.  */
   IFUNC_IMPL (i, name, strcasecmp,
-	      IFUNC_IMPL_ADD (array, i, strcasecmp, HAS_SSE4_2,
+	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+			      HAS_CPU_FEATURE (SSE4_2),
 			      __strcasecmp_sse4_2)
-	      IFUNC_IMPL_ADD (array, i, strcasecmp, HAS_SSSE3,
+	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+			      HAS_CPU_FEATURE (SSSE3),
 			      __strcasecmp_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strcasecmp, 1, __strcasecmp_ia32))
 
   /* Support sysdeps/i386/i686/multiarch/strcasecmp_l.S.  */
   IFUNC_IMPL (i, name, strcasecmp_l,
-	      IFUNC_IMPL_ADD (array, i, strcasecmp_l, HAS_SSE4_2,
+	      IFUNC_IMPL_ADD (array, i, strcasecmp_l,
+			      HAS_CPU_FEATURE (SSE4_2),
 			      __strcasecmp_l_sse4_2)
-	      IFUNC_IMPL_ADD (array, i, strcasecmp_l, HAS_SSSE3,
+	      IFUNC_IMPL_ADD (array, i, strcasecmp_l,
+			      HAS_CPU_FEATURE (SSSE3),
 			      __strcasecmp_l_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strcasecmp_l, 1,
 			      __strcasecmp_l_ia32))
 
   /* Support sysdeps/i386/i686/multiarch/strcat.S.  */
   IFUNC_IMPL (i, name, strcat,
-	      IFUNC_IMPL_ADD (array, i, strcat, HAS_SSSE3, __strcat_ssse3)
-	      IFUNC_IMPL_ADD (array, i, strcat, HAS_SSE2, __strcat_sse2)
+	      IFUNC_IMPL_ADD (array, i, strcat, HAS_CPU_FEATURE (SSSE3),
+			      __strcat_ssse3)
+	      IFUNC_IMPL_ADD (array, i, strcat, HAS_CPU_FEATURE (SSE2),
+			      __strcat_sse2)
 	      IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_ia32))
 
   /* Support sysdeps/i386/i686/multiarch/strchr.S.  */
   IFUNC_IMPL (i, name, strchr,
-	      IFUNC_IMPL_ADD (array, i, strchr, HAS_SSE2,
+	      IFUNC_IMPL_ADD (array, i, strchr, HAS_CPU_FEATURE (SSE2),
 			      __strchr_sse2_bsf)
-	      IFUNC_IMPL_ADD (array, i, strchr, HAS_SSE2, __strchr_sse2)
+	      IFUNC_IMPL_ADD (array, i, strchr, HAS_CPU_FEATURE (SSE2),
+			      __strchr_sse2)
 	      IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_ia32))
 
   /* Support sysdeps/i386/i686/multiarch/strcmp.S.  */
   IFUNC_IMPL (i, name, strcmp,
-	      IFUNC_IMPL_ADD (array, i, strcmp, HAS_SSE4_2,
+	      IFUNC_IMPL_ADD (array, i, strcmp, HAS_CPU_FEATURE (SSE4_2),
 			      __strcmp_sse4_2)
-	      IFUNC_IMPL_ADD (array, i, strcmp, HAS_SSSE3, __strcmp_ssse3)
+	      IFUNC_IMPL_ADD (array, i, strcmp, HAS_CPU_FEATURE (SSSE3),
+			      __strcmp_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_ia32))
 
   /* Support sysdeps/i386/i686/multiarch/strcpy.S.  */
   IFUNC_IMPL (i, name, strcpy,
-	      IFUNC_IMPL_ADD (array, i, strcpy, HAS_SSSE3, __strcpy_ssse3)
-	      IFUNC_IMPL_ADD (array, i, strcpy, HAS_SSE2, __strcpy_sse2)
+	      IFUNC_IMPL_ADD (array, i, strcpy, HAS_CPU_FEATURE (SSSE3),
+			      __strcpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, strcpy, HAS_CPU_FEATURE (SSE2),
+			      __strcpy_sse2)
 	      IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_ia32))
 
   /* Support sysdeps/i386/i686/multiarch/strcspn.S.  */
   IFUNC_IMPL (i, name, strcspn,
-	      IFUNC_IMPL_ADD (array, i, strcspn, HAS_SSE4_2,
+	      IFUNC_IMPL_ADD (array, i, strcspn, HAS_CPU_FEATURE (SSE4_2),
 			      __strcspn_sse42)
 	      IFUNC_IMPL_ADD (array, i, strcspn, 1, __strcspn_ia32))
 
   /* Support sysdeps/i386/i686/multiarch/strncase.S.  */
   IFUNC_IMPL (i, name, strncasecmp,
-	      IFUNC_IMPL_ADD (array, i, strncasecmp, HAS_SSE4_2,
+	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+			      HAS_CPU_FEATURE (SSE4_2),
 			      __strncasecmp_sse4_2)
-	      IFUNC_IMPL_ADD (array, i, strncasecmp, HAS_SSSE3,
+	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+			      HAS_CPU_FEATURE (SSSE3),
 			      __strncasecmp_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strncasecmp, 1,
 			      __strncasecmp_ia32))
@@ -191,136 +218,156 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/i386/i686/multiarch/strncase_l.S.  */
   IFUNC_IMPL (i, name, strncasecmp_l,
 	      IFUNC_IMPL_ADD (array, i, strncasecmp_l,
-			      HAS_SSE4_2, __strncasecmp_l_sse4_2)
+			      HAS_CPU_FEATURE (SSE4_2),
+			      __strncasecmp_l_sse4_2)
 	      IFUNC_IMPL_ADD (array, i, strncasecmp_l,
-			      HAS_SSSE3, __strncasecmp_l_ssse3)
+			      HAS_CPU_FEATURE (SSSE3),
+			      __strncasecmp_l_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strncasecmp_l, 1,
 			      __strncasecmp_l_ia32))
 
   /* Support sysdeps/i386/i686/multiarch/strncat.S.  */
   IFUNC_IMPL (i, name, strncat,
-	      IFUNC_IMPL_ADD (array, i, strncat, HAS_SSSE3,
+	      IFUNC_IMPL_ADD (array, i, strncat, HAS_CPU_FEATURE (SSSE3),
 			      __strncat_ssse3)
-	      IFUNC_IMPL_ADD (array, i, strncat, HAS_SSE2, __strncat_sse2)
+	      IFUNC_IMPL_ADD (array, i, strncat, HAS_CPU_FEATURE (SSE2),
+			      __strncat_sse2)
 	      IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_ia32))
 
   /* Support sysdeps/i386/i686/multiarch/strncpy.S.  */
   IFUNC_IMPL (i, name, strncpy,
-	      IFUNC_IMPL_ADD (array, i, strncpy, HAS_SSSE3,
+	      IFUNC_IMPL_ADD (array, i, strncpy, HAS_CPU_FEATURE (SSSE3),
 			      __strncpy_ssse3)
-	      IFUNC_IMPL_ADD (array, i, strncpy, HAS_SSE2, __strncpy_sse2)
+	      IFUNC_IMPL_ADD (array, i, strncpy, HAS_CPU_FEATURE (SSE2),
+			      __strncpy_sse2)
 	      IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_ia32))
 
   /* Support sysdeps/i386/i686/multiarch/strnlen.S.  */
   IFUNC_IMPL (i, name, strnlen,
-	      IFUNC_IMPL_ADD (array, i, strnlen, HAS_SSE2, __strnlen_sse2)
+	      IFUNC_IMPL_ADD (array, i, strnlen, HAS_CPU_FEATURE (SSE2),
+			      __strnlen_sse2)
 	      IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_ia32))
 
   /* Support sysdeps/i386/i686/multiarch/strpbrk.S.  */
   IFUNC_IMPL (i, name, strpbrk,
-	      IFUNC_IMPL_ADD (array, i, strpbrk, HAS_SSE4_2,
+	      IFUNC_IMPL_ADD (array, i, strpbrk, HAS_CPU_FEATURE (SSE4_2),
 			      __strpbrk_sse42)
 	      IFUNC_IMPL_ADD (array, i, strpbrk, 1, __strpbrk_ia32))
 
   /* Support sysdeps/i386/i686/multiarch/strrchr.S.  */
   IFUNC_IMPL (i, name, strrchr,
-	      IFUNC_IMPL_ADD (array, i, strrchr, HAS_SSE2,
+	      IFUNC_IMPL_ADD (array, i, strrchr, HAS_CPU_FEATURE (SSE2),
 			      __strrchr_sse2_bsf)
-	      IFUNC_IMPL_ADD (array, i, strrchr, HAS_SSE2, __strrchr_sse2)
+	      IFUNC_IMPL_ADD (array, i, strrchr, HAS_CPU_FEATURE (SSE2),
+			      __strrchr_sse2)
 	      IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_ia32))
 
   /* Support sysdeps/i386/i686/multiarch/strspn.S.  */
   IFUNC_IMPL (i, name, strspn,
-	      IFUNC_IMPL_ADD (array, i, strspn, HAS_SSE4_2, __strspn_sse42)
+	      IFUNC_IMPL_ADD (array, i, strspn, HAS_CPU_FEATURE (SSE4_2),
+			      __strspn_sse42)
 	      IFUNC_IMPL_ADD (array, i, strspn, 1, __strspn_ia32))
 
   /* Support sysdeps/i386/i686/multiarch/wcschr.S.  */
   IFUNC_IMPL (i, name, wcschr,
-	      IFUNC_IMPL_ADD (array, i, wcschr, HAS_SSE2, __wcschr_sse2)
+	      IFUNC_IMPL_ADD (array, i, wcschr, HAS_CPU_FEATURE (SSE2),
+			      __wcschr_sse2)
 	      IFUNC_IMPL_ADD (array, i, wcschr, 1, __wcschr_ia32))
 
   /* Support sysdeps/i386/i686/multiarch/wcscmp.S.  */
   IFUNC_IMPL (i, name, wcscmp,
-	      IFUNC_IMPL_ADD (array, i, wcscmp, HAS_SSE2, __wcscmp_sse2)
+	      IFUNC_IMPL_ADD (array, i, wcscmp, HAS_CPU_FEATURE (SSE2),
+			      __wcscmp_sse2)
 	      IFUNC_IMPL_ADD (array, i, wcscmp, 1, __wcscmp_ia32))
 
   /* Support sysdeps/i386/i686/multiarch/wcscpy.S.  */
   IFUNC_IMPL (i, name, wcscpy,
-	      IFUNC_IMPL_ADD (array, i, wcscpy, HAS_SSSE3, __wcscpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, wcscpy, HAS_CPU_FEATURE (SSSE3),
+			      __wcscpy_ssse3)
 	      IFUNC_IMPL_ADD (array, i, wcscpy, 1, __wcscpy_ia32))
 
   /* Support sysdeps/i386/i686/multiarch/wcslen.S.  */
   IFUNC_IMPL (i, name, wcslen,
-	      IFUNC_IMPL_ADD (array, i, wcslen, HAS_SSE2, __wcslen_sse2)
+	      IFUNC_IMPL_ADD (array, i, wcslen, HAS_CPU_FEATURE (SSE2),
+			      __wcslen_sse2)
 	      IFUNC_IMPL_ADD (array, i, wcslen, 1, __wcslen_ia32))
 
   /* Support sysdeps/i386/i686/multiarch/wcsrchr.S.  */
   IFUNC_IMPL (i, name, wcsrchr,
-	      IFUNC_IMPL_ADD (array, i, wcsrchr, HAS_SSE2, __wcsrchr_sse2)
+	      IFUNC_IMPL_ADD (array, i, wcsrchr, HAS_CPU_FEATURE (SSE2),
+			      __wcsrchr_sse2)
 	      IFUNC_IMPL_ADD (array, i, wcsrchr, 1, __wcsrchr_ia32))
 
   /* Support sysdeps/i386/i686/multiarch/wmemcmp.S.  */
   IFUNC_IMPL (i, name, wmemcmp,
-	      IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_SSE4_2,
+	      IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_CPU_FEATURE (SSE4_2),
 			      __wmemcmp_sse4_2)
-	      IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_SSSE3,
+	      IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_CPU_FEATURE (SSSE3),
 			      __wmemcmp_ssse3)
 	      IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_ia32))
 
 #ifdef SHARED
   /* Support sysdeps/i386/i686/multiarch/memcpy_chk.S.  */
   IFUNC_IMPL (i, name, __memcpy_chk,
-	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_SSSE3,
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+			      HAS_CPU_FEATURE (SSSE3),
 			      __memcpy_chk_ssse3_rep)
-	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_SSSE3,
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+			      HAS_CPU_FEATURE (SSSE3),
 			      __memcpy_chk_ssse3)
-	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_SSE2,
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+			      HAS_CPU_FEATURE (SSE2),
 			      __memcpy_chk_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
 			      __memcpy_chk_ia32))
 
   /* Support sysdeps/i386/i686/multiarch/memcpy.S.  */
   IFUNC_IMPL (i, name, memcpy,
-	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSSE3,
+	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3),
 			      __memcpy_ssse3_rep)
-	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSSE3, __memcpy_ssse3)
-	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSE2,
+	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3),
+			      __memcpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSE2),
 			      __memcpy_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_ia32))
 
   /* Support sysdeps/i386/i686/multiarch/mempcpy_chk.S.  */
   IFUNC_IMPL (i, name, __mempcpy_chk,
-	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_SSSE3,
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+			      HAS_CPU_FEATURE (SSSE3),
 			      __mempcpy_chk_ssse3_rep)
-	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_SSSE3,
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+			      HAS_CPU_FEATURE (SSSE3),
 			      __mempcpy_chk_ssse3)
-	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_SSE2,
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+			      HAS_CPU_FEATURE (SSE2),
 			      __mempcpy_chk_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
 			      __mempcpy_chk_ia32))
 
   /* Support sysdeps/i386/i686/multiarch/mempcpy.S.  */
   IFUNC_IMPL (i, name, mempcpy,
-	      IFUNC_IMPL_ADD (array, i, mempcpy, HAS_SSSE3,
+	      IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
 			      __mempcpy_ssse3_rep)
-	      IFUNC_IMPL_ADD (array, i, mempcpy, HAS_SSSE3,
+	      IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
 			      __mempcpy_ssse3)
-	      IFUNC_IMPL_ADD (array, i, mempcpy, HAS_SSE2,
+	      IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSE2),
 			      __mempcpy_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_ia32))
 
   /* Support sysdeps/i386/i686/multiarch/strlen.S.  */
   IFUNC_IMPL (i, name, strlen,
-	      IFUNC_IMPL_ADD (array, i, strlen, HAS_SSE2,
+	      IFUNC_IMPL_ADD (array, i, strlen, HAS_CPU_FEATURE (SSE2),
 			      __strlen_sse2_bsf)
-	      IFUNC_IMPL_ADD (array, i, strlen, HAS_SSE2, __strlen_sse2)
+	      IFUNC_IMPL_ADD (array, i, strlen, HAS_CPU_FEATURE (SSE2),
+			      __strlen_sse2)
 	      IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_ia32))
 
   /* Support sysdeps/i386/i686/multiarch/strncmp.S.  */
   IFUNC_IMPL (i, name, strncmp,
-	      IFUNC_IMPL_ADD (array, i, strncmp, HAS_SSE4_2,
+	      IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSE4_2),
 			      __strncmp_sse4_2)
-	      IFUNC_IMPL_ADD (array, i, strncmp, HAS_SSSE3,
+	      IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSSE3),
 			      __strncmp_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_ia32))
 #endif
diff --git a/sysdeps/i386/i686/multiarch/memchr.S b/sysdeps/i386/i686/multiarch/memchr.S
index 02994d0..65e6b96 100644
--- a/sysdeps/i386/i686/multiarch/memchr.S
+++ b/sysdeps/i386/i686/multiarch/memchr.S
@@ -22,46 +22,22 @@
 #include <init-arch.h>
 
 #if IS_IN (libc)
-# define CFI_POP(REG) \
-	cfi_adjust_cfa_offset (-4); \
-	cfi_restore (REG)
-
-# define CFI_PUSH(REG) \
-	cfi_adjust_cfa_offset (4); \
-	cfi_rel_offset (REG, 0)
-
 	.text
 ENTRY(__memchr)
 	.type	__memchr, @gnu_indirect_function
-	pushl	%ebx
-	CFI_PUSH (%ebx)
-	LOAD_PIC_REG(bx)
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-
-1:	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	HAS_CPU_FEATURE (SSE2)
 	jz	2f
-	testl	$bit_Slow_BSF, FEATURE_OFFSET+index_Slow_BSF+__cpu_features@GOTOFF(%ebx)
+	HAS_ARCH_FEATURE (Slow_BSF)
 	jz	3f
 
-	leal	__memchr_sse2@GOTOFF(%ebx), %eax
-	popl	%ebx
-	CFI_POP	(%ebx)
+	LOAD_FUNC_GOT_EAX ( __memchr_sse2)
 	ret
 
-	CFI_PUSH (%ebx)
-
-2:	leal	__memchr_ia32@GOTOFF(%ebx), %eax
-	popl	%ebx
-	CFI_POP	(%ebx)
+2:	LOAD_FUNC_GOT_EAX (__memchr_ia32)
 	ret
 
-	CFI_PUSH (%ebx)
-
-3:	leal	__memchr_sse2_bsf@GOTOFF(%ebx), %eax
-	popl	%ebx
-	CFI_POP	(%ebx)
+3:	LOAD_FUNC_GOT_EAX (__memchr_sse2_bsf)
 	ret
 END(__memchr)
 
diff --git a/sysdeps/i386/i686/multiarch/memcmp.S b/sysdeps/i386/i686/multiarch/memcmp.S
index 6b607eb..d4d7d2e 100644
--- a/sysdeps/i386/i686/multiarch/memcmp.S
+++ b/sysdeps/i386/i686/multiarch/memcmp.S
@@ -23,46 +23,19 @@
 
 /* Define multiple versions only for the definition in libc. */
 #if IS_IN (libc)
-# ifdef SHARED
-	.text
-ENTRY(memcmp)
-	.type	memcmp, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	LOAD_PIC_REG(bx)
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__memcmp_ia32@GOTOFF(%ebx), %eax
-	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
-	jz	2f
-	leal	__memcmp_ssse3@GOTOFF(%ebx), %eax
-	testl	$bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features@GOTOFF(%ebx)
-	jz	2f
-	leal	__memcmp_sse4_2@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4)
-	cfi_restore (ebx)
-	ret
-END(memcmp)
-# else
 	.text
 ENTRY(memcmp)
 	.type	memcmp, @gnu_indirect_function
-	cmpl	$0, KIND_OFFSET+__cpu_features
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__memcmp_ia32, %eax
-	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__memcmp_ia32)
+	HAS_CPU_FEATURE (SSSE3)
 	jz	2f
-	leal	__memcmp_ssse3, %eax
-	testl	$bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features
+	LOAD_FUNC_GOT_EAX (__memcmp_ssse3)
+	HAS_CPU_FEATURE (SSE4_2)
 	jz	2f
-	leal	__memcmp_sse4_2, %eax
+	LOAD_FUNC_GOT_EAX (__memcmp_sse4_2)
 2:	ret
 END(memcmp)
-# endif
 
 # undef ENTRY
 # define ENTRY(name) \
diff --git a/sysdeps/i386/i686/multiarch/memcpy.S b/sysdeps/i386/i686/multiarch/memcpy.S
index c6d20bd..9a4d183 100644
--- a/sysdeps/i386/i686/multiarch/memcpy.S
+++ b/sysdeps/i386/i686/multiarch/memcpy.S
@@ -28,29 +28,20 @@
 	.text
 ENTRY(memcpy)
 	.type	memcpy, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	LOAD_PIC_REG(bx)
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__memcpy_ia32@GOTOFF(%ebx), %eax
-	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__memcpy_ia32)
+	HAS_CPU_FEATURE (SSE2)
 	jz	2f
-	leal	__memcpy_sse2_unaligned@GOTOFF(%ebx), %eax
-	testl	$bit_Fast_Unaligned_Load, FEATURE_OFFSET+index_Fast_Unaligned_Load+__cpu_features@GOTOFF(%ebx)
+	LOAD_FUNC_GOT_EAX (__memcpy_sse2_unaligned)
+	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
 	jnz	2f
-	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
+	HAS_CPU_FEATURE (SSSE3)
 	jz	2f
-	leal	__memcpy_ssse3@GOTOFF(%ebx), %eax
-	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx)
+	LOAD_FUNC_GOT_EAX (__memcpy_ssse3)
+	HAS_CPU_FEATURE (Fast_Rep_String)
 	jz	2f
-	leal	__memcpy_ssse3_rep@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4)
-	cfi_restore (ebx)
-	ret
+	LOAD_FUNC_GOT_EAX (__memcpy_ssse3_rep)
+2:	ret
 END(memcpy)
 
 # undef ENTRY
diff --git a/sysdeps/i386/i686/multiarch/memcpy_chk.S b/sysdeps/i386/i686/multiarch/memcpy_chk.S
index 9399587..3bbd921 100644
--- a/sysdeps/i386/i686/multiarch/memcpy_chk.S
+++ b/sysdeps/i386/i686/multiarch/memcpy_chk.S
@@ -29,29 +29,20 @@
 	.text
 ENTRY(__memcpy_chk)
 	.type	__memcpy_chk, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	LOAD_PIC_REG(bx)
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__memcpy_chk_ia32@GOTOFF(%ebx), %eax
-	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__memcpy_chk_ia32)
+	HAS_CPU_FEATURE (SSE2)
 	jz	2f
-	leal	__memcpy_chk_sse2_unaligned@GOTOFF(%ebx), %eax
-	testl	$bit_Fast_Unaligned_Load, FEATURE_OFFSET+index_Fast_Unaligned_Load+__cpu_features@GOTOFF(%ebx)
+	LOAD_FUNC_GOT_EAX (__memcpy_chk_sse2_unaligned)
+	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
 	jnz	2f
-	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
+	HAS_CPU_FEATURE (SSSE3)
 	jz	2f
-	leal	__memcpy_chk_ssse3@GOTOFF(%ebx), %eax
-	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx)
+	LOAD_FUNC_GOT_EAX (__memcpy_chk_ssse3)
+	HAS_CPU_FEATURE (Fast_Rep_String)
 	jz	2f
-	leal	__memcpy_chk_ssse3_rep@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4)
-	cfi_restore (ebx)
-	ret
+	LOAD_FUNC_GOT_EAX (__memcpy_chk_ssse3_rep)
+2:	ret
 END(__memcpy_chk)
 # else
 #  include "../memcpy_chk.S"
diff --git a/sysdeps/i386/i686/multiarch/memmove.S b/sysdeps/i386/i686/multiarch/memmove.S
index 7033463..2bf427f 100644
--- a/sysdeps/i386/i686/multiarch/memmove.S
+++ b/sysdeps/i386/i686/multiarch/memmove.S
@@ -23,37 +23,28 @@
 
 /* Define multiple versions only for the definition in lib.  */
 #if IS_IN (libc)
-# ifdef SHARED
 	.text
 ENTRY(memmove)
 	.type	memmove, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	LOAD_PIC_REG(bx)
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__memmove_ia32@GOTOFF(%ebx), %eax
-	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__memmove_ia32)
+	HAS_CPU_FEATURE (SSE2)
 	jz	2f
-	leal	__memmove_sse2_unaligned@GOTOFF(%ebx), %eax
-	testl	$bit_Fast_Unaligned_Load, FEATURE_OFFSET+index_Fast_Unaligned_Load+__cpu_features@GOTOFF(%ebx)
+	LOAD_FUNC_GOT_EAX (__memmove_sse2_unaligned)
+	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
 	jnz	2f
-	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
+	HAS_CPU_FEATURE (SSSE3)
 	jz	2f
-	leal	__memmove_ssse3@GOTOFF(%ebx), %eax
-	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx)
+	LOAD_FUNC_GOT_EAX (__memmove_ssse3)
+	HAS_ARCH_FEATURE (Fast_Rep_String)
 	jz	2f
-	leal	__memmove_ssse3_rep@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4)
-	cfi_restore (ebx)
-	ret
+	LOAD_FUNC_GOT_EAX (__memmove_ssse3_rep)
+2:	ret
 END(memmove)
 
-# undef ENTRY
-# define ENTRY(name) \
+# ifdef SHARED
+#  undef ENTRY
+#  define ENTRY(name) \
 	.type __memmove_ia32, @function; \
 	.p2align 4; \
 	.globl __memmove_ia32; \
@@ -61,29 +52,8 @@ END(memmove)
 	__memmove_ia32: cfi_startproc; \
 	CALL_MCOUNT
 # else
-	.text
-ENTRY(memmove)
-	.type	memmove, @gnu_indirect_function
-	cmpl	$0, KIND_OFFSET+__cpu_features
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__memmove_ia32, %eax
-	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features
-	jz	2f
-	leal	__memmove_sse2_unaligned, %eax
-	testl	$bit_Fast_Unaligned_Load, FEATURE_OFFSET+index_Fast_Unaligned_Load+__cpu_features
-	jnz	2f
-	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features
-	jz	2f
-	leal	__memmove_ssse3, %eax
-	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features
-	jz	2f
-	leal	__memmove_ssse3_rep, %eax
-2:	ret
-END(memmove)
-
-# undef ENTRY
-# define ENTRY(name) \
+#  undef ENTRY
+#  define ENTRY(name) \
 	.type __memmove_ia32, @function; \
 	.globl __memmove_ia32; \
 	.p2align 4; \
diff --git a/sysdeps/i386/i686/multiarch/memmove_chk.S b/sysdeps/i386/i686/multiarch/memmove_chk.S
index 2b576d4..b17f6ed 100644
--- a/sysdeps/i386/i686/multiarch/memmove_chk.S
+++ b/sysdeps/i386/i686/multiarch/memmove_chk.S
@@ -23,56 +23,26 @@
 
 /* Define multiple versions only for the definition in lib.  */
 #if IS_IN (libc)
-# ifdef SHARED
 	.text
 ENTRY(__memmove_chk)
 	.type	__memmove_chk, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	LOAD_PIC_REG(bx)
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__memmove_chk_ia32@GOTOFF(%ebx), %eax
-	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__memmove_chk_ia32)
+	HAS_CPU_FEATURE (SSE2)
 	jz	2f
-	leal	__memmove_chk_sse2_unaligned@GOTOFF(%ebx), %eax
-	testl	$bit_Fast_Unaligned_Load, FEATURE_OFFSET+index_Fast_Unaligned_Load+__cpu_features@GOTOFF(%ebx)
+	LOAD_FUNC_GOT_EAX (__memmove_chk_sse2_unaligned)
+	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
 	jnz	2f
-	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
+	HAS_CPU_FEATURE (SSSE3)
 	jz	2f
-	leal	__memmove_chk_ssse3@GOTOFF(%ebx), %eax
-	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx)
+	LOAD_FUNC_GOT_EAX (__memmove_chk_ssse3)
+	HAS_CPU_FEATURE (Fast_Rep_String)
 	jz	2f
-	leal	__memmove_chk_ssse3_rep@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4)
-	cfi_restore (ebx)
-	ret
-END(__memmove_chk)
-# else
-	.text
-ENTRY(__memmove_chk)
-	.type	__memmove_chk, @gnu_indirect_function
-	cmpl	$0, KIND_OFFSET+__cpu_features
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__memmove_chk_ia32, %eax
-	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features
-	jz	2f
-	leal	__memmove_chk_sse2_unaligned, %eax
-	testl	$bit_Fast_Unaligned_Load, FEATURE_OFFSET+index_Fast_Unaligned_Load+__cpu_features
-	jnz	2f
-	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features
-	jz	2f
-	leal	__memmove_chk_ssse3, %eax
-	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features
-	jz	2f
-	leal	__memmove_chk_ssse3_rep, %eax
+	LOAD_FUNC_GOT_EAX (__memmove_chk_ssse3_rep)
 2:	ret
 END(__memmove_chk)
 
+# ifndef SHARED
 	.type __memmove_chk_sse2_unaligned, @function
 	.p2align 4;
 __memmove_chk_sse2_unaligned:
diff --git a/sysdeps/i386/i686/multiarch/mempcpy.S b/sysdeps/i386/i686/multiarch/mempcpy.S
index 39c934e..021558a 100644
--- a/sysdeps/i386/i686/multiarch/mempcpy.S
+++ b/sysdeps/i386/i686/multiarch/mempcpy.S
@@ -28,29 +28,20 @@
 	.text
 ENTRY(__mempcpy)
 	.type	__mempcpy, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	LOAD_PIC_REG(bx)
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__mempcpy_ia32@GOTOFF(%ebx), %eax
-	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__mempcpy_ia32)
+	HAS_CPU_FEATURE (SSE2)
 	jz	2f
-	leal	__mempcpy_sse2_unaligned@GOTOFF(%ebx), %eax
-	testl	$bit_Fast_Unaligned_Load, FEATURE_OFFSET+index_Fast_Unaligned_Load+__cpu_features@GOTOFF(%ebx)
+	LOAD_FUNC_GOT_EAX (__mempcpy_sse2_unaligned)
+	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
 	jnz	2f
-	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
+	HAS_CPU_FEATURE (SSSE3)
 	jz	2f
-	leal	__mempcpy_ssse3@GOTOFF(%ebx), %eax
-	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx)
+	LOAD_FUNC_GOT_EAX (__mempcpy_ssse3)
+	HAS_CPU_FEATURE (Fast_Rep_String)
 	jz	2f
-	leal	__mempcpy_ssse3_rep@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4)
-	cfi_restore (ebx)
-	ret
+	LOAD_FUNC_GOT_EAX (__mempcpy_ssse3_rep)
+2:	ret
 END(__mempcpy)
 
 # undef ENTRY
diff --git a/sysdeps/i386/i686/multiarch/mempcpy_chk.S b/sysdeps/i386/i686/multiarch/mempcpy_chk.S
index b6fa202..1bea6ea 100644
--- a/sysdeps/i386/i686/multiarch/mempcpy_chk.S
+++ b/sysdeps/i386/i686/multiarch/mempcpy_chk.S
@@ -29,29 +29,20 @@
 	.text
 ENTRY(__mempcpy_chk)
 	.type	__mempcpy_chk, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	LOAD_PIC_REG(bx)
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__mempcpy_chk_ia32@GOTOFF(%ebx), %eax
-	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__mempcpy_chk_ia32)
+	HAS_CPU_FEATURE (SSE2)
 	jz	2f
-	leal	__mempcpy_chk_sse2_unaligned@GOTOFF(%ebx), %eax
-	testl	$bit_Fast_Unaligned_Load, FEATURE_OFFSET+index_Fast_Unaligned_Load+__cpu_features@GOTOFF(%ebx)
+	LOAD_FUNC_GOT_EAX (__mempcpy_chk_sse2_unaligned)
+	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
 	jnz	2f
-	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
+	HAS_CPU_FEATURE (SSSE3)
 	jz	2f
-	leal	__mempcpy_chk_ssse3@GOTOFF(%ebx), %eax
-	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx)
+	LOAD_FUNC_GOT_EAX (__mempcpy_chk_ssse3)
+	HAS_CPU_FEATURE (Fast_Rep_String)
 	jz	2f
-	leal	__mempcpy_chk_ssse3_rep@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4)
-	cfi_restore (ebx)
-	ret
+	LOAD_FUNC_GOT_EAX (__mempcpy_chk_ssse3_rep)
+2:	ret
 END(__mempcpy_chk)
 # else
 #  include "../mempcpy_chk.S"
diff --git a/sysdeps/i386/i686/multiarch/memrchr.S b/sysdeps/i386/i686/multiarch/memrchr.S
index 321e0b7..32fb1a6 100644
--- a/sysdeps/i386/i686/multiarch/memrchr.S
+++ b/sysdeps/i386/i686/multiarch/memrchr.S
@@ -22,46 +22,22 @@
 #include <init-arch.h>
 
 #if IS_IN (libc)
-# define CFI_POP(REG) \
-	cfi_adjust_cfa_offset (-4); \
-	cfi_restore (REG)
-
-# define CFI_PUSH(REG) \
-	cfi_adjust_cfa_offset (4); \
-	cfi_rel_offset (REG, 0)
-
 	.text
 ENTRY(__memrchr)
 	.type	__memrchr, @gnu_indirect_function
-	pushl	%ebx
-	CFI_PUSH (%ebx)
-	LOAD_PIC_REG(bx)
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-
-1:	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	HAS_CPU_FEATURE (SSE2)
 	jz	2f
-	testl	$bit_Slow_BSF, FEATURE_OFFSET+index_Slow_BSF+__cpu_features@GOTOFF(%ebx)
+	HAS_ARCH_FEATURE (Slow_BSF)
 	jz	3f
 
-	leal	__memrchr_sse2@GOTOFF(%ebx), %eax
-	popl	%ebx
-	CFI_POP	(%ebx)
+	LOAD_FUNC_GOT_EAX (__memrchr_sse2)
 	ret
 
-	CFI_PUSH (%ebx)
-
-2:	leal	__memrchr_ia32@GOTOFF(%ebx), %eax
-	popl	%ebx
-	CFI_POP	(%ebx)
+2:	LOAD_FUNC_GOT_EAX (__memrchr_ia32)
 	ret
 
-	CFI_PUSH (%ebx)
-
-3:	leal	__memrchr_sse2_bsf@GOTOFF(%ebx), %eax
-	popl	%ebx
-	CFI_POP	(%ebx)
+3:	LOAD_FUNC_GOT_EAX (__memrchr_sse2_bsf)
 	ret
 END(__memrchr)
 
diff --git a/sysdeps/i386/i686/multiarch/memset.S b/sysdeps/i386/i686/multiarch/memset.S
index 6d7d919..8015d57 100644
--- a/sysdeps/i386/i686/multiarch/memset.S
+++ b/sysdeps/i386/i686/multiarch/memset.S
@@ -23,46 +23,19 @@
 
 /* Define multiple versions only for the definition in lib.  */
 #if IS_IN (libc)
-# ifdef SHARED
-	.text
-ENTRY(memset)
-	.type	memset, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	LOAD_PIC_REG(bx)
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__memset_ia32@GOTOFF(%ebx), %eax
-	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
-	jz	2f
-	leal	__memset_sse2@GOTOFF(%ebx), %eax
-	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx)
-	jz	2f
-	leal	__memset_sse2_rep@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4)
-	cfi_restore (ebx)
-	ret
-END(memset)
-# else
 	.text
 ENTRY(memset)
 	.type	memset, @gnu_indirect_function
-	cmpl	$0, KIND_OFFSET+__cpu_features
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__memset_ia32, %eax
-	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__memset_ia32)
+	HAS_CPU_FEATURE (SSE2)
 	jz	2f
-	leal	__memset_sse2, %eax
-	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features
+	LOAD_FUNC_GOT_EAX (__memset_sse2)
+	HAS_CPU_FEATURE (Fast_Rep_String)
 	jz	2f
-	leal	__memset_sse2_rep, %eax
+	LOAD_FUNC_GOT_EAX (__memset_sse2_rep)
 2:	ret
 END(memset)
-# endif
 
 # undef ENTRY
 # define ENTRY(name) \
diff --git a/sysdeps/i386/i686/multiarch/memset_chk.S b/sysdeps/i386/i686/multiarch/memset_chk.S
index a770c0d..7be45e7 100644
--- a/sysdeps/i386/i686/multiarch/memset_chk.S
+++ b/sysdeps/i386/i686/multiarch/memset_chk.S
@@ -23,50 +23,26 @@
 
 /* Define multiple versions only for the definition in lib.  */
 #if IS_IN (libc)
-# ifdef SHARED
 	.text
 ENTRY(__memset_chk)
 	.type	__memset_chk, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	LOAD_PIC_REG(bx)
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__memset_chk_ia32@GOTOFF(%ebx), %eax
-	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__memset_chk_ia32)
+	HAS_CPU_FEATURE (SSE2)
 	jz	2f
-	leal	__memset_chk_sse2@GOTOFF(%ebx), %eax
-	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx)
+	LOAD_FUNC_GOT_EAX (__memset_chk_sse2)
+	HAS_CPU_FEATURE (Fast_Rep_String)
 	jz	2f
-	leal	__memset_chk_sse2_rep@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4)
-	cfi_restore (ebx)
-	ret
+	LOAD_FUNC_GOT_EAX (__memset_chk_sse2_rep)
+2:	ret
 END(__memset_chk)
 
+# ifdef SHARED
 strong_alias (__memset_chk, __memset_zero_constant_len_parameter)
 	.section .gnu.warning.__memset_zero_constant_len_parameter
 	.string "memset used with constant zero length parameter; this could be due to transposed parameters"
 # else
 	.text
-ENTRY(__memset_chk)
-	.type	__memset_chk, @gnu_indirect_function
-	cmpl	$0, KIND_OFFSET+__cpu_features
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__memset_chk_ia32, %eax
-	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features
-	jz	2f
-	leal	__memset_chk_sse2, %eax
-	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features
-	jz	2f
-	leal	__memset_chk_sse2_rep, %eax
-2:	ret
-END(__memset_chk)
-
 	.type __memset_chk_sse2, @function
 	.p2align 4;
 __memset_chk_sse2:
diff --git a/sysdeps/i386/i686/multiarch/rawmemchr.S b/sysdeps/i386/i686/multiarch/rawmemchr.S
index c2b7ee6..2cfbe1b 100644
--- a/sysdeps/i386/i686/multiarch/rawmemchr.S
+++ b/sysdeps/i386/i686/multiarch/rawmemchr.S
@@ -22,46 +22,22 @@
 #include <init-arch.h>
 
 #if IS_IN (libc)
-# define CFI_POP(REG) \
-	cfi_adjust_cfa_offset (-4); \
-	cfi_restore (REG)
-
-# define CFI_PUSH(REG) \
-	cfi_adjust_cfa_offset (4); \
-	cfi_rel_offset (REG, 0)
-
 	.text
 ENTRY(__rawmemchr)
 	.type	__rawmemchr, @gnu_indirect_function
-	pushl	%ebx
-	CFI_PUSH (%ebx)
-	LOAD_PIC_REG(bx)
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-
-1:	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	HAS_CPU_FEATURE (SSE2)
 	jz	2f
-	testl	$bit_Slow_BSF, FEATURE_OFFSET+index_Slow_BSF+__cpu_features@GOTOFF(%ebx)
+	HAS_ARCH_FEATURE (Slow_BSF)
 	jz	3f
 
-	leal	__rawmemchr_sse2@GOTOFF(%ebx), %eax
-	popl	%ebx
-	CFI_POP	(%ebx)
+	LOAD_FUNC_GOT_EAX (__rawmemchr_sse2)
 	ret
 
-	CFI_PUSH (%ebx)
-
-2:	leal	__rawmemchr_ia32@GOTOFF(%ebx), %eax
-	popl	%ebx
-	CFI_POP	(%ebx)
+2:	LOAD_FUNC_GOT_EAX (__rawmemchr_ia32)
 	ret
 
-	CFI_PUSH (%ebx)
-
-3:	leal	__rawmemchr_sse2_bsf@GOTOFF(%ebx), %eax
-	popl	%ebx
-	CFI_POP	(%ebx)
+3:	LOAD_FUNC_GOT_EAX (__rawmemchr_sse2_bsf)
 	ret
 END(__rawmemchr)
 
diff --git a/sysdeps/i386/i686/multiarch/s_fma.c b/sysdeps/i386/i686/multiarch/s_fma.c
index dd70f78..cf2ede5 100644
--- a/sysdeps/i386/i686/multiarch/s_fma.c
+++ b/sysdeps/i386/i686/multiarch/s_fma.c
@@ -26,7 +26,8 @@
 extern double __fma_ia32 (double x, double y, double z) attribute_hidden;
 extern double __fma_fma (double x, double y, double z) attribute_hidden;
 
-libm_ifunc (__fma, HAS_FMA ? __fma_fma : __fma_ia32);
+libm_ifunc (__fma,
+	    HAS_ARCH_FEATURE (FMA_Usable) ? __fma_fma : __fma_ia32);
 weak_alias (__fma, fma)
 
 # define __fma __fma_ia32
diff --git a/sysdeps/i386/i686/multiarch/s_fmaf.c b/sysdeps/i386/i686/multiarch/s_fmaf.c
index 9ffa4f1..526cdf1 100644
--- a/sysdeps/i386/i686/multiarch/s_fmaf.c
+++ b/sysdeps/i386/i686/multiarch/s_fmaf.c
@@ -26,7 +26,8 @@
 extern float __fmaf_ia32 (float x, float y, float z) attribute_hidden;
 extern float __fmaf_fma (float x, float y, float z) attribute_hidden;
 
-libm_ifunc (__fmaf, HAS_FMA ? __fmaf_fma : __fmaf_ia32);
+libm_ifunc (__fmaf,
+	    HAS_ARCH_FEATURE (FMA_Usable) ? __fmaf_fma : __fmaf_ia32);
 weak_alias (__fmaf, fmaf)
 
 # define __fmaf __fmaf_ia32
diff --git a/sysdeps/i386/i686/multiarch/strcasecmp.S b/sysdeps/i386/i686/multiarch/strcasecmp.S
index c30ac3a..e4b3cf5 100644
--- a/sysdeps/i386/i686/multiarch/strcasecmp.S
+++ b/sysdeps/i386/i686/multiarch/strcasecmp.S
@@ -20,49 +20,20 @@
 #include <sysdep.h>
 #include <init-arch.h>
 
-#ifdef SHARED
 	.text
 ENTRY(__strcasecmp)
 	.type	__strcasecmp, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	LOAD_PIC_REG(bx)
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__strcasecmp_ia32@GOTOFF(%ebx), %eax
-	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__strcasecmp_ia32)
+	HAS_CPU_FEATURE (SSSE3)
 	jz	2f
-	leal	__strcasecmp_ssse3@GOTOFF(%ebx), %eax
-	testl	$bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features@GOTOFF(%ebx)
+	LOAD_FUNC_GOT_EAX (__strcasecmp_ssse3)
+	HAS_CPU_FEATURE (SSE4_2)
 	jz	2f
-	testl	$bit_Slow_SSE4_2, FEATURE_OFFSET+index_Slow_SSE4_2+__cpu_features@GOTOFF(%ebx)
+	HAS_ARCH_FEATURE (Slow_SSE4_2)
 	jnz	2f
-	leal	__strcasecmp_sse4_2@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4)
-	cfi_restore (ebx)
-	ret
-END(__strcasecmp)
-#else
-	.text
-ENTRY(__strcasecmp)
-	.type	__strcasecmp, @gnu_indirect_function
-	cmpl	$0, KIND_OFFSET+__cpu_features
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__strcasecmp_ia32, %eax
-	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features
-	jz	2f
-	leal	__strcasecmp_ssse3, %eax
-	testl	$bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features
-	jz	2f
-	testl	$bit_Slow_SSE4_2, FEATURE_OFFSET+index_Slow_SSE4_2+__cpu_features
-	jnz	2f
-	leal	__strcasecmp_sse4_2, %eax
+	LOAD_FUNC_GOT_EAX (__strcasecmp_sse4_2)
 2:	ret
 END(__strcasecmp)
-#endif
 
 weak_alias (__strcasecmp, strcasecmp)
diff --git a/sysdeps/i386/i686/multiarch/strcat.S b/sysdeps/i386/i686/multiarch/strcat.S
index 474f753..45d84cd 100644
--- a/sysdeps/i386/i686/multiarch/strcat.S
+++ b/sysdeps/i386/i686/multiarch/strcat.S
@@ -45,52 +45,22 @@
    need strncat before the initialization happened.  */
 #if IS_IN (libc)
 
-# ifdef SHARED
 	.text
 ENTRY(STRCAT)
 	.type	STRCAT, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	LOAD_PIC_REG(bx)
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	STRCAT_IA32@GOTOFF(%ebx), %eax
-	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
-	jz	2f
-	leal	STRCAT_SSE2@GOTOFF(%ebx), %eax
-	testl	$bit_Fast_Unaligned_Load, FEATURE_OFFSET+index_Fast_Unaligned_Load+__cpu_features@GOTOFF(%ebx)
-	jnz	2f
-	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
-	jz	2f
-	leal	STRCAT_SSSE3@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4)
-	cfi_restore (ebx)
-	ret
-END(STRCAT)
-# else
-
-ENTRY(STRCAT)
-	.type	STRCAT, @gnu_indirect_function
-	cmpl	$0, KIND_OFFSET+__cpu_features
-	jne	1f
-	call	__init_cpu_features
-1:	leal	STRCAT_IA32, %eax
-	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (STRCAT_IA32)
+	HAS_CPU_FEATURE (SSE2)
 	jz	2f
-	leal	STRCAT_SSE2, %eax
-	testl	$bit_Fast_Unaligned_Load, FEATURE_OFFSET+index_Fast_Unaligned_Load+__cpu_features
+	LOAD_FUNC_GOT_EAX (STRCAT_SSE2)
+	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
 	jnz	2f
-	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features
+	HAS_CPU_FEATURE (SSSE3)
 	jz	2f
-	leal	STRCAT_SSSE3, %eax
+	LOAD_FUNC_GOT_EAX (STRCAT_SSSE3)
 2:	ret
 END(STRCAT)
 
-# endif
-
 # undef ENTRY
 # define ENTRY(name) \
 	.type STRCAT_IA32, @function; \
diff --git a/sysdeps/i386/i686/multiarch/strchr.S b/sysdeps/i386/i686/multiarch/strchr.S
index 45624fd..6b46565 100644
--- a/sysdeps/i386/i686/multiarch/strchr.S
+++ b/sysdeps/i386/i686/multiarch/strchr.S
@@ -25,24 +25,15 @@
 	.text
 ENTRY(strchr)
 	.type	strchr, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	LOAD_PIC_REG(bx)
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__strchr_ia32@GOTOFF(%ebx), %eax
-	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__strchr_ia32)
+	HAS_CPU_FEATURE (SSE2)
 	jz	2f
-	leal	__strchr_sse2_bsf@GOTOFF(%ebx), %eax
-	testl	$bit_Slow_BSF, FEATURE_OFFSET+index_Slow_BSF+__cpu_features@GOTOFF(%ebx)
+	LOAD_FUNC_GOT_EAX (__strchr_sse2_bsf)
+	HAS_ARCH_FEATURE (Slow_BSF)
 	jz	2f
-	leal	__strchr_sse2@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4);
-	cfi_restore (ebx)
-	ret
+	LOAD_FUNC_GOT_EAX (__strchr_sse2)
+2:	ret
 END(strchr)
 
 # undef ENTRY
diff --git a/sysdeps/i386/i686/multiarch/strcmp.S b/sysdeps/i386/i686/multiarch/strcmp.S
index 9df4008..cad179d 100644
--- a/sysdeps/i386/i686/multiarch/strcmp.S
+++ b/sysdeps/i386/i686/multiarch/strcmp.S
@@ -51,50 +51,21 @@
    define multiple versions for strncmp in static library since we
    need strncmp before the initialization happened.  */
 #if (defined SHARED || !defined USE_AS_STRNCMP) && IS_IN (libc)
-# ifdef SHARED
 	.text
 ENTRY(STRCMP)
 	.type	STRCMP, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	LOAD_PIC_REG(bx)
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__STRCMP_IA32@GOTOFF(%ebx), %eax
-	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__STRCMP_IA32)
+	HAS_CPU_FEATURE (SSSE3)
 	jz	2f
-	leal	__STRCMP_SSSE3@GOTOFF(%ebx), %eax
-	testl	$bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features@GOTOFF(%ebx)
+	LOAD_FUNC_GOT_EAX (__STRCMP_SSSE3)
+	HAS_CPU_FEATURE (SSE4_2)
 	jz	2f
-	testl	$bit_Slow_SSE4_2, FEATURE_OFFSET+index_Slow_SSE4_2+__cpu_features@GOTOFF(%ebx)
+	HAS_ARCH_FEATURE (Slow_SSE4_2)
 	jnz	2f
-	leal	__STRCMP_SSE4_2@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4)
-	cfi_restore (ebx)
-	ret
-END(STRCMP)
-# else
-	.text
-ENTRY(STRCMP)
-	.type	STRCMP, @gnu_indirect_function
-	cmpl	$0, KIND_OFFSET+__cpu_features
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__STRCMP_IA32, %eax
-	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features
-	jz	2f
-	leal	__STRCMP_SSSE3, %eax
-	testl	$bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features
-	jz	2f
-	testl	$bit_Slow_SSE4_2, FEATURE_OFFSET+index_Slow_SSE4_2+__cpu_features
-	jnz	2f
-	leal	__STRCMP_SSE4_2, %eax
+	LOAD_FUNC_GOT_EAX (__STRCMP_SSE4_2)
 2:	ret
 END(STRCMP)
-# endif
 
 # undef ENTRY
 # define ENTRY(name) \
diff --git a/sysdeps/i386/i686/multiarch/strcpy.S b/sysdeps/i386/i686/multiarch/strcpy.S
index c279d46..e9db766 100644
--- a/sysdeps/i386/i686/multiarch/strcpy.S
+++ b/sysdeps/i386/i686/multiarch/strcpy.S
@@ -61,52 +61,22 @@
    need strncpy before the initialization happened.  */
 #if IS_IN (libc)
 
-# ifdef SHARED
 	.text
 ENTRY(STRCPY)
 	.type	STRCPY, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	LOAD_PIC_REG(bx)
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	STRCPY_IA32@GOTOFF(%ebx), %eax
-	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (STRCPY_IA32)
+	HAS_CPU_FEATURE (SSE2)
 	jz	2f
-	leal	STRCPY_SSE2@GOTOFF(%ebx), %eax
-	testl	$bit_Fast_Unaligned_Load, FEATURE_OFFSET+index_Fast_Unaligned_Load+__cpu_features@GOTOFF(%ebx)
+	LOAD_FUNC_GOT_EAX (STRCPY_SSE2)
+	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
 	jnz	2f
-	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
+	HAS_CPU_FEATURE (SSSE3)
 	jz	2f
-	leal	STRCPY_SSSE3@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4)
-	cfi_restore (ebx)
-	ret
-END(STRCPY)
-# else
-
-ENTRY(STRCPY)
-	.type	STRCPY, @gnu_indirect_function
-	cmpl	$0, KIND_OFFSET+__cpu_features
-	jne	1f
-	call	__init_cpu_features
-1:	leal	STRCPY_IA32, %eax
-	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features
-	jz	2f
-	leal	STRCPY_SSE2, %eax
-	testl	$bit_Fast_Unaligned_Load, FEATURE_OFFSET+index_Fast_Unaligned_Load+__cpu_features
-	jnz	2f
-	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features
-	jz	2f
-	leal	STRCPY_SSSE3, %eax
+	LOAD_FUNC_GOT_EAX (STRCPY_SSSE3)
 2:	ret
 END(STRCPY)
 
-# endif
-
 # undef ENTRY
 # define ENTRY(name) \
 	.type STRCPY_IA32, @function; \
diff --git a/sysdeps/i386/i686/multiarch/strcspn.S b/sysdeps/i386/i686/multiarch/strcspn.S
index e6ea454..b669b97 100644
--- a/sysdeps/i386/i686/multiarch/strcspn.S
+++ b/sysdeps/i386/i686/multiarch/strcspn.S
@@ -42,40 +42,16 @@
    define multiple versions for strpbrk in static library since we
    need strpbrk before the initialization happened.  */
 #if (defined SHARED || !defined USE_AS_STRPBRK) && IS_IN (libc)
-# ifdef SHARED
 	.text
 ENTRY(STRCSPN)
 	.type	STRCSPN, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	LOAD_PIC_REG(bx)
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	STRCSPN_IA32@GOTOFF(%ebx), %eax
-	testl	$bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features@GOTOFF(%ebx)
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (STRCSPN_IA32)
+	HAS_CPU_FEATURE (SSE4_2)
 	jz	2f
-	leal	STRCSPN_SSE42@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4);
-	cfi_restore (ebx)
-	ret
-END(STRCSPN)
-# else
-	.text
-ENTRY(STRCSPN)
-	.type	STRCSPN, @gnu_indirect_function
-	cmpl	$0, KIND_OFFSET+__cpu_features
-	jne	1f
-	call	__init_cpu_features
-1:	leal	STRCSPN_IA32, %eax
-	testl	$bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features
-	jz	2f
-	leal	STRCSPN_SSE42, %eax
+	LOAD_FUNC_GOT_EAX (STRCSPN_SSE42)
 2:	ret
 END(STRCSPN)
-# endif
 
 # undef ENTRY
 # define ENTRY(name) \
diff --git a/sysdeps/i386/i686/multiarch/strlen.S b/sysdeps/i386/i686/multiarch/strlen.S
index 2e6993b..613559c 100644
--- a/sysdeps/i386/i686/multiarch/strlen.S
+++ b/sysdeps/i386/i686/multiarch/strlen.S
@@ -28,24 +28,15 @@
 	.text
 ENTRY(strlen)
 	.type	strlen, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	LOAD_PIC_REG(bx)
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__strlen_ia32@GOTOFF(%ebx), %eax
-	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__strlen_ia32)
+	HAS_CPU_FEATURE (SSE2)
 	jz	2f
-	leal	__strlen_sse2_bsf@GOTOFF(%ebx), %eax
-	testl	$bit_Slow_BSF, FEATURE_OFFSET+index_Slow_BSF+__cpu_features@GOTOFF(%ebx)
+	LOAD_FUNC_GOT_EAX (__strlen_sse2_bsf)
+	HAS_ARCH_FEATURE (Slow_BSF)
 	jz	2f
-	leal	__strlen_sse2@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4);
-	cfi_restore (ebx)
-	ret
+	LOAD_FUNC_GOT_EAX (__strlen_sse2)
+2:	ret
 END(strlen)
 
 # undef ENTRY
diff --git a/sysdeps/i386/i686/multiarch/strncase.S b/sysdeps/i386/i686/multiarch/strncase.S
index c2cb03c..0cdbeff 100644
--- a/sysdeps/i386/i686/multiarch/strncase.S
+++ b/sysdeps/i386/i686/multiarch/strncase.S
@@ -20,49 +20,20 @@
 #include <sysdep.h>
 #include <init-arch.h>
 
-#ifdef SHARED
 	.text
 ENTRY(__strncasecmp)
 	.type	__strncasecmp, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	LOAD_PIC_REG(bx)
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__strncasecmp_ia32@GOTOFF(%ebx), %eax
-	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__strncasecmp_ia32)
+	HAS_CPU_FEATURE (SSSE3)
 	jz	2f
-	leal	__strncasecmp_ssse3@GOTOFF(%ebx), %eax
-	testl	$bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features@GOTOFF(%ebx)
+	LOAD_FUNC_GOT_EAX (__strncasecmp_ssse3)
+	HAS_CPU_FEATURE (SSE4_2)
 	jz	2f
-	testl	$bit_Slow_SSE4_2, FEATURE_OFFSET+index_Slow_SSE4_2+__cpu_features@GOTOFF(%ebx)
+	HAS_ARCH_FEATURE (Slow_SSE4_2)
 	jnz	2f
-	leal	__strncasecmp_sse4_2@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4)
-	cfi_restore (ebx)
-	ret
-END(__strncasecmp)
-#else
-	.text
-ENTRY(__strncasecmp)
-	.type	__strncasecmp, @gnu_indirect_function
-	cmpl	$0, KIND_OFFSET+__cpu_features
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__strncasecmp_ia32, %eax
-	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features
-	jz	2f
-	leal	__strncasecmp_ssse3, %eax
-	testl	$bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features
-	jz	2f
-	testl	$bit_Slow_SSE4_2, FEATURE_OFFSET+index_Slow_SSE4_2+__cpu_features
-	jnz	2f
-	leal	__strncasecmp_sse4_2, %eax
+	LOAD_FUNC_GOT_EAX (__strncasecmp_sse4_2)
 2:	ret
 END(__strncasecmp)
-#endif
 
 weak_alias (__strncasecmp, strncasecmp)
diff --git a/sysdeps/i386/i686/multiarch/strnlen.S b/sysdeps/i386/i686/multiarch/strnlen.S
index 56a5136..baf21fc 100644
--- a/sysdeps/i386/i686/multiarch/strnlen.S
+++ b/sysdeps/i386/i686/multiarch/strnlen.S
@@ -25,21 +25,12 @@
 	.text
 ENTRY(__strnlen)
 	.type	__strnlen, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	LOAD_PIC_REG(bx)
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__strnlen_ia32@GOTOFF(%ebx), %eax
-	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__strnlen_ia32)
+	HAS_CPU_FEATURE (SSE2)
 	jz	2f
-	leal	__strnlen_sse2@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4);
-	cfi_restore (ebx)
-	ret
+	LOAD_FUNC_GOT_EAX (__strnlen_sse2)
+2:	ret
 END(__strnlen)
 
 weak_alias(__strnlen, strnlen)
diff --git a/sysdeps/i386/i686/multiarch/strrchr.S b/sysdeps/i386/i686/multiarch/strrchr.S
index 91074b4..6aa3321 100644
--- a/sysdeps/i386/i686/multiarch/strrchr.S
+++ b/sysdeps/i386/i686/multiarch/strrchr.S
@@ -25,24 +25,15 @@
 	.text
 ENTRY(strrchr)
 	.type	strrchr, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	LOAD_PIC_REG(bx)
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__strrchr_ia32@GOTOFF(%ebx), %eax
-	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__strrchr_ia32)
+	HAS_CPU_FEATURE (SSE2)
 	jz	2f
-	leal	__strrchr_sse2_bsf@GOTOFF(%ebx), %eax
-	testl	$bit_Slow_BSF, FEATURE_OFFSET+index_Slow_BSF+__cpu_features@GOTOFF(%ebx)
+	LOAD_FUNC_GOT_EAX (__strrchr_sse2_bsf)
+	HAS_ARCH_FEATURE (Slow_BSF)
 	jz	2f
-	leal	__strrchr_sse2@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4);
-	cfi_restore (ebx)
-	ret
+	LOAD_FUNC_GOT_EAX (__strrchr_sse2)
+2:	ret
 END(strrchr)
 
 # undef ENTRY
diff --git a/sysdeps/i386/i686/multiarch/strspn.S b/sysdeps/i386/i686/multiarch/strspn.S
index 9d353a2..4ba87be 100644
--- a/sysdeps/i386/i686/multiarch/strspn.S
+++ b/sysdeps/i386/i686/multiarch/strspn.S
@@ -27,40 +27,16 @@
 
 /* Define multiple versions only for the definition in libc.  */
 #if IS_IN (libc)
-# ifdef SHARED
 	.text
 ENTRY(strspn)
 	.type	strspn, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	LOAD_PIC_REG(bx)
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__strspn_ia32@GOTOFF(%ebx), %eax
-	testl	$bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features@GOTOFF(%ebx)
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__strspn_ia32)
+	HAS_CPU_FEATURE (SSE4_2)
 	jz	2f
-	leal	__strspn_sse42@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4);
-	cfi_restore (ebx)
-	ret
-END(strspn)
-# else
-	.text
-ENTRY(strspn)
-	.type	strspn, @gnu_indirect_function
-	cmpl	$0, KIND_OFFSET+__cpu_features
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__strspn_ia32, %eax
-	testl	$bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features
-	jz	2f
-	leal	__strspn_sse42, %eax
+	LOAD_FUNC_GOT_EAX (__strspn_sse42)
 2:	ret
 END(strspn)
-# endif
 
 # undef ENTRY
 # define ENTRY(name) \
diff --git a/sysdeps/i386/i686/multiarch/wcschr.S b/sysdeps/i386/i686/multiarch/wcschr.S
index 603d7d7..5918b12 100644
--- a/sysdeps/i386/i686/multiarch/wcschr.S
+++ b/sysdeps/i386/i686/multiarch/wcschr.S
@@ -25,21 +25,12 @@
 	.text
 ENTRY(__wcschr)
 	.type	wcschr, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	LOAD_PIC_REG(bx)
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__wcschr_ia32@GOTOFF(%ebx), %eax
-	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__wcschr_ia32)
+	HAS_CPU_FEATURE (SSE2)
 	jz	2f
-	leal	__wcschr_sse2@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4);
-	cfi_restore (ebx)
-	ret
+	LOAD_FUNC_GOT_EAX (__wcschr_sse2)
+2:	ret
 END(__wcschr)
 weak_alias (__wcschr, wcschr)
 #endif
diff --git a/sysdeps/i386/i686/multiarch/wcscmp.S b/sysdeps/i386/i686/multiarch/wcscmp.S
index 43fe0a3..733380c 100644
--- a/sysdeps/i386/i686/multiarch/wcscmp.S
+++ b/sysdeps/i386/i686/multiarch/wcscmp.S
@@ -28,20 +28,11 @@
 	.text
 ENTRY(wcscmp)
 	.type	wcscmp, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	LOAD_PIC_REG(bx)
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__wcscmp_ia32@GOTOFF(%ebx), %eax
-	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__wcscmp_ia32)
+	HAS_CPU_FEATURE (SSE2)
 	jz	2f
-	leal	__wcscmp_sse2@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4);
-	cfi_restore (ebx)
-	ret
+	LOAD_FUNC_GOT_EAX (__wcscmp_sse2)
+2:	ret
 END(wcscmp)
 #endif
diff --git a/sysdeps/i386/i686/multiarch/wcscpy.S b/sysdeps/i386/i686/multiarch/wcscpy.S
index f7253c7..5f14970 100644
--- a/sysdeps/i386/i686/multiarch/wcscpy.S
+++ b/sysdeps/i386/i686/multiarch/wcscpy.S
@@ -26,20 +26,11 @@
 	.text
 ENTRY(wcscpy)
 	.type	wcscpy, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	LOAD_PIC_REG(bx)
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__wcscpy_ia32@GOTOFF(%ebx), %eax
-	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__wcscpy_ia32)
+	HAS_CPU_FEATURE (SSSE3)
 	jz	2f
-	leal	__wcscpy_ssse3@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4)
-	cfi_restore (ebx)
-	ret
+	LOAD_FUNC_GOT_EAX (__wcscpy_ssse3)
+2:	ret
 END(wcscpy)
 #endif
diff --git a/sysdeps/i386/i686/multiarch/wcslen.S b/sysdeps/i386/i686/multiarch/wcslen.S
index 3926a50..7740404 100644
--- a/sysdeps/i386/i686/multiarch/wcslen.S
+++ b/sysdeps/i386/i686/multiarch/wcslen.S
@@ -25,21 +25,12 @@
 	.text
 ENTRY(__wcslen)
 	.type	__wcslen, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	LOAD_PIC_REG(bx)
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__wcslen_ia32@GOTOFF(%ebx), %eax
-	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__wcslen_ia32)
+	HAS_CPU_FEATURE (SSE2)
 	jz	2f
-	leal	__wcslen_sse2@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4);
-	cfi_restore (ebx)
-	ret
+	LOAD_FUNC_GOT_EAX (__wcslen_sse2)
+2:	ret
 END(__wcslen)
 
 weak_alias(__wcslen, wcslen)
diff --git a/sysdeps/i386/i686/multiarch/wcsrchr.S b/sysdeps/i386/i686/multiarch/wcsrchr.S
index 5c96129..9ed6810 100644
--- a/sysdeps/i386/i686/multiarch/wcsrchr.S
+++ b/sysdeps/i386/i686/multiarch/wcsrchr.S
@@ -25,20 +25,11 @@
 	.text
 ENTRY(wcsrchr)
 	.type	wcsrchr, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	LOAD_PIC_REG(bx)
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__wcsrchr_ia32@GOTOFF(%ebx), %eax
-	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__wcsrchr_ia32)
+	HAS_CPU_FEATURE (SSE2)
 	jz	2f
-	leal	__wcsrchr_sse2@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4);
-	cfi_restore (ebx)
-	ret
+	LOAD_FUNC_GOT_EAX (__wcsrchr_sse2)
+2:	ret
 END(wcsrchr)
 #endif
diff --git a/sysdeps/i386/i686/multiarch/wmemcmp.S b/sysdeps/i386/i686/multiarch/wmemcmp.S
index 6ca6053..6025942 100644
--- a/sysdeps/i386/i686/multiarch/wmemcmp.S
+++ b/sysdeps/i386/i686/multiarch/wmemcmp.S
@@ -27,23 +27,14 @@
 	.text
 ENTRY(wmemcmp)
 	.type	wmemcmp, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	LOAD_PIC_REG(bx)
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__wmemcmp_ia32@GOTOFF(%ebx), %eax
-	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__wmemcmp_ia32)
+	HAS_CPU_FEATURE (SSSE3)
 	jz	2f
-	leal	__wmemcmp_ssse3@GOTOFF(%ebx), %eax
-	testl	$bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features@GOTOFF(%ebx)
+	LOAD_FUNC_GOT_EAX (__wmemcmp_ssse3)
+	HAS_CPU_FEATURE (SSE4_2)
 	jz	2f
-	leal	__wmemcmp_sse4_2@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4)
-	cfi_restore (ebx)
-	ret
+	LOAD_FUNC_GOT_EAX (__wmemcmp_sse4_2)
+2:	ret
 END(wmemcmp)
 #endif

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=0fb36bf746cd24c110898a39881c88a7398e4e59

commit 0fb36bf746cd24c110898a39881c88a7398e4e59
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Fri Jul 31 13:41:04 2015 -0700

    Update x86_64 multiarch functions for <cpu-features.h>
    
    This patch updates the x86_64 multiarch functions to use the newly
    defined HAS_CPU_FEATURE, HAS_ARCH_FEATURE, and LOAD_RTLD_GLOBAL_RO_RDX
    macros from <cpu-features.h>.  A short sketch of the resulting ifunc
    idiom follows the ChangeLog entries below.
    
    	* sysdeps/x86_64/fpu/multiarch/e_asin.c: Replace HAS_XXX with
    	HAS_CPU_FEATURE/HAS_ARCH_FEATURE (XXX).
    	* sysdeps/x86_64/fpu/multiarch/e_atan2.c: Likewise.
    	* sysdeps/x86_64/fpu/multiarch/e_exp.c: Likewise.
    	* sysdeps/x86_64/fpu/multiarch/e_log.c: Likewise.
    	* sysdeps/x86_64/fpu/multiarch/e_pow.c: Likewise.
    	* sysdeps/x86_64/fpu/multiarch/s_atan.c: Likewise.
    	* sysdeps/x86_64/fpu/multiarch/s_fma.c: Likewise.
    	* sysdeps/x86_64/fpu/multiarch/s_fmaf.c: Likewise.
    	* sysdeps/x86_64/fpu/multiarch/s_sin.c: Likewise.
    	* sysdeps/x86_64/fpu/multiarch/s_tan.c: Likewise.
    	* sysdeps/x86_64/fpu/multiarch/s_ceil.S: Use
    	LOAD_RTLD_GLOBAL_RO_RDX and HAS_CPU_FEATURE (SSE4_1).
    	* sysdeps/x86_64/fpu/multiarch/s_ceilf.S: Likewise.
    	* sysdeps/x86_64/fpu/multiarch/s_floor.S: Likewise.
    	* sysdeps/x86_64/fpu/multiarch/s_floorf.S: Likewise.
    	* sysdeps/x86_64/fpu/multiarch/s_nearbyint.S: Likewise.
    	* sysdeps/x86_64/fpu/multiarch/s_nearbyintf.S: Likewise.
    	* sysdeps/x86_64/fpu/multiarch/s_rint.S: Likewise.
    	* sysdeps/x86_64/fpu/multiarch/s_rintf.S: Likewise.
    	* sysdeps/x86_64/multiarch/ifunc-impl-list.c: Likewise.
    	* sysdeps/x86_64/multiarch/sched_cpucount.c: Likewise.
    	* sysdeps/x86_64/multiarch/strstr.c: Likewise.
    	* sysdeps/x86_64/multiarch/memmove.c: Likewise.
    	* sysdeps/x86_64/multiarch/memmove_chk.c: Likewise.
    	* sysdeps/x86_64/multiarch/test-multiarch.c: Likewise.
    	* sysdeps/x86_64/multiarch/memcmp.S: Remove __init_cpu_features
    	call.  Add LOAD_RTLD_GLOBAL_RO_RDX.  Replace HAS_XXX with
    	HAS_CPU_FEATURE/HAS_ARCH_FEATURE (XXX).
    	* sysdeps/x86_64/multiarch/memcpy.S: Likewise.
    	* sysdeps/x86_64/multiarch/memcpy_chk.S: Likewise.
    	* sysdeps/x86_64/multiarch/mempcpy.S: Likewise.
    	* sysdeps/x86_64/multiarch/mempcpy_chk.S: Likewise.
    	* sysdeps/x86_64/multiarch/memset.S: Likewise.
    	* sysdeps/x86_64/multiarch/memset_chk.S: Likewise.
    	* sysdeps/x86_64/multiarch/strcat.S: Likewise.
    	* sysdeps/x86_64/multiarch/strchr.S: Likewise.
    	* sysdeps/x86_64/multiarch/strcmp.S: Likewise.
    	* sysdeps/x86_64/multiarch/strcpy.S: Likewise.
    	* sysdeps/x86_64/multiarch/strcspn.S: Likewise.
    	* sysdeps/x86_64/multiarch/strspn.S: Likewise.
    	* sysdeps/x86_64/multiarch/wcscpy.S: Likewise.
    	* sysdeps/x86_64/multiarch/wmemcmp.S: Likewise.
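
A short sketch of the idiom the entries above describe, paraphrased from
the memmove.c and e_pow.c hunks of this commit.  It is not an extra file
in the patch, and it assumes glibc's internal libc_ifunc macro and
<cpu-features.h>, so it builds only inside the glibc tree:

/* The ifunc resolver picks an implementation by consulting the new
   macros: HAS_CPU_FEATURE tests a CPUID-reported bit, while
   HAS_ARCH_FEATURE tests a derived/tuning bit computed during CPU
   feature initialization.  */
extern __typeof (memmove) __memmove_sse2 attribute_hidden;
extern __typeof (memmove) __memmove_ssse3 attribute_hidden;
extern __typeof (memmove) __memmove_ssse3_back attribute_hidden;
extern __typeof (memmove) __memmove_avx_unaligned attribute_hidden;

libc_ifunc (memmove,
	    HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
	    ? __memmove_avx_unaligned
	    : (HAS_CPU_FEATURE (SSSE3)
	       ? (HAS_ARCH_FEATURE (Fast_Copy_Backward)
		  ? __memmove_ssse3_back : __memmove_ssse3)
	       : __memmove_sse2));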

diff --git a/sysdeps/x86_64/fpu/multiarch/e_asin.c b/sysdeps/x86_64/fpu/multiarch/e_asin.c
index e742a9c..5a9091b 100644
--- a/sysdeps/x86_64/fpu/multiarch/e_asin.c
+++ b/sysdeps/x86_64/fpu/multiarch/e_asin.c
@@ -8,11 +8,15 @@ extern double __ieee754_acos_fma4 (double);
 extern double __ieee754_asin_fma4 (double);
 
 libm_ifunc (__ieee754_acos,
-	    HAS_FMA4 ? __ieee754_acos_fma4 : __ieee754_acos_sse2);
+	    HAS_ARCH_FEATURE (FMA4_Usable)
+	    ? __ieee754_acos_fma4
+	    : __ieee754_acos_sse2);
 strong_alias (__ieee754_acos, __acos_finite)
 
 libm_ifunc (__ieee754_asin,
-	    HAS_FMA4 ? __ieee754_asin_fma4 : __ieee754_asin_sse2);
+	    HAS_ARCH_FEATURE (FMA4_Usable)
+	    ? __ieee754_asin_fma4
+	    : __ieee754_asin_sse2);
 strong_alias (__ieee754_asin, __asin_finite)
 
 # define __ieee754_acos __ieee754_acos_sse2
diff --git a/sysdeps/x86_64/fpu/multiarch/e_atan2.c b/sysdeps/x86_64/fpu/multiarch/e_atan2.c
index 6867c6e..a865ed6 100644
--- a/sysdeps/x86_64/fpu/multiarch/e_atan2.c
+++ b/sysdeps/x86_64/fpu/multiarch/e_atan2.c
@@ -7,14 +7,15 @@ extern double __ieee754_atan2_avx (double, double);
 # ifdef HAVE_FMA4_SUPPORT
 extern double __ieee754_atan2_fma4 (double, double);
 # else
-#  undef HAS_FMA4
-#  define HAS_FMA4 0
+#  undef HAS_ARCH_FEATURE
+#  define HAS_ARCH_FEATURE(feature) 0
 #  define __ieee754_atan2_fma4 ((void *) 0)
 # endif
 
 libm_ifunc (__ieee754_atan2,
-	    HAS_FMA4 ? __ieee754_atan2_fma4
-	    : (HAS_AVX ? __ieee754_atan2_avx : __ieee754_atan2_sse2));
+	    HAS_ARCH_FEATURE (FMA4_Usable) ? __ieee754_atan2_fma4
+	    : (HAS_ARCH_FEATURE (AVX_Usable)
+	       ? __ieee754_atan2_avx : __ieee754_atan2_sse2));
 strong_alias (__ieee754_atan2, __atan2_finite)
 
 # define __ieee754_atan2 __ieee754_atan2_sse2
diff --git a/sysdeps/x86_64/fpu/multiarch/e_exp.c b/sysdeps/x86_64/fpu/multiarch/e_exp.c
index d244954..9c124ca 100644
--- a/sysdeps/x86_64/fpu/multiarch/e_exp.c
+++ b/sysdeps/x86_64/fpu/multiarch/e_exp.c
@@ -8,14 +8,15 @@ extern double __ieee754_exp_avx (double);
 # ifdef HAVE_FMA4_SUPPORT
 extern double __ieee754_exp_fma4 (double);
 # else
-#  undef HAS_FMA4
-#  define HAS_FMA4 0
+#  undef HAS_ARCH_FEATURE
+#  define HAS_ARCH_FEATURE(feature) 0
 #  define __ieee754_exp_fma4 ((void *) 0)
 # endif
 
 libm_ifunc (__ieee754_exp,
-	    HAS_FMA4 ? __ieee754_exp_fma4
-	    : (HAS_AVX ? __ieee754_exp_avx : __ieee754_exp_sse2));
+	    HAS_ARCH_FEATURE (FMA4_Usable) ? __ieee754_exp_fma4
+	    : (HAS_ARCH_FEATURE (AVX_Usable)
+	       ? __ieee754_exp_avx : __ieee754_exp_sse2));
 strong_alias (__ieee754_exp, __exp_finite)
 
 # define __ieee754_exp __ieee754_exp_sse2
diff --git a/sysdeps/x86_64/fpu/multiarch/e_log.c b/sysdeps/x86_64/fpu/multiarch/e_log.c
index 05f3668..bf932c6 100644
--- a/sysdeps/x86_64/fpu/multiarch/e_log.c
+++ b/sysdeps/x86_64/fpu/multiarch/e_log.c
@@ -7,14 +7,15 @@ extern double __ieee754_log_avx (double);
 # ifdef HAVE_FMA4_SUPPORT
 extern double __ieee754_log_fma4 (double);
 # else
-#  undef HAS_FMA4
-#  define HAS_FMA4 0
+#  undef HAS_ARCH_FEATURE
+#  define HAS_ARCH_FEATURE(feature) 0
 #  define __ieee754_log_fma4 ((void *) 0)
 # endif
 
 libm_ifunc (__ieee754_log,
-	    HAS_FMA4 ? __ieee754_log_fma4
-	    : (HAS_AVX ? __ieee754_log_avx : __ieee754_log_sse2));
+	    HAS_ARCH_FEATURE (FMA4_Usable) ? __ieee754_log_fma4
+	    : (HAS_ARCH_FEATURE (AVX_Usable)
+	       ? __ieee754_log_avx : __ieee754_log_sse2));
 strong_alias (__ieee754_log, __log_finite)
 
 # define __ieee754_log __ieee754_log_sse2
diff --git a/sysdeps/x86_64/fpu/multiarch/e_pow.c b/sysdeps/x86_64/fpu/multiarch/e_pow.c
index 433cce0..6d422d6 100644
--- a/sysdeps/x86_64/fpu/multiarch/e_pow.c
+++ b/sysdeps/x86_64/fpu/multiarch/e_pow.c
@@ -6,7 +6,10 @@
 extern double __ieee754_pow_sse2 (double, double);
 extern double __ieee754_pow_fma4 (double, double);
 
-libm_ifunc (__ieee754_pow, HAS_FMA4 ? __ieee754_pow_fma4 : __ieee754_pow_sse2);
+libm_ifunc (__ieee754_pow,
+	    HAS_ARCH_FEATURE (FMA4_Usable)
+	    ? __ieee754_pow_fma4
+	    : __ieee754_pow_sse2);
 strong_alias (__ieee754_pow, __pow_finite)
 
 # define __ieee754_pow __ieee754_pow_sse2
diff --git a/sysdeps/x86_64/fpu/multiarch/s_atan.c b/sysdeps/x86_64/fpu/multiarch/s_atan.c
index ae16d7c..57b5c65 100644
--- a/sysdeps/x86_64/fpu/multiarch/s_atan.c
+++ b/sysdeps/x86_64/fpu/multiarch/s_atan.c
@@ -7,13 +7,14 @@ extern double __atan_avx (double);
 # ifdef HAVE_FMA4_SUPPORT
 extern double __atan_fma4 (double);
 # else
-#  undef HAS_FMA4
-#  define HAS_FMA4 0
+#  undef HAS_ARCH_FEATURE
+#  define HAS_ARCH_FEATURE(feature) 0
 #  define __atan_fma4 ((void *) 0)
 # endif
 
-libm_ifunc (atan, (HAS_FMA4 ? __atan_fma4 :
-		   HAS_AVX ? __atan_avx : __atan_sse2));
+libm_ifunc (atan, (HAS_ARCH_FEATURE (FMA4_Usable) ? __atan_fma4 :
+		   HAS_ARCH_FEATURE (AVX_Usable)
+		   ? __atan_avx : __atan_sse2));
 
 # define atan __atan_sse2
 #endif
diff --git a/sysdeps/x86_64/fpu/multiarch/s_ceil.S b/sysdeps/x86_64/fpu/multiarch/s_ceil.S
index 00ecede..c1b9026 100644
--- a/sysdeps/x86_64/fpu/multiarch/s_ceil.S
+++ b/sysdeps/x86_64/fpu/multiarch/s_ceil.S
@@ -22,10 +22,9 @@
 
 ENTRY(__ceil)
 	.type	__ceil, @gnu_indirect_function
-	call	__get_cpu_features@plt
-	movq	%rax, %rdx
+	LOAD_RTLD_GLOBAL_RO_RDX
 	leaq	__ceil_sse41(%rip), %rax
-	testl	$bit_SSE4_1, CPUID_OFFSET+index_SSE4_1(%rdx)
+	HAS_CPU_FEATURE (SSE4_1)
 	jnz	2f
 	leaq	__ceil_c(%rip), %rax
 2:	ret
diff --git a/sysdeps/x86_64/fpu/multiarch/s_ceilf.S b/sysdeps/x86_64/fpu/multiarch/s_ceilf.S
index c8ed705..7809e03 100644
--- a/sysdeps/x86_64/fpu/multiarch/s_ceilf.S
+++ b/sysdeps/x86_64/fpu/multiarch/s_ceilf.S
@@ -22,10 +22,9 @@
 
 ENTRY(__ceilf)
 	.type	__ceilf, @gnu_indirect_function
-	call	__get_cpu_features@plt
-	movq	%rax, %rdx
+	LOAD_RTLD_GLOBAL_RO_RDX
 	leaq	__ceilf_sse41(%rip), %rax
-	testl	$bit_SSE4_1, CPUID_OFFSET+index_SSE4_1(%rdx)
+	HAS_CPU_FEATURE (SSE4_1)
 	jnz	2f
 	leaq	__ceilf_c(%rip), %rax
 2:	ret
diff --git a/sysdeps/x86_64/fpu/multiarch/s_floor.S b/sysdeps/x86_64/fpu/multiarch/s_floor.S
index 952ffaa..fa3f98e 100644
--- a/sysdeps/x86_64/fpu/multiarch/s_floor.S
+++ b/sysdeps/x86_64/fpu/multiarch/s_floor.S
@@ -22,10 +22,9 @@
 
 ENTRY(__floor)
 	.type	__floor, @gnu_indirect_function
-	call	__get_cpu_features@plt
-	movq	%rax, %rdx
+	LOAD_RTLD_GLOBAL_RO_RDX
 	leaq	__floor_sse41(%rip), %rax
-	testl	$bit_SSE4_1, CPUID_OFFSET+index_SSE4_1(%rdx)
+	HAS_CPU_FEATURE (SSE4_1)
 	jnz	2f
 	leaq	__floor_c(%rip), %rax
 2:	ret
diff --git a/sysdeps/x86_64/fpu/multiarch/s_floorf.S b/sysdeps/x86_64/fpu/multiarch/s_floorf.S
index c8231e8..f60f662 100644
--- a/sysdeps/x86_64/fpu/multiarch/s_floorf.S
+++ b/sysdeps/x86_64/fpu/multiarch/s_floorf.S
@@ -22,10 +22,10 @@
 
 ENTRY(__floorf)
 	.type	__floorf, @gnu_indirect_function
-	call	__get_cpu_features@plt
+	LOAD_RTLD_GLOBAL_RO_RDX
 	movq	%rax, %rdx
 	leaq	__floorf_sse41(%rip), %rax
-	testl	$bit_SSE4_1, CPUID_OFFSET+index_SSE4_1(%rdx)
+	HAS_CPU_FEATURE (SSE4_1)
 	jnz	2f
 	leaq	__floorf_c(%rip), %rax
 2:	ret
diff --git a/sysdeps/x86_64/fpu/multiarch/s_fma.c b/sysdeps/x86_64/fpu/multiarch/s_fma.c
index 0963a0b..78e7732 100644
--- a/sysdeps/x86_64/fpu/multiarch/s_fma.c
+++ b/sysdeps/x86_64/fpu/multiarch/s_fma.c
@@ -42,14 +42,15 @@ __fma_fma4 (double x, double y, double z)
   return x;
 }
 # else
-#  undef HAS_FMA4
-#  define HAS_FMA4 0
+#  undef HAS_ARCH_FEATURE
+#  define HAS_ARCH_FEATURE(feature) 0
 #  define __fma_fma4 ((void *) 0)
 # endif
 
 
-libm_ifunc (__fma, HAS_FMA
-	    ? __fma_fma3 : (HAS_FMA4 ? __fma_fma4 : __fma_sse2));
+libm_ifunc (__fma, HAS_ARCH_FEATURE (FMA_Usable)
+	    ? __fma_fma3 : (HAS_ARCH_FEATURE (FMA4_Usable)
+			    ? __fma_fma4 : __fma_sse2));
 weak_alias (__fma, fma)
 
 # define __fma __fma_sse2
diff --git a/sysdeps/x86_64/fpu/multiarch/s_fmaf.c b/sysdeps/x86_64/fpu/multiarch/s_fmaf.c
index 6046961..bebd3ee 100644
--- a/sysdeps/x86_64/fpu/multiarch/s_fmaf.c
+++ b/sysdeps/x86_64/fpu/multiarch/s_fmaf.c
@@ -41,14 +41,15 @@ __fmaf_fma4 (float x, float y, float z)
   return x;
 }
 # else
-#  undef HAS_FMA4
-#  define HAS_FMA4 0
+#  undef HAS_ARCH_FEATURE
+#  define HAS_ARCH_FEATURE(feature) 0
 #  define __fmaf_fma4 ((void *) 0)
 # endif
 
 
-libm_ifunc (__fmaf, HAS_FMA
-	    ? __fmaf_fma3 : (HAS_FMA4 ? __fmaf_fma4 : __fmaf_sse2));
+libm_ifunc (__fmaf, HAS_ARCH_FEATURE (FMA_Usable)
+	    ? __fmaf_fma3 : (HAS_ARCH_FEATURE (FMA4_Usable)
+			     ? __fmaf_fma4 : __fmaf_sse2));
 weak_alias (__fmaf, fmaf)
 
 # define __fmaf __fmaf_sse2
diff --git a/sysdeps/x86_64/fpu/multiarch/s_nearbyint.S b/sysdeps/x86_64/fpu/multiarch/s_nearbyint.S
index b5d32b5..109395c 100644
--- a/sysdeps/x86_64/fpu/multiarch/s_nearbyint.S
+++ b/sysdeps/x86_64/fpu/multiarch/s_nearbyint.S
@@ -22,10 +22,10 @@
 
 ENTRY(__nearbyint)
 	.type	__nearbyint, @gnu_indirect_function
-	call	__get_cpu_features@plt
+	LOAD_RTLD_GLOBAL_RO_RDX
 	movq	%rax, %rdx
 	leaq	__nearbyint_sse41(%rip), %rax
-	testl	$bit_SSE4_1, CPUID_OFFSET+index_SSE4_1(%rdx)
+	HAS_CPU_FEATURE (SSE4_1)
 	jnz	2f
 	leaq	__nearbyint_c(%rip), %rax
 2:	ret
diff --git a/sysdeps/x86_64/fpu/multiarch/s_nearbyintf.S b/sysdeps/x86_64/fpu/multiarch/s_nearbyintf.S
index cd7e177..b870c0c 100644
--- a/sysdeps/x86_64/fpu/multiarch/s_nearbyintf.S
+++ b/sysdeps/x86_64/fpu/multiarch/s_nearbyintf.S
@@ -22,10 +22,9 @@
 
 ENTRY(__nearbyintf)
 	.type	__nearbyintf, @gnu_indirect_function
-	call	__get_cpu_features@plt
-	movq	%rax, %rdx
+	LOAD_RTLD_GLOBAL_RO_RDX
 	leaq	__nearbyintf_sse41(%rip), %rax
-	testl	$bit_SSE4_1, CPUID_OFFSET+index_SSE4_1(%rdx)
+	HAS_CPU_FEATURE (SSE4_1)
 	jnz	2f
 	leaq	__nearbyintf_c(%rip), %rax
 2:	ret
diff --git a/sysdeps/x86_64/fpu/multiarch/s_rint.S b/sysdeps/x86_64/fpu/multiarch/s_rint.S
index f52cef6..b238d49 100644
--- a/sysdeps/x86_64/fpu/multiarch/s_rint.S
+++ b/sysdeps/x86_64/fpu/multiarch/s_rint.S
@@ -22,10 +22,9 @@
 
 ENTRY(__rint)
 	.type	__rint, @gnu_indirect_function
-	call	__get_cpu_features@plt
-	movq	%rax, %rdx
+	LOAD_RTLD_GLOBAL_RO_RDX
 	leaq	__rint_sse41(%rip), %rax
-	testl	$bit_SSE4_1, CPUID_OFFSET+index_SSE4_1(%rdx)
+	HAS_CPU_FEATURE (SSE4_1)
 	jnz	2f
 	leaq	__rint_c(%rip), %rax
 2:	ret
diff --git a/sysdeps/x86_64/fpu/multiarch/s_rintf.S b/sysdeps/x86_64/fpu/multiarch/s_rintf.S
index e2608d4..8869196 100644
--- a/sysdeps/x86_64/fpu/multiarch/s_rintf.S
+++ b/sysdeps/x86_64/fpu/multiarch/s_rintf.S
@@ -22,10 +22,9 @@
 
 ENTRY(__rintf)
 	.type	__rintf, @gnu_indirect_function
-	call	__get_cpu_features@plt
-	movq	%rax, %rdx
+	LOAD_RTLD_GLOBAL_RO_RDX
 	leaq	__rintf_sse41(%rip), %rax
-	testl	$bit_SSE4_1, CPUID_OFFSET+index_SSE4_1(%rdx)
+	HAS_CPU_FEATURE (SSE4_1)
 	jnz	2f
 	leaq	__rintf_c(%rip), %rax
 2:	ret
diff --git a/sysdeps/x86_64/fpu/multiarch/s_sin.c b/sysdeps/x86_64/fpu/multiarch/s_sin.c
index a0c2521..3bc7330 100644
--- a/sysdeps/x86_64/fpu/multiarch/s_sin.c
+++ b/sysdeps/x86_64/fpu/multiarch/s_sin.c
@@ -11,18 +11,20 @@ extern double __sin_avx (double);
 extern double __cos_fma4 (double);
 extern double __sin_fma4 (double);
 # else
-#  undef HAS_FMA4
-#  define HAS_FMA4 0
+#  undef HAS_ARCH_FEATURE
+#  define HAS_ARCH_FEATURE(feature) 0
 #  define __cos_fma4 ((void *) 0)
 #  define __sin_fma4 ((void *) 0)
 # endif
 
-libm_ifunc (__cos, (HAS_FMA4 ? __cos_fma4 :
-		    HAS_AVX ? __cos_avx : __cos_sse2));
+libm_ifunc (__cos, (HAS_ARCH_FEATURE (FMA4_Usable) ? __cos_fma4 :
+		    HAS_ARCH_FEATURE (AVX_Usable)
+		    ? __cos_avx : __cos_sse2));
 weak_alias (__cos, cos)
 
-libm_ifunc (__sin, (HAS_FMA4 ? __sin_fma4 :
-		    HAS_AVX ? __sin_avx : __sin_sse2));
+libm_ifunc (__sin, (HAS_ARCH_FEATURE (FMA4_Usable) ? __sin_fma4 :
+		    HAS_ARCH_FEATURE (AVX_Usable)
+		    ? __sin_avx : __sin_sse2));
 weak_alias (__sin, sin)
 
 # define __cos __cos_sse2
diff --git a/sysdeps/x86_64/fpu/multiarch/s_tan.c b/sysdeps/x86_64/fpu/multiarch/s_tan.c
index 904308f..d99d9db 100644
--- a/sysdeps/x86_64/fpu/multiarch/s_tan.c
+++ b/sysdeps/x86_64/fpu/multiarch/s_tan.c
@@ -7,13 +7,14 @@ extern double __tan_avx (double);
 # ifdef HAVE_FMA4_SUPPORT
 extern double __tan_fma4 (double);
 # else
-#  undef HAS_FMA4
-#  define HAS_FMA4 0
+#  undef HAS_ARCH_FEATURE
+#  define HAS_ARCH_FEATURE(feature) 0
 #  define __tan_fma4 ((void *) 0)
 # endif
 
-libm_ifunc (tan, (HAS_FMA4 ? __tan_fma4 :
-		  HAS_AVX ? __tan_avx : __tan_sse2));
+libm_ifunc (tan, (HAS_ARCH_FEATURE (FMA4_Usable) ? __tan_fma4 :
+		  HAS_ARCH_FEATURE (AVX_Usable)
+		  ? __tan_avx : __tan_sse2));
 
 # define tan __tan_sse2
 #endif
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index b64e4f1..f5a576c 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -39,48 +39,57 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/memcmp.S.  */
   IFUNC_IMPL (i, name, memcmp,
-	      IFUNC_IMPL_ADD (array, i, memcmp, HAS_SSE4_1,
+	      IFUNC_IMPL_ADD (array, i, memcmp, HAS_CPU_FEATURE (SSE4_1),
 			      __memcmp_sse4_1)
-	      IFUNC_IMPL_ADD (array, i, memcmp, HAS_SSSE3, __memcmp_ssse3)
+	      IFUNC_IMPL_ADD (array, i, memcmp, HAS_CPU_FEATURE (SSSE3),
+			      __memcmp_ssse3)
 	      IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2))
 
   /* Support sysdeps/x86_64/multiarch/memmove_chk.S.  */
   IFUNC_IMPL (i, name, __memmove_chk,
-	      IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_AVX,
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+			      HAS_ARCH_FEATURE (AVX_Usable),
 			      __memmove_chk_avx_unaligned)
-	      IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_SSSE3,
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+			      HAS_CPU_FEATURE (SSSE3),
 			      __memmove_chk_ssse3_back)
-	      IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_SSSE3,
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+			      HAS_CPU_FEATURE (SSSE3),
 			      __memmove_chk_ssse3)
 	      IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
 			      __memmove_chk_sse2))
 
   /* Support sysdeps/x86_64/multiarch/memmove.S.  */
   IFUNC_IMPL (i, name, memmove,
-	      IFUNC_IMPL_ADD (array, i, memmove, HAS_AVX,
+	      IFUNC_IMPL_ADD (array, i, memmove,
+			      HAS_ARCH_FEATURE (AVX_Usable),
 			      __memmove_avx_unaligned)
-	      IFUNC_IMPL_ADD (array, i, memmove, HAS_SSSE3,
+	      IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3),
 			      __memmove_ssse3_back)
-	      IFUNC_IMPL_ADD (array, i, memmove, HAS_SSSE3,
+	      IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3),
 			      __memmove_ssse3)
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_sse2))
 
 #ifdef HAVE_AVX2_SUPPORT
   /* Support sysdeps/x86_64/multiarch/memset_chk.S.  */
   IFUNC_IMPL (i, name, __memset_chk,
-	      IFUNC_IMPL_ADD (array, i, __memset_chk, 1, __memset_chk_sse2)
-	      IFUNC_IMPL_ADD (array, i, __memset_chk, HAS_AVX2,
+	      IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
+			      __memset_chk_sse2)
+	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+			      HAS_ARCH_FEATURE (AVX2_Usable),
 			      __memset_chk_avx2))
 
   /* Support sysdeps/x86_64/multiarch/memset.S.  */
   IFUNC_IMPL (i, name, memset,
 	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_sse2)
-	      IFUNC_IMPL_ADD (array, i, memset, HAS_AVX2, __memset_avx2))
+	      IFUNC_IMPL_ADD (array, i, memset,
+			      HAS_ARCH_FEATURE (AVX2_Usable),
+			      __memset_avx2))
 #endif
 
   /* Support sysdeps/x86_64/multiarch/stpncpy.S.  */
   IFUNC_IMPL (i, name, stpncpy,
-	      IFUNC_IMPL_ADD (array, i, stpncpy, HAS_SSSE3,
+	      IFUNC_IMPL_ADD (array, i, stpncpy, HAS_CPU_FEATURE (SSSE3),
 			      __stpncpy_ssse3)
 	      IFUNC_IMPL_ADD (array, i, stpncpy, 1,
 			      __stpncpy_sse2_unaligned)
@@ -88,27 +97,34 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/stpcpy.S.  */
   IFUNC_IMPL (i, name, stpcpy,
-	      IFUNC_IMPL_ADD (array, i, stpcpy, HAS_SSSE3, __stpcpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, stpcpy, HAS_CPU_FEATURE (SSSE3),
+			      __stpcpy_ssse3)
 	      IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2))
 
   /* Support sysdeps/x86_64/multiarch/strcasecmp_l.S.  */
   IFUNC_IMPL (i, name, strcasecmp,
-	      IFUNC_IMPL_ADD (array, i, strcasecmp, HAS_AVX,
+	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+			      HAS_ARCH_FEATURE (AVX_Usable),
 			      __strcasecmp_avx)
-	      IFUNC_IMPL_ADD (array, i, strcasecmp, HAS_SSE4_2,
+	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+			      HAS_CPU_FEATURE (SSE4_2),
 			      __strcasecmp_sse42)
-	      IFUNC_IMPL_ADD (array, i, strcasecmp, HAS_SSSE3,
+	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+			      HAS_CPU_FEATURE (SSSE3),
 			      __strcasecmp_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strcasecmp, 1, __strcasecmp_sse2))
 
   /* Support sysdeps/x86_64/multiarch/strcasecmp_l.S.  */
   IFUNC_IMPL (i, name, strcasecmp_l,
-	      IFUNC_IMPL_ADD (array, i, strcasecmp_l, HAS_AVX,
+	      IFUNC_IMPL_ADD (array, i, strcasecmp_l,
+			      HAS_ARCH_FEATURE (AVX_Usable),
 			      __strcasecmp_l_avx)
-	      IFUNC_IMPL_ADD (array, i, strcasecmp_l, HAS_SSE4_2,
+	      IFUNC_IMPL_ADD (array, i, strcasecmp_l,
+			      HAS_CPU_FEATURE (SSE4_2),
 			      __strcasecmp_l_sse42)
-	      IFUNC_IMPL_ADD (array, i, strcasecmp_l, HAS_SSSE3,
+	      IFUNC_IMPL_ADD (array, i, strcasecmp_l,
+			      HAS_CPU_FEATURE (SSSE3),
 			      __strcasecmp_l_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strcasecmp_l, 1,
 			      __strcasecmp_l_sse2))
@@ -119,7 +135,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strcat.S.  */
   IFUNC_IMPL (i, name, strcat,
-	      IFUNC_IMPL_ADD (array, i, strcat, HAS_SSSE3, __strcat_ssse3)
+	      IFUNC_IMPL_ADD (array, i, strcat, HAS_CPU_FEATURE (SSSE3),
+			      __strcat_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2))
 
@@ -130,48 +147,57 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strcmp.S.  */
   IFUNC_IMPL (i, name, strcmp,
-	      IFUNC_IMPL_ADD (array, i, strcmp, HAS_SSE4_2, __strcmp_sse42)
-	      IFUNC_IMPL_ADD (array, i, strcmp, HAS_SSSE3, __strcmp_ssse3)
+	      IFUNC_IMPL_ADD (array, i, strcmp, HAS_CPU_FEATURE (SSE4_2),
+			      __strcmp_sse42)
+	      IFUNC_IMPL_ADD (array, i, strcmp, HAS_CPU_FEATURE (SSSE3),
+			      __strcmp_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2))
 
   /* Support sysdeps/x86_64/multiarch/strcpy.S.  */
   IFUNC_IMPL (i, name, strcpy,
-	      IFUNC_IMPL_ADD (array, i, strcpy, HAS_SSSE3, __strcpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, strcpy, HAS_CPU_FEATURE (SSSE3),
+			      __strcpy_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2))
 
   /* Support sysdeps/x86_64/multiarch/strcspn.S.  */
   IFUNC_IMPL (i, name, strcspn,
-	      IFUNC_IMPL_ADD (array, i, strcspn, HAS_SSE4_2,
+	      IFUNC_IMPL_ADD (array, i, strcspn, HAS_CPU_FEATURE (SSE4_2),
 			      __strcspn_sse42)
 	      IFUNC_IMPL_ADD (array, i, strcspn, 1, __strcspn_sse2))
 
   /* Support sysdeps/x86_64/multiarch/strncase_l.S.  */
   IFUNC_IMPL (i, name, strncasecmp,
-	      IFUNC_IMPL_ADD (array, i, strncasecmp, HAS_AVX,
+	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+			      HAS_ARCH_FEATURE (AVX_Usable),
 			      __strncasecmp_avx)
-	      IFUNC_IMPL_ADD (array, i, strncasecmp, HAS_SSE4_2,
+	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+			      HAS_CPU_FEATURE (SSE4_2),
 			      __strncasecmp_sse42)
-	      IFUNC_IMPL_ADD (array, i, strncasecmp, HAS_SSSE3,
+	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+			      HAS_CPU_FEATURE (SSSE3),
 			      __strncasecmp_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strncasecmp, 1,
 			      __strncasecmp_sse2))
 
   /* Support sysdeps/x86_64/multiarch/strncase_l.S.  */
   IFUNC_IMPL (i, name, strncasecmp_l,
-	      IFUNC_IMPL_ADD (array, i, strncasecmp_l, HAS_AVX,
+	      IFUNC_IMPL_ADD (array, i, strncasecmp_l,
+			      HAS_ARCH_FEATURE (AVX_Usable),
 			      __strncasecmp_l_avx)
-	      IFUNC_IMPL_ADD (array, i, strncasecmp_l, HAS_SSE4_2,
+	      IFUNC_IMPL_ADD (array, i, strncasecmp_l,
+			      HAS_CPU_FEATURE (SSE4_2),
 			      __strncasecmp_l_sse42)
-	      IFUNC_IMPL_ADD (array, i, strncasecmp_l, HAS_SSSE3,
+	      IFUNC_IMPL_ADD (array, i, strncasecmp_l,
+			      HAS_CPU_FEATURE (SSSE3),
 			      __strncasecmp_l_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strncasecmp_l, 1,
 			      __strncasecmp_l_sse2))
 
   /* Support sysdeps/x86_64/multiarch/strncat.S.  */
   IFUNC_IMPL (i, name, strncat,
-	      IFUNC_IMPL_ADD (array, i, strncat, HAS_SSSE3,
+	      IFUNC_IMPL_ADD (array, i, strncat, HAS_CPU_FEATURE (SSSE3),
 			      __strncat_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strncat, 1,
 			      __strncat_sse2_unaligned)
@@ -179,7 +205,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strncpy.S.  */
   IFUNC_IMPL (i, name, strncpy,
-	      IFUNC_IMPL_ADD (array, i, strncpy, HAS_SSSE3,
+	      IFUNC_IMPL_ADD (array, i, strncpy, HAS_CPU_FEATURE (SSSE3),
 			      __strncpy_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strncpy, 1,
 			      __strncpy_sse2_unaligned)
@@ -187,14 +213,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strpbrk.S.  */
   IFUNC_IMPL (i, name, strpbrk,
-	      IFUNC_IMPL_ADD (array, i, strpbrk, HAS_SSE4_2,
+	      IFUNC_IMPL_ADD (array, i, strpbrk, HAS_CPU_FEATURE (SSE4_2),
 			      __strpbrk_sse42)
 	      IFUNC_IMPL_ADD (array, i, strpbrk, 1, __strpbrk_sse2))
 
 
   /* Support sysdeps/x86_64/multiarch/strspn.S.  */
   IFUNC_IMPL (i, name, strspn,
-	      IFUNC_IMPL_ADD (array, i, strspn, HAS_SSE4_2, __strspn_sse42)
+	      IFUNC_IMPL_ADD (array, i, strspn, HAS_CPU_FEATURE (SSE4_2),
+			      __strspn_sse42)
 	      IFUNC_IMPL_ADD (array, i, strspn, 1, __strspn_sse2))
 
   /* Support sysdeps/x86_64/multiarch/strstr.c.  */
@@ -204,65 +231,75 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/wcscpy.S.  */
   IFUNC_IMPL (i, name, wcscpy,
-	      IFUNC_IMPL_ADD (array, i, wcscpy, HAS_SSSE3, __wcscpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, wcscpy, HAS_CPU_FEATURE (SSSE3),
+			      __wcscpy_ssse3)
 	      IFUNC_IMPL_ADD (array, i, wcscpy, 1, __wcscpy_sse2))
 
   /* Support sysdeps/x86_64/multiarch/wmemcmp.S.  */
   IFUNC_IMPL (i, name, wmemcmp,
-	      IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_SSE4_1,
+	      IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_CPU_FEATURE (SSE4_1),
 			      __wmemcmp_sse4_1)
-	      IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_SSSE3,
+	      IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_CPU_FEATURE (SSSE3),
 			      __wmemcmp_ssse3)
 	      IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2))
 
 #ifdef SHARED
   /* Support sysdeps/x86_64/multiarch/memcpy_chk.S.  */
   IFUNC_IMPL (i, name, __memcpy_chk,
-	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_AVX,
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+			      HAS_ARCH_FEATURE (AVX_Usable),
 			      __memcpy_chk_avx_unaligned)
-	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_SSSE3,
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+			      HAS_CPU_FEATURE (SSSE3),
 			      __memcpy_chk_ssse3_back)
-	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_SSSE3,
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+			      HAS_CPU_FEATURE (SSSE3),
 			      __memcpy_chk_ssse3)
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
 			      __memcpy_chk_sse2))
 
   /* Support sysdeps/x86_64/multiarch/memcpy.S.  */
   IFUNC_IMPL (i, name, memcpy,
-	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_AVX,
+	      IFUNC_IMPL_ADD (array, i, memcpy,
+			      HAS_ARCH_FEATURE (AVX_Usable),
 			      __memcpy_avx_unaligned)
-	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSSE3,
+	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3),
 			      __memcpy_ssse3_back)
-	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSSE3, __memcpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3),
+			      __memcpy_ssse3)
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2))
 
   /* Support sysdeps/x86_64/multiarch/mempcpy_chk.S.  */
   IFUNC_IMPL (i, name, __mempcpy_chk,
-	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_AVX,
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+			      HAS_ARCH_FEATURE (AVX_Usable),
 			      __mempcpy_chk_avx_unaligned)
-	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_SSSE3,
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+			      HAS_CPU_FEATURE (SSSE3),
 			      __mempcpy_chk_ssse3_back)
-	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_SSSE3,
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+			      HAS_CPU_FEATURE (SSSE3),
 			      __mempcpy_chk_ssse3)
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
 			      __mempcpy_chk_sse2))
 
   /* Support sysdeps/x86_64/multiarch/mempcpy.S.  */
   IFUNC_IMPL (i, name, mempcpy,
-	      IFUNC_IMPL_ADD (array, i, mempcpy, HAS_AVX,
+	      IFUNC_IMPL_ADD (array, i, mempcpy,
+			      HAS_ARCH_FEATURE (AVX_Usable),
 			      __mempcpy_avx_unaligned)
-	      IFUNC_IMPL_ADD (array, i, mempcpy, HAS_SSSE3,
+	      IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
 			      __mempcpy_ssse3_back)
-	      IFUNC_IMPL_ADD (array, i, mempcpy, HAS_SSSE3,
+	      IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
 			      __mempcpy_ssse3)
 	      IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_sse2))
 
   /* Support sysdeps/x86_64/multiarch/strncmp.S.  */
   IFUNC_IMPL (i, name, strncmp,
-	      IFUNC_IMPL_ADD (array, i, strncmp, HAS_SSE4_2,
+	      IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSE4_2),
 			      __strncmp_sse42)
-	      IFUNC_IMPL_ADD (array, i, strncmp, HAS_SSSE3,
+	      IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSSE3),
 			      __strncmp_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_sse2))
 #endif
diff --git a/sysdeps/x86_64/multiarch/memcmp.S b/sysdeps/x86_64/multiarch/memcmp.S
index f8b4636..871a081 100644
--- a/sysdeps/x86_64/multiarch/memcmp.S
+++ b/sysdeps/x86_64/multiarch/memcmp.S
@@ -26,16 +26,13 @@
 	.text
 ENTRY(memcmp)
 	.type	memcmp, @gnu_indirect_function
-	cmpl	$0, KIND_OFFSET+__cpu_features(%rip)
-	jne	1f
-	call	__init_cpu_features
-
-1:	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+	HAS_CPU_FEATURE (SSSE3)
 	jnz	2f
 	leaq	__memcmp_sse2(%rip), %rax
 	ret
 
-2:	testl	$bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+2:	HAS_CPU_FEATURE (SSE4_1)
 	jz	3f
 	leaq	__memcmp_sse4_1(%rip), %rax
 	ret
diff --git a/sysdeps/x86_64/multiarch/memcpy.S b/sysdeps/x86_64/multiarch/memcpy.S
index 4e18cd3..7e119d3 100644
--- a/sysdeps/x86_64/multiarch/memcpy.S
+++ b/sysdeps/x86_64/multiarch/memcpy.S
@@ -29,19 +29,17 @@
 	.text
 ENTRY(__new_memcpy)
 	.type	__new_memcpy, @gnu_indirect_function
-	cmpl	$0, KIND_OFFSET+__cpu_features(%rip)
-	jne	1f
-	call	__init_cpu_features
-1:	leaq	__memcpy_avx_unaligned(%rip), %rax
-	testl	$bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+	leaq	__memcpy_avx_unaligned(%rip), %rax
+	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
 	jz 1f
 	ret
 1:	leaq	__memcpy_sse2(%rip), %rax
-	testl	$bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
+	HAS_ARCH_FEATURE (Slow_BSF)
 	jnz	2f
 	leaq	__memcpy_sse2_unaligned(%rip), %rax
 	ret
-2:	testl   $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+2:	HAS_CPU_FEATURE (SSSE3)
 	jz 3f
 	leaq    __memcpy_ssse3(%rip), %rax
 3:	ret
diff --git a/sysdeps/x86_64/multiarch/memcpy_chk.S b/sysdeps/x86_64/multiarch/memcpy_chk.S
index 1e756ea..81f83dd 100644
--- a/sysdeps/x86_64/multiarch/memcpy_chk.S
+++ b/sysdeps/x86_64/multiarch/memcpy_chk.S
@@ -29,17 +29,15 @@
 	.text
 ENTRY(__memcpy_chk)
 	.type	__memcpy_chk, @gnu_indirect_function
-	cmpl	$0, KIND_OFFSET+__cpu_features(%rip)
-	jne	1f
-	call	__init_cpu_features
-1:	leaq	__memcpy_chk_sse2(%rip), %rax
-	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+	leaq	__memcpy_chk_sse2(%rip), %rax
+	HAS_CPU_FEATURE (SSSE3)
 	jz	2f
 	leaq	__memcpy_chk_ssse3(%rip), %rax
-	testl	$bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
+	HAS_ARCH_FEATURE (Fast_Copy_Backward)
 	jz	2f
 	leaq	__memcpy_chk_ssse3_back(%rip), %rax
-	testl   $bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip)
+	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
 	jz  2f
 	leaq    __memcpy_chk_avx_unaligned(%rip), %rax
 2:	ret
diff --git a/sysdeps/x86_64/multiarch/memmove.c b/sysdeps/x86_64/multiarch/memmove.c
index dd153a3..bbddbc1 100644
--- a/sysdeps/x86_64/multiarch/memmove.c
+++ b/sysdeps/x86_64/multiarch/memmove.c
@@ -49,10 +49,10 @@ extern __typeof (__redirect_memmove) __memmove_avx_unaligned attribute_hidden;
    ifunc symbol properly.  */
 extern __typeof (__redirect_memmove) __libc_memmove;
 libc_ifunc (__libc_memmove,
-	    HAS_AVX_FAST_UNALIGNED_LOAD
+	    HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
 	    ? __memmove_avx_unaligned
-	    : (HAS_SSSE3
-	       ? (HAS_FAST_COPY_BACKWARD
+	    : (HAS_CPU_FEATURE (SSSE3)
+	       ? (HAS_ARCH_FEATURE (Fast_Copy_Backward)
 	          ? __memmove_ssse3_back : __memmove_ssse3)
 	       : __memmove_sse2));
 
diff --git a/sysdeps/x86_64/multiarch/memmove_chk.c b/sysdeps/x86_64/multiarch/memmove_chk.c
index 8b12d00..5f70e3a 100644
--- a/sysdeps/x86_64/multiarch/memmove_chk.c
+++ b/sysdeps/x86_64/multiarch/memmove_chk.c
@@ -30,8 +30,8 @@ extern __typeof (__memmove_chk) __memmove_chk_avx_unaligned attribute_hidden;
 #include "debug/memmove_chk.c"
 
 libc_ifunc (__memmove_chk,
-	    HAS_AVX_FAST_UNALIGNED_LOAD ? __memmove_chk_avx_unaligned :
-	    (HAS_SSSE3
-	    ? (HAS_FAST_COPY_BACKWARD
+	    HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load) ? __memmove_chk_avx_unaligned :
+	    (HAS_CPU_FEATURE (SSSE3)
+	    ? (HAS_ARCH_FEATURE (Fast_Copy_Backward)
 	       ? __memmove_chk_ssse3_back : __memmove_chk_ssse3)
 	    : __memmove_chk_sse2));
diff --git a/sysdeps/x86_64/multiarch/mempcpy.S b/sysdeps/x86_64/multiarch/mempcpy.S
index 2eaacdf..ad36840 100644
--- a/sysdeps/x86_64/multiarch/mempcpy.S
+++ b/sysdeps/x86_64/multiarch/mempcpy.S
@@ -27,17 +27,15 @@
 #if defined SHARED && IS_IN (libc)
 ENTRY(__mempcpy)
 	.type	__mempcpy, @gnu_indirect_function
-	cmpl	$0, KIND_OFFSET+__cpu_features(%rip)
-	jne	1f
-	call	__init_cpu_features
-1:	leaq	__mempcpy_sse2(%rip), %rax
-	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+	leaq	__mempcpy_sse2(%rip), %rax
+	HAS_CPU_FEATURE (SSSE3)
 	jz	2f
 	leaq	__mempcpy_ssse3(%rip), %rax
-	testl	$bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
+	HAS_ARCH_FEATURE (Fast_Copy_Backward)
 	jz	2f
 	leaq	__mempcpy_ssse3_back(%rip), %rax
-	testl	$bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip)
+	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
 	jz	2f
 	leaq	__mempcpy_avx_unaligned(%rip), %rax
 2:	ret
diff --git a/sysdeps/x86_64/multiarch/mempcpy_chk.S b/sysdeps/x86_64/multiarch/mempcpy_chk.S
index 17b8470..0a46b56 100644
--- a/sysdeps/x86_64/multiarch/mempcpy_chk.S
+++ b/sysdeps/x86_64/multiarch/mempcpy_chk.S
@@ -29,17 +29,15 @@
 	.text
 ENTRY(__mempcpy_chk)
 	.type	__mempcpy_chk, @gnu_indirect_function
-	cmpl	$0, KIND_OFFSET+__cpu_features(%rip)
-	jne	1f
-	call	__init_cpu_features
-1:	leaq	__mempcpy_chk_sse2(%rip), %rax
-	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+	leaq	__mempcpy_chk_sse2(%rip), %rax
+	HAS_CPU_FEATURE (SSSE3)
 	jz	2f
 	leaq	__mempcpy_chk_ssse3(%rip), %rax
-	testl	$bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
+	HAS_ARCH_FEATURE (Fast_Copy_Backward)
 	jz	2f
 	leaq	__mempcpy_chk_ssse3_back(%rip), %rax
-	testl	$bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip)
+	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
 	jz	2f
 	leaq	__mempcpy_chk_avx_unaligned(%rip), %rax
 2:	ret
diff --git a/sysdeps/x86_64/multiarch/memset.S b/sysdeps/x86_64/multiarch/memset.S
index c5f1fb3..16fefa7 100644
--- a/sysdeps/x86_64/multiarch/memset.S
+++ b/sysdeps/x86_64/multiarch/memset.S
@@ -26,11 +26,9 @@
 # if IS_IN (libc)
 ENTRY(memset)
 	.type	memset, @gnu_indirect_function
-	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
-	jne	1f
-	call	__init_cpu_features
-1:	leaq	__memset_sse2(%rip), %rax
-	testl	$bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+	leaq	__memset_sse2(%rip), %rax
+	HAS_ARCH_FEATURE (AVX2_Usable)
 	jz	2f
 	leaq	__memset_avx2(%rip), %rax
 2:	ret
diff --git a/sysdeps/x86_64/multiarch/memset_chk.S b/sysdeps/x86_64/multiarch/memset_chk.S
index 64fed31..ef8c64f 100644
--- a/sysdeps/x86_64/multiarch/memset_chk.S
+++ b/sysdeps/x86_64/multiarch/memset_chk.S
@@ -25,11 +25,9 @@
 # if defined SHARED && defined HAVE_AVX2_SUPPORT
 ENTRY(__memset_chk)
 	.type	__memset_chk, @gnu_indirect_function
-	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
-	jne	1f
-	call	__init_cpu_features
-1:	leaq	__memset_chk_sse2(%rip), %rax
-	testl	$bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+	leaq	__memset_chk_sse2(%rip), %rax
+	HAS_ARCH_FEATURE (AVX2_Usable)
 	jz	2f
 	leaq	__memset_chk_avx2(%rip), %rax
 2:	ret
diff --git a/sysdeps/x86_64/multiarch/sched_cpucount.c b/sysdeps/x86_64/multiarch/sched_cpucount.c
index 72ad7b0..e9391a2 100644
--- a/sysdeps/x86_64/multiarch/sched_cpucount.c
+++ b/sysdeps/x86_64/multiarch/sched_cpucount.c
@@ -33,4 +33,4 @@
 #undef __sched_cpucount
 
 libc_ifunc (__sched_cpucount,
-	    HAS_POPCOUNT ? popcount_cpucount : generic_cpucount);
+	    HAS_CPU_FEATURE (POPCOUNT) ? popcount_cpucount : generic_cpucount);
diff --git a/sysdeps/x86_64/multiarch/strcat.S b/sysdeps/x86_64/multiarch/strcat.S
index 44993fa..25d926c 100644
--- a/sysdeps/x86_64/multiarch/strcat.S
+++ b/sysdeps/x86_64/multiarch/strcat.S
@@ -47,14 +47,12 @@
 	.text
 ENTRY(STRCAT)
 	.type	STRCAT, @gnu_indirect_function
-	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
-	jne	1f
-	call	__init_cpu_features
-1:	leaq	STRCAT_SSE2_UNALIGNED(%rip), %rax
-	testl	$bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+	leaq	STRCAT_SSE2_UNALIGNED(%rip), %rax
+	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
 	jnz	2f
 	leaq	STRCAT_SSE2(%rip), %rax
-	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+	HAS_CPU_FEATURE (SSSE3)
 	jz	2f
 	leaq	STRCAT_SSSE3(%rip), %rax
 2:	ret
diff --git a/sysdeps/x86_64/multiarch/strchr.S b/sysdeps/x86_64/multiarch/strchr.S
index af55fac..0c5fdd9 100644
--- a/sysdeps/x86_64/multiarch/strchr.S
+++ b/sysdeps/x86_64/multiarch/strchr.S
@@ -25,11 +25,9 @@
 	.text
 ENTRY(strchr)
 	.type	strchr, @gnu_indirect_function
-	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
-	jne	1f
-	call	__init_cpu_features
-1:	leaq	__strchr_sse2(%rip), %rax
-2:	testl	$bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+	leaq	__strchr_sse2(%rip), %rax
+2:	HAS_ARCH_FEATURE (Slow_BSF)
 	jz	3f
 	leaq    __strchr_sse2_no_bsf(%rip), %rax
 3:	ret
diff --git a/sysdeps/x86_64/multiarch/strcmp.S b/sysdeps/x86_64/multiarch/strcmp.S
index f50f26c..c180ce6 100644
--- a/sysdeps/x86_64/multiarch/strcmp.S
+++ b/sysdeps/x86_64/multiarch/strcmp.S
@@ -84,24 +84,20 @@
 	.text
 ENTRY(STRCMP)
 	.type	STRCMP, @gnu_indirect_function
-	/* Manually inlined call to __get_cpu_features.  */
-	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
-	jne	1f
-	call	__init_cpu_features
-1:
+	LOAD_RTLD_GLOBAL_RO_RDX
 #ifdef USE_AS_STRCMP
 	leaq	__strcmp_sse2_unaligned(%rip), %rax
-	testl   $bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip)
+	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
 	jnz     3f
 #else
-	testl	$bit_Slow_SSE4_2, __cpu_features+FEATURE_OFFSET+index_Slow_SSE4_2(%rip)
+	HAS_ARCH_FEATURE (Slow_SSE4_2)
 	jnz	2f
 	leaq	STRCMP_SSE42(%rip), %rax
-	testl	$bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
+	HAS_CPU_FEATURE (SSE4_2)
 	jnz	3f
 #endif
 2:	leaq	STRCMP_SSSE3(%rip), %rax
-	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+	HAS_CPU_FEATURE (SSSE3)
 	jnz	3f
 	leaq	STRCMP_SSE2(%rip), %rax
 3:	ret
@@ -110,23 +106,19 @@ END(STRCMP)
 # ifdef USE_AS_STRCASECMP_L
 ENTRY(__strcasecmp)
 	.type	__strcasecmp, @gnu_indirect_function
-	/* Manually inlined call to __get_cpu_features.  */
-	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
-	jne	1f
-	call	__init_cpu_features
-1:
+	LOAD_RTLD_GLOBAL_RO_RDX
 #  ifdef HAVE_AVX_SUPPORT
 	leaq	__strcasecmp_avx(%rip), %rax
-	testl	$bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
+	HAS_ARCH_FEATURE (AVX_Usable)
 	jnz	3f
 #  endif
-	testl	$bit_Slow_SSE4_2, __cpu_features+FEATURE_OFFSET+index_Slow_SSE4_2(%rip)
+	HAS_ARCH_FEATURE (Slow_SSE4_2)
 	jnz	2f
 	leaq	__strcasecmp_sse42(%rip), %rax
-	testl	$bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
+	HAS_CPU_FEATURE (SSE4_2)
 	jnz	3f
 2:	leaq	__strcasecmp_ssse3(%rip), %rax
-	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+	HAS_CPU_FEATURE (SSSE3)
 	jnz	3f
 	leaq	__strcasecmp_sse2(%rip), %rax
 3:	ret
@@ -136,23 +128,19 @@ weak_alias (__strcasecmp, strcasecmp)
 # ifdef USE_AS_STRNCASECMP_L
 ENTRY(__strncasecmp)
 	.type	__strncasecmp, @gnu_indirect_function
-	/* Manually inlined call to __get_cpu_features.  */
-	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
-	jne	1f
-	call	__init_cpu_features
-1:
+	LOAD_RTLD_GLOBAL_RO_RDX
 #  ifdef HAVE_AVX_SUPPORT
 	leaq	__strncasecmp_avx(%rip), %rax
-	testl	$bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
+	HAS_ARCH_FEATURE (AVX_Usable)
 	jnz	3f
 #  endif
-	testl	$bit_Slow_SSE4_2, __cpu_features+FEATURE_OFFSET+index_Slow_SSE4_2(%rip)
+	HAS_ARCH_FEATURE (Slow_SSE4_2)
 	jnz	2f
 	leaq	__strncasecmp_sse42(%rip), %rax
-	testl	$bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
+	HAS_CPU_FEATURE (SSE4_2)
 	jnz	3f
 2:	leaq	__strncasecmp_ssse3(%rip), %rax
-	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+	HAS_CPU_FEATURE (SSSE3)
 	jnz	3f
 	leaq	__strncasecmp_sse2(%rip), %rax
 3:	ret
diff --git a/sysdeps/x86_64/multiarch/strcpy.S b/sysdeps/x86_64/multiarch/strcpy.S
index 9464ee8..3aae8ee 100644
--- a/sysdeps/x86_64/multiarch/strcpy.S
+++ b/sysdeps/x86_64/multiarch/strcpy.S
@@ -61,14 +61,12 @@
 	.text
 ENTRY(STRCPY)
 	.type	STRCPY, @gnu_indirect_function
-	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
-	jne	1f
-	call	__init_cpu_features
-1:	leaq	STRCPY_SSE2_UNALIGNED(%rip), %rax
-	testl	$bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+	leaq	STRCPY_SSE2_UNALIGNED(%rip), %rax
+	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
 	jnz	2f
 	leaq	STRCPY_SSE2(%rip), %rax
-	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+	HAS_CPU_FEATURE (SSSE3)
 	jz	2f
 	leaq	STRCPY_SSSE3(%rip), %rax
 2:	ret
diff --git a/sysdeps/x86_64/multiarch/strcspn.S b/sysdeps/x86_64/multiarch/strcspn.S
index 00e4617..9c3af5b 100644
--- a/sysdeps/x86_64/multiarch/strcspn.S
+++ b/sysdeps/x86_64/multiarch/strcspn.S
@@ -45,11 +45,9 @@
 	.text
 ENTRY(STRCSPN)
 	.type	STRCSPN, @gnu_indirect_function
-	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
-	jne	1f
-	call	__init_cpu_features
-1:	leaq	STRCSPN_SSE2(%rip), %rax
-	testl	$bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+	leaq	STRCSPN_SSE2(%rip), %rax
+	HAS_CPU_FEATURE (SSE4_2)
 	jz	2f
 	leaq	STRCSPN_SSE42(%rip), %rax
 2:	ret
diff --git a/sysdeps/x86_64/multiarch/strspn.S b/sysdeps/x86_64/multiarch/strspn.S
index aea8e4c..684f730 100644
--- a/sysdeps/x86_64/multiarch/strspn.S
+++ b/sysdeps/x86_64/multiarch/strspn.S
@@ -30,11 +30,9 @@
 	.text
 ENTRY(strspn)
 	.type	strspn, @gnu_indirect_function
-	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
-	jne	1f
-	call	__init_cpu_features
-1:	leaq	__strspn_sse2(%rip), %rax
-	testl	$bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+	leaq	__strspn_sse2(%rip), %rax
+	HAS_CPU_FEATURE (SSE4_2)
 	jz	2f
 	leaq	__strspn_sse42(%rip), %rax
 2:	ret
diff --git a/sysdeps/x86_64/multiarch/strstr.c b/sysdeps/x86_64/multiarch/strstr.c
index 507994b..b8827f0 100644
--- a/sysdeps/x86_64/multiarch/strstr.c
+++ b/sysdeps/x86_64/multiarch/strstr.c
@@ -41,7 +41,10 @@ extern __typeof (__redirect_strstr) __strstr_sse2 attribute_hidden;
 /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
    ifunc symbol properly.  */
 extern __typeof (__redirect_strstr) __libc_strstr;
-libc_ifunc (__libc_strstr, HAS_FAST_UNALIGNED_LOAD ? __strstr_sse2_unaligned : __strstr_sse2)
+libc_ifunc (__libc_strstr,
+	    HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+	    ? __strstr_sse2_unaligned
+	    : __strstr_sse2)
 
 #undef strstr
 strong_alias (__libc_strstr, strstr)
diff --git a/sysdeps/x86_64/multiarch/test-multiarch.c b/sysdeps/x86_64/multiarch/test-multiarch.c
index 949d26e..e893894 100644
--- a/sysdeps/x86_64/multiarch/test-multiarch.c
+++ b/sysdeps/x86_64/multiarch/test-multiarch.c
@@ -75,12 +75,18 @@ do_test (int argc, char **argv)
   int fails;
 
   get_cpuinfo ();
-  fails = check_proc ("avx", HAS_AVX, "HAS_AVX");
-  fails += check_proc ("fma4", HAS_FMA4, "HAS_FMA4");
-  fails += check_proc ("sse4_2", HAS_SSE4_2, "HAS_SSE4_2");
-  fails += check_proc ("sse4_1", HAS_SSE4_1, "HAS_SSE4_1");
-  fails += check_proc ("ssse3", HAS_SSSE3, "HAS_SSSE3");
-  fails += check_proc ("popcnt", HAS_POPCOUNT, "HAS_POPCOUNT");
+  fails = check_proc ("avx", HAS_ARCH_FEATURE (AVX_Usable),
+		      "HAS_ARCH_FEATURE (AVX_Usable)");
+  fails += check_proc ("fma4", HAS_ARCH_FEATURE (FMA4_Usable),
+		       "HAS_ARCH_FEATURE (FMA4_Usable)");
+  fails += check_proc ("sse4_2", HAS_CPU_FEATURE (SSE4_2),
+		       "HAS_CPU_FEATURE (SSE4_2)");
+  fails += check_proc ("sse4_1", HAS_CPU_FEATURE (SSE4_1)
+		       , "HAS_CPU_FEATURE (SSE4_1)");
+  fails += check_proc ("ssse3", HAS_CPU_FEATURE (SSSE3),
+		       "HAS_CPU_FEATURE (SSSE3)");
+  fails += check_proc ("popcnt", HAS_CPU_FEATURE (POPCOUNT),
+		       "HAS_CPU_FEATURE (POPCOUNT)");
 
   printf ("%d differences between /proc/cpuinfo and glibc code.\n", fails);
 
diff --git a/sysdeps/x86_64/multiarch/wcscpy.S b/sysdeps/x86_64/multiarch/wcscpy.S
index ff2f5a7..c47c51c 100644
--- a/sysdeps/x86_64/multiarch/wcscpy.S
+++ b/sysdeps/x86_64/multiarch/wcscpy.S
@@ -27,11 +27,8 @@
 	.text
 ENTRY(wcscpy)
 	.type	wcscpy, @gnu_indirect_function
-	cmpl	$0, KIND_OFFSET+__cpu_features(%rip)
-	jne	1f
-	call	__init_cpu_features
-
-1:	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+	HAS_CPU_FEATURE (SSSE3)
 	jnz	2f
 	leaq	__wcscpy_sse2(%rip), %rax
 	ret
diff --git a/sysdeps/x86_64/multiarch/wmemcmp.S b/sysdeps/x86_64/multiarch/wmemcmp.S
index 109e245..62215f4 100644
--- a/sysdeps/x86_64/multiarch/wmemcmp.S
+++ b/sysdeps/x86_64/multiarch/wmemcmp.S
@@ -26,16 +26,13 @@
 	.text
 ENTRY(wmemcmp)
 	.type	wmemcmp, @gnu_indirect_function
-	cmpl	$0, KIND_OFFSET+__cpu_features(%rip)
-	jne	1f
-	call	__init_cpu_features
-
-1:	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+	HAS_CPU_FEATURE (SSSE3)
 	jnz	2f
 	leaq	__wmemcmp_sse2(%rip), %rax
 	ret
 
-2:	testl	$bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+2:	HAS_CPU_FEATURE (SSE4_1)
 	jz	3f
 	leaq	__wmemcmp_sse4_1(%rip), %rax
 	ret

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=14fda2e2ef67727228e3b7e2dea174cac4ffd19d

commit 14fda2e2ef67727228e3b7e2dea174cac4ffd19d
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Fri Jul 31 07:30:04 2015 -0700

    Add _dl_x86_cpu_features to rtld_global in ld.so
    
    This patch adds _dl_x86_cpu_features to rtld_global in the x86 ld.so
    and initializes it early, before __libc_start_main is called, so that
    cpu_features is always available by the time it is used and IFUNC
    selectors no longer need to call __init_cpu_features.
    
    	* sysdeps/i386/dl-machine.h: Include <cpu-features.c>.
    	(dl_platform_init): Call init_cpu_features.
    	* sysdeps/i386/dl-procinfo.c (_dl_x86_cpu_features): New.
    	* sysdeps/i386/i686/cacheinfo.c
    	(DISABLE_PREFERRED_MEMORY_INSTRUCTION): Removed.
    	* sysdeps/i386/i686/multiarch/Makefile (aux): Remove init-arch.
    	* sysdeps/i386/i686/multiarch/Versions: Removed.
    	* sysdeps/i386/i686/multiarch/ifunc-defines.sym (KIND_OFFSET):
    	Removed.
    	* sysdeps/i386/ldsodefs.h: Include <cpu-features.h>.
    	* sysdeps/unix/sysv/linux/x86/Makefile
    	(libpthread-sysdep_routines): Remove init-arch.
    	* sysdeps/unix/sysv/linux/x86_64/dl-procinfo.c: Include
    	<sysdeps/x86_64/dl-procinfo.c> instead of
    	<sysdeps/generic/dl-procinfo.c>.
    	* sysdeps/x86/Makefile [$(subdir) == csu] (gen-as-const-headers):
    	Add cpu-features-offsets.sym and rtld-global-offsets.sym.
    	[$(subdir) == elf] (sysdep-dl-routines): Add dl-get-cpu-features.
    	[$(subdir) == elf] (sysdep-rtld-routines): Likewise.
    	[$(subdir) == elf] (sysdep_routines): Likewise.
    	[$(subdir) == elf] (elide-routines.os): Likewise.
    	[$(subdir) == elf] (tests): Add tst-get-cpu-features.
    	[$(subdir) == elf] (tests-static): Add
    	tst-get-cpu-features-static.
    	* sysdeps/x86/Versions: New file.
    	* sysdeps/x86/cpu-features-offsets.sym: Likewise.
    	* sysdeps/x86/cpu-features.c: Likewise.
    	* sysdeps/x86/cpu-features.h: Likewise.
    	* sysdeps/x86/dl-get-cpu-features.c: Likewise.
    	* sysdeps/x86/libc-start.c: Likewise.
    	* sysdeps/x86/rtld-global-offsets.sym: Likewise.
    	* sysdeps/x86/tst-get-cpu-features-static.c: Likewise.
    	* sysdeps/x86/tst-get-cpu-features.c: Likewise.
    	* sysdeps/x86_64/dl-procinfo.c: Likewise.
    	* sysdeps/x86_64/cacheinfo.c (__cpuid_count): Removed.
    	Assume USE_MULTIARCH is defined and don't check it.
    	(is_intel): Replace __cpu_features with GLRO(dl_x86_cpu_features).
    	(is_amd): Likewise.
    	(max_cpuid): Likewise.
    	(intel_check_word): Likewise.
    	(__cache_sysconf): Don't call __init_cpu_features.
    	(__x86_preferred_memory_instruction): Removed.
    	(init_cacheinfo): Don't call __init_cpu_features. Replace
    	__cpu_features with GLRO(dl_x86_cpu_features).
    	* sysdeps/x86_64/dl-machine.h: Include <cpu-features.c>.
    	(dl_platform_init): Call init_cpu_features.
    	* sysdeps/x86_64/ldsodefs.h: Include <cpu-features.h>.
    	* sysdeps/x86_64/multiarch/Makefile (aux): Remove init-arch.
    	* sysdeps/x86_64/multiarch/Versions: Removed.
    	* sysdeps/x86_64/multiarch/cacheinfo.c: Likewise.
    	* sysdeps/x86_64/multiarch/init-arch.c: Likewise.
    	* sysdeps/x86_64/multiarch/ifunc-defines.sym (KIND_OFFSET):
    	Removed.
    	* sysdeps/x86_64/multiarch/init-arch.h: Rewrite.
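
With cpu_features initialized before any user code runs, a C IFUNC
selector no longer needs INIT_ARCH or a lazy __init_cpu_features call;
it can query the feature bits directly.  A minimal sketch of that
pattern using GCC's ifunc attribute (my_func and its two variants are
hypothetical names, not from this patch):

/* Hypothetical IFUNC selector under the new scheme.  */
#include <cpu-features.h>

extern void my_func_avx2 (void);
extern void my_func_sse2 (void);

/* Resolver, run at relocation time.  __get_cpu_features () now
   resolves to &GLRO(dl_x86_cpu_features), which dl_platform_init
   filled in before any IFUNC selector can run.  */
static void *
resolve_my_func (void)
{
  return HAS_ARCH_FEATURE (AVX2_Usable)
	 ? (void *) my_func_avx2
	 : (void *) my_func_sse2;
}

void my_func (void) __attribute__ ((ifunc ("resolve_my_func")));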

diff --git a/sysdeps/i386/dl-machine.h b/sysdeps/i386/dl-machine.h
index a8a90f1..fe75d59 100644
--- a/sysdeps/i386/dl-machine.h
+++ b/sysdeps/i386/dl-machine.h
@@ -25,6 +25,7 @@
 #include <sysdep.h>
 #include <tls.h>
 #include <dl-tlsdesc.h>
+#include <cpu-features.c>
 
 /* Return nonzero iff ELF header is compatible with the running host.  */
 static inline int __attribute__ ((unused))
@@ -235,6 +236,8 @@ dl_platform_init (void)
   if (GLRO(dl_platform) != NULL && *GLRO(dl_platform) == '\0')
     /* Avoid an empty string which would disturb us.  */
     GLRO(dl_platform) = NULL;
+
+  init_cpu_features (&GLRO(dl_x86_cpu_features));
 }
 
 static inline Elf32_Addr
diff --git a/sysdeps/i386/dl-procinfo.c b/sysdeps/i386/dl-procinfo.c
index b673b3c..e95f335 100644
--- a/sysdeps/i386/dl-procinfo.c
+++ b/sysdeps/i386/dl-procinfo.c
@@ -43,6 +43,22 @@
 # define PROCINFO_CLASS
 #endif
 
+#if !IS_IN (ldconfig)
+# if !defined PROCINFO_DECL && defined SHARED
+  ._dl_x86_cpu_features
+# else
+PROCINFO_CLASS struct cpu_features _dl_x86_cpu_features
+# endif
+# ifndef PROCINFO_DECL
+= { }
+# endif
+# if !defined SHARED || defined PROCINFO_DECL
+;
+# else
+,
+# endif
+#endif
+
 #if !defined PROCINFO_DECL && defined SHARED
   ._dl_x86_cap_flags
 #else
diff --git a/sysdeps/i386/i686/cacheinfo.c b/sysdeps/i386/i686/cacheinfo.c
index 0f869df..0b50c6d 100644
--- a/sysdeps/i386/i686/cacheinfo.c
+++ b/sysdeps/i386/i686/cacheinfo.c
@@ -1,4 +1,3 @@
 #define DISABLE_PREFETCHW
-#define DISABLE_PREFERRED_MEMORY_INSTRUCTION
 
 #include <sysdeps/x86_64/cacheinfo.c>
diff --git a/sysdeps/i386/i686/multiarch/Makefile b/sysdeps/i386/i686/multiarch/Makefile
index 11ce4ba..31bfd39 100644
--- a/sysdeps/i386/i686/multiarch/Makefile
+++ b/sysdeps/i386/i686/multiarch/Makefile
@@ -1,5 +1,4 @@
 ifeq ($(subdir),csu)
-aux += init-arch
 tests += test-multiarch
 gen-as-const-headers += ifunc-defines.sym
 endif
diff --git a/sysdeps/i386/i686/multiarch/ifunc-defines.sym b/sysdeps/i386/i686/multiarch/ifunc-defines.sym
index eb1538a..96e9cfa 100644
--- a/sysdeps/i386/i686/multiarch/ifunc-defines.sym
+++ b/sysdeps/i386/i686/multiarch/ifunc-defines.sym
@@ -4,7 +4,6 @@
 --
 
 CPU_FEATURES_SIZE	sizeof (struct cpu_features)
-KIND_OFFSET		offsetof (struct cpu_features, kind)
 CPUID_OFFSET		offsetof (struct cpu_features, cpuid)
 CPUID_SIZE		sizeof (struct cpuid_registers)
 CPUID_EAX_OFFSET	offsetof (struct cpuid_registers, eax)
diff --git a/sysdeps/i386/ldsodefs.h b/sysdeps/i386/ldsodefs.h
index d80cf01..dae2d04 100644
--- a/sysdeps/i386/ldsodefs.h
+++ b/sysdeps/i386/ldsodefs.h
@@ -20,6 +20,7 @@
 #define	_I386_LDSODEFS_H	1
 
 #include <elf.h>
+#include <cpu-features.h>
 
 struct La_i86_regs;
 struct La_i86_retval;
diff --git a/sysdeps/unix/sysv/linux/x86/Makefile b/sysdeps/unix/sysv/linux/x86/Makefile
index d6be472..9e6ec44 100644
--- a/sysdeps/unix/sysv/linux/x86/Makefile
+++ b/sysdeps/unix/sysv/linux/x86/Makefile
@@ -15,7 +15,6 @@ sysdep_headers += sys/elf.h sys/perm.h sys/reg.h sys/vm86.h sys/debugreg.h sys/i
 endif
 
 ifeq ($(subdir),nptl)
-libpthread-sysdep_routines += init-arch
 libpthread-sysdep_routines += elision-lock elision-unlock elision-timed \
 			      elision-trylock
 endif
diff --git a/sysdeps/unix/sysv/linux/x86_64/dl-procinfo.c b/sysdeps/unix/sysv/linux/x86_64/dl-procinfo.c
index 8ac351e..a3c0c19 100644
--- a/sysdeps/unix/sysv/linux/x86_64/dl-procinfo.c
+++ b/sysdeps/unix/sysv/linux/x86_64/dl-procinfo.c
@@ -1,5 +1,5 @@
 #if IS_IN (ldconfig)
 # include <sysdeps/i386/dl-procinfo.c>
 #else
-# include <sysdeps/generic/dl-procinfo.c>
+# include <sysdeps/x86_64/dl-procinfo.c>
 #endif
diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
index 19f5eca..74d2b17 100644
--- a/sysdeps/x86/Makefile
+++ b/sysdeps/x86/Makefile
@@ -8,3 +8,17 @@ $(objpfx)tst-ld-sse-use.out: ../sysdeps/x86/tst-ld-sse-use.sh $(objpfx)ld.so
 	$(BASH) $< $(objpfx) '$(NM)' '$(OBJDUMP)' '$(READELF)' > $@; \
 	$(evaluate-test)
 endif
+
+ifeq ($(subdir),csu)
+gen-as-const-headers += cpu-features-offsets.sym rtld-global-offsets.sym
+endif
+
+ifeq ($(subdir),elf)
+sysdep-dl-routines += dl-get-cpu-features
+sysdep-rtld-routines += dl-get-cpu-features
+sysdep_routines += dl-get-cpu-features
+elide-routines.os += dl-get-cpu-features
+
+tests += tst-get-cpu-features
+tests-static += tst-get-cpu-features-static
+endif
diff --git a/sysdeps/i386/i686/multiarch/Versions b/sysdeps/x86/Versions
similarity index 87%
rename from sysdeps/i386/i686/multiarch/Versions
rename to sysdeps/x86/Versions
index 59b185a..e029237 100644
--- a/sysdeps/i386/i686/multiarch/Versions
+++ b/sysdeps/x86/Versions
@@ -1,4 +1,4 @@
-libc {
+ld {
   GLIBC_PRIVATE {
     __get_cpu_features;
   }
diff --git a/sysdeps/x86/cpu-features-offsets.sym b/sysdeps/x86/cpu-features-offsets.sym
new file mode 100644
index 0000000..a9d53d1
--- /dev/null
+++ b/sysdeps/x86/cpu-features-offsets.sym
@@ -0,0 +1,7 @@
+#define SHARED 1
+
+#include <ldsodefs.h>
+
+#define rtld_global_ro_offsetof(mem) offsetof (struct rtld_global_ro, mem)
+
+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET rtld_global_ro_offsetof (_dl_x86_cpu_features)
diff --git a/sysdeps/x86_64/multiarch/init-arch.c b/sysdeps/x86/cpu-features.c
similarity index 60%
rename from sysdeps/x86_64/multiarch/init-arch.c
rename to sysdeps/x86/cpu-features.c
index 7dec218..587080c 100644
--- a/sysdeps/x86_64/multiarch/init-arch.c
+++ b/sysdeps/x86/cpu-features.c
@@ -1,7 +1,6 @@
 /* Initialize CPU feature data.
    This file is part of the GNU C Library.
    Copyright (C) 2008-2015 Free Software Foundation, Inc.
-   Contributed by Ulrich Drepper <drepper@redhat.com>.
 
    The GNU C Library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
@@ -17,48 +16,40 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#include <atomic.h>
 #include <cpuid.h>
-#include "init-arch.h"
+#include <cpu-features.h>
 
-
-struct cpu_features __cpu_features attribute_hidden;
-
-
-static void
-get_common_indeces (unsigned int *family, unsigned int *model)
+static inline void
+get_common_indeces (struct cpu_features *cpu_features,
+		    unsigned int *family, unsigned int *model)
 {
-  __cpuid (1, __cpu_features.cpuid[COMMON_CPUID_INDEX_1].eax,
-	   __cpu_features.cpuid[COMMON_CPUID_INDEX_1].ebx,
-	   __cpu_features.cpuid[COMMON_CPUID_INDEX_1].ecx,
-	   __cpu_features.cpuid[COMMON_CPUID_INDEX_1].edx);
-
-  unsigned int eax = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].eax;
+  unsigned int eax;
+  __cpuid (1, eax, cpu_features->cpuid[COMMON_CPUID_INDEX_1].ebx,
+	   cpu_features->cpuid[COMMON_CPUID_INDEX_1].ecx,
+	   cpu_features->cpuid[COMMON_CPUID_INDEX_1].edx);
+  GLRO(dl_x86_cpu_features).cpuid[COMMON_CPUID_INDEX_1].eax = eax;
   *family = (eax >> 8) & 0x0f;
   *model = (eax >> 4) & 0x0f;
 }
 
-
-void
-__init_cpu_features (void)
+static inline void
+init_cpu_features (struct cpu_features *cpu_features)
 {
-  unsigned int ebx;
-  unsigned int ecx;
-  unsigned int edx;
+  unsigned int ebx, ecx, edx;
   unsigned int family = 0;
   unsigned int model = 0;
   enum cpu_features_kind kind;
 
-  __cpuid (0, __cpu_features.max_cpuid, ebx, ecx, edx);
+  __cpuid (0, cpu_features->max_cpuid, ebx, ecx, edx);
 
   /* This spells out "GenuineIntel".  */
   if (ebx == 0x756e6547 && ecx == 0x6c65746e && edx == 0x49656e69)
     {
       kind = arch_kind_intel;
 
-      get_common_indeces (&family, &model);
+      get_common_indeces (cpu_features, &family, &model);
 
-      unsigned int eax = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].eax;
+      unsigned int eax = cpu_features->cpuid[COMMON_CPUID_INDEX_1].eax;
       unsigned int extended_family = (eax >> 20) & 0xff;
       unsigned int extended_model = (eax >> 12) & 0xf0;
       if (family == 0x0f)
@@ -68,14 +59,14 @@ __init_cpu_features (void)
 	}
       else if (family == 0x06)
 	{
-	  ecx = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].ecx;
+	  ecx = cpu_features->cpuid[COMMON_CPUID_INDEX_1].ecx;
 	  model += extended_model;
 	  switch (model)
 	    {
 	    case 0x1c:
 	    case 0x26:
 	      /* BSF is slow on Atom.  */
-	      __cpu_features.feature[index_Slow_BSF] |= bit_Slow_BSF;
+	      cpu_features->feature[index_Slow_BSF] |= bit_Slow_BSF;
 	      break;
 
 	    case 0x37:
@@ -91,7 +82,7 @@ __init_cpu_features (void)
 #if index_Fast_Unaligned_Load != index_Slow_SSE4_2
 # error index_Fast_Unaligned_Load != index_Slow_SSE4_2
 #endif
-	      __cpu_features.feature[index_Fast_Unaligned_Load]
+	      cpu_features->feature[index_Fast_Unaligned_Load]
 		|= (bit_Fast_Unaligned_Load
 		    | bit_Prefer_PMINUB_for_stringop
 		    | bit_Slow_SSE4_2);
@@ -121,7 +112,7 @@ __init_cpu_features (void)
 #if index_Fast_Rep_String != index_Prefer_PMINUB_for_stringop
 # error index_Fast_Rep_String != index_Prefer_PMINUB_for_stringop
 #endif
-	      __cpu_features.feature[index_Fast_Rep_String]
+	      cpu_features->feature[index_Fast_Rep_String]
 		|= (bit_Fast_Rep_String
 		    | bit_Fast_Copy_Backward
 		    | bit_Fast_Unaligned_Load
@@ -135,31 +126,31 @@ __init_cpu_features (void)
     {
       kind = arch_kind_amd;
 
-      get_common_indeces (&family, &model);
+      get_common_indeces (cpu_features, &family, &model);
 
-      ecx = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].ecx;
+      ecx = cpu_features->cpuid[COMMON_CPUID_INDEX_1].ecx;
 
       unsigned int eax;
       __cpuid (0x80000000, eax, ebx, ecx, edx);
       if (eax >= 0x80000001)
 	__cpuid (0x80000001,
-		 __cpu_features.cpuid[COMMON_CPUID_INDEX_80000001].eax,
-		 __cpu_features.cpuid[COMMON_CPUID_INDEX_80000001].ebx,
-		 __cpu_features.cpuid[COMMON_CPUID_INDEX_80000001].ecx,
-		 __cpu_features.cpuid[COMMON_CPUID_INDEX_80000001].edx);
+		 cpu_features->cpuid[COMMON_CPUID_INDEX_80000001].eax,
+		 cpu_features->cpuid[COMMON_CPUID_INDEX_80000001].ebx,
+		 cpu_features->cpuid[COMMON_CPUID_INDEX_80000001].ecx,
+		 cpu_features->cpuid[COMMON_CPUID_INDEX_80000001].edx);
     }
   else
     kind = arch_kind_other;
 
-  if (__cpu_features.max_cpuid >= 7)
+  if (cpu_features->max_cpuid >= 7)
     __cpuid_count (7, 0,
-		   __cpu_features.cpuid[COMMON_CPUID_INDEX_7].eax,
-		   __cpu_features.cpuid[COMMON_CPUID_INDEX_7].ebx,
-		   __cpu_features.cpuid[COMMON_CPUID_INDEX_7].ecx,
-		   __cpu_features.cpuid[COMMON_CPUID_INDEX_7].edx);
+		   cpu_features->cpuid[COMMON_CPUID_INDEX_7].eax,
+		   cpu_features->cpuid[COMMON_CPUID_INDEX_7].ebx,
+		   cpu_features->cpuid[COMMON_CPUID_INDEX_7].ecx,
+		   cpu_features->cpuid[COMMON_CPUID_INDEX_7].edx);
 
   /* Can we call xgetbv?  */
-  if (CPUID_OSXSAVE)
+  if (HAS_CPU_FEATURE (OSXSAVE))
     {
       unsigned int xcrlow;
       unsigned int xcrhigh;
@@ -169,38 +160,43 @@ __init_cpu_features (void)
 	  (bit_YMM_state | bit_XMM_state))
 	{
 	  /* Determine if AVX is usable.  */
-	  if (CPUID_AVX)
-	    __cpu_features.feature[index_AVX_Usable] |= bit_AVX_Usable;
+	  if (HAS_CPU_FEATURE (AVX))
+	    cpu_features->feature[index_AVX_Usable] |= bit_AVX_Usable;
 #if index_AVX2_Usable != index_AVX_Fast_Unaligned_Load
 # error index_AVX2_Usable != index_AVX_Fast_Unaligned_Load
 #endif
 	  /* Determine if AVX2 is usable.  Unaligned load with 256-bit
 	     AVX registers are faster on processors with AVX2.  */
-	  if (CPUID_AVX2)
-	    __cpu_features.feature[index_AVX2_Usable]
+	  if (HAS_CPU_FEATURE (AVX2))
+	    cpu_features->feature[index_AVX2_Usable]
 	      |= bit_AVX2_Usable | bit_AVX_Fast_Unaligned_Load;
+	  /* Check if OPMASK state, upper 256-bit of ZMM0-ZMM15 and
+	     ZMM16-ZMM31 state are enabled.  */
+	  if ((xcrlow & (bit_Opmask_state | bit_ZMM0_15_state
+			 | bit_ZMM16_31_state)) ==
+	      (bit_Opmask_state | bit_ZMM0_15_state | bit_ZMM16_31_state))
+	    {
+	      /* Determine if AVX512F is usable.  */
+	      if (HAS_CPU_FEATURE (AVX512F))
+		{
+		  cpu_features->feature[index_AVX512F_Usable]
+		    |= bit_AVX512F_Usable;
+		  /* Determine if AVX512DQ is usable.  */
+		  if (HAS_CPU_FEATURE (AVX512DQ))
+		    cpu_features->feature[index_AVX512DQ_Usable]
+		      |= bit_AVX512DQ_Usable;
+		}
+	    }
 	  /* Determine if FMA is usable.  */
-	  if (CPUID_FMA)
-	    __cpu_features.feature[index_FMA_Usable] |= bit_FMA_Usable;
+	  if (HAS_CPU_FEATURE (FMA))
+	    cpu_features->feature[index_FMA_Usable] |= bit_FMA_Usable;
 	  /* Determine if FMA4 is usable.  */
-	  if (CPUID_FMA4)
-	    __cpu_features.feature[index_FMA4_Usable] |= bit_FMA4_Usable;
+	  if (HAS_CPU_FEATURE (FMA4))
+	    cpu_features->feature[index_FMA4_Usable] |= bit_FMA4_Usable;
 	}
     }
 
-  __cpu_features.family = family;
-  __cpu_features.model = model;
-  atomic_write_barrier ();
-  __cpu_features.kind = kind;
-}
-
-#undef __get_cpu_features
-
-const struct cpu_features *
-__get_cpu_features (void)
-{
-  if (__cpu_features.kind == arch_kind_unknown)
-    __init_cpu_features ();
-
-  return &__cpu_features;
+  cpu_features->family = family;
+  cpu_features->model = model;
+  cpu_features->kind = kind;
 }
diff --git a/sysdeps/x86_64/multiarch/init-arch.h b/sysdeps/x86/cpu-features.h
similarity index 54%
copy from sysdeps/x86_64/multiarch/init-arch.h
copy to sysdeps/x86/cpu-features.h
index e6b5ba5..22e5abb 100644
--- a/sysdeps/x86_64/multiarch/init-arch.h
+++ b/sysdeps/x86/cpu-features.h
@@ -15,6 +15,9 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
+#ifndef cpu_features_h
+#define cpu_features_h
+
 #define bit_Fast_Rep_String		(1 << 0)
 #define bit_Fast_Copy_Backward		(1 << 1)
 #define bit_Slow_BSF			(1 << 2)
@@ -26,6 +29,8 @@
 #define bit_Slow_SSE4_2			(1 << 9)
 #define bit_AVX2_Usable			(1 << 10)
 #define bit_AVX_Fast_Unaligned_Load	(1 << 11)
+#define bit_AVX512F_Usable		(1 << 12)
+#define bit_AVX512DQ_Usable		(1 << 13)
 
 /* CPUID Feature flags.  */
 
@@ -43,20 +48,26 @@
 /* COMMON_CPUID_INDEX_7.  */
 #define bit_RTM		(1 << 11)
 #define bit_AVX2	(1 << 5)
+#define bit_AVX512F	(1 << 16)
+#define bit_AVX512DQ	(1 << 17)
 
 /* XCR0 Feature flags.  */
 #define bit_XMM_state  (1 << 1)
 #define bit_YMM_state  (2 << 1)
+#define bit_Opmask_state	(1 << 5)
+#define bit_ZMM0_15_state	(1 << 6)
+#define bit_ZMM16_31_state	(1 << 7)
 
 /* The integer bit array index for the first set of internal feature bits.  */
-# define FEATURE_INDEX_1 0
+#define FEATURE_INDEX_1 0
 
 /* The current maximum size of the feature integer bit array.  */
-# define FEATURE_INDEX_MAX 1
+#define FEATURE_INDEX_MAX 1
 
 #ifdef	__ASSEMBLER__
 
 # include <ifunc-defines.h>
+# include <rtld-global-offsets.h>
 
 # define index_SSE2	COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_EDX_OFFSET
 # define index_SSSE3	COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
@@ -76,10 +87,62 @@
 # define index_Slow_SSE4_2		FEATURE_INDEX_1*FEATURE_SIZE
 # define index_AVX2_Usable		FEATURE_INDEX_1*FEATURE_SIZE
 # define index_AVX_Fast_Unaligned_Load	FEATURE_INDEX_1*FEATURE_SIZE
+# define index_AVX512F_Usable		FEATURE_INDEX_1*FEATURE_SIZE
+# define index_AVX512DQ_Usable		FEATURE_INDEX_1*FEATURE_SIZE
+
+# if defined (_LIBC) && !IS_IN (nonlib)
+#  ifdef __x86_64__
+#   ifdef SHARED
+#    if IS_IN (rtld)
+#     define LOAD_RTLD_GLOBAL_RO_RDX
+#     define HAS_FEATURE(offset, name) \
+  testl $(bit_##name), _rtld_local_ro+offset+(index_##name)(%rip)
+#    else
+#      define LOAD_RTLD_GLOBAL_RO_RDX \
+  mov _rtld_global_ro@GOTPCREL(%rip), %RDX_LP
+#     define HAS_FEATURE(offset, name) \
+  testl $(bit_##name), \
+	RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+offset+(index_##name)(%rdx)
+#    endif
+#   else /* SHARED */
+#    define LOAD_RTLD_GLOBAL_RO_RDX
+#    define HAS_FEATURE(offset, name) \
+  testl $(bit_##name), _dl_x86_cpu_features+offset+(index_##name)(%rip)
+#   endif /* !SHARED */
+#  else  /* __x86_64__ */
+#   ifdef SHARED
+#    define LOAD_FUNC_GOT_EAX(func) \
+  leal func@GOTOFF(%edx), %eax
+#    if IS_IN (rtld)
+#    define LOAD_GOT_AND_RTLD_GLOBAL_RO \
+  LOAD_PIC_REG(dx)
+#     define HAS_FEATURE(offset, name) \
+  testl $(bit_##name), offset+(index_##name)+_rtld_local_ro@GOTOFF(%edx)
+#    else
+#     define LOAD_GOT_AND_RTLD_GLOBAL_RO \
+  LOAD_PIC_REG(dx); \
+  mov _rtld_global_ro@GOT(%edx), %ecx
+#     define HAS_FEATURE(offset, name) \
+  testl $(bit_##name), \
+	RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+offset+(index_##name)(%ecx)
+#    endif
+#   else  /* SHARED */
+#    define LOAD_FUNC_GOT_EAX(func) \
+  leal func, %eax
+#    define LOAD_GOT_AND_RTLD_GLOBAL_RO
+#    define HAS_FEATURE(offset, name) \
+  testl $(bit_##name), _dl_x86_cpu_features+offset+(index_##name)
+#   endif /* !SHARED */
+#  endif /* !__x86_64__ */
+# else /* _LIBC && !nonlib */
+#  error "Sorry, <cpu-features.h> is unimplemented for assembler"
+# endif /* !_LIBC || nonlib */
 
-#else	/* __ASSEMBLER__ */
+/* HAS_* evaluates to true if we may use the feature at runtime.  */
+# define HAS_CPU_FEATURE(name)	HAS_FEATURE (CPUID_OFFSET, name)
+# define HAS_ARCH_FEATURE(name) HAS_FEATURE (FEATURE_OFFSET, name)
 
-# include <sys/param.h>
+#else	/* __ASSEMBLER__ */
 
 enum
   {
@@ -90,7 +153,7 @@ enum
     COMMON_CPUID_INDEX_MAX
   };
 
-extern struct cpu_features
+struct cpu_features
 {
   enum cpu_features_kind
     {
@@ -110,56 +173,53 @@ extern struct cpu_features
   unsigned int family;
   unsigned int model;
   unsigned int feature[FEATURE_INDEX_MAX];
-} __cpu_features attribute_hidden;
-
-
-extern void __init_cpu_features (void) attribute_hidden;
-# define INIT_ARCH() \
-  do							\
-    if (__cpu_features.kind == arch_kind_unknown)	\
-      __init_cpu_features ();				\
-  while (0)
+};
 
-/* Used from outside libc.so to get access to the CPU features structure.  */
+/* Used from outside of glibc to get access to the CPU features
+   structure.  */
 extern const struct cpu_features *__get_cpu_features (void)
      __attribute__ ((const));
 
-# if IS_IN (libc)
-#  define __get_cpu_features()	(&__cpu_features)
+# if defined (_LIBC) && !IS_IN (nonlib)
+/* Unused for x86.  */
+#  define INIT_ARCH()
+#  define __get_cpu_features()	(&GLRO(dl_x86_cpu_features))
 # endif
 
-# define HAS_CPU_FEATURE(idx, reg, bit) \
-  ((__get_cpu_features ()->cpuid[idx].reg & (bit)) != 0)
-
-/* Following are the feature tests used throughout libc.  */
-
-/* CPUID_* evaluates to true if the feature flag is enabled.
-   We always use &__cpu_features because the HAS_CPUID_* macros
-   are called only within __init_cpu_features, where we can't
-   call __get_cpu_features without infinite recursion.  */
-# define HAS_CPUID_FLAG(idx, reg, bit) \
-  (((&__cpu_features)->cpuid[idx].reg & (bit)) != 0)
-
-# define CPUID_OSXSAVE \
-  HAS_CPUID_FLAG (COMMON_CPUID_INDEX_1, ecx, bit_OSXSAVE)
-# define CPUID_AVX \
-  HAS_CPUID_FLAG (COMMON_CPUID_INDEX_1, ecx, bit_AVX)
-# define CPUID_FMA \
-  HAS_CPUID_FLAG (COMMON_CPUID_INDEX_1, ecx, bit_FMA)
-# define CPUID_FMA4 \
-  HAS_CPUID_FLAG (COMMON_CPUID_INDEX_80000001, ecx, bit_FMA4)
-# define CPUID_RTM \
-  HAS_CPUID_FLAG (COMMON_CPUID_INDEX_7, ebx, bit_RTM)
-# define CPUID_AVX2 \
-  HAS_CPUID_FLAG (COMMON_CPUID_INDEX_7, ebx, bit_AVX2)
 
 /* HAS_* evaluates to true if we may use the feature at runtime.  */
-# define HAS_SSE2	HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, edx, bit_SSE2)
-# define HAS_POPCOUNT	HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, bit_POPCOUNT)
-# define HAS_SSSE3	HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, bit_SSSE3)
-# define HAS_SSE4_1	HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, bit_SSE4_1)
-# define HAS_SSE4_2	HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, bit_SSE4_2)
-# define HAS_RTM	HAS_CPU_FEATURE (COMMON_CPUID_INDEX_7, ebx, bit_RTM)
+# define HAS_CPU_FEATURE(name) \
+  ((__get_cpu_features ()->cpuid[index_##name].reg_##name & (bit_##name)) != 0)
+# define HAS_ARCH_FEATURE(name) \
+  ((__get_cpu_features ()->feature[index_##name] & (bit_##name)) != 0)
+
+# define index_SSE2		COMMON_CPUID_INDEX_1
+# define index_SSSE3		COMMON_CPUID_INDEX_1
+# define index_SSE4_1		COMMON_CPUID_INDEX_1
+# define index_SSE4_2		COMMON_CPUID_INDEX_1
+# define index_AVX		COMMON_CPUID_INDEX_1
+# define index_AVX2		COMMON_CPUID_INDEX_7
+# define index_AVX512F		COMMON_CPUID_INDEX_7
+# define index_AVX512DQ		COMMON_CPUID_INDEX_7
+# define index_RTM		COMMON_CPUID_INDEX_7
+# define index_FMA		COMMON_CPUID_INDEX_1
+# define index_FMA4		COMMON_CPUID_INDEX_80000001
+# define index_POPCOUNT		COMMON_CPUID_INDEX_1
+# define index_OSXSAVE		COMMON_CPUID_INDEX_1
+
+# define reg_SSE2		edx
+# define reg_SSSE3		ecx
+# define reg_SSE4_1		ecx
+# define reg_SSE4_2		ecx
+# define reg_AVX		ecx
+# define reg_AVX2		ebx
+# define reg_AVX512F		ebx
+# define reg_AVX512DQ		ebx
+# define reg_RTM		ebx
+# define reg_FMA		ecx
+# define reg_FMA4		ecx
+# define reg_POPCOUNT		ecx
+# define reg_OSXSAVE		ecx
 
 # define index_Fast_Rep_String		FEATURE_INDEX_1
 # define index_Fast_Copy_Backward	FEATURE_INDEX_1
@@ -172,18 +232,9 @@ extern const struct cpu_features *__get_cpu_features (void)
 # define index_Slow_SSE4_2		FEATURE_INDEX_1
 # define index_AVX2_Usable		FEATURE_INDEX_1
 # define index_AVX_Fast_Unaligned_Load	FEATURE_INDEX_1
+# define index_AVX512F_Usable		FEATURE_INDEX_1
+# define index_AVX512DQ_Usable		FEATURE_INDEX_1
 
-# define HAS_ARCH_FEATURE(name) \
-  ((__get_cpu_features ()->feature[index_##name] & (bit_##name)) != 0)
+#endif	/* !__ASSEMBLER__ */
 
-# define HAS_FAST_REP_STRING		HAS_ARCH_FEATURE (Fast_Rep_String)
-# define HAS_FAST_COPY_BACKWARD		HAS_ARCH_FEATURE (Fast_Copy_Backward)
-# define HAS_SLOW_BSF			HAS_ARCH_FEATURE (Slow_BSF)
-# define HAS_FAST_UNALIGNED_LOAD	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
-# define HAS_AVX			HAS_ARCH_FEATURE (AVX_Usable)
-# define HAS_AVX2			HAS_ARCH_FEATURE (AVX2_Usable)
-# define HAS_FMA			HAS_ARCH_FEATURE (FMA_Usable)
-# define HAS_FMA4			HAS_ARCH_FEATURE (FMA4_Usable)
-# define HAS_AVX_FAST_UNALIGNED_LOAD	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
-
-#endif	/* __ASSEMBLER__ */
+#endif  /* cpu_features_h */
diff --git a/sysdeps/x86/dl-get-cpu-features.c b/sysdeps/x86/dl-get-cpu-features.c
new file mode 100644
index 0000000..080e5e8
--- /dev/null
+++ b/sysdeps/x86/dl-get-cpu-features.c
@@ -0,0 +1,27 @@
+/* This file is part of the GNU C Library.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+
+#include <ldsodefs.h>
+
+#undef __get_cpu_features
+
+const struct cpu_features *
+__get_cpu_features (void)
+{
+  return &GLRO(dl_x86_cpu_features);
+}
diff --git a/sysdeps/i386/ldsodefs.h b/sysdeps/x86/libc-start.c
similarity index 50%
copy from sysdeps/i386/ldsodefs.h
copy to sysdeps/x86/libc-start.c
index d80cf01..9f0c045 100644
--- a/sysdeps/i386/ldsodefs.h
+++ b/sysdeps/x86/libc-start.c
@@ -1,5 +1,4 @@
-/* Run-time dynamic linker data structures for loaded ELF shared objects.
-   Copyright (C) 1995-2015 Free Software Foundation, Inc.
+/* Copyright (C) 2015 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -16,25 +15,27 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#ifndef	_I386_LDSODEFS_H
-#define	_I386_LDSODEFS_H	1
-
-#include <elf.h>
-
-struct La_i86_regs;
-struct La_i86_retval;
-
-#define ARCH_PLTENTER_MEMBERS						\
-    Elf32_Addr (*i86_gnu_pltenter) (Elf32_Sym *, unsigned int, uintptr_t *, \
-				    uintptr_t *, struct La_i86_regs *,	\
-				    unsigned int *, const char *name,	\
-				    long int *framesizep)
-
-#define ARCH_PLTEXIT_MEMBERS						\
-    unsigned int (*i86_gnu_pltexit) (Elf32_Sym *, unsigned int, uintptr_t *, \
-				     uintptr_t *, const struct La_i86_regs *, \
-				     struct La_i86_retval *, const char *)
-
-#include_next <ldsodefs.h>
-
+#ifdef SHARED
+# include <csu/libc-start.c>
+# else
+/* The main work is done in the generic function.  */
+# define LIBC_START_DISABLE_INLINE
+# define LIBC_START_MAIN generic_start_main
+# include <csu/libc-start.c>
+# include <cpu-features.h>
+# include <cpu-features.c>
+
+extern struct cpu_features _dl_x86_cpu_features;
+
+int
+__libc_start_main (int (*main) (int, char **, char ** MAIN_AUXVEC_DECL),
+		   int argc, char **argv,
+		   __typeof (main) init,
+		   void (*fini) (void),
+		   void (*rtld_fini) (void), void *stack_end)
+{
+  init_cpu_features (&_dl_x86_cpu_features);
+  return generic_start_main (main, argc, argv, init, fini, rtld_fini,
+			     stack_end);
+}
 #endif
diff --git a/sysdeps/x86/rtld-global-offsets.sym b/sysdeps/x86/rtld-global-offsets.sym
new file mode 100644
index 0000000..a9d53d1
--- /dev/null
+++ b/sysdeps/x86/rtld-global-offsets.sym
@@ -0,0 +1,7 @@
+#define SHARED 1
+
+#include <ldsodefs.h>
+
+#define rtld_global_ro_offsetof(mem) offsetof (struct rtld_global_ro, mem)
+
+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET rtld_global_ro_offsetof (_dl_x86_cpu_features)
diff --git a/sysdeps/x86/tst-get-cpu-features-static.c b/sysdeps/x86/tst-get-cpu-features-static.c
new file mode 100644
index 0000000..03f5906
--- /dev/null
+++ b/sysdeps/x86/tst-get-cpu-features-static.c
@@ -0,0 +1 @@
+#include "tst-get-cpu-features.c"
diff --git a/sysdeps/i386/ldsodefs.h b/sysdeps/x86/tst-get-cpu-features.c
similarity index 50%
copy from sysdeps/i386/ldsodefs.h
copy to sysdeps/x86/tst-get-cpu-features.c
index d80cf01..c17060f 100644
--- a/sysdeps/i386/ldsodefs.h
+++ b/sysdeps/x86/tst-get-cpu-features.c
@@ -1,5 +1,5 @@
-/* Run-time dynamic linker data structures for loaded ELF shared objects.
-   Copyright (C) 1995-2015 Free Software Foundation, Inc.
+/* Test case for x86 __get_cpu_features interface
+   Copyright (C) 2015 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -16,25 +16,16 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#ifndef	_I386_LDSODEFS_H
-#define	_I386_LDSODEFS_H	1
+#include <stdlib.h>
+#include <cpu-features.h>
 
-#include <elf.h>
+static int
+do_test (void)
+{
+  if (__get_cpu_features ()->kind == arch_kind_unknown)
+    abort ();
+  return 0;
+}
 
-struct La_i86_regs;
-struct La_i86_retval;
-
-#define ARCH_PLTENTER_MEMBERS						\
-    Elf32_Addr (*i86_gnu_pltenter) (Elf32_Sym *, unsigned int, uintptr_t *, \
-				    uintptr_t *, struct La_i86_regs *,	\
-				    unsigned int *, const char *name,	\
-				    long int *framesizep)
-
-#define ARCH_PLTEXIT_MEMBERS						\
-    unsigned int (*i86_gnu_pltexit) (Elf32_Sym *, unsigned int, uintptr_t *, \
-				     uintptr_t *, const struct La_i86_regs *, \
-				     struct La_i86_retval *, const char *)
-
-#include_next <ldsodefs.h>
-
-#endif
+#define TEST_FUNCTION do_test ()
+#include "../../test-skeleton.c"
diff --git a/sysdeps/x86_64/cacheinfo.c b/sysdeps/x86_64/cacheinfo.c
index f1cbf50..0ff5309 100644
--- a/sysdeps/x86_64/cacheinfo.c
+++ b/sysdeps/x86_64/cacheinfo.c
@@ -21,40 +21,11 @@
 #include <stdlib.h>
 #include <unistd.h>
 #include <cpuid.h>
+#include "multiarch/init-arch.h"
 
-#ifndef __cpuid_count
-/* FIXME: Provide __cpuid_count if it isn't defined.  Copied from gcc
-   4.4.0.  Remove this if gcc 4.4 is the minimum requirement.  */
-# if defined(__i386__) && defined(__PIC__)
-/* %ebx may be the PIC register.  */
-#  define __cpuid_count(level, count, a, b, c, d)		\
-  __asm__ ("xchg{l}\t{%%}ebx, %1\n\t"			\
-	   "cpuid\n\t"					\
-	   "xchg{l}\t{%%}ebx, %1\n\t"			\
-	   : "=a" (a), "=r" (b), "=c" (c), "=d" (d)	\
-	   : "0" (level), "2" (count))
-# else
-#  define __cpuid_count(level, count, a, b, c, d)		\
-  __asm__ ("cpuid\n\t"					\
-	   : "=a" (a), "=b" (b), "=c" (c), "=d" (d)	\
-	   : "0" (level), "2" (count))
-# endif
-#endif
-
-#ifdef USE_MULTIARCH
-# include "multiarch/init-arch.h"
-
-# define is_intel __cpu_features.kind == arch_kind_intel
-# define is_amd __cpu_features.kind == arch_kind_amd
-# define max_cpuid __cpu_features.max_cpuid
-#else
-  /* This spells out "GenuineIntel".  */
-# define is_intel \
-  ebx == 0x756e6547 && ecx == 0x6c65746e && edx == 0x49656e69
-  /* This spells out "AuthenticAMD".  */
-# define is_amd \
-  ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65
-#endif
+#define is_intel GLRO(dl_x86_cpu_features).kind == arch_kind_intel
+#define is_amd GLRO(dl_x86_cpu_features).kind == arch_kind_amd
+#define max_cpuid GLRO(dl_x86_cpu_features).max_cpuid
 
 static const struct intel_02_cache_info
 {
@@ -235,21 +206,8 @@ intel_check_word (int name, unsigned int value, bool *has_level_2,
 	      /* Intel reused this value.  For family 15, model 6 it
 		 specifies the 3rd level cache.  Otherwise the 2nd
 		 level cache.  */
-	      unsigned int family;
-	      unsigned int model;
-#ifdef USE_MULTIARCH
-	      family = __cpu_features.family;
-	      model = __cpu_features.model;
-#else
-	      unsigned int eax;
-	      unsigned int ebx;
-	      unsigned int ecx;
-	      unsigned int edx;
-	      __cpuid (1, eax, ebx, ecx, edx);
-
-	      family = ((eax >> 20) & 0xff) + ((eax >> 8) & 0xf);
-	      model = (((eax >>16) & 0xf) << 4) + ((eax >> 4) & 0xf);
-#endif
+	      unsigned int family = GLRO(dl_x86_cpu_features).family;
+	      unsigned int model = GLRO(dl_x86_cpu_features).model;
 
 	      if (family == 15 && model == 6)
 		{
@@ -476,18 +434,6 @@ long int
 attribute_hidden
 __cache_sysconf (int name)
 {
-#ifdef USE_MULTIARCH
-  if (__cpu_features.kind == arch_kind_unknown)
-    __init_cpu_features ();
-#else
-  /* Find out what brand of processor.  */
-  unsigned int max_cpuid;
-  unsigned int ebx;
-  unsigned int ecx;
-  unsigned int edx;
-  __cpuid (0, max_cpuid, ebx, ecx, edx);
-#endif
-
   if (is_intel)
     return handle_intel (name, max_cpuid);
 
@@ -523,18 +469,6 @@ long int __x86_raw_shared_cache_size attribute_hidden = 1024 * 1024;
 int __x86_prefetchw attribute_hidden;
 #endif
 
-#ifndef DISABLE_PREFERRED_MEMORY_INSTRUCTION
-/* Instructions preferred for memory and string routines.
-
-  0: Regular instructions
-  1: MMX instructions
-  2: SSE2 instructions
-  3: SSSE3 instructions
-
-  */
-int __x86_preferred_memory_instruction attribute_hidden;
-#endif
-
 
 static void
 __attribute__((constructor))
@@ -551,14 +485,6 @@ init_cacheinfo (void)
   unsigned int level;
   unsigned int threads = 0;
 
-#ifdef USE_MULTIARCH
-  if (__cpu_features.kind == arch_kind_unknown)
-    __init_cpu_features ();
-#else
-  int max_cpuid;
-  __cpuid (0, max_cpuid, ebx, ecx, edx);
-#endif
-
   if (is_intel)
     {
       data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, max_cpuid);
@@ -574,30 +500,13 @@ init_cacheinfo (void)
 	  shared = handle_intel (_SC_LEVEL2_CACHE_SIZE, max_cpuid);
 	}
 
-      unsigned int ebx_1;
-
-#ifdef USE_MULTIARCH
-      eax = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].eax;
-      ebx_1 = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].ebx;
-      ecx = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].ecx;
-      edx = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].edx;
-#else
-      __cpuid (1, eax, ebx_1, ecx, edx);
-#endif
-
-#ifndef DISABLE_PREFERRED_MEMORY_INSTRUCTION
-      /* Intel prefers SSSE3 instructions for memory/string routines
-	 if they are available.  */
-      if ((ecx & 0x200))
-	__x86_preferred_memory_instruction = 3;
-      else
-	__x86_preferred_memory_instruction = 2;
-#endif
-
       /* Figure out the number of logical threads that share the
 	 highest cache level.  */
       if (max_cpuid >= 4)
 	{
+	  unsigned int family = GLRO(dl_x86_cpu_features).family;
+	  unsigned int model = GLRO(dl_x86_cpu_features).model;
+
 	  int i = 0;
 
 	  /* Query until desired cache level is enumerated.  */
@@ -647,13 +556,33 @@ init_cacheinfo (void)
 		}
 	    }
 	  threads += 1;
+	  if (threads > 2 && level == 2 && family == 6)
+	    {
+	      switch (model)
+		{
+		case 0x57:
+		  /* Knights Landing has L2 cache shared by 2 cores.  */
+		case 0x37:
+		case 0x4a:
+		case 0x4d:
+		case 0x5a:
+		case 0x5d:
+		  /* Silvermont has L2 cache shared by 2 cores.  */
+		  threads = 2;
+		  break;
+		default:
+		  break;
+		}
+	    }
 	}
       else
 	{
 	intel_bug_no_cache_info:
 	  /* Assume that all logical threads share the highest cache level.  */
 
-	  threads = (ebx_1 >> 16) & 0xff;
+	  threads
+	    = ((GLRO(dl_x86_cpu_features).cpuid[COMMON_CPUID_INDEX_1].ebx
+		>> 16) & 0xff);
 	}
 
       /* Cap usage of highest cache level to the number of supported
@@ -668,25 +597,6 @@ init_cacheinfo (void)
       long int core = handle_amd (_SC_LEVEL2_CACHE_SIZE);
       shared = handle_amd (_SC_LEVEL3_CACHE_SIZE);
 
-#ifndef DISABLE_PREFERRED_MEMORY_INSTRUCTION
-# ifdef USE_MULTIARCH
-      eax = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].eax;
-      ebx = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].ebx;
-      ecx = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].ecx;
-      edx = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].edx;
-# else
-      __cpuid (1, eax, ebx, ecx, edx);
-# endif
-
-      /* AMD prefers SSSE3 instructions for memory/string routines
-	 if they are avaiable, otherwise it prefers integer
-	 instructions.  */
-      if ((ecx & 0x200))
-	__x86_preferred_memory_instruction = 3;
-      else
-	__x86_preferred_memory_instruction = 0;
-#endif
-
       /* Get maximum extended function. */
       __cpuid (0x80000000, max_cpuid_ex, ebx, ecx, edx);
 
diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h
index fd79275..3fa5e61 100644
--- a/sysdeps/x86_64/dl-machine.h
+++ b/sysdeps/x86_64/dl-machine.h
@@ -26,6 +26,7 @@
 #include <sysdep.h>
 #include <tls.h>
 #include <dl-tlsdesc.h>
+#include <cpu-features.c>
 
 /* Return nonzero iff ELF header is compatible with the running host.  */
 static inline int __attribute__ ((unused))
@@ -205,6 +206,8 @@ dl_platform_init (void)
   if (GLRO(dl_platform) != NULL && *GLRO(dl_platform) == '\0')
     /* Avoid an empty string which would disturb us.  */
     GLRO(dl_platform) = NULL;
+
+  init_cpu_features (&GLRO(dl_x86_cpu_features));
 }
 
 static inline ElfW(Addr)
diff --git a/sysdeps/i386/dl-procinfo.c b/sysdeps/x86_64/dl-procinfo.c
similarity index 61%
copy from sysdeps/i386/dl-procinfo.c
copy to sysdeps/x86_64/dl-procinfo.c
index b673b3c..851681a 100644
--- a/sysdeps/i386/dl-procinfo.c
+++ b/sysdeps/x86_64/dl-procinfo.c
@@ -1,7 +1,6 @@
-/* Data for i386 version of processor capability information.
-   Copyright (C) 2001-2015 Free Software Foundation, Inc.
+/* Data for x86-64 version of processor capability information.
+   Copyright (C) 2015 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
-   Contributed by Ulrich Drepper <drepper@redhat.com>, 2001.
 
    The GNU C Library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
@@ -17,10 +16,7 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-/* This information must be kept in sync with the _DL_HWCAP_COUNT and
-   _DL_PLATFORM_COUNT definitions in procinfo.h.
-
-   If anything should be added here check whether the size of each string
+/* If anything should be added here check whether the size of each string
    is still ok with the given array size.
 
    All the #ifdefs in the definitions are quite irritating but
@@ -44,33 +40,12 @@
 #endif
 
 #if !defined PROCINFO_DECL && defined SHARED
-  ._dl_x86_cap_flags
-#else
-PROCINFO_CLASS const char _dl_x86_cap_flags[32][8]
-#endif
-#ifndef PROCINFO_DECL
-= {
-    "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
-    "cx8", "apic", "10", "sep", "mtrr", "pge", "mca", "cmov",
-    "pat", "pse36", "pn", "clflush", "20", "dts", "acpi", "mmx",
-    "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe"
-  }
-#endif
-#if !defined SHARED || defined PROCINFO_DECL
-;
-#else
-,
-#endif
-
-#if !defined PROCINFO_DECL && defined SHARED
-  ._dl_x86_platforms
+  ._dl_x86_cpu_features
 #else
-PROCINFO_CLASS const char _dl_x86_platforms[4][5]
+PROCINFO_CLASS struct cpu_features _dl_x86_cpu_features
 #endif
 #ifndef PROCINFO_DECL
-= {
-    "i386", "i486", "i586", "i686"
-  }
+= { }
 #endif
 #if !defined SHARED || defined PROCINFO_DECL
 ;
diff --git a/sysdeps/x86_64/ldsodefs.h b/sysdeps/x86_64/ldsodefs.h
index 84d36e8..e3f2da2 100644
--- a/sysdeps/x86_64/ldsodefs.h
+++ b/sysdeps/x86_64/ldsodefs.h
@@ -20,6 +20,7 @@
 #define	_X86_64_LDSODEFS_H	1
 
 #include <elf.h>
+#include <cpu-features.h>
 
 struct La_x86_64_regs;
 struct La_x86_64_retval;
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index d7002a9..d10b4d4 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -1,5 +1,4 @@
 ifeq ($(subdir),csu)
-aux += init-arch
 tests += test-multiarch
 gen-as-const-headers += ifunc-defines.sym
 endif
diff --git a/sysdeps/x86_64/multiarch/Versions b/sysdeps/x86_64/multiarch/Versions
deleted file mode 100644
index 59b185a..0000000
--- a/sysdeps/x86_64/multiarch/Versions
+++ /dev/null
@@ -1,5 +0,0 @@
-libc {
-  GLIBC_PRIVATE {
-    __get_cpu_features;
-  }
-}
diff --git a/sysdeps/x86_64/multiarch/cacheinfo.c b/sysdeps/x86_64/multiarch/cacheinfo.c
deleted file mode 100644
index f87b8dc..0000000
--- a/sysdeps/x86_64/multiarch/cacheinfo.c
+++ /dev/null
@@ -1,2 +0,0 @@
-#define DISABLE_PREFERRED_MEMORY_INSTRUCTION
-#include "../cacheinfo.c"
diff --git a/sysdeps/x86_64/multiarch/ifunc-defines.sym b/sysdeps/x86_64/multiarch/ifunc-defines.sym
index a410d88..3df946f 100644
--- a/sysdeps/x86_64/multiarch/ifunc-defines.sym
+++ b/sysdeps/x86_64/multiarch/ifunc-defines.sym
@@ -4,7 +4,6 @@
 --
 
 CPU_FEATURES_SIZE	sizeof (struct cpu_features)
-KIND_OFFSET		offsetof (struct cpu_features, kind)
 CPUID_OFFSET		offsetof (struct cpu_features, cpuid)
 CPUID_SIZE		sizeof (struct cpuid_registers)
 CPUID_EAX_OFFSET	offsetof (struct cpuid_registers, eax)
diff --git a/sysdeps/x86_64/multiarch/init-arch.h b/sysdeps/x86_64/multiarch/init-arch.h
index e6b5ba5..2b9988e 100644
--- a/sysdeps/x86_64/multiarch/init-arch.h
+++ b/sysdeps/x86_64/multiarch/init-arch.h
@@ -15,175 +15,8 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#define bit_Fast_Rep_String		(1 << 0)
-#define bit_Fast_Copy_Backward		(1 << 1)
-#define bit_Slow_BSF			(1 << 2)
-#define bit_Fast_Unaligned_Load		(1 << 4)
-#define bit_Prefer_PMINUB_for_stringop	(1 << 5)
-#define bit_AVX_Usable			(1 << 6)
-#define bit_FMA_Usable			(1 << 7)
-#define bit_FMA4_Usable			(1 << 8)
-#define bit_Slow_SSE4_2			(1 << 9)
-#define bit_AVX2_Usable			(1 << 10)
-#define bit_AVX_Fast_Unaligned_Load	(1 << 11)
-
-/* CPUID Feature flags.  */
-
-/* COMMON_CPUID_INDEX_1.  */
-#define bit_SSE2	(1 << 26)
-#define bit_SSSE3	(1 << 9)
-#define bit_SSE4_1	(1 << 19)
-#define bit_SSE4_2	(1 << 20)
-#define bit_OSXSAVE	(1 << 27)
-#define bit_AVX		(1 << 28)
-#define bit_POPCOUNT	(1 << 23)
-#define bit_FMA		(1 << 12)
-#define bit_FMA4	(1 << 16)
-
-/* COMMON_CPUID_INDEX_7.  */
-#define bit_RTM		(1 << 11)
-#define bit_AVX2	(1 << 5)
-
-/* XCR0 Feature flags.  */
-#define bit_XMM_state  (1 << 1)
-#define bit_YMM_state  (2 << 1)
-
-/* The integer bit array index for the first set of internal feature bits.  */
-# define FEATURE_INDEX_1 0
-
-/* The current maximum size of the feature integer bit array.  */
-# define FEATURE_INDEX_MAX 1
-
-#ifdef	__ASSEMBLER__
-
-# include <ifunc-defines.h>
-
-# define index_SSE2	COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_EDX_OFFSET
-# define index_SSSE3	COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
-# define index_SSE4_1	COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
-# define index_SSE4_2	COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
-# define index_AVX	COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
-# define index_AVX2	COMMON_CPUID_INDEX_7*CPUID_SIZE+CPUID_EBX_OFFSET
-
-# define index_Fast_Rep_String		FEATURE_INDEX_1*FEATURE_SIZE
-# define index_Fast_Copy_Backward	FEATURE_INDEX_1*FEATURE_SIZE
-# define index_Slow_BSF			FEATURE_INDEX_1*FEATURE_SIZE
-# define index_Fast_Unaligned_Load	FEATURE_INDEX_1*FEATURE_SIZE
-# define index_Prefer_PMINUB_for_stringop FEATURE_INDEX_1*FEATURE_SIZE
-# define index_AVX_Usable		FEATURE_INDEX_1*FEATURE_SIZE
-# define index_FMA_Usable		FEATURE_INDEX_1*FEATURE_SIZE
-# define index_FMA4_Usable		FEATURE_INDEX_1*FEATURE_SIZE
-# define index_Slow_SSE4_2		FEATURE_INDEX_1*FEATURE_SIZE
-# define index_AVX2_Usable		FEATURE_INDEX_1*FEATURE_SIZE
-# define index_AVX_Fast_Unaligned_Load	FEATURE_INDEX_1*FEATURE_SIZE
-
-#else	/* __ASSEMBLER__ */
-
-# include <sys/param.h>
-
-enum
-  {
-    COMMON_CPUID_INDEX_1 = 0,
-    COMMON_CPUID_INDEX_7,
-    COMMON_CPUID_INDEX_80000001,	/* for AMD */
-    /* Keep the following line at the end.  */
-    COMMON_CPUID_INDEX_MAX
-  };
-
-extern struct cpu_features
-{
-  enum cpu_features_kind
-    {
-      arch_kind_unknown = 0,
-      arch_kind_intel,
-      arch_kind_amd,
-      arch_kind_other
-    } kind;
-  int max_cpuid;
-  struct cpuid_registers
-  {
-    unsigned int eax;
-    unsigned int ebx;
-    unsigned int ecx;
-    unsigned int edx;
-  } cpuid[COMMON_CPUID_INDEX_MAX];
-  unsigned int family;
-  unsigned int model;
-  unsigned int feature[FEATURE_INDEX_MAX];
-} __cpu_features attribute_hidden;
-
-
-extern void __init_cpu_features (void) attribute_hidden;
-# define INIT_ARCH() \
-  do							\
-    if (__cpu_features.kind == arch_kind_unknown)	\
-      __init_cpu_features ();				\
-  while (0)
-
-/* Used from outside libc.so to get access to the CPU features structure.  */
-extern const struct cpu_features *__get_cpu_features (void)
-     __attribute__ ((const));
-
-# if IS_IN (libc)
-#  define __get_cpu_features()	(&__cpu_features)
-# endif
-
-# define HAS_CPU_FEATURE(idx, reg, bit) \
-  ((__get_cpu_features ()->cpuid[idx].reg & (bit)) != 0)
-
-/* Following are the feature tests used throughout libc.  */
-
-/* CPUID_* evaluates to true if the feature flag is enabled.
-   We always use &__cpu_features because the HAS_CPUID_* macros
-   are called only within __init_cpu_features, where we can't
-   call __get_cpu_features without infinite recursion.  */
-# define HAS_CPUID_FLAG(idx, reg, bit) \
-  (((&__cpu_features)->cpuid[idx].reg & (bit)) != 0)
-
-# define CPUID_OSXSAVE \
-  HAS_CPUID_FLAG (COMMON_CPUID_INDEX_1, ecx, bit_OSXSAVE)
-# define CPUID_AVX \
-  HAS_CPUID_FLAG (COMMON_CPUID_INDEX_1, ecx, bit_AVX)
-# define CPUID_FMA \
-  HAS_CPUID_FLAG (COMMON_CPUID_INDEX_1, ecx, bit_FMA)
-# define CPUID_FMA4 \
-  HAS_CPUID_FLAG (COMMON_CPUID_INDEX_80000001, ecx, bit_FMA4)
-# define CPUID_RTM \
-  HAS_CPUID_FLAG (COMMON_CPUID_INDEX_7, ebx, bit_RTM)
-# define CPUID_AVX2 \
-  HAS_CPUID_FLAG (COMMON_CPUID_INDEX_7, ebx, bit_AVX2)
-
-/* HAS_* evaluates to true if we may use the feature at runtime.  */
-# define HAS_SSE2	HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, edx, bit_SSE2)
-# define HAS_POPCOUNT	HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, bit_POPCOUNT)
-# define HAS_SSSE3	HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, bit_SSSE3)
-# define HAS_SSE4_1	HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, bit_SSE4_1)
-# define HAS_SSE4_2	HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, bit_SSE4_2)
-# define HAS_RTM	HAS_CPU_FEATURE (COMMON_CPUID_INDEX_7, ebx, bit_RTM)
-
-# define index_Fast_Rep_String		FEATURE_INDEX_1
-# define index_Fast_Copy_Backward	FEATURE_INDEX_1
-# define index_Slow_BSF			FEATURE_INDEX_1
-# define index_Fast_Unaligned_Load	FEATURE_INDEX_1
-# define index_Prefer_PMINUB_for_stringop FEATURE_INDEX_1
-# define index_AVX_Usable		FEATURE_INDEX_1
-# define index_FMA_Usable		FEATURE_INDEX_1
-# define index_FMA4_Usable		FEATURE_INDEX_1
-# define index_Slow_SSE4_2		FEATURE_INDEX_1
-# define index_AVX2_Usable		FEATURE_INDEX_1
-# define index_AVX_Fast_Unaligned_Load	FEATURE_INDEX_1
-
-# define HAS_ARCH_FEATURE(name) \
-  ((__get_cpu_features ()->feature[index_##name] & (bit_##name)) != 0)
-
-# define HAS_FAST_REP_STRING		HAS_ARCH_FEATURE (Fast_Rep_String)
-# define HAS_FAST_COPY_BACKWARD		HAS_ARCH_FEATURE (Fast_Copy_Backward)
-# define HAS_SLOW_BSF			HAS_ARCH_FEATURE (Slow_BSF)
-# define HAS_FAST_UNALIGNED_LOAD	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
-# define HAS_AVX			HAS_ARCH_FEATURE (AVX_Usable)
-# define HAS_AVX2			HAS_ARCH_FEATURE (AVX2_Usable)
-# define HAS_FMA			HAS_ARCH_FEATURE (FMA_Usable)
-# define HAS_FMA4			HAS_ARCH_FEATURE (FMA4_Usable)
-# define HAS_AVX_FAST_UNALIGNED_LOAD	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
-
-#endif	/* __ASSEMBLER__ */
+#ifdef  __ASSEMBLER__
+# include <cpu-features.h>
+#else
+# include <ldsodefs.h>
+#endif

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=fdd3aa0b19700ab564da895a2a85bf3fdb0bedf0

commit fdd3aa0b19700ab564da895a2a85bf3fdb0bedf0
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Thu Jul 9 09:30:09 2015 -0700

    Improve bndmov encoding with zero displacement
    
    If the x86-64 assembler doesn't support MPX, we encode the bndmov
    instruction by hand.  When the displacement is zero, the assembler
    generates a shorter encoding.  This patch uses that shorter encoding
    for a zero displacement in the hand-assembled bytes as well, so that
    ld.so is identical whether it is built with an assembler with or
    without MPX support.
    
    	* sysdeps/x86_64/dl-trampoline.S (_dl_runtime_resolve): Improve
    	bndmov encoding with zero displacement.
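
The byte saving comes from the ModRM addressing mode: with a zero
displacement the mod field can be 00 (SIB byte, no displacement byte)
instead of 01 (SIB byte plus disp8), so the hand-assembled instruction
is one byte shorter, matching what an MPX-aware assembler emits for
bndmov %bnd0, (%rsp).  A byte-level sketch of the two forms from the
hunk below (the 0x10 displacement is an arbitrary example):

/* bndmov %bnd0, (%rsp): mod=00, SIB only, no displacement byte.  */
static const unsigned char bndmov_no_disp[] =
  { 0x66, 0x0f, 0x1b, 0x04, 0x24 };
/* bndmov %bnd0, 16(%rsp): mod=01, SIB plus one disp8 byte.  */
static const unsigned char bndmov_disp8[] =
  { 0x66, 0x0f, 0x1b, 0x44, 0x24, 0x10 };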

diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
index b151d35..678c57f 100644
--- a/sysdeps/x86_64/dl-trampoline.S
+++ b/sysdeps/x86_64/dl-trampoline.S
@@ -80,7 +80,11 @@ _dl_runtime_resolve:
 	bndmov %bnd2, REGISTER_SAVE_BND2(%rsp)
 	bndmov %bnd3, REGISTER_SAVE_BND3(%rsp)
 # else
+#  if REGISTER_SAVE_BND0 == 0
+	.byte 0x66,0x0f,0x1b,0x04,0x24
+#  else
 	.byte 0x66,0x0f,0x1b,0x44,0x24,REGISTER_SAVE_BND0
+#  endif
 	.byte 0x66,0x0f,0x1b,0x4c,0x24,REGISTER_SAVE_BND1
 	.byte 0x66,0x0f,0x1b,0x54,0x24,REGISTER_SAVE_BND2
 	.byte 0x66,0x0f,0x1b,0x5c,0x24,REGISTER_SAVE_BND3
@@ -104,7 +108,11 @@ _dl_runtime_resolve:
 	.byte 0x66,0x0f,0x1a,0x5c,0x24,REGISTER_SAVE_BND3
 	.byte 0x66,0x0f,0x1a,0x54,0x24,REGISTER_SAVE_BND2
 	.byte 0x66,0x0f,0x1a,0x4c,0x24,REGISTER_SAVE_BND1
+#  if REGISTER_SAVE_BND0 == 0
+	.byte 0x66,0x0f,0x1a,0x04,0x24
+#  else
 	.byte 0x66,0x0f,0x1a,0x44,0x24,REGISTER_SAVE_BND0
+#  endif
 # endif
 #endif
 	# Get register content back.

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=9e4fc7f67f6695f294ae759a6b540f5d08a56f0c

commit 9e4fc7f67f6695f294ae759a6b540f5d08a56f0c
Author: Igor Zamyatin <igor.zamyatin@intel.com>
Date:   Thu Jul 9 06:50:12 2015 -0700

    Preserve bound registers for pointer pass/return
    
    We need to save/restore the bound registers and add a BND prefix
    before branches in _dl_runtime_profile so that the bound registers
    used for pointer passing and returning are preserved when LD_AUDIT
    is used.
    
    	[BZ #18134]
    	* sysdeps/i386/configure.ac: Set HAVE_MPX_SUPPORT.
    	* sysdeps/i386/configure: Regenerated.
    	* sysdeps/i386/dl-trampoline.S (PRESERVE_BND_REGS_PREFIX): New.
    	(_dl_runtime_profile): Save and restore Intel MPX return bound
    	registers when calling _dl_call_pltexit.  Add
    	PRESERVE_BND_REGS_PREFIX before return.
    	* sysdeps/i386/link-defines.sym (LRV_BND0_OFFSET): New.
    	(LRV_BND1_OFFSET): Likewise.
    	* sysdeps/x86/bits/link.h (La_i86_retval): Add lrv_bnd0 and
    	lrv_bnd1.
    	* sysdeps/x86_64/dl-trampoline.S (_dl_runtime_profile): Fix
    	typo in bndmov encoding.
    	* sysdeps/x86_64/dl-trampoline.h: Properly save and restore
    	Intel MPX bound registers.  Add PRESERVE_BND_REGS_PREFIX before
    	branch instructions to preserve bounds.
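
For background: with MPX, the bounds of a pointer argument or return
value travel in %bnd0-%bnd3 alongside the pointer itself, and (with the
default BNDPRESERVE=0 configuration) any call, jump, or return without
the BND (0xf2) prefix resets those registers to unbounded INIT values.
A profiled PLT call detours through _dl_runtime_profile and
_dl_call_pltexit, so without the saves/restores and prefixes added
here, bounds checks downstream of an audited call would be silently
disabled.  A hypothetical sketch of the kind of overflow this keeps
catchable:

/* Hypothetical MPX-instrumented caller: the compiler passes buf's
   bounds in %bnd0 next to the pointer.  If the audit trampoline
   resets %bnd0, the bounds check inside fill can no longer fault
   on the overflow.  */
extern void fill (char *buf, int n);	/* resolved through the PLT */

void
caller (void)
{
  char buf[16];
  fill (buf, 32);	/* out-of-bounds write MPX should catch */
}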

diff --git a/sysdeps/i386/configure b/sysdeps/i386/configure
index 6e89b59..ab66c08 100644
--- a/sysdeps/i386/configure
+++ b/sysdeps/i386/configure
@@ -240,6 +240,33 @@ $as_echo "$libc_cv_cc_novzeroupper" >&6; }
 config_vars="$config_vars
 config-cflags-novzeroupper = $libc_cv_cc_novzeroupper"
 
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for Intel MPX support" >&5
+$as_echo_n "checking for Intel MPX support... " >&6; }
+if ${libc_cv_asm_mpx+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat > conftest.s <<\EOF
+        bndmov %bnd0,(%esp)
+EOF
+if { ac_try='${CC-cc} -c $ASFLAGS conftest.s 1>&5'
+  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
+  (eval $ac_try) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; }; then
+  libc_cv_asm_mpx=yes
+else
+  libc_cv_asm_mpx=no
+fi
+rm -f conftest*
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_asm_mpx" >&5
+$as_echo "$libc_cv_asm_mpx" >&6; }
+if test $libc_cv_asm_mpx = yes; then
+  $as_echo "#define HAVE_MPX_SUPPORT 1" >>confdefs.h
+
+fi
+
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for AVX2 support" >&5
 $as_echo_n "checking for AVX2 support... " >&6; }
 if ${libc_cv_cc_avx2+:} false; then :
diff --git a/sysdeps/i386/configure.ac b/sysdeps/i386/configure.ac
index 35c4522..a3f3067 100644
--- a/sysdeps/i386/configure.ac
+++ b/sysdeps/i386/configure.ac
@@ -88,6 +88,21 @@ LIBC_TRY_CC_OPTION([-mno-vzeroupper],
 ])
 LIBC_CONFIG_VAR([config-cflags-novzeroupper], [$libc_cv_cc_novzeroupper])
 
+dnl Check whether asm supports Intel MPX
+AC_CACHE_CHECK(for Intel MPX support, libc_cv_asm_mpx, [dnl
+cat > conftest.s <<\EOF
+        bndmov %bnd0,(%esp)
+EOF
+if AC_TRY_COMMAND(${CC-cc} -c $ASFLAGS conftest.s 1>&AS_MESSAGE_LOG_FD); then
+  libc_cv_asm_mpx=yes
+else
+  libc_cv_asm_mpx=no
+fi
+rm -f conftest*])
+if test $libc_cv_asm_mpx = yes; then
+  AC_DEFINE(HAVE_MPX_SUPPORT)
+fi
+
 dnl Check if -mavx2 works.
 AC_CACHE_CHECK(for AVX2 support, libc_cv_cc_avx2, [dnl
 LIBC_TRY_CC_OPTION([-mavx2], [libc_cv_cc_avx2=yes], [libc_cv_cc_avx2=no])
diff --git a/sysdeps/i386/dl-trampoline.S b/sysdeps/i386/dl-trampoline.S
index 7c72b03..8a2fd8d 100644
--- a/sysdeps/i386/dl-trampoline.S
+++ b/sysdeps/i386/dl-trampoline.S
@@ -19,6 +19,12 @@
 #include <sysdep.h>
 #include <link-defines.h>
 
+#ifdef HAVE_MPX_SUPPORT
+# define PRESERVE_BND_REGS_PREFIX bnd
+#else
+# define PRESERVE_BND_REGS_PREFIX .byte 0xf2
+#endif
+
 	.text
 	.globl _dl_runtime_resolve
 	.type _dl_runtime_resolve, @function
@@ -172,6 +178,13 @@ _dl_runtime_profile:
 	movl %edx, LRV_EDX_OFFSET(%esp)
 	fstpt LRV_ST0_OFFSET(%esp)
 	fstpt LRV_ST1_OFFSET(%esp)
+#ifdef HAVE_MPX_SUPPORT
+	bndmov %bnd0, LRV_BND0_OFFSET(%esp)
+	bndmov %bnd1, LRV_BND1_OFFSET(%esp)
+#else
+	.byte 0x66,0x0f,0x1b,0x44,0x24,LRV_BND0_OFFSET
+	.byte 0x66,0x0f,0x1b,0x4c,0x24,LRV_BND1_OFFSET
+#endif
 	pushl %esp
 	cfi_adjust_cfa_offset (4)
 	# Address of La_i86_regs area.
@@ -185,9 +198,17 @@ _dl_runtime_profile:
 	movl LRV_EDX_OFFSET(%esp), %edx
 	fldt LRV_ST1_OFFSET(%esp)
 	fldt LRV_ST0_OFFSET(%esp)
+#ifdef HAVE_MPX_SUPPORT
+	bndmov LRV_BND0_OFFSET(%esp), %bnd0
+	bndmov LRV_BND1_OFFSET(%esp), %bnd1
+#else
+	.byte 0x66,0x0f,0x1a,0x44,0x24,LRV_BND0_OFFSET
+	.byte 0x66,0x0f,0x1a,0x4c,0x24,LRV_BND1_OFFSET
+#endif
 	# Restore stack before return.
 	addl $(LRV_SIZE + 4 + LR_SIZE + 4), %esp
 	cfi_adjust_cfa_offset (-(LRV_SIZE + 4 + LR_SIZE + 4))
+	PRESERVE_BND_REGS_PREFIX
 	ret
 	cfi_endproc
 	.size _dl_runtime_profile, .-_dl_runtime_profile
diff --git a/sysdeps/i386/link-defines.sym b/sysdeps/i386/link-defines.sym
index a63dcb9..0995adb 100644
--- a/sysdeps/i386/link-defines.sym
+++ b/sysdeps/i386/link-defines.sym
@@ -16,3 +16,5 @@ LRV_EAX_OFFSET		offsetof (struct La_i86_retval, lrv_eax)
 LRV_EDX_OFFSET		offsetof (struct La_i86_retval, lrv_edx)
 LRV_ST0_OFFSET		offsetof (struct La_i86_retval, lrv_st0)
 LRV_ST1_OFFSET		offsetof (struct La_i86_retval, lrv_st1)
+LRV_BND0_OFFSET		offsetof (struct La_i86_retval, lrv_bnd0)
+LRV_BND1_OFFSET		offsetof (struct La_i86_retval, lrv_bnd1)
diff --git a/sysdeps/x86/bits/link.h b/sysdeps/x86/bits/link.h
index 3f559c9..0bf9b9a 100644
--- a/sysdeps/x86/bits/link.h
+++ b/sysdeps/x86/bits/link.h
@@ -38,6 +38,8 @@ typedef struct La_i86_retval
   uint32_t lrv_edx;
   long double lrv_st0;
   long double lrv_st1;
+  uint64_t lrv_bnd0;
+  uint64_t lrv_bnd1;
 } La_i86_retval;
 
 
diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
index 5f9b35d..b151d35 100644
--- a/sysdeps/x86_64/dl-trampoline.S
+++ b/sysdeps/x86_64/dl-trampoline.S
@@ -206,8 +206,8 @@ _dl_runtime_profile:
 #  else
 	.byte 0x66,0x0f,0x1b,0x84,0x24;.long (LR_BND_OFFSET)
 	.byte 0x66,0x0f,0x1b,0x8c,0x24;.long (LR_BND_OFFSET + BND_SIZE)
-	.byte 0x66,0x0f,0x1b,0x84,0x24;.long (LR_BND_OFFSET + BND_SIZE*2)
-	.byte 0x66,0x0f,0x1b,0x8c,0x24;.long (LR_BND_OFFSET + BND_SIZE*3)
+	.byte 0x66,0x0f,0x1b,0x94,0x24;.long (LR_BND_OFFSET + BND_SIZE*2)
+	.byte 0x66,0x0f,0x1b,0x9c,0x24;.long (LR_BND_OFFSET + BND_SIZE*3)
 #  endif
 # endif
 
diff --git a/sysdeps/x86_64/dl-trampoline.h b/sysdeps/x86_64/dl-trampoline.h
index 0e5a6fb..d542428 100644
--- a/sysdeps/x86_64/dl-trampoline.h
+++ b/sysdeps/x86_64/dl-trampoline.h
@@ -63,20 +63,6 @@
 	movaps (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp), %xmm6
 	movaps (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp), %xmm7
 
-#ifndef __ILP32__
-# ifdef HAVE_MPX_SUPPORT
-	bndmov 		    (LR_BND_OFFSET)(%rsp), %bnd0  # Restore bound
-	bndmov (LR_BND_OFFSET +   BND_SIZE)(%rsp), %bnd1  # registers.
-	bndmov (LR_BND_OFFSET + BND_SIZE*2)(%rsp), %bnd2
-	bndmov (LR_BND_OFFSET + BND_SIZE*3)(%rsp), %bnd3
-# else
-	.byte 0x66,0x0f,0x1a,0x84,0x24;.long (LR_BND_OFFSET)
-	.byte 0x66,0x0f,0x1a,0x8c,0x24;.long (LR_BND_OFFSET + BND_SIZE)
-	.byte 0x66,0x0f,0x1a,0x94,0x24;.long (LR_BND_OFFSET + BND_SIZE*2)
-	.byte 0x66,0x0f,0x1a,0x9c,0x24;.long (LR_BND_OFFSET + BND_SIZE*3)
-# endif
-#endif
-
 #ifdef RESTORE_AVX
 	/* Check if any xmm0-xmm7 registers are changed by audit
 	   module.  */
@@ -154,8 +140,24 @@
 
 1:
 #endif
+
+#ifndef __ILP32__
+# ifdef HAVE_MPX_SUPPORT
+	bndmov              (LR_BND_OFFSET)(%rsp), %bnd0  # Restore bound
+	bndmov (LR_BND_OFFSET +   BND_SIZE)(%rsp), %bnd1  # registers.
+	bndmov (LR_BND_OFFSET + BND_SIZE*2)(%rsp), %bnd2
+	bndmov (LR_BND_OFFSET + BND_SIZE*3)(%rsp), %bnd3
+# else
+	.byte 0x66,0x0f,0x1a,0x84,0x24;.long (LR_BND_OFFSET)
+	.byte 0x66,0x0f,0x1a,0x8c,0x24;.long (LR_BND_OFFSET + BND_SIZE)
+	.byte 0x66,0x0f,0x1a,0x94,0x24;.long (LR_BND_OFFSET + BND_SIZE*2)
+	.byte 0x66,0x0f,0x1a,0x9c,0x24;.long (LR_BND_OFFSET + BND_SIZE*3)
+# endif
+#endif
+
 	mov  16(%rbx), %R10_LP	# Anything in framesize?
 	test %R10_LP, %R10_LP
+	PRESERVE_BND_REGS_PREFIX
 	jns 3f
 
 	/* There's nothing in the frame size, so there
@@ -174,6 +176,7 @@
 	addq $48, %rsp		# Adjust the stack to the return value
 				# (eats the reloc index and link_map)
 	cfi_adjust_cfa_offset(-48)
+	PRESERVE_BND_REGS_PREFIX
 	jmp *%r11		# Jump to function address.
 
 3:
@@ -200,6 +203,7 @@
 	movq 32(%rdi), %rsi
 	movq 40(%rdi), %rdi
 
+	PRESERVE_BND_REGS_PREFIX
 	call *%r11
 
 	mov 24(%rbx), %rsp	# Drop the copied stack content
@@ -280,11 +284,11 @@
 
 #ifndef __ILP32__
 # ifdef HAVE_MPX_SUPPORT
-	bndmov LRV_BND0_OFFSET(%rcx), %bnd0  # Restore bound registers.
-	bndmov LRV_BND1_OFFSET(%rcx), %bnd1
+	bndmov LRV_BND0_OFFSET(%rsp), %bnd0  # Restore bound registers.
+	bndmov LRV_BND1_OFFSET(%rsp), %bnd1
 # else
-	.byte  0x66,0x0f,0x1a,0x81;.long (LRV_BND0_OFFSET)
-	.byte  0x66,0x0f,0x1a,0x89;.long (LRV_BND1_OFFSET)
+	.byte  0x66,0x0f,0x1a,0x84,0x24;.long (LRV_BND0_OFFSET)
+	.byte  0x66,0x0f,0x1a,0x8c,0x24;.long (LRV_BND1_OFFSET)
 # endif
 #endif
 
@@ -299,6 +303,7 @@
 	addq $48, %rsp		# Adjust the stack to the return value
 				# (eats the reloc index and link_map)
 	cfi_adjust_cfa_offset(-48)
+	PRESERVE_BND_REGS_PREFIX
 	retq
 
 #ifdef MORE_CODE

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=b15f277bac14482c8b2dda4931ebf919644932fa

commit b15f277bac14482c8b2dda4931ebf919644932fa
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Mon Mar 16 14:58:43 2015 -0700

    Preserve bound registers in _dl_runtime_resolve
    
    We need to add a BND prefix before the indirect branch at the end of
    _dl_runtime_resolve to preserve the bound registers: on MPX-enabled
    processors a near branch without the BND (0xf2) prefix resets
    BND0-BND3 to their INIT state.
    
    	[BZ #18134]
    	* sysdeps/x86_64/dl-trampoline.S (PRESERVE_BND_REGS_PREFIX): New.
    	(_dl_runtime_resolve): Add a BND prefix before indirect branch.

diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
index 394441d..5f9b35d 100644
--- a/sysdeps/x86_64/dl-trampoline.S
+++ b/sysdeps/x86_64/dl-trampoline.S
@@ -30,6 +30,7 @@
 /* X32 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX.  */
 # define REGISTER_SAVE_AREA	(8 * 7)
 # define REGISTER_SAVE_RAX	0
+# define PRESERVE_BND_REGS_PREFIX
 #else
 /* X86-64 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as BND0,
    BND1, BND2, BND3.  */
@@ -40,6 +41,11 @@
 # define REGISTER_SAVE_BND2	(REGISTER_SAVE_BND1 + 16)
 # define REGISTER_SAVE_BND3	(REGISTER_SAVE_BND2 + 16)
 # define REGISTER_SAVE_RAX	(REGISTER_SAVE_BND3 + 16)
+# ifdef HAVE_MPX_SUPPORT
+#  define PRESERVE_BND_REGS_PREFIX bnd
+# else
+#  define PRESERVE_BND_REGS_PREFIX .byte 0xf2
+# endif
 #endif
 #define REGISTER_SAVE_RCX	(REGISTER_SAVE_RAX + 8)
 #define REGISTER_SAVE_RDX	(REGISTER_SAVE_RCX + 8)
@@ -112,6 +118,8 @@ _dl_runtime_resolve:
 	# Adjust stack(PLT did 2 pushes)
 	addq $(REGISTER_SAVE_AREA + 16), %rsp
 	cfi_adjust_cfa_offset(-(REGISTER_SAVE_AREA + 16))
+	# Preserve bound registers.
+	PRESERVE_BND_REGS_PREFIX
 	jmp *%r11		# Jump to function address.
 	cfi_endproc
 	.size _dl_runtime_resolve, .-_dl_runtime_resolve

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=20f87a053b1e6a9106851f764c205d59fd4b6879

commit 20f87a053b1e6a9106851f764c205d59fd4b6879
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Tue Jul 7 05:23:24 2015 -0700

    Add and use sysdeps/i386/link-defines.sym
    
    Define macros for the fields of La_i86_regs and La_i86_retval and
    use them in dl-trampoline.S instead of hardcoded values.
    
    	* sysdeps/i386/Makefile (gen-as-const-headers)[elf]: Add
    	link-defines.sym.
    	* sysdeps/i386/dl-trampoline.S: Include <link-defines.h>.
    	(_dl_runtime_profile): Use LONG_DOUBLE_SIZE, LRV_SIZE,
    	LRV_EAX_OFFSET, LRV_EDX_OFFSET, LRV_ST0_OFFSET, LRV_ST1_OFFSET
    	and LR_SIZE.
    	* sysdeps/i386/link-defines.sym: New file.
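
    The .sym file is processed by the gen-as-const machinery into
    <link-defines.h>, turning each expression into an assembler-visible
    constant derived from the C structure layout.  A quick way to
    sanity-check the values on i386 (an illustrative program, not part
    of the commit):

    #include <stddef.h>
    #include <stdio.h>
    #include <link.h>

    int
    main (void)
    {
      /* These mirror the expressions in link-defines.sym; on i386,
         <link.h> provides La_i86_regs and La_i86_retval.  */
      printf ("LR_SIZE        = %zu\n", sizeof (La_i86_regs));
      printf ("LRV_SIZE       = %zu\n", sizeof (La_i86_retval));
      printf ("LRV_EAX_OFFSET = %zu\n",
              offsetof (La_i86_retval, lrv_eax));
      printf ("LRV_ST0_OFFSET = %zu\n",
              offsetof (La_i86_retval, lrv_st0));
      return 0;
    }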

diff --git a/sysdeps/i386/Makefile b/sysdeps/i386/Makefile
index 922be06..9f12190 100644
--- a/sysdeps/i386/Makefile
+++ b/sysdeps/i386/Makefile
@@ -33,6 +33,7 @@ sysdep-CFLAGS += -mpreferred-stack-boundary=4
 else
 ifeq ($(subdir),csu)
 sysdep-CFLAGS += -mpreferred-stack-boundary=4
+gen-as-const-headers += link-defines.sym
 else
 # Likewise, any function which calls user callbacks
 uses-callbacks += -mpreferred-stack-boundary=4
diff --git a/sysdeps/i386/dl-trampoline.S b/sysdeps/i386/dl-trampoline.S
index f11972c..7c72b03 100644
--- a/sysdeps/i386/dl-trampoline.S
+++ b/sysdeps/i386/dl-trampoline.S
@@ -17,6 +17,7 @@
    <http://www.gnu.org/licenses/>.  */
 
 #include <sysdep.h>
+#include <link-defines.h>
 
 	.text
 	.globl _dl_runtime_resolve
@@ -161,24 +162,32 @@ _dl_runtime_profile:
 	    +4      free
 	   %esp     free
 	*/
-	subl $20, %esp
-	cfi_adjust_cfa_offset (20)
-	movl %eax, (%esp)
-	movl %edx, 4(%esp)
-	fstpt 8(%esp)
-	fstpt 20(%esp)
+#if LONG_DOUBLE_SIZE != 12
+# error "long double size must be 12 bytes"
+#endif
+	# Allocate space for La_i86_retval and subtract 12 free bytes.
+	subl $(LRV_SIZE - 12), %esp
+	cfi_adjust_cfa_offset (LRV_SIZE - 12)
+	movl %eax, LRV_EAX_OFFSET(%esp)
+	movl %edx, LRV_EDX_OFFSET(%esp)
+	fstpt LRV_ST0_OFFSET(%esp)
+	fstpt LRV_ST1_OFFSET(%esp)
 	pushl %esp
 	cfi_adjust_cfa_offset (4)
-	leal 36(%esp), %ecx
-	movl 56(%esp), %eax
-	movl 60(%esp), %edx
+	# Address of La_i86_regs area.
+	leal (LRV_SIZE + 4)(%esp), %ecx
+	# PLT2
+	movl (LRV_SIZE + 4 + LR_SIZE)(%esp), %eax
+	# PLT1
+	movl (LRV_SIZE + 4 + LR_SIZE + 4)(%esp), %edx
 	call _dl_call_pltexit
-	movl (%esp), %eax
-	movl 4(%esp), %edx
-	fldt 20(%esp)
-	fldt 8(%esp)
-	addl $60, %esp
-	cfi_adjust_cfa_offset (-60)
+	movl LRV_EAX_OFFSET(%esp), %eax
+	movl LRV_EDX_OFFSET(%esp), %edx
+	fldt LRV_ST1_OFFSET(%esp)
+	fldt LRV_ST0_OFFSET(%esp)
+	# Restore stack before return.
+	addl $(LRV_SIZE + 4 + LR_SIZE + 4), %esp
+	cfi_adjust_cfa_offset (-(LRV_SIZE + 4 + LR_SIZE + 4))
 	ret
 	cfi_endproc
 	.size _dl_runtime_profile, .-_dl_runtime_profile
diff --git a/sysdeps/i386/link-defines.sym b/sysdeps/i386/link-defines.sym
new file mode 100644
index 0000000..a63dcb9
--- /dev/null
+++ b/sysdeps/i386/link-defines.sym
@@ -0,0 +1,18 @@
+#include "link.h"
+#include <stddef.h>
+
+--
+LONG_DOUBLE_SIZE	sizeof (long double)
+
+LR_SIZE			sizeof (struct La_i86_regs)
+LR_EDX_OFFSET		offsetof (struct La_i86_regs, lr_edx)
+LR_ECX_OFFSET		offsetof (struct La_i86_regs, lr_ecx)
+LR_EAX_OFFSET		offsetof (struct La_i86_regs, lr_eax)
+LR_EBP_OFFSET		offsetof (struct La_i86_regs, lr_ebp)
+LR_ESP_OFFSET		offsetof (struct La_i86_regs, lr_esp)
+
+LRV_SIZE		sizeof (struct La_i86_retval)
+LRV_EAX_OFFSET		offsetof (struct La_i86_retval, lrv_eax)
+LRV_EDX_OFFSET		offsetof (struct La_i86_retval, lrv_edx)
+LRV_ST0_OFFSET		offsetof (struct La_i86_retval, lrv_st0)
+LRV_ST1_OFFSET		offsetof (struct La_i86_retval, lrv_st1)

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=d6fd297566e2389ab855325754bcb25f604c6b7d

commit d6fd297566e2389ab855325754bcb25f604c6b7d
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Tue Jul 7 05:09:16 2015 -0700

    Add a testcase for i386 LD_AUDIT
    
    This patch adds a testcase for i386 LD_AUDIT to check function
    return values and parameters passed in registers.
    
    	* sysdeps/i386/Makefile (tests)[elf]: Add tst-audit3.
    	(modules-names): Add tst-auditmod3a tst-auditmod3b.
    	($(objpfx)tst-audit3): New rule.
    	($(objpfx)tst-audit3.out): Likewise.
    	* sysdeps/i386/tst-audit3.c: New file.
    	* sysdeps/i386/tst-audit3.h: Likewise.
    	* sysdeps/i386/tst-auditmod3a.c: Likewise.
    	* sysdeps/i386/tst-auditmod3b.c: Likewise.
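
    A standalone sketch of the calling convention the test exercises
    (a hypothetical demo, not one of the files above): with
    __attribute__ ((regparm (3))) on i386, GCC passes the first three
    integer arguments in %eax, %edx and %ecx, which is what the audit
    module's lr_eax/lr_edx/lr_ecx checks rely on.

    #include <stdio.h>

    /* i386-only: regparm moves the first three int args to registers. */
    static int __attribute__ ((regparm (3)))
    sum3 (int a, int b, int c)      /* a: %eax, b: %edx, c: %ecx */
    {
      return a + b + c;
    }

    int
    main (void)
    {
      printf ("%d\n", sum3 (1, 2, 3));   /* prints 6 */
      return 0;
    }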

diff --git a/sysdeps/i386/Makefile b/sysdeps/i386/Makefile
index e1deece..922be06 100644
--- a/sysdeps/i386/Makefile
+++ b/sysdeps/i386/Makefile
@@ -68,6 +68,13 @@ ifeq ($(subdir),elf)
 sysdep-dl-routines += tlsdesc dl-tlsdesc
 sysdep_routines += tlsdesc dl-tlsdesc
 sysdep-rtld-routines += tlsdesc dl-tlsdesc
+
+tests += tst-audit3
+modules-names += tst-auditmod3a tst-auditmod3b
+
+$(objpfx)tst-audit3: $(objpfx)tst-auditmod3a.so
+$(objpfx)tst-audit3.out: $(objpfx)tst-auditmod3b.so
+tst-audit3-ENV = LD_AUDIT=$(objpfx)tst-auditmod3b.so
 endif
 
 ifeq ($(subdir),csu)
diff --git a/sysdeps/i386/tst-audit3.c b/sysdeps/i386/tst-audit3.c
new file mode 100644
index 0000000..cb2fb90
--- /dev/null
+++ b/sysdeps/i386/tst-audit3.c
@@ -0,0 +1,37 @@
+/* Test case for i386 preserved registers in dynamic linker.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <stdlib.h>
+#include "tst-audit3.h"
+
+static int
+do_test (void)
+{
+  long long ll = audit1_test (1, 2, 3);
+  if (ll != 30)
+    abort ();
+
+  float f = audit2_test (1, 2, 3);
+  if (f != 30)
+    abort ();
+
+  return 0;
+}
+
+#define TEST_FUNCTION do_test ()
+#include "../../test-skeleton.c"
diff --git a/sysdeps/i386/tst-audit3.h b/sysdeps/i386/tst-audit3.h
new file mode 100644
index 0000000..9395605
--- /dev/null
+++ b/sysdeps/i386/tst-audit3.h
@@ -0,0 +1,20 @@
+/* Test case for i386 preserved registers in dynamic linker.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+extern long long audit1_test (int, int, int) __attribute__ ((regparm(3)));
+extern float audit2_test (int, int, int) __attribute__ ((regparm(3)));
diff --git a/sysdeps/i386/tst-auditmod3a.c b/sysdeps/i386/tst-auditmod3a.c
new file mode 100644
index 0000000..72fcfa7
--- /dev/null
+++ b/sysdeps/i386/tst-auditmod3a.c
@@ -0,0 +1,38 @@
+/* Test case for i386 preserved registers in dynamic linker.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <stdlib.h>
+#include "tst-audit3.h"
+
+long long
+__attribute__ ((regparm(3)))
+audit1_test (int i, int j, int k)
+{
+  if (i != 1 || j != 2 || k != 3)
+    abort ();
+  return 30;
+}
+
+float
+__attribute__ ((regparm(3)))
+audit2_test (int i, int j, int k)
+{
+  if (i != 1 || j != 2 || k != 3)
+    abort ();
+  return 30;
+}
diff --git a/sysdeps/i386/tst-auditmod3b.c b/sysdeps/i386/tst-auditmod3b.c
new file mode 100644
index 0000000..5b26a13
--- /dev/null
+++ b/sysdeps/i386/tst-auditmod3b.c
@@ -0,0 +1,185 @@
+/* Test case for i386 preserved registers in dynamic linker.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <dlfcn.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <string.h>
+#include <unistd.h>
+#include <link.h>
+#include <bits/wordsize.h>
+#include <gnu/lib-names.h>
+
+unsigned int
+la_version (unsigned int v)
+{
+  setlinebuf (stdout);
+
+  printf ("version: %u\n", v);
+
+  char buf[20];
+  sprintf (buf, "%u", v);
+
+  return v;
+}
+
+void
+la_activity (uintptr_t *cookie, unsigned int flag)
+{
+  const char *flagstr;
+  switch (flag)
+    {
+    case LA_ACT_CONSISTENT:
+      flagstr = "consistent";
+      break;
+    case LA_ACT_ADD:
+      flagstr = "add";
+      break;
+    case LA_ACT_DELETE:
+      flagstr = "delete";
+      break;
+    default:
+      printf ("activity: unknown activity %u\n", flag);
+      return;
+    }
+  printf ("activity: %s\n", flagstr);
+}
+
+char *
+la_objsearch (const char *name, uintptr_t *cookie, unsigned int flag)
+{
+  const char *flagstr;
+  switch (flag)
+    {
+    case LA_SER_ORIG:
+      flagstr = "LA_SET_ORIG";
+      break;
+    case LA_SER_LIBPATH:
+      flagstr = "LA_SER_LIBPATH";
+      break;
+    case LA_SER_RUNPATH:
+      flagstr = "LA_SER_RUNPATH";
+      break;
+    case LA_SER_CONFIG:
+      flagstr = "LA_SER_CONFIG";
+      break;
+    case LA_SER_DEFAULT:
+      flagstr = "LA_SER_DEFAULT";
+    case LA_SER_SECURE:
+      flagstr = "LA_SER_SECURE";
+      break;
+    default:
+      printf ("objsearch: %s, unknown flag %d\n", name, flag);
+      return (char *) name;
+    }
+
+  printf ("objsearch: %s, %s\n", name, flagstr);
+  return (char *) name;
+}
+
+unsigned int
+la_objopen (struct link_map *l, Lmid_t lmid, uintptr_t *cookie)
+{
+  printf ("objopen: %ld, %s\n", lmid, l->l_name);
+
+  return 3;
+}
+
+void
+la_preinit (uintptr_t *cookie)
+{
+  printf ("preinit\n");
+}
+
+unsigned int
+la_objclose  (uintptr_t *cookie)
+{
+  printf ("objclose\n");
+  return 0;
+}
+
+uintptr_t
+la_symbind32 (Elf32_Sym *sym, unsigned int ndx, uintptr_t *refcook,
+	      uintptr_t *defcook, unsigned int *flags, const char *symname)
+{
+  printf ("symbind32: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n",
+	  symname, (long int) sym->st_value, ndx, *flags);
+
+  return sym->st_value;
+}
+
+#include "tst-audit.h"
+
+ElfW(Addr)
+pltenter (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook,
+	  uintptr_t *defcook, La_regs *regs, unsigned int *flags,
+	  const char *symname, long int *framesizep)
+{
+  printf ("pltenter: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n",
+	  symname, (long int) sym->st_value, ndx, *flags);
+
+  if (strcmp (symname, "audit1_test") == 0
+      || strcmp (symname, "audit2_test") == 0)
+    {
+      if (regs->lr_eax != 1
+	  || regs->lr_edx != 2
+	  || regs->lr_ecx != 3)
+	abort ();
+
+      *framesizep = 200;
+    }
+
+  return sym->st_value;
+}
+
+unsigned int
+pltexit (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook,
+	 uintptr_t *defcook, const La_regs *inregs, La_retval *outregs,
+	 const char *symname)
+{
+  printf ("pltexit: symname=%s, st_value=%#lx, ndx=%u, retval=%tu\n",
+	  symname, (long int) sym->st_value, ndx,
+	  (ptrdiff_t) outregs->int_retval);
+
+  if (strcmp (symname, "audit1_test") == 0
+      || strcmp (symname, "audit2_test") == 0)
+    {
+      if (inregs->lr_eax != 1
+	  || inregs->lr_edx != 2
+	  || inregs->lr_ecx != 3)
+	abort ();
+
+      if (strcmp (symname, "audit1_test") == 0)
+	{
+	  long long x = ((unsigned long long) outregs->lrv_eax
+			 | (unsigned long long) outregs->lrv_edx << 32);
+
+	  if (x != 30)
+	    abort ();
+	}
+      else if (strcmp (symname, "audit2_test") == 0)
+	{
+	  if (outregs->lrv_st0 != 30)
+	    abort ();
+	}
+    }
+
+  return 0;
+}

-----------------------------------------------------------------------


hooks/post-receive
-- 
GNU C Library master sources

