This is the mail archive of the
glibc-cvs@sourceware.org
mailing list for the glibc project.
GNU C Library master sources branch hjl/plt/master created. glibc-2.21-684-gf6db06d
- From: hjl at sourceware dot org
- To: glibc-cvs at sourceware dot org
- Date: 3 Aug 2015 19:31:57 -0000
- Subject: GNU C Library master sources branch hjl/plt/master created. glibc-2.21-684-gf6db06d
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".
The branch, hjl/plt/master, has been created
at f6db06d3805c47ca7f0fd0bed17d32abb02689d3 (commit)
- Log -----------------------------------------------------------------
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=f6db06d3805c47ca7f0fd0bed17d32abb02689d3
commit f6db06d3805c47ca7f0fd0bed17d32abb02689d3
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Sun Aug 2 22:27:47 2015 -0700
Don't run tst-getpid2 with LD_BIND_NOW=1
Since _dl_x86_64_save_sse and _dl_x86_64_restore_sse are removed now,
we don't need to run tst-getpid2 with LD_BIND_NOW=1.
* sysdeps/unix/sysv/linux/Makefile (tst-getpid2-ENV): Removed.
diff --git a/sysdeps/unix/sysv/linux/Makefile b/sysdeps/unix/sysv/linux/Makefile
index bfbabd4..2c67a66 100644
--- a/sysdeps/unix/sysv/linux/Makefile
+++ b/sysdeps/unix/sysv/linux/Makefile
@@ -193,9 +193,4 @@ endif
ifeq ($(subdir),nptl)
tests += tst-setgetname tst-align-clone tst-getpid1 tst-getpid2
-
-# In this test, we create a CLONE_VM "thread" that shares TLS storage
-# with the original thread. Both threads then race in ld.so with lazy PLT
-# resolution. Avoid this race by disabling lazy binding. BZ #11214.
-tst-getpid2-ENV = LD_BIND_NOW=1
endif
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=e9df5c18b1424d0b2c6e5f848fbd0ffa4f0ccf13
commit e9df5c18b1424d0b2c6e5f848fbd0ffa4f0ccf13
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Wed Jul 29 04:49:38 2015 -0700
Use SSE optimized strcmp in x86-64 ld.so
Since ld.so preserves vector registers now, we can use SSE optimized strcmp
in x86-64 ld.so.
* sysdeps/x86_64/strcmp.S: Remove "#if !IS_IN (libc)".
diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S
index 1329649..1624b5d 100644
--- a/sysdeps/x86_64/strcmp.S
+++ b/sysdeps/x86_64/strcmp.S
@@ -29,13 +29,6 @@
#endif
#ifdef USE_AS_STRNCMP
-/* The simplified code below is not set up to handle strncmp() so far.
- Should this become necessary it has to be implemented. For now
- just report the problem. */
-# if !IS_IN (libc)
-# error "strncmp not implemented so far"
-# endif
-
/* Since the counter, %r11, is unsigned, we branch to strcmp_exitz
if the new counter > the old one or is 0. */
# define UPDATE_STRNCMP_COUNTER \
@@ -50,20 +43,10 @@
#elif defined USE_AS_STRCASECMP_L
# include "locale-defines.h"
-/* No support for strcasecmp outside libc so far since it is not needed. */
-# if !IS_IN (libc)
-# error "strcasecmp_l not implemented so far"
-# endif
-
# define UPDATE_STRNCMP_COUNTER
#elif defined USE_AS_STRNCASECMP_L
# include "locale-defines.h"
-/* No support for strncasecmp outside libc so far since it is not needed. */
-# if !IS_IN (libc)
-# error "strncasecmp_l not implemented so far"
-# endif
-
# define UPDATE_STRNCMP_COUNTER \
/* calculate left number to compare */ \
lea -16(%rcx, %r11), %r9; \
@@ -126,63 +109,44 @@ libc_hidden_def (__strncasecmp)
#endif
ENTRY (STRCMP)
-#if !IS_IN (libc)
-/* Simple version since we can't use SSE registers in ld.so. */
-L(oop): movb (%rdi), %al
- cmpb (%rsi), %al
- jne L(neq)
- incq %rdi
- incq %rsi
- testb %al, %al
- jnz L(oop)
-
- xorl %eax, %eax
- ret
-
-L(neq): movl $1, %eax
- movl $-1, %ecx
- cmovbl %ecx, %eax
- ret
-END (STRCMP)
-#else /* !IS_IN (libc) */
-# ifdef USE_AS_STRCASECMP_L
+#ifdef USE_AS_STRCASECMP_L
/* We have to fall back on the C implementation for locales
with encodings not matching ASCII for single bytes. */
-# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rdx), %RAX_LP
-# else
+# else
mov (%rdx), %RAX_LP
-# endif
+# endif
testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
jne __strcasecmp_l_nonascii
-# elif defined USE_AS_STRNCASECMP_L
+#elif defined USE_AS_STRNCASECMP_L
/* We have to fall back on the C implementation for locales
with encodings not matching ASCII for single bytes. */
-# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rcx), %RAX_LP
-# else
+# else
mov (%rcx), %RAX_LP
-# endif
+# endif
testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
jne __strncasecmp_l_nonascii
-# endif
+#endif
/*
* This implementation uses SSE to compare up to 16 bytes at a time.
*/
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
test %rdx, %rdx
je LABEL(strcmp_exitz)
cmp $1, %rdx
je LABEL(Byte0)
mov %rdx, %r11
-# endif
+#endif
mov %esi, %ecx
mov %edi, %eax
/* Use 64bit AND here to avoid long NOP padding. */
and $0x3f, %rcx /* rsi alignment in cache line */
and $0x3f, %rax /* rdi alignment in cache line */
-# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
.section .rodata.cst16,"aM",@progbits,16
.align 16
.Lbelowupper:
@@ -196,12 +160,12 @@ END (STRCMP)
.quad 0x2020202020202020
.previous
movdqa .Lbelowupper(%rip), %xmm5
-# define UCLOW_reg %xmm5
+# define UCLOW_reg %xmm5
movdqa .Ltopupper(%rip), %xmm6
-# define UCHIGH_reg %xmm6
+# define UCHIGH_reg %xmm6
movdqa .Ltouppermask(%rip), %xmm7
-# define LCQWORD_reg %xmm7
-# endif
+# define LCQWORD_reg %xmm7
+#endif
cmp $0x30, %ecx
ja LABEL(crosscache) /* rsi: 16-byte load will cross cache line */
cmp $0x30, %eax
@@ -210,8 +174,8 @@ END (STRCMP)
movlpd (%rsi), %xmm2
movhpd 8(%rdi), %xmm1
movhpd 8(%rsi), %xmm2
-# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
-# define TOLOWER(reg1, reg2) \
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# define TOLOWER(reg1, reg2) \
movdqa reg1, %xmm8; \
movdqa UCHIGH_reg, %xmm9; \
movdqa reg2, %xmm10; \
@@ -227,9 +191,9 @@ END (STRCMP)
por %xmm8, reg1; \
por %xmm10, reg2
TOLOWER (%xmm1, %xmm2)
-# else
-# define TOLOWER(reg1, reg2)
-# endif
+#else
+# define TOLOWER(reg1, reg2)
+#endif
pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */
pcmpeqb %xmm1, %xmm0 /* Any null chars? */
pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */
@@ -237,10 +201,10 @@ END (STRCMP)
pmovmskb %xmm1, %edx
sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */
jnz LABEL(less16bytes) /* If not, find different value or null char */
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
sub $16, %r11
jbe LABEL(strcmp_exitz) /* finish comparision */
-# endif
+#endif
add $16, %rsi /* prepare to search next 16 bytes */
add $16, %rdi /* prepare to search next 16 bytes */
@@ -282,13 +246,13 @@ LABEL(ashr_0):
movdqa (%rsi), %xmm1
pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */
pcmpeqb %xmm1, %xmm0 /* Any null chars? */
-# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */
-# else
+#else
movdqa (%rdi), %xmm2
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm2, %xmm1 /* compare 16 bytes for equality */
-# endif
+#endif
psubb %xmm0, %xmm1 /* packed sub of comparison results*/
pmovmskb %xmm1, %r9d
shr %cl, %edx /* adjust 0xffff for offset */
@@ -321,10 +285,10 @@ LABEL(loop_ashr_0):
sub $0xffff, %edx
jnz LABEL(exit) /* mismatch or null char seen */
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
sub $16, %r11
jbe LABEL(strcmp_exitz)
-# endif
+#endif
add $16, %rcx
movdqa (%rsi, %rcx), %xmm1
movdqa (%rdi, %rcx), %xmm2
@@ -336,10 +300,10 @@ LABEL(loop_ashr_0):
pmovmskb %xmm1, %edx
sub $0xffff, %edx
jnz LABEL(exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
sub $16, %r11
jbe LABEL(strcmp_exitz)
-# endif
+#endif
add $16, %rcx
jmp LABEL(loop_ashr_0)
@@ -388,13 +352,13 @@ LABEL(gobble_ashr_1):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4 /* store for next cycle */
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
psrldq $1, %xmm3
pslldq $15, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-# else
+#else
palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */
-# endif
+#endif
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -404,10 +368,10 @@ LABEL(gobble_ashr_1):
sub $0xffff, %edx
jnz LABEL(exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
sub $16, %r11
jbe LABEL(strcmp_exitz)
-# endif
+#endif
add $16, %rcx
movdqa %xmm4, %xmm3
@@ -418,13 +382,13 @@ LABEL(gobble_ashr_1):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4 /* store for next cycle */
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
psrldq $1, %xmm3
pslldq $15, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-# else
+#else
palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */
-# endif
+#endif
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -434,10 +398,10 @@ LABEL(gobble_ashr_1):
sub $0xffff, %edx
jnz LABEL(exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
sub $16, %r11
jbe LABEL(strcmp_exitz)
-# endif
+#endif
add $16, %rcx
movdqa %xmm4, %xmm3
jmp LABEL(loop_ashr_1)
@@ -453,10 +417,10 @@ LABEL(nibble_ashr_1):
test $0xfffe, %edx
jnz LABEL(ashr_1_exittail) /* find null char*/
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
cmp $15, %r11
jbe LABEL(ashr_1_exittail)
-# endif
+#endif
pxor %xmm0, %xmm0
sub $0x1000, %r10 /* substract 4K from %r10 */
@@ -518,13 +482,13 @@ LABEL(gobble_ashr_2):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
psrldq $2, %xmm3
pslldq $14, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-# else
+#else
palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */
-# endif
+#endif
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -534,10 +498,10 @@ LABEL(gobble_ashr_2):
sub $0xffff, %edx
jnz LABEL(exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
sub $16, %r11
jbe LABEL(strcmp_exitz)
-# endif
+#endif
add $16, %rcx
movdqa %xmm4, %xmm3
@@ -549,13 +513,13 @@ LABEL(gobble_ashr_2):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
psrldq $2, %xmm3
pslldq $14, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-# else
+#else
palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */
-# endif
+#endif
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -565,10 +529,10 @@ LABEL(gobble_ashr_2):
sub $0xffff, %edx
jnz LABEL(exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
sub $16, %r11
jbe LABEL(strcmp_exitz)
-# endif
+#endif
add $16, %rcx
movdqa %xmm4, %xmm3
@@ -581,10 +545,10 @@ LABEL(nibble_ashr_2):
test $0xfffc, %edx
jnz LABEL(ashr_2_exittail)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
cmp $14, %r11
jbe LABEL(ashr_2_exittail)
-# endif
+#endif
pxor %xmm0, %xmm0
sub $0x1000, %r10
@@ -643,13 +607,13 @@ LABEL(gobble_ashr_3):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
psrldq $3, %xmm3
pslldq $13, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-# else
+#else
palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */
-# endif
+#endif
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -659,10 +623,10 @@ LABEL(gobble_ashr_3):
sub $0xffff, %edx
jnz LABEL(exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
sub $16, %r11
jbe LABEL(strcmp_exitz)
-# endif
+#endif
add $16, %rcx
movdqa %xmm4, %xmm3
@@ -674,13 +638,13 @@ LABEL(gobble_ashr_3):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
psrldq $3, %xmm3
pslldq $13, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-# else
+#else
palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */
-# endif
+#endif
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -690,10 +654,10 @@ LABEL(gobble_ashr_3):
sub $0xffff, %edx
jnz LABEL(exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
sub $16, %r11
jbe LABEL(strcmp_exitz)
-# endif
+#endif
add $16, %rcx
movdqa %xmm4, %xmm3
@@ -706,10 +670,10 @@ LABEL(nibble_ashr_3):
test $0xfff8, %edx
jnz LABEL(ashr_3_exittail)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
cmp $13, %r11
jbe LABEL(ashr_3_exittail)
-# endif
+#endif
pxor %xmm0, %xmm0
sub $0x1000, %r10
@@ -768,13 +732,13 @@ LABEL(gobble_ashr_4):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
psrldq $4, %xmm3
pslldq $12, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-# else
+#else
palignr $4, %xmm3, %xmm2 /* merge into one 16byte value */
-# endif
+#endif
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -784,10 +748,10 @@ LABEL(gobble_ashr_4):
sub $0xffff, %edx
jnz LABEL(exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
sub $16, %r11
jbe LABEL(strcmp_exitz)
-# endif
+#endif
add $16, %rcx
movdqa %xmm4, %xmm3
@@ -799,13 +763,13 @@ LABEL(gobble_ashr_4):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
psrldq $4, %xmm3
pslldq $12, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-# else
+#else
palignr $4, %xmm3, %xmm2 /* merge into one 16byte value */
-# endif
+#endif
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -815,10 +779,10 @@ LABEL(gobble_ashr_4):
sub $0xffff, %edx
jnz LABEL(exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
sub $16, %r11
jbe LABEL(strcmp_exitz)
-# endif
+#endif
add $16, %rcx
movdqa %xmm4, %xmm3
@@ -831,10 +795,10 @@ LABEL(nibble_ashr_4):
test $0xfff0, %edx
jnz LABEL(ashr_4_exittail)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
cmp $12, %r11
jbe LABEL(ashr_4_exittail)
-# endif
+#endif
pxor %xmm0, %xmm0
sub $0x1000, %r10
@@ -893,13 +857,13 @@ LABEL(gobble_ashr_5):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
psrldq $5, %xmm3
pslldq $11, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-# else
+#else
palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */
-# endif
+#endif
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -909,10 +873,10 @@ LABEL(gobble_ashr_5):
sub $0xffff, %edx
jnz LABEL(exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
sub $16, %r11
jbe LABEL(strcmp_exitz)
-# endif
+#endif
add $16, %rcx
movdqa %xmm4, %xmm3
@@ -924,13 +888,13 @@ LABEL(gobble_ashr_5):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
psrldq $5, %xmm3
pslldq $11, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-# else
+#else
palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */
-# endif
+#endif
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -940,10 +904,10 @@ LABEL(gobble_ashr_5):
sub $0xffff, %edx
jnz LABEL(exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
sub $16, %r11
jbe LABEL(strcmp_exitz)
-# endif
+#endif
add $16, %rcx
movdqa %xmm4, %xmm3
@@ -956,10 +920,10 @@ LABEL(nibble_ashr_5):
test $0xffe0, %edx
jnz LABEL(ashr_5_exittail)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
cmp $11, %r11
jbe LABEL(ashr_5_exittail)
-# endif
+#endif
pxor %xmm0, %xmm0
sub $0x1000, %r10
@@ -1018,13 +982,13 @@ LABEL(gobble_ashr_6):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
psrldq $6, %xmm3
pslldq $10, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-# else
+#else
palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */
-# endif
+#endif
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1034,10 +998,10 @@ LABEL(gobble_ashr_6):
sub $0xffff, %edx
jnz LABEL(exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
sub $16, %r11
jbe LABEL(strcmp_exitz)
-# endif
+#endif
add $16, %rcx
movdqa %xmm4, %xmm3
@@ -1049,13 +1013,13 @@ LABEL(gobble_ashr_6):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
psrldq $6, %xmm3
pslldq $10, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-# else
+#else
palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */
-# endif
+#endif
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1065,10 +1029,10 @@ LABEL(gobble_ashr_6):
sub $0xffff, %edx
jnz LABEL(exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
sub $16, %r11
jbe LABEL(strcmp_exitz)
-# endif
+#endif
add $16, %rcx
movdqa %xmm4, %xmm3
@@ -1081,10 +1045,10 @@ LABEL(nibble_ashr_6):
test $0xffc0, %edx
jnz LABEL(ashr_6_exittail)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
cmp $10, %r11
jbe LABEL(ashr_6_exittail)
-# endif
+#endif
pxor %xmm0, %xmm0
sub $0x1000, %r10
@@ -1143,13 +1107,13 @@ LABEL(gobble_ashr_7):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
psrldq $7, %xmm3
pslldq $9, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-# else
+#else
palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */
-# endif
+#endif
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1159,10 +1123,10 @@ LABEL(gobble_ashr_7):
sub $0xffff, %edx
jnz LABEL(exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
sub $16, %r11
jbe LABEL(strcmp_exitz)
-# endif
+#endif
add $16, %rcx
movdqa %xmm4, %xmm3
@@ -1174,13 +1138,13 @@ LABEL(gobble_ashr_7):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
psrldq $7, %xmm3
pslldq $9, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-# else
+#else
palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */
-# endif
+#endif
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1190,10 +1154,10 @@ LABEL(gobble_ashr_7):
sub $0xffff, %edx
jnz LABEL(exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
sub $16, %r11
jbe LABEL(strcmp_exitz)
-# endif
+#endif
add $16, %rcx
movdqa %xmm4, %xmm3
@@ -1206,10 +1170,10 @@ LABEL(nibble_ashr_7):
test $0xff80, %edx
jnz LABEL(ashr_7_exittail)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
cmp $9, %r11
jbe LABEL(ashr_7_exittail)
-# endif
+#endif
pxor %xmm0, %xmm0
sub $0x1000, %r10
@@ -1268,13 +1232,13 @@ LABEL(gobble_ashr_8):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
psrldq $8, %xmm3
pslldq $8, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-# else
+#else
palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */
-# endif
+#endif
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1284,10 +1248,10 @@ LABEL(gobble_ashr_8):
sub $0xffff, %edx
jnz LABEL(exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
sub $16, %r11
jbe LABEL(strcmp_exitz)
-# endif
+#endif
add $16, %rcx
movdqa %xmm4, %xmm3
@@ -1299,13 +1263,13 @@ LABEL(gobble_ashr_8):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
psrldq $8, %xmm3
pslldq $8, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-# else
+#else
palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */
-# endif
+#endif
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1315,10 +1279,10 @@ LABEL(gobble_ashr_8):
sub $0xffff, %edx
jnz LABEL(exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
sub $16, %r11
jbe LABEL(strcmp_exitz)
-# endif
+#endif
add $16, %rcx
movdqa %xmm4, %xmm3
@@ -1331,10 +1295,10 @@ LABEL(nibble_ashr_8):
test $0xff00, %edx
jnz LABEL(ashr_8_exittail)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
cmp $8, %r11
jbe LABEL(ashr_8_exittail)
-# endif
+#endif
pxor %xmm0, %xmm0
sub $0x1000, %r10
@@ -1393,13 +1357,13 @@ LABEL(gobble_ashr_9):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
psrldq $9, %xmm3
pslldq $7, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-# else
+#else
palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */
-# endif
+#endif
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1409,10 +1373,10 @@ LABEL(gobble_ashr_9):
sub $0xffff, %edx
jnz LABEL(exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
sub $16, %r11
jbe LABEL(strcmp_exitz)
-# endif
+#endif
add $16, %rcx
movdqa %xmm4, %xmm3
@@ -1424,13 +1388,13 @@ LABEL(gobble_ashr_9):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
psrldq $9, %xmm3
pslldq $7, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-# else
+#else
palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */
-# endif
+#endif
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1440,10 +1404,10 @@ LABEL(gobble_ashr_9):
sub $0xffff, %edx
jnz LABEL(exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
sub $16, %r11
jbe LABEL(strcmp_exitz)
-# endif
+#endif
add $16, %rcx
movdqa %xmm4, %xmm3 /* store for next cycle */
@@ -1456,10 +1420,10 @@ LABEL(nibble_ashr_9):
test $0xfe00, %edx
jnz LABEL(ashr_9_exittail)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
cmp $7, %r11
jbe LABEL(ashr_9_exittail)
-# endif
+#endif
pxor %xmm0, %xmm0
sub $0x1000, %r10
@@ -1518,13 +1482,13 @@ LABEL(gobble_ashr_10):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
psrldq $10, %xmm3
pslldq $6, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-# else
+#else
palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */
-# endif
+#endif
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1534,10 +1498,10 @@ LABEL(gobble_ashr_10):
sub $0xffff, %edx
jnz LABEL(exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
sub $16, %r11
jbe LABEL(strcmp_exitz)
-# endif
+#endif
add $16, %rcx
movdqa %xmm4, %xmm3
@@ -1549,13 +1513,13 @@ LABEL(gobble_ashr_10):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
psrldq $10, %xmm3
pslldq $6, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-# else
+#else
palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */
-# endif
+#endif
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1565,10 +1529,10 @@ LABEL(gobble_ashr_10):
sub $0xffff, %edx
jnz LABEL(exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
sub $16, %r11
jbe LABEL(strcmp_exitz)
-# endif
+#endif
add $16, %rcx
movdqa %xmm4, %xmm3
@@ -1581,10 +1545,10 @@ LABEL(nibble_ashr_10):
test $0xfc00, %edx
jnz LABEL(ashr_10_exittail)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
cmp $6, %r11
jbe LABEL(ashr_10_exittail)
-# endif
+#endif
pxor %xmm0, %xmm0
sub $0x1000, %r10
@@ -1643,13 +1607,13 @@ LABEL(gobble_ashr_11):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
psrldq $11, %xmm3
pslldq $5, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-# else
+#else
palignr $11, %xmm3, %xmm2 /* merge into one 16byte value */
-# endif
+#endif
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1659,10 +1623,10 @@ LABEL(gobble_ashr_11):
sub $0xffff, %edx
jnz LABEL(exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
sub $16, %r11
jbe LABEL(strcmp_exitz)
-# endif
+#endif
add $16, %rcx
movdqa %xmm4, %xmm3
@@ -1674,13 +1638,13 @@ LABEL(gobble_ashr_11):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
psrldq $11, %xmm3
pslldq $5, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-# else
+#else
palignr $11, %xmm3, %xmm2 /* merge into one 16byte value */
-# endif
+#endif
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1690,10 +1654,10 @@ LABEL(gobble_ashr_11):
sub $0xffff, %edx
jnz LABEL(exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
sub $16, %r11
jbe LABEL(strcmp_exitz)
-# endif
+#endif
add $16, %rcx
movdqa %xmm4, %xmm3
@@ -1706,10 +1670,10 @@ LABEL(nibble_ashr_11):
test $0xf800, %edx
jnz LABEL(ashr_11_exittail)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
cmp $5, %r11
jbe LABEL(ashr_11_exittail)
-# endif
+#endif
pxor %xmm0, %xmm0
sub $0x1000, %r10
@@ -1768,13 +1732,13 @@ LABEL(gobble_ashr_12):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
psrldq $12, %xmm3
pslldq $4, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-# else
+#else
palignr $12, %xmm3, %xmm2 /* merge into one 16byte value */
-# endif
+#endif
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1784,10 +1748,10 @@ LABEL(gobble_ashr_12):
sub $0xffff, %edx
jnz LABEL(exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
sub $16, %r11
jbe LABEL(strcmp_exitz)
-# endif
+#endif
add $16, %rcx
movdqa %xmm4, %xmm3
@@ -1799,13 +1763,13 @@ LABEL(gobble_ashr_12):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
psrldq $12, %xmm3
pslldq $4, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-# else
+#else
palignr $12, %xmm3, %xmm2 /* merge into one 16byte value */
-# endif
+#endif
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1815,10 +1779,10 @@ LABEL(gobble_ashr_12):
sub $0xffff, %edx
jnz LABEL(exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
sub $16, %r11
jbe LABEL(strcmp_exitz)
-# endif
+#endif
add $16, %rcx
movdqa %xmm4, %xmm3
@@ -1831,10 +1795,10 @@ LABEL(nibble_ashr_12):
test $0xf000, %edx
jnz LABEL(ashr_12_exittail)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
cmp $4, %r11
jbe LABEL(ashr_12_exittail)
-# endif
+#endif
pxor %xmm0, %xmm0
sub $0x1000, %r10
@@ -1893,13 +1857,13 @@ LABEL(gobble_ashr_13):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
psrldq $13, %xmm3
pslldq $3, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-# else
+#else
palignr $13, %xmm3, %xmm2 /* merge into one 16byte value */
-# endif
+#endif
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1909,10 +1873,10 @@ LABEL(gobble_ashr_13):
sub $0xffff, %edx
jnz LABEL(exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
sub $16, %r11
jbe LABEL(strcmp_exitz)
-# endif
+#endif
add $16, %rcx
movdqa %xmm4, %xmm3
@@ -1924,13 +1888,13 @@ LABEL(gobble_ashr_13):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
psrldq $13, %xmm3
pslldq $3, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-# else
+#else
palignr $13, %xmm3, %xmm2 /* merge into one 16byte value */
-# endif
+#endif
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1940,10 +1904,10 @@ LABEL(gobble_ashr_13):
sub $0xffff, %edx
jnz LABEL(exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
sub $16, %r11
jbe LABEL(strcmp_exitz)
-# endif
+#endif
add $16, %rcx
movdqa %xmm4, %xmm3
@@ -1956,10 +1920,10 @@ LABEL(nibble_ashr_13):
test $0xe000, %edx
jnz LABEL(ashr_13_exittail)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
cmp $3, %r11
jbe LABEL(ashr_13_exittail)
-# endif
+#endif
pxor %xmm0, %xmm0
sub $0x1000, %r10
@@ -2018,13 +1982,13 @@ LABEL(gobble_ashr_14):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
psrldq $14, %xmm3
pslldq $2, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-# else
+#else
palignr $14, %xmm3, %xmm2 /* merge into one 16byte value */
-# endif
+#endif
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -2034,10 +1998,10 @@ LABEL(gobble_ashr_14):
sub $0xffff, %edx
jnz LABEL(exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
sub $16, %r11
jbe LABEL(strcmp_exitz)
-# endif
+#endif
add $16, %rcx
movdqa %xmm4, %xmm3
@@ -2049,13 +2013,13 @@ LABEL(gobble_ashr_14):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
psrldq $14, %xmm3
pslldq $2, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-# else
+#else
palignr $14, %xmm3, %xmm2 /* merge into one 16byte value */
-# endif
+#endif
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -2065,10 +2029,10 @@ LABEL(gobble_ashr_14):
sub $0xffff, %edx
jnz LABEL(exit)
-# if defined USE_AS_STRNCMP | defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP | defined USE_AS_STRNCASECMP_L
sub $16, %r11
jbe LABEL(strcmp_exitz)
-# endif
+#endif
add $16, %rcx
movdqa %xmm4, %xmm3
@@ -2081,10 +2045,10 @@ LABEL(nibble_ashr_14):
test $0xc000, %edx
jnz LABEL(ashr_14_exittail)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
cmp $2, %r11
jbe LABEL(ashr_14_exittail)
-# endif
+#endif
pxor %xmm0, %xmm0
sub $0x1000, %r10
@@ -2145,13 +2109,13 @@ LABEL(gobble_ashr_15):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
psrldq $15, %xmm3
pslldq $1, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-# else
+#else
palignr $15, %xmm3, %xmm2 /* merge into one 16byte value */
-# endif
+#endif
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -2161,10 +2125,10 @@ LABEL(gobble_ashr_15):
sub $0xffff, %edx
jnz LABEL(exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
sub $16, %r11
jbe LABEL(strcmp_exitz)
-# endif
+#endif
add $16, %rcx
movdqa %xmm4, %xmm3
@@ -2176,13 +2140,13 @@ LABEL(gobble_ashr_15):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
psrldq $15, %xmm3
pslldq $1, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-# else
+#else
palignr $15, %xmm3, %xmm2 /* merge into one 16byte value */
-# endif
+#endif
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -2192,10 +2156,10 @@ LABEL(gobble_ashr_15):
sub $0xffff, %edx
jnz LABEL(exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
sub $16, %r11
jbe LABEL(strcmp_exitz)
-# endif
+#endif
add $16, %rcx
movdqa %xmm4, %xmm3
@@ -2208,10 +2172,10 @@ LABEL(nibble_ashr_15):
test $0x8000, %edx
jnz LABEL(ashr_15_exittail)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
cmpq $1, %r11
jbe LABEL(ashr_15_exittail)
-# endif
+#endif
pxor %xmm0, %xmm0
sub $0x1000, %r10
@@ -2246,18 +2210,18 @@ LABEL(ret):
LABEL(less16bytes):
bsf %rdx, %rdx /* find and store bit index in %rdx */
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
sub %rdx, %r11
jbe LABEL(strcmp_exitz)
-# endif
+#endif
movzbl (%rsi, %rdx), %ecx
movzbl (%rdi, %rdx), %eax
-# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
movl (%rdx,%rcx,4), %ecx
movl (%rdx,%rax,4), %eax
-# endif
+#endif
sub %ecx, %eax
ret
@@ -2271,11 +2235,11 @@ LABEL(Byte0):
movzx (%rsi), %ecx
movzx (%rdi), %eax
-# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
movl (%rdx,%rcx,4), %ecx
movl (%rdx,%rax,4), %eax
-# endif
+#endif
sub %ecx, %eax
ret
@@ -2300,5 +2264,4 @@ LABEL(unaligned_table):
.int LABEL(ashr_14) - LABEL(unaligned_table)
.int LABEL(ashr_15) - LABEL(unaligned_table)
.int LABEL(ashr_0) - LABEL(unaligned_table)
-#endif /* !IS_IN (libc) */
libc_hidden_builtin_def (STRCMP)
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=e54d30f31423f0e8c703a2dc0466bc4b492400d8
commit e54d30f31423f0e8c703a2dc0466bc4b492400d8
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Wed Jul 29 03:56:14 2015 -0700
Remove x86-64 rtld-xxx.c and rtld-xxx.S
Since ld.so preserves vector registers now, we can use the regular,
non-ifunc string and memory functions in ld.so.
* sysdeps/x86_64/rtld-memcmp.c: Removed.
* sysdeps/x86_64/rtld-memset.S: Likewise.
* sysdeps/x86_64/rtld-strchr.S: Likewise.
* sysdeps/x86_64/rtld-strlen.S: Likewise.
* sysdeps/x86_64/multiarch/rtld-memcmp.c: Likewise.
* sysdeps/x86_64/multiarch/rtld-memset.S: Likewise.
diff --git a/sysdeps/x86_64/multiarch/rtld-memcmp.c b/sysdeps/x86_64/multiarch/rtld-memcmp.c
deleted file mode 100644
index 0f27135..0000000
--- a/sysdeps/x86_64/multiarch/rtld-memcmp.c
+++ /dev/null
@@ -1 +0,0 @@
-#include "../rtld-memcmp.c"
diff --git a/sysdeps/x86_64/multiarch/rtld-memset.S b/sysdeps/x86_64/multiarch/rtld-memset.S
deleted file mode 100644
index 8092aa0..0000000
--- a/sysdeps/x86_64/multiarch/rtld-memset.S
+++ /dev/null
@@ -1 +0,0 @@
-#include "../rtld-memset.S"
diff --git a/sysdeps/x86_64/rtld-memcmp.c b/sysdeps/x86_64/rtld-memcmp.c
deleted file mode 100644
index 2ee4032..0000000
--- a/sysdeps/x86_64/rtld-memcmp.c
+++ /dev/null
@@ -1 +0,0 @@
-#include <string/memcmp.c>
diff --git a/sysdeps/x86_64/rtld-memset.S b/sysdeps/x86_64/rtld-memset.S
deleted file mode 100644
index f8df333..0000000
--- a/sysdeps/x86_64/rtld-memset.S
+++ /dev/null
@@ -1,37 +0,0 @@
-/* memset implementation for the dynamic linker. This is separate from the
- libc implementation to avoid writing to SSE registers.
- Copyright (C) 2013-2015 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-#include "asm-syntax.h"
-
-
- .text
-/* void *memset (void *dest, char c, size_t count)
- dest => %rdi
- c => %rsi
- count => %rdx */
-ENTRY (memset)
- mov %rdx, %rcx
- movzbl %sil, %eax
- mov %rdi, %rdx
- rep stosb
- mov %rdx, %rax
- ret
-END (memset)
-libc_hidden_builtin_def (memset)
diff --git a/sysdeps/x86_64/rtld-strchr.S b/sysdeps/x86_64/rtld-strchr.S
deleted file mode 100644
index cc694d7..0000000
--- a/sysdeps/x86_64/rtld-strchr.S
+++ /dev/null
@@ -1,288 +0,0 @@
-/* strchr (str, ch) -- Return pointer to first occurrence of CH in STR.
- For AMD x86-64.
- Copyright (C) 2002-2015 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-#include "asm-syntax.h"
-
-
- .text
-ENTRY (strchr)
-
- /* Before we start with the main loop we process single bytes
- until the source pointer is aligned. This has two reasons:
- 1. aligned 64-bit memory access is faster
- and (more important)
- 2. we process in the main loop 64 bit in one step although
- we don't know the end of the string. But accessing at
- 8-byte alignment guarantees that we never access illegal
- memory if this would not also be done by the trivial
- implementation (this is because all processor inherent
- boundaries are multiples of 8). */
-
- movq %rdi, %rdx
- andl $7, %edx /* Mask alignment bits */
- movq %rdi, %rax /* duplicate destination. */
- jz 1f /* aligned => start loop */
- neg %edx
- addl $8, %edx /* Align to 8 bytes. */
-
- /* Search the first bytes directly. */
-0: movb (%rax), %cl /* load byte */
- cmpb %cl,%sil /* compare byte. */
- je 6f /* target found */
- testb %cl,%cl /* is byte NUL? */
- je 7f /* yes => return NULL */
- incq %rax /* increment pointer */
- decl %edx
- jnz 0b
-
-
-1:
- /* At the moment %rsi contains C. What we need for the
- algorithm is C in all bytes of the register. Avoid
- operations on 16 bit words because these require an
- prefix byte (and one more cycle). */
- /* Populate 8 bit data to full 64-bit. */
- movabs $0x0101010101010101,%r9
- movzbl %sil,%edx
- imul %rdx,%r9
-
- movq $0xfefefefefefefeff, %r8 /* Save magic. */
-
- /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to
- change any of the hole bits of LONGWORD.
-
- 1) Is this safe? Will it catch all the zero bytes?
- Suppose there is a byte with all zeros. Any carry bits
- propagating from its left will fall into the hole at its
- least significant bit and stop. Since there will be no
- carry from its most significant bit, the LSB of the
- byte to the left will be unchanged, and the zero will be
- detected.
-
- 2) Is this worthwhile? Will it ignore everything except
- zero bytes? Suppose every byte of QUARDWORD has a bit set
- somewhere. There will be a carry into bit 8. If bit 8
- is set, this will carry into bit 16. If bit 8 is clear,
- one of bits 9-15 must be set, so there will be a carry
- into bit 16. Similarly, there will be a carry into bit
- 24 tec.. If one of bits 54-63 is set, there will be a carry
- into bit 64 (=carry flag), so all of the hole bits will
- be changed.
-
- 3) But wait! Aren't we looking for C, not zero?
- Good point. So what we do is XOR LONGWORD with a longword,
- each of whose bytes is C. This turns each byte that is C
- into a zero. */
-
- .p2align 4
-4:
- /* Main Loop is unrolled 4 times. */
- /* First unroll. */
- movq (%rax), %rcx /* get double word (= 8 bytes) in question */
- addq $8,%rax /* adjust pointer for next word */
- movq %r8, %rdx /* magic value */
- xorq %r9, %rcx /* XOR with qword c|...|c => bytes of str == c
- are now 0 */
- addq %rcx, %rdx /* add the magic value to the word. We get
- carry bits reported for each byte which
- is *not* 0 */
- jnc 3f /* highest byte is NUL => return pointer */
- xorq %rcx, %rdx /* (word+magic)^word */
- orq %r8, %rdx /* set all non-carry bits */
- incq %rdx /* add 1: if one carry bit was *not* set
- the addition will not result in 0. */
- jnz 3f /* found c => return pointer */
-
- /* The quadword we looked at does not contain the value we're looking
- for. Let's search now whether we have reached the end of the
- string. */
- xorq %r9, %rcx /* restore original dword without reload */
- movq %r8, %rdx /* magic value */
- addq %rcx, %rdx /* add the magic value to the word. We get
- carry bits reported for each byte which
- is *not* 0 */
- jnc 7f /* highest byte is NUL => return NULL */
- xorq %rcx, %rdx /* (word+magic)^word */
- orq %r8, %rdx /* set all non-carry bits */
- incq %rdx /* add 1: if one carry bit was *not* set
- the addition will not result in 0. */
- jnz 7f /* found NUL => return NULL */
-
- /* Second unroll. */
- movq (%rax), %rcx /* get double word (= 8 bytes) in question */
- addq $8,%rax /* adjust pointer for next word */
- movq %r8, %rdx /* magic value */
- xorq %r9, %rcx /* XOR with qword c|...|c => bytes of str == c
- are now 0 */
- addq %rcx, %rdx /* add the magic value to the word. We get
- carry bits reported for each byte which
- is *not* 0 */
- jnc 3f /* highest byte is NUL => return pointer */
- xorq %rcx, %rdx /* (word+magic)^word */
- orq %r8, %rdx /* set all non-carry bits */
- incq %rdx /* add 1: if one carry bit was *not* set
- the addition will not result in 0. */
- jnz 3f /* found c => return pointer */
-
- /* The quadword we looked at does not contain the value we're looking
- for. Let's search now whether we have reached the end of the
- string. */
- xorq %r9, %rcx /* restore original dword without reload */
- movq %r8, %rdx /* magic value */
- addq %rcx, %rdx /* add the magic value to the word. We get
- carry bits reported for each byte which
- is *not* 0 */
- jnc 7f /* highest byte is NUL => return NULL */
- xorq %rcx, %rdx /* (word+magic)^word */
- orq %r8, %rdx /* set all non-carry bits */
- incq %rdx /* add 1: if one carry bit was *not* set
- the addition will not result in 0. */
- jnz 7f /* found NUL => return NULL */
- /* Third unroll. */
- movq (%rax), %rcx /* get double word (= 8 bytes) in question */
- addq $8,%rax /* adjust pointer for next word */
- movq %r8, %rdx /* magic value */
- xorq %r9, %rcx /* XOR with qword c|...|c => bytes of str == c
- are now 0 */
- addq %rcx, %rdx /* add the magic value to the word. We get
- carry bits reported for each byte which
- is *not* 0 */
- jnc 3f /* highest byte is NUL => return pointer */
- xorq %rcx, %rdx /* (word+magic)^word */
- orq %r8, %rdx /* set all non-carry bits */
- incq %rdx /* add 1: if one carry bit was *not* set
- the addition will not result in 0. */
- jnz 3f /* found c => return pointer */
-
- /* The quadword we looked at does not contain the value we're looking
- for. Let's search now whether we have reached the end of the
- string. */
- xorq %r9, %rcx /* restore original dword without reload */
- movq %r8, %rdx /* magic value */
- addq %rcx, %rdx /* add the magic value to the word. We get
- carry bits reported for each byte which
- is *not* 0 */
- jnc 7f /* highest byte is NUL => return NULL */
- xorq %rcx, %rdx /* (word+magic)^word */
- orq %r8, %rdx /* set all non-carry bits */
- incq %rdx /* add 1: if one carry bit was *not* set
- the addition will not result in 0. */
- jnz 7f /* found NUL => return NULL */
- /* Fourth unroll. */
- movq (%rax), %rcx /* get double word (= 8 bytes) in question */
- addq $8,%rax /* adjust pointer for next word */
- movq %r8, %rdx /* magic value */
- xorq %r9, %rcx /* XOR with qword c|...|c => bytes of str == c
- are now 0 */
- addq %rcx, %rdx /* add the magic value to the word. We get
- carry bits reported for each byte which
- is *not* 0 */
- jnc 3f /* highest byte is NUL => return pointer */
- xorq %rcx, %rdx /* (word+magic)^word */
- orq %r8, %rdx /* set all non-carry bits */
- incq %rdx /* add 1: if one carry bit was *not* set
- the addition will not result in 0. */
- jnz 3f /* found c => return pointer */
-
- /* The quadword we looked at does not contain the value we're looking
- for. Let's search now whether we have reached the end of the
- string. */
- xorq %r9, %rcx /* restore original dword without reload */
- movq %r8, %rdx /* magic value */
- addq %rcx, %rdx /* add the magic value to the word. We get
- carry bits reported for each byte which
- is *not* 0 */
- jnc 7f /* highest byte is NUL => return NULL */
- xorq %rcx, %rdx /* (word+magic)^word */
- orq %r8, %rdx /* set all non-carry bits */
- incq %rdx /* add 1: if one carry bit was *not* set
- the addition will not result in 0. */
- jz 4b /* no NUL found => restart loop */
-
-
-7: /* Return NULL. */
- xorl %eax, %eax
- retq
-
-
- /* We now scan for the byte in which the character was matched.
- But we have to take care of the case that a NUL char is
- found before this in the dword. Note that we XORed %rcx
- with the byte we're looking for, therefore the tests below look
- reversed. */
-
-
- .p2align 4 /* Align, it's a jump target. */
-3: movq %r9,%rdx /* move to %rdx so that we can access bytes */
- subq $8,%rax /* correct pointer increment. */
- testb %cl, %cl /* is first byte C? */
- jz 6f /* yes => return pointer */
- cmpb %dl, %cl /* is first byte NUL? */
- je 7b /* yes => return NULL */
- incq %rax /* increment pointer */
-
- testb %ch, %ch /* is second byte C? */
- jz 6f /* yes => return pointer */
- cmpb %dl, %ch /* is second byte NUL? */
- je 7b /* yes => return NULL? */
- incq %rax /* increment pointer */
-
- shrq $16, %rcx /* make upper bytes accessible */
- testb %cl, %cl /* is third byte C? */
- jz 6f /* yes => return pointer */
- cmpb %dl, %cl /* is third byte NUL? */
- je 7b /* yes => return NULL */
- incq %rax /* increment pointer */
-
- testb %ch, %ch /* is fourth byte C? */
- jz 6f /* yes => return pointer */
- cmpb %dl, %ch /* is fourth byte NUL? */
- je 7b /* yes => return NULL? */
- incq %rax /* increment pointer */
-
- shrq $16, %rcx /* make upper bytes accessible */
- testb %cl, %cl /* is fifth byte C? */
- jz 6f /* yes => return pointer */
- cmpb %dl, %cl /* is fifth byte NUL? */
- je 7b /* yes => return NULL */
- incq %rax /* increment pointer */
-
- testb %ch, %ch /* is sixth byte C? */
- jz 6f /* yes => return pointer */
- cmpb %dl, %ch /* is sixth byte NUL? */
- je 7b /* yes => return NULL? */
- incq %rax /* increment pointer */
-
- shrq $16, %rcx /* make upper bytes accessible */
- testb %cl, %cl /* is seventh byte C? */
- jz 6f /* yes => return pointer */
- cmpb %dl, %cl /* is seventh byte NUL? */
- je 7b /* yes => return NULL */
-
- /* It must be in the eigth byte and it cannot be NUL. */
- incq %rax
-
-6:
- nop
- retq
-END (strchr)
-
-weak_alias (strchr, index)
-libc_hidden_builtin_def (strchr)
diff --git a/sysdeps/x86_64/rtld-strlen.S b/sysdeps/x86_64/rtld-strlen.S
deleted file mode 100644
index 1328652..0000000
--- a/sysdeps/x86_64/rtld-strlen.S
+++ /dev/null
@@ -1,136 +0,0 @@
-/* strlen(str) -- determine the length of the string STR.
- Copyright (C) 2002-2015 Free Software Foundation, Inc.
- Based on i486 version contributed by Ulrich Drepper <drepper@redhat.com>.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-#include "asm-syntax.h"
-
-
- .text
-ENTRY (strlen)
- movq %rdi, %rcx /* Duplicate source pointer. */
- andl $7, %ecx /* mask alignment bits */
- movq %rdi, %rax /* duplicate destination. */
- jz 1f /* aligned => start loop */
-
- neg %ecx /* We need to align to 8 bytes. */
- addl $8,%ecx
- /* Search the first bytes directly. */
-0: cmpb $0x0,(%rax) /* is byte NUL? */
- je 2f /* yes => return */
- incq %rax /* increment pointer */
- decl %ecx
- jnz 0b
-
-1: movq $0xfefefefefefefeff,%r8 /* Save magic. */
-
- .p2align 4 /* Align loop. */
-4: /* Main Loop is unrolled 4 times. */
- /* First unroll. */
- movq (%rax), %rcx /* get double word (= 8 bytes) in question */
- addq $8,%rax /* adjust pointer for next word */
- movq %r8, %rdx /* magic value */
- addq %rcx, %rdx /* add the magic value to the word. We get
- carry bits reported for each byte which
- is *not* 0 */
- jnc 3f /* highest byte is NUL => return pointer */
- xorq %rcx, %rdx /* (word+magic)^word */
- orq %r8, %rdx /* set all non-carry bits */
- incq %rdx /* add 1: if one carry bit was *not* set
- the addition will not result in 0. */
- jnz 3f /* found NUL => return pointer */
-
- /* Second unroll. */
- movq (%rax), %rcx /* get double word (= 8 bytes) in question */
- addq $8,%rax /* adjust pointer for next word */
- movq %r8, %rdx /* magic value */
- addq %rcx, %rdx /* add the magic value to the word. We get
- carry bits reported for each byte which
- is *not* 0 */
- jnc 3f /* highest byte is NUL => return pointer */
- xorq %rcx, %rdx /* (word+magic)^word */
- orq %r8, %rdx /* set all non-carry bits */
- incq %rdx /* add 1: if one carry bit was *not* set
- the addition will not result in 0. */
- jnz 3f /* found NUL => return pointer */
-
- /* Third unroll. */
- movq (%rax), %rcx /* get double word (= 8 bytes) in question */
- addq $8,%rax /* adjust pointer for next word */
- movq %r8, %rdx /* magic value */
- addq %rcx, %rdx /* add the magic value to the word. We get
- carry bits reported for each byte which
- is *not* 0 */
- jnc 3f /* highest byte is NUL => return pointer */
- xorq %rcx, %rdx /* (word+magic)^word */
- orq %r8, %rdx /* set all non-carry bits */
- incq %rdx /* add 1: if one carry bit was *not* set
- the addition will not result in 0. */
- jnz 3f /* found NUL => return pointer */
-
- /* Fourth unroll. */
- movq (%rax), %rcx /* get double word (= 8 bytes) in question */
- addq $8,%rax /* adjust pointer for next word */
- movq %r8, %rdx /* magic value */
- addq %rcx, %rdx /* add the magic value to the word. We get
- carry bits reported for each byte which
- is *not* 0 */
- jnc 3f /* highest byte is NUL => return pointer */
- xorq %rcx, %rdx /* (word+magic)^word */
- orq %r8, %rdx /* set all non-carry bits */
- incq %rdx /* add 1: if one carry bit was *not* set
- the addition will not result in 0. */
- jz 4b /* no NUL found => continue loop */
-
- .p2align 4 /* Align, it's a jump target. */
-3: subq $8,%rax /* correct pointer increment. */
-
- testb %cl, %cl /* is first byte NUL? */
- jz 2f /* yes => return */
- incq %rax /* increment pointer */
-
- testb %ch, %ch /* is second byte NUL? */
- jz 2f /* yes => return */
- incq %rax /* increment pointer */
-
- testl $0x00ff0000, %ecx /* is third byte NUL? */
- jz 2f /* yes => return pointer */
- incq %rax /* increment pointer */
-
- testl $0xff000000, %ecx /* is fourth byte NUL? */
- jz 2f /* yes => return pointer */
- incq %rax /* increment pointer */
-
- shrq $32, %rcx /* look at other half. */
-
- testb %cl, %cl /* is first byte NUL? */
- jz 2f /* yes => return */
- incq %rax /* increment pointer */
-
- testb %ch, %ch /* is second byte NUL? */
- jz 2f /* yes => return */
- incq %rax /* increment pointer */
-
- testl $0xff0000, %ecx /* is third byte NUL? */
- jz 2f /* yes => return pointer */
- incq %rax /* increment pointer */
-2:
- subq %rdi, %rax /* compute difference to string start */
- ret
-END (strlen)
-libc_hidden_builtin_def (strlen)
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=7448ae107ca0f19c96e0246892abb07e29949d6b
commit 7448ae107ca0f19c96e0246892abb07e29949d6b
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Wed Jul 29 03:47:54 2015 -0700
Replace %xmm8 with %xmm0
Since ld.so preserves vector registers now, we can use %xmm0 to avoid
the REX prefix.
* sysdeps/x86_64/memset.S: Replace %xmm8 with %xmm0.
diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
index e496254..3855cc8 100644
--- a/sysdeps/x86_64/memset.S
+++ b/sysdeps/x86_64/memset.S
@@ -24,7 +24,7 @@
ENTRY(__bzero)
movq %rdi, %rax /* Set return value. */
movq %rsi, %rdx /* Set n. */
- pxor %xmm8, %xmm8
+ pxor %xmm0, %xmm0
jmp L(entry_from_bzero)
END(__bzero)
weak_alias (__bzero, bzero)
@@ -33,10 +33,10 @@ weak_alias (__bzero, bzero)
ENTRY(__memset_tail)
movq %rcx, %rax /* Set return value. */
- movd %esi, %xmm8
- punpcklbw %xmm8, %xmm8
- punpcklwd %xmm8, %xmm8
- pshufd $0, %xmm8, %xmm8
+ movd %esi, %xmm0
+ punpcklbw %xmm0, %xmm0
+ punpcklwd %xmm0, %xmm0
+ pshufd $0, %xmm0, %xmm0
jmp L(entry_from_bzero)
END(__memset_tail)
@@ -50,57 +50,57 @@ END_CHK (__memset_chk)
#endif
ENTRY (memset)
- movd %esi, %xmm8
+ movd %esi, %xmm0
movq %rdi, %rax
- punpcklbw %xmm8, %xmm8
- punpcklwd %xmm8, %xmm8
- pshufd $0, %xmm8, %xmm8
+ punpcklbw %xmm0, %xmm0
+ punpcklwd %xmm0, %xmm0
+ pshufd $0, %xmm0, %xmm0
L(entry_from_bzero):
cmpq $64, %rdx
ja L(loop_start)
cmpq $16, %rdx
jbe L(less_16_bytes)
cmpq $32, %rdx
- movdqu %xmm8, (%rdi)
- movdqu %xmm8, -16(%rdi,%rdx)
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm0, -16(%rdi,%rdx)
ja L(between_32_64_bytes)
L(return):
rep
ret
.p2align 4
L(between_32_64_bytes):
- movdqu %xmm8, 16(%rdi)
- movdqu %xmm8, -32(%rdi,%rdx)
+ movdqu %xmm0, 16(%rdi)
+ movdqu %xmm0, -32(%rdi,%rdx)
ret
.p2align 4
L(loop_start):
leaq 64(%rdi), %rcx
- movdqu %xmm8, (%rdi)
+ movdqu %xmm0, (%rdi)
andq $-64, %rcx
- movdqu %xmm8, -16(%rdi,%rdx)
- movdqu %xmm8, 16(%rdi)
- movdqu %xmm8, -32(%rdi,%rdx)
- movdqu %xmm8, 32(%rdi)
- movdqu %xmm8, -48(%rdi,%rdx)
- movdqu %xmm8, 48(%rdi)
- movdqu %xmm8, -64(%rdi,%rdx)
+ movdqu %xmm0, -16(%rdi,%rdx)
+ movdqu %xmm0, 16(%rdi)
+ movdqu %xmm0, -32(%rdi,%rdx)
+ movdqu %xmm0, 32(%rdi)
+ movdqu %xmm0, -48(%rdi,%rdx)
+ movdqu %xmm0, 48(%rdi)
+ movdqu %xmm0, -64(%rdi,%rdx)
addq %rdi, %rdx
andq $-64, %rdx
cmpq %rdx, %rcx
je L(return)
.p2align 4
L(loop):
- movdqa %xmm8, (%rcx)
- movdqa %xmm8, 16(%rcx)
- movdqa %xmm8, 32(%rcx)
- movdqa %xmm8, 48(%rcx)
+ movdqa %xmm0, (%rcx)
+ movdqa %xmm0, 16(%rcx)
+ movdqa %xmm0, 32(%rcx)
+ movdqa %xmm0, 48(%rcx)
addq $64, %rcx
cmpq %rcx, %rdx
jne L(loop)
rep
ret
L(less_16_bytes):
- movq %xmm8, %rcx
+ movq %xmm0, %rcx
testb $24, %dl
jne L(between8_16bytes)
testb $4, %dl
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=0bbf91204e74db2c6f7a89990443c98d16ca7300
commit 0bbf91204e74db2c6f7a89990443c98d16ca7300
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Wed Jul 29 03:44:39 2015 -0700
Replace %xmm[8-12] with %xmm[0-4]
Since ld.so preserves vector registers now, we can use %xmm[0-4] to
avoid the REX prefix.
* sysdeps/x86_64/strlen.S: Replace %xmm[8-12] with %xmm[0-4].
diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
index c382c8d..0725333 100644
--- a/sysdeps/x86_64/strlen.S
+++ b/sysdeps/x86_64/strlen.S
@@ -20,7 +20,7 @@
/* Long lived register in strlen(s), strnlen(s, n) are:
- %xmm11 - zero
+ %xmm3 - zero
%rdi - s
%r10 (s+n) & (~(64-1))
%r11 s+n
@@ -32,14 +32,14 @@ ENTRY(strlen)
/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */
#define FIND_ZERO \
- pcmpeqb (%rax), %xmm8; \
- pcmpeqb 16(%rax), %xmm9; \
- pcmpeqb 32(%rax), %xmm10; \
- pcmpeqb 48(%rax), %xmm11; \
- pmovmskb %xmm8, %esi; \
- pmovmskb %xmm9, %edx; \
- pmovmskb %xmm10, %r8d; \
- pmovmskb %xmm11, %ecx; \
+ pcmpeqb (%rax), %xmm0; \
+ pcmpeqb 16(%rax), %xmm1; \
+ pcmpeqb 32(%rax), %xmm2; \
+ pcmpeqb 48(%rax), %xmm3; \
+ pmovmskb %xmm0, %esi; \
+ pmovmskb %xmm1, %edx; \
+ pmovmskb %xmm2, %r8d; \
+ pmovmskb %xmm3, %ecx; \
salq $16, %rdx; \
salq $16, %rcx; \
orq %rsi, %rdx; \
@@ -63,10 +63,10 @@ L(n_nonzero):
mov %rsi, %r11
#endif
- pxor %xmm8, %xmm8
- pxor %xmm9, %xmm9
- pxor %xmm10, %xmm10
- pxor %xmm11, %xmm11
+ pxor %xmm0, %xmm0
+ pxor %xmm1, %xmm1
+ pxor %xmm2, %xmm2
+ pxor %xmm3, %xmm3
movq %rdi, %rax
movq %rdi, %rcx
andq $4095, %rcx
@@ -103,9 +103,9 @@ L(n_nonzero):
FIND_ZERO
#else
/* Test first 16 bytes unaligned. */
- movdqu (%rax), %xmm12
- pcmpeqb %xmm8, %xmm12
- pmovmskb %xmm12, %edx
+ movdqu (%rax), %xmm4
+ pcmpeqb %xmm0, %xmm4
+ pmovmskb %xmm4, %edx
test %edx, %edx
je L(next48_bytes)
bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */
@@ -114,12 +114,12 @@ L(n_nonzero):
L(next48_bytes):
/* Same as FIND_ZERO except we do not check first 16 bytes. */
andq $-16, %rax
- pcmpeqb 16(%rax), %xmm9
- pcmpeqb 32(%rax), %xmm10
- pcmpeqb 48(%rax), %xmm11
- pmovmskb %xmm9, %edx
- pmovmskb %xmm10, %r8d
- pmovmskb %xmm11, %ecx
+ pcmpeqb 16(%rax), %xmm1
+ pcmpeqb 32(%rax), %xmm2
+ pcmpeqb 48(%rax), %xmm3
+ pmovmskb %xmm1, %edx
+ pmovmskb %xmm2, %r8d
+ pmovmskb %xmm3, %ecx
salq $16, %rdx
salq $16, %rcx
orq %r8, %rcx
@@ -127,7 +127,7 @@ L(next48_bytes):
orq %rcx, %rdx
#endif
- /* When no zero byte is found xmm9-11 are zero so we do not have to
+ /* When no zero byte is found xmm1-3 are zero so we do not have to
zero them. */
PROLOG(loop)
@@ -149,9 +149,9 @@ L(strnlen_ret):
#endif
.p2align 4
L(loop_init):
- pxor %xmm9, %xmm9
- pxor %xmm10, %xmm10
- pxor %xmm11, %xmm11
+ pxor %xmm1, %xmm1
+ pxor %xmm2, %xmm2
+ pxor %xmm3, %xmm3
#ifdef AS_STRNLEN
.p2align 4
L(loop):
@@ -160,12 +160,12 @@ L(loop):
cmpq %rax, %r10
je L(exit_end)
- movdqa (%rax), %xmm8
- pminub 16(%rax), %xmm8
- pminub 32(%rax), %xmm8
- pminub 48(%rax), %xmm8
- pcmpeqb %xmm11, %xmm8
- pmovmskb %xmm8, %edx
+ movdqa (%rax), %xmm0
+ pminub 16(%rax), %xmm0
+ pminub 32(%rax), %xmm0
+ pminub 48(%rax), %xmm0
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
testl %edx, %edx
jne L(exit)
jmp L(loop)
@@ -174,7 +174,7 @@ L(loop):
L(exit_end):
cmp %rax, %r11
je L(first) /* Do not read when end is at page boundary. */
- pxor %xmm8, %xmm8
+ pxor %xmm0, %xmm0
FIND_ZERO
L(first):
@@ -186,7 +186,7 @@ L(first):
.p2align 4
L(exit):
- pxor %xmm8, %xmm8
+ pxor %xmm0, %xmm0
FIND_ZERO
bsfq %rdx, %rdx
@@ -200,23 +200,23 @@ L(exit):
.p2align 4
L(loop):
- movdqa 64(%rax), %xmm8
- pminub 80(%rax), %xmm8
- pminub 96(%rax), %xmm8
- pminub 112(%rax), %xmm8
- pcmpeqb %xmm11, %xmm8
- pmovmskb %xmm8, %edx
+ movdqa 64(%rax), %xmm0
+ pminub 80(%rax), %xmm0
+ pminub 96(%rax), %xmm0
+ pminub 112(%rax), %xmm0
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
testl %edx, %edx
jne L(exit64)
subq $-128, %rax
- movdqa (%rax), %xmm8
- pminub 16(%rax), %xmm8
- pminub 32(%rax), %xmm8
- pminub 48(%rax), %xmm8
- pcmpeqb %xmm11, %xmm8
- pmovmskb %xmm8, %edx
+ movdqa (%rax), %xmm0
+ pminub 16(%rax), %xmm0
+ pminub 32(%rax), %xmm0
+ pminub 48(%rax), %xmm0
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
testl %edx, %edx
jne L(exit0)
jmp L(loop)
@@ -225,7 +225,7 @@ L(loop):
L(exit64):
addq $64, %rax
L(exit0):
- pxor %xmm8, %xmm8
+ pxor %xmm0, %xmm0
FIND_ZERO
bsfq %rdx, %rdx
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=ca43850f17b4b366db6c43bce86f054bd8850e3b
commit ca43850f17b4b366db6c43bce86f054bd8850e3b
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Tue Jul 28 18:56:18 2015 -0700
Don't disable SSE in x86-64 ld.so
Since ld.so preserves vector registers now, we can use SSE in ld.so.
* sysdeps/i386/Makefile [$(subdir) == elf] (CFLAGS-.os): Add
-mno-sse -mno-mmx for $(all-rtld-routines).
[$(subdir) == elf] (tests-special): Add
$(objpfx)tst-ld-sse-use.out.
[$(subdir) == elf] ($(objpfx)tst-ld-sse-use.out): New rule.
* sysdeps/x86/Makefile [$(subdir) == elf] (CFLAGS-.os): Removed.
[$(subdir) == elf] (tests-special): Likewise.
[$(subdir) == elf] ($(objpfx)tst-ld-sse-use.out): Likewise.
* sysdeps/x86_64/Makefile [$(subdir) == elf] (CFLAGS-.os): Add
-mno-mmx for $(all-rtld-routines).
diff --git a/sysdeps/i386/Makefile b/sysdeps/i386/Makefile
index 11f425d..2c08907 100644
--- a/sysdeps/i386/Makefile
+++ b/sysdeps/i386/Makefile
@@ -79,3 +79,14 @@ endif
ifeq ($(subdir),csu)
gen-as-const-headers += tlsdesc.sym
endif
+
+ifeq ($(subdir),elf)
+CFLAGS-.os += $(if $(filter $(@F),$(patsubst %,%.os,$(all-rtld-routines))),\
+ -mno-sse -mno-mmx)
+
+tests-special += $(objpfx)tst-ld-sse-use.out
+$(objpfx)tst-ld-sse-use.out: ../sysdeps/x86/tst-ld-sse-use.sh $(objpfx)ld.so
+ @echo "Checking ld.so for SSE register use. This will take a few seconds..."
+ $(BASH) $< $(objpfx) '$(NM)' '$(OBJDUMP)' '$(READELF)' > $@; \
+ $(evaluate-test)
+endif
diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
index c262fdf..0de4f42 100644
--- a/sysdeps/x86/Makefile
+++ b/sysdeps/x86/Makefile
@@ -1,14 +1,3 @@
-ifeq ($(subdir),elf)
-CFLAGS-.os += $(if $(filter $(@F),$(patsubst %,%.os,$(all-rtld-routines))),\
- -mno-sse -mno-mmx)
-
-tests-special += $(objpfx)tst-ld-sse-use.out
-$(objpfx)tst-ld-sse-use.out: ../sysdeps/x86/tst-ld-sse-use.sh $(objpfx)ld.so
- @echo "Checking ld.so for SSE register use. This will take a few seconds..."
- $(BASH) $< $(objpfx) '$(NM)' '$(OBJDUMP)' '$(READELF)' > $@; \
- $(evaluate-test)
-endif
-
ifeq ($(subdir),csu)
gen-as-const-headers += cpu-features-offsets.sym rtld-global-offsets.sym
endif
diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile
index f6ef064..e155691 100644
--- a/sysdeps/x86_64/Makefile
+++ b/sysdeps/x86_64/Makefile
@@ -19,6 +19,9 @@ gen-as-const-headers += locale-defines.sym
endif
ifeq ($(subdir),elf)
+CFLAGS-.os += $(if $(filter $(@F),$(patsubst %,%.os,$(all-rtld-routines))),\
+ -mno-mmx)
+
sysdep-dl-routines += tlsdesc dl-tlsdesc
tests += ifuncmain8
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=698c7e058547f634639bbf06cfb88d572adfd6de
commit 698c7e058547f634639bbf06cfb88d572adfd6de
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Sat Jul 11 13:25:25 2015 -0700
Save and restore vector registers in x86-64 ld.so
This patch initializes GLRO(dl_x86_xstate) in dl_platform_init to
indicate if the processor supports SSE, AVX or AVX512. It uses
this information to properly save and restore vector registers in
ld.so. Now we can use SSE in ld.so and delete FOREIGN_CALL macros.
[BZ #15128]
* sysdeps/x86_64/Makefile [$(subdir) == elf] (tests): Add
ifuncmain8.
(modules-names): Add ifuncmod8.
($(objpfx)ifuncmain8): New rule.
* sysdeps/x86_64/dl-machine.h: Include <dl-procinfo.h> and
<cpuid.h>.
(elf_machine_runtime_setup): Use _dl_runtime_resolve_sse,
_dl_runtime_resolve_avx, or _dl_runtime_resolve_avx512,
_dl_runtime_profile_sse, _dl_runtime_profile_avx, or
_dl_runtime_profile_avx512, based on HAS_ARCH_FEATURE.
* sysdeps/x86_64/dl-trampoline.S: Rewrite.
* sysdeps/x86_64/dl-trampoline.h: Likewise.
* sysdeps/x86_64/ifuncmain8.c: New file.
* sysdeps/x86_64/ifuncmod8.c: Likewise.
* sysdeps/x86_64/nptl/tcb-offsets.sym (RTLD_SAVESPACE_SSE):
Removed.
* sysdeps/x86_64/nptl/tls.h (__128bits): Removed.
(tcbhead_t): Change rtld_must_xmm_save to __glibc_unused1.
Change rtld_savespace_sse to __glibc_unused2.
(RTLD_CHECK_FOREIGN_CALL): Removed.
(RTLD_ENABLE_FOREIGN_CALL): Likewise.
(RTLD_PREPARE_FOREIGN_CALL): Likewise.
(RTLD_FINALIZE_FOREIGN_CALL): Likewise.
diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile
index ef70a50..f6ef064 100644
--- a/sysdeps/x86_64/Makefile
+++ b/sysdeps/x86_64/Makefile
@@ -21,6 +21,11 @@ endif
ifeq ($(subdir),elf)
sysdep-dl-routines += tlsdesc dl-tlsdesc
+tests += ifuncmain8
+modules-names += ifuncmod8
+
+$(objpfx)ifuncmain8: $(objpfx)ifuncmod8.so
+
tests += tst-quad1 tst-quad2
modules-names += tst-quadmod1 tst-quadmod2
diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h
index d22359d..32d25da 100644
--- a/sysdeps/x86_64/dl-machine.h
+++ b/sysdeps/x86_64/dl-machine.h
@@ -66,8 +66,12 @@ static inline int __attribute__ ((unused, always_inline))
elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
{
Elf64_Addr *got;
- extern void _dl_runtime_resolve (ElfW(Word)) attribute_hidden;
- extern void _dl_runtime_profile (ElfW(Word)) attribute_hidden;
+ extern void _dl_runtime_resolve_sse (ElfW(Word)) attribute_hidden;
+ extern void _dl_runtime_resolve_avx (ElfW(Word)) attribute_hidden;
+ extern void _dl_runtime_resolve_avx512 (ElfW(Word)) attribute_hidden;
+ extern void _dl_runtime_profile_sse (ElfW(Word)) attribute_hidden;
+ extern void _dl_runtime_profile_avx (ElfW(Word)) attribute_hidden;
+ extern void _dl_runtime_profile_avx512 (ElfW(Word)) attribute_hidden;
if (l->l_info[DT_JMPREL] && lazy)
{
@@ -95,7 +99,12 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
end in this function. */
if (__glibc_unlikely (profile))
{
- *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile;
+ if (HAS_ARCH_FEATURE (AVX512F_Usable))
+ *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx512;
+ else if (HAS_ARCH_FEATURE (AVX_Usable))
+ *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx;
+ else
+ *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_sse;
if (GLRO(dl_profile) != NULL
&& _dl_name_match_p (GLRO(dl_profile), l))
@@ -104,9 +113,17 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
GL(dl_profile_map) = l;
}
else
- /* This function will get called to fix up the GOT entry indicated by
- the offset on the stack, and then jump to the resolved address. */
- *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_resolve;
+ {
+ /* This function will get called to fix up the GOT entry
+ indicated by the offset on the stack, and then jump to
+ the resolved address. */
+ if (HAS_ARCH_FEATURE (AVX512F_Usable))
+ *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_resolve_avx512;
+ else if (HAS_ARCH_FEATURE (AVX_Usable))
+ *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_resolve_avx;
+ else
+ *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_resolve_sse;
+ }
}
if (l->l_info[ADDRIDX (DT_TLSDESC_GOT)] && lazy)
diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
index 678c57f..8475d26 100644
--- a/sysdeps/x86_64/dl-trampoline.S
+++ b/sysdeps/x86_64/dl-trampoline.S
@@ -20,23 +20,40 @@
#include <sysdep.h>
#include <link-defines.h>
-#if (RTLD_SAVESPACE_SSE % 32) != 0
-# error RTLD_SAVESPACE_SSE must be aligned to 32 bytes
+#ifndef DL_STACK_ALIGNMENT
+/* Due to GCC bug:
+
+ https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
+
+ __tls_get_addr may be called with 8-byte stack alignment. Although
+ this bug has been fixed in GCC 4.9.4, 5.3 and 6, we can't assume
+ that stack will be always aligned at 16 bytes. We use unaligned
+ 16-byte move to load and store SSE registers, which has no penalty
+ on modern processors if stack is 16-byte aligned. */
+# define DL_STACK_ALIGNMENT 8
+#endif
+
+#ifndef DL_RUNIME_UNALIGNED_VEC_SIZE
+/* The maximum size of unaligned vector load and store. */
+# define DL_RUNIME_UNALIGNED_VEC_SIZE 16
#endif
+/* True if _dl_runtime_resolve should align stack to VEC_SIZE bytes. */
+#define DL_RUNIME_RESOLVE_REALIGN_STACK \
+ (VEC_SIZE > DL_STACK_ALIGNMENT \
+ && VEC_SIZE > DL_RUNIME_UNALIGNED_VEC_SIZE)
+
+/* Align vector register save area to 16 bytes. */
+#define REGISTER_SAVE_VEC_OFF 0
+
/* Area on stack to save and restore registers used for parameter
passing when calling _dl_fixup. */
#ifdef __ILP32__
-/* X32 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX. */
-# define REGISTER_SAVE_AREA (8 * 7)
-# define REGISTER_SAVE_RAX 0
+# define REGISTER_SAVE_RAX (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 8)
# define PRESERVE_BND_REGS_PREFIX
#else
-/* X86-64 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as BND0,
- BND1, BND2, BND3. */
-# define REGISTER_SAVE_AREA (8 * 7 + 16 * 4)
/* Align bound register save area to 16 bytes. */
-# define REGISTER_SAVE_BND0 0
+# define REGISTER_SAVE_BND0 (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 8)
# define REGISTER_SAVE_BND1 (REGISTER_SAVE_BND0 + 16)
# define REGISTER_SAVE_BND2 (REGISTER_SAVE_BND1 + 16)
# define REGISTER_SAVE_BND3 (REGISTER_SAVE_BND2 + 16)
@@ -54,386 +71,53 @@
#define REGISTER_SAVE_R8 (REGISTER_SAVE_RDI + 8)
#define REGISTER_SAVE_R9 (REGISTER_SAVE_R8 + 8)
- .text
- .globl _dl_runtime_resolve
- .type _dl_runtime_resolve, @function
- .align 16
- cfi_startproc
-_dl_runtime_resolve:
- cfi_adjust_cfa_offset(16) # Incorporate PLT
- subq $REGISTER_SAVE_AREA,%rsp
- cfi_adjust_cfa_offset(REGISTER_SAVE_AREA)
- # Preserve registers otherwise clobbered.
- movq %rax, REGISTER_SAVE_RAX(%rsp)
- movq %rcx, REGISTER_SAVE_RCX(%rsp)
- movq %rdx, REGISTER_SAVE_RDX(%rsp)
- movq %rsi, REGISTER_SAVE_RSI(%rsp)
- movq %rdi, REGISTER_SAVE_RDI(%rsp)
- movq %r8, REGISTER_SAVE_R8(%rsp)
- movq %r9, REGISTER_SAVE_R9(%rsp)
-#ifndef __ILP32__
- # We also have to preserve bound registers. These are nops if
- # Intel MPX isn't available or disabled.
-# ifdef HAVE_MPX_SUPPORT
- bndmov %bnd0, REGISTER_SAVE_BND0(%rsp)
- bndmov %bnd1, REGISTER_SAVE_BND1(%rsp)
- bndmov %bnd2, REGISTER_SAVE_BND2(%rsp)
- bndmov %bnd3, REGISTER_SAVE_BND3(%rsp)
-# else
-# if REGISTER_SAVE_BND0 == 0
- .byte 0x66,0x0f,0x1b,0x04,0x24
-# else
- .byte 0x66,0x0f,0x1b,0x44,0x24,REGISTER_SAVE_BND0
-# endif
- .byte 0x66,0x0f,0x1b,0x4c,0x24,REGISTER_SAVE_BND1
- .byte 0x66,0x0f,0x1b,0x54,0x24,REGISTER_SAVE_BND2
- .byte 0x66,0x0f,0x1b,0x5c,0x24,REGISTER_SAVE_BND3
-# endif
-#endif
- # Copy args pushed by PLT in register.
- # %rdi: link_map, %rsi: reloc_index
- movq (REGISTER_SAVE_AREA + 8)(%rsp), %rsi
- movq REGISTER_SAVE_AREA(%rsp), %rdi
- call _dl_fixup # Call resolver.
- movq %rax, %r11 # Save return value
-#ifndef __ILP32__
- # Restore bound registers. These are nops if Intel MPX isn't
- # avaiable or disabled.
-# ifdef HAVE_MPX_SUPPORT
- bndmov REGISTER_SAVE_BND3(%rsp), %bnd3
- bndmov REGISTER_SAVE_BND2(%rsp), %bnd2
- bndmov REGISTER_SAVE_BND1(%rsp), %bnd1
- bndmov REGISTER_SAVE_BND0(%rsp), %bnd0
-# else
- .byte 0x66,0x0f,0x1a,0x5c,0x24,REGISTER_SAVE_BND3
- .byte 0x66,0x0f,0x1a,0x54,0x24,REGISTER_SAVE_BND2
- .byte 0x66,0x0f,0x1a,0x4c,0x24,REGISTER_SAVE_BND1
-# if REGISTER_SAVE_BND0 == 0
- .byte 0x66,0x0f,0x1a,0x04,0x24
-# else
- .byte 0x66,0x0f,0x1a,0x44,0x24,REGISTER_SAVE_BND0
-# endif
-# endif
+#define VEC_SIZE 64
+#define VMOVA vmovdqa64
+#if DL_RUNIME_RESOLVE_REALIGN_STACK || VEC_SIZE <= DL_STACK_ALIGNMENT
+# define VMOV vmovdqa64
+#else
+# define VMOV vmovdqu64
#endif
- # Get register content back.
- movq REGISTER_SAVE_R9(%rsp), %r9
- movq REGISTER_SAVE_R8(%rsp), %r8
- movq REGISTER_SAVE_RDI(%rsp), %rdi
- movq REGISTER_SAVE_RSI(%rsp), %rsi
- movq REGISTER_SAVE_RDX(%rsp), %rdx
- movq REGISTER_SAVE_RCX(%rsp), %rcx
- movq REGISTER_SAVE_RAX(%rsp), %rax
- # Adjust stack(PLT did 2 pushes)
- addq $(REGISTER_SAVE_AREA + 16), %rsp
- cfi_adjust_cfa_offset(-(REGISTER_SAVE_AREA + 16))
- # Preserve bound registers.
- PRESERVE_BND_REGS_PREFIX
- jmp *%r11 # Jump to function address.
- cfi_endproc
- .size _dl_runtime_resolve, .-_dl_runtime_resolve
-
-
-#ifndef PROF
- .globl _dl_runtime_profile
- .type _dl_runtime_profile, @function
- .align 16
- cfi_startproc
-
-_dl_runtime_profile:
- cfi_adjust_cfa_offset(16) # Incorporate PLT
- /* The La_x86_64_regs data structure pointed to by the
- fourth paramater must be 16-byte aligned. This must
- be explicitly enforced. We have the set up a dynamically
- sized stack frame. %rbx points to the top half which
- has a fixed size and preserves the original stack pointer. */
-
- subq $32, %rsp # Allocate the local storage.
- cfi_adjust_cfa_offset(32)
- movq %rbx, (%rsp)
- cfi_rel_offset(%rbx, 0)
-
- /* On the stack:
- 56(%rbx) parameter #1
- 48(%rbx) return address
-
- 40(%rbx) reloc index
- 32(%rbx) link_map
-
- 24(%rbx) La_x86_64_regs pointer
- 16(%rbx) framesize
- 8(%rbx) rax
- (%rbx) rbx
- */
-
- movq %rax, 8(%rsp)
- movq %rsp, %rbx
- cfi_def_cfa_register(%rbx)
-
- /* Actively align the La_x86_64_regs structure. */
- andq $0xfffffffffffffff0, %rsp
-# if defined HAVE_AVX_SUPPORT || defined HAVE_AVX512_ASM_SUPPORT
- /* sizeof(La_x86_64_regs). Need extra space for 8 SSE registers
- to detect if any xmm0-xmm7 registers are changed by audit
- module. */
- subq $(LR_SIZE + XMM_SIZE*8), %rsp
-# else
- subq $LR_SIZE, %rsp # sizeof(La_x86_64_regs)
-# endif
- movq %rsp, 24(%rbx)
-
- /* Fill the La_x86_64_regs structure. */
- movq %rdx, LR_RDX_OFFSET(%rsp)
- movq %r8, LR_R8_OFFSET(%rsp)
- movq %r9, LR_R9_OFFSET(%rsp)
- movq %rcx, LR_RCX_OFFSET(%rsp)
- movq %rsi, LR_RSI_OFFSET(%rsp)
- movq %rdi, LR_RDI_OFFSET(%rsp)
- movq %rbp, LR_RBP_OFFSET(%rsp)
-
- leaq 48(%rbx), %rax
- movq %rax, LR_RSP_OFFSET(%rsp)
-
- /* We always store the XMM registers even if AVX is available.
- This is to provide backward binary compatibility for existing
- audit modules. */
- movaps %xmm0, (LR_XMM_OFFSET)(%rsp)
- movaps %xmm1, (LR_XMM_OFFSET + XMM_SIZE)(%rsp)
- movaps %xmm2, (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp)
- movaps %xmm3, (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp)
- movaps %xmm4, (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp)
- movaps %xmm5, (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp)
- movaps %xmm6, (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp)
- movaps %xmm7, (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp)
-
-# ifndef __ILP32__
-# ifdef HAVE_MPX_SUPPORT
- bndmov %bnd0, (LR_BND_OFFSET)(%rsp) # Preserve bound
- bndmov %bnd1, (LR_BND_OFFSET + BND_SIZE)(%rsp) # registers. Nops if
- bndmov %bnd2, (LR_BND_OFFSET + BND_SIZE*2)(%rsp) # MPX not available
- bndmov %bnd3, (LR_BND_OFFSET + BND_SIZE*3)(%rsp) # or disabled.
-# else
- .byte 0x66,0x0f,0x1b,0x84,0x24;.long (LR_BND_OFFSET)
- .byte 0x66,0x0f,0x1b,0x8c,0x24;.long (LR_BND_OFFSET + BND_SIZE)
- .byte 0x66,0x0f,0x1b,0x94,0x24;.long (LR_BND_OFFSET + BND_SIZE*2)
- .byte 0x66,0x0f,0x1b,0x9c,0x24;.long (LR_BND_OFFSET + BND_SIZE*3)
-# endif
-# endif
-
-# if defined HAVE_AVX_SUPPORT || defined HAVE_AVX512_ASM_SUPPORT
- .data
-L(have_avx):
- .zero 4
- .size L(have_avx), 4
- .previous
-
- cmpl $0, L(have_avx)(%rip)
- jne L(defined)
- movq %rbx, %r11 # Save rbx
- movl $1, %eax
- cpuid
- movq %r11,%rbx # Restore rbx
- xorl %eax, %eax
- // AVX and XSAVE supported?
- andl $((1 << 28) | (1 << 27)), %ecx
- cmpl $((1 << 28) | (1 << 27)), %ecx
- jne 10f
-# ifdef HAVE_AVX512_ASM_SUPPORT
- // AVX512 supported in processor?
- movq %rbx, %r11 # Save rbx
- xorl %ecx, %ecx
- mov $0x7, %eax
- cpuid
- andl $(1 << 16), %ebx
-# endif
- xorl %ecx, %ecx
- // Get XFEATURE_ENABLED_MASK
- xgetbv
-# ifdef HAVE_AVX512_ASM_SUPPORT
- test %ebx, %ebx
- movq %r11, %rbx # Restore rbx
- je 20f
- // Verify that XCR0[7:5] = '111b' and
- // XCR0[2:1] = '11b' which means
- // that zmm state is enabled
- andl $0xe6, %eax
- cmpl $0xe6, %eax
- jne 20f
- movl %eax, L(have_avx)(%rip)
-L(avx512):
-# define RESTORE_AVX
-# define VMOV vmovdqu64
-# define VEC(i) zmm##i
-# define MORE_CODE
-# include "dl-trampoline.h"
-# undef VMOV
-# undef VEC
-# undef RESTORE_AVX
-# endif
-20: andl $0x6, %eax
-10: subl $0x5, %eax
- movl %eax, L(have_avx)(%rip)
- cmpl $0, %eax
-
-L(defined):
- js L(no_avx)
-# ifdef HAVE_AVX512_ASM_SUPPORT
- cmpl $0xe6, L(have_avx)(%rip)
- je L(avx512)
-# endif
-
-# define RESTORE_AVX
-# define VMOV vmovdqu
-# define VEC(i) ymm##i
-# define MORE_CODE
-# include "dl-trampoline.h"
-
- .align 16
-L(no_avx):
-# endif
-
-# undef RESTORE_AVX
-# include "dl-trampoline.h"
-
- cfi_endproc
- .size _dl_runtime_profile, .-_dl_runtime_profile
+#define VEC(i) zmm##i
+#define _dl_runtime_resolve _dl_runtime_resolve_avx512
+#define _dl_runtime_profile _dl_runtime_profile_avx512
+#define RESTORE_AVX
+#include "dl-trampoline.h"
+#undef _dl_runtime_resolve
+#undef _dl_runtime_profile
+#undef VEC
+#undef VMOV
+#undef VMOVA
+#undef VEC_SIZE
+
+#define VEC_SIZE 32
+#define VMOVA vmovdqa
+#if DL_RUNIME_RESOLVE_REALIGN_STACK || VEC_SIZE <= DL_STACK_ALIGNMENT
+# define VMOV vmovdqa
+#else
+# define VMOV vmovdqu
#endif
-
-
-#ifdef SHARED
- .globl _dl_x86_64_save_sse
- .type _dl_x86_64_save_sse, @function
- .align 16
- cfi_startproc
-_dl_x86_64_save_sse:
-# if defined HAVE_AVX_SUPPORT || defined HAVE_AVX512_ASM_SUPPORT
- cmpl $0, L(have_avx)(%rip)
- jne L(defined_5)
- movq %rbx, %r11 # Save rbx
- movl $1, %eax
- cpuid
- movq %r11,%rbx # Restore rbx
- xorl %eax, %eax
- // AVX and XSAVE supported?
- andl $((1 << 28) | (1 << 27)), %ecx
- cmpl $((1 << 28) | (1 << 27)), %ecx
- jne 1f
-# ifdef HAVE_AVX512_ASM_SUPPORT
- // AVX512 supported in a processor?
- movq %rbx, %r11 # Save rbx
- xorl %ecx,%ecx
- mov $0x7,%eax
- cpuid
- andl $(1 << 16), %ebx
-# endif
- xorl %ecx, %ecx
- // Get XFEATURE_ENABLED_MASK
- xgetbv
-# ifdef HAVE_AVX512_ASM_SUPPORT
- test %ebx, %ebx
- movq %r11, %rbx # Restore rbx
- je 2f
- // Verify that XCR0[7:5] = '111b' and
- // XCR0[2:1] = '11b' which means
- // that zmm state is enabled
- andl $0xe6, %eax
- movl %eax, L(have_avx)(%rip)
- cmpl $0xe6, %eax
- je L(avx512_5)
-# endif
-
-2: andl $0x6, %eax
-1: subl $0x5, %eax
- movl %eax, L(have_avx)(%rip)
- cmpl $0, %eax
-
-L(defined_5):
- js L(no_avx5)
-# ifdef HAVE_AVX512_ASM_SUPPORT
- cmpl $0xe6, L(have_avx)(%rip)
- je L(avx512_5)
-# endif
-
- vmovdqa %ymm0, %fs:RTLD_SAVESPACE_SSE+0*YMM_SIZE
- vmovdqa %ymm1, %fs:RTLD_SAVESPACE_SSE+1*YMM_SIZE
- vmovdqa %ymm2, %fs:RTLD_SAVESPACE_SSE+2*YMM_SIZE
- vmovdqa %ymm3, %fs:RTLD_SAVESPACE_SSE+3*YMM_SIZE
- vmovdqa %ymm4, %fs:RTLD_SAVESPACE_SSE+4*YMM_SIZE
- vmovdqa %ymm5, %fs:RTLD_SAVESPACE_SSE+5*YMM_SIZE
- vmovdqa %ymm6, %fs:RTLD_SAVESPACE_SSE+6*YMM_SIZE
- vmovdqa %ymm7, %fs:RTLD_SAVESPACE_SSE+7*YMM_SIZE
- ret
-# ifdef HAVE_AVX512_ASM_SUPPORT
-L(avx512_5):
- vmovdqu64 %zmm0, %fs:RTLD_SAVESPACE_SSE+0*ZMM_SIZE
- vmovdqu64 %zmm1, %fs:RTLD_SAVESPACE_SSE+1*ZMM_SIZE
- vmovdqu64 %zmm2, %fs:RTLD_SAVESPACE_SSE+2*ZMM_SIZE
- vmovdqu64 %zmm3, %fs:RTLD_SAVESPACE_SSE+3*ZMM_SIZE
- vmovdqu64 %zmm4, %fs:RTLD_SAVESPACE_SSE+4*ZMM_SIZE
- vmovdqu64 %zmm5, %fs:RTLD_SAVESPACE_SSE+5*ZMM_SIZE
- vmovdqu64 %zmm6, %fs:RTLD_SAVESPACE_SSE+6*ZMM_SIZE
- vmovdqu64 %zmm7, %fs:RTLD_SAVESPACE_SSE+7*ZMM_SIZE
- ret
-# endif
-L(no_avx5):
-# endif
- movdqa %xmm0, %fs:RTLD_SAVESPACE_SSE+0*XMM_SIZE
- movdqa %xmm1, %fs:RTLD_SAVESPACE_SSE+1*XMM_SIZE
- movdqa %xmm2, %fs:RTLD_SAVESPACE_SSE+2*XMM_SIZE
- movdqa %xmm3, %fs:RTLD_SAVESPACE_SSE+3*XMM_SIZE
- movdqa %xmm4, %fs:RTLD_SAVESPACE_SSE+4*XMM_SIZE
- movdqa %xmm5, %fs:RTLD_SAVESPACE_SSE+5*XMM_SIZE
- movdqa %xmm6, %fs:RTLD_SAVESPACE_SSE+6*XMM_SIZE
- movdqa %xmm7, %fs:RTLD_SAVESPACE_SSE+7*XMM_SIZE
- ret
- cfi_endproc
- .size _dl_x86_64_save_sse, .-_dl_x86_64_save_sse
-
-
- .globl _dl_x86_64_restore_sse
- .type _dl_x86_64_restore_sse, @function
- .align 16
- cfi_startproc
-_dl_x86_64_restore_sse:
-# if defined HAVE_AVX_SUPPORT || defined HAVE_AVX512_ASM_SUPPORT
- cmpl $0, L(have_avx)(%rip)
- js L(no_avx6)
-# ifdef HAVE_AVX512_ASM_SUPPORT
- cmpl $0xe6, L(have_avx)(%rip)
- je L(avx512_6)
-# endif
-
- vmovdqa %fs:RTLD_SAVESPACE_SSE+0*YMM_SIZE, %ymm0
- vmovdqa %fs:RTLD_SAVESPACE_SSE+1*YMM_SIZE, %ymm1
- vmovdqa %fs:RTLD_SAVESPACE_SSE+2*YMM_SIZE, %ymm2
- vmovdqa %fs:RTLD_SAVESPACE_SSE+3*YMM_SIZE, %ymm3
- vmovdqa %fs:RTLD_SAVESPACE_SSE+4*YMM_SIZE, %ymm4
- vmovdqa %fs:RTLD_SAVESPACE_SSE+5*YMM_SIZE, %ymm5
- vmovdqa %fs:RTLD_SAVESPACE_SSE+6*YMM_SIZE, %ymm6
- vmovdqa %fs:RTLD_SAVESPACE_SSE+7*YMM_SIZE, %ymm7
- ret
-# ifdef HAVE_AVX512_ASM_SUPPORT
-L(avx512_6):
- vmovdqu64 %fs:RTLD_SAVESPACE_SSE+0*ZMM_SIZE, %zmm0
- vmovdqu64 %fs:RTLD_SAVESPACE_SSE+1*ZMM_SIZE, %zmm1
- vmovdqu64 %fs:RTLD_SAVESPACE_SSE+2*ZMM_SIZE, %zmm2
- vmovdqu64 %fs:RTLD_SAVESPACE_SSE+3*ZMM_SIZE, %zmm3
- vmovdqu64 %fs:RTLD_SAVESPACE_SSE+4*ZMM_SIZE, %zmm4
- vmovdqu64 %fs:RTLD_SAVESPACE_SSE+5*ZMM_SIZE, %zmm5
- vmovdqu64 %fs:RTLD_SAVESPACE_SSE+6*ZMM_SIZE, %zmm6
- vmovdqu64 %fs:RTLD_SAVESPACE_SSE+7*ZMM_SIZE, %zmm7
- ret
-# endif
-L(no_avx6):
-# endif
- movdqa %fs:RTLD_SAVESPACE_SSE+0*XMM_SIZE, %xmm0
- movdqa %fs:RTLD_SAVESPACE_SSE+1*XMM_SIZE, %xmm1
- movdqa %fs:RTLD_SAVESPACE_SSE+2*XMM_SIZE, %xmm2
- movdqa %fs:RTLD_SAVESPACE_SSE+3*XMM_SIZE, %xmm3
- movdqa %fs:RTLD_SAVESPACE_SSE+4*XMM_SIZE, %xmm4
- movdqa %fs:RTLD_SAVESPACE_SSE+5*XMM_SIZE, %xmm5
- movdqa %fs:RTLD_SAVESPACE_SSE+6*XMM_SIZE, %xmm6
- movdqa %fs:RTLD_SAVESPACE_SSE+7*XMM_SIZE, %xmm7
- ret
- cfi_endproc
- .size _dl_x86_64_restore_sse, .-_dl_x86_64_restore_sse
+#define VEC(i) ymm##i
+#define _dl_runtime_resolve _dl_runtime_resolve_avx
+#define _dl_runtime_profile _dl_runtime_profile_avx
+#include "dl-trampoline.h"
+#undef _dl_runtime_resolve
+#undef _dl_runtime_profile
+#undef VEC
+#undef VMOV
+#undef VMOVA
+#undef VEC_SIZE
+
+/* movaps/movups is 1-byte shorter. */
+#define VEC_SIZE 16
+#define VMOVA movaps
+#if DL_RUNIME_RESOLVE_REALIGN_STACK || VEC_SIZE <= DL_STACK_ALIGNMENT
+# define VMOV movaps
+#else
+# define VMOV movups
#endif
+#define VEC(i) xmm##i
+#define _dl_runtime_resolve _dl_runtime_resolve_sse
+#define _dl_runtime_profile _dl_runtime_profile_sse
+#undef RESTORE_AVX
+#include "dl-trampoline.h"
diff --git a/sysdeps/x86_64/dl-trampoline.h b/sysdeps/x86_64/dl-trampoline.h
index d542428..dd6d7c7 100644
--- a/sysdeps/x86_64/dl-trampoline.h
+++ b/sysdeps/x86_64/dl-trampoline.h
@@ -1,5 +1,4 @@
-/* Partial PLT profile trampoline to save and restore x86-64 vector
- registers.
+/* PLT trampolines. x86-64 version.
Copyright (C) 2009-2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -17,16 +16,252 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#ifdef RESTORE_AVX
+#undef REGISTER_SAVE_AREA_RAW
+#ifdef __ILP32__
+/* X32 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as VEC0 to
+ VEC7. */
+# define REGISTER_SAVE_AREA_RAW (8 * 7 + VEC_SIZE * 8)
+#else
+/* X86-64 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as
+ BND0, BND1, BND2, BND3 and VEC0 to VEC7. */
+# define REGISTER_SAVE_AREA_RAW (8 * 7 + 16 * 4 + VEC_SIZE * 8)
+#endif
+
+#undef REGISTER_SAVE_AREA
+#undef LOCAL_STORAGE_AREA
+#undef BASE
+#if DL_RUNIME_RESOLVE_REALIGN_STACK
+# define REGISTER_SAVE_AREA (REGISTER_SAVE_AREA_RAW + 8)
+/* Local stack area before jumping to function address: RBX. */
+# define LOCAL_STORAGE_AREA 8
+# define BASE rbx
+# if (REGISTER_SAVE_AREA % VEC_SIZE) != 0
+# error REGISTER_SAVE_AREA must be multples of VEC_SIZE
+# endif
+#else
+# define REGISTER_SAVE_AREA REGISTER_SAVE_AREA_RAW
+/* Local stack area before jumping to function address: All saved
+ registers. */
+# define LOCAL_STORAGE_AREA REGISTER_SAVE_AREA
+# define BASE rsp
+# if (REGISTER_SAVE_AREA % 16) != 8
+# error REGISTER_SAVE_AREA must be odd multples of 8
+# endif
+#endif
+
+ .text
+ .globl _dl_runtime_resolve
+ .hidden _dl_runtime_resolve
+ .type _dl_runtime_resolve, @function
+ .align 16
+ cfi_startproc
+_dl_runtime_resolve:
+ cfi_adjust_cfa_offset(16) # Incorporate PLT
+#if DL_RUNIME_RESOLVE_REALIGN_STACK
+# if LOCAL_STORAGE_AREA != 8
+# error LOCAL_STORAGE_AREA must be 8
+# endif
+ pushq %rbx # push subtracts stack by 8.
+ cfi_adjust_cfa_offset(8)
+ cfi_rel_offset(%rbx, 0)
+ mov %RSP_LP, %RBX_LP
+ cfi_def_cfa_register(%rbx)
+ and $-VEC_SIZE, %RSP_LP
+#endif
+ sub $REGISTER_SAVE_AREA, %RSP_LP
+ cfi_adjust_cfa_offset(REGISTER_SAVE_AREA)
+ # Preserve registers otherwise clobbered.
+ movq %rax, REGISTER_SAVE_RAX(%rsp)
+ movq %rcx, REGISTER_SAVE_RCX(%rsp)
+ movq %rdx, REGISTER_SAVE_RDX(%rsp)
+ movq %rsi, REGISTER_SAVE_RSI(%rsp)
+ movq %rdi, REGISTER_SAVE_RDI(%rsp)
+ movq %r8, REGISTER_SAVE_R8(%rsp)
+ movq %r9, REGISTER_SAVE_R9(%rsp)
+ VMOV %VEC(0), (REGISTER_SAVE_VEC_OFF)(%rsp)
+ VMOV %VEC(1), (REGISTER_SAVE_VEC_OFF + VEC_SIZE)(%rsp)
+ VMOV %VEC(2), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 2)(%rsp)
+ VMOV %VEC(3), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 3)(%rsp)
+ VMOV %VEC(4), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 4)(%rsp)
+ VMOV %VEC(5), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 5)(%rsp)
+ VMOV %VEC(6), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 6)(%rsp)
+ VMOV %VEC(7), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 7)(%rsp)
+#ifndef __ILP32__
+ # We also have to preserve bound registers. These are nops if
+ # Intel MPX isn't available or disabled.
+# ifdef HAVE_MPX_SUPPORT
+ bndmov %bnd0, REGISTER_SAVE_BND0(%rsp)
+ bndmov %bnd1, REGISTER_SAVE_BND1(%rsp)
+ bndmov %bnd2, REGISTER_SAVE_BND2(%rsp)
+ bndmov %bnd3, REGISTER_SAVE_BND3(%rsp)
+# else
+# if REGISTER_SAVE_BND0 == 0
+ .byte 0x66,0x0f,0x1b,0x04,0x24
+# else
+ .byte 0x66,0x0f,0x1b,0x44,0x24,REGISTER_SAVE_BND0
+# endif
+ .byte 0x66,0x0f,0x1b,0x4c,0x24,REGISTER_SAVE_BND1
+ .byte 0x66,0x0f,0x1b,0x54,0x24,REGISTER_SAVE_BND2
+ .byte 0x66,0x0f,0x1b,0x5c,0x24,REGISTER_SAVE_BND3
+# endif
+#endif
+ # Copy args pushed by PLT in register.
+ # %rdi: link_map, %rsi: reloc_index
+ mov (LOCAL_STORAGE_AREA + 8)(%BASE), %RSI_LP
+ mov LOCAL_STORAGE_AREA(%BASE), %RDI_LP
+ call _dl_fixup # Call resolver.
+ mov %RAX_LP, %R11_LP # Save return value
+#ifndef __ILP32__
+ # Restore bound registers. These are nops if Intel MPX isn't
+ # avaiable or disabled.
+# ifdef HAVE_MPX_SUPPORT
+ bndmov REGISTER_SAVE_BND3(%rsp), %bnd3
+ bndmov REGISTER_SAVE_BND2(%rsp), %bnd2
+ bndmov REGISTER_SAVE_BND1(%rsp), %bnd1
+ bndmov REGISTER_SAVE_BND0(%rsp), %bnd0
+# else
+ .byte 0x66,0x0f,0x1a,0x5c,0x24,REGISTER_SAVE_BND3
+ .byte 0x66,0x0f,0x1a,0x54,0x24,REGISTER_SAVE_BND2
+ .byte 0x66,0x0f,0x1a,0x4c,0x24,REGISTER_SAVE_BND1
+# if REGISTER_SAVE_BND0 == 0
+ .byte 0x66,0x0f,0x1a,0x04,0x24
+# else
+ .byte 0x66,0x0f,0x1a,0x44,0x24,REGISTER_SAVE_BND0
+# endif
+# endif
+#endif
+ # Get register content back.
+ movq REGISTER_SAVE_R9(%rsp), %r9
+ movq REGISTER_SAVE_R8(%rsp), %r8
+ movq REGISTER_SAVE_RDI(%rsp), %rdi
+ movq REGISTER_SAVE_RSI(%rsp), %rsi
+ movq REGISTER_SAVE_RDX(%rsp), %rdx
+ movq REGISTER_SAVE_RCX(%rsp), %rcx
+ movq REGISTER_SAVE_RAX(%rsp), %rax
+ VMOV (REGISTER_SAVE_VEC_OFF)(%rsp), %VEC(0)
+ VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE)(%rsp), %VEC(1)
+ VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 2)(%rsp), %VEC(2)
+ VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 3)(%rsp), %VEC(3)
+ VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 4)(%rsp), %VEC(4)
+ VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 5)(%rsp), %VEC(5)
+ VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 6)(%rsp), %VEC(6)
+ VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 7)(%rsp), %VEC(7)
+#if DL_RUNIME_RESOLVE_REALIGN_STACK
+ mov %RBX_LP, %RSP_LP
+ cfi_def_cfa_register(%rsp)
+ movq (%rsp), %rbx
+ cfi_restore(%rbx)
+#endif
+ # Adjust stack(PLT did 2 pushes)
+ add $(LOCAL_STORAGE_AREA + 16), %RSP_LP
+ cfi_adjust_cfa_offset(-(LOCAL_STORAGE_AREA + 16))
+ # Preserve bound registers.
+ PRESERVE_BND_REGS_PREFIX
+ jmp *%r11 # Jump to function address.
+ cfi_endproc
+ .size _dl_runtime_resolve, .-_dl_runtime_resolve
+
+
+#ifndef PROF
+# if (LR_VECTOR_OFFSET % VEC_SIZE) != 0
+# error LR_VECTOR_OFFSET must be multples of VEC_SIZE
+# endif
+
+ .globl _dl_runtime_profile
+ .hidden _dl_runtime_profile
+ .type _dl_runtime_profile, @function
+ .align 16
+_dl_runtime_profile:
+ cfi_startproc
+ cfi_adjust_cfa_offset(16) # Incorporate PLT
+ /* The La_x86_64_regs data structure pointed to by the
+ fourth paramater must be VEC_SIZE-byte aligned. This must
+ be explicitly enforced. We have the set up a dynamically
+ sized stack frame. %rbx points to the top half which
+ has a fixed size and preserves the original stack pointer. */
+
+ sub $32, %RSP_LP # Allocate the local storage.
+ cfi_adjust_cfa_offset(32)
+ movq %rbx, (%rsp)
+ cfi_rel_offset(%rbx, 0)
+
+ /* On the stack:
+ 56(%rbx) parameter #1
+ 48(%rbx) return address
+
+ 40(%rbx) reloc index
+ 32(%rbx) link_map
+
+ 24(%rbx) La_x86_64_regs pointer
+ 16(%rbx) framesize
+ 8(%rbx) rax
+ (%rbx) rbx
+ */
+
+ movq %rax, 8(%rsp)
+ mov %RSP_LP, %RBX_LP
+ cfi_def_cfa_register(%rbx)
+
+ /* Actively align the La_x86_64_regs structure. */
+ and $-VEC_SIZE, %RSP_LP
+# if defined HAVE_AVX_SUPPORT || defined HAVE_AVX512_ASM_SUPPORT
+ /* sizeof(La_x86_64_regs). Need extra space for 8 SSE registers
+ to detect if any xmm0-xmm7 registers are changed by audit
+ module. */
+ sub $(LR_SIZE + XMM_SIZE*8), %RSP_LP
+# else
+ sub $LR_SIZE, %RSP_LP # sizeof(La_x86_64_regs)
+# endif
+ movq %rsp, 24(%rbx)
+
+ /* Fill the La_x86_64_regs structure. */
+ movq %rdx, LR_RDX_OFFSET(%rsp)
+ movq %r8, LR_R8_OFFSET(%rsp)
+ movq %r9, LR_R9_OFFSET(%rsp)
+ movq %rcx, LR_RCX_OFFSET(%rsp)
+ movq %rsi, LR_RSI_OFFSET(%rsp)
+ movq %rdi, LR_RDI_OFFSET(%rsp)
+ movq %rbp, LR_RBP_OFFSET(%rsp)
+
+ lea 48(%rbx), %RAX_LP
+ movq %rax, LR_RSP_OFFSET(%rsp)
+
+ /* We always store the XMM registers even if AVX is available.
+ This is to provide backward binary compatibility for existing
+ audit modules. */
+ movaps %xmm0, (LR_XMM_OFFSET)(%rsp)
+ movaps %xmm1, (LR_XMM_OFFSET + XMM_SIZE)(%rsp)
+ movaps %xmm2, (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp)
+ movaps %xmm3, (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp)
+ movaps %xmm4, (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp)
+ movaps %xmm5, (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp)
+ movaps %xmm6, (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp)
+ movaps %xmm7, (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp)
+
+# ifndef __ILP32__
+# ifdef HAVE_MPX_SUPPORT
+ bndmov %bnd0, (LR_BND_OFFSET)(%rsp) # Preserve bound
+ bndmov %bnd1, (LR_BND_OFFSET + BND_SIZE)(%rsp) # registers. Nops if
+ bndmov %bnd2, (LR_BND_OFFSET + BND_SIZE*2)(%rsp) # MPX not available
+ bndmov %bnd3, (LR_BND_OFFSET + BND_SIZE*3)(%rsp) # or disabled.
+# else
+ .byte 0x66,0x0f,0x1b,0x84,0x24;.long (LR_BND_OFFSET)
+ .byte 0x66,0x0f,0x1b,0x8c,0x24;.long (LR_BND_OFFSET + BND_SIZE)
+ .byte 0x66,0x0f,0x1b,0x94,0x24;.long (LR_BND_OFFSET + BND_SIZE*2)
+ .byte 0x66,0x0f,0x1b,0x9c,0x24;.long (LR_BND_OFFSET + BND_SIZE*3)
+# endif
+# endif
+
+# ifdef RESTORE_AVX
/* This is to support AVX audit modules. */
- VMOV %VEC(0), (LR_VECTOR_OFFSET)(%rsp)
- VMOV %VEC(1), (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp)
- VMOV %VEC(2), (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp)
- VMOV %VEC(3), (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp)
- VMOV %VEC(4), (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp)
- VMOV %VEC(5), (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp)
- VMOV %VEC(6), (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp)
- VMOV %VEC(7), (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp)
+ VMOVA %VEC(0), (LR_VECTOR_OFFSET)(%rsp)
+ VMOVA %VEC(1), (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp)
+ VMOVA %VEC(2), (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp)
+ VMOVA %VEC(3), (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp)
+ VMOVA %VEC(4), (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp)
+ VMOVA %VEC(5), (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp)
+ VMOVA %VEC(6), (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp)
+ VMOVA %VEC(7), (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp)
/* Save xmm0-xmm7 registers to detect if any of them are
changed by audit module. */
@@ -38,7 +273,7 @@
vmovdqa %xmm5, (LR_SIZE + XMM_SIZE*5)(%rsp)
vmovdqa %xmm6, (LR_SIZE + XMM_SIZE*6)(%rsp)
vmovdqa %xmm7, (LR_SIZE + XMM_SIZE*7)(%rsp)
-#endif
+# endif
mov %RSP_LP, %RCX_LP # La_x86_64_regs pointer to %rcx.
mov 48(%rbx), %RDX_LP # Load return address if needed.
@@ -63,7 +298,7 @@
movaps (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp), %xmm6
movaps (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp), %xmm7
-#ifdef RESTORE_AVX
+# ifdef RESTORE_AVX
/* Check if any xmm0-xmm7 registers are changed by audit
module. */
vpcmpeqq (LR_SIZE)(%rsp), %xmm0, %xmm8
@@ -72,7 +307,7 @@
je 2f
vmovdqa %xmm0, (LR_VECTOR_OFFSET)(%rsp)
jmp 1f
-2: VMOV (LR_VECTOR_OFFSET)(%rsp), %VEC(0)
+2: VMOVA (LR_VECTOR_OFFSET)(%rsp), %VEC(0)
vmovdqa %xmm0, (LR_XMM_OFFSET)(%rsp)
1: vpcmpeqq (LR_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm8
@@ -81,7 +316,7 @@
je 2f
vmovdqa %xmm1, (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp)
jmp 1f
-2: VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp), %VEC(1)
+2: VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp), %VEC(1)
vmovdqa %xmm1, (LR_XMM_OFFSET + XMM_SIZE)(%rsp)
1: vpcmpeqq (LR_SIZE + XMM_SIZE*2)(%rsp), %xmm2, %xmm8
@@ -90,7 +325,7 @@
je 2f
vmovdqa %xmm2, (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp)
jmp 1f
-2: VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp), %VEC(2)
+2: VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp), %VEC(2)
vmovdqa %xmm2, (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp)
1: vpcmpeqq (LR_SIZE + XMM_SIZE*3)(%rsp), %xmm3, %xmm8
@@ -99,7 +334,7 @@
je 2f
vmovdqa %xmm3, (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp)
jmp 1f
-2: VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp), %VEC(3)
+2: VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp), %VEC(3)
vmovdqa %xmm3, (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp)
1: vpcmpeqq (LR_SIZE + XMM_SIZE*4)(%rsp), %xmm4, %xmm8
@@ -108,7 +343,7 @@
je 2f
vmovdqa %xmm4, (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp)
jmp 1f
-2: VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp), %VEC(4)
+2: VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp), %VEC(4)
vmovdqa %xmm4, (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp)
1: vpcmpeqq (LR_SIZE + XMM_SIZE*5)(%rsp), %xmm5, %xmm8
@@ -117,7 +352,7 @@
je 2f
vmovdqa %xmm5, (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp)
jmp 1f
-2: VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp), %VEC(5)
+2: VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp), %VEC(5)
vmovdqa %xmm5, (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp)
1: vpcmpeqq (LR_SIZE + XMM_SIZE*6)(%rsp), %xmm6, %xmm8
@@ -126,7 +361,7 @@
je 2f
vmovdqa %xmm6, (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp)
jmp 1f
-2: VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp), %VEC(6)
+2: VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp), %VEC(6)
vmovdqa %xmm6, (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp)
1: vpcmpeqq (LR_SIZE + XMM_SIZE*7)(%rsp), %xmm7, %xmm8
@@ -135,25 +370,25 @@
je 2f
vmovdqa %xmm7, (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp)
jmp 1f
-2: VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp), %VEC(7)
+2: VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp), %VEC(7)
vmovdqa %xmm7, (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp)
1:
-#endif
+# endif
-#ifndef __ILP32__
-# ifdef HAVE_MPX_SUPPORT
+# ifndef __ILP32__
+# ifdef HAVE_MPX_SUPPORT
bndmov (LR_BND_OFFSET)(%rsp), %bnd0 # Restore bound
bndmov (LR_BND_OFFSET + BND_SIZE)(%rsp), %bnd1 # registers.
bndmov (LR_BND_OFFSET + BND_SIZE*2)(%rsp), %bnd2
bndmov (LR_BND_OFFSET + BND_SIZE*3)(%rsp), %bnd3
-# else
+# else
.byte 0x66,0x0f,0x1a,0x84,0x24;.long (LR_BND_OFFSET)
.byte 0x66,0x0f,0x1a,0x8c,0x24;.long (LR_BND_OFFSET + BND_SIZE)
.byte 0x66,0x0f,0x1a,0x94,0x24;.long (LR_BND_OFFSET + BND_SIZE*2)
.byte 0x66,0x0f,0x1a,0x9c,0x24;.long (LR_BND_OFFSET + BND_SIZE*3)
+# endif
# endif
-#endif
mov 16(%rbx), %R10_LP # Anything in framesize?
test %R10_LP, %R10_LP
@@ -168,12 +403,12 @@
movq LR_RSI_OFFSET(%rsp), %rsi
movq LR_RDI_OFFSET(%rsp), %rdi
- movq %rbx, %rsp
+ mov %RBX_LP, %RSP_LP
movq (%rsp), %rbx
- cfi_restore(rbx)
+ cfi_restore(%rbx)
cfi_def_cfa_register(%rsp)
- addq $48, %rsp # Adjust the stack to the return value
+ add $48, %RSP_LP # Adjust the stack to the return value
# (eats the reloc index and link_map)
cfi_adjust_cfa_offset(-48)
PRESERVE_BND_REGS_PREFIX
@@ -189,13 +424,13 @@
temporary buffer of the size specified by the 'framesize'
returned from _dl_profile_fixup */
- leaq LR_RSP_OFFSET(%rbx), %rsi # stack
- addq $8, %r10
- andq $0xfffffffffffffff0, %r10
- movq %r10, %rcx
- subq %r10, %rsp
- movq %rsp, %rdi
- shrq $3, %rcx
+ lea LR_RSP_OFFSET(%rbx), %RSI_LP # stack
+ add $8, %R10_LP
+ and $-16, %R10_LP
+ mov %R10_LP, %RCX_LP
+ sub %R10_LP, %RSP_LP
+ mov %RSP_LP, %RDI_LP
+ shr $3, %RCX_LP
rep
movsq
@@ -206,21 +441,21 @@
PRESERVE_BND_REGS_PREFIX
call *%r11
- mov 24(%rbx), %rsp # Drop the copied stack content
+ mov 24(%rbx), %RSP_LP # Drop the copied stack content
/* Now we have to prepare the La_x86_64_retval structure for the
_dl_call_pltexit. The La_x86_64_regs is being pointed by rsp now,
so we just need to allocate the sizeof(La_x86_64_retval) space on
the stack, since the alignment has already been taken care of. */
-#ifdef RESTORE_AVX
+# ifdef RESTORE_AVX
/* sizeof(La_x86_64_retval). Need extra space for 2 SSE
registers to detect if xmm0/xmm1 registers are changed
by audit module. */
- subq $(LRV_SIZE + XMM_SIZE*2), %rsp
-#else
- subq $LRV_SIZE, %rsp # sizeof(La_x86_64_retval)
-#endif
- movq %rsp, %rcx # La_x86_64_retval argument to %rcx.
+ sub $(LRV_SIZE + XMM_SIZE*2), %RSP_LP
+# else
+ sub $LRV_SIZE, %RSP_LP # sizeof(La_x86_64_retval)
+# endif
+ mov %RSP_LP, %RCX_LP # La_x86_64_retval argument to %rcx.
/* Fill in the La_x86_64_retval structure. */
movq %rax, LRV_RAX_OFFSET(%rcx)
@@ -229,26 +464,26 @@
movaps %xmm0, LRV_XMM0_OFFSET(%rcx)
movaps %xmm1, LRV_XMM1_OFFSET(%rcx)
-#ifdef RESTORE_AVX
+# ifdef RESTORE_AVX
/* This is to support AVX audit modules. */
- VMOV %VEC(0), LRV_VECTOR0_OFFSET(%rcx)
- VMOV %VEC(1), LRV_VECTOR1_OFFSET(%rcx)
+ VMOVA %VEC(0), LRV_VECTOR0_OFFSET(%rcx)
+ VMOVA %VEC(1), LRV_VECTOR1_OFFSET(%rcx)
/* Save xmm0/xmm1 registers to detect if they are changed
by audit module. */
vmovdqa %xmm0, (LRV_SIZE)(%rcx)
vmovdqa %xmm1, (LRV_SIZE + XMM_SIZE)(%rcx)
-#endif
+# endif
-#ifndef __ILP32__
-# ifdef HAVE_MPX_SUPPORT
+# ifndef __ILP32__
+# ifdef HAVE_MPX_SUPPORT
bndmov %bnd0, LRV_BND0_OFFSET(%rcx) # Preserve returned bounds.
bndmov %bnd1, LRV_BND1_OFFSET(%rcx)
-# else
+# else
.byte 0x66,0x0f,0x1b,0x81;.long (LRV_BND0_OFFSET)
.byte 0x66,0x0f,0x1b,0x89;.long (LRV_BND1_OFFSET)
+# endif
# endif
-#endif
fstpt LRV_ST0_OFFSET(%rcx)
fstpt LRV_ST1_OFFSET(%rcx)
@@ -265,50 +500,47 @@
movaps LRV_XMM0_OFFSET(%rsp), %xmm0
movaps LRV_XMM1_OFFSET(%rsp), %xmm1
-#ifdef RESTORE_AVX
+# ifdef RESTORE_AVX
/* Check if xmm0/xmm1 registers are changed by audit module. */
vpcmpeqq (LRV_SIZE)(%rsp), %xmm0, %xmm2
vpmovmskb %xmm2, %esi
cmpl $0xffff, %esi
jne 1f
- VMOV LRV_VECTOR0_OFFSET(%rsp), %VEC(0)
+ VMOVA LRV_VECTOR0_OFFSET(%rsp), %VEC(0)
1: vpcmpeqq (LRV_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm2
vpmovmskb %xmm2, %esi
cmpl $0xffff, %esi
jne 1f
- VMOV LRV_VECTOR1_OFFSET(%rsp), %VEC(1)
+ VMOVA LRV_VECTOR1_OFFSET(%rsp), %VEC(1)
1:
-#endif
+# endif
-#ifndef __ILP32__
-# ifdef HAVE_MPX_SUPPORT
+# ifndef __ILP32__
+# ifdef HAVE_MPX_SUPPORT
bndmov LRV_BND0_OFFSET(%rsp), %bnd0 # Restore bound registers.
bndmov LRV_BND1_OFFSET(%rsp), %bnd1
-# else
+# else
.byte 0x66,0x0f,0x1a,0x84,0x24;.long (LRV_BND0_OFFSET)
.byte 0x66,0x0f,0x1a,0x8c,0x24;.long (LRV_BND1_OFFSET)
+# endif
# endif
-#endif
fldt LRV_ST1_OFFSET(%rsp)
fldt LRV_ST0_OFFSET(%rsp)
- movq %rbx, %rsp
+ mov %RBX_LP, %RSP_LP
movq (%rsp), %rbx
- cfi_restore(rbx)
+ cfi_restore(%rbx)
cfi_def_cfa_register(%rsp)
- addq $48, %rsp # Adjust the stack to the return value
+ add $48, %RSP_LP # Adjust the stack to the return value
# (eats the reloc index and link_map)
cfi_adjust_cfa_offset(-48)
PRESERVE_BND_REGS_PREFIX
retq
-#ifdef MORE_CODE
- cfi_adjust_cfa_offset(48)
- cfi_rel_offset(%rbx, 0)
- cfi_def_cfa_register(%rbx)
-# undef MORE_CODE
+ cfi_endproc
+ .size _dl_runtime_profile, .-_dl_runtime_profile
#endif
diff --git a/sysdeps/x86_64/ifuncmain8.c b/sysdeps/x86_64/ifuncmain8.c
new file mode 100644
index 0000000..f75771d
--- /dev/null
+++ b/sysdeps/x86_64/ifuncmain8.c
@@ -0,0 +1,32 @@
+/* Test IFUNC selector with floating-point parameters.
+ Copyright (C) 2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <stdlib.h>
+
+extern float foo (float);
+
+static int
+do_test (void)
+{
+ if (foo (2) != 3)
+ abort ();
+ return 0;
+}
+
+#define TEST_FUNCTION do_test ()
+#include "../test-skeleton.c"
diff --git a/sysdeps/x86_64/ifuncmod8.c b/sysdeps/x86_64/ifuncmod8.c
new file mode 100644
index 0000000..741aa13
--- /dev/null
+++ b/sysdeps/x86_64/ifuncmod8.c
@@ -0,0 +1,36 @@
+/* Test IFUNC selector with floating-point parameters.
+ Copyright (C) 2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <emmintrin.h>
+
+void * foo_ifunc (void) __asm__ ("foo");
+__asm__(".type foo, %gnu_indirect_function");
+
+static float
+foo_impl (float x)
+{
+ return x + 1;
+}
+
+void *
+foo_ifunc (void)
+{
+ __m128i xmm = _mm_set1_epi32 (-1);
+ asm volatile ("movdqa %0, %%xmm0" : : "x" (xmm) : "xmm0" );
+ return foo_impl;
+}
diff --git a/sysdeps/x86_64/nptl/tcb-offsets.sym b/sysdeps/x86_64/nptl/tcb-offsets.sym
index 729d1da..aeb7526 100644
--- a/sysdeps/x86_64/nptl/tcb-offsets.sym
+++ b/sysdeps/x86_64/nptl/tcb-offsets.sym
@@ -16,7 +16,6 @@ VGETCPU_CACHE_OFFSET offsetof (tcbhead_t, vgetcpu_cache)
#ifndef __ASSUME_PRIVATE_FUTEX
PRIVATE_FUTEX offsetof (tcbhead_t, private_futex)
#endif
-RTLD_SAVESPACE_SSE offsetof (tcbhead_t, rtld_savespace_sse)
-- Not strictly offsets, but these values are also used in the TCB.
TCB_CANCELSTATE_BITMASK CANCELSTATE_BITMASK
diff --git a/sysdeps/x86_64/nptl/tls.h b/sysdeps/x86_64/nptl/tls.h
index d7543c6..b73e7ed 100644
--- a/sysdeps/x86_64/nptl/tls.h
+++ b/sysdeps/x86_64/nptl/tls.h
@@ -67,14 +67,15 @@ typedef struct
# else
int __glibc_reserved1;
# endif
- int rtld_must_xmm_save;
+ int __glibc_unused1;
/* Reservation of some values for the TM ABI. */
void *__private_tm[4];
/* GCC split stack support. */
void *__private_ss;
long int __glibc_reserved2;
- /* Have space for the post-AVX register size. */
- __128bits rtld_savespace_sse[8][4] __attribute__ ((aligned (32)));
+ /* Must be kept even if it is no longer used by glibc since programs,
+ like AddressSanitizer, depend on the size of tcbhead_t. */
+ __128bits __glibc_unused2[8][4] __attribute__ ((aligned (32)));
void *__padding[8];
} tcbhead_t;
@@ -384,41 +385,6 @@ typedef struct
# define THREAD_GSCOPE_WAIT() \
GL(dl_wait_lookup_done) ()
-
-# ifdef SHARED
-/* Defined in dl-trampoline.S. */
-extern void _dl_x86_64_save_sse (void);
-extern void _dl_x86_64_restore_sse (void);
-
-# define RTLD_CHECK_FOREIGN_CALL \
- (THREAD_GETMEM (THREAD_SELF, header.rtld_must_xmm_save) != 0)
-
-/* NB: Don't use the xchg operation because that would imply a lock
- prefix which is expensive and unnecessary. The cache line is also
- not contested at all. */
-# define RTLD_ENABLE_FOREIGN_CALL \
- int old_rtld_must_xmm_save = THREAD_GETMEM (THREAD_SELF, \
- header.rtld_must_xmm_save); \
- THREAD_SETMEM (THREAD_SELF, header.rtld_must_xmm_save, 1)
-
-# define RTLD_PREPARE_FOREIGN_CALL \
- do if (THREAD_GETMEM (THREAD_SELF, header.rtld_must_xmm_save)) \
- { \
- _dl_x86_64_save_sse (); \
- THREAD_SETMEM (THREAD_SELF, header.rtld_must_xmm_save, 0); \
- } \
- while (0)
-
-# define RTLD_FINALIZE_FOREIGN_CALL \
- do { \
- if (THREAD_GETMEM (THREAD_SELF, header.rtld_must_xmm_save) == 0) \
- _dl_x86_64_restore_sse (); \
- THREAD_SETMEM (THREAD_SELF, header.rtld_must_xmm_save, \
- old_rtld_must_xmm_save); \
- } while (0)
-# endif
-
-
#endif /* __ASSEMBLER__ */
#endif /* tls.h */
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=0622f480da5844b1f4c7f383fa71e8ed65a32264
commit 0622f480da5844b1f4c7f383fa71e8ed65a32264
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Sun Jul 12 14:41:20 2015 -0700
Align stack when calling __errno_location
The stack should be aligned to 16 bytes when calling __errno_location.
[BZ #18661]
* sysdeps/x86_64/fpu/s_cosf.S (__cosf): Align stack to 16 bytes
when calling __errno_location.
* sysdeps/x86_64/fpu/s_sincosf.S (__sincosf): Likewise.
* sysdeps/x86_64/fpu/s_sinf.S (__sinf): Likewise.
diff --git a/sysdeps/x86_64/fpu/s_cosf.S b/sysdeps/x86_64/fpu/s_cosf.S
index b7868ce..bea10ef 100644
--- a/sysdeps/x86_64/fpu/s_cosf.S
+++ b/sysdeps/x86_64/fpu/s_cosf.S
@@ -310,8 +310,14 @@ L(arg_inf_or_nan):
/* Here if |x| is Inf or NAN */
jne L(skip_errno_setting) /* in case of x is NaN */
+ /* Align stack to 16 bytes. */
+ subq $8, %rsp
+ cfi_adjust_cfa_offset (8)
/* Here if x is Inf. Set errno to EDOM. */
call JUMPTARGET(__errno_location)
+ addq $8, %rsp
+ cfi_adjust_cfa_offset (-8)
+
movl $EDOM, (%rax)
.p2align 4
diff --git a/sysdeps/x86_64/fpu/s_sincosf.S b/sysdeps/x86_64/fpu/s_sincosf.S
index 21db70a..a2f3133 100644
--- a/sysdeps/x86_64/fpu/s_sincosf.S
+++ b/sysdeps/x86_64/fpu/s_sincosf.S
@@ -354,8 +354,14 @@ L(arg_inf_or_nan):
/* Here if |x| is Inf or NAN */
jne L(skip_errno_setting) /* in case of x is NaN */
+ /* Align stack to 16 bytes. */
+ subq $8, %rsp
+ cfi_adjust_cfa_offset (8)
/* Here if x is Inf. Set errno to EDOM. */
call JUMPTARGET(__errno_location)
+ addq $8, %rsp
+ cfi_adjust_cfa_offset (-8)
+
movl $EDOM, (%rax)
.p2align 4
diff --git a/sysdeps/x86_64/fpu/s_sinf.S b/sysdeps/x86_64/fpu/s_sinf.S
index dc92164..90afbe8 100644
--- a/sysdeps/x86_64/fpu/s_sinf.S
+++ b/sysdeps/x86_64/fpu/s_sinf.S
@@ -336,8 +336,14 @@ L(arg_inf_or_nan):
/* Here if |x| is Inf or NAN */
jne L(skip_errno_setting) /* in case of x is NaN */
+ /* Align stack to 16 bytes. */
+ subq $8, %rsp
+ cfi_adjust_cfa_offset (8)
/* Here if x is Inf. Set errno to EDOM. */
call JUMPTARGET(__errno_location)
+ addq $8, %rsp
+ cfi_adjust_cfa_offset (-8)
+
movl $EDOM, (%rax)
.p2align 4
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=23662bce8ea3a3b1c822e95efc71f1f8602eeaf8
commit 23662bce8ea3a3b1c822e95efc71f1f8602eeaf8
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Sun Jul 12 14:40:25 2015 -0700
Align stack to 16 bytes when calling __gettimeofday
Subtract 24 bytes from the stack pointer instead of 16 bytes so that the
stack is aligned to 16 bytes when calling __gettimeofday.
[BZ #18661]
* sysdeps/unix/sysv/linux/x86_64/lowlevellock.S
(__lll_timedwait_tid): Align stack to 16 bytes when calling
__gettimeofday.
diff --git a/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S b/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S
index 0935db5..8e1a39d 100644
--- a/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S
+++ b/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S
@@ -394,8 +394,9 @@ __lll_timedwait_tid:
movq %rdi, %r12
movq %rsi, %r13
- subq $16, %rsp
- cfi_adjust_cfa_offset(16)
+ /* Align stack to 16 bytes when calling __gettimeofday. */
+ subq $24, %rsp
+ cfi_adjust_cfa_offset(24)
/* Get current time. */
2: movq %rsp, %rdi
@@ -441,8 +442,8 @@ __lll_timedwait_tid:
jne 1f
4: xorl %eax, %eax
-8: addq $16, %rsp
- cfi_adjust_cfa_offset(-16)
+8: addq $24, %rsp
+ cfi_adjust_cfa_offset(-24)
popq %r13
cfi_adjust_cfa_offset(-8)
cfi_restore(%r13)
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=ffc7fb0b97ba9a4f97800003077ff678a235fc8f
commit ffc7fb0b97ba9a4f97800003077ff678a235fc8f
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Sun Jul 12 14:38:58 2015 -0700
Align stack to 16 bytes when calling __setcontext
Don't use pop to restore %rdi, so that the stack remains aligned to 16 bytes
when calling __setcontext.
[BZ #18661]
* sysdeps/unix/sysv/linux/x86_64/__start_context.S
(__start_context): Don't use pop to restore %rdi so that stack
is aligned to 16 bytes when calling __setcontext.
diff --git a/sysdeps/unix/sysv/linux/x86_64/__start_context.S b/sysdeps/unix/sysv/linux/x86_64/__start_context.S
index 52a5afa..96366e0 100644
--- a/sysdeps/unix/sysv/linux/x86_64/__start_context.S
+++ b/sysdeps/unix/sysv/linux/x86_64/__start_context.S
@@ -31,8 +31,8 @@ ENTRY(__start_context)
on the stack pointer for the next context. */
movq %rbx, %rsp
- popq %rdi /* This is the next context. */
- cfi_adjust_cfa_offset(-8)
+ /* Don't use pop here so that stack is aligned to 16 bytes. */
+ movq (%rsp), %rdi /* This is the next context. */
testq %rdi, %rdi
je 2f /* If it is zero exit. */
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=40d3b80e0767dd3dc102d6fb544d9ef0c610c207
commit 40d3b80e0767dd3dc102d6fb544d9ef0c610c207
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Wed Jul 29 03:41:58 2015 -0700
Compile {memcpy,strcmp}-sse2-unaligned.S only for libc
{memcpy,strcmp}-sse2-unaligned.S aren't needed in ld.so.
* sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S: Compile
only for libc.
* sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S: Likewise.
diff --git a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
index c5450af..5693ba7 100644
--- a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
@@ -16,6 +16,8 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
+#if IS_IN (libc)
+
#include <sysdep.h>
#include "asm-syntax.h"
@@ -169,3 +171,5 @@ L(between_5_8):
movl %eax, -4(%rdi,%rdx)
jmp L(return)
END(__memcpy_sse2_unaligned)
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
index 20b65fa..c6606b4 100644
--- a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
@@ -16,6 +16,8 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
+#if IS_IN (libc)
+
#include "sysdep.h"
ENTRY ( __strcmp_sse2_unaligned)
@@ -207,3 +209,5 @@ L(different):
subl %ecx, %eax
ret
END (__strcmp_sse2_unaligned)
+
+#endif
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=6427f82096f4d75a46801672360dda2082eb1ce4
commit 6427f82096f4d75a46801672360dda2082eb1ce4
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Fri Jul 31 13:46:05 2015 -0700
Update libmvec multiarch functions for <cpu-features.h>
This patch updates libmvec multiarch functions to use the newly defined
HAS_CPU_FEATURE, HAS_ARCH_FEATURE and LOAD_RTLD_GLOBAL_RO_RDX from
<cpu-features.h>.
* math/Makefile ($(addprefix $(objpfx), $(libm-vec-tests))):
Remove $(objpfx)init-arch.o.
* sysdeps/x86_64/fpu/Makefile (libmvec-support): Remove
init-arch.
* sysdeps/x86_64/fpu/math-tests-arch.h (avx_usable): Removed.
(INIT_ARCH_EXT): Defined as empty.
(CHECK_ARCH_EXT): Replace HAS_XXX with HAS_ARCH_FEATURE (XXX).
* sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core.S: Remove
__init_cpu_features call. Replace HAS_XXX with
HAS_CPU_FEATURE/HAS_ARCH_FEATURE (XXX).
* sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_d_log2_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_d_log4_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_d_sin4_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_s_expf8_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_s_powf8_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core.S: Likewise.
diff --git a/sysdeps/x86_64/fpu/Makefile b/sysdeps/x86_64/fpu/Makefile
index 1ebe511..f98f6cf 100644
--- a/sysdeps/x86_64/fpu/Makefile
+++ b/sysdeps/x86_64/fpu/Makefile
@@ -20,7 +20,7 @@ libmvec-support += svml_d_cos2_core svml_d_cos4_core_avx \
svml_d_pow_data svml_s_powf4_core svml_s_powf8_core_avx \
svml_s_powf8_core svml_s_powf16_core svml_s_powf_data \
svml_s_sincosf4_core svml_s_sincosf8_core_avx \
- svml_s_sincosf8_core svml_s_sincosf16_core init-arch
+ svml_s_sincosf8_core svml_s_sincosf16_core
endif
# Variables for libmvec tests.
-----------------------------------------------------------------------
hooks/post-receive
--
GNU C Library master sources