This is the mail archive of the
glibc-cvs@sourceware.org
mailing list for the glibc project.
GNU C Library master sources branch, master, updated. glibc-2.14-607-gbbe315e
- From: drepper at sourceware dot org
- To: glibc-cvs at sourceware dot org
- Date: 23 Dec 2011 17:03:08 -0000
- Subject: GNU C Library master sources branch, master, updated. glibc-2.14-607-gbbe315e
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".
The branch, master, has been updated
via bbe315ea364e86166bb985e2e605af029482a124 (commit)
via 15db4de19dc0043c25ff6a205bfbc25a180b1c48 (commit)
from 2b2596b1e94d9d51bd8febe81b759fa45a62e3cb (commit)
Those revisions listed above that are new to this repository have
not appeared in any other notification email, so we list those
revisions in full below.
- Log -----------------------------------------------------------------
http://sources.redhat.com/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=bbe315ea364e86166bb985e2e605af029482a124
commit bbe315ea364e86166bb985e2e605af029482a124
Author: Liubov Dmitrieva <liubov.dmitrieva@gmail.com>
Date: Fri Dec 23 12:02:53 2011 -0500
CL
diff --git a/ChangeLog b/ChangeLog
index 2eed115..37e70e7 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,8 +1,8 @@
2011-12-23 Liubov Dmitrieva <liubov.dmitrieva@gmail.com>
[BZ #13540]
- * sysdeps/x86_64/multiarch/strcpy-ssse3.S: Fix bug.
- Fix overrun in destination buffer.
+ * sysdeps/x86_64/multiarch/strcpy-ssse3.S: Fix overrun in
+ destination buffer.
* sysdeps/x86_64/multiarch/wcscpy-ssse3.S: Likewise.
2011-12-23 Marek Polacek <polacek@redhat.com>
http://sources.redhat.com/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=15db4de19dc0043c25ff6a205bfbc25a180b1c48
commit 15db4de19dc0043c25ff6a205bfbc25a180b1c48
Author: Liubov Dmitrieva <liubov.dmitrieva@gmail.com>
Date: Fri Dec 23 12:02:15 2011 -0500
Fix overrun in destination buffer
diff --git a/ChangeLog b/ChangeLog
index a883f93..2eed115 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,10 @@
+2011-12-23 Liubov Dmitrieva <liubov.dmitrieva@gmail.com>
+
+ [BZ #13540]
+ * sysdeps/x86_64/multiarch/strcpy-ssse3.S: Fix bug.
+ Fix overrun in destination buffer.
+ * sysdeps/x86_64/multiarch/wcscpy-ssse3.S: Likewise.
+
2011-12-23 Marek Polacek <polacek@redhat.com>
* elf/dl-addr.c (determine_info): Add inline keyword.
@@ -13,6 +20,7 @@
2011-12-23 Liubov Dmitrieva <liubov.dmitrieva@gmail.com>
+ [BZ #13540]
* sysdeps/i386/i686/multiarch/wcscpy-ssse3.S: Fix wrong copying
processing for last bytes.
diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/sysdeps/x86_64/multiarch/strcpy-ssse3.S
index c4ec54c..b104765 100644
--- a/sysdeps/x86_64/multiarch/strcpy-ssse3.S
+++ b/sysdeps/x86_64/multiarch/strcpy-ssse3.S
@@ -29,6 +29,7 @@
.section .text.ssse3,"ax",@progbits
ENTRY (STRCPY)
+
mov %rsi, %rcx
# ifdef USE_AS_STRNCPY
mov %rdx, %r8
@@ -39,7 +40,7 @@ ENTRY (STRCPY)
jz L(Exit0)
cmp $8, %r8
jbe L(StrncpyExit8Bytes)
-# endif
+# endif
cmpb $0, (%rcx)
jz L(Exit1)
cmpb $0, 1(%rcx)
@@ -56,10 +57,10 @@ ENTRY (STRCPY)
jz L(Exit7)
cmpb $0, 7(%rcx)
jz L(Exit8)
-# ifdef USE_AS_STRNCPY
+# ifdef USE_AS_STRNCPY
cmp $16, %r8
jb L(StrncpyExit15Bytes)
-# endif
+# endif
cmpb $0, 8(%rcx)
jz L(Exit9)
cmpb $0, 9(%rcx)
@@ -74,10 +75,10 @@ ENTRY (STRCPY)
jz L(Exit14)
cmpb $0, 14(%rcx)
jz L(Exit15)
-# ifdef USE_AS_STRNCPY
+# ifdef USE_AS_STRNCPY
cmp $16, %r8
je L(Exit16)
-# endif
+# endif
cmpb $0, 15(%rcx)
jz L(Exit16)
# endif
@@ -87,25 +88,15 @@ ENTRY (STRCPY)
sub $16, %r8
and $0xf, %rsi
-/* add 16 bytes rcx_shift to r8 */
+/* add 16 bytes rcx_offset to r8 */
+
add %rsi, %r8
# endif
lea 16(%rcx), %rsi
-/* Now:
- rsi = alignment_16(rcx) + rcx_shift + 16;
- rcx_shift = rcx - alignment_16(rcx)
-*/
and $-16, %rsi
-/* Now:
- rsi = alignment_16(rcx) + 16
-*/
pxor %xmm0, %xmm0
mov (%rcx), %r9
mov %r9, (%rdx)
-/*
- look if there is zero symbol in next 16 bytes of string
- from rsi to rsi + 15 and form mask in xmm0
-*/
pcmpeqb (%rsi), %xmm0
mov 8(%rcx), %r9
mov %r9, 8(%rdx)
@@ -115,10 +106,6 @@ ENTRY (STRCPY)
pmovmskb %xmm0, %rax
sub %rcx, %rsi
-/* rsi = 16 - rcx_shift */
-
-/* rax = 0: there isn't end of string from position rsi to rsi+15 */
-
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(CopyFrom1To16BytesCase2OrCase3)
@@ -128,17 +115,9 @@ ENTRY (STRCPY)
mov %rdx, %rax
lea 16(%rdx), %rdx
-/* Now:
- rdx = rdx + 16 = alignment_16(rdx) + rdx_shift + 16
-*/
and $-16, %rdx
-
-/* Now: rdx = alignment_16(rdx) + 16 */
-
sub %rdx, %rax
-/* Now: rax = rdx_shift - 16 */
-
# ifdef USE_AS_STRNCPY
add %rax, %rsi
lea -1(%rsi), %rsi
@@ -150,22 +129,11 @@ ENTRY (STRCPY)
L(ContinueCopy):
# endif
sub %rax, %rcx
-/* Now:
- case rcx_shift >= rdx_shift:
- rcx = alignment_16(rcx) + (rcx_shift - rdx_shift) + 16
- case rcx_shift < rdx_shift:
- rcx = alignment_16(rcx) + (16 + rcx_shift - rdx_shift)
-*/
mov %rcx, %rax
and $0xf, %rax
-/* Now:
- case rcx_shift >= rdx_shift: rax = rcx_shift - rdx_shift
- case rcx_shift < rdx_shift: rax = (16 + rcx_shift - rdx_shift)
- rax can be 0, 1, ..., 15
-*/
mov $0, %rsi
-/* case: rcx_shift == rdx_shift */
+/* case: rcx_offset == rdx_offset */
jz L(Align16Both)
@@ -282,10 +250,11 @@ L(Align16Both):
sub %rcx, %rax
sub %rax, %rdx
# ifdef USE_AS_STRNCPY
- lea 48+64(%r8, %rax), %r8
+ lea 112(%r8, %rax), %r8
# endif
mov $-0x40, %rsi
+ .p2align 4
L(Aligned64Loop):
movaps (%rcx), %xmm2
movaps %xmm2, %xmm4
@@ -366,7 +335,6 @@ L(Shl1Start):
jnz L(Shl1LoopExit)
palignr $1, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 31(%rcx), %xmm2
@@ -374,7 +342,7 @@ L(Shl1Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
+ movaps %xmm2, %xmm1
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit1Case2OrCase3)
@@ -382,10 +350,9 @@ L(Shl1Start):
test %rax, %rax
jnz L(Shl1LoopExit)
- palignr $1, %xmm1, %xmm2
+ palignr $1, %xmm3, %xmm2
movaps %xmm2, (%rdx)
movaps 31(%rcx), %xmm2
- movaps %xmm3, %xmm1
pcmpeqb %xmm2, %xmm0
lea 16(%rdx), %rdx
@@ -400,7 +367,6 @@ L(Shl1Start):
jnz L(Shl1LoopExit)
palignr $1, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 31(%rcx), %xmm2
@@ -408,7 +374,6 @@ L(Shl1Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit1Case2OrCase3)
@@ -416,8 +381,7 @@ L(Shl1Start):
test %rax, %rax
jnz L(Shl1LoopExit)
- palignr $1, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $1, %xmm3, %xmm2
movaps %xmm2, (%rdx)
lea 31(%rcx), %rcx
lea 16(%rdx), %rdx
@@ -432,6 +396,8 @@ L(Shl1Start):
# endif
movaps -1(%rcx), %xmm1
+/* 64 bytes loop */
+ .p2align 4
L(Shl1LoopStart):
movaps 15(%rcx), %xmm2
movaps 31(%rcx), %xmm3
@@ -465,11 +431,9 @@ L(Shl1LoopStart):
jmp L(Shl1LoopStart)
L(Shl1LoopExit):
- movaps (%rdx), %xmm6
- psrldq $15, %xmm6
+ movdqu -1(%rcx), %xmm1
mov $15, %rsi
- palignr $1, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ movdqu %xmm1, -1(%rdx)
jmp L(CopyFrom1To16Bytes)
.p2align 4
@@ -488,7 +452,6 @@ L(Shl2Start):
jnz L(Shl2LoopExit)
palignr $2, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 30(%rcx), %xmm2
@@ -496,7 +459,7 @@ L(Shl2Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
+ movaps %xmm2, %xmm1
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit2Case2OrCase3)
@@ -504,10 +467,9 @@ L(Shl2Start):
test %rax, %rax
jnz L(Shl2LoopExit)
- palignr $2, %xmm1, %xmm2
+ palignr $2, %xmm3, %xmm2
movaps %xmm2, (%rdx)
movaps 30(%rcx), %xmm2
- movaps %xmm3, %xmm1
pcmpeqb %xmm2, %xmm0
lea 16(%rdx), %rdx
@@ -522,7 +484,6 @@ L(Shl2Start):
jnz L(Shl2LoopExit)
palignr $2, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 30(%rcx), %xmm2
@@ -530,7 +491,6 @@ L(Shl2Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit2Case2OrCase3)
@@ -538,8 +498,7 @@ L(Shl2Start):
test %rax, %rax
jnz L(Shl2LoopExit)
- palignr $2, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $2, %xmm3, %xmm2
movaps %xmm2, (%rdx)
lea 30(%rcx), %rcx
lea 16(%rdx), %rdx
@@ -554,6 +513,8 @@ L(Shl2Start):
# endif
movaps -2(%rcx), %xmm1
+/* 64 bytes loop */
+ .p2align 4
L(Shl2LoopStart):
movaps 14(%rcx), %xmm2
movaps 30(%rcx), %xmm3
@@ -587,11 +548,9 @@ L(Shl2LoopStart):
jmp L(Shl2LoopStart)
L(Shl2LoopExit):
- movaps (%rdx), %xmm6
- psrldq $14, %xmm6
+ movdqu -2(%rcx), %xmm1
mov $14, %rsi
- palignr $2, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ movdqu %xmm1, -2(%rdx)
jmp L(CopyFrom1To16Bytes)
.p2align 4
@@ -610,7 +569,6 @@ L(Shl3Start):
jnz L(Shl3LoopExit)
palignr $3, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 29(%rcx), %xmm2
@@ -618,7 +576,7 @@ L(Shl3Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
+ movaps %xmm2, %xmm1
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit3Case2OrCase3)
@@ -626,10 +584,9 @@ L(Shl3Start):
test %rax, %rax
jnz L(Shl3LoopExit)
- palignr $3, %xmm1, %xmm2
+ palignr $3, %xmm3, %xmm2
movaps %xmm2, (%rdx)
movaps 29(%rcx), %xmm2
- movaps %xmm3, %xmm1
pcmpeqb %xmm2, %xmm0
lea 16(%rdx), %rdx
@@ -644,7 +601,6 @@ L(Shl3Start):
jnz L(Shl3LoopExit)
palignr $3, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 29(%rcx), %xmm2
@@ -652,7 +608,6 @@ L(Shl3Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit3Case2OrCase3)
@@ -660,8 +615,7 @@ L(Shl3Start):
test %rax, %rax
jnz L(Shl3LoopExit)
- palignr $3, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $3, %xmm3, %xmm2
movaps %xmm2, (%rdx)
lea 29(%rcx), %rcx
lea 16(%rdx), %rdx
@@ -676,6 +630,8 @@ L(Shl3Start):
# endif
movaps -3(%rcx), %xmm1
+/* 64 bytes loop */
+ .p2align 4
L(Shl3LoopStart):
movaps 13(%rcx), %xmm2
movaps 29(%rcx), %xmm3
@@ -709,11 +665,9 @@ L(Shl3LoopStart):
jmp L(Shl3LoopStart)
L(Shl3LoopExit):
- movaps (%rdx), %xmm6
- psrldq $13, %xmm6
+ movdqu -3(%rcx), %xmm1
mov $13, %rsi
- palignr $3, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ movdqu %xmm1, -3(%rdx)
jmp L(CopyFrom1To16Bytes)
.p2align 4
@@ -732,7 +686,6 @@ L(Shl4Start):
jnz L(Shl4LoopExit)
palignr $4, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 28(%rcx), %xmm2
@@ -740,7 +693,7 @@ L(Shl4Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
+ movaps %xmm2, %xmm1
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit4Case2OrCase3)
@@ -748,10 +701,9 @@ L(Shl4Start):
test %rax, %rax
jnz L(Shl4LoopExit)
- palignr $4, %xmm1, %xmm2
+ palignr $4, %xmm3, %xmm2
movaps %xmm2, (%rdx)
movaps 28(%rcx), %xmm2
- movaps %xmm3, %xmm1
pcmpeqb %xmm2, %xmm0
lea 16(%rdx), %rdx
@@ -766,7 +718,6 @@ L(Shl4Start):
jnz L(Shl4LoopExit)
palignr $4, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 28(%rcx), %xmm2
@@ -774,7 +725,6 @@ L(Shl4Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit4Case2OrCase3)
@@ -782,8 +732,7 @@ L(Shl4Start):
test %rax, %rax
jnz L(Shl4LoopExit)
- palignr $4, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $4, %xmm3, %xmm2
movaps %xmm2, (%rdx)
lea 28(%rcx), %rcx
lea 16(%rdx), %rdx
@@ -798,6 +747,8 @@ L(Shl4Start):
# endif
movaps -4(%rcx), %xmm1
+/* 64 bytes loop */
+ .p2align 4
L(Shl4LoopStart):
movaps 12(%rcx), %xmm2
movaps 28(%rcx), %xmm3
@@ -831,11 +782,9 @@ L(Shl4LoopStart):
jmp L(Shl4LoopStart)
L(Shl4LoopExit):
- movaps (%rdx), %xmm6
- psrldq $12, %xmm6
+ movdqu -4(%rcx), %xmm1
mov $12, %rsi
- palignr $4, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ movdqu %xmm1, -4(%rdx)
jmp L(CopyFrom1To16Bytes)
.p2align 4
@@ -854,7 +803,6 @@ L(Shl5Start):
jnz L(Shl5LoopExit)
palignr $5, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 27(%rcx), %xmm2
@@ -862,7 +810,7 @@ L(Shl5Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
+ movaps %xmm2, %xmm1
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit5Case2OrCase3)
@@ -870,10 +818,9 @@ L(Shl5Start):
test %rax, %rax
jnz L(Shl5LoopExit)
- palignr $5, %xmm1, %xmm2
+ palignr $5, %xmm3, %xmm2
movaps %xmm2, (%rdx)
movaps 27(%rcx), %xmm2
- movaps %xmm3, %xmm1
pcmpeqb %xmm2, %xmm0
lea 16(%rdx), %rdx
@@ -888,7 +835,6 @@ L(Shl5Start):
jnz L(Shl5LoopExit)
palignr $5, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 27(%rcx), %xmm2
@@ -896,7 +842,6 @@ L(Shl5Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit5Case2OrCase3)
@@ -904,8 +849,7 @@ L(Shl5Start):
test %rax, %rax
jnz L(Shl5LoopExit)
- palignr $5, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $5, %xmm3, %xmm2
movaps %xmm2, (%rdx)
lea 27(%rcx), %rcx
lea 16(%rdx), %rdx
@@ -920,6 +864,8 @@ L(Shl5Start):
# endif
movaps -5(%rcx), %xmm1
+/* 64 bytes loop */
+ .p2align 4
L(Shl5LoopStart):
movaps 11(%rcx), %xmm2
movaps 27(%rcx), %xmm3
@@ -953,11 +899,9 @@ L(Shl5LoopStart):
jmp L(Shl5LoopStart)
L(Shl5LoopExit):
- movaps (%rdx), %xmm6
- psrldq $11, %xmm6
+ movdqu -5(%rcx), %xmm1
mov $11, %rsi
- palignr $5, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ movdqu %xmm1, -5(%rdx)
jmp L(CopyFrom1To16Bytes)
.p2align 4
@@ -976,7 +920,6 @@ L(Shl6Start):
jnz L(Shl6LoopExit)
palignr $6, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 26(%rcx), %xmm2
@@ -984,7 +927,7 @@ L(Shl6Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
+ movaps %xmm2, %xmm1
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit6Case2OrCase3)
@@ -992,10 +935,9 @@ L(Shl6Start):
test %rax, %rax
jnz L(Shl6LoopExit)
- palignr $6, %xmm1, %xmm2
+ palignr $6, %xmm3, %xmm2
movaps %xmm2, (%rdx)
movaps 26(%rcx), %xmm2
- movaps %xmm3, %xmm1
pcmpeqb %xmm2, %xmm0
lea 16(%rdx), %rdx
@@ -1010,7 +952,6 @@ L(Shl6Start):
jnz L(Shl6LoopExit)
palignr $6, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 26(%rcx), %xmm2
@@ -1018,7 +959,6 @@ L(Shl6Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit6Case2OrCase3)
@@ -1026,8 +966,7 @@ L(Shl6Start):
test %rax, %rax
jnz L(Shl6LoopExit)
- palignr $6, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $6, %xmm3, %xmm2
movaps %xmm2, (%rdx)
lea 26(%rcx), %rcx
lea 16(%rdx), %rdx
@@ -1042,6 +981,8 @@ L(Shl6Start):
# endif
movaps -6(%rcx), %xmm1
+/* 64 bytes loop */
+ .p2align 4
L(Shl6LoopStart):
movaps 10(%rcx), %xmm2
movaps 26(%rcx), %xmm3
@@ -1075,11 +1016,11 @@ L(Shl6LoopStart):
jmp L(Shl6LoopStart)
L(Shl6LoopExit):
- movaps (%rdx), %xmm6
- psrldq $10, %xmm6
+ mov (%rcx), %r9
+ mov 6(%rcx), %esi
+ mov %r9, (%rdx)
+ mov %esi, 6(%rdx)
mov $10, %rsi
- palignr $6, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
jmp L(CopyFrom1To16Bytes)
.p2align 4
@@ -1098,7 +1039,6 @@ L(Shl7Start):
jnz L(Shl7LoopExit)
palignr $7, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 25(%rcx), %xmm2
@@ -1106,7 +1046,7 @@ L(Shl7Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
+ movaps %xmm2, %xmm1
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit7Case2OrCase3)
@@ -1114,10 +1054,9 @@ L(Shl7Start):
test %rax, %rax
jnz L(Shl7LoopExit)
- palignr $7, %xmm1, %xmm2
+ palignr $7, %xmm3, %xmm2
movaps %xmm2, (%rdx)
movaps 25(%rcx), %xmm2
- movaps %xmm3, %xmm1
pcmpeqb %xmm2, %xmm0
lea 16(%rdx), %rdx
@@ -1132,7 +1071,6 @@ L(Shl7Start):
jnz L(Shl7LoopExit)
palignr $7, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 25(%rcx), %xmm2
@@ -1140,7 +1078,6 @@ L(Shl7Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit7Case2OrCase3)
@@ -1148,8 +1085,7 @@ L(Shl7Start):
test %rax, %rax
jnz L(Shl7LoopExit)
- palignr $7, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $7, %xmm3, %xmm2
movaps %xmm2, (%rdx)
lea 25(%rcx), %rcx
lea 16(%rdx), %rdx
@@ -1164,6 +1100,8 @@ L(Shl7Start):
# endif
movaps -7(%rcx), %xmm1
+/* 64 bytes loop */
+ .p2align 4
L(Shl7LoopStart):
movaps 9(%rcx), %xmm2
movaps 25(%rcx), %xmm3
@@ -1197,11 +1135,11 @@ L(Shl7LoopStart):
jmp L(Shl7LoopStart)
L(Shl7LoopExit):
- movaps (%rdx), %xmm6
- psrldq $9, %xmm6
+ mov (%rcx), %r9
+ mov 5(%rcx), %esi
+ mov %r9, (%rdx)
+ mov %esi, 5(%rdx)
mov $9, %rsi
- palignr $7, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
jmp L(CopyFrom1To16Bytes)
.p2align 4
@@ -1220,7 +1158,6 @@ L(Shl8Start):
jnz L(Shl8LoopExit)
palignr $8, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 24(%rcx), %xmm2
@@ -1228,7 +1165,7 @@ L(Shl8Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
+ movaps %xmm2, %xmm1
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit8Case2OrCase3)
@@ -1236,10 +1173,9 @@ L(Shl8Start):
test %rax, %rax
jnz L(Shl8LoopExit)
- palignr $8, %xmm1, %xmm2
+ palignr $8, %xmm3, %xmm2
movaps %xmm2, (%rdx)
movaps 24(%rcx), %xmm2
- movaps %xmm3, %xmm1
pcmpeqb %xmm2, %xmm0
lea 16(%rdx), %rdx
@@ -1254,7 +1190,6 @@ L(Shl8Start):
jnz L(Shl8LoopExit)
palignr $8, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 24(%rcx), %xmm2
@@ -1262,7 +1197,6 @@ L(Shl8Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit8Case2OrCase3)
@@ -1270,8 +1204,7 @@ L(Shl8Start):
test %rax, %rax
jnz L(Shl8LoopExit)
- palignr $8, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $8, %xmm3, %xmm2
movaps %xmm2, (%rdx)
lea 24(%rcx), %rcx
lea 16(%rdx), %rdx
@@ -1286,6 +1219,8 @@ L(Shl8Start):
# endif
movaps -8(%rcx), %xmm1
+/* 64 bytes loop */
+ .p2align 4
L(Shl8LoopStart):
movaps 8(%rcx), %xmm2
movaps 24(%rcx), %xmm3
@@ -1319,11 +1254,9 @@ L(Shl8LoopStart):
jmp L(Shl8LoopStart)
L(Shl8LoopExit):
- movaps (%rdx), %xmm6
- psrldq $8, %xmm6
+ mov (%rcx), %r9
mov $8, %rsi
- palignr $8, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ mov %r9, (%rdx)
jmp L(CopyFrom1To16Bytes)
.p2align 4
@@ -1342,7 +1275,6 @@ L(Shl9Start):
jnz L(Shl9LoopExit)
palignr $9, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 23(%rcx), %xmm2
@@ -1350,7 +1282,7 @@ L(Shl9Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
+ movaps %xmm2, %xmm1
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit9Case2OrCase3)
@@ -1358,10 +1290,9 @@ L(Shl9Start):
test %rax, %rax
jnz L(Shl9LoopExit)
- palignr $9, %xmm1, %xmm2
+ palignr $9, %xmm3, %xmm2
movaps %xmm2, (%rdx)
movaps 23(%rcx), %xmm2
- movaps %xmm3, %xmm1
pcmpeqb %xmm2, %xmm0
lea 16(%rdx), %rdx
@@ -1376,7 +1307,6 @@ L(Shl9Start):
jnz L(Shl9LoopExit)
palignr $9, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 23(%rcx), %xmm2
@@ -1384,7 +1314,6 @@ L(Shl9Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit9Case2OrCase3)
@@ -1392,8 +1321,7 @@ L(Shl9Start):
test %rax, %rax
jnz L(Shl9LoopExit)
- palignr $9, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $9, %xmm3, %xmm2
movaps %xmm2, (%rdx)
lea 23(%rcx), %rcx
lea 16(%rdx), %rdx
@@ -1408,6 +1336,8 @@ L(Shl9Start):
# endif
movaps -9(%rcx), %xmm1
+/* 64 bytes loop */
+ .p2align 4
L(Shl9LoopStart):
movaps 7(%rcx), %xmm2
movaps 23(%rcx), %xmm3
@@ -1441,11 +1371,9 @@ L(Shl9LoopStart):
jmp L(Shl9LoopStart)
L(Shl9LoopExit):
- movaps (%rdx), %xmm6
- psrldq $7, %xmm6
+ mov -1(%rcx), %r9
mov $7, %rsi
- palignr $9, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ mov %r9, -1(%rdx)
jmp L(CopyFrom1To16Bytes)
.p2align 4
@@ -1464,7 +1392,6 @@ L(Shl10Start):
jnz L(Shl10LoopExit)
palignr $10, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 22(%rcx), %xmm2
@@ -1472,7 +1399,7 @@ L(Shl10Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
+ movaps %xmm2, %xmm1
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit10Case2OrCase3)
@@ -1480,10 +1407,9 @@ L(Shl10Start):
test %rax, %rax
jnz L(Shl10LoopExit)
- palignr $10, %xmm1, %xmm2
+ palignr $10, %xmm3, %xmm2
movaps %xmm2, (%rdx)
movaps 22(%rcx), %xmm2
- movaps %xmm3, %xmm1
pcmpeqb %xmm2, %xmm0
lea 16(%rdx), %rdx
@@ -1498,7 +1424,6 @@ L(Shl10Start):
jnz L(Shl10LoopExit)
palignr $10, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 22(%rcx), %xmm2
@@ -1506,7 +1431,6 @@ L(Shl10Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit10Case2OrCase3)
@@ -1514,8 +1438,7 @@ L(Shl10Start):
test %rax, %rax
jnz L(Shl10LoopExit)
- palignr $10, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $10, %xmm3, %xmm2
movaps %xmm2, (%rdx)
lea 22(%rcx), %rcx
lea 16(%rdx), %rdx
@@ -1530,6 +1453,8 @@ L(Shl10Start):
# endif
movaps -10(%rcx), %xmm1
+/* 64 bytes loop */
+ .p2align 4
L(Shl10LoopStart):
movaps 6(%rcx), %xmm2
movaps 22(%rcx), %xmm3
@@ -1563,11 +1488,9 @@ L(Shl10LoopStart):
jmp L(Shl10LoopStart)
L(Shl10LoopExit):
- movaps (%rdx), %xmm6
- psrldq $6, %xmm6
+ mov -2(%rcx), %r9
mov $6, %rsi
- palignr $10, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ mov %r9, -2(%rdx)
jmp L(CopyFrom1To16Bytes)
.p2align 4
@@ -1586,7 +1509,6 @@ L(Shl11Start):
jnz L(Shl11LoopExit)
palignr $11, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 21(%rcx), %xmm2
@@ -1594,7 +1516,7 @@ L(Shl11Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
+ movaps %xmm2, %xmm1
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit11Case2OrCase3)
@@ -1602,10 +1524,9 @@ L(Shl11Start):
test %rax, %rax
jnz L(Shl11LoopExit)
- palignr $11, %xmm1, %xmm2
+ palignr $11, %xmm3, %xmm2
movaps %xmm2, (%rdx)
movaps 21(%rcx), %xmm2
- movaps %xmm3, %xmm1
pcmpeqb %xmm2, %xmm0
lea 16(%rdx), %rdx
@@ -1620,7 +1541,6 @@ L(Shl11Start):
jnz L(Shl11LoopExit)
palignr $11, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 21(%rcx), %xmm2
@@ -1628,7 +1548,6 @@ L(Shl11Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit11Case2OrCase3)
@@ -1636,8 +1555,7 @@ L(Shl11Start):
test %rax, %rax
jnz L(Shl11LoopExit)
- palignr $11, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $11, %xmm3, %xmm2
movaps %xmm2, (%rdx)
lea 21(%rcx), %rcx
lea 16(%rdx), %rdx
@@ -1652,6 +1570,8 @@ L(Shl11Start):
# endif
movaps -11(%rcx), %xmm1
+/* 64 bytes loop */
+ .p2align 4
L(Shl11LoopStart):
movaps 5(%rcx), %xmm2
movaps 21(%rcx), %xmm3
@@ -1685,11 +1605,9 @@ L(Shl11LoopStart):
jmp L(Shl11LoopStart)
L(Shl11LoopExit):
- movaps (%rdx), %xmm6
- psrldq $5, %xmm6
+ mov -3(%rcx), %r9
mov $5, %rsi
- palignr $11, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ mov %r9, -3(%rdx)
jmp L(CopyFrom1To16Bytes)
.p2align 4
@@ -1708,7 +1626,6 @@ L(Shl12Start):
jnz L(Shl12LoopExit)
palignr $12, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 20(%rcx), %xmm2
@@ -1716,7 +1633,7 @@ L(Shl12Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
+ movaps %xmm2, %xmm1
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit12Case2OrCase3)
@@ -1724,10 +1641,9 @@ L(Shl12Start):
test %rax, %rax
jnz L(Shl12LoopExit)
- palignr $12, %xmm1, %xmm2
+ palignr $12, %xmm3, %xmm2
movaps %xmm2, (%rdx)
movaps 20(%rcx), %xmm2
- movaps %xmm3, %xmm1
pcmpeqb %xmm2, %xmm0
lea 16(%rdx), %rdx
@@ -1742,7 +1658,6 @@ L(Shl12Start):
jnz L(Shl12LoopExit)
palignr $12, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 20(%rcx), %xmm2
@@ -1750,7 +1665,6 @@ L(Shl12Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit12Case2OrCase3)
@@ -1758,8 +1672,7 @@ L(Shl12Start):
test %rax, %rax
jnz L(Shl12LoopExit)
- palignr $12, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $12, %xmm3, %xmm2
movaps %xmm2, (%rdx)
lea 20(%rcx), %rcx
lea 16(%rdx), %rdx
@@ -1774,6 +1687,8 @@ L(Shl12Start):
# endif
movaps -12(%rcx), %xmm1
+/* 64 bytes loop */
+ .p2align 4
L(Shl12LoopStart):
movaps 4(%rcx), %xmm2
movaps 20(%rcx), %xmm3
@@ -1807,11 +1722,9 @@ L(Shl12LoopStart):
jmp L(Shl12LoopStart)
L(Shl12LoopExit):
- movaps (%rdx), %xmm6
- psrldq $4, %xmm6
+ mov (%rcx), %r9d
mov $4, %rsi
- palignr $12, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ mov %r9d, (%rdx)
jmp L(CopyFrom1To16Bytes)
.p2align 4
@@ -1830,7 +1743,6 @@ L(Shl13Start):
jnz L(Shl13LoopExit)
palignr $13, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 19(%rcx), %xmm2
@@ -1838,7 +1750,7 @@ L(Shl13Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
+ movaps %xmm2, %xmm1
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit13Case2OrCase3)
@@ -1846,10 +1758,9 @@ L(Shl13Start):
test %rax, %rax
jnz L(Shl13LoopExit)
- palignr $13, %xmm1, %xmm2
+ palignr $13, %xmm3, %xmm2
movaps %xmm2, (%rdx)
movaps 19(%rcx), %xmm2
- movaps %xmm3, %xmm1
pcmpeqb %xmm2, %xmm0
lea 16(%rdx), %rdx
@@ -1864,7 +1775,6 @@ L(Shl13Start):
jnz L(Shl13LoopExit)
palignr $13, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 19(%rcx), %xmm2
@@ -1872,7 +1782,6 @@ L(Shl13Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit13Case2OrCase3)
@@ -1880,8 +1789,7 @@ L(Shl13Start):
test %rax, %rax
jnz L(Shl13LoopExit)
- palignr $13, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $13, %xmm3, %xmm2
movaps %xmm2, (%rdx)
lea 19(%rcx), %rcx
lea 16(%rdx), %rdx
@@ -1896,6 +1804,8 @@ L(Shl13Start):
# endif
movaps -13(%rcx), %xmm1
+/* 64 bytes loop */
+ .p2align 4
L(Shl13LoopStart):
movaps 3(%rcx), %xmm2
movaps 19(%rcx), %xmm3
@@ -1929,11 +1839,9 @@ L(Shl13LoopStart):
jmp L(Shl13LoopStart)
L(Shl13LoopExit):
- movaps (%rdx), %xmm6
- psrldq $3, %xmm6
+ mov -1(%rcx), %r9d
mov $3, %rsi
- palignr $13, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ mov %r9d, -1(%rdx)
jmp L(CopyFrom1To16Bytes)
.p2align 4
@@ -1952,7 +1860,6 @@ L(Shl14Start):
jnz L(Shl14LoopExit)
palignr $14, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 18(%rcx), %xmm2
@@ -1960,7 +1867,7 @@ L(Shl14Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
+ movaps %xmm2, %xmm1
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit14Case2OrCase3)
@@ -1968,10 +1875,9 @@ L(Shl14Start):
test %rax, %rax
jnz L(Shl14LoopExit)
- palignr $14, %xmm1, %xmm2
+ palignr $14, %xmm3, %xmm2
movaps %xmm2, (%rdx)
movaps 18(%rcx), %xmm2
- movaps %xmm3, %xmm1
pcmpeqb %xmm2, %xmm0
lea 16(%rdx), %rdx
@@ -1986,7 +1892,6 @@ L(Shl14Start):
jnz L(Shl14LoopExit)
palignr $14, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 18(%rcx), %xmm2
@@ -1994,7 +1899,6 @@ L(Shl14Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit14Case2OrCase3)
@@ -2002,8 +1906,7 @@ L(Shl14Start):
test %rax, %rax
jnz L(Shl14LoopExit)
- palignr $14, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $14, %xmm3, %xmm2
movaps %xmm2, (%rdx)
lea 18(%rcx), %rcx
lea 16(%rdx), %rdx
@@ -2018,6 +1921,8 @@ L(Shl14Start):
# endif
movaps -14(%rcx), %xmm1
+/* 64 bytes loop */
+ .p2align 4
L(Shl14LoopStart):
movaps 2(%rcx), %xmm2
movaps 18(%rcx), %xmm3
@@ -2051,11 +1956,9 @@ L(Shl14LoopStart):
jmp L(Shl14LoopStart)
L(Shl14LoopExit):
- movaps (%rdx), %xmm6
- psrldq $2, %xmm6
+ mov -2(%rcx), %r9d
mov $2, %rsi
- palignr $14, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ mov %r9d, -2(%rdx)
jmp L(CopyFrom1To16Bytes)
.p2align 4
@@ -2074,7 +1977,6 @@ L(Shl15Start):
jnz L(Shl15LoopExit)
palignr $15, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 17(%rcx), %xmm2
@@ -2082,7 +1984,7 @@ L(Shl15Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
+ movaps %xmm2, %xmm1
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit15Case2OrCase3)
@@ -2090,10 +1992,9 @@ L(Shl15Start):
test %rax, %rax
jnz L(Shl15LoopExit)
- palignr $15, %xmm1, %xmm2
+ palignr $15, %xmm3, %xmm2
movaps %xmm2, (%rdx)
movaps 17(%rcx), %xmm2
- movaps %xmm3, %xmm1
pcmpeqb %xmm2, %xmm0
lea 16(%rdx), %rdx
@@ -2108,7 +2009,6 @@ L(Shl15Start):
jnz L(Shl15LoopExit)
palignr $15, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 17(%rcx), %xmm2
@@ -2116,7 +2016,6 @@ L(Shl15Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit15Case2OrCase3)
@@ -2124,8 +2023,7 @@ L(Shl15Start):
test %rax, %rax
jnz L(Shl15LoopExit)
- palignr $15, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $15, %xmm3, %xmm2
movaps %xmm2, (%rdx)
lea 17(%rcx), %rcx
lea 16(%rdx), %rdx
@@ -2140,6 +2038,8 @@ L(Shl15Start):
# endif
movaps -15(%rcx), %xmm1
+/* 64 bytes loop */
+ .p2align 4
L(Shl15LoopStart):
movaps 1(%rcx), %xmm2
movaps 17(%rcx), %xmm3
@@ -2173,16 +2073,15 @@ L(Shl15LoopStart):
jmp L(Shl15LoopStart)
L(Shl15LoopExit):
- movaps (%rdx), %xmm6
- psrldq $1, %xmm6
+ mov -3(%rcx), %r9d
mov $1, %rsi
- palignr $15, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ mov %r9d, -3(%rdx)
# ifdef USE_AS_STRCAT
jmp L(CopyFrom1To16Bytes)
# endif
# ifndef USE_AS_STRCAT
+
.p2align 4
L(CopyFrom1To16Bytes):
# ifdef USE_AS_STRNCPY
@@ -2463,7 +2362,7 @@ L(Exit4):
# ifdef USE_AS_STPCPY
cmpb $1, (%rax)
sbb $-1, %rax
-# endif
+# endif
# endif
ret
@@ -2485,7 +2384,7 @@ L(Exit5):
# ifdef USE_AS_STPCPY
cmpb $1, (%rax)
sbb $-1, %rax
-# endif
+# endif
# endif
ret
@@ -2507,7 +2406,7 @@ L(Exit6):
# ifdef USE_AS_STPCPY
cmpb $1, (%rax)
sbb $-1, %rax
-# endif
+# endif
# endif
ret
@@ -2617,7 +2516,7 @@ L(Exit12):
# ifdef USE_AS_STPCPY
cmpb $1, (%rax)
sbb $-1, %rax
-# endif
+# endif
# endif
ret
@@ -2955,11 +2854,10 @@ L(StrncpyExit8Bytes):
ret
# endif
-
# endif
# ifdef USE_AS_STRNCPY
-
+ .p2align 4
L(StrncpyLeaveCase2OrCase3):
test %rax, %rax
jnz L(Aligned64LeaveCase2)
@@ -3014,710 +2912,639 @@ L(Aligned64LeaveCase2):
lea -16(%r8), %r8
jmp L(CopyFrom1To16BytesCase2)
/*--------------------------------------------------*/
+ .p2align 4
L(StrncpyExit1Case2OrCase3):
- movaps (%rdx), %xmm6
- psrldq $15, %xmm6
+ movdqu -1(%rcx), %xmm0
+ movdqu %xmm0, -1(%rdx)
mov $15, %rsi
- palignr $1, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
test %rax, %rax
jnz L(CopyFrom1To16BytesCase2)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyExit2Case2OrCase3):
- movaps (%rdx), %xmm6
- psrldq $14, %xmm6
+ movdqu -2(%rcx), %xmm0
+ movdqu %xmm0, -2(%rdx)
mov $14, %rsi
- palignr $2, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
test %rax, %rax
jnz L(CopyFrom1To16BytesCase2)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyExit3Case2OrCase3):
- movaps (%rdx), %xmm6
- psrldq $13, %xmm6
+ movdqu -3(%rcx), %xmm0
+ movdqu %xmm0, -3(%rdx)
mov $13, %rsi
- palignr $3, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
test %rax, %rax
jnz L(CopyFrom1To16BytesCase2)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyExit4Case2OrCase3):
- movaps (%rdx), %xmm6
- psrldq $12, %xmm6
+ movdqu -4(%rcx), %xmm0
+ movdqu %xmm0, -4(%rdx)
mov $12, %rsi
- palignr $4, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
test %rax, %rax
jnz L(CopyFrom1To16BytesCase2)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyExit5Case2OrCase3):
- movaps (%rdx), %xmm6
- psrldq $11, %xmm6
+ movdqu -5(%rcx), %xmm0
+ movdqu %xmm0, -5(%rdx)
mov $11, %rsi
- palignr $5, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
test %rax, %rax
jnz L(CopyFrom1To16BytesCase2)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyExit6Case2OrCase3):
- movaps (%rdx), %xmm6
- psrldq $10, %xmm6
- mov $10, %rsi
- palignr $6, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ mov (%rcx), %rsi
+ mov 6(%rcx), %r9d
+ mov %r9d, 6(%rdx)
+ mov %rsi, (%rdx)
test %rax, %rax
+ mov $10, %rsi
jnz L(CopyFrom1To16BytesCase2)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyExit7Case2OrCase3):
- movaps (%rdx), %xmm6
- psrldq $9, %xmm6
- mov $9, %rsi
- palignr $7, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ mov (%rcx), %rsi
+ mov 5(%rcx), %r9d
+ mov %r9d, 5(%rdx)
+ mov %rsi, (%rdx)
test %rax, %rax
+ mov $9, %rsi
jnz L(CopyFrom1To16BytesCase2)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyExit8Case2OrCase3):
- movaps (%rdx), %xmm6
- psrldq $8, %xmm6
+ mov (%rcx), %r9
mov $8, %rsi
- palignr $8, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ mov %r9, (%rdx)
test %rax, %rax
jnz L(CopyFrom1To16BytesCase2)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyExit9Case2OrCase3):
- movaps (%rdx), %xmm6
- psrldq $7, %xmm6
+ mov -1(%rcx), %r9
mov $7, %rsi
- palignr $9, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ mov %r9, -1(%rdx)
test %rax, %rax
jnz L(CopyFrom1To16BytesCase2)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyExit10Case2OrCase3):
- movaps (%rdx), %xmm6
- psrldq $6, %xmm6
+ mov -2(%rcx), %r9
mov $6, %rsi
- palignr $10, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ mov %r9, -2(%rdx)
test %rax, %rax
jnz L(CopyFrom1To16BytesCase2)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyExit11Case2OrCase3):
- movaps (%rdx), %xmm6
- psrldq $5, %xmm6
+ mov -3(%rcx), %r9
mov $5, %rsi
- palignr $11, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ mov %r9, -3(%rdx)
test %rax, %rax
jnz L(CopyFrom1To16BytesCase2)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyExit12Case2OrCase3):
- movaps (%rdx), %xmm6
- psrldq $4, %xmm6
+ mov (%rcx), %r9d
mov $4, %rsi
- palignr $12, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ mov %r9d, (%rdx)
test %rax, %rax
jnz L(CopyFrom1To16BytesCase2)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyExit13Case2OrCase3):
- movaps (%rdx), %xmm6
- psrldq $3, %xmm6
+ mov -1(%rcx), %r9d
mov $3, %rsi
- palignr $13, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ mov %r9d, -1(%rdx)
test %rax, %rax
jnz L(CopyFrom1To16BytesCase2)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyExit14Case2OrCase3):
- movaps (%rdx), %xmm6
- psrldq $2, %xmm6
+ mov -2(%rcx), %r9d
mov $2, %rsi
- palignr $14, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ mov %r9d, -2(%rdx)
test %rax, %rax
jnz L(CopyFrom1To16BytesCase2)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyExit15Case2OrCase3):
- movaps (%rdx), %xmm6
- psrldq $1, %xmm6
+ mov -3(%rcx), %r9d
mov $1, %rsi
- palignr $15, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ mov %r9d, -3(%rdx)
test %rax, %rax
jnz L(CopyFrom1To16BytesCase2)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyLeave1):
movaps %xmm2, %xmm3
add $48, %r8
jle L(StrncpyExit1)
palignr $1, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 31(%rcx), %xmm2
lea 16(%rsi), %rsi
- movaps %xmm2, %xmm3
sub $16, %r8
jbe L(StrncpyExit1)
- palignr $1, %xmm1, %xmm2
+ palignr $1, %xmm3, %xmm2
movaps %xmm2, 16(%rdx)
- movaps 31+16(%rcx), %xmm2
- movaps %xmm3, %xmm1
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit1)
- movaps %xmm2, %xmm1
movaps %xmm4, 32(%rdx)
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit1)
- movaps %xmm7, %xmm1
movaps %xmm5, 48(%rdx)
lea 16(%rsi), %rsi
lea -16(%r8), %r8
L(StrncpyExit1):
- movaps (%rdx, %rsi), %xmm6
- psrldq $15, %xmm6
- palignr $1, %xmm1, %xmm6
- movaps %xmm6, (%rdx, %rsi)
- lea 15(%rsi), %rsi
+ lea 15(%rdx, %rsi), %rdx
+ lea 15(%rcx, %rsi), %rcx
+ mov -15(%rcx), %rsi
+ mov -8(%rcx), %rax
+ mov %rsi, -15(%rdx)
+ mov %rax, -8(%rdx)
+ xor %rsi, %rsi
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyLeave2):
movaps %xmm2, %xmm3
add $48, %r8
jle L(StrncpyExit2)
palignr $2, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 30(%rcx), %xmm2
lea 16(%rsi), %rsi
- movaps %xmm2, %xmm3
sub $16, %r8
jbe L(StrncpyExit2)
- palignr $2, %xmm1, %xmm2
+ palignr $2, %xmm3, %xmm2
movaps %xmm2, 16(%rdx)
- movaps 30+16(%rcx), %xmm2
- movaps %xmm3, %xmm1
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit2)
- movaps %xmm2, %xmm1
movaps %xmm4, 32(%rdx)
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit2)
- movaps %xmm7, %xmm1
movaps %xmm5, 48(%rdx)
lea 16(%rsi), %rsi
lea -16(%r8), %r8
L(StrncpyExit2):
- movaps (%rdx, %rsi), %xmm6
- psrldq $14, %xmm6
- palignr $2, %xmm1, %xmm6
- movaps %xmm6, (%rdx, %rsi)
- lea 14(%rsi), %rsi
+ lea 14(%rdx, %rsi), %rdx
+ lea 14(%rcx, %rsi), %rcx
+ mov -14(%rcx), %rsi
+ mov -8(%rcx), %rax
+ mov %rsi, -14(%rdx)
+ mov %rax, -8(%rdx)
+ xor %rsi, %rsi
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyLeave3):
movaps %xmm2, %xmm3
add $48, %r8
jle L(StrncpyExit3)
palignr $3, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 29(%rcx), %xmm2
lea 16(%rsi), %rsi
- movaps %xmm2, %xmm3
sub $16, %r8
jbe L(StrncpyExit3)
- palignr $3, %xmm1, %xmm2
+ palignr $3, %xmm3, %xmm2
movaps %xmm2, 16(%rdx)
- movaps 29+16(%rcx), %xmm2
- movaps %xmm3, %xmm1
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit3)
- movaps %xmm2, %xmm1
movaps %xmm4, 32(%rdx)
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit3)
- movaps %xmm7, %xmm1
movaps %xmm5, 48(%rdx)
lea 16(%rsi), %rsi
lea -16(%r8), %r8
L(StrncpyExit3):
- movaps (%rdx, %rsi), %xmm6
- psrldq $13, %xmm6
- palignr $3, %xmm1, %xmm6
- movaps %xmm6, (%rdx, %rsi)
- lea 13(%rsi), %rsi
+ lea 13(%rdx, %rsi), %rdx
+ lea 13(%rcx, %rsi), %rcx
+ mov -13(%rcx), %rsi
+ mov -8(%rcx), %rax
+ mov %rsi, -13(%rdx)
+ mov %rax, -8(%rdx)
+ xor %rsi, %rsi
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyLeave4):
movaps %xmm2, %xmm3
add $48, %r8
jle L(StrncpyExit4)
palignr $4, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 28(%rcx), %xmm2
lea 16(%rsi), %rsi
- movaps %xmm2, %xmm3
sub $16, %r8
jbe L(StrncpyExit4)
- palignr $4, %xmm1, %xmm2
+ palignr $4, %xmm3, %xmm2
movaps %xmm2, 16(%rdx)
- movaps 28+16(%rcx), %xmm2
- movaps %xmm3, %xmm1
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit4)
- movaps %xmm2, %xmm1
movaps %xmm4, 32(%rdx)
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit4)
- movaps %xmm7, %xmm1
movaps %xmm5, 48(%rdx)
lea 16(%rsi), %rsi
lea -16(%r8), %r8
L(StrncpyExit4):
- movaps (%rdx, %rsi), %xmm6
- psrldq $12, %xmm6
- palignr $4, %xmm1, %xmm6
- movaps %xmm6, (%rdx, %rsi)
- lea 12(%rsi), %rsi
+ lea 12(%rdx, %rsi), %rdx
+ lea 12(%rcx, %rsi), %rcx
+ mov -12(%rcx), %rsi
+ mov -4(%rcx), %eax
+ mov %rsi, -12(%rdx)
+ mov %eax, -4(%rdx)
+ xor %rsi, %rsi
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyLeave5):
movaps %xmm2, %xmm3
add $48, %r8
jle L(StrncpyExit5)
palignr $5, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 27(%rcx), %xmm2
lea 16(%rsi), %rsi
- movaps %xmm2, %xmm3
sub $16, %r8
jbe L(StrncpyExit5)
- palignr $5, %xmm1, %xmm2
+ palignr $5, %xmm3, %xmm2
movaps %xmm2, 16(%rdx)
- movaps 27+16(%rcx), %xmm2
- movaps %xmm3, %xmm1
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit5)
- movaps %xmm2, %xmm1
movaps %xmm4, 32(%rdx)
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit5)
- movaps %xmm7, %xmm1
movaps %xmm5, 48(%rdx)
lea 16(%rsi), %rsi
lea -16(%r8), %r8
L(StrncpyExit5):
- movaps (%rdx, %rsi), %xmm6
- psrldq $11, %xmm6
- palignr $5, %xmm1, %xmm6
- movaps %xmm6, (%rdx, %rsi)
- lea 11(%rsi), %rsi
+ lea 11(%rdx, %rsi), %rdx
+ lea 11(%rcx, %rsi), %rcx
+ mov -11(%rcx), %rsi
+ mov -4(%rcx), %eax
+ mov %rsi, -11(%rdx)
+ mov %eax, -4(%rdx)
+ xor %rsi, %rsi
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyLeave6):
movaps %xmm2, %xmm3
add $48, %r8
jle L(StrncpyExit6)
palignr $6, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 26(%rcx), %xmm2
lea 16(%rsi), %rsi
- movaps %xmm2, %xmm3
sub $16, %r8
jbe L(StrncpyExit6)
- palignr $6, %xmm1, %xmm2
+ palignr $6, %xmm3, %xmm2
movaps %xmm2, 16(%rdx)
- movaps 26+16(%rcx), %xmm2
- movaps %xmm3, %xmm1
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit6)
- movaps %xmm2, %xmm1
movaps %xmm4, 32(%rdx)
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit6)
- movaps %xmm7, %xmm1
movaps %xmm5, 48(%rdx)
lea 16(%rsi), %rsi
lea -16(%r8), %r8
L(StrncpyExit6):
- movaps (%rdx, %rsi), %xmm6
- psrldq $10, %xmm6
- palignr $6, %xmm1, %xmm6
- movaps %xmm6, (%rdx, %rsi)
- lea 10(%rsi), %rsi
+ lea 10(%rdx, %rsi), %rdx
+ lea 10(%rcx, %rsi), %rcx
+ mov -10(%rcx), %rsi
+ movw -2(%rcx), %ax
+ mov %rsi, -10(%rdx)
+ movw %ax, -2(%rdx)
+ xor %rsi, %rsi
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyLeave7):
movaps %xmm2, %xmm3
add $48, %r8
jle L(StrncpyExit7)
palignr $7, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 25(%rcx), %xmm2
lea 16(%rsi), %rsi
- movaps %xmm2, %xmm3
sub $16, %r8
jbe L(StrncpyExit7)
- palignr $7, %xmm1, %xmm2
+ palignr $7, %xmm3, %xmm2
movaps %xmm2, 16(%rdx)
- movaps 25+16(%rcx), %xmm2
- movaps %xmm3, %xmm1
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit7)
- movaps %xmm2, %xmm1
movaps %xmm4, 32(%rdx)
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit7)
- movaps %xmm7, %xmm1
movaps %xmm5, 48(%rdx)
lea 16(%rsi), %rsi
lea -16(%r8), %r8
L(StrncpyExit7):
- movaps (%rdx, %rsi), %xmm6
- psrldq $9, %xmm6
- palignr $7, %xmm1, %xmm6
- movaps %xmm6, (%rdx, %rsi)
- lea 9(%rsi), %rsi
+ lea 9(%rdx, %rsi), %rdx
+ lea 9(%rcx, %rsi), %rcx
+ mov -9(%rcx), %rsi
+ movb -1(%rcx), %ah
+ mov %rsi, -9(%rdx)
+ movb %ah, -1(%rdx)
+ xor %rsi, %rsi
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyLeave8):
movaps %xmm2, %xmm3
add $48, %r8
jle L(StrncpyExit8)
palignr $8, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 24(%rcx), %xmm2
lea 16(%rsi), %rsi
- movaps %xmm2, %xmm3
sub $16, %r8
jbe L(StrncpyExit8)
- palignr $8, %xmm1, %xmm2
+ palignr $8, %xmm3, %xmm2
movaps %xmm2, 16(%rdx)
- movaps 24+16(%rcx), %xmm2
- movaps %xmm3, %xmm1
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit8)
- movaps %xmm2, %xmm1
movaps %xmm4, 32(%rdx)
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit8)
- movaps %xmm7, %xmm1
movaps %xmm5, 48(%rdx)
lea 16(%rsi), %rsi
lea -16(%r8), %r8
L(StrncpyExit8):
- movaps (%rdx, %rsi), %xmm6
- psrldq $8, %xmm6
- palignr $8, %xmm1, %xmm6
- movaps %xmm6, (%rdx, %rsi)
- lea 8(%rsi), %rsi
+ lea 8(%rdx, %rsi), %rdx
+ lea 8(%rcx, %rsi), %rcx
+ mov -8(%rcx), %rax
+ xor %rsi, %rsi
+ mov %rax, -8(%rdx)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyLeave9):
movaps %xmm2, %xmm3
add $48, %r8
jle L(StrncpyExit9)
palignr $9, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 23(%rcx), %xmm2
lea 16(%rsi), %rsi
- movaps %xmm2, %xmm3
sub $16, %r8
jbe L(StrncpyExit9)
- palignr $9, %xmm1, %xmm2
+ palignr $9, %xmm3, %xmm2
movaps %xmm2, 16(%rdx)
- movaps 23+16(%rcx), %xmm2
- movaps %xmm3, %xmm1
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit9)
- movaps %xmm2, %xmm1
movaps %xmm4, 32(%rdx)
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit9)
- movaps %xmm7, %xmm1
movaps %xmm5, 48(%rdx)
lea 16(%rsi), %rsi
lea -16(%r8), %r8
L(StrncpyExit9):
- movaps (%rdx, %rsi), %xmm6
- psrldq $7, %xmm6
- palignr $9, %xmm1, %xmm6
- movaps %xmm6, (%rdx, %rsi)
- lea 7(%rsi), %rsi
+ lea 7(%rdx, %rsi), %rdx
+ lea 7(%rcx, %rsi), %rcx
+ mov -8(%rcx), %rax
+ xor %rsi, %rsi
+ mov %rax, -8(%rdx)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyLeave10):
movaps %xmm2, %xmm3
add $48, %r8
jle L(StrncpyExit10)
palignr $10, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 22(%rcx), %xmm2
lea 16(%rsi), %rsi
- movaps %xmm2, %xmm3
sub $16, %r8
jbe L(StrncpyExit10)
- palignr $10, %xmm1, %xmm2
+ palignr $10, %xmm3, %xmm2
movaps %xmm2, 16(%rdx)
- movaps 22+16(%rcx), %xmm2
- movaps %xmm3, %xmm1
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit10)
- movaps %xmm2, %xmm1
movaps %xmm4, 32(%rdx)
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit10)
- movaps %xmm7, %xmm1
movaps %xmm5, 48(%rdx)
lea 16(%rsi), %rsi
lea -16(%r8), %r8
L(StrncpyExit10):
- movaps (%rdx, %rsi), %xmm6
- psrldq $6, %xmm6
- palignr $10, %xmm1, %xmm6
- movaps %xmm6, (%rdx, %rsi)
- lea 6(%rsi), %rsi
+ lea 6(%rdx, %rsi), %rdx
+ lea 6(%rcx, %rsi), %rcx
+ mov -8(%rcx), %rax
+ xor %rsi, %rsi
+ mov %rax, -8(%rdx)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyLeave11):
movaps %xmm2, %xmm3
add $48, %r8
jle L(StrncpyExit11)
palignr $11, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 21(%rcx), %xmm2
lea 16(%rsi), %rsi
- movaps %xmm2, %xmm3
sub $16, %r8
jbe L(StrncpyExit11)
- palignr $11, %xmm1, %xmm2
+ palignr $11, %xmm3, %xmm2
movaps %xmm2, 16(%rdx)
- movaps 21+16(%rcx), %xmm2
- movaps %xmm3, %xmm1
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit11)
- movaps %xmm2, %xmm1
movaps %xmm4, 32(%rdx)
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit11)
- movaps %xmm7, %xmm1
movaps %xmm5, 48(%rdx)
lea 16(%rsi), %rsi
lea -16(%r8), %r8
L(StrncpyExit11):
- movaps (%rdx, %rsi), %xmm6
- psrldq $5, %xmm6
- palignr $11, %xmm1, %xmm6
- movaps %xmm6, (%rdx, %rsi)
- lea 5(%rsi), %rsi
+ lea 5(%rdx, %rsi), %rdx
+ lea 5(%rcx, %rsi), %rcx
+ mov -8(%rcx), %rax
+ xor %rsi, %rsi
+ mov %rax, -8(%rdx)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyLeave12):
movaps %xmm2, %xmm3
add $48, %r8
jle L(StrncpyExit12)
palignr $12, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 20(%rcx), %xmm2
lea 16(%rsi), %rsi
- movaps %xmm2, %xmm3
sub $16, %r8
jbe L(StrncpyExit12)
- palignr $12, %xmm1, %xmm2
+ palignr $12, %xmm3, %xmm2
movaps %xmm2, 16(%rdx)
- movaps 20+16(%rcx), %xmm2
- movaps %xmm3, %xmm1
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit12)
- movaps %xmm2, %xmm1
movaps %xmm4, 32(%rdx)
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit12)
- movaps %xmm7, %xmm1
movaps %xmm5, 48(%rdx)
lea 16(%rsi), %rsi
lea -16(%r8), %r8
L(StrncpyExit12):
- movaps (%rdx, %rsi), %xmm6
- psrldq $4, %xmm6
- palignr $12, %xmm1, %xmm6
- movaps %xmm6, (%rdx, %rsi)
- lea 4(%rsi), %rsi
+ lea 4(%rdx, %rsi), %rdx
+ lea 4(%rcx, %rsi), %rcx
+ mov -4(%rcx), %eax
+ xor %rsi, %rsi
+ mov %eax, -4(%rdx)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyLeave13):
movaps %xmm2, %xmm3
add $48, %r8
jle L(StrncpyExit13)
palignr $13, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 19(%rcx), %xmm2
lea 16(%rsi), %rsi
- movaps %xmm2, %xmm3
sub $16, %r8
jbe L(StrncpyExit13)
- palignr $13, %xmm1, %xmm2
+ palignr $13, %xmm3, %xmm2
movaps %xmm2, 16(%rdx)
- movaps 19+16(%rcx), %xmm2
- movaps %xmm3, %xmm1
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit13)
- movaps %xmm2, %xmm1
movaps %xmm4, 32(%rdx)
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit13)
- movaps %xmm7, %xmm1
movaps %xmm5, 48(%rdx)
lea 16(%rsi), %rsi
lea -16(%r8), %r8
L(StrncpyExit13):
- movaps (%rdx, %rsi), %xmm6
- psrldq $3, %xmm6
- palignr $13, %xmm1, %xmm6
- movaps %xmm6, (%rdx, %rsi)
- lea 3(%rsi), %rsi
+ lea 3(%rdx, %rsi), %rdx
+ lea 3(%rcx, %rsi), %rcx
+ mov -4(%rcx), %eax
+ xor %rsi, %rsi
+ mov %eax, -4(%rdx)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyLeave14):
movaps %xmm2, %xmm3
add $48, %r8
jle L(StrncpyExit14)
palignr $14, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 18(%rcx), %xmm2
lea 16(%rsi), %rsi
- movaps %xmm2, %xmm3
sub $16, %r8
jbe L(StrncpyExit14)
- palignr $14, %xmm1, %xmm2
+ palignr $14, %xmm3, %xmm2
movaps %xmm2, 16(%rdx)
- movaps 18+16(%rcx), %xmm2
- movaps %xmm3, %xmm1
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit14)
- movaps %xmm2, %xmm1
movaps %xmm4, 32(%rdx)
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit14)
- movaps %xmm7, %xmm1
movaps %xmm5, 48(%rdx)
lea 16(%rsi), %rsi
lea -16(%r8), %r8
L(StrncpyExit14):
- movaps (%rdx, %rsi), %xmm6
- psrldq $2, %xmm6
- palignr $14, %xmm1, %xmm6
- movaps %xmm6, (%rdx, %rsi)
- lea 2(%rsi), %rsi
+ lea 2(%rdx, %rsi), %rdx
+ lea 2(%rcx, %rsi), %rcx
+ movw -2(%rcx), %ax
+ xor %rsi, %rsi
+ movw %ax, -2(%rdx)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyLeave15):
movaps %xmm2, %xmm3
add $48, %r8
jle L(StrncpyExit15)
palignr $15, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 17(%rcx), %xmm2
lea 16(%rsi), %rsi
- movaps %xmm2, %xmm3
sub $16, %r8
jbe L(StrncpyExit15)
- palignr $15, %xmm1, %xmm2
+ palignr $15, %xmm3, %xmm2
movaps %xmm2, 16(%rdx)
- movaps 17+16(%rcx), %xmm2
- movaps %xmm3, %xmm1
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit15)
- movaps %xmm2, %xmm1
movaps %xmm4, 32(%rdx)
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit15)
- movaps %xmm7, %xmm1
movaps %xmm5, 48(%rdx)
lea 16(%rsi), %rsi
lea -16(%r8), %r8
L(StrncpyExit15):
- movaps (%rdx, %rsi), %xmm6
- psrldq $1, %xmm6
- palignr $15, %xmm1, %xmm6
- movaps %xmm6, (%rdx, %rsi)
- lea 1(%rsi), %rsi
+ lea 1(%rdx, %rsi), %rdx
+ lea 1(%rcx, %rsi), %rcx
+ movb -1(%rcx), %ah
+ xor %rsi, %rsi
+ movb %ah, -1(%rdx)
jmp L(CopyFrom1To16BytesCase3)
+
# endif
# ifndef USE_AS_STRCAT
END (STRCPY)
diff --git a/sysdeps/x86_64/multiarch/wcscpy-ssse3.S b/sysdeps/x86_64/multiarch/wcscpy-ssse3.S
index 4e292f3..477b2cb 100644
--- a/sysdeps/x86_64/multiarch/wcscpy-ssse3.S
+++ b/sysdeps/x86_64/multiarch/wcscpy-ssse3.S
@@ -21,8 +21,9 @@
#ifndef NOT_IN_libc
# include <sysdep.h>
-.text
+ .section .text.ssse3,"ax",@progbits
ENTRY (__wcscpy_ssse3)
+
mov %rsi, %rcx
mov %rdi, %rdx
@@ -136,6 +137,7 @@ L(Align16Both):
mov $-0x40, %rsi
+ .p2align 4
L(Aligned64Loop):
movaps (%rcx), %xmm2
movaps %xmm2, %xmm4
@@ -205,7 +207,6 @@ L(Shl4Start):
jnz L(Shl4LoopExit)
palignr $4, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 28(%rcx), %xmm2
@@ -213,15 +214,14 @@ L(Shl4Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
+ movaps %xmm2, %xmm1
test %rax, %rax
jnz L(Shl4LoopExit)
- palignr $4, %xmm1, %xmm2
+ palignr $4, %xmm3, %xmm2
movaps %xmm2, (%rdx)
movaps 28(%rcx), %xmm2
- movaps %xmm3, %xmm1
pcmpeqd %xmm2, %xmm0
lea 16(%rdx), %rdx
@@ -233,7 +233,6 @@ L(Shl4Start):
jnz L(Shl4LoopExit)
palignr $4, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 28(%rcx), %xmm2
@@ -245,8 +244,7 @@ L(Shl4Start):
test %rax, %rax
jnz L(Shl4LoopExit)
- palignr $4, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $4, %xmm3, %xmm2
movaps %xmm2, (%rdx)
lea 28(%rcx), %rcx
lea 16(%rdx), %rdx
@@ -259,6 +257,7 @@ L(Shl4Start):
movaps -4(%rcx), %xmm1
+ .p2align 4
L(Shl4LoopStart):
movaps 12(%rcx), %xmm2
movaps 28(%rcx), %xmm3
@@ -289,11 +288,9 @@ L(Shl4LoopStart):
jmp L(Shl4LoopStart)
L(Shl4LoopExit):
- movaps (%rdx), %xmm6
- psrldq $12, %xmm6
+ movdqu -4(%rcx), %xmm1
mov $12, %rsi
- palignr $4, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ movdqu %xmm1, -4(%rdx)
jmp L(CopyFrom1To16Bytes)
.p2align 4
@@ -309,7 +306,6 @@ L(Shl8Start):
jnz L(Shl8LoopExit)
palignr $8, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 24(%rcx), %xmm2
@@ -317,15 +313,14 @@ L(Shl8Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
+ movaps %xmm2, %xmm1
test %rax, %rax
jnz L(Shl8LoopExit)
- palignr $8, %xmm1, %xmm2
+ palignr $8, %xmm3, %xmm2
movaps %xmm2, (%rdx)
movaps 24(%rcx), %xmm2
- movaps %xmm3, %xmm1
pcmpeqd %xmm2, %xmm0
lea 16(%rdx), %rdx
@@ -337,7 +332,6 @@ L(Shl8Start):
jnz L(Shl8LoopExit)
palignr $8, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 24(%rcx), %xmm2
@@ -345,13 +339,11 @@ L(Shl8Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
test %rax, %rax
jnz L(Shl8LoopExit)
- palignr $8, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $8, %xmm3, %xmm2
movaps %xmm2, (%rdx)
lea 24(%rcx), %rcx
lea 16(%rdx), %rdx
@@ -364,6 +356,7 @@ L(Shl8Start):
movaps -8(%rcx), %xmm1
+ .p2align 4
L(Shl8LoopStart):
movaps 8(%rcx), %xmm2
movaps 24(%rcx), %xmm3
@@ -394,11 +387,9 @@ L(Shl8LoopStart):
jmp L(Shl8LoopStart)
L(Shl8LoopExit):
- movaps (%rdx), %xmm6
- psrldq $8, %xmm6
+ mov (%rcx), %r9
mov $8, %rsi
- palignr $8, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ mov %r9, (%rdx)
jmp L(CopyFrom1To16Bytes)
.p2align 4
@@ -414,7 +405,6 @@ L(Shl12Start):
jnz L(Shl12LoopExit)
palignr $12, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 20(%rcx), %xmm2
@@ -422,15 +412,14 @@ L(Shl12Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
+ movaps %xmm2, %xmm1
test %rax, %rax
jnz L(Shl12LoopExit)
- palignr $12, %xmm1, %xmm2
+ palignr $12, %xmm3, %xmm2
movaps %xmm2, (%rdx)
movaps 20(%rcx), %xmm2
- movaps %xmm3, %xmm1
pcmpeqd %xmm2, %xmm0
lea 16(%rdx), %rdx
@@ -442,7 +431,6 @@ L(Shl12Start):
jnz L(Shl12LoopExit)
palignr $12, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 20(%rcx), %xmm2
@@ -450,13 +438,11 @@ L(Shl12Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
test %rax, %rax
jnz L(Shl12LoopExit)
- palignr $12, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $12, %xmm3, %xmm2
movaps %xmm2, (%rdx)
lea 20(%rcx), %rcx
lea 16(%rdx), %rdx
@@ -469,6 +455,7 @@ L(Shl12Start):
movaps -12(%rcx), %xmm1
+ .p2align 4
L(Shl12LoopStart):
movaps 4(%rcx), %xmm2
movaps 20(%rcx), %xmm3
@@ -498,11 +485,10 @@ L(Shl12LoopStart):
jmp L(Shl12LoopStart)
L(Shl12LoopExit):
- movaps (%rdx), %xmm6
- psrldq $4, %xmm6
+ mov (%rcx), %r9d
mov $4, %rsi
- palignr $12, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ mov %r9d, (%rdx)
+ jmp L(CopyFrom1To16Bytes)
.p2align 4
L(CopyFrom1To16Bytes):
@@ -556,8 +542,10 @@ L(Exit12):
.p2align 4
L(Exit16):
- movdqu (%rcx), %xmm0
- movdqu %xmm0, (%rdx)
+ mov (%rcx), %rax
+ mov %rax, (%rdx)
+ mov 8(%rcx), %rax
+ mov %rax, 8(%rdx)
mov %rdi, %rax
ret
-----------------------------------------------------------------------
Summary of changes:
ChangeLog | 8 +
sysdeps/x86_64/multiarch/strcpy-ssse3.S | 767 ++++++++++++-------------------
sysdeps/x86_64/multiarch/wcscpy-ssse3.S | 64 +--
3 files changed, 331 insertions(+), 508 deletions(-)
hooks/post-receive
--
GNU C Library master sources