This is the mail archive of the
glibc-cvs@sourceware.org
mailing list for the glibc project.
GNU C Library master sources branch hjl/strcpy-avx2-cleanup-v3-direct-branches created. glibc-2.27.9000-656-ge58ce5f
- From: hjl at sourceware dot org
- To: glibc-cvs at sourceware dot org
- Date: 26 Sep 2018 22:34:00 -0000
- Subject: GNU C Library master sources branch hjl/strcpy-avx2-cleanup-v3-direct-branches created. glibc-2.27.9000-656-ge58ce5f
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".
The branch, hjl/strcpy-avx2-cleanup-v3-direct-branches has been created
at e58ce5fabbb5635db944f2232155dc204fd45501 (commit)
- Log -----------------------------------------------------------------
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=e58ce5fabbb5635db944f2232155dc204fd45501
commit e58ce5fabbb5635db944f2232155dc204fd45501
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Wed Sep 26 15:33:22 2018 -0700
Merge L(Exit1)
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 84fba69..7eeb255 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -368,11 +368,18 @@ L(CopyVecSizeExit):
jae L(Exit5_7)
cmp $3, %edx
je L(Exit4)
- cmp $2, %edx
- je L(Exit3)
- cmp $0, %edx
- ja L(Exit2)
- je L(Exit1)
+ cmp $1, %edx
+ ja L(Exit3)
+ je L(Exit2)
+ movb $0, (%rdi)
+# ifdef USE_AS_STPCPY
+ lea (%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $1, %r8
+ lea 1(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
VZEROUPPER
ret
@@ -729,20 +736,6 @@ L(CopyVecSizeTail1Case2OrCase3):
/*------------End labels regarding with copying 1-VEC_SIZE bytes--and 1-(VEC_SIZE*2) bytes----*/
.p2align 4
-L(Exit1):
- mov %dh, (%rdi)
-# ifdef USE_AS_STPCPY
- lea (%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $1, %r8
- lea 1(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
L(Exit2):
mov (%rsi), %dx
mov %dx, (%rdi)
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=57cb5d8a1327ee54fd441b6e3ec4f1fa4b51127a
commit 57cb5d8a1327ee54fd441b6e3ec4f1fa4b51127a
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Wed Sep 26 15:29:26 2018 -0700
More L(CopyVecSizeExit) use
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 7efcec2..84fba69 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -391,31 +391,7 @@ L(CopyTwoVecSize):
add %rcx, %rsi
add $VEC_SIZE, %edx
sub %ecx, %edx
- cmp $63, %edx
- je L(Exit64)
- cmp $32, %edx
- jae L(Exit33_63)
- cmp $31, %edx
- je L(Exit32)
- cmp $16, %edx
- jae L(Exit17_31)
- cmp $15, %edx
- je L(Exit16)
- cmp $8, %edx
- jae L(Exit9_15)
- cmp $7, %edx
- je L(Exit8)
- cmp $4, %edx
- jae L(Exit5_7)
- cmp $3, %edx
- je L(Exit4)
- cmp $2, %edx
- je L(Exit3)
- cmp $0, %edx
- ja L(Exit2)
- je L(Exit1)
- VZEROUPPER
- ret
+ jmp L(CopyVecSizeExit)
.p2align 4
L(CopyVecSizeUnaligned_0):
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=08e3f3162ba71a22298179a2c5c8476287359ea2
commit 08e3f3162ba71a22298179a2c5c8476287359ea2
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Wed Sep 26 15:26:21 2018 -0700
Merge L(CopyVecSizeTail1)
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 85c284a..7efcec2 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -347,6 +347,7 @@ L(CopyVecSize):
# endif
L(CopyVecSizeTail):
add %rcx, %rsi
+L(CopyVecSizeTail1):
bsf %edx, %edx
L(CopyVecSizeExit):
cmp $63, %edx
@@ -382,33 +383,7 @@ L(CopyTwoVecSize1):
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
sub $VEC_SIZE, %r8
# endif
-L(CopyVecSizeTail1):
- bsf %edx, %edx
- cmp $63, %edx
- je L(Exit64)
- cmp $32, %edx
- jae L(Exit33_63)
- cmp $31, %edx
- je L(Exit32)
- cmp $16, %edx
- jae L(Exit17_31)
- cmp $15, %edx
- je L(Exit16)
- cmp $8, %edx
- jae L(Exit9_15)
- cmp $7, %edx
- je L(Exit8)
- cmp $4, %edx
- jae L(Exit5_7)
- cmp $3, %edx
- je L(Exit4)
- cmp $2, %edx
- je L(Exit3)
- cmp $0, %edx
- ja L(Exit2)
- je L(Exit1)
- VZEROUPPER
- ret
+ jmp L(CopyVecSizeTail1)
.p2align 4
L(CopyTwoVecSize):
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=c83b5a852f0a7ff09dbe9490e169cb30397ad17a
commit c83b5a852f0a7ff09dbe9490e169cb30397ad17a
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Wed Sep 26 15:23:41 2018 -0700
Use L(CopyVecSizeExit)
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 8dfb2d3..85c284a 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -295,32 +295,7 @@ L(UnalignedFourVecSizeLeave):
# else
add $(VEC_SIZE * 3), %rsi
add $(VEC_SIZE * 3), %rdi
- cmp $63, %edx
- je L(Exit64)
- cmp $32, %edx
- jae L(Exit33_63)
- cmp $31, %edx
- je L(Exit32)
- cmp $16, %edx
- jae L(Exit17_31)
- cmp $15, %edx
- je L(Exit16)
- cmp $8, %edx
- jae L(Exit9_15)
- cmp $7, %edx
- je L(Exit8)
- cmp $4, %edx
- jae L(Exit5_7)
- cmp $3, %edx
- je L(Exit4)
- cmp $2, %edx
- je L(Exit3)
- cmp $0, %edx
- ja L(Exit2)
- je L(Exit1)
- VZEROUPPER
- ret
-
+ jmp L(CopyVecSizeExit)
# endif
/* If source address alignment == destination address alignment */
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=95cc78baad0e4544967659bab8a0bec6863fe4ce
commit 95cc78baad0e4544967659bab8a0bec6863fe4ce
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Wed Sep 26 15:20:47 2018 -0700
Merge L(CopyVecSizeExit)
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index d3b4590..8dfb2d3 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -369,39 +369,11 @@ L(SourceStringAlignmentLessTwoVecSize):
.p2align 4
L(CopyVecSize):
add %rcx, %rdi
- add %rcx, %rsi
- bsf %edx, %edx
# endif
-L(CopyVecSizeExit):
- cmp $63, %edx
- je L(Exit64)
- cmp $32, %edx
- jae L(Exit33_63)
- cmp $31, %edx
- je L(Exit32)
- cmp $16, %edx
- jae L(Exit17_31)
- cmp $15, %edx
- je L(Exit16)
- cmp $8, %edx
- jae L(Exit9_15)
- cmp $7, %edx
- je L(Exit8)
- cmp $4, %edx
- jae L(Exit5_7)
- cmp $3, %edx
- je L(Exit4)
- cmp $2, %edx
- je L(Exit3)
- cmp $0, %edx
- ja L(Exit2)
- je L(Exit1)
- VZEROUPPER
- ret
- .p2align 4
L(CopyVecSizeTail):
add %rcx, %rsi
bsf %edx, %edx
+L(CopyVecSizeExit):
cmp $63, %edx
je L(Exit64)
cmp $32, %edx
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=14d8b1cae5d1528a05f0e396d7bd2fa96aa2f64b
commit 14d8b1cae5d1528a05f0e396d7bd2fa96aa2f64b
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Wed Sep 26 15:16:28 2018 -0700
    Merge L(CopyVecSizeExit)
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 00dfeb9..d3b4590 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -371,6 +371,8 @@ L(CopyVecSize):
add %rcx, %rdi
add %rcx, %rsi
bsf %edx, %edx
+# endif
+L(CopyVecSizeExit):
cmp $63, %edx
je L(Exit64)
cmp $32, %edx
@@ -396,7 +398,6 @@ L(CopyVecSize):
je L(Exit1)
VZEROUPPER
ret
-# endif
.p2align 4
L(CopyVecSizeTail):
add %rcx, %rsi
@@ -507,31 +508,7 @@ L(CopyVecSizeUnaligned_0):
lea 1(%rdi, %rdx), %rdi
jmp L(StrncpyFillTailWithZero)
# else
- cmp $63, %edx
- je L(Exit64)
- cmp $32, %edx
- jae L(Exit33_63)
- cmp $31, %edx
- je L(Exit32)
- cmp $16, %edx
- jae L(Exit17_31)
- cmp $15, %edx
- je L(Exit16)
- cmp $8, %edx
- jae L(Exit9_15)
- cmp $7, %edx
- je L(Exit8)
- cmp $4, %edx
- jae L(Exit5_7)
- cmp $3, %edx
- je L(Exit4)
- cmp $2, %edx
- je L(Exit3)
- cmp $0, %edx
- ja L(Exit2)
- je L(Exit1)
- VZEROUPPER
- ret
+ jmp L(CopyVecSizeExit)
# endif
.p2align 4
@@ -550,31 +527,7 @@ L(CopyVecSizeUnaligned_16):
# else
add $VEC_SIZE, %rsi
add $VEC_SIZE, %rdi
- cmp $63, %edx
- je L(Exit64)
- cmp $32, %edx
- jae L(Exit33_63)
- cmp $31, %edx
- je L(Exit32)
- cmp $16, %edx
- jae L(Exit17_31)
- cmp $15, %edx
- je L(Exit16)
- cmp $8, %edx
- jae L(Exit9_15)
- cmp $7, %edx
- je L(Exit8)
- cmp $4, %edx
- jae L(Exit5_7)
- cmp $3, %edx
- je L(Exit4)
- cmp $2, %edx
- je L(Exit3)
- cmp $0, %edx
- ja L(Exit2)
- je L(Exit1)
- VZEROUPPER
- ret
+ jmp L(CopyVecSizeExit)
# endif
.p2align 4
@@ -594,31 +547,7 @@ L(CopyVecSizeUnaligned_32):
# else
add $(VEC_SIZE * 2), %rsi
add $(VEC_SIZE * 2), %rdi
- cmp $63, %edx
- je L(Exit64)
- cmp $32, %edx
- jae L(Exit33_63)
- cmp $31, %edx
- je L(Exit32)
- cmp $16, %edx
- jae L(Exit17_31)
- cmp $15, %edx
- je L(Exit16)
- cmp $8, %edx
- jae L(Exit9_15)
- cmp $7, %edx
- je L(Exit8)
- cmp $4, %edx
- jae L(Exit5_7)
- cmp $3, %edx
- je L(Exit4)
- cmp $2, %edx
- je L(Exit3)
- cmp $0, %edx
- ja L(Exit2)
- je L(Exit1)
- VZEROUPPER
- ret
+ jmp L(CopyVecSizeExit)
# endif
# ifdef USE_AS_STRNCPY
@@ -649,34 +578,6 @@ L(CopyVecSizeUnalignedVec1):
jmp L(CopyVecSizeVecExit)
# endif
- .p2align 4
-L(CopyVecSizeExit):
- cmp $63, %edx
- je L(Exit64)
- cmp $32, %edx
- jae L(Exit33_63)
- cmp $31, %edx
- je L(Exit32)
- cmp $16, %edx
- jae L(Exit17_31)
- cmp $15, %edx
- je L(Exit16)
- cmp $8, %edx
- jae L(Exit9_15)
- cmp $7, %edx
- je L(Exit8)
- cmp $4, %edx
- jae L(Exit5_7)
- cmp $3, %edx
- je L(Exit4)
- cmp $2, %edx
- je L(Exit3)
- cmp $0, %edx
- ja L(Exit2)
- je L(Exit1)
- VZEROUPPER
- ret
-
/* Case2 */
.p2align 4
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=b98f38b0e21a732eba99827fbf10382ed9900d68
commit b98f38b0e21a732eba99827fbf10382ed9900d68
Author: Leonardo Sandoval <leonardo.sandoval.gonzalez@linux.intel.com>
Date: Wed Sep 26 12:35:38 2018 -0500
    use 32-bit registers
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 408c038..00dfeb9 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -1392,19 +1392,19 @@ L(StrncpyFillLessFourVecSize):
jl L(StrncpyFillExit)
vmovdqa %ymmZ, (%rdi)
add $VEC_SIZE, %rdi
- cmp $32, %r8
+ cmp $32, %r8d
je L(Fill32)
- cmp $17, %r8
+ cmp $17, %r8d
jae L(Fill17_31)
- cmp $15, %r8
+ cmp $15, %r8d
jae L(Fill15_16)
- cmp $8, %r8
+ cmp $8, %r8d
jae L(Fill8_14)
- cmp $4, %r8
+ cmp $4, %r8d
jae L(Fill4_7)
- cmp $3, %r8
+ cmp $3, %r8d
je L(Fill3)
- cmp $1, %r8
+ cmp $1, %r8d
ja L(Fill2)
je L(Fill1)
VZEROUPPER
@@ -1416,19 +1416,19 @@ L(StrncpyFillLessTwoVecSize):
jl L(StrncpyFillExit)
vmovdqa %ymmZ, (%rdi)
add $VEC_SIZE, %rdi
- cmp $32, %r8
+ cmp $32, %r8d
je L(Fill32)
- cmp $17, %r8
+ cmp $17, %r8d
jae L(Fill17_31)
- cmp $15, %r8
+ cmp $15, %r8d
jae L(Fill15_16)
- cmp $8, %r8
+ cmp $8, %r8d
jae L(Fill8_14)
- cmp $4, %r8
+ cmp $4, %r8d
jae L(Fill4_7)
- cmp $3, %r8
+ cmp $3, %r8d
je L(Fill3)
- cmp $1, %r8
+ cmp $1, %r8d
ja L(Fill2)
je L(Fill1)
VZEROUPPER
@@ -1436,19 +1436,19 @@ L(StrncpyFillLessTwoVecSize):
L(StrncpyFillExit):
add $VEC_SIZE, %r8
- cmp $32, %r8
+ cmp $32, %r8d
je L(Fill32)
- cmp $17, %r8
+ cmp $17, %r8d
jae L(Fill17_31)
- cmp $15, %r8
+ cmp $15, %r8d
jae L(Fill15_16)
- cmp $8, %r8
+ cmp $8, %r8d
jae L(Fill8_14)
- cmp $4, %r8
+ cmp $4, %r8d
jae L(Fill4_7)
- cmp $3, %r8
+ cmp $3, %r8d
je L(Fill3)
- cmp $1, %r8
+ cmp $1, %r8d
ja L(Fill2)
je L(Fill1)
VZEROUPPER
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=532b9eda8b8ae7d5e87e3b7c9a0bf8a2496a02b0
commit 532b9eda8b8ae7d5e87e3b7c9a0bf8a2496a02b0
Author: Leonardo Sandoval <leonardo.sandoval.gonzalez@linux.intel.com>
Date: Wed Sep 26 12:33:07 2018 -0500
    use 32-bit registers
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 0c4a1e6..408c038 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -687,25 +687,25 @@ L(CopyVecSizeCase2):
bsf %edx, %edx
cmp %r8d, %edx
jb L(CopyVecSizeExit)
- cmp $65, %r8
+ cmp $65, %r8d
je L(StrncpyExit65)
- cmp $33, %r8
+ cmp $33, %r8d
jae L(StrncpyExit33_64)
- cmp $17, %r8
+ cmp $17, %r8d
jae L(StrncpyExit17_32)
- cmp $16, %r8
+ cmp $16, %r8d
je L(StrncpyExit16)
- cmp $15, %r8
+ cmp $15, %r8d
je L(StrncpyExit15)
- cmp $8, %r8
+ cmp $8, %r8d
jae L(StrncpyExit8_14)
- cmp $4, %r8
+ cmp $4, %r8d
jae L(StrncpyExit4_7)
- cmp $3, %r8
+ cmp $3, %r8d
je L(StrncpyExit3)
- cmp $2, %r8
+ cmp $2, %r8d
je L(StrncpyExit2)
- cmp $0, %r8
+ cmp $0, %r8d
ja L(StrncpyExit1)
je L(StrncpyExit0)
VZEROUPPER
@@ -719,25 +719,25 @@ L(CopyTwoVecSizeCase2):
sub %ecx, %edx
cmp %r8d, %edx
jb L(CopyVecSizeExit)
- cmp $65, %r8
+ cmp $65, %r8d
je L(StrncpyExit65)
- cmp $33, %r8
+ cmp $33, %r8d
jae L(StrncpyExit33_64)
- cmp $17, %r8
+ cmp $17, %r8d
jae L(StrncpyExit17_32)
- cmp $16, %r8
+ cmp $16, %r8d
je L(StrncpyExit16)
- cmp $15, %r8
+ cmp $15, %r8d
je L(StrncpyExit15)
- cmp $8, %r8
+ cmp $8, %r8d
jae L(StrncpyExit8_14)
- cmp $4, %r8
+ cmp $4, %r8d
jae L(StrncpyExit4_7)
- cmp $3, %r8
+ cmp $3, %r8d
je L(StrncpyExit3)
- cmp $2, %r8
+ cmp $2, %r8d
je L(StrncpyExit2)
- cmp $0, %r8
+ cmp $0, %r8d
ja L(StrncpyExit1)
je L(StrncpyExit0)
VZEROUPPER
@@ -749,25 +749,25 @@ L(CopyVecSizeTailCase2):
bsf %edx, %edx
cmp %r8d, %edx
jb L(CopyVecSizeExit)
- cmp $65, %r8
+ cmp $65, %r8d
je L(StrncpyExit65)
- cmp $33, %r8
+ cmp $33, %r8d
jae L(StrncpyExit33_64)
- cmp $17, %r8
+ cmp $17, %r8d
jae L(StrncpyExit17_32)
- cmp $16, %r8
+ cmp $16, %r8d
je L(StrncpyExit16)
- cmp $15, %r8
+ cmp $15, %r8d
je L(StrncpyExit15)
- cmp $8, %r8
+ cmp $8, %r8d
jae L(StrncpyExit8_14)
- cmp $4, %r8
+ cmp $4, %r8d
jae L(StrncpyExit4_7)
- cmp $3, %r8
+ cmp $3, %r8d
je L(StrncpyExit3)
- cmp $2, %r8
+ cmp $2, %r8d
je L(StrncpyExit2)
- cmp $0, %r8
+ cmp $0, %r8d
ja L(StrncpyExit1)
je L(StrncpyExit0)
VZEROUPPER
@@ -777,25 +777,25 @@ L(CopyVecSizeTail1Case2):
bsf %edx, %edx
cmp %r8d, %edx
jb L(CopyVecSizeExit)
- cmp $65, %r8
+ cmp $65, %r8d
je L(StrncpyExit65)
- cmp $33, %r8
+ cmp $33, %r8d
jae L(StrncpyExit33_64)
- cmp $17, %r8
+ cmp $17, %r8d
jae L(StrncpyExit17_32)
- cmp $16, %r8
+ cmp $16, %r8d
je L(StrncpyExit16)
- cmp $15, %r8
+ cmp $15, %r8d
je L(StrncpyExit15)
- cmp $8, %r8
+ cmp $8, %r8d
jae L(StrncpyExit8_14)
- cmp $4, %r8
+ cmp $4, %r8d
jae L(StrncpyExit4_7)
- cmp $3, %r8
+ cmp $3, %r8d
je L(StrncpyExit3)
- cmp $2, %r8
+ cmp $2, %r8d
je L(StrncpyExit2)
- cmp $0, %r8
+ cmp $0, %r8d
ja L(StrncpyExit1)
je L(StrncpyExit0)
VZEROUPPER
@@ -811,25 +811,25 @@ L(CopyVecSizeCase3):
add $VEC_SIZE, %r8
add %rcx, %rdi
add %rcx, %rsi
- cmp $65, %r8
+ cmp $65, %r8d
je L(StrncpyExit65)
- cmp $33, %r8
+ cmp $33, %r8d
jae L(StrncpyExit33_64)
- cmp $17, %r8
+ cmp $17, %r8d
jae L(StrncpyExit17_32)
- cmp $16, %r8
+ cmp $16, %r8d
je L(StrncpyExit16)
- cmp $15, %r8
+ cmp $15, %r8d
je L(StrncpyExit15)
- cmp $8, %r8
+ cmp $8, %r8d
jae L(StrncpyExit8_14)
- cmp $4, %r8
+ cmp $4, %r8d
jae L(StrncpyExit4_7)
- cmp $3, %r8
+ cmp $3, %r8d
je L(StrncpyExit3)
- cmp $2, %r8
+ cmp $2, %r8d
je L(StrncpyExit2)
- cmp $0, %r8
+ cmp $0, %r8d
ja L(StrncpyExit1)
je L(StrncpyExit0)
VZEROUPPER
@@ -840,25 +840,25 @@ L(CopyTwoVecSizeCase2OrCase3):
test %rdx, %rdx
jnz L(CopyTwoVecSizeCase2)
add %rcx, %rsi
- cmp $65, %r8
+ cmp $65, %r8d
je L(StrncpyExit65)
- cmp $33, %r8
+ cmp $33, %r8d
jae L(StrncpyExit33_64)
- cmp $17, %r8
+ cmp $17, %r8d
jae L(StrncpyExit17_32)
- cmp $16, %r8
+ cmp $16, %r8d
je L(StrncpyExit16)
- cmp $15, %r8
+ cmp $15, %r8d
je L(StrncpyExit15)
- cmp $8, %r8
+ cmp $8, %r8d
jae L(StrncpyExit8_14)
- cmp $4, %r8
+ cmp $4, %r8d
jae L(StrncpyExit4_7)
- cmp $3, %r8
+ cmp $3, %r8d
je L(StrncpyExit3)
- cmp $2, %r8
+ cmp $2, %r8d
je L(StrncpyExit2)
- cmp $0, %r8
+ cmp $0, %r8d
ja L(StrncpyExit1)
je L(StrncpyExit0)
VZEROUPPER
@@ -869,25 +869,25 @@ L(CopyVecSizeTailCase2OrCase3):
test %rdx, %rdx
jnz L(CopyVecSizeTailCase2)
add %rcx, %rsi
- cmp $65, %r8
+ cmp $65, %r8d
je L(StrncpyExit65)
- cmp $33, %r8
+ cmp $33, %r8d
jae L(StrncpyExit33_64)
- cmp $17, %r8
+ cmp $17, %r8d
jae L(StrncpyExit17_32)
- cmp $16, %r8
+ cmp $16, %r8d
je L(StrncpyExit16)
- cmp $15, %r8
+ cmp $15, %r8d
je L(StrncpyExit15)
- cmp $8, %r8
+ cmp $8, %r8d
jae L(StrncpyExit8_14)
- cmp $4, %r8
+ cmp $4, %r8d
jae L(StrncpyExit4_7)
- cmp $3, %r8
+ cmp $3, %r8d
je L(StrncpyExit3)
- cmp $2, %r8
+ cmp $2, %r8d
je L(StrncpyExit2)
- cmp $0, %r8
+ cmp $0, %r8d
ja L(StrncpyExit1)
je L(StrncpyExit0)
VZEROUPPER
@@ -901,25 +901,25 @@ L(CopyTwoVecSize1Case2OrCase3):
L(CopyVecSizeTail1Case2OrCase3):
test %rdx, %rdx
jnz L(CopyVecSizeTail1Case2)
- cmp $65, %r8
+ cmp $65, %r8d
je L(StrncpyExit65)
- cmp $33, %r8
+ cmp $33, %r8d
jae L(StrncpyExit33_64)
- cmp $17, %r8
+ cmp $17, %r8d
jae L(StrncpyExit17_32)
- cmp $16, %r8
+ cmp $16, %r8d
je L(StrncpyExit16)
- cmp $15, %r8
+ cmp $15, %r8d
je L(StrncpyExit15)
- cmp $8, %r8
+ cmp $8, %r8d
jae L(StrncpyExit8_14)
- cmp $4, %r8
+ cmp $4, %r8d
jae L(StrncpyExit4_7)
- cmp $3, %r8
+ cmp $3, %r8d
je L(StrncpyExit3)
- cmp $2, %r8
+ cmp $2, %r8d
je L(StrncpyExit2)
- cmp $0, %r8
+ cmp $0, %r8d
ja L(StrncpyExit1)
je L(StrncpyExit0)
VZEROUPPER
@@ -1532,25 +1532,25 @@ L(UnalignedFourVecSizeLeaveCase2):
bsf %edx, %edx
cmp %r8d, %edx
jb L(CopyVecSizeExit)
- cmp $65, %r8
+ cmp $65, %r8d
je L(StrncpyExit65)
- cmp $33, %r8
+ cmp $33, %r8d
jae L(StrncpyExit33_64)
- cmp $17, %r8
+ cmp $17, %r8d
jae L(StrncpyExit17_32)
- cmp $16, %r8
+ cmp $16, %r8d
je L(StrncpyExit16)
- cmp $15, %r8
+ cmp $15, %r8d
je L(StrncpyExit15)
- cmp $8, %r8
+ cmp $8, %r8d
jae L(StrncpyExit8_14)
- cmp $4, %r8
+ cmp $4, %r8d
jae L(StrncpyExit4_7)
- cmp $3, %r8
+ cmp $3, %r8d
je L(StrncpyExit3)
- cmp $2, %r8
+ cmp $2, %r8d
je L(StrncpyExit2)
- cmp $0, %r8
+ cmp $0, %r8d
ja L(StrncpyExit1)
je L(StrncpyExit0)
VZEROUPPER
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=71564cd3ec32db73b3d3a368d82fbd074e67b53f
commit 71564cd3ec32db73b3d3a368d82fbd074e67b53f
Author: Leonardo Sandoval <leonardo.sandoval.gonzalez@linux.intel.com>
Date: Wed Sep 26 12:27:37 2018 -0500
remove jump tables
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index e1bf592..0c4a1e6 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -1571,177 +1571,4 @@ END (STRCPY)
# else
END (STRCAT)
# endif
- .p2align 4
- .section .rodata
-L(ExitTable):
- .int JMPTBL(L(Exit1), L(ExitTable))
- .int JMPTBL(L(Exit2), L(ExitTable))
- .int JMPTBL(L(Exit3), L(ExitTable))
- .int JMPTBL(L(Exit4), L(ExitTable))
- .int JMPTBL(L(Exit5_7), L(ExitTable))
- .int JMPTBL(L(Exit5_7), L(ExitTable))
- .int JMPTBL(L(Exit5_7), L(ExitTable))
- .int JMPTBL(L(Exit8), L(ExitTable))
- .int JMPTBL(L(Exit9_15), L(ExitTable))
- .int JMPTBL(L(Exit9_15), L(ExitTable))
- .int JMPTBL(L(Exit9_15), L(ExitTable))
- .int JMPTBL(L(Exit9_15), L(ExitTable))
- .int JMPTBL(L(Exit9_15), L(ExitTable))
- .int JMPTBL(L(Exit9_15), L(ExitTable))
- .int JMPTBL(L(Exit9_15), L(ExitTable))
- .int JMPTBL(L(Exit16), L(ExitTable))
- .int JMPTBL(L(Exit17_31), L(ExitTable))
- .int JMPTBL(L(Exit17_31), L(ExitTable))
- .int JMPTBL(L(Exit17_31), L(ExitTable))
- .int JMPTBL(L(Exit17_31), L(ExitTable))
- .int JMPTBL(L(Exit17_31), L(ExitTable))
- .int JMPTBL(L(Exit17_31), L(ExitTable))
- .int JMPTBL(L(Exit17_31), L(ExitTable))
- .int JMPTBL(L(Exit17_31), L(ExitTable))
- .int JMPTBL(L(Exit17_31), L(ExitTable))
- .int JMPTBL(L(Exit17_31), L(ExitTable))
- .int JMPTBL(L(Exit17_31), L(ExitTable))
- .int JMPTBL(L(Exit17_31), L(ExitTable))
- .int JMPTBL(L(Exit17_31), L(ExitTable))
- .int JMPTBL(L(Exit17_31), L(ExitTable))
- .int JMPTBL(L(Exit17_31), L(ExitTable))
- .int JMPTBL(L(Exit32), L(ExitTable))
- .int JMPTBL(L(Exit33_63), L(ExitTable))
- .int JMPTBL(L(Exit33_63), L(ExitTable))
- .int JMPTBL(L(Exit33_63), L(ExitTable))
- .int JMPTBL(L(Exit33_63), L(ExitTable))
- .int JMPTBL(L(Exit33_63), L(ExitTable))
- .int JMPTBL(L(Exit33_63), L(ExitTable))
- .int JMPTBL(L(Exit33_63), L(ExitTable))
- .int JMPTBL(L(Exit33_63), L(ExitTable))
- .int JMPTBL(L(Exit33_63), L(ExitTable))
- .int JMPTBL(L(Exit33_63), L(ExitTable))
- .int JMPTBL(L(Exit33_63), L(ExitTable))
- .int JMPTBL(L(Exit33_63), L(ExitTable))
- .int JMPTBL(L(Exit33_63), L(ExitTable))
- .int JMPTBL(L(Exit33_63), L(ExitTable))
- .int JMPTBL(L(Exit33_63), L(ExitTable))
- .int JMPTBL(L(Exit33_63), L(ExitTable))
- .int JMPTBL(L(Exit33_63), L(ExitTable))
- .int JMPTBL(L(Exit33_63), L(ExitTable))
- .int JMPTBL(L(Exit33_63), L(ExitTable))
- .int JMPTBL(L(Exit33_63), L(ExitTable))
- .int JMPTBL(L(Exit33_63), L(ExitTable))
- .int JMPTBL(L(Exit33_63), L(ExitTable))
- .int JMPTBL(L(Exit33_63), L(ExitTable))
- .int JMPTBL(L(Exit33_63), L(ExitTable))
- .int JMPTBL(L(Exit33_63), L(ExitTable))
- .int JMPTBL(L(Exit33_63), L(ExitTable))
- .int JMPTBL(L(Exit33_63), L(ExitTable))
- .int JMPTBL(L(Exit33_63), L(ExitTable))
- .int JMPTBL(L(Exit33_63), L(ExitTable))
- .int JMPTBL(L(Exit33_63), L(ExitTable))
- .int JMPTBL(L(Exit33_63), L(ExitTable))
- .int JMPTBL(L(Exit64), L(ExitTable))
-# ifdef USE_AS_STRNCPY
-L(ExitStrncpyTable):
- .int JMPTBL(L(StrncpyExit0), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit1), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit2), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit3), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit4_7), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit4_7), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit4_7), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit4_7), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit8_14), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit8_14), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit8_14), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit8_14), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit8_14), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit8_14), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit8_14), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit15), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit16), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit65), L(ExitStrncpyTable))
-# ifndef USE_AS_STRCAT
- .p2align 4
-L(FillTable):
- .int JMPTBL(L(Fill0), L(FillTable))
- .int JMPTBL(L(Fill1), L(FillTable))
- .int JMPTBL(L(Fill2), L(FillTable))
- .int JMPTBL(L(Fill3), L(FillTable))
- .int JMPTBL(L(Fill4_7), L(FillTable))
- .int JMPTBL(L(Fill4_7), L(FillTable))
- .int JMPTBL(L(Fill4_7), L(FillTable))
- .int JMPTBL(L(Fill4_7), L(FillTable))
- .int JMPTBL(L(Fill8_14), L(FillTable))
- .int JMPTBL(L(Fill8_14), L(FillTable))
- .int JMPTBL(L(Fill8_14), L(FillTable))
- .int JMPTBL(L(Fill8_14), L(FillTable))
- .int JMPTBL(L(Fill8_14), L(FillTable))
- .int JMPTBL(L(Fill8_14), L(FillTable))
- .int JMPTBL(L(Fill8_14), L(FillTable))
- .int JMPTBL(L(Fill15_16), L(FillTable))
- .int JMPTBL(L(Fill15_16), L(FillTable))
- .int JMPTBL(L(Fill17_31), L(FillTable))
- .int JMPTBL(L(Fill17_31), L(FillTable))
- .int JMPTBL(L(Fill17_31), L(FillTable))
- .int JMPTBL(L(Fill17_31), L(FillTable))
- .int JMPTBL(L(Fill17_31), L(FillTable))
- .int JMPTBL(L(Fill17_31), L(FillTable))
- .int JMPTBL(L(Fill17_31), L(FillTable))
- .int JMPTBL(L(Fill17_31), L(FillTable))
- .int JMPTBL(L(Fill17_31), L(FillTable))
- .int JMPTBL(L(Fill17_31), L(FillTable))
- .int JMPTBL(L(Fill17_31), L(FillTable))
- .int JMPTBL(L(Fill17_31), L(FillTable))
- .int JMPTBL(L(Fill17_31), L(FillTable))
- .int JMPTBL(L(Fill17_31), L(FillTable))
- .int JMPTBL(L(Fill17_31), L(FillTable))
- .int JMPTBL(L(Fill32), L(FillTable))
-# endif
-# endif
#endif
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=9c70650443520d88715d0c7ba6aef9641557175d
commit 9c70650443520d88715d0c7ba6aef9641557175d
Author: Leonardo Sandoval <leonardo.sandoval.gonzalez@linux.intel.com>
Date: Wed Sep 26 12:22:31 2018 -0500
use direct branches for ExitN labels
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 3b93a2c..e1bf592 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -295,7 +295,32 @@ L(UnalignedFourVecSizeLeave):
# else
add $(VEC_SIZE * 3), %rsi
add $(VEC_SIZE * 3), %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+ cmp $63, %edx
+ je L(Exit64)
+ cmp $32, %edx
+ jae L(Exit33_63)
+ cmp $31, %edx
+ je L(Exit32)
+ cmp $16, %edx
+ jae L(Exit17_31)
+ cmp $15, %edx
+ je L(Exit16)
+ cmp $8, %edx
+ jae L(Exit9_15)
+ cmp $7, %edx
+ je L(Exit8)
+ cmp $4, %edx
+ jae L(Exit5_7)
+ cmp $3, %edx
+ je L(Exit4)
+ cmp $2, %edx
+ je L(Exit3)
+ cmp $0, %edx
+ ja L(Exit2)
+ je L(Exit1)
+ VZEROUPPER
+ ret
+
# endif
/* If source address alignment == destination address alignment */
@@ -346,13 +371,61 @@ L(CopyVecSize):
add %rcx, %rdi
add %rcx, %rsi
bsf %edx, %edx
- BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+ cmp $63, %edx
+ je L(Exit64)
+ cmp $32, %edx
+ jae L(Exit33_63)
+ cmp $31, %edx
+ je L(Exit32)
+ cmp $16, %edx
+ jae L(Exit17_31)
+ cmp $15, %edx
+ je L(Exit16)
+ cmp $8, %edx
+ jae L(Exit9_15)
+ cmp $7, %edx
+ je L(Exit8)
+ cmp $4, %edx
+ jae L(Exit5_7)
+ cmp $3, %edx
+ je L(Exit4)
+ cmp $2, %edx
+ je L(Exit3)
+ cmp $0, %edx
+ ja L(Exit2)
+ je L(Exit1)
+ VZEROUPPER
+ ret
# endif
.p2align 4
L(CopyVecSizeTail):
add %rcx, %rsi
bsf %edx, %edx
- BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+ cmp $63, %edx
+ je L(Exit64)
+ cmp $32, %edx
+ jae L(Exit33_63)
+ cmp $31, %edx
+ je L(Exit32)
+ cmp $16, %edx
+ jae L(Exit17_31)
+ cmp $15, %edx
+ je L(Exit16)
+ cmp $8, %edx
+ jae L(Exit9_15)
+ cmp $7, %edx
+ je L(Exit8)
+ cmp $4, %edx
+ jae L(Exit5_7)
+ cmp $3, %edx
+ je L(Exit4)
+ cmp $2, %edx
+ je L(Exit3)
+ cmp $0, %edx
+ ja L(Exit2)
+ je L(Exit1)
+ VZEROUPPER
+ ret
.p2align 4
L(CopyTwoVecSize1):
@@ -363,7 +436,31 @@ L(CopyTwoVecSize1):
# endif
L(CopyVecSizeTail1):
bsf %edx, %edx
- BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+ cmp $63, %edx
+ je L(Exit64)
+ cmp $32, %edx
+ jae L(Exit33_63)
+ cmp $31, %edx
+ je L(Exit32)
+ cmp $16, %edx
+ jae L(Exit17_31)
+ cmp $15, %edx
+ je L(Exit16)
+ cmp $8, %edx
+ jae L(Exit9_15)
+ cmp $7, %edx
+ je L(Exit8)
+ cmp $4, %edx
+ jae L(Exit5_7)
+ cmp $3, %edx
+ je L(Exit4)
+ cmp $2, %edx
+ je L(Exit3)
+ cmp $0, %edx
+ ja L(Exit2)
+ je L(Exit1)
+ VZEROUPPER
+ ret
.p2align 4
L(CopyTwoVecSize):
@@ -371,7 +468,31 @@ L(CopyTwoVecSize):
add %rcx, %rsi
add $VEC_SIZE, %edx
sub %ecx, %edx
- BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+ cmp $63, %edx
+ je L(Exit64)
+ cmp $32, %edx
+ jae L(Exit33_63)
+ cmp $31, %edx
+ je L(Exit32)
+ cmp $16, %edx
+ jae L(Exit17_31)
+ cmp $15, %edx
+ je L(Exit16)
+ cmp $8, %edx
+ jae L(Exit9_15)
+ cmp $7, %edx
+ je L(Exit8)
+ cmp $4, %edx
+ jae L(Exit5_7)
+ cmp $3, %edx
+ je L(Exit4)
+ cmp $2, %edx
+ je L(Exit3)
+ cmp $0, %edx
+ ja L(Exit2)
+ je L(Exit1)
+ VZEROUPPER
+ ret
.p2align 4
L(CopyVecSizeUnaligned_0):
@@ -386,7 +507,31 @@ L(CopyVecSizeUnaligned_0):
lea 1(%rdi, %rdx), %rdi
jmp L(StrncpyFillTailWithZero)
# else
- BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+ cmp $63, %edx
+ je L(Exit64)
+ cmp $32, %edx
+ jae L(Exit33_63)
+ cmp $31, %edx
+ je L(Exit32)
+ cmp $16, %edx
+ jae L(Exit17_31)
+ cmp $15, %edx
+ je L(Exit16)
+ cmp $8, %edx
+ jae L(Exit9_15)
+ cmp $7, %edx
+ je L(Exit8)
+ cmp $4, %edx
+ jae L(Exit5_7)
+ cmp $3, %edx
+ je L(Exit4)
+ cmp $2, %edx
+ je L(Exit3)
+ cmp $0, %edx
+ ja L(Exit2)
+ je L(Exit1)
+ VZEROUPPER
+ ret
# endif
.p2align 4
@@ -405,7 +550,31 @@ L(CopyVecSizeUnaligned_16):
# else
add $VEC_SIZE, %rsi
add $VEC_SIZE, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+ cmp $63, %edx
+ je L(Exit64)
+ cmp $32, %edx
+ jae L(Exit33_63)
+ cmp $31, %edx
+ je L(Exit32)
+ cmp $16, %edx
+ jae L(Exit17_31)
+ cmp $15, %edx
+ je L(Exit16)
+ cmp $8, %edx
+ jae L(Exit9_15)
+ cmp $7, %edx
+ je L(Exit8)
+ cmp $4, %edx
+ jae L(Exit5_7)
+ cmp $3, %edx
+ je L(Exit4)
+ cmp $2, %edx
+ je L(Exit3)
+ cmp $0, %edx
+ ja L(Exit2)
+ je L(Exit1)
+ VZEROUPPER
+ ret
# endif
.p2align 4
@@ -425,7 +594,31 @@ L(CopyVecSizeUnaligned_32):
# else
add $(VEC_SIZE * 2), %rsi
add $(VEC_SIZE * 2), %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+ cmp $63, %edx
+ je L(Exit64)
+ cmp $32, %edx
+ jae L(Exit33_63)
+ cmp $31, %edx
+ je L(Exit32)
+ cmp $16, %edx
+ jae L(Exit17_31)
+ cmp $15, %edx
+ je L(Exit16)
+ cmp $8, %edx
+ jae L(Exit9_15)
+ cmp $7, %edx
+ je L(Exit8)
+ cmp $4, %edx
+ jae L(Exit5_7)
+ cmp $3, %edx
+ je L(Exit4)
+ cmp $2, %edx
+ je L(Exit3)
+ cmp $0, %edx
+ ja L(Exit2)
+ je L(Exit1)
+ VZEROUPPER
+ ret
# endif
# ifdef USE_AS_STRNCPY
@@ -458,7 +651,31 @@ L(CopyVecSizeUnalignedVec1):
.p2align 4
L(CopyVecSizeExit):
- BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+ cmp $63, %edx
+ je L(Exit64)
+ cmp $32, %edx
+ jae L(Exit33_63)
+ cmp $31, %edx
+ je L(Exit32)
+ cmp $16, %edx
+ jae L(Exit17_31)
+ cmp $15, %edx
+ je L(Exit16)
+ cmp $8, %edx
+ jae L(Exit9_15)
+ cmp $7, %edx
+ je L(Exit8)
+ cmp $4, %edx
+ jae L(Exit5_7)
+ cmp $3, %edx
+ je L(Exit4)
+ cmp $2, %edx
+ je L(Exit3)
+ cmp $0, %edx
+ ja L(Exit2)
+ je L(Exit1)
+ VZEROUPPER
+ ret
/* Case2 */
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=27c20556d42ab6511d164aca811dec3500f17b30
commit 27c20556d42ab6511d164aca811dec3500f17b30
Author: Leonardo Sandoval <leonardo.sandoval.gonzalez@linux.intel.com>
Date: Wed Sep 26 09:20:09 2018 -0500
use direct branches instead of the StrncpyTable jump table
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 2516bdd..3b93a2c 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -470,7 +470,29 @@ L(CopyVecSizeCase2):
bsf %edx, %edx
cmp %r8d, %edx
jb L(CopyVecSizeExit)
- BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+ cmp $65, %r8
+ je L(StrncpyExit65)
+ cmp $33, %r8
+ jae L(StrncpyExit33_64)
+ cmp $17, %r8
+ jae L(StrncpyExit17_32)
+ cmp $16, %r8
+ je L(StrncpyExit16)
+ cmp $15, %r8
+ je L(StrncpyExit15)
+ cmp $8, %r8
+ jae L(StrncpyExit8_14)
+ cmp $4, %r8
+ jae L(StrncpyExit4_7)
+ cmp $3, %r8
+ je L(StrncpyExit3)
+ cmp $2, %r8
+ je L(StrncpyExit2)
+ cmp $0, %r8
+ ja L(StrncpyExit1)
+ je L(StrncpyExit0)
+ VZEROUPPER
+ ret
.p2align 4
L(CopyTwoVecSizeCase2):
@@ -480,20 +502,87 @@ L(CopyTwoVecSizeCase2):
sub %ecx, %edx
cmp %r8d, %edx
jb L(CopyVecSizeExit)
- BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+ cmp $65, %r8
+ je L(StrncpyExit65)
+ cmp $33, %r8
+ jae L(StrncpyExit33_64)
+ cmp $17, %r8
+ jae L(StrncpyExit17_32)
+ cmp $16, %r8
+ je L(StrncpyExit16)
+ cmp $15, %r8
+ je L(StrncpyExit15)
+ cmp $8, %r8
+ jae L(StrncpyExit8_14)
+ cmp $4, %r8
+ jae L(StrncpyExit4_7)
+ cmp $3, %r8
+ je L(StrncpyExit3)
+ cmp $2, %r8
+ je L(StrncpyExit2)
+ cmp $0, %r8
+ ja L(StrncpyExit1)
+ je L(StrncpyExit0)
+ VZEROUPPER
+ ret
+
L(CopyVecSizeTailCase2):
add %rcx, %rsi
bsf %edx, %edx
cmp %r8d, %edx
jb L(CopyVecSizeExit)
- BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+ cmp $65, %r8
+ je L(StrncpyExit65)
+ cmp $33, %r8
+ jae L(StrncpyExit33_64)
+ cmp $17, %r8
+ jae L(StrncpyExit17_32)
+ cmp $16, %r8
+ je L(StrncpyExit16)
+ cmp $15, %r8
+ je L(StrncpyExit15)
+ cmp $8, %r8
+ jae L(StrncpyExit8_14)
+ cmp $4, %r8
+ jae L(StrncpyExit4_7)
+ cmp $3, %r8
+ je L(StrncpyExit3)
+ cmp $2, %r8
+ je L(StrncpyExit2)
+ cmp $0, %r8
+ ja L(StrncpyExit1)
+ je L(StrncpyExit0)
+ VZEROUPPER
+ ret
L(CopyVecSizeTail1Case2):
bsf %edx, %edx
cmp %r8d, %edx
jb L(CopyVecSizeExit)
- BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+ cmp $65, %r8
+ je L(StrncpyExit65)
+ cmp $33, %r8
+ jae L(StrncpyExit33_64)
+ cmp $17, %r8
+ jae L(StrncpyExit17_32)
+ cmp $16, %r8
+ je L(StrncpyExit16)
+ cmp $15, %r8
+ je L(StrncpyExit15)
+ cmp $8, %r8
+ jae L(StrncpyExit8_14)
+ cmp $4, %r8
+ jae L(StrncpyExit4_7)
+ cmp $3, %r8
+ je L(StrncpyExit3)
+ cmp $2, %r8
+ je L(StrncpyExit2)
+ cmp $0, %r8
+ ja L(StrncpyExit1)
+ je L(StrncpyExit0)
+ VZEROUPPER
+ ret
/* Case2 or Case3, Case3 */
@@ -505,21 +594,87 @@ L(CopyVecSizeCase3):
add $VEC_SIZE, %r8
add %rcx, %rdi
add %rcx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+ cmp $65, %r8
+ je L(StrncpyExit65)
+ cmp $33, %r8
+ jae L(StrncpyExit33_64)
+ cmp $17, %r8
+ jae L(StrncpyExit17_32)
+ cmp $16, %r8
+ je L(StrncpyExit16)
+ cmp $15, %r8
+ je L(StrncpyExit15)
+ cmp $8, %r8
+ jae L(StrncpyExit8_14)
+ cmp $4, %r8
+ jae L(StrncpyExit4_7)
+ cmp $3, %r8
+ je L(StrncpyExit3)
+ cmp $2, %r8
+ je L(StrncpyExit2)
+ cmp $0, %r8
+ ja L(StrncpyExit1)
+ je L(StrncpyExit0)
+ VZEROUPPER
+ ret
.p2align 4
L(CopyTwoVecSizeCase2OrCase3):
test %rdx, %rdx
jnz L(CopyTwoVecSizeCase2)
add %rcx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+ cmp $65, %r8
+ je L(StrncpyExit65)
+ cmp $33, %r8
+ jae L(StrncpyExit33_64)
+ cmp $17, %r8
+ jae L(StrncpyExit17_32)
+ cmp $16, %r8
+ je L(StrncpyExit16)
+ cmp $15, %r8
+ je L(StrncpyExit15)
+ cmp $8, %r8
+ jae L(StrncpyExit8_14)
+ cmp $4, %r8
+ jae L(StrncpyExit4_7)
+ cmp $3, %r8
+ je L(StrncpyExit3)
+ cmp $2, %r8
+ je L(StrncpyExit2)
+ cmp $0, %r8
+ ja L(StrncpyExit1)
+ je L(StrncpyExit0)
+ VZEROUPPER
+ ret
.p2align 4
L(CopyVecSizeTailCase2OrCase3):
test %rdx, %rdx
jnz L(CopyVecSizeTailCase2)
add %rcx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+ cmp $65, %r8
+ je L(StrncpyExit65)
+ cmp $33, %r8
+ jae L(StrncpyExit33_64)
+ cmp $17, %r8
+ jae L(StrncpyExit17_32)
+ cmp $16, %r8
+ je L(StrncpyExit16)
+ cmp $15, %r8
+ je L(StrncpyExit15)
+ cmp $8, %r8
+ jae L(StrncpyExit8_14)
+ cmp $4, %r8
+ jae L(StrncpyExit4_7)
+ cmp $3, %r8
+ je L(StrncpyExit3)
+ cmp $2, %r8
+ je L(StrncpyExit2)
+ cmp $0, %r8
+ ja L(StrncpyExit1)
+ je L(StrncpyExit0)
+ VZEROUPPER
+ ret
.p2align 4
L(CopyTwoVecSize1Case2OrCase3):
@@ -529,7 +684,29 @@ L(CopyTwoVecSize1Case2OrCase3):
L(CopyVecSizeTail1Case2OrCase3):
test %rdx, %rdx
jnz L(CopyVecSizeTail1Case2)
- BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+ cmp $65, %r8
+ je L(StrncpyExit65)
+ cmp $33, %r8
+ jae L(StrncpyExit33_64)
+ cmp $17, %r8
+ jae L(StrncpyExit17_32)
+ cmp $16, %r8
+ je L(StrncpyExit16)
+ cmp $15, %r8
+ je L(StrncpyExit15)
+ cmp $8, %r8
+ jae L(StrncpyExit8_14)
+ cmp $4, %r8
+ jae L(StrncpyExit4_7)
+ cmp $3, %r8
+ je L(StrncpyExit3)
+ cmp $2, %r8
+ je L(StrncpyExit2)
+ cmp $0, %r8
+ ja L(StrncpyExit1)
+ je L(StrncpyExit0)
+ VZEROUPPER
+ ret
# endif
@@ -1138,7 +1315,29 @@ L(UnalignedFourVecSizeLeaveCase2):
bsf %edx, %edx
cmp %r8d, %edx
jb L(CopyVecSizeExit)
- BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+ cmp $65, %r8
+ je L(StrncpyExit65)
+ cmp $33, %r8
+ jae L(StrncpyExit33_64)
+ cmp $17, %r8
+ jae L(StrncpyExit17_32)
+ cmp $16, %r8
+ je L(StrncpyExit16)
+ cmp $15, %r8
+ je L(StrncpyExit15)
+ cmp $8, %r8
+ jae L(StrncpyExit8_14)
+ cmp $4, %r8
+ jae L(StrncpyExit4_7)
+ cmp $3, %r8
+ je L(StrncpyExit3)
+ cmp $2, %r8
+ je L(StrncpyExit2)
+ cmp $0, %r8
+ ja L(StrncpyExit1)
+ je L(StrncpyExit0)
+ VZEROUPPER
+ ret
.p2align 4
L(ExitZero):
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=ec5844206bed7e0212a99e6778e70132a2dfaced
commit ec5844206bed7e0212a99e6778e70132a2dfaced
Author: Leonardo Sandoval <leonardo.sandoval.gonzalez@linux.intel.com>
Date: Wed Sep 26 09:04:04 2018 -0500
use direct branches on L(StrncpyFillLessTwoVecSize) and L(StrncpyFillExit)
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index d9148f6..2516bdd 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -1022,11 +1022,43 @@ L(StrncpyFillLessTwoVecSize):
jl L(StrncpyFillExit)
vmovdqa %ymmZ, (%rdi)
add $VEC_SIZE, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
+ cmp $32, %r8
+ je L(Fill32)
+ cmp $17, %r8
+ jae L(Fill17_31)
+ cmp $15, %r8
+ jae L(Fill15_16)
+ cmp $8, %r8
+ jae L(Fill8_14)
+ cmp $4, %r8
+ jae L(Fill4_7)
+ cmp $3, %r8
+ je L(Fill3)
+ cmp $1, %r8
+ ja L(Fill2)
+ je L(Fill1)
+ VZEROUPPER
+ ret
L(StrncpyFillExit):
add $VEC_SIZE, %r8
- BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
+ cmp $32, %r8
+ je L(Fill32)
+ cmp $17, %r8
+ jae L(Fill17_31)
+ cmp $15, %r8
+ jae L(Fill15_16)
+ cmp $8, %r8
+ jae L(Fill8_14)
+ cmp $4, %r8
+ jae L(Fill4_7)
+ cmp $3, %r8
+ je L(Fill3)
+ cmp $1, %r8
+ ja L(Fill2)
+ je L(Fill1)
+ VZEROUPPER
+ ret
/* end of ifndef USE_AS_STRCAT */
# endif
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=e664fdf02f0e0ebb14fba0fb5a2c14ff5ec4bed0
commit e664fdf02f0e0ebb14fba0fb5a2c14ff5ec4bed0
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Tue Sep 25 14:56:08 2018 -0700
Replace jz with je, remove 1 cmp and 1 jz
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 31e1d80..d9148f6 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -999,7 +999,7 @@ L(StrncpyFillLessFourVecSize):
vmovdqa %ymmZ, (%rdi)
add $VEC_SIZE, %rdi
cmp $32, %r8
- jz L(Fill32)
+ je L(Fill32)
cmp $17, %r8
jae L(Fill17_31)
cmp $15, %r8
@@ -1009,14 +1009,14 @@ L(StrncpyFillLessFourVecSize):
cmp $4, %r8
jae L(Fill4_7)
cmp $3, %r8
- jz L(Fill3)
- cmp $2, %r8
- jz L(Fill2)
+ je L(Fill3)
cmp $1, %r8
- jz L(Fill1)
- cmp $0, %r8
- jz L(Fill0)
+ ja L(Fill2)
+ je L(Fill1)
+ VZEROUPPER
+ ret
+ .p2align 4
L(StrncpyFillLessTwoVecSize):
add $VEC_SIZE, %r8
jl L(StrncpyFillExit)
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=a691ad38c0eaea9eb51f56bd713e24362b770588
commit a691ad38c0eaea9eb51f56bd713e24362b770588
Author: Leonardo Sandoval <leonardo.sandoval.gonzalez@linux.intel.com>
Date: Tue Sep 25 16:40:20 2018 -0500
use direct branches on L(StrncpyFillLessFourVecSize)
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 993abf8..31e1d80 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -998,7 +998,24 @@ L(StrncpyFillLessFourVecSize):
jl L(StrncpyFillExit)
vmovdqa %ymmZ, (%rdi)
add $VEC_SIZE, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
+ cmp $32, %r8
+ jz L(Fill32)
+ cmp $17, %r8
+ jae L(Fill17_31)
+ cmp $15, %r8
+ jae L(Fill15_16)
+ cmp $8, %r8
+ jae L(Fill8_14)
+ cmp $4, %r8
+ jae L(Fill4_7)
+ cmp $3, %r8
+ jz L(Fill3)
+ cmp $2, %r8
+ jz L(Fill2)
+ cmp $1, %r8
+ jz L(Fill1)
+ cmp $0, %r8
+ jz L(Fill0)
L(StrncpyFillLessTwoVecSize):
add $VEC_SIZE, %r8
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=7768744075492fb194a8f7631f882aa14bfa76b4
commit 7768744075492fb194a8f7631f882aa14bfa76b4
Author: Leonardo Sandoval <leonardo.sandoval.gonzalez@linux.intel.com>
Date: Tue Sep 25 15:38:05 2018 -0500
remove whitespaces
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index cc17312..993abf8 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -648,8 +648,8 @@ L(Exit9_15):
.p2align 4
L(Exit16):
- vmovdqu (%rsi), %xmm0
- vmovdqu %xmm0, (%rdi)
+ vmovdqu (%rsi), %xmm0
+ vmovdqu %xmm0, (%rdi)
# ifdef USE_AS_STPCPY
lea 15(%rdi), %rax
# endif
@@ -681,8 +681,8 @@ L(Exit17_31):
.p2align 4
L(Exit32):
- vmovdqu (%rsi), %ymm0
- vmovdqu %ymm0, (%rdi)
+ vmovdqu (%rsi), %ymm0
+ vmovdqu %ymm0, (%rdi)
# ifdef USE_AS_STPCPY
lea 31(%rdi), %rax
# endif
@@ -830,8 +830,8 @@ L(StrncpyExit15):
.p2align 4
L(StrncpyExit16):
- vmovdqu (%rsi), %xmm0
- vmovdqu %xmm0, (%rdi)
+ vmovdqu (%rsi), %xmm0
+ vmovdqu %xmm0, (%rdi)
# ifdef USE_AS_STPCPY
lea 16(%rdi), %rax
# endif
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=98b7ab9829061290ea0907cf16aa38ff64cccbe6
commit 98b7ab9829061290ea0907cf16aa38ff64cccbe6
Author: Leonardo Sandoval <leonardo.sandoval.gonzalez@linux.intel.com>
Date: Tue Sep 25 15:31:55 2018 -0500
consolidate StrncpyExit32 into the 17_31
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index aaa9200..cc17312 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -842,7 +842,7 @@ L(StrncpyExit16):
ret
.p2align 4
-L(StrncpyExit17_31):
+L(StrncpyExit17_32):
vmovdqu (%rsi), %xmm0
vmovdqu -16(%rsi, %r8), %xmm2
vmovdqu %xmm0, (%rdi)
@@ -857,139 +857,6 @@ L(StrncpyExit17_31):
ret
.p2align 4
-L(StrncpyExit24):
- vmovdqu (%rsi), %xmm0
- mov 16(%rsi), %rcx
- vmovdqu %xmm0, (%rdi)
- mov %rcx, 16(%rdi)
-# ifdef USE_AS_STPCPY
- lea 24(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 24(%rdi)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(StrncpyExit25):
- vmovdqu (%rsi), %xmm0
- vmovdqu 9(%rsi), %xmm1
- vmovdqu %xmm0, (%rdi)
- vmovdqu %xmm1, 9(%rdi)
-# ifdef USE_AS_STPCPY
- lea 25(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 25(%rdi)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(StrncpyExit26):
- vmovdqu (%rsi), %xmm0
- vmovdqu 10(%rsi), %xmm1
- vmovdqu %xmm0, (%rdi)
- vmovdqu %xmm1, 10(%rdi)
-# ifdef USE_AS_STPCPY
- lea 26(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 26(%rdi)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(StrncpyExit27):
- vmovdqu (%rsi), %xmm0
- vmovdqu 11(%rsi), %xmm1
- vmovdqu %xmm0, (%rdi)
- vmovdqu %xmm1, 11(%rdi)
-# ifdef USE_AS_STPCPY
- lea 27(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 27(%rdi)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(StrncpyExit28):
- vmovdqu (%rsi), %xmm0
- vmovdqu 12(%rsi), %xmm1
- vmovdqu %xmm0, (%rdi)
- vmovdqu %xmm1, 12(%rdi)
-# ifdef USE_AS_STPCPY
- lea 28(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 28(%rdi)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(StrncpyExit29):
- vmovdqu (%rsi), %xmm0
- vmovdqu 13(%rsi), %xmm2
- vmovdqu %xmm0, (%rdi)
- vmovdqu %xmm2, 13(%rdi)
-# ifdef USE_AS_STPCPY
- lea 29(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 29(%rdi)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(StrncpyExit30):
- vmovdqu (%rsi), %xmm0
- vmovdqu 14(%rsi), %xmm2
- vmovdqu %xmm0, (%rdi)
- vmovdqu %xmm2, 14(%rdi)
-# ifdef USE_AS_STPCPY
- lea 30(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 30(%rdi)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(StrncpyExit31):
- vmovdqu (%rsi), %xmm0
- vmovdqu 15(%rsi), %xmm2
- vmovdqu %xmm0, (%rdi)
- vmovdqu %xmm2, 15(%rdi)
-# ifdef USE_AS_STPCPY
- lea 31(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 31(%rdi)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(StrncpyExit32):
- vmovdqu (%rsi), %ymm0
- vmovdqu %ymm0, (%rdi)
-# ifdef USE_AS_STPCPY
- lea 32(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 32(%rdi)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
L(StrncpyExit33_64):
/* 0/32, 31/16 */
vmovdqu (%rsi), %ymm0
@@ -1325,22 +1192,22 @@ L(ExitStrncpyTable):
.int JMPTBL(L(StrncpyExit8_14), L(ExitStrncpyTable))
.int JMPTBL(L(StrncpyExit15), L(ExitStrncpyTable))
.int JMPTBL(L(StrncpyExit16), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit32), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
.int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
.int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
.int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=16bd51bc6e15a1825d42221052024bd4cf55be32
commit 16bd51bc6e15a1825d42221052024bd4cf55be32
Author: Leonardo Sandoval <leonardo.sandoval.gonzalez@linux.intel.com>
Date: Tue Sep 25 15:27:57 2018 -0500
consolidate StrncpyExit64 into the 33_63
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 48f7273..aaa9200 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -990,7 +990,7 @@ L(StrncpyExit32):
ret
.p2align 4
-L(StrncpyExit33_63):
+L(StrncpyExit33_64):
/* 0/32, 31/16 */
vmovdqu (%rsi), %ymm0
vmovdqu -VEC_SIZE(%rsi, %r8), %ymm2
@@ -1006,22 +1006,6 @@ L(StrncpyExit33_63):
ret
.p2align 4
-L(StrncpyExit64):
- /* 0/32, 32/32 */
- vmovdqu (%rsi), %ymm0
- vmovdqu 32(%rsi), %ymm2
- vmovdqu %ymm0, (%rdi)
- vmovdqu %ymm2, 32(%rdi)
-# ifdef USE_AS_STPCPY
- lea 64(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 64(%rdi)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
L(StrncpyExit65):
/* 0/32, 32/32, 64/1 */
vmovdqu (%rsi), %ymm0
@@ -1357,38 +1341,38 @@ L(ExitStrncpyTable):
.int JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
.int JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
.int JMPTBL(L(StrncpyExit32), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit64), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
.int JMPTBL(L(StrncpyExit65), L(ExitStrncpyTable))
# ifndef USE_AS_STRCAT
.p2align 4
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=0d68b69aa12cd27e7f21f3bc3d207d105dbde1b2
commit 0d68b69aa12cd27e7f21f3bc3d207d105dbde1b2
Author: Leonardo Sandoval <leonardo.sandoval.gonzalez@linux.intel.com>
Date: Tue Sep 25 15:15:24 2018 -0500
consolidate Exit5-Exit7 labels
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 5e12e4f..48f7273 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -596,50 +596,18 @@ L(Exit4):
ret
.p2align 4
-L(Exit5):
+L(Exit5_7):
mov (%rsi), %ecx
- mov %dh, 4(%rdi)
mov %ecx, (%rdi)
+ mov -3(%rsi, %rdx), %ecx
+ mov %ecx, -3(%rdi, %rdx)
# ifdef USE_AS_STPCPY
- lea 4(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $5, %r8
- lea 5(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(Exit6):
- mov (%rsi), %ecx
- mov 4(%rsi), %dx
- mov %ecx, (%rdi)
- mov %dx, 4(%rdi)
-# ifdef USE_AS_STPCPY
- lea 5(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $6, %r8
- lea 6(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(Exit7):
- mov (%rsi), %ecx
- mov 3(%rsi), %edx
- mov %ecx, (%rdi)
- mov %edx, 3(%rdi)
-# ifdef USE_AS_STPCPY
- lea 6(%rdi), %rax
+ lea (%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $7, %r8
- lea 7(%rdi), %rdi
+ sub %rdx, %r8
+ sub $1, %r8
+ lea 1(%rdi, %rdx), %rdi
jnz L(StrncpyFillTailWithZero)
# endif
VZEROUPPER
@@ -1294,9 +1262,9 @@ L(ExitTable):
.int JMPTBL(L(Exit2), L(ExitTable))
.int JMPTBL(L(Exit3), L(ExitTable))
.int JMPTBL(L(Exit4), L(ExitTable))
- .int JMPTBL(L(Exit5), L(ExitTable))
- .int JMPTBL(L(Exit6), L(ExitTable))
- .int JMPTBL(L(Exit7), L(ExitTable))
+ .int JMPTBL(L(Exit5_7), L(ExitTable))
+ .int JMPTBL(L(Exit5_7), L(ExitTable))
+ .int JMPTBL(L(Exit5_7), L(ExitTable))
.int JMPTBL(L(Exit8), L(ExitTable))
.int JMPTBL(L(Exit9_15), L(ExitTable))
.int JMPTBL(L(Exit9_15), L(ExitTable))
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=3cc2a758ed5a5b0084ec73ac452a84cb42b9a4d6
commit 3cc2a758ed5a5b0084ec73ac452a84cb42b9a4d6
Author: Leonardo Sandoval <leonardo.sandoval.gonzalez@linux.intel.com>
Date: Tue Sep 25 13:55:40 2018 -0500
consolidate Exit9_15
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 709caef..5e12e4f 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -661,118 +661,18 @@ L(Exit8):
ret
.p2align 4
-L(Exit9):
+L(Exit9_15):
mov (%rsi), %rcx
- mov %dh, 8(%rdi)
+ mov -7(%rsi, %rdx), %r9
mov %rcx, (%rdi)
+ mov %r9, -7(%rdi, %rdx)
# ifdef USE_AS_STPCPY
- lea 8(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $9, %r8
- lea 9(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(Exit10):
- mov (%rsi), %rcx
- mov 8(%rsi), %dx
- mov %rcx, (%rdi)
- mov %dx, 8(%rdi)
-# ifdef USE_AS_STPCPY
- lea 9(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $10, %r8
- lea 10(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(Exit11):
- mov (%rsi), %rcx
- mov 7(%rsi), %edx
- mov %rcx, (%rdi)
- mov %edx, 7(%rdi)
-# ifdef USE_AS_STPCPY
- lea 10(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $11, %r8
- lea 11(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(Exit12):
- mov (%rsi), %rcx
- mov 8(%rsi), %edx
- mov %rcx, (%rdi)
- mov %edx, 8(%rdi)
-# ifdef USE_AS_STPCPY
- lea 11(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $12, %r8
- lea 12(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(Exit13):
- mov (%rsi), %rcx
- mov 5(%rsi), %rdx
- mov %rcx, (%rdi)
- mov %rdx, 5(%rdi)
-# ifdef USE_AS_STPCPY
- lea 12(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $13, %r8
- lea 13(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(Exit14):
- mov (%rsi), %rcx
- mov 6(%rsi), %rdx
- mov %rcx, (%rdi)
- mov %rdx, 6(%rdi)
-# ifdef USE_AS_STPCPY
- lea 13(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $14, %r8
- lea 14(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(Exit15):
- mov (%rsi), %rcx
- mov 7(%rsi), %rdx
- mov %rcx, (%rdi)
- mov %rdx, 7(%rdi)
-# ifdef USE_AS_STPCPY
- lea 14(%rdi), %rax
+ lea (%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $15, %r8
- lea 15(%rdi), %rdi
+ sub %rdx, %r8
+ sub $1, %r8
+ lea 1(%rdi, %rdx), %rdi
jnz L(StrncpyFillTailWithZero)
# endif
VZEROUPPER
@@ -1398,13 +1298,13 @@ L(ExitTable):
.int JMPTBL(L(Exit6), L(ExitTable))
.int JMPTBL(L(Exit7), L(ExitTable))
.int JMPTBL(L(Exit8), L(ExitTable))
- .int JMPTBL(L(Exit9), L(ExitTable))
- .int JMPTBL(L(Exit10), L(ExitTable))
- .int JMPTBL(L(Exit11), L(ExitTable))
- .int JMPTBL(L(Exit12), L(ExitTable))
- .int JMPTBL(L(Exit13), L(ExitTable))
- .int JMPTBL(L(Exit14), L(ExitTable))
- .int JMPTBL(L(Exit15), L(ExitTable))
+ .int JMPTBL(L(Exit9_15), L(ExitTable))
+ .int JMPTBL(L(Exit9_15), L(ExitTable))
+ .int JMPTBL(L(Exit9_15), L(ExitTable))
+ .int JMPTBL(L(Exit9_15), L(ExitTable))
+ .int JMPTBL(L(Exit9_15), L(ExitTable))
+ .int JMPTBL(L(Exit9_15), L(ExitTable))
+ .int JMPTBL(L(Exit9_15), L(ExitTable))
.int JMPTBL(L(Exit16), L(ExitTable))
.int JMPTBL(L(Exit17_31), L(ExitTable))
.int JMPTBL(L(Exit17_31), L(ExitTable))
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=a7fc68d9e66b1df6d164feca3250d182c2192d59
commit a7fc68d9e66b1df6d164feca3250d182c2192d59
Author: Leonardo Sandoval <leonardo.sandoval.gonzalez@linux.intel.com>
Date: Tue Sep 25 09:50:33 2018 -0500
remove old branches already consolidated
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 7fc02db..709caef 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -974,96 +974,6 @@ L(StrncpyExit16):
ret
.p2align 4
-L(StrncpyExit17):
- vmovdqu (%rsi), %xmm0
- mov 16(%rsi), %cl
- vmovdqu %xmm0, (%rdi)
- mov %cl, 16(%rdi)
-# ifdef USE_AS_STPCPY
- lea 17(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 17(%rdi)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(StrncpyExit18):
- vmovdqu (%rsi), %xmm0
- mov 16(%rsi), %cx
- vmovdqu %xmm0, (%rdi)
- mov %cx, 16(%rdi)
-# ifdef USE_AS_STPCPY
- lea 18(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 18(%rdi)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(StrncpyExit19):
- vmovdqu (%rsi), %xmm0
- mov 15(%rsi), %ecx
- vmovdqu %xmm0, (%rdi)
- mov %ecx, 15(%rdi)
-# ifdef USE_AS_STPCPY
- lea 19(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 19(%rdi)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(StrncpyExit20):
- vmovdqu (%rsi), %xmm0
- mov 16(%rsi), %ecx
- vmovdqu %xmm0, (%rdi)
- mov %ecx, 16(%rdi)
-# ifdef USE_AS_STPCPY
- lea 20(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 20(%rdi)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(StrncpyExit21):
- vmovdqu (%rsi), %xmm0
- mov 13(%rsi), %rcx
- vmovdqu %xmm0, (%rdi)
- mov %rcx, 13(%rdi)
-# ifdef USE_AS_STPCPY
- lea 21(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 21(%rdi)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(StrncpyExit22):
- vmovdqu (%rsi), %xmm0
- mov 14(%rsi), %rcx
- vmovdqu %xmm0, (%rdi)
- mov %rcx, 14(%rdi)
-# ifdef USE_AS_STPCPY
- lea 22(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 22(%rdi)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
L(StrncpyExit17_31):
vmovdqu (%rsi), %xmm0
vmovdqu -16(%rsi, %r8), %xmm2
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=d4f1935722e7bb4ebb4a0416bad63dd8c0604baa
commit d4f1935722e7bb4ebb4a0416bad63dd8c0604baa
Author: Leonardo Sandoval <leonardo.sandoval.gonzalez@linux.intel.com>
Date: Tue Sep 25 09:29:28 2018 -0500
consolidate StrncpyExit8 to 14
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 78990be..7fc02db 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -931,104 +931,16 @@ L(StrncpyExit4_7):
ret
.p2align 4
-L(StrncpyExit8):
- mov (%rsi), %rdx
- mov %rdx, (%rdi)
-# ifdef USE_AS_STPCPY
- lea 8(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 8(%rdi)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(StrncpyExit9):
- mov (%rsi), %rcx
- mov 8(%rsi), %dl
- mov %rcx, (%rdi)
- mov %dl, 8(%rdi)
-# ifdef USE_AS_STPCPY
- lea 9(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 9(%rdi)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(StrncpyExit10):
- mov (%rsi), %rcx
- mov 8(%rsi), %dx
- mov %rcx, (%rdi)
- mov %dx, 8(%rdi)
-# ifdef USE_AS_STPCPY
- lea 10(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 10(%rdi)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(StrncpyExit11):
- mov (%rsi), %rcx
- mov 7(%rsi), %edx
- mov %rcx, (%rdi)
- mov %edx, 7(%rdi)
-# ifdef USE_AS_STPCPY
- lea 11(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 11(%rdi)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(StrncpyExit12):
- mov (%rsi), %rcx
- mov 8(%rsi), %edx
- mov %rcx, (%rdi)
- mov %edx, 8(%rdi)
-# ifdef USE_AS_STPCPY
- lea 12(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 12(%rdi)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(StrncpyExit13):
- mov (%rsi), %rcx
- mov 5(%rsi), %rdx
- mov %rcx, (%rdi)
- mov %rdx, 5(%rdi)
-# ifdef USE_AS_STPCPY
- lea 13(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 13(%rdi)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(StrncpyExit14):
+L(StrncpyExit8_14):
mov (%rsi), %rcx
- mov 6(%rsi), %rdx
+ mov -8(%rsi, %r8), %rdx
mov %rcx, (%rdi)
- mov %rdx, 6(%rdi)
+ mov %rdx, -8(%rdi, %r8)
# ifdef USE_AS_STPCPY
- lea 14(%rdi), %rax
+ lea (%rdi, %r8), %rax
# endif
# ifdef USE_AS_STRCAT
- movb $0, 14(%rdi)
+ movb $0, (%rdi, %r8)
# endif
VZEROUPPER
ret
@@ -1642,13 +1554,13 @@ L(ExitStrncpyTable):
.int JMPTBL(L(StrncpyExit4_7), L(ExitStrncpyTable))
.int JMPTBL(L(StrncpyExit4_7), L(ExitStrncpyTable))
.int JMPTBL(L(StrncpyExit4_7), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit8), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit9), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit10), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit11), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit12), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit13), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit14), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit8_14), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit8_14), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit8_14), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit8_14), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit8_14), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit8_14), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit8_14), L(ExitStrncpyTable))
.int JMPTBL(L(StrncpyExit15), L(ExitStrncpyTable))
.int JMPTBL(L(StrncpyExit16), L(ExitStrncpyTable))
.int JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=ee1d3cd9a99b79560ed9e44870b1873c0fa2fc8c
commit ee1d3cd9a99b79560ed9e44870b1873c0fa2fc8c
Author: Leonardo Sandoval <leonardo.sandoval.gonzalez@linux.intel.com>
Date: Tue Sep 25 09:20:47 2018 -0500
consolidate StrncpyExit4 to 7
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 9719509..78990be 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -916,59 +916,16 @@ L(StrncpyExit3):
ret
.p2align 4
-L(StrncpyExit4):
- mov (%rsi), %edx
- mov %edx, (%rdi)
-# ifdef USE_AS_STPCPY
- lea 4(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 4(%rdi)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(StrncpyExit5):
- mov (%rsi), %ecx
- mov 4(%rsi), %dl
- mov %ecx, (%rdi)
- mov %dl, 4(%rdi)
-# ifdef USE_AS_STPCPY
- lea 5(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 5(%rdi)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(StrncpyExit6):
- mov (%rsi), %ecx
- mov 4(%rsi), %dx
- mov %ecx, (%rdi)
- mov %dx, 4(%rdi)
-# ifdef USE_AS_STPCPY
- lea 6(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 6(%rdi)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(StrncpyExit7):
+L(StrncpyExit4_7):
mov (%rsi), %ecx
- mov 3(%rsi), %edx
+ mov -4(%rsi, %r8), %edx
mov %ecx, (%rdi)
- mov %edx, 3(%rdi)
+ mov %edx, -4(%rdi, %r8)
# ifdef USE_AS_STPCPY
- lea 7(%rdi), %rax
+ lea (%rdi, %r8), %rax
# endif
# ifdef USE_AS_STRCAT
- movb $0, 7(%rdi)
+ movb $0, (%rdi, %r8)
# endif
VZEROUPPER
ret
@@ -1681,10 +1638,10 @@ L(ExitStrncpyTable):
.int JMPTBL(L(StrncpyExit1), L(ExitStrncpyTable))
.int JMPTBL(L(StrncpyExit2), L(ExitStrncpyTable))
.int JMPTBL(L(StrncpyExit3), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit4), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit5), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit6), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit7), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit4_7), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit4_7), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit4_7), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit4_7), L(ExitStrncpyTable))
.int JMPTBL(L(StrncpyExit8), L(ExitStrncpyTable))
.int JMPTBL(L(StrncpyExit9), L(ExitStrncpyTable))
.int JMPTBL(L(StrncpyExit10), L(ExitStrncpyTable))
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=9c436d942c28ef4f0602f11534a9d49e657e8f70
commit 9c436d942c28ef4f0602f11534a9d49e657e8f70
Author: Leonardo Sandoval <leonardo.sandoval.gonzalez@linux.intel.com>
Date: Tue Sep 25 08:43:21 2018 -0500
consolidate Fill8 with Fill9_14
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 8a84fb2..9719509 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -1425,13 +1425,7 @@ L(Fill4_7):
ret
.p2align 4
-L(Fill8):
- mov %rdx, (%rdi)
- VZEROUPPER
- ret
-
- .p2align 4
-L(Fill9_14):
+L(Fill8_14):
mov %rdx, (%rdi)
mov %rdx, -8(%rdi, %r8)
VZEROUPPER
@@ -1760,13 +1754,13 @@ L(FillTable):
.int JMPTBL(L(Fill4_7), L(FillTable))
.int JMPTBL(L(Fill4_7), L(FillTable))
.int JMPTBL(L(Fill4_7), L(FillTable))
- .int JMPTBL(L(Fill8), L(FillTable))
- .int JMPTBL(L(Fill9_14), L(FillTable))
- .int JMPTBL(L(Fill9_14), L(FillTable))
- .int JMPTBL(L(Fill9_14), L(FillTable))
- .int JMPTBL(L(Fill9_14), L(FillTable))
- .int JMPTBL(L(Fill9_14), L(FillTable))
- .int JMPTBL(L(Fill9_14), L(FillTable))
+ .int JMPTBL(L(Fill8_14), L(FillTable))
+ .int JMPTBL(L(Fill8_14), L(FillTable))
+ .int JMPTBL(L(Fill8_14), L(FillTable))
+ .int JMPTBL(L(Fill8_14), L(FillTable))
+ .int JMPTBL(L(Fill8_14), L(FillTable))
+ .int JMPTBL(L(Fill8_14), L(FillTable))
+ .int JMPTBL(L(Fill8_14), L(FillTable))
.int JMPTBL(L(Fill15_16), L(FillTable))
.int JMPTBL(L(Fill15_16), L(FillTable))
.int JMPTBL(L(Fill17_31), L(FillTable))
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=e511145b5dd1dc4df1d1e03ea10006bb22c32cc0
commit e511145b5dd1dc4df1d1e03ea10006bb22c32cc0
Author: Leonardo Sandoval <leonardo.sandoval.gonzalez@linux.intel.com>
Date: Tue Sep 25 08:36:51 2018 -0500
consolidate Fill4_7
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 606a968..8a84fb2 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -1418,28 +1418,9 @@ L(Fill3):
ret
.p2align 4
-L(Fill4):
+L(Fill4_7):
mov %edx, (%rdi)
- VZEROUPPER
- ret
-
- .p2align 4
-L(Fill5):
- mov %edx, (%rdi)
- mov %dl, 4(%rdi)
- VZEROUPPER
- ret
-
- .p2align 4
-L(Fill6):
- mov %edx, (%rdi)
- mov %dx, 4(%rdi)
- VZEROUPPER
- ret
-
- .p2align 4
-L(Fill7):
- mov %rdx, -1(%rdi)
+ mov %edx, -4(%rdi, %r8)
VZEROUPPER
ret
@@ -1775,10 +1756,10 @@ L(FillTable):
.int JMPTBL(L(Fill1), L(FillTable))
.int JMPTBL(L(Fill2), L(FillTable))
.int JMPTBL(L(Fill3), L(FillTable))
- .int JMPTBL(L(Fill4), L(FillTable))
- .int JMPTBL(L(Fill5), L(FillTable))
- .int JMPTBL(L(Fill6), L(FillTable))
- .int JMPTBL(L(Fill7), L(FillTable))
+ .int JMPTBL(L(Fill4_7), L(FillTable))
+ .int JMPTBL(L(Fill4_7), L(FillTable))
+ .int JMPTBL(L(Fill4_7), L(FillTable))
+ .int JMPTBL(L(Fill4_7), L(FillTable))
.int JMPTBL(L(Fill8), L(FillTable))
.int JMPTBL(L(Fill9_14), L(FillTable))
.int JMPTBL(L(Fill9_14), L(FillTable))
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=e7ce816c982f673360d2ac04658632fdfca4df19
commit e7ce816c982f673360d2ac04658632fdfca4df19
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Mon Sep 24 15:19:50 2018 -0700
Consolidate more entries in FillTable
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 0e67a7b..606a968 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -1450,56 +1450,15 @@ L(Fill8):
ret
.p2align 4
-L(Fill9):
+L(Fill9_14):
mov %rdx, (%rdi)
- mov %dl, 8(%rdi)
- VZEROUPPER
- ret
-
- .p2align 4
-L(Fill10):
- mov %rdx, (%rdi)
- mov %dx, 8(%rdi)
- VZEROUPPER
- ret
-
- .p2align 4
-L(Fill11):
- mov %rdx, (%rdi)
- mov %edx, 7(%rdi)
- VZEROUPPER
- ret
-
- .p2align 4
-L(Fill12):
- mov %rdx, (%rdi)
- mov %edx, 8(%rdi)
- VZEROUPPER
- ret
-
- .p2align 4
-L(Fill13):
- mov %rdx, (%rdi)
- mov %rdx, 5(%rdi)
- VZEROUPPER
- ret
-
- .p2align 4
-L(Fill14):
- mov %rdx, (%rdi)
- mov %rdx, 6(%rdi)
+ mov %rdx, -8(%rdi, %r8)
VZEROUPPER
ret
.p2align 4
-L(Fill15):
- vmovdqu %xmmZ, -1(%rdi)
- VZEROUPPER
- ret
-
- .p2align 4
-L(Fill16):
- vmovdqu %xmmZ, (%rdi)
+L(Fill15_16):
+ vmovdqu %xmmZ, -16(%rdi,%r8)
VZEROUPPER
ret
@@ -1821,14 +1780,14 @@ L(FillTable):
.int JMPTBL(L(Fill6), L(FillTable))
.int JMPTBL(L(Fill7), L(FillTable))
.int JMPTBL(L(Fill8), L(FillTable))
- .int JMPTBL(L(Fill9), L(FillTable))
- .int JMPTBL(L(Fill10), L(FillTable))
- .int JMPTBL(L(Fill11), L(FillTable))
- .int JMPTBL(L(Fill12), L(FillTable))
- .int JMPTBL(L(Fill13), L(FillTable))
- .int JMPTBL(L(Fill14), L(FillTable))
- .int JMPTBL(L(Fill15), L(FillTable))
- .int JMPTBL(L(Fill16), L(FillTable))
+ .int JMPTBL(L(Fill9_14), L(FillTable))
+ .int JMPTBL(L(Fill9_14), L(FillTable))
+ .int JMPTBL(L(Fill9_14), L(FillTable))
+ .int JMPTBL(L(Fill9_14), L(FillTable))
+ .int JMPTBL(L(Fill9_14), L(FillTable))
+ .int JMPTBL(L(Fill9_14), L(FillTable))
+ .int JMPTBL(L(Fill15_16), L(FillTable))
+ .int JMPTBL(L(Fill15_16), L(FillTable))
.int JMPTBL(L(Fill17_31), L(FillTable))
.int JMPTBL(L(Fill17_31), L(FillTable))
.int JMPTBL(L(Fill17_31), L(FillTable))
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=81dbdce369b3376bab235925effbddb2c476d864
commit 81dbdce369b3376bab235925effbddb2c476d864
Author: Leonardo Sandoval <leonardo.sandoval.gonzalez@linux.intel.com>
Date: Mon Sep 24 16:40:19 2018 -0500
compact Fill17 to Fill31 labels into a single one
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 4ab4c66..0e67a7b 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -1504,114 +1504,9 @@ L(Fill16):
ret
.p2align 4
-L(Fill17):
+L(Fill17_31):
vmovdqu %xmmZ, (%rdi)
- mov %dl, 16(%rdi)
- VZEROUPPER
- ret
-
- .p2align 4
-L(Fill18):
- vmovdqu %xmmZ, (%rdi)
- mov %dx, 16(%rdi)
- VZEROUPPER
- ret
-
- .p2align 4
-L(Fill19):
- vmovdqu %xmmZ, (%rdi)
- mov %edx, 15(%rdi)
- VZEROUPPER
- ret
-
- .p2align 4
-L(Fill20):
- vmovdqu %xmmZ, (%rdi)
- mov %edx, 16(%rdi)
- VZEROUPPER
- ret
-
- .p2align 4
-L(Fill21):
- vmovdqu %xmmZ, (%rdi)
- mov %edx, 16(%rdi)
- mov %dl, 20(%rdi)
- VZEROUPPER
- ret
-
- .p2align 4
-L(Fill22):
- vmovdqu %xmmZ, (%rdi)
- mov %edx, 16(%rdi)
- mov %dx, 20(%rdi)
- VZEROUPPER
- ret
-
- .p2align 4
-L(Fill23):
- vmovdqu %xmmZ, (%rdi)
- mov %rdx, 15(%rdi)
- VZEROUPPER
- ret
-
- .p2align 4
-L(Fill24):
- vmovdqu %xmmZ, (%rdi)
- mov %rdx, 16(%rdi)
- VZEROUPPER
- ret
-
- .p2align 4
-L(Fill25):
- vmovdqu %xmmZ, (%rdi)
- mov %rdx, 16(%rdi)
- mov %dl, 24(%rdi)
- VZEROUPPER
- ret
-
- .p2align 4
-L(Fill26):
- vmovdqu %xmmZ, (%rdi)
- mov %rdx, 16(%rdi)
- mov %dx, 24(%rdi)
- VZEROUPPER
- ret
-
- .p2align 4
-L(Fill27):
- vmovdqu %xmmZ, (%rdi)
- mov %rdx, 16(%rdi)
- mov %edx, 23(%rdi)
- VZEROUPPER
- ret
-
- .p2align 4
-L(Fill28):
- vmovdqu %xmmZ, (%rdi)
- mov %rdx, 16(%rdi)
- mov %edx, 24(%rdi)
- VZEROUPPER
- ret
-
- .p2align 4
-L(Fill29):
- vmovdqu %xmmZ, (%rdi)
- mov %rdx, 16(%rdi)
- mov %rdx, 21(%rdi)
- VZEROUPPER
- ret
-
- .p2align 4
-L(Fill30):
- vmovdqu %xmmZ, (%rdi)
- mov %rdx, 16(%rdi)
- mov %rdx, 22(%rdi)
- VZEROUPPER
- ret
-
- .p2align 4
-L(Fill31):
- vmovdqu %ymmZ, -1(%rdi)
+ vmovdqu %xmmZ, -16(%rdi, %r8)
VZEROUPPER
ret
@@ -1934,21 +1829,21 @@ L(FillTable):
.int JMPTBL(L(Fill14), L(FillTable))
.int JMPTBL(L(Fill15), L(FillTable))
.int JMPTBL(L(Fill16), L(FillTable))
- .int JMPTBL(L(Fill17), L(FillTable))
- .int JMPTBL(L(Fill18), L(FillTable))
- .int JMPTBL(L(Fill19), L(FillTable))
- .int JMPTBL(L(Fill20), L(FillTable))
- .int JMPTBL(L(Fill21), L(FillTable))
- .int JMPTBL(L(Fill22), L(FillTable))
- .int JMPTBL(L(Fill23), L(FillTable))
- .int JMPTBL(L(Fill24), L(FillTable))
- .int JMPTBL(L(Fill25), L(FillTable))
- .int JMPTBL(L(Fill26), L(FillTable))
- .int JMPTBL(L(Fill27), L(FillTable))
- .int JMPTBL(L(Fill28), L(FillTable))
- .int JMPTBL(L(Fill29), L(FillTable))
- .int JMPTBL(L(Fill30), L(FillTable))
- .int JMPTBL(L(Fill31), L(FillTable))
+ .int JMPTBL(L(Fill17_31), L(FillTable))
+ .int JMPTBL(L(Fill17_31), L(FillTable))
+ .int JMPTBL(L(Fill17_31), L(FillTable))
+ .int JMPTBL(L(Fill17_31), L(FillTable))
+ .int JMPTBL(L(Fill17_31), L(FillTable))
+ .int JMPTBL(L(Fill17_31), L(FillTable))
+ .int JMPTBL(L(Fill17_31), L(FillTable))
+ .int JMPTBL(L(Fill17_31), L(FillTable))
+ .int JMPTBL(L(Fill17_31), L(FillTable))
+ .int JMPTBL(L(Fill17_31), L(FillTable))
+ .int JMPTBL(L(Fill17_31), L(FillTable))
+ .int JMPTBL(L(Fill17_31), L(FillTable))
+ .int JMPTBL(L(Fill17_31), L(FillTable))
+ .int JMPTBL(L(Fill17_31), L(FillTable))
+ .int JMPTBL(L(Fill17_31), L(FillTable))
.int JMPTBL(L(Fill32), L(FillTable))
# endif
# endif
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=2a584bf6c3c2ff2275641cc9c28d1f3410b6ce11
commit 2a584bf6c3c2ff2275641cc9c28d1f3410b6ce11
Author: Leonardo Sandoval <leonardo.sandoval.gonzalez@linux.intel.com>
Date: Mon Sep 24 16:12:35 2018 -0500
remove useless StrncpyExit33-63 labels
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 5a521ed..4ab4c66 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -1343,229 +1343,6 @@ L(StrncpyExit32):
ret
.p2align 4
-L(StrncpyExit33):
- vmovdqu (%rsi), %ymm0
- mov 32(%rsi), %cl
- vmovdqu %ymm0, (%rdi)
- mov %cl, 32(%rdi)
-# ifdef USE_AS_STPCPY
- lea 33(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 33(%rdi)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(StrncpyExit34):
- /* 0/32, 32/2 */
- vmovdqu (%rsi), %ymm0
- mov 32(%rsi), %cx
- vmovdqu %ymm0, (%rdi)
- mov %cx, 32(%rdi)
-# ifdef USE_AS_STPCPY
- lea 34(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 34(%rdi)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(StrncpyExit35):
- /* 0/32, 31/4 */
- vmovdqu (%rsi), %ymm0
- mov 31(%rsi), %ecx
- vmovdqu %ymm0, (%rdi)
- mov %ecx, 31(%rdi)
-# ifdef USE_AS_STPCPY
- lea 35(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 35(%rdi)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(StrncpyExit36):
- /* 0/32, 32/4 */
- vmovdqu (%rsi), %ymm0
- mov 32(%rsi), %ecx
- vmovdqu %ymm0, (%rdi)
- mov %ecx, 32(%rdi)
-# ifdef USE_AS_STPCPY
- lea 36(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 36(%rdi)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(StrncpyExit37):
- /* 0/32, 29/8 */
- vmovdqu (%rsi), %ymm0
- mov 29(%rsi), %rcx
- vmovdqu %ymm0, (%rdi)
- mov %rcx, 29(%rdi)
-# ifdef USE_AS_STPCPY
- lea 37(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 37(%rdi)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(StrncpyExit38):
- /* 0/32, 30/8 */
- vmovdqu (%rsi), %ymm0
- mov 30(%rsi), %rcx
- vmovdqu %ymm0, (%rdi)
- mov %rcx, 30(%rdi)
-# ifdef USE_AS_STPCPY
- lea 38(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 38(%rdi)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(StrncpyExit39):
- /* 0/32, 31/8 */
- vmovdqu (%rsi), %ymm0
- mov 31(%rsi), %rcx
- vmovdqu %ymm0, (%rdi)
- mov %rcx, 31(%rdi)
-# ifdef USE_AS_STPCPY
- lea 39(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 39(%rdi)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(StrncpyExit40):
- /* 0/32, 32/8 */
- vmovdqu (%rsi), %ymm0
- mov 32(%rsi), %rcx
- vmovdqu %ymm0, (%rdi)
- mov %rcx, 32(%rdi)
-# ifdef USE_AS_STPCPY
- lea 40(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 40(%rdi)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(StrncpyExit41):
- /* 0/32, 32/8, 40/1 */
- vmovdqu (%rsi), %ymm0
- vmovdqu 25(%rsi), %xmm1
- vmovdqu %ymm0, (%rdi)
- vmovdqu %xmm1, 25(%rdi)
-# ifdef USE_AS_STPCPY
- lea 41(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 41(%rdi)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(StrncpyExit42):
- /* 0/32, 32/8, 40/2 */
- vmovdqu (%rsi), %ymm0
- vmovdqu 26(%rsi), %xmm1
- vmovdqu %ymm0, (%rdi)
- vmovdqu %xmm1, 26(%rdi)
-# ifdef USE_AS_STPCPY
- lea 42(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 42(%rdi)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(StrncpyExit43):
- /* 0/32, 27/16 */
- vmovdqu (%rsi), %ymm0
- vmovdqu 27(%rsi), %xmm2
- vmovdqu %ymm0, (%rdi)
- vmovdqu %xmm2, 27(%rdi)
-# ifdef USE_AS_STPCPY
- lea 43(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 43(%rdi)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(StrncpyExit44):
- /* 0/32, 28/16 */
- vmovdqu (%rsi), %ymm0
- vmovdqu 28(%rsi), %xmm2
- vmovdqu %ymm0, (%rdi)
- vmovdqu %xmm2, 28(%rdi)
-# ifdef USE_AS_STPCPY
- lea 44(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 44(%rdi)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(StrncpyExit45):
- /* 0/32, 29/16 */
- vmovdqu (%rsi), %ymm0
- vmovdqu 29(%rsi), %xmm2
- vmovdqu %ymm0, (%rdi)
- vmovdqu %xmm2, 29(%rdi)
-# ifdef USE_AS_STPCPY
- lea 45(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 45(%rdi)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(StrncpyExit46):
- /* 0/32, 30/16 */
- vmovdqu (%rsi), %ymm0
- vmovdqu 30(%rsi), %xmm2
- vmovdqu %ymm0, (%rdi)
- vmovdqu %xmm2, 30(%rdi)
-# ifdef USE_AS_STPCPY
- lea 46(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 46(%rdi)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
L(StrncpyExit33_63):
/* 0/32, 31/16 */
vmovdqu (%rsi), %ymm0
@@ -1582,263 +1359,6 @@ L(StrncpyExit33_63):
ret
.p2align 4
-L(StrncpyExit48):
- /* 0/32, 32/16 */
- vmovdqu (%rsi), %ymm0
- vmovdqu 32(%rsi), %xmm2
- vmovdqu %ymm0, (%rdi)
- vmovdqu %xmm2, 32(%rdi)
-# ifdef USE_AS_STPCPY
- lea 48(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 48(%rdi)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(StrncpyExit49):
- /* 0/32, 32/16, 48/1 */
- vmovdqu (%rsi), %ymm0
- vmovdqu 17(%rsi), %ymm2
- vmovdqu %ymm0, (%rdi)
- vmovdqu %ymm2, 17(%rdi)
-# ifdef USE_AS_STPCPY
- lea 49(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 49(%rdi)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(StrncpyExit50):
- /* 0/32, 32/16, 48/2 */
- vmovdqu (%rsi), %ymm0
- vmovdqu 18(%rsi), %ymm2
- vmovdqu %ymm0, (%rdi)
- vmovdqu %ymm2, 18(%rdi)
-# ifdef USE_AS_STPCPY
- lea 50(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 50(%rdi)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(StrncpyExit51):
- /* 0/32, 32/16, 47/4 */
- vmovdqu (%rsi), %ymm0
- vmovdqu 19(%rsi), %ymm2
- vmovdqu %ymm0, (%rdi)
- vmovdqu %ymm2, 19(%rdi)
-# ifdef USE_AS_STPCPY
- lea 51(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 51(%rdi)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(StrncpyExit52):
- /* 0/32, 32/16, 48/4 */
- vmovdqu (%rsi), %ymm0
- vmovdqu 20(%rsi), %ymm2
- vmovdqu %ymm0, (%rdi)
- vmovdqu %ymm2, 20(%rdi)
-# ifdef USE_AS_STPCPY
- lea 52(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 52(%rdi)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(StrncpyExit53):
- /* 0/32, 32/16, 45/8 */
- vmovdqu (%rsi), %ymm0
- vmovdqu 21(%rsi), %ymm2
- vmovdqu %ymm0, (%rdi)
- vmovdqu %ymm2, 21(%rdi)
-# ifdef USE_AS_STPCPY
- lea 53(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 53(%rdi)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(StrncpyExit54):
- /* 0/32, 32/16, 46/8 */
- vmovdqu (%rsi), %ymm0
- vmovdqu 22(%rsi), %ymm2
- vmovdqu %ymm0, (%rdi)
- vmovdqu %ymm2, 22(%rdi)
-# ifdef USE_AS_STPCPY
- lea 54(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 54(%rdi)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(StrncpyExit55):
- /* 0/32, 32/16, 47/8 */
- vmovdqu (%rsi), %ymm0
- vmovdqu 23(%rsi), %ymm2
- vmovdqu %ymm0, (%rdi)
- vmovdqu %ymm2, 23(%rdi)
-# ifdef USE_AS_STPCPY
- lea 55(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 55(%rdi)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(StrncpyExit56):
- /* 0/32, 32/16, 48/8 */
- vmovdqu (%rsi), %ymm0
- vmovdqu 24(%rsi), %ymm2
- vmovdqu %ymm0, (%rdi)
- vmovdqu %ymm2, 24(%rdi)
-# ifdef USE_AS_STPCPY
- lea 56(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 56(%rdi)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(StrncpyExit57):
- /* 0/32, 25/32 */
- vmovdqu (%rsi), %ymm0
- vmovdqu 25(%rsi), %ymm2
- vmovdqu %ymm0, (%rdi)
- vmovdqu %ymm2, 25(%rdi)
-# ifdef USE_AS_STPCPY
- lea 57(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 57(%rdi)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(StrncpyExit58):
- /* 0/32, 26/32 */
- vmovdqu (%rsi), %ymm0
- vmovdqu 26(%rsi), %ymm2
- vmovdqu %ymm0, (%rdi)
- vmovdqu %ymm2, 26(%rdi)
-# ifdef USE_AS_STPCPY
- lea 58(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 58(%rdi)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(StrncpyExit59):
- /* 0/32, 27/32 */
- vmovdqu (%rsi), %ymm0
- vmovdqu 27(%rsi), %ymm2
- vmovdqu %ymm0, (%rdi)
- vmovdqu %ymm2, 27(%rdi)
-# ifdef USE_AS_STPCPY
- lea 59(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 59(%rdi)
-# endif
- VZEROUPPER
- ret
-
-
- .p2align 4
-L(StrncpyExit60):
- /* 0/32, 28/32 */
- vmovdqu (%rsi), %ymm0
- vmovdqu 28(%rsi), %ymm2
- vmovdqu %ymm0, (%rdi)
- vmovdqu %ymm2, 28(%rdi)
-# ifdef USE_AS_STPCPY
- lea 60(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 60(%rdi)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(StrncpyExit61):
- /* 0/32, 29/32 */
- vmovdqu (%rsi), %ymm0
- vmovdqu 29(%rsi), %ymm2
- vmovdqu %ymm0, (%rdi)
- vmovdqu %ymm2, 29(%rdi)
-# ifdef USE_AS_STPCPY
- lea 61(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 61(%rdi)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(StrncpyExit62):
- /* 0/32, 30/32 */
- vmovdqu (%rsi), %ymm0
- vmovdqu 30(%rsi), %ymm2
- vmovdqu %ymm0, (%rdi)
- vmovdqu %ymm2, 30(%rdi)
-# ifdef USE_AS_STPCPY
- lea 62(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 62(%rdi)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(StrncpyExit63):
- /* 0/32, 31/32 */
- vmovdqu (%rsi), %ymm0
- vmovdqu 31(%rsi), %ymm2
- vmovdqu %ymm0, (%rdi)
- vmovdqu %ymm2, 31(%rdi)
-# ifdef USE_AS_STPCPY
- lea 63(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 63(%rdi)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
L(StrncpyExit64):
/* 0/32, 32/32 */
vmovdqu (%rsi), %ymm0
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=f24c12a6cf0552d60c7ea69e9f8cc4a719fdb592
commit f24c12a6cf0552d60c7ea69e9f8cc4a719fdb592
Author: Leonardo Sandoval <leonardo.sandoval.gonzalez@linux.intel.com>
Date: Mon Sep 24 15:50:49 2018 -0500
create a single StrncpyExit17_31
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 71d57c4..5a521ed 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -1195,7 +1195,7 @@ L(StrncpyExit22):
ret
.p2align 4
-L(StrncpyExit23):
+L(StrncpyExit17_31):
vmovdqu (%rsi), %xmm0
vmovdqu -16(%rsi, %r8), %xmm2
vmovdqu %xmm0, (%rdi)
@@ -2345,21 +2345,21 @@ L(ExitStrncpyTable):
.int JMPTBL(L(StrncpyExit14), L(ExitStrncpyTable))
.int JMPTBL(L(StrncpyExit15), L(ExitStrncpyTable))
.int JMPTBL(L(StrncpyExit16), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit17), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit18), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit19), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit20), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit21), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit22), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit23), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit24), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit25), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit26), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit27), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit28), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit29), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit30), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit31), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
.int JMPTBL(L(StrncpyExit32), L(ExitStrncpyTable))
.int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
.int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=1b9e998325aee369aff3175b9bd5a14b86ca7f35
commit 1b9e998325aee369aff3175b9bd5a14b86ca7f35
Author: Leonardo Sandoval <leonardo.sandoval.gonzalez@linux.intel.com>
Date: Mon Sep 24 15:46:09 2018 -0500
prepare StrncpyExit23 for 17-31 labels
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 8f2c482..71d57c4 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -1196,15 +1196,15 @@ L(StrncpyExit22):
.p2align 4
L(StrncpyExit23):
- vmovdqu (%rsi), %xmm0
- mov 15(%rsi), %rcx
- vmovdqu %xmm0, (%rdi)
- mov %rcx, 15(%rdi)
+ vmovdqu (%rsi), %xmm0
+ vmovdqu -16(%rsi, %r8), %xmm2
+ vmovdqu %xmm0, (%rdi)
+ vmovdqu %xmm2, -16(%rdi, %r8)
# ifdef USE_AS_STPCPY
- lea 23(%rdi), %rax
+ lea (%rdi, %r8), %rax
# endif
# ifdef USE_AS_STRCAT
- movb $0, 23(%rdi)
+ movb $0, (%rdi, %r8)
# endif
VZEROUPPER
ret
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=ec1472b4329dfd4fcdc66a0ae2285a9e1929f844
commit ec1472b4329dfd4fcdc66a0ae2285a9e1929f844
Author: Leonardo Sandoval <leonardo.sandoval.gonzalez@linux.intel.com>
Date: Mon Sep 24 15:40:36 2018 -0500
Create a single label for StrncpyExit33-StrncpyExit63 cases
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index cc99867..8f2c482 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -1566,7 +1566,7 @@ L(StrncpyExit46):
ret
.p2align 4
-L(StrncpyExit47):
+L(StrncpyExit33_63):
/* 0/32, 31/16 */
vmovdqu (%rsi), %ymm0
vmovdqu -VEC_SIZE(%rsi, %r8), %ymm2
@@ -2361,37 +2361,37 @@ L(ExitStrncpyTable):
.int JMPTBL(L(StrncpyExit30), L(ExitStrncpyTable))
.int JMPTBL(L(StrncpyExit31), L(ExitStrncpyTable))
.int JMPTBL(L(StrncpyExit32), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit34), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit35), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit36), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit37), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit38), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit39), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit40), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit41), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit42), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit43), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit44), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit45), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit46), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit47), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit48), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit49), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit50), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit51), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit52), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit53), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit54), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit55), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit56), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit57), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit58), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit59), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit60), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit61), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit62), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit63), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
.int JMPTBL(L(StrncpyExit64), L(ExitStrncpyTable))
.int JMPTBL(L(StrncpyExit65), L(ExitStrncpyTable))
# ifndef USE_AS_STRCAT
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=a527c9d1d7e966caf71d549b782681012d62055c
commit a527c9d1d7e966caf71d549b782681012d62055c
Author: Leonardo Sandoval <leonardo.sandoval.gonzalez@linux.intel.com>
Date: Mon Sep 24 15:29:56 2018 -0500
prepare StrncpyExit47 for multiple label
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 99b9bc7..cc99867 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -1569,14 +1569,14 @@ L(StrncpyExit46):
L(StrncpyExit47):
/* 0/32, 31/16 */
vmovdqu (%rsi), %ymm0
- vmovdqu 31(%rsi), %xmm2
+ vmovdqu -VEC_SIZE(%rsi, %r8), %ymm2
vmovdqu %ymm0, (%rdi)
- vmovdqu %xmm2, 31(%rdi)
+ vmovdqu %ymm2, -VEC_SIZE(%rdi, %r8)
# ifdef USE_AS_STPCPY
- lea 47(%rdi), %rax
+ lea (%rdi, %r8), %rax
# endif
# ifdef USE_AS_STRCAT
- movb $0, 47(%rdi)
+ movb $0, (%rdi, %r8)
# endif
VZEROUPPER
ret
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=2e7d2b76ec0ba6f8b6bccd888b246d381756a4d4
commit 2e7d2b76ec0ba6f8b6bccd888b246d381756a4d4
Author: Leonardo Sandoval <leonardo.sandoval.gonzalez@linux.intel.com>
Date: Mon Sep 24 14:38:15 2018 -0500
remove obsolete exit branches
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 5890b8a..99b9bc7 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -794,107 +794,6 @@ L(Exit16):
ret
.p2align 4
-L(Exit17):
- vmovdqu (%rsi), %xmm0
- vmovdqu %xmm0, (%rdi)
- mov %dh, 16(%rdi)
-# ifdef USE_AS_STPCPY
- lea 16(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $17, %r8
- lea 17(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(Exit18):
- vmovdqu (%rsi), %xmm0
- mov 16(%rsi), %cx
- vmovdqu %xmm0, (%rdi)
- mov %cx, 16(%rdi)
-# ifdef USE_AS_STPCPY
- lea 17(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $18, %r8
- lea 18(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(Exit19):
- vmovdqu (%rsi), %xmm0
- mov 15(%rsi), %ecx
- vmovdqu %xmm0, (%rdi)
- mov %ecx, 15(%rdi)
-# ifdef USE_AS_STPCPY
- lea 18(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $19, %r8
- lea 19(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(Exit20):
- vmovdqu (%rsi), %xmm0
- mov 16(%rsi), %ecx
- vmovdqu %xmm0, (%rdi)
- mov %ecx, 16(%rdi)
-# ifdef USE_AS_STPCPY
- lea 19(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $20, %r8
- lea 20(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(Exit21):
- vmovdqu (%rsi), %xmm0
- mov 13(%rsi), %rcx
- vmovdqu %xmm0, (%rdi)
- mov %rcx, 13(%rdi)
-# ifdef USE_AS_STPCPY
- lea 20(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $21, %r8
- lea 21(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(Exit22):
- vmovdqu (%rsi), %xmm0
- mov 14(%rsi), %rcx
- vmovdqu %xmm0, (%rdi)
- mov %rcx, 14(%rdi)
-# ifdef USE_AS_STPCPY
- lea 21(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $22, %r8
- lea 22(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
L(Exit17_31):
vmovdqu (%rsi), %xmm0
vmovdqu -15(%rsi, %rdx), %xmm1
@@ -913,142 +812,6 @@ L(Exit17_31):
ret
.p2align 4
-L(Exit24):
- vmovdqu (%rsi), %xmm0
- mov 16(%rsi), %rcx
- vmovdqu %xmm0, (%rdi)
- mov %rcx, 16(%rdi)
-# ifdef USE_AS_STPCPY
- lea 23(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $24, %r8
- lea 24(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(Exit25):
- vmovdqu (%rsi), %xmm0
- vmovdqu 9(%rsi), %xmm1
- vmovdqu %xmm0, (%rdi)
- vmovdqu %xmm1, 9(%rdi)
-# ifdef USE_AS_STPCPY
- lea 24(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $25, %r8
- lea 25(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(Exit26):
- vmovdqu (%rsi), %xmm0
- vmovdqu 10(%rsi), %xmm1
- vmovdqu %xmm0, (%rdi)
- vmovdqu %xmm1, 10(%rdi)
-# ifdef USE_AS_STPCPY
- lea 25(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $26, %r8
- lea 26(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(Exit27):
- vmovdqu (%rsi), %xmm0
- vmovdqu 11(%rsi), %xmm1
- vmovdqu %xmm0, (%rdi)
- vmovdqu %xmm1, 11(%rdi)
-# ifdef USE_AS_STPCPY
- lea 26(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $27, %r8
- lea 27(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(Exit28):
- vmovdqu (%rsi), %xmm0
- vmovdqu 12(%rsi), %xmm1
- vmovdqu %xmm0, (%rdi)
- vmovdqu %xmm1, 12(%rdi)
-# ifdef USE_AS_STPCPY
- lea 27(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $28, %r8
- lea 28(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(Exit29):
- vmovdqu (%rsi), %xmm0
- vmovdqu 13(%rsi), %xmm2
- vmovdqu %xmm0, (%rdi)
- vmovdqu %xmm2, 13(%rdi)
-# ifdef USE_AS_STPCPY
- lea 28(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $29, %r8
- lea 29(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(Exit30):
- vmovdqu (%rsi), %xmm0
- vmovdqu 14(%rsi), %xmm2
- vmovdqu %xmm0, (%rdi)
- vmovdqu %xmm2, 14(%rdi)
-# ifdef USE_AS_STPCPY
- lea 29(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $30, %r8
- lea 30(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(Exit31):
- vmovdqu (%rsi), %xmm0
- vmovdqu 15(%rsi), %xmm2
- vmovdqu %xmm0, (%rdi)
- vmovdqu %xmm2, 15(%rdi)
-# ifdef USE_AS_STPCPY
- lea 30(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $31, %r8
- lea 31(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
L(Exit32):
vmovdqu (%rsi), %ymm0
vmovdqu %ymm0, (%rdi)
@@ -1065,7 +828,6 @@ L(Exit32):
.p2align 4
L(Exit33_63):
- /* 0/32, 31/16 */
vmovdqu (%rsi), %ymm0
vmovdqu -31(%rsi, %rdx), %ymm1
vmovdqu %ymm0, (%rdi)
@@ -1083,296 +845,7 @@ L(Exit33_63):
ret
.p2align 4
-L(Exit48):
- /* 0/32, 32/16 */
- vmovdqu (%rsi), %ymm0
- vmovdqu 32(%rsi), %xmm1
- vmovdqu %ymm0, (%rdi)
- vmovdqu %xmm1, 32(%rdi)
-# ifdef USE_AS_STPCPY
- lea 47(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $48, %r8
- lea 48(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(Exit49):
- /* 0/32, 32/16, 48/1 */
- vmovdqu (%rsi), %ymm0
- vmovdqu 17(%rsi), %ymm1
- vmovdqu %ymm0, (%rdi)
- vmovdqu %ymm1, 17(%rdi)
-# ifdef USE_AS_STPCPY
- lea 48(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $49, %r8
- lea 49(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(Exit50):
- /* 0/32, 32/16, 48/2 */
- vmovdqu (%rsi), %ymm0
- vmovdqu 18(%rsi), %ymm1
- vmovdqu %ymm0, (%rdi)
- vmovdqu %ymm1, 18(%rdi)
-# ifdef USE_AS_STPCPY
- lea 49(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $50, %r8
- lea 50(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(Exit51):
- /* 0/32, 32/16, 47/4 */
- vmovdqu (%rsi), %ymm0
- vmovdqu 19(%rsi), %ymm1
- vmovdqu %ymm0, (%rdi)
- vmovdqu %ymm1, 19(%rdi)
-# ifdef USE_AS_STPCPY
- lea 50(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $51, %r8
- lea 51(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(Exit52):
- /* 0/32, 32/16, 48/4 */
- vmovdqu (%rsi), %ymm0
- vmovdqu 20(%rsi), %ymm1
- vmovdqu %ymm0, (%rdi)
- vmovdqu %ymm1, 20(%rdi)
-# ifdef USE_AS_STPCPY
- lea 51(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $52, %r8
- lea 52(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(Exit53):
- /* 0/32, 32/16, 45/8 */
- vmovdqu (%rsi), %ymm0
- vmovdqu 21(%rsi), %ymm1
- vmovdqu %ymm0, (%rdi)
- vmovdqu %ymm1, 21(%rdi)
-# ifdef USE_AS_STPCPY
- lea 52(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $53, %r8
- lea 53(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(Exit54):
- /* 0/32, 32/16, 46/8 */
- vmovdqu (%rsi), %ymm0
- vmovdqu 22(%rsi), %ymm1
- vmovdqu %ymm0, (%rdi)
- vmovdqu %ymm1, 22(%rdi)
-# ifdef USE_AS_STPCPY
- lea 53(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $54, %r8
- lea 54(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(Exit55):
- /* 0/32, 32/16, 47/8 */
- vmovdqu (%rsi), %ymm0
- vmovdqu 23(%rsi), %ymm1
- vmovdqu %ymm0, (%rdi)
- vmovdqu %ymm1, 23(%rdi)
-# ifdef USE_AS_STPCPY
- lea 54(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $55, %r8
- lea 55(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(Exit56):
- /* 0/32, 32/16, 48/8 */
- vmovdqu (%rsi), %ymm0
- vmovdqu 24(%rsi), %ymm1
- vmovdqu %ymm0, (%rdi)
- vmovdqu %ymm1, 24(%rdi)
-# ifdef USE_AS_STPCPY
- lea 55(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $56, %r8
- lea 56(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(Exit57):
- /* 0/32, 25/32 */
- vmovdqu (%rsi), %ymm0
- vmovdqu 25(%rsi), %ymm1
- vmovdqu %ymm0, (%rdi)
- vmovdqu %ymm1, 25(%rdi)
-# ifdef USE_AS_STPCPY
- lea 56(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $57, %r8
- lea 57(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(Exit58):
- /* 0/32, 26/32 */
- vmovdqu (%rsi), %ymm0
- vmovdqu 26(%rsi), %ymm1
- vmovdqu %ymm0, (%rdi)
- vmovdqu %ymm1, 26(%rdi)
-# ifdef USE_AS_STPCPY
- lea 57(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $58, %r8
- lea 58(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(Exit59):
- /* 0/32, 27/32 */
- vmovdqu (%rsi), %ymm0
- vmovdqu 27(%rsi), %ymm1
- vmovdqu %ymm0, (%rdi)
- vmovdqu %ymm1, 27(%rdi)
-# ifdef USE_AS_STPCPY
- lea 58(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $59, %r8
- lea 59(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(Exit60):
- /* 0/32, 28/32 */
- vmovdqu (%rsi), %ymm0
- vmovdqu 28(%rsi), %ymm1
- vmovdqu %ymm0, (%rdi)
- vmovdqu %ymm1, 28(%rdi)
-# ifdef USE_AS_STPCPY
- lea 59(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $60, %r8
- lea 60(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(Exit61):
- /* 0/32, 29/32 */
- vmovdqu (%rsi), %ymm0
- vmovdqu 29(%rsi), %ymm1
- vmovdqu %ymm0, (%rdi)
- vmovdqu %ymm1, 29(%rdi)
-# ifdef USE_AS_STPCPY
- lea 60(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $61, %r8
- lea 61(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(Exit62):
- /* 0/32, 30/32 */
- vmovdqu (%rsi), %ymm0
- vmovdqu 30(%rsi), %ymm1
- vmovdqu %ymm0, (%rdi)
- vmovdqu %ymm1, 30(%rdi)
-# ifdef USE_AS_STPCPY
- lea 61(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $62, %r8
- lea 62(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(Exit63):
- /* 0/32, 31/32 */
- vmovdqu (%rsi), %ymm0
- vmovdqu 31(%rsi), %ymm1
- vmovdqu %ymm0, (%rdi)
- vmovdqu %ymm1, 31(%rdi)
-# ifdef USE_AS_STPCPY
- lea 62(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $63, %r8
- lea 63(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
L(Exit64):
- /* 0/32, 32/32 */
vmovdqu (%rsi), %ymm0
vmovdqu 32(%rsi), %ymm1
vmovdqu %ymm0, (%rdi)
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=acaf1ef714e072b891aa51630d47716ee08929c4
commit acaf1ef714e072b891aa51630d47716ee08929c4
Author: Leonardo Sandoval <leonardo.sandoval.gonzalez@linux.intel.com>
Date: Mon Sep 24 14:30:28 2018 -0500
create a single Exit17_31 label for Exit17 to Exit31 labels
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index fd46d28..5890b8a 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -895,7 +895,7 @@ L(Exit22):
ret
.p2align 4
-L(Exit23):
+L(Exit17_31):
vmovdqu (%rsi), %xmm0
vmovdqu -15(%rsi, %rdx), %xmm1
vmovdqu %xmm0, (%rdi)
@@ -2805,21 +2805,21 @@ L(ExitTable):
.int JMPTBL(L(Exit14), L(ExitTable))
.int JMPTBL(L(Exit15), L(ExitTable))
.int JMPTBL(L(Exit16), L(ExitTable))
- .int JMPTBL(L(Exit17), L(ExitTable))
- .int JMPTBL(L(Exit18), L(ExitTable))
- .int JMPTBL(L(Exit19), L(ExitTable))
- .int JMPTBL(L(Exit20), L(ExitTable))
- .int JMPTBL(L(Exit21), L(ExitTable))
- .int JMPTBL(L(Exit22), L(ExitTable))
- .int JMPTBL(L(Exit23), L(ExitTable))
- .int JMPTBL(L(Exit24), L(ExitTable))
- .int JMPTBL(L(Exit25), L(ExitTable))
- .int JMPTBL(L(Exit26), L(ExitTable))
- .int JMPTBL(L(Exit27), L(ExitTable))
- .int JMPTBL(L(Exit28), L(ExitTable))
- .int JMPTBL(L(Exit29), L(ExitTable))
- .int JMPTBL(L(Exit30), L(ExitTable))
- .int JMPTBL(L(Exit31), L(ExitTable))
+ .int JMPTBL(L(Exit17_31), L(ExitTable))
+ .int JMPTBL(L(Exit17_31), L(ExitTable))
+ .int JMPTBL(L(Exit17_31), L(ExitTable))
+ .int JMPTBL(L(Exit17_31), L(ExitTable))
+ .int JMPTBL(L(Exit17_31), L(ExitTable))
+ .int JMPTBL(L(Exit17_31), L(ExitTable))
+ .int JMPTBL(L(Exit17_31), L(ExitTable))
+ .int JMPTBL(L(Exit17_31), L(ExitTable))
+ .int JMPTBL(L(Exit17_31), L(ExitTable))
+ .int JMPTBL(L(Exit17_31), L(ExitTable))
+ .int JMPTBL(L(Exit17_31), L(ExitTable))
+ .int JMPTBL(L(Exit17_31), L(ExitTable))
+ .int JMPTBL(L(Exit17_31), L(ExitTable))
+ .int JMPTBL(L(Exit17_31), L(ExitTable))
+ .int JMPTBL(L(Exit17_31), L(ExitTable))
.int JMPTBL(L(Exit32), L(ExitTable))
.int JMPTBL(L(Exit33_63), L(ExitTable))
.int JMPTBL(L(Exit33_63), L(ExitTable))
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=fa50be1e0599168aeda6497a08f1955279198efd
commit fa50be1e0599168aeda6497a08f1955279198efd
Author: Leonardo Sandoval <leonardo.sandoval.gonzalez@linux.intel.com>
Date: Mon Sep 24 14:25:20 2018 -0500
strcpy-avx2.S: change Exit23 to prepare for Exit17 to Exit31
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 2f275f5..fd46d28 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -896,17 +896,18 @@ L(Exit22):
.p2align 4
L(Exit23):
- vmovdqu (%rsi), %xmm0
- mov 15(%rsi), %rcx
- vmovdqu %xmm0, (%rdi)
- mov %rcx, 15(%rdi)
+ vmovdqu (%rsi), %xmm0
+ vmovdqu -15(%rsi, %rdx), %xmm1
+ vmovdqu %xmm0, (%rdi)
+ vmovdqu %xmm1, -15(%rdi, %rdx)
# ifdef USE_AS_STPCPY
- lea 22(%rdi), %rax
+ lea (%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $23, %r8
- lea 23(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
+ sub %rdx, %r8
+ sub $1, %r8
+ lea 1(%rdi, %rdx), %rdi
+ jnz L(StrncpyFillTailWithZero)
# endif
VZEROUPPER
ret
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=3a7ccd25d15b2b34ff5f86cd68b2679cae46c335
commit 3a7ccd25d15b2b34ff5f86cd68b2679cae46c335
Author: Leonardo Sandoval <leonardo.sandoval.gonzalez@linux.intel.com>
Date: Mon Sep 24 12:16:19 2018 -0500
strcpy-avx2.S: usage of ymmZ vector
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 38b36e9..2f275f5 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -262,7 +262,7 @@ L(UnalignedFourVecSizeLoop_start):
jz L(UnalignedFourVecSizeLoop_start)
L(UnalignedFourVecSizeLeave):
- vpcmpeqb %ymm4, %ymm0, %ymm0
+ vpcmpeqb %ymm4, %ymmZ, %ymm0
vpmovmskb %ymm0, %edx
test %edx, %edx
jnz L(CopyVecSizeUnaligned_0)
@@ -272,7 +272,7 @@ L(UnalignedFourVecSizeLeave):
test %ecx, %ecx
jnz L(CopyVecSizeUnaligned_16)
- vpcmpeqb %ymm6, %ymm0, %ymm0
+ vpcmpeqb %ymm6, %ymmZ, %ymm0
vpmovmskb %ymm0, %edx
test %edx, %edx
jnz L(CopyVecSizeUnaligned_32)
@@ -2724,7 +2724,7 @@ L(UnalignedFourVecSizeLeaveCase3):
.p2align 4
L(UnalignedFourVecSizeLeaveCase2):
xor %ecx, %ecx
- vpcmpeqb %ymm4, %ymm0, %ymm0
+ vpcmpeqb %ymm4, %ymmZ, %ymm0
vpmovmskb %ymm0, %edx
add $(VEC_SIZE * 3), %r8
jle L(CopyVecSizeCase2OrCase3)
@@ -2734,7 +2734,7 @@ L(UnalignedFourVecSizeLeaveCase2):
# else
jnz L(CopyVecSize)
# endif
- vpcmpeqb %ymm5, %ymm0, %ymm0
+ vpcmpeqb %ymm5, %ymmZ, %ymm0
vpmovmskb %ymm0, %edx
vmovdqu %ymm4, (%rdi)
add $VEC_SIZE, %rcx
@@ -2747,7 +2747,7 @@ L(UnalignedFourVecSizeLeaveCase2):
jnz L(CopyVecSize)
# endif
- vpcmpeqb %ymm6, %ymm0, %ymm0
+ vpcmpeqb %ymm6, %ymmZ, %ymm0
vpmovmskb %ymm0, %edx
vmovdqu %ymm5, VEC_SIZE(%rdi)
add $VEC_SIZE, %rcx
@@ -2760,7 +2760,7 @@ L(UnalignedFourVecSizeLeaveCase2):
jnz L(CopyVecSize)
# endif
- vpcmpeqb %ymm7, %ymm0, %ymm0
+ vpcmpeqb %ymm7, %ymmZ, %ymm0
vpmovmskb %ymm0, %edx
vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
lea VEC_SIZE(%rdi, %rcx), %rdi
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=da995e6f074b3504d4e8c9860b41caa33ffdea23
commit da995e6f074b3504d4e8c9860b41caa33ffdea23
Author: Leonardo Sandoval <leonardo.sandoval.gonzalez@linux.intel.com>
Date: Mon Sep 24 12:06:49 2018 -0500
strcpy-avx2.S: remove useless vpxor instructions
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 9dbcd08..38b36e9 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -71,8 +71,6 @@ ENTRY (STRCPY)
and $-VEC_SIZE, %rsi
and $(VEC_SIZE - 1), %ecx
- vpxor %xmm0, %xmm0, %xmm0
- vpxor %xmm1, %xmm1, %xmm1
vpcmpeqb (%rsi), %ymmZ, %ymm1
vpmovmskb %ymm1, %edx
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=a27d35e8a9806b211d31449c13a8c97b114e5221
commit a27d35e8a9806b211d31449c13a8c97b114e5221
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Fri Sep 21 09:00:16 2018 -0700
Replace Exit33 to Exit63 with Exit33_63
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index b4f4738..9dbcd08 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -1065,258 +1065,7 @@ L(Exit32):
ret
.p2align 4
-L(Exit33):
- /* 0/32, 32/1 */
- vmovdqu (%rsi), %ymm0
- vmovdqu %ymm0, (%rdi)
- mov %dh, 32(%rdi)
-# ifdef USE_AS_STPCPY
- lea 32(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $33, %r8
- lea 33(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(Exit34):
- /* 0/32, 32/2 */
- vmovdqu (%rsi), %ymm0
- mov 32(%rsi), %dx
- vmovdqu %ymm0, (%rdi)
- mov %dx, 32(%rdi)
-# ifdef USE_AS_STPCPY
- lea 33(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $34, %r8
- lea 34(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(Exit35):
- /* 0/32, 31/4 */
- vmovdqu (%rsi), %ymm0
- mov 31(%rsi), %edx
- vmovdqu %ymm0, (%rdi)
- mov %edx, 31(%rdi)
-# ifdef USE_AS_STPCPY
- lea 34(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $35, %r8
- lea 35(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(Exit36):
- /* 0/32, 32/4 */
- vmovdqu (%rsi), %ymm0
- mov 32(%rsi), %edx
- vmovdqu %ymm0, (%rdi)
- mov %edx, 32(%rdi)
-# ifdef USE_AS_STPCPY
- lea 35(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $36, %r8
- lea 36(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(Exit37):
- /* 0/32, 29/8 */
- vmovdqu (%rsi), %ymm0
- mov 29(%rsi), %rdx
- vmovdqu %ymm0, (%rdi)
- mov %rdx, 29(%rdi)
-# ifdef USE_AS_STPCPY
- lea 36(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $37, %r8
- lea 37(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(Exit38):
- /* 0/32, 30/8 */
- vmovdqu (%rsi), %ymm0
- mov 30(%rsi), %rdx
- vmovdqu %ymm0, (%rdi)
- mov %rdx, 30(%rdi)
-# ifdef USE_AS_STPCPY
- lea 37(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $38, %r8
- lea 38(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(Exit39):
- /* 0/32, 31/8 */
- vmovdqu (%rsi), %ymm0
- mov 31(%rsi), %rdx
- vmovdqu %ymm0, (%rdi)
- mov %rdx, 31(%rdi)
-# ifdef USE_AS_STPCPY
- lea 38(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $39, %r8
- lea 39(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(Exit40):
- /* 0/32, 32/8 */
- vmovdqu (%rsi), %ymm0
- mov 32(%rsi), %rdx
- vmovdqu %ymm0, (%rdi)
- mov %rdx, 32(%rdi)
-# ifdef USE_AS_STPCPY
- lea 39(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $40, %r8
- lea 40(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(Exit41):
- /* 0/32, 32/8, 40/1 */
- vmovdqu (%rsi), %ymm0
- vmovdqu 25(%rsi), %xmm1
- vmovdqu %ymm0, (%rdi)
- vmovdqu %xmm1, 25(%rdi)
-# ifdef USE_AS_STPCPY
- lea 40(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $41, %r8
- lea 41(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(Exit42):
- /* 0/32, 32/8, 40/2 */
- vmovdqu (%rsi), %ymm0
- vmovdqu 26(%rsi), %xmm1
- vmovdqu %ymm0, (%rdi)
- vmovdqu %xmm1, 26(%rdi)
-# ifdef USE_AS_STPCPY
- lea 41(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $42, %r8
- lea 42(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(Exit43):
- /* 0/32, 27/16 */
- vmovdqu (%rsi), %ymm0
- vmovdqu 27(%rsi), %xmm1
- vmovdqu %ymm0, (%rdi)
- vmovdqu %xmm1, 27(%rdi)
-# ifdef USE_AS_STPCPY
- lea 42(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $43, %r8
- lea 43(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(Exit44):
- /* 0/32, 28/16 */
- vmovdqu (%rsi), %ymm0
- vmovdqu 28(%rsi), %xmm1
- vmovdqu %ymm0, (%rdi)
- vmovdqu %xmm1, 28(%rdi)
-# ifdef USE_AS_STPCPY
- lea 43(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $44, %r8
- lea 44(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(Exit45):
- /* 0/32, 29/16 */
- vmovdqu (%rsi), %ymm0
- vmovdqu 29(%rsi), %xmm1
- vmovdqu %ymm0, (%rdi)
- vmovdqu %xmm1, 29(%rdi)
-# ifdef USE_AS_STPCPY
- lea 44(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $45, %r8
- lea 45(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(Exit46):
- /* 0/32, 30/16 */
- vmovdqu (%rsi), %ymm0
- vmovdqu 30(%rsi), %xmm1
- vmovdqu %ymm0, (%rdi)
- vmovdqu %xmm1, 30(%rdi)
-# ifdef USE_AS_STPCPY
- lea 45(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $46, %r8
- lea 46(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(Exit47):
+L(Exit33_63):
/* 0/32, 31/16 */
vmovdqu (%rsi), %ymm0
vmovdqu -31(%rsi, %rdx), %ymm1
@@ -3073,37 +2822,37 @@ L(ExitTable):
.int JMPTBL(L(Exit30), L(ExitTable))
.int JMPTBL(L(Exit31), L(ExitTable))
.int JMPTBL(L(Exit32), L(ExitTable))
- .int JMPTBL(L(Exit33), L(ExitTable))
- .int JMPTBL(L(Exit34), L(ExitTable))
- .int JMPTBL(L(Exit35), L(ExitTable))
- .int JMPTBL(L(Exit36), L(ExitTable))
- .int JMPTBL(L(Exit37), L(ExitTable))
- .int JMPTBL(L(Exit38), L(ExitTable))
- .int JMPTBL(L(Exit39), L(ExitTable))
- .int JMPTBL(L(Exit40), L(ExitTable))
- .int JMPTBL(L(Exit41), L(ExitTable))
- .int JMPTBL(L(Exit42), L(ExitTable))
- .int JMPTBL(L(Exit43), L(ExitTable))
- .int JMPTBL(L(Exit44), L(ExitTable))
- .int JMPTBL(L(Exit45), L(ExitTable))
- .int JMPTBL(L(Exit46), L(ExitTable))
- .int JMPTBL(L(Exit47), L(ExitTable))
- .int JMPTBL(L(Exit48), L(ExitTable))
- .int JMPTBL(L(Exit49), L(ExitTable))
- .int JMPTBL(L(Exit50), L(ExitTable))
- .int JMPTBL(L(Exit51), L(ExitTable))
- .int JMPTBL(L(Exit52), L(ExitTable))
- .int JMPTBL(L(Exit53), L(ExitTable))
- .int JMPTBL(L(Exit54), L(ExitTable))
- .int JMPTBL(L(Exit55), L(ExitTable))
- .int JMPTBL(L(Exit56), L(ExitTable))
- .int JMPTBL(L(Exit57), L(ExitTable))
- .int JMPTBL(L(Exit58), L(ExitTable))
- .int JMPTBL(L(Exit59), L(ExitTable))
- .int JMPTBL(L(Exit60), L(ExitTable))
- .int JMPTBL(L(Exit61), L(ExitTable))
- .int JMPTBL(L(Exit62), L(ExitTable))
- .int JMPTBL(L(Exit63), L(ExitTable))
+ .int JMPTBL(L(Exit33_63), L(ExitTable))
+ .int JMPTBL(L(Exit33_63), L(ExitTable))
+ .int JMPTBL(L(Exit33_63), L(ExitTable))
+ .int JMPTBL(L(Exit33_63), L(ExitTable))
+ .int JMPTBL(L(Exit33_63), L(ExitTable))
+ .int JMPTBL(L(Exit33_63), L(ExitTable))
+ .int JMPTBL(L(Exit33_63), L(ExitTable))
+ .int JMPTBL(L(Exit33_63), L(ExitTable))
+ .int JMPTBL(L(Exit33_63), L(ExitTable))
+ .int JMPTBL(L(Exit33_63), L(ExitTable))
+ .int JMPTBL(L(Exit33_63), L(ExitTable))
+ .int JMPTBL(L(Exit33_63), L(ExitTable))
+ .int JMPTBL(L(Exit33_63), L(ExitTable))
+ .int JMPTBL(L(Exit33_63), L(ExitTable))
+ .int JMPTBL(L(Exit33_63), L(ExitTable))
+ .int JMPTBL(L(Exit33_63), L(ExitTable))
+ .int JMPTBL(L(Exit33_63), L(ExitTable))
+ .int JMPTBL(L(Exit33_63), L(ExitTable))
+ .int JMPTBL(L(Exit33_63), L(ExitTable))
+ .int JMPTBL(L(Exit33_63), L(ExitTable))
+ .int JMPTBL(L(Exit33_63), L(ExitTable))
+ .int JMPTBL(L(Exit33_63), L(ExitTable))
+ .int JMPTBL(L(Exit33_63), L(ExitTable))
+ .int JMPTBL(L(Exit33_63), L(ExitTable))
+ .int JMPTBL(L(Exit33_63), L(ExitTable))
+ .int JMPTBL(L(Exit33_63), L(ExitTable))
+ .int JMPTBL(L(Exit33_63), L(ExitTable))
+ .int JMPTBL(L(Exit33_63), L(ExitTable))
+ .int JMPTBL(L(Exit33_63), L(ExitTable))
+ .int JMPTBL(L(Exit33_63), L(ExitTable))
+ .int JMPTBL(L(Exit33_63), L(ExitTable))
.int JMPTBL(L(Exit64), L(ExitTable))
# ifdef USE_AS_STRNCPY
L(ExitStrncpyTable):
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=782590dfdd140ba21765a7ac563ebbc6d64958c2
commit 782590dfdd140ba21765a7ac563ebbc6d64958c2
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Fri Sep 21 08:56:27 2018 -0700
Change L(Exit47) to handle Exit33 to Exit63
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index e1b7431..b4f4738 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -1319,15 +1319,16 @@ L(Exit46):
L(Exit47):
/* 0/32, 31/16 */
vmovdqu (%rsi), %ymm0
- vmovdqu 31(%rsi), %xmm1
+ vmovdqu -31(%rsi, %rdx), %ymm1
vmovdqu %ymm0, (%rdi)
- vmovdqu %xmm1, 31(%rdi)
+ vmovdqu %ymm1, -31(%rdi, %rdx)
# ifdef USE_AS_STPCPY
- lea 46(%rdi), %rax
+ lea (%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $47, %r8
- lea 47(%rdi), %rdi
+ sub %rdx, %r8
+ sub $1, %r8
+ lea 1(%rdi, %rdx), %rdi
jnz L(StrncpyFillTailWithZero)
# endif
VZEROUPPER
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=e10ee0652bc868d9b4f18de0fc392524b451bc88
commit e10ee0652bc868d9b4f18de0fc392524b451bc88
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Thu Sep 20 16:01:59 2018 -0700
Use 32-bit registers for vpmovmskb YMM
YMM registers are 32 bytes. 32-bit registers are sufficient.
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 548e0e9..e1b7431 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -75,7 +75,7 @@ ENTRY (STRCPY)
vpxor %xmm1, %xmm1, %xmm1
vpcmpeqb (%rsi), %ymmZ, %ymm1
- vpmovmskb %ymm1, %rdx
+ vpmovmskb %ymm1, %edx
shr %cl, %rdx
# ifdef USE_AS_STRNCPY
@@ -90,18 +90,18 @@ ENTRY (STRCPY)
# endif
jbe L(CopyVecSizeTailCase2OrCase3)
# endif
- test %rdx, %rdx
+ test %edx, %edx
jnz L(CopyVecSizeTail)
vpcmpeqb VEC_SIZE(%rsi), %ymmZ, %ymm0
- vpmovmskb %ymm0, %rdx
+ vpmovmskb %ymm0, %edx
# ifdef USE_AS_STRNCPY
add $VEC_SIZE, %r10
cmp %r10, %r8
jbe L(CopyTwoVecSizeCase2OrCase3)
# endif
- test %rdx, %rdx
+ test %edx, %edx
jnz L(CopyTwoVecSize)
vmovdqu (%rsi, %rcx), %ymm1 /* copy VEC_SIZE bytes */
@@ -121,13 +121,13 @@ L(UnalignVecSizeBoth):
vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
vmovdqu %ymm1, (%rdi, %rcx)
vpcmpeqb %ymm2, %ymmZ, %ymm0
- vpmovmskb %ymm0, %rdx
+ vpmovmskb %ymm0, %edx
add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
sub $(VEC_SIZE * 3), %r8
jbe L(CopyVecSizeCase2OrCase3)
# endif
- test %rdx, %rdx
+ test %edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
jnz L(CopyVecSizeUnalignedVec2)
# else
@@ -137,13 +137,13 @@ L(UnalignVecSizeBoth):
vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
vmovdqu %ymm2, (%rdi, %rcx)
vpcmpeqb %ymm3, %ymmZ, %ymm0
- vpmovmskb %ymm0, %rdx
+ vpmovmskb %ymm0, %edx
add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
sub $VEC_SIZE, %r8
jbe L(CopyVecSizeCase2OrCase3)
# endif
- test %rdx, %rdx
+ test %edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
jnz L(CopyVecSizeUnalignedVec3)
# else
@@ -153,13 +153,13 @@ L(UnalignVecSizeBoth):
vmovdqa VEC_SIZE(%rsi, %rcx), %ymm4
vmovdqu %ymm3, (%rdi, %rcx)
vpcmpeqb %ymm4, %ymmZ, %ymm0
- vpmovmskb %ymm0, %rdx
+ vpmovmskb %ymm0, %edx
add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
sub $VEC_SIZE, %r8
jbe L(CopyVecSizeCase2OrCase3)
# endif
- test %rdx, %rdx
+ test %edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
jnz L(CopyVecSizeUnalignedVec4)
# else
@@ -169,13 +169,13 @@ L(UnalignVecSizeBoth):
vmovdqa VEC_SIZE(%rsi, %rcx), %ymm1
vmovdqu %ymm4, (%rdi, %rcx)
vpcmpeqb %ymm1, %ymmZ, %ymm0
- vpmovmskb %ymm0, %rdx
+ vpmovmskb %ymm0, %edx
add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
sub $VEC_SIZE, %r8
jbe L(CopyVecSizeCase2OrCase3)
# endif
- test %rdx, %rdx
+ test %edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
jnz L(CopyVecSizeUnalignedVec1)
# else
@@ -185,13 +185,13 @@ L(UnalignVecSizeBoth):
vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
vmovdqu %ymm1, (%rdi, %rcx)
vpcmpeqb %ymm2, %ymmZ, %ymm0
- vpmovmskb %ymm0, %rdx
+ vpmovmskb %ymm0, %edx
add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
sub $VEC_SIZE, %r8
jbe L(CopyVecSizeCase2OrCase3)
# endif
- test %rdx, %rdx
+ test %edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
jnz L(CopyVecSizeUnalignedVec2)
# else
@@ -201,13 +201,13 @@ L(UnalignVecSizeBoth):
vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
vmovdqu %ymm2, (%rdi, %rcx)
vpcmpeqb %ymm3, %ymmZ, %ymm0
- vpmovmskb %ymm0, %rdx
+ vpmovmskb %ymm0, %edx
add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
sub $VEC_SIZE, %r8
jbe L(CopyVecSizeCase2OrCase3)
# endif
- test %rdx, %rdx
+ test %edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
jnz L(CopyVecSizeUnalignedVec3)
# else
@@ -232,12 +232,12 @@ L(UnalignedFourVecSizeLoop):
vpminub %ymm7, %ymm6, %ymm3
vpminub %ymm2, %ymm3, %ymm3
vpcmpeqb %ymm0, %ymm3, %ymm3
- vpmovmskb %ymm3, %rdx
+ vpmovmskb %ymm3, %edx
# ifdef USE_AS_STRNCPY
sub $(VEC_SIZE * 4), %r8
jbe L(UnalignedLeaveCase2OrCase3)
# endif
- test %rdx, %rdx
+ test %edx, %edx
jnz L(UnalignedFourVecSizeLeave)
L(UnalignedFourVecSizeLoop_start):
@@ -255,33 +255,33 @@ L(UnalignedFourVecSizeLoop_start):
vpminub %ymm7, %ymm6, %ymm3
vpminub %ymm2, %ymm3, %ymm3
vpcmpeqb %ymm0, %ymm3, %ymm3
- vpmovmskb %ymm3, %rdx
+ vpmovmskb %ymm3, %edx
# ifdef USE_AS_STRNCPY
sub $(VEC_SIZE * 4), %r8
jbe L(UnalignedLeaveCase2OrCase3)
# endif
- test %rdx, %rdx
+ test %edx, %edx
jz L(UnalignedFourVecSizeLoop_start)
L(UnalignedFourVecSizeLeave):
vpcmpeqb %ymm4, %ymm0, %ymm0
- vpmovmskb %ymm0, %rdx
- test %rdx, %rdx
+ vpmovmskb %ymm0, %edx
+ test %edx, %edx
jnz L(CopyVecSizeUnaligned_0)
vpcmpeqb %ymm5, %ymmZ, %ymm1
- vpmovmskb %ymm1, %rcx
- test %rcx, %rcx
+ vpmovmskb %ymm1, %ecx
+ test %ecx, %ecx
jnz L(CopyVecSizeUnaligned_16)
vpcmpeqb %ymm6, %ymm0, %ymm0
- vpmovmskb %ymm0, %rdx
- test %rdx, %rdx
+ vpmovmskb %ymm0, %edx
+ test %edx, %edx
jnz L(CopyVecSizeUnaligned_32)
vpcmpeqb %ymm7, %ymmZ, %ymm1
- vpmovmskb %ymm1, %rcx
- bsf %rcx, %rdx
+ vpmovmskb %ymm1, %ecx
+ bsf %ecx, %edx
vmovdqu %ymm4, (%rdi)
vmovdqu %ymm5, VEC_SIZE(%rdi)
vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
@@ -306,7 +306,7 @@ L(SourceStringAlignmentLessTwoVecSize):
vmovdqu (%rsi), %ymm1
vmovdqu VEC_SIZE(%rsi), %ymm2
vpcmpeqb %ymm1, %ymmZ, %ymm0
- vpmovmskb %ymm0, %rdx
+ vpmovmskb %ymm0, %edx
# ifdef USE_AS_STRNCPY
# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
@@ -316,12 +316,12 @@ L(SourceStringAlignmentLessTwoVecSize):
# endif
jbe L(CopyVecSizeTail1Case2OrCase3)
# endif
- test %rdx, %rdx
+ test %edx, %edx
jnz L(CopyVecSizeTail1)
vpcmpeqb %ymm2, %ymmZ, %ymm0
vmovdqu %ymm1, (%rdi)
- vpmovmskb %ymm0, %rdx
+ vpmovmskb %ymm0, %edx
# ifdef USE_AS_STRNCPY
# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
@@ -331,7 +331,7 @@ L(SourceStringAlignmentLessTwoVecSize):
# endif
jbe L(CopyTwoVecSize1Case2OrCase3)
# endif
- test %rdx, %rdx
+ test %edx, %edx
jnz L(CopyTwoVecSize1)
and $-VEC_SIZE, %rsi
@@ -347,13 +347,13 @@ L(SourceStringAlignmentLessTwoVecSize):
L(CopyVecSize):
add %rcx, %rdi
add %rcx, %rsi
- bsf %rdx, %rdx
+ bsf %edx, %edx
BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
# endif
.p2align 4
L(CopyVecSizeTail):
add %rcx, %rsi
- bsf %rdx, %rdx
+ bsf %edx, %edx
BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
.p2align 4
@@ -364,20 +364,20 @@ L(CopyTwoVecSize1):
sub $VEC_SIZE, %r8
# endif
L(CopyVecSizeTail1):
- bsf %rdx, %rdx
+ bsf %edx, %edx
BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
.p2align 4
L(CopyTwoVecSize):
- bsf %rdx, %rdx
+ bsf %edx, %edx
add %rcx, %rsi
- add $VEC_SIZE, %rdx
- sub %rcx, %rdx
+ add $VEC_SIZE, %edx
+ sub %ecx, %edx
BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
.p2align 4
L(CopyVecSizeUnaligned_0):
- bsf %rdx, %rdx
+ bsf %edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
# ifdef USE_AS_STPCPY
lea (%rdi, %rdx), %rax
@@ -393,7 +393,7 @@ L(CopyVecSizeUnaligned_0):
.p2align 4
L(CopyVecSizeUnaligned_16):
- bsf %rcx, %rdx
+ bsf %ecx, %edx
vmovdqu %ymm4, (%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
# ifdef USE_AS_STPCPY
@@ -412,7 +412,7 @@ L(CopyVecSizeUnaligned_16):
.p2align 4
L(CopyVecSizeUnaligned_32):
- bsf %rdx, %rdx
+ bsf %edx, %edx
vmovdqu %ymm4, (%rdi)
vmovdqu %ymm5, VEC_SIZE(%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
@@ -469,31 +469,31 @@ L(CopyVecSizeCase2):
add $VEC_SIZE, %r8
add %rcx, %rdi
add %rcx, %rsi
- bsf %rdx, %rdx
- cmp %r8, %rdx
+ bsf %edx, %edx
+ cmp %r8d, %edx
jb L(CopyVecSizeExit)
BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
.p2align 4
L(CopyTwoVecSizeCase2):
add %rcx, %rsi
- bsf %rdx, %rdx
- add $VEC_SIZE, %rdx
- sub %rcx, %rdx
- cmp %r8, %rdx
+ bsf %edx, %edx
+ add $VEC_SIZE, %edx
+ sub %ecx, %edx
+ cmp %r8d, %edx
jb L(CopyVecSizeExit)
BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
L(CopyVecSizeTailCase2):
add %rcx, %rsi
- bsf %rdx, %rdx
- cmp %r8, %rdx
+ bsf %edx, %edx
+ cmp %r8d, %edx
jb L(CopyVecSizeExit)
BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
L(CopyVecSizeTail1Case2):
- bsf %rdx, %rdx
- cmp %r8, %rdx
+ bsf %edx, %edx
+ cmp %r8d, %edx
jb L(CopyVecSizeExit)
BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
@@ -2885,7 +2885,7 @@ L(CopyVecSizeUnalignedVec2):
.p2align 4
L(CopyVecSizeVecExit):
- bsf %rdx, %rdx
+ bsf %edx, %edx
add $(VEC_SIZE - 1), %r8
add %rcx, %rdi
# ifdef USE_AS_STPCPY
@@ -2977,22 +2977,22 @@ L(UnalignedFourVecSizeLeaveCase3):
L(UnalignedFourVecSizeLeaveCase2):
xor %ecx, %ecx
vpcmpeqb %ymm4, %ymm0, %ymm0
- vpmovmskb %ymm0, %rdx
+ vpmovmskb %ymm0, %edx
add $(VEC_SIZE * 3), %r8
jle L(CopyVecSizeCase2OrCase3)
- test %rdx, %rdx
+ test %edx, %edx
# ifndef USE_AS_STRCAT
jnz L(CopyVecSizeUnalignedVec4)
# else
jnz L(CopyVecSize)
# endif
vpcmpeqb %ymm5, %ymm0, %ymm0
- vpmovmskb %ymm0, %rdx
+ vpmovmskb %ymm0, %edx
vmovdqu %ymm4, (%rdi)
add $VEC_SIZE, %rcx
sub $VEC_SIZE, %r8
jbe L(CopyVecSizeCase2OrCase3)
- test %rdx, %rdx
+ test %edx, %edx
# ifndef USE_AS_STRCAT
jnz L(CopyVecSizeUnalignedVec5)
# else
@@ -3000,12 +3000,12 @@ L(UnalignedFourVecSizeLeaveCase2):
# endif
vpcmpeqb %ymm6, %ymm0, %ymm0
- vpmovmskb %ymm0, %rdx
+ vpmovmskb %ymm0, %edx
vmovdqu %ymm5, VEC_SIZE(%rdi)
add $VEC_SIZE, %rcx
sub $VEC_SIZE, %r8
jbe L(CopyVecSizeCase2OrCase3)
- test %rdx, %rdx
+ test %edx, %edx
# ifndef USE_AS_STRCAT
jnz L(CopyVecSizeUnalignedVec6)
# else
@@ -3013,12 +3013,12 @@ L(UnalignedFourVecSizeLeaveCase2):
# endif
vpcmpeqb %ymm7, %ymm0, %ymm0
- vpmovmskb %ymm0, %rdx
+ vpmovmskb %ymm0, %edx
vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
lea VEC_SIZE(%rdi, %rcx), %rdi
lea VEC_SIZE(%rsi, %rcx), %rsi
- bsf %rdx, %rdx
- cmp %r8, %rdx
+ bsf %edx, %edx
+ cmp %r8d, %edx
jb L(CopyVecSizeExit)
BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=0706e178920c6cea7f23bb415e73365fda4a9f57
commit 0706e178920c6cea7f23bb415e73365fda4a9f57
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Tue Sep 18 12:21:18 2018 -0700
Use 32-bit AND when upper 32 bits are zero
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index ea39094..548e0e9 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -65,12 +65,12 @@ ENTRY (STRCPY)
vpxor %xmmZ, %xmmZ, %xmmZ
- and $((VEC_SIZE * 4) - 1), %rcx
- cmp $(VEC_SIZE * 2), %rcx
+ and $((VEC_SIZE * 4) - 1), %ecx
+ cmp $(VEC_SIZE * 2), %ecx
jbe L(SourceStringAlignmentLessTwoVecSize)
and $-VEC_SIZE, %rsi
- and $(VEC_SIZE - 1), %rcx
+ and $(VEC_SIZE - 1), %ecx
vpxor %xmm0, %xmm0, %xmm0
vpxor %xmm1, %xmm1, %xmm1
@@ -335,7 +335,7 @@ L(SourceStringAlignmentLessTwoVecSize):
jnz L(CopyTwoVecSize1)
and $-VEC_SIZE, %rsi
- and $(VEC_SIZE - 1), %rcx
+ and $(VEC_SIZE - 1), %ecx
jmp L(UnalignVecSizeBoth)
/*------End of main part with loops---------------------*/
@@ -2904,7 +2904,7 @@ L(StrncpyFillTailWithZero):
add $VEC_SIZE, %rdi
mov %rdi, %rsi
- and $(VEC_SIZE - 1), %rsi
+ and $(VEC_SIZE - 1), %esi
sub %rsi, %rdi
add %rsi, %r8
sub $(VEC_SIZE * 4), %r8
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=307f7ebcdefbb32f4d2451c3c052448272b13f26
commit 307f7ebcdefbb32f4d2451c3c052448272b13f26
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Tue Sep 18 12:18:22 2018 -0700
Replace xor %ch, %ch/movb %ch, (%rdi) with movb $0, (%rdi)
Please compare performance before and after.
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 222826d..ea39094 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -1647,8 +1647,7 @@ L(StrncpyExit0):
mov %rdi, %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, (%rdi)
+ movb $0, (%rdi)
# endif
VZEROUPPER
ret
@@ -1661,8 +1660,7 @@ L(StrncpyExit1):
lea 1(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 1(%rdi)
+ movb $0, 1(%rdi)
# endif
VZEROUPPER
ret
@@ -1675,8 +1673,7 @@ L(StrncpyExit2):
lea 2(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 2(%rdi)
+ movb $0, 2(%rdi)
# endif
VZEROUPPER
ret
@@ -1691,8 +1688,7 @@ L(StrncpyExit3):
lea 3(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 3(%rdi)
+ movb $0, 3(%rdi)
# endif
VZEROUPPER
ret
@@ -1705,8 +1701,7 @@ L(StrncpyExit4):
lea 4(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 4(%rdi)
+ movb $0, 4(%rdi)
# endif
VZEROUPPER
ret
@@ -1721,8 +1716,7 @@ L(StrncpyExit5):
lea 5(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 5(%rdi)
+ movb $0, 5(%rdi)
# endif
VZEROUPPER
ret
@@ -1737,8 +1731,7 @@ L(StrncpyExit6):
lea 6(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 6(%rdi)
+ movb $0, 6(%rdi)
# endif
VZEROUPPER
ret
@@ -1753,8 +1746,7 @@ L(StrncpyExit7):
lea 7(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 7(%rdi)
+ movb $0, 7(%rdi)
# endif
VZEROUPPER
ret
@@ -1767,8 +1759,7 @@ L(StrncpyExit8):
lea 8(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 8(%rdi)
+ movb $0, 8(%rdi)
# endif
VZEROUPPER
ret
@@ -1783,8 +1774,7 @@ L(StrncpyExit9):
lea 9(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 9(%rdi)
+ movb $0, 9(%rdi)
# endif
VZEROUPPER
ret
@@ -1799,8 +1789,7 @@ L(StrncpyExit10):
lea 10(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 10(%rdi)
+ movb $0, 10(%rdi)
# endif
VZEROUPPER
ret
@@ -1815,8 +1804,7 @@ L(StrncpyExit11):
lea 11(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 11(%rdi)
+ movb $0, 11(%rdi)
# endif
VZEROUPPER
ret
@@ -1831,8 +1819,7 @@ L(StrncpyExit12):
lea 12(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 12(%rdi)
+ movb $0, 12(%rdi)
# endif
VZEROUPPER
ret
@@ -1847,8 +1834,7 @@ L(StrncpyExit13):
lea 13(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 13(%rdi)
+ movb $0, 13(%rdi)
# endif
VZEROUPPER
ret
@@ -1863,8 +1849,7 @@ L(StrncpyExit14):
lea 14(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 14(%rdi)
+ movb $0, 14(%rdi)
# endif
VZEROUPPER
ret
@@ -1879,8 +1864,7 @@ L(StrncpyExit15):
lea 15(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 15(%rdi)
+ movb $0, 15(%rdi)
# endif
VZEROUPPER
ret
@@ -1893,8 +1877,7 @@ L(StrncpyExit16):
lea 16(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 16(%rdi)
+ movb $0, 16(%rdi)
# endif
VZEROUPPER
ret
@@ -1909,8 +1892,7 @@ L(StrncpyExit17):
lea 17(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 17(%rdi)
+ movb $0, 17(%rdi)
# endif
VZEROUPPER
ret
@@ -1925,8 +1907,7 @@ L(StrncpyExit18):
lea 18(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 18(%rdi)
+ movb $0, 18(%rdi)
# endif
VZEROUPPER
ret
@@ -1941,8 +1922,7 @@ L(StrncpyExit19):
lea 19(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 19(%rdi)
+ movb $0, 19(%rdi)
# endif
VZEROUPPER
ret
@@ -1957,8 +1937,7 @@ L(StrncpyExit20):
lea 20(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 20(%rdi)
+ movb $0, 20(%rdi)
# endif
VZEROUPPER
ret
@@ -1973,8 +1952,7 @@ L(StrncpyExit21):
lea 21(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 21(%rdi)
+ movb $0, 21(%rdi)
# endif
VZEROUPPER
ret
@@ -1989,8 +1967,7 @@ L(StrncpyExit22):
lea 22(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 22(%rdi)
+ movb $0, 22(%rdi)
# endif
VZEROUPPER
ret
@@ -2005,8 +1982,7 @@ L(StrncpyExit23):
lea 23(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 23(%rdi)
+ movb $0, 23(%rdi)
# endif
VZEROUPPER
ret
@@ -2021,8 +1997,7 @@ L(StrncpyExit24):
lea 24(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 24(%rdi)
+ movb $0, 24(%rdi)
# endif
VZEROUPPER
ret
@@ -2037,8 +2012,7 @@ L(StrncpyExit25):
lea 25(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 25(%rdi)
+ movb $0, 25(%rdi)
# endif
VZEROUPPER
ret
@@ -2053,8 +2027,7 @@ L(StrncpyExit26):
lea 26(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 26(%rdi)
+ movb $0, 26(%rdi)
# endif
VZEROUPPER
ret
@@ -2069,8 +2042,7 @@ L(StrncpyExit27):
lea 27(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 27(%rdi)
+ movb $0, 27(%rdi)
# endif
VZEROUPPER
ret
@@ -2085,8 +2057,7 @@ L(StrncpyExit28):
lea 28(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 28(%rdi)
+ movb $0, 28(%rdi)
# endif
VZEROUPPER
ret
@@ -2101,8 +2072,7 @@ L(StrncpyExit29):
lea 29(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 29(%rdi)
+ movb $0, 29(%rdi)
# endif
VZEROUPPER
ret
@@ -2117,8 +2087,7 @@ L(StrncpyExit30):
lea 30(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 30(%rdi)
+ movb $0, 30(%rdi)
# endif
VZEROUPPER
ret
@@ -2133,8 +2102,7 @@ L(StrncpyExit31):
lea 31(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 31(%rdi)
+ movb $0, 31(%rdi)
# endif
VZEROUPPER
ret
@@ -2147,8 +2115,7 @@ L(StrncpyExit32):
lea 32(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 32(%rdi)
+ movb $0, 32(%rdi)
# endif
VZEROUPPER
ret
@@ -2163,8 +2130,7 @@ L(StrncpyExit33):
lea 33(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 33(%rdi)
+ movb $0, 33(%rdi)
# endif
VZEROUPPER
ret
@@ -2180,8 +2146,7 @@ L(StrncpyExit34):
lea 34(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 34(%rdi)
+ movb $0, 34(%rdi)
# endif
VZEROUPPER
ret
@@ -2197,8 +2162,7 @@ L(StrncpyExit35):
lea 35(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 35(%rdi)
+ movb $0, 35(%rdi)
# endif
VZEROUPPER
ret
@@ -2214,8 +2178,7 @@ L(StrncpyExit36):
lea 36(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 36(%rdi)
+ movb $0, 36(%rdi)
# endif
VZEROUPPER
ret
@@ -2231,8 +2194,7 @@ L(StrncpyExit37):
lea 37(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 37(%rdi)
+ movb $0, 37(%rdi)
# endif
VZEROUPPER
ret
@@ -2248,8 +2210,7 @@ L(StrncpyExit38):
lea 38(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 38(%rdi)
+ movb $0, 38(%rdi)
# endif
VZEROUPPER
ret
@@ -2265,8 +2226,7 @@ L(StrncpyExit39):
lea 39(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 39(%rdi)
+ movb $0, 39(%rdi)
# endif
VZEROUPPER
ret
@@ -2282,8 +2242,7 @@ L(StrncpyExit40):
lea 40(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 40(%rdi)
+ movb $0, 40(%rdi)
# endif
VZEROUPPER
ret
@@ -2299,8 +2258,7 @@ L(StrncpyExit41):
lea 41(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 41(%rdi)
+ movb $0, 41(%rdi)
# endif
VZEROUPPER
ret
@@ -2316,8 +2274,7 @@ L(StrncpyExit42):
lea 42(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 42(%rdi)
+ movb $0, 42(%rdi)
# endif
VZEROUPPER
ret
@@ -2333,8 +2290,7 @@ L(StrncpyExit43):
lea 43(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 43(%rdi)
+ movb $0, 43(%rdi)
# endif
VZEROUPPER
ret
@@ -2350,8 +2306,7 @@ L(StrncpyExit44):
lea 44(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 44(%rdi)
+ movb $0, 44(%rdi)
# endif
VZEROUPPER
ret
@@ -2367,8 +2322,7 @@ L(StrncpyExit45):
lea 45(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 45(%rdi)
+ movb $0, 45(%rdi)
# endif
VZEROUPPER
ret
@@ -2384,8 +2338,7 @@ L(StrncpyExit46):
lea 46(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 46(%rdi)
+ movb $0, 46(%rdi)
# endif
VZEROUPPER
ret
@@ -2401,8 +2354,7 @@ L(StrncpyExit47):
lea 47(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 47(%rdi)
+ movb $0, 47(%rdi)
# endif
VZEROUPPER
ret
@@ -2418,8 +2370,7 @@ L(StrncpyExit48):
lea 48(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 48(%rdi)
+ movb $0, 48(%rdi)
# endif
VZEROUPPER
ret
@@ -2435,8 +2386,7 @@ L(StrncpyExit49):
lea 49(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 49(%rdi)
+ movb $0, 49(%rdi)
# endif
VZEROUPPER
ret
@@ -2452,8 +2402,7 @@ L(StrncpyExit50):
lea 50(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 50(%rdi)
+ movb $0, 50(%rdi)
# endif
VZEROUPPER
ret
@@ -2469,8 +2418,7 @@ L(StrncpyExit51):
lea 51(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 51(%rdi)
+ movb $0, 51(%rdi)
# endif
VZEROUPPER
ret
@@ -2486,8 +2434,7 @@ L(StrncpyExit52):
lea 52(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 52(%rdi)
+ movb $0, 52(%rdi)
# endif
VZEROUPPER
ret
@@ -2503,8 +2450,7 @@ L(StrncpyExit53):
lea 53(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 53(%rdi)
+ movb $0, 53(%rdi)
# endif
VZEROUPPER
ret
@@ -2520,8 +2466,7 @@ L(StrncpyExit54):
lea 54(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 54(%rdi)
+ movb $0, 54(%rdi)
# endif
VZEROUPPER
ret
@@ -2537,8 +2482,7 @@ L(StrncpyExit55):
lea 55(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 55(%rdi)
+ movb $0, 55(%rdi)
# endif
VZEROUPPER
ret
@@ -2554,8 +2498,7 @@ L(StrncpyExit56):
lea 56(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 56(%rdi)
+ movb $0, 56(%rdi)
# endif
VZEROUPPER
ret
@@ -2571,8 +2514,7 @@ L(StrncpyExit57):
lea 57(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 57(%rdi)
+ movb $0, 57(%rdi)
# endif
VZEROUPPER
ret
@@ -2588,8 +2530,7 @@ L(StrncpyExit58):
lea 58(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 58(%rdi)
+ movb $0, 58(%rdi)
# endif
VZEROUPPER
ret
@@ -2605,8 +2546,7 @@ L(StrncpyExit59):
lea 59(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 59(%rdi)
+ movb $0, 59(%rdi)
# endif
VZEROUPPER
ret
@@ -2623,8 +2563,7 @@ L(StrncpyExit60):
lea 60(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 60(%rdi)
+ movb $0, 60(%rdi)
# endif
VZEROUPPER
ret
@@ -2640,8 +2579,7 @@ L(StrncpyExit61):
lea 61(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 61(%rdi)
+ movb $0, 61(%rdi)
# endif
VZEROUPPER
ret
@@ -2657,8 +2595,7 @@ L(StrncpyExit62):
lea 62(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 62(%rdi)
+ movb $0, 62(%rdi)
# endif
VZEROUPPER
ret
@@ -2674,8 +2611,7 @@ L(StrncpyExit63):
lea 63(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 63(%rdi)
+ movb $0, 63(%rdi)
# endif
VZEROUPPER
ret
@@ -2691,8 +2627,7 @@ L(StrncpyExit64):
lea 64(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 64(%rdi)
+ movb $0, 64(%rdi)
# endif
VZEROUPPER
ret
@@ -2710,8 +2645,7 @@ L(StrncpyExit65):
lea 65(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 65(%rdi)
+ movb $0, 65(%rdi)
# endif
VZEROUPPER
ret
@@ -3034,8 +2968,7 @@ L(UnalignedFourVecSizeLeaveCase3):
lea (VEC_SIZE * 4)(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, (VEC_SIZE * 4)(%rdi)
+ movb $0, (VEC_SIZE * 4)(%rdi)
# endif
VZEROUPPER
ret
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=a551f0943fcc0deac13942d1bd1f3f3d40a1235a
commit a551f0943fcc0deac13942d1bd1f3f3d40a1235a
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Tue Sep 18 11:40:31 2018 -0700
Initial xmmZ/ymmZ: a fixed all-zero vector register
Please update: keep only one
vpxor %xmmZ, %xmmZ, %xmmZ
and remove other vpxor:
vpxor %xmm0, %xmm0, %xmm0
vpxor %xmm1, %xmm1, %xmm1
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 22bd063..222826d 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -44,6 +44,9 @@
# define VZEROUPPER vzeroupper
# endif
+#define xmmZ xmm8
+#define ymmZ ymm8
+
# ifndef USE_AS_STRCAT
.text
@@ -60,6 +63,8 @@ ENTRY (STRCPY)
# endif
+ vpxor %xmmZ, %xmmZ, %xmmZ
+
and $((VEC_SIZE * 4) - 1), %rcx
cmp $(VEC_SIZE * 2), %rcx
jbe L(SourceStringAlignmentLessTwoVecSize)
@@ -69,7 +74,7 @@ ENTRY (STRCPY)
vpxor %xmm0, %xmm0, %xmm0
vpxor %xmm1, %xmm1, %xmm1
- vpcmpeqb (%rsi), %ymm1, %ymm1
+ vpcmpeqb (%rsi), %ymmZ, %ymm1
vpmovmskb %ymm1, %rdx
shr %cl, %rdx
@@ -88,7 +93,7 @@ ENTRY (STRCPY)
test %rdx, %rdx
jnz L(CopyVecSizeTail)
- vpcmpeqb VEC_SIZE(%rsi), %ymm0, %ymm0
+ vpcmpeqb VEC_SIZE(%rsi), %ymmZ, %ymm0
vpmovmskb %ymm0, %rdx
# ifdef USE_AS_STRNCPY
@@ -115,7 +120,7 @@ L(UnalignVecSizeBoth):
vmovdqa (%rsi, %rcx), %ymm1
vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
vmovdqu %ymm1, (%rdi, %rcx)
- vpcmpeqb %ymm2, %ymm0, %ymm0
+ vpcmpeqb %ymm2, %ymmZ, %ymm0
vpmovmskb %ymm0, %rdx
add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
@@ -131,7 +136,7 @@ L(UnalignVecSizeBoth):
vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
vmovdqu %ymm2, (%rdi, %rcx)
- vpcmpeqb %ymm3, %ymm0, %ymm0
+ vpcmpeqb %ymm3, %ymmZ, %ymm0
vpmovmskb %ymm0, %rdx
add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
@@ -147,7 +152,7 @@ L(UnalignVecSizeBoth):
vmovdqa VEC_SIZE(%rsi, %rcx), %ymm4
vmovdqu %ymm3, (%rdi, %rcx)
- vpcmpeqb %ymm4, %ymm0, %ymm0
+ vpcmpeqb %ymm4, %ymmZ, %ymm0
vpmovmskb %ymm0, %rdx
add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
@@ -163,7 +168,7 @@ L(UnalignVecSizeBoth):
vmovdqa VEC_SIZE(%rsi, %rcx), %ymm1
vmovdqu %ymm4, (%rdi, %rcx)
- vpcmpeqb %ymm1, %ymm0, %ymm0
+ vpcmpeqb %ymm1, %ymmZ, %ymm0
vpmovmskb %ymm0, %rdx
add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
@@ -179,7 +184,7 @@ L(UnalignVecSizeBoth):
vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
vmovdqu %ymm1, (%rdi, %rcx)
- vpcmpeqb %ymm2, %ymm0, %ymm0
+ vpcmpeqb %ymm2, %ymmZ, %ymm0
vpmovmskb %ymm0, %rdx
add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
@@ -195,7 +200,7 @@ L(UnalignVecSizeBoth):
vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
vmovdqu %ymm2, (%rdi, %rcx)
- vpcmpeqb %ymm3, %ymm0, %ymm0
+ vpcmpeqb %ymm3, %ymmZ, %ymm0
vpmovmskb %ymm0, %rdx
add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
@@ -259,14 +264,12 @@ L(UnalignedFourVecSizeLoop_start):
jz L(UnalignedFourVecSizeLoop_start)
L(UnalignedFourVecSizeLeave):
- vpxor %xmm1, %xmm1, %xmm1
-
vpcmpeqb %ymm4, %ymm0, %ymm0
vpmovmskb %ymm0, %rdx
test %rdx, %rdx
jnz L(CopyVecSizeUnaligned_0)
- vpcmpeqb %ymm5, %ymm1, %ymm1
+ vpcmpeqb %ymm5, %ymmZ, %ymm1
vpmovmskb %ymm1, %rcx
test %rcx, %rcx
jnz L(CopyVecSizeUnaligned_16)
@@ -276,7 +279,7 @@ L(UnalignedFourVecSizeLeave):
test %rdx, %rdx
jnz L(CopyVecSizeUnaligned_32)
- vpcmpeqb %ymm7, %ymm1, %ymm1
+ vpcmpeqb %ymm7, %ymmZ, %ymm1
vpmovmskb %ymm1, %rcx
bsf %rcx, %rdx
vmovdqu %ymm4, (%rdi)
@@ -300,10 +303,9 @@ L(UnalignedFourVecSizeLeave):
/* If source address alignment == destination address alignment */
L(SourceStringAlignmentLessTwoVecSize):
- vpxor %xmm0, %xmm0, %xmm0
vmovdqu (%rsi), %ymm1
vmovdqu VEC_SIZE(%rsi), %ymm2
- vpcmpeqb %ymm1, %ymm0, %ymm0
+ vpcmpeqb %ymm1, %ymmZ, %ymm0
vpmovmskb %ymm0, %rdx
# ifdef USE_AS_STRNCPY
@@ -317,7 +319,7 @@ L(SourceStringAlignmentLessTwoVecSize):
test %rdx, %rdx
jnz L(CopyVecSizeTail1)
- vpcmpeqb %ymm2, %ymm0, %ymm0
+ vpcmpeqb %ymm2, %ymmZ, %ymm0
vmovdqu %ymm1, (%rdi)
vpmovmskb %ymm0, %rdx
@@ -2815,47 +2817,47 @@ L(Fill14):
.p2align 4
L(Fill15):
- vmovdqu %xmm0, -1(%rdi)
+ vmovdqu %xmmZ, -1(%rdi)
VZEROUPPER
ret
.p2align 4
L(Fill16):
- vmovdqu %xmm0, (%rdi)
+ vmovdqu %xmmZ, (%rdi)
VZEROUPPER
ret
.p2align 4
L(Fill17):
- vmovdqu %xmm0, (%rdi)
+ vmovdqu %xmmZ, (%rdi)
mov %dl, 16(%rdi)
VZEROUPPER
ret
.p2align 4
L(Fill18):
- vmovdqu %xmm0, (%rdi)
+ vmovdqu %xmmZ, (%rdi)
mov %dx, 16(%rdi)
VZEROUPPER
ret
.p2align 4
L(Fill19):
- vmovdqu %xmm0, (%rdi)
+ vmovdqu %xmmZ, (%rdi)
mov %edx, 15(%rdi)
VZEROUPPER
ret
.p2align 4
L(Fill20):
- vmovdqu %xmm0, (%rdi)
+ vmovdqu %xmmZ, (%rdi)
mov %edx, 16(%rdi)
VZEROUPPER
ret
.p2align 4
L(Fill21):
- vmovdqu %xmm0, (%rdi)
+ vmovdqu %xmmZ, (%rdi)
mov %edx, 16(%rdi)
mov %dl, 20(%rdi)
VZEROUPPER
@@ -2863,7 +2865,7 @@ L(Fill21):
.p2align 4
L(Fill22):
- vmovdqu %xmm0, (%rdi)
+ vmovdqu %xmmZ, (%rdi)
mov %edx, 16(%rdi)
mov %dx, 20(%rdi)
VZEROUPPER
@@ -2871,21 +2873,21 @@ L(Fill22):
.p2align 4
L(Fill23):
- vmovdqu %xmm0, (%rdi)
+ vmovdqu %xmmZ, (%rdi)
mov %rdx, 15(%rdi)
VZEROUPPER
ret
.p2align 4
L(Fill24):
- vmovdqu %xmm0, (%rdi)
+ vmovdqu %xmmZ, (%rdi)
mov %rdx, 16(%rdi)
VZEROUPPER
ret
.p2align 4
L(Fill25):
- vmovdqu %xmm0, (%rdi)
+ vmovdqu %xmmZ, (%rdi)
mov %rdx, 16(%rdi)
mov %dl, 24(%rdi)
VZEROUPPER
@@ -2893,7 +2895,7 @@ L(Fill25):
.p2align 4
L(Fill26):
- vmovdqu %xmm0, (%rdi)
+ vmovdqu %xmmZ, (%rdi)
mov %rdx, 16(%rdi)
mov %dx, 24(%rdi)
VZEROUPPER
@@ -2901,7 +2903,7 @@ L(Fill26):
.p2align 4
L(Fill27):
- vmovdqu %xmm0, (%rdi)
+ vmovdqu %xmmZ, (%rdi)
mov %rdx, 16(%rdi)
mov %edx, 23(%rdi)
VZEROUPPER
@@ -2909,7 +2911,7 @@ L(Fill27):
.p2align 4
L(Fill28):
- vmovdqu %xmm0, (%rdi)
+ vmovdqu %xmmZ, (%rdi)
mov %rdx, 16(%rdi)
mov %edx, 24(%rdi)
VZEROUPPER
@@ -2917,7 +2919,7 @@ L(Fill28):
.p2align 4
L(Fill29):
- vmovdqu %xmm0, (%rdi)
+ vmovdqu %xmmZ, (%rdi)
mov %rdx, 16(%rdi)
mov %rdx, 21(%rdi)
VZEROUPPER
@@ -2925,7 +2927,7 @@ L(Fill29):
.p2align 4
L(Fill30):
- vmovdqu %xmm0, (%rdi)
+ vmovdqu %xmmZ, (%rdi)
mov %rdx, 16(%rdi)
mov %rdx, 22(%rdi)
VZEROUPPER
@@ -2933,13 +2935,13 @@ L(Fill30):
.p2align 4
L(Fill31):
- vmovdqu %ymm0, -1(%rdi)
+ vmovdqu %ymmZ, -1(%rdi)
VZEROUPPER
ret
.p2align 4
L(Fill32):
- vmovdqu %ymm0, (%rdi)
+ vmovdqu %ymmZ, (%rdi)
VZEROUPPER
ret
@@ -2960,12 +2962,11 @@ L(CopyVecSizeVecExit):
.p2align 4
L(StrncpyFillTailWithZero):
- vpxor %xmm0, %xmm0, %xmm0
- xor %rdx, %rdx
+ xor %edx, %edx
sub $VEC_SIZE, %r8
jbe L(StrncpyFillExit)
- vmovdqu %ymm0, (%rdi)
+ vmovdqu %ymmZ, (%rdi)
add $VEC_SIZE, %rdi
mov %rdi, %rsi
@@ -2976,10 +2977,10 @@ L(StrncpyFillTailWithZero):
jb L(StrncpyFillLessFourVecSize)
L(StrncpyFillLoopVmovdqa):
- vmovdqa %ymm0, (%rdi)
- vmovdqa %ymm0, VEC_SIZE(%rdi)
- vmovdqa %ymm0, (VEC_SIZE * 2)(%rdi)
- vmovdqa %ymm0, (VEC_SIZE * 3)(%rdi)
+ vmovdqa %ymmZ, (%rdi)
+ vmovdqa %ymmZ, VEC_SIZE(%rdi)
+ vmovdqa %ymmZ, (VEC_SIZE * 2)(%rdi)
+ vmovdqa %ymmZ, (VEC_SIZE * 3)(%rdi)
add $(VEC_SIZE * 4), %rdi
sub $(VEC_SIZE * 4), %r8
jae L(StrncpyFillLoopVmovdqa)
@@ -2987,19 +2988,19 @@ L(StrncpyFillLoopVmovdqa):
L(StrncpyFillLessFourVecSize):
add $(VEC_SIZE * 2), %r8
jl L(StrncpyFillLessTwoVecSize)
- vmovdqa %ymm0, (%rdi)
- vmovdqa %ymm0, VEC_SIZE(%rdi)
+ vmovdqa %ymmZ, (%rdi)
+ vmovdqa %ymmZ, VEC_SIZE(%rdi)
add $(VEC_SIZE * 2), %rdi
sub $VEC_SIZE, %r8
jl L(StrncpyFillExit)
- vmovdqa %ymm0, (%rdi)
+ vmovdqa %ymmZ, (%rdi)
add $VEC_SIZE, %rdi
BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
L(StrncpyFillLessTwoVecSize):
add $VEC_SIZE, %r8
jl L(StrncpyFillExit)
- vmovdqa %ymm0, (%rdi)
+ vmovdqa %ymmZ, (%rdi)
add $VEC_SIZE, %rdi
BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=fc43bfda27f0b0a15c4a590dffcace9d25c06bf8
commit fc43bfda27f0b0a15c4a590dffcace9d25c06bf8
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Tue Sep 18 11:20:51 2018 -0700
Replace 2 load/store with 1 load/store
Please compare performance of 2 load/store vs 1 load/store.
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 2f93cec..22bd063 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -863,10 +863,9 @@ L(Exit20):
.p2align 4
L(Exit21):
vmovdqu (%rsi), %xmm0
- mov 16(%rsi), %ecx
+ mov 13(%rsi), %rcx
vmovdqu %xmm0, (%rdi)
- mov %ecx, 16(%rdi)
- mov %dh, 20(%rdi)
+ mov %rcx, 13(%rdi)
# ifdef USE_AS_STPCPY
lea 20(%rdi), %rax
# endif
@@ -932,10 +931,9 @@ L(Exit24):
.p2align 4
L(Exit25):
vmovdqu (%rsi), %xmm0
- mov 16(%rsi), %rcx
+ vmovdqu 9(%rsi), %xmm1
vmovdqu %xmm0, (%rdi)
- mov %rcx, 16(%rdi)
- mov %dh, 24(%rdi)
+ vmovdqu %xmm1, 9(%rdi)
# ifdef USE_AS_STPCPY
lea 24(%rdi), %rax
# endif
@@ -950,11 +948,9 @@ L(Exit25):
.p2align 4
L(Exit26):
vmovdqu (%rsi), %xmm0
- mov 16(%rsi), %rdx
- mov 24(%rsi), %cx
+ vmovdqu 10(%rsi), %xmm1
vmovdqu %xmm0, (%rdi)
- mov %rdx, 16(%rdi)
- mov %cx, 24(%rdi)
+ vmovdqu %xmm1, 10(%rdi)
# ifdef USE_AS_STPCPY
lea 25(%rdi), %rax
# endif
@@ -969,11 +965,9 @@ L(Exit26):
.p2align 4
L(Exit27):
vmovdqu (%rsi), %xmm0
- mov 16(%rsi), %rdx
- mov 23(%rsi), %ecx
+ vmovdqu 11(%rsi), %xmm1
vmovdqu %xmm0, (%rdi)
- mov %rdx, 16(%rdi)
- mov %ecx, 23(%rdi)
+ vmovdqu %xmm1, 11(%rdi)
# ifdef USE_AS_STPCPY
lea 26(%rdi), %rax
# endif
@@ -988,11 +982,9 @@ L(Exit27):
.p2align 4
L(Exit28):
vmovdqu (%rsi), %xmm0
- mov 16(%rsi), %rdx
- mov 24(%rsi), %ecx
+ vmovdqu 12(%rsi), %xmm1
vmovdqu %xmm0, (%rdi)
- mov %rdx, 16(%rdi)
- mov %ecx, 24(%rdi)
+ vmovdqu %xmm1, 12(%rdi)
# ifdef USE_AS_STPCPY
lea 27(%rdi), %rax
# endif
@@ -1217,10 +1209,9 @@ L(Exit40):
L(Exit41):
/* 0/32, 32/8, 40/1 */
vmovdqu (%rsi), %ymm0
- mov 32(%rsi), %rcx
+ vmovdqu 25(%rsi), %xmm1
vmovdqu %ymm0, (%rdi)
- mov %rcx, 32(%rdi)
- mov %dh, 40(%rdi)
+ vmovdqu %xmm1, 25(%rdi)
# ifdef USE_AS_STPCPY
lea 40(%rdi), %rax
# endif
@@ -1236,11 +1227,9 @@ L(Exit41):
L(Exit42):
/* 0/32, 32/8, 40/2 */
vmovdqu (%rsi), %ymm0
- mov 32(%rsi), %rcx
- mov 40(%rsi), %dx
+ vmovdqu 26(%rsi), %xmm1
vmovdqu %ymm0, (%rdi)
- mov %rcx, 32(%rdi)
- mov %dx, 40(%rdi)
+ vmovdqu %xmm1, 26(%rdi)
# ifdef USE_AS_STPCPY
lea 41(%rdi), %rax
# endif
@@ -1364,10 +1353,9 @@ L(Exit48):
L(Exit49):
/* 0/32, 32/16, 48/1 */
vmovdqu (%rsi), %ymm0
- vmovdqu 32(%rsi), %xmm1
+ vmovdqu 17(%rsi), %ymm1
vmovdqu %ymm0, (%rdi)
- vmovdqu %xmm1, 32(%rdi)
- mov %dh, 48(%rdi)
+ vmovdqu %ymm1, 17(%rdi)
# ifdef USE_AS_STPCPY
lea 48(%rdi), %rax
# endif
@@ -1383,11 +1371,9 @@ L(Exit49):
L(Exit50):
/* 0/32, 32/16, 48/2 */
vmovdqu (%rsi), %ymm0
- vmovdqu 32(%rsi), %xmm1
- mov 48(%rsi), %dx
+ vmovdqu 18(%rsi), %ymm1
vmovdqu %ymm0, (%rdi)
- vmovdqu %xmm1, 32(%rdi)
- mov %dx, 48(%rdi)
+ vmovdqu %ymm1, 18(%rdi)
# ifdef USE_AS_STPCPY
lea 49(%rdi), %rax
# endif
@@ -1403,11 +1389,9 @@ L(Exit50):
L(Exit51):
/* 0/32, 32/16, 47/4 */
vmovdqu (%rsi), %ymm0
- vmovdqu 32(%rsi), %xmm1
- mov 47(%rsi), %edx
+ vmovdqu 19(%rsi), %ymm1
vmovdqu %ymm0, (%rdi)
- vmovdqu %xmm1, 32(%rdi)
- mov %edx, 47(%rdi)
+ vmovdqu %ymm1, 19(%rdi)
# ifdef USE_AS_STPCPY
lea 50(%rdi), %rax
# endif
@@ -1423,11 +1407,9 @@ L(Exit51):
L(Exit52):
/* 0/32, 32/16, 48/4 */
vmovdqu (%rsi), %ymm0
- vmovdqu 32(%rsi), %xmm1
- mov 48(%rsi), %edx
+ vmovdqu 20(%rsi), %ymm1
vmovdqu %ymm0, (%rdi)
- vmovdqu %xmm1, 32(%rdi)
- mov %edx, 48(%rdi)
+ vmovdqu %ymm1, 20(%rdi)
# ifdef USE_AS_STPCPY
lea 51(%rdi), %rax
# endif
@@ -1443,11 +1425,9 @@ L(Exit52):
L(Exit53):
/* 0/32, 32/16, 45/8 */
vmovdqu (%rsi), %ymm0
- vmovdqu 32(%rsi), %xmm1
- mov 45(%rsi), %rdx
+ vmovdqu 21(%rsi), %ymm1
vmovdqu %ymm0, (%rdi)
- vmovdqu %xmm1, 32(%rdi)
- mov %rdx, 45(%rdi)
+ vmovdqu %ymm1, 21(%rdi)
# ifdef USE_AS_STPCPY
lea 52(%rdi), %rax
# endif
@@ -1463,11 +1443,9 @@ L(Exit53):
L(Exit54):
/* 0/32, 32/16, 46/8 */
vmovdqu (%rsi), %ymm0
- vmovdqu 32(%rsi), %xmm1
- mov 46(%rsi), %rdx
+ vmovdqu 22(%rsi), %ymm1
vmovdqu %ymm0, (%rdi)
- vmovdqu %xmm1, 32(%rdi)
- mov %rdx, 46(%rdi)
+ vmovdqu %ymm1, 22(%rdi)
# ifdef USE_AS_STPCPY
lea 53(%rdi), %rax
# endif
@@ -1483,11 +1461,9 @@ L(Exit54):
L(Exit55):
/* 0/32, 32/16, 47/8 */
vmovdqu (%rsi), %ymm0
- vmovdqu 32(%rsi), %xmm1
- mov 47(%rsi), %rdx
+ vmovdqu 23(%rsi), %ymm1
vmovdqu %ymm0, (%rdi)
- vmovdqu %xmm1, 32(%rdi)
- mov %rdx, 47(%rdi)
+ vmovdqu %ymm1, 23(%rdi)
# ifdef USE_AS_STPCPY
lea 54(%rdi), %rax
# endif
@@ -1503,11 +1479,9 @@ L(Exit55):
L(Exit56):
/* 0/32, 32/16, 48/8 */
vmovdqu (%rsi), %ymm0
- vmovdqu 32(%rsi), %xmm1
- mov 48(%rsi), %rdx
+ vmovdqu 24(%rsi), %ymm1
vmovdqu %ymm0, (%rdi)
- vmovdqu %xmm1, 32(%rdi)
- mov %rdx, 48(%rdi)
+ vmovdqu %ymm1, 24(%rdi)
# ifdef USE_AS_STPCPY
lea 55(%rdi), %rax
# endif
@@ -1990,11 +1964,9 @@ L(StrncpyExit20):
.p2align 4
L(StrncpyExit21):
vmovdqu (%rsi), %xmm0
- mov 16(%rsi), %ecx
- mov 20(%rsi), %dl
+ mov 13(%rsi), %rcx
vmovdqu %xmm0, (%rdi)
- mov %ecx, 16(%rdi)
- mov %dl, 20(%rdi)
+ mov %rcx, 13(%rdi)
# ifdef USE_AS_STPCPY
lea 21(%rdi), %rax
# endif
@@ -2056,11 +2028,9 @@ L(StrncpyExit24):
.p2align 4
L(StrncpyExit25):
vmovdqu (%rsi), %xmm0
- mov 16(%rsi), %rdx
- mov 24(%rsi), %cl
+ vmovdqu 9(%rsi), %xmm1
vmovdqu %xmm0, (%rdi)
- mov %rdx, 16(%rdi)
- mov %cl, 24(%rdi)
+ vmovdqu %xmm1, 9(%rdi)
# ifdef USE_AS_STPCPY
lea 25(%rdi), %rax
# endif
@@ -2074,11 +2044,9 @@ L(StrncpyExit25):
.p2align 4
L(StrncpyExit26):
vmovdqu (%rsi), %xmm0
- mov 16(%rsi), %rdx
- mov 24(%rsi), %cx
+ vmovdqu 10(%rsi), %xmm1
vmovdqu %xmm0, (%rdi)
- mov %rdx, 16(%rdi)
- mov %cx, 24(%rdi)
+ vmovdqu %xmm1, 10(%rdi)
# ifdef USE_AS_STPCPY
lea 26(%rdi), %rax
# endif
@@ -2092,11 +2060,9 @@ L(StrncpyExit26):
.p2align 4
L(StrncpyExit27):
vmovdqu (%rsi), %xmm0
- mov 16(%rsi), %rdx
- mov 23(%rsi), %ecx
+ vmovdqu 11(%rsi), %xmm1
vmovdqu %xmm0, (%rdi)
- mov %rdx, 16(%rdi)
- mov %ecx, 23(%rdi)
+ vmovdqu %xmm1, 11(%rdi)
# ifdef USE_AS_STPCPY
lea 27(%rdi), %rax
# endif
@@ -2110,11 +2076,9 @@ L(StrncpyExit27):
.p2align 4
L(StrncpyExit28):
vmovdqu (%rsi), %xmm0
- mov 16(%rsi), %rdx
- mov 24(%rsi), %ecx
+ vmovdqu 12(%rsi), %xmm1
vmovdqu %xmm0, (%rdi)
- mov %rdx, 16(%rdi)
- mov %ecx, 24(%rdi)
+ vmovdqu %xmm1, 12(%rdi)
# ifdef USE_AS_STPCPY
lea 28(%rdi), %rax
# endif
@@ -2326,11 +2290,9 @@ L(StrncpyExit40):
L(StrncpyExit41):
/* 0/32, 32/8, 40/1 */
vmovdqu (%rsi), %ymm0
- mov 32(%rsi), %rdx
- mov 40(%rsi), %cl
+ vmovdqu 25(%rsi), %xmm1
vmovdqu %ymm0, (%rdi)
- mov %rdx, 32(%rdi)
- mov %cl, 40(%rdi)
+ vmovdqu %xmm1, 25(%rdi)
# ifdef USE_AS_STPCPY
lea 41(%rdi), %rax
# endif
@@ -2345,11 +2307,9 @@ L(StrncpyExit41):
L(StrncpyExit42):
/* 0/32, 32/8, 40/2 */
vmovdqu (%rsi), %ymm0
- mov 32(%rsi), %rdx
- mov 40(%rsi), %cx
+ vmovdqu 26(%rsi), %xmm1
vmovdqu %ymm0, (%rdi)
- mov %rdx, 32(%rdi)
- mov %cx, 40(%rdi)
+ vmovdqu %xmm1, 26(%rdi)
# ifdef USE_AS_STPCPY
lea 42(%rdi), %rax
# endif
@@ -2466,11 +2426,9 @@ L(StrncpyExit48):
L(StrncpyExit49):
/* 0/32, 32/16, 48/1 */
vmovdqu (%rsi), %ymm0
- vmovdqu 32(%rsi), %xmm2
- mov 48(%rsi), %cl
+ vmovdqu 17(%rsi), %ymm2
vmovdqu %ymm0, (%rdi)
- vmovdqu %xmm2, 32(%rdi)
- mov %cl, 48(%rdi)
+ vmovdqu %ymm2, 17(%rdi)
# ifdef USE_AS_STPCPY
lea 49(%rdi), %rax
# endif
@@ -2485,11 +2443,9 @@ L(StrncpyExit49):
L(StrncpyExit50):
/* 0/32, 32/16, 48/2 */
vmovdqu (%rsi), %ymm0
- vmovdqu 32(%rsi), %xmm2
- mov 48(%rsi), %cx
+ vmovdqu 18(%rsi), %ymm2
vmovdqu %ymm0, (%rdi)
- vmovdqu %xmm2, 32(%rdi)
- mov %cx, 48(%rdi)
+ vmovdqu %ymm2, 18(%rdi)
# ifdef USE_AS_STPCPY
lea 50(%rdi), %rax
# endif
@@ -2504,11 +2460,9 @@ L(StrncpyExit50):
L(StrncpyExit51):
/* 0/32, 32/16, 47/4 */
vmovdqu (%rsi), %ymm0
- vmovdqu 32(%rsi), %xmm2
- mov 47(%rsi), %ecx
+ vmovdqu 19(%rsi), %ymm2
vmovdqu %ymm0, (%rdi)
- vmovdqu %xmm2, 32(%rdi)
- mov %ecx, 47(%rdi)
+ vmovdqu %ymm2, 19(%rdi)
# ifdef USE_AS_STPCPY
lea 51(%rdi), %rax
# endif
@@ -2523,11 +2477,9 @@ L(StrncpyExit51):
L(StrncpyExit52):
/* 0/32, 32/16, 48/4 */
vmovdqu (%rsi), %ymm0
- vmovdqu 32(%rsi), %xmm2
- mov 48(%rsi), %ecx
+ vmovdqu 20(%rsi), %ymm2
vmovdqu %ymm0, (%rdi)
- vmovdqu %xmm2, 32(%rdi)
- mov %ecx, 48(%rdi)
+ vmovdqu %ymm2, 20(%rdi)
# ifdef USE_AS_STPCPY
lea 52(%rdi), %rax
# endif
@@ -2542,11 +2494,9 @@ L(StrncpyExit52):
L(StrncpyExit53):
/* 0/32, 32/16, 45/8 */
vmovdqu (%rsi), %ymm0
- vmovdqu 32(%rsi), %xmm2
- mov 45(%rsi), %rcx
+ vmovdqu 21(%rsi), %ymm2
vmovdqu %ymm0, (%rdi)
- vmovdqu %xmm2, 32(%rdi)
- mov %rcx, 45(%rdi)
+ vmovdqu %ymm2, 21(%rdi)
# ifdef USE_AS_STPCPY
lea 53(%rdi), %rax
# endif
@@ -2561,11 +2511,9 @@ L(StrncpyExit53):
L(StrncpyExit54):
/* 0/32, 32/16, 46/8 */
vmovdqu (%rsi), %ymm0
- vmovdqu 32(%rsi), %xmm2
- mov 46(%rsi), %rcx
+ vmovdqu 22(%rsi), %ymm2
vmovdqu %ymm0, (%rdi)
- vmovdqu %xmm2, 32(%rdi)
- mov %rcx, 46(%rdi)
+ vmovdqu %ymm2, 22(%rdi)
# ifdef USE_AS_STPCPY
lea 54(%rdi), %rax
# endif
@@ -2580,11 +2528,9 @@ L(StrncpyExit54):
L(StrncpyExit55):
/* 0/32, 32/16, 47/8 */
vmovdqu (%rsi), %ymm0
- vmovdqu 32(%rsi), %xmm2
- mov 47(%rsi), %rcx
+ vmovdqu 23(%rsi), %ymm2
vmovdqu %ymm0, (%rdi)
- vmovdqu %xmm2, 32(%rdi)
- mov %rcx, 47(%rdi)
+ vmovdqu %ymm2, 23(%rdi)
# ifdef USE_AS_STPCPY
lea 55(%rdi), %rax
# endif
@@ -2599,11 +2545,9 @@ L(StrncpyExit55):
L(StrncpyExit56):
/* 0/32, 32/16, 48/8 */
vmovdqu (%rsi), %ymm0
- vmovdqu 32(%rsi), %xmm2
- mov 48(%rsi), %rcx
+ vmovdqu 24(%rsi), %ymm2
vmovdqu %ymm0, (%rdi)
- vmovdqu %xmm2, 32(%rdi)
- mov %rcx, 48(%rdi)
+ vmovdqu %ymm2, 24(%rdi)
# ifdef USE_AS_STPCPY
lea 56(%rdi), %rax
# endif
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=7121fc964d416749df76aaef61b6afd2524a8a90
commit 7121fc964d416749df76aaef61b6afd2524a8a90
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Tue Sep 18 10:14:06 2018 -0700
Replace movdqu with vmovdqu
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 1215bc3..2f93cec 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -780,8 +780,8 @@ L(Exit15):
.p2align 4
L(Exit16):
- movdqu (%rsi), %xmm0
- movdqu %xmm0, (%rdi)
+ vmovdqu (%rsi), %xmm0
+ vmovdqu %xmm0, (%rdi)
# ifdef USE_AS_STPCPY
lea 15(%rdi), %rax
# endif
@@ -795,8 +795,8 @@ L(Exit16):
.p2align 4
L(Exit17):
- movdqu (%rsi), %xmm0
- movdqu %xmm0, (%rdi)
+ vmovdqu (%rsi), %xmm0
+ vmovdqu %xmm0, (%rdi)
mov %dh, 16(%rdi)
# ifdef USE_AS_STPCPY
lea 16(%rdi), %rax
@@ -811,9 +811,9 @@ L(Exit17):
.p2align 4
L(Exit18):
- movdqu (%rsi), %xmm0
+ vmovdqu (%rsi), %xmm0
mov 16(%rsi), %cx
- movdqu %xmm0, (%rdi)
+ vmovdqu %xmm0, (%rdi)
mov %cx, 16(%rdi)
# ifdef USE_AS_STPCPY
lea 17(%rdi), %rax
@@ -828,9 +828,9 @@ L(Exit18):
.p2align 4
L(Exit19):
- movdqu (%rsi), %xmm0
+ vmovdqu (%rsi), %xmm0
mov 15(%rsi), %ecx
- movdqu %xmm0, (%rdi)
+ vmovdqu %xmm0, (%rdi)
mov %ecx, 15(%rdi)
# ifdef USE_AS_STPCPY
lea 18(%rdi), %rax
@@ -845,9 +845,9 @@ L(Exit19):
.p2align 4
L(Exit20):
- movdqu (%rsi), %xmm0
+ vmovdqu (%rsi), %xmm0
mov 16(%rsi), %ecx
- movdqu %xmm0, (%rdi)
+ vmovdqu %xmm0, (%rdi)
mov %ecx, 16(%rdi)
# ifdef USE_AS_STPCPY
lea 19(%rdi), %rax
@@ -862,9 +862,9 @@ L(Exit20):
.p2align 4
L(Exit21):
- movdqu (%rsi), %xmm0
+ vmovdqu (%rsi), %xmm0
mov 16(%rsi), %ecx
- movdqu %xmm0, (%rdi)
+ vmovdqu %xmm0, (%rdi)
mov %ecx, 16(%rdi)
mov %dh, 20(%rdi)
# ifdef USE_AS_STPCPY
@@ -880,9 +880,9 @@ L(Exit21):
.p2align 4
L(Exit22):
- movdqu (%rsi), %xmm0
+ vmovdqu (%rsi), %xmm0
mov 14(%rsi), %rcx
- movdqu %xmm0, (%rdi)
+ vmovdqu %xmm0, (%rdi)
mov %rcx, 14(%rdi)
# ifdef USE_AS_STPCPY
lea 21(%rdi), %rax
@@ -897,9 +897,9 @@ L(Exit22):
.p2align 4
L(Exit23):
- movdqu (%rsi), %xmm0
+ vmovdqu (%rsi), %xmm0
mov 15(%rsi), %rcx
- movdqu %xmm0, (%rdi)
+ vmovdqu %xmm0, (%rdi)
mov %rcx, 15(%rdi)
# ifdef USE_AS_STPCPY
lea 22(%rdi), %rax
@@ -914,9 +914,9 @@ L(Exit23):
.p2align 4
L(Exit24):
- movdqu (%rsi), %xmm0
+ vmovdqu (%rsi), %xmm0
mov 16(%rsi), %rcx
- movdqu %xmm0, (%rdi)
+ vmovdqu %xmm0, (%rdi)
mov %rcx, 16(%rdi)
# ifdef USE_AS_STPCPY
lea 23(%rdi), %rax
@@ -931,9 +931,9 @@ L(Exit24):
.p2align 4
L(Exit25):
- movdqu (%rsi), %xmm0
+ vmovdqu (%rsi), %xmm0
mov 16(%rsi), %rcx
- movdqu %xmm0, (%rdi)
+ vmovdqu %xmm0, (%rdi)
mov %rcx, 16(%rdi)
mov %dh, 24(%rdi)
# ifdef USE_AS_STPCPY
@@ -949,10 +949,10 @@ L(Exit25):
.p2align 4
L(Exit26):
- movdqu (%rsi), %xmm0
+ vmovdqu (%rsi), %xmm0
mov 16(%rsi), %rdx
mov 24(%rsi), %cx
- movdqu %xmm0, (%rdi)
+ vmovdqu %xmm0, (%rdi)
mov %rdx, 16(%rdi)
mov %cx, 24(%rdi)
# ifdef USE_AS_STPCPY
@@ -968,10 +968,10 @@ L(Exit26):
.p2align 4
L(Exit27):
- movdqu (%rsi), %xmm0
+ vmovdqu (%rsi), %xmm0
mov 16(%rsi), %rdx
mov 23(%rsi), %ecx
- movdqu %xmm0, (%rdi)
+ vmovdqu %xmm0, (%rdi)
mov %rdx, 16(%rdi)
mov %ecx, 23(%rdi)
# ifdef USE_AS_STPCPY
@@ -987,10 +987,10 @@ L(Exit27):
.p2align 4
L(Exit28):
- movdqu (%rsi), %xmm0
+ vmovdqu (%rsi), %xmm0
mov 16(%rsi), %rdx
mov 24(%rsi), %ecx
- movdqu %xmm0, (%rdi)
+ vmovdqu %xmm0, (%rdi)
mov %rdx, 16(%rdi)
mov %ecx, 24(%rdi)
# ifdef USE_AS_STPCPY
@@ -1006,10 +1006,10 @@ L(Exit28):
.p2align 4
L(Exit29):
- movdqu (%rsi), %xmm0
- movdqu 13(%rsi), %xmm2
- movdqu %xmm0, (%rdi)
- movdqu %xmm2, 13(%rdi)
+ vmovdqu (%rsi), %xmm0
+ vmovdqu 13(%rsi), %xmm2
+ vmovdqu %xmm0, (%rdi)
+ vmovdqu %xmm2, 13(%rdi)
# ifdef USE_AS_STPCPY
lea 28(%rdi), %rax
# endif
@@ -1023,10 +1023,10 @@ L(Exit29):
.p2align 4
L(Exit30):
- movdqu (%rsi), %xmm0
- movdqu 14(%rsi), %xmm2
- movdqu %xmm0, (%rdi)
- movdqu %xmm2, 14(%rdi)
+ vmovdqu (%rsi), %xmm0
+ vmovdqu 14(%rsi), %xmm2
+ vmovdqu %xmm0, (%rdi)
+ vmovdqu %xmm2, 14(%rdi)
# ifdef USE_AS_STPCPY
lea 29(%rdi), %rax
# endif
@@ -1040,10 +1040,10 @@ L(Exit30):
.p2align 4
L(Exit31):
- movdqu (%rsi), %xmm0
- movdqu 15(%rsi), %xmm2
- movdqu %xmm0, (%rdi)
- movdqu %xmm2, 15(%rdi)
+ vmovdqu (%rsi), %xmm0
+ vmovdqu 15(%rsi), %xmm2
+ vmovdqu %xmm0, (%rdi)
+ vmovdqu %xmm2, 15(%rdi)
# ifdef USE_AS_STPCPY
lea 30(%rdi), %rax
# endif
@@ -1057,10 +1057,8 @@ L(Exit31):
.p2align 4
L(Exit32):
- movdqu (%rsi), %xmm0
- movdqu 16(%rsi), %xmm2
- movdqu %xmm0, (%rdi)
- movdqu %xmm2, 16(%rdi)
+ vmovdqu (%rsi), %ymm0
+ vmovdqu %ymm0, (%rdi)
# ifdef USE_AS_STPCPY
lea 31(%rdi), %rax
# endif
@@ -1913,8 +1911,8 @@ L(StrncpyExit15):
.p2align 4
L(StrncpyExit16):
- movdqu (%rsi), %xmm0
- movdqu %xmm0, (%rdi)
+ vmovdqu (%rsi), %xmm0
+ vmovdqu %xmm0, (%rdi)
# ifdef USE_AS_STPCPY
lea 16(%rdi), %rax
# endif
@@ -1927,9 +1925,9 @@ L(StrncpyExit16):
.p2align 4
L(StrncpyExit17):
- movdqu (%rsi), %xmm0
+ vmovdqu (%rsi), %xmm0
mov 16(%rsi), %cl
- movdqu %xmm0, (%rdi)
+ vmovdqu %xmm0, (%rdi)
mov %cl, 16(%rdi)
# ifdef USE_AS_STPCPY
lea 17(%rdi), %rax
@@ -1943,9 +1941,9 @@ L(StrncpyExit17):
.p2align 4
L(StrncpyExit18):
- movdqu (%rsi), %xmm0
+ vmovdqu (%rsi), %xmm0
mov 16(%rsi), %cx
- movdqu %xmm0, (%rdi)
+ vmovdqu %xmm0, (%rdi)
mov %cx, 16(%rdi)
# ifdef USE_AS_STPCPY
lea 18(%rdi), %rax
@@ -1959,9 +1957,9 @@ L(StrncpyExit18):
.p2align 4
L(StrncpyExit19):
- movdqu (%rsi), %xmm0
+ vmovdqu (%rsi), %xmm0
mov 15(%rsi), %ecx
- movdqu %xmm0, (%rdi)
+ vmovdqu %xmm0, (%rdi)
mov %ecx, 15(%rdi)
# ifdef USE_AS_STPCPY
lea 19(%rdi), %rax
@@ -1975,9 +1973,9 @@ L(StrncpyExit19):
.p2align 4
L(StrncpyExit20):
- movdqu (%rsi), %xmm0
+ vmovdqu (%rsi), %xmm0
mov 16(%rsi), %ecx
- movdqu %xmm0, (%rdi)
+ vmovdqu %xmm0, (%rdi)
mov %ecx, 16(%rdi)
# ifdef USE_AS_STPCPY
lea 20(%rdi), %rax
@@ -1991,10 +1989,10 @@ L(StrncpyExit20):
.p2align 4
L(StrncpyExit21):
- movdqu (%rsi), %xmm0
+ vmovdqu (%rsi), %xmm0
mov 16(%rsi), %ecx
mov 20(%rsi), %dl
- movdqu %xmm0, (%rdi)
+ vmovdqu %xmm0, (%rdi)
mov %ecx, 16(%rdi)
mov %dl, 20(%rdi)
# ifdef USE_AS_STPCPY
@@ -2009,9 +2007,9 @@ L(StrncpyExit21):
.p2align 4
L(StrncpyExit22):
- movdqu (%rsi), %xmm0
+ vmovdqu (%rsi), %xmm0
mov 14(%rsi), %rcx
- movdqu %xmm0, (%rdi)
+ vmovdqu %xmm0, (%rdi)
mov %rcx, 14(%rdi)
# ifdef USE_AS_STPCPY
lea 22(%rdi), %rax
@@ -2025,9 +2023,9 @@ L(StrncpyExit22):
.p2align 4
L(StrncpyExit23):
- movdqu (%rsi), %xmm0
+ vmovdqu (%rsi), %xmm0
mov 15(%rsi), %rcx
- movdqu %xmm0, (%rdi)
+ vmovdqu %xmm0, (%rdi)
mov %rcx, 15(%rdi)
# ifdef USE_AS_STPCPY
lea 23(%rdi), %rax
@@ -2041,9 +2039,9 @@ L(StrncpyExit23):
.p2align 4
L(StrncpyExit24):
- movdqu (%rsi), %xmm0
+ vmovdqu (%rsi), %xmm0
mov 16(%rsi), %rcx
- movdqu %xmm0, (%rdi)
+ vmovdqu %xmm0, (%rdi)
mov %rcx, 16(%rdi)
# ifdef USE_AS_STPCPY
lea 24(%rdi), %rax
@@ -2057,10 +2055,10 @@ L(StrncpyExit24):
.p2align 4
L(StrncpyExit25):
- movdqu (%rsi), %xmm0
+ vmovdqu (%rsi), %xmm0
mov 16(%rsi), %rdx
mov 24(%rsi), %cl
- movdqu %xmm0, (%rdi)
+ vmovdqu %xmm0, (%rdi)
mov %rdx, 16(%rdi)
mov %cl, 24(%rdi)
# ifdef USE_AS_STPCPY
@@ -2075,10 +2073,10 @@ L(StrncpyExit25):
.p2align 4
L(StrncpyExit26):
- movdqu (%rsi), %xmm0
+ vmovdqu (%rsi), %xmm0
mov 16(%rsi), %rdx
mov 24(%rsi), %cx
- movdqu %xmm0, (%rdi)
+ vmovdqu %xmm0, (%rdi)
mov %rdx, 16(%rdi)
mov %cx, 24(%rdi)
# ifdef USE_AS_STPCPY
@@ -2093,10 +2091,10 @@ L(StrncpyExit26):
.p2align 4
L(StrncpyExit27):
- movdqu (%rsi), %xmm0
+ vmovdqu (%rsi), %xmm0
mov 16(%rsi), %rdx
mov 23(%rsi), %ecx
- movdqu %xmm0, (%rdi)
+ vmovdqu %xmm0, (%rdi)
mov %rdx, 16(%rdi)
mov %ecx, 23(%rdi)
# ifdef USE_AS_STPCPY
@@ -2111,10 +2109,10 @@ L(StrncpyExit27):
.p2align 4
L(StrncpyExit28):
- movdqu (%rsi), %xmm0
+ vmovdqu (%rsi), %xmm0
mov 16(%rsi), %rdx
mov 24(%rsi), %ecx
- movdqu %xmm0, (%rdi)
+ vmovdqu %xmm0, (%rdi)
mov %rdx, 16(%rdi)
mov %ecx, 24(%rdi)
# ifdef USE_AS_STPCPY
@@ -2129,10 +2127,10 @@ L(StrncpyExit28):
.p2align 4
L(StrncpyExit29):
- movdqu (%rsi), %xmm0
- movdqu 13(%rsi), %xmm2
- movdqu %xmm0, (%rdi)
- movdqu %xmm2, 13(%rdi)
+ vmovdqu (%rsi), %xmm0
+ vmovdqu 13(%rsi), %xmm2
+ vmovdqu %xmm0, (%rdi)
+ vmovdqu %xmm2, 13(%rdi)
# ifdef USE_AS_STPCPY
lea 29(%rdi), %rax
# endif
@@ -2145,10 +2143,10 @@ L(StrncpyExit29):
.p2align 4
L(StrncpyExit30):
- movdqu (%rsi), %xmm0
- movdqu 14(%rsi), %xmm2
- movdqu %xmm0, (%rdi)
- movdqu %xmm2, 14(%rdi)
+ vmovdqu (%rsi), %xmm0
+ vmovdqu 14(%rsi), %xmm2
+ vmovdqu %xmm0, (%rdi)
+ vmovdqu %xmm2, 14(%rdi)
# ifdef USE_AS_STPCPY
lea 30(%rdi), %rax
# endif
@@ -2161,10 +2159,10 @@ L(StrncpyExit30):
.p2align 4
L(StrncpyExit31):
- movdqu (%rsi), %xmm0
- movdqu 15(%rsi), %xmm2
- movdqu %xmm0, (%rdi)
- movdqu %xmm2, 15(%rdi)
+ vmovdqu (%rsi), %xmm0
+ vmovdqu 15(%rsi), %xmm2
+ vmovdqu %xmm0, (%rdi)
+ vmovdqu %xmm2, 15(%rdi)
# ifdef USE_AS_STPCPY
lea 31(%rdi), %rax
# endif
@@ -2177,10 +2175,8 @@ L(StrncpyExit31):
.p2align 4
L(StrncpyExit32):
- movdqu (%rsi), %xmm0
- movdqu 16(%rsi), %xmm2
- movdqu %xmm0, (%rdi)
- movdqu %xmm2, 16(%rdi)
+ vmovdqu (%rsi), %ymm0
+ vmovdqu %ymm0, (%rdi)
# ifdef USE_AS_STPCPY
lea 32(%rdi), %rax
# endif
@@ -2193,11 +2189,9 @@ L(StrncpyExit32):
.p2align 4
L(StrncpyExit33):
- movdqu (%rsi), %xmm0
- movdqu 16(%rsi), %xmm2
+ vmovdqu (%rsi), %ymm0
mov 32(%rsi), %cl
- movdqu %xmm0, (%rdi)
- movdqu %xmm2, 16(%rdi)
+ vmovdqu %ymm0, (%rdi)
mov %cl, 32(%rdi)
# ifdef USE_AS_STPCPY
lea 33(%rdi), %rax
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=6a8074db808d8ede7ee3837611963f06ecad90ab
commit 6a8074db808d8ede7ee3837611963f06ecad90ab
Author: Leonardo Sandoval <leonardo.sandoval.gonzalez@linux.intel.com>
Date: Fri Jul 27 10:16:13 2018 -0500
x86-64: Optimize strcat/strncat, strcpy/strncpy and stpcpy/stpncpy with AVX2
Optimize x86-64 strcat/strncat, strcpy/strncpy and stpcpy/stpncpy with AVX2.
It uses vector comparison as much as possible. In general, the larger the
source string, the greater performance gain observed (expected because AVX2
uses 256-bit registers), reaching speedups of 1.5x at 512-bytes lengths but
gains (>1x) start even in strings as short as 20-bytes. Select AVX2
strcat/strncat, strcpy/strncpy and stpcpy/stpncpy on AVX2 machines where
vzeroupper is preferred and AVX unaligned load is fast.
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
strcat-avx2, strncat-avx2, strcpy-avx2, strncpy-avx2,
stpcpy-avx2 and stpncpy-avx2.
* sysdeps/x86_64/multiarch/ifunc-impl-list.c:
(__libc_ifunc_impl_list): Add tests for __strcat_avx2,
__strncat_avx2, __strcpy_avx2, __strncpy_avx2, __stpcpy_avx2
and __stpncpy_avx2.
* sysdeps/x86_64/multiarch/{ifunc-unaligned-ssse3.h =>
ifunc-unaligned.h}: rename header for a more generic name.
* sysdeps/x86_64/multiarch/ifunc-unaligned.h:
(IFUNC_SELECTOR): Return OPTIMIZE (avx2) on AVX 2 machines if
AVX unaligned load is fast and vzeroupper is preferred.
* sysdeps/x86_64/multiarch/stpcpy-avx2.S: New file
* sysdeps/x86_64/multiarch/stpncpy-avx2.S: Likewise
* sysdeps/x86_64/multiarch/strcat-avx2.S: Likewise
* sysdeps/x86_64/multiarch/strcpy-avx2.S: Likewise
* sysdeps/x86_64/multiarch/strncat-avx2.S: Likewise
* sysdeps/x86_64/multiarch/strncpy-avx2.S: Likewise
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index bb5e970..395e432 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -24,11 +24,14 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
strchr-sse2 strchrnul-sse2 strchr-avx2 strchrnul-avx2 \
strrchr-sse2 strrchr-avx2 \
strlen-sse2 strnlen-sse2 strlen-avx2 strnlen-avx2 \
+ strcat-avx2 strncat-avx2 \
strcat-ssse3 strncat-ssse3\
+ strcpy-avx2 strncpy-avx2 \
strcpy-sse2 stpcpy-sse2 \
strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
strcpy-sse2-unaligned strncpy-sse2-unaligned \
stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
+ stpcpy-avx2 stpncpy-avx2 \
strcat-sse2 \
strcat-sse2-unaligned strncat-sse2-unaligned \
strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 9aaaef7..950bd9e 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -199,6 +199,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL (i, name, stpncpy,
IFUNC_IMPL_ADD (array, i, stpncpy, HAS_CPU_FEATURE (SSSE3),
__stpncpy_ssse3)
+ IFUNC_IMPL_ADD (array, i, stpncpy, HAS_ARCH_FEATURE (AVX2_Usable),
+ __stpncpy_avx2)
IFUNC_IMPL_ADD (array, i, stpncpy, 1,
__stpncpy_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_sse2))
@@ -207,6 +209,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL (i, name, stpcpy,
IFUNC_IMPL_ADD (array, i, stpcpy, HAS_CPU_FEATURE (SSSE3),
__stpcpy_ssse3)
+ IFUNC_IMPL_ADD (array, i, stpcpy, HAS_ARCH_FEATURE (AVX2_Usable),
+ __stpcpy_avx2)
IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2))
@@ -239,6 +243,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strcat.c. */
IFUNC_IMPL (i, name, strcat,
+ IFUNC_IMPL_ADD (array, i, strcat, HAS_ARCH_FEATURE (AVX2_Usable),
+ __strcat_avx2)
IFUNC_IMPL_ADD (array, i, strcat, HAS_CPU_FEATURE (SSSE3),
__strcat_ssse3)
IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2_unaligned)
@@ -280,6 +286,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strcpy.c. */
IFUNC_IMPL (i, name, strcpy,
+ IFUNC_IMPL_ADD (array, i, strcpy, HAS_ARCH_FEATURE (AVX2_Usable),
+ __strcpy_avx2)
IFUNC_IMPL_ADD (array, i, strcpy, HAS_CPU_FEATURE (SSSE3),
__strcpy_ssse3)
IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2_unaligned)
@@ -321,6 +329,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strncat.c. */
IFUNC_IMPL (i, name, strncat,
+ IFUNC_IMPL_ADD (array, i, strncat, HAS_ARCH_FEATURE (AVX2_Usable),
+ __strncat_avx2)
IFUNC_IMPL_ADD (array, i, strncat, HAS_CPU_FEATURE (SSSE3),
__strncat_ssse3)
IFUNC_IMPL_ADD (array, i, strncat, 1,
@@ -329,6 +339,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strncpy.c. */
IFUNC_IMPL (i, name, strncpy,
+ IFUNC_IMPL_ADD (array, i, strncpy, HAS_ARCH_FEATURE (AVX2_Usable),
+ __strncpy_avx2)
IFUNC_IMPL_ADD (array, i, strncpy, HAS_CPU_FEATURE (SSSE3),
__strncpy_ssse3)
IFUNC_IMPL_ADD (array, i, strncpy, 1,
diff --git a/sysdeps/x86_64/multiarch/ifunc-unaligned-ssse3.h b/sysdeps/x86_64/multiarch/ifunc-unaligned.h
similarity index 83%
rename from sysdeps/x86_64/multiarch/ifunc-unaligned-ssse3.h
rename to sysdeps/x86_64/multiarch/ifunc-unaligned.h
index 81805f9..4f2286f 100644
--- a/sysdeps/x86_64/multiarch/ifunc-unaligned-ssse3.h
+++ b/sysdeps/x86_64/multiarch/ifunc-unaligned.h
@@ -24,12 +24,18 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
static inline void *
IFUNC_SELECTOR (void)
{
const struct cpu_features* cpu_features = __get_cpu_features ();
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
+ && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)
+ && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+ return OPTIMIZE (avx2);
+
if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
return OPTIMIZE (sse2_unaligned);
diff --git a/sysdeps/x86_64/multiarch/stpcpy-avx2.S b/sysdeps/x86_64/multiarch/stpcpy-avx2.S
new file mode 100644
index 0000000..f0bd302
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/stpcpy-avx2.S
@@ -0,0 +1,3 @@
+#define USE_AS_STPCPY
+#define STRCPY __stpcpy_avx2
+#include "strcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/stpcpy.c b/sysdeps/x86_64/multiarch/stpcpy.c
index 1e340fc..5f0e63c 100644
--- a/sysdeps/x86_64/multiarch/stpcpy.c
+++ b/sysdeps/x86_64/multiarch/stpcpy.c
@@ -28,7 +28,7 @@
# undef __stpcpy
# define SYMBOL_NAME stpcpy
-# include "ifunc-unaligned-ssse3.h"
+# include "ifunc-unaligned.h"
libc_ifunc_redirected (__redirect_stpcpy, __stpcpy, IFUNC_SELECTOR ());
diff --git a/sysdeps/x86_64/multiarch/stpncpy-avx2.S b/sysdeps/x86_64/multiarch/stpncpy-avx2.S
new file mode 100644
index 0000000..032b040
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/stpncpy-avx2.S
@@ -0,0 +1,4 @@
+#define USE_AS_STPCPY
+#define USE_AS_STRNCPY
+#define STRCPY __stpncpy_avx2
+#include "strcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/stpncpy.c b/sysdeps/x86_64/multiarch/stpncpy.c
index 28842ec..f87df0c 100644
--- a/sysdeps/x86_64/multiarch/stpncpy.c
+++ b/sysdeps/x86_64/multiarch/stpncpy.c
@@ -26,7 +26,7 @@
# undef __stpncpy
# define SYMBOL_NAME stpncpy
-# include "ifunc-unaligned-ssse3.h"
+# include "ifunc-unaligned.h"
libc_ifunc_redirected (__redirect_stpncpy, __stpncpy, IFUNC_SELECTOR ());
diff --git a/sysdeps/x86_64/multiarch/strcat-avx2.S b/sysdeps/x86_64/multiarch/strcat-avx2.S
new file mode 100644
index 0000000..94c2a7a
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcat-avx2.S
@@ -0,0 +1,278 @@
+/* strcat with AVX2
+ Copyright (C) 2011-2018 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef STRCAT
+# define STRCAT __strcat_avx2
+# endif
+
+# define USE_AS_STRCAT
+
+/* Number of bytes in a vector register */
+# define VEC_SIZE 32
+
+.text
+ENTRY (STRCAT)
+ mov %rdi, %r9
+# ifdef USE_AS_STRNCAT
+ mov %rdx, %r8
+# endif
+
+/* Inline corresponding strlen file, temporary until new strcpy
+ implementation gets merged. */
+
+ xor %eax, %eax
+ mov %edi, %ecx
+ and $((VEC_SIZE * 4) - 1), %ecx
+ vpxor %xmm6, %xmm6, %xmm6
+ cmp $(VEC_SIZE * 3), %ecx
+ ja L(fourth_vector_boundary)
+ vpcmpeqb (%rdi), %ymm6, %ymm0
+ vpmovmskb %ymm0, %edx
+ test %edx, %edx
+ jnz L(exit_null_on_first_vector)
+ mov %rdi, %rax
+ and $-VEC_SIZE, %rax
+ jmp L(align_vec_size_start)
+L(fourth_vector_boundary):
+ mov %rdi, %rax
+ and $-VEC_SIZE, %rax
+ vpcmpeqb (%rax), %ymm6, %ymm0
+ mov $-1, %r10d
+ sub %rax, %rcx
+ shl %cl, %r10d
+ vpmovmskb %ymm0, %edx
+ and %r10d, %edx
+ jnz L(exit)
+
+L(align_vec_size_start):
+ vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm0
+ vpmovmskb %ymm0, %edx
+ test %edx, %edx
+ jnz L(exit_null_on_second_vector)
+
+ vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
+ vpmovmskb %ymm1, %edx
+ test %edx, %edx
+ jnz L(exit_null_on_third_vector)
+
+ vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
+ vpmovmskb %ymm2, %edx
+ test %edx, %edx
+ jnz L(exit_null_on_fourth_vector)
+
+ vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
+ vpmovmskb %ymm3, %edx
+ test %edx, %edx
+ jnz L(exit_null_on_fifth_vector)
+
+ vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
+ add $(VEC_SIZE * 4), %rax
+ vpmovmskb %ymm0, %edx
+ test %edx, %edx
+ jnz L(exit_null_on_second_vector)
+
+ vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
+ vpmovmskb %ymm1, %edx
+ test %edx, %edx
+ jnz L(exit_null_on_third_vector)
+
+ vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
+ vpmovmskb %ymm2, %edx
+ test %edx, %edx
+ jnz L(exit_null_on_fourth_vector)
+
+ vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
+ vpmovmskb %ymm3, %edx
+ test %edx, %edx
+ jnz L(exit_null_on_fifth_vector)
+
+ vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
+ add $(VEC_SIZE * 4), %rax
+ vpmovmskb %ymm0, %edx
+ test %edx, %edx
+ jnz L(exit_null_on_second_vector)
+
+ vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
+ vpmovmskb %ymm1, %edx
+ test %edx, %edx
+ jnz L(exit_null_on_third_vector)
+
+ vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
+ vpmovmskb %ymm2, %edx
+ test %edx, %edx
+ jnz L(exit_null_on_fourth_vector)
+
+ vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
+ vpmovmskb %ymm3, %edx
+ test %edx, %edx
+ jnz L(exit_null_on_fifth_vector)
+
+ vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
+ add $(VEC_SIZE * 4), %rax
+ vpmovmskb %ymm0, %edx
+ test %edx, %edx
+ jnz L(exit_null_on_second_vector)
+
+ vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
+ vpmovmskb %ymm1, %edx
+ test %edx, %edx
+ jnz L(exit_null_on_third_vector)
+
+ vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
+ vpmovmskb %ymm2, %edx
+ test %edx, %edx
+ jnz L(exit_null_on_fourth_vector)
+
+ vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
+ vpmovmskb %ymm3, %edx
+ test %edx, %edx
+ jnz L(exit_null_on_fifth_vector)
+
+ test $((VEC_SIZE * 4) - 1), %rax
+ jz L(align_four_vec_loop)
+
+ vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
+ add $(VEC_SIZE * 5), %rax
+ vpmovmskb %ymm0, %edx
+ test %edx, %edx
+ jnz L(exit)
+
+ test $((VEC_SIZE * 4) - 1), %rax
+ jz L(align_four_vec_loop)
+
+ vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm1
+ add $VEC_SIZE, %rax
+ vpmovmskb %ymm1, %edx
+ test %edx, %edx
+ jnz L(exit)
+
+ test $((VEC_SIZE * 4) - 1), %rax
+ jz L(align_four_vec_loop)
+
+ vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm2
+ add $VEC_SIZE, %rax
+ vpmovmskb %ymm2, %edx
+ test %edx, %edx
+ jnz L(exit)
+
+ test $((VEC_SIZE * 4) - 1), %rax
+ jz L(align_four_vec_loop)
+
+ vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm3
+ add $VEC_SIZE, %rax
+ vpmovmskb %ymm3, %edx
+ test %edx, %edx
+ jnz L(exit)
+
+ add $VEC_SIZE, %rax
+
+ .p2align 4
+L(align_four_vec_loop):
+ vmovaps (%rax), %ymm4
+ vpminub VEC_SIZE(%rax), %ymm4, %ymm4
+ vmovaps (VEC_SIZE * 2)(%rax), %ymm5
+ vpminub (VEC_SIZE * 3)(%rax), %ymm5, %ymm5
+ add $(VEC_SIZE * 4), %rax
+ vpminub %ymm4, %ymm5, %ymm5
+ vpcmpeqb %ymm5, %ymm6, %ymm5
+ vpmovmskb %ymm5, %edx
+ test %edx, %edx
+ jz L(align_four_vec_loop)
+
+ vpcmpeqb -(VEC_SIZE * 4)(%rax), %ymm6, %ymm0
+ sub $(VEC_SIZE * 5), %rax
+ vpmovmskb %ymm0, %edx
+ test %edx, %edx
+ jnz L(exit_null_on_second_vector)
+
+ vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
+ vpmovmskb %ymm1, %edx
+ test %edx, %edx
+ jnz L(exit_null_on_third_vector)
+
+ vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
+ vpmovmskb %ymm2, %edx
+ test %edx, %edx
+ jnz L(exit_null_on_fourth_vector)
+
+ vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
+ vpmovmskb %ymm3, %edx
+ sub %rdi, %rax
+ bsf %rdx, %rdx
+ add %rdx, %rax
+ add $(VEC_SIZE * 4), %rax
+ jmp L(StartStrcpyPart)
+
+ .p2align 4
+L(exit):
+ sub %rdi, %rax
+L(exit_null_on_first_vector):
+ bsf %rdx, %rdx
+ add %rdx, %rax
+ jmp L(StartStrcpyPart)
+
+ .p2align 4
+L(exit_null_on_second_vector):
+ sub %rdi, %rax
+ bsf %rdx, %rdx
+ add %rdx, %rax
+ add $VEC_SIZE, %rax
+ jmp L(StartStrcpyPart)
+
+ .p2align 4
+L(exit_null_on_third_vector):
+ sub %rdi, %rax
+ bsf %rdx, %rdx
+ add %rdx, %rax
+ add $(VEC_SIZE * 2), %rax
+ jmp L(StartStrcpyPart)
+
+ .p2align 4
+L(exit_null_on_fourth_vector):
+ sub %rdi, %rax
+ bsf %rdx, %rdx
+ add %rdx, %rax
+ add $(VEC_SIZE * 3), %rax
+ jmp L(StartStrcpyPart)
+
+ .p2align 4
+L(exit_null_on_fifth_vector):
+ sub %rdi, %rax
+ bsf %rdx, %rdx
+ add %rdx, %rax
+ add $(VEC_SIZE * 4), %rax
+
+ .p2align 4
+L(StartStrcpyPart):
+ lea (%r9, %rax), %rdi
+ mov %rsi, %rcx
+ mov %r9, %rax /* save result */
+
+# ifdef USE_AS_STRNCAT
+ test %r8, %r8
+ jz L(ExitZero)
+# define USE_AS_STRNCPY
+# endif
+
+# include "strcpy-avx2.S"
+#endif
diff --git a/sysdeps/x86_64/multiarch/strcat.c b/sysdeps/x86_64/multiarch/strcat.c
index 1f7f626..4b41e3b 100644
--- a/sysdeps/x86_64/multiarch/strcat.c
+++ b/sysdeps/x86_64/multiarch/strcat.c
@@ -24,7 +24,7 @@
# undef strcat
# define SYMBOL_NAME strcat
-# include "ifunc-unaligned-ssse3.h"
+# include "ifunc-unaligned.h"
libc_ifunc_redirected (__redirect_strcat, strcat, IFUNC_SELECTOR ());
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
new file mode 100644
index 0000000..1215bc3
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -0,0 +1,3341 @@
+/* strcpy with AVX2
+ Copyright (C) 2011-2018 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# ifndef USE_AS_STRCAT
+# include <sysdep.h>
+
+# ifndef STRCPY
+# define STRCPY __strcpy_avx2
+# endif
+
+# endif
+
+# define JMPTBL(I, B) I - B
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+ lea TABLE(%rip), %r11; \
+ movslq (%r11, INDEX, SCALE), %rcx; \
+ lea (%r11, %rcx), %rcx; \
+ _CET_NOTRACK jmp *%rcx
+
+/* Number of bytes in a vector register */
+# ifndef VEC_SIZE
+# define VEC_SIZE 32
+# endif
+
+# ifndef VZEROUPPER
+# define VZEROUPPER vzeroupper
+# endif
+
+# ifndef USE_AS_STRCAT
+
+.text
+ENTRY (STRCPY)
+# ifdef USE_AS_STRNCPY
+ mov %rdx, %r8
+ test %r8, %r8
+ jz L(ExitZero)
+# endif
+ mov %rsi, %rcx
+# ifndef USE_AS_STPCPY
+ mov %rdi, %rax /* save result */
+# endif
+
+# endif
+
+ and $((VEC_SIZE * 4) - 1), %rcx
+ cmp $(VEC_SIZE * 2), %rcx
+ jbe L(SourceStringAlignmentLessTwoVecSize)
+
+ and $-VEC_SIZE, %rsi
+ and $(VEC_SIZE - 1), %rcx
+ vpxor %xmm0, %xmm0, %xmm0
+ vpxor %xmm1, %xmm1, %xmm1
+
+ vpcmpeqb (%rsi), %ymm1, %ymm1
+ vpmovmskb %ymm1, %rdx
+ shr %cl, %rdx
+
+# ifdef USE_AS_STRNCPY
+# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
+ mov $VEC_SIZE, %r10
+ sub %rcx, %r10
+ cmp %r10, %r8
+# else
+ mov $(VEC_SIZE + 1), %r10
+ sub %rcx, %r10
+ cmp %r10, %r8
+# endif
+ jbe L(CopyVecSizeTailCase2OrCase3)
+# endif
+ test %rdx, %rdx
+ jnz L(CopyVecSizeTail)
+
+ vpcmpeqb VEC_SIZE(%rsi), %ymm0, %ymm0
+ vpmovmskb %ymm0, %rdx
+
+# ifdef USE_AS_STRNCPY
+ add $VEC_SIZE, %r10
+ cmp %r10, %r8
+ jbe L(CopyTwoVecSizeCase2OrCase3)
+# endif
+ test %rdx, %rdx
+ jnz L(CopyTwoVecSize)
+
+ vmovdqu (%rsi, %rcx), %ymm1 /* copy VEC_SIZE bytes */
+ vmovdqu %ymm1, (%rdi)
+
+/* If source address alignment != destination address alignment */
+ .p2align 4
+L(UnalignVecSizeBoth):
+ sub %rcx, %rdi
+# ifdef USE_AS_STRNCPY
+ add %rcx, %r8
+ sbb %rcx, %rcx
+ or %rcx, %r8
+# endif
+ mov $VEC_SIZE, %rcx
+ vmovdqa (%rsi, %rcx), %ymm1
+ vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
+ vmovdqu %ymm1, (%rdi, %rcx)
+ vpcmpeqb %ymm2, %ymm0, %ymm0
+ vpmovmskb %ymm0, %rdx
+ add $VEC_SIZE, %rcx
+# ifdef USE_AS_STRNCPY
+ sub $(VEC_SIZE * 3), %r8
+ jbe L(CopyVecSizeCase2OrCase3)
+# endif
+ test %rdx, %rdx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ jnz L(CopyVecSizeUnalignedVec2)
+# else
+ jnz L(CopyVecSize)
+# endif
+
+ vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
+ vmovdqu %ymm2, (%rdi, %rcx)
+ vpcmpeqb %ymm3, %ymm0, %ymm0
+ vpmovmskb %ymm0, %rdx
+ add $VEC_SIZE, %rcx
+# ifdef USE_AS_STRNCPY
+ sub $VEC_SIZE, %r8
+ jbe L(CopyVecSizeCase2OrCase3)
+# endif
+ test %rdx, %rdx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ jnz L(CopyVecSizeUnalignedVec3)
+# else
+ jnz L(CopyVecSize)
+# endif
+
+ vmovdqa VEC_SIZE(%rsi, %rcx), %ymm4
+ vmovdqu %ymm3, (%rdi, %rcx)
+ vpcmpeqb %ymm4, %ymm0, %ymm0
+ vpmovmskb %ymm0, %rdx
+ add $VEC_SIZE, %rcx
+# ifdef USE_AS_STRNCPY
+ sub $VEC_SIZE, %r8
+ jbe L(CopyVecSizeCase2OrCase3)
+# endif
+ test %rdx, %rdx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ jnz L(CopyVecSizeUnalignedVec4)
+# else
+ jnz L(CopyVecSize)
+# endif
+
+ vmovdqa VEC_SIZE(%rsi, %rcx), %ymm1
+ vmovdqu %ymm4, (%rdi, %rcx)
+ vpcmpeqb %ymm1, %ymm0, %ymm0
+ vpmovmskb %ymm0, %rdx
+ add $VEC_SIZE, %rcx
+# ifdef USE_AS_STRNCPY
+ sub $VEC_SIZE, %r8
+ jbe L(CopyVecSizeCase2OrCase3)
+# endif
+ test %rdx, %rdx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ jnz L(CopyVecSizeUnalignedVec1)
+# else
+ jnz L(CopyVecSize)
+# endif
+
+ vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
+ vmovdqu %ymm1, (%rdi, %rcx)
+ vpcmpeqb %ymm2, %ymm0, %ymm0
+ vpmovmskb %ymm0, %rdx
+ add $VEC_SIZE, %rcx
+# ifdef USE_AS_STRNCPY
+ sub $VEC_SIZE, %r8
+ jbe L(CopyVecSizeCase2OrCase3)
+# endif
+ test %rdx, %rdx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ jnz L(CopyVecSizeUnalignedVec2)
+# else
+ jnz L(CopyVecSize)
+# endif
+
+ vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
+ vmovdqu %ymm2, (%rdi, %rcx)
+ vpcmpeqb %ymm3, %ymm0, %ymm0
+ vpmovmskb %ymm0, %rdx
+ add $VEC_SIZE, %rcx
+# ifdef USE_AS_STRNCPY
+ sub $VEC_SIZE, %r8
+ jbe L(CopyVecSizeCase2OrCase3)
+# endif
+ test %rdx, %rdx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ jnz L(CopyVecSizeUnalignedVec3)
+# else
+ jnz L(CopyVecSize)
+# endif
+
+ vmovdqu %ymm3, (%rdi, %rcx)
+ mov %rsi, %rdx
+ lea VEC_SIZE(%rsi, %rcx), %rsi
+ and $-(VEC_SIZE * 4), %rsi
+ sub %rsi, %rdx
+ sub %rdx, %rdi
+# ifdef USE_AS_STRNCPY
+ lea (VEC_SIZE * 8)(%r8, %rdx), %r8
+# endif
+L(UnalignedFourVecSizeLoop):
+ vmovdqa (%rsi), %ymm4
+ vmovdqa VEC_SIZE(%rsi), %ymm5
+ vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6
+ vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7
+ vpminub %ymm5, %ymm4, %ymm2
+ vpminub %ymm7, %ymm6, %ymm3
+ vpminub %ymm2, %ymm3, %ymm3
+ vpcmpeqb %ymm0, %ymm3, %ymm3
+ vpmovmskb %ymm3, %rdx
+# ifdef USE_AS_STRNCPY
+ sub $(VEC_SIZE * 4), %r8
+ jbe L(UnalignedLeaveCase2OrCase3)
+# endif
+ test %rdx, %rdx
+ jnz L(UnalignedFourVecSizeLeave)
+
+L(UnalignedFourVecSizeLoop_start):
+ add $(VEC_SIZE * 4), %rdi
+ add $(VEC_SIZE * 4), %rsi
+ vmovdqu %ymm4, -(VEC_SIZE * 4)(%rdi)
+ vmovdqa (%rsi), %ymm4
+ vmovdqu %ymm5, -(VEC_SIZE * 3)(%rdi)
+ vmovdqa VEC_SIZE(%rsi), %ymm5
+ vpminub %ymm5, %ymm4, %ymm2
+ vmovdqu %ymm6, -(VEC_SIZE * 2)(%rdi)
+ vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6
+ vmovdqu %ymm7, -VEC_SIZE(%rdi)
+ vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7
+ vpminub %ymm7, %ymm6, %ymm3
+ vpminub %ymm2, %ymm3, %ymm3
+ vpcmpeqb %ymm0, %ymm3, %ymm3
+ vpmovmskb %ymm3, %rdx
+# ifdef USE_AS_STRNCPY
+ sub $(VEC_SIZE * 4), %r8
+ jbe L(UnalignedLeaveCase2OrCase3)
+# endif
+ test %rdx, %rdx
+ jz L(UnalignedFourVecSizeLoop_start)
+
+L(UnalignedFourVecSizeLeave):
+ vpxor %xmm1, %xmm1, %xmm1
+
+ vpcmpeqb %ymm4, %ymm0, %ymm0
+ vpmovmskb %ymm0, %rdx
+ test %rdx, %rdx
+ jnz L(CopyVecSizeUnaligned_0)
+
+ vpcmpeqb %ymm5, %ymm1, %ymm1
+ vpmovmskb %ymm1, %rcx
+ test %rcx, %rcx
+ jnz L(CopyVecSizeUnaligned_16)
+
+ vpcmpeqb %ymm6, %ymm0, %ymm0
+ vpmovmskb %ymm0, %rdx
+ test %rdx, %rdx
+ jnz L(CopyVecSizeUnaligned_32)
+
+ vpcmpeqb %ymm7, %ymm1, %ymm1
+ vpmovmskb %ymm1, %rcx
+ bsf %rcx, %rdx
+ vmovdqu %ymm4, (%rdi)
+ vmovdqu %ymm5, VEC_SIZE(%rdi)
+ vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+# ifdef USE_AS_STPCPY
+ lea (VEC_SIZE * 3)(%rdi, %rdx), %rax
+# endif
+ vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi)
+ add $(VEC_SIZE - 1), %r8
+ sub %rdx, %r8
+ lea ((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi
+ jmp L(StrncpyFillTailWithZero)
+# else
+ add $(VEC_SIZE * 3), %rsi
+ add $(VEC_SIZE * 3), %rdi
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+# endif
+
+/* If source address alignment == destination address alignment */
+
+L(SourceStringAlignmentLessTwoVecSize):
+ vpxor %xmm0, %xmm0, %xmm0
+ vmovdqu (%rsi), %ymm1
+ vmovdqu VEC_SIZE(%rsi), %ymm2
+ vpcmpeqb %ymm1, %ymm0, %ymm0
+ vpmovmskb %ymm0, %rdx
+
+# ifdef USE_AS_STRNCPY
+# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
+ cmp $VEC_SIZE, %r8
+# else
+ cmp $(VEC_SIZE + 1), %r8
+# endif
+ jbe L(CopyVecSizeTail1Case2OrCase3)
+# endif
+ test %rdx, %rdx
+ jnz L(CopyVecSizeTail1)
+
+ vpcmpeqb %ymm2, %ymm0, %ymm0
+ vmovdqu %ymm1, (%rdi)
+ vpmovmskb %ymm0, %rdx
+
+# ifdef USE_AS_STRNCPY
+# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
+ cmp $(VEC_SIZE * 2), %r8
+# else
+ cmp $((VEC_SIZE * 2) + 1), %r8
+# endif
+ jbe L(CopyTwoVecSize1Case2OrCase3)
+# endif
+ test %rdx, %rdx
+ jnz L(CopyTwoVecSize1)
+
+ and $-VEC_SIZE, %rsi
+ and $(VEC_SIZE - 1), %rcx
+ jmp L(UnalignVecSizeBoth)
+
+/*------End of main part with loops---------------------*/
+
+/* Case1 */
+
+# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
+ .p2align 4
+L(CopyVecSize):
+ add %rcx, %rdi
+ add %rcx, %rsi
+ bsf %rdx, %rdx
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+# endif
+ .p2align 4
+L(CopyVecSizeTail):
+ add %rcx, %rsi
+ bsf %rdx, %rdx
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+
+ .p2align 4
+L(CopyTwoVecSize1):
+ add $VEC_SIZE, %rsi
+ add $VEC_SIZE, %rdi
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $VEC_SIZE, %r8
+# endif
+L(CopyVecSizeTail1):
+ bsf %rdx, %rdx
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+
+ .p2align 4
+L(CopyTwoVecSize):
+ bsf %rdx, %rdx
+ add %rcx, %rsi
+ add $VEC_SIZE, %rdx
+ sub %rcx, %rdx
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+
+ .p2align 4
+L(CopyVecSizeUnaligned_0):
+ bsf %rdx, %rdx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+# ifdef USE_AS_STPCPY
+ lea (%rdi, %rdx), %rax
+# endif
+ vmovdqu %ymm4, (%rdi)
+ add $((VEC_SIZE * 4) - 1), %r8
+ sub %rdx, %r8
+ lea 1(%rdi, %rdx), %rdi
+ jmp L(StrncpyFillTailWithZero)
+# else
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+# endif
+
+ .p2align 4
+L(CopyVecSizeUnaligned_16):
+ bsf %rcx, %rdx
+ vmovdqu %ymm4, (%rdi)
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+# ifdef USE_AS_STPCPY
+ lea VEC_SIZE(%rdi, %rdx), %rax
+# endif
+ vmovdqu %ymm5, VEC_SIZE(%rdi)
+ add $((VEC_SIZE * 3) - 1), %r8
+ sub %rdx, %r8
+ lea (VEC_SIZE + 1)(%rdi, %rdx), %rdi
+ jmp L(StrncpyFillTailWithZero)
+# else
+ add $VEC_SIZE, %rsi
+ add $VEC_SIZE, %rdi
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+# endif
+
+ .p2align 4
+L(CopyVecSizeUnaligned_32):
+ bsf %rdx, %rdx
+ vmovdqu %ymm4, (%rdi)
+ vmovdqu %ymm5, VEC_SIZE(%rdi)
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+# ifdef USE_AS_STPCPY
+ lea (VEC_SIZE * 2)(%rdi, %rdx), %rax
+# endif
+ vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
+ add $((VEC_SIZE * 2) - 1), %r8
+ sub %rdx, %r8
+ lea ((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi
+ jmp L(StrncpyFillTailWithZero)
+# else
+ add $(VEC_SIZE * 2), %rsi
+ add $(VEC_SIZE * 2), %rdi
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+# endif
+
+# ifdef USE_AS_STRNCPY
+# ifndef USE_AS_STRCAT
+ .p2align 4
+L(CopyVecSizeUnalignedVec6):
+ vmovdqu %ymm6, (%rdi, %rcx)
+ jmp L(CopyVecSizeVecExit)
+
+ .p2align 4
+L(CopyVecSizeUnalignedVec5):
+ vmovdqu %ymm5, (%rdi, %rcx)
+ jmp L(CopyVecSizeVecExit)
+
+ .p2align 4
+L(CopyVecSizeUnalignedVec4):
+ vmovdqu %ymm4, (%rdi, %rcx)
+ jmp L(CopyVecSizeVecExit)
+
+ .p2align 4
+L(CopyVecSizeUnalignedVec3):
+ vmovdqu %ymm3, (%rdi, %rcx)
+ jmp L(CopyVecSizeVecExit)
+
+ .p2align 4
+L(CopyVecSizeUnalignedVec1):
+ vmovdqu %ymm1, (%rdi, %rcx)
+ jmp L(CopyVecSizeVecExit)
+# endif
+
+ .p2align 4
+L(CopyVecSizeExit):
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+
+/* Case2 */
+
+ .p2align 4
+L(CopyVecSizeCase2):
+ add $VEC_SIZE, %r8
+ add %rcx, %rdi
+ add %rcx, %rsi
+ bsf %rdx, %rdx
+ cmp %r8, %rdx
+ jb L(CopyVecSizeExit)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+ .p2align 4
+L(CopyTwoVecSizeCase2):
+ add %rcx, %rsi
+ bsf %rdx, %rdx
+ add $VEC_SIZE, %rdx
+ sub %rcx, %rdx
+ cmp %r8, %rdx
+ jb L(CopyVecSizeExit)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+L(CopyVecSizeTailCase2):
+ add %rcx, %rsi
+ bsf %rdx, %rdx
+ cmp %r8, %rdx
+ jb L(CopyVecSizeExit)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+L(CopyVecSizeTail1Case2):
+ bsf %rdx, %rdx
+ cmp %r8, %rdx
+ jb L(CopyVecSizeExit)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+/* Case2 or Case3, Case3 */
+
+ .p2align 4
+L(CopyVecSizeCase2OrCase3):
+ test %rdx, %rdx
+ jnz L(CopyVecSizeCase2)
+L(CopyVecSizeCase3):
+ add $VEC_SIZE, %r8
+ add %rcx, %rdi
+ add %rcx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+ .p2align 4
+L(CopyTwoVecSizeCase2OrCase3):
+ test %rdx, %rdx
+ jnz L(CopyTwoVecSizeCase2)
+ add %rcx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+ .p2align 4
+L(CopyVecSizeTailCase2OrCase3):
+ test %rdx, %rdx
+ jnz L(CopyVecSizeTailCase2)
+ add %rcx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+ .p2align 4
+L(CopyTwoVecSize1Case2OrCase3):
+ add $VEC_SIZE, %rdi
+ add $VEC_SIZE, %rsi
+ sub $VEC_SIZE, %r8
+L(CopyVecSizeTail1Case2OrCase3):
+ test %rdx, %rdx
+ jnz L(CopyVecSizeTail1Case2)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+# endif
+
+/*-------- End of labels for copying 1..VEC_SIZE bytes and 1..(VEC_SIZE*2) bytes --------*/
+
+ .p2align 4
+L(Exit1):
+ mov %dh, (%rdi)
+# ifdef USE_AS_STPCPY
+ lea (%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $1, %r8
+ lea 1(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit2):
+ mov (%rsi), %dx
+ mov %dx, (%rdi)
+# ifdef USE_AS_STPCPY
+ lea 1(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $2, %r8
+ lea 2(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit3):
+ mov (%rsi), %cx
+ mov %cx, (%rdi)
+ mov %dh, 2(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 2(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $3, %r8
+ lea 3(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit4):
+ mov (%rsi), %edx
+ mov %edx, (%rdi)
+# ifdef USE_AS_STPCPY
+ lea 3(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $4, %r8
+ lea 4(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit5):
+ mov (%rsi), %ecx
+ mov %dh, 4(%rdi)
+ mov %ecx, (%rdi)
+# ifdef USE_AS_STPCPY
+ lea 4(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $5, %r8
+ lea 5(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit6):
+ mov (%rsi), %ecx
+ mov 4(%rsi), %dx
+ mov %ecx, (%rdi)
+ mov %dx, 4(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 5(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $6, %r8
+ lea 6(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit7):
+ mov (%rsi), %ecx
+ mov 3(%rsi), %edx
+ mov %ecx, (%rdi)
+ mov %edx, 3(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 6(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $7, %r8
+ lea 7(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit8):
+ mov (%rsi), %rdx
+ mov %rdx, (%rdi)
+# ifdef USE_AS_STPCPY
+ lea 7(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $8, %r8
+ lea 8(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit9):
+ mov (%rsi), %rcx
+ mov %dh, 8(%rdi)
+ mov %rcx, (%rdi)
+# ifdef USE_AS_STPCPY
+ lea 8(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $9, %r8
+ lea 9(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit10):
+ mov (%rsi), %rcx
+ mov 8(%rsi), %dx
+ mov %rcx, (%rdi)
+ mov %dx, 8(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 9(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $10, %r8
+ lea 10(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit11):
+ mov (%rsi), %rcx
+ mov 7(%rsi), %edx
+ mov %rcx, (%rdi)
+ mov %edx, 7(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 10(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $11, %r8
+ lea 11(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit12):
+ mov (%rsi), %rcx
+ mov 8(%rsi), %edx
+ mov %rcx, (%rdi)
+ mov %edx, 8(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 11(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $12, %r8
+ lea 12(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit13):
+ mov (%rsi), %rcx
+ mov 5(%rsi), %rdx
+ mov %rcx, (%rdi)
+ mov %rdx, 5(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 12(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $13, %r8
+ lea 13(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit14):
+ mov (%rsi), %rcx
+ mov 6(%rsi), %rdx
+ mov %rcx, (%rdi)
+ mov %rdx, 6(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 13(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $14, %r8
+ lea 14(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit15):
+ mov (%rsi), %rcx
+ mov 7(%rsi), %rdx
+ mov %rcx, (%rdi)
+ mov %rdx, 7(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 14(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $15, %r8
+ lea 15(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit16):
+ movdqu (%rsi), %xmm0
+ movdqu %xmm0, (%rdi)
+# ifdef USE_AS_STPCPY
+ lea 15(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $16, %r8
+ lea 16(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit17):
+ movdqu (%rsi), %xmm0
+ movdqu %xmm0, (%rdi)
+ mov %dh, 16(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 16(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $17, %r8
+ lea 17(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit18):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %cx
+ movdqu %xmm0, (%rdi)
+ mov %cx, 16(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 17(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $18, %r8
+ lea 18(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit19):
+ movdqu (%rsi), %xmm0
+ mov 15(%rsi), %ecx
+ movdqu %xmm0, (%rdi)
+ mov %ecx, 15(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 18(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $19, %r8
+ lea 19(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit20):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %ecx
+ movdqu %xmm0, (%rdi)
+ mov %ecx, 16(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 19(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $20, %r8
+ lea 20(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit21):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %ecx
+ movdqu %xmm0, (%rdi)
+ mov %ecx, 16(%rdi)
+ mov %dh, 20(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 20(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $21, %r8
+ lea 21(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit22):
+ movdqu (%rsi), %xmm0
+ mov 14(%rsi), %rcx
+ movdqu %xmm0, (%rdi)
+ mov %rcx, 14(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 21(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $22, %r8
+ lea 22(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit23):
+ movdqu (%rsi), %xmm0
+ mov 15(%rsi), %rcx
+ movdqu %xmm0, (%rdi)
+ mov %rcx, 15(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 22(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $23, %r8
+ lea 23(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit24):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rcx
+ movdqu %xmm0, (%rdi)
+ mov %rcx, 16(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 23(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $24, %r8
+ lea 24(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit25):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rcx
+ movdqu %xmm0, (%rdi)
+ mov %rcx, 16(%rdi)
+ mov %dh, 24(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 24(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $25, %r8
+ lea 25(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit26):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rdx
+ mov 24(%rsi), %cx
+ movdqu %xmm0, (%rdi)
+ mov %rdx, 16(%rdi)
+ mov %cx, 24(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 25(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $26, %r8
+ lea 26(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit27):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rdx
+ mov 23(%rsi), %ecx
+ movdqu %xmm0, (%rdi)
+ mov %rdx, 16(%rdi)
+ mov %ecx, 23(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 26(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $27, %r8
+ lea 27(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit28):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rdx
+ mov 24(%rsi), %ecx
+ movdqu %xmm0, (%rdi)
+ mov %rdx, 16(%rdi)
+ mov %ecx, 24(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 27(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $28, %r8
+ lea 28(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit29):
+ movdqu (%rsi), %xmm0
+ movdqu 13(%rsi), %xmm2
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm2, 13(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 28(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $29, %r8
+ lea 29(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit30):
+ movdqu (%rsi), %xmm0
+ movdqu 14(%rsi), %xmm2
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm2, 14(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 29(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $30, %r8
+ lea 30(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit31):
+ movdqu (%rsi), %xmm0
+ movdqu 15(%rsi), %xmm2
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm2, 15(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 30(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $31, %r8
+ lea 31(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit32):
+ movdqu (%rsi), %xmm0
+ movdqu 16(%rsi), %xmm2
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm2, 16(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 31(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $32, %r8
+ lea 32(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit33):
+ /* 0/32, 32/1 */
+ vmovdqu (%rsi), %ymm0
+ vmovdqu %ymm0, (%rdi)
+ mov %dh, 32(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 32(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $33, %r8
+ lea 33(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit34):
+ /* 0/32, 32/2 */
+ vmovdqu (%rsi), %ymm0
+ mov 32(%rsi), %dx
+ vmovdqu %ymm0, (%rdi)
+ mov %dx, 32(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 33(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $34, %r8
+ lea 34(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit35):
+ /* 0/32, 31/4 */
+ vmovdqu (%rsi), %ymm0
+ mov 31(%rsi), %edx
+ vmovdqu %ymm0, (%rdi)
+ mov %edx, 31(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 34(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $35, %r8
+ lea 35(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit36):
+ /* 0/32, 32/4 */
+ vmovdqu (%rsi), %ymm0
+ mov 32(%rsi), %edx
+ vmovdqu %ymm0, (%rdi)
+ mov %edx, 32(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 35(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $36, %r8
+ lea 36(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit37):
+ /* 0/32, 29/8 */
+ vmovdqu (%rsi), %ymm0
+ mov 29(%rsi), %rdx
+ vmovdqu %ymm0, (%rdi)
+ mov %rdx, 29(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 36(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $37, %r8
+ lea 37(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit38):
+ /* 0/32, 30/8 */
+ vmovdqu (%rsi), %ymm0
+ mov 30(%rsi), %rdx
+ vmovdqu %ymm0, (%rdi)
+ mov %rdx, 30(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 37(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $38, %r8
+ lea 38(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit39):
+ /* 0/32, 31/8 */
+ vmovdqu (%rsi), %ymm0
+ mov 31(%rsi), %rdx
+ vmovdqu %ymm0, (%rdi)
+ mov %rdx, 31(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 38(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $39, %r8
+ lea 39(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit40):
+ /* 0/32, 32/8 */
+ vmovdqu (%rsi), %ymm0
+ mov 32(%rsi), %rdx
+ vmovdqu %ymm0, (%rdi)
+ mov %rdx, 32(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 39(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $40, %r8
+ lea 40(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit41):
+ /* 0/32, 32/8, 40/1 */
+ vmovdqu (%rsi), %ymm0
+ mov 32(%rsi), %rcx
+ vmovdqu %ymm0, (%rdi)
+ mov %rcx, 32(%rdi)
+ mov %dh, 40(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 40(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $41, %r8
+ lea 41(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit42):
+ /* 0/32, 32/8, 40/2 */
+ vmovdqu (%rsi), %ymm0
+ mov 32(%rsi), %rcx
+ mov 40(%rsi), %dx
+ vmovdqu %ymm0, (%rdi)
+ mov %rcx, 32(%rdi)
+ mov %dx, 40(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 41(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $42, %r8
+ lea 42(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit43):
+ /* 0/32, 27/16 */
+ vmovdqu (%rsi), %ymm0
+ vmovdqu 27(%rsi), %xmm1
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %xmm1, 27(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 42(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $43, %r8
+ lea 43(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit44):
+ /* 0/32, 28/16 */
+ vmovdqu (%rsi), %ymm0
+ vmovdqu 28(%rsi), %xmm1
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %xmm1, 28(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 43(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $44, %r8
+ lea 44(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit45):
+ /* 0/32, 29/16 */
+ vmovdqu (%rsi), %ymm0
+ vmovdqu 29(%rsi), %xmm1
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %xmm1, 29(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 44(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $45, %r8
+ lea 45(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit46):
+ /* 0/32, 30/16 */
+ vmovdqu (%rsi), %ymm0
+ vmovdqu 30(%rsi), %xmm1
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %xmm1, 30(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 45(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $46, %r8
+ lea 46(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit47):
+ /* 0/32, 31/16 */
+ vmovdqu (%rsi), %ymm0
+ vmovdqu 31(%rsi), %xmm1
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %xmm1, 31(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 46(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $47, %r8
+ lea 47(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit48):
+ /* 0/32, 32/16 */
+ vmovdqu (%rsi), %ymm0
+ vmovdqu 32(%rsi), %xmm1
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %xmm1, 32(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 47(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $48, %r8
+ lea 48(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit49):
+ /* 0/32, 32/16, 48/1 */
+ vmovdqu (%rsi), %ymm0
+ vmovdqu 32(%rsi), %xmm1
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %xmm1, 32(%rdi)
+ mov %dh, 48(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 48(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $49, %r8
+ lea 49(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit50):
+ /* 0/32, 32/16, 48/2 */
+ vmovdqu (%rsi), %ymm0
+ vmovdqu 32(%rsi), %xmm1
+ mov 48(%rsi), %dx
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %xmm1, 32(%rdi)
+ mov %dx, 48(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 49(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $50, %r8
+ lea 50(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit51):
+ /* 0/32, 32/16, 47/4 */
+ vmovdqu (%rsi), %ymm0
+ vmovdqu 32(%rsi), %xmm1
+ mov 47(%rsi), %edx
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %xmm1, 32(%rdi)
+ mov %edx, 47(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 50(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $51, %r8
+ lea 51(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit52):
+ /* 0/32, 32/16, 48/4 */
+ vmovdqu (%rsi), %ymm0
+ vmovdqu 32(%rsi), %xmm1
+ mov 48(%rsi), %edx
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %xmm1, 32(%rdi)
+ mov %edx, 48(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 51(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $52, %r8
+ lea 52(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit53):
+ /* 0/32, 32/16, 45/8 */
+ vmovdqu (%rsi), %ymm0
+ vmovdqu 32(%rsi), %xmm1
+ mov 45(%rsi), %rdx
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %xmm1, 32(%rdi)
+ mov %rdx, 45(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 52(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $53, %r8
+ lea 53(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit54):
+ /* 0/32, 32/16, 46/8 */
+ vmovdqu (%rsi), %ymm0
+ vmovdqu 32(%rsi), %xmm1
+ mov 46(%rsi), %rdx
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %xmm1, 32(%rdi)
+ mov %rdx, 46(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 53(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $54, %r8
+ lea 54(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit55):
+ /* 0/32, 32/16, 47/8 */
+ vmovdqu (%rsi), %ymm0
+ vmovdqu 32(%rsi), %xmm1
+ mov 47(%rsi), %rdx
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %xmm1, 32(%rdi)
+ mov %rdx, 47(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 54(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $55, %r8
+ lea 55(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit56):
+ /* 0/32, 32/16, 48/8 */
+ vmovdqu (%rsi), %ymm0
+ vmovdqu 32(%rsi), %xmm1
+ mov 48(%rsi), %rdx
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %xmm1, 32(%rdi)
+ mov %rdx, 48(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 55(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $56, %r8
+ lea 56(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit57):
+ /* 0/32, 25/32 */
+ vmovdqu (%rsi), %ymm0
+ vmovdqu 25(%rsi), %ymm1
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %ymm1, 25(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 56(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $57, %r8
+ lea 57(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit58):
+ /* 0/32, 26/32 */
+ vmovdqu (%rsi), %ymm0
+ vmovdqu 26(%rsi), %ymm1
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %ymm1, 26(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 57(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $58, %r8
+ lea 58(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit59):
+ /* 0/32, 27/32 */
+ vmovdqu (%rsi), %ymm0
+ vmovdqu 27(%rsi), %ymm1
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %ymm1, 27(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 58(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $59, %r8
+ lea 59(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit60):
+ /* 0/32, 28/32 */
+ vmovdqu (%rsi), %ymm0
+ vmovdqu 28(%rsi), %ymm1
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %ymm1, 28(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 59(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $60, %r8
+ lea 60(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit61):
+ /* 0/32, 29/32 */
+ vmovdqu (%rsi), %ymm0
+ vmovdqu 29(%rsi), %ymm1
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %ymm1, 29(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 60(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $61, %r8
+ lea 61(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit62):
+ /* 0/32, 30/32 */
+ vmovdqu (%rsi), %ymm0
+ vmovdqu 30(%rsi), %ymm1
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %ymm1, 30(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 61(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $62, %r8
+ lea 62(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit63):
+ /* 0/32, 31/32 */
+ vmovdqu (%rsi), %ymm0
+ vmovdqu 31(%rsi), %ymm1
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %ymm1, 31(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 62(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $63, %r8
+ lea 63(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit64):
+ /* 0/32, 32/32 */
+ vmovdqu (%rsi), %ymm0
+ vmovdqu 32(%rsi), %ymm1
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %ymm1, 32(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 63(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $64, %r8
+ lea 64(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+# ifdef USE_AS_STRNCPY
+
+ .p2align 4
+L(StrncpyExit0):
+# ifdef USE_AS_STPCPY
+ mov %rdi, %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, (%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit1):
+ mov (%rsi), %dl
+ mov %dl, (%rdi)
+# ifdef USE_AS_STPCPY
+ lea 1(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 1(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit2):
+ mov (%rsi), %dx
+ mov %dx, (%rdi)
+# ifdef USE_AS_STPCPY
+ lea 2(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 2(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit3):
+ mov (%rsi), %cx
+ mov 2(%rsi), %dl
+ mov %cx, (%rdi)
+ mov %dl, 2(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 3(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 3(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit4):
+ mov (%rsi), %edx
+ mov %edx, (%rdi)
+# ifdef USE_AS_STPCPY
+ lea 4(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 4(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit5):
+ mov (%rsi), %ecx
+ mov 4(%rsi), %dl
+ mov %ecx, (%rdi)
+ mov %dl, 4(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 5(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 5(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit6):
+ mov (%rsi), %ecx
+ mov 4(%rsi), %dx
+ mov %ecx, (%rdi)
+ mov %dx, 4(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 6(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 6(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit7):
+ mov (%rsi), %ecx
+ mov 3(%rsi), %edx
+ mov %ecx, (%rdi)
+ mov %edx, 3(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 7(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 7(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit8):
+ mov (%rsi), %rdx
+ mov %rdx, (%rdi)
+# ifdef USE_AS_STPCPY
+ lea 8(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 8(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit9):
+ mov (%rsi), %rcx
+ mov 8(%rsi), %dl
+ mov %rcx, (%rdi)
+ mov %dl, 8(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 9(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 9(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit10):
+ mov (%rsi), %rcx
+ mov 8(%rsi), %dx
+ mov %rcx, (%rdi)
+ mov %dx, 8(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 10(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 10(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit11):
+ mov (%rsi), %rcx
+ mov 7(%rsi), %edx
+ mov %rcx, (%rdi)
+ mov %edx, 7(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 11(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 11(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit12):
+ mov (%rsi), %rcx
+ mov 8(%rsi), %edx
+ mov %rcx, (%rdi)
+ mov %edx, 8(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 12(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 12(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit13):
+ mov (%rsi), %rcx
+ mov 5(%rsi), %rdx
+ mov %rcx, (%rdi)
+ mov %rdx, 5(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 13(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 13(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit14):
+ mov (%rsi), %rcx
+ mov 6(%rsi), %rdx
+ mov %rcx, (%rdi)
+ mov %rdx, 6(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 14(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 14(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit15):
+ mov (%rsi), %rcx
+ mov 7(%rsi), %rdx
+ mov %rcx, (%rdi)
+ mov %rdx, 7(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 15(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 15(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit16):
+ movdqu (%rsi), %xmm0
+ movdqu %xmm0, (%rdi)
+# ifdef USE_AS_STPCPY
+ lea 16(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 16(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit17):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %cl
+ movdqu %xmm0, (%rdi)
+ mov %cl, 16(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 17(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 17(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit18):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %cx
+ movdqu %xmm0, (%rdi)
+ mov %cx, 16(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 18(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 18(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit19):
+ movdqu (%rsi), %xmm0
+ mov 15(%rsi), %ecx
+ movdqu %xmm0, (%rdi)
+ mov %ecx, 15(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 19(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 19(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit20):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %ecx
+ movdqu %xmm0, (%rdi)
+ mov %ecx, 16(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 20(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 20(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit21):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %ecx
+ mov 20(%rsi), %dl
+ movdqu %xmm0, (%rdi)
+ mov %ecx, 16(%rdi)
+ mov %dl, 20(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 21(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 21(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit22):
+ movdqu (%rsi), %xmm0
+ mov 14(%rsi), %rcx
+ movdqu %xmm0, (%rdi)
+ mov %rcx, 14(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 22(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 22(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit23):
+ movdqu (%rsi), %xmm0
+ mov 15(%rsi), %rcx
+ movdqu %xmm0, (%rdi)
+ mov %rcx, 15(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 23(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 23(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit24):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rcx
+ movdqu %xmm0, (%rdi)
+ mov %rcx, 16(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 24(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 24(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit25):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rdx
+ mov 24(%rsi), %cl
+ movdqu %xmm0, (%rdi)
+ mov %rdx, 16(%rdi)
+ mov %cl, 24(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 25(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 25(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit26):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rdx
+ mov 24(%rsi), %cx
+ movdqu %xmm0, (%rdi)
+ mov %rdx, 16(%rdi)
+ mov %cx, 24(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 26(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 26(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit27):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rdx
+ mov 23(%rsi), %ecx
+ movdqu %xmm0, (%rdi)
+ mov %rdx, 16(%rdi)
+ mov %ecx, 23(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 27(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 27(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit28):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rdx
+ mov 24(%rsi), %ecx
+ movdqu %xmm0, (%rdi)
+ mov %rdx, 16(%rdi)
+ mov %ecx, 24(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 28(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 28(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit29):
+ movdqu (%rsi), %xmm0
+ movdqu 13(%rsi), %xmm2
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm2, 13(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 29(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 29(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit30):
+ movdqu (%rsi), %xmm0
+ movdqu 14(%rsi), %xmm2
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm2, 14(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 30(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 30(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit31):
+ movdqu (%rsi), %xmm0
+ movdqu 15(%rsi), %xmm2
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm2, 15(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 31(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 31(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit32):
+ movdqu (%rsi), %xmm0
+ movdqu 16(%rsi), %xmm2
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm2, 16(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 32(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 32(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit33):
+ movdqu (%rsi), %xmm0
+ movdqu 16(%rsi), %xmm2
+ mov 32(%rsi), %cl
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm2, 16(%rdi)
+ mov %cl, 32(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 33(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 33(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit34):
+ /* 0/32, 32/2 */
+ vmovdqu (%rsi), %ymm0
+ mov 32(%rsi), %cx
+ vmovdqu %ymm0, (%rdi)
+ mov %cx, 32(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 34(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 34(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit35):
+ /* 0/32, 31/4 */
+ vmovdqu (%rsi), %ymm0
+ mov 31(%rsi), %ecx
+ vmovdqu %ymm0, (%rdi)
+ mov %ecx, 31(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 35(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 35(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit36):
+ /* 0/32, 32/4 */
+ vmovdqu (%rsi), %ymm0
+ mov 32(%rsi), %ecx
+ vmovdqu %ymm0, (%rdi)
+ mov %ecx, 32(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 36(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 36(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit37):
+ /* 0/32, 29/8 */
+ vmovdqu (%rsi), %ymm0
+ mov 29(%rsi), %rcx
+ vmovdqu %ymm0, (%rdi)
+ mov %rcx, 29(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 37(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 37(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit38):
+ /* 0/32, 30/8 */
+ vmovdqu (%rsi), %ymm0
+ mov 30(%rsi), %rcx
+ vmovdqu %ymm0, (%rdi)
+ mov %rcx, 30(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 38(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 38(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit39):
+ /* 0/32, 31/8 */
+ vmovdqu (%rsi), %ymm0
+ mov 31(%rsi), %rcx
+ vmovdqu %ymm0, (%rdi)
+ mov %rcx, 31(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 39(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 39(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit40):
+ /* 0/32, 32/8 */
+ vmovdqu (%rsi), %ymm0
+ mov 32(%rsi), %rcx
+ vmovdqu %ymm0, (%rdi)
+ mov %rcx, 32(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 40(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 40(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit41):
+ /* 0/32, 32/8, 40/1 */
+ vmovdqu (%rsi), %ymm0
+ mov 32(%rsi), %rdx
+ mov 40(%rsi), %cl
+ vmovdqu %ymm0, (%rdi)
+ mov %rdx, 32(%rdi)
+ mov %cl, 40(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 41(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 41(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit42):
+ /* 0/32, 32/8, 40/2 */
+ vmovdqu (%rsi), %ymm0
+ mov 32(%rsi), %rdx
+ mov 40(%rsi), %cx
+ vmovdqu %ymm0, (%rdi)
+ mov %rdx, 32(%rdi)
+ mov %cx, 40(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 42(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 42(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit43):
+ /* 0/32, 27/16 */
+ vmovdqu (%rsi), %ymm0
+ vmovdqu 27(%rsi), %xmm2
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %xmm2, 27(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 43(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 43(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit44):
+ /* 0/32, 28/16 */
+ vmovdqu (%rsi), %ymm0
+ vmovdqu 28(%rsi), %xmm2
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %xmm2, 28(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 44(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 44(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit45):
+ /* 0/32, 29/16 */
+ vmovdqu (%rsi), %ymm0
+ vmovdqu 29(%rsi), %xmm2
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %xmm2, 29(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 45(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 45(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit46):
+ /* 0/32, 30/16 */
+ vmovdqu (%rsi), %ymm0
+ vmovdqu 30(%rsi), %xmm2
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %xmm2, 30(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 46(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 46(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit47):
+ /* 0/32, 31/16 */
+ vmovdqu (%rsi), %ymm0
+ vmovdqu 31(%rsi), %xmm2
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %xmm2, 31(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 47(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 47(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit48):
+ /* 0/32, 32/16 */
+ vmovdqu (%rsi), %ymm0
+ vmovdqu 32(%rsi), %xmm2
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %xmm2, 32(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 48(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 48(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit49):
+ /* 0/32, 32/16, 48/1 */
+ vmovdqu (%rsi), %ymm0
+ vmovdqu 32(%rsi), %xmm2
+ mov 48(%rsi), %cl
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %xmm2, 32(%rdi)
+ mov %cl, 48(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 49(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 49(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit50):
+ /* 0/32, 32/16, 48/2 */
+ vmovdqu (%rsi), %ymm0
+ vmovdqu 32(%rsi), %xmm2
+ mov 48(%rsi), %cx
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %xmm2, 32(%rdi)
+ mov %cx, 48(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 50(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 50(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit51):
+ /* 0/32, 32/16, 47/4 */
+ vmovdqu (%rsi), %ymm0
+ vmovdqu 32(%rsi), %xmm2
+ mov 47(%rsi), %ecx
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %xmm2, 32(%rdi)
+ mov %ecx, 47(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 51(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 51(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit52):
+ /* 0/32, 32/16, 48/4 */
+ vmovdqu (%rsi), %ymm0
+ vmovdqu 32(%rsi), %xmm2
+ mov 48(%rsi), %ecx
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %xmm2, 32(%rdi)
+ mov %ecx, 48(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 52(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 52(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit53):
+ /* 0/32, 32/16, 45/8 */
+ vmovdqu (%rsi), %ymm0
+ vmovdqu 32(%rsi), %xmm2
+ mov 45(%rsi), %rcx
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %xmm2, 32(%rdi)
+ mov %rcx, 45(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 53(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 53(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit54):
+ /* 0/32, 32/16, 46/8 */
+ vmovdqu (%rsi), %ymm0
+ vmovdqu 32(%rsi), %xmm2
+ mov 46(%rsi), %rcx
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %xmm2, 32(%rdi)
+ mov %rcx, 46(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 54(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 54(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit55):
+ /* 0/32, 32/16, 47/8 */
+ vmovdqu (%rsi), %ymm0
+ vmovdqu 32(%rsi), %xmm2
+ mov 47(%rsi), %rcx
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %xmm2, 32(%rdi)
+ mov %rcx, 47(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 55(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 55(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit56):
+ /* 0/32, 32/16, 48/8 */
+ vmovdqu (%rsi), %ymm0
+ vmovdqu 32(%rsi), %xmm2
+ mov 48(%rsi), %rcx
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %xmm2, 32(%rdi)
+ mov %rcx, 48(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 56(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 56(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit57):
+ /* 0/32, 25/32 */
+ vmovdqu (%rsi), %ymm0
+ vmovdqu 25(%rsi), %ymm2
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %ymm2, 25(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 57(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 57(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit58):
+ /* 0/32, 26/32 */
+ vmovdqu (%rsi), %ymm0
+ vmovdqu 26(%rsi), %ymm2
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %ymm2, 26(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 58(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 58(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit59):
+ /* 0/32, 27/32 */
+ vmovdqu (%rsi), %ymm0
+ vmovdqu 27(%rsi), %ymm2
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %ymm2, 27(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 59(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 59(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+
+ .p2align 4
+L(StrncpyExit60):
+ /* 0/32, 28/32 */
+ vmovdqu (%rsi), %ymm0
+ vmovdqu 28(%rsi), %ymm2
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %ymm2, 28(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 60(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 60(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit61):
+ /* 0/32, 29/32 */
+ vmovdqu (%rsi), %ymm0
+ vmovdqu 29(%rsi), %ymm2
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %ymm2, 29(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 61(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 61(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit62):
+ /* 0/32, 30/32 */
+ vmovdqu (%rsi), %ymm0
+ vmovdqu 30(%rsi), %ymm2
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %ymm2, 30(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 62(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 62(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit63):
+ /* 0/32, 31/32 */
+ vmovdqu (%rsi), %ymm0
+ vmovdqu 31(%rsi), %ymm2
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %ymm2, 31(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 63(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 63(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit64):
+ /* 0/32, 32/32 */
+ vmovdqu (%rsi), %ymm0
+ vmovdqu 32(%rsi), %ymm2
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %ymm2, 32(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 64(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 64(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit65):
+ /* 0/32, 32/32, 64/1 */
+ vmovdqu (%rsi), %ymm0
+ vmovdqu 32(%rsi), %ymm2
+ mov 64(%rsi), %cl
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %ymm2, 32(%rdi)
+ mov %cl, 64(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 65(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 65(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+# ifndef USE_AS_STRCAT
+
+ .p2align 4
+L(Fill0):
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Fill1):
+ mov %dl, (%rdi)
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Fill2):
+ mov %dx, (%rdi)
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Fill3):
+ mov %edx, -1(%rdi)
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Fill4):
+ mov %edx, (%rdi)
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Fill5):
+ mov %edx, (%rdi)
+ mov %dl, 4(%rdi)
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Fill6):
+ mov %edx, (%rdi)
+ mov %dx, 4(%rdi)
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Fill7):
+ mov %rdx, -1(%rdi)
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Fill8):
+ mov %rdx, (%rdi)
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Fill9):
+ mov %rdx, (%rdi)
+ mov %dl, 8(%rdi)
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Fill10):
+ mov %rdx, (%rdi)
+ mov %dx, 8(%rdi)
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Fill11):
+ mov %rdx, (%rdi)
+ mov %edx, 7(%rdi)
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Fill12):
+ mov %rdx, (%rdi)
+ mov %edx, 8(%rdi)
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Fill13):
+ mov %rdx, (%rdi)
+ mov %rdx, 5(%rdi)
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Fill14):
+ mov %rdx, (%rdi)
+ mov %rdx, 6(%rdi)
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Fill15):
+ vmovdqu %xmm0, -1(%rdi)
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Fill16):
+ vmovdqu %xmm0, (%rdi)
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Fill17):
+ vmovdqu %xmm0, (%rdi)
+ mov %dl, 16(%rdi)
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Fill18):
+ vmovdqu %xmm0, (%rdi)
+ mov %dx, 16(%rdi)
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Fill19):
+ vmovdqu %xmm0, (%rdi)
+ mov %edx, 15(%rdi)
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Fill20):
+ vmovdqu %xmm0, (%rdi)
+ mov %edx, 16(%rdi)
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Fill21):
+ vmovdqu %xmm0, (%rdi)
+ mov %edx, 16(%rdi)
+ mov %dl, 20(%rdi)
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Fill22):
+ vmovdqu %xmm0, (%rdi)
+ mov %edx, 16(%rdi)
+ mov %dx, 20(%rdi)
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Fill23):
+ vmovdqu %xmm0, (%rdi)
+ mov %rdx, 15(%rdi)
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Fill24):
+ vmovdqu %xmm0, (%rdi)
+ mov %rdx, 16(%rdi)
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Fill25):
+ vmovdqu %xmm0, (%rdi)
+ mov %rdx, 16(%rdi)
+ mov %dl, 24(%rdi)
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Fill26):
+ vmovdqu %xmm0, (%rdi)
+ mov %rdx, 16(%rdi)
+ mov %dx, 24(%rdi)
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Fill27):
+ vmovdqu %xmm0, (%rdi)
+ mov %rdx, 16(%rdi)
+ mov %edx, 23(%rdi)
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Fill28):
+ vmovdqu %xmm0, (%rdi)
+ mov %rdx, 16(%rdi)
+ mov %edx, 24(%rdi)
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Fill29):
+ vmovdqu %xmm0, (%rdi)
+ mov %rdx, 16(%rdi)
+ mov %rdx, 21(%rdi)
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Fill30):
+ vmovdqu %xmm0, (%rdi)
+ mov %rdx, 16(%rdi)
+ mov %rdx, 22(%rdi)
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Fill31):
+ vmovdqu %ymm0, -1(%rdi)
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Fill32):
+ vmovdqu %ymm0, (%rdi)
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(CopyVecSizeUnalignedVec2):
+ vmovdqu %ymm2, (%rdi, %rcx)
+
+ .p2align 4
+L(CopyVecSizeVecExit):
+ bsf %rdx, %rdx
+ add $(VEC_SIZE - 1), %r8
+ add %rcx, %rdi
+# ifdef USE_AS_STPCPY
+ lea (%rdi, %rdx), %rax
+# endif
+ sub %rdx, %r8
+ lea 1(%rdi, %rdx), %rdi
+
+ .p2align 4
+L(StrncpyFillTailWithZero):
+ vpxor %xmm0, %xmm0, %xmm0
+ xor %rdx, %rdx
+ sub $VEC_SIZE, %r8
+ jbe L(StrncpyFillExit)
+
+ vmovdqu %ymm0, (%rdi)
+ add $VEC_SIZE, %rdi
+
+ mov %rdi, %rsi
+ and $(VEC_SIZE - 1), %rsi
+ sub %rsi, %rdi
+ add %rsi, %r8
+ sub $(VEC_SIZE * 4), %r8
+ jb L(StrncpyFillLessFourVecSize)
+
+L(StrncpyFillLoopVmovdqa):
+ vmovdqa %ymm0, (%rdi)
+ vmovdqa %ymm0, VEC_SIZE(%rdi)
+ vmovdqa %ymm0, (VEC_SIZE * 2)(%rdi)
+ vmovdqa %ymm0, (VEC_SIZE * 3)(%rdi)
+ add $(VEC_SIZE * 4), %rdi
+ sub $(VEC_SIZE * 4), %r8
+ jae L(StrncpyFillLoopVmovdqa)
+
+L(StrncpyFillLessFourVecSize):
+ add $(VEC_SIZE * 2), %r8
+ jl L(StrncpyFillLessTwoVecSize)
+ vmovdqa %ymm0, (%rdi)
+ vmovdqa %ymm0, VEC_SIZE(%rdi)
+ add $(VEC_SIZE * 2), %rdi
+ sub $VEC_SIZE, %r8
+ jl L(StrncpyFillExit)
+ vmovdqa %ymm0, (%rdi)
+ add $VEC_SIZE, %rdi
+ BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
+
+L(StrncpyFillLessTwoVecSize):
+ add $VEC_SIZE, %r8
+ jl L(StrncpyFillExit)
+ vmovdqa %ymm0, (%rdi)
+ add $VEC_SIZE, %rdi
+ BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
+
+L(StrncpyFillExit):
+ add $VEC_SIZE, %r8
+ BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
+
+/* end of ifndef USE_AS_STRCAT */
+# endif
+
+ .p2align 4
+L(UnalignedLeaveCase2OrCase3):
+ test %rdx, %rdx
+ jnz L(UnalignedFourVecSizeLeaveCase2)
+L(UnalignedFourVecSizeLeaveCase3):
+ lea (VEC_SIZE * 4)(%r8), %rcx
+ and $-VEC_SIZE, %rcx
+ add $(VEC_SIZE * 3), %r8
+ jl L(CopyVecSizeCase3)
+ vmovdqu %ymm4, (%rdi)
+ sub $VEC_SIZE, %r8
+ jb L(CopyVecSizeCase3)
+ vmovdqu %ymm5, VEC_SIZE(%rdi)
+ sub $VEC_SIZE, %r8
+ jb L(CopyVecSizeCase3)
+ vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
+ sub $VEC_SIZE, %r8
+ jb L(CopyVecSizeCase3)
+ vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi)
+# ifdef USE_AS_STPCPY
+ lea (VEC_SIZE * 4)(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, (VEC_SIZE * 4)(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(UnalignedFourVecSizeLeaveCase2):
+ xor %ecx, %ecx
+ vpcmpeqb %ymm4, %ymm0, %ymm0
+ vpmovmskb %ymm0, %rdx
+ add $(VEC_SIZE * 3), %r8
+ jle L(CopyVecSizeCase2OrCase3)
+ test %rdx, %rdx
+# ifndef USE_AS_STRCAT
+ jnz L(CopyVecSizeUnalignedVec4)
+# else
+ jnz L(CopyVecSize)
+# endif
+ vpcmpeqb %ymm5, %ymm0, %ymm0
+ vpmovmskb %ymm0, %rdx
+ vmovdqu %ymm4, (%rdi)
+ add $VEC_SIZE, %rcx
+ sub $VEC_SIZE, %r8
+ jbe L(CopyVecSizeCase2OrCase3)
+ test %rdx, %rdx
+# ifndef USE_AS_STRCAT
+ jnz L(CopyVecSizeUnalignedVec5)
+# else
+ jnz L(CopyVecSize)
+# endif
+
+ vpcmpeqb %ymm6, %ymm0, %ymm0
+ vpmovmskb %ymm0, %rdx
+ vmovdqu %ymm5, VEC_SIZE(%rdi)
+ add $VEC_SIZE, %rcx
+ sub $VEC_SIZE, %r8
+ jbe L(CopyVecSizeCase2OrCase3)
+ test %rdx, %rdx
+# ifndef USE_AS_STRCAT
+ jnz L(CopyVecSizeUnalignedVec6)
+# else
+ jnz L(CopyVecSize)
+# endif
+
+ vpcmpeqb %ymm7, %ymm0, %ymm0
+ vpmovmskb %ymm0, %rdx
+ vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
+ lea VEC_SIZE(%rdi, %rcx), %rdi
+ lea VEC_SIZE(%rsi, %rcx), %rsi
+ bsf %rdx, %rdx
+ cmp %r8, %rdx
+ jb L(CopyVecSizeExit)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+ .p2align 4
+L(ExitZero):
+# ifndef USE_AS_STRCAT
+ mov %rdi, %rax
+# endif
+ VZEROUPPER
+ ret
+
+# endif
+
+# ifndef USE_AS_STRCAT
+END (STRCPY)
+# else
+END (STRCAT)
+# endif
+ .p2align 4
+ .section .rodata
+L(ExitTable):
+ .int JMPTBL(L(Exit1), L(ExitTable))
+ .int JMPTBL(L(Exit2), L(ExitTable))
+ .int JMPTBL(L(Exit3), L(ExitTable))
+ .int JMPTBL(L(Exit4), L(ExitTable))
+ .int JMPTBL(L(Exit5), L(ExitTable))
+ .int JMPTBL(L(Exit6), L(ExitTable))
+ .int JMPTBL(L(Exit7), L(ExitTable))
+ .int JMPTBL(L(Exit8), L(ExitTable))
+ .int JMPTBL(L(Exit9), L(ExitTable))
+ .int JMPTBL(L(Exit10), L(ExitTable))
+ .int JMPTBL(L(Exit11), L(ExitTable))
+ .int JMPTBL(L(Exit12), L(ExitTable))
+ .int JMPTBL(L(Exit13), L(ExitTable))
+ .int JMPTBL(L(Exit14), L(ExitTable))
+ .int JMPTBL(L(Exit15), L(ExitTable))
+ .int JMPTBL(L(Exit16), L(ExitTable))
+ .int JMPTBL(L(Exit17), L(ExitTable))
+ .int JMPTBL(L(Exit18), L(ExitTable))
+ .int JMPTBL(L(Exit19), L(ExitTable))
+ .int JMPTBL(L(Exit20), L(ExitTable))
+ .int JMPTBL(L(Exit21), L(ExitTable))
+ .int JMPTBL(L(Exit22), L(ExitTable))
+ .int JMPTBL(L(Exit23), L(ExitTable))
+ .int JMPTBL(L(Exit24), L(ExitTable))
+ .int JMPTBL(L(Exit25), L(ExitTable))
+ .int JMPTBL(L(Exit26), L(ExitTable))
+ .int JMPTBL(L(Exit27), L(ExitTable))
+ .int JMPTBL(L(Exit28), L(ExitTable))
+ .int JMPTBL(L(Exit29), L(ExitTable))
+ .int JMPTBL(L(Exit30), L(ExitTable))
+ .int JMPTBL(L(Exit31), L(ExitTable))
+ .int JMPTBL(L(Exit32), L(ExitTable))
+ .int JMPTBL(L(Exit33), L(ExitTable))
+ .int JMPTBL(L(Exit34), L(ExitTable))
+ .int JMPTBL(L(Exit35), L(ExitTable))
+ .int JMPTBL(L(Exit36), L(ExitTable))
+ .int JMPTBL(L(Exit37), L(ExitTable))
+ .int JMPTBL(L(Exit38), L(ExitTable))
+ .int JMPTBL(L(Exit39), L(ExitTable))
+ .int JMPTBL(L(Exit40), L(ExitTable))
+ .int JMPTBL(L(Exit41), L(ExitTable))
+ .int JMPTBL(L(Exit42), L(ExitTable))
+ .int JMPTBL(L(Exit43), L(ExitTable))
+ .int JMPTBL(L(Exit44), L(ExitTable))
+ .int JMPTBL(L(Exit45), L(ExitTable))
+ .int JMPTBL(L(Exit46), L(ExitTable))
+ .int JMPTBL(L(Exit47), L(ExitTable))
+ .int JMPTBL(L(Exit48), L(ExitTable))
+ .int JMPTBL(L(Exit49), L(ExitTable))
+ .int JMPTBL(L(Exit50), L(ExitTable))
+ .int JMPTBL(L(Exit51), L(ExitTable))
+ .int JMPTBL(L(Exit52), L(ExitTable))
+ .int JMPTBL(L(Exit53), L(ExitTable))
+ .int JMPTBL(L(Exit54), L(ExitTable))
+ .int JMPTBL(L(Exit55), L(ExitTable))
+ .int JMPTBL(L(Exit56), L(ExitTable))
+ .int JMPTBL(L(Exit57), L(ExitTable))
+ .int JMPTBL(L(Exit58), L(ExitTable))
+ .int JMPTBL(L(Exit59), L(ExitTable))
+ .int JMPTBL(L(Exit60), L(ExitTable))
+ .int JMPTBL(L(Exit61), L(ExitTable))
+ .int JMPTBL(L(Exit62), L(ExitTable))
+ .int JMPTBL(L(Exit63), L(ExitTable))
+ .int JMPTBL(L(Exit64), L(ExitTable))
+# ifdef USE_AS_STRNCPY
+L(ExitStrncpyTable):
+ .int JMPTBL(L(StrncpyExit0), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit1), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit2), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit3), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit4), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit5), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit6), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit7), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit8), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit9), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit10), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit11), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit12), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit13), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit14), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit15), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit16), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit17), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit18), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit19), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit20), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit21), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit22), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit23), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit24), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit25), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit26), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit27), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit28), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit29), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit30), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit31), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit32), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit34), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit35), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit36), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit37), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit38), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit39), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit40), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit41), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit42), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit43), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit44), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit45), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit46), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit47), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit48), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit49), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit50), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit51), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit52), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit53), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit54), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit55), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit56), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit57), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit58), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit59), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit60), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit61), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit62), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit63), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit64), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit65), L(ExitStrncpyTable))
+# ifndef USE_AS_STRCAT
+ .p2align 4
+L(FillTable):
+ .int JMPTBL(L(Fill0), L(FillTable))
+ .int JMPTBL(L(Fill1), L(FillTable))
+ .int JMPTBL(L(Fill2), L(FillTable))
+ .int JMPTBL(L(Fill3), L(FillTable))
+ .int JMPTBL(L(Fill4), L(FillTable))
+ .int JMPTBL(L(Fill5), L(FillTable))
+ .int JMPTBL(L(Fill6), L(FillTable))
+ .int JMPTBL(L(Fill7), L(FillTable))
+ .int JMPTBL(L(Fill8), L(FillTable))
+ .int JMPTBL(L(Fill9), L(FillTable))
+ .int JMPTBL(L(Fill10), L(FillTable))
+ .int JMPTBL(L(Fill11), L(FillTable))
+ .int JMPTBL(L(Fill12), L(FillTable))
+ .int JMPTBL(L(Fill13), L(FillTable))
+ .int JMPTBL(L(Fill14), L(FillTable))
+ .int JMPTBL(L(Fill15), L(FillTable))
+ .int JMPTBL(L(Fill16), L(FillTable))
+ .int JMPTBL(L(Fill17), L(FillTable))
+ .int JMPTBL(L(Fill18), L(FillTable))
+ .int JMPTBL(L(Fill19), L(FillTable))
+ .int JMPTBL(L(Fill20), L(FillTable))
+ .int JMPTBL(L(Fill21), L(FillTable))
+ .int JMPTBL(L(Fill22), L(FillTable))
+ .int JMPTBL(L(Fill23), L(FillTable))
+ .int JMPTBL(L(Fill24), L(FillTable))
+ .int JMPTBL(L(Fill25), L(FillTable))
+ .int JMPTBL(L(Fill26), L(FillTable))
+ .int JMPTBL(L(Fill27), L(FillTable))
+ .int JMPTBL(L(Fill28), L(FillTable))
+ .int JMPTBL(L(Fill29), L(FillTable))
+ .int JMPTBL(L(Fill30), L(FillTable))
+ .int JMPTBL(L(Fill31), L(FillTable))
+ .int JMPTBL(L(Fill32), L(FillTable))
+# endif
+# endif
+#endif
diff --git a/sysdeps/x86_64/multiarch/strcpy.c b/sysdeps/x86_64/multiarch/strcpy.c
index 12e0e3f..ef6858e 100644
--- a/sysdeps/x86_64/multiarch/strcpy.c
+++ b/sysdeps/x86_64/multiarch/strcpy.c
@@ -24,7 +24,7 @@
# undef strcpy
# define SYMBOL_NAME strcpy
-# include "ifunc-unaligned-ssse3.h"
+# include "ifunc-unaligned.h"
libc_ifunc_redirected (__redirect_strcpy, strcpy, IFUNC_SELECTOR ());
diff --git a/sysdeps/x86_64/multiarch/strncat-avx2.S b/sysdeps/x86_64/multiarch/strncat-avx2.S
new file mode 100644
index 0000000..bfefa65
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncat-avx2.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNCAT
+#define STRCAT __strncat_avx2
+#include "strcat-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strncat.c b/sysdeps/x86_64/multiarch/strncat.c
index 841c165..e7757cd 100644
--- a/sysdeps/x86_64/multiarch/strncat.c
+++ b/sysdeps/x86_64/multiarch/strncat.c
@@ -24,7 +24,7 @@
# undef strncat
# define SYMBOL_NAME strncat
-# include "ifunc-unaligned-ssse3.h"
+# include "ifunc-unaligned.h"
libc_ifunc_redirected (__redirect_strncat, strncat, IFUNC_SELECTOR ());
strong_alias (strncat, __strncat);
diff --git a/sysdeps/x86_64/multiarch/strncpy-avx2.S b/sysdeps/x86_64/multiarch/strncpy-avx2.S
new file mode 100644
index 0000000..9ef8c87
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncpy-avx2.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNCPY
+#define STRCPY __strncpy_avx2
+#include "strcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strncpy.c b/sysdeps/x86_64/multiarch/strncpy.c
index 3c3de8b..d6d9dc7 100644
--- a/sysdeps/x86_64/multiarch/strncpy.c
+++ b/sysdeps/x86_64/multiarch/strncpy.c
@@ -24,7 +24,7 @@
# undef strncpy
# define SYMBOL_NAME strncpy
-# include "ifunc-unaligned-ssse3.h"
+# include "ifunc-unaligned.h"
libc_ifunc_redirected (__redirect_strncpy, strncpy, IFUNC_SELECTOR ());
-----------------------------------------------------------------------
hooks/post-receive
--
GNU C Library master sources