This is the mail archive of the glibc-cvs@sourceware.org mailing list for the glibc project.


GNU C Library master sources branch hjl/strcpy-avx2-cleanup-v3-direct-branches created. glibc-2.27.9000-656-ge58ce5f


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".

The branch, hjl/strcpy-avx2-cleanup-v3-direct-branches has been created
        at  e58ce5fabbb5635db944f2232155dc204fd45501 (commit)

- Log -----------------------------------------------------------------
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=e58ce5fabbb5635db944f2232155dc204fd45501

commit e58ce5fabbb5635db944f2232155dc204fd45501
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Wed Sep 26 15:33:22 2018 -0700

    Merge L(Exit1)

diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 84fba69..7eeb255 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -368,11 +368,18 @@ L(CopyVecSizeExit):
 	jae	L(Exit5_7)
 	cmp	$3, %edx
 	je	L(Exit4)
-	cmp	$2, %edx
-	je	L(Exit3)
-	cmp	$0, %edx
-	ja	L(Exit2)
-	je	L(Exit1)
+	cmp	$1, %edx
+	ja	L(Exit3)
+	je	L(Exit2)
+	movb	$0, (%rdi)
+# ifdef USE_AS_STPCPY
+	lea	(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$1, %r8
+	lea	1(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
 	VZEROUPPER
 	ret
 
@@ -729,20 +736,6 @@ L(CopyVecSizeTail1Case2OrCase3):
 /*------------End labels regarding with copying 1-VEC_SIZE bytes--and 1-(VEC_SIZE*2) bytes----*/
 
 	.p2align 4
-L(Exit1):
-	mov	%dh, (%rdi)
-# ifdef USE_AS_STPCPY
-	lea	(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$1, %r8
-	lea	1(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
 L(Exit2):
 	mov	(%rsi), %dx
 	mov	%dx, (%rdi)
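
A note on the dispatch arithmetic here: %edx holds the index of the terminating NUL, and once `je L(Exit4)` has taken the %edx == 3 case, only 0, 1 and 2 remain, so a single `cmp $1, %edx` covers all three (`ja` for 2, `je` for 1, fall-through for 0). The fall-through is the old L(Exit1) body inlined, with an explicit `movb $0, (%rdi)` in place of the original `mov %dh, (%rdi)` (equivalent here, since %edx -- and hence %dh -- is zero on this path, but no longer reliant on that). Annotated sketch of the new tail:

	cmp	$1, %edx
	ja	L(Exit3)		/* %edx == 2: copy 3 bytes */
	je	L(Exit2)		/* %edx == 1: copy 2 bytes */
	movb	$0, (%rdi)		/* %edx == 0: just the NUL */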

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=57cb5d8a1327ee54fd441b6e3ec4f1fa4b51127a

commit 57cb5d8a1327ee54fd441b6e3ec4f1fa4b51127a
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Wed Sep 26 15:29:26 2018 -0700

    More L(CopyVecSizeExit) use

diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 7efcec2..84fba69 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -391,31 +391,7 @@ L(CopyTwoVecSize):
 	add	%rcx, %rsi
 	add	$VEC_SIZE, %edx
 	sub	%ecx, %edx
-	cmp	$63, %edx
-	je	L(Exit64)
-	cmp	$32, %edx
-	jae	L(Exit33_63)
-	cmp	$31, %edx
-	je	L(Exit32)
-	cmp	$16, %edx
-	jae	L(Exit17_31)
-	cmp	$15, %edx
-	je	L(Exit16)
-	cmp	$8, %edx
-	jae	L(Exit9_15)
-	cmp	$7, %edx
-	je	L(Exit8)
-	cmp	$4, %edx
-	jae	L(Exit5_7)
-	cmp	$3, %edx
-	je	L(Exit4)
-	cmp	$2, %edx
-	je	L(Exit3)
-	cmp	$0, %edx
-	ja	L(Exit2)
-	je	L(Exit1)
-	VZEROUPPER
-	ret
+	jmp	L(CopyVecSizeExit)
 
 	.p2align 4
 L(CopyVecSizeUnaligned_0):
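
This commit and its neighbors are tail-merging: the per-site compare ladders (introduced further down in this series, when the jump tables were converted to direct branches) are byte-for-byte identical, so each duplicate collapses into one direct jump to the shared copy at L(CopyVecSizeExit). The shape of the change, schematically:

	/* Before: every exit site carried its own copy of the ladder
	   (eleven compares, twelve conditional branches).  */
	cmp	$63, %edx
	je	L(Exit64)
	/* ... the rest of the ladder, then VZEROUPPER; ret ...  */

	/* After: one shared ladder, reached by a direct jump.  */
	jmp	L(CopyVecSizeExit)

This trades one extra taken jump on the exit path for a much smaller code footprint.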

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=08e3f3162ba71a22298179a2c5c8476287359ea2

commit 08e3f3162ba71a22298179a2c5c8476287359ea2
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Wed Sep 26 15:26:21 2018 -0700

    Merge L(CopyVecSizeTail1)

diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 85c284a..7efcec2 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -347,6 +347,7 @@ L(CopyVecSize):
 # endif
 L(CopyVecSizeTail):
 	add	%rcx, %rsi
+L(CopyVecSizeTail1):
 	bsf	%edx, %edx
 L(CopyVecSizeExit):
 	cmp	$63, %edx
@@ -382,33 +383,7 @@ L(CopyTwoVecSize1):
 # if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$VEC_SIZE, %r8
 # endif
-L(CopyVecSizeTail1):
-	bsf	%edx, %edx
-	cmp	$63, %edx
-	je	L(Exit64)
-	cmp	$32, %edx
-	jae	L(Exit33_63)
-	cmp	$31, %edx
-	je	L(Exit32)
-	cmp	$16, %edx
-	jae	L(Exit17_31)
-	cmp	$15, %edx
-	je	L(Exit16)
-	cmp	$8, %edx
-	jae	L(Exit9_15)
-	cmp	$7, %edx
-	je	L(Exit8)
-	cmp	$4, %edx
-	jae	L(Exit5_7)
-	cmp	$3, %edx
-	je	L(Exit4)
-	cmp	$2, %edx
-	je	L(Exit3)
-	cmp	$0, %edx
-	ja	L(Exit2)
-	je	L(Exit1)
-	VZEROUPPER
-	ret
+	jmp	L(CopyVecSizeTail1)
 
 	.p2align 4
 L(CopyTwoVecSize):

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=c83b5a852f0a7ff09dbe9490e169cb30397ad17a

commit c83b5a852f0a7ff09dbe9490e169cb30397ad17a
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Wed Sep 26 15:23:41 2018 -0700

    Use L(CopyVecSizeExit)

diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 8dfb2d3..85c284a 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -295,32 +295,7 @@ L(UnalignedFourVecSizeLeave):
 # else
 	add	$(VEC_SIZE * 3), %rsi
 	add	$(VEC_SIZE * 3), %rdi
-	cmp	$63, %edx
-	je	L(Exit64)
-	cmp	$32, %edx
-	jae	L(Exit33_63)
-	cmp	$31, %edx
-	je	L(Exit32)
-	cmp	$16, %edx
-	jae	L(Exit17_31)
-	cmp	$15, %edx
-	je	L(Exit16)
-	cmp	$8, %edx
-	jae	L(Exit9_15)
-	cmp	$7, %edx
-	je	L(Exit8)
-	cmp	$4, %edx
-	jae	L(Exit5_7)
-	cmp	$3, %edx
-	je	L(Exit4)
-	cmp	$2, %edx
-	je	L(Exit3)
-	cmp	$0, %edx
-	ja	L(Exit2)
-	je	L(Exit1)
-	VZEROUPPER
-	ret
-
+	jmp	L(CopyVecSizeExit)
 # endif
 
 /* If source address alignment == destination address alignment */

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=95cc78baad0e4544967659bab8a0bec6863fe4ce

commit 95cc78baad0e4544967659bab8a0bec6863fe4ce
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Wed Sep 26 15:20:47 2018 -0700

    Merge L(CopyVecSizeExit)

diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index d3b4590..8dfb2d3 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -369,39 +369,11 @@ L(SourceStringAlignmentLessTwoVecSize):
 	.p2align 4
 L(CopyVecSize):
 	add	%rcx, %rdi
-	add	%rcx, %rsi
-	bsf	%edx, %edx
 # endif
-L(CopyVecSizeExit):
-	cmp	$63, %edx
-	je	L(Exit64)
-	cmp	$32, %edx
-	jae	L(Exit33_63)
-	cmp	$31, %edx
-	je	L(Exit32)
-	cmp	$16, %edx
-	jae	L(Exit17_31)
-	cmp	$15, %edx
-	je	L(Exit16)
-	cmp	$8, %edx
-	jae	L(Exit9_15)
-	cmp	$7, %edx
-	je	L(Exit8)
-	cmp	$4, %edx
-	jae	L(Exit5_7)
-	cmp	$3, %edx
-	je	L(Exit4)
-	cmp	$2, %edx
-	je	L(Exit3)
-	cmp	$0, %edx
-	ja	L(Exit2)
-	je	L(Exit1)
-	VZEROUPPER
-	ret
-	.p2align 4
 L(CopyVecSizeTail):
 	add	%rcx, %rsi
 	bsf	%edx, %edx
+L(CopyVecSizeExit):
 	cmp	$63, %edx
 	je	L(Exit64)
 	cmp	$32, %edx

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=14d8b1cae5d1528a05f0e396d7bd2fa96aa2f64b

commit 14d8b1cae5d1528a05f0e396d7bd2fa96aa2f64b
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Wed Sep 26 15:16:28 2018 -0700

    Merge L(CopyVecSizeExit)

diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 00dfeb9..d3b4590 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -371,6 +371,8 @@ L(CopyVecSize):
 	add	%rcx, %rdi
 	add	%rcx, %rsi
 	bsf	%edx, %edx
+# endif
+L(CopyVecSizeExit):
 	cmp	$63, %edx
 	je	L(Exit64)
 	cmp	$32, %edx
@@ -396,7 +398,6 @@ L(CopyVecSize):
 	je	L(Exit1)
 	VZEROUPPER
 	ret
-# endif
 	.p2align 4
 L(CopyVecSizeTail):
 	add	%rcx, %rsi
@@ -507,31 +508,7 @@ L(CopyVecSizeUnaligned_0):
 	lea	1(%rdi, %rdx), %rdi
 	jmp	L(StrncpyFillTailWithZero)
 # else
-	cmp	$63, %edx
-	je	L(Exit64)
-	cmp	$32, %edx
-	jae	L(Exit33_63)
-	cmp	$31, %edx
-	je	L(Exit32)
-	cmp	$16, %edx
-	jae	L(Exit17_31)
-	cmp	$15, %edx
-	je	L(Exit16)
-	cmp	$8, %edx
-	jae	L(Exit9_15)
-	cmp	$7, %edx
-	je	L(Exit8)
-	cmp	$4, %edx
-	jae	L(Exit5_7)
-	cmp	$3, %edx
-	je	L(Exit4)
-	cmp	$2, %edx
-	je	L(Exit3)
-	cmp	$0, %edx
-	ja	L(Exit2)
-	je	L(Exit1)
-	VZEROUPPER
-	ret
+	jmp	L(CopyVecSizeExit)
 # endif
 
 	.p2align 4
@@ -550,31 +527,7 @@ L(CopyVecSizeUnaligned_16):
 # else
 	add	$VEC_SIZE, %rsi
 	add	$VEC_SIZE, %rdi
-	cmp	$63, %edx
-	je	L(Exit64)
-	cmp	$32, %edx
-	jae	L(Exit33_63)
-	cmp	$31, %edx
-	je	L(Exit32)
-	cmp	$16, %edx
-	jae	L(Exit17_31)
-	cmp	$15, %edx
-	je	L(Exit16)
-	cmp	$8, %edx
-	jae	L(Exit9_15)
-	cmp	$7, %edx
-	je	L(Exit8)
-	cmp	$4, %edx
-	jae	L(Exit5_7)
-	cmp	$3, %edx
-	je	L(Exit4)
-	cmp	$2, %edx
-	je	L(Exit3)
-	cmp	$0, %edx
-	ja	L(Exit2)
-	je	L(Exit1)
-	VZEROUPPER
-	ret
+	jmp	L(CopyVecSizeExit)
 # endif
 
 	.p2align 4
@@ -594,31 +547,7 @@ L(CopyVecSizeUnaligned_32):
 # else
 	add	$(VEC_SIZE * 2), %rsi
 	add	$(VEC_SIZE * 2), %rdi
-	cmp	$63, %edx
-	je	L(Exit64)
-	cmp	$32, %edx
-	jae	L(Exit33_63)
-	cmp	$31, %edx
-	je	L(Exit32)
-	cmp	$16, %edx
-	jae	L(Exit17_31)
-	cmp	$15, %edx
-	je	L(Exit16)
-	cmp	$8, %edx
-	jae	L(Exit9_15)
-	cmp	$7, %edx
-	je	L(Exit8)
-	cmp	$4, %edx
-	jae	L(Exit5_7)
-	cmp	$3, %edx
-	je	L(Exit4)
-	cmp	$2, %edx
-	je	L(Exit3)
-	cmp	$0, %edx
-	ja	L(Exit2)
-	je	L(Exit1)
-	VZEROUPPER
-	ret
+	jmp	L(CopyVecSizeExit)
 # endif
 
 # ifdef USE_AS_STRNCPY
@@ -649,34 +578,6 @@ L(CopyVecSizeUnalignedVec1):
 	jmp	L(CopyVecSizeVecExit)
 #  endif
 
-	.p2align 4
-L(CopyVecSizeExit):
-	cmp	$63, %edx
-	je	L(Exit64)
-	cmp	$32, %edx
-	jae	L(Exit33_63)
-	cmp	$31, %edx
-	je	L(Exit32)
-	cmp	$16, %edx
-	jae	L(Exit17_31)
-	cmp	$15, %edx
-	je	L(Exit16)
-	cmp	$8, %edx
-	jae	L(Exit9_15)
-	cmp	$7, %edx
-	je	L(Exit8)
-	cmp	$4, %edx
-	jae	L(Exit5_7)
-	cmp	$3, %edx
-	je	L(Exit4)
-	cmp	$2, %edx
-	je	L(Exit3)
-	cmp	$0, %edx
-	ja	L(Exit2)
-	je	L(Exit1)
-	VZEROUPPER
-	ret
-
 /* Case2 */
 
 	.p2align 4

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=b98f38b0e21a732eba99827fbf10382ed9900d68

commit b98f38b0e21a732eba99827fbf10382ed9900d68
Author: Leonardo Sandoval <leonardo.sandoval.gonzalez@linux.intel.com>
Date:   Wed Sep 26 12:35:38 2018 -0500

    use 32-bits register

diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 408c038..00dfeb9 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -1392,19 +1392,19 @@ L(StrncpyFillLessFourVecSize):
 	jl	L(StrncpyFillExit)
 	vmovdqa %ymmZ, (%rdi)
 	add	$VEC_SIZE, %rdi
-	cmp	$32, %r8
+	cmp	$32, %r8d
 	je	L(Fill32)
-	cmp	$17, %r8
+	cmp	$17, %r8d
 	jae	L(Fill17_31)
-	cmp	$15, %r8
+	cmp	$15, %r8d
 	jae	L(Fill15_16)
-	cmp	$8, %r8
+	cmp	$8, %r8d
 	jae	L(Fill8_14)
-	cmp	$4, %r8
+	cmp	$4, %r8d
 	jae	L(Fill4_7)
-	cmp	$3, %r8
+	cmp	$3, %r8d
 	je	L(Fill3)
-	cmp	$1, %r8
+	cmp	$1, %r8d
 	ja	L(Fill2)
 	je	L(Fill1)
 	VZEROUPPER
@@ -1416,19 +1416,19 @@ L(StrncpyFillLessTwoVecSize):
 	jl	L(StrncpyFillExit)
 	vmovdqa %ymmZ, (%rdi)
 	add	$VEC_SIZE, %rdi
-	cmp	$32, %r8
+	cmp	$32, %r8d
 	je	L(Fill32)
-	cmp	$17, %r8
+	cmp	$17, %r8d
 	jae	L(Fill17_31)
-	cmp	$15, %r8
+	cmp	$15, %r8d
 	jae	L(Fill15_16)
-	cmp	$8, %r8
+	cmp	$8, %r8d
 	jae	L(Fill8_14)
-	cmp	$4, %r8
+	cmp	$4, %r8d
 	jae	L(Fill4_7)
-	cmp	$3, %r8
+	cmp	$3, %r8d
 	je	L(Fill3)
-	cmp	$1, %r8
+	cmp	$1, %r8d
 	ja	L(Fill2)
 	je	L(Fill1)
 	VZEROUPPER
@@ -1436,19 +1436,19 @@ L(StrncpyFillLessTwoVecSize):
 
 L(StrncpyFillExit):
 	add	$VEC_SIZE, %r8
-	cmp	$32, %r8
+	cmp	$32, %r8d
 	je	L(Fill32)
-	cmp	$17, %r8
+	cmp	$17, %r8d
 	jae	L(Fill17_31)
-	cmp	$15, %r8
+	cmp	$15, %r8d
 	jae	L(Fill15_16)
-	cmp	$8, %r8
+	cmp	$8, %r8d
 	jae	L(Fill8_14)
-	cmp	$4, %r8
+	cmp	$4, %r8d
 	jae	L(Fill4_7)
-	cmp	$3, %r8
+	cmp	$3, %r8d
 	je	L(Fill3)
-	cmp	$1, %r8
+	cmp	$1, %r8d
 	ja	L(Fill2)
 	je	L(Fill1)
 	VZEROUPPER
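
%r8 carries the remaining byte count, which on these paths fits in 32 bits, so comparing %r8d against the same immediates is equivalent. The likely intent is encoding size: dropping REX.W shortens the instruction for the legacy registers, though %r8-%r15 still need a REX prefix just to name the register, so there the length is unchanged and only the W bit differs. Illustrative encodings (GNU as, AT&T syntax):

	cmp	$65, %rcx	/* 48 83 f9 41 -- REX.W + cmp r/m64, imm8 */
	cmp	$65, %ecx	/* 83 f9 41    -- one byte shorter */
	cmp	$65, %r8	/* 49 83 f8 41 */
	cmp	$65, %r8d	/* 41 83 f8 41 -- same length, REX.W clear */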

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=532b9eda8b8ae7d5e87e3b7c9a0bf8a2496a02b0

commit 532b9eda8b8ae7d5e87e3b7c9a0bf8a2496a02b0
Author: Leonardo Sandoval <leonardo.sandoval.gonzalez@linux.intel.com>
Date:   Wed Sep 26 12:33:07 2018 -0500

    use 32-bits register

diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 0c4a1e6..408c038 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -687,25 +687,25 @@ L(CopyVecSizeCase2):
 	bsf	%edx, %edx
 	cmp	%r8d, %edx
 	jb	L(CopyVecSizeExit)
-	cmp	$65, %r8
+	cmp	$65, %r8d
 	je	L(StrncpyExit65)
-	cmp	$33, %r8
+	cmp	$33, %r8d
 	jae	L(StrncpyExit33_64)
-	cmp	$17, %r8
+	cmp	$17, %r8d
 	jae	L(StrncpyExit17_32)
-	cmp	$16, %r8
+	cmp	$16, %r8d
 	je	L(StrncpyExit16)
-	cmp	$15, %r8
+	cmp	$15, %r8d
 	je	L(StrncpyExit15)
-	cmp	$8, %r8
+	cmp	$8, %r8d
 	jae	L(StrncpyExit8_14)
-	cmp	$4, %r8
+	cmp	$4, %r8d
 	jae	L(StrncpyExit4_7)
-	cmp	$3, %r8
+	cmp	$3, %r8d
 	je	L(StrncpyExit3)
-	cmp	$2, %r8
+	cmp	$2, %r8d
 	je	L(StrncpyExit2)
-	cmp	$0, %r8
+	cmp	$0, %r8d
 	ja	L(StrncpyExit1)
 	je	L(StrncpyExit0)
 	VZEROUPPER
@@ -719,25 +719,25 @@ L(CopyTwoVecSizeCase2):
 	sub	%ecx, %edx
 	cmp	%r8d, %edx
 	jb	L(CopyVecSizeExit)
-	cmp	$65, %r8
+	cmp	$65, %r8d
 	je	L(StrncpyExit65)
-	cmp	$33, %r8
+	cmp	$33, %r8d
 	jae	L(StrncpyExit33_64)
-	cmp	$17, %r8
+	cmp	$17, %r8d
 	jae	L(StrncpyExit17_32)
-	cmp	$16, %r8
+	cmp	$16, %r8d
 	je	L(StrncpyExit16)
-	cmp	$15, %r8
+	cmp	$15, %r8d
 	je	L(StrncpyExit15)
-	cmp	$8, %r8
+	cmp	$8, %r8d
 	jae	L(StrncpyExit8_14)
-	cmp	$4, %r8
+	cmp	$4, %r8d
 	jae	L(StrncpyExit4_7)
-	cmp	$3, %r8
+	cmp	$3, %r8d
 	je	L(StrncpyExit3)
-	cmp	$2, %r8
+	cmp	$2, %r8d
 	je	L(StrncpyExit2)
-	cmp	$0, %r8
+	cmp	$0, %r8d
 	ja	L(StrncpyExit1)
 	je	L(StrncpyExit0)
 	VZEROUPPER
@@ -749,25 +749,25 @@ L(CopyVecSizeTailCase2):
 	bsf	%edx, %edx
 	cmp	%r8d, %edx
 	jb	L(CopyVecSizeExit)
-	cmp	$65, %r8
+	cmp	$65, %r8d
 	je	L(StrncpyExit65)
-	cmp	$33, %r8
+	cmp	$33, %r8d
 	jae	L(StrncpyExit33_64)
-	cmp	$17, %r8
+	cmp	$17, %r8d
 	jae	L(StrncpyExit17_32)
-	cmp	$16, %r8
+	cmp	$16, %r8d
 	je	L(StrncpyExit16)
-	cmp	$15, %r8
+	cmp	$15, %r8d
 	je	L(StrncpyExit15)
-	cmp	$8, %r8
+	cmp	$8, %r8d
 	jae	L(StrncpyExit8_14)
-	cmp	$4, %r8
+	cmp	$4, %r8d
 	jae	L(StrncpyExit4_7)
-	cmp	$3, %r8
+	cmp	$3, %r8d
 	je	L(StrncpyExit3)
-	cmp	$2, %r8
+	cmp	$2, %r8d
 	je	L(StrncpyExit2)
-	cmp	$0, %r8
+	cmp	$0, %r8d
 	ja	L(StrncpyExit1)
 	je	L(StrncpyExit0)
 	VZEROUPPER
@@ -777,25 +777,25 @@ L(CopyVecSizeTail1Case2):
 	bsf	%edx, %edx
 	cmp	%r8d, %edx
 	jb	L(CopyVecSizeExit)
-	cmp	$65, %r8
+	cmp	$65, %r8d
 	je	L(StrncpyExit65)
-	cmp	$33, %r8
+	cmp	$33, %r8d
 	jae	L(StrncpyExit33_64)
-	cmp	$17, %r8
+	cmp	$17, %r8d
 	jae	L(StrncpyExit17_32)
-	cmp	$16, %r8
+	cmp	$16, %r8d
 	je	L(StrncpyExit16)
-	cmp	$15, %r8
+	cmp	$15, %r8d
 	je	L(StrncpyExit15)
-	cmp	$8, %r8
+	cmp	$8, %r8d
 	jae	L(StrncpyExit8_14)
-	cmp	$4, %r8
+	cmp	$4, %r8d
 	jae	L(StrncpyExit4_7)
-	cmp	$3, %r8
+	cmp	$3, %r8d
 	je	L(StrncpyExit3)
-	cmp	$2, %r8
+	cmp	$2, %r8d
 	je	L(StrncpyExit2)
-	cmp	$0, %r8
+	cmp	$0, %r8d
 	ja	L(StrncpyExit1)
 	je	L(StrncpyExit0)
 	VZEROUPPER
@@ -811,25 +811,25 @@ L(CopyVecSizeCase3):
 	add	$VEC_SIZE, %r8
 	add	%rcx, %rdi
 	add	%rcx, %rsi
-	cmp	$65, %r8
+	cmp	$65, %r8d
 	je	L(StrncpyExit65)
-	cmp	$33, %r8
+	cmp	$33, %r8d
 	jae	L(StrncpyExit33_64)
-	cmp	$17, %r8
+	cmp	$17, %r8d
 	jae	L(StrncpyExit17_32)
-	cmp	$16, %r8
+	cmp	$16, %r8d
 	je	L(StrncpyExit16)
-	cmp	$15, %r8
+	cmp	$15, %r8d
 	je	L(StrncpyExit15)
-	cmp	$8, %r8
+	cmp	$8, %r8d
 	jae	L(StrncpyExit8_14)
-	cmp	$4, %r8
+	cmp	$4, %r8d
 	jae	L(StrncpyExit4_7)
-	cmp	$3, %r8
+	cmp	$3, %r8d
 	je	L(StrncpyExit3)
-	cmp	$2, %r8
+	cmp	$2, %r8d
 	je	L(StrncpyExit2)
-	cmp	$0, %r8
+	cmp	$0, %r8d
 	ja	L(StrncpyExit1)
 	je	L(StrncpyExit0)
 	VZEROUPPER
@@ -840,25 +840,25 @@ L(CopyTwoVecSizeCase2OrCase3):
 	test	%rdx, %rdx
 	jnz	L(CopyTwoVecSizeCase2)
 	add	%rcx, %rsi
-	cmp	$65, %r8
+	cmp	$65, %r8d
 	je	L(StrncpyExit65)
-	cmp	$33, %r8
+	cmp	$33, %r8d
 	jae	L(StrncpyExit33_64)
-	cmp	$17, %r8
+	cmp	$17, %r8d
 	jae	L(StrncpyExit17_32)
-	cmp	$16, %r8
+	cmp	$16, %r8d
 	je	L(StrncpyExit16)
-	cmp	$15, %r8
+	cmp	$15, %r8d
 	je	L(StrncpyExit15)
-	cmp	$8, %r8
+	cmp	$8, %r8d
 	jae	L(StrncpyExit8_14)
-	cmp	$4, %r8
+	cmp	$4, %r8d
 	jae	L(StrncpyExit4_7)
-	cmp	$3, %r8
+	cmp	$3, %r8d
 	je	L(StrncpyExit3)
-	cmp	$2, %r8
+	cmp	$2, %r8d
 	je	L(StrncpyExit2)
-	cmp	$0, %r8
+	cmp	$0, %r8d
 	ja	L(StrncpyExit1)
 	je	L(StrncpyExit0)
 	VZEROUPPER
@@ -869,25 +869,25 @@ L(CopyVecSizeTailCase2OrCase3):
 	test	%rdx, %rdx
 	jnz	L(CopyVecSizeTailCase2)
 	add	%rcx, %rsi
-	cmp	$65, %r8
+	cmp	$65, %r8d
 	je	L(StrncpyExit65)
-	cmp	$33, %r8
+	cmp	$33, %r8d
 	jae	L(StrncpyExit33_64)
-	cmp	$17, %r8
+	cmp	$17, %r8d
 	jae	L(StrncpyExit17_32)
-	cmp	$16, %r8
+	cmp	$16, %r8d
 	je	L(StrncpyExit16)
-	cmp	$15, %r8
+	cmp	$15, %r8d
 	je	L(StrncpyExit15)
-	cmp	$8, %r8
+	cmp	$8, %r8d
 	jae	L(StrncpyExit8_14)
-	cmp	$4, %r8
+	cmp	$4, %r8d
 	jae	L(StrncpyExit4_7)
-	cmp	$3, %r8
+	cmp	$3, %r8d
 	je	L(StrncpyExit3)
-	cmp	$2, %r8
+	cmp	$2, %r8d
 	je	L(StrncpyExit2)
-	cmp	$0, %r8
+	cmp	$0, %r8d
 	ja	L(StrncpyExit1)
 	je	L(StrncpyExit0)
 	VZEROUPPER
@@ -901,25 +901,25 @@ L(CopyTwoVecSize1Case2OrCase3):
 L(CopyVecSizeTail1Case2OrCase3):
 	test	%rdx, %rdx
 	jnz	L(CopyVecSizeTail1Case2)
-	cmp	$65, %r8
+	cmp	$65, %r8d
 	je	L(StrncpyExit65)
-	cmp	$33, %r8
+	cmp	$33, %r8d
 	jae	L(StrncpyExit33_64)
-	cmp	$17, %r8
+	cmp	$17, %r8d
 	jae	L(StrncpyExit17_32)
-	cmp	$16, %r8
+	cmp	$16, %r8d
 	je	L(StrncpyExit16)
-	cmp	$15, %r8
+	cmp	$15, %r8d
 	je	L(StrncpyExit15)
-	cmp	$8, %r8
+	cmp	$8, %r8d
 	jae	L(StrncpyExit8_14)
-	cmp	$4, %r8
+	cmp	$4, %r8d
 	jae	L(StrncpyExit4_7)
-	cmp	$3, %r8
+	cmp	$3, %r8d
 	je	L(StrncpyExit3)
-	cmp	$2, %r8
+	cmp	$2, %r8d
 	je	L(StrncpyExit2)
-	cmp	$0, %r8
+	cmp	$0, %r8d
 	ja	L(StrncpyExit1)
 	je	L(StrncpyExit0)
 	VZEROUPPER
@@ -1532,25 +1532,25 @@ L(UnalignedFourVecSizeLeaveCase2):
 	bsf	%edx, %edx
 	cmp	%r8d, %edx
 	jb	L(CopyVecSizeExit)
-	cmp	$65, %r8
+	cmp	$65, %r8d
 	je	L(StrncpyExit65)
-	cmp	$33, %r8
+	cmp	$33, %r8d
 	jae	L(StrncpyExit33_64)
-	cmp	$17, %r8
+	cmp	$17, %r8d
 	jae	L(StrncpyExit17_32)
-	cmp	$16, %r8
+	cmp	$16, %r8d
 	je	L(StrncpyExit16)
-	cmp	$15, %r8
+	cmp	$15, %r8d
 	je	L(StrncpyExit15)
-	cmp	$8, %r8
+	cmp	$8, %r8d
 	jae	L(StrncpyExit8_14)
-	cmp	$4, %r8
+	cmp	$4, %r8d
 	jae	L(StrncpyExit4_7)
-	cmp	$3, %r8
+	cmp	$3, %r8d
 	je	L(StrncpyExit3)
-	cmp	$2, %r8
+	cmp	$2, %r8d
 	je	L(StrncpyExit2)
-	cmp	$0, %r8
+	cmp	$0, %r8d
 	ja	L(StrncpyExit1)
 	je	L(StrncpyExit0)
 	VZEROUPPER

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=71564cd3ec32db73b3d3a368d82fbd074e67b53f

commit 71564cd3ec32db73b3d3a368d82fbd074e67b53f
Author: Leonardo Sandoval <leonardo.sandoval.gonzalez@linux.intel.com>
Date:   Wed Sep 26 12:27:37 2018 -0500

    remove jump tables

diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index e1bf592..0c4a1e6 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -1571,177 +1571,4 @@ END (STRCPY)
 # else
 END (STRCAT)
 # endif
-	.p2align 4
-	.section .rodata
-L(ExitTable):
-	.int	JMPTBL(L(Exit1), L(ExitTable))
-	.int	JMPTBL(L(Exit2), L(ExitTable))
-	.int	JMPTBL(L(Exit3), L(ExitTable))
-	.int	JMPTBL(L(Exit4), L(ExitTable))
-	.int	JMPTBL(L(Exit5_7), L(ExitTable))
-	.int	JMPTBL(L(Exit5_7), L(ExitTable))
-	.int	JMPTBL(L(Exit5_7), L(ExitTable))
-	.int	JMPTBL(L(Exit8), L(ExitTable))
-	.int	JMPTBL(L(Exit9_15), L(ExitTable))
-	.int	JMPTBL(L(Exit9_15), L(ExitTable))
-	.int	JMPTBL(L(Exit9_15), L(ExitTable))
-	.int	JMPTBL(L(Exit9_15), L(ExitTable))
-	.int	JMPTBL(L(Exit9_15), L(ExitTable))
-	.int	JMPTBL(L(Exit9_15), L(ExitTable))
-	.int	JMPTBL(L(Exit9_15), L(ExitTable))
-	.int	JMPTBL(L(Exit16), L(ExitTable))
-	.int	JMPTBL(L(Exit17_31), L(ExitTable))
-	.int	JMPTBL(L(Exit17_31), L(ExitTable))
-	.int	JMPTBL(L(Exit17_31), L(ExitTable))
-	.int	JMPTBL(L(Exit17_31), L(ExitTable))
-	.int	JMPTBL(L(Exit17_31), L(ExitTable))
-	.int	JMPTBL(L(Exit17_31), L(ExitTable))
-	.int	JMPTBL(L(Exit17_31), L(ExitTable))
-	.int	JMPTBL(L(Exit17_31), L(ExitTable))
-	.int	JMPTBL(L(Exit17_31), L(ExitTable))
-	.int	JMPTBL(L(Exit17_31), L(ExitTable))
-	.int	JMPTBL(L(Exit17_31), L(ExitTable))
-	.int	JMPTBL(L(Exit17_31), L(ExitTable))
-	.int	JMPTBL(L(Exit17_31), L(ExitTable))
-	.int	JMPTBL(L(Exit17_31), L(ExitTable))
-	.int	JMPTBL(L(Exit17_31), L(ExitTable))
-	.int	JMPTBL(L(Exit32), L(ExitTable))
-	.int	JMPTBL(L(Exit33_63), L(ExitTable))
-	.int	JMPTBL(L(Exit33_63), L(ExitTable))
-	.int	JMPTBL(L(Exit33_63), L(ExitTable))
-	.int	JMPTBL(L(Exit33_63), L(ExitTable))
-	.int	JMPTBL(L(Exit33_63), L(ExitTable))
-	.int	JMPTBL(L(Exit33_63), L(ExitTable))
-	.int	JMPTBL(L(Exit33_63), L(ExitTable))
-	.int	JMPTBL(L(Exit33_63), L(ExitTable))
-	.int	JMPTBL(L(Exit33_63), L(ExitTable))
-	.int	JMPTBL(L(Exit33_63), L(ExitTable))
-	.int	JMPTBL(L(Exit33_63), L(ExitTable))
-	.int	JMPTBL(L(Exit33_63), L(ExitTable))
-	.int	JMPTBL(L(Exit33_63), L(ExitTable))
-	.int	JMPTBL(L(Exit33_63), L(ExitTable))
-	.int	JMPTBL(L(Exit33_63), L(ExitTable))
-	.int	JMPTBL(L(Exit33_63), L(ExitTable))
-	.int	JMPTBL(L(Exit33_63), L(ExitTable))
-	.int	JMPTBL(L(Exit33_63), L(ExitTable))
-	.int	JMPTBL(L(Exit33_63), L(ExitTable))
-	.int	JMPTBL(L(Exit33_63), L(ExitTable))
-	.int	JMPTBL(L(Exit33_63), L(ExitTable))
-	.int	JMPTBL(L(Exit33_63), L(ExitTable))
-	.int	JMPTBL(L(Exit33_63), L(ExitTable))
-	.int	JMPTBL(L(Exit33_63), L(ExitTable))
-	.int	JMPTBL(L(Exit33_63), L(ExitTable))
-	.int	JMPTBL(L(Exit33_63), L(ExitTable))
-	.int	JMPTBL(L(Exit33_63), L(ExitTable))
-	.int	JMPTBL(L(Exit33_63), L(ExitTable))
-	.int	JMPTBL(L(Exit33_63), L(ExitTable))
-	.int	JMPTBL(L(Exit33_63), L(ExitTable))
-	.int	JMPTBL(L(Exit33_63), L(ExitTable))
-	.int	JMPTBL(L(Exit64), L(ExitTable))
-# ifdef USE_AS_STRNCPY
-L(ExitStrncpyTable):
-	.int	JMPTBL(L(StrncpyExit0), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit1), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit2), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit3), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit4_7), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit4_7), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit4_7), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit4_7), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit8_14), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit8_14), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit8_14), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit8_14), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit8_14), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit8_14), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit8_14), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit15), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit16), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit65), L(ExitStrncpyTable))
-#  ifndef USE_AS_STRCAT
-	.p2align 4
-L(FillTable):
-	.int	JMPTBL(L(Fill0), L(FillTable))
-	.int	JMPTBL(L(Fill1), L(FillTable))
-	.int	JMPTBL(L(Fill2), L(FillTable))
-	.int	JMPTBL(L(Fill3), L(FillTable))
-	.int	JMPTBL(L(Fill4_7), L(FillTable))
-	.int	JMPTBL(L(Fill4_7), L(FillTable))
-	.int	JMPTBL(L(Fill4_7), L(FillTable))
-	.int	JMPTBL(L(Fill4_7), L(FillTable))
-	.int	JMPTBL(L(Fill8_14), L(FillTable))
-	.int	JMPTBL(L(Fill8_14), L(FillTable))
-	.int	JMPTBL(L(Fill8_14), L(FillTable))
-	.int	JMPTBL(L(Fill8_14), L(FillTable))
-	.int	JMPTBL(L(Fill8_14), L(FillTable))
-	.int	JMPTBL(L(Fill8_14), L(FillTable))
-	.int	JMPTBL(L(Fill8_14), L(FillTable))
-	.int	JMPTBL(L(Fill15_16), L(FillTable))
-	.int	JMPTBL(L(Fill15_16), L(FillTable))
-	.int	JMPTBL(L(Fill17_31), L(FillTable))
-	.int	JMPTBL(L(Fill17_31), L(FillTable))
-	.int	JMPTBL(L(Fill17_31), L(FillTable))
-	.int	JMPTBL(L(Fill17_31), L(FillTable))
-	.int	JMPTBL(L(Fill17_31), L(FillTable))
-	.int	JMPTBL(L(Fill17_31), L(FillTable))
-	.int	JMPTBL(L(Fill17_31), L(FillTable))
-	.int	JMPTBL(L(Fill17_31), L(FillTable))
-	.int	JMPTBL(L(Fill17_31), L(FillTable))
-	.int	JMPTBL(L(Fill17_31), L(FillTable))
-	.int	JMPTBL(L(Fill17_31), L(FillTable))
-	.int	JMPTBL(L(Fill17_31), L(FillTable))
-	.int	JMPTBL(L(Fill17_31), L(FillTable))
-	.int	JMPTBL(L(Fill17_31), L(FillTable))
-	.int	JMPTBL(L(Fill17_31), L(FillTable))
-	.int	JMPTBL(L(Fill32), L(FillTable))
-#  endif
-# endif
 #endif
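
With every dispatch rewritten as direct compares in the commits below, the three .rodata offset tables (ExitTable, ExitStrncpyTable, FillTable) are dead and can be dropped wholesale. For context, BRANCH_TO_JMPTBL_ENTRY resolves a table entry and takes an indirect jump, roughly as follows (a sketch from the sysdeps jump-table helpers; the exact macro may use different scratch registers):

	lea	L(ExitTable)(%rip), %r11	/* table base, PIC-relative */
	movslq	(%r11, %rdx, 4), %rcx		/* load 32-bit offset for index %rdx */
	lea	(%r11, %rcx), %rcx		/* offsets are table-relative */
	jmp	*%rcx				/* indirect branch */

The indirect `jmp *%rcx` is presumably the motivation for the whole series: it is hard to predict and notably expensive under retpoline-style Spectre mitigations, whereas a short ladder of direct conditional branches costs extra compares but no indirect branch at all.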

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=9c70650443520d88715d0c7ba6aef9641557175d

commit 9c70650443520d88715d0c7ba6aef9641557175d
Author: Leonardo Sandoval <leonardo.sandoval.gonzalez@linux.intel.com>
Date:   Wed Sep 26 12:22:31 2018 -0500

    use direct branches for ExitN labels

diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 3b93a2c..e1bf592 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -295,7 +295,32 @@ L(UnalignedFourVecSizeLeave):
 # else
 	add	$(VEC_SIZE * 3), %rsi
 	add	$(VEC_SIZE * 3), %rdi
-	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+	cmp	$63, %edx
+	je	L(Exit64)
+	cmp	$32, %edx
+	jae	L(Exit33_63)
+	cmp	$31, %edx
+	je	L(Exit32)
+	cmp	$16, %edx
+	jae	L(Exit17_31)
+	cmp	$15, %edx
+	je	L(Exit16)
+	cmp	$8, %edx
+	jae	L(Exit9_15)
+	cmp	$7, %edx
+	je	L(Exit8)
+	cmp	$4, %edx
+	jae	L(Exit5_7)
+	cmp	$3, %edx
+	je	L(Exit4)
+	cmp	$2, %edx
+	je	L(Exit3)
+	cmp	$0, %edx
+	ja	L(Exit2)
+	je	L(Exit1)
+	VZEROUPPER
+	ret
+
 # endif
 
 /* If source address alignment == destination address alignment */
@@ -346,13 +371,61 @@ L(CopyVecSize):
 	add	%rcx, %rdi
 	add	%rcx, %rsi
 	bsf	%edx, %edx
-	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+	cmp	$63, %edx
+	je	L(Exit64)
+	cmp	$32, %edx
+	jae	L(Exit33_63)
+	cmp	$31, %edx
+	je	L(Exit32)
+	cmp	$16, %edx
+	jae	L(Exit17_31)
+	cmp	$15, %edx
+	je	L(Exit16)
+	cmp	$8, %edx
+	jae	L(Exit9_15)
+	cmp	$7, %edx
+	je	L(Exit8)
+	cmp	$4, %edx
+	jae	L(Exit5_7)
+	cmp	$3, %edx
+	je	L(Exit4)
+	cmp	$2, %edx
+	je	L(Exit3)
+	cmp	$0, %edx
+	ja	L(Exit2)
+	je	L(Exit1)
+	VZEROUPPER
+	ret
 # endif
 	.p2align 4
 L(CopyVecSizeTail):
 	add	%rcx, %rsi
 	bsf	%edx, %edx
-	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+	cmp	$63, %edx
+	je	L(Exit64)
+	cmp	$32, %edx
+	jae	L(Exit33_63)
+	cmp	$31, %edx
+	je	L(Exit32)
+	cmp	$16, %edx
+	jae	L(Exit17_31)
+	cmp	$15, %edx
+	je	L(Exit16)
+	cmp	$8, %edx
+	jae	L(Exit9_15)
+	cmp	$7, %edx
+	je	L(Exit8)
+	cmp	$4, %edx
+	jae	L(Exit5_7)
+	cmp	$3, %edx
+	je	L(Exit4)
+	cmp	$2, %edx
+	je	L(Exit3)
+	cmp	$0, %edx
+	ja	L(Exit2)
+	je	L(Exit1)
+	VZEROUPPER
+	ret
 
 	.p2align 4
 L(CopyTwoVecSize1):
@@ -363,7 +436,31 @@ L(CopyTwoVecSize1):
 # endif
 L(CopyVecSizeTail1):
 	bsf	%edx, %edx
-	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+	cmp	$63, %edx
+	je	L(Exit64)
+	cmp	$32, %edx
+	jae	L(Exit33_63)
+	cmp	$31, %edx
+	je	L(Exit32)
+	cmp	$16, %edx
+	jae	L(Exit17_31)
+	cmp	$15, %edx
+	je	L(Exit16)
+	cmp	$8, %edx
+	jae	L(Exit9_15)
+	cmp	$7, %edx
+	je	L(Exit8)
+	cmp	$4, %edx
+	jae	L(Exit5_7)
+	cmp	$3, %edx
+	je	L(Exit4)
+	cmp	$2, %edx
+	je	L(Exit3)
+	cmp	$0, %edx
+	ja	L(Exit2)
+	je	L(Exit1)
+	VZEROUPPER
+	ret
 
 	.p2align 4
 L(CopyTwoVecSize):
@@ -371,7 +468,31 @@ L(CopyTwoVecSize):
 	add	%rcx, %rsi
 	add	$VEC_SIZE, %edx
 	sub	%ecx, %edx
-	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+	cmp	$63, %edx
+	je	L(Exit64)
+	cmp	$32, %edx
+	jae	L(Exit33_63)
+	cmp	$31, %edx
+	je	L(Exit32)
+	cmp	$16, %edx
+	jae	L(Exit17_31)
+	cmp	$15, %edx
+	je	L(Exit16)
+	cmp	$8, %edx
+	jae	L(Exit9_15)
+	cmp	$7, %edx
+	je	L(Exit8)
+	cmp	$4, %edx
+	jae	L(Exit5_7)
+	cmp	$3, %edx
+	je	L(Exit4)
+	cmp	$2, %edx
+	je	L(Exit3)
+	cmp	$0, %edx
+	ja	L(Exit2)
+	je	L(Exit1)
+	VZEROUPPER
+	ret
 
 	.p2align 4
 L(CopyVecSizeUnaligned_0):
@@ -386,7 +507,31 @@ L(CopyVecSizeUnaligned_0):
 	lea	1(%rdi, %rdx), %rdi
 	jmp	L(StrncpyFillTailWithZero)
 # else
-	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+	cmp	$63, %edx
+	je	L(Exit64)
+	cmp	$32, %edx
+	jae	L(Exit33_63)
+	cmp	$31, %edx
+	je	L(Exit32)
+	cmp	$16, %edx
+	jae	L(Exit17_31)
+	cmp	$15, %edx
+	je	L(Exit16)
+	cmp	$8, %edx
+	jae	L(Exit9_15)
+	cmp	$7, %edx
+	je	L(Exit8)
+	cmp	$4, %edx
+	jae	L(Exit5_7)
+	cmp	$3, %edx
+	je	L(Exit4)
+	cmp	$2, %edx
+	je	L(Exit3)
+	cmp	$0, %edx
+	ja	L(Exit2)
+	je	L(Exit1)
+	VZEROUPPER
+	ret
 # endif
 
 	.p2align 4
@@ -405,7 +550,31 @@ L(CopyVecSizeUnaligned_16):
 # else
 	add	$VEC_SIZE, %rsi
 	add	$VEC_SIZE, %rdi
-	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+	cmp	$63, %edx
+	je	L(Exit64)
+	cmp	$32, %edx
+	jae	L(Exit33_63)
+	cmp	$31, %edx
+	je	L(Exit32)
+	cmp	$16, %edx
+	jae	L(Exit17_31)
+	cmp	$15, %edx
+	je	L(Exit16)
+	cmp	$8, %edx
+	jae	L(Exit9_15)
+	cmp	$7, %edx
+	je	L(Exit8)
+	cmp	$4, %edx
+	jae	L(Exit5_7)
+	cmp	$3, %edx
+	je	L(Exit4)
+	cmp	$2, %edx
+	je	L(Exit3)
+	cmp	$0, %edx
+	ja	L(Exit2)
+	je	L(Exit1)
+	VZEROUPPER
+	ret
 # endif
 
 	.p2align 4
@@ -425,7 +594,31 @@ L(CopyVecSizeUnaligned_32):
 # else
 	add	$(VEC_SIZE * 2), %rsi
 	add	$(VEC_SIZE * 2), %rdi
-	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+	cmp	$63, %edx
+	je	L(Exit64)
+	cmp	$32, %edx
+	jae	L(Exit33_63)
+	cmp	$31, %edx
+	je	L(Exit32)
+	cmp	$16, %edx
+	jae	L(Exit17_31)
+	cmp	$15, %edx
+	je	L(Exit16)
+	cmp	$8, %edx
+	jae	L(Exit9_15)
+	cmp	$7, %edx
+	je	L(Exit8)
+	cmp	$4, %edx
+	jae	L(Exit5_7)
+	cmp	$3, %edx
+	je	L(Exit4)
+	cmp	$2, %edx
+	je	L(Exit3)
+	cmp	$0, %edx
+	ja	L(Exit2)
+	je	L(Exit1)
+	VZEROUPPER
+	ret
 # endif
 
 # ifdef USE_AS_STRNCPY
@@ -458,7 +651,31 @@ L(CopyVecSizeUnalignedVec1):
 
 	.p2align 4
 L(CopyVecSizeExit):
-	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+	cmp	$63, %edx
+	je	L(Exit64)
+	cmp	$32, %edx
+	jae	L(Exit33_63)
+	cmp	$31, %edx
+	je	L(Exit32)
+	cmp	$16, %edx
+	jae	L(Exit17_31)
+	cmp	$15, %edx
+	je	L(Exit16)
+	cmp	$8, %edx
+	jae	L(Exit9_15)
+	cmp	$7, %edx
+	je	L(Exit8)
+	cmp	$4, %edx
+	jae	L(Exit5_7)
+	cmp	$3, %edx
+	je	L(Exit4)
+	cmp	$2, %edx
+	je	L(Exit3)
+	cmp	$0, %edx
+	ja	L(Exit2)
+	je	L(Exit1)
+	VZEROUPPER
+	ret
 
 /* Case2 */
 

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=27c20556d42ab6511d164aca811dec3500f17b30

commit 27c20556d42ab6511d164aca811dec3500f17b30
Author: Leonardo Sandoval <leonardo.sandoval.gonzalez@linux.intel.com>
Date:   Wed Sep 26 09:20:09 2018 -0500

    use direct branches instead of the StrncpyTable jump table

diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 2516bdd..3b93a2c 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -470,7 +470,29 @@ L(CopyVecSizeCase2):
 	bsf	%edx, %edx
 	cmp	%r8d, %edx
 	jb	L(CopyVecSizeExit)
-	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+	cmp	$65, %r8
+	je	L(StrncpyExit65)
+	cmp	$33, %r8
+	jae	L(StrncpyExit33_64)
+	cmp	$17, %r8
+	jae	L(StrncpyExit17_32)
+	cmp	$16, %r8
+	je	L(StrncpyExit16)
+	cmp	$15, %r8
+	je	L(StrncpyExit15)
+	cmp	$8, %r8
+	jae	L(StrncpyExit8_14)
+	cmp	$4, %r8
+	jae	L(StrncpyExit4_7)
+	cmp	$3, %r8
+	je	L(StrncpyExit3)
+	cmp	$2, %r8
+	je	L(StrncpyExit2)
+	cmp	$0, %r8
+	ja	L(StrncpyExit1)
+	je	L(StrncpyExit0)
+	VZEROUPPER
+	ret
 
 	.p2align 4
 L(CopyTwoVecSizeCase2):
@@ -480,20 +502,87 @@ L(CopyTwoVecSizeCase2):
 	sub	%ecx, %edx
 	cmp	%r8d, %edx
 	jb	L(CopyVecSizeExit)
-	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+	cmp	$65, %r8
+	je	L(StrncpyExit65)
+	cmp	$33, %r8
+	jae	L(StrncpyExit33_64)
+	cmp	$17, %r8
+	jae	L(StrncpyExit17_32)
+	cmp	$16, %r8
+	je	L(StrncpyExit16)
+	cmp	$15, %r8
+	je	L(StrncpyExit15)
+	cmp	$8, %r8
+	jae	L(StrncpyExit8_14)
+	cmp	$4, %r8
+	jae	L(StrncpyExit4_7)
+	cmp	$3, %r8
+	je	L(StrncpyExit3)
+	cmp	$2, %r8
+	je	L(StrncpyExit2)
+	cmp	$0, %r8
+	ja	L(StrncpyExit1)
+	je	L(StrncpyExit0)
+	VZEROUPPER
+	ret
+
 
 L(CopyVecSizeTailCase2):
 	add	%rcx, %rsi
 	bsf	%edx, %edx
 	cmp	%r8d, %edx
 	jb	L(CopyVecSizeExit)
-	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+	cmp	$65, %r8
+	je	L(StrncpyExit65)
+	cmp	$33, %r8
+	jae	L(StrncpyExit33_64)
+	cmp	$17, %r8
+	jae	L(StrncpyExit17_32)
+	cmp	$16, %r8
+	je	L(StrncpyExit16)
+	cmp	$15, %r8
+	je	L(StrncpyExit15)
+	cmp	$8, %r8
+	jae	L(StrncpyExit8_14)
+	cmp	$4, %r8
+	jae	L(StrncpyExit4_7)
+	cmp	$3, %r8
+	je	L(StrncpyExit3)
+	cmp	$2, %r8
+	je	L(StrncpyExit2)
+	cmp	$0, %r8
+	ja	L(StrncpyExit1)
+	je	L(StrncpyExit0)
+	VZEROUPPER
+	ret
 
 L(CopyVecSizeTail1Case2):
 	bsf	%edx, %edx
 	cmp	%r8d, %edx
 	jb	L(CopyVecSizeExit)
-	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+	cmp	$65, %r8
+	je	L(StrncpyExit65)
+	cmp	$33, %r8
+	jae	L(StrncpyExit33_64)
+	cmp	$17, %r8
+	jae	L(StrncpyExit17_32)
+	cmp	$16, %r8
+	je	L(StrncpyExit16)
+	cmp	$15, %r8
+	je	L(StrncpyExit15)
+	cmp	$8, %r8
+	jae	L(StrncpyExit8_14)
+	cmp	$4, %r8
+	jae	L(StrncpyExit4_7)
+	cmp	$3, %r8
+	je	L(StrncpyExit3)
+	cmp	$2, %r8
+	je	L(StrncpyExit2)
+	cmp	$0, %r8
+	ja	L(StrncpyExit1)
+	je	L(StrncpyExit0)
+	VZEROUPPER
+	ret
 
 /* Case2 or Case3,  Case3 */
 
@@ -505,21 +594,87 @@ L(CopyVecSizeCase3):
 	add	$VEC_SIZE, %r8
 	add	%rcx, %rdi
 	add	%rcx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+	cmp	$65, %r8
+	je	L(StrncpyExit65)
+	cmp	$33, %r8
+	jae	L(StrncpyExit33_64)
+	cmp	$17, %r8
+	jae	L(StrncpyExit17_32)
+	cmp	$16, %r8
+	je	L(StrncpyExit16)
+	cmp	$15, %r8
+	je	L(StrncpyExit15)
+	cmp	$8, %r8
+	jae	L(StrncpyExit8_14)
+	cmp	$4, %r8
+	jae	L(StrncpyExit4_7)
+	cmp	$3, %r8
+	je	L(StrncpyExit3)
+	cmp	$2, %r8
+	je	L(StrncpyExit2)
+	cmp	$0, %r8
+	ja	L(StrncpyExit1)
+	je	L(StrncpyExit0)
+	VZEROUPPER
+	ret
 
 	.p2align 4
 L(CopyTwoVecSizeCase2OrCase3):
 	test	%rdx, %rdx
 	jnz	L(CopyTwoVecSizeCase2)
 	add	%rcx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+	cmp	$65, %r8
+	je	L(StrncpyExit65)
+	cmp	$33, %r8
+	jae	L(StrncpyExit33_64)
+	cmp	$17, %r8
+	jae	L(StrncpyExit17_32)
+	cmp	$16, %r8
+	je	L(StrncpyExit16)
+	cmp	$15, %r8
+	je	L(StrncpyExit15)
+	cmp	$8, %r8
+	jae	L(StrncpyExit8_14)
+	cmp	$4, %r8
+	jae	L(StrncpyExit4_7)
+	cmp	$3, %r8
+	je	L(StrncpyExit3)
+	cmp	$2, %r8
+	je	L(StrncpyExit2)
+	cmp	$0, %r8
+	ja	L(StrncpyExit1)
+	je	L(StrncpyExit0)
+	VZEROUPPER
+	ret
 
 	.p2align 4
 L(CopyVecSizeTailCase2OrCase3):
 	test	%rdx, %rdx
 	jnz	L(CopyVecSizeTailCase2)
 	add	%rcx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+	cmp	$65, %r8
+	je	L(StrncpyExit65)
+	cmp	$33, %r8
+	jae	L(StrncpyExit33_64)
+	cmp	$17, %r8
+	jae	L(StrncpyExit17_32)
+	cmp	$16, %r8
+	je	L(StrncpyExit16)
+	cmp	$15, %r8
+	je	L(StrncpyExit15)
+	cmp	$8, %r8
+	jae	L(StrncpyExit8_14)
+	cmp	$4, %r8
+	jae	L(StrncpyExit4_7)
+	cmp	$3, %r8
+	je	L(StrncpyExit3)
+	cmp	$2, %r8
+	je	L(StrncpyExit2)
+	cmp	$0, %r8
+	ja	L(StrncpyExit1)
+	je	L(StrncpyExit0)
+	VZEROUPPER
+	ret
 
 	.p2align 4
 L(CopyTwoVecSize1Case2OrCase3):
@@ -529,7 +684,29 @@ L(CopyTwoVecSize1Case2OrCase3):
 L(CopyVecSizeTail1Case2OrCase3):
 	test	%rdx, %rdx
 	jnz	L(CopyVecSizeTail1Case2)
-	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+	cmp	$65, %r8
+	je	L(StrncpyExit65)
+	cmp	$33, %r8
+	jae	L(StrncpyExit33_64)
+	cmp	$17, %r8
+	jae	L(StrncpyExit17_32)
+	cmp	$16, %r8
+	je	L(StrncpyExit16)
+	cmp	$15, %r8
+	je	L(StrncpyExit15)
+	cmp	$8, %r8
+	jae	L(StrncpyExit8_14)
+	cmp	$4, %r8
+	jae	L(StrncpyExit4_7)
+	cmp	$3, %r8
+	je	L(StrncpyExit3)
+	cmp	$2, %r8
+	je	L(StrncpyExit2)
+	cmp	$0, %r8
+	ja	L(StrncpyExit1)
+	je	L(StrncpyExit0)
+	VZEROUPPER
+	ret
 
 # endif
 
@@ -1138,7 +1315,29 @@ L(UnalignedFourVecSizeLeaveCase2):
 	bsf	%edx, %edx
 	cmp	%r8d, %edx
 	jb	L(CopyVecSizeExit)
-	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+	cmp	$65, %r8
+	je	L(StrncpyExit65)
+	cmp	$33, %r8
+	jae	L(StrncpyExit33_64)
+	cmp	$17, %r8
+	jae	L(StrncpyExit17_32)
+	cmp	$16, %r8
+	je	L(StrncpyExit16)
+	cmp	$15, %r8
+	je	L(StrncpyExit15)
+	cmp	$8, %r8
+	jae	L(StrncpyExit8_14)
+	cmp	$4, %r8
+	jae	L(StrncpyExit4_7)
+	cmp	$3, %r8
+	je	L(StrncpyExit3)
+	cmp	$2, %r8
+	je	L(StrncpyExit2)
+	cmp	$0, %r8
+	ja	L(StrncpyExit1)
+	je	L(StrncpyExit0)
+	VZEROUPPER
+	ret
 
 	.p2align 4
 L(ExitZero):

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=ec5844206bed7e0212a99e6778e70132a2dfaced

commit ec5844206bed7e0212a99e6778e70132a2dfaced
Author: Leonardo Sandoval <leonardo.sandoval.gonzalez@linux.intel.com>
Date:   Wed Sep 26 09:04:04 2018 -0500

    use direct branches on L(StrncpyFillLessTwoVecSize) and L(StrncpyFillExit)

diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index d9148f6..2516bdd 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -1022,11 +1022,43 @@ L(StrncpyFillLessTwoVecSize):
 	jl	L(StrncpyFillExit)
 	vmovdqa %ymmZ, (%rdi)
 	add	$VEC_SIZE, %rdi
-	BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
+	cmp	$32, %r8
+	je	L(Fill32)
+	cmp	$17, %r8
+	jae	L(Fill17_31)
+	cmp	$15, %r8
+	jae	L(Fill15_16)
+	cmp	$8, %r8
+	jae	L(Fill8_14)
+	cmp	$4, %r8
+	jae	L(Fill4_7)
+	cmp	$3, %r8
+	je	L(Fill3)
+	cmp	$1, %r8
+	ja	L(Fill2)
+	je	L(Fill1)
+	VZEROUPPER
+	ret
 
 L(StrncpyFillExit):
 	add	$VEC_SIZE, %r8
-	BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
+	cmp	$32, %r8
+	je	L(Fill32)
+	cmp	$17, %r8
+	jae	L(Fill17_31)
+	cmp	$15, %r8
+	jae	L(Fill15_16)
+	cmp	$8, %r8
+	jae	L(Fill8_14)
+	cmp	$4, %r8
+	jae	L(Fill4_7)
+	cmp	$3, %r8
+	je	L(Fill3)
+	cmp	$1, %r8
+	ja	L(Fill2)
+	je	L(Fill1)
+	VZEROUPPER
+	ret
 
 /* end of ifndef USE_AS_STRCAT */
 #  endif

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=e664fdf02f0e0ebb14fba0fb5a2c14ff5ec4bed0

commit e664fdf02f0e0ebb14fba0fb5a2c14ff5ec4bed0
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Tue Sep 25 14:56:08 2018 -0700

    Replace jz with je, remove 1 cmp and 1 jz

diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 31e1d80..d9148f6 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -999,7 +999,7 @@ L(StrncpyFillLessFourVecSize):
 	vmovdqa %ymmZ, (%rdi)
 	add	$VEC_SIZE, %rdi
 	cmp	$32, %r8
-	jz	L(Fill32)
+	je	L(Fill32)
 	cmp	$17, %r8
 	jae	L(Fill17_31)
 	cmp	$15, %r8
@@ -1009,14 +1009,14 @@ L(StrncpyFillLessFourVecSize):
 	cmp	$4, %r8
 	jae	L(Fill4_7)
 	cmp	$3, %r8
-	jz	L(Fill3)
-	cmp	$2, %r8
-	jz	L(Fill2)
+	je	L(Fill3)
 	cmp	$1, %r8
-	jz	L(Fill1)
-	cmp	$0, %r8
-	jz	L(Fill0)
+	ja	L(Fill2)
+	je	L(Fill1)
+	VZEROUPPER
+	ret
 
+	.p2align 4
 L(StrncpyFillLessTwoVecSize):
 	add	$VEC_SIZE, %r8
 	jl	L(StrncpyFillExit)
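
`jz` and `je` assemble to the same opcode (both jump on ZF = 1), so that half of the change is purely a spelling cleanup. The substantive part is the tail of the fill ladder: after `je L(Fill3)` the count in %r8 can only be 0, 1 or 2, so one `cmp $1, %r8` replaces the three separate compares against 2, 1 and 0, and the %r8 == 0 case simply falls through to VZEROUPPER/ret -- nothing left to fill -- which is why L(Fill0) and its compare disappear:

	cmp	$1, %r8
	ja	L(Fill2)	/* %r8 == 2 */
	je	L(Fill1)	/* %r8 == 1 */
	VZEROUPPER		/* %r8 == 0: nothing left to fill */
	ret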

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=a691ad38c0eaea9eb51f56bd713e24362b770588

commit a691ad38c0eaea9eb51f56bd713e24362b770588
Author: Leonardo Sandoval <leonardo.sandoval.gonzalez@linux.intel.com>
Date:   Tue Sep 25 16:40:20 2018 -0500

    use direct branches on L(StrncpyFillLessFourVecSize)

diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 993abf8..31e1d80 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -998,7 +998,24 @@ L(StrncpyFillLessFourVecSize):
 	jl	L(StrncpyFillExit)
 	vmovdqa %ymmZ, (%rdi)
 	add	$VEC_SIZE, %rdi
-	BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
+	cmp	$32, %r8
+	jz	L(Fill32)
+	cmp	$17, %r8
+	jae	L(Fill17_31)
+	cmp	$15, %r8
+	jae	L(Fill15_16)
+	cmp	$8, %r8
+	jae	L(Fill8_14)
+	cmp	$4, %r8
+	jae	L(Fill4_7)
+	cmp	$3, %r8
+	jz	L(Fill3)
+	cmp	$2, %r8
+	jz	L(Fill2)
+	cmp	$1, %r8
+	jz	L(Fill1)
+	cmp	$0, %r8
+	jz	L(Fill0)
 
 L(StrncpyFillLessTwoVecSize):
 	add	$VEC_SIZE, %r8

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=7768744075492fb194a8f7631f882aa14bfa76b4

commit 7768744075492fb194a8f7631f882aa14bfa76b4
Author: Leonardo Sandoval <leonardo.sandoval.gonzalez@linux.intel.com>
Date:   Tue Sep 25 15:38:05 2018 -0500

    remove whitespaces

diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index cc17312..993abf8 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -648,8 +648,8 @@ L(Exit9_15):
 
 	.p2align 4
 L(Exit16):
-	vmovdqu	(%rsi), %xmm0
-	vmovdqu	%xmm0, (%rdi)
+	vmovdqu (%rsi), %xmm0
+	vmovdqu %xmm0, (%rdi)
 # ifdef USE_AS_STPCPY
 	lea	15(%rdi), %rax
 # endif
@@ -681,8 +681,8 @@ L(Exit17_31):
 
 	.p2align 4
 L(Exit32):
-	vmovdqu	(%rsi), %ymm0
-	vmovdqu	%ymm0, (%rdi)
+	vmovdqu (%rsi), %ymm0
+	vmovdqu %ymm0, (%rdi)
 # ifdef USE_AS_STPCPY
 	lea	31(%rdi), %rax
 # endif
@@ -830,8 +830,8 @@ L(StrncpyExit15):
 
 	.p2align 4
 L(StrncpyExit16):
-	vmovdqu	(%rsi), %xmm0
-	vmovdqu	%xmm0, (%rdi)
+	vmovdqu (%rsi), %xmm0
+	vmovdqu %xmm0, (%rdi)
 #  ifdef USE_AS_STPCPY
 	lea	16(%rdi), %rax
 #  endif

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=98b7ab9829061290ea0907cf16aa38ff64cccbe6

commit 98b7ab9829061290ea0907cf16aa38ff64cccbe6
Author: Leonardo Sandoval <leonardo.sandoval.gonzalez@linux.intel.com>
Date:   Tue Sep 25 15:31:55 2018 -0500

    consolidate StrncpyExit32 into the 17_31

diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index aaa9200..cc17312 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -842,7 +842,7 @@ L(StrncpyExit16):
 	ret
 
 	.p2align 4
-L(StrncpyExit17_31):
+L(StrncpyExit17_32):
 	vmovdqu (%rsi), %xmm0
 	vmovdqu -16(%rsi, %r8), %xmm2
 	vmovdqu %xmm0, (%rdi)
@@ -857,139 +857,6 @@ L(StrncpyExit17_31):
 	ret
 
 	.p2align 4
-L(StrncpyExit24):
-	vmovdqu	(%rsi), %xmm0
-	mov	16(%rsi), %rcx
-	vmovdqu	%xmm0, (%rdi)
-	mov	%rcx, 16(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	24(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 24(%rdi)
-#  endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(StrncpyExit25):
-	vmovdqu	(%rsi), %xmm0
-	vmovdqu	9(%rsi), %xmm1
-	vmovdqu	%xmm0, (%rdi)
-	vmovdqu	%xmm1, 9(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	25(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 25(%rdi)
-#  endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(StrncpyExit26):
-	vmovdqu	(%rsi), %xmm0
-	vmovdqu	10(%rsi), %xmm1
-	vmovdqu	%xmm0, (%rdi)
-	vmovdqu	%xmm1, 10(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	26(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 26(%rdi)
-#  endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(StrncpyExit27):
-	vmovdqu	(%rsi), %xmm0
-	vmovdqu	11(%rsi), %xmm1
-	vmovdqu	%xmm0, (%rdi)
-	vmovdqu	%xmm1, 11(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	27(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 27(%rdi)
-#  endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(StrncpyExit28):
-	vmovdqu	(%rsi), %xmm0
-	vmovdqu	12(%rsi), %xmm1
-	vmovdqu	%xmm0, (%rdi)
-	vmovdqu	%xmm1, 12(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	28(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 28(%rdi)
-#  endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(StrncpyExit29):
-	vmovdqu	(%rsi), %xmm0
-	vmovdqu	13(%rsi), %xmm2
-	vmovdqu	%xmm0, (%rdi)
-	vmovdqu	%xmm2, 13(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	29(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 29(%rdi)
-#  endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(StrncpyExit30):
-	vmovdqu	(%rsi), %xmm0
-	vmovdqu	14(%rsi), %xmm2
-	vmovdqu	%xmm0, (%rdi)
-	vmovdqu	%xmm2, 14(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	30(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 30(%rdi)
-#  endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(StrncpyExit31):
-	vmovdqu	(%rsi), %xmm0
-	vmovdqu	15(%rsi), %xmm2
-	vmovdqu	%xmm0, (%rdi)
-	vmovdqu	%xmm2, 15(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	31(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 31(%rdi)
-#  endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(StrncpyExit32):
-	vmovdqu	(%rsi), %ymm0
-	vmovdqu	%ymm0, (%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	32(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 32(%rdi)
-#  endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
 L(StrncpyExit33_64):
 	/*  0/32, 31/16 */
 	vmovdqu (%rsi), %ymm0
@@ -1325,22 +1192,22 @@ L(ExitStrncpyTable):
 	.int	JMPTBL(L(StrncpyExit8_14), L(ExitStrncpyTable))
 	.int	JMPTBL(L(StrncpyExit15), L(ExitStrncpyTable))
 	.int	JMPTBL(L(StrncpyExit16), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit32), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit17_32), L(ExitStrncpyTable))
 	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
 	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
 	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
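
The consolidation works because any length n in 17..32 can be covered by two 16-byte moves that are allowed to overlap: the first copies bytes [0, 16), the second bytes [n - 16, n). For n = 32 the two halves tile exactly, which is what makes the separate ymm-based L(StrncpyExit32) redundant; for, say, n = 20 they overlap on bytes [4, 16). The core of the renamed label, annotated (with n in %r8):

	vmovdqu	(%rsi), %xmm0		/* bytes [0, 16) */
	vmovdqu	-16(%rsi, %r8), %xmm2	/* bytes [n-16, n); overlaps when n < 32 */
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm2, -16(%rdi, %r8)

The overlapping stores are safe because both moves write the same source bytes to the same destination offsets.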

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=16bd51bc6e15a1825d42221052024bd4cf55be32

commit 16bd51bc6e15a1825d42221052024bd4cf55be32
Author: Leonardo Sandoval <leonardo.sandoval.gonzalez@linux.intel.com>
Date:   Tue Sep 25 15:27:57 2018 -0500

    consolidate StrncpyExit64 into the 33_63

diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 48f7273..aaa9200 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -990,7 +990,7 @@ L(StrncpyExit32):
 	ret
 
 	.p2align 4
-L(StrncpyExit33_63):
+L(StrncpyExit33_64):
 	/*  0/32, 31/16 */
 	vmovdqu (%rsi), %ymm0
 	vmovdqu -VEC_SIZE(%rsi, %r8), %ymm2
@@ -1006,22 +1006,6 @@ L(StrncpyExit33_63):
 	ret
 
 	.p2align 4
-L(StrncpyExit64):
-	/* 0/32, 32/32 */
-	vmovdqu (%rsi), %ymm0
-	vmovdqu 32(%rsi), %ymm2
-	vmovdqu %ymm0, (%rdi)
-	vmovdqu %ymm2, 32(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	64(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 64(%rdi)
-#  endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
 L(StrncpyExit65):
 	/* 0/32, 32/32, 64/1 */
 	vmovdqu (%rsi), %ymm0
@@ -1357,38 +1341,38 @@ L(ExitStrncpyTable):
 	.int	JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
 	.int	JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
 	.int	JMPTBL(L(StrncpyExit32), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit64), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33_64), L(ExitStrncpyTable))
 	.int	JMPTBL(L(StrncpyExit65), L(ExitStrncpyTable))
 #  ifndef USE_AS_STRCAT
 	.p2align 4
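
Same overlap trick one width up: any n in 33..64 is two possibly-overlapping 32-byte moves, bytes [0, 32) and [n - 32, n), which for n = 64 tile exactly -- hence the dedicated L(StrncpyExit64) deleted above folds into the renamed L(StrncpyExit33_64):

	vmovdqu	(%rsi), %ymm0			/* bytes [0, 32) */
	vmovdqu	-VEC_SIZE(%rsi, %r8), %ymm2	/* bytes [n-32, n) */
	vmovdqu	%ymm0, (%rdi)
	vmovdqu	%ymm2, -VEC_SIZE(%rdi, %r8)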

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=0d68b69aa12cd27e7f21f3bc3d207d105dbde1b2

commit 0d68b69aa12cd27e7f21f3bc3d207d105dbde1b2
Author: Leonardo Sandoval <leonardo.sandoval.gonzalez@linux.intel.com>
Date:   Tue Sep 25 15:15:24 2018 -0500

    consolidate Exit5-Exit7 labels

diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 5e12e4f..48f7273 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -596,50 +596,18 @@ L(Exit4):
 	ret
 
 	.p2align 4
-L(Exit5):
+L(Exit5_7):
 	mov	(%rsi), %ecx
-	mov	%dh, 4(%rdi)
 	mov	%ecx, (%rdi)
+	mov	-3(%rsi, %rdx), %ecx
+	mov	%ecx, -3(%rdi, %rdx)
 # ifdef USE_AS_STPCPY
-	lea	4(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$5, %r8
-	lea	5(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Exit6):
-	mov	(%rsi), %ecx
-	mov	4(%rsi), %dx
-	mov	%ecx, (%rdi)
-	mov	%dx, 4(%rdi)
-# ifdef USE_AS_STPCPY
-	lea	5(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$6, %r8
-	lea	6(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Exit7):
-	mov	(%rsi), %ecx
-	mov	3(%rsi), %edx
-	mov	%ecx, (%rdi)
-	mov	%edx, 3(%rdi)
-# ifdef USE_AS_STPCPY
-	lea	6(%rdi), %rax
+	lea	(%rdi, %rdx), %rax
 # endif
 # if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$7, %r8
-	lea	7(%rdi), %rdi
+	sub	%rdx, %r8
+	sub	$1, %r8
+	lea	1(%rdi, %rdx), %rdi
 	jnz	L(StrncpyFillTailWithZero)
 # endif
 	VZEROUPPER
@@ -1294,9 +1262,9 @@ L(ExitTable):
 	.int	JMPTBL(L(Exit2), L(ExitTable))
 	.int	JMPTBL(L(Exit3), L(ExitTable))
 	.int	JMPTBL(L(Exit4), L(ExitTable))
-	.int	JMPTBL(L(Exit5), L(ExitTable))
-	.int	JMPTBL(L(Exit6), L(ExitTable))
-	.int	JMPTBL(L(Exit7), L(ExitTable))
+	.int	JMPTBL(L(Exit5_7), L(ExitTable))
+	.int	JMPTBL(L(Exit5_7), L(ExitTable))
+	.int	JMPTBL(L(Exit5_7), L(ExitTable))
 	.int	JMPTBL(L(Exit8), L(ExitTable))
 	.int	JMPTBL(L(Exit9_15), L(ExitTable))
 	.int	JMPTBL(L(Exit9_15), L(ExitTable))
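
Here %rdx is the index of the terminating NUL, so %rdx + 1 bytes (NUL included) must be copied, and for the old Exit5/Exit6/Exit7 cases that is two overlapping dwords: [0, 4) and [rdx - 3, rdx + 1). The epilogue becomes length-relative in the same way -- the stpcpy return value is `lea (%rdi, %rdx), %rax` and the strncpy tail-fill subtracts %rdx + 1 from %r8 instead of a hard-coded constant. The byte ranges per case:

	/* %rdx = 4 (old Exit5): dwords [0,4) and [1,5)
	   %rdx = 5 (old Exit6): dwords [0,4) and [2,6)
	   %rdx = 6 (old Exit7): dwords [0,4) and [3,7)  */
	mov	(%rsi), %ecx		/* bytes [0, 4) */
	mov	%ecx, (%rdi)
	mov	-3(%rsi, %rdx), %ecx	/* bytes [rdx-3, rdx+1), includes the NUL */
	mov	%ecx, -3(%rdi, %rdx)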

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=3cc2a758ed5a5b0084ec73ac452a84cb42b9a4d6

commit 3cc2a758ed5a5b0084ec73ac452a84cb42b9a4d6
Author: Leonardo Sandoval <leonardo.sandoval.gonzalez@linux.intel.com>
Date:   Tue Sep 25 13:55:40 2018 -0500

    consolidate Exit9_15

diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 709caef..5e12e4f 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -661,118 +661,18 @@ L(Exit8):
 	ret
 
 	.p2align 4
-L(Exit9):
+L(Exit9_15):
 	mov	(%rsi), %rcx
-	mov	%dh, 8(%rdi)
+	mov	-7(%rsi, %rdx), %r9
 	mov	%rcx, (%rdi)
+	mov	%r9, -7(%rdi, %rdx)
 # ifdef USE_AS_STPCPY
-	lea	8(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$9, %r8
-	lea	9(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Exit10):
-	mov	(%rsi), %rcx
-	mov	8(%rsi), %dx
-	mov	%rcx, (%rdi)
-	mov	%dx, 8(%rdi)
-# ifdef USE_AS_STPCPY
-	lea	9(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$10, %r8
-	lea	10(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Exit11):
-	mov	(%rsi), %rcx
-	mov	7(%rsi), %edx
-	mov	%rcx, (%rdi)
-	mov	%edx, 7(%rdi)
-# ifdef USE_AS_STPCPY
-	lea	10(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$11, %r8
-	lea	11(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Exit12):
-	mov	(%rsi), %rcx
-	mov	8(%rsi), %edx
-	mov	%rcx, (%rdi)
-	mov	%edx, 8(%rdi)
-# ifdef USE_AS_STPCPY
-	lea	11(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$12, %r8
-	lea	12(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Exit13):
-	mov	(%rsi), %rcx
-	mov	5(%rsi), %rdx
-	mov	%rcx, (%rdi)
-	mov	%rdx, 5(%rdi)
-# ifdef USE_AS_STPCPY
-	lea	12(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$13, %r8
-	lea	13(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Exit14):
-	mov	(%rsi), %rcx
-	mov	6(%rsi), %rdx
-	mov	%rcx, (%rdi)
-	mov	%rdx, 6(%rdi)
-# ifdef USE_AS_STPCPY
-	lea	13(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$14, %r8
-	lea	14(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Exit15):
-	mov	(%rsi), %rcx
-	mov	7(%rsi), %rdx
-	mov	%rcx, (%rdi)
-	mov	%rdx, 7(%rdi)
-# ifdef USE_AS_STPCPY
-	lea	14(%rdi), %rax
+	lea	(%rdi, %rdx), %rax
 # endif
 # if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$15, %r8
-	lea	15(%rdi), %rdi
+	sub	%rdx, %r8
+	sub	$1, %r8
+	lea	1(%rdi, %rdx), %rdi
 	jnz	L(StrncpyFillTailWithZero)
 # endif
 	VZEROUPPER
@@ -1398,13 +1298,13 @@ L(ExitTable):
 	.int	JMPTBL(L(Exit6), L(ExitTable))
 	.int	JMPTBL(L(Exit7), L(ExitTable))
 	.int	JMPTBL(L(Exit8), L(ExitTable))
-	.int	JMPTBL(L(Exit9), L(ExitTable))
-	.int	JMPTBL(L(Exit10), L(ExitTable))
-	.int	JMPTBL(L(Exit11), L(ExitTable))
-	.int	JMPTBL(L(Exit12), L(ExitTable))
-	.int	JMPTBL(L(Exit13), L(ExitTable))
-	.int	JMPTBL(L(Exit14), L(ExitTable))
-	.int	JMPTBL(L(Exit15), L(ExitTable))
+	.int	JMPTBL(L(Exit9_15), L(ExitTable))
+	.int	JMPTBL(L(Exit9_15), L(ExitTable))
+	.int	JMPTBL(L(Exit9_15), L(ExitTable))
+	.int	JMPTBL(L(Exit9_15), L(ExitTable))
+	.int	JMPTBL(L(Exit9_15), L(ExitTable))
+	.int	JMPTBL(L(Exit9_15), L(ExitTable))
+	.int	JMPTBL(L(Exit9_15), L(ExitTable))
 	.int	JMPTBL(L(Exit16), L(ExitTable))
 	.int	JMPTBL(L(Exit17_31), L(ExitTable))
 	.int	JMPTBL(L(Exit17_31), L(ExitTable))
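
The table side of the change is equally simple: the indirect jump through
L(ExitTable) is untouched, and the seven slots for lengths 9-15 now all
carry the same relative offset. A hypothetical C analogue using a
function-pointer table (glibc itself stores relative .int offsets and does
an indirect jmp, not calls through pointers):

#include <stdio.h>
#include <string.h>

/* Hypothetical analogue of the ExitTable change: several table slots
   share one handler, so the dispatch site is unchanged while the
   per-length handlers disappear.  The table is indexed by the NUL
   index, so slot 7 is the 8-byte case and slots 8-14 cover 9-15.  */
static void exit8(char *d, const char *s, size_t idx)
{
	(void)idx;
	memcpy(d, s, 8);                      /* exactly 8 bytes incl. NUL */
}

static void exit9_15(char *d, const char *s, size_t idx)
{
	memcpy(d, s, 8);                      /* head: bytes 0..7 */
	memcpy(d + idx - 7, s + idx - 7, 8);  /* tail: bytes idx-7..idx */
}

typedef void (*exit_fn)(char *, const char *, size_t);

static const exit_fn exit_table[16] = {
	[7]  = exit8,
	[8]  = exit9_15, [9]  = exit9_15, [10] = exit9_15, [11] = exit9_15,
	[12] = exit9_15, [13] = exit9_15, [14] = exit9_15,
};

int main(void)
{
	char dst[16];
	const char *src = "hello, world";     /* NUL at index 12 */
	exit_table[strlen(src)](dst, src, strlen(src));
	printf("%s\n", dst);
	return 0;
}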

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=a7fc68d9e66b1df6d164feca3250d182c2192d59

commit a7fc68d9e66b1df6d164feca3250d182c2192d59
Author: Leonardo Sandoval <leonardo.sandoval.gonzalez@linux.intel.com>
Date:   Tue Sep 25 09:50:33 2018 -0500

    remove old branches already consolidated

diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 7fc02db..709caef 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -974,96 +974,6 @@ L(StrncpyExit16):
 	ret
 
 	.p2align 4
-L(StrncpyExit17):
-	vmovdqu	(%rsi), %xmm0
-	mov	16(%rsi), %cl
-	vmovdqu	%xmm0, (%rdi)
-	mov	%cl, 16(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	17(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 17(%rdi)
-#  endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(StrncpyExit18):
-	vmovdqu	(%rsi), %xmm0
-	mov	16(%rsi), %cx
-	vmovdqu	%xmm0, (%rdi)
-	mov	%cx, 16(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	18(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 18(%rdi)
-#  endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(StrncpyExit19):
-	vmovdqu	(%rsi), %xmm0
-	mov	15(%rsi), %ecx
-	vmovdqu	%xmm0, (%rdi)
-	mov	%ecx, 15(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	19(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 19(%rdi)
-#  endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(StrncpyExit20):
-	vmovdqu	(%rsi), %xmm0
-	mov	16(%rsi), %ecx
-	vmovdqu	%xmm0, (%rdi)
-	mov	%ecx, 16(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	20(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 20(%rdi)
-#  endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(StrncpyExit21):
-	vmovdqu	(%rsi), %xmm0
-	mov	13(%rsi), %rcx
-	vmovdqu	%xmm0, (%rdi)
-	mov	%rcx, 13(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	21(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 21(%rdi)
-#  endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(StrncpyExit22):
-	vmovdqu	(%rsi), %xmm0
-	mov	14(%rsi), %rcx
-	vmovdqu	%xmm0, (%rdi)
-	mov	%rcx, 14(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	22(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 22(%rdi)
-#  endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
 L(StrncpyExit17_31):
 	vmovdqu (%rsi), %xmm0
 	vmovdqu -16(%rsi, %r8), %xmm2

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=d4f1935722e7bb4ebb4a0416bad63dd8c0604baa

commit d4f1935722e7bb4ebb4a0416bad63dd8c0604baa
Author: Leonardo Sandoval <leonardo.sandoval.gonzalez@linux.intel.com>
Date:   Tue Sep 25 09:29:28 2018 -0500

    consolidate StrncpyExit8 to 14

diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 78990be..7fc02db 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -931,104 +931,16 @@ L(StrncpyExit4_7):
 	ret
 
 	.p2align 4
-L(StrncpyExit8):
-	mov	(%rsi), %rdx
-	mov	%rdx, (%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	8(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 8(%rdi)
-#  endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(StrncpyExit9):
-	mov	(%rsi), %rcx
-	mov	8(%rsi), %dl
-	mov	%rcx, (%rdi)
-	mov	%dl, 8(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	9(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 9(%rdi)
-#  endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(StrncpyExit10):
-	mov	(%rsi), %rcx
-	mov	8(%rsi), %dx
-	mov	%rcx, (%rdi)
-	mov	%dx, 8(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	10(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 10(%rdi)
-#  endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(StrncpyExit11):
-	mov	(%rsi), %rcx
-	mov	7(%rsi), %edx
-	mov	%rcx, (%rdi)
-	mov	%edx, 7(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	11(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 11(%rdi)
-#  endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(StrncpyExit12):
-	mov	(%rsi), %rcx
-	mov	8(%rsi), %edx
-	mov	%rcx, (%rdi)
-	mov	%edx, 8(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	12(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 12(%rdi)
-#  endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(StrncpyExit13):
-	mov	(%rsi), %rcx
-	mov	5(%rsi), %rdx
-	mov	%rcx, (%rdi)
-	mov	%rdx, 5(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	13(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 13(%rdi)
-#  endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(StrncpyExit14):
+L(StrncpyExit8_14):
 	mov	(%rsi), %rcx
-	mov	6(%rsi), %rdx
+	mov	-8(%rsi, %r8), %rdx
 	mov	%rcx, (%rdi)
-	mov	%rdx, 6(%rdi)
+	mov	%rdx, -8(%rdi, %r8)
 #  ifdef USE_AS_STPCPY
-	lea	14(%rdi), %rax
+	lea	(%rdi, %r8), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	movb	$0, 14(%rdi)
+	movb	$0, (%rdi, %r8)
 #  endif
 	VZEROUPPER
 	ret
@@ -1642,13 +1554,13 @@ L(ExitStrncpyTable):
 	.int	JMPTBL(L(StrncpyExit4_7), L(ExitStrncpyTable))
 	.int	JMPTBL(L(StrncpyExit4_7), L(ExitStrncpyTable))
 	.int	JMPTBL(L(StrncpyExit4_7), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit8), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit9), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit10), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit11), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit12), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit13), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit14), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit8_14), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit8_14), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit8_14), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit8_14), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit8_14), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit8_14), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit8_14), L(ExitStrncpyTable))
 	.int	JMPTBL(L(StrncpyExit15), L(ExitStrncpyTable))
 	.int	JMPTBL(L(StrncpyExit16), L(ExitStrncpyTable))
 	.int	JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
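
Unlike the Exit9_15 path, which is indexed by the position of the NUL in
%rdx, the strncpy exits are indexed by the remaining byte count in %r8 and
may have no NUL within those bytes at all, which is why the strcat build
stores its own terminator. A rough C sketch under those assumptions (n
plays the role of %r8, 8 <= n <= 14; in the strcat case dst must have room
for n + 1 bytes):

#include <stddef.h>
#include <string.h>

/* Sketch of the merged L(StrncpyExit8_14): copy exactly n bytes with
   two overlapping 8-byte moves; the USE_AS_STRCAT variant then writes
   the terminator at dst[n].  */
static void strncpy_exit8_14(char *dst, const char *src, size_t n,
			     int as_strcat)
{
	memcpy(dst, src, 8);                  /* bytes 0..7 */
	memcpy(dst + n - 8, src + n - 8, 8);  /* bytes n-8..n-1 */
	if (as_strcat)
		dst[n] = '\0';                /* USE_AS_STRCAT case */
}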

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=ee1d3cd9a99b79560ed9e44870b1873c0fa2fc8c

commit ee1d3cd9a99b79560ed9e44870b1873c0fa2fc8c
Author: Leonardo Sandoval <leonardo.sandoval.gonzalez@linux.intel.com>
Date:   Tue Sep 25 09:20:47 2018 -0500

    consolidate StrncpyExit4 to 7

diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 9719509..78990be 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -916,59 +916,16 @@ L(StrncpyExit3):
 	ret
 
 	.p2align 4
-L(StrncpyExit4):
-	mov	(%rsi), %edx
-	mov	%edx, (%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	4(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 4(%rdi)
-#  endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(StrncpyExit5):
-	mov	(%rsi), %ecx
-	mov	4(%rsi), %dl
-	mov	%ecx, (%rdi)
-	mov	%dl, 4(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	5(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 5(%rdi)
-#  endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(StrncpyExit6):
-	mov	(%rsi), %ecx
-	mov	4(%rsi), %dx
-	mov	%ecx, (%rdi)
-	mov	%dx, 4(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	6(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 6(%rdi)
-#  endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(StrncpyExit7):
+L(StrncpyExit4_7):
 	mov	(%rsi), %ecx
-	mov	3(%rsi), %edx
+	mov	-4(%rsi, %r8), %edx
 	mov	%ecx, (%rdi)
-	mov	%edx, 3(%rdi)
+	mov	%edx, -4(%rdi, %r8)
 #  ifdef USE_AS_STPCPY
-	lea	7(%rdi), %rax
+	lea	(%rdi, %r8), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	movb	$0, 7(%rdi)
+	movb	$0, (%rdi, %r8)
 #  endif
 	VZEROUPPER
 	ret
@@ -1681,10 +1638,10 @@ L(ExitStrncpyTable):
 	.int	JMPTBL(L(StrncpyExit1), L(ExitStrncpyTable))
 	.int	JMPTBL(L(StrncpyExit2), L(ExitStrncpyTable))
 	.int	JMPTBL(L(StrncpyExit3), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit4), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit5), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit6), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit7), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit4_7), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit4_7), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit4_7), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit4_7), L(ExitStrncpyTable))
 	.int	JMPTBL(L(StrncpyExit8), L(ExitStrncpyTable))
 	.int	JMPTBL(L(StrncpyExit9), L(ExitStrncpyTable))
 	.int	JMPTBL(L(StrncpyExit10), L(ExitStrncpyTable))

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=9c436d942c28ef4f0602f11534a9d49e657e8f70

commit 9c436d942c28ef4f0602f11534a9d49e657e8f70
Author: Leonardo Sandoval <leonardo.sandoval.gonzalez@linux.intel.com>
Date:   Tue Sep 25 08:43:21 2018 -0500

    consolidate Fill8 with Fill9_14

diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 8a84fb2..9719509 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -1425,13 +1425,7 @@ L(Fill4_7):
 	ret
 
 	.p2align 4
-L(Fill8):
-	mov	%rdx, (%rdi)
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Fill9_14):
+L(Fill8_14):
 	mov	%rdx, (%rdi)
 	mov	%rdx, -8(%rdi, %r8)
 	VZEROUPPER
@@ -1760,13 +1754,13 @@ L(FillTable):
 	.int	JMPTBL(L(Fill4_7), L(FillTable))
 	.int	JMPTBL(L(Fill4_7), L(FillTable))
 	.int	JMPTBL(L(Fill4_7), L(FillTable))
-	.int	JMPTBL(L(Fill8), L(FillTable))
-	.int	JMPTBL(L(Fill9_14), L(FillTable))
-	.int	JMPTBL(L(Fill9_14), L(FillTable))
-	.int	JMPTBL(L(Fill9_14), L(FillTable))
-	.int	JMPTBL(L(Fill9_14), L(FillTable))
-	.int	JMPTBL(L(Fill9_14), L(FillTable))
-	.int	JMPTBL(L(Fill9_14), L(FillTable))
+	.int	JMPTBL(L(Fill8_14), L(FillTable))
+	.int	JMPTBL(L(Fill8_14), L(FillTable))
+	.int	JMPTBL(L(Fill8_14), L(FillTable))
+	.int	JMPTBL(L(Fill8_14), L(FillTable))
+	.int	JMPTBL(L(Fill8_14), L(FillTable))
+	.int	JMPTBL(L(Fill8_14), L(FillTable))
+	.int	JMPTBL(L(Fill8_14), L(FillTable))
 	.int	JMPTBL(L(Fill15_16), L(FillTable))
 	.int	JMPTBL(L(Fill15_16), L(FillTable))
 	.int	JMPTBL(L(Fill17_31), L(FillTable))

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=e511145b5dd1dc4df1d1e03ea10006bb22c32cc0

commit e511145b5dd1dc4df1d1e03ea10006bb22c32cc0
Author: Leonardo Sandoval <leonardo.sandoval.gonzalez@linux.intel.com>
Date:   Tue Sep 25 08:36:51 2018 -0500

    consolidate Fill4_7

diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 606a968..8a84fb2 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -1418,28 +1418,9 @@ L(Fill3):
 	ret
 
 	.p2align 4
-L(Fill4):
+L(Fill4_7):
 	mov	%edx, (%rdi)
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Fill5):
-	mov	%edx, (%rdi)
-	mov	%dl, 4(%rdi)
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Fill6):
-	mov	%edx, (%rdi)
-	mov	%dx, 4(%rdi)
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Fill7):
-	mov	%rdx, -1(%rdi)
+	mov     %edx, -4(%rdi, %r8)
 	VZEROUPPER
 	ret
 
@@ -1775,10 +1756,10 @@ L(FillTable):
 	.int	JMPTBL(L(Fill1), L(FillTable))
 	.int	JMPTBL(L(Fill2), L(FillTable))
 	.int	JMPTBL(L(Fill3), L(FillTable))
-	.int	JMPTBL(L(Fill4), L(FillTable))
-	.int	JMPTBL(L(Fill5), L(FillTable))
-	.int	JMPTBL(L(Fill6), L(FillTable))
-	.int	JMPTBL(L(Fill7), L(FillTable))
+	.int	JMPTBL(L(Fill4_7), L(FillTable))
+	.int	JMPTBL(L(Fill4_7), L(FillTable))
+	.int	JMPTBL(L(Fill4_7), L(FillTable))
+	.int	JMPTBL(L(Fill4_7), L(FillTable))
 	.int	JMPTBL(L(Fill8), L(FillTable))
 	.int	JMPTBL(L(Fill9_14), L(FillTable))
 	.int	JMPTBL(L(Fill9_14), L(FillTable))
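
The fill paths apply the same overlap idea with stores only, since %edx is
already zero by the time L(StrncpyFillTailWithZero) dispatches through
L(FillTable). A small C sketch of the consolidated L(Fill4_7) (n stands in
for the remaining count in %r8, 4 <= n <= 7):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Sketch of the merged L(Fill4_7): two overlapping 4-byte zero stores
   pad any tail of 4 to 7 bytes, replacing Fill4..Fill7.  */
static void fill4_7(char *dst, size_t n)
{
	uint32_t z = 0;                   /* the zero held in %edx */
	memcpy(dst, &z, 4);               /* bytes 0..3 */
	memcpy(dst + n - 4, &z, 4);       /* bytes n-4..n-1 */
}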

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=e7ce816c982f673360d2ac04658632fdfca4df19

commit e7ce816c982f673360d2ac04658632fdfca4df19
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Mon Sep 24 15:19:50 2018 -0700

    Consolidate more entries in FillTable

diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 0e67a7b..606a968 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -1450,56 +1450,15 @@ L(Fill8):
 	ret
 
 	.p2align 4
-L(Fill9):
+L(Fill9_14):
 	mov	%rdx, (%rdi)
-	mov	%dl, 8(%rdi)
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Fill10):
-	mov	%rdx, (%rdi)
-	mov	%dx, 8(%rdi)
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Fill11):
-	mov	%rdx, (%rdi)
-	mov	%edx, 7(%rdi)
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Fill12):
-	mov	%rdx, (%rdi)
-	mov	%edx, 8(%rdi)
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Fill13):
-	mov	%rdx, (%rdi)
-	mov	%rdx, 5(%rdi)
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Fill14):
-	mov	%rdx, (%rdi)
-	mov	%rdx, 6(%rdi)
+	mov	%rdx, -8(%rdi, %r8)
 	VZEROUPPER
 	ret
 
 	.p2align 4
-L(Fill15):
-	vmovdqu %xmmZ, -1(%rdi)
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Fill16):
-	vmovdqu %xmmZ, (%rdi)
+L(Fill15_16):
+	vmovdqu %xmmZ, -16(%rdi,%r8)
 	VZEROUPPER
 	ret
 
@@ -1821,14 +1780,14 @@ L(FillTable):
 	.int	JMPTBL(L(Fill6), L(FillTable))
 	.int	JMPTBL(L(Fill7), L(FillTable))
 	.int	JMPTBL(L(Fill8), L(FillTable))
-	.int	JMPTBL(L(Fill9), L(FillTable))
-	.int	JMPTBL(L(Fill10), L(FillTable))
-	.int	JMPTBL(L(Fill11), L(FillTable))
-	.int	JMPTBL(L(Fill12), L(FillTable))
-	.int	JMPTBL(L(Fill13), L(FillTable))
-	.int	JMPTBL(L(Fill14), L(FillTable))
-	.int	JMPTBL(L(Fill15), L(FillTable))
-	.int	JMPTBL(L(Fill16), L(FillTable))
+	.int	JMPTBL(L(Fill9_14), L(FillTable))
+	.int	JMPTBL(L(Fill9_14), L(FillTable))
+	.int	JMPTBL(L(Fill9_14), L(FillTable))
+	.int	JMPTBL(L(Fill9_14), L(FillTable))
+	.int	JMPTBL(L(Fill9_14), L(FillTable))
+	.int	JMPTBL(L(Fill9_14), L(FillTable))
+	.int	JMPTBL(L(Fill15_16), L(FillTable))
+	.int	JMPTBL(L(Fill15_16), L(FillTable))
 	.int	JMPTBL(L(Fill17_31), L(FillTable))
 	.int	JMPTBL(L(Fill17_31), L(FillTable))
 	.int	JMPTBL(L(Fill17_31), L(FillTable))

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=81dbdce369b3376bab235925effbddb2c476d864

commit 81dbdce369b3376bab235925effbddb2c476d864
Author: Leonardo Sandoval <leonardo.sandoval.gonzalez@linux.intel.com>
Date:   Mon Sep 24 16:40:19 2018 -0500

    compact Fill17 to Fill31 labels into a single one

diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 4ab4c66..0e67a7b 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -1504,114 +1504,9 @@ L(Fill16):
 	ret
 
 	.p2align 4
-L(Fill17):
+L(Fill17_31):
 	vmovdqu %xmmZ, (%rdi)
-	mov	%dl, 16(%rdi)
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Fill18):
-	vmovdqu %xmmZ, (%rdi)
-	mov	%dx, 16(%rdi)
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Fill19):
-	vmovdqu %xmmZ, (%rdi)
-	mov	%edx, 15(%rdi)
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Fill20):
-	vmovdqu %xmmZ, (%rdi)
-	mov	%edx, 16(%rdi)
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Fill21):
-	vmovdqu %xmmZ, (%rdi)
-	mov	%edx, 16(%rdi)
-	mov	%dl, 20(%rdi)
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Fill22):
-	vmovdqu %xmmZ, (%rdi)
-	mov	%edx, 16(%rdi)
-	mov	%dx, 20(%rdi)
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Fill23):
-	vmovdqu %xmmZ, (%rdi)
-	mov	%rdx, 15(%rdi)
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Fill24):
-	vmovdqu %xmmZ, (%rdi)
-	mov	%rdx, 16(%rdi)
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Fill25):
-	vmovdqu %xmmZ, (%rdi)
-	mov	%rdx, 16(%rdi)
-	mov	%dl, 24(%rdi)
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Fill26):
-	vmovdqu %xmmZ, (%rdi)
-	mov	%rdx, 16(%rdi)
-	mov	%dx, 24(%rdi)
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Fill27):
-	vmovdqu %xmmZ, (%rdi)
-	mov	%rdx, 16(%rdi)
-	mov	%edx, 23(%rdi)
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Fill28):
-	vmovdqu %xmmZ, (%rdi)
-	mov	%rdx, 16(%rdi)
-	mov	%edx, 24(%rdi)
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Fill29):
-	vmovdqu %xmmZ, (%rdi)
-	mov	%rdx, 16(%rdi)
-	mov	%rdx, 21(%rdi)
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Fill30):
-	vmovdqu %xmmZ, (%rdi)
-	mov	%rdx, 16(%rdi)
-	mov	%rdx, 22(%rdi)
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Fill31):
-	vmovdqu %ymmZ, -1(%rdi)
+	vmovdqu %xmmZ, -16(%rdi, %r8)
 	VZEROUPPER
 	ret
 
@@ -1934,21 +1829,21 @@ L(FillTable):
 	.int	JMPTBL(L(Fill14), L(FillTable))
 	.int	JMPTBL(L(Fill15), L(FillTable))
 	.int	JMPTBL(L(Fill16), L(FillTable))
-	.int	JMPTBL(L(Fill17), L(FillTable))
-	.int	JMPTBL(L(Fill18), L(FillTable))
-	.int	JMPTBL(L(Fill19), L(FillTable))
-	.int	JMPTBL(L(Fill20), L(FillTable))
-	.int	JMPTBL(L(Fill21), L(FillTable))
-	.int	JMPTBL(L(Fill22), L(FillTable))
-	.int	JMPTBL(L(Fill23), L(FillTable))
-	.int	JMPTBL(L(Fill24), L(FillTable))
-	.int	JMPTBL(L(Fill25), L(FillTable))
-	.int	JMPTBL(L(Fill26), L(FillTable))
-	.int	JMPTBL(L(Fill27), L(FillTable))
-	.int	JMPTBL(L(Fill28), L(FillTable))
-	.int	JMPTBL(L(Fill29), L(FillTable))
-	.int	JMPTBL(L(Fill30), L(FillTable))
-	.int	JMPTBL(L(Fill31), L(FillTable))
+	.int	JMPTBL(L(Fill17_31), L(FillTable))
+	.int	JMPTBL(L(Fill17_31), L(FillTable))
+	.int	JMPTBL(L(Fill17_31), L(FillTable))
+	.int	JMPTBL(L(Fill17_31), L(FillTable))
+	.int	JMPTBL(L(Fill17_31), L(FillTable))
+	.int	JMPTBL(L(Fill17_31), L(FillTable))
+	.int	JMPTBL(L(Fill17_31), L(FillTable))
+	.int	JMPTBL(L(Fill17_31), L(FillTable))
+	.int	JMPTBL(L(Fill17_31), L(FillTable))
+	.int	JMPTBL(L(Fill17_31), L(FillTable))
+	.int	JMPTBL(L(Fill17_31), L(FillTable))
+	.int	JMPTBL(L(Fill17_31), L(FillTable))
+	.int	JMPTBL(L(Fill17_31), L(FillTable))
+	.int	JMPTBL(L(Fill17_31), L(FillTable))
+	.int	JMPTBL(L(Fill17_31), L(FillTable))
 	.int	JMPTBL(L(Fill32), L(FillTable))
 #  endif
 # endif

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=2a584bf6c3c2ff2275641cc9c28d1f3410b6ce11

commit 2a584bf6c3c2ff2275641cc9c28d1f3410b6ce11
Author: Leonardo Sandoval <leonardo.sandoval.gonzalez@linux.intel.com>
Date:   Mon Sep 24 16:12:35 2018 -0500

    remove useless StrncpyExit33-63 labels

diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 5a521ed..4ab4c66 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -1343,229 +1343,6 @@ L(StrncpyExit32):
 	ret
 
 	.p2align 4
-L(StrncpyExit33):
-	vmovdqu	(%rsi), %ymm0
-	mov	32(%rsi), %cl
-	vmovdqu	%ymm0, (%rdi)
-	mov	%cl, 32(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	33(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 33(%rdi)
-#  endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(StrncpyExit34):
-	/*  0/32, 32/2 */
-	vmovdqu (%rsi), %ymm0
-	mov	32(%rsi), %cx
-	vmovdqu %ymm0, (%rdi)
-	mov	%cx, 32(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	34(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 34(%rdi)
-#  endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(StrncpyExit35):
-	/*  0/32, 31/4 */
-	vmovdqu (%rsi), %ymm0
-	mov	31(%rsi), %ecx
-	vmovdqu %ymm0, (%rdi)
-	mov	%ecx, 31(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	35(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 35(%rdi)
-#  endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(StrncpyExit36):
-	/*  0/32, 32/4 */
-	vmovdqu (%rsi), %ymm0
-	mov	32(%rsi), %ecx
-	vmovdqu %ymm0, (%rdi)
-	mov	%ecx, 32(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	36(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 36(%rdi)
-#  endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(StrncpyExit37):
-	/*  0/32, 29/8 */
-	vmovdqu (%rsi), %ymm0
-	mov	29(%rsi), %rcx
-	vmovdqu %ymm0, (%rdi)
-	mov	%rcx, 29(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	37(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 37(%rdi)
-#  endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(StrncpyExit38):
-	/*  0/32, 30/8 */
-	vmovdqu (%rsi), %ymm0
-	mov	30(%rsi), %rcx
-	vmovdqu %ymm0, (%rdi)
-	mov	%rcx, 30(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	38(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 38(%rdi)
-#  endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(StrncpyExit39):
-	/*  0/32, 31/8 */
-	vmovdqu (%rsi), %ymm0
-	mov	31(%rsi), %rcx
-	vmovdqu %ymm0, (%rdi)
-	mov	%rcx, 31(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	39(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 39(%rdi)
-#  endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(StrncpyExit40):
-	/*  0/32, 32/8 */
-	vmovdqu (%rsi), %ymm0
-	mov	32(%rsi), %rcx
-	vmovdqu %ymm0, (%rdi)
-	mov	%rcx, 32(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	40(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 40(%rdi)
-#  endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(StrncpyExit41):
-	/*  0/32, 32/8, 40/1 */
-	vmovdqu (%rsi), %ymm0
-	vmovdqu 25(%rsi), %xmm1
-	vmovdqu %ymm0, (%rdi)
-	vmovdqu %xmm1, 25(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	41(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 41(%rdi)
-#  endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(StrncpyExit42):
-	/*  0/32, 32/8, 40/2 */
-	vmovdqu (%rsi), %ymm0
-	vmovdqu 26(%rsi), %xmm1
-	vmovdqu %ymm0, (%rdi)
-	vmovdqu %xmm1, 26(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	42(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 42(%rdi)
-#  endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(StrncpyExit43):
-	/*  0/32, 27/16 */
-	vmovdqu (%rsi), %ymm0
-	vmovdqu 27(%rsi), %xmm2
-	vmovdqu %ymm0, (%rdi)
-	vmovdqu %xmm2, 27(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	43(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 43(%rdi)
-#  endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(StrncpyExit44):
-	/*  0/32, 28/16 */
-	vmovdqu (%rsi), %ymm0
-	vmovdqu 28(%rsi), %xmm2
-	vmovdqu %ymm0, (%rdi)
-	vmovdqu %xmm2, 28(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	44(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 44(%rdi)
-#  endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(StrncpyExit45):
-	/*  0/32, 29/16 */
-	vmovdqu (%rsi), %ymm0
-	vmovdqu 29(%rsi), %xmm2
-	vmovdqu %ymm0, (%rdi)
-	vmovdqu %xmm2, 29(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	45(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 45(%rdi)
-#  endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(StrncpyExit46):
-	/*  0/32, 30/16 */
-	vmovdqu (%rsi), %ymm0
-	vmovdqu 30(%rsi), %xmm2
-	vmovdqu %ymm0, (%rdi)
-	vmovdqu %xmm2, 30(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	46(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 46(%rdi)
-#  endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
 L(StrncpyExit33_63):
 	/*  0/32, 31/16 */
 	vmovdqu (%rsi), %ymm0
@@ -1582,263 +1359,6 @@ L(StrncpyExit33_63):
 	ret
 
 	.p2align 4
-L(StrncpyExit48):
-	/*  0/32, 32/16 */
-	vmovdqu (%rsi), %ymm0
-	vmovdqu 32(%rsi), %xmm2
-	vmovdqu %ymm0, (%rdi)
-	vmovdqu %xmm2, 32(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	48(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 48(%rdi)
-#  endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(StrncpyExit49):
-	/* 0/32, 32/16, 48/1 */
-	vmovdqu (%rsi), %ymm0
-	vmovdqu 17(%rsi), %ymm2
-	vmovdqu %ymm0, (%rdi)
-	vmovdqu %ymm2, 17(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	49(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 49(%rdi)
-#  endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(StrncpyExit50):
-	/*  0/32, 32/16, 48/2 */
-	vmovdqu (%rsi), %ymm0
-	vmovdqu 18(%rsi), %ymm2
-	vmovdqu %ymm0, (%rdi)
-	vmovdqu %ymm2, 18(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	50(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 50(%rdi)
-#  endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(StrncpyExit51):
-	/*  0/32, 32/16, 47/4 */
-	vmovdqu (%rsi), %ymm0
-	vmovdqu 19(%rsi), %ymm2
-	vmovdqu %ymm0, (%rdi)
-	vmovdqu %ymm2, 19(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	51(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 51(%rdi)
-#  endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(StrncpyExit52):
-	/*  0/32, 32/16, 48/4 */
-	vmovdqu (%rsi), %ymm0
-	vmovdqu 20(%rsi), %ymm2
-	vmovdqu %ymm0, (%rdi)
-	vmovdqu %ymm2, 20(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	52(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 52(%rdi)
-#  endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(StrncpyExit53):
-	/*  0/32, 32/16, 45/8 */
-	vmovdqu (%rsi), %ymm0
-	vmovdqu 21(%rsi), %ymm2
-	vmovdqu %ymm0, (%rdi)
-	vmovdqu %ymm2, 21(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	53(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 53(%rdi)
-#  endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(StrncpyExit54):
-	/*  0/32, 32/16, 46/8 */
-	vmovdqu (%rsi), %ymm0
-	vmovdqu 22(%rsi), %ymm2
-	vmovdqu %ymm0, (%rdi)
-	vmovdqu %ymm2, 22(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	54(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 54(%rdi)
-#  endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(StrncpyExit55):
-	/* 0/32, 32/16, 47/8 */
-	vmovdqu (%rsi), %ymm0
-	vmovdqu 23(%rsi), %ymm2
-	vmovdqu %ymm0, (%rdi)
-	vmovdqu %ymm2, 23(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	55(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 55(%rdi)
-#  endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(StrncpyExit56):
-	/* 0/32, 32/16, 48/8 */
-	vmovdqu (%rsi), %ymm0
-	vmovdqu 24(%rsi), %ymm2
-	vmovdqu %ymm0, (%rdi)
-	vmovdqu %ymm2, 24(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	56(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 56(%rdi)
-#  endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(StrncpyExit57):
-	/* 0/32, 25/32 */
-	vmovdqu (%rsi), %ymm0
-	vmovdqu 25(%rsi), %ymm2
-	vmovdqu %ymm0, (%rdi)
-	vmovdqu %ymm2, 25(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	57(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 57(%rdi)
-#  endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(StrncpyExit58):
-	/* 0/32, 26/32 */
-	vmovdqu (%rsi), %ymm0
-	vmovdqu 26(%rsi), %ymm2
-	vmovdqu %ymm0, (%rdi)
-	vmovdqu %ymm2, 26(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	58(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 58(%rdi)
-#  endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(StrncpyExit59):
-	/* 0/32, 27/32 */
-	vmovdqu (%rsi), %ymm0
-	vmovdqu 27(%rsi), %ymm2
-	vmovdqu %ymm0, (%rdi)
-	vmovdqu %ymm2, 27(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	59(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 59(%rdi)
-#  endif
-	VZEROUPPER
-	ret
-
-
-	.p2align 4
-L(StrncpyExit60):
-	/* 0/32, 28/32 */
-	vmovdqu (%rsi), %ymm0
-	vmovdqu 28(%rsi), %ymm2
-	vmovdqu %ymm0, (%rdi)
-	vmovdqu %ymm2, 28(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	60(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 60(%rdi)
-#  endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(StrncpyExit61):
-	/* 0/32, 29/32 */
-	vmovdqu (%rsi), %ymm0
-	vmovdqu 29(%rsi), %ymm2
-	vmovdqu %ymm0, (%rdi)
-	vmovdqu %ymm2, 29(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	61(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 61(%rdi)
-#  endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(StrncpyExit62):
-	/* 0/32, 30/32 */
-	vmovdqu (%rsi), %ymm0
-	vmovdqu 30(%rsi), %ymm2
-	vmovdqu %ymm0, (%rdi)
-	vmovdqu %ymm2, 30(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	62(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 62(%rdi)
-#  endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(StrncpyExit63):
-	/* 0/32, 31/32 */
-	vmovdqu (%rsi), %ymm0
-	vmovdqu 31(%rsi), %ymm2
-	vmovdqu %ymm0, (%rdi)
-	vmovdqu %ymm2, 31(%rdi)
-#  ifdef USE_AS_STPCPY
-	lea	63(%rdi), %rax
-#  endif
-#  ifdef USE_AS_STRCAT
-	movb	$0, 63(%rdi)
-#  endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
 L(StrncpyExit64):
 	/* 0/32, 32/32 */
 	vmovdqu (%rsi), %ymm0

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=f24c12a6cf0552d60c7ea69e9f8cc4a719fdb592

commit f24c12a6cf0552d60c7ea69e9f8cc4a719fdb592
Author: Leonardo Sandoval <leonardo.sandoval.gonzalez@linux.intel.com>
Date:   Mon Sep 24 15:50:49 2018 -0500

    create a single StrncpyExit17_31

diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 71d57c4..5a521ed 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -1195,7 +1195,7 @@ L(StrncpyExit22):
 	ret
 
 	.p2align 4
-L(StrncpyExit23):
+L(StrncpyExit17_31):
 	vmovdqu (%rsi), %xmm0
 	vmovdqu -16(%rsi, %r8), %xmm2
 	vmovdqu %xmm0, (%rdi)
@@ -2345,21 +2345,21 @@ L(ExitStrncpyTable):
 	.int	JMPTBL(L(StrncpyExit14), L(ExitStrncpyTable))
 	.int	JMPTBL(L(StrncpyExit15), L(ExitStrncpyTable))
 	.int	JMPTBL(L(StrncpyExit16), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit17), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit18), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit19), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit20), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit21), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit22), L(ExitStrncpyTable))
-	.int    JMPTBL(L(StrncpyExit23), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit24), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit25), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit26), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit27), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit28), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit29), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit30), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit31), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit17_31), L(ExitStrncpyTable))
 	.int	JMPTBL(L(StrncpyExit32), L(ExitStrncpyTable))
 	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
 	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=1b9e998325aee369aff3175b9bd5a14b86ca7f35

commit 1b9e998325aee369aff3175b9bd5a14b86ca7f35
Author: Leonardo Sandoval <leonardo.sandoval.gonzalez@linux.intel.com>
Date:   Mon Sep 24 15:46:09 2018 -0500

    prepare StrncpyExit23 for 17-31 labels

diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 8f2c482..71d57c4 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -1196,15 +1196,15 @@ L(StrncpyExit22):
 
 	.p2align 4
 L(StrncpyExit23):
-	vmovdqu	(%rsi), %xmm0
-	mov	15(%rsi), %rcx
-	vmovdqu	%xmm0, (%rdi)
-	mov	%rcx, 15(%rdi)
+	vmovdqu (%rsi), %xmm0
+	vmovdqu -16(%rsi, %r8), %xmm2
+	vmovdqu %xmm0, (%rdi)
+	vmovdqu %xmm2, -16(%rdi, %r8)
 #  ifdef USE_AS_STPCPY
-	lea	23(%rdi), %rax
+	lea	(%rdi, %r8), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	movb	$0, 23(%rdi)
+	movb	$0, (%rdi, %r8)
 #  endif
 	VZEROUPPER
 	ret

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=ec1472b4329dfd4fcdc66a0ae2285a9e1929f844

commit ec1472b4329dfd4fcdc66a0ae2285a9e1929f844
Author: Leonardo Sandoval <leonardo.sandoval.gonzalez@linux.intel.com>
Date:   Mon Sep 24 15:40:36 2018 -0500

    Create a single label for StrncpyExit33-StrncpyExit63 cases

diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index cc99867..8f2c482 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -1566,7 +1566,7 @@ L(StrncpyExit46):
 	ret
 
 	.p2align 4
-L(StrncpyExit47):
+L(StrncpyExit33_63):
 	/*  0/32, 31/16 */
 	vmovdqu (%rsi), %ymm0
 	vmovdqu -VEC_SIZE(%rsi, %r8), %ymm2
@@ -2361,37 +2361,37 @@ L(ExitStrncpyTable):
 	.int	JMPTBL(L(StrncpyExit30), L(ExitStrncpyTable))
 	.int	JMPTBL(L(StrncpyExit31), L(ExitStrncpyTable))
 	.int	JMPTBL(L(StrncpyExit32), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit33), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit34), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit35), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit36), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit37), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit38), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit39), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit40), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit41), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit42), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit43), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit44), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit45), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit46), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit47), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit48), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit49), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit50), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit51), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit52), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit53), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit54), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit55), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit56), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit57), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit58), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit59), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit60), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit61), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit62), L(ExitStrncpyTable))
-	.int	JMPTBL(L(StrncpyExit63), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33_63), L(ExitStrncpyTable))
 	.int	JMPTBL(L(StrncpyExit64), L(ExitStrncpyTable))
 	.int	JMPTBL(L(StrncpyExit65), L(ExitStrncpyTable))
 #  ifndef USE_AS_STRCAT

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=a527c9d1d7e966caf71d549b782681012d62055c

commit a527c9d1d7e966caf71d549b782681012d62055c
Author: Leonardo Sandoval <leonardo.sandoval.gonzalez@linux.intel.com>
Date:   Mon Sep 24 15:29:56 2018 -0500

    prepare StrncpyExit47 for multiple label
    (i.e. prepare StrncpyExit47 to serve multiple labels)

diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 99b9bc7..cc99867 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -1569,14 +1569,14 @@ L(StrncpyExit46):
 L(StrncpyExit47):
 	/*  0/32, 31/16 */
 	vmovdqu (%rsi), %ymm0
-	vmovdqu 31(%rsi), %xmm2
+	vmovdqu -VEC_SIZE(%rsi, %r8), %ymm2
 	vmovdqu %ymm0, (%rdi)
-	vmovdqu %xmm2, 31(%rdi)
+	vmovdqu %ymm2, -VEC_SIZE(%rdi, %r8)
 #  ifdef USE_AS_STPCPY
-	lea	47(%rdi), %rax
+	lea	(%rdi, %r8), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	movb	$0, 47(%rdi)
+	movb	$0, (%rdi, %r8)
 #  endif
 	VZEROUPPER
 	ret

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=2e7d2b76ec0ba6f8b6bccd888b246d381756a4d4

commit 2e7d2b76ec0ba6f8b6bccd888b246d381756a4d4
Author: Leonardo Sandoval <leonardo.sandoval.gonzalez@linux.intel.com>
Date:   Mon Sep 24 14:38:15 2018 -0500

    remove obsolete exit branches

diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 5890b8a..99b9bc7 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -794,107 +794,6 @@ L(Exit16):
 	ret
 
 	.p2align 4
-L(Exit17):
-	vmovdqu	(%rsi), %xmm0
-	vmovdqu	%xmm0, (%rdi)
-	mov	%dh, 16(%rdi)
-# ifdef USE_AS_STPCPY
-	lea	16(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$17, %r8
-	lea	17(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Exit18):
-	vmovdqu	(%rsi), %xmm0
-	mov	16(%rsi), %cx
-	vmovdqu	%xmm0, (%rdi)
-	mov	%cx, 16(%rdi)
-# ifdef USE_AS_STPCPY
-	lea	17(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$18, %r8
-	lea	18(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Exit19):
-	vmovdqu	(%rsi), %xmm0
-	mov	15(%rsi), %ecx
-	vmovdqu	%xmm0, (%rdi)
-	mov	%ecx, 15(%rdi)
-# ifdef USE_AS_STPCPY
-	lea	18(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$19, %r8
-	lea	19(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Exit20):
-	vmovdqu	(%rsi), %xmm0
-	mov	16(%rsi), %ecx
-	vmovdqu	%xmm0, (%rdi)
-	mov	%ecx, 16(%rdi)
-# ifdef USE_AS_STPCPY
-	lea	19(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$20, %r8
-	lea	20(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Exit21):
-	vmovdqu	(%rsi), %xmm0
-	mov	13(%rsi), %rcx
-	vmovdqu	%xmm0, (%rdi)
-	mov	%rcx, 13(%rdi)
-# ifdef USE_AS_STPCPY
-	lea	20(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$21, %r8
-	lea	21(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Exit22):
-	vmovdqu	(%rsi), %xmm0
-	mov	14(%rsi), %rcx
-	vmovdqu	%xmm0, (%rdi)
-	mov	%rcx, 14(%rdi)
-# ifdef USE_AS_STPCPY
-	lea	21(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$22, %r8
-	lea	22(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
 L(Exit17_31):
 	vmovdqu (%rsi), %xmm0
 	vmovdqu -15(%rsi, %rdx), %xmm1
@@ -913,142 +812,6 @@ L(Exit17_31):
 	ret
 
 	.p2align 4
-L(Exit24):
-	vmovdqu	(%rsi), %xmm0
-	mov	16(%rsi), %rcx
-	vmovdqu	%xmm0, (%rdi)
-	mov	%rcx, 16(%rdi)
-# ifdef USE_AS_STPCPY
-	lea	23(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$24, %r8
-	lea	24(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Exit25):
-	vmovdqu	(%rsi), %xmm0
-	vmovdqu	9(%rsi), %xmm1
-	vmovdqu	%xmm0, (%rdi)
-	vmovdqu	%xmm1, 9(%rdi)
-# ifdef USE_AS_STPCPY
-	lea	24(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$25, %r8
-	lea	25(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Exit26):
-	vmovdqu	(%rsi), %xmm0
-	vmovdqu	10(%rsi), %xmm1
-	vmovdqu	%xmm0, (%rdi)
-	vmovdqu	%xmm1, 10(%rdi)
-# ifdef USE_AS_STPCPY
-	lea	25(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$26, %r8
-	lea	26(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Exit27):
-	vmovdqu	(%rsi), %xmm0
-	vmovdqu	11(%rsi), %xmm1
-	vmovdqu	%xmm0, (%rdi)
-	vmovdqu	%xmm1, 11(%rdi)
-# ifdef USE_AS_STPCPY
-	lea	26(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$27, %r8
-	lea	27(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Exit28):
-	vmovdqu	(%rsi), %xmm0
-	vmovdqu	12(%rsi), %xmm1
-	vmovdqu	%xmm0, (%rdi)
-	vmovdqu	%xmm1, 12(%rdi)
-# ifdef USE_AS_STPCPY
-	lea	27(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$28, %r8
-	lea	28(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Exit29):
-	vmovdqu	(%rsi), %xmm0
-	vmovdqu	13(%rsi), %xmm2
-	vmovdqu	%xmm0, (%rdi)
-	vmovdqu	%xmm2, 13(%rdi)
-# ifdef USE_AS_STPCPY
-	lea	28(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$29, %r8
-	lea	29(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Exit30):
-	vmovdqu	(%rsi), %xmm0
-	vmovdqu	14(%rsi), %xmm2
-	vmovdqu	%xmm0, (%rdi)
-	vmovdqu	%xmm2, 14(%rdi)
-# ifdef USE_AS_STPCPY
-	lea	29(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$30, %r8
-	lea	30(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Exit31):
-	vmovdqu	(%rsi), %xmm0
-	vmovdqu	15(%rsi), %xmm2
-	vmovdqu	%xmm0, (%rdi)
-	vmovdqu	%xmm2, 15(%rdi)
-# ifdef USE_AS_STPCPY
-	lea	30(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$31, %r8
-	lea	31(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
 L(Exit32):
 	vmovdqu	(%rsi), %ymm0
 	vmovdqu	%ymm0, (%rdi)
@@ -1065,7 +828,6 @@ L(Exit32):
 
 	.p2align 4
 L(Exit33_63):
-	/* 0/32, 31/16 */
 	vmovdqu (%rsi), %ymm0
 	vmovdqu -31(%rsi, %rdx), %ymm1
 	vmovdqu %ymm0, (%rdi)
@@ -1083,296 +845,7 @@ L(Exit33_63):
 	ret
 
 	.p2align 4
-L(Exit48):
-	/* 0/32, 32/16 */
-	vmovdqu (%rsi), %ymm0
-	vmovdqu 32(%rsi), %xmm1
-	vmovdqu %ymm0, (%rdi)
-	vmovdqu %xmm1, 32(%rdi)
-# ifdef USE_AS_STPCPY
-	lea	47(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$48, %r8
-	lea	48(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Exit49):
-	/* 0/32, 32/16, 48/1 */
-	vmovdqu (%rsi), %ymm0
-	vmovdqu 17(%rsi), %ymm1
-	vmovdqu %ymm0, (%rdi)
-	vmovdqu %ymm1, 17(%rdi)
-# ifdef USE_AS_STPCPY
-	lea	48(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$49, %r8
-	lea	49(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Exit50):
-	/* 0/32, 32/16, 48/2 */
-	vmovdqu (%rsi), %ymm0
-	vmovdqu 18(%rsi), %ymm1
-	vmovdqu %ymm0, (%rdi)
-	vmovdqu %ymm1, 18(%rdi)
-# ifdef USE_AS_STPCPY
-	lea	49(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$50, %r8
-	lea	50(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Exit51):
-	/* 0/32, 32/16, 47/4 */
-	vmovdqu (%rsi), %ymm0
-	vmovdqu 19(%rsi), %ymm1
-	vmovdqu %ymm0, (%rdi)
-	vmovdqu %ymm1, 19(%rdi)
-# ifdef USE_AS_STPCPY
-	lea	50(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$51, %r8
-	lea	51(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Exit52):
-	/* 0/32, 32/16, 48/4 */
-	vmovdqu (%rsi), %ymm0
-	vmovdqu 20(%rsi), %ymm1
-	vmovdqu %ymm0, (%rdi)
-	vmovdqu %ymm1, 20(%rdi)
-# ifdef USE_AS_STPCPY
-	lea	51(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$52, %r8
-	lea	52(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Exit53):
-	/* 0/32, 32/16, 45/8 */
-	vmovdqu (%rsi), %ymm0
-	vmovdqu 21(%rsi), %ymm1
-	vmovdqu %ymm0, (%rdi)
-	vmovdqu %ymm1, 21(%rdi)
-# ifdef USE_AS_STPCPY
-	lea	52(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$53, %r8
-	lea	53(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Exit54):
-	/* 0/32, 32/16, 46/8 */
-	vmovdqu (%rsi), %ymm0
-	vmovdqu 22(%rsi), %ymm1
-	vmovdqu %ymm0, (%rdi)
-	vmovdqu %ymm1, 22(%rdi)
-# ifdef USE_AS_STPCPY
-	lea	53(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$54, %r8
-	lea	54(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Exit55):
-	/* 0/32, 32/16, 47/8 */
-	vmovdqu (%rsi), %ymm0
-	vmovdqu 23(%rsi), %ymm1
-	vmovdqu %ymm0, (%rdi)
-	vmovdqu %ymm1, 23(%rdi)
-# ifdef USE_AS_STPCPY
-	lea	54(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$55, %r8
-	lea	55(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Exit56):
-	/* 0/32, 32/16, 48/8 */
-	vmovdqu (%rsi), %ymm0
-	vmovdqu 24(%rsi), %ymm1
-	vmovdqu %ymm0, (%rdi)
-	vmovdqu %ymm1, 24(%rdi)
-# ifdef USE_AS_STPCPY
-	lea	55(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$56, %r8
-	lea	56(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Exit57):
-	/* 0/32, 25/32 */
-	vmovdqu (%rsi), %ymm0
-	vmovdqu 25(%rsi), %ymm1
-	vmovdqu %ymm0, (%rdi)
-	vmovdqu %ymm1, 25(%rdi)
-# ifdef USE_AS_STPCPY
-	lea	56(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$57, %r8
-	lea	57(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Exit58):
-	/* 0/32, 26/32 */
-	vmovdqu (%rsi), %ymm0
-	vmovdqu 26(%rsi), %ymm1
-	vmovdqu %ymm0, (%rdi)
-	vmovdqu %ymm1, 26(%rdi)
-# ifdef USE_AS_STPCPY
-	lea	57(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$58, %r8
-	lea	58(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Exit59):
-	/* 0/32, 27/32 */
-	vmovdqu (%rsi), %ymm0
-	vmovdqu 27(%rsi), %ymm1
-	vmovdqu %ymm0, (%rdi)
-	vmovdqu %ymm1, 27(%rdi)
-# ifdef USE_AS_STPCPY
-	lea	58(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$59, %r8
-	lea	59(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Exit60):
-	/* 0/32, 28/32 */
-	vmovdqu (%rsi), %ymm0
-	vmovdqu 28(%rsi), %ymm1
-	vmovdqu %ymm0, (%rdi)
-	vmovdqu %ymm1, 28(%rdi)
-# ifdef USE_AS_STPCPY
-	lea	59(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$60, %r8
-	lea	60(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Exit61):
-	/* 0/32, 29/32 */
-	vmovdqu (%rsi), %ymm0
-	vmovdqu 29(%rsi), %ymm1
-	vmovdqu %ymm0, (%rdi)
-	vmovdqu %ymm1, 29(%rdi)
-# ifdef USE_AS_STPCPY
-	lea	60(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$61, %r8
-	lea	61(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Exit62):
-	/* 0/32, 30/32 */
-	vmovdqu (%rsi), %ymm0
-	vmovdqu 30(%rsi), %ymm1
-	vmovdqu %ymm0, (%rdi)
-	vmovdqu %ymm1, 30(%rdi)
-# ifdef USE_AS_STPCPY
-	lea	61(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$62, %r8
-	lea	62(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Exit63):
-	/* 0/32, 31/32 */
-	vmovdqu (%rsi), %ymm0
-	vmovdqu 31(%rsi), %ymm1
-	vmovdqu %ymm0, (%rdi)
-	vmovdqu %ymm1, 31(%rdi)
-# ifdef USE_AS_STPCPY
-	lea	62(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$63, %r8
-	lea	63(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
 L(Exit64):
-	/* 0/32, 32/32 */
 	vmovdqu (%rsi), %ymm0
 	vmovdqu 32(%rsi), %ymm1
 	vmovdqu %ymm0, (%rdi)

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=acaf1ef714e072b891aa51630d47716ee08929c4

commit acaf1ef714e072b891aa51630d47716ee08929c4
Author: Leonardo Sandoval <leonardo.sandoval.gonzalez@linux.intel.com>
Date:   Mon Sep 24 14:30:28 2018 -0500

    create a single Exit17_31 label for Exit17 to Exit31 labels

diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index fd46d28..5890b8a 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -895,7 +895,7 @@ L(Exit22):
 	ret
 
 	.p2align 4
-L(Exit23):
+L(Exit17_31):
 	vmovdqu (%rsi), %xmm0
 	vmovdqu -15(%rsi, %rdx), %xmm1
 	vmovdqu %xmm0, (%rdi)
@@ -2805,21 +2805,21 @@ L(ExitTable):
 	.int	JMPTBL(L(Exit14), L(ExitTable))
 	.int	JMPTBL(L(Exit15), L(ExitTable))
 	.int	JMPTBL(L(Exit16), L(ExitTable))
-	.int	JMPTBL(L(Exit17), L(ExitTable))
-	.int	JMPTBL(L(Exit18), L(ExitTable))
-	.int	JMPTBL(L(Exit19), L(ExitTable))
-	.int	JMPTBL(L(Exit20), L(ExitTable))
-	.int	JMPTBL(L(Exit21), L(ExitTable))
-	.int	JMPTBL(L(Exit22), L(ExitTable))
-	.int    JMPTBL(L(Exit23), L(ExitTable))
-	.int	JMPTBL(L(Exit24), L(ExitTable))
-	.int	JMPTBL(L(Exit25), L(ExitTable))
-	.int	JMPTBL(L(Exit26), L(ExitTable))
-	.int	JMPTBL(L(Exit27), L(ExitTable))
-	.int	JMPTBL(L(Exit28), L(ExitTable))
-	.int	JMPTBL(L(Exit29), L(ExitTable))
-	.int	JMPTBL(L(Exit30), L(ExitTable))
-	.int	JMPTBL(L(Exit31), L(ExitTable))
+	.int	JMPTBL(L(Exit17_31), L(ExitTable))
+	.int	JMPTBL(L(Exit17_31), L(ExitTable))
+	.int	JMPTBL(L(Exit17_31), L(ExitTable))
+	.int	JMPTBL(L(Exit17_31), L(ExitTable))
+	.int	JMPTBL(L(Exit17_31), L(ExitTable))
+	.int	JMPTBL(L(Exit17_31), L(ExitTable))
+	.int	JMPTBL(L(Exit17_31), L(ExitTable))
+	.int	JMPTBL(L(Exit17_31), L(ExitTable))
+	.int	JMPTBL(L(Exit17_31), L(ExitTable))
+	.int	JMPTBL(L(Exit17_31), L(ExitTable))
+	.int	JMPTBL(L(Exit17_31), L(ExitTable))
+	.int	JMPTBL(L(Exit17_31), L(ExitTable))
+	.int	JMPTBL(L(Exit17_31), L(ExitTable))
+	.int	JMPTBL(L(Exit17_31), L(ExitTable))
+	.int	JMPTBL(L(Exit17_31), L(ExitTable))
 	.int	JMPTBL(L(Exit32), L(ExitTable))
 	.int	JMPTBL(L(Exit33_63), L(ExitTable))
 	.int	JMPTBL(L(Exit33_63), L(ExitTable))

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=fa50be1e0599168aeda6497a08f1955279198efd

commit fa50be1e0599168aeda6497a08f1955279198efd
Author: Leonardo Sandoval <leonardo.sandoval.gonzalez@linux.intel.com>
Date:   Mon Sep 24 14:25:20 2018 -0500

    strcpy-avx2.S: change Exit23 to prepare for Exit17 to Exit31

diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 2f275f5..fd46d28 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -896,17 +896,18 @@ L(Exit22):
 
 	.p2align 4
 L(Exit23):
-	vmovdqu	(%rsi), %xmm0
-	mov	15(%rsi), %rcx
-	vmovdqu	%xmm0, (%rdi)
-	mov	%rcx, 15(%rdi)
+	vmovdqu (%rsi), %xmm0
+	vmovdqu -15(%rsi, %rdx), %xmm1
+	vmovdqu %xmm0, (%rdi)
+	vmovdqu %xmm1, -15(%rdi, %rdx)
 # ifdef USE_AS_STPCPY
-	lea	22(%rdi), %rax
+	lea	(%rdi, %rdx), %rax
 # endif
 # if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$23, %r8
-	lea	23(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
+	sub %rdx, %r8
+	sub $1, %r8
+	lea 1(%rdi, %rdx), %rdi
+	jnz L(StrncpyFillTailWithZero)
 # endif
 	VZEROUPPER
 	ret

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=3a7ccd25d15b2b34ff5f86cd68b2679cae46c335

commit 3a7ccd25d15b2b34ff5f86cd68b2679cae46c335
Author: Leonardo Sandoval <leonardo.sandoval.gonzalez@linux.intel.com>
Date:   Mon Sep 24 12:16:19 2018 -0500

    strcpy-avx2.S: usage of ymmZ vector

diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 38b36e9..2f275f5 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -262,7 +262,7 @@ L(UnalignedFourVecSizeLoop_start):
 	jz	L(UnalignedFourVecSizeLoop_start)
 
 L(UnalignedFourVecSizeLeave):
-	vpcmpeqb %ymm4, %ymm0, %ymm0
+	vpcmpeqb %ymm4, %ymmZ, %ymm0
 	vpmovmskb %ymm0, %edx
 	test	%edx, %edx
 	jnz	L(CopyVecSizeUnaligned_0)
@@ -272,7 +272,7 @@ L(UnalignedFourVecSizeLeave):
 	test	%ecx, %ecx
 	jnz	L(CopyVecSizeUnaligned_16)
 
-	vpcmpeqb %ymm6, %ymm0, %ymm0
+	vpcmpeqb %ymm6, %ymmZ, %ymm0
 	vpmovmskb %ymm0, %edx
 	test	%edx, %edx
 	jnz	L(CopyVecSizeUnaligned_32)
@@ -2724,7 +2724,7 @@ L(UnalignedFourVecSizeLeaveCase3):
 	.p2align 4
 L(UnalignedFourVecSizeLeaveCase2):
 	xor	%ecx, %ecx
-	vpcmpeqb %ymm4, %ymm0, %ymm0
+	vpcmpeqb %ymm4, %ymmZ, %ymm0
 	vpmovmskb %ymm0, %edx
 	add	$(VEC_SIZE * 3), %r8
 	jle	L(CopyVecSizeCase2OrCase3)
@@ -2734,7 +2734,7 @@ L(UnalignedFourVecSizeLeaveCase2):
 #  else
 	jnz	L(CopyVecSize)
 #  endif
-	vpcmpeqb %ymm5, %ymm0, %ymm0
+	vpcmpeqb %ymm5, %ymmZ, %ymm0
 	vpmovmskb %ymm0, %edx
 	vmovdqu %ymm4, (%rdi)
 	add	$VEC_SIZE, %rcx
@@ -2747,7 +2747,7 @@ L(UnalignedFourVecSizeLeaveCase2):
 	jnz	L(CopyVecSize)
 #  endif
 
-	vpcmpeqb %ymm6, %ymm0, %ymm0
+	vpcmpeqb %ymm6, %ymmZ, %ymm0
 	vpmovmskb %ymm0, %edx
 	vmovdqu %ymm5, VEC_SIZE(%rdi)
 	add	$VEC_SIZE, %rcx
@@ -2760,7 +2760,7 @@ L(UnalignedFourVecSizeLeaveCase2):
 	jnz	L(CopyVecSize)
 #  endif
 
-	vpcmpeqb %ymm7, %ymm0, %ymm0
+	vpcmpeqb %ymm7, %ymmZ, %ymm0
 	vpmovmskb %ymm0, %edx
 	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
 	lea	VEC_SIZE(%rdi, %rcx), %rdi
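
For context, the vpcmpeqb/vpmovmskb pair that feeds all of these exits
compares a 32-byte block against the all-zero ymmZ register and turns the
result into a bitmask whose lowest set bit is the terminator index; keeping
ymmZ permanently zero is also what lets the per-use vpxor setup be dropped
in the commit below. A sketch with AVX2 intrinsics (an unaligned load is
used here for simplicity, whereas the assembly aligns %rsi first; build
with -mavx2 on GCC or Clang):

#include <immintrin.h>
#include <stddef.h>

/* Return the index of the first NUL in the 32-byte block at p, or 32
   if the block contains none.  Assumes p is readable for 32 bytes.  */
static size_t scan32(const char *p)
{
	__m256i zero  = _mm256_setzero_si256();  /* the "ymmZ" value */
	__m256i chunk = _mm256_loadu_si256((const __m256i *) p);
	unsigned mask = (unsigned) _mm256_movemask_epi8(
		_mm256_cmpeq_epi8(chunk, zero));  /* bit i set iff p[i] == 0 */
	return mask ? (size_t) __builtin_ctz(mask) : 32;
}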

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=da995e6f074b3504d4e8c9860b41caa33ffdea23

commit da995e6f074b3504d4e8c9860b41caa33ffdea23
Author: Leonardo Sandoval <leonardo.sandoval.gonzalez@linux.intel.com>
Date:   Mon Sep 24 12:06:49 2018 -0500

    strcpy-avx2.S: remove useless vpxor instructions

diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 9dbcd08..38b36e9 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -71,8 +71,6 @@ ENTRY (STRCPY)
 
 	and	$-VEC_SIZE, %rsi
 	and	$(VEC_SIZE - 1), %ecx
-	vpxor	%xmm0, %xmm0, %xmm0
-	vpxor	%xmm1, %xmm1, %xmm1
 
 	vpcmpeqb (%rsi), %ymmZ, %ymm1
 	vpmovmskb %ymm1, %edx

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=a27d35e8a9806b211d31449c13a8c97b114e5221

commit a27d35e8a9806b211d31449c13a8c97b114e5221
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Fri Sep 21 09:00:16 2018 -0700

    Replace Exit33 to Exit63 with Exit33_63

diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index b4f4738..9dbcd08 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -1065,258 +1065,7 @@ L(Exit32):
 	ret
 
 	.p2align 4
-L(Exit33):
-	/* 0/32, 32/1 */
-	vmovdqu (%rsi), %ymm0
-	vmovdqu %ymm0, (%rdi)
-	mov	%dh, 32(%rdi)
-# ifdef USE_AS_STPCPY
-	lea	32(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$33, %r8
-	lea	33(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Exit34):
-	/* 0/32, 32/2 */
-	vmovdqu (%rsi), %ymm0
-	mov	32(%rsi), %dx
-	vmovdqu %ymm0, (%rdi)
-	mov	%dx, 32(%rdi)
-# ifdef USE_AS_STPCPY
-	lea	33(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$34, %r8
-	lea	34(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Exit35):
-	/* 0/32, 31/4 */
-	vmovdqu (%rsi), %ymm0
-	mov	31(%rsi), %edx
-	vmovdqu %ymm0, (%rdi)
-	mov	%edx, 31(%rdi)
-# ifdef USE_AS_STPCPY
-	lea	34(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$35, %r8
-	lea	35(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Exit36):
-	/* 0/32, 32/4 */
-	vmovdqu (%rsi), %ymm0
-	mov	32(%rsi), %edx
-	vmovdqu %ymm0, (%rdi)
-	mov	%edx, 32(%rdi)
-# ifdef USE_AS_STPCPY
-	lea	35(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$36, %r8
-	lea	36(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Exit37):
-	/* 0/32, 29/8 */
-	vmovdqu (%rsi), %ymm0
-	mov	29(%rsi), %rdx
-	vmovdqu %ymm0, (%rdi)
-	mov	%rdx, 29(%rdi)
-# ifdef USE_AS_STPCPY
-	lea	36(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$37, %r8
-	lea	37(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Exit38):
-	/* 0/32, 30/8 */
-	vmovdqu (%rsi), %ymm0
-	mov	30(%rsi), %rdx
-	vmovdqu %ymm0, (%rdi)
-	mov	%rdx, 30(%rdi)
-# ifdef USE_AS_STPCPY
-	lea	37(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$38, %r8
-	lea	38(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Exit39):
-	/* 0/32, 31/8 */
-	vmovdqu (%rsi), %ymm0
-	mov	31(%rsi), %rdx
-	vmovdqu %ymm0, (%rdi)
-	mov	%rdx, 31(%rdi)
-# ifdef USE_AS_STPCPY
-	lea	38(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$39, %r8
-	lea	39(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Exit40):
-	/* 0/32, 32/8 */
-	vmovdqu (%rsi), %ymm0
-	mov	32(%rsi), %rdx
-	vmovdqu %ymm0, (%rdi)
-	mov	%rdx, 32(%rdi)
-# ifdef USE_AS_STPCPY
-	lea	39(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$40, %r8
-	lea	40(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Exit41):
-	/* 0/32, 32/8, 40/1 */
-	vmovdqu (%rsi), %ymm0
-	vmovdqu 25(%rsi), %xmm1
-	vmovdqu %ymm0, (%rdi)
-	vmovdqu %xmm1, 25(%rdi)
-# ifdef USE_AS_STPCPY
-	lea	40(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$41, %r8
-	lea	41(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Exit42):
-	/* 0/32, 32/8, 40/2 */
-	vmovdqu (%rsi), %ymm0
-	vmovdqu 26(%rsi), %xmm1
-	vmovdqu %ymm0, (%rdi)
-	vmovdqu %xmm1, 26(%rdi)
-# ifdef USE_AS_STPCPY
-	lea	41(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$42, %r8
-	lea	42(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Exit43):
-	/* 0/32, 27/16 */
-	vmovdqu (%rsi), %ymm0
-	vmovdqu 27(%rsi), %xmm1
-	vmovdqu %ymm0, (%rdi)
-	vmovdqu %xmm1, 27(%rdi)
-# ifdef USE_AS_STPCPY
-	lea	42(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$43, %r8
-	lea	43(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Exit44):
-	/* 0/32, 28/16 */
-	vmovdqu (%rsi), %ymm0
-	vmovdqu 28(%rsi), %xmm1
-	vmovdqu %ymm0, (%rdi)
-	vmovdqu %xmm1, 28(%rdi)
-# ifdef USE_AS_STPCPY
-	lea	43(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$44, %r8
-	lea	44(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Exit45):
-	/* 0/32, 29/16 */
-	vmovdqu (%rsi), %ymm0
-	vmovdqu 29(%rsi), %xmm1
-	vmovdqu %ymm0, (%rdi)
-	vmovdqu %xmm1, 29(%rdi)
-# ifdef USE_AS_STPCPY
-	lea	44(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$45, %r8
-	lea	45(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Exit46):
-	/* 0/32, 30/16 */
-	vmovdqu (%rsi), %ymm0
-	vmovdqu 30(%rsi), %xmm1
-	vmovdqu %ymm0, (%rdi)
-	vmovdqu %xmm1, 30(%rdi)
-# ifdef USE_AS_STPCPY
-	lea	45(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$46, %r8
-	lea	46(%rdi), %rdi
-	jnz	L(StrncpyFillTailWithZero)
-# endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(Exit47):
+L(Exit33_63):
 	/* 0/32, 31/16 */
 	vmovdqu (%rsi), %ymm0
 	vmovdqu -31(%rsi, %rdx), %ymm1
@@ -3073,37 +2822,37 @@ L(ExitTable):
 	.int	JMPTBL(L(Exit30), L(ExitTable))
 	.int	JMPTBL(L(Exit31), L(ExitTable))
 	.int	JMPTBL(L(Exit32), L(ExitTable))
-	.int	JMPTBL(L(Exit33), L(ExitTable))
-	.int	JMPTBL(L(Exit34), L(ExitTable))
-	.int	JMPTBL(L(Exit35), L(ExitTable))
-	.int	JMPTBL(L(Exit36), L(ExitTable))
-	.int	JMPTBL(L(Exit37), L(ExitTable))
-	.int	JMPTBL(L(Exit38), L(ExitTable))
-	.int	JMPTBL(L(Exit39), L(ExitTable))
-	.int	JMPTBL(L(Exit40), L(ExitTable))
-	.int	JMPTBL(L(Exit41), L(ExitTable))
-	.int	JMPTBL(L(Exit42), L(ExitTable))
-	.int	JMPTBL(L(Exit43), L(ExitTable))
-	.int	JMPTBL(L(Exit44), L(ExitTable))
-	.int	JMPTBL(L(Exit45), L(ExitTable))
-	.int	JMPTBL(L(Exit46), L(ExitTable))
-	.int	JMPTBL(L(Exit47), L(ExitTable))
-	.int	JMPTBL(L(Exit48), L(ExitTable))
-	.int	JMPTBL(L(Exit49), L(ExitTable))
-	.int	JMPTBL(L(Exit50), L(ExitTable))
-	.int	JMPTBL(L(Exit51), L(ExitTable))
-	.int	JMPTBL(L(Exit52), L(ExitTable))
-	.int	JMPTBL(L(Exit53), L(ExitTable))
-	.int	JMPTBL(L(Exit54), L(ExitTable))
-	.int	JMPTBL(L(Exit55), L(ExitTable))
-	.int	JMPTBL(L(Exit56), L(ExitTable))
-	.int	JMPTBL(L(Exit57), L(ExitTable))
-	.int	JMPTBL(L(Exit58), L(ExitTable))
-	.int	JMPTBL(L(Exit59), L(ExitTable))
-	.int	JMPTBL(L(Exit60), L(ExitTable))
-	.int	JMPTBL(L(Exit61), L(ExitTable))
-	.int	JMPTBL(L(Exit62), L(ExitTable))
-	.int	JMPTBL(L(Exit63), L(ExitTable))
+	.int	JMPTBL(L(Exit33_63), L(ExitTable))
+	.int	JMPTBL(L(Exit33_63), L(ExitTable))
+	.int	JMPTBL(L(Exit33_63), L(ExitTable))
+	.int	JMPTBL(L(Exit33_63), L(ExitTable))
+	.int	JMPTBL(L(Exit33_63), L(ExitTable))
+	.int	JMPTBL(L(Exit33_63), L(ExitTable))
+	.int	JMPTBL(L(Exit33_63), L(ExitTable))
+	.int	JMPTBL(L(Exit33_63), L(ExitTable))
+	.int	JMPTBL(L(Exit33_63), L(ExitTable))
+	.int	JMPTBL(L(Exit33_63), L(ExitTable))
+	.int	JMPTBL(L(Exit33_63), L(ExitTable))
+	.int	JMPTBL(L(Exit33_63), L(ExitTable))
+	.int	JMPTBL(L(Exit33_63), L(ExitTable))
+	.int	JMPTBL(L(Exit33_63), L(ExitTable))
+	.int	JMPTBL(L(Exit33_63), L(ExitTable))
+	.int	JMPTBL(L(Exit33_63), L(ExitTable))
+	.int	JMPTBL(L(Exit33_63), L(ExitTable))
+	.int	JMPTBL(L(Exit33_63), L(ExitTable))
+	.int	JMPTBL(L(Exit33_63), L(ExitTable))
+	.int	JMPTBL(L(Exit33_63), L(ExitTable))
+	.int	JMPTBL(L(Exit33_63), L(ExitTable))
+	.int	JMPTBL(L(Exit33_63), L(ExitTable))
+	.int	JMPTBL(L(Exit33_63), L(ExitTable))
+	.int	JMPTBL(L(Exit33_63), L(ExitTable))
+	.int	JMPTBL(L(Exit33_63), L(ExitTable))
+	.int	JMPTBL(L(Exit33_63), L(ExitTable))
+	.int	JMPTBL(L(Exit33_63), L(ExitTable))
+	.int	JMPTBL(L(Exit33_63), L(ExitTable))
+	.int	JMPTBL(L(Exit33_63), L(ExitTable))
+	.int	JMPTBL(L(Exit33_63), L(ExitTable))
+	.int	JMPTBL(L(Exit33_63), L(ExitTable))
 	.int	JMPTBL(L(Exit64), L(ExitTable))
 # ifdef USE_AS_STRNCPY
 L(ExitStrncpyTable):

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=782590dfdd140ba21765a7ac563ebbc6d64958c2

commit 782590dfdd140ba21765a7ac563ebbc6d64958c2
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Fri Sep 21 08:56:27 2018 -0700

    Change L(Exit47) to handle Exit33 to Exit63
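
    The rewritten stub covers any length in 33-63 with two overlapping
    32-byte moves: one from the start of the string and one ending exactly
    at the terminating null, whose index is in %rdx. In outline (taken from
    the hunk below):

    	vmovdqu	(%rsi), %ymm0			# first 32 bytes
    	vmovdqu	-31(%rsi, %rdx), %ymm1		# last 32 bytes, ending at the null
    	vmovdqu	%ymm0, (%rdi)
    	vmovdqu	%ymm1, -31(%rdi, %rdx)		# may overlap the first store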

diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index e1b7431..b4f4738 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -1319,15 +1319,16 @@ L(Exit46):
 L(Exit47):
 	/* 0/32, 31/16 */
 	vmovdqu (%rsi), %ymm0
-	vmovdqu 31(%rsi), %xmm1
+	vmovdqu -31(%rsi, %rdx), %ymm1
 	vmovdqu %ymm0, (%rdi)
-	vmovdqu %xmm1, 31(%rdi)
+	vmovdqu %ymm1, -31(%rdi, %rdx)
 # ifdef USE_AS_STPCPY
-	lea	46(%rdi), %rax
+	lea	(%rdi, %rdx), %rax
 # endif
 # if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-	sub	$47, %r8
-	lea	47(%rdi), %rdi
+	sub	%rdx, %r8
+	sub	$1, %r8
+	lea	1(%rdi, %rdx), %rdi
 	jnz	L(StrncpyFillTailWithZero)
 # endif
 	VZEROUPPER

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=e10ee0652bc868d9b4f18de0fc392524b451bc88

commit e10ee0652bc868d9b4f18de0fc392524b451bc88
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Thu Sep 20 16:01:59 2018 -0700

    Use 32-bit registers for vpmovmskb YMM
    
    YMM registers are 32 bytes, so vpmovmskb on a YMM source yields a
    32-bit mask; 32-bit registers are sufficient.
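
    In 64-bit mode a write to a 32-bit register zero-extends into the full
    64-bit register, so the narrower form loses nothing and drops the REX.W
    prefix. A sketch of the pattern (as in the hunks below):

    	vpcmpeqb (%rsi), %ymmZ, %ymm1
    	vpmovmskb %ymm1, %edx		# 32-bit mask; bits 32..63 of %rdx cleared
    	test	%edx, %edx		# same outcome as testing %rdx
    	jnz	L(CopyVecSizeTail)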

diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 548e0e9..e1b7431 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -75,7 +75,7 @@ ENTRY (STRCPY)
 	vpxor	%xmm1, %xmm1, %xmm1
 
 	vpcmpeqb (%rsi), %ymmZ, %ymm1
-	vpmovmskb %ymm1, %rdx
+	vpmovmskb %ymm1, %edx
 	shr	%cl, %rdx
 
 # ifdef USE_AS_STRNCPY
@@ -90,18 +90,18 @@ ENTRY (STRCPY)
 #  endif
 	jbe	L(CopyVecSizeTailCase2OrCase3)
 # endif
-	test	%rdx, %rdx
+	test	%edx, %edx
 	jnz	L(CopyVecSizeTail)
 
 	vpcmpeqb VEC_SIZE(%rsi), %ymmZ, %ymm0
-	vpmovmskb %ymm0, %rdx
+	vpmovmskb %ymm0, %edx
 
 # ifdef USE_AS_STRNCPY
 	add	$VEC_SIZE, %r10
 	cmp	%r10, %r8
 	jbe	L(CopyTwoVecSizeCase2OrCase3)
 # endif
-	test	%rdx, %rdx
+	test	%edx, %edx
 	jnz	L(CopyTwoVecSize)
 
 	vmovdqu (%rsi, %rcx), %ymm1   /* copy VEC_SIZE bytes */
@@ -121,13 +121,13 @@ L(UnalignVecSizeBoth):
 	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
 	vmovdqu %ymm1, (%rdi, %rcx)
 	vpcmpeqb %ymm2, %ymmZ, %ymm0
-	vpmovmskb %ymm0, %rdx
+	vpmovmskb %ymm0, %edx
 	add	$VEC_SIZE, %rcx
 # ifdef USE_AS_STRNCPY
 	sub	$(VEC_SIZE * 3), %r8
 	jbe	L(CopyVecSizeCase2OrCase3)
 # endif
-	test	%rdx, %rdx
+	test	%edx, %edx
 # if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	jnz	L(CopyVecSizeUnalignedVec2)
 # else
@@ -137,13 +137,13 @@ L(UnalignVecSizeBoth):
 	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
 	vmovdqu %ymm2, (%rdi, %rcx)
 	vpcmpeqb %ymm3, %ymmZ, %ymm0
-	vpmovmskb %ymm0, %rdx
+	vpmovmskb %ymm0, %edx
 	add	$VEC_SIZE, %rcx
 # ifdef USE_AS_STRNCPY
 	sub	$VEC_SIZE, %r8
 	jbe	L(CopyVecSizeCase2OrCase3)
 # endif
-	test	%rdx, %rdx
+	test	%edx, %edx
 # if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	jnz	L(CopyVecSizeUnalignedVec3)
 # else
@@ -153,13 +153,13 @@ L(UnalignVecSizeBoth):
 	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm4
 	vmovdqu %ymm3, (%rdi, %rcx)
 	vpcmpeqb %ymm4, %ymmZ, %ymm0
-	vpmovmskb %ymm0, %rdx
+	vpmovmskb %ymm0, %edx
 	add	$VEC_SIZE, %rcx
 # ifdef USE_AS_STRNCPY
 	sub	$VEC_SIZE, %r8
 	jbe	L(CopyVecSizeCase2OrCase3)
 # endif
-	test	%rdx, %rdx
+	test	%edx, %edx
 # if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	jnz	L(CopyVecSizeUnalignedVec4)
 # else
@@ -169,13 +169,13 @@ L(UnalignVecSizeBoth):
 	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm1
 	vmovdqu %ymm4, (%rdi, %rcx)
 	vpcmpeqb %ymm1, %ymmZ, %ymm0
-	vpmovmskb %ymm0, %rdx
+	vpmovmskb %ymm0, %edx
 	add	$VEC_SIZE, %rcx
 # ifdef USE_AS_STRNCPY
 	sub	$VEC_SIZE, %r8
 	jbe	L(CopyVecSizeCase2OrCase3)
 # endif
-	test	%rdx, %rdx
+	test	%edx, %edx
 # if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	jnz	L(CopyVecSizeUnalignedVec1)
 # else
@@ -185,13 +185,13 @@ L(UnalignVecSizeBoth):
 	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
 	vmovdqu %ymm1, (%rdi, %rcx)
 	vpcmpeqb %ymm2, %ymmZ, %ymm0
-	vpmovmskb %ymm0, %rdx
+	vpmovmskb %ymm0, %edx
 	add	$VEC_SIZE, %rcx
 # ifdef USE_AS_STRNCPY
 	sub	$VEC_SIZE, %r8
 	jbe	L(CopyVecSizeCase2OrCase3)
 # endif
-	test	%rdx, %rdx
+	test	%edx, %edx
 # if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	jnz	L(CopyVecSizeUnalignedVec2)
 # else
@@ -201,13 +201,13 @@ L(UnalignVecSizeBoth):
 	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
 	vmovdqu %ymm2, (%rdi, %rcx)
 	vpcmpeqb %ymm3, %ymmZ, %ymm0
-	vpmovmskb %ymm0, %rdx
+	vpmovmskb %ymm0, %edx
 	add	$VEC_SIZE, %rcx
 # ifdef USE_AS_STRNCPY
 	sub	$VEC_SIZE, %r8
 	jbe	L(CopyVecSizeCase2OrCase3)
 # endif
-	test	%rdx, %rdx
+	test	%edx, %edx
 # if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	jnz	L(CopyVecSizeUnalignedVec3)
 # else
@@ -232,12 +232,12 @@ L(UnalignedFourVecSizeLoop):
 	vpminub %ymm7, %ymm6, %ymm3
 	vpminub %ymm2, %ymm3, %ymm3
 	vpcmpeqb %ymm0, %ymm3, %ymm3
-	vpmovmskb %ymm3, %rdx
+	vpmovmskb %ymm3, %edx
 # ifdef USE_AS_STRNCPY
 	sub	$(VEC_SIZE * 4), %r8
 	jbe	L(UnalignedLeaveCase2OrCase3)
 # endif
-	test	%rdx, %rdx
+	test	%edx, %edx
 	jnz	L(UnalignedFourVecSizeLeave)
 
 L(UnalignedFourVecSizeLoop_start):
@@ -255,33 +255,33 @@ L(UnalignedFourVecSizeLoop_start):
 	vpminub %ymm7, %ymm6, %ymm3
 	vpminub %ymm2, %ymm3, %ymm3
 	vpcmpeqb %ymm0, %ymm3, %ymm3
-	vpmovmskb %ymm3, %rdx
+	vpmovmskb %ymm3, %edx
 # ifdef USE_AS_STRNCPY
 	sub	$(VEC_SIZE * 4), %r8
 	jbe	L(UnalignedLeaveCase2OrCase3)
 # endif
-	test	%rdx, %rdx
+	test	%edx, %edx
 	jz	L(UnalignedFourVecSizeLoop_start)
 
 L(UnalignedFourVecSizeLeave):
 	vpcmpeqb %ymm4, %ymm0, %ymm0
-	vpmovmskb %ymm0, %rdx
-	test	%rdx, %rdx
+	vpmovmskb %ymm0, %edx
+	test	%edx, %edx
 	jnz	L(CopyVecSizeUnaligned_0)
 
 	vpcmpeqb %ymm5, %ymmZ, %ymm1
-	vpmovmskb %ymm1, %rcx
-	test	%rcx, %rcx
+	vpmovmskb %ymm1, %ecx
+	test	%ecx, %ecx
 	jnz	L(CopyVecSizeUnaligned_16)
 
 	vpcmpeqb %ymm6, %ymm0, %ymm0
-	vpmovmskb %ymm0, %rdx
-	test	%rdx, %rdx
+	vpmovmskb %ymm0, %edx
+	test	%edx, %edx
 	jnz	L(CopyVecSizeUnaligned_32)
 
 	vpcmpeqb %ymm7, %ymmZ, %ymm1
-	vpmovmskb %ymm1, %rcx
-	bsf	%rcx, %rdx
+	vpmovmskb %ymm1, %ecx
+	bsf	%ecx, %edx
 	vmovdqu %ymm4, (%rdi)
 	vmovdqu %ymm5, VEC_SIZE(%rdi)
 	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
@@ -306,7 +306,7 @@ L(SourceStringAlignmentLessTwoVecSize):
 	vmovdqu (%rsi), %ymm1
 	vmovdqu VEC_SIZE(%rsi), %ymm2
 	vpcmpeqb %ymm1, %ymmZ, %ymm0
-	vpmovmskb %ymm0, %rdx
+	vpmovmskb %ymm0, %edx
 
 # ifdef USE_AS_STRNCPY
 #  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
@@ -316,12 +316,12 @@ L(SourceStringAlignmentLessTwoVecSize):
 #  endif
 	jbe	L(CopyVecSizeTail1Case2OrCase3)
 # endif
-	test	%rdx, %rdx
+	test	%edx, %edx
 	jnz	L(CopyVecSizeTail1)
 
 	vpcmpeqb %ymm2, %ymmZ, %ymm0
 	vmovdqu %ymm1, (%rdi)
-	vpmovmskb %ymm0, %rdx
+	vpmovmskb %ymm0, %edx
 
 # ifdef USE_AS_STRNCPY
 #  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
@@ -331,7 +331,7 @@ L(SourceStringAlignmentLessTwoVecSize):
 #  endif
 	jbe	L(CopyTwoVecSize1Case2OrCase3)
 # endif
-	test	%rdx, %rdx
+	test	%edx, %edx
 	jnz	L(CopyTwoVecSize1)
 
 	and	$-VEC_SIZE, %rsi
@@ -347,13 +347,13 @@ L(SourceStringAlignmentLessTwoVecSize):
 L(CopyVecSize):
 	add	%rcx, %rdi
 	add	%rcx, %rsi
-	bsf	%rdx, %rdx
+	bsf	%edx, %edx
 	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
 # endif
 	.p2align 4
 L(CopyVecSizeTail):
 	add	%rcx, %rsi
-	bsf	%rdx, %rdx
+	bsf	%edx, %edx
 	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
 
 	.p2align 4
@@ -364,20 +364,20 @@ L(CopyTwoVecSize1):
 	sub	$VEC_SIZE, %r8
 # endif
 L(CopyVecSizeTail1):
-	bsf	%rdx, %rdx
+	bsf	%edx, %edx
 	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
 
 	.p2align 4
 L(CopyTwoVecSize):
-	bsf	%rdx, %rdx
+	bsf	%edx, %edx
 	add	%rcx, %rsi
-	add	$VEC_SIZE, %rdx
-	sub	%rcx, %rdx
+	add	$VEC_SIZE, %edx
+	sub	%ecx, %edx
 	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
 
 	.p2align 4
 L(CopyVecSizeUnaligned_0):
-	bsf	%rdx, %rdx
+	bsf	%edx, %edx
 # if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 # ifdef USE_AS_STPCPY
 	lea	(%rdi, %rdx), %rax
@@ -393,7 +393,7 @@ L(CopyVecSizeUnaligned_0):
 
 	.p2align 4
 L(CopyVecSizeUnaligned_16):
-	bsf	%rcx, %rdx
+	bsf	%ecx, %edx
 	vmovdqu %ymm4, (%rdi)
 # if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 # ifdef USE_AS_STPCPY
@@ -412,7 +412,7 @@ L(CopyVecSizeUnaligned_16):
 
 	.p2align 4
 L(CopyVecSizeUnaligned_32):
-	bsf	%rdx, %rdx
+	bsf	%edx, %edx
 	vmovdqu %ymm4, (%rdi)
 	vmovdqu %ymm5, VEC_SIZE(%rdi)
 # if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
@@ -469,31 +469,31 @@ L(CopyVecSizeCase2):
 	add	$VEC_SIZE, %r8
 	add	%rcx, %rdi
 	add	%rcx, %rsi
-	bsf	%rdx, %rdx
-	cmp	%r8, %rdx
+	bsf	%edx, %edx
+	cmp	%r8d, %edx
 	jb	L(CopyVecSizeExit)
 	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
 
 	.p2align 4
 L(CopyTwoVecSizeCase2):
 	add	%rcx, %rsi
-	bsf	%rdx, %rdx
-	add	$VEC_SIZE, %rdx
-	sub	%rcx, %rdx
-	cmp	%r8, %rdx
+	bsf	%edx, %edx
+	add	$VEC_SIZE, %edx
+	sub	%ecx, %edx
+	cmp	%r8d, %edx
 	jb	L(CopyVecSizeExit)
 	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
 
 L(CopyVecSizeTailCase2):
 	add	%rcx, %rsi
-	bsf	%rdx, %rdx
-	cmp	%r8, %rdx
+	bsf	%edx, %edx
+	cmp	%r8d, %edx
 	jb	L(CopyVecSizeExit)
 	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
 
 L(CopyVecSizeTail1Case2):
-	bsf	%rdx, %rdx
-	cmp	%r8, %rdx
+	bsf	%edx, %edx
+	cmp	%r8d, %edx
 	jb	L(CopyVecSizeExit)
 	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
 
@@ -2885,7 +2885,7 @@ L(CopyVecSizeUnalignedVec2):
 
 	.p2align 4
 L(CopyVecSizeVecExit):
-	bsf	%rdx, %rdx
+	bsf	%edx, %edx
 	add	$(VEC_SIZE - 1), %r8
 	add	%rcx, %rdi
 #   ifdef USE_AS_STPCPY
@@ -2977,22 +2977,22 @@ L(UnalignedFourVecSizeLeaveCase3):
 L(UnalignedFourVecSizeLeaveCase2):
 	xor	%ecx, %ecx
 	vpcmpeqb %ymm4, %ymm0, %ymm0
-	vpmovmskb %ymm0, %rdx
+	vpmovmskb %ymm0, %edx
 	add	$(VEC_SIZE * 3), %r8
 	jle	L(CopyVecSizeCase2OrCase3)
-	test	%rdx, %rdx
+	test	%edx, %edx
 #  ifndef USE_AS_STRCAT
 	jnz	L(CopyVecSizeUnalignedVec4)
 #  else
 	jnz	L(CopyVecSize)
 #  endif
 	vpcmpeqb %ymm5, %ymm0, %ymm0
-	vpmovmskb %ymm0, %rdx
+	vpmovmskb %ymm0, %edx
 	vmovdqu %ymm4, (%rdi)
 	add	$VEC_SIZE, %rcx
 	sub	$VEC_SIZE, %r8
 	jbe	L(CopyVecSizeCase2OrCase3)
-	test	%rdx, %rdx
+	test	%edx, %edx
 #  ifndef USE_AS_STRCAT
 	jnz	L(CopyVecSizeUnalignedVec5)
 #  else
@@ -3000,12 +3000,12 @@ L(UnalignedFourVecSizeLeaveCase2):
 #  endif
 
 	vpcmpeqb %ymm6, %ymm0, %ymm0
-	vpmovmskb %ymm0, %rdx
+	vpmovmskb %ymm0, %edx
 	vmovdqu %ymm5, VEC_SIZE(%rdi)
 	add	$VEC_SIZE, %rcx
 	sub	$VEC_SIZE, %r8
 	jbe	L(CopyVecSizeCase2OrCase3)
-	test	%rdx, %rdx
+	test	%edx, %edx
 #  ifndef USE_AS_STRCAT
 	jnz	L(CopyVecSizeUnalignedVec6)
 #  else
@@ -3013,12 +3013,12 @@ L(UnalignedFourVecSizeLeaveCase2):
 #  endif
 
 	vpcmpeqb %ymm7, %ymm0, %ymm0
-	vpmovmskb %ymm0, %rdx
+	vpmovmskb %ymm0, %edx
 	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
 	lea	VEC_SIZE(%rdi, %rcx), %rdi
 	lea	VEC_SIZE(%rsi, %rcx), %rsi
-	bsf	%rdx, %rdx
-	cmp	%r8, %rdx
+	bsf	%edx, %edx
+	cmp	%r8d, %edx
 	jb	L(CopyVecSizeExit)
 	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
 

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=0706e178920c6cea7f23bb415e73365fda4a9f57

commit 0706e178920c6cea7f23bb415e73365fda4a9f57
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Tue Sep 18 12:21:18 2018 -0700

    Use 32-bit AND when upper 32 bits are zero
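
    The same zero-extension rule as for vpmovmskb: when the upper 32 bits of
    a register are already zero, the 32-bit AND yields identical register
    contents with a shorter encoding (no REX.W prefix). For instance (first
    hunk below):

    	and	$((VEC_SIZE * 4) - 1), %ecx	# equivalent to the %rcx form here,
    						# since bits 32..63 of %rcx are zero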

diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index ea39094..548e0e9 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -65,12 +65,12 @@ ENTRY (STRCPY)
 
 	vpxor	%xmmZ, %xmmZ, %xmmZ
 
-	and	$((VEC_SIZE * 4) - 1), %rcx
-	cmp	$(VEC_SIZE * 2), %rcx
+	and	$((VEC_SIZE * 4) - 1), %ecx
+	cmp	$(VEC_SIZE * 2), %ecx
 	jbe	L(SourceStringAlignmentLessTwoVecSize)
 
 	and	$-VEC_SIZE, %rsi
-	and	$(VEC_SIZE - 1), %rcx
+	and	$(VEC_SIZE - 1), %ecx
 	vpxor	%xmm0, %xmm0, %xmm0
 	vpxor	%xmm1, %xmm1, %xmm1
 
@@ -335,7 +335,7 @@ L(SourceStringAlignmentLessTwoVecSize):
 	jnz	L(CopyTwoVecSize1)
 
 	and	$-VEC_SIZE, %rsi
-	and	$(VEC_SIZE - 1), %rcx
+	and	$(VEC_SIZE - 1), %ecx
 	jmp	L(UnalignVecSizeBoth)
 
 /*------End of main part with loops---------------------*/
@@ -2904,7 +2904,7 @@ L(StrncpyFillTailWithZero):
 	add	$VEC_SIZE, %rdi
 
 	mov	%rdi, %rsi
-	and	$(VEC_SIZE - 1), %rsi
+	and	$(VEC_SIZE - 1), %esi
 	sub	%rsi, %rdi
 	add	%rsi, %r8
 	sub	$(VEC_SIZE * 4), %r8

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=307f7ebcdefbb32f4d2451c3c052448272b13f26

commit 307f7ebcdefbb32f4d2451c3c052448272b13f26
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Tue Sep 18 12:18:22 2018 -0700

    Replace xor %ch, %ch/movb %ch, (%rdi) with movb $0, (%rdi)
    
    Please compare performance before and after.
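
    The change trades a two-instruction sequence with a partial-register
    write for a single byte store of an immediate; no register value is
    needed at all. Side by side (illustrative):

    	# before: zero %ch, then store it
    	xor	%ch, %ch
    	movb	%ch, (%rdi)

    	# after: store the zero byte directly
    	movb	$0, (%rdi)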

diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 222826d..ea39094 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -1647,8 +1647,7 @@ L(StrncpyExit0):
 	mov	%rdi, %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, (%rdi)
+	movb	$0, (%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -1661,8 +1660,7 @@ L(StrncpyExit1):
 	lea	1(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 1(%rdi)
+	movb	$0, 1(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -1675,8 +1673,7 @@ L(StrncpyExit2):
 	lea	2(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 2(%rdi)
+	movb	$0, 2(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -1691,8 +1688,7 @@ L(StrncpyExit3):
 	lea	3(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 3(%rdi)
+	movb	$0, 3(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -1705,8 +1701,7 @@ L(StrncpyExit4):
 	lea	4(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 4(%rdi)
+	movb	$0, 4(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -1721,8 +1716,7 @@ L(StrncpyExit5):
 	lea	5(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 5(%rdi)
+	movb	$0, 5(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -1737,8 +1731,7 @@ L(StrncpyExit6):
 	lea	6(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 6(%rdi)
+	movb	$0, 6(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -1753,8 +1746,7 @@ L(StrncpyExit7):
 	lea	7(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 7(%rdi)
+	movb	$0, 7(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -1767,8 +1759,7 @@ L(StrncpyExit8):
 	lea	8(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 8(%rdi)
+	movb	$0, 8(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -1783,8 +1774,7 @@ L(StrncpyExit9):
 	lea	9(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 9(%rdi)
+	movb	$0, 9(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -1799,8 +1789,7 @@ L(StrncpyExit10):
 	lea	10(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 10(%rdi)
+	movb	$0, 10(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -1815,8 +1804,7 @@ L(StrncpyExit11):
 	lea	11(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 11(%rdi)
+	movb	$0, 11(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -1831,8 +1819,7 @@ L(StrncpyExit12):
 	lea	12(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 12(%rdi)
+	movb	$0, 12(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -1847,8 +1834,7 @@ L(StrncpyExit13):
 	lea	13(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 13(%rdi)
+	movb	$0, 13(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -1863,8 +1849,7 @@ L(StrncpyExit14):
 	lea	14(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 14(%rdi)
+	movb	$0, 14(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -1879,8 +1864,7 @@ L(StrncpyExit15):
 	lea	15(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 15(%rdi)
+	movb	$0, 15(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -1893,8 +1877,7 @@ L(StrncpyExit16):
 	lea	16(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 16(%rdi)
+	movb	$0, 16(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -1909,8 +1892,7 @@ L(StrncpyExit17):
 	lea	17(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 17(%rdi)
+	movb	$0, 17(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -1925,8 +1907,7 @@ L(StrncpyExit18):
 	lea	18(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 18(%rdi)
+	movb	$0, 18(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -1941,8 +1922,7 @@ L(StrncpyExit19):
 	lea	19(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 19(%rdi)
+	movb	$0, 19(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -1957,8 +1937,7 @@ L(StrncpyExit20):
 	lea	20(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 20(%rdi)
+	movb	$0, 20(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -1973,8 +1952,7 @@ L(StrncpyExit21):
 	lea	21(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 21(%rdi)
+	movb	$0, 21(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -1989,8 +1967,7 @@ L(StrncpyExit22):
 	lea	22(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 22(%rdi)
+	movb	$0, 22(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -2005,8 +1982,7 @@ L(StrncpyExit23):
 	lea	23(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 23(%rdi)
+	movb	$0, 23(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -2021,8 +1997,7 @@ L(StrncpyExit24):
 	lea	24(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 24(%rdi)
+	movb	$0, 24(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -2037,8 +2012,7 @@ L(StrncpyExit25):
 	lea	25(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 25(%rdi)
+	movb	$0, 25(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -2053,8 +2027,7 @@ L(StrncpyExit26):
 	lea	26(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 26(%rdi)
+	movb	$0, 26(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -2069,8 +2042,7 @@ L(StrncpyExit27):
 	lea	27(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 27(%rdi)
+	movb	$0, 27(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -2085,8 +2057,7 @@ L(StrncpyExit28):
 	lea	28(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 28(%rdi)
+	movb	$0, 28(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -2101,8 +2072,7 @@ L(StrncpyExit29):
 	lea	29(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 29(%rdi)
+	movb	$0, 29(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -2117,8 +2087,7 @@ L(StrncpyExit30):
 	lea	30(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 30(%rdi)
+	movb	$0, 30(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -2133,8 +2102,7 @@ L(StrncpyExit31):
 	lea	31(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 31(%rdi)
+	movb	$0, 31(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -2147,8 +2115,7 @@ L(StrncpyExit32):
 	lea	32(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 32(%rdi)
+	movb	$0, 32(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -2163,8 +2130,7 @@ L(StrncpyExit33):
 	lea	33(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 33(%rdi)
+	movb	$0, 33(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -2180,8 +2146,7 @@ L(StrncpyExit34):
 	lea	34(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 34(%rdi)
+	movb	$0, 34(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -2197,8 +2162,7 @@ L(StrncpyExit35):
 	lea	35(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 35(%rdi)
+	movb	$0, 35(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -2214,8 +2178,7 @@ L(StrncpyExit36):
 	lea	36(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 36(%rdi)
+	movb	$0, 36(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -2231,8 +2194,7 @@ L(StrncpyExit37):
 	lea	37(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 37(%rdi)
+	movb	$0, 37(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -2248,8 +2210,7 @@ L(StrncpyExit38):
 	lea	38(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 38(%rdi)
+	movb	$0, 38(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -2265,8 +2226,7 @@ L(StrncpyExit39):
 	lea	39(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 39(%rdi)
+	movb	$0, 39(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -2282,8 +2242,7 @@ L(StrncpyExit40):
 	lea	40(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 40(%rdi)
+	movb	$0, 40(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -2299,8 +2258,7 @@ L(StrncpyExit41):
 	lea	41(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 41(%rdi)
+	movb	$0, 41(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -2316,8 +2274,7 @@ L(StrncpyExit42):
 	lea	42(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 42(%rdi)
+	movb	$0, 42(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -2333,8 +2290,7 @@ L(StrncpyExit43):
 	lea	43(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 43(%rdi)
+	movb	$0, 43(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -2350,8 +2306,7 @@ L(StrncpyExit44):
 	lea	44(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 44(%rdi)
+	movb	$0, 44(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -2367,8 +2322,7 @@ L(StrncpyExit45):
 	lea	45(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 45(%rdi)
+	movb	$0, 45(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -2384,8 +2338,7 @@ L(StrncpyExit46):
 	lea	46(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 46(%rdi)
+	movb	$0, 46(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -2401,8 +2354,7 @@ L(StrncpyExit47):
 	lea	47(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 47(%rdi)
+	movb	$0, 47(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -2418,8 +2370,7 @@ L(StrncpyExit48):
 	lea	48(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 48(%rdi)
+	movb	$0, 48(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -2435,8 +2386,7 @@ L(StrncpyExit49):
 	lea	49(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 49(%rdi)
+	movb	$0, 49(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -2452,8 +2402,7 @@ L(StrncpyExit50):
 	lea	50(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 50(%rdi)
+	movb	$0, 50(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -2469,8 +2418,7 @@ L(StrncpyExit51):
 	lea	51(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 51(%rdi)
+	movb	$0, 51(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -2486,8 +2434,7 @@ L(StrncpyExit52):
 	lea	52(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 52(%rdi)
+	movb	$0, 52(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -2503,8 +2450,7 @@ L(StrncpyExit53):
 	lea	53(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 53(%rdi)
+	movb	$0, 53(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -2520,8 +2466,7 @@ L(StrncpyExit54):
 	lea	54(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 54(%rdi)
+	movb	$0, 54(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -2537,8 +2482,7 @@ L(StrncpyExit55):
 	lea	55(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 55(%rdi)
+	movb	$0, 55(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -2554,8 +2498,7 @@ L(StrncpyExit56):
 	lea	56(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 56(%rdi)
+	movb	$0, 56(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -2571,8 +2514,7 @@ L(StrncpyExit57):
 	lea	57(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 57(%rdi)
+	movb	$0, 57(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -2588,8 +2530,7 @@ L(StrncpyExit58):
 	lea	58(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 58(%rdi)
+	movb	$0, 58(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -2605,8 +2546,7 @@ L(StrncpyExit59):
 	lea	59(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 59(%rdi)
+	movb	$0, 59(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -2623,8 +2563,7 @@ L(StrncpyExit60):
 	lea	60(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 60(%rdi)
+	movb	$0, 60(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -2640,8 +2579,7 @@ L(StrncpyExit61):
 	lea	61(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 61(%rdi)
+	movb	$0, 61(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -2657,8 +2595,7 @@ L(StrncpyExit62):
 	lea	62(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 62(%rdi)
+	movb	$0, 62(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -2674,8 +2611,7 @@ L(StrncpyExit63):
 	lea	63(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 63(%rdi)
+	movb	$0, 63(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -2691,8 +2627,7 @@ L(StrncpyExit64):
 	lea	64(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 64(%rdi)
+	movb	$0, 64(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -2710,8 +2645,7 @@ L(StrncpyExit65):
 	lea	65(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, 65(%rdi)
+	movb	$0, 65(%rdi)
 #  endif
 	VZEROUPPER
 	ret
@@ -3034,8 +2968,7 @@ L(UnalignedFourVecSizeLeaveCase3):
 	lea	(VEC_SIZE * 4)(%rdi), %rax
 #  endif
 #  ifdef USE_AS_STRCAT
-	xor	%ch, %ch
-	movb	%ch, (VEC_SIZE * 4)(%rdi)
+	movb	$0, (VEC_SIZE * 4)(%rdi)
 #  endif
 	VZEROUPPER
 	ret

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=a551f0943fcc0deac13942d1bd1f3f3d40a1235a

commit a551f0943fcc0deac13942d1bd1f3f3d40a1235a
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Tue Sep 18 11:40:31 2018 -0700

    Introduce xmmZ/ymmZ: a fixed all-zero vector register
    
    Keep only one
    
    vpxor %xmmZ, %xmmZ, %xmmZ
    
    and remove the other vpxor instructions:
    
    	vpxor	%xmm0, %xmm0, %xmm0
    	vpxor	%xmm1, %xmm1, %xmm1
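
    The resulting pattern, condensed from the hunks below: reserve one vector
    register as a permanent all-zero value, zero it once at entry, and let
    every null-byte compare and zero fill read it instead of re-zeroing
    scratch registers.

    #define xmmZ	xmm8
    #define ymmZ	ymm8
    	...
    	vpxor	%xmmZ, %xmmZ, %xmmZ		# zeroed once at entry
    	...
    	vpcmpeqb (%rsi), %ymmZ, %ymm1		# compare against the fixed zero
    	vmovdqu	%ymmZ, (%rdi)			# and reuse it for zero fills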

diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 22bd063..222826d 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -44,6 +44,9 @@
 #  define VZEROUPPER	vzeroupper
 # endif
 
+#define xmmZ	xmm8
+#define ymmZ	ymm8
+
 # ifndef USE_AS_STRCAT
 
 .text
@@ -60,6 +63,8 @@ ENTRY (STRCPY)
 
 # endif
 
+	vpxor	%xmmZ, %xmmZ, %xmmZ
+
 	and	$((VEC_SIZE * 4) - 1), %rcx
 	cmp	$(VEC_SIZE * 2), %rcx
 	jbe	L(SourceStringAlignmentLessTwoVecSize)
@@ -69,7 +74,7 @@ ENTRY (STRCPY)
 	vpxor	%xmm0, %xmm0, %xmm0
 	vpxor	%xmm1, %xmm1, %xmm1
 
-	vpcmpeqb (%rsi), %ymm1, %ymm1
+	vpcmpeqb (%rsi), %ymmZ, %ymm1
 	vpmovmskb %ymm1, %rdx
 	shr	%cl, %rdx
 
@@ -88,7 +93,7 @@ ENTRY (STRCPY)
 	test	%rdx, %rdx
 	jnz	L(CopyVecSizeTail)
 
-	vpcmpeqb VEC_SIZE(%rsi), %ymm0, %ymm0
+	vpcmpeqb VEC_SIZE(%rsi), %ymmZ, %ymm0
 	vpmovmskb %ymm0, %rdx
 
 # ifdef USE_AS_STRNCPY
@@ -115,7 +120,7 @@ L(UnalignVecSizeBoth):
 	vmovdqa (%rsi, %rcx), %ymm1
 	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
 	vmovdqu %ymm1, (%rdi, %rcx)
-	vpcmpeqb %ymm2, %ymm0, %ymm0
+	vpcmpeqb %ymm2, %ymmZ, %ymm0
 	vpmovmskb %ymm0, %rdx
 	add	$VEC_SIZE, %rcx
 # ifdef USE_AS_STRNCPY
@@ -131,7 +136,7 @@ L(UnalignVecSizeBoth):
 
 	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
 	vmovdqu %ymm2, (%rdi, %rcx)
-	vpcmpeqb %ymm3, %ymm0, %ymm0
+	vpcmpeqb %ymm3, %ymmZ, %ymm0
 	vpmovmskb %ymm0, %rdx
 	add	$VEC_SIZE, %rcx
 # ifdef USE_AS_STRNCPY
@@ -147,7 +152,7 @@ L(UnalignVecSizeBoth):
 
 	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm4
 	vmovdqu %ymm3, (%rdi, %rcx)
-	vpcmpeqb %ymm4, %ymm0, %ymm0
+	vpcmpeqb %ymm4, %ymmZ, %ymm0
 	vpmovmskb %ymm0, %rdx
 	add	$VEC_SIZE, %rcx
 # ifdef USE_AS_STRNCPY
@@ -163,7 +168,7 @@ L(UnalignVecSizeBoth):
 
 	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm1
 	vmovdqu %ymm4, (%rdi, %rcx)
-	vpcmpeqb %ymm1, %ymm0, %ymm0
+	vpcmpeqb %ymm1, %ymmZ, %ymm0
 	vpmovmskb %ymm0, %rdx
 	add	$VEC_SIZE, %rcx
 # ifdef USE_AS_STRNCPY
@@ -179,7 +184,7 @@ L(UnalignVecSizeBoth):
 
 	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
 	vmovdqu %ymm1, (%rdi, %rcx)
-	vpcmpeqb %ymm2, %ymm0, %ymm0
+	vpcmpeqb %ymm2, %ymmZ, %ymm0
 	vpmovmskb %ymm0, %rdx
 	add	$VEC_SIZE, %rcx
 # ifdef USE_AS_STRNCPY
@@ -195,7 +200,7 @@ L(UnalignVecSizeBoth):
 
 	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
 	vmovdqu %ymm2, (%rdi, %rcx)
-	vpcmpeqb %ymm3, %ymm0, %ymm0
+	vpcmpeqb %ymm3, %ymmZ, %ymm0
 	vpmovmskb %ymm0, %rdx
 	add	$VEC_SIZE, %rcx
 # ifdef USE_AS_STRNCPY
@@ -259,14 +264,12 @@ L(UnalignedFourVecSizeLoop_start):
 	jz	L(UnalignedFourVecSizeLoop_start)
 
 L(UnalignedFourVecSizeLeave):
-	vpxor	%xmm1, %xmm1, %xmm1
-
 	vpcmpeqb %ymm4, %ymm0, %ymm0
 	vpmovmskb %ymm0, %rdx
 	test	%rdx, %rdx
 	jnz	L(CopyVecSizeUnaligned_0)
 
-	vpcmpeqb %ymm5, %ymm1, %ymm1
+	vpcmpeqb %ymm5, %ymmZ, %ymm1
 	vpmovmskb %ymm1, %rcx
 	test	%rcx, %rcx
 	jnz	L(CopyVecSizeUnaligned_16)
@@ -276,7 +279,7 @@ L(UnalignedFourVecSizeLeave):
 	test	%rdx, %rdx
 	jnz	L(CopyVecSizeUnaligned_32)
 
-	vpcmpeqb %ymm7, %ymm1, %ymm1
+	vpcmpeqb %ymm7, %ymmZ, %ymm1
 	vpmovmskb %ymm1, %rcx
 	bsf	%rcx, %rdx
 	vmovdqu %ymm4, (%rdi)
@@ -300,10 +303,9 @@ L(UnalignedFourVecSizeLeave):
 /* If source address alignment == destination address alignment */
 
 L(SourceStringAlignmentLessTwoVecSize):
-	vpxor	%xmm0, %xmm0, %xmm0
 	vmovdqu (%rsi), %ymm1
 	vmovdqu VEC_SIZE(%rsi), %ymm2
-	vpcmpeqb %ymm1, %ymm0, %ymm0
+	vpcmpeqb %ymm1, %ymmZ, %ymm0
 	vpmovmskb %ymm0, %rdx
 
 # ifdef USE_AS_STRNCPY
@@ -317,7 +319,7 @@ L(SourceStringAlignmentLessTwoVecSize):
 	test	%rdx, %rdx
 	jnz	L(CopyVecSizeTail1)
 
-	vpcmpeqb %ymm2, %ymm0, %ymm0
+	vpcmpeqb %ymm2, %ymmZ, %ymm0
 	vmovdqu %ymm1, (%rdi)
 	vpmovmskb %ymm0, %rdx
 
@@ -2815,47 +2817,47 @@ L(Fill14):
 
 	.p2align 4
 L(Fill15):
-	vmovdqu %xmm0, -1(%rdi)
+	vmovdqu %xmmZ, -1(%rdi)
 	VZEROUPPER
 	ret
 
 	.p2align 4
 L(Fill16):
-	vmovdqu %xmm0, (%rdi)
+	vmovdqu %xmmZ, (%rdi)
 	VZEROUPPER
 	ret
 
 	.p2align 4
 L(Fill17):
-	vmovdqu %xmm0, (%rdi)
+	vmovdqu %xmmZ, (%rdi)
 	mov	%dl, 16(%rdi)
 	VZEROUPPER
 	ret
 
 	.p2align 4
 L(Fill18):
-	vmovdqu %xmm0, (%rdi)
+	vmovdqu %xmmZ, (%rdi)
 	mov	%dx, 16(%rdi)
 	VZEROUPPER
 	ret
 
 	.p2align 4
 L(Fill19):
-	vmovdqu %xmm0, (%rdi)
+	vmovdqu %xmmZ, (%rdi)
 	mov	%edx, 15(%rdi)
 	VZEROUPPER
 	ret
 
 	.p2align 4
 L(Fill20):
-	vmovdqu %xmm0, (%rdi)
+	vmovdqu %xmmZ, (%rdi)
 	mov	%edx, 16(%rdi)
 	VZEROUPPER
 	ret
 
 	.p2align 4
 L(Fill21):
-	vmovdqu %xmm0, (%rdi)
+	vmovdqu %xmmZ, (%rdi)
 	mov	%edx, 16(%rdi)
 	mov	%dl, 20(%rdi)
 	VZEROUPPER
@@ -2863,7 +2865,7 @@ L(Fill21):
 
 	.p2align 4
 L(Fill22):
-	vmovdqu %xmm0, (%rdi)
+	vmovdqu %xmmZ, (%rdi)
 	mov	%edx, 16(%rdi)
 	mov	%dx, 20(%rdi)
 	VZEROUPPER
@@ -2871,21 +2873,21 @@ L(Fill22):
 
 	.p2align 4
 L(Fill23):
-	vmovdqu %xmm0, (%rdi)
+	vmovdqu %xmmZ, (%rdi)
 	mov	%rdx, 15(%rdi)
 	VZEROUPPER
 	ret
 
 	.p2align 4
 L(Fill24):
-	vmovdqu %xmm0, (%rdi)
+	vmovdqu %xmmZ, (%rdi)
 	mov	%rdx, 16(%rdi)
 	VZEROUPPER
 	ret
 
 	.p2align 4
 L(Fill25):
-	vmovdqu %xmm0, (%rdi)
+	vmovdqu %xmmZ, (%rdi)
 	mov	%rdx, 16(%rdi)
 	mov	%dl, 24(%rdi)
 	VZEROUPPER
@@ -2893,7 +2895,7 @@ L(Fill25):
 
 	.p2align 4
 L(Fill26):
-	vmovdqu %xmm0, (%rdi)
+	vmovdqu %xmmZ, (%rdi)
 	mov	%rdx, 16(%rdi)
 	mov	%dx, 24(%rdi)
 	VZEROUPPER
@@ -2901,7 +2903,7 @@ L(Fill26):
 
 	.p2align 4
 L(Fill27):
-	vmovdqu %xmm0, (%rdi)
+	vmovdqu %xmmZ, (%rdi)
 	mov	%rdx, 16(%rdi)
 	mov	%edx, 23(%rdi)
 	VZEROUPPER
@@ -2909,7 +2911,7 @@ L(Fill27):
 
 	.p2align 4
 L(Fill28):
-	vmovdqu %xmm0, (%rdi)
+	vmovdqu %xmmZ, (%rdi)
 	mov	%rdx, 16(%rdi)
 	mov	%edx, 24(%rdi)
 	VZEROUPPER
@@ -2917,7 +2919,7 @@ L(Fill28):
 
 	.p2align 4
 L(Fill29):
-	vmovdqu %xmm0, (%rdi)
+	vmovdqu %xmmZ, (%rdi)
 	mov	%rdx, 16(%rdi)
 	mov	%rdx, 21(%rdi)
 	VZEROUPPER
@@ -2925,7 +2927,7 @@ L(Fill29):
 
 	.p2align 4
 L(Fill30):
-	vmovdqu %xmm0, (%rdi)
+	vmovdqu %xmmZ, (%rdi)
 	mov	%rdx, 16(%rdi)
 	mov	%rdx, 22(%rdi)
 	VZEROUPPER
@@ -2933,13 +2935,13 @@ L(Fill30):
 
 	.p2align 4
 L(Fill31):
-	vmovdqu %ymm0, -1(%rdi)
+	vmovdqu %ymmZ, -1(%rdi)
 	VZEROUPPER
 	ret
 
 	.p2align 4
 L(Fill32):
-	vmovdqu %ymm0, (%rdi)
+	vmovdqu %ymmZ, (%rdi)
 	VZEROUPPER
 	ret
 
@@ -2960,12 +2962,11 @@ L(CopyVecSizeVecExit):
 
 	.p2align 4
 L(StrncpyFillTailWithZero):
-	vpxor	%xmm0, %xmm0, %xmm0
-	xor	%rdx, %rdx
+	xor	%edx, %edx
 	sub	$VEC_SIZE, %r8
 	jbe	L(StrncpyFillExit)
 
-	vmovdqu %ymm0, (%rdi)
+	vmovdqu %ymmZ, (%rdi)
 	add	$VEC_SIZE, %rdi
 
 	mov	%rdi, %rsi
@@ -2976,10 +2977,10 @@ L(StrncpyFillTailWithZero):
 	jb	L(StrncpyFillLessFourVecSize)
 
 L(StrncpyFillLoopVmovdqa):
-	vmovdqa %ymm0, (%rdi)
-	vmovdqa %ymm0, VEC_SIZE(%rdi)
-	vmovdqa %ymm0, (VEC_SIZE * 2)(%rdi)
-	vmovdqa %ymm0, (VEC_SIZE * 3)(%rdi)
+	vmovdqa %ymmZ, (%rdi)
+	vmovdqa %ymmZ, VEC_SIZE(%rdi)
+	vmovdqa %ymmZ, (VEC_SIZE * 2)(%rdi)
+	vmovdqa %ymmZ, (VEC_SIZE * 3)(%rdi)
 	add	$(VEC_SIZE * 4), %rdi
 	sub	$(VEC_SIZE * 4), %r8
 	jae	L(StrncpyFillLoopVmovdqa)
@@ -2987,19 +2988,19 @@ L(StrncpyFillLoopVmovdqa):
 L(StrncpyFillLessFourVecSize):
 	add	$(VEC_SIZE * 2), %r8
 	jl	L(StrncpyFillLessTwoVecSize)
-	vmovdqa %ymm0, (%rdi)
-	vmovdqa %ymm0, VEC_SIZE(%rdi)
+	vmovdqa %ymmZ, (%rdi)
+	vmovdqa %ymmZ, VEC_SIZE(%rdi)
 	add	$(VEC_SIZE * 2), %rdi
 	sub	$VEC_SIZE, %r8
 	jl	L(StrncpyFillExit)
-	vmovdqa %ymm0, (%rdi)
+	vmovdqa %ymmZ, (%rdi)
 	add	$VEC_SIZE, %rdi
 	BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
 
 L(StrncpyFillLessTwoVecSize):
 	add	$VEC_SIZE, %r8
 	jl	L(StrncpyFillExit)
-	vmovdqa %ymm0, (%rdi)
+	vmovdqa %ymmZ, (%rdi)
 	add	$VEC_SIZE, %rdi
 	BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
 

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=fc43bfda27f0b0a15c4a590dffcace9d25c06bf8

commit fc43bfda27f0b0a15c4a590dffcace9d25c06bf8
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Tue Sep 18 11:20:51 2018 -0700

    Replace 2 load/store with 1 load/store
    
    Please compare performance of 2 loads/stores vs. 1 load/store.
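
    Because the loads and stores may overlap, a 21-byte copy, for example,
    needs no 4-byte-plus-1-byte tail: one 16-byte move covers bytes 0-15 and
    one 8-byte move covers bytes 13-20. In outline (from the L(Exit21) hunk
    below):

    	vmovdqu	(%rsi), %xmm0		# bytes 0..15
    	mov	13(%rsi), %rcx		# bytes 13..20 (overlap with 13..15 is harmless)
    	vmovdqu	%xmm0, (%rdi)
    	mov	%rcx, 13(%rdi)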

diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 2f93cec..22bd063 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -863,10 +863,9 @@ L(Exit20):
 	.p2align 4
 L(Exit21):
 	vmovdqu	(%rsi), %xmm0
-	mov	16(%rsi), %ecx
+	mov	13(%rsi), %rcx
 	vmovdqu	%xmm0, (%rdi)
-	mov	%ecx, 16(%rdi)
-	mov	%dh, 20(%rdi)
+	mov	%rcx, 13(%rdi)
 # ifdef USE_AS_STPCPY
 	lea	20(%rdi), %rax
 # endif
@@ -932,10 +931,9 @@ L(Exit24):
 	.p2align 4
 L(Exit25):
 	vmovdqu	(%rsi), %xmm0
-	mov	16(%rsi), %rcx
+	vmovdqu	9(%rsi), %xmm1
 	vmovdqu	%xmm0, (%rdi)
-	mov	%rcx, 16(%rdi)
-	mov	%dh, 24(%rdi)
+	vmovdqu	%xmm1, 9(%rdi)
 # ifdef USE_AS_STPCPY
 	lea	24(%rdi), %rax
 # endif
@@ -950,11 +948,9 @@ L(Exit25):
 	.p2align 4
 L(Exit26):
 	vmovdqu	(%rsi), %xmm0
-	mov	16(%rsi), %rdx
-	mov	24(%rsi), %cx
+	vmovdqu	10(%rsi), %xmm1
 	vmovdqu	%xmm0, (%rdi)
-	mov	%rdx, 16(%rdi)
-	mov	%cx, 24(%rdi)
+	vmovdqu	%xmm1, 10(%rdi)
 # ifdef USE_AS_STPCPY
 	lea	25(%rdi), %rax
 # endif
@@ -969,11 +965,9 @@ L(Exit26):
 	.p2align 4
 L(Exit27):
 	vmovdqu	(%rsi), %xmm0
-	mov	16(%rsi), %rdx
-	mov	23(%rsi), %ecx
+	vmovdqu	11(%rsi), %xmm1
 	vmovdqu	%xmm0, (%rdi)
-	mov	%rdx, 16(%rdi)
-	mov	%ecx, 23(%rdi)
+	vmovdqu	%xmm1, 11(%rdi)
 # ifdef USE_AS_STPCPY
 	lea	26(%rdi), %rax
 # endif
@@ -988,11 +982,9 @@ L(Exit27):
 	.p2align 4
 L(Exit28):
 	vmovdqu	(%rsi), %xmm0
-	mov	16(%rsi), %rdx
-	mov	24(%rsi), %ecx
+	vmovdqu	12(%rsi), %xmm1
 	vmovdqu	%xmm0, (%rdi)
-	mov	%rdx, 16(%rdi)
-	mov	%ecx, 24(%rdi)
+	vmovdqu	%xmm1, 12(%rdi)
 # ifdef USE_AS_STPCPY
 	lea	27(%rdi), %rax
 # endif
@@ -1217,10 +1209,9 @@ L(Exit40):
 L(Exit41):
 	/* 0/32, 32/8, 40/1 */
 	vmovdqu (%rsi), %ymm0
-	mov	32(%rsi), %rcx
+	vmovdqu 25(%rsi), %xmm1
 	vmovdqu %ymm0, (%rdi)
-	mov	%rcx, 32(%rdi)
-	mov	%dh, 40(%rdi)
+	vmovdqu %xmm1, 25(%rdi)
 # ifdef USE_AS_STPCPY
 	lea	40(%rdi), %rax
 # endif
@@ -1236,11 +1227,9 @@ L(Exit41):
 L(Exit42):
 	/* 0/32, 32/8, 40/2 */
 	vmovdqu (%rsi), %ymm0
-	mov	32(%rsi), %rcx
-	mov	40(%rsi), %dx
+	vmovdqu 26(%rsi), %xmm1
 	vmovdqu %ymm0, (%rdi)
-	mov	%rcx, 32(%rdi)
-	mov	%dx, 40(%rdi)
+	vmovdqu %xmm1, 26(%rdi)
 # ifdef USE_AS_STPCPY
 	lea	41(%rdi), %rax
 # endif
@@ -1364,10 +1353,9 @@ L(Exit48):
 L(Exit49):
 	/* 0/32, 32/16, 48/1 */
 	vmovdqu (%rsi), %ymm0
-	vmovdqu 32(%rsi), %xmm1
+	vmovdqu 17(%rsi), %ymm1
 	vmovdqu %ymm0, (%rdi)
-	vmovdqu %xmm1, 32(%rdi)
-	mov	%dh, 48(%rdi)
+	vmovdqu %ymm1, 17(%rdi)
 # ifdef USE_AS_STPCPY
 	lea	48(%rdi), %rax
 # endif
@@ -1383,11 +1371,9 @@ L(Exit49):
 L(Exit50):
 	/* 0/32, 32/16, 48/2 */
 	vmovdqu (%rsi), %ymm0
-	vmovdqu 32(%rsi), %xmm1
-	mov	48(%rsi), %dx
+	vmovdqu 18(%rsi), %ymm1
 	vmovdqu %ymm0, (%rdi)
-	vmovdqu %xmm1, 32(%rdi)
-	mov	%dx, 48(%rdi)
+	vmovdqu %ymm1, 18(%rdi)
 # ifdef USE_AS_STPCPY
 	lea	49(%rdi), %rax
 # endif
@@ -1403,11 +1389,9 @@ L(Exit50):
 L(Exit51):
 	/* 0/32, 32/16, 47/4 */
 	vmovdqu (%rsi), %ymm0
-	vmovdqu 32(%rsi), %xmm1
-	mov	47(%rsi), %edx
+	vmovdqu 19(%rsi), %ymm1
 	vmovdqu %ymm0, (%rdi)
-	vmovdqu %xmm1, 32(%rdi)
-	mov	%edx, 47(%rdi)
+	vmovdqu %ymm1, 19(%rdi)
 # ifdef USE_AS_STPCPY
 	lea	50(%rdi), %rax
 # endif
@@ -1423,11 +1407,9 @@ L(Exit51):
 L(Exit52):
 	/* 0/32, 32/16, 48/4 */
 	vmovdqu (%rsi), %ymm0
-	vmovdqu 32(%rsi), %xmm1
-	mov	48(%rsi), %edx
+	vmovdqu 20(%rsi), %ymm1
 	vmovdqu %ymm0, (%rdi)
-	vmovdqu %xmm1, 32(%rdi)
-	mov	%edx, 48(%rdi)
+	vmovdqu %ymm1, 20(%rdi)
 # ifdef USE_AS_STPCPY
 	lea	51(%rdi), %rax
 # endif
@@ -1443,11 +1425,9 @@ L(Exit52):
 L(Exit53):
 	/* 0/32, 32/16, 45/8 */
 	vmovdqu (%rsi), %ymm0
-	vmovdqu 32(%rsi), %xmm1
-	mov	45(%rsi), %rdx
+	vmovdqu 21(%rsi), %ymm1
 	vmovdqu %ymm0, (%rdi)
-	vmovdqu %xmm1, 32(%rdi)
-	mov	%rdx, 45(%rdi)
+	vmovdqu %ymm1, 21(%rdi)
 # ifdef USE_AS_STPCPY
 	lea	52(%rdi), %rax
 # endif
@@ -1463,11 +1443,9 @@ L(Exit53):
 L(Exit54):
 	/* 0/32, 32/16, 46/8 */
 	vmovdqu (%rsi), %ymm0
-	vmovdqu 32(%rsi), %xmm1
-	mov	46(%rsi), %rdx
+	vmovdqu 22(%rsi), %ymm1
 	vmovdqu %ymm0, (%rdi)
-	vmovdqu %xmm1, 32(%rdi)
-	mov	%rdx, 46(%rdi)
+	vmovdqu %ymm1, 22(%rdi)
 # ifdef USE_AS_STPCPY
 	lea	53(%rdi), %rax
 # endif
@@ -1483,11 +1461,9 @@ L(Exit54):
 L(Exit55):
 	/* 0/32, 32/16, 47/8 */
 	vmovdqu (%rsi), %ymm0
-	vmovdqu 32(%rsi), %xmm1
-	mov	47(%rsi), %rdx
+	vmovdqu 23(%rsi), %ymm1
 	vmovdqu %ymm0, (%rdi)
-	vmovdqu %xmm1, 32(%rdi)
-	mov	%rdx, 47(%rdi)
+	vmovdqu %ymm1, 23(%rdi)
 # ifdef USE_AS_STPCPY
 	lea	54(%rdi), %rax
 # endif
@@ -1503,11 +1479,9 @@ L(Exit55):
 L(Exit56):
 	/* 0/32, 32/16, 48/8 */
 	vmovdqu (%rsi), %ymm0
-	vmovdqu 32(%rsi), %xmm1
-	mov	48(%rsi), %rdx
+	vmovdqu 24(%rsi), %ymm1
 	vmovdqu %ymm0, (%rdi)
-	vmovdqu %xmm1, 32(%rdi)
-	mov	%rdx, 48(%rdi)
+	vmovdqu %ymm1, 24(%rdi)
 # ifdef USE_AS_STPCPY
 	lea	55(%rdi), %rax
 # endif
@@ -1990,11 +1964,9 @@ L(StrncpyExit20):
 	.p2align 4
 L(StrncpyExit21):
 	vmovdqu	(%rsi), %xmm0
-	mov	16(%rsi), %ecx
-	mov	20(%rsi), %dl
+	mov	13(%rsi), %rcx
 	vmovdqu	%xmm0, (%rdi)
-	mov	%ecx, 16(%rdi)
-	mov	%dl, 20(%rdi)
+	mov	%rcx, 13(%rdi)
 #  ifdef USE_AS_STPCPY
 	lea	21(%rdi), %rax
 #  endif
@@ -2056,11 +2028,9 @@ L(StrncpyExit24):
 	.p2align 4
 L(StrncpyExit25):
 	vmovdqu	(%rsi), %xmm0
-	mov	16(%rsi), %rdx
-	mov	24(%rsi), %cl
+	vmovdqu	9(%rsi), %xmm1
 	vmovdqu	%xmm0, (%rdi)
-	mov	%rdx, 16(%rdi)
-	mov	%cl, 24(%rdi)
+	vmovdqu	%xmm1, 9(%rdi)
 #  ifdef USE_AS_STPCPY
 	lea	25(%rdi), %rax
 #  endif
@@ -2074,11 +2044,9 @@ L(StrncpyExit25):
 	.p2align 4
 L(StrncpyExit26):
 	vmovdqu	(%rsi), %xmm0
-	mov	16(%rsi), %rdx
-	mov	24(%rsi), %cx
+	vmovdqu	10(%rsi), %xmm1
 	vmovdqu	%xmm0, (%rdi)
-	mov	%rdx, 16(%rdi)
-	mov	%cx, 24(%rdi)
+	vmovdqu	%xmm1, 10(%rdi)
 #  ifdef USE_AS_STPCPY
 	lea	26(%rdi), %rax
 #  endif
@@ -2092,11 +2060,9 @@ L(StrncpyExit26):
 	.p2align 4
 L(StrncpyExit27):
 	vmovdqu	(%rsi), %xmm0
-	mov	16(%rsi), %rdx
-	mov	23(%rsi), %ecx
+	vmovdqu	11(%rsi), %xmm1
 	vmovdqu	%xmm0, (%rdi)
-	mov	%rdx, 16(%rdi)
-	mov	%ecx, 23(%rdi)
+	vmovdqu	%xmm1, 11(%rdi)
 #  ifdef USE_AS_STPCPY
 	lea	27(%rdi), %rax
 #  endif
@@ -2110,11 +2076,9 @@ L(StrncpyExit27):
 	.p2align 4
 L(StrncpyExit28):
 	vmovdqu	(%rsi), %xmm0
-	mov	16(%rsi), %rdx
-	mov	24(%rsi), %ecx
+	vmovdqu	12(%rsi), %xmm1
 	vmovdqu	%xmm0, (%rdi)
-	mov	%rdx, 16(%rdi)
-	mov	%ecx, 24(%rdi)
+	vmovdqu	%xmm1, 12(%rdi)
 #  ifdef USE_AS_STPCPY
 	lea	28(%rdi), %rax
 #  endif
@@ -2326,11 +2290,9 @@ L(StrncpyExit40):
 L(StrncpyExit41):
 	/*  0/32, 32/8, 40/1 */
 	vmovdqu (%rsi), %ymm0
-	mov	32(%rsi), %rdx
-	mov	40(%rsi), %cl
+	vmovdqu 25(%rsi), %xmm1
 	vmovdqu %ymm0, (%rdi)
-	mov	%rdx, 32(%rdi)
-	mov	%cl, 40(%rdi)
+	vmovdqu %xmm1, 25(%rdi)
 #  ifdef USE_AS_STPCPY
 	lea	41(%rdi), %rax
 #  endif
@@ -2345,11 +2307,9 @@ L(StrncpyExit41):
 L(StrncpyExit42):
 	/*  0/32, 32/8, 40/2 */
 	vmovdqu (%rsi), %ymm0
-	mov	32(%rsi), %rdx
-	mov	40(%rsi), %cx
+	vmovdqu 26(%rsi), %xmm1
 	vmovdqu %ymm0, (%rdi)
-	mov	%rdx, 32(%rdi)
-	mov	%cx, 40(%rdi)
+	vmovdqu %xmm1, 26(%rdi)
 #  ifdef USE_AS_STPCPY
 	lea	42(%rdi), %rax
 #  endif
@@ -2466,11 +2426,9 @@ L(StrncpyExit48):
 L(StrncpyExit49):
 	/* 0/32, 32/16, 48/1 */
 	vmovdqu (%rsi), %ymm0
-	vmovdqu 32(%rsi), %xmm2
-	mov	48(%rsi), %cl
+	vmovdqu 17(%rsi), %ymm2
 	vmovdqu %ymm0, (%rdi)
-	vmovdqu %xmm2, 32(%rdi)
-	mov	%cl, 48(%rdi)
+	vmovdqu %ymm2, 17(%rdi)
 #  ifdef USE_AS_STPCPY
 	lea	49(%rdi), %rax
 #  endif
@@ -2485,11 +2443,9 @@ L(StrncpyExit49):
 L(StrncpyExit50):
 	/*  0/32, 32/16, 48/2 */
 	vmovdqu (%rsi), %ymm0
-	vmovdqu 32(%rsi), %xmm2
-	mov	48(%rsi), %cx
+	vmovdqu 18(%rsi), %ymm2
 	vmovdqu %ymm0, (%rdi)
-	vmovdqu %xmm2, 32(%rdi)
-	mov	%cx, 48(%rdi)
+	vmovdqu %ymm2, 18(%rdi)
 #  ifdef USE_AS_STPCPY
 	lea	50(%rdi), %rax
 #  endif
@@ -2504,11 +2460,9 @@ L(StrncpyExit50):
 L(StrncpyExit51):
 	/*  0/32, 32/16, 47/4 */
 	vmovdqu (%rsi), %ymm0
-	vmovdqu 32(%rsi), %xmm2
-	mov	47(%rsi), %ecx
+	vmovdqu 19(%rsi), %ymm2
 	vmovdqu %ymm0, (%rdi)
-	vmovdqu %xmm2, 32(%rdi)
-	mov	%ecx, 47(%rdi)
+	vmovdqu %ymm2, 19(%rdi)
 #  ifdef USE_AS_STPCPY
 	lea	51(%rdi), %rax
 #  endif
@@ -2523,11 +2477,9 @@ L(StrncpyExit51):
 L(StrncpyExit52):
 	/*  0/32, 32/16, 48/4 */
 	vmovdqu (%rsi), %ymm0
-	vmovdqu 32(%rsi), %xmm2
-	mov	48(%rsi), %ecx
+	vmovdqu 20(%rsi), %ymm2
 	vmovdqu %ymm0, (%rdi)
-	vmovdqu %xmm2, 32(%rdi)
-	mov	%ecx, 48(%rdi)
+	vmovdqu %ymm2, 20(%rdi)
 #  ifdef USE_AS_STPCPY
 	lea	52(%rdi), %rax
 #  endif
@@ -2542,11 +2494,9 @@ L(StrncpyExit52):
 L(StrncpyExit53):
 	/*  0/32, 32/16, 45/8 */
 	vmovdqu (%rsi), %ymm0
-	vmovdqu 32(%rsi), %xmm2
-	mov	45(%rsi), %rcx
+	vmovdqu 21(%rsi), %ymm2
 	vmovdqu %ymm0, (%rdi)
-	vmovdqu %xmm2, 32(%rdi)
-	mov	%rcx, 45(%rdi)
+	vmovdqu %ymm2, 21(%rdi)
 #  ifdef USE_AS_STPCPY
 	lea	53(%rdi), %rax
 #  endif
@@ -2561,11 +2511,9 @@ L(StrncpyExit53):
 L(StrncpyExit54):
 	/*  0/32, 32/16, 46/8 */
 	vmovdqu (%rsi), %ymm0
-	vmovdqu 32(%rsi), %xmm2
-	mov	46(%rsi), %rcx
+	vmovdqu 22(%rsi), %ymm2
 	vmovdqu %ymm0, (%rdi)
-	vmovdqu %xmm2, 32(%rdi)
-	mov	%rcx, 46(%rdi)
+	vmovdqu %ymm2, 22(%rdi)
 #  ifdef USE_AS_STPCPY
 	lea	54(%rdi), %rax
 #  endif
@@ -2580,11 +2528,9 @@ L(StrncpyExit54):
 L(StrncpyExit55):
 	/* 0/32, 32/16, 47/8 */
 	vmovdqu (%rsi), %ymm0
-	vmovdqu 32(%rsi), %xmm2
-	mov	47(%rsi), %rcx
+	vmovdqu 23(%rsi), %ymm2
 	vmovdqu %ymm0, (%rdi)
-	vmovdqu %xmm2, 32(%rdi)
-	mov	%rcx, 47(%rdi)
+	vmovdqu %ymm2, 23(%rdi)
 #  ifdef USE_AS_STPCPY
 	lea	55(%rdi), %rax
 #  endif
@@ -2599,11 +2545,9 @@ L(StrncpyExit55):
 L(StrncpyExit56):
 	/* 0/32, 32/16, 48/8 */
 	vmovdqu (%rsi), %ymm0
-	vmovdqu 32(%rsi), %xmm2
-	mov	48(%rsi), %rcx
+	vmovdqu 24(%rsi), %ymm2
 	vmovdqu %ymm0, (%rdi)
-	vmovdqu %xmm2, 32(%rdi)
-	mov	%rcx, 48(%rdi)
+	vmovdqu %ymm2, 24(%rdi)
 #  ifdef USE_AS_STPCPY
 	lea	56(%rdi), %rax
 #  endif

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=7121fc964d416749df76aaef61b6afd2524a8a90

commit 7121fc964d416749df76aaef61b6afd2524a8a90
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Tue Sep 18 10:14:06 2018 -0700

    Replace movdqu with vmovdqu
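
    Mixing legacy-SSE encodings into AVX code can incur SSE/AVX state
    transition penalties on some Intel CPUs; the VEX-encoded vmovdqu avoids
    them, and its 128-bit form also zeroes bits 128-255 of the destination.
    Side by side (illustrative):

    	movdqu	(%rsi), %xmm0		# legacy SSE: upper YMM bits untouched,
    					# may trigger a state transition
    	vmovdqu	(%rsi), %xmm0		# VEX: no transition; bits 128..255
    					# of %ymm0 are zeroed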

diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index 1215bc3..2f93cec 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -780,8 +780,8 @@ L(Exit15):
 
 	.p2align 4
 L(Exit16):
-	movdqu	(%rsi), %xmm0
-	movdqu	%xmm0, (%rdi)
+	vmovdqu	(%rsi), %xmm0
+	vmovdqu	%xmm0, (%rdi)
 # ifdef USE_AS_STPCPY
 	lea	15(%rdi), %rax
 # endif
@@ -795,8 +795,8 @@ L(Exit16):
 
 	.p2align 4
 L(Exit17):
-	movdqu	(%rsi), %xmm0
-	movdqu	%xmm0, (%rdi)
+	vmovdqu	(%rsi), %xmm0
+	vmovdqu	%xmm0, (%rdi)
 	mov	%dh, 16(%rdi)
 # ifdef USE_AS_STPCPY
 	lea	16(%rdi), %rax
@@ -811,9 +811,9 @@ L(Exit17):
 
 	.p2align 4
 L(Exit18):
-	movdqu	(%rsi), %xmm0
+	vmovdqu	(%rsi), %xmm0
 	mov	16(%rsi), %cx
-	movdqu	%xmm0, (%rdi)
+	vmovdqu	%xmm0, (%rdi)
 	mov	%cx, 16(%rdi)
 # ifdef USE_AS_STPCPY
 	lea	17(%rdi), %rax
@@ -828,9 +828,9 @@ L(Exit18):
 
 	.p2align 4
 L(Exit19):
-	movdqu	(%rsi), %xmm0
+	vmovdqu	(%rsi), %xmm0
 	mov	15(%rsi), %ecx
-	movdqu	%xmm0, (%rdi)
+	vmovdqu	%xmm0, (%rdi)
 	mov	%ecx, 15(%rdi)
 # ifdef USE_AS_STPCPY
 	lea	18(%rdi), %rax
@@ -845,9 +845,9 @@ L(Exit19):
 
 	.p2align 4
 L(Exit20):
-	movdqu	(%rsi), %xmm0
+	vmovdqu	(%rsi), %xmm0
 	mov	16(%rsi), %ecx
-	movdqu	%xmm0, (%rdi)
+	vmovdqu	%xmm0, (%rdi)
 	mov	%ecx, 16(%rdi)
 # ifdef USE_AS_STPCPY
 	lea	19(%rdi), %rax
@@ -862,9 +862,9 @@ L(Exit20):
 
 	.p2align 4
 L(Exit21):
-	movdqu	(%rsi), %xmm0
+	vmovdqu	(%rsi), %xmm0
 	mov	16(%rsi), %ecx
-	movdqu	%xmm0, (%rdi)
+	vmovdqu	%xmm0, (%rdi)
 	mov	%ecx, 16(%rdi)
 	mov	%dh, 20(%rdi)
 # ifdef USE_AS_STPCPY
@@ -880,9 +880,9 @@ L(Exit21):
 
 	.p2align 4
 L(Exit22):
-	movdqu	(%rsi), %xmm0
+	vmovdqu	(%rsi), %xmm0
 	mov	14(%rsi), %rcx
-	movdqu	%xmm0, (%rdi)
+	vmovdqu	%xmm0, (%rdi)
 	mov	%rcx, 14(%rdi)
 # ifdef USE_AS_STPCPY
 	lea	21(%rdi), %rax
@@ -897,9 +897,9 @@ L(Exit22):
 
 	.p2align 4
 L(Exit23):
-	movdqu	(%rsi), %xmm0
+	vmovdqu	(%rsi), %xmm0
 	mov	15(%rsi), %rcx
-	movdqu	%xmm0, (%rdi)
+	vmovdqu	%xmm0, (%rdi)
 	mov	%rcx, 15(%rdi)
 # ifdef USE_AS_STPCPY
 	lea	22(%rdi), %rax
@@ -914,9 +914,9 @@ L(Exit23):
 
 	.p2align 4
 L(Exit24):
-	movdqu	(%rsi), %xmm0
+	vmovdqu	(%rsi), %xmm0
 	mov	16(%rsi), %rcx
-	movdqu	%xmm0, (%rdi)
+	vmovdqu	%xmm0, (%rdi)
 	mov	%rcx, 16(%rdi)
 # ifdef USE_AS_STPCPY
 	lea	23(%rdi), %rax
@@ -931,9 +931,9 @@ L(Exit24):
 
 	.p2align 4
 L(Exit25):
-	movdqu	(%rsi), %xmm0
+	vmovdqu	(%rsi), %xmm0
 	mov	16(%rsi), %rcx
-	movdqu	%xmm0, (%rdi)
+	vmovdqu	%xmm0, (%rdi)
 	mov	%rcx, 16(%rdi)
 	mov	%dh, 24(%rdi)
 # ifdef USE_AS_STPCPY
@@ -949,10 +949,10 @@ L(Exit25):
 
 	.p2align 4
 L(Exit26):
-	movdqu	(%rsi), %xmm0
+	vmovdqu	(%rsi), %xmm0
 	mov	16(%rsi), %rdx
 	mov	24(%rsi), %cx
-	movdqu	%xmm0, (%rdi)
+	vmovdqu	%xmm0, (%rdi)
 	mov	%rdx, 16(%rdi)
 	mov	%cx, 24(%rdi)
 # ifdef USE_AS_STPCPY
@@ -968,10 +968,10 @@ L(Exit26):
 
 	.p2align 4
 L(Exit27):
-	movdqu	(%rsi), %xmm0
+	vmovdqu	(%rsi), %xmm0
 	mov	16(%rsi), %rdx
 	mov	23(%rsi), %ecx
-	movdqu	%xmm0, (%rdi)
+	vmovdqu	%xmm0, (%rdi)
 	mov	%rdx, 16(%rdi)
 	mov	%ecx, 23(%rdi)
 # ifdef USE_AS_STPCPY
@@ -987,10 +987,10 @@ L(Exit27):
 
 	.p2align 4
 L(Exit28):
-	movdqu	(%rsi), %xmm0
+	vmovdqu	(%rsi), %xmm0
 	mov	16(%rsi), %rdx
 	mov	24(%rsi), %ecx
-	movdqu	%xmm0, (%rdi)
+	vmovdqu	%xmm0, (%rdi)
 	mov	%rdx, 16(%rdi)
 	mov	%ecx, 24(%rdi)
 # ifdef USE_AS_STPCPY
@@ -1006,10 +1006,10 @@ L(Exit28):
 
 	.p2align 4
 L(Exit29):
-	movdqu	(%rsi), %xmm0
-	movdqu	13(%rsi), %xmm2
-	movdqu	%xmm0, (%rdi)
-	movdqu	%xmm2, 13(%rdi)
+	vmovdqu	(%rsi), %xmm0
+	vmovdqu	13(%rsi), %xmm2
+	vmovdqu	%xmm0, (%rdi)
+	vmovdqu	%xmm2, 13(%rdi)
 # ifdef USE_AS_STPCPY
 	lea	28(%rdi), %rax
 # endif
@@ -1023,10 +1023,10 @@ L(Exit29):
 
 	.p2align 4
 L(Exit30):
-	movdqu	(%rsi), %xmm0
-	movdqu	14(%rsi), %xmm2
-	movdqu	%xmm0, (%rdi)
-	movdqu	%xmm2, 14(%rdi)
+	vmovdqu	(%rsi), %xmm0
+	vmovdqu	14(%rsi), %xmm2
+	vmovdqu	%xmm0, (%rdi)
+	vmovdqu	%xmm2, 14(%rdi)
 # ifdef USE_AS_STPCPY
 	lea	29(%rdi), %rax
 # endif
@@ -1040,10 +1040,10 @@ L(Exit30):
 
 	.p2align 4
 L(Exit31):
-	movdqu	(%rsi), %xmm0
-	movdqu	15(%rsi), %xmm2
-	movdqu	%xmm0, (%rdi)
-	movdqu	%xmm2, 15(%rdi)
+	vmovdqu	(%rsi), %xmm0
+	vmovdqu	15(%rsi), %xmm2
+	vmovdqu	%xmm0, (%rdi)
+	vmovdqu	%xmm2, 15(%rdi)
 # ifdef USE_AS_STPCPY
 	lea	30(%rdi), %rax
 # endif
@@ -1057,10 +1057,8 @@ L(Exit31):
 
 	.p2align 4
 L(Exit32):
-	movdqu	(%rsi), %xmm0
-	movdqu	16(%rsi), %xmm2
-	movdqu	%xmm0, (%rdi)
-	movdqu	%xmm2, 16(%rdi)
+	vmovdqu	(%rsi), %ymm0
+	vmovdqu	%ymm0, (%rdi)
 # ifdef USE_AS_STPCPY
 	lea	31(%rdi), %rax
 # endif
@@ -1913,8 +1911,8 @@ L(StrncpyExit15):
 
 	.p2align 4
 L(StrncpyExit16):
-	movdqu	(%rsi), %xmm0
-	movdqu	%xmm0, (%rdi)
+	vmovdqu	(%rsi), %xmm0
+	vmovdqu	%xmm0, (%rdi)
 #  ifdef USE_AS_STPCPY
 	lea	16(%rdi), %rax
 #  endif
@@ -1927,9 +1925,9 @@ L(StrncpyExit16):
 
 	.p2align 4
 L(StrncpyExit17):
-	movdqu	(%rsi), %xmm0
+	vmovdqu	(%rsi), %xmm0
 	mov	16(%rsi), %cl
-	movdqu	%xmm0, (%rdi)
+	vmovdqu	%xmm0, (%rdi)
 	mov	%cl, 16(%rdi)
 #  ifdef USE_AS_STPCPY
 	lea	17(%rdi), %rax
@@ -1943,9 +1941,9 @@ L(StrncpyExit17):
 
 	.p2align 4
 L(StrncpyExit18):
-	movdqu	(%rsi), %xmm0
+	vmovdqu	(%rsi), %xmm0
 	mov	16(%rsi), %cx
-	movdqu	%xmm0, (%rdi)
+	vmovdqu	%xmm0, (%rdi)
 	mov	%cx, 16(%rdi)
 #  ifdef USE_AS_STPCPY
 	lea	18(%rdi), %rax
@@ -1959,9 +1957,9 @@ L(StrncpyExit18):
 
 	.p2align 4
 L(StrncpyExit19):
-	movdqu	(%rsi), %xmm0
+	vmovdqu	(%rsi), %xmm0
 	mov	15(%rsi), %ecx
-	movdqu	%xmm0, (%rdi)
+	vmovdqu	%xmm0, (%rdi)
 	mov	%ecx, 15(%rdi)
 #  ifdef USE_AS_STPCPY
 	lea	19(%rdi), %rax
@@ -1975,9 +1973,9 @@ L(StrncpyExit19):
 
 	.p2align 4
 L(StrncpyExit20):
-	movdqu	(%rsi), %xmm0
+	vmovdqu	(%rsi), %xmm0
 	mov	16(%rsi), %ecx
-	movdqu	%xmm0, (%rdi)
+	vmovdqu	%xmm0, (%rdi)
 	mov	%ecx, 16(%rdi)
 #  ifdef USE_AS_STPCPY
 	lea	20(%rdi), %rax
@@ -1991,10 +1989,10 @@ L(StrncpyExit20):
 
 	.p2align 4
 L(StrncpyExit21):
-	movdqu	(%rsi), %xmm0
+	vmovdqu	(%rsi), %xmm0
 	mov	16(%rsi), %ecx
 	mov	20(%rsi), %dl
-	movdqu	%xmm0, (%rdi)
+	vmovdqu	%xmm0, (%rdi)
 	mov	%ecx, 16(%rdi)
 	mov	%dl, 20(%rdi)
 #  ifdef USE_AS_STPCPY
@@ -2009,9 +2007,9 @@ L(StrncpyExit21):
 
 	.p2align 4
 L(StrncpyExit22):
-	movdqu	(%rsi), %xmm0
+	vmovdqu	(%rsi), %xmm0
 	mov	14(%rsi), %rcx
-	movdqu	%xmm0, (%rdi)
+	vmovdqu	%xmm0, (%rdi)
 	mov	%rcx, 14(%rdi)
 #  ifdef USE_AS_STPCPY
 	lea	22(%rdi), %rax
@@ -2025,9 +2023,9 @@ L(StrncpyExit22):
 
 	.p2align 4
 L(StrncpyExit23):
-	movdqu	(%rsi), %xmm0
+	vmovdqu	(%rsi), %xmm0
 	mov	15(%rsi), %rcx
-	movdqu	%xmm0, (%rdi)
+	vmovdqu	%xmm0, (%rdi)
 	mov	%rcx, 15(%rdi)
 #  ifdef USE_AS_STPCPY
 	lea	23(%rdi), %rax
@@ -2041,9 +2039,9 @@ L(StrncpyExit23):
 
 	.p2align 4
 L(StrncpyExit24):
-	movdqu	(%rsi), %xmm0
+	vmovdqu	(%rsi), %xmm0
 	mov	16(%rsi), %rcx
-	movdqu	%xmm0, (%rdi)
+	vmovdqu	%xmm0, (%rdi)
 	mov	%rcx, 16(%rdi)
 #  ifdef USE_AS_STPCPY
 	lea	24(%rdi), %rax
@@ -2057,10 +2055,10 @@ L(StrncpyExit24):
 
 	.p2align 4
 L(StrncpyExit25):
-	movdqu	(%rsi), %xmm0
+	vmovdqu	(%rsi), %xmm0
 	mov	16(%rsi), %rdx
 	mov	24(%rsi), %cl
-	movdqu	%xmm0, (%rdi)
+	vmovdqu	%xmm0, (%rdi)
 	mov	%rdx, 16(%rdi)
 	mov	%cl, 24(%rdi)
 #  ifdef USE_AS_STPCPY
@@ -2075,10 +2073,10 @@ L(StrncpyExit25):
 
 	.p2align 4
 L(StrncpyExit26):
-	movdqu	(%rsi), %xmm0
+	vmovdqu	(%rsi), %xmm0
 	mov	16(%rsi), %rdx
 	mov	24(%rsi), %cx
-	movdqu	%xmm0, (%rdi)
+	vmovdqu	%xmm0, (%rdi)
 	mov	%rdx, 16(%rdi)
 	mov	%cx, 24(%rdi)
 #  ifdef USE_AS_STPCPY
@@ -2093,10 +2091,10 @@ L(StrncpyExit26):
 
 	.p2align 4
 L(StrncpyExit27):
-	movdqu	(%rsi), %xmm0
+	vmovdqu	(%rsi), %xmm0
 	mov	16(%rsi), %rdx
 	mov	23(%rsi), %ecx
-	movdqu	%xmm0, (%rdi)
+	vmovdqu	%xmm0, (%rdi)
 	mov	%rdx, 16(%rdi)
 	mov	%ecx, 23(%rdi)
 #  ifdef USE_AS_STPCPY
@@ -2111,10 +2109,10 @@ L(StrncpyExit27):
 
 	.p2align 4
 L(StrncpyExit28):
-	movdqu	(%rsi), %xmm0
+	vmovdqu	(%rsi), %xmm0
 	mov	16(%rsi), %rdx
 	mov	24(%rsi), %ecx
-	movdqu	%xmm0, (%rdi)
+	vmovdqu	%xmm0, (%rdi)
 	mov	%rdx, 16(%rdi)
 	mov	%ecx, 24(%rdi)
 #  ifdef USE_AS_STPCPY
@@ -2129,10 +2127,10 @@ L(StrncpyExit28):
 
 	.p2align 4
 L(StrncpyExit29):
-	movdqu	(%rsi), %xmm0
-	movdqu	13(%rsi), %xmm2
-	movdqu	%xmm0, (%rdi)
-	movdqu	%xmm2, 13(%rdi)
+	vmovdqu	(%rsi), %xmm0
+	vmovdqu	13(%rsi), %xmm2
+	vmovdqu	%xmm0, (%rdi)
+	vmovdqu	%xmm2, 13(%rdi)
 #  ifdef USE_AS_STPCPY
 	lea	29(%rdi), %rax
 #  endif
@@ -2145,10 +2143,10 @@ L(StrncpyExit29):
 
 	.p2align 4
 L(StrncpyExit30):
-	movdqu	(%rsi), %xmm0
-	movdqu	14(%rsi), %xmm2
-	movdqu	%xmm0, (%rdi)
-	movdqu	%xmm2, 14(%rdi)
+	vmovdqu	(%rsi), %xmm0
+	vmovdqu	14(%rsi), %xmm2
+	vmovdqu	%xmm0, (%rdi)
+	vmovdqu	%xmm2, 14(%rdi)
 #  ifdef USE_AS_STPCPY
 	lea	30(%rdi), %rax
 #  endif
@@ -2161,10 +2159,10 @@ L(StrncpyExit30):
 
 	.p2align 4
 L(StrncpyExit31):
-	movdqu	(%rsi), %xmm0
-	movdqu	15(%rsi), %xmm2
-	movdqu	%xmm0, (%rdi)
-	movdqu	%xmm2, 15(%rdi)
+	vmovdqu	(%rsi), %xmm0
+	vmovdqu	15(%rsi), %xmm2
+	vmovdqu	%xmm0, (%rdi)
+	vmovdqu	%xmm2, 15(%rdi)
 #  ifdef USE_AS_STPCPY
 	lea	31(%rdi), %rax
 #  endif
@@ -2177,10 +2175,8 @@ L(StrncpyExit31):
 
 	.p2align 4
 L(StrncpyExit32):
-	movdqu	(%rsi), %xmm0
-	movdqu	16(%rsi), %xmm2
-	movdqu	%xmm0, (%rdi)
-	movdqu	%xmm2, 16(%rdi)
+	vmovdqu	(%rsi), %ymm0
+	vmovdqu	%ymm0, (%rdi)
 #  ifdef USE_AS_STPCPY
 	lea	32(%rdi), %rax
 #  endif
@@ -2193,11 +2189,9 @@ L(StrncpyExit32):
 
 	.p2align 4
 L(StrncpyExit33):
-	movdqu	(%rsi), %xmm0
-	movdqu	16(%rsi), %xmm2
+	vmovdqu	(%rsi), %ymm0
 	mov	32(%rsi), %cl
-	movdqu	%xmm0, (%rdi)
-	movdqu	%xmm2, 16(%rdi)
+	vmovdqu	%ymm0, (%rdi)
 	mov	%cl, 32(%rdi)
 #  ifdef USE_AS_STPCPY
 	lea	33(%rdi), %rax

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=6a8074db808d8ede7ee3837611963f06ecad90ab

commit 6a8074db808d8ede7ee3837611963f06ecad90ab
Author: Leonardo Sandoval <leonardo.sandoval.gonzalez@linux.intel.com>
Date:   Fri Jul 27 10:16:13 2018 -0500

    x86-64: Optimize strcat/strncat, strcpy/strncpy and stpcpy/stpncpy with AVX2
    
    Optimize x86-64 strcat/strncat, strcpy/strncpy and stpcpy/stpncpy with AVX2.
    The implementations use vector comparison as much as possible.  In general,
    the larger the source string, the greater the performance gain observed
    (expected, since AVX2 uses 256-bit registers), reaching speedups of 1.5x at
    512-byte lengths, though gains (>1x) start even in strings as short as 20
    bytes.  Select the AVX2 strcat/strncat, strcpy/strncpy and stpcpy/stpncpy
    on AVX2 machines where vzeroupper is preferred and AVX unaligned load is
    fast.
    
    	* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
    	strcat-avx2, strncat-avx2, strcpy-avx2, strncpy-avx2,
    	stpcpy-avx2 and stpncpy-avx2.
    	* sysdeps/x86_64/multiarch/ifunc-impl-list.c
    	(__libc_ifunc_impl_list): Add tests for __strcat_avx2,
    	__strncat_avx2, __strcpy_avx2, __strncpy_avx2, __stpcpy_avx2
    	and __stpncpy_avx2.
    	* sysdeps/x86_64/multiarch/{ifunc-unaligned-ssse3.h =>
    	ifunc-unaligned.h}: Rename header to a more generic name.
    	* sysdeps/x86_64/multiarch/ifunc-unaligned.h
    	(IFUNC_SELECTOR): Return OPTIMIZE (avx2) on AVX2 machines if
    	AVX unaligned load is fast and vzeroupper is preferred.
    	* sysdeps/x86_64/multiarch/stpcpy-avx2.S: New file.
    	* sysdeps/x86_64/multiarch/stpncpy-avx2.S: Likewise.
    	* sysdeps/x86_64/multiarch/strcat-avx2.S: Likewise.
    	* sysdeps/x86_64/multiarch/strcpy-avx2.S: Likewise.
    	* sysdeps/x86_64/multiarch/strncat-avx2.S: Likewise.
    	* sysdeps/x86_64/multiarch/strncpy-avx2.S: Likewise.
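
As a rough, illustrative way to check the speedup claim in the commit
message above (not part of the commit), a minimal benchmark sketch in C;
the buffer size, iteration count and timing method are arbitrary choices:

    #include <stdio.h>
    #include <string.h>
    #include <time.h>

    int
    main (void)
    {
      enum { LEN = 512, ITERS = 1000000 };
      static char src[LEN + 1], dst[LEN + 1];
      memset (src, 'a', LEN);  /* A 512-byte string, as in the 1.5x claim.  */

      struct timespec t0, t1;
      clock_gettime (CLOCK_MONOTONIC, &t0);
      for (int i = 0; i < ITERS; i++)
        {
          strcpy (dst, src);
          /* Keep the compiler from hoisting the call out of the loop.  */
          __asm__ volatile ("" : : "r" (dst) : "memory");
        }
      clock_gettime (CLOCK_MONOTONIC, &t1);

      double ns = (t1.tv_sec - t0.tv_sec) * 1e9 + (t1.tv_nsec - t0.tv_nsec);
      printf ("%.1f ns per strcpy of %d bytes\n", ns / ITERS, LEN);
      return 0;
    }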

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index bb5e970..395e432 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -24,11 +24,14 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
 		   strchr-sse2 strchrnul-sse2 strchr-avx2 strchrnul-avx2 \
 		   strrchr-sse2 strrchr-avx2 \
 		   strlen-sse2 strnlen-sse2 strlen-avx2 strnlen-avx2 \
+		   strcat-avx2 strncat-avx2 \
 		   strcat-ssse3 strncat-ssse3\
+		   strcpy-avx2 strncpy-avx2 \
 		   strcpy-sse2 stpcpy-sse2 \
 		   strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
 		   strcpy-sse2-unaligned strncpy-sse2-unaligned \
 		   stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
+		   stpcpy-avx2 stpncpy-avx2 \
 		   strcat-sse2 \
 		   strcat-sse2-unaligned strncat-sse2-unaligned \
 		   strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 9aaaef7..950bd9e 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -199,6 +199,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   IFUNC_IMPL (i, name, stpncpy,
 	      IFUNC_IMPL_ADD (array, i, stpncpy, HAS_CPU_FEATURE (SSSE3),
 			      __stpncpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, stpncpy, HAS_ARCH_FEATURE (AVX2_Usable),
+			      __stpncpy_avx2)
 	      IFUNC_IMPL_ADD (array, i, stpncpy, 1,
 			      __stpncpy_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_sse2))
@@ -207,6 +209,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   IFUNC_IMPL (i, name, stpcpy,
 	      IFUNC_IMPL_ADD (array, i, stpcpy, HAS_CPU_FEATURE (SSSE3),
 			      __stpcpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, stpcpy, HAS_ARCH_FEATURE (AVX2_Usable),
+			      __stpcpy_avx2)
 	      IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2))
 
@@ -239,6 +243,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strcat.c.  */
   IFUNC_IMPL (i, name, strcat,
+	      IFUNC_IMPL_ADD (array, i, strcat, HAS_ARCH_FEATURE (AVX2_Usable),
+			      __strcat_avx2)
 	      IFUNC_IMPL_ADD (array, i, strcat, HAS_CPU_FEATURE (SSSE3),
 			      __strcat_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2_unaligned)
@@ -280,6 +286,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strcpy.c.  */
   IFUNC_IMPL (i, name, strcpy,
+	      IFUNC_IMPL_ADD (array, i, strcpy, HAS_ARCH_FEATURE (AVX2_Usable),
+			      __strcpy_avx2)
 	      IFUNC_IMPL_ADD (array, i, strcpy, HAS_CPU_FEATURE (SSSE3),
 			      __strcpy_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2_unaligned)
@@ -321,6 +329,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strncat.c.  */
   IFUNC_IMPL (i, name, strncat,
+	      IFUNC_IMPL_ADD (array, i, strncat, HAS_ARCH_FEATURE (AVX2_Usable),
+			      __strncat_avx2)
 	      IFUNC_IMPL_ADD (array, i, strncat, HAS_CPU_FEATURE (SSSE3),
 			      __strncat_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strncat, 1,
@@ -329,6 +339,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strncpy.c.  */
   IFUNC_IMPL (i, name, strncpy,
+	      IFUNC_IMPL_ADD (array, i, strncpy, HAS_ARCH_FEATURE (AVX2_Usable),
+			      __strncpy_avx2)
 	      IFUNC_IMPL_ADD (array, i, strncpy, HAS_CPU_FEATURE (SSSE3),
 			      __strncpy_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strncpy, 1,
diff --git a/sysdeps/x86_64/multiarch/ifunc-unaligned-ssse3.h b/sysdeps/x86_64/multiarch/ifunc-unaligned.h
similarity index 83%
rename from sysdeps/x86_64/multiarch/ifunc-unaligned-ssse3.h
rename to sysdeps/x86_64/multiarch/ifunc-unaligned.h
index 81805f9..4f2286f 100644
--- a/sysdeps/x86_64/multiarch/ifunc-unaligned-ssse3.h
+++ b/sysdeps/x86_64/multiarch/ifunc-unaligned.h
@@ -24,12 +24,18 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
   attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
 
 static inline void *
 IFUNC_SELECTOR (void)
 {
   const struct cpu_features* cpu_features = __get_cpu_features ();
 
+  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
+      && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)
+      && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+    return OPTIMIZE (avx2);
+
   if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
     return OPTIMIZE (sse2_unaligned);
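
The selector above consults glibc-internal CPU feature bits.  As a
user-space illustration only, GCC's __builtin_cpu_supports can reproduce
the AVX2_Usable half of the test; Prefer_No_VZEROUPPER and
AVX_Fast_Unaligned_Load are internal properties with no public
counterpart, so this sketch is necessarily an approximation:

    #include <stdio.h>

    int
    main (void)
    {
      /* Rough analogue of IFUNC_SELECTOR: prefer the AVX2 variant when
         the CPU supports AVX2, otherwise fall back.  */
      if (__builtin_cpu_supports ("avx2"))
        puts ("would select __strcpy_avx2");
      else
        puts ("would select an SSE2/SSSE3 variant");
      return 0;
    }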
 
diff --git a/sysdeps/x86_64/multiarch/stpcpy-avx2.S b/sysdeps/x86_64/multiarch/stpcpy-avx2.S
new file mode 100644
index 0000000..f0bd302
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/stpcpy-avx2.S
@@ -0,0 +1,3 @@
+#define USE_AS_STPCPY
+#define STRCPY __stpcpy_avx2
+#include "strcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/stpcpy.c b/sysdeps/x86_64/multiarch/stpcpy.c
index 1e340fc..5f0e63c 100644
--- a/sysdeps/x86_64/multiarch/stpcpy.c
+++ b/sysdeps/x86_64/multiarch/stpcpy.c
@@ -28,7 +28,7 @@
 # undef __stpcpy
 
 # define SYMBOL_NAME stpcpy
-# include "ifunc-unaligned-ssse3.h"
+# include "ifunc-unaligned.h"
 
 libc_ifunc_redirected (__redirect_stpcpy, __stpcpy, IFUNC_SELECTOR ());
 
diff --git a/sysdeps/x86_64/multiarch/stpncpy-avx2.S b/sysdeps/x86_64/multiarch/stpncpy-avx2.S
new file mode 100644
index 0000000..032b040
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/stpncpy-avx2.S
@@ -0,0 +1,4 @@
+#define USE_AS_STPCPY
+#define USE_AS_STRNCPY
+#define STRCPY __stpncpy_avx2
+#include "strcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/stpncpy.c b/sysdeps/x86_64/multiarch/stpncpy.c
index 28842ec..f87df0c 100644
--- a/sysdeps/x86_64/multiarch/stpncpy.c
+++ b/sysdeps/x86_64/multiarch/stpncpy.c
@@ -26,7 +26,7 @@
 # undef __stpncpy
 
 # define SYMBOL_NAME stpncpy
-# include "ifunc-unaligned-ssse3.h"
+# include "ifunc-unaligned.h"
 
 libc_ifunc_redirected (__redirect_stpncpy, __stpncpy, IFUNC_SELECTOR ());
 
diff --git a/sysdeps/x86_64/multiarch/strcat-avx2.S b/sysdeps/x86_64/multiarch/strcat-avx2.S
new file mode 100644
index 0000000..94c2a7a
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcat-avx2.S
@@ -0,0 +1,278 @@
+/* strcat with AVX2
+   Copyright (C) 2011-2018 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef STRCAT
+#  define STRCAT  __strcat_avx2
+# endif
+
+# define USE_AS_STRCAT
+
+/* Number of bytes in a vector register.  */
+# define VEC_SIZE	32
+
+.text
+ENTRY (STRCAT)
+	mov	%rdi, %r9
+# ifdef USE_AS_STRNCAT
+	mov	%rdx, %r8
+# endif
+
+/* Inline the corresponding strlen code here; this is temporary until
+   the new strcpy implementation gets merged.  */
+
+	xor	%eax, %eax
+	mov	%edi, %ecx
+	and	$((VEC_SIZE * 4) - 1), %ecx
+	vpxor	%xmm6, %xmm6, %xmm6
+	cmp	$(VEC_SIZE * 3), %ecx
+	ja	L(fourth_vector_boundary)
+	vpcmpeqb (%rdi), %ymm6, %ymm0
+	vpmovmskb %ymm0, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_first_vector)
+	mov	%rdi, %rax
+	and	$-VEC_SIZE, %rax
+	jmp	L(align_vec_size_start)
+L(fourth_vector_boundary):
+	mov	%rdi, %rax
+	and	$-VEC_SIZE, %rax
+	vpcmpeqb	(%rax), %ymm6, %ymm0
+	mov	$-1, %r10d
+	sub	%rax, %rcx
+	shl	%cl, %r10d
+	vpmovmskb %ymm0, %edx
+	and	%r10d, %edx
+	jnz	L(exit)
+
+L(align_vec_size_start):
+	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm0
+	vpmovmskb %ymm0, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_second_vector)
+
+	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
+	vpmovmskb %ymm1, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_third_vector)
+
+	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
+	vpmovmskb %ymm2, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_fourth_vector)
+
+	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
+	vpmovmskb %ymm3, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_fifth_vector)
+
+	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
+	add	$(VEC_SIZE * 4), %rax
+	vpmovmskb %ymm0, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_second_vector)
+
+	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
+	vpmovmskb %ymm1, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_third_vector)
+
+	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
+	vpmovmskb %ymm2, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_fourth_vector)
+
+	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
+	vpmovmskb %ymm3, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_fifth_vector)
+
+	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
+	add	$(VEC_SIZE * 4), %rax
+	vpmovmskb %ymm0, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_second_vector)
+
+	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
+	vpmovmskb %ymm1, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_third_vector)
+
+	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
+	vpmovmskb %ymm2, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_fourth_vector)
+
+	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
+	vpmovmskb %ymm3, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_fifth_vector)
+
+	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
+	add	$(VEC_SIZE * 4), %rax
+	vpmovmskb %ymm0, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_second_vector)
+
+	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
+	vpmovmskb %ymm1, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_third_vector)
+
+	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
+	vpmovmskb %ymm2, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_fourth_vector)
+
+	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
+	vpmovmskb %ymm3, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_fifth_vector)
+
+	test	$((VEC_SIZE * 4) - 1), %rax
+	jz	L(align_four_vec_loop)
+
+	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
+	add	$(VEC_SIZE * 5), %rax
+	vpmovmskb %ymm0, %edx
+	test	%edx, %edx
+	jnz	L(exit)
+
+	test	$((VEC_SIZE * 4) - 1), %rax
+	jz	L(align_four_vec_loop)
+
+	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm1
+	add	$VEC_SIZE, %rax
+	vpmovmskb %ymm1, %edx
+	test	%edx, %edx
+	jnz	L(exit)
+
+	test	$((VEC_SIZE * 4) - 1), %rax
+	jz	L(align_four_vec_loop)
+
+	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm2
+	add	$VEC_SIZE, %rax
+	vpmovmskb %ymm2, %edx
+	test	%edx, %edx
+	jnz	L(exit)
+
+	test	$((VEC_SIZE * 4) - 1), %rax
+	jz	L(align_four_vec_loop)
+
+	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm3
+	add	$VEC_SIZE, %rax
+	vpmovmskb %ymm3, %edx
+	test	%edx, %edx
+	jnz	L(exit)
+
+	add	$VEC_SIZE, %rax
+
+	.p2align 4
+L(align_four_vec_loop):
+	vmovaps	(%rax),	%ymm4
+	vpminub	VEC_SIZE(%rax),	%ymm4, %ymm4
+	vmovaps	(VEC_SIZE * 2)(%rax),	%ymm5
+	vpminub	(VEC_SIZE * 3)(%rax),	%ymm5, %ymm5
+	add	$(VEC_SIZE * 4),	%rax
+	vpminub	%ymm4,	%ymm5, %ymm5
+	vpcmpeqb %ymm5,	%ymm6, %ymm5
+	vpmovmskb %ymm5,	%edx
+	test	%edx,	%edx
+	jz	L(align_four_vec_loop)
+
+	vpcmpeqb -(VEC_SIZE * 4)(%rax), %ymm6, %ymm0
+	sub	$(VEC_SIZE * 5),	%rax
+	vpmovmskb %ymm0, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_second_vector)
+
+	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
+	vpmovmskb %ymm1, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_third_vector)
+
+	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
+	vpmovmskb %ymm2, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_fourth_vector)
+
+	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
+	vpmovmskb %ymm3, %edx
+	sub	%rdi, %rax
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	add	$(VEC_SIZE * 4), %rax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit):
+	sub	%rdi, %rax
+L(exit_null_on_first_vector):
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit_null_on_second_vector):
+	sub	%rdi, %rax
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	add	$VEC_SIZE, %rax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit_null_on_third_vector):
+	sub	%rdi, %rax
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	add	$(VEC_SIZE * 2), %rax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit_null_on_fourth_vector):
+	sub	%rdi, %rax
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	add	$(VEC_SIZE * 3), %rax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit_null_on_fifth_vector):
+	sub	%rdi, %rax
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	add	$(VEC_SIZE * 4), %rax
+
+	.p2align 4
+L(StartStrcpyPart):
+	lea	(%r9, %rax), %rdi
+	mov	%rsi, %rcx
+	mov	%r9, %rax      /* save result */
+
+# ifdef USE_AS_STRNCAT
+	test	%r8, %r8
+	jz	L(ExitZero)
+#  define USE_AS_STRNCPY
+# endif
+
+# include "strcpy-avx2.S"
+#endif
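
The vpcmpeqb/vpmovmskb/bsf idiom that the inlined strlen part above
repeats is easier to read in intrinsics form.  A minimal sketch under two
assumptions: the compiler targets AVX2, and, unlike the real code, no
page-boundary handling protects the unaligned 32-byte loads:

    #include <immintrin.h>
    #include <stddef.h>

    static size_t
    strlen_avx2_sketch (const char *s)
    {
      const __m256i zero = _mm256_setzero_si256 ();
      for (size_t i = 0; ; i += 32)
        {
          __m256i v = _mm256_loadu_si256 ((const __m256i *) (s + i));
          /* vpcmpeqb: 0xff in every byte lane that equals zero.  */
          __m256i eq = _mm256_cmpeq_epi8 (v, zero);
          /* vpmovmskb: gather the high bit of each lane into an int.  */
          unsigned int mask = _mm256_movemask_epi8 (eq);
          if (mask != 0)
            return i + __builtin_ctz (mask);  /* bsf, in effect.  */
        }
    }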
diff --git a/sysdeps/x86_64/multiarch/strcat.c b/sysdeps/x86_64/multiarch/strcat.c
index 1f7f626..4b41e3b 100644
--- a/sysdeps/x86_64/multiarch/strcat.c
+++ b/sysdeps/x86_64/multiarch/strcat.c
@@ -24,7 +24,7 @@
 # undef strcat
 
 # define SYMBOL_NAME strcat
-# include "ifunc-unaligned-ssse3.h"
+# include "ifunc-unaligned.h"
 
 libc_ifunc_redirected (__redirect_strcat, strcat, IFUNC_SELECTOR ());
 
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
new file mode 100644
index 0000000..1215bc3
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -0,0 +1,3341 @@
+/* strcpy with AVX2
+   Copyright (C) 2011-2018 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# ifndef USE_AS_STRCAT
+#  include <sysdep.h>
+
+#  ifndef STRCPY
+#   define STRCPY  __strcpy_avx2
+#  endif
+
+# endif
+
+# define JMPTBL(I, B)	I - B
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)             \
+	lea	TABLE(%rip), %r11;                              \
+	movslq	(%r11, INDEX, SCALE), %rcx;                     \
+	lea	(%r11, %rcx), %rcx;                             \
+	_CET_NOTRACK jmp *%rcx
+
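
BRANCH_TO_JMPTBL_ENTRY above dispatches through a table of 32-bit offsets
relative to the table itself (JMPTBL (I, B) = I - B), so the table needs
no run-time relocations.  A loose C analogue with made-up names, using
ordinary function pointers as the closest portable stand-in for those
relative offsets:

    #include <stdio.h>

    /* Hypothetical stand-ins for the L(Exit1)..L(ExitN) labels.  */
    static void exit1 (void) { puts ("copy 1 byte, then return"); }
    static void exit2 (void) { puts ("copy 2 bytes, then return"); }
    static void exit3 (void) { puts ("copy 3 bytes, then return"); }

    int
    main (void)
    {
      static void (*const exit_table[]) (void) = { exit1, exit2, exit3 };
      unsigned int index = 1;  /* e.g. bsf result: null byte at offset 1.  */
      exit_table[index] ();    /* The indirect jmp *%rcx, in effect.  */
      return 0;
    }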
+/* Number of bytes in a vector register.  */
+# ifndef VEC_SIZE
+#  define VEC_SIZE	32
+# endif
+
+# ifndef VZEROUPPER
+#  define VZEROUPPER	vzeroupper
+# endif
+
+# ifndef USE_AS_STRCAT
+
+.text
+ENTRY (STRCPY)
+#  ifdef USE_AS_STRNCPY
+	mov	%rdx, %r8
+	test	%r8, %r8
+	jz	L(ExitZero)
+#  endif
+	mov	%rsi, %rcx
+#  ifndef USE_AS_STPCPY
+	mov	%rdi, %rax      /* save result */
+#  endif
+
+# endif
+
+	and	$((VEC_SIZE * 4) - 1), %rcx
+	cmp	$(VEC_SIZE * 2), %rcx
+	jbe	L(SourceStringAlignmentLessTwoVecSize)
+
+	and	$-VEC_SIZE, %rsi
+	and	$(VEC_SIZE - 1), %rcx
+	vpxor	%xmm0, %xmm0, %xmm0
+	vpxor	%xmm1, %xmm1, %xmm1
+
+	vpcmpeqb (%rsi), %ymm1, %ymm1
+	vpmovmskb %ymm1, %rdx
+	shr	%cl, %rdx
+
+# ifdef USE_AS_STRNCPY
+#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
+	mov	$VEC_SIZE, %r10
+	sub	%rcx, %r10
+	cmp	%r10, %r8
+#  else
+	mov	$(VEC_SIZE + 1), %r10
+	sub	%rcx, %r10
+	cmp	%r10, %r8
+#  endif
+	jbe	L(CopyVecSizeTailCase2OrCase3)
+# endif
+	test	%rdx, %rdx
+	jnz	L(CopyVecSizeTail)
+
+	vpcmpeqb VEC_SIZE(%rsi), %ymm0, %ymm0
+	vpmovmskb %ymm0, %rdx
+
+# ifdef USE_AS_STRNCPY
+	add	$VEC_SIZE, %r10
+	cmp	%r10, %r8
+	jbe	L(CopyTwoVecSizeCase2OrCase3)
+# endif
+	test	%rdx, %rdx
+	jnz	L(CopyTwoVecSize)
+
+	vmovdqu (%rsi, %rcx), %ymm1   /* copy VEC_SIZE bytes */
+	vmovdqu %ymm1, (%rdi)
+
+/* If source address alignment != destination address alignment.  */
+	.p2align 4
+L(UnalignVecSizeBoth):
+	sub	%rcx, %rdi
+# ifdef USE_AS_STRNCPY
+	add	%rcx, %r8
+	sbb	%rcx, %rcx
+	or	%rcx, %r8
+# endif
+	mov	$VEC_SIZE, %rcx
+	vmovdqa (%rsi, %rcx), %ymm1
+	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
+	vmovdqu %ymm1, (%rdi, %rcx)
+	vpcmpeqb %ymm2, %ymm0, %ymm0
+	vpmovmskb %ymm0, %rdx
+	add	$VEC_SIZE, %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$(VEC_SIZE * 3), %r8
+	jbe	L(CopyVecSizeCase2OrCase3)
+# endif
+	test	%rdx, %rdx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	jnz	L(CopyVecSizeUnalignedVec2)
+# else
+	jnz	L(CopyVecSize)
+# endif
+
+	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
+	vmovdqu %ymm2, (%rdi, %rcx)
+	vpcmpeqb %ymm3, %ymm0, %ymm0
+	vpmovmskb %ymm0, %rdx
+	add	$VEC_SIZE, %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$VEC_SIZE, %r8
+	jbe	L(CopyVecSizeCase2OrCase3)
+# endif
+	test	%rdx, %rdx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	jnz	L(CopyVecSizeUnalignedVec3)
+# else
+	jnz	L(CopyVecSize)
+# endif
+
+	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm4
+	vmovdqu %ymm3, (%rdi, %rcx)
+	vpcmpeqb %ymm4, %ymm0, %ymm0
+	vpmovmskb %ymm0, %rdx
+	add	$VEC_SIZE, %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$VEC_SIZE, %r8
+	jbe	L(CopyVecSizeCase2OrCase3)
+# endif
+	test	%rdx, %rdx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	jnz	L(CopyVecSizeUnalignedVec4)
+# else
+	jnz	L(CopyVecSize)
+# endif
+
+	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm1
+	vmovdqu %ymm4, (%rdi, %rcx)
+	vpcmpeqb %ymm1, %ymm0, %ymm0
+	vpmovmskb %ymm0, %rdx
+	add	$VEC_SIZE, %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$VEC_SIZE, %r8
+	jbe	L(CopyVecSizeCase2OrCase3)
+# endif
+	test	%rdx, %rdx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	jnz	L(CopyVecSizeUnalignedVec1)
+# else
+	jnz	L(CopyVecSize)
+# endif
+
+	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
+	vmovdqu %ymm1, (%rdi, %rcx)
+	vpcmpeqb %ymm2, %ymm0, %ymm0
+	vpmovmskb %ymm0, %rdx
+	add	$VEC_SIZE, %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$VEC_SIZE, %r8
+	jbe	L(CopyVecSizeCase2OrCase3)
+# endif
+	test	%rdx, %rdx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	jnz	L(CopyVecSizeUnalignedVec2)
+# else
+	jnz	L(CopyVecSize)
+# endif
+
+	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
+	vmovdqu %ymm2, (%rdi, %rcx)
+	vpcmpeqb %ymm3, %ymm0, %ymm0
+	vpmovmskb %ymm0, %rdx
+	add	$VEC_SIZE, %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$VEC_SIZE, %r8
+	jbe	L(CopyVecSizeCase2OrCase3)
+# endif
+	test	%rdx, %rdx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	jnz	L(CopyVecSizeUnalignedVec3)
+# else
+	jnz	L(CopyVecSize)
+# endif
+
+	vmovdqu %ymm3, (%rdi, %rcx)
+	mov	%rsi, %rdx
+	lea	VEC_SIZE(%rsi, %rcx), %rsi
+	and	$-(VEC_SIZE * 4), %rsi
+	sub	%rsi, %rdx
+	sub	%rdx, %rdi
+# ifdef USE_AS_STRNCPY
+	lea	(VEC_SIZE * 8)(%r8, %rdx), %r8
+# endif
+L(UnalignedFourVecSizeLoop):
+	vmovdqa (%rsi), %ymm4
+	vmovdqa VEC_SIZE(%rsi), %ymm5
+	vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6
+	vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7
+	vpminub %ymm5, %ymm4, %ymm2
+	vpminub %ymm7, %ymm6, %ymm3
+	vpminub %ymm2, %ymm3, %ymm3
+	vpcmpeqb %ymm0, %ymm3, %ymm3
+	vpmovmskb %ymm3, %rdx
+# ifdef USE_AS_STRNCPY
+	sub	$(VEC_SIZE * 4), %r8
+	jbe	L(UnalignedLeaveCase2OrCase3)
+# endif
+	test	%rdx, %rdx
+	jnz	L(UnalignedFourVecSizeLeave)
+
+L(UnalignedFourVecSizeLoop_start):
+	add	$(VEC_SIZE * 4), %rdi
+	add	$(VEC_SIZE * 4), %rsi
+	vmovdqu %ymm4, -(VEC_SIZE * 4)(%rdi)
+	vmovdqa (%rsi), %ymm4
+	vmovdqu %ymm5, -(VEC_SIZE * 3)(%rdi)
+	vmovdqa VEC_SIZE(%rsi), %ymm5
+	vpminub %ymm5, %ymm4, %ymm2
+	vmovdqu %ymm6, -(VEC_SIZE * 2)(%rdi)
+	vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6
+	vmovdqu %ymm7, -VEC_SIZE(%rdi)
+	vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7
+	vpminub %ymm7, %ymm6, %ymm3
+	vpminub %ymm2, %ymm3, %ymm3
+	vpcmpeqb %ymm0, %ymm3, %ymm3
+	vpmovmskb %ymm3, %rdx
+# ifdef USE_AS_STRNCPY
+	sub	$(VEC_SIZE * 4), %r8
+	jbe	L(UnalignedLeaveCase2OrCase3)
+# endif
+	test	%rdx, %rdx
+	jz	L(UnalignedFourVecSizeLoop_start)
+
+L(UnalignedFourVecSizeLeave):
+	vpxor	%xmm1, %xmm1, %xmm1
+
+	vpcmpeqb %ymm4, %ymm0, %ymm0
+	vpmovmskb %ymm0, %rdx
+	test	%rdx, %rdx
+	jnz	L(CopyVecSizeUnaligned_0)
+
+	vpcmpeqb %ymm5, %ymm1, %ymm1
+	vpmovmskb %ymm1, %rcx
+	test	%rcx, %rcx
+	jnz	L(CopyVecSizeUnaligned_16)
+
+	vpcmpeqb %ymm6, %ymm0, %ymm0
+	vpmovmskb %ymm0, %rdx
+	test	%rdx, %rdx
+	jnz	L(CopyVecSizeUnaligned_32)
+
+	vpcmpeqb %ymm7, %ymm1, %ymm1
+	vpmovmskb %ymm1, %rcx
+	bsf	%rcx, %rdx
+	vmovdqu %ymm4, (%rdi)
+	vmovdqu %ymm5, VEC_SIZE(%rdi)
+	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+#  ifdef USE_AS_STPCPY
+	lea	(VEC_SIZE * 3)(%rdi, %rdx), %rax
+#  endif
+	vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi)
+	add	$(VEC_SIZE - 1), %r8
+	sub	%rdx, %r8
+	lea	((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi
+	jmp	L(StrncpyFillTailWithZero)
+# else
+	add	$(VEC_SIZE * 3), %rsi
+	add	$(VEC_SIZE * 3), %rdi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+# endif
+
+/* If source address alignment == destination address alignment.  */
+
+L(SourceStringAlignmentLessTwoVecSize):
+	vpxor	%xmm0, %xmm0, %xmm0
+	vmovdqu (%rsi), %ymm1
+	vmovdqu VEC_SIZE(%rsi), %ymm2
+	vpcmpeqb %ymm1, %ymm0, %ymm0
+	vpmovmskb %ymm0, %rdx
+
+# ifdef USE_AS_STRNCPY
+#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
+	cmp	$VEC_SIZE, %r8
+#  else
+	cmp	$(VEC_SIZE + 1), %r8
+#  endif
+	jbe	L(CopyVecSizeTail1Case2OrCase3)
+# endif
+	test	%rdx, %rdx
+	jnz	L(CopyVecSizeTail1)
+
+	vpcmpeqb %ymm2, %ymm0, %ymm0
+	vmovdqu %ymm1, (%rdi)
+	vpmovmskb %ymm0, %rdx
+
+# ifdef USE_AS_STRNCPY
+#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
+	cmp	$(VEC_SIZE * 2), %r8
+#  else
+	cmp	$((VEC_SIZE * 2) + 1), %r8
+#  endif
+	jbe	L(CopyTwoVecSize1Case2OrCase3)
+# endif
+	test	%rdx, %rdx
+	jnz	L(CopyTwoVecSize1)
+
+	and	$-VEC_SIZE, %rsi
+	and	$(VEC_SIZE - 1), %rcx
+	jmp	L(UnalignVecSizeBoth)
+
+/*------End of main part with loops---------------------*/
+
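
The four-vector loop above (L(UnalignedFourVecSizeLoop)) tests
4 * VEC_SIZE bytes per iteration for a null byte by folding the vectors
together with vpminub: the byte-wise minimum of the four vectors has a
zero lane iff at least one input does.  A minimal intrinsics sketch of
just that test, with a hypothetical helper name (the real code uses
aligned loads):

    #include <immintrin.h>

    /* Nonzero iff any of the 128 bytes at P is zero.  */
    static int
    has_zero_in_four_vecs (const char *p)
    {
      __m256i v0 = _mm256_loadu_si256 ((const __m256i *) p);
      __m256i v1 = _mm256_loadu_si256 ((const __m256i *) (p + 32));
      __m256i v2 = _mm256_loadu_si256 ((const __m256i *) (p + 64));
      __m256i v3 = _mm256_loadu_si256 ((const __m256i *) (p + 96));
      /* Three vpminub steps, mirroring the assembly.  */
      __m256i m = _mm256_min_epu8 (_mm256_min_epu8 (v0, v1),
                                   _mm256_min_epu8 (v2, v3));
      __m256i eq = _mm256_cmpeq_epi8 (m, _mm256_setzero_si256 ());
      return _mm256_movemask_epi8 (eq);  /* vpmovmskb */
    }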
+/* Case1 */
+
+# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
+	.p2align 4
+L(CopyVecSize):
+	add	%rcx, %rdi
+	add	%rcx, %rsi
+	bsf	%rdx, %rdx
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+# endif
+	.p2align 4
+L(CopyVecSizeTail):
+	add	%rcx, %rsi
+	bsf	%rdx, %rdx
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+
+	.p2align 4
+L(CopyTwoVecSize1):
+	add	$VEC_SIZE, %rsi
+	add	$VEC_SIZE, %rdi
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$VEC_SIZE, %r8
+# endif
+L(CopyVecSizeTail1):
+	bsf	%rdx, %rdx
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+
+	.p2align 4
+L(CopyTwoVecSize):
+	bsf	%rdx, %rdx
+	add	%rcx, %rsi
+	add	$VEC_SIZE, %rdx
+	sub	%rcx, %rdx
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+
+	.p2align 4
+L(CopyVecSizeUnaligned_0):
+	bsf	%rdx, %rdx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+#  ifdef USE_AS_STPCPY
+	lea	(%rdi, %rdx), %rax
+#  endif
+	vmovdqu %ymm4, (%rdi)
+	add	$((VEC_SIZE * 4) - 1), %r8
+	sub	%rdx, %r8
+	lea	1(%rdi, %rdx), %rdi
+	jmp	L(StrncpyFillTailWithZero)
+# else
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+# endif
+
+	.p2align 4
+L(CopyVecSizeUnaligned_16):
+	bsf	%rcx, %rdx
+	vmovdqu %ymm4, (%rdi)
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+#  ifdef USE_AS_STPCPY
+	lea	VEC_SIZE(%rdi, %rdx), %rax
+#  endif
+	vmovdqu %ymm5, VEC_SIZE(%rdi)
+	add	$((VEC_SIZE * 3) - 1), %r8
+	sub	%rdx, %r8
+	lea	(VEC_SIZE + 1)(%rdi, %rdx), %rdi
+	jmp	L(StrncpyFillTailWithZero)
+# else
+	add	$VEC_SIZE, %rsi
+	add	$VEC_SIZE, %rdi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+# endif
+
+	.p2align 4
+L(CopyVecSizeUnaligned_32):
+	bsf	%rdx, %rdx
+	vmovdqu %ymm4, (%rdi)
+	vmovdqu %ymm5, VEC_SIZE(%rdi)
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+#  ifdef USE_AS_STPCPY
+	lea	(VEC_SIZE * 2)(%rdi, %rdx), %rax
+#  endif
+	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
+	add	$((VEC_SIZE * 2) - 1), %r8
+	sub	%rdx, %r8
+	lea	((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi
+	jmp	L(StrncpyFillTailWithZero)
+# else
+	add	$(VEC_SIZE * 2), %rsi
+	add	$(VEC_SIZE * 2), %rdi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+# endif
+
+# ifdef USE_AS_STRNCPY
+#  ifndef USE_AS_STRCAT
+	.p2align 4
+L(CopyVecSizeUnalignedVec6):
+	vmovdqu %ymm6, (%rdi, %rcx)
+	jmp	L(CopyVecSizeVecExit)
+
+	.p2align 4
+L(CopyVecSizeUnalignedVec5):
+	vmovdqu %ymm5, (%rdi, %rcx)
+	jmp	L(CopyVecSizeVecExit)
+
+	.p2align 4
+L(CopyVecSizeUnalignedVec4):
+	vmovdqu %ymm4, (%rdi, %rcx)
+	jmp	L(CopyVecSizeVecExit)
+
+	.p2align 4
+L(CopyVecSizeUnalignedVec3):
+	vmovdqu %ymm3, (%rdi, %rcx)
+	jmp	L(CopyVecSizeVecExit)
+
+	.p2align 4
+L(CopyVecSizeUnalignedVec1):
+	vmovdqu %ymm1, (%rdi, %rcx)
+	jmp	L(CopyVecSizeVecExit)
+#  endif
+
+	.p2align 4
+L(CopyVecSizeExit):
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+
+/* Case2 */
+
+	.p2align 4
+L(CopyVecSizeCase2):
+	add	$VEC_SIZE, %r8
+	add	%rcx, %rdi
+	add	%rcx, %rsi
+	bsf	%rdx, %rdx
+	cmp	%r8, %rdx
+	jb	L(CopyVecSizeExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+	.p2align 4
+L(CopyTwoVecSizeCase2):
+	add	%rcx, %rsi
+	bsf	%rdx, %rdx
+	add	$VEC_SIZE, %rdx
+	sub	%rcx, %rdx
+	cmp	%r8, %rdx
+	jb	L(CopyVecSizeExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+L(CopyVecSizeTailCase2):
+	add	%rcx, %rsi
+	bsf	%rdx, %rdx
+	cmp	%r8, %rdx
+	jb	L(CopyVecSizeExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+L(CopyVecSizeTail1Case2):
+	bsf	%rdx, %rdx
+	cmp	%r8, %rdx
+	jb	L(CopyVecSizeExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+/* Case2 or Case3,  Case3 */
+
+	.p2align 4
+L(CopyVecSizeCase2OrCase3):
+	test	%rdx, %rdx
+	jnz	L(CopyVecSizeCase2)
+L(CopyVecSizeCase3):
+	add	$VEC_SIZE, %r8
+	add	%rcx, %rdi
+	add	%rcx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+	.p2align 4
+L(CopyTwoVecSizeCase2OrCase3):
+	test	%rdx, %rdx
+	jnz	L(CopyTwoVecSizeCase2)
+	add	%rcx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+	.p2align 4
+L(CopyVecSizeTailCase2OrCase3):
+	test	%rdx, %rdx
+	jnz	L(CopyVecSizeTailCase2)
+	add	%rcx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+	.p2align 4
+L(CopyTwoVecSize1Case2OrCase3):
+	add	$VEC_SIZE, %rdi
+	add	$VEC_SIZE, %rsi
+	sub	$VEC_SIZE, %r8
+L(CopyVecSizeTail1Case2OrCase3):
+	test	%rdx, %rdx
+	jnz	L(CopyVecSizeTail1Case2)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+# endif
+
+/*------------Exit labels for copying 1..VEC_SIZE bytes and 1..(VEC_SIZE * 2) bytes----*/
+
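
Each L(ExitN) below copies exactly N bytes using a few loads and stores
whose widths are powers of two, overlapping where N is not a power of
two; L(Exit13), for instance, moves bytes 0..7 and then bytes 5..12,
writing bytes 5..7 twice.  A C sketch of the same idiom for that one
length (a constant-size memcpy compiles to a single word move):

    #include <string.h>

    /* Copy exactly 13 bytes the way L(Exit13) does: two overlapping
       8-byte moves instead of thirteen 1-byte ones.  */
    static void
    copy13 (char *dst, const char *src)
    {
      memcpy (dst, src, 8);
      memcpy (dst + 5, src + 5, 8);
    }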
+	.p2align 4
+L(Exit1):
+	mov	%dh, (%rdi)
+# ifdef USE_AS_STPCPY
+	lea	(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$1, %r8
+	lea	1(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
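
The sub/lea/jnz tail repeated in each exit implements strncpy's
zero-fill requirement: once the string and its terminator have been
copied, the rest of the n-byte destination must be zeroed.  In C terms,
as a hypothetical helper (COPIED counts the bytes written including the
terminator):

    #include <string.h>

    static void
    fill_tail (char *dst, size_t n, size_t copied)
    {
      /* %r8 holds n minus the bytes consumed; when the sub leaves it
         nonzero, L(StrncpyFillTailWithZero) pads with zero bytes.  */
      if (n > copied)
        memset (dst + copied, 0, n - copied);
    }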
+	.p2align 4
+L(Exit2):
+	mov	(%rsi), %dx
+	mov	%dx, (%rdi)
+# ifdef USE_AS_STPCPY
+	lea	1(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$2, %r8
+	lea	2(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit3):
+	mov	(%rsi), %cx
+	mov	%cx, (%rdi)
+	mov	%dh, 2(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	2(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$3, %r8
+	lea	3(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit4):
+	mov	(%rsi), %edx
+	mov	%edx, (%rdi)
+# ifdef USE_AS_STPCPY
+	lea	3(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$4, %r8
+	lea	4(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit5):
+	mov	(%rsi), %ecx
+	mov	%dh, 4(%rdi)
+	mov	%ecx, (%rdi)
+# ifdef USE_AS_STPCPY
+	lea	4(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$5, %r8
+	lea	5(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit6):
+	mov	(%rsi), %ecx
+	mov	4(%rsi), %dx
+	mov	%ecx, (%rdi)
+	mov	%dx, 4(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	5(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$6, %r8
+	lea	6(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit7):
+	mov	(%rsi), %ecx
+	mov	3(%rsi), %edx
+	mov	%ecx, (%rdi)
+	mov	%edx, 3(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	6(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$7, %r8
+	lea	7(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit8):
+	mov	(%rsi), %rdx
+	mov	%rdx, (%rdi)
+# ifdef USE_AS_STPCPY
+	lea	7(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$8, %r8
+	lea	8(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit9):
+	mov	(%rsi), %rcx
+	mov	%dh, 8(%rdi)
+	mov	%rcx, (%rdi)
+# ifdef USE_AS_STPCPY
+	lea	8(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$9, %r8
+	lea	9(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit10):
+	mov	(%rsi), %rcx
+	mov	8(%rsi), %dx
+	mov	%rcx, (%rdi)
+	mov	%dx, 8(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	9(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$10, %r8
+	lea	10(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit11):
+	mov	(%rsi), %rcx
+	mov	7(%rsi), %edx
+	mov	%rcx, (%rdi)
+	mov	%edx, 7(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	10(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$11, %r8
+	lea	11(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit12):
+	mov	(%rsi), %rcx
+	mov	8(%rsi), %edx
+	mov	%rcx, (%rdi)
+	mov	%edx, 8(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	11(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$12, %r8
+	lea	12(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit13):
+	mov	(%rsi), %rcx
+	mov	5(%rsi), %rdx
+	mov	%rcx, (%rdi)
+	mov	%rdx, 5(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	12(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$13, %r8
+	lea	13(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit14):
+	mov	(%rsi), %rcx
+	mov	6(%rsi), %rdx
+	mov	%rcx, (%rdi)
+	mov	%rdx, 6(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	13(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$14, %r8
+	lea	14(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit15):
+	mov	(%rsi), %rcx
+	mov	7(%rsi), %rdx
+	mov	%rcx, (%rdi)
+	mov	%rdx, 7(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	14(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$15, %r8
+	lea	15(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit16):
+	movdqu	(%rsi), %xmm0
+	movdqu	%xmm0, (%rdi)
+# ifdef USE_AS_STPCPY
+	lea	15(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$16, %r8
+	lea	16(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit17):
+	movdqu	(%rsi), %xmm0
+	movdqu	%xmm0, (%rdi)
+	mov	%dh, 16(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	16(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$17, %r8
+	lea	17(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit18):
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %cx
+	movdqu	%xmm0, (%rdi)
+	mov	%cx, 16(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	17(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$18, %r8
+	lea	18(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit19):
+	movdqu	(%rsi), %xmm0
+	mov	15(%rsi), %ecx
+	movdqu	%xmm0, (%rdi)
+	mov	%ecx, 15(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	18(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$19, %r8
+	lea	19(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit20):
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %ecx
+	movdqu	%xmm0, (%rdi)
+	mov	%ecx, 16(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	19(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$20, %r8
+	lea	20(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit21):
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %ecx
+	movdqu	%xmm0, (%rdi)
+	mov	%ecx, 16(%rdi)
+	mov	%dh, 20(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	20(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$21, %r8
+	lea	21(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit22):
+	movdqu	(%rsi), %xmm0
+	mov	14(%rsi), %rcx
+	movdqu	%xmm0, (%rdi)
+	mov	%rcx, 14(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	21(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$22, %r8
+	lea	22(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit23):
+	movdqu	(%rsi), %xmm0
+	mov	15(%rsi), %rcx
+	movdqu	%xmm0, (%rdi)
+	mov	%rcx, 15(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	22(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$23, %r8
+	lea	23(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit24):
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %rcx
+	movdqu	%xmm0, (%rdi)
+	mov	%rcx, 16(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	23(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$24, %r8
+	lea	24(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit25):
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %rcx
+	movdqu	%xmm0, (%rdi)
+	mov	%rcx, 16(%rdi)
+	mov	%dh, 24(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	24(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$25, %r8
+	lea	25(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit26):
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %rdx
+	mov	24(%rsi), %cx
+	movdqu	%xmm0, (%rdi)
+	mov	%rdx, 16(%rdi)
+	mov	%cx, 24(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	25(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$26, %r8
+	lea	26(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit27):
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %rdx
+	mov	23(%rsi), %ecx
+	movdqu	%xmm0, (%rdi)
+	mov	%rdx, 16(%rdi)
+	mov	%ecx, 23(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	26(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$27, %r8
+	lea	27(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit28):
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %rdx
+	mov	24(%rsi), %ecx
+	movdqu	%xmm0, (%rdi)
+	mov	%rdx, 16(%rdi)
+	mov	%ecx, 24(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	27(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$28, %r8
+	lea	28(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit29):
+	movdqu	(%rsi), %xmm0
+	movdqu	13(%rsi), %xmm2
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm2, 13(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	28(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$29, %r8
+	lea	29(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit30):
+	movdqu	(%rsi), %xmm0
+	movdqu	14(%rsi), %xmm2
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm2, 14(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	29(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$30, %r8
+	lea	30(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit31):
+	movdqu	(%rsi), %xmm0
+	movdqu	15(%rsi), %xmm2
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm2, 15(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	30(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$31, %r8
+	lea	31(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit32):
+	movdqu	(%rsi), %xmm0
+	movdqu	16(%rsi), %xmm2
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm2, 16(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	31(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$32, %r8
+	lea	32(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit33):
+	/* 0/32, 32/1 */
+	vmovdqu (%rsi), %ymm0
+	vmovdqu %ymm0, (%rdi)
+	mov	%dh, 32(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	32(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$33, %r8
+	lea	33(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit34):
+	/* 0/32, 32/2 */
+	vmovdqu (%rsi), %ymm0
+	mov	32(%rsi), %dx
+	vmovdqu %ymm0, (%rdi)
+	mov	%dx, 32(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	33(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$34, %r8
+	lea	34(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit35):
+	/* 0/32, 31/4 */
+	vmovdqu (%rsi), %ymm0
+	mov	31(%rsi), %edx
+	vmovdqu %ymm0, (%rdi)
+	mov	%edx, 31(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	34(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$35, %r8
+	lea	35(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit36):
+	/* 0/32, 32/4 */
+	vmovdqu (%rsi), %ymm0
+	mov	32(%rsi), %edx
+	vmovdqu %ymm0, (%rdi)
+	mov	%edx, 32(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	35(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$36, %r8
+	lea	36(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit37):
+	/* 0/32, 29/8 */
+	vmovdqu (%rsi), %ymm0
+	mov	29(%rsi), %rdx
+	vmovdqu %ymm0, (%rdi)
+	mov	%rdx, 29(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	36(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$37, %r8
+	lea	37(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit38):
+	/* 0/32, 30/8 */
+	vmovdqu (%rsi), %ymm0
+	mov	30(%rsi), %rdx
+	vmovdqu %ymm0, (%rdi)
+	mov	%rdx, 30(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	37(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$38, %r8
+	lea	38(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit39):
+	/* 0/32, 31/8 */
+	vmovdqu (%rsi), %ymm0
+	mov	31(%rsi), %rdx
+	vmovdqu %ymm0, (%rdi)
+	mov	%rdx, 31(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	38(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$39, %r8
+	lea	39(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit40):
+	/* 0/32, 32/8 */
+	vmovdqu (%rsi), %ymm0
+	mov	32(%rsi), %rdx
+	vmovdqu %ymm0, (%rdi)
+	mov	%rdx, 32(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	39(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$40, %r8
+	lea	40(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit41):
+	/* 0/32, 32/8, 40/1 */
+	vmovdqu (%rsi), %ymm0
+	mov	32(%rsi), %rcx
+	vmovdqu %ymm0, (%rdi)
+	mov	%rcx, 32(%rdi)
+	mov	%dh, 40(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	40(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$41, %r8
+	lea	41(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit42):
+	/* 0/32, 32/8, 40/2 */
+	vmovdqu (%rsi), %ymm0
+	mov	32(%rsi), %rcx
+	mov	40(%rsi), %dx
+	vmovdqu %ymm0, (%rdi)
+	mov	%rcx, 32(%rdi)
+	mov	%dx, 40(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	41(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$42, %r8
+	lea	42(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit43):
+	/* 0/32, 27/16 */
+	vmovdqu (%rsi), %ymm0
+	vmovdqu 27(%rsi), %xmm1
+	vmovdqu %ymm0, (%rdi)
+	vmovdqu %xmm1, 27(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	42(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$43, %r8
+	lea	43(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit44):
+	/* 0/32, 28/16 */
+	vmovdqu (%rsi), %ymm0
+	vmovdqu 28(%rsi), %xmm1
+	vmovdqu %ymm0, (%rdi)
+	vmovdqu %xmm1, 28(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	43(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$44, %r8
+	lea	44(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit45):
+	/* 0/32, 29/16 */
+	vmovdqu (%rsi), %ymm0
+	vmovdqu 29(%rsi), %xmm1
+	vmovdqu %ymm0, (%rdi)
+	vmovdqu %xmm1, 29(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	44(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$45, %r8
+	lea	45(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit46):
+	/* 0/32, 30/16 */
+	vmovdqu (%rsi), %ymm0
+	vmovdqu 30(%rsi), %xmm1
+	vmovdqu %ymm0, (%rdi)
+	vmovdqu %xmm1, 30(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	45(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$46, %r8
+	lea	46(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit47):
+	/* 0/32, 31/16 */
+	vmovdqu (%rsi), %ymm0
+	vmovdqu 31(%rsi), %xmm1
+	vmovdqu %ymm0, (%rdi)
+	vmovdqu %xmm1, 31(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	46(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$47, %r8
+	lea	47(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit48):
+	/* 0/32, 32/16 */
+	vmovdqu (%rsi), %ymm0
+	vmovdqu 32(%rsi), %xmm1
+	vmovdqu %ymm0, (%rdi)
+	vmovdqu %xmm1, 32(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	47(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$48, %r8
+	lea	48(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit49):
+	/* 0/32, 32/16, 48/1 */
+	vmovdqu (%rsi), %ymm0
+	vmovdqu 32(%rsi), %xmm1
+	vmovdqu %ymm0, (%rdi)
+	vmovdqu %xmm1, 32(%rdi)
+	mov	%dh, 48(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	48(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$49, %r8
+	lea	49(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit50):
+	/* 0/32, 32/16, 48/2 */
+	vmovdqu (%rsi), %ymm0
+	vmovdqu 32(%rsi), %xmm1
+	mov	48(%rsi), %dx
+	vmovdqu %ymm0, (%rdi)
+	vmovdqu %xmm1, 32(%rdi)
+	mov	%dx, 48(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	49(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$50, %r8
+	lea	50(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit51):
+	/* 0/32, 32/16, 47/4 */
+	vmovdqu (%rsi), %ymm0
+	vmovdqu 32(%rsi), %xmm1
+	mov	47(%rsi), %edx
+	vmovdqu %ymm0, (%rdi)
+	vmovdqu %xmm1, 32(%rdi)
+	mov	%edx, 47(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	50(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$51, %r8
+	lea	51(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit52):
+	/* 0/32, 32/16, 48/4 */
+	vmovdqu (%rsi), %ymm0
+	vmovdqu 32(%rsi), %xmm1
+	mov	48(%rsi), %edx
+	vmovdqu %ymm0, (%rdi)
+	vmovdqu %xmm1, 32(%rdi)
+	mov	%edx, 48(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	51(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$52, %r8
+	lea	52(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit53):
+	/* 0/32, 32/16, 45/8 */
+	vmovdqu (%rsi), %ymm0
+	vmovdqu 32(%rsi), %xmm1
+	mov	45(%rsi), %rdx
+	vmovdqu %ymm0, (%rdi)
+	vmovdqu %xmm1, 32(%rdi)
+	mov	%rdx, 45(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	52(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$53, %r8
+	lea	53(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit54):
+	/* 0/32, 32/16, 46/8 */
+	vmovdqu (%rsi), %ymm0
+	vmovdqu 32(%rsi), %xmm1
+	mov	46(%rsi), %rdx
+	vmovdqu %ymm0, (%rdi)
+	vmovdqu %xmm1, 32(%rdi)
+	mov	%rdx, 46(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	53(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$54, %r8
+	lea	54(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit55):
+	/* 0/32, 32/16, 47/8 */
+	vmovdqu (%rsi), %ymm0
+	vmovdqu 32(%rsi), %xmm1
+	mov	47(%rsi), %rdx
+	vmovdqu %ymm0, (%rdi)
+	vmovdqu %xmm1, 32(%rdi)
+	mov	%rdx, 47(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	54(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$55, %r8
+	lea	55(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit56):
+	/* 0/32, 32/16, 48/8 */
+	vmovdqu (%rsi), %ymm0
+	vmovdqu 32(%rsi), %xmm1
+	mov	48(%rsi), %rdx
+	vmovdqu %ymm0, (%rdi)
+	vmovdqu %xmm1, 32(%rdi)
+	mov	%rdx, 48(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	55(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$56, %r8
+	lea	56(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit57):
+	/* 0/32, 25/32 */
+	vmovdqu (%rsi), %ymm0
+	vmovdqu 25(%rsi), %ymm1
+	vmovdqu %ymm0, (%rdi)
+	vmovdqu %ymm1, 25(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	56(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$57, %r8
+	lea	57(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit58):
+	/* 0/32, 26/32 */
+	vmovdqu (%rsi), %ymm0
+	vmovdqu 26(%rsi), %ymm1
+	vmovdqu %ymm0, (%rdi)
+	vmovdqu %ymm1, 26(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	57(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$58, %r8
+	lea	58(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit59):
+	/* 0/32, 27/32 */
+	vmovdqu (%rsi), %ymm0
+	vmovdqu 27(%rsi), %ymm1
+	vmovdqu %ymm0, (%rdi)
+	vmovdqu %ymm1, 27(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	58(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$59, %r8
+	lea	59(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit60):
+	/* 0/32, 28/32 */
+	vmovdqu (%rsi), %ymm0
+	vmovdqu 28(%rsi), %ymm1
+	vmovdqu %ymm0, (%rdi)
+	vmovdqu %ymm1, 28(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	59(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$60, %r8
+	lea	60(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit61):
+	/* 0/32, 29/32 */
+	vmovdqu (%rsi), %ymm0
+	vmovdqu 29(%rsi), %ymm1
+	vmovdqu %ymm0, (%rdi)
+	vmovdqu %ymm1, 29(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	60(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$61, %r8
+	lea	61(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit62):
+	/* 0/32, 30/32 */
+	vmovdqu (%rsi), %ymm0
+	vmovdqu 30(%rsi), %ymm1
+	vmovdqu %ymm0, (%rdi)
+	vmovdqu %ymm1, 30(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	61(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$62, %r8
+	lea	62(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit63):
+	/* 0/32, 31/32 */
+	vmovdqu (%rsi), %ymm0
+	vmovdqu 31(%rsi), %ymm1
+	vmovdqu %ymm0, (%rdi)
+	vmovdqu %ymm1, 31(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	62(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$63, %r8
+	lea	63(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Exit64):
+	/* 0/32, 32/32 */
+	vmovdqu (%rsi), %ymm0
+	vmovdqu 32(%rsi), %ymm1
+	vmovdqu %ymm0, (%rdi)
+	vmovdqu %ymm1, 32(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	63(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$64, %r8
+	lea	64(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	VZEROUPPER
+	ret
+
+# ifdef USE_AS_STRNCPY
+
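
Unlike the L(ExitN) labels, the L(StrncpyExitN) labels below handle the
case where the length limit runs out before a null byte is found:
exactly N bytes are copied, with no zero fill, and a terminator is
written only in the strncat configuration (USE_AS_STRCAT).  As a
hypothetical C sketch:

    #include <string.h>

    static void
    strncpy_exit_n (char *dst, const char *src, size_t n, int as_strncat)
    {
      memcpy (dst, src, n);  /* Copy exactly n bytes, no padding.  */
      if (as_strncat)
        dst[n] = '\0';       /* strncat always terminates the result.  */
    }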
+	.p2align 4
+L(StrncpyExit0):
+#  ifdef USE_AS_STPCPY
+	mov	%rdi, %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, (%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit1):
+	mov	(%rsi), %dl
+	mov	%dl, (%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	1(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 1(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit2):
+	mov	(%rsi), %dx
+	mov	%dx, (%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	2(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 2(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit3):
+	mov	(%rsi), %cx
+	mov	2(%rsi), %dl
+	mov	%cx, (%rdi)
+	mov	%dl, 2(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	3(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 3(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit4):
+	mov	(%rsi), %edx
+	mov	%edx, (%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	4(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 4(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit5):
+	mov	(%rsi), %ecx
+	mov	4(%rsi), %dl
+	mov	%ecx, (%rdi)
+	mov	%dl, 4(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	5(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 5(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit6):
+	mov	(%rsi), %ecx
+	mov	4(%rsi), %dx
+	mov	%ecx, (%rdi)
+	mov	%dx, 4(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	6(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 6(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit7):
+	mov	(%rsi), %ecx
+	mov	3(%rsi), %edx
+	mov	%ecx, (%rdi)
+	mov	%edx, 3(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	7(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 7(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit8):
+	mov	(%rsi), %rdx
+	mov	%rdx, (%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	8(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 8(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit9):
+	mov	(%rsi), %rcx
+	mov	8(%rsi), %dl
+	mov	%rcx, (%rdi)
+	mov	%dl, 8(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	9(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 9(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit10):
+	mov	(%rsi), %rcx
+	mov	8(%rsi), %dx
+	mov	%rcx, (%rdi)
+	mov	%dx, 8(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	10(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 10(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit11):
+	mov	(%rsi), %rcx
+	mov	7(%rsi), %edx
+	mov	%rcx, (%rdi)
+	mov	%edx, 7(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	11(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 11(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit12):
+	mov	(%rsi), %rcx
+	mov	8(%rsi), %edx
+	mov	%rcx, (%rdi)
+	mov	%edx, 8(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	12(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 12(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit13):
+	mov	(%rsi), %rcx
+	mov	5(%rsi), %rdx
+	mov	%rcx, (%rdi)
+	mov	%rdx, 5(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	13(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 13(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit14):
+	mov	(%rsi), %rcx
+	mov	6(%rsi), %rdx
+	mov	%rcx, (%rdi)
+	mov	%rdx, 6(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	14(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 14(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit15):
+	mov	(%rsi), %rcx
+	mov	7(%rsi), %rdx
+	mov	%rcx, (%rdi)
+	mov	%rdx, 7(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	15(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 15(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit16):
+	movdqu	(%rsi), %xmm0
+	movdqu	%xmm0, (%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	16(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 16(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit17):
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %cl
+	movdqu	%xmm0, (%rdi)
+	mov	%cl, 16(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	17(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 17(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit18):
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %cx
+	movdqu	%xmm0, (%rdi)
+	mov	%cx, 16(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	18(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 18(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit19):
+	movdqu	(%rsi), %xmm0
+	mov	15(%rsi), %ecx
+	movdqu	%xmm0, (%rdi)
+	mov	%ecx, 15(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	19(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 19(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit20):
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %ecx
+	movdqu	%xmm0, (%rdi)
+	mov	%ecx, 16(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	20(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 20(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit21):
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %ecx
+	mov	20(%rsi), %dl
+	movdqu	%xmm0, (%rdi)
+	mov	%ecx, 16(%rdi)
+	mov	%dl, 20(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	21(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 21(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit22):
+	movdqu	(%rsi), %xmm0
+	mov	14(%rsi), %rcx
+	movdqu	%xmm0, (%rdi)
+	mov	%rcx, 14(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	22(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 22(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit23):
+	movdqu	(%rsi), %xmm0
+	mov	15(%rsi), %rcx
+	movdqu	%xmm0, (%rdi)
+	mov	%rcx, 15(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	23(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 23(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit24):
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %rcx
+	movdqu	%xmm0, (%rdi)
+	mov	%rcx, 16(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	24(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 24(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit25):
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %rdx
+	mov	24(%rsi), %cl
+	movdqu	%xmm0, (%rdi)
+	mov	%rdx, 16(%rdi)
+	mov	%cl, 24(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	25(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 25(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit26):
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %rdx
+	mov	24(%rsi), %cx
+	movdqu	%xmm0, (%rdi)
+	mov	%rdx, 16(%rdi)
+	mov	%cx, 24(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	26(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 26(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit27):
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %rdx
+	mov	23(%rsi), %ecx
+	movdqu	%xmm0, (%rdi)
+	mov	%rdx, 16(%rdi)
+	mov	%ecx, 23(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	27(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 27(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit28):
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %rdx
+	mov	24(%rsi), %ecx
+	movdqu	%xmm0, (%rdi)
+	mov	%rdx, 16(%rdi)
+	mov	%ecx, 24(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	28(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 28(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit29):
+	movdqu	(%rsi), %xmm0
+	movdqu	13(%rsi), %xmm2
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm2, 13(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	29(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 29(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit30):
+	movdqu	(%rsi), %xmm0
+	movdqu	14(%rsi), %xmm2
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm2, 14(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	30(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 30(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit31):
+	movdqu	(%rsi), %xmm0
+	movdqu	15(%rsi), %xmm2
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm2, 15(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	31(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 31(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit32):
+	movdqu	(%rsi), %xmm0
+	movdqu	16(%rsi), %xmm2
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm2, 16(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	32(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 32(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit33):
+	movdqu	(%rsi), %xmm0
+	movdqu	16(%rsi), %xmm2
+	mov	32(%rsi), %cl
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm2, 16(%rdi)
+	mov	%cl, 32(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	33(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 33(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit34):
+	/*  0/32, 32/2 */
+	vmovdqu (%rsi), %ymm0
+	mov	32(%rsi), %cx
+	vmovdqu %ymm0, (%rdi)
+	mov	%cx, 32(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	34(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 34(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit35):
+	/*  0/32, 31/4 */
+	vmovdqu (%rsi), %ymm0
+	mov	31(%rsi), %ecx
+	vmovdqu %ymm0, (%rdi)
+	mov	%ecx, 31(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	35(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 35(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit36):
+	/*  0/32, 32/4 */
+	vmovdqu (%rsi), %ymm0
+	mov	32(%rsi), %ecx
+	vmovdqu %ymm0, (%rdi)
+	mov	%ecx, 32(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	36(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 36(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit37):
+	/*  0/32, 29/8 */
+	vmovdqu (%rsi), %ymm0
+	mov	29(%rsi), %rcx
+	vmovdqu %ymm0, (%rdi)
+	mov	%rcx, 29(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	37(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 37(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit38):
+	/*  0/32, 30/8 */
+	vmovdqu (%rsi), %ymm0
+	mov	30(%rsi), %rcx
+	vmovdqu %ymm0, (%rdi)
+	mov	%rcx, 30(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	38(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 38(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit39):
+	/*  0/32, 31/8 */
+	vmovdqu (%rsi), %ymm0
+	mov	31(%rsi), %rcx
+	vmovdqu %ymm0, (%rdi)
+	mov	%rcx, 31(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	39(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 39(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit40):
+	/*  0/32, 32/8 */
+	vmovdqu (%rsi), %ymm0
+	mov	32(%rsi), %rcx
+	vmovdqu %ymm0, (%rdi)
+	mov	%rcx, 32(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	40(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 40(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit41):
+	/*  0/32, 32/8, 40/1 */
+	vmovdqu (%rsi), %ymm0
+	mov	32(%rsi), %rdx
+	mov	40(%rsi), %cl
+	vmovdqu %ymm0, (%rdi)
+	mov	%rdx, 32(%rdi)
+	mov	%cl, 40(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	41(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 41(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit42):
+	/*  0/32, 32/8, 40/2 */
+	vmovdqu (%rsi), %ymm0
+	mov	32(%rsi), %rdx
+	mov	40(%rsi), %cx
+	vmovdqu %ymm0, (%rdi)
+	mov	%rdx, 32(%rdi)
+	mov	%cx, 40(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	42(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 42(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit43):
+	/*  0/32, 27/16 */
+	vmovdqu (%rsi), %ymm0
+	vmovdqu 27(%rsi), %xmm2
+	vmovdqu %ymm0, (%rdi)
+	vmovdqu %xmm2, 27(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	43(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 43(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit44):
+	/*  0/32, 28/16 */
+	vmovdqu (%rsi), %ymm0
+	vmovdqu 28(%rsi), %xmm2
+	vmovdqu %ymm0, (%rdi)
+	vmovdqu %xmm2, 28(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	44(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 44(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit45):
+	/*  0/32, 29/16 */
+	vmovdqu (%rsi), %ymm0
+	vmovdqu 29(%rsi), %xmm2
+	vmovdqu %ymm0, (%rdi)
+	vmovdqu %xmm2, 29(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	45(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 45(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit46):
+	/*  0/32, 30/16 */
+	vmovdqu (%rsi), %ymm0
+	vmovdqu 30(%rsi), %xmm2
+	vmovdqu %ymm0, (%rdi)
+	vmovdqu %xmm2, 30(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	46(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 46(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit47):
+	/*  0/32, 31/16 */
+	vmovdqu (%rsi), %ymm0
+	vmovdqu 31(%rsi), %xmm2
+	vmovdqu %ymm0, (%rdi)
+	vmovdqu %xmm2, 31(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	47(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 47(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit48):
+	/*  0/32, 32/16 */
+	vmovdqu (%rsi), %ymm0
+	vmovdqu 32(%rsi), %xmm2
+	vmovdqu %ymm0, (%rdi)
+	vmovdqu %xmm2, 32(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	48(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 48(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit49):
+	/* 0/32, 32/16, 48/1 */
+	vmovdqu (%rsi), %ymm0
+	vmovdqu 32(%rsi), %xmm2
+	mov	48(%rsi), %cl
+	vmovdqu %ymm0, (%rdi)
+	vmovdqu %xmm2, 32(%rdi)
+	mov	%cl, 48(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	49(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 49(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit50):
+	/*  0/32, 32/16, 48/2 */
+	vmovdqu (%rsi), %ymm0
+	vmovdqu 32(%rsi), %xmm2
+	mov	48(%rsi), %cx
+	vmovdqu %ymm0, (%rdi)
+	vmovdqu %xmm2, 32(%rdi)
+	mov	%cx, 48(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	50(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 50(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit51):
+	/*  0/32, 32/16, 47/4 */
+	vmovdqu (%rsi), %ymm0
+	vmovdqu 32(%rsi), %xmm2
+	mov	47(%rsi), %ecx
+	vmovdqu %ymm0, (%rdi)
+	vmovdqu %xmm2, 32(%rdi)
+	mov	%ecx, 47(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	51(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 51(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit52):
+	/*  0/32, 32/16, 48/4 */
+	vmovdqu (%rsi), %ymm0
+	vmovdqu 32(%rsi), %xmm2
+	mov	48(%rsi), %ecx
+	vmovdqu %ymm0, (%rdi)
+	vmovdqu %xmm2, 32(%rdi)
+	mov	%ecx, 48(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	52(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 52(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit53):
+	/*  0/32, 32/16, 45/8 */
+	vmovdqu (%rsi), %ymm0
+	vmovdqu 32(%rsi), %xmm2
+	mov	45(%rsi), %rcx
+	vmovdqu %ymm0, (%rdi)
+	vmovdqu %xmm2, 32(%rdi)
+	mov	%rcx, 45(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	53(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 53(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit54):
+	/*  0/32, 32/16, 46/8 */
+	vmovdqu (%rsi), %ymm0
+	vmovdqu 32(%rsi), %xmm2
+	mov	46(%rsi), %rcx
+	vmovdqu %ymm0, (%rdi)
+	vmovdqu %xmm2, 32(%rdi)
+	mov	%rcx, 46(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	54(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 54(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit55):
+	/* 0/32, 32/16, 47/8 */
+	vmovdqu (%rsi), %ymm0
+	vmovdqu 32(%rsi), %xmm2
+	mov	47(%rsi), %rcx
+	vmovdqu %ymm0, (%rdi)
+	vmovdqu %xmm2, 32(%rdi)
+	mov	%rcx, 47(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	55(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 55(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit56):
+	/* 0/32, 32/16, 48/8 */
+	vmovdqu (%rsi), %ymm0
+	vmovdqu 32(%rsi), %xmm2
+	mov	48(%rsi), %rcx
+	vmovdqu %ymm0, (%rdi)
+	vmovdqu %xmm2, 32(%rdi)
+	mov	%rcx, 48(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	56(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 56(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit57):
+	/* 0/32, 25/32 */
+	vmovdqu (%rsi), %ymm0
+	vmovdqu 25(%rsi), %ymm2
+	vmovdqu %ymm0, (%rdi)
+	vmovdqu %ymm2, 25(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	57(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 57(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit58):
+	/* 0/32, 26/32 */
+	vmovdqu (%rsi), %ymm0
+	vmovdqu 26(%rsi), %ymm2
+	vmovdqu %ymm0, (%rdi)
+	vmovdqu %ymm2, 26(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	58(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 58(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit59):
+	/* 0/32, 27/32 */
+	vmovdqu (%rsi), %ymm0
+	vmovdqu 27(%rsi), %ymm2
+	vmovdqu %ymm0, (%rdi)
+	vmovdqu %ymm2, 27(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	59(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 59(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit60):
+	/* 0/32, 28/32 */
+	vmovdqu (%rsi), %ymm0
+	vmovdqu 28(%rsi), %ymm2
+	vmovdqu %ymm0, (%rdi)
+	vmovdqu %ymm2, 28(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	60(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 60(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit61):
+	/* 0/32, 29/32 */
+	vmovdqu (%rsi), %ymm0
+	vmovdqu 29(%rsi), %ymm2
+	vmovdqu %ymm0, (%rdi)
+	vmovdqu %ymm2, 29(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	61(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 61(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit62):
+	/* 0/32, 30/32 */
+	vmovdqu (%rsi), %ymm0
+	vmovdqu 30(%rsi), %ymm2
+	vmovdqu %ymm0, (%rdi)
+	vmovdqu %ymm2, 30(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	62(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 62(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit63):
+	/* 0/32, 31/32 */
+	vmovdqu (%rsi), %ymm0
+	vmovdqu 31(%rsi), %ymm2
+	vmovdqu %ymm0, (%rdi)
+	vmovdqu %ymm2, 31(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	63(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 63(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit64):
+	/* 0/32, 32/32 */
+	vmovdqu (%rsi), %ymm0
+	vmovdqu 32(%rsi), %ymm2
+	vmovdqu %ymm0, (%rdi)
+	vmovdqu %ymm2, 32(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	64(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 64(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(StrncpyExit65):
+	/* 0/32, 32/32, 64/1 */
+	vmovdqu (%rsi), %ymm0
+	vmovdqu 32(%rsi), %ymm2
+	mov	64(%rsi), %cl
+	vmovdqu %ymm0, (%rdi)
+	vmovdqu %ymm2, 32(%rdi)
+	mov	%cl, 64(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	65(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 65(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
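The L(StrncpyExitN) stubs above all follow one pattern: copy exactly N
bytes with a fixed sequence of the widest loads and stores available,
letting the last access overlap an earlier one instead of looping when
N is not a neat sum of chunks; under USE_AS_STPCPY they also leave
dst + N in %rax, and under USE_AS_STRCAT they store the terminating
NUL at dst + N.  A hypothetical C rendering of one of them,
L(StrncpyExit35) (32 bytes at offset 0, then 4 bytes at offset 31, so
bytes 31-34 are written twice):

    #include <stdint.h>
    #include <string.h>

    /* Hypothetical C model of L(StrncpyExit35): copy exactly 35
       bytes with two overlapping stores instead of a byte loop.  */
    static void
    copy35 (char *dst, const char *src)
    {
      unsigned char head[32];
      uint32_t tail;

      memcpy (head, src, 32);       /* vmovdqu (%rsi), %ymm0 */
      memcpy (&tail, src + 31, 4);  /* mov 31(%rsi), %ecx */
      memcpy (dst, head, 32);       /* vmovdqu %ymm0, (%rdi) */
      memcpy (dst + 31, &tail, 4);  /* mov %ecx, 31(%rdi) */
    }
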
+#  ifndef USE_AS_STRCAT
+
+	.p2align 4
+L(Fill0):
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Fill1):
+	mov	%dl, (%rdi)
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Fill2):
+	mov	%dx, (%rdi)
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Fill3):
+	mov	%edx, -1(%rdi)
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Fill4):
+	mov	%edx, (%rdi)
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Fill5):
+	mov	%edx, (%rdi)
+	mov	%dl, 4(%rdi)
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Fill6):
+	mov	%edx, (%rdi)
+	mov	%dx, 4(%rdi)
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Fill7):
+	mov	%rdx, -1(%rdi)
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Fill8):
+	mov	%rdx, (%rdi)
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Fill9):
+	mov	%rdx, (%rdi)
+	mov	%dl, 8(%rdi)
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Fill10):
+	mov	%rdx, (%rdi)
+	mov	%dx, 8(%rdi)
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Fill11):
+	mov	%rdx, (%rdi)
+	mov	%edx, 7(%rdi)
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Fill12):
+	mov	%rdx, (%rdi)
+	mov	%edx, 8(%rdi)
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Fill13):
+	mov	%rdx, (%rdi)
+	mov	%rdx, 5(%rdi)
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Fill14):
+	mov	%rdx, (%rdi)
+	mov	%rdx, 6(%rdi)
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Fill15):
+	vmovdqu %xmm0, -1(%rdi)
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Fill16):
+	vmovdqu %xmm0, (%rdi)
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Fill17):
+	vmovdqu %xmm0, (%rdi)
+	mov	%dl, 16(%rdi)
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Fill18):
+	vmovdqu %xmm0, (%rdi)
+	mov	%dx, 16(%rdi)
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Fill19):
+	vmovdqu %xmm0, (%rdi)
+	mov	%edx, 15(%rdi)
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Fill20):
+	vmovdqu %xmm0, (%rdi)
+	mov	%edx, 16(%rdi)
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Fill21):
+	vmovdqu %xmm0, (%rdi)
+	mov	%edx, 16(%rdi)
+	mov	%dl, 20(%rdi)
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Fill22):
+	vmovdqu %xmm0, (%rdi)
+	mov	%edx, 16(%rdi)
+	mov	%dx, 20(%rdi)
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Fill23):
+	vmovdqu %xmm0, (%rdi)
+	mov	%rdx, 15(%rdi)
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Fill24):
+	vmovdqu %xmm0, (%rdi)
+	mov	%rdx, 16(%rdi)
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Fill25):
+	vmovdqu %xmm0, (%rdi)
+	mov	%rdx, 16(%rdi)
+	mov	%dl, 24(%rdi)
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Fill26):
+	vmovdqu %xmm0, (%rdi)
+	mov	%rdx, 16(%rdi)
+	mov	%dx, 24(%rdi)
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Fill27):
+	vmovdqu %xmm0, (%rdi)
+	mov	%rdx, 16(%rdi)
+	mov	%edx, 23(%rdi)
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Fill28):
+	vmovdqu %xmm0, (%rdi)
+	mov	%rdx, 16(%rdi)
+	mov	%edx, 24(%rdi)
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Fill29):
+	vmovdqu %xmm0, (%rdi)
+	mov	%rdx, 16(%rdi)
+	mov	%rdx, 21(%rdi)
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Fill30):
+	vmovdqu %xmm0, (%rdi)
+	mov	%rdx, 16(%rdi)
+	mov	%rdx, 22(%rdi)
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Fill31):
+	vmovdqu %ymm0, -1(%rdi)
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(Fill32):
+	vmovdqu %ymm0, (%rdi)
+	VZEROUPPER
+	ret
+
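The L(FillN) stubs zero exactly N bytes; %rdx and %ymm0 are zeroed by
the L(StrncpyFillTailWithZero) prologue below, so each stub is pure
stores.  The odd sizes (Fill3, Fill7, Fill15, Fill31) start one byte
early, at -1(%rdi): the byte just below the fill region is apparently
the terminating NUL stored moments before, already zero, so widening
the store backwards saves an instruction.  A minimal sketch of that
trick, assuming the caller guarantees p[-1] == 0:

    #include <stdint.h>
    #include <string.h>

    /* Sketch of L(Fill3): zero three bytes at p with one four-byte
       store at p - 1, assuming p[-1] is the already-written NUL.  */
    static void
    fill3 (char *p)
    {
      uint32_t zero = 0;
      memcpy (p - 1, &zero, 4);     /* mov %edx, -1(%rdi) */
    }
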
+	.p2align 4
+L(CopyVecSizeUnalignedVec2):
+	vmovdqu %ymm2, (%rdi, %rcx)
+
+	.p2align 4
+L(CopyVecSizeVecExit):
+	bsf	%rdx, %rdx
+	add	$(VEC_SIZE - 1), %r8
+	add	%rcx, %rdi
+#   ifdef USE_AS_STPCPY
+	lea	(%rdi, %rdx), %rax
+#   endif
+	sub	%rdx, %r8
+	lea	1(%rdi, %rdx), %rdi
+
+	.p2align 4
+L(StrncpyFillTailWithZero):
+	vpxor	%xmm0, %xmm0, %xmm0
+	xor	%rdx, %rdx
+	sub	$VEC_SIZE, %r8
+	jbe	L(StrncpyFillExit)
+
+	vmovdqu %ymm0, (%rdi)
+	add	$VEC_SIZE, %rdi
+
+	mov	%rdi, %rsi
+	and	$(VEC_SIZE - 1), %rsi
+	sub	%rsi, %rdi
+	add	%rsi, %r8
+	sub	$(VEC_SIZE * 4), %r8
+	jb	L(StrncpyFillLessFourVecSize)
+
+L(StrncpyFillLoopVmovdqa):
+	vmovdqa %ymm0, (%rdi)
+	vmovdqa %ymm0, VEC_SIZE(%rdi)
+	vmovdqa %ymm0, (VEC_SIZE * 2)(%rdi)
+	vmovdqa %ymm0, (VEC_SIZE * 3)(%rdi)
+	add	$(VEC_SIZE * 4), %rdi
+	sub	$(VEC_SIZE * 4), %r8
+	jae	L(StrncpyFillLoopVmovdqa)
+
+L(StrncpyFillLessFourVecSize):
+	add	$(VEC_SIZE * 2), %r8
+	jl	L(StrncpyFillLessTwoVecSize)
+	vmovdqa %ymm0, (%rdi)
+	vmovdqa %ymm0, VEC_SIZE(%rdi)
+	add	$(VEC_SIZE * 2), %rdi
+	sub	$VEC_SIZE, %r8
+	jl	L(StrncpyFillExit)
+	vmovdqa %ymm0, (%rdi)
+	add	$VEC_SIZE, %rdi
+	BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
+
+L(StrncpyFillLessTwoVecSize):
+	add	$VEC_SIZE, %r8
+	jl	L(StrncpyFillExit)
+	vmovdqa %ymm0, (%rdi)
+	add	$VEC_SIZE, %rdi
+	BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
+
+L(StrncpyFillExit):
+	add	$VEC_SIZE, %r8
+	BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
+
+/* end of ifndef USE_AS_STRCAT */
+#  endif
+
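Taken together, L(StrncpyFillTailWithZero) zeroes the %r8 bytes that
remain after the copied string: counts of at most VEC_SIZE go straight
to the L(FillTable) dispatch, larger counts get one unaligned vector
store, realign the destination downward (only over bytes that store
already zeroed, so this is safe), run the aligned four-vector loop,
and finish the remainder through the fill stubs.  An approximate C
shape, assuming VEC_SIZE == 32 and letting memset stand in for the
vector stores and the L(FillN) stubs:

    #include <stdint.h>
    #include <string.h>

    /* Approximate C shape of L(StrncpyFillTailWithZero) for
       VEC_SIZE == 32.  */
    static void
    fill_tail_with_zero (char *p, size_t rem)
    {
      if (rem <= 32)                   /* sub/jbe -> L(StrncpyFillExit) */
        {
          memset (p, 0, rem);          /* L(FillTable) dispatch */
          return;
        }

      memset (p, 0, 32);               /* vmovdqu %ymm0, (%rdi) */
      p += 32;
      rem -= 32;

      size_t skew = (uintptr_t) p & 31;
      p -= skew;                       /* realign downward; the bytes
                                          re-zeroed here were covered
                                          by the unaligned store above */
      rem += skew;

      while (rem >= 4 * 32)            /* L(StrncpyFillLoopVmovdqa) */
        {
          memset (p, 0, 4 * 32);
          p += 4 * 32;
          rem -= 4 * 32;
        }

      memset (p, 0, rem);              /* LessFourVecSize paths and
                                          the L(FillN) stubs */
    }
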
+	.p2align 4
+L(UnalignedLeaveCase2OrCase3):
+	test	%rdx, %rdx
+	jnz	L(UnalignedFourVecSizeLeaveCase2)
+L(UnalignedFourVecSizeLeaveCase3):
+	lea	(VEC_SIZE * 4)(%r8), %rcx
+	and	$-VEC_SIZE, %rcx
+	add	$(VEC_SIZE * 3), %r8
+	jl	L(CopyVecSizeCase3)
+	vmovdqu %ymm4, (%rdi)
+	sub	$VEC_SIZE, %r8
+	jb	L(CopyVecSizeCase3)
+	vmovdqu %ymm5, VEC_SIZE(%rdi)
+	sub	$VEC_SIZE, %r8
+	jb	L(CopyVecSizeCase3)
+	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
+	sub	$VEC_SIZE, %r8
+	jb	L(CopyVecSizeCase3)
+	vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	(VEC_SIZE * 4)(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, (VEC_SIZE * 4)(%rdi)
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(UnalignedFourVecSizeLeaveCase2):
+	xor	%ecx, %ecx
+	vpcmpeqb %ymm4, %ymm0, %ymm0
+	vpmovmskb %ymm0, %rdx
+	add	$(VEC_SIZE * 3), %r8
+	jle	L(CopyVecSizeCase2OrCase3)
+	test	%rdx, %rdx
+#  ifndef USE_AS_STRCAT
+	jnz	L(CopyVecSizeUnalignedVec4)
+#  else
+	jnz	L(CopyVecSize)
+#  endif
+	vpcmpeqb %ymm5, %ymm0, %ymm0
+	vpmovmskb %ymm0, %rdx
+	vmovdqu %ymm4, (%rdi)
+	add	$VEC_SIZE, %rcx
+	sub	$VEC_SIZE, %r8
+	jbe	L(CopyVecSizeCase2OrCase3)
+	test	%rdx, %rdx
+#  ifndef USE_AS_STRCAT
+	jnz	L(CopyVecSizeUnalignedVec5)
+#  else
+	jnz	L(CopyVecSize)
+#  endif
+
+	vpcmpeqb %ymm6, %ymm0, %ymm0
+	vpmovmskb %ymm0, %rdx
+	vmovdqu %ymm5, VEC_SIZE(%rdi)
+	add	$VEC_SIZE, %rcx
+	sub	$VEC_SIZE, %r8
+	jbe	L(CopyVecSizeCase2OrCase3)
+	test	%rdx, %rdx
+#  ifndef USE_AS_STRCAT
+	jnz	L(CopyVecSizeUnalignedVec6)
+#  else
+	jnz	L(CopyVecSize)
+#  endif
+
+	vpcmpeqb %ymm7, %ymm0, %ymm0
+	vpmovmskb %ymm0, %rdx
+	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
+	lea	VEC_SIZE(%rdi, %rcx), %rdi
+	lea	VEC_SIZE(%rsi, %rcx), %rsi
+	bsf	%rdx, %rdx
+	cmp	%r8, %rdx
+	jb	L(CopyVecSizeExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
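The L(UnalignedFourVecSizeLeaveCase2) sequence above rescans up to
four source vectors: each vpcmpeqb/vpmovmskb pair turns one 32-byte
block into a bitmask of NUL positions, whole blocks are stored while
the mask is zero and the length bound in %r8 holds out, and once a
mask is nonzero, bsf locates the NUL so the tail can be finished via
L(CopyVecSizeExit) or L(ExitStrncpyTable).  A scalar model of the
mask step:

    #include <stdint.h>

    /* Scalar model of one vpcmpeqb/vpmovmskb step: bit i of the
       result is set where block[i] == 0; the asm applies bsf to
       this mask to find the first NUL in the block.  */
    static uint32_t
    zero_mask32 (const unsigned char *block)
    {
      uint32_t mask = 0;
      for (int i = 0; i < 32; i++)
        mask |= (uint32_t) (block[i] == 0) << i;
      return mask;
    }
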
+	.p2align 4
+L(ExitZero):
+#  ifndef USE_AS_STRCAT
+	mov	%rdi, %rax
+#  endif
+	VZEROUPPER
+	ret
+
+# endif
+
+# ifndef USE_AS_STRCAT
+END (STRCPY)
+# else
+END (STRCAT)
+# endif
+	.p2align 4
+	.section .rodata
+L(ExitTable):
+	.int	JMPTBL(L(Exit1), L(ExitTable))
+	.int	JMPTBL(L(Exit2), L(ExitTable))
+	.int	JMPTBL(L(Exit3), L(ExitTable))
+	.int	JMPTBL(L(Exit4), L(ExitTable))
+	.int	JMPTBL(L(Exit5), L(ExitTable))
+	.int	JMPTBL(L(Exit6), L(ExitTable))
+	.int	JMPTBL(L(Exit7), L(ExitTable))
+	.int	JMPTBL(L(Exit8), L(ExitTable))
+	.int	JMPTBL(L(Exit9), L(ExitTable))
+	.int	JMPTBL(L(Exit10), L(ExitTable))
+	.int	JMPTBL(L(Exit11), L(ExitTable))
+	.int	JMPTBL(L(Exit12), L(ExitTable))
+	.int	JMPTBL(L(Exit13), L(ExitTable))
+	.int	JMPTBL(L(Exit14), L(ExitTable))
+	.int	JMPTBL(L(Exit15), L(ExitTable))
+	.int	JMPTBL(L(Exit16), L(ExitTable))
+	.int	JMPTBL(L(Exit17), L(ExitTable))
+	.int	JMPTBL(L(Exit18), L(ExitTable))
+	.int	JMPTBL(L(Exit19), L(ExitTable))
+	.int	JMPTBL(L(Exit20), L(ExitTable))
+	.int	JMPTBL(L(Exit21), L(ExitTable))
+	.int	JMPTBL(L(Exit22), L(ExitTable))
+	.int	JMPTBL(L(Exit23), L(ExitTable))
+	.int	JMPTBL(L(Exit24), L(ExitTable))
+	.int	JMPTBL(L(Exit25), L(ExitTable))
+	.int	JMPTBL(L(Exit26), L(ExitTable))
+	.int	JMPTBL(L(Exit27), L(ExitTable))
+	.int	JMPTBL(L(Exit28), L(ExitTable))
+	.int	JMPTBL(L(Exit29), L(ExitTable))
+	.int	JMPTBL(L(Exit30), L(ExitTable))
+	.int	JMPTBL(L(Exit31), L(ExitTable))
+	.int	JMPTBL(L(Exit32), L(ExitTable))
+	.int	JMPTBL(L(Exit33), L(ExitTable))
+	.int	JMPTBL(L(Exit34), L(ExitTable))
+	.int	JMPTBL(L(Exit35), L(ExitTable))
+	.int	JMPTBL(L(Exit36), L(ExitTable))
+	.int	JMPTBL(L(Exit37), L(ExitTable))
+	.int	JMPTBL(L(Exit38), L(ExitTable))
+	.int	JMPTBL(L(Exit39), L(ExitTable))
+	.int	JMPTBL(L(Exit40), L(ExitTable))
+	.int	JMPTBL(L(Exit41), L(ExitTable))
+	.int	JMPTBL(L(Exit42), L(ExitTable))
+	.int	JMPTBL(L(Exit43), L(ExitTable))
+	.int	JMPTBL(L(Exit44), L(ExitTable))
+	.int	JMPTBL(L(Exit45), L(ExitTable))
+	.int	JMPTBL(L(Exit46), L(ExitTable))
+	.int	JMPTBL(L(Exit47), L(ExitTable))
+	.int	JMPTBL(L(Exit48), L(ExitTable))
+	.int	JMPTBL(L(Exit49), L(ExitTable))
+	.int	JMPTBL(L(Exit50), L(ExitTable))
+	.int	JMPTBL(L(Exit51), L(ExitTable))
+	.int	JMPTBL(L(Exit52), L(ExitTable))
+	.int	JMPTBL(L(Exit53), L(ExitTable))
+	.int	JMPTBL(L(Exit54), L(ExitTable))
+	.int	JMPTBL(L(Exit55), L(ExitTable))
+	.int	JMPTBL(L(Exit56), L(ExitTable))
+	.int	JMPTBL(L(Exit57), L(ExitTable))
+	.int	JMPTBL(L(Exit58), L(ExitTable))
+	.int	JMPTBL(L(Exit59), L(ExitTable))
+	.int	JMPTBL(L(Exit60), L(ExitTable))
+	.int	JMPTBL(L(Exit61), L(ExitTable))
+	.int	JMPTBL(L(Exit62), L(ExitTable))
+	.int	JMPTBL(L(Exit63), L(ExitTable))
+	.int	JMPTBL(L(Exit64), L(ExitTable))
+# ifdef USE_AS_STRNCPY
+L(ExitStrncpyTable):
+	.int	JMPTBL(L(StrncpyExit0), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit1), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit2), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit3), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit4), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit5), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit6), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit7), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit8), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit9), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit10), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit11), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit12), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit13), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit14), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit15), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit16), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit17), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit18), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit19), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit20), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit21), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit22), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit23), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit24), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit25), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit26), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit27), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit28), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit29), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit30), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit31), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit32), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit34), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit35), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit36), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit37), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit38), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit39), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit40), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit41), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit42), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit43), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit44), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit45), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit46), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit47), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit48), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit49), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit50), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit51), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit52), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit53), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit54), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit55), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit56), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit57), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit58), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit59), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit60), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit61), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit62), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit63), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit64), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit65), L(ExitStrncpyTable))
+#  ifndef USE_AS_STRCAT
+	.p2align 4
+L(FillTable):
+	.int	JMPTBL(L(Fill0), L(FillTable))
+	.int	JMPTBL(L(Fill1), L(FillTable))
+	.int	JMPTBL(L(Fill2), L(FillTable))
+	.int	JMPTBL(L(Fill3), L(FillTable))
+	.int	JMPTBL(L(Fill4), L(FillTable))
+	.int	JMPTBL(L(Fill5), L(FillTable))
+	.int	JMPTBL(L(Fill6), L(FillTable))
+	.int	JMPTBL(L(Fill7), L(FillTable))
+	.int	JMPTBL(L(Fill8), L(FillTable))
+	.int	JMPTBL(L(Fill9), L(FillTable))
+	.int	JMPTBL(L(Fill10), L(FillTable))
+	.int	JMPTBL(L(Fill11), L(FillTable))
+	.int	JMPTBL(L(Fill12), L(FillTable))
+	.int	JMPTBL(L(Fill13), L(FillTable))
+	.int	JMPTBL(L(Fill14), L(FillTable))
+	.int	JMPTBL(L(Fill15), L(FillTable))
+	.int	JMPTBL(L(Fill16), L(FillTable))
+	.int	JMPTBL(L(Fill17), L(FillTable))
+	.int	JMPTBL(L(Fill18), L(FillTable))
+	.int	JMPTBL(L(Fill19), L(FillTable))
+	.int	JMPTBL(L(Fill20), L(FillTable))
+	.int	JMPTBL(L(Fill21), L(FillTable))
+	.int	JMPTBL(L(Fill22), L(FillTable))
+	.int	JMPTBL(L(Fill23), L(FillTable))
+	.int	JMPTBL(L(Fill24), L(FillTable))
+	.int	JMPTBL(L(Fill25), L(FillTable))
+	.int	JMPTBL(L(Fill26), L(FillTable))
+	.int	JMPTBL(L(Fill27), L(FillTable))
+	.int	JMPTBL(L(Fill28), L(FillTable))
+	.int	JMPTBL(L(Fill29), L(FillTable))
+	.int	JMPTBL(L(Fill30), L(FillTable))
+	.int	JMPTBL(L(Fill31), L(FillTable))
+	.int	JMPTBL(L(Fill32), L(FillTable))
+#  endif
+# endif
+#endif
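The .rodata tables that close the file hold 32-bit offsets rather
than absolute pointers: JMPTBL(L(ExitN), L(ExitTable)) appears to
expand to the stub's displacement from the table base, which keeps
the tables position-independent, and BRANCH_TO_JMPTBL_ENTRY indexes
by the residual byte count and jumps indirectly.  A C analogue using
a function-pointer table instead of relative offsets (truncated to
three entries; the real tables cover counts up to 65):

    #include <string.h>

    /* Function-pointer analogue of L(ExitStrncpyTable); the asm
       stores base-relative offsets instead of pointers.  */
    typedef void (*stub_fn) (char *dst, const char *src);

    static void copy0 (char *d, const char *s) { (void) d; (void) s; }
    static void copy1 (char *d, const char *s) { d[0] = s[0]; }
    static void copy2 (char *d, const char *s) { memcpy (d, s, 2); }

    static const stub_fn exit_table[] = { copy0, copy1, copy2 /* ... */ };

    static void
    dispatch (char *d, const char *s, size_t n)   /* n < 3 here */
    {
      exit_table[n] (d, s);
    }
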
diff --git a/sysdeps/x86_64/multiarch/strcpy.c b/sysdeps/x86_64/multiarch/strcpy.c
index 12e0e3f..ef6858e 100644
--- a/sysdeps/x86_64/multiarch/strcpy.c
+++ b/sysdeps/x86_64/multiarch/strcpy.c
@@ -24,7 +24,7 @@
 # undef strcpy
 
 # define SYMBOL_NAME strcpy
-# include "ifunc-unaligned-ssse3.h"
+# include "ifunc-unaligned.h"
 
 libc_ifunc_redirected (__redirect_strcpy, strcpy, IFUNC_SELECTOR ());
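
The selector in ifunc-unaligned.h (renamed here from
ifunc-unaligned-ssse3.h) picks an implementation at load time through
the GNU indirect-function mechanism.  A generic sketch of that
mechanism, with hypothetical names (my_copy, copy_avx2 and
copy_generic are not glibc's):

    #include <string.h>

    /* Generic GNU ifunc sketch; all names are hypothetical.  */
    static char *copy_generic (char *d, const char *s) { return strcpy (d, s); }
    static char *copy_avx2 (char *d, const char *s) { return strcpy (d, s); }

    static void *
    my_copy_resolver (void)
    {
      __builtin_cpu_init ();    /* resolvers run before constructors */
      return __builtin_cpu_supports ("avx2")
             ? (void *) copy_avx2 : (void *) copy_generic;
    }

    char *my_copy (char *, const char *)
      __attribute__ ((ifunc ("my_copy_resolver")));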
 
diff --git a/sysdeps/x86_64/multiarch/strncat-avx2.S b/sysdeps/x86_64/multiarch/strncat-avx2.S
new file mode 100644
index 0000000..bfefa65
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncat-avx2.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNCAT
+#define STRCAT __strncat_avx2
+#include "strcat-avx2.S"
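
strncat-avx2.S here, and strncpy-avx2.S below, are three-line wrappers
around a shared template: defining USE_AS_STRNCAT or USE_AS_STRNCPY
compiles in the length-bounded code paths (the %r8 handling seen
throughout the diff above), and redefining STRCAT or STRCPY renames
the generated entry point to the __strncat_avx2 or __strncpy_avx2
ifunc variant, so a single assembly source, judging by the
USE_AS_STRCAT conditionals above, serves the whole strcpy/stpcpy/
strcat family.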
diff --git a/sysdeps/x86_64/multiarch/strncat.c b/sysdeps/x86_64/multiarch/strncat.c
index 841c165..e7757cd 100644
--- a/sysdeps/x86_64/multiarch/strncat.c
+++ b/sysdeps/x86_64/multiarch/strncat.c
@@ -24,7 +24,7 @@
 # undef strncat
 
 # define SYMBOL_NAME strncat
-# include "ifunc-unaligned-ssse3.h"
+# include "ifunc-unaligned.h"
 
 libc_ifunc_redirected (__redirect_strncat, strncat, IFUNC_SELECTOR ());
 strong_alias (strncat, __strncat);
diff --git a/sysdeps/x86_64/multiarch/strncpy-avx2.S b/sysdeps/x86_64/multiarch/strncpy-avx2.S
new file mode 100644
index 0000000..9ef8c87
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncpy-avx2.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNCPY
+#define STRCPY __strncpy_avx2
+#include "strcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strncpy.c b/sysdeps/x86_64/multiarch/strncpy.c
index 3c3de8b..d6d9dc7 100644
--- a/sysdeps/x86_64/multiarch/strncpy.c
+++ b/sysdeps/x86_64/multiarch/strncpy.c
@@ -24,7 +24,7 @@
 # undef strncpy
 
 # define SYMBOL_NAME strncpy
-# include "ifunc-unaligned-ssse3.h"
+# include "ifunc-unaligned.h"
 
 libc_ifunc_redirected (__redirect_strncpy, strncpy, IFUNC_SELECTOR ());
 

-----------------------------------------------------------------------


hooks/post-receive
-- 
GNU C Library master sources

