This is the mail archive of the glibc-cvs@sourceware.org mailing list for the glibc project.
GNU C Library master sources branch hjl/avx2/master created. glibc-2.25-394-g29995cd
- From: hjl@sourceware.org
- To: glibc-cvs@sourceware.org
- Date: 1 Jun 2017 20:35:01 -0000
- Subject: GNU C Library master sources branch hjl/avx2/master created. glibc-2.25-394-g29995cd
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".
The branch, hjl/avx2/master has been created
at 29995cdfd5e53e616aae0dda9283dae2093f3971 (commit)
- Log -----------------------------------------------------------------
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=29995cdfd5e53e616aae0dda9283dae2093f3971
commit 29995cdfd5e53e616aae0dda9283dae2093f3971
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Thu Jun 1 12:57:14 2017 -0700
Update between_2_3 memcmp-avx2.S
diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2.S b/sysdeps/x86_64/multiarch/memcmp-avx2.S
index 5297e3c..c04902e 100644
--- a/sysdeps/x86_64/multiarch/memcmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/memcmp-avx2.S
@@ -136,18 +136,15 @@ L(between_2_3):
/* Load 2 bytes into registers. */
movzwl (%rdi), %eax
movzwl (%rsi), %ecx
- /* Compare the lowest byte. */
- cmpb %cl, %al
- jne L(1byte_reg)
- /* Load the difference of 2 bytes into EAX. */
- subl %ecx, %eax
- /* Return if 2 bytes differ. */
- jnz L(exit)
- cmpb $2, %dl
- /* Return if these are the last 2 bytes. */
- je L(exit)
- movzbl 2(%rdi), %eax
- movzbl 2(%rsi), %ecx
+ shll $16, %eax
+ shll $16, %ecx
+ /* Use overlapping loads to avoid branches. */
+ movzwl -2(%rdi, %rdx), %edi
+ movzwl -2(%rsi, %rdx), %esi
+ orl %edi, %eax
+ orl %esi, %ecx
+ bswap %eax
+ bswap %ecx
subl %ecx, %eax
ret
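The between_2_3 change above replaces a byte-by-byte comparison with two overlapping 16-bit loads, one anchored at the start and one at the end of the 2- or 3-byte range, packed into a single word and byte-swapped so that an unsigned integer comparison orders the buffers the way memcmp requires. A rough C sketch of that technique (memcmp_2_3 and the exact packing are illustrative, not the committed code):

  #include <stdint.h>
  #include <string.h>

  static int
  memcmp_2_3 (const unsigned char *s1, const unsigned char *s2, size_t n)
  {
    uint16_t h1, h2, t1, t2;
    memcpy (&h1, s1, 2);            /* First two bytes.  */
    memcpy (&h2, s2, 2);
    memcpy (&t1, s1 + n - 2, 2);    /* Last two bytes: overlaps the head
                                       when n == 3, reloads it when n == 2.  */
    memcpy (&t2, s2 + n - 2, 2);
    /* Pack head and tail into one 32-bit word each, then byte-swap so
       the earliest byte in memory lands in the most significant
       position; the unsigned comparison is then lexicographic.  */
    uint32_t w1 = __builtin_bswap32 ((uint32_t) h1 | ((uint32_t) t1 << 16));
    uint32_t w2 = __builtin_bswap32 ((uint32_t) h2 | ((uint32_t) t2 << 16));
    return (w1 > w2) - (w1 < w2);
  }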
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=3793249d7aefe57acca311a3687c764225122a05
commit 3793249d7aefe57acca311a3687c764225122a05
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Thu Jun 1 12:27:16 2017 -0700
memcmp-avx2.S
diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2.S b/sysdeps/x86_64/multiarch/memcmp-avx2.S
index eb3a4cb..5297e3c 100644
--- a/sysdeps/x86_64/multiarch/memcmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/memcmp-avx2.S
@@ -72,6 +72,7 @@ L(last_2x_vec):
jnz L(first_vec)
L(last_vec):
+ /* Use overlapping loads to avoid branches. */
leaq -VEC_SIZE(%rdi, %rdx), %rdi
leaq -VEC_SIZE(%rsi, %rdx), %rsi
vmovdqu (%rsi), %ymm2
@@ -119,6 +120,7 @@ L(between_4_7):
vpmovmskb %xmm2, %eax
subl $0xffff, %eax
jnz L(first_vec)
+ /* Use overlapping loads to avoid branches. */
leaq -4(%rdi, %rdx), %rdi
leaq -4(%rsi, %rdx), %rsi
vmovd (%rdi), %xmm1
@@ -198,6 +200,7 @@ L(less_vec):
vpmovmskb %xmm2, %eax
subl $0xffff, %eax
jnz L(first_vec)
+ /* Use overlapping loads to avoid branches. */
leaq -8(%rdi, %rdx), %rdi
leaq -8(%rsi, %rdx), %rsi
vmovq (%rdi), %xmm1
@@ -217,6 +220,7 @@ L(between_16_31):
subl $0xffff, %eax
jnz L(first_vec)
+ /* Use overlapping loads to avoid branches. */
leaq -16(%rdi, %rdx), %rdi
leaq -16(%rsi, %rdx), %rsi
vmovdqu (%rsi), %xmm2
@@ -350,6 +354,7 @@ L(last_4x_vec):
subl $VEC_MASK, %eax
jnz L(first_vec)
+ /* Use overlapping loads to avoid branches. */
leaq -(3 * VEC_SIZE)(%rdi, %rdx), %rdi
leaq -(3 * VEC_SIZE)(%rsi, %rdx), %rsi
vmovdqu (%rsi), %ymm2
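Every hunk in this commit marks the same pattern: instead of a branchy remainder loop, the tail of a compare is handled by re-anchoring a full-width load at (start + length - block size), so a few middle bytes are simply compared twice. A hedged C sketch of the pattern for 32 < n <= 64 (block_eq and differs_33_64 are illustrative names; the real code works on vpcmpeqb/vpmovmskb masks):

  #include <stddef.h>
  #include <string.h>

  /* Stand-in for one 32-byte vector compare (vpcmpeqb + vpmovmskb).  */
  static int
  block_eq (const unsigned char *a, const unsigned char *b)
  {
    return memcmp (a, b, 32) == 0;
  }

  /* For 32 < n <= 64, one compare anchored at the start and one at the
     end cover every byte with no remainder loop; the overlap in the
     middle is harmless.  */
  static int
  differs_33_64 (const unsigned char *s1, const unsigned char *s2, size_t n)
  {
    if (!block_eq (s1, s2))
      return 1;
    return !block_eq (s1 + n - 32, s2 + n - 32);
  }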
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=3180371a826547726ac5b457004b82194418903b
commit 3180371a826547726ac5b457004b82194418903b
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Fri May 26 12:21:55 2017 -0700
x86-64: Optimize strrchr/wcsrchr with AVX2
Optimize strrchr/wcsrchr with AVX2 to check 32 bytes with vector
instructions. It is as fast as the SSE2 version for small data sizes
and up to 1X faster for large data sizes on Haswell. Select the AVX2
version on AVX2 machines where vzeroupper is preferred and AVX
unaligned load is fast.
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
strrchr-avx2 and wcsrchr-avx2.
* sysdeps/x86_64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Add tests for __strrchr_avx2,
__strrchr_sse2, __wcsrchr_avx2 and __wcsrchr_sse2.
* sysdeps/x86_64/multiarch/strrchr-avx2.S: New file.
* sysdeps/x86_64/multiarch/strrchr.S: Likewise.
* sysdeps/x86_64/multiarch/wcsrchr-avx2.S: Likewise.
* sysdeps/x86_64/multiarch/wcsrchr.S: Likewise.
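The core of strrchr-avx2.S below: scan forward one 32-byte vector at a time, comparing each vector against both CHAR and nul; remember the match mask of the most recent vector that contained CHAR, and resolve the answer only when the terminator appears, discarding match bits that fall after it. A scalar C model of that control flow (block_masks and strrchr_model are illustrative names; like the real code, it assumes 32-byte over-reads are safe, which the assembly guarantees by aligning its loads):

  #include <stdint.h>
  #include <stddef.h>

  /* Stand-in for the vpcmpeqb/vpmovmskb pairs against CHAR and zero.  */
  static void
  block_masks (const char *p, int c, uint32_t *match, uint32_t *nul)
  {
    *match = *nul = 0;
    for (int i = 0; i < 32; i++)
      {
        if (p[i] == (char) c)
          *match |= 1u << i;
        if (p[i] == '\0')
          *nul |= 1u << i;
      }
  }

  static char *
  strrchr_model (const char *s, int c)
  {
    const char *last_block = NULL;
    uint32_t last_match = 0;

    for (;; s += 32)
      {
        uint32_t match, nul;
        block_masks (s, c, &match, &nul);
        if (nul)
          {
            /* Keep only match bits at or before the nul: nul ^ (nul - 1)
               sets every bit up to and including the lowest nul bit --
               the same mask the subl/xorl/andl sequence computes.  */
            match &= nul ^ (nul - 1);
            if (match)
              return (char *) s + 31 - __builtin_clz (match);  /* bsrl */
            break;
          }
        if (match)
          {
            last_block = s;     /* Remember the match and keep searching.  */
            last_match = match;
          }
      }
    return last_match
           ? (char *) last_block + 31 - __builtin_clz (last_match)
           : NULL;
  }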
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index aee4820..5d7825a 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -15,6 +15,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
memmove-ssse3-back \
memmove-avx512-no-vzeroupper strcasecmp_l-ssse3 \
strchr-avx2 strchrnul-avx2 \
+ strrchr-avx2 \
strlen-avx2 strnlen-avx2 \
strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
@@ -40,5 +41,6 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
wmemcmp-avx2 \
wcscpy-ssse3 wcscpy-c \
wcschr-avx2 \
+ wcsrchr-avx2 \
wcslen-avx2 wcsnlen-avx2
endif
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 9f1ec4b..0112621 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -250,6 +250,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__strchrnul_avx2)
IFUNC_IMPL_ADD (array, i, strchrnul, 1, __strchrnul_sse2))
+ /* Support sysdeps/x86_64/multiarch/strrchr.S. */
+ IFUNC_IMPL (i, name, strrchr,
+ IFUNC_IMPL_ADD (array, i, strrchr,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __strrchr_avx2)
+ IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_sse2))
+
/* Support sysdeps/x86_64/multiarch/strcmp.S. */
IFUNC_IMPL (i, name, strcmp,
IFUNC_IMPL_ADD (array, i, strcmp, HAS_CPU_FEATURE (SSE4_2),
@@ -341,6 +348,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__wcschr_avx2)
IFUNC_IMPL_ADD (array, i, wcschr, 1, __wcschr_sse2))
+ /* Support sysdeps/x86_64/multiarch/wcsrchr.S. */
+ IFUNC_IMPL (i, name, wcsrchr,
+ IFUNC_IMPL_ADD (array, i, wcsrchr,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __wcsrchr_avx2)
+ IFUNC_IMPL_ADD (array, i, wcsrchr, 1, __wcsrchr_sse2))
+
/* Support sysdeps/x86_64/multiarch/wcscpy.S. */
IFUNC_IMPL (i, name, wcscpy,
IFUNC_IMPL_ADD (array, i, wcscpy, HAS_CPU_FEATURE (SSSE3),
diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S
new file mode 100644
index 0000000..36ef660
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S
@@ -0,0 +1,235 @@
+/* strrchr/wcsrchr optimized with AVX2.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef STRRCHR
+# define STRRCHR __strrchr_avx2
+# endif
+
+# ifdef USE_AS_WCSRCHR
+# define VPBROADCAST vpbroadcastd
+# define VPCMPEQ vpcmpeqd
+# else
+# define VPBROADCAST vpbroadcastb
+# define VPCMPEQ vpcmpeqb
+# endif
+
+# ifndef VZEROUPPER
+# define VZEROUPPER vzeroupper
+# endif
+
+# define VEC_SIZE 32
+
+ .section .text.avx,"ax",@progbits
+ENTRY (STRRCHR)
+ movd %esi, %xmm4
+ movl %edi, %ecx
+ /* Broadcast CHAR to YMM4. */
+ VPBROADCAST %xmm4, %ymm4
+ vpxor %ymm0, %ymm0, %ymm0
+
+ /* Check if we may cross page boundary with one vector load. */
+ andl $(2 * VEC_SIZE - 1), %ecx
+ cmpl $VEC_SIZE, %ecx
+ ja L(cros_page_boundary)
+
+ vmovdqu (%rdi), %ymm1
+ VPCMPEQ %ymm1, %ymm0, %ymm2
+ VPCMPEQ %ymm1, %ymm4, %ymm3
+ vpmovmskb %ymm2, %ecx
+ vpmovmskb %ymm3, %eax
+ addq $VEC_SIZE, %rdi
+
+ testl %eax, %eax
+ jnz L(first_vec)
+
+ testl %ecx, %ecx
+ jnz L(return_null)
+
+ andq $-VEC_SIZE, %rdi
+ xorl %edx, %edx
+ jmp L(aligned_loop)
+
+ .p2align 4
+L(first_vec):
+ /* Check if there is a nul CHAR. */
+ testl %ecx, %ecx
+ jnz L(char_and_nul_in_first_vec)
+
+ /* Remember the match and keep searching. */
+ movl %eax, %edx
+ movq %rdi, %rsi
+ andq $-VEC_SIZE, %rdi
+ jmp L(aligned_loop)
+
+ .p2align 4
+L(cros_page_boundary):
+ andl $(VEC_SIZE - 1), %ecx
+ andq $-VEC_SIZE, %rdi
+ vmovdqa (%rdi), %ymm1
+ VPCMPEQ %ymm1, %ymm0, %ymm2
+ VPCMPEQ %ymm1, %ymm4, %ymm3
+ vpmovmskb %ymm2, %edx
+ vpmovmskb %ymm3, %eax
+ shrl %cl, %edx
+ shrl %cl, %eax
+ addq $VEC_SIZE, %rdi
+
+ /* Check if there is a CHAR. */
+ testl %eax, %eax
+ jnz L(found_char)
+
+ testl %edx, %edx
+ jnz L(return_null)
+
+ jmp L(aligned_loop)
+
+ .p2align 4
+L(found_char):
+ testl %edx, %edx
+ jnz L(char_and_nul)
+
+ /* Remember the match and keep searching. */
+ movl %eax, %edx
+ leaq (%rdi, %rcx), %rsi
+
+ .p2align 4
+L(aligned_loop):
+ vmovdqa (%rdi), %ymm1
+ VPCMPEQ %ymm1, %ymm0, %ymm2
+ addq $VEC_SIZE, %rdi
+ VPCMPEQ %ymm1, %ymm4, %ymm3
+ vpmovmskb %ymm2, %ecx
+ vpmovmskb %ymm3, %eax
+ orl %eax, %ecx
+ jnz L(char_nor_null)
+
+ vmovdqa (%rdi), %ymm1
+ VPCMPEQ %ymm1, %ymm0, %ymm2
+ add $VEC_SIZE, %rdi
+ VPCMPEQ %ymm1, %ymm4, %ymm3
+ vpmovmskb %ymm2, %ecx
+ vpmovmskb %ymm3, %eax
+ orl %eax, %ecx
+ jnz L(char_nor_null)
+
+ vmovdqa (%rdi), %ymm1
+ VPCMPEQ %ymm1, %ymm0, %ymm2
+ addq $VEC_SIZE, %rdi
+ VPCMPEQ %ymm1, %ymm4, %ymm3
+ vpmovmskb %ymm2, %ecx
+ vpmovmskb %ymm3, %eax
+ orl %eax, %ecx
+ jnz L(char_nor_null)
+
+ vmovdqa (%rdi), %ymm1
+ VPCMPEQ %ymm1, %ymm0, %ymm2
+ addq $VEC_SIZE, %rdi
+ VPCMPEQ %ymm1, %ymm4, %ymm3
+ vpmovmskb %ymm2, %ecx
+ vpmovmskb %ymm3, %eax
+ orl %eax, %ecx
+ jz L(aligned_loop)
+
+ .p2align 4
+L(char_nor_null):
+ /* Find a CHAR or a nul CHAR in a loop. */
+ testl %eax, %eax
+ jnz L(match)
+L(return_value):
+ testl %edx, %edx
+ jz L(return_null)
+ movl %edx, %eax
+ movq %rsi, %rdi
+
+# ifdef USE_AS_WCSRCHR
+ /* Keep the first bit for each matching CHAR for bsr. */
+ andl $0x11111111, %eax
+# endif
+ bsrl %eax, %eax
+ leaq -VEC_SIZE(%rdi, %rax), %rax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(match):
+ /* Find a CHAR. Check if there is a nul CHAR. */
+ vpmovmskb %ymm2, %ecx
+ testl %ecx, %ecx
+ jnz L(find_nul)
+
+ /* Remember the match and keep searching. */
+ movl %eax, %edx
+ movq %rdi, %rsi
+ jmp L(aligned_loop)
+
+ .p2align 4
+L(find_nul):
+# ifdef USE_AS_WCSRCHR
+ /* Keep the first bit for each matching CHAR for bsr. */
+ andl $0x11111111, %ecx
+ andl $0x11111111, %eax
+# endif
+ /* Mask out any matching bits after the nul CHAR. */
+ movl %ecx, %r8d
+ subl $1, %r8d
+ xorl %ecx, %r8d
+ andl %r8d, %eax
+ testl %eax, %eax
+ /* If there is no CHAR here, return the remembered one. */
+ jz L(return_value)
+ bsrl %eax, %eax
+ leaq -VEC_SIZE(%rdi, %rax), %rax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(char_and_nul):
+ /* Find both a CHAR and a nul CHAR. */
+ addq %rcx, %rdi
+ movl %edx, %ecx
+L(char_and_nul_in_first_vec):
+# ifdef USE_AS_WCSRCHR
+ /* Keep the first bit for each matching CHAR for bsr. */
+ andl $0x11111111, %ecx
+ andl $0x11111111, %eax
+# endif
+ /* Mask out any matching bits after the nul CHAR. */
+ movl %ecx, %r8d
+ subl $1, %r8d
+ xorl %ecx, %r8d
+ andl %r8d, %eax
+ testl %eax, %eax
+ /* Return null pointer if the nul CHAR comes first. */
+ jz L(return_null)
+ bsrl %eax, %eax
+ leaq -VEC_SIZE(%rdi, %rax), %rax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(return_null):
+ xorl %eax, %eax
+ VZEROUPPER
+ ret
+
+END (STRRCHR)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strrchr.S b/sysdeps/x86_64/multiarch/strrchr.S
new file mode 100644
index 0000000..e5b49a9
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strrchr.S
@@ -0,0 +1,63 @@
+/* Multiple versions of strrchr
+ Copyright (C) 2009-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+ .text
+ENTRY(strrchr)
+ .type strrchr, @gnu_indirect_function
+ LOAD_RTLD_GLOBAL_RO_RDX
+ HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+ jnz 1f
+ HAS_ARCH_FEATURE (AVX2_Usable)
+ jz 1f
+ HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+ jz 1f
+ leaq __strrchr_avx2(%rip), %rax
+ ret
+1:
+ leaq __strrchr_sse2(%rip), %rax
+ ret
+END(strrchr)
+
+
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __strrchr_sse2, @function; \
+ .align 16; \
+ .globl __strrchr_sse2; \
+ .hidden __strrchr_sse2; \
+ __strrchr_sse2: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __strrchr_sse2, .-__strrchr_sse2
+# undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal strrchr calls through a PLT.
+ The speedup we get from using AVX2 instructions is likely eaten away
+ by the indirect call in the PLT. */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_strrchr; __GI_strrchr = __strrchr_sse2
+#endif
+
+#include "../strrchr.S"
diff --git a/sysdeps/x86_64/multiarch/wcsrchr-avx2.S b/sysdeps/x86_64/multiarch/wcsrchr-avx2.S
new file mode 100644
index 0000000..cf8a239
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsrchr-avx2.S
@@ -0,0 +1,3 @@
+#define STRRCHR __wcsrchr_avx2
+#define USE_AS_WCSRCHR 1
+#include "strrchr-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcsrchr.S b/sysdeps/x86_64/multiarch/wcsrchr.S
new file mode 100644
index 0000000..59276cb
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsrchr.S
@@ -0,0 +1,55 @@
+/* Multiple versions of wcsrchr
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+ .text
+ENTRY(wcsrchr)
+ .type wcsrchr, @gnu_indirect_function
+ LOAD_RTLD_GLOBAL_RO_RDX
+ HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+ jnz 1f
+ HAS_ARCH_FEATURE (AVX2_Usable)
+ jz 1f
+ HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+ jz 1f
+ leaq __wcsrchr_avx2(%rip), %rax
+ ret
+
+1: leaq __wcsrchr_sse2(%rip), %rax
+ ret
+END(wcsrchr)
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __wcsrchr_sse2, @function; \
+ .p2align 4; \
+ .globl __wcsrchr_sse2; \
+ .hidden __wcsrchr_sse2; \
+ __wcsrchr_sse2: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __wcsrchr_sse2, .-__wcsrchr_sse2
+#endif
+
+#include "../wcsrchr.S"
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=b3395c9a7431c4153e5f2bd5a3c5a023017d2703
commit b3395c9a7431c4153e5f2bd5a3c5a023017d2703
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Tue May 23 11:25:19 2017 -0700
x86-64: Optimize memrchr with AVX2
Optimize memrchr with AVX2 to search 32 bytes with a single vector
compare instruction. It is as fast as SSE2 memrchr for small data
sizes and up to 1X faster for large data sizes on Haswell. Select
AVX2 memrchr on AVX2 machines where vzeroupper is preferred and AVX
unaligned load is fast.
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
memrchr-avx2.
* sysdeps/x86_64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Add tests for __memrchr_avx2 and
__memrchr_sse2.
* sysdeps/x86_64/multiarch/memrchr-avx2.S: New file.
* sysdeps/x86_64/multiarch/memrchr.S: Likewise.
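memrchr-avx2.S below works from the end of the buffer toward the start, testing one 32-byte vector per step (four per iteration in the main loop); a hit is resolved with bsr, which picks the highest set mask bit, i.e. the last matching byte. A scalar C model of the backward scan (memrchr_model is an illustrative name; the real code additionally aligns its loads and handles the short head with shift masks instead of a byte loop):

  #include <stdint.h>
  #include <stddef.h>

  static void *
  memrchr_model (const void *s, int c, size_t n)
  {
    const unsigned char *p = s;
    while (n >= 32)
      {
        const unsigned char *block = p + n - 32;
        uint32_t mask = 0;
        for (int i = 0; i < 32; i++)    /* vpcmpeqb + vpmovmskb */
          if (block[i] == (unsigned char) c)
            mask |= 1u << i;
        if (mask)                       /* Highest bit = last match.  */
          return (void *) (block + 31 - __builtin_clz (mask));
        n -= 32;
      }
    while (n--)                         /* Short head, byte by byte.  */
      if (p[n] == (unsigned char) c)
        return (void *) (p + n);
    return NULL;
  }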
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 2974495..aee4820 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -7,6 +7,7 @@ ifeq ($(subdir),string)
sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
strcmp-sse2-unaligned strncmp-ssse3 \
memchr-avx2 rawmemchr-avx2 \
+ memrchr-avx2 \
memcmp-avx2 \
memcmp-sse4 memcpy-ssse3 \
memmove-ssse3 \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index d3fa98f..9f1ec4b 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -111,6 +111,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memmove, 1,
__memmove_sse2_unaligned_erms))
+ /* Support sysdeps/x86_64/multiarch/memrchr.S. */
+ IFUNC_IMPL (i, name, memrchr,
+ IFUNC_IMPL_ADD (array, i, memrchr,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __memrchr_avx2)
+ IFUNC_IMPL_ADD (array, i, memrchr, 1, __memrchr_sse2))
+
/* Support sysdeps/x86_64/multiarch/memset_chk.S. */
IFUNC_IMPL (i, name, __memset_chk,
IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S
new file mode 100644
index 0000000..3ee02e1
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S
@@ -0,0 +1,359 @@
+/* memrchr optimized with AVX2.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef VZEROUPPER
+# define VZEROUPPER vzeroupper
+# endif
+
+# define VEC_SIZE 32
+
+ .section .text.avx,"ax",@progbits
+ENTRY (__memrchr_avx2)
+ /* Broadcast CHAR to YMM0. */
+ vmovd %esi, %xmm0
+ vpbroadcastb %xmm0, %ymm0
+
+ subq $VEC_SIZE, %rdx
+ jbe L(last_vec_or_less)
+
+ addq %rdx, %rdi
+
+ /* Check the last VEC_SIZE bytes. */
+ vpcmpeqb (%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x0)
+
+ subq $(VEC_SIZE * 4), %rdi
+ movl %edi, %ecx
+ andl $(VEC_SIZE - 1), %ecx
+ jz L(aligned_more)
+
+ /* Align data for aligned loads in the loop. */
+ addq $VEC_SIZE, %rdi
+ addq $VEC_SIZE, %rdx
+ andq $-VEC_SIZE, %rdi
+ subq %rcx, %rdx
+
+ .p2align 4
+L(aligned_more):
+ subq $(VEC_SIZE * 4), %rdx
+ jbe L(last_4x_vec_or_less)
+
+ /* Check the last 4 * VEC_SIZE. Only one VEC_SIZE at a time
+ since data is only aligned to VEC_SIZE. */
+ vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x3)
+
+ vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2
+ vpmovmskb %ymm2, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x2)
+
+ vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3
+ vpmovmskb %ymm3, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x1)
+
+ vpcmpeqb (%rdi), %ymm0, %ymm4
+ vpmovmskb %ymm4, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x0)
+
+ /* Align data to 4 * VEC_SIZE for loop with fewer branches.
+ There are some overlaps with above if data isn't aligned
+ to 4 * VEC_SIZE. */
+ movl %edi, %ecx
+ andl $(VEC_SIZE * 4 - 1), %ecx
+ jz L(loop_4x_vec)
+
+ addq $(VEC_SIZE * 4), %rdi
+ addq $(VEC_SIZE * 4), %rdx
+ andq $-(VEC_SIZE * 4), %rdi
+ subq %rcx, %rdx
+
+ .p2align 4
+L(loop_4x_vec):
+ /* Compare 4 * VEC at a time forward. */
+ subq $(VEC_SIZE * 4), %rdi
+ subq $(VEC_SIZE * 4), %rdx
+ jbe L(last_4x_vec_or_less)
+
+ vmovdqa (%rdi), %ymm1
+ vmovdqa VEC_SIZE(%rdi), %ymm2
+ vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3
+ vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4
+
+ vpcmpeqb %ymm1, %ymm0, %ymm1
+ vpcmpeqb %ymm2, %ymm0, %ymm2
+ vpcmpeqb %ymm3, %ymm0, %ymm3
+ vpcmpeqb %ymm4, %ymm0, %ymm4
+
+ vpor %ymm1, %ymm2, %ymm5
+ vpor %ymm3, %ymm4, %ymm6
+ vpor %ymm5, %ymm6, %ymm5
+
+ vpmovmskb %ymm5, %eax
+ testl %eax, %eax
+ jz L(loop_4x_vec)
+
+ /* There is a match. */
+ vpmovmskb %ymm4, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x3)
+
+ vpmovmskb %ymm3, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x2)
+
+ vpmovmskb %ymm2, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x1)
+
+ vpmovmskb %ymm1, %eax
+ bsrl %eax, %eax
+ addq %rdi, %rax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(last_4x_vec_or_less):
+ addl $(VEC_SIZE * 4), %edx
+ cmpl $(VEC_SIZE * 2), %edx
+ jbe L(last_2x_vec)
+
+ vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x3)
+
+ vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2
+ vpmovmskb %ymm2, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x2)
+
+ vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3
+ vpmovmskb %ymm3, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x1_check)
+ cmpl $(VEC_SIZE * 3), %edx
+ jbe L(zero)
+
+ vpcmpeqb (%rdi), %ymm0, %ymm4
+ vpmovmskb %ymm4, %eax
+ testl %eax, %eax
+ jz L(zero)
+ bsrl %eax, %eax
+ subq $(VEC_SIZE * 4), %rdx
+ addq %rax, %rdx
+ jl L(zero)
+ addq %rdi, %rax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(last_2x_vec):
+ vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x3_check)
+ cmpl $VEC_SIZE, %edx
+ jbe L(zero)
+
+ vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jz L(zero)
+ bsrl %eax, %eax
+ subq $(VEC_SIZE * 2), %rdx
+ addq %rax, %rdx
+ jl L(zero)
+ addl $(VEC_SIZE * 2), %eax
+ addq %rdi, %rax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(last_vec_x0):
+ bsrl %eax, %eax
+ addq %rdi, %rax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(last_vec_x1):
+ bsrl %eax, %eax
+ addl $VEC_SIZE, %eax
+ addq %rdi, %rax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(last_vec_x2):
+ bsrl %eax, %eax
+ addl $(VEC_SIZE * 2), %eax
+ addq %rdi, %rax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(last_vec_x3):
+ bsrl %eax, %eax
+ addl $(VEC_SIZE * 3), %eax
+ addq %rdi, %rax
+ ret
+
+ .p2align 4
+L(last_vec_x1_check):
+ bsrl %eax, %eax
+ subq $(VEC_SIZE * 3), %rdx
+ addq %rax, %rdx
+ jl L(zero)
+ addl $VEC_SIZE, %eax
+ addq %rdi, %rax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(last_vec_x3_check):
+ bsrl %eax, %eax
+ subq $VEC_SIZE, %rdx
+ addq %rax, %rdx
+ jl L(zero)
+ addl $(VEC_SIZE * 3), %eax
+ addq %rdi, %rax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(zero):
+ VZEROUPPER
+L(null):
+ xorl %eax, %eax
+ ret
+
+ .p2align 4
+L(last_vec_or_less_aligned):
+ movl %edx, %ecx
+
+ vpcmpeqb (%rdi), %ymm0, %ymm1
+
+ movl $1, %edx
+ /* Support rdx << 32: a 64-bit shift is needed, since a 32-bit
+ shift would take the count modulo 32. */
+ salq %cl, %rdx
+ subq $1, %rdx
+
+ vpmovmskb %ymm1, %eax
+
+ /* Remove the trailing bytes. */
+ andl %edx, %eax
+ testl %eax, %eax
+ jz L(zero)
+
+ bsrl %eax, %eax
+ addq %rdi, %rax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(last_vec_or_less):
+ addl $VEC_SIZE, %edx
+
+ /* Check for zero length. */
+ testl %edx, %edx
+ jz L(null)
+
+ movl %edi, %ecx
+ andl $(VEC_SIZE - 1), %ecx
+ jz L(last_vec_or_less_aligned)
+
+ movl %ecx, %esi
+ movl %ecx, %r8d
+ addl %edx, %esi
+ andq $-VEC_SIZE, %rdi
+
+ subl $VEC_SIZE, %esi
+ ja L(last_vec_2x_aligned)
+
+ /* Check the last VEC. */
+ vpcmpeqb (%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+
+ /* Remove the leading and trailing bytes. */
+ sarl %cl, %eax
+ movl %edx, %ecx
+
+ movl $1, %edx
+ sall %cl, %edx
+ subl $1, %edx
+
+ andl %edx, %eax
+ testl %eax, %eax
+ jz L(zero)
+
+ bsrl %eax, %eax
+ addq %rdi, %rax
+ addq %r8, %rax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(last_vec_2x_aligned):
+ movl %esi, %ecx
+
+ /* Check the last VEC. */
+ vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm1
+
+ movl $1, %edx
+ sall %cl, %edx
+ subl $1, %edx
+
+ vpmovmskb %ymm1, %eax
+
+ /* Remove the trailing bytes. */
+ andl %edx, %eax
+
+ testl %eax, %eax
+ jnz L(last_vec_x1)
+
+ /* Check the second last VEC. */
+ vpcmpeqb (%rdi), %ymm0, %ymm1
+
+ movl %r8d, %ecx
+
+ vpmovmskb %ymm1, %eax
+
+ /* Remove the leading bytes. Must use unsigned right shift for
+ bsrl below. */
+ shrl %cl, %eax
+ testl %eax, %eax
+ jz L(zero)
+
+ bsrl %eax, %eax
+ addq %rdi, %rax
+ addq %r8, %rax
+ VZEROUPPER
+ ret
+END (__memrchr_avx2)
+#endif
diff --git a/sysdeps/x86_64/multiarch/memrchr.S b/sysdeps/x86_64/multiarch/memrchr.S
new file mode 100644
index 0000000..20a9e76
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memrchr.S
@@ -0,0 +1,55 @@
+/* Multiple versions of memrchr
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+ .text
+ENTRY(__memrchr)
+ .type __memrchr, @gnu_indirect_function
+ LOAD_RTLD_GLOBAL_RO_RDX
+ HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+ jnz 1f
+ HAS_ARCH_FEATURE (AVX2_Usable)
+ jz 1f
+ HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+ jz 1f
+ leaq __memrchr_avx2(%rip), %rax
+ ret
+
+1: leaq __memrchr_sse2(%rip), %rax
+ ret
+END(__memrchr)
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __memrchr_sse2, @function; \
+ .p2align 4; \
+ .globl __memrchr_sse2; \
+ .hidden __memrchr_sse2; \
+ __memrchr_sse2: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __memrchr_sse2, .-__memrchr_sse2
+#endif
+
+#include "../memrchr.S"
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=adb18859214a0e365eddc6e253789ae366c573c3
commit adb18859214a0e365eddc6e253789ae366c573c3
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Mon May 22 15:09:50 2017 -0700
x86-64: Optimize strchr/strchrnul/wcschr with AVX2
Optimize strchr/strchrnul/wcschr with AVX2 to search 32 bytes with vector
instructions. It is as fast as the SSE2 versions for size <= 16 bytes and up
to 1X faster for size > 16 bytes on Haswell. Select the AVX2 version on
AVX2 machines where vzeroupper is preferred and AVX unaligned load is fast.
NB: It uses TZCNT instead of BSF since TZCNT produces the same result
as BSF for non-zero input. TZCNT is faster than BSF and is executed
as BSF if the machine doesn't support TZCNT.
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
strchr-avx2, strchrnul-avx2 and wcschr-avx2.
* sysdeps/x86_64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Add tests for __strchr_avx2,
__strchrnul_avx2, __strchrnul_sse2, __wcschr_avx2 and
__wcschr_sse2.
* sysdeps/x86_64/multiarch/strchr-avx2.S: New file.
* sysdeps/x86_64/multiarch/strchrnul-avx2.S: Likewise.
* sysdeps/x86_64/multiarch/strchrnul.S: Likewise.
* sysdeps/x86_64/multiarch/wcschr-avx2.S: Likewise.
* sysdeps/x86_64/multiarch/wcschr.S: Likewise.
* sysdeps/x86_64/multiarch/strchr.S (strchr): Add support for
__strchr_avx2.
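strchr-avx2.S below compares each 32-byte vector against both CHAR and the null byte in parallel, ORs the two results, and takes TZCNT of the combined mask, so a single branch finds whichever comes first; strchr then needs one scalar check to turn a terminator hit into NULL, while strchrnul returns the position either way. A scalar C model (strchr_model is an illustrative name; like the real code, it assumes the 32-byte reads cannot fault, which the assembly arranges by aligning):

  #include <stdint.h>

  static char *
  strchr_model (const char *s, int c)
  {
    for (;; s += 32)
      {
        uint32_t mask = 0;
        for (int i = 0; i < 32; i++)    /* vpcmpeqb x2 + vpor + vpmovmskb */
          if (s[i] == (char) c || s[i] == '\0')
            mask |= 1u << i;
        if (mask)
          {
            const char *hit = s + __builtin_ctz (mask);   /* tzcntl */
            /* strchrnul would return hit unconditionally; strchr must
               return NULL when the first hit is the terminator and c is
               not '\0' -- the cmovne on each exit path.  */
            return *hit == (char) c ? (char *) hit : NULL;
          }
      }
  }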
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index f33f21a..2974495 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -13,6 +13,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
memcpy-ssse3-back \
memmove-ssse3-back \
memmove-avx512-no-vzeroupper strcasecmp_l-ssse3 \
+ strchr-avx2 strchrnul-avx2 \
strlen-avx2 strnlen-avx2 \
strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
@@ -37,5 +38,6 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
wmemchr-avx2 \
wmemcmp-avx2 \
wcscpy-ssse3 wcscpy-c \
+ wcschr-avx2 \
wcslen-avx2 wcsnlen-avx2
endif
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index c2b07b3..d3fa98f 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -230,9 +230,19 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strchr.S. */
IFUNC_IMPL (i, name, strchr,
+ IFUNC_IMPL_ADD (array, i, strchr,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __strchr_avx2)
IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2_no_bsf)
IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2))
+ /* Support sysdeps/x86_64/multiarch/strchrnul.S. */
+ IFUNC_IMPL (i, name, strchrnul,
+ IFUNC_IMPL_ADD (array, i, strchrnul,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __strchrnul_avx2)
+ IFUNC_IMPL_ADD (array, i, strchrnul, 1, __strchrnul_sse2))
+
/* Support sysdeps/x86_64/multiarch/strcmp.S. */
IFUNC_IMPL (i, name, strcmp,
IFUNC_IMPL_ADD (array, i, strcmp, HAS_CPU_FEATURE (SSE4_2),
@@ -317,6 +327,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2))
+ /* Support sysdeps/x86_64/multiarch/wcschr.S. */
+ IFUNC_IMPL (i, name, wcschr,
+ IFUNC_IMPL_ADD (array, i, wcschr,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __wcschr_avx2)
+ IFUNC_IMPL_ADD (array, i, wcschr, 1, __wcschr_sse2))
+
/* Support sysdeps/x86_64/multiarch/wcscpy.S. */
IFUNC_IMPL (i, name, wcscpy,
IFUNC_IMPL_ADD (array, i, wcscpy, HAS_CPU_FEATURE (SSSE3),
diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
new file mode 100644
index 0000000..e4292d3
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
@@ -0,0 +1,254 @@
+/* strchr/strchrnul optimized with AVX2.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef STRCHR
+# define STRCHR __strchr_avx2
+# endif
+
+# ifdef USE_AS_WCSCHR
+# define VPBROADCAST vpbroadcastd
+# define VPCMPEQ vpcmpeqd
+# define CHAR_REG esi
+# else
+# define VPBROADCAST vpbroadcastb
+# define VPCMPEQ vpcmpeqb
+# define CHAR_REG sil
+# endif
+
+# ifndef VZEROUPPER
+# define VZEROUPPER vzeroupper
+# endif
+
+# define VEC_SIZE 32
+
+ .section .text.avx,"ax",@progbits
+ENTRY (STRCHR)
+ movl %edi, %ecx
+ /* Broadcast CHAR to YMM0. */
+ vmovd %esi, %xmm0
+ vpxor %xmm9, %xmm9, %xmm9
+ VPBROADCAST %xmm0, %ymm0
+ /* Check if we may cross page boundary with one vector load. */
+ andl $(2 * VEC_SIZE - 1), %ecx
+ cmpl $VEC_SIZE, %ecx
+ ja L(cros_page_boundary)
+
+ /* Check the first VEC_SIZE bytes. Search for both CHAR and the
+ null byte. */
+ vmovdqu (%rdi), %ymm8
+ VPCMPEQ %ymm8, %ymm0, %ymm1
+ VPCMPEQ %ymm8, %ymm9, %ymm2
+ vpor %ymm1, %ymm2, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x0)
+
+ /* Align data for aligned loads in the loop. */
+ addq $VEC_SIZE, %rdi
+ andl $(VEC_SIZE - 1), %ecx
+ andq $-VEC_SIZE, %rdi
+
+ jmp L(more_4x_vec)
+
+ .p2align 4
+L(cros_page_boundary):
+ andl $(VEC_SIZE - 1), %ecx
+ andq $-VEC_SIZE, %rdi
+ vmovdqu (%rdi), %ymm8
+ VPCMPEQ %ymm8, %ymm0, %ymm1
+ VPCMPEQ %ymm8, %ymm9, %ymm2
+ vpor %ymm1, %ymm2, %ymm1
+ vpmovmskb %ymm1, %eax
+ /* Remove the leading bytes. */
+ sarl %cl, %eax
+ testl %eax, %eax
+ jz L(aligned_more)
+ /* Found CHAR or the null byte. */
+ tzcntl %eax, %eax
+ addq %rcx, %rax
+# ifdef USE_AS_STRCHRNUL
+ addq %rdi, %rax
+# else
+ xorl %edx, %edx
+ leaq (%rdi, %rax), %rax
+ cmp (%rax), %CHAR_REG
+ cmovne %rdx, %rax
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(aligned_more):
+ addq $VEC_SIZE, %rdi
+
+L(more_4x_vec):
+ /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
+ since data is only aligned to VEC_SIZE. */
+ vmovdqa (%rdi), %ymm8
+ VPCMPEQ %ymm8, %ymm0, %ymm1
+ VPCMPEQ %ymm8, %ymm9, %ymm2
+ vpor %ymm1, %ymm2, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x0)
+
+ vmovdqa VEC_SIZE(%rdi), %ymm8
+ VPCMPEQ %ymm8, %ymm0, %ymm1
+ VPCMPEQ %ymm8, %ymm9, %ymm2
+ vpor %ymm1, %ymm2, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x1)
+
+ vmovdqa (VEC_SIZE * 2)(%rdi), %ymm8
+ VPCMPEQ %ymm8, %ymm0, %ymm1
+ VPCMPEQ %ymm8, %ymm9, %ymm2
+ vpor %ymm1, %ymm2, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x2)
+
+ vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
+ VPCMPEQ %ymm8, %ymm0, %ymm1
+ VPCMPEQ %ymm8, %ymm9, %ymm2
+ vpor %ymm1, %ymm2, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x3)
+
+ addq $(VEC_SIZE * 4), %rdi
+
+ /* Align data to 4 * VEC_SIZE. */
+ movq %rdi, %rcx
+ andl $(4 * VEC_SIZE - 1), %ecx
+ andq $-(4 * VEC_SIZE), %rdi
+
+ .p2align 4
+L(loop_4x_vec):
+ /* Compare 4 * VEC at a time forward. */
+ vmovdqa (%rdi), %ymm5
+ vmovdqa VEC_SIZE(%rdi), %ymm6
+ vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7
+ vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
+
+ VPCMPEQ %ymm5, %ymm0, %ymm1
+ VPCMPEQ %ymm6, %ymm0, %ymm2
+ VPCMPEQ %ymm7, %ymm0, %ymm3
+ VPCMPEQ %ymm8, %ymm0, %ymm4
+
+ VPCMPEQ %ymm5, %ymm9, %ymm5
+ VPCMPEQ %ymm6, %ymm9, %ymm6
+ VPCMPEQ %ymm7, %ymm9, %ymm7
+ VPCMPEQ %ymm8, %ymm9, %ymm8
+
+ vpor %ymm1, %ymm5, %ymm1
+ vpor %ymm2, %ymm6, %ymm2
+ vpor %ymm3, %ymm7, %ymm3
+ vpor %ymm4, %ymm8, %ymm4
+
+ vpor %ymm1, %ymm2, %ymm5
+ vpor %ymm3, %ymm4, %ymm6
+
+ vpor %ymm5, %ymm6, %ymm5
+
+ vpmovmskb %ymm5, %eax
+ testl %eax, %eax
+ jnz L(4x_vec_end)
+
+ addq $(VEC_SIZE * 4), %rdi
+
+ jmp L(loop_4x_vec)
+
+ .p2align 4
+L(first_vec_x0):
+ /* Found CHAR or the null byte. */
+ tzcntl %eax, %eax
+# ifdef USE_AS_STRCHRNUL
+ addq %rdi, %rax
+# else
+ xorl %edx, %edx
+ leaq (%rdi, %rax), %rax
+ cmp (%rax), %CHAR_REG
+ cmovne %rdx, %rax
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(first_vec_x1):
+ tzcntl %eax, %eax
+# ifdef USE_AS_STRCHRNUL
+ addq $VEC_SIZE, %rax
+ addq %rdi, %rax
+# else
+ xorl %edx, %edx
+ leaq VEC_SIZE(%rdi, %rax), %rax
+ cmp (%rax), %CHAR_REG
+ cmovne %rdx, %rax
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(first_vec_x2):
+ tzcntl %eax, %eax
+# ifdef USE_AS_STRCHRNUL
+ addq $(VEC_SIZE * 2), %rax
+ addq %rdi, %rax
+# else
+ xorl %edx, %edx
+ leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
+ cmp (%rax), %CHAR_REG
+ cmovne %rdx, %rax
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(4x_vec_end):
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x0)
+ vpmovmskb %ymm2, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x1)
+ vpmovmskb %ymm3, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x2)
+ vpmovmskb %ymm4, %eax
+ testl %eax, %eax
+L(first_vec_x3):
+ tzcntl %eax, %eax
+# ifdef USE_AS_STRCHRNUL
+ addq $(VEC_SIZE * 3), %rax
+ addq %rdi, %rax
+# else
+ xorl %edx, %edx
+ leaq (VEC_SIZE * 3)(%rdi, %rax), %rax
+ cmp (%rax), %CHAR_REG
+ cmovne %rdx, %rax
+# endif
+ VZEROUPPER
+ ret
+
+END (STRCHR)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strchr.S b/sysdeps/x86_64/multiarch/strchr.S
index c9f54ca..f83ba6e 100644
--- a/sysdeps/x86_64/multiarch/strchr.S
+++ b/sysdeps/x86_64/multiarch/strchr.S
@@ -26,6 +26,15 @@
ENTRY(strchr)
.type strchr, @gnu_indirect_function
LOAD_RTLD_GLOBAL_RO_RDX
+ HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+ jnz 1f
+ HAS_ARCH_FEATURE (AVX2_Usable)
+ jz 1f
+ HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+ jz 1f
+ leaq __strchr_avx2(%rip), %rax
+ ret
+1:
leaq __strchr_sse2(%rip), %rax
2: HAS_ARCH_FEATURE (Slow_BSF)
jz 3f
diff --git a/sysdeps/x86_64/multiarch/strchrnul-avx2.S b/sysdeps/x86_64/multiarch/strchrnul-avx2.S
new file mode 100644
index 0000000..fa0cc09
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strchrnul-avx2.S
@@ -0,0 +1,3 @@
+#define STRCHR __strchrnul_avx2
+#define USE_AS_STRCHRNUL 1
+#include "strchr-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strchr.S b/sysdeps/x86_64/multiarch/strchrnul.S
similarity index 51%
copy from sysdeps/x86_64/multiarch/strchr.S
copy to sysdeps/x86_64/multiarch/strchrnul.S
index c9f54ca..4df9c39 100644
--- a/sysdeps/x86_64/multiarch/strchr.S
+++ b/sysdeps/x86_64/multiarch/strchrnul.S
@@ -1,5 +1,6 @@
-/* Multiple versions of strchr
- Copyright (C) 2009-2017 Free Software Foundation, Inc.
+/* Multiple versions of strchrnul
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2017 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -19,39 +20,36 @@
#include <sysdep.h>
#include <init-arch.h>
-
-/* Define multiple versions only for the definition in libc. */
+/* Define multiple versions only for the definition in libc. */
#if IS_IN (libc)
.text
-ENTRY(strchr)
- .type strchr, @gnu_indirect_function
+ENTRY(__strchrnul)
+ .type __strchrnul, @gnu_indirect_function
LOAD_RTLD_GLOBAL_RO_RDX
- leaq __strchr_sse2(%rip), %rax
-2: HAS_ARCH_FEATURE (Slow_BSF)
- jz 3f
- leaq __strchr_sse2_no_bsf(%rip), %rax
-3: ret
-END(strchr)
-
-
+ HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+ jnz 1f
+ HAS_ARCH_FEATURE (AVX2_Usable)
+ jz 1f
+ HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+ jz 1f
+ leaq __strchrnul_avx2(%rip), %rax
+ ret
+
+1: leaq __strchrnul_sse2(%rip), %rax
+ ret
+END(__strchrnul)
# undef ENTRY
# define ENTRY(name) \
- .type __strchr_sse2, @function; \
- .align 16; \
- .globl __strchr_sse2; \
- .hidden __strchr_sse2; \
- __strchr_sse2: cfi_startproc; \
+ .type __strchrnul_sse2, @function; \
+ .p2align 4; \
+ .globl __strchrnul_sse2; \
+ .hidden __strchrnul_sse2; \
+ __strchrnul_sse2: cfi_startproc; \
CALL_MCOUNT
# undef END
# define END(name) \
- cfi_endproc; .size __strchr_sse2, .-__strchr_sse2
-# undef libc_hidden_builtin_def
-/* It doesn't make sense to send libc-internal strchr calls through a PLT.
- The speedup we get from using SSE4.2 instruction is likely eaten away
- by the indirect call in the PLT. */
-# define libc_hidden_builtin_def(name) \
- .globl __GI_strchr; __GI_strchr = __strchr_sse2
+ cfi_endproc; .size __strchrnul_sse2, .-__strchrnul_sse2
#endif
-#include "../strchr.S"
+#include "../strchrnul.S"
diff --git a/sysdeps/x86_64/multiarch/wcschr-avx2.S b/sysdeps/x86_64/multiarch/wcschr-avx2.S
new file mode 100644
index 0000000..67726b6
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcschr-avx2.S
@@ -0,0 +1,3 @@
+#define STRCHR __wcschr_avx2
+#define USE_AS_WCSCHR 1
+#include "strchr-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcschr.S b/sysdeps/x86_64/multiarch/wcschr.S
new file mode 100644
index 0000000..e132acc
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcschr.S
@@ -0,0 +1,67 @@
+/* Multiple versions of wcschr
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+ .text
+ENTRY(__wcschr)
+ .type __wcschr, @gnu_indirect_function
+ LOAD_RTLD_GLOBAL_RO_RDX
+ HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+ jnz 1f
+ HAS_ARCH_FEATURE (AVX2_Usable)
+ jz 1f
+ HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+ jz 1f
+ leaq __wcschr_avx2(%rip), %rax
+ ret
+
+1: leaq __wcschr_sse2(%rip), %rax
+ ret
+END(__wcschr)
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __wcschr_sse2, @function; \
+ .p2align 4; \
+ .globl __wcschr_sse2; \
+ .hidden __wcschr_sse2; \
+ __wcschr_sse2: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __wcschr_sse2, .-__wcschr_sse2
+
+# ifdef SHARED
+/* It doesn't make sense to send libc-internal wcschr calls through a PLT.
+ The speedup we get from using AVX2 instructions is likely eaten away
+ by the indirect call in the PLT. */
+# undef libc_hidden_def
+# define libc_hidden_def(name) \
+ .globl __GI___wcschr; __GI___wcschr = __wcschr_sse2
+# undef libc_hidden_weak
+# define libc_hidden_weak(name) \
+ .weak __GI_wcschr; __GI_wcschr = __wcschr_sse2
+# endif
+#endif
+
+#include "../wcschr.S"
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=c67ef62d5961ad7630fa9bb413eac252100f70c1
commit c67ef62d5961ad7630fa9bb413eac252100f70c1
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Fri May 19 12:19:42 2017 -0700
x86-64: Optimize strlen/strnlen/wcslen/wcsnlen with AVX2
Optimize strlen/strnlen/wcslen/wcsnlen with AVX2 to check 32 bytes with
a single vector compare instruction. It is as fast as the SSE2 versions
for size <= 16 bytes and up to 1X faster for size > 16 bytes on Haswell.
Select the AVX2 version on AVX2 machines where vzeroupper is preferred
and AVX unaligned load is fast.
NB: It uses TZCNT instead of BSF since TZCNT produces the same result
as BSF for non-zero input. TZCNT is faster than BSF and is executed
as BSF if the machine doesn't support TZCNT.
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
strlen-avx2, strnlen-avx2 and wcslen-avx2 and wcsnlen-avx2.
* sysdeps/x86_64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Add tests for __strlen_avx2,
__strlen_sse2, __strnlen_avx2, __strnlen_sse2, __wcslen_avx2,
__wcslen_sse2, __wcsnlen_avx2 and __wcsnlen_sse2.
* sysdeps/x86_64/multiarch/strlen-avx2.S: New file.
* sysdeps/x86_64/multiarch/strlen.S: Likewise.
* sysdeps/x86_64/multiarch/strnlen-avx2.S: Likewise.
* sysdeps/x86_64/multiarch/strnlen.S: Likewise.
* sysdeps/x86_64/multiarch/wcslen-avx2.S: Likewise.
* sysdeps/x86_64/multiarch/wcslen.S: Likewise.
* sysdeps/x86_64/multiarch/wcsnlen-avx2.S: Likewise.
* sysdeps/x86_64/multiarch/wcsnlen.S: Likewise.
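The main loop of strlen-avx2.S below checks four 32-byte vectors per iteration with VPMINU: the byte-wise unsigned minimum of the four has a zero byte exactly where at least one of them does, so a single compare-against-zero covers 128 bytes, and only after a hit does L(4x_vec_end) re-test the vectors individually. A scalar C model of the fold (min4 and strlen_model are illustrative names; like the real code, it assumes reads past the terminator stay within readable memory):

  #include <stddef.h>

  /* Lane-wise minimum of four bytes, the scalar analogue of three
     vpminub instructions.  */
  static unsigned char
  min4 (unsigned char a, unsigned char b, unsigned char c, unsigned char d)
  {
    unsigned char ab = a < b ? a : b;
    unsigned char cd = c < d ? c : d;
    return ab < cd ? ab : cd;
  }

  static size_t
  strlen_model (const char *s)
  {
    const char *p = s;                  /* The real code aligns p first.  */
    for (;; p += 128)
      {
        int hit = 0;
        for (int i = 0; i < 32; i++)    /* vpminub x3 + vpcmpeqb + test */
          if (min4 (p[i], p[i + 32], p[i + 64], p[i + 96]) == 0)
            hit = 1;
        if (!hit)
          continue;
        for (size_t i = 0; ; i++)       /* L(4x_vec_end): locate the nul.  */
          if (p[i] == '\0')
            return (size_t) (p - s) + i;
      }
  }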
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 48aba0f..f33f21a 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -13,6 +13,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
memcpy-ssse3-back \
memmove-ssse3-back \
memmove-avx512-no-vzeroupper strcasecmp_l-ssse3 \
+ strlen-avx2 strnlen-avx2 \
strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
strcpy-sse2-unaligned strncpy-sse2-unaligned \
@@ -35,5 +36,6 @@ ifeq ($(subdir),wcsmbs)
sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
wmemchr-avx2 \
wmemcmp-avx2 \
- wcscpy-ssse3 wcscpy-c
+ wcscpy-ssse3 wcscpy-c \
+ wcslen-avx2 wcsnlen-avx2
endif
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index ae09241..c2b07b3 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -165,6 +165,20 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__rawmemchr_avx2)
IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_sse2))
+ /* Support sysdeps/x86_64/multiarch/strlen.S. */
+ IFUNC_IMPL (i, name, strlen,
+ IFUNC_IMPL_ADD (array, i, strlen,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __strlen_avx2)
+ IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2))
+
+ /* Support sysdeps/x86_64/multiarch/strnlen.S. */
+ IFUNC_IMPL (i, name, strnlen,
+ IFUNC_IMPL_ADD (array, i, strnlen,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __strnlen_avx2)
+ IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2))
+
/* Support sysdeps/x86_64/multiarch/stpncpy.S. */
IFUNC_IMPL (i, name, stpncpy,
IFUNC_IMPL_ADD (array, i, stpncpy, HAS_CPU_FEATURE (SSSE3),
@@ -309,6 +323,20 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__wcscpy_ssse3)
IFUNC_IMPL_ADD (array, i, wcscpy, 1, __wcscpy_sse2))
+ /* Support sysdeps/x86_64/multiarch/wcslen.S. */
+ IFUNC_IMPL (i, name, wcslen,
+ IFUNC_IMPL_ADD (array, i, wcslen,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __wcslen_avx2)
+ IFUNC_IMPL_ADD (array, i, wcslen, 1, __wcslen_sse2))
+
+ /* Support sysdeps/x86_64/multiarch/wcsnlen.S. */
+ IFUNC_IMPL (i, name, wcsnlen,
+ IFUNC_IMPL_ADD (array, i, wcsnlen,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __wcsnlen_avx2)
+ IFUNC_IMPL_ADD (array, i, wcsnlen, 1, __wcsnlen_sse2))
+
/* Support sysdeps/x86_64/multiarch/wmemchr.S. */
IFUNC_IMPL (i, name, wmemchr,
IFUNC_IMPL_ADD (array, i, wmemchr,
diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
new file mode 100644
index 0000000..1dc823a
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
@@ -0,0 +1,394 @@
+/* strlen/strnlen/wcslen/wcsnlen optimized with AVX2.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef STRLEN
+# define STRLEN __strlen_avx2
+# endif
+
+# ifdef USE_AS_WCSLEN
+# define VPCMPEQ vpcmpeqd
+# define VPMINU vpminud
+# else
+# define VPCMPEQ vpcmpeqb
+# define VPMINU vpminub
+# endif
+
+# ifndef VZEROUPPER
+# define VZEROUPPER vzeroupper
+# endif
+
+# define VEC_SIZE 32
+
+ .section .text.avx,"ax",@progbits
+ENTRY (STRLEN)
+# ifdef USE_AS_STRNLEN
+ /* Check for zero length. */
+ testq %rsi, %rsi
+ jz L(zero)
+# ifdef USE_AS_WCSLEN
+ shl $2, %rsi
+# endif
+ movq %rsi, %r8
+# endif
+ movl %edi, %ecx
+ movq %rdi, %rdx
+ vpxor %xmm0, %xmm0, %xmm0
+
+ /* Check if we may cross page boundary with one vector load. */
+ andl $(2 * VEC_SIZE - 1), %ecx
+ cmpl $VEC_SIZE, %ecx
+ ja L(cros_page_boundary)
+
+ /* Check the first VEC_SIZE bytes. */
+ VPCMPEQ (%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+
+# ifdef USE_AS_STRNLEN
+ jnz L(first_vec_x0_check)
+ /* Adjust length and check the end of data. */
+ subq $VEC_SIZE, %rsi
+ jbe L(max)
+# else
+ jnz L(first_vec_x0)
+# endif
+
+ /* Align data for aligned loads in the loop. */
+ addq $VEC_SIZE, %rdi
+ andl $(VEC_SIZE - 1), %ecx
+ andq $-VEC_SIZE, %rdi
+
+# ifdef USE_AS_STRNLEN
+ /* Adjust length. */
+ addq %rcx, %rsi
+
+ subq $(VEC_SIZE * 4), %rsi
+ jbe L(last_4x_vec_or_less)
+# endif
+ jmp L(more_4x_vec)
+
+ .p2align 4
+L(cros_page_boundary):
+ andl $(VEC_SIZE - 1), %ecx
+ andq $-VEC_SIZE, %rdi
+ VPCMPEQ (%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ /* Remove the leading bytes. */
+ sarl %cl, %eax
+ testl %eax, %eax
+ jz L(aligned_more)
+ tzcntl %eax, %eax
+# ifdef USE_AS_STRNLEN
+ /* Check the end of data. */
+ cmpq %rax, %rsi
+ jbe L(max)
+# endif
+ addq %rdi, %rax
+ addq %rcx, %rax
+ subq %rdx, %rax
+# ifdef USE_AS_WCSLEN
+ shrq $2, %rax
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(aligned_more):
+# ifdef USE_AS_STRNLEN
+ /* "rcx" is less than VEC_SIZE. Calculate "rdx + rcx - VEC_SIZE"
+ with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE"
+ to avoid possible addition overflow. */
+ negq %rcx
+ addq $VEC_SIZE, %rcx
+
+ /* Check the end of data. */
+ subq %rcx, %rsi
+ jbe L(max)
+# endif
+
+ addq $VEC_SIZE, %rdi
+
+# ifdef USE_AS_STRNLEN
+ subq $(VEC_SIZE * 4), %rsi
+ jbe L(last_4x_vec_or_less)
+# endif
+
+L(more_4x_vec):
+ /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
+ since data is only aligned to VEC_SIZE. */
+ VPCMPEQ (%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x0)
+
+ VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x1)
+
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x2)
+
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x3)
+
+ addq $(VEC_SIZE * 4), %rdi
+
+# ifdef USE_AS_STRNLEN
+ subq $(VEC_SIZE * 4), %rsi
+ jbe L(last_4x_vec_or_less)
+# endif
+
+ /* Align data to 4 * VEC_SIZE. */
+ movq %rdi, %rcx
+ andl $(4 * VEC_SIZE - 1), %ecx
+ andq $-(4 * VEC_SIZE), %rdi
+
+# ifdef USE_AS_STRNLEN
+ /* Adjust length. */
+ addq %rcx, %rsi
+# endif
+
+ .p2align 4
+L(loop_4x_vec):
+ /* Compare 4 * VEC at a time forward. */
+ vmovdqa (%rdi), %ymm1
+ vmovdqa VEC_SIZE(%rdi), %ymm2
+ vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3
+ vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4
+ VPMINU %ymm1, %ymm2, %ymm5
+ VPMINU %ymm3, %ymm4, %ymm6
+ VPMINU %ymm5, %ymm6, %ymm5
+
+ VPCMPEQ %ymm5, %ymm0, %ymm5
+ vpmovmskb %ymm5, %eax
+ testl %eax, %eax
+ jnz L(4x_vec_end)
+
+ addq $(VEC_SIZE * 4), %rdi
+
+# ifndef USE_AS_STRNLEN
+ jmp L(loop_4x_vec)
+# else
+ subq $(VEC_SIZE * 4), %rsi
+ ja L(loop_4x_vec)
+
+L(last_4x_vec_or_less):
+ /* Less than 4 * VEC and aligned to VEC_SIZE. */
+ addl $(VEC_SIZE * 2), %esi
+ jle L(last_2x_vec)
+
+ VPCMPEQ (%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x0)
+
+ VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x1)
+
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+
+ jnz L(first_vec_x2_check)
+ subl $VEC_SIZE, %esi
+ jle L(max)
+
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+
+ jnz L(first_vec_x3_check)
+ movq %r8, %rax
+# ifdef USE_AS_WCSLEN
+ shrq $2, %rax
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(last_2x_vec):
+ addl $(VEC_SIZE * 2), %esi
+ VPCMPEQ (%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+
+ jnz L(first_vec_x0_check)
+ subl $VEC_SIZE, %esi
+ jle L(max)
+
+ VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x1_check)
+ movq %r8, %rax
+# ifdef USE_AS_WCSLEN
+ shrq $2, %rax
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(first_vec_x0_check):
+ tzcntl %eax, %eax
+ /* Check the end of data. */
+ cmpq %rax, %rsi
+ jbe L(max)
+ addq %rdi, %rax
+ subq %rdx, %rax
+# ifdef USE_AS_WCSLEN
+ shrq $2, %rax
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(first_vec_x1_check):
+ tzcntl %eax, %eax
+ /* Check the end of data. */
+ cmpq %rax, %rsi
+ jbe L(max)
+ addq $VEC_SIZE, %rax
+ addq %rdi, %rax
+ subq %rdx, %rax
+# ifdef USE_AS_WCSLEN
+ shrq $2, %rax
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(first_vec_x2_check):
+ tzcntl %eax, %eax
+ /* Check the end of data. */
+ cmpq %rax, %rsi
+ jbe L(max)
+ addq $(VEC_SIZE * 2), %rax
+ addq %rdi, %rax
+ subq %rdx, %rax
+# ifdef USE_AS_WCSLEN
+ shrq $2, %rax
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(first_vec_x3_check):
+ tzcntl %eax, %eax
+ /* Check the end of data. */
+ cmpq %rax, %rsi
+ jbe L(max)
+ addq $(VEC_SIZE * 3), %rax
+ addq %rdi, %rax
+ subq %rdx, %rax
+# ifdef USE_AS_WCSLEN
+ shrq $2, %rax
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(max):
+ movq %r8, %rax
+# ifdef USE_AS_WCSLEN
+ shrq $2, %rax
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(zero):
+ xorl %eax, %eax
+ ret
+# endif
+
+ .p2align 4
+L(first_vec_x0):
+ tzcntl %eax, %eax
+ addq %rdi, %rax
+ subq %rdx, %rax
+# ifdef USE_AS_WCSLEN
+ shrq $2, %rax
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(first_vec_x1):
+ tzcntl %eax, %eax
+ addq $VEC_SIZE, %rax
+ addq %rdi, %rax
+ subq %rdx, %rax
+# ifdef USE_AS_WCSLEN
+ shrq $2, %rax
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(first_vec_x2):
+ tzcntl %eax, %eax
+ addq $(VEC_SIZE * 2), %rax
+ addq %rdi, %rax
+ subq %rdx, %rax
+# ifdef USE_AS_WCSLEN
+ shrq $2, %rax
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(4x_vec_end):
+ VPCMPEQ %ymm1, %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x0)
+ VPCMPEQ %ymm2, %ymm0, %ymm2
+ vpmovmskb %ymm2, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x1)
+ VPCMPEQ %ymm3, %ymm0, %ymm3
+ vpmovmskb %ymm3, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x2)
+ VPCMPEQ %ymm4, %ymm0, %ymm4
+ vpmovmskb %ymm4, %eax
+ testl %eax, %eax
+L(first_vec_x3):
+ tzcntl %eax, %eax
+ addq $(VEC_SIZE * 3), %rax
+ addq %rdi, %rax
+ subq %rdx, %rax
+# ifdef USE_AS_WCSLEN
+ shrq $2, %rax
+# endif
+ VZEROUPPER
+ ret
+
+END (STRLEN)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strlen.S b/sysdeps/x86_64/multiarch/strlen.S
new file mode 100644
index 0000000..2847440
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlen.S
@@ -0,0 +1,64 @@
+/* Multiple versions of strlen
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+ .text
+ENTRY(strlen)
+ .type strlen, @gnu_indirect_function
+ LOAD_RTLD_GLOBAL_RO_RDX
+ HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+ jnz 1f
+ HAS_ARCH_FEATURE (AVX2_Usable)
+ jz 1f
+ HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+ jz 1f
+ leaq __strlen_avx2(%rip), %rax
+ ret
+
+1: leaq __strlen_sse2(%rip), %rax
+ ret
+END(strlen)
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __strlen_sse2, @function; \
+ .p2align 4; \
+ .globl __strlen_sse2; \
+ .hidden __strlen_sse2; \
+ __strlen_sse2: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __strlen_sse2, .-__strlen_sse2
+
+# ifdef SHARED
+# undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal strlen calls through a PLT.
+ The speedup we get from using AVX2 instructions is likely eaten away
+ by the indirect call in the PLT. */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_strlen; __GI_strlen = __strlen_sse2
+# endif
+#endif
+
+#include "../strlen.S"
diff --git a/sysdeps/x86_64/multiarch/strnlen-avx2.S b/sysdeps/x86_64/multiarch/strnlen-avx2.S
new file mode 100644
index 0000000..c4062b2
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strnlen-avx2.S
@@ -0,0 +1,4 @@
+#define STRLEN __strnlen_avx2
+#define USE_AS_STRNLEN 1
+
+#include "strlen-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strnlen.S b/sysdeps/x86_64/multiarch/strnlen.S
new file mode 100644
index 0000000..0c2289a
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strnlen.S
@@ -0,0 +1,65 @@
+/* Multiple versions of strnlen
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+ .text
+ENTRY(__strnlen)
+ .type __strnlen, @gnu_indirect_function
+ LOAD_RTLD_GLOBAL_RO_RDX
+ HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+ jnz 1f
+ HAS_ARCH_FEATURE (AVX2_Usable)
+ jz 1f
+ HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+ jz 1f
+ leaq __strnlen_avx2(%rip), %rax
+ ret
+
+1: leaq __strnlen_sse2(%rip), %rax
+ ret
+END(__strnlen)
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __strnlen_sse2, @function; \
+ .p2align 4; \
+ .globl __strnlen_sse2; \
+ .hidden __strnlen_sse2; \
+ __strnlen_sse2: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __strnlen_sse2, .-__strnlen_sse2
+
+# ifdef SHARED
+/* It doesn't make sense to send libc-internal strnlen calls through a PLT.
+ The speedup we get from using AVX2 instructions is likely eaten away
+ by the indirect call in the PLT. */
+# undef libc_hidden_def
+# define libc_hidden_def(name) \
+ .globl __GI_strnlen; __GI_strnlen = __strnlen_sse2; \
+ .globl __GI___strnlen; __GI___strnlen = __strnlen_sse2
+# endif
+#endif
+
+#include "../strnlen.S"
diff --git a/sysdeps/x86_64/multiarch/wcslen-avx2.S b/sysdeps/x86_64/multiarch/wcslen-avx2.S
new file mode 100644
index 0000000..c9224f1
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcslen-avx2.S
@@ -0,0 +1,4 @@
+#define STRLEN __wcslen_avx2
+#define USE_AS_WCSLEN 1
+
+#include "strlen-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcslen.S b/sysdeps/x86_64/multiarch/wcslen.S
new file mode 100644
index 0000000..04369b6
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcslen.S
@@ -0,0 +1,55 @@
+/* Multiple versions of wcslen
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+ .text
+ENTRY(__wcslen)
+ .type __wcslen, @gnu_indirect_function
+ LOAD_RTLD_GLOBAL_RO_RDX
+ HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+ jnz 1f
+ HAS_ARCH_FEATURE (AVX2_Usable)
+ jz 1f
+ HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+ jz 1f
+ leaq __wcslen_avx2(%rip), %rax
+ ret
+
+1: leaq __wcslen_sse2(%rip), %rax
+ ret
+END(__wcslen)
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __wcslen_sse2, @function; \
+ .p2align 4; \
+ .globl __wcslen_sse2; \
+ .hidden __wcslen_sse2; \
+ __wcslen_sse2: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __wcslen_sse2, .-__wcslen_sse2
+#endif
+
+#include "../wcslen.S"
diff --git a/sysdeps/x86_64/multiarch/wcsnlen-avx2.S b/sysdeps/x86_64/multiarch/wcsnlen-avx2.S
new file mode 100644
index 0000000..fac8354
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsnlen-avx2.S
@@ -0,0 +1,5 @@
+#define STRLEN __wcsnlen_avx2
+#define USE_AS_WCSLEN 1
+#define USE_AS_STRNLEN 1
+
+#include "strlen-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcsnlen.S b/sysdeps/x86_64/multiarch/wcsnlen.S
new file mode 100644
index 0000000..0893bea
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsnlen.S
@@ -0,0 +1,55 @@
+/* Multiple versions of wcsnlen
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+ .text
+ENTRY(__wcsnlen)
+ .type __wcsnlen, @gnu_indirect_function
+ LOAD_RTLD_GLOBAL_RO_RDX
+ HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+ jnz 1f
+ HAS_ARCH_FEATURE (AVX2_Usable)
+ jz 1f
+ HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+ jz 1f
+ leaq __wcsnlen_avx2(%rip), %rax
+ ret
+
+1: leaq __wcsnlen_sse2(%rip), %rax
+ ret
+END(__wcsnlen)
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __wcsnlen_sse2, @function; \
+ .p2align 4; \
+ .globl __wcsnlen_sse2; \
+ .hidden __wcsnlen_sse2; \
+ __wcsnlen_sse2: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __wcsnlen_sse2, .-__wcsnlen_sse2
+#endif
+
+#include "../wcsnlen.S"
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=a9c26d841ea4cb4d271093926f6bb6e79a8fe72a
commit a9c26d841ea4cb4d271093926f6bb6e79a8fe72a
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Thu May 18 11:10:09 2017 -0700
x86-64: Optimize memchr/rawmemchr/wmemchr with SSE2/AVX2
SSE2 memchr is extended to support wmemchr. AVX2 memchr/rawmemchr/wmemchr
are added to search 32 bytes with a single vector compare instruction.
AVX2 memchr/rawmemchr/wmemchr are as fast as SSE2 memchr/rawmemchr/wmemchr
for small sizes and up to 1.5X faster for larger sizes on Haswell and
Skylake. Select AVX2 memchr/rawmemchr/wmemchr on AVX2 machines where
vzeroupper is preferred and AVX unaligned load is fast.
NB: It uses TZCNT instead of BSF since TZCNT produces the same result
as BSF for non-zero input. TZCNT is faster than BSF and is executed
as BSF if the machine doesn't support TZCNT.
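The per-vector probe at the heart of the AVX2 version is easier to see
in intrinsics form. The sketch below is illustrative only (probe32 is
a made-up helper, not glibc code); it mirrors the vpbroadcastb /
VPCMPEQ / vpmovmskb / tzcnt sequence but omits the page-crossing,
alignment and length handling that memchr-avx2.S performs:
#include <immintrin.h>
#include <stddef.h>
/* Check one 32-byte window for byte C.  Return a pointer to the
   first match, or NULL if the window contains none.  */
static const char *
probe32 (const char *p, int c)
{
  __m256i needle = _mm256_set1_epi8 ((char) c);    /* vpbroadcastb */
  __m256i data = _mm256_loadu_si256 ((const __m256i *) p);
  __m256i eq = _mm256_cmpeq_epi8 (data, needle);   /* VPCMPEQ */
  unsigned int mask = (unsigned int) _mm256_movemask_epi8 (eq);
  if (mask == 0)
    return NULL;
  /* __builtin_ctz compiles to tzcnt where available; for a non-zero
     mask it gives the byte index of the first match.  */
  return p + __builtin_ctz (mask);
}
The main loop runs four such compares per iteration and ORs the
results together, so a single branch covers 4 * VEC_SIZE bytes.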
* sysdeps/x86_64/memchr.S (MEMCHR): New. Defined as wmemchr or
memchr, depending on whether USE_AS_WMEMCHR is defined.
(PCMPEQ): Likewise.
(memchr): Renamed to ...
(MEMCHR): This. Support wmemchr if USE_AS_WMEMCHR is defined.
Replace pcmpeqb with PCMPEQ.
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
memchr-avx2, rawmemchr-avx2 and wmemchr-avx2.
* sysdeps/x86_64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Test __memchr_avx2, __memchr_sse2,
__rawmemchr_avx2, __rawmemchr_sse2, __wmemchr_avx2 and
__wmemchr_sse2.
* sysdeps/x86_64/multiarch/memchr-avx2.S: New file.
* sysdeps/x86_64/multiarch/memchr.S: Likewise.
* sysdeps/x86_64/multiarch/rawmemchr-avx2.S: Likewise.
* sysdeps/x86_64/multiarch/rawmemchr.S: Likewise.
* sysdeps/x86_64/multiarch/wmemchr-avx2.S: Likewise.
* sysdeps/x86_64/multiarch/wmemchr.S: Likewise.
* sysdeps/x86_64/wmemchr.S: Likewise.
diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S
index d3be012..3167cd8 100644
--- a/sysdeps/x86_64/memchr.S
+++ b/sysdeps/x86_64/memchr.S
@@ -18,17 +18,31 @@
#include <sysdep.h>
+#ifdef USE_AS_WMEMCHR
+# define MEMCHR wmemchr
+# define PCMPEQ pcmpeqd
+#else
+# define MEMCHR memchr
+# define PCMPEQ pcmpeqb
+#endif
+
/* Fast SSE2 version using pmaxub and a 64-byte loop.  */
.text
-ENTRY(memchr)
+ENTRY(MEMCHR)
movd %esi, %xmm1
mov %edi, %ecx
+#ifdef USE_AS_WMEMCHR
+ test %rdx, %rdx
+ jz L(return_null)
+ shl $2, %rdx
+#else
punpcklbw %xmm1, %xmm1
test %rdx, %rdx
jz L(return_null)
punpcklbw %xmm1, %xmm1
+#endif
and $63, %ecx
pshufd $0, %xmm1, %xmm1
@@ -37,7 +51,7 @@ ENTRY(memchr)
ja L(crosscache)
movdqu (%rdi), %xmm0
- pcmpeqb %xmm1, %xmm0
+ PCMPEQ %xmm1, %xmm0
pmovmskb %xmm0, %eax
test %eax, %eax
@@ -58,7 +72,7 @@ L(crosscache):
and $-16, %rdi
movdqa (%rdi), %xmm0
- pcmpeqb %xmm1, %xmm0
+ PCMPEQ %xmm1, %xmm0
/* Check if there is a match. */
pmovmskb %xmm0, %eax
/* Remove the leading bytes. */
@@ -90,25 +104,25 @@ L(unaligned_no_match):
.p2align 4
L(loop_prolog):
movdqa (%rdi), %xmm0
- pcmpeqb %xmm1, %xmm0
+ PCMPEQ %xmm1, %xmm0
pmovmskb %xmm0, %eax
test %eax, %eax
jnz L(matches)
movdqa 16(%rdi), %xmm2
- pcmpeqb %xmm1, %xmm2
+ PCMPEQ %xmm1, %xmm2
pmovmskb %xmm2, %eax
test %eax, %eax
jnz L(matches16)
movdqa 32(%rdi), %xmm3
- pcmpeqb %xmm1, %xmm3
+ PCMPEQ %xmm1, %xmm3
pmovmskb %xmm3, %eax
test %eax, %eax
jnz L(matches32)
movdqa 48(%rdi), %xmm4
- pcmpeqb %xmm1, %xmm4
+ PCMPEQ %xmm1, %xmm4
add $64, %rdi
pmovmskb %xmm4, %eax
test %eax, %eax
@@ -121,25 +135,25 @@ L(loop_prolog):
jbe L(exit_loop)
movdqa (%rdi), %xmm0
- pcmpeqb %xmm1, %xmm0
+ PCMPEQ %xmm1, %xmm0
pmovmskb %xmm0, %eax
test %eax, %eax
jnz L(matches)
movdqa 16(%rdi), %xmm2
- pcmpeqb %xmm1, %xmm2
+ PCMPEQ %xmm1, %xmm2
pmovmskb %xmm2, %eax
test %eax, %eax
jnz L(matches16)
movdqa 32(%rdi), %xmm3
- pcmpeqb %xmm1, %xmm3
+ PCMPEQ %xmm1, %xmm3
pmovmskb %xmm3, %eax
test %eax, %eax
jnz L(matches32)
movdqa 48(%rdi), %xmm3
- pcmpeqb %xmm1, %xmm3
+ PCMPEQ %xmm1, %xmm3
pmovmskb %xmm3, %eax
add $64, %rdi
@@ -160,10 +174,10 @@ L(align64_loop):
movdqa 32(%rdi), %xmm3
movdqa 48(%rdi), %xmm4
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm1, %xmm2
- pcmpeqb %xmm1, %xmm3
- pcmpeqb %xmm1, %xmm4
+ PCMPEQ %xmm1, %xmm0
+ PCMPEQ %xmm1, %xmm2
+ PCMPEQ %xmm1, %xmm3
+ PCMPEQ %xmm1, %xmm4
pmaxub %xmm0, %xmm3
pmaxub %xmm2, %xmm4
@@ -186,9 +200,9 @@ L(align64_loop):
jnz L(matches16)
movdqa 32(%rdi), %xmm3
- pcmpeqb %xmm1, %xmm3
+ PCMPEQ %xmm1, %xmm3
- pcmpeqb 48(%rdi), %xmm1
+ PCMPEQ 48(%rdi), %xmm1
pmovmskb %xmm3, %eax
test %eax, %eax
jnz L(matches32)
@@ -204,26 +218,26 @@ L(exit_loop):
jle L(exit_loop_32)
movdqa (%rdi), %xmm0
- pcmpeqb %xmm1, %xmm0
+ PCMPEQ %xmm1, %xmm0
pmovmskb %xmm0, %eax
test %eax, %eax
jnz L(matches)
movdqa 16(%rdi), %xmm2
- pcmpeqb %xmm1, %xmm2
+ PCMPEQ %xmm1, %xmm2
pmovmskb %xmm2, %eax
test %eax, %eax
jnz L(matches16)
movdqa 32(%rdi), %xmm3
- pcmpeqb %xmm1, %xmm3
+ PCMPEQ %xmm1, %xmm3
pmovmskb %xmm3, %eax
test %eax, %eax
jnz L(matches32_1)
sub $16, %edx
jle L(return_null)
- pcmpeqb 48(%rdi), %xmm1
+ PCMPEQ 48(%rdi), %xmm1
pmovmskb %xmm1, %eax
test %eax, %eax
jnz L(matches48_1)
@@ -234,14 +248,14 @@ L(exit_loop):
L(exit_loop_32):
add $32, %edx
movdqa (%rdi), %xmm0
- pcmpeqb %xmm1, %xmm0
+ PCMPEQ %xmm1, %xmm0
pmovmskb %xmm0, %eax
test %eax, %eax
jnz L(matches_1)
sub $16, %edx
jbe L(return_null)
- pcmpeqb 16(%rdi), %xmm1
+ PCMPEQ 16(%rdi), %xmm1
pmovmskb %xmm1, %eax
test %eax, %eax
jnz L(matches16_1)
@@ -308,8 +322,13 @@ L(matches48_1):
L(return_null):
xor %eax, %eax
ret
-END(memchr)
+END(MEMCHR)
+#ifdef USE_AS_WMEMCHR
+libc_hidden_def (__wmemchr)
+weak_alias (__wmemchr, wmemchr)
+libc_hidden_weak (wmemchr)
+#else
strong_alias (memchr, __memchr)
-
libc_hidden_builtin_def(memchr)
+#endif
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index a62def3..48aba0f 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -6,6 +6,7 @@ ifeq ($(subdir),string)
sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
strcmp-sse2-unaligned strncmp-ssse3 \
+ memchr-avx2 rawmemchr-avx2 \
memcmp-avx2 \
memcmp-sse4 memcpy-ssse3 \
memmove-ssse3 \
@@ -32,6 +33,7 @@ endif
ifeq ($(subdir),wcsmbs)
sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
+ wmemchr-avx2 \
wmemcmp-avx2 \
wcscpy-ssse3 wcscpy-c
endif
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 35f1960..ae09241 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -38,6 +38,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
size_t i = 0;
+ /* Support sysdeps/x86_64/multiarch/memchr.S. */
+ IFUNC_IMPL (i, name, memchr,
+ IFUNC_IMPL_ADD (array, i, memchr,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __memchr_avx2)
+ IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_sse2))
+
/* Support sysdeps/x86_64/multiarch/memcmp.S. */
IFUNC_IMPL (i, name, memcmp,
IFUNC_IMPL_ADD (array, i, memcmp,
@@ -151,6 +158,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__memset_avx512_no_vzeroupper)
)
+ /* Support sysdeps/x86_64/multiarch/rawmemchr.S. */
+ IFUNC_IMPL (i, name, rawmemchr,
+ IFUNC_IMPL_ADD (array, i, rawmemchr,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __rawmemchr_avx2)
+ IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_sse2))
+
/* Support sysdeps/x86_64/multiarch/stpncpy.S. */
IFUNC_IMPL (i, name, stpncpy,
IFUNC_IMPL_ADD (array, i, stpncpy, HAS_CPU_FEATURE (SSSE3),
@@ -295,6 +309,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__wcscpy_ssse3)
IFUNC_IMPL_ADD (array, i, wcscpy, 1, __wcscpy_sse2))
+ /* Support sysdeps/x86_64/multiarch/wmemchr.S. */
+ IFUNC_IMPL (i, name, wmemchr,
+ IFUNC_IMPL_ADD (array, i, wmemchr,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __wmemchr_avx2)
+ IFUNC_IMPL_ADD (array, i, wmemchr, 1, __wmemchr_sse2))
+
/* Support sysdeps/x86_64/multiarch/wmemcmp.S. */
IFUNC_IMPL (i, name, wmemcmp,
IFUNC_IMPL_ADD (array, i, wmemcmp,
diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
new file mode 100644
index 0000000..a7275ed
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
@@ -0,0 +1,340 @@
+/* memchr/wmemchr optimized with AVX2.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef MEMCHR
+# define MEMCHR __memchr_avx2
+# endif
+
+# ifdef USE_AS_WMEMCHR
+# define VPCMPEQ vpcmpeqd
+# else
+# define VPCMPEQ vpcmpeqb
+# endif
+
+# ifndef VZEROUPPER
+# define VZEROUPPER vzeroupper
+# endif
+
+# define VEC_SIZE 32
+
+ .section .text.avx,"ax",@progbits
+ENTRY (MEMCHR)
+# ifndef USE_AS_RAWMEMCHR
+ /* Check for zero length. */
+ testq %rdx, %rdx
+ jz L(null)
+# endif
+ movl %edi, %ecx
+ /* Broadcast CHAR to YMM0. */
+ vmovd %esi, %xmm0
+# ifdef USE_AS_WMEMCHR
+ shl $2, %rdx
+ vpbroadcastd %xmm0, %ymm0
+# else
+ vpbroadcastb %xmm0, %ymm0
+# endif
+ /* Check if we may cross page boundary with one vector load. */
+ andl $(2 * VEC_SIZE - 1), %ecx
+ cmpl $VEC_SIZE, %ecx
+ ja L(cros_page_boundary)
+
+ /* Check the first VEC_SIZE bytes. */
+ VPCMPEQ (%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+
+# ifndef USE_AS_RAWMEMCHR
+ jnz L(first_vec_x0_check)
+ /* Adjust length and check the end of data. */
+ subq $VEC_SIZE, %rdx
+ jbe L(zero)
+# else
+ jnz L(first_vec_x0)
+# endif
+
+ /* Align data for aligned loads in the loop. */
+ addq $VEC_SIZE, %rdi
+ andl $(VEC_SIZE - 1), %ecx
+ andq $-VEC_SIZE, %rdi
+
+# ifndef USE_AS_RAWMEMCHR
+ /* Adjust length. */
+ addq %rcx, %rdx
+
+ subq $(VEC_SIZE * 4), %rdx
+ jbe L(last_4x_vec_or_less)
+# endif
+ jmp L(more_4x_vec)
+
+ .p2align 4
+L(cros_page_boundary):
+ andl $(VEC_SIZE - 1), %ecx
+ andq $-VEC_SIZE, %rdi
+ VPCMPEQ (%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ /* Remove the leading bytes. */
+ sarl %cl, %eax
+ testl %eax, %eax
+ jz L(aligned_more)
+ tzcntl %eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+ /* Check the end of data. */
+ cmpq %rax, %rdx
+ jbe L(zero)
+# endif
+ addq %rdi, %rax
+ addq %rcx, %rax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(aligned_more):
+# ifndef USE_AS_RAWMEMCHR
+ /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
+ instead of "(rdx + rcx) - VEC_SIZE" to avoid possible addition
+ overflow. */
+ negq %rcx
+ addq $VEC_SIZE, %rcx
+
+ /* Check the end of data. */
+ subq %rcx, %rdx
+ jbe L(zero)
+# endif
+
+ addq $VEC_SIZE, %rdi
+
+# ifndef USE_AS_RAWMEMCHR
+ subq $(VEC_SIZE * 4), %rdx
+ jbe L(last_4x_vec_or_less)
+# endif
+
+L(more_4x_vec):
+ /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
+ since data is only aligned to VEC_SIZE. */
+ VPCMPEQ (%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x0)
+
+ VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x1)
+
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x2)
+
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x3)
+
+ addq $(VEC_SIZE * 4), %rdi
+
+# ifndef USE_AS_RAWMEMCHR
+ subq $(VEC_SIZE * 4), %rdx
+ jbe L(last_4x_vec_or_less)
+# endif
+
+ /* Align data to 4 * VEC_SIZE. */
+ movq %rdi, %rcx
+ andl $(4 * VEC_SIZE - 1), %ecx
+ andq $-(4 * VEC_SIZE), %rdi
+
+# ifndef USE_AS_RAWMEMCHR
+ /* Adjust length. */
+ addq %rcx, %rdx
+# endif
+
+ .p2align 4
+L(loop_4x_vec):
+ /* Compare 4 * VEC at a time forward. */
+ VPCMPEQ (%rdi), %ymm0, %ymm1
+ VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
+
+ vpor %ymm1, %ymm2, %ymm5
+ vpor %ymm3, %ymm4, %ymm6
+ vpor %ymm5, %ymm6, %ymm5
+
+ vpmovmskb %ymm5, %eax
+ testl %eax, %eax
+ jnz L(4x_vec_end)
+
+ addq $(VEC_SIZE * 4), %rdi
+
+# ifdef USE_AS_RAWMEMCHR
+ jmp L(loop_4x_vec)
+# else
+ subq $(VEC_SIZE * 4), %rdx
+ ja L(loop_4x_vec)
+
+L(last_4x_vec_or_less):
+ /* Less than 4 * VEC and aligned to VEC_SIZE. */
+ addl $(VEC_SIZE * 2), %edx
+ jle L(last_2x_vec)
+
+ VPCMPEQ (%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x0)
+
+ VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x1)
+
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+
+ jnz L(first_vec_x2_check)
+ subl $VEC_SIZE, %edx
+ jle L(zero)
+
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+
+ jnz L(first_vec_x3_check)
+ xorl %eax, %eax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(last_2x_vec):
+ addl $(VEC_SIZE * 2), %edx
+ VPCMPEQ (%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+
+ jnz L(first_vec_x0_check)
+ subl $VEC_SIZE, %edx
+ jle L(zero)
+
+ VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x1_check)
+ xorl %eax, %eax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(first_vec_x0_check):
+ tzcntl %eax, %eax
+ /* Check the end of data. */
+ cmpq %rax, %rdx
+ jbe L(zero)
+ addq %rdi, %rax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(first_vec_x1_check):
+ tzcntl %eax, %eax
+ /* Check the end of data. */
+ cmpq %rax, %rdx
+ jbe L(zero)
+ addq $VEC_SIZE, %rax
+ addq %rdi, %rax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(first_vec_x2_check):
+ tzcntl %eax, %eax
+ /* Check the end of data. */
+ cmpq %rax, %rdx
+ jbe L(zero)
+ addq $(VEC_SIZE * 2), %rax
+ addq %rdi, %rax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(first_vec_x3_check):
+ tzcntl %eax, %eax
+ /* Check the end of data. */
+ cmpq %rax, %rdx
+ jbe L(zero)
+ addq $(VEC_SIZE * 3), %rax
+ addq %rdi, %rax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(zero):
+ VZEROUPPER
+L(null):
+ xorl %eax, %eax
+ ret
+# endif
+
+ .p2align 4
+L(first_vec_x0):
+ tzcntl %eax, %eax
+ addq %rdi, %rax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(first_vec_x1):
+ tzcntl %eax, %eax
+ addq $VEC_SIZE, %rax
+ addq %rdi, %rax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(first_vec_x2):
+ tzcntl %eax, %eax
+ addq $(VEC_SIZE * 2), %rax
+ addq %rdi, %rax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(4x_vec_end):
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x0)
+ vpmovmskb %ymm2, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x1)
+ vpmovmskb %ymm3, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x2)
+ vpmovmskb %ymm4, %eax
+ testl %eax, %eax
+L(first_vec_x3):
+ tzcntl %eax, %eax
+ addq $(VEC_SIZE * 3), %rax
+ addq %rdi, %rax
+ VZEROUPPER
+ ret
+
+END (MEMCHR)
+#endif
diff --git a/sysdeps/x86_64/multiarch/memchr.S b/sysdeps/x86_64/multiarch/memchr.S
new file mode 100644
index 0000000..dee3fd1
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memchr.S
@@ -0,0 +1,64 @@
+/* Multiple versions of memchr
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+ .text
+ENTRY(memchr)
+ .type memchr, @gnu_indirect_function
+ LOAD_RTLD_GLOBAL_RO_RDX
+ HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+ jnz 1f
+ HAS_ARCH_FEATURE (AVX2_Usable)
+ jz 1f
+ HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+ jz 1f
+ leaq __memchr_avx2(%rip), %rax
+ ret
+
+1: leaq __memchr_sse2(%rip), %rax
+ ret
+END(memchr)
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __memchr_sse2, @function; \
+ .p2align 4; \
+ .globl __memchr_sse2; \
+ .hidden __memchr_sse2; \
+ __memchr_sse2: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __memchr_sse2, .-__memchr_sse2
+
+# ifdef SHARED
+# undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal memchr calls through a PLT.
+ The speedup we get from using AVX2 instructions is likely eaten away
+ by the indirect call in the PLT. */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_memchr; __GI_memchr = __memchr_sse2
+# endif
+#endif
+
+#include "../memchr.S"
diff --git a/sysdeps/x86_64/multiarch/rawmemchr-avx2.S b/sysdeps/x86_64/multiarch/rawmemchr-avx2.S
new file mode 100644
index 0000000..128f9ea
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/rawmemchr-avx2.S
@@ -0,0 +1,4 @@
+#define MEMCHR __rawmemchr_avx2
+#define USE_AS_RAWMEMCHR 1
+
+#include "memchr-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/rawmemchr.S b/sysdeps/x86_64/multiarch/rawmemchr.S
new file mode 100644
index 0000000..b938f13
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/rawmemchr.S
@@ -0,0 +1,64 @@
+/* Multiple versions of rawmemchr
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+ .text
+ENTRY(__rawmemchr)
+ .type __rawmemchr, @gnu_indirect_function
+ LOAD_RTLD_GLOBAL_RO_RDX
+ HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+ jnz 1f
+ HAS_ARCH_FEATURE (AVX2_Usable)
+ jz 1f
+ HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+ jz 1f
+ leaq __rawmemchr_avx2(%rip), %rax
+ ret
+
+1: leaq __rawmemchr_sse2(%rip), %rax
+ ret
+END(__rawmemchr)
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __rawmemchr_sse2, @function; \
+ .p2align 4; \
+ .globl __rawmemchr_sse2; \
+ .hidden __rawmemchr_sse2; \
+ __rawmemchr_sse2: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __rawmemchr_sse2, .-__rawmemchr_sse2
+
+# ifdef SHARED
+/* It doesn't make sense to send libc-internal rawmemchr calls through a
+ PLT. The speedup we get from using AVX2 instructions is likely eaten
+ away by the indirect call in the PLT. */
+# undef libc_hidden_def
+# define libc_hidden_def(name) \
+ .globl __GI___rawmemchr; __GI___rawmemchr = __rawmemchr_sse2
+# endif
+#endif
+
+#include "../rawmemchr.S"
diff --git a/sysdeps/x86_64/multiarch/wmemchr-avx2.S b/sysdeps/x86_64/multiarch/wmemchr-avx2.S
new file mode 100644
index 0000000..282854f
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemchr-avx2.S
@@ -0,0 +1,4 @@
+#define MEMCHR __wmemchr_avx2
+#define USE_AS_WMEMCHR 1
+
+#include "memchr-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wmemchr.S b/sysdeps/x86_64/multiarch/wmemchr.S
new file mode 100644
index 0000000..b79df23
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemchr.S
@@ -0,0 +1,67 @@
+/* Multiple versions of wmemchr
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+ .text
+ENTRY(__wmemchr)
+ .type __wmemchr, @gnu_indirect_function
+ LOAD_RTLD_GLOBAL_RO_RDX
+ HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+ jnz 1f
+ HAS_ARCH_FEATURE (AVX2_Usable)
+ jz 1f
+ HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+ jz 1f
+ leaq __wmemchr_avx2(%rip), %rax
+ ret
+
+1: leaq __wmemchr_sse2(%rip), %rax
+ ret
+END(__wmemchr)
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __wmemchr_sse2, @function; \
+ .p2align 4; \
+ .globl __wmemchr_sse2; \
+ .hidden __wmemchr_sse2; \
+ __wmemchr_sse2: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __wmemchr_sse2, .-__wmemchr_sse2
+
+# ifdef SHARED
+/* It doesn't make sense to send libc-internal wmemchr calls through a PLT.
+ The speedup we get from using AVX2 instructions is likely eaten away
+ by the indirect call in the PLT. */
+# undef libc_hidden_def
+# define libc_hidden_def(name) \
+ .globl __GI___wmemchr; __GI___wmemchr = __wmemchr_sse2
+# undef libc_hidden_weak
+# define libc_hidden_weak(name) \
+ .weak __GI_wmemchr; __GI_wmemchr = __wmemchr_sse2
+# endif
+#endif
+
+#include "../wmemchr.S"
diff --git a/sysdeps/x86_64/wmemchr.S b/sysdeps/x86_64/wmemchr.S
new file mode 100644
index 0000000..9d8079b
--- /dev/null
+++ b/sysdeps/x86_64/wmemchr.S
@@ -0,0 +1,3 @@
+#define USE_AS_WMEMCHR 1
+
+#include "memchr.S"
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=33a455e5a983fc7ef5187f8a64ee467426fc892d
commit 33a455e5a983fc7ef5187f8a64ee467426fc892d
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Tue Jul 19 09:44:05 2016 -0700
x86-64: Optimize memcmp/wmemcmp with AVX2
Optimize x86-64 memcmp/wmemcmp with AVX2. It uses vector compare as
much as possible. It is as fast as SSE4 memcmp for size <= 16 bytes
and up to 2X faster for size > 16 bytes on Haswell and Skylake. Select
AVX2 memcmp/wmemcmp on AVX2 machines where vzeroupper is preferred and
AVX unaligned load is fast.
NB: It uses TZCNT instead of BSF since TZCNT produces the same result
as BSF for non-zero input. TZCNT is faster than BSF and is executed
as BSF if the machine doesn't support TZCNT.
Key features:
1. Use overlapping compare to avoid branch (see the sketch after this list).
2. Use vector compare when size >= 4 bytes for memcmp or size >= 8
bytes for wmemcmp.
3. If size is 8 * VEC_SIZE or less, unroll the loop.
4. Compare 4 * VEC_SIZE at a time with the aligned first memory area.
5. Use 2 vector compares when size is 2 * VEC_SIZE or less.
6. Use 4 vector compares when size is 4 * VEC_SIZE or less.
7. Use 8 vector compares when size is 8 * VEC_SIZE or less.
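A scalar sketch of key feature 1, assuming 8 <= n <= 16 and checking
equality only (equal_8_to_16 is illustrative; the real code does the
same with 16- and 32-byte vectors and then locates the differing byte
with tzcnt):
#include <string.h>
#include <stddef.h>
/* Compare the first and last 8 bytes of each buffer.  When n < 16
   the two loads overlap, so every byte in [0, n) is covered without
   any branch on the exact size.  */
static int
equal_8_to_16 (const void *a, const void *b, size_t n)
{
  unsigned long long a0, a1, b0, b1;
  memcpy (&a0, a, 8);
  memcpy (&b0, b, 8);
  memcpy (&a1, (const char *) a + n - 8, 8);
  memcpy (&b1, (const char *) b + n - 8, 8);
  return a0 == b0 && a1 == b1;
}
This is the shape of the "leaq -VEC_SIZE(%rdi, %rdx)" sequences in the
diff below: back up from the end by one vector and recompare, accepting
that some bytes are checked twice.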
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
memcmp-avx2 and wmemcmp-avx2.
* sysdeps/x86_64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Test __memcmp_avx2 and __wmemcmp_avx2.
* sysdeps/x86_64/multiarch/memcmp-avx2.S: New file.
* sysdeps/x86_64/multiarch/wmemcmp-avx2.S: Likewise.
* sysdeps/x86_64/multiarch/memcmp.S: Use __memcmp_avx2 on AVX2
machines if AVX unaligned load is fast and vzeroupper is
preferred.
* sysdeps/x86_64/multiarch/wmemcmp.S: Use __wmemcmp_avx2 on AVX2
machines if AVX unaligned load is fast and vzeroupper is
preferred.
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 3736f54..a62def3 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -6,6 +6,7 @@ ifeq ($(subdir),string)
sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
strcmp-sse2-unaligned strncmp-ssse3 \
+ memcmp-avx2 \
memcmp-sse4 memcpy-ssse3 \
memmove-ssse3 \
memcpy-ssse3-back \
@@ -30,5 +31,7 @@ CFLAGS-strspn-c.c += -msse4
endif
ifeq ($(subdir),wcsmbs)
-sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c wcscpy-ssse3 wcscpy-c
+sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
+ wmemcmp-avx2 \
+ wcscpy-ssse3 wcscpy-c
endif
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index a91d2f9..35f1960 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -40,6 +40,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/memcmp.S. */
IFUNC_IMPL (i, name, memcmp,
+ IFUNC_IMPL_ADD (array, i, memcmp,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __memcmp_avx2)
IFUNC_IMPL_ADD (array, i, memcmp, HAS_CPU_FEATURE (SSE4_1),
__memcmp_sse4_1)
IFUNC_IMPL_ADD (array, i, memcmp, HAS_CPU_FEATURE (SSSE3),
@@ -294,6 +297,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/wmemcmp.S. */
IFUNC_IMPL (i, name, wmemcmp,
+ IFUNC_IMPL_ADD (array, i, wmemcmp,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __wmemcmp_avx2)
IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_CPU_FEATURE (SSE4_1),
__wmemcmp_sse4_1)
IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_CPU_FEATURE (SSSE3),
diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2.S b/sysdeps/x86_64/multiarch/memcmp-avx2.S
new file mode 100644
index 0000000..eb3a4cb
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcmp-avx2.S
@@ -0,0 +1,430 @@
+/* memcmp/wmemcmp optimized with AVX2.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+/* memcmp/wmemcmp is implemented as:
+ 1. Use overlapping compare to avoid branch.
+ 2. Use vector compare when size >= 4 bytes for memcmp or size >= 8
+ bytes for wmemcmp.
+ 3. If size is 8 * VEC_SIZE or less, unroll the loop.
+ 4. Compare 4 * VEC_SIZE at a time with the aligned first memory
+ area.
+ 5. Use 2 vector compares when size is 2 * VEC_SIZE or less.
+ 6. Use 4 vector compares when size is 4 * VEC_SIZE or less.
+ 7. Use 8 vector compares when size is 8 * VEC_SIZE or less. */
+
+# include <sysdep.h>
+
+# ifndef MEMCMP
+# define MEMCMP __memcmp_avx2
+# endif
+
+# ifdef USE_AS_WMEMCMP
+# define VPCMPEQ vpcmpeqd
+# else
+# define VPCMPEQ vpcmpeqb
+# endif
+
+# ifndef VZEROUPPER
+# define VZEROUPPER vzeroupper
+# endif
+
+# define VEC_SIZE 32
+# define VEC_MASK ((1 << VEC_SIZE) - 1)
+
+/* Warning!
+ wmemcmp has to use SIGNED comparison for elements.
+ memcmp has to use UNSIGNED comparison for elements.
+*/
+
+ .section .text.avx,"ax",@progbits
+ENTRY (MEMCMP)
+# ifdef USE_AS_WMEMCMP
+ shl $2, %rdx
+# endif
+ cmpq $VEC_SIZE, %rdx
+ jb L(less_vec)
+ cmpq $(VEC_SIZE * 2), %rdx
+ ja L(more_2x_vec)
+
+L(last_2x_vec):
+ /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */
+ vmovdqu (%rsi), %ymm2
+ VPCMPEQ (%rdi), %ymm2, %ymm2
+ vpmovmskb %ymm2, %eax
+ subl $VEC_MASK, %eax
+ jnz L(first_vec)
+
+L(last_vec):
+ leaq -VEC_SIZE(%rdi, %rdx), %rdi
+ leaq -VEC_SIZE(%rsi, %rdx), %rsi
+ vmovdqu (%rsi), %ymm2
+ VPCMPEQ (%rdi), %ymm2, %ymm2
+ vpmovmskb %ymm2, %eax
+ subl $VEC_MASK, %eax
+ jnz L(first_vec)
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(first_vec):
+ /* A byte or int32 is different within 16 or 32 bytes. */
+ tzcntl %eax, %ecx
+# ifdef USE_AS_WMEMCMP
+ xorl %eax, %eax
+ movl (%rdi, %rcx), %edx
+ cmpl (%rsi, %rcx), %edx
+L(wmemcmp_return):
+ setl %al
+ negl %eax
+ orl $1, %eax
+# else
+ movzbl (%rdi, %rcx), %eax
+ movzbl (%rsi, %rcx), %edx
+ sub %edx, %eax
+# endif
+ VZEROUPPER
+ ret
+
+# ifdef USE_AS_WMEMCMP
+ .p2align 4
+L(4):
+ xorl %eax, %eax
+ movl (%rdi), %edx
+ cmpl (%rsi), %edx
+ jne L(wmemcmp_return)
+ ret
+# else
+ .p2align 4
+L(between_4_7):
+ vmovd (%rdi), %xmm1
+ vmovd (%rsi), %xmm2
+ VPCMPEQ %xmm1, %xmm2, %xmm2
+ vpmovmskb %xmm2, %eax
+ subl $0xffff, %eax
+ jnz L(first_vec)
+ leaq -4(%rdi, %rdx), %rdi
+ leaq -4(%rsi, %rdx), %rsi
+ vmovd (%rdi), %xmm1
+ vmovd (%rsi), %xmm2
+ VPCMPEQ %xmm1, %xmm2, %xmm2
+ vpmovmskb %xmm2, %eax
+ subl $0xffff, %eax
+ jnz L(first_vec)
+ ret
+
+ .p2align 4
+L(between_2_3):
+ /* Load 2 bytes into registers. */
+ movzwl (%rdi), %eax
+ movzwl (%rsi), %ecx
+ /* Compare the lowest byte. */
+ cmpb %cl, %al
+ jne L(1byte_reg)
+ /* Load the difference of 2 bytes into EAX. */
+ subl %ecx, %eax
+ /* Return if 2 bytes differ. */
+ jnz L(exit)
+ cmpb $2, %dl
+ /* Return if these are the last 2 bytes. */
+ je L(exit)
+ movzbl 2(%rdi), %eax
+ movzbl 2(%rsi), %ecx
+ subl %ecx, %eax
+ ret
+
+ .p2align 4
+L(exit):
+ ret
+
+ .p2align 4
+L(1byte_reg):
+ movzbl %al, %eax
+ movzbl %cl, %ecx
+ sub %ecx, %eax
+ ret
+
+ .p2align 4
+L(1):
+ movzbl (%rdi), %eax
+ movzbl (%rsi), %ecx
+ subl %ecx, %eax
+ ret
+# endif
+
+ .p2align 4
+L(zero):
+ xorl %eax, %eax
+ ret
+
+ .p2align 4
+L(less_vec):
+# ifdef USE_AS_WMEMCMP
+ /* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes. */
+ cmpb $4, %dl
+ je L(4)
+ jb L(zero)
+# else
+ cmpb $1, %dl
+ je L(1)
+ jb L(zero)
+ cmpb $4, %dl
+ jb L(between_2_3)
+ cmpb $8, %dl
+ jb L(between_4_7)
+# endif
+ cmpb $16, %dl
+ jae L(between_16_31)
+ /* It is between 8 and 15 bytes. */
+ vmovq (%rdi), %xmm1
+ vmovq (%rsi), %xmm2
+ VPCMPEQ %xmm1, %xmm2, %xmm2
+ vpmovmskb %xmm2, %eax
+ subl $0xffff, %eax
+ jnz L(first_vec)
+ leaq -8(%rdi, %rdx), %rdi
+ leaq -8(%rsi, %rdx), %rsi
+ vmovq (%rdi), %xmm1
+ vmovq (%rsi), %xmm2
+ VPCMPEQ %xmm1, %xmm2, %xmm2
+ vpmovmskb %xmm2, %eax
+ subl $0xffff, %eax
+ jnz L(first_vec)
+ ret
+
+ .p2align 4
+L(between_16_31):
+ /* From 16 to 31 bytes. No branch when size == 16. */
+ vmovdqu (%rsi), %xmm2
+ VPCMPEQ (%rdi), %xmm2, %xmm2
+ vpmovmskb %xmm2, %eax
+ subl $0xffff, %eax
+ jnz L(first_vec)
+
+ leaq -16(%rdi, %rdx), %rdi
+ leaq -16(%rsi, %rdx), %rsi
+ vmovdqu (%rsi), %xmm2
+ VPCMPEQ (%rdi), %xmm2, %xmm2
+ vpmovmskb %xmm2, %eax
+ subl $0xffff, %eax
+ jnz L(first_vec)
+ ret
+
+ .p2align 4
+L(more_2x_vec):
+ /* More than 2 * VEC. */
+ cmpq $(VEC_SIZE * 8), %rdx
+ ja L(more_8x_vec)
+ cmpq $(VEC_SIZE * 4), %rdx
+ jb L(last_4x_vec)
+
+ /* From 4 * VEC to 8 * VEC, inclusively. */
+ vmovdqu (%rsi), %ymm1
+ VPCMPEQ (%rdi), %ymm1, %ymm1
+
+ vmovdqu VEC_SIZE(%rsi), %ymm2
+ VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
+
+ vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
+
+ vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
+
+ vpand %ymm1, %ymm2, %ymm5
+ vpand %ymm3, %ymm4, %ymm6
+ vpand %ymm5, %ymm6, %ymm5
+
+ vpmovmskb %ymm5, %eax
+ subl $VEC_MASK, %eax
+ jnz L(4x_vec_end)
+
+ leaq -(4 * VEC_SIZE)(%rdi, %rdx), %rdi
+ leaq -(4 * VEC_SIZE)(%rsi, %rdx), %rsi
+ vmovdqu (%rsi), %ymm1
+ VPCMPEQ (%rdi), %ymm1, %ymm1
+
+ vmovdqu VEC_SIZE(%rsi), %ymm2
+ VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
+ vpand %ymm2, %ymm1, %ymm5
+
+ vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
+ vpand %ymm3, %ymm5, %ymm5
+
+ vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
+ vpand %ymm4, %ymm5, %ymm5
+
+ vpmovmskb %ymm5, %eax
+ subl $VEC_MASK, %eax
+ jnz L(4x_vec_end)
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(more_8x_vec):
+ /* More than 8 * VEC. Check the first VEC. */
+ vmovdqu (%rsi), %ymm2
+ VPCMPEQ (%rdi), %ymm2, %ymm2
+ vpmovmskb %ymm2, %eax
+ subl $VEC_MASK, %eax
+ jnz L(first_vec)
+
+ /* Align the first memory area for aligned loads in the loop.
+ Compute how much the first memory area is misaligned. */
+ movq %rdi, %rcx
+ andl $(VEC_SIZE - 1), %ecx
+ /* Get the negative of offset for alignment. */
+ subq $VEC_SIZE, %rcx
+ /* Adjust the second memory area. */
+ subq %rcx, %rsi
+ /* Adjust the first memory area which should be aligned now. */
+ subq %rcx, %rdi
+ /* Adjust length. */
+ addq %rcx, %rdx
+
+L(loop_4x_vec):
+ /* Compare 4 * VEC at a time forward. */
+ vmovdqu (%rsi), %ymm1
+ VPCMPEQ (%rdi), %ymm1, %ymm1
+
+ vmovdqu VEC_SIZE(%rsi), %ymm2
+ VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
+ vpand %ymm2, %ymm1, %ymm5
+
+ vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
+ vpand %ymm3, %ymm5, %ymm5
+
+ vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
+ vpand %ymm4, %ymm5, %ymm5
+
+ vpmovmskb %ymm5, %eax
+ subl $VEC_MASK, %eax
+ jnz L(4x_vec_end)
+
+ addq $(VEC_SIZE * 4), %rdi
+ addq $(VEC_SIZE * 4), %rsi
+
+ subq $(VEC_SIZE * 4), %rdx
+ cmpq $(VEC_SIZE * 4), %rdx
+ jae L(loop_4x_vec)
+
+ /* Less than 4 * VEC. */
+ cmpq $VEC_SIZE, %rdx
+ jbe L(last_vec)
+ cmpq $(VEC_SIZE * 2), %rdx
+ jbe L(last_2x_vec)
+
+L(last_4x_vec):
+ /* From 2 * VEC to 4 * VEC. */
+ vmovdqu (%rsi), %ymm2
+ VPCMPEQ (%rdi), %ymm2, %ymm2
+ vpmovmskb %ymm2, %eax
+ subl $VEC_MASK, %eax
+ jnz L(first_vec)
+
+ addq $VEC_SIZE, %rdi
+ addq $VEC_SIZE, %rsi
+ vmovdqu (%rsi), %ymm2
+ VPCMPEQ (%rdi), %ymm2, %ymm2
+ vpmovmskb %ymm2, %eax
+ subl $VEC_MASK, %eax
+ jnz L(first_vec)
+
+ leaq -(3 * VEC_SIZE)(%rdi, %rdx), %rdi
+ leaq -(3 * VEC_SIZE)(%rsi, %rdx), %rsi
+ vmovdqu (%rsi), %ymm2
+ VPCMPEQ (%rdi), %ymm2, %ymm2
+ vpmovmskb %ymm2, %eax
+ subl $VEC_MASK, %eax
+ jnz L(first_vec)
+
+ addq $VEC_SIZE, %rdi
+ addq $VEC_SIZE, %rsi
+ vmovdqu (%rsi), %ymm2
+ VPCMPEQ (%rdi), %ymm2, %ymm2
+ vpmovmskb %ymm2, %eax
+ subl $VEC_MASK, %eax
+ jnz L(first_vec)
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(4x_vec_end):
+ vpmovmskb %ymm1, %eax
+ subl $VEC_MASK, %eax
+ jnz L(first_vec)
+ vpmovmskb %ymm2, %eax
+ subl $VEC_MASK, %eax
+ jnz L(first_vec_x1)
+ vpmovmskb %ymm3, %eax
+ subl $VEC_MASK, %eax
+ jnz L(first_vec_x2)
+ vpmovmskb %ymm4, %eax
+ subl $VEC_MASK, %eax
+ tzcntl %eax, %ecx
+# ifdef USE_AS_WMEMCMP
+ xorl %eax, %eax
+ movl (VEC_SIZE * 3)(%rdi, %rcx), %edx
+ cmpl (VEC_SIZE * 3)(%rsi, %rcx), %edx
+ jmp L(wmemcmp_return)
+# else
+ movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
+ movzbl (VEC_SIZE * 3)(%rsi, %rcx), %edx
+ sub %edx, %eax
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(first_vec_x1):
+ tzcntl %eax, %ecx
+# ifdef USE_AS_WMEMCMP
+ xorl %eax, %eax
+ movl VEC_SIZE(%rdi, %rcx), %edx
+ cmpl VEC_SIZE(%rsi, %rcx), %edx
+ jmp L(wmemcmp_return)
+# else
+ movzbl VEC_SIZE(%rdi, %rcx), %eax
+ movzbl VEC_SIZE(%rsi, %rcx), %edx
+ sub %edx, %eax
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(first_vec_x2):
+ tzcntl %eax, %ecx
+# ifdef USE_AS_WMEMCMP
+ xorl %eax, %eax
+ movl (VEC_SIZE * 2)(%rdi, %rcx), %edx
+ cmpl (VEC_SIZE * 2)(%rsi, %rcx), %edx
+ jmp L(wmemcmp_return)
+# else
+ movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
+ movzbl (VEC_SIZE * 2)(%rsi, %rcx), %edx
+ sub %edx, %eax
+# endif
+ VZEROUPPER
+ ret
+END (MEMCMP)
+#endif
diff --git a/sysdeps/x86_64/multiarch/memcmp.S b/sysdeps/x86_64/multiarch/memcmp.S
index 6129820..08acacb 100644
--- a/sysdeps/x86_64/multiarch/memcmp.S
+++ b/sysdeps/x86_64/multiarch/memcmp.S
@@ -27,7 +27,16 @@
ENTRY(memcmp)
.type memcmp, @gnu_indirect_function
LOAD_RTLD_GLOBAL_RO_RDX
- HAS_CPU_FEATURE (SSSE3)
+ HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+ jnz 1f
+ HAS_ARCH_FEATURE (AVX2_Usable)
+ jz 1f
+ HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+ jz 1f
+ leaq __memcmp_avx2(%rip), %rax
+ ret
+
+1: HAS_CPU_FEATURE (SSSE3)
jnz 2f
leaq __memcmp_sse2(%rip), %rax
ret
diff --git a/sysdeps/x86_64/multiarch/wmemcmp-avx2.S b/sysdeps/x86_64/multiarch/wmemcmp-avx2.S
new file mode 100644
index 0000000..aa2190b
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemcmp-avx2.S
@@ -0,0 +1,4 @@
+#define MEMCMP __wmemcmp_avx2
+#define USE_AS_WMEMCMP 1
+
+#include "memcmp-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wmemcmp.S b/sysdeps/x86_64/multiarch/wmemcmp.S
index 5dc54d7..46ee8f5 100644
--- a/sysdeps/x86_64/multiarch/wmemcmp.S
+++ b/sysdeps/x86_64/multiarch/wmemcmp.S
@@ -27,7 +27,16 @@
ENTRY(wmemcmp)
.type wmemcmp, @gnu_indirect_function
LOAD_RTLD_GLOBAL_RO_RDX
- HAS_CPU_FEATURE (SSSE3)
+ HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+ jnz 1f
+ HAS_ARCH_FEATURE (AVX2_Usable)
+ jz 1f
+ HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+ jz 1f
+ leaq __wmemcmp_avx2(%rip), %rax
+ ret
+
+1: HAS_CPU_FEATURE (SSSE3)
jnz 2f
leaq __wmemcmp_sse2(%rip), %rax
ret
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=51597d57a2889eff8bd8661a9c10ecc945ebae9d
commit 51597d57a2889eff8bd8661a9c10ecc945ebae9d
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Sun May 21 09:22:44 2017 -0700
x86-64: Optimize wmemset with SSE2/AVX2/AVX512
The difference between memset and wmemset is byte vs int. Add stubs
to SSE2/AVX2/AVX512 memset for wmemset with updated constant and size:
SSE2 wmemset:
shl $0x2,%rdx
movd %esi,%xmm0
mov %rdi,%rax
pshufd $0x0,%xmm0,%xmm0
jmp entry_from_wmemset
SSE2 memset:
movd %esi,%xmm0
mov %rdi,%rax
punpcklbw %xmm0,%xmm0
punpcklwd %xmm0,%xmm0
pshufd $0x0,%xmm0,%xmm0
entry_from_wmemset:
Since the ERMS versions of wmemset would require "rep stosl" instead of
"rep stosb", only the vector store stubs of SSE2/AVX2/AVX512 wmemset
are added. The SSE2 wmemset is about 3X faster and the AVX2 wmemset
is about 6X faster on Haswell.
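A C rendering of the stub idea, as a sketch only (wmemset_sketch is
illustrative, not the glibc entry point): scale the element count to
bytes, broadcast a 4-byte pattern instead of a 1-byte one, then run
the same store loop a byte memset would use:
#include <immintrin.h>
#include <stddef.h>
#include <wchar.h>
wchar_t *
wmemset_sketch (wchar_t *s, wchar_t c, size_t n)
{
  size_t bytes = n * sizeof (wchar_t);        /* shl $0x2, %rdx */
  __m128i pattern = _mm_set1_epi32 ((int) c); /* movd + pshufd $0x0 */
  size_t i = 0;
  /* From here on the code is identical to a byte memset body.  */
  for (; i + 16 <= bytes; i += 16)
    _mm_storeu_si128 ((__m128i *) ((char *) s + i), pattern);
  for (; i < bytes; i += sizeof (wchar_t))    /* whole-element tail */
    s[i / sizeof (wchar_t)] = c;
  return s;
}
The real stubs avoid duplicating the loop by jumping into the shared
memset body at entry_from_wmemset, which is why only the vector store
paths, and not the "rep stosb" ERMS paths, gain a wmemset entry.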
* include/wchar.h (__wmemset_chk): New.
* sysdeps/x86_64/memset.S (VDUP_TO_VEC0_AND_SET_RETURN): Renamed
to MEMSET_VDUP_TO_VEC0_AND_SET_RETURN.
(WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN): New.
(WMEMSET_CHK_SYMBOL): Likewise.
(WMEMSET_SYMBOL): Likewise.
(__wmemset): Add hidden definition.
(wmemset): Add weak hidden definition.
* sysdeps/x86_64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Add __wmemset_sse2_unaligned,
__wmemset_avx2_unaligned, __wmemset_avx512_unaligned,
__wmemset_chk_sse2_unaligned, __wmemset_chk_avx2_unaligned
and __wmemset_chk_avx512_unaligned.
* sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
(VDUP_TO_VEC0_AND_SET_RETURN): Renamed to ...
(MEMSET_VDUP_TO_VEC0_AND_SET_RETURN): This.
(WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN): New.
(WMEMSET_SYMBOL): Likewise.
* sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
(VDUP_TO_VEC0_AND_SET_RETURN): Renamed to ...
(MEMSET_VDUP_TO_VEC0_AND_SET_RETURN): This.
(WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN): New.
(WMEMSET_SYMBOL): Likewise.
* sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S: Updated.
(WMEMSET_CHK_SYMBOL): New.
(WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned)): Likewise.
(WMEMSET_SYMBOL (__wmemset, unaligned)): Likewise.
* sysdeps/x86_64/multiarch/memset.S (WMEMSET_SYMBOL): New.
(libc_hidden_builtin_def): Also define __GI_wmemset and
__GI___wmemset.
(weak_alias): New.
* sysdeps/x86_64/multiarch/wmemset.S: New file.
* sysdeps/x86_64/multiarch/wmemset_chk.S: Likewise.
* sysdeps/x86_64/wmemset.S: Likewise.
* sysdeps/x86_64/wmemset_chk.S: Likewise.
diff --git a/include/wchar.h b/include/wchar.h
index e2579a1..a773d56 100644
--- a/include/wchar.h
+++ b/include/wchar.h
@@ -157,6 +157,9 @@ extern wchar_t *__wmemmove (wchar_t *__s1, const wchar_t *__s2,
extern wchar_t *__wcschrnul (const wchar_t *__s, wchar_t __wc)
__attribute_pure__;
+extern wchar_t *__wmemset_chk (wchar_t *__s, wchar_t __c, size_t __n,
+ size_t __ns) __THROW;
+
extern int __vfwscanf (__FILE *__restrict __s,
const wchar_t *__restrict __format,
__gnuc_va_list __arg)
diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
index 69ed509..4127878 100644
--- a/sysdeps/x86_64/memset.S
+++ b/sysdeps/x86_64/memset.S
@@ -26,13 +26,18 @@
#define VMOVU movdqu
#define VMOVA movdqa
-#define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+#define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
movd d, %xmm0; \
movq r, %rax; \
punpcklbw %xmm0, %xmm0; \
punpcklwd %xmm0, %xmm0; \
pshufd $0, %xmm0, %xmm0
+#define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+ movd d, %xmm0; \
+ movq r, %rax; \
+ pshufd $0, %xmm0, %xmm0
+
#define SECTION(p) p
#ifndef MEMSET_SYMBOL
@@ -40,10 +45,21 @@
# define MEMSET_SYMBOL(p,s) memset
#endif
+#ifndef WMEMSET_SYMBOL
+# define WMEMSET_CHK_SYMBOL(p,s) p
+# define WMEMSET_SYMBOL(p,s) __wmemset
+#endif
+
#include "multiarch/memset-vec-unaligned-erms.S"
libc_hidden_builtin_def (memset)
+#if IS_IN (libc)
+libc_hidden_def (__wmemset)
+weak_alias (__wmemset, wmemset)
+libc_hidden_weak (wmemset)
+#endif
+
#if defined SHARED && IS_IN (libc) && !defined USE_MULTIARCH
strong_alias (__memset_chk, __memset_zero_constant_len_parameter)
.section .gnu.warning.__memset_zero_constant_len_parameter
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 06d9a9d..a91d2f9 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -300,6 +300,17 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__wmemcmp_ssse3)
IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2))
+ /* Support sysdeps/x86_64/multiarch/wmemset.S. */
+ IFUNC_IMPL (i, name, wmemset,
+ IFUNC_IMPL_ADD (array, i, wmemset, 1,
+ __wmemset_sse2_unaligned)
+ IFUNC_IMPL_ADD (array, i, wmemset,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __wmemset_avx2_unaligned)
+ IFUNC_IMPL_ADD (array, i, wmemset,
+ HAS_ARCH_FEATURE (AVX512F_Usable),
+ __wmemset_avx512_unaligned))
+
#ifdef SHARED
/* Support sysdeps/x86_64/multiarch/memcpy_chk.S. */
IFUNC_IMPL (i, name, __memcpy_chk,
@@ -417,6 +428,17 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSSE3),
__strncmp_ssse3)
IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_sse2))
+
+ /* Support sysdeps/x86_64/multiarch/wmemset_chk.S. */
+ IFUNC_IMPL (i, name, __wmemset_chk,
+ IFUNC_IMPL_ADD (array, i, __wmemset_chk, 1,
+ __wmemset_chk_sse2_unaligned)
+ IFUNC_IMPL_ADD (array, i, __wmemset_chk,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __wmemset_chk_avx2_unaligned)
+ IFUNC_IMPL_ADD (array, i, __wmemset_chk,
+ HAS_ARCH_FEATURE (AVX512F_Usable),
+ __wmemset_chk_avx512_unaligned))
#endif
return i;
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
index 79975e0..7ab3d89 100644
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
@@ -4,13 +4,19 @@
# define VMOVU vmovdqu
# define VMOVA vmovdqa
-# define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
vmovd d, %xmm0; \
movq r, %rax; \
vpbroadcastb %xmm0, %ymm0
+# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+ vmovd d, %xmm0; \
+ movq r, %rax; \
+ vpbroadcastd %xmm0, %ymm0
+
# define SECTION(p) p##.avx
# define MEMSET_SYMBOL(p,s) p##_avx2_##s
+# define WMEMSET_SYMBOL(p,s) p##_avx2_##s
# include "memset-vec-unaligned-erms.S"
#endif
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
index a5ec349..0783979 100644
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
@@ -4,14 +4,21 @@
# define VMOVU vmovdqu64
# define VMOVA vmovdqa64
-# define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
vmovd d, %xmm0; \
movq r, %rax; \
vpbroadcastb %xmm0, %xmm0; \
vpbroadcastq %xmm0, %zmm0
+# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+ vmovd d, %xmm0; \
+ movq r, %rax; \
+ vpbroadcastd %xmm0, %xmm0; \
+ vpbroadcastq %xmm0, %zmm0
+
# define SECTION(p) p##.avx512
# define MEMSET_SYMBOL(p,s) p##_avx512_##s
+# define WMEMSET_SYMBOL(p,s) p##_avx512_##s
# include "memset-vec-unaligned-erms.S"
#endif
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index 704eed9..2eb9e37 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -30,6 +30,10 @@
# define MEMSET_CHK_SYMBOL(p,s) MEMSET_SYMBOL(p, s)
#endif
+#ifndef WMEMSET_CHK_SYMBOL
+# define WMEMSET_CHK_SYMBOL(p,s) WMEMSET_SYMBOL(p, s)
+#endif
+
#ifndef VZEROUPPER
# if VEC_SIZE > 16
# define VZEROUPPER vzeroupper
@@ -79,6 +83,21 @@ END (__bzero)
weak_alias (__bzero, bzero)
#endif
+#if IS_IN (libc)
+# if defined SHARED
+ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
+# endif
+
+ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
+ shlq $2, %rdx
+ WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+ jmp L(entry_from_bzero)
+END (WMEMSET_SYMBOL (__wmemset, unaligned))
+#endif
+
#if defined SHARED && IS_IN (libc)
ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
cmpq %rdx, %rcx
@@ -87,8 +106,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
#endif
ENTRY (MEMSET_SYMBOL (__memset, unaligned))
-L(memset_entry):
- VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+ MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
L(entry_from_bzero):
cmpq $VEC_SIZE, %rdx
jb L(less_vec)
@@ -132,7 +150,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
# endif
ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
- VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+ MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
cmpq $VEC_SIZE, %rdx
jb L(less_vec)
cmpq $(VEC_SIZE * 2), %rdx
diff --git a/sysdeps/x86_64/multiarch/memset.S b/sysdeps/x86_64/multiarch/memset.S
index 9d33118..11f2737 100644
--- a/sysdeps/x86_64/multiarch/memset.S
+++ b/sysdeps/x86_64/multiarch/memset.S
@@ -58,16 +58,23 @@ END(memset)
#if IS_IN (libc)
# define MEMSET_SYMBOL(p,s) p##_sse2_##s
+# define WMEMSET_SYMBOL(p,s) p##_sse2_##s
# ifdef SHARED
-# undef libc_hidden_builtin_def
+# undef libc_hidden_builtin_def
/* It doesn't make sense to send libc-internal memset calls through a PLT.
The speedup we get from using SSE2 instructions is likely eaten away
by the indirect call in the PLT. */
-# define libc_hidden_builtin_def(name) \
- .globl __GI_memset; __GI_memset = __memset_sse2_unaligned
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_memset; __GI_memset = __memset_sse2_unaligned; \
+ .globl __GI_wmemset; __GI_wmemset = __wmemset_sse2_unaligned; \
+ .globl __GI___wmemset; __GI___wmemset = __wmemset_sse2_unaligned
# endif
+# undef weak_alias
+# define weak_alias(original, alias) \
+ .weak bzero; bzero = __bzero
+
# undef strong_alias
# define strong_alias(original, alias)
#endif
diff --git a/sysdeps/x86_64/multiarch/wmemset.S b/sysdeps/x86_64/multiarch/wmemset.S
new file mode 100644
index 0000000..3bd7ca2
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemset.S
@@ -0,0 +1,47 @@
+/* Multiple versions of wmemset
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+
+#include <sysdep.h>
+#include <shlib-compat.h>
+#include <init-arch.h>
+
+ENTRY(__wmemset)
+ .type __wmemset, @gnu_indirect_function
+ LOAD_RTLD_GLOBAL_RO_RDX
+ lea __wmemset_sse2_unaligned(%rip), %RAX_LP
+ HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+ jnz 1f
+ HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+ jz 1f
+ HAS_ARCH_FEATURE (AVX2_Usable)
+ jz 1f
+ lea __wmemset_avx2_unaligned(%rip), %RAX_LP
+ HAS_ARCH_FEATURE (Prefer_No_AVX512)
+ jnz 1f
+ HAS_ARCH_FEATURE (AVX512F_Usable)
+ jz 1f
+ lea __wmemset_avx512_unaligned(%rip), %RAX_LP
+1: ret
+END(__wmemset)
+
+weak_alias (__wmemset, wmemset)
+#endif
diff --git a/sysdeps/x86_64/multiarch/wmemset_chk.S b/sysdeps/x86_64/multiarch/wmemset_chk.S
new file mode 100644
index 0000000..c76fcb1
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemset_chk.S
@@ -0,0 +1,46 @@
+/* Multiple versions of wmemset_chk
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <shlib-compat.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+# ifdef SHARED
+ENTRY(__wmemset_chk)
+ .type __wmemset_chk, @gnu_indirect_function
+ LOAD_RTLD_GLOBAL_RO_RDX
+ lea __wmemset_chk_sse2_unaligned(%rip), %RAX_LP
+ HAS_ARCH_FEATURE (AVX2_Usable)
+ jz 1f
+ lea __wmemset_chk_avx2_unaligned(%rip), %RAX_LP
+ HAS_ARCH_FEATURE (Prefer_No_AVX512)
+ jnz 1f
+ HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+ jnz 1f
+ HAS_ARCH_FEATURE (AVX512F_Usable)
+ jz 1f
+ lea __wmemset_chk_avx512_unaligned(%rip), %RAX_LP
+1: ret
+END(__wmemset_chk)
+# else
+# include "../wmemset_chk.S"
+# endif
+#endif
diff --git a/sysdeps/x86_64/wmemset.S b/sysdeps/x86_64/wmemset.S
new file mode 100644
index 0000000..f96d567
--- /dev/null
+++ b/sysdeps/x86_64/wmemset.S
@@ -0,0 +1 @@
+/* Implemented in memset.S. */
diff --git a/sysdeps/x86_64/wmemset_chk.S b/sysdeps/x86_64/wmemset_chk.S
new file mode 100644
index 0000000..64c2774
--- /dev/null
+++ b/sysdeps/x86_64/wmemset_chk.S
@@ -0,0 +1,33 @@
+/* Checking wmemset for x86-64.
+ Copyright (C) 2004-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#ifndef SHARED
+ /* For libc.so this is defined in wmemset.S.
+ For libc.a, this is a separate source to avoid
+ wmemset bringing in __chk_fail and all routines
+ it calls. */
+ .text
+ENTRY (__wmemset_chk)
+ cmpq %rdx, %rcx
+ jb __chk_fail
+ jmp wmemset
+END (__wmemset_chk)
+#endif
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=f37a2a4d0467286c711231baf633aaf977b752d0
commit f37a2a4d0467286c711231baf633aaf977b752d0
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Sat May 20 06:48:04 2017 -0700
x86-64: Update strlen.S to support wcslen/wcsnlen
The difference between strlen and wcslen is that strlen scans bytes while
wcslen scans ints (wide characters). We can replace pminub and pcmpeqb with
pminud and pcmpeqd to turn strlen into wcslen.
* sysdeps/x86_64/strlen.S (PMINU): New.
(PCMPEQ): Likewise.
(SHIFT_RETURN): Likewise.
(FIND_ZERO): Replace pcmpeqb with PCMPEQ.
(strlen): Add SHIFT_RETURN before ret. Replace pcmpeqb and
pminub with PCMPEQ and PMINU.
* sysdeps/x86_64/wcsnlen.S: New file.
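The swap this commit describes is easy to see with SSE2 intrinsics. The following
is a minimal, hypothetical C sketch (not the glibc code; the function names are
invented) of one 16-byte probe: the only differences between the byte and
wide-character versions are the compare width (_mm_cmpeq_epi8 vs _mm_cmpeq_epi32,
i.e. pcmpeqb vs pcmpeqd) and the final shift by 2 that SHIFT_RETURN performs to
turn a byte index into a wide-character count.

/* Hypothetical sketch: one 16-byte probe of strlen vs wcslen.  Assumes the
   terminator lies within the first 16 bytes / 4 wide chars, and that
   wchar_t is 4 bytes, as in glibc.  */
#include <emmintrin.h>
#include <stdio.h>
#include <wchar.h>

static size_t
first16_strlen (const char *s)
{
  __m128i v = _mm_loadu_si128 ((const __m128i *) s);
  /* pcmpeqb against zero, then take the byte index of the first match.  */
  int mask = _mm_movemask_epi8 (_mm_cmpeq_epi8 (v, _mm_setzero_si128 ()));
  return (size_t) __builtin_ctz (mask);
}

static size_t
first4_wcslen (const wchar_t *s)
{
  __m128i v = _mm_loadu_si128 ((const __m128i *) s);
  /* pcmpeqd against zero; the byte index is shifted right by 2 to become
     a wide-character index, exactly what SHIFT_RETURN (shrq $2) does.  */
  int mask = _mm_movemask_epi8 (_mm_cmpeq_epi32 (v, _mm_setzero_si128 ()));
  return (size_t) __builtin_ctz (mask) >> 2;
}

int
main (void)
{
  char buf[16] = "hello";
  wchar_t wbuf[4] = L"abc";
  printf ("%zu %zu\n", first16_strlen (buf), first4_wcslen (wbuf)); /* 5 3 */
  return 0;
}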
diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
index 5896e6b..b5ab117 100644
--- a/sysdeps/x86_64/strlen.S
+++ b/sysdeps/x86_64/strlen.S
@@ -1,4 +1,4 @@
-/* SSE2 version of strlen.
+/* SSE2 version of strlen/wcslen.
Copyright (C) 2012-2017 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -18,6 +18,16 @@
#include <sysdep.h>
+#ifdef AS_WCSLEN
+# define PMINU pminud
+# define PCMPEQ pcmpeqd
+# define SHIFT_RETURN shrq $2, %rax
+#else
+# define PMINU pminub
+# define PCMPEQ pcmpeqb
+# define SHIFT_RETURN
+#endif
+
/* Long lived register in strlen(s), strnlen(s, n) are:
%xmm3 - zero
@@ -32,10 +42,10 @@ ENTRY(strlen)
/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */
#define FIND_ZERO \
- pcmpeqb (%rax), %xmm0; \
- pcmpeqb 16(%rax), %xmm1; \
- pcmpeqb 32(%rax), %xmm2; \
- pcmpeqb 48(%rax), %xmm3; \
+ PCMPEQ (%rax), %xmm0; \
+ PCMPEQ 16(%rax), %xmm1; \
+ PCMPEQ 32(%rax), %xmm2; \
+ PCMPEQ 48(%rax), %xmm3; \
pmovmskb %xmm0, %esi; \
pmovmskb %xmm1, %edx; \
pmovmskb %xmm2, %r8d; \
@@ -54,6 +64,9 @@ ENTRY(strlen)
xor %rax, %rax
ret
L(n_nonzero):
+# ifdef AS_WCSLEN
+ shlq $2, %rsi
+# endif
/* Initialize long lived registers. */
@@ -96,6 +109,7 @@ L(n_nonzero):
test %rdx, %rdx; \
je L(lab); \
bsfq %rdx, %rax; \
+ SHIFT_RETURN; \
ret
#ifdef AS_STRNLEN
@@ -104,19 +118,20 @@ L(n_nonzero):
#else
/* Test first 16 bytes unaligned. */
movdqu (%rax), %xmm4
- pcmpeqb %xmm0, %xmm4
+ PCMPEQ %xmm0, %xmm4
pmovmskb %xmm4, %edx
test %edx, %edx
je L(next48_bytes)
bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */
+ SHIFT_RETURN
ret
L(next48_bytes):
/* Same as FIND_ZERO except we do not check first 16 bytes. */
andq $-16, %rax
- pcmpeqb 16(%rax), %xmm1
- pcmpeqb 32(%rax), %xmm2
- pcmpeqb 48(%rax), %xmm3
+ PCMPEQ 16(%rax), %xmm1
+ PCMPEQ 32(%rax), %xmm2
+ PCMPEQ 48(%rax), %xmm3
pmovmskb %xmm1, %edx
pmovmskb %xmm2, %r8d
pmovmskb %xmm3, %ecx
@@ -145,6 +160,7 @@ L(strnlen_ret):
test %rdx, %rdx
je L(loop_init)
bsfq %rdx, %rax
+ SHIFT_RETURN
ret
#endif
.p2align 4
@@ -161,10 +177,10 @@ L(loop):
je L(exit_end)
movdqa (%rax), %xmm0
- pminub 16(%rax), %xmm0
- pminub 32(%rax), %xmm0
- pminub 48(%rax), %xmm0
- pcmpeqb %xmm3, %xmm0
+ PMINU 16(%rax), %xmm0
+ PMINU 32(%rax), %xmm0
+ PMINU 48(%rax), %xmm0
+ PCMPEQ %xmm3, %xmm0
pmovmskb %xmm0, %edx
testl %edx, %edx
jne L(exit)
@@ -182,6 +198,7 @@ L(first):
bsfq %rdx, %rdx
addq %rdx, %rax
subq %rdi, %rax
+ SHIFT_RETURN
ret
.p2align 4
@@ -192,6 +209,7 @@ L(exit):
bsfq %rdx, %rdx
addq %rdx, %rax
subq %rdi, %rax
+ SHIFT_RETURN
ret
#else
@@ -201,10 +219,10 @@ L(exit):
L(loop):
movdqa 64(%rax), %xmm0
- pminub 80(%rax), %xmm0
- pminub 96(%rax), %xmm0
- pminub 112(%rax), %xmm0
- pcmpeqb %xmm3, %xmm0
+ PMINU 80(%rax), %xmm0
+ PMINU 96(%rax), %xmm0
+ PMINU 112(%rax), %xmm0
+ PCMPEQ %xmm3, %xmm0
pmovmskb %xmm0, %edx
testl %edx, %edx
jne L(exit64)
@@ -212,10 +230,10 @@ L(loop):
subq $-128, %rax
movdqa (%rax), %xmm0
- pminub 16(%rax), %xmm0
- pminub 32(%rax), %xmm0
- pminub 48(%rax), %xmm0
- pcmpeqb %xmm3, %xmm0
+ PMINU 16(%rax), %xmm0
+ PMINU 32(%rax), %xmm0
+ PMINU 48(%rax), %xmm0
+ PCMPEQ %xmm3, %xmm0
pmovmskb %xmm0, %edx
testl %edx, %edx
jne L(exit0)
@@ -231,6 +249,7 @@ L(exit0):
bsfq %rdx, %rdx
addq %rdx, %rax
subq %rdi, %rax
+ SHIFT_RETURN
ret
#endif
diff --git a/sysdeps/x86_64/wcsnlen.S b/sysdeps/x86_64/wcsnlen.S
new file mode 100644
index 0000000..968bb69
--- /dev/null
+++ b/sysdeps/x86_64/wcsnlen.S
@@ -0,0 +1,7 @@
+#define AS_WCSLEN
+#define AS_STRNLEN
+#define strlen __wcsnlen
+
+#include "strlen.S"
+
+weak_alias(__wcsnlen, wcsnlen)
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=6c6a107dada6f6026a8cbd5f57a4f1d2f9ab87bc
commit 6c6a107dada6f6026a8cbd5f57a4f1d2f9ab87bc
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Tue May 23 08:43:52 2017 -0700
x86_64: Remove redundant REX bytes from memrchr.S
By the x86-64 specification, writes to 32-bit destination registers are
zero-extended to 64 bits, so there is no need to use 64-bit registers when
only the lower 32 bits are non-zero. Also, 2 of the instructions in:
mov %rdi, %rcx
and $15, %rcx
jz L(length_less16_offset0)
mov %rdi, %rcx <<< redundant
and $15, %rcx <<< redundant
are redundant.
* sysdeps/x86_64/memrchr.S (__memrchr): Use 32-bit registers for
the lower 32 bits. Remove redundant instructions.
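The zero-extension rule this commit relies on can be demonstrated with a small,
hypothetical GCC inline-asm check (not part of the patch): writing the 32-bit
name of a register clears its upper 32 bits, which is why mov %edi, %ecx and
friends in the diff below are safe drop-in replacements that each save a REX
prefix byte.

#include <inttypes.h>
#include <stdio.h>

int
main (void)
{
  uint64_t out;
  /* Fill the whole register with ones, then write only its 32-bit name;
     per the x86-64 spec the second mov clears the upper 32 bits.  */
  asm ("movq $-1, %0\n\t"
       "movl $0x15, %k0"
       : "=r" (out));
  printf ("%#" PRIx64 "\n", out);  /* prints 0x15, not 0xffffffff00000015 */
  return 0;
}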
diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S
index aab1a4a..5fa0fe9 100644
--- a/sysdeps/x86_64/memrchr.S
+++ b/sysdeps/x86_64/memrchr.S
@@ -22,7 +22,7 @@
.text
ENTRY (__memrchr)
- movd %rsi, %xmm1
+ movd %esi, %xmm1
sub $16, %rdx
jbe L(length_less16)
@@ -42,8 +42,8 @@ ENTRY (__memrchr)
jnz L(matches0)
sub $64, %rdi
- mov %rdi, %rcx
- and $15, %rcx
+ mov %edi, %ecx
+ and $15, %ecx
jz L(loop_prolog)
add $16, %rdi
@@ -108,8 +108,8 @@ L(loop_prolog):
test %eax, %eax
jnz L(matches0)
- mov %rdi, %rcx
- and $63, %rcx
+ mov %edi, %ecx
+ and $63, %ecx
jz L(align64_loop)
add $64, %rdi
@@ -166,8 +166,8 @@ L(align64_loop):
.p2align 4
L(exit_loop):
- add $64, %rdx
- cmp $32, %rdx
+ add $64, %edx
+ cmp $32, %edx
jbe L(exit_loop_32)
movdqa 48(%rdi), %xmm0
@@ -187,7 +187,7 @@ L(exit_loop):
pmovmskb %xmm3, %eax
test %eax, %eax
jnz L(matches16_1)
- cmp $48, %rdx
+ cmp $48, %edx
jbe L(return_null)
pcmpeqb (%rdi), %xmm1
@@ -204,7 +204,7 @@ L(exit_loop_32):
pmovmskb %xmm0, %eax
test %eax, %eax
jnz L(matches48_1)
- cmp $16, %rdx
+ cmp $16, %edx
jbe L(return_null)
pcmpeqb 32(%rdi), %xmm1
@@ -276,7 +276,7 @@ L(matches48_1):
.p2align 4
L(return_null):
- xor %rax, %rax
+ xor %eax, %eax
ret
.p2align 4
@@ -306,18 +306,16 @@ L(length_less16):
punpcklbw %xmm1, %xmm1
punpcklbw %xmm1, %xmm1
- add $16, %rdx
+ add $16, %edx
pshufd $0, %xmm1, %xmm1
- mov %rdi, %rcx
- and $15, %rcx
+ mov %edi, %ecx
+ and $15, %ecx
jz L(length_less16_offset0)
- mov %rdi, %rcx
- and $15, %rcx
mov %cl, %dh
- mov %rcx, %r8
+ mov %ecx, %esi
add %dl, %dh
and $-16, %rdi
@@ -340,7 +338,7 @@ L(length_less16):
bsr %eax, %eax
add %rdi, %rax
- add %r8, %rax
+ add %rsi, %rax
ret
.p2align 4
@@ -362,14 +360,14 @@ L(length_less16_part2):
pcmpeqb (%rdi), %xmm1
pmovmskb %xmm1, %eax
- mov %r8, %rcx
+ mov %esi, %ecx
sar %cl, %eax
test %eax, %eax
jz L(return_null)
bsr %eax, %eax
add %rdi, %rax
- add %r8, %rax
+ add %rsi, %rax
ret
.p2align 4
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=2e88eb8c07b587d4fb818d9498954fc1f11de3d2
commit 2e88eb8c07b587d4fb818d9498954fc1f11de3d2
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Wed May 24 08:41:23 2017 -0700
Add more tests for memrchr
This patch adds tests for len == 0 and tests for positions close to the
beginning, which are equivalent to positions close to the end for memchr.
* string/test-memrchr.c (test_main): Add tests for len == 0
and tests for positions close to the beginning, which are
equivalent to positions close to the end for memchr.
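As a reminder of the semantics these new cases exercise, here is a hypothetical
standalone check (not part of the patch): with len == 0 memrchr must return NULL
without touching the buffer, and a match near the beginning of the buffer is the
last byte memrchr scans, mirroring a match near the end for memchr.

/* Hypothetical standalone check of the cases the new tests cover.  */
#define _GNU_SOURCE             /* memrchr is a GNU extension.  */
#include <assert.h>
#include <string.h>

int
main (void)
{
  const char buf[] = "abcabc";
  /* len == 0: nothing is searched, NULL is returned.  */
  assert (memrchr (buf, 'a', 0) == NULL);
  /* Match close to the beginning: memrchr scans backwards, so this is
     the last position it examines, like an end match for memchr.  */
  assert (memrchr (buf, 'a', 2) == buf + 0);
  /* The last occurrence wins when several are in range.  */
  assert (memrchr (buf, 'a', 6) == buf + 3);
  return 0;
}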
diff --git a/string/test-memrchr.c b/string/test-memrchr.c
index bfc9920..15483f5 100644
--- a/string/test-memrchr.c
+++ b/string/test-memrchr.c
@@ -151,15 +151,32 @@ test_main (void)
for (i = 1; i < 8; ++i)
{
+ /* Test len == 0. */
+ do_test (i, i, 0, 0);
+ do_test (i, i, 0, 23);
+
do_test (0, 16 << i, 2048, 23);
do_test (i, 64, 256, 23);
do_test (0, 16 << i, 2048, 0);
do_test (i, 64, 256, 0);
+
+ do_test (0, i, 256, 23);
+ do_test (0, i, 256, 0);
+ do_test (i, i, 256, 23);
+ do_test (i, i, 256, 0);
+
}
for (i = 1; i < 32; ++i)
{
do_test (0, i, i + 1, 23);
do_test (0, i, i + 1, 0);
+ do_test (i, i, i + 1, 23);
+ do_test (i, i, i + 1, 0);
+
+ do_test (0, 1, i + 1, 23);
+ do_test (0, 2, i + 1, 0);
+ do_test (i, 1, i + 1, 23);
+ do_test (i, 2, i + 1, 0);
}
do_random_tests ();
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=986b51f7fb78074b4426409002dab9389597df40
commit 986b51f7fb78074b4426409002dab9389597df40
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Wed May 24 08:35:59 2017 -0700
benchtests: Add more tests for memrchr
bench-memchr.c is shared with bench-memrchr.c. This patch adds some
tests for positions close to the beginning for memrchr, which are
equivalent to positions close to the end for memchr.
* benchtests/bench-memchr.c (do_test): Print out both length
and position.
(test_main): Also test the position close to the beginning for
memrchr.
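The sharing mentioned above follows the usual glibc pattern of compiling one
benchmark body twice with a macro selecting the function under test. The actual
bench-memrchr.c wrapper is not shown in this diff, so the sketch below is an
assumption and uses invented names; it only illustrates how the #ifdef
USE_AS_MEMRCHR blocks added below take effect for memrchr and are skipped for
memchr.

/* Minimal, self-contained illustration of the USE_AS_MEMRCHR pattern:
   compile once for memchr, and once with -DUSE_AS_MEMRCHR for memrchr.  */
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>

#ifdef USE_AS_MEMRCHR
# define FUNC_NAME "memrchr"
# define FUNC(s, c, n) memrchr (s, c, n)
#else
# define FUNC_NAME "memchr"
# define FUNC(s, c, n) memchr (s, c, n)
#endif

int
main (void)
{
  const char buf[] = "abcabc";
  /* memchr reports offset 1; memrchr reports offset 4.  */
  printf ("%s found 'b' at offset %td\n", FUNC_NAME,
          (const char *) FUNC (buf, 'b', sizeof buf - 1) - buf);
  return 0;
}

Compiling the same file a second time with -DUSE_AS_MEMRCHR is what lets a single
benchmark body drive both functions.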
diff --git a/benchtests/bench-memchr.c b/benchtests/bench-memchr.c
index 16099ac..c95cc70 100644
--- a/benchtests/bench-memchr.c
+++ b/benchtests/bench-memchr.c
@@ -117,7 +117,8 @@ do_test (size_t align, size_t pos, size_t len, int seek_char)
buf[align + len] = seek_char;
}
- printf ("Length %4zd, alignment %2zd:", pos, align);
+ printf ("Length %4zd, position %4zd, alignment %2zd:",
+ len, pos, align);
FOR_EACH_IMPL (impl, 0)
do_one_test (impl, (CHAR *) (buf + align), seek_char, len, result);
@@ -143,11 +144,27 @@ test_main (void)
do_test (i, 64, 256, 23);
do_test (0, 16 << i, 2048, 0);
do_test (i, 64, 256, 0);
+#ifdef USE_AS_MEMRCHR
+ /* Also test the position close to the beginning for memrchr. */
+ do_test (0, i, 256, 23);
+ do_test (0, i, 256, 0);
+ do_test (i, i, 256, 23);
+ do_test (i, i, 256, 0);
+#endif
}
for (i = 1; i < 32; ++i)
{
do_test (0, i, i + 1, 23);
do_test (0, i, i + 1, 0);
+ do_test (i, i, i + 1, 23);
+ do_test (i, i, i + 1, 0);
+#ifdef USE_AS_MEMRCHR
+ /* Also test the position close to the beginning for memrchr. */
+ do_test (0, 1, i + 1, 23);
+ do_test (0, 2, i + 1, 0);
+ do_test (i, i, i + 1, 23);
+ do_test (i, i, i + 1, 0);
+#endif
}
return ret;
-----------------------------------------------------------------------
hooks/post-receive
--
GNU C Library master sources