This is the mail archive of the glibc-cvs@sourceware.org mailing list for the glibc project.
GNU C Library master sources branch hjl/avx2/master created. glibc-2.25-394-g29995cd
- From: hjl@sourceware.org
- To: glibc-cvs@sourceware.org
- Date: 1 Jun 2017 20:35:01 -0000
- Subject: GNU C Library master sources branch hjl/avx2/master created. glibc-2.25-394-g29995cd
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".
The branch, hjl/avx2/master has been created
at 29995cdfd5e53e616aae0dda9283dae2093f3971 (commit)
- Log -----------------------------------------------------------------
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=29995cdfd5e53e616aae0dda9283dae2093f3971
commit 29995cdfd5e53e616aae0dda9283dae2093f3971
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Thu Jun 1 12:57:14 2017 -0700
Update between_2_3 memcmp-avx2.S
diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2.S b/sysdeps/x86_64/multiarch/memcmp-avx2.S
index 5297e3c..c04902e 100644
--- a/sysdeps/x86_64/multiarch/memcmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/memcmp-avx2.S
@@ -136,18 +136,15 @@ L(between_2_3):
/* Load 2 bytes into registers. */
movzwl (%rdi), %eax
movzwl (%rsi), %ecx
- /* Compare the lowest byte. */
- cmpb %cl, %al
- jne L(1byte_reg)
- /* Load the difference of 2 bytes into EAX. */
- subl %ecx, %eax
- /* Return if 2 bytes differ. */
- jnz L(exit)
- cmpb $2, %dl
- /* Return if these are the last 2 bytes. */
- je L(exit)
- movzbl 2(%rdi), %eax
- movzbl 2(%rsi), %ecx
+ shll $16, %eax
+ shll $16, %ecx
+ /* Use overlapping loads to avoid branches. */
+ movzwl -2(%rdi, %rdx), %edi
+ movzwl -2(%rsi, %rdx), %esi
+ orl %edi, %eax
+ orl %esi, %ecx
+ bswap %eax
+ bswap %ecx
subl %ecx, %eax
ret
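The between_2_3 change above replaces a byte-by-byte comparison with two overlapping 16-bit loads, one anchored at the start and one at the end of the 2- or 3-byte range, packed into a single word and byte-swapped so that an unsigned integer comparison orders the buffers the way memcmp requires. A rough C sketch of that technique (memcmp_2_3 and the exact packing are illustrative, not the committed code):

  #include <stdint.h>
  #include <string.h>

  static int
  memcmp_2_3 (const unsigned char *s1, const unsigned char *s2, size_t n)
  {
    uint16_t h1, h2, t1, t2;
    memcpy (&h1, s1, 2);            /* First two bytes.  */
    memcpy (&h2, s2, 2);
    memcpy (&t1, s1 + n - 2, 2);    /* Last two bytes: overlaps the head
                                       when n == 3, reloads it when n == 2.  */
    memcpy (&t2, s2 + n - 2, 2);
    /* Pack head and tail into one 32-bit word each, then byte-swap so
       the earliest byte in memory lands in the most significant
       position; the unsigned comparison is then lexicographic.  */
    uint32_t w1 = __builtin_bswap32 ((uint32_t) h1 | ((uint32_t) t1 << 16));
    uint32_t w2 = __builtin_bswap32 ((uint32_t) h2 | ((uint32_t) t2 << 16));
    return (w1 > w2) - (w1 < w2);
  }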
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=3793249d7aefe57acca311a3687c764225122a05
commit 3793249d7aefe57acca311a3687c764225122a05
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Thu Jun 1 12:27:16 2017 -0700
memcmp-avx2.S
diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2.S b/sysdeps/x86_64/multiarch/memcmp-avx2.S
index eb3a4cb..5297e3c 100644
--- a/sysdeps/x86_64/multiarch/memcmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/memcmp-avx2.S
@@ -72,6 +72,7 @@ L(last_2x_vec):
jnz L(first_vec)
L(last_vec):
+ /* Use overlapping loads to avoid branches. */
leaq -VEC_SIZE(%rdi, %rdx), %rdi
leaq -VEC_SIZE(%rsi, %rdx), %rsi
vmovdqu (%rsi), %ymm2
@@ -119,6 +120,7 @@ L(between_4_7):
vpmovmskb %xmm2, %eax
subl $0xffff, %eax
jnz L(first_vec)
+ /* Use overlapping loads to avoid branches. */
leaq -4(%rdi, %rdx), %rdi
leaq -4(%rsi, %rdx), %rsi
vmovd (%rdi), %xmm1
@@ -198,6 +200,7 @@ L(less_vec):
vpmovmskb %xmm2, %eax
subl $0xffff, %eax
jnz L(first_vec)
+ /* Use overlapping loads to avoid branches. */
leaq -8(%rdi, %rdx), %rdi
leaq -8(%rsi, %rdx), %rsi
vmovq (%rdi), %xmm1
@@ -217,6 +220,7 @@ L(between_16_31):
subl $0xffff, %eax
jnz L(first_vec)
+ /* Use overlapping loads to avoid branches. */
leaq -16(%rdi, %rdx), %rdi
leaq -16(%rsi, %rdx), %rsi
vmovdqu (%rsi), %xmm2
@@ -350,6 +354,7 @@ L(last_4x_vec):
subl $VEC_MASK, %eax
jnz L(first_vec)
+ /* Use overlapping loads to avoid branches. */
leaq -(3 * VEC_SIZE)(%rdi, %rdx), %rdi
leaq -(3 * VEC_SIZE)(%rsi, %rdx), %rsi
vmovdqu (%rsi), %ymm2
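Every hunk in this commit marks the same pattern: instead of a branchy remainder loop, the tail of a compare is handled by re-anchoring a full-width load at (start + length - block size), so a few middle bytes are simply compared twice. A hedged C sketch of the pattern for 32 < n <= 64 (block_eq and differs_33_64 are illustrative names; the real code works on vpcmpeqb/vpmovmskb masks):

  #include <stddef.h>
  #include <string.h>

  /* Stand-in for one 32-byte vector compare (vpcmpeqb + vpmovmskb).  */
  static int
  block_eq (const unsigned char *a, const unsigned char *b)
  {
    return memcmp (a, b, 32) == 0;
  }

  /* For 32 < n <= 64, one compare anchored at the start and one at the
     end cover every byte with no remainder loop; the overlap in the
     middle is harmless.  */
  static int
  differs_33_64 (const unsigned char *s1, const unsigned char *s2, size_t n)
  {
    if (!block_eq (s1, s2))
      return 1;
    return !block_eq (s1 + n - 32, s2 + n - 32);
  }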
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=3180371a826547726ac5b457004b82194418903b
commit 3180371a826547726ac5b457004b82194418903b
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Fri May 26 12:21:55 2017 -0700
x86-64: Optimize strrchr/wcsrchr with AVX2
Optimize strrchr/wcsrchr with AVX2 to check 32 bytes with vector
instructions. It is as fast as the SSE2 version for small data sizes
and up to 1X faster for large data sizes on Haswell. Select the AVX2
version on AVX2 machines where vzeroupper is preferred and AVX
unaligned load is fast.
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
strrchr-avx2 and wcsrchr-avx2.
* sysdeps/x86_64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Add tests for __strrchr_avx2,
__strrchr_sse2, __wcsrchr_avx2 and __wcsrchr_sse2.
* sysdeps/x86_64/multiarch/strrchr-avx2.S: New file.
* sysdeps/x86_64/multiarch/strrchr.S: Likewise.
* sysdeps/x86_64/multiarch/wcsrchr-avx2.S: Likewise.
* sysdeps/x86_64/multiarch/wcsrchr.S: Likewise.
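The core of strrchr-avx2.S below: scan forward one 32-byte vector at a time, comparing each vector against both CHAR and nul; remember the match mask of the most recent vector that contained CHAR, and resolve the answer only when the terminator appears, discarding match bits that fall after it. A scalar C model of that control flow (block_masks and strrchr_model are illustrative names; like the real code, it assumes 32-byte over-reads are safe, which the assembly guarantees by aligning its loads):

  #include <stdint.h>
  #include <stddef.h>

  /* Stand-in for the vpcmpeqb/vpmovmskb pairs against CHAR and zero.  */
  static void
  block_masks (const char *p, int c, uint32_t *match, uint32_t *nul)
  {
    *match = *nul = 0;
    for (int i = 0; i < 32; i++)
      {
        if (p[i] == (char) c)
          *match |= 1u << i;
        if (p[i] == '\0')
          *nul |= 1u << i;
      }
  }

  static char *
  strrchr_model (const char *s, int c)
  {
    const char *last_block = NULL;
    uint32_t last_match = 0;

    for (;; s += 32)
      {
        uint32_t match, nul;
        block_masks (s, c, &match, &nul);
        if (nul)
          {
            /* Keep only match bits at or before the nul: nul ^ (nul - 1)
               sets every bit up to and including the lowest nul bit --
               the same mask the subl/xorl/andl sequence computes.  */
            match &= nul ^ (nul - 1);
            if (match)
              return (char *) s + 31 - __builtin_clz (match);  /* bsrl */
            break;
          }
        if (match)
          {
            last_block = s;     /* Remember the match and keep searching.  */
            last_match = match;
          }
      }
    return last_match
           ? (char *) last_block + 31 - __builtin_clz (last_match)
           : NULL;
  }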
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index aee4820..5d7825a 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -15,6 +15,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
memmove-ssse3-back \
memmove-avx512-no-vzeroupper strcasecmp_l-ssse3 \
strchr-avx2 strchrnul-avx2 \
+ strrchr-avx2 \
strlen-avx2 strnlen-avx2 \
strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
@@ -40,5 +41,6 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
wmemcmp-avx2 \
wcscpy-ssse3 wcscpy-c \
wcschr-avx2 \
+ wcsrchr-avx2 \
wcslen-avx2 wcsnlen-avx2
endif
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 9f1ec4b..0112621 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -250,6 +250,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__strchrnul_avx2)
IFUNC_IMPL_ADD (array, i, strchrnul, 1, __strchrnul_sse2))
+ /* Support sysdeps/x86_64/multiarch/strrchr.S. */
+ IFUNC_IMPL (i, name, strrchr,
+ IFUNC_IMPL_ADD (array, i, strrchr,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __strrchr_avx2)
+ IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_sse2))
+
/* Support sysdeps/x86_64/multiarch/strcmp.S. */
IFUNC_IMPL (i, name, strcmp,
IFUNC_IMPL_ADD (array, i, strcmp, HAS_CPU_FEATURE (SSE4_2),
@@ -341,6 +348,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__wcschr_avx2)
IFUNC_IMPL_ADD (array, i, wcschr, 1, __wcschr_sse2))
+ /* Support sysdeps/x86_64/multiarch/wcsrchr.S. */
+ IFUNC_IMPL (i, name, wcsrchr,
+ IFUNC_IMPL_ADD (array, i, wcsrchr,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __wcsrchr_avx2)
+ IFUNC_IMPL_ADD (array, i, wcsrchr, 1, __wcsrchr_sse2))
+
/* Support sysdeps/x86_64/multiarch/wcscpy.S. */
IFUNC_IMPL (i, name, wcscpy,
IFUNC_IMPL_ADD (array, i, wcscpy, HAS_CPU_FEATURE (SSSE3),
diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S
new file mode 100644
index 0000000..36ef660
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S
@@ -0,0 +1,235 @@
+/* strrchr/wcsrchr optimized with AVX2.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef STRRCHR
+# define STRRCHR __strrchr_avx2
+# endif
+
+# ifdef USE_AS_WCSRCHR
+# define VPBROADCAST vpbroadcastd
+# define VPCMPEQ vpcmpeqd
+# else
+# define VPBROADCAST vpbroadcastb
+# define VPCMPEQ vpcmpeqb
+# endif
+
+# ifndef VZEROUPPER
+# define VZEROUPPER vzeroupper
+# endif
+
+# define VEC_SIZE 32
+
+ .section .text.avx,"ax",@progbits
+ENTRY (STRRCHR)
+ movd %esi, %xmm4
+ movl %edi, %ecx
+ /* Broadcast CHAR to YMM4. */
+ VPBROADCAST %xmm4, %ymm4
+ vpxor %ymm0, %ymm0, %ymm0
+
+ /* Check if we may cross page boundary with one vector load. */
+ andl $(2 * VEC_SIZE - 1), %ecx
+ cmpl $VEC_SIZE, %ecx
+ ja L(cros_page_boundary)
+
+ vmovdqu (%rdi), %ymm1
+ VPCMPEQ %ymm1, %ymm0, %ymm2
+ VPCMPEQ %ymm1, %ymm4, %ymm3
+ vpmovmskb %ymm2, %ecx
+ vpmovmskb %ymm3, %eax
+ addq $VEC_SIZE, %rdi
+
+ testl %eax, %eax
+ jnz L(first_vec)
+
+ testl %ecx, %ecx
+ jnz L(return_null)
+
+ andq $-VEC_SIZE, %rdi
+ xorl %edx, %edx
+ jmp L(aligned_loop)
+
+ .p2align 4
+L(first_vec):
+ /* Check if there is a nul CHAR. */
+ testl %ecx, %ecx
+ jnz L(char_and_nul_in_first_vec)
+
+ /* Remember the match and keep searching. */
+ movl %eax, %edx
+ movq %rdi, %rsi
+ andq $-VEC_SIZE, %rdi
+ jmp L(aligned_loop)
+
+ .p2align 4
+L(cros_page_boundary):
+ andl $(VEC_SIZE - 1), %ecx
+ andq $-VEC_SIZE, %rdi
+ vmovdqa (%rdi), %ymm1
+ VPCMPEQ %ymm1, %ymm0, %ymm2
+ VPCMPEQ %ymm1, %ymm4, %ymm3
+ vpmovmskb %ymm2, %edx
+ vpmovmskb %ymm3, %eax
+ shrl %cl, %edx
+ shrl %cl, %eax
+ addq $VEC_SIZE, %rdi
+
+ /* Check if there is a CHAR. */
+ testl %eax, %eax
+ jnz L(found_char)
+
+ testl %edx, %edx
+ jnz L(return_null)
+
+ jmp L(aligned_loop)
+
+ .p2align 4
+L(found_char):
+ testl %edx, %edx
+ jnz L(char_and_nul)
+
+ /* Remember the match and keep searching. */
+ movl %eax, %edx
+ leaq (%rdi, %rcx), %rsi
+
+ .p2align 4
+L(aligned_loop):
+ vmovdqa (%rdi), %ymm1
+ VPCMPEQ %ymm1, %ymm0, %ymm2
+ addq $VEC_SIZE, %rdi
+ VPCMPEQ %ymm1, %ymm4, %ymm3
+ vpmovmskb %ymm2, %ecx
+ vpmovmskb %ymm3, %eax
+ orl %eax, %ecx
+ jnz L(char_nor_null)
+
+ vmovdqa (%rdi), %ymm1
+ VPCMPEQ %ymm1, %ymm0, %ymm2
+ add $VEC_SIZE, %rdi
+ VPCMPEQ %ymm1, %ymm4, %ymm3
+ vpmovmskb %ymm2, %ecx
+ vpmovmskb %ymm3, %eax
+ orl %eax, %ecx
+ jnz L(char_nor_null)
+
+ vmovdqa (%rdi), %ymm1
+ VPCMPEQ %ymm1, %ymm0, %ymm2
+ addq $VEC_SIZE, %rdi
+ VPCMPEQ %ymm1, %ymm4, %ymm3
+ vpmovmskb %ymm2, %ecx
+ vpmovmskb %ymm3, %eax
+ orl %eax, %ecx
+ jnz L(char_nor_null)
+
+ vmovdqa (%rdi), %ymm1
+ VPCMPEQ %ymm1, %ymm0, %ymm2
+ addq $VEC_SIZE, %rdi
+ VPCMPEQ %ymm1, %ymm4, %ymm3
+ vpmovmskb %ymm2, %ecx
+ vpmovmskb %ymm3, %eax
+ orl %eax, %ecx
+ jz L(aligned_loop)
+
+ .p2align 4
+L(char_nor_null):
+ /* Find a CHAR or a nul CHAR in a loop. */
+ testl %eax, %eax
+ jnz L(match)
+L(return_value):
+ testl %edx, %edx
+ jz L(return_null)
+ movl %edx, %eax
+ movq %rsi, %rdi
+
+# ifdef USE_AS_WCSRCHR
+ /* Keep the first bit for each matching CHAR for bsr. */
+ andl $0x11111111, %eax
+# endif
+ bsrl %eax, %eax
+ leaq -VEC_SIZE(%rdi, %rax), %rax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(match):
+ /* Find a CHAR. Check if there is a nul CHAR. */
+ vpmovmskb %ymm2, %ecx
+ testl %ecx, %ecx
+ jnz L(find_nul)
+
+ /* Remember the match and keep searching. */
+ movl %eax, %edx
+ movq %rdi, %rsi
+ jmp L(aligned_loop)
+
+ .p2align 4
+L(find_nul):
+# ifdef USE_AS_WCSRCHR
+ /* Keep the first bit for each matching CHAR for bsr. */
+ andl $0x11111111, %ecx
+ andl $0x11111111, %eax
+# endif
+ /* Mask out any matching bits after the nul CHAR. */
+ movl %ecx, %r8d
+ subl $1, %r8d
+ xorl %ecx, %r8d
+ andl %r8d, %eax
+ testl %eax, %eax
+ /* If there is no CHAR here, return the remembered one. */
+ jz L(return_value)
+ bsrl %eax, %eax
+ leaq -VEC_SIZE(%rdi, %rax), %rax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(char_and_nul):
+ /* Find both a CHAR and a nul CHAR. */
+ addq %rcx, %rdi
+ movl %edx, %ecx
+L(char_and_nul_in_first_vec):
+# ifdef USE_AS_WCSRCHR
+ /* Keep the first bit for each matching CHAR for bsr. */
+ andl $0x11111111, %ecx
+ andl $0x11111111, %eax
+# endif
+ /* Mask out any matching bits after the nul CHAR. */
+ movl %ecx, %r8d
+ subl $1, %r8d
+ xorl %ecx, %r8d
+ andl %r8d, %eax
+ testl %eax, %eax
+ /* Return null pointer if the nul CHAR comes first. */
+ jz L(return_null)
+ bsrl %eax, %eax
+ leaq -VEC_SIZE(%rdi, %rax), %rax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(return_null):
+ xorl %eax, %eax
+ VZEROUPPER
+ ret
+
+END (STRRCHR)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strrchr.S b/sysdeps/x86_64/multiarch/strrchr.S
new file mode 100644
index 0000000..e5b49a9
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strrchr.S
@@ -0,0 +1,63 @@
+/* Multiple versions of strrchr
+ Copyright (C) 2009-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+ .text
+ENTRY(strrchr)
+ .type strrchr, @gnu_indirect_function
+ LOAD_RTLD_GLOBAL_RO_RDX
+ HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+ jnz 1f
+ HAS_ARCH_FEATURE (AVX2_Usable)
+ jz 1f
+ HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+ jz 1f
+ leaq __strrchr_avx2(%rip), %rax
+ ret
+1:
+ leaq __strrchr_sse2(%rip), %rax
+ ret
+END(strrchr)
+
+
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __strrchr_sse2, @function; \
+ .align 16; \
+ .globl __strrchr_sse2; \
+ .hidden __strrchr_sse2; \
+ __strrchr_sse2: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __strrchr_sse2, .-__strrchr_sse2
+# undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal strrchr calls through a PLT.
+ The speedup we get from using AVX2 instructions is likely eaten away
+ by the indirect call in the PLT. */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_strrchr; __GI_strrchr = __strrchr_sse2
+#endif
+
+#include "../strrchr.S"
diff --git a/sysdeps/x86_64/multiarch/wcsrchr-avx2.S b/sysdeps/x86_64/multiarch/wcsrchr-avx2.S
new file mode 100644
index 0000000..cf8a239
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsrchr-avx2.S
@@ -0,0 +1,3 @@
+#define STRRCHR __wcsrchr_avx2
+#define USE_AS_WCSRCHR 1
+#include "strrchr-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcsrchr.S b/sysdeps/x86_64/multiarch/wcsrchr.S
new file mode 100644
index 0000000..59276cb
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsrchr.S
@@ -0,0 +1,55 @@
+/* Multiple versions of wcsrchr
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+ .text
+ENTRY(wcsrchr)
+ .type wcsrchr, @gnu_indirect_function
+ LOAD_RTLD_GLOBAL_RO_RDX
+ HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+ jnz 1f
+ HAS_ARCH_FEATURE (AVX2_Usable)
+ jz 1f
+ HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+ jz 1f
+ leaq __wcsrchr_avx2(%rip), %rax
+ ret
+
+1: leaq __wcsrchr_sse2(%rip), %rax
+ ret
+END(wcsrchr)
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __wcsrchr_sse2, @function; \
+ .p2align 4; \
+ .globl __wcsrchr_sse2; \
+ .hidden __wcsrchr_sse2; \
+ __wcsrchr_sse2: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __wcsrchr_sse2, .-__wcsrchr_sse2
+#endif
+
+#include "../wcsrchr.S"
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=b3395c9a7431c4153e5f2bd5a3c5a023017d2703
commit b3395c9a7431c4153e5f2bd5a3c5a023017d2703
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Tue May 23 11:25:19 2017 -0700
x86-64: Optimize memrchr with AVX2
Optimize memrchr with AVX2 to search 32 bytes with a single vector
compare instruction. It is as fast as SSE2 memrchr for small data
sizes and up to 1X faster for large data sizes on Haswell. Select
AVX2 memrchr on AVX2 machines where vzeroupper is preferred and AVX
unaligned load is fast.
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
memrchr-avx2.
* sysdeps/x86_64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Add tests for __memrchr_avx2 and
__memrchr_sse2.
* sysdeps/x86_64/multiarch/memrchr-avx2.S: New file.
* sysdeps/x86_64/multiarch/memrchr.S: Likewise.
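memrchr-avx2.S below works from the end of the buffer toward the start, testing one 32-byte vector per step (four per iteration in the main loop); a hit is resolved with bsr, which picks the highest set mask bit, i.e. the last matching byte. A scalar C model of the backward scan (memrchr_model is an illustrative name; the real code additionally aligns its loads and handles the short head with shift masks instead of a byte loop):

  #include <stdint.h>
  #include <stddef.h>

  static void *
  memrchr_model (const void *s, int c, size_t n)
  {
    const unsigned char *p = s;
    while (n >= 32)
      {
        const unsigned char *block = p + n - 32;
        uint32_t mask = 0;
        for (int i = 0; i < 32; i++)    /* vpcmpeqb + vpmovmskb */
          if (block[i] == (unsigned char) c)
            mask |= 1u << i;
        if (mask)                       /* Highest bit = last match.  */
          return (void *) (block + 31 - __builtin_clz (mask));
        n -= 32;
      }
    while (n--)                         /* Short head, byte by byte.  */
      if (p[n] == (unsigned char) c)
        return (void *) (p + n);
    return NULL;
  }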
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 2974495..aee4820 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -7,6 +7,7 @@ ifeq ($(subdir),string)
sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
strcmp-sse2-unaligned strncmp-ssse3 \
memchr-avx2 rawmemchr-avx2 \
+ memrchr-avx2 \
memcmp-avx2 \
memcmp-sse4 memcpy-ssse3 \
memmove-ssse3 \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index d3fa98f..9f1ec4b 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -111,6 +111,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memmove, 1,
__memmove_sse2_unaligned_erms))
+ /* Support sysdeps/x86_64/multiarch/memrchr.S. */
+ IFUNC_IMPL (i, name, memrchr,
+ IFUNC_IMPL_ADD (array, i, memrchr,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __memrchr_avx2)
+ IFUNC_IMPL_ADD (array, i, memrchr, 1, __memrchr_sse2))
+
/* Support sysdeps/x86_64/multiarch/memset_chk.S. */
IFUNC_IMPL (i, name, __memset_chk,
IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S
new file mode 100644
index 0000000..3ee02e1
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S
@@ -0,0 +1,359 @@
+/* memrchr optimized with AVX2.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef VZEROUPPER
+# define VZEROUPPER vzeroupper
+# endif
+
+# define VEC_SIZE 32
+
+ .section .text.avx,"ax",@progbits
+ENTRY (__memrchr_avx2)
+ /* Broadcast CHAR to YMM0. */
+ vmovd %esi, %xmm0
+ vpbroadcastb %xmm0, %ymm0
+
+ subq $VEC_SIZE, %rdx
+ jbe L(last_vec_or_less)
+
+ addq %rdx, %rdi
+
+ /* Check the last VEC_SIZE bytes. */
+ vpcmpeqb (%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x0)
+
+ subq $(VEC_SIZE * 4), %rdi
+ movl %edi, %ecx
+ andl $(VEC_SIZE - 1), %ecx
+ jz L(aligned_more)
+
+ /* Align data for aligned loads in the loop. */
+ addq $VEC_SIZE, %rdi
+ addq $VEC_SIZE, %rdx
+ andq $-VEC_SIZE, %rdi
+ subq %rcx, %rdx
+
+ .p2align 4
+L(aligned_more):
+ subq $(VEC_SIZE * 4), %rdx
+ jbe L(last_4x_vec_or_less)
+
+ /* Check the last 4 * VEC_SIZE. Only one VEC_SIZE at a time
+ since data is only aligned to VEC_SIZE. */
+ vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x3)
+
+ vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2
+ vpmovmskb %ymm2, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x2)
+
+ vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3
+ vpmovmskb %ymm3, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x1)
+
+ vpcmpeqb (%rdi), %ymm0, %ymm4
+ vpmovmskb %ymm4, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x0)
+
+ /* Align data to 4 * VEC_SIZE for loop with fewer branches.
+ There are some overlaps with above if data isn't aligned
+ to 4 * VEC_SIZE. */
+ movl %edi, %ecx
+ andl $(VEC_SIZE * 4 - 1), %ecx
+ jz L(loop_4x_vec)
+
+ addq $(VEC_SIZE * 4), %rdi
+ addq $(VEC_SIZE * 4), %rdx
+ andq $-(VEC_SIZE * 4), %rdi
+ subq %rcx, %rdx
+
+ .p2align 4
+L(loop_4x_vec):
+ /* Compare 4 * VEC at a time forward. */
+ subq $(VEC_SIZE * 4), %rdi
+ subq $(VEC_SIZE * 4), %rdx
+ jbe L(last_4x_vec_or_less)
+
+ vmovdqa (%rdi), %ymm1
+ vmovdqa VEC_SIZE(%rdi), %ymm2
+ vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3
+ vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4
+
+ vpcmpeqb %ymm1, %ymm0, %ymm1
+ vpcmpeqb %ymm2, %ymm0, %ymm2
+ vpcmpeqb %ymm3, %ymm0, %ymm3
+ vpcmpeqb %ymm4, %ymm0, %ymm4
+
+ vpor %ymm1, %ymm2, %ymm5
+ vpor %ymm3, %ymm4, %ymm6
+ vpor %ymm5, %ymm6, %ymm5
+
+ vpmovmskb %ymm5, %eax
+ testl %eax, %eax
+ jz L(loop_4x_vec)
+
+ /* There is a match. */
+ vpmovmskb %ymm4, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x3)
+
+ vpmovmskb %ymm3, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x2)
+
+ vpmovmskb %ymm2, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x1)
+
+ vpmovmskb %ymm1, %eax
+ bsrl %eax, %eax
+ addq %rdi, %rax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(last_4x_vec_or_less):
+ addl $(VEC_SIZE * 4), %edx
+ cmpl $(VEC_SIZE * 2), %edx
+ jbe L(last_2x_vec)
+
+ vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x3)
+
+ vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2
+ vpmovmskb %ymm2, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x2)
+
+ vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3
+ vpmovmskb %ymm3, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x1_check)
+ cmpl $(VEC_SIZE * 3), %edx
+ jbe L(zero)
+
+ vpcmpeqb (%rdi), %ymm0, %ymm4
+ vpmovmskb %ymm4, %eax
+ testl %eax, %eax
+ jz L(zero)
+ bsrl %eax, %eax
+ subq $(VEC_SIZE * 4), %rdx
+ addq %rax, %rdx
+ jl L(zero)
+ addq %rdi, %rax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(last_2x_vec):
+ vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x3_check)
+ cmpl $VEC_SIZE, %edx
+ jbe L(zero)
+
+ vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jz L(zero)
+ bsrl %eax, %eax
+ subq $(VEC_SIZE * 2), %rdx
+ addq %rax, %rdx
+ jl L(zero)
+ addl $(VEC_SIZE * 2), %eax
+ addq %rdi, %rax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(last_vec_x0):
+ bsrl %eax, %eax
+ addq %rdi, %rax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(last_vec_x1):
+ bsrl %eax, %eax
+ addl $VEC_SIZE, %eax
+ addq %rdi, %rax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(last_vec_x2):
+ bsrl %eax, %eax
+ addl $(VEC_SIZE * 2), %eax
+ addq %rdi, %rax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(last_vec_x3):
+ bsrl %eax, %eax
+ addl $(VEC_SIZE * 3), %eax
+ addq %rdi, %rax
+ ret
+
+ .p2align 4
+L(last_vec_x1_check):
+ bsrl %eax, %eax
+ subq $(VEC_SIZE * 3), %rdx
+ addq %rax, %rdx
+ jl L(zero)
+ addl $VEC_SIZE, %eax
+ addq %rdi, %rax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(last_vec_x3_check):
+ bsrl %eax, %eax
+ subq $VEC_SIZE, %rdx
+ addq %rax, %rdx
+ jl L(zero)
+ addl $(VEC_SIZE * 3), %eax
+ addq %rdi, %rax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(zero):
+ VZEROUPPER
+L(null):
+ xorl %eax, %eax
+ ret
+
+ .p2align 4
+L(last_vec_or_less_aligned):
+ movl %edx, %ecx
+
+ vpcmpeqb (%rdi), %ymm0, %ymm1
+
+ movl $1, %edx
+ /* Support rdx << 32: a 64-bit shift is needed, since a 32-bit
+ shift would take the count modulo 32. */
+ salq %cl, %rdx
+ subq $1, %rdx
+
+ vpmovmskb %ymm1, %eax
+
+ /* Remove the trailing bytes. */
+ andl %edx, %eax
+ testl %eax, %eax
+ jz L(zero)
+
+ bsrl %eax, %eax
+ addq %rdi, %rax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(last_vec_or_less):
+ addl $VEC_SIZE, %edx
+
+ /* Check for zero length. */
+ testl %edx, %edx
+ jz L(null)
+
+ movl %edi, %ecx
+ andl $(VEC_SIZE - 1), %ecx
+ jz L(last_vec_or_less_aligned)
+
+ movl %ecx, %esi
+ movl %ecx, %r8d
+ addl %edx, %esi
+ andq $-VEC_SIZE, %rdi
+
+ subl $VEC_SIZE, %esi
+ ja L(last_vec_2x_aligned)
+
+ /* Check the last VEC. */
+ vpcmpeqb (%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+
+ /* Remove the leading and trailing bytes. */
+ sarl %cl, %eax
+ movl %edx, %ecx
+
+ movl $1, %edx
+ sall %cl, %edx
+ subl $1, %edx
+
+ andl %edx, %eax
+ testl %eax, %eax
+ jz L(zero)
+
+ bsrl %eax, %eax
+ addq %rdi, %rax
+ addq %r8, %rax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(last_vec_2x_aligned):
+ movl %esi, %ecx
+
+ /* Check the last VEC. */
+ vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm1
+
+ movl $1, %edx
+ sall %cl, %edx
+ subl $1, %edx
+
+ vpmovmskb %ymm1, %eax
+
+ /* Remove the trailing bytes. */
+ andl %edx, %eax
+
+ testl %eax, %eax
+ jnz L(last_vec_x1)
+
+ /* Check the second last VEC. */
+ vpcmpeqb (%rdi), %ymm0, %ymm1
+
+ movl %r8d, %ecx
+
+ vpmovmskb %ymm1, %eax
+
+ /* Remove the leading bytes. Must use unsigned right shift for
+ bsrl below. */
+ shrl %cl, %eax
+ testl %eax, %eax
+ jz L(zero)
+
+ bsrl %eax, %eax
+ addq %rdi, %rax
+ addq %r8, %rax
+ VZEROUPPER
+ ret
+END (__memrchr_avx2)
+#endif
diff --git a/sysdeps/x86_64/multiarch/memrchr.S b/sysdeps/x86_64/multiarch/memrchr.S
new file mode 100644
index 0000000..20a9e76
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memrchr.S
@@ -0,0 +1,55 @@
+/* Multiple versions of memrchr
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+ .text
+ENTRY(__memrchr)
+ .type __memrchr, @gnu_indirect_function
+ LOAD_RTLD_GLOBAL_RO_RDX
+ HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+ jnz 1f
+ HAS_ARCH_FEATURE (AVX2_Usable)
+ jz 1f
+ HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+ jz 1f
+ leaq __memrchr_avx2(%rip), %rax
+ ret
+
+1: leaq __memrchr_sse2(%rip), %rax
+ ret
+END(__memrchr)
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __memrchr_sse2, @function; \
+ .p2align 4; \
+ .globl __memrchr_sse2; \
+ .hidden __memrchr_sse2; \
+ __memrchr_sse2: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __memrchr_sse2, .-__memrchr_sse2
+#endif
+
+#include "../memrchr.S"
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=adb18859214a0e365eddc6e253789ae366c573c3
commit adb18859214a0e365eddc6e253789ae366c573c3
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Mon May 22 15:09:50 2017 -0700
x86-64: Optimize strchr/strchrnul/wcschr with AVX2
Optimize strchr/strchrnul/wcschr with AVX2 to search 32 bytes with vector
instructions. It is as fast as the SSE2 versions for size <= 16 bytes and up
to 1X faster for size > 16 bytes on Haswell. Select the AVX2 version on
AVX2 machines where vzeroupper is preferred and AVX unaligned load is fast.
NB: It uses TZCNT instead of BSF since TZCNT produces the same result
as BSF for non-zero input. TZCNT is faster than BSF and is executed
as BSF if the machine doesn't support TZCNT.
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
strchr-avx2, strchrnul-avx2 and wcschr-avx2.
* sysdeps/x86_64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Add tests for __strchr_avx2,
__strchrnul_avx2, __strchrnul_sse2, __wcschr_avx2 and
__wcschr_sse2.
* sysdeps/x86_64/multiarch/strchr-avx2.S: New file.
* sysdeps/x86_64/multiarch/strchrnul-avx2.S: Likewise.
* sysdeps/x86_64/multiarch/strchrnul.S: Likewise.
* sysdeps/x86_64/multiarch/wcschr-avx2.S: Likewise.
* sysdeps/x86_64/multiarch/wcschr.S: Likewise.
* sysdeps/x86_64/multiarch/strchr.S (strchr): Add support for
__strchr_avx2.
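strchr-avx2.S below compares each 32-byte vector against both CHAR and the null byte in parallel, ORs the two results, and takes TZCNT of the combined mask, so a single branch finds whichever comes first; strchr then needs one scalar check to turn a terminator hit into NULL, while strchrnul returns the position either way. A scalar C model (strchr_model is an illustrative name; like the real code, it assumes the 32-byte reads cannot fault, which the assembly arranges by aligning):

  #include <stdint.h>

  static char *
  strchr_model (const char *s, int c)
  {
    for (;; s += 32)
      {
        uint32_t mask = 0;
        for (int i = 0; i < 32; i++)    /* vpcmpeqb x2 + vpor + vpmovmskb */
          if (s[i] == (char) c || s[i] == '\0')
            mask |= 1u << i;
        if (mask)
          {
            const char *hit = s + __builtin_ctz (mask);   /* tzcntl */
            /* strchrnul would return hit unconditionally; strchr must
               return NULL when the first hit is the terminator and c is
               not '\0' -- the cmovne on each exit path.  */
            return *hit == (char) c ? (char *) hit : NULL;
          }
      }
  }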
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index f33f21a..2974495 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -13,6 +13,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
memcpy-ssse3-back \
memmove-ssse3-back \
memmove-avx512-no-vzeroupper strcasecmp_l-ssse3 \
+ strchr-avx2 strchrnul-avx2 \
strlen-avx2 strnlen-avx2 \
strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
@@ -37,5 +38,6 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
wmemchr-avx2 \
wmemcmp-avx2 \
wcscpy-ssse3 wcscpy-c \
+ wcschr-avx2 \
wcslen-avx2 wcsnlen-avx2
endif
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index c2b07b3..d3fa98f 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -230,9 +230,19 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strchr.S. */
IFUNC_IMPL (i, name, strchr,
+ IFUNC_IMPL_ADD (array, i, strchr,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __strchr_avx2)
IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2_no_bsf)
IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2))
+ /* Support sysdeps/x86_64/multiarch/strchrnul.S. */
+ IFUNC_IMPL (i, name, strchrnul,
+ IFUNC_IMPL_ADD (array, i, strchrnul,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __strchrnul_avx2)
+ IFUNC_IMPL_ADD (array, i, strchrnul, 1, __strchrnul_sse2))
+
/* Support sysdeps/x86_64/multiarch/strcmp.S. */
IFUNC_IMPL (i, name, strcmp,
IFUNC_IMPL_ADD (array, i, strcmp, HAS_CPU_FEATURE (SSE4_2),
@@ -317,6 +327,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2))
+ /* Support sysdeps/x86_64/multiarch/wcschr.S. */
+ IFUNC_IMPL (i, name, wcschr,
+ IFUNC_IMPL_ADD (array, i, wcschr,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __wcschr_avx2)
+ IFUNC_IMPL_ADD (array, i, wcschr, 1, __wcschr_sse2))
+
/* Support sysdeps/x86_64/multiarch/wcscpy.S. */
IFUNC_IMPL (i, name, wcscpy,
IFUNC_IMPL_ADD (array, i, wcscpy, HAS_CPU_FEATURE (SSSE3),
diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
new file mode 100644
index 0000000..e4292d3
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
@@ -0,0 +1,254 @@
+/* strchr/strchrnul optimized with AVX2.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef STRCHR
+# define STRCHR __strchr_avx2
+# endif
+
+# ifdef USE_AS_WCSCHR
+# define VPBROADCAST vpbroadcastd
+# define VPCMPEQ vpcmpeqd
+# define CHAR_REG esi
+# else
+# define VPBROADCAST vpbroadcastb
+# define VPCMPEQ vpcmpeqb
+# define CHAR_REG sil
+# endif
+
+# ifndef VZEROUPPER
+# define VZEROUPPER vzeroupper
+# endif
+
+# define VEC_SIZE 32
+
+ .section .text.avx,"ax",@progbits
+ENTRY (STRCHR)
+ movl %edi, %ecx
+ /* Broadcast CHAR to YMM0. */
+ vmovd %esi, %xmm0
+ vpxor %xmm9, %xmm9, %xmm9
+ VPBROADCAST %xmm0, %ymm0
+ /* Check if we may cross page boundary with one vector load. */
+ andl $(2 * VEC_SIZE - 1), %ecx
+ cmpl $VEC_SIZE, %ecx
+ ja L(cros_page_boundary)
+
+ /* Check the first VEC_SIZE bytes. Search for both CHAR and the
+ null byte. */
+ vmovdqu (%rdi), %ymm8
+ VPCMPEQ %ymm8, %ymm0, %ymm1
+ VPCMPEQ %ymm8, %ymm9, %ymm2
+ vpor %ymm1, %ymm2, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x0)
+
+ /* Align data for aligned loads in the loop. */
+ addq $VEC_SIZE, %rdi
+ andl $(VEC_SIZE - 1), %ecx
+ andq $-VEC_SIZE, %rdi
+
+ jmp L(more_4x_vec)
+
+ .p2align 4
+L(cros_page_boundary):
+ andl $(VEC_SIZE - 1), %ecx
+ andq $-VEC_SIZE, %rdi
+ vmovdqu (%rdi), %ymm8
+ VPCMPEQ %ymm8, %ymm0, %ymm1
+ VPCMPEQ %ymm8, %ymm9, %ymm2
+ vpor %ymm1, %ymm2, %ymm1
+ vpmovmskb %ymm1, %eax
+ /* Remove the leading bytes. */
+ sarl %cl, %eax
+ testl %eax, %eax
+ jz L(aligned_more)
+ /* Found CHAR or the null byte. */
+ tzcntl %eax, %eax
+ addq %rcx, %rax
+# ifdef USE_AS_STRCHRNUL
+ addq %rdi, %rax
+# else
+ xorl %edx, %edx
+ leaq (%rdi, %rax), %rax
+ cmp (%rax), %CHAR_REG
+ cmovne %rdx, %rax
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(aligned_more):
+ addq $VEC_SIZE, %rdi
+
+L(more_4x_vec):
+ /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
+ since data is only aligned to VEC_SIZE. */
+ vmovdqa (%rdi), %ymm8
+ VPCMPEQ %ymm8, %ymm0, %ymm1
+ VPCMPEQ %ymm8, %ymm9, %ymm2
+ vpor %ymm1, %ymm2, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x0)
+
+ vmovdqa VEC_SIZE(%rdi), %ymm8
+ VPCMPEQ %ymm8, %ymm0, %ymm1
+ VPCMPEQ %ymm8, %ymm9, %ymm2
+ vpor %ymm1, %ymm2, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x1)
+
+ vmovdqa (VEC_SIZE * 2)(%rdi), %ymm8
+ VPCMPEQ %ymm8, %ymm0, %ymm1
+ VPCMPEQ %ymm8, %ymm9, %ymm2
+ vpor %ymm1, %ymm2, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x2)
+
+ vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
+ VPCMPEQ %ymm8, %ymm0, %ymm1
+ VPCMPEQ %ymm8, %ymm9, %ymm2
+ vpor %ymm1, %ymm2, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x3)
+
+ addq $(VEC_SIZE * 4), %rdi
+
+ /* Align data to 4 * VEC_SIZE. */
+ movq %rdi, %rcx
+ andl $(4 * VEC_SIZE - 1), %ecx
+ andq $-(4 * VEC_SIZE), %rdi
+
+ .p2align 4
+L(loop_4x_vec):
+ /* Compare 4 * VEC at a time forward. */
+ vmovdqa (%rdi), %ymm5
+ vmovdqa VEC_SIZE(%rdi), %ymm6
+ vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7
+ vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
+
+ VPCMPEQ %ymm5, %ymm0, %ymm1
+ VPCMPEQ %ymm6, %ymm0, %ymm2
+ VPCMPEQ %ymm7, %ymm0, %ymm3
+ VPCMPEQ %ymm8, %ymm0, %ymm4
+
+ VPCMPEQ %ymm5, %ymm9, %ymm5
+ VPCMPEQ %ymm6, %ymm9, %ymm6
+ VPCMPEQ %ymm7, %ymm9, %ymm7
+ VPCMPEQ %ymm8, %ymm9, %ymm8
+
+ vpor %ymm1, %ymm5, %ymm1
+ vpor %ymm2, %ymm6, %ymm2
+ vpor %ymm3, %ymm7, %ymm3
+ vpor %ymm4, %ymm8, %ymm4
+
+ vpor %ymm1, %ymm2, %ymm5
+ vpor %ymm3, %ymm4, %ymm6
+
+ vpor %ymm5, %ymm6, %ymm5
+
+ vpmovmskb %ymm5, %eax
+ testl %eax, %eax
+ jnz L(4x_vec_end)
+
+ addq $(VEC_SIZE * 4), %rdi
+
+ jmp L(loop_4x_vec)
+
+ .p2align 4
+L(first_vec_x0):
+ /* Found CHAR or the null byte. */
+ tzcntl %eax, %eax
+# ifdef USE_AS_STRCHRNUL
+ addq %rdi, %rax
+# else
+ xorl %edx, %edx
+ leaq (%rdi, %rax), %rax
+ cmp (%rax), %CHAR_REG
+ cmovne %rdx, %rax
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(first_vec_x1):
+ tzcntl %eax, %eax
+# ifdef USE_AS_STRCHRNUL
+ addq $VEC_SIZE, %rax
+ addq %rdi, %rax
+# else
+ xorl %edx, %edx
+ leaq VEC_SIZE(%rdi, %rax), %rax
+ cmp (%rax), %CHAR_REG
+ cmovne %rdx, %rax
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(first_vec_x2):
+ tzcntl %eax, %eax
+# ifdef USE_AS_STRCHRNUL
+ addq $(VEC_SIZE * 2), %rax
+ addq %rdi, %rax
+# else
+ xorl %edx, %edx
+ leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
+ cmp (%rax), %CHAR_REG
+ cmovne %rdx, %rax
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(4x_vec_end):
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x0)
+ vpmovmskb %ymm2, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x1)
+ vpmovmskb %ymm3, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x2)
+ vpmovmskb %ymm4, %eax
+ testl %eax, %eax
+L(first_vec_x3):
+ tzcntl %eax, %eax
+# ifdef USE_AS_STRCHRNUL
+ addq $(VEC_SIZE * 3), %rax
+ addq %rdi, %rax
+# else
+ xorl %edx, %edx
+ leaq (VEC_SIZE * 3)(%rdi, %rax), %rax
+ cmp (%rax), %CHAR_REG
+ cmovne %rdx, %rax
+# endif
+ VZEROUPPER
+ ret
+
+END (STRCHR)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strchr.S b/sysdeps/x86_64/multiarch/strchr.S
index c9f54ca..f83ba6e 100644
--- a/sysdeps/x86_64/multiarch/strchr.S
+++ b/sysdeps/x86_64/multiarch/strchr.S
@@ -26,6 +26,15 @@
ENTRY(strchr)
.type strchr, @gnu_indirect_function
LOAD_RTLD_GLOBAL_RO_RDX
+ HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+ jnz 1f
+ HAS_ARCH_FEATURE (AVX2_Usable)
+ jz 1f
+ HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+ jz 1f
+ leaq __strchr_avx2(%rip), %rax
+ ret
+1:
leaq __strchr_sse2(%rip), %rax
2: HAS_ARCH_FEATURE (Slow_BSF)
jz 3f
diff --git a/sysdeps/x86_64/multiarch/strchrnul-avx2.S b/sysdeps/x86_64/multiarch/strchrnul-avx2.S
new file mode 100644
index 0000000..fa0cc09
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strchrnul-avx2.S
@@ -0,0 +1,3 @@
+#define STRCHR __strchrnul_avx2
+#define USE_AS_STRCHRNUL 1
+#include "strchr-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strchr.S b/sysdeps/x86_64/multiarch/strchrnul.S
similarity index 51%
copy from sysdeps/x86_64/multiarch/strchr.S
copy to sysdeps/x86_64/multiarch/strchrnul.S
index c9f54ca..4df9c39 100644
--- a/sysdeps/x86_64/multiarch/strchr.S
+++ b/sysdeps/x86_64/multiarch/strchrnul.S
@@ -1,5 +1,6 @@
-/* Multiple versions of strchr
- Copyright (C) 2009-2017 Free Software Foundation, Inc.
+/* Multiple versions of strchrnul
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2017 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -19,39 +20,36 @@
#include <sysdep.h>
#include <init-arch.h>
-
-/* Define multiple versions only for the definition in libc. */
+/* Define multiple versions only for the definition in libc. */
#if IS_IN (libc)
.text
-ENTRY(strchr)
- .type strchr, @gnu_indirect_function
+ENTRY(__strchrnul)
+ .type __strchrnul, @gnu_indirect_function
LOAD_RTLD_GLOBAL_RO_RDX
- leaq __strchr_sse2(%rip), %rax
-2: HAS_ARCH_FEATURE (Slow_BSF)
- jz 3f
- leaq __strchr_sse2_no_bsf(%rip), %rax
-3: ret
-END(strchr)
-
-
+ HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+ jnz 1f
+ HAS_ARCH_FEATURE (AVX2_Usable)
+ jz 1f
+ HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+ jz 1f
+ leaq __strchrnul_avx2(%rip), %rax
+ ret
+
+1: leaq __strchrnul_sse2(%rip), %rax
+ ret
+END(__strchrnul)
# undef ENTRY
# define ENTRY(name) \
- .type __strchr_sse2, @function; \
- .align 16; \
- .globl __strchr_sse2; \
- .hidden __strchr_sse2; \
- __strchr_sse2: cfi_startproc; \
+ .type __strchrnul_sse2, @function; \
+ .p2align 4; \
+ .globl __strchrnul_sse2; \
+ .hidden __strchrnul_sse2; \
+ __strchrnul_sse2: cfi_startproc; \
CALL_MCOUNT
# undef END
# define END(name) \
- cfi_endproc; .size __strchr_sse2, .-__strchr_sse2
-# undef libc_hidden_builtin_def
-/* It doesn't make sense to send libc-internal strchr calls through a PLT.
- The speedup we get from using SSE4.2 instruction is likely eaten away
- by the indirect call in the PLT. */
-# define libc_hidden_builtin_def(name) \
- .globl __GI_strchr; __GI_strchr = __strchr_sse2
+ cfi_endproc; .size __strchrnul_sse2, .-__strchrnul_sse2
#endif
-#include "../strchr.S"
+#include "../strchrnul.S"
diff --git a/sysdeps/x86_64/multiarch/wcschr-avx2.S b/sysdeps/x86_64/multiarch/wcschr-avx2.S
new file mode 100644
index 0000000..67726b6
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcschr-avx2.S
@@ -0,0 +1,3 @@
+#define STRCHR __wcschr_avx2
+#define USE_AS_WCSCHR 1
+#include "strchr-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcschr.S b/sysdeps/x86_64/multiarch/wcschr.S
new file mode 100644
index 0000000..e132acc
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcschr.S
@@ -0,0 +1,67 @@
+/* Multiple versions of wcschr
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+ .text
+ENTRY(__wcschr)
+ .type __wcschr, @gnu_indirect_function
+ LOAD_RTLD_GLOBAL_RO_RDX
+ HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+ jnz 1f
+ HAS_ARCH_FEATURE (AVX2_Usable)
+ jz 1f
+ HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+ jz 1f
+ leaq __wcschr_avx2(%rip), %rax
+ ret
+
+1: leaq __wcschr_sse2(%rip), %rax
+ ret
+END(__wcschr)
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __wcschr_sse2, @function; \
+ .p2align 4; \
+ .globl __wcschr_sse2; \
+ .hidden __wcschr_sse2; \
+ __wcschr_sse2: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __wcschr_sse2, .-__wcschr_sse2
+
+# ifdef SHARED
+/* It doesn't make sense to send libc-internal wcschr calls through a PLT.
+ The speedup we get from using AVX2 instructions is likely eaten away
+ by the indirect call in the PLT. */
+# undef libc_hidden_def
+# define libc_hidden_def(name) \
+ .globl __GI___wcschr; __GI___wcschr = __wcschr_sse2
+# undef libc_hidden_weak
+# define libc_hidden_weak(name) \
+ .weak __GI_wcschr; __GI_wcschr = __wcschr_sse2
+# endif
+#endif
+
+#include "../wcschr.S"
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=c67ef62d5961ad7630fa9bb413eac252100f70c1
commit c67ef62d5961ad7630fa9bb413eac252100f70c1
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Fri May 19 12:19:42 2017 -0700
x86-64: Optimize strlen/strnlen/wcslen/wcsnlen with AVX2
Optimize strlen/strnlen/wcslen/wcsnlen with AVX2 to check 32 bytes with
a single vector compare instruction. It is as fast as the SSE2 versions
for size <= 16 bytes and up to 1X faster for size > 16 bytes on Haswell.
Select the AVX2 version on AVX2 machines where vzeroupper is preferred
and AVX unaligned load is fast.
NB: It uses TZCNT instead of BSF since TZCNT produces the same result
as BSF for non-zero input. TZCNT is faster than BSF and is executed
as BSF if the machine doesn't support TZCNT.
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
strlen-avx2, strnlen-avx2 and wcslen-avx2 and wcsnlen-avx2.
* sysdeps/x86_64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Add tests for __strlen_avx2,
__strlen_sse2, __strnlen_avx2, __strnlen_sse2, __wcslen_avx2,
__wcslen_sse2, __wcsnlen_avx2 and __wcsnlen_sse2.
* sysdeps/x86_64/multiarch/strlen-avx2.S: New file.
* sysdeps/x86_64/multiarch/strlen.S: Likewise.
* sysdeps/x86_64/multiarch/strnlen-avx2.S: Likewise.
* sysdeps/x86_64/multiarch/strnlen.S: Likewise.
* sysdeps/x86_64/multiarch/wcslen-avx2.S: Likewise.
* sysdeps/x86_64/multiarch/wcslen.S: Likewise.
* sysdeps/x86_64/multiarch/wcsnlen-avx2.S: Likewise.
* sysdeps/x86_64/multiarch/wcsnlen.S: Likewise.
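The main loop of strlen-avx2.S below checks four 32-byte vectors per iteration with VPMINU: the byte-wise unsigned minimum of the four has a zero byte exactly where at least one of them does, so a single compare-against-zero covers 128 bytes, and only after a hit does L(4x_vec_end) re-test the vectors individually. A scalar C model of the fold (min4 and strlen_model are illustrative names; like the real code, it assumes reads past the terminator stay within readable memory):

  #include <stddef.h>

  /* Lane-wise minimum of four bytes, the scalar analogue of three
     vpminub instructions.  */
  static unsigned char
  min4 (unsigned char a, unsigned char b, unsigned char c, unsigned char d)
  {
    unsigned char ab = a < b ? a : b;
    unsigned char cd = c < d ? c : d;
    return ab < cd ? ab : cd;
  }

  static size_t
  strlen_model (const char *s)
  {
    const char *p = s;                  /* The real code aligns p first.  */
    for (;; p += 128)
      {
        int hit = 0;
        for (int i = 0; i < 32; i++)    /* vpminub x3 + vpcmpeqb + test */
          if (min4 (p[i], p[i + 32], p[i + 64], p[i + 96]) == 0)
            hit = 1;
        if (!hit)
          continue;
        for (size_t i = 0; ; i++)       /* L(4x_vec_end): locate the nul.  */
          if (p[i] == '\0')
            return (size_t) (p - s) + i;
      }
  }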
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 48aba0f..f33f21a 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -13,6 +13,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
memcpy-ssse3-back \
memmove-ssse3-back \
memmove-avx512-no-vzeroupper strcasecmp_l-ssse3 \
+ strlen-avx2 strnlen-avx2 \
strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
strcpy-sse2-unaligned strncpy-sse2-unaligned \
@@ -35,5 +36,6 @@ ifeq ($(subdir),wcsmbs)
sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
wmemchr-avx2 \
wmemcmp-avx2 \
- wcscpy-ssse3 wcscpy-c
+ wcscpy-ssse3 wcscpy-c \
+ wcslen-avx2 wcsnlen-avx2
endif
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index ae09241..c2b07b3 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -165,6 +165,20 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__rawmemchr_avx2)
IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_sse2))
+ /* Support sysdeps/x86_64/multiarch/strlen.S. */
+ IFUNC_IMPL (i, name, strlen,
+ IFUNC_IMPL_ADD (array, i, strlen,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __strlen_avx2)
+ IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2))
+
+ /* Support sysdeps/x86_64/multiarch/strnlen.S. */
+ IFUNC_IMPL (i, name, strnlen,
+ IFUNC_IMPL_ADD (array, i, strnlen,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __strnlen_avx2)
+ IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2))
+
/* Support sysdeps/x86_64/multiarch/stpncpy.S. */
IFUNC_IMPL (i, name, stpncpy,
IFUNC_IMPL_ADD (array, i, stpncpy, HAS_CPU_FEATURE (SSSE3),
@@ -309,6 +323,20 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__wcscpy_ssse3)
IFUNC_IMPL_ADD (array, i, wcscpy, 1, __wcscpy_sse2))
+ /* Support sysdeps/x86_64/multiarch/wcslen.S. */
+ IFUNC_IMPL (i, name, wcslen,
+ IFUNC_IMPL_ADD (array, i, wcslen,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __wcslen_avx2)
+ IFUNC_IMPL_ADD (array, i, wcslen, 1, __wcslen_sse2))
+
+ /* Support sysdeps/x86_64/multiarch/wcsnlen.S. */
+ IFUNC_IMPL (i, name, wcsnlen,
+ IFUNC_IMPL_ADD (array, i, wcsnlen,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __wcsnlen_avx2)
+ IFUNC_IMPL_ADD (array, i, wcsnlen, 1, __wcsnlen_sse2))
+
/* Support sysdeps/x86_64/multiarch/wmemchr.S. */
IFUNC_IMPL (i, name, wmemchr,
IFUNC_IMPL_ADD (array, i, wmemchr,
diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
new file mode 100644
index 0000000..1dc823a
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
@@ -0,0 +1,394 @@
+/* strlen/strnlen/wcslen/wcsnlen optimized with AVX2.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef STRLEN
+# define STRLEN __strlen_avx2
+# endif
+
+# ifdef USE_AS_WCSLEN
+# define VPCMPEQ vpcmpeqd
+# define VPMINU vpminud
+# else
+# define VPCMPEQ vpcmpeqb
+# define VPMINU vpminub
+# endif
+
+# ifndef VZEROUPPER
+# define VZEROUPPER vzeroupper
+# endif
+
+# define VEC_SIZE 32
+
+ .section .text.avx,"ax",@progbits
+ENTRY (STRLEN)
+# ifdef USE_AS_STRNLEN
+ /* Check for zero length. */
+ testq %rsi, %rsi
+ jz L(zero)
+# ifdef USE_AS_WCSLEN
+ shl $2, %rsi
+# endif
+ movq %rsi, %r8
+# endif
+ movl %edi, %ecx
+ movq %rdi, %rdx
+ vpxor %xmm0, %xmm0, %xmm0
+
+ /* Check if we may cross page boundary with one vector load. */
+ andl $(2 * VEC_SIZE - 1), %ecx
+ cmpl $VEC_SIZE, %ecx
+ ja L(cros_page_boundary)
+
+ /* Check the first VEC_SIZE bytes. */
+ VPCMPEQ (%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+
+# ifdef USE_AS_STRNLEN
+ jnz L(first_vec_x0_check)
+ /* Adjust length and check the end of data. */
+ subq $VEC_SIZE, %rsi
+ jbe L(max)
+# else
+ jnz L(first_vec_x0)
+# endif
+
+ /* Align data for aligned loads in the loop. */
+ addq $VEC_SIZE, %rdi
+ andl $(VEC_SIZE - 1), %ecx
+ andq $-VEC_SIZE, %rdi
+
+# ifdef USE_AS_STRNLEN
+ /* Adjust length. */
+ addq %rcx, %rsi
+
+ subq $(VEC_SIZE * 4), %rsi
+ jbe L(last_4x_vec_or_less)
+# endif
+ jmp L(more_4x_vec)
+
+ .p2align 4
+L(cros_page_boundary):
+ andl $(VEC_SIZE - 1), %ecx
+ andq $-VEC_SIZE, %rdi
+ VPCMPEQ (%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ /* Remove the leading bytes. */
+ sarl %cl, %eax
+ testl %eax, %eax
+ jz L(aligned_more)
+ tzcntl %eax, %eax
+# ifdef USE_AS_STRNLEN
+ /* Check the end of data. */
+ cmpq %rax, %rsi
+ jbe L(max)
+# endif
+ addq %rdi, %rax
+ addq %rcx, %rax
+ subq %rdx, %rax
+# ifdef USE_AS_WCSLEN
+ shrq $2, %rax
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(aligned_more):
+# ifdef USE_AS_STRNLEN
+ /* "rcx" is less than VEC_SIZE. Calculate "rdx + rcx - VEC_SIZE"
+ with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE"
+ to avoid possible addition overflow. */
+ negq %rcx
+ addq $VEC_SIZE, %rcx
+
+ /* Check the end of data. */
+ subq %rcx, %rsi
+ jbe L(max)
+# endif
+
+ addq $VEC_SIZE, %rdi
+
+# ifdef USE_AS_STRNLEN
+ subq $(VEC_SIZE * 4), %rsi
+ jbe L(last_4x_vec_or_less)
+# endif
+
+L(more_4x_vec):
+ /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
+ since data is only aligned to VEC_SIZE. */
+ VPCMPEQ (%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x0)
+
+ VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x1)
+
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x2)
+
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x3)
+
+ addq $(VEC_SIZE * 4), %rdi
+
+# ifdef USE_AS_STRNLEN
+ subq $(VEC_SIZE * 4), %rsi
+ jbe L(last_4x_vec_or_less)
+# endif
+
+ /* Align data to 4 * VEC_SIZE. */
+ movq %rdi, %rcx
+ andl $(4 * VEC_SIZE - 1), %ecx
+ andq $-(4 * VEC_SIZE), %rdi
+
+# ifdef USE_AS_STRNLEN
+ /* Adjust length. */
+ addq %rcx, %rsi
+# endif
+
+ .p2align 4
+L(loop_4x_vec):
+ /* Compare 4 * VEC at a time forward. */
+ vmovdqa (%rdi), %ymm1
+ vmovdqa VEC_SIZE(%rdi), %ymm2
+ vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3
+ vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4
+ VPMINU %ymm1, %ymm2, %ymm5
+ VPMINU %ymm3, %ymm4, %ymm6
+ VPMINU %ymm5, %ymm6, %ymm5
+
+ VPCMPEQ %ymm5, %ymm0, %ymm5
+ vpmovmskb %ymm5, %eax
+ testl %eax, %eax
+ jnz L(4x_vec_end)
+
+ addq $(VEC_SIZE * 4), %rdi
+
+# ifndef USE_AS_STRNLEN
+ jmp L(loop_4x_vec)
+# else
+ subq $(VEC_SIZE * 4), %rsi
+ ja L(loop_4x_vec)
+
+L(last_4x_vec_or_less):
+ /* Less than 4 * VEC and aligned to VEC_SIZE. */
+ addl $(VEC_SIZE * 2), %esi
+ jle L(last_2x_vec)
+
+ VPCMPEQ (%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x0)
+
+ VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x1)
+
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+
+ jnz L(first_vec_x2_check)
+ subl $VEC_SIZE, %esi
+ jle L(max)
+
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+
+ jnz L(first_vec_x3_check)
+ movq %r8, %rax
+# ifdef USE_AS_WCSLEN
+ shrq $2, %rax
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(last_2x_vec):
+ addl $(VEC_SIZE * 2), %esi
+ VPCMPEQ (%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+
+ jnz L(first_vec_x0_check)
+ subl $VEC_SIZE, %esi
+ jle L(max)
+
+ VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x1_check)
+ movq %r8, %rax
+# ifdef USE_AS_WCSLEN
+ shrq $2, %rax
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(first_vec_x0_check):
+ tzcntl %eax, %eax
+ /* Check the end of data. */
+ cmpq %rax, %rsi
+ jbe L(max)
+ addq %rdi, %rax
+ subq %rdx, %rax
+# ifdef USE_AS_WCSLEN
+ shrq $2, %rax
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(first_vec_x1_check):
+ tzcntl %eax, %eax
+ /* Check the end of data. */
+ cmpq %rax, %rsi
+ jbe L(max)
+ addq $VEC_SIZE, %rax
+ addq %rdi, %rax
+ subq %rdx, %rax
+# ifdef USE_AS_WCSLEN
+ shrq $2, %rax
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(first_vec_x2_check):
+ tzcntl %eax, %eax
+ /* Check the end of data. */
+ cmpq %rax, %rsi
+ jbe L(max)
+ addq $(VEC_SIZE * 2), %rax
+ addq %rdi, %rax
+ subq %rdx, %rax
+# ifdef USE_AS_WCSLEN
+ shrq $2, %rax
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(first_vec_x3_check):
+ tzcntl %eax, %eax
+ /* Check the end of data. */
+ cmpq %rax, %rsi
+ jbe L(max)
+ addq $(VEC_SIZE * 3), %rax
+ addq %rdi, %rax
+ subq %rdx, %rax
+# ifdef USE_AS_WCSLEN
+ shrq $2, %rax
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(max):
+ movq %r8, %rax
+# ifdef USE_AS_WCSLEN
+ shrq $2, %rax
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(zero):
+ xorl %eax, %eax
+ ret
+# endif
+
+ .p2align 4
+L(first_vec_x0):
+ tzcntl %eax, %eax
+ addq %rdi, %rax
+ subq %rdx, %rax
+# ifdef USE_AS_WCSLEN
+ shrq $2, %rax
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(first_vec_x1):
+ tzcntl %eax, %eax
+ addq $VEC_SIZE, %rax
+ addq %rdi, %rax
+ subq %rdx, %rax
+# ifdef USE_AS_WCSLEN
+ shrq $2, %rax
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(first_vec_x2):
+ tzcntl %eax, %eax
+ addq $(VEC_SIZE * 2), %rax
+ addq %rdi, %rax
+ subq %rdx, %rax
+# ifdef USE_AS_WCSLEN
+ shrq $2, %rax
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(4x_vec_end):
+ VPCMPEQ %ymm1, %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x0)
+ VPCMPEQ %ymm2, %ymm0, %ymm2
+ vpmovmskb %ymm2, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x1)
+ VPCMPEQ %ymm3, %ymm0, %ymm3
+ vpmovmskb %ymm3, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x2)
+ VPCMPEQ %ymm4, %ymm0, %ymm4
+ vpmovmskb %ymm4, %eax
+ testl %eax, %eax
+L(first_vec_x3):
+ tzcntl %eax, %eax
+ addq $(VEC_SIZE * 3), %rax
+ addq %rdi, %rax
+ subq %rdx, %rax
+# ifdef USE_AS_WCSLEN
+ shrq $2, %rax
+# endif
+ VZEROUPPER
+ ret
+
+END (STRLEN)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strlen.S b/sysdeps/x86_64/multiarch/strlen.S
new file mode 100644
index 0000000..2847440
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlen.S
@@ -0,0 +1,64 @@
+/* Multiple versions of strlen
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+ .text
+ENTRY(strlen)
+ .type strlen, @gnu_indirect_function
+ LOAD_RTLD_GLOBAL_RO_RDX
+ HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+ jnz 1f
+ HAS_ARCH_FEATURE (AVX2_Usable)
+ jz 1f
+ HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+ jz 1f
+ leaq __strlen_avx2(%rip), %rax
+ ret
+
+1: leaq __strlen_sse2(%rip), %rax
+ ret
+END(strlen)
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __strlen_sse2, @function; \
+ .p2align 4; \
+ .globl __strlen_sse2; \
+ .hidden __strlen_sse2; \
+ __strlen_sse2: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __strlen_sse2, .-__strlen_sse2
+
+# ifdef SHARED
+# undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal strlen calls through a PLT.
+ The speedup we get from using AVX2 instructions is likely eaten away
+ by the indirect call in the PLT. */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_strlen; __GI_strlen = __strlen_sse2
+# endif
+#endif
+
+#include "../strlen.S"
diff --git a/sysdeps/x86_64/multiarch/strnlen-avx2.S b/sysdeps/x86_64/multiarch/strnlen-avx2.S
new file mode 100644
index 0000000..c4062b2
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strnlen-avx2.S
@@ -0,0 +1,4 @@
+#define STRLEN __strnlen_avx2
+#define USE_AS_STRNLEN 1
+
+#include "strlen-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strnlen.S b/sysdeps/x86_64/multiarch/strnlen.S
new file mode 100644
index 0000000..0c2289a
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strnlen.S
@@ -0,0 +1,65 @@
+/* Multiple versions of strnlen
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+ .text
+ENTRY(__strnlen)
+ .type __strnlen, @gnu_indirect_function
+ LOAD_RTLD_GLOBAL_RO_RDX
+ HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+ jnz 1f
+ HAS_ARCH_FEATURE (AVX2_Usable)
+ jz 1f
+ HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+ jz 1f
+ leaq __strnlen_avx2(%rip), %rax
+ ret
+
+1: leaq __strnlen_sse2(%rip), %rax
+ ret
+END(__strnlen)
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __strnlen_sse2, @function; \
+ .p2align 4; \
+ .globl __strnlen_sse2; \
+ .hidden __strnlen_sse2; \
+ __strnlen_sse2: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __strnlen_sse2, .-__strnlen_sse2
+
+# ifdef SHARED
+/* It doesn't make sense to send libc-internal strnlen calls through a PLT.
+ The speedup we get from using AVX2 instructions is likely eaten away
+ by the indirect call in the PLT. */
+# undef libc_hidden_def
+# define libc_hidden_def(name) \
+ .globl __GI_strnlen; __GI_strnlen = __strnlen_sse2; \
+ .globl __GI___strnlen; __GI___strnlen = __strnlen_sse2
+# endif
+#endif
+
+#include "../strnlen.S"
diff --git a/sysdeps/x86_64/multiarch/wcslen-avx2.S b/sysdeps/x86_64/multiarch/wcslen-avx2.S
new file mode 100644
index 0000000..c9224f1
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcslen-avx2.S
@@ -0,0 +1,4 @@
+#define STRLEN __wcslen_avx2
+#define USE_AS_WCSLEN 1
+
+#include "strlen-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcslen.S b/sysdeps/x86_64/multiarch/wcslen.S
new file mode 100644
index 0000000..04369b6
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcslen.S
@@ -0,0 +1,55 @@
+/* Multiple versions of wcslen
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+ .text
+ENTRY(__wcslen)
+ .type __wcslen, @gnu_indirect_function
+ LOAD_RTLD_GLOBAL_RO_RDX
+ HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+ jnz 1f
+ HAS_ARCH_FEATURE (AVX2_Usable)
+ jz 1f
+ HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+ jz 1f
+ leaq __wcslen_avx2(%rip), %rax
+ ret
+
+1: leaq __wcslen_sse2(%rip), %rax
+ ret
+END(__wcslen)
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __wcslen_sse2, @function; \
+ .p2align 4; \
+ .globl __wcslen_sse2; \
+ .hidden __wcslen_sse2; \
+ __wcslen_sse2: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __wcslen_sse2, .-__wcslen_sse2
+#endif
+
+#include "../wcslen.S"
diff --git a/sysdeps/x86_64/multiarch/wcsnlen-avx2.S b/sysdeps/x86_64/multiarch/wcsnlen-avx2.S
new file mode 100644
index 0000000..fac8354
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsnlen-avx2.S
@@ -0,0 +1,5 @@
+#define STRLEN __wcsnlen_avx2
+#define USE_AS_WCSLEN 1
+#define USE_AS_STRNLEN 1
+
+#include "strlen-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcsnlen.S b/sysdeps/x86_64/multiarch/wcsnlen.S
new file mode 100644
index 0000000..0893bea
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsnlen.S
@@ -0,0 +1,55 @@
+/* Multiple versions of wcsnlen
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+ .text
+ENTRY(__wcsnlen)
+ .type __wcsnlen, @gnu_indirect_function
+ LOAD_RTLD_GLOBAL_RO_RDX
+ HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+ jnz 1f
+ HAS_ARCH_FEATURE (AVX2_Usable)
+ jz 1f
+ HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+ jz 1f
+ leaq __wcsnlen_avx2(%rip), %rax
+ ret
+
+1: leaq __wcsnlen_sse2(%rip), %rax
+ ret
+END(__wcsnlen)
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __wcsnlen_sse2, @function; \
+ .p2align 4; \
+ .globl __wcsnlen_sse2; \
+ .hidden __wcsnlen_sse2; \
+ __wcsnlen_sse2: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __wcsnlen_sse2, .-__wcsnlen_sse2
+#endif
+
+#include "../wcsnlen.S"
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=a9c26d841ea4cb4d271093926f6bb6e79a8fe72a
commit a9c26d841ea4cb4d271093926f6bb6e79a8fe72a
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Thu May 18 11:10:09 2017 -0700
x86-64: Optimize memchr/rawmemchr/wmemchr with SSE2/AVX2
SSE2 memchr is extended to support wmemchr. AVX2 memchr/rawmemchr/wmemchr
are added to search 32 bytes with a single vector compare instruction.
AVX2 memchr/rawmemchr/wmemchr are as fast as SSE2 memchr/rawmemchr/wmemchr
for small sizes and up to 1.5X faster for larger sizes on Haswell and
Skylake. Select AVX2 memchr/rawmemchr/wmemchr on AVX2 machines where
vzeroupper is preferred and AVX unaligned load is fast.
NB: It uses TZCNT instead of BSF since TZCNT produces the same result
as BSF for non-zero input. TZCNT is faster than BSF and is executed
as BSF if the machine doesn't support TZCNT.
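The per-vector probe at the heart of the AVX2 version is easier to see
in intrinsics form. The sketch below is illustrative only (probe32 is
a made-up helper, not glibc code); it mirrors the vpbroadcastb /
VPCMPEQ / vpmovmskb / tzcnt sequence but omits the page-crossing,
alignment and length handling that memchr-avx2.S performs:
#include <immintrin.h>
#include <stddef.h>
/* Check one 32-byte window for byte C.  Return a pointer to the
   first match, or NULL if the window contains none.  */
static const char *
probe32 (const char *p, int c)
{
  __m256i needle = _mm256_set1_epi8 ((char) c);    /* vpbroadcastb */
  __m256i data = _mm256_loadu_si256 ((const __m256i *) p);
  __m256i eq = _mm256_cmpeq_epi8 (data, needle);   /* VPCMPEQ */
  unsigned int mask = (unsigned int) _mm256_movemask_epi8 (eq);
  if (mask == 0)
    return NULL;
  /* __builtin_ctz compiles to tzcnt where available; for a non-zero
     mask it gives the byte index of the first match.  */
  return p + __builtin_ctz (mask);
}
The main loop runs four such compares per iteration and ORs the
results together, so a single branch covers 4 * VEC_SIZE bytes.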
* sysdeps/x86_64/memchr.S (MEMCHR): New. Defined as wmemchr or
memchr, depending on whether USE_AS_WMEMCHR is defined.
(PCMPEQ): Likewise.
(memchr): Renamed to ...
(MEMCHR): This. Support wmemchr if USE_AS_WMEMCHR is defined.
Replace pcmpeqb with PCMPEQ.
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
memchr-avx2, rawmemchr-avx2 and wmemchr-avx2.
* sysdeps/x86_64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Test __memchr_avx2, __memchr_sse2,
__rawmemchr_avx2, __rawmemchr_sse2, __wmemchr_avx2 and
__wmemchr_sse2.
* sysdeps/x86_64/multiarch/memchr-avx2.S: New file.
* sysdeps/x86_64/multiarch/memchr.S: Likewise.
* sysdeps/x86_64/multiarch/rawmemchr-avx2.S: Likewise.
* sysdeps/x86_64/multiarch/rawmemchr.S: Likewise.
* sysdeps/x86_64/multiarch/wmemchr-avx2.S: Likewise.
* sysdeps/x86_64/multiarch/wmemchr.S: Likewise.
* sysdeps/x86_64/wmemchr.S: Likewise.
diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S
index d3be012..3167cd8 100644
--- a/sysdeps/x86_64/memchr.S
+++ b/sysdeps/x86_64/memchr.S
@@ -18,17 +18,31 @@
#include <sysdep.h>
+#ifdef USE_AS_WMEMCHR
+# define MEMCHR wmemchr
+# define PCMPEQ pcmpeqd
+#else
+# define MEMCHR memchr
+# define PCMPEQ pcmpeqb
+#endif
+
/* Fast SSE2 version using pmaxub and a 64-byte loop.  */
.text
-ENTRY(memchr)
+ENTRY(MEMCHR)
movd %esi, %xmm1
mov %edi, %ecx
+#ifdef USE_AS_WMEMCHR
+ test %rdx, %rdx
+ jz L(return_null)
+ shl $2, %rdx
+#else
punpcklbw %xmm1, %xmm1
test %rdx, %rdx
jz L(return_null)
punpcklbw %xmm1, %xmm1
+#endif
and $63, %ecx
pshufd $0, %xmm1, %xmm1
@@ -37,7 +51,7 @@ ENTRY(memchr)
ja L(crosscache)
movdqu (%rdi), %xmm0
- pcmpeqb %xmm1, %xmm0
+ PCMPEQ %xmm1, %xmm0
pmovmskb %xmm0, %eax
test %eax, %eax
@@ -58,7 +72,7 @@ L(crosscache):
and $-16, %rdi
movdqa (%rdi), %xmm0
- pcmpeqb %xmm1, %xmm0
+ PCMPEQ %xmm1, %xmm0
/* Check if there is a match. */
pmovmskb %xmm0, %eax
/* Remove the leading bytes. */
@@ -90,25 +104,25 @@ L(unaligned_no_match):
.p2align 4
L(loop_prolog):
movdqa (%rdi), %xmm0
- pcmpeqb %xmm1, %xmm0
+ PCMPEQ %xmm1, %xmm0
pmovmskb %xmm0, %eax
test %eax, %eax
jnz L(matches)
movdqa 16(%rdi), %xmm2
- pcmpeqb %xmm1, %xmm2
+ PCMPEQ %xmm1, %xmm2
pmovmskb %xmm2, %eax
test %eax, %eax
jnz L(matches16)
movdqa 32(%rdi), %xmm3
- pcmpeqb %xmm1, %xmm3
+ PCMPEQ %xmm1, %xmm3
pmovmskb %xmm3, %eax
test %eax, %eax
jnz L(matches32)
movdqa 48(%rdi), %xmm4
- pcmpeqb %xmm1, %xmm4
+ PCMPEQ %xmm1, %xmm4
add $64, %rdi
pmovmskb %xmm4, %eax
test %eax, %eax
@@ -121,25 +135,25 @@ L(loop_prolog):
jbe L(exit_loop)
movdqa (%rdi), %xmm0
- pcmpeqb %xmm1, %xmm0
+ PCMPEQ %xmm1, %xmm0
pmovmskb %xmm0, %eax
test %eax, %eax
jnz L(matches)
movdqa 16(%rdi), %xmm2
- pcmpeqb %xmm1, %xmm2
+ PCMPEQ %xmm1, %xmm2
pmovmskb %xmm2, %eax
test %eax, %eax
jnz L(matches16)
movdqa 32(%rdi), %xmm3
- pcmpeqb %xmm1, %xmm3
+ PCMPEQ %xmm1, %xmm3
pmovmskb %xmm3, %eax
test %eax, %eax
jnz L(matches32)
movdqa 48(%rdi), %xmm3
- pcmpeqb %xmm1, %xmm3
+ PCMPEQ %xmm1, %xmm3
pmovmskb %xmm3, %eax
add $64, %rdi
@@ -160,10 +174,10 @@ L(align64_loop):
movdqa 32(%rdi), %xmm3
movdqa 48(%rdi), %xmm4
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm1, %xmm2
- pcmpeqb %xmm1, %xmm3
- pcmpeqb %xmm1, %xmm4
+ PCMPEQ %xmm1, %xmm0
+ PCMPEQ %xmm1, %xmm2
+ PCMPEQ %xmm1, %xmm3
+ PCMPEQ %xmm1, %xmm4
pmaxub %xmm0, %xmm3
pmaxub %xmm2, %xmm4
@@ -186,9 +200,9 @@ L(align64_loop):
jnz L(matches16)
movdqa 32(%rdi), %xmm3
- pcmpeqb %xmm1, %xmm3
+ PCMPEQ %xmm1, %xmm3
- pcmpeqb 48(%rdi), %xmm1
+ PCMPEQ 48(%rdi), %xmm1
pmovmskb %xmm3, %eax
test %eax, %eax
jnz L(matches32)
@@ -204,26 +218,26 @@ L(exit_loop):
jle L(exit_loop_32)
movdqa (%rdi), %xmm0
- pcmpeqb %xmm1, %xmm0
+ PCMPEQ %xmm1, %xmm0
pmovmskb %xmm0, %eax
test %eax, %eax
jnz L(matches)
movdqa 16(%rdi), %xmm2
- pcmpeqb %xmm1, %xmm2
+ PCMPEQ %xmm1, %xmm2
pmovmskb %xmm2, %eax
test %eax, %eax
jnz L(matches16)
movdqa 32(%rdi), %xmm3
- pcmpeqb %xmm1, %xmm3
+ PCMPEQ %xmm1, %xmm3
pmovmskb %xmm3, %eax
test %eax, %eax
jnz L(matches32_1)
sub $16, %edx
jle L(return_null)
- pcmpeqb 48(%rdi), %xmm1
+ PCMPEQ 48(%rdi), %xmm1
pmovmskb %xmm1, %eax
test %eax, %eax
jnz L(matches48_1)
@@ -234,14 +248,14 @@ L(exit_loop):
L(exit_loop_32):
add $32, %edx
movdqa (%rdi), %xmm0
- pcmpeqb %xmm1, %xmm0
+ PCMPEQ %xmm1, %xmm0
pmovmskb %xmm0, %eax
test %eax, %eax
jnz L(matches_1)
sub $16, %edx
jbe L(return_null)
- pcmpeqb 16(%rdi), %xmm1
+ PCMPEQ 16(%rdi), %xmm1
pmovmskb %xmm1, %eax
test %eax, %eax
jnz L(matches16_1)
@@ -308,8 +322,13 @@ L(matches48_1):
L(return_null):
xor %eax, %eax
ret
-END(memchr)
+END(MEMCHR)
+#ifdef USE_AS_WMEMCHR
+libc_hidden_def (__wmemchr)
+weak_alias (__wmemchr, wmemchr)
+libc_hidden_weak (wmemchr)
+#else
strong_alias (memchr, __memchr)
-
libc_hidden_builtin_def(memchr)
+#endif
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index a62def3..48aba0f 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -6,6 +6,7 @@ ifeq ($(subdir),string)
sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
strcmp-sse2-unaligned strncmp-ssse3 \
+ memchr-avx2 rawmemchr-avx2 \
memcmp-avx2 \
memcmp-sse4 memcpy-ssse3 \
memmove-ssse3 \
@@ -32,6 +33,7 @@ endif
ifeq ($(subdir),wcsmbs)
sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
+ wmemchr-avx2 \
wmemcmp-avx2 \
wcscpy-ssse3 wcscpy-c
endif
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 35f1960..ae09241 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -38,6 +38,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
size_t i = 0;
+ /* Support sysdeps/x86_64/multiarch/memchr.S. */
+ IFUNC_IMPL (i, name, memchr,
+ IFUNC_IMPL_ADD (array, i, memchr,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __memchr_avx2)
+ IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_sse2))
+
/* Support sysdeps/x86_64/multiarch/memcmp.S. */
IFUNC_IMPL (i, name, memcmp,
IFUNC_IMPL_ADD (array, i, memcmp,
@@ -151,6 +158,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__memset_avx512_no_vzeroupper)
)
+ /* Support sysdeps/x86_64/multiarch/rawmemchr.S. */
+ IFUNC_IMPL (i, name, rawmemchr,
+ IFUNC_IMPL_ADD (array, i, rawmemchr,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __rawmemchr_avx2)
+ IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_sse2))
+
/* Support sysdeps/x86_64/multiarch/stpncpy.S. */
IFUNC_IMPL (i, name, stpncpy,
IFUNC_IMPL_ADD (array, i, stpncpy, HAS_CPU_FEATURE (SSSE3),
@@ -295,6 +309,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__wcscpy_ssse3)
IFUNC_IMPL_ADD (array, i, wcscpy, 1, __wcscpy_sse2))
+ /* Support sysdeps/x86_64/multiarch/wmemchr.S. */
+ IFUNC_IMPL (i, name, wmemchr,
+ IFUNC_IMPL_ADD (array, i, wmemchr,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __wmemchr_avx2)
+ IFUNC_IMPL_ADD (array, i, wmemchr, 1, __wmemchr_sse2))
+
/* Support sysdeps/x86_64/multiarch/wmemcmp.S. */
IFUNC_IMPL (i, name, wmemcmp,
IFUNC_IMPL_ADD (array, i, wmemcmp,
diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
new file mode 100644
index 0000000..a7275ed
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
@@ -0,0 +1,340 @@
+/* memchr/wmemchr optimized with AVX2.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef MEMCHR
+# define MEMCHR __memchr_avx2
+# endif
+
+# ifdef USE_AS_WMEMCHR
+# define VPCMPEQ vpcmpeqd
+# else
+# define VPCMPEQ vpcmpeqb
+# endif
+
+# ifndef VZEROUPPER
+# define VZEROUPPER vzeroupper
+# endif
+
+# define VEC_SIZE 32
+
+ .section .text.avx,"ax",@progbits
+ENTRY (MEMCHR)
+# ifndef USE_AS_RAWMEMCHR
+ /* Check for zero length. */
+ testq %rdx, %rdx
+ jz L(null)
+# endif
+ movl %edi, %ecx
+ /* Broadcast CHAR to YMM0. */
+ vmovd %esi, %xmm0
+# ifdef USE_AS_WMEMCHR
+ shl $2, %rdx
+ vpbroadcastd %xmm0, %ymm0
+# else
+ vpbroadcastb %xmm0, %ymm0
+# endif
+ /* Check if we may cross page boundary with one vector load. */
+ andl $(2 * VEC_SIZE - 1), %ecx
+ cmpl $VEC_SIZE, %ecx
+ ja L(cros_page_boundary)
+
+ /* Check the first VEC_SIZE bytes. */
+ VPCMPEQ (%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+
+# ifndef USE_AS_RAWMEMCHR
+ jnz L(first_vec_x0_check)
+ /* Adjust length and check the end of data. */
+ subq $VEC_SIZE, %rdx
+ jbe L(zero)
+# else
+ jnz L(first_vec_x0)
+# endif
+
+ /* Align data for aligned loads in the loop. */
+ addq $VEC_SIZE, %rdi
+ andl $(VEC_SIZE - 1), %ecx
+ andq $-VEC_SIZE, %rdi
+
+# ifndef USE_AS_RAWMEMCHR
+ /* Adjust length. */
+ addq %rcx, %rdx
+
+ subq $(VEC_SIZE * 4), %rdx
+ jbe L(last_4x_vec_or_less)
+# endif
+ jmp L(more_4x_vec)
+
+ .p2align 4
+L(cros_page_boundary):
+ andl $(VEC_SIZE - 1), %ecx
+ andq $-VEC_SIZE, %rdi
+ VPCMPEQ (%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ /* Remove the leading bytes. */
+ sarl %cl, %eax
+ testl %eax, %eax
+ jz L(aligned_more)
+ tzcntl %eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+ /* Check the end of data. */
+ cmpq %rax, %rdx
+ jbe L(zero)
+# endif
+ addq %rdi, %rax
+ addq %rcx, %rax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(aligned_more):
+# ifndef USE_AS_RAWMEMCHR
+ /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
+ instead of "(rdx + rcx) - VEC_SIZE" to avoid possible addition
+ overflow. */
+ negq %rcx
+ addq $VEC_SIZE, %rcx
+
+ /* Check the end of data. */
+ subq %rcx, %rdx
+ jbe L(zero)
+# endif
+
+ addq $VEC_SIZE, %rdi
+
+# ifndef USE_AS_RAWMEMCHR
+ subq $(VEC_SIZE * 4), %rdx
+ jbe L(last_4x_vec_or_less)
+# endif
+
+L(more_4x_vec):
+ /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
+ since data is only aligned to VEC_SIZE. */
+ VPCMPEQ (%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x0)
+
+ VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x1)
+
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x2)
+
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x3)
+
+ addq $(VEC_SIZE * 4), %rdi
+
+# ifndef USE_AS_RAWMEMCHR
+ subq $(VEC_SIZE * 4), %rdx
+ jbe L(last_4x_vec_or_less)
+# endif
+
+ /* Align data to 4 * VEC_SIZE. */
+ movq %rdi, %rcx
+ andl $(4 * VEC_SIZE - 1), %ecx
+ andq $-(4 * VEC_SIZE), %rdi
+
+# ifndef USE_AS_RAWMEMCHR
+ /* Adjust length. */
+ addq %rcx, %rdx
+# endif
+
+ .p2align 4
+L(loop_4x_vec):
+ /* Compare 4 * VEC at a time forward. */
+ VPCMPEQ (%rdi), %ymm0, %ymm1
+ VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
+
+ vpor %ymm1, %ymm2, %ymm5
+ vpor %ymm3, %ymm4, %ymm6
+ vpor %ymm5, %ymm6, %ymm5
+
+ vpmovmskb %ymm5, %eax
+ testl %eax, %eax
+ jnz L(4x_vec_end)
+
+ addq $(VEC_SIZE * 4), %rdi
+
+# ifdef USE_AS_RAWMEMCHR
+ jmp L(loop_4x_vec)
+# else
+ subq $(VEC_SIZE * 4), %rdx
+ ja L(loop_4x_vec)
+
+L(last_4x_vec_or_less):
+ /* Less than 4 * VEC and aligned to VEC_SIZE. */
+ addl $(VEC_SIZE * 2), %edx
+ jle L(last_2x_vec)
+
+ VPCMPEQ (%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x0)
+
+ VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x1)
+
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+
+ jnz L(first_vec_x2_check)
+ subl $VEC_SIZE, %edx
+ jle L(zero)
+
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+
+ jnz L(first_vec_x3_check)
+ xorl %eax, %eax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(last_2x_vec):
+ addl $(VEC_SIZE * 2), %edx
+ VPCMPEQ (%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+
+ jnz L(first_vec_x0_check)
+ subl $VEC_SIZE, %edx
+ jle L(zero)
+
+ VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x1_check)
+ xorl %eax, %eax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(first_vec_x0_check):
+ tzcntl %eax, %eax
+ /* Check the end of data. */
+ cmpq %rax, %rdx
+ jbe L(zero)
+ addq %rdi, %rax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(first_vec_x1_check):
+ tzcntl %eax, %eax
+ /* Check the end of data. */
+ cmpq %rax, %rdx
+ jbe L(zero)
+ addq $VEC_SIZE, %rax
+ addq %rdi, %rax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(first_vec_x2_check):
+ tzcntl %eax, %eax
+ /* Check the end of data. */
+ cmpq %rax, %rdx
+ jbe L(zero)
+ addq $(VEC_SIZE * 2), %rax
+ addq %rdi, %rax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(first_vec_x3_check):
+ tzcntl %eax, %eax
+ /* Check the end of data. */
+ cmpq %rax, %rdx
+ jbe L(zero)
+ addq $(VEC_SIZE * 3), %rax
+ addq %rdi, %rax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(zero):
+ VZEROUPPER
+L(null):
+ xorl %eax, %eax
+ ret
+# endif
+
+ .p2align 4
+L(first_vec_x0):
+ tzcntl %eax, %eax
+ addq %rdi, %rax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(first_vec_x1):
+ tzcntl %eax, %eax
+ addq $VEC_SIZE, %rax
+ addq %rdi, %rax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(first_vec_x2):
+ tzcntl %eax, %eax
+ addq $(VEC_SIZE * 2), %rax
+ addq %rdi, %rax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(4x_vec_end):
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x0)
+ vpmovmskb %ymm2, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x1)
+ vpmovmskb %ymm3, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x2)
+ vpmovmskb %ymm4, %eax
+ testl %eax, %eax
+L(first_vec_x3):
+ tzcntl %eax, %eax
+ addq $(VEC_SIZE * 3), %rax
+ addq %rdi, %rax
+ VZEROUPPER
+ ret
+
+END (MEMCHR)
+#endif
diff --git a/sysdeps/x86_64/multiarch/memchr.S b/sysdeps/x86_64/multiarch/memchr.S
new file mode 100644
index 0000000..dee3fd1
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memchr.S
@@ -0,0 +1,64 @@
+/* Multiple versions of memchr
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+ .text
+ENTRY(memchr)
+ .type memchr, @gnu_indirect_function
+ LOAD_RTLD_GLOBAL_RO_RDX
+ HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+ jnz 1f
+ HAS_ARCH_FEATURE (AVX2_Usable)
+ jz 1f
+ HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+ jz 1f
+ leaq __memchr_avx2(%rip), %rax
+ ret
+
+1: leaq __memchr_sse2(%rip), %rax
+ ret
+END(memchr)
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __memchr_sse2, @function; \
+ .p2align 4; \
+ .globl __memchr_sse2; \
+ .hidden __memchr_sse2; \
+ __memchr_sse2: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __memchr_sse2, .-__memchr_sse2
+
+# ifdef SHARED
+# undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal memchr calls through a PLT.
+ The speedup we get from using AVX2 instructions is likely eaten away
+ by the indirect call in the PLT. */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_memchr; __GI_memchr = __memchr_sse2
+# endif
+#endif
+
+#include "../memchr.S"
diff --git a/sysdeps/x86_64/multiarch/rawmemchr-avx2.S b/sysdeps/x86_64/multiarch/rawmemchr-avx2.S
new file mode 100644
index 0000000..128f9ea
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/rawmemchr-avx2.S
@@ -0,0 +1,4 @@
+#define MEMCHR __rawmemchr_avx2
+#define USE_AS_RAWMEMCHR 1
+
+#include "memchr-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/rawmemchr.S b/sysdeps/x86_64/multiarch/rawmemchr.S
new file mode 100644
index 0000000..b938f13
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/rawmemchr.S
@@ -0,0 +1,64 @@
+/* Multiple versions of rawmemchr
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+ .text
+ENTRY(__rawmemchr)
+ .type __rawmemchr, @gnu_indirect_function
+ LOAD_RTLD_GLOBAL_RO_RDX
+ HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+ jnz 1f
+ HAS_ARCH_FEATURE (AVX2_Usable)
+ jz 1f
+ HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+ jz 1f
+ leaq __rawmemchr_avx2(%rip), %rax
+ ret
+
+1: leaq __rawmemchr_sse2(%rip), %rax
+ ret
+END(__rawmemchr)
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __rawmemchr_sse2, @function; \
+ .p2align 4; \
+ .globl __rawmemchr_sse2; \
+ .hidden __rawmemchr_sse2; \
+ __rawmemchr_sse2: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __rawmemchr_sse2, .-__rawmemchr_sse2
+
+# ifdef SHARED
+/* It doesn't make sense to send libc-internal rawmemchr calls through a
+ PLT. The speedup we get from using AVX2 instructions is likely eaten
+ away by the indirect call in the PLT. */
+# undef libc_hidden_def
+# define libc_hidden_def(name) \
+ .globl __GI___rawmemchr; __GI___rawmemchr = __rawmemchr_sse2
+# endif
+#endif
+
+#include "../rawmemchr.S"
diff --git a/sysdeps/x86_64/multiarch/wmemchr-avx2.S b/sysdeps/x86_64/multiarch/wmemchr-avx2.S
new file mode 100644
index 0000000..282854f
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemchr-avx2.S
@@ -0,0 +1,4 @@
+#define MEMCHR __wmemchr_avx2
+#define USE_AS_WMEMCHR 1
+
+#include "memchr-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wmemchr.S b/sysdeps/x86_64/multiarch/wmemchr.S
new file mode 100644
index 0000000..b79df23
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemchr.S
@@ -0,0 +1,67 @@
+/* Multiple versions of wmemchr
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+ .text
+ENTRY(__wmemchr)
+ .type __wmemchr, @gnu_indirect_function
+ LOAD_RTLD_GLOBAL_RO_RDX
+ HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+ jnz 1f
+ HAS_ARCH_FEATURE (AVX2_Usable)
+ jz 1f
+ HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+ jz 1f
+ leaq __wmemchr_avx2(%rip), %rax
+ ret
+
+1: leaq __wmemchr_sse2(%rip), %rax
+ ret
+END(__wmemchr)
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __wmemchr_sse2, @function; \
+ .p2align 4; \
+ .globl __wmemchr_sse2; \
+ .hidden __wmemchr_sse2; \
+ __wmemchr_sse2: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __wmemchr_sse2, .-__wmemchr_sse2
+
+# ifdef SHARED
+/* It doesn't make sense to send libc-internal wmemchr calls through a PLT.
+ The speedup we get from using AVX2 instructions is likely eaten away
+ by the indirect call in the PLT. */
+# undef libc_hidden_def
+# define libc_hidden_def(name) \
+ .globl __GI___wmemchr; __GI___wmemchr = __wmemchr_sse2
+# undef libc_hidden_weak
+# define libc_hidden_weak(name) \
+ .weak __GI_wmemchr; __GI_wmemchr = __wmemchr_sse2
+# endif
+#endif
+
+#include "../wmemchr.S"
diff --git a/sysdeps/x86_64/wmemchr.S b/sysdeps/x86_64/wmemchr.S
new file mode 100644
index 0000000..9d8079b
--- /dev/null
+++ b/sysdeps/x86_64/wmemchr.S
@@ -0,0 +1,3 @@
+#define USE_AS_WMEMCHR 1
+
+#include "memchr.S"
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=33a455e5a983fc7ef5187f8a64ee467426fc892d
commit 33a455e5a983fc7ef5187f8a64ee467426fc892d
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Tue Jul 19 09:44:05 2016 -0700
x86-64: Optimize memcmp/wmemcmp with AVX2
Optimize x86-64 memcmp/wmemcmp with AVX2. It uses vector compare as
much as possible. It is as fast as SSE4 memcmp for size <= 16 bytes
and up to 2X faster for size > 16 bytes on Haswell and Skylake. Select
AVX2 memcmp/wmemcmp on AVX2 machines where vzeroupper is preferred and
AVX unaligned load is fast.
NB: It uses TZCNT instead of BSF since TZCNT produces the same result
as BSF for non-zero input. TZCNT is faster than BSF and is executed
as BSF if the machine doesn't support TZCNT.
Key features:
1. Use overlapping compare to avoid branch (see the sketch after this list).
2. Use vector compare when size >= 4 bytes for memcmp or size >= 8
bytes for wmemcmp.
3. If size is 8 * VEC_SIZE or less, unroll the loop.
4. Compare 4 * VEC_SIZE at a time with the aligned first memory area.
5. Use 2 vector compares when size is 2 * VEC_SIZE or less.
6. Use 4 vector compares when size is 4 * VEC_SIZE or less.
7. Use 8 vector compares when size is 8 * VEC_SIZE or less.
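A scalar sketch of key feature 1, assuming 8 <= n <= 16 and checking
equality only (equal_8_to_16 is illustrative; the real code does the
same with 16- and 32-byte vectors and then locates the differing byte
with tzcnt):
#include <string.h>
#include <stddef.h>
/* Compare the first and last 8 bytes of each buffer.  When n < 16
   the two loads overlap, so every byte in [0, n) is covered without
   any branch on the exact size.  */
static int
equal_8_to_16 (const void *a, const void *b, size_t n)
{
  unsigned long long a0, a1, b0, b1;
  memcpy (&a0, a, 8);
  memcpy (&b0, b, 8);
  memcpy (&a1, (const char *) a + n - 8, 8);
  memcpy (&b1, (const char *) b + n - 8, 8);
  return a0 == b0 && a1 == b1;
}
This is the shape of the "leaq -VEC_SIZE(%rdi, %rdx)" sequences in the
diff below: back up from the end by one vector and recompare, accepting
that some bytes are checked twice.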
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
memcmp-avx2 and wmemcmp-avx2.
* sysdeps/x86_64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Test __memcmp_avx2 and __wmemcmp_avx2.
* sysdeps/x86_64/multiarch/memcmp-avx2.S: New file.
* sysdeps/x86_64/multiarch/wmemcmp-avx2.S: Likewise.
* sysdeps/x86_64/multiarch/memcmp.S: Use __memcmp_avx2 on AVX2
machines if AVX unaligned load is fast and vzeroupper is
preferred.
* sysdeps/x86_64/multiarch/wmemcmp.S: Use __wmemcmp_avx2 on AVX2
machines if AVX unaligned load is fast and vzeroupper is
preferred.
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 3736f54..a62def3 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -6,6 +6,7 @@ ifeq ($(subdir),string)
sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
strcmp-sse2-unaligned strncmp-ssse3 \
+ memcmp-avx2 \
memcmp-sse4 memcpy-ssse3 \
memmove-ssse3 \
memcpy-ssse3-back \
@@ -30,5 +31,7 @@ CFLAGS-strspn-c.c += -msse4
endif
ifeq ($(subdir),wcsmbs)
-sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c wcscpy-ssse3 wcscpy-c
+sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
+ wmemcmp-avx2 \
+ wcscpy-ssse3 wcscpy-c
endif
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index a91d2f9..35f1960 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -40,6 +40,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/memcmp.S. */
IFUNC_IMPL (i, name, memcmp,
+ IFUNC_IMPL_ADD (array, i, memcmp,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __memcmp_avx2)
IFUNC_IMPL_ADD (array, i, memcmp, HAS_CPU_FEATURE (SSE4_1),
__memcmp_sse4_1)
IFUNC_IMPL_ADD (array, i, memcmp, HAS_CPU_FEATURE (SSSE3),
@@ -294,6 +297,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/wmemcmp.S. */
IFUNC_IMPL (i, name, wmemcmp,
+ IFUNC_IMPL_ADD (array, i, wmemcmp,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __wmemcmp_avx2)
IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_CPU_FEATURE (SSE4_1),
__wmemcmp_sse4_1)
IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_CPU_FEATURE (SSSE3),
diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2.S b/sysdeps/x86_64/multiarch/memcmp-avx2.S
new file mode 100644
index 0000000..eb3a4cb
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcmp-avx2.S
@@ -0,0 +1,430 @@
+/* memcmp/wmemcmp optimized with AVX2.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+/* memcmp/wmemcmp is implemented as:
+ 1. Use overlapping compare to avoid branch.
+ 2. Use vector compare when size >= 4 bytes for memcmp or size >= 8
+ bytes for wmemcmp.
+ 3. If size is 8 * VEC_SIZE or less, unroll the loop.
+ 4. Compare 4 * VEC_SIZE at a time with the aligned first memory
+ area.
+ 5. Use 2 vector compares when size is 2 * VEC_SIZE or less.
+ 6. Use 4 vector compares when size is 4 * VEC_SIZE or less.
+ 7. Use 8 vector compares when size is 8 * VEC_SIZE or less. */
+
+# include <sysdep.h>
+
+# ifndef MEMCMP
+# define MEMCMP __memcmp_avx2
+# endif
+
+# ifdef USE_AS_WMEMCMP
+# define VPCMPEQ vpcmpeqd
+# else
+# define VPCMPEQ vpcmpeqb
+# endif
+
+# ifndef VZEROUPPER
+# define VZEROUPPER vzeroupper
+# endif
+
+# define VEC_SIZE 32
+# define VEC_MASK ((1 << VEC_SIZE) - 1)
+
+/* Warning!
+ wmemcmp has to use SIGNED comparison for elements.
+ memcmp has to use UNSIGNED comparison for elements.
+*/
+
+ .section .text.avx,"ax",@progbits
+ENTRY (MEMCMP)
+# ifdef USE_AS_WMEMCMP
+ shl $2, %rdx
+# endif
+ cmpq $VEC_SIZE, %rdx
+ jb L(less_vec)
+ cmpq $(VEC_SIZE * 2), %rdx
+ ja L(more_2x_vec)
+
+L(last_2x_vec):
+ /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */
+ vmovdqu (%rsi), %ymm2
+ VPCMPEQ (%rdi), %ymm2, %ymm2
+ vpmovmskb %ymm2, %eax
+ subl $VEC_MASK, %eax
+ jnz L(first_vec)
+
+L(last_vec):
+ leaq -VEC_SIZE(%rdi, %rdx), %rdi
+ leaq -VEC_SIZE(%rsi, %rdx), %rsi
+ vmovdqu (%rsi), %ymm2
+ VPCMPEQ (%rdi), %ymm2, %ymm2
+ vpmovmskb %ymm2, %eax
+ subl $VEC_MASK, %eax
+ jnz L(first_vec)
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(first_vec):
+ /* A byte or int32 is different within 16 or 32 bytes. */
+ tzcntl %eax, %ecx
+# ifdef USE_AS_WMEMCMP
+ xorl %eax, %eax
+ movl (%rdi, %rcx), %edx
+ cmpl (%rsi, %rcx), %edx
+L(wmemcmp_return):
+ setl %al
+ negl %eax
+ orl $1, %eax
+# else
+ movzbl (%rdi, %rcx), %eax
+ movzbl (%rsi, %rcx), %edx
+ sub %edx, %eax
+# endif
+ VZEROUPPER
+ ret
+
+# ifdef USE_AS_WMEMCMP
+ .p2align 4
+L(4):
+ xorl %eax, %eax
+ movl (%rdi), %edx
+ cmpl (%rsi), %edx
+ jne L(wmemcmp_return)
+ ret
+# else
+ .p2align 4
+L(between_4_7):
+ vmovd (%rdi), %xmm1
+ vmovd (%rsi), %xmm2
+ VPCMPEQ %xmm1, %xmm2, %xmm2
+ vpmovmskb %xmm2, %eax
+ subl $0xffff, %eax
+ jnz L(first_vec)
+ leaq -4(%rdi, %rdx), %rdi
+ leaq -4(%rsi, %rdx), %rsi
+ vmovd (%rdi), %xmm1
+ vmovd (%rsi), %xmm2
+ VPCMPEQ %xmm1, %xmm2, %xmm2
+ vpmovmskb %xmm2, %eax
+ subl $0xffff, %eax
+ jnz L(first_vec)
+ ret
+
+ .p2align 4
+L(between_2_3):
+ /* Load 2 bytes into registers. */
+ movzwl (%rdi), %eax
+ movzwl (%rsi), %ecx
+ /* Compare the lowest byte. */
+ cmpb %cl, %al
+ jne L(1byte_reg)
+ /* Load the difference of 2 bytes into EAX. */
+ subl %ecx, %eax
+ /* Return if 2 bytes differ. */
+ jnz L(exit)
+ cmpb $2, %dl
+ /* Return if these are the last 2 bytes. */
+ je L(exit)
+ movzbl 2(%rdi), %eax
+ movzbl 2(%rsi), %ecx
+ subl %ecx, %eax
+ ret
+
+ .p2align 4
+L(exit):
+ ret
+
+ .p2align 4
+L(1byte_reg):
+ movzbl %al, %eax
+ movzbl %cl, %ecx
+ sub %ecx, %eax
+ ret
+
+ .p2align 4
+L(1):
+ movzbl (%rdi), %eax
+ movzbl (%rsi), %ecx
+ subl %ecx, %eax
+ ret
+# endif
+
+ .p2align 4
+L(zero):
+ xorl %eax, %eax
+ ret
+
+ .p2align 4
+L(less_vec):
+# ifdef USE_AS_WMEMCMP
+ /* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes. */
+ cmpb $4, %dl
+ je L(4)
+ jb L(zero)
+# else
+ cmpb $1, %dl
+ je L(1)
+ jb L(zero)
+ cmpb $4, %dl
+ jb L(between_2_3)
+ cmpb $8, %dl
+ jb L(between_4_7)
+# endif
+ cmpb $16, %dl
+ jae L(between_16_31)
+ /* It is between 8 and 15 bytes. */
+ vmovq (%rdi), %xmm1
+ vmovq (%rsi), %xmm2
+ VPCMPEQ %xmm1, %xmm2, %xmm2
+ vpmovmskb %xmm2, %eax
+ subl $0xffff, %eax
+ jnz L(first_vec)
+ leaq -8(%rdi, %rdx), %rdi
+ leaq -8(%rsi, %rdx), %rsi
+ vmovq (%rdi), %xmm1
+ vmovq (%rsi), %xmm2
+ VPCMPEQ %xmm1, %xmm2, %xmm2
+ vpmovmskb %xmm2, %eax
+ subl $0xffff, %eax
+ jnz L(first_vec)
+ ret
+
+ .p2align 4
+L(between_16_31):
+ /* From 16 to 31 bytes. No branch when size == 16. */
+ vmovdqu (%rsi), %xmm2
+ VPCMPEQ (%rdi), %xmm2, %xmm2
+ vpmovmskb %xmm2, %eax
+ subl $0xffff, %eax
+ jnz L(first_vec)
+
+ leaq -16(%rdi, %rdx), %rdi
+ leaq -16(%rsi, %rdx), %rsi
+ vmovdqu (%rsi), %xmm2
+ VPCMPEQ (%rdi), %xmm2, %xmm2
+ vpmovmskb %xmm2, %eax
+ subl $0xffff, %eax
+ jnz L(first_vec)
+ ret
+
+ .p2align 4
+L(more_2x_vec):
+ /* More than 2 * VEC. */
+ cmpq $(VEC_SIZE * 8), %rdx
+ ja L(more_8x_vec)
+ cmpq $(VEC_SIZE * 4), %rdx
+ jb L(last_4x_vec)
+
+ /* From 4 * VEC to 8 * VEC, inclusively. */
+ vmovdqu (%rsi), %ymm1
+ VPCMPEQ (%rdi), %ymm1, %ymm1
+
+ vmovdqu VEC_SIZE(%rsi), %ymm2
+ VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
+
+ vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
+
+ vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
+
+ vpand %ymm1, %ymm2, %ymm5
+ vpand %ymm3, %ymm4, %ymm6
+ vpand %ymm5, %ymm6, %ymm5
+
+ vpmovmskb %ymm5, %eax
+ subl $VEC_MASK, %eax
+ jnz L(4x_vec_end)
+
+ leaq -(4 * VEC_SIZE)(%rdi, %rdx), %rdi
+ leaq -(4 * VEC_SIZE)(%rsi, %rdx), %rsi
+ vmovdqu (%rsi), %ymm1
+ VPCMPEQ (%rdi), %ymm1, %ymm1
+
+ vmovdqu VEC_SIZE(%rsi), %ymm2
+ VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
+ vpand %ymm2, %ymm1, %ymm5
+
+ vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
+ vpand %ymm3, %ymm5, %ymm5
+
+ vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
+ vpand %ymm4, %ymm5, %ymm5
+
+ vpmovmskb %ymm5, %eax
+ subl $VEC_MASK, %eax
+ jnz L(4x_vec_end)
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(more_8x_vec):
+ /* More than 8 * VEC. Check the first VEC. */
+ vmovdqu (%rsi), %ymm2
+ VPCMPEQ (%rdi), %ymm2, %ymm2
+ vpmovmskb %ymm2, %eax
+ subl $VEC_MASK, %eax
+ jnz L(first_vec)
+
+ /* Align the first memory area for aligned loads in the loop.
+ Compute how much the first memory area is misaligned. */
+ movq %rdi, %rcx
+ andl $(VEC_SIZE - 1), %ecx
+ /* Get the negative of offset for alignment. */
+ subq $VEC_SIZE, %rcx
+ /* Adjust the second memory area. */
+ subq %rcx, %rsi
+ /* Adjust the first memory area which should be aligned now. */
+ subq %rcx, %rdi
+ /* Adjust length. */
+ addq %rcx, %rdx
+
+L(loop_4x_vec):
+ /* Compare 4 * VEC at a time forward. */
+ vmovdqu (%rsi), %ymm1
+ VPCMPEQ (%rdi), %ymm1, %ymm1
+
+ vmovdqu VEC_SIZE(%rsi), %ymm2
+ VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
+ vpand %ymm2, %ymm1, %ymm5
+
+ vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
+ vpand %ymm3, %ymm5, %ymm5
+
+ vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
+ vpand %ymm4, %ymm5, %ymm5
+
+ vpmovmskb %ymm5, %eax
+ subl $VEC_MASK, %eax
+ jnz L(4x_vec_end)
+
+ addq $(VEC_SIZE * 4), %rdi
+ addq $(VEC_SIZE * 4), %rsi
+
+ subq $(VEC_SIZE * 4), %rdx
+ cmpq $(VEC_SIZE * 4), %rdx
+ jae L(loop_4x_vec)
+
+ /* Less than 4 * VEC. */
+ cmpq $VEC_SIZE, %rdx
+ jbe L(last_vec)
+ cmpq $(VEC_SIZE * 2), %rdx
+ jbe L(last_2x_vec)
+
+L(last_4x_vec):
+ /* From 2 * VEC to 4 * VEC. */
+ vmovdqu (%rsi), %ymm2
+ VPCMPEQ (%rdi), %ymm2, %ymm2
+ vpmovmskb %ymm2, %eax
+ subl $VEC_MASK, %eax
+ jnz L(first_vec)
+
+ addq $VEC_SIZE, %rdi
+ addq $VEC_SIZE, %rsi
+ vmovdqu (%rsi), %ymm2
+ VPCMPEQ (%rdi), %ymm2, %ymm2
+ vpmovmskb %ymm2, %eax
+ subl $VEC_MASK, %eax
+ jnz L(first_vec)
+
+ leaq -(3 * VEC_SIZE)(%rdi, %rdx), %rdi
+ leaq -(3 * VEC_SIZE)(%rsi, %rdx), %rsi
+ vmovdqu (%rsi), %ymm2
+ VPCMPEQ (%rdi), %ymm2, %ymm2
+ vpmovmskb %ymm2, %eax
+ subl $VEC_MASK, %eax
+ jnz L(first_vec)
+
+ addq $VEC_SIZE, %rdi
+ addq $VEC_SIZE, %rsi
+ vmovdqu (%rsi), %ymm2
+ VPCMPEQ (%rdi), %ymm2, %ymm2
+ vpmovmskb %ymm2, %eax
+ subl $VEC_MASK, %eax
+ jnz L(first_vec)
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(4x_vec_end):
+ vpmovmskb %ymm1, %eax
+ subl $VEC_MASK, %eax
+ jnz L(first_vec)
+ vpmovmskb %ymm2, %eax
+ subl $VEC_MASK, %eax
+ jnz L(first_vec_x1)
+ vpmovmskb %ymm3, %eax
+ subl $VEC_MASK, %eax
+ jnz L(first_vec_x2)
+ vpmovmskb %ymm4, %eax
+ subl $VEC_MASK, %eax
+ tzcntl %eax, %ecx
+# ifdef USE_AS_WMEMCMP
+ xorl %eax, %eax
+ movl (VEC_SIZE * 3)(%rdi, %rcx), %edx
+ cmpl (VEC_SIZE * 3)(%rsi, %rcx), %edx
+ jmp L(wmemcmp_return)
+# else
+ movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
+ movzbl (VEC_SIZE * 3)(%rsi, %rcx), %edx
+ sub %edx, %eax
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(first_vec_x1):
+ tzcntl %eax, %ecx
+# ifdef USE_AS_WMEMCMP
+ xorl %eax, %eax
+ movl VEC_SIZE(%rdi, %rcx), %edx
+ cmpl VEC_SIZE(%rsi, %rcx), %edx
+ jmp L(wmemcmp_return)
+# else
+ movzbl VEC_SIZE(%rdi, %rcx), %eax
+ movzbl VEC_SIZE(%rsi, %rcx), %edx
+ sub %edx, %eax
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(first_vec_x2):
+ tzcntl %eax, %ecx
+# ifdef USE_AS_WMEMCMP
+ xorl %eax, %eax
+ movl (VEC_SIZE * 2)(%rdi, %rcx), %edx
+ cmpl (VEC_SIZE * 2)(%rsi, %rcx), %edx
+ jmp L(wmemcmp_return)
+# else
+ movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
+ movzbl (VEC_SIZE * 2)(%rsi, %rcx), %edx
+ sub %edx, %eax
+# endif
+ VZEROUPPER
+ ret
+END (MEMCMP)
+#endif
diff --git a/sysdeps/x86_64/multiarch/memcmp.S b/sysdeps/x86_64/multiarch/memcmp.S
index 6129820..08acacb 100644
--- a/sysdeps/x86_64/multiarch/memcmp.S
+++ b/sysdeps/x86_64/multiarch/memcmp.S
@@ -27,7 +27,16 @@
ENTRY(memcmp)
.type memcmp, @gnu_indirect_function
LOAD_RTLD_GLOBAL_RO_RDX
- HAS_CPU_FEATURE (SSSE3)
+ HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+ jnz 1f
+ HAS_ARCH_FEATURE (AVX2_Usable)
+ jz 1f
+ HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+ jz 1f
+ leaq __memcmp_avx2(%rip), %rax
+ ret
+
+1: HAS_CPU_FEATURE (SSSE3)
jnz 2f
leaq __memcmp_sse2(%rip), %rax
ret
diff --git a/sysdeps/x86_64/multiarch/wmemcmp-avx2.S b/sysdeps/x86_64/multiarch/wmemcmp-avx2.S
new file mode 100644
index 0000000..aa2190b
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemcmp-avx2.S
@@ -0,0 +1,4 @@
+#define MEMCMP __wmemcmp_avx2
+#define USE_AS_WMEMCMP 1
+
+#include "memcmp-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wmemcmp.S b/sysdeps/x86_64/multiarch/wmemcmp.S
index 5dc54d7..46ee8f5 100644
--- a/sysdeps/x86_64/multiarch/wmemcmp.S
+++ b/sysdeps/x86_64/multiarch/wmemcmp.S
@@ -27,7 +27,16 @@
ENTRY(wmemcmp)
.type wmemcmp, @gnu_indirect_function
LOAD_RTLD_GLOBAL_RO_RDX
- HAS_CPU_FEATURE (SSSE3)
+ HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+ jnz 1f
+ HAS_ARCH_FEATURE (AVX2_Usable)
+ jz 1f
+ HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+ jz 1f
+ leaq __wmemcmp_avx2(%rip), %rax
+ ret
+
+1: HAS_CPU_FEATURE (SSSE3)
jnz 2f
leaq __wmemcmp_sse2(%rip), %rax
ret
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=51597d57a2889eff8bd8661a9c10ecc945ebae9d
commit 51597d57a2889eff8bd8661a9c10ecc945ebae9d
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Sun May 21 09:22:44 2017 -0700
x86-64: Optimize wmemset with SSE2/AVX2/AVX512
The difference between memset and wmemset is byte vs int. Add stubs
to SSE2/AVX2/AVX512 memset for wmemset with updated constant and size:
SSE2 wmemset:
shl $0x2,%rdx
movd %esi,%xmm0
mov %rdi,%rax
pshufd $0x0,%xmm0,%xmm0
jmp entry_from_wmemset
SSE2 memset:
movd %esi,%xmm0
mov %rdi,%rax
punpcklbw %xmm0,%xmm0
punpcklwd %xmm0,%xmm0
pshufd $0x0,%xmm0,%xmm0
entry_from_wmemset:
Since the ERMS versions of wmemset would require "rep stosl" instead of
"rep stosb", only the vector store stubs of SSE2/AVX2/AVX512 wmemset
are added. The SSE2 wmemset is about 3X faster and the AVX2 wmemset
is about 6X faster on Haswell.
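A C rendering of the stub idea, as a sketch only (wmemset_sketch is
illustrative, not the glibc entry point): scale the element count to
bytes, broadcast a 4-byte pattern instead of a 1-byte one, then run
the same store loop a byte memset would use:
#include <immintrin.h>
#include <stddef.h>
#include <wchar.h>
wchar_t *
wmemset_sketch (wchar_t *s, wchar_t c, size_t n)
{
  size_t bytes = n * sizeof (wchar_t);        /* shl $0x2, %rdx */
  __m128i pattern = _mm_set1_epi32 ((int) c); /* movd + pshufd $0x0 */
  size_t i = 0;
  /* From here on the code is identical to a byte memset body.  */
  for (; i + 16 <= bytes; i += 16)
    _mm_storeu_si128 ((__m128i *) ((char *) s + i), pattern);
  for (; i < bytes; i += sizeof (wchar_t))    /* whole-element tail */
    s[i / sizeof (wchar_t)] = c;
  return s;
}
The real stubs avoid duplicating the loop by jumping into the shared
memset body at entry_from_wmemset, which is why only the vector store
paths, and not the "rep stosb" ERMS paths, gain a wmemset entry.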
* include/wchar.h (__wmemset_chk): New.
* sysdeps/x86_64/memset.S (VDUP_TO_VEC0_AND_SET_RETURN): Renamed
to MEMSET_VDUP_TO_VEC0_AND_SET_RETURN.
(WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN): New.
(WMEMSET_CHK_SYMBOL): Likewise.
(WMEMSET_SYMBOL): Likewise.
(__wmemset): Add hidden definition.
(wmemset): Add weak hidden definition.
* sysdeps/x86_64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Add __wmemset_sse2_unaligned,
__wmemset_avx2_unaligned, __wmemset_avx512_unaligned,
__wmemset_chk_sse2_unaligned, __wmemset_chk_avx2_unaligned
and __wmemset_chk_avx512_unaligned.
* sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
(VDUP_TO_VEC0_AND_SET_RETURN): Renamed to ...
(MEMSET_VDUP_TO_VEC0_AND_SET_RETURN): This.
(WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN): New.
(WMEMSET_SYMBOL): Likewise.
* sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
(VDUP_TO_VEC0_AND_SET_RETURN): Renamed to ...
(MEMSET_VDUP_TO_VEC0_AND_SET_RETURN): This.
(WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN): New.
(WMEMSET_SYMBOL): Likewise.
* sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S: Updated.
(WMEMSET_CHK_SYMBOL): New.
(WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned)): Likewise.
(WMEMSET_SYMBOL (__wmemset, unaligned)): Likewise.
* sysdeps/x86_64/multiarch/memset.S (WMEMSET_SYMBOL): New.
(libc_hidden_builtin_def): Also define __GI_wmemset and
__GI___wmemset.
(weak_alias): New.
* sysdeps/x86_64/multiarch/wmemset.S: New file.
* sysdeps/x86_64/multiarch/wmemset_chk.S: Likewise.
* sysdeps/x86_64/wmemset.S: Likewise.
* sysdeps/x86_64/wmemset_chk.S: Likewise.
diff --git a/include/wchar.h b/include/wchar.h
index e2579a1..a773d56 100644
--- a/include/wchar.h
+++ b/include/wchar.h
@@ -157,6 +157,9 @@ extern wchar_t *__wmemmove (wchar_t *__s1, const wchar_t *__s2,
extern wchar_t *__wcschrnul (const wchar_t *__s, wchar_t __wc)
__attribute_pure__;
+extern wchar_t *__wmemset_chk (wchar_t *__s, wchar_t __c, size_t __n,
+ size_t __ns) __THROW;
+
extern int __vfwscanf (__FILE *__restrict __s,
const wchar_t *__restrict __format,
__gnuc_va_list __arg)
diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
index 69ed509..4127878 100644
--- a/sysdeps/x86_64/memset.S
+++ b/sysdeps/x86_64/memset.S
@@ -26,13 +26,18 @@
#define VMOVU movdqu
#define VMOVA movdqa
-#define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+#define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
movd d, %xmm0; \
movq r, %rax; \
punpcklbw %xmm0, %xmm0; \
punpcklwd %xmm0, %xmm0; \
pshufd $0, %xmm0, %xmm0
+#define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+ movd d, %xmm0; \
+ movq r, %rax; \
+ pshufd $0, %xmm0, %xmm0
+
#define SECTION(p) p
#ifndef MEMSET_SYMBOL
@@ -40,10 +45,21 @@
# define MEMSET_SYMBOL(p,s) memset
#endif
+#ifndef WMEMSET_SYMBOL
+# define WMEMSET_CHK_SYMBOL(p,s) p
+# define WMEMSET_SYMBOL(p,s) __wmemset
+#endif
+
#include "multiarch/memset-vec-unaligned-erms.S"
libc_hidden_builtin_def (memset)
+#if IS_IN (libc)
+libc_hidden_def (__wmemset)
+weak_alias (__wmemset, wmemset)
+libc_hidden_weak (wmemset)
+#endif
+
#if defined SHARED && IS_IN (libc) && !defined USE_MULTIARCH
strong_alias (__memset_chk, __memset_zero_constant_len_parameter)
.section .gnu.warning.__memset_zero_constant_len_parameter
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 06d9a9d..a91d2f9 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -300,6 +300,17 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__wmemcmp_ssse3)
IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2))
+ /* Support sysdeps/x86_64/multiarch/wmemset.S. */
+ IFUNC_IMPL (i, name, wmemset,
+ IFUNC_IMPL_ADD (array, i, wmemset, 1,
+ __wmemset_sse2_unaligned)
+ IFUNC_IMPL_ADD (array, i, wmemset,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __wmemset_avx2_unaligned)
+ IFUNC_IMPL_ADD (array, i, wmemset,
+ HAS_ARCH_FEATURE (AVX512F_Usable),
+ __wmemset_avx512_unaligned))
+
#ifdef SHARED
/* Support sysdeps/x86_64/multiarch/memcpy_chk.S. */
IFUNC_IMPL (i, name, __memcpy_chk,
@@ -417,6 +428,17 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSSE3),
__strncmp_ssse3)
IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_sse2))
+
+ /* Support sysdeps/x86_64/multiarch/wmemset_chk.S. */
+ IFUNC_IMPL (i, name, __wmemset_chk,
+ IFUNC_IMPL_ADD (array, i, __wmemset_chk, 1,
+ __wmemset_chk_sse2_unaligned)
+ IFUNC_IMPL_ADD (array, i, __wmemset_chk,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __wmemset_chk_avx2_unaligned)
+ IFUNC_IMPL_ADD (array, i, __wmemset_chk,
+ HAS_ARCH_FEATURE (AVX512F_Usable),
+ __wmemset_chk_avx512_unaligned))
#endif
return i;
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
index 79975e0..7ab3d89 100644
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
@@ -4,13 +4,19 @@
# define VMOVU vmovdqu
# define VMOVA vmovdqa
-# define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
vmovd d, %xmm0; \
movq r, %rax; \
vpbroadcastb %xmm0, %ymm0
+# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+ vmovd d, %xmm0; \
+ movq r, %rax; \
+ vpbroadcastd %xmm0, %ymm0
+
# define SECTION(p) p##.avx
# define MEMSET_SYMBOL(p,s) p##_avx2_##s
+# define WMEMSET_SYMBOL(p,s) p##_avx2_##s
# include "memset-vec-unaligned-erms.S"
#endif
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
index a5ec349..0783979 100644
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
@@ -4,14 +4,21 @@
# define VMOVU vmovdqu64
# define VMOVA vmovdqa64
-# define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
vmovd d, %xmm0; \
movq r, %rax; \
vpbroadcastb %xmm0, %xmm0; \
vpbroadcastq %xmm0, %zmm0
+# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+ vmovd d, %xmm0; \
+ movq r, %rax; \
+ vpbroadcastd %xmm0, %xmm0; \
+ vpbroadcastq %xmm0, %zmm0
+
# define SECTION(p) p##.avx512
# define MEMSET_SYMBOL(p,s) p##_avx512_##s
+# define WMEMSET_SYMBOL(p,s) p##_avx512_##s
# include "memset-vec-unaligned-erms.S"
#endif
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index 704eed9..2eb9e37 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -30,6 +30,10 @@
# define MEMSET_CHK_SYMBOL(p,s) MEMSET_SYMBOL(p, s)
#endif
+#ifndef WMEMSET_CHK_SYMBOL
+# define WMEMSET_CHK_SYMBOL(p,s) WMEMSET_SYMBOL(p, s)
+#endif
+
#ifndef VZEROUPPER
# if VEC_SIZE > 16
# define VZEROUPPER vzeroupper
@@ -79,6 +83,21 @@ END (__bzero)
weak_alias (__bzero, bzero)
#endif
+#if IS_IN (libc)
+# if defined SHARED
+ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
+# endif
+
+ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
+ shlq $2, %rdx
+ WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+ jmp L(entry_from_bzero)
+END (WMEMSET_SYMBOL (__wmemset, unaligned))
+#endif
+
#if defined SHARED && IS_IN (libc)
ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
cmpq %rdx, %rcx
@@ -87,8 +106,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
#endif
ENTRY (MEMSET_SYMBOL (__memset, unaligned))
-L(memset_entry):
- VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+ MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
L(entry_from_bzero):
cmpq $VEC_SIZE, %rdx
jb L(less_vec)
@@ -132,7 +150,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
# endif
ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
- VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+ MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
cmpq $VEC_SIZE, %rdx
jb L(less_vec)
cmpq $(VEC_SIZE * 2), %rdx
diff --git a/sysdeps/x86_64/multiarch/memset.S b/sysdeps/x86_64/multiarch/memset.S
index 9d33118..11f2737 100644
--- a/sysdeps/x86_64/multiarch/memset.S
+++ b/sysdeps/x86_64/multiarch/memset.S
@@ -58,16 +58,23 @@ END(memset)
#if IS_IN (libc)
# define MEMSET_SYMBOL(p,s) p##_sse2_##s
+# define WMEMSET_SYMBOL(p,s) p##_sse2_##s
# ifdef SHARED
-# undef libc_hidden_builtin_def
+# undef libc_hidden_builtin_def
/* It doesn't make sense to send libc-internal memset calls through a PLT.
The speedup we get from using SSE2 instructions is likely eaten away
by the indirect call in the PLT. */
-# define libc_hidden_builtin_def(name) \
- .globl __GI_memset; __GI_memset = __memset_sse2_unaligned
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_memset; __GI_memset = __memset_sse2_unaligned; \
+ .globl __GI_wmemset; __GI_wmemset = __wmemset_sse2_unaligned; \
+ .globl __GI___wmemset; __GI___wmemset = __wmemset_sse2_unaligned
# endif
+# undef weak_alias
+# define weak_alias(original, alias) \
+ .weak bzero; bzero = __bzero
+
# undef strong_alias
# define strong_alias(original, alias)
#endif
diff --git a/sysdeps/x86_64/multiarch/wmemset.S b/sysdeps/x86_64/multiarch/wmemset.S
new file mode 100644
index 0000000..3bd7ca2
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemset.S
@@ -0,0 +1,47 @@
+/* Multiple versions of wmemset
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+
+#include <sysdep.h>
+#include <shlib-compat.h>
+#include <init-arch.h>
+
+ENTRY(__wmemset)
+ .type __wmemset, @gnu_indirect_function
+ LOAD_RTLD_GLOBAL_RO_RDX
+ lea __wmemset_sse2_unaligned(%rip), %RAX_LP
+ HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+ jnz 1f
+ HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+ jz 1f
+ HAS_ARCH_FEATURE (AVX2_Usable)
+ jz 1f
+ lea __wmemset_avx2_unaligned(%rip), %RAX_LP
+ HAS_ARCH_FEATURE (Prefer_No_AVX512)
+ jnz 1f
+ HAS_ARCH_FEATURE (AVX512F_Usable)
+ jz 1f
+ lea __wmemset_avx512_unaligned(%rip), %RAX_LP
+1: ret
+END(__wmemset)
+
+weak_alias (__wmemset, wmemset)
+#endif
diff --git a/sysdeps/x86_64/multiarch/wmemset_chk.S b/sysdeps/x86_64/multiarch/wmemset_chk.S
new file mode 100644
index 0000000..c76fcb1
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemset_chk.S
@@ -0,0 +1,46 @@
+/* Multiple versions of wmemset_chk
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <shlib-compat.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+# ifdef SHARED
+ENTRY(__wmemset_chk)
+ .type __wmemset_chk, @gnu_indirect_function
+ LOAD_RTLD_GLOBAL_RO_RDX
+ lea __wmemset_chk_sse2_unaligned(%rip), %RAX_LP
+ HAS_ARCH_FEATURE (AVX2_Usable)
+ jz 1f
+ lea __wmemset_chk_avx2_unaligned(%rip), %RAX_LP
+ HAS_ARCH_FEATURE (Prefer_No_AVX512)
+ jnz 1f
+ HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+ jnz 1f
+ HAS_ARCH_FEATURE (AVX512F_Usable)
+ jz 1f
+ lea __wmemset_chk_avx512_unaligned(%rip), %RAX_LP
+1: ret
+END(__wmemset_chk)
+# else
+# include "../wmemset_chk.S"
+# endif
+#endif
diff --git a/sysdeps/x86_64/wmemset.S b/sysdeps/x86_64/wmemset.S
new file mode 100644
index 0000000..f96d567
--- /dev/null
+++ b/sysdeps/x86_64/wmemset.S
@@ -0,0 +1 @@
+/* Implemented in memset.S. */
diff --git a/sysdeps/x86_64/wmemset_chk.S b/sysdeps/x86_64/wmemset_chk.S
new file mode 100644
index 0000000..64c2774
--- /dev/null
+++ b/sysdeps/x86_64/wmemset_chk.S
@@ -0,0 +1,33 @@
+/* Checking wmemset for x86-64.
+ Copyright (C) 2004-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#ifndef SHARED
+ /* For libc.so this is defined in wmemset.S.
+ For libc.a, this is a separate source to avoid
+ wmemset bringing in __chk_fail and all routines
+ it calls. */
+ .text
+ENTRY (__wmemset_chk)
+ cmpq %rdx, %rcx
+ jb __chk_fail
+ jmp wmemset
+END (__wmemset_chk)
+#endif
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=f37a2a4d0467286c711231baf633aaf977b752d0
commit f37a2a4d0467286c711231baf633aaf977b752d0
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Sat May 20 06:48:04 2017 -0700
x86-64: Update strlen.S to support wcslen/wcsnlen
The difference between strlen and wcslen is that strlen scans bytes while
wcslen scans ints (wide characters). We can replace pminub and pcmpeqb with
pminud and pcmpeqd to turn strlen into wcslen.
* sysdeps/x86_64/strlen.S (PMINU): New.
(PCMPEQ): Likewise.
(SHIFT_RETURN): Likewise.
(FIND_ZERO): Replace pcmpeqb with PCMPEQ.
(strlen): Add SHIFT_RETURN before ret. Replace pcmpeqb and
pminub with PCMPEQ and PMINU.
* sysdeps/x86_64/wcsnlen.S: New file.
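The swap this commit describes is easy to see with SSE2 intrinsics. The following
is a minimal, hypothetical C sketch (not the glibc code; the function names are
invented) of one 16-byte probe: the only differences between the byte and
wide-character versions are the compare width (_mm_cmpeq_epi8 vs _mm_cmpeq_epi32,
i.e. pcmpeqb vs pcmpeqd) and the final shift by 2 that SHIFT_RETURN performs to
turn a byte index into a wide-character count.

/* Hypothetical sketch: one 16-byte probe of strlen vs wcslen.  Assumes the
   terminator lies within the first 16 bytes / 4 wide chars, and that
   wchar_t is 4 bytes, as in glibc.  */
#include <emmintrin.h>
#include <stdio.h>
#include <wchar.h>

static size_t
first16_strlen (const char *s)
{
  __m128i v = _mm_loadu_si128 ((const __m128i *) s);
  /* pcmpeqb against zero, then take the byte index of the first match.  */
  int mask = _mm_movemask_epi8 (_mm_cmpeq_epi8 (v, _mm_setzero_si128 ()));
  return (size_t) __builtin_ctz (mask);
}

static size_t
first4_wcslen (const wchar_t *s)
{
  __m128i v = _mm_loadu_si128 ((const __m128i *) s);
  /* pcmpeqd against zero; the byte index is shifted right by 2 to become
     a wide-character index, exactly what SHIFT_RETURN (shrq $2) does.  */
  int mask = _mm_movemask_epi8 (_mm_cmpeq_epi32 (v, _mm_setzero_si128 ()));
  return (size_t) __builtin_ctz (mask) >> 2;
}

int
main (void)
{
  char buf[16] = "hello";
  wchar_t wbuf[4] = L"abc";
  printf ("%zu %zu\n", first16_strlen (buf), first4_wcslen (wbuf)); /* 5 3 */
  return 0;
}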
diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
index 5896e6b..b5ab117 100644
--- a/sysdeps/x86_64/strlen.S
+++ b/sysdeps/x86_64/strlen.S
@@ -1,4 +1,4 @@
-/* SSE2 version of strlen.
+/* SSE2 version of strlen/wcslen.
Copyright (C) 2012-2017 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -18,6 +18,16 @@
#include <sysdep.h>
+#ifdef AS_WCSLEN
+# define PMINU pminud
+# define PCMPEQ pcmpeqd
+# define SHIFT_RETURN shrq $2, %rax
+#else
+# define PMINU pminub
+# define PCMPEQ pcmpeqb
+# define SHIFT_RETURN
+#endif
+
/* Long lived register in strlen(s), strnlen(s, n) are:
%xmm3 - zero
@@ -32,10 +42,10 @@ ENTRY(strlen)
/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */
#define FIND_ZERO \
- pcmpeqb (%rax), %xmm0; \
- pcmpeqb 16(%rax), %xmm1; \
- pcmpeqb 32(%rax), %xmm2; \
- pcmpeqb 48(%rax), %xmm3; \
+ PCMPEQ (%rax), %xmm0; \
+ PCMPEQ 16(%rax), %xmm1; \
+ PCMPEQ 32(%rax), %xmm2; \
+ PCMPEQ 48(%rax), %xmm3; \
pmovmskb %xmm0, %esi; \
pmovmskb %xmm1, %edx; \
pmovmskb %xmm2, %r8d; \
@@ -54,6 +64,9 @@ ENTRY(strlen)
xor %rax, %rax
ret
L(n_nonzero):
+# ifdef AS_WCSLEN
+ shlq $2, %rsi
+# endif
/* Initialize long lived registers. */
@@ -96,6 +109,7 @@ L(n_nonzero):
test %rdx, %rdx; \
je L(lab); \
bsfq %rdx, %rax; \
+ SHIFT_RETURN; \
ret
#ifdef AS_STRNLEN
@@ -104,19 +118,20 @@ L(n_nonzero):
#else
/* Test first 16 bytes unaligned. */
movdqu (%rax), %xmm4
- pcmpeqb %xmm0, %xmm4
+ PCMPEQ %xmm0, %xmm4
pmovmskb %xmm4, %edx
test %edx, %edx
je L(next48_bytes)
bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */
+ SHIFT_RETURN
ret
L(next48_bytes):
/* Same as FIND_ZERO except we do not check first 16 bytes. */
andq $-16, %rax
- pcmpeqb 16(%rax), %xmm1
- pcmpeqb 32(%rax), %xmm2
- pcmpeqb 48(%rax), %xmm3
+ PCMPEQ 16(%rax), %xmm1
+ PCMPEQ 32(%rax), %xmm2
+ PCMPEQ 48(%rax), %xmm3
pmovmskb %xmm1, %edx
pmovmskb %xmm2, %r8d
pmovmskb %xmm3, %ecx
@@ -145,6 +160,7 @@ L(strnlen_ret):
test %rdx, %rdx
je L(loop_init)
bsfq %rdx, %rax
+ SHIFT_RETURN
ret
#endif
.p2align 4
@@ -161,10 +177,10 @@ L(loop):
je L(exit_end)
movdqa (%rax), %xmm0
- pminub 16(%rax), %xmm0
- pminub 32(%rax), %xmm0
- pminub 48(%rax), %xmm0
- pcmpeqb %xmm3, %xmm0
+ PMINU 16(%rax), %xmm0
+ PMINU 32(%rax), %xmm0
+ PMINU 48(%rax), %xmm0
+ PCMPEQ %xmm3, %xmm0
pmovmskb %xmm0, %edx
testl %edx, %edx
jne L(exit)
@@ -182,6 +198,7 @@ L(first):
bsfq %rdx, %rdx
addq %rdx, %rax
subq %rdi, %rax
+ SHIFT_RETURN
ret
.p2align 4
@@ -192,6 +209,7 @@ L(exit):
bsfq %rdx, %rdx
addq %rdx, %rax
subq %rdi, %rax
+ SHIFT_RETURN
ret
#else
@@ -201,10 +219,10 @@ L(exit):
L(loop):
movdqa 64(%rax), %xmm0
- pminub 80(%rax), %xmm0
- pminub 96(%rax), %xmm0
- pminub 112(%rax), %xmm0
- pcmpeqb %xmm3, %xmm0
+ PMINU 80(%rax), %xmm0
+ PMINU 96(%rax), %xmm0
+ PMINU 112(%rax), %xmm0
+ PCMPEQ %xmm3, %xmm0
pmovmskb %xmm0, %edx
testl %edx, %edx
jne L(exit64)
@@ -212,10 +230,10 @@ L(loop):
subq $-128, %rax
movdqa (%rax), %xmm0
- pminub 16(%rax), %xmm0
- pminub 32(%rax), %xmm0
- pminub 48(%rax), %xmm0
- pcmpeqb %xmm3, %xmm0
+ PMINU 16(%rax), %xmm0
+ PMINU 32(%rax), %xmm0
+ PMINU 48(%rax), %xmm0
+ PCMPEQ %xmm3, %xmm0
pmovmskb %xmm0, %edx
testl %edx, %edx
jne L(exit0)
@@ -231,6 +249,7 @@ L(exit0):
bsfq %rdx, %rdx
addq %rdx, %rax
subq %rdi, %rax
+ SHIFT_RETURN
ret
#endif
diff --git a/sysdeps/x86_64/wcsnlen.S b/sysdeps/x86_64/wcsnlen.S
new file mode 100644
index 0000000..968bb69
--- /dev/null
+++ b/sysdeps/x86_64/wcsnlen.S
@@ -0,0 +1,7 @@
+#define AS_WCSLEN
+#define AS_STRNLEN
+#define strlen __wcsnlen
+
+#include "strlen.S"
+
+weak_alias(__wcsnlen, wcsnlen)
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=6c6a107dada6f6026a8cbd5f57a4f1d2f9ab87bc
commit 6c6a107dada6f6026a8cbd5f57a4f1d2f9ab87bc
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Tue May 23 08:43:52 2017 -0700
x86_64: Remove redundant REX bytes from memrchr.S
By the x86-64 specification, writes to 32-bit destination registers are
zero-extended to 64 bits, so there is no need to use 64-bit registers when
only the lower 32 bits are non-zero. Also, 2 of the instructions in:
mov %rdi, %rcx
and $15, %rcx
jz L(length_less16_offset0)
mov %rdi, %rcx <<< redundant
and $15, %rcx <<< redundant
are redundant.
* sysdeps/x86_64/memrchr.S (__memrchr): Use 32-bit registers for
the lower 32 bits. Remove redundant instructions.
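The zero-extension rule this commit relies on can be demonstrated with a small,
hypothetical GCC inline-asm check (not part of the patch): writing the 32-bit
name of a register clears its upper 32 bits, which is why mov %edi, %ecx and
friends in the diff below are safe drop-in replacements that each save a REX
prefix byte.

#include <inttypes.h>
#include <stdio.h>

int
main (void)
{
  uint64_t out;
  /* Fill the whole register with ones, then write only its 32-bit name;
     per the x86-64 spec the second mov clears the upper 32 bits.  */
  asm ("movq $-1, %0\n\t"
       "movl $0x15, %k0"
       : "=r" (out));
  printf ("%#" PRIx64 "\n", out);  /* prints 0x15, not 0xffffffff00000015 */
  return 0;
}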
diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S
index aab1a4a..5fa0fe9 100644
--- a/sysdeps/x86_64/memrchr.S
+++ b/sysdeps/x86_64/memrchr.S
@@ -22,7 +22,7 @@
.text
ENTRY (__memrchr)
- movd %rsi, %xmm1
+ movd %esi, %xmm1
sub $16, %rdx
jbe L(length_less16)
@@ -42,8 +42,8 @@ ENTRY (__memrchr)
jnz L(matches0)
sub $64, %rdi
- mov %rdi, %rcx
- and $15, %rcx
+ mov %edi, %ecx
+ and $15, %ecx
jz L(loop_prolog)
add $16, %rdi
@@ -108,8 +108,8 @@ L(loop_prolog):
test %eax, %eax
jnz L(matches0)
- mov %rdi, %rcx
- and $63, %rcx
+ mov %edi, %ecx
+ and $63, %ecx
jz L(align64_loop)
add $64, %rdi
@@ -166,8 +166,8 @@ L(align64_loop):
.p2align 4
L(exit_loop):
- add $64, %rdx
- cmp $32, %rdx
+ add $64, %edx
+ cmp $32, %edx
jbe L(exit_loop_32)
movdqa 48(%rdi), %xmm0
@@ -187,7 +187,7 @@ L(exit_loop):
pmovmskb %xmm3, %eax
test %eax, %eax
jnz L(matches16_1)
- cmp $48, %rdx
+ cmp $48, %edx
jbe L(return_null)
pcmpeqb (%rdi), %xmm1
@@ -204,7 +204,7 @@ L(exit_loop_32):
pmovmskb %xmm0, %eax
test %eax, %eax
jnz L(matches48_1)
- cmp $16, %rdx
+ cmp $16, %edx
jbe L(return_null)
pcmpeqb 32(%rdi), %xmm1
@@ -276,7 +276,7 @@ L(matches48_1):
.p2align 4
L(return_null):
- xor %rax, %rax
+ xor %eax, %eax
ret
.p2align 4
@@ -306,18 +306,16 @@ L(length_less16):
punpcklbw %xmm1, %xmm1
punpcklbw %xmm1, %xmm1
- add $16, %rdx
+ add $16, %edx
pshufd $0, %xmm1, %xmm1
- mov %rdi, %rcx
- and $15, %rcx
+ mov %edi, %ecx
+ and $15, %ecx
jz L(length_less16_offset0)
- mov %rdi, %rcx
- and $15, %rcx
mov %cl, %dh
- mov %rcx, %r8
+ mov %ecx, %esi
add %dl, %dh
and $-16, %rdi
@@ -340,7 +338,7 @@ L(length_less16):
bsr %eax, %eax
add %rdi, %rax
- add %r8, %rax
+ add %rsi, %rax
ret
.p2align 4
@@ -362,14 +360,14 @@ L(length_less16_part2):
pcmpeqb (%rdi), %xmm1
pmovmskb %xmm1, %eax
- mov %r8, %rcx
+ mov %esi, %ecx
sar %cl, %eax
test %eax, %eax
jz L(return_null)
bsr %eax, %eax
add %rdi, %rax
- add %r8, %rax
+ add %rsi, %rax
ret
.p2align 4
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=2e88eb8c07b587d4fb818d9498954fc1f11de3d2
commit 2e88eb8c07b587d4fb818d9498954fc1f11de3d2
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Wed May 24 08:41:23 2017 -0700
Add more tests for memrchr
This patch adds tests for len == 0 and tests for positions close to the
beginning, which are equivalent to positions close to the end for memchr.
* string/test-memrchr.c (test_main): Add tests for len == 0
and tests for positions close to the beginning, which are
equivalent to positions close to the end for memchr.
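As a reminder of the semantics these new cases exercise, here is a hypothetical
standalone check (not part of the patch): with len == 0 memrchr must return NULL
without touching the buffer, and a match near the beginning of the buffer is the
last byte memrchr scans, mirroring a match near the end for memchr.

/* Hypothetical standalone check of the cases the new tests cover.  */
#define _GNU_SOURCE             /* memrchr is a GNU extension.  */
#include <assert.h>
#include <string.h>

int
main (void)
{
  const char buf[] = "abcabc";
  /* len == 0: nothing is searched, NULL is returned.  */
  assert (memrchr (buf, 'a', 0) == NULL);
  /* Match close to the beginning: memrchr scans backwards, so this is
     the last position it examines, like an end match for memchr.  */
  assert (memrchr (buf, 'a', 2) == buf + 0);
  /* The last occurrence wins when several are in range.  */
  assert (memrchr (buf, 'a', 6) == buf + 3);
  return 0;
}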
diff --git a/string/test-memrchr.c b/string/test-memrchr.c
index bfc9920..15483f5 100644
--- a/string/test-memrchr.c
+++ b/string/test-memrchr.c
@@ -151,15 +151,32 @@ test_main (void)
for (i = 1; i < 8; ++i)
{
+ /* Test len == 0. */
+ do_test (i, i, 0, 0);
+ do_test (i, i, 0, 23);
+
do_test (0, 16 << i, 2048, 23);
do_test (i, 64, 256, 23);
do_test (0, 16 << i, 2048, 0);
do_test (i, 64, 256, 0);
+
+ do_test (0, i, 256, 23);
+ do_test (0, i, 256, 0);
+ do_test (i, i, 256, 23);
+ do_test (i, i, 256, 0);
+
}
for (i = 1; i < 32; ++i)
{
do_test (0, i, i + 1, 23);
do_test (0, i, i + 1, 0);
+ do_test (i, i, i + 1, 23);
+ do_test (i, i, i + 1, 0);
+
+ do_test (0, 1, i + 1, 23);
+ do_test (0, 2, i + 1, 0);
+ do_test (i, 1, i + 1, 23);
+ do_test (i, 2, i + 1, 0);
}
do_random_tests ();
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=986b51f7fb78074b4426409002dab9389597df40
commit 986b51f7fb78074b4426409002dab9389597df40
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Wed May 24 08:35:59 2017 -0700
benchtests: Add more tests for memrchr
bench-memchr.c is shared with bench-memrchr.c. This patch adds some
tests for positions close to the beginning for memrchr, which are
equivalent to positions close to the end for memchr.
* benchtests/bench-memchr.c (do_test): Print out both length
and position.
(test_main): Also test the position close to the beginning for
memrchr.
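The sharing mentioned above follows the usual glibc pattern of compiling one
benchmark body twice with a macro selecting the function under test. The actual
bench-memrchr.c wrapper is not shown in this diff, so the sketch below is an
assumption and uses invented names; it only illustrates how the #ifdef
USE_AS_MEMRCHR blocks added below take effect for memrchr and are skipped for
memchr.

/* Minimal, self-contained illustration of the USE_AS_MEMRCHR pattern:
   compile once for memchr, and once with -DUSE_AS_MEMRCHR for memrchr.  */
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>

#ifdef USE_AS_MEMRCHR
# define FUNC_NAME "memrchr"
# define FUNC(s, c, n) memrchr (s, c, n)
#else
# define FUNC_NAME "memchr"
# define FUNC(s, c, n) memchr (s, c, n)
#endif

int
main (void)
{
  const char buf[] = "abcabc";
  /* memchr reports offset 1; memrchr reports offset 4.  */
  printf ("%s found 'b' at offset %td\n", FUNC_NAME,
          (const char *) FUNC (buf, 'b', sizeof buf - 1) - buf);
  return 0;
}

Compiling the same file a second time with -DUSE_AS_MEMRCHR is what lets a single
benchmark body drive both functions.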
diff --git a/benchtests/bench-memchr.c b/benchtests/bench-memchr.c
index 16099ac..c95cc70 100644
--- a/benchtests/bench-memchr.c
+++ b/benchtests/bench-memchr.c
@@ -117,7 +117,8 @@ do_test (size_t align, size_t pos, size_t len, int seek_char)
buf[align + len] = seek_char;
}
- printf ("Length %4zd, alignment %2zd:", pos, align);
+ printf ("Length %4zd, position %4zd, alignment %2zd:",
+ len, pos, align);
FOR_EACH_IMPL (impl, 0)
do_one_test (impl, (CHAR *) (buf + align), seek_char, len, result);
@@ -143,11 +144,27 @@ test_main (void)
do_test (i, 64, 256, 23);
do_test (0, 16 << i, 2048, 0);
do_test (i, 64, 256, 0);
+#ifdef USE_AS_MEMRCHR
+ /* Also test the position close to the beginning for memrchr. */
+ do_test (0, i, 256, 23);
+ do_test (0, i, 256, 0);
+ do_test (i, i, 256, 23);
+ do_test (i, i, 256, 0);
+#endif
}
for (i = 1; i < 32; ++i)
{
do_test (0, i, i + 1, 23);
do_test (0, i, i + 1, 0);
+ do_test (i, i, i + 1, 23);
+ do_test (i, i, i + 1, 0);
+#ifdef USE_AS_MEMRCHR
+ /* Also test the position close to the beginning for memrchr. */
+ do_test (0, 1, i + 1, 23);
+ do_test (0, 2, i + 1, 0);
+ do_test (i, i, i + 1, 23);
+ do_test (i, i, i + 1, 0);
+#endif
}
return ret;
-----------------------------------------------------------------------
hooks/post-receive
--
GNU C Library master sources