This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
[PATCH][BZ #12100] strstr with unaligned loads.
- From: Ondřej Bílka <neleai at seznam dot cz>
- To: libc-alpha at sourceware dot org
- Date: Sun, 25 Aug 2013 11:07:58 +0200
- Subject: [PATCH][BZ #12100] strstr with unaligned loads.
Hi,
This patch continues to remove ineffective sse4.2 functions. Now we
focus on strstr.
I use a technique of doing a vectorized search for the first pair of
characters, which is fastest in practice.
I analyzed the pcmpistri instruction and there is a weakness that degrades
its performance: a limiting factor is the 16th character. It gets matched
when it corresponds to the first character in the needle, and this leads to
many false positives.
This weakness could theoretically be addressed by using pcmpistrm to
compute masks at 8-byte intervals and doing a bitwise and to ensure that
we always check at least 8 bytes.
However this does not work in practice because pcmpistrm is slow and we
are left with pcmpistri, whose result is mostly useless.
New sse2 version is 280% faster on nehalem and fx10 and 400% faster on
ivy bridge for random strings where you pick randomly between 16
characters.
http://kam.mff.cuni.cz/~ondra/benchmark_string/strstr.html
I tried to adapt it to avx2; you can test the following:
http://kam.mff.cuni.cz/~ondra/benchmark_string/strstr_profile250813.tar.bz2
I am collecting data of strstr usage in the wild. I need to find a program
that heavily uses strstr. A gcc benchmark mostly catches strstr used by the
linker. I could still get 10% improvement there but it is hard to get
more as around half of calls are strstr(string,"=") or strstr(string,",").
Another common call is strstr(s,"selinux").
Then there is a second problem: the sse4_2 version is not written properly
and exhibits quadratic behaviour.
This could be solved by the standard buy-or-rent technique, which I employ.
For any position p there could be only p+64n+512 comparisons until I
switch to the two-way algorithm, which gives linear complexity.
I also removed strcasestr as I do not have an implementation yet, but it
should be similar. The main question here is whether for all locales it is
safe to assume that the only case conversion for characters 0-127 is a-z <-> A-Z.
Then I could vectorize the conversion for most cases, which would give a
noticeable speedup. Otherwise I would need to check whether this holds for
the locale.
Passes tests; it needed a lot of manual editing to optimize, so it should
be checked carefully.
From this I will derive an ssse3 version, a shift sse2 version and memmem.
Do you see any optimization opportunity?
Ondra
* sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S: New file
* sysdeps/x86_64/multiarch/strstr-c.c: Moved to ...
* sysdeps/x86_64/multiarch/strstr.c: ... here.
(strstr): Add __strstr_sse2_unaligned ifunc.
* sysdeps/x86_64/multiarch/strcasestr-c.c: Moved to ...
* sysdeps/x86_64/multiarch/strcasestr.c: ... here.
(strcasestr): Remove __strcasestr_sse42 ifunc.
* sysdeps/x86_64/multiarch/strcasestr-nonascii.c: Remove.
* sysdeps/x86_64/multiarch/ifunc-impl-list.c: Update.
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Update.
---
sysdeps/x86_64/multiarch/Makefile | 10 +-
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 4 +-
sysdeps/x86_64/multiarch/strcasestr-c.c | 19 --
sysdeps/x86_64/multiarch/strcasestr-nonascii.c | 50 ---
sysdeps/x86_64/multiarch/strcasestr.c | 23 +-
sysdeps/x86_64/multiarch/strstr-c.c | 47 ---
sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S | 381 ++++++++++++++++++++++
sysdeps/x86_64/multiarch/strstr.c | 388 ++---------------------
8 files changed, 428 insertions(+), 494 deletions(-)
delete mode 100644 sysdeps/x86_64/multiarch/strcasestr-c.c
delete mode 100644 sysdeps/x86_64/multiarch/strcasestr-nonascii.c
delete mode 100644 sysdeps/x86_64/multiarch/strstr-c.c
create mode 100644 sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 203d16e..146e800 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -9,22 +9,20 @@ ifeq ($(subdir),string)
sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
strend-sse4 memcmp-sse4 memcpy-ssse3 memcpy-sse2-unaligned mempcpy-ssse3 \
memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
- memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \
+ memmove-ssse3-back strcasecmp_l-ssse3 \
strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
strcpy-sse2-unaligned strncpy-sse2-unaligned \
stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
strcat-sse2-unaligned strncat-sse2-unaligned \
- strrchr-sse2-no-bsf strchr-sse2-no-bsf memcmp-ssse3
+ strrchr-sse2-no-bsf strchr-sse2-no-bsf memcmp-ssse3 \
+ strstr-sse2-unaligned
ifeq (yes,$(config-cflags-sse4))
-sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift
+sysdep_routines += strcspn-c strpbrk-c strspn-c varshift
CFLAGS-varshift.c += -msse4
CFLAGS-strcspn-c.c += -msse4
CFLAGS-strpbrk-c.c += -msse4
CFLAGS-strspn-c.c += -msse4
-CFLAGS-strstr.c += -msse4
-CFLAGS-strcasestr.c += -msse4
-CFLAGS-strcasestr-nonascii.c += -msse4
endif
endif
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 28d3579..0eaddc7 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -104,8 +104,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strcasestr.c. */
IFUNC_IMPL (i, name, strcasestr,
- IFUNC_IMPL_ADD (array, i, strcasestr, HAS_SSE4_2,
- __strcasestr_sse42)
IFUNC_IMPL_ADD (array, i, strcasestr, 1, __strcasestr_sse2))
/* Support sysdeps/x86_64/multiarch/strcat.S. */
@@ -196,7 +194,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strstr-c.c. */
IFUNC_IMPL (i, name, strstr,
- IFUNC_IMPL_ADD (array, i, strstr, HAS_SSE4_2, __strstr_sse42)
+ IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2))
/* Support sysdeps/x86_64/multiarch/wcscpy.S. */
diff --git a/sysdeps/x86_64/multiarch/strcasestr-c.c b/sysdeps/x86_64/multiarch/strcasestr-c.c
deleted file mode 100644
index c13a4c4..0000000
--- a/sysdeps/x86_64/multiarch/strcasestr-c.c
+++ /dev/null
@@ -1,19 +0,0 @@
-/* Multiple versions of strcasestr
- All versions must be listed in ifunc-impl-list.c. */
-
-#include "init-arch.h"
-
-#define STRCASESTR __strcasestr_sse2
-
-#include "string/strcasestr.c"
-
-extern char *__strcasestr_sse42 (const char *, const char *) attribute_hidden;
-extern __typeof (__strcasestr_sse2) __strcasestr_sse2 attribute_hidden;
-
-#if 1
-libc_ifunc (__strcasestr,
- HAS_SSE4_2 ? __strcasestr_sse42 : __strcasestr_sse2);
-#else
-libc_ifunc (__strcasestr,
- 0 ? __strcasestr_sse42 : __strcasestr_sse2);
-#endif
diff --git a/sysdeps/x86_64/multiarch/strcasestr-nonascii.c b/sysdeps/x86_64/multiarch/strcasestr-nonascii.c
deleted file mode 100644
index 032a642..0000000
--- a/sysdeps/x86_64/multiarch/strcasestr-nonascii.c
+++ /dev/null
@@ -1,50 +0,0 @@
-/* strstr with SSE4.2 intrinsics
- Copyright (C) 2010-2013 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <ctype.h>
-#include <xmmintrin.h>
-
-
-/* Similar to __m128i_strloadu. Convert to lower case for none-POSIX/C
- locale. */
-static __m128i
-__m128i_strloadu_tolower (const unsigned char *p)
-{
- union
- {
- char b[16];
- __m128i x;
- } u;
-
- for (int i = 0; i < 16; ++i)
- if (p[i] == 0)
- {
- u.b[i] = 0;
- break;
- }
- else
- u.b[i] = tolower (p[i]);
-
- return u.x;
-}
-
-
-#define STRCASESTR_NONASCII
-#define USE_AS_STRCASESTR
-#define STRSTR_SSE42 __strcasestr_sse42_nonascii
-#include "strstr.c"
diff --git a/sysdeps/x86_64/multiarch/strcasestr.c b/sysdeps/x86_64/multiarch/strcasestr.c
index d1cfb3b..2c2454a 100644
--- a/sysdeps/x86_64/multiarch/strcasestr.c
+++ b/sysdeps/x86_64/multiarch/strcasestr.c
@@ -1,7 +1,18 @@
-extern char *__strcasestr_sse42_nonascii (const unsigned char *s1,
- const unsigned char *s2)
- attribute_hidden;
+/* Multiple versions of strcasestr
+ All versions must be listed in ifunc-impl-list.c. */
-#define USE_AS_STRCASESTR
-#define STRSTR_SSE42 __strcasestr_sse42
-#include "strstr.c"
+#include "init-arch.h"
+
+#define STRCASESTR __strcasestr_sse2
+
+#include "string/strcasestr.c"
+
+extern __typeof (__strcasestr_sse2) __strcasestr_sse2 attribute_hidden;
+
+#if 1
+libc_ifunc (__strcasestr,
+ __strcasestr_sse2);
+#else
+libc_ifunc (__strcasestr,
+ 0 ? __strcasestr_sse42 : __strcasestr_sse2);
+#endif
diff --git a/sysdeps/x86_64/multiarch/strstr-c.c b/sysdeps/x86_64/multiarch/strstr-c.c
deleted file mode 100644
index 42bbe48..0000000
--- a/sysdeps/x86_64/multiarch/strstr-c.c
+++ /dev/null
@@ -1,47 +0,0 @@
-/* Multiple versions of strstr.
- All versions must be listed in ifunc-impl-list.c.
- Copyright (C) 2012-2013 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-/* Redefine strstr so that the compiler won't complain about the type
- mismatch with the IFUNC selector in strong_alias, below. */
-#undef strstr
-#define strstr __redirect_strstr
-#include <string.h>
-#undef strstr
-
-#define STRSTR __strstr_sse2
-#ifdef SHARED
-# undef libc_hidden_builtin_def
-# define libc_hidden_builtin_def(name) \
- __hidden_ver1 (__strstr_sse2, __GI_strstr, __strstr_sse2);
-#endif
-
-#include "string/strstr.c"
-
-extern __typeof (__redirect_strstr) __strstr_sse42 attribute_hidden;
-extern __typeof (__redirect_strstr) __strstr_sse2 attribute_hidden;
-
-#include "init-arch.h"
-
-/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
- ifunc symbol properly. */
-extern __typeof (__redirect_strstr) __libc_strstr;
-libc_ifunc (__libc_strstr, HAS_SSE4_2 ? __strstr_sse42 : __strstr_sse2)
-
-#undef strstr
-strong_alias (__libc_strstr, strstr)
diff --git a/sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S
new file mode 100644
index 0000000..6c134fa
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S
@@ -0,0 +1,381 @@
+/* strstr with unaligned loads
+ Copyright (C) 2009-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#ifndef ALIGN
+# define ALIGN(n) .p2align n
+#endif
+
+
+ENTRY(__strstr_sse2_unaligned)
+ movzbl (%rsi), %eax
+ testb %al, %al
+ je L(empty)
+ movzbl 1(%rsi), %edx
+ testb %dl, %dl
+ je L(strchr)
+ movd %eax, %xmm1
+ movd %edx, %xmm2
+ movq %rdi, %rax
+ andl $4095, %eax
+ punpcklbw %xmm1, %xmm1
+ cmpq $4031, %rax
+ punpcklbw %xmm2, %xmm2
+ punpcklwd %xmm1, %xmm1
+ punpcklwd %xmm2, %xmm2
+ pshufd $0, %xmm1, %xmm1
+ pshufd $0, %xmm2, %xmm2
+ ja L(cross_page)
+ movdqu (%rdi), %xmm3
+ pxor %xmm5, %xmm5
+ movdqu 1(%rdi), %xmm4
+ movdqa %xmm3, %xmm6
+ pcmpeqb %xmm1, %xmm3
+ pcmpeqb %xmm2, %xmm4
+ movdqu 16(%rdi), %xmm0
+ pcmpeqb %xmm5, %xmm6
+ pminub %xmm4, %xmm3
+ movdqa %xmm3, %xmm4
+ movdqu 17(%rdi), %xmm3
+ pcmpeqb %xmm0, %xmm5
+ pcmpeqb %xmm2, %xmm3
+ por %xmm6, %xmm4
+ pcmpeqb %xmm1, %xmm0
+ pminub %xmm3, %xmm0
+ por %xmm5, %xmm0
+ pmovmskb %xmm4, %r8d
+ pmovmskb %xmm0, %eax
+ salq $16, %rax
+ orq %rax, %r8
+ je L(next_32_bytes)
+L(next_pair_index):
+ bsf %r8, %rax
+ addq %rdi, %rax
+ cmpb $0, (%rax)
+ je L(zero1)
+ movzbl 2(%rsi), %edx
+ testb %dl, %dl
+ je L(found1)
+ cmpb 2(%rax), %dl
+ jne L(next_pair)
+ xorl %edx, %edx
+ jmp L(pair_loop_start)
+
+ ALIGN (4)
+L(strchr):
+ movzbl %al, %esi
+ jmp __strchr_sse2
+
+ ALIGN (4)
+L(pair_loop):
+ addq $1, %rdx
+ cmpb 2(%rax,%rdx), %cl
+ jne L(next_pair)
+L(pair_loop_start):
+ movzbl 3(%rsi,%rdx), %ecx
+ testb %cl, %cl
+ jne L(pair_loop)
+L(found1):
+ ret
+L(zero1):
+ xorl %eax, %eax
+ ret
+
+ ALIGN (4)
+L(next_pair):
+ leaq -1(%r8), %rax
+ andq %rax, %r8
+ jne L(next_pair_index)
+
+ ALIGN (4)
+L(next_32_bytes):
+ movdqu 32(%rdi), %xmm3
+ pxor %xmm5, %xmm5
+ movdqu 33(%rdi), %xmm4
+ movdqa %xmm3, %xmm6
+ pcmpeqb %xmm1, %xmm3
+ pcmpeqb %xmm2, %xmm4
+ movdqu 48(%rdi), %xmm0
+ pcmpeqb %xmm5, %xmm6
+ pminub %xmm4, %xmm3
+ movdqa %xmm3, %xmm4
+ movdqu 49(%rdi), %xmm3
+ pcmpeqb %xmm0, %xmm5
+ pcmpeqb %xmm2, %xmm3
+ por %xmm6, %xmm4
+ pcmpeqb %xmm1, %xmm0
+ pminub %xmm3, %xmm0
+ por %xmm5, %xmm0
+ pmovmskb %xmm4, %eax
+ salq $32, %rax
+ pmovmskb %xmm0, %r8d
+ salq $48, %r8
+ orq %rax, %r8
+ je L(loop_header)
+L(next_pair2_index):
+ bsfq %r8, %rax
+ addq %rdi, %rax
+ cmpb $0, (%rax)
+ je L(zero2)
+ movzbl 2(%rsi), %edx
+ testb %dl, %dl
+ je L(found2)
+ cmpb 2(%rax), %dl
+ jne L(next_pair2)
+ xorl %edx, %edx
+ jmp L(pair_loop2_start)
+
+ ALIGN (4)
+L(pair_loop2):
+ addq $1, %rdx
+ cmpb 2(%rax,%rdx), %cl
+ jne L(next_pair2)
+L(pair_loop2_start):
+ movzbl 3(%rsi,%rdx), %ecx
+ testb %cl, %cl
+ jne L(pair_loop2)
+L(found2):
+ ret
+ L(zero2):
+ xorl %eax, %eax
+ ret
+L(empty):
+ mov %rdi, %rax
+ ret
+
+ ALIGN (4)
+L(next_pair2):
+ leaq -1(%r8), %rax
+ andq %rax, %r8
+ jne L(next_pair2_index)
+L(loop_header):
+ movq $-512, %r11
+ movq %rdi, %r9
+
+ pxor %xmm7, %xmm7
+ andq $-64, %rdi
+
+ ALIGN(4)
+L(loop):
+ movdqa 64(%rdi), %xmm3
+ movdqu 63(%rdi), %xmm6
+ movdqa %xmm3, %xmm0
+ pxor %xmm2, %xmm3
+ pxor %xmm1, %xmm6
+ movdqa 80(%rdi), %xmm10
+ por %xmm3, %xmm6
+ pminub %xmm10, %xmm0
+ movdqu 79(%rdi), %xmm3
+ pxor %xmm2, %xmm10
+ pxor %xmm1, %xmm3
+ movdqa 96(%rdi), %xmm9
+ por %xmm10, %xmm3
+ pminub %xmm9, %xmm0
+ pxor %xmm2, %xmm9
+ movdqa 112(%rdi), %xmm8
+ addq $64, %rdi
+ pminub %xmm6, %xmm3
+ movdqu 31(%rdi), %xmm4
+ pminub %xmm8, %xmm0
+ pxor %xmm2, %xmm8
+ pxor %xmm1, %xmm4
+ por %xmm9, %xmm4
+ pminub %xmm4, %xmm3
+ movdqu 47(%rdi), %xmm5
+ pxor %xmm1, %xmm5
+ por %xmm8, %xmm5
+ pminub %xmm5, %xmm3
+ pminub %xmm3, %xmm0
+ pcmpeqb %xmm7, %xmm0
+ pmovmskb %xmm0, %eax
+ testl %eax, %eax
+ je L(loop)
+ pminub (%rdi), %xmm6
+ pminub 32(%rdi),%xmm4
+ pminub 48(%rdi),%xmm5
+ pcmpeqb %xmm7, %xmm6
+ pcmpeqb %xmm7, %xmm5
+ pmovmskb %xmm6, %edx
+ movdqa 16(%rdi), %xmm8
+ pcmpeqb %xmm7, %xmm4
+ movdqu 15(%rdi), %xmm0
+ pmovmskb %xmm5, %r8d
+ movdqa %xmm8, %xmm3
+ pmovmskb %xmm4, %ecx
+ pcmpeqb %xmm1,%xmm0
+ pcmpeqb %xmm2,%xmm3
+ salq $32, %rcx
+ pcmpeqb %xmm7,%xmm8
+ salq $48, %r8
+ pminub %xmm0,%xmm3
+ orq %rcx, %rdx
+ por %xmm3,%xmm8
+ orq %rdx, %r8
+ pmovmskb %xmm8, %eax
+ salq $16, %rax
+ orq %rax, %r8
+L(next_pair_index3):
+ bsfq %r8, %rcx
+ addq %rdi, %rcx
+ cmpb $0, (%rcx)
+ je L(zero)
+ xorl %eax, %eax
+ movzbl 2(%rsi), %edx
+ testb %dl, %dl
+ je L(success3)
+ cmpb 1(%rcx), %dl
+ jne L(next_pair3)
+ jmp L(pair_loop_start3)
+
+ ALIGN (4)
+L(pair_loop3):
+ addq $1, %rax
+ cmpb 1(%rcx,%rax), %dl
+ jne L(next_pair3)
+L(pair_loop_start3):
+ movzbl 3(%rsi,%rax), %edx
+ testb %dl, %dl
+ jne L(pair_loop3)
+L(success3):
+ lea -1(%rcx), %rax
+ ret
+
+ ALIGN (4)
+L(next_pair3):
+ addq %rax, %r11
+ movq %rdi, %rax
+ subq %r9, %rax
+ cmpq %r11, %rax
+ jl L(switch_strstr)
+ leaq -1(%r8), %rax
+ andq %rax, %r8
+ jne L(next_pair_index3)
+ jmp L(loop)
+
+ ALIGN (4)
+L(switch_strstr):
+ movq %rdi, %rdi
+ jmp __strstr_sse2
+
+
+ ALIGN (4)
+L(cross_page):
+
+ movq %rdi, %rax
+ pxor %xmm0, %xmm0
+ andq $-64, %rax
+ movdqa (%rax), %xmm3
+ movdqu -1(%rax), %xmm4
+ movdqa %xmm3, %xmm8
+ movdqa 16(%rax), %xmm5
+ pcmpeqb %xmm1, %xmm4
+ pcmpeqb %xmm0, %xmm8
+ pcmpeqb %xmm2, %xmm3
+ movdqa %xmm5, %xmm7
+ pminub %xmm4, %xmm3
+ movdqu 15(%rax), %xmm4
+ pcmpeqb %xmm0, %xmm7
+ por %xmm3, %xmm8
+ movdqa %xmm5, %xmm3
+ movdqa 32(%rax), %xmm5
+ pcmpeqb %xmm1, %xmm4
+ pcmpeqb %xmm2, %xmm3
+ movdqa %xmm5, %xmm6
+ pmovmskb %xmm8, %ecx
+ pminub %xmm4, %xmm3
+ movdqu 31(%rax), %xmm4
+ por %xmm3, %xmm7
+ movdqa %xmm5, %xmm3
+ pcmpeqb %xmm0, %xmm6
+ movdqa 48(%rax), %xmm5
+ pcmpeqb %xmm1, %xmm4
+ pmovmskb %xmm7, %r8d
+ pcmpeqb %xmm2, %xmm3
+ pcmpeqb %xmm5, %xmm0
+ pminub %xmm4, %xmm3
+ movdqu 47(%rax), %xmm4
+ por %xmm3, %xmm6
+ movdqa %xmm5, %xmm3
+ salq $16, %r8
+ pcmpeqb %xmm1, %xmm4
+ pcmpeqb %xmm2, %xmm3
+ pmovmskb %xmm6, %r10d
+ pminub %xmm4, %xmm3
+ por %xmm3, %xmm0
+ salq $32, %r10
+ orq %r10, %r8
+ orq %rcx, %r8
+ movl %edi, %ecx
+ pmovmskb %xmm0, %edx
+ subl %eax, %ecx
+ salq $48, %rdx
+ orq %rdx, %r8
+ shrq %cl, %r8
+ je L(loop_header)
+L(next_pair_index4):
+ bsfq %r8, %rax
+ addq %rdi, %rax
+ cmpb $0, (%rax)
+ je L(zero)
+
+ cmpq %rax,%rdi
+ je L(next_pair4)
+
+ movzbl 2(%rsi), %edx
+ testb %dl, %dl
+ je L(found3)
+ cmpb 1(%rax), %dl
+ jne L(next_pair4)
+ xorl %edx, %edx
+ jmp L(pair_loop_start4)
+
+
+ ALIGN (4)
+L(pair_loop4):
+ addq $1, %rdx
+ cmpb 1(%rax,%rdx), %cl
+ jne L(next_pair4)
+L(pair_loop_start4):
+ movzbl 3(%rsi,%rdx), %ecx
+ testb %cl, %cl
+ jne L(pair_loop4)
+L(found3):
+ subq $1, %rax
+ ret
+
+ALIGN (4)
+L(next_pair4):
+ leaq -1(%r8), %rax
+ andq %rax, %r8
+ jne L(next_pair_index4)
+ jmp L(loop_header)
+
+ ALIGN (4)
+L(found):
+ rep
+ ret
+
+ ALIGN (4)
+L(zero):
+ xorl %eax, %eax
+ ret
+
+
+END(__strstr_sse2_unaligned)
diff --git a/sysdeps/x86_64/multiarch/strstr.c b/sysdeps/x86_64/multiarch/strstr.c
index cd63b68..fbff3a8 100644
--- a/sysdeps/x86_64/multiarch/strstr.c
+++ b/sysdeps/x86_64/multiarch/strstr.c
@@ -1,6 +1,6 @@
-/* strstr with SSE4.2 intrinsics
- Copyright (C) 2009-2013 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
+/* Multiple versions of strstr.
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2012-2013 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -17,369 +17,31 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#include <nmmintrin.h>
-#include "varshift.h"
-
-#ifndef STRSTR_SSE42
-# define STRSTR_SSE42 __strstr_sse42
-#endif
-
-#ifdef USE_AS_STRCASESTR
-# include <ctype.h>
-# include <locale/localeinfo.h>
-
-# define LOADBYTE(C) tolower (C)
-# define CMPBYTE(C1, C2) (tolower (C1) == tolower (C2))
-#else
-# define LOADBYTE(C) (C)
-# define CMPBYTE(C1, C2) ((C1) == (C2))
-#endif
-
-/* We use 0xe ordered-compare:
- _SIDD_SBYTE_OPS
- | _SIDD_CMP_EQUAL_ORDER
- | _SIDD_LEAST_SIGNIFICANT
- on pcmpistri to do the scanning and string comparsion requirements of
- sub-string match. In the scanning phase, we process Cflag and ECX
- index to locate the first fragment match; once the first fragment
- match position has been identified, we do comparison of subsequent
- string fragments until we can conclude false or true match; whe
- n concluding a false match, we may need to repeat scanning process
- from next relevant offset in the target string.
-
- In the scanning phase we have 4 cases:
- case ECX CFlag ZFlag SFlag
- 1 16 0 0 0
- 2a 16 0 0 1
- 2b 16 0 1 0
- 2c 16 0 1 1
-
- 1. No ordered-comparison match, both 16B fragments are valid, so
- continue to next fragment.
- 2. No ordered-comparison match, there is EOS in either fragment,
- 2a. Zflg = 0, Sflg = 1, we continue
- 2b. Zflg = 1, Sflg = 0, we conclude no match and return.
- 2c. Zflg = 1, sflg = 1, lenth determine match or no match
-
- In the string comparison phase, the 1st fragment match is fixed up
- to produce ECX = 0. Subsequent fragment compare of nonzero index
- and no match conclude a false match.
-
- case ECX CFlag ZFlag SFlag
- 3 X 1 0 0/1
- 4a 0 1 0 0
- 4b 0 1 0 1
- 4c 0 < X 1 0 0/1
- 5 16 0 1 0
-
- 3. An initial ordered-comparison fragment match, we fix up to do
- subsequent string comparison
- 4a. Continuation of fragment comparison of a string compare.
- 4b. EOS reached in the reference string, we conclude true match and
- return
- 4c. String compare failed if index is nonzero, we need to go back to
- scanning
- 5. failed string compare, go back to scanning
- */
-
-#if !(defined USE_AS_STRCASESTR && defined STRCASESTR_NONASCII)
-/* Simple replacement of movdqu to address 4KB boundary cross issue.
- If EOS occurs within less than 16B before 4KB boundary, we don't
- cross to next page. */
-static __m128i
-__m128i_strloadu (const unsigned char * p, __m128i zero)
-{
- if (__builtin_expect ((int) ((size_t) p & 0xfff) > 0xff0, 0))
- {
- size_t offset = ((size_t) p & (16 - 1));
- __m128i a = _mm_load_si128 ((__m128i *) (p - offset));
- int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (a, zero));
- if ((bmsk >> offset) != 0)
- return __m128i_shift_right (a, offset);
- }
- return _mm_loadu_si128 ((__m128i *) p);
-}
-#endif
-
-#if defined USE_AS_STRCASESTR && !defined STRCASESTR_NONASCII
-
-/* Similar to __m128i_strloadu. Convert to lower case for POSIX/C
- locale and other which have single-byte letters only in the ASCII
- range. */
-static __m128i
-__m128i_strloadu_tolower (const unsigned char *p, __m128i zero, __m128i uclow,
- __m128i uchigh, __m128i lcqword)
-{
- __m128i frag = __m128i_strloadu (p, zero);
-
- /* Compare if 'Z' > bytes. Inverted way to get a mask for byte <= 'Z'. */
- __m128i r2 = _mm_cmpgt_epi8 (uchigh, frag);
- /* Compare if bytes are > 'A' - 1. */
- __m128i r1 = _mm_cmpgt_epi8 (frag, uclow);
- /* Mask byte == ff if byte(r2) <= 'Z' and byte(r1) > 'A' - 1. */
- __m128i mask = _mm_and_si128 (r2, r1);
- /* Apply lowercase bit 6 mask for above mask bytes == ff. */
- return _mm_or_si128 (frag, _mm_and_si128 (mask, lcqword));
-}
-
-#endif
-
-/* Calculate Knuth-Morris-Pratt string searching algorithm (or KMP
- algorithm) overlap for a fully populated 16B vector.
- Input parameter: 1st 16Byte loaded from the reference string of a
- strstr function.
- We don't use KMP algorithm if reference string is less than 16B. */
-static int
-__inline__ __attribute__ ((__always_inline__,))
-KMP16Bovrlap (__m128i s2)
-{
- __m128i b = _mm_unpacklo_epi8 (s2, s2);
- __m128i a = _mm_unpacklo_epi8 (b, b);
- a = _mm_shuffle_epi32 (a, 0);
- b = _mm_srli_si128 (s2, sizeof (char));
- int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (b, a));
-
- /* _BitScanForward(&k1, bmsk); */
- int k1;
- __asm ("bsfl %[bmsk], %[k1]" : [k1] "=r" (k1) : [bmsk] "r" (bmsk));
- if (!bmsk)
- return 16;
- else if (bmsk == 0x7fff)
- return 1;
- else if (!k1)
- {
- /* There are al least two distinct chars in s2. If byte 0 and 1 are
- idential and the distinct value lies farther down, we can deduce
- the next byte offset to restart full compare is least no earlier
- than byte 3. */
- return 3;
- }
- else
- {
- /* Byte 1 is not degenerated to byte 0. */
- return k1 + 1;
- }
-}
-
-char *
-__attribute__ ((section (".text.sse4.2")))
-STRSTR_SSE42 (const unsigned char *s1, const unsigned char *s2)
-{
-#define p1 s1
- const unsigned char *p2 = s2;
-
-#ifndef STRCASESTR_NONASCII
- if (__builtin_expect (p2[0] == '\0', 0))
- return (char *) p1;
-
- if (__builtin_expect (p1[0] == '\0', 0))
- return NULL;
-
- /* Check if p1 length is 1 byte long. */
- if (__builtin_expect (p1[1] == '\0', 0))
- return p2[1] == '\0' && CMPBYTE (p1[0], p2[0]) ? (char *) p1 : NULL;
-#endif
-
-#ifdef USE_AS_STRCASESTR
-# ifndef STRCASESTR_NONASCII
- if (__builtin_expect (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_NONASCII_CASE)
- != 0, 0))
- return __strcasestr_sse42_nonascii (s1, s2);
-
- const __m128i uclow = _mm_set1_epi8 (0x40);
- const __m128i uchigh = _mm_set1_epi8 (0x5b);
- const __m128i lcqword = _mm_set1_epi8 (0x20);
- const __m128i zero = _mm_setzero_si128 ();
-# define strloadu(p) __m128i_strloadu_tolower (p, zero, uclow, uchigh, lcqword)
-# else
-# define strloadu __m128i_strloadu_tolower
-# define zero _mm_setzero_si128 ()
-# endif
-#else
-# define strloadu(p) __m128i_strloadu (p, zero)
- const __m128i zero = _mm_setzero_si128 ();
+/* Redefine strstr so that the compiler won't complain about the type
+ mismatch with the IFUNC selector in strong_alias, below. */
+#undef strstr
+#define strstr __redirect_strstr
+#include <string.h>
+#undef strstr
+
+#define STRSTR __strstr_sse2
+#ifdef SHARED
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(name) \
+ __hidden_ver1 (__strstr_sse2, __GI_strstr, __strstr_sse2);
#endif
- /* p1 > 1 byte long. Load up to 16 bytes of fragment. */
- __m128i frag1 = strloadu (p1);
-
- __m128i frag2;
- if (p2[1] != '\0')
- /* p2 is > 1 byte long. */
- frag2 = strloadu (p2);
- else
- frag2 = _mm_insert_epi8 (zero, LOADBYTE (p2[0]), 0);
-
- /* Unsigned bytes, equal order, does frag2 has null? */
- int cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
- int cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
- int cmp = _mm_cmpistri (frag2, frag1, 0x0c);
- int cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
- if (cmp_s & cmp_c)
- {
- int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (frag2, zero));
- int len;
- __asm ("bsfl %[bmsk], %[len]"
- : [len] "=r" (len) : [bmsk] "r" (bmsk));
- p1 += cmp;
- if ((len + cmp) <= 16)
- return (char *) p1;
-
- /* Load up to 16 bytes of fragment. */
- frag1 = strloadu (p1);
- cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
- cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
- cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
- cmp = _mm_cmpistri (frag2, frag1, 0x0c);
- if ((len + cmp) <= 16)
- return (char *) p1 + cmp;
- }
-
- if (cmp_s)
- {
- /* Adjust addr for 16B alginment in ensuing loop. */
- while (!cmp_z)
- {
- p1 += cmp;
- /* Load up to 16 bytes of fragment. */
- frag1 = strloadu (p1);
- cmp = _mm_cmpistri (frag2, frag1, 0x0c);
- cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
- cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
- /* Because s2 < 16 bytes and we adjusted p1 by non-zero cmp
- once already, this time cmp will be zero and we can exit. */
- if ((!cmp) & cmp_c)
- break;
- }
-
- if (!cmp_c)
- return NULL;
-
- /* Since s2 is less than 16 bytes, com_c is definitive
- determination of full match. */
- return (char *) p1 + cmp;
- }
-
- /* General case, s2 is at least 16 bytes or more.
- First, the common case of false-match at first byte of p2. */
- const unsigned char *pt = NULL;
- int kmp_fwd = 0;
-re_trace:
- while (!cmp_c)
- {
- /* frag1 has null. */
- if (cmp_z)
- return NULL;
-
- /* frag 1 has no null, advance 16 bytes. */
- p1 += 16;
- /* Load up to 16 bytes of fragment. */
- frag1 = strloadu (p1);
- /* Unsigned bytes, equal order, is there a partial match? */
- cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
- cmp = _mm_cmpistri (frag2, frag1, 0x0c);
- cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
- }
-
- /* Next, handle initial positive match as first byte of p2. We have
- a partial fragment match, make full determination until we reached
- end of s2. */
- if (!cmp)
- {
- if (cmp_z)
- return (char *) p1;
-
- pt = p1;
- p1 += 16;
- p2 += 16;
- /* Load up to 16 bytes of fragment. */
- frag2 = strloadu (p2);
- }
- else
- {
- /* Adjust 16B alignment. */
- p1 += cmp;
- pt = p1;
- }
-
- /* Load up to 16 bytes of fragment. */
- frag1 = strloadu (p1);
-
- /* Unsigned bytes, equal order, does frag2 has null? */
- cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
- cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
- cmp = _mm_cmpistri (frag2, frag1, 0x0c);
- cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
- while (!(cmp | cmp_z | cmp_s))
- {
- p1 += 16;
- p2 += 16;
- /* Load up to 16 bytes of fragment. */
- frag2 = strloadu (p2);
- /* Load up to 16 bytes of fragment. */
- frag1 = strloadu (p1);
- /* Unsigned bytes, equal order, does frag2 has null? */
- cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
- cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
- cmp = _mm_cmpistri (frag2, frag1, 0x0c);
- cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
- }
-
- /* Full determination yielded a false result, retrace s1 to next
- starting position.
- Zflg 1 0 1 0/1
- Sflg 0 1 1 0/1
- cmp na 0 0 >0
- action done done continue continue if s2 < s1
- false match retrace s1 else false
- */
-
- if (cmp_s & !cmp)
- return (char *) pt;
- if (cmp_z)
- {
- if (!cmp_s)
- return NULL;
-
- /* Handle both zero and sign flag set and s1 is shorter in
- length. */
- int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (zero, frag2));
- int bmsk1 = _mm_movemask_epi8 (_mm_cmpeq_epi8 (zero, frag1));
- int len;
- int len1;
- __asm ("bsfl %[bmsk], %[len]"
- : [len] "=r" (len) : [bmsk] "r" (bmsk));
- __asm ("bsfl %[bmsk1], %[len1]"
- : [len1] "=r" (len1) : [bmsk1] "r" (bmsk1));
- if (len >= len1)
- return NULL;
- }
- else if (!cmp)
- return (char *) pt;
-
- /* Otherwise, we have to retrace and continue. Default of multiple
- paths that need to retrace from next byte in s1. */
- p2 = s2;
- frag2 = strloadu (p2);
-
- if (!kmp_fwd)
- kmp_fwd = KMP16Bovrlap (frag2);
+#include "string/strstr.c"
- /* KMP algorithm predicted overlap needs to be corrected for
- partial fragment compare. */
- p1 = pt + (kmp_fwd > cmp ? cmp : kmp_fwd);
+extern __typeof (__redirect_strstr) __strstr_sse2_unaligned attribute_hidden;
+extern __typeof (__redirect_strstr) __strstr_sse2 attribute_hidden;
- /* Since s2 is at least 16 bytes long, we're certain there is no
- match. */
- if (p1[0] == '\0')
- return NULL;
+#include "init-arch.h"
- /* Load up to 16 bytes of fragment. */
- frag1 = strloadu (p1);
+/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
+ ifunc symbol properly. */
+extern __typeof (__redirect_strstr) __libc_strstr;
+libc_ifunc (__libc_strstr, HAS_FAST_UNALIGNED_LOAD ? __strstr_sse2_unaligned : __strstr_sse2)
- /* Unsigned bytes, equal order, is there a partial match? */
- cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
- cmp = _mm_cmpistri (frag2, frag1, 0x0c);
- cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
- goto re_trace;
-}
+#undef strstr
+strong_alias (__libc_strstr, strstr)
--
1.8.3.2