This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
Re: [PING][PATCH][BZ #12100] strstr with unaligned loads.
- From: Liubov Dmitrieva <liubov dot dmitrieva at gmail dot com>
- To: Ondřej Bílka <neleai at seznam dot cz>
- Cc: GNU C Library <libc-alpha at sourceware dot org>
- Date: Mon, 7 Oct 2013 18:33:32 +0400
- Subject: Re: [PING][PATCH][BZ #12100] strstr with unaligned loads.
- Authentication-results: sourceware.org; auth=none
- References: <20130825090758 dot GA23686 at domone dot kolej dot mff dot cuni dot cz> <20130908204231 dot GA30260 at domone dot kolej dot mff dot cuni dot cz> <20131007131430 dot GE5515 at domone>
It looks good to me.
--
Liubov
Intel Corporation
On Mon, Oct 7, 2013 at 5:14 PM, Ondřej Bílka <neleai@seznam.cz> wrote:
> ping,
>
> Benchmark results with haswell are available here:
> http://kam.mff.cuni.cz/~ondra/benchmark_string/strstr_profile.html
>
> On Sun, Sep 08, 2013 at 10:42:31PM +0200, Ondřej Bílka wrote:
>> ping
>> On Sun, Aug 25, 2013 at 11:07:58AM +0200, Ondřej Bílka wrote:
>> > Hi,
>> >
>> > This patch continues to remove ineffective sse4.2 functions. Now we
>> > focus on strstr.
>> >
>> > I use a technique of doing vectorized search for first pair of characters
>> > which is fastest in practice.
>> >
>> > I analyzed a pcmpistri instruction and there is a weakness that degrades
>> > performance. In comparison with pcmpistri a limiting factor is 16-th
>> > character. It gets matched when it corresponds to first character in
>> > needle and this leads to many false positives.
>> >
>> > This weakness could theoretically be addressed by using pcmpistrm to
>> > compute masks at 8 byte intervals and doing bitwise and to ensure that
>> > we always check at least 8 bytes.
>> >
>> > However this does not work in practice because pcmpistrm is slow and we
>> > are left with pcmpistri whose result is mostly useless.
>> >
>> > New sse2 version is 280% faster on nehalem and fx10 and 400% faster on
>> > ivy bridge for random strings where you pick randomly between 16
>> > characters.
>> >
>> > I tried to adapt it to avx2, you can test following
>> > http://kam.mff.cuni.cz/~ondra/benchmark_string/strstr_profile250813.tar.bz2
>> >
>> > I am collecting data of strstr usage in the wild. I need to find a program
>> > that heavily uses strstr. A gcc benchmark mostly catches strstr used by
>> > linker. I could still get 10% improvement there but it is hard to get
>> > more as around half of calls are strstr(string,"=") or strstr(string,",").
>> > Another common call is strstr(s,"selinux").
>> >
>> >
>> > Then there is second problem that sse4_2 version is not written properly
>> > and exhibits quadratic behaviour.
>> >
>> > This could be solved by standard buy or rent technique which I employ.
>> > For any position p there could be only p+64n+512 comparisons until I
>> > switch to two way algorithm which gives linear complexity.
>> >
>> > I also removed strcasestr as I do not have implementation yet but it
>> > should be similar. Main question here is if for all locales it is safe
>> > to assume that only case conversion for characters 0-127 is a-z <-> A-Z
>> > Then I could vectorize conversion for most cases which would give
>> > noticeable speedup. Otherwise I would need to check locale if this
>> > holds.
>> >
>> > Passes tests, it needed lot of manual editing to optimize so it should
>> > be checked carefully.
>> >
>> > >From this I will derive a ssse3 version, shift sse2 version and memmem.
>> > Do you see any optimization opportunity?
>> >
>> > Ondra
>> >
>> > * sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S: New file
>> > * sysdeps/x86_64/multiarch/strstr-c.c: Moved to ...
>> > * sysdeps/x86_64/multiarch/strstr.c: ... here.
>> > (strstr): Add __strstr_sse2_unaligned ifunc.
>> > * sysdeps/x86_64/multiarch/strcasestr-c.c: Moved to ...
>> > * sysdeps/x86_64/multiarch/strcasestr.c ... here.
>> > (strcasestr): Remove __strcasestr_sse42 ifunc.
>> > * sysdeps/x86_64/multiarch/strcasestr-nonascii.c: Remove.
>> > * sysdeps/x86_64/multiarch/ifunc-impl-list.c: Update.
>> > * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Update.
>> >
>> >
>> > ---
>> > sysdeps/x86_64/multiarch/Makefile | 10 +-
>> > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 4 +-
>> > sysdeps/x86_64/multiarch/strcasestr-c.c | 19 --
>> > sysdeps/x86_64/multiarch/strcasestr-nonascii.c | 50 ---
>> > sysdeps/x86_64/multiarch/strcasestr.c | 23 +-
>> > sysdeps/x86_64/multiarch/strstr-c.c | 47 ---
>> > sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S | 381 ++++++++++++++++++++++
>> > sysdeps/x86_64/multiarch/strstr.c | 388 ++---------------------
>> > 8 files changed, 428 insertions(+), 494 deletions(-)
>> > delete mode 100644 sysdeps/x86_64/multiarch/strcasestr-c.c
>> > delete mode 100644 sysdeps/x86_64/multiarch/strcasestr-nonascii.c
>> > delete mode 100644 sysdeps/x86_64/multiarch/strstr-c.c
>> > create mode 100644 sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S
>> >
>> > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
>> > index 203d16e..146e800 100644
>> > --- a/sysdeps/x86_64/multiarch/Makefile
>> > +++ b/sysdeps/x86_64/multiarch/Makefile
>> > @@ -9,22 +9,20 @@ ifeq ($(subdir),string)
>> > sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
>> > strend-sse4 memcmp-sse4 memcpy-ssse3 memcpy-sse2-unaligned mempcpy-ssse3 \
>> > memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
>> > - memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \
>> > + memmove-ssse3-back strcasecmp_l-ssse3 \
>> > strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
>> > strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
>> > strcpy-sse2-unaligned strncpy-sse2-unaligned \
>> > stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
>> > strcat-sse2-unaligned strncat-sse2-unaligned \
>> > - strrchr-sse2-no-bsf strchr-sse2-no-bsf memcmp-ssse3
>> > + strrchr-sse2-no-bsf strchr-sse2-no-bsf memcmp-ssse3 \
>> > + strstr-sse2-unaligned
>> > ifeq (yes,$(config-cflags-sse4))
>> > -sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift
>> > +sysdep_routines += strcspn-c strpbrk-c strspn-c varshift
>> > CFLAGS-varshift.c += -msse4
>> > CFLAGS-strcspn-c.c += -msse4
>> > CFLAGS-strpbrk-c.c += -msse4
>> > CFLAGS-strspn-c.c += -msse4
>> > -CFLAGS-strstr.c += -msse4
>> > -CFLAGS-strcasestr.c += -msse4
>> > -CFLAGS-strcasestr-nonascii.c += -msse4
>> > endif
>> > endif
>> >
>> > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
>> > index 28d3579..0eaddc7 100644
>> > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
>> > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
>> > @@ -104,8 +104,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>> >
>> > /* Support sysdeps/x86_64/multiarch/strcasestr.c. */
>> > IFUNC_IMPL (i, name, strcasestr,
>> > - IFUNC_IMPL_ADD (array, i, strcasestr, HAS_SSE4_2,
>> > - __strcasestr_sse42)
>> > IFUNC_IMPL_ADD (array, i, strcasestr, 1, __strcasestr_sse2))
>> >
>> > /* Support sysdeps/x86_64/multiarch/strcat.S. */
>> > @@ -196,7 +194,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>> >
>> > /* Support sysdeps/x86_64/multiarch/strstr-c.c. */
>> > IFUNC_IMPL (i, name, strstr,
>> > - IFUNC_IMPL_ADD (array, i, strstr, HAS_SSE4_2, __strstr_sse42)
>> > + IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned)
>> > IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2))
>> >
>> > /* Support sysdeps/x86_64/multiarch/wcscpy.S. */
>> > diff --git a/sysdeps/x86_64/multiarch/strcasestr-c.c b/sysdeps/x86_64/multiarch/strcasestr-c.c
>> > deleted file mode 100644
>> > index c13a4c4..0000000
>> > --- a/sysdeps/x86_64/multiarch/strcasestr-c.c
>> > +++ /dev/null
>> > @@ -1,19 +0,0 @@
>> > -/* Multiple versions of strcasestr
>> > - All versions must be listed in ifunc-impl-list.c. */
>> > -
>> > -#include "init-arch.h"
>> > -
>> > -#define STRCASESTR __strcasestr_sse2
>> > -
>> > -#include "string/strcasestr.c"
>> > -
>> > -extern char *__strcasestr_sse42 (const char *, const char *) attribute_hidden;
>> > -extern __typeof (__strcasestr_sse2) __strcasestr_sse2 attribute_hidden;
>> > -
>> > -#if 1
>> > -libc_ifunc (__strcasestr,
>> > - HAS_SSE4_2 ? __strcasestr_sse42 : __strcasestr_sse2);
>> > -#else
>> > -libc_ifunc (__strcasestr,
>> > - 0 ? __strcasestr_sse42 : __strcasestr_sse2);
>> > -#endif
>> > diff --git a/sysdeps/x86_64/multiarch/strcasestr-nonascii.c b/sysdeps/x86_64/multiarch/strcasestr-nonascii.c
>> > deleted file mode 100644
>> > index 032a642..0000000
>> > --- a/sysdeps/x86_64/multiarch/strcasestr-nonascii.c
>> > +++ /dev/null
>> > @@ -1,50 +0,0 @@
>> > -/* strstr with SSE4.2 intrinsics
>> > - Copyright (C) 2010-2013 Free Software Foundation, Inc.
>> > - This file is part of the GNU C Library.
>> > -
>> > - The GNU C Library is free software; you can redistribute it and/or
>> > - modify it under the terms of the GNU Lesser General Public
>> > - License as published by the Free Software Foundation; either
>> > - version 2.1 of the License, or (at your option) any later version.
>> > -
>> > - The GNU C Library is distributed in the hope that it will be useful,
>> > - but WITHOUT ANY WARRANTY; without even the implied warranty of
>> > - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
>> > - Lesser General Public License for more details.
>> > -
>> > - You should have received a copy of the GNU Lesser General Public
>> > - License along with the GNU C Library; if not, see
>> > - <http://www.gnu.org/licenses/>. */
>> > -
>> > -#include <ctype.h>
>> > -#include <xmmintrin.h>
>> > -
>> > -
>> > -/* Similar to __m128i_strloadu. Convert to lower case for none-POSIX/C
>> > - locale. */
>> > -static __m128i
>> > -__m128i_strloadu_tolower (const unsigned char *p)
>> > -{
>> > - union
>> > - {
>> > - char b[16];
>> > - __m128i x;
>> > - } u;
>> > -
>> > - for (int i = 0; i < 16; ++i)
>> > - if (p[i] == 0)
>> > - {
>> > - u.b[i] = 0;
>> > - break;
>> > - }
>> > - else
>> > - u.b[i] = tolower (p[i]);
>> > -
>> > - return u.x;
>> > -}
>> > -
>> > -
>> > -#define STRCASESTR_NONASCII
>> > -#define USE_AS_STRCASESTR
>> > -#define STRSTR_SSE42 __strcasestr_sse42_nonascii
>> > -#include "strstr.c"
>> > diff --git a/sysdeps/x86_64/multiarch/strcasestr.c b/sysdeps/x86_64/multiarch/strcasestr.c
>> > index d1cfb3b..2c2454a 100644
>> > --- a/sysdeps/x86_64/multiarch/strcasestr.c
>> > +++ b/sysdeps/x86_64/multiarch/strcasestr.c
>> > @@ -1,7 +1,18 @@
>> > -extern char *__strcasestr_sse42_nonascii (const unsigned char *s1,
>> > - const unsigned char *s2)
>> > - attribute_hidden;
>> > +/* Multiple versions of strcasestr
>> > + All versions must be listed in ifunc-impl-list.c. */
>> >
>> > -#define USE_AS_STRCASESTR
>> > -#define STRSTR_SSE42 __strcasestr_sse42
>> > -#include "strstr.c"
>> > +#include "init-arch.h"
>> > +
>> > +#define STRCASESTR __strcasestr_sse2
>> > +
>> > +#include "string/strcasestr.c"
>> > +
>> > +extern __typeof (__strcasestr_sse2) __strcasestr_sse2 attribute_hidden;
>> > +
>> > +#if 1
>> > +libc_ifunc (__strcasestr,
>> > + __strcasestr_sse2);
>> > +#else
>> > +libc_ifunc (__strcasestr,
>> > + 0 ? __strcasestr_sse42 : __strcasestr_sse2);
>> > +#endif
>> > diff --git a/sysdeps/x86_64/multiarch/strstr-c.c b/sysdeps/x86_64/multiarch/strstr-c.c
>> > deleted file mode 100644
>> > index 42bbe48..0000000
>> > --- a/sysdeps/x86_64/multiarch/strstr-c.c
>> > +++ /dev/null
>> > @@ -1,47 +0,0 @@
>> > -/* Multiple versions of strstr.
>> > - All versions must be listed in ifunc-impl-list.c.
>> > - Copyright (C) 2012-2013 Free Software Foundation, Inc.
>> > - This file is part of the GNU C Library.
>> > -
>> > - The GNU C Library is free software; you can redistribute it and/or
>> > - modify it under the terms of the GNU Lesser General Public
>> > - License as published by the Free Software Foundation; either
>> > - version 2.1 of the License, or (at your option) any later version.
>> > -
>> > - The GNU C Library is distributed in the hope that it will be useful,
>> > - but WITHOUT ANY WARRANTY; without even the implied warranty of
>> > - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
>> > - Lesser General Public License for more details.
>> > -
>> > - You should have received a copy of the GNU Lesser General Public
>> > - License along with the GNU C Library; if not, see
>> > - <http://www.gnu.org/licenses/>. */
>> > -
>> > -/* Redefine strstr so that the compiler won't complain about the type
>> > - mismatch with the IFUNC selector in strong_alias, below. */
>> > -#undef strstr
>> > -#define strstr __redirect_strstr
>> > -#include <string.h>
>> > -#undef strstr
>> > -
>> > -#define STRSTR __strstr_sse2
>> > -#ifdef SHARED
>> > -# undef libc_hidden_builtin_def
>> > -# define libc_hidden_builtin_def(name) \
>> > - __hidden_ver1 (__strstr_sse2, __GI_strstr, __strstr_sse2);
>> > -#endif
>> > -
>> > -#include "string/strstr.c"
>> > -
>> > -extern __typeof (__redirect_strstr) __strstr_sse42 attribute_hidden;
>> > -extern __typeof (__redirect_strstr) __strstr_sse2 attribute_hidden;
>> > -
>> > -#include "init-arch.h"
>> > -
>> > -/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
>> > - ifunc symbol properly. */
>> > -extern __typeof (__redirect_strstr) __libc_strstr;
>> > -libc_ifunc (__libc_strstr, HAS_SSE4_2 ? __strstr_sse42 : __strstr_sse2)
>> > -
>> > -#undef strstr
>> > -strong_alias (__libc_strstr, strstr)
>> > diff --git a/sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S
>> > new file mode 100644
>> > index 0000000..6c134fa
>> > --- /dev/null
>> > +++ b/sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S
>> > @@ -0,0 +1,381 @@
>> > +/* strstr with unaligned loads
>> > + Copyright (C) 2009-2013 Free Software Foundation, Inc.
>> > + This file is part of the GNU C Library.
>> > +
>> > + The GNU C Library is free software; you can redistribute it and/or
>> > + modify it under the terms of the GNU Lesser General Public
>> > + License as published by the Free Software Foundation; either
>> > + version 2.1 of the License, or (at your option) any later version.
>> > +
>> > + The GNU C Library is distributed in the hope that it will be useful,
>> > + but WITHOUT ANY WARRANTY; without even the implied warranty of
>> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
>> > + Lesser General Public License for more details.
>> > +
>> > + You should have received a copy of the GNU Lesser General Public
>> > + License along with the GNU C Library; if not, see
>> > + <http://www.gnu.org/licenses/>. */
>> > +
>> > +#include <sysdep.h>
>> > +
>> > +#ifndef ALIGN
>> > +# define ALIGN(n) .p2align n
>> > +#endif
>> > +
>> > +
>> > +ENTRY(__strstr_sse2_unaligned)
>> > + movzbl (%rsi), %eax
>> > + testb %al, %al
>> > + je L(empty)
>> > + movzbl 1(%rsi), %edx
>> > + testb %dl, %dl
>> > + je L(strchr)
>> > + movd %eax, %xmm1
>> > + movd %edx, %xmm2
>> > + movq %rdi, %rax
>> > + andl $4095, %eax
>> > + punpcklbw %xmm1, %xmm1
>> > + cmpq $4031, %rax
>> > + punpcklbw %xmm2, %xmm2
>> > + punpcklwd %xmm1, %xmm1
>> > + punpcklwd %xmm2, %xmm2
>> > + pshufd $0, %xmm1, %xmm1
>> > + pshufd $0, %xmm2, %xmm2
>> > + ja L(cross_page)
>> > + movdqu (%rdi), %xmm3
>> > + pxor %xmm5, %xmm5
>> > + movdqu 1(%rdi), %xmm4
>> > + movdqa %xmm3, %xmm6
>> > + pcmpeqb %xmm1, %xmm3
>> > + pcmpeqb %xmm2, %xmm4
>> > + movdqu 16(%rdi), %xmm0
>> > + pcmpeqb %xmm5, %xmm6
>> > + pminub %xmm4, %xmm3
>> > + movdqa %xmm3, %xmm4
>> > + movdqu 17(%rdi), %xmm3
>> > + pcmpeqb %xmm0, %xmm5
>> > + pcmpeqb %xmm2, %xmm3
>> > + por %xmm6, %xmm4
>> > + pcmpeqb %xmm1, %xmm0
>> > + pminub %xmm3, %xmm0
>> > + por %xmm5, %xmm0
>> > + pmovmskb %xmm4, %r8d
>> > + pmovmskb %xmm0, %eax
>> > + salq $16, %rax
>> > + orq %rax, %r8
>> > + je L(next_32_bytes)
>> > +L(next_pair_index):
>> > + bsf %r8, %rax
>> > + addq %rdi, %rax
>> > + cmpb $0, (%rax)
>> > + je L(zero1)
>> > + movzbl 2(%rsi), %edx
>> > + testb %dl, %dl
>> > + je L(found1)
>> > + cmpb 2(%rax), %dl
>> > + jne L(next_pair)
>> > + xorl %edx, %edx
>> > + jmp L(pair_loop_start)
>> > +
>> > + ALIGN (4)
>> > +L(strchr):
>> > + movzbl %al, %esi
>> > + jmp __strchr_sse2
>> > +
>> > + ALIGN (4)
>> > +L(pair_loop):
>> > + addq $1, %rdx
>> > + cmpb 2(%rax,%rdx), %cl
>> > + jne L(next_pair)
>> > +L(pair_loop_start):
>> > + movzbl 3(%rsi,%rdx), %ecx
>> > + testb %cl, %cl
>> > + jne L(pair_loop)
>> > +L(found1):
>> > + ret
>> > +L(zero1):
>> > + xorl %eax, %eax
>> > + ret
>> > +
>> > + ALIGN (4)
>> > +L(next_pair):
>> > + leaq -1(%r8), %rax
>> > + andq %rax, %r8
>> > + jne L(next_pair_index)
>> > +
>> > + ALIGN (4)
>> > +L(next_32_bytes):
>> > + movdqu 32(%rdi), %xmm3
>> > + pxor %xmm5, %xmm5
>> > + movdqu 33(%rdi), %xmm4
>> > + movdqa %xmm3, %xmm6
>> > + pcmpeqb %xmm1, %xmm3
>> > + pcmpeqb %xmm2, %xmm4
>> > + movdqu 48(%rdi), %xmm0
>> > + pcmpeqb %xmm5, %xmm6
>> > + pminub %xmm4, %xmm3
>> > + movdqa %xmm3, %xmm4
>> > + movdqu 49(%rdi), %xmm3
>> > + pcmpeqb %xmm0, %xmm5
>> > + pcmpeqb %xmm2, %xmm3
>> > + por %xmm6, %xmm4
>> > + pcmpeqb %xmm1, %xmm0
>> > + pminub %xmm3, %xmm0
>> > + por %xmm5, %xmm0
>> > + pmovmskb %xmm4, %eax
>> > + salq $32, %rax
>> > + pmovmskb %xmm0, %r8d
>> > + salq $48, %r8
>> > + orq %rax, %r8
>> > + je L(loop_header)
>> > +L(next_pair2_index):
>> > + bsfq %r8, %rax
>> > + addq %rdi, %rax
>> > + cmpb $0, (%rax)
>> > + je L(zero2)
>> > + movzbl 2(%rsi), %edx
>> > + testb %dl, %dl
>> > + je L(found2)
>> > + cmpb 2(%rax), %dl
>> > + jne L(next_pair2)
>> > + xorl %edx, %edx
>> > + jmp L(pair_loop2_start)
>> > +
>> > + ALIGN (4)
>> > +L(pair_loop2):
>> > + addq $1, %rdx
>> > + cmpb 2(%rax,%rdx), %cl
>> > + jne L(next_pair2)
>> > +L(pair_loop2_start):
>> > + movzbl 3(%rsi,%rdx), %ecx
>> > + testb %cl, %cl
>> > + jne L(pair_loop2)
>> > +L(found2):
>> > + ret
>> > + L(zero2):
>> > + xorl %eax, %eax
>> > + ret
>> > +L(empty):
>> > + mov %rdi, %rax
>> > + ret
>> > +
>> > + ALIGN (4)
>> > +L(next_pair2):
>> > + leaq -1(%r8), %rax
>> > + andq %rax, %r8
>> > + jne L(next_pair2_index)
>> > +L(loop_header):
>> > + movq $-512, %r11
>> > + movq %rdi, %r9
>> > +
>> > + pxor %xmm7, %xmm7
>> > + andq $-64, %rdi
>> > +
>> > + ALIGN(4)
>> > +L(loop):
>> > + movdqa 64(%rdi), %xmm3
>> > + movdqu 63(%rdi), %xmm6
>> > + movdqa %xmm3, %xmm0
>> > + pxor %xmm2, %xmm3
>> > + pxor %xmm1, %xmm6
>> > + movdqa 80(%rdi), %xmm10
>> > + por %xmm3, %xmm6
>> > + pminub %xmm10, %xmm0
>> > + movdqu 79(%rdi), %xmm3
>> > + pxor %xmm2, %xmm10
>> > + pxor %xmm1, %xmm3
>> > + movdqa 96(%rdi), %xmm9
>> > + por %xmm10, %xmm3
>> > + pminub %xmm9, %xmm0
>> > + pxor %xmm2, %xmm9
>> > + movdqa 112(%rdi), %xmm8
>> > + addq $64, %rdi
>> > + pminub %xmm6, %xmm3
>> > + movdqu 31(%rdi), %xmm4
>> > + pminub %xmm8, %xmm0
>> > + pxor %xmm2, %xmm8
>> > + pxor %xmm1, %xmm4
>> > + por %xmm9, %xmm4
>> > + pminub %xmm4, %xmm3
>> > + movdqu 47(%rdi), %xmm5
>> > + pxor %xmm1, %xmm5
>> > + por %xmm8, %xmm5
>> > + pminub %xmm5, %xmm3
>> > + pminub %xmm3, %xmm0
>> > + pcmpeqb %xmm7, %xmm0
>> > + pmovmskb %xmm0, %eax
>> > + testl %eax, %eax
>> > + je L(loop)
>> > + pminub (%rdi), %xmm6
>> > + pminub 32(%rdi),%xmm4
>> > + pminub 48(%rdi),%xmm5
>> > + pcmpeqb %xmm7, %xmm6
>> > + pcmpeqb %xmm7, %xmm5
>> > + pmovmskb %xmm6, %edx
>> > + movdqa 16(%rdi), %xmm8
>> > + pcmpeqb %xmm7, %xmm4
>> > + movdqu 15(%rdi), %xmm0
>> > + pmovmskb %xmm5, %r8d
>> > + movdqa %xmm8, %xmm3
>> > + pmovmskb %xmm4, %ecx
>> > + pcmpeqb %xmm1,%xmm0
>> > + pcmpeqb %xmm2,%xmm3
>> > + salq $32, %rcx
>> > + pcmpeqb %xmm7,%xmm8
>> > + salq $48, %r8
>> > + pminub %xmm0,%xmm3
>> > + orq %rcx, %rdx
>> > + por %xmm3,%xmm8
>> > + orq %rdx, %r8
>> > + pmovmskb %xmm8, %eax
>> > + salq $16, %rax
>> > + orq %rax, %r8
>> > +L(next_pair_index3):
>> > + bsfq %r8, %rcx
>> > + addq %rdi, %rcx
>> > + cmpb $0, (%rcx)
>> > + je L(zero)
>> > + xorl %eax, %eax
>> > + movzbl 2(%rsi), %edx
>> > + testb %dl, %dl
>> > + je L(success3)
>> > + cmpb 1(%rcx), %dl
>> > + jne L(next_pair3)
>> > + jmp L(pair_loop_start3)
>> > +
>> > + ALIGN (4)
>> > +L(pair_loop3):
>> > + addq $1, %rax
>> > + cmpb 1(%rcx,%rax), %dl
>> > + jne L(next_pair3)
>> > +L(pair_loop_start3):
>> > + movzbl 3(%rsi,%rax), %edx
>> > + testb %dl, %dl
>> > + jne L(pair_loop3)
>> > +L(success3):
>> > + lea -1(%rcx), %rax
>> > + ret
>> > +
>> > + ALIGN (4)
>> > +L(next_pair3):
>> > + addq %rax, %r11
>> > + movq %rdi, %rax
>> > + subq %r9, %rax
>> > + cmpq %r11, %rax
>> > + jl L(switch_strstr)
>> > + leaq -1(%r8), %rax
>> > + andq %rax, %r8
>> > + jne L(next_pair_index3)
>> > + jmp L(loop)
>> > +
>> > + ALIGN (4)
>> > +L(switch_strstr):
>> > + movq %rdi, %rdi
>> > + jmp __strstr_sse2
>> > +
>> > +
>> > + ALIGN (4)
>> > +L(cross_page):
>> > +
>> > + movq %rdi, %rax
>> > + pxor %xmm0, %xmm0
>> > + andq $-64, %rax
>> > + movdqa (%rax), %xmm3
>> > + movdqu -1(%rax), %xmm4
>> > + movdqa %xmm3, %xmm8
>> > + movdqa 16(%rax), %xmm5
>> > + pcmpeqb %xmm1, %xmm4
>> > + pcmpeqb %xmm0, %xmm8
>> > + pcmpeqb %xmm2, %xmm3
>> > + movdqa %xmm5, %xmm7
>> > + pminub %xmm4, %xmm3
>> > + movdqu 15(%rax), %xmm4
>> > + pcmpeqb %xmm0, %xmm7
>> > + por %xmm3, %xmm8
>> > + movdqa %xmm5, %xmm3
>> > + movdqa 32(%rax), %xmm5
>> > + pcmpeqb %xmm1, %xmm4
>> > + pcmpeqb %xmm2, %xmm3
>> > + movdqa %xmm5, %xmm6
>> > + pmovmskb %xmm8, %ecx
>> > + pminub %xmm4, %xmm3
>> > + movdqu 31(%rax), %xmm4
>> > + por %xmm3, %xmm7
>> > + movdqa %xmm5, %xmm3
>> > + pcmpeqb %xmm0, %xmm6
>> > + movdqa 48(%rax), %xmm5
>> > + pcmpeqb %xmm1, %xmm4
>> > + pmovmskb %xmm7, %r8d
>> > + pcmpeqb %xmm2, %xmm3
>> > + pcmpeqb %xmm5, %xmm0
>> > + pminub %xmm4, %xmm3
>> > + movdqu 47(%rax), %xmm4
>> > + por %xmm3, %xmm6
>> > + movdqa %xmm5, %xmm3
>> > + salq $16, %r8
>> > + pcmpeqb %xmm1, %xmm4
>> > + pcmpeqb %xmm2, %xmm3
>> > + pmovmskb %xmm6, %r10d
>> > + pminub %xmm4, %xmm3
>> > + por %xmm3, %xmm0
>> > + salq $32, %r10
>> > + orq %r10, %r8
>> > + orq %rcx, %r8
>> > + movl %edi, %ecx
>> > + pmovmskb %xmm0, %edx
>> > + subl %eax, %ecx
>> > + salq $48, %rdx
>> > + orq %rdx, %r8
>> > + shrq %cl, %r8
>> > + je L(loop_header)
>> > +L(next_pair_index4):
>> > + bsfq %r8, %rax
>> > + addq %rdi, %rax
>> > + cmpb $0, (%rax)
>> > + je L(zero)
>> > +
>> > + cmpq %rax,%rdi
>> > + je L(next_pair4)
>> > +
>> > + movzbl 2(%rsi), %edx
>> > + testb %dl, %dl
>> > + je L(found3)
>> > + cmpb 1(%rax), %dl
>> > + jne L(next_pair4)
>> > + xorl %edx, %edx
>> > + jmp L(pair_loop_start4)
>> > +
>> > +
>> > + ALIGN (4)
>> > +L(pair_loop4):
>> > + addq $1, %rdx
>> > + cmpb 1(%rax,%rdx), %cl
>> > + jne L(next_pair4)
>> > +L(pair_loop_start4):
>> > + movzbl 3(%rsi,%rdx), %ecx
>> > + testb %cl, %cl
>> > + jne L(pair_loop4)
>> > +L(found3):
>> > + subq $1, %rax
>> > + ret
>> > +
>> > +ALIGN (4)
>> > +L(next_pair4):
>> > + leaq -1(%r8), %rax
>> > + andq %rax, %r8
>> > + jne L(next_pair_index4)
>> > + jmp L(loop_header)
>> > +
>> > + ALIGN (4)
>> > +L(found):
>> > + rep
>> > + ret
>> > +
>> > + ALIGN (4)
>> > +L(zero):
>> > + xorl %eax, %eax
>> > + ret
>> > +
>> > +
>> > +END(__strstr_sse2_unaligned)
>> > diff --git a/sysdeps/x86_64/multiarch/strstr.c b/sysdeps/x86_64/multiarch/strstr.c
>> > index cd63b68..fbff3a8 100644
>> > --- a/sysdeps/x86_64/multiarch/strstr.c
>> > +++ b/sysdeps/x86_64/multiarch/strstr.c
>> > @@ -1,6 +1,6 @@
>> > -/* strstr with SSE4.2 intrinsics
>> > - Copyright (C) 2009-2013 Free Software Foundation, Inc.
>> > - Contributed by Intel Corporation.
>> > +/* Multiple versions of strstr.
>> > + All versions must be listed in ifunc-impl-list.c.
>> > + Copyright (C) 2012-2013 Free Software Foundation, Inc.
>> > This file is part of the GNU C Library.
>> >
>> > The GNU C Library is free software; you can redistribute it and/or
>> > @@ -17,369 +17,31 @@
>> > License along with the GNU C Library; if not, see
>> > <http://www.gnu.org/licenses/>. */
>> >
>> > -#include <nmmintrin.h>
>> > -#include "varshift.h"
>> > -
>> > -#ifndef STRSTR_SSE42
>> > -# define STRSTR_SSE42 __strstr_sse42
>> > -#endif
>> > -
>> > -#ifdef USE_AS_STRCASESTR
>> > -# include <ctype.h>
>> > -# include <locale/localeinfo.h>
>> > -
>> > -# define LOADBYTE(C) tolower (C)
>> > -# define CMPBYTE(C1, C2) (tolower (C1) == tolower (C2))
>> > -#else
>> > -# define LOADBYTE(C) (C)
>> > -# define CMPBYTE(C1, C2) ((C1) == (C2))
>> > -#endif
>> > -
>> > -/* We use 0xe ordered-compare:
>> > - _SIDD_SBYTE_OPS
>> > - | _SIDD_CMP_EQUAL_ORDER
>> > - | _SIDD_LEAST_SIGNIFICANT
>> > - on pcmpistri to do the scanning and string comparsion requirements of
>> > - sub-string match. In the scanning phase, we process Cflag and ECX
>> > - index to locate the first fragment match; once the first fragment
>> > - match position has been identified, we do comparison of subsequent
>> > - string fragments until we can conclude false or true match; whe
>> > - n concluding a false match, we may need to repeat scanning process
>> > - from next relevant offset in the target string.
>> > -
>> > - In the scanning phase we have 4 cases:
>> > - case ECX CFlag ZFlag SFlag
>> > - 1 16 0 0 0
>> > - 2a 16 0 0 1
>> > - 2b 16 0 1 0
>> > - 2c 16 0 1 1
>> > -
>> > - 1. No ordered-comparison match, both 16B fragments are valid, so
>> > - continue to next fragment.
>> > - 2. No ordered-comparison match, there is EOS in either fragment,
>> > - 2a. Zflg = 0, Sflg = 1, we continue
>> > - 2b. Zflg = 1, Sflg = 0, we conclude no match and return.
>> > - 2c. Zflg = 1, sflg = 1, lenth determine match or no match
>> > -
>> > - In the string comparison phase, the 1st fragment match is fixed up
>> > - to produce ECX = 0. Subsequent fragment compare of nonzero index
>> > - and no match conclude a false match.
>> > -
>> > - case ECX CFlag ZFlag SFlag
>> > - 3 X 1 0 0/1
>> > - 4a 0 1 0 0
>> > - 4b 0 1 0 1
>> > - 4c 0 < X 1 0 0/1
>> > - 5 16 0 1 0
>> > -
>> > - 3. An initial ordered-comparison fragment match, we fix up to do
>> > - subsequent string comparison
>> > - 4a. Continuation of fragment comparison of a string compare.
>> > - 4b. EOS reached in the reference string, we conclude true match and
>> > - return
>> > - 4c. String compare failed if index is nonzero, we need to go back to
>> > - scanning
>> > - 5. failed string compare, go back to scanning
>> > - */
>> > -
>> > -#if !(defined USE_AS_STRCASESTR && defined STRCASESTR_NONASCII)
>> > -/* Simple replacement of movdqu to address 4KB boundary cross issue.
>> > - If EOS occurs within less than 16B before 4KB boundary, we don't
>> > - cross to next page. */
>> > -static __m128i
>> > -__m128i_strloadu (const unsigned char * p, __m128i zero)
>> > -{
>> > - if (__builtin_expect ((int) ((size_t) p & 0xfff) > 0xff0, 0))
>> > - {
>> > - size_t offset = ((size_t) p & (16 - 1));
>> > - __m128i a = _mm_load_si128 ((__m128i *) (p - offset));
>> > - int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (a, zero));
>> > - if ((bmsk >> offset) != 0)
>> > - return __m128i_shift_right (a, offset);
>> > - }
>> > - return _mm_loadu_si128 ((__m128i *) p);
>> > -}
>> > -#endif
>> > -
>> > -#if defined USE_AS_STRCASESTR && !defined STRCASESTR_NONASCII
>> > -
>> > -/* Similar to __m128i_strloadu. Convert to lower case for POSIX/C
>> > - locale and other which have single-byte letters only in the ASCII
>> > - range. */
>> > -static __m128i
>> > -__m128i_strloadu_tolower (const unsigned char *p, __m128i zero, __m128i uclow,
>> > - __m128i uchigh, __m128i lcqword)
>> > -{
>> > - __m128i frag = __m128i_strloadu (p, zero);
>> > -
>> > - /* Compare if 'Z' > bytes. Inverted way to get a mask for byte <= 'Z'. */
>> > - __m128i r2 = _mm_cmpgt_epi8 (uchigh, frag);
>> > - /* Compare if bytes are > 'A' - 1. */
>> > - __m128i r1 = _mm_cmpgt_epi8 (frag, uclow);
>> > - /* Mask byte == ff if byte(r2) <= 'Z' and byte(r1) > 'A' - 1. */
>> > - __m128i mask = _mm_and_si128 (r2, r1);
>> > - /* Apply lowercase bit 6 mask for above mask bytes == ff. */
>> > - return _mm_or_si128 (frag, _mm_and_si128 (mask, lcqword));
>> > -}
>> > -
>> > -#endif
>> > -
>> > -/* Calculate Knuth-Morris-Pratt string searching algorithm (or KMP
>> > - algorithm) overlap for a fully populated 16B vector.
>> > - Input parameter: 1st 16Byte loaded from the reference string of a
>> > - strstr function.
>> > - We don't use KMP algorithm if reference string is less than 16B. */
>> > -static int
>> > -__inline__ __attribute__ ((__always_inline__,))
>> > -KMP16Bovrlap (__m128i s2)
>> > -{
>> > - __m128i b = _mm_unpacklo_epi8 (s2, s2);
>> > - __m128i a = _mm_unpacklo_epi8 (b, b);
>> > - a = _mm_shuffle_epi32 (a, 0);
>> > - b = _mm_srli_si128 (s2, sizeof (char));
>> > - int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (b, a));
>> > -
>> > - /* _BitScanForward(&k1, bmsk); */
>> > - int k1;
>> > - __asm ("bsfl %[bmsk], %[k1]" : [k1] "=r" (k1) : [bmsk] "r" (bmsk));
>> > - if (!bmsk)
>> > - return 16;
>> > - else if (bmsk == 0x7fff)
>> > - return 1;
>> > - else if (!k1)
>> > - {
>> > - /* There are al least two distinct chars in s2. If byte 0 and 1 are
>> > - idential and the distinct value lies farther down, we can deduce
>> > - the next byte offset to restart full compare is least no earlier
>> > - than byte 3. */
>> > - return 3;
>> > - }
>> > - else
>> > - {
>> > - /* Byte 1 is not degenerated to byte 0. */
>> > - return k1 + 1;
>> > - }
>> > -}
>> > -
>> > -char *
>> > -__attribute__ ((section (".text.sse4.2")))
>> > -STRSTR_SSE42 (const unsigned char *s1, const unsigned char *s2)
>> > -{
>> > -#define p1 s1
>> > - const unsigned char *p2 = s2;
>> > -
>> > -#ifndef STRCASESTR_NONASCII
>> > - if (__builtin_expect (p2[0] == '\0', 0))
>> > - return (char *) p1;
>> > -
>> > - if (__builtin_expect (p1[0] == '\0', 0))
>> > - return NULL;
>> > -
>> > - /* Check if p1 length is 1 byte long. */
>> > - if (__builtin_expect (p1[1] == '\0', 0))
>> > - return p2[1] == '\0' && CMPBYTE (p1[0], p2[0]) ? (char *) p1 : NULL;
>> > -#endif
>> > -
>> > -#ifdef USE_AS_STRCASESTR
>> > -# ifndef STRCASESTR_NONASCII
>> > - if (__builtin_expect (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_NONASCII_CASE)
>> > - != 0, 0))
>> > - return __strcasestr_sse42_nonascii (s1, s2);
>> > -
>> > - const __m128i uclow = _mm_set1_epi8 (0x40);
>> > - const __m128i uchigh = _mm_set1_epi8 (0x5b);
>> > - const __m128i lcqword = _mm_set1_epi8 (0x20);
>> > - const __m128i zero = _mm_setzero_si128 ();
>> > -# define strloadu(p) __m128i_strloadu_tolower (p, zero, uclow, uchigh, lcqword)
>> > -# else
>> > -# define strloadu __m128i_strloadu_tolower
>> > -# define zero _mm_setzero_si128 ()
>> > -# endif
>> > -#else
>> > -# define strloadu(p) __m128i_strloadu (p, zero)
>> > - const __m128i zero = _mm_setzero_si128 ();
>> > +/* Redefine strstr so that the compiler won't complain about the type
>> > + mismatch with the IFUNC selector in strong_alias, below. */
>> > +#undef strstr
>> > +#define strstr __redirect_strstr
>> > +#include <string.h>
>> > +#undef strstr
>> > +
>> > +#define STRSTR __strstr_sse2
>> > +#ifdef SHARED
>> > +# undef libc_hidden_builtin_def
>> > +# define libc_hidden_builtin_def(name) \
>> > + __hidden_ver1 (__strstr_sse2, __GI_strstr, __strstr_sse2);
>> > #endif
>> >
>> > - /* p1 > 1 byte long. Load up to 16 bytes of fragment. */
>> > - __m128i frag1 = strloadu (p1);
>> > -
>> > - __m128i frag2;
>> > - if (p2[1] != '\0')
>> > - /* p2 is > 1 byte long. */
>> > - frag2 = strloadu (p2);
>> > - else
>> > - frag2 = _mm_insert_epi8 (zero, LOADBYTE (p2[0]), 0);
>> > -
>> > - /* Unsigned bytes, equal order, does frag2 has null? */
>> > - int cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
>> > - int cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
>> > - int cmp = _mm_cmpistri (frag2, frag1, 0x0c);
>> > - int cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
>> > - if (cmp_s & cmp_c)
>> > - {
>> > - int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (frag2, zero));
>> > - int len;
>> > - __asm ("bsfl %[bmsk], %[len]"
>> > - : [len] "=r" (len) : [bmsk] "r" (bmsk));
>> > - p1 += cmp;
>> > - if ((len + cmp) <= 16)
>> > - return (char *) p1;
>> > -
>> > - /* Load up to 16 bytes of fragment. */
>> > - frag1 = strloadu (p1);
>> > - cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
>> > - cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
>> > - cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
>> > - cmp = _mm_cmpistri (frag2, frag1, 0x0c);
>> > - if ((len + cmp) <= 16)
>> > - return (char *) p1 + cmp;
>> > - }
>> > -
>> > - if (cmp_s)
>> > - {
>> > -	  /* Adjust addr for 16B alignment in ensuing loop.  */
>> > - while (!cmp_z)
>> > - {
>> > - p1 += cmp;
>> > - /* Load up to 16 bytes of fragment. */
>> > - frag1 = strloadu (p1);
>> > - cmp = _mm_cmpistri (frag2, frag1, 0x0c);
>> > - cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
>> > - cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
>> > - /* Because s2 < 16 bytes and we adjusted p1 by non-zero cmp
>> > - once already, this time cmp will be zero and we can exit. */
>> > - if ((!cmp) & cmp_c)
>> > - break;
>> > - }
>> > -
>> > - if (!cmp_c)
>> > - return NULL;
>> > -
>> > -      /* Since s2 is less than 16 bytes, cmp_c is definitive
>> > -	 determination of full match.  */
>> > - return (char *) p1 + cmp;
>> > - }
>> > -
>> > - /* General case, s2 is at least 16 bytes or more.
>> > - First, the common case of false-match at first byte of p2. */
>> > - const unsigned char *pt = NULL;
>> > - int kmp_fwd = 0;
>> > -re_trace:
>> > - while (!cmp_c)
>> > - {
>> > - /* frag1 has null. */
>> > - if (cmp_z)
>> > - return NULL;
>> > -
>> > - /* frag 1 has no null, advance 16 bytes. */
>> > - p1 += 16;
>> > - /* Load up to 16 bytes of fragment. */
>> > - frag1 = strloadu (p1);
>> > - /* Unsigned bytes, equal order, is there a partial match? */
>> > - cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
>> > - cmp = _mm_cmpistri (frag2, frag1, 0x0c);
>> > - cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
>> > - }
>> > -
>> > - /* Next, handle initial positive match as first byte of p2. We have
>> > - a partial fragment match, make full determination until we reached
>> > - end of s2. */
>> > - if (!cmp)
>> > - {
>> > - if (cmp_z)
>> > - return (char *) p1;
>> > -
>> > - pt = p1;
>> > - p1 += 16;
>> > - p2 += 16;
>> > - /* Load up to 16 bytes of fragment. */
>> > - frag2 = strloadu (p2);
>> > - }
>> > - else
>> > - {
>> > - /* Adjust 16B alignment. */
>> > - p1 += cmp;
>> > - pt = p1;
>> > - }
>> > -
>> > - /* Load up to 16 bytes of fragment. */
>> > - frag1 = strloadu (p1);
>> > -
>> > - /* Unsigned bytes, equal order, does frag2 has null? */
>> > - cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
>> > - cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
>> > - cmp = _mm_cmpistri (frag2, frag1, 0x0c);
>> > - cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
>> > - while (!(cmp | cmp_z | cmp_s))
>> > - {
>> > - p1 += 16;
>> > - p2 += 16;
>> > - /* Load up to 16 bytes of fragment. */
>> > - frag2 = strloadu (p2);
>> > - /* Load up to 16 bytes of fragment. */
>> > - frag1 = strloadu (p1);
>> > - /* Unsigned bytes, equal order, does frag2 has null? */
>> > - cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
>> > - cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
>> > - cmp = _mm_cmpistri (frag2, frag1, 0x0c);
>> > - cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
>> > - }
>> > -
>> > - /* Full determination yielded a false result, retrace s1 to next
>> > - starting position.
>> > - Zflg 1 0 1 0/1
>> > - Sflg 0 1 1 0/1
>> > - cmp na 0 0 >0
>> > - action done done continue continue if s2 < s1
>> > - false match retrace s1 else false
>> > - */
>> > -
>> > - if (cmp_s & !cmp)
>> > - return (char *) pt;
>> > - if (cmp_z)
>> > - {
>> > - if (!cmp_s)
>> > - return NULL;
>> > -
>> > - /* Handle both zero and sign flag set and s1 is shorter in
>> > - length. */
>> > - int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (zero, frag2));
>> > - int bmsk1 = _mm_movemask_epi8 (_mm_cmpeq_epi8 (zero, frag1));
>> > - int len;
>> > - int len1;
>> > - __asm ("bsfl %[bmsk], %[len]"
>> > - : [len] "=r" (len) : [bmsk] "r" (bmsk));
>> > - __asm ("bsfl %[bmsk1], %[len1]"
>> > - : [len1] "=r" (len1) : [bmsk1] "r" (bmsk1));
>> > - if (len >= len1)
>> > - return NULL;
>> > - }
>> > - else if (!cmp)
>> > - return (char *) pt;
>> > -
>> > - /* Otherwise, we have to retrace and continue. Default of multiple
>> > - paths that need to retrace from next byte in s1. */
>> > - p2 = s2;
>> > - frag2 = strloadu (p2);
>> > -
>> > - if (!kmp_fwd)
>> > - kmp_fwd = KMP16Bovrlap (frag2);
>> > +#include "string/strstr.c"
>> >
>> > - /* KMP algorithm predicted overlap needs to be corrected for
>> > - partial fragment compare. */
>> > - p1 = pt + (kmp_fwd > cmp ? cmp : kmp_fwd);
>> > +extern __typeof (__redirect_strstr) __strstr_sse2_unaligned attribute_hidden;
>> > +extern __typeof (__redirect_strstr) __strstr_sse2 attribute_hidden;
>> >
>> > - /* Since s2 is at least 16 bytes long, we're certain there is no
>> > - match. */
>> > - if (p1[0] == '\0')
>> > - return NULL;
>> > +#include "init-arch.h"
>> >
>> > - /* Load up to 16 bytes of fragment. */
>> > - frag1 = strloadu (p1);
>> > +/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
>> > + ifunc symbol properly. */
>> > +extern __typeof (__redirect_strstr) __libc_strstr;
>> > +libc_ifunc (__libc_strstr, HAS_FAST_UNALIGNED_LOAD ? __strstr_sse2_unaligned : __strstr_sse2)
>> >
>> > - /* Unsigned bytes, equal order, is there a partial match? */
>> > - cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
>> > - cmp = _mm_cmpistri (frag2, frag1, 0x0c);
>> > - cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
>> > - goto re_trace;
>> > -}
>> > +#undef strstr
>> > +strong_alias (__libc_strstr, strstr)
>> > --
>> > 1.8.3.2
>>
>> --
>>
>> The UPS doesn't have a battery backup.
>
> --
>
> The air conditioning water supply pipe ruptured over the machine room