[RFC] unpolished strstr implementation
Ondřej Bílka
neleai@seznam.cz
Tue Jul 3 04:07:00 GMT 2012
Hello,
I wrote working but unpolished strstr implementation of strstr that
combines two-way algorithm with digraph search.
Benchmark is at usual place http://kam.mff.cuni.cz/~ondra/benchmark_string/i7/strstr/html/test_r.html
I had for performance changed loop to platform specific loop_sse.h.
Probably arithmetic will need separate loop.
Next technical issue is that I cannot changing strstr-c to
sse2 version breaks ifunc so I added sse2 as separate implementation.
There are three possible optimalizations. First is that also on two-way
search last two characters so we do not have to worry about termination.
Second is with sse4.2 use mask _mm_cmpistrm to also kill next 16
positions.
Third is to have better strcmp_dir.
Tomorow I plan add memmem and strcasestr.
---
sysdeps/x86_64/multiarch/Makefile | 4 +-
sysdeps/x86_64/multiarch/loop.h | 1 +
sysdeps/x86_64/multiarch/loop_sse.h | 100 ++++++
sysdeps/x86_64/multiarch/sse2.h | 95 ++++++
sysdeps/x86_64/multiarch/ssse3.h | 5 +
sysdeps/x86_64/multiarch/strcasestr-nonascii.c | 2 +-
sysdeps/x86_64/multiarch/strcasestr.c | 2 +-
sysdeps/x86_64/multiarch/strstr-c.c | 12 +-
sysdeps/x86_64/multiarch/strstr.c | 389 +-----------------------
sysdeps/x86_64/multiarch/strstr.h | 215 +++++++++++++
sysdeps/x86_64/multiarch/strstr_old.h | 384 +++++++++++++++++++++++
sysdeps/x86_64/multiarch/strstr_sse2.c | 4 +
sysdeps/x86_64/multiarch/vector.h | 17 +
13 files changed, 837 insertions(+), 393 deletions(-)
create mode 100644 sysdeps/x86_64/multiarch/loop.h
create mode 100644 sysdeps/x86_64/multiarch/loop_sse.h
create mode 100644 sysdeps/x86_64/multiarch/sse2.h
create mode 100644 sysdeps/x86_64/multiarch/ssse3.h
create mode 100644 sysdeps/x86_64/multiarch/strstr.h
create mode 100644 sysdeps/x86_64/multiarch/strstr_old.h
create mode 100644 sysdeps/x86_64/multiarch/strstr_sse2.c
create mode 100644 sysdeps/x86_64/multiarch/vector.h
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index dd6c27d..7f2c8cd 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -19,12 +19,12 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
strnlen-sse2-no-bsf strrchr-sse2-no-bsf strchr-sse2-no-bsf \
memcmp-ssse3
ifeq (yes,$(config-cflags-sse4))
-sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift
+sysdep_routines += strcspn-c strpbrk-c strspn-c strstr_sse2 strstr-c strcasestr-c varshift
CFLAGS-varshift.c += -msse4
CFLAGS-strcspn-c.c += -msse4
CFLAGS-strpbrk-c.c += -msse4
CFLAGS-strspn-c.c += -msse4
-CFLAGS-strstr.c += -msse4
+CFLAGS-strstr.c += -mssse3
CFLAGS-strcasestr.c += -msse4
CFLAGS-strcasestr-nonascii.c += -msse4
endif
diff --git a/sysdeps/x86_64/multiarch/loop.h b/sysdeps/x86_64/multiarch/loop.h
new file mode 100644
index 0000000..6862d6a
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/loop.h
@@ -0,0 +1 @@
+#include "loop_sse.h"
diff --git a/sysdeps/x86_64/multiarch/loop_sse.h b/sysdeps/x86_64/multiarch/loop_sse.h
new file mode 100644
index 0000000..d5567c2
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/loop_sse.h
@@ -0,0 +1,100 @@
+#ifdef DETECT_ZERO_BYTE
+ #define _DETECT_ZERO_BYTE mvec= OR(mvec,TEST_ZERO(sn));
+ #define _TEST_ZERO_BYTE (*p==0)
+#else
+ #define _DETECT_ZERO_BYTE
+ #define _TEST_ZERO_BYTE 0
+#endif
+#ifdef DETECT_END
+ #define _DETECT_END(u) if (DETECT_END<=s2+u*BYTES_AT_ONCE) mask |= ((tp_mask)1)<<(DETECT_END-s2);
+ #define _TEST_END (p==DETECT_END)
+#else
+ #define _DETECT_END(u)
+ #define _TEST_END 0
+#endif
+
+ #define TEST(u) \
+ mvec=vzero;\
+ so=sn;\
+ sn=LOAD(s2+u*16);\
+ mvec = TEST_CODE(so,sn); \
+ _DETECT_ZERO_BYTE;\
+ mvec##u = mvec;
+
+ int i;
+ tp_vector vzero=BROADCAST_ZERO();
+ tp_vector sn,so;
+ int s_offset; uchar* s2;
+ sn=vzero;
+ ALIGN(s,4);
+ tp_mask forget_offset=((tp_mask)-1)<<s_offset;
+ tp_vector mvec; tp_vector mvec0,mvec1,mvec2,mvec3;
+ tp_mask mask,mask0,mask1,mask2,mask3;
+
+ TEST(0);
+ mask0=get_mask(mvec0);
+ TEST(1);
+ mask1=get_mask(mvec1);
+ TEST(2);
+ mask2=get_mask(mvec2);
+ TEST(3);
+ mask3=get_mask(mvec3);
+ mask =(mask0|(mask1<<16))|((mask2|(mask3<<16))<<32);
+ //or-ing in this way minimizes data dependencies
+
+ _DETECT_END(4);
+ mask=mask&forget_offset;
+ if (mask) goto test;
+ start:;
+ while(1){
+ s2+=64;
+ PREFETCH(s2+512);
+ TEST(0);
+ TEST(1);
+ TEST(2);
+ TEST(3);
+ mask=0;
+ _DETECT_END(4);
+ if(get_mask(OR(OR(mvec0,mvec1),OR(mvec2,mvec3)))|mask){
+ // on x64 or is destructive operation
+ // in case of strlen it is faster to recalculate mvec0,mvec1 than move them to separate registers.
+ mask0=get_mask(mvec0);
+ mask1=get_mask(mvec1);
+ mask2=get_mask(mvec2);
+ mask3=get_mask(mvec3);
+ mask =(mask0|(mask1<<16))|((mask2|(mask3<<16))<<32);
+ _DETECT_END(4);
+ goto test;
+ }
+ }
+ test:;
+#ifdef CAN_SKIP
+ if(skip_to-s2>=64){
+ mask=0;
+ } else if (skip_to-s2>=0){
+ mask&=((tp_mask)-1)<<(skip_to-s2);
+ }
+#endif
+ while(mask){ i=first_bit(mask);
+ uchar *p=s2+i;
+ if(__builtin_expect(_TEST_ZERO_BYTE||_TEST_END,0)){
+ LOOP_END(p)
+ }
+ LOOP_BODY(p)
+#ifdef CAN_SKIP
+ if(skip_to-s2>=64){
+ mask=0;
+ } else if (skip_to-s2>=0){
+ mask&=((tp_mask)-1)<<(skip_to-s2);
+ }
+#else
+ mask=forget_first_bit(mask);
+#endif
+ }
+ goto start;
+
+
+ #undef CAN_SKIP
+
+#undef TEST_CODE
+#undef LOOP_BODY
diff --git a/sysdeps/x86_64/multiarch/sse2.h b/sysdeps/x86_64/multiarch/sse2.h
new file mode 100644
index 0000000..4b2adc0
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/sse2.h
@@ -0,0 +1,95 @@
+#include <stdint.h>
+#include <emmintrin.h>
+#include <stdlib.h>
+
+typedef __m128i tp_vector;
+typedef long tp_mask;
+#define SI static inline
+#define BYTES_AT_ONCE sizeof(tp_vector)
+#define PARA (BYTES_AT_ONCE*unroll)
+#define VSIZ_BYTE sizeof(tp_vector)
+#define VSIZ_BIT (VSIZ_BYTE*8)
+#define MSIZ_BYTE sizeof(tp_mask)
+#define MSIZ_BIT (MSIZ_BYTE*8)
+
+#define CACHE_LINE_SIZE 64
+#define PREFETCH(x) _mm_prefetch(((char *)x),_MM_HINT_T0);
+#define ALIGN(x,u) s_offset=((size_t) x)%((u)*BYTES_AT_ONCE); s2=((uchar *)x)-s_offset;
+
+SI tp_mask get_mask(tp_vector x){ return _mm_movemask_epi8(x); }
+
+SI tp_mask first_bit(tp_mask t){ return __builtin_ctzl(t);}
+SI tp_mask last_bit(tp_mask t){ return __builtin_clzl(t);}
+
+SI tp_mask forget_first_bit(tp_mask t){return t&(t-1);}
+
+SI tp_mask get_bit(tp_mask x,int y){return (x&(((tp_mask)1)<<(y)));}
+SI tp_mask shift_down(tp_mask x,int y){ return x>>y;}
+SI tp_mask shift_up (tp_mask x,int y){ return x<<y;}
+SI tp_mask forget_low_bits(tp_mask m,int b)
+{
+ if (b>=MSIZ_BIT) return 0;
+ return shift_up(shift_down(m,b),b);
+}
+SI tp_mask forget_high_bits(tp_mask m,int b)
+{
+ if (b>=MSIZ_BIT) return 0;
+ b=MSIZ_BIT-b;
+ return shift_down(shift_up(m,b),b);
+}
+
+
+
+
+
+SI tp_vector BYTE_AT(uchar c,int shift)
+{
+ return _mm_set_epi64x(((uint64_t)c)<<(8*shift),((uint64_t)c)<<(8*(shift-8)));
+}
+
+#define TEST_EQ _mm_cmpeq_epi8
+#define TEST_ZERO(x) TEST_EQ(x,vzero)
+#define AND _mm_and_si128
+#define ANDNOT(x,y) _mm_andnot_si128(y,x)
+#define OR _mm_or_si128
+#define XOR _mm_xor_si128
+#define SHIFT_DOWN _mm_srli_si128
+#define SHIFT_UP _mm_slli_si128
+#define CONCAT(x,y,n) (n==0) ? (y) : ((n==BYTES_AT_ONCE) ? (x) : OR(SHIFT_UP(x,BYTES_AT_ONCE-(n)),SHIFT_DOWN(y,(n))))
+SI tp_vector BROADCAST(uchar c)
+{
+ return _mm_set1_epi8(c);
+}
+
+SI tp_vector BROADCAST_ZERO(void){
+ tp_vector m;
+ m=XOR(m,m);
+ return m;
+}
+#define find_zeros(x) get_mask(TEST_EQ(x,vzero))
+
+
+SI tp_vector TEST_RANGE(tp_vector v,uchar from,uchar to){
+ tp_vector fv=BROADCAST(-127-from);
+ v=_mm_add_epi8(v,fv);
+ tp_vector tv=BROADCAST(-127+to-from+1);
+ return _mm_cmplt_epi8(v,tv);
+}
+#ifdef CASE_INSENSITIVE
+SI tp_vector LOAD(tp_vector* x){tp_mask mask;
+ tp_vector high_bit=BROADCAST(128);
+ tp_vector m=_mm_load_si128(x);
+ tp_vector l= AND(TEST_RANGE(m,'A','Z'),high_bit);
+ m=OR(m,_mm_srli_epi64(l,2));
+ if (mask=get_mask(m)){int i;
+ enum_bits_start(mask,i)
+ ((uchar*)&m)[i]=tolower(((uchar*)&m)[i]);
+ enum_bits_end
+ }
+ return m;
+}
+#else
+#define LOAD(x) _mm_load_si128((tp_vector*)(x))
+#define LOAD_UNALIGNED(x) _mm_loadu_si128(x)
+#endif
+
diff --git a/sysdeps/x86_64/multiarch/ssse3.h b/sysdeps/x86_64/multiarch/ssse3.h
new file mode 100644
index 0000000..b6b87fd
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/ssse3.h
@@ -0,0 +1,5 @@
+#include <tmmintrin.h>
+#include "sse2.h"
+#undef CONCAT
+
+#define CONCAT(x,y,n) ((n==0) ? (y) : ((n==BYTES_AT_ONCE) ? (x) : _mm_alignr_epi8(x,y,n)))
diff --git a/sysdeps/x86_64/multiarch/strcasestr-nonascii.c b/sysdeps/x86_64/multiarch/strcasestr-nonascii.c
index a1f9968..0bbca7e 100644
--- a/sysdeps/x86_64/multiarch/strcasestr-nonascii.c
+++ b/sysdeps/x86_64/multiarch/strcasestr-nonascii.c
@@ -46,4 +46,4 @@ __m128i_strloadu_tolower (const unsigned char *p)
#define STRCASESTR_NONASCII
#define USE_AS_STRCASESTR
#define STRSTR_SSE42 __strcasestr_sse42_nonascii
-#include "strstr.c"
+#include "strstr_old.h"
diff --git a/sysdeps/x86_64/multiarch/strcasestr.c b/sysdeps/x86_64/multiarch/strcasestr.c
index d1cfb3b..21ae683 100644
--- a/sysdeps/x86_64/multiarch/strcasestr.c
+++ b/sysdeps/x86_64/multiarch/strcasestr.c
@@ -4,4 +4,4 @@ extern char *__strcasestr_sse42_nonascii (const unsigned char *s1,
#define USE_AS_STRCASESTR
#define STRSTR_SSE42 __strcasestr_sse42
-#include "strstr.c"
+#include "strstr_old.h"
diff --git a/sysdeps/x86_64/multiarch/strstr-c.c b/sysdeps/x86_64/multiarch/strstr-c.c
index b8ed316..56e8094 100644
--- a/sysdeps/x86_64/multiarch/strstr-c.c
+++ b/sysdeps/x86_64/multiarch/strstr-c.c
@@ -1,15 +1,17 @@
#include "init-arch.h"
-#define STRSTR __strstr_sse2
+#define STRSTR __strstr_nosse
#ifdef SHARED
# undef libc_hidden_builtin_def
# define libc_hidden_builtin_def(name) \
- __hidden_ver1 (__strstr_sse2, __GI_strstr, __strstr_sse2);
+ __hidden_ver1 (__strstr_nosse, __GI_strstr, __strstr_nosse);
#endif
#include "string/strstr.c"
-extern char *__strstr_sse42 (const char *, const char *) attribute_hidden;
-extern __typeof (__strstr_sse2) __strstr_sse2 attribute_hidden;
+extern char *__strstr_ssse3 (const char *, const char *) attribute_hidden;
+extern char *__strstr_sse2 (const char *, const char *) attribute_hidden;
-libc_ifunc (strstr, HAS_SSE4_2 ? __strstr_sse42 : __strstr_sse2);
+extern __typeof (__strstr_nosse) __strstr_nosse attribute_hidden;
+
+libc_ifunc (strstr, HAS_SSSE3 ? __strstr_ssse3 : (HAS_SSE2 ? __strstr_sse2 : __strstr_nosse));
diff --git a/sysdeps/x86_64/multiarch/strstr.c b/sysdeps/x86_64/multiarch/strstr.c
index b1b4139..dd6e655 100644
--- a/sysdeps/x86_64/multiarch/strstr.c
+++ b/sysdeps/x86_64/multiarch/strstr.c
@@ -1,384 +1,5 @@
-/* strstr with SSE4.2 intrinsics
- Copyright (C) 2009, 2010, 2011 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <nmmintrin.h>
-#include "varshift.h"
-
-#ifndef STRSTR_SSE42
-# define STRSTR_SSE42 __strstr_sse42
-#endif
-
-#ifdef USE_AS_STRCASESTR
-# include <ctype.h>
-# include <locale/localeinfo.h>
-
-# define LOADBYTE(C) tolower (C)
-# define CMPBYTE(C1, C2) (tolower (C1) == tolower (C2))
-#else
-# define LOADBYTE(C) (C)
-# define CMPBYTE(C1, C2) ((C1) == (C2))
-#endif
-
-/* We use 0xe ordered-compare:
- _SIDD_SBYTE_OPS
- | _SIDD_CMP_EQUAL_ORDER
- | _SIDD_LEAST_SIGNIFICANT
- on pcmpistri to do the scanning and string comparsion requirements of
- sub-string match. In the scanning phase, we process Cflag and ECX
- index to locate the first fragment match; once the first fragment
- match position has been identified, we do comparison of subsequent
- string fragments until we can conclude false or true match; whe
- n concluding a false match, we may need to repeat scanning process
- from next relevant offset in the target string.
-
- In the scanning phase we have 4 cases:
- case ECX CFlag ZFlag SFlag
- 1 16 0 0 0
- 2a 16 0 0 1
- 2b 16 0 1 0
- 2c 16 0 1 1
-
- 1. No ordered-comparison match, both 16B fragments are valid, so
- continue to next fragment.
- 2. No ordered-comparison match, there is EOS in either fragment,
- 2a. Zflg = 0, Sflg = 1, we continue
- 2b. Zflg = 1, Sflg = 0, we conclude no match and return.
- 2c. Zflg = 1, sflg = 1, lenth determine match or no match
-
- In the string comparison phase, the 1st fragment match is fixed up
- to produce ECX = 0. Subsequent fragment compare of nonzero index
- and no match conclude a false match.
-
- case ECX CFlag ZFlag SFlag
- 3 X 1 0 0/1
- 4a 0 1 0 0
- 4b 0 1 0 1
- 4c 0 < X 1 0 0/1
- 5 16 0 1 0
-
- 3. An initial ordered-comparison fragment match, we fix up to do
- subsequent string comparison
- 4a. Continuation of fragment comparison of a string compare.
- 4b. EOS reached in the reference string, we conclude true match and
- return
- 4c. String compare failed if index is nonzero, we need to go back to
- scanning
- 5. failed string compare, go back to scanning
- */
-
-/* Simple replacement of movdqu to address 4KB boundary cross issue.
- If EOS occurs within less than 16B before 4KB boundary, we don't
- cross to next page. */
-
-static inline __m128i
-__m128i_strloadu (const unsigned char * p, __m128i zero)
-{
- if (__builtin_expect ((int) ((size_t) p & 0xfff) > 0xff0, 0))
- {
- size_t offset = ((size_t) p & (16 - 1));
- __m128i a = _mm_load_si128 ((__m128i *) (p - offset));
- int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (a, zero));
- if ((bmsk >> offset) != 0)
- return __m128i_shift_right (a, offset);
- }
- return _mm_loadu_si128 ((__m128i *) p);
-}
-
-#if defined USE_AS_STRCASESTR && !defined STRCASESTR_NONASCII
-
-/* Similar to __m128i_strloadu. Convert to lower case for POSIX/C
- locale and other which have single-byte letters only in the ASCII
- range. */
-static inline __m128i
-__m128i_strloadu_tolower (const unsigned char *p, __m128i zero, __m128i uclow,
- __m128i uchigh, __m128i lcqword)
-{
- __m128i frag = __m128i_strloadu (p, zero);
-
- /* Compare if 'Z' > bytes. Inverted way to get a mask for byte <= 'Z'. */
- __m128i r2 = _mm_cmpgt_epi8 (uchigh, frag);
- /* Compare if bytes are > 'A' - 1. */
- __m128i r1 = _mm_cmpgt_epi8 (frag, uclow);
- /* Mask byte == ff if byte(r2) <= 'Z' and byte(r1) > 'A' - 1. */
- __m128i mask = _mm_and_si128 (r2, r1);
- /* Apply lowercase bit 6 mask for above mask bytes == ff. */
- return _mm_or_si128 (frag, _mm_and_si128 (mask, lcqword));
-}
-
-#endif
-
-/* Calculate Knuth-Morris-Pratt string searching algorithm (or KMP
- algorithm) overlap for a fully populated 16B vector.
- Input parameter: 1st 16Byte loaded from the reference string of a
- strstr function.
- We don't use KMP algorithm if reference string is less than 16B. */
-static int
-__inline__ __attribute__ ((__always_inline__,))
-KMP16Bovrlap (__m128i s2)
-{
- __m128i b = _mm_unpacklo_epi8 (s2, s2);
- __m128i a = _mm_unpacklo_epi8 (b, b);
- a = _mm_shuffle_epi32 (a, 0);
- b = _mm_srli_si128 (s2, sizeof (char));
- int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (b, a));
-
- /* _BitScanForward(&k1, bmsk); */
- int k1;
- __asm ("bsfl %[bmsk], %[k1]" : [k1] "=r" (k1) : [bmsk] "r" (bmsk));
- if (!bmsk)
- return 16;
- else if (bmsk == 0x7fff)
- return 1;
- else if (!k1)
- {
- /* There are al least two distinct chars in s2. If byte 0 and 1 are
- idential and the distinct value lies farther down, we can deduce
- the next byte offset to restart full compare is least no earlier
- than byte 3. */
- return 3;
- }
- else
- {
- /* Byte 1 is not degenerated to byte 0. */
- return k1 + 1;
- }
-}
-
-char *
-__attribute__ ((section (".text.sse4.2")))
-STRSTR_SSE42 (const unsigned char *s1, const unsigned char *s2)
-{
-#define p1 s1
- const unsigned char *p2 = s2;
-
-#ifndef STRCASESTR_NONASCII
- if (__builtin_expect (p2[0] == '\0', 0))
- return (char *) p1;
-
- if (__builtin_expect (p1[0] == '\0', 0))
- return NULL;
-
- /* Check if p1 length is 1 byte long. */
- if (__builtin_expect (p1[1] == '\0', 0))
- return p2[1] == '\0' && CMPBYTE (p1[0], p2[0]) ? (char *) p1 : NULL;
-#endif
-
-#ifdef USE_AS_STRCASESTR
-# ifndef STRCASESTR_NONASCII
- if (__builtin_expect (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_NONASCII_CASE)
- != 0, 0))
- return __strcasestr_sse42_nonascii (s1, s2);
-
- const __m128i uclow = _mm_set1_epi8 (0x40);
- const __m128i uchigh = _mm_set1_epi8 (0x5b);
- const __m128i lcqword = _mm_set1_epi8 (0x20);
- const __m128i zero = _mm_setzero_si128 ();
-# define strloadu(p) __m128i_strloadu_tolower (p, zero, uclow, uchigh, lcqword)
-# else
-# define strloadu __m128i_strloadu_tolower
-# define zero _mm_setzero_si128 ()
-# endif
-#else
-# define strloadu(p) __m128i_strloadu (p, zero)
- const __m128i zero = _mm_setzero_si128 ();
-#endif
-
- /* p1 > 1 byte long. Load up to 16 bytes of fragment. */
- __m128i frag1 = strloadu (p1);
-
- __m128i frag2;
- if (p2[1] != '\0')
- /* p2 is > 1 byte long. */
- frag2 = strloadu (p2);
- else
- frag2 = _mm_insert_epi8 (zero, LOADBYTE (p2[0]), 0);
-
- /* Unsigned bytes, equal order, does frag2 has null? */
- int cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
- int cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
- int cmp = _mm_cmpistri (frag2, frag1, 0x0c);
- int cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
- if (cmp_s & cmp_c)
- {
- int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (frag2, zero));
- int len;
- __asm ("bsfl %[bmsk], %[len]"
- : [len] "=r" (len) : [bmsk] "r" (bmsk));
- p1 += cmp;
- if ((len + cmp) <= 16)
- return (char *) p1;
-
- /* Load up to 16 bytes of fragment. */
- frag1 = strloadu (p1);
- cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
- cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
- cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
- cmp = _mm_cmpistri (frag2, frag1, 0x0c);
- if ((len + cmp) <= 16)
- return (char *) p1 + cmp;
- }
-
- if (cmp_s)
- {
- /* Adjust addr for 16B alginment in ensuing loop. */
- while (!cmp_z)
- {
- p1 += cmp;
- /* Load up to 16 bytes of fragment. */
- frag1 = strloadu (p1);
- cmp = _mm_cmpistri (frag2, frag1, 0x0c);
- cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
- cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
- /* Because s2 < 16 bytes and we adjusted p1 by non-zero cmp
- once already, this time cmp will be zero and we can exit. */
- if ((!cmp) & cmp_c)
- break;
- }
-
- if (!cmp_c)
- return NULL;
-
- /* Since s2 is less than 16 bytes, com_c is definitive
- determination of full match. */
- return (char *) p1 + cmp;
- }
-
- /* General case, s2 is at least 16 bytes or more.
- First, the common case of false-match at first byte of p2. */
- const unsigned char *pt = NULL;
- int kmp_fwd = 0;
-re_trace:
- while (!cmp_c)
- {
- /* frag1 has null. */
- if (cmp_z)
- return NULL;
-
- /* frag 1 has no null, advance 16 bytes. */
- p1 += 16;
- /* Load up to 16 bytes of fragment. */
- frag1 = strloadu (p1);
- /* Unsigned bytes, equal order, is there a partial match? */
- cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
- cmp = _mm_cmpistri (frag2, frag1, 0x0c);
- cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
- }
-
- /* Next, handle initial positive match as first byte of p2. We have
- a partial fragment match, make full determination until we reached
- end of s2. */
- if (!cmp)
- {
- if (cmp_z)
- return (char *) p1;
-
- pt = p1;
- p1 += 16;
- p2 += 16;
- /* Load up to 16 bytes of fragment. */
- frag2 = strloadu (p2);
- }
- else
- {
- /* Adjust 16B alignment. */
- p1 += cmp;
- pt = p1;
- }
-
- /* Load up to 16 bytes of fragment. */
- frag1 = strloadu (p1);
-
- /* Unsigned bytes, equal order, does frag2 has null? */
- cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
- cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
- cmp = _mm_cmpistri (frag2, frag1, 0x0c);
- cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
- while (!(cmp | cmp_z | cmp_s))
- {
- p1 += 16;
- p2 += 16;
- /* Load up to 16 bytes of fragment. */
- frag2 = strloadu (p2);
- /* Load up to 16 bytes of fragment. */
- frag1 = strloadu (p1);
- /* Unsigned bytes, equal order, does frag2 has null? */
- cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
- cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
- cmp = _mm_cmpistri (frag2, frag1, 0x0c);
- cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
- }
-
- /* Full determination yielded a false result, retrace s1 to next
- starting position.
- Zflg 1 0 1 0/1
- Sflg 0 1 1 0/1
- cmp na 0 0 >0
- action done done continue continue if s2 < s1
- false match retrace s1 else false
- */
-
- if (cmp_s & !cmp)
- return (char *) pt;
- if (cmp_z)
- {
- if (!cmp_s)
- return NULL;
-
- /* Handle both zero and sign flag set and s1 is shorter in
- length. */
- int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (zero, frag2));
- int bmsk1 = _mm_movemask_epi8 (_mm_cmpeq_epi8 (zero, frag1));
- int len;
- int len1;
- __asm ("bsfl %[bmsk], %[len]"
- : [len] "=r" (len) : [bmsk] "r" (bmsk));
- __asm ("bsfl %[bmsk1], %[len1]"
- : [len1] "=r" (len1) : [bmsk1] "r" (bmsk1));
- if (len >= len1)
- return NULL;
- }
- else if (!cmp)
- return (char *) pt;
-
- /* Otherwise, we have to retrace and continue. Default of multiple
- paths that need to retrace from next byte in s1. */
- p2 = s2;
- frag2 = strloadu (p2);
-
- if (!kmp_fwd)
- kmp_fwd = KMP16Bovrlap (frag2);
-
- /* KMP algorithm predicted overlap needs to be corrected for
- partial fragment compare. */
- p1 = pt + (kmp_fwd > cmp ? cmp : kmp_fwd);
-
- /* Since s2 is at least 16 bytes long, we're certain there is no
- match. */
- if (p1[0] == '\0')
- return NULL;
-
- /* Load up to 16 bytes of fragment. */
- frag1 = strloadu (p1);
-
- /* Unsigned bytes, equal order, is there a partial match? */
- cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
- cmp = _mm_cmpistri (frag2, frag1, 0x0c);
- cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
- goto re_trace;
-}
+static const int small_treshold=128;
+#define STRSTR __strstr_ssse3
+#define AS_STRSTR
+#define USE_SSSE3
+#include "strstr.h"
diff --git a/sysdeps/x86_64/multiarch/strstr.h b/sysdeps/x86_64/multiarch/strstr.h
new file mode 100644
index 0000000..f229223
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strstr.h
@@ -0,0 +1,215 @@
+#include <stdlib.h>
+#include <string.h>
+#include "vector.h"
+
+SI int max(int x,int y){ return x>y ? x : y; }
+SI int min(int x,int y){ return x<y ? x : y; }
+
+
+#define CHAR(x) *(x)
+static size_t strcmp_dir(const uchar *a,const uchar *b,size_t no,int dir){
+ int i;
+ for(i=0;i<no && CHAR(a)==CHAR(b);i++){a+=dir;b+=dir;}
+ return i;
+}
+
+
+/* Two way algorithm: CROCHEMORE M., PERRIN D., 1991, Two-way string-matching, Journal of the ACM 38(3):651-675.
+ Implementation based from http://www-igm.univ-mlv.fr/~lecroq/string/node26.html
+*/
+static void two_way_preprocessing(uchar *n,int ns,int *per2,int *ell2,int *peri);
+
+#ifdef AS_STRSTR
+ #define DETECT_ZERO_BYTE
+ #define _AS_STRSTR(x) x
+ #define _AS_MEMMEM(x)
+#endif
+#ifdef AS_MEMMEM
+ #define DETECT_END s+ss-ns+check
+ #define _AS_STRSTR(x)
+ #define _AS_MEMMEM(x) x
+#endif
+static uchar *strstr_two_way(uchar *s, int ss, uchar *n, int ns)
+{
+
+ int fw,fwno,bw,bwno;
+ int ell, per, peri,pos;
+ two_way_preprocessing(n,ns,&per,&ell,&peri);
+ int check=min(ell+2,ns-1);
+ uchar *skip_to=s+check;
+ tp_vector vn0=BROADCAST(n[check-0]);
+ tp_vector vn1=BROADCAST(n[check-1]);
+ tp_vector e0,e1;
+
+ #define TEST_CODE(so,sn) vzero;\
+ e0 = TEST_EQ(CONCAT(sn,so,BYTES_AT_ONCE-0),vn0); \
+ e1 = TEST_EQ(CONCAT(sn,so,BYTES_AT_ONCE-1),vn1); \
+ mvec = AND(e0,e1);
+
+ #define LOOP_BODY(p) \
+ p = p - check;\
+ pos = check + 1;\
+ fwno = ns - pos;\
+ fw = strcmp_dir(n + pos ,p + pos, fwno , 1);\
+ if (fw < fwno ){\
+ p += fw + 1;\
+ } else {\
+ bwno = ell + 1;\
+ bw = strcmp_dir(n + bwno - 1, p + bwno - 1, bwno, -1);\
+ if ( bw < bwno ){\
+ _AS_STRSTR(for(i=0;i<per;i++) if (!p[ns+i]) return NULL);\
+ p += per;\
+ if (peri){\
+ while(1){\
+ pos = max(ell, ns - per - 1) + 1;\
+ fwno = ns - pos;\
+ fw = strcmp_dir(n + pos ,p + pos, fwno , 1);\
+ if (fw < fwno ){\
+ p += fw + 1;\
+ break ;\
+ } else {\
+ bwno = ell - ns - per - 1;\
+ bw = strcmp_dir(n + ell, p + ell, bwno, -1);\
+ if ( bw < bwno ){\
+ _AS_STRSTR(for(i=0;i<per;i++) if (!p[ns+i]) return NULL);\
+ p += per;\
+ } else {\
+ return p;\
+ }\
+ }\
+ }\
+ }\
+ } else {\
+ return p;\
+ }\
+ }\
+ skip_to = p + check;
+
+ #define LOOP_END(p) return NULL;
+ #define CAN_SKIP
+ #include "loop.h"
+}
+
+static uchar *strstr_vec( uchar *s,int ss,uchar *n,int ns){
+ int buy=8*ns+64,rent=0;
+ tp_vector vn0=BROADCAST(n[ns-1-0]);
+ tp_vector vn1=BROADCAST(n[ns-1-1]);
+ tp_vector e0,e1;
+
+ #define DETECT_ZERO_BYTE
+
+ #define TEST_CODE(so,sn) vzero;\
+ e0 = TEST_EQ(CONCAT(sn,so,BYTES_AT_ONCE-0),vn0); \
+ e1 = TEST_EQ(CONCAT(sn,so,BYTES_AT_ONCE-1),vn1); \
+ mvec = (AND(e0,e1));
+
+ #define LOOP_BODY(p) \
+ p = p - (ns - 1);\
+ if(p>= s){\
+ int checked=strcmp_dir(p+ns-1-2,n+ns-1-2,ns-2,-1); \
+ if (checked==ns-2) return p; \
+ rent+=checked;\
+ if(buy+2*(p-s)>rent) return strstr_two_way(p,ss,n,ns);\
+ }
+
+ #define LOOP_END(p) return NULL;
+ #include "loop.h"
+}
+#undef TEST_CODE
+#undef LOOP_BODY
+#undef LOOP_END
+
+
+
+#ifdef AS_STRSTR
+ #define STRCHR(s,sn,c) strchr(s,c)
+ uchar *STRSTR(const uchar *s,const uchar *n)
+#endif
+#ifdef AS_MEMMEM
+ #define STRCHR(s,sn,c) strchr(s,c)
+ uchar *memmem(const uchar *s,size_t ss,const uchar *n,size_t ns)
+#endif
+ #define SWITCH_IMPLEMENTATION return strstr_vec((uchar*)p,s_end-p,(uchar*)n,ns);
+{
+ int buy=small_treshold,rent=0;
+#ifdef AS_MEMMEM
+ uchar *s_end=s+ss;
+ if( ns > ss) return NULL;
+#endif
+#ifdef AS_STRSTR
+ uchar *s_end=NULL;
+ int ns=0;
+ while(n[ns]){
+ if(!s[ns]) return NULL;
+ ns++;
+ }
+#endif
+ if (!ns) return (uchar*) s;
+ uchar *p=(uchar*)s;
+ p+=ns-1;
+ while(1){
+ p=(uchar*) STRCHR((char*)p,s_end-p,((char*)n)[ns-1]);
+ if(!p) return NULL;
+ p -= ns - 1;
+ int check = strcmp_dir(n + ns - 1 -1 ,p + ns - 1 -1, ns - 1 , -1);
+ if (check == ns - 1) return p;
+ rent+=check+32;
+ if(buy-(p-s)<rent) SWITCH_IMPLEMENTATION
+ p++;
+ p+= ns - 1;
+ }
+}
+
+
+static int maxSuf(uchar *x, int m, int *p, int invert) {
+ int ms, j, k;
+ uchar a, b;
+
+ ms = -1;
+ j = 0;
+ k = *p = 1;
+ while (j + k < m) {
+ a = CHAR(x + j + k);
+ b = CHAR(x + ms + k);
+ if (invert ? (a > b) : (a < b)) {
+ j += k;
+ k = 1;
+ *p = j - ms;
+ }
+ else
+ if (a == b)
+ if (k != *p)
+ ++k;
+ else {
+ j += *p;
+ k = 1;
+ }
+ else { /* a > b */
+ ms = j;
+ j = ms + 1;
+ k = *p = 1;
+ }
+ }
+ return(ms);
+}
+
+static int periodic(uchar *a,uchar *b,int siz){
+ return strcmp_dir(a,b,siz,1)==siz;
+}
+
+static void two_way_preprocessing(uchar *n,int ns,int *per2,int *ell2,int *peri){
+ int u,v,up,vp;
+ int per,ell;
+ u=maxSuf(n,ns,&up,0);
+ v=maxSuf(n,ns,&vp,1);
+ ell = (u > v) ? u : v;
+ per = (u > v) ? up : vp;
+ *peri = periodic(n, n + per, ell + 1);
+ if (!*peri)
+ per = max(ell + 1, ns - ell - 1) + 1;
+ *per2=per;
+ *ell2=ell;
+}
+
+
+
diff --git a/sysdeps/x86_64/multiarch/strstr_old.h b/sysdeps/x86_64/multiarch/strstr_old.h
new file mode 100644
index 0000000..b1b4139
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strstr_old.h
@@ -0,0 +1,384 @@
+/* strstr with SSE4.2 intrinsics
+ Copyright (C) 2009, 2010, 2011 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <nmmintrin.h>
+#include "varshift.h"
+
+#ifndef STRSTR_SSE42
+# define STRSTR_SSE42 __strstr_sse42
+#endif
+
+#ifdef USE_AS_STRCASESTR
+# include <ctype.h>
+# include <locale/localeinfo.h>
+
+# define LOADBYTE(C) tolower (C)
+# define CMPBYTE(C1, C2) (tolower (C1) == tolower (C2))
+#else
+# define LOADBYTE(C) (C)
+# define CMPBYTE(C1, C2) ((C1) == (C2))
+#endif
+
+/* We use 0xe ordered-compare:
+ _SIDD_SBYTE_OPS
+ | _SIDD_CMP_EQUAL_ORDER
+ | _SIDD_LEAST_SIGNIFICANT
+ on pcmpistri to do the scanning and string comparsion requirements of
+ sub-string match. In the scanning phase, we process Cflag and ECX
+ index to locate the first fragment match; once the first fragment
+ match position has been identified, we do comparison of subsequent
+ string fragments until we can conclude false or true match; whe
+ n concluding a false match, we may need to repeat scanning process
+ from next relevant offset in the target string.
+
+ In the scanning phase we have 4 cases:
+ case ECX CFlag ZFlag SFlag
+ 1 16 0 0 0
+ 2a 16 0 0 1
+ 2b 16 0 1 0
+ 2c 16 0 1 1
+
+ 1. No ordered-comparison match, both 16B fragments are valid, so
+ continue to next fragment.
+ 2. No ordered-comparison match, there is EOS in either fragment,
+ 2a. Zflg = 0, Sflg = 1, we continue
+ 2b. Zflg = 1, Sflg = 0, we conclude no match and return.
+ 2c. Zflg = 1, sflg = 1, lenth determine match or no match
+
+ In the string comparison phase, the 1st fragment match is fixed up
+ to produce ECX = 0. Subsequent fragment compare of nonzero index
+ and no match conclude a false match.
+
+ case ECX CFlag ZFlag SFlag
+ 3 X 1 0 0/1
+ 4a 0 1 0 0
+ 4b 0 1 0 1
+ 4c 0 < X 1 0 0/1
+ 5 16 0 1 0
+
+ 3. An initial ordered-comparison fragment match, we fix up to do
+ subsequent string comparison
+ 4a. Continuation of fragment comparison of a string compare.
+ 4b. EOS reached in the reference string, we conclude true match and
+ return
+ 4c. String compare failed if index is nonzero, we need to go back to
+ scanning
+ 5. failed string compare, go back to scanning
+ */
+
+/* Simple replacement of movdqu to address 4KB boundary cross issue.
+ If EOS occurs within less than 16B before 4KB boundary, we don't
+ cross to next page. */
+
+static inline __m128i
+__m128i_strloadu (const unsigned char * p, __m128i zero)
+{
+ if (__builtin_expect ((int) ((size_t) p & 0xfff) > 0xff0, 0))
+ {
+ size_t offset = ((size_t) p & (16 - 1));
+ __m128i a = _mm_load_si128 ((__m128i *) (p - offset));
+ int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (a, zero));
+ if ((bmsk >> offset) != 0)
+ return __m128i_shift_right (a, offset);
+ }
+ return _mm_loadu_si128 ((__m128i *) p);
+}
+
+#if defined USE_AS_STRCASESTR && !defined STRCASESTR_NONASCII
+
+/* Similar to __m128i_strloadu. Convert to lower case for POSIX/C
+ locale and other which have single-byte letters only in the ASCII
+ range. */
+static inline __m128i
+__m128i_strloadu_tolower (const unsigned char *p, __m128i zero, __m128i uclow,
+ __m128i uchigh, __m128i lcqword)
+{
+ __m128i frag = __m128i_strloadu (p, zero);
+
+ /* Compare if 'Z' > bytes. Inverted way to get a mask for byte <= 'Z'. */
+ __m128i r2 = _mm_cmpgt_epi8 (uchigh, frag);
+ /* Compare if bytes are > 'A' - 1. */
+ __m128i r1 = _mm_cmpgt_epi8 (frag, uclow);
+ /* Mask byte == ff if byte(r2) <= 'Z' and byte(r1) > 'A' - 1. */
+ __m128i mask = _mm_and_si128 (r2, r1);
+ /* Apply lowercase bit 6 mask for above mask bytes == ff. */
+ return _mm_or_si128 (frag, _mm_and_si128 (mask, lcqword));
+}
+
+#endif
+
+/* Calculate Knuth-Morris-Pratt string searching algorithm (or KMP
+ algorithm) overlap for a fully populated 16B vector.
+ Input parameter: 1st 16Byte loaded from the reference string of a
+ strstr function.
+ We don't use KMP algorithm if reference string is less than 16B. */
+static int
+__inline__ __attribute__ ((__always_inline__,))
+KMP16Bovrlap (__m128i s2)
+{
+ __m128i b = _mm_unpacklo_epi8 (s2, s2);
+ __m128i a = _mm_unpacklo_epi8 (b, b);
+ a = _mm_shuffle_epi32 (a, 0);
+ b = _mm_srli_si128 (s2, sizeof (char));
+ int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (b, a));
+
+ /* _BitScanForward(&k1, bmsk); */
+ int k1;
+ __asm ("bsfl %[bmsk], %[k1]" : [k1] "=r" (k1) : [bmsk] "r" (bmsk));
+ if (!bmsk)
+ return 16;
+ else if (bmsk == 0x7fff)
+ return 1;
+ else if (!k1)
+ {
+ /* There are al least two distinct chars in s2. If byte 0 and 1 are
+ idential and the distinct value lies farther down, we can deduce
+ the next byte offset to restart full compare is least no earlier
+ than byte 3. */
+ return 3;
+ }
+ else
+ {
+ /* Byte 1 is not degenerated to byte 0. */
+ return k1 + 1;
+ }
+}
+
+char *
+__attribute__ ((section (".text.sse4.2")))
+STRSTR_SSE42 (const unsigned char *s1, const unsigned char *s2)
+{
+#define p1 s1
+ const unsigned char *p2 = s2;
+
+#ifndef STRCASESTR_NONASCII
+ if (__builtin_expect (p2[0] == '\0', 0))
+ return (char *) p1;
+
+ if (__builtin_expect (p1[0] == '\0', 0))
+ return NULL;
+
+ /* Check if p1 length is 1 byte long. */
+ if (__builtin_expect (p1[1] == '\0', 0))
+ return p2[1] == '\0' && CMPBYTE (p1[0], p2[0]) ? (char *) p1 : NULL;
+#endif
+
+#ifdef USE_AS_STRCASESTR
+# ifndef STRCASESTR_NONASCII
+ if (__builtin_expect (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_NONASCII_CASE)
+ != 0, 0))
+ return __strcasestr_sse42_nonascii (s1, s2);
+
+ const __m128i uclow = _mm_set1_epi8 (0x40);
+ const __m128i uchigh = _mm_set1_epi8 (0x5b);
+ const __m128i lcqword = _mm_set1_epi8 (0x20);
+ const __m128i zero = _mm_setzero_si128 ();
+# define strloadu(p) __m128i_strloadu_tolower (p, zero, uclow, uchigh, lcqword)
+# else
+# define strloadu __m128i_strloadu_tolower
+# define zero _mm_setzero_si128 ()
+# endif
+#else
+# define strloadu(p) __m128i_strloadu (p, zero)
+ const __m128i zero = _mm_setzero_si128 ();
+#endif
+
+ /* p1 > 1 byte long. Load up to 16 bytes of fragment. */
+ __m128i frag1 = strloadu (p1);
+
+ __m128i frag2;
+ if (p2[1] != '\0')
+ /* p2 is > 1 byte long. */
+ frag2 = strloadu (p2);
+ else
+ frag2 = _mm_insert_epi8 (zero, LOADBYTE (p2[0]), 0);
+
+ /* Unsigned bytes, equal order, does frag2 has null? */
+ int cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
+ int cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
+ int cmp = _mm_cmpistri (frag2, frag1, 0x0c);
+ int cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
+ if (cmp_s & cmp_c)
+ {
+ int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (frag2, zero));
+ int len;
+ __asm ("bsfl %[bmsk], %[len]"
+ : [len] "=r" (len) : [bmsk] "r" (bmsk));
+ p1 += cmp;
+ if ((len + cmp) <= 16)
+ return (char *) p1;
+
+ /* Load up to 16 bytes of fragment. */
+ frag1 = strloadu (p1);
+ cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
+ cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
+ cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
+ cmp = _mm_cmpistri (frag2, frag1, 0x0c);
+ if ((len + cmp) <= 16)
+ return (char *) p1 + cmp;
+ }
+
+ if (cmp_s)
+ {
+ /* Adjust addr for 16B alginment in ensuing loop. */
+ while (!cmp_z)
+ {
+ p1 += cmp;
+ /* Load up to 16 bytes of fragment. */
+ frag1 = strloadu (p1);
+ cmp = _mm_cmpistri (frag2, frag1, 0x0c);
+ cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
+ cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
+ /* Because s2 < 16 bytes and we adjusted p1 by non-zero cmp
+ once already, this time cmp will be zero and we can exit. */
+ if ((!cmp) & cmp_c)
+ break;
+ }
+
+ if (!cmp_c)
+ return NULL;
+
+ /* Since s2 is less than 16 bytes, com_c is definitive
+ determination of full match. */
+ return (char *) p1 + cmp;
+ }
+
+ /* General case, s2 is at least 16 bytes or more.
+ First, the common case of false-match at first byte of p2. */
+ const unsigned char *pt = NULL;
+ int kmp_fwd = 0;
+re_trace:
+ while (!cmp_c)
+ {
+ /* frag1 has null. */
+ if (cmp_z)
+ return NULL;
+
+ /* frag 1 has no null, advance 16 bytes. */
+ p1 += 16;
+ /* Load up to 16 bytes of fragment. */
+ frag1 = strloadu (p1);
+ /* Unsigned bytes, equal order, is there a partial match? */
+ cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
+ cmp = _mm_cmpistri (frag2, frag1, 0x0c);
+ cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
+ }
+
+ /* Next, handle initial positive match as first byte of p2. We have
+ a partial fragment match, make full determination until we reached
+ end of s2. */
+ if (!cmp)
+ {
+ if (cmp_z)
+ return (char *) p1;
+
+ pt = p1;
+ p1 += 16;
+ p2 += 16;
+ /* Load up to 16 bytes of fragment. */
+ frag2 = strloadu (p2);
+ }
+ else
+ {
+ /* Adjust 16B alignment. */
+ p1 += cmp;
+ pt = p1;
+ }
+
+ /* Load up to 16 bytes of fragment. */
+ frag1 = strloadu (p1);
+
+ /* Unsigned bytes, equal order, does frag2 has null? */
+ cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
+ cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
+ cmp = _mm_cmpistri (frag2, frag1, 0x0c);
+ cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
+ while (!(cmp | cmp_z | cmp_s))
+ {
+ p1 += 16;
+ p2 += 16;
+ /* Load up to 16 bytes of fragment. */
+ frag2 = strloadu (p2);
+ /* Load up to 16 bytes of fragment. */
+ frag1 = strloadu (p1);
+ /* Unsigned bytes, equal order, does frag2 has null? */
+ cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
+ cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
+ cmp = _mm_cmpistri (frag2, frag1, 0x0c);
+ cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
+ }
+
+ /* Full determination yielded a false result, retrace s1 to next
+ starting position.
+ Zflg 1 0 1 0/1
+ Sflg 0 1 1 0/1
+ cmp na 0 0 >0
+ action done done continue continue if s2 < s1
+ false match retrace s1 else false
+ */
+
+ if (cmp_s & !cmp)
+ return (char *) pt;
+ if (cmp_z)
+ {
+ if (!cmp_s)
+ return NULL;
+
+ /* Handle both zero and sign flag set and s1 is shorter in
+ length. */
+ int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (zero, frag2));
+ int bmsk1 = _mm_movemask_epi8 (_mm_cmpeq_epi8 (zero, frag1));
+ int len;
+ int len1;
+ __asm ("bsfl %[bmsk], %[len]"
+ : [len] "=r" (len) : [bmsk] "r" (bmsk));
+ __asm ("bsfl %[bmsk1], %[len1]"
+ : [len1] "=r" (len1) : [bmsk1] "r" (bmsk1));
+ if (len >= len1)
+ return NULL;
+ }
+ else if (!cmp)
+ return (char *) pt;
+
+ /* Otherwise, we have to retrace and continue. Default of multiple
+ paths that need to retrace from next byte in s1. */
+ p2 = s2;
+ frag2 = strloadu (p2);
+
+ if (!kmp_fwd)
+ kmp_fwd = KMP16Bovrlap (frag2);
+
+ /* KMP algorithm predicted overlap needs to be corrected for
+ partial fragment compare. */
+ p1 = pt + (kmp_fwd > cmp ? cmp : kmp_fwd);
+
+ /* Since s2 is at least 16 bytes long, we're certain there is no
+ match. */
+ if (p1[0] == '\0')
+ return NULL;
+
+ /* Load up to 16 bytes of fragment. */
+ frag1 = strloadu (p1);
+
+ /* Unsigned bytes, equal order, is there a partial match? */
+ cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
+ cmp = _mm_cmpistri (frag2, frag1, 0x0c);
+ cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
+ goto re_trace;
+}
diff --git a/sysdeps/x86_64/multiarch/strstr_sse2.c b/sysdeps/x86_64/multiarch/strstr_sse2.c
new file mode 100644
index 0000000..ab95587
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strstr_sse2.c
@@ -0,0 +1,4 @@
+static const int small_treshold=128;
+#define STRSTR __strstr_sse2
+#define AS_STRSTR
+#include "strstr.h"
diff --git a/sysdeps/x86_64/multiarch/vector.h b/sysdeps/x86_64/multiarch/vector.h
new file mode 100644
index 0000000..5ac92c2
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/vector.h
@@ -0,0 +1,17 @@
+typedef unsigned char uchar;
+
+#ifdef SCALAR
+#include "scalar.h"
+#else
+#ifdef ARITHMETIC
+#include "arithmetic.h"
+#else
+#ifdef USE_SSSE3
+#include "ssse3.h"
+#else
+#include "sse2.h"
+#endif
+#endif
+#endif
+
+
--
1.7.4.4
More information about the Libc-help
mailing list