[RFC] unpolished strstr implementation

Ondřej Bílka <neleai@seznam.cz>
Tue Jul 3 04:07:00 GMT 2012


Hello,

I wrote a working but unpolished implementation of strstr that
combines the two-way algorithm with a digraph search.
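
To illustrate the digraph idea, here is a minimal sketch (the helper
name and the unaligned loads are only illustrative; the patch itself
works on aligned 16-byte blocks and concatenates neighbouring vectors
instead of over-reading):

#include <emmintrin.h>

/* Bit i of the result is set when hay[i] == c1 and hay[i + 1] == c2,
   i.e. a candidate position for the chosen needle digraph.  Reads 17
   bytes starting at hay, so the caller must make that safe.  */
static int
digraph_mask (const unsigned char *hay, unsigned char c1, unsigned char c2)
{
  __m128i v0 = _mm_loadu_si128 ((const __m128i *) hay);
  __m128i v1 = _mm_loadu_si128 ((const __m128i *) (hay + 1));
  __m128i e0 = _mm_cmpeq_epi8 (v0, _mm_set1_epi8 (c1));
  __m128i e1 = _mm_cmpeq_epi8 (v1, _mm_set1_epi8 (c2));
  return _mm_movemask_epi8 (_mm_and_si128 (e0, e1));
}

Candidates from such a mask are verified with strcmp_dir, and once too
much work has been spent the search switches to the two-way loop.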

The benchmark is at the usual place: http://kam.mff.cuni.cz/~ondra/benchmark_string/i7/strstr/html/test_r.html

For performance I moved the main loop into a platform-specific
loop_sse.h.  The arithmetic variant will probably need a separate loop.

The next technical issue is that changing strstr-c to the sse2 version
breaks the ifunc, so I added sse2 as a separate implementation.


There are three possible optimizations.  The first is to also search
for the last two characters in the two-way phase, so we do not have to
worry about termination.

The second is, with sse4.2, to use the mask from _mm_cmpistrm to also
rule out the next 16 positions.
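
Roughly like this (the SSE4.2 _SIDD_* constants are the standard ones
from nmmintrin.h; the helper name is only for illustration):

#include <nmmintrin.h>

/* One pcmpistrm over a 16-byte haystack fragment: bit i of the result
   is set when the needle fragment ordered-matches starting at byte i,
   so the clear bits rule out up to 16 candidate positions at once.  */
static int
ordered_match_mask (__m128i needle_frag, __m128i hay_frag)
{
  __m128i m = _mm_cmpistrm (needle_frag, hay_frag,
                            _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ORDERED
                            | _SIDD_BIT_MASK);
  return _mm_cvtsi128_si32 (m);
}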

The third is a better strcmp_dir.

Tomorrow I plan to add memmem and strcasestr.


---
 sysdeps/x86_64/multiarch/Makefile              |    4 +-
 sysdeps/x86_64/multiarch/loop.h                |    1 +
 sysdeps/x86_64/multiarch/loop_sse.h            |  100 ++++++
 sysdeps/x86_64/multiarch/sse2.h                |   95 ++++++
 sysdeps/x86_64/multiarch/ssse3.h               |    5 +
 sysdeps/x86_64/multiarch/strcasestr-nonascii.c |    2 +-
 sysdeps/x86_64/multiarch/strcasestr.c          |    2 +-
 sysdeps/x86_64/multiarch/strstr-c.c            |   12 +-
 sysdeps/x86_64/multiarch/strstr.c              |  389 +-----------------------
 sysdeps/x86_64/multiarch/strstr.h              |  215 +++++++++++++
 sysdeps/x86_64/multiarch/strstr_old.h          |  384 +++++++++++++++++++++++
 sysdeps/x86_64/multiarch/strstr_sse2.c         |    4 +
 sysdeps/x86_64/multiarch/vector.h              |   17 +
 13 files changed, 837 insertions(+), 393 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/loop.h
 create mode 100644 sysdeps/x86_64/multiarch/loop_sse.h
 create mode 100644 sysdeps/x86_64/multiarch/sse2.h
 create mode 100644 sysdeps/x86_64/multiarch/ssse3.h
 create mode 100644 sysdeps/x86_64/multiarch/strstr.h
 create mode 100644 sysdeps/x86_64/multiarch/strstr_old.h
 create mode 100644 sysdeps/x86_64/multiarch/strstr_sse2.c
 create mode 100644 sysdeps/x86_64/multiarch/vector.h

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index dd6c27d..7f2c8cd 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -19,12 +19,12 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
 		   strnlen-sse2-no-bsf strrchr-sse2-no-bsf strchr-sse2-no-bsf \
 		   memcmp-ssse3
 ifeq (yes,$(config-cflags-sse4))
-sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift
+sysdep_routines += strcspn-c strpbrk-c strspn-c strstr_sse2 strstr-c strcasestr-c varshift
 CFLAGS-varshift.c += -msse4
 CFLAGS-strcspn-c.c += -msse4
 CFLAGS-strpbrk-c.c += -msse4
 CFLAGS-strspn-c.c += -msse4
-CFLAGS-strstr.c += -msse4
+CFLAGS-strstr.c += -mssse3 
 CFLAGS-strcasestr.c += -msse4
 CFLAGS-strcasestr-nonascii.c += -msse4
 endif
diff --git a/sysdeps/x86_64/multiarch/loop.h b/sysdeps/x86_64/multiarch/loop.h
new file mode 100644
index 0000000..6862d6a
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/loop.h
@@ -0,0 +1 @@
+#include "loop_sse.h"
diff --git a/sysdeps/x86_64/multiarch/loop_sse.h b/sysdeps/x86_64/multiarch/loop_sse.h
new file mode 100644
index 0000000..d5567c2
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/loop_sse.h
@@ -0,0 +1,100 @@
+#ifdef DETECT_ZERO_BYTE
+  #define _DETECT_ZERO_BYTE mvec= OR(mvec,TEST_ZERO(sn));
+  #define _TEST_ZERO_BYTE (*p==0)
+#else
+  #define _DETECT_ZERO_BYTE 
+  #define _TEST_ZERO_BYTE 0 
+#endif
+#ifdef DETECT_END
+  #define _DETECT_END(u) if (DETECT_END<=s2+u*BYTES_AT_ONCE) mask |= ((tp_mask)1)<<(DETECT_END-s2);
+  #define _TEST_END (p==DETECT_END)
+#else
+  #define _DETECT_END(u)
+  #define _TEST_END 0
+#endif
+
+   #define TEST(u) \
+     mvec=vzero;\
+     so=sn;\
+     sn=LOAD(s2+u*16);\
+     mvec    = TEST_CODE(so,sn); \
+     _DETECT_ZERO_BYTE;\
+     mvec##u = mvec;
+
+   int i;
+   tp_vector vzero=BROADCAST_ZERO();
+   tp_vector sn,so;
+   int s_offset; uchar* s2;
+   sn=vzero;
+   ALIGN(s,4);
+   tp_mask forget_offset=((tp_mask)-1)<<s_offset;
+   tp_vector mvec; tp_vector mvec0,mvec1,mvec2,mvec3;
+   tp_mask mask,mask0,mask1,mask2,mask3;
+   
+   TEST(0);
+   mask0=get_mask(mvec0);
+   TEST(1);
+   mask1=get_mask(mvec1);
+   TEST(2);
+   mask2=get_mask(mvec2);
+   TEST(3);
+   mask3=get_mask(mvec3);
+   mask =(mask0|(mask1<<16))|((mask2|(mask3<<16))<<32);
+  //or-ing in this way minimizes data dependencies
+
+   _DETECT_END(4);
+   mask=mask&forget_offset;
+   if (mask) goto test;
+   start:;
+   while(1){
+     s2+=64;
+     PREFETCH(s2+512);
+     TEST(0);
+     TEST(1);
+     TEST(2);
+     TEST(3);
+     mask=0;
+     _DETECT_END(4);
+     if(get_mask(OR(OR(mvec0,mvec1),OR(mvec2,mvec3)))|mask){
+       // on x64 or is destructive operation
+       // in case of strlen it is faster to recalculate mvec0,mvec1 than move them to separate registers.
+       mask0=get_mask(mvec0);
+       mask1=get_mask(mvec1);
+       mask2=get_mask(mvec2);
+       mask3=get_mask(mvec3);
+       mask =(mask0|(mask1<<16))|((mask2|(mask3<<16))<<32);
+       _DETECT_END(4);
+       goto test;
+     }
+   }
+   test:;
+#ifdef CAN_SKIP
+     if(skip_to-s2>=64){ 
+       mask=0;
+     } else if (skip_to-s2>=0){
+       mask&=((tp_mask)-1)<<(skip_to-s2);
+     }
+#endif
+     while(mask){ i=first_bit(mask);
+        uchar *p=s2+i;
+        if(__builtin_expect(_TEST_ZERO_BYTE||_TEST_END,0)){
+          LOOP_END(p)
+        }
+        LOOP_BODY(p)
+#ifdef CAN_SKIP
+        if(skip_to-s2>=64){ 
+          mask=0;
+        } else if (skip_to-s2>=0){
+          mask&=((tp_mask)-1)<<(skip_to-s2);
+        }
+#else
+        mask=forget_first_bit(mask);
+#endif
+     }
+   goto start;
+
+
+  #undef CAN_SKIP
+
+#undef TEST_CODE
+#undef LOOP_BODY
diff --git a/sysdeps/x86_64/multiarch/sse2.h b/sysdeps/x86_64/multiarch/sse2.h
new file mode 100644
index 0000000..4b2adc0
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/sse2.h
@@ -0,0 +1,95 @@
+#include <stdint.h>
+#include <emmintrin.h>
+#include <stdlib.h>
+
+typedef __m128i tp_vector;
+typedef long tp_mask;
+#define SI static inline
+#define BYTES_AT_ONCE sizeof(tp_vector)
+#define PARA (BYTES_AT_ONCE*unroll)
+#define VSIZ_BYTE sizeof(tp_vector)
+#define VSIZ_BIT  (VSIZ_BYTE*8)
+#define MSIZ_BYTE sizeof(tp_mask)
+#define MSIZ_BIT  (MSIZ_BYTE*8)
+
+#define CACHE_LINE_SIZE 64
+#define PREFETCH(x)	_mm_prefetch(((char *)x),_MM_HINT_T0);
+#define ALIGN(x,u)         s_offset=((size_t) x)%((u)*BYTES_AT_ONCE);           s2=((uchar *)x)-s_offset;
+
+SI tp_mask get_mask(tp_vector x){  return  _mm_movemask_epi8(x); }
+
+SI tp_mask first_bit(tp_mask t){ return __builtin_ctzl(t);}
+SI tp_mask  last_bit(tp_mask t){ return __builtin_clzl(t);}
+
+SI tp_mask forget_first_bit(tp_mask t){return t&(t-1);}
+
+SI tp_mask get_bit(tp_mask x,int y){return (x&(((tp_mask)1)<<(y)));}
+SI tp_mask shift_down(tp_mask x,int y){ return x>>y;}
+SI tp_mask shift_up  (tp_mask x,int y){ return x<<y;}
+SI tp_mask forget_low_bits(tp_mask m,int b)
+{
+	if (b>=MSIZ_BIT) return 0;
+  return shift_up(shift_down(m,b),b);
+}
+SI tp_mask forget_high_bits(tp_mask m,int b)
+{
+	if (b>=MSIZ_BIT) return 0;
+  b=MSIZ_BIT-b;
+  return shift_down(shift_up(m,b),b);
+}
+
+
+
+
+
+SI tp_vector BYTE_AT(uchar c,int shift)
+{
+  return _mm_set_epi64x(((uint64_t)c)<<(8*shift),((uint64_t)c)<<(8*(shift-8)));
+}
+
+#define TEST_EQ  _mm_cmpeq_epi8
+#define TEST_ZERO(x) TEST_EQ(x,vzero)
+#define AND  _mm_and_si128
+#define ANDNOT(x,y) _mm_andnot_si128(y,x)
+#define OR   _mm_or_si128
+#define XOR  _mm_xor_si128
+#define SHIFT_DOWN _mm_srli_si128
+#define SHIFT_UP   _mm_slli_si128
+#define CONCAT(x,y,n) (n==0) ? (y) : ((n==BYTES_AT_ONCE) ? (x) : OR(SHIFT_UP(x,BYTES_AT_ONCE-(n)),SHIFT_DOWN(y,(n))))
+SI tp_vector BROADCAST(uchar c)
+{
+  return _mm_set1_epi8(c);
+}
+
+SI tp_vector BROADCAST_ZERO(void){
+	tp_vector m;
+	m=XOR(m,m);
+  return m;
+}
+#define find_zeros(x) get_mask(TEST_EQ(x,vzero))
+
+
+SI tp_vector TEST_RANGE(tp_vector v,uchar from,uchar to){
+	tp_vector fv=BROADCAST(-127-from);
+	v=_mm_add_epi8(v,fv);
+	tp_vector tv=BROADCAST(-127+to-from+1);
+	return _mm_cmplt_epi8(v,tv);
+}
+#ifdef CASE_INSENSITIVE
+SI tp_vector LOAD(tp_vector* x){tp_mask mask;
+	tp_vector high_bit=BROADCAST(128);
+	tp_vector m=_mm_load_si128(x);
+  tp_vector l= AND(TEST_RANGE(m,'A','Z'),high_bit);
+	m=OR(m,_mm_srli_epi64(l,2));
+	if (mask=get_mask(m)){int i;
+    enum_bits_start(mask,i)
+			((uchar*)&m)[i]=tolower(((uchar*)&m)[i]);
+    enum_bits_end
+	}
+	return m;
+}
+#else
+#define LOAD(x) _mm_load_si128((tp_vector*)(x))
+#define LOAD_UNALIGNED(x) _mm_loadu_si128(x)
+#endif
+
diff --git a/sysdeps/x86_64/multiarch/ssse3.h b/sysdeps/x86_64/multiarch/ssse3.h
new file mode 100644
index 0000000..b6b87fd
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/ssse3.h
@@ -0,0 +1,5 @@
+#include <tmmintrin.h>
+#include "sse2.h"
+#undef CONCAT
+
+#define CONCAT(x,y,n) ((n==0) ? (y) : ((n==BYTES_AT_ONCE) ? (x) : _mm_alignr_epi8(x,y,n)))
diff --git a/sysdeps/x86_64/multiarch/strcasestr-nonascii.c b/sysdeps/x86_64/multiarch/strcasestr-nonascii.c
index a1f9968..0bbca7e 100644
--- a/sysdeps/x86_64/multiarch/strcasestr-nonascii.c
+++ b/sysdeps/x86_64/multiarch/strcasestr-nonascii.c
@@ -46,4 +46,4 @@ __m128i_strloadu_tolower (const unsigned char *p)
 #define STRCASESTR_NONASCII
 #define USE_AS_STRCASESTR
 #define STRSTR_SSE42 __strcasestr_sse42_nonascii
-#include "strstr.c"
+#include "strstr_old.h"
diff --git a/sysdeps/x86_64/multiarch/strcasestr.c b/sysdeps/x86_64/multiarch/strcasestr.c
index d1cfb3b..21ae683 100644
--- a/sysdeps/x86_64/multiarch/strcasestr.c
+++ b/sysdeps/x86_64/multiarch/strcasestr.c
@@ -4,4 +4,4 @@ extern char *__strcasestr_sse42_nonascii (const unsigned char *s1,
 
 #define USE_AS_STRCASESTR
 #define STRSTR_SSE42 __strcasestr_sse42
-#include "strstr.c"
+#include "strstr_old.h"
diff --git a/sysdeps/x86_64/multiarch/strstr-c.c b/sysdeps/x86_64/multiarch/strstr-c.c
index b8ed316..56e8094 100644
--- a/sysdeps/x86_64/multiarch/strstr-c.c
+++ b/sysdeps/x86_64/multiarch/strstr-c.c
@@ -1,15 +1,17 @@
 #include "init-arch.h"
 
-#define STRSTR __strstr_sse2
+#define STRSTR __strstr_nosse
 #ifdef SHARED
 # undef libc_hidden_builtin_def
 # define libc_hidden_builtin_def(name) \
-  __hidden_ver1 (__strstr_sse2, __GI_strstr, __strstr_sse2);
+  __hidden_ver1 (__strstr_nosse, __GI_strstr, __strstr_nosse);
 #endif
 
 #include "string/strstr.c"
 
-extern char *__strstr_sse42 (const char *, const char *) attribute_hidden;
-extern __typeof (__strstr_sse2) __strstr_sse2 attribute_hidden;
+extern char *__strstr_ssse3 (const char *, const char *) attribute_hidden;
+extern char *__strstr_sse2 (const char *, const char *) attribute_hidden;
 
-libc_ifunc (strstr, HAS_SSE4_2 ? __strstr_sse42 : __strstr_sse2);
+extern __typeof (__strstr_nosse) __strstr_nosse attribute_hidden;
+
+libc_ifunc (strstr, HAS_SSSE3 ? __strstr_ssse3 : (HAS_SSE2 ? __strstr_sse2 : __strstr_nosse));
diff --git a/sysdeps/x86_64/multiarch/strstr.c b/sysdeps/x86_64/multiarch/strstr.c
index b1b4139..dd6e655 100644
--- a/sysdeps/x86_64/multiarch/strstr.c
+++ b/sysdeps/x86_64/multiarch/strstr.c
@@ -1,384 +1,5 @@
-/* strstr with SSE4.2 intrinsics
-   Copyright (C) 2009, 2010, 2011 Free Software Foundation, Inc.
-   Contributed by Intel Corporation.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <nmmintrin.h>
-#include "varshift.h"
-
-#ifndef STRSTR_SSE42
-# define STRSTR_SSE42 __strstr_sse42
-#endif
-
-#ifdef USE_AS_STRCASESTR
-# include <ctype.h>
-# include <locale/localeinfo.h>
-
-# define LOADBYTE(C)		tolower (C)
-# define CMPBYTE(C1, C2)	(tolower (C1) == tolower (C2))
-#else
-# define LOADBYTE(C)		(C)
-# define CMPBYTE(C1, C2)	((C1) == (C2))
-#endif
-
-/* We use 0xe ordered-compare:
-	_SIDD_SBYTE_OPS
-	| _SIDD_CMP_EQUAL_ORDER
-	| _SIDD_LEAST_SIGNIFICANT
-   on pcmpistri to do the scanning and string comparsion requirements of
-   sub-string match.  In the scanning phase, we process Cflag and ECX
-   index to locate the first fragment match; once the first fragment
-   match position has been identified, we do comparison of subsequent
-   string fragments until we can conclude false or true match; whe
-   n concluding a false match, we may need to repeat scanning process
-   from next relevant offset in the target string.
-
-   In the scanning phase we have 4 cases:
-   case		ECX	CFlag	ZFlag	SFlag
-    1		16	  0	  0	  0
-    2a		16	  0	  0	  1
-    2b		16	  0	  1	  0
-    2c		16	  0	  1	  1
-
-   1. No ordered-comparison match, both 16B fragments are valid, so
-      continue to next fragment.
-   2. No ordered-comparison match, there is EOS in either fragment,
-   2a. Zflg = 0, Sflg = 1, we continue
-   2b. Zflg = 1, Sflg = 0, we conclude no match and return.
-   2c. Zflg = 1, sflg = 1, lenth determine match or no match
-
-   In the string comparison phase, the 1st fragment match is fixed up
-   to produce ECX = 0.  Subsequent fragment compare of nonzero index
-   and no match conclude a false match.
-
-   case		ECX	CFlag	ZFlag	SFlag
-    3		 X	  1	  0	  0/1
-    4a		 0	  1	  0	  0
-    4b		 0	  1	  0	  1
-    4c		0 < X	  1	  0	  0/1
-    5		16	  0	  1	  0
-
-   3. An initial ordered-comparison fragment match, we fix up to do
-      subsequent string comparison
-   4a. Continuation of fragment comparison of a string compare.
-   4b. EOS reached in the reference string, we conclude true match and
-       return
-   4c. String compare failed if index is nonzero, we need to go back to
-       scanning
-   5.  failed string compare, go back to scanning
- */
-
-/* Simple replacement of movdqu to address 4KB boundary cross issue.
-   If EOS occurs within less than 16B before 4KB boundary, we don't
-   cross to next page.  */
-
-static inline __m128i
-__m128i_strloadu (const unsigned char * p, __m128i zero)
-{
-  if (__builtin_expect ((int) ((size_t) p & 0xfff) > 0xff0, 0))
-    {
-      size_t offset = ((size_t) p & (16 - 1));
-      __m128i a = _mm_load_si128 ((__m128i *) (p - offset));
-      int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (a, zero));
-      if ((bmsk >> offset) != 0)
-	return __m128i_shift_right (a, offset);
-    }
-  return _mm_loadu_si128 ((__m128i *) p);
-}
-
-#if defined USE_AS_STRCASESTR && !defined STRCASESTR_NONASCII
-
-/* Similar to __m128i_strloadu.  Convert to lower case for POSIX/C
-   locale and other which have single-byte letters only in the ASCII
-   range.  */
-static inline __m128i
-__m128i_strloadu_tolower (const unsigned char *p, __m128i zero, __m128i uclow,
-			  __m128i uchigh, __m128i lcqword)
-{
-  __m128i frag = __m128i_strloadu (p, zero);
-
-  /* Compare if 'Z' > bytes. Inverted way to get a mask for byte <= 'Z'.  */
-  __m128i r2 = _mm_cmpgt_epi8 (uchigh, frag);
-  /* Compare if bytes are > 'A' - 1.  */
-  __m128i r1 = _mm_cmpgt_epi8 (frag, uclow);
-  /* Mask byte == ff if byte(r2) <= 'Z' and byte(r1) > 'A' - 1.  */
-  __m128i mask = _mm_and_si128 (r2, r1);
-  /* Apply lowercase bit 6 mask for above mask bytes == ff.  */
-  return _mm_or_si128 (frag, _mm_and_si128 (mask, lcqword));
-}
-
-#endif
-
-/* Calculate Knuth-Morris-Pratt string searching algorithm (or KMP
-   algorithm) overlap for a fully populated 16B vector.
-   Input parameter: 1st 16Byte loaded from the reference string of a
-		    strstr function.
-   We don't use KMP algorithm if reference string is less than 16B.  */
-static int
-__inline__ __attribute__ ((__always_inline__,))
-KMP16Bovrlap (__m128i s2)
-{
-  __m128i b = _mm_unpacklo_epi8 (s2, s2);
-  __m128i a = _mm_unpacklo_epi8 (b, b);
-  a = _mm_shuffle_epi32 (a, 0);
-  b = _mm_srli_si128 (s2, sizeof (char));
-  int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (b, a));
-
-  /* _BitScanForward(&k1, bmsk); */
-  int k1;
-  __asm ("bsfl %[bmsk], %[k1]" : [k1] "=r" (k1) : [bmsk] "r" (bmsk));
-  if (!bmsk)
-    return 16;
-  else if (bmsk == 0x7fff)
-    return 1;
-  else if (!k1)
-    {
-      /* There are al least two distinct chars in s2.  If byte 0 and 1 are
-	 idential and the distinct value lies farther down, we can deduce
-	 the next byte offset to restart full compare is least no earlier
-	 than byte 3.  */
-      return 3;
-    }
-  else
-    {
-      /* Byte 1 is not degenerated to byte 0.  */
-      return k1 + 1;
-    }
-}
-
-char *
-__attribute__ ((section (".text.sse4.2")))
-STRSTR_SSE42 (const unsigned char *s1, const unsigned char *s2)
-{
-#define p1 s1
-  const unsigned char *p2 = s2;
-
-#ifndef STRCASESTR_NONASCII
-  if (__builtin_expect (p2[0] == '\0', 0))
-    return (char *) p1;
-
-  if (__builtin_expect (p1[0] == '\0', 0))
-    return NULL;
-
-  /* Check if p1 length is 1 byte long.  */
-  if (__builtin_expect (p1[1] == '\0', 0))
-    return p2[1] == '\0' && CMPBYTE (p1[0], p2[0]) ? (char *) p1 : NULL;
-#endif
-
-#ifdef USE_AS_STRCASESTR
-# ifndef STRCASESTR_NONASCII
-  if (__builtin_expect (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_NONASCII_CASE)
-			!= 0, 0))
-    return __strcasestr_sse42_nonascii (s1, s2);
-
-  const __m128i uclow = _mm_set1_epi8 (0x40);
-  const __m128i uchigh = _mm_set1_epi8 (0x5b);
-  const __m128i lcqword = _mm_set1_epi8 (0x20);
-  const __m128i zero = _mm_setzero_si128 ();
-#  define strloadu(p) __m128i_strloadu_tolower (p, zero, uclow, uchigh, lcqword)
-# else
-#  define strloadu __m128i_strloadu_tolower
-#  define zero _mm_setzero_si128 ()
-# endif
-#else
-# define strloadu(p) __m128i_strloadu (p, zero)
-  const __m128i zero = _mm_setzero_si128 ();
-#endif
-
-  /* p1 > 1 byte long.  Load up to 16 bytes of fragment.  */
-  __m128i frag1 = strloadu (p1);
-
-  __m128i frag2;
-  if (p2[1] != '\0')
-    /* p2 is > 1 byte long.  */
-    frag2 = strloadu (p2);
-  else
-    frag2 = _mm_insert_epi8 (zero, LOADBYTE (p2[0]), 0);
-
-  /* Unsigned bytes, equal order, does frag2 has null?  */
-  int cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
-  int cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
-  int cmp = _mm_cmpistri (frag2, frag1, 0x0c);
-  int cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
-  if (cmp_s & cmp_c)
-    {
-      int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (frag2, zero));
-      int len;
-      __asm ("bsfl %[bmsk], %[len]"
-	     : [len] "=r" (len) : [bmsk] "r" (bmsk));
-      p1 += cmp;
-      if ((len + cmp) <= 16)
-	return (char *) p1;
-
-      /* Load up to 16 bytes of fragment.  */
-      frag1 = strloadu (p1);
-      cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
-      cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
-      cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
-      cmp = _mm_cmpistri (frag2, frag1, 0x0c);
-      if ((len + cmp) <= 16)
-	return (char *) p1 + cmp;
-    }
-
-  if (cmp_s)
-    {
-      /* Adjust addr for 16B alginment in ensuing loop.  */
-      while (!cmp_z)
-	{
-	  p1 += cmp;
-	  /* Load up to 16 bytes of fragment.  */
-	  frag1 = strloadu (p1);
-	  cmp = _mm_cmpistri (frag2, frag1, 0x0c);
-	  cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
-	  cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
-	  /* Because s2 < 16 bytes and we adjusted p1 by non-zero cmp
-	     once already, this time cmp will be zero and we can exit.  */
-	  if ((!cmp) & cmp_c)
-	    break;
-	}
-
-      if (!cmp_c)
-	return NULL;
-
-      /* Since s2 is less than 16 bytes, com_c is definitive
-	 determination of full match.  */
-      return (char *) p1 + cmp;
-    }
-
-  /* General case, s2 is at least 16 bytes or more.
-     First, the common case of false-match at first byte of p2.  */
-  const unsigned char *pt = NULL;
-  int kmp_fwd = 0;
-re_trace:
-  while (!cmp_c)
-    {
-      /* frag1 has null. */
-      if (cmp_z)
-	return NULL;
-
-      /* frag 1 has no null, advance 16 bytes.  */
-      p1 += 16;
-      /* Load up to 16 bytes of fragment.  */
-      frag1 = strloadu (p1);
-      /* Unsigned bytes, equal order, is there a partial match?  */
-      cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
-      cmp = _mm_cmpistri (frag2, frag1, 0x0c);
-      cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
-    }
-
-  /* Next, handle initial positive match as first byte of p2.  We have
-     a partial fragment match, make full determination until we reached
-     end of s2.  */
-  if (!cmp)
-    {
-      if (cmp_z)
-	return (char *) p1;
-
-      pt = p1;
-      p1 += 16;
-      p2 += 16;
-      /* Load up to 16 bytes of fragment.  */
-      frag2 = strloadu (p2);
-    }
-  else
-    {
-      /* Adjust 16B alignment.  */
-      p1 += cmp;
-      pt = p1;
-    }
-
-  /* Load up to 16 bytes of fragment.  */
-  frag1 = strloadu (p1);
-
-  /* Unsigned bytes, equal order, does frag2 has null?  */
-  cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
-  cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
-  cmp = _mm_cmpistri (frag2, frag1, 0x0c);
-  cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
-  while (!(cmp | cmp_z | cmp_s))
-    {
-      p1 += 16;
-      p2 += 16;
-      /* Load up to 16 bytes of fragment.  */
-      frag2 = strloadu (p2);
-      /* Load up to 16 bytes of fragment.  */
-      frag1 = strloadu (p1);
-      /* Unsigned bytes, equal order, does frag2 has null?  */
-      cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
-      cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
-      cmp = _mm_cmpistri (frag2, frag1, 0x0c);
-      cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
-    }
-
-  /* Full determination yielded a false result, retrace s1 to next
-     starting position.
-     Zflg	1      0      1			0/1
-     Sflg	0      1      1			0/1
-     cmp	na     0      0			>0
-     action   done   done   continue    continue if s2 < s1
-	      false  match  retrace s1     else false
-   */
-
-  if (cmp_s & !cmp)
-    return (char *) pt;
-  if (cmp_z)
-    {
-      if (!cmp_s)
-	return NULL;
-
-      /* Handle both zero and sign flag set and s1 is shorter in
-	 length.  */
-      int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (zero, frag2));
-      int bmsk1 = _mm_movemask_epi8 (_mm_cmpeq_epi8 (zero, frag1));
-      int len;
-      int len1;
-      __asm ("bsfl %[bmsk], %[len]"
-	     : [len] "=r" (len) : [bmsk] "r" (bmsk));
-      __asm ("bsfl %[bmsk1], %[len1]"
-	     : [len1] "=r" (len1) : [bmsk1] "r" (bmsk1));
-      if (len >= len1)
-	return NULL;
-    }
-  else if (!cmp)
-    return (char *) pt;
-
-  /* Otherwise, we have to retrace and continue.  Default of multiple
-     paths that need to retrace from next byte in s1.  */
-  p2 = s2;
-  frag2 = strloadu (p2);
-
-  if (!kmp_fwd)
-    kmp_fwd = KMP16Bovrlap (frag2);
-
-  /* KMP algorithm predicted overlap needs to be corrected for
-     partial fragment compare.  */
-  p1 = pt + (kmp_fwd > cmp ? cmp : kmp_fwd);
-
-  /* Since s2 is at least 16 bytes long, we're certain there is no
-     match.  */
-  if (p1[0] == '\0')
-    return NULL;
-
-  /* Load up to 16 bytes of fragment.  */
-  frag1 = strloadu (p1);
-
-  /* Unsigned bytes, equal order, is there a partial match?  */
-  cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
-  cmp = _mm_cmpistri (frag2, frag1, 0x0c);
-  cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
-  goto re_trace;
-}
+static const int small_treshold=128;
+#define STRSTR __strstr_ssse3
+#define AS_STRSTR
+#define USE_SSSE3
+#include "strstr.h"
diff --git a/sysdeps/x86_64/multiarch/strstr.h b/sysdeps/x86_64/multiarch/strstr.h
new file mode 100644
index 0000000..f229223
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strstr.h
@@ -0,0 +1,215 @@
+#include <stdlib.h>
+#include <string.h>
+#include "vector.h"
+
+SI int max(int x,int y){ return x>y ? x : y; }
+SI int min(int x,int y){ return x<y ? x : y; }
+
+
+#define CHAR(x) *(x)
+static size_t strcmp_dir(const uchar *a,const uchar *b,size_t no,int dir){
+  int i;
+  for(i=0;i<no && CHAR(a)==CHAR(b);i++){a+=dir;b+=dir;}
+  return i;
+}
+
+
+/* Two way algorithm: CROCHEMORE M., PERRIN D., 1991, Two-way string-matching, Journal of the ACM 38(3):651-675.
+   Implementation based from http://www-igm.univ-mlv.fr/~lecroq/string/node26.html
+*/
+static void two_way_preprocessing(uchar *n,int ns,int *per2,int *ell2,int *peri);
+
+#ifdef AS_STRSTR
+   #define DETECT_ZERO_BYTE
+  #define _AS_STRSTR(x) x 
+  #define _AS_MEMMEM(x) 
+#endif
+#ifdef AS_MEMMEM
+  #define DETECT_END s+ss-ns+check
+  #define _AS_STRSTR(x) 
+  #define _AS_MEMMEM(x) x
+#endif
+static uchar *strstr_two_way(uchar *s, int ss, uchar *n, int ns)
+{
+
+   int fw,fwno,bw,bwno;
+   int ell,   per, peri,pos;
+   two_way_preprocessing(n,ns,&per,&ell,&peri);
+   int check=min(ell+2,ns-1);
+   uchar *skip_to=s+check;
+   tp_vector vn0=BROADCAST(n[check-0]);
+   tp_vector vn1=BROADCAST(n[check-1]);
+   tp_vector e0,e1;
+
+  #define TEST_CODE(so,sn) vzero;\
+     e0   = TEST_EQ(CONCAT(sn,so,BYTES_AT_ONCE-0),vn0); \
+     e1   = TEST_EQ(CONCAT(sn,so,BYTES_AT_ONCE-1),vn1); \
+     mvec = AND(e0,e1);
+
+  #define LOOP_BODY(p) \
+     p = p - check;\
+     pos = check + 1;\
+     fwno = ns - pos;\
+     fw = strcmp_dir(n + pos ,p + pos, fwno , 1);\
+     if (fw < fwno ){\
+       p += fw + 1;\
+     } else {\
+       bwno = ell + 1;\
+       bw = strcmp_dir(n + bwno - 1, p + bwno - 1, bwno, -1);\
+       if ( bw < bwno ){\
+         _AS_STRSTR(for(i=0;i<per;i++) if (!p[ns+i]) return NULL);\
+         p += per;\
+         if (peri){\
+           while(1){\
+             pos = max(ell, ns - per - 1) + 1;\
+             fwno = ns - pos;\
+             fw = strcmp_dir(n + pos ,p + pos, fwno , 1);\
+             if (fw < fwno ){\
+               p += fw + 1;\
+               break ;\
+             } else {\
+               bwno = ell - ns - per - 1;\
+               bw = strcmp_dir(n + ell, p + ell, bwno, -1);\
+               if ( bw < bwno ){\
+                 _AS_STRSTR(for(i=0;i<per;i++) if (!p[ns+i]) return NULL);\
+                 p += per;\
+               } else {\
+                 return p;\
+               }\
+             }\
+           }\
+         }\
+       } else {\
+         return p;\
+       }\
+     }\
+     skip_to = p + check;
+
+  #define LOOP_END(p) return NULL;
+  #define CAN_SKIP
+  #include "loop.h"
+}
+
+static uchar *strstr_vec( uchar *s,int ss,uchar *n,int ns){
+  int buy=8*ns+64,rent=0; 
+  tp_vector vn0=BROADCAST(n[ns-1-0]); 
+  tp_vector vn1=BROADCAST(n[ns-1-1]);
+  tp_vector e0,e1;
+
+  #define DETECT_ZERO_BYTE
+
+  #define TEST_CODE(so,sn) vzero;\
+     e0   = TEST_EQ(CONCAT(sn,so,BYTES_AT_ONCE-0),vn0); \
+     e1   = TEST_EQ(CONCAT(sn,so,BYTES_AT_ONCE-1),vn1); \
+     mvec = (AND(e0,e1));
+
+  #define LOOP_BODY(p) \
+    p = p - (ns - 1);\
+    if(p>= s){\
+      int checked=strcmp_dir(p+ns-1-2,n+ns-1-2,ns-2,-1); \
+      if (checked==ns-2) return p; \
+      rent+=checked;\
+      if(buy+2*(p-s)>rent)        return strstr_two_way(p,ss,n,ns);\
+    }
+
+  #define LOOP_END(p) return NULL;
+  #include "loop.h"
+}
+#undef TEST_CODE
+#undef LOOP_BODY
+#undef LOOP_END
+
+
+
+#ifdef AS_STRSTR
+  #define STRCHR(s,sn,c) strchr(s,c)
+  uchar *STRSTR(const uchar *s,const uchar *n)
+#endif
+#ifdef AS_MEMMEM
+  #define STRCHR(s,sn,c) strchr(s,c)
+ uchar *memmem(const uchar *s,size_t ss,const uchar *n,size_t ns)
+#endif
+  #define SWITCH_IMPLEMENTATION return strstr_vec((uchar*)p,s_end-p,(uchar*)n,ns);
+{
+  int buy=small_treshold,rent=0;
+#ifdef AS_MEMMEM
+  uchar *s_end=s+ss;
+  if( ns > ss) return NULL;
+#endif
+#ifdef AS_STRSTR
+  uchar *s_end=NULL;
+  int ns=0;
+  while(n[ns]){
+    if(!s[ns]) return NULL;
+    ns++;
+  }
+#endif
+  if (!ns) return (uchar*) s;
+  uchar *p=(uchar*)s;
+  p+=ns-1;
+  while(1){
+    p=(uchar*) STRCHR((char*)p,s_end-p,((char*)n)[ns-1]);
+    if(!p) return NULL;
+    p -= ns - 1;
+    int check = strcmp_dir(n + ns - 1 -1  ,p + ns - 1 -1, ns - 1 , -1);
+    if (check == ns - 1) return p;
+    rent+=check+32;
+    if(buy-(p-s)<rent) SWITCH_IMPLEMENTATION
+    p++;
+    p+= ns - 1;
+  }
+}
+
+
+static int maxSuf(uchar *x, int m, int *p, int invert) {
+   int ms, j, k;
+   uchar a, b;
+
+   ms = -1;
+   j = 0;
+   k = *p = 1;
+   while (j + k < m) {
+      a = CHAR(x + j + k);
+      b = CHAR(x + ms + k);
+      if (invert ? (a > b) : (a < b)) {
+         j += k;
+         k = 1;
+         *p = j - ms;
+      }
+      else
+         if (a == b)
+            if (k != *p)
+               ++k;
+            else {
+               j += *p;
+               k = 1;
+            }
+         else { /* a > b */
+            ms = j;
+            j = ms + 1;
+            k = *p = 1;
+         }
+   }
+   return(ms);
+}
+ 
+static int periodic(uchar *a,uchar *b,int siz){
+  return strcmp_dir(a,b,siz,1)==siz;
+}
+
+static void two_way_preprocessing(uchar *n,int ns,int *per2,int *ell2,int *peri){
+  int u,v,up,vp;
+  int per,ell;
+  u=maxSuf(n,ns,&up,0);
+  v=maxSuf(n,ns,&vp,1);
+  ell = (u > v) ? u :  v;
+  per = (u > v) ? up : vp;
+  *peri = periodic(n, n + per, ell + 1);
+  if (!*peri)
+    per = max(ell + 1, ns - ell - 1) + 1;
+  *per2=per;
+  *ell2=ell;
+}
+
+
+
diff --git a/sysdeps/x86_64/multiarch/strstr_old.h b/sysdeps/x86_64/multiarch/strstr_old.h
new file mode 100644
index 0000000..b1b4139
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strstr_old.h
@@ -0,0 +1,384 @@
+/* strstr with SSE4.2 intrinsics
+   Copyright (C) 2009, 2010, 2011 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <nmmintrin.h>
+#include "varshift.h"
+
+#ifndef STRSTR_SSE42
+# define STRSTR_SSE42 __strstr_sse42
+#endif
+
+#ifdef USE_AS_STRCASESTR
+# include <ctype.h>
+# include <locale/localeinfo.h>
+
+# define LOADBYTE(C)		tolower (C)
+# define CMPBYTE(C1, C2)	(tolower (C1) == tolower (C2))
+#else
+# define LOADBYTE(C)		(C)
+# define CMPBYTE(C1, C2)	((C1) == (C2))
+#endif
+
+/* We use 0xe ordered-compare:
+	_SIDD_SBYTE_OPS
+	| _SIDD_CMP_EQUAL_ORDER
+	| _SIDD_LEAST_SIGNIFICANT
+   on pcmpistri to do the scanning and string comparsion requirements of
+   sub-string match.  In the scanning phase, we process Cflag and ECX
+   index to locate the first fragment match; once the first fragment
+   match position has been identified, we do comparison of subsequent
+   string fragments until we can conclude false or true match; whe
+   n concluding a false match, we may need to repeat scanning process
+   from next relevant offset in the target string.
+
+   In the scanning phase we have 4 cases:
+   case		ECX	CFlag	ZFlag	SFlag
+    1		16	  0	  0	  0
+    2a		16	  0	  0	  1
+    2b		16	  0	  1	  0
+    2c		16	  0	  1	  1
+
+   1. No ordered-comparison match, both 16B fragments are valid, so
+      continue to next fragment.
+   2. No ordered-comparison match, there is EOS in either fragment,
+   2a. Zflg = 0, Sflg = 1, we continue
+   2b. Zflg = 1, Sflg = 0, we conclude no match and return.
+   2c. Zflg = 1, sflg = 1, lenth determine match or no match
+
+   In the string comparison phase, the 1st fragment match is fixed up
+   to produce ECX = 0.  Subsequent fragment compare of nonzero index
+   and no match conclude a false match.
+
+   case		ECX	CFlag	ZFlag	SFlag
+    3		 X	  1	  0	  0/1
+    4a		 0	  1	  0	  0
+    4b		 0	  1	  0	  1
+    4c		0 < X	  1	  0	  0/1
+    5		16	  0	  1	  0
+
+   3. An initial ordered-comparison fragment match, we fix up to do
+      subsequent string comparison
+   4a. Continuation of fragment comparison of a string compare.
+   4b. EOS reached in the reference string, we conclude true match and
+       return
+   4c. String compare failed if index is nonzero, we need to go back to
+       scanning
+   5.  failed string compare, go back to scanning
+ */
+
+/* Simple replacement of movdqu to address 4KB boundary cross issue.
+   If EOS occurs within less than 16B before 4KB boundary, we don't
+   cross to next page.  */
+
+static inline __m128i
+__m128i_strloadu (const unsigned char * p, __m128i zero)
+{
+  if (__builtin_expect ((int) ((size_t) p & 0xfff) > 0xff0, 0))
+    {
+      size_t offset = ((size_t) p & (16 - 1));
+      __m128i a = _mm_load_si128 ((__m128i *) (p - offset));
+      int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (a, zero));
+      if ((bmsk >> offset) != 0)
+	return __m128i_shift_right (a, offset);
+    }
+  return _mm_loadu_si128 ((__m128i *) p);
+}
+
+#if defined USE_AS_STRCASESTR && !defined STRCASESTR_NONASCII
+
+/* Similar to __m128i_strloadu.  Convert to lower case for POSIX/C
+   locale and other which have single-byte letters only in the ASCII
+   range.  */
+static inline __m128i
+__m128i_strloadu_tolower (const unsigned char *p, __m128i zero, __m128i uclow,
+			  __m128i uchigh, __m128i lcqword)
+{
+  __m128i frag = __m128i_strloadu (p, zero);
+
+  /* Compare if 'Z' > bytes. Inverted way to get a mask for byte <= 'Z'.  */
+  __m128i r2 = _mm_cmpgt_epi8 (uchigh, frag);
+  /* Compare if bytes are > 'A' - 1.  */
+  __m128i r1 = _mm_cmpgt_epi8 (frag, uclow);
+  /* Mask byte == ff if byte(r2) <= 'Z' and byte(r1) > 'A' - 1.  */
+  __m128i mask = _mm_and_si128 (r2, r1);
+  /* Apply lowercase bit 6 mask for above mask bytes == ff.  */
+  return _mm_or_si128 (frag, _mm_and_si128 (mask, lcqword));
+}
+
+#endif
+
+/* Calculate Knuth-Morris-Pratt string searching algorithm (or KMP
+   algorithm) overlap for a fully populated 16B vector.
+   Input parameter: 1st 16Byte loaded from the reference string of a
+		    strstr function.
+   We don't use KMP algorithm if reference string is less than 16B.  */
+static int
+__inline__ __attribute__ ((__always_inline__,))
+KMP16Bovrlap (__m128i s2)
+{
+  __m128i b = _mm_unpacklo_epi8 (s2, s2);
+  __m128i a = _mm_unpacklo_epi8 (b, b);
+  a = _mm_shuffle_epi32 (a, 0);
+  b = _mm_srli_si128 (s2, sizeof (char));
+  int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (b, a));
+
+  /* _BitScanForward(&k1, bmsk); */
+  int k1;
+  __asm ("bsfl %[bmsk], %[k1]" : [k1] "=r" (k1) : [bmsk] "r" (bmsk));
+  if (!bmsk)
+    return 16;
+  else if (bmsk == 0x7fff)
+    return 1;
+  else if (!k1)
+    {
+      /* There are al least two distinct chars in s2.  If byte 0 and 1 are
+	 idential and the distinct value lies farther down, we can deduce
+	 the next byte offset to restart full compare is least no earlier
+	 than byte 3.  */
+      return 3;
+    }
+  else
+    {
+      /* Byte 1 is not degenerated to byte 0.  */
+      return k1 + 1;
+    }
+}
+
+char *
+__attribute__ ((section (".text.sse4.2")))
+STRSTR_SSE42 (const unsigned char *s1, const unsigned char *s2)
+{
+#define p1 s1
+  const unsigned char *p2 = s2;
+
+#ifndef STRCASESTR_NONASCII
+  if (__builtin_expect (p2[0] == '\0', 0))
+    return (char *) p1;
+
+  if (__builtin_expect (p1[0] == '\0', 0))
+    return NULL;
+
+  /* Check if p1 length is 1 byte long.  */
+  if (__builtin_expect (p1[1] == '\0', 0))
+    return p2[1] == '\0' && CMPBYTE (p1[0], p2[0]) ? (char *) p1 : NULL;
+#endif
+
+#ifdef USE_AS_STRCASESTR
+# ifndef STRCASESTR_NONASCII
+  if (__builtin_expect (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_NONASCII_CASE)
+			!= 0, 0))
+    return __strcasestr_sse42_nonascii (s1, s2);
+
+  const __m128i uclow = _mm_set1_epi8 (0x40);
+  const __m128i uchigh = _mm_set1_epi8 (0x5b);
+  const __m128i lcqword = _mm_set1_epi8 (0x20);
+  const __m128i zero = _mm_setzero_si128 ();
+#  define strloadu(p) __m128i_strloadu_tolower (p, zero, uclow, uchigh, lcqword)
+# else
+#  define strloadu __m128i_strloadu_tolower
+#  define zero _mm_setzero_si128 ()
+# endif
+#else
+# define strloadu(p) __m128i_strloadu (p, zero)
+  const __m128i zero = _mm_setzero_si128 ();
+#endif
+
+  /* p1 > 1 byte long.  Load up to 16 bytes of fragment.  */
+  __m128i frag1 = strloadu (p1);
+
+  __m128i frag2;
+  if (p2[1] != '\0')
+    /* p2 is > 1 byte long.  */
+    frag2 = strloadu (p2);
+  else
+    frag2 = _mm_insert_epi8 (zero, LOADBYTE (p2[0]), 0);
+
+  /* Unsigned bytes, equal order, does frag2 has null?  */
+  int cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
+  int cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
+  int cmp = _mm_cmpistri (frag2, frag1, 0x0c);
+  int cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
+  if (cmp_s & cmp_c)
+    {
+      int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (frag2, zero));
+      int len;
+      __asm ("bsfl %[bmsk], %[len]"
+	     : [len] "=r" (len) : [bmsk] "r" (bmsk));
+      p1 += cmp;
+      if ((len + cmp) <= 16)
+	return (char *) p1;
+
+      /* Load up to 16 bytes of fragment.  */
+      frag1 = strloadu (p1);
+      cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
+      cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
+      cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
+      cmp = _mm_cmpistri (frag2, frag1, 0x0c);
+      if ((len + cmp) <= 16)
+	return (char *) p1 + cmp;
+    }
+
+  if (cmp_s)
+    {
+      /* Adjust addr for 16B alginment in ensuing loop.  */
+      while (!cmp_z)
+	{
+	  p1 += cmp;
+	  /* Load up to 16 bytes of fragment.  */
+	  frag1 = strloadu (p1);
+	  cmp = _mm_cmpistri (frag2, frag1, 0x0c);
+	  cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
+	  cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
+	  /* Because s2 < 16 bytes and we adjusted p1 by non-zero cmp
+	     once already, this time cmp will be zero and we can exit.  */
+	  if ((!cmp) & cmp_c)
+	    break;
+	}
+
+      if (!cmp_c)
+	return NULL;
+
+      /* Since s2 is less than 16 bytes, com_c is definitive
+	 determination of full match.  */
+      return (char *) p1 + cmp;
+    }
+
+  /* General case, s2 is at least 16 bytes or more.
+     First, the common case of false-match at first byte of p2.  */
+  const unsigned char *pt = NULL;
+  int kmp_fwd = 0;
+re_trace:
+  while (!cmp_c)
+    {
+      /* frag1 has null. */
+      if (cmp_z)
+	return NULL;
+
+      /* frag 1 has no null, advance 16 bytes.  */
+      p1 += 16;
+      /* Load up to 16 bytes of fragment.  */
+      frag1 = strloadu (p1);
+      /* Unsigned bytes, equal order, is there a partial match?  */
+      cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
+      cmp = _mm_cmpistri (frag2, frag1, 0x0c);
+      cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
+    }
+
+  /* Next, handle initial positive match as first byte of p2.  We have
+     a partial fragment match, make full determination until we reached
+     end of s2.  */
+  if (!cmp)
+    {
+      if (cmp_z)
+	return (char *) p1;
+
+      pt = p1;
+      p1 += 16;
+      p2 += 16;
+      /* Load up to 16 bytes of fragment.  */
+      frag2 = strloadu (p2);
+    }
+  else
+    {
+      /* Adjust 16B alignment.  */
+      p1 += cmp;
+      pt = p1;
+    }
+
+  /* Load up to 16 bytes of fragment.  */
+  frag1 = strloadu (p1);
+
+  /* Unsigned bytes, equal order, does frag2 has null?  */
+  cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
+  cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
+  cmp = _mm_cmpistri (frag2, frag1, 0x0c);
+  cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
+  while (!(cmp | cmp_z | cmp_s))
+    {
+      p1 += 16;
+      p2 += 16;
+      /* Load up to 16 bytes of fragment.  */
+      frag2 = strloadu (p2);
+      /* Load up to 16 bytes of fragment.  */
+      frag1 = strloadu (p1);
+      /* Unsigned bytes, equal order, does frag2 has null?  */
+      cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
+      cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
+      cmp = _mm_cmpistri (frag2, frag1, 0x0c);
+      cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
+    }
+
+  /* Full determination yielded a false result, retrace s1 to next
+     starting position.
+     Zflg	1      0      1			0/1
+     Sflg	0      1      1			0/1
+     cmp	na     0      0			>0
+     action   done   done   continue    continue if s2 < s1
+	      false  match  retrace s1     else false
+   */
+
+  if (cmp_s & !cmp)
+    return (char *) pt;
+  if (cmp_z)
+    {
+      if (!cmp_s)
+	return NULL;
+
+      /* Handle both zero and sign flag set and s1 is shorter in
+	 length.  */
+      int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (zero, frag2));
+      int bmsk1 = _mm_movemask_epi8 (_mm_cmpeq_epi8 (zero, frag1));
+      int len;
+      int len1;
+      __asm ("bsfl %[bmsk], %[len]"
+	     : [len] "=r" (len) : [bmsk] "r" (bmsk));
+      __asm ("bsfl %[bmsk1], %[len1]"
+	     : [len1] "=r" (len1) : [bmsk1] "r" (bmsk1));
+      if (len >= len1)
+	return NULL;
+    }
+  else if (!cmp)
+    return (char *) pt;
+
+  /* Otherwise, we have to retrace and continue.  Default of multiple
+     paths that need to retrace from next byte in s1.  */
+  p2 = s2;
+  frag2 = strloadu (p2);
+
+  if (!kmp_fwd)
+    kmp_fwd = KMP16Bovrlap (frag2);
+
+  /* KMP algorithm predicted overlap needs to be corrected for
+     partial fragment compare.  */
+  p1 = pt + (kmp_fwd > cmp ? cmp : kmp_fwd);
+
+  /* Since s2 is at least 16 bytes long, we're certain there is no
+     match.  */
+  if (p1[0] == '\0')
+    return NULL;
+
+  /* Load up to 16 bytes of fragment.  */
+  frag1 = strloadu (p1);
+
+  /* Unsigned bytes, equal order, is there a partial match?  */
+  cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
+  cmp = _mm_cmpistri (frag2, frag1, 0x0c);
+  cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
+  goto re_trace;
+}
diff --git a/sysdeps/x86_64/multiarch/strstr_sse2.c b/sysdeps/x86_64/multiarch/strstr_sse2.c
new file mode 100644
index 0000000..ab95587
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strstr_sse2.c
@@ -0,0 +1,4 @@
+static const int small_treshold=128;
+#define STRSTR __strstr_sse2
+#define AS_STRSTR
+#include "strstr.h"
diff --git a/sysdeps/x86_64/multiarch/vector.h b/sysdeps/x86_64/multiarch/vector.h
new file mode 100644
index 0000000..5ac92c2
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/vector.h
@@ -0,0 +1,17 @@
+typedef unsigned char uchar;
+
+#ifdef SCALAR
+#include "scalar.h"
+#else
+#ifdef ARITHMETIC
+#include "arithmetic.h"
+#else
+#ifdef USE_SSSE3
+#include "ssse3.h"
+#else
+#include "sse2.h"
+#endif
+#endif
+#endif
+
+
-- 
1.7.4.4


