This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH] vectorized string functions


On Wed, Jul 11, 2012 at 09:23:37AM +0200, Andreas Jaeger wrote:
> On Wednesday, July 11, 2012 17:10:07 Ondřej Bílka wrote:
> > Now I am almost done with vectorized implementation of string
> > functions. I use single loop to get faster implementation of *len,
> > *chr and *str functions.
> 
> Ondrej, thanks for your contribution! Do you have a copyright assignment 
> in place for glibc? This is definitely needed for such a large piece of 
> code.

I already have.

> 
> Also, your code does not conform to our coding style at all, please read 
> the following wiki page for details 
> http://sourceware.org/glibc/wiki/Contribution%20checklist
> 
> I noticed especially:
> * overlong lines
> * no copyright headers
> * missing comments
> * comments that are not full sentences
> * wrong line formatting, missing spaces
> 
> This needs performance testing of all functions on a variety of 
> architectures. Have you done some of that already?

Tested only on x86-64. It should be easy to add a header for processors 
supporting AltiVec, but I have no experience with those.

Benchmark results are at the usual place:
http://kam.mff.cuni.cz/~ondra/benchmark_string/

Here is an updated version. I made several additional improvements.

One is that I now test for zero bytes with pminub. 
Another is that I simplified the two-way algorithm. 

---
 string/arit.h                                  |  179 ++++++++
 string/loop.h                                  |  195 +++++++++
 string/memchr.c                                |  168 +-------
 string/memmem.c                                |   64 +---
 string/memrchr.c                               |  167 +-------
 string/rawmemchr.c                             |  151 +-------
 string/str-two-way.h                           |  428 ------------------
 string/strcasestr.c                            |   85 +----
 string/strchr.c                                |  173 +-------
 string/strchr.h                                |   70 +++
 string/strchrnul.c                             |  142 +------
 string/strlen.c                                |   95 +----
 string/strlen.h                                |   39 ++
 string/strnlen.c                               |  162 +-------
 string/strrchr.c                               |   35 +--
 string/strstr.c                                |   74 +---
 string/strstr.h                                |  297 +++++++++++++
 string/strstr_vec.h                            |   52 +++
 string/vector.h                                |  120 +++++
 sysdeps/x86_64/memchr.S                        |  311 -------------
 sysdeps/x86_64/multiarch/Makefile              |   54 ++-
 sysdeps/x86_64/multiarch/gen_stub              |  111 +++++
 sysdeps/x86_64/multiarch/rawmemchr.S           |   97 ----
 sysdeps/x86_64/multiarch/strcasestr-c.c        |   16 -
 sysdeps/x86_64/multiarch/strcasestr-nonascii.c |   49 --
 sysdeps/x86_64/multiarch/strcasestr.c          |    7 -
 sysdeps/x86_64/multiarch/strnlen-sse2-no-bsf.S |    3 -
 sysdeps/x86_64/multiarch/strnlen.S             |   54 ---
 sysdeps/x86_64/multiarch/strrchr-sse2-no-bsf.S |  555 ------------------------
 sysdeps/x86_64/multiarch/strrchr.S             |  281 ------------
 sysdeps/x86_64/multiarch/strstr-c.c            |   15 -
 sysdeps/x86_64/multiarch/strstr.c              |  384 ----------------
 sysdeps/x86_64/sse.h                           |  161 +++++++
 sysdeps/x86_64/strchrnul.S                     |   62 ---
 sysdeps/x86_64/strnlen.S                       |   63 ---
 sysdeps/x86_64/strrchr.S                       |   80 ----
 37 files changed, 1349 insertions(+), 3661 deletions(-)
 create mode 100644 string/arit.h
 create mode 100644 string/loop.h
 delete mode 100644 string/str-two-way.h
 create mode 100644 string/strchr.h
 create mode 100644 string/strlen.h
 create mode 100644 string/strstr.h
 create mode 100644 string/strstr_vec.h
 create mode 100644 string/vector.h
 delete mode 100644 sysdeps/x86_64/memchr.S
 create mode 100755 sysdeps/x86_64/multiarch/gen_stub
 delete mode 100644 sysdeps/x86_64/multiarch/rawmemchr.S
 delete mode 100644 sysdeps/x86_64/multiarch/strcasestr-c.c
 delete mode 100644 sysdeps/x86_64/multiarch/strcasestr-nonascii.c
 delete mode 100644 sysdeps/x86_64/multiarch/strcasestr.c
 delete mode 100644 sysdeps/x86_64/multiarch/strnlen-sse2-no-bsf.S
 delete mode 100644 sysdeps/x86_64/multiarch/strnlen.S
 delete mode 100644 sysdeps/x86_64/multiarch/strrchr-sse2-no-bsf.S
 delete mode 100644 sysdeps/x86_64/multiarch/strrchr.S
 delete mode 100644 sysdeps/x86_64/multiarch/strstr-c.c
 delete mode 100644 sysdeps/x86_64/multiarch/strstr.c
 create mode 100644 sysdeps/x86_64/sse.h
 delete mode 100644 sysdeps/x86_64/strchrnul.S
 delete mode 100644 sysdeps/x86_64/strnlen.S
 delete mode 100644 sysdeps/x86_64/strrchr.S

diff --git a/string/arit.h b/string/arit.h
new file mode 100644
index 0000000..22a8ea5
--- /dev/null
+++ b/string/arit.h
@@ -0,0 +1,179 @@
+/* Copyright (C) 2012 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <stdint.h>
+#include "endian.h"
+#define unroll 4
+#if     __WORDSIZE == 64
+typedef uint64_t tp_vector;
+typedef uint64_t tp_mask;
+#elif   __WORDSIZE == 32
+typedef uint32_t tp_vector;
+typedef uint32_t tp_mask;
+#endif
+
+const tp_vector ONES=((~((tp_vector)0))/255);
+const tp_vector HIGH_BIT=(((~((tp_vector)0))/255)*0x80);
+
+SI tp_vector BROADCAST(uchar c)
+{
+  return ONES*c;
+}
+SI tp_vector LOAD(              uchar *x)
+{
+  return (*((tp_vector*)(x)));
+}
+SI tp_vector LOAD_UNALIGNED(    uchar *x)
+{ /* Return the vector stored at X; X need not be aligned to sizeof(tp_vector).  */
+  tp_vector v; __builtin_memcpy(&v,x,sizeof(v)); return v; /* avoids a misaligned, type-punned dereference */
+}
+
+#define PREFETCH(x)
+
+
+SI tp_mask get_mask(tp_vector x)
+{
+  return     x&HIGH_BIT;
+}
+SI int NONZERO_MASK(tp_vector x)
+{
+  return get_mask(x)!=0;
+}
+
+
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+SI tp_mask bit_i(int i)
+{
+  return ((tp_mask) 1)<<(8*(i%BYTES_AT_ONCE)+7-(i/BYTES_AT_ONCE) );
+}
+#elif __BYTE_ORDER == __BIG_ENDIAN
+SI tp_mask bit_i(int i)
+{
+  return ((tp_mask) 1)<<(8*(BYTES_AT_ONCE-1-i%BYTES_AT_ONCE)+7-(i/BYTES_AT_ONCE));
+}
+#endif
+
+#ifdef CALCULATE_MASK
+SI int  calculate_mask_before_after()
+{ /* Generator: prints the kill_before[] and kill_after[] tables as C source; returns 0.  */
+  int i,j;
+  printf("static tp_mask kill_before[]={");
+  for(j=0; j<8*BYTES_AT_ONCE; j++)
+    {
+      tp_mask mask=0;
+      for(i=j; i<8*BYTES_AT_ONCE; i++) mask|=bit_i(i); /* entry j keeps bits j and above */
+      printf("0x%llx,",(unsigned long long) mask); /* tp_mask is only 32 bits wide when __WORDSIZE == 32 */
+    }
+  printf("0};\n");
+  printf("static tp_mask kill_after[]={");
+  for(j=0; j<8*BYTES_AT_ONCE; j++)
+    {
+      tp_mask mask=0;
+      for(i=0; i<=j; i++) mask|=bit_i(i); /* entry j keeps bits 0 through j */
+      printf("0x%llx,",(unsigned long long) mask);
+    }
+  printf("0};\n"); return 0; /* the function is declared to return int */
+}
+#endif
+#if __WORDSIZE == 32
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+static tp_mask kill_before[]= {0xffffffff,0xffffff7f,0xffff7f7f,0xff7f7f7f,0x7f7f7f7f,0x7f7f7f3f,0x7f7f3f3f,0x7f3f3f3f,0x3f3f3f3f,0x3f3f3f1f,0x3f3f1f1f,0x3f1f1f1f,0x1f1f1f1f,0x1f1f1f0f,0x1f1f0f0f,0x1f0f0f0f,0xf0f0f0f,0xf0f0f07,0xf0f0707,0xf070707,0x7070707,0x7070703,0x7070303,0x7030303,0x3030303,0x3030301,0x3030101,0x3010101,0x1010101,0x1010100,0x1010000,0x1000000,0};
+static tp_mask kill_after[]= {0x80,0x8080,0x808080,0x80808080,0x808080c0,0x8080c0c0,0x80c0c0c0,0xc0c0c0c0,0xc0c0c0e0,0xc0c0e0e0,0xc0e0e0e0,0xe0e0e0e0,0xe0e0e0f0,0xe0e0f0f0,0xe0f0f0f0,0xf0f0f0f0,0xf0f0f0f8,0xf0f0f8f8,0xf0f8f8f8,0xf8f8f8f8,0xf8f8f8fc,0xf8f8fcfc,0xf8fcfcfc,0xfcfcfcfc,0xfcfcfcfe,0xfcfcfefe,0xfcfefefe,0xfefefefe,0xfefefeff,0xfefeffff,0xfeffffff,0xffffffff,0};
+#else
+static tp_mask kill_before[]= {0xffffffff,0x7fffffff,0x7f7fffff,0x7f7f7fff,0x7f7f7f7f,0x3f7f7f7f,0x3f3f7f7f,0x3f3f3f7f,0x3f3f3f3f,0x1f3f3f3f,0x1f1f3f3f,0x1f1f1f3f,0x1f1f1f1f,0xf1f1f1f,0xf0f1f1f,0xf0f0f1f,0xf0f0f0f,0x70f0f0f,0x7070f0f,0x707070f,0x7070707,0x3070707,0x3030707,0x3030307,0x3030303,0x1030303,0x1010303,0x1010103,0x1010101,0x10101,0x101,0x1,0};
+static tp_mask kill_after[]= {0x80000000,0x80800000,0x80808000,0x80808080,0xc0808080,0xc0c08080,0xc0c0c080,0xc0c0c0c0,0xe0c0c0c0,0xe0e0c0c0,0xe0e0e0c0,0xe0e0e0e0,0xf0e0e0e0,0xf0f0e0e0,0xf0f0f0e0,0xf0f0f0f0,0xf8f0f0f0,0xf8f8f0f0,0xf8f8f8f0,0xf8f8f8f8,0xfcf8f8f8,0xfcfcf8f8,0xfcfcfcf8,0xfcfcfcfc,0xfefcfcfc,0xfefefcfc,0xfefefefc,0xfefefefe,0xfffefefe,0xfffffefe,0xfffffffe,0xffffffff,0};
+#endif
+#elif __WORDSIZE == 64
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+static tp_mask kill_before[]= {0xffffffffffffffff,0xffffffffffffff7f,0xffffffffffff7f7f,0xffffffffff7f7f7f,0xffffffff7f7f7f7f,0xffffff7f7f7f7f7f,0xffff7f7f7f7f7f7f,0xff7f7f7f7f7f7f7f,0x7f7f7f7f7f7f7f7f,0x7f7f7f7f7f7f7f3f,0x7f7f7f7f7f7f3f3f,0x7f7f7f7f7f3f3f3f,0x7f7f7f7f3f3f3f3f,0x7f7f7f3f3f3f3f3f,0x7f7f3f3f3f3f3f3f,0x7f3f3f3f3f3f3f3f,0x3f3f3f3f3f3f3f3f,0x3f3f3f3f3f3f3f1f,0x3f3f3f3f3f3f1f1f,0x3f3f3f3f3f1f1f1f,0x3f3f3f3f1f1f1f1f,0x3f3f3f1f1f1f1f1f,0x3f3f1f1f1f1f1f1f,0x3f1f1f1f1f1f1f1f,0x1f1f1f1f1f1f1f1f,0x1f1f1f1f1f1f1f0f,0x1f1f1f1f1f1f0f0f,0x1f1f1f1f1f0f0f0f,0x1f1f1f1f0f0f0f0f,0x1f1f1f0f0f0f0f0f,0x1f1f0f0f0f0f0f0f,0x1f0f0f0f0f0f0f0f,0xf0f0f0f0f0f0f0f,0xf0f0f0f0f0f0f07,0xf0f0f0f0f0f0707,0xf0f0f0f0f070707,0xf0f0f0f07070707,0xf0f0f0707070707,0xf0f070707070707,0xf07070707070707,0x707070707070707,0x707070707070703,0x707070707070303,0x707070707030303,0x707070703030303,0x707070303030303,0x707030303030303,0x703030303030303,0x303030303030303,0x303030303030301,0x303030303030101,0x303030303010101,0x303030301010101,0x303030101010101,0x303010101010101,0x301010101010101,0x101010101010101,0x101010101010100,0x101010101010000,0x101010101000000,0x101010100000000,0x101010000000000,0x101000000000000,0x100000000000000,0};
+static tp_mask kill_after[]= {0x80,0x8080,0x808080,0x80808080,0x8080808080,0x808080808080,0x80808080808080,0x8080808080808080,0x80808080808080c0,0x808080808080c0c0,0x8080808080c0c0c0,0x80808080c0c0c0c0,0x808080c0c0c0c0c0,0x8080c0c0c0c0c0c0,0x80c0c0c0c0c0c0c0,0xc0c0c0c0c0c0c0c0,0xc0c0c0c0c0c0c0e0,0xc0c0c0c0c0c0e0e0,0xc0c0c0c0c0e0e0e0,0xc0c0c0c0e0e0e0e0,0xc0c0c0e0e0e0e0e0,0xc0c0e0e0e0e0e0e0,0xc0e0e0e0e0e0e0e0,0xe0e0e0e0e0e0e0e0,0xe0e0e0e0e0e0e0f0,0xe0e0e0e0e0e0f0f0,0xe0e0e0e0e0f0f0f0,0xe0e0e0e0f0f0f0f0,0xe0e0e0f0f0f0f0f0,0xe0e0f0f0f0f0f0f0,0xe0f0f0f0f0f0f0f0,0xf0f0f0f0f0f0f0f0,0xf0f0f0f0f0f0f0f8,0xf0f0f0f0f0f0f8f8,0xf0f0f0f0f0f8f8f8,0xf0f0f0f0f8f8f8f8,0xf0f0f0f8f8f8f8f8,0xf0f0f8f8f8f8f8f8,0xf0f8f8f8f8f8f8f8,0xf8f8f8f8f8f8f8f8,0xf8f8f8f8f8f8f8fc,0xf8f8f8f8f8f8fcfc,0xf8f8f8f8f8fcfcfc,0xf8f8f8f8fcfcfcfc,0xf8f8f8fcfcfcfcfc,0xf8f8fcfcfcfcfcfc,0xf8fcfcfcfcfcfcfc,0xfcfcfcfcfcfcfcfc,0xfcfcfcfcfcfcfcfe,0xfcfcfcfcfcfcfefe,0xfcfcfcfcfcfefefe,0xfcfcfcfcfefefefe,0xfcfcfcfefefefefe,0xfcfcfefefefefefe,0xfcfefefefefefefe,0xfefefefefefefefe,0xfefefefefefefeff,0xfefefefefefeffff,0xfefefefefeffffff,0xfefefefeffffffff,0xfefefeffffffffff,0xfefeffffffffffff,0xfeffffffffffffff,0xffffffffffffffff,0};
+#elif __BYTE_ORDER == __BIG_ENDIAN
+static tp_mask kill_before[]= {0xffffffffffffffff,0x7fffffffffffffff,0x7f7fffffffffffff,0x7f7f7fffffffffff,0x7f7f7f7fffffffff,0x7f7f7f7f7fffffff,0x7f7f7f7f7f7fffff,0x7f7f7f7f7f7f7fff,0x7f7f7f7f7f7f7f7f,0x3f7f7f7f7f7f7f7f,0x3f3f7f7f7f7f7f7f,0x3f3f3f7f7f7f7f7f,0x3f3f3f3f7f7f7f7f,0x3f3f3f3f3f7f7f7f,0x3f3f3f3f3f3f7f7f,0x3f3f3f3f3f3f3f7f,0x3f3f3f3f3f3f3f3f,0x1f3f3f3f3f3f3f3f,0x1f1f3f3f3f3f3f3f,0x1f1f1f3f3f3f3f3f,0x1f1f1f1f3f3f3f3f,0x1f1f1f1f1f3f3f3f,0x1f1f1f1f1f1f3f3f,0x1f1f1f1f1f1f1f3f,0x1f1f1f1f1f1f1f1f,0xf1f1f1f1f1f1f1f,0xf0f1f1f1f1f1f1f,0xf0f0f1f1f1f1f1f,0xf0f0f0f1f1f1f1f,0xf0f0f0f0f1f1f1f,0xf0f0f0f0f0f1f1f,0xf0f0f0f0f0f0f1f,0xf0f0f0f0f0f0f0f,0x70f0f0f0f0f0f0f,0x7070f0f0f0f0f0f,0x707070f0f0f0f0f,0x70707070f0f0f0f,0x7070707070f0f0f,0x707070707070f0f,0x70707070707070f,0x707070707070707,0x307070707070707,0x303070707070707,0x303030707070707,0x303030307070707,0x303030303070707,0x303030303030707,0x303030303030307,0x303030303030303,0x103030303030303,0x101030303030303,0x101010303030303,0x101010103030303,0x101010101030303,0x101010101010303,0x101010101010103,0x101010101010101,0x1010101010101,0x10101010101,0x101010101,0x1010101,0x10101,0x101,0x1,0};
+static tp_mask kill_after[]= {0x8000000000000000,0x8080000000000000,0x8080800000000000,0x8080808000000000,0x8080808080000000,0x8080808080800000,0x8080808080808000,0x8080808080808080,0xc080808080808080,0xc0c0808080808080,0xc0c0c08080808080,0xc0c0c0c080808080,0xc0c0c0c0c0808080,0xc0c0c0c0c0c08080,0xc0c0c0c0c0c0c080,0xc0c0c0c0c0c0c0c0,0xe0c0c0c0c0c0c0c0,0xe0e0c0c0c0c0c0c0,0xe0e0e0c0c0c0c0c0,0xe0e0e0e0c0c0c0c0,0xe0e0e0e0e0c0c0c0,0xe0e0e0e0e0e0c0c0,0xe0e0e0e0e0e0e0c0,0xe0e0e0e0e0e0e0e0,0xf0e0e0e0e0e0e0e0,0xf0f0e0e0e0e0e0e0,0xf0f0f0e0e0e0e0e0,0xf0f0f0f0e0e0e0e0,0xf0f0f0f0f0e0e0e0,0xf0f0f0f0f0f0e0e0,0xf0f0f0f0f0f0f0e0,0xf0f0f0f0f0f0f0f0,0xf8f0f0f0f0f0f0f0,0xf8f8f0f0f0f0f0f0,0xf8f8f8f0f0f0f0f0,0xf8f8f8f8f0f0f0f0,0xf8f8f8f8f8f0f0f0,0xf8f8f8f8f8f8f0f0,0xf8f8f8f8f8f8f8f0,0xf8f8f8f8f8f8f8f8,0xfcf8f8f8f8f8f8f8,0xfcfcf8f8f8f8f8f8,0xfcfcfcf8f8f8f8f8,0xfcfcfcfcf8f8f8f8,0xfcfcfcfcfcf8f8f8,0xfcfcfcfcfcfcf8f8,0xfcfcfcfcfcfcfcf8,0xfcfcfcfcfcfcfcfc,0xfefcfcfcfcfcfcfc,0xfefefcfcfcfcfcfc,0xfefefefcfcfcfcfc,0xfefefefefcfcfcfc,0xfefefefefefcfcfc,0xfefefefefefefcfc,0xfefefefefefefefc,0xfefefefefefefefe,0xfffefefefefefefe,0xfffffefefefefefe,0xfffffffefefefefe,0xfffffffffefefefe,0xfffffffffffefefe,0xfffffffffffffefe,0xfffffffffffffffe,0xffffffffffffffff,0};
+#endif
+#endif
+SI tp_mask first_bit(tp_mask t,int y)
+{
+  while (!(t&bit_i(y))) y++;
+  return y;
+}
+MASK_OP(forget_first_bit, x^bit_i(y))
+MASK_OP(forget_before   , x&((y>=PARA) ? 0 : kill_before[y]))
+MASK_OP(forget_after    , x&((y<0)     ? 0 : kill_after[ y]))
+
+
+BIN_OP(XOR,x^y)
+BIN_OP(OR,x|y)
+BIN_OP(AND,x&y)
+BIN_OP(ANDNOT,x&(~y))
+UN_OP(TEST_ZERO,(AND(~(OR(x,HIGH_BIT)-ONES),~(x)))) /* top bit of each byte is set iff that byte of x is 0; lower bits are not meaningful */
+BIN_OP(TEST_EQ,TEST_ZERO(XOR(x,y))) /* top bit of each byte is set iff x and y agree in that byte */
+
+#define SHIFT_DOWN(x,y) ((x)>>(8*(y)))
+#define SHIFT_UP(x,y)   ((x)<<(8*(y)))
+#define CONCAT(x,y,n) ((n==0) ? (y) : ((n==BYTES_AT_ONCE) ? (x) : OR(SHIFT_UP(x,BYTES_AT_ONCE-(n)),SHIFT_DOWN(y,(n)))))
+
+
+#ifdef DEBUG
+void inspect_mask(tp_mask m)
+{
+  int i;
+  for(i=0; i<PARA; i++) printf(m&bit_i(i) ?  "1" : "0");
+  printf("\n");
+}
+#endif
+
+
+/* TODO implement TEST_RANGE for generic parallel_tolower
+SI tp_vector TEST_RANGE(tp_vector v,uchar from,uchar to){
+	tp_vector fv=BROADCAST(-127-from);
+	v=_mm_add_epi8(v,fv);
+	tp_vector tv=BROADCAST(-127+to-from+1);
+	return _mm_cmplt_epi8(v,tv);
+}
+
+SI tp_vector parallel_tolower(tp_vector m){tp_mask mask;
+	tp_vector high_bit=BROADCAST(128);
+  tp_vector l= AND(TEST_RANGE(m,'A','Z'),high_bit);
+	m=OR(m,_mm_srli_epi64(l,2));
+	if ((mask=get_mask(m))){int i;
+    while(mask){ i=first_bit(mask); mask=forget_first_bit(mask,i);
+			((uchar*)&m)[i]=tolower(((uchar*)&m)[i]);
+    }
+	}
+	return m;
+}
+*/
+
+
+SI tp_vector parallel_tolower(tp_vector m)
+{ /* Map every byte of M through the tolower_fixed lookup table and return the result.  */
+  tp_vector lowered;
+  uchar *src=(uchar*)&m, *dst=(uchar*)&lowered;
+  int lane=0;
+  while(lane<sizeof(tp_vector)) { dst[lane]=tolower_fixed[src[lane]]; lane++; }
+  return lowered;
+}
+
+
+#if unroll==1
+#define AGREGATE_MASK     mask0
+#elif unroll==2
+#define AGREGATE_MASK   (mask0|(mask1>>1))
+#elif unroll==4
+#define AGREGATE_MASK   ((mask0|(mask1>>1))|((mask2>>2)|(mask3>>3)))
+#endif
diff --git a/string/loop.h b/string/loop.h
new file mode 100644
index 0000000..1ae2e8f
--- /dev/null
+++ b/string/loop.h
@@ -0,0 +1,195 @@
+/* Copyright (C) 2012 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Basic string search loop.  To use it, define the macros below and include this file.
+  TEST_CODE(so,sn)  Given the consecutive byte vectors so,sn, you should produce a
+                    vector.  For each byte whose highest bit is set, the loop invokes
+                    LOOP_BODY(p), where p points to the corresponding byte in sn.
+  LOOP_BODY(p)      See above.
+  DETECT_END        An end pointer p; when the loop reaches it, LOOP_END(p) is invoked.
+  DETECT_ZERO_BYTE  When the first zero byte is reached, LOOP_END(p) is invoked.
+  LOOP_END(p)       See above.
+
+  CAN_SKIP          You have to define a skip_to variable.  The loop will then not call
+                    LOOP_BODY(p) when p<skip_to.  A LOOP_END condition is still processed.
+
+  This file should be included inside a function.  The loop uses the local variable s as the
+  string being searched.  Note that an implementation by callback is complicated by the fact
+       that you usually need a closure to share arguments.
+*/
+
+#ifdef DETECT_ZERO_BYTE
+#define _DETECT_ZERO_BYTE mvec= OR(mvec,TEST_ZERO(sz));
+#define _TEST_ZERO_BYTE (*p==0)
+#else
+#define _DETECT_ZERO_BYTE
+#define _TEST_ZERO_BYTE 0
+#endif
+#ifdef DETECT_END
+#define _DETECT_END(u) (DETECT_END<=s2+u*BYTES_AT_ONCE)
+if  (DETECT_END == s)
+  {
+    uchar UNUSED *p=s;
+    LOOP_END(p);
+  }
+#else
+#define     DETECT_END  ((uchar*)NULL)
+#define _DETECT_END(u)  0
+#endif
+
+
+#define TEST(u) \
+     mvec=vzero;\
+     so=sn;\
+     sn=sz##u= LOAD(s2+u*BYTES_AT_ONCE);\
+     mvec    = TEST_CODE(so,sn); \
+     mvec##u = mvec;
+
+
+int  i;
+tp_vector vzero=BROADCAST(0);
+tp_vector sn,so,sz0,sz1,sz2,sz3;
+int s_offset;
+uchar* s2;
+sn=vzero;
+ALIGN(s,unroll);
+tp_vector mvec,zvec=vzero;
+tp_mask mask, UNUSED zmask;
+#undef ACTION
+#define ACTION(x) tp_vector mvec##x; tp_mask mask##x;
+DO_ACTION;
+#undef ACTION
+#define ACTION(x) TEST(x)
+DO_ACTION;
+
+#ifdef DETECT_ZERO_BYTE
+  #undef ACTION
+  #define ACTION(x) mvec##x=OR(mvec##x,TEST_ZERO(sz##x));
+  DO_ACTION;
+#endif
+
+#undef ACTION
+#define ACTION(x) mask##x=get_mask(mvec##x);
+DO_ACTION;
+mask=AGREGATE_MASK;
+mask=forget_before(mask,s_offset);
+if (mask||_DETECT_END(unroll)) goto test;
+start:
+;
+while(1)
+  {
+    s2+=PARA;
+    PREFETCH(s2+prefetch*CACHE_LINE_SIZE);
+#undef ACTION
+#define ACTION(x) TEST(x)
+    DO_ACTION;
+#ifdef DETECT_ZERO_BYTE /* zvec: flags a zero byte in any of the vectors loaded this iteration */
+#if unroll==1
+  zvec=TEST_ZERO(sz0);
+#elif unroll==2
+#ifdef HAS_PARALLEL_MIN /* MINI is presumably a byte-wise unsigned minimum, which is 0 iff an input byte is 0 */
+  zvec=TEST_ZERO(MINI(sz0,sz1));
+#else
+  zvec=OR(TEST_ZERO(sz0),TEST_ZERO(sz1));
+#endif
+#elif unroll==4
+#ifdef HAS_PARALLEL_MIN
+  zvec=TEST_ZERO(MINI(MINI(sz0,sz1),MINI(sz2,sz3)));
+#else
+  zvec=OR(OR(TEST_ZERO(sz0),TEST_ZERO(sz1)),
+          OR(TEST_ZERO(sz2),TEST_ZERO(sz3)));
+#endif
+#endif
+#endif
+    if(NONZERO_MASK(OR(AGREGATE_VECTOR,zvec))||_DETECT_END(unroll))
+      {
+        /* On x64, OR is a destructive operation; in the case of strlen
+           it is faster to recalculate mvec0,mvec2 than to move them
+           to separate registers.  */
+
+#ifdef DETECT_ZERO_BYTE
+  #undef ACTION
+  #define ACTION(x) mvec##x=OR(mvec##x,TEST_ZERO(sz##x));
+  DO_ACTION;
+#endif
+
+#undef ACTION
+#define ACTION(x) mask##x=get_mask(mvec##x);
+        DO_ACTION;
+        mask=AGREGATE_MASK;
+        goto test;
+      }
+  }
+test:; /*we need this flow otherwise gcc would duplicate this fragment.*/
+int end=0;
+#ifdef CAN_SKIP
+/* detect zero byte so it cannot be skipped.*/
+#ifdef DETECT_ZERO_BYTE
+#define ZTEST(u) \
+     mask##u=get_mask(TEST_ZERO(sz##u));
+#undef ACTION
+#define ACTION(x) ZTEST(x)
+DO_ACTION;
+zmask=AGREGATE_MASK;
+if (s>s2)
+  zmask=forget_before(zmask,s_offset);
+
+if(zmask) end = first_bit(zmask,0)+1;
+#endif
+if(skip_to>s2)
+  mask=forget_before(mask,skip_to-s2);
+#endif
+if (_DETECT_END(unroll)) /*we need to handle case when end is at start of next page here*/
+  {
+    end = min(DETECT_END-s2-1,end ? (end-1) : 64)+1;
+  }
+if (end)
+  {
+    mask=forget_after(mask,end-1);
+  }
+i=0;
+while(mask)
+  {
+    i=first_bit(mask,i);
+    uchar UNUSED *p=s2+i;
+    if(__builtin_expect(_TEST_ZERO_BYTE,0))
+      {
+        LOOP_END(p)
+      }
+    LOOP_BODY(p)
+#ifdef CAN_SKIP
+    mask=forget_before(mask,skip_to-s2);
+#else
+    mask=forget_first_bit(mask,i);
+#endif
+  }
+if(end)
+  {
+    uchar UNUSED *p=DETECT_END;
+    LOOP_END(p);
+  }
+
+goto start;
+
+
+#undef CAN_SKIP
+
+#undef TEST_CODE
+#undef LOOP_BODY
+#undef ACTION
+#undef DETECT_END
+#undef _DETECT_END
diff --git a/string/memchr.c b/string/memchr.c
index 22637cf..775afa6 100644
--- a/string/memchr.c
+++ b/string/memchr.c
@@ -24,29 +24,6 @@
 #include <config.h>
 #endif
 
-#undef __ptr_t
-#define __ptr_t void *
-
-#if defined _LIBC
-# include <string.h>
-# include <memcopy.h>
-#endif
-
-#if HAVE_STDLIB_H || defined _LIBC
-# include <stdlib.h>
-#endif
-
-#if HAVE_LIMITS_H || defined _LIBC
-# include <limits.h>
-#endif
-
-#define LONG_MAX_32_BITS 2147483647
-
-#ifndef LONG_MAX
-#define LONG_MAX LONG_MAX_32_BITS
-#endif
-
-#include <sys/types.h>
 #if HAVE_BP_SYM_H || defined _LIBC
 #include <bp-sym.h>
 #else
@@ -56,152 +33,15 @@
 #undef memchr
 #undef __memchr
 
-/* Search no more than N bytes of S for C.  */
-__ptr_t
-__memchr (s, c_in, n)
-     const __ptr_t s;
-     int c_in;
-     size_t n;
-{
-  const unsigned char *char_ptr;
-  const unsigned long int *longword_ptr;
-  unsigned long int longword, magic_bits, charmask;
-  unsigned char c;
-
-  c = (unsigned char) c_in;
-
-  /* Handle the first few characters by reading one character at a time.
-     Do this until CHAR_PTR is aligned on a longword boundary.  */
-  for (char_ptr = (const unsigned char *) s;
-       n > 0 && ((unsigned long int) char_ptr
-		 & (sizeof (longword) - 1)) != 0;
-       --n, ++char_ptr)
-    if (*char_ptr == c)
-      return (__ptr_t) char_ptr;
-
-  /* All these elucidatory comments refer to 4-byte longwords,
-     but the theory applies equally well to 8-byte longwords.  */
-
-  longword_ptr = (unsigned long int *) char_ptr;
-
-  /* Bits 31, 24, 16, and 8 of this number are zero.  Call these bits
-     the "holes."  Note that there is a hole just to the left of
-     each byte, with an extra at the end:
-
-     bits:  01111110 11111110 11111110 11111111
-     bytes: AAAAAAAA BBBBBBBB CCCCCCCC DDDDDDDD
-
-     The 1-bits make sure that carries propagate to the next 0-bit.
-     The 0-bits provide holes for carries to fall into.  */
-
-  if (sizeof (longword) != 4 && sizeof (longword) != 8)
-    abort ();
-
-#if LONG_MAX <= LONG_MAX_32_BITS
-  magic_bits = 0x7efefeff;
-#else
-  magic_bits = ((unsigned long int) 0x7efefefe << 32) | 0xfefefeff;
-#endif
-
-  /* Set up a longword, each of whose bytes is C.  */
-  charmask = c | (c << 8);
-  charmask |= charmask << 16;
-#if LONG_MAX > LONG_MAX_32_BITS
-  charmask |= charmask << 32;
-#endif
-
-  /* Instead of the traditional loop which tests each character,
-     we will test a longword at a time.  The tricky part is testing
-     if *any of the four* bytes in the longword in question are zero.  */
-  while (n >= sizeof (longword))
-    {
-      /* We tentatively exit the loop if adding MAGIC_BITS to
-	 LONGWORD fails to change any of the hole bits of LONGWORD.
-
-	 1) Is this safe?  Will it catch all the zero bytes?
-	 Suppose there is a byte with all zeros.  Any carry bits
-	 propagating from its left will fall into the hole at its
-	 least significant bit and stop.  Since there will be no
-	 carry from its most significant bit, the LSB of the
-	 byte to the left will be unchanged, and the zero will be
-	 detected.
-
-	 2) Is this worthwhile?  Will it ignore everything except
-	 zero bytes?  Suppose every byte of LONGWORD has a bit set
-	 somewhere.  There will be a carry into bit 8.  If bit 8
-	 is set, this will carry into bit 16.  If bit 8 is clear,
-	 one of bits 9-15 must be set, so there will be a carry
-	 into bit 16.  Similarly, there will be a carry into bit
-	 24.  If one of bits 24-30 is set, there will be a carry
-	 into bit 31, so all of the hole bits will be changed.
-
-	 The one misfire occurs when bits 24-30 are clear and bit
-	 31 is set; in this case, the hole at bit 31 is not
-	 changed.  If we had access to the processor carry flag,
-	 we could close this loophole by putting the fourth hole
-	 at bit 32!
-
-	 So it ignores everything except 128's, when they're aligned
-	 properly.
-
-	 3) But wait!  Aren't we looking for C, not zero?
-	 Good point.  So what we do is XOR LONGWORD with a longword,
-	 each of whose bytes is C.  This turns each byte that is C
-	 into a zero.  */
-
-      longword = *longword_ptr++ ^ charmask;
-
-      /* Add MAGIC_BITS to LONGWORD.  */
-      if ((((longword + magic_bits)
-
-	    /* Set those bits that were unchanged by the addition.  */
-	    ^ ~longword)
-
-	   /* Look at only the hole bits.  If any of the hole bits
-	      are unchanged, most likely one of the bytes was a
-	      zero.  */
-	   & ~magic_bits) != 0)
-	{
-	  /* Which of the bytes was C?  If none of them were, it was
-	     a misfire; continue the search.  */
-
-	  const unsigned char *cp = (const unsigned char *) (longword_ptr - 1);
-
-	  if (cp[0] == c)
-	    return (__ptr_t) cp;
-	  if (cp[1] == c)
-	    return (__ptr_t) &cp[1];
-	  if (cp[2] == c)
-	    return (__ptr_t) &cp[2];
-	  if (cp[3] == c)
-	    return (__ptr_t) &cp[3];
-#if LONG_MAX > 2147483647
-	  if (cp[4] == c)
-	    return (__ptr_t) &cp[4];
-	  if (cp[5] == c)
-	    return (__ptr_t) &cp[5];
-	  if (cp[6] == c)
-	    return (__ptr_t) &cp[6];
-	  if (cp[7] == c)
-	    return (__ptr_t) &cp[7];
+#ifndef MEMCHR
+#define MEMCHR __memchr
 #endif
-	}
 
-      n -= sizeof (longword);
-    }
+#define AS_MEMCHR
+#include "strchr.h"
 
-  char_ptr = (const unsigned char *) longword_ptr;
 
-  while (n-- > 0)
-    {
-      if (*char_ptr == c)
-	return (__ptr_t) char_ptr;
-      else
-	++char_ptr;
-    }
 
-  return 0;
-}
 #ifdef weak_alias
 weak_alias (__memchr, BP_SYM (memchr))
 #endif
diff --git a/string/memmem.c b/string/memmem.c
index 625c9cf..d208a35 100644
--- a/string/memmem.c
+++ b/string/memmem.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 1991,92,93,94,96,97,98,2000,2004,2008 Free Software Foundation, Inc.
+/* Copyright (C) 2012 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -12,66 +12,14 @@
    Lesser General Public License for more details.
 
    You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
+   License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-/* This particular implementation was written by Eric Blake, 2008.  */
 
-#ifndef _LIBC
-# include <config.h>
+#ifndef MEMMEM
+#define MEMMEM memmem
 #endif
 
-/* Specification of memmem.  */
-#include <string.h>
+#define AS_MEMMEM
+#include "strstr.h"
 
-#ifndef _LIBC
-# define __builtin_expect(expr, val)   (expr)
-#endif
-
-#define RETURN_TYPE void *
-#define AVAILABLE(h, h_l, j, n_l) ((j) <= (h_l) - (n_l))
-#include "str-two-way.h"
-
-#undef memmem
-
-/* Return the first occurrence of NEEDLE in HAYSTACK.  Return HAYSTACK
-   if NEEDLE_LEN is 0, otherwise NULL if NEEDLE is not found in
-   HAYSTACK.  */
-void *
-memmem (const void *haystack_start, size_t haystack_len,
-	const void *needle_start, size_t needle_len)
-{
-  /* Abstract memory is considered to be an array of 'unsigned char' values,
-     not an array of 'char' values.  See ISO C 99 section 6.2.6.1.  */
-  const unsigned char *haystack = (const unsigned char *) haystack_start;
-  const unsigned char *needle = (const unsigned char *) needle_start;
-
-  if (needle_len == 0)
-    /* The first occurrence of the empty string is deemed to occur at
-       the beginning of the string.  */
-    return (void *) haystack;
-
-  /* Sanity check, otherwise the loop might search through the whole
-     memory.  */
-  if (__builtin_expect (haystack_len < needle_len, 0))
-    return NULL;
-
-  /* Use optimizations in memchr when possible, to reduce the search
-     size of haystack using a linear algorithm with a smaller
-     coefficient.  However, avoid memchr for long needles, since we
-     can often achieve sublinear performance.  */
-  if (needle_len < LONG_NEEDLE_THRESHOLD)
-    {
-      haystack = memchr (haystack, *needle, haystack_len);
-      if (!haystack || __builtin_expect (needle_len == 1, 0))
-	return (void *) haystack;
-      haystack_len -= haystack - (const unsigned char *) haystack_start;
-      if (haystack_len < needle_len)
-	return NULL;
-      return two_way_short_needle (haystack, haystack_len, needle, needle_len);
-    }
-  else
-    return two_way_long_needle (haystack, haystack_len, needle, needle_len);
-}
-
-#undef LONG_NEEDLE_THRESHOLD
diff --git a/string/memrchr.c b/string/memrchr.c
index 2826f13..a1da7bd 100644
--- a/string/memrchr.c
+++ b/string/memrchr.c
@@ -27,25 +27,6 @@
 # include <config.h>
 #endif
 
-#undef __ptr_t
-#define __ptr_t void *
-
-#if defined _LIBC
-# include <string.h>
-# include <memcopy.h>
-#endif
-
-#if defined HAVE_LIMITS_H || defined _LIBC
-# include <limits.h>
-#endif
-
-#define LONG_MAX_32_BITS 2147483647
-
-#ifndef LONG_MAX
-# define LONG_MAX LONG_MAX_32_BITS
-#endif
-
-#include <sys/types.h>
 
 #undef __memrchr
 #undef memrchr
@@ -54,155 +35,13 @@
 # define __memrchr memrchr
 #endif
 
-/* Search no more than N bytes of S for C.  */
-__ptr_t
 #ifndef MEMRCHR
-__memrchr
-#else
-MEMRCHR
-#endif
-     (s, c_in, n)
-     const __ptr_t s;
-     int c_in;
-     size_t n;
-{
-  const unsigned char *char_ptr;
-  const unsigned long int *longword_ptr;
-  unsigned long int longword, magic_bits, charmask;
-  unsigned char c;
-
-  c = (unsigned char) c_in;
-
-  /* Handle the last few characters by reading one character at a time.
-     Do this until CHAR_PTR is aligned on a longword boundary.  */
-  for (char_ptr = (const unsigned char *) s + n;
-       n > 0 && ((unsigned long int) char_ptr
-		 & (sizeof (longword) - 1)) != 0;
-       --n)
-    if (*--char_ptr == c)
-      return (__ptr_t) char_ptr;
-
-  /* All these elucidatory comments refer to 4-byte longwords,
-     but the theory applies equally well to 8-byte longwords.  */
-
-  longword_ptr = (const unsigned long int *) char_ptr;
-
-  /* Bits 31, 24, 16, and 8 of this number are zero.  Call these bits
-     the "holes."  Note that there is a hole just to the left of
-     each byte, with an extra at the end:
-
-     bits:  01111110 11111110 11111110 11111111
-     bytes: AAAAAAAA BBBBBBBB CCCCCCCC DDDDDDDD
-
-     The 1-bits make sure that carries propagate to the next 0-bit.
-     The 0-bits provide holes for carries to fall into.  */
-
-  if (sizeof (longword) != 4 && sizeof (longword) != 8)
-    abort ();
-
-#if LONG_MAX <= LONG_MAX_32_BITS
-  magic_bits = 0x7efefeff;
-#else
-  magic_bits = ((unsigned long int) 0x7efefefe << 32) | 0xfefefeff;
+#define MEMRCHR __memrchr
 #endif
 
-  /* Set up a longword, each of whose bytes is C.  */
-  charmask = c | (c << 8);
-  charmask |= charmask << 16;
-#if LONG_MAX > LONG_MAX_32_BITS
-  charmask |= charmask << 32;
-#endif
-
-  /* Instead of the traditional loop which tests each character,
-     we will test a longword at a time.  The tricky part is testing
-     if *any of the four* bytes in the longword in question are zero.  */
-  while (n >= sizeof (longword))
-    {
-      /* We tentatively exit the loop if adding MAGIC_BITS to
-	 LONGWORD fails to change any of the hole bits of LONGWORD.
-
-	 1) Is this safe?  Will it catch all the zero bytes?
-	 Suppose there is a byte with all zeros.  Any carry bits
-	 propagating from its left will fall into the hole at its
-	 least significant bit and stop.  Since there will be no
-	 carry from its most significant bit, the LSB of the
-	 byte to the left will be unchanged, and the zero will be
-	 detected.
-
-	 2) Is this worthwhile?  Will it ignore everything except
-	 zero bytes?  Suppose every byte of LONGWORD has a bit set
-	 somewhere.  There will be a carry into bit 8.  If bit 8
-	 is set, this will carry into bit 16.  If bit 8 is clear,
-	 one of bits 9-15 must be set, so there will be a carry
-	 into bit 16.  Similarly, there will be a carry into bit
-	 24.  If one of bits 24-30 is set, there will be a carry
-	 into bit 31, so all of the hole bits will be changed.
-
-	 The one misfire occurs when bits 24-30 are clear and bit
-	 31 is set; in this case, the hole at bit 31 is not
-	 changed.  If we had access to the processor carry flag,
-	 we could close this loophole by putting the fourth hole
-	 at bit 32!
-
-	 So it ignores everything except 128's, when they're aligned
-	 properly.
-
-	 3) But wait!  Aren't we looking for C, not zero?
-	 Good point.  So what we do is XOR LONGWORD with a longword,
-	 each of whose bytes is C.  This turns each byte that is C
-	 into a zero.  */
-
-      longword = *--longword_ptr ^ charmask;
-
-      /* Add MAGIC_BITS to LONGWORD.  */
-      if ((((longword + magic_bits)
-
-	    /* Set those bits that were unchanged by the addition.  */
-	    ^ ~longword)
-
-	   /* Look at only the hole bits.  If any of the hole bits
-	      are unchanged, most likely one of the bytes was a
-	      zero.  */
-	   & ~magic_bits) != 0)
-	{
-	  /* Which of the bytes was C?  If none of them were, it was
-	     a misfire; continue the search.  */
-
-	  const unsigned char *cp = (const unsigned char *) longword_ptr;
-
-#if LONG_MAX > 2147483647
-	  if (cp[7] == c)
-	    return (__ptr_t) &cp[7];
-	  if (cp[6] == c)
-	    return (__ptr_t) &cp[6];
-	  if (cp[5] == c)
-	    return (__ptr_t) &cp[5];
-	  if (cp[4] == c)
-	    return (__ptr_t) &cp[4];
-#endif
-	  if (cp[3] == c)
-	    return (__ptr_t) &cp[3];
-	  if (cp[2] == c)
-	    return (__ptr_t) &cp[2];
-	  if (cp[1] == c)
-	    return (__ptr_t) &cp[1];
-	  if (cp[0] == c)
-	    return (__ptr_t) cp;
-	}
-
-      n -= sizeof (longword);
-    }
-
-  char_ptr = (const unsigned char *) longword_ptr;
-
-  while (n-- > 0)
-    {
-      if (*--char_ptr == c)
-	return (__ptr_t) char_ptr;
-    }
+#define AS_MEMRCHR
+#include "strchr.h"
 
-  return 0;
-}
 #ifndef MEMRCHR
 # ifdef weak_alias
 weak_alias (__memrchr, memrchr)
diff --git a/string/rawmemchr.c b/string/rawmemchr.c
index 90e8c7c..d880272 100644
--- a/string/rawmemchr.c
+++ b/string/rawmemchr.c
@@ -24,159 +24,16 @@
 #include <config.h>
 #endif
 
-#undef __ptr_t
-#define __ptr_t void *
-
-#if defined (_LIBC)
-# include <string.h>
-# include <memcopy.h>
-# include <stdlib.h>
-#endif
-
-#if defined (HAVE_LIMITS_H) || defined (_LIBC)
-# include <limits.h>
-#endif
-
-#define LONG_MAX_32_BITS 2147483647
-
-#ifndef LONG_MAX
-#define LONG_MAX LONG_MAX_32_BITS
-#endif
-
-#include <sys/types.h>
-
 #undef memchr
 
-
-/* Find the first occurrence of C in S.  */
-__ptr_t
-__rawmemchr (s, c_in)
-     const __ptr_t s;
-     int c_in;
-{
-  const unsigned char *char_ptr;
-  const unsigned long int *longword_ptr;
-  unsigned long int longword, magic_bits, charmask;
-  unsigned char c;
-
-  c = (unsigned char) c_in;
-
-  /* Handle the first few characters by reading one character at a time.
-     Do this until CHAR_PTR is aligned on a longword boundary.  */
-  for (char_ptr = (const unsigned char *) s;
-       ((unsigned long int) char_ptr & (sizeof (longword) - 1)) != 0;
-       ++char_ptr)
-    if (*char_ptr == c)
-      return (__ptr_t) char_ptr;
-
-  /* All these elucidatory comments refer to 4-byte longwords,
-     but the theory applies equally well to 8-byte longwords.  */
-
-  longword_ptr = (unsigned long int *) char_ptr;
-
-  /* Bits 31, 24, 16, and 8 of this number are zero.  Call these bits
-     the "holes."  Note that there is a hole just to the left of
-     each byte, with an extra at the end:
-
-     bits:  01111110 11111110 11111110 11111111
-     bytes: AAAAAAAA BBBBBBBB CCCCCCCC DDDDDDDD
-
-     The 1-bits make sure that carries propagate to the next 0-bit.
-     The 0-bits provide holes for carries to fall into.  */
-
-  if (sizeof (longword) != 4 && sizeof (longword) != 8)
-    abort ();
-
-#if LONG_MAX <= LONG_MAX_32_BITS
-  magic_bits = 0x7efefeff;
-#else
-  magic_bits = ((unsigned long int) 0x7efefefe << 32) | 0xfefefeff;
+#ifndef RAWMEMCHR
+#define RAWMEMCHR __rawmemchr
 #endif
 
-  /* Set up a longword, each of whose bytes is C.  */
-  charmask = c | (c << 8);
-  charmask |= charmask << 16;
-#if LONG_MAX > LONG_MAX_32_BITS
-  charmask |= charmask << 32;
-#endif
-
-  /* Instead of the traditional loop which tests each character,
-     we will test a longword at a time.  The tricky part is testing
-     if *any of the four* bytes in the longword in question are zero.  */
-  while (1)
-    {
-      /* We tentatively exit the loop if adding MAGIC_BITS to
-	 LONGWORD fails to change any of the hole bits of LONGWORD.
-
-	 1) Is this safe?  Will it catch all the zero bytes?
-	 Suppose there is a byte with all zeros.  Any carry bits
-	 propagating from its left will fall into the hole at its
-	 least significant bit and stop.  Since there will be no
-	 carry from its most significant bit, the LSB of the
-	 byte to the left will be unchanged, and the zero will be
-	 detected.
-
-	 2) Is this worthwhile?  Will it ignore everything except
-	 zero bytes?  Suppose every byte of LONGWORD has a bit set
-	 somewhere.  There will be a carry into bit 8.  If bit 8
-	 is set, this will carry into bit 16.  If bit 8 is clear,
-	 one of bits 9-15 must be set, so there will be a carry
-	 into bit 16.  Similarly, there will be a carry into bit
-	 24.  If one of bits 24-30 is set, there will be a carry
-	 into bit 31, so all of the hole bits will be changed.
+#define AS_RAWMEMCHR
+#include "strchr.h"
 
-	 The one misfire occurs when bits 24-30 are clear and bit
-	 31 is set; in this case, the hole at bit 31 is not
-	 changed.  If we had access to the processor carry flag,
-	 we could close this loophole by putting the fourth hole
-	 at bit 32!
 
-	 So it ignores everything except 128's, when they're aligned
-	 properly.
 
-	 3) But wait!  Aren't we looking for C, not zero?
-	 Good point.  So what we do is XOR LONGWORD with a longword,
-	 each of whose bytes is C.  This turns each byte that is C
-	 into a zero.  */
-
-      longword = *longword_ptr++ ^ charmask;
-
-      /* Add MAGIC_BITS to LONGWORD.  */
-      if ((((longword + magic_bits)
-
-	    /* Set those bits that were unchanged by the addition.  */
-	    ^ ~longword)
-
-	   /* Look at only the hole bits.  If any of the hole bits
-	      are unchanged, most likely one of the bytes was a
-	      zero.  */
-	   & ~magic_bits) != 0)
-	{
-	  /* Which of the bytes was C?  If none of them were, it was
-	     a misfire; continue the search.  */
-
-	  const unsigned char *cp = (const unsigned char *) (longword_ptr - 1);
-
-	  if (cp[0] == c)
-	    return (__ptr_t) cp;
-	  if (cp[1] == c)
-	    return (__ptr_t) &cp[1];
-	  if (cp[2] == c)
-	    return (__ptr_t) &cp[2];
-	  if (cp[3] == c)
-	    return (__ptr_t) &cp[3];
-#if LONG_MAX > 2147483647
-	  if (cp[4] == c)
-	    return (__ptr_t) &cp[4];
-	  if (cp[5] == c)
-	    return (__ptr_t) &cp[5];
-	  if (cp[6] == c)
-	    return (__ptr_t) &cp[6];
-	  if (cp[7] == c)
-	    return (__ptr_t) &cp[7];
-#endif
-	}
-    }
-}
 libc_hidden_def (__rawmemchr)
 weak_alias (__rawmemchr, rawmemchr)
diff --git a/string/str-two-way.h b/string/str-two-way.h
deleted file mode 100644
index 1b2a8bd..0000000
--- a/string/str-two-way.h
+++ /dev/null
@@ -1,428 +0,0 @@
-/* Byte-wise substring search, using the Two-Way algorithm.
-   Copyright (C) 2008, 2010 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-   Written by Eric Blake <ebb9@byu.net>, 2008.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-/* Before including this file, you need to include <string.h> (and
-   <config.h> before that, if not part of libc), and define:
-     RESULT_TYPE             A macro that expands to the return type.
-     AVAILABLE(h, h_l, j, n_l)
-			     A macro that returns nonzero if there are
-			     at least N_L bytes left starting at H[J].
-			     H is 'unsigned char *', H_L, J, and N_L
-			     are 'size_t'; H_L is an lvalue.  For
-			     NUL-terminated searches, H_L can be
-			     modified each iteration to avoid having
-			     to compute the end of H up front.
-
-  For case-insensitivity, you may optionally define:
-     CMP_FUNC(p1, p2, l)     A macro that returns 0 iff the first L
-			     characters of P1 and P2 are equal.
-     CANON_ELEMENT(c)        A macro that canonicalizes an element right after
-			     it has been fetched from one of the two strings.
-			     The argument is an 'unsigned char'; the result
-			     must be an 'unsigned char' as well.
-
-  This file undefines the macros documented above, and defines
-  LONG_NEEDLE_THRESHOLD.
-*/
-
-#include <limits.h>
-#include <stdint.h>
-
-/* We use the Two-Way string matching algorithm, which guarantees
-   linear complexity with constant space.  Additionally, for long
-   needles, we also use a bad character shift table similar to the
-   Boyer-Moore algorithm to achieve improved (potentially sub-linear)
-   performance.
-
-   See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260
-   and http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm
-*/
-
-/* Point at which computing a bad-byte shift table is likely to be
-   worthwhile.  Small needles should not compute a table, since it
-   adds (1 << CHAR_BIT) + NEEDLE_LEN computations of preparation for a
-   speedup no greater than a factor of NEEDLE_LEN.  The larger the
-   needle, the better the potential performance gain.  On the other
-   hand, on non-POSIX systems with CHAR_BIT larger than eight, the
-   memory required for the table is prohibitive.  */
-#if CHAR_BIT < 10
-# define LONG_NEEDLE_THRESHOLD 32U
-#else
-# define LONG_NEEDLE_THRESHOLD SIZE_MAX
-#endif
-
-#ifndef MAX
-# define MAX(a, b) ((a < b) ? (b) : (a))
-#endif
-
-#ifndef CANON_ELEMENT
-# define CANON_ELEMENT(c) c
-#endif
-#ifndef CMP_FUNC
-# define CMP_FUNC memcmp
-#endif
-
-/* Perform a critical factorization of NEEDLE, of length NEEDLE_LEN.
-   Return the index of the first byte in the right half, and set
-   *PERIOD to the global period of the right half.
-
-   The global period of a string is the smallest index (possibly its
-   length) at which all remaining bytes in the string are repetitions
-   of the prefix (the last repetition may be a subset of the prefix).
-
-   When NEEDLE is factored into two halves, a local period is the
-   length of the smallest word that shares a suffix with the left half
-   and shares a prefix with the right half.  All factorizations of a
-   non-empty NEEDLE have a local period of at least 1 and no greater
-   than NEEDLE_LEN.
-
-   A critical factorization has the property that the local period
-   equals the global period.  All strings have at least one critical
-   factorization with the left half smaller than the global period.
-
-   Given an ordered alphabet, a critical factorization can be computed
-   in linear time, with 2 * NEEDLE_LEN comparisons, by computing the
-   larger of two ordered maximal suffixes.  The ordered maximal
-   suffixes are determined by lexicographic comparison of
-   periodicity.  */
-static size_t
-critical_factorization (const unsigned char *needle, size_t needle_len,
-			size_t *period)
-{
-  /* Index of last byte of left half, or SIZE_MAX.  */
-  size_t max_suffix, max_suffix_rev;
-  size_t j; /* Index into NEEDLE for current candidate suffix.  */
-  size_t k; /* Offset into current period.  */
-  size_t p; /* Intermediate period.  */
-  unsigned char a, b; /* Current comparison bytes.  */
-
-  /* Invariants:
-     0 <= j < NEEDLE_LEN - 1
-     -1 <= max_suffix{,_rev} < j (treating SIZE_MAX as if it were signed)
-     min(max_suffix, max_suffix_rev) < global period of NEEDLE
-     1 <= p <= global period of NEEDLE
-     p == global period of the substring NEEDLE[max_suffix{,_rev}+1...j]
-     1 <= k <= p
-  */
-
-  /* Perform lexicographic search.  */
-  max_suffix = SIZE_MAX;
-  j = 0;
-  k = p = 1;
-  while (j + k < needle_len)
-    {
-      a = CANON_ELEMENT (needle[j + k]);
-      b = CANON_ELEMENT (needle[max_suffix + k]);
-      if (a < b)
-	{
-	  /* Suffix is smaller, period is entire prefix so far.  */
-	  j += k;
-	  k = 1;
-	  p = j - max_suffix;
-	}
-      else if (a == b)
-	{
-	  /* Advance through repetition of the current period.  */
-	  if (k != p)
-	    ++k;
-	  else
-	    {
-	      j += p;
-	      k = 1;
-	    }
-	}
-      else /* b < a */
-	{
-	  /* Suffix is larger, start over from current location.  */
-	  max_suffix = j++;
-	  k = p = 1;
-	}
-    }
-  *period = p;
-
-  /* Perform reverse lexicographic search.  */
-  max_suffix_rev = SIZE_MAX;
-  j = 0;
-  k = p = 1;
-  while (j + k < needle_len)
-    {
-      a = CANON_ELEMENT (needle[j + k]);
-      b = CANON_ELEMENT (needle[max_suffix_rev + k]);
-      if (b < a)
-	{
-	  /* Suffix is smaller, period is entire prefix so far.  */
-	  j += k;
-	  k = 1;
-	  p = j - max_suffix_rev;
-	}
-      else if (a == b)
-	{
-	  /* Advance through repetition of the current period.  */
-	  if (k != p)
-	    ++k;
-	  else
-	    {
-	      j += p;
-	      k = 1;
-	    }
-	}
-      else /* a < b */
-	{
-	  /* Suffix is larger, start over from current location.  */
-	  max_suffix_rev = j++;
-	  k = p = 1;
-	}
-    }
-
-  /* Choose the longer suffix.  Return the first byte of the right
-     half, rather than the last byte of the left half.  */
-  if (max_suffix_rev + 1 < max_suffix + 1)
-    return max_suffix + 1;
-  *period = p;
-  return max_suffix_rev + 1;
-}
-
-/* Return the first location of non-empty NEEDLE within HAYSTACK, or
-   NULL.  HAYSTACK_LEN is the minimum known length of HAYSTACK.  This
-   method is optimized for NEEDLE_LEN < LONG_NEEDLE_THRESHOLD.
-   Performance is guaranteed to be linear, with an initialization cost
-   of 2 * NEEDLE_LEN comparisons.
-
-   If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at
-   most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching.
-   If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
-   HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching.  */
-static RETURN_TYPE
-two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
-		      const unsigned char *needle, size_t needle_len)
-{
-  size_t i; /* Index into current byte of NEEDLE.  */
-  size_t j; /* Index into current window of HAYSTACK.  */
-  size_t period; /* The period of the right half of needle.  */
-  size_t suffix; /* The index of the right half of needle.  */
-
-  /* Factor the needle into two halves, such that the left half is
-     smaller than the global period, and the right half is
-     periodic (with a period as large as NEEDLE_LEN - suffix).  */
-  suffix = critical_factorization (needle, needle_len, &period);
-
-  /* Perform the search.  Each iteration compares the right half
-     first.  */
-  if (CMP_FUNC (needle, needle + period, suffix) == 0)
-    {
-      /* Entire needle is periodic; a mismatch can only advance by the
-	 period, so use memory to avoid rescanning known occurrences
-	 of the period.  */
-      size_t memory = 0;
-      j = 0;
-      while (AVAILABLE (haystack, haystack_len, j, needle_len))
-	{
-	  /* Scan for matches in right half.  */
-	  i = MAX (suffix, memory);
-	  while (i < needle_len && (CANON_ELEMENT (needle[i])
-				    == CANON_ELEMENT (haystack[i + j])))
-	    ++i;
-	  if (needle_len <= i)
-	    {
-	      /* Scan for matches in left half.  */
-	      i = suffix - 1;
-	      while (memory < i + 1 && (CANON_ELEMENT (needle[i])
-					== CANON_ELEMENT (haystack[i + j])))
-		--i;
-	      if (i + 1 < memory + 1)
-		return (RETURN_TYPE) (haystack + j);
-	      /* No match, so remember how many repetitions of period
-		 on the right half were scanned.  */
-	      j += period;
-	      memory = needle_len - period;
-	    }
-	  else
-	    {
-	      j += i - suffix + 1;
-	      memory = 0;
-	    }
-	}
-    }
-  else
-    {
-      /* The two halves of needle are distinct; no extra memory is
-	 required, and any mismatch results in a maximal shift.  */
-      period = MAX (suffix, needle_len - suffix) + 1;
-      j = 0;
-      while (AVAILABLE (haystack, haystack_len, j, needle_len))
-	{
-	  /* Scan for matches in right half.  */
-	  i = suffix;
-	  while (i < needle_len && (CANON_ELEMENT (needle[i])
-				    == CANON_ELEMENT (haystack[i + j])))
-	    ++i;
-	  if (needle_len <= i)
-	    {
-	      /* Scan for matches in left half.  */
-	      i = suffix - 1;
-	      while (i != SIZE_MAX && (CANON_ELEMENT (needle[i])
-				       == CANON_ELEMENT (haystack[i + j])))
-		--i;
-	      if (i == SIZE_MAX)
-		return (RETURN_TYPE) (haystack + j);
-	      j += period;
-	    }
-	  else
-	    j += i - suffix + 1;
-	}
-    }
-  return NULL;
-}
-
-/* Return the first location of non-empty NEEDLE within HAYSTACK, or
-   NULL.  HAYSTACK_LEN is the minimum known length of HAYSTACK.  This
-   method is optimized for LONG_NEEDLE_THRESHOLD <= NEEDLE_LEN.
-   Performance is guaranteed to be linear, with an initialization cost
-   of 3 * NEEDLE_LEN + (1 << CHAR_BIT) operations.
-
-   If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at
-   most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching,
-   and sublinear performance O(HAYSTACK_LEN / NEEDLE_LEN) is possible.
-   If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
-   HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and
-   sublinear performance is not possible.  */
-static RETURN_TYPE
-two_way_long_needle (const unsigned char *haystack, size_t haystack_len,
-		     const unsigned char *needle, size_t needle_len)
-{
-  size_t i; /* Index into current byte of NEEDLE.  */
-  size_t j; /* Index into current window of HAYSTACK.  */
-  size_t period; /* The period of the right half of needle.  */
-  size_t suffix; /* The index of the right half of needle.  */
-  size_t shift_table[1U << CHAR_BIT]; /* See below.  */
-
-  /* Factor the needle into two halves, such that the left half is
-     smaller than the global period, and the right half is
-     periodic (with a period as large as NEEDLE_LEN - suffix).  */
-  suffix = critical_factorization (needle, needle_len, &period);
-
-  /* Populate shift_table.  For each possible byte value c,
-     shift_table[c] is the distance from the last occurrence of c to
-     the end of NEEDLE, or NEEDLE_LEN if c is absent from the NEEDLE.
-     shift_table[NEEDLE[NEEDLE_LEN - 1]] contains the only 0.  */
-  for (i = 0; i < 1U << CHAR_BIT; i++)
-    shift_table[i] = needle_len;
-  for (i = 0; i < needle_len; i++)
-    shift_table[CANON_ELEMENT (needle[i])] = needle_len - i - 1;
-
-  /* Perform the search.  Each iteration compares the right half
-     first.  */
-  if (CMP_FUNC (needle, needle + period, suffix) == 0)
-    {
-      /* Entire needle is periodic; a mismatch can only advance by the
-	 period, so use memory to avoid rescanning known occurrences
-	 of the period.  */
-      size_t memory = 0;
-      size_t shift;
-      j = 0;
-      while (AVAILABLE (haystack, haystack_len, j, needle_len))
-	{
-	  /* Check the last byte first; if it does not match, then
-	     shift to the next possible match location.  */
-	  shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])];
-	  if (0 < shift)
-	    {
-	      if (memory && shift < period)
-		{
-		  /* Since needle is periodic, but the last period has
-		     a byte out of place, there can be no match until
-		     after the mismatch.  */
-		  shift = needle_len - period;
-		}
-	      memory = 0;
-	      j += shift;
-	      continue;
-	    }
-	  /* Scan for matches in right half.  The last byte has
-	     already been matched, by virtue of the shift table.  */
-	  i = MAX (suffix, memory);
-	  while (i < needle_len - 1 && (CANON_ELEMENT (needle[i])
-					== CANON_ELEMENT (haystack[i + j])))
-	    ++i;
-	  if (needle_len - 1 <= i)
-	    {
-	      /* Scan for matches in left half.  */
-	      i = suffix - 1;
-	      while (memory < i + 1 && (CANON_ELEMENT (needle[i])
-					== CANON_ELEMENT (haystack[i + j])))
-		--i;
-	      if (i + 1 < memory + 1)
-		return (RETURN_TYPE) (haystack + j);
-	      /* No match, so remember how many repetitions of period
-		 on the right half were scanned.  */
-	      j += period;
-	      memory = needle_len - period;
-	    }
-	  else
-	    {
-	      j += i - suffix + 1;
-	      memory = 0;
-	    }
-	}
-    }
-  else
-    {
-      /* The two halves of needle are distinct; no extra memory is
-	 required, and any mismatch results in a maximal shift.  */
-      size_t shift;
-      period = MAX (suffix, needle_len - suffix) + 1;
-      j = 0;
-      while (AVAILABLE (haystack, haystack_len, j, needle_len))
-	{
-	  /* Check the last byte first; if it does not match, then
-	     shift to the next possible match location.  */
-	  shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])];
-	  if (0 < shift)
-	    {
-	      j += shift;
-	      continue;
-	    }
-	  /* Scan for matches in right half.  The last byte has
-	     already been matched, by virtue of the shift table.  */
-	  i = suffix;
-	  while (i < needle_len - 1 && (CANON_ELEMENT (needle[i])
-					== CANON_ELEMENT (haystack[i + j])))
-	    ++i;
-	  if (needle_len - 1 <= i)
-	    {
-	      /* Scan for matches in left half.  */
-	      i = suffix - 1;
-	      while (i != SIZE_MAX && (CANON_ELEMENT (needle[i])
-				       == CANON_ELEMENT (haystack[i + j])))
-		--i;
-	      if (i == SIZE_MAX)
-		return (RETURN_TYPE) (haystack + j);
-	      j += period;
-	    }
-	  else
-	    j += i - suffix + 1;
-	}
-    }
-  return NULL;
-}
-
-#undef AVAILABLE
-#undef CANON_ELEMENT
-#undef CMP_FUNC
-#undef RETURN_TYPE
diff --git a/string/strcasestr.c b/string/strcasestr.c
index 9e1bde9..df41f50 100644
--- a/string/strcasestr.c
+++ b/string/strcasestr.c
@@ -1,6 +1,4 @@
-/* Return the offset of one string within another.
-   Copyright (C) 1994, 1996-2000, 2004, 2008, 2009, 2010
-   Free Software Foundation, Inc.
+/* Copyright (C) 2012 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -14,40 +12,9 @@
    Lesser General Public License for more details.
 
    You should have received a copy of the GNU Lesser General Public
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-/*
- * My personal strstr() implementation that beats most other algorithms.
- * Until someone tells me otherwise, I assume that this is the
- * fastest implementation of strstr() in C.
- * I deliberately chose not to comment it.  You should have at least
- * as much fun trying to understand it, as I had to write it :-).
- *
- * Stephen R. van den Berg, berg@pool.informatik.rwth-aachen.de	*/
-
-#if HAVE_CONFIG_H
-# include <config.h>
-#endif
-
-/* Specification.  */
-#include <string.h>
-
-#include <ctype.h>
-#include <stdbool.h>
-#include <strings.h>
-
-#define TOLOWER(Ch) (isupper (Ch) ? tolower (Ch) : (Ch))
-
-/* Two-Way algorithm.  */
-#define RETURN_TYPE char *
-#define AVAILABLE(h, h_l, j, n_l)			\
-  (!memchr ((h) + (h_l), '\0', (j) + (n_l) - (h_l))	\
-   && ((h_l) = (j) + (n_l)))
-#define CANON_ELEMENT(c) TOLOWER (c)
-#define CMP_FUNC(p1, p2, l)				\
-  __strncasecmp ((const char *) (p1), (const char *) (p2), l)
-#include "str-two-way.h"
 
 #undef strcasestr
 #undef __strcasestr
@@ -56,52 +23,12 @@
 #define STRCASESTR __strcasestr
 #endif
 
+#define AS_STRCASESTR
+#include "strstr.h"
 
-/* Find the first occurrence of NEEDLE in HAYSTACK, using
-   case-insensitive comparison.  This function gives unspecified
-   results in multibyte locales.  */
-char *
-STRCASESTR (const char *haystack_start, const char *needle_start)
-{
-  const char *haystack = haystack_start;
-  const char *needle = needle_start;
-  size_t needle_len; /* Length of NEEDLE.  */
-  size_t haystack_len; /* Known minimum length of HAYSTACK.  */
-  bool ok = true; /* True if NEEDLE is prefix of HAYSTACK.  */
-
-  /* Determine length of NEEDLE, and in the process, make sure
-     HAYSTACK is at least as long (no point processing all of a long
-     NEEDLE if HAYSTACK is too short).  */
-  while (*haystack && *needle)
-    {
-      ok &= (TOLOWER ((unsigned char) *haystack)
-	     == TOLOWER ((unsigned char) *needle));
-      haystack++;
-      needle++;
-    }
-  if (*needle)
-    return NULL;
-  if (ok)
-    return (char *) haystack_start;
-  needle_len = needle - needle_start;
-  haystack = haystack_start + 1;
-  haystack_len = needle_len - 1;
-
-  /* Perform the search.  Abstract memory is considered to be an array
-     of 'unsigned char' values, not an array of 'char' values.  See
-     ISO C 99 section 6.2.6.1.  */
-  if (needle_len < LONG_NEEDLE_THRESHOLD)
-    return two_way_short_needle ((const unsigned char *) haystack,
-				 haystack_len,
-				 (const unsigned char *) needle_start,
-				 needle_len);
-  return two_way_long_needle ((const unsigned char *) haystack, haystack_len,
-			      (const unsigned char *) needle_start,
-			      needle_len);
-}
-
-#undef LONG_NEEDLE_THRESHOLD
 
 #ifndef NO_ALIAS
 weak_alias (__strcasestr, strcasestr)
 #endif
+
+
diff --git a/string/strchr.c b/string/strchr.c
index 9d18b7e..04f6eb8 100644
--- a/string/strchr.c
+++ b/string/strchr.c
@@ -1,11 +1,5 @@
-/* Copyright (C) 1991,1993-1997,1999,2000,2003,2006
-   Free Software Foundation, Inc.
+/* Copyright (C) 2012 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
-   Based on strlen implementation by Torbjorn Granlund (tege@sics.se),
-   with help from Dan Sahlin (dan@sics.se) and
-   bug fix and commentary by Jim Blandy (jimb@ai.mit.edu);
-   adaptation to strchr suggested by Dick Karpinski (dick@cca.ucsf.edu),
-   and implemented by Roland McGrath (roland@ai.mit.edu).
 
    The GNU C Library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
@@ -18,170 +12,17 @@
    Lesser General Public License for more details.
 
    You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
+   License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#include <string.h>
-#include <memcopy.h>
-#include <stdlib.h>
 
-#undef strchr
-
-/* Find the first occurrence of C in S.  */
-char *
-strchr (s, c_in)
-     const char *s;
-     int c_in;
-{
-  const unsigned char *char_ptr;
-  const unsigned long int *longword_ptr;
-  unsigned long int longword, magic_bits, charmask;
-  unsigned char c;
-
-  c = (unsigned char) c_in;
-
-  /* Handle the first few characters by reading one character at a time.
-     Do this until CHAR_PTR is aligned on a longword boundary.  */
-  for (char_ptr = (const unsigned char *) s;
-       ((unsigned long int) char_ptr & (sizeof (longword) - 1)) != 0;
-       ++char_ptr)
-    if (*char_ptr == c)
-      return (void *) char_ptr;
-    else if (*char_ptr == '\0')
-      return NULL;
-
-  /* All these elucidatory comments refer to 4-byte longwords,
-     but the theory applies equally well to 8-byte longwords.  */
-
-  longword_ptr = (unsigned long int *) char_ptr;
-
-  /* Bits 31, 24, 16, and 8 of this number are zero.  Call these bits
-     the "holes."  Note that there is a hole just to the left of
-     each byte, with an extra at the end:
-
-     bits:  01111110 11111110 11111110 11111111
-     bytes: AAAAAAAA BBBBBBBB CCCCCCCC DDDDDDDD
-
-     The 1-bits make sure that carries propagate to the next 0-bit.
-     The 0-bits provide holes for carries to fall into.  */
-  switch (sizeof (longword))
-    {
-    case 4: magic_bits = 0x7efefeffL; break;
-    case 8: magic_bits = ((0x7efefefeL << 16) << 16) | 0xfefefeffL; break;
-    default:
-      abort ();
-    }
-
-  /* Set up a longword, each of whose bytes is C.  */
-  charmask = c | (c << 8);
-  charmask |= charmask << 16;
-  if (sizeof (longword) > 4)
-    /* Do the shift in two steps to avoid a warning if long has 32 bits.  */
-    charmask |= (charmask << 16) << 16;
-  if (sizeof (longword) > 8)
-    abort ();
-
-  /* Instead of the traditional loop which tests each character,
-     we will test a longword at a time.  The tricky part is testing
-     if *any of the four* bytes in the longword in question are zero.  */
-  for (;;)
-    {
-      /* We tentatively exit the loop if adding MAGIC_BITS to
-	 LONGWORD fails to change any of the hole bits of LONGWORD.
-
-	 1) Is this safe?  Will it catch all the zero bytes?
-	 Suppose there is a byte with all zeros.  Any carry bits
-	 propagating from its left will fall into the hole at its
-	 least significant bit and stop.  Since there will be no
-	 carry from its most significant bit, the LSB of the
-	 byte to the left will be unchanged, and the zero will be
-	 detected.
-
-	 2) Is this worthwhile?  Will it ignore everything except
-	 zero bytes?  Suppose every byte of LONGWORD has a bit set
-	 somewhere.  There will be a carry into bit 8.  If bit 8
-	 is set, this will carry into bit 16.  If bit 8 is clear,
-	 one of bits 9-15 must be set, so there will be a carry
-	 into bit 16.  Similarly, there will be a carry into bit
-	 24.  If one of bits 24-30 is set, there will be a carry
-	 into bit 31, so all of the hole bits will be changed.
-
-	 The one misfire occurs when bits 24-30 are clear and bit
-	 31 is set; in this case, the hole at bit 31 is not
-	 changed.  If we had access to the processor carry flag,
-	 we could close this loophole by putting the fourth hole
-	 at bit 32!
-
-	 So it ignores everything except 128's, when they're aligned
-	 properly.
-
-	 3) But wait!  Aren't we looking for C as well as zero?
-	 Good point.  So what we do is XOR LONGWORD with a longword,
-	 each of whose bytes is C.  This turns each byte that is C
-	 into a zero.  */
-
-      longword = *longword_ptr++;
-
-      /* Add MAGIC_BITS to LONGWORD.  */
-      if ((((longword + magic_bits)
-
-	    /* Set those bits that were unchanged by the addition.  */
-	    ^ ~longword)
-
-	   /* Look at only the hole bits.  If any of the hole bits
-	      are unchanged, most likely one of the bytes was a
-	      zero.  */
-	   & ~magic_bits) != 0 ||
-
-	  /* That caught zeroes.  Now test for C.  */
-	  ((((longword ^ charmask) + magic_bits) ^ ~(longword ^ charmask))
-	   & ~magic_bits) != 0)
-	{
-	  /* Which of the bytes was C or zero?
-	     If none of them were, it was a misfire; continue the search.  */
-
-	  const unsigned char *cp = (const unsigned char *) (longword_ptr - 1);
+#ifndef STRCHR
+#define STRCHR strchr
+#endif
 
-	  if (*cp == c)
-	    return (char *) cp;
-	  else if (*cp == '\0')
-	    return NULL;
-	  if (*++cp == c)
-	    return (char *) cp;
-	  else if (*cp == '\0')
-	    return NULL;
-	  if (*++cp == c)
-	    return (char *) cp;
-	  else if (*cp == '\0')
-	    return NULL;
-	  if (*++cp == c)
-	    return (char *) cp;
-	  else if (*cp == '\0')
-	    return NULL;
-	  if (sizeof (longword) > 4)
-	    {
-	      if (*++cp == c)
-		return (char *) cp;
-	      else if (*cp == '\0')
-		return NULL;
-	      if (*++cp == c)
-		return (char *) cp;
-	      else if (*cp == '\0')
-		return NULL;
-	      if (*++cp == c)
-		return (char *) cp;
-	      else if (*cp == '\0')
-		return NULL;
-	      if (*++cp == c)
-		return (char *) cp;
-	      else if (*cp == '\0')
-		return NULL;
-	    }
-	}
-    }
+#define AS_STRCHR
+#include "strchr.h"
 
-  return NULL;
-}
 
 #ifdef weak_alias
 #undef index
diff --git a/string/strchr.h b/string/strchr.h
new file mode 100644
index 0000000..b6ff374
--- /dev/null
+++ b/string/strchr.h
@@ -0,0 +1,70 @@
+/* Copyright (C) 2012 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+/* Shared template for the strchr/memchr family; includers select AS_*.  */
+#define unroll 4
+#define prefetch 8
+/* NOTE(review): unroll/prefetch look like loop.h tuning knobs -- confirm.  */
+#include "vector.h"
+/* TEST_EQ is not defined here; presumably an equality test against VC.  */
+#define TEST_CODE(so,sn) TEST_EQ(sn,vc)
+/* str* variants define DETECT_ZERO_BYTE; mem* variants define DETECT_END.  */
+#if defined(AS_STRCHR) || defined(AS_STRRCHR) || defined(AS_STRCHRNUL)
+#define DETECT_ZERO_BYTE
+#endif
+#if defined(AS_MEMCHR) || defined(AS_MEMRCHR)
+#define DETECT_END ((s+ss>=s) ? s+ss : ((uchar*)((long)-1)))
+#endif
+/* DETECT_END is S+SS, or the all-ones address when S+SS wraps around.  */
+/* Forward variants: LOOP_END yields NULL (LOOP_BODY is set further down).  */
+#ifdef AS_STRCHR
+#define LOOP_END(p) return NULL;
+uchar* STRCHR(   const uchar *s, int c )
+#endif
+#ifdef AS_MEMCHR
+#define LOOP_END(p) return NULL;
+uchar* MEMCHR(   const uchar *s, int c , size_t ss)
+#endif
+/* Reverse variants store P in R and return R; all others return P.  */
+#if defined(AS_STRRCHR) || defined(AS_MEMRCHR)
+#define LOOP_BODY(p) r=p;
+#define LOOP_END(p) return r;
+#ifdef AS_STRRCHR
+uchar* STRRCHR(   const uchar *s, int c)
+#endif
+#ifdef AS_MEMRCHR
+uchar* MEMRCHR(   const uchar *s, int c , size_t ss)
+#endif
+#else
+#define LOOP_BODY(p) return p;
+#endif
+/* strchrnul returns P from LOOP_END; rawmemchr leaves LOOP_END empty.  */
+#ifdef AS_STRCHRNUL
+#define LOOP_END(p) return p;
+uchar* STRCHRNUL(const uchar *s, int c )
+#endif
+#ifdef AS_RAWMEMCHR
+#define LOOP_END(p) /*cannot happen*/
+uchar* RAWMEMCHR(const uchar *s, int c )
+#endif
+{
+#if defined(AS_STRCHR) || defined(AS_STRRCHR) || defined(AS_STRCHRNUL)
+  if(__builtin_expect(c==0,0)) return s+strlen(s); /* C is NUL: return the terminator.  */
+#endif
+  uchar UNUSED *r = NULL; /* Written by LOOP_BODY in the reverse variants.  */
+  tp_vector vc=BROADCAST(c); /* VC is what TEST_CODE compares against.  */
+#include "loop.h"
+}
diff --git a/string/strchrnul.c b/string/strchrnul.c
index 0db5e23..6e6992f 100644
--- a/string/strchrnul.c
+++ b/string/strchrnul.c
@@ -21,149 +21,17 @@
    <http://www.gnu.org/licenses/>.  */
 
 #include <string.h>
-#include <memcopy.h>
 #include <stdlib.h>
 
 #undef __strchrnul
 #undef strchrnul
 
-/* Find the first occurrence of C in S or the final NUL byte.  */
-char *
-__strchrnul (s, c_in)
-     const char *s;
-     int c_in;
-{
-  const unsigned char *char_ptr;
-  const unsigned long int *longword_ptr;
-  unsigned long int longword, magic_bits, charmask;
-  unsigned char c;
 
-  c = (unsigned char) c_in;
+#ifndef STRCHRNUL
+#define STRCHRNUL __strchrnul
+#endif
 
-  /* Handle the first few characters by reading one character at a time.
-     Do this until CHAR_PTR is aligned on a longword boundary.  */
-  for (char_ptr = (const unsigned char *) s;
-       ((unsigned long int) char_ptr & (sizeof (longword) - 1)) != 0;
-       ++char_ptr)
-    if (*char_ptr == c || *char_ptr == '\0')
-      return (void *) char_ptr;
-
-  /* All these elucidatory comments refer to 4-byte longwords,
-     but the theory applies equally well to 8-byte longwords.  */
-
-  longword_ptr = (unsigned long int *) char_ptr;
-
-  /* Bits 31, 24, 16, and 8 of this number are zero.  Call these bits
-     the "holes."  Note that there is a hole just to the left of
-     each byte, with an extra at the end:
-
-     bits:  01111110 11111110 11111110 11111111
-     bytes: AAAAAAAA BBBBBBBB CCCCCCCC DDDDDDDD
-
-     The 1-bits make sure that carries propagate to the next 0-bit.
-     The 0-bits provide holes for carries to fall into.  */
-  switch (sizeof (longword))
-    {
-    case 4: magic_bits = 0x7efefeffL; break;
-    case 8: magic_bits = ((0x7efefefeL << 16) << 16) | 0xfefefeffL; break;
-    default:
-      abort ();
-    }
-
-  /* Set up a longword, each of whose bytes is C.  */
-  charmask = c | (c << 8);
-  charmask |= charmask << 16;
-  if (sizeof (longword) > 4)
-    /* Do the shift in two steps to avoid a warning if long has 32 bits.  */
-    charmask |= (charmask << 16) << 16;
-  if (sizeof (longword) > 8)
-    abort ();
-
-  /* Instead of the traditional loop which tests each character,
-     we will test a longword at a time.  The tricky part is testing
-     if *any of the four* bytes in the longword in question are zero.  */
-  for (;;)
-    {
-      /* We tentatively exit the loop if adding MAGIC_BITS to
-	 LONGWORD fails to change any of the hole bits of LONGWORD.
-
-	 1) Is this safe?  Will it catch all the zero bytes?
-	 Suppose there is a byte with all zeros.  Any carry bits
-	 propagating from its left will fall into the hole at its
-	 least significant bit and stop.  Since there will be no
-	 carry from its most significant bit, the LSB of the
-	 byte to the left will be unchanged, and the zero will be
-	 detected.
-
-	 2) Is this worthwhile?  Will it ignore everything except
-	 zero bytes?  Suppose every byte of LONGWORD has a bit set
-	 somewhere.  There will be a carry into bit 8.  If bit 8
-	 is set, this will carry into bit 16.  If bit 8 is clear,
-	 one of bits 9-15 must be set, so there will be a carry
-	 into bit 16.  Similarly, there will be a carry into bit
-	 24.  If one of bits 24-30 is set, there will be a carry
-	 into bit 31, so all of the hole bits will be changed.
-
-	 The one misfire occurs when bits 24-30 are clear and bit
-	 31 is set; in this case, the hole at bit 31 is not
-	 changed.  If we had access to the processor carry flag,
-	 we could close this loophole by putting the fourth hole
-	 at bit 32!
-
-	 So it ignores everything except 128's, when they're aligned
-	 properly.
-
-	 3) But wait!  Aren't we looking for C as well as zero?
-	 Good point.  So what we do is XOR LONGWORD with a longword,
-	 each of whose bytes is C.  This turns each byte that is C
-	 into a zero.  */
-
-      longword = *longword_ptr++;
-
-      /* Add MAGIC_BITS to LONGWORD.  */
-      if ((((longword + magic_bits)
-
-	    /* Set those bits that were unchanged by the addition.  */
-	    ^ ~longword)
-
-	   /* Look at only the hole bits.  If any of the hole bits
-	      are unchanged, most likely one of the bytes was a
-	      zero.  */
-	   & ~magic_bits) != 0 ||
-
-	  /* That caught zeroes.  Now test for C.  */
-	  ((((longword ^ charmask) + magic_bits) ^ ~(longword ^ charmask))
-	   & ~magic_bits) != 0)
-	{
-	  /* Which of the bytes was C or zero?
-	     If none of them were, it was a misfire; continue the search.  */
-
-	  const unsigned char *cp = (const unsigned char *) (longword_ptr - 1);
-
-	  if (*cp == c || *cp == '\0')
-	    return (char *) cp;
-	  if (*++cp == c || *cp == '\0')
-	    return (char *) cp;
-	  if (*++cp == c || *cp == '\0')
-	    return (char *) cp;
-	  if (*++cp == c || *cp == '\0')
-	    return (char *) cp;
-	  if (sizeof (longword) > 4)
-	    {
-	      if (*++cp == c || *cp == '\0')
-		return (char *) cp;
-	      if (*++cp == c || *cp == '\0')
-		return (char *) cp;
-	      if (*++cp == c || *cp == '\0')
-		return (char *) cp;
-	      if (*++cp == c || *cp == '\0')
-		return (char *) cp;
-	    }
-	}
-    }
-
-  /* This should never happen.  */
-  return NULL;
-}
+#define AS_STRCHRNUL
+#include "strchr.h"
 
 weak_alias (__strchrnul, strchrnul)
diff --git a/string/strlen.c b/string/strlen.c
index 5c1efda..7fdf07e 100644
--- a/string/strlen.c
+++ b/string/strlen.c
@@ -1,8 +1,5 @@
-/* Copyright (C) 1991,1993,1997,2000,2003,2009 Free Software Foundation, Inc.
+/* Copyright (C) 2012 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
-   Written by Torbjorn Granlund (tege@sics.se),
-   with help from Dan Sahlin (dan@sics.se);
-   commentary by Jim Blandy (jimb@ai.mit.edu).
 
    The GNU C Library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
@@ -15,92 +12,14 @@
    Lesser General Public License for more details.
 
    You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
+   License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#include <string.h>
-#include <stdlib.h>
 
-#undef strlen
+#ifndef STRLEN
+#define STRLEN strlen
+#endif
 
-/* Return the length of the null-terminated string STR.  Scan for
-   the null terminator quickly by testing four bytes at a time.  */
-size_t
-strlen (str)
-     const char *str;
-{
-  const char *char_ptr;
-  const unsigned long int *longword_ptr;
-  unsigned long int longword, himagic, lomagic;
+#define AS_STRLEN
+#include "strlen.h"
 
-  /* Handle the first few characters by reading one character at a time.
-     Do this until CHAR_PTR is aligned on a longword boundary.  */
-  for (char_ptr = str; ((unsigned long int) char_ptr
-			& (sizeof (longword) - 1)) != 0;
-       ++char_ptr)
-    if (*char_ptr == '\0')
-      return char_ptr - str;
-
-  /* All these elucidatory comments refer to 4-byte longwords,
-     but the theory applies equally well to 8-byte longwords.  */
-
-  longword_ptr = (unsigned long int *) char_ptr;
-
-  /* Bits 31, 24, 16, and 8 of this number are zero.  Call these bits
-     the "holes."  Note that there is a hole just to the left of
-     each byte, with an extra at the end:
-
-     bits:  01111110 11111110 11111110 11111111
-     bytes: AAAAAAAA BBBBBBBB CCCCCCCC DDDDDDDD
-
-     The 1-bits make sure that carries propagate to the next 0-bit.
-     The 0-bits provide holes for carries to fall into.  */
-  himagic = 0x80808080L;
-  lomagic = 0x01010101L;
-  if (sizeof (longword) > 4)
-    {
-      /* 64-bit version of the magic.  */
-      /* Do the shift in two steps to avoid a warning if long has 32 bits.  */
-      himagic = ((himagic << 16) << 16) | himagic;
-      lomagic = ((lomagic << 16) << 16) | lomagic;
-    }
-  if (sizeof (longword) > 8)
-    abort ();
-
-  /* Instead of the traditional loop which tests each character,
-     we will test a longword at a time.  The tricky part is testing
-     if *any of the four* bytes in the longword in question are zero.  */
-  for (;;)
-    {
-      longword = *longword_ptr++;
-
-      if (((longword - lomagic) & ~longword & himagic) != 0)
-	{
-	  /* Which of the bytes was the zero?  If none of them were, it was
-	     a misfire; continue the search.  */
-
-	  const char *cp = (const char *) (longword_ptr - 1);
-
-	  if (cp[0] == 0)
-	    return cp - str;
-	  if (cp[1] == 0)
-	    return cp - str + 1;
-	  if (cp[2] == 0)
-	    return cp - str + 2;
-	  if (cp[3] == 0)
-	    return cp - str + 3;
-	  if (sizeof (longword) > 4)
-	    {
-	      if (cp[4] == 0)
-		return cp - str + 4;
-	      if (cp[5] == 0)
-		return cp - str + 5;
-	      if (cp[6] == 0)
-		return cp - str + 6;
-	      if (cp[7] == 0)
-		return cp - str + 7;
-	    }
-	}
-    }
-}
-libc_hidden_builtin_def (strlen)
diff --git a/string/strlen.h b/string/strlen.h
new file mode 100644
index 0000000..950b8e1
--- /dev/null
+++ b/string/strlen.h
@@ -0,0 +1,39 @@
+/* Copyright (C) 2012 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Shared template: strlen.c sets AS_STRLEN, strnlen.c sets AS_STRNLEN.  */
+#define unroll 4
+#define prefetch 8
+/* NOTE(review): unroll/prefetch look like loop.h tuning knobs -- confirm.  */
+#include "vector.h"
+/* Hooks read by loop.h (not shown); LOOP_BODY returns the offset P-S.  */
+#define DETECT_ZERO_BYTE
+#define TEST_CODE(so,sn) vzero
+#define LOOP_BODY(p) return p-s;
+/* DETECT_END is S+SS, or the all-ones address when S+SS wraps around.  */
+#ifdef AS_STRNLEN
+#define DETECT_END ((s+ss>=s) ? s+ss : ((uchar*)((long)-1)))
+#define LOOP_END(p)  return p-s;
+size_t STRNLEN( uchar *s , size_t ss )
+#endif
+#ifdef AS_STRLEN
+#define LOOP_END(p)  return p-s;
+size_t STRLEN(  uchar *s )
+#endif
+{
+#include "loop.h"
+}
diff --git a/string/strnlen.c b/string/strnlen.c
index 65b9aa6..90b9725 100644
--- a/string/strnlen.c
+++ b/string/strnlen.c
@@ -1,15 +1,10 @@
-/* Find the length of STRING, but scan at most MAXLEN characters.
-   Copyright (C) 1991, 1993, 1997, 2000, 2001, 2005, 2011 Free Software Foundation, Inc.
-   Contributed by Jakub Jelinek <jakub@redhat.com>.
-
-   Based on strlen written by Torbjorn Granlund (tege@sics.se),
-   with help from Dan Sahlin (dan@sics.se);
-   commentary by Jim Blandy (jimb@ai.mit.edu).
+/* Copyright (C) 2012 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public License as
-   published by the Free Software Foundation; either version 2.1 of the
-   License, or (at your option) any later version.
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
 
    The GNU C Library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -17,149 +12,18 @@
    Lesser General Public License for more details.
 
    You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; see the file COPYING.LIB.  If
-   not, see <http://www.gnu.org/licenses/>.  */
-
-#include <string.h>
-#include <stdlib.h>
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
 
-/* Find the length of S, but scan at most MAXLEN characters.  If no
-   '\0' terminator is found in that many characters, return MAXLEN.  */
 
-#ifdef STRNLEN
-# define __strnlen STRNLEN
+#ifndef STRNLEN
+#define STRNLEN __strnlen
 #endif
 
-size_t
-__strnlen (const char *str, size_t maxlen)
-{
-  const char *char_ptr, *end_ptr = str + maxlen;
-  const unsigned long int *longword_ptr;
-  unsigned long int longword, himagic, lomagic;
-
-  if (maxlen == 0)
-    return 0;
-
-  if (__builtin_expect (end_ptr < str, 0))
-    end_ptr = (const char *) ~0UL;
-
-  /* Handle the first few characters by reading one character at a time.
-     Do this until CHAR_PTR is aligned on a longword boundary.  */
-  for (char_ptr = str; ((unsigned long int) char_ptr
-			& (sizeof (longword) - 1)) != 0;
-       ++char_ptr)
-    if (*char_ptr == '\0')
-      {
-	if (char_ptr > end_ptr)
-	  char_ptr = end_ptr;
-	return char_ptr - str;
-      }
-
-  /* All these elucidatory comments refer to 4-byte longwords,
-     but the theory applies equally well to 8-byte longwords.  */
-
-  longword_ptr = (unsigned long int *) char_ptr;
-
-  /* Bits 31, 24, 16, and 8 of this number are zero.  Call these bits
-     the "holes."  Note that there is a hole just to the left of
-     each byte, with an extra at the end:
-
-     bits:  01111110 11111110 11111110 11111111
-     bytes: AAAAAAAA BBBBBBBB CCCCCCCC DDDDDDDD
-
-     The 1-bits make sure that carries propagate to the next 0-bit.
-     The 0-bits provide holes for carries to fall into.  */
-  himagic = 0x80808080L;
-  lomagic = 0x01010101L;
-  if (sizeof (longword) > 4)
-    {
-      /* 64-bit version of the magic.  */
-      /* Do the shift in two steps to avoid a warning if long has 32 bits.  */
-      himagic = ((himagic << 16) << 16) | himagic;
-      lomagic = ((lomagic << 16) << 16) | lomagic;
-    }
-  if (sizeof (longword) > 8)
-    abort ();
+#define AS_STRNLEN
+#include "strlen.h"
 
-  /* Instead of the traditional loop which tests each character,
-     we will test a longword at a time.  The tricky part is testing
-     if *any of the four* bytes in the longword in question are zero.  */
-  while (longword_ptr < (unsigned long int *) end_ptr)
-    {
-      /* We tentatively exit the loop if adding MAGIC_BITS to
-	 LONGWORD fails to change any of the hole bits of LONGWORD.
-
-	 1) Is this safe?  Will it catch all the zero bytes?
-	 Suppose there is a byte with all zeros.  Any carry bits
-	 propagating from its left will fall into the hole at its
-	 least significant bit and stop.  Since there will be no
-	 carry from its most significant bit, the LSB of the
-	 byte to the left will be unchanged, and the zero will be
-	 detected.
-
-	 2) Is this worthwhile?  Will it ignore everything except
-	 zero bytes?  Suppose every byte of LONGWORD has a bit set
-	 somewhere.  There will be a carry into bit 8.  If bit 8
-	 is set, this will carry into bit 16.  If bit 8 is clear,
-	 one of bits 9-15 must be set, so there will be a carry
-	 into bit 16.  Similarly, there will be a carry into bit
-	 24.  If one of bits 24-30 is set, there will be a carry
-	 into bit 31, so all of the hole bits will be changed.
-
-	 The one misfire occurs when bits 24-30 are clear and bit
-	 31 is set; in this case, the hole at bit 31 is not
-	 changed.  If we had access to the processor carry flag,
-	 we could close this loophole by putting the fourth hole
-	 at bit 32!
-
-	 So it ignores everything except 128's, when they're aligned
-	 properly.  */
-
-      longword = *longword_ptr++;
-
-      if ((longword - lomagic) & himagic)
-	{
-	  /* Which of the bytes was the zero?  If none of them were, it was
-	     a misfire; continue the search.  */
-
-	  const char *cp = (const char *) (longword_ptr - 1);
-
-	  char_ptr = cp;
-	  if (cp[0] == 0)
-	    break;
-	  char_ptr = cp + 1;
-	  if (cp[1] == 0)
-	    break;
-	  char_ptr = cp + 2;
-	  if (cp[2] == 0)
-	    break;
-	  char_ptr = cp + 3;
-	  if (cp[3] == 0)
-	    break;
-	  if (sizeof (longword) > 4)
-	    {
-	      char_ptr = cp + 4;
-	      if (cp[4] == 0)
-		break;
-	      char_ptr = cp + 5;
-	      if (cp[5] == 0)
-		break;
-	      char_ptr = cp + 6;
-	      if (cp[6] == 0)
-		break;
-	      char_ptr = cp + 7;
-	      if (cp[7] == 0)
-		break;
-	    }
-	}
-      char_ptr = end_ptr;
-    }
-
-  if (char_ptr > end_ptr)
-    char_ptr = end_ptr;
-  return char_ptr - str;
-}
-#ifndef STRNLEN
+#ifndef NO_ALIAS
 weak_alias (__strnlen, strnlen)
 #endif
-libc_hidden_def (strnlen)
+
diff --git a/string/strrchr.c b/string/strrchr.c
index a986ff9..9d2d6c4 100644
--- a/string/strrchr.c
+++ b/string/strrchr.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 1991, 1995, 1996, 1997, 2003 Free Software Foundation, Inc.
+/* Copyright (C) 2012 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -12,38 +12,19 @@
    Lesser General Public License for more details.
 
    You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
+   License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#include <string.h>
 
-#undef strrchr
-
-/* Find the last occurrence of C in S.  */
-char *
-strrchr (const char *s, int c)
-{
-  register const char *found, *p;
-
-  c = (unsigned char) c;
-
-  /* Since strchr is fast, we use it rather than the obvious loop.  */
-
-  if (c == '\0')
-    return strchr (s, '\0');
-
-  found = NULL;
-  while ((p = strchr (s, c)) != NULL)
-    {
-      found = p;
-      s = p + 1;
-    }
+#ifndef STRRCHR
+#define STRRCHR strrchr
+#endif
 
-  return (char *) found;
-}
+#define AS_STRRCHR
+#include "strchr.h"
 
 #ifdef weak_alias
 #undef rindex
 weak_alias (strrchr, rindex)
 #endif
 libc_hidden_builtin_def (strrchr)
diff --git a/string/strstr.c b/string/strstr.c
index 10e6fdc..f09559a 100644
--- a/string/strstr.c
+++ b/string/strstr.c
@@ -1,6 +1,4 @@
-/* Return the offset of one string within another.
-   Copyright (C) 1994,1996,1997,2000,2001,2003,2008,2009
-   Free Software Foundation, Inc.
+/* Copyright (C) 2012 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -14,78 +12,14 @@
    Lesser General Public License for more details.
 
    You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
+   License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-/* This particular implementation was written by Eric Blake, 2008.  */
-
-#ifndef _LIBC
-# include <config.h>
-#endif
-
-/* Specification of strstr.  */
-#include <string.h>
-
-#include <stdbool.h>
-
-#ifndef _LIBC
-# define __builtin_expect(expr, val)   (expr)
-#endif
-
-#define RETURN_TYPE char *
-#define AVAILABLE(h, h_l, j, n_l)			\
-  (!memchr ((h) + (h_l), '\0', (j) + (n_l) - (h_l))	\
-   && ((h_l) = (j) + (n_l)))
-#include "str-two-way.h"
-
-#undef strstr
 
 #ifndef STRSTR
 #define STRSTR strstr
 #endif
 
-/* Return the first occurrence of NEEDLE in HAYSTACK.  Return HAYSTACK
-   if NEEDLE is empty, otherwise NULL if NEEDLE is not found in
-   HAYSTACK.  */
-char *
-STRSTR (const char *haystack_start, const char *needle_start)
-{
-  const char *haystack = haystack_start;
-  const char *needle = needle_start;
-  size_t needle_len; /* Length of NEEDLE.  */
-  size_t haystack_len; /* Known minimum length of HAYSTACK.  */
-  bool ok = true; /* True if NEEDLE is prefix of HAYSTACK.  */
-
-  /* Determine length of NEEDLE, and in the process, make sure
-     HAYSTACK is at least as long (no point processing all of a long
-     NEEDLE if HAYSTACK is too short).  */
-  while (*haystack && *needle)
-    ok &= *haystack++ == *needle++;
-  if (*needle)
-    return NULL;
-  if (ok)
-    return (char *) haystack_start;
-
-  /* Reduce the size of haystack using strchr, since it has a smaller
-     linear coefficient than the Two-Way algorithm.  */
-  needle_len = needle - needle_start;
-  haystack = strchr (haystack_start + 1, *needle_start);
-  if (!haystack || __builtin_expect (needle_len == 1, 0))
-    return (char *) haystack;
-  needle -= needle_len;
-  haystack_len = (haystack > haystack_start + needle_len ? 1
-		  : needle_len + haystack_start - haystack);
-
-  /* Perform the search.  Abstract memory is considered to be an array
-     of 'unsigned char' values, not an array of 'char' values.  See
-     ISO C 99 section 6.2.6.1.  */
-  if (needle_len < LONG_NEEDLE_THRESHOLD)
-    return two_way_short_needle ((const unsigned char *) haystack,
-				 haystack_len,
-				 (const unsigned char *) needle, needle_len);
-  return two_way_long_needle ((const unsigned char *) haystack, haystack_len,
-			      (const unsigned char *) needle, needle_len);
-}
-libc_hidden_builtin_def (strstr)
+#define AS_STRSTR
+#include "strstr.h"
 
-#undef LONG_NEEDLE_THRESHOLD
diff --git a/string/strstr.h b/string/strstr.h
new file mode 100644
index 0000000..016cc94
--- /dev/null
+++ b/string/strstr.h
@@ -0,0 +1,297 @@
+/* Copyright (C) 2012 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <stdlib.h>
+#include <string.h>
+
+#define unroll 4 /* Unroll factor: vector.h builds PARA (BYTES_AT_ONCE*unroll) and DO_ACTION from it.  */
+#define prefetch 8 /* NOTE(review): not referenced in the code visible here; presumably read by loop.h.  */
+#define small_treshold 128 /* Starting "buy" budget of the scalar search in the entry point below.  */
+
+#include "vector.h"
+
+/* _AS_STR_CASESTR_MEM (x, y, z) selects x when building strstr, y for
+   strcasestr and z for memmem; CHAR (p) reads the byte at p through it.  */
+#ifdef AS_STRSTR
+#define _AS_STR_CASESTR_MEM(x,y,z) x
+#endif
+#ifdef AS_STRCASESTR
+#define _AS_STR_CASESTR_MEM(x,y,z) y
+#endif
+#ifdef AS_MEMMEM
+#define _AS_STR_CASESTR_MEM(x,y,z) z
+#endif
+#define CHAR(x) _AS_STR_CASESTR_MEM(*(x),\
+tolower_fixed[*(x)],\
+*(x))
+
+/*TODO vectorize*/
+SI size_t strcmp_dir(const uchar *a,const uchar *b,size_t no,int dir)
+{
+  size_t matched = 0; /* Positions that agreed so far, stepping A and B by DIR; never exceeds NO.  */
+  while (matched < no && CHAR (a) == CHAR (b))
+    {
+      a += dir, b += dir;
+      matched++;
+    }
+  return matched;
+}
+
+/* Two-way algorithm: CROCHEMORE M., PERRIN D., 1991,
+   Two-way string-matching, Journal of the ACM 38(3):651-675.
+   Implementation based on http://www-igm.univ-mlv.fr/~lecroq/string/node26.html
+
+
+   We use a vectorized algorithm to find occurrences of fragments
+   of size ns-check starting at n+check-1.
+   On each occurrence we do a step of the two-way algorithm and
+   tell the finder, in the skip_to variable, where it should resume the search.
+*/
+
+static void two_way_preprocessing(uchar *n,size_t ns,size_t *per2,size_t *ell2,size_t *peri);
+static uchar *strstr_two_way(uchar *s, uchar *s_end, uchar *n, size_t ns)
+{ /* Two-way search for N (NS bytes) in S.  Only the per-candidate step is defined here; the scanning loop comes from strstr_vec.h.  */
+  size_t ell,   per, peri;
+  two_way_preprocessing(n,ns,&per,&ell,&peri); /* ell: split point, per: shift after a backward mismatch, peri: nonzero if periodic() held.  */
+  size_t fw,fw_no,bw,bw_no;
+  size_t fw_from,fw_to,bw_from,bw_to;
+  size_t check=ns-2; /* n[ns-1] and n[ns-2] are compared by the code in strstr_vec.h (vn0/vn1).  */
+  fw_from = ell; /* Forward scan covers n[ell .. max(ell,check)).  */
+  fw_to   = max(ell,check);
+  bw_from = min(ell,check); /* Backward scan covers n[0 .. min(ell,check)), walked downwards.  */
+  bw_to   = 0;
+  fw_no   = fw_to   - fw_from;
+  bw_no   = bw_from - bw_to;
+
+  uchar *skip_to=s+check;
+  s+=ns-2;
+/* NOTE(review): skip_to is updated by LOOP_BODY and presumably read by loop.h (CAN_SKIP) to resume the scan; loop.h is not visible here.  */
+#define CAN_SKIP
+#define CASE_CONVERT(x) _AS_STR_CASESTR_MEM(x, parallel_tolower(x), x)
+#define MASK_CONVERT(x) CHAR(&x)
+/* LOOP_BODY (p): P arrives pointing at the last byte of a candidate; the body returns the match start or advances P.  */
+#define LOOP_BODY(p)\
+  p -= ns - 1;\
+  fw = strcmp_dir(n + fw_from ,p + fw_from, fw_no , 1);\
+  if (fw != fw_no )\
+    {\
+      p += fw + 1;\
+    }\
+  else\
+    {\
+      bw = strcmp_dir(n + bw_from - 1, p + bw_from - 1, bw_no, -1);\
+      if ( bw != bw_no )\
+        {\
+          p += per;\
+          if (peri) /*Prefix memoization, see the definition of peri.*/\
+            {\
+              if(_AS_STR_CASESTR_MEM(0,0,p+ns>s_end)) return NULL;\
+              /*A zero byte in the forward check causes a mismatch.*/\
+              fw = strcmp_dir(n + ns - per ,p + ns - per, per , 1);\
+              if (fw != per )\
+                {\
+                  p += fw+ (ns-per-ell) + 1;\
+                }\
+              else\
+                {\
+                  /*Backward scan always succeeds.*/\
+                  return p;\
+                }\
+            }\
+        }\
+      else\
+        {\
+          return p;\
+        }\
+    }\
+  skip_to = p + (ns - 1);
+
+#include "strstr_vec.h"
+}
+
+#ifdef AS_STRCASESTR
+
+#endif
+static uchar *strstr_vec(uchar *s,uchar *s_end,uchar *n,size_t ns)
+{ /* Vectorized search for N in S; hands over to strstr_two_way once failed verifications cost more than the budget.  */
+#ifdef AS_STRCASESTR /* CASECHECK (u): u's case class has one member, or two members differing only in bit 32 ('A'^'a').  */
+#define CASECHECK(u) (tolower_class_no[u]==1 || (tolower_class_no[u]==2 && (tolower_class[u][0]^tolower_class[u][1])==32))
+  if (!(CASECHECK(n[ns-1]) || CASECHECK(n[ns-2]))) /* NOTE(review): the vector path folds case by OR-ing 'A'^'a' into both trailing bytes; confirm "||" rather than "&&" is intended.  */
+    return strstr_two_way(s,s_end,n,ns);
+#undef CASECHECK
+#endif
+  size_t buy=8*ns+64,rent=0; /* rent: bytes compared in failed verifications so far; see the test in LOOP_BODY.  */
+  size_t check_last=_AS_STR_CASESTR_MEM(2,0,2); /* Trailing needle bytes left out of the phase-2 vector (none for strcasestr).  */
+  tp_mask   phase2mask=0;
+  uchar phase2n[BYTES_AT_ONCE];
+  int ii; /* Fill phase2n from its top with up to BYTES_AT_ONCE needle bytes ending check_last before the end; phase2mask marks the filled lanes.  */
+  for (ii=0; ii<min(ns-check_last,BYTES_AT_ONCE); ii++)
+    {
+      phase2n[BYTES_AT_ONCE-1-ii]=CHAR(n+ns-1-check_last-ii);
+      phase2mask|=bit_i(BYTES_AT_ONCE-1-ii);
+    }
+  tp_vector phase2v=LOAD_UNALIGNED(phase2n);
+#define PHASE2_CONVERT(x) _AS_STR_CASESTR_MEM(x, parallel_tolower(x), x)
+  /*TODO use pcmpistrm to possibly kill next 15 positions*/
+#define PHASE2TEST ((get_mask(TEST_EQ(\
+        PHASE2_CONVERT(LOAD_UNALIGNED(\
+                       p+ns-check_last-BYTES_AT_ONCE)),\
+        phase2v))&phase2mask)==phase2mask)
+  size_t check = ns - min(ns, BYTES_AT_ONCE+check_last); /* Needle prefix covered neither by phase 2 nor by the trailing bytes; LOOP_BODY verifies it backwards.  */
+  s += ns-2;
+  tp_vector UNUSED diff=BROADCAST('A'^'a');
+#define CASE_CONVERT(x) _AS_STR_CASESTR_MEM(x, OR(x,diff),  x)
+#define MASK_CONVERT(x) _AS_STR_CASESTR_MEM(x, x|('A'^'a'), x)
+#define LOOP_BODY(p)\
+  p -= ns - 1;\
+  if(PHASE2TEST){\
+     size_t checked=strcmp_dir(p + check - 1,n + check - 1,check , -1);\
+     if (checked == check)\
+        return p;\
+     rent+=checked;\
+     if(buy+2*(p-s)<rent)\
+        return strstr_two_way(p,s_end,n,ns);\
+  }
+/* The scanning loop itself is supplied by the include below, which expands LOOP_BODY.  */
+#include "strstr_vec.h"
+}
+
+
+
+#ifdef AS_STRSTR
+uchar *STRSTR(const uchar *s,const uchar *n)
+#endif
+#ifdef AS_STRCASESTR
+uchar *STRCASESTR(const uchar *s,const uchar *n)
+#endif
+#ifdef AS_MEMMEM
+uchar *MEMMEM(const uchar *s,size_t ss,const uchar *n,size_t ns)
+#endif
+{ /* Entry point: find needle N in haystack S.  Returns the first match, S itself for an empty needle, or NULL.  */
+#ifdef AS_STRCASESTR
+  if(!calc_tolower_class) calc_tolower_cls(); /*TODO recalculate when locale changes. */
+#endif
+  size_t buy=small_treshold,rent=0; /* Cost accounting: stay in the scalar loop until rent outgrows buy.  */
+  uchar *p=(uchar*)s;
+#if defined( AS_STRSTR) || defined(AS_STRCASESTR)
+  /* TODO handle case when ss<ns by searching for end of n,s in parallel.*/
+  size_t ns=0,ss=(size_t)-1; /* C strings carry no length: give ss a defined value so the overflow check on s_end below selects the "no end" sentinel instead of reading an uninitialized variable.  */
+  while(n[ns]) /* Measure the needle; give up as soon as the haystack proves shorter.  */
+    {
+      if(!s[ns]) return NULL;
+      ns++;
+    }
+#else
+  if( ns > ss) return NULL;
+#endif
+  if (!ns) return (uchar*) s;
+  uchar *s_end=(uchar*)((s+ss>=s) ? s+ss : ((uchar*)((long)-1)));
+  /*For strstr and memmem this decreases startup cost.
+    For strcasestr we align haystack.*/
+  size_t check=ns-_AS_STR_CASESTR_MEM(1,0,1); /* Bytes compared by strcmp_dir: all but the last for strstr/memmem, the whole needle for strcasestr.  */
+  size_t page_offset= ((size_t)s)%4096;
+  p += check;
+  while(1)
+    {
+#define STRCHR(s,sn,c) _AS_STR_CASESTR_MEM( strchr((char*)s,c),\
+                                            (*(s-1) ? s : NULL),\
+                                            memchr((void*)s,c,sn))
+      /*strpbrk(s,tolower_class[(uchar) c]) is too slow - circa 100 cycles.*/
+      p=(uchar*) STRCHR(p,s_end-p,((char*)n)[ns-1]);
+      if(!p) return NULL;
+      p -= check; /* Step back from the located last byte to the start of the candidate.  */
+      size_t checked = strcmp_dir(n, p, check, 1);
+      if (checked == check) return p;
+      rent += check + 32;
+      /*The next implementation is faster but has a large startup cost.*/
+      if(buy < rent + (p - s) &&
+          p >= s - page_offset +BYTES_AT_ONCE)
+        {
+          /*The next implementations need two invariants.
+            First  is that the string started before the position that is passed.
+            Second is that p - BYTES_AT_ONCE is valid memory.*/
+          return strstr_vec((uchar*)p+1,s_end,(uchar*)n,ns);
+        }
+      p++;
+      p += check;
+    }
+}
+
+/*Two way preprocessing.*/
+SI size_t maxSuf(uchar *n, size_t ns, size_t *per, size_t invert)
+{ /* NOTE(review): appears to be the maximal-suffix step of the two-way preprocessing, with ms and p offset by one from the textbook form — confirm.  Returns ms and stores p+1 in *PER; INVERT flips the byte ordering.  */
+  /*Note that per+ms+1<ns.*/
+  size_t p,ms, j, k;
+  uchar a, b;
+/* ms: start of the current candidate suffix; n[j+k] and n[ms+k] are the bytes compared; p+1 is what gets reported through *per.  */
+  ms = 0;
+  j  = 1;
+  k  = p = 0;
+  while (j + k < ns)
+    {
+      a = CHAR(n + j +  k);
+      b = CHAR(n + ms + k);
+      if (invert ? (a > b) : (a < b))
+        {
+          j += k;
+          k = 0;
+          p = j - ms;
+          j++;
+        }
+      else if (a == b)
+        {
+          if (k == p)
+            {
+              j += k;
+              k = 0;
+              j++;
+            }
+          else
+            {
+              k++;
+            }
+        }
+      else   /* invert ? a < b : a > b*/
+        {
+          ms = j; /* The suffix starting at j becomes the new candidate.  */
+          j++;
+          k = p = 0;
+        }
+    }
+  *per =(p+1);
+  return ms;
+}
+
+SI size_t periodic(uchar *a,uchar *b,size_t siz)
+{ /* 1 when the first SIZ bytes of A and B agree under CHAR, otherwise 0.  */
+  return siz == strcmp_dir (a, b, siz, 1) ? 1 : 0;
+}
+
+static void two_way_preprocessing(uchar *n,size_t ns,size_t *per2,size_t *ell2,size_t *peri)
+{ /* Two-way parameters of N: *ELL2 is the split position, *PER2 the shift, *PERI nonzero when periodic() holds for that split.  */
+  size_t per_fwd, per_rev, split, shift;
+  size_t suf_fwd = maxSuf (n, ns, &per_fwd, 0);
+  size_t suf_rev = maxSuf (n, ns, &per_rev, 1);
+  if (suf_fwd > suf_rev)
+    split = suf_fwd, shift = per_fwd;
+  else
+    split = suf_rev, shift = per_rev;
+  *peri = periodic (n, n + shift, split);
+  if (!*peri)
+    shift = max (split, ns - split) + 1;
+  *per2 = shift, *ell2 = split;
+}
diff --git a/string/strstr_vec.h b/string/strstr_vec.h
new file mode 100644
index 0000000..b257e09
--- /dev/null
+++ b/string/strstr_vec.h
@@ -0,0 +1,52 @@
+/* Copyright (C) 2012 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+
+tp_vector vn0=BROADCAST(MASK_CONVERT(n[ns-1-0])); /* Last needle byte (after MASK_CONVERT) in every lane.  */
+tp_vector vn1=BROADCAST(MASK_CONVERT(n[ns-1-1])); /* Second-to-last needle byte, likewise.  */
+tp_vector e0,e1;
+#ifdef AS_STRSTR
+#define DETECT_ZERO_BYTE
+#endif
+#ifdef AS_STRCASESTR
+#define DETECT_ZERO_BYTE
+#endif
+#ifdef AS_MEMMEM
+#define DETECT_END s_end
+#endif
+/* TEST_CODE leaves in mvec a per-lane flag for positions where both trailing needle bytes match.  */
+#ifdef USE_ARITHMETIC
+#define TEST_CODE(so,sn) vzero;\
+        e0   =XOR(CONCAT(sn,so,BYTES_AT_ONCE-0),vn0);\
+        e1   =XOR(CONCAT(sn,so,BYTES_AT_ONCE-1),vn1);\
+        mvec=TEST_ZERO(OR(e0,e1));
+#else
+#define TEST_CODE(so,sn) vzero;\
+     sn   = CASE_CONVERT(sn);\
+     e0   = TEST_EQ(CONCAT(sn,so,BYTES_AT_ONCE-0),vn0); \
+     e1   = TEST_EQ(CONCAT(sn,so,BYTES_AT_ONCE-1),vn1); \
+     mvec = (AND(e0,e1));
+#endif
+/* NOTE(review): DETECT_ZERO_BYTE, DETECT_END and LOOP_END are presumably consumed by loop.h, which is not visible here.  */
+#define LOOP_END(p) return NULL;
+#include "loop.h"
+/* Drop the hooks so that this file can be included again with a different LOOP_BODY.  */
+#undef TEST_CODE
+#undef LOOP_BODY
+#undef LOOP_END
+#undef CASE_CONVERT
+#undef MASK_CONVERT
diff --git a/string/vector.h b/string/vector.h
new file mode 100644
index 0000000..f4479cd
--- /dev/null
+++ b/string/vector.h
@@ -0,0 +1,120 @@
+/* Copyright (C) 2012 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+
+/* Vectorized primitives for string matching.  They operate on many (4, 8, 16 or 32) unsigned bytes at once.  The allowed operations are:
+  TEST_ZERO(x)      - set the highest bit of each byte that is zero to 1, and to 0 otherwise.
+  TEST_EQ(x,y)      - set the highest bit of each byte where x and y are equal to 1, and to 0 otherwise.
+  BROADCAST(c)      - return a vector in which every byte has the value c.
+  TEST_RANGE(x,y,z) - set the highest bit of each byte where xi <= yi <= zi to 1, and to 0 otherwise.  The condition zi-xi<128 must hold.
+  AND,OR,XOR,ANDNOT - bytewise logical operations.
+  SHIFT_UP(x,k), SHIFT_DOWN(x,k) - shift vector x by k bytes up/down.
+  CONCAT(xlow,xhigh,k) - concatenate xlow and xhigh and return the bytes starting from the k-th.
+  In shifts and concatenation k must be a constant.
+
+  To support another vector extension, see the sysdeps/x86_64/sse.h file.
+*/
+typedef unsigned char uchar;
+#define SI static inline
+#define UNUSED __attribute__((unused))
+
+#include <stdlib.h>
+#include <ctype.h>
+/*TODO these tables should be recalculated when the locale changes.*/
+static uchar _tolower_class[512];
+static uchar *tolower_class[256];
+static uchar tolower_class_no[256];
+static uchar tolower_fixed[256];
+static int calc_tolower_class=0;
+SI void calc_tolower_cls(void)
+{
+  int i,j;
+  uchar *p=_tolower_class;
+  /* Build our own lowercase table in which every byte that is not uppercase
+     maps to itself (the author did not want to rely on tolower for those).*/
+  /* A second reason is that a tolower call is slow because the compiler spills all used xmm registers.*/
+  for (i=0; i<256; i++) tolower_fixed[i] = isupper(i) ? tolower(i) : i;
+  /* Calculate equivalence classes: bytes that share a tolower_fixed value.*/
+  for (i=0; i<256; i++)
+    {
+      for(j=0; j<i; j++) if(tolower_fixed[i]==tolower_fixed[j]) /* An earlier byte folds the same way: share its class.  */
+          {
+            tolower_class_no[i]=tolower_class_no[j];
+            tolower_class[i]=tolower_class[j];
+            goto skip;
+          }
+      tolower_class[i]   =p; /* New class: its members are appended to _tolower_class, followed by a 0 byte.  */
+      tolower_class_no[i]=0;
+      for(j=i; j<256; j++)
+        {
+          if(tolower_fixed[i]==tolower_fixed[j])
+            {
+              tolower_class_no[i]++;
+              *p++=j;
+            }
+        }
+      *p++=0;
+skip:
+      ;
+    }
+  calc_tolower_class=1; /* Marks the tables as built; the strcasestr entry point tests this flag.  */
+}
+
+#define BYTES_AT_ONCE sizeof(tp_vector)
+#define PARA (BYTES_AT_ONCE*unroll)
+#define VSIZ_BYTE sizeof(tp_vector)
+#define VSIZ_BIT  (VSIZ_BYTE*8)
+#define MSIZ_BYTE sizeof(tp_mask)
+#define MSIZ_BIT  (MSIZ_BYTE*8)
+
+#define ALIGN(x,u)         s_offset=((size_t) x)%((u)*BYTES_AT_ONCE);           s2=(uchar *)(((size_t) x)&((long) (~(u*BYTES_AT_ONCE-1))));
+/* The line "s2=x-offset;" would be clearer, but with it some compilers do not know that s2 is aligned.  */
+
+#define CACHE_LINE_SIZE 64
+#define UN_OP(n,e) SI tp_vector n(tp_vector x){ return e;}
+#define BIN_OP(n,e) SI tp_vector n(tp_vector x,tp_vector y){ return e;}
+#define MASK_OP(name,exp) SI tp_mask name(tp_mask x,int y){ return exp; }
+
+#if defined( USE_SSE2) | defined(USE_SSE2_NO_BSF) | defined(USE_SSSE3) | defined(USE_SSE4_1)
+#include "sse.h"
+#else
+#include "arit.h"
+#endif
+#undef UN_OP
+#undef BIN_OP
+#undef MASK_OP
+
+#if unroll==1
+#define DO_ACTION ACTION(0)
+#define AGREGATE_VECTOR  mvec0
+#elif unroll==2
+#define DO_ACTION ACTION(0) ACTION(1)
+#define  AGREGATE_VECTOR    OR(mvec0,mvec1)
+#elif unroll==4
+#define DO_ACTION ACTION(0) ACTION(1) ACTION(2) ACTION(3)
+#define AGREGATE_VECTOR OR(OR(mvec0,mvec1),OR(mvec2,mvec3))
+#endif
+
+SI size_t min(size_t x,size_t y)
+{ /* Smaller of X and Y.  */
+  return y<x ? y : x;
+}
+SI size_t max(size_t x,size_t y)
+{ /* Larger of X and Y.  */
+  return y>x ? y : x;
+}
+
diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S
deleted file mode 100644
index dcc8bef..0000000
--- a/sysdeps/x86_64/memchr.S
+++ /dev/null
@@ -1,311 +0,0 @@
-/* Copyright (C)  2011 Free Software Foundation, Inc.
-   Contributed by Intel Corporation.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-
-/* fast SSE2 version with using pmaxub and 64 byte loop */
-
-	.text
-ENTRY(memchr)
-	movd	%rsi, %xmm1
-	mov	%rdi, %rcx
-
-	punpcklbw %xmm1, %xmm1
-	test	%rdx, %rdx
-	jz	L(return_null)
-	punpcklbw %xmm1, %xmm1
-
-	and	$63, %rcx
-	pshufd	$0, %xmm1, %xmm1
-
-	cmp	$48, %rcx
-	ja	L(crosscache)
-
-	movdqu	(%rdi), %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb %xmm0, %eax
-	test	%eax, %eax
-
-	jnz	L(matches_1)
-	sub	$16, %rdx
-	jbe	L(return_null)
-	add	$16, %rdi
-	and	$15, %rcx
-	and	$-16, %rdi
-	add	%rcx, %rdx
-	sub	$64, %rdx
-	jbe	L(exit_loop)
-	jmp	L(loop_prolog)
-
-	.p2align 4
-L(crosscache):
-	and	$15, %rcx
-	and	$-16, %rdi
-	movdqa	(%rdi), %xmm0
-
-	pcmpeqb	%xmm1, %xmm0
-/* Check if there is a match.  */
-	pmovmskb %xmm0, %eax
-/* Remove the leading bytes.  */
-	sar	%cl, %eax
-	test	%eax, %eax
-	je	L(unaligned_no_match)
-/* Check which byte is a match.  */
-	bsf	%eax, %eax
-
-	sub	%rax, %rdx
-	jbe	L(return_null)
-	add	%rdi, %rax
-	add	%rcx, %rax
-	ret
-
-	.p2align 4
-L(unaligned_no_match):
-	add	%rcx, %rdx
-	sub	$16, %rdx
-	jbe	L(return_null)
-	add	$16, %rdi
-	sub	$64, %rdx
-	jbe	L(exit_loop)
-
-	.p2align 4
-L(loop_prolog):
-	movdqa	(%rdi), %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb %xmm0, %eax
-	test	%eax, %eax
-	jnz	L(matches)
-
-	movdqa	16(%rdi), %xmm2
-	pcmpeqb	%xmm1, %xmm2
-	pmovmskb %xmm2, %eax
-	test	%eax, %eax
-	jnz	L(matches16)
-
-	movdqa	32(%rdi), %xmm3
-	pcmpeqb	%xmm1, %xmm3
-	pmovmskb %xmm3, %eax
-	test	%eax, %eax
-	jnz	L(matches32)
-
-	movdqa	48(%rdi), %xmm4
-	pcmpeqb	%xmm1, %xmm4
-	add	$64, %rdi
-	pmovmskb %xmm4, %eax
-	test	%eax, %eax
-	jnz	L(matches0)
-
-	test	$0x3f, %rdi
-	jz	L(align64_loop)
-
-	sub	$64, %rdx
-	jbe	L(exit_loop)
-
-	movdqa	(%rdi), %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb %xmm0, %eax
-	test	%eax, %eax
-	jnz	L(matches)
-
-	movdqa	16(%rdi), %xmm2
-	pcmpeqb	%xmm1, %xmm2
-	pmovmskb %xmm2, %eax
-	test	%eax, %eax
-	jnz	L(matches16)
-
-	movdqa	32(%rdi), %xmm3
-	pcmpeqb	%xmm1, %xmm3
-	pmovmskb %xmm3, %eax
-	test	%eax, %eax
-	jnz	L(matches32)
-
-	movdqa	48(%rdi), %xmm3
-	pcmpeqb	%xmm1, %xmm3
-	pmovmskb %xmm3, %eax
-
-	add	$64, %rdi
-	test	%eax, %eax
-	jnz	L(matches0)
-
-	mov	%rdi, %rcx
-	and	$-64, %rdi
-	and	$63, %rcx
-	add	%rcx, %rdx
-
-	.p2align 4
-L(align64_loop):
-	sub	$64, %rdx
-	jbe	L(exit_loop)
-	movdqa	(%rdi), %xmm0
-	movdqa	16(%rdi), %xmm2
-	movdqa	32(%rdi), %xmm3
-	movdqa	48(%rdi), %xmm4
-
-	pcmpeqb	%xmm1, %xmm0
-	pcmpeqb	%xmm1, %xmm2
-	pcmpeqb	%xmm1, %xmm3
-	pcmpeqb	%xmm1, %xmm4
-
-	pmaxub	%xmm0, %xmm3
-	pmaxub	%xmm2, %xmm4
-	pmaxub	%xmm3, %xmm4
-	pmovmskb %xmm4, %eax
-
-	add	$64, %rdi
-
-	test	%eax, %eax
-	jz	L(align64_loop)
-
-	sub	$64, %rdi
-
-	pmovmskb %xmm0, %eax
-	test	%eax, %eax
-	jnz	L(matches)
-
-	pmovmskb %xmm2, %eax
-	test	%eax, %eax
-	jnz	L(matches16)
-
-	movdqa	32(%rdi), %xmm3
-	pcmpeqb	%xmm1, %xmm3
-
-	pcmpeqb	48(%rdi), %xmm1
-	pmovmskb %xmm3, %eax
-	test	%eax, %eax
-	jnz	L(matches32)
-
-	pmovmskb %xmm1, %eax
-	bsf	%eax, %eax
-	lea	48(%rdi, %rax), %rax
-	ret
-
-	.p2align 4
-L(exit_loop):
-	add	$32, %rdx
-	jle	L(exit_loop_32)
-
-	movdqa	(%rdi), %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb %xmm0, %eax
-	test	%eax, %eax
-	jnz	L(matches)
-
-	movdqa	16(%rdi), %xmm2
-	pcmpeqb	%xmm1, %xmm2
-	pmovmskb %xmm2, %eax
-	test	%eax, %eax
-	jnz	L(matches16)
-
-	movdqa	32(%rdi), %xmm3
-	pcmpeqb	%xmm1, %xmm3
-	pmovmskb %xmm3, %eax
-	test	%eax, %eax
-	jnz	L(matches32_1)
-	sub	$16, %rdx
-	jle	L(return_null)
-
-	pcmpeqb	48(%rdi), %xmm1
-	pmovmskb %xmm1, %eax
-	test	%eax, %eax
-	jnz	L(matches48_1)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(exit_loop_32):
-	add	$32, %rdx
-	movdqa	(%rdi), %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb %xmm0, %eax
-	test	%eax, %eax
-	jnz	L(matches_1)
-	sub	$16, %rdx
-	jbe	L(return_null)
-
-	pcmpeqb	16(%rdi), %xmm1
-	pmovmskb %xmm1, %eax
-	test	%eax, %eax
-	jnz	L(matches16_1)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(matches0):
-	bsf	%eax, %eax
-	lea	-16(%rax, %rdi), %rax
-	ret
-
-	.p2align 4
-L(matches):
-	bsf	%eax, %eax
-	add	%rdi, %rax
-	ret
-
-	.p2align 4
-L(matches16):
-	bsf	%eax, %eax
-	lea	16(%rax, %rdi), %rax
-	ret
-
-	.p2align 4
-L(matches32):
-	bsf	%eax, %eax
-	lea	32(%rax, %rdi), %rax
-	ret
-
-	.p2align 4
-L(matches_1):
-	bsf	%eax, %eax
-	sub	%rax, %rdx
-	jbe	L(return_null)
-	add	%rdi, %rax
-	ret
-
-	.p2align 4
-L(matches16_1):
-	bsf	%eax, %eax
-	sub	%rax, %rdx
-	jbe	L(return_null)
-	lea	16(%rdi, %rax), %rax
-	ret
-
-	.p2align 4
-L(matches32_1):
-	bsf	%eax, %eax
-	sub	%rax, %rdx
-	jbe	L(return_null)
-	lea	32(%rdi, %rax), %rax
-	ret
-
-	.p2align 4
-L(matches48_1):
-	bsf	%eax, %eax
-	sub	%rax, %rdx
-	jbe	L(return_null)
-	lea	48(%rdi, %rax), %rax
-	ret
-
-	.p2align 4
-L(return_null):
-	xor	%rax, %rax
-	ret
-END(memchr)
-
-strong_alias (memchr, __memchr)
-
-libc_hidden_builtin_def(memchr)
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index dd6c27d..9d088fe 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -9,24 +9,64 @@ ifeq ($(subdir),string)
 sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
 		   strend-sse4 memcmp-sse4 memcpy-ssse3 mempcpy-ssse3 \
 		   memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
-		   memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \
+		   memmove-ssse3-back  strcasecmp_l-ssse3 \
 		   strncase_l-ssse3 strlen-sse4 strlen-sse2-no-bsf memset-x86-64 \
 		   strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
 		   strcpy-sse2-unaligned strncpy-sse2-unaligned \
 		   stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
 		   strcat-sse2-unaligned strncat-sse2-unaligned \
 		   strcat-ssse3 strncat-ssse3 strlen-sse2-pminub \
-		   strnlen-sse2-no-bsf strrchr-sse2-no-bsf strchr-sse2-no-bsf \
-		   memcmp-ssse3
+		   memcmp-ssse3 strchr-sse2-no-bsf
+
+sysdep_routines += \
+strnlen_sse2_no_bsf\
+strnlen_sse2\
+strnlen
+sysdep_routines += \
+strstr_sse2_no_bsf\
+strstr_sse2\
+strstr_ssse3\
+strstr
+CFLAGS-strstr_ssse3.c  += -mssse3
+sysdep_routines += \
+strcasestr_sse2_no_bsf\
+strcasestr_sse2\
+strcasestr_ssse3\
+strcasestr
+CFLAGS-strcasestr_ssse3.c  += -mssse3
+sysdep_routines += \
+memmem_sse2_no_bsf\
+memmem_sse2\
+memmem_ssse3\
+memmem
+CFLAGS-memmem_ssse3.c  += -mssse3
+sysdep_routines += \
+strrchr_sse2_no_bsf\
+strrchr_sse2\
+strrchr
+sysdep_routines += \
+strchrnul_sse2_no_bsf\
+strchrnul_sse2\
+strchrnul
+sysdep_routines += \
+memchr_sse2_no_bsf\
+memchr_sse2\
+memchr
+sysdep_routines += \
+rawmemchr_sse2_no_bsf\
+rawmemchr_sse2\
+rawmemchr
+sysdep_routines += \
+memrchr_sse2_no_bsf\
+memrchr_sse2\
+memrchr
+
 ifeq (yes,$(config-cflags-sse4))
-sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift
+sysdep_routines += strcspn-c strpbrk-c strspn-c   varshift
 CFLAGS-varshift.c += -msse4
 CFLAGS-strcspn-c.c += -msse4
 CFLAGS-strpbrk-c.c += -msse4
 CFLAGS-strspn-c.c += -msse4
-CFLAGS-strstr.c += -msse4
-CFLAGS-strcasestr.c += -msse4
-CFLAGS-strcasestr-nonascii.c += -msse4
 endif
 endif
 
diff --git a/sysdeps/x86_64/multiarch/gen_stub b/sysdeps/x86_64/multiarch/gen_stub
new file mode 100755
index 0000000..3289335
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/gen_stub
@@ -0,0 +1,111 @@
+fn(){ # Emit the Makefile fragment (stdout) and the C stub files for one multiarch string function.
+J=$1 # Function name, e.g. strstr.
+TP=$2 # C return type.
+ARG=$3 # C parameter declaration list.
+ARGN=$4 # Matching argument names, used when forwarding the call.
+BASE=$5 # Header under string/ that holds the implementation.
+TYPES=$6 # Space-separated list of variants, e.g. "sse2_no_bsf sse2".
+EXT=$7 # Non-empty: define __name and make name a weak alias of it.
+# Makefile fragment: one object per variant plus the dispatcher.
+echo "sysdep_routines += \\"
+for I in $TYPES; do
+  echo "${J}_${I}\\"
+done
+echo "${J}"
+# One stub per variant: set AS_<FUNC>, USE_<VARIANT> and the symbol name, then include the shared header.
+for I in $TYPES; do
+F="${J}_${I}.c"
+IU=`echo $I | tr '[a-z]' '[A-Z]'`
+JU=`echo $J | tr '[a-z]' '[A-Z]'`
+echo "/*generated by gen_stub*/"  > $F
+echo "#define AS_${JU}"          >> $F
+echo "#define USE_${IU}"         >> $F
+echo "#define ${JU} __${J}_${I}" >> $F
+echo "#include \"string/${BASE}.h\""       >> $F
+done
+# Quote EXT so the test stays well-formed whatever the argument contains.
+if [ -z "$EXT" ]; then
+FN=$J
+ALIASED=""
+else
+FN="__${J}"
+ALIASED="#ifndef NO_ALIAS
+weak_alias(${FN},${J});
+#endif"
+fi
+# Dispatcher file: ifunc selection when building shared libc, a plain sse2 wrapper otherwise.
+echo "/*generated by gen_stub*/"  > "${J}.c"
+
+echo "
+#include <sysdep.h>
+#ifndef _LIBC
+# include <config.h>
+#endif
+
+#if defined SHARED  && !defined NOT_IN_libc
+
+#include \"init-arch.h\"
+
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(name) \
+  __hidden_ver1 (__${J}_base, __GI_${J}, __${J}_base);
+
+" >> "${J}.c"
+
+for I in $TYPES; do
+  echo "extern ${TP} __${J}_${I}(${ARG}) attribute_hidden;" >> "${J}.c"
+done
+echo " ${TP} ${FN}(${ARG});
+${TP} __${J}_base(${ARG}){  
+  return __${J}_sse2(${ARGN}); 
+}
+libc_hidden_builtin_def (__${J}_base)
+extern __typeof (__${J}_base) __${J}_base attribute_hidden;
+" >> "${J}.c"
+if [ "$TYPES" = "sse2_no_bsf sse2 ssse3" ]; then # "=" is the portable test operator; "==" is a bash extension.
+echo "libc_ifunc (${FN}, HAS_SSSE3 ? __${J}_ssse3 : (HAS_SLOW_BSF ? __${J}_sse2_no_bsf : __${J}_sse2));" >> "${J}.c"
+echo "CFLAGS-${J}_ssse3.c  += -mssse3"
+else
+echo "libc_ifunc (${FN}, (HAS_SLOW_BSF ? __${J}_sse2_no_bsf : __${J}_sse2));" >> "${J}.c"
+fi
+echo "
+#else
+
+#include \"${J}_sse2.c\"
+
+$TP ${FN}(${ARG}){
+  return __${J}_sse2(${ARGN});
+}
+#endif
+${ALIASED}
+" >> "${J}.c"
+
+}
+#fn strlen "size_t" "const char* n" "n"              strlen "sse2_no_bsf sse2"
+fn strnlen "size_t" "const char* n,size_t ns" "n,ns" strlen "sse2_no_bsf sse2"
+
+# Substring searches share string/strstr.h and additionally get an ssse3 variant.
+fn strstr  "char *" "const char* s,const char *n"                      "s,n"       strstr "sse2_no_bsf sse2 ssse3"
+fn strcasestr  "char *" "const char* s,const char *n"                  "s,n"       strstr "sse2_no_bsf sse2 ssse3" ext
+fn memmem  "void *" "const void* s,size_t ss,const void *n, size_t ns" "s,ss,n,ns" strstr "sse2_no_bsf sse2 ssse3"
+
+#fn strchr   "char *" "const char* s,int c" "s,c"  strchr "sse2_no_bsf sse2"
+# fails because strchr expands to a builtin
+
+fn strrchr   "char *" "const char* s,int c" "s,c"  strchr "sse2_no_bsf sse2"
+fn strchrnul "char *" "const char* s,int c" "s,c"  strchr "sse2_no_bsf sse2" ext
+
+fn memchr    "void *" "const void* s,int c,size_t ss" "s,c,ss"  strchr  "sse2_no_bsf sse2" 
+fn rawmemchr "void *" "const void* s,int c" "s,c"               strchr  "sse2_no_bsf sse2" ext
+fn memrchr   "void *" "const void* s,int c,size_t ss" "s,c,ss"  strchr  "sse2_no_bsf sse2" ext
+# Extra aliases appended to the generated dispatchers: rindex for strrchr, and a __strnlen wrapper.
+echo "
+#ifndef NO_ALIAS
+weak_alias(strrchr,rindex);
+#endif" >> strrchr.c
+
+
+echo "size_t __strnlen(const char* n,size_t ns){
+  return strnlen(n,ns);
+}" >> strnlen.c
+
diff --git a/sysdeps/x86_64/multiarch/rawmemchr.S b/sysdeps/x86_64/multiarch/rawmemchr.S
deleted file mode 100644
index c4157ad..0000000
--- a/sysdeps/x86_64/multiarch/rawmemchr.S
+++ /dev/null
@@ -1,97 +0,0 @@
-/* Copyright (C) 2009, 2011 Free Software Foundation, Inc.
-   Contributed by Ulrich Drepper <drepper@redhat.com>.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-#include <init-arch.h>
-
-
-/* Define multiple versions only for the definition in lib.  */
-#ifndef NOT_IN_libc
-	.text
-ENTRY(rawmemchr)
-	.type	rawmemchr, @gnu_indirect_function
-	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
-	jne	1f
-	call	__init_cpu_features
-1:	testl	$bit_Prefer_PMINUB_for_stringop, __cpu_features+FEATURE_OFFSET+index_Prefer_PMINUB_for_stringop(%rip)
-	jnz	2f
-	testl	$bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
-	jz	2f
-	leaq	__rawmemchr_sse42(%rip), %rax
-	ret
-2:	leaq	__rawmemchr_sse2(%rip), %rax
-	ret
-
-END(rawmemchr)
-strong_alias (rawmemchr, __rawmemchr)
-
-
-	.section .text.sse4.2,"ax",@progbits
-	.align	16
-	.type	__rawmemchr_sse42, @function
-__rawmemchr_sse42:
-	cfi_startproc
-	CALL_MCOUNT
-	movd	%esi, %xmm1
-	movq	%rdi, %rcx
-	pxor	%xmm2, %xmm2
-	andq	$~15, %rdi
-	orl	$0xffffffff, %esi
-	pshufb	%xmm2, %xmm1
-	movdqa	(%rdi), %xmm0
-	subq	%rdi, %rcx
-	pcmpeqb	%xmm1, %xmm0
-	shl	%cl, %esi
-	pmovmskb %xmm0, %ecx
-	movl	$16, %eax
-	movl	$16, %edx
-	andl	%esi, %ecx
-	jnz	1f
-
-2:	pcmpestri $0x08, 16(%rdi), %xmm1
-	leaq	16(%rdi), %rdi
-	jnc	2b
-
-	leaq	(%rdi,%rcx), %rax
-	ret
-
-1:	bsfl	%ecx, %eax
-	addq	%rdi, %rax
-	ret
-	cfi_endproc
-	.size	__rawmemchr_sse42, .-__rawmemchr_sse42
-
-
-# undef ENTRY
-# define ENTRY(name) \
-	.type __rawmemchr_sse2, @function; \
-	.align 16; \
-	__rawmemchr_sse2: cfi_startproc; \
-	CALL_MCOUNT
-# undef END
-# define END(name) \
-	cfi_endproc; .size __rawmemchr_sse2, .-__rawmemchr_sse2
-# undef libc_hidden_builtin_def
-/* It doesn't make sense to send libc-internal rawmemchr calls through a PLT.
-   The speedup we get from using SSE4.2 instruction is likely eaten away
-   by the indirect call in the PLT.  */
-# define libc_hidden_builtin_def(name) \
-	.globl __GI___rawmemchr; __GI___rawmemchr = __rawmemchr_sse2
-#endif
-
-#include "../rawmemchr.S"
diff --git a/sysdeps/x86_64/multiarch/strcasestr-c.c b/sysdeps/x86_64/multiarch/strcasestr-c.c
deleted file mode 100644
index 551492d..0000000
--- a/sysdeps/x86_64/multiarch/strcasestr-c.c
+++ /dev/null
@@ -1,16 +0,0 @@
-#include "init-arch.h"
-
-#define STRCASESTR __strcasestr_sse2
-
-#include "string/strcasestr.c"
-
-extern char *__strcasestr_sse42 (const char *, const char *) attribute_hidden;
-extern __typeof (__strcasestr_sse2) __strcasestr_sse2 attribute_hidden;
-
-#if 1
-libc_ifunc (__strcasestr,
-	    HAS_SSE4_2 ? __strcasestr_sse42 : __strcasestr_sse2);
-#else
-libc_ifunc (__strcasestr,
-	    0 ? __strcasestr_sse42 : __strcasestr_sse2);
-#endif
diff --git a/sysdeps/x86_64/multiarch/strcasestr-nonascii.c b/sysdeps/x86_64/multiarch/strcasestr-nonascii.c
deleted file mode 100644
index a1f9968..0000000
--- a/sysdeps/x86_64/multiarch/strcasestr-nonascii.c
+++ /dev/null
@@ -1,49 +0,0 @@
-/* strstr with SSE4.2 intrinsics
-   Copyright (C) 2010 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-# include <ctype.h>
-
-
-/* Similar to __m128i_strloadu.  Convert to lower case for none-POSIX/C
-   locale.  */
-static inline __m128i
-__m128i_strloadu_tolower (const unsigned char *p)
-{
-  union
-    {
-      char b[16];
-      __m128i x;
-    } u;
-
-  for (int i = 0; i < 16; ++i)
-    if (p[i] == 0)
-      {
-	u.b[i] = 0;
-	break;
-      }
-    else
-      u.b[i] = tolower (p[i]);
-
-  return u.x;
-}
-
-
-#define STRCASESTR_NONASCII
-#define USE_AS_STRCASESTR
-#define STRSTR_SSE42 __strcasestr_sse42_nonascii
-#include "strstr.c"
diff --git a/sysdeps/x86_64/multiarch/strcasestr.c b/sysdeps/x86_64/multiarch/strcasestr.c
deleted file mode 100644
index d1cfb3b..0000000
--- a/sysdeps/x86_64/multiarch/strcasestr.c
+++ /dev/null
@@ -1,7 +0,0 @@
-extern char *__strcasestr_sse42_nonascii (const unsigned char *s1,
-					  const unsigned char *s2)
-  attribute_hidden;
-
-#define USE_AS_STRCASESTR
-#define STRSTR_SSE42 __strcasestr_sse42
-#include "strstr.c"
diff --git a/sysdeps/x86_64/multiarch/strnlen-sse2-no-bsf.S b/sysdeps/x86_64/multiarch/strnlen-sse2-no-bsf.S
deleted file mode 100644
index 248328d..0000000
--- a/sysdeps/x86_64/multiarch/strnlen-sse2-no-bsf.S
+++ /dev/null
@@ -1,3 +0,0 @@
-#define USE_AS_STRNLEN
-#define STRLEN __strnlen_sse2_no_bsf
-#include "strlen-sse2-no-bsf.S"
diff --git a/sysdeps/x86_64/multiarch/strnlen.S b/sysdeps/x86_64/multiarch/strnlen.S
deleted file mode 100644
index 044b910..0000000
--- a/sysdeps/x86_64/multiarch/strnlen.S
+++ /dev/null
@@ -1,54 +0,0 @@
-/* multiple version of strnlen
-   Copyright (C) 2011 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-#include <init-arch.h>
-
-
-/* Define multiple versions only for the definition in libc.  */
-#ifndef NOT_IN_libc
-
-	.text
-ENTRY(__strnlen)
-	.type	__strnlen, @gnu_indirect_function
-	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
-	jne	1f
-	call	__init_cpu_features
-1:	leaq	__strnlen_sse2(%rip), %rax
-	testl	$bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
-	jz	2f
-	leaq	__strnlen_sse2_no_bsf(%rip), %rax
-2:	ret
-END(__strnlen)
-
-# undef ENTRY
-# define ENTRY(name) \
-	.type __strnlen_sse2, @function; \
-	.align 16; \
-	__strnlen_sse2: cfi_startproc; \
-	CALL_MCOUNT
-# undef END
-# define END(name) \
-	cfi_endproc; .size __strnlen_sse2, .-__strnlen_sse2
-
-# undef libc_hidden_def
-# define libc_hidden_def(name) \
-	.globl __GI_strnlen; __GI_strnlen = __strnlen_sse2
-#endif
-
-#include "../strnlen.S"
diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2-no-bsf.S b/sysdeps/x86_64/multiarch/strrchr-sse2-no-bsf.S
deleted file mode 100644
index c698c94..0000000
--- a/sysdeps/x86_64/multiarch/strrchr-sse2-no-bsf.S
+++ /dev/null
@@ -1,555 +0,0 @@
-/* strrchr with SSE2 without bsf and bsr
-   Copyright (C) 2011 Free Software Foundation, Inc.
-   Contributed by Intel Corporation.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#if defined SHARED && !defined NOT_IN_libc
-
-# include <sysdep.h>
-# include "asm-syntax.h"
-
-	atom_text_section
-ENTRY (__strrchr_sse2_no_bsf)
-
-	movd	%rsi, %xmm1
-	pxor	%xmm2, %xmm2
-	mov	%rdi, %rcx
-	punpcklbw %xmm1, %xmm1
-	punpcklbw %xmm1, %xmm1
-	/* ECX has OFFSET. */
-	and	$63, %rcx
-	cmp	$48, %rcx
-	pshufd	$0, %xmm1, %xmm1
-	ja	L(crosscache)
-
-/* unaligned string. */
-	movdqu	(%rdi), %xmm0
-	pcmpeqb	%xmm0, %xmm2
-	pcmpeqb	%xmm1, %xmm0
-	/* Find where NULL is.  */
-	pmovmskb %xmm2, %rcx
-	/* Check if there is a match.  */
-	pmovmskb %xmm0, %rax
-	add	$16, %rdi
-
-	test	%rax, %rax
-	jnz	L(unaligned_match1)
-
-	test	%rcx, %rcx
-	jnz	L(return_null)
-
-	and	$-16, %rdi
-	xor	%r8, %r8
-	jmp	L(loop)
-
-	.p2align 4
-L(unaligned_match1):
-	test	%rcx, %rcx
-	jnz	L(prolog_find_zero_1)
-
-	mov	%rax, %r8
-	mov	%rdi, %rsi
-	and	$-16, %rdi
-	jmp	L(loop)
-
-	.p2align 4
-L(crosscache):
-/* Hancle unaligned string.  */
-	and	$15, %rcx
-	and	$-16, %rdi
-	pxor	%xmm3, %xmm3
-	movdqa	(%rdi), %xmm0
-	pcmpeqb	%xmm0, %xmm3
-	pcmpeqb	%xmm1, %xmm0
-	/* Find where NULL is.  */
-	pmovmskb %xmm3, %rdx
-	/* Check if there is a match.  */
-	pmovmskb %xmm0, %rax
-	/* Remove the leading bytes.  */
-	shr	%cl, %rdx
-	shr	%cl, %rax
-	add	$16, %rdi
-
-	test	%rax, %rax
-	jnz	L(unaligned_match)
-
-	test	%rdx, %rdx
-	jnz	L(return_null)
-
-	xor	%r8, %r8
-	jmp	L(loop)
-
-	.p2align 4
-L(unaligned_match):
-	test	%rdx, %rdx
-	jnz	L(prolog_find_zero)
-
-	mov	%rax, %r8
-	lea	(%rdi, %rcx), %rsi
-
-/* Loop start on aligned string.  */
-	.p2align 4
-L(loop):
-	movdqa	(%rdi), %xmm0
-	pcmpeqb	%xmm0, %xmm2
-	add	$16, %rdi
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb %xmm2, %rcx
-	pmovmskb %xmm0, %rax
-	or	%rax, %rcx
-	jnz	L(matches)
-
-	movdqa	(%rdi), %xmm0
-	pcmpeqb	%xmm0, %xmm2
-	add	$16, %rdi
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb %xmm2, %rcx
-	pmovmskb %xmm0, %rax
-	or	%rax, %rcx
-	jnz	L(matches)
-
-	movdqa	(%rdi), %xmm0
-	pcmpeqb	%xmm0, %xmm2
-	add	$16, %rdi
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb %xmm2, %rcx
-	pmovmskb %xmm0, %rax
-	or	%rax, %rcx
-	jnz	L(matches)
-
-	movdqa	(%rdi), %xmm0
-	pcmpeqb	%xmm0, %xmm2
-	add	$16, %rdi
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb %xmm2, %rcx
-	pmovmskb %xmm0, %rax
-	or	%rax, %rcx
-	jz	L(loop)
-
-L(matches):
-	test	%rax, %rax
-	jnz	L(match)
-L(return_value):
-	test	%r8, %r8
-	jz	L(return_null)
-	mov	%r8, %rax
-	mov	%rsi, %rdi
-	jmp	L(match_exit)
-
-	.p2align 4
-L(match):
-	pmovmskb %xmm2, %rcx
-	test	%rcx, %rcx
-	jnz	L(find_zero)
-	mov	%rax, %r8
-	mov	%rdi, %rsi
-	jmp	L(loop)
-
-	.p2align 4
-L(find_zero):
-	test	%cl, %cl
-	jz	L(find_zero_high)
-	mov	%cl, %dl
-	and	$15, %dl
-	jz	L(find_zero_8)
-	test	$0x01, %cl
-	jnz	L(FindZeroExit1)
-	test	$0x02, %cl
-	jnz	L(FindZeroExit2)
-	test	$0x04, %cl
-	jnz	L(FindZeroExit3)
-	and	$1 << 4 - 1, %rax
-	jz	L(return_value)
-	jmp	L(match_exit)
-
-	.p2align 4
-L(find_zero_8):
-	test	$0x10, %cl
-	jnz	L(FindZeroExit5)
-	test	$0x20, %cl
-	jnz	L(FindZeroExit6)
-	test	$0x40, %cl
-	jnz	L(FindZeroExit7)
-	and	$1 << 8 - 1, %rax
-	jz	L(return_value)
-	jmp	L(match_exit)
-
-	.p2align 4
-L(find_zero_high):
-	mov	%ch, %dh
-	and	$15, %dh
-	jz	L(find_zero_high_8)
-	test	$0x01, %ch
-	jnz	L(FindZeroExit9)
-	test	$0x02, %ch
-	jnz	L(FindZeroExit10)
-	test	$0x04, %ch
-	jnz	L(FindZeroExit11)
-	and	$1 << 12 - 1, %rax
-	jz	L(return_value)
-	jmp	L(match_exit)
-
-	.p2align 4
-L(find_zero_high_8):
-	test	$0x10, %ch
-	jnz	L(FindZeroExit13)
-	test	$0x20, %ch
-	jnz	L(FindZeroExit14)
-	test	$0x40, %ch
-	jnz	L(FindZeroExit15)
-	and	$1 << 16 - 1, %rax
-	jz	L(return_value)
-	jmp	L(match_exit)
-
-	.p2align 4
-L(FindZeroExit1):
-	and	$1, %rax
-	jz	L(return_value)
-	jmp	L(match_exit)
-
-	.p2align 4
-L(FindZeroExit2):
-	and	$1 << 2 - 1, %rax
-	jz	L(return_value)
-	jmp	L(match_exit)
-
-	.p2align 4
-L(FindZeroExit3):
-	and	$1 << 3 - 1, %rax
-	jz	L(return_value)
-	jmp	L(match_exit)
-
-	.p2align 4
-L(FindZeroExit5):
-	and	$1 << 5 - 1, %rax
-	jz	L(return_value)
-	jmp	L(match_exit)
-
-	.p2align 4
-L(FindZeroExit6):
-	and	$1 << 6 - 1, %rax
-	jz	L(return_value)
-	jmp	L(match_exit)
-
-	.p2align 4
-L(FindZeroExit7):
-	and	$1 << 7 - 1, %rax
-	jz	L(return_value)
-	jmp	L(match_exit)
-
-	.p2align 4
-L(FindZeroExit9):
-	and	$1 << 9 - 1, %rax
-	jz	L(return_value)
-	jmp	L(match_exit)
-
-	.p2align 4
-L(FindZeroExit10):
-	and	$1 << 10 - 1, %rax
-	jz	L(return_value)
-	jmp	L(match_exit)
-
-	.p2align 4
-L(FindZeroExit11):
-	and	$1 << 11 - 1, %rax
-	jz	L(return_value)
-	jmp	L(match_exit)
-
-	.p2align 4
-L(FindZeroExit13):
-	and	$1 << 13 - 1, %rax
-	jz	L(return_value)
-	jmp	L(match_exit)
-
-	.p2align 4
-L(FindZeroExit14):
-	and	$1 << 14 - 1, %rax
-	jz	L(return_value)
-	jmp	L(match_exit)
-
-	.p2align 4
-L(FindZeroExit15):
-	and	$1 << 15 - 1, %rax
-	jz	L(return_value)
-
-	.p2align 4
-L(match_exit):
-	test	%ah, %ah
-	jnz	L(match_exit_high)
-	mov	%al, %dl
-	and	$15 << 4, %dl
-	jnz	L(match_exit_8)
-	test	$0x08, %al
-	jnz	L(Exit4)
-	test	$0x04, %al
-	jnz	L(Exit3)
-	test	$0x02, %al
-	jnz	L(Exit2)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(match_exit_8):
-	test	$0x80, %al
-	jnz	L(Exit8)
-	test	$0x40, %al
-	jnz	L(Exit7)
-	test	$0x20, %al
-	jnz	L(Exit6)
-	lea	-12(%rdi), %rax
-	ret
-
-	.p2align 4
-L(match_exit_high):
-	mov	%ah, %dh
-	and	$15 << 4, %dh
-	jnz	L(match_exit_high_8)
-	test	$0x08, %ah
-	jnz	L(Exit12)
-	test	$0x04, %ah
-	jnz	L(Exit11)
-	test	$0x02, %ah
-	jnz	L(Exit10)
-	lea	-8(%rdi), %rax
-	ret
-
-	.p2align 4
-L(match_exit_high_8):
-	test	$0x80, %ah
-	jnz	L(Exit16)
-	test	$0x40, %ah
-	jnz	L(Exit15)
-	test	$0x20, %ah
-	jnz	L(Exit14)
-	lea	-4(%rdi), %rax
-	ret
-
-	.p2align 4
-L(Exit2):
-	lea	-15(%rdi), %rax
-	ret
-
-	.p2align 4
-L(Exit3):
-	lea	-14(%rdi), %rax
-	ret
-
-	.p2align 4
-L(Exit4):
-	lea	-13(%rdi), %rax
-	ret
-
-	.p2align 4
-L(Exit6):
-	lea	-11(%rdi), %rax
-	ret
-
-	.p2align 4
-L(Exit7):
-	lea	-10(%rdi), %rax
-	ret
-
-	.p2align 4
-L(Exit8):
-	lea	-9(%rdi), %rax
-	ret
-
-	.p2align 4
-L(Exit10):
-	lea	-7(%rdi), %rax
-	ret
-
-	.p2align 4
-L(Exit11):
-	lea	-6(%rdi), %rax
-	ret
-
-	.p2align 4
-L(Exit12):
-	lea	-5(%rdi), %rax
-	ret
-
-	.p2align 4
-L(Exit14):
-	lea	-3(%rdi), %rax
-	ret
-
-	.p2align 4
-L(Exit15):
-	lea	-2(%rdi), %rax
-	ret
-
-	.p2align 4
-L(Exit16):
-	lea	-1(%rdi), %rax
-	ret
-
-/* Return NULL.  */
-	.p2align 4
-L(return_null):
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(prolog_find_zero):
-	add	%rcx, %rdi
-	mov     %rdx, %rcx
-L(prolog_find_zero_1):
-	test	%cl, %cl
-	jz	L(prolog_find_zero_high)
-	mov	%cl, %dl
-	and	$15, %dl
-	jz	L(prolog_find_zero_8)
-	test	$0x01, %cl
-	jnz	L(PrologFindZeroExit1)
-	test	$0x02, %cl
-	jnz	L(PrologFindZeroExit2)
-	test	$0x04, %cl
-	jnz	L(PrologFindZeroExit3)
-	and	$1 << 4 - 1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(prolog_find_zero_8):
-	test	$0x10, %cl
-	jnz	L(PrologFindZeroExit5)
-	test	$0x20, %cl
-	jnz	L(PrologFindZeroExit6)
-	test	$0x40, %cl
-	jnz	L(PrologFindZeroExit7)
-	and	$1 << 8 - 1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(prolog_find_zero_high):
-	mov	%ch, %dh
-	and	$15, %dh
-	jz	L(prolog_find_zero_high_8)
-	test	$0x01, %ch
-	jnz	L(PrologFindZeroExit9)
-	test	$0x02, %ch
-	jnz	L(PrologFindZeroExit10)
-	test	$0x04, %ch
-	jnz	L(PrologFindZeroExit11)
-	and	$1 << 12 - 1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(prolog_find_zero_high_8):
-	test	$0x10, %ch
-	jnz	L(PrologFindZeroExit13)
-	test	$0x20, %ch
-	jnz	L(PrologFindZeroExit14)
-	test	$0x40, %ch
-	jnz	L(PrologFindZeroExit15)
-	and	$1 << 16 - 1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(PrologFindZeroExit1):
-	and	$1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(PrologFindZeroExit2):
-	and	$1 << 2 - 1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(PrologFindZeroExit3):
-	and	$1 << 3 - 1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(PrologFindZeroExit5):
-	and	$1 << 5 - 1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(PrologFindZeroExit6):
-	and	$1 << 6 - 1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(PrologFindZeroExit7):
-	and	$1 << 7 - 1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(PrologFindZeroExit9):
-	and	$1 << 9 - 1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(PrologFindZeroExit10):
-	and	$1 << 10 - 1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(PrologFindZeroExit11):
-	and	$1 << 11 - 1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(PrologFindZeroExit13):
-	and	$1 << 13 - 1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(PrologFindZeroExit14):
-	and	$1 << 14 - 1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(PrologFindZeroExit15):
-	and	$1 << 15 - 1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-END (__strrchr_sse2_no_bsf)
-#endif
diff --git a/sysdeps/x86_64/multiarch/strrchr.S b/sysdeps/x86_64/multiarch/strrchr.S
deleted file mode 100644
index c87d8fa..0000000
--- a/sysdeps/x86_64/multiarch/strrchr.S
+++ /dev/null
@@ -1,281 +0,0 @@
-/* strrchr with SSE4.2
-   Copyright (C) 2009 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-#include <init-arch.h>
-
-
-/* Define multiple versions only for the definition in libc and for
-   the DSO.  In static binaries we need strrchr before the initialization
-   happened.  */
-#if defined SHARED && !defined NOT_IN_libc
-	.text
-ENTRY(strrchr)
-	.type	strrchr, @gnu_indirect_function
-	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
-	jne	1f
-	call	__init_cpu_features
-1:	leaq	__strrchr_sse2(%rip), %rax
-	testl	$bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
-	jz	2f
-	leaq	__strrchr_sse42(%rip), %rax
-	ret
-2:	testl	$bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
-	jz	3f
-	leaq    __strrchr_sse2_no_bsf(%rip), %rax
-3:	ret
-END(strrchr)
-
-/*
-   This implementation uses SSE4 instructions to compare up to 16 bytes
-   at a time looking for the last occurrence of the character c in the
-   string s:
-
-   char *strrchr (const char *s, int c);
-
-   We use 0x4a:
-	_SIDD_SBYTE_OPS
-	| _SIDD_CMP_EQUAL_EACH
-	| _SIDD_MOST_SIGNIFICANT
-   on pcmpistri to compare xmm/mem128
-
-   0 1 2 3 4 5 6 7 8 9 A B C D E F
-   X X X X X X X X X X X X X X X X
-
-   against xmm
-
-   0 1 2 3 4 5 6 7 8 9 A B C D E F
-   C C C C C C C C C C C C C C C C
-
-   to find out if the first 16byte data element has a byte C and the
-   last offset.  There are 4 cases:
-
-   1. The first 16byte data element has EOS and has the byte C at the
-      last offset X.
-   2. The first 16byte data element is valid and has the byte C at the
-      last offset X.
-   3. The first 16byte data element has EOS and doesn't have the byte C.
-   4. The first 16byte data element is valid and doesn't have the byte C.
-
-   Here is the table of ECX, CFlag, ZFlag and SFlag for 3 cases:
-
-   case		ECX	CFlag	ZFlag	SFlag
-    1		 X	  1	  1	  0
-    2		 X	  1	  0	  0
-    3		16	  0	  1	  0
-    4		16	  0	  0	  0
-
-   We exit from the loop for cases 1 and 3 with jz which branches
-   when ZFlag is 1.  If CFlag == 1, ECX has the offset X for case 1.  */
-
-
-	.section .text.sse4.2,"ax",@progbits
-	.align	16
-	.type	__strrchr_sse42, @function
-__strrchr_sse42:
-	cfi_startproc
-	CALL_MCOUNT
-	testb	%sil, %sil
-	je	__strend_sse4
-	xor	%eax,%eax	/* RAX has the last occurrence of s.  */
-	movd	%esi, %xmm1
-	punpcklbw	%xmm1, %xmm1
-	movl	%edi, %esi
-	punpcklbw	%xmm1, %xmm1
-	andl	$15, %esi
-	pshufd	$0, %xmm1, %xmm1
-	movq	%rdi, %r8
-	je	L(loop)
-
-/* Handle unaligned string using psrldq.  */
-	leaq	L(psrldq_table)(%rip), %rdx
-	andq	$-16, %r8
-	movslq	(%rdx,%rsi,4),%r9
-	movdqa	(%r8), %xmm0
-	addq	%rdx, %r9
-	jmp	*%r9
-
-/* Handle unaligned string with offset 1 using psrldq.  */
-	.p2align 4
-L(psrldq_1):
-	psrldq	$1, %xmm0
-
-	.p2align 4
-L(unaligned_pcmpistri):
-	pcmpistri	$0x4a, %xmm1, %xmm0
-	jnc	L(unaligned_no_byte)
-	leaq	(%rdi,%rcx), %rax
-L(unaligned_no_byte):
-	/* Find the length of the unaligned string.  */
-	pcmpistri	$0x3a, %xmm0, %xmm0
-	movl	$16, %edx
-	subl	%esi, %edx
-	cmpl	%ecx, %edx
-	/* Return RAX if the unaligned fragment to next 16B already
-	   contain the NULL terminator.  */
-	jg	L(exit)
-	addq	$16, %r8
-
-/* Loop start on aligned string.  */
-	.p2align 4
-L(loop):
-	pcmpistri	$0x4a, (%r8), %xmm1
-	jbe	L(match_or_eos)
-	addq	$16, %r8
-	jmp	L(loop)
-	.p2align 4
-L(match_or_eos):
-	je	L(had_eos)
-L(match_no_eos):
-	leaq	(%r8,%rcx), %rax
-	addq	$16, %r8
-	jmp     L(loop)
-	.p2align 4
-L(had_eos):
-	jnc     L(exit)
-	leaq	(%r8,%rcx), %rax
-	.p2align 4
-L(exit):
-	ret
-
-/* Handle unaligned string with offset 15 using psrldq.  */
-	.p2align 4
-L(psrldq_15):
-	psrldq	$15, %xmm0
-	jmp	L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 14 using psrldq.  */
-	.p2align 4
-L(psrldq_14):
-	psrldq	$14, %xmm0
-	jmp	L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 13 using psrldq.  */
-	.p2align 4
-L(psrldq_13):
-	psrldq	$13, %xmm0
-	jmp	L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 12 using psrldq.  */
-	.p2align 4
-L(psrldq_12):
-	psrldq	$12, %xmm0
-	jmp	L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 11 using psrldq.  */
-	.p2align 4
-L(psrldq_11):
-	psrldq	$11, %xmm0
-	jmp	L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 10 using psrldq.  */
-	.p2align 4
-L(psrldq_10):
-	psrldq	$10, %xmm0
-	jmp	L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 9 using psrldq.  */
-	.p2align 4
-L(psrldq_9):
-	psrldq	$9, %xmm0
-	jmp	L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 8 using psrldq.  */
-	.p2align 4
-L(psrldq_8):
-	psrldq	$8, %xmm0
-	jmp	L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 7 using psrldq.  */
-	.p2align 4
-L(psrldq_7):
-	psrldq	$7, %xmm0
-	jmp	L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 6 using psrldq.  */
-	.p2align 4
-L(psrldq_6):
-	psrldq	$6, %xmm0
-	jmp	L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 5 using psrldq.  */
-	.p2align 4
-L(psrldq_5):
-	psrldq	$5, %xmm0
-	jmp	L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 4 using psrldq.  */
-	.p2align 4
-L(psrldq_4):
-	psrldq	$4, %xmm0
-	jmp	L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 3 using psrldq.  */
-	.p2align 4
-L(psrldq_3):
-	psrldq	$3, %xmm0
-	jmp	L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 2 using psrldq.  */
-	.p2align 4
-L(psrldq_2):
-	psrldq	$2, %xmm0
-	jmp	L(unaligned_pcmpistri)
-
-	cfi_endproc
-	.size	__strrchr_sse42, .-__strrchr_sse42
-
-	.section .rodata.sse4.2,"a",@progbits
-	.p2align 4
-L(psrldq_table):
-	.int	L(loop) - L(psrldq_table)
-	.int	L(psrldq_1) - L(psrldq_table)
-	.int	L(psrldq_2) - L(psrldq_table)
-	.int	L(psrldq_3) - L(psrldq_table)
-	.int	L(psrldq_4) - L(psrldq_table)
-	.int	L(psrldq_5) - L(psrldq_table)
-	.int	L(psrldq_6) - L(psrldq_table)
-	.int	L(psrldq_7) - L(psrldq_table)
-	.int	L(psrldq_8) - L(psrldq_table)
-	.int	L(psrldq_9) - L(psrldq_table)
-	.int	L(psrldq_10) - L(psrldq_table)
-	.int	L(psrldq_11) - L(psrldq_table)
-	.int	L(psrldq_12) - L(psrldq_table)
-	.int	L(psrldq_13) - L(psrldq_table)
-	.int	L(psrldq_14) - L(psrldq_table)
-	.int	L(psrldq_15) - L(psrldq_table)
-
-
-# undef ENTRY
-# define ENTRY(name) \
-	.type __strrchr_sse2, @function; \
-	.align 16; \
-	__strrchr_sse2: cfi_startproc; \
-	CALL_MCOUNT
-# undef END
-# define END(name) \
-	cfi_endproc; .size __strrchr_sse2, .-__strrchr_sse2
-# undef libc_hidden_builtin_def
-/* It doesn't make sense to send libc-internal strrchr calls through a PLT.
-   The speedup we get from using SSE4.2 instruction is likely eaten away
-   by the indirect call in the PLT.  */
-# define libc_hidden_builtin_def(name) \
-	.globl __GI_strrchr; __GI_strrchr = __strrchr_sse2
-#endif
-
-#include "../strrchr.S"
diff --git a/sysdeps/x86_64/multiarch/strstr-c.c b/sysdeps/x86_64/multiarch/strstr-c.c
deleted file mode 100644
index b8ed316..0000000
--- a/sysdeps/x86_64/multiarch/strstr-c.c
+++ /dev/null
@@ -1,15 +0,0 @@
-#include "init-arch.h"
-
-#define STRSTR __strstr_sse2
-#ifdef SHARED
-# undef libc_hidden_builtin_def
-# define libc_hidden_builtin_def(name) \
-  __hidden_ver1 (__strstr_sse2, __GI_strstr, __strstr_sse2);
-#endif
-
-#include "string/strstr.c"
-
-extern char *__strstr_sse42 (const char *, const char *) attribute_hidden;
-extern __typeof (__strstr_sse2) __strstr_sse2 attribute_hidden;
-
-libc_ifunc (strstr, HAS_SSE4_2 ? __strstr_sse42 : __strstr_sse2);
diff --git a/sysdeps/x86_64/multiarch/strstr.c b/sysdeps/x86_64/multiarch/strstr.c
deleted file mode 100644
index b1b4139..0000000
--- a/sysdeps/x86_64/multiarch/strstr.c
+++ /dev/null
@@ -1,384 +0,0 @@
-/* strstr with SSE4.2 intrinsics
-   Copyright (C) 2009, 2010, 2011 Free Software Foundation, Inc.
-   Contributed by Intel Corporation.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <nmmintrin.h>
-#include "varshift.h"
-
-#ifndef STRSTR_SSE42
-# define STRSTR_SSE42 __strstr_sse42
-#endif
-
-#ifdef USE_AS_STRCASESTR
-# include <ctype.h>
-# include <locale/localeinfo.h>
-
-# define LOADBYTE(C)		tolower (C)
-# define CMPBYTE(C1, C2)	(tolower (C1) == tolower (C2))
-#else
-# define LOADBYTE(C)		(C)
-# define CMPBYTE(C1, C2)	((C1) == (C2))
-#endif
-
-/* We use 0xe ordered-compare:
-	_SIDD_SBYTE_OPS
-	| _SIDD_CMP_EQUAL_ORDER
-	| _SIDD_LEAST_SIGNIFICANT
-   on pcmpistri to do the scanning and string comparsion requirements of
-   sub-string match.  In the scanning phase, we process Cflag and ECX
-   index to locate the first fragment match; once the first fragment
-   match position has been identified, we do comparison of subsequent
-   string fragments until we can conclude false or true match; whe
-   n concluding a false match, we may need to repeat scanning process
-   from next relevant offset in the target string.
-
-   In the scanning phase we have 4 cases:
-   case		ECX	CFlag	ZFlag	SFlag
-    1		16	  0	  0	  0
-    2a		16	  0	  0	  1
-    2b		16	  0	  1	  0
-    2c		16	  0	  1	  1
-
-   1. No ordered-comparison match, both 16B fragments are valid, so
-      continue to next fragment.
-   2. No ordered-comparison match, there is EOS in either fragment,
-   2a. Zflg = 0, Sflg = 1, we continue
-   2b. Zflg = 1, Sflg = 0, we conclude no match and return.
-   2c. Zflg = 1, sflg = 1, lenth determine match or no match
-
-   In the string comparison phase, the 1st fragment match is fixed up
-   to produce ECX = 0.  Subsequent fragment compare of nonzero index
-   and no match conclude a false match.
-
-   case		ECX	CFlag	ZFlag	SFlag
-    3		 X	  1	  0	  0/1
-    4a		 0	  1	  0	  0
-    4b		 0	  1	  0	  1
-    4c		0 < X	  1	  0	  0/1
-    5		16	  0	  1	  0
-
-   3. An initial ordered-comparison fragment match, we fix up to do
-      subsequent string comparison
-   4a. Continuation of fragment comparison of a string compare.
-   4b. EOS reached in the reference string, we conclude true match and
-       return
-   4c. String compare failed if index is nonzero, we need to go back to
-       scanning
-   5.  failed string compare, go back to scanning
- */
-
-/* Simple replacement of movdqu to address 4KB boundary cross issue.
-   If EOS occurs within less than 16B before 4KB boundary, we don't
-   cross to next page.  */
-
-static inline __m128i
-__m128i_strloadu (const unsigned char * p, __m128i zero)
-{
-  if (__builtin_expect ((int) ((size_t) p & 0xfff) > 0xff0, 0))
-    {
-      size_t offset = ((size_t) p & (16 - 1));
-      __m128i a = _mm_load_si128 ((__m128i *) (p - offset));
-      int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (a, zero));
-      if ((bmsk >> offset) != 0)
-	return __m128i_shift_right (a, offset);
-    }
-  return _mm_loadu_si128 ((__m128i *) p);
-}
-
-#if defined USE_AS_STRCASESTR && !defined STRCASESTR_NONASCII
-
-/* Similar to __m128i_strloadu.  Convert to lower case for POSIX/C
-   locale and other which have single-byte letters only in the ASCII
-   range.  */
-static inline __m128i
-__m128i_strloadu_tolower (const unsigned char *p, __m128i zero, __m128i uclow,
-			  __m128i uchigh, __m128i lcqword)
-{
-  __m128i frag = __m128i_strloadu (p, zero);
-
-  /* Compare if 'Z' > bytes. Inverted way to get a mask for byte <= 'Z'.  */
-  __m128i r2 = _mm_cmpgt_epi8 (uchigh, frag);
-  /* Compare if bytes are > 'A' - 1.  */
-  __m128i r1 = _mm_cmpgt_epi8 (frag, uclow);
-  /* Mask byte == ff if byte(r2) <= 'Z' and byte(r1) > 'A' - 1.  */
-  __m128i mask = _mm_and_si128 (r2, r1);
-  /* Apply lowercase bit 6 mask for above mask bytes == ff.  */
-  return _mm_or_si128 (frag, _mm_and_si128 (mask, lcqword));
-}
-
-#endif
-
-/* Calculate Knuth-Morris-Pratt string searching algorithm (or KMP
-   algorithm) overlap for a fully populated 16B vector.
-   Input parameter: 1st 16Byte loaded from the reference string of a
-		    strstr function.
-   We don't use KMP algorithm if reference string is less than 16B.  */
-static int
-__inline__ __attribute__ ((__always_inline__,))
-KMP16Bovrlap (__m128i s2)
-{
-  __m128i b = _mm_unpacklo_epi8 (s2, s2);
-  __m128i a = _mm_unpacklo_epi8 (b, b);
-  a = _mm_shuffle_epi32 (a, 0);
-  b = _mm_srli_si128 (s2, sizeof (char));
-  int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (b, a));
-
-  /* _BitScanForward(&k1, bmsk); */
-  int k1;
-  __asm ("bsfl %[bmsk], %[k1]" : [k1] "=r" (k1) : [bmsk] "r" (bmsk));
-  if (!bmsk)
-    return 16;
-  else if (bmsk == 0x7fff)
-    return 1;
-  else if (!k1)
-    {
-      /* There are al least two distinct chars in s2.  If byte 0 and 1 are
-	 idential and the distinct value lies farther down, we can deduce
-	 the next byte offset to restart full compare is least no earlier
-	 than byte 3.  */
-      return 3;
-    }
-  else
-    {
-      /* Byte 1 is not degenerated to byte 0.  */
-      return k1 + 1;
-    }
-}
-
-char *
-__attribute__ ((section (".text.sse4.2")))
-STRSTR_SSE42 (const unsigned char *s1, const unsigned char *s2)
-{
-#define p1 s1
-  const unsigned char *p2 = s2;
-
-#ifndef STRCASESTR_NONASCII
-  if (__builtin_expect (p2[0] == '\0', 0))
-    return (char *) p1;
-
-  if (__builtin_expect (p1[0] == '\0', 0))
-    return NULL;
-
-  /* Check if p1 length is 1 byte long.  */
-  if (__builtin_expect (p1[1] == '\0', 0))
-    return p2[1] == '\0' && CMPBYTE (p1[0], p2[0]) ? (char *) p1 : NULL;
-#endif
-
-#ifdef USE_AS_STRCASESTR
-# ifndef STRCASESTR_NONASCII
-  if (__builtin_expect (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_NONASCII_CASE)
-			!= 0, 0))
-    return __strcasestr_sse42_nonascii (s1, s2);
-
-  const __m128i uclow = _mm_set1_epi8 (0x40);
-  const __m128i uchigh = _mm_set1_epi8 (0x5b);
-  const __m128i lcqword = _mm_set1_epi8 (0x20);
-  const __m128i zero = _mm_setzero_si128 ();
-#  define strloadu(p) __m128i_strloadu_tolower (p, zero, uclow, uchigh, lcqword)
-# else
-#  define strloadu __m128i_strloadu_tolower
-#  define zero _mm_setzero_si128 ()
-# endif
-#else
-# define strloadu(p) __m128i_strloadu (p, zero)
-  const __m128i zero = _mm_setzero_si128 ();
-#endif
-
-  /* p1 > 1 byte long.  Load up to 16 bytes of fragment.  */
-  __m128i frag1 = strloadu (p1);
-
-  __m128i frag2;
-  if (p2[1] != '\0')
-    /* p2 is > 1 byte long.  */
-    frag2 = strloadu (p2);
-  else
-    frag2 = _mm_insert_epi8 (zero, LOADBYTE (p2[0]), 0);
-
-  /* Unsigned bytes, equal order, does frag2 has null?  */
-  int cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
-  int cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
-  int cmp = _mm_cmpistri (frag2, frag1, 0x0c);
-  int cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
-  if (cmp_s & cmp_c)
-    {
-      int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (frag2, zero));
-      int len;
-      __asm ("bsfl %[bmsk], %[len]"
-	     : [len] "=r" (len) : [bmsk] "r" (bmsk));
-      p1 += cmp;
-      if ((len + cmp) <= 16)
-	return (char *) p1;
-
-      /* Load up to 16 bytes of fragment.  */
-      frag1 = strloadu (p1);
-      cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
-      cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
-      cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
-      cmp = _mm_cmpistri (frag2, frag1, 0x0c);
-      if ((len + cmp) <= 16)
-	return (char *) p1 + cmp;
-    }
-
-  if (cmp_s)
-    {
-      /* Adjust addr for 16B alginment in ensuing loop.  */
-      while (!cmp_z)
-	{
-	  p1 += cmp;
-	  /* Load up to 16 bytes of fragment.  */
-	  frag1 = strloadu (p1);
-	  cmp = _mm_cmpistri (frag2, frag1, 0x0c);
-	  cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
-	  cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
-	  /* Because s2 < 16 bytes and we adjusted p1 by non-zero cmp
-	     once already, this time cmp will be zero and we can exit.  */
-	  if ((!cmp) & cmp_c)
-	    break;
-	}
-
-      if (!cmp_c)
-	return NULL;
-
-      /* Since s2 is less than 16 bytes, com_c is definitive
-	 determination of full match.  */
-      return (char *) p1 + cmp;
-    }
-
-  /* General case, s2 is at least 16 bytes or more.
-     First, the common case of false-match at first byte of p2.  */
-  const unsigned char *pt = NULL;
-  int kmp_fwd = 0;
-re_trace:
-  while (!cmp_c)
-    {
-      /* frag1 has null. */
-      if (cmp_z)
-	return NULL;
-
-      /* frag 1 has no null, advance 16 bytes.  */
-      p1 += 16;
-      /* Load up to 16 bytes of fragment.  */
-      frag1 = strloadu (p1);
-      /* Unsigned bytes, equal order, is there a partial match?  */
-      cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
-      cmp = _mm_cmpistri (frag2, frag1, 0x0c);
-      cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
-    }
-
-  /* Next, handle initial positive match as first byte of p2.  We have
-     a partial fragment match, make full determination until we reached
-     end of s2.  */
-  if (!cmp)
-    {
-      if (cmp_z)
-	return (char *) p1;
-
-      pt = p1;
-      p1 += 16;
-      p2 += 16;
-      /* Load up to 16 bytes of fragment.  */
-      frag2 = strloadu (p2);
-    }
-  else
-    {
-      /* Adjust 16B alignment.  */
-      p1 += cmp;
-      pt = p1;
-    }
-
-  /* Load up to 16 bytes of fragment.  */
-  frag1 = strloadu (p1);
-
-  /* Unsigned bytes, equal order, does frag2 has null?  */
-  cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
-  cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
-  cmp = _mm_cmpistri (frag2, frag1, 0x0c);
-  cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
-  while (!(cmp | cmp_z | cmp_s))
-    {
-      p1 += 16;
-      p2 += 16;
-      /* Load up to 16 bytes of fragment.  */
-      frag2 = strloadu (p2);
-      /* Load up to 16 bytes of fragment.  */
-      frag1 = strloadu (p1);
-      /* Unsigned bytes, equal order, does frag2 has null?  */
-      cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
-      cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
-      cmp = _mm_cmpistri (frag2, frag1, 0x0c);
-      cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
-    }
-
-  /* Full determination yielded a false result, retrace s1 to next
-     starting position.
-     Zflg	1      0      1			0/1
-     Sflg	0      1      1			0/1
-     cmp	na     0      0			>0
-     action   done   done   continue    continue if s2 < s1
-	      false  match  retrace s1     else false
-   */
-
-  if (cmp_s & !cmp)
-    return (char *) pt;
-  if (cmp_z)
-    {
-      if (!cmp_s)
-	return NULL;
-
-      /* Handle both zero and sign flag set and s1 is shorter in
-	 length.  */
-      int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (zero, frag2));
-      int bmsk1 = _mm_movemask_epi8 (_mm_cmpeq_epi8 (zero, frag1));
-      int len;
-      int len1;
-      __asm ("bsfl %[bmsk], %[len]"
-	     : [len] "=r" (len) : [bmsk] "r" (bmsk));
-      __asm ("bsfl %[bmsk1], %[len1]"
-	     : [len1] "=r" (len1) : [bmsk1] "r" (bmsk1));
-      if (len >= len1)
-	return NULL;
-    }
-  else if (!cmp)
-    return (char *) pt;
-
-  /* Otherwise, we have to retrace and continue.  Default of multiple
-     paths that need to retrace from next byte in s1.  */
-  p2 = s2;
-  frag2 = strloadu (p2);
-
-  if (!kmp_fwd)
-    kmp_fwd = KMP16Bovrlap (frag2);
-
-  /* KMP algorithm predicted overlap needs to be corrected for
-     partial fragment compare.  */
-  p1 = pt + (kmp_fwd > cmp ? cmp : kmp_fwd);
-
-  /* Since s2 is at least 16 bytes long, we're certain there is no
-     match.  */
-  if (p1[0] == '\0')
-    return NULL;
-
-  /* Load up to 16 bytes of fragment.  */
-  frag1 = strloadu (p1);
-
-  /* Unsigned bytes, equal order, is there a partial match?  */
-  cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
-  cmp = _mm_cmpistri (frag2, frag1, 0x0c);
-  cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
-  goto re_trace;
-}
diff --git a/sysdeps/x86_64/sse.h b/sysdeps/x86_64/sse.h
new file mode 100644
index 0000000..2b756ca
--- /dev/null
+++ b/sysdeps/x86_64/sse.h
@@ -0,0 +1,161 @@
+/* Copyright (C) 2012 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+
+#include <stdint.h>
+
+#include <emmintrin.h>
+#ifdef USE_SSSE3
+#define _HAS_SSSE3(x,y) x
+#include <tmmintrin.h>
+#else
+#define _HAS_SSSE3(x,y) y
+#endif
+#ifdef USE_SSE4_1
+#define _HAS_SSE4_1(x,y) x
+#undef  _HAS_SSSE3
+#define _HAS_SSSE3( x,y) x
+#include <smmintrin.h>
+#else
+#define _HAS_SSE4_1(x,y) y
+#endif
+
+typedef __m128i tp_vector;
+typedef unsigned long tp_mask;
+
+
+SI tp_vector BROADCAST(uchar c)
+{
+  return _mm_set1_epi8(c);
+}
+SI tp_vector LOAD(uchar* x)
+{
+  return  _mm_load_si128( (tp_vector*)(x));
+}
+SI tp_vector LOAD_UNALIGNED(uchar* x)
+{
+  return  _mm_loadu_si128((tp_vector*)(x));
+}
+
+#define PREFETCH(x)	_mm_prefetch(((char *)x),_MM_HINT_T0);
+
+SI tp_mask get_mask(tp_vector x)
+{
+  /* GCC needlessly sign-extends the pmovmskb result; cast via unsigned int.  */
+  return  (tp_mask)((unsigned int)
+                    _mm_movemask_epi8(x));
+}
+SI unsigned int NONZERO_MASK(tp_vector x)
+{
+  return _HAS_SSE4_1(!_mm_testz_si128(x,x),
+                     get_mask(x));
+}
+
+#ifdef USE_SSE2_NO_BSF
+static char first_bit_hash[]= {0,37,50,8,0,21,0,0,38,54,5,51,9,0,30,0,22,12,1,0,0,0,0,39,0,55,0,35,6,52,28,10,0,0,33,31,0,0,23,0,13,44,0,2,0,0,25,0,0,0,0,0,40,15,0,0,56,62,46,0,19,36,7,0,0,53,4,0,29,11,0,0,0,0,34,0,27,32,0,0,0,43,0,0,24,0,0,14,0,61,45,18,0,0,3,0,0,0,0,26,0,42,0,0,0,60,17,0,0,0,0,41,0,59,16,0,0,58,0,57,0,63,47,48,0,0,49,20};
+SI tp_mask first_bit(tp_mask x,int y)
+{
+  /* ones has the form 2**(tz+1)-1, where tz is the number of trailing zeros.  */
+  tp_mask ones=x^(x-1);
+  /* Perfect hash: the product's bits above 57 index first_bit_hash.  */
+  return first_bit_hash[(903385529620038207L*ones)>>57];
+}
+#else
+SI tp_mask first_bit(tp_mask x,int y)
+{
+  return __builtin_ctzl(x);
+}
+#endif
+SI tp_mask bit_i(int i)
+{
+  return ((tp_mask) 1)<<i;
+}
+
+MASK_OP(get_bit         , x&bit_i(y))
+MASK_OP(shift_down      , x>>y )
+MASK_OP(shift_up        , x<<y )
+MASK_OP(forget_first_bit, x&(x-1))
+MASK_OP(forget_before   , x&((y>=PARA) ? 0 : ((y<0) ? x :\
+                             shift_up(  (tp_mask)-1,y))))
+MASK_OP(forget_after    , x&((y>=PARA) ? x : ((y<0) ? 0 :\
+                             shift_down((tp_mask)-1,63-y))))
+
+
+BIN_OP(TEST_EQ,_mm_cmpeq_epi8( x,y))
+#define TEST_ZERO(x) TEST_EQ(x,vzero)
+BIN_OP(AND   ,_mm_and_si128(   x,y))
+BIN_OP(OR    ,_mm_or_si128(    x,y))
+BIN_OP(ANDNOT,_mm_andnot_si128(y,x))
+BIN_OP(XOR   ,_mm_xor_si128(   x,y))
+BIN_OP(ADD   ,_mm_add_epi8(    x,y))
+BIN_OP(SUB   ,_mm_sub_epi8(    x,y))
+#define HAS_PARALLEL_MIN
+BIN_OP(MINI   ,_mm_min_epu8(    x,y))
+
+#define SHIFT_DOWN _mm_srli_si128
+#define SHIFT_UP   _mm_slli_si128
+
+#define CONCAT(x,y,n) ((n==0) ? (y) : ((n==BYTES_AT_ONCE) ? (x) : \
+                       _HAS_SSSE3( _mm_alignr_epi8(x,y,n),\
+                       OR(SHIFT_UP(x,BYTES_AT_ONCE-(n)),SHIFT_DOWN(y,(n))))))
+
+
+
+
+SI tp_vector TEST_RANGE(tp_vector x,tp_vector y,tp_vector z)
+{
+  /*We use signed comparison.*/
+  tp_vector fv=ADD(BROADCAST(128),x);
+  tp_vector v=SUB(y,fv);
+  tp_vector tv=SUB(ADD(z,BROADCAST(1)),fv);
+  return _mm_cmplt_epi8(v,tv);
+}
+
+SI tp_vector TEST_RANGE_C(tp_vector v,uchar from,uchar to)
+{
+  /* If gcc did constant folding on sse we could just use
+     TEST_RANGE(BROADCAST('A'),v,BROADCAST('Z'));.*/
+  tp_vector fv=BROADCAST(-128-from);
+  v=_mm_add_epi8(v,fv);
+  tp_vector tv=BROADCAST(-128+to-from+1);
+  return _mm_cmplt_epi8(v,tv);
+}
+
+SI tp_vector parallel_tolower(tp_vector m)
+{
+  int i;
+  tp_vector high_bit=BROADCAST(128);
+  tp_vector l= AND(TEST_RANGE_C(m,'A','Z'),high_bit);
+  m=OR(m,_mm_srli_epi64(l,2));
+  if (get_mask(m))
+    for(i=0; i<BYTES_AT_ONCE; i++)
+      {
+        ((uchar*)&m)[i]=tolower_fixed[((uchar*)&m)[i]];
+      }
+  return m;
+}
+
+
+
+#if unroll==1
+#define AGREGATE_MASK    mask0
+#elif unroll==2
+#define AGREGATE_MASK   (mask0|(mask1<<16))
+#elif unroll==4
+/* Pairwise OR: one dependency less than a linear mask0|(mask1<<16)|... chain.  */
+#define AGREGATE_MASK   ((mask0|(mask1<<16))|((mask2|(mask3<<16))<<32))
+#endif
diff --git a/sysdeps/x86_64/strchrnul.S b/sysdeps/x86_64/strchrnul.S
deleted file mode 100644
index baf3076..0000000
--- a/sysdeps/x86_64/strchrnul.S
+++ /dev/null
@@ -1,62 +0,0 @@
-/* strchrnul (str, ch) -- Return pointer to first occurrence of CH in STR
-	or terminating NUL byte.
-   For AMD x86-64.
-   Copyright (C) 2009 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-
-
-	.text
-ENTRY (__strchrnul)
-	movd	%esi, %xmm1
-	movq	%rdi, %rcx
-	punpcklbw %xmm1, %xmm1
-	andq	$~15, %rdi
-	pxor	%xmm2, %xmm2
-	punpcklbw %xmm1, %xmm1
-	orl	$0xffffffff, %esi
-	movdqa	(%rdi), %xmm0
-	pshufd	$0, %xmm1, %xmm1
-	subq	%rdi, %rcx
-	movdqa	%xmm0, %xmm3
-	leaq	16(%rdi), %rdi
-	pcmpeqb	%xmm1, %xmm0
-	pcmpeqb	%xmm2, %xmm3
-	shl	%cl, %esi
-	pmovmskb %xmm0, %edx
-	pmovmskb %xmm3, %ecx
-	orl	%edx, %ecx
-	andl	%esi, %ecx
-	jnz	1f
-
-2:	movdqa	(%rdi), %xmm0
-	leaq	16(%rdi), %rdi
-	movdqa	%xmm0, %xmm3
-	pcmpeqb	%xmm1, %xmm0
-	pcmpeqb	%xmm2, %xmm3
-	pmovmskb %xmm0, %edx
-	pmovmskb %xmm3, %ecx
-	orl	%edx, %ecx
-	jz	2b
-
-1:	bsfl	%ecx, %edx
-	leaq	-16(%rdi,%rdx), %rax
-	ret
-END (__strchrnul)
-
-weak_alias (__strchrnul, strchrnul)
diff --git a/sysdeps/x86_64/strnlen.S b/sysdeps/x86_64/strnlen.S
deleted file mode 100644
index 7b38bf4..0000000
--- a/sysdeps/x86_64/strnlen.S
+++ /dev/null
@@ -1,63 +0,0 @@
-/* strnlen(str,maxlen) -- determine the length of the string STR up to MAXLEN.
-   Copyright (C) 2010 Free Software Foundation, Inc.
-   Contributed by Ulrich Drepper <drepper@redhat.com>.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-
-
-	.text
-ENTRY(__strnlen)
-	movq	%rsi, %rax
-	testq	%rsi, %rsi
-	jz	3f
-	pxor	%xmm2, %xmm2
-	movq	%rdi, %rcx
-	movq	%rdi, %r8
-	movq	$16, %r9
-	andq	$~15, %rdi
-	movdqa	%xmm2, %xmm1
-	pcmpeqb	(%rdi), %xmm2
-	orl	$0xffffffff, %r10d
-	subq	%rdi, %rcx
-	shll	%cl, %r10d
-	subq	%rcx, %r9
-	pmovmskb %xmm2, %edx
-	andl	%r10d, %edx
-	jnz	1f
-	subq	%r9, %rsi
-	jbe	3f
-
-2:	movdqa	16(%rdi), %xmm0
-	leaq	16(%rdi), %rdi
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb %xmm0, %edx
-	testl	%edx, %edx
-	jnz	1f
-	subq	$16, %rsi
-	jnbe	2b
-3:	ret
-
-1:	subq	%r8, %rdi
-	bsfl	%edx, %edx
-	addq	%rdi, %rdx
-	cmpq	%rdx, %rax
-	cmovnbq	%rdx, %rax
-	ret
-END(__strnlen)
-weak_alias (__strnlen, strnlen)
-libc_hidden_def (strnlen)
diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S
deleted file mode 100644
index a5397e7..0000000
--- a/sysdeps/x86_64/strrchr.S
+++ /dev/null
@@ -1,80 +0,0 @@
-/* strrchr (str, ch) -- Return pointer to last occurrence of CH in STR.
-   For AMD x86-64.
-   Copyright (C) 2009 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-
-
-	.text
-ENTRY (strrchr)
-	movd	%esi, %xmm1
-	movq	%rdi, %rcx
-	punpcklbw %xmm1, %xmm1
-	andq	$~15, %rdi
-	pxor	%xmm2, %xmm2
-	punpcklbw %xmm1, %xmm1
-	orl	$0xffffffff, %esi
-	movdqa	(%rdi), %xmm0
-	pshufd	$0, %xmm1, %xmm1
-	subq	%rdi, %rcx
-	movdqa	%xmm0, %xmm3
-	leaq	16(%rdi), %rdi
-	pcmpeqb	%xmm1, %xmm0
-	pcmpeqb	%xmm2, %xmm3
-	shl	%cl, %esi
-	pmovmskb %xmm0, %edx
-	pmovmskb %xmm3, %ecx
-	andl	%esi, %edx
-	andl	%esi, %ecx
-	xorl	%eax, %eax
-	movl	%edx, %esi
-	orl	%ecx, %esi
-	jnz	1f
-
-2:	movdqa	(%rdi), %xmm0
-	leaq	16(%rdi), %rdi
-	movdqa	%xmm0, %xmm3
-	pcmpeqb	%xmm1, %xmm0
-	pcmpeqb	%xmm2, %xmm3
-	pmovmskb %xmm0, %edx
-	pmovmskb %xmm3, %ecx
-	movl	%edx, %esi
-	orl	%ecx, %esi
-	jz	2b
-
-1:	bsfl	%ecx, %r9d
-	movl	$0xffffffff, %r8d
-	movl	$31, %ecx
-	jnz	5f
-
-	bsrl	%edx, %edx
-	jz	2b
-	leaq	-16(%rdi,%rdx), %rax
-	jmp	2b
-
-5:	subl	%r9d, %ecx
-	shrl	%cl, %r8d
-	andl	%r8d, %edx
-	bsrl	%edx, %edx
-	jz	4f
-	leaq	-16(%rdi,%rdx), %rax
-4:	ret
-END (strrchr)
-
-weak_alias (strrchr, rindex)
-libc_hidden_builtin_def (strrchr)
-- 
1.7.4.4


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]