This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
Re: [PATCH] vectorized string functions
On Wed, Jul 11, 2012 at 09:23:37AM +0200, Andreas Jaeger wrote:
> On Wednesday, July 11, 2012 17:10:07 Ondřej Bílka wrote:
> > Now I am almost done with vectorized implementation of string
> > functions. I use single loop to get faster implementation of *len,
> > *chr and *str functions.
>
> Ondrej, thanks for your contribution! Do you have a copyright assignment
> in place for glibc? This is definitely needed for such a large piece of
> code.
I already have.
>
> Also, your code does not conform to our coding style at all, please read
> the following wiki page for details
> http://sourceware.org/glibc/wiki/Contribution%20checklist
>
> I noticed especially:
> * overlong lines
> * no copyright headers
> * missing comments
> * comments that are not full sentences
> * wrong line formatting, missing spaces
>
> This needs performance testing of all functions on a variety of
> architectures. Have you done some of that already?
Tested only on x64. It should be easy to add a header for processors
supporting AltiVec, but I have no experience with these.
Benchmark results are at the usual place
http://kam.mff.cuni.cz/~ondra/benchmark_string/
Here is an updated version. I made several additional improvements.
One is that I now test for zero bytes with pminub.
Another is that I simplified the two-way algorithm.
---
string/arit.h | 179 ++++++++
string/loop.h | 195 +++++++++
string/memchr.c | 168 +-------
string/memmem.c | 64 +---
string/memrchr.c | 167 +-------
string/rawmemchr.c | 151 +-------
string/str-two-way.h | 428 ------------------
string/strcasestr.c | 85 +----
string/strchr.c | 173 +-------
string/strchr.h | 70 +++
string/strchrnul.c | 142 +------
string/strlen.c | 95 +----
string/strlen.h | 39 ++
string/strnlen.c | 162 +-------
string/strrchr.c | 35 +--
string/strstr.c | 74 +---
string/strstr.h | 297 +++++++++++++
string/strstr_vec.h | 52 +++
string/vector.h | 120 +++++
sysdeps/x86_64/memchr.S | 311 -------------
sysdeps/x86_64/multiarch/Makefile | 54 ++-
sysdeps/x86_64/multiarch/gen_stub | 111 +++++
sysdeps/x86_64/multiarch/rawmemchr.S | 97 ----
sysdeps/x86_64/multiarch/strcasestr-c.c | 16 -
sysdeps/x86_64/multiarch/strcasestr-nonascii.c | 49 --
sysdeps/x86_64/multiarch/strcasestr.c | 7 -
sysdeps/x86_64/multiarch/strnlen-sse2-no-bsf.S | 3 -
sysdeps/x86_64/multiarch/strnlen.S | 54 ---
sysdeps/x86_64/multiarch/strrchr-sse2-no-bsf.S | 555 ------------------------
sysdeps/x86_64/multiarch/strrchr.S | 281 ------------
sysdeps/x86_64/multiarch/strstr-c.c | 15 -
sysdeps/x86_64/multiarch/strstr.c | 384 ----------------
sysdeps/x86_64/sse.h | 161 +++++++
sysdeps/x86_64/strchrnul.S | 62 ---
sysdeps/x86_64/strnlen.S | 63 ---
sysdeps/x86_64/strrchr.S | 80 ----
37 files changed, 1349 insertions(+), 3661 deletions(-)
create mode 100644 string/arit.h
create mode 100644 string/loop.h
delete mode 100644 string/str-two-way.h
create mode 100644 string/strchr.h
create mode 100644 string/strlen.h
create mode 100644 string/strstr.h
create mode 100644 string/strstr_vec.h
create mode 100644 string/vector.h
delete mode 100644 sysdeps/x86_64/memchr.S
create mode 100755 sysdeps/x86_64/multiarch/gen_stub
delete mode 100644 sysdeps/x86_64/multiarch/rawmemchr.S
delete mode 100644 sysdeps/x86_64/multiarch/strcasestr-c.c
delete mode 100644 sysdeps/x86_64/multiarch/strcasestr-nonascii.c
delete mode 100644 sysdeps/x86_64/multiarch/strcasestr.c
delete mode 100644 sysdeps/x86_64/multiarch/strnlen-sse2-no-bsf.S
delete mode 100644 sysdeps/x86_64/multiarch/strnlen.S
delete mode 100644 sysdeps/x86_64/multiarch/strrchr-sse2-no-bsf.S
delete mode 100644 sysdeps/x86_64/multiarch/strrchr.S
delete mode 100644 sysdeps/x86_64/multiarch/strstr-c.c
delete mode 100644 sysdeps/x86_64/multiarch/strstr.c
create mode 100644 sysdeps/x86_64/sse.h
delete mode 100644 sysdeps/x86_64/strchrnul.S
delete mode 100644 sysdeps/x86_64/strnlen.S
delete mode 100644 sysdeps/x86_64/strrchr.S
diff --git a/string/arit.h b/string/arit.h
new file mode 100644
index 0000000..22a8ea5
--- /dev/null
+++ b/string/arit.h
@@ -0,0 +1,179 @@
+/* Copyright (C) 2012 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <stdint.h>
+#include "endian.h"
+#define unroll 4
+#if __WORDSIZE == 64
+typedef uint64_t tp_vector;
+typedef uint64_t tp_mask;
+#elif __WORDSIZE == 32
+typedef uint32_t tp_vector;
+typedef uint32_t tp_mask;
+#endif
+
+/* Per-byte constants: ONES holds 0x01 in every byte of a vector and
+   HIGH_BIT holds 0x80 in every byte.  They are static so that each
+   translation unit including this header gets a private copy instead of
+   an external definition that would clash at link time.  */
+static const tp_vector ONES=((~((tp_vector)0))/255);
+static const tp_vector HIGH_BIT=(((~((tp_vector)0))/255)*0x80);
+
+/* Return a vector in which every byte equals C.  */
+SI tp_vector BROADCAST(uchar c)
+{
+  tp_vector lanes=ONES;
+  lanes*=c;
+  return lanes;
+}
+/* Read one vector through X reinterpreted as a tp_vector pointer.  */
+SI tp_vector LOAD( uchar *x)
+{
+  tp_vector *src=(tp_vector*)x;
+  return *src;
+}
+/* Read one vector from X, which need not be aligned for tp_vector.
+   The bytes are copied instead of being read through a casted pointer,
+   because dereferencing a misaligned tp_vector pointer is undefined
+   behavior and faults on strict-alignment targets.  */
+SI tp_vector LOAD_UNALIGNED( uchar *x)
+{
+  tp_vector v;
+  __builtin_memcpy(&v,x,sizeof(v));
+  return v;
+}
+
+#define PREFETCH(x)
+
+
+/* Keep only the highest bit of every byte of X.  */
+SI tp_mask get_mask(tp_vector x)
+{
+  tp_mask high=HIGH_BIT;
+  return x&high;
+}
+/* Return 1 when some byte of X has its highest bit set, 0 otherwise.  */
+SI int NONZERO_MASK(tp_vector x)
+{
+  if (get_mask(x))
+    return 1;
+  return 0;
+}
+
+
+/* bit_i(I) is the single mask bit standing for byte I of an unrolled
+   group.  Byte I occupies lane I%BYTES_AT_ONCE, and the I/BYTES_AT_ONCE
+   term places it one bit below the lane's highest bit for every whole
+   vector that precedes it, which matches the shifts in AGREGATE_MASK.
+   The lane order is mirrored on big-endian targets.  */
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+SI tp_mask bit_i(int i)
+{
+  int shift=8*(i%BYTES_AT_ONCE)+7-(i/BYTES_AT_ONCE);
+  return ((tp_mask) 1)<<shift;
+}
+#elif __BYTE_ORDER == __BIG_ENDIAN
+SI tp_mask bit_i(int i)
+{
+  int shift=8*(BYTES_AT_ONCE-1-i%BYTES_AT_ONCE)+7-(i/BYTES_AT_ONCE);
+  return ((tp_mask) 1)<<shift;
+}
+#endif
+
+#ifdef CALCULATE_MASK
+/* Generator for the kill_before/kill_after tables below: prints both as
+   C array initializers on standard output.  Entry J of kill_before has
+   bit_i(I) set for every I>=J; entry J of kill_after has bit_i(I) set for
+   every I<=J.  Each table ends with an extra 0 entry.  Returns 0.  */
+SI int calculate_mask_before_after(void)
+{
+  int i,j;
+  printf("static tp_mask kill_before[]={");
+  for(j=0; j<8*BYTES_AT_ONCE; j++)
+    {
+      tp_mask mask=0;
+      for(i=j; i<8*BYTES_AT_ONCE; i++) mask|=bit_i(i);
+      /* tp_mask is uint32_t or uint64_t; widen it to match %llx.  */
+      printf("0x%llx,",(unsigned long long) mask);
+    }
+  printf("0};\n");
+  printf("static tp_mask kill_after[]={");
+  for(j=0; j<8*BYTES_AT_ONCE; j++)
+    {
+      tp_mask mask=0;
+      for(i=0; i<=j; i++) mask|=bit_i(i);
+      printf("0x%llx,",(unsigned long long) mask);
+    }
+  printf("0};\n");
+  /* The function is declared int; return a defined value.  */
+  return 0;
+}
+#endif
+#if __WORDSIZE == 32
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+static tp_mask kill_before[]= {0xffffffff,0xffffff7f,0xffff7f7f,0xff7f7f7f,0x7f7f7f7f,0x7f7f7f3f,0x7f7f3f3f,0x7f3f3f3f,0x3f3f3f3f,0x3f3f3f1f,0x3f3f1f1f,0x3f1f1f1f,0x1f1f1f1f,0x1f1f1f0f,0x1f1f0f0f,0x1f0f0f0f,0xf0f0f0f,0xf0f0f07,0xf0f0707,0xf070707,0x7070707,0x7070703,0x7070303,0x7030303,0x3030303,0x3030301,0x3030101,0x3010101,0x1010101,0x1010100,0x1010000,0x1000000,0};
+static tp_mask kill_after[]= {0x80,0x8080,0x808080,0x80808080,0x808080c0,0x8080c0c0,0x80c0c0c0,0xc0c0c0c0,0xc0c0c0e0,0xc0c0e0e0,0xc0e0e0e0,0xe0e0e0e0,0xe0e0e0f0,0xe0e0f0f0,0xe0f0f0f0,0xf0f0f0f0,0xf0f0f0f8,0xf0f0f8f8,0xf0f8f8f8,0xf8f8f8f8,0xf8f8f8fc,0xf8f8fcfc,0xf8fcfcfc,0xfcfcfcfc,0xfcfcfcfe,0xfcfcfefe,0xfcfefefe,0xfefefefe,0xfefefeff,0xfefeffff,0xfeffffff,0xffffffff,0};
+#else
+static tp_mask kill_before[]= {0xffffffff,0x7fffffff,0x7f7fffff,0x7f7f7fff,0x7f7f7f7f,0x3f7f7f7f,0x3f3f7f7f,0x3f3f3f7f,0x3f3f3f3f,0x1f3f3f3f,0x1f1f3f3f,0x1f1f1f3f,0x1f1f1f1f,0xf1f1f1f,0xf0f1f1f,0xf0f0f1f,0xf0f0f0f,0x70f0f0f,0x7070f0f,0x707070f,0x7070707,0x3070707,0x3030707,0x3030307,0x3030303,0x1030303,0x1010303,0x1010103,0x1010101,0x10101,0x101,0x1,0};
+static tp_mask kill_after[]= {0x80000000,0x80800000,0x80808000,0x80808080,0xc0808080,0xc0c08080,0xc0c0c080,0xc0c0c0c0,0xe0c0c0c0,0xe0e0c0c0,0xe0e0e0c0,0xe0e0e0e0,0xf0e0e0e0,0xf0f0e0e0,0xf0f0f0e0,0xf0f0f0f0,0xf8f0f0f0,0xf8f8f0f0,0xf8f8f8f0,0xf8f8f8f8,0xfcf8f8f8,0xfcfcf8f8,0xfcfcfcf8,0xfcfcfcfc,0xfefcfcfc,0xfefefcfc,0xfefefefc,0xfefefefe,0xfffefefe,0xfffffefe,0xfffffffe,0xffffffff,0};
+#endif
+#elif __WORDSIZE == 64
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+static tp_mask kill_before[]= {0xffffffffffffffff,0xffffffffffffff7f,0xffffffffffff7f7f,0xffffffffff7f7f7f,0xffffffff7f7f7f7f,0xffffff7f7f7f7f7f,0xffff7f7f7f7f7f7f,0xff7f7f7f7f7f7f7f,0x7f7f7f7f7f7f7f7f,0x7f7f7f7f7f7f7f3f,0x7f7f7f7f7f7f3f3f,0x7f7f7f7f7f3f3f3f,0x7f7f7f7f3f3f3f3f,0x7f7f7f3f3f3f3f3f,0x7f7f3f3f3f3f3f3f,0x7f3f3f3f3f3f3f3f,0x3f3f3f3f3f3f3f3f,0x3f3f3f3f3f3f3f1f,0x3f3f3f3f3f3f1f1f,0x3f3f3f3f3f1f1f1f,0x3f3f3f3f1f1f1f1f,0x3f3f3f1f1f1f1f1f,0x3f3f1f1f1f1f1f1f,0x3f1f1f1f1f1f1f1f,0x1f1f1f1f1f1f1f1f,0x1f1f1f1f1f1f1f0f,0x1f1f1f1f1f1f0f0f,0x1f1f1f1f1f0f0f0f,0x1f1f1f1f0f0f0f0f,0x1f1f1f0f0f0f0f0f,0x1f1f0f0f0f0f0f0f,0x1f0f0f0f0f0f0f0f,0xf0f0f0f0f0f0f0f,0xf0f0f0f0f0f0f07,0xf0f0f0f0f0f0707,0xf0f0f0f0f070707,0xf0f0f0f07070707,0xf0f0f0707070707,0xf0f070707070707,0xf07070707070707,0x707070707070707,0x707070707070703,0x707070707070303,0x707070707030303,0x707070703030303,0x707070303030303,0x707030303030303,0x703030303030303,0x303030303030303,0x303030303030301,0x303030303030101,0x303030303010101,0x303030301010101,0x303030101010101,0x303010101010101,0x301010101010101,0x101010101010101,0x101010101010100,0x101010101010000,0x101010101000000,0x101010100000000,0x101010000000000,0x101000000000000,0x100000000000000,0};
+static tp_mask kill_after[]= {0x80,0x8080,0x808080,0x80808080,0x8080808080,0x808080808080,0x80808080808080,0x8080808080808080,0x80808080808080c0,0x808080808080c0c0,0x8080808080c0c0c0,0x80808080c0c0c0c0,0x808080c0c0c0c0c0,0x8080c0c0c0c0c0c0,0x80c0c0c0c0c0c0c0,0xc0c0c0c0c0c0c0c0,0xc0c0c0c0c0c0c0e0,0xc0c0c0c0c0c0e0e0,0xc0c0c0c0c0e0e0e0,0xc0c0c0c0e0e0e0e0,0xc0c0c0e0e0e0e0e0,0xc0c0e0e0e0e0e0e0,0xc0e0e0e0e0e0e0e0,0xe0e0e0e0e0e0e0e0,0xe0e0e0e0e0e0e0f0,0xe0e0e0e0e0e0f0f0,0xe0e0e0e0e0f0f0f0,0xe0e0e0e0f0f0f0f0,0xe0e0e0f0f0f0f0f0,0xe0e0f0f0f0f0f0f0,0xe0f0f0f0f0f0f0f0,0xf0f0f0f0f0f0f0f0,0xf0f0f0f0f0f0f0f8,0xf0f0f0f0f0f0f8f8,0xf0f0f0f0f0f8f8f8,0xf0f0f0f0f8f8f8f8,0xf0f0f0f8f8f8f8f8,0xf0f0f8f8f8f8f8f8,0xf0f8f8f8f8f8f8f8,0xf8f8f8f8f8f8f8f8,0xf8f8f8f8f8f8f8fc,0xf8f8f8f8f8f8fcfc,0xf8f8f8f8f8fcfcfc,0xf8f8f8f8fcfcfcfc,0xf8f8f8fcfcfcfcfc,0xf8f8fcfcfcfcfcfc,0xf8fcfcfcfcfcfcfc,0xfcfcfcfcfcfcfcfc,0xfcfcfcfcfcfcfcfe,0xfcfcfcfcfcfcfefe,0xfcfcfcfcfcfefefe,0xfcfcfcfcfefefefe,0xfcfcfcfefefefefe,0xfcfcfefefefefefe,0xfcfefefefefefefe,0xfefefefefefefefe,0xfefefefefefefeff,0xfefefefefefeffff,0xfefefefefeffffff,0xfefefefeffffffff,0xfefefeffffffffff,0xfefeffffffffffff,0xfeffffffffffffff,0xffffffffffffffff,0};
+#elif __BYTE_ORDER == __BIG_ENDIAN
+static tp_mask kill_before[]= {0xffffffffffffffff,0x7fffffffffffffff,0x7f7fffffffffffff,0x7f7f7fffffffffff,0x7f7f7f7fffffffff,0x7f7f7f7f7fffffff,0x7f7f7f7f7f7fffff,0x7f7f7f7f7f7f7fff,0x7f7f7f7f7f7f7f7f,0x3f7f7f7f7f7f7f7f,0x3f3f7f7f7f7f7f7f,0x3f3f3f7f7f7f7f7f,0x3f3f3f3f7f7f7f7f,0x3f3f3f3f3f7f7f7f,0x3f3f3f3f3f3f7f7f,0x3f3f3f3f3f3f3f7f,0x3f3f3f3f3f3f3f3f,0x1f3f3f3f3f3f3f3f,0x1f1f3f3f3f3f3f3f,0x1f1f1f3f3f3f3f3f,0x1f1f1f1f3f3f3f3f,0x1f1f1f1f1f3f3f3f,0x1f1f1f1f1f1f3f3f,0x1f1f1f1f1f1f1f3f,0x1f1f1f1f1f1f1f1f,0xf1f1f1f1f1f1f1f,0xf0f1f1f1f1f1f1f,0xf0f0f1f1f1f1f1f,0xf0f0f0f1f1f1f1f,0xf0f0f0f0f1f1f1f,0xf0f0f0f0f0f1f1f,0xf0f0f0f0f0f0f1f,0xf0f0f0f0f0f0f0f,0x70f0f0f0f0f0f0f,0x7070f0f0f0f0f0f,0x707070f0f0f0f0f,0x70707070f0f0f0f,0x7070707070f0f0f,0x707070707070f0f,0x70707070707070f,0x707070707070707,0x307070707070707,0x303070707070707,0x303030707070707,0x303030307070707,0x303030303070707,0x303030303030707,0x303030303030307,0x303030303030303,0x103030303030303,0x101030303030303,0x101010303030303,0x101010103030303,0x101010101030303,0x101010101010303,0x101010101010103,0x101010101010101,0x1010101010101,0x10101010101,0x101010101,0x1010101,0x10101,0x101,0x1,0};
+static tp_mask kill_after[]= {0x8000000000000000,0x8080000000000000,0x8080800000000000,0x8080808000000000,0x8080808080000000,0x8080808080800000,0x8080808080808000,0x8080808080808080,0xc080808080808080,0xc0c0808080808080,0xc0c0c08080808080,0xc0c0c0c080808080,0xc0c0c0c0c0808080,0xc0c0c0c0c0c08080,0xc0c0c0c0c0c0c080,0xc0c0c0c0c0c0c0c0,0xe0c0c0c0c0c0c0c0,0xe0e0c0c0c0c0c0c0,0xe0e0e0c0c0c0c0c0,0xe0e0e0e0c0c0c0c0,0xe0e0e0e0e0c0c0c0,0xe0e0e0e0e0e0c0c0,0xe0e0e0e0e0e0e0c0,0xe0e0e0e0e0e0e0e0,0xf0e0e0e0e0e0e0e0,0xf0f0e0e0e0e0e0e0,0xf0f0f0e0e0e0e0e0,0xf0f0f0f0e0e0e0e0,0xf0f0f0f0f0e0e0e0,0xf0f0f0f0f0f0e0e0,0xf0f0f0f0f0f0f0e0,0xf0f0f0f0f0f0f0f0,0xf8f0f0f0f0f0f0f0,0xf8f8f0f0f0f0f0f0,0xf8f8f8f0f0f0f0f0,0xf8f8f8f8f0f0f0f0,0xf8f8f8f8f8f0f0f0,0xf8f8f8f8f8f8f0f0,0xf8f8f8f8f8f8f8f0,0xf8f8f8f8f8f8f8f8,0xfcf8f8f8f8f8f8f8,0xfcfcf8f8f8f8f8f8,0xfcfcfcf8f8f8f8f8,0xfcfcfcfcf8f8f8f8,0xfcfcfcfcfcf8f8f8,0xfcfcfcfcfcfcf8f8,0xfcfcfcfcfcfcfcf8,0xfcfcfcfcfcfcfcfc,0xfefcfcfcfcfcfcfc,0xfefefcfcfcfcfcfc,0xfefefefcfcfcfcfc,0xfefefefefcfcfcfc,0xfefefefefefcfcfc,0xfefefefefefefcfc,0xfefefefefefefefc,0xfefefefefefefefe,0xfffefefefefefefe,0xfffffefefefefefe,0xfffffffefefefefe,0xfffffffffefefefe,0xfffffffffffefefe,0xfffffffffffffefe,0xfffffffffffffffe,0xffffffffffffffff,0};
+#endif
+#endif
+/* Return the smallest index I>=Y for which bit_i(I) is set in T.  The
+   scan has no upper bound, so T must contain such a bit.  */
+SI tp_mask first_bit(tp_mask t,int y)
+{
+  for(;;)
+    {
+      if (t&bit_i(y)) break;
+      y++;
+    }
+  return y;
+}
+/* Mask helpers; MASK_OP is defined outside this file and presumably
+   turns each line into a function of a mask x and an index y -- confirm.
+   forget_first_bit toggles position y, forget_before keeps positions
+   from y on (nothing when y>=PARA), forget_after keeps positions up to
+   and including y (nothing when y<0).
+   NOTE(review): forget_before does not guard y<0 and forget_after does
+   not guard y beyond the table length -- confirm callers never pass
+   such values.  */
+MASK_OP(forget_first_bit, x^bit_i(y))
+MASK_OP(forget_before , x&((y>=PARA) ? 0 : kill_before[y]))
+MASK_OP(forget_after , x&((y<0) ? 0 : kill_after[ y]))
+
+
+/* Vector primitives on plain machine words.  BIN_OP and UN_OP are
+   defined outside this file; presumably each line defines a helper over
+   operands x (and y) -- confirm.
+   TEST_ZERO sets the highest bit of exactly those bytes of x that are
+   zero: (x|HIGH_BIT)-ONES never borrows across bytes and clears a byte's
+   highest bit only when its low seven bits are zero, and ~x then rules
+   out bytes whose own highest bit is set.  The remaining bits of the
+   result are unspecified, so callers go through get_mask.
+   TEST_EQ flags the bytes in which x and y agree.  */
+BIN_OP(XOR,x^y)
+BIN_OP(OR,x|y)
+BIN_OP(AND,x&y)
+BIN_OP(ANDNOT,x&(~y))
+UN_OP(TEST_ZERO,(AND(~(OR(x,HIGH_BIT)-ONES),~(x))))
+BIN_OP(TEST_EQ,TEST_ZERO(XOR(x,y)))
+
+/* Shift a vector by Y whole bytes.  */
+#define SHIFT_DOWN(x,y) ((x)>>(8*(y)))
+#define SHIFT_UP(x,y) ((x)<<(8*(y)))
+/* CONCAT(x,y,n) ORs x shifted up by BYTES_AT_ONCE-n bytes with y shifted
+   down by n bytes.  n==0 yields y and n==BYTES_AT_ONCE yields x; these
+   cases are handled separately, presumably because they would otherwise
+   shift by the full width of the type.  Every use of n is parenthesized
+   so that expression arguments are compared as a whole.  */
+#define CONCAT(x,y,n) (((n)==0) ? (y) : (((n)==BYTES_AT_ONCE) ? (x) : OR(SHIFT_UP(x,BYTES_AT_ONCE-(n)),SHIFT_DOWN(y,(n)))))
+
+
+#ifdef DEBUG
+/* Debugging aid: print mask positions 0..PARA-1 of M as a row of 0/1
+   digits followed by a newline.  */
+void inspect_mask(tp_mask m)
+{
+  int i=0;
+  while(i<PARA)
+    {
+      if (m&bit_i(i))
+        printf("1");
+      else
+        printf("0");
+      i++;
+    }
+  printf("\n");
+}
+#endif
+
+
+/* TODO implement TEST_RANGE for generic parallel_tolower
+SI tp_vector TEST_RANGE(tp_vector v,uchar from,uchar to){
+ tp_vector fv=BROADCAST(-127-from);
+ v=_mm_add_epi8(v,fv);
+ tp_vector tv=BROADCAST(-127+to-from+1);
+ return _mm_cmplt_epi8(v,tv);
+}
+
+SI tp_vector parallel_tolower(tp_vector m){tp_mask mask;
+ tp_vector high_bit=BROADCAST(128);
+ tp_vector l= AND(TEST_RANGE(m,'A','Z'),high_bit);
+ m=OR(m,_mm_srli_epi64(l,2));
+ if ((mask=get_mask(m))){int i;
+ while(mask){ i=first_bit(mask); mask=forget_first_bit(mask,i);
+ ((uchar*)&m)[i]=tolower(((uchar*)&m)[i]);
+ }
+ }
+ return m;
+}
+*/
+
+
+/* Map every byte of M through the tolower_fixed table and return the
+   resulting vector.  */
+SI tp_vector parallel_tolower(tp_vector m)
+{
+  tp_vector r=m;
+  uchar *bytes=(uchar*)&r;
+  int i;
+  for(i=0; i<sizeof(tp_vector); i++)
+    bytes[i]=tolower_fixed[bytes[i]];
+  return r;
+}
+
+
+/* AGREGATE_MASK merges the per-vector masks mask0..mask3 into one word:
+   mask K is shifted right by K bits so that its byte flags land on the
+   positions bit_i assigns to vector K.  Unsupported unroll factors are
+   rejected here rather than leaving the macro undefined.  */
+#if unroll==1
+#define AGREGATE_MASK mask0
+#elif unroll==2
+#define AGREGATE_MASK (mask0|(mask1>>1))
+#elif unroll==4
+#define AGREGATE_MASK ((mask0|(mask1>>1))|((mask2>>2)|(mask3>>3)))
+#else
+#error "unroll must be 1, 2 or 4"
+#endif
diff --git a/string/loop.h b/string/loop.h
new file mode 100644
index 0000000..1ae2e8f
--- /dev/null
+++ b/string/loop.h
@@ -0,0 +1,195 @@
+/* Copyright (C) 2012 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Basic string search loop. To use it, define the macros below and include this file.
+ TEST_CODE(so,sn) Given a consecutive sequence so,sn of bytes you should produce a
+ vector. For bytes with the highest bit set to 1 the loop invokes the macro
+ LOOP_BODY(p), where p is the corresponding byte in sn.
+ LOOP_BODY(p) See above.
+ DETECT_END(p) When byte p is reached, call the macro LOOP_END(p).
+ DETECT_ZERO_BYTE When the first zero byte is reached, call LOOP_END(p).
+ LOOP_END(p) See above.
+
+ CAN_SKIP You have to define a skip_to variable. The loop will then not call
+ LOOP_BODY(p) when p<skip_to. A LOOP_END condition will still be processed.
+
+ This file should be included inside a function. The loop uses the local variable s as the matched string.
+ Note that an implementation by callback is complicated by the fact that you usually need a closure to
+ share arguments.
+*/
+
+/* Internal helpers derived from the user's configuration macros.
+   With DETECT_ZERO_BYTE, _TEST_ZERO_BYTE checks whether the byte at p is
+   zero; otherwise it is the constant 0.  */
+#ifdef DETECT_ZERO_BYTE
+#define _DETECT_ZERO_BYTE mvec= OR(mvec,TEST_ZERO(sz));
+#define _TEST_ZERO_BYTE (*p==0)
+#else
+#define _DETECT_ZERO_BYTE
+#define _TEST_ZERO_BYTE 0
+#endif
+/* With DETECT_END, _DETECT_END(u) is true when the end pointer is not
+   beyond the first u vectors starting at s2.  An end pointer equal to s
+   is handled at once by calling LOOP_END.  Without DETECT_END the end
+   pointer is NULL and the test is the constant 0.  */
+#ifdef DETECT_END
+#define _DETECT_END(u) (DETECT_END<=s2+u*BYTES_AT_ONCE)
+if (DETECT_END == s)
+ {
+ uchar UNUSED *p=s;
+ LOOP_END(p);
+ }
+#else
+#define DETECT_END ((uchar*)NULL)
+#define _DETECT_END(u) 0
+#endif
+
+
+/* TEST(u) processes vector number u of the current group: the previous
+   vector is kept in so, the new one is loaded from s2+u*BYTES_AT_ONCE
+   into sn and sz##u, and the result of TEST_CODE(so,sn) is stored in both
+   mvec and mvec##u.  */
+#define TEST(u) \
+ mvec=vzero;\
+ so=sn;\
+ sn=sz##u= LOAD(s2+u*BYTES_AT_ONCE);\
+ mvec = TEST_CODE(so,sn); \
+ mvec##u = mvec;
+
+
+/* Prologue: declare the loop state and examine the first group.  */
+int i;
+tp_vector vzero=BROADCAST(0);
+tp_vector sn,so,sz0,sz1,sz2,sz3;
+int s_offset;
+uchar* s2;
+sn=vzero;
+/* ALIGN is defined outside this file; presumably it sets s2 to an
+   aligned address at or before s and s_offset to the distance between
+   them -- TODO confirm.  */
+ALIGN(s,unroll);
+tp_vector mvec,zvec=vzero;
+tp_mask mask, UNUSED zmask;
+/* DO_ACTION is defined outside this file; presumably it expands ACTION
+   once per unrolled vector -- TODO confirm.  Here it declares mvec##x
+   and mask##x.  */
+#undef ACTION
+#define ACTION(x) tp_vector mvec##x; tp_mask mask##x;
+DO_ACTION;
+#undef ACTION
+#define ACTION(x) TEST(x)
+DO_ACTION;
+
+/* Also flag zero bytes when the caller asked for them.  */
+#ifdef DETECT_ZERO_BYTE
+ #undef ACTION
+ #define ACTION(x) mvec##x=OR(mvec##x,TEST_ZERO(sz##x));
+ DO_ACTION;
+#endif
+
+#undef ACTION
+#define ACTION(x) mask##x=get_mask(mvec##x);
+DO_ACTION;
+mask=AGREGATE_MASK;
+/* Ignore flagged positions that lie before s_offset.  */
+mask=forget_before(mask,s_offset);
+if (mask||_DETECT_END(unroll)) goto test;
+start:
+;
+/* Main scan.  Each iteration advances s2 by PARA bytes, runs TEST on
+   every unrolled vector and jumps to `test' as soon as a vector has a
+   flagged byte, a zero byte is seen (with DETECT_ZERO_BYTE), or
+   DETECT_END is not beyond the end of the group.  */
+while(1)
+ {
+ s2+=PARA;
+ PREFETCH(s2+prefetch*CACHE_LINE_SIZE);
+#undef ACTION
+#define ACTION(x) TEST(x)
+ DO_ACTION;
+#ifdef DETECT_ZERO_BYTE
+ /* zvec flags a zero byte anywhere in the group.  With HAS_PARALLEL_MIN
+ a single TEST_ZERO on the combined MINI of all vectors is used.
+ MINI and OR take exactly two operands.  */
+#if unroll==1
+ zvec=TEST_ZERO(sz0);
+#elif unroll==2
+#ifdef HAS_PARALLEL_MIN
+ zvec=TEST_ZERO(MINI(sz0,sz1));
+#else
+ zvec=OR(TEST_ZERO(sz0),TEST_ZERO(sz1));
+#endif
+#elif unroll==4
+#ifdef HAS_PARALLEL_MIN
+ zvec=TEST_ZERO(MINI(MINI(sz0,sz1),MINI(sz2,sz3)));
+#else
+ zvec=OR(OR(TEST_ZERO(sz0),TEST_ZERO(sz1)),
+ OR(TEST_ZERO(sz2),TEST_ZERO(sz3)));
+#endif
+#endif
+#endif
+ if(NONZERO_MASK(OR(AGREGATE_VECTOR,zvec))||_DETECT_END(unroll))
+ {
+ /* On x64, OR is a destructive operation.  In the case of strlen it
+ is faster to recalculate mvec0,mvec2 than to move them to separate
+ registers.  */
+
+#ifdef DETECT_ZERO_BYTE
+ #undef ACTION
+ #define ACTION(x) mvec##x=OR(mvec##x,TEST_ZERO(sz##x));
+ DO_ACTION;
+#endif
+
+#undef ACTION
+#define ACTION(x) mask##x=get_mask(mvec##x);
+ DO_ACTION;
+ mask=AGREGATE_MASK;
+ goto test;
+ }
+ }
+test:; /* Shared exit path; jumping here keeps gcc from duplicating the fragment below.  */
+/* end is 0 while no terminating position is known for this group;
+   otherwise it is the index of that position plus one.  */
+int end=0;
+#ifdef CAN_SKIP
+/* Locate the zero byte separately so that it cannot be skipped.  */
+#ifdef DETECT_ZERO_BYTE
+#define ZTEST(u) \
+ mask##u=get_mask(TEST_ZERO(sz##u));
+#undef ACTION
+#define ACTION(x) ZTEST(x)
+DO_ACTION;
+zmask=AGREGATE_MASK;
+/* NOTE(review): s>s2 presumably holds only for the first group, where
+   positions before s_offset precede the string -- confirm against ALIGN.  */
+if (s>s2)
+ zmask=forget_before(zmask,s_offset);
+
+if(zmask) end = first_bit(zmask,0)+1;
+#endif
+/* Drop flagged positions that lie before skip_to.  */
+if(skip_to>s2)
+ mask=forget_before(mask,skip_to-s2);
+#endif
+if (_DETECT_END(unroll)) /* We need to handle the case when the end is at the start of the next page here.  */
+ {
+ /* Take the earlier of DETECT_END and an end found above.
+ NOTE(review): 64 looks like a "no end found yet" sentinel -- confirm.  */
+ end = min(DETECT_END-s2-1,end ? (end-1) : 64)+1;
+ }
+/* Flagged positions after the end are not reported.  */
+if (end)
+ {
+ mask=forget_after(mask,end-1);
+ }
+/* Visit every remaining flagged position in increasing order.  */
+i=0;
+while(mask)
+ {
+ i=first_bit(mask,i);
+ uchar UNUSED *p=s2+i;
+ if(__builtin_expect(_TEST_ZERO_BYTE,0))
+ {
+ LOOP_END(p)
+ }
+ LOOP_BODY(p)
+ /* NOTE(review): with CAN_SKIP the current bit is cleared only if
+ LOOP_BODY moved skip_to past p -- confirm every user does so.  */
+#ifdef CAN_SKIP
+ mask=forget_before(mask,skip_to-s2);
+#else
+ mask=forget_first_bit(mask,i);
+#endif
+ }
+/* An end was reached inside this group: finish with LOOP_END.  */
+if(end)
+ {
+ uchar UNUSED *p=DETECT_END;
+ LOOP_END(p);
+ }
+
+goto start;
+
+
+#undef CAN_SKIP
+
+#undef TEST_CODE
+#undef LOOP_BODY
+#undef ACTION
+#undef DETECT_END
+#undef _DETECT_END
diff --git a/string/memchr.c b/string/memchr.c
index 22637cf..775afa6 100644
--- a/string/memchr.c
+++ b/string/memchr.c
@@ -24,29 +24,6 @@
#include <config.h>
#endif
-#undef __ptr_t
-#define __ptr_t void *
-
-#if defined _LIBC
-# include <string.h>
-# include <memcopy.h>
-#endif
-
-#if HAVE_STDLIB_H || defined _LIBC
-# include <stdlib.h>
-#endif
-
-#if HAVE_LIMITS_H || defined _LIBC
-# include <limits.h>
-#endif
-
-#define LONG_MAX_32_BITS 2147483647
-
-#ifndef LONG_MAX
-#define LONG_MAX LONG_MAX_32_BITS
-#endif
-
-#include <sys/types.h>
#if HAVE_BP_SYM_H || defined _LIBC
#include <bp-sym.h>
#else
@@ -56,152 +33,15 @@
#undef memchr
#undef __memchr
-/* Search no more than N bytes of S for C. */
-__ptr_t
-__memchr (s, c_in, n)
- const __ptr_t s;
- int c_in;
- size_t n;
-{
- const unsigned char *char_ptr;
- const unsigned long int *longword_ptr;
- unsigned long int longword, magic_bits, charmask;
- unsigned char c;
-
- c = (unsigned char) c_in;
-
- /* Handle the first few characters by reading one character at a time.
- Do this until CHAR_PTR is aligned on a longword boundary. */
- for (char_ptr = (const unsigned char *) s;
- n > 0 && ((unsigned long int) char_ptr
- & (sizeof (longword) - 1)) != 0;
- --n, ++char_ptr)
- if (*char_ptr == c)
- return (__ptr_t) char_ptr;
-
- /* All these elucidatory comments refer to 4-byte longwords,
- but the theory applies equally well to 8-byte longwords. */
-
- longword_ptr = (unsigned long int *) char_ptr;
-
- /* Bits 31, 24, 16, and 8 of this number are zero. Call these bits
- the "holes." Note that there is a hole just to the left of
- each byte, with an extra at the end:
-
- bits: 01111110 11111110 11111110 11111111
- bytes: AAAAAAAA BBBBBBBB CCCCCCCC DDDDDDDD
-
- The 1-bits make sure that carries propagate to the next 0-bit.
- The 0-bits provide holes for carries to fall into. */
-
- if (sizeof (longword) != 4 && sizeof (longword) != 8)
- abort ();
-
-#if LONG_MAX <= LONG_MAX_32_BITS
- magic_bits = 0x7efefeff;
-#else
- magic_bits = ((unsigned long int) 0x7efefefe << 32) | 0xfefefeff;
-#endif
-
- /* Set up a longword, each of whose bytes is C. */
- charmask = c | (c << 8);
- charmask |= charmask << 16;
-#if LONG_MAX > LONG_MAX_32_BITS
- charmask |= charmask << 32;
-#endif
-
- /* Instead of the traditional loop which tests each character,
- we will test a longword at a time. The tricky part is testing
- if *any of the four* bytes in the longword in question are zero. */
- while (n >= sizeof (longword))
- {
- /* We tentatively exit the loop if adding MAGIC_BITS to
- LONGWORD fails to change any of the hole bits of LONGWORD.
-
- 1) Is this safe? Will it catch all the zero bytes?
- Suppose there is a byte with all zeros. Any carry bits
- propagating from its left will fall into the hole at its
- least significant bit and stop. Since there will be no
- carry from its most significant bit, the LSB of the
- byte to the left will be unchanged, and the zero will be
- detected.
-
- 2) Is this worthwhile? Will it ignore everything except
- zero bytes? Suppose every byte of LONGWORD has a bit set
- somewhere. There will be a carry into bit 8. If bit 8
- is set, this will carry into bit 16. If bit 8 is clear,
- one of bits 9-15 must be set, so there will be a carry
- into bit 16. Similarly, there will be a carry into bit
- 24. If one of bits 24-30 is set, there will be a carry
- into bit 31, so all of the hole bits will be changed.
-
- The one misfire occurs when bits 24-30 are clear and bit
- 31 is set; in this case, the hole at bit 31 is not
- changed. If we had access to the processor carry flag,
- we could close this loophole by putting the fourth hole
- at bit 32!
-
- So it ignores everything except 128's, when they're aligned
- properly.
-
- 3) But wait! Aren't we looking for C, not zero?
- Good point. So what we do is XOR LONGWORD with a longword,
- each of whose bytes is C. This turns each byte that is C
- into a zero. */
-
- longword = *longword_ptr++ ^ charmask;
-
- /* Add MAGIC_BITS to LONGWORD. */
- if ((((longword + magic_bits)
-
- /* Set those bits that were unchanged by the addition. */
- ^ ~longword)
-
- /* Look at only the hole bits. If any of the hole bits
- are unchanged, most likely one of the bytes was a
- zero. */
- & ~magic_bits) != 0)
- {
- /* Which of the bytes was C? If none of them were, it was
- a misfire; continue the search. */
-
- const unsigned char *cp = (const unsigned char *) (longword_ptr - 1);
-
- if (cp[0] == c)
- return (__ptr_t) cp;
- if (cp[1] == c)
- return (__ptr_t) &cp[1];
- if (cp[2] == c)
- return (__ptr_t) &cp[2];
- if (cp[3] == c)
- return (__ptr_t) &cp[3];
-#if LONG_MAX > 2147483647
- if (cp[4] == c)
- return (__ptr_t) &cp[4];
- if (cp[5] == c)
- return (__ptr_t) &cp[5];
- if (cp[6] == c)
- return (__ptr_t) &cp[6];
- if (cp[7] == c)
- return (__ptr_t) &cp[7];
+#ifndef MEMCHR
+#define MEMCHR __memchr
#endif
- }
- n -= sizeof (longword);
- }
+#define AS_MEMCHR
+#include "strchr.h"
- char_ptr = (const unsigned char *) longword_ptr;
- while (n-- > 0)
- {
- if (*char_ptr == c)
- return (__ptr_t) char_ptr;
- else
- ++char_ptr;
- }
- return 0;
-}
#ifdef weak_alias
weak_alias (__memchr, BP_SYM (memchr))
#endif
diff --git a/string/memmem.c b/string/memmem.c
index 625c9cf..d208a35 100644
--- a/string/memmem.c
+++ b/string/memmem.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 1991,92,93,94,96,97,98,2000,2004,2008 Free Software Foundation, Inc.
+/* Copyright (C) 2012 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -12,66 +12,14 @@
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
+ License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-/* This particular implementation was written by Eric Blake, 2008. */
-#ifndef _LIBC
-# include <config.h>
+#ifndef MEMMEM
+#define MEMMEM memmem
#endif
-/* Specification of memmem. */
-#include <string.h>
+#define AS_MEMMEM
+#include "strstr.h"
-#ifndef _LIBC
-# define __builtin_expect(expr, val) (expr)
-#endif
-
-#define RETURN_TYPE void *
-#define AVAILABLE(h, h_l, j, n_l) ((j) <= (h_l) - (n_l))
-#include "str-two-way.h"
-
-#undef memmem
-
-/* Return the first occurrence of NEEDLE in HAYSTACK. Return HAYSTACK
- if NEEDLE_LEN is 0, otherwise NULL if NEEDLE is not found in
- HAYSTACK. */
-void *
-memmem (const void *haystack_start, size_t haystack_len,
- const void *needle_start, size_t needle_len)
-{
- /* Abstract memory is considered to be an array of 'unsigned char' values,
- not an array of 'char' values. See ISO C 99 section 6.2.6.1. */
- const unsigned char *haystack = (const unsigned char *) haystack_start;
- const unsigned char *needle = (const unsigned char *) needle_start;
-
- if (needle_len == 0)
- /* The first occurrence of the empty string is deemed to occur at
- the beginning of the string. */
- return (void *) haystack;
-
- /* Sanity check, otherwise the loop might search through the whole
- memory. */
- if (__builtin_expect (haystack_len < needle_len, 0))
- return NULL;
-
- /* Use optimizations in memchr when possible, to reduce the search
- size of haystack using a linear algorithm with a smaller
- coefficient. However, avoid memchr for long needles, since we
- can often achieve sublinear performance. */
- if (needle_len < LONG_NEEDLE_THRESHOLD)
- {
- haystack = memchr (haystack, *needle, haystack_len);
- if (!haystack || __builtin_expect (needle_len == 1, 0))
- return (void *) haystack;
- haystack_len -= haystack - (const unsigned char *) haystack_start;
- if (haystack_len < needle_len)
- return NULL;
- return two_way_short_needle (haystack, haystack_len, needle, needle_len);
- }
- else
- return two_way_long_needle (haystack, haystack_len, needle, needle_len);
-}
-
-#undef LONG_NEEDLE_THRESHOLD
diff --git a/string/memrchr.c b/string/memrchr.c
index 2826f13..a1da7bd 100644
--- a/string/memrchr.c
+++ b/string/memrchr.c
@@ -27,25 +27,6 @@
# include <config.h>
#endif
-#undef __ptr_t
-#define __ptr_t void *
-
-#if defined _LIBC
-# include <string.h>
-# include <memcopy.h>
-#endif
-
-#if defined HAVE_LIMITS_H || defined _LIBC
-# include <limits.h>
-#endif
-
-#define LONG_MAX_32_BITS 2147483647
-
-#ifndef LONG_MAX
-# define LONG_MAX LONG_MAX_32_BITS
-#endif
-
-#include <sys/types.h>
#undef __memrchr
#undef memrchr
@@ -54,155 +35,13 @@
# define __memrchr memrchr
#endif
-/* Search no more than N bytes of S for C. */
-__ptr_t
#ifndef MEMRCHR
-__memrchr
-#else
-MEMRCHR
-#endif
- (s, c_in, n)
- const __ptr_t s;
- int c_in;
- size_t n;
-{
- const unsigned char *char_ptr;
- const unsigned long int *longword_ptr;
- unsigned long int longword, magic_bits, charmask;
- unsigned char c;
-
- c = (unsigned char) c_in;
-
- /* Handle the last few characters by reading one character at a time.
- Do this until CHAR_PTR is aligned on a longword boundary. */
- for (char_ptr = (const unsigned char *) s + n;
- n > 0 && ((unsigned long int) char_ptr
- & (sizeof (longword) - 1)) != 0;
- --n)
- if (*--char_ptr == c)
- return (__ptr_t) char_ptr;
-
- /* All these elucidatory comments refer to 4-byte longwords,
- but the theory applies equally well to 8-byte longwords. */
-
- longword_ptr = (const unsigned long int *) char_ptr;
-
- /* Bits 31, 24, 16, and 8 of this number are zero. Call these bits
- the "holes." Note that there is a hole just to the left of
- each byte, with an extra at the end:
-
- bits: 01111110 11111110 11111110 11111111
- bytes: AAAAAAAA BBBBBBBB CCCCCCCC DDDDDDDD
-
- The 1-bits make sure that carries propagate to the next 0-bit.
- The 0-bits provide holes for carries to fall into. */
-
- if (sizeof (longword) != 4 && sizeof (longword) != 8)
- abort ();
-
-#if LONG_MAX <= LONG_MAX_32_BITS
- magic_bits = 0x7efefeff;
-#else
- magic_bits = ((unsigned long int) 0x7efefefe << 32) | 0xfefefeff;
+#define MEMRCHR __memrchr
#endif
- /* Set up a longword, each of whose bytes is C. */
- charmask = c | (c << 8);
- charmask |= charmask << 16;
-#if LONG_MAX > LONG_MAX_32_BITS
- charmask |= charmask << 32;
-#endif
-
- /* Instead of the traditional loop which tests each character,
- we will test a longword at a time. The tricky part is testing
- if *any of the four* bytes in the longword in question are zero. */
- while (n >= sizeof (longword))
- {
- /* We tentatively exit the loop if adding MAGIC_BITS to
- LONGWORD fails to change any of the hole bits of LONGWORD.
-
- 1) Is this safe? Will it catch all the zero bytes?
- Suppose there is a byte with all zeros. Any carry bits
- propagating from its left will fall into the hole at its
- least significant bit and stop. Since there will be no
- carry from its most significant bit, the LSB of the
- byte to the left will be unchanged, and the zero will be
- detected.
-
- 2) Is this worthwhile? Will it ignore everything except
- zero bytes? Suppose every byte of LONGWORD has a bit set
- somewhere. There will be a carry into bit 8. If bit 8
- is set, this will carry into bit 16. If bit 8 is clear,
- one of bits 9-15 must be set, so there will be a carry
- into bit 16. Similarly, there will be a carry into bit
- 24. If one of bits 24-30 is set, there will be a carry
- into bit 31, so all of the hole bits will be changed.
-
- The one misfire occurs when bits 24-30 are clear and bit
- 31 is set; in this case, the hole at bit 31 is not
- changed. If we had access to the processor carry flag,
- we could close this loophole by putting the fourth hole
- at bit 32!
-
- So it ignores everything except 128's, when they're aligned
- properly.
-
- 3) But wait! Aren't we looking for C, not zero?
- Good point. So what we do is XOR LONGWORD with a longword,
- each of whose bytes is C. This turns each byte that is C
- into a zero. */
-
- longword = *--longword_ptr ^ charmask;
-
- /* Add MAGIC_BITS to LONGWORD. */
- if ((((longword + magic_bits)
-
- /* Set those bits that were unchanged by the addition. */
- ^ ~longword)
-
- /* Look at only the hole bits. If any of the hole bits
- are unchanged, most likely one of the bytes was a
- zero. */
- & ~magic_bits) != 0)
- {
- /* Which of the bytes was C? If none of them were, it was
- a misfire; continue the search. */
-
- const unsigned char *cp = (const unsigned char *) longword_ptr;
-
-#if LONG_MAX > 2147483647
- if (cp[7] == c)
- return (__ptr_t) &cp[7];
- if (cp[6] == c)
- return (__ptr_t) &cp[6];
- if (cp[5] == c)
- return (__ptr_t) &cp[5];
- if (cp[4] == c)
- return (__ptr_t) &cp[4];
-#endif
- if (cp[3] == c)
- return (__ptr_t) &cp[3];
- if (cp[2] == c)
- return (__ptr_t) &cp[2];
- if (cp[1] == c)
- return (__ptr_t) &cp[1];
- if (cp[0] == c)
- return (__ptr_t) cp;
- }
-
- n -= sizeof (longword);
- }
-
- char_ptr = (const unsigned char *) longword_ptr;
-
- while (n-- > 0)
- {
- if (*--char_ptr == c)
- return (__ptr_t) char_ptr;
- }
+#define AS_MEMRCHR
+#include "strchr.h"
- return 0;
-}
#ifndef MEMRCHR
# ifdef weak_alias
weak_alias (__memrchr, memrchr)
diff --git a/string/rawmemchr.c b/string/rawmemchr.c
index 90e8c7c..d880272 100644
--- a/string/rawmemchr.c
+++ b/string/rawmemchr.c
@@ -24,159 +24,16 @@
#include <config.h>
#endif
-#undef __ptr_t
-#define __ptr_t void *
-
-#if defined (_LIBC)
-# include <string.h>
-# include <memcopy.h>
-# include <stdlib.h>
-#endif
-
-#if defined (HAVE_LIMITS_H) || defined (_LIBC)
-# include <limits.h>
-#endif
-
-#define LONG_MAX_32_BITS 2147483647
-
-#ifndef LONG_MAX
-#define LONG_MAX LONG_MAX_32_BITS
-#endif
-
-#include <sys/types.h>
-
#undef memchr
-
-/* Find the first occurrence of C in S. */
-__ptr_t
-__rawmemchr (s, c_in)
- const __ptr_t s;
- int c_in;
-{
- const unsigned char *char_ptr;
- const unsigned long int *longword_ptr;
- unsigned long int longword, magic_bits, charmask;
- unsigned char c;
-
- c = (unsigned char) c_in;
-
- /* Handle the first few characters by reading one character at a time.
- Do this until CHAR_PTR is aligned on a longword boundary. */
- for (char_ptr = (const unsigned char *) s;
- ((unsigned long int) char_ptr & (sizeof (longword) - 1)) != 0;
- ++char_ptr)
- if (*char_ptr == c)
- return (__ptr_t) char_ptr;
-
- /* All these elucidatory comments refer to 4-byte longwords,
- but the theory applies equally well to 8-byte longwords. */
-
- longword_ptr = (unsigned long int *) char_ptr;
-
- /* Bits 31, 24, 16, and 8 of this number are zero. Call these bits
- the "holes." Note that there is a hole just to the left of
- each byte, with an extra at the end:
-
- bits: 01111110 11111110 11111110 11111111
- bytes: AAAAAAAA BBBBBBBB CCCCCCCC DDDDDDDD
-
- The 1-bits make sure that carries propagate to the next 0-bit.
- The 0-bits provide holes for carries to fall into. */
-
- if (sizeof (longword) != 4 && sizeof (longword) != 8)
- abort ();
-
-#if LONG_MAX <= LONG_MAX_32_BITS
- magic_bits = 0x7efefeff;
-#else
- magic_bits = ((unsigned long int) 0x7efefefe << 32) | 0xfefefeff;
+#ifndef RAWMEMCHR
+#define RAWMEMCHR __rawmemchr
#endif
- /* Set up a longword, each of whose bytes is C. */
- charmask = c | (c << 8);
- charmask |= charmask << 16;
-#if LONG_MAX > LONG_MAX_32_BITS
- charmask |= charmask << 32;
-#endif
-
- /* Instead of the traditional loop which tests each character,
- we will test a longword at a time. The tricky part is testing
- if *any of the four* bytes in the longword in question are zero. */
- while (1)
- {
- /* We tentatively exit the loop if adding MAGIC_BITS to
- LONGWORD fails to change any of the hole bits of LONGWORD.
-
- 1) Is this safe? Will it catch all the zero bytes?
- Suppose there is a byte with all zeros. Any carry bits
- propagating from its left will fall into the hole at its
- least significant bit and stop. Since there will be no
- carry from its most significant bit, the LSB of the
- byte to the left will be unchanged, and the zero will be
- detected.
-
- 2) Is this worthwhile? Will it ignore everything except
- zero bytes? Suppose every byte of LONGWORD has a bit set
- somewhere. There will be a carry into bit 8. If bit 8
- is set, this will carry into bit 16. If bit 8 is clear,
- one of bits 9-15 must be set, so there will be a carry
- into bit 16. Similarly, there will be a carry into bit
- 24. If one of bits 24-30 is set, there will be a carry
- into bit 31, so all of the hole bits will be changed.
+#define AS_RAWMEMCHR
+#include "strchr.h"
- The one misfire occurs when bits 24-30 are clear and bit
- 31 is set; in this case, the hole at bit 31 is not
- changed. If we had access to the processor carry flag,
- we could close this loophole by putting the fourth hole
- at bit 32!
- So it ignores everything except 128's, when they're aligned
- properly.
- 3) But wait! Aren't we looking for C, not zero?
- Good point. So what we do is XOR LONGWORD with a longword,
- each of whose bytes is C. This turns each byte that is C
- into a zero. */
-
- longword = *longword_ptr++ ^ charmask;
-
- /* Add MAGIC_BITS to LONGWORD. */
- if ((((longword + magic_bits)
-
- /* Set those bits that were unchanged by the addition. */
- ^ ~longword)
-
- /* Look at only the hole bits. If any of the hole bits
- are unchanged, most likely one of the bytes was a
- zero. */
- & ~magic_bits) != 0)
- {
- /* Which of the bytes was C? If none of them were, it was
- a misfire; continue the search. */
-
- const unsigned char *cp = (const unsigned char *) (longword_ptr - 1);
-
- if (cp[0] == c)
- return (__ptr_t) cp;
- if (cp[1] == c)
- return (__ptr_t) &cp[1];
- if (cp[2] == c)
- return (__ptr_t) &cp[2];
- if (cp[3] == c)
- return (__ptr_t) &cp[3];
-#if LONG_MAX > 2147483647
- if (cp[4] == c)
- return (__ptr_t) &cp[4];
- if (cp[5] == c)
- return (__ptr_t) &cp[5];
- if (cp[6] == c)
- return (__ptr_t) &cp[6];
- if (cp[7] == c)
- return (__ptr_t) &cp[7];
-#endif
- }
- }
-}
libc_hidden_def (__rawmemchr)
weak_alias (__rawmemchr, rawmemchr)
diff --git a/string/str-two-way.h b/string/str-two-way.h
deleted file mode 100644
index 1b2a8bd..0000000
--- a/string/str-two-way.h
+++ /dev/null
@@ -1,428 +0,0 @@
-/* Byte-wise substring search, using the Two-Way algorithm.
- Copyright (C) 2008, 2010 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
- Written by Eric Blake <ebb9@byu.net>, 2008.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-/* Before including this file, you need to include <string.h> (and
- <config.h> before that, if not part of libc), and define:
- RESULT_TYPE A macro that expands to the return type.
- AVAILABLE(h, h_l, j, n_l)
- A macro that returns nonzero if there are
- at least N_L bytes left starting at H[J].
- H is 'unsigned char *', H_L, J, and N_L
- are 'size_t'; H_L is an lvalue. For
- NUL-terminated searches, H_L can be
- modified each iteration to avoid having
- to compute the end of H up front.
-
- For case-insensitivity, you may optionally define:
- CMP_FUNC(p1, p2, l) A macro that returns 0 iff the first L
- characters of P1 and P2 are equal.
- CANON_ELEMENT(c) A macro that canonicalizes an element right after
- it has been fetched from one of the two strings.
- The argument is an 'unsigned char'; the result
- must be an 'unsigned char' as well.
-
- This file undefines the macros documented above, and defines
- LONG_NEEDLE_THRESHOLD.
-*/
-
-#include <limits.h>
-#include <stdint.h>
-
-/* We use the Two-Way string matching algorithm, which guarantees
- linear complexity with constant space. Additionally, for long
- needles, we also use a bad character shift table similar to the
- Boyer-Moore algorithm to achieve improved (potentially sub-linear)
- performance.
-
- See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260
- and http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm
-*/
-
-/* Point at which computing a bad-byte shift table is likely to be
- worthwhile. Small needles should not compute a table, since it
- adds (1 << CHAR_BIT) + NEEDLE_LEN computations of preparation for a
- speedup no greater than a factor of NEEDLE_LEN. The larger the
- needle, the better the potential performance gain. On the other
- hand, on non-POSIX systems with CHAR_BIT larger than eight, the
- memory required for the table is prohibitive. */
-#if CHAR_BIT < 10
-# define LONG_NEEDLE_THRESHOLD 32U
-#else
-# define LONG_NEEDLE_THRESHOLD SIZE_MAX
-#endif
-
-#ifndef MAX
-# define MAX(a, b) ((a < b) ? (b) : (a))
-#endif
-
-#ifndef CANON_ELEMENT
-# define CANON_ELEMENT(c) c
-#endif
-#ifndef CMP_FUNC
-# define CMP_FUNC memcmp
-#endif
-
-/* Perform a critical factorization of NEEDLE, of length NEEDLE_LEN.
- Return the index of the first byte in the right half, and set
- *PERIOD to the global period of the right half.
-
- The global period of a string is the smallest index (possibly its
- length) at which all remaining bytes in the string are repetitions
- of the prefix (the last repetition may be a subset of the prefix).
-
- When NEEDLE is factored into two halves, a local period is the
- length of the smallest word that shares a suffix with the left half
- and shares a prefix with the right half. All factorizations of a
- non-empty NEEDLE have a local period of at least 1 and no greater
- than NEEDLE_LEN.
-
- A critical factorization has the property that the local period
- equals the global period. All strings have at least one critical
- factorization with the left half smaller than the global period.
-
- Given an ordered alphabet, a critical factorization can be computed
- in linear time, with 2 * NEEDLE_LEN comparisons, by computing the
- larger of two ordered maximal suffixes. The ordered maximal
- suffixes are determined by lexicographic comparison of
- periodicity. */
-static size_t
-critical_factorization (const unsigned char *needle, size_t needle_len,
- size_t *period)
-{
- /* Index of last byte of left half, or SIZE_MAX. */
- size_t max_suffix, max_suffix_rev;
- size_t j; /* Index into NEEDLE for current candidate suffix. */
- size_t k; /* Offset into current period. */
- size_t p; /* Intermediate period. */
- unsigned char a, b; /* Current comparison bytes. */
-
- /* Invariants:
- 0 <= j < NEEDLE_LEN - 1
- -1 <= max_suffix{,_rev} < j (treating SIZE_MAX as if it were signed)
- min(max_suffix, max_suffix_rev) < global period of NEEDLE
- 1 <= p <= global period of NEEDLE
- p == global period of the substring NEEDLE[max_suffix{,_rev}+1...j]
- 1 <= k <= p
- */
-
- /* Perform lexicographic search. */
- max_suffix = SIZE_MAX;
- j = 0;
- k = p = 1;
- while (j + k < needle_len)
- {
- a = CANON_ELEMENT (needle[j + k]);
- b = CANON_ELEMENT (needle[max_suffix + k]);
- if (a < b)
- {
- /* Suffix is smaller, period is entire prefix so far. */
- j += k;
- k = 1;
- p = j - max_suffix;
- }
- else if (a == b)
- {
- /* Advance through repetition of the current period. */
- if (k != p)
- ++k;
- else
- {
- j += p;
- k = 1;
- }
- }
- else /* b < a */
- {
- /* Suffix is larger, start over from current location. */
- max_suffix = j++;
- k = p = 1;
- }
- }
- *period = p;
-
- /* Perform reverse lexicographic search. */
- max_suffix_rev = SIZE_MAX;
- j = 0;
- k = p = 1;
- while (j + k < needle_len)
- {
- a = CANON_ELEMENT (needle[j + k]);
- b = CANON_ELEMENT (needle[max_suffix_rev + k]);
- if (b < a)
- {
- /* Suffix is smaller, period is entire prefix so far. */
- j += k;
- k = 1;
- p = j - max_suffix_rev;
- }
- else if (a == b)
- {
- /* Advance through repetition of the current period. */
- if (k != p)
- ++k;
- else
- {
- j += p;
- k = 1;
- }
- }
- else /* a < b */
- {
- /* Suffix is larger, start over from current location. */
- max_suffix_rev = j++;
- k = p = 1;
- }
- }
-
- /* Choose the longer suffix. Return the first byte of the right
- half, rather than the last byte of the left half. */
- if (max_suffix_rev + 1 < max_suffix + 1)
- return max_suffix + 1;
- *period = p;
- return max_suffix_rev + 1;
-}
-
-/* Return the first location of non-empty NEEDLE within HAYSTACK, or
- NULL. HAYSTACK_LEN is the minimum known length of HAYSTACK. This
- method is optimized for NEEDLE_LEN < LONG_NEEDLE_THRESHOLD.
- Performance is guaranteed to be linear, with an initialization cost
- of 2 * NEEDLE_LEN comparisons.
-
- If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at
- most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching.
- If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
- HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching. */
-static RETURN_TYPE
-two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
- const unsigned char *needle, size_t needle_len)
-{
- size_t i; /* Index into current byte of NEEDLE. */
- size_t j; /* Index into current window of HAYSTACK. */
- size_t period; /* The period of the right half of needle. */
- size_t suffix; /* The index of the right half of needle. */
-
- /* Factor the needle into two halves, such that the left half is
- smaller than the global period, and the right half is
- periodic (with a period as large as NEEDLE_LEN - suffix). */
- suffix = critical_factorization (needle, needle_len, &period);
-
- /* Perform the search. Each iteration compares the right half
- first. */
- if (CMP_FUNC (needle, needle + period, suffix) == 0)
- {
- /* Entire needle is periodic; a mismatch can only advance by the
- period, so use memory to avoid rescanning known occurrences
- of the period. */
- size_t memory = 0;
- j = 0;
- while (AVAILABLE (haystack, haystack_len, j, needle_len))
- {
- /* Scan for matches in right half. */
- i = MAX (suffix, memory);
- while (i < needle_len && (CANON_ELEMENT (needle[i])
- == CANON_ELEMENT (haystack[i + j])))
- ++i;
- if (needle_len <= i)
- {
- /* Scan for matches in left half. */
- i = suffix - 1;
- while (memory < i + 1 && (CANON_ELEMENT (needle[i])
- == CANON_ELEMENT (haystack[i + j])))
- --i;
- if (i + 1 < memory + 1)
- return (RETURN_TYPE) (haystack + j);
- /* No match, so remember how many repetitions of period
- on the right half were scanned. */
- j += period;
- memory = needle_len - period;
- }
- else
- {
- j += i - suffix + 1;
- memory = 0;
- }
- }
- }
- else
- {
- /* The two halves of needle are distinct; no extra memory is
- required, and any mismatch results in a maximal shift. */
- period = MAX (suffix, needle_len - suffix) + 1;
- j = 0;
- while (AVAILABLE (haystack, haystack_len, j, needle_len))
- {
- /* Scan for matches in right half. */
- i = suffix;
- while (i < needle_len && (CANON_ELEMENT (needle[i])
- == CANON_ELEMENT (haystack[i + j])))
- ++i;
- if (needle_len <= i)
- {
- /* Scan for matches in left half. */
- i = suffix - 1;
- while (i != SIZE_MAX && (CANON_ELEMENT (needle[i])
- == CANON_ELEMENT (haystack[i + j])))
- --i;
- if (i == SIZE_MAX)
- return (RETURN_TYPE) (haystack + j);
- j += period;
- }
- else
- j += i - suffix + 1;
- }
- }
- return NULL;
-}
-
-/* Return the first location of non-empty NEEDLE within HAYSTACK, or
- NULL. HAYSTACK_LEN is the minimum known length of HAYSTACK. This
- method is optimized for LONG_NEEDLE_THRESHOLD <= NEEDLE_LEN.
- Performance is guaranteed to be linear, with an initialization cost
- of 3 * NEEDLE_LEN + (1 << CHAR_BIT) operations.
-
- If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at
- most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching,
- and sublinear performance O(HAYSTACK_LEN / NEEDLE_LEN) is possible.
- If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
- HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and
- sublinear performance is not possible. */
-static RETURN_TYPE
-two_way_long_needle (const unsigned char *haystack, size_t haystack_len,
- const unsigned char *needle, size_t needle_len)
-{
- size_t i; /* Index into current byte of NEEDLE. */
- size_t j; /* Index into current window of HAYSTACK. */
- size_t period; /* The period of the right half of needle. */
- size_t suffix; /* The index of the right half of needle. */
- size_t shift_table[1U << CHAR_BIT]; /* See below. */
-
- /* Factor the needle into two halves, such that the left half is
- smaller than the global period, and the right half is
- periodic (with a period as large as NEEDLE_LEN - suffix). */
- suffix = critical_factorization (needle, needle_len, &period);
-
- /* Populate shift_table. For each possible byte value c,
- shift_table[c] is the distance from the last occurrence of c to
- the end of NEEDLE, or NEEDLE_LEN if c is absent from the NEEDLE.
- shift_table[NEEDLE[NEEDLE_LEN - 1]] contains the only 0. */
- for (i = 0; i < 1U << CHAR_BIT; i++)
- shift_table[i] = needle_len;
- for (i = 0; i < needle_len; i++)
- shift_table[CANON_ELEMENT (needle[i])] = needle_len - i - 1;
-
- /* Perform the search. Each iteration compares the right half
- first. */
- if (CMP_FUNC (needle, needle + period, suffix) == 0)
- {
- /* Entire needle is periodic; a mismatch can only advance by the
- period, so use memory to avoid rescanning known occurrences
- of the period. */
- size_t memory = 0;
- size_t shift;
- j = 0;
- while (AVAILABLE (haystack, haystack_len, j, needle_len))
- {
- /* Check the last byte first; if it does not match, then
- shift to the next possible match location. */
- shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])];
- if (0 < shift)
- {
- if (memory && shift < period)
- {
- /* Since needle is periodic, but the last period has
- a byte out of place, there can be no match until
- after the mismatch. */
- shift = needle_len - period;
- }
- memory = 0;
- j += shift;
- continue;
- }
- /* Scan for matches in right half. The last byte has
- already been matched, by virtue of the shift table. */
- i = MAX (suffix, memory);
- while (i < needle_len - 1 && (CANON_ELEMENT (needle[i])
- == CANON_ELEMENT (haystack[i + j])))
- ++i;
- if (needle_len - 1 <= i)
- {
- /* Scan for matches in left half. */
- i = suffix - 1;
- while (memory < i + 1 && (CANON_ELEMENT (needle[i])
- == CANON_ELEMENT (haystack[i + j])))
- --i;
- if (i + 1 < memory + 1)
- return (RETURN_TYPE) (haystack + j);
- /* No match, so remember how many repetitions of period
- on the right half were scanned. */
- j += period;
- memory = needle_len - period;
- }
- else
- {
- j += i - suffix + 1;
- memory = 0;
- }
- }
- }
- else
- {
- /* The two halves of needle are distinct; no extra memory is
- required, and any mismatch results in a maximal shift. */
- size_t shift;
- period = MAX (suffix, needle_len - suffix) + 1;
- j = 0;
- while (AVAILABLE (haystack, haystack_len, j, needle_len))
- {
- /* Check the last byte first; if it does not match, then
- shift to the next possible match location. */
- shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])];
- if (0 < shift)
- {
- j += shift;
- continue;
- }
- /* Scan for matches in right half. The last byte has
- already been matched, by virtue of the shift table. */
- i = suffix;
- while (i < needle_len - 1 && (CANON_ELEMENT (needle[i])
- == CANON_ELEMENT (haystack[i + j])))
- ++i;
- if (needle_len - 1 <= i)
- {
- /* Scan for matches in left half. */
- i = suffix - 1;
- while (i != SIZE_MAX && (CANON_ELEMENT (needle[i])
- == CANON_ELEMENT (haystack[i + j])))
- --i;
- if (i == SIZE_MAX)
- return (RETURN_TYPE) (haystack + j);
- j += period;
- }
- else
- j += i - suffix + 1;
- }
- }
- return NULL;
-}
-
-#undef AVAILABLE
-#undef CANON_ELEMENT
-#undef CMP_FUNC
-#undef RETURN_TYPE
diff --git a/string/strcasestr.c b/string/strcasestr.c
index 9e1bde9..df41f50 100644
--- a/string/strcasestr.c
+++ b/string/strcasestr.c
@@ -1,6 +1,4 @@
-/* Return the offset of one string within another.
- Copyright (C) 1994, 1996-2000, 2004, 2008, 2009, 2010
- Free Software Foundation, Inc.
+/* Copyright (C) 2012 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -14,40 +12,9 @@
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
+ License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-/*
- * My personal strstr() implementation that beats most other algorithms.
- * Until someone tells me otherwise, I assume that this is the
- * fastest implementation of strstr() in C.
- * I deliberately chose not to comment it. You should have at least
- * as much fun trying to understand it, as I had to write it :-).
- *
- * Stephen R. van den Berg, berg@pool.informatik.rwth-aachen.de */
-
-#if HAVE_CONFIG_H
-# include <config.h>
-#endif
-
-/* Specification. */
-#include <string.h>
-
-#include <ctype.h>
-#include <stdbool.h>
-#include <strings.h>
-
-#define TOLOWER(Ch) (isupper (Ch) ? tolower (Ch) : (Ch))
-
-/* Two-Way algorithm. */
-#define RETURN_TYPE char *
-#define AVAILABLE(h, h_l, j, n_l) \
- (!memchr ((h) + (h_l), '\0', (j) + (n_l) - (h_l)) \
- && ((h_l) = (j) + (n_l)))
-#define CANON_ELEMENT(c) TOLOWER (c)
-#define CMP_FUNC(p1, p2, l) \
- __strncasecmp ((const char *) (p1), (const char *) (p2), l)
-#include "str-two-way.h"
#undef strcasestr
#undef __strcasestr
@@ -56,52 +23,12 @@
#define STRCASESTR __strcasestr
#endif
+#define AS_STRCASESTR
+#include "strstr.h"
-/* Find the first occurrence of NEEDLE in HAYSTACK, using
- case-insensitive comparison. This function gives unspecified
- results in multibyte locales. */
-char *
-STRCASESTR (const char *haystack_start, const char *needle_start)
-{
- const char *haystack = haystack_start;
- const char *needle = needle_start;
- size_t needle_len; /* Length of NEEDLE. */
- size_t haystack_len; /* Known minimum length of HAYSTACK. */
- bool ok = true; /* True if NEEDLE is prefix of HAYSTACK. */
-
- /* Determine length of NEEDLE, and in the process, make sure
- HAYSTACK is at least as long (no point processing all of a long
- NEEDLE if HAYSTACK is too short). */
- while (*haystack && *needle)
- {
- ok &= (TOLOWER ((unsigned char) *haystack)
- == TOLOWER ((unsigned char) *needle));
- haystack++;
- needle++;
- }
- if (*needle)
- return NULL;
- if (ok)
- return (char *) haystack_start;
- needle_len = needle - needle_start;
- haystack = haystack_start + 1;
- haystack_len = needle_len - 1;
-
- /* Perform the search. Abstract memory is considered to be an array
- of 'unsigned char' values, not an array of 'char' values. See
- ISO C 99 section 6.2.6.1. */
- if (needle_len < LONG_NEEDLE_THRESHOLD)
- return two_way_short_needle ((const unsigned char *) haystack,
- haystack_len,
- (const unsigned char *) needle_start,
- needle_len);
- return two_way_long_needle ((const unsigned char *) haystack, haystack_len,
- (const unsigned char *) needle_start,
- needle_len);
-}
-
-#undef LONG_NEEDLE_THRESHOLD
#ifndef NO_ALIAS
weak_alias (__strcasestr, strcasestr)
#endif
+
+
diff --git a/string/strchr.c b/string/strchr.c
index 9d18b7e..04f6eb8 100644
--- a/string/strchr.c
+++ b/string/strchr.c
@@ -1,11 +1,5 @@
-/* Copyright (C) 1991,1993-1997,1999,2000,2003,2006
- Free Software Foundation, Inc.
+/* Copyright (C) 2012 Free Software Foundation, Inc.
This file is part of the GNU C Library.
- Based on strlen implementation by Torbjorn Granlund (tege@sics.se),
- with help from Dan Sahlin (dan@sics.se) and
- bug fix and commentary by Jim Blandy (jimb@ai.mit.edu);
- adaptation to strchr suggested by Dick Karpinski (dick@cca.ucsf.edu),
- and implemented by Roland McGrath (roland@ai.mit.edu).
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
@@ -18,170 +12,17 @@
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
+ License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#include <string.h>
-#include <memcopy.h>
-#include <stdlib.h>
-#undef strchr
-
-/* Find the first occurrence of C in S. */
-char *
-strchr (s, c_in)
- const char *s;
- int c_in;
-{
- const unsigned char *char_ptr;
- const unsigned long int *longword_ptr;
- unsigned long int longword, magic_bits, charmask;
- unsigned char c;
-
- c = (unsigned char) c_in;
-
- /* Handle the first few characters by reading one character at a time.
- Do this until CHAR_PTR is aligned on a longword boundary. */
- for (char_ptr = (const unsigned char *) s;
- ((unsigned long int) char_ptr & (sizeof (longword) - 1)) != 0;
- ++char_ptr)
- if (*char_ptr == c)
- return (void *) char_ptr;
- else if (*char_ptr == '\0')
- return NULL;
-
- /* All these elucidatory comments refer to 4-byte longwords,
- but the theory applies equally well to 8-byte longwords. */
-
- longword_ptr = (unsigned long int *) char_ptr;
-
- /* Bits 31, 24, 16, and 8 of this number are zero. Call these bits
- the "holes." Note that there is a hole just to the left of
- each byte, with an extra at the end:
-
- bits: 01111110 11111110 11111110 11111111
- bytes: AAAAAAAA BBBBBBBB CCCCCCCC DDDDDDDD
-
- The 1-bits make sure that carries propagate to the next 0-bit.
- The 0-bits provide holes for carries to fall into. */
- switch (sizeof (longword))
- {
- case 4: magic_bits = 0x7efefeffL; break;
- case 8: magic_bits = ((0x7efefefeL << 16) << 16) | 0xfefefeffL; break;
- default:
- abort ();
- }
-
- /* Set up a longword, each of whose bytes is C. */
- charmask = c | (c << 8);
- charmask |= charmask << 16;
- if (sizeof (longword) > 4)
- /* Do the shift in two steps to avoid a warning if long has 32 bits. */
- charmask |= (charmask << 16) << 16;
- if (sizeof (longword) > 8)
- abort ();
-
- /* Instead of the traditional loop which tests each character,
- we will test a longword at a time. The tricky part is testing
- if *any of the four* bytes in the longword in question are zero. */
- for (;;)
- {
- /* We tentatively exit the loop if adding MAGIC_BITS to
- LONGWORD fails to change any of the hole bits of LONGWORD.
-
- 1) Is this safe? Will it catch all the zero bytes?
- Suppose there is a byte with all zeros. Any carry bits
- propagating from its left will fall into the hole at its
- least significant bit and stop. Since there will be no
- carry from its most significant bit, the LSB of the
- byte to the left will be unchanged, and the zero will be
- detected.
-
- 2) Is this worthwhile? Will it ignore everything except
- zero bytes? Suppose every byte of LONGWORD has a bit set
- somewhere. There will be a carry into bit 8. If bit 8
- is set, this will carry into bit 16. If bit 8 is clear,
- one of bits 9-15 must be set, so there will be a carry
- into bit 16. Similarly, there will be a carry into bit
- 24. If one of bits 24-30 is set, there will be a carry
- into bit 31, so all of the hole bits will be changed.
-
- The one misfire occurs when bits 24-30 are clear and bit
- 31 is set; in this case, the hole at bit 31 is not
- changed. If we had access to the processor carry flag,
- we could close this loophole by putting the fourth hole
- at bit 32!
-
- So it ignores everything except 128's, when they're aligned
- properly.
-
- 3) But wait! Aren't we looking for C as well as zero?
- Good point. So what we do is XOR LONGWORD with a longword,
- each of whose bytes is C. This turns each byte that is C
- into a zero. */
-
- longword = *longword_ptr++;
-
- /* Add MAGIC_BITS to LONGWORD. */
- if ((((longword + magic_bits)
-
- /* Set those bits that were unchanged by the addition. */
- ^ ~longword)
-
- /* Look at only the hole bits. If any of the hole bits
- are unchanged, most likely one of the bytes was a
- zero. */
- & ~magic_bits) != 0 ||
-
- /* That caught zeroes. Now test for C. */
- ((((longword ^ charmask) + magic_bits) ^ ~(longword ^ charmask))
- & ~magic_bits) != 0)
- {
- /* Which of the bytes was C or zero?
- If none of them were, it was a misfire; continue the search. */
-
- const unsigned char *cp = (const unsigned char *) (longword_ptr - 1);
+#ifndef STRCHR
+#define STRCHR strchr
+#endif
- if (*cp == c)
- return (char *) cp;
- else if (*cp == '\0')
- return NULL;
- if (*++cp == c)
- return (char *) cp;
- else if (*cp == '\0')
- return NULL;
- if (*++cp == c)
- return (char *) cp;
- else if (*cp == '\0')
- return NULL;
- if (*++cp == c)
- return (char *) cp;
- else if (*cp == '\0')
- return NULL;
- if (sizeof (longword) > 4)
- {
- if (*++cp == c)
- return (char *) cp;
- else if (*cp == '\0')
- return NULL;
- if (*++cp == c)
- return (char *) cp;
- else if (*cp == '\0')
- return NULL;
- if (*++cp == c)
- return (char *) cp;
- else if (*cp == '\0')
- return NULL;
- if (*++cp == c)
- return (char *) cp;
- else if (*cp == '\0')
- return NULL;
- }
- }
- }
+#define AS_STRCHR
+#include "strchr.h"
- return NULL;
-}
#ifdef weak_alias
#undef index
diff --git a/string/strchr.h b/string/strchr.h
new file mode 100644
index 0000000..b6ff374
--- /dev/null
+++ b/string/strchr.h
@@ -0,0 +1,70 @@
+/* Copyright (C) 2012 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define unroll 4
+#define prefetch 8
+
+#include "vector.h"
+
+#define TEST_CODE(so,sn) TEST_EQ(sn,vc)
+
+#if defined(AS_STRCHR) || defined(AS_STRRCHR) || defined(AS_STRCHRNUL)
+#define DETECT_ZERO_BYTE
+#endif
+#if defined(AS_MEMCHR) || defined(AS_MEMRCHR)
+#define DETECT_END ((s+ss>=s) ? s+ss : ((uchar*)((long)-1)))
+#endif
+
+
+#ifdef AS_STRCHR
+#define LOOP_END(p) return NULL;
+uchar* STRCHR( const uchar *s, int c )
+#endif
+#ifdef AS_MEMCHR
+#define LOOP_END(p) return NULL;
+uchar* MEMCHR( const uchar *s, int c , size_t ss)
+#endif
+
+#if defined(AS_STRRCHR) || defined(AS_MEMRCHR)
+#define LOOP_BODY(p) r=p;
+#define LOOP_END(p) return r;
+#ifdef AS_STRRCHR
+uchar* STRRCHR( const uchar *s, int c)
+#endif
+#ifdef AS_MEMRCHR
+uchar* MEMRCHR( const uchar *s, int c , size_t ss)
+#endif
+#else
+#define LOOP_BODY(p) return p;
+#endif
+
+#ifdef AS_STRCHRNUL
+#define LOOP_END(p) return p;
+uchar* STRCHRNUL(const uchar *s, int c )
+#endif
+#ifdef AS_RAWMEMCHR
+#define LOOP_END(p) /*cannot happen*/
+uchar* RAWMEMCHR(const uchar *s, int c )
+#endif
+{
+#if defined(AS_STRCHR) || defined(AS_STRRCHR) || defined(AS_STRCHRNUL)
+ if(__builtin_expect(c==0,0)) return s+strlen(s);
+#endif
+ uchar UNUSED *r = NULL;
+ tp_vector vc=BROADCAST(c);
+#include "loop.h"
+}
diff --git a/string/strchrnul.c b/string/strchrnul.c
index 0db5e23..6e6992f 100644
--- a/string/strchrnul.c
+++ b/string/strchrnul.c
@@ -21,149 +21,17 @@
<http://www.gnu.org/licenses/>. */
#include <string.h>
-#include <memcopy.h>
#include <stdlib.h>
#undef __strchrnul
#undef strchrnul
-/* Find the first occurrence of C in S or the final NUL byte. */
-char *
-__strchrnul (s, c_in)
- const char *s;
- int c_in;
-{
- const unsigned char *char_ptr;
- const unsigned long int *longword_ptr;
- unsigned long int longword, magic_bits, charmask;
- unsigned char c;
- c = (unsigned char) c_in;
+#ifndef STRCHRNUL
+#define STRCHRNUL __strchrnul
+#endif
- /* Handle the first few characters by reading one character at a time.
- Do this until CHAR_PTR is aligned on a longword boundary. */
- for (char_ptr = (const unsigned char *) s;
- ((unsigned long int) char_ptr & (sizeof (longword) - 1)) != 0;
- ++char_ptr)
- if (*char_ptr == c || *char_ptr == '\0')
- return (void *) char_ptr;
-
- /* All these elucidatory comments refer to 4-byte longwords,
- but the theory applies equally well to 8-byte longwords. */
-
- longword_ptr = (unsigned long int *) char_ptr;
-
- /* Bits 31, 24, 16, and 8 of this number are zero. Call these bits
- the "holes." Note that there is a hole just to the left of
- each byte, with an extra at the end:
-
- bits: 01111110 11111110 11111110 11111111
- bytes: AAAAAAAA BBBBBBBB CCCCCCCC DDDDDDDD
-
- The 1-bits make sure that carries propagate to the next 0-bit.
- The 0-bits provide holes for carries to fall into. */
- switch (sizeof (longword))
- {
- case 4: magic_bits = 0x7efefeffL; break;
- case 8: magic_bits = ((0x7efefefeL << 16) << 16) | 0xfefefeffL; break;
- default:
- abort ();
- }
-
- /* Set up a longword, each of whose bytes is C. */
- charmask = c | (c << 8);
- charmask |= charmask << 16;
- if (sizeof (longword) > 4)
- /* Do the shift in two steps to avoid a warning if long has 32 bits. */
- charmask |= (charmask << 16) << 16;
- if (sizeof (longword) > 8)
- abort ();
-
- /* Instead of the traditional loop which tests each character,
- we will test a longword at a time. The tricky part is testing
- if *any of the four* bytes in the longword in question are zero. */
- for (;;)
- {
- /* We tentatively exit the loop if adding MAGIC_BITS to
- LONGWORD fails to change any of the hole bits of LONGWORD.
-
- 1) Is this safe? Will it catch all the zero bytes?
- Suppose there is a byte with all zeros. Any carry bits
- propagating from its left will fall into the hole at its
- least significant bit and stop. Since there will be no
- carry from its most significant bit, the LSB of the
- byte to the left will be unchanged, and the zero will be
- detected.
-
- 2) Is this worthwhile? Will it ignore everything except
- zero bytes? Suppose every byte of LONGWORD has a bit set
- somewhere. There will be a carry into bit 8. If bit 8
- is set, this will carry into bit 16. If bit 8 is clear,
- one of bits 9-15 must be set, so there will be a carry
- into bit 16. Similarly, there will be a carry into bit
- 24. If one of bits 24-30 is set, there will be a carry
- into bit 31, so all of the hole bits will be changed.
-
- The one misfire occurs when bits 24-30 are clear and bit
- 31 is set; in this case, the hole at bit 31 is not
- changed. If we had access to the processor carry flag,
- we could close this loophole by putting the fourth hole
- at bit 32!
-
- So it ignores everything except 128's, when they're aligned
- properly.
-
- 3) But wait! Aren't we looking for C as well as zero?
- Good point. So what we do is XOR LONGWORD with a longword,
- each of whose bytes is C. This turns each byte that is C
- into a zero. */
-
- longword = *longword_ptr++;
-
- /* Add MAGIC_BITS to LONGWORD. */
- if ((((longword + magic_bits)
-
- /* Set those bits that were unchanged by the addition. */
- ^ ~longword)
-
- /* Look at only the hole bits. If any of the hole bits
- are unchanged, most likely one of the bytes was a
- zero. */
- & ~magic_bits) != 0 ||
-
- /* That caught zeroes. Now test for C. */
- ((((longword ^ charmask) + magic_bits) ^ ~(longword ^ charmask))
- & ~magic_bits) != 0)
- {
- /* Which of the bytes was C or zero?
- If none of them were, it was a misfire; continue the search. */
-
- const unsigned char *cp = (const unsigned char *) (longword_ptr - 1);
-
- if (*cp == c || *cp == '\0')
- return (char *) cp;
- if (*++cp == c || *cp == '\0')
- return (char *) cp;
- if (*++cp == c || *cp == '\0')
- return (char *) cp;
- if (*++cp == c || *cp == '\0')
- return (char *) cp;
- if (sizeof (longword) > 4)
- {
- if (*++cp == c || *cp == '\0')
- return (char *) cp;
- if (*++cp == c || *cp == '\0')
- return (char *) cp;
- if (*++cp == c || *cp == '\0')
- return (char *) cp;
- if (*++cp == c || *cp == '\0')
- return (char *) cp;
- }
- }
- }
-
- /* This should never happen. */
- return NULL;
-}
+#define AS_STRCHRNUL
+#include "strchr.h"
weak_alias (__strchrnul, strchrnul)
diff --git a/string/strlen.c b/string/strlen.c
index 5c1efda..7fdf07e 100644
--- a/string/strlen.c
+++ b/string/strlen.c
@@ -1,8 +1,5 @@
-/* Copyright (C) 1991,1993,1997,2000,2003,2009 Free Software Foundation, Inc.
+/* Copyright (C) 2012 Free Software Foundation, Inc.
This file is part of the GNU C Library.
- Written by Torbjorn Granlund (tege@sics.se),
- with help from Dan Sahlin (dan@sics.se);
- commentary by Jim Blandy (jimb@ai.mit.edu).
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
@@ -15,92 +12,14 @@
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
+ License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#include <string.h>
-#include <stdlib.h>
-#undef strlen
+#ifndef STRLEN
+#define STRLEN strlen
+#endif
-/* Return the length of the null-terminated string STR. Scan for
- the null terminator quickly by testing four bytes at a time. */
-size_t
-strlen (str)
- const char *str;
-{
- const char *char_ptr;
- const unsigned long int *longword_ptr;
- unsigned long int longword, himagic, lomagic;
+#define AS_STRLEN
+#include "strlen.h"
- /* Handle the first few characters by reading one character at a time.
- Do this until CHAR_PTR is aligned on a longword boundary. */
- for (char_ptr = str; ((unsigned long int) char_ptr
- & (sizeof (longword) - 1)) != 0;
- ++char_ptr)
- if (*char_ptr == '\0')
- return char_ptr - str;
-
- /* All these elucidatory comments refer to 4-byte longwords,
- but the theory applies equally well to 8-byte longwords. */
-
- longword_ptr = (unsigned long int *) char_ptr;
-
- /* Bits 31, 24, 16, and 8 of this number are zero. Call these bits
- the "holes." Note that there is a hole just to the left of
- each byte, with an extra at the end:
-
- bits: 01111110 11111110 11111110 11111111
- bytes: AAAAAAAA BBBBBBBB CCCCCCCC DDDDDDDD
-
- The 1-bits make sure that carries propagate to the next 0-bit.
- The 0-bits provide holes for carries to fall into. */
- himagic = 0x80808080L;
- lomagic = 0x01010101L;
- if (sizeof (longword) > 4)
- {
- /* 64-bit version of the magic. */
- /* Do the shift in two steps to avoid a warning if long has 32 bits. */
- himagic = ((himagic << 16) << 16) | himagic;
- lomagic = ((lomagic << 16) << 16) | lomagic;
- }
- if (sizeof (longword) > 8)
- abort ();
-
- /* Instead of the traditional loop which tests each character,
- we will test a longword at a time. The tricky part is testing
- if *any of the four* bytes in the longword in question are zero. */
- for (;;)
- {
- longword = *longword_ptr++;
-
- if (((longword - lomagic) & ~longword & himagic) != 0)
- {
- /* Which of the bytes was the zero? If none of them were, it was
- a misfire; continue the search. */
-
- const char *cp = (const char *) (longword_ptr - 1);
-
- if (cp[0] == 0)
- return cp - str;
- if (cp[1] == 0)
- return cp - str + 1;
- if (cp[2] == 0)
- return cp - str + 2;
- if (cp[3] == 0)
- return cp - str + 3;
- if (sizeof (longword) > 4)
- {
- if (cp[4] == 0)
- return cp - str + 4;
- if (cp[5] == 0)
- return cp - str + 5;
- if (cp[6] == 0)
- return cp - str + 6;
- if (cp[7] == 0)
- return cp - str + 7;
- }
- }
- }
-}
-libc_hidden_builtin_def (strlen)
diff --git a/string/strlen.h b/string/strlen.h
new file mode 100644
index 0000000..950b8e1
--- /dev/null
+++ b/string/strlen.h
@@ -0,0 +1,39 @@
+/* Copyright (C) 2012 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+
+#define unroll 4
+#define prefetch 8
+
+#include "vector.h"
+
+#define DETECT_ZERO_BYTE
+#define TEST_CODE(so,sn) vzero
+#define LOOP_BODY(p) return p-s;
+/* Shared body for strlen and strnlen: every exit returns p-s, the offset of the stopping position from S; AS_STRNLEN additionally defines DETECT_END from SS. The scan itself comes from loop.h. */
+#ifdef AS_STRNLEN
+#define DETECT_END ((s+ss>=s) ? s+ss : ((uchar*)((long)-1))) /* s+ss, or the all-ones address when s+ss compares below s. */
+#define LOOP_END(p) return p-s;
+size_t STRNLEN( uchar *s , size_t ss )
+#endif
+#ifdef AS_STRLEN
+#define LOOP_END(p) return p-s;
+size_t STRLEN( uchar *s )
+#endif
+{
+#include "loop.h"
+}
diff --git a/string/strnlen.c b/string/strnlen.c
index 65b9aa6..90b9725 100644
--- a/string/strnlen.c
+++ b/string/strnlen.c
@@ -1,15 +1,10 @@
-/* Find the length of STRING, but scan at most MAXLEN characters.
- Copyright (C) 1991, 1993, 1997, 2000, 2001, 2005, 2011 Free Software Foundation, Inc.
- Contributed by Jakub Jelinek <jakub@redhat.com>.
-
- Based on strlen written by Torbjorn Granlund (tege@sics.se),
- with help from Dan Sahlin (dan@sics.se);
- commentary by Jim Blandy (jimb@ai.mit.edu).
+/* Copyright (C) 2012 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public License as
- published by the Free Software Foundation; either version 2.1 of the
- License, or (at your option) any later version.
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -17,149 +12,18 @@
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; see the file COPYING.LIB. If
- not, see <http://www.gnu.org/licenses/>. */
-
-#include <string.h>
-#include <stdlib.h>
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
-/* Find the length of S, but scan at most MAXLEN characters. If no
- '\0' terminator is found in that many characters, return MAXLEN. */
-#ifdef STRNLEN
-# define __strnlen STRNLEN
+#ifndef STRNLEN
+#define STRNLEN __strnlen
#endif
-size_t
-__strnlen (const char *str, size_t maxlen)
-{
- const char *char_ptr, *end_ptr = str + maxlen;
- const unsigned long int *longword_ptr;
- unsigned long int longword, himagic, lomagic;
-
- if (maxlen == 0)
- return 0;
-
- if (__builtin_expect (end_ptr < str, 0))
- end_ptr = (const char *) ~0UL;
-
- /* Handle the first few characters by reading one character at a time.
- Do this until CHAR_PTR is aligned on a longword boundary. */
- for (char_ptr = str; ((unsigned long int) char_ptr
- & (sizeof (longword) - 1)) != 0;
- ++char_ptr)
- if (*char_ptr == '\0')
- {
- if (char_ptr > end_ptr)
- char_ptr = end_ptr;
- return char_ptr - str;
- }
-
- /* All these elucidatory comments refer to 4-byte longwords,
- but the theory applies equally well to 8-byte longwords. */
-
- longword_ptr = (unsigned long int *) char_ptr;
-
- /* Bits 31, 24, 16, and 8 of this number are zero. Call these bits
- the "holes." Note that there is a hole just to the left of
- each byte, with an extra at the end:
-
- bits: 01111110 11111110 11111110 11111111
- bytes: AAAAAAAA BBBBBBBB CCCCCCCC DDDDDDDD
-
- The 1-bits make sure that carries propagate to the next 0-bit.
- The 0-bits provide holes for carries to fall into. */
- himagic = 0x80808080L;
- lomagic = 0x01010101L;
- if (sizeof (longword) > 4)
- {
- /* 64-bit version of the magic. */
- /* Do the shift in two steps to avoid a warning if long has 32 bits. */
- himagic = ((himagic << 16) << 16) | himagic;
- lomagic = ((lomagic << 16) << 16) | lomagic;
- }
- if (sizeof (longword) > 8)
- abort ();
+#define AS_STRNLEN
+#include "strlen.h"
- /* Instead of the traditional loop which tests each character,
- we will test a longword at a time. The tricky part is testing
- if *any of the four* bytes in the longword in question are zero. */
- while (longword_ptr < (unsigned long int *) end_ptr)
- {
- /* We tentatively exit the loop if adding MAGIC_BITS to
- LONGWORD fails to change any of the hole bits of LONGWORD.
-
- 1) Is this safe? Will it catch all the zero bytes?
- Suppose there is a byte with all zeros. Any carry bits
- propagating from its left will fall into the hole at its
- least significant bit and stop. Since there will be no
- carry from its most significant bit, the LSB of the
- byte to the left will be unchanged, and the zero will be
- detected.
-
- 2) Is this worthwhile? Will it ignore everything except
- zero bytes? Suppose every byte of LONGWORD has a bit set
- somewhere. There will be a carry into bit 8. If bit 8
- is set, this will carry into bit 16. If bit 8 is clear,
- one of bits 9-15 must be set, so there will be a carry
- into bit 16. Similarly, there will be a carry into bit
- 24. If one of bits 24-30 is set, there will be a carry
- into bit 31, so all of the hole bits will be changed.
-
- The one misfire occurs when bits 24-30 are clear and bit
- 31 is set; in this case, the hole at bit 31 is not
- changed. If we had access to the processor carry flag,
- we could close this loophole by putting the fourth hole
- at bit 32!
-
- So it ignores everything except 128's, when they're aligned
- properly. */
-
- longword = *longword_ptr++;
-
- if ((longword - lomagic) & himagic)
- {
- /* Which of the bytes was the zero? If none of them were, it was
- a misfire; continue the search. */
-
- const char *cp = (const char *) (longword_ptr - 1);
-
- char_ptr = cp;
- if (cp[0] == 0)
- break;
- char_ptr = cp + 1;
- if (cp[1] == 0)
- break;
- char_ptr = cp + 2;
- if (cp[2] == 0)
- break;
- char_ptr = cp + 3;
- if (cp[3] == 0)
- break;
- if (sizeof (longword) > 4)
- {
- char_ptr = cp + 4;
- if (cp[4] == 0)
- break;
- char_ptr = cp + 5;
- if (cp[5] == 0)
- break;
- char_ptr = cp + 6;
- if (cp[6] == 0)
- break;
- char_ptr = cp + 7;
- if (cp[7] == 0)
- break;
- }
- }
- char_ptr = end_ptr;
- }
-
- if (char_ptr > end_ptr)
- char_ptr = end_ptr;
- return char_ptr - str;
-}
-#ifndef STRNLEN
+#ifndef NO_ALIAS
weak_alias (__strnlen, strnlen)
#endif
-libc_hidden_def (strnlen)
+
diff --git a/string/strrchr.c b/string/strrchr.c
index a986ff9..9d2d6c4 100644
--- a/string/strrchr.c
+++ b/string/strrchr.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 1991, 1995, 1996, 1997, 2003 Free Software Foundation, Inc.
+/* Copyright (C) 2012 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -12,38 +12,19 @@
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
+ License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#include <string.h>
-#undef strrchr
-
-/* Find the last occurrence of C in S. */
-char *
-strrchr (const char *s, int c)
-{
- register const char *found, *p;
-
- c = (unsigned char) c;
-
- /* Since strchr is fast, we use it rather than the obvious loop. */
-
- if (c == '\0')
- return strchr (s, '\0');
-
- found = NULL;
- while ((p = strchr (s, c)) != NULL)
- {
- found = p;
- s = p + 1;
- }
+#ifndef STRRCHR
+#define STRRCHR strrchr
+#endif
- return (char *) found;
-}
+#define AS_STRRCHR
+#include "strchr.h"
#ifdef weak_alias
#undef rindex /* Drop any macro named rindex before the alias of that name is defined. */
weak_alias (strrchr, rindex)
#endif
libc_hidden_builtin_def (strrchr)
diff --git a/string/strstr.c b/string/strstr.c
index 10e6fdc..f09559a 100644
--- a/string/strstr.c
+++ b/string/strstr.c
@@ -1,6 +1,4 @@
-/* Return the offset of one string within another.
- Copyright (C) 1994,1996,1997,2000,2001,2003,2008,2009
- Free Software Foundation, Inc.
+/* Copyright (C) 2012 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -14,78 +12,14 @@
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
+ License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-/* This particular implementation was written by Eric Blake, 2008. */
-
-#ifndef _LIBC
-# include <config.h>
-#endif
-
-/* Specification of strstr. */
-#include <string.h>
-
-#include <stdbool.h>
-
-#ifndef _LIBC
-# define __builtin_expect(expr, val) (expr)
-#endif
-
-#define RETURN_TYPE char *
-#define AVAILABLE(h, h_l, j, n_l) \
- (!memchr ((h) + (h_l), '\0', (j) + (n_l) - (h_l)) \
- && ((h_l) = (j) + (n_l)))
-#include "str-two-way.h"
-
-#undef strstr
#ifndef STRSTR
#define STRSTR strstr
#endif
-/* Return the first occurrence of NEEDLE in HAYSTACK. Return HAYSTACK
- if NEEDLE is empty, otherwise NULL if NEEDLE is not found in
- HAYSTACK. */
-char *
-STRSTR (const char *haystack_start, const char *needle_start)
-{
- const char *haystack = haystack_start;
- const char *needle = needle_start;
- size_t needle_len; /* Length of NEEDLE. */
- size_t haystack_len; /* Known minimum length of HAYSTACK. */
- bool ok = true; /* True if NEEDLE is prefix of HAYSTACK. */
-
- /* Determine length of NEEDLE, and in the process, make sure
- HAYSTACK is at least as long (no point processing all of a long
- NEEDLE if HAYSTACK is too short). */
- while (*haystack && *needle)
- ok &= *haystack++ == *needle++;
- if (*needle)
- return NULL;
- if (ok)
- return (char *) haystack_start;
-
- /* Reduce the size of haystack using strchr, since it has a smaller
- linear coefficient than the Two-Way algorithm. */
- needle_len = needle - needle_start;
- haystack = strchr (haystack_start + 1, *needle_start);
- if (!haystack || __builtin_expect (needle_len == 1, 0))
- return (char *) haystack;
- needle -= needle_len;
- haystack_len = (haystack > haystack_start + needle_len ? 1
- : needle_len + haystack_start - haystack);
-
- /* Perform the search. Abstract memory is considered to be an array
- of 'unsigned char' values, not an array of 'char' values. See
- ISO C 99 section 6.2.6.1. */
- if (needle_len < LONG_NEEDLE_THRESHOLD)
- return two_way_short_needle ((const unsigned char *) haystack,
- haystack_len,
- (const unsigned char *) needle, needle_len);
- return two_way_long_needle ((const unsigned char *) haystack, haystack_len,
- (const unsigned char *) needle, needle_len);
-}
-libc_hidden_builtin_def (strstr)
+#define AS_STRSTR
+#include "strstr.h"
-#undef LONG_NEEDLE_THRESHOLD
diff --git a/string/strstr.h b/string/strstr.h
new file mode 100644
index 0000000..016cc94
--- /dev/null
+++ b/string/strstr.h
@@ -0,0 +1,297 @@
+/* Copyright (C) 2012 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <stdlib.h>
+#include <string.h>
+
+#define unroll 4
+#define prefetch 8
+#define small_treshold 128
+
+#include "vector.h"
+
+
+
+#ifdef AS_STRSTR
+#define _AS_STR_CASESTR_MEM(x,y,z) x
+#endif
+#ifdef AS_STRCASESTR
+#define _AS_STR_CASESTR_MEM(x,y,z) y
+#endif
+#ifdef AS_MEMMEM
+#define _AS_STR_CASESTR_MEM(x,y,z) z
+#endif
+#define CHAR(x) _AS_STR_CASESTR_MEM(*(x),\
+tolower_fixed[*(x)],\
+*(x))
+
+/* Return how many of the first NO positions of A and B agree under CHAR, stepping both pointers by DIR after each match (1 forward, -1 backward). TODO vectorize. */
+SI size_t strcmp_dir(const uchar *a,const uchar *b,size_t no,int dir)
+{
+ size_t i;
+ for(i=0; i<no && CHAR(a)==CHAR(b); i++)
+ {
+ a+=dir;
+ b+=dir;
+ }
+ return i;
+}
+
+/* Two way algorithm: CROCHEMORE M., PERRIN D., 1991,
+ Two-way string-matching, Journal of the ACM 38(3):651-675.
+ Implementation based on http://www-igm.univ-mlv.fr/~lecroq/string/node26.html
+
+
+ We use a vectorized algorithm to find occurrences of fragments
+ of size ns-check starting at n+check-1.
+ On each occurrence we do a step of the two way algorithm and
+ tell the finder in the skip_to variable where it should resume the search.
+*/
+
+static void two_way_preprocessing(uchar *n,size_t ns,size_t *per2,size_t *ell2,size_t *peri);
+static uchar *strstr_two_way(uchar *s, uchar *s_end, uchar *n, size_t ns)
+{ /* Search for needle N (length NS) from S: strstr_vec.h/loop.h find candidates, LOOP_BODY below verifies each one. */
+ size_t ell, per, peri;
+ two_way_preprocessing(n,ns,&per,&ell,&peri); /* Fills per, ell and peri. */
+ size_t fw,fw_no,bw,bw_no;
+ size_t fw_from,fw_to,bw_from,bw_to;
+ size_t check=ns-2; /* NOTE(review): wraps when ns < 2 - confirm callers guarantee ns >= 2. */
+ fw_from = ell;
+ fw_to = max(ell,check);
+ bw_from = min(ell,check);
+ bw_to = 0;
+ fw_no = fw_to - fw_from; /* Characters compared forward, starting at offset ell. */
+ bw_no = bw_from - bw_to; /* Characters compared backward, starting below min(ell,check). */
+
+ uchar *skip_to=s+check;
+ s+=ns-2;
+
+#define CAN_SKIP
+#define CASE_CONVERT(x) _AS_STR_CASESTR_MEM(x, parallel_tolower(x), x)
+#define MASK_CONVERT(x) CHAR(&x)
+
+#define LOOP_BODY(p)\
+ p -= ns - 1;\
+ fw = strcmp_dir(n + fw_from ,p + fw_from, fw_no , 1);\
+ if (fw != fw_no )\
+ {\
+ p += fw + 1;\
+ }\
+ else\
+ {\
+ bw = strcmp_dir(n + bw_from - 1, p + bw_from - 1, bw_no, -1);\
+ if ( bw != bw_no )\
+ {\
+ p += per;\
+ if (peri) /*Prefix memoization see definition of peri.*/\
+ {\
+ if(_AS_STR_CASESTR_MEM(0,0,p+ns>s_end)) return NULL;\
+ /*Zero byte in forward check causes mismatch.*/\
+ fw = strcmp_dir(n + ns - per ,p + ns - per, per , 1);\
+ if (fw != per )\
+ {\
+ p += fw+ (ns-per-ell) + 1;\
+ }\
+ else\
+ {\
+ /*Backward scan always succeeds.*/\
+ return p;\
+ }\
+ }\
+ }\
+ else\
+ {\
+ return p;\
+ }\
+ }\
+ skip_to = p + (ns - 1);
+
+#include "strstr_vec.h"
+}
+
+#ifdef AS_STRCASESTR
+
+#endif
+static uchar *strstr_vec(uchar *s,uchar *s_end,uchar *n,size_t ns)
+{ /* Vector search for needle N (length NS) from S; switches to strstr_two_way once rent exceeds buy+2*(p-s). */
+#ifdef AS_STRCASESTR
+#define CASECHECK(u) (tolower_class_no[u]==1 || (tolower_class_no[u]==2 && (tolower_class[u][0]^tolower_class[u][1])==32))
+ if (!(CASECHECK(n[ns-1]) || CASECHECK(n[ns-2]))) /* NOTE(review): n[ns-2] is n[-1] when ns == 1 - confirm callers guarantee ns >= 2. */
+ return strstr_two_way(s,s_end,n,ns);
+#undef CASECHECK
+#endif
+ size_t buy=8*ns+64,rent=0;
+ size_t check_last=_AS_STR_CASESTR_MEM(2,0,2);
+ tp_mask phase2mask=0;
+ uchar phase2n[BYTES_AT_ONCE]; /* Only the top min(ns-check_last,BYTES_AT_ONCE) bytes are filled; phase2mask marks exactly those. */
+ int ii;
+ for (ii=0; ii<min(ns-check_last,BYTES_AT_ONCE); ii++) /* NOTE(review): ns-check_last wraps if ns < check_last - confirm this cannot occur. */
+ {
+ phase2n[BYTES_AT_ONCE-1-ii]=CHAR(n+ns-1-check_last-ii);
+ phase2mask|=bit_i(BYTES_AT_ONCE-1-ii);
+ }
+ tp_vector phase2v=LOAD_UNALIGNED(phase2n);
+#define PHASE2_CONVERT(x) _AS_STR_CASESTR_MEM(x, parallel_tolower(x), x)
+ /*TODO use pcmpistrm to possibly kill next 15 positions*/
+#define PHASE2TEST ((get_mask(TEST_EQ(\
+ PHASE2_CONVERT(LOAD_UNALIGNED(\
+ p+ns-check_last-BYTES_AT_ONCE)),\
+ phase2v))&phase2mask)==phase2mask)
+ size_t check = ns - min(ns, BYTES_AT_ONCE+check_last); /* Needle characters left for strcmp_dir after the vector test. */
+ s += ns-2;
+ tp_vector UNUSED diff=BROADCAST('A'^'a');
+#define CASE_CONVERT(x) _AS_STR_CASESTR_MEM(x, OR(x,diff), x)
+#define MASK_CONVERT(x) _AS_STR_CASESTR_MEM(x, x|('A'^'a'), x)
+#define LOOP_BODY(p)\
+ p -= ns - 1;\
+ if(PHASE2TEST){\
+ size_t checked=strcmp_dir(p + check - 1,n + check - 1,check , -1);\
+ if (checked == check)\
+ return p;\
+ rent+=checked;\
+ if(buy+2*(p-s)<rent)\
+ return strstr_two_way(p,s_end,n,ns);\
+ }
+
+#include "strstr_vec.h"
+}
+
+
+/* Entry point for strstr, strcasestr or memmem (selected by AS_*): return the first occurrence of needle N in haystack S, or NULL if there is none. A strchr/memchr-driven scan runs first; once rent plus the distance scanned exceeds buy, the search is handed to strstr_vec. */
+#ifdef AS_STRSTR
+uchar *STRSTR(const uchar *s,const uchar *n)
+#endif
+#ifdef AS_STRCASESTR
+uchar *STRCASESTR(const uchar *s,const uchar *n)
+#endif
+#ifdef AS_MEMMEM
+uchar *MEMMEM(const uchar *s,size_t ss,const uchar *n,size_t ns)
+#endif
+{
+#ifdef AS_STRCASESTR
+ if(!calc_tolower_class) calc_tolower_cls(); /*TODO recalculate when locale changes. */
+#endif
+ size_t buy=small_treshold,rent=0;
+ uchar *p=(uchar*)s;
+#if defined( AS_STRSTR) || defined(AS_STRCASESTR)
+ /* TODO handle case when ss<ns by searching for end of n,s in parallel.*/
+ size_t ns=0,ss=(size_t)-1; /* No length limit for NUL-terminated input; ss must be initialized because s_end below reads it. */
+ while(n[ns])
+ {
+ if(!s[ns]) return NULL; /* Haystack is shorter than the needle. */
+ ns++;
+ }
+#else
+ if( ns > ss) return NULL;
+#endif
+ if (!ns) return (uchar*) s; /* An empty needle matches at the start. */
+ uchar *s_end=(uchar*)((s+ss>=s) ? s+ss : ((uchar*)((long)-1)));
+ /*For strstr and memmem this decreases startup cost.
+ For strcasestr we align haystack.*/
+ size_t check=ns-_AS_STR_CASESTR_MEM(1,0,1);
+ size_t page_offset= ((size_t)s)%4096;
+ p += check;
+ while(1)
+ {
+#define STRCHR(s,sn,c) _AS_STR_CASESTR_MEM( strchr((char*)s,c),\
+ (*(s-1) ? s : NULL),\
+ memchr((void*)s,c,sn))
+ /*strpbrk(s,tolower_class[(uchar) c]) is too slow -cca 100 cycles.*/
+ p=(uchar*) STRCHR(p,s_end-p,((char*)n)[ns-1]);
+ if(!p) return NULL;
+ p -= check;
+ size_t checked = strcmp_dir(n, p, check, 1);
+ if (checked == check) return p;
+ rent += check + 32;
+ /*The next implementation is faster but has a large startup cost.*/
+ if(buy < rent + (p - s) &&
+ p >= s - page_offset +BYTES_AT_ONCE)
+ {
+ /*The next implementation needs two invariants.
+ First is that the string started before the position that is passed.
+ Second is that p - BYTES_AT_ONCE is valid memory.*/
+ return strstr_vec((uchar*)p+1,s_end,(uchar*)n,ns);
+ }
+ p++;
+ p += check;
+ }
+}
+
+/* Two way preprocessing: scan needle N of length NS (characters read through CHAR), return the suffix start ms and store p+1 in *PER; INVERT flips the order used to compare characters. Appears to be maxSuf/maxSufTilde of the cited Two-Way reference with indices shifted by one - confirm. */
+SI size_t maxSuf(uchar *n, size_t ns, size_t *per, size_t invert)
+{
+ /*Note that per+ms+1<ns.*/
+ size_t p,ms, j, k;
+ uchar a, b;
+
+ ms = 0;
+ j = 1;
+ k = p = 0;
+ while (j + k < ns)
+ {
+ a = CHAR(n + j + k);
+ b = CHAR(n + ms + k);
+ if (invert ? (a > b) : (a < b))
+ {
+ j += k;
+ k = 0;
+ p = j - ms;
+ j++;
+ }
+ else if (a == b)
+ {
+ if (k == p) /* A full run of p+1 equal characters: advance j past it. */
+ {
+ j += k;
+ k = 0;
+ j++;
+ }
+ else
+ {
+ k++;
+ }
+ }
+ else /* invert ? a < b : a > b*/
+ {
+ ms = j; /* Restart with the candidate suffix at j. */
+ j++;
+ k = p = 0;
+ }
+ }
+ *per =(p+1);
+ return ms;
+}
+/* Return 1 if the first SIZ characters of A and B agree under CHAR, 0 otherwise. */
+SI size_t periodic(uchar *a,uchar *b,size_t siz)
+{
+ return strcmp_dir(a,b,siz,1)==siz;
+}
+/* Compute the Two-Way parameters of needle N (length NS): *ELL2 gets the larger of the two maxSuf results, *PER2 its period, *PERI whether N and N+per agree on ell characters. */
+static void two_way_preprocessing(uchar *n,size_t ns,size_t *per2,size_t *ell2,size_t *peri)
+{
+ size_t u,v,up,vp;
+ size_t per,ell;
+ u=maxSuf(n,ns,&up,0);
+ v=maxSuf(n,ns,&vp,1);
+ ell = (u > v) ? u : v;
+ per = (u > v) ? up : vp;
+ *peri = periodic(n, n + per, ell);
+ if (!*peri) /* Not periodic: use max(ell, ns-ell)+1 as the shift instead. */
+ per = max(ell, ns - ell) + 1;
+ *per2=per;
+ *ell2=ell;
+}
diff --git a/string/strstr_vec.h b/string/strstr_vec.h
new file mode 100644
index 0000000..b257e09
--- /dev/null
+++ b/string/strstr_vec.h
@@ -0,0 +1,52 @@
+/* Copyright (C) 2012 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+
+tp_vector vn0=BROADCAST(MASK_CONVERT(n[ns-1-0]));
+tp_vector vn1=BROADCAST(MASK_CONVERT(n[ns-1-1]));
+tp_vector e0,e1;
+#ifdef AS_STRSTR
+#define DETECT_ZERO_BYTE
+#endif
+#ifdef AS_STRCASESTR
+#define DETECT_ZERO_BYTE
+#endif
+#ifdef AS_MEMMEM
+#define DETECT_END s_end
+#endif
+
+#ifdef USE_ARITHMETIC
+#define TEST_CODE(so,sn) vzero;\
+ e0 =XOR(CONCAT(sn,so,BYTES_AT_ONCE-0),vn0);\
+ e1 =XOR(CONCAT(sn,so,BYTES_AT_ONCE-1),vn1);\
+ mvec=TEST_ZERO(OR(e0,e1));
+#else
+#define TEST_CODE(so,sn) vzero;\
+ sn = CASE_CONVERT(sn);\
+ e0 = TEST_EQ(CONCAT(sn,so,BYTES_AT_ONCE-0),vn0); \
+ e1 = TEST_EQ(CONCAT(sn,so,BYTES_AT_ONCE-1),vn1); \
+ mvec = (AND(e0,e1));
+#endif
+
+#define LOOP_END(p) return NULL;
+#include "loop.h"
+
+#undef TEST_CODE
+#undef LOOP_BODY
+#undef LOOP_END
+#undef CASE_CONVERT
+#undef MASK_CONVERT
diff --git a/string/vector.h b/string/vector.h
new file mode 100644
index 0000000..f4479cd
--- /dev/null
+++ b/string/vector.h
@@ -0,0 +1,120 @@
+/* Copyright (C) 2012 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+
+/* Vectorized functions for string matching. They operate on many (4, 8, 16 or 32) unsigned bytes at once. Allowed operations are:
+ TEST_ZERO(x) - set highest bit of bytes that were zero to 1 and 0 otherwise.
+ TEST_EQ(x,y) - set highest bit of bytes that are equal to 1 and 0 otherwise.
+ BROADCAST(c) - return a vector such that all bytes have value c.
+ TEST_RANGE(x,y,z) - set highest bit of bytes for which xi <= yi <= zi to 1 and 0 otherwise. The condition zi-xi<128 must hold.
+ AND,OR,XOR,ANDNOT - do the logic operation bytewise.
+ SHIFT_UP(x,k), SHIFT_DOWN(x,k) - shift vector x by k bytes up/down.
+ CONCAT(xlow,xhigh,k) - concatenate xlow,xhigh and return bytes from the k-th.
+ In shifts and concatenation k must be constant.
+
+ To support other vector extensions see the sysdeps/x86_64/sse.h file.
+*/
+typedef unsigned char uchar;
+#define SI static inline
+#define UNUSED __attribute__((unused))
+
+#include <stdlib.h>
+#include <ctype.h>
+/*TODO these tables should be recalculated when locale changes.*/
+static uchar _tolower_class[512];
+static uchar *tolower_class[256];
+static uchar tolower_class_no[256];
+static uchar tolower_fixed[256];
+static int calc_tolower_class=0;
+SI void calc_tolower_cls(void)
+{
+ int i,j;
+ uchar *p=_tolower_class;
+ /* Build tolower_fixed: tolower is applied only to bytes for which isupper
+ holds, every other byte maps to itself.*/
+ /* Second reason for the table: a tolower call is slow because the compiler spills all used xmm registers.*/
+ for (i=0; i<256; i++) tolower_fixed[i] = isupper(i) ? tolower(i) : i;
+ /* Calculate equivalence classes: bytes with equal tolower_fixed values share one 0-terminated member list in _tolower_class.*/
+ for (i=0; i<256; i++)
+ {
+ for(j=0; j<i; j++) if(tolower_fixed[i]==tolower_fixed[j])
+ {
+ tolower_class_no[i]=tolower_class_no[j];
+ tolower_class[i]=tolower_class[j];
+ goto skip;
+ }
+ tolower_class[i] =p;
+ tolower_class_no[i]=0;
+ for(j=i; j<256; j++)
+ {
+ if(tolower_fixed[i]==tolower_fixed[j])
+ {
+ tolower_class_no[i]++;
+ *p++=j;
+ }
+ }
+ *p++=0;
+skip:
+ ;
+ }
+ calc_tolower_class=1;
+}
+
+#define BYTES_AT_ONCE sizeof(tp_vector)
+#define PARA (BYTES_AT_ONCE*unroll)
+#define VSIZ_BYTE sizeof(tp_vector)
+#define VSIZ_BIT (VSIZ_BYTE*8)
+#define MSIZ_BYTE sizeof(tp_mask)
+#define MSIZ_BIT (MSIZ_BYTE*8)
+
+#define ALIGN(x,u) s_offset=((size_t) x)%((u)*BYTES_AT_ONCE); s2=(uchar *)(((size_t) x)&((long) (~(u*BYTES_AT_ONCE-1))));
+/*The line s2=x-offset; is clearer, but some compilers do not know that s2 is aligned.*/
+
+#define CACHE_LINE_SIZE 64
+#define UN_OP(n,e) SI tp_vector n(tp_vector x){ return e;}
+#define BIN_OP(n,e) SI tp_vector n(tp_vector x,tp_vector y){ return e;}
+#define MASK_OP(name,exp) SI tp_mask name(tp_mask x,int y){ return exp; }
+
+#if defined( USE_SSE2) | defined(USE_SSE2_NO_BSF) | defined(USE_SSSE3) | defined(USE_SSE4_1)
+#include "sse.h"
+#else
+#include "arit.h"
+#endif
+#undef UN_OP
+#undef BIN_OP
+#undef MASK_OP
+
+#if unroll==1
+#define DO_ACTION ACTION(0)
+#define AGREGATE_VECTOR mvec0
+#elif unroll==2
+#define DO_ACTION ACTION(0) ACTION(1)
+#define AGREGATE_VECTOR OR(mvec0,mvec1)
+#elif unroll==4
+#define DO_ACTION ACTION(0) ACTION(1) ACTION(2) ACTION(3)
+#define AGREGATE_VECTOR OR(OR(mvec0,mvec1),OR(mvec2,mvec3))
+#endif
+/* Return the smaller of X and Y. */
+SI size_t min(size_t x,size_t y)
+{
+ return x<y ? x : y;
+}
+SI size_t max(size_t x,size_t y) /* Return the larger of X and Y. */
+{
+ return x>y ? x : y;
+}
+
diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S
deleted file mode 100644
index dcc8bef..0000000
--- a/sysdeps/x86_64/memchr.S
+++ /dev/null
@@ -1,311 +0,0 @@
-/* Copyright (C) 2011 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-/* fast SSE2 version with using pmaxub and 64 byte loop */
-
- .text
-ENTRY(memchr)
- movd %rsi, %xmm1
- mov %rdi, %rcx
-
- punpcklbw %xmm1, %xmm1
- test %rdx, %rdx
- jz L(return_null)
- punpcklbw %xmm1, %xmm1
-
- and $63, %rcx
- pshufd $0, %xmm1, %xmm1
-
- cmp $48, %rcx
- ja L(crosscache)
-
- movdqu (%rdi), %xmm0
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %eax
- test %eax, %eax
-
- jnz L(matches_1)
- sub $16, %rdx
- jbe L(return_null)
- add $16, %rdi
- and $15, %rcx
- and $-16, %rdi
- add %rcx, %rdx
- sub $64, %rdx
- jbe L(exit_loop)
- jmp L(loop_prolog)
-
- .p2align 4
-L(crosscache):
- and $15, %rcx
- and $-16, %rdi
- movdqa (%rdi), %xmm0
-
- pcmpeqb %xmm1, %xmm0
-/* Check if there is a match. */
- pmovmskb %xmm0, %eax
-/* Remove the leading bytes. */
- sar %cl, %eax
- test %eax, %eax
- je L(unaligned_no_match)
-/* Check which byte is a match. */
- bsf %eax, %eax
-
- sub %rax, %rdx
- jbe L(return_null)
- add %rdi, %rax
- add %rcx, %rax
- ret
-
- .p2align 4
-L(unaligned_no_match):
- add %rcx, %rdx
- sub $16, %rdx
- jbe L(return_null)
- add $16, %rdi
- sub $64, %rdx
- jbe L(exit_loop)
-
- .p2align 4
-L(loop_prolog):
- movdqa (%rdi), %xmm0
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %eax
- test %eax, %eax
- jnz L(matches)
-
- movdqa 16(%rdi), %xmm2
- pcmpeqb %xmm1, %xmm2
- pmovmskb %xmm2, %eax
- test %eax, %eax
- jnz L(matches16)
-
- movdqa 32(%rdi), %xmm3
- pcmpeqb %xmm1, %xmm3
- pmovmskb %xmm3, %eax
- test %eax, %eax
- jnz L(matches32)
-
- movdqa 48(%rdi), %xmm4
- pcmpeqb %xmm1, %xmm4
- add $64, %rdi
- pmovmskb %xmm4, %eax
- test %eax, %eax
- jnz L(matches0)
-
- test $0x3f, %rdi
- jz L(align64_loop)
-
- sub $64, %rdx
- jbe L(exit_loop)
-
- movdqa (%rdi), %xmm0
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %eax
- test %eax, %eax
- jnz L(matches)
-
- movdqa 16(%rdi), %xmm2
- pcmpeqb %xmm1, %xmm2
- pmovmskb %xmm2, %eax
- test %eax, %eax
- jnz L(matches16)
-
- movdqa 32(%rdi), %xmm3
- pcmpeqb %xmm1, %xmm3
- pmovmskb %xmm3, %eax
- test %eax, %eax
- jnz L(matches32)
-
- movdqa 48(%rdi), %xmm3
- pcmpeqb %xmm1, %xmm3
- pmovmskb %xmm3, %eax
-
- add $64, %rdi
- test %eax, %eax
- jnz L(matches0)
-
- mov %rdi, %rcx
- and $-64, %rdi
- and $63, %rcx
- add %rcx, %rdx
-
- .p2align 4
-L(align64_loop):
- sub $64, %rdx
- jbe L(exit_loop)
- movdqa (%rdi), %xmm0
- movdqa 16(%rdi), %xmm2
- movdqa 32(%rdi), %xmm3
- movdqa 48(%rdi), %xmm4
-
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm1, %xmm2
- pcmpeqb %xmm1, %xmm3
- pcmpeqb %xmm1, %xmm4
-
- pmaxub %xmm0, %xmm3
- pmaxub %xmm2, %xmm4
- pmaxub %xmm3, %xmm4
- pmovmskb %xmm4, %eax
-
- add $64, %rdi
-
- test %eax, %eax
- jz L(align64_loop)
-
- sub $64, %rdi
-
- pmovmskb %xmm0, %eax
- test %eax, %eax
- jnz L(matches)
-
- pmovmskb %xmm2, %eax
- test %eax, %eax
- jnz L(matches16)
-
- movdqa 32(%rdi), %xmm3
- pcmpeqb %xmm1, %xmm3
-
- pcmpeqb 48(%rdi), %xmm1
- pmovmskb %xmm3, %eax
- test %eax, %eax
- jnz L(matches32)
-
- pmovmskb %xmm1, %eax
- bsf %eax, %eax
- lea 48(%rdi, %rax), %rax
- ret
-
- .p2align 4
-L(exit_loop):
- add $32, %rdx
- jle L(exit_loop_32)
-
- movdqa (%rdi), %xmm0
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %eax
- test %eax, %eax
- jnz L(matches)
-
- movdqa 16(%rdi), %xmm2
- pcmpeqb %xmm1, %xmm2
- pmovmskb %xmm2, %eax
- test %eax, %eax
- jnz L(matches16)
-
- movdqa 32(%rdi), %xmm3
- pcmpeqb %xmm1, %xmm3
- pmovmskb %xmm3, %eax
- test %eax, %eax
- jnz L(matches32_1)
- sub $16, %rdx
- jle L(return_null)
-
- pcmpeqb 48(%rdi), %xmm1
- pmovmskb %xmm1, %eax
- test %eax, %eax
- jnz L(matches48_1)
- xor %rax, %rax
- ret
-
- .p2align 4
-L(exit_loop_32):
- add $32, %rdx
- movdqa (%rdi), %xmm0
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %eax
- test %eax, %eax
- jnz L(matches_1)
- sub $16, %rdx
- jbe L(return_null)
-
- pcmpeqb 16(%rdi), %xmm1
- pmovmskb %xmm1, %eax
- test %eax, %eax
- jnz L(matches16_1)
- xor %rax, %rax
- ret
-
- .p2align 4
-L(matches0):
- bsf %eax, %eax
- lea -16(%rax, %rdi), %rax
- ret
-
- .p2align 4
-L(matches):
- bsf %eax, %eax
- add %rdi, %rax
- ret
-
- .p2align 4
-L(matches16):
- bsf %eax, %eax
- lea 16(%rax, %rdi), %rax
- ret
-
- .p2align 4
-L(matches32):
- bsf %eax, %eax
- lea 32(%rax, %rdi), %rax
- ret
-
- .p2align 4
-L(matches_1):
- bsf %eax, %eax
- sub %rax, %rdx
- jbe L(return_null)
- add %rdi, %rax
- ret
-
- .p2align 4
-L(matches16_1):
- bsf %eax, %eax
- sub %rax, %rdx
- jbe L(return_null)
- lea 16(%rdi, %rax), %rax
- ret
-
- .p2align 4
-L(matches32_1):
- bsf %eax, %eax
- sub %rax, %rdx
- jbe L(return_null)
- lea 32(%rdi, %rax), %rax
- ret
-
- .p2align 4
-L(matches48_1):
- bsf %eax, %eax
- sub %rax, %rdx
- jbe L(return_null)
- lea 48(%rdi, %rax), %rax
- ret
-
- .p2align 4
-L(return_null):
- xor %rax, %rax
- ret
-END(memchr)
-
-strong_alias (memchr, __memchr)
-
-libc_hidden_builtin_def(memchr)
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index dd6c27d..9d088fe 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -9,24 +9,64 @@ ifeq ($(subdir),string)
sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
strend-sse4 memcmp-sse4 memcpy-ssse3 mempcpy-ssse3 \
memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
- memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \
+ memmove-ssse3-back strcasecmp_l-ssse3 \
strncase_l-ssse3 strlen-sse4 strlen-sse2-no-bsf memset-x86-64 \
strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
strcpy-sse2-unaligned strncpy-sse2-unaligned \
stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
strcat-sse2-unaligned strncat-sse2-unaligned \
strcat-ssse3 strncat-ssse3 strlen-sse2-pminub \
- strnlen-sse2-no-bsf strrchr-sse2-no-bsf strchr-sse2-no-bsf \
- memcmp-ssse3
+ memcmp-ssse3 strchr-sse2-no-bsf
+
+sysdep_routines += \
+strnlen_sse2_no_bsf\
+strnlen_sse2\
+strnlen
+sysdep_routines += \
+strstr_sse2_no_bsf\
+strstr_sse2\
+strstr_ssse3\
+strstr
+CFLAGS-strstr_ssse3.c += -mssse3
+sysdep_routines += \
+strcasestr_sse2_no_bsf\
+strcasestr_sse2\
+strcasestr_ssse3\
+strcasestr
+CFLAGS-strcasestr_ssse3.c += -mssse3
+sysdep_routines += \
+memmem_sse2_no_bsf\
+memmem_sse2\
+memmem_ssse3\
+memmem
+CFLAGS-memmem_ssse3.c += -mssse3
+sysdep_routines += \
+strrchr_sse2_no_bsf\
+strrchr_sse2\
+strrchr
+sysdep_routines += \
+strchrnul_sse2_no_bsf\
+strchrnul_sse2\
+strchrnul
+sysdep_routines += \
+memchr_sse2_no_bsf\
+memchr_sse2\
+memchr
+sysdep_routines += \
+rawmemchr_sse2_no_bsf\
+rawmemchr_sse2\
+rawmemchr
+sysdep_routines += \
+memrchr_sse2_no_bsf\
+memrchr_sse2\
+memrchr
+
ifeq (yes,$(config-cflags-sse4))
-sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift
+sysdep_routines += strcspn-c strpbrk-c strspn-c varshift
CFLAGS-varshift.c += -msse4
CFLAGS-strcspn-c.c += -msse4
CFLAGS-strpbrk-c.c += -msse4
CFLAGS-strspn-c.c += -msse4
-CFLAGS-strstr.c += -msse4
-CFLAGS-strcasestr.c += -msse4
-CFLAGS-strcasestr-nonascii.c += -msse4
endif
endif
diff --git a/sysdeps/x86_64/multiarch/gen_stub b/sysdeps/x86_64/multiarch/gen_stub
new file mode 100755
index 0000000..3289335
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/gen_stub
@@ -0,0 +1,111 @@
+fn(){ # Print Makefile lines on stdout; write NAME_<variant>.c stubs and the NAME.c dispatcher in the current directory.
+J=$1 # Function name, e.g. strstr.
+TP=$2 # C return type.
+ARG=$3 # C parameter list of the function.
+ARGN=$4 # Argument names forwarded to the selected variant.
+BASE=$5 # Each stub includes string/BASE.h.
+TYPES=$6 # Space-separated list of variants.
+EXT=$7 # If non-empty, define __NAME and weak-alias NAME to it.
+# Makefile fragment listing every variant object and the dispatcher.
+echo "sysdep_routines += \\"
+for I in $TYPES; do
+ echo "${J}_${I}\\"
+done
+echo "${J}"
+# One stub per variant: set the AS_/USE_ selectors and the symbol name, then include the shared header.
+for I in $TYPES; do
+F="${J}_${I}.c"
+IU=$(echo "$I" | tr '[a-z]' '[A-Z]')
+JU=$(echo "$J" | tr '[a-z]' '[A-Z]')
+echo "/*generated by gen_stub*/" > "$F"
+echo "#define AS_${JU}" >> "$F"
+echo "#define USE_${IU}" >> "$F"
+echo "#define ${JU} __${J}_${I}" >> "$F"
+echo "#include \"string/${BASE}.h\"" >> "$F"
+done
+# EXT must be quoted so that the test is well-formed when it is empty.
+if [ -z "$EXT" ]; then
+FN=$J
+ALIASED=""
+else
+FN="__${J}"
+ALIASED="#ifndef NO_ALIAS
+weak_alias(${FN},${J});
+#endif"
+fi
+# Start the dispatcher source afresh; everything below appends to it.
+echo "/*generated by gen_stub*/" > "${J}.c"
+
+echo "
+#include <sysdep.h>
+#ifndef _LIBC
+# include <config.h>
+#endif
+
+#if defined SHARED && !defined NOT_IN_libc
+
+#include \"init-arch.h\"
+
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(name) \
+ __hidden_ver1 (__${J}_base, __GI_${J}, __${J}_base);
+
+" >> "${J}.c"
+# Declare every variant, then the base wrapper that forwards to the sse2 variant.
+for I in $TYPES; do
+ echo "extern ${TP} __${J}_${I}(${ARG}) attribute_hidden;" >> "${J}.c"
+done
+echo " ${TP} ${FN}(${ARG});
+${TP} __${J}_base(${ARG}){
+ return __${J}_sse2(${ARGN});
+}
+libc_hidden_builtin_def (__${J}_base)
+extern __typeof (__${J}_base) __${J}_base attribute_hidden;
+" >> "${J}.c"
+if [ "$TYPES" = "sse2_no_bsf sse2 ssse3" ]; then # Use =, since == is not defined for POSIX test.
+echo "libc_ifunc (${FN}, HAS_SSSE3 ? __${J}_ssse3 : (HAS_SLOW_BSF ? __${J}_sse2_no_bsf : __${J}_sse2));" >> "${J}.c"
+echo "CFLAGS-${J}_ssse3.c += -mssse3"
+else
+echo "libc_ifunc (${FN}, (HAS_SLOW_BSF ? __${J}_sse2_no_bsf : __${J}_sse2));" >> "${J}.c"
+fi
+echo "
+#else
+
+#include \"${J}_sse2.c\"
+
+$TP ${FN}(${ARG}){
+ return __${J}_sse2(${ARGN});
+}
+#endif
+${ALIASED}
+" >> "${J}.c"
+
+}
+#fn strlen "size_t" "const char* n" "n" strlen "sse2_no_bsf sse2"
+fn strnlen "size_t" "const char* n,size_t ns" "n,ns" strlen "sse2_no_bsf sse2"
+# strnlen above is generated from string/strlen.h; the strlen stub itself is disabled.
+# Substring searches; all three are generated from string/strstr.h.
+fn strstr "char *" "const char* s,const char *n" "s,n" strstr "sse2_no_bsf sse2 ssse3"
+fn strcasestr "char *" "const char* s,const char *n" "s,n" strstr "sse2_no_bsf sse2 ssse3" ext
+fn memmem "void *" "const void* s,size_t ss,const void *n, size_t ns" "s,ss,n,ns" strstr "sse2_no_bsf sse2 ssse3"
+
+#fn strchr "char *" "const char* s,int c" "s,c" strchr "sse2_no_bsf sse2"
+# fails because strchr expands to a builtin
+# Character searches; all are generated from string/strchr.h.
+fn strrchr "char *" "const char* s,int c" "s,c" strchr "sse2_no_bsf sse2"
+fn strchrnul "char *" "const char* s,int c" "s,c" strchr "sse2_no_bsf sse2" ext
+
+fn memchr "void *" "const void* s,int c,size_t ss" "s,c,ss" strchr "sse2_no_bsf sse2"
+fn rawmemchr "void *" "const void* s,int c" "s,c" strchr "sse2_no_bsf sse2" ext
+fn memrchr "void *" "const void* s,int c,size_t ss" "s,c,ss" strchr "sse2_no_bsf sse2" ext
+# Append the rindex weak alias to the generated strrchr.c.
+echo "
+#ifndef NO_ALIAS
+weak_alias(strrchr,rindex);
+#endif" >> strrchr.c
+
+# Append an __strnlen wrapper that forwards to strnlen.
+echo "size_t __strnlen(const char* n,size_t ns){
+ return strnlen(n,ns);
+}" >> strnlen.c
+
diff --git a/sysdeps/x86_64/multiarch/rawmemchr.S b/sysdeps/x86_64/multiarch/rawmemchr.S
deleted file mode 100644
index c4157ad..0000000
--- a/sysdeps/x86_64/multiarch/rawmemchr.S
+++ /dev/null
@@ -1,97 +0,0 @@
-/* Copyright (C) 2009, 2011 Free Software Foundation, Inc.
- Contributed by Ulrich Drepper <drepper@redhat.com>.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-#include <init-arch.h>
-
-
-/* Define multiple versions only for the definition in lib. */
-#ifndef NOT_IN_libc
- .text
-ENTRY(rawmemchr)
- .type rawmemchr, @gnu_indirect_function
- cmpl $0, __cpu_features+KIND_OFFSET(%rip)
- jne 1f
- call __init_cpu_features
-1: testl $bit_Prefer_PMINUB_for_stringop, __cpu_features+FEATURE_OFFSET+index_Prefer_PMINUB_for_stringop(%rip)
- jnz 2f
- testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
- jz 2f
- leaq __rawmemchr_sse42(%rip), %rax
- ret
-2: leaq __rawmemchr_sse2(%rip), %rax
- ret
-
-END(rawmemchr)
-strong_alias (rawmemchr, __rawmemchr)
-
-
- .section .text.sse4.2,"ax",@progbits
- .align 16
- .type __rawmemchr_sse42, @function
-__rawmemchr_sse42:
- cfi_startproc
- CALL_MCOUNT
- movd %esi, %xmm1
- movq %rdi, %rcx
- pxor %xmm2, %xmm2
- andq $~15, %rdi
- orl $0xffffffff, %esi
- pshufb %xmm2, %xmm1
- movdqa (%rdi), %xmm0
- subq %rdi, %rcx
- pcmpeqb %xmm1, %xmm0
- shl %cl, %esi
- pmovmskb %xmm0, %ecx
- movl $16, %eax
- movl $16, %edx
- andl %esi, %ecx
- jnz 1f
-
-2: pcmpestri $0x08, 16(%rdi), %xmm1
- leaq 16(%rdi), %rdi
- jnc 2b
-
- leaq (%rdi,%rcx), %rax
- ret
-
-1: bsfl %ecx, %eax
- addq %rdi, %rax
- ret
- cfi_endproc
- .size __rawmemchr_sse42, .-__rawmemchr_sse42
-
-
-# undef ENTRY
-# define ENTRY(name) \
- .type __rawmemchr_sse2, @function; \
- .align 16; \
- __rawmemchr_sse2: cfi_startproc; \
- CALL_MCOUNT
-# undef END
-# define END(name) \
- cfi_endproc; .size __rawmemchr_sse2, .-__rawmemchr_sse2
-# undef libc_hidden_builtin_def
-/* It doesn't make sense to send libc-internal rawmemchr calls through a PLT.
- The speedup we get from using SSE4.2 instruction is likely eaten away
- by the indirect call in the PLT. */
-# define libc_hidden_builtin_def(name) \
- .globl __GI___rawmemchr; __GI___rawmemchr = __rawmemchr_sse2
-#endif
-
-#include "../rawmemchr.S"
diff --git a/sysdeps/x86_64/multiarch/strcasestr-c.c b/sysdeps/x86_64/multiarch/strcasestr-c.c
deleted file mode 100644
index 551492d..0000000
--- a/sysdeps/x86_64/multiarch/strcasestr-c.c
+++ /dev/null
@@ -1,16 +0,0 @@
-#include "init-arch.h"
-
-#define STRCASESTR __strcasestr_sse2
-
-#include "string/strcasestr.c"
-
-extern char *__strcasestr_sse42 (const char *, const char *) attribute_hidden;
-extern __typeof (__strcasestr_sse2) __strcasestr_sse2 attribute_hidden;
-
-#if 1
-libc_ifunc (__strcasestr,
- HAS_SSE4_2 ? __strcasestr_sse42 : __strcasestr_sse2);
-#else
-libc_ifunc (__strcasestr,
- 0 ? __strcasestr_sse42 : __strcasestr_sse2);
-#endif
diff --git a/sysdeps/x86_64/multiarch/strcasestr-nonascii.c b/sysdeps/x86_64/multiarch/strcasestr-nonascii.c
deleted file mode 100644
index a1f9968..0000000
--- a/sysdeps/x86_64/multiarch/strcasestr-nonascii.c
+++ /dev/null
@@ -1,49 +0,0 @@
-/* strstr with SSE4.2 intrinsics
- Copyright (C) 2010 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-# include <ctype.h>
-
-
-/* Similar to __m128i_strloadu. Convert to lower case for none-POSIX/C
- locale. */
-static inline __m128i
-__m128i_strloadu_tolower (const unsigned char *p)
-{
- union
- {
- char b[16];
- __m128i x;
- } u;
-
- for (int i = 0; i < 16; ++i)
- if (p[i] == 0)
- {
- u.b[i] = 0;
- break;
- }
- else
- u.b[i] = tolower (p[i]);
-
- return u.x;
-}
-
-
-#define STRCASESTR_NONASCII
-#define USE_AS_STRCASESTR
-#define STRSTR_SSE42 __strcasestr_sse42_nonascii
-#include "strstr.c"
diff --git a/sysdeps/x86_64/multiarch/strcasestr.c b/sysdeps/x86_64/multiarch/strcasestr.c
deleted file mode 100644
index d1cfb3b..0000000
--- a/sysdeps/x86_64/multiarch/strcasestr.c
+++ /dev/null
@@ -1,7 +0,0 @@
-extern char *__strcasestr_sse42_nonascii (const unsigned char *s1,
- const unsigned char *s2)
- attribute_hidden;
-
-#define USE_AS_STRCASESTR
-#define STRSTR_SSE42 __strcasestr_sse42
-#include "strstr.c"
diff --git a/sysdeps/x86_64/multiarch/strnlen-sse2-no-bsf.S b/sysdeps/x86_64/multiarch/strnlen-sse2-no-bsf.S
deleted file mode 100644
index 248328d..0000000
--- a/sysdeps/x86_64/multiarch/strnlen-sse2-no-bsf.S
+++ /dev/null
@@ -1,3 +0,0 @@
-#define USE_AS_STRNLEN
-#define STRLEN __strnlen_sse2_no_bsf
-#include "strlen-sse2-no-bsf.S"
diff --git a/sysdeps/x86_64/multiarch/strnlen.S b/sysdeps/x86_64/multiarch/strnlen.S
deleted file mode 100644
index 044b910..0000000
--- a/sysdeps/x86_64/multiarch/strnlen.S
+++ /dev/null
@@ -1,54 +0,0 @@
-/* multiple version of strnlen
- Copyright (C) 2011 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-#include <init-arch.h>
-
-
-/* Define multiple versions only for the definition in libc. */
-#ifndef NOT_IN_libc
-
- .text
-ENTRY(__strnlen)
- .type __strnlen, @gnu_indirect_function
- cmpl $0, __cpu_features+KIND_OFFSET(%rip)
- jne 1f
- call __init_cpu_features
-1: leaq __strnlen_sse2(%rip), %rax
- testl $bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
- jz 2f
- leaq __strnlen_sse2_no_bsf(%rip), %rax
-2: ret
-END(__strnlen)
-
-# undef ENTRY
-# define ENTRY(name) \
- .type __strnlen_sse2, @function; \
- .align 16; \
- __strnlen_sse2: cfi_startproc; \
- CALL_MCOUNT
-# undef END
-# define END(name) \
- cfi_endproc; .size __strnlen_sse2, .-__strnlen_sse2
-
-# undef libc_hidden_def
-# define libc_hidden_def(name) \
- .globl __GI_strnlen; __GI_strnlen = __strnlen_sse2
-#endif
-
-#include "../strnlen.S"
diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2-no-bsf.S b/sysdeps/x86_64/multiarch/strrchr-sse2-no-bsf.S
deleted file mode 100644
index c698c94..0000000
--- a/sysdeps/x86_64/multiarch/strrchr-sse2-no-bsf.S
+++ /dev/null
@@ -1,555 +0,0 @@
-/* strrchr with SSE2 without bsf and bsr
- Copyright (C) 2011 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#if defined SHARED && !defined NOT_IN_libc
-
-# include <sysdep.h>
-# include "asm-syntax.h"
-
- atom_text_section
-ENTRY (__strrchr_sse2_no_bsf)
-
- movd %rsi, %xmm1
- pxor %xmm2, %xmm2
- mov %rdi, %rcx
- punpcklbw %xmm1, %xmm1
- punpcklbw %xmm1, %xmm1
- /* ECX has OFFSET. */
- and $63, %rcx
- cmp $48, %rcx
- pshufd $0, %xmm1, %xmm1
- ja L(crosscache)
-
-/* unaligned string. */
- movdqu (%rdi), %xmm0
- pcmpeqb %xmm0, %xmm2
- pcmpeqb %xmm1, %xmm0
- /* Find where NULL is. */
- pmovmskb %xmm2, %rcx
- /* Check if there is a match. */
- pmovmskb %xmm0, %rax
- add $16, %rdi
-
- test %rax, %rax
- jnz L(unaligned_match1)
-
- test %rcx, %rcx
- jnz L(return_null)
-
- and $-16, %rdi
- xor %r8, %r8
- jmp L(loop)
-
- .p2align 4
-L(unaligned_match1):
- test %rcx, %rcx
- jnz L(prolog_find_zero_1)
-
- mov %rax, %r8
- mov %rdi, %rsi
- and $-16, %rdi
- jmp L(loop)
-
- .p2align 4
-L(crosscache):
-/* Hancle unaligned string. */
- and $15, %rcx
- and $-16, %rdi
- pxor %xmm3, %xmm3
- movdqa (%rdi), %xmm0
- pcmpeqb %xmm0, %xmm3
- pcmpeqb %xmm1, %xmm0
- /* Find where NULL is. */
- pmovmskb %xmm3, %rdx
- /* Check if there is a match. */
- pmovmskb %xmm0, %rax
- /* Remove the leading bytes. */
- shr %cl, %rdx
- shr %cl, %rax
- add $16, %rdi
-
- test %rax, %rax
- jnz L(unaligned_match)
-
- test %rdx, %rdx
- jnz L(return_null)
-
- xor %r8, %r8
- jmp L(loop)
-
- .p2align 4
-L(unaligned_match):
- test %rdx, %rdx
- jnz L(prolog_find_zero)
-
- mov %rax, %r8
- lea (%rdi, %rcx), %rsi
-
-/* Loop start on aligned string. */
- .p2align 4
-L(loop):
- movdqa (%rdi), %xmm0
- pcmpeqb %xmm0, %xmm2
- add $16, %rdi
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm2, %rcx
- pmovmskb %xmm0, %rax
- or %rax, %rcx
- jnz L(matches)
-
- movdqa (%rdi), %xmm0
- pcmpeqb %xmm0, %xmm2
- add $16, %rdi
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm2, %rcx
- pmovmskb %xmm0, %rax
- or %rax, %rcx
- jnz L(matches)
-
- movdqa (%rdi), %xmm0
- pcmpeqb %xmm0, %xmm2
- add $16, %rdi
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm2, %rcx
- pmovmskb %xmm0, %rax
- or %rax, %rcx
- jnz L(matches)
-
- movdqa (%rdi), %xmm0
- pcmpeqb %xmm0, %xmm2
- add $16, %rdi
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm2, %rcx
- pmovmskb %xmm0, %rax
- or %rax, %rcx
- jz L(loop)
-
-L(matches):
- test %rax, %rax
- jnz L(match)
-L(return_value):
- test %r8, %r8
- jz L(return_null)
- mov %r8, %rax
- mov %rsi, %rdi
- jmp L(match_exit)
-
- .p2align 4
-L(match):
- pmovmskb %xmm2, %rcx
- test %rcx, %rcx
- jnz L(find_zero)
- mov %rax, %r8
- mov %rdi, %rsi
- jmp L(loop)
-
- .p2align 4
-L(find_zero):
- test %cl, %cl
- jz L(find_zero_high)
- mov %cl, %dl
- and $15, %dl
- jz L(find_zero_8)
- test $0x01, %cl
- jnz L(FindZeroExit1)
- test $0x02, %cl
- jnz L(FindZeroExit2)
- test $0x04, %cl
- jnz L(FindZeroExit3)
- and $1 << 4 - 1, %rax
- jz L(return_value)
- jmp L(match_exit)
-
- .p2align 4
-L(find_zero_8):
- test $0x10, %cl
- jnz L(FindZeroExit5)
- test $0x20, %cl
- jnz L(FindZeroExit6)
- test $0x40, %cl
- jnz L(FindZeroExit7)
- and $1 << 8 - 1, %rax
- jz L(return_value)
- jmp L(match_exit)
-
- .p2align 4
-L(find_zero_high):
- mov %ch, %dh
- and $15, %dh
- jz L(find_zero_high_8)
- test $0x01, %ch
- jnz L(FindZeroExit9)
- test $0x02, %ch
- jnz L(FindZeroExit10)
- test $0x04, %ch
- jnz L(FindZeroExit11)
- and $1 << 12 - 1, %rax
- jz L(return_value)
- jmp L(match_exit)
-
- .p2align 4
-L(find_zero_high_8):
- test $0x10, %ch
- jnz L(FindZeroExit13)
- test $0x20, %ch
- jnz L(FindZeroExit14)
- test $0x40, %ch
- jnz L(FindZeroExit15)
- and $1 << 16 - 1, %rax
- jz L(return_value)
- jmp L(match_exit)
-
- .p2align 4
-L(FindZeroExit1):
- and $1, %rax
- jz L(return_value)
- jmp L(match_exit)
-
- .p2align 4
-L(FindZeroExit2):
- and $1 << 2 - 1, %rax
- jz L(return_value)
- jmp L(match_exit)
-
- .p2align 4
-L(FindZeroExit3):
- and $1 << 3 - 1, %rax
- jz L(return_value)
- jmp L(match_exit)
-
- .p2align 4
-L(FindZeroExit5):
- and $1 << 5 - 1, %rax
- jz L(return_value)
- jmp L(match_exit)
-
- .p2align 4
-L(FindZeroExit6):
- and $1 << 6 - 1, %rax
- jz L(return_value)
- jmp L(match_exit)
-
- .p2align 4
-L(FindZeroExit7):
- and $1 << 7 - 1, %rax
- jz L(return_value)
- jmp L(match_exit)
-
- .p2align 4
-L(FindZeroExit9):
- and $1 << 9 - 1, %rax
- jz L(return_value)
- jmp L(match_exit)
-
- .p2align 4
-L(FindZeroExit10):
- and $1 << 10 - 1, %rax
- jz L(return_value)
- jmp L(match_exit)
-
- .p2align 4
-L(FindZeroExit11):
- and $1 << 11 - 1, %rax
- jz L(return_value)
- jmp L(match_exit)
-
- .p2align 4
-L(FindZeroExit13):
- and $1 << 13 - 1, %rax
- jz L(return_value)
- jmp L(match_exit)
-
- .p2align 4
-L(FindZeroExit14):
- and $1 << 14 - 1, %rax
- jz L(return_value)
- jmp L(match_exit)
-
- .p2align 4
-L(FindZeroExit15):
- and $1 << 15 - 1, %rax
- jz L(return_value)
-
- .p2align 4
-L(match_exit):
- test %ah, %ah
- jnz L(match_exit_high)
- mov %al, %dl
- and $15 << 4, %dl
- jnz L(match_exit_8)
- test $0x08, %al
- jnz L(Exit4)
- test $0x04, %al
- jnz L(Exit3)
- test $0x02, %al
- jnz L(Exit2)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(match_exit_8):
- test $0x80, %al
- jnz L(Exit8)
- test $0x40, %al
- jnz L(Exit7)
- test $0x20, %al
- jnz L(Exit6)
- lea -12(%rdi), %rax
- ret
-
- .p2align 4
-L(match_exit_high):
- mov %ah, %dh
- and $15 << 4, %dh
- jnz L(match_exit_high_8)
- test $0x08, %ah
- jnz L(Exit12)
- test $0x04, %ah
- jnz L(Exit11)
- test $0x02, %ah
- jnz L(Exit10)
- lea -8(%rdi), %rax
- ret
-
- .p2align 4
-L(match_exit_high_8):
- test $0x80, %ah
- jnz L(Exit16)
- test $0x40, %ah
- jnz L(Exit15)
- test $0x20, %ah
- jnz L(Exit14)
- lea -4(%rdi), %rax
- ret
-
- .p2align 4
-L(Exit2):
- lea -15(%rdi), %rax
- ret
-
- .p2align 4
-L(Exit3):
- lea -14(%rdi), %rax
- ret
-
- .p2align 4
-L(Exit4):
- lea -13(%rdi), %rax
- ret
-
- .p2align 4
-L(Exit6):
- lea -11(%rdi), %rax
- ret
-
- .p2align 4
-L(Exit7):
- lea -10(%rdi), %rax
- ret
-
- .p2align 4
-L(Exit8):
- lea -9(%rdi), %rax
- ret
-
- .p2align 4
-L(Exit10):
- lea -7(%rdi), %rax
- ret
-
- .p2align 4
-L(Exit11):
- lea -6(%rdi), %rax
- ret
-
- .p2align 4
-L(Exit12):
- lea -5(%rdi), %rax
- ret
-
- .p2align 4
-L(Exit14):
- lea -3(%rdi), %rax
- ret
-
- .p2align 4
-L(Exit15):
- lea -2(%rdi), %rax
- ret
-
- .p2align 4
-L(Exit16):
- lea -1(%rdi), %rax
- ret
-
-/* Return NULL. */
- .p2align 4
-L(return_null):
- xor %rax, %rax
- ret
-
- .p2align 4
-L(prolog_find_zero):
- add %rcx, %rdi
- mov %rdx, %rcx
-L(prolog_find_zero_1):
- test %cl, %cl
- jz L(prolog_find_zero_high)
- mov %cl, %dl
- and $15, %dl
- jz L(prolog_find_zero_8)
- test $0x01, %cl
- jnz L(PrologFindZeroExit1)
- test $0x02, %cl
- jnz L(PrologFindZeroExit2)
- test $0x04, %cl
- jnz L(PrologFindZeroExit3)
- and $1 << 4 - 1, %rax
- jnz L(match_exit)
- xor %rax, %rax
- ret
-
- .p2align 4
-L(prolog_find_zero_8):
- test $0x10, %cl
- jnz L(PrologFindZeroExit5)
- test $0x20, %cl
- jnz L(PrologFindZeroExit6)
- test $0x40, %cl
- jnz L(PrologFindZeroExit7)
- and $1 << 8 - 1, %rax
- jnz L(match_exit)
- xor %rax, %rax
- ret
-
- .p2align 4
-L(prolog_find_zero_high):
- mov %ch, %dh
- and $15, %dh
- jz L(prolog_find_zero_high_8)
- test $0x01, %ch
- jnz L(PrologFindZeroExit9)
- test $0x02, %ch
- jnz L(PrologFindZeroExit10)
- test $0x04, %ch
- jnz L(PrologFindZeroExit11)
- and $1 << 12 - 1, %rax
- jnz L(match_exit)
- xor %rax, %rax
- ret
-
- .p2align 4
-L(prolog_find_zero_high_8):
- test $0x10, %ch
- jnz L(PrologFindZeroExit13)
- test $0x20, %ch
- jnz L(PrologFindZeroExit14)
- test $0x40, %ch
- jnz L(PrologFindZeroExit15)
- and $1 << 16 - 1, %rax
- jnz L(match_exit)
- xor %rax, %rax
- ret
-
- .p2align 4
-L(PrologFindZeroExit1):
- and $1, %rax
- jnz L(match_exit)
- xor %rax, %rax
- ret
-
- .p2align 4
-L(PrologFindZeroExit2):
- and $1 << 2 - 1, %rax
- jnz L(match_exit)
- xor %rax, %rax
- ret
-
- .p2align 4
-L(PrologFindZeroExit3):
- and $1 << 3 - 1, %rax
- jnz L(match_exit)
- xor %rax, %rax
- ret
-
- .p2align 4
-L(PrologFindZeroExit5):
- and $1 << 5 - 1, %rax
- jnz L(match_exit)
- xor %rax, %rax
- ret
-
- .p2align 4
-L(PrologFindZeroExit6):
- and $1 << 6 - 1, %rax
- jnz L(match_exit)
- xor %rax, %rax
- ret
-
- .p2align 4
-L(PrologFindZeroExit7):
- and $1 << 7 - 1, %rax
- jnz L(match_exit)
- xor %rax, %rax
- ret
-
- .p2align 4
-L(PrologFindZeroExit9):
- and $1 << 9 - 1, %rax
- jnz L(match_exit)
- xor %rax, %rax
- ret
-
- .p2align 4
-L(PrologFindZeroExit10):
- and $1 << 10 - 1, %rax
- jnz L(match_exit)
- xor %rax, %rax
- ret
-
- .p2align 4
-L(PrologFindZeroExit11):
- and $1 << 11 - 1, %rax
- jnz L(match_exit)
- xor %rax, %rax
- ret
-
- .p2align 4
-L(PrologFindZeroExit13):
- and $1 << 13 - 1, %rax
- jnz L(match_exit)
- xor %rax, %rax
- ret
-
- .p2align 4
-L(PrologFindZeroExit14):
- and $1 << 14 - 1, %rax
- jnz L(match_exit)
- xor %rax, %rax
- ret
-
- .p2align 4
-L(PrologFindZeroExit15):
- and $1 << 15 - 1, %rax
- jnz L(match_exit)
- xor %rax, %rax
- ret
-
-END (__strrchr_sse2_no_bsf)
-#endif
diff --git a/sysdeps/x86_64/multiarch/strrchr.S b/sysdeps/x86_64/multiarch/strrchr.S
deleted file mode 100644
index c87d8fa..0000000
--- a/sysdeps/x86_64/multiarch/strrchr.S
+++ /dev/null
@@ -1,281 +0,0 @@
-/* strrchr with SSE4.2
- Copyright (C) 2009 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-#include <init-arch.h>
-
-
-/* Define multiple versions only for the definition in libc and for
- the DSO. In static binaries we need strrchr before the initialization
- happened. */
-#if defined SHARED && !defined NOT_IN_libc
- .text
-ENTRY(strrchr)
- .type strrchr, @gnu_indirect_function
- cmpl $0, __cpu_features+KIND_OFFSET(%rip)
- jne 1f
- call __init_cpu_features
-1: leaq __strrchr_sse2(%rip), %rax
- testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
- jz 2f
- leaq __strrchr_sse42(%rip), %rax
- ret
-2: testl $bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
- jz 3f
- leaq __strrchr_sse2_no_bsf(%rip), %rax
-3: ret
-END(strrchr)
-
-/*
- This implementation uses SSE4 instructions to compare up to 16 bytes
- at a time looking for the last occurrence of the character c in the
- string s:
-
- char *strrchr (const char *s, int c);
-
- We use 0x4a:
- _SIDD_SBYTE_OPS
- | _SIDD_CMP_EQUAL_EACH
- | _SIDD_MOST_SIGNIFICANT
- on pcmpistri to compare xmm/mem128
-
- 0 1 2 3 4 5 6 7 8 9 A B C D E F
- X X X X X X X X X X X X X X X X
-
- against xmm
-
- 0 1 2 3 4 5 6 7 8 9 A B C D E F
- C C C C C C C C C C C C C C C C
-
- to find out if the first 16byte data element has a byte C and the
- last offset. There are 4 cases:
-
- 1. The first 16byte data element has EOS and has the byte C at the
- last offset X.
- 2. The first 16byte data element is valid and has the byte C at the
- last offset X.
- 3. The first 16byte data element has EOS and doesn't have the byte C.
- 4. The first 16byte data element is valid and doesn't have the byte C.
-
- Here is the table of ECX, CFlag, ZFlag and SFlag for 3 cases:
-
- case ECX CFlag ZFlag SFlag
- 1 X 1 1 0
- 2 X 1 0 0
- 3 16 0 1 0
- 4 16 0 0 0
-
- We exit from the loop for cases 1 and 3 with jz which branches
- when ZFlag is 1. If CFlag == 1, ECX has the offset X for case 1. */
-
-
- .section .text.sse4.2,"ax",@progbits
- .align 16
- .type __strrchr_sse42, @function
-__strrchr_sse42:
- cfi_startproc
- CALL_MCOUNT
- testb %sil, %sil
- je __strend_sse4
- xor %eax,%eax /* RAX has the last occurrence of s. */
- movd %esi, %xmm1
- punpcklbw %xmm1, %xmm1
- movl %edi, %esi
- punpcklbw %xmm1, %xmm1
- andl $15, %esi
- pshufd $0, %xmm1, %xmm1
- movq %rdi, %r8
- je L(loop)
-
-/* Handle unaligned string using psrldq. */
- leaq L(psrldq_table)(%rip), %rdx
- andq $-16, %r8
- movslq (%rdx,%rsi,4),%r9
- movdqa (%r8), %xmm0
- addq %rdx, %r9
- jmp *%r9
-
-/* Handle unaligned string with offset 1 using psrldq. */
- .p2align 4
-L(psrldq_1):
- psrldq $1, %xmm0
-
- .p2align 4
-L(unaligned_pcmpistri):
- pcmpistri $0x4a, %xmm1, %xmm0
- jnc L(unaligned_no_byte)
- leaq (%rdi,%rcx), %rax
-L(unaligned_no_byte):
- /* Find the length of the unaligned string. */
- pcmpistri $0x3a, %xmm0, %xmm0
- movl $16, %edx
- subl %esi, %edx
- cmpl %ecx, %edx
- /* Return RAX if the unaligned fragment to next 16B already
- contain the NULL terminator. */
- jg L(exit)
- addq $16, %r8
-
-/* Loop start on aligned string. */
- .p2align 4
-L(loop):
- pcmpistri $0x4a, (%r8), %xmm1
- jbe L(match_or_eos)
- addq $16, %r8
- jmp L(loop)
- .p2align 4
-L(match_or_eos):
- je L(had_eos)
-L(match_no_eos):
- leaq (%r8,%rcx), %rax
- addq $16, %r8
- jmp L(loop)
- .p2align 4
-L(had_eos):
- jnc L(exit)
- leaq (%r8,%rcx), %rax
- .p2align 4
-L(exit):
- ret
-
-/* Handle unaligned string with offset 15 using psrldq. */
- .p2align 4
-L(psrldq_15):
- psrldq $15, %xmm0
- jmp L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 14 using psrldq. */
- .p2align 4
-L(psrldq_14):
- psrldq $14, %xmm0
- jmp L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 13 using psrldq. */
- .p2align 4
-L(psrldq_13):
- psrldq $13, %xmm0
- jmp L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 12 using psrldq. */
- .p2align 4
-L(psrldq_12):
- psrldq $12, %xmm0
- jmp L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 11 using psrldq. */
- .p2align 4
-L(psrldq_11):
- psrldq $11, %xmm0
- jmp L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 10 using psrldq. */
- .p2align 4
-L(psrldq_10):
- psrldq $10, %xmm0
- jmp L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 9 using psrldq. */
- .p2align 4
-L(psrldq_9):
- psrldq $9, %xmm0
- jmp L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 8 using psrldq. */
- .p2align 4
-L(psrldq_8):
- psrldq $8, %xmm0
- jmp L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 7 using psrldq. */
- .p2align 4
-L(psrldq_7):
- psrldq $7, %xmm0
- jmp L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 6 using psrldq. */
- .p2align 4
-L(psrldq_6):
- psrldq $6, %xmm0
- jmp L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 5 using psrldq. */
- .p2align 4
-L(psrldq_5):
- psrldq $5, %xmm0
- jmp L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 4 using psrldq. */
- .p2align 4
-L(psrldq_4):
- psrldq $4, %xmm0
- jmp L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 3 using psrldq. */
- .p2align 4
-L(psrldq_3):
- psrldq $3, %xmm0
- jmp L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 2 using psrldq. */
- .p2align 4
-L(psrldq_2):
- psrldq $2, %xmm0
- jmp L(unaligned_pcmpistri)
-
- cfi_endproc
- .size __strrchr_sse42, .-__strrchr_sse42
-
- .section .rodata.sse4.2,"a",@progbits
- .p2align 4
-L(psrldq_table):
- .int L(loop) - L(psrldq_table)
- .int L(psrldq_1) - L(psrldq_table)
- .int L(psrldq_2) - L(psrldq_table)
- .int L(psrldq_3) - L(psrldq_table)
- .int L(psrldq_4) - L(psrldq_table)
- .int L(psrldq_5) - L(psrldq_table)
- .int L(psrldq_6) - L(psrldq_table)
- .int L(psrldq_7) - L(psrldq_table)
- .int L(psrldq_8) - L(psrldq_table)
- .int L(psrldq_9) - L(psrldq_table)
- .int L(psrldq_10) - L(psrldq_table)
- .int L(psrldq_11) - L(psrldq_table)
- .int L(psrldq_12) - L(psrldq_table)
- .int L(psrldq_13) - L(psrldq_table)
- .int L(psrldq_14) - L(psrldq_table)
- .int L(psrldq_15) - L(psrldq_table)
-
-
-# undef ENTRY
-# define ENTRY(name) \
- .type __strrchr_sse2, @function; \
- .align 16; \
- __strrchr_sse2: cfi_startproc; \
- CALL_MCOUNT
-# undef END
-# define END(name) \
- cfi_endproc; .size __strrchr_sse2, .-__strrchr_sse2
-# undef libc_hidden_builtin_def
-/* It doesn't make sense to send libc-internal strrchr calls through a PLT.
- The speedup we get from using SSE4.2 instruction is likely eaten away
- by the indirect call in the PLT. */
-# define libc_hidden_builtin_def(name) \
- .globl __GI_strrchr; __GI_strrchr = __strrchr_sse2
-#endif
-
-#include "../strrchr.S"
diff --git a/sysdeps/x86_64/multiarch/strstr-c.c b/sysdeps/x86_64/multiarch/strstr-c.c
deleted file mode 100644
index b8ed316..0000000
--- a/sysdeps/x86_64/multiarch/strstr-c.c
+++ /dev/null
@@ -1,15 +0,0 @@
-#include "init-arch.h"
-
-#define STRSTR __strstr_sse2
-#ifdef SHARED
-# undef libc_hidden_builtin_def
-# define libc_hidden_builtin_def(name) \
- __hidden_ver1 (__strstr_sse2, __GI_strstr, __strstr_sse2);
-#endif
-
-#include "string/strstr.c"
-
-extern char *__strstr_sse42 (const char *, const char *) attribute_hidden;
-extern __typeof (__strstr_sse2) __strstr_sse2 attribute_hidden;
-
-libc_ifunc (strstr, HAS_SSE4_2 ? __strstr_sse42 : __strstr_sse2);
diff --git a/sysdeps/x86_64/multiarch/strstr.c b/sysdeps/x86_64/multiarch/strstr.c
deleted file mode 100644
index b1b4139..0000000
--- a/sysdeps/x86_64/multiarch/strstr.c
+++ /dev/null
@@ -1,384 +0,0 @@
-/* strstr with SSE4.2 intrinsics
- Copyright (C) 2009, 2010, 2011 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <nmmintrin.h>
-#include "varshift.h"
-
-#ifndef STRSTR_SSE42
-# define STRSTR_SSE42 __strstr_sse42
-#endif
-
-#ifdef USE_AS_STRCASESTR
-# include <ctype.h>
-# include <locale/localeinfo.h>
-
-# define LOADBYTE(C) tolower (C)
-# define CMPBYTE(C1, C2) (tolower (C1) == tolower (C2))
-#else
-# define LOADBYTE(C) (C)
-# define CMPBYTE(C1, C2) ((C1) == (C2))
-#endif
-
-/* We use 0xe ordered-compare:
- _SIDD_SBYTE_OPS
- | _SIDD_CMP_EQUAL_ORDER
- | _SIDD_LEAST_SIGNIFICANT
- on pcmpistri to do the scanning and string comparsion requirements of
- sub-string match. In the scanning phase, we process Cflag and ECX
- index to locate the first fragment match; once the first fragment
- match position has been identified, we do comparison of subsequent
- string fragments until we can conclude false or true match; whe
- n concluding a false match, we may need to repeat scanning process
- from next relevant offset in the target string.
-
- In the scanning phase we have 4 cases:
- case ECX CFlag ZFlag SFlag
- 1 16 0 0 0
- 2a 16 0 0 1
- 2b 16 0 1 0
- 2c 16 0 1 1
-
- 1. No ordered-comparison match, both 16B fragments are valid, so
- continue to next fragment.
- 2. No ordered-comparison match, there is EOS in either fragment,
- 2a. Zflg = 0, Sflg = 1, we continue
- 2b. Zflg = 1, Sflg = 0, we conclude no match and return.
- 2c. Zflg = 1, sflg = 1, lenth determine match or no match
-
- In the string comparison phase, the 1st fragment match is fixed up
- to produce ECX = 0. Subsequent fragment compare of nonzero index
- and no match conclude a false match.
-
- case ECX CFlag ZFlag SFlag
- 3 X 1 0 0/1
- 4a 0 1 0 0
- 4b 0 1 0 1
- 4c 0 < X 1 0 0/1
- 5 16 0 1 0
-
- 3. An initial ordered-comparison fragment match, we fix up to do
- subsequent string comparison
- 4a. Continuation of fragment comparison of a string compare.
- 4b. EOS reached in the reference string, we conclude true match and
- return
- 4c. String compare failed if index is nonzero, we need to go back to
- scanning
- 5. failed string compare, go back to scanning
- */
-
-/* Simple replacement of movdqu to address 4KB boundary cross issue.
- If EOS occurs within less than 16B before 4KB boundary, we don't
- cross to next page. */
-
-static inline __m128i
-__m128i_strloadu (const unsigned char * p, __m128i zero)
-{
- if (__builtin_expect ((int) ((size_t) p & 0xfff) > 0xff0, 0))
- {
- size_t offset = ((size_t) p & (16 - 1));
- __m128i a = _mm_load_si128 ((__m128i *) (p - offset));
- int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (a, zero));
- if ((bmsk >> offset) != 0)
- return __m128i_shift_right (a, offset);
- }
- return _mm_loadu_si128 ((__m128i *) p);
-}
-
-#if defined USE_AS_STRCASESTR && !defined STRCASESTR_NONASCII
-
-/* Similar to __m128i_strloadu. Convert to lower case for POSIX/C
- locale and other which have single-byte letters only in the ASCII
- range. */
-static inline __m128i
-__m128i_strloadu_tolower (const unsigned char *p, __m128i zero, __m128i uclow,
- __m128i uchigh, __m128i lcqword)
-{
- __m128i frag = __m128i_strloadu (p, zero);
-
- /* Compare if 'Z' > bytes. Inverted way to get a mask for byte <= 'Z'. */
- __m128i r2 = _mm_cmpgt_epi8 (uchigh, frag);
- /* Compare if bytes are > 'A' - 1. */
- __m128i r1 = _mm_cmpgt_epi8 (frag, uclow);
- /* Mask byte == ff if byte(r2) <= 'Z' and byte(r1) > 'A' - 1. */
- __m128i mask = _mm_and_si128 (r2, r1);
- /* Apply lowercase bit 6 mask for above mask bytes == ff. */
- return _mm_or_si128 (frag, _mm_and_si128 (mask, lcqword));
-}
-
-#endif
-
-/* Calculate Knuth-Morris-Pratt string searching algorithm (or KMP
- algorithm) overlap for a fully populated 16B vector.
- Input parameter: 1st 16Byte loaded from the reference string of a
- strstr function.
- We don't use KMP algorithm if reference string is less than 16B. */
-static int
-__inline__ __attribute__ ((__always_inline__,))
-KMP16Bovrlap (__m128i s2)
-{
- __m128i b = _mm_unpacklo_epi8 (s2, s2);
- __m128i a = _mm_unpacklo_epi8 (b, b);
- a = _mm_shuffle_epi32 (a, 0);
- b = _mm_srli_si128 (s2, sizeof (char));
- int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (b, a));
-
- /* _BitScanForward(&k1, bmsk); */
- int k1;
- __asm ("bsfl %[bmsk], %[k1]" : [k1] "=r" (k1) : [bmsk] "r" (bmsk));
- if (!bmsk)
- return 16;
- else if (bmsk == 0x7fff)
- return 1;
- else if (!k1)
- {
- /* There are al least two distinct chars in s2. If byte 0 and 1 are
- idential and the distinct value lies farther down, we can deduce
- the next byte offset to restart full compare is least no earlier
- than byte 3. */
- return 3;
- }
- else
- {
- /* Byte 1 is not degenerated to byte 0. */
- return k1 + 1;
- }
-}
-
-char *
-__attribute__ ((section (".text.sse4.2")))
-STRSTR_SSE42 (const unsigned char *s1, const unsigned char *s2)
-{
-#define p1 s1
- const unsigned char *p2 = s2;
-
-#ifndef STRCASESTR_NONASCII
- if (__builtin_expect (p2[0] == '\0', 0))
- return (char *) p1;
-
- if (__builtin_expect (p1[0] == '\0', 0))
- return NULL;
-
- /* Check if p1 length is 1 byte long. */
- if (__builtin_expect (p1[1] == '\0', 0))
- return p2[1] == '\0' && CMPBYTE (p1[0], p2[0]) ? (char *) p1 : NULL;
-#endif
-
-#ifdef USE_AS_STRCASESTR
-# ifndef STRCASESTR_NONASCII
- if (__builtin_expect (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_NONASCII_CASE)
- != 0, 0))
- return __strcasestr_sse42_nonascii (s1, s2);
-
- const __m128i uclow = _mm_set1_epi8 (0x40);
- const __m128i uchigh = _mm_set1_epi8 (0x5b);
- const __m128i lcqword = _mm_set1_epi8 (0x20);
- const __m128i zero = _mm_setzero_si128 ();
-# define strloadu(p) __m128i_strloadu_tolower (p, zero, uclow, uchigh, lcqword)
-# else
-# define strloadu __m128i_strloadu_tolower
-# define zero _mm_setzero_si128 ()
-# endif
-#else
-# define strloadu(p) __m128i_strloadu (p, zero)
- const __m128i zero = _mm_setzero_si128 ();
-#endif
-
- /* p1 > 1 byte long. Load up to 16 bytes of fragment. */
- __m128i frag1 = strloadu (p1);
-
- __m128i frag2;
- if (p2[1] != '\0')
- /* p2 is > 1 byte long. */
- frag2 = strloadu (p2);
- else
- frag2 = _mm_insert_epi8 (zero, LOADBYTE (p2[0]), 0);
-
- /* Unsigned bytes, equal order, does frag2 has null? */
- int cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
- int cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
- int cmp = _mm_cmpistri (frag2, frag1, 0x0c);
- int cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
- if (cmp_s & cmp_c)
- {
- int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (frag2, zero));
- int len;
- __asm ("bsfl %[bmsk], %[len]"
- : [len] "=r" (len) : [bmsk] "r" (bmsk));
- p1 += cmp;
- if ((len + cmp) <= 16)
- return (char *) p1;
-
- /* Load up to 16 bytes of fragment. */
- frag1 = strloadu (p1);
- cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
- cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
- cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
- cmp = _mm_cmpistri (frag2, frag1, 0x0c);
- if ((len + cmp) <= 16)
- return (char *) p1 + cmp;
- }
-
- if (cmp_s)
- {
- /* Adjust addr for 16B alginment in ensuing loop. */
- while (!cmp_z)
- {
- p1 += cmp;
- /* Load up to 16 bytes of fragment. */
- frag1 = strloadu (p1);
- cmp = _mm_cmpistri (frag2, frag1, 0x0c);
- cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
- cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
- /* Because s2 < 16 bytes and we adjusted p1 by non-zero cmp
- once already, this time cmp will be zero and we can exit. */
- if ((!cmp) & cmp_c)
- break;
- }
-
- if (!cmp_c)
- return NULL;
-
- /* Since s2 is less than 16 bytes, com_c is definitive
- determination of full match. */
- return (char *) p1 + cmp;
- }
-
- /* General case, s2 is at least 16 bytes or more.
- First, the common case of false-match at first byte of p2. */
- const unsigned char *pt = NULL;
- int kmp_fwd = 0;
-re_trace:
- while (!cmp_c)
- {
- /* frag1 has null. */
- if (cmp_z)
- return NULL;
-
- /* frag 1 has no null, advance 16 bytes. */
- p1 += 16;
- /* Load up to 16 bytes of fragment. */
- frag1 = strloadu (p1);
- /* Unsigned bytes, equal order, is there a partial match? */
- cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
- cmp = _mm_cmpistri (frag2, frag1, 0x0c);
- cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
- }
-
- /* Next, handle initial positive match as first byte of p2. We have
- a partial fragment match, make full determination until we reached
- end of s2. */
- if (!cmp)
- {
- if (cmp_z)
- return (char *) p1;
-
- pt = p1;
- p1 += 16;
- p2 += 16;
- /* Load up to 16 bytes of fragment. */
- frag2 = strloadu (p2);
- }
- else
- {
- /* Adjust 16B alignment. */
- p1 += cmp;
- pt = p1;
- }
-
- /* Load up to 16 bytes of fragment. */
- frag1 = strloadu (p1);
-
- /* Unsigned bytes, equal order, does frag2 has null? */
- cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
- cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
- cmp = _mm_cmpistri (frag2, frag1, 0x0c);
- cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
- while (!(cmp | cmp_z | cmp_s))
- {
- p1 += 16;
- p2 += 16;
- /* Load up to 16 bytes of fragment. */
- frag2 = strloadu (p2);
- /* Load up to 16 bytes of fragment. */
- frag1 = strloadu (p1);
- /* Unsigned bytes, equal order, does frag2 has null? */
- cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
- cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
- cmp = _mm_cmpistri (frag2, frag1, 0x0c);
- cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
- }
-
- /* Full determination yielded a false result, retrace s1 to next
- starting position.
- Zflg 1 0 1 0/1
- Sflg 0 1 1 0/1
- cmp na 0 0 >0
- action done done continue continue if s2 < s1
- false match retrace s1 else false
- */
-
- if (cmp_s & !cmp)
- return (char *) pt;
- if (cmp_z)
- {
- if (!cmp_s)
- return NULL;
-
- /* Handle both zero and sign flag set and s1 is shorter in
- length. */
- int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (zero, frag2));
- int bmsk1 = _mm_movemask_epi8 (_mm_cmpeq_epi8 (zero, frag1));
- int len;
- int len1;
- __asm ("bsfl %[bmsk], %[len]"
- : [len] "=r" (len) : [bmsk] "r" (bmsk));
- __asm ("bsfl %[bmsk1], %[len1]"
- : [len1] "=r" (len1) : [bmsk1] "r" (bmsk1));
- if (len >= len1)
- return NULL;
- }
- else if (!cmp)
- return (char *) pt;
-
- /* Otherwise, we have to retrace and continue. Default of multiple
- paths that need to retrace from next byte in s1. */
- p2 = s2;
- frag2 = strloadu (p2);
-
- if (!kmp_fwd)
- kmp_fwd = KMP16Bovrlap (frag2);
-
- /* KMP algorithm predicted overlap needs to be corrected for
- partial fragment compare. */
- p1 = pt + (kmp_fwd > cmp ? cmp : kmp_fwd);
-
- /* Since s2 is at least 16 bytes long, we're certain there is no
- match. */
- if (p1[0] == '\0')
- return NULL;
-
- /* Load up to 16 bytes of fragment. */
- frag1 = strloadu (p1);
-
- /* Unsigned bytes, equal order, is there a partial match? */
- cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
- cmp = _mm_cmpistri (frag2, frag1, 0x0c);
- cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
- goto re_trace;
-}
diff --git a/sysdeps/x86_64/sse.h b/sysdeps/x86_64/sse.h
new file mode 100644
index 0000000..2b756ca
--- /dev/null
+++ b/sysdeps/x86_64/sse.h
@@ -0,0 +1,161 @@
+/* Copyright (C) 2012 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+
+#include <stdint.h>
+
+#include <emmintrin.h>
+#ifdef USE_SSSE3
+#define _HAS_SSSE3(x,y) x
+#include <tmmintrin.h>
+#else
+#define _HAS_SSSE3(x,y) y
+#endif
+#ifdef USE_SSE4_1
+#define _HAS_SSE4_1(x,y) x
+#undef _HAS_SSSE3
+#define _HAS_SSSE3( x,y) x
+#include <smmintrin.h>
+#else
+#define _HAS_SSE4_1(x,y) y
+#endif
+
+typedef __m128i tp_vector;
+typedef unsigned long tp_mask;
+
+
+SI tp_vector BROADCAST(uchar c)
+{
+ return _mm_set1_epi8(c);
+}
+SI tp_vector LOAD(uchar* x)
+{
+ return _mm_load_si128( (tp_vector*)(x));
+}
+SI tp_vector LOAD_UNALIGNED(uchar* x)
+{
+ return _mm_loadu_si128((tp_vector*)(x));
+}
+
+#define PREFETCH(x) _mm_prefetch(((char *)x),_MM_HINT_T0);
+
+SI tp_mask get_mask(tp_vector x)
+{
+ /* gcc unnecessarily adds sign extension instructions for pmovmskb.  */
+ return (tp_mask)((unsigned int)
+ _mm_movemask_epi8(x));
+}
+SI unsigned int NONZERO_MASK(tp_vector x)
+{
+ return _HAS_SSE4_1(!_mm_testz_si128(x,x),
+ get_mask(x));
+}
+
+#ifdef USE_SSE2_NO_BSF
+static char first_bit_hash[]= {0,37,50,8,0,21,0,0,38,54,5,51,9,0,30,0,22,12,1,0,0,0,0,39,0,55,0,35,6,52,28,10,0,0,33,31,0,0,23,0,13,44,0,2,0,0,25,0,0,0,0,0,40,15,0,0,56,62,46,0,19,36,7,0,0,53,4,0,29,11,0,0,0,0,34,0,27,32,0,0,0,43,0,0,24,0,0,14,0,61,45,18,0,0,3,0,0,0,0,26,0,42,0,0,0,60,17,0,0,0,0,41,0,59,16,0,0,58,0,57,0,63,47,48,0,0,49,20};
+SI tp_mask first_bit(tp_mask x,int y)
+{
+ /* ones has the form 2**(tz+1)-1, where tz is the number of trailing zeros.  */
+ tp_mask ones=x^(x-1);
+ /* Calculate perfect hash.*/
+ return first_bit_hash[(903385529620038207L*ones)>>57];
+}
+#else
+SI tp_mask first_bit(tp_mask x,int y)
+{
+ return __builtin_ctzl(x);
+}
+#endif
+SI tp_mask bit_i(int i)
+{
+ return ((tp_mask) 1)<<i;
+}
+
+MASK_OP(get_bit , x&bit_i(y))
+MASK_OP(shift_down , x>>y )
+MASK_OP(shift_up , x<<y )
+MASK_OP(forget_first_bit, x&(x-1))
+MASK_OP(forget_before , x&((y>=PARA) ? 0 : ((y<0) ? x :\
+ shift_up( (tp_mask)-1,y))))
+MASK_OP(forget_after , x&((y>=PARA) ? x : ((y<0) ? 0 :\
+ shift_down((tp_mask)-1,63-y))))
+
+
+BIN_OP(TEST_EQ,_mm_cmpeq_epi8( x,y))
+#define TEST_ZERO(x) TEST_EQ(x,vzero)
+BIN_OP(AND ,_mm_and_si128( x,y))
+BIN_OP(OR ,_mm_or_si128( x,y))
+BIN_OP(ANDNOT,_mm_andnot_si128(y,x))
+BIN_OP(XOR ,_mm_xor_si128( x,y))
+BIN_OP(ADD ,_mm_add_epi8( x,y))
+BIN_OP(SUB ,_mm_sub_epi8( x,y))
+#define HAS_PARALLEL_MIN
+BIN_OP(MINI ,_mm_min_epu8( x,y))
+
+#define SHIFT_DOWN _mm_srli_si128
+#define SHIFT_UP _mm_slli_si128
+
+#define CONCAT(x,y,n) ((n==0) ? (y) : ((n==BYTES_AT_ONCE) ? (x) : \
+ _HAS_SSSE3( _mm_alignr_epi8(x,y,n),\
+ OR(SHIFT_UP(x,BYTES_AT_ONCE-(n)),SHIFT_DOWN(y,(n))))))
+
+
+
+
+SI tp_vector TEST_RANGE(tp_vector x,tp_vector y,tp_vector z)
+{
+ /*We use signed comparison.*/
+ tp_vector fv=ADD(BROADCAST(128),x);
+ tp_vector v=SUB(y,fv);
+ tp_vector tv=SUB(ADD(z,BROADCAST(1)),fv);
+ return _mm_cmplt_epi8(v,tv);
+}
+
+SI tp_vector TEST_RANGE_C(tp_vector v,uchar from,uchar to)
+{
+ /* If gcc did constant folding on sse we could just use
+ TEST_RANGE(BROADCAST('A'),v,BROADCAST('Z'));.*/
+ tp_vector fv=BROADCAST(-128-from);
+ v=_mm_add_epi8(v,fv);
+ tp_vector tv=BROADCAST(-128+to-from+1);
+ return _mm_cmplt_epi8(v,tv);
+}
+
+SI tp_vector parallel_tolower(tp_vector m)
+{
+ int i;
+ tp_vector high_bit=BROADCAST(128);
+ tp_vector l= AND(TEST_RANGE_C(m,'A','Z'),high_bit);
+ m=OR(m,_mm_srli_epi64(l,2));
+ if (get_mask(m))
+ for(i=0; i<BYTES_AT_ONCE; i++)
+ {
+ ((uchar*)&m)[i]=tolower_fixed[((uchar*)&m)[i]];
+ }
+ return m;
+}
+
+
+
+#if unroll==1
+#define AGREGATE_MASK mask0
+#elif unroll==2
+#define AGREGATE_MASK (mask0|(mask1<<16))
+#elif unroll==4
+/* ORs mask0..mask3 at 16-bit offsets; one dependency less than mask0|(mask1<<16)|(mask2<<32)|(mask3<<48).  Fully parenthesized so it is safe inside larger expressions.  */
+#define AGREGATE_MASK ((mask0|(mask1<<16))|((mask2|(mask3<<16))<<32))
+#endif
diff --git a/sysdeps/x86_64/strchrnul.S b/sysdeps/x86_64/strchrnul.S
deleted file mode 100644
index baf3076..0000000
--- a/sysdeps/x86_64/strchrnul.S
+++ /dev/null
@@ -1,62 +0,0 @@
-/* strchrnul (str, ch) -- Return pointer to first occurrence of CH in STR
- or terminating NUL byte.
- For AMD x86-64.
- Copyright (C) 2009 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-
- .text
-ENTRY (__strchrnul)
- movd %esi, %xmm1
- movq %rdi, %rcx
- punpcklbw %xmm1, %xmm1
- andq $~15, %rdi
- pxor %xmm2, %xmm2
- punpcklbw %xmm1, %xmm1
- orl $0xffffffff, %esi
- movdqa (%rdi), %xmm0
- pshufd $0, %xmm1, %xmm1
- subq %rdi, %rcx
- movdqa %xmm0, %xmm3
- leaq 16(%rdi), %rdi
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm2, %xmm3
- shl %cl, %esi
- pmovmskb %xmm0, %edx
- pmovmskb %xmm3, %ecx
- orl %edx, %ecx
- andl %esi, %ecx
- jnz 1f
-
-2: movdqa (%rdi), %xmm0
- leaq 16(%rdi), %rdi
- movdqa %xmm0, %xmm3
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm2, %xmm3
- pmovmskb %xmm0, %edx
- pmovmskb %xmm3, %ecx
- orl %edx, %ecx
- jz 2b
-
-1: bsfl %ecx, %edx
- leaq -16(%rdi,%rdx), %rax
- ret
-END (__strchrnul)
-
-weak_alias (__strchrnul, strchrnul)
diff --git a/sysdeps/x86_64/strnlen.S b/sysdeps/x86_64/strnlen.S
deleted file mode 100644
index 7b38bf4..0000000
--- a/sysdeps/x86_64/strnlen.S
+++ /dev/null
@@ -1,63 +0,0 @@
-/* strnlen(str,maxlen) -- determine the length of the string STR up to MAXLEN.
- Copyright (C) 2010 Free Software Foundation, Inc.
- Contributed by Ulrich Drepper <drepper@redhat.com>.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-
- .text
-ENTRY(__strnlen)
- movq %rsi, %rax
- testq %rsi, %rsi
- jz 3f
- pxor %xmm2, %xmm2
- movq %rdi, %rcx
- movq %rdi, %r8
- movq $16, %r9
- andq $~15, %rdi
- movdqa %xmm2, %xmm1
- pcmpeqb (%rdi), %xmm2
- orl $0xffffffff, %r10d
- subq %rdi, %rcx
- shll %cl, %r10d
- subq %rcx, %r9
- pmovmskb %xmm2, %edx
- andl %r10d, %edx
- jnz 1f
- subq %r9, %rsi
- jbe 3f
-
-2: movdqa 16(%rdi), %xmm0
- leaq 16(%rdi), %rdi
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %edx
- testl %edx, %edx
- jnz 1f
- subq $16, %rsi
- jnbe 2b
-3: ret
-
-1: subq %r8, %rdi
- bsfl %edx, %edx
- addq %rdi, %rdx
- cmpq %rdx, %rax
- cmovnbq %rdx, %rax
- ret
-END(__strnlen)
-weak_alias (__strnlen, strnlen)
-libc_hidden_def (strnlen)
diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S
deleted file mode 100644
index a5397e7..0000000
--- a/sysdeps/x86_64/strrchr.S
+++ /dev/null
@@ -1,80 +0,0 @@
-/* strrchr (str, ch) -- Return pointer to last occurrence of CH in STR.
- For AMD x86-64.
- Copyright (C) 2009 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-
- .text
-ENTRY (strrchr)
- movd %esi, %xmm1
- movq %rdi, %rcx
- punpcklbw %xmm1, %xmm1
- andq $~15, %rdi
- pxor %xmm2, %xmm2
- punpcklbw %xmm1, %xmm1
- orl $0xffffffff, %esi
- movdqa (%rdi), %xmm0
- pshufd $0, %xmm1, %xmm1
- subq %rdi, %rcx
- movdqa %xmm0, %xmm3
- leaq 16(%rdi), %rdi
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm2, %xmm3
- shl %cl, %esi
- pmovmskb %xmm0, %edx
- pmovmskb %xmm3, %ecx
- andl %esi, %edx
- andl %esi, %ecx
- xorl %eax, %eax
- movl %edx, %esi
- orl %ecx, %esi
- jnz 1f
-
-2: movdqa (%rdi), %xmm0
- leaq 16(%rdi), %rdi
- movdqa %xmm0, %xmm3
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm2, %xmm3
- pmovmskb %xmm0, %edx
- pmovmskb %xmm3, %ecx
- movl %edx, %esi
- orl %ecx, %esi
- jz 2b
-
-1: bsfl %ecx, %r9d
- movl $0xffffffff, %r8d
- movl $31, %ecx
- jnz 5f
-
- bsrl %edx, %edx
- jz 2b
- leaq -16(%rdi,%rdx), %rax
- jmp 2b
-
-5: subl %r9d, %ecx
- shrl %cl, %r8d
- andl %r8d, %edx
- bsrl %edx, %edx
- jz 4f
- leaq -16(%rdi,%rdx), %rax
-4: ret
-END (strrchr)
-
-weak_alias (strrchr, rindex)
-libc_hidden_builtin_def (strrchr)
--
1.7.4.4