This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH] faster strcat


Why do you think this is faster?

You iterate over the whole src array twice instead of once as in the
current version (the first pass is strnlen and the second is strcpy; the
current assembler version uses something like strncpy instead, but
without strncpy's zero filling).

It is likely true that the current version inlines a slow strlen, but that
code can be switched to use another strlen version.

+  size_t dest_len = strlen (dest);
+  size_t src_len  = strnlen (src , n);

-  if (c != '\0')
-    *++s1 = '\0';
+  if (src_len == n)
+    {
+     memcpy (dest + dest_len, src, n);
+     dest[dest_len + n] = '\0';
+    }
+  else
+    strcpy (dest + dest_len, src);

-  return s;
+  return dest;
 }


--
Liubov Dmitrieva
Intel Corporation

2012/10/8 Ondřej Bílka <neleai@seznam.cz>
>
> This is the next version of my patch
> http://sourceware.org/ml/libc-alpha/2012-06/msg00489.html
>
> I investigated strcat a bit further, and the speed degradation
> was caused by improper usage of indirect functions.
>
> The strcat ifunc first tests bit_Fast_Unaligned_Load, which is
> false on core2 and AMD processors. It then checks for ssse3 and
> calls the ssse3 version.
> But strcat_ssse3 inlines strlen_sse2_no_bsf, which on core2 and phenomII
> is the slowest strlen variant unless the string length exceeds 2000, where
> strlen_sse2 takes the lead.
>
> Then I deleted strcat variants that are no longer needed.
>
> The files ports/sysdeps/ia64/strcat.c and sysdeps/powerpc/strcat.c became
> duplicates of string/strcat.c.
>
>
>         * string/strcat.c: Reduce algorithm selection
>           to strlen,strcpy
>         * string/strncat.c: Likewise
>         * sysdeps/powerpc/strcat.c: Duplicated string/strcat.c
>         * ports/sysdeps/ia64/strcat.c: Likewise
>
>         * sysdeps/i386/i686/multiarch/Makefile: Updated
>         * sysdeps/x86_64/multiarch/Makefile:    Updated
>
>         * sysdeps/i386/i486/strcat.S: No longer needed
>         * sysdeps/i386/i686/multiarch/strcat-sse2.S:Likewise
>         * sysdeps/i386/i686/multiarch/strcat-ssse3.S:Likewise
>         * sysdeps/i386/i686/multiarch/strcat.S:Likewise
>         * sysdeps/i386/i686/multiarch/strncat-c.c:Likewise
>         * sysdeps/i386/i686/multiarch/strncat-sse2.S:Likewise
>         * sysdeps/i386/i686/multiarch/strncat-ssse3.S:Likewise
>         * sysdeps/i386/i686/multiarch/strncat.S:Likewise
>         * sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S:Likewise
>         * sysdeps/x86_64/multiarch/strcat-ssse3.S:Likewise
>         * sysdeps/x86_64/multiarch/strcat.S:Likewise
>         * sysdeps/x86_64/multiarch/strncat-c.c:Likewise
>         * sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S:Likewise
>         * sysdeps/x86_64/multiarch/strncat-ssse3.S:Likewise
>         * sysdeps/x86_64/multiarch/strncat.S:Likewise
>         * sysdeps/x86_64/strcat.S:Likewise
>
>
> ---
>  ports/sysdeps/ia64/strcat.c                       |   26 -
>  string/strcat.c                                   |   29 +-
>  string/strncat.c                                  |   62 +-
>  sysdeps/i386/i486/strcat.S                        |  273 -----
>  sysdeps/i386/i686/multiarch/Makefile              |    3 +-
>  sysdeps/i386/i686/multiarch/strcat-sse2.S         | 1243
> ---------------------
>  sysdeps/i386/i686/multiarch/strcat-ssse3.S        |  572 ----------
>  sysdeps/i386/i686/multiarch/strcat.S              |  119 --
>  sysdeps/i386/i686/multiarch/strncat-c.c           |    8 -
>  sysdeps/i386/i686/multiarch/strncat-sse2.S        |    4 -
>  sysdeps/i386/i686/multiarch/strncat-ssse3.S       |    4 -
>  sysdeps/i386/i686/multiarch/strncat.S             |    3 -
>  sysdeps/powerpc/strcat.c                          |   30 -
>  sysdeps/x86_64/multiarch/Makefile                 |    5 +-
>  sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S  |   53 -
>  sysdeps/x86_64/multiarch/strcat-ssse3.S           |  557 ---------
>  sysdeps/x86_64/multiarch/strcat.S                 |   84 --
>  sysdeps/x86_64/multiarch/strncat-c.c              |    8 -
>  sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S |    3 -
>  sysdeps/x86_64/multiarch/strncat-ssse3.S          |    3 -
>  sysdeps/x86_64/multiarch/strncat.S                |    3 -
>  sysdeps/x86_64/strcat.S                           |  259 -----
>  24 files changed, 15 insertions(+), 4280 deletions(-)
>  delete mode 100644 ports/sysdeps/ia64/strcat.c
>  delete mode 100644 sysdeps/i386/i486/strcat.S
>  delete mode 100644 sysdeps/i386/i686/multiarch/strcat-sse2.S
>  delete mode 100644 sysdeps/i386/i686/multiarch/strcat-ssse3.S
>  delete mode 100644 sysdeps/i386/i686/multiarch/strcat.S
>  delete mode 100644 sysdeps/i386/i686/multiarch/strncat-c.c
>  delete mode 100644 sysdeps/i386/i686/multiarch/strncat-sse2.S
>  delete mode 100644 sysdeps/i386/i686/multiarch/strncat-ssse3.S
>  delete mode 100644 sysdeps/i386/i686/multiarch/strncat.S
>  delete mode 100644 sysdeps/powerpc/strcat.c
>  delete mode 100644 sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
>  delete mode 100644 sysdeps/x86_64/multiarch/strcat-ssse3.S
>  delete mode 100644 sysdeps/x86_64/multiarch/strcat.S
>  delete mode 100644 sysdeps/x86_64/multiarch/strncat-c.c
>  delete mode 100644 sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S
>  delete mode 100644 sysdeps/x86_64/multiarch/strncat-ssse3.S
>  delete mode 100644 sysdeps/x86_64/multiarch/strncat.S
>  delete mode 100644 sysdeps/x86_64/strcat.S
>
> diff --git a/ports/sysdeps/ia64/strcat.c b/ports/sysdeps/ia64/strcat.c
> deleted file mode 100644
> index 53cd4d1..0000000
> --- a/ports/sysdeps/ia64/strcat.c
> +++ /dev/null
> @@ -1,26 +0,0 @@
> -/* Copyright (C) 2004 Free Software Foundation, Inc.
> -   This file is part of the GNU C Library.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <http://www.gnu.org/licenses/>.  */
> -
> -#include <string.h>
> -
> -char *
> -strcat (char *dest, const char *src)
> -{
> -  strcpy (dest + strlen (dest), src);
> -  return dest;
> -}
> -libc_hidden_builtin_def (strcat)
> diff --git a/string/strcat.c b/string/strcat.c
> index f9e4bc6..28575d0 100644
> --- a/string/strcat.c
> +++ b/string/strcat.c
> @@ -1,4 +1,5 @@
> -/* Copyright (C) 1991, 1997, 2003 Free Software Foundation, Inc.
> +/* strcat version that uses fast strcpy/strlen.
> +   Copyright (C) 1997, 2003 Free Software Foundation, Inc.
>     This file is part of the GNU C Library.
>
>     The GNU C Library is free software; you can redistribute it and/or
> @@ -16,36 +17,14 @@
>     <http://www.gnu.org/licenses/>.  */
>
>  #include <string.h>
> -#include <memcopy.h>
>
>  #undef strcat
>
>  /* Append SRC on the end of DEST.  */
>  char *
> -strcat (dest, src)
> -     char *dest;
> -     const char *src;
> +strcat (char *dest, const char *src)
>  {
> -  char *s1 = dest;
> -  const char *s2 = src;
> -  char c;
> -
> -  /* Find the end of the string.  */
> -  do
> -    c = *s1++;
> -  while (c != '\0');
> -
> -  /* Make S1 point before the next character, so we can increment
> -     it while memory is read (wins on pipelined cpus).  */
> -  s1 -= 2;
> -
> -  do
> -    {
> -      c = *s2++;
> -      *++s1 = c;
> -    }
> -  while (c != '\0');
> -
> +  strcpy (dest + strlen (dest), src);
>    return dest;
>  }
>  libc_hidden_builtin_def (strcat)
> diff --git a/string/strncat.c b/string/strncat.c
> index dcfb04d..17b4c9a 100644
> --- a/string/strncat.c
> +++ b/string/strncat.c
> @@ -1,4 +1,4 @@
> -/* Copyright (C) 1991,1997,2011 Free Software Foundation, Inc.
> +/* Copyright (C) 1991-2012 Free Software Foundation, Inc.
>     This file is part of the GNU C Library.
>
>     The GNU C Library is free software; you can redistribute it and/or
> @@ -17,66 +17,20 @@
>
>  #include <string.h>
>
> -#ifdef _LIBC
> -# include <memcopy.h>
> -#endif
> -
>  #ifndef STRNCAT
>  # undef strncat
>  # define STRNCAT  strncat
>  #endif
>
>  char *
> -STRNCAT (char *s1, const char *s2, size_t n)
> +STRNCAT (char *dest, const char *src, size_t n)
>  {
> -  char c;
> -  char *s = s1;
> -
> -  /* Find the end of S1.  */
> -  do
> -    c = *s1++;
> -  while (c != '\0');
> -
> -  /* Make S1 point before next character, so we can increment
> -     it while memory is read (wins on pipelined cpus).  */
> -  s1 -= 2;
> -
> -  if (n >= 4)
> -    {
> -      size_t n4 = n >> 2;
> -      do
> -       {
> -         c = *s2++;
> -         *++s1 = c;
> -         if (c == '\0')
> -           return s;
> -         c = *s2++;
> -         *++s1 = c;
> -         if (c == '\0')
> -           return s;
> -         c = *s2++;
> -         *++s1 = c;
> -         if (c == '\0')
> -           return s;
> -         c = *s2++;
> -         *++s1 = c;
> -         if (c == '\0')
> -           return s;
> -       } while (--n4 > 0);
> -      n &= 3;
> -    }
> -
> -  while (n > 0)
> -    {
> -      c = *s2++;
> -      *++s1 = c;
> -      if (c == '\0')
> -       return s;
> -      n--;
> -    }
> +  size_t dest_len = strlen (dest);
> +  size_t src_len  = strnlen (src , n);
>
> -  if (c != '\0')
> -    *++s1 = '\0';
> +  if (src_len == n)
> +    {
> +     memcpy (dest + dest_len, src, n);
> +     dest[dest_len + n] = '\0';
> +    }
> +  else
> +    strcpy (dest + dest_len, src);
>
> -  return s;
> +  return dest;
>  }
> diff --git a/sysdeps/i386/i486/strcat.S b/sysdeps/i386/i486/strcat.S
> deleted file mode 100644
> index 7596a0d..0000000
> --- a/sysdeps/i386/i486/strcat.S
> +++ /dev/null
> @@ -1,273 +0,0 @@
> -/* strcat(dest, src) -- Append SRC on the end of DEST.
> -   For Intel 80x86, x>=4.
> -   Copyright (C) 1994-1997,2000,2003,2005 Free Software Foundation, Inc.
> -   This file is part of the GNU C Library.
> -   Contributed by Ulrich Drepper <drepper@ipd.info.uni-karlsruhe.de>.
> -   Optimised a little by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <http://www.gnu.org/licenses/>.  */
> -
> -#include <sysdep.h>
> -#include "asm-syntax.h"
> -#include "bp-sym.h"
> -#include "bp-asm.h"
> -
> -#define PARMS  LINKAGE+4       /* space for 1 saved reg */
> -#define RTN    PARMS
> -#define DEST   RTN+RTN_SIZE
> -#define SRC    DEST+PTR_SIZE
> -
> -       .text
> -ENTRY (BP_SYM (strcat))
> -       ENTER
> -
> -       pushl %edi              /* Save callee-safe register.  */
> -       cfi_adjust_cfa_offset (4)
> -
> -       movl DEST(%esp), %edx
> -       movl SRC(%esp), %ecx
> -       CHECK_BOUNDS_LOW (%edx, DEST(%esp))
> -       CHECK_BOUNDS_LOW (%ecx, SRC(%esp))
> -
> -       testb $0xff, (%ecx)     /* Is source string empty? */
> -       jz L(8)                 /* yes => return */
> -
> -       /* Test the first bytes separately until destination is aligned.
> */
> -       testl $3, %edx          /* destination pointer aligned? */
> -       jz L(1)                 /* yes => begin scan loop */
> -       testb $0xff, (%edx)     /* is end of string? */
> -       jz L(2)                 /* yes => start appending */
> -       incl %edx               /* increment source pointer */
> -
> -       testl $3, %edx          /* destination pointer aligned? */
> -       jz L(1)                 /* yes => begin scan loop */
> -       testb $0xff, (%edx)     /* is end of string? */
> -       jz L(2)                 /* yes => start appending */
> -       incl %edx               /* increment source pointer */
> -
> -       testl $3, %edx          /* destination pointer aligned? */
> -       jz L(1)                 /* yes => begin scan loop */
> -       testb $0xff, (%edx)     /* is end of string? */
> -       jz L(2)                 /* yes => start appending */
> -       incl %edx               /* increment source pointer */
> -
> -       /* Now we are aligned.  Begin scan loop.  */
> -       jmp L(1)
> -
> -       cfi_rel_offset (edi, 0)
> -       ALIGN(4)
> -
> -L(4):  addl $16,%edx           /* increment destination pointer for round
> */
> -
> -L(1):  movl (%edx), %eax       /* get word (= 4 bytes) in question */
> -       movl $0xfefefeff, %edi  /* magic value */
> -
> -       /* If you compare this with the algorithm in memchr.S you will
> -          notice that here is an `xorl' statement missing.  But you must
> -          not forget that we are looking for C == 0 and `xorl $0, %eax'
> -          is a no-op.  */
> -
> -       addl %eax, %edi         /* add the magic value to the word.  We
> get
> -                                  carry bits reported for each byte which
> -                                  is *not* 0 */
> -
> -       /* According to the algorithm we had to reverse the effect of the
> -          XOR first and then test the overflow bits.  But because the
> -          following XOR would destroy the carry flag and it would (in a
> -          representation with more than 32 bits) not alter then last
> -          overflow, we can now test this condition.  If no carry is
> signaled
> -          no overflow must have occurred in the last byte => it was 0. */
> -       jnc L(3)
> -
> -       /* We are only interested in carry bits that change due to the
> -          previous add, so remove original bits */
> -       xorl %eax, %edi         /* ((word^charmask)+magic)^(word^charmask)
> */
> -
> -       /* Now test for the other three overflow bits.  */
> -       orl $0xfefefeff, %edi   /* set all non-carry bits */
> -       incl %edi               /* add 1: if one carry bit was *not* set
> -                                  the addition will not result in 0.  */
> -
> -       /* If at least one byte of the word is C we don't get 0 in %ecx.
> */
> -       jnz L(3)
> -
> -       movl 4(%edx), %eax      /* get word from source */
> -       movl $0xfefefeff, %edi  /* magic value */
> -       addl %eax, %edi         /* add the magic value to the word.  We
> get
> -                                  carry bits reported for each byte which
> -                                  is *not* 0 */
> -       jnc L(5)                /* highest byte is C => stop copying */
> -       xorl %eax, %edi         /* ((word^charmask)+magic)^(word^charmask)
> */
> -       orl $0xfefefeff, %edi   /* set all non-carry bits */
> -       incl %edi               /* add 1: if one carry bit was *not* set
> -                                  the addition will not result in 0.  */
> -       jnz L(5)                /* one byte is NUL => stop copying */
> -
> -       movl 8(%edx), %eax      /* get word from source */
> -       movl $0xfefefeff, %edi  /* magic value */
> -       addl %eax, %edi         /* add the magic value to the word.  We
> get
> -                                  carry bits reported for each byte which
> -                                  is *not* 0 */
> -       jnc L(6)                /* highest byte is C => stop copying */
> -       xorl %eax, %edi         /* ((word^charmask)+magic)^(word^charmask)
> */
> -       orl $0xfefefeff, %edi   /* set all non-carry bits */
> -       incl %edi               /* add 1: if one carry bit was *not* set
> -                                  the addition will not result in 0.  */
> -       jnz L(6)                /* one byte is NUL => stop copying */
> -
> -       movl 12(%edx), %eax     /* get word from source */
> -       movl $0xfefefeff, %edi  /* magic value */
> -       addl %eax, %edi         /* add the magic value to the word.  We
> get
> -                                  carry bits reported for each byte which
> -                                  is *not* 0 */
> -       jnc L(7)                /* highest byte is C => stop copying */
> -       xorl %eax, %edi         /* ((word^charmask)+magic)^(word^charmask)
> */
> -       orl $0xfefefeff, %edi   /* set all non-carry bits */
> -       incl %edi               /* add 1: if one carry bit was *not* set
> -                                  the addition will not result in 0.  */
> -       jz L(4)                 /* no byte is NUL => carry on copying */
> -
> -L(7):  addl $4, %edx           /* adjust source pointer */
> -L(6):  addl $4, %edx
> -L(5):  addl $4, %edx
> -
> -L(3):  testb %al, %al          /* is first byte NUL? */
> -       jz L(2)                 /* yes => start copying */
> -       incl %edx               /* increment source pointer */
> -
> -       testb %ah, %ah          /* is second byte NUL? */
> -       jz L(2)                 /* yes => start copying */
> -       incl %edx               /* increment source pointer */
> -
> -       testl $0xff0000, %eax   /* is third byte NUL? */
> -       jz L(2)                 /* yes => start copying */
> -       incl %edx               /* increment source pointer */
> -
> -L(2):  subl %ecx, %edx         /* reduce number of loop variants */
> -
> -       /* Now we have to align the source pointer.  */
> -       testl $3, %ecx          /* pointer correctly aligned? */
> -       jz L(29)                /* yes => start copy loop */
> -       movb (%ecx), %al        /* get first byte */
> -       movb %al, (%ecx,%edx)   /* and store it */
> -       andb %al, %al           /* is byte NUL? */
> -       jz L(8)                 /* yes => return */
> -       incl %ecx               /* increment pointer */
> -
> -       testl $3, %ecx          /* pointer correctly aligned? */
> -       jz L(29)                /* yes => start copy loop */
> -       movb (%ecx), %al        /* get first byte */
> -       movb %al, (%ecx,%edx)   /* and store it */
> -       andb %al, %al           /* is byte NUL? */
> -       jz L(8)                 /* yes => return */
> -       incl %ecx               /* increment pointer */
> -
> -       testl $3, %ecx          /* pointer correctly aligned? */
> -       jz L(29)                /* yes => start copy loop */
> -       movb (%ecx), %al        /* get first byte */
> -       movb %al, (%ecx,%edx)   /* and store it */
> -       andb %al, %al           /* is byte NUL? */
> -       jz L(8)                 /* yes => return */
> -       incl %ecx               /* increment pointer */
> -
> -       /* Now we are aligned.  */
> -       jmp L(29)               /* start copy loop */
> -
> -       ALIGN(4)
> -
> -L(28): movl %eax, 12(%ecx,%edx)/* store word at destination */
> -       addl $16, %ecx          /* adjust pointer for full round */
> -
> -L(29): movl (%ecx), %eax       /* get word from source */
> -       movl $0xfefefeff, %edi  /* magic value */
> -       addl %eax, %edi         /* add the magic value to the word.  We
> get
> -                                  carry bits reported for each byte which
> -                                  is *not* 0 */
> -       jnc L(9)                /* highest byte is C => stop copying */
> -       xorl %eax, %edi         /* ((word^charmask)+magic)^(word^charmask)
> */
> -       orl $0xfefefeff, %edi   /* set all non-carry bits */
> -       incl %edi               /* add 1: if one carry bit was *not* set
> -                                  the addition will not result in 0.  */
> -       jnz L(9)                /* one byte is NUL => stop copying */
> -       movl %eax, (%ecx,%edx)  /* store word to destination */
> -
> -       movl 4(%ecx), %eax      /* get word from source */
> -       movl $0xfefefeff, %edi  /* magic value */
> -       addl %eax, %edi         /* add the magic value to the word.  We
> get
> -                                  carry bits reported for each byte which
> -                                  is *not* 0 */
> -       jnc L(91)               /* highest byte is C => stop copying */
> -       xorl %eax, %edi         /* ((word^charmask)+magic)^(word^charmask)
> */
> -       orl $0xfefefeff, %edi   /* set all non-carry bits */
> -       incl %edi               /* add 1: if one carry bit was *not* set
> -                                  the addition will not result in 0.  */
> -       jnz L(91)               /* one byte is NUL => stop copying */
> -       movl %eax, 4(%ecx,%edx) /* store word to destination */
> -
> -       movl 8(%ecx), %eax      /* get word from source */
> -       movl $0xfefefeff, %edi  /* magic value */
> -       addl %eax, %edi         /* add the magic value to the word.  We
> get
> -                                  carry bits reported for each byte which
> -                                  is *not* 0 */
> -       jnc L(92)               /* highest byte is C => stop copying */
> -       xorl %eax, %edi         /* ((word^charmask)+magic)^(word^charmask)
> */
> -       orl $0xfefefeff, %edi   /* set all non-carry bits */
> -       incl %edi               /* add 1: if one carry bit was *not* set
> -                                  the addition will not result in 0.  */
> -       jnz L(92)               /* one byte is NUL => stop copying */
> -       movl %eax, 8(%ecx,%edx) /* store word to destination */
> -
> -       movl 12(%ecx), %eax     /* get word from source */
> -       movl $0xfefefeff, %edi  /* magic value */
> -       addl %eax, %edi         /* add the magic value to the word.  We
> get
> -                                  carry bits reported for each byte which
> -                                  is *not* 0 */
> -       jnc L(93)               /* highest byte is C => stop copying */
> -       xorl %eax, %edi         /* ((word^charmask)+magic)^(word^charmask)
> */
> -       orl $0xfefefeff, %edi   /* set all non-carry bits */
> -       incl %edi               /* add 1: if one carry bit was *not* set
> -                                  the addition will not result in 0.  */
> -       jz L(28)                /* no is NUL => carry on copying */
> -
> -L(93): addl $4, %ecx           /* adjust pointer */
> -L(92): addl $4, %ecx
> -L(91): addl $4, %ecx
> -
> -L(9):  movb %al, (%ecx,%edx)   /* store first byte of last word */
> -       orb %al, %al            /* is it NUL? */
> -       jz L(8)                 /* yes => return */
> -
> -       movb %ah, 1(%ecx,%edx)  /* store second byte of last word */
> -       orb %ah, %ah            /* is it NUL? */
> -       jz L(8)                 /* yes => return */
> -
> -       shrl $16, %eax          /* make upper bytes accessible */
> -       movb %al, 2(%ecx,%edx)  /* store third byte of last word */
> -       orb %al, %al            /* is it NUL? */
> -       jz L(8)                 /* yes => return */
> -
> -       movb %ah, 3(%ecx,%edx)  /* store fourth byte of last word */
> -
> -L(8):  /* GKM FIXME: check high bounds */
> -       movl DEST(%esp), %eax   /* start address of destination is result
> */
> -       RETURN_BOUNDED_POINTER (DEST(%esp))
> -       popl %edi               /* restore saved register */
> -       cfi_adjust_cfa_offset (-4)
> -       cfi_restore (edi)
> -
> -       LEAVE
> -       RET_PTR
> -END (BP_SYM (strcat))
> -libc_hidden_builtin_def (strcat)
> diff --git a/sysdeps/i386/i686/multiarch/Makefile
> b/sysdeps/i386/i686/multiarch/Makefile
> index 8946bfa..92a2b8f 100644
> --- a/sysdeps/i386/i686/multiarch/Makefile
> +++ b/sysdeps/i386/i686/multiarch/Makefile
> @@ -14,8 +14,7 @@ sysdep_routines += bzero-sse2 memset-sse2 memcpy-ssse3
> mempcpy-ssse3 \
>                    memcmp-ssse3 memcmp-sse4 strcasestr-nonascii varshift \
>                    strlen-sse2 strlen-sse2-bsf strncpy-c strcpy-ssse3 \
>                    strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 strcpy-sse2 \
> -                  strncpy-sse2 stpcpy-sse2 stpncpy-sse2 strcat-ssse3 \
> -                  strcat-sse2 strncat-ssse3 strncat-sse2 strncat-c \
> +                  strncpy-sse2 stpcpy-sse2 stpncpy-sse2 \
>                    strchr-sse2 strrchr-sse2 strchr-sse2-bsf
> strrchr-sse2-bsf \
>                    memchr-sse2 memchr-sse2-bsf \
>                    memrchr-sse2 memrchr-sse2-bsf memrchr-c \
> diff --git a/sysdeps/i386/i686/multiarch/strcat-sse2.S
> b/sysdeps/i386/i686/multiarch/strcat-sse2.S
> deleted file mode 100644
> index e75f92c..0000000
> --- a/sysdeps/i386/i686/multiarch/strcat-sse2.S
> +++ /dev/null
> @@ -1,1243 +0,0 @@
> -/* strcat with SSE2
> -   Copyright (C) 2011-2012 Free Software Foundation, Inc.
> -   Contributed by Intel Corporation.
> -   This file is part of the GNU C Library.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <http://www.gnu.org/licenses/>.  */
> -
> -
> -#ifndef NOT_IN_libc
> -
> -# include <sysdep.h>
> -
> -
> -# define CFI_PUSH(REG) \
> -       cfi_adjust_cfa_offset (4);      \
> -       cfi_rel_offset (REG, 0)
> -
> -# define CFI_POP(REG)  \
> -       cfi_adjust_cfa_offset (-4);     \
> -       cfi_restore (REG)
> -
> -# define PUSH(REG) pushl REG; CFI_PUSH (REG)
> -# define POP(REG) popl REG; CFI_POP (REG)
> -
> -# ifdef SHARED
> -#  define JMPTBL(I, B) I - B
> -
> -/* Load an entry in a jump table into ECX and branch to it.  TABLE is a
> -       jump table with relative offsets.  INDEX is a register contains
> the
> -       index into the jump table.   SCALE is the scale of INDEX. */
> -
> -#  define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)  \
> -       /* We first load PC into ECX.  */       \
> -       SETUP_PIC_REG(cx);      \
> -       /* Get the address of the jump table.  */       \
> -       addl    $(TABLE - .), %ecx;     \
> -       /* Get the entry and convert the relative offset to the \
> -       absolute address.  */   \
> -       addl    (%ecx,INDEX,SCALE), %ecx;       \
> -       /* We loaded the jump table and adjuested ECX. Go.  */  \
> -       jmp     *%ecx
> -# else
> -#  define JMPTBL(I, B) I
> -
> -/* Branch to an entry in a jump table.  TABLE is a jump table with
> -       absolute offsets.  INDEX is a register contains the index into the
> -       jump table.  SCALE is the scale of INDEX. */
> -
> -#  define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)  \
> -       jmp     *TABLE(,INDEX,SCALE)
> -# endif
> -
> -# ifndef STRCAT
> -#  define STRCAT  __strcat_sse2
> -# endif
> -
> -# define PARMS  4
> -# define STR1  PARMS+4
> -# define STR2  STR1+4
> -
> -# ifdef USE_AS_STRNCAT
> -#  define LEN    STR2+8
> -#  define STR3   STR1+4
> -# else
> -#  define STR3   STR1
> -# endif
> -
> -# define USE_AS_STRCAT
> -# ifdef USE_AS_STRNCAT
> -#  define RETURN  POP(%ebx); POP(%esi); ret; CFI_PUSH(%ebx);
> CFI_PUSH(%esi);
> -# else
> -#  define RETURN  POP(%esi); ret; CFI_PUSH(%esi);
> -# endif
> -
> -.text
> -ENTRY (STRCAT)
> -       PUSH    (%esi)
> -       mov     STR1(%esp), %eax
> -       mov     STR2(%esp), %esi
> -# ifdef USE_AS_STRNCAT
> -       PUSH    (%ebx)
> -       movl    LEN(%esp), %ebx
> -       test    %ebx, %ebx
> -       jz      L(ExitZero)
> -# endif
> -       cmpb    $0, (%esi)
> -       mov     %esi, %ecx
> -       mov     %eax, %edx
> -       jz      L(ExitZero)
> -
> -       and     $63, %ecx
> -       and     $63, %edx
> -       cmp     $32, %ecx
> -       ja      L(StrlenCore7_1)
> -       cmp     $48, %edx
> -       ja      L(alignment_prolog)
> -
> -       pxor    %xmm0, %xmm0
> -       pxor    %xmm4, %xmm4
> -       pxor    %xmm7, %xmm7
> -       movdqu  (%eax), %xmm1
> -       movdqu  (%esi), %xmm5
> -       pcmpeqb %xmm1, %xmm0
> -       movdqu  16(%esi), %xmm6
> -       pmovmskb %xmm0, %ecx
> -       pcmpeqb %xmm5, %xmm4
> -       pcmpeqb %xmm6, %xmm7
> -       test    %ecx, %ecx
> -       jnz     L(exit_less16_)
> -       mov     %eax, %ecx
> -       and     $-16, %eax
> -       jmp     L(loop_prolog)
> -
> -L(alignment_prolog):
> -       pxor    %xmm0, %xmm0
> -       pxor    %xmm4, %xmm4
> -       mov     %edx, %ecx
> -       pxor    %xmm7, %xmm7
> -       and     $15, %ecx
> -       and     $-16, %eax
> -       pcmpeqb (%eax), %xmm0
> -       movdqu  (%esi), %xmm5
> -       movdqu  16(%esi), %xmm6
> -       pmovmskb %xmm0, %edx
> -       pcmpeqb %xmm5, %xmm4
> -       shr     %cl, %edx
> -       pcmpeqb %xmm6, %xmm7
> -       test    %edx, %edx
> -       jnz     L(exit_less16)
> -       add     %eax, %ecx
> -
> -       pxor    %xmm0, %xmm0
> -L(loop_prolog):
> -       pxor    %xmm1, %xmm1
> -       pxor    %xmm2, %xmm2
> -       pxor    %xmm3, %xmm3
> -       .p2align 4
> -L(align16_loop):
> -       pcmpeqb 16(%eax), %xmm0
> -       pmovmskb %xmm0, %edx
> -       test    %edx, %edx
> -       jnz     L(exit16)
> -
> -       pcmpeqb 32(%eax), %xmm1
> -       pmovmskb %xmm1, %edx
> -       test    %edx, %edx
> -       jnz     L(exit32)
> -
> -       pcmpeqb 48(%eax), %xmm2
> -       pmovmskb %xmm2, %edx
> -       test    %edx, %edx
> -       jnz     L(exit48)
> -
> -       pcmpeqb 64(%eax), %xmm3
> -       pmovmskb %xmm3, %edx
> -       lea     64(%eax), %eax
> -       test    %edx, %edx
> -       jz      L(align16_loop)
> -       bsf     %edx, %edx
> -       add     %edx, %eax
> -       jmp     L(StartStrcpyPart)
> -
> -       .p2align 4
> -L(exit16):
> -       bsf     %edx, %edx
> -       lea     16(%eax, %edx), %eax
> -       jmp     L(StartStrcpyPart)
> -
> -       .p2align 4
> -L(exit32):
> -       bsf     %edx, %edx
> -       lea     32(%eax, %edx), %eax
> -       jmp     L(StartStrcpyPart)
> -
> -       .p2align 4
> -L(exit48):
> -       bsf     %edx, %edx
> -       lea     48(%eax, %edx), %eax
> -       jmp     L(StartStrcpyPart)
> -
> -       .p2align 4
> -L(exit_less16):
> -       bsf     %edx, %edx
> -       add     %ecx, %eax
> -       add     %edx, %eax
> -       jmp     L(StartStrcpyPart)
> -
> -       .p2align 4
> -L(exit_less16_):
> -       bsf     %ecx, %ecx
> -       add     %ecx, %eax
> -
> -       .p2align 4
> -L(StartStrcpyPart):
> -       pmovmskb %xmm4, %edx
> -# ifdef USE_AS_STRNCAT
> -       cmp     $16, %ebx
> -       jbe     L(CopyFrom1To16BytesTail1Case2OrCase3)
> -# endif
> -       test    %edx, %edx
> -       jnz     L(CopyFrom1To16BytesTail1)
> -
> -       movdqu  %xmm5, (%eax)
> -       pmovmskb %xmm7, %edx
> -# ifdef USE_AS_STRNCAT
> -       cmp     $32, %ebx
> -       jbe     L(CopyFrom1To32Bytes1Case2OrCase3)
> -# endif
> -       test    %edx, %edx
> -       jnz     L(CopyFrom1To32Bytes1)
> -
> -       mov     %esi, %ecx
> -       and     $-16, %esi
> -       and     $15, %ecx
> -       pxor    %xmm0, %xmm0
> -# ifdef USE_AS_STRNCAT
> -       add     %ecx, %ebx
> -# endif
> -       sub     %ecx, %eax
> -       jmp     L(Unalign16Both)
> -
> -L(StrlenCore7_1):
> -       mov     %eax, %ecx
> -       pxor    %xmm0, %xmm0
> -       and     $15, %ecx
> -       and     $-16, %eax
> -       pcmpeqb (%eax), %xmm0
> -       pmovmskb %xmm0, %edx
> -       shr     %cl, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_less16_1)
> -       add     %eax, %ecx
> -
> -       pxor    %xmm0, %xmm0
> -       pxor    %xmm1, %xmm1
> -       pxor    %xmm2, %xmm2
> -       pxor    %xmm3, %xmm3
> -
> -       .p2align 4
> -L(align16_loop_1):
> -       pcmpeqb 16(%eax), %xmm0
> -       pmovmskb %xmm0, %edx
> -       test    %edx, %edx
> -       jnz     L(exit16_1)
> -
> -       pcmpeqb 32(%eax), %xmm1
> -       pmovmskb %xmm1, %edx
> -       test    %edx, %edx
> -       jnz     L(exit32_1)
> -
> -       pcmpeqb 48(%eax), %xmm2
> -       pmovmskb %xmm2, %edx
> -       test    %edx, %edx
> -       jnz     L(exit48_1)
> -
> -       pcmpeqb 64(%eax), %xmm3
> -       pmovmskb %xmm3, %edx
> -       lea     64(%eax), %eax
> -       test    %edx, %edx
> -       jz      L(align16_loop_1)
> -       bsf     %edx, %edx
> -       add     %edx, %eax
> -       jmp     L(StartStrcpyPart_1)
> -
> -       .p2align 4
> -L(exit16_1):
> -       bsf     %edx, %edx
> -       lea     16(%eax, %edx), %eax
> -       jmp     L(StartStrcpyPart_1)
> -
> -       .p2align 4
> -L(exit32_1):
> -       bsf     %edx, %edx
> -       lea     32(%eax, %edx), %eax
> -       jmp     L(StartStrcpyPart_1)
> -
> -       .p2align 4
> -L(exit48_1):
> -       bsf     %edx, %edx
> -       lea     48(%eax, %edx), %eax
> -       jmp     L(StartStrcpyPart_1)
> -
> -       .p2align 4
> -L(exit_less16_1):
> -       bsf     %edx, %edx
> -       add     %ecx, %eax
> -       add     %edx, %eax
> -
> -       .p2align 4
> -L(StartStrcpyPart_1):
> -       mov     %esi, %ecx
> -       and     $15, %ecx
> -       and     $-16, %esi
> -       pxor    %xmm0, %xmm0
> -       pxor    %xmm1, %xmm1
> -
> -# ifdef USE_AS_STRNCAT
> -       cmp     $48, %ebx
> -       ja      L(BigN)
> -# endif
> -       pcmpeqb (%esi), %xmm1
> -# ifdef USE_AS_STRNCAT
> -       add     %ecx, %ebx
> -# endif
> -       pmovmskb %xmm1, %edx
> -       shr     %cl, %edx
> -# ifdef USE_AS_STRNCAT
> -       cmp     $16, %ebx
> -       jbe     L(CopyFrom1To16BytesTailCase2OrCase3)
> -# endif
> -       test    %edx, %edx
> -       jnz     L(CopyFrom1To16BytesTail)
> -
> -       pcmpeqb 16(%esi), %xmm0
> -       pmovmskb %xmm0, %edx
> -# ifdef USE_AS_STRNCAT
> -       cmp     $32, %ebx
> -       jbe     L(CopyFrom1To32BytesCase2OrCase3)
> -# endif
> -       test    %edx, %edx
> -       jnz     L(CopyFrom1To32Bytes)
> -
> -       movdqu  (%esi, %ecx), %xmm1   /* copy 16 bytes */
> -       movdqu  %xmm1, (%eax)
> -       sub     %ecx, %eax
> -
> -       .p2align 4
> -L(Unalign16Both):
> -       mov     $16, %ecx
> -       movdqa  (%esi, %ecx), %xmm1
> -       movaps  16(%esi, %ecx), %xmm2
> -       movdqu  %xmm1, (%eax, %ecx)
> -       pcmpeqb %xmm2, %xmm0
> -       pmovmskb %xmm0, %edx
> -       add     $16, %ecx
> -# ifdef USE_AS_STRNCAT
> -       sub     $48, %ebx
> -       jbe     L(CopyFrom1To16BytesCase2OrCase3)
> -# endif
> -       test    %edx, %edx
> -       jnz     L(CopyFrom1To16Bytes)
> -L(Unalign16BothBigN):
> -       movaps  16(%esi, %ecx), %xmm3
> -       movdqu  %xmm2, (%eax, %ecx)
> -       pcmpeqb %xmm3, %xmm0
> -       pmovmskb %xmm0, %edx
> -       add     $16, %ecx
> -# ifdef USE_AS_STRNCAT
> -       sub     $16, %ebx
> -       jbe     L(CopyFrom1To16BytesCase2OrCase3)
> -# endif
> -       test    %edx, %edx
> -       jnz     L(CopyFrom1To16Bytes)
> -
> -       movaps  16(%esi, %ecx), %xmm4
> -       movdqu  %xmm3, (%eax, %ecx)
> -       pcmpeqb %xmm4, %xmm0
> -       pmovmskb %xmm0, %edx
> -       add     $16, %ecx
> -# ifdef USE_AS_STRNCAT
> -       sub     $16, %ebx
> -       jbe     L(CopyFrom1To16BytesCase2OrCase3)
> -# endif
> -       test    %edx, %edx
> -       jnz     L(CopyFrom1To16Bytes)
> -
> -       movaps  16(%esi, %ecx), %xmm1
> -       movdqu  %xmm4, (%eax, %ecx)
> -       pcmpeqb %xmm1, %xmm0
> -       pmovmskb %xmm0, %edx
> -       add     $16, %ecx
> -# ifdef USE_AS_STRNCAT
> -       sub     $16, %ebx
> -       jbe     L(CopyFrom1To16BytesCase2OrCase3)
> -# endif
> -       test    %edx, %edx
> -       jnz     L(CopyFrom1To16Bytes)
> -
> -       movaps  16(%esi, %ecx), %xmm2
> -       movdqu  %xmm1, (%eax, %ecx)
> -       pcmpeqb %xmm2, %xmm0
> -       pmovmskb %xmm0, %edx
> -       add     $16, %ecx
> -# ifdef USE_AS_STRNCAT
> -       sub     $16, %ebx
> -       jbe     L(CopyFrom1To16BytesCase2OrCase3)
> -# endif
> -       test    %edx, %edx
> -       jnz     L(CopyFrom1To16Bytes)
> -
> -       movaps  16(%esi, %ecx), %xmm3
> -       movdqu  %xmm2, (%eax, %ecx)
> -       pcmpeqb %xmm3, %xmm0
> -       pmovmskb %xmm0, %edx
> -       add     $16, %ecx
> -# ifdef USE_AS_STRNCAT
> -       sub     $16, %ebx
> -       jbe     L(CopyFrom1To16BytesCase2OrCase3)
> -# endif
> -       test    %edx, %edx
> -       jnz     L(CopyFrom1To16Bytes)
> -
> -       movdqu  %xmm3, (%eax, %ecx)
> -       mov     %esi, %edx
> -       lea     16(%esi, %ecx), %esi
> -       and     $-0x40, %esi
> -       sub     %esi, %edx
> -       sub     %edx, %eax
> -# ifdef USE_AS_STRNCAT
> -       lea     128(%ebx, %edx), %ebx
> -# endif
> -       movaps  (%esi), %xmm2
> -       movaps  %xmm2, %xmm4
> -       movaps  16(%esi), %xmm5
> -       movaps  32(%esi), %xmm3
> -       movaps  %xmm3, %xmm6
> -       movaps  48(%esi), %xmm7
> -       pminub  %xmm5, %xmm2
> -       pminub  %xmm7, %xmm3
> -       pminub  %xmm2, %xmm3
> -       pcmpeqb %xmm0, %xmm3
> -       pmovmskb %xmm3, %edx
> -# ifdef USE_AS_STRNCAT
> -       sub     $64, %ebx
> -       jbe     L(UnalignedLeaveCase2OrCase3)
> -# endif
> -       test    %edx, %edx
> -       jnz     L(Unaligned64Leave)
> -
> -       .p2align 4
> -L(Unaligned64Loop_start):
> -       add     $64, %eax
> -       add     $64, %esi
> -       movdqu  %xmm4, -64(%eax)
> -       movaps  (%esi), %xmm2
> -       movdqa  %xmm2, %xmm4
> -       movdqu  %xmm5, -48(%eax)
> -       movaps  16(%esi), %xmm5
> -       pminub  %xmm5, %xmm2
> -       movaps  32(%esi), %xmm3
> -       movdqu  %xmm6, -32(%eax)
> -       movaps  %xmm3, %xmm6
> -       movdqu  %xmm7, -16(%eax)
> -       movaps  48(%esi), %xmm7
> -       pminub  %xmm7, %xmm3
> -       pminub  %xmm2, %xmm3
> -       pcmpeqb %xmm0, %xmm3
> -       pmovmskb %xmm3, %edx
> -# ifdef USE_AS_STRNCAT
> -       sub     $64, %ebx
> -       jbe     L(UnalignedLeaveCase2OrCase3)
> -# endif
> -       test    %edx, %edx
> -       jz      L(Unaligned64Loop_start)
> -
> -L(Unaligned64Leave):
> -       pxor    %xmm1, %xmm1
> -
> -       pcmpeqb %xmm4, %xmm0
> -       pcmpeqb %xmm5, %xmm1
> -       pmovmskb %xmm0, %edx
> -       pmovmskb %xmm1, %ecx
> -       test    %edx, %edx
> -       jnz     L(CopyFrom1To16BytesUnaligned_0)
> -       test    %ecx, %ecx
> -       jnz     L(CopyFrom1To16BytesUnaligned_16)
> -
> -       pcmpeqb %xmm6, %xmm0
> -       pcmpeqb %xmm7, %xmm1
> -       pmovmskb %xmm0, %edx
> -       pmovmskb %xmm1, %ecx
> -       test    %edx, %edx
> -       jnz     L(CopyFrom1To16BytesUnaligned_32)
> -
> -       bsf     %ecx, %edx
> -       movdqu  %xmm4, (%eax)
> -       movdqu  %xmm5, 16(%eax)
> -       movdqu  %xmm6, 32(%eax)
> -       add     $48, %esi
> -       add     $48, %eax
> -       BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
> -
> -# ifdef USE_AS_STRNCAT
> -       .p2align 4
> -L(BigN):
> -       pcmpeqb (%esi), %xmm1
> -       pmovmskb %xmm1, %edx
> -       shr     %cl, %edx
> -       test    %edx, %edx
> -       jnz     L(CopyFrom1To16BytesTail)
> -
> -       pcmpeqb 16(%esi), %xmm0
> -       pmovmskb %xmm0, %edx
> -       test    %edx, %edx
> -       jnz     L(CopyFrom1To32Bytes)
> -
> -       movdqu  (%esi, %ecx), %xmm1   /* copy 16 bytes */
> -       movdqu  %xmm1, (%eax)
> -       sub     %ecx, %eax
> -       sub     $48, %ebx
> -       add     %ecx, %ebx
> -
> -       mov     $16, %ecx
> -       movdqa  (%esi, %ecx), %xmm1
> -       movaps  16(%esi, %ecx), %xmm2
> -       movdqu  %xmm1, (%eax, %ecx)
> -       pcmpeqb %xmm2, %xmm0
> -       pmovmskb %xmm0, %edx
> -       add     $16, %ecx
> -       test    %edx, %edx
> -       jnz     L(CopyFrom1To16Bytes)
> -       jmp     L(Unalign16BothBigN)
> -# endif
> -
> -/*------------end of main part-------------------------------*/
> -
> -/* Case1 */
> -       .p2align 4
> -L(CopyFrom1To16Bytes):
> -       add     %ecx, %eax
> -       add     %ecx, %esi
> -       bsf     %edx, %edx
> -       BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
> -
> -       .p2align 4
> -L(CopyFrom1To16BytesTail):
> -       add     %ecx, %esi
> -       bsf     %edx, %edx
> -       BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
> -
> -       .p2align 4
> -L(CopyFrom1To32Bytes1):
> -       add     $16, %esi
> -       add     $16, %eax
> -L(CopyFrom1To16BytesTail1):
> -       bsf     %edx, %edx
> -       BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
> -
> -       .p2align 4
> -L(CopyFrom1To32Bytes):
> -       bsf     %edx, %edx
> -       add     %ecx, %esi
> -       add     $16, %edx
> -       sub     %ecx, %edx
> -       BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
> -
> -       .p2align 4
> -L(CopyFrom1To16BytesUnaligned_0):
> -       bsf     %edx, %edx
> -       BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
> -
> -       .p2align 4
> -L(CopyFrom1To16BytesUnaligned_16):
> -       bsf     %ecx, %edx
> -       movdqu  %xmm4, (%eax)
> -       add     $16, %esi
> -       add     $16, %eax
> -       BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
> -
> -       .p2align 4
> -L(CopyFrom1To16BytesUnaligned_32):
> -       bsf     %edx, %edx
> -       movdqu  %xmm4, (%eax)
> -       movdqu  %xmm5, 16(%eax)
> -       add     $32, %esi
> -       add     $32, %eax
> -       BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
> -
> -# ifdef USE_AS_STRNCAT
> -
> -       .p2align 4
> -L(CopyFrom1To16BytesExit):
> -       BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
> -
> -/* Case2 */
> -
> -       .p2align 4
> -L(CopyFrom1To16BytesCase2):
> -       add     $16, %ebx
> -       add     %ecx, %eax
> -       add     %ecx, %esi
> -       bsf     %edx, %edx
> -       cmp     %ebx, %edx
> -       jb      L(CopyFrom1To16BytesExit)
> -       BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
> -
> -       .p2align 4
> -L(CopyFrom1To32BytesCase2):
> -       sub     %ecx, %ebx
> -       add     %ecx, %esi
> -       bsf     %edx, %edx
> -       add     $16, %edx
> -       sub     %ecx, %edx
> -       cmp     %ebx, %edx
> -       jb      L(CopyFrom1To16BytesExit)
> -       BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
> -
> -L(CopyFrom1To16BytesTailCase2):
> -       sub     %ecx, %ebx
> -       add     %ecx, %esi
> -       bsf     %edx, %edx
> -       cmp     %ebx, %edx
> -       jb      L(CopyFrom1To16BytesExit)
> -       BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
> -
> -L(CopyFrom1To16BytesTail1Case2):
> -       bsf     %edx, %edx
> -       cmp     %ebx, %edx
> -       jb      L(CopyFrom1To16BytesExit)
> -       BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
> -
> -/* Case2 or Case3,  Case3 */
> -
> -       .p2align 4
> -L(CopyFrom1To16BytesCase2OrCase3):
> -       test    %edx, %edx
> -       jnz     L(CopyFrom1To16BytesCase2)
> -L(CopyFrom1To16BytesCase3):
> -       add     $16, %ebx
> -       add     %ecx, %eax
> -       add     %ecx, %esi
> -       BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
> -
> -       .p2align 4
> -L(CopyFrom1To32BytesCase2OrCase3):
> -       test    %edx, %edx
> -       jnz     L(CopyFrom1To32BytesCase2)
> -       sub     %ecx, %ebx
> -       add     %ecx, %esi
> -       BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
> -
> -       .p2align 4
> -L(CopyFrom1To16BytesTailCase2OrCase3):
> -       test    %edx, %edx
> -       jnz     L(CopyFrom1To16BytesTailCase2)
> -       sub     %ecx, %ebx
> -       add     %ecx, %esi
> -       BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
> -
> -       .p2align 4
> -L(CopyFrom1To32Bytes1Case2OrCase3):
> -       add     $16, %eax
> -       add     $16, %esi
> -       sub     $16, %ebx
> -L(CopyFrom1To16BytesTail1Case2OrCase3):
> -       test    %edx, %edx
> -       jnz     L(CopyFrom1To16BytesTail1Case2)
> -       BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
> -
> -# endif
> -
> -# ifdef USE_AS_STRNCAT
> -       .p2align 4
> -L(StrncatExit0):
> -       movb    %bh, (%eax)
> -       mov     STR3(%esp), %eax
> -       RETURN
> -# endif
> -
> -       .p2align 4
> -# ifdef USE_AS_STRNCAT
> -L(StrncatExit1):
> -       movb    %bh, 1(%eax)
> -# endif
> -L(Exit1):
> -# ifdef USE_AS_STRNCAT
> -       movb    (%esi), %dh
> -# endif
> -       movb    %dh, (%eax)
> -       mov     STR3(%esp), %eax
> -       RETURN
> -
> -       .p2align 4
> -# ifdef USE_AS_STRNCAT
> -L(StrncatExit2):
> -       movb    %bh, 2(%eax)
> -# endif
> -L(Exit2):
> -       movw    (%esi), %dx
> -       movw    %dx, (%eax)
> -       mov     STR3(%esp), %eax
> -       RETURN
> -
> -       .p2align 4
> -# ifdef USE_AS_STRNCAT
> -L(StrncatExit3):
> -       movb    %bh, 3(%eax)
> -# endif
> -L(Exit3):
> -       movw    (%esi), %cx
> -       movw    %cx, (%eax)
> -# ifdef USE_AS_STRNCAT
> -       movb    2(%esi), %dh
> -# endif
> -       movb    %dh, 2(%eax)
> -       mov     STR3(%esp), %eax
> -       RETURN
> -
> -       .p2align 4
> -# ifdef USE_AS_STRNCAT
> -L(StrncatExit4):
> -       movb    %bh, 4(%eax)
> -# endif
> -L(Exit4):
> -       movl    (%esi), %edx
> -       movl    %edx, (%eax)
> -       mov     STR3(%esp), %eax
> -       RETURN
> -
> -       .p2align 4
> -# ifdef USE_AS_STRNCAT
> -L(StrncatExit5):
> -       movb    %bh, 5(%eax)
> -# endif
> -L(Exit5):
> -       movl    (%esi), %ecx
> -# ifdef USE_AS_STRNCAT
> -       movb    4(%esi), %dh
> -# endif
> -       movb    %dh, 4(%eax)
> -       movl    %ecx, (%eax)
> -       mov     STR3(%esp), %eax
> -       RETURN
> -
> -       .p2align 4
> -# ifdef USE_AS_STRNCAT
> -L(StrncatExit6):
> -       movb    %bh, 6(%eax)
> -# endif
> -L(Exit6):
> -       movl    (%esi), %ecx
> -       movw    4(%esi), %dx
> -       movl    %ecx, (%eax)
> -       movw    %dx, 4(%eax)
> -       mov     STR3(%esp), %eax
> -       RETURN
> -
> -       .p2align 4
> -# ifdef USE_AS_STRNCAT
> -L(StrncatExit7):
> -       movb    %bh, 7(%eax)
> -# endif
> -L(Exit7):
> -       movl    (%esi), %ecx
> -       movl    3(%esi), %edx
> -       movl    %ecx, (%eax)
> -       movl    %edx, 3(%eax)
> -       mov     STR3(%esp), %eax
> -       RETURN
> -
> -       .p2align 4
> -# ifdef USE_AS_STRNCAT
> -L(StrncatExit8):
> -       movb    %bh, 8(%eax)
> -# endif
> -L(Exit8):
> -       movlpd  (%esi), %xmm0
> -       movlpd  %xmm0, (%eax)
> -       mov     STR3(%esp), %eax
> -       RETURN
> -
> -       .p2align 4
> -# ifdef USE_AS_STRNCAT
> -L(StrncatExit9):
> -       movb    %bh, 9(%eax)
> -# endif
> -L(Exit9):
> -       movlpd  (%esi), %xmm0
> -# ifdef USE_AS_STRNCAT
> -       movb    8(%esi), %dh
> -# endif
> -       movb    %dh, 8(%eax)
> -       movlpd  %xmm0, (%eax)
> -       mov     STR3(%esp), %eax
> -       RETURN
> -
> -       .p2align 4
> -# ifdef USE_AS_STRNCAT
> -L(StrncatExit10):
> -       movb    %bh, 10(%eax)
> -# endif
> -L(Exit10):
> -       movlpd  (%esi), %xmm0
> -       movw    8(%esi), %dx
> -       movlpd  %xmm0, (%eax)
> -       movw    %dx, 8(%eax)
> -       mov     STR3(%esp), %eax
> -       RETURN
> -
> -       .p2align 4
> -# ifdef USE_AS_STRNCAT
> -L(StrncatExit11):
> -       movb    %bh, 11(%eax)
> -# endif
> -L(Exit11):
> -       movlpd  (%esi), %xmm0
> -       movl    7(%esi), %edx
> -       movlpd  %xmm0, (%eax)
> -       movl    %edx, 7(%eax)
> -       mov     STR3(%esp), %eax
> -       RETURN
> -
> -       .p2align 4
> -# ifdef USE_AS_STRNCAT
> -L(StrncatExit12):
> -       movb    %bh, 12(%eax)
> -# endif
> -L(Exit12):
> -       movlpd  (%esi), %xmm0
> -       movl    8(%esi), %edx
> -       movlpd  %xmm0, (%eax)
> -       movl    %edx, 8(%eax)
> -       mov     STR3(%esp), %eax
> -       RETURN
> -
> -       .p2align 4
> -# ifdef USE_AS_STRNCAT
> -L(StrncatExit13):
> -       movb    %bh, 13(%eax)
> -# endif
> -L(Exit13):
> -       movlpd  (%esi), %xmm0
> -       movlpd  5(%esi), %xmm1
> -       movlpd  %xmm0, (%eax)
> -       movlpd  %xmm1, 5(%eax)
> -       mov     STR3(%esp), %eax
> -       RETURN
> -
> -       .p2align 4
> -# ifdef USE_AS_STRNCAT
> -L(StrncatExit14):
> -       movb    %bh, 14(%eax)
> -# endif
> -L(Exit14):
> -       movlpd  (%esi), %xmm0
> -       movlpd  6(%esi), %xmm1
> -       movlpd  %xmm0, (%eax)
> -       movlpd  %xmm1, 6(%eax)
> -       mov     STR3(%esp), %eax
> -       RETURN
> -
> -       .p2align 4
> -# ifdef USE_AS_STRNCAT
> -L(StrncatExit15):
> -       movb    %bh, 15(%eax)
> -# endif
> -L(Exit15):
> -       movlpd  (%esi), %xmm0
> -       movlpd  7(%esi), %xmm1
> -       movlpd  %xmm0, (%eax)
> -       movlpd  %xmm1, 7(%eax)
> -       mov     STR3(%esp), %eax
> -       RETURN
> -
> -       .p2align 4
> -# ifdef USE_AS_STRNCAT
> -L(StrncatExit16):
> -       movb    %bh, 16(%eax)
> -# endif
> -L(Exit16):
> -       movdqu  (%esi), %xmm0
> -       movdqu  %xmm0, (%eax)
> -       mov     STR3(%esp), %eax
> -       RETURN
> -
> -       .p2align 4
> -# ifdef USE_AS_STRNCAT
> -L(StrncatExit17):
> -       movb    %bh, 17(%eax)
> -# endif
> -L(Exit17):
> -       movdqu  (%esi), %xmm0
> -# ifdef USE_AS_STRNCAT
> -       movb    16(%esi), %dh
> -# endif
> -       movdqu  %xmm0, (%eax)
> -       movb    %dh, 16(%eax)
> -       mov     STR3(%esp), %eax
> -       RETURN
> -
> -       .p2align 4
> -# ifdef USE_AS_STRNCAT
> -L(StrncatExit18):
> -       movb    %bh, 18(%eax)
> -# endif
> -L(Exit18):
> -       movdqu  (%esi), %xmm0
> -       movw    16(%esi), %cx
> -       movdqu  %xmm0, (%eax)
> -       movw    %cx, 16(%eax)
> -       mov     STR3(%esp), %eax
> -       RETURN
> -
> -       .p2align 4
> -# ifdef USE_AS_STRNCAT
> -L(StrncatExit19):
> -       movb    %bh, 19(%eax)
> -# endif
> -L(Exit19):
> -       movdqu  (%esi), %xmm0
> -       movl    15(%esi), %ecx
> -       movdqu  %xmm0, (%eax)
> -       movl    %ecx, 15(%eax)
> -       mov     STR3(%esp), %eax
> -       RETURN
> -
> -       .p2align 4
> -# ifdef USE_AS_STRNCAT
> -L(StrncatExit20):
> -       movb    %bh, 20(%eax)
> -# endif
> -L(Exit20):
> -       movdqu  (%esi), %xmm0
> -       movl    16(%esi), %ecx
> -       movdqu  %xmm0, (%eax)
> -       movl    %ecx, 16(%eax)
> -       mov     STR3(%esp), %eax
> -       RETURN
> -
> -       .p2align 4
> -# ifdef USE_AS_STRNCAT
> -L(StrncatExit21):
> -       movb    %bh, 21(%eax)
> -# endif
> -L(Exit21):
> -       movdqu  (%esi), %xmm0
> -       movl    16(%esi), %ecx
> -# ifdef USE_AS_STRNCAT
> -       movb    20(%esi), %dh
> -# endif
> -       movdqu  %xmm0, (%eax)
> -       movl    %ecx, 16(%eax)
> -       movb    %dh, 20(%eax)
> -       mov     STR3(%esp), %eax
> -       RETURN
> -
> -       .p2align 4
> -# ifdef USE_AS_STRNCAT
> -L(StrncatExit22):
> -       movb    %bh, 22(%eax)
> -# endif
> -L(Exit22):
> -       movdqu  (%esi), %xmm0
> -       movlpd  14(%esi), %xmm3
> -       movdqu  %xmm0, (%eax)
> -       movlpd  %xmm3, 14(%eax)
> -       mov     STR3(%esp), %eax
> -       RETURN
> -
> -       .p2align 4
> -# ifdef USE_AS_STRNCAT
> -L(StrncatExit23):
> -       movb    %bh, 23(%eax)
> -# endif
> -L(Exit23):
> -       movdqu  (%esi), %xmm0
> -       movlpd  15(%esi), %xmm3
> -       movdqu  %xmm0, (%eax)
> -       movlpd  %xmm3, 15(%eax)
> -       mov     STR3(%esp), %eax
> -       RETURN
> -
> -       .p2align 4
> -# ifdef USE_AS_STRNCAT
> -L(StrncatExit24):
> -       movb    %bh, 24(%eax)
> -# endif
> -L(Exit24):
> -       movdqu  (%esi), %xmm0
> -       movlpd  16(%esi), %xmm2
> -       movdqu  %xmm0, (%eax)
> -       movlpd  %xmm2, 16(%eax)
> -       mov     STR3(%esp), %eax
> -       RETURN
> -
> -       .p2align 4
> -# ifdef USE_AS_STRNCAT
> -L(StrncatExit25):
> -       movb    %bh, 25(%eax)
> -# endif
> -L(Exit25):
> -       movdqu  (%esi), %xmm0
> -       movlpd  16(%esi), %xmm2
> -# ifdef USE_AS_STRNCAT
> -       movb    24(%esi), %dh
> -# endif
> -       movdqu  %xmm0, (%eax)
> -       movlpd  %xmm2, 16(%eax)
> -       movb    %dh, 24(%eax)
> -       mov     STR3(%esp), %eax
> -       RETURN
> -
> -       .p2align 4
> -# ifdef USE_AS_STRNCAT
> -L(StrncatExit26):
> -       movb    %bh, 26(%eax)
> -# endif
> -L(Exit26):
> -       movdqu  (%esi), %xmm0
> -       movlpd  16(%esi), %xmm2
> -       movw    24(%esi), %cx
> -       movdqu  %xmm0, (%eax)
> -       movlpd  %xmm2, 16(%eax)
> -       movw    %cx, 24(%eax)
> -       mov     STR3(%esp), %eax
> -       RETURN
> -
> -       .p2align 4
> -# ifdef USE_AS_STRNCAT
> -L(StrncatExit27):
> -       movb    %bh, 27(%eax)
> -# endif
> -L(Exit27):
> -       movdqu  (%esi), %xmm0
> -       movlpd  16(%esi), %xmm2
> -       movl    23(%esi), %ecx
> -       movdqu  %xmm0, (%eax)
> -       movlpd  %xmm2, 16(%eax)
> -       movl    %ecx, 23(%eax)
> -       mov     STR3(%esp), %eax
> -       RETURN
> -
> -       .p2align 4
> -# ifdef USE_AS_STRNCAT
> -L(StrncatExit28):
> -       movb    %bh, 28(%eax)
> -# endif
> -L(Exit28):
> -       movdqu  (%esi), %xmm0
> -       movlpd  16(%esi), %xmm2
> -       movl    24(%esi), %ecx
> -       movdqu  %xmm0, (%eax)
> -       movlpd  %xmm2, 16(%eax)
> -       movl    %ecx, 24(%eax)
> -       mov     STR3(%esp), %eax
> -       RETURN
> -
> -       .p2align 4
> -# ifdef USE_AS_STRNCAT
> -L(StrncatExit29):
> -       movb    %bh, 29(%eax)
> -# endif
> -L(Exit29):
> -       movdqu  (%esi), %xmm0
> -       movdqu  13(%esi), %xmm2
> -       movdqu  %xmm0, (%eax)
> -       movdqu  %xmm2, 13(%eax)
> -       mov     STR3(%esp), %eax
> -       RETURN
> -
> -       .p2align 4
> -# ifdef USE_AS_STRNCAT
> -L(StrncatExit30):
> -       movb    %bh, 30(%eax)
> -# endif
> -L(Exit30):
> -       movdqu  (%esi), %xmm0
> -       movdqu  14(%esi), %xmm2
> -       movdqu  %xmm0, (%eax)
> -       movdqu  %xmm2, 14(%eax)
> -       mov     STR3(%esp), %eax
> -       RETURN
> -
> -       .p2align 4
> -# ifdef USE_AS_STRNCAT
> -L(StrncatExit31):
> -       movb    %bh, 31(%eax)
> -# endif
> -L(Exit31):
> -       movdqu  (%esi), %xmm0
> -       movdqu  15(%esi), %xmm2
> -       movdqu  %xmm0, (%eax)
> -       movdqu  %xmm2, 15(%eax)
> -       mov     STR3(%esp), %eax
> -       RETURN
> -
> -       .p2align 4
> -# ifdef USE_AS_STRNCAT
> -L(StrncatExit32):
> -       movb    %bh, 32(%eax)
> -# endif
> -L(Exit32):
> -       movdqu  (%esi), %xmm0
> -       movdqu  16(%esi), %xmm2
> -       movdqu  %xmm0, (%eax)
> -       movdqu  %xmm2, 16(%eax)
> -       mov     STR3(%esp), %eax
> -       RETURN
> -
> -# ifdef USE_AS_STRNCAT
> -
> -       .p2align 4
> -L(UnalignedLeaveCase2OrCase3):
> -       test    %edx, %edx
> -       jnz     L(Unaligned64LeaveCase2)
> -L(Unaligned64LeaveCase3):
> -       lea     64(%ebx), %ecx
> -       and     $-16, %ecx
> -       add     $48, %ebx
> -       jl      L(CopyFrom1To16BytesCase3)
> -       movdqu  %xmm4, (%eax)
> -       sub     $16, %ebx
> -       jb      L(CopyFrom1To16BytesCase3)
> -       movdqu  %xmm5, 16(%eax)
> -       sub     $16, %ebx
> -       jb      L(CopyFrom1To16BytesCase3)
> -       movdqu  %xmm6, 32(%eax)
> -       sub     $16, %ebx
> -       jb      L(CopyFrom1To16BytesCase3)
> -       movdqu  %xmm7, 48(%eax)
> -       xor     %bh, %bh
> -       movb    %bh, 64(%eax)
> -       mov     STR3(%esp), %eax
> -       RETURN
> -
> -       .p2align 4
> -L(Unaligned64LeaveCase2):
> -       xor     %ecx, %ecx
> -       pcmpeqb %xmm4, %xmm0
> -       pmovmskb %xmm0, %edx
> -       add     $48, %ebx
> -       jle     L(CopyFrom1To16BytesCase2OrCase3)
> -       test    %edx, %edx
> -       jnz     L(CopyFrom1To16Bytes)
> -
> -       pcmpeqb %xmm5, %xmm0
> -       pmovmskb %xmm0, %edx
> -       movdqu  %xmm4, (%eax)
> -       add     $16, %ecx
> -       sub     $16, %ebx
> -       jbe     L(CopyFrom1To16BytesCase2OrCase3)
> -       test    %edx, %edx
> -       jnz     L(CopyFrom1To16Bytes)
> -
> -       pcmpeqb %xmm6, %xmm0
> -       pmovmskb %xmm0, %edx
> -       movdqu  %xmm5, 16(%eax)
> -       add     $16, %ecx
> -       sub     $16, %ebx
> -       jbe     L(CopyFrom1To16BytesCase2OrCase3)
> -       test    %edx, %edx
> -       jnz     L(CopyFrom1To16Bytes)
> -
> -       pcmpeqb %xmm7, %xmm0
> -       pmovmskb %xmm0, %edx
> -       movdqu  %xmm6, 32(%eax)
> -       lea     16(%eax, %ecx), %eax
> -       lea     16(%esi, %ecx), %esi
> -       bsf     %edx, %edx
> -       cmp     %ebx, %edx
> -       jb      L(CopyFrom1To16BytesExit)
> -       BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
> -# endif
> -       .p2align 4
> -L(ExitZero):
> -       RETURN
> -
> -END (STRCAT)
> -
> -       .p2align 4
> -       .section .rodata
> -L(ExitTable):
> -       .int    JMPTBL(L(Exit1), L(ExitTable))
> -       .int    JMPTBL(L(Exit2), L(ExitTable))
> -       .int    JMPTBL(L(Exit3), L(ExitTable))
> -       .int    JMPTBL(L(Exit4), L(ExitTable))
> -       .int    JMPTBL(L(Exit5), L(ExitTable))
> -       .int    JMPTBL(L(Exit6), L(ExitTable))
> -       .int    JMPTBL(L(Exit7), L(ExitTable))
> -       .int    JMPTBL(L(Exit8), L(ExitTable))
> -       .int    JMPTBL(L(Exit9), L(ExitTable))
> -       .int    JMPTBL(L(Exit10), L(ExitTable))
> -       .int    JMPTBL(L(Exit11), L(ExitTable))
> -       .int    JMPTBL(L(Exit12), L(ExitTable))
> -       .int    JMPTBL(L(Exit13), L(ExitTable))
> -       .int    JMPTBL(L(Exit14), L(ExitTable))
> -       .int    JMPTBL(L(Exit15), L(ExitTable))
> -       .int    JMPTBL(L(Exit16), L(ExitTable))
> -       .int    JMPTBL(L(Exit17), L(ExitTable))
> -       .int    JMPTBL(L(Exit18), L(ExitTable))
> -       .int    JMPTBL(L(Exit19), L(ExitTable))
> -       .int    JMPTBL(L(Exit20), L(ExitTable))
> -       .int    JMPTBL(L(Exit21), L(ExitTable))
> -       .int    JMPTBL(L(Exit22), L(ExitTable))
> -       .int    JMPTBL(L(Exit23), L(ExitTable))
> -       .int    JMPTBL(L(Exit24), L(ExitTable))
> -       .int    JMPTBL(L(Exit25), L(ExitTable))
> -       .int    JMPTBL(L(Exit26), L(ExitTable))
> -       .int    JMPTBL(L(Exit27), L(ExitTable))
> -       .int    JMPTBL(L(Exit28), L(ExitTable))
> -       .int    JMPTBL(L(Exit29), L(ExitTable))
> -       .int    JMPTBL(L(Exit30), L(ExitTable))
> -       .int    JMPTBL(L(Exit31), L(ExitTable))
> -       .int    JMPTBL(L(Exit32), L(ExitTable))
> -# ifdef USE_AS_STRNCAT
> -L(ExitStrncatTable):
> -       .int    JMPTBL(L(StrncatExit0), L(ExitStrncatTable))
> -       .int    JMPTBL(L(StrncatExit1), L(ExitStrncatTable))
> -       .int    JMPTBL(L(StrncatExit2), L(ExitStrncatTable))
> -       .int    JMPTBL(L(StrncatExit3), L(ExitStrncatTable))
> -       .int    JMPTBL(L(StrncatExit4), L(ExitStrncatTable))
> -       .int    JMPTBL(L(StrncatExit5), L(ExitStrncatTable))
> -       .int    JMPTBL(L(StrncatExit6), L(ExitStrncatTable))
> -       .int    JMPTBL(L(StrncatExit7), L(ExitStrncatTable))
> -       .int    JMPTBL(L(StrncatExit8), L(ExitStrncatTable))
> -       .int    JMPTBL(L(StrncatExit9), L(ExitStrncatTable))
> -       .int    JMPTBL(L(StrncatExit10), L(ExitStrncatTable))
> -       .int    JMPTBL(L(StrncatExit11), L(ExitStrncatTable))
> -       .int    JMPTBL(L(StrncatExit12), L(ExitStrncatTable))
> -       .int    JMPTBL(L(StrncatExit13), L(ExitStrncatTable))
> -       .int    JMPTBL(L(StrncatExit14), L(ExitStrncatTable))
> -       .int    JMPTBL(L(StrncatExit15), L(ExitStrncatTable))
> -       .int    JMPTBL(L(StrncatExit16), L(ExitStrncatTable))
> -       .int    JMPTBL(L(StrncatExit17), L(ExitStrncatTable))
> -       .int    JMPTBL(L(StrncatExit18), L(ExitStrncatTable))
> -       .int    JMPTBL(L(StrncatExit19), L(ExitStrncatTable))
> -       .int    JMPTBL(L(StrncatExit20), L(ExitStrncatTable))
> -       .int    JMPTBL(L(StrncatExit21), L(ExitStrncatTable))
> -       .int    JMPTBL(L(StrncatExit22), L(ExitStrncatTable))
> -       .int    JMPTBL(L(StrncatExit23), L(ExitStrncatTable))
> -       .int    JMPTBL(L(StrncatExit24), L(ExitStrncatTable))
> -       .int    JMPTBL(L(StrncatExit25), L(ExitStrncatTable))
> -       .int    JMPTBL(L(StrncatExit26), L(ExitStrncatTable))
> -       .int    JMPTBL(L(StrncatExit27), L(ExitStrncatTable))
> -       .int    JMPTBL(L(StrncatExit28), L(ExitStrncatTable))
> -       .int    JMPTBL(L(StrncatExit29), L(ExitStrncatTable))
> -       .int    JMPTBL(L(StrncatExit30), L(ExitStrncatTable))
> -       .int    JMPTBL(L(StrncatExit31), L(ExitStrncatTable))
> -       .int    JMPTBL(L(StrncatExit32), L(ExitStrncatTable))
> -# endif
> -#endif
> diff --git a/sysdeps/i386/i686/multiarch/strcat-ssse3.S b/sysdeps/i386/i686/multiarch/strcat-ssse3.S
> deleted file mode 100644
> index 72bc49c..0000000
> --- a/sysdeps/i386/i686/multiarch/strcat-ssse3.S
> +++ /dev/null
> @@ -1,572 +0,0 @@
> -/* strcat with SSSE3
> -   Copyright (C) 2011 Free Software Foundation, Inc.
> -   Contributed by Intel Corporation.
> -   This file is part of the GNU C Library.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <http://www.gnu.org/licenses/>.  */
> -
> -
> -#ifndef NOT_IN_libc
> -
> -# include <sysdep.h>
> -
> -# define CFI_PUSH(REG) \
> -       cfi_adjust_cfa_offset (4);      \
> -       cfi_rel_offset (REG, 0)
> -
> -# define CFI_POP(REG)  \
> -       cfi_adjust_cfa_offset (-4);     \
> -       cfi_restore (REG)
> -
> -# define PUSH(REG) pushl REG; CFI_PUSH (REG)
> -# define POP(REG) popl REG; CFI_POP (REG)
> -
> -# ifndef STRCAT
> -#  define STRCAT  __strcat_ssse3
> -# endif
> -
> -# define PARMS  4
> -# define STR1  PARMS+4
> -# define STR2  STR1+4
> -
> -# ifdef USE_AS_STRNCAT
> -#  define LEN STR2+8
> -# endif
> -
> -# define USE_AS_STRCAT
> -
> -.text
> -ENTRY (STRCAT)
> -       PUSH    (%edi)
> -       mov     STR1(%esp), %edi
> -       mov     %edi, %edx
> -
> -# define RETURN  jmp L(StartStrcpyPart)
> -# include "strlen-sse2.S"
> -
> -L(StartStrcpyPart):
> -       mov     STR2(%esp), %ecx
> -       lea     (%edi, %eax), %edx
> -# ifdef USE_AS_STRNCAT
> -       PUSH    (%ebx)
> -       mov     LEN(%esp), %ebx
> -       test    %ebx, %ebx
> -       jz      L(StrncatExit0)
> -       cmp     $8, %ebx
> -       jbe     L(StrncatExit8Bytes)
> -# endif
> -       cmpb    $0, (%ecx)
> -       jz      L(Exit1)
> -       cmpb    $0, 1(%ecx)
> -       jz      L(Exit2)
> -       cmpb    $0, 2(%ecx)
> -       jz      L(Exit3)
> -       cmpb    $0, 3(%ecx)
> -       jz      L(Exit4)
> -       cmpb    $0, 4(%ecx)
> -       jz      L(Exit5)
> -       cmpb    $0, 5(%ecx)
> -       jz      L(Exit6)
> -       cmpb    $0, 6(%ecx)
> -       jz      L(Exit7)
> -       cmpb    $0, 7(%ecx)
> -       jz      L(Exit8)
> -       cmpb    $0, 8(%ecx)
> -       jz      L(Exit9)
> -# ifdef USE_AS_STRNCAT
> -       cmp     $16, %ebx
> -       jb      L(StrncatExit15Bytes)
> -# endif
> -       cmpb    $0, 9(%ecx)
> -       jz      L(Exit10)
> -       cmpb    $0, 10(%ecx)
> -       jz      L(Exit11)
> -       cmpb    $0, 11(%ecx)
> -       jz      L(Exit12)
> -       cmpb    $0, 12(%ecx)
> -       jz      L(Exit13)
> -       cmpb    $0, 13(%ecx)
> -       jz      L(Exit14)
> -       cmpb    $0, 14(%ecx)
> -       jz      L(Exit15)
> -       cmpb    $0, 15(%ecx)
> -       jz      L(Exit16)
> -# ifdef USE_AS_STRNCAT
> -       cmp     $16, %ebx
> -       je      L(StrncatExit16)
> -
> -#  define RETURN1      \
> -       POP     (%ebx); \
> -       POP     (%edi); \
> -       ret;    \
> -       CFI_PUSH        (%ebx); \
> -       CFI_PUSH        (%edi)
> -#  define USE_AS_STRNCPY
> -# else
> -#  define RETURN1  POP (%edi); ret; CFI_PUSH (%edi)
> -# endif
> -# include "strcpy-ssse3.S"
> -       .p2align 4
> -L(CopyFrom1To16Bytes):
> -       add     %esi, %edx
> -       add     %esi, %ecx
> -
> -       POP     (%esi)
> -       test    %al, %al
> -       jz      L(ExitHigh)
> -       test    $0x01, %al
> -       jnz     L(Exit1)
> -       test    $0x02, %al
> -       jnz     L(Exit2)
> -       test    $0x04, %al
> -       jnz     L(Exit3)
> -       test    $0x08, %al
> -       jnz     L(Exit4)
> -       test    $0x10, %al
> -       jnz     L(Exit5)
> -       test    $0x20, %al
> -       jnz     L(Exit6)
> -       test    $0x40, %al
> -       jnz     L(Exit7)
> -       movlpd  (%ecx), %xmm0
> -       movlpd  %xmm0, (%edx)
> -       movl    %edi, %eax
> -       RETURN1
> -
> -       .p2align 4
> -L(ExitHigh):
> -       test    $0x01, %ah
> -       jnz     L(Exit9)
> -       test    $0x02, %ah
> -       jnz     L(Exit10)
> -       test    $0x04, %ah
> -       jnz     L(Exit11)
> -       test    $0x08, %ah
> -       jnz     L(Exit12)
> -       test    $0x10, %ah
> -       jnz     L(Exit13)
> -       test    $0x20, %ah
> -       jnz     L(Exit14)
> -       test    $0x40, %ah
> -       jnz     L(Exit15)
> -       movlpd  (%ecx), %xmm0
> -       movlpd  8(%ecx), %xmm1
> -       movlpd  %xmm0, (%edx)
> -       movlpd  %xmm1, 8(%edx)
> -       movl    %edi, %eax
> -       RETURN1
> -
> -       .p2align 4
> -L(StrncatExit1):
> -       movb    %bh, 1(%edx)
> -L(Exit1):
> -       movb    (%ecx), %al
> -       movb    %al, (%edx)
> -       movl    %edi, %eax
> -       RETURN1
> -
> -       .p2align 4
> -L(StrncatExit2):
> -       movb    %bh, 2(%edx)
> -L(Exit2):
> -       movw    (%ecx), %ax
> -       movw    %ax, (%edx)
> -       movl    %edi, %eax
> -       RETURN1
> -
> -       .p2align 4
> -L(StrncatExit3):
> -       movb    %bh, 3(%edx)
> -L(Exit3):
> -       movw    (%ecx), %ax
> -       movw    %ax, (%edx)
> -       movb    2(%ecx), %al
> -       movb    %al, 2(%edx)
> -       movl    %edi, %eax
> -       RETURN1
> -
> -       .p2align 4
> -L(StrncatExit4):
> -       movb    %bh, 4(%edx)
> -L(Exit4):
> -       movl    (%ecx), %eax
> -       movl    %eax, (%edx)
> -       movl    %edi, %eax
> -       RETURN1
> -
> -       .p2align 4
> -L(StrncatExit5):
> -       movb    %bh, 5(%edx)
> -L(Exit5):
> -       movl    (%ecx), %eax
> -       movl    %eax, (%edx)
> -       movb    4(%ecx), %al
> -       movb    %al, 4(%edx)
> -       movl    %edi, %eax
> -       RETURN1
> -
> -       .p2align 4
> -L(StrncatExit6):
> -       movb    %bh, 6(%edx)
> -L(Exit6):
> -       movl    (%ecx), %eax
> -       movl    %eax, (%edx)
> -       movw    4(%ecx), %ax
> -       movw    %ax, 4(%edx)
> -       movl    %edi, %eax
> -       RETURN1
> -
> -       .p2align 4
> -L(StrncatExit7):
> -       movb    %bh, 7(%edx)
> -L(Exit7):
> -       movl    (%ecx), %eax
> -       movl    %eax, (%edx)
> -       movl    3(%ecx), %eax
> -       movl    %eax, 3(%edx)
> -       movl    %edi, %eax
> -       RETURN1
> -
> -       .p2align 4
> -L(StrncatExit8):
> -       movb    %bh, 8(%edx)
> -L(Exit8):
> -       movlpd  (%ecx), %xmm0
> -       movlpd  %xmm0, (%edx)
> -       movl    %edi, %eax
> -       RETURN1
> -
> -       .p2align 4
> -L(StrncatExit9):
> -       movb    %bh, 9(%edx)
> -L(Exit9):
> -       movlpd  (%ecx), %xmm0
> -       movlpd  %xmm0, (%edx)
> -       movb    8(%ecx), %al
> -       movb    %al, 8(%edx)
> -       movl    %edi, %eax
> -       RETURN1
> -
> -       .p2align 4
> -L(StrncatExit10):
> -       movb    %bh, 10(%edx)
> -L(Exit10):
> -       movlpd  (%ecx), %xmm0
> -       movlpd  %xmm0, (%edx)
> -       movw    8(%ecx), %ax
> -       movw    %ax, 8(%edx)
> -       movl    %edi, %eax
> -       RETURN1
> -
> -       .p2align 4
> -L(StrncatExit11):
> -       movb    %bh, 11(%edx)
> -L(Exit11):
> -       movlpd  (%ecx), %xmm0
> -       movlpd  %xmm0, (%edx)
> -       movl    7(%ecx), %eax
> -       movl    %eax, 7(%edx)
> -       movl    %edi, %eax
> -       RETURN1
> -
> -       .p2align 4
> -L(StrncatExit12):
> -       movb    %bh, 12(%edx)
> -L(Exit12):
> -       movlpd  (%ecx), %xmm0
> -       movlpd  %xmm0, (%edx)
> -       movl    8(%ecx), %eax
> -       movl    %eax, 8(%edx)
> -       movl    %edi, %eax
> -       RETURN1
> -
> -       .p2align 4
> -L(StrncatExit13):
> -       movb    %bh, 13(%edx)
> -L(Exit13):
> -       movlpd  (%ecx), %xmm0
> -       movlpd  %xmm0, (%edx)
> -       movlpd  5(%ecx), %xmm0
> -       movlpd  %xmm0, 5(%edx)
> -       movl    %edi, %eax
> -       RETURN1
> -
> -       .p2align 4
> -L(StrncatExit14):
> -       movb    %bh, 14(%edx)
> -L(Exit14):
> -       movlpd  (%ecx), %xmm0
> -       movlpd  %xmm0, (%edx)
> -       movlpd  6(%ecx), %xmm0
> -       movlpd  %xmm0, 6(%edx)
> -       movl    %edi, %eax
> -       RETURN1
> -
> -       .p2align 4
> -L(StrncatExit15):
> -       movb    %bh, 15(%edx)
> -L(Exit15):
> -       movlpd  (%ecx), %xmm0
> -       movlpd  %xmm0, (%edx)
> -       movlpd  7(%ecx), %xmm0
> -       movlpd  %xmm0, 7(%edx)
> -       movl    %edi, %eax
> -       RETURN1
> -
> -       .p2align 4
> -L(StrncatExit16):
> -       movb    %bh, 16(%edx)
> -L(Exit16):
> -       movlpd  (%ecx), %xmm0
> -       movlpd  8(%ecx), %xmm1
> -       movlpd  %xmm0, (%edx)
> -       movlpd  %xmm1, 8(%edx)
> -       movl    %edi, %eax
> -       RETURN1
> -
> -# ifdef USE_AS_STRNCPY
> -
> -       CFI_PUSH(%esi)
> -
> -       .p2align 4
> -L(CopyFrom1To16BytesCase2):
> -       add     $16, %ebx
> -       add     %esi, %ecx
> -       lea     (%esi, %edx), %esi
> -       lea     -9(%ebx), %edx
> -       and     $1<<7, %dh
> -       or      %al, %dh
> -       test    %dh, %dh
> -       lea     (%esi), %edx
> -       POP     (%esi)
> -       jz      L(ExitHighCase2)
> -
> -       test    $0x01, %al
> -       jnz     L(Exit1)
> -       cmp     $1, %ebx
> -       je      L(StrncatExit1)
> -       test    $0x02, %al
> -       jnz     L(Exit2)
> -       cmp     $2, %ebx
> -       je      L(StrncatExit2)
> -       test    $0x04, %al
> -       jnz     L(Exit3)
> -       cmp     $3, %ebx
> -       je      L(StrncatExit3)
> -       test    $0x08, %al
> -       jnz     L(Exit4)
> -       cmp     $4, %ebx
> -       je      L(StrncatExit4)
> -       test    $0x10, %al
> -       jnz     L(Exit5)
> -       cmp     $5, %ebx
> -       je      L(StrncatExit5)
> -       test    $0x20, %al
> -       jnz     L(Exit6)
> -       cmp     $6, %ebx
> -       je      L(StrncatExit6)
> -       test    $0x40, %al
> -       jnz     L(Exit7)
> -       cmp     $7, %ebx
> -       je      L(StrncatExit7)
> -       movlpd  (%ecx), %xmm0
> -       movlpd  %xmm0, (%edx)
> -       lea     7(%edx), %eax
> -       cmpb    $1, (%eax)
> -       sbb     $-1, %eax
> -       xor     %cl, %cl
> -       movb    %cl, (%eax)
> -       movl    %edi, %eax
> -       RETURN1
> -
> -       .p2align 4
> -L(ExitHighCase2):
> -       test    $0x01, %ah
> -       jnz     L(Exit9)
> -       cmp     $9, %ebx
> -       je      L(StrncatExit9)
> -       test    $0x02, %ah
> -       jnz     L(Exit10)
> -       cmp     $10, %ebx
> -       je      L(StrncatExit10)
> -       test    $0x04, %ah
> -       jnz     L(Exit11)
> -       cmp     $11, %ebx
> -       je      L(StrncatExit11)
> -       test    $0x8, %ah
> -       jnz     L(Exit12)
> -       cmp     $12, %ebx
> -       je      L(StrncatExit12)
> -       test    $0x10, %ah
> -       jnz     L(Exit13)
> -       cmp     $13, %ebx
> -       je      L(StrncatExit13)
> -       test    $0x20, %ah
> -       jnz     L(Exit14)
> -       cmp     $14, %ebx
> -       je      L(StrncatExit14)
> -       test    $0x40, %ah
> -       jnz     L(Exit15)
> -       cmp     $15, %ebx
> -       je      L(StrncatExit15)
> -       movlpd  (%ecx), %xmm0
> -       movlpd  %xmm0, (%edx)
> -       movlpd  8(%ecx), %xmm1
> -       movlpd  %xmm1, 8(%edx)
> -       movl    %edi, %eax
> -       RETURN1
> -
> -       CFI_PUSH(%esi)
> -
> -L(CopyFrom1To16BytesCase2OrCase3):
> -       test    %eax, %eax
> -       jnz     L(CopyFrom1To16BytesCase2)
> -
> -       .p2align 4
> -L(CopyFrom1To16BytesCase3):
> -       add     $16, %ebx
> -       add     %esi, %edx
> -       add     %esi, %ecx
> -
> -       POP     (%esi)
> -
> -       cmp     $8, %ebx
> -       ja      L(ExitHighCase3)
> -       cmp     $1, %ebx
> -       je      L(StrncatExit1)
> -       cmp     $2, %ebx
> -       je      L(StrncatExit2)
> -       cmp     $3, %ebx
> -       je      L(StrncatExit3)
> -       cmp     $4, %ebx
> -       je      L(StrncatExit4)
> -       cmp     $5, %ebx
> -       je      L(StrncatExit5)
> -       cmp     $6, %ebx
> -       je      L(StrncatExit6)
> -       cmp     $7, %ebx
> -       je      L(StrncatExit7)
> -       movlpd  (%ecx), %xmm0
> -       movlpd  %xmm0, (%edx)
> -       movb    %bh, 8(%edx)
> -       movl    %edi, %eax
> -       RETURN1
> -
> -       .p2align 4
> -L(ExitHighCase3):
> -       cmp     $9, %ebx
> -       je      L(StrncatExit9)
> -       cmp     $10, %ebx
> -       je      L(StrncatExit10)
> -       cmp     $11, %ebx
> -       je      L(StrncatExit11)
> -       cmp     $12, %ebx
> -       je      L(StrncatExit12)
> -       cmp     $13, %ebx
> -       je      L(StrncatExit13)
> -       cmp     $14, %ebx
> -       je      L(StrncatExit14)
> -       cmp     $15, %ebx
> -       je      L(StrncatExit15)
> -       movlpd  (%ecx), %xmm0
> -       movlpd  %xmm0, (%edx)
> -       movlpd  8(%ecx), %xmm1
> -       movlpd  %xmm1, 8(%edx)
> -       movb    %bh, 16(%edx)
> -       movl    %edi, %eax
> -       RETURN1
> -
> -       .p2align 4
> -L(StrncatExit0):
> -       movl    %edi, %eax
> -       RETURN1
> -
> -       .p2align 4
> -L(StrncatExit15Bytes):
> -       cmp     $9, %ebx
> -       je      L(StrncatExit9)
> -       cmpb    $0, 9(%ecx)
> -       jz      L(Exit10)
> -       cmp     $10, %ebx
> -       je      L(StrncatExit10)
> -       cmpb    $0, 10(%ecx)
> -       jz      L(Exit11)
> -       cmp     $11, %ebx
> -       je      L(StrncatExit11)
> -       cmpb    $0, 11(%ecx)
> -       jz      L(Exit12)
> -       cmp     $12, %ebx
> -       je      L(StrncatExit12)
> -       cmpb    $0, 12(%ecx)
> -       jz      L(Exit13)
> -       cmp     $13, %ebx
> -       je      L(StrncatExit13)
> -       cmpb    $0, 13(%ecx)
> -       jz      L(Exit14)
> -       cmp     $14, %ebx
> -       je      L(StrncatExit14)
> -       movlpd  (%ecx), %xmm0
> -       movlpd  %xmm0, (%edx)
> -       movlpd  7(%ecx), %xmm0
> -       movlpd  %xmm0, 7(%edx)
> -       lea     14(%edx), %eax
> -       cmpb    $1, (%eax)
> -       sbb     $-1, %eax
> -       movb    %bh, (%eax)
> -       movl    %edi, %eax
> -       RETURN1
> -
> -       .p2align 4
> -L(StrncatExit8Bytes):
> -       cmpb    $0, (%ecx)
> -       jz      L(Exit1)
> -       cmp     $1, %ebx
> -       je      L(StrncatExit1)
> -       cmpb    $0, 1(%ecx)
> -       jz      L(Exit2)
> -       cmp     $2, %ebx
> -       je      L(StrncatExit2)
> -       cmpb    $0, 2(%ecx)
> -       jz      L(Exit3)
> -       cmp     $3, %ebx
> -       je      L(StrncatExit3)
> -       cmpb    $0, 3(%ecx)
> -       jz      L(Exit4)
> -       cmp     $4, %ebx
> -       je      L(StrncatExit4)
> -       cmpb    $0, 4(%ecx)
> -       jz      L(Exit5)
> -       cmp     $5, %ebx
> -       je      L(StrncatExit5)
> -       cmpb    $0, 5(%ecx)
> -       jz      L(Exit6)
> -       cmp     $6, %ebx
> -       je      L(StrncatExit6)
> -       cmpb    $0, 6(%ecx)
> -       jz      L(Exit7)
> -       cmp     $7, %ebx
> -       je      L(StrncatExit7)
> -       movlpd  (%ecx), %xmm0
> -       movlpd  %xmm0, (%edx)
> -       lea     7(%edx), %eax
> -       cmpb    $1, (%eax)
> -       sbb     $-1, %eax
> -       movb    %bh, (%eax)
> -       movl    %edi, %eax
> -       RETURN1
> -
> -# endif
> -END (STRCAT)
> -#endif
> diff --git a/sysdeps/i386/i686/multiarch/strcat.S b/sysdeps/i386/i686/multiarch/strcat.S
> deleted file mode 100644
> index e68feca..0000000
> --- a/sysdeps/i386/i686/multiarch/strcat.S
> +++ /dev/null
> @@ -1,119 +0,0 @@
> -/* Multiple versions of strcat
> -   Copyright (C) 2011-2012 Free Software Foundation, Inc.
> -   Contributed by Intel Corporation.
> -   This file is part of the GNU C Library.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <http://www.gnu.org/licenses/>.  */
> -
> -#include <sysdep.h>
> -#include <init-arch.h>
> -
> -#ifndef USE_AS_STRNCAT
> -# ifndef STRCAT
> -#  define STRCAT strcat
> -# endif
> -#endif
> -
> -#ifdef USE_AS_STRNCAT
> -# define STRCAT_SSSE3  __strncat_ssse3
> -# define STRCAT_SSE2           __strncat_sse2
> -# define STRCAT_IA32           __strncat_ia32
> -# define __GI_STRCAT           __GI_strncat
> -#else
> -# define STRCAT_SSSE3  __strcat_ssse3
> -# define STRCAT_SSE2           __strcat_sse2
> -# define STRCAT_IA32           __strcat_ia32
> -# define __GI_STRCAT           __GI_strcat
> -#endif
> -
> -
> -/* Define multiple versions only for the definition in libc.  Don't
> -   define multiple versions for strncat in static library since we
> -   need strncat before the initialization happened.  */
> -#ifndef NOT_IN_libc
> -
> -# ifdef SHARED
> -       .text
> -ENTRY(STRCAT)
> -       .type   STRCAT, @gnu_indirect_function
> -       pushl   %ebx
> -       cfi_adjust_cfa_offset (4)
> -       cfi_rel_offset (ebx, 0)
> -       LOAD_PIC_REG(bx)
> -       cmpl    $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
> -       jne     1f
> -       call    __init_cpu_features
> -1:     leal    STRCAT_IA32@GOTOFF(%ebx), %eax
> -       testl   $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
> -       jz      2f
> -       leal    STRCAT_SSE2@GOTOFF(%ebx), %eax
> -       testl   $bit_Fast_Unaligned_Load, FEATURE_OFFSET+index_Fast_Unaligned_Load+__cpu_features@GOTOFF(%ebx)
> -       jnz     2f
> -       testl   $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
> -       jz      2f
> -       leal    STRCAT_SSSE3@GOTOFF(%ebx), %eax
> -2:     popl    %ebx
> -       cfi_adjust_cfa_offset (-4)
> -       cfi_restore (ebx)
> -       ret
> -END(STRCAT)
> -# else
> -
> -ENTRY(STRCAT)
> -       .type   STRCAT, @gnu_indirect_function
> -       cmpl    $0, KIND_OFFSET+__cpu_features
> -       jne     1f
> -       call    __init_cpu_features
> -1:     leal    STRCAT_IA32, %eax
> -       testl   $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features
> -       jz      2f
> -       leal    STRCAT_SSE2, %eax
> -       testl   $bit_Fast_Unaligned_Load, FEATURE_OFFSET+index_Fast_Unaligned_Load+__cpu_features
> -       jnz     2f
> -       testl   $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features
> -       jz      2f
> -       leal    STRCAT_SSSE3, %eax
> -2:     ret
> -END(STRCAT)
> -
> -# endif
> -
> -# undef ENTRY
> -# define ENTRY(name) \
> -       .type STRCAT_IA32, @function; \
> -       .align 16; \
> -       STRCAT_IA32: cfi_startproc; \
> -       CALL_MCOUNT
> -# undef END
> -# define END(name) \
> -       cfi_endproc; .size STRCAT_IA32, .-STRCAT_IA32
> -
> -# ifdef SHARED
> -#  undef libc_hidden_builtin_def
> -/* It doesn't make sense to send libc-internal strcat calls through a PLT.
> -   The speedup we get from using SSSE3 instruction is likely eaten away
> -   by the indirect call in the PLT.  */
> -#  define libc_hidden_builtin_def(name) \
> -       .globl __GI_STRCAT; __GI_STRCAT = STRCAT_IA32
> -#  undef libc_hidden_def
> -#  define libc_hidden_def(name) \
> -       .globl __GI___STRCAT; __GI___STRCAT = STRCAT_IA32
> -
> -# endif
> -#endif
> -
> -#ifndef USE_AS_STRNCAT
> -# include "../../i486/strcat.S"
> -#endif
> diff --git a/sysdeps/i386/i686/multiarch/strncat-c.c b/sysdeps/i386/i686/multiarch/strncat-c.c
> deleted file mode 100644
> index 132a000..0000000
> --- a/sysdeps/i386/i686/multiarch/strncat-c.c
> +++ /dev/null
> @@ -1,8 +0,0 @@
> -#define STRNCAT __strncat_ia32
> -#ifdef SHARED
> -#undef libc_hidden_def
> -#define libc_hidden_def(name) \
> -  __hidden_ver1 (__strncat_ia32, __GI___strncat, __strncat_ia32);
> -#endif
> -
> -#include "string/strncat.c"
> diff --git a/sysdeps/i386/i686/multiarch/strncat-sse2.S b/sysdeps/i386/i686/multiarch/strncat-sse2.S
> deleted file mode 100644
> index f1045b7..0000000
> --- a/sysdeps/i386/i686/multiarch/strncat-sse2.S
> +++ /dev/null
> @@ -1,4 +0,0 @@
> -#define STRCAT  __strncat_sse2
> -#define USE_AS_STRNCAT
> -
> -#include "strcat-sse2.S"
> diff --git a/sysdeps/i386/i686/multiarch/strncat-ssse3.S b/sysdeps/i386/i686/multiarch/strncat-ssse3.S
> deleted file mode 100644
> index 625b90a..0000000
> --- a/sysdeps/i386/i686/multiarch/strncat-ssse3.S
> +++ /dev/null
> @@ -1,4 +0,0 @@
> -#define STRCAT  __strncat_ssse3
> -#define USE_AS_STRNCAT
> -
> -#include "strcat-ssse3.S"
> diff --git a/sysdeps/i386/i686/multiarch/strncat.S b/sysdeps/i386/i686/multiarch/strncat.S
> deleted file mode 100644
> index fd569c2..0000000
> --- a/sysdeps/i386/i686/multiarch/strncat.S
> +++ /dev/null
> @@ -1,3 +0,0 @@
> -#define STRCAT strncat
> -#define USE_AS_STRNCAT
> -#include "strcat.S"
> diff --git a/sysdeps/powerpc/strcat.c b/sysdeps/powerpc/strcat.c
> deleted file mode 100644
> index 28575d0..0000000
> --- a/sysdeps/powerpc/strcat.c
> +++ /dev/null
> @@ -1,30 +0,0 @@
> -/* strcat version that uses fast strcpy/strlen.
> -   Copyright (C) 1997, 2003 Free Software Foundation, Inc.
> -   This file is part of the GNU C Library.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <http://www.gnu.org/licenses/>.  */
> -
> -#include <string.h>
> -
> -#undef strcat
> -
> -/* Append SRC on the end of DEST.  */
> -char *
> -strcat (char *dest, const char *src)
> -{
> -  strcpy (dest + strlen (dest), src);
> -  return dest;
> -}
> -libc_hidden_builtin_def (strcat)
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index 22f1435..ae94366 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -6,7 +6,7 @@ endif
>
>  ifeq ($(subdir),string)
>
> -sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
> +sysdep_routines +=  stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
>                    strend-sse4 memcmp-sse4 memcpy-ssse3 mempcpy-ssse3 \
>                    memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
>                    memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \
> @@ -14,8 +14,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
>                    strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
>                    strcpy-sse2-unaligned strncpy-sse2-unaligned \
>                    stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
> -                  strcat-sse2-unaligned strncat-sse2-unaligned \
> -                  strcat-ssse3 strncat-ssse3 strlen_atom strlen_avx \
> +                  strlen_atom strlen_avx \
>                    strnlen-sse2-no-bsf strrchr-sse2-no-bsf strchr-sse2-no-bsf \
>                    memcmp-ssse3
>  ifeq (yes,$(config-cflags-sse4))
> diff --git a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
> deleted file mode 100644
> index 7811ab5..0000000
> --- a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
> +++ /dev/null
> @@ -1,53 +0,0 @@
> -/* strcat with SSE2
> -   Copyright (C) 2011 Free Software Foundation, Inc.
> -   Contributed by Intel Corporation.
> -   This file is part of the GNU C Library.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <http://www.gnu.org/licenses/>.  */
> -
> -#ifndef NOT_IN_libc
> -
> -# include <sysdep.h>
> -
> -# ifndef STRCAT
> -#  define STRCAT  __strcat_sse2_unaligned
> -# endif
> -
> -# define USE_AS_STRCAT
> -
> -.text
> -ENTRY (STRCAT)
> -       mov     %rdi, %r9
> -# ifdef USE_AS_STRNCAT
> -       mov     %rdx, %r8
> -# endif
> -
> -# define RETURN  jmp L(StartStrcpyPart)
> -# include "strlen-sse2-pminub.S"
> -# undef RETURN
> -
> -L(StartStrcpyPart):
> -       lea     (%r9, %rax), %rdi
> -       mov     %rsi, %rcx
> -       mov     %r9, %rax      /* save result */
> -
> -# ifdef USE_AS_STRNCAT
> -       test    %r8, %r8
> -       jz      L(ExitZero)
> -#  define USE_AS_STRNCPY
> -# endif
> -
> -# include "strcpy-sse2-unaligned.S"
> -#endif
> diff --git a/sysdeps/x86_64/multiarch/strcat-ssse3.S b/sysdeps/x86_64/multiarch/strcat-ssse3.S
> deleted file mode 100644
> index abd2c0c..0000000
> --- a/sysdeps/x86_64/multiarch/strcat-ssse3.S
> +++ /dev/null
> @@ -1,557 +0,0 @@
> -/* strcat with SSSE3
> -   Copyright (C) 2011 Free Software Foundation, Inc.
> -   Contributed by Intel Corporation.
> -   This file is part of the GNU C Library.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <http://www.gnu.org/licenses/>.  */
> -
> -#ifndef NOT_IN_libc
> -
> -# include <sysdep.h>
> -
> -# ifndef STRCAT
> -#  define STRCAT  __strcat_ssse3
> -# endif
> -
> -# define USE_AS_STRCAT
> -
> -.text
> -ENTRY (STRCAT)
> -# ifdef USE_AS_STRNCAT
> -       mov     %rdx, %r8
> -# endif
> -
> -# define RETURN  jmp L(StartStrcpyPart)
> -# include "strlen-sse2-no-bsf.S"
> -
> -# undef RETURN
> -
> -L(StartStrcpyPart):
> -       mov     %rsi, %rcx
> -       lea     (%rdi, %rax), %rdx
> -# ifdef USE_AS_STRNCAT
> -       test    %r8, %r8
> -       jz      L(StrncatExit0)
> -       cmp     $8, %r8
> -       jbe     L(StrncatExit8Bytes)
> -# endif
> -       cmpb    $0, (%rcx)
> -       jz      L(Exit1)
> -       cmpb    $0, 1(%rcx)
> -       jz      L(Exit2)
> -       cmpb    $0, 2(%rcx)
> -       jz      L(Exit3)
> -       cmpb    $0, 3(%rcx)
> -       jz      L(Exit4)
> -       cmpb    $0, 4(%rcx)
> -       jz      L(Exit5)
> -       cmpb    $0, 5(%rcx)
> -       jz      L(Exit6)
> -       cmpb    $0, 6(%rcx)
> -       jz      L(Exit7)
> -       cmpb    $0, 7(%rcx)
> -       jz      L(Exit8)
> -       cmpb    $0, 8(%rcx)
> -       jz      L(Exit9)
> -# ifdef USE_AS_STRNCAT
> -       cmp     $16, %r8
> -       jb      L(StrncatExit15Bytes)
> -# endif
> -       cmpb    $0, 9(%rcx)
> -       jz      L(Exit10)
> -       cmpb    $0, 10(%rcx)
> -       jz      L(Exit11)
> -       cmpb    $0, 11(%rcx)
> -       jz      L(Exit12)
> -       cmpb    $0, 12(%rcx)
> -       jz      L(Exit13)
> -       cmpb    $0, 13(%rcx)
> -       jz      L(Exit14)
> -       cmpb    $0, 14(%rcx)
> -       jz      L(Exit15)
> -       cmpb    $0, 15(%rcx)
> -       jz      L(Exit16)
> -# ifdef USE_AS_STRNCAT
> -       cmp     $16, %r8
> -       je      L(StrncatExit16)
> -#  define USE_AS_STRNCPY
> -# endif
> -
> -# include "strcpy-ssse3.S"
> -
> -       .p2align 4
> -L(CopyFrom1To16Bytes):
> -       add     %rsi, %rdx
> -       add     %rsi, %rcx
> -
> -       test    %al, %al
> -       jz      L(ExitHigh)
> -       test    $0x01, %al
> -       jnz     L(Exit1)
> -       test    $0x02, %al
> -       jnz     L(Exit2)
> -       test    $0x04, %al
> -       jnz     L(Exit3)
> -       test    $0x08, %al
> -       jnz     L(Exit4)
> -       test    $0x10, %al
> -       jnz     L(Exit5)
> -       test    $0x20, %al
> -       jnz     L(Exit6)
> -       test    $0x40, %al
> -       jnz     L(Exit7)
> -       movlpd  (%rcx), %xmm0
> -       movlpd  %xmm0, (%rdx)
> -       mov     %rdi, %rax
> -       ret
> -
> -       .p2align 4
> -L(ExitHigh):
> -       test    $0x01, %ah
> -       jnz     L(Exit9)
> -       test    $0x02, %ah
> -       jnz     L(Exit10)
> -       test    $0x04, %ah
> -       jnz     L(Exit11)
> -       test    $0x08, %ah
> -       jnz     L(Exit12)
> -       test    $0x10, %ah
> -       jnz     L(Exit13)
> -       test    $0x20, %ah
> -       jnz     L(Exit14)
> -       test    $0x40, %ah
> -       jnz     L(Exit15)
> -       movlpd  (%rcx), %xmm0
> -       movlpd  8(%rcx), %xmm1
> -       movlpd  %xmm0, (%rdx)
> -       movlpd  %xmm1, 8(%rdx)
> -       mov     %rdi, %rax
> -       ret
> -
> -       .p2align 4
> -L(StrncatExit1):
> -       xor     %ah, %ah
> -       movb    %ah, 1(%rdx)
> -L(Exit1):
> -       movb    (%rcx), %al
> -       movb    %al, (%rdx)
> -       mov     %rdi, %rax
> -       ret
> -
> -       .p2align 4
> -L(StrncatExit2):
> -       xor     %ah, %ah
> -       movb    %ah, 2(%rdx)
> -L(Exit2):
> -       movw    (%rcx), %ax
> -       movw    %ax, (%rdx)
> -       mov     %rdi, %rax
> -       ret
> -
> -       .p2align 4
> -L(StrncatExit3):
> -       xor     %ah, %ah
> -       movb    %ah, 3(%rdx)
> -L(Exit3):
> -       movw    (%rcx), %ax
> -       movw    %ax, (%rdx)
> -       movb    2(%rcx), %al
> -       movb    %al, 2(%rdx)
> -       mov     %rdi, %rax
> -       ret
> -
> -       .p2align 4
> -L(StrncatExit4):
> -       xor     %ah, %ah
> -       movb    %ah, 4(%rdx)
> -L(Exit4):
> -       mov     (%rcx), %eax
> -       mov     %eax, (%rdx)
> -       mov     %rdi, %rax
> -       ret
> -
> -       .p2align 4
> -L(StrncatExit5):
> -       xor     %ah, %ah
> -       movb    %ah, 5(%rdx)
> -L(Exit5):
> -       mov     (%rcx), %eax
> -       mov     %eax, (%rdx)
> -       movb    4(%rcx), %al
> -       movb    %al, 4(%rdx)
> -       mov     %rdi, %rax
> -       ret
> -
> -       .p2align 4
> -L(StrncatExit6):
> -       xor     %ah, %ah
> -       movb    %ah, 6(%rdx)
> -L(Exit6):
> -       mov     (%rcx), %eax
> -       mov     %eax, (%rdx)
> -       movw    4(%rcx), %ax
> -       movw    %ax, 4(%rdx)
> -       mov     %rdi, %rax
> -       ret
> -
> -       .p2align 4
> -L(StrncatExit7):
> -       xor     %ah, %ah
> -       movb    %ah, 7(%rdx)
> -L(Exit7):
> -       mov     (%rcx), %eax
> -       mov     %eax, (%rdx)
> -       mov     3(%rcx), %eax
> -       mov     %eax, 3(%rdx)
> -       mov     %rdi, %rax
> -       ret
> -
> -       .p2align 4
> -L(StrncatExit8):
> -       xor     %ah, %ah
> -       movb    %ah, 8(%rdx)
> -L(Exit8):
> -       movlpd  (%rcx), %xmm0
> -       movlpd  %xmm0, (%rdx)
> -       mov     %rdi, %rax
> -       ret
> -
> -       .p2align 4
> -L(StrncatExit9):
> -       xor     %ah, %ah
> -       movb    %ah, 9(%rdx)
> -L(Exit9):
> -       movlpd  (%rcx), %xmm0
> -       movlpd  %xmm0, (%rdx)
> -       movb    8(%rcx), %al
> -       movb    %al, 8(%rdx)
> -       mov     %rdi, %rax
> -       ret
> -
> -       .p2align 4
> -L(StrncatExit10):
> -       xor     %ah, %ah
> -       movb    %ah, 10(%rdx)
> -L(Exit10):
> -       movlpd  (%rcx), %xmm0
> -       movlpd  %xmm0, (%rdx)
> -       movw    8(%rcx), %ax
> -       movw    %ax, 8(%rdx)
> -       mov     %rdi, %rax
> -       ret
> -
> -       .p2align 4
> -L(StrncatExit11):
> -       xor     %ah, %ah
> -       movb    %ah, 11(%rdx)
> -L(Exit11):
> -       movlpd  (%rcx), %xmm0
> -       movlpd  %xmm0, (%rdx)
> -       mov     7(%rcx), %eax
> -       mov     %eax, 7(%rdx)
> -       mov     %rdi, %rax
> -       ret
> -
> -       .p2align 4
> -L(StrncatExit12):
> -       xor     %ah, %ah
> -       movb    %ah, 12(%rdx)
> -L(Exit12):
> -       movlpd  (%rcx), %xmm0
> -       movlpd  %xmm0, (%rdx)
> -       mov     8(%rcx), %eax
> -       mov     %eax, 8(%rdx)
> -       mov     %rdi, %rax
> -       ret
> -
> -       .p2align 4
> -L(StrncatExit13):
> -       xor     %ah, %ah
> -       movb    %ah, 13(%rdx)
> -L(Exit13):
> -       movlpd  (%rcx), %xmm0
> -       movlpd  %xmm0, (%rdx)
> -       movlpd  5(%rcx), %xmm1
> -       movlpd  %xmm1, 5(%rdx)
> -       mov     %rdi, %rax
> -       ret
> -
> -       .p2align 4
> -L(StrncatExit14):
> -       xor     %ah, %ah
> -       movb    %ah, 14(%rdx)
> -L(Exit14):
> -       movlpd  (%rcx), %xmm0
> -       movlpd  %xmm0, (%rdx)
> -       movlpd  6(%rcx), %xmm1
> -       movlpd  %xmm1, 6(%rdx)
> -       mov     %rdi, %rax
> -       ret
> -
> -       .p2align 4
> -L(StrncatExit15):
> -       xor     %ah, %ah
> -       movb    %ah, 15(%rdx)
> -L(Exit15):
> -       movlpd  (%rcx), %xmm0
> -       movlpd  %xmm0, (%rdx)
> -       movlpd  7(%rcx), %xmm1
> -       movlpd  %xmm1, 7(%rdx)
> -       mov     %rdi, %rax
> -       ret
> -
> -       .p2align 4
> -L(StrncatExit16):
> -       xor     %ah, %ah
> -       movb    %ah, 16(%rdx)
> -L(Exit16):
> -       movlpd  (%rcx), %xmm0
> -       movlpd  8(%rcx), %xmm1
> -       movlpd  %xmm0, (%rdx)
> -       movlpd  %xmm1, 8(%rdx)
> -       mov     %rdi, %rax
> -       ret
> -
> -# ifdef USE_AS_STRNCPY
> -
> -       .p2align 4
> -L(CopyFrom1To16BytesCase2):
> -       add     $16, %r8
> -       add     %rsi, %rcx
> -       lea     (%rsi, %rdx), %rsi
> -       lea     -9(%r8), %rdx
> -       and     $1<<7, %dh
> -       or      %al, %dh
> -       test    %dh, %dh
> -       lea     (%rsi), %rdx
> -       jz      L(ExitHighCase2)
> -
> -       test    $0x01, %al
> -       jnz     L(Exit1)
> -       cmp     $1, %r8
> -       je      L(StrncatExit1)
> -       test    $0x02, %al
> -       jnz     L(Exit2)
> -       cmp     $2, %r8
> -       je      L(StrncatExit2)
> -       test    $0x04, %al
> -       jnz     L(Exit3)
> -       cmp     $3, %r8
> -       je      L(StrncatExit3)
> -       test    $0x08, %al
> -       jnz     L(Exit4)
> -       cmp     $4, %r8
> -       je      L(StrncatExit4)
> -       test    $0x10, %al
> -       jnz     L(Exit5)
> -       cmp     $5, %r8
> -       je      L(StrncatExit5)
> -       test    $0x20, %al
> -       jnz     L(Exit6)
> -       cmp     $6, %r8
> -       je      L(StrncatExit6)
> -       test    $0x40, %al
> -       jnz     L(Exit7)
> -       cmp     $7, %r8
> -       je      L(StrncatExit7)
> -       movlpd  (%rcx), %xmm0
> -       movlpd  %xmm0, (%rdx)
> -       lea     7(%rdx), %rax
> -       cmpb    $1, (%rax)
> -       sbb     $-1, %rax
> -       xor     %cl, %cl
> -       movb    %cl, (%rax)
> -       mov     %rdi, %rax
> -       ret
> -
> -       .p2align 4
> -L(ExitHighCase2):
> -       test    $0x01, %ah
> -       jnz     L(Exit9)
> -       cmp     $9, %r8
> -       je      L(StrncatExit9)
> -       test    $0x02, %ah
> -       jnz     L(Exit10)
> -       cmp     $10, %r8
> -       je      L(StrncatExit10)
> -       test    $0x04, %ah
> -       jnz     L(Exit11)
> -       cmp     $11, %r8
> -       je      L(StrncatExit11)
> -       test    $0x8, %ah
> -       jnz     L(Exit12)
> -       cmp     $12, %r8
> -       je      L(StrncatExit12)
> -       test    $0x10, %ah
> -       jnz     L(Exit13)
> -       cmp     $13, %r8
> -       je      L(StrncatExit13)
> -       test    $0x20, %ah
> -       jnz     L(Exit14)
> -       cmp     $14, %r8
> -       je      L(StrncatExit14)
> -       test    $0x40, %ah
> -       jnz     L(Exit15)
> -       cmp     $15, %r8
> -       je      L(StrncatExit15)
> -       movlpd  (%rcx), %xmm0
> -       movlpd  %xmm0, (%rdx)
> -       movlpd  8(%rcx), %xmm1
> -       movlpd  %xmm1, 8(%rdx)
> -       mov     %rdi, %rax
> -       ret
> -
> -L(CopyFrom1To16BytesCase2OrCase3):
> -       test    %rax, %rax
> -       jnz     L(CopyFrom1To16BytesCase2)
> -
> -       .p2align 4
> -L(CopyFrom1To16BytesCase3):
> -       add     $16, %r8
> -       add     %rsi, %rdx
> -       add     %rsi, %rcx
> -
> -       cmp     $8, %r8
> -       ja      L(ExitHighCase3)
> -       cmp     $1, %r8
> -       je      L(StrncatExit1)
> -       cmp     $2, %r8
> -       je      L(StrncatExit2)
> -       cmp     $3, %r8
> -       je      L(StrncatExit3)
> -       cmp     $4, %r8
> -       je      L(StrncatExit4)
> -       cmp     $5, %r8
> -       je      L(StrncatExit5)
> -       cmp     $6, %r8
> -       je      L(StrncatExit6)
> -       cmp     $7, %r8
> -       je      L(StrncatExit7)
> -       movlpd  (%rcx), %xmm0
> -       movlpd  %xmm0, (%rdx)
> -       xor     %ah, %ah
> -       movb    %ah, 8(%rdx)
> -       mov     %rdi, %rax
> -       ret
> -
> -       .p2align 4
> -L(ExitHighCase3):
> -       cmp     $9, %r8
> -       je      L(StrncatExit9)
> -       cmp     $10, %r8
> -       je      L(StrncatExit10)
> -       cmp     $11, %r8
> -       je      L(StrncatExit11)
> -       cmp     $12, %r8
> -       je      L(StrncatExit12)
> -       cmp     $13, %r8
> -       je      L(StrncatExit13)
> -       cmp     $14, %r8
> -       je      L(StrncatExit14)
> -       cmp     $15, %r8
> -       je      L(StrncatExit15)
> -       movlpd  (%rcx), %xmm0
> -       movlpd  %xmm0, (%rdx)
> -       movlpd  8(%rcx), %xmm1
> -       movlpd  %xmm1, 8(%rdx)
> -       xor     %ah, %ah
> -       movb    %ah, 16(%rdx)
> -       mov     %rdi, %rax
> -       ret
> -
> -       .p2align 4
> -L(StrncatExit0):
> -       mov     %rdi, %rax
> -       ret
> -
> -       .p2align 4
> -L(StrncatExit15Bytes):
> -       cmp     $9, %r8
> -       je      L(StrncatExit9)
> -       cmpb    $0, 9(%rcx)
> -       jz      L(Exit10)
> -       cmp     $10, %r8
> -       je      L(StrncatExit10)
> -       cmpb    $0, 10(%rcx)
> -       jz      L(Exit11)
> -       cmp     $11, %r8
> -       je      L(StrncatExit11)
> -       cmpb    $0, 11(%rcx)
> -       jz      L(Exit12)
> -       cmp     $12, %r8
> -       je      L(StrncatExit12)
> -       cmpb    $0, 12(%rcx)
> -       jz      L(Exit13)
> -       cmp     $13, %r8
> -       je      L(StrncatExit13)
> -       cmpb    $0, 13(%rcx)
> -       jz      L(Exit14)
> -       cmp     $14, %r8
> -       je      L(StrncatExit14)
> -       movlpd  (%rcx), %xmm0
> -       movlpd  %xmm0, (%rdx)
> -       movlpd  7(%rcx), %xmm1
> -       movlpd  %xmm1, 7(%rdx)
> -       lea     14(%rdx), %rax
> -       cmpb    $1, (%rax)
> -       sbb     $-1, %rax
> -       xor     %cl, %cl
> -       movb    %cl, (%rax)
> -       mov     %rdi, %rax
> -       ret
> -
> -       .p2align 4
> -L(StrncatExit8Bytes):
> -       cmpb    $0, (%rcx)
> -       jz      L(Exit1)
> -       cmp     $1, %r8
> -       je      L(StrncatExit1)
> -       cmpb    $0, 1(%rcx)
> -       jz      L(Exit2)
> -       cmp     $2, %r8
> -       je      L(StrncatExit2)
> -       cmpb    $0, 2(%rcx)
> -       jz      L(Exit3)
> -       cmp     $3, %r8
> -       je      L(StrncatExit3)
> -       cmpb    $0, 3(%rcx)
> -       jz      L(Exit4)
> -       cmp     $4, %r8
> -       je      L(StrncatExit4)
> -       cmpb    $0, 4(%rcx)
> -       jz      L(Exit5)
> -       cmp     $5, %r8
> -       je      L(StrncatExit5)
> -       cmpb    $0, 5(%rcx)
> -       jz      L(Exit6)
> -       cmp     $6, %r8
> -       je      L(StrncatExit6)
> -       cmpb    $0, 6(%rcx)
> -       jz      L(Exit7)
> -       cmp     $7, %r8
> -       je      L(StrncatExit7)
> -       movlpd  (%rcx), %xmm0
> -       movlpd  %xmm0, (%rdx)
> -       lea     7(%rdx), %rax
> -       cmpb    $1, (%rax)
> -       sbb     $-1, %rax
> -       xor     %cl, %cl
> -       movb    %cl, (%rax)
> -       mov     %rdi, %rax
> -       ret
> -
> -# endif
> -END (STRCAT)
> -#endif
> diff --git a/sysdeps/x86_64/multiarch/strcat.S b/sysdeps/x86_64/multiarch/strcat.S
> deleted file mode 100644
> index 0c256de..0000000
> --- a/sysdeps/x86_64/multiarch/strcat.S
> +++ /dev/null
> @@ -1,84 +0,0 @@
> -/* Multiple versions of strcat
> -   Copyright (C) 2009, 2011 Free Software Foundation, Inc.
> -   Contributed by Intel Corporation.
> -   This file is part of the GNU C Library.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <http://www.gnu.org/licenses/>.  */
> -
> -#include <sysdep.h>
> -#include <init-arch.h>
> -
> -#ifndef USE_AS_STRNCAT
> -# ifndef STRCAT
> -#  define STRCAT strcat
> -# endif
> -#endif
> -
> -#ifdef USE_AS_STRNCAT
> -# define STRCAT_SSSE3                  __strncat_ssse3
> -# define STRCAT_SSE2                   __strncat_sse2
> -# define STRCAT_SSE2_UNALIGNED         __strncat_sse2_unaligned
> -# define __GI_STRCAT                   __GI_strncat
> -# define __GI___STRCAT              __GI___strncat
> -#else
> -# define STRCAT_SSSE3                  __strcat_ssse3
> -# define STRCAT_SSE2                   __strcat_sse2
> -# define STRCAT_SSE2_UNALIGNED         __strcat_sse2_unaligned
> -# define __GI_STRCAT                   __GI_strcat
> -# define __GI___STRCAT              __GI___strcat
> -#endif
> -
> -
> -/* Define multiple versions only for the definition in libc.  */
> -#ifndef NOT_IN_libc
> -       .text
> -ENTRY(STRCAT)
> -       .type   STRCAT, @gnu_indirect_function
> -       cmpl    $0, __cpu_features+KIND_OFFSET(%rip)
> -       jne     1f
> -       call    __init_cpu_features
> -1:     leaq    STRCAT_SSE2_UNALIGNED(%rip), %rax
> -       testl   $bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip)
> -       jnz     2f
> -       leaq    STRCAT_SSE2(%rip), %rax
> -       testl   $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
> -       jz      2f
> -       leaq    STRCAT_SSSE3(%rip), %rax
> -2:     ret
> -END(STRCAT)
> -
> -# undef ENTRY
> -# define ENTRY(name) \
> -       .type STRCAT_SSE2, @function; \
> -       .align 16; \
> -       STRCAT_SSE2: cfi_startproc; \
> -       CALL_MCOUNT
> -# undef END
> -# define END(name) \
> -       cfi_endproc; .size STRCAT_SSE2, .-STRCAT_SSE2
> -# undef libc_hidden_builtin_def
> -/* It doesn't make sense to send libc-internal strcat calls through a PLT.
> -   The speedup we get from using SSSE3 instruction is likely eaten away
> -   by the indirect call in the PLT.  */
> -# define libc_hidden_builtin_def(name) \
> -       .globl __GI_STRCAT; __GI_STRCAT = STRCAT_SSE2
> -# undef libc_hidden_def
> -# define libc_hidden_def(name) \
> -       .globl __GI___STRCAT; __GI___STRCAT = STRCAT_SSE2
> -#endif
> -
> -#ifndef USE_AS_STRNCAT
> -# include "../strcat.S"
> -#endif
> diff --git a/sysdeps/x86_64/multiarch/strncat-c.c b/sysdeps/x86_64/multiarch/strncat-c.c
> deleted file mode 100644
> index a3cdbff..0000000
> --- a/sysdeps/x86_64/multiarch/strncat-c.c
> +++ /dev/null
> @@ -1,8 +0,0 @@
> -#define STRNCAT __strncat_sse2
> -#ifdef SHARED
> -#undef libc_hidden_def
> -#define libc_hidden_def(name) \
> -  __hidden_ver1 (__strncat_sse2, __GI___strncat, __strncat_sse2);
> -#endif
> -
> -#include "string/strncat.c"
> diff --git a/sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S
> deleted file mode 100644
> index 133e1d2..0000000
> --- a/sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S
> +++ /dev/null
> @@ -1,3 +0,0 @@
> -#define USE_AS_STRNCAT
> -#define STRCAT __strncat_sse2_unaligned
> -#include "strcat-sse2-unaligned.S"
> diff --git a/sysdeps/x86_64/multiarch/strncat-ssse3.S b/sysdeps/x86_64/multiarch/strncat-ssse3.S
> deleted file mode 100644
> index 6c45ff3..0000000
> --- a/sysdeps/x86_64/multiarch/strncat-ssse3.S
> +++ /dev/null
> @@ -1,3 +0,0 @@
> -#define USE_AS_STRNCAT
> -#define STRCAT __strncat_ssse3
> -#include "strcat-ssse3.S"
> diff --git a/sysdeps/x86_64/multiarch/strncat.S b/sysdeps/x86_64/multiarch/strncat.S
> deleted file mode 100644
> index fd569c2..0000000
> --- a/sysdeps/x86_64/multiarch/strncat.S
> +++ /dev/null
> @@ -1,3 +0,0 @@
> -#define STRCAT strncat
> -#define USE_AS_STRNCAT
> -#include "strcat.S"
> diff --git a/sysdeps/x86_64/strcat.S b/sysdeps/x86_64/strcat.S
> deleted file mode 100644
> index 535a18d..0000000
> --- a/sysdeps/x86_64/strcat.S
> +++ /dev/null
> @@ -1,259 +0,0 @@
> -/* strcat(dest, src) -- Append SRC on the end of DEST.
> -   Optimized for x86-64.
> -   Copyright (C) 2002 Free Software Foundation, Inc.
> -   This file is part of the GNU C Library.
> -   Contributed by Andreas Jaeger <aj@suse.de>, 2002.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <http://www.gnu.org/licenses/>.  */
> -
> -#include <sysdep.h>
> -#include "asm-syntax.h"
> -#include "bp-sym.h"
> -#include "bp-asm.h"
> -
> -
> -       .text
> -ENTRY (BP_SYM (strcat))
> -       movq %rdi, %rcx         /* Dest. register. */
> -       andl $7, %ecx           /* mask alignment bits */
> -       movq %rdi, %rax         /* Duplicate destination pointer.  */
> -       movq $0xfefefefefefefeff,%r8
> -
> -       /* First step: Find end of destination.  */
> -       jz 4f                   /* aligned => start loop */
> -
> -       neg %ecx                /* We need to align to 8 bytes.  */
> -       addl $8,%ecx
> -       /* Search the first bytes directly.  */
> -0:     cmpb $0x0,(%rax)        /* is byte NUL? */
> -       je 2f                   /* yes => start copy */
> -       incq %rax               /* increment pointer */
> -       decl %ecx
> -       jnz 0b
> -
> -
> -
> -       /* Now the source is aligned.  Scan for NUL byte.  */
> -       .p2align 4
> -4:
> -       /* First unroll.  */
> -       movq (%rax), %rcx       /* get double word (= 8 bytes) in question */
> -       addq $8,%rax            /* adjust pointer for next word */
> -       movq %r8, %rdx          /* magic value */
> -       addq %rcx, %rdx         /* add the magic value to the word.  We get
> -                                  carry bits reported for each byte which
> -                                  is *not* 0 */
> -       jnc 3f                  /* highest byte is NUL => return pointer */
> -       xorq %rcx, %rdx         /* (word+magic)^word */
> -       orq %r8, %rdx           /* set all non-carry bits */
> -       incq %rdx               /* add 1: if one carry bit was *not* set
> -                                  the addition will not result in 0.  */
> -       jnz 3f                  /* found NUL => return pointer */
> -
> -       /* Second unroll.  */
> -       movq (%rax), %rcx       /* get double word (= 8 bytes) in question */
> -       addq $8,%rax            /* adjust pointer for next word */
> -       movq %r8, %rdx          /* magic value */
> -       addq %rcx, %rdx         /* add the magic value to the word.  We get
> -                                  carry bits reported for each byte which
> -                                  is *not* 0 */
> -       jnc 3f                  /* highest byte is NUL => return pointer */
> -       xorq %rcx, %rdx         /* (word+magic)^word */
> -       orq %r8, %rdx           /* set all non-carry bits */
> -       incq %rdx               /* add 1: if one carry bit was *not* set
> -                                  the addition will not result in 0.  */
> -       jnz 3f                  /* found NUL => return pointer */
> -
> -       /* Third unroll.  */
> -       movq (%rax), %rcx       /* get double word (= 8 bytes) in question */
> -       addq $8,%rax            /* adjust pointer for next word */
> -       movq %r8, %rdx          /* magic value */
> -       addq %rcx, %rdx         /* add the magic value to the word.  We get
> -                                  carry bits reported for each byte which
> -                                  is *not* 0 */
> -       jnc 3f                  /* highest byte is NUL => return pointer */
> -       xorq %rcx, %rdx         /* (word+magic)^word */
> -       orq %r8, %rdx           /* set all non-carry bits */
> -       incq %rdx               /* add 1: if one carry bit was *not* set
> -                                  the addition will not result in 0.  */
> -       jnz 3f                  /* found NUL => return pointer */
> -
> -       /* Fourth unroll.  */
> -       movq (%rax), %rcx       /* get double word (= 8 bytes) in question */
> -       addq $8,%rax            /* adjust pointer for next word */
> -       movq %r8, %rdx          /* magic value */
> -       addq %rcx, %rdx         /* add the magic value to the word.  We get
> -                                  carry bits reported for each byte which
> -                                  is *not* 0 */
> -       jnc 3f                  /* highest byte is NUL => return pointer */
> -       xorq %rcx, %rdx         /* (word+magic)^word */
> -       orq %r8, %rdx           /* set all non-carry bits */
> -       incq %rdx               /* add 1: if one carry bit was *not* set
> -                                  the addition will not result in 0.  */
> -       jz 4b                   /* no NUL found => continue loop */
> -
> -       .p2align 4              /* Align, it's a jump target.  */
> -3:     subq $8,%rax            /* correct pointer increment.  */
> -
> -       testb %cl, %cl          /* is first byte NUL? */
> -       jz 2f                   /* yes => return */
> -       incq %rax               /* increment pointer */
> -
> -       testb %ch, %ch          /* is second byte NUL? */
> -       jz 2f                   /* yes => return */
> -       incq %rax               /* increment pointer */
> -
> -       testl $0x00ff0000, %ecx /* is third byte NUL? */
> -       jz 2f                   /* yes => return pointer */
> -       incq %rax               /* increment pointer */
> -
> -       testl $0xff000000, %ecx /* is fourth byte NUL? */
> -       jz 2f                   /* yes => return pointer */
> -       incq %rax               /* increment pointer */
> -
> -       shrq $32, %rcx          /* look at other half.  */
> -
> -       testb %cl, %cl          /* is first byte NUL? */
> -       jz 2f                   /* yes => return */
> -       incq %rax               /* increment pointer */
> -
> -       testb %ch, %ch          /* is second byte NUL? */
> -       jz 2f                   /* yes => return */
> -       incq %rax               /* increment pointer */
> -
> -       testl $0xff0000, %ecx   /* is third byte NUL? */
> -       jz 2f                   /* yes => return pointer */
> -       incq %rax               /* increment pointer */
> -
> -2:
> -       /* Second step: Copy source to destination.  */
> -
> -       movq    %rsi, %rcx      /* duplicate  */
> -       andl    $7,%ecx         /* mask alignment bits */
> -       movq    %rax, %rdx      /* move around */
> -       jz      22f             /* aligned => start loop */
> -
> -       neg     %ecx            /* align to 8 bytes.  */
> -       addl    $8, %ecx
> -       /* Align the source pointer.  */
> -21:
> -       movb    (%rsi), %al     /* Fetch a byte */
> -       testb   %al, %al        /* Is it NUL? */
> -       movb    %al, (%rdx)     /* Store it */
> -       jz      24f             /* If it was NUL, done! */
> -       incq    %rsi
> -       incq    %rdx
> -       decl    %ecx
> -       jnz     21b
> -
> -       /* Now the sources is aligned.  Unfortunatly we cannot force
> -          to have both source and destination aligned, so ignore the
> -          alignment of the destination.  */
> -       .p2align 4
> -22:
> -       /* 1st unroll.  */
> -       movq    (%rsi), %rax    /* Read double word (8 bytes).  */
> -       addq    $8, %rsi        /* Adjust pointer for next word.  */
> -       movq    %rax, %r9       /* Save a copy for NUL finding.  */
> -       addq    %r8, %r9        /* add the magic value to the word.  We get
> -                                  carry bits reported for each byte which
> -                                  is *not* 0 */
> -       jnc     23f             /* highest byte is NUL => return pointer */
> -       xorq    %rax, %r9       /* (word+magic)^word */
> -       orq     %r8, %r9        /* set all non-carry bits */
> -       incq    %r9             /* add 1: if one carry bit was *not* set
> -                                  the addition will not result in 0.  */
> -
> -       jnz     23f             /* found NUL => return pointer */
> -
> -       movq    %rax, (%rdx)    /* Write value to destination.  */
> -       addq    $8, %rdx        /* Adjust pointer.  */
> -
> -       /* 2nd unroll.  */
> -       movq    (%rsi), %rax    /* Read double word (8 bytes).  */
> -       addq    $8, %rsi        /* Adjust pointer for next word.  */
> -       movq    %rax, %r9       /* Save a copy for NUL finding.  */
> -       addq    %r8, %r9        /* add the magic value to the word.  We get
> -                                  carry bits reported for each byte which
> -                                  is *not* 0 */
> -       jnc     23f             /* highest byte is NUL => return pointer */
> -       xorq    %rax, %r9       /* (word+magic)^word */
> -       orq     %r8, %r9        /* set all non-carry bits */
> -       incq    %r9             /* add 1: if one carry bit was *not* set
> -                                  the addition will not result in 0.  */
> -
> -       jnz     23f             /* found NUL => return pointer */
> -
> -       movq    %rax, (%rdx)    /* Write value to destination.  */
> -       addq    $8, %rdx        /* Adjust pointer.  */
> -
> -       /* 3rd unroll.  */
> -       movq    (%rsi), %rax    /* Read double word (8 bytes).  */
> -       addq    $8, %rsi        /* Adjust pointer for next word.  */
> -       movq    %rax, %r9       /* Save a copy for NUL finding.  */
> -       addq    %r8, %r9        /* add the magic value to the word.  We get
> -                                  carry bits reported for each byte which
> -                                  is *not* 0 */
> -       jnc     23f             /* highest byte is NUL => return pointer */
> -       xorq    %rax, %r9       /* (word+magic)^word */
> -       orq     %r8, %r9        /* set all non-carry bits */
> -       incq    %r9             /* add 1: if one carry bit was *not* set
> -                                  the addition will not result in 0.  */
> -
> -       jnz     23f             /* found NUL => return pointer */
> -
> -       movq    %rax, (%rdx)    /* Write value to destination.  */
> -       addq    $8, %rdx        /* Adjust pointer.  */
> -
> -       /* 4th unroll.  */
> -       movq    (%rsi), %rax    /* Read double word (8 bytes).  */
> -       addq    $8, %rsi        /* Adjust pointer for next word.  */
> -       movq    %rax, %r9       /* Save a copy for NUL finding.  */
> -       addq    %r8, %r9        /* add the magic value to the word.  We get
> -                                  carry bits reported for each byte which
> -                                  is *not* 0 */
> -       jnc     23f             /* highest byte is NUL => return pointer */
> -       xorq    %rax, %r9       /* (word+magic)^word */
> -       orq     %r8, %r9        /* set all non-carry bits */
> -       incq    %r9             /* add 1: if one carry bit was *not* set
> -                                  the addition will not result in 0.  */
> -
> -       jnz     23f             /* found NUL => return pointer */
> -
> -       movq    %rax, (%rdx)    /* Write value to destination.  */
> -       addq    $8, %rdx        /* Adjust pointer.  */
> -       jmp     22b             /* Next iteration.  */
> -
> -       /* Do the last few bytes. %rax contains the value to write.
> -          The loop is unrolled twice.  */
> -       .p2align 4
> -23:
> -       movb    %al, (%rdx)     /* 1st byte.  */
> -       testb   %al, %al        /* Is it NUL.  */
> -       jz      24f             /* yes, finish.  */
> -       incq    %rdx            /* Increment destination.  */
> -       movb    %ah, (%rdx)     /* 2nd byte.  */
> -       testb   %ah, %ah        /* Is it NUL?.  */
> -       jz      24f             /* yes, finish.  */
> -       incq    %rdx            /* Increment destination.  */
> -       shrq    $16, %rax       /* Shift...  */
> -       jmp     23b             /* and look at next two bytes in %rax.  */
> -
> -
> -24:
> -       movq    %rdi, %rax      /* Source is return value.  */
> -       retq
> -END (BP_SYM (strcat))
> -libc_hidden_builtin_def (strcat)
> --
> 1.7.4.4
>
>
>


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]