This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH] faster strcat


On Tue, Oct 09, 2012 at 04:37:34PM +0400, Dmitrieva Liubov wrote:
> Why do you think this is faster?
> 
> You iterate over the whole src array twice instead of once as in the
> current version (the first pass is strnlen and the second is strcpy; the
> current assembler version uses something like strncpy instead, but
> without strncpy's zero filling)
>
It is almost the fastest I could portably write. I simplified it to

+  size_t dest_len = strlen (dest);
+  size_t src_len  = strnlen (src , n);
+  memcpy (dest + dest_len, src, src_len);
+  dest[dest_len + src_len] = '\0';

I would use stpncpy if it did not zero-fill. Also, adding a strcat variant that
returns the end of the string could speed things up.

> It's likely true that the current version inlines a slow strlen, but that
> code can be switched to use another strlen version.
Good idea. I am not sure how to keep it in sync with strlen in the
future.

> 
> +  size_t dest_len = strlen (dest);
> +  size_t src_len  = strnlen (src , n);
> 
> -  if (c != '\0')
> -    *++s1 = '\0';
> +  if (src_len == n)
> +    {
> +     memcpy (dest + dest_len, src, n);
> +     dest[dest_len + n] = '\0';
> +    }
> +  else
> +    strcpy (dest + dest_len, src);
> 
> -  return s;
> +  return dest;
>  }
> 
> 
> --
> Liubov Dmitrieva
> Intel Corporation
> 
> 2012/10/8 Ondřej Bílka <neleai@seznam.cz>
> >
> > This is the next version of my patch
> > http://sourceware.org/ml/libc-alpha/2012-06/msg00489.html
> >
> > I investigated strcat a bit further, and the speed degradation
> > was caused by improper usage of indirect functions.
> >
> > The strcat ifunc first tests bit_Fast_Unaligned_Load, which is
> > false on core2 and AMD processors. Then it checks ssse3 and
> > calls the ssse3 version.
> > But strcat_ssse3 inlines strlen_sse2_no_bsf, which on core2 and phenomII
> > is the slowest strlen variant unless the strings are larger than 2000, where
> > strlen_sse2 takes the lead.
> >
> > Then I deleted strcat variants that are no longer needed.
> >
> > The files ports/sysdeps/ia64/strcat.c and sysdeps/powerpc/strcat.c became
> > duplicates of string/strcat.c.
> >
> >
> >         * string/strcat.c: Reduce algorithm selection
> >           to strlen,strcpy
> >         * string/strncat.c: Likewise
> >         * sysdeps/powerpc/strcat.c: Duplicated string/strcat.c
> >         * ports/sysdeps/ia64/strcat.c: Likewise
> >
> >         * sysdeps/i386/i686/multiarch/Makefile: Updated
> >         * sysdeps/x86_64/multiarch/Makefile:    Updated
> >
> >         * sysdeps/i386/i486/strcat.S: No longer needed
> >         * sysdeps/i386/i686/multiarch/strcat-sse2.S:Likewise
> >         * sysdeps/i386/i686/multiarch/strcat-ssse3.S:Likewise
> >         * sysdeps/i386/i686/multiarch/strcat.S:Likewise
> >         * sysdeps/i386/i686/multiarch/strncat-c.c:Likewise
> >         * sysdeps/i386/i686/multiarch/strncat-sse2.S:Likewise
> >         * sysdeps/i386/i686/multiarch/strncat-ssse3.S:Likewise
> >         * sysdeps/i386/i686/multiarch/strncat.S:Likewise
> >         * sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S:Likewise
> >         * sysdeps/x86_64/multiarch/strcat-ssse3.S:Likewise
> >         * sysdeps/x86_64/multiarch/strcat.S:Likewise
> >         * sysdeps/x86_64/multiarch/strncat-c.c:Likewise
> >         * sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S:Likewise
> >         * sysdeps/x86_64/multiarch/strncat-ssse3.S:Likewise
> >         * sysdeps/x86_64/multiarch/strncat.S:Likewise
> >         * sysdeps/x86_64/strcat.S:Likewise
> >
> >
> > ---
> >  ports/sysdeps/ia64/strcat.c                       |   26 -
> >  string/strcat.c                                   |   29 +-
> >  string/strncat.c                                  |   62 +-
> >  sysdeps/i386/i486/strcat.S                        |  273 -----
> >  sysdeps/i386/i686/multiarch/Makefile              |    3 +-
> >  sysdeps/i386/i686/multiarch/strcat-sse2.S         | 1243
> > ---------------------
> >  sysdeps/i386/i686/multiarch/strcat-ssse3.S        |  572 ----------
> >  sysdeps/i386/i686/multiarch/strcat.S              |  119 --
> >  sysdeps/i386/i686/multiarch/strncat-c.c           |    8 -
> >  sysdeps/i386/i686/multiarch/strncat-sse2.S        |    4 -
> >  sysdeps/i386/i686/multiarch/strncat-ssse3.S       |    4 -
> >  sysdeps/i386/i686/multiarch/strncat.S             |    3 -
> >  sysdeps/powerpc/strcat.c                          |   30 -
> >  sysdeps/x86_64/multiarch/Makefile                 |    5 +-
> >  sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S  |   53 -
> >  sysdeps/x86_64/multiarch/strcat-ssse3.S           |  557 ---------
> >  sysdeps/x86_64/multiarch/strcat.S                 |   84 --
> >  sysdeps/x86_64/multiarch/strncat-c.c              |    8 -
> >  sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S |    3 -
> >  sysdeps/x86_64/multiarch/strncat-ssse3.S          |    3 -
> >  sysdeps/x86_64/multiarch/strncat.S                |    3 -
> >  sysdeps/x86_64/strcat.S                           |  259 -----
> >  24 files changed, 15 insertions(+), 4280 deletions(-)
> >  delete mode 100644 ports/sysdeps/ia64/strcat.c
> >  delete mode 100644 sysdeps/i386/i486/strcat.S
> >  delete mode 100644 sysdeps/i386/i686/multiarch/strcat-sse2.S
> >  delete mode 100644 sysdeps/i386/i686/multiarch/strcat-ssse3.S
> >  delete mode 100644 sysdeps/i386/i686/multiarch/strcat.S
> >  delete mode 100644 sysdeps/i386/i686/multiarch/strncat-c.c
> >  delete mode 100644 sysdeps/i386/i686/multiarch/strncat-sse2.S
> >  delete mode 100644 sysdeps/i386/i686/multiarch/strncat-ssse3.S
> >  delete mode 100644 sysdeps/i386/i686/multiarch/strncat.S
> >  delete mode 100644 sysdeps/powerpc/strcat.c
> >  delete mode 100644 sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
> >  delete mode 100644 sysdeps/x86_64/multiarch/strcat-ssse3.S
> >  delete mode 100644 sysdeps/x86_64/multiarch/strcat.S
> >  delete mode 100644 sysdeps/x86_64/multiarch/strncat-c.c
> >  delete mode 100644 sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S
> >  delete mode 100644 sysdeps/x86_64/multiarch/strncat-ssse3.S
> >  delete mode 100644 sysdeps/x86_64/multiarch/strncat.S
> >  delete mode 100644 sysdeps/x86_64/strcat.S
> >
> > diff --git a/ports/sysdeps/ia64/strcat.c b/ports/sysdeps/ia64/strcat.c
> > deleted file mode 100644
> > index 53cd4d1..0000000
> > --- a/ports/sysdeps/ia64/strcat.c
> > +++ /dev/null
> > @@ -1,26 +0,0 @@
> > -/* Copyright (C) 2004 Free Software Foundation, Inc.
> > -   This file is part of the GNU C Library.
> > -
> > -   The GNU C Library is free software; you can redistribute it and/or
> > -   modify it under the terms of the GNU Lesser General Public
> > -   License as published by the Free Software Foundation; either
> > -   version 2.1 of the License, or (at your option) any later version.
> > -
> > -   The GNU C Library is distributed in the hope that it will be useful,
> > -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > -   Lesser General Public License for more details.
> > -
> > -   You should have received a copy of the GNU Lesser General Public
> > -   License along with the GNU C Library; if not, see
> > -   <http://www.gnu.org/licenses/>.  */
> > -
> > -#include <string.h>
> > -
> > -char *
> > -strcat (char *dest, const char *src)
> > -{
> > -  strcpy (dest + strlen (dest), src);
> > -  return dest;
> > -}
> > -libc_hidden_builtin_def (strcat)
> > diff --git a/string/strcat.c b/string/strcat.c
> > index f9e4bc6..28575d0 100644
> > --- a/string/strcat.c
> > +++ b/string/strcat.c
> > @@ -1,4 +1,5 @@
> > -/* Copyright (C) 1991, 1997, 2003 Free Software Foundation, Inc.
> > +/* strcat version that uses fast strcpy/strlen.
> > +   Copyright (C) 1997, 2003 Free Software Foundation, Inc.
> >     This file is part of the GNU C Library.
> >
> >     The GNU C Library is free software; you can redistribute it and/or
> > @@ -16,36 +17,14 @@
> >     <http://www.gnu.org/licenses/>.  */
> >
> >  #include <string.h>
> > -#include <memcopy.h>
> >
> >  #undef strcat
> >
> >  /* Append SRC on the end of DEST.  */
> >  char *
> > -strcat (dest, src)
> > -     char *dest;
> > -     const char *src;
> > +strcat (char *dest, const char *src)
> >  {
> > -  char *s1 = dest;
> > -  const char *s2 = src;
> > -  char c;
> > -
> > -  /* Find the end of the string.  */
> > -  do
> > -    c = *s1++;
> > -  while (c != '\0');
> > -
> > -  /* Make S1 point before the next character, so we can increment
> > -     it while memory is read (wins on pipelined cpus).  */
> > -  s1 -= 2;
> > -
> > -  do
> > -    {
> > -      c = *s2++;
> > -      *++s1 = c;
> > -    }
> > -  while (c != '\0');
> > -
> > +  strcpy (dest + strlen (dest), src);
> >    return dest;
> >  }
> >  libc_hidden_builtin_def (strcat)
> > diff --git a/string/strncat.c b/string/strncat.c
> > index dcfb04d..17b4c9a 100644
> > --- a/string/strncat.c
> > +++ b/string/strncat.c
> > @@ -1,4 +1,4 @@
> > -/* Copyright (C) 1991,1997,2011 Free Software Foundation, Inc.
> > +/* Copyright (C) 1991-2012 Free Software Foundation, Inc.
> >     This file is part of the GNU C Library.
> >
> >     The GNU C Library is free software; you can redistribute it and/or
> > @@ -17,66 +17,20 @@
> >
> >  #include <string.h>
> >
> > -#ifdef _LIBC
> > -# include <memcopy.h>
> > -#endif
> > -
> >  #ifndef STRNCAT
> >  # undef strncat
> >  # define STRNCAT  strncat
> >  #endif
> >
> >  char *
> > -STRNCAT (char *s1, const char *s2, size_t n)
> > +STRNCAT (char *dest, const char *src, size_t n)
> >  {
> > -  char c;
> > -  char *s = s1;
> > -
> > -  /* Find the end of S1.  */
> > -  do
> > -    c = *s1++;
> > -  while (c != '\0');
> > -
> > -  /* Make S1 point before next character, so we can increment
> > -     it while memory is read (wins on pipelined cpus).  */
> > -  s1 -= 2;
> > -
> > -  if (n >= 4)
> > -    {
> > -      size_t n4 = n >> 2;
> > -      do
> > -       {
> > -         c = *s2++;
> > -         *++s1 = c;
> > -         if (c == '\0')
> > -           return s;
> > -         c = *s2++;
> > -         *++s1 = c;
> > -         if (c == '\0')
> > -           return s;
> > -         c = *s2++;
> > -         *++s1 = c;
> > -         if (c == '\0')
> > -           return s;
> > -         c = *s2++;
> > -         *++s1 = c;
> > -         if (c == '\0')
> > -           return s;
> > -       } while (--n4 > 0);
> > -      n &= 3;
> > -    }
> > -
> > -  while (n > 0)
> > -    {
> > -      c = *s2++;
> > -      *++s1 = c;
> > -      if (c == '\0')
> > -       return s;
> > -      n--;
> > -    }
> > +  size_t dest_len = strlen (dest);
> > +  size_t src_len  = strnlen (src , n);
> >
> > -  if (c != '\0')
> > -    *++s1 = '\0';
> > +  if (src_len == n)
> > +    {
> > +     memcpy (dest + dest_len, src, n);
> > +     dest[dest_len + n] = '\0';
> > +    }
> > +  else
> > +    strcpy (dest + dest_len, src);
> >
> > -  return s;
> > +  return dest;
> >  }
> > diff --git a/sysdeps/i386/i486/strcat.S b/sysdeps/i386/i486/strcat.S
> > deleted file mode 100644
> > index 7596a0d..0000000
> > --- a/sysdeps/i386/i486/strcat.S
> > +++ /dev/null
> > @@ -1,273 +0,0 @@
> > -/* strcat(dest, src) -- Append SRC on the end of DEST.
> > -   For Intel 80x86, x>=4.
> > -   Copyright (C) 1994-1997,2000,2003,2005 Free Software Foundation, Inc.
> > -   This file is part of the GNU C Library.
> > -   Contributed by Ulrich Drepper <drepper@ipd.info.uni-karlsruhe.de>.
> > -   Optimised a little by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
> > -
> > -   The GNU C Library is free software; you can redistribute it and/or
> > -   modify it under the terms of the GNU Lesser General Public
> > -   License as published by the Free Software Foundation; either
> > -   version 2.1 of the License, or (at your option) any later version.
> > -
> > -   The GNU C Library is distributed in the hope that it will be useful,
> > -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > -   Lesser General Public License for more details.
> > -
> > -   You should have received a copy of the GNU Lesser General Public
> > -   License along with the GNU C Library; if not, see
> > -   <http://www.gnu.org/licenses/>.  */
> > -
> > -#include <sysdep.h>
> > -#include "asm-syntax.h"
> > -#include "bp-sym.h"
> > -#include "bp-asm.h"
> > -
> > -#define PARMS  LINKAGE+4       /* space for 1 saved reg */
> > -#define RTN    PARMS
> > -#define DEST   RTN+RTN_SIZE
> > -#define SRC    DEST+PTR_SIZE
> > -
> > -       .text
> > -ENTRY (BP_SYM (strcat))
> > -       ENTER
> > -
> > -       pushl %edi              /* Save callee-safe register.  */
> > -       cfi_adjust_cfa_offset (4)
> > -
> > -       movl DEST(%esp), %edx
> > -       movl SRC(%esp), %ecx
> > -       CHECK_BOUNDS_LOW (%edx, DEST(%esp))
> > -       CHECK_BOUNDS_LOW (%ecx, SRC(%esp))
> > -
> > -       testb $0xff, (%ecx)     /* Is source string empty? */
> > -       jz L(8)                 /* yes => return */
> > -
> > -       /* Test the first bytes separately until destination is aligned.
> > */
> > -       testl $3, %edx          /* destination pointer aligned? */
> > -       jz L(1)                 /* yes => begin scan loop */
> > -       testb $0xff, (%edx)     /* is end of string? */
> > -       jz L(2)                 /* yes => start appending */
> > -       incl %edx               /* increment source pointer */
> > -
> > -       testl $3, %edx          /* destination pointer aligned? */
> > -       jz L(1)                 /* yes => begin scan loop */
> > -       testb $0xff, (%edx)     /* is end of string? */
> > -       jz L(2)                 /* yes => start appending */
> > -       incl %edx               /* increment source pointer */
> > -
> > -       testl $3, %edx          /* destination pointer aligned? */
> > -       jz L(1)                 /* yes => begin scan loop */
> > -       testb $0xff, (%edx)     /* is end of string? */
> > -       jz L(2)                 /* yes => start appending */
> > -       incl %edx               /* increment source pointer */
> > -
> > -       /* Now we are aligned.  Begin scan loop.  */
> > -       jmp L(1)
> > -
> > -       cfi_rel_offset (edi, 0)
> > -       ALIGN(4)
> > -
> > -L(4):  addl $16,%edx           /* increment destination pointer for round
> > */
> > -
> > -L(1):  movl (%edx), %eax       /* get word (= 4 bytes) in question */
> > -       movl $0xfefefeff, %edi  /* magic value */
> > -
> > -       /* If you compare this with the algorithm in memchr.S you will
> > -          notice that here is an `xorl' statement missing.  But you must
> > -          not forget that we are looking for C == 0 and `xorl $0, %eax'
> > -          is a no-op.  */
> > -
> > -       addl %eax, %edi         /* add the magic value to the word.  We
> > get
> > -                                  carry bits reported for each byte which
> > -                                  is *not* 0 */
> > -
> > -       /* According to the algorithm we had to reverse the effect of the
> > -          XOR first and then test the overflow bits.  But because the
> > -          following XOR would destroy the carry flag and it would (in a
> > -          representation with more than 32 bits) not alter then last
> > -          overflow, we can now test this condition.  If no carry is
> > signaled
> > -          no overflow must have occurred in the last byte => it was 0. */
> > -       jnc L(3)
> > -
> > -       /* We are only interested in carry bits that change due to the
> > -          previous add, so remove original bits */
> > -       xorl %eax, %edi         /* ((word^charmask)+magic)^(word^charmask)
> > */
> > -
> > -       /* Now test for the other three overflow bits.  */
> > -       orl $0xfefefeff, %edi   /* set all non-carry bits */
> > -       incl %edi               /* add 1: if one carry bit was *not* set
> > -                                  the addition will not result in 0.  */
> > -
> > -       /* If at least one byte of the word is C we don't get 0 in %ecx.
> > */
> > -       jnz L(3)
> > -
> > -       movl 4(%edx), %eax      /* get word from source */
> > -       movl $0xfefefeff, %edi  /* magic value */
> > -       addl %eax, %edi         /* add the magic value to the word.  We
> > get
> > -                                  carry bits reported for each byte which
> > -                                  is *not* 0 */
> > -       jnc L(5)                /* highest byte is C => stop copying */
> > -       xorl %eax, %edi         /* ((word^charmask)+magic)^(word^charmask)
> > */
> > -       orl $0xfefefeff, %edi   /* set all non-carry bits */
> > -       incl %edi               /* add 1: if one carry bit was *not* set
> > -                                  the addition will not result in 0.  */
> > -       jnz L(5)                /* one byte is NUL => stop copying */
> > -
> > -       movl 8(%edx), %eax      /* get word from source */
> > -       movl $0xfefefeff, %edi  /* magic value */
> > -       addl %eax, %edi         /* add the magic value to the word.  We
> > get
> > -                                  carry bits reported for each byte which
> > -                                  is *not* 0 */
> > -       jnc L(6)                /* highest byte is C => stop copying */
> > -       xorl %eax, %edi         /* ((word^charmask)+magic)^(word^charmask)
> > */
> > -       orl $0xfefefeff, %edi   /* set all non-carry bits */
> > -       incl %edi               /* add 1: if one carry bit was *not* set
> > -                                  the addition will not result in 0.  */
> > -       jnz L(6)                /* one byte is NUL => stop copying */
> > -
> > -       movl 12(%edx), %eax     /* get word from source */
> > -       movl $0xfefefeff, %edi  /* magic value */
> > -       addl %eax, %edi         /* add the magic value to the word.  We
> > get
> > -                                  carry bits reported for each byte which
> > -                                  is *not* 0 */
> > -       jnc L(7)                /* highest byte is C => stop copying */
> > -       xorl %eax, %edi         /* ((word^charmask)+magic)^(word^charmask)
> > */
> > -       orl $0xfefefeff, %edi   /* set all non-carry bits */
> > -       incl %edi               /* add 1: if one carry bit was *not* set
> > -                                  the addition will not result in 0.  */
> > -       jz L(4)                 /* no byte is NUL => carry on copying */
> > -
> > -L(7):  addl $4, %edx           /* adjust source pointer */
> > -L(6):  addl $4, %edx
> > -L(5):  addl $4, %edx
> > -
> > -L(3):  testb %al, %al          /* is first byte NUL? */
> > -       jz L(2)                 /* yes => start copying */
> > -       incl %edx               /* increment source pointer */
> > -
> > -       testb %ah, %ah          /* is second byte NUL? */
> > -       jz L(2)                 /* yes => start copying */
> > -       incl %edx               /* increment source pointer */
> > -
> > -       testl $0xff0000, %eax   /* is third byte NUL? */
> > -       jz L(2)                 /* yes => start copying */
> > -       incl %edx               /* increment source pointer */
> > -
> > -L(2):  subl %ecx, %edx         /* reduce number of loop variants */
> > -
> > -       /* Now we have to align the source pointer.  */
> > -       testl $3, %ecx          /* pointer correctly aligned? */
> > -       jz L(29)                /* yes => start copy loop */
> > -       movb (%ecx), %al        /* get first byte */
> > -       movb %al, (%ecx,%edx)   /* and store it */
> > -       andb %al, %al           /* is byte NUL? */
> > -       jz L(8)                 /* yes => return */
> > -       incl %ecx               /* increment pointer */
> > -
> > -       testl $3, %ecx          /* pointer correctly aligned? */
> > -       jz L(29)                /* yes => start copy loop */
> > -       movb (%ecx), %al        /* get first byte */
> > -       movb %al, (%ecx,%edx)   /* and store it */
> > -       andb %al, %al           /* is byte NUL? */
> > -       jz L(8)                 /* yes => return */
> > -       incl %ecx               /* increment pointer */
> > -
> > -       testl $3, %ecx          /* pointer correctly aligned? */
> > -       jz L(29)                /* yes => start copy loop */
> > -       movb (%ecx), %al        /* get first byte */
> > -       movb %al, (%ecx,%edx)   /* and store it */
> > -       andb %al, %al           /* is byte NUL? */
> > -       jz L(8)                 /* yes => return */
> > -       incl %ecx               /* increment pointer */
> > -
> > -       /* Now we are aligned.  */
> > -       jmp L(29)               /* start copy loop */
> > -
> > -       ALIGN(4)
> > -
> > -L(28): movl %eax, 12(%ecx,%edx)/* store word at destination */
> > -       addl $16, %ecx          /* adjust pointer for full round */
> > -
> > -L(29): movl (%ecx), %eax       /* get word from source */
> > -       movl $0xfefefeff, %edi  /* magic value */
> > -       addl %eax, %edi         /* add the magic value to the word.  We
> > get
> > -                                  carry bits reported for each byte which
> > -                                  is *not* 0 */
> > -       jnc L(9)                /* highest byte is C => stop copying */
> > -       xorl %eax, %edi         /* ((word^charmask)+magic)^(word^charmask)
> > */
> > -       orl $0xfefefeff, %edi   /* set all non-carry bits */
> > -       incl %edi               /* add 1: if one carry bit was *not* set
> > -                                  the addition will not result in 0.  */
> > -       jnz L(9)                /* one byte is NUL => stop copying */
> > -       movl %eax, (%ecx,%edx)  /* store word to destination */
> > -
> > -       movl 4(%ecx), %eax      /* get word from source */
> > -       movl $0xfefefeff, %edi  /* magic value */
> > -       addl %eax, %edi         /* add the magic value to the word.  We
> > get
> > -                                  carry bits reported for each byte which
> > -                                  is *not* 0 */
> > -       jnc L(91)               /* highest byte is C => stop copying */
> > -       xorl %eax, %edi         /* ((word^charmask)+magic)^(word^charmask)
> > */
> > -       orl $0xfefefeff, %edi   /* set all non-carry bits */
> > -       incl %edi               /* add 1: if one carry bit was *not* set
> > -                                  the addition will not result in 0.  */
> > -       jnz L(91)               /* one byte is NUL => stop copying */
> > -       movl %eax, 4(%ecx,%edx) /* store word to destination */
> > -
> > -       movl 8(%ecx), %eax      /* get word from source */
> > -       movl $0xfefefeff, %edi  /* magic value */
> > -       addl %eax, %edi         /* add the magic value to the word.  We
> > get
> > -                                  carry bits reported for each byte which
> > -                                  is *not* 0 */
> > -       jnc L(92)               /* highest byte is C => stop copying */
> > -       xorl %eax, %edi         /* ((word^charmask)+magic)^(word^charmask)
> > */
> > -       orl $0xfefefeff, %edi   /* set all non-carry bits */
> > -       incl %edi               /* add 1: if one carry bit was *not* set
> > -                                  the addition will not result in 0.  */
> > -       jnz L(92)               /* one byte is NUL => stop copying */
> > -       movl %eax, 8(%ecx,%edx) /* store word to destination */
> > -
> > -       movl 12(%ecx), %eax     /* get word from source */
> > -       movl $0xfefefeff, %edi  /* magic value */
> > -       addl %eax, %edi         /* add the magic value to the word.  We
> > get
> > -                                  carry bits reported for each byte which
> > -                                  is *not* 0 */
> > -       jnc L(93)               /* highest byte is C => stop copying */
> > -       xorl %eax, %edi         /* ((word^charmask)+magic)^(word^charmask)
> > */
> > -       orl $0xfefefeff, %edi   /* set all non-carry bits */
> > -       incl %edi               /* add 1: if one carry bit was *not* set
> > -                                  the addition will not result in 0.  */
> > -       jz L(28)                /* no is NUL => carry on copying */
> > -
> > -L(93): addl $4, %ecx           /* adjust pointer */
> > -L(92): addl $4, %ecx
> > -L(91): addl $4, %ecx
> > -
> > -L(9):  movb %al, (%ecx,%edx)   /* store first byte of last word */
> > -       orb %al, %al            /* is it NUL? */
> > -       jz L(8)                 /* yes => return */
> > -
> > -       movb %ah, 1(%ecx,%edx)  /* store second byte of last word */
> > -       orb %ah, %ah            /* is it NUL? */
> > -       jz L(8)                 /* yes => return */
> > -
> > -       shrl $16, %eax          /* make upper bytes accessible */
> > -       movb %al, 2(%ecx,%edx)  /* store third byte of last word */
> > -       orb %al, %al            /* is it NUL? */
> > -       jz L(8)                 /* yes => return */
> > -
> > -       movb %ah, 3(%ecx,%edx)  /* store fourth byte of last word */
> > -
> > -L(8):  /* GKM FIXME: check high bounds */
> > -       movl DEST(%esp), %eax   /* start address of destination is result
> > */
> > -       RETURN_BOUNDED_POINTER (DEST(%esp))
> > -       popl %edi               /* restore saved register */
> > -       cfi_adjust_cfa_offset (-4)
> > -       cfi_restore (edi)
> > -
> > -       LEAVE
> > -       RET_PTR
> > -END (BP_SYM (strcat))
> > -libc_hidden_builtin_def (strcat)
> > diff --git a/sysdeps/i386/i686/multiarch/Makefile
> > b/sysdeps/i386/i686/multiarch/Makefile
> > index 8946bfa..92a2b8f 100644
> > --- a/sysdeps/i386/i686/multiarch/Makefile
> > +++ b/sysdeps/i386/i686/multiarch/Makefile
> > @@ -14,8 +14,7 @@ sysdep_routines += bzero-sse2 memset-sse2 memcpy-ssse3
> > mempcpy-ssse3 \
> >                    memcmp-ssse3 memcmp-sse4 strcasestr-nonascii varshift \
> >                    strlen-sse2 strlen-sse2-bsf strncpy-c strcpy-ssse3 \
> >                    strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 strcpy-sse2 \
> > -                  strncpy-sse2 stpcpy-sse2 stpncpy-sse2 strcat-ssse3 \
> > -                  strcat-sse2 strncat-ssse3 strncat-sse2 strncat-c \
> > +                  strncpy-sse2 stpcpy-sse2 stpncpy-sse2 \
> >                    strchr-sse2 strrchr-sse2 strchr-sse2-bsf
> > strrchr-sse2-bsf \
> >                    memchr-sse2 memchr-sse2-bsf \
> >                    memrchr-sse2 memrchr-sse2-bsf memrchr-c \
> > diff --git a/sysdeps/i386/i686/multiarch/strcat-sse2.S
> > b/sysdeps/i386/i686/multiarch/strcat-sse2.S
> > deleted file mode 100644
> > index e75f92c..0000000
> > --- a/sysdeps/i386/i686/multiarch/strcat-sse2.S
> > +++ /dev/null
> > @@ -1,1243 +0,0 @@
> > -/* strcat with SSE2
> > -   Copyright (C) 2011-2012 Free Software Foundation, Inc.
> > -   Contributed by Intel Corporation.
> > -   This file is part of the GNU C Library.
> > -
> > -   The GNU C Library is free software; you can redistribute it and/or
> > -   modify it under the terms of the GNU Lesser General Public
> > -   License as published by the Free Software Foundation; either
> > -   version 2.1 of the License, or (at your option) any later version.
> > -
> > -   The GNU C Library is distributed in the hope that it will be useful,
> > -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > -   Lesser General Public License for more details.
> > -
> > -   You should have received a copy of the GNU Lesser General Public
> > -   License along with the GNU C Library; if not, see
> > -   <http://www.gnu.org/licenses/>.  */
> > -
> > -
> > -#ifndef NOT_IN_libc
> > -
> > -# include <sysdep.h>
> > -
> > -
> > -# define CFI_PUSH(REG) \
> > -       cfi_adjust_cfa_offset (4);      \
> > -       cfi_rel_offset (REG, 0)
> > -
> > -# define CFI_POP(REG)  \
> > -       cfi_adjust_cfa_offset (-4);     \
> > -       cfi_restore (REG)
> > -
> > -# define PUSH(REG) pushl REG; CFI_PUSH (REG)
> > -# define POP(REG) popl REG; CFI_POP (REG)
> > -
> > -# ifdef SHARED
> > -#  define JMPTBL(I, B) I - B
> > -
> > -/* Load an entry in a jump table into ECX and branch to it.  TABLE is a
> > -       jump table with relative offsets.  INDEX is a register contains
> > the
> > -       index into the jump table.   SCALE is the scale of INDEX. */
> > -
> > -#  define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)  \
> > -       /* We first load PC into ECX.  */       \
> > -       SETUP_PIC_REG(cx);      \
> > -       /* Get the address of the jump table.  */       \
> > -       addl    $(TABLE - .), %ecx;     \
> > -       /* Get the entry and convert the relative offset to the \
> > -       absolute address.  */   \
> > -       addl    (%ecx,INDEX,SCALE), %ecx;       \
> > -       /* We loaded the jump table and adjuested ECX. Go.  */  \
> > -       jmp     *%ecx
> > -# else
> > -#  define JMPTBL(I, B) I
> > -
> > -/* Branch to an entry in a jump table.  TABLE is a jump table with
> > -       absolute offsets.  INDEX is a register contains the index into the
> > -       jump table.  SCALE is the scale of INDEX. */
> > -
> > -#  define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)  \
> > -       jmp     *TABLE(,INDEX,SCALE)
> > -# endif
> > -
> > -# ifndef STRCAT
> > -#  define STRCAT  __strcat_sse2
> > -# endif
> > -
> > -# define PARMS  4
> > -# define STR1  PARMS+4
> > -# define STR2  STR1+4
> > -
> > -# ifdef USE_AS_STRNCAT
> > -#  define LEN    STR2+8
> > -#  define STR3   STR1+4
> > -# else
> > -#  define STR3   STR1
> > -# endif
> > -
> > -# define USE_AS_STRCAT
> > -# ifdef USE_AS_STRNCAT
> > -#  define RETURN  POP(%ebx); POP(%esi); ret; CFI_PUSH(%ebx);
> > CFI_PUSH(%esi);
> > -# else
> > -#  define RETURN  POP(%esi); ret; CFI_PUSH(%esi);
> > -# endif
> > -
> > -.text
> > -ENTRY (STRCAT)
> > -       PUSH    (%esi)
> > -       mov     STR1(%esp), %eax
> > -       mov     STR2(%esp), %esi
> > -# ifdef USE_AS_STRNCAT
> > -       PUSH    (%ebx)
> > -       movl    LEN(%esp), %ebx
> > -       test    %ebx, %ebx
> > -       jz      L(ExitZero)
> > -# endif
> > -       cmpb    $0, (%esi)
> > -       mov     %esi, %ecx
> > -       mov     %eax, %edx
> > -       jz      L(ExitZero)
> > -
> > -       and     $63, %ecx
> > -       and     $63, %edx
> > -       cmp     $32, %ecx
> > -       ja      L(StrlenCore7_1)
> > -       cmp     $48, %edx
> > -       ja      L(alignment_prolog)
> > -
> > -       pxor    %xmm0, %xmm0
> > -       pxor    %xmm4, %xmm4
> > -       pxor    %xmm7, %xmm7
> > -       movdqu  (%eax), %xmm1
> > -       movdqu  (%esi), %xmm5
> > -       pcmpeqb %xmm1, %xmm0
> > -       movdqu  16(%esi), %xmm6
> > -       pmovmskb %xmm0, %ecx
> > -       pcmpeqb %xmm5, %xmm4
> > -       pcmpeqb %xmm6, %xmm7
> > -       test    %ecx, %ecx
> > -       jnz     L(exit_less16_)
> > -       mov     %eax, %ecx
> > -       and     $-16, %eax
> > -       jmp     L(loop_prolog)
> > -
> > -L(alignment_prolog):
> > -       pxor    %xmm0, %xmm0
> > -       pxor    %xmm4, %xmm4
> > -       mov     %edx, %ecx
> > -       pxor    %xmm7, %xmm7
> > -       and     $15, %ecx
> > -       and     $-16, %eax
> > -       pcmpeqb (%eax), %xmm0
> > -       movdqu  (%esi), %xmm5
> > -       movdqu  16(%esi), %xmm6
> > -       pmovmskb %xmm0, %edx
> > -       pcmpeqb %xmm5, %xmm4
> > -       shr     %cl, %edx
> > -       pcmpeqb %xmm6, %xmm7
> > -       test    %edx, %edx
> > -       jnz     L(exit_less16)
> > -       add     %eax, %ecx
> > -
> > -       pxor    %xmm0, %xmm0
> > -L(loop_prolog):
> > -       pxor    %xmm1, %xmm1
> > -       pxor    %xmm2, %xmm2
> > -       pxor    %xmm3, %xmm3
> > -       .p2align 4
> > -L(align16_loop):
> > -       pcmpeqb 16(%eax), %xmm0
> > -       pmovmskb %xmm0, %edx
> > -       test    %edx, %edx
> > -       jnz     L(exit16)
> > -
> > -       pcmpeqb 32(%eax), %xmm1
> > -       pmovmskb %xmm1, %edx
> > -       test    %edx, %edx
> > -       jnz     L(exit32)
> > -
> > -       pcmpeqb 48(%eax), %xmm2
> > -       pmovmskb %xmm2, %edx
> > -       test    %edx, %edx
> > -       jnz     L(exit48)
> > -
> > -       pcmpeqb 64(%eax), %xmm3
> > -       pmovmskb %xmm3, %edx
> > -       lea     64(%eax), %eax
> > -       test    %edx, %edx
> > -       jz      L(align16_loop)
> > -       bsf     %edx, %edx
> > -       add     %edx, %eax
> > -       jmp     L(StartStrcpyPart)
> > -
> > -       .p2align 4
> > -L(exit16):
> > -       bsf     %edx, %edx
> > -       lea     16(%eax, %edx), %eax
> > -       jmp     L(StartStrcpyPart)
> > -
> > -       .p2align 4
> > -L(exit32):
> > -       bsf     %edx, %edx
> > -       lea     32(%eax, %edx), %eax
> > -       jmp     L(StartStrcpyPart)
> > -
> > -       .p2align 4
> > -L(exit48):
> > -       bsf     %edx, %edx
> > -       lea     48(%eax, %edx), %eax
> > -       jmp     L(StartStrcpyPart)
> > -
> > -       .p2align 4
> > -L(exit_less16):
> > -       bsf     %edx, %edx
> > -       add     %ecx, %eax
> > -       add     %edx, %eax
> > -       jmp     L(StartStrcpyPart)
> > -
> > -       .p2align 4
> > -L(exit_less16_):
> > -       bsf     %ecx, %ecx
> > -       add     %ecx, %eax
> > -
> > -       .p2align 4
> > -L(StartStrcpyPart):
> > -       pmovmskb %xmm4, %edx
> > -# ifdef USE_AS_STRNCAT
> > -       cmp     $16, %ebx
> > -       jbe     L(CopyFrom1To16BytesTail1Case2OrCase3)
> > -# endif
> > -       test    %edx, %edx
> > -       jnz     L(CopyFrom1To16BytesTail1)
> > -
> > -       movdqu  %xmm5, (%eax)
> > -       pmovmskb %xmm7, %edx
> > -# ifdef USE_AS_STRNCAT
> > -       cmp     $32, %ebx
> > -       jbe     L(CopyFrom1To32Bytes1Case2OrCase3)
> > -# endif
> > -       test    %edx, %edx
> > -       jnz     L(CopyFrom1To32Bytes1)
> > -
> > -       mov     %esi, %ecx
> > -       and     $-16, %esi
> > -       and     $15, %ecx
> > -       pxor    %xmm0, %xmm0
> > -# ifdef USE_AS_STRNCAT
> > -       add     %ecx, %ebx
> > -# endif
> > -       sub     %ecx, %eax
> > -       jmp     L(Unalign16Both)
> > -
> > -L(StrlenCore7_1):
> > -       mov     %eax, %ecx
> > -       pxor    %xmm0, %xmm0
> > -       and     $15, %ecx
> > -       and     $-16, %eax
> > -       pcmpeqb (%eax), %xmm0
> > -       pmovmskb %xmm0, %edx
> > -       shr     %cl, %edx
> > -       test    %edx, %edx
> > -       jnz     L(exit_less16_1)
> > -       add     %eax, %ecx
> > -
> > -       pxor    %xmm0, %xmm0
> > -       pxor    %xmm1, %xmm1
> > -       pxor    %xmm2, %xmm2
> > -       pxor    %xmm3, %xmm3
> > -
> > -       .p2align 4
> > -L(align16_loop_1):
> > -       pcmpeqb 16(%eax), %xmm0
> > -       pmovmskb %xmm0, %edx
> > -       test    %edx, %edx
> > -       jnz     L(exit16_1)
> > -
> > -       pcmpeqb 32(%eax), %xmm1
> > -       pmovmskb %xmm1, %edx
> > -       test    %edx, %edx
> > -       jnz     L(exit32_1)
> > -
> > -       pcmpeqb 48(%eax), %xmm2
> > -       pmovmskb %xmm2, %edx
> > -       test    %edx, %edx
> > -       jnz     L(exit48_1)
> > -
> > -       pcmpeqb 64(%eax), %xmm3
> > -       pmovmskb %xmm3, %edx
> > -       lea     64(%eax), %eax
> > -       test    %edx, %edx
> > -       jz      L(align16_loop_1)
> > -       bsf     %edx, %edx
> > -       add     %edx, %eax
> > -       jmp     L(StartStrcpyPart_1)
> > -
> > -       .p2align 4
> > -L(exit16_1):
> > -       bsf     %edx, %edx
> > -       lea     16(%eax, %edx), %eax
> > -       jmp     L(StartStrcpyPart_1)
> > -
> > -       .p2align 4
> > -L(exit32_1):
> > -       bsf     %edx, %edx
> > -       lea     32(%eax, %edx), %eax
> > -       jmp     L(StartStrcpyPart_1)
> > -
> > -       .p2align 4
> > -L(exit48_1):
> > -       bsf     %edx, %edx
> > -       lea     48(%eax, %edx), %eax
> > -       jmp     L(StartStrcpyPart_1)
> > -
> > -       .p2align 4
> > -L(exit_less16_1):
> > -       bsf     %edx, %edx
> > -       add     %ecx, %eax
> > -       add     %edx, %eax
> > -
> > -       .p2align 4
> > -L(StartStrcpyPart_1):
> > -       mov     %esi, %ecx
> > -       and     $15, %ecx
> > -       and     $-16, %esi
> > -       pxor    %xmm0, %xmm0
> > -       pxor    %xmm1, %xmm1
> > -
> > -# ifdef USE_AS_STRNCAT
> > -       cmp     $48, %ebx
> > -       ja      L(BigN)
> > -# endif
> > -       pcmpeqb (%esi), %xmm1
> > -# ifdef USE_AS_STRNCAT
> > -       add     %ecx, %ebx
> > -# endif
> > -       pmovmskb %xmm1, %edx
> > -       shr     %cl, %edx
> > -# ifdef USE_AS_STRNCAT
> > -       cmp     $16, %ebx
> > -       jbe     L(CopyFrom1To16BytesTailCase2OrCase3)
> > -# endif
> > -       test    %edx, %edx
> > -       jnz     L(CopyFrom1To16BytesTail)
> > -
> > -       pcmpeqb 16(%esi), %xmm0
> > -       pmovmskb %xmm0, %edx
> > -# ifdef USE_AS_STRNCAT
> > -       cmp     $32, %ebx
> > -       jbe     L(CopyFrom1To32BytesCase2OrCase3)
> > -# endif
> > -       test    %edx, %edx
> > -       jnz     L(CopyFrom1To32Bytes)
> > -
> > -       movdqu  (%esi, %ecx), %xmm1   /* copy 16 bytes */
> > -       movdqu  %xmm1, (%eax)
> > -       sub     %ecx, %eax
> > -
> > -       .p2align 4
> > -L(Unalign16Both):
> > -       mov     $16, %ecx
> > -       movdqa  (%esi, %ecx), %xmm1
> > -       movaps  16(%esi, %ecx), %xmm2
> > -       movdqu  %xmm1, (%eax, %ecx)
> > -       pcmpeqb %xmm2, %xmm0
> > -       pmovmskb %xmm0, %edx
> > -       add     $16, %ecx
> > -# ifdef USE_AS_STRNCAT
> > -       sub     $48, %ebx
> > -       jbe     L(CopyFrom1To16BytesCase2OrCase3)
> > -# endif
> > -       test    %edx, %edx
> > -       jnz     L(CopyFrom1To16Bytes)
> > -L(Unalign16BothBigN):
> > -       movaps  16(%esi, %ecx), %xmm3
> > -       movdqu  %xmm2, (%eax, %ecx)
> > -       pcmpeqb %xmm3, %xmm0
> > -       pmovmskb %xmm0, %edx
> > -       add     $16, %ecx
> > -# ifdef USE_AS_STRNCAT
> > -       sub     $16, %ebx
> > -       jbe     L(CopyFrom1To16BytesCase2OrCase3)
> > -# endif
> > -       test    %edx, %edx
> > -       jnz     L(CopyFrom1To16Bytes)
> > -
> > -       movaps  16(%esi, %ecx), %xmm4
> > -       movdqu  %xmm3, (%eax, %ecx)
> > -       pcmpeqb %xmm4, %xmm0
> > -       pmovmskb %xmm0, %edx
> > -       add     $16, %ecx
> > -# ifdef USE_AS_STRNCAT
> > -       sub     $16, %ebx
> > -       jbe     L(CopyFrom1To16BytesCase2OrCase3)
> > -# endif
> > -       test    %edx, %edx
> > -       jnz     L(CopyFrom1To16Bytes)
> > -
> > -       movaps  16(%esi, %ecx), %xmm1
> > -       movdqu  %xmm4, (%eax, %ecx)
> > -       pcmpeqb %xmm1, %xmm0
> > -       pmovmskb %xmm0, %edx
> > -       add     $16, %ecx
> > -# ifdef USE_AS_STRNCAT
> > -       sub     $16, %ebx
> > -       jbe     L(CopyFrom1To16BytesCase2OrCase3)
> > -# endif
> > -       test    %edx, %edx
> > -       jnz     L(CopyFrom1To16Bytes)
> > -
> > -       movaps  16(%esi, %ecx), %xmm2
> > -       movdqu  %xmm1, (%eax, %ecx)
> > -       pcmpeqb %xmm2, %xmm0
> > -       pmovmskb %xmm0, %edx
> > -       add     $16, %ecx
> > -# ifdef USE_AS_STRNCAT
> > -       sub     $16, %ebx
> > -       jbe     L(CopyFrom1To16BytesCase2OrCase3)
> > -# endif
> > -       test    %edx, %edx
> > -       jnz     L(CopyFrom1To16Bytes)
> > -
> > -       movaps  16(%esi, %ecx), %xmm3
> > -       movdqu  %xmm2, (%eax, %ecx)
> > -       pcmpeqb %xmm3, %xmm0
> > -       pmovmskb %xmm0, %edx
> > -       add     $16, %ecx
> > -# ifdef USE_AS_STRNCAT
> > -       sub     $16, %ebx
> > -       jbe     L(CopyFrom1To16BytesCase2OrCase3)
> > -# endif
> > -       test    %edx, %edx
> > -       jnz     L(CopyFrom1To16Bytes)
> > -
> > -       movdqu  %xmm3, (%eax, %ecx)
> > -       mov     %esi, %edx
> > -       lea     16(%esi, %ecx), %esi
> > -       and     $-0x40, %esi
> > -       sub     %esi, %edx
> > -       sub     %edx, %eax
> > -# ifdef USE_AS_STRNCAT
> > -       lea     128(%ebx, %edx), %ebx
> > -# endif
> > -       movaps  (%esi), %xmm2
> > -       movaps  %xmm2, %xmm4
> > -       movaps  16(%esi), %xmm5
> > -       movaps  32(%esi), %xmm3
> > -       movaps  %xmm3, %xmm6
> > -       movaps  48(%esi), %xmm7
> > -       pminub  %xmm5, %xmm2
> > -       pminub  %xmm7, %xmm3
> > -       pminub  %xmm2, %xmm3
> > -       pcmpeqb %xmm0, %xmm3
> > -       pmovmskb %xmm3, %edx
> > -# ifdef USE_AS_STRNCAT
> > -       sub     $64, %ebx
> > -       jbe     L(UnalignedLeaveCase2OrCase3)
> > -# endif
> > -       test    %edx, %edx
> > -       jnz     L(Unaligned64Leave)
> > -
> > -       .p2align 4
> > -L(Unaligned64Loop_start):
> > -       add     $64, %eax
> > -       add     $64, %esi
> > -       movdqu  %xmm4, -64(%eax)
> > -       movaps  (%esi), %xmm2
> > -       movdqa  %xmm2, %xmm4
> > -       movdqu  %xmm5, -48(%eax)
> > -       movaps  16(%esi), %xmm5
> > -       pminub  %xmm5, %xmm2
> > -       movaps  32(%esi), %xmm3
> > -       movdqu  %xmm6, -32(%eax)
> > -       movaps  %xmm3, %xmm6
> > -       movdqu  %xmm7, -16(%eax)
> > -       movaps  48(%esi), %xmm7
> > -       pminub  %xmm7, %xmm3
> > -       pminub  %xmm2, %xmm3
> > -       pcmpeqb %xmm0, %xmm3
> > -       pmovmskb %xmm3, %edx
> > -# ifdef USE_AS_STRNCAT
> > -       sub     $64, %ebx
> > -       jbe     L(UnalignedLeaveCase2OrCase3)
> > -# endif
> > -       test    %edx, %edx
> > -       jz      L(Unaligned64Loop_start)
> > -
> > -L(Unaligned64Leave):
> > -       pxor    %xmm1, %xmm1
> > -
> > -       pcmpeqb %xmm4, %xmm0
> > -       pcmpeqb %xmm5, %xmm1
> > -       pmovmskb %xmm0, %edx
> > -       pmovmskb %xmm1, %ecx
> > -       test    %edx, %edx
> > -       jnz     L(CopyFrom1To16BytesUnaligned_0)
> > -       test    %ecx, %ecx
> > -       jnz     L(CopyFrom1To16BytesUnaligned_16)
> > -
> > -       pcmpeqb %xmm6, %xmm0
> > -       pcmpeqb %xmm7, %xmm1
> > -       pmovmskb %xmm0, %edx
> > -       pmovmskb %xmm1, %ecx
> > -       test    %edx, %edx
> > -       jnz     L(CopyFrom1To16BytesUnaligned_32)
> > -
> > -       bsf     %ecx, %edx
> > -       movdqu  %xmm4, (%eax)
> > -       movdqu  %xmm5, 16(%eax)
> > -       movdqu  %xmm6, 32(%eax)
> > -       add     $48, %esi
> > -       add     $48, %eax
> > -       BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
> > -
> > -# ifdef USE_AS_STRNCAT
> > -       .p2align 4
> > -L(BigN):
> > -       pcmpeqb (%esi), %xmm1
> > -       pmovmskb %xmm1, %edx
> > -       shr     %cl, %edx
> > -       test    %edx, %edx
> > -       jnz     L(CopyFrom1To16BytesTail)
> > -
> > -       pcmpeqb 16(%esi), %xmm0
> > -       pmovmskb %xmm0, %edx
> > -       test    %edx, %edx
> > -       jnz     L(CopyFrom1To32Bytes)
> > -
> > -       movdqu  (%esi, %ecx), %xmm1   /* copy 16 bytes */
> > -       movdqu  %xmm1, (%eax)
> > -       sub     %ecx, %eax
> > -       sub     $48, %ebx
> > -       add     %ecx, %ebx
> > -
> > -       mov     $16, %ecx
> > -       movdqa  (%esi, %ecx), %xmm1
> > -       movaps  16(%esi, %ecx), %xmm2
> > -       movdqu  %xmm1, (%eax, %ecx)
> > -       pcmpeqb %xmm2, %xmm0
> > -       pmovmskb %xmm0, %edx
> > -       add     $16, %ecx
> > -       test    %edx, %edx
> > -       jnz     L(CopyFrom1To16Bytes)
> > -       jmp     L(Unalign16BothBigN)
> > -# endif
> > -
> > -/*------------end of main part-------------------------------*/
> > -
> > -/* Case1 */
> > -       .p2align 4
> > -L(CopyFrom1To16Bytes):
> > -       add     %ecx, %eax
> > -       add     %ecx, %esi
> > -       bsf     %edx, %edx
> > -       BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
> > -
> > -       .p2align 4
> > -L(CopyFrom1To16BytesTail):
> > -       add     %ecx, %esi
> > -       bsf     %edx, %edx
> > -       BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
> > -
> > -       .p2align 4
> > -L(CopyFrom1To32Bytes1):
> > -       add     $16, %esi
> > -       add     $16, %eax
> > -L(CopyFrom1To16BytesTail1):
> > -       bsf     %edx, %edx
> > -       BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
> > -
> > -       .p2align 4
> > -L(CopyFrom1To32Bytes):
> > -       bsf     %edx, %edx
> > -       add     %ecx, %esi
> > -       add     $16, %edx
> > -       sub     %ecx, %edx
> > -       BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
> > -
> > -       .p2align 4
> > -L(CopyFrom1To16BytesUnaligned_0):
> > -       bsf     %edx, %edx
> > -       BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
> > -
> > -       .p2align 4
> > -L(CopyFrom1To16BytesUnaligned_16):
> > -       bsf     %ecx, %edx
> > -       movdqu  %xmm4, (%eax)
> > -       add     $16, %esi
> > -       add     $16, %eax
> > -       BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
> > -
> > -       .p2align 4
> > -L(CopyFrom1To16BytesUnaligned_32):
> > -       bsf     %edx, %edx
> > -       movdqu  %xmm4, (%eax)
> > -       movdqu  %xmm5, 16(%eax)
> > -       add     $32, %esi
> > -       add     $32, %eax
> > -       BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
> > -
> > -# ifdef USE_AS_STRNCAT
> > -
> > -       .p2align 4
> > -L(CopyFrom1To16BytesExit):
> > -       BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
> > -
> > -/* Case2 */
> > -
> > -       .p2align 4
> > -L(CopyFrom1To16BytesCase2):
> > -       add     $16, %ebx
> > -       add     %ecx, %eax
> > -       add     %ecx, %esi
> > -       bsf     %edx, %edx
> > -       cmp     %ebx, %edx
> > -       jb      L(CopyFrom1To16BytesExit)
> > -       BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
> > -
> > -       .p2align 4
> > -L(CopyFrom1To32BytesCase2):
> > -       sub     %ecx, %ebx
> > -       add     %ecx, %esi
> > -       bsf     %edx, %edx
> > -       add     $16, %edx
> > -       sub     %ecx, %edx
> > -       cmp     %ebx, %edx
> > -       jb      L(CopyFrom1To16BytesExit)
> > -       BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
> > -
> > -L(CopyFrom1To16BytesTailCase2):
> > -       sub     %ecx, %ebx
> > -       add     %ecx, %esi
> > -       bsf     %edx, %edx
> > -       cmp     %ebx, %edx
> > -       jb      L(CopyFrom1To16BytesExit)
> > -       BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
> > -
> > -L(CopyFrom1To16BytesTail1Case2):
> > -       bsf     %edx, %edx
> > -       cmp     %ebx, %edx
> > -       jb      L(CopyFrom1To16BytesExit)
> > -       BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
> > -
> > -/* Case2 or Case3,  Case3 */
> > -
> > -       .p2align 4
> > -L(CopyFrom1To16BytesCase2OrCase3):
> > -       test    %edx, %edx
> > -       jnz     L(CopyFrom1To16BytesCase2)
> > -L(CopyFrom1To16BytesCase3):
> > -       add     $16, %ebx
> > -       add     %ecx, %eax
> > -       add     %ecx, %esi
> > -       BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
> > -
> > -       .p2align 4
> > -L(CopyFrom1To32BytesCase2OrCase3):
> > -       test    %edx, %edx
> > -       jnz     L(CopyFrom1To32BytesCase2)
> > -       sub     %ecx, %ebx
> > -       add     %ecx, %esi
> > -       BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
> > -
> > -       .p2align 4
> > -L(CopyFrom1To16BytesTailCase2OrCase3):
> > -       test    %edx, %edx
> > -       jnz     L(CopyFrom1To16BytesTailCase2)
> > -       sub     %ecx, %ebx
> > -       add     %ecx, %esi
> > -       BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
> > -
> > -       .p2align 4
> > -L(CopyFrom1To32Bytes1Case2OrCase3):
> > -       add     $16, %eax
> > -       add     $16, %esi
> > -       sub     $16, %ebx
> > -L(CopyFrom1To16BytesTail1Case2OrCase3):
> > -       test    %edx, %edx
> > -       jnz     L(CopyFrom1To16BytesTail1Case2)
> > -       BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
> > -
> > -# endif
> > -
> > -# ifdef USE_AS_STRNCAT
> > -       .p2align 4
> > -L(StrncatExit0):
> > -       movb    %bh, (%eax)
> > -       mov     STR3(%esp), %eax
> > -       RETURN
> > -# endif
> > -
> > -       .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit1):
> > -       movb    %bh, 1(%eax)
> > -# endif
> > -L(Exit1):
> > -# ifdef USE_AS_STRNCAT
> > -       movb    (%esi), %dh
> > -# endif
> > -       movb    %dh, (%eax)
> > -       mov     STR3(%esp), %eax
> > -       RETURN
> > -
> > -       .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit2):
> > -       movb    %bh, 2(%eax)
> > -# endif
> > -L(Exit2):
> > -       movw    (%esi), %dx
> > -       movw    %dx, (%eax)
> > -       mov     STR3(%esp), %eax
> > -       RETURN
> > -
> > -       .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit3):
> > -       movb    %bh, 3(%eax)
> > -# endif
> > -L(Exit3):
> > -       movw    (%esi), %cx
> > -       movw    %cx, (%eax)
> > -# ifdef USE_AS_STRNCAT
> > -       movb    2(%esi), %dh
> > -# endif
> > -       movb    %dh, 2(%eax)
> > -       mov     STR3(%esp), %eax
> > -       RETURN
> > -
> > -       .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit4):
> > -       movb    %bh, 4(%eax)
> > -# endif
> > -L(Exit4):
> > -       movl    (%esi), %edx
> > -       movl    %edx, (%eax)
> > -       mov     STR3(%esp), %eax
> > -       RETURN
> > -
> > -       .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit5):
> > -       movb    %bh, 5(%eax)
> > -# endif
> > -L(Exit5):
> > -       movl    (%esi), %ecx
> > -# ifdef USE_AS_STRNCAT
> > -       movb    4(%esi), %dh
> > -# endif
> > -       movb    %dh, 4(%eax)
> > -       movl    %ecx, (%eax)
> > -       mov     STR3(%esp), %eax
> > -       RETURN
> > -
> > -       .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit6):
> > -       movb    %bh, 6(%eax)
> > -# endif
> > -L(Exit6):
> > -       movl    (%esi), %ecx
> > -       movw    4(%esi), %dx
> > -       movl    %ecx, (%eax)
> > -       movw    %dx, 4(%eax)
> > -       mov     STR3(%esp), %eax
> > -       RETURN
> > -
> > -       .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit7):
> > -       movb    %bh, 7(%eax)
> > -# endif
> > -L(Exit7):
> > -       movl    (%esi), %ecx
> > -       movl    3(%esi), %edx
> > -       movl    %ecx, (%eax)
> > -       movl    %edx, 3(%eax)
> > -       mov     STR3(%esp), %eax
> > -       RETURN
> > -
> > -       .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit8):
> > -       movb    %bh, 8(%eax)
> > -# endif
> > -L(Exit8):
> > -       movlpd  (%esi), %xmm0
> > -       movlpd  %xmm0, (%eax)
> > -       mov     STR3(%esp), %eax
> > -       RETURN
> > -
> > -       .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit9):
> > -       movb    %bh, 9(%eax)
> > -# endif
> > -L(Exit9):
> > -       movlpd  (%esi), %xmm0
> > -# ifdef USE_AS_STRNCAT
> > -       movb    8(%esi), %dh
> > -# endif
> > -       movb    %dh, 8(%eax)
> > -       movlpd  %xmm0, (%eax)
> > -       mov     STR3(%esp), %eax
> > -       RETURN
> > -
> > -       .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit10):
> > -       movb    %bh, 10(%eax)
> > -# endif
> > -L(Exit10):
> > -       movlpd  (%esi), %xmm0
> > -       movw    8(%esi), %dx
> > -       movlpd  %xmm0, (%eax)
> > -       movw    %dx, 8(%eax)
> > -       mov     STR3(%esp), %eax
> > -       RETURN
> > -
> > -       .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit11):
> > -       movb    %bh, 11(%eax)
> > -# endif
> > -L(Exit11):
> > -       movlpd  (%esi), %xmm0
> > -       movl    7(%esi), %edx
> > -       movlpd  %xmm0, (%eax)
> > -       movl    %edx, 7(%eax)
> > -       mov     STR3(%esp), %eax
> > -       RETURN
> > -
> > -       .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit12):
> > -       movb    %bh, 12(%eax)
> > -# endif
> > -L(Exit12):
> > -       movlpd  (%esi), %xmm0
> > -       movl    8(%esi), %edx
> > -       movlpd  %xmm0, (%eax)
> > -       movl    %edx, 8(%eax)
> > -       mov     STR3(%esp), %eax
> > -       RETURN
> > -
> > -       .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit13):
> > -       movb    %bh, 13(%eax)
> > -# endif
> > -L(Exit13):
> > -       movlpd  (%esi), %xmm0
> > -       movlpd  5(%esi), %xmm1
> > -       movlpd  %xmm0, (%eax)
> > -       movlpd  %xmm1, 5(%eax)
> > -       mov     STR3(%esp), %eax
> > -       RETURN
> > -
> > -       .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit14):
> > -       movb    %bh, 14(%eax)
> > -# endif
> > -L(Exit14):
> > -       movlpd  (%esi), %xmm0
> > -       movlpd  6(%esi), %xmm1
> > -       movlpd  %xmm0, (%eax)
> > -       movlpd  %xmm1, 6(%eax)
> > -       mov     STR3(%esp), %eax
> > -       RETURN
> > -
> > -       .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit15):
> > -       movb    %bh, 15(%eax)
> > -# endif
> > -L(Exit15):
> > -       movlpd  (%esi), %xmm0
> > -       movlpd  7(%esi), %xmm1
> > -       movlpd  %xmm0, (%eax)
> > -       movlpd  %xmm1, 7(%eax)
> > -       mov     STR3(%esp), %eax
> > -       RETURN
> > -
> > -       .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit16):
> > -       movb    %bh, 16(%eax)
> > -# endif
> > -L(Exit16):
> > -       movdqu  (%esi), %xmm0
> > -       movdqu  %xmm0, (%eax)
> > -       mov     STR3(%esp), %eax
> > -       RETURN
> > -
> > -       .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit17):
> > -       movb    %bh, 17(%eax)
> > -# endif
> > -L(Exit17):
> > -       movdqu  (%esi), %xmm0
> > -# ifdef USE_AS_STRNCAT
> > -       movb    16(%esi), %dh
> > -# endif
> > -       movdqu  %xmm0, (%eax)
> > -       movb    %dh, 16(%eax)
> > -       mov     STR3(%esp), %eax
> > -       RETURN
> > -
> > -       .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit18):
> > -       movb    %bh, 18(%eax)
> > -# endif
> > -L(Exit18):
> > -       movdqu  (%esi), %xmm0
> > -       movw    16(%esi), %cx
> > -       movdqu  %xmm0, (%eax)
> > -       movw    %cx, 16(%eax)
> > -       mov     STR3(%esp), %eax
> > -       RETURN
> > -
> > -       .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit19):
> > -       movb    %bh, 19(%eax)
> > -# endif
> > -L(Exit19):
> > -       movdqu  (%esi), %xmm0
> > -       movl    15(%esi), %ecx
> > -       movdqu  %xmm0, (%eax)
> > -       movl    %ecx, 15(%eax)
> > -       mov     STR3(%esp), %eax
> > -       RETURN
> > -
> > -       .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit20):
> > -       movb    %bh, 20(%eax)
> > -# endif
> > -L(Exit20):
> > -       movdqu  (%esi), %xmm0
> > -       movl    16(%esi), %ecx
> > -       movdqu  %xmm0, (%eax)
> > -       movl    %ecx, 16(%eax)
> > -       mov     STR3(%esp), %eax
> > -       RETURN
> > -
> > -       .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit21):
> > -       movb    %bh, 21(%eax)
> > -# endif
> > -L(Exit21):
> > -       movdqu  (%esi), %xmm0
> > -       movl    16(%esi), %ecx
> > -# ifdef USE_AS_STRNCAT
> > -       movb    20(%esi), %dh
> > -# endif
> > -       movdqu  %xmm0, (%eax)
> > -       movl    %ecx, 16(%eax)
> > -       movb    %dh, 20(%eax)
> > -       mov     STR3(%esp), %eax
> > -       RETURN
> > -
> > -       .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit22):
> > -       movb    %bh, 22(%eax)
> > -# endif
> > -L(Exit22):
> > -       movdqu  (%esi), %xmm0
> > -       movlpd  14(%esi), %xmm3
> > -       movdqu  %xmm0, (%eax)
> > -       movlpd  %xmm3, 14(%eax)
> > -       mov     STR3(%esp), %eax
> > -       RETURN
> > -
> > -       .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit23):
> > -       movb    %bh, 23(%eax)
> > -# endif
> > -L(Exit23):
> > -       movdqu  (%esi), %xmm0
> > -       movlpd  15(%esi), %xmm3
> > -       movdqu  %xmm0, (%eax)
> > -       movlpd  %xmm3, 15(%eax)
> > -       mov     STR3(%esp), %eax
> > -       RETURN
> > -
> > -       .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit24):
> > -       movb    %bh, 24(%eax)
> > -# endif
> > -L(Exit24):
> > -       movdqu  (%esi), %xmm0
> > -       movlpd  16(%esi), %xmm2
> > -       movdqu  %xmm0, (%eax)
> > -       movlpd  %xmm2, 16(%eax)
> > -       mov     STR3(%esp), %eax
> > -       RETURN
> > -
> > -       .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit25):
> > -       movb    %bh, 25(%eax)
> > -# endif
> > -L(Exit25):
> > -       movdqu  (%esi), %xmm0
> > -       movlpd  16(%esi), %xmm2
> > -# ifdef USE_AS_STRNCAT
> > -       movb    24(%esi), %dh
> > -# endif
> > -       movdqu  %xmm0, (%eax)
> > -       movlpd  %xmm2, 16(%eax)
> > -       movb    %dh, 24(%eax)
> > -       mov     STR3(%esp), %eax
> > -       RETURN
> > -
> > -       .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit26):
> > -       movb    %bh, 26(%eax)
> > -# endif
> > -L(Exit26):
> > -       movdqu  (%esi), %xmm0
> > -       movlpd  16(%esi), %xmm2
> > -       movw    24(%esi), %cx
> > -       movdqu  %xmm0, (%eax)
> > -       movlpd  %xmm2, 16(%eax)
> > -       movw    %cx, 24(%eax)
> > -       mov     STR3(%esp), %eax
> > -       RETURN
> > -
> > -       .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit27):
> > -       movb    %bh, 27(%eax)
> > -# endif
> > -L(Exit27):
> > -       movdqu  (%esi), %xmm0
> > -       movlpd  16(%esi), %xmm2
> > -       movl    23(%esi), %ecx
> > -       movdqu  %xmm0, (%eax)
> > -       movlpd  %xmm2, 16(%eax)
> > -       movl    %ecx, 23(%eax)
> > -       mov     STR3(%esp), %eax
> > -       RETURN
> > -
> > -       .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit28):
> > -       movb    %bh, 28(%eax)
> > -# endif
> > -L(Exit28):
> > -       movdqu  (%esi), %xmm0
> > -       movlpd  16(%esi), %xmm2
> > -       movl    24(%esi), %ecx
> > -       movdqu  %xmm0, (%eax)
> > -       movlpd  %xmm2, 16(%eax)
> > -       movl    %ecx, 24(%eax)
> > -       mov     STR3(%esp), %eax
> > -       RETURN
> > -
> > -       .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit29):
> > -       movb    %bh, 29(%eax)
> > -# endif
> > -L(Exit29):
> > -       movdqu  (%esi), %xmm0
> > -       movdqu  13(%esi), %xmm2
> > -       movdqu  %xmm0, (%eax)
> > -       movdqu  %xmm2, 13(%eax)
> > -       mov     STR3(%esp), %eax
> > -       RETURN
> > -
> > -       .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit30):
> > -       movb    %bh, 30(%eax)
> > -# endif
> > -L(Exit30):
> > -       movdqu  (%esi), %xmm0
> > -       movdqu  14(%esi), %xmm2
> > -       movdqu  %xmm0, (%eax)
> > -       movdqu  %xmm2, 14(%eax)
> > -       mov     STR3(%esp), %eax
> > -       RETURN
> > -
> > -       .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit31):
> > -       movb    %bh, 31(%eax)
> > -# endif
> > -L(Exit31):
> > -       movdqu  (%esi), %xmm0
> > -       movdqu  15(%esi), %xmm2
> > -       movdqu  %xmm0, (%eax)
> > -       movdqu  %xmm2, 15(%eax)
> > -       mov     STR3(%esp), %eax
> > -       RETURN
> > -
> > -       .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit32):
> > -       movb    %bh, 32(%eax)
> > -# endif
> > -L(Exit32):
> > -       movdqu  (%esi), %xmm0
> > -       movdqu  16(%esi), %xmm2
> > -       movdqu  %xmm0, (%eax)
> > -       movdqu  %xmm2, 16(%eax)
> > -       mov     STR3(%esp), %eax
> > -       RETURN
> > -
> > -# ifdef USE_AS_STRNCAT
> > -
> > -       .p2align 4
> > -L(UnalignedLeaveCase2OrCase3):
> > -       test    %edx, %edx
> > -       jnz     L(Unaligned64LeaveCase2)
> > -L(Unaligned64LeaveCase3):
> > -       lea     64(%ebx), %ecx
> > -       and     $-16, %ecx
> > -       add     $48, %ebx
> > -       jl      L(CopyFrom1To16BytesCase3)
> > -       movdqu  %xmm4, (%eax)
> > -       sub     $16, %ebx
> > -       jb      L(CopyFrom1To16BytesCase3)
> > -       movdqu  %xmm5, 16(%eax)
> > -       sub     $16, %ebx
> > -       jb      L(CopyFrom1To16BytesCase3)
> > -       movdqu  %xmm6, 32(%eax)
> > -       sub     $16, %ebx
> > -       jb      L(CopyFrom1To16BytesCase3)
> > -       movdqu  %xmm7, 48(%eax)
> > -       xor     %bh, %bh
> > -       movb    %bh, 64(%eax)
> > -       mov     STR3(%esp), %eax
> > -       RETURN
> > -
> > -       .p2align 4
> > -L(Unaligned64LeaveCase2):
> > -       xor     %ecx, %ecx
> > -       pcmpeqb %xmm4, %xmm0
> > -       pmovmskb %xmm0, %edx
> > -       add     $48, %ebx
> > -       jle     L(CopyFrom1To16BytesCase2OrCase3)
> > -       test    %edx, %edx
> > -       jnz     L(CopyFrom1To16Bytes)
> > -
> > -       pcmpeqb %xmm5, %xmm0
> > -       pmovmskb %xmm0, %edx
> > -       movdqu  %xmm4, (%eax)
> > -       add     $16, %ecx
> > -       sub     $16, %ebx
> > -       jbe     L(CopyFrom1To16BytesCase2OrCase3)
> > -       test    %edx, %edx
> > -       jnz     L(CopyFrom1To16Bytes)
> > -
> > -       pcmpeqb %xmm6, %xmm0
> > -       pmovmskb %xmm0, %edx
> > -       movdqu  %xmm5, 16(%eax)
> > -       add     $16, %ecx
> > -       sub     $16, %ebx
> > -       jbe     L(CopyFrom1To16BytesCase2OrCase3)
> > -       test    %edx, %edx
> > -       jnz     L(CopyFrom1To16Bytes)
> > -
> > -       pcmpeqb %xmm7, %xmm0
> > -       pmovmskb %xmm0, %edx
> > -       movdqu  %xmm6, 32(%eax)
> > -       lea     16(%eax, %ecx), %eax
> > -       lea     16(%esi, %ecx), %esi
> > -       bsf     %edx, %edx
> > -       cmp     %ebx, %edx
> > -       jb      L(CopyFrom1To16BytesExit)
> > -       BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
> > -# endif
> > -       .p2align 4
> > -L(ExitZero):
> > -       RETURN
> > -
> > -END (STRCAT)
> > -
> > -       .p2align 4
> > -       .section .rodata
> > -L(ExitTable):
> > -       .int    JMPTBL(L(Exit1), L(ExitTable))
> > -       .int    JMPTBL(L(Exit2), L(ExitTable))
> > -       .int    JMPTBL(L(Exit3), L(ExitTable))
> > -       .int    JMPTBL(L(Exit4), L(ExitTable))
> > -       .int    JMPTBL(L(Exit5), L(ExitTable))
> > -       .int    JMPTBL(L(Exit6), L(ExitTable))
> > -       .int    JMPTBL(L(Exit7), L(ExitTable))
> > -       .int    JMPTBL(L(Exit8), L(ExitTable))
> > -       .int    JMPTBL(L(Exit9), L(ExitTable))
> > -       .int    JMPTBL(L(Exit10), L(ExitTable))
> > -       .int    JMPTBL(L(Exit11), L(ExitTable))
> > -       .int    JMPTBL(L(Exit12), L(ExitTable))
> > -       .int    JMPTBL(L(Exit13), L(ExitTable))
> > -       .int    JMPTBL(L(Exit14), L(ExitTable))
> > -       .int    JMPTBL(L(Exit15), L(ExitTable))
> > -       .int    JMPTBL(L(Exit16), L(ExitTable))
> > -       .int    JMPTBL(L(Exit17), L(ExitTable))
> > -       .int    JMPTBL(L(Exit18), L(ExitTable))
> > -       .int    JMPTBL(L(Exit19), L(ExitTable))
> > -       .int    JMPTBL(L(Exit20), L(ExitTable))
> > -       .int    JMPTBL(L(Exit21), L(ExitTable))
> > -       .int    JMPTBL(L(Exit22), L(ExitTable))
> > -       .int    JMPTBL(L(Exit23), L(ExitTable))
> > -       .int    JMPTBL(L(Exit24), L(ExitTable))
> > -       .int    JMPTBL(L(Exit25), L(ExitTable))
> > -       .int    JMPTBL(L(Exit26), L(ExitTable))
> > -       .int    JMPTBL(L(Exit27), L(ExitTable))
> > -       .int    JMPTBL(L(Exit28), L(ExitTable))
> > -       .int    JMPTBL(L(Exit29), L(ExitTable))
> > -       .int    JMPTBL(L(Exit30), L(ExitTable))
> > -       .int    JMPTBL(L(Exit31), L(ExitTable))
> > -       .int    JMPTBL(L(Exit32), L(ExitTable))
> > -# ifdef USE_AS_STRNCAT
> > -L(ExitStrncatTable):
> > -       .int    JMPTBL(L(StrncatExit0), L(ExitStrncatTable))
> > -       .int    JMPTBL(L(StrncatExit1), L(ExitStrncatTable))
> > -       .int    JMPTBL(L(StrncatExit2), L(ExitStrncatTable))
> > -       .int    JMPTBL(L(StrncatExit3), L(ExitStrncatTable))
> > -       .int    JMPTBL(L(StrncatExit4), L(ExitStrncatTable))
> > -       .int    JMPTBL(L(StrncatExit5), L(ExitStrncatTable))
> > -       .int    JMPTBL(L(StrncatExit6), L(ExitStrncatTable))
> > -       .int    JMPTBL(L(StrncatExit7), L(ExitStrncatTable))
> > -       .int    JMPTBL(L(StrncatExit8), L(ExitStrncatTable))
> > -       .int    JMPTBL(L(StrncatExit9), L(ExitStrncatTable))
> > -       .int    JMPTBL(L(StrncatExit10), L(ExitStrncatTable))
> > -       .int    JMPTBL(L(StrncatExit11), L(ExitStrncatTable))
> > -       .int    JMPTBL(L(StrncatExit12), L(ExitStrncatTable))
> > -       .int    JMPTBL(L(StrncatExit13), L(ExitStrncatTable))
> > -       .int    JMPTBL(L(StrncatExit14), L(ExitStrncatTable))
> > -       .int    JMPTBL(L(StrncatExit15), L(ExitStrncatTable))
> > -       .int    JMPTBL(L(StrncatExit16), L(ExitStrncatTable))
> > -       .int    JMPTBL(L(StrncatExit17), L(ExitStrncatTable))
> > -       .int    JMPTBL(L(StrncatExit18), L(ExitStrncatTable))
> > -       .int    JMPTBL(L(StrncatExit19), L(ExitStrncatTable))
> > -       .int    JMPTBL(L(StrncatExit20), L(ExitStrncatTable))
> > -       .int    JMPTBL(L(StrncatExit21), L(ExitStrncatTable))
> > -       .int    JMPTBL(L(StrncatExit22), L(ExitStrncatTable))
> > -       .int    JMPTBL(L(StrncatExit23), L(ExitStrncatTable))
> > -       .int    JMPTBL(L(StrncatExit24), L(ExitStrncatTable))
> > -       .int    JMPTBL(L(StrncatExit25), L(ExitStrncatTable))
> > -       .int    JMPTBL(L(StrncatExit26), L(ExitStrncatTable))
> > -       .int    JMPTBL(L(StrncatExit27), L(ExitStrncatTable))
> > -       .int    JMPTBL(L(StrncatExit28), L(ExitStrncatTable))
> > -       .int    JMPTBL(L(StrncatExit29), L(ExitStrncatTable))
> > -       .int    JMPTBL(L(StrncatExit30), L(ExitStrncatTable))
> > -       .int    JMPTBL(L(StrncatExit31), L(ExitStrncatTable))
> > -       .int    JMPTBL(L(StrncatExit32), L(ExitStrncatTable))
> > -# endif
> > -#endif
> > diff --git a/sysdeps/i386/i686/multiarch/strcat-ssse3.S b/sysdeps/i386/i686/multiarch/strcat-ssse3.S
> > deleted file mode 100644
> > index 72bc49c..0000000
> > --- a/sysdeps/i386/i686/multiarch/strcat-ssse3.S
> > +++ /dev/null
> > @@ -1,572 +0,0 @@
> > -/* strcat with SSSE3
> > -   Copyright (C) 2011 Free Software Foundation, Inc.
> > -   Contributed by Intel Corporation.
> > -   This file is part of the GNU C Library.
> > -
> > -   The GNU C Library is free software; you can redistribute it and/or
> > -   modify it under the terms of the GNU Lesser General Public
> > -   License as published by the Free Software Foundation; either
> > -   version 2.1 of the License, or (at your option) any later version.
> > -
> > -   The GNU C Library is distributed in the hope that it will be useful,
> > -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > -       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > -   Lesser General Public License for more details.
> > -
> > -   You should have received a copy of the GNU Lesser General Public
> > -   License along with the GNU C Library; if not, see
> > -   <http://www.gnu.org/licenses/>.  */
> > -
> > -
> > -#ifndef NOT_IN_libc
> > -
> > -# include <sysdep.h>
> > -
> > -# define CFI_PUSH(REG) \
> > -       cfi_adjust_cfa_offset (4);      \
> > -       cfi_rel_offset (REG, 0)
> > -
> > -# define CFI_POP(REG)  \
> > -       cfi_adjust_cfa_offset (-4);     \
> > -       cfi_restore (REG)
> > -
> > -# define PUSH(REG) pushl REG; CFI_PUSH (REG)
> > -# define POP(REG) popl REG; CFI_POP (REG)
> > -
> > -# ifndef STRCAT
> > -#  define STRCAT  __strcat_ssse3
> > -# endif
> > -
> > -# define PARMS  4
> > -# define STR1  PARMS+4
> > -# define STR2  STR1+4
> > -
> > -# ifdef USE_AS_STRNCAT
> > -#  define LEN STR2+8
> > -# endif
> > -
> > -# define USE_AS_STRCAT
> > -
> > -.text
> > -ENTRY (STRCAT)
> > -       PUSH    (%edi)
> > -       mov     STR1(%esp), %edi
> > -       mov     %edi, %edx
> > -
> > -# define RETURN  jmp L(StartStrcpyPart)
> > -# include "strlen-sse2.S"
> > -
> > -L(StartStrcpyPart):
> > -       mov     STR2(%esp), %ecx
> > -       lea     (%edi, %eax), %edx
> > -# ifdef USE_AS_STRNCAT
> > -       PUSH    (%ebx)
> > -       mov     LEN(%esp), %ebx
> > -       test    %ebx, %ebx
> > -       jz      L(StrncatExit0)
> > -       cmp     $8, %ebx
> > -       jbe     L(StrncatExit8Bytes)
> > -# endif
> > -       cmpb    $0, (%ecx)
> > -       jz      L(Exit1)
> > -       cmpb    $0, 1(%ecx)
> > -       jz      L(Exit2)
> > -       cmpb    $0, 2(%ecx)
> > -       jz      L(Exit3)
> > -       cmpb    $0, 3(%ecx)
> > -       jz      L(Exit4)
> > -       cmpb    $0, 4(%ecx)
> > -       jz      L(Exit5)
> > -       cmpb    $0, 5(%ecx)
> > -       jz      L(Exit6)
> > -       cmpb    $0, 6(%ecx)
> > -       jz      L(Exit7)
> > -       cmpb    $0, 7(%ecx)
> > -       jz      L(Exit8)
> > -       cmpb    $0, 8(%ecx)
> > -       jz      L(Exit9)
> > -# ifdef USE_AS_STRNCAT
> > -       cmp     $16, %ebx
> > -       jb      L(StrncatExit15Bytes)
> > -# endif
> > -       cmpb    $0, 9(%ecx)
> > -       jz      L(Exit10)
> > -       cmpb    $0, 10(%ecx)
> > -       jz      L(Exit11)
> > -       cmpb    $0, 11(%ecx)
> > -       jz      L(Exit12)
> > -       cmpb    $0, 12(%ecx)
> > -       jz      L(Exit13)
> > -       cmpb    $0, 13(%ecx)
> > -       jz      L(Exit14)
> > -       cmpb    $0, 14(%ecx)
> > -       jz      L(Exit15)
> > -       cmpb    $0, 15(%ecx)
> > -       jz      L(Exit16)
> > -# ifdef USE_AS_STRNCAT
> > -       cmp     $16, %ebx
> > -       je      L(StrncatExit16)
> > -
> > -#  define RETURN1      \
> > -       POP     (%ebx); \
> > -       POP     (%edi); \
> > -       ret;    \
> > -       CFI_PUSH        (%ebx); \
> > -       CFI_PUSH        (%edi)
> > -#  define USE_AS_STRNCPY
> > -# else
> > -#  define RETURN1  POP (%edi); ret; CFI_PUSH (%edi)
> > -# endif
> > -# include "strcpy-ssse3.S"
> > -       .p2align 4
> > -L(CopyFrom1To16Bytes):
> > -       add     %esi, %edx
> > -       add     %esi, %ecx
> > -
> > -       POP     (%esi)
> > -       test    %al, %al
> > -       jz      L(ExitHigh)
> > -       test    $0x01, %al
> > -       jnz     L(Exit1)
> > -       test    $0x02, %al
> > -       jnz     L(Exit2)
> > -       test    $0x04, %al
> > -       jnz     L(Exit3)
> > -       test    $0x08, %al
> > -       jnz     L(Exit4)
> > -       test    $0x10, %al
> > -       jnz     L(Exit5)
> > -       test    $0x20, %al
> > -       jnz     L(Exit6)
> > -       test    $0x40, %al
> > -       jnz     L(Exit7)
> > -       movlpd  (%ecx), %xmm0
> > -       movlpd  %xmm0, (%edx)
> > -       movl    %edi, %eax
> > -       RETURN1
> > -
> > -       .p2align 4
> > -L(ExitHigh):
> > -       test    $0x01, %ah
> > -       jnz     L(Exit9)
> > -       test    $0x02, %ah
> > -       jnz     L(Exit10)
> > -       test    $0x04, %ah
> > -       jnz     L(Exit11)
> > -       test    $0x08, %ah
> > -       jnz     L(Exit12)
> > -       test    $0x10, %ah
> > -       jnz     L(Exit13)
> > -       test    $0x20, %ah
> > -       jnz     L(Exit14)
> > -       test    $0x40, %ah
> > -       jnz     L(Exit15)
> > -       movlpd  (%ecx), %xmm0
> > -       movlpd  8(%ecx), %xmm1
> > -       movlpd  %xmm0, (%edx)
> > -       movlpd  %xmm1, 8(%edx)
> > -       movl    %edi, %eax
> > -       RETURN1
> > -
> > -       .p2align 4
> > -L(StrncatExit1):
> > -       movb    %bh, 1(%edx)
> > -L(Exit1):
> > -       movb    (%ecx), %al
> > -       movb    %al, (%edx)
> > -       movl    %edi, %eax
> > -       RETURN1
> > -
> > -       .p2align 4
> > -L(StrncatExit2):
> > -       movb    %bh, 2(%edx)
> > -L(Exit2):
> > -       movw    (%ecx), %ax
> > -       movw    %ax, (%edx)
> > -       movl    %edi, %eax
> > -       RETURN1
> > -
> > -       .p2align 4
> > -L(StrncatExit3):
> > -       movb    %bh, 3(%edx)
> > -L(Exit3):
> > -       movw    (%ecx), %ax
> > -       movw    %ax, (%edx)
> > -       movb    2(%ecx), %al
> > -       movb    %al, 2(%edx)
> > -       movl    %edi, %eax
> > -       RETURN1
> > -
> > -       .p2align 4
> > -L(StrncatExit4):
> > -       movb    %bh, 4(%edx)
> > -L(Exit4):
> > -       movl    (%ecx), %eax
> > -       movl    %eax, (%edx)
> > -       movl    %edi, %eax
> > -       RETURN1
> > -
> > -       .p2align 4
> > -L(StrncatExit5):
> > -       movb    %bh, 5(%edx)
> > -L(Exit5):
> > -       movl    (%ecx), %eax
> > -       movl    %eax, (%edx)
> > -       movb    4(%ecx), %al
> > -       movb    %al, 4(%edx)
> > -       movl    %edi, %eax
> > -       RETURN1
> > -
> > -       .p2align 4
> > -L(StrncatExit6):
> > -       movb    %bh, 6(%edx)
> > -L(Exit6):
> > -       movl    (%ecx), %eax
> > -       movl    %eax, (%edx)
> > -       movw    4(%ecx), %ax
> > -       movw    %ax, 4(%edx)
> > -       movl    %edi, %eax
> > -       RETURN1
> > -
> > -       .p2align 4
> > -L(StrncatExit7):
> > -       movb    %bh, 7(%edx)
> > -L(Exit7):
> > -       movl    (%ecx), %eax
> > -       movl    %eax, (%edx)
> > -       movl    3(%ecx), %eax
> > -       movl    %eax, 3(%edx)
> > -       movl    %edi, %eax
> > -       RETURN1
> > -
> > -       .p2align 4
> > -L(StrncatExit8):
> > -       movb    %bh, 8(%edx)
> > -L(Exit8):
> > -       movlpd  (%ecx), %xmm0
> > -       movlpd  %xmm0, (%edx)
> > -       movl    %edi, %eax
> > -       RETURN1
> > -
> > -       .p2align 4
> > -L(StrncatExit9):
> > -       movb    %bh, 9(%edx)
> > -L(Exit9):
> > -       movlpd  (%ecx), %xmm0
> > -       movlpd  %xmm0, (%edx)
> > -       movb    8(%ecx), %al
> > -       movb    %al, 8(%edx)
> > -       movl    %edi, %eax
> > -       RETURN1
> > -
> > -       .p2align 4
> > -L(StrncatExit10):
> > -       movb    %bh, 10(%edx)
> > -L(Exit10):
> > -       movlpd  (%ecx), %xmm0
> > -       movlpd  %xmm0, (%edx)
> > -       movw    8(%ecx), %ax
> > -       movw    %ax, 8(%edx)
> > -       movl    %edi, %eax
> > -       RETURN1
> > -
> > -       .p2align 4
> > -L(StrncatExit11):
> > -       movb    %bh, 11(%edx)
> > -L(Exit11):
> > -       movlpd  (%ecx), %xmm0
> > -       movlpd  %xmm0, (%edx)
> > -       movl    7(%ecx), %eax
> > -       movl    %eax, 7(%edx)
> > -       movl    %edi, %eax
> > -       RETURN1
> > -
> > -       .p2align 4
> > -L(StrncatExit12):
> > -       movb    %bh, 12(%edx)
> > -L(Exit12):
> > -       movlpd  (%ecx), %xmm0
> > -       movlpd  %xmm0, (%edx)
> > -       movl    8(%ecx), %eax
> > -       movl    %eax, 8(%edx)
> > -       movl    %edi, %eax
> > -       RETURN1
> > -
> > -       .p2align 4
> > -L(StrncatExit13):
> > -       movb    %bh, 13(%edx)
> > -L(Exit13):
> > -       movlpd  (%ecx), %xmm0
> > -       movlpd  %xmm0, (%edx)
> > -       movlpd  5(%ecx), %xmm0
> > -       movlpd  %xmm0, 5(%edx)
> > -       movl    %edi, %eax
> > -       RETURN1
> > -
> > -       .p2align 4
> > -L(StrncatExit14):
> > -       movb    %bh, 14(%edx)
> > -L(Exit14):
> > -       movlpd  (%ecx), %xmm0
> > -       movlpd  %xmm0, (%edx)
> > -       movlpd  6(%ecx), %xmm0
> > -       movlpd  %xmm0, 6(%edx)
> > -       movl    %edi, %eax
> > -       RETURN1
> > -
> > -       .p2align 4
> > -L(StrncatExit15):
> > -       movb    %bh, 15(%edx)
> > -L(Exit15):
> > -       movlpd  (%ecx), %xmm0
> > -       movlpd  %xmm0, (%edx)
> > -       movlpd  7(%ecx), %xmm0
> > -       movlpd  %xmm0, 7(%edx)
> > -       movl    %edi, %eax
> > -       RETURN1
> > -
> > -       .p2align 4
> > -L(StrncatExit16):
> > -       movb    %bh, 16(%edx)
> > -L(Exit16):
> > -       movlpd  (%ecx), %xmm0
> > -       movlpd  8(%ecx), %xmm1
> > -       movlpd  %xmm0, (%edx)
> > -       movlpd  %xmm1, 8(%edx)
> > -       movl    %edi, %eax
> > -       RETURN1
> > -
> > -# ifdef USE_AS_STRNCPY
> > -
> > -       CFI_PUSH(%esi)
> > -
> > -       .p2align 4
> > -L(CopyFrom1To16BytesCase2):
> > -       add     $16, %ebx
> > -       add     %esi, %ecx
> > -       lea     (%esi, %edx), %esi
> > -       lea     -9(%ebx), %edx
> > -       and     $1<<7, %dh
> > -       or      %al, %dh
> > -       test    %dh, %dh
> > -       lea     (%esi), %edx
> > -       POP     (%esi)
> > -       jz      L(ExitHighCase2)
> > -
> > -       test    $0x01, %al
> > -       jnz     L(Exit1)
> > -       cmp     $1, %ebx
> > -       je      L(StrncatExit1)
> > -       test    $0x02, %al
> > -       jnz     L(Exit2)
> > -       cmp     $2, %ebx
> > -       je      L(StrncatExit2)
> > -       test    $0x04, %al
> > -       jnz     L(Exit3)
> > -       cmp     $3, %ebx
> > -       je      L(StrncatExit3)
> > -       test    $0x08, %al
> > -       jnz     L(Exit4)
> > -       cmp     $4, %ebx
> > -       je      L(StrncatExit4)
> > -       test    $0x10, %al
> > -       jnz     L(Exit5)
> > -       cmp     $5, %ebx
> > -       je      L(StrncatExit5)
> > -       test    $0x20, %al
> > -       jnz     L(Exit6)
> > -       cmp     $6, %ebx
> > -       je      L(StrncatExit6)
> > -       test    $0x40, %al
> > -       jnz     L(Exit7)
> > -       cmp     $7, %ebx
> > -       je      L(StrncatExit7)
> > -       movlpd  (%ecx), %xmm0
> > -       movlpd  %xmm0, (%edx)
> > -       lea     7(%edx), %eax
> > -       cmpb    $1, (%eax)
> > -       sbb     $-1, %eax
> > -       xor     %cl, %cl
> > -       movb    %cl, (%eax)
> > -       movl    %edi, %eax
> > -       RETURN1
> > -
> > -       .p2align 4
> > -L(ExitHighCase2):
> > -       test    $0x01, %ah
> > -       jnz     L(Exit9)
> > -       cmp     $9, %ebx
> > -       je      L(StrncatExit9)
> > -       test    $0x02, %ah
> > -       jnz     L(Exit10)
> > -       cmp     $10, %ebx
> > -       je      L(StrncatExit10)
> > -       test    $0x04, %ah
> > -       jnz     L(Exit11)
> > -       cmp     $11, %ebx
> > -       je      L(StrncatExit11)
> > -       test    $0x8, %ah
> > -       jnz     L(Exit12)
> > -       cmp     $12, %ebx
> > -       je      L(StrncatExit12)
> > -       test    $0x10, %ah
> > -       jnz     L(Exit13)
> > -       cmp     $13, %ebx
> > -       je      L(StrncatExit13)
> > -       test    $0x20, %ah
> > -       jnz     L(Exit14)
> > -       cmp     $14, %ebx
> > -       je      L(StrncatExit14)
> > -       test    $0x40, %ah
> > -       jnz     L(Exit15)
> > -       cmp     $15, %ebx
> > -       je      L(StrncatExit15)
> > -       movlpd  (%ecx), %xmm0
> > -       movlpd  %xmm0, (%edx)
> > -       movlpd  8(%ecx), %xmm1
> > -       movlpd  %xmm1, 8(%edx)
> > -       movl    %edi, %eax
> > -       RETURN1
> > -
> > -       CFI_PUSH(%esi)
> > -
> > -L(CopyFrom1To16BytesCase2OrCase3):
> > -       test    %eax, %eax
> > -       jnz     L(CopyFrom1To16BytesCase2)
> > -
> > -       .p2align 4
> > -L(CopyFrom1To16BytesCase3):
> > -       add     $16, %ebx
> > -       add     %esi, %edx
> > -       add     %esi, %ecx
> > -
> > -       POP     (%esi)
> > -
> > -       cmp     $8, %ebx
> > -       ja      L(ExitHighCase3)
> > -       cmp     $1, %ebx
> > -       je      L(StrncatExit1)
> > -       cmp     $2, %ebx
> > -       je      L(StrncatExit2)
> > -       cmp     $3, %ebx
> > -       je      L(StrncatExit3)
> > -       cmp     $4, %ebx
> > -       je      L(StrncatExit4)
> > -       cmp     $5, %ebx
> > -       je      L(StrncatExit5)
> > -       cmp     $6, %ebx
> > -       je      L(StrncatExit6)
> > -       cmp     $7, %ebx
> > -       je      L(StrncatExit7)
> > -       movlpd  (%ecx), %xmm0
> > -       movlpd  %xmm0, (%edx)
> > -       movb    %bh, 8(%edx)
> > -       movl    %edi, %eax
> > -       RETURN1
> > -
> > -       .p2align 4
> > -L(ExitHighCase3):
> > -       cmp     $9, %ebx
> > -       je      L(StrncatExit9)
> > -       cmp     $10, %ebx
> > -       je      L(StrncatExit10)
> > -       cmp     $11, %ebx
> > -       je      L(StrncatExit11)
> > -       cmp     $12, %ebx
> > -       je      L(StrncatExit12)
> > -       cmp     $13, %ebx
> > -       je      L(StrncatExit13)
> > -       cmp     $14, %ebx
> > -       je      L(StrncatExit14)
> > -       cmp     $15, %ebx
> > -       je      L(StrncatExit15)
> > -       movlpd  (%ecx), %xmm0
> > -       movlpd  %xmm0, (%edx)
> > -       movlpd  8(%ecx), %xmm1
> > -       movlpd  %xmm1, 8(%edx)
> > -       movb    %bh, 16(%edx)
> > -       movl    %edi, %eax
> > -       RETURN1
> > -
> > -       .p2align 4
> > -L(StrncatExit0):
> > -       movl    %edi, %eax
> > -       RETURN1
> > -
> > -       .p2align 4
> > -L(StrncatExit15Bytes):
> > -       cmp     $9, %ebx
> > -       je      L(StrncatExit9)
> > -       cmpb    $0, 9(%ecx)
> > -       jz      L(Exit10)
> > -       cmp     $10, %ebx
> > -       je      L(StrncatExit10)
> > -       cmpb    $0, 10(%ecx)
> > -       jz      L(Exit11)
> > -       cmp     $11, %ebx
> > -       je      L(StrncatExit11)
> > -       cmpb    $0, 11(%ecx)
> > -       jz      L(Exit12)
> > -       cmp     $12, %ebx
> > -       je      L(StrncatExit12)
> > -       cmpb    $0, 12(%ecx)
> > -       jz      L(Exit13)
> > -       cmp     $13, %ebx
> > -       je      L(StrncatExit13)
> > -       cmpb    $0, 13(%ecx)
> > -       jz      L(Exit14)
> > -       cmp     $14, %ebx
> > -       je      L(StrncatExit14)
> > -       movlpd  (%ecx), %xmm0
> > -       movlpd  %xmm0, (%edx)
> > -       movlpd  7(%ecx), %xmm0
> > -       movlpd  %xmm0, 7(%edx)
> > -       lea     14(%edx), %eax
> > -       cmpb    $1, (%eax)
> > -       sbb     $-1, %eax
> > -       movb    %bh, (%eax)
> > -       movl    %edi, %eax
> > -       RETURN1
> > -
> > -       .p2align 4
> > -L(StrncatExit8Bytes):
> > -       cmpb    $0, (%ecx)
> > -       jz      L(Exit1)
> > -       cmp     $1, %ebx
> > -       je      L(StrncatExit1)
> > -       cmpb    $0, 1(%ecx)
> > -       jz      L(Exit2)
> > -       cmp     $2, %ebx
> > -       je      L(StrncatExit2)
> > -       cmpb    $0, 2(%ecx)
> > -       jz      L(Exit3)
> > -       cmp     $3, %ebx
> > -       je      L(StrncatExit3)
> > -       cmpb    $0, 3(%ecx)
> > -       jz      L(Exit4)
> > -       cmp     $4, %ebx
> > -       je      L(StrncatExit4)
> > -       cmpb    $0, 4(%ecx)
> > -       jz      L(Exit5)
> > -       cmp     $5, %ebx
> > -       je      L(StrncatExit5)
> > -       cmpb    $0, 5(%ecx)
> > -       jz      L(Exit6)
> > -       cmp     $6, %ebx
> > -       je      L(StrncatExit6)
> > -       cmpb    $0, 6(%ecx)
> > -       jz      L(Exit7)
> > -       cmp     $7, %ebx
> > -       je      L(StrncatExit7)
> > -       movlpd  (%ecx), %xmm0
> > -       movlpd  %xmm0, (%edx)
> > -       lea     7(%edx), %eax
> > -       cmpb    $1, (%eax)
> > -       sbb     $-1, %eax
> > -       movb    %bh, (%eax)
> > -       movl    %edi, %eax
> > -       RETURN1
> > -
> > -# endif
> > -END (STRCAT)
> > -#endif
> > diff --git a/sysdeps/i386/i686/multiarch/strcat.S b/sysdeps/i386/i686/multiarch/strcat.S
> > deleted file mode 100644
> > index e68feca..0000000
> > --- a/sysdeps/i386/i686/multiarch/strcat.S
> > +++ /dev/null
> > @@ -1,119 +0,0 @@
> > -/* Multiple versions of strcat
> > -   Copyright (C) 2011-2012 Free Software Foundation, Inc.
> > -   Contributed by Intel Corporation.
> > -   This file is part of the GNU C Library.
> > -
> > -   The GNU C Library is free software; you can redistribute it and/or
> > -   modify it under the terms of the GNU Lesser General Public
> > -   License as published by the Free Software Foundation; either
> > -   version 2.1 of the License, or (at your option) any later version.
> > -
> > -   The GNU C Library is distributed in the hope that it will be useful,
> > -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > -   Lesser General Public License for more details.
> > -
> > -   You should have received a copy of the GNU Lesser General Public
> > -   License along with the GNU C Library; if not, see
> > -   <http://www.gnu.org/licenses/>.  */
> > -
> > -#include <sysdep.h>
> > -#include <init-arch.h>
> > -
> > -#ifndef USE_AS_STRNCAT
> > -# ifndef STRCAT
> > -#  define STRCAT strcat
> > -# endif
> > -#endif
> > -
> > -#ifdef USE_AS_STRNCAT
> > -# define STRCAT_SSSE3  __strncat_ssse3
> > -# define STRCAT_SSE2           __strncat_sse2
> > -# define STRCAT_IA32           __strncat_ia32
> > -# define __GI_STRCAT           __GI_strncat
> > -#else
> > -# define STRCAT_SSSE3  __strcat_ssse3
> > -# define STRCAT_SSE2           __strcat_sse2
> > -# define STRCAT_IA32           __strcat_ia32
> > -# define __GI_STRCAT           __GI_strcat
> > -#endif
> > -
> > -
> > -/* Define multiple versions only for the definition in libc.  Don't
> > -   define multiple versions for strncat in static library since we
> > -   need strncat before the initialization happened.  */
> > -#ifndef NOT_IN_libc
> > -
> > -# ifdef SHARED
> > -       .text
> > -ENTRY(STRCAT)
> > -       .type   STRCAT, @gnu_indirect_function
> > -       pushl   %ebx
> > -       cfi_adjust_cfa_offset (4)
> > -       cfi_rel_offset (ebx, 0)
> > -       LOAD_PIC_REG(bx)
> > -       cmpl    $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
> > -       jne     1f
> > -       call    __init_cpu_features
> > -1:     leal    STRCAT_IA32@GOTOFF(%ebx), %eax
> > -       testl   $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
> > -       jz      2f
> > -       leal    STRCAT_SSE2@GOTOFF(%ebx), %eax
> > -       testl   $bit_Fast_Unaligned_Load, FEATURE_OFFSET+index_Fast_Unaligned_Load+__cpu_features@GOTOFF(%ebx)
> > -       jnz     2f
> > -       testl   $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
> > -       jz      2f
> > -       leal    STRCAT_SSSE3@GOTOFF(%ebx), %eax
> > -2:     popl    %ebx
> > -       cfi_adjust_cfa_offset (-4)
> > -       cfi_restore (ebx)
> > -       ret
> > -END(STRCAT)
> > -# else
> > -
> > -ENTRY(STRCAT)
> > -       .type   STRCAT, @gnu_indirect_function
> > -       cmpl    $0, KIND_OFFSET+__cpu_features
> > -       jne     1f
> > -       call    __init_cpu_features
> > -1:     leal    STRCAT_IA32, %eax
> > -       testl   $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features
> > -       jz      2f
> > -       leal    STRCAT_SSE2, %eax
> > -       testl   $bit_Fast_Unaligned_Load, FEATURE_OFFSET+index_Fast_Unaligned_Load+__cpu_features
> > -       jnz     2f
> > -       testl   $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features
> > -       jz      2f
> > -       leal    STRCAT_SSSE3, %eax
> > -2:     ret
> > -END(STRCAT)
> > -
> > -# endif
> > -
> > -# undef ENTRY
> > -# define ENTRY(name) \
> > -       .type STRCAT_IA32, @function; \
> > -       .align 16; \
> > -       STRCAT_IA32: cfi_startproc; \
> > -       CALL_MCOUNT
> > -# undef END
> > -# define END(name) \
> > -       cfi_endproc; .size STRCAT_IA32, .-STRCAT_IA32
> > -
> > -# ifdef SHARED
> > -#  undef libc_hidden_builtin_def
> > -/* It doesn't make sense to send libc-internal strcat calls through a PLT.
> > -   The speedup we get from using SSSE3 instruction is likely eaten away
> > -   by the indirect call in the PLT.  */
> > -#  define libc_hidden_builtin_def(name) \
> > -       .globl __GI_STRCAT; __GI_STRCAT = STRCAT_IA32
> > -#  undef libc_hidden_def
> > -#  define libc_hidden_def(name) \
> > -       .globl __GI___STRCAT; __GI___STRCAT = STRCAT_IA32
> > -
> > -# endif
> > -#endif
> > -
> > -#ifndef USE_AS_STRNCAT
> > -# include "../../i486/strcat.S"
> > -#endif
> > diff --git a/sysdeps/i386/i686/multiarch/strncat-c.c b/sysdeps/i386/i686/multiarch/strncat-c.c
> > deleted file mode 100644
> > index 132a000..0000000
> > --- a/sysdeps/i386/i686/multiarch/strncat-c.c
> > +++ /dev/null
> > @@ -1,8 +0,0 @@
> > -#define STRNCAT __strncat_ia32
> > -#ifdef SHARED
> > -#undef libc_hidden_def
> > -#define libc_hidden_def(name) \
> > -  __hidden_ver1 (__strncat_ia32, __GI___strncat, __strncat_ia32);
> > -#endif
> > -
> > -#include "string/strncat.c"
> > diff --git a/sysdeps/i386/i686/multiarch/strncat-sse2.S b/sysdeps/i386/i686/multiarch/strncat-sse2.S
> > deleted file mode 100644
> > index f1045b7..0000000
> > --- a/sysdeps/i386/i686/multiarch/strncat-sse2.S
> > +++ /dev/null
> > @@ -1,4 +0,0 @@
> > -#define STRCAT  __strncat_sse2
> > -#define USE_AS_STRNCAT
> > -
> > -#include "strcat-sse2.S"
> > diff --git a/sysdeps/i386/i686/multiarch/strncat-ssse3.S b/sysdeps/i386/i686/multiarch/strncat-ssse3.S
> > deleted file mode 100644
> > index 625b90a..0000000
> > --- a/sysdeps/i386/i686/multiarch/strncat-ssse3.S
> > +++ /dev/null
> > @@ -1,4 +0,0 @@
> > -#define STRCAT  __strncat_ssse3
> > -#define USE_AS_STRNCAT
> > -
> > -#include "strcat-ssse3.S"
> > diff --git a/sysdeps/i386/i686/multiarch/strncat.S b/sysdeps/i386/i686/multiarch/strncat.S
> > deleted file mode 100644
> > index fd569c2..0000000
> > --- a/sysdeps/i386/i686/multiarch/strncat.S
> > +++ /dev/null
> > @@ -1,3 +0,0 @@
> > -#define STRCAT strncat
> > -#define USE_AS_STRNCAT
> > -#include "strcat.S"
> > diff --git a/sysdeps/powerpc/strcat.c b/sysdeps/powerpc/strcat.c
> > deleted file mode 100644
> > index 28575d0..0000000
> > --- a/sysdeps/powerpc/strcat.c
> > +++ /dev/null
> > @@ -1,30 +0,0 @@
> > -/* strcat version that uses fast strcpy/strlen.
> > -   Copyright (C) 1997, 2003 Free Software Foundation, Inc.
> > -   This file is part of the GNU C Library.
> > -
> > -   The GNU C Library is free software; you can redistribute it and/or
> > -   modify it under the terms of the GNU Lesser General Public
> > -   License as published by the Free Software Foundation; either
> > -   version 2.1 of the License, or (at your option) any later version.
> > -
> > -   The GNU C Library is distributed in the hope that it will be useful,
> > -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > -   Lesser General Public License for more details.
> > -
> > -   You should have received a copy of the GNU Lesser General Public
> > -   License along with the GNU C Library; if not, see
> > -   <http://www.gnu.org/licenses/>.  */
> > -
> > -#include <string.h>
> > -
> > -#undef strcat
> > -
> > -/* Append SRC on the end of DEST.  */
> > -char *
> > -strcat (char *dest, const char *src)
> > -{
> > -  strcpy (dest + strlen (dest), src);
> > -  return dest;
> > -}
> > -libc_hidden_builtin_def (strcat)
> > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> > index 22f1435..ae94366 100644
> > --- a/sysdeps/x86_64/multiarch/Makefile
> > +++ b/sysdeps/x86_64/multiarch/Makefile
> > @@ -6,7 +6,7 @@ endif
> >
> >  ifeq ($(subdir),string)
> >
> > -sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
> > +sysdep_routines +=  stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
> >                    strend-sse4 memcmp-sse4 memcpy-ssse3 mempcpy-ssse3 \
> >                    memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
> >                    memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \
> > @@ -14,8 +14,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
> >                    strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
> >                    strcpy-sse2-unaligned strncpy-sse2-unaligned \
> >                    stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
> > -                  strcat-sse2-unaligned strncat-sse2-unaligned \
> > -                  strcat-ssse3 strncat-ssse3 strlen_atom strlen_avx \
> > +                  strlen_atom strlen_avx \
> >                    strnlen-sse2-no-bsf strrchr-sse2-no-bsf strchr-sse2-no-bsf \
> >                    memcmp-ssse3
> >  ifeq (yes,$(config-cflags-sse4))
> > diff --git a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
> > deleted file mode 100644
> > index 7811ab5..0000000
> > --- a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
> > +++ /dev/null
> > @@ -1,53 +0,0 @@
> > -/* strcat with SSE2
> > -   Copyright (C) 2011 Free Software Foundation, Inc.
> > -   Contributed by Intel Corporation.
> > -   This file is part of the GNU C Library.
> > -
> > -   The GNU C Library is free software; you can redistribute it and/or
> > -   modify it under the terms of the GNU Lesser General Public
> > -   License as published by the Free Software Foundation; either
> > -   version 2.1 of the License, or (at your option) any later version.
> > -
> > -   The GNU C Library is distributed in the hope that it will be useful,
> > -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > -   Lesser General Public License for more details.
> > -
> > -   You should have received a copy of the GNU Lesser General Public
> > -   License along with the GNU C Library; if not, see
> > -   <http://www.gnu.org/licenses/>.  */
> > -
> > -#ifndef NOT_IN_libc
> > -
> > -# include <sysdep.h>
> > -
> > -# ifndef STRCAT
> > -#  define STRCAT  __strcat_sse2_unaligned
> > -# endif
> > -
> > -# define USE_AS_STRCAT
> > -
> > -.text
> > -ENTRY (STRCAT)
> > -       mov     %rdi, %r9
> > -# ifdef USE_AS_STRNCAT
> > -       mov     %rdx, %r8
> > -# endif
> > -
> > -# define RETURN  jmp L(StartStrcpyPart)
> > -# include "strlen-sse2-pminub.S"
> > -# undef RETURN
> > -
> > -L(StartStrcpyPart):
> > -       lea     (%r9, %rax), %rdi
> > -       mov     %rsi, %rcx
> > -       mov     %r9, %rax      /* save result */
> > -
> > -# ifdef USE_AS_STRNCAT
> > -       test    %r8, %r8
> > -       jz      L(ExitZero)
> > -#  define USE_AS_STRNCPY
> > -# endif
> > -
> > -# include "strcpy-sse2-unaligned.S"
> > -#endif
> > diff --git a/sysdeps/x86_64/multiarch/strcat-ssse3.S b/sysdeps/x86_64/multiarch/strcat-ssse3.S
> > deleted file mode 100644
> > index abd2c0c..0000000
> > --- a/sysdeps/x86_64/multiarch/strcat-ssse3.S
> > +++ /dev/null
> > @@ -1,557 +0,0 @@
> > -/* strcat with SSSE3
> > -   Copyright (C) 2011 Free Software Foundation, Inc.
> > -   Contributed by Intel Corporation.
> > -   This file is part of the GNU C Library.
> > -
> > -   The GNU C Library is free software; you can redistribute it and/or
> > -   modify it under the terms of the GNU Lesser General Public
> > -   License as published by the Free Software Foundation; either
> > -   version 2.1 of the License, or (at your option) any later version.
> > -
> > -   The GNU C Library is distributed in the hope that it will be useful,
> > -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > -   Lesser General Public License for more details.
> > -
> > -   You should have received a copy of the GNU Lesser General Public
> > -   License along with the GNU C Library; if not, see
> > -   <http://www.gnu.org/licenses/>.  */
> > -
> > -#ifndef NOT_IN_libc
> > -
> > -# include <sysdep.h>
> > -
> > -# ifndef STRCAT
> > -#  define STRCAT  __strcat_ssse3
> > -# endif
> > -
> > -# define USE_AS_STRCAT
> > -
> > -.text
> > -ENTRY (STRCAT)
> > -# ifdef USE_AS_STRNCAT
> > -       mov     %rdx, %r8
> > -# endif
> > -
> > -# define RETURN  jmp L(StartStrcpyPart)
> > -# include "strlen-sse2-no-bsf.S"
> > -
> > -# undef RETURN
> > -
> > -L(StartStrcpyPart):
> > -       mov     %rsi, %rcx
> > -       lea     (%rdi, %rax), %rdx
> > -# ifdef USE_AS_STRNCAT
> > -       test    %r8, %r8
> > -       jz      L(StrncatExit0)
> > -       cmp     $8, %r8
> > -       jbe     L(StrncatExit8Bytes)
> > -# endif
> > -       cmpb    $0, (%rcx)
> > -       jz      L(Exit1)
> > -       cmpb    $0, 1(%rcx)
> > -       jz      L(Exit2)
> > -       cmpb    $0, 2(%rcx)
> > -       jz      L(Exit3)
> > -       cmpb    $0, 3(%rcx)
> > -       jz      L(Exit4)
> > -       cmpb    $0, 4(%rcx)
> > -       jz      L(Exit5)
> > -       cmpb    $0, 5(%rcx)
> > -       jz      L(Exit6)
> > -       cmpb    $0, 6(%rcx)
> > -       jz      L(Exit7)
> > -       cmpb    $0, 7(%rcx)
> > -       jz      L(Exit8)
> > -       cmpb    $0, 8(%rcx)
> > -       jz      L(Exit9)
> > -# ifdef USE_AS_STRNCAT
> > -       cmp     $16, %r8
> > -       jb      L(StrncatExit15Bytes)
> > -# endif
> > -       cmpb    $0, 9(%rcx)
> > -       jz      L(Exit10)
> > -       cmpb    $0, 10(%rcx)
> > -       jz      L(Exit11)
> > -       cmpb    $0, 11(%rcx)
> > -       jz      L(Exit12)
> > -       cmpb    $0, 12(%rcx)
> > -       jz      L(Exit13)
> > -       cmpb    $0, 13(%rcx)
> > -       jz      L(Exit14)
> > -       cmpb    $0, 14(%rcx)
> > -       jz      L(Exit15)
> > -       cmpb    $0, 15(%rcx)
> > -       jz      L(Exit16)
> > -# ifdef USE_AS_STRNCAT
> > -       cmp     $16, %r8
> > -       je      L(StrncatExit16)
> > -#  define USE_AS_STRNCPY
> > -# endif
> > -
> > -# include "strcpy-ssse3.S"
> > -
> > -       .p2align 4
> > -L(CopyFrom1To16Bytes):
> > -       add     %rsi, %rdx
> > -       add     %rsi, %rcx
> > -
> > -       test    %al, %al
> > -       jz      L(ExitHigh)
> > -       test    $0x01, %al
> > -       jnz     L(Exit1)
> > -       test    $0x02, %al
> > -       jnz     L(Exit2)
> > -       test    $0x04, %al
> > -       jnz     L(Exit3)
> > -       test    $0x08, %al
> > -       jnz     L(Exit4)
> > -       test    $0x10, %al
> > -       jnz     L(Exit5)
> > -       test    $0x20, %al
> > -       jnz     L(Exit6)
> > -       test    $0x40, %al
> > -       jnz     L(Exit7)
> > -       movlpd  (%rcx), %xmm0
> > -       movlpd  %xmm0, (%rdx)
> > -       mov     %rdi, %rax
> > -       ret
> > -
> > -       .p2align 4
> > -L(ExitHigh):
> > -       test    $0x01, %ah
> > -       jnz     L(Exit9)
> > -       test    $0x02, %ah
> > -       jnz     L(Exit10)
> > -       test    $0x04, %ah
> > -       jnz     L(Exit11)
> > -       test    $0x08, %ah
> > -       jnz     L(Exit12)
> > -       test    $0x10, %ah
> > -       jnz     L(Exit13)
> > -       test    $0x20, %ah
> > -       jnz     L(Exit14)
> > -       test    $0x40, %ah
> > -       jnz     L(Exit15)
> > -       movlpd  (%rcx), %xmm0
> > -       movlpd  8(%rcx), %xmm1
> > -       movlpd  %xmm0, (%rdx)
> > -       movlpd  %xmm1, 8(%rdx)
> > -       mov     %rdi, %rax
> > -       ret
> > -
> > -       .p2align 4
> > -L(StrncatExit1):
> > -       xor     %ah, %ah
> > -       movb    %ah, 1(%rdx)
> > -L(Exit1):
> > -       movb    (%rcx), %al
> > -       movb    %al, (%rdx)
> > -       mov     %rdi, %rax
> > -       ret
> > -
> > -       .p2align 4
> > -L(StrncatExit2):
> > -       xor     %ah, %ah
> > -       movb    %ah, 2(%rdx)
> > -L(Exit2):
> > -       movw    (%rcx), %ax
> > -       movw    %ax, (%rdx)
> > -       mov     %rdi, %rax
> > -       ret
> > -
> > -       .p2align 4
> > -L(StrncatExit3):
> > -       xor     %ah, %ah
> > -       movb    %ah, 3(%rdx)
> > -L(Exit3):
> > -       movw    (%rcx), %ax
> > -       movw    %ax, (%rdx)
> > -       movb    2(%rcx), %al
> > -       movb    %al, 2(%rdx)
> > -       mov     %rdi, %rax
> > -       ret
> > -
> > -       .p2align 4
> > -L(StrncatExit4):
> > -       xor     %ah, %ah
> > -       movb    %ah, 4(%rdx)
> > -L(Exit4):
> > -       mov     (%rcx), %eax
> > -       mov     %eax, (%rdx)
> > -       mov     %rdi, %rax
> > -       ret
> > -
> > -       .p2align 4
> > -L(StrncatExit5):
> > -       xor     %ah, %ah
> > -       movb    %ah, 5(%rdx)
> > -L(Exit5):
> > -       mov     (%rcx), %eax
> > -       mov     %eax, (%rdx)
> > -       movb    4(%rcx), %al
> > -       movb    %al, 4(%rdx)
> > -       mov     %rdi, %rax
> > -       ret
> > -
> > -       .p2align 4
> > -L(StrncatExit6):
> > -       xor     %ah, %ah
> > -       movb    %ah, 6(%rdx)
> > -L(Exit6):
> > -       mov     (%rcx), %eax
> > -       mov     %eax, (%rdx)
> > -       movw    4(%rcx), %ax
> > -       movw    %ax, 4(%rdx)
> > -       mov     %rdi, %rax
> > -       ret
> > -
> > -       .p2align 4
> > -L(StrncatExit7):
> > -       xor     %ah, %ah
> > -       movb    %ah, 7(%rdx)
> > -L(Exit7):
> > -       mov     (%rcx), %eax
> > -       mov     %eax, (%rdx)
> > -       mov     3(%rcx), %eax
> > -       mov     %eax, 3(%rdx)
> > -       mov     %rdi, %rax
> > -       ret
> > -
> > -       .p2align 4
> > -L(StrncatExit8):
> > -       xor     %ah, %ah
> > -       movb    %ah, 8(%rdx)
> > -L(Exit8):
> > -       movlpd  (%rcx), %xmm0
> > -       movlpd  %xmm0, (%rdx)
> > -       mov     %rdi, %rax
> > -       ret
> > -
> > -       .p2align 4
> > -L(StrncatExit9):
> > -       xor     %ah, %ah
> > -       movb    %ah, 9(%rdx)
> > -L(Exit9):
> > -       movlpd  (%rcx), %xmm0
> > -       movlpd  %xmm0, (%rdx)
> > -       movb    8(%rcx), %al
> > -       movb    %al, 8(%rdx)
> > -       mov     %rdi, %rax
> > -       ret
> > -
> > -       .p2align 4
> > -L(StrncatExit10):
> > -       xor     %ah, %ah
> > -       movb    %ah, 10(%rdx)
> > -L(Exit10):
> > -       movlpd  (%rcx), %xmm0
> > -       movlpd  %xmm0, (%rdx)
> > -       movw    8(%rcx), %ax
> > -       movw    %ax, 8(%rdx)
> > -       mov     %rdi, %rax
> > -       ret
> > -
> > -       .p2align 4
> > -L(StrncatExit11):
> > -       xor     %ah, %ah
> > -       movb    %ah, 11(%rdx)
> > -L(Exit11):
> > -       movlpd  (%rcx), %xmm0
> > -       movlpd  %xmm0, (%rdx)
> > -       mov     7(%rcx), %eax
> > -       mov     %eax, 7(%rdx)
> > -       mov     %rdi, %rax
> > -       ret
> > -
> > -       .p2align 4
> > -L(StrncatExit12):
> > -       xor     %ah, %ah
> > -       movb    %ah, 12(%rdx)
> > -L(Exit12):
> > -       movlpd  (%rcx), %xmm0
> > -       movlpd  %xmm0, (%rdx)
> > -       mov     8(%rcx), %eax
> > -       mov     %eax, 8(%rdx)
> > -       mov     %rdi, %rax
> > -       ret
> > -
> > -       .p2align 4
> > -L(StrncatExit13):
> > -       xor     %ah, %ah
> > -       movb    %ah, 13(%rdx)
> > -L(Exit13):
> > -       movlpd  (%rcx), %xmm0
> > -       movlpd  %xmm0, (%rdx)
> > -       movlpd  5(%rcx), %xmm1
> > -       movlpd  %xmm1, 5(%rdx)
> > -       mov     %rdi, %rax
> > -       ret
> > -
> > -       .p2align 4
> > -L(StrncatExit14):
> > -       xor     %ah, %ah
> > -       movb    %ah, 14(%rdx)
> > -L(Exit14):
> > -       movlpd  (%rcx), %xmm0
> > -       movlpd  %xmm0, (%rdx)
> > -       movlpd  6(%rcx), %xmm1
> > -       movlpd  %xmm1, 6(%rdx)
> > -       mov     %rdi, %rax
> > -       ret
> > -
> > -       .p2align 4
> > -L(StrncatExit15):
> > -       xor     %ah, %ah
> > -       movb    %ah, 15(%rdx)
> > -L(Exit15):
> > -       movlpd  (%rcx), %xmm0
> > -       movlpd  %xmm0, (%rdx)
> > -       movlpd  7(%rcx), %xmm1
> > -       movlpd  %xmm1, 7(%rdx)
> > -       mov     %rdi, %rax
> > -       ret
> > -
> > -       .p2align 4
> > -L(StrncatExit16):
> > -       xor     %ah, %ah
> > -       movb    %ah, 16(%rdx)
> > -L(Exit16):
> > -       movlpd  (%rcx), %xmm0
> > -       movlpd  8(%rcx), %xmm1
> > -       movlpd  %xmm0, (%rdx)
> > -       movlpd  %xmm1, 8(%rdx)
> > -       mov     %rdi, %rax
> > -       ret
> > -
> > -# ifdef USE_AS_STRNCPY
> > -
> > -       .p2align 4
> > -L(CopyFrom1To16BytesCase2):
> > -       add     $16, %r8
> > -       add     %rsi, %rcx
> > -       lea     (%rsi, %rdx), %rsi
> > -       lea     -9(%r8), %rdx
> > -       and     $1<<7, %dh
> > -       or      %al, %dh
> > -       test    %dh, %dh
> > -       lea     (%rsi), %rdx
> > -       jz      L(ExitHighCase2)
> > -
> > -       test    $0x01, %al
> > -       jnz     L(Exit1)
> > -       cmp     $1, %r8
> > -       je      L(StrncatExit1)
> > -       test    $0x02, %al
> > -       jnz     L(Exit2)
> > -       cmp     $2, %r8
> > -       je      L(StrncatExit2)
> > -       test    $0x04, %al
> > -       jnz     L(Exit3)
> > -       cmp     $3, %r8
> > -       je      L(StrncatExit3)
> > -       test    $0x08, %al
> > -       jnz     L(Exit4)
> > -       cmp     $4, %r8
> > -       je      L(StrncatExit4)
> > -       test    $0x10, %al
> > -       jnz     L(Exit5)
> > -       cmp     $5, %r8
> > -       je      L(StrncatExit5)
> > -       test    $0x20, %al
> > -       jnz     L(Exit6)
> > -       cmp     $6, %r8
> > -       je      L(StrncatExit6)
> > -       test    $0x40, %al
> > -       jnz     L(Exit7)
> > -       cmp     $7, %r8
> > -       je      L(StrncatExit7)
> > -       movlpd  (%rcx), %xmm0
> > -       movlpd  %xmm0, (%rdx)
> > -       lea     7(%rdx), %rax
> > -       cmpb    $1, (%rax)
> > -       sbb     $-1, %rax
> > -       xor     %cl, %cl
> > -       movb    %cl, (%rax)
> > -       mov     %rdi, %rax
> > -       ret
> > -
> > -       .p2align 4
> > -L(ExitHighCase2):
> > -       test    $0x01, %ah
> > -       jnz     L(Exit9)
> > -       cmp     $9, %r8
> > -       je      L(StrncatExit9)
> > -       test    $0x02, %ah
> > -       jnz     L(Exit10)
> > -       cmp     $10, %r8
> > -       je      L(StrncatExit10)
> > -       test    $0x04, %ah
> > -       jnz     L(Exit11)
> > -       cmp     $11, %r8
> > -       je      L(StrncatExit11)
> > -       test    $0x8, %ah
> > -       jnz     L(Exit12)
> > -       cmp     $12, %r8
> > -       je      L(StrncatExit12)
> > -       test    $0x10, %ah
> > -       jnz     L(Exit13)
> > -       cmp     $13, %r8
> > -       je      L(StrncatExit13)
> > -       test    $0x20, %ah
> > -       jnz     L(Exit14)
> > -       cmp     $14, %r8
> > -       je      L(StrncatExit14)
> > -       test    $0x40, %ah
> > -       jnz     L(Exit15)
> > -       cmp     $15, %r8
> > -       je      L(StrncatExit15)
> > -       movlpd  (%rcx), %xmm0
> > -       movlpd  %xmm0, (%rdx)
> > -       movlpd  8(%rcx), %xmm1
> > -       movlpd  %xmm1, 8(%rdx)
> > -       mov     %rdi, %rax
> > -       ret
> > -
> > -L(CopyFrom1To16BytesCase2OrCase3):
> > -       test    %rax, %rax
> > -       jnz     L(CopyFrom1To16BytesCase2)
> > -
> > -       .p2align 4
> > -L(CopyFrom1To16BytesCase3):
> > -       add     $16, %r8
> > -       add     %rsi, %rdx
> > -       add     %rsi, %rcx
> > -
> > -       cmp     $8, %r8
> > -       ja      L(ExitHighCase3)
> > -       cmp     $1, %r8
> > -       je      L(StrncatExit1)
> > -       cmp     $2, %r8
> > -       je      L(StrncatExit2)
> > -       cmp     $3, %r8
> > -       je      L(StrncatExit3)
> > -       cmp     $4, %r8
> > -       je      L(StrncatExit4)
> > -       cmp     $5, %r8
> > -       je      L(StrncatExit5)
> > -       cmp     $6, %r8
> > -       je      L(StrncatExit6)
> > -       cmp     $7, %r8
> > -       je      L(StrncatExit7)
> > -       movlpd  (%rcx), %xmm0
> > -       movlpd  %xmm0, (%rdx)
> > -       xor     %ah, %ah
> > -       movb    %ah, 8(%rdx)
> > -       mov     %rdi, %rax
> > -       ret
> > -
> > -       .p2align 4
> > -L(ExitHighCase3):
> > -       cmp     $9, %r8
> > -       je      L(StrncatExit9)
> > -       cmp     $10, %r8
> > -       je      L(StrncatExit10)
> > -       cmp     $11, %r8
> > -       je      L(StrncatExit11)
> > -       cmp     $12, %r8
> > -       je      L(StrncatExit12)
> > -       cmp     $13, %r8
> > -       je      L(StrncatExit13)
> > -       cmp     $14, %r8
> > -       je      L(StrncatExit14)
> > -       cmp     $15, %r8
> > -       je      L(StrncatExit15)
> > -       movlpd  (%rcx), %xmm0
> > -       movlpd  %xmm0, (%rdx)
> > -       movlpd  8(%rcx), %xmm1
> > -       movlpd  %xmm1, 8(%rdx)
> > -       xor     %ah, %ah
> > -       movb    %ah, 16(%rdx)
> > -       mov     %rdi, %rax
> > -       ret
> > -
> > -       .p2align 4
> > -L(StrncatExit0):
> > -       mov     %rdi, %rax
> > -       ret
> > -
> > -       .p2align 4
> > -L(StrncatExit15Bytes):
> > -       cmp     $9, %r8
> > -       je      L(StrncatExit9)
> > -       cmpb    $0, 9(%rcx)
> > -       jz      L(Exit10)
> > -       cmp     $10, %r8
> > -       je      L(StrncatExit10)
> > -       cmpb    $0, 10(%rcx)
> > -       jz      L(Exit11)
> > -       cmp     $11, %r8
> > -       je      L(StrncatExit11)
> > -       cmpb    $0, 11(%rcx)
> > -       jz      L(Exit12)
> > -       cmp     $12, %r8
> > -       je      L(StrncatExit12)
> > -       cmpb    $0, 12(%rcx)
> > -       jz      L(Exit13)
> > -       cmp     $13, %r8
> > -       je      L(StrncatExit13)
> > -       cmpb    $0, 13(%rcx)
> > -       jz      L(Exit14)
> > -       cmp     $14, %r8
> > -       je      L(StrncatExit14)
> > -       movlpd  (%rcx), %xmm0
> > -       movlpd  %xmm0, (%rdx)
> > -       movlpd  7(%rcx), %xmm1
> > -       movlpd  %xmm1, 7(%rdx)
> > -       lea     14(%rdx), %rax
> > -       cmpb    $1, (%rax)
> > -       sbb     $-1, %rax
> > -       xor     %cl, %cl
> > -       movb    %cl, (%rax)
> > -       mov     %rdi, %rax
> > -       ret
> > -
> > -       .p2align 4
> > -L(StrncatExit8Bytes):
> > -       cmpb    $0, (%rcx)
> > -       jz      L(Exit1)
> > -       cmp     $1, %r8
> > -       je      L(StrncatExit1)
> > -       cmpb    $0, 1(%rcx)
> > -       jz      L(Exit2)
> > -       cmp     $2, %r8
> > -       je      L(StrncatExit2)
> > -       cmpb    $0, 2(%rcx)
> > -       jz      L(Exit3)
> > -       cmp     $3, %r8
> > -       je      L(StrncatExit3)
> > -       cmpb    $0, 3(%rcx)
> > -       jz      L(Exit4)
> > -       cmp     $4, %r8
> > -       je      L(StrncatExit4)
> > -       cmpb    $0, 4(%rcx)
> > -       jz      L(Exit5)
> > -       cmp     $5, %r8
> > -       je      L(StrncatExit5)
> > -       cmpb    $0, 5(%rcx)
> > -       jz      L(Exit6)
> > -       cmp     $6, %r8
> > -       je      L(StrncatExit6)
> > -       cmpb    $0, 6(%rcx)
> > -       jz      L(Exit7)
> > -       cmp     $7, %r8
> > -       je      L(StrncatExit7)
> > -       movlpd  (%rcx), %xmm0
> > -       movlpd  %xmm0, (%rdx)
> > -       lea     7(%rdx), %rax
> > -       cmpb    $1, (%rax)
> > -       sbb     $-1, %rax
> > -       xor     %cl, %cl
> > -       movb    %cl, (%rax)
> > -       mov     %rdi, %rax
> > -       ret
> > -
> > -# endif
> > -END (STRCAT)
> > -#endif
> > diff --git a/sysdeps/x86_64/multiarch/strcat.S b/sysdeps/x86_64/multiarch/strcat.S
> > deleted file mode 100644
> > index 0c256de..0000000
> > --- a/sysdeps/x86_64/multiarch/strcat.S
> > +++ /dev/null
> > @@ -1,84 +0,0 @@
> > -/* Multiple versions of strcat
> > -   Copyright (C) 2009, 2011 Free Software Foundation, Inc.
> > -   Contributed by Intel Corporation.
> > -   This file is part of the GNU C Library.
> > -
> > -   The GNU C Library is free software; you can redistribute it and/or
> > -   modify it under the terms of the GNU Lesser General Public
> > -   License as published by the Free Software Foundation; either
> > -   version 2.1 of the License, or (at your option) any later version.
> > -
> > -   The GNU C Library is distributed in the hope that it will be useful,
> > -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > -   Lesser General Public License for more details.
> > -
> > -   You should have received a copy of the GNU Lesser General Public
> > -   License along with the GNU C Library; if not, see
> > -   <http://www.gnu.org/licenses/>.  */
> > -
> > -#include <sysdep.h>
> > -#include <init-arch.h>
> > -
> > -#ifndef USE_AS_STRNCAT
> > -# ifndef STRCAT
> > -#  define STRCAT strcat
> > -# endif
> > -#endif
> > -
> > -#ifdef USE_AS_STRNCAT
> > -# define STRCAT_SSSE3                  __strncat_ssse3
> > -# define STRCAT_SSE2                   __strncat_sse2
> > -# define STRCAT_SSE2_UNALIGNED         __strncat_sse2_unaligned
> > -# define __GI_STRCAT                   __GI_strncat
> > -# define __GI___STRCAT              __GI___strncat
> > -#else
> > -# define STRCAT_SSSE3                  __strcat_ssse3
> > -# define STRCAT_SSE2                   __strcat_sse2
> > -# define STRCAT_SSE2_UNALIGNED         __strcat_sse2_unaligned
> > -# define __GI_STRCAT                   __GI_strcat
> > -# define __GI___STRCAT              __GI___strcat
> > -#endif
> > -
> > -
> > -/* Define multiple versions only for the definition in libc.  */
> > -#ifndef NOT_IN_libc
> > -       .text
> > -ENTRY(STRCAT)
> > -       .type   STRCAT, @gnu_indirect_function
> > -       cmpl    $0, __cpu_features+KIND_OFFSET(%rip)
> > -       jne     1f
> > -       call    __init_cpu_features
> > -1:     leaq    STRCAT_SSE2_UNALIGNED(%rip), %rax
> > -       testl   $bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip)
> > -       jnz     2f
> > -       leaq    STRCAT_SSE2(%rip), %rax
> > -       testl   $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
> > -       jz      2f
> > -       leaq    STRCAT_SSSE3(%rip), %rax
> > -2:     ret
> > -END(STRCAT)
> > -
> > -# undef ENTRY
> > -# define ENTRY(name) \
> > -       .type STRCAT_SSE2, @function; \
> > -       .align 16; \
> > -       STRCAT_SSE2: cfi_startproc; \
> > -       CALL_MCOUNT
> > -# undef END
> > -# define END(name) \
> > -       cfi_endproc; .size STRCAT_SSE2, .-STRCAT_SSE2
> > -# undef libc_hidden_builtin_def
> > -/* It doesn't make sense to send libc-internal strcat calls through a PLT.
> > -   The speedup we get from using SSSE3 instruction is likely eaten away
> > -   by the indirect call in the PLT.  */
> > -# define libc_hidden_builtin_def(name) \
> > -       .globl __GI_STRCAT; __GI_STRCAT = STRCAT_SSE2
> > -# undef libc_hidden_def
> > -# define libc_hidden_def(name) \
> > -       .globl __GI___STRCAT; __GI___STRCAT = STRCAT_SSE2
> > -#endif
> > -
> > -#ifndef USE_AS_STRNCAT
> > -# include "../strcat.S"
> > -#endif
> > diff --git a/sysdeps/x86_64/multiarch/strncat-c.c b/sysdeps/x86_64/multiarch/strncat-c.c
> > deleted file mode 100644
> > index a3cdbff..0000000
> > --- a/sysdeps/x86_64/multiarch/strncat-c.c
> > +++ /dev/null
> > @@ -1,8 +0,0 @@
> > -#define STRNCAT __strncat_sse2
> > -#ifdef SHARED
> > -#undef libc_hidden_def
> > -#define libc_hidden_def(name) \
> > -  __hidden_ver1 (__strncat_sse2, __GI___strncat, __strncat_sse2);
> > -#endif
> > -
> > -#include "string/strncat.c"
> > diff --git a/sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S
> > deleted file mode 100644
> > index 133e1d2..0000000
> > --- a/sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S
> > +++ /dev/null
> > @@ -1,3 +0,0 @@
> > -#define USE_AS_STRNCAT
> > -#define STRCAT __strncat_sse2_unaligned
> > -#include "strcat-sse2-unaligned.S"
> > diff --git a/sysdeps/x86_64/multiarch/strncat-ssse3.S b/sysdeps/x86_64/multiarch/strncat-ssse3.S
> > deleted file mode 100644
> > index 6c45ff3..0000000
> > --- a/sysdeps/x86_64/multiarch/strncat-ssse3.S
> > +++ /dev/null
> > @@ -1,3 +0,0 @@
> > -#define USE_AS_STRNCAT
> > -#define STRCAT __strncat_ssse3
> > -#include "strcat-ssse3.S"
> > diff --git a/sysdeps/x86_64/multiarch/strncat.S b/sysdeps/x86_64/multiarch/strncat.S
> > deleted file mode 100644
> > index fd569c2..0000000
> > --- a/sysdeps/x86_64/multiarch/strncat.S
> > +++ /dev/null
> > @@ -1,3 +0,0 @@
> > -#define STRCAT strncat
> > -#define USE_AS_STRNCAT
> > -#include "strcat.S"
> > diff --git a/sysdeps/x86_64/strcat.S b/sysdeps/x86_64/strcat.S
> > deleted file mode 100644
> > index 535a18d..0000000
> > --- a/sysdeps/x86_64/strcat.S
> > +++ /dev/null
> > @@ -1,259 +0,0 @@
> > -/* strcat(dest, src) -- Append SRC on the end of DEST.
> > -   Optimized for x86-64.
> > -   Copyright (C) 2002 Free Software Foundation, Inc.
> > -   This file is part of the GNU C Library.
> > -   Contributed by Andreas Jaeger <aj@suse.de>, 2002.
> > -
> > -   The GNU C Library is free software; you can redistribute it and/or
> > -   modify it under the terms of the GNU Lesser General Public
> > -   License as published by the Free Software Foundation; either
> > -   version 2.1 of the License, or (at your option) any later version.
> > -
> > -   The GNU C Library is distributed in the hope that it will be useful,
> > -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > -   Lesser General Public License for more details.
> > -
> > -   You should have received a copy of the GNU Lesser General Public
> > -   License along with the GNU C Library; if not, see
> > -   <http://www.gnu.org/licenses/>.  */
> > -
> > -#include <sysdep.h>
> > -#include "asm-syntax.h"
> > -#include "bp-sym.h"
> > -#include "bp-asm.h"
> > -
> > -
> > -       .text
> > -ENTRY (BP_SYM (strcat))
> > -       movq %rdi, %rcx         /* Dest. register. */
> > -       andl $7, %ecx           /* mask alignment bits */
> > -       movq %rdi, %rax         /* Duplicate destination pointer.  */
> > -       movq $0xfefefefefefefeff,%r8
> > -
> > -       /* First step: Find end of destination.  */
> > -       jz 4f                   /* aligned => start loop */
> > -
> > -       neg %ecx                /* We need to align to 8 bytes.  */
> > -       addl $8,%ecx
> > -       /* Search the first bytes directly.  */
> > -0:     cmpb $0x0,(%rax)        /* is byte NUL? */
> > -       je 2f                   /* yes => start copy */
> > -       incq %rax               /* increment pointer */
> > -       decl %ecx
> > -       jnz 0b
> > -
> > -
> > -
> > -       /* Now the source is aligned.  Scan for NUL byte.  */
> > -       .p2align 4
> > -4:
> > -       /* First unroll.  */
> > -       movq (%rax), %rcx       /* get double word (= 8 bytes) in question */
> > -       addq $8,%rax            /* adjust pointer for next word */
> > -       movq %r8, %rdx          /* magic value */
> > -       addq %rcx, %rdx         /* add the magic value to the word.  We get
> > -                                  carry bits reported for each byte which
> > -                                  is *not* 0 */
> > -       jnc 3f                  /* highest byte is NUL => return pointer */
> > -       xorq %rcx, %rdx         /* (word+magic)^word */
> > -       orq %r8, %rdx           /* set all non-carry bits */
> > -       incq %rdx               /* add 1: if one carry bit was *not* set
> > -                                  the addition will not result in 0.  */
> > -       jnz 3f                  /* found NUL => return pointer */
> > -
> > -       /* Second unroll.  */
> > -       movq (%rax), %rcx       /* get double word (= 8 bytes) in question */
> > -       addq $8,%rax            /* adjust pointer for next word */
> > -       movq %r8, %rdx          /* magic value */
> > -       addq %rcx, %rdx         /* add the magic value to the word.  We get
> > -                                  carry bits reported for each byte which
> > -                                  is *not* 0 */
> > -       jnc 3f                  /* highest byte is NUL => return pointer */
> > -       xorq %rcx, %rdx         /* (word+magic)^word */
> > -       orq %r8, %rdx           /* set all non-carry bits */
> > -       incq %rdx               /* add 1: if one carry bit was *not* set
> > -                                  the addition will not result in 0.  */
> > -       jnz 3f                  /* found NUL => return pointer */
> > -
> > -       /* Third unroll.  */
> > -       movq (%rax), %rcx       /* get double word (= 8 bytes) in question */
> > -       addq $8,%rax            /* adjust pointer for next word */
> > -       movq %r8, %rdx          /* magic value */
> > -       addq %rcx, %rdx         /* add the magic value to the word.  We get
> > -                                  carry bits reported for each byte which
> > -                                  is *not* 0 */
> > -       jnc 3f                  /* highest byte is NUL => return pointer */
> > -       xorq %rcx, %rdx         /* (word+magic)^word */
> > -       orq %r8, %rdx           /* set all non-carry bits */
> > -       incq %rdx               /* add 1: if one carry bit was *not* set
> > -                                  the addition will not result in 0.  */
> > -       jnz 3f                  /* found NUL => return pointer */
> > -
> > -       /* Fourth unroll.  */
> > -       movq (%rax), %rcx       /* get double word (= 8 bytes) in question */
> > -       addq $8,%rax            /* adjust pointer for next word */
> > -       movq %r8, %rdx          /* magic value */
> > -       addq %rcx, %rdx         /* add the magic value to the word.  We get
> > -                                  carry bits reported for each byte which
> > -                                  is *not* 0 */
> > -       jnc 3f                  /* highest byte is NUL => return pointer */
> > -       xorq %rcx, %rdx         /* (word+magic)^word */
> > -       orq %r8, %rdx           /* set all non-carry bits */
> > -       incq %rdx               /* add 1: if one carry bit was *not* set
> > -                                  the addition will not result in 0.  */
> > -       jz 4b                   /* no NUL found => continue loop */
> > -
> > -       .p2align 4              /* Align, it's a jump target.  */
> > -3:     subq $8,%rax            /* correct pointer increment.  */
> > -
> > -       testb %cl, %cl          /* is first byte NUL? */
> > -       jz 2f                   /* yes => return */
> > -       incq %rax               /* increment pointer */
> > -
> > -       testb %ch, %ch          /* is second byte NUL? */
> > -       jz 2f                   /* yes => return */
> > -       incq %rax               /* increment pointer */
> > -
> > -       testl $0x00ff0000, %ecx /* is third byte NUL? */
> > -       jz 2f                   /* yes => return pointer */
> > -       incq %rax               /* increment pointer */
> > -
> > -       testl $0xff000000, %ecx /* is fourth byte NUL? */
> > -       jz 2f                   /* yes => return pointer */
> > -       incq %rax               /* increment pointer */
> > -
> > -       shrq $32, %rcx          /* look at other half.  */
> > -
> > -       testb %cl, %cl          /* is first byte NUL? */
> > -       jz 2f                   /* yes => return */
> > -       incq %rax               /* increment pointer */
> > -
> > -       testb %ch, %ch          /* is second byte NUL? */
> > -       jz 2f                   /* yes => return */
> > -       incq %rax               /* increment pointer */
> > -
> > -       testl $0xff0000, %ecx   /* is third byte NUL? */
> > -       jz 2f                   /* yes => return pointer */
> > -       incq %rax               /* increment pointer */
> > -
> > -2:
> > -       /* Second step: Copy source to destination.  */
> > -
> > -       movq    %rsi, %rcx      /* duplicate  */
> > -       andl    $7,%ecx         /* mask alignment bits */
> > -       movq    %rax, %rdx      /* move around */
> > -       jz      22f             /* aligned => start loop */
> > -
> > -       neg     %ecx            /* align to 8 bytes.  */
> > -       addl    $8, %ecx
> > -       /* Align the source pointer.  */
> > -21:
> > -       movb    (%rsi), %al     /* Fetch a byte */
> > -       testb   %al, %al        /* Is it NUL? */
> > -       movb    %al, (%rdx)     /* Store it */
> > -       jz      24f             /* If it was NUL, done! */
> > -       incq    %rsi
> > -       incq    %rdx
> > -       decl    %ecx
> > -       jnz     21b
> > -
> > -       /* Now the sources is aligned.  Unfortunatly we cannot force
> > -          to have both source and destination aligned, so ignore the
> > -          alignment of the destination.  */
> > -       .p2align 4
> > -22:
> > -       /* 1st unroll.  */
> > -       movq    (%rsi), %rax    /* Read double word (8 bytes).  */
> > -       addq    $8, %rsi        /* Adjust pointer for next word.  */
> > -       movq    %rax, %r9       /* Save a copy for NUL finding.  */
> > -       addq    %r8, %r9        /* add the magic value to the word.  We get
> > -                                  carry bits reported for each byte which
> > -                                  is *not* 0 */
> > -       jnc     23f             /* highest byte is NUL => return pointer */
> > -       xorq    %rax, %r9       /* (word+magic)^word */
> > -       orq     %r8, %r9        /* set all non-carry bits */
> > -       incq    %r9             /* add 1: if one carry bit was *not* set
> > -                                  the addition will not result in 0.  */
> > -
> > -       jnz     23f             /* found NUL => return pointer */
> > -
> > -       movq    %rax, (%rdx)    /* Write value to destination.  */
> > -       addq    $8, %rdx        /* Adjust pointer.  */
> > -
> > -       /* 2nd unroll.  */
> > -       movq    (%rsi), %rax    /* Read double word (8 bytes).  */
> > -       addq    $8, %rsi        /* Adjust pointer for next word.  */
> > -       movq    %rax, %r9       /* Save a copy for NUL finding.  */
> > -       addq    %r8, %r9        /* add the magic value to the word.  We get
> > -                                  carry bits reported for each byte which
> > -                                  is *not* 0 */
> > -       jnc     23f             /* highest byte is NUL => return pointer */
> > -       xorq    %rax, %r9       /* (word+magic)^word */
> > -       orq     %r8, %r9        /* set all non-carry bits */
> > -       incq    %r9             /* add 1: if one carry bit was *not* set
> > -                                  the addition will not result in 0.  */
> > -
> > -       jnz     23f             /* found NUL => return pointer */
> > -
> > -       movq    %rax, (%rdx)    /* Write value to destination.  */
> > -       addq    $8, %rdx        /* Adjust pointer.  */
> > -
> > -       /* 3rd unroll.  */
> > -       movq    (%rsi), %rax    /* Read double word (8 bytes).  */
> > -       addq    $8, %rsi        /* Adjust pointer for next word.  */
> > -       movq    %rax, %r9       /* Save a copy for NUL finding.  */
> > -       addq    %r8, %r9        /* add the magic value to the word.  We get
> > -                                  carry bits reported for each byte which
> > -                                  is *not* 0 */
> > -       jnc     23f             /* highest byte is NUL => return pointer */
> > -       xorq    %rax, %r9       /* (word+magic)^word */
> > -       orq     %r8, %r9        /* set all non-carry bits */
> > -       incq    %r9             /* add 1: if one carry bit was *not* set
> > -                                  the addition will not result in 0.  */
> > -
> > -       jnz     23f             /* found NUL => return pointer */
> > -
> > -       movq    %rax, (%rdx)    /* Write value to destination.  */
> > -       addq    $8, %rdx        /* Adjust pointer.  */
> > -
> > -       /* 4th unroll.  */
> > -       movq    (%rsi), %rax    /* Read double word (8 bytes).  */
> > -       addq    $8, %rsi        /* Adjust pointer for next word.  */
> > -       movq    %rax, %r9       /* Save a copy for NUL finding.  */
> > -       addq    %r8, %r9        /* add the magic value to the word.  We get
> > -                                  carry bits reported for each byte which
> > -                                  is *not* 0 */
> > -       jnc     23f             /* highest byte is NUL => return pointer */
> > -       xorq    %rax, %r9       /* (word+magic)^word */
> > -       orq     %r8, %r9        /* set all non-carry bits */
> > -       incq    %r9             /* add 1: if one carry bit was *not* set
> > -                                  the addition will not result in 0.  */
> > -
> > -       jnz     23f             /* found NUL => return pointer */
> > -
> > -       movq    %rax, (%rdx)    /* Write value to destination.  */
> > -       addq    $8, %rdx        /* Adjust pointer.  */
> > -       jmp     22b             /* Next iteration.  */
> > -
> > -       /* Do the last few bytes. %rax contains the value to write.
> > -          The loop is unrolled twice.  */
> > -       .p2align 4
> > -23:
> > -       movb    %al, (%rdx)     /* 1st byte.  */
> > -       testb   %al, %al        /* Is it NUL.  */
> > -       jz      24f             /* yes, finish.  */
> > -       incq    %rdx            /* Increment destination.  */
> > -       movb    %ah, (%rdx)     /* 2nd byte.  */
> > -       testb   %ah, %ah        /* Is it NUL?.  */
> > -       jz      24f             /* yes, finish.  */
> > -       incq    %rdx            /* Increment destination.  */
> > -       shrq    $16, %rax       /* Shift...  */
> > -       jmp     23b             /* and look at next two bytes in %rax.  */
> > -
> > -
> > -24:
> > -       movq    %rdi, %rax      /* Source is return value.  */
> > -       retq
> > -END (BP_SYM (strcat))
> > -libc_hidden_builtin_def (strcat)
> > --
> > 1.7.4.4
> >
> >
> >

-- 

Feature was not beta tested


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]