This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
Re: [PATCH] faster strcat
On Tue, Oct 09, 2012 at 04:37:34PM +0400, Dmitrieva Liubov wrote:
> Why do you think this is faster?
>
> You iterate over the whole src array twice instead of once as in the
> current version (first in strnlen and then in strcpy; the
> current assembler version uses something like strncpy instead, but
> without strncpy's zero filling)
>
It is almost the fastest I could portably write. I simplified it into:
+ size_t dest_len = strlen (dest);
+ size_t src_len = strnlen (src , n);
+ memcpy (dest + dest_len, src, src_len);
+ dest[dest_len + src_len] = '\0';
I would use stpncpy if it did not zero-fill. Also, adding a strcat variant that
returns the end of the string could speed things up.
> It's likely true that current version inlines slow strlen but that
> code can be switched to use other strlen version.
Good idea. I am not sure how to keep it in sync with strlen in the
future.
>
> + size_t dest_len = strlen (dest);
> + size_t src_len = strnlen (src , n);
>
> - if (c != '\0')
> - *++s1 = '\0';
> + if (src_len == n)
> + {
> + memcpy (dest + dest_len, src, n);
> + dest[dest_len + n] = '\0';
> + }
> + else
> + strcpy (dest + dest_len, src);
>
> - return s;
> + return dest;
> }
>
>
> --
> Liubov Dmitrieva
> Intel Corporation
>
> 2012/10/8 Ondřej Bílka <neleai@seznam.cz>
> >
> > This is next version of my patch
> > http://sourceware.org/ml/libc-alpha/2012-06/msg00489.html
> >
> > I investigated strcat a bit further, and the speed degradation
> > was caused by improper usage of indirect functions.
> >
> > strcat ifunc first tests bit_Fast_Unaligned_Load which is
> > false on core2 and AMD processors. Then it checks ssse3 and
> > calls ssse3 version.
> > But strcat_ssse3 inlines strlen_sse2_no_bsf, which on core2 and phenomII
> > is the slowest strlen variant unless the strings are larger than 2000, where
> > strlen_sse2 takes the lead.
> >
> > Then I deleted strcat variants that are no longer needed.
> >
> > Files ports/sysdeps/ia64/strcat.c, sysdeps/powerpc/strcat.c, became
> > duplicates of string/strcat.c.
> >
> >
> > * string/strcat.c: Reduce algorithm selection
> > to strlen,strcpy
> > * string/strncat.c: Likewise
> > * sysdeps/powerpc/strcat.c: Duplicated string/strcat.c
> > * ports/sysdeps/ia64/strcat.c: Likewise
> >
> > * sysdeps/i386/i686/multiarch/Makefile: Updated
> > * sysdeps/x86_64/multiarch/Makefile: Updated
> >
> > * sysdeps/i386/i486/strcat.S: No longer needed
> > * sysdeps/i386/i686/multiarch/strcat-sse2.S:Likewise
> > * sysdeps/i386/i686/multiarch/strcat-ssse3.S:Likewise
> > * sysdeps/i386/i686/multiarch/strcat.S:Likewise
> > * sysdeps/i386/i686/multiarch/strncat-c.c:Likewise
> > * sysdeps/i386/i686/multiarch/strncat-sse2.S:Likewise
> > * sysdeps/i386/i686/multiarch/strncat-ssse3.S:Likewise
> > * sysdeps/i386/i686/multiarch/strncat.S:Likewise
> > * sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S:Likewise
> > * sysdeps/x86_64/multiarch/strcat-ssse3.S:Likewise
> > * sysdeps/x86_64/multiarch/strcat.S:Likewise
> > * sysdeps/x86_64/multiarch/strncat-c.c:Likewise
> > * sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S:Likewise
> > * sysdeps/x86_64/multiarch/strncat-ssse3.S:Likewise
> > * sysdeps/x86_64/multiarch/strncat.S:Likewise
> > * sysdeps/x86_64/strcat.S:Likewise
> >
> >
> > ---
> > ports/sysdeps/ia64/strcat.c | 26 -
> > string/strcat.c | 29 +-
> > string/strncat.c | 62 +-
> > sysdeps/i386/i486/strcat.S | 273 -----
> > sysdeps/i386/i686/multiarch/Makefile | 3 +-
> > sysdeps/i386/i686/multiarch/strcat-sse2.S | 1243
> > ---------------------
> > sysdeps/i386/i686/multiarch/strcat-ssse3.S | 572 ----------
> > sysdeps/i386/i686/multiarch/strcat.S | 119 --
> > sysdeps/i386/i686/multiarch/strncat-c.c | 8 -
> > sysdeps/i386/i686/multiarch/strncat-sse2.S | 4 -
> > sysdeps/i386/i686/multiarch/strncat-ssse3.S | 4 -
> > sysdeps/i386/i686/multiarch/strncat.S | 3 -
> > sysdeps/powerpc/strcat.c | 30 -
> > sysdeps/x86_64/multiarch/Makefile | 5 +-
> > sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S | 53 -
> > sysdeps/x86_64/multiarch/strcat-ssse3.S | 557 ---------
> > sysdeps/x86_64/multiarch/strcat.S | 84 --
> > sysdeps/x86_64/multiarch/strncat-c.c | 8 -
> > sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S | 3 -
> > sysdeps/x86_64/multiarch/strncat-ssse3.S | 3 -
> > sysdeps/x86_64/multiarch/strncat.S | 3 -
> > sysdeps/x86_64/strcat.S | 259 -----
> > 24 files changed, 15 insertions(+), 4280 deletions(-)
> > delete mode 100644 ports/sysdeps/ia64/strcat.c
> > delete mode 100644 sysdeps/i386/i486/strcat.S
> > delete mode 100644 sysdeps/i386/i686/multiarch/strcat-sse2.S
> > delete mode 100644 sysdeps/i386/i686/multiarch/strcat-ssse3.S
> > delete mode 100644 sysdeps/i386/i686/multiarch/strcat.S
> > delete mode 100644 sysdeps/i386/i686/multiarch/strncat-c.c
> > delete mode 100644 sysdeps/i386/i686/multiarch/strncat-sse2.S
> > delete mode 100644 sysdeps/i386/i686/multiarch/strncat-ssse3.S
> > delete mode 100644 sysdeps/i386/i686/multiarch/strncat.S
> > delete mode 100644 sysdeps/powerpc/strcat.c
> > delete mode 100644 sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
> > delete mode 100644 sysdeps/x86_64/multiarch/strcat-ssse3.S
> > delete mode 100644 sysdeps/x86_64/multiarch/strcat.S
> > delete mode 100644 sysdeps/x86_64/multiarch/strncat-c.c
> > delete mode 100644 sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S
> > delete mode 100644 sysdeps/x86_64/multiarch/strncat-ssse3.S
> > delete mode 100644 sysdeps/x86_64/multiarch/strncat.S
> > delete mode 100644 sysdeps/x86_64/strcat.S
> >
> > diff --git a/ports/sysdeps/ia64/strcat.c b/ports/sysdeps/ia64/strcat.c
> > deleted file mode 100644
> > index 53cd4d1..0000000
> > --- a/ports/sysdeps/ia64/strcat.c
> > +++ /dev/null
> > @@ -1,26 +0,0 @@
> > -/* Copyright (C) 2004 Free Software Foundation, Inc.
> > - This file is part of the GNU C Library.
> > -
> > - The GNU C Library is free software; you can redistribute it and/or
> > - modify it under the terms of the GNU Lesser General Public
> > - License as published by the Free Software Foundation; either
> > - version 2.1 of the License, or (at your option) any later version.
> > -
> > - The GNU C Library is distributed in the hope that it will be useful,
> > - but WITHOUT ANY WARRANTY; without even the implied warranty of
> > - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > - Lesser General Public License for more details.
> > -
> > - You should have received a copy of the GNU Lesser General Public
> > - License along with the GNU C Library; if not, see
> > - <http://www.gnu.org/licenses/>. */
> > -
> > -#include <string.h>
> > -
> > -char *
> > -strcat (char *dest, const char *src)
> > -{
> > - strcpy (dest + strlen (dest), src);
> > - return dest;
> > -}
> > -libc_hidden_builtin_def (strcat)
> > diff --git a/string/strcat.c b/string/strcat.c
> > index f9e4bc6..28575d0 100644
> > --- a/string/strcat.c
> > +++ b/string/strcat.c
> > @@ -1,4 +1,5 @@
> > -/* Copyright (C) 1991, 1997, 2003 Free Software Foundation, Inc.
> > +/* strcat version that uses fast strcpy/strlen.
> > + Copyright (C) 1997, 2003 Free Software Foundation, Inc.
> > This file is part of the GNU C Library.
> >
> > The GNU C Library is free software; you can redistribute it and/or
> > @@ -16,36 +17,14 @@
> > <http://www.gnu.org/licenses/>. */
> >
> > #include <string.h>
> > -#include <memcopy.h>
> >
> > #undef strcat
> >
> > /* Append SRC on the end of DEST. */
> > char *
> > -strcat (dest, src)
> > - char *dest;
> > - const char *src;
> > +strcat (char *dest, const char *src)
> > {
> > - char *s1 = dest;
> > - const char *s2 = src;
> > - char c;
> > -
> > - /* Find the end of the string. */
> > - do
> > - c = *s1++;
> > - while (c != '\0');
> > -
> > - /* Make S1 point before the next character, so we can increment
> > - it while memory is read (wins on pipelined cpus). */
> > - s1 -= 2;
> > -
> > - do
> > - {
> > - c = *s2++;
> > - *++s1 = c;
> > - }
> > - while (c != '\0');
> > -
> > + strcpy (dest + strlen (dest), src);
> > return dest;
> > }
> > libc_hidden_builtin_def (strcat)
> > diff --git a/string/strncat.c b/string/strncat.c
> > index dcfb04d..17b4c9a 100644
> > --- a/string/strncat.c
> > +++ b/string/strncat.c
> > @@ -1,4 +1,4 @@
> > -/* Copyright (C) 1991,1997,2011 Free Software Foundation, Inc.
> > +/* Copyright (C) 1991-2012 Free Software Foundation, Inc.
> > This file is part of the GNU C Library.
> >
> > The GNU C Library is free software; you can redistribute it and/or
> > @@ -17,66 +17,20 @@
> >
> > #include <string.h>
> >
> > -#ifdef _LIBC
> > -# include <memcopy.h>
> > -#endif
> > -
> > #ifndef STRNCAT
> > # undef strncat
> > # define STRNCAT strncat
> > #endif
> >
> > char *
> > -STRNCAT (char *s1, const char *s2, size_t n)
> > +STRNCAT (char *dest, const char *src, size_t n)
> > {
> > - char c;
> > - char *s = s1;
> > -
> > - /* Find the end of S1. */
> > - do
> > - c = *s1++;
> > - while (c != '\0');
> > -
> > - /* Make S1 point before next character, so we can increment
> > - it while memory is read (wins on pipelined cpus). */
> > - s1 -= 2;
> > -
> > - if (n >= 4)
> > - {
> > - size_t n4 = n >> 2;
> > - do
> > - {
> > - c = *s2++;
> > - *++s1 = c;
> > - if (c == '\0')
> > - return s;
> > - c = *s2++;
> > - *++s1 = c;
> > - if (c == '\0')
> > - return s;
> > - c = *s2++;
> > - *++s1 = c;
> > - if (c == '\0')
> > - return s;
> > - c = *s2++;
> > - *++s1 = c;
> > - if (c == '\0')
> > - return s;
> > - } while (--n4 > 0);
> > - n &= 3;
> > - }
> > -
> > - while (n > 0)
> > - {
> > - c = *s2++;
> > - *++s1 = c;
> > - if (c == '\0')
> > - return s;
> > - n--;
> > - }
> > + size_t dest_len = strlen (dest);
> > + size_t src_len = strnlen (src , n);
> >
> > - if (c != '\0')
> > - *++s1 = '\0';
> > + if (src_len == n)
> > + {
> > + memcpy (dest + dest_len, src, n);
> > + dest[dest_len + n] = '\0';
> > + }
> > + else
> > + strcpy (dest + dest_len, src);
> >
> > - return s;
> > + return dest;
> > }
> > diff --git a/sysdeps/i386/i486/strcat.S b/sysdeps/i386/i486/strcat.S
> > deleted file mode 100644
> > index 7596a0d..0000000
> > --- a/sysdeps/i386/i486/strcat.S
> > +++ /dev/null
> > @@ -1,273 +0,0 @@
> > -/* strcat(dest, src) -- Append SRC on the end of DEST.
> > - For Intel 80x86, x>=4.
> > - Copyright (C) 1994-1997,2000,2003,2005 Free Software Foundation, Inc.
> > - This file is part of the GNU C Library.
> > - Contributed by Ulrich Drepper <drepper@ipd.info.uni-karlsruhe.de>.
> > - Optimised a little by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
> > -
> > - The GNU C Library is free software; you can redistribute it and/or
> > - modify it under the terms of the GNU Lesser General Public
> > - License as published by the Free Software Foundation; either
> > - version 2.1 of the License, or (at your option) any later version.
> > -
> > - The GNU C Library is distributed in the hope that it will be useful,
> > - but WITHOUT ANY WARRANTY; without even the implied warranty of
> > - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > - Lesser General Public License for more details.
> > -
> > - You should have received a copy of the GNU Lesser General Public
> > - License along with the GNU C Library; if not, see
> > - <http://www.gnu.org/licenses/>. */
> > -
> > -#include <sysdep.h>
> > -#include "asm-syntax.h"
> > -#include "bp-sym.h"
> > -#include "bp-asm.h"
> > -
> > -#define PARMS LINKAGE+4 /* space for 1 saved reg */
> > -#define RTN PARMS
> > -#define DEST RTN+RTN_SIZE
> > -#define SRC DEST+PTR_SIZE
> > -
> > - .text
> > -ENTRY (BP_SYM (strcat))
> > - ENTER
> > -
> > - pushl %edi /* Save callee-safe register. */
> > - cfi_adjust_cfa_offset (4)
> > -
> > - movl DEST(%esp), %edx
> > - movl SRC(%esp), %ecx
> > - CHECK_BOUNDS_LOW (%edx, DEST(%esp))
> > - CHECK_BOUNDS_LOW (%ecx, SRC(%esp))
> > -
> > - testb $0xff, (%ecx) /* Is source string empty? */
> > - jz L(8) /* yes => return */
> > -
> > - /* Test the first bytes separately until destination is aligned.
> > */
> > - testl $3, %edx /* destination pointer aligned? */
> > - jz L(1) /* yes => begin scan loop */
> > - testb $0xff, (%edx) /* is end of string? */
> > - jz L(2) /* yes => start appending */
> > - incl %edx /* increment source pointer */
> > -
> > - testl $3, %edx /* destination pointer aligned? */
> > - jz L(1) /* yes => begin scan loop */
> > - testb $0xff, (%edx) /* is end of string? */
> > - jz L(2) /* yes => start appending */
> > - incl %edx /* increment source pointer */
> > -
> > - testl $3, %edx /* destination pointer aligned? */
> > - jz L(1) /* yes => begin scan loop */
> > - testb $0xff, (%edx) /* is end of string? */
> > - jz L(2) /* yes => start appending */
> > - incl %edx /* increment source pointer */
> > -
> > - /* Now we are aligned. Begin scan loop. */
> > - jmp L(1)
> > -
> > - cfi_rel_offset (edi, 0)
> > - ALIGN(4)
> > -
> > -L(4): addl $16,%edx /* increment destination pointer for round
> > */
> > -
> > -L(1): movl (%edx), %eax /* get word (= 4 bytes) in question */
> > - movl $0xfefefeff, %edi /* magic value */
> > -
> > - /* If you compare this with the algorithm in memchr.S you will
> > - notice that here is an `xorl' statement missing. But you must
> > - not forget that we are looking for C == 0 and `xorl $0, %eax'
> > - is a no-op. */
> > -
> > - addl %eax, %edi /* add the magic value to the word. We
> > get
> > - carry bits reported for each byte which
> > - is *not* 0 */
> > -
> > - /* According to the algorithm we had to reverse the effect of the
> > - XOR first and then test the overflow bits. But because the
> > - following XOR would destroy the carry flag and it would (in a
> > - representation with more than 32 bits) not alter then last
> > - overflow, we can now test this condition. If no carry is
> > signaled
> > - no overflow must have occurred in the last byte => it was 0. */
> > - jnc L(3)
> > -
> > - /* We are only interested in carry bits that change due to the
> > - previous add, so remove original bits */
> > - xorl %eax, %edi /* ((word^charmask)+magic)^(word^charmask)
> > */
> > -
> > - /* Now test for the other three overflow bits. */
> > - orl $0xfefefeff, %edi /* set all non-carry bits */
> > - incl %edi /* add 1: if one carry bit was *not* set
> > - the addition will not result in 0. */
> > -
> > - /* If at least one byte of the word is C we don't get 0 in %ecx.
> > */
> > - jnz L(3)
> > -
> > - movl 4(%edx), %eax /* get word from source */
> > - movl $0xfefefeff, %edi /* magic value */
> > - addl %eax, %edi /* add the magic value to the word. We
> > get
> > - carry bits reported for each byte which
> > - is *not* 0 */
> > - jnc L(5) /* highest byte is C => stop copying */
> > - xorl %eax, %edi /* ((word^charmask)+magic)^(word^charmask)
> > */
> > - orl $0xfefefeff, %edi /* set all non-carry bits */
> > - incl %edi /* add 1: if one carry bit was *not* set
> > - the addition will not result in 0. */
> > - jnz L(5) /* one byte is NUL => stop copying */
> > -
> > - movl 8(%edx), %eax /* get word from source */
> > - movl $0xfefefeff, %edi /* magic value */
> > - addl %eax, %edi /* add the magic value to the word. We
> > get
> > - carry bits reported for each byte which
> > - is *not* 0 */
> > - jnc L(6) /* highest byte is C => stop copying */
> > - xorl %eax, %edi /* ((word^charmask)+magic)^(word^charmask)
> > */
> > - orl $0xfefefeff, %edi /* set all non-carry bits */
> > - incl %edi /* add 1: if one carry bit was *not* set
> > - the addition will not result in 0. */
> > - jnz L(6) /* one byte is NUL => stop copying */
> > -
> > - movl 12(%edx), %eax /* get word from source */
> > - movl $0xfefefeff, %edi /* magic value */
> > - addl %eax, %edi /* add the magic value to the word. We
> > get
> > - carry bits reported for each byte which
> > - is *not* 0 */
> > - jnc L(7) /* highest byte is C => stop copying */
> > - xorl %eax, %edi /* ((word^charmask)+magic)^(word^charmask)
> > */
> > - orl $0xfefefeff, %edi /* set all non-carry bits */
> > - incl %edi /* add 1: if one carry bit was *not* set
> > - the addition will not result in 0. */
> > - jz L(4) /* no byte is NUL => carry on copying */
> > -
> > -L(7): addl $4, %edx /* adjust source pointer */
> > -L(6): addl $4, %edx
> > -L(5): addl $4, %edx
> > -
> > -L(3): testb %al, %al /* is first byte NUL? */
> > - jz L(2) /* yes => start copying */
> > - incl %edx /* increment source pointer */
> > -
> > - testb %ah, %ah /* is second byte NUL? */
> > - jz L(2) /* yes => start copying */
> > - incl %edx /* increment source pointer */
> > -
> > - testl $0xff0000, %eax /* is third byte NUL? */
> > - jz L(2) /* yes => start copying */
> > - incl %edx /* increment source pointer */
> > -
> > -L(2): subl %ecx, %edx /* reduce number of loop variants */
> > -
> > - /* Now we have to align the source pointer. */
> > - testl $3, %ecx /* pointer correctly aligned? */
> > - jz L(29) /* yes => start copy loop */
> > - movb (%ecx), %al /* get first byte */
> > - movb %al, (%ecx,%edx) /* and store it */
> > - andb %al, %al /* is byte NUL? */
> > - jz L(8) /* yes => return */
> > - incl %ecx /* increment pointer */
> > -
> > - testl $3, %ecx /* pointer correctly aligned? */
> > - jz L(29) /* yes => start copy loop */
> > - movb (%ecx), %al /* get first byte */
> > - movb %al, (%ecx,%edx) /* and store it */
> > - andb %al, %al /* is byte NUL? */
> > - jz L(8) /* yes => return */
> > - incl %ecx /* increment pointer */
> > -
> > - testl $3, %ecx /* pointer correctly aligned? */
> > - jz L(29) /* yes => start copy loop */
> > - movb (%ecx), %al /* get first byte */
> > - movb %al, (%ecx,%edx) /* and store it */
> > - andb %al, %al /* is byte NUL? */
> > - jz L(8) /* yes => return */
> > - incl %ecx /* increment pointer */
> > -
> > - /* Now we are aligned. */
> > - jmp L(29) /* start copy loop */
> > -
> > - ALIGN(4)
> > -
> > -L(28): movl %eax, 12(%ecx,%edx)/* store word at destination */
> > - addl $16, %ecx /* adjust pointer for full round */
> > -
> > -L(29): movl (%ecx), %eax /* get word from source */
> > - movl $0xfefefeff, %edi /* magic value */
> > - addl %eax, %edi /* add the magic value to the word. We
> > get
> > - carry bits reported for each byte which
> > - is *not* 0 */
> > - jnc L(9) /* highest byte is C => stop copying */
> > - xorl %eax, %edi /* ((word^charmask)+magic)^(word^charmask)
> > */
> > - orl $0xfefefeff, %edi /* set all non-carry bits */
> > - incl %edi /* add 1: if one carry bit was *not* set
> > - the addition will not result in 0. */
> > - jnz L(9) /* one byte is NUL => stop copying */
> > - movl %eax, (%ecx,%edx) /* store word to destination */
> > -
> > - movl 4(%ecx), %eax /* get word from source */
> > - movl $0xfefefeff, %edi /* magic value */
> > - addl %eax, %edi /* add the magic value to the word. We
> > get
> > - carry bits reported for each byte which
> > - is *not* 0 */
> > - jnc L(91) /* highest byte is C => stop copying */
> > - xorl %eax, %edi /* ((word^charmask)+magic)^(word^charmask)
> > */
> > - orl $0xfefefeff, %edi /* set all non-carry bits */
> > - incl %edi /* add 1: if one carry bit was *not* set
> > - the addition will not result in 0. */
> > - jnz L(91) /* one byte is NUL => stop copying */
> > - movl %eax, 4(%ecx,%edx) /* store word to destination */
> > -
> > - movl 8(%ecx), %eax /* get word from source */
> > - movl $0xfefefeff, %edi /* magic value */
> > - addl %eax, %edi /* add the magic value to the word. We
> > get
> > - carry bits reported for each byte which
> > - is *not* 0 */
> > - jnc L(92) /* highest byte is C => stop copying */
> > - xorl %eax, %edi /* ((word^charmask)+magic)^(word^charmask)
> > */
> > - orl $0xfefefeff, %edi /* set all non-carry bits */
> > - incl %edi /* add 1: if one carry bit was *not* set
> > - the addition will not result in 0. */
> > - jnz L(92) /* one byte is NUL => stop copying */
> > - movl %eax, 8(%ecx,%edx) /* store word to destination */
> > -
> > - movl 12(%ecx), %eax /* get word from source */
> > - movl $0xfefefeff, %edi /* magic value */
> > - addl %eax, %edi /* add the magic value to the word. We
> > get
> > - carry bits reported for each byte which
> > - is *not* 0 */
> > - jnc L(93) /* highest byte is C => stop copying */
> > - xorl %eax, %edi /* ((word^charmask)+magic)^(word^charmask)
> > */
> > - orl $0xfefefeff, %edi /* set all non-carry bits */
> > - incl %edi /* add 1: if one carry bit was *not* set
> > - the addition will not result in 0. */
> > - jz L(28) /* no is NUL => carry on copying */
> > -
> > -L(93): addl $4, %ecx /* adjust pointer */
> > -L(92): addl $4, %ecx
> > -L(91): addl $4, %ecx
> > -
> > -L(9): movb %al, (%ecx,%edx) /* store first byte of last word */
> > - orb %al, %al /* is it NUL? */
> > - jz L(8) /* yes => return */
> > -
> > - movb %ah, 1(%ecx,%edx) /* store second byte of last word */
> > - orb %ah, %ah /* is it NUL? */
> > - jz L(8) /* yes => return */
> > -
> > - shrl $16, %eax /* make upper bytes accessible */
> > - movb %al, 2(%ecx,%edx) /* store third byte of last word */
> > - orb %al, %al /* is it NUL? */
> > - jz L(8) /* yes => return */
> > -
> > - movb %ah, 3(%ecx,%edx) /* store fourth byte of last word */
> > -
> > -L(8): /* GKM FIXME: check high bounds */
> > - movl DEST(%esp), %eax /* start address of destination is result
> > */
> > - RETURN_BOUNDED_POINTER (DEST(%esp))
> > - popl %edi /* restore saved register */
> > - cfi_adjust_cfa_offset (-4)
> > - cfi_restore (edi)
> > -
> > - LEAVE
> > - RET_PTR
> > -END (BP_SYM (strcat))
> > -libc_hidden_builtin_def (strcat)
> > diff --git a/sysdeps/i386/i686/multiarch/Makefile
> > b/sysdeps/i386/i686/multiarch/Makefile
> > index 8946bfa..92a2b8f 100644
> > --- a/sysdeps/i386/i686/multiarch/Makefile
> > +++ b/sysdeps/i386/i686/multiarch/Makefile
> > @@ -14,8 +14,7 @@ sysdep_routines += bzero-sse2 memset-sse2 memcpy-ssse3
> > mempcpy-ssse3 \
> > memcmp-ssse3 memcmp-sse4 strcasestr-nonascii varshift \
> > strlen-sse2 strlen-sse2-bsf strncpy-c strcpy-ssse3 \
> > strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 strcpy-sse2 \
> > - strncpy-sse2 stpcpy-sse2 stpncpy-sse2 strcat-ssse3 \
> > - strcat-sse2 strncat-ssse3 strncat-sse2 strncat-c \
> > + strncpy-sse2 stpcpy-sse2 stpncpy-sse2 \
> > strchr-sse2 strrchr-sse2 strchr-sse2-bsf
> > strrchr-sse2-bsf \
> > memchr-sse2 memchr-sse2-bsf \
> > memrchr-sse2 memrchr-sse2-bsf memrchr-c \
> > diff --git a/sysdeps/i386/i686/multiarch/strcat-sse2.S
> > b/sysdeps/i386/i686/multiarch/strcat-sse2.S
> > deleted file mode 100644
> > index e75f92c..0000000
> > --- a/sysdeps/i386/i686/multiarch/strcat-sse2.S
> > +++ /dev/null
> > @@ -1,1243 +0,0 @@
> > -/* strcat with SSE2
> > - Copyright (C) 2011-2012 Free Software Foundation, Inc.
> > - Contributed by Intel Corporation.
> > - This file is part of the GNU C Library.
> > -
> > - The GNU C Library is free software; you can redistribute it and/or
> > - modify it under the terms of the GNU Lesser General Public
> > - License as published by the Free Software Foundation; either
> > - version 2.1 of the License, or (at your option) any later version.
> > -
> > - The GNU C Library is distributed in the hope that it will be useful,
> > - but WITHOUT ANY WARRANTY; without even the implied warranty of
> > - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > - Lesser General Public License for more details.
> > -
> > - You should have received a copy of the GNU Lesser General Public
> > - License along with the GNU C Library; if not, see
> > - <http://www.gnu.org/licenses/>. */
> > -
> > -
> > -#ifndef NOT_IN_libc
> > -
> > -# include <sysdep.h>
> > -
> > -
> > -# define CFI_PUSH(REG) \
> > - cfi_adjust_cfa_offset (4); \
> > - cfi_rel_offset (REG, 0)
> > -
> > -# define CFI_POP(REG) \
> > - cfi_adjust_cfa_offset (-4); \
> > - cfi_restore (REG)
> > -
> > -# define PUSH(REG) pushl REG; CFI_PUSH (REG)
> > -# define POP(REG) popl REG; CFI_POP (REG)
> > -
> > -# ifdef SHARED
> > -# define JMPTBL(I, B) I - B
> > -
> > -/* Load an entry in a jump table into ECX and branch to it. TABLE is a
> > - jump table with relative offsets. INDEX is a register contains
> > the
> > - index into the jump table. SCALE is the scale of INDEX. */
> > -
> > -# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
> > - /* We first load PC into ECX. */ \
> > - SETUP_PIC_REG(cx); \
> > - /* Get the address of the jump table. */ \
> > - addl $(TABLE - .), %ecx; \
> > - /* Get the entry and convert the relative offset to the \
> > - absolute address. */ \
> > - addl (%ecx,INDEX,SCALE), %ecx; \
> > - /* We loaded the jump table and adjuested ECX. Go. */ \
> > - jmp *%ecx
> > -# else
> > -# define JMPTBL(I, B) I
> > -
> > -/* Branch to an entry in a jump table. TABLE is a jump table with
> > - absolute offsets. INDEX is a register contains the index into the
> > - jump table. SCALE is the scale of INDEX. */
> > -
> > -# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
> > - jmp *TABLE(,INDEX,SCALE)
> > -# endif
> > -
> > -# ifndef STRCAT
> > -# define STRCAT __strcat_sse2
> > -# endif
> > -
> > -# define PARMS 4
> > -# define STR1 PARMS+4
> > -# define STR2 STR1+4
> > -
> > -# ifdef USE_AS_STRNCAT
> > -# define LEN STR2+8
> > -# define STR3 STR1+4
> > -# else
> > -# define STR3 STR1
> > -# endif
> > -
> > -# define USE_AS_STRCAT
> > -# ifdef USE_AS_STRNCAT
> > -# define RETURN POP(%ebx); POP(%esi); ret; CFI_PUSH(%ebx);
> > CFI_PUSH(%esi);
> > -# else
> > -# define RETURN POP(%esi); ret; CFI_PUSH(%esi);
> > -# endif
> > -
> > -.text
> > -ENTRY (STRCAT)
> > - PUSH (%esi)
> > - mov STR1(%esp), %eax
> > - mov STR2(%esp), %esi
> > -# ifdef USE_AS_STRNCAT
> > - PUSH (%ebx)
> > - movl LEN(%esp), %ebx
> > - test %ebx, %ebx
> > - jz L(ExitZero)
> > -# endif
> > - cmpb $0, (%esi)
> > - mov %esi, %ecx
> > - mov %eax, %edx
> > - jz L(ExitZero)
> > -
> > - and $63, %ecx
> > - and $63, %edx
> > - cmp $32, %ecx
> > - ja L(StrlenCore7_1)
> > - cmp $48, %edx
> > - ja L(alignment_prolog)
> > -
> > - pxor %xmm0, %xmm0
> > - pxor %xmm4, %xmm4
> > - pxor %xmm7, %xmm7
> > - movdqu (%eax), %xmm1
> > - movdqu (%esi), %xmm5
> > - pcmpeqb %xmm1, %xmm0
> > - movdqu 16(%esi), %xmm6
> > - pmovmskb %xmm0, %ecx
> > - pcmpeqb %xmm5, %xmm4
> > - pcmpeqb %xmm6, %xmm7
> > - test %ecx, %ecx
> > - jnz L(exit_less16_)
> > - mov %eax, %ecx
> > - and $-16, %eax
> > - jmp L(loop_prolog)
> > -
> > -L(alignment_prolog):
> > - pxor %xmm0, %xmm0
> > - pxor %xmm4, %xmm4
> > - mov %edx, %ecx
> > - pxor %xmm7, %xmm7
> > - and $15, %ecx
> > - and $-16, %eax
> > - pcmpeqb (%eax), %xmm0
> > - movdqu (%esi), %xmm5
> > - movdqu 16(%esi), %xmm6
> > - pmovmskb %xmm0, %edx
> > - pcmpeqb %xmm5, %xmm4
> > - shr %cl, %edx
> > - pcmpeqb %xmm6, %xmm7
> > - test %edx, %edx
> > - jnz L(exit_less16)
> > - add %eax, %ecx
> > -
> > - pxor %xmm0, %xmm0
> > -L(loop_prolog):
> > - pxor %xmm1, %xmm1
> > - pxor %xmm2, %xmm2
> > - pxor %xmm3, %xmm3
> > - .p2align 4
> > -L(align16_loop):
> > - pcmpeqb 16(%eax), %xmm0
> > - pmovmskb %xmm0, %edx
> > - test %edx, %edx
> > - jnz L(exit16)
> > -
> > - pcmpeqb 32(%eax), %xmm1
> > - pmovmskb %xmm1, %edx
> > - test %edx, %edx
> > - jnz L(exit32)
> > -
> > - pcmpeqb 48(%eax), %xmm2
> > - pmovmskb %xmm2, %edx
> > - test %edx, %edx
> > - jnz L(exit48)
> > -
> > - pcmpeqb 64(%eax), %xmm3
> > - pmovmskb %xmm3, %edx
> > - lea 64(%eax), %eax
> > - test %edx, %edx
> > - jz L(align16_loop)
> > - bsf %edx, %edx
> > - add %edx, %eax
> > - jmp L(StartStrcpyPart)
> > -
> > - .p2align 4
> > -L(exit16):
> > - bsf %edx, %edx
> > - lea 16(%eax, %edx), %eax
> > - jmp L(StartStrcpyPart)
> > -
> > - .p2align 4
> > -L(exit32):
> > - bsf %edx, %edx
> > - lea 32(%eax, %edx), %eax
> > - jmp L(StartStrcpyPart)
> > -
> > - .p2align 4
> > -L(exit48):
> > - bsf %edx, %edx
> > - lea 48(%eax, %edx), %eax
> > - jmp L(StartStrcpyPart)
> > -
> > - .p2align 4
> > -L(exit_less16):
> > - bsf %edx, %edx
> > - add %ecx, %eax
> > - add %edx, %eax
> > - jmp L(StartStrcpyPart)
> > -
> > - .p2align 4
> > -L(exit_less16_):
> > - bsf %ecx, %ecx
> > - add %ecx, %eax
> > -
> > - .p2align 4
> > -L(StartStrcpyPart):
> > - pmovmskb %xmm4, %edx
> > -# ifdef USE_AS_STRNCAT
> > - cmp $16, %ebx
> > - jbe L(CopyFrom1To16BytesTail1Case2OrCase3)
> > -# endif
> > - test %edx, %edx
> > - jnz L(CopyFrom1To16BytesTail1)
> > -
> > - movdqu %xmm5, (%eax)
> > - pmovmskb %xmm7, %edx
> > -# ifdef USE_AS_STRNCAT
> > - cmp $32, %ebx
> > - jbe L(CopyFrom1To32Bytes1Case2OrCase3)
> > -# endif
> > - test %edx, %edx
> > - jnz L(CopyFrom1To32Bytes1)
> > -
> > - mov %esi, %ecx
> > - and $-16, %esi
> > - and $15, %ecx
> > - pxor %xmm0, %xmm0
> > -# ifdef USE_AS_STRNCAT
> > - add %ecx, %ebx
> > -# endif
> > - sub %ecx, %eax
> > - jmp L(Unalign16Both)
> > -
> > -L(StrlenCore7_1):
> > - mov %eax, %ecx
> > - pxor %xmm0, %xmm0
> > - and $15, %ecx
> > - and $-16, %eax
> > - pcmpeqb (%eax), %xmm0
> > - pmovmskb %xmm0, %edx
> > - shr %cl, %edx
> > - test %edx, %edx
> > - jnz L(exit_less16_1)
> > - add %eax, %ecx
> > -
> > - pxor %xmm0, %xmm0
> > - pxor %xmm1, %xmm1
> > - pxor %xmm2, %xmm2
> > - pxor %xmm3, %xmm3
> > -
> > - .p2align 4
> > -L(align16_loop_1):
> > - pcmpeqb 16(%eax), %xmm0
> > - pmovmskb %xmm0, %edx
> > - test %edx, %edx
> > - jnz L(exit16_1)
> > -
> > - pcmpeqb 32(%eax), %xmm1
> > - pmovmskb %xmm1, %edx
> > - test %edx, %edx
> > - jnz L(exit32_1)
> > -
> > - pcmpeqb 48(%eax), %xmm2
> > - pmovmskb %xmm2, %edx
> > - test %edx, %edx
> > - jnz L(exit48_1)
> > -
> > - pcmpeqb 64(%eax), %xmm3
> > - pmovmskb %xmm3, %edx
> > - lea 64(%eax), %eax
> > - test %edx, %edx
> > - jz L(align16_loop_1)
> > - bsf %edx, %edx
> > - add %edx, %eax
> > - jmp L(StartStrcpyPart_1)
> > -
> > - .p2align 4
> > -L(exit16_1):
> > - bsf %edx, %edx
> > - lea 16(%eax, %edx), %eax
> > - jmp L(StartStrcpyPart_1)
> > -
> > - .p2align 4
> > -L(exit32_1):
> > - bsf %edx, %edx
> > - lea 32(%eax, %edx), %eax
> > - jmp L(StartStrcpyPart_1)
> > -
> > - .p2align 4
> > -L(exit48_1):
> > - bsf %edx, %edx
> > - lea 48(%eax, %edx), %eax
> > - jmp L(StartStrcpyPart_1)
> > -
> > - .p2align 4
> > -L(exit_less16_1):
> > - bsf %edx, %edx
> > - add %ecx, %eax
> > - add %edx, %eax
> > -
> > - .p2align 4
> > -L(StartStrcpyPart_1):
> > - mov %esi, %ecx
> > - and $15, %ecx
> > - and $-16, %esi
> > - pxor %xmm0, %xmm0
> > - pxor %xmm1, %xmm1
> > -
> > -# ifdef USE_AS_STRNCAT
> > - cmp $48, %ebx
> > - ja L(BigN)
> > -# endif
> > - pcmpeqb (%esi), %xmm1
> > -# ifdef USE_AS_STRNCAT
> > - add %ecx, %ebx
> > -# endif
> > - pmovmskb %xmm1, %edx
> > - shr %cl, %edx
> > -# ifdef USE_AS_STRNCAT
> > - cmp $16, %ebx
> > - jbe L(CopyFrom1To16BytesTailCase2OrCase3)
> > -# endif
> > - test %edx, %edx
> > - jnz L(CopyFrom1To16BytesTail)
> > -
> > - pcmpeqb 16(%esi), %xmm0
> > - pmovmskb %xmm0, %edx
> > -# ifdef USE_AS_STRNCAT
> > - cmp $32, %ebx
> > - jbe L(CopyFrom1To32BytesCase2OrCase3)
> > -# endif
> > - test %edx, %edx
> > - jnz L(CopyFrom1To32Bytes)
> > -
> > - movdqu (%esi, %ecx), %xmm1 /* copy 16 bytes */
> > - movdqu %xmm1, (%eax)
> > - sub %ecx, %eax
> > -
> > - .p2align 4
> > -L(Unalign16Both):
> > - mov $16, %ecx
> > - movdqa (%esi, %ecx), %xmm1
> > - movaps 16(%esi, %ecx), %xmm2
> > - movdqu %xmm1, (%eax, %ecx)
> > - pcmpeqb %xmm2, %xmm0
> > - pmovmskb %xmm0, %edx
> > - add $16, %ecx
> > -# ifdef USE_AS_STRNCAT
> > - sub $48, %ebx
> > - jbe L(CopyFrom1To16BytesCase2OrCase3)
> > -# endif
> > - test %edx, %edx
> > - jnz L(CopyFrom1To16Bytes)
> > -L(Unalign16BothBigN):
> > - movaps 16(%esi, %ecx), %xmm3
> > - movdqu %xmm2, (%eax, %ecx)
> > - pcmpeqb %xmm3, %xmm0
> > - pmovmskb %xmm0, %edx
> > - add $16, %ecx
> > -# ifdef USE_AS_STRNCAT
> > - sub $16, %ebx
> > - jbe L(CopyFrom1To16BytesCase2OrCase3)
> > -# endif
> > - test %edx, %edx
> > - jnz L(CopyFrom1To16Bytes)
> > -
> > - movaps 16(%esi, %ecx), %xmm4
> > - movdqu %xmm3, (%eax, %ecx)
> > - pcmpeqb %xmm4, %xmm0
> > - pmovmskb %xmm0, %edx
> > - add $16, %ecx
> > -# ifdef USE_AS_STRNCAT
> > - sub $16, %ebx
> > - jbe L(CopyFrom1To16BytesCase2OrCase3)
> > -# endif
> > - test %edx, %edx
> > - jnz L(CopyFrom1To16Bytes)
> > -
> > - movaps 16(%esi, %ecx), %xmm1
> > - movdqu %xmm4, (%eax, %ecx)
> > - pcmpeqb %xmm1, %xmm0
> > - pmovmskb %xmm0, %edx
> > - add $16, %ecx
> > -# ifdef USE_AS_STRNCAT
> > - sub $16, %ebx
> > - jbe L(CopyFrom1To16BytesCase2OrCase3)
> > -# endif
> > - test %edx, %edx
> > - jnz L(CopyFrom1To16Bytes)
> > -
> > - movaps 16(%esi, %ecx), %xmm2
> > - movdqu %xmm1, (%eax, %ecx)
> > - pcmpeqb %xmm2, %xmm0
> > - pmovmskb %xmm0, %edx
> > - add $16, %ecx
> > -# ifdef USE_AS_STRNCAT
> > - sub $16, %ebx
> > - jbe L(CopyFrom1To16BytesCase2OrCase3)
> > -# endif
> > - test %edx, %edx
> > - jnz L(CopyFrom1To16Bytes)
> > -
> > - movaps 16(%esi, %ecx), %xmm3
> > - movdqu %xmm2, (%eax, %ecx)
> > - pcmpeqb %xmm3, %xmm0
> > - pmovmskb %xmm0, %edx
> > - add $16, %ecx
> > -# ifdef USE_AS_STRNCAT
> > - sub $16, %ebx
> > - jbe L(CopyFrom1To16BytesCase2OrCase3)
> > -# endif
> > - test %edx, %edx
> > - jnz L(CopyFrom1To16Bytes)
> > -
> > - movdqu %xmm3, (%eax, %ecx)
> > - mov %esi, %edx
> > - lea 16(%esi, %ecx), %esi
> > - and $-0x40, %esi
> > - sub %esi, %edx
> > - sub %edx, %eax
> > -# ifdef USE_AS_STRNCAT
> > - lea 128(%ebx, %edx), %ebx
> > -# endif
> > - movaps (%esi), %xmm2
> > - movaps %xmm2, %xmm4
> > - movaps 16(%esi), %xmm5
> > - movaps 32(%esi), %xmm3
> > - movaps %xmm3, %xmm6
> > - movaps 48(%esi), %xmm7
> > - pminub %xmm5, %xmm2
> > - pminub %xmm7, %xmm3
> > - pminub %xmm2, %xmm3
> > - pcmpeqb %xmm0, %xmm3
> > - pmovmskb %xmm3, %edx
> > -# ifdef USE_AS_STRNCAT
> > - sub $64, %ebx
> > - jbe L(UnalignedLeaveCase2OrCase3)
> > -# endif
> > - test %edx, %edx
> > - jnz L(Unaligned64Leave)
> > -
> > - .p2align 4
> > -L(Unaligned64Loop_start):
> > - add $64, %eax
> > - add $64, %esi
> > - movdqu %xmm4, -64(%eax)
> > - movaps (%esi), %xmm2
> > - movdqa %xmm2, %xmm4
> > - movdqu %xmm5, -48(%eax)
> > - movaps 16(%esi), %xmm5
> > - pminub %xmm5, %xmm2
> > - movaps 32(%esi), %xmm3
> > - movdqu %xmm6, -32(%eax)
> > - movaps %xmm3, %xmm6
> > - movdqu %xmm7, -16(%eax)
> > - movaps 48(%esi), %xmm7
> > - pminub %xmm7, %xmm3
> > - pminub %xmm2, %xmm3
> > - pcmpeqb %xmm0, %xmm3
> > - pmovmskb %xmm3, %edx
> > -# ifdef USE_AS_STRNCAT
> > - sub $64, %ebx
> > - jbe L(UnalignedLeaveCase2OrCase3)
> > -# endif
> > - test %edx, %edx
> > - jz L(Unaligned64Loop_start)
> > -
> > -L(Unaligned64Leave):
> > - pxor %xmm1, %xmm1
> > -
> > - pcmpeqb %xmm4, %xmm0
> > - pcmpeqb %xmm5, %xmm1
> > - pmovmskb %xmm0, %edx
> > - pmovmskb %xmm1, %ecx
> > - test %edx, %edx
> > - jnz L(CopyFrom1To16BytesUnaligned_0)
> > - test %ecx, %ecx
> > - jnz L(CopyFrom1To16BytesUnaligned_16)
> > -
> > - pcmpeqb %xmm6, %xmm0
> > - pcmpeqb %xmm7, %xmm1
> > - pmovmskb %xmm0, %edx
> > - pmovmskb %xmm1, %ecx
> > - test %edx, %edx
> > - jnz L(CopyFrom1To16BytesUnaligned_32)
> > -
> > - bsf %ecx, %edx
> > - movdqu %xmm4, (%eax)
> > - movdqu %xmm5, 16(%eax)
> > - movdqu %xmm6, 32(%eax)
> > - add $48, %esi
> > - add $48, %eax
> > - BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
> > -
> > -# ifdef USE_AS_STRNCAT
> > - .p2align 4
> > -L(BigN):
> > - pcmpeqb (%esi), %xmm1
> > - pmovmskb %xmm1, %edx
> > - shr %cl, %edx
> > - test %edx, %edx
> > - jnz L(CopyFrom1To16BytesTail)
> > -
> > - pcmpeqb 16(%esi), %xmm0
> > - pmovmskb %xmm0, %edx
> > - test %edx, %edx
> > - jnz L(CopyFrom1To32Bytes)
> > -
> > - movdqu (%esi, %ecx), %xmm1 /* copy 16 bytes */
> > - movdqu %xmm1, (%eax)
> > - sub %ecx, %eax
> > - sub $48, %ebx
> > - add %ecx, %ebx
> > -
> > - mov $16, %ecx
> > - movdqa (%esi, %ecx), %xmm1
> > - movaps 16(%esi, %ecx), %xmm2
> > - movdqu %xmm1, (%eax, %ecx)
> > - pcmpeqb %xmm2, %xmm0
> > - pmovmskb %xmm0, %edx
> > - add $16, %ecx
> > - test %edx, %edx
> > - jnz L(CopyFrom1To16Bytes)
> > - jmp L(Unalign16BothBigN)
> > -# endif
> > -
> > -/*------------end of main part-------------------------------*/
> > -
> > -/* Case1 */
> > - .p2align 4
> > -L(CopyFrom1To16Bytes):
> > - add %ecx, %eax
> > - add %ecx, %esi
> > - bsf %edx, %edx
> > - BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
> > -
> > - .p2align 4
> > -L(CopyFrom1To16BytesTail):
> > - add %ecx, %esi
> > - bsf %edx, %edx
> > - BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
> > -
> > - .p2align 4
> > -L(CopyFrom1To32Bytes1):
> > - add $16, %esi
> > - add $16, %eax
> > -L(CopyFrom1To16BytesTail1):
> > - bsf %edx, %edx
> > - BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
> > -
> > - .p2align 4
> > -L(CopyFrom1To32Bytes):
> > - bsf %edx, %edx
> > - add %ecx, %esi
> > - add $16, %edx
> > - sub %ecx, %edx
> > - BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
> > -
> > - .p2align 4
> > -L(CopyFrom1To16BytesUnaligned_0):
> > - bsf %edx, %edx
> > - BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
> > -
> > - .p2align 4
> > -L(CopyFrom1To16BytesUnaligned_16):
> > - bsf %ecx, %edx
> > - movdqu %xmm4, (%eax)
> > - add $16, %esi
> > - add $16, %eax
> > - BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
> > -
> > - .p2align 4
> > -L(CopyFrom1To16BytesUnaligned_32):
> > - bsf %edx, %edx
> > - movdqu %xmm4, (%eax)
> > - movdqu %xmm5, 16(%eax)
> > - add $32, %esi
> > - add $32, %eax
> > - BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
> > -
> > -# ifdef USE_AS_STRNCAT
> > -
> > - .p2align 4
> > -L(CopyFrom1To16BytesExit):
> > - BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
> > -
> > -/* Case2 */
> > -
> > - .p2align 4
> > -L(CopyFrom1To16BytesCase2):
> > - add $16, %ebx
> > - add %ecx, %eax
> > - add %ecx, %esi
> > - bsf %edx, %edx
> > - cmp %ebx, %edx
> > - jb L(CopyFrom1To16BytesExit)
> > - BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
> > -
> > - .p2align 4
> > -L(CopyFrom1To32BytesCase2):
> > - sub %ecx, %ebx
> > - add %ecx, %esi
> > - bsf %edx, %edx
> > - add $16, %edx
> > - sub %ecx, %edx
> > - cmp %ebx, %edx
> > - jb L(CopyFrom1To16BytesExit)
> > - BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
> > -
> > -L(CopyFrom1To16BytesTailCase2):
> > - sub %ecx, %ebx
> > - add %ecx, %esi
> > - bsf %edx, %edx
> > - cmp %ebx, %edx
> > - jb L(CopyFrom1To16BytesExit)
> > - BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
> > -
> > -L(CopyFrom1To16BytesTail1Case2):
> > - bsf %edx, %edx
> > - cmp %ebx, %edx
> > - jb L(CopyFrom1To16BytesExit)
> > - BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
> > -
> > -/* Case2 or Case3, Case3 */
> > -
> > - .p2align 4
> > -L(CopyFrom1To16BytesCase2OrCase3):
> > - test %edx, %edx
> > - jnz L(CopyFrom1To16BytesCase2)
> > -L(CopyFrom1To16BytesCase3):
> > - add $16, %ebx
> > - add %ecx, %eax
> > - add %ecx, %esi
> > - BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
> > -
> > - .p2align 4
> > -L(CopyFrom1To32BytesCase2OrCase3):
> > - test %edx, %edx
> > - jnz L(CopyFrom1To32BytesCase2)
> > - sub %ecx, %ebx
> > - add %ecx, %esi
> > - BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
> > -
> > - .p2align 4
> > -L(CopyFrom1To16BytesTailCase2OrCase3):
> > - test %edx, %edx
> > - jnz L(CopyFrom1To16BytesTailCase2)
> > - sub %ecx, %ebx
> > - add %ecx, %esi
> > - BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
> > -
> > - .p2align 4
> > -L(CopyFrom1To32Bytes1Case2OrCase3):
> > - add $16, %eax
> > - add $16, %esi
> > - sub $16, %ebx
> > -L(CopyFrom1To16BytesTail1Case2OrCase3):
> > - test %edx, %edx
> > - jnz L(CopyFrom1To16BytesTail1Case2)
> > - BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
> > -
> > -# endif
> > -
> > -# ifdef USE_AS_STRNCAT
> > - .p2align 4
> > -L(StrncatExit0):
> > - movb %bh, (%eax)
> > - mov STR3(%esp), %eax
> > - RETURN
> > -# endif
> > -
> > - .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit1):
> > - movb %bh, 1(%eax)
> > -# endif
> > -L(Exit1):
> > -# ifdef USE_AS_STRNCAT
> > - movb (%esi), %dh
> > -# endif
> > - movb %dh, (%eax)
> > - mov STR3(%esp), %eax
> > - RETURN
> > -
> > - .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit2):
> > - movb %bh, 2(%eax)
> > -# endif
> > -L(Exit2):
> > - movw (%esi), %dx
> > - movw %dx, (%eax)
> > - mov STR3(%esp), %eax
> > - RETURN
> > -
> > - .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit3):
> > - movb %bh, 3(%eax)
> > -# endif
> > -L(Exit3):
> > - movw (%esi), %cx
> > - movw %cx, (%eax)
> > -# ifdef USE_AS_STRNCAT
> > - movb 2(%esi), %dh
> > -# endif
> > - movb %dh, 2(%eax)
> > - mov STR3(%esp), %eax
> > - RETURN
> > -
> > - .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit4):
> > - movb %bh, 4(%eax)
> > -# endif
> > -L(Exit4):
> > - movl (%esi), %edx
> > - movl %edx, (%eax)
> > - mov STR3(%esp), %eax
> > - RETURN
> > -
> > - .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit5):
> > - movb %bh, 5(%eax)
> > -# endif
> > -L(Exit5):
> > - movl (%esi), %ecx
> > -# ifdef USE_AS_STRNCAT
> > - movb 4(%esi), %dh
> > -# endif
> > - movb %dh, 4(%eax)
> > - movl %ecx, (%eax)
> > - mov STR3(%esp), %eax
> > - RETURN
> > -
> > - .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit6):
> > - movb %bh, 6(%eax)
> > -# endif
> > -L(Exit6):
> > - movl (%esi), %ecx
> > - movw 4(%esi), %dx
> > - movl %ecx, (%eax)
> > - movw %dx, 4(%eax)
> > - mov STR3(%esp), %eax
> > - RETURN
> > -
> > - .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit7):
> > - movb %bh, 7(%eax)
> > -# endif
> > -L(Exit7):
> > - movl (%esi), %ecx
> > - movl 3(%esi), %edx
> > - movl %ecx, (%eax)
> > - movl %edx, 3(%eax)
> > - mov STR3(%esp), %eax
> > - RETURN
> > -
> > - .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit8):
> > - movb %bh, 8(%eax)
> > -# endif
> > -L(Exit8):
> > - movlpd (%esi), %xmm0
> > - movlpd %xmm0, (%eax)
> > - mov STR3(%esp), %eax
> > - RETURN
> > -
> > - .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit9):
> > - movb %bh, 9(%eax)
> > -# endif
> > -L(Exit9):
> > - movlpd (%esi), %xmm0
> > -# ifdef USE_AS_STRNCAT
> > - movb 8(%esi), %dh
> > -# endif
> > - movb %dh, 8(%eax)
> > - movlpd %xmm0, (%eax)
> > - mov STR3(%esp), %eax
> > - RETURN
> > -
> > - .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit10):
> > - movb %bh, 10(%eax)
> > -# endif
> > -L(Exit10):
> > - movlpd (%esi), %xmm0
> > - movw 8(%esi), %dx
> > - movlpd %xmm0, (%eax)
> > - movw %dx, 8(%eax)
> > - mov STR3(%esp), %eax
> > - RETURN
> > -
> > - .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit11):
> > - movb %bh, 11(%eax)
> > -# endif
> > -L(Exit11):
> > - movlpd (%esi), %xmm0
> > - movl 7(%esi), %edx
> > - movlpd %xmm0, (%eax)
> > - movl %edx, 7(%eax)
> > - mov STR3(%esp), %eax
> > - RETURN
> > -
> > - .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit12):
> > - movb %bh, 12(%eax)
> > -# endif
> > -L(Exit12):
> > - movlpd (%esi), %xmm0
> > - movl 8(%esi), %edx
> > - movlpd %xmm0, (%eax)
> > - movl %edx, 8(%eax)
> > - mov STR3(%esp), %eax
> > - RETURN
> > -
> > - .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit13):
> > - movb %bh, 13(%eax)
> > -# endif
> > -L(Exit13):
> > - movlpd (%esi), %xmm0
> > - movlpd 5(%esi), %xmm1
> > - movlpd %xmm0, (%eax)
> > - movlpd %xmm1, 5(%eax)
> > - mov STR3(%esp), %eax
> > - RETURN
> > -
> > - .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit14):
> > - movb %bh, 14(%eax)
> > -# endif
> > -L(Exit14):
> > - movlpd (%esi), %xmm0
> > - movlpd 6(%esi), %xmm1
> > - movlpd %xmm0, (%eax)
> > - movlpd %xmm1, 6(%eax)
> > - mov STR3(%esp), %eax
> > - RETURN
> > -
> > - .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit15):
> > - movb %bh, 15(%eax)
> > -# endif
> > -L(Exit15):
> > - movlpd (%esi), %xmm0
> > - movlpd 7(%esi), %xmm1
> > - movlpd %xmm0, (%eax)
> > - movlpd %xmm1, 7(%eax)
> > - mov STR3(%esp), %eax
> > - RETURN
> > -
> > - .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit16):
> > - movb %bh, 16(%eax)
> > -# endif
> > -L(Exit16):
> > - movdqu (%esi), %xmm0
> > - movdqu %xmm0, (%eax)
> > - mov STR3(%esp), %eax
> > - RETURN
> > -
> > - .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit17):
> > - movb %bh, 17(%eax)
> > -# endif
> > -L(Exit17):
> > - movdqu (%esi), %xmm0
> > -# ifdef USE_AS_STRNCAT
> > - movb 16(%esi), %dh
> > -# endif
> > - movdqu %xmm0, (%eax)
> > - movb %dh, 16(%eax)
> > - mov STR3(%esp), %eax
> > - RETURN
> > -
> > - .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit18):
> > - movb %bh, 18(%eax)
> > -# endif
> > -L(Exit18):
> > - movdqu (%esi), %xmm0
> > - movw 16(%esi), %cx
> > - movdqu %xmm0, (%eax)
> > - movw %cx, 16(%eax)
> > - mov STR3(%esp), %eax
> > - RETURN
> > -
> > - .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit19):
> > - movb %bh, 19(%eax)
> > -# endif
> > -L(Exit19):
> > - movdqu (%esi), %xmm0
> > - movl 15(%esi), %ecx
> > - movdqu %xmm0, (%eax)
> > - movl %ecx, 15(%eax)
> > - mov STR3(%esp), %eax
> > - RETURN
> > -
> > - .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit20):
> > - movb %bh, 20(%eax)
> > -# endif
> > -L(Exit20):
> > - movdqu (%esi), %xmm0
> > - movl 16(%esi), %ecx
> > - movdqu %xmm0, (%eax)
> > - movl %ecx, 16(%eax)
> > - mov STR3(%esp), %eax
> > - RETURN
> > -
> > - .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit21):
> > - movb %bh, 21(%eax)
> > -# endif
> > -L(Exit21):
> > - movdqu (%esi), %xmm0
> > - movl 16(%esi), %ecx
> > -# ifdef USE_AS_STRNCAT
> > - movb 20(%esi), %dh
> > -# endif
> > - movdqu %xmm0, (%eax)
> > - movl %ecx, 16(%eax)
> > - movb %dh, 20(%eax)
> > - mov STR3(%esp), %eax
> > - RETURN
> > -
> > - .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit22):
> > - movb %bh, 22(%eax)
> > -# endif
> > -L(Exit22):
> > - movdqu (%esi), %xmm0
> > - movlpd 14(%esi), %xmm3
> > - movdqu %xmm0, (%eax)
> > - movlpd %xmm3, 14(%eax)
> > - mov STR3(%esp), %eax
> > - RETURN
> > -
> > - .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit23):
> > - movb %bh, 23(%eax)
> > -# endif
> > -L(Exit23):
> > - movdqu (%esi), %xmm0
> > - movlpd 15(%esi), %xmm3
> > - movdqu %xmm0, (%eax)
> > - movlpd %xmm3, 15(%eax)
> > - mov STR3(%esp), %eax
> > - RETURN
> > -
> > - .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit24):
> > - movb %bh, 24(%eax)
> > -# endif
> > -L(Exit24):
> > - movdqu (%esi), %xmm0
> > - movlpd 16(%esi), %xmm2
> > - movdqu %xmm0, (%eax)
> > - movlpd %xmm2, 16(%eax)
> > - mov STR3(%esp), %eax
> > - RETURN
> > -
> > - .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit25):
> > - movb %bh, 25(%eax)
> > -# endif
> > -L(Exit25):
> > - movdqu (%esi), %xmm0
> > - movlpd 16(%esi), %xmm2
> > -# ifdef USE_AS_STRNCAT
> > - movb 24(%esi), %dh
> > -# endif
> > - movdqu %xmm0, (%eax)
> > - movlpd %xmm2, 16(%eax)
> > - movb %dh, 24(%eax)
> > - mov STR3(%esp), %eax
> > - RETURN
> > -
> > - .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit26):
> > - movb %bh, 26(%eax)
> > -# endif
> > -L(Exit26):
> > - movdqu (%esi), %xmm0
> > - movlpd 16(%esi), %xmm2
> > - movw 24(%esi), %cx
> > - movdqu %xmm0, (%eax)
> > - movlpd %xmm2, 16(%eax)
> > - movw %cx, 24(%eax)
> > - mov STR3(%esp), %eax
> > - RETURN
> > -
> > - .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit27):
> > - movb %bh, 27(%eax)
> > -# endif
> > -L(Exit27):
> > - movdqu (%esi), %xmm0
> > - movlpd 16(%esi), %xmm2
> > - movl 23(%esi), %ecx
> > - movdqu %xmm0, (%eax)
> > - movlpd %xmm2, 16(%eax)
> > - movl %ecx, 23(%eax)
> > - mov STR3(%esp), %eax
> > - RETURN
> > -
> > - .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit28):
> > - movb %bh, 28(%eax)
> > -# endif
> > -L(Exit28):
> > - movdqu (%esi), %xmm0
> > - movlpd 16(%esi), %xmm2
> > - movl 24(%esi), %ecx
> > - movdqu %xmm0, (%eax)
> > - movlpd %xmm2, 16(%eax)
> > - movl %ecx, 24(%eax)
> > - mov STR3(%esp), %eax
> > - RETURN
> > -
> > - .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit29):
> > - movb %bh, 29(%eax)
> > -# endif
> > -L(Exit29):
> > - movdqu (%esi), %xmm0
> > - movdqu 13(%esi), %xmm2
> > - movdqu %xmm0, (%eax)
> > - movdqu %xmm2, 13(%eax)
> > - mov STR3(%esp), %eax
> > - RETURN
> > -
> > - .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit30):
> > - movb %bh, 30(%eax)
> > -# endif
> > -L(Exit30):
> > - movdqu (%esi), %xmm0
> > - movdqu 14(%esi), %xmm2
> > - movdqu %xmm0, (%eax)
> > - movdqu %xmm2, 14(%eax)
> > - mov STR3(%esp), %eax
> > - RETURN
> > -
> > - .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit31):
> > - movb %bh, 31(%eax)
> > -# endif
> > -L(Exit31):
> > - movdqu (%esi), %xmm0
> > - movdqu 15(%esi), %xmm2
> > - movdqu %xmm0, (%eax)
> > - movdqu %xmm2, 15(%eax)
> > - mov STR3(%esp), %eax
> > - RETURN
> > -
> > - .p2align 4
> > -# ifdef USE_AS_STRNCAT
> > -L(StrncatExit32):
> > - movb %bh, 32(%eax)
> > -# endif
> > -L(Exit32):
> > - movdqu (%esi), %xmm0
> > - movdqu 16(%esi), %xmm2
> > - movdqu %xmm0, (%eax)
> > - movdqu %xmm2, 16(%eax)
> > - mov STR3(%esp), %eax
> > - RETURN
> > -
> > -# ifdef USE_AS_STRNCAT
> > -
> > - .p2align 4
> > -L(UnalignedLeaveCase2OrCase3):
> > - test %edx, %edx
> > - jnz L(Unaligned64LeaveCase2)
> > -L(Unaligned64LeaveCase3):
> > - lea 64(%ebx), %ecx
> > - and $-16, %ecx
> > - add $48, %ebx
> > - jl L(CopyFrom1To16BytesCase3)
> > - movdqu %xmm4, (%eax)
> > - sub $16, %ebx
> > - jb L(CopyFrom1To16BytesCase3)
> > - movdqu %xmm5, 16(%eax)
> > - sub $16, %ebx
> > - jb L(CopyFrom1To16BytesCase3)
> > - movdqu %xmm6, 32(%eax)
> > - sub $16, %ebx
> > - jb L(CopyFrom1To16BytesCase3)
> > - movdqu %xmm7, 48(%eax)
> > - xor %bh, %bh
> > - movb %bh, 64(%eax)
> > - mov STR3(%esp), %eax
> > - RETURN
> > -
> > - .p2align 4
> > -L(Unaligned64LeaveCase2):
> > - xor %ecx, %ecx
> > - pcmpeqb %xmm4, %xmm0
> > - pmovmskb %xmm0, %edx
> > - add $48, %ebx
> > - jle L(CopyFrom1To16BytesCase2OrCase3)
> > - test %edx, %edx
> > - jnz L(CopyFrom1To16Bytes)
> > -
> > - pcmpeqb %xmm5, %xmm0
> > - pmovmskb %xmm0, %edx
> > - movdqu %xmm4, (%eax)
> > - add $16, %ecx
> > - sub $16, %ebx
> > - jbe L(CopyFrom1To16BytesCase2OrCase3)
> > - test %edx, %edx
> > - jnz L(CopyFrom1To16Bytes)
> > -
> > - pcmpeqb %xmm6, %xmm0
> > - pmovmskb %xmm0, %edx
> > - movdqu %xmm5, 16(%eax)
> > - add $16, %ecx
> > - sub $16, %ebx
> > - jbe L(CopyFrom1To16BytesCase2OrCase3)
> > - test %edx, %edx
> > - jnz L(CopyFrom1To16Bytes)
> > -
> > - pcmpeqb %xmm7, %xmm0
> > - pmovmskb %xmm0, %edx
> > - movdqu %xmm6, 32(%eax)
> > - lea 16(%eax, %ecx), %eax
> > - lea 16(%esi, %ecx), %esi
> > - bsf %edx, %edx
> > - cmp %ebx, %edx
> > - jb L(CopyFrom1To16BytesExit)
> > - BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
> > -# endif
> > - .p2align 4
> > -L(ExitZero):
> > - RETURN
> > -
> > -END (STRCAT)
> > -
> > - .p2align 4
> > - .section .rodata
> > -L(ExitTable):
> > - .int JMPTBL(L(Exit1), L(ExitTable))
> > - .int JMPTBL(L(Exit2), L(ExitTable))
> > - .int JMPTBL(L(Exit3), L(ExitTable))
> > - .int JMPTBL(L(Exit4), L(ExitTable))
> > - .int JMPTBL(L(Exit5), L(ExitTable))
> > - .int JMPTBL(L(Exit6), L(ExitTable))
> > - .int JMPTBL(L(Exit7), L(ExitTable))
> > - .int JMPTBL(L(Exit8), L(ExitTable))
> > - .int JMPTBL(L(Exit9), L(ExitTable))
> > - .int JMPTBL(L(Exit10), L(ExitTable))
> > - .int JMPTBL(L(Exit11), L(ExitTable))
> > - .int JMPTBL(L(Exit12), L(ExitTable))
> > - .int JMPTBL(L(Exit13), L(ExitTable))
> > - .int JMPTBL(L(Exit14), L(ExitTable))
> > - .int JMPTBL(L(Exit15), L(ExitTable))
> > - .int JMPTBL(L(Exit16), L(ExitTable))
> > - .int JMPTBL(L(Exit17), L(ExitTable))
> > - .int JMPTBL(L(Exit18), L(ExitTable))
> > - .int JMPTBL(L(Exit19), L(ExitTable))
> > - .int JMPTBL(L(Exit20), L(ExitTable))
> > - .int JMPTBL(L(Exit21), L(ExitTable))
> > - .int JMPTBL(L(Exit22), L(ExitTable))
> > - .int JMPTBL(L(Exit23), L(ExitTable))
> > - .int JMPTBL(L(Exit24), L(ExitTable))
> > - .int JMPTBL(L(Exit25), L(ExitTable))
> > - .int JMPTBL(L(Exit26), L(ExitTable))
> > - .int JMPTBL(L(Exit27), L(ExitTable))
> > - .int JMPTBL(L(Exit28), L(ExitTable))
> > - .int JMPTBL(L(Exit29), L(ExitTable))
> > - .int JMPTBL(L(Exit30), L(ExitTable))
> > - .int JMPTBL(L(Exit31), L(ExitTable))
> > - .int JMPTBL(L(Exit32), L(ExitTable))
> > -# ifdef USE_AS_STRNCAT
> > -L(ExitStrncatTable):
> > - .int JMPTBL(L(StrncatExit0), L(ExitStrncatTable))
> > - .int JMPTBL(L(StrncatExit1), L(ExitStrncatTable))
> > - .int JMPTBL(L(StrncatExit2), L(ExitStrncatTable))
> > - .int JMPTBL(L(StrncatExit3), L(ExitStrncatTable))
> > - .int JMPTBL(L(StrncatExit4), L(ExitStrncatTable))
> > - .int JMPTBL(L(StrncatExit5), L(ExitStrncatTable))
> > - .int JMPTBL(L(StrncatExit6), L(ExitStrncatTable))
> > - .int JMPTBL(L(StrncatExit7), L(ExitStrncatTable))
> > - .int JMPTBL(L(StrncatExit8), L(ExitStrncatTable))
> > - .int JMPTBL(L(StrncatExit9), L(ExitStrncatTable))
> > - .int JMPTBL(L(StrncatExit10), L(ExitStrncatTable))
> > - .int JMPTBL(L(StrncatExit11), L(ExitStrncatTable))
> > - .int JMPTBL(L(StrncatExit12), L(ExitStrncatTable))
> > - .int JMPTBL(L(StrncatExit13), L(ExitStrncatTable))
> > - .int JMPTBL(L(StrncatExit14), L(ExitStrncatTable))
> > - .int JMPTBL(L(StrncatExit15), L(ExitStrncatTable))
> > - .int JMPTBL(L(StrncatExit16), L(ExitStrncatTable))
> > - .int JMPTBL(L(StrncatExit17), L(ExitStrncatTable))
> > - .int JMPTBL(L(StrncatExit18), L(ExitStrncatTable))
> > - .int JMPTBL(L(StrncatExit19), L(ExitStrncatTable))
> > - .int JMPTBL(L(StrncatExit20), L(ExitStrncatTable))
> > - .int JMPTBL(L(StrncatExit21), L(ExitStrncatTable))
> > - .int JMPTBL(L(StrncatExit22), L(ExitStrncatTable))
> > - .int JMPTBL(L(StrncatExit23), L(ExitStrncatTable))
> > - .int JMPTBL(L(StrncatExit24), L(ExitStrncatTable))
> > - .int JMPTBL(L(StrncatExit25), L(ExitStrncatTable))
> > - .int JMPTBL(L(StrncatExit26), L(ExitStrncatTable))
> > - .int JMPTBL(L(StrncatExit27), L(ExitStrncatTable))
> > - .int JMPTBL(L(StrncatExit28), L(ExitStrncatTable))
> > - .int JMPTBL(L(StrncatExit29), L(ExitStrncatTable))
> > - .int JMPTBL(L(StrncatExit30), L(ExitStrncatTable))
> > - .int JMPTBL(L(StrncatExit31), L(ExitStrncatTable))
> > - .int JMPTBL(L(StrncatExit32), L(ExitStrncatTable))
> > -# endif
> > -#endif
> > diff --git a/sysdeps/i386/i686/multiarch/strcat-ssse3.S b/sysdeps/i386/i686/multiarch/strcat-ssse3.S
> > deleted file mode 100644
> > index 72bc49c..0000000
> > --- a/sysdeps/i386/i686/multiarch/strcat-ssse3.S
> > +++ /dev/null
> > @@ -1,572 +0,0 @@
> > -/* strcat with SSSE3
> > - Copyright (C) 2011 Free Software Foundation, Inc.
> > - Contributed by Intel Corporation.
> > - This file is part of the GNU C Library.
> > -
> > - The GNU C Library is free software; you can redistribute it and/or
> > - modify it under the terms of the GNU Lesser General Public
> > - License as published by the Free Software Foundation; either
> > - version 2.1 of the License, or (at your option) any later version.
> > -
> > - The GNU C Library is distributed in the hope that it will be useful,
> > - but WITHOUT ANY WARRANTY; without even the implied warranty of
> > - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > - Lesser General Public License for more details.
> > -
> > - You should have received a copy of the GNU Lesser General Public
> > - License along with the GNU C Library; if not, see
> > - <http://www.gnu.org/licenses/>. */
> > -
> > -
> > -#ifndef NOT_IN_libc
> > -
> > -# include <sysdep.h>
> > -
> > -# define CFI_PUSH(REG) \
> > - cfi_adjust_cfa_offset (4); \
> > - cfi_rel_offset (REG, 0)
> > -
> > -# define CFI_POP(REG) \
> > - cfi_adjust_cfa_offset (-4); \
> > - cfi_restore (REG)
> > -
> > -# define PUSH(REG) pushl REG; CFI_PUSH (REG)
> > -# define POP(REG) popl REG; CFI_POP (REG)
> > -
> > -# ifndef STRCAT
> > -# define STRCAT __strcat_ssse3
> > -# endif
> > -
> > -# define PARMS 4
> > -# define STR1 PARMS+4
> > -# define STR2 STR1+4
> > -
> > -# ifdef USE_AS_STRNCAT
> > -# define LEN STR2+8
> > -# endif
> > -
> > -# define USE_AS_STRCAT
> > -
> > -.text
> > -ENTRY (STRCAT)
> > - PUSH (%edi)
> > - mov STR1(%esp), %edi
> > - mov %edi, %edx
> > -
> > -# define RETURN jmp L(StartStrcpyPart)
> > -# include "strlen-sse2.S"
> > -
> > -L(StartStrcpyPart):
> > - mov STR2(%esp), %ecx
> > - lea (%edi, %eax), %edx
> > -# ifdef USE_AS_STRNCAT
> > - PUSH (%ebx)
> > - mov LEN(%esp), %ebx
> > - test %ebx, %ebx
> > - jz L(StrncatExit0)
> > - cmp $8, %ebx
> > - jbe L(StrncatExit8Bytes)
> > -# endif
> > - cmpb $0, (%ecx)
> > - jz L(Exit1)
> > - cmpb $0, 1(%ecx)
> > - jz L(Exit2)
> > - cmpb $0, 2(%ecx)
> > - jz L(Exit3)
> > - cmpb $0, 3(%ecx)
> > - jz L(Exit4)
> > - cmpb $0, 4(%ecx)
> > - jz L(Exit5)
> > - cmpb $0, 5(%ecx)
> > - jz L(Exit6)
> > - cmpb $0, 6(%ecx)
> > - jz L(Exit7)
> > - cmpb $0, 7(%ecx)
> > - jz L(Exit8)
> > - cmpb $0, 8(%ecx)
> > - jz L(Exit9)
> > -# ifdef USE_AS_STRNCAT
> > - cmp $16, %ebx
> > - jb L(StrncatExit15Bytes)
> > -# endif
> > - cmpb $0, 9(%ecx)
> > - jz L(Exit10)
> > - cmpb $0, 10(%ecx)
> > - jz L(Exit11)
> > - cmpb $0, 11(%ecx)
> > - jz L(Exit12)
> > - cmpb $0, 12(%ecx)
> > - jz L(Exit13)
> > - cmpb $0, 13(%ecx)
> > - jz L(Exit14)
> > - cmpb $0, 14(%ecx)
> > - jz L(Exit15)
> > - cmpb $0, 15(%ecx)
> > - jz L(Exit16)
> > -# ifdef USE_AS_STRNCAT
> > - cmp $16, %ebx
> > - je L(StrncatExit16)
> > -
> > -# define RETURN1 \
> > - POP (%ebx); \
> > - POP (%edi); \
> > - ret; \
> > - CFI_PUSH (%ebx); \
> > - CFI_PUSH (%edi)
> > -# define USE_AS_STRNCPY
> > -# else
> > -# define RETURN1 POP (%edi); ret; CFI_PUSH (%edi)
> > -# endif
> > -# include "strcpy-ssse3.S"
> > - .p2align 4
> > -L(CopyFrom1To16Bytes):
> > - add %esi, %edx
> > - add %esi, %ecx
> > -
> > - POP (%esi)
> > - test %al, %al
> > - jz L(ExitHigh)
> > - test $0x01, %al
> > - jnz L(Exit1)
> > - test $0x02, %al
> > - jnz L(Exit2)
> > - test $0x04, %al
> > - jnz L(Exit3)
> > - test $0x08, %al
> > - jnz L(Exit4)
> > - test $0x10, %al
> > - jnz L(Exit5)
> > - test $0x20, %al
> > - jnz L(Exit6)
> > - test $0x40, %al
> > - jnz L(Exit7)
> > - movlpd (%ecx), %xmm0
> > - movlpd %xmm0, (%edx)
> > - movl %edi, %eax
> > - RETURN1
> > -
> > - .p2align 4
> > -L(ExitHigh):
> > - test $0x01, %ah
> > - jnz L(Exit9)
> > - test $0x02, %ah
> > - jnz L(Exit10)
> > - test $0x04, %ah
> > - jnz L(Exit11)
> > - test $0x08, %ah
> > - jnz L(Exit12)
> > - test $0x10, %ah
> > - jnz L(Exit13)
> > - test $0x20, %ah
> > - jnz L(Exit14)
> > - test $0x40, %ah
> > - jnz L(Exit15)
> > - movlpd (%ecx), %xmm0
> > - movlpd 8(%ecx), %xmm1
> > - movlpd %xmm0, (%edx)
> > - movlpd %xmm1, 8(%edx)
> > - movl %edi, %eax
> > - RETURN1
> > -
> > - .p2align 4
> > -L(StrncatExit1):
> > - movb %bh, 1(%edx)
> > -L(Exit1):
> > - movb (%ecx), %al
> > - movb %al, (%edx)
> > - movl %edi, %eax
> > - RETURN1
> > -
> > - .p2align 4
> > -L(StrncatExit2):
> > - movb %bh, 2(%edx)
> > -L(Exit2):
> > - movw (%ecx), %ax
> > - movw %ax, (%edx)
> > - movl %edi, %eax
> > - RETURN1
> > -
> > - .p2align 4
> > -L(StrncatExit3):
> > - movb %bh, 3(%edx)
> > -L(Exit3):
> > - movw (%ecx), %ax
> > - movw %ax, (%edx)
> > - movb 2(%ecx), %al
> > - movb %al, 2(%edx)
> > - movl %edi, %eax
> > - RETURN1
> > -
> > - .p2align 4
> > -L(StrncatExit4):
> > - movb %bh, 4(%edx)
> > -L(Exit4):
> > - movl (%ecx), %eax
> > - movl %eax, (%edx)
> > - movl %edi, %eax
> > - RETURN1
> > -
> > - .p2align 4
> > -L(StrncatExit5):
> > - movb %bh, 5(%edx)
> > -L(Exit5):
> > - movl (%ecx), %eax
> > - movl %eax, (%edx)
> > - movb 4(%ecx), %al
> > - movb %al, 4(%edx)
> > - movl %edi, %eax
> > - RETURN1
> > -
> > - .p2align 4
> > -L(StrncatExit6):
> > - movb %bh, 6(%edx)
> > -L(Exit6):
> > - movl (%ecx), %eax
> > - movl %eax, (%edx)
> > - movw 4(%ecx), %ax
> > - movw %ax, 4(%edx)
> > - movl %edi, %eax
> > - RETURN1
> > -
> > - .p2align 4
> > -L(StrncatExit7):
> > - movb %bh, 7(%edx)
> > -L(Exit7):
> > - movl (%ecx), %eax
> > - movl %eax, (%edx)
> > - movl 3(%ecx), %eax
> > - movl %eax, 3(%edx)
> > - movl %edi, %eax
> > - RETURN1
> > -
> > - .p2align 4
> > -L(StrncatExit8):
> > - movb %bh, 8(%edx)
> > -L(Exit8):
> > - movlpd (%ecx), %xmm0
> > - movlpd %xmm0, (%edx)
> > - movl %edi, %eax
> > - RETURN1
> > -
> > - .p2align 4
> > -L(StrncatExit9):
> > - movb %bh, 9(%edx)
> > -L(Exit9):
> > - movlpd (%ecx), %xmm0
> > - movlpd %xmm0, (%edx)
> > - movb 8(%ecx), %al
> > - movb %al, 8(%edx)
> > - movl %edi, %eax
> > - RETURN1
> > -
> > - .p2align 4
> > -L(StrncatExit10):
> > - movb %bh, 10(%edx)
> > -L(Exit10):
> > - movlpd (%ecx), %xmm0
> > - movlpd %xmm0, (%edx)
> > - movw 8(%ecx), %ax
> > - movw %ax, 8(%edx)
> > - movl %edi, %eax
> > - RETURN1
> > -
> > - .p2align 4
> > -L(StrncatExit11):
> > - movb %bh, 11(%edx)
> > -L(Exit11):
> > - movlpd (%ecx), %xmm0
> > - movlpd %xmm0, (%edx)
> > - movl 7(%ecx), %eax
> > - movl %eax, 7(%edx)
> > - movl %edi, %eax
> > - RETURN1
> > -
> > - .p2align 4
> > -L(StrncatExit12):
> > - movb %bh, 12(%edx)
> > -L(Exit12):
> > - movlpd (%ecx), %xmm0
> > - movlpd %xmm0, (%edx)
> > - movl 8(%ecx), %eax
> > - movl %eax, 8(%edx)
> > - movl %edi, %eax
> > - RETURN1
> > -
> > - .p2align 4
> > -L(StrncatExit13):
> > - movb %bh, 13(%edx)
> > -L(Exit13):
> > - movlpd (%ecx), %xmm0
> > - movlpd %xmm0, (%edx)
> > - movlpd 5(%ecx), %xmm0
> > - movlpd %xmm0, 5(%edx)
> > - movl %edi, %eax
> > - RETURN1
> > -
> > - .p2align 4
> > -L(StrncatExit14):
> > - movb %bh, 14(%edx)
> > -L(Exit14):
> > - movlpd (%ecx), %xmm0
> > - movlpd %xmm0, (%edx)
> > - movlpd 6(%ecx), %xmm0
> > - movlpd %xmm0, 6(%edx)
> > - movl %edi, %eax
> > - RETURN1
> > -
> > - .p2align 4
> > -L(StrncatExit15):
> > - movb %bh, 15(%edx)
> > -L(Exit15):
> > - movlpd (%ecx), %xmm0
> > - movlpd %xmm0, (%edx)
> > - movlpd 7(%ecx), %xmm0
> > - movlpd %xmm0, 7(%edx)
> > - movl %edi, %eax
> > - RETURN1
> > -
> > - .p2align 4
> > -L(StrncatExit16):
> > - movb %bh, 16(%edx)
> > -L(Exit16):
> > - movlpd (%ecx), %xmm0
> > - movlpd 8(%ecx), %xmm1
> > - movlpd %xmm0, (%edx)
> > - movlpd %xmm1, 8(%edx)
> > - movl %edi, %eax
> > - RETURN1
> > -
> > -# ifdef USE_AS_STRNCPY
> > -
> > - CFI_PUSH(%esi)
> > -
> > - .p2align 4
> > -L(CopyFrom1To16BytesCase2):
> > - add $16, %ebx
> > - add %esi, %ecx
> > - lea (%esi, %edx), %esi
> > - lea -9(%ebx), %edx
> > - and $1<<7, %dh
> > - or %al, %dh
> > - test %dh, %dh
> > - lea (%esi), %edx
> > - POP (%esi)
> > - jz L(ExitHighCase2)
> > -
> > - test $0x01, %al
> > - jnz L(Exit1)
> > - cmp $1, %ebx
> > - je L(StrncatExit1)
> > - test $0x02, %al
> > - jnz L(Exit2)
> > - cmp $2, %ebx
> > - je L(StrncatExit2)
> > - test $0x04, %al
> > - jnz L(Exit3)
> > - cmp $3, %ebx
> > - je L(StrncatExit3)
> > - test $0x08, %al
> > - jnz L(Exit4)
> > - cmp $4, %ebx
> > - je L(StrncatExit4)
> > - test $0x10, %al
> > - jnz L(Exit5)
> > - cmp $5, %ebx
> > - je L(StrncatExit5)
> > - test $0x20, %al
> > - jnz L(Exit6)
> > - cmp $6, %ebx
> > - je L(StrncatExit6)
> > - test $0x40, %al
> > - jnz L(Exit7)
> > - cmp $7, %ebx
> > - je L(StrncatExit7)
> > - movlpd (%ecx), %xmm0
> > - movlpd %xmm0, (%edx)
> > - lea 7(%edx), %eax
> > - cmpb $1, (%eax)
> > - sbb $-1, %eax
> > - xor %cl, %cl
> > - movb %cl, (%eax)
> > - movl %edi, %eax
> > - RETURN1
> > -
> > - .p2align 4
> > -L(ExitHighCase2):
> > - test $0x01, %ah
> > - jnz L(Exit9)
> > - cmp $9, %ebx
> > - je L(StrncatExit9)
> > - test $0x02, %ah
> > - jnz L(Exit10)
> > - cmp $10, %ebx
> > - je L(StrncatExit10)
> > - test $0x04, %ah
> > - jnz L(Exit11)
> > - cmp $11, %ebx
> > - je L(StrncatExit11)
> > - test $0x8, %ah
> > - jnz L(Exit12)
> > - cmp $12, %ebx
> > - je L(StrncatExit12)
> > - test $0x10, %ah
> > - jnz L(Exit13)
> > - cmp $13, %ebx
> > - je L(StrncatExit13)
> > - test $0x20, %ah
> > - jnz L(Exit14)
> > - cmp $14, %ebx
> > - je L(StrncatExit14)
> > - test $0x40, %ah
> > - jnz L(Exit15)
> > - cmp $15, %ebx
> > - je L(StrncatExit15)
> > - movlpd (%ecx), %xmm0
> > - movlpd %xmm0, (%edx)
> > - movlpd 8(%ecx), %xmm1
> > - movlpd %xmm1, 8(%edx)
> > - movl %edi, %eax
> > - RETURN1
> > -
> > - CFI_PUSH(%esi)
> > -
> > -L(CopyFrom1To16BytesCase2OrCase3):
> > - test %eax, %eax
> > - jnz L(CopyFrom1To16BytesCase2)
> > -
> > - .p2align 4
> > -L(CopyFrom1To16BytesCase3):
> > - add $16, %ebx
> > - add %esi, %edx
> > - add %esi, %ecx
> > -
> > - POP (%esi)
> > -
> > - cmp $8, %ebx
> > - ja L(ExitHighCase3)
> > - cmp $1, %ebx
> > - je L(StrncatExit1)
> > - cmp $2, %ebx
> > - je L(StrncatExit2)
> > - cmp $3, %ebx
> > - je L(StrncatExit3)
> > - cmp $4, %ebx
> > - je L(StrncatExit4)
> > - cmp $5, %ebx
> > - je L(StrncatExit5)
> > - cmp $6, %ebx
> > - je L(StrncatExit6)
> > - cmp $7, %ebx
> > - je L(StrncatExit7)
> > - movlpd (%ecx), %xmm0
> > - movlpd %xmm0, (%edx)
> > - movb %bh, 8(%edx)
> > - movl %edi, %eax
> > - RETURN1
> > -
> > - .p2align 4
> > -L(ExitHighCase3):
> > - cmp $9, %ebx
> > - je L(StrncatExit9)
> > - cmp $10, %ebx
> > - je L(StrncatExit10)
> > - cmp $11, %ebx
> > - je L(StrncatExit11)
> > - cmp $12, %ebx
> > - je L(StrncatExit12)
> > - cmp $13, %ebx
> > - je L(StrncatExit13)
> > - cmp $14, %ebx
> > - je L(StrncatExit14)
> > - cmp $15, %ebx
> > - je L(StrncatExit15)
> > - movlpd (%ecx), %xmm0
> > - movlpd %xmm0, (%edx)
> > - movlpd 8(%ecx), %xmm1
> > - movlpd %xmm1, 8(%edx)
> > - movb %bh, 16(%edx)
> > - movl %edi, %eax
> > - RETURN1
> > -
> > - .p2align 4
> > -L(StrncatExit0):
> > - movl %edi, %eax
> > - RETURN1
> > -
> > - .p2align 4
> > -L(StrncatExit15Bytes):
> > - cmp $9, %ebx
> > - je L(StrncatExit9)
> > - cmpb $0, 9(%ecx)
> > - jz L(Exit10)
> > - cmp $10, %ebx
> > - je L(StrncatExit10)
> > - cmpb $0, 10(%ecx)
> > - jz L(Exit11)
> > - cmp $11, %ebx
> > - je L(StrncatExit11)
> > - cmpb $0, 11(%ecx)
> > - jz L(Exit12)
> > - cmp $12, %ebx
> > - je L(StrncatExit12)
> > - cmpb $0, 12(%ecx)
> > - jz L(Exit13)
> > - cmp $13, %ebx
> > - je L(StrncatExit13)
> > - cmpb $0, 13(%ecx)
> > - jz L(Exit14)
> > - cmp $14, %ebx
> > - je L(StrncatExit14)
> > - movlpd (%ecx), %xmm0
> > - movlpd %xmm0, (%edx)
> > - movlpd 7(%ecx), %xmm0
> > - movlpd %xmm0, 7(%edx)
> > - lea 14(%edx), %eax
> > - cmpb $1, (%eax)
> > - sbb $-1, %eax
> > - movb %bh, (%eax)
> > - movl %edi, %eax
> > - RETURN1
> > -
> > - .p2align 4
> > -L(StrncatExit8Bytes):
> > - cmpb $0, (%ecx)
> > - jz L(Exit1)
> > - cmp $1, %ebx
> > - je L(StrncatExit1)
> > - cmpb $0, 1(%ecx)
> > - jz L(Exit2)
> > - cmp $2, %ebx
> > - je L(StrncatExit2)
> > - cmpb $0, 2(%ecx)
> > - jz L(Exit3)
> > - cmp $3, %ebx
> > - je L(StrncatExit3)
> > - cmpb $0, 3(%ecx)
> > - jz L(Exit4)
> > - cmp $4, %ebx
> > - je L(StrncatExit4)
> > - cmpb $0, 4(%ecx)
> > - jz L(Exit5)
> > - cmp $5, %ebx
> > - je L(StrncatExit5)
> > - cmpb $0, 5(%ecx)
> > - jz L(Exit6)
> > - cmp $6, %ebx
> > - je L(StrncatExit6)
> > - cmpb $0, 6(%ecx)
> > - jz L(Exit7)
> > - cmp $7, %ebx
> > - je L(StrncatExit7)
> > - movlpd (%ecx), %xmm0
> > - movlpd %xmm0, (%edx)
> > - lea 7(%edx), %eax
> > - cmpb $1, (%eax)
> > - sbb $-1, %eax
> > - movb %bh, (%eax)
> > - movl %edi, %eax
> > - RETURN1
> > -
> > -# endif
> > -END (STRCAT)
> > -#endif
> > diff --git a/sysdeps/i386/i686/multiarch/strcat.S b/sysdeps/i386/i686/multiarch/strcat.S
> > deleted file mode 100644
> > index e68feca..0000000
> > --- a/sysdeps/i386/i686/multiarch/strcat.S
> > +++ /dev/null
> > @@ -1,119 +0,0 @@
> > -/* Multiple versions of strcat
> > - Copyright (C) 2011-2012 Free Software Foundation, Inc.
> > - Contributed by Intel Corporation.
> > - This file is part of the GNU C Library.
> > -
> > - The GNU C Library is free software; you can redistribute it and/or
> > - modify it under the terms of the GNU Lesser General Public
> > - License as published by the Free Software Foundation; either
> > - version 2.1 of the License, or (at your option) any later version.
> > -
> > - The GNU C Library is distributed in the hope that it will be useful,
> > - but WITHOUT ANY WARRANTY; without even the implied warranty of
> > - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > - Lesser General Public License for more details.
> > -
> > - You should have received a copy of the GNU Lesser General Public
> > - License along with the GNU C Library; if not, see
> > - <http://www.gnu.org/licenses/>. */
> > -
> > -#include <sysdep.h>
> > -#include <init-arch.h>
> > -
> > -#ifndef USE_AS_STRNCAT
> > -# ifndef STRCAT
> > -# define STRCAT strcat
> > -# endif
> > -#endif
> > -
> > -#ifdef USE_AS_STRNCAT
> > -# define STRCAT_SSSE3 __strncat_ssse3
> > -# define STRCAT_SSE2 __strncat_sse2
> > -# define STRCAT_IA32 __strncat_ia32
> > -# define __GI_STRCAT __GI_strncat
> > -#else
> > -# define STRCAT_SSSE3 __strcat_ssse3
> > -# define STRCAT_SSE2 __strcat_sse2
> > -# define STRCAT_IA32 __strcat_ia32
> > -# define __GI_STRCAT __GI_strcat
> > -#endif
> > -
> > -
> > -/* Define multiple versions only for the definition in libc. Don't
> > - define multiple versions for strncat in static library since we
> > - need strncat before the initialization happened. */
> > -#ifndef NOT_IN_libc
> > -
> > -# ifdef SHARED
> > - .text
> > -ENTRY(STRCAT)
> > - .type STRCAT, @gnu_indirect_function
> > - pushl %ebx
> > - cfi_adjust_cfa_offset (4)
> > - cfi_rel_offset (ebx, 0)
> > - LOAD_PIC_REG(bx)
> > - cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
> > - jne 1f
> > - call __init_cpu_features
> > -1: leal STRCAT_IA32@GOTOFF(%ebx), %eax
> > - testl $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
> > - jz 2f
> > - leal STRCAT_SSE2@GOTOFF(%ebx), %eax
> > - testl $bit_Fast_Unaligned_Load, FEATURE_OFFSET+index_Fast_Unaligned_Load+__cpu_features@GOTOFF(%ebx)
> > - jnz 2f
> > - testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
> > - jz 2f
> > - leal STRCAT_SSSE3@GOTOFF(%ebx), %eax
> > -2: popl %ebx
> > - cfi_adjust_cfa_offset (-4)
> > - cfi_restore (ebx)
> > - ret
> > -END(STRCAT)
> > -# else
> > -
> > -ENTRY(STRCAT)
> > - .type STRCAT, @gnu_indirect_function
> > - cmpl $0, KIND_OFFSET+__cpu_features
> > - jne 1f
> > - call __init_cpu_features
> > -1: leal STRCAT_IA32, %eax
> > - testl $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features
> > - jz 2f
> > - leal STRCAT_SSE2, %eax
> > - testl $bit_Fast_Unaligned_Load, FEATURE_OFFSET+index_Fast_Unaligned_Load+__cpu_features
> > - jnz 2f
> > - testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features
> > - jz 2f
> > - leal STRCAT_SSSE3, %eax
> > -2: ret
> > -END(STRCAT)
> > -
> > -# endif
> > -
> > -# undef ENTRY
> > -# define ENTRY(name) \
> > - .type STRCAT_IA32, @function; \
> > - .align 16; \
> > - STRCAT_IA32: cfi_startproc; \
> > - CALL_MCOUNT
> > -# undef END
> > -# define END(name) \
> > - cfi_endproc; .size STRCAT_IA32, .-STRCAT_IA32
> > -
> > -# ifdef SHARED
> > -# undef libc_hidden_builtin_def
> > -/* It doesn't make sense to send libc-internal strcat calls through a PLT.
> > - The speedup we get from using SSSE3 instruction is likely eaten away
> > - by the indirect call in the PLT. */
> > -# define libc_hidden_builtin_def(name) \
> > - .globl __GI_STRCAT; __GI_STRCAT = STRCAT_IA32
> > -# undef libc_hidden_def
> > -# define libc_hidden_def(name) \
> > - .globl __GI___STRCAT; __GI___STRCAT = STRCAT_IA32
> > -
> > -# endif
> > -#endif
> > -
> > -#ifndef USE_AS_STRNCAT
> > -# include "../../i486/strcat.S"
> > -#endif
> > diff --git a/sysdeps/i386/i686/multiarch/strncat-c.c b/sysdeps/i386/i686/multiarch/strncat-c.c
> > deleted file mode 100644
> > index 132a000..0000000
> > --- a/sysdeps/i386/i686/multiarch/strncat-c.c
> > +++ /dev/null
> > @@ -1,8 +0,0 @@
> > -#define STRNCAT __strncat_ia32
> > -#ifdef SHARED
> > -#undef libc_hidden_def
> > -#define libc_hidden_def(name) \
> > - __hidden_ver1 (__strncat_ia32, __GI___strncat, __strncat_ia32);
> > -#endif
> > -
> > -#include "string/strncat.c"
> > diff --git a/sysdeps/i386/i686/multiarch/strncat-sse2.S b/sysdeps/i386/i686/multiarch/strncat-sse2.S
> > deleted file mode 100644
> > index f1045b7..0000000
> > --- a/sysdeps/i386/i686/multiarch/strncat-sse2.S
> > +++ /dev/null
> > @@ -1,4 +0,0 @@
> > -#define STRCAT __strncat_sse2
> > -#define USE_AS_STRNCAT
> > -
> > -#include "strcat-sse2.S"
> > diff --git a/sysdeps/i386/i686/multiarch/strncat-ssse3.S b/sysdeps/i386/i686/multiarch/strncat-ssse3.S
> > deleted file mode 100644
> > index 625b90a..0000000
> > --- a/sysdeps/i386/i686/multiarch/strncat-ssse3.S
> > +++ /dev/null
> > @@ -1,4 +0,0 @@
> > -#define STRCAT __strncat_ssse3
> > -#define USE_AS_STRNCAT
> > -
> > -#include "strcat-ssse3.S"
> > diff --git a/sysdeps/i386/i686/multiarch/strncat.S b/sysdeps/i386/i686/multiarch/strncat.S
> > deleted file mode 100644
> > index fd569c2..0000000
> > --- a/sysdeps/i386/i686/multiarch/strncat.S
> > +++ /dev/null
> > @@ -1,3 +0,0 @@
> > -#define STRCAT strncat
> > -#define USE_AS_STRNCAT
> > -#include "strcat.S"
> > diff --git a/sysdeps/powerpc/strcat.c b/sysdeps/powerpc/strcat.c
> > deleted file mode 100644
> > index 28575d0..0000000
> > --- a/sysdeps/powerpc/strcat.c
> > +++ /dev/null
> > @@ -1,30 +0,0 @@
> > -/* strcat version that uses fast strcpy/strlen.
> > - Copyright (C) 1997, 2003 Free Software Foundation, Inc.
> > - This file is part of the GNU C Library.
> > -
> > - The GNU C Library is free software; you can redistribute it and/or
> > - modify it under the terms of the GNU Lesser General Public
> > - License as published by the Free Software Foundation; either
> > - version 2.1 of the License, or (at your option) any later version.
> > -
> > - The GNU C Library is distributed in the hope that it will be useful,
> > - but WITHOUT ANY WARRANTY; without even the implied warranty of
> > - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > - Lesser General Public License for more details.
> > -
> > - You should have received a copy of the GNU Lesser General Public
> > - License along with the GNU C Library; if not, see
> > - <http://www.gnu.org/licenses/>. */
> > -
> > -#include <string.h>
> > -
> > -#undef strcat
> > -
> > -/* Append SRC on the end of DEST. */
> > -char *
> > -strcat (char *dest, const char *src)
> > -{
> > - strcpy (dest + strlen (dest), src);
> > - return dest;
> > -}
> > -libc_hidden_builtin_def (strcat)
> > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> > index 22f1435..ae94366 100644
> > --- a/sysdeps/x86_64/multiarch/Makefile
> > +++ b/sysdeps/x86_64/multiarch/Makefile
> > @@ -6,7 +6,7 @@ endif
> >
> > ifeq ($(subdir),string)
> >
> > -sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
> > +sysdep_routines += stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
> > strend-sse4 memcmp-sse4 memcpy-ssse3 mempcpy-ssse3 \
> > memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
> > memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \
> > @@ -14,8 +14,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
> > strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
> > strcpy-sse2-unaligned strncpy-sse2-unaligned \
> > stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
> > - strcat-sse2-unaligned strncat-sse2-unaligned \
> > - strcat-ssse3 strncat-ssse3 strlen_atom strlen_avx \
> > + strlen_atom strlen_avx \
> > strnlen-sse2-no-bsf strrchr-sse2-no-bsf strchr-sse2-no-bsf \
> > memcmp-ssse3
> > ifeq (yes,$(config-cflags-sse4))
> > diff --git a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
> > deleted file mode 100644
> > index 7811ab5..0000000
> > --- a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
> > +++ /dev/null
> > @@ -1,53 +0,0 @@
> > -/* strcat with SSE2
> > - Copyright (C) 2011 Free Software Foundation, Inc.
> > - Contributed by Intel Corporation.
> > - This file is part of the GNU C Library.
> > -
> > - The GNU C Library is free software; you can redistribute it and/or
> > - modify it under the terms of the GNU Lesser General Public
> > - License as published by the Free Software Foundation; either
> > - version 2.1 of the License, or (at your option) any later version.
> > -
> > - The GNU C Library is distributed in the hope that it will be useful,
> > - but WITHOUT ANY WARRANTY; without even the implied warranty of
> > - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > - Lesser General Public License for more details.
> > -
> > - You should have received a copy of the GNU Lesser General Public
> > - License along with the GNU C Library; if not, see
> > - <http://www.gnu.org/licenses/>. */
> > -
> > -#ifndef NOT_IN_libc
> > -
> > -# include <sysdep.h>
> > -
> > -# ifndef STRCAT
> > -# define STRCAT __strcat_sse2_unaligned
> > -# endif
> > -
> > -# define USE_AS_STRCAT
> > -
> > -.text
> > -ENTRY (STRCAT)
> > - mov %rdi, %r9
> > -# ifdef USE_AS_STRNCAT
> > - mov %rdx, %r8
> > -# endif
> > -
> > -# define RETURN jmp L(StartStrcpyPart)
> > -# include "strlen-sse2-pminub.S"
> > -# undef RETURN
> > -
> > -L(StartStrcpyPart):
> > - lea (%r9, %rax), %rdi
> > - mov %rsi, %rcx
> > - mov %r9, %rax /* save result */
> > -
> > -# ifdef USE_AS_STRNCAT
> > - test %r8, %r8
> > - jz L(ExitZero)
> > -# define USE_AS_STRNCPY
> > -# endif
> > -
> > -# include "strcpy-sse2-unaligned.S"
> > -#endif
> > diff --git a/sysdeps/x86_64/multiarch/strcat-ssse3.S b/sysdeps/x86_64/multiarch/strcat-ssse3.S
> > deleted file mode 100644
> > index abd2c0c..0000000
> > --- a/sysdeps/x86_64/multiarch/strcat-ssse3.S
> > +++ /dev/null
> > @@ -1,557 +0,0 @@
> > -/* strcat with SSSE3
> > - Copyright (C) 2011 Free Software Foundation, Inc.
> > - Contributed by Intel Corporation.
> > - This file is part of the GNU C Library.
> > -
> > - The GNU C Library is free software; you can redistribute it and/or
> > - modify it under the terms of the GNU Lesser General Public
> > - License as published by the Free Software Foundation; either
> > - version 2.1 of the License, or (at your option) any later version.
> > -
> > - The GNU C Library is distributed in the hope that it will be useful,
> > - but WITHOUT ANY WARRANTY; without even the implied warranty of
> > - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > - Lesser General Public License for more details.
> > -
> > - You should have received a copy of the GNU Lesser General Public
> > - License along with the GNU C Library; if not, see
> > - <http://www.gnu.org/licenses/>. */
> > -
> > -#ifndef NOT_IN_libc
> > -
> > -# include <sysdep.h>
> > -
> > -# ifndef STRCAT
> > -# define STRCAT __strcat_ssse3
> > -# endif
> > -
> > -# define USE_AS_STRCAT
> > -
> > -.text
> > -ENTRY (STRCAT)
> > -# ifdef USE_AS_STRNCAT
> > - mov %rdx, %r8
> > -# endif
> > -
> > -# define RETURN jmp L(StartStrcpyPart)
> > -# include "strlen-sse2-no-bsf.S"
> > -
> > -# undef RETURN
> > -
> > -L(StartStrcpyPart):
> > - mov %rsi, %rcx
> > - lea (%rdi, %rax), %rdx
> > -# ifdef USE_AS_STRNCAT
> > - test %r8, %r8
> > - jz L(StrncatExit0)
> > - cmp $8, %r8
> > - jbe L(StrncatExit8Bytes)
> > -# endif
> > - cmpb $0, (%rcx)
> > - jz L(Exit1)
> > - cmpb $0, 1(%rcx)
> > - jz L(Exit2)
> > - cmpb $0, 2(%rcx)
> > - jz L(Exit3)
> > - cmpb $0, 3(%rcx)
> > - jz L(Exit4)
> > - cmpb $0, 4(%rcx)
> > - jz L(Exit5)
> > - cmpb $0, 5(%rcx)
> > - jz L(Exit6)
> > - cmpb $0, 6(%rcx)
> > - jz L(Exit7)
> > - cmpb $0, 7(%rcx)
> > - jz L(Exit8)
> > - cmpb $0, 8(%rcx)
> > - jz L(Exit9)
> > -# ifdef USE_AS_STRNCAT
> > - cmp $16, %r8
> > - jb L(StrncatExit15Bytes)
> > -# endif
> > - cmpb $0, 9(%rcx)
> > - jz L(Exit10)
> > - cmpb $0, 10(%rcx)
> > - jz L(Exit11)
> > - cmpb $0, 11(%rcx)
> > - jz L(Exit12)
> > - cmpb $0, 12(%rcx)
> > - jz L(Exit13)
> > - cmpb $0, 13(%rcx)
> > - jz L(Exit14)
> > - cmpb $0, 14(%rcx)
> > - jz L(Exit15)
> > - cmpb $0, 15(%rcx)
> > - jz L(Exit16)
> > -# ifdef USE_AS_STRNCAT
> > - cmp $16, %r8
> > - je L(StrncatExit16)
> > -# define USE_AS_STRNCPY
> > -# endif
> > -
> > -# include "strcpy-ssse3.S"
> > -
> > - .p2align 4
> > -L(CopyFrom1To16Bytes):
> > - add %rsi, %rdx
> > - add %rsi, %rcx
> > -
> > - test %al, %al
> > - jz L(ExitHigh)
> > - test $0x01, %al
> > - jnz L(Exit1)
> > - test $0x02, %al
> > - jnz L(Exit2)
> > - test $0x04, %al
> > - jnz L(Exit3)
> > - test $0x08, %al
> > - jnz L(Exit4)
> > - test $0x10, %al
> > - jnz L(Exit5)
> > - test $0x20, %al
> > - jnz L(Exit6)
> > - test $0x40, %al
> > - jnz L(Exit7)
> > - movlpd (%rcx), %xmm0
> > - movlpd %xmm0, (%rdx)
> > - mov %rdi, %rax
> > - ret
> > -
> > - .p2align 4
> > -L(ExitHigh):
> > - test $0x01, %ah
> > - jnz L(Exit9)
> > - test $0x02, %ah
> > - jnz L(Exit10)
> > - test $0x04, %ah
> > - jnz L(Exit11)
> > - test $0x08, %ah
> > - jnz L(Exit12)
> > - test $0x10, %ah
> > - jnz L(Exit13)
> > - test $0x20, %ah
> > - jnz L(Exit14)
> > - test $0x40, %ah
> > - jnz L(Exit15)
> > - movlpd (%rcx), %xmm0
> > - movlpd 8(%rcx), %xmm1
> > - movlpd %xmm0, (%rdx)
> > - movlpd %xmm1, 8(%rdx)
> > - mov %rdi, %rax
> > - ret
> > -
> > - .p2align 4
> > -L(StrncatExit1):
> > - xor %ah, %ah
> > - movb %ah, 1(%rdx)
> > -L(Exit1):
> > - movb (%rcx), %al
> > - movb %al, (%rdx)
> > - mov %rdi, %rax
> > - ret
> > -
> > - .p2align 4
> > -L(StrncatExit2):
> > - xor %ah, %ah
> > - movb %ah, 2(%rdx)
> > -L(Exit2):
> > - movw (%rcx), %ax
> > - movw %ax, (%rdx)
> > - mov %rdi, %rax
> > - ret
> > -
> > - .p2align 4
> > -L(StrncatExit3):
> > - xor %ah, %ah
> > - movb %ah, 3(%rdx)
> > -L(Exit3):
> > - movw (%rcx), %ax
> > - movw %ax, (%rdx)
> > - movb 2(%rcx), %al
> > - movb %al, 2(%rdx)
> > - mov %rdi, %rax
> > - ret
> > -
> > - .p2align 4
> > -L(StrncatExit4):
> > - xor %ah, %ah
> > - movb %ah, 4(%rdx)
> > -L(Exit4):
> > - mov (%rcx), %eax
> > - mov %eax, (%rdx)
> > - mov %rdi, %rax
> > - ret
> > -
> > - .p2align 4
> > -L(StrncatExit5):
> > - xor %ah, %ah
> > - movb %ah, 5(%rdx)
> > -L(Exit5):
> > - mov (%rcx), %eax
> > - mov %eax, (%rdx)
> > - movb 4(%rcx), %al
> > - movb %al, 4(%rdx)
> > - mov %rdi, %rax
> > - ret
> > -
> > - .p2align 4
> > -L(StrncatExit6):
> > - xor %ah, %ah
> > - movb %ah, 6(%rdx)
> > -L(Exit6):
> > - mov (%rcx), %eax
> > - mov %eax, (%rdx)
> > - movw 4(%rcx), %ax
> > - movw %ax, 4(%rdx)
> > - mov %rdi, %rax
> > - ret
> > -
> > - .p2align 4
> > -L(StrncatExit7):
> > - xor %ah, %ah
> > - movb %ah, 7(%rdx)
> > -L(Exit7):
> > - mov (%rcx), %eax
> > - mov %eax, (%rdx)
> > - mov 3(%rcx), %eax
> > - mov %eax, 3(%rdx)
> > - mov %rdi, %rax
> > - ret
> > -
> > - .p2align 4
> > -L(StrncatExit8):
> > - xor %ah, %ah
> > - movb %ah, 8(%rdx)
> > -L(Exit8):
> > - movlpd (%rcx), %xmm0
> > - movlpd %xmm0, (%rdx)
> > - mov %rdi, %rax
> > - ret
> > -
> > - .p2align 4
> > -L(StrncatExit9):
> > - xor %ah, %ah
> > - movb %ah, 9(%rdx)
> > -L(Exit9):
> > - movlpd (%rcx), %xmm0
> > - movlpd %xmm0, (%rdx)
> > - movb 8(%rcx), %al
> > - movb %al, 8(%rdx)
> > - mov %rdi, %rax
> > - ret
> > -
> > - .p2align 4
> > -L(StrncatExit10):
> > - xor %ah, %ah
> > - movb %ah, 10(%rdx)
> > -L(Exit10):
> > - movlpd (%rcx), %xmm0
> > - movlpd %xmm0, (%rdx)
> > - movw 8(%rcx), %ax
> > - movw %ax, 8(%rdx)
> > - mov %rdi, %rax
> > - ret
> > -
> > - .p2align 4
> > -L(StrncatExit11):
> > - xor %ah, %ah
> > - movb %ah, 11(%rdx)
> > -L(Exit11):
> > - movlpd (%rcx), %xmm0
> > - movlpd %xmm0, (%rdx)
> > - mov 7(%rcx), %eax
> > - mov %eax, 7(%rdx)
> > - mov %rdi, %rax
> > - ret
> > -
> > - .p2align 4
> > -L(StrncatExit12):
> > - xor %ah, %ah
> > - movb %ah, 12(%rdx)
> > -L(Exit12):
> > - movlpd (%rcx), %xmm0
> > - movlpd %xmm0, (%rdx)
> > - mov 8(%rcx), %eax
> > - mov %eax, 8(%rdx)
> > - mov %rdi, %rax
> > - ret
> > -
> > - .p2align 4
> > -L(StrncatExit13):
> > - xor %ah, %ah
> > - movb %ah, 13(%rdx)
> > -L(Exit13):
> > - movlpd (%rcx), %xmm0
> > - movlpd %xmm0, (%rdx)
> > - movlpd 5(%rcx), %xmm1
> > - movlpd %xmm1, 5(%rdx)
> > - mov %rdi, %rax
> > - ret
> > -
> > - .p2align 4
> > -L(StrncatExit14):
> > - xor %ah, %ah
> > - movb %ah, 14(%rdx)
> > -L(Exit14):
> > - movlpd (%rcx), %xmm0
> > - movlpd %xmm0, (%rdx)
> > - movlpd 6(%rcx), %xmm1
> > - movlpd %xmm1, 6(%rdx)
> > - mov %rdi, %rax
> > - ret
> > -
> > - .p2align 4
> > -L(StrncatExit15):
> > - xor %ah, %ah
> > - movb %ah, 15(%rdx)
> > -L(Exit15):
> > - movlpd (%rcx), %xmm0
> > - movlpd %xmm0, (%rdx)
> > - movlpd 7(%rcx), %xmm1
> > - movlpd %xmm1, 7(%rdx)
> > - mov %rdi, %rax
> > - ret
> > -
> > - .p2align 4
> > -L(StrncatExit16):
> > - xor %ah, %ah
> > - movb %ah, 16(%rdx)
> > -L(Exit16):
> > - movlpd (%rcx), %xmm0
> > - movlpd 8(%rcx), %xmm1
> > - movlpd %xmm0, (%rdx)
> > - movlpd %xmm1, 8(%rdx)
> > - mov %rdi, %rax
> > - ret
> > -
> > -# ifdef USE_AS_STRNCPY
> > -
> > - .p2align 4
> > -L(CopyFrom1To16BytesCase2):
> > - add $16, %r8
> > - add %rsi, %rcx
> > - lea (%rsi, %rdx), %rsi
> > - lea -9(%r8), %rdx
> > - and $1<<7, %dh
> > - or %al, %dh
> > - test %dh, %dh
> > - lea (%rsi), %rdx
> > - jz L(ExitHighCase2)
> > -
> > - test $0x01, %al
> > - jnz L(Exit1)
> > - cmp $1, %r8
> > - je L(StrncatExit1)
> > - test $0x02, %al
> > - jnz L(Exit2)
> > - cmp $2, %r8
> > - je L(StrncatExit2)
> > - test $0x04, %al
> > - jnz L(Exit3)
> > - cmp $3, %r8
> > - je L(StrncatExit3)
> > - test $0x08, %al
> > - jnz L(Exit4)
> > - cmp $4, %r8
> > - je L(StrncatExit4)
> > - test $0x10, %al
> > - jnz L(Exit5)
> > - cmp $5, %r8
> > - je L(StrncatExit5)
> > - test $0x20, %al
> > - jnz L(Exit6)
> > - cmp $6, %r8
> > - je L(StrncatExit6)
> > - test $0x40, %al
> > - jnz L(Exit7)
> > - cmp $7, %r8
> > - je L(StrncatExit7)
> > - movlpd (%rcx), %xmm0
> > - movlpd %xmm0, (%rdx)
> > - lea 7(%rdx), %rax
> > - cmpb $1, (%rax)
> > - sbb $-1, %rax
> > - xor %cl, %cl
> > - movb %cl, (%rax)
> > - mov %rdi, %rax
> > - ret
> > -
> > - .p2align 4
> > -L(ExitHighCase2):
> > - test $0x01, %ah
> > - jnz L(Exit9)
> > - cmp $9, %r8
> > - je L(StrncatExit9)
> > - test $0x02, %ah
> > - jnz L(Exit10)
> > - cmp $10, %r8
> > - je L(StrncatExit10)
> > - test $0x04, %ah
> > - jnz L(Exit11)
> > - cmp $11, %r8
> > - je L(StrncatExit11)
> > - test $0x8, %ah
> > - jnz L(Exit12)
> > - cmp $12, %r8
> > - je L(StrncatExit12)
> > - test $0x10, %ah
> > - jnz L(Exit13)
> > - cmp $13, %r8
> > - je L(StrncatExit13)
> > - test $0x20, %ah
> > - jnz L(Exit14)
> > - cmp $14, %r8
> > - je L(StrncatExit14)
> > - test $0x40, %ah
> > - jnz L(Exit15)
> > - cmp $15, %r8
> > - je L(StrncatExit15)
> > - movlpd (%rcx), %xmm0
> > - movlpd %xmm0, (%rdx)
> > - movlpd 8(%rcx), %xmm1
> > - movlpd %xmm1, 8(%rdx)
> > - mov %rdi, %rax
> > - ret
> > -
> > -L(CopyFrom1To16BytesCase2OrCase3):
> > - test %rax, %rax
> > - jnz L(CopyFrom1To16BytesCase2)
> > -
> > - .p2align 4
> > -L(CopyFrom1To16BytesCase3):
> > - add $16, %r8
> > - add %rsi, %rdx
> > - add %rsi, %rcx
> > -
> > - cmp $8, %r8
> > - ja L(ExitHighCase3)
> > - cmp $1, %r8
> > - je L(StrncatExit1)
> > - cmp $2, %r8
> > - je L(StrncatExit2)
> > - cmp $3, %r8
> > - je L(StrncatExit3)
> > - cmp $4, %r8
> > - je L(StrncatExit4)
> > - cmp $5, %r8
> > - je L(StrncatExit5)
> > - cmp $6, %r8
> > - je L(StrncatExit6)
> > - cmp $7, %r8
> > - je L(StrncatExit7)
> > - movlpd (%rcx), %xmm0
> > - movlpd %xmm0, (%rdx)
> > - xor %ah, %ah
> > - movb %ah, 8(%rdx)
> > - mov %rdi, %rax
> > - ret
> > -
> > - .p2align 4
> > -L(ExitHighCase3):
> > - cmp $9, %r8
> > - je L(StrncatExit9)
> > - cmp $10, %r8
> > - je L(StrncatExit10)
> > - cmp $11, %r8
> > - je L(StrncatExit11)
> > - cmp $12, %r8
> > - je L(StrncatExit12)
> > - cmp $13, %r8
> > - je L(StrncatExit13)
> > - cmp $14, %r8
> > - je L(StrncatExit14)
> > - cmp $15, %r8
> > - je L(StrncatExit15)
> > - movlpd (%rcx), %xmm0
> > - movlpd %xmm0, (%rdx)
> > - movlpd 8(%rcx), %xmm1
> > - movlpd %xmm1, 8(%rdx)
> > - xor %ah, %ah
> > - movb %ah, 16(%rdx)
> > - mov %rdi, %rax
> > - ret
> > -
> > - .p2align 4
> > -L(StrncatExit0):
> > - mov %rdi, %rax
> > - ret
> > -
> > - .p2align 4
> > -L(StrncatExit15Bytes):
> > - cmp $9, %r8
> > - je L(StrncatExit9)
> > - cmpb $0, 9(%rcx)
> > - jz L(Exit10)
> > - cmp $10, %r8
> > - je L(StrncatExit10)
> > - cmpb $0, 10(%rcx)
> > - jz L(Exit11)
> > - cmp $11, %r8
> > - je L(StrncatExit11)
> > - cmpb $0, 11(%rcx)
> > - jz L(Exit12)
> > - cmp $12, %r8
> > - je L(StrncatExit12)
> > - cmpb $0, 12(%rcx)
> > - jz L(Exit13)
> > - cmp $13, %r8
> > - je L(StrncatExit13)
> > - cmpb $0, 13(%rcx)
> > - jz L(Exit14)
> > - cmp $14, %r8
> > - je L(StrncatExit14)
> > - movlpd (%rcx), %xmm0
> > - movlpd %xmm0, (%rdx)
> > - movlpd 7(%rcx), %xmm1
> > - movlpd %xmm1, 7(%rdx)
> > - lea 14(%rdx), %rax
> > - cmpb $1, (%rax)
> > - sbb $-1, %rax
> > - xor %cl, %cl
> > - movb %cl, (%rax)
> > - mov %rdi, %rax
> > - ret
> > -
> > - .p2align 4
> > -L(StrncatExit8Bytes):
> > - cmpb $0, (%rcx)
> > - jz L(Exit1)
> > - cmp $1, %r8
> > - je L(StrncatExit1)
> > - cmpb $0, 1(%rcx)
> > - jz L(Exit2)
> > - cmp $2, %r8
> > - je L(StrncatExit2)
> > - cmpb $0, 2(%rcx)
> > - jz L(Exit3)
> > - cmp $3, %r8
> > - je L(StrncatExit3)
> > - cmpb $0, 3(%rcx)
> > - jz L(Exit4)
> > - cmp $4, %r8
> > - je L(StrncatExit4)
> > - cmpb $0, 4(%rcx)
> > - jz L(Exit5)
> > - cmp $5, %r8
> > - je L(StrncatExit5)
> > - cmpb $0, 5(%rcx)
> > - jz L(Exit6)
> > - cmp $6, %r8
> > - je L(StrncatExit6)
> > - cmpb $0, 6(%rcx)
> > - jz L(Exit7)
> > - cmp $7, %r8
> > - je L(StrncatExit7)
> > - movlpd (%rcx), %xmm0
> > - movlpd %xmm0, (%rdx)
> > - lea 7(%rdx), %rax
> > - cmpb $1, (%rax)
> > - sbb $-1, %rax
> > - xor %cl, %cl
> > - movb %cl, (%rax)
> > - mov %rdi, %rax
> > - ret
> > -
> > -# endif
> > -END (STRCAT)
> > -#endif
> > diff --git a/sysdeps/x86_64/multiarch/strcat.S b/sysdeps/x86_64/multiarch/strcat.S
> > deleted file mode 100644
> > index 0c256de..0000000
> > --- a/sysdeps/x86_64/multiarch/strcat.S
> > +++ /dev/null
> > @@ -1,84 +0,0 @@
> > -/* Multiple versions of strcat
> > - Copyright (C) 2009, 2011 Free Software Foundation, Inc.
> > - Contributed by Intel Corporation.
> > - This file is part of the GNU C Library.
> > -
> > - The GNU C Library is free software; you can redistribute it and/or
> > - modify it under the terms of the GNU Lesser General Public
> > - License as published by the Free Software Foundation; either
> > - version 2.1 of the License, or (at your option) any later version.
> > -
> > - The GNU C Library is distributed in the hope that it will be useful,
> > - but WITHOUT ANY WARRANTY; without even the implied warranty of
> > - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > - Lesser General Public License for more details.
> > -
> > - You should have received a copy of the GNU Lesser General Public
> > - License along with the GNU C Library; if not, see
> > - <http://www.gnu.org/licenses/>. */
> > -
> > -#include <sysdep.h>
> > -#include <init-arch.h>
> > -
> > -#ifndef USE_AS_STRNCAT
> > -# ifndef STRCAT
> > -# define STRCAT strcat
> > -# endif
> > -#endif
> > -
> > -#ifdef USE_AS_STRNCAT
> > -# define STRCAT_SSSE3 __strncat_ssse3
> > -# define STRCAT_SSE2 __strncat_sse2
> > -# define STRCAT_SSE2_UNALIGNED __strncat_sse2_unaligned
> > -# define __GI_STRCAT __GI_strncat
> > -# define __GI___STRCAT __GI___strncat
> > -#else
> > -# define STRCAT_SSSE3 __strcat_ssse3
> > -# define STRCAT_SSE2 __strcat_sse2
> > -# define STRCAT_SSE2_UNALIGNED __strcat_sse2_unaligned
> > -# define __GI_STRCAT __GI_strcat
> > -# define __GI___STRCAT __GI___strcat
> > -#endif
> > -
> > -
> > -/* Define multiple versions only for the definition in libc. */
> > -#ifndef NOT_IN_libc
> > - .text
> > -ENTRY(STRCAT)
> > - .type STRCAT, @gnu_indirect_function
> > - cmpl $0, __cpu_features+KIND_OFFSET(%rip)
> > - jne 1f
> > - call __init_cpu_features
> > -1: leaq STRCAT_SSE2_UNALIGNED(%rip), %rax
> > - testl $bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip)
> > - jnz 2f
> > - leaq STRCAT_SSE2(%rip), %rax
> > - testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
> > - jz 2f
> > - leaq STRCAT_SSSE3(%rip), %rax
> > -2: ret
> > -END(STRCAT)
> > -
> > -# undef ENTRY
> > -# define ENTRY(name) \
> > - .type STRCAT_SSE2, @function; \
> > - .align 16; \
> > - STRCAT_SSE2: cfi_startproc; \
> > - CALL_MCOUNT
> > -# undef END
> > -# define END(name) \
> > - cfi_endproc; .size STRCAT_SSE2, .-STRCAT_SSE2
> > -# undef libc_hidden_builtin_def
> > -/* It doesn't make sense to send libc-internal strcat calls through a PLT.
> > - The speedup we get from using SSSE3 instruction is likely eaten away
> > - by the indirect call in the PLT. */
> > -# define libc_hidden_builtin_def(name) \
> > - .globl __GI_STRCAT; __GI_STRCAT = STRCAT_SSE2
> > -# undef libc_hidden_def
> > -# define libc_hidden_def(name) \
> > - .globl __GI___STRCAT; __GI___STRCAT = STRCAT_SSE2
> > -#endif
> > -
> > -#ifndef USE_AS_STRNCAT
> > -# include "../strcat.S"
> > -#endif
> > diff --git a/sysdeps/x86_64/multiarch/strncat-c.c b/sysdeps/x86_64/multiarch/strncat-c.c
> > deleted file mode 100644
> > index a3cdbff..0000000
> > --- a/sysdeps/x86_64/multiarch/strncat-c.c
> > +++ /dev/null
> > @@ -1,8 +0,0 @@
> > -#define STRNCAT __strncat_sse2
> > -#ifdef SHARED
> > -#undef libc_hidden_def
> > -#define libc_hidden_def(name) \
> > - __hidden_ver1 (__strncat_sse2, __GI___strncat, __strncat_sse2);
> > -#endif
> > -
> > -#include "string/strncat.c"
> > diff --git a/sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S
> > deleted file mode 100644
> > index 133e1d2..0000000
> > --- a/sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S
> > +++ /dev/null
> > @@ -1,3 +0,0 @@
> > -#define USE_AS_STRNCAT
> > -#define STRCAT __strncat_sse2_unaligned
> > -#include "strcat-sse2-unaligned.S"
> > diff --git a/sysdeps/x86_64/multiarch/strncat-ssse3.S b/sysdeps/x86_64/multiarch/strncat-ssse3.S
> > deleted file mode 100644
> > index 6c45ff3..0000000
> > --- a/sysdeps/x86_64/multiarch/strncat-ssse3.S
> > +++ /dev/null
> > @@ -1,3 +0,0 @@
> > -#define USE_AS_STRNCAT
> > -#define STRCAT __strncat_ssse3
> > -#include "strcat-ssse3.S"
> > diff --git a/sysdeps/x86_64/multiarch/strncat.S b/sysdeps/x86_64/multiarch/strncat.S
> > deleted file mode 100644
> > index fd569c2..0000000
> > --- a/sysdeps/x86_64/multiarch/strncat.S
> > +++ /dev/null
> > @@ -1,3 +0,0 @@
> > -#define STRCAT strncat
> > -#define USE_AS_STRNCAT
> > -#include "strcat.S"
> > diff --git a/sysdeps/x86_64/strcat.S b/sysdeps/x86_64/strcat.S
> > deleted file mode 100644
> > index 535a18d..0000000
> > --- a/sysdeps/x86_64/strcat.S
> > +++ /dev/null
> > @@ -1,259 +0,0 @@
> > -/* strcat(dest, src) -- Append SRC on the end of DEST.
> > - Optimized for x86-64.
> > - Copyright (C) 2002 Free Software Foundation, Inc.
> > - This file is part of the GNU C Library.
> > - Contributed by Andreas Jaeger <aj@suse.de>, 2002.
> > -
> > - The GNU C Library is free software; you can redistribute it and/or
> > - modify it under the terms of the GNU Lesser General Public
> > - License as published by the Free Software Foundation; either
> > - version 2.1 of the License, or (at your option) any later version.
> > -
> > - The GNU C Library is distributed in the hope that it will be useful,
> > - but WITHOUT ANY WARRANTY; without even the implied warranty of
> > - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > - Lesser General Public License for more details.
> > -
> > - You should have received a copy of the GNU Lesser General Public
> > - License along with the GNU C Library; if not, see
> > - <http://www.gnu.org/licenses/>. */
> > -
> > -#include <sysdep.h>
> > -#include "asm-syntax.h"
> > -#include "bp-sym.h"
> > -#include "bp-asm.h"
> > -
> > -
> > - .text
> > -ENTRY (BP_SYM (strcat))
> > - movq %rdi, %rcx /* Dest. register. */
> > - andl $7, %ecx /* mask alignment bits */
> > - movq %rdi, %rax /* Duplicate destination pointer. */
> > - movq $0xfefefefefefefeff,%r8
> > -
> > - /* First step: Find end of destination. */
> > - jz 4f /* aligned => start loop */
> > -
> > - neg %ecx /* We need to align to 8 bytes. */
> > - addl $8,%ecx
> > - /* Search the first bytes directly. */
> > -0: cmpb $0x0,(%rax) /* is byte NUL? */
> > - je 2f /* yes => start copy */
> > - incq %rax /* increment pointer */
> > - decl %ecx
> > - jnz 0b
> > -
> > -
> > -
> > - /* Now the source is aligned. Scan for NUL byte. */
> > - .p2align 4
> > -4:
> > - /* First unroll. */
> > - movq (%rax), %rcx /* get double word (= 8 bytes) in question */
> > - addq $8,%rax /* adjust pointer for next word */
> > - movq %r8, %rdx /* magic value */
> > - addq %rcx, %rdx /* add the magic value to the word. We get
> > - carry bits reported for each byte which
> > - is *not* 0 */
> > - jnc 3f /* highest byte is NUL => return pointer */
> > - xorq %rcx, %rdx /* (word+magic)^word */
> > - orq %r8, %rdx /* set all non-carry bits */
> > - incq %rdx /* add 1: if one carry bit was *not* set
> > - the addition will not result in 0. */
> > - jnz 3f /* found NUL => return pointer */
> > -
> > - /* Second unroll. */
> > - movq (%rax), %rcx /* get double word (= 8 bytes) in question */
> > - addq $8,%rax /* adjust pointer for next word */
> > - movq %r8, %rdx /* magic value */
> > - addq %rcx, %rdx /* add the magic value to the word. We get
> > - carry bits reported for each byte which
> > - is *not* 0 */
> > - jnc 3f /* highest byte is NUL => return pointer */
> > - xorq %rcx, %rdx /* (word+magic)^word */
> > - orq %r8, %rdx /* set all non-carry bits */
> > - incq %rdx /* add 1: if one carry bit was *not* set
> > - the addition will not result in 0. */
> > - jnz 3f /* found NUL => return pointer */
> > -
> > - /* Third unroll. */
> > - movq (%rax), %rcx /* get double word (= 8 bytes) in question */
> > - addq $8,%rax /* adjust pointer for next word */
> > - movq %r8, %rdx /* magic value */
> > - addq %rcx, %rdx /* add the magic value to the word. We get
> > - carry bits reported for each byte which
> > - is *not* 0 */
> > - jnc 3f /* highest byte is NUL => return pointer */
> > - xorq %rcx, %rdx /* (word+magic)^word */
> > - orq %r8, %rdx /* set all non-carry bits */
> > - incq %rdx /* add 1: if one carry bit was *not* set
> > - the addition will not result in 0. */
> > - jnz 3f /* found NUL => return pointer */
> > -
> > - /* Fourth unroll. */
> > - movq (%rax), %rcx /* get double word (= 8 bytes) in question */
> > - addq $8,%rax /* adjust pointer for next word */
> > - movq %r8, %rdx /* magic value */
> > - addq %rcx, %rdx /* add the magic value to the word. We get
> > - carry bits reported for each byte which
> > - is *not* 0 */
> > - jnc 3f /* highest byte is NUL => return pointer */
> > - xorq %rcx, %rdx /* (word+magic)^word */
> > - orq %r8, %rdx /* set all non-carry bits */
> > - incq %rdx /* add 1: if one carry bit was *not* set
> > - the addition will not result in 0. */
> > - jz 4b /* no NUL found => continue loop */
> > -
> > - .p2align 4 /* Align, it's a jump target. */
> > -3: subq $8,%rax /* correct pointer increment. */
> > -
> > - testb %cl, %cl /* is first byte NUL? */
> > - jz 2f /* yes => return */
> > - incq %rax /* increment pointer */
> > -
> > - testb %ch, %ch /* is second byte NUL? */
> > - jz 2f /* yes => return */
> > - incq %rax /* increment pointer */
> > -
> > - testl $0x00ff0000, %ecx /* is third byte NUL? */
> > - jz 2f /* yes => return pointer */
> > - incq %rax /* increment pointer */
> > -
> > - testl $0xff000000, %ecx /* is fourth byte NUL? */
> > - jz 2f /* yes => return pointer */
> > - incq %rax /* increment pointer */
> > -
> > - shrq $32, %rcx /* look at other half. */
> > -
> > - testb %cl, %cl /* is first byte NUL? */
> > - jz 2f /* yes => return */
> > - incq %rax /* increment pointer */
> > -
> > - testb %ch, %ch /* is second byte NUL? */
> > - jz 2f /* yes => return */
> > - incq %rax /* increment pointer */
> > -
> > - testl $0xff0000, %ecx /* is third byte NUL? */
> > - jz 2f /* yes => return pointer */
> > - incq %rax /* increment pointer */
> > -
> > -2:
> > - /* Second step: Copy source to destination. */
> > -
> > - movq %rsi, %rcx /* duplicate */
> > - andl $7,%ecx /* mask alignment bits */
> > - movq %rax, %rdx /* move around */
> > - jz 22f /* aligned => start loop */
> > -
> > - neg %ecx /* align to 8 bytes. */
> > - addl $8, %ecx
> > - /* Align the source pointer. */
> > -21:
> > - movb (%rsi), %al /* Fetch a byte */
> > - testb %al, %al /* Is it NUL? */
> > - movb %al, (%rdx) /* Store it */
> > - jz 24f /* If it was NUL, done! */
> > - incq %rsi
> > - incq %rdx
> > - decl %ecx
> > - jnz 21b
> > -
> > - /* Now the sources is aligned. Unfortunatly we cannot force
> > - to have both source and destination aligned, so ignore the
> > - alignment of the destination. */
> > - .p2align 4
> > -22:
> > - /* 1st unroll. */
> > - movq (%rsi), %rax /* Read double word (8 bytes). */
> > - addq $8, %rsi /* Adjust pointer for next word. */
> > - movq %rax, %r9 /* Save a copy for NUL finding. */
> > - addq %r8, %r9 /* add the magic value to the word. We get
> > - carry bits reported for each byte which
> > - is *not* 0 */
> > - jnc 23f /* highest byte is NUL => return pointer */
> > - xorq %rax, %r9 /* (word+magic)^word */
> > - orq %r8, %r9 /* set all non-carry bits */
> > - incq %r9 /* add 1: if one carry bit was *not* set
> > - the addition will not result in 0. */
> > -
> > - jnz 23f /* found NUL => return pointer */
> > -
> > - movq %rax, (%rdx) /* Write value to destination. */
> > - addq $8, %rdx /* Adjust pointer. */
> > -
> > - /* 2nd unroll. */
> > - movq (%rsi), %rax /* Read double word (8 bytes). */
> > - addq $8, %rsi /* Adjust pointer for next word. */
> > - movq %rax, %r9 /* Save a copy for NUL finding. */
> > - addq %r8, %r9 /* add the magic value to the word. We get
> > - carry bits reported for each byte which
> > - is *not* 0 */
> > - jnc 23f /* highest byte is NUL => return pointer */
> > - xorq %rax, %r9 /* (word+magic)^word */
> > - orq %r8, %r9 /* set all non-carry bits */
> > - incq %r9 /* add 1: if one carry bit was *not* set
> > - the addition will not result in 0. */
> > -
> > - jnz 23f /* found NUL => return pointer */
> > -
> > - movq %rax, (%rdx) /* Write value to destination. */
> > - addq $8, %rdx /* Adjust pointer. */
> > -
> > - /* 3rd unroll. */
> > - movq (%rsi), %rax /* Read double word (8 bytes). */
> > - addq $8, %rsi /* Adjust pointer for next word. */
> > - movq %rax, %r9 /* Save a copy for NUL finding. */
> > - addq %r8, %r9 /* add the magic value to the word. We get
> > - carry bits reported for each byte which
> > - is *not* 0 */
> > - jnc 23f /* highest byte is NUL => return pointer */
> > - xorq %rax, %r9 /* (word+magic)^word */
> > - orq %r8, %r9 /* set all non-carry bits */
> > - incq %r9 /* add 1: if one carry bit was *not* set
> > - the addition will not result in 0. */
> > -
> > - jnz 23f /* found NUL => return pointer */
> > -
> > - movq %rax, (%rdx) /* Write value to destination. */
> > - addq $8, %rdx /* Adjust pointer. */
> > -
> > - /* 4th unroll. */
> > - movq (%rsi), %rax /* Read double word (8 bytes). */
> > - addq $8, %rsi /* Adjust pointer for next word. */
> > - movq %rax, %r9 /* Save a copy for NUL finding. */
> > - addq %r8, %r9 /* add the magic value to the word. We get
> > - carry bits reported for each byte which
> > - is *not* 0 */
> > - jnc 23f /* highest byte is NUL => return pointer */
> > - xorq %rax, %r9 /* (word+magic)^word */
> > - orq %r8, %r9 /* set all non-carry bits */
> > - incq %r9 /* add 1: if one carry bit was *not* set
> > - the addition will not result in 0. */
> > -
> > - jnz 23f /* found NUL => return pointer */
> > -
> > - movq %rax, (%rdx) /* Write value to destination. */
> > - addq $8, %rdx /* Adjust pointer. */
> > - jmp 22b /* Next iteration. */
> > -
> > - /* Do the last few bytes. %rax contains the value to write.
> > - The loop is unrolled twice. */
> > - .p2align 4
> > -23:
> > - movb %al, (%rdx) /* 1st byte. */
> > - testb %al, %al /* Is it NUL. */
> > - jz 24f /* yes, finish. */
> > - incq %rdx /* Increment destination. */
> > - movb %ah, (%rdx) /* 2nd byte. */
> > - testb %ah, %ah /* Is it NUL?. */
> > - jz 24f /* yes, finish. */
> > - incq %rdx /* Increment destination. */
> > - shrq $16, %rax /* Shift... */
> > - jmp 23b /* and look at next two bytes in %rax. */
> > -
> > -
> > -24:
> > - movq %rdi, %rax /* Source is return value. */
> > - retq
> > -END (BP_SYM (strcat))
> > -libc_hidden_builtin_def (strcat)
> > --
> > 1.7.4.4
> >
> >
> >
--
Feature was not beta tested