[PATCH v1] x86: Move strcpy SSE2 implementation to multiarch/strcpy-sse2.S

H.J. Lu hjl.tools@gmail.com
Tue Jul 12 23:23:35 GMT 2022


On Tue, Jul 12, 2022 at 12:29 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> This commit doesn't affect libc.so.6; it's just housekeeping to prepare
> for adding explicit ISA level support.
>
> Tested build on x86_64 and x86_32 with/without multiarch.
> ---
>  sysdeps/x86_64/multiarch/rtld-stpcpy.S |  18 ++++
>  sysdeps/x86_64/multiarch/stpcpy-sse2.S |  15 +--
>  sysdeps/x86_64/multiarch/strcpy-sse2.S | 137 ++++++++++++++++++++++--
>  sysdeps/x86_64/stpcpy.S                |   3 +-
>  sysdeps/x86_64/strcpy.S                | 138 +------------------------
>  5 files changed, 156 insertions(+), 155 deletions(-)
>  create mode 100644 sysdeps/x86_64/multiarch/rtld-stpcpy.S
>
> diff --git a/sysdeps/x86_64/multiarch/rtld-stpcpy.S b/sysdeps/x86_64/multiarch/rtld-stpcpy.S
> new file mode 100644
> index 0000000000..914141f07f
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/rtld-stpcpy.S
> @@ -0,0 +1,18 @@
> +/* Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include "../stpcpy.S"
> diff --git a/sysdeps/x86_64/multiarch/stpcpy-sse2.S b/sysdeps/x86_64/multiarch/stpcpy-sse2.S
> index 078504a44e..ea9f973af3 100644
> --- a/sysdeps/x86_64/multiarch/stpcpy-sse2.S
> +++ b/sysdeps/x86_64/multiarch/stpcpy-sse2.S
> @@ -17,17 +17,10 @@
>     <https://www.gnu.org/licenses/>.  */
>
>  #if IS_IN (libc)
> -
> -# include <sysdep.h>
> -# define __stpcpy __stpcpy_sse2
> -
> -# undef weak_alias
> -# define weak_alias(ignored1, ignored2)
> -# undef libc_hidden_def
> -# define libc_hidden_def(__stpcpy)
> -# undef libc_hidden_builtin_def
> -# define libc_hidden_builtin_def(stpcpy)
> +# ifndef STRCPY
> +#  define STRCPY       __stpcpy_sse2
> +# endif
>  #endif
>
>  #define USE_AS_STPCPY
> -#include <sysdeps/x86_64/stpcpy.S>
> +#include "strcpy-sse2.S"
> diff --git a/sysdeps/x86_64/multiarch/strcpy-sse2.S b/sysdeps/x86_64/multiarch/strcpy-sse2.S
> index f37967c441..8b5db8b13d 100644
> --- a/sysdeps/x86_64/multiarch/strcpy-sse2.S
> +++ b/sysdeps/x86_64/multiarch/strcpy-sse2.S
> @@ -17,12 +17,137 @@
>     <https://www.gnu.org/licenses/>.  */
>
>  #if IS_IN (libc)
> +# ifndef STRCPY
> +#  define STRCPY __strcpy_sse2
> +# endif
> +#endif
>
> -# include <sysdep.h>
> -# define strcpy __strcpy_sse2
> +#include <sysdep.h>
>
> -# undef libc_hidden_builtin_def
> -# define libc_hidden_builtin_def(strcpy)
> -#endif
> +       .text
> +ENTRY (STRCPY)
> +       movq %rsi, %rcx         /* Source register. */
> +       andl $7, %ecx           /* mask alignment bits */
> +       movq %rdi, %rdx         /* Duplicate destination pointer.  */
> +
> +       jz 5f                   /* aligned => start loop */
> +
> +       neg %ecx                /* We need to align to 8 bytes.  */
> +       addl $8,%ecx
> +       /* Search the first bytes directly.  */
> +0:
> +       movb    (%rsi), %al     /* Fetch a byte */
> +       testb   %al, %al        /* Is it NUL? */
> +       movb    %al, (%rdx)     /* Store it */
> +       jz      4f              /* If it was NUL, done! */
> +       incq    %rsi
> +       incq    %rdx
> +       decl    %ecx
> +       jnz     0b
> +
> +5:
> +       movq $0xfefefefefefefeff,%r8
> +
> +       /* Now the sources is aligned.  Unfortunatly we cannot force
> +          to have both source and destination aligned, so ignore the
> +          alignment of the destination.  */
> +       .p2align 4
> +1:
> +       /* 1st unroll.  */
> +       movq    (%rsi), %rax    /* Read double word (8 bytes).  */
> +       addq    $8, %rsi        /* Adjust pointer for next word.  */
> +       movq    %rax, %r9       /* Save a copy for NUL finding.  */
> +       addq    %r8, %r9        /* add the magic value to the word.  We get
> +                                  carry bits reported for each byte which
> +                                  is *not* 0 */
> +       jnc     3f              /* highest byte is NUL => return pointer */
> +       xorq    %rax, %r9       /* (word+magic)^word */
> +       orq     %r8, %r9        /* set all non-carry bits */
> +       incq    %r9             /* add 1: if one carry bit was *not* set
> +                                  the addition will not result in 0.  */
> +
> +       jnz     3f              /* found NUL => return pointer */
> +
> +       movq    %rax, (%rdx)    /* Write value to destination.  */
> +       addq    $8, %rdx        /* Adjust pointer.  */
> +
> +       /* 2nd unroll.  */
> +       movq    (%rsi), %rax    /* Read double word (8 bytes).  */
> +       addq    $8, %rsi        /* Adjust pointer for next word.  */
> +       movq    %rax, %r9       /* Save a copy for NUL finding.  */
> +       addq    %r8, %r9        /* add the magic value to the word.  We get
> +                                  carry bits reported for each byte which
> +                                  is *not* 0 */
> +       jnc     3f              /* highest byte is NUL => return pointer */
> +       xorq    %rax, %r9       /* (word+magic)^word */
> +       orq     %r8, %r9        /* set all non-carry bits */
> +       incq    %r9             /* add 1: if one carry bit was *not* set
> +                                  the addition will not result in 0.  */
> +
> +       jnz     3f              /* found NUL => return pointer */
>
> -#include <sysdeps/x86_64/strcpy.S>
> +       movq    %rax, (%rdx)    /* Write value to destination.  */
> +       addq    $8, %rdx        /* Adjust pointer.  */
> +
> +       /* 3rd unroll.  */
> +       movq    (%rsi), %rax    /* Read double word (8 bytes).  */
> +       addq    $8, %rsi        /* Adjust pointer for next word.  */
> +       movq    %rax, %r9       /* Save a copy for NUL finding.  */
> +       addq    %r8, %r9        /* add the magic value to the word.  We get
> +                                  carry bits reported for each byte which
> +                                  is *not* 0 */
> +       jnc     3f              /* highest byte is NUL => return pointer */
> +       xorq    %rax, %r9       /* (word+magic)^word */
> +       orq     %r8, %r9        /* set all non-carry bits */
> +       incq    %r9             /* add 1: if one carry bit was *not* set
> +                                  the addition will not result in 0.  */
> +
> +       jnz     3f              /* found NUL => return pointer */
> +
> +       movq    %rax, (%rdx)    /* Write value to destination.  */
> +       addq    $8, %rdx        /* Adjust pointer.  */
> +
> +       /* 4th unroll.  */
> +       movq    (%rsi), %rax    /* Read double word (8 bytes).  */
> +       addq    $8, %rsi        /* Adjust pointer for next word.  */
> +       movq    %rax, %r9       /* Save a copy for NUL finding.  */
> +       addq    %r8, %r9        /* add the magic value to the word.  We get
> +                                  carry bits reported for each byte which
> +                                  is *not* 0 */
> +       jnc     3f              /* highest byte is NUL => return pointer */
> +       xorq    %rax, %r9       /* (word+magic)^word */
> +       orq     %r8, %r9        /* set all non-carry bits */
> +       incq    %r9             /* add 1: if one carry bit was *not* set
> +                                  the addition will not result in 0.  */
> +
> +       jnz     3f              /* found NUL => return pointer */
> +
> +       movq    %rax, (%rdx)    /* Write value to destination.  */
> +       addq    $8, %rdx        /* Adjust pointer.  */
> +       jmp     1b              /* Next iteration.  */
> +
> +       /* Do the last few bytes. %rax contains the value to write.
> +          The loop is unrolled twice.  */
> +       .p2align 4
> +3:
> +       /* Note that stpcpy needs to return with the value of the NUL
> +          byte.  */
> +       movb    %al, (%rdx)     /* 1st byte.  */
> +       testb   %al, %al        /* Is it NUL.  */
> +       jz      4f              /* yes, finish.  */
> +       incq    %rdx            /* Increment destination.  */
> +       movb    %ah, (%rdx)     /* 2nd byte.  */
> +       testb   %ah, %ah        /* Is it NUL?.  */
> +       jz      4f              /* yes, finish.  */
> +       incq    %rdx            /* Increment destination.  */
> +       shrq    $16, %rax       /* Shift...  */
> +       jmp     3b              /* and look at next two bytes in %rax.  */
> +
> +4:
> +#ifdef USE_AS_STPCPY
> +       movq    %rdx, %rax      /* Destination is return value.  */
> +#else
> +       movq    %rdi, %rax      /* Source is return value.  */
> +#endif
> +       retq
> +END (STRCPY)
> diff --git a/sysdeps/x86_64/stpcpy.S b/sysdeps/x86_64/stpcpy.S
> index ec23de1416..b097c203dd 100644
> --- a/sysdeps/x86_64/stpcpy.S
> +++ b/sysdeps/x86_64/stpcpy.S
> @@ -1,7 +1,6 @@
> -#define USE_AS_STPCPY
>  #define STRCPY __stpcpy
>
> -#include <sysdeps/x86_64/strcpy.S>
> +#include "multiarch/stpcpy-sse2.S"
>
>  weak_alias (__stpcpy, stpcpy)
>  libc_hidden_def (__stpcpy)
> diff --git a/sysdeps/x86_64/strcpy.S b/sysdeps/x86_64/strcpy.S
> index 17e8073550..05f19e6e94 100644
> --- a/sysdeps/x86_64/strcpy.S
> +++ b/sysdeps/x86_64/strcpy.S
> @@ -16,140 +16,6 @@
>     License along with the GNU C Library; if not, see
>     <https://www.gnu.org/licenses/>.  */
>
> -#include <sysdep.h>
> -#include "asm-syntax.h"
> -
> -#ifndef USE_AS_STPCPY
> -# define STRCPY strcpy
> -#endif
> -
> -       .text
> -ENTRY (STRCPY)
> -       movq %rsi, %rcx         /* Source register. */
> -       andl $7, %ecx           /* mask alignment bits */
> -       movq %rdi, %rdx         /* Duplicate destination pointer.  */
> -
> -       jz 5f                   /* aligned => start loop */
> -
> -       neg %ecx                /* We need to align to 8 bytes.  */
> -       addl $8,%ecx
> -       /* Search the first bytes directly.  */
> -0:
> -       movb    (%rsi), %al     /* Fetch a byte */
> -       testb   %al, %al        /* Is it NUL? */
> -       movb    %al, (%rdx)     /* Store it */
> -       jz      4f              /* If it was NUL, done! */
> -       incq    %rsi
> -       incq    %rdx
> -       decl    %ecx
> -       jnz     0b
> -
> -5:
> -       movq $0xfefefefefefefeff,%r8
> -
> -       /* Now the sources is aligned.  Unfortunatly we cannot force
> -          to have both source and destination aligned, so ignore the
> -          alignment of the destination.  */
> -       .p2align 4
> -1:
> -       /* 1st unroll.  */
> -       movq    (%rsi), %rax    /* Read double word (8 bytes).  */
> -       addq    $8, %rsi        /* Adjust pointer for next word.  */
> -       movq    %rax, %r9       /* Save a copy for NUL finding.  */
> -       addq    %r8, %r9        /* add the magic value to the word.  We get
> -                                  carry bits reported for each byte which
> -                                  is *not* 0 */
> -       jnc     3f              /* highest byte is NUL => return pointer */
> -       xorq    %rax, %r9       /* (word+magic)^word */
> -       orq     %r8, %r9        /* set all non-carry bits */
> -       incq    %r9             /* add 1: if one carry bit was *not* set
> -                                  the addition will not result in 0.  */
> -
> -       jnz     3f              /* found NUL => return pointer */
> -
> -       movq    %rax, (%rdx)    /* Write value to destination.  */
> -       addq    $8, %rdx        /* Adjust pointer.  */
> -
> -       /* 2nd unroll.  */
> -       movq    (%rsi), %rax    /* Read double word (8 bytes).  */
> -       addq    $8, %rsi        /* Adjust pointer for next word.  */
> -       movq    %rax, %r9       /* Save a copy for NUL finding.  */
> -       addq    %r8, %r9        /* add the magic value to the word.  We get
> -                                  carry bits reported for each byte which
> -                                  is *not* 0 */
> -       jnc     3f              /* highest byte is NUL => return pointer */
> -       xorq    %rax, %r9       /* (word+magic)^word */
> -       orq     %r8, %r9        /* set all non-carry bits */
> -       incq    %r9             /* add 1: if one carry bit was *not* set
> -                                  the addition will not result in 0.  */
> -
> -       jnz     3f              /* found NUL => return pointer */
> -
> -       movq    %rax, (%rdx)    /* Write value to destination.  */
> -       addq    $8, %rdx        /* Adjust pointer.  */
> -
> -       /* 3rd unroll.  */
> -       movq    (%rsi), %rax    /* Read double word (8 bytes).  */
> -       addq    $8, %rsi        /* Adjust pointer for next word.  */
> -       movq    %rax, %r9       /* Save a copy for NUL finding.  */
> -       addq    %r8, %r9        /* add the magic value to the word.  We get
> -                                  carry bits reported for each byte which
> -                                  is *not* 0 */
> -       jnc     3f              /* highest byte is NUL => return pointer */
> -       xorq    %rax, %r9       /* (word+magic)^word */
> -       orq     %r8, %r9        /* set all non-carry bits */
> -       incq    %r9             /* add 1: if one carry bit was *not* set
> -                                  the addition will not result in 0.  */
> -
> -       jnz     3f              /* found NUL => return pointer */
> -
> -       movq    %rax, (%rdx)    /* Write value to destination.  */
> -       addq    $8, %rdx        /* Adjust pointer.  */
> -
> -       /* 4th unroll.  */
> -       movq    (%rsi), %rax    /* Read double word (8 bytes).  */
> -       addq    $8, %rsi        /* Adjust pointer for next word.  */
> -       movq    %rax, %r9       /* Save a copy for NUL finding.  */
> -       addq    %r8, %r9        /* add the magic value to the word.  We get
> -                                  carry bits reported for each byte which
> -                                  is *not* 0 */
> -       jnc     3f              /* highest byte is NUL => return pointer */
> -       xorq    %rax, %r9       /* (word+magic)^word */
> -       orq     %r8, %r9        /* set all non-carry bits */
> -       incq    %r9             /* add 1: if one carry bit was *not* set
> -                                  the addition will not result in 0.  */
> -
> -       jnz     3f              /* found NUL => return pointer */
> -
> -       movq    %rax, (%rdx)    /* Write value to destination.  */
> -       addq    $8, %rdx        /* Adjust pointer.  */
> -       jmp     1b              /* Next iteration.  */
> -
> -       /* Do the last few bytes. %rax contains the value to write.
> -          The loop is unrolled twice.  */
> -       .p2align 4
> -3:
> -       /* Note that stpcpy needs to return with the value of the NUL
> -          byte.  */
> -       movb    %al, (%rdx)     /* 1st byte.  */
> -       testb   %al, %al        /* Is it NUL.  */
> -       jz      4f              /* yes, finish.  */
> -       incq    %rdx            /* Increment destination.  */
> -       movb    %ah, (%rdx)     /* 2nd byte.  */
> -       testb   %ah, %ah        /* Is it NUL?.  */
> -       jz      4f              /* yes, finish.  */
> -       incq    %rdx            /* Increment destination.  */
> -       shrq    $16, %rax       /* Shift...  */
> -       jmp     3b              /* and look at next two bytes in %rax.  */
> -
> -4:
> -#ifdef USE_AS_STPCPY
> -       movq    %rdx, %rax      /* Destination is return value.  */
> -#else
> -       movq    %rdi, %rax      /* Source is return value.  */
> -#endif
> -       retq
> -END (STRCPY)
> -#ifndef USE_AS_STPCPY
> +#define STRCPY strcpy
> +#include "multiarch/strcpy-sse2.S"
>  libc_hidden_builtin_def (strcpy)
> -#endif
> --
> 2.34.1
>

LGTM.

Thanks.

-- 
H.J.


More information about the Libc-alpha mailing list