[PATCH v2 3/4] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-avx2 functions
Noah Goldstein
goldstein.w.n@gmail.com
Fri Nov 4 20:21:21 GMT 2022
On Fri, Nov 4, 2022 at 9:45 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Fri, Nov 4, 2022 at 1:20 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > Optimizations are:
> > 1. Use more overlapping stores to avoid branches.
> > 2. Reduce how unrolled the aligning copies are (this is more of a
> >    code-size save; it's a negative for some sizes in terms of
> >    perf).
> > 3. For st{r|p}n{cat|cpy} re-order the branches to minimize the
> > number that are taken.
> >
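To illustrate item 1: "overlapping stores" here means covering a
variable-sized copy with a fixed number of loads/stores whose ranges may
overlap, so no per-size branching is needed. A minimal C sketch of the idea
(illustrative only -- the function name is made up and this is not code from
the patch):

    #include <string.h>
    #include <stdint.h>
    #include <stddef.h>

    /* Copy n bytes, 4 <= n <= 8, with two possibly-overlapping 4-byte
       accesses instead of branching on each individual size.  */
    static void
    copy_4_to_8 (char *dst, const char *src, size_t n)
    {
      uint32_t head, tail;
      memcpy (&head, src, 4);          /* first 4 bytes */
      memcpy (&tail, src + n - 4, 4);  /* last 4 bytes, may overlap head */
      memcpy (dst, &head, 4);
      memcpy (dst + n - 4, &tail, 4);
    }

The L(copy_4_7)/L(copy_8_15)/L(copy_16_31) paths further down are the
assembly equivalent of this pattern at different widths.
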
> > Performance Changes:
> >
> > Times are from N = 10 runs of the benchmark suite and are
> > reported as the geometric mean of all ratios of
> > New Implementation / Old Implementation.
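(Assuming the standard aggregation, each number below is

    \mathrm{geomean} = \Bigl( \prod_{i=1}^{N} t_i^{\mathrm{new}} / t_i^{\mathrm{old}} \Bigr)^{1/N}

over the benchmark inputs, so values below 1.0 mean the new implementation
is faster.)
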
> >
> > strcat-avx2 -> 0.998
> > strcpy-avx2 -> 0.937
> > stpcpy-avx2 -> 0.971
> >
> > strncpy-avx2 -> 0.793
> > stpncpy-avx2 -> 0.775
> >
> > strncat-avx2 -> 0.962
> >
> > Code Size Changes:
> > function -> Bytes New / Bytes Old -> Ratio
> >
> > strcat-avx2 -> 685 / 1639 -> 0.418
> > strcpy-avx2 -> 560 / 903 -> 0.620
> > stpcpy-avx2 -> 592 / 939 -> 0.630
> >
> > strncpy-avx2 -> 1176 / 2390 -> 0.492
> > stpncpy-avx2 -> 1268 / 2438 -> 0.520
> >
> > strncat-avx2 -> 1042 / 2563 -> 0.407
> >
> > Notes:
> > 1. Because of the significant difference between the
> > implementations they are split into three files.
> >
> > strcpy-evex.S -> strcpy, stpcpy, strcat
> > strncpy-evex.S -> strncpy
> >    strncat-evex.S -> strncat
> >
> > I couldn't find a way to merge them without making the
> > ifdefs incredibly difficult to follow.
> >
> > 2. All implementations can be made evex512 by including
> > "x86-evex512-vecs.h" at the top.
>
> These comments are wrong for AVX2 implementations.
Sorry, fixed in V3.
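(For reference, with the corrected -avx2 names the split in note 1 leaves
each wrapper file as little more than a couple of defines plus an include;
e.g. the stpncpy-avx2.S hunk below essentially becomes:

    #define USE_AS_STPCPY
    #define STRNCPY STPNCPY
    #include "strncpy-avx2.S"

with strncpy-avx2.S carrying the shared body.)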
>
> > Full check passes on x86-64 and build succeeds for all ISA levels w/
> > and w/o multiarch.
> >
> > Fix avx2
>
> Strayed comments?
>
Yes. Fixed in V3.
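One structural note that may help when reviewing the three new bodies: they
all share the same terminator-scanning core -- four aligned vector loads,
VPMIN to fold them into one vector, then a single compare against zero. A
rough C-intrinsics sketch of that idea (illustrative only; the helper name
is made up and the real code below is hand-written assembly):

    #include <immintrin.h>

    /* Nonzero iff any of the 4 x 32-byte vectors at s (32-byte aligned)
       contains a zero byte.  One compare/movemask covers all 128 bytes
       because the unsigned byte-minimum has a zero byte iff some input
       vector does.  */
    static inline int
    has_null_in_4x_vec (const char *s)
    {
      __m256i v0 = _mm256_load_si256 ((const __m256i *) (s + 0));
      __m256i v1 = _mm256_load_si256 ((const __m256i *) (s + 32));
      __m256i v2 = _mm256_load_si256 ((const __m256i *) (s + 64));
      __m256i v3 = _mm256_load_si256 ((const __m256i *) (s + 96));
      __m256i min = _mm256_min_epu8 (_mm256_min_epu8 (v0, v1),
                                     _mm256_min_epu8 (v2, v3));
      return _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (min,
                                                      _mm256_setzero_si256 ()));
    }

Once this reports a match, the tail code re-checks the individual vectors
with vpmovmskb/bsf to locate the exact terminator, which is what the
L(loop_4x_done) blocks do.
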
> > ---
> > sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S | 6 +-
> > sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S | 7 +-
> > sysdeps/x86_64/multiarch/stpncpy-avx2.S | 5 +-
> > sysdeps/x86_64/multiarch/strcat-avx2-rtm.S | 13 +-
> > sysdeps/x86_64/multiarch/strcat-avx2.S | 268 +---
> > sysdeps/x86_64/multiarch/strcat-strlen-avx2.S | 76 +
> > sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S | 13 +-
> > sysdeps/x86_64/multiarch/strcpy-avx2.S | 1236 +++++------------
> > sysdeps/x86_64/multiarch/strncat-avx2-rtm.S | 6 +-
> > sysdeps/x86_64/multiarch/strncat-avx2.S | 424 +++++-
> > sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S | 6 +-
> > sysdeps/x86_64/multiarch/strncpy-avx2.S | 740 +++++++++-
> > sysdeps/x86_64/multiarch/x86-avx-vecs.h | 5 +-
> > sysdeps/x86_64/multiarch/x86-avx2-rtm-vecs.h | 26 +
> > sysdeps/x86_64/multiarch/x86-avx2-vecs.h | 27 +
> > 15 files changed, 1624 insertions(+), 1234 deletions(-)
> > create mode 100644 sysdeps/x86_64/multiarch/strcat-strlen-avx2.S
> > create mode 100644 sysdeps/x86_64/multiarch/x86-avx2-rtm-vecs.h
> > create mode 100644 sysdeps/x86_64/multiarch/x86-avx2-vecs.h
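One more recurring pattern worth noting before the hunks: the first
unaligned VEC-sized load is guarded by a page-cross check so that a 32-byte
read can never fault past the source string's page, and the cold page-cross
path in strcpy then uses `rep movsb` purely to keep that code small. The
guard is roughly this (a sketch, not code from the patch):

    #include <stdint.h>

    #define PAGE_SIZE 4096
    #define VEC_SIZE  32

    /* Nonzero if a VEC_SIZE-byte load at src could touch the next page.  */
    static inline int
    may_cross_page (const void *src)
    {
      return ((uintptr_t) src & (PAGE_SIZE - 1)) > (PAGE_SIZE - VEC_SIZE);
    }
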
> >
> > diff --git a/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S
> > index 2b9c07a59f..189a288053 100644
> > --- a/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S
> > +++ b/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S
> > @@ -1,3 +1,3 @@
> > -#define USE_AS_STPCPY
> > -#define STRCPY __stpcpy_avx2_rtm
> > -#include "strcpy-avx2-rtm.S"
> > +#define STPCPY __stpcpy_avx2_rtm
> > +#include "x86-avx2-rtm-vecs.h"
> > +#include "stpcpy-avx2.S"
> > diff --git a/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S
> > index 60a2ccfe53..1b252985e7 100644
> > --- a/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S
> > +++ b/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S
> > @@ -1,4 +1,3 @@
> > -#define USE_AS_STPCPY
> > -#define USE_AS_STRNCPY
> > -#define STRCPY __stpncpy_avx2_rtm
> > -#include "strcpy-avx2-rtm.S"
> > +#define STPNCPY __stpncpy_avx2_rtm
> > +#include "x86-avx2-rtm-vecs.h"
> > +#include "stpncpy-avx2.S"
> > diff --git a/sysdeps/x86_64/multiarch/stpncpy-avx2.S b/sysdeps/x86_64/multiarch/stpncpy-avx2.S
> > index b2f8c19143..a46a8edbe2 100644
> > --- a/sysdeps/x86_64/multiarch/stpncpy-avx2.S
> > +++ b/sysdeps/x86_64/multiarch/stpncpy-avx2.S
> > @@ -3,6 +3,5 @@
> > #endif
> >
> > #define USE_AS_STPCPY
> > -#define USE_AS_STRNCPY
> > -#define STRCPY STPNCPY
> > -#include "strcpy-avx2.S"
> > +#define STRNCPY STPNCPY
> > +#include "strncpy-avx2.S"
> > diff --git a/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S
> > index 637fb557c4..94d51d10bd 100644
> > --- a/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S
> > +++ b/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S
> > @@ -1,12 +1,3 @@
> > -#ifndef STRCAT
> > -# define STRCAT __strcat_avx2_rtm
> > -#endif
> > -
> > -#define ZERO_UPPER_VEC_REGISTERS_RETURN \
> > - ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
> > -
> > -#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
> > -
> > -#define SECTION(p) p##.avx.rtm
> > -
> > +#define STRCAT __strcat_avx2_rtm
> > +#include "x86-avx2-rtm-vecs.h"
> > #include "strcat-avx2.S"
> > diff --git a/sysdeps/x86_64/multiarch/strcat-avx2.S b/sysdeps/x86_64/multiarch/strcat-avx2.S
> > index d9b7fb2a43..3f914fa342 100644
> > --- a/sysdeps/x86_64/multiarch/strcat-avx2.S
> > +++ b/sysdeps/x86_64/multiarch/strcat-avx2.S
> > @@ -16,266 +16,10 @@
> > License along with the GNU C Library; if not, see
> > <https://www.gnu.org/licenses/>. */
> >
> > -#include <isa-level.h>
> > -
> > -#if ISA_SHOULD_BUILD (3)
> > -
> > -
> > -# include <sysdep.h>
> > -
> > -# ifndef STRCAT
> > -# define STRCAT __strcat_avx2
> > -# endif
> > -
> > -# define USE_AS_STRCAT
> > -
> > -/* Number of bytes in a vector register */
> > -# define VEC_SIZE 32
> > -
> > -# ifndef SECTION
> > -# define SECTION(p) p##.avx
> > -# endif
> > -
> > - .section SECTION(.text),"ax",@progbits
> > -ENTRY (STRCAT)
> > - mov %rdi, %r9
> > -# ifdef USE_AS_STRNCAT
> > - mov %rdx, %r8
> > -# endif
> > -
> > - xor %eax, %eax
> > - mov %edi, %ecx
> > - and $((VEC_SIZE * 4) - 1), %ecx
> > - vpxor %xmm6, %xmm6, %xmm6
> > - cmp $(VEC_SIZE * 3), %ecx
> > - ja L(fourth_vector_boundary)
> > - vpcmpeqb (%rdi), %ymm6, %ymm0
> > - vpmovmskb %ymm0, %edx
> > - test %edx, %edx
> > - jnz L(exit_null_on_first_vector)
> > - mov %rdi, %rax
> > - and $-VEC_SIZE, %rax
> > - jmp L(align_vec_size_start)
> > -L(fourth_vector_boundary):
> > - mov %rdi, %rax
> > - and $-VEC_SIZE, %rax
> > - vpcmpeqb (%rax), %ymm6, %ymm0
> > - mov $-1, %r10d
> > - sub %rax, %rcx
> > - shl %cl, %r10d
> > - vpmovmskb %ymm0, %edx
> > - and %r10d, %edx
> > - jnz L(exit)
> > -
> > -L(align_vec_size_start):
> > - vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm0
> > - vpmovmskb %ymm0, %edx
> > - test %edx, %edx
> > - jnz L(exit_null_on_second_vector)
> > -
> > - vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
> > - vpmovmskb %ymm1, %edx
> > - test %edx, %edx
> > - jnz L(exit_null_on_third_vector)
> > -
> > - vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
> > - vpmovmskb %ymm2, %edx
> > - test %edx, %edx
> > - jnz L(exit_null_on_fourth_vector)
> > -
> > - vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
> > - vpmovmskb %ymm3, %edx
> > - test %edx, %edx
> > - jnz L(exit_null_on_fifth_vector)
> > -
> > - vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
> > - add $(VEC_SIZE * 4), %rax
> > - vpmovmskb %ymm0, %edx
> > - test %edx, %edx
> > - jnz L(exit_null_on_second_vector)
> > -
> > - vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
> > - vpmovmskb %ymm1, %edx
> > - test %edx, %edx
> > - jnz L(exit_null_on_third_vector)
> > -
> > - vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
> > - vpmovmskb %ymm2, %edx
> > - test %edx, %edx
> > - jnz L(exit_null_on_fourth_vector)
> > -
> > - vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
> > - vpmovmskb %ymm3, %edx
> > - test %edx, %edx
> > - jnz L(exit_null_on_fifth_vector)
> > -
> > - vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
> > - add $(VEC_SIZE * 4), %rax
> > - vpmovmskb %ymm0, %edx
> > - test %edx, %edx
> > - jnz L(exit_null_on_second_vector)
> > -
> > - vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
> > - vpmovmskb %ymm1, %edx
> > - test %edx, %edx
> > - jnz L(exit_null_on_third_vector)
> > -
> > - vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
> > - vpmovmskb %ymm2, %edx
> > - test %edx, %edx
> > - jnz L(exit_null_on_fourth_vector)
> > -
> > - vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
> > - vpmovmskb %ymm3, %edx
> > - test %edx, %edx
> > - jnz L(exit_null_on_fifth_vector)
> > -
> > - vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
> > - add $(VEC_SIZE * 4), %rax
> > - vpmovmskb %ymm0, %edx
> > - test %edx, %edx
> > - jnz L(exit_null_on_second_vector)
> > -
> > - vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
> > - vpmovmskb %ymm1, %edx
> > - test %edx, %edx
> > - jnz L(exit_null_on_third_vector)
> > -
> > - vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
> > - vpmovmskb %ymm2, %edx
> > - test %edx, %edx
> > - jnz L(exit_null_on_fourth_vector)
> > -
> > - vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
> > - vpmovmskb %ymm3, %edx
> > - test %edx, %edx
> > - jnz L(exit_null_on_fifth_vector)
> > -
> > - test $((VEC_SIZE * 4) - 1), %rax
> > - jz L(align_four_vec_loop)
> > -
> > - vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
> > - add $(VEC_SIZE * 5), %rax
> > - vpmovmskb %ymm0, %edx
> > - test %edx, %edx
> > - jnz L(exit)
> > -
> > - test $((VEC_SIZE * 4) - 1), %rax
> > - jz L(align_four_vec_loop)
> > -
> > - vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm1
> > - add $VEC_SIZE, %rax
> > - vpmovmskb %ymm1, %edx
> > - test %edx, %edx
> > - jnz L(exit)
> > -
> > - test $((VEC_SIZE * 4) - 1), %rax
> > - jz L(align_four_vec_loop)
> > -
> > - vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm2
> > - add $VEC_SIZE, %rax
> > - vpmovmskb %ymm2, %edx
> > - test %edx, %edx
> > - jnz L(exit)
> > -
> > - test $((VEC_SIZE * 4) - 1), %rax
> > - jz L(align_four_vec_loop)
> > -
> > - vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm3
> > - add $VEC_SIZE, %rax
> > - vpmovmskb %ymm3, %edx
> > - test %edx, %edx
> > - jnz L(exit)
> > -
> > - add $VEC_SIZE, %rax
> > -
> > - .p2align 4
> > -L(align_four_vec_loop):
> > - vmovaps (%rax), %ymm4
> > - vpminub VEC_SIZE(%rax), %ymm4, %ymm4
> > - vmovaps (VEC_SIZE * 2)(%rax), %ymm5
> > - vpminub (VEC_SIZE * 3)(%rax), %ymm5, %ymm5
> > - add $(VEC_SIZE * 4), %rax
> > - vpminub %ymm4, %ymm5, %ymm5
> > - vpcmpeqb %ymm5, %ymm6, %ymm5
> > - vpmovmskb %ymm5, %edx
> > - test %edx, %edx
> > - jz L(align_four_vec_loop)
> > -
> > - vpcmpeqb -(VEC_SIZE * 4)(%rax), %ymm6, %ymm0
> > - sub $(VEC_SIZE * 5), %rax
> > - vpmovmskb %ymm0, %edx
> > - test %edx, %edx
> > - jnz L(exit_null_on_second_vector)
> > -
> > - vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
> > - vpmovmskb %ymm1, %edx
> > - test %edx, %edx
> > - jnz L(exit_null_on_third_vector)
> > -
> > - vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
> > - vpmovmskb %ymm2, %edx
> > - test %edx, %edx
> > - jnz L(exit_null_on_fourth_vector)
> > -
> > - vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
> > - vpmovmskb %ymm3, %edx
> > - sub %rdi, %rax
> > - bsf %rdx, %rdx
> > - add %rdx, %rax
> > - add $(VEC_SIZE * 4), %rax
> > - jmp L(StartStrcpyPart)
> > -
> > - .p2align 4
> > -L(exit):
> > - sub %rdi, %rax
> > -L(exit_null_on_first_vector):
> > - bsf %rdx, %rdx
> > - add %rdx, %rax
> > - jmp L(StartStrcpyPart)
> > -
> > - .p2align 4
> > -L(exit_null_on_second_vector):
> > - sub %rdi, %rax
> > - bsf %rdx, %rdx
> > - add %rdx, %rax
> > - add $VEC_SIZE, %rax
> > - jmp L(StartStrcpyPart)
> > -
> > - .p2align 4
> > -L(exit_null_on_third_vector):
> > - sub %rdi, %rax
> > - bsf %rdx, %rdx
> > - add %rdx, %rax
> > - add $(VEC_SIZE * 2), %rax
> > - jmp L(StartStrcpyPart)
> > -
> > - .p2align 4
> > -L(exit_null_on_fourth_vector):
> > - sub %rdi, %rax
> > - bsf %rdx, %rdx
> > - add %rdx, %rax
> > - add $(VEC_SIZE * 3), %rax
> > - jmp L(StartStrcpyPart)
> > -
> > - .p2align 4
> > -L(exit_null_on_fifth_vector):
> > - sub %rdi, %rax
> > - bsf %rdx, %rdx
> > - add %rdx, %rax
> > - add $(VEC_SIZE * 4), %rax
> > -
> > - .p2align 4
> > -L(StartStrcpyPart):
> > - lea (%r9, %rax), %rdi
> > - mov %rsi, %rcx
> > - mov %r9, %rax /* save result */
> > -
> > -# ifdef USE_AS_STRNCAT
> > - test %r8, %r8
> > - jz L(ExitZero)
> > -# define USE_AS_STRNCPY
> > -# endif
> > -
> > -# include "strcpy-avx2.S"
> > +#ifndef STRCAT
> > +# define STRCAT __strcat_avx2
> > #endif
> > +
> > +#define USE_AS_STRCAT
> > +#define STRCPY STRCAT
> > +#include "strcpy-avx2.S"
> > diff --git a/sysdeps/x86_64/multiarch/strcat-strlen-avx2.S b/sysdeps/x86_64/multiarch/strcat-strlen-avx2.S
> > new file mode 100644
> > index 0000000000..128a45b6ff
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/strcat-strlen-avx2.S
>
> Missing copyright notice.
Fixed in V3.
>
> > @@ -0,0 +1,76 @@
> > + /* Simple strlen implementation that ends at L(strcat_strlen_done). */
> > + movq %rdi, %r8
> > + andq $(VEC_SIZE * -1), %r8
> > + VPCMPEQ (%r8), %VZERO, %VMM(0)
>
>
> > + vpmovmskb %VMM(0), %ecx
> > + shrxl %edi, %ecx, %ecx
> > + testl %ecx, %ecx
> > + jnz L(bsf_and_done_v0)
> > +
> > + VPCMPEQ VEC_SIZE(%r8), %VZERO, %VMM(0)
> > + vpmovmskb %VMM(0), %ecx
> > + leaq (VEC_SIZE)(%r8), %rdi
> > + testl %ecx, %ecx
> > + jnz L(bsf_and_done_v0)
> > +
> > + VPCMPEQ (VEC_SIZE * 1)(%rdi), %VZERO, %VMM(0)
> > + vpmovmskb %VMM(0), %ecx
> > + testl %ecx, %ecx
> > + jnz L(bsf_and_done_v1)
> > +
> > + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %VMM(0)
> > + vpmovmskb %VMM(0), %ecx
> > + testl %ecx, %ecx
> > + jnz L(bsf_and_done_v2)
> > +
> > + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %VMM(0)
> > + vpmovmskb %VMM(0), %ecx
> > + testl %ecx, %ecx
> > + jnz L(bsf_and_done_v3)
> > +
> > + orq $(VEC_SIZE * 4 - 1), %rdi
> > + .p2align 4,, 8
> > +L(loop_2x_vec):
> > + VMOVA (VEC_SIZE * 0 + 1)(%rdi), %VMM(0)
> > + VPMIN (VEC_SIZE * 1 + 1)(%rdi), %VMM(0), %VMM(1)
> > + VMOVA (VEC_SIZE * 2 + 1)(%rdi), %VMM(2)
> > + VPMIN (VEC_SIZE * 3 + 1)(%rdi), %VMM(2), %VMM(3)
> > + VPMIN %VMM(1), %VMM(3), %VMM(3)
> > + VPCMPEQ %VMM(3), %VZERO, %VMM(3)
> > + vpmovmskb %VMM(3), %r8d
> > + subq $(VEC_SIZE * -4), %rdi
> > + testl %r8d, %r8d
> > + jz L(loop_2x_vec)
> > +
> > + addq $(VEC_SIZE * -4 + 1), %rdi
> > +
> > + VPCMPEQ %VMM(0), %VZERO, %VMM(0)
> > + vpmovmskb %VMM(0), %ecx
> > + testl %ecx, %ecx
> > + jnz L(bsf_and_done_v0)
> > +
> > + VPCMPEQ %VMM(1), %VZERO, %VMM(1)
> > + vpmovmskb %VMM(1), %ecx
> > + testl %ecx, %ecx
> > + jnz L(bsf_and_done_v1)
> > +
> > + VPCMPEQ %VMM(2), %VZERO, %VMM(2)
> > + vpmovmskb %VMM(2), %ecx
> > + testl %ecx, %ecx
> > + jnz L(bsf_and_done_v2)
> > +
> > + movl %r8d, %ecx
> > +L(bsf_and_done_v3):
> > + addq $VEC_SIZE, %rdi
> > +L(bsf_and_done_v2):
> > + bsfl %ecx, %ecx
> > + leaq (VEC_SIZE * 2)(%rdi, %rcx), %rdi
> > + jmp L(strcat_strlen_done)
> > +
> > + .p2align 4,, 4
> > +L(bsf_and_done_v1):
> > + addq $VEC_SIZE, %rdi
> > +L(bsf_and_done_v0):
> > + bsfl %ecx, %ecx
> > + addq %rcx, %rdi
> > +L(strcat_strlen_done):
> > diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S
> > index c2c581ecf7..fe80ffd265 100644
> > --- a/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S
> > +++ b/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S
> > @@ -1,12 +1,3 @@
> > -#ifndef STRCPY
> > -# define STRCPY __strcpy_avx2_rtm
> > -#endif
> > -
> > -#define ZERO_UPPER_VEC_REGISTERS_RETURN \
> > - ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
> > -
> > -#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
> > -
> > -#define SECTION(p) p##.avx.rtm
> > -
> > +#define STRCPY __strcpy_avx2_rtm
> > +#include "x86-avx2-rtm-vecs.h"
> > #include "strcpy-avx2.S"
> > diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
> > index c725834929..b87a1722d5 100644
> > --- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
> > +++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
> > @@ -20,984 +20,378 @@
> >
> > #if ISA_SHOULD_BUILD (3)
> >
> > +# include <sysdep.h>
> >
> > -# ifndef USE_AS_STRCAT
> > -# include <sysdep.h>
> > -
> > -# ifndef STRCPY
> > -# define STRCPY __strcpy_avx2
> > -# endif
> > -
> > -# endif
> > -
> > -/* Number of bytes in a vector register */
> > # ifndef VEC_SIZE
> > -# define VEC_SIZE 32
> > -# endif
> > -
> > -# ifndef VZEROUPPER
> > -# define VZEROUPPER vzeroupper
> > -# endif
> > -
> > -# ifndef SECTION
> > -# define SECTION(p) p##.avx
> > -# endif
> > -
> > -/* zero register */
> > -#define xmmZ xmm0
> > -#define ymmZ ymm0
> > -
> > -/* mask register */
> > -#define ymmM ymm1
> > -
> > -# ifndef USE_AS_STRCAT
> > -
> > - .section SECTION(.text),"ax",@progbits
> > -ENTRY (STRCPY)
> > -# ifdef USE_AS_STRNCPY
> > - mov %RDX_LP, %R8_LP
> > - test %R8_LP, %R8_LP
> > - jz L(ExitZero)
> > -# endif
> > - mov %rsi, %rcx
> > -# ifndef USE_AS_STPCPY
> > - mov %rdi, %rax /* save result */
> > -# endif
> > -
> > +# include "x86-avx2-vecs.h"
> > # endif
> >
> > - vpxor %xmmZ, %xmmZ, %xmmZ
> > -
> > - and $((VEC_SIZE * 4) - 1), %ecx
> > - cmp $(VEC_SIZE * 2), %ecx
> > - jbe L(SourceStringAlignmentLessTwoVecSize)
> > -
> > - and $-VEC_SIZE, %rsi
> > - and $(VEC_SIZE - 1), %ecx
> > -
> > - vpcmpeqb (%rsi), %ymmZ, %ymmM
> > - vpmovmskb %ymmM, %edx
> > - shr %cl, %rdx
> > -
> > -# ifdef USE_AS_STRNCPY
> > -# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> > - mov $VEC_SIZE, %r10
> > - sub %rcx, %r10
> > - cmp %r10, %r8
> > -# else
> > - mov $(VEC_SIZE + 1), %r10
> > - sub %rcx, %r10
> > - cmp %r10, %r8
> > -# endif
> > - jbe L(CopyVecSizeTailCase2OrCase3)
> > +# ifndef STRCPY
> > +# define STRCPY __strcpy_avx2
> > # endif
> > - test %edx, %edx
> > - jnz L(CopyVecSizeTail)
> >
> > - vpcmpeqb VEC_SIZE(%rsi), %ymmZ, %ymm2
> > - vpmovmskb %ymm2, %edx
> > + /* Use movsb in page cross case to save code size. */
> > +# define USE_MOVSB_IN_PAGE_CROSS 1
> >
> > -# ifdef USE_AS_STRNCPY
> > - add $VEC_SIZE, %r10
> > - cmp %r10, %r8
> > - jbe L(CopyTwoVecSizeCase2OrCase3)
> > -# endif
> > - test %edx, %edx
> > - jnz L(CopyTwoVecSize)
> > -
> > - vmovdqu (%rsi, %rcx), %ymm2 /* copy VEC_SIZE bytes */
> > - vmovdqu %ymm2, (%rdi)
> > -
> > -/* If source address alignment != destination address alignment */
> > - .p2align 4
> > -L(UnalignVecSizeBoth):
> > - sub %rcx, %rdi
> > -# ifdef USE_AS_STRNCPY
> > - add %rcx, %r8
> > - sbb %rcx, %rcx
> > - or %rcx, %r8
> > -# endif
> > - mov $VEC_SIZE, %rcx
> > - vmovdqa (%rsi, %rcx), %ymm2
> > - vmovdqu %ymm2, (%rdi, %rcx)
> > - vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
> > - vpcmpeqb %ymm2, %ymmZ, %ymmM
> > - vpmovmskb %ymmM, %edx
> > - add $VEC_SIZE, %rcx
> > -# ifdef USE_AS_STRNCPY
> > - sub $(VEC_SIZE * 3), %r8
> > - jbe L(CopyVecSizeCase2OrCase3)
> > -# endif
> > - test %edx, %edx
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > - jnz L(CopyVecSizeUnalignedVec2)
> > +# ifdef USE_AS_WCSCPY
> > +# define VPCMPEQ vpcmpeqd
> > +# define VPMIN vpminud
> > +# define CHAR_SIZE 4
> > # else
> > - jnz L(CopyVecSize)
> > +# define VPCMPEQ vpcmpeqb
> > +# define VPMIN vpminub
> > +# define CHAR_SIZE 1
> > # endif
> >
> > - vmovdqu %ymm2, (%rdi, %rcx)
> > - vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
> > - vpcmpeqb %ymm3, %ymmZ, %ymmM
> > - vpmovmskb %ymmM, %edx
> > - add $VEC_SIZE, %rcx
> > -# ifdef USE_AS_STRNCPY
> > - sub $VEC_SIZE, %r8
> > - jbe L(CopyVecSizeCase2OrCase3)
> > -# endif
> > - test %edx, %edx
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > - jnz L(CopyVecSizeUnalignedVec3)
> > -# else
> > - jnz L(CopyVecSize)
> > -# endif
> > +# define PAGE_SIZE 4096
> >
> > - vmovdqu %ymm3, (%rdi, %rcx)
> > - vmovdqa VEC_SIZE(%rsi, %rcx), %ymm4
> > - vpcmpeqb %ymm4, %ymmZ, %ymmM
> > - vpmovmskb %ymmM, %edx
> > - add $VEC_SIZE, %rcx
> > -# ifdef USE_AS_STRNCPY
> > - sub $VEC_SIZE, %r8
> > - jbe L(CopyVecSizeCase2OrCase3)
> > -# endif
> > - test %edx, %edx
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > - jnz L(CopyVecSizeUnalignedVec4)
> > +# ifdef USE_AS_STPCPY
> > +# define END_REG rax
> > # else
> > - jnz L(CopyVecSize)
> > +# define END_REG rdi, %rdx
> > # endif
> >
> > - vmovdqu %ymm4, (%rdi, %rcx)
> > - vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
> > - vpcmpeqb %ymm2, %ymmZ, %ymmM
> > - vpmovmskb %ymmM, %edx
> > - add $VEC_SIZE, %rcx
> > -# ifdef USE_AS_STRNCPY
> > - sub $VEC_SIZE, %r8
> > - jbe L(CopyVecSizeCase2OrCase3)
> > -# endif
> > - test %edx, %edx
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > - jnz L(CopyVecSizeUnalignedVec2)
> > +# ifdef USE_AS_STRCAT
> > +# define PAGE_ALIGN_REG ecx
> > # else
> > - jnz L(CopyVecSize)
> > +# define PAGE_ALIGN_REG eax
> > # endif
> >
> > - vmovdqu %ymm2, (%rdi, %rcx)
> > - vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
> > - vpcmpeqb %ymm2, %ymmZ, %ymmM
> > - vpmovmskb %ymmM, %edx
> > - add $VEC_SIZE, %rcx
> > -# ifdef USE_AS_STRNCPY
> > - sub $VEC_SIZE, %r8
> > - jbe L(CopyVecSizeCase2OrCase3)
> > -# endif
> > - test %edx, %edx
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > - jnz L(CopyVecSizeUnalignedVec2)
> > -# else
> > - jnz L(CopyVecSize)
> > -# endif
> > +# define VZERO VMM(7)
> > +# define VZERO_128 VMM_128(7)
> >
> > - vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
> > - vmovdqu %ymm2, (%rdi, %rcx)
> > - vpcmpeqb %ymm3, %ymmZ, %ymmM
> > - vpmovmskb %ymmM, %edx
> > - add $VEC_SIZE, %rcx
> > -# ifdef USE_AS_STRNCPY
> > - sub $VEC_SIZE, %r8
> > - jbe L(CopyVecSizeCase2OrCase3)
> > -# endif
> > - test %edx, %edx
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > - jnz L(CopyVecSizeUnalignedVec3)
> > -# else
> > - jnz L(CopyVecSize)
> > -# endif
> > + .section SECTION(.text), "ax", @progbits
> > +ENTRY(STRCPY)
> > + vpxor %VZERO_128, %VZERO_128, %VZERO_128
> >
> > - vmovdqu %ymm3, (%rdi, %rcx)
> > - mov %rsi, %rdx
> > - lea VEC_SIZE(%rsi, %rcx), %rsi
> > - and $-(VEC_SIZE * 4), %rsi
> > - sub %rsi, %rdx
> > - sub %rdx, %rdi
> > -# ifdef USE_AS_STRNCPY
> > - lea (VEC_SIZE * 8)(%r8, %rdx), %r8
> > -# endif
> > -L(UnalignedFourVecSizeLoop):
> > - vmovdqa (%rsi), %ymm4
> > - vmovdqa VEC_SIZE(%rsi), %ymm5
> > - vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6
> > - vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7
> > - vpminub %ymm5, %ymm4, %ymm2
> > - vpminub %ymm7, %ymm6, %ymm3
> > - vpminub %ymm2, %ymm3, %ymm3
> > - vpcmpeqb %ymmM, %ymm3, %ymm3
> > - vpmovmskb %ymm3, %edx
> > -# ifdef USE_AS_STRNCPY
> > - sub $(VEC_SIZE * 4), %r8
> > - jbe L(UnalignedLeaveCase2OrCase3)
> > -# endif
> > - test %edx, %edx
> > - jnz L(UnalignedFourVecSizeLeave)
> > -
> > -L(UnalignedFourVecSizeLoop_start):
> > - add $(VEC_SIZE * 4), %rdi
> > - add $(VEC_SIZE * 4), %rsi
> > - vmovdqu %ymm4, -(VEC_SIZE * 4)(%rdi)
> > - vmovdqa (%rsi), %ymm4
> > - vmovdqu %ymm5, -(VEC_SIZE * 3)(%rdi)
> > - vmovdqa VEC_SIZE(%rsi), %ymm5
> > - vpminub %ymm5, %ymm4, %ymm2
> > - vmovdqu %ymm6, -(VEC_SIZE * 2)(%rdi)
> > - vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6
> > - vmovdqu %ymm7, -VEC_SIZE(%rdi)
> > - vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7
> > - vpminub %ymm7, %ymm6, %ymm3
> > - vpminub %ymm2, %ymm3, %ymm3
> > - vpcmpeqb %ymmM, %ymm3, %ymm3
> > - vpmovmskb %ymm3, %edx
> > -# ifdef USE_AS_STRNCPY
> > - sub $(VEC_SIZE * 4), %r8
> > - jbe L(UnalignedLeaveCase2OrCase3)
> > -# endif
> > - test %edx, %edx
> > - jz L(UnalignedFourVecSizeLoop_start)
> > -
> > -L(UnalignedFourVecSizeLeave):
> > - vpcmpeqb %ymm4, %ymmZ, %ymmM
> > - vpmovmskb %ymmM, %edx
> > - test %edx, %edx
> > - jnz L(CopyVecSizeUnaligned_0)
> > -
> > - vpcmpeqb %ymm5, %ymmZ, %ymmM
> > - vpmovmskb %ymmM, %ecx
> > - test %ecx, %ecx
> > - jnz L(CopyVecSizeUnaligned_16)
> > -
> > - vpcmpeqb %ymm6, %ymmZ, %ymmM
> > - vpmovmskb %ymmM, %edx
> > - test %edx, %edx
> > - jnz L(CopyVecSizeUnaligned_32)
> > -
> > - vpcmpeqb %ymm7, %ymmZ, %ymmM
> > - vpmovmskb %ymmM, %ecx
> > - bsf %ecx, %edx
> > - vmovdqu %ymm4, (%rdi)
> > - vmovdqu %ymm5, VEC_SIZE(%rdi)
> > - vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > -# ifdef USE_AS_STPCPY
> > - lea (VEC_SIZE * 3)(%rdi, %rdx), %rax
> > -# endif
> > - vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi)
> > - add $(VEC_SIZE - 1), %r8
> > - sub %rdx, %r8
> > - lea ((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi
> > - jmp L(StrncpyFillTailWithZero)
> > -# else
> > - add $(VEC_SIZE * 3), %rsi
> > - add $(VEC_SIZE * 3), %rdi
> > - jmp L(CopyVecSizeExit)
> > +# ifdef USE_AS_STRCAT
> > + movq %rdi, %rax
> > +# include "strcat-strlen-avx2.S"
> > # endif
> >
> > -/* If source address alignment == destination address alignment */
> > -
> > -L(SourceStringAlignmentLessTwoVecSize):
> > - vmovdqu (%rsi), %ymm3
> > - vmovdqu VEC_SIZE(%rsi), %ymm2
> > - vpcmpeqb %ymm3, %ymmZ, %ymmM
> > - vpmovmskb %ymmM, %edx
> > -
> > -# ifdef USE_AS_STRNCPY
> > -# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> > - cmp $VEC_SIZE, %r8
> > -# else
> > - cmp $(VEC_SIZE + 1), %r8
> > -# endif
> > - jbe L(CopyVecSizeTail1Case2OrCase3)
> > + movl %esi, %PAGE_ALIGN_REG
> > + andl $(PAGE_SIZE - 1), %PAGE_ALIGN_REG
> > + cmpl $(PAGE_SIZE - VEC_SIZE), %PAGE_ALIGN_REG
> > + ja L(page_cross)
> > +L(page_cross_continue):
> > +# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
> > + movq %rdi, %rax
> > # endif
> > - test %edx, %edx
> > - jnz L(CopyVecSizeTail1)
> > -
> > - vmovdqu %ymm3, (%rdi)
> > - vpcmpeqb %ymm2, %ymmZ, %ymmM
> > - vpmovmskb %ymmM, %edx
> > -
> > -# ifdef USE_AS_STRNCPY
> > -# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> > - cmp $(VEC_SIZE * 2), %r8
> > -# else
> > - cmp $((VEC_SIZE * 2) + 1), %r8
> > -# endif
> > - jbe L(CopyTwoVecSize1Case2OrCase3)
> > -# endif
> > - test %edx, %edx
> > - jnz L(CopyTwoVecSize1)
> > -
> > - and $-VEC_SIZE, %rsi
> > - and $(VEC_SIZE - 1), %ecx
> > - jmp L(UnalignVecSizeBoth)
> > + VMOVU (%rsi), %VMM(0)
> > + VPCMPEQ %VMM(0), %VZERO, %VMM(6)
> > + vpmovmskb %VMM(6), %ecx
> >
> > -/*------End of main part with loops---------------------*/
> > + testl %ecx, %ecx
> > + jz L(more_1x_vec)
> >
> > -/* Case1 */
> > + /* No longer need ymm registers so just vzeroupper so it doesn't
> > + need to be duplicated at each return statement. */
> > + COND_VZEROUPPER
> >
> > -# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
> > - .p2align 4
> > -L(CopyVecSize):
> > - add %rcx, %rdi
> > -# endif
> > -L(CopyVecSizeTail):
> > - add %rcx, %rsi
> > -L(CopyVecSizeTail1):
> > - bsf %edx, %edx
> > -L(CopyVecSizeExit):
> > - cmp $32, %edx
> > - jae L(Exit32_63)
> > - cmp $16, %edx
> > - jae L(Exit16_31)
> > - cmp $8, %edx
> > - jae L(Exit8_15)
> > - cmp $4, %edx
> > - jae L(Exit4_7)
> > - cmp $3, %edx
> > - je L(Exit3)
> > - cmp $1, %edx
> > - ja L(Exit2)
> > - je L(Exit1)
> > - movb $0, (%rdi)
> > + xorl %edx, %edx
> > + bsfl %ecx, %edx
> > # ifdef USE_AS_STPCPY
> > - lea (%rdi), %rax
> > -# endif
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > - sub $1, %r8
> > - lea 1(%rdi), %rdi
> > - jnz L(StrncpyFillTailWithZero)
> > -# endif
> > -L(return_vzeroupper):
> > - ZERO_UPPER_VEC_REGISTERS_RETURN
> > -
> > - .p2align 4
> > -L(CopyTwoVecSize1):
> > - add $VEC_SIZE, %rsi
> > - add $VEC_SIZE, %rdi
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > - sub $VEC_SIZE, %r8
> > -# endif
> > - jmp L(CopyVecSizeTail1)
> > -
> > - .p2align 4
> > -L(CopyTwoVecSize):
> > - bsf %edx, %edx
> > - add %rcx, %rsi
> > - add $VEC_SIZE, %edx
> > - sub %ecx, %edx
> > - jmp L(CopyVecSizeExit)
> > -
> > - .p2align 4
> > -L(CopyVecSizeUnaligned_0):
> > - bsf %edx, %edx
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > -# ifdef USE_AS_STPCPY
> > - lea (%rdi, %rdx), %rax
> > -# endif
> > - vmovdqu %ymm4, (%rdi)
> > - add $((VEC_SIZE * 4) - 1), %r8
> > - sub %rdx, %r8
> > - lea 1(%rdi, %rdx), %rdi
> > - jmp L(StrncpyFillTailWithZero)
> > -# else
> > - jmp L(CopyVecSizeExit)
> > -# endif
> > -
> > - .p2align 4
> > -L(CopyVecSizeUnaligned_16):
> > - bsf %ecx, %edx
> > - vmovdqu %ymm4, (%rdi)
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > -# ifdef USE_AS_STPCPY
> > - lea VEC_SIZE(%rdi, %rdx), %rax
> > -# endif
> > - vmovdqu %ymm5, VEC_SIZE(%rdi)
> > - add $((VEC_SIZE * 3) - 1), %r8
> > - sub %rdx, %r8
> > - lea (VEC_SIZE + 1)(%rdi, %rdx), %rdi
> > - jmp L(StrncpyFillTailWithZero)
> > + leaq (%rdi, %rdx), %rax
> > +# endif
> > +
> > + /* Use mask bits in rcx to detect which copy we need. If the low
> > + mask is zero then there must be a bit set in the upper half.
> > + I.e if ecx != 0 and cx == 0, then match must be upper 16
> > + bits so we use L(copy_16_31). */
> > + testw %cx, %cx
> > + jz L(copy_16_31)
> > +
> > + testb %cl, %cl
> > + jz L(copy_8_15)
> > +# ifdef USE_AS_WCSCPY
> > + vmovd %xmm0, (%rdi)
> > + movl $0, (%END_REG)
> > + ret
> > # else
> > - add $VEC_SIZE, %rsi
> > - add $VEC_SIZE, %rdi
> > - jmp L(CopyVecSizeExit)
> > -# endif
> > -
> > - .p2align 4
> > -L(CopyVecSizeUnaligned_32):
> > - bsf %edx, %edx
> > - vmovdqu %ymm4, (%rdi)
> > - vmovdqu %ymm5, VEC_SIZE(%rdi)
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > -# ifdef USE_AS_STPCPY
> > - lea (VEC_SIZE * 2)(%rdi, %rdx), %rax
> > -# endif
> > - vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
> > - add $((VEC_SIZE * 2) - 1), %r8
> > - sub %rdx, %r8
> > - lea ((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi
> > - jmp L(StrncpyFillTailWithZero)
> > + testb $0x7, %cl
> > + jz L(copy_4_7)
> > +
> > + testl %edx, %edx
> > + jz L(set_null_term)
> > + vmovd %xmm0, %ecx
> > + movw %cx, (%rdi)
> > +
> > + .p2align 4,, 2
> > +L(set_null_term):
> > + movb $0, (%END_REG)
> > + ret
> > +
> > + .p2align 4,, 12
> > +L(copy_4_7):
> > + movl -3(%rsi, %rdx), %ecx
> > + vmovd %xmm0, (%rdi)
> > + movl %ecx, -3(%END_REG)
> > + ret
> > +# endif
> > +
> > + .p2align 4,, 10
> > +L(copy_16_31):
> > + VMOVU -(16 - CHAR_SIZE)(%rsi, %rdx), %xmm1
> > + VMOVU %xmm0, (%rdi)
> > + VMOVU %xmm1, -(16 - CHAR_SIZE)(%END_REG)
> > + ret
> > +
> > + .p2align 4,, 10
> > +L(copy_8_15):
> > +# ifdef USE_AS_WCSCPY
> > + movl -(8 - CHAR_SIZE)(%rsi, %rdx), %ecx
> > # else
> > - add $(VEC_SIZE * 2), %rsi
> > - add $(VEC_SIZE * 2), %rdi
> > - jmp L(CopyVecSizeExit)
> > -# endif
> > -
> > -# ifdef USE_AS_STRNCPY
> > -# ifndef USE_AS_STRCAT
> > - .p2align 4
> > -L(CopyVecSizeUnalignedVec6):
> > - vmovdqu %ymm6, (%rdi, %rcx)
> > - jmp L(CopyVecSizeVecExit)
> > -
> > - .p2align 4
> > -L(CopyVecSizeUnalignedVec5):
> > - vmovdqu %ymm5, (%rdi, %rcx)
> > - jmp L(CopyVecSizeVecExit)
> > -
> > - .p2align 4
> > -L(CopyVecSizeUnalignedVec4):
> > - vmovdqu %ymm4, (%rdi, %rcx)
> > - jmp L(CopyVecSizeVecExit)
> > -
> > - .p2align 4
> > -L(CopyVecSizeUnalignedVec3):
> > - vmovdqu %ymm3, (%rdi, %rcx)
> > - jmp L(CopyVecSizeVecExit)
> > -# endif
> > -
> > -/* Case2 */
> > -
> > - .p2align 4
> > -L(CopyVecSizeCase2):
> > - add $VEC_SIZE, %r8
> > - add %rcx, %rdi
> > - add %rcx, %rsi
> > - bsf %edx, %edx
> > - cmp %r8d, %edx
> > - jb L(CopyVecSizeExit)
> > - jmp L(StrncpyExit)
> > -
> > - .p2align 4
> > -L(CopyTwoVecSizeCase2):
> > - add %rcx, %rsi
> > - bsf %edx, %edx
> > - add $VEC_SIZE, %edx
> > - sub %ecx, %edx
> > - cmp %r8d, %edx
> > - jb L(CopyVecSizeExit)
> > - jmp L(StrncpyExit)
> > -
> > -L(CopyVecSizeTailCase2):
> > - add %rcx, %rsi
> > - bsf %edx, %edx
> > - cmp %r8d, %edx
> > - jb L(CopyVecSizeExit)
> > - jmp L(StrncpyExit)
> > -
> > -L(CopyVecSizeTail1Case2):
> > - bsf %edx, %edx
> > - cmp %r8d, %edx
> > - jb L(CopyVecSizeExit)
> > - jmp L(StrncpyExit)
> > -
> > -/* Case2 or Case3, Case3 */
> > -
> > - .p2align 4
> > -L(CopyVecSizeCase2OrCase3):
> > - test %rdx, %rdx
> > - jnz L(CopyVecSizeCase2)
> > -L(CopyVecSizeCase3):
> > - add $VEC_SIZE, %r8
> > - add %rcx, %rdi
> > - add %rcx, %rsi
> > - jmp L(StrncpyExit)
> > -
> > - .p2align 4
> > -L(CopyTwoVecSizeCase2OrCase3):
> > - test %rdx, %rdx
> > - jnz L(CopyTwoVecSizeCase2)
> > - add %rcx, %rsi
> > - jmp L(StrncpyExit)
> > -
> > - .p2align 4
> > -L(CopyVecSizeTailCase2OrCase3):
> > - test %rdx, %rdx
> > - jnz L(CopyVecSizeTailCase2)
> > - add %rcx, %rsi
> > - jmp L(StrncpyExit)
> > -
> > - .p2align 4
> > -L(CopyTwoVecSize1Case2OrCase3):
> > - add $VEC_SIZE, %rdi
> > - add $VEC_SIZE, %rsi
> > - sub $VEC_SIZE, %r8
> > -L(CopyVecSizeTail1Case2OrCase3):
> > - test %rdx, %rdx
> > - jnz L(CopyVecSizeTail1Case2)
> > - jmp L(StrncpyExit)
> > -# endif
> > -
> > -/*------------End labels regarding with copying 1-VEC_SIZE bytes--and 1-(VEC_SIZE*2) bytes----*/
> > -
> > - .p2align 4
> > -L(Exit1):
> > - movzwl (%rsi), %edx
> > - mov %dx, (%rdi)
> > -# ifdef USE_AS_STPCPY
> > - lea 1(%rdi), %rax
> > -# endif
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > - sub $2, %r8
> > - lea 2(%rdi), %rdi
> > - jnz L(StrncpyFillTailWithZero)
> > -# endif
> > - VZEROUPPER_RETURN
> > -
> > - .p2align 4
> > -L(Exit2):
> > - movzwl (%rsi), %ecx
> > - mov %cx, (%rdi)
> > - movb $0, 2(%rdi)
> > -# ifdef USE_AS_STPCPY
> > - lea 2(%rdi), %rax
> > -# endif
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > - sub $3, %r8
> > - lea 3(%rdi), %rdi
> > - jnz L(StrncpyFillTailWithZero)
> > -# endif
> > - VZEROUPPER_RETURN
> > -
> > - .p2align 4
> > -L(Exit3):
> > - mov (%rsi), %edx
> > - mov %edx, (%rdi)
> > + movq -(8 - CHAR_SIZE)(%rsi, %rdx), %rcx
> > +# endif
> > + vmovq %xmm0, (%rdi)
> > + movq %rcx, -(8 - CHAR_SIZE)(%END_REG)
> > + ret
> > +
> > +
> > + .p2align 4,, 8
> > +L(more_1x_vec):
> > +# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> > + VMOVU %VMM(0), (%rdi)
> > +# endif
> > + subq %rsi, %rdi
> > + orq $(VEC_SIZE - 1), %rsi
> > + addq %rsi, %rdi
> > + VMOVA 1(%rsi), %VMM(1)
> > +
> > + /* Try and order stores after as many loads as is reasonable to
> > + avoid potential false dependencies. */
> > +# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
> > + VMOVU %VMM(0), (%rax)
> > +# endif
> > + VPCMPEQ %VMM(1), %VZERO, %VMM(6)
> > + vpmovmskb %VMM(6), %ecx
> > + testl %ecx, %ecx
> > + jnz L(ret_vec_x1)
> > +
> > + VMOVA (VEC_SIZE + 1)(%rsi), %VMM(2)
> > + VMOVU %VMM(1), 1(%rdi)
> > +
> > + VPCMPEQ %VMM(2), %VZERO, %VMM(6)
> > + vpmovmskb %VMM(6), %ecx
> > + testl %ecx, %ecx
> > + jnz L(ret_vec_x2)
> > +
> > + VMOVA (VEC_SIZE * 2 + 1)(%rsi), %VMM(3)
> > + VMOVU %VMM(2), (VEC_SIZE + 1)(%rdi)
> > +
> > + VPCMPEQ %VMM(3), %VZERO, %VMM(6)
> > + vpmovmskb %VMM(6), %ecx
> > + testl %ecx, %ecx
> > + jnz L(ret_vec_x3)
> > +
> > + VMOVA (VEC_SIZE * 3 + 1)(%rsi), %VMM(4)
> > + VMOVU %VMM(3), (VEC_SIZE * 2 + 1)(%rdi)
> > + VPCMPEQ %VMM(4), %VZERO, %VMM(6)
> > + vpmovmskb %VMM(6), %edx
> > + testl %edx, %edx
> > + jnz L(ret_vec_x4)
> > +
> > + VMOVU %VMM(4), (VEC_SIZE * 3 + 1)(%rdi)
> > +
> > + /* Subtract rsi from rdi before aligning. Adding back rsi will
> > + get proper rdi (dst) for new src. */
> > + subq %rsi, %rdi
> > + incq %rsi
> > + orq $(VEC_SIZE * 4 - 1), %rsi
> > +
> > + /* Do first half of loop ahead of time so loop can just start by
> > + storing. */
> > + VMOVA (VEC_SIZE * 0 + 1)(%rsi), %VMM(0)
> > + VMOVA (VEC_SIZE * 1 + 1)(%rsi), %VMM(1)
> > + VMOVA (VEC_SIZE * 2 + 1)(%rsi), %VMM(2)
> > + VMOVA (VEC_SIZE * 3 + 1)(%rsi), %VMM(3)
> > +
> > + VPMIN %VMM(0), %VMM(1), %VMM(4)
> > + VPMIN %VMM(2), %VMM(3), %VMM(6)
> > + VPMIN %VMM(4), %VMM(6), %VMM(6)
> > + VPCMPEQ %VMM(6), %VZERO, %VMM(6)
> > + vpmovmskb %VMM(6), %edx
> > + addq %rsi, %rdi
> > +
> > + testl %edx, %edx
> > + jnz L(loop_4x_done)
> > +
> > + .p2align 4,, 11
> > +L(loop_4x_vec):
> > +
> > + VMOVU %VMM(0), (VEC_SIZE * 0 + 1)(%rdi)
> > + VMOVU %VMM(1), (VEC_SIZE * 1 + 1)(%rdi)
> > + subq $(VEC_SIZE * -4), %rsi
> > + VMOVU %VMM(2), (VEC_SIZE * 2 + 1)(%rdi)
> > + VMOVU %VMM(3), (VEC_SIZE * 3 + 1)(%rdi)
> > +
> > +
> > + VMOVA (VEC_SIZE * 0 + 1)(%rsi), %VMM(0)
> > + VMOVA (VEC_SIZE * 1 + 1)(%rsi), %VMM(1)
> > + VMOVA (VEC_SIZE * 2 + 1)(%rsi), %VMM(2)
> > + VMOVA (VEC_SIZE * 3 + 1)(%rsi), %VMM(3)
> > +
> > + VPMIN %VMM(0), %VMM(1), %VMM(4)
> > + VPMIN %VMM(2), %VMM(3), %VMM(6)
> > + VPMIN %VMM(4), %VMM(6), %VMM(6)
> > + VPCMPEQ %VMM(6), %VZERO, %VMM(6)
> > +
> > + vpmovmskb %VMM(6), %edx
> > + subq $(VEC_SIZE * -4), %rdi
> > + testl %edx, %edx
> > + jz L(loop_4x_vec)
> > +
> > +L(loop_4x_done):
> > + VPCMPEQ %VMM(0), %VZERO, %VMM(6)
> > + vpmovmskb %VMM(6), %ecx
> > + testl %ecx, %ecx
> > + jnz L(ret_vec_x1)
> > + VMOVU %VMM(0), (VEC_SIZE * 0 + 1)(%rdi)
> > +
> > + VPCMPEQ %VMM(1), %VZERO, %VMM(6)
> > + vpmovmskb %VMM(6), %ecx
> > + testl %ecx, %ecx
> > + jnz L(ret_vec_x2)
> > + VMOVU %VMM(1), (VEC_SIZE * 1 + 1)(%rdi)
> > +
> > + VPCMPEQ %VMM(2), %VZERO, %VMM(6)
> > + vpmovmskb %VMM(6), %ecx
> > + testl %ecx, %ecx
> > + jnz L(ret_vec_x3)
> > + VMOVU %VMM(2), (VEC_SIZE * 2 + 1)(%rdi)
> > +L(ret_vec_x4):
> > + bsfl %edx, %edx
> > + VMOVU ((VEC_SIZE * 3 + 1)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
> > + VMOVU %VMM(1), ((VEC_SIZE * 3 + 1)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
> > # ifdef USE_AS_STPCPY
> > - lea 3(%rdi), %rax
> > -# endif
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > - sub $4, %r8
> > - lea 4(%rdi), %rdi
> > - jnz L(StrncpyFillTailWithZero)
> > + leaq (VEC_SIZE * 3 + 1)(%rdx, %rdi), %rax
> > # endif
> > +L(return_end):
> > VZEROUPPER_RETURN
> >
> > - .p2align 4
> > -L(Exit4_7):
> > - mov (%rsi), %ecx
> > - mov %ecx, (%rdi)
> > - mov -3(%rsi, %rdx), %ecx
> > - mov %ecx, -3(%rdi, %rdx)
> > + .p2align 4,, 8
> > +L(ret_vec_x1):
> > + bsfl %ecx, %ecx
> > + VMOVU (1 -(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1)
> > + VMOVU %VMM(1), (1 -(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx)
> > # ifdef USE_AS_STPCPY
> > - lea (%rdi, %rdx), %rax
> > -# endif
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > - sub %rdx, %r8
> > - sub $1, %r8
> > - lea 1(%rdi, %rdx), %rdi
> > - jnz L(StrncpyFillTailWithZero)
> > + leaq 1(%rcx, %rdi), %rax
> > # endif
> > - VZEROUPPER_RETURN
> > -
> > - .p2align 4
> > -L(Exit8_15):
> > - mov (%rsi), %rcx
> > - mov -7(%rsi, %rdx), %r9
> > - mov %rcx, (%rdi)
> > - mov %r9, -7(%rdi, %rdx)
> > -# ifdef USE_AS_STPCPY
> > - lea (%rdi, %rdx), %rax
> > -# endif
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > - sub %rdx, %r8
> > - sub $1, %r8
> > - lea 1(%rdi, %rdx), %rdi
> > - jnz L(StrncpyFillTailWithZero)
> > -# endif
> > - VZEROUPPER_RETURN
> > +L(return_vzeroupper):
> > + ZERO_UPPER_VEC_REGISTERS_RETURN
> >
> > - .p2align 4
> > -L(Exit16_31):
> > - vmovdqu (%rsi), %xmm2
> > - vmovdqu -15(%rsi, %rdx), %xmm3
> > - vmovdqu %xmm2, (%rdi)
> > - vmovdqu %xmm3, -15(%rdi, %rdx)
> > + .p2align 4,, 8
> > +L(ret_vec_x2):
> > + bsfl %ecx, %ecx
> > + VMOVU ((VEC_SIZE + 1)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1)
> > + VMOVU %VMM(1), ((VEC_SIZE + 1)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx)
> > # ifdef USE_AS_STPCPY
> > - lea (%rdi, %rdx), %rax
> > -# endif
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > - sub %rdx, %r8
> > - sub $1, %r8
> > - lea 1(%rdi, %rdx), %rdi
> > - jnz L(StrncpyFillTailWithZero)
> > + leaq (VEC_SIZE * 1 + 1)(%rcx, %rdi), %rax
> > # endif
> > VZEROUPPER_RETURN
> >
> > - .p2align 4
> > -L(Exit32_63):
> > - vmovdqu (%rsi), %ymm2
> > - vmovdqu -31(%rsi, %rdx), %ymm3
> > - vmovdqu %ymm2, (%rdi)
> > - vmovdqu %ymm3, -31(%rdi, %rdx)
> > + .p2align 4,, 8
> > +L(ret_vec_x3):
> > + bsfl %ecx, %ecx
> > + VMOVU ((VEC_SIZE * 2 + 1)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1)
> > + VMOVU %VMM(1), ((VEC_SIZE * 2 + 1)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx)
> > # ifdef USE_AS_STPCPY
> > - lea (%rdi, %rdx), %rax
> > -# endif
> > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> > - sub %rdx, %r8
> > - sub $1, %r8
> > - lea 1(%rdi, %rdx), %rdi
> > - jnz L(StrncpyFillTailWithZero)
> > + leaq (VEC_SIZE * 2 + 1)(%rcx, %rdi), %rax
> > # endif
> > VZEROUPPER_RETURN
> >
> > -# ifdef USE_AS_STRNCPY
> >
> > - .p2align 4
> > -L(StrncpyExit1):
> > - movzbl (%rsi), %edx
> > - mov %dl, (%rdi)
> > + .p2align 4,, 4
> > +L(page_cross):
> > + movq %rsi, %rcx
> > + andq $(VEC_SIZE * -1), %rcx
> > +
> > + VPCMPEQ (%rcx), %VZERO, %VMM(6)
> > + vpmovmskb %VMM(6), %ecx
> > + shrxl %esi, %ecx, %ecx
> > +# if USE_MOVSB_IN_PAGE_CROSS
> > + /* Optimizing more aggressively for space as this is very cold
> > + code. This saves 2x cache lines. */
> > +
> > + /* This adds once to the later result which will get correct
> > + copy bounds. NB: this can never zero-out a non-zero RCX as
> > + to be in the page cross case rsi cannot be aligned and we
> > + already right-shift rcx by the misalignment. */
> > + shll $CHAR_SIZE, %ecx
> > + jz L(page_cross_continue)
> > + bsfl %ecx, %ecx
> > +# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
> > + movq %rdi, %rax
> > +# endif
> > + rep movsb
> > # ifdef USE_AS_STPCPY
> > - lea 1(%rdi), %rax
> > -# endif
> > -# ifdef USE_AS_STRCAT
> > - movb $0, 1(%rdi)
> > + leaq -CHAR_SIZE(%rdi), %rax
> > # endif
> > - VZEROUPPER_RETURN
> >
> > - .p2align 4
> > -L(StrncpyExit2):
> > - movzwl (%rsi), %edx
> > - mov %dx, (%rdi)
> > -# ifdef USE_AS_STPCPY
> > - lea 2(%rdi), %rax
> > -# endif
> > -# ifdef USE_AS_STRCAT
> > - movb $0, 2(%rdi)
> > -# endif
> > VZEROUPPER_RETURN
> >
> > - .p2align 4
> > -L(StrncpyExit3_4):
> > - movzwl (%rsi), %ecx
> > - movzwl -2(%rsi, %r8), %edx
> > - mov %cx, (%rdi)
> > - mov %dx, -2(%rdi, %r8)
> > -# ifdef USE_AS_STPCPY
> > - lea (%rdi, %r8), %rax
> > -# endif
> > -# ifdef USE_AS_STRCAT
> > - movb $0, (%rdi, %r8)
> > -# endif
> > - VZEROUPPER_RETURN
> > -
> > - .p2align 4
> > -L(StrncpyExit5_8):
> > - mov (%rsi), %ecx
> > - mov -4(%rsi, %r8), %edx
> > - mov %ecx, (%rdi)
> > - mov %edx, -4(%rdi, %r8)
> > -# ifdef USE_AS_STPCPY
> > - lea (%rdi, %r8), %rax
> > -# endif
> > -# ifdef USE_AS_STRCAT
> > - movb $0, (%rdi, %r8)
> > -# endif
> > - VZEROUPPER_RETURN
> > -
> > - .p2align 4
> > -L(StrncpyExit9_16):
> > - mov (%rsi), %rcx
> > - mov -8(%rsi, %r8), %rdx
> > - mov %rcx, (%rdi)
> > - mov %rdx, -8(%rdi, %r8)
> > -# ifdef USE_AS_STPCPY
> > - lea (%rdi, %r8), %rax
> > -# endif
> > -# ifdef USE_AS_STRCAT
> > - movb $0, (%rdi, %r8)
> > -# endif
> > - VZEROUPPER_RETURN
> > -
> > - .p2align 4
> > -L(StrncpyExit17_32):
> > - vmovdqu (%rsi), %xmm2
> > - vmovdqu -16(%rsi, %r8), %xmm3
> > - vmovdqu %xmm2, (%rdi)
> > - vmovdqu %xmm3, -16(%rdi, %r8)
> > -# ifdef USE_AS_STPCPY
> > - lea (%rdi, %r8), %rax
> > -# endif
> > -# ifdef USE_AS_STRCAT
> > - movb $0, (%rdi, %r8)
> > -# endif
> > - VZEROUPPER_RETURN
> > -
> > - .p2align 4
> > -L(StrncpyExit33_64):
> > - /* 0/32, 31/16 */
> > - vmovdqu (%rsi), %ymm2
> > - vmovdqu -VEC_SIZE(%rsi, %r8), %ymm3
> > - vmovdqu %ymm2, (%rdi)
> > - vmovdqu %ymm3, -VEC_SIZE(%rdi, %r8)
> > -# ifdef USE_AS_STPCPY
> > - lea (%rdi, %r8), %rax
> > -# endif
> > -# ifdef USE_AS_STRCAT
> > - movb $0, (%rdi, %r8)
> > -# endif
> > - VZEROUPPER_RETURN
> > -
> > - .p2align 4
> > -L(StrncpyExit65):
> > - /* 0/32, 32/32, 64/1 */
> > - vmovdqu (%rsi), %ymm2
> > - vmovdqu 32(%rsi), %ymm3
> > - mov 64(%rsi), %cl
> > - vmovdqu %ymm2, (%rdi)
> > - vmovdqu %ymm3, 32(%rdi)
> > - mov %cl, 64(%rdi)
> > -# ifdef USE_AS_STPCPY
> > - lea 65(%rdi), %rax
> > -# endif
> > -# ifdef USE_AS_STRCAT
> > - movb $0, 65(%rdi)
> > -# endif
> > - VZEROUPPER_RETURN
> > +# else
> > + testl %ecx, %ecx
> > + jz L(page_cross_continue)
> >
> > + /* Traditional copy case, essentially same as used in non-page-
> > + cross case but since we can't reuse VMM(0) we need twice as
> > + many loads from rsi. */
> > # ifndef USE_AS_STRCAT
> > -
> > - .p2align 4
> > -L(Fill1):
> > - mov %dl, (%rdi)
> > - VZEROUPPER_RETURN
> > -
> > - .p2align 4
> > -L(Fill2):
> > - mov %dx, (%rdi)
> > - VZEROUPPER_RETURN
> > -
> > - .p2align 4
> > -L(Fill3_4):
> > - mov %dx, (%rdi)
> > - mov %dx, -2(%rdi, %r8)
> > - VZEROUPPER_RETURN
> > -
> > - .p2align 4
> > -L(Fill5_8):
> > - mov %edx, (%rdi)
> > - mov %edx, -4(%rdi, %r8)
> > - VZEROUPPER_RETURN
> > -
> > - .p2align 4
> > -L(Fill9_16):
> > - mov %rdx, (%rdi)
> > - mov %rdx, -8(%rdi, %r8)
> > - VZEROUPPER_RETURN
> > -
> > - .p2align 4
> > -L(Fill17_32):
> > - vmovdqu %xmmZ, (%rdi)
> > - vmovdqu %xmmZ, -16(%rdi, %r8)
> > - VZEROUPPER_RETURN
> > -
> > - .p2align 4
> > -L(CopyVecSizeUnalignedVec2):
> > - vmovdqu %ymm2, (%rdi, %rcx)
> > -
> > - .p2align 4
> > -L(CopyVecSizeVecExit):
> > - bsf %edx, %edx
> > - add $(VEC_SIZE - 1), %r8
> > - add %rcx, %rdi
> > -# ifdef USE_AS_STPCPY
> > - lea (%rdi, %rdx), %rax
> > -# endif
> > - sub %rdx, %r8
> > - lea 1(%rdi, %rdx), %rdi
> > -
> > - .p2align 4
> > -L(StrncpyFillTailWithZero):
> > - xor %edx, %edx
> > - sub $VEC_SIZE, %r8
> > - jbe L(StrncpyFillExit)
> > -
> > - vmovdqu %ymmZ, (%rdi)
> > - add $VEC_SIZE, %rdi
> > -
> > - mov %rdi, %rsi
> > - and $(VEC_SIZE - 1), %esi
> > - sub %rsi, %rdi
> > - add %rsi, %r8
> > - sub $(VEC_SIZE * 4), %r8
> > - jb L(StrncpyFillLessFourVecSize)
> > -
> > -L(StrncpyFillLoopVmovdqa):
> > - vmovdqa %ymmZ, (%rdi)
> > - vmovdqa %ymmZ, VEC_SIZE(%rdi)
> > - vmovdqa %ymmZ, (VEC_SIZE * 2)(%rdi)
> > - vmovdqa %ymmZ, (VEC_SIZE * 3)(%rdi)
> > - add $(VEC_SIZE * 4), %rdi
> > - sub $(VEC_SIZE * 4), %r8
> > - jae L(StrncpyFillLoopVmovdqa)
> > -
> > -L(StrncpyFillLessFourVecSize):
> > - add $(VEC_SIZE * 2), %r8
> > - jl L(StrncpyFillLessTwoVecSize)
> > - vmovdqa %ymmZ, (%rdi)
> > - vmovdqa %ymmZ, VEC_SIZE(%rdi)
> > - add $(VEC_SIZE * 2), %rdi
> > - sub $VEC_SIZE, %r8
> > - jl L(StrncpyFillExit)
> > - vmovdqa %ymmZ, (%rdi)
> > - add $VEC_SIZE, %rdi
> > - jmp L(Fill)
> > -
> > - .p2align 4
> > -L(StrncpyFillLessTwoVecSize):
> > - add $VEC_SIZE, %r8
> > - jl L(StrncpyFillExit)
> > - vmovdqa %ymmZ, (%rdi)
> > - add $VEC_SIZE, %rdi
> > - jmp L(Fill)
> > -
> > - .p2align 4
> > -L(StrncpyFillExit):
> > - add $VEC_SIZE, %r8
> > -L(Fill):
> > - cmp $17, %r8d
> > - jae L(Fill17_32)
> > - cmp $9, %r8d
> > - jae L(Fill9_16)
> > - cmp $5, %r8d
> > - jae L(Fill5_8)
> > - cmp $3, %r8d
> > - jae L(Fill3_4)
> > - cmp $1, %r8d
> > - ja L(Fill2)
> > - je L(Fill1)
> > - VZEROUPPER_RETURN
> > -
> > -/* end of ifndef USE_AS_STRCAT */
> > + xorl %edx, %edx
> > # endif
> > -
> > - .p2align 4
> > -L(UnalignedLeaveCase2OrCase3):
> > - test %rdx, %rdx
> > - jnz L(UnalignedFourVecSizeLeaveCase2)
> > -L(UnalignedFourVecSizeLeaveCase3):
> > - lea (VEC_SIZE * 4)(%r8), %rcx
> > - and $-VEC_SIZE, %rcx
> > - add $(VEC_SIZE * 3), %r8
> > - jl L(CopyVecSizeCase3)
> > - vmovdqu %ymm4, (%rdi)
> > - sub $VEC_SIZE, %r8
> > - jb L(CopyVecSizeCase3)
> > - vmovdqu %ymm5, VEC_SIZE(%rdi)
> > - sub $VEC_SIZE, %r8
> > - jb L(CopyVecSizeCase3)
> > - vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
> > - sub $VEC_SIZE, %r8
> > - jb L(CopyVecSizeCase3)
> > - vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi)
> > + bsfl %ecx, %edx
> > # ifdef USE_AS_STPCPY
> > - lea (VEC_SIZE * 4)(%rdi), %rax
> > -# endif
> > -# ifdef USE_AS_STRCAT
> > - movb $0, (VEC_SIZE * 4)(%rdi)
> > + leaq (%rdi, %rdx), %rax
> > +# elif !defined USE_AS_STRCAT
> > + movq %rdi, %rax
> > # endif
> > - VZEROUPPER_RETURN
> >
> > - .p2align 4
> > -L(UnalignedFourVecSizeLeaveCase2):
> > - xor %ecx, %ecx
> > - vpcmpeqb %ymm4, %ymmZ, %ymmM
> > - vpmovmskb %ymmM, %edx
> > - add $(VEC_SIZE * 3), %r8
> > - jle L(CopyVecSizeCase2OrCase3)
> > - test %edx, %edx
> > -# ifndef USE_AS_STRCAT
> > - jnz L(CopyVecSizeUnalignedVec4)
> > -# else
> > - jnz L(CopyVecSize)
> > -# endif
> > - vpcmpeqb %ymm5, %ymmZ, %ymmM
> > - vpmovmskb %ymmM, %edx
> > - vmovdqu %ymm4, (%rdi)
> > - add $VEC_SIZE, %rcx
> > - sub $VEC_SIZE, %r8
> > - jbe L(CopyVecSizeCase2OrCase3)
> > - test %edx, %edx
> > -# ifndef USE_AS_STRCAT
> > - jnz L(CopyVecSizeUnalignedVec5)
> > -# else
> > - jnz L(CopyVecSize)
> > -# endif
> > + /* vzeroupper early to avoid duplicating at each return. */
> > + COND_VZEROUPPER
> >
> > - vpcmpeqb %ymm6, %ymmZ, %ymmM
> > - vpmovmskb %ymmM, %edx
> > - vmovdqu %ymm5, VEC_SIZE(%rdi)
> > - add $VEC_SIZE, %rcx
> > - sub $VEC_SIZE, %r8
> > - jbe L(CopyVecSizeCase2OrCase3)
> > - test %edx, %edx
> > -# ifndef USE_AS_STRCAT
> > - jnz L(CopyVecSizeUnalignedVec6)
> > -# else
> > - jnz L(CopyVecSize)
> > -# endif
> > + testw %cx, %cx
> > + jz L(page_cross_copy_16_31)
> >
> > - vpcmpeqb %ymm7, %ymmZ, %ymmM
> > - vpmovmskb %ymmM, %edx
> > - vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
> > - lea VEC_SIZE(%rdi, %rcx), %rdi
> > - lea VEC_SIZE(%rsi, %rcx), %rsi
> > - bsf %edx, %edx
> > - cmp %r8d, %edx
> > - jb L(CopyVecSizeExit)
> > -L(StrncpyExit):
> > - cmp $65, %r8d
> > - je L(StrncpyExit65)
> > - cmp $33, %r8d
> > - jae L(StrncpyExit33_64)
> > - cmp $17, %r8d
> > - jae L(StrncpyExit17_32)
> > - cmp $9, %r8d
> > - jae L(StrncpyExit9_16)
> > - cmp $5, %r8d
> > - jae L(StrncpyExit5_8)
> > - cmp $3, %r8d
> > - jae L(StrncpyExit3_4)
> > - cmp $1, %r8d
> > - ja L(StrncpyExit2)
> > - je L(StrncpyExit1)
> > -# ifdef USE_AS_STPCPY
> > - mov %rdi, %rax
> > -# endif
> > -# ifdef USE_AS_STRCAT
> > - movb $0, (%rdi)
> > -# endif
> > - VZEROUPPER_RETURN
> > -
> > - .p2align 4
> > -L(ExitZero):
> > -# ifndef USE_AS_STRCAT
> > - mov %rdi, %rax
> > -# endif
> > - VZEROUPPER_RETURN
> > + testb %cl, %cl
> > + jz L(page_cross_copy_8_15)
> >
> > -# endif
> > + testl $0x7, %cl
> > + jz L(page_cross_copy_4_7)
> >
> > -# ifndef USE_AS_STRCAT
> > -END (STRCPY)
> > -# else
> > -END (STRCAT)
> > -# endif
> > + testl %edx, %edx
> > + jz L(page_cross_set_null_term)
> > + movzwl (%rsi), %ecx
> > + movw %cx, (%rdi)
> > +L(page_cross_set_null_term):
> > + movb $0, (%END_REG)
> > + ret
> > +
> > + .p2align 4,, 4
> > +L(page_cross_copy_4_7):
> > + movl (%rsi), %ecx
> > + movl -3(%rsi, %rdx), %esi
> > + movl %ecx, (%rdi)
> > + movl %esi, -3(%END_REG)
> > + ret
> > +
> > + .p2align 4,, 4
> > +L(page_cross_copy_8_15):
> > + movq (%rsi), %rcx
> > + movq -7(%rsi, %rdx), %rsi
> > + movq %rcx, (%rdi)
> > + movq %rsi, -7(%END_REG)
> > + ret
> > +
> > +
> > + .p2align 4,, 3
> > +L(page_cross_copy_16_31):
> > + VMOVU (%rsi), %xmm0
> > + VMOVU -15(%rsi, %rdx), %xmm1
> > + VMOVU %xmm0, (%rdi)
> > + VMOVU %xmm1, -15(%END_REG)
> > + ret
> > +# endif
> > +
> > +END(STRCPY)
> > #endif
> > diff --git a/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S
> > index 0dcea18dbb..2bbdbb91ab 100644
> > --- a/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S
> > +++ b/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S
> > @@ -1,3 +1,3 @@
> > -#define USE_AS_STRNCAT
> > -#define STRCAT __strncat_avx2_rtm
> > -#include "strcat-avx2-rtm.S"
> > +#define STRNCAT __strncat_avx2_rtm
> > +#include "x86-avx2-rtm-vecs.h"
> > +#include "strncat-avx2.S"
> > diff --git a/sysdeps/x86_64/multiarch/strncat-avx2.S b/sysdeps/x86_64/multiarch/strncat-avx2.S
> > index 52ecbca943..547cef9486 100644
> > --- a/sysdeps/x86_64/multiarch/strncat-avx2.S
> > +++ b/sysdeps/x86_64/multiarch/strncat-avx2.S
> > @@ -1,7 +1,419 @@
> > -#ifndef STRNCAT
> > -# define STRNCAT __strncat_avx2
> > -#endif
> > +/* strncat with AVX2
> > + Copyright (C) 2022 Free Software Foundation, Inc.
> > + This file is part of the GNU C Library.
> > +
> > + The GNU C Library is free software; you can redistribute it and/or
> > + modify it under the terms of the GNU Lesser General Public
> > + License as published by the Free Software Foundation; either
> > + version 2.1 of the License, or (at your option) any later version.
> > +
> > + The GNU C Library is distributed in the hope that it will be useful,
> > + but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > + Lesser General Public License for more details.
> > +
> > + You should have received a copy of the GNU Lesser General Public
> > + License along with the GNU C Library; if not, see
> > + <https://www.gnu.org/licenses/>. */
> > +
> > +#include <isa-level.h>
> > +
> > +#if ISA_SHOULD_BUILD (3)
> > +
> > +# include <sysdep.h>
> > +
> > +# ifndef VEC_SIZE
> > +# include "x86-avx2-vecs.h"
> > +# endif
> > +
> > +# ifndef STRNCAT
> > +# define STRNCAT __strncat_avx2
> > +# endif
> > +
> > +# ifdef USE_AS_WCSCPY
> > +# define movNULL movl
> > +# define VPCMPEQ vpcmpeqd
> > +# define VPMIN vpminud
> > +# define CHAR_SIZE 4
> > +# else
> > +# define movNULL movb
> > +# define VPCMPEQ vpcmpeqb
> > +# define VPMIN vpminub
> > +# define CHAR_SIZE 1
> > +# endif
> > +
> > +# include "strncpy-or-cat-overflow-def.h"
> > +
> > +# define PAGE_SIZE 4096
> > +
> > +# define VZERO VMM(7)
> > +# define VZERO_128 VMM_128(7)
> > +
> > + .section SECTION(.text), "ax", @progbits
> > +ENTRY(STRNCAT)
> > + /* Filter zero length strings and very long strings. Zero
> > + length strings just return, very long strings are handled by
> > + using the non-length variant {wcs|str}cat. */
> > + movq %rdi, %rax
> > +# ifdef USE_AS_WCSCPY
> > + leaq -1(%rdx), %rcx
> > + shr $56, %rcx
> > + jnz L(zero_len)
> > + salq $2, %rdx
> > +# else
> > + test %rdx, %rdx
> > + jl L(zero_len)
> > +# endif
> > + vpxor %VZERO_128, %VZERO_128, %VZERO_128
> > +
> > +# include "strcat-strlen-avx2.S"
> > +
> > + movl %esi, %ecx
> > + andl $(PAGE_SIZE - 1), %ecx
> > + cmpl $(PAGE_SIZE - VEC_SIZE), %ecx
> > + ja L(page_cross)
> > +L(page_cross_continue):
> > + VMOVU (%rsi), %VMM(0)
> > + VPCMPEQ %VMM(0), %VZERO, %VMM(6)
> > + vpmovmskb %VMM(6), %ecx
> > +
> > + tzcnt %ecx, %r8d
> > + cmpq %r8, %rdx
> > + jbe L(less_1x_vec)
> > +
> > + testl %ecx, %ecx
> > + jz L(more_1x_vec)
> > +
> > + /* Hoist this to save code size. */
> > +
> > + movl %r8d, %edx
> > +
> > +L(less_1x_vec):
> > + COND_VZEROUPPER
> > +
> > + cmpl $16, %edx
> > + jae L(copy_16_31)
> > + cmpl $8, %edx
> > + jae L(copy_8_15)
> > +
> > +
> > +# ifdef USE_AS_WCSCPY
> > + vmovd %VMM_128(0), (%rdi)
> > + movNULL $0, (%rdi, %rdx)
> > + ret
> > +# else
> > + cmpl $4, %edx
> > + jae L(copy_4_7)
> > +
> > + movzbl (%rsi), %ecx
> > + cmpl $1, %edx
> > + jbe L(set_null_term)
> > +
> > + /* NB: make this `vmovw` if support for AVX512-FP16 is added.
> > + */
> > + movzwl 1(%rsi), %esi
> > + movw %si, 1(%rdi)
> > +
> > + .p2align 4,, 1
> > +L(set_null_term):
> > + movb %cl, (%rdi)
> > + movNULL $0, (%rdi, %rdx)
> > + ret
> > +
> > + .p2align 4,, 11
> > +L(copy_4_7):
> > + movl -(4)(%rsi, %rdx), %ecx
> > + vmovd %xmm0, (%rdi)
> > + movl %ecx, -(4)(%rdi, %rdx)
> > + movNULL $0, (%rdi, %rdx)
> > + ret
> > +# endif
> > +
> > +
> > + .p2align 4,, 10
> > +L(copy_16_31):
> > + VMOVU -(16)(%rsi, %rdx), %xmm1
> > + VMOVU %xmm0, (%rdi)
> > + VMOVU %xmm1, -(16)(%rdi, %rdx)
> > + movNULL $0, (%rdi, %rdx)
> > + ret
> > +
> > + .p2align 4,, 10
> > +L(copy_8_15):
> > + movq -(8)(%rsi, %rdx), %rcx
> > + vmovq %xmm0, (%rdi)
> > + movq %rcx, -(8)(%rdi, %rdx)
> > + movNULL $0, (%rdi, %rdx)
> > + ret
> > +
> > + .p2align 4,, 8
> > + .p2align 6,, 14
> > +L(more_1x_vec):
> > + VMOVU %VMM(0), (%rdi)
> > +
> > + /* Align rsi (src) and just rdx/rdi (length/dst). */
> > + addq %rsi, %rdx
> > + subq %rsi, %rdi
> > + orq $(VEC_SIZE - 1), %rsi
> > + incq %rsi
> > + addq %rsi, %rdi
> > +L(loop_last_4x_vec):
> > + subq %rsi, %rdx
> > + VMOVA 0(%rsi), %VMM(1)
> > + VPCMPEQ %VMM(1), %VZERO, %VMM(6)
> > + vpmovmskb %VMM(6), %ecx
> > + cmpq $(VEC_SIZE * 2), %rdx
> > + ja L(more_2x_vec)
> > +L(last_2x_vec):
> > + tzcnt %ecx, %ecx
> > + cmpl %ecx, %edx
> > + jbe L(ret_vec_x1_len)
> > +
> > + cmpl $VEC_SIZE, %ecx
> > + jnz L(ret_vec_x1)
> > +
> > + VMOVA (VEC_SIZE * 1)(%rsi), %VMM(2)
> > + VMOVU %VMM(1), (%rdi)
> > + VPCMPEQ %VMM(2), %VZERO, %VMM(6)
> > + vpmovmskb %VMM(6), %ecx
> > + addl $-VEC_SIZE, %edx
> > + bzhil %edx, %ecx, %r8d
> > + jz L(ret_vec_x2_len)
> > +L(ret_vec_x2):
> > + bsfl %ecx, %edx
> > +L(ret_vec_x2_len):
> > + VMOVU (%rsi, %rdx), %VMM(0)
> > + movNULL $0, (VEC_SIZE)(%rdi, %rdx)
> > + VMOVU %VMM(0), (%rdi, %rdx)
> > +L(return_vzeroupper):
> > + ZERO_UPPER_VEC_REGISTERS_RETURN
> > +
> > +
> > + .p2align 4,, 12
> > +L(ret_vec_x1_len):
> > + movl %edx, %ecx
> > +L(ret_vec_x1):
> > + VMOVU -(VEC_SIZE)(%rsi, %rcx), %VMM(1)
> > + movNULL $0, (%rdi, %rcx)
> > + VMOVU %VMM(1), -VEC_SIZE(%rdi, %rcx)
> > + VZEROUPPER_RETURN
> > +
> > + .p2align 4,, 8
> > +L(last_4x_vec):
> > + subq $-(VEC_SIZE * 4), %rsi
> > + VMOVA 0(%rsi), %VMM(1)
> > + VPCMPEQ %VMM(1), %VZERO, %VMM(6)
> > + vpmovmskb %VMM(6), %ecx
> > + subq $-(VEC_SIZE * 4), %rdi
> > + addl $-(VEC_SIZE * 4), %edx
> > + cmpl $(VEC_SIZE * 2), %edx
> > + jbe L(last_2x_vec)
> > + .p2align 4,, 8
> > +L(more_2x_vec):
> > + /* L(ret_vec_x1) expects ecx to have position of first match so
> > + test with bsf. */
> > + bsfl %ecx, %ecx
> > + jnz L(ret_vec_x1)
> > +
> > + VMOVA (VEC_SIZE * 1)(%rsi), %VMM(2)
> > + VMOVU %VMM(1), (%rdi)
> > +
> > + VPCMPEQ %VMM(2), %VZERO, %VMM(6)
> > + vpmovmskb %VMM(6), %ecx
> > + testl %ecx, %ecx
> > + jnz L(ret_vec_x2)
> > +
> >
> > -#define USE_AS_STRNCAT
> > -#define STRCAT STRNCAT
> > -#include "strcat-avx2.S"
> > + VMOVA (VEC_SIZE * 2)(%rsi), %VMM(3)
> > + VMOVU %VMM(2), (VEC_SIZE * 1)(%rdi)
> > +
> > + VPCMPEQ %VMM(3), %VZERO, %VMM(6)
> > + vpmovmskb %VMM(6), %ecx
> > +
> > + /* Check if length is greater than 4x VEC. */
> > + cmpq $(VEC_SIZE * 4), %rdx
> > + ja L(more_4x_vec)
> > +
> > + addl $(VEC_SIZE * -2), %edx
> > +
> > + tzcnt %ecx, %ecx
> > + cmpl %ecx, %edx
> > + jbe L(ret_vec_x3_len)
> > +
> > + cmpl $VEC_SIZE, %ecx
> > + jnz L(ret_vec_x3)
> > +
> > + VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(4)
> > + VMOVU %VMM(3), (VEC_SIZE * 2 + 0)(%rdi)
> > + VPCMPEQ %VMM(4), %VZERO, %VMM(6)
> > + vpmovmskb %VMM(6), %ecx
> > + addl $-VEC_SIZE, %edx
> > + bzhil %edx, %ecx, %r8d
> > + jz L(ret_vec_x4_len)
> > +L(ret_vec_x4):
> > + bsfl %ecx, %edx
> > +L(ret_vec_x4_len):
> > + VMOVU (VEC_SIZE * 2)(%rsi, %rdx), %VMM(0)
> > + movNULL $0, (VEC_SIZE * 3)(%rdi, %rdx)
> > + VMOVU %VMM(0), (VEC_SIZE * 2)(%rdi, %rdx)
> > + VZEROUPPER_RETURN
> > +
> > + .p2align 4,, 4
> > +L(ret_vec_x3_len):
> > + movl %edx, %ecx
> > +L(ret_vec_x3):
> > + VMOVU (VEC_SIZE)(%rsi, %rcx), %VMM(0)
> > + movNULL $0, (VEC_SIZE * 2)(%rdi, %rcx)
> > + VMOVU %VMM(0), (VEC_SIZE)(%rdi, %rcx)
> > + VZEROUPPER_RETURN
> > +
> > +
> > + .p2align 4,, 8
> > +L(more_4x_vec):
> > + bsfl %ecx, %ecx
> > + jnz L(ret_vec_x3)
> > +
> > + VMOVA (VEC_SIZE * 3)(%rsi), %VMM(4)
> > + VMOVU %VMM(3), (VEC_SIZE * 2)(%rdi)
> > + VPCMPEQ %VMM(4), %VZERO, %VMM(6)
> > + vpmovmskb %VMM(6), %ecx
> > + testl %ecx, %ecx
> > + jnz L(ret_vec_x4)
> > +
> > + VMOVU %VMM(4), (VEC_SIZE * 3)(%rdi)
> > +
> > +
> > + /* Recheck length before aligning. */
> > + cmpq $(VEC_SIZE * 8), %rdx
> > + jbe L(last_4x_vec)
> > +
> > + /* Align rsi (src) and adjust rdx/rdi (length/dst). */
> > + addq %rsi, %rdx
> > + subq %rsi, %rdi
> > + subq $-(VEC_SIZE * 4), %rsi
> > + andq $(VEC_SIZE * -4), %rsi
> > +
> > + /* Do first half of loop ahead of time so loop can just start by
> > + storing. */
> > + VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
> > + VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
> > + VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
> > + VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
> > +
> > + VPMIN %VMM(0), %VMM(1), %VMM(4)
> > + VPMIN %VMM(2), %VMM(3), %VMM(6)
> > + VPMIN %VMM(4), %VMM(6), %VMM(6)
> > + VPCMPEQ %VMM(6), %VZERO, %VMM(6)
> > + vpmovmskb %VMM(6), %r8d
> > + addq %rsi, %rdi
> > + testl %r8d, %r8d
> > + jnz L(loop_4x_done)
> > +
> > + /* Use r9 for end of region before handling last 4x VEC
> > + specially. */
> > + leaq -(VEC_SIZE * 4)(%rdx), %r9
> > +
> > + .p2align 4,, 11
> > +L(loop_4x_vec):
> > +
> > + VMOVU %VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
> > + VMOVU %VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
> > + subq $(VEC_SIZE * -4), %rsi
> > + VMOVU %VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
> > + VMOVU %VMM(3), (VEC_SIZE * 3 + 0)(%rdi)
> > +
> > + subq $(VEC_SIZE * -4), %rdi
> > + cmpq %rsi, %r9
> > + jbe L(loop_last_4x_vec)
> > +
> > + VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
> > + VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
> > + VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
> > + VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
> > +
> > + VPMIN %VMM(0), %VMM(1), %VMM(4)
> > + VPMIN %VMM(2), %VMM(3), %VMM(6)
> > + VPMIN %VMM(4), %VMM(6), %VMM(6)
> > + VPCMPEQ %VMM(6), %VZERO, %VMM(6)
> > +
> > + vpmovmskb %VMM(6), %r8d
> > +
> > + testl %r8d, %r8d
> > + jz L(loop_4x_vec)
> > +
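
(For review purposes, the VPMIN reduction above in intrinsics form: the
unsigned byte-minimum of the four vectors contains a zero byte iff any
of the inputs does, so one compare plus movemask per four VECs is enough
to detect the null terminator. Illustration only, byte variant; the
helper name is made up.)

#include <immintrin.h>

static int
has_null_4x (__m256i v0, __m256i v1, __m256i v2, __m256i v3)
{
  __m256i m = _mm256_min_epu8 (_mm256_min_epu8 (v0, v1),
			       _mm256_min_epu8 (v2, v3));
  __m256i z = _mm256_cmpeq_epi8 (m, _mm256_setzero_si256 ());
  return _mm256_movemask_epi8 (z) != 0;   /* vpmovmskb; testl */
}
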
> > +L(loop_4x_done):
> > + VPCMPEQ %VMM(0), %VZERO, %VMM(6)
> > + vpmovmskb %VMM(6), %ecx
> > + /* L(ret_vec_x1) expects ecx to hold the position of the first
> > + match, so test with bsf. */
> > + bsfl %ecx, %ecx
> > + jnz L(ret_vec_x1)
> > + VMOVU %VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
> > +
> > + VPCMPEQ %VMM(1), %VZERO, %VMM(6)
> > + vpmovmskb %VMM(6), %ecx
> > +
> > + testl %ecx, %ecx
> > + jnz L(ret_vec_x2)
> > + VMOVU %VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
> > +
> > + VPCMPEQ %VMM(2), %VZERO, %VMM(6)
> > + vpmovmskb %VMM(6), %ecx
> > + bsfl %ecx, %ecx
> > + jnz L(ret_vec_x3)
> > +
> > + VMOVU %VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
> > + bsfl %r8d, %r8d
> > + VMOVU (VEC_SIZE * 2 + CHAR_SIZE)(%rsi, %r8), %VMM(1)
> > + VMOVU %VMM(1), (VEC_SIZE * 2 + CHAR_SIZE)(%rdi, %r8)
> > + VZEROUPPER_RETURN
> > +
> > +
> > +
> > + .p2align 4,, 4
> > +L(page_cross):
> > + movq %rsi, %r8
> > + andq $(VEC_SIZE * -1), %r8
> > +
> > + VPCMPEQ (%r8), %VZERO, %VMM(6)
> > +
> > + vpmovmskb %VMM(6), %ecx
> > + shrxl %esi, %ecx, %ecx
> > +
> > + subl %esi, %r8d
> > + andl $(VEC_SIZE - 1), %r8d
> > + cmpq %r8, %rdx
> > + jb L(page_cross_small)
> > +
> > + /* Optimizing more aggressively for space as this is very cold
> > + code. This saves 2x cache lines. */
> > +
> > + /* This adds one CHAR to the later bsf result, which yields the
> > + correct copy bound (null terminator included). NB: this can
> > + never zero out a non-zero RCX: to be in the page-cross case
> > + rsi cannot be aligned, and we have already right-shifted rcx
> > + by the misalignment. */
> > + shll $CHAR_SIZE, %ecx
> > + jz L(page_cross_continue)
> > + bsfl %ecx, %ecx
> > + rep movsb
> > + VZEROUPPER_RETURN
> > +
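
(One more note on the shift above, since the comment is terse: shifting
the null mask left by CHAR_SIZE before the bsf makes the result include
the terminator, which is exactly the byte count rep movsb needs. A tiny
illustration for the byte case, using GCC's __builtin_ctz:)

#include <stdio.h>

int
main (void)
{
  unsigned int mask = 1u << 5;                 /* null at offset 5 */
  unsigned int n = __builtin_ctz (mask << 1);  /* 5 chars + null = 6 */
  printf ("%u\n", n);
  return 0;
}
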
> > +L(page_cross_small):
> > + tzcntl %ecx, %ecx
> > + jz L(page_cross_setz)
> > + cmpl %edx, %ecx
> > + cmova %edx, %ecx
> > + rep movsb
> > +L(page_cross_setz):
> > + movNULL $0, (%rdi)
> > + VZEROUPPER_RETURN
> > +L(zero_len):
> > +# ifdef USE_AS_WCSCPY
> > + test %rdx, %rdx
> > +# endif
> > + jnz OVERFLOW_STRCAT
> > + ret
> > +
> > +
> > +END(STRNCAT)
> > +#endif
> > diff --git a/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S
> > index 79e7083299..b582a4a7a1 100644
> > --- a/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S
> > +++ b/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S
> > @@ -1,3 +1,3 @@
> > -#define USE_AS_STRNCPY
> > -#define STRCPY __strncpy_avx2_rtm
> > -#include "strcpy-avx2-rtm.S"
> > +#define STRNCPY __strncpy_avx2_rtm
> > +#include "x86-avx2-rtm-vecs.h"
> > +#include "strncpy-avx2.S"
> > diff --git a/sysdeps/x86_64/multiarch/strncpy-avx2.S b/sysdeps/x86_64/multiarch/strncpy-avx2.S
> > index ce634e94fa..d1b25b7a42 100644
> > --- a/sysdeps/x86_64/multiarch/strncpy-avx2.S
> > +++ b/sysdeps/x86_64/multiarch/strncpy-avx2.S
> > @@ -1,7 +1,735 @@
> > -#ifndef STRNCPY
> > -# define STRNCPY __strncpy_avx2
> > -#endif
> > +/* strncpy with AVX2
> > + Copyright (C) 2022 Free Software Foundation, Inc.
> > + This file is part of the GNU C Library.
> > +
> > + The GNU C Library is free software; you can redistribute it and/or
> > + modify it under the terms of the GNU Lesser General Public
> > + License as published by the Free Software Foundation; either
> > + version 2.1 of the License, or (at your option) any later version.
> > +
> > + The GNU C Library is distributed in the hope that it will be useful,
> > + but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > + Lesser General Public License for more details.
> > +
> > + You should have received a copy of the GNU Lesser General Public
> > + License along with the GNU C Library; if not, see
> > + <https://www.gnu.org/licenses/>. */
> > +
> > +#include <isa-level.h>
> > +
> > +#if ISA_SHOULD_BUILD (3)
> > +
> > +# include <sysdep.h>
> > +
> > +
> > +# ifndef VEC_SIZE
> > +# include "x86-avx2-vecs.h"
> > +# endif
> > +
> > +# ifndef STRNCPY
> > +# define STRNCPY __strncpy_avx2
> > +# endif
> > +
> > +
> > +# ifdef USE_AS_WCSCPY
> > +# define VPCMPEQ vpcmpeqd
> > +# define VPMIN vpminud
> > +# define CHAR_SIZE 4
> > +# else
> > +# define VPCMPEQ vpcmpeqb
> > +# define VPMIN vpminub
> > +# define CHAR_SIZE 1
> > +# endif
> > +
> > +# include "strncpy-or-cat-overflow-def.h"
> > +
> > +# define PAGE_SIZE 4096
> > +
> > +# define VZERO VMM(7)
> > +# define VZERO_128 VMM_128(7)
> > +
> > +
> > + .section SECTION(.text), "ax", @progbits
> > +ENTRY(STRNCPY)
> > + /* Filter zero-length strings and very long strings. Zero-length
> > + strings just return. Very long strings are handled by running
> > + rep stos{b|l} to zero-fill the destination (which will almost
> > + certainly segfault); if that succeeds, OVERFLOW_STRCPY
> > + (strcpy, stpcpy, wcscpy, wcpcpy) is called to finish. */
> > +# ifdef USE_AS_WCSCPY
> > + decq %rdx
> > + movq %rdx, %rax
> > + /* Bits 56 and up are past the end of the maximum supported
> > + address space. */
> > + shr $56, %rax
> > + jnz L(zero_len)
> > + salq $2, %rdx
> > +# else
> > + decq %rdx
> > + /* `dec` can macro-fuse with `jl`. If the branch ever needs to
> > + become `jb`, replace `dec` with `sub` (dec does not set CF). */
> > + jl L(zero_len)
> > +# endif
> > +
> > + vpxor %VZERO_128, %VZERO_128, %VZERO_128
> > + movl %esi, %eax
> > + andl $(PAGE_SIZE - 1), %eax
> > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax
> > + ja L(page_cross)
> > +
> > +L(page_cross_continue):
> > + VMOVU (%rsi), %VMM(0)
> > + VPCMPEQ %VMM(0), %VZERO, %VMM(6)
> > + vpmovmskb %VMM(6), %ecx
> > +
> > + /* If not STPCPY, just set the return value (dst) ahead of time. */
> > +# ifndef USE_AS_STPCPY
> > + movq %rdi, %rax
> > +# elif defined USE_AS_WCSCPY
> > + /* Clear %eax to break the dependency, as nearly all of the
> > + wcpncpy return paths use `setc %al`. */
> > + xorl %eax, %eax
> > +# endif
> > +
> > + cmpq $(VEC_SIZE - CHAR_SIZE), %rdx
> > + /* `jb` because length rdx is now length - CHAR_SIZE. */
> > + jbe L(less_1x_vec)
> > +
> > + /* This may overset but that's fine because we still need to
> > + zero-fill. */
> > + VMOVU %VMM(0), (%rdi)
> > +
> > + testl %ecx, %ecx
> > + jnz L(zfill)
> > +
> > + /* Align. */
> > + addq %rsi, %rdx
> > + subq %rsi, %rdi
> > + orq $(VEC_SIZE - 1), %rsi
> > + incq %rsi
> > +L(last_4x_vec):
> > + addq %rsi, %rdi
> > +L(loop_last_4x_vec):
> > + subq %rsi, %rdx
> > +
> > +
> > + VMOVA 0(%rsi), %VMM(1)
> > + VPCMPEQ %VMM(1), %VZERO, %VMM(6)
> > + vpmovmskb %VMM(6), %ecx
> > +
> > + cmpq $(VEC_SIZE * 2), %rdx
> > + jae L(more_2x_vec)
> > +
> > + cmpl $(VEC_SIZE), %edx
> > + jb L(ret_vec_x1_len)
> > +
> > + testl %ecx, %ecx
> > + jnz L(ret_vec_x1)
> > +
> > + VPCMPEQ VEC_SIZE(%rsi), %VZERO, %VMM(6)
> > + VMOVU %VMM(1), (%rdi)
> > + vpmovmskb %VMM(6), %ecx
> > + shlq $VEC_SIZE, %rcx
> > +L(ret_vec_x1_len):
> > + tzcntq %rcx, %rcx
> > + cmpl %ecx, %edx
> > + jbe L(ret_vec_x1_len_no_zfill)
> > + /* The expected fall-through case is copy len < buffer len. */
> > + VMOVU %VZERO, ((0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
> > +L(ret_vec_x1_len_no_zfill_mov):
> > + movl %ecx, %edx
> > +# ifdef USE_AS_STPCPY
> > + /* clear flags. */
> > + xorl %ecx, %ecx
> > +# endif
> > +L(ret_vec_x1_len_no_zfill):
> > + VMOVU ((0)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
> > + VMOVU %VMM(1), ((0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
> > +# ifdef USE_AS_STPCPY
> > +# ifdef USE_AS_WCSCPY
> > + setc %al
> > + addq %rdx, %rdi
> > + leaq (%rdi, %rax, CHAR_SIZE), %rax
> > +# else
> > + movl %edx, %eax
> > + adcq %rdi, %rax
> > +# endif
> > +# endif
> > +L(return_vzeroupper):
> > + ZERO_UPPER_VEC_REGISTERS_RETURN
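
(A note on the `adc` return-value idiom above, which several of the
STPCPY paths share: after the earlier `cmp %ecx, %edx`, CF is set
exactly when the null terminator falls outside the n-byte buffer.
Roughly, with invented names, this is a sketch rather than the actual
code:)

#include <stddef.h>

/* rax = rdi + rdx + CF.  With CF clear, rdx is the offset of the
   written null, so the result points at it; with CF set, rdx is n - 1,
   so the result is dst + n, as stpncpy requires.  */
static char *
stpncpy_retval (char *dst, size_t rdx, unsigned int carry)
{
  return dst + rdx + carry;
}
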
> > +
> > + .p2align 4,, 6
> > +L(ret_vec_x1):
> > + bsfl %ecx, %ecx
> > + VMOVU %VZERO, ((0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
> > + subl %ecx, %edx
> > + /* Check if we need to reload/store. */
> > + cmpl $VEC_SIZE, %edx
> > + jb L(ret_vec_x1_len_no_zfill_mov)
> > + /* Otherwise safe to just store directly. */
> > + VMOVU %VMM(1), (%rdi)
> > + VMOVU %VZERO, (%rdi, %rcx)
> > +# ifdef USE_AS_STPCPY
> > + leaq (%rdi, %rcx), %rax
> > +# endif
> > + VZEROUPPER_RETURN
> > +
> > + .p2align 4,, 12
> > +L(more_2x_vec):
> > + VMOVU %VMM(1), (%rdi)
> > + testl %ecx, %ecx
> > + /* Must fill at least 2x VEC. */
> > + jnz L(zfill_vec1)
> > +
> > + VMOVA VEC_SIZE(%rsi), %VMM(2)
> > + VMOVU %VMM(2), VEC_SIZE(%rdi)
> > + VPCMPEQ %VMM(2), %VZERO, %VMM(6)
> > + vpmovmskb %VMM(6), %ecx
> > + testl %ecx, %ecx
> > + /* Must fill at least 1x VEC. */
> > + jnz L(zfill_vec2)
> > +
> > + VMOVA (VEC_SIZE * 2)(%rsi), %VMM(3)
> > + VPCMPEQ %VMM(3), %VZERO, %VMM(6)
> > + vpmovmskb %VMM(6), %ecx
> > +
> > + /* Check if len is more than 4x VEC. -CHAR_SIZE because rdx is
> > + len - CHAR_SIZE. */
> > + cmpq $(VEC_SIZE * 4 - CHAR_SIZE), %rdx
> > + ja L(more_4x_vec)
> > +
> > + subl $(VEC_SIZE * 3), %edx
> > + jb L(ret_vec_x3_len)
> > +
> > + testl %ecx, %ecx
> > + jnz L(ret_vec_x3)
> > +
> > + VPCMPEQ (VEC_SIZE * 3)(%rsi), %VZERO, %VMM(6)
> > + VMOVU %VMM(3), (VEC_SIZE * 2)(%rdi)
> > + vpmovmskb %VMM(6), %ecx
> > + tzcntl %ecx, %ecx
> > + cmpl %ecx, %edx
> > + jbe L(ret_vec_x4_len_no_zfill)
> > + /* The expected fall-through case is copy len < buffer len. */
> > + VMOVU %VZERO, ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
> > + movl %ecx, %edx
> > +L(ret_vec_x4_len_no_zfill):
> > + VMOVU ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
> > + VMOVU %VMM(1), ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
> > +# ifdef USE_AS_STPCPY
> > +# ifdef USE_AS_WCSCPY
> > + setc %al
> > + addq %rdx, %rdi
> > + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> > +# else
> > + leal (VEC_SIZE * 3 + 0)(%edx), %eax
> > + adcq %rdi, %rax
> > +# endif
> > +# endif
> > + VZEROUPPER_RETURN
> > +
> > +
> > +L(ret_vec_x3_len):
> > + addl $(VEC_SIZE * 1), %edx
> > + tzcntl %ecx, %ecx
> > + cmpl %ecx, %edx
> > + jbe L(ret_vec_x3_len_no_zfill)
> > + /* The expected fall-through case is copy len < buffer len. */
> > + VMOVU %VZERO, ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
> > +L(ret_vec_x3_len_no_zfill_mov):
> > + movl %ecx, %edx
> > +# ifdef USE_AS_STPCPY
> > + /* clear flags. */
> > + xorl %ecx, %ecx
> > +# endif
> > + .p2align 4,, 4
> > +L(ret_vec_x3_len_no_zfill):
> > + VMOVU ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
> > + VMOVU %VMM(1), ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
> > +# ifdef USE_AS_STPCPY
> > +# ifdef USE_AS_WCSCPY
> > + setc %al
> > + addq %rdx, %rdi
> > + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> > +# else
> > + leal (VEC_SIZE * 2 + 0)(%rdx), %eax
> > + adcq %rdi, %rax
> > +# endif
> > +# endif
> > + VZEROUPPER_RETURN
> > +
> > +
> > + .p2align 4,, 8
> > +L(ret_vec_x3):
> > + bsfl %ecx, %ecx
> > + VMOVU %VZERO, (VEC_SIZE * 3 +(-(VEC_SIZE - CHAR_SIZE)))(%rdi, %rdx)
> > + subl %ecx, %edx
> > + jl L(ret_vec_x3_len_no_zfill_mov)
> > + VMOVU %VMM(3), (VEC_SIZE * 2)(%rdi)
> > + VMOVU %VZERO, (VEC_SIZE * 2)(%rdi, %rcx)
> > +# ifdef USE_AS_STPCPY
> > + leaq (VEC_SIZE * 2)(%rdi, %rcx), %rax
> > +# endif
> > + VZEROUPPER_RETURN
> > +
> > + .p2align 4,, 8
> > +L(more_4x_vec):
> > +
> > + VMOVU %VMM(3), (VEC_SIZE * 2)(%rdi)
> > + testl %ecx, %ecx
> > + jnz L(zfill_vec3)
> > +
> > + VMOVA (VEC_SIZE * 3)(%rsi), %VMM(4)
> > + VMOVU %VMM(4), (VEC_SIZE * 3)(%rdi)
> > + VPCMPEQ %VMM(4), %VZERO, %VMM(6)
> > + vpmovmskb %VMM(6), %ecx
> > + testl %ecx, %ecx
> > + jnz L(zfill_vec4)
> > +
> > + movq %rdx, %rcx
> > + addq %rsi, %rdx
> > + subq %rsi, %rdi
> > + subq $-(VEC_SIZE * 4), %rsi
> > + /* Recheck length before aligning. */
> > + cmpq $(VEC_SIZE * 8 - CHAR_SIZE), %rcx
> > + jbe L(last_4x_vec)
> > +
> > + andq $(VEC_SIZE * -4), %rsi
> > +
> > + /* Do first half of loop ahead of time so loop can just start by
> > + storing. */
> > + VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
> > + VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
> > + VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
> > + VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
> > +
> > + VPMIN %VMM(0), %VMM(1), %VMM(4)
> > + VPMIN %VMM(2), %VMM(3), %VMM(6)
> > + VPMIN %VMM(4), %VMM(6), %VMM(6)
> > + VPCMPEQ %VMM(6), %VZERO, %VMM(6)
> > + vpmovmskb %VMM(6), %r8d
> > + addq %rsi, %rdi
> > + testl %r8d, %r8d
> > + jnz L(loop_4x_done)
> > +
> > + /* Use r9 as end register. */
> > + leaq -(VEC_SIZE * 4 - CHAR_SIZE)(%rdx), %r9
> >
> > -#define USE_AS_STRNCPY
> > -#define STRCPY STRNCPY
> > -#include "strcpy-avx2.S"
> > + .p2align 4,, 11
> > +L(loop_4x_vec):
> > +
> > + VMOVU %VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
> > + VMOVU %VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
> > + subq $(VEC_SIZE * -4), %rsi
> > + VMOVU %VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
> > + VMOVU %VMM(3), (VEC_SIZE * 3 + 0)(%rdi)
> > +
> > + subq $(VEC_SIZE * -4), %rdi
> > + cmpq %rsi, %r9
> > + jbe L(loop_last_4x_vec)
> > +
> > + VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
> > + VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
> > + VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
> > + VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
> > +
> > + VPMIN %VMM(0), %VMM(1), %VMM(4)
> > + VPMIN %VMM(2), %VMM(3), %VMM(6)
> > + VPMIN %VMM(4), %VMM(6), %VMM(6)
> > + VPCMPEQ %VMM(6), %VZERO, %VMM(6)
> > +
> > + vpmovmskb %VMM(6), %r8d
> > +
> > + testl %r8d, %r8d
> > + jz L(loop_4x_vec)
> > +
> > +L(loop_4x_done):
> > + subq %rsi, %rdx
> > + VMOVU %VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
> > + VPCMPEQ %VMM(0), %VZERO, %VMM(6)
> > + vpmovmskb %VMM(6), %ecx
> > + testl %ecx, %ecx
> > + jnz L(zfill_vec1)
> > +
> > + VMOVU %VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
> > + VPCMPEQ %VMM(1), %VZERO, %VMM(6)
> > + vpmovmskb %VMM(6), %ecx
> > + testl %ecx, %ecx
> > + jnz L(zfill_vec2)
> > +
> > + VMOVU %VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
> > + VPCMPEQ %VMM(2), %VZERO, %VMM(6)
> > + vpmovmskb %VMM(6), %ecx
> > + testl %ecx, %ecx
> > + jnz L(zfill_vec3)
> > +
> > + VMOVU %VMM(3), (VEC_SIZE * 3 + 0)(%rdi)
> > + movl %r8d, %ecx
> > +
> > + /* Fall through into the zfill code to zero-fill the remainder. */
> > +
> > + .p2align 4,, 4
> > +L(zfill_vec4):
> > + addq $(VEC_SIZE * 2), %rdi
> > + subq $(VEC_SIZE * 2), %rdx
> > +L(zfill_vec2):
> > + shlq $VEC_SIZE, %rcx
> > +L(zfill):
> > + bsfq %rcx, %rcx
> > + subq %rcx, %rdx
> > + addq %rcx, %rdi
> > +# ifdef USE_AS_STPCPY
> > + movq %rdi, %rax
> > +# endif
> > +L(zfill_from_page_cross):
> > + cmpq $VEC_SIZE, %rdx
> > + jb L(zfill_less_vec_vzeroupper)
> > +
> > +L(zfill_more_1x_vec):
> > + VMOVU %VZERO, CHAR_SIZE(%rdi)
> > + VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx)
> > + cmpq $(VEC_SIZE * 2), %rdx
> > + jae L(zfill_more_2x_vec)
> > +L(zfill_done0):
> > + VZEROUPPER_RETURN
> > +
> > + .p2align 4,, 8
> > +L(zfill_vec3):
> > + addq $(VEC_SIZE * 2), %rdi
> > + subq $(VEC_SIZE * 2), %rdx
> > + .p2align 4,, 2
> > +L(zfill_vec1):
> > + bsfl %ecx, %ecx
> > + addq %rcx, %rdi
> > + subq %rcx, %rdx
> > +# ifdef USE_AS_STPCPY
> > + movq %rdi, %rax
> > +# endif
> > + /* A zfill from vec1/vec3 always has to set at least 2x VEC. */
> > +
> > + VMOVU %VZERO, CHAR_SIZE(%rdi)
> > + VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx)
> > + cmpq $(VEC_SIZE * 2), %rdx
> > + jb L(zfill_done0)
> > +L(zfill_more_2x_vec):
> > + VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE * 2)(%rdi, %rdx)
> > + VMOVU %VZERO, (VEC_SIZE + CHAR_SIZE)(%rdi)
> > + subq $(VEC_SIZE * 4 - CHAR_SIZE), %rdx
> > + jbe L(zfill_done)
> > +
> > + addq %rdi, %rdx
> > + VMOVU %VZERO, (VEC_SIZE * 2 + CHAR_SIZE)(%rdi)
> > + VMOVU %VZERO, (VEC_SIZE * 3 + CHAR_SIZE)(%rdi)
> > +
> > +
> > + VMOVU %VZERO, (VEC_SIZE * 0 + 0)(%rdx)
> > + VMOVU %VZERO, (VEC_SIZE * 1 + 0)(%rdx)
> > +
> > + subq $-(VEC_SIZE * 4 + CHAR_SIZE), %rdi
> > + cmpq %rdi, %rdx
> > + jbe L(zfill_done)
> > +
> > + andq $-(VEC_SIZE), %rdi
> > + .p2align 4,, 12
> > +L(zfill_loop_4x_vec):
> > + VMOVA %VZERO, (VEC_SIZE * 0)(%rdi)
> > + VMOVA %VZERO, (VEC_SIZE * 1)(%rdi)
> > + VMOVA %VZERO, (VEC_SIZE * 2)(%rdi)
> > + VMOVA %VZERO, (VEC_SIZE * 3)(%rdi)
> > + subq $-(VEC_SIZE * 4), %rdi
> > + cmpq %rdi, %rdx
> > + ja L(zfill_loop_4x_vec)
> > +L(zfill_done):
> > + VZEROUPPER_RETURN
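
(Stepping back for a moment: all of the zfill labels above exist because
strncpy has to zero-fill the tail of the buffer when the string is
shorter than n. For comparison only, here is the reference semantics the
routine must match; strncpy_ref is just a name for this sketch, not how
the patch is structured:)

#include <string.h>

static char *
strncpy_ref (char *dst, const char *src, size_t n)
{
  size_t len = strnlen (src, n);    /* characters actually present */
  memcpy (dst, src, len);           /* the copy half */
  memset (dst + len, 0, n - len);   /* the "zfill" half */
  return dst;
}
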
> > +
> > +
> > + .p2align 4,, 8
> > +L(copy_1x):
> > + VMOVU %VMM(0), (%rdi)
> > + testl %ecx, %ecx
> > + jz L(ret_32_32)
> > +L(zfill_less_vec):
> > + bsfl %ecx, %ecx
> > +L(zfill_less_vec_no_bsf):
> > + subq %rcx, %rdx
> > + addq %rcx, %rdi
> > +# ifdef USE_AS_STPCPY
> > + movq %rdi, %rax
> > +# endif
> > +L(zfill_less_vec_vzeroupper):
> > + COND_VZEROUPPER
> > + /* We are taking advantage of the fact that to be here we must
> > + be writing the null terminator at (%rdi, %rcx), so we have a
> > + byte of leeway for overwriting. */
> > + cmpl $16, %edx
> > + jb L(zfill_less_16)
> > + VMOVU %VZERO_128, (%rdi)
> > + VMOVU %VZERO_128, -(16 - CHAR_SIZE)(%rdi, %rdx)
> > + ret
> > +# ifdef USE_AS_STPCPY
> > +L(ret_32_32):
> > + leaq CHAR_SIZE(%rdi, %rdx), %rax
> > + VZEROUPPER_RETURN
> > +# endif
> > +
> > + .p2align 4,, 4
> > +L(copy_16_31):
> > + /* Overfill to avoid branches. */
> > + vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx), %xmm1
> > + vmovdqu %xmm0, (%rdi)
> > + vmovdqu %xmm1, -(16 - CHAR_SIZE)(%rdi, %rdx)
> > + cmpl %ecx, %edx
> > + ja L(zfill_less_vec_no_bsf)
> > +# ifndef USE_AS_STPCPY
> > +L(ret_32_32):
> > +# else
> > +# ifdef USE_AS_WCSCPY
> > + setc %al
> > + addq %rdx, %rdi
> > + leaq (%rdi, %rax, CHAR_SIZE), %rax
> > +# else
> > + movl %edx, %eax
> > + adcq %rdi, %rax
> > +# endif
> > +# endif
> > + VZEROUPPER_RETURN
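
(The "Overfill to avoid branches" comments refer to the usual pair of
overlapping stores: one fixed-size store from the start and one ending
at the last byte cover every length in the bucket without a branch.
A rough C equivalent for the 16..31 byte case, illustration only:)

#include <string.h>

static void
copy_16_31 (char *dst, const char *src, size_t len)
{
  /* len is assumed to be in [16, 31]; the two stores may overlap.  */
  memcpy (dst, src, 16);
  memcpy (dst + len - 16, src + len - 16, 16);
}
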
> > +
> > + .p2align 4,, 4
> > +L(copy_8_15):
> > + /* Overfill to avoid branches. */
> > + movq -(8 - CHAR_SIZE)(%rsi, %rdx), %rsi
> > + vmovq %xmm0, (%rdi)
> > + movq %rsi, -(8 - CHAR_SIZE)(%rdi, %rdx)
> > + cmpl %ecx, %edx
> > + jbe L(ret_8_15)
> > + subq %rcx, %rdx
> > + addq %rcx, %rdi
> > +# ifdef USE_AS_STPCPY
> > + movq %rdi, %rax
> > +# endif
> > + .p2align 4,, 8
> > +L(zfill_less_16):
> > + xorl %ecx, %ecx
> > + cmpl $8, %edx
> > + jb L(zfill_less_8)
> > + movq %rcx, (%rdi)
> > + movq %rcx, -(8 - CHAR_SIZE)(%rdi, %rdx)
> > +# ifndef USE_AS_STPCPY
> > +L(ret_8_15):
> > +# endif
> > + ret
> > +
> > +
> > + .p2align 4,, 8
> > +L(less_1x_vec):
> > + /* Reuse the flags from `cmp $VEC_SIZE, %rdx`. The idea is that
> > + many buffer sizes are aligned by convention. */
> > + je L(copy_1x)
> > +
> > + tzcntl %ecx, %ecx
> > + cmpl $16, %edx
> > + jae L(copy_16_31)
> > +
> > + COND_VZEROUPPER
> > + cmpl $8, %edx
> > + jae L(copy_8_15)
> > +# ifdef USE_AS_WCSCPY
> > + testl %ecx, %ecx
> > + jz L(zfill_less_8_set_ret)
> > +
> > + movl (%rsi, %rdx), %esi
> > + vmovd %xmm0, (%rdi)
> > + movl %esi, (%rdi, %rdx)
> > +
> > +# ifdef USE_AS_STPCPY
> > + cmpl %ecx, %edx
> > +L(ret_8_15):
> > + setc %al
> > + addq %rdx, %rdi
> > + leaq (%rdi, %rax, CHAR_SIZE), %rax
> > +# endif
> > + ret
> > +L(zfill_less_8_set_ret):
> > + xorl %ecx, %ecx
> > +# ifdef USE_AS_STPCPY
> > + movq %rdi, %rax
> > +# endif
> > +L(zfill_less_8):
> > + movl %ecx, (%rdi)
> > + movl %ecx, (%rdi, %rdx)
> > + ret
> > +
> > +# else
> > + cmpl $3, %edx
> > + jb L(copy_0_3)
> > + /* Overfill to avoid branches. */
> > + movl -3(%rsi, %rdx), %esi
> > + vmovd %xmm0, (%rdi)
> > + movl %esi, -3(%rdi, %rdx)
> > + cmpl %ecx, %edx
> > + jbe L(ret_4_7)
> > + subq %rcx, %rdx
> > + addq %rcx, %rdi
> > +# ifdef USE_AS_STPCPY
> > + movq %rdi, %rax
> > +# endif
> > + xorl %ecx, %ecx
> > + .p2align 4,, 8
> > +L(zfill_less_8):
> > + cmpl $3, %edx
> > + jb L(zfill_less_3)
> > + movl %ecx, (%rdi)
> > + movl %ecx, -3(%rdi, %rdx)
> > +# ifdef USE_AS_STPCPY
> > + ret
> > +# endif
> > +
> > +L(ret_4_7):
> > +# ifdef USE_AS_STPCPY
> > +L(ret_8_15):
> > + movl %edx, %eax
> > + adcq %rdi, %rax
> > +# endif
> > + ret
> > +
> > + .p2align 4,, 4
> > +L(zfill_less_3):
> > + testl %edx, %edx
> > + jz L(zfill_1)
> > + movw %cx, (%rdi)
> > +L(zfill_1):
> > + movb %cl, (%rdi, %rdx)
> > + ret
> > +
> > + .p2align 4,, 8
> > +L(copy_0_3):
> > + vmovd %xmm0, %r8d
> > + testl %edx, %edx
> > + jz L(copy_1)
> > + movw %r8w, (%rdi)
> > + cmpl %ecx, %edx
> > + ja L(zfill_from_1)
> > + movzbl (%rsi, %rdx), %r8d
> > +# ifdef USE_AS_STPCPY
> > + movl %edx, %eax
> > + adcq %rdi, %rax
> > + movb %r8b, (%rdi, %rdx)
> > + ret
> > +# endif
> > +
> > +L(copy_1):
> > +# ifdef USE_AS_STPCPY
> > + movl %edx, %eax
> > + cmpl %ecx, %edx
> > + adcq %rdi, %rax
> > +# endif
> > +# ifdef USE_AS_WCSCPY
> > + vmovd %xmm0, (%rdi)
> > +# else
> > + movb %r8b, (%rdi, %rdx)
> > +# endif
> > + ret
> > +# endif
> > +
> > + .p2align 4,, 2
> > +L(zero_len):
> > + movq %rdi, %rax
> > + ret
> > +# ifndef USE_AS_WCSCPY
> > + .p2align 4,, 8
> > +L(zfill_from_1):
> > +# ifdef USE_AS_STPCPY
> > + leaq (%rdi, %rcx), %rax
> > +# endif
> > + movw $0, -1(%rdi, %rdx)
> > + ret
> > +# endif
> > +
> > + .p2align 4,, 4
> > + .p2align 6,, 8
> > +L(page_cross):
> > + movq %rsi, %rax
> > + andq $(VEC_SIZE * -1), %rax
> > +
> > + VPCMPEQ (%rax), %VZERO, %VMM(6)
> > +
> > + vpmovmskb %VMM(6), %ecx
> > + shrxl %esi, %ecx, %ecx
> > +
> > + subl %esi, %eax
> > + andl $(VEC_SIZE - 1), %eax
> > + cmpq %rax, %rdx
> > + jb L(page_cross_small)
> > + /* Optimizing more aggressively for space as this is very cold
> > + code. This saves 2x cache lines. */
> > +
> > + /* If rcx is zero (no null byte seen) jump back to the normal
> > + path; otherwise handle the null below. */
> > + shl $CHAR_SIZE, %ecx
> > + jz L(page_cross_continue)
> > + bsf %ecx, %ecx
> > +
> > + subq %rcx, %rdx
> > +# ifdef USE_AS_STPCPY
> > + leaq -CHAR_SIZE(%rdi, %rcx), %rax
> > +# else
> > + movq %rdi, %rax
> > +# endif
> > +
> > + rep movsb
> > +# ifdef USE_AS_WCSCPY
> > + movl $0, (%rdi)
> > +# else
> > + movb $0, (%rdi)
> > +# endif
> > + jmp L(zfill_from_page_cross)
> > +
> > +L(page_cross_small):
> > + tzcntl %ecx, %ecx
> > + xorl %eax, %eax
> > + cmpl %ecx, %edx
> > + jbe L(page_cross_copy_only)
> > +
> > + /* Do a zfill of the tail before copying. */
> > + movq %rdi, %r9
> > + movl %ecx, %r8d
> > +
> > + subl %ecx, %edx
> > + leaq CHAR_SIZE(%rdi, %rcx), %rdi
> > + movl %edx, %ecx
> > + rep stosb
> > + movq %r9, %rdi
> > + movl %r8d, %edx
> > +L(page_cross_copy_only):
> > + leal CHAR_SIZE(%rdx), %ecx
> > +# ifdef USE_AS_STPCPY
> > +# ifdef USE_AS_WCSCPY
> > + setc %al
> > + addq %rdi, %rdx
> > + leaq (%rdx, %rax, CHAR_SIZE), %rax
> > +# else
> > + movl %edx, %eax
> > + adcq %rdi, %rax
> > +# endif
> > +# else
> > + movq %rdi, %rax
> > +# endif
> > + rep movsb
> > + ret
> > +
> > +
> > +L(best_effort_strncpy):
> > + movq %rdx, %rcx
> > + xorl %eax, %eax
> > + movq %rdi, %r8
> > + /* The length is >= 2^63. We very much expect to segfault at the
> > + rep stos; if that doesn't happen, just strcpy to finish. */
> > +# ifdef USE_AS_WCSCPY
> > + rep stosl
> > +# else
> > + rep stosb
> > +# endif
> > + movq %r8, %rdi
> > + jmp OVERFLOW_STRCPY
> > +END(STRNCPY)
> > +#endif
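
(For completeness, what the best-effort block just above amounts to in
C for the plain strncpy case. Illustration only, using the libc calls
the OVERFLOW_STRCPY macro stands for here:)

#include <string.h>

static char *
best_effort_strncpy (char *dst, const char *src, size_t n)
{
  /* n has bit 63 set, so the zero-fill is expected to fault; if it
     somehow succeeds, finish with the unbounded copy.  */
  memset (dst, 0, n);       /* rep stosb */
  strcpy (dst, src);        /* jmp OVERFLOW_STRCPY */
  return dst;
}
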
> > diff --git a/sysdeps/x86_64/multiarch/x86-avx-vecs.h b/sysdeps/x86_64/multiarch/x86-avx-vecs.h
> > index dca1089060..01bead1435 100644
> > --- a/sysdeps/x86_64/multiarch/x86-avx-vecs.h
> > +++ b/sysdeps/x86_64/multiarch/x86-avx-vecs.h
> > @@ -27,7 +27,10 @@
> > #define VEC_SIZE 32
> > #include "x86-vec-macros.h"
> >
> > -#define USE_WITH_AVX 1
> > +#ifndef USE_WITH_AVX2
> > +# define USE_WITH_AVX 1
>
> Is this checked somewhere?
>
> > +#endif
> > +
> > #define SECTION(p) p##.avx
> >
> > /* 4-byte mov instructions with AVX2. */
> > diff --git a/sysdeps/x86_64/multiarch/x86-avx2-rtm-vecs.h b/sysdeps/x86_64/multiarch/x86-avx2-rtm-vecs.h
> > new file mode 100644
> > index 0000000000..a5966701ec
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/x86-avx2-rtm-vecs.h
> > @@ -0,0 +1,26 @@
> > +/* Common config for AVX2-RTM VECs
> > + All versions must be listed in ifunc-impl-list.c.
> > + Copyright (C) 2022 Free Software Foundation, Inc.
> > + This file is part of the GNU C Library.
> > +
> > + The GNU C Library is free software; you can redistribute it and/or
> > + modify it under the terms of the GNU Lesser General Public
> > + License as published by the Free Software Foundation; either
> > + version 2.1 of the License, or (at your option) any later version.
> > +
> > + The GNU C Library is distributed in the hope that it will be useful,
> > + but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > + Lesser General Public License for more details.
> > +
> > + You should have received a copy of the GNU Lesser General Public
> > + License along with the GNU C Library; if not, see
> > + <https://www.gnu.org/licenses/>. */
> > +
> > +#ifndef _X86_AVX2_RTM_VECS_H
> > +#define _X86_AVX2_RTM_VECS_H 1
> > +
> > +#define USE_WITH_AVX2 1
> > +#include "x86-avx-rtm-vecs.h"
> > +
> > +#endif
> > diff --git a/sysdeps/x86_64/multiarch/x86-avx2-vecs.h b/sysdeps/x86_64/multiarch/x86-avx2-vecs.h
> > new file mode 100644
> > index 0000000000..16d7ae5147
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/x86-avx2-vecs.h
> > @@ -0,0 +1,27 @@
> > +/* Common config for AVX2 VECs
> > + All versions must be listed in ifunc-impl-list.c.
> > + Copyright (C) 2022 Free Software Foundation, Inc.
> > + This file is part of the GNU C Library.
> > +
> > + The GNU C Library is free software; you can redistribute it and/or
> > + modify it under the terms of the GNU Lesser General Public
> > + License as published by the Free Software Foundation; either
> > + version 2.1 of the License, or (at your option) any later version.
> > +
> > + The GNU C Library is distributed in the hope that it will be useful,
> > + but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > + Lesser General Public License for more details.
> > +
> > + You should have received a copy of the GNU Lesser General Public
> > + License along with the GNU C Library; if not, see
> > + <https://www.gnu.org/licenses/>. */
> > +
> > +#ifndef _X86_AVX2_VECS_H
> > +#define _X86_AVX2_VECS_H 1
> > +
> > +#define USE_WITH_AVX2 1
>
> Is this checked somewhere?
>
> > +
> > +#include "x86-avx-vecs.h"
> > +
> > +#endif
> > --
> > 2.34.1
> >
>
>
> --
> H.J.