x86: Unifies 'strnlen-evex' and 'strnlen-evex512' implementations.

author Matthew Sterrett <matthew.sterrett@intel.com>

Fri, 9 Aug 2024 22:05:09 +0000 (15:05 -0700)

committer H.J. Lu <hjl.tools@gmail.com>

Mon, 19 Aug 2024 14:34:02 +0000 (07:34 -0700)
author Matthew Sterrett <matthew.sterrett@intel.com>
Fri, 9 Aug 2024 22:05:09 +0000 (15:05 -0700)
committer H.J. Lu <hjl.tools@gmail.com>
Mon, 19 Aug 2024 14:34:02 +0000 (07:34 -0700)
diff --git a/sysdeps/x86_64/multiarch/strnlen-evex-base.S b/sysdeps/x86_64/multiarch/strnlen-evex-base.S

new file mode 100644 (file)

index 0000000..1c2cfdf
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strnlen-evex-base.S
@@ -0,0 +1,462 @@
+/* strnlen/wcsnlen optimized with 256/512-bit EVEX instructions.
+   Copyright (C) 2022-2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+
+#include <isa-level.h>
+
+#if ISA_SHOULD_BUILD (4)
+
+# include <sysdep.h>
+
+#ifdef USE_AS_WCSLEN
+# define VPCMPEQ       vpcmpeqd
+# define VPTESTN       vptestnmd
+# define VPMINU        vpminud
+# define CHAR_SIZE     4
+#else
+# define VPCMPEQ       vpcmpeqb
+# define VPTESTN       vptestnmb
+# define VPMINU        vpminub
+# define CHAR_SIZE     1
+#endif
+
+#define XZERO  VMM_128(0)
+#define VZERO  VMM(0)
+#define PAGE_SIZE      4096
+#define CHAR_PER_VEC   (VEC_SIZE / CHAR_SIZE)
+
+#if CHAR_PER_VEC == 32
+# define SUB_SHORT(imm, reg)   subb $(imm), %VGPR_SZ(reg, 8)
+#else
+# define SUB_SHORT(imm, reg)   subl $(imm), %VGPR_SZ(reg, 32)
+#endif
+
+#ifdef USE_AS_WCSLEN
+/* For wide-character, we care more about limitting code size
+   than optimally aligning targets, so just cap nop padding
+   reasonably low.  */
+# define P2ALIGN(...)  .p2align 4,, 6
+# define P2ALIGN_CLAMPED(...)  P2ALIGN(__VA_ARGS__)
+#else
+# define P2ALIGN(x)    .p2align x
+# define P2ALIGN_CLAMPED(x, y) .p2align x,, y
+#endif
+
+       .section SECTION(.text), "ax", @progbits
+       /* Aligning entry point to 64 byte, provides better performance for
+          one vector length string.  */
+ENTRY_P2ALIGN(STRNLEN, 6)
+       /* rdi is pointer to array, rsi is the upper limit.  */
+
+       /* Check zero length.  */
+       test    %RSI_LP, %RSI_LP
+       jz      L(zero)
+
+#ifdef __ILP32__
+       /* Clear the upper 32 bits.  */
+       movl    %esi, %esi
+#endif
+
+       vpxorq  %XZERO, %XZERO, %XZERO
+
+       /* Check that we won't cross a page boundary with our first load.  */
+       movl    %edi, %eax
+       shll    $20, %eax
+       cmpl    $((PAGE_SIZE - VEC_SIZE) << 20), %eax
+       ja      L(crosses_page_boundary)
+
+       /* Check the first VEC_SIZE bytes.  Each bit in K0 represents a
+          null byte.  */
+       VPCMPEQ (%rdi), %VZERO, %k0
+       KMOV    %k0, %VRCX
+
+       /* If src (rcx) is zero, bsf does not change the result.  NB:
+          Must use 64-bit bsf here so that upper bits of len are not
+          cleared.  */
+       movq    %rsi, %rax
+       bsfq    %rcx, %rax
+
+       /* If rax > CHAR_PER_VEC then rcx must have been zero (no null
+          CHAR) and rsi must be > CHAR_PER_VEC.  */
+       cmpq    $CHAR_PER_VEC, %rax
+       ja      L(more_1x_vec)
+
+       /* Check if first match in bounds.  */
+       cmpq    %rax, %rsi
+       cmovb   %esi, %eax
+       ret
+
+#if VEC_SIZE == 32
+       P2ALIGN_CLAMPED(4, 2)
+L(zero):
+L(max_0):
+       movl    %esi, %eax
+       ret
+#endif
+
+       P2ALIGN_CLAMPED(4, 10)
+L(more_1x_vec):
+L(cross_page_continue):
+       /* After this calculation, rax stores the number of elements
+          left to be processed The complexity comes from the fact some
+          elements get read twice due to alignment and we need to be
+          sure we don't count them twice (else, it would just be rsi -
+          CHAR_PER_VEC).  */
+
+#ifdef USE_AS_WCSLEN
+       /* Need to compute directly for wcslen as CHAR_SIZE * rsi can
+          overflow.  */
+       movq    %rdi, %rax
+       andq    $(VEC_SIZE * -1), %rdi
+       subq    %rdi, %rax
+       sarq    $2, %rax
+       leaq    -(CHAR_PER_VEC * 1)(%rax, %rsi), %rax
+#else
+       /* Calculate ptr + N - VEC_SIZE, then mask off the low bits,
+          then subtract ptr to get the new aligned limit value.  */
+       leaq    (VEC_SIZE * -1)(%rsi, %rdi), %rax
+       andq    $(VEC_SIZE * -1), %rdi
+       subq    %rdi, %rax
+#endif
+
+       VPCMPEQ VEC_SIZE(%rdi), %VZERO, %k0
+
+       /* Checking here is faster for 256-bit but not 512-bit */
+#if VEC_SIZE == 0
+       KMOV    %k0, %VRDX
+       test    %VRDX, %VRDX
+       jnz     L(last_vec_check)
+#endif
+
+       cmpq    $(CHAR_PER_VEC * 2), %rax
+       ja      L(more_2x_vec)
+
+L(last_2x_vec_or_less):
+
+       /* Checking here is faster for 512-bit but not 256-bit */
+#if VEC_SIZE != 0
+       KMOV    %k0, %VRDX
+       test    %VRDX, %VRDX
+       jnz     L(last_vec_check)
+#endif
+
+       /* Check for the end of data.  */
+       SUB_SHORT (CHAR_PER_VEC, rax)
+       jbe     L(max_0)
+
+       /* Check the final remaining vector.  */
+       VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0
+       KMOV    %k0, %VRDX
+       test    %VRDX, %VRDX
+#if VEC_SIZE == 32
+       jz      L(max_0)
+#else
+       jnz     L(last_vec_check)
+       P2ALIGN_CLAMPED(4, 2)
+L(zero):
+L(max_0):
+       movl    %esi, %eax
+       ret
+
+#endif
+       P2ALIGN_CLAMPED(4, 4)
+L(last_vec_check):
+       bsf     %VRDX, %VRDX
+       sub     %eax, %edx
+       lea     (%rsi, %rdx), %eax
+       cmovae  %esi, %eax
+       ret
+
+
+#if VEC_SIZE == 32
+       P2ALIGN_CLAMPED(4, 8)
+#endif
+L(last_4x_vec_or_less):
+       addl    $(CHAR_PER_VEC * -4), %eax
+       VPCMPEQ (VEC_SIZE * 5)(%rdi), %VZERO, %k0
+
+#if VEC_SIZE == 64
+       KMOV    %k0, %VRDX
+       test    %VRDX, %VRDX
+       jnz     L(last_vec_check)
+#endif
+
+       subq    $(VEC_SIZE * -4), %rdi
+       cmpl    $(CHAR_PER_VEC * 2), %eax
+       jbe     L(last_2x_vec_or_less)
+
+       P2ALIGN_CLAMPED(4, 6)
+L(more_2x_vec):
+       /* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without
+          rechecking bounds.  */
+
+       /* Already checked in 256-bit case */
+#if VEC_SIZE != 0
+       KMOV    %k0, %VRDX
+
+       test    %VRDX, %VRDX
+       jnz     L(first_vec_x1)
+#endif
+
+       VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0
+       KMOV    %k0, %VRDX
+
+       test    %VRDX, %VRDX
+       jnz     L(first_vec_x2)
+
+       cmpq    $(CHAR_PER_VEC * 4), %rax
+       ja      L(more_4x_vec)
+
+
+       VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0
+       KMOV    %k0, %VRDX
+       addl    $(CHAR_PER_VEC * -2), %eax
+       test    %VRDX, %VRDX
+       jnz     L(last_vec_check)
+
+       subb    $(CHAR_PER_VEC), %al
+       jbe     L(max_1)
+
+       VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0
+       KMOV    %k0, %VRDX
+
+       test    %VRDX, %VRDX
+       jnz     L(last_vec_check)
+L(max_1):
+       movl    %esi, %eax
+       ret
+
+
+       P2ALIGN_CLAMPED(4, 14)
+L(first_vec_x2):
+#if VEC_SIZE == 64
+       /* If VEC_SIZE == 64 we can fit logic for full return label in
+          spare bytes before next cache line.  */
+       bsf     %VRDX, %VRDX
+       sub     %eax, %esi
+       leal    (CHAR_PER_VEC * 1)(%rsi, %rdx), %eax
+       ret
+       P2ALIGN_CLAMPED(4, 6)
+#else
+       addl    $CHAR_PER_VEC, %esi
+#endif
+L(first_vec_x1):
+       bsf     %VRDX, %VRDX
+       sub     %eax, %esi
+       leal    (CHAR_PER_VEC * 0)(%rsi, %rdx), %eax
+       ret
+
+#if VEC_SIZE == 64
+       P2ALIGN_CLAMPED(4, 6)
+L(first_vec_x4):
+# if VEC_SIZE == 64
+       /* If VEC_SIZE == 64 we can fit logic for full return label in
+          spare bytes before next cache line.  */
+       bsf     %VRDX, %VRDX
+       sub     %eax, %esi
+       leal    (CHAR_PER_VEC * 3)(%rsi, %rdx), %eax
+       ret
+       P2ALIGN_CLAMPED(4, 6)
+# else
+       addl    $CHAR_PER_VEC, %esi
+# endif
+L(first_vec_x3):
+       bsf     %VRDX, %VRDX
+       sub     %eax, %esi
+       leal    (CHAR_PER_VEC * 2)(%rsi, %rdx), %eax
+       ret
+#endif
+
+       P2ALIGN_CLAMPED(6, 20)
+L(more_4x_vec):
+       VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0
+       KMOV    %k0, %VRDX
+       test    %VRDX, %VRDX
+       jnz     L(first_vec_x3)
+
+       VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0
+       KMOV    %k0, %VRDX
+       test    %VRDX, %VRDX
+       jnz     L(first_vec_x4)
+
+       /* Check if at last VEC_SIZE * 4 length before aligning for the
+          loop.  */
+       cmpq    $(CHAR_PER_VEC * 8), %rax
+       jbe     L(last_4x_vec_or_less)
+
+
+       /* Compute number of words checked after aligning.  */
+#ifdef USE_AS_WCSLEN
+       /* Need to compute directly for wcslen as CHAR_SIZE * rsi can
+          overflow.  */
+       leaq    (VEC_SIZE * -3)(%rdi), %rdx
+#else
+       leaq    (VEC_SIZE * -3)(%rdi, %rax), %rax
+#endif
+
+       subq    $(VEC_SIZE * -1), %rdi
+
+       /* Align data to VEC_SIZE * 4.  */
+#if VEC_SIZE == 64
+       /* Saves code size.  No evex512 processor has partial register
+          stalls.  If that change this can be replaced with `andq
+          $-(VEC_SIZE * 4), %rdi`.  */
+       xorb    %dil, %dil
+#else
+       andq    $-(VEC_SIZE * 4), %rdi
+#endif
+
+#ifdef USE_AS_WCSLEN
+       subq    %rdi, %rdx
+       sarq    $2, %rdx
+       addq    %rdx, %rax
+#else
+       subq    %rdi, %rax
+#endif
+
+       // mov     %rdi, %rdx
+
+       P2ALIGN(6)
+L(loop):
+       /* VPMINU and VPCMP combination provide better performance as
+          compared to alternative combinations.  */
+       VMOVA   (VEC_SIZE * 4)(%rdi), %VMM(1)
+       VPMINU  (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2)
+       VMOVA   (VEC_SIZE * 6)(%rdi), %VMM(3)
+       VPMINU  (VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4)
+
+       VPTESTN %VMM(2), %VMM(2), %k0
+       VPTESTN %VMM(4), %VMM(4), %k1
+
+       subq    $-(VEC_SIZE * 4), %rdi
+       KORTEST %k0, %k1
+
+       jnz     L(loopend)
+       subq    $(CHAR_PER_VEC * 4), %rax
+       ja      L(loop)
+       mov     %rsi, %rax
+       ret
+
+
+#if VEC_SIZE == 32
+       P2ALIGN_CLAMPED(4, 6)
+L(first_vec_x4):
+# if VEC_SIZE == 64
+       /* If VEC_SIZE == 64 we can fit logic for full return label in
+          spare bytes before next cache line.  */
+       bsf     %VRDX, %VRDX
+       sub     %eax, %esi
+       leal    (CHAR_PER_VEC * 3)(%rsi, %rdx), %eax
+       ret
+       P2ALIGN_CLAMPED(4, 6)
+# else
+       addl    $CHAR_PER_VEC, %esi
+# endif
+L(first_vec_x3):
+       bsf     %VRDX, %VRDX
+       sub     %eax, %esi
+       leal    (CHAR_PER_VEC * 2)(%rsi, %rdx), %eax
+       ret
+#endif
+
+
+       P2ALIGN_CLAMPED(4, 11)
+L(loopend):
+       /* We found a null terminator in one of the 4 vectors.  */
+
+       /* Check the first vector.  */
+       movq    %rax, %r8
+       VPTESTN %VMM(1), %VMM(1), %k2
+       KMOV    %k2, %VRCX
+       bsf     %rcx, %r8
+
+       cmpq    $(CHAR_PER_VEC), %r8
+       jbe     L(end_vec)
+
+       /* Check the second vector.  */
+       subq    $(CHAR_PER_VEC), %rax
+       movq    %rax, %r8
+       KMOV    %k0, %VRCX
+       bsf     %rcx, %r8
+
+       cmpq    $(CHAR_PER_VEC), %r8
+       jbe     L(end_vec)
+
+       /* Check the third vector.  */
+       subq    $(CHAR_PER_VEC), %rax
+       movq    %rax, %r8
+       VPTESTN %VMM(3), %VMM(3), %k2
+       KMOV    %k2, %VRCX
+       bsf     %rcx, %r8
+
+       cmpq    $(CHAR_PER_VEC), %r8
+       jbe     L(end_vec)
+
+       /* It is in the fourth vector.  */
+       subq    $(CHAR_PER_VEC), %rax
+       movq    %rax, %r8
+       KMOV    %k1, %VRCX
+       bsf     %rcx, %r8
+
+       P2ALIGN_CLAMPED(4, 3)
+L(end_vec):
+       /* Get the number that has been processed.  */
+       movq    %rsi, %rcx
+       subq    %rax, %rcx
+
+       /* Add that to the offset we found the null terminator at.  */
+       leaq    (%r8, %rcx), %rax
+
+       /* Take the min of that and the limit.  */
+       cmpq    %rsi, %rax
+       cmovnb  %rsi, %rax
+       ret
+
+       P2ALIGN_CLAMPED(4, 11)
+L(crosses_page_boundary):
+       /* Align data backwards to VEC_SIZE.  */
+       shrl    $20, %eax
+       movq    %rdi, %rcx
+       andq    $-VEC_SIZE, %rcx
+       VPCMPEQ (%rcx), %VZERO, %k0
+
+       KMOV    %k0, %VRCX
+#ifdef USE_AS_WCSLEN
+       shrl    $2, %eax
+       andl    $(CHAR_PER_VEC - 1), %eax
+#endif
+       /* By this point rax contains number of bytes we need to skip.  */
+       shrx    %VRAX, %VRCX, %VRCX
+
+       /* Calculates CHAR_PER_VEC - eax and stores in eax.  */
+       negl    %eax
+       andl    $(CHAR_PER_VEC - 1), %eax
+
+       movq    %rsi, %rdx
+       bsf     %VRCX, %VRDX
+       cmpq    %rax, %rdx
+       ja      L(cross_page_continue)
+
+       /* The vector had a null terminator or we are at the limit.  */
+       movl    %edx, %eax
+       cmpq    %rdx, %rsi
+       cmovb   %esi, %eax
+       ret
+
+END(STRNLEN)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strnlen-evex.S b/sysdeps/x86_64/multiarch/strnlen-evex.S

index 91b16830ebf057feb4d199d463f76455733cd306..c41288906cdd2bc4ca851325459fde3c61e96ea8 100644 (file)
--- a/sysdeps/x86_64/multiarch/strnlen-evex.S
+++ b/sysdeps/x86_64/multiarch/strnlen-evex.S
@@ -1,423 +1,7 @@
-/* strnlen/wcsnlen optimized with 256-bit EVEX instructions.
-   Copyright (C) 2022-2024 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#include <isa-level.h>
-#include <sysdep.h>
-
-#if ISA_SHOULD_BUILD (4)
-
-# ifndef VEC_SIZE
-#  include "x86-evex256-vecs.h"
-# endif
-
-
-# ifndef STRNLEN
-#  define STRNLEN      __strnlen_evex
-# endif
-
-# ifdef USE_AS_WCSLEN
-#  define VPCMPEQ      vpcmpeqd
-#  define VPCMPNEQ     vpcmpneqd
-#  define VPTESTN      vptestnmd
-#  define VPTEST       vptestmd
-#  define VPMINU       vpminud
-#  define CHAR_SIZE    4
-
-# else
-#  define VPCMPEQ      vpcmpeqb
-#  define VPCMPNEQ     vpcmpneqb
-#  define VPTESTN      vptestnmb
-#  define VPTEST       vptestmb
-#  define VPMINU       vpminub
-#  define CHAR_SIZE    1
-
-#  define REG_WIDTH    VEC_SIZE
-# endif
-
-# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
-
-# include "reg-macros.h"
-
-# if CHAR_PER_VEC == 32
-#  define SUB_SHORT(imm, reg)  subb $(imm), %VGPR_SZ(reg, 8)
-# else
-#  define SUB_SHORT(imm, reg)  subl $(imm), %VGPR_SZ(reg, 32)
-# endif
-
-
-
-# if CHAR_PER_VEC == 64
-#  define FALLTHROUGH_RETURN_OFFSET    (CHAR_PER_VEC * 3)
-# else
-#  define FALLTHROUGH_RETURN_OFFSET    (CHAR_PER_VEC * 2)
-# endif
-
-
-# define XZERO VMM_128(0)
-# define VZERO VMM(0)
-# define PAGE_SIZE     4096
-
-       .section SECTION(.text), "ax", @progbits
-ENTRY_P2ALIGN (STRNLEN, 6)
-       /* Check zero length.  */
-       test    %RSI_LP, %RSI_LP
-       jz      L(zero)
-# ifdef __ILP32__
-       /* Clear the upper 32 bits.  */
-       movl    %esi, %esi
-# endif
-
-       movl    %edi, %eax
-       vpxorq  %XZERO, %XZERO, %XZERO
-       andl    $(PAGE_SIZE - 1), %eax
-       cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
-       ja      L(cross_page_boundary)
-
-       /* Check the first VEC_SIZE bytes.  Each bit in K0 represents a
-          null byte.  */
-       VPCMPEQ (%rdi), %VZERO, %k0
-
-       KMOV    %k0, %VRCX
-       movq    %rsi, %rax
-
-       /* If src (rcx) is zero, bsf does not change the result.  NB:
-          Must use 64-bit bsf here so that upper bits of len are not
-          cleared.  */
-       bsfq    %rcx, %rax
-       /* If rax > CHAR_PER_VEC then rcx must have been zero (no null
-          CHAR) and rsi must be > CHAR_PER_VEC.  */
-       cmpq    $CHAR_PER_VEC, %rax
-       ja      L(more_1x_vec)
-       /* Check if first match in bounds.  */
-       cmpq    %rax, %rsi
-       cmovb   %esi, %eax
-       ret
-
-
-# if CHAR_PER_VEC != 32
-       .p2align 4,, 2
-L(zero):
-L(max_0):
-       movl    %esi, %eax
-       ret
-# endif
-
-       /* Aligned more for strnlen compares remaining length vs 2 *
-          CHAR_PER_VEC, 4 * CHAR_PER_VEC, and 8 * CHAR_PER_VEC before
-          going to the loop.  */
-       .p2align 4,, 10
-L(more_1x_vec):
-L(cross_page_continue):
-       /* Compute number of words checked after aligning.  */
-# ifdef USE_AS_WCSLEN
-       /* Need to compute directly for wcslen as CHAR_SIZE * rsi can
-          overflow.  */
-       movq    %rdi, %rax
-       andq    $(VEC_SIZE * -1), %rdi
-       subq    %rdi, %rax
-       sarq    $2, %rax
-       leaq    -(CHAR_PER_VEC * 1)(%rax, %rsi), %rax
-# else
-       leaq    (VEC_SIZE * -1)(%rsi, %rdi), %rax
-       andq    $(VEC_SIZE * -1), %rdi
-       subq    %rdi, %rax
-# endif
-
-
-       VPCMPEQ VEC_SIZE(%rdi), %VZERO, %k0
-
-       cmpq    $(CHAR_PER_VEC * 2), %rax
-       ja      L(more_2x_vec)
-
-L(last_2x_vec_or_less):
-       KMOV    %k0, %VRDX
-       test    %VRDX, %VRDX
-       jnz     L(last_vec_check)
-
-       /* Check the end of data.  */
-       SUB_SHORT (CHAR_PER_VEC, rax)
-       jbe     L(max_0)
-       VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0
-       KMOV    %k0, %VRDX
-       test    %VRDX, %VRDX
-       jz      L(max_0)
-       /* Best place for LAST_VEC_CHECK if ZMM.  */
-       .p2align 4,, 8
-L(last_vec_check):
-       bsf     %VRDX, %VRDX
-       sub     %eax, %edx
-       lea     (%rsi, %rdx), %eax
-       cmovae  %esi, %eax
-       ret
-
-# if CHAR_PER_VEC == 32
-       .p2align 4,, 2
-L(zero):
-L(max_0):
-       movl    %esi, %eax
-       ret
-# endif
-
-       .p2align 4,, 8
-L(last_4x_vec_or_less):
-       addl    $(CHAR_PER_VEC * -4), %eax
-       VPCMPEQ (VEC_SIZE * 5)(%rdi), %VZERO, %k0
-       subq    $(VEC_SIZE * -4), %rdi
-       cmpl    $(CHAR_PER_VEC * 2), %eax
-       jbe     L(last_2x_vec_or_less)
-
-       .p2align 4,, 6
-L(more_2x_vec):
-       /* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without
-          rechecking bounds.  */
-
-       KMOV    %k0, %VRDX
-
-       test    %VRDX, %VRDX
-       jnz     L(first_vec_x1)
-
-       VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0
-       KMOV    %k0, %VRDX
-       test    %VRDX, %VRDX
-       jnz     L(first_vec_x2)
-
-       cmpq    $(CHAR_PER_VEC * 4), %rax
-       ja      L(more_4x_vec)
-
-
-       VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0
-       KMOV    %k0, %VRDX
-       addl    $(CHAR_PER_VEC * -2), %eax
-       test    %VRDX, %VRDX
-       jnz     L(last_vec_check)
-
-       subl    $(CHAR_PER_VEC), %eax
-       jbe     L(max_1)
-
-       VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0
-       KMOV    %k0, %VRDX
-
-       test    %VRDX, %VRDX
-       jnz     L(last_vec_check)
-L(max_1):
-       movl    %esi, %eax
-       ret
-
-       .p2align 4,, 3
-L(first_vec_x2):
-# if VEC_SIZE == 64
-       /* If VEC_SIZE == 64 we can fit logic for full return label in
-          spare bytes before next cache line.  */
-       bsf     %VRDX, %VRDX
-       sub     %eax, %esi
-       leal    (CHAR_PER_VEC * 1)(%rsi, %rdx), %eax
-       ret
-       .p2align 4,, 6
-# else
-       addl    $CHAR_PER_VEC, %esi
-# endif
-L(first_vec_x1):
-       bsf     %VRDX, %VRDX
-       sub     %eax, %esi
-       leal    (CHAR_PER_VEC * 0)(%rsi, %rdx), %eax
-       ret
-
-
-       .p2align 4,, 6
-L(first_vec_x4):
-# if VEC_SIZE == 64
-       /* If VEC_SIZE == 64 we can fit logic for full return label in
-          spare bytes before next cache line.  */
-       bsf     %VRDX, %VRDX
-       sub     %eax, %esi
-       leal    (CHAR_PER_VEC * 3)(%rsi, %rdx), %eax
-       ret
-       .p2align 4,, 6
-# else
-       addl    $CHAR_PER_VEC, %esi
-# endif
-L(first_vec_x3):
-       bsf     %VRDX, %VRDX
-       sub     %eax, %esi
-       leal    (CHAR_PER_VEC * 2)(%rsi, %rdx), %eax
-       ret
-
-       .p2align 4,, 5
-L(more_4x_vec):
-       VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0
-       KMOV    %k0, %VRDX
-       test    %VRDX, %VRDX
-       jnz     L(first_vec_x3)
-
-       VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0
-       KMOV    %k0, %VRDX
-       test    %VRDX, %VRDX
-       jnz     L(first_vec_x4)
-
-       /* Check if at last VEC_SIZE * 4 length before aligning for the
-          loop.  */
-       cmpq    $(CHAR_PER_VEC * 8), %rax
-       jbe     L(last_4x_vec_or_less)
-
-
-       /* Compute number of words checked after aligning.  */
-# ifdef USE_AS_WCSLEN
-       /* Need to compute directly for wcslen as CHAR_SIZE * rsi can
-          overflow.  */
-       leaq    (VEC_SIZE * -3)(%rdi), %rdx
-# else
-       leaq    (VEC_SIZE * -3)(%rdi, %rax), %rax
-# endif
-
-       subq    $(VEC_SIZE * -1), %rdi
-
-       /* Align data to VEC_SIZE * 4.  */
-# if VEC_SIZE == 64
-       /* Saves code size.  No evex512 processor has partial register
-          stalls.  If that change this can be replaced with `andq
-          $-(VEC_SIZE * 4), %rdi`.  */
-       xorb    %dil, %dil
-# else
-       andq    $-(VEC_SIZE * 4), %rdi
-# endif
-
-# ifdef USE_AS_WCSLEN
-       subq    %rdi, %rdx
-       sarq    $2, %rdx
-       addq    %rdx, %rax
-# else
-       subq    %rdi, %rax
-# endif
-       /* Compare 4 * VEC at a time forward.  */
-       .p2align 4,, 11
-L(loop_4x_vec):
-       VMOVA   (VEC_SIZE * 4)(%rdi), %VMM(1)
-       VPMINU  (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2)
-       VMOVA   (VEC_SIZE * 6)(%rdi), %VMM(3)
-       VPMINU  (VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4)
-       VPTESTN %VMM(2), %VMM(2), %k0
-       VPTESTN %VMM(4), %VMM(4), %k2
-       subq    $-(VEC_SIZE * 4), %rdi
-       /* Break if at end of length.  */
-       subq    $(CHAR_PER_VEC * 4), %rax
-       jbe     L(loop_len_end)
-
-
-       KORTEST %k0, %k2
-       jz      L(loop_4x_vec)
-
-
-L(loop_last_4x_vec):
-       movq    %rsi, %rcx
-       subq    %rax, %rsi
-       VPTESTN %VMM(1), %VMM(1), %k1
-       KMOV    %k1, %VRDX
-       test    %VRDX, %VRDX
-       jnz     L(last_vec_x0)
-
-       KMOV    %k0, %VRDX
-       test    %VRDX, %VRDX
-       jnz     L(last_vec_x1)
-
-       VPTESTN %VMM(3), %VMM(3), %k0
-
-       /* Separate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for
-          returning last 2x VEC. For VEC_SIZE == 64 we test each VEC
-          individually, for VEC_SIZE == 32 we combine them in a single
-          64-bit GPR.  */
-# if CHAR_PER_VEC == 64
-       KMOV    %k0, %VRDX
-       test    %VRDX, %VRDX
-       jnz     L(last_vec_x2)
-       KMOV    %k2, %VRDX
-# else
-       /* We can only combine last 2x VEC masks if CHAR_PER_VEC <= 32.
-        */
-       kmovd   %k2, %edx
-       kmovd   %k0, %eax
-       salq    $CHAR_PER_VEC, %rdx
-       orq     %rax, %rdx
-# endif
-
-       /* first_vec_x3 for strlen-ZMM and first_vec_x2 for strlen-YMM.
-        */
-       bsfq    %rdx, %rdx
-       leaq    (FALLTHROUGH_RETURN_OFFSET - CHAR_PER_VEC * 4)(%rsi, %rdx), %rax
-       cmpq    %rax, %rcx
-       cmovb   %rcx, %rax
-       ret
-
-       /* Handle last 4x VEC after loop. All VECs have been loaded.  */
-       .p2align 4,, 4
-L(loop_len_end):
-       KORTEST %k0, %k2
-       jnz     L(loop_last_4x_vec)
-       movq    %rsi, %rax
-       ret
-
-
-# if CHAR_PER_VEC == 64
-       /* Since we can't combine the last 2x VEC for VEC_SIZE == 64
-          need return label for it.  */
-       .p2align 4,, 8
-L(last_vec_x2):
-       bsf     %VRDX, %VRDX
-       leaq    (CHAR_PER_VEC * -2)(%rsi, %rdx), %rax
-       cmpq    %rax, %rcx
-       cmovb   %rcx, %rax
-       ret
-# endif
-
-
-       .p2align 4,, 10
-L(last_vec_x1):
-       addq    $CHAR_PER_VEC, %rsi
-L(last_vec_x0):
-       bsf     %VRDX, %VRDX
-       leaq    (CHAR_PER_VEC * -4)(%rsi, %rdx), %rax
-       cmpq    %rax, %rcx
-       cmovb   %rcx, %rax
-       ret
-
-
-       .p2align 4,, 8
-L(cross_page_boundary):
-       /* Align data to VEC_SIZE.  */
-       movq    %rdi, %rcx
-       andq    $-VEC_SIZE, %rcx
-       VPCMPEQ (%rcx), %VZERO, %k0
-
-       KMOV    %k0, %VRCX
-# ifdef USE_AS_WCSLEN
-       shrl    $2, %eax
-       andl    $(CHAR_PER_VEC - 1), %eax
-# endif
-       shrx    %VRAX, %VRCX, %VRCX
-
-       negl    %eax
-       andl    $(CHAR_PER_VEC - 1), %eax
-       movq    %rsi, %rdx
-       bsf     %VRCX, %VRDX
-       cmpq    %rax, %rdx
-       ja      L(cross_page_continue)
-       movl    %edx, %eax
-       cmpq    %rdx, %rsi
-       cmovb   %esi, %eax
-       ret
-END (STRNLEN)
+#ifndef STRNLEN
+#define STRNLEN __strnlen_evex
  #endif
+
+#include "x86-evex256-vecs.h"
+#include "reg-macros.h"
+#include "strnlen-evex-base.S"
diff --git a/sysdeps/x86_64/multiarch/strnlen-evex512.S b/sysdeps/x86_64/multiarch/strnlen-evex512.S

index f8e55883bbfcf0e346da8e921aa95d39f25818b5..8ef54078f85e6d7c418474483a91b28d3296ff44 100644 (file)
--- a/sysdeps/x86_64/multiarch/strnlen-evex512.S
+++ b/sysdeps/x86_64/multiarch/strnlen-evex512.S
@@ -1,264 +1,7 @@
-/* Placeholder function, not used by any processor at the moment.
-   Copyright (C) 2022-2024 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
  #ifndef STRNLEN
  #define STRNLEN __strnlen_evex512
  #endif
  
  #include "x86-evex512-vecs.h"
  #include "reg-macros.h"
-
-#include <isa-level.h>
-
-#if ISA_SHOULD_BUILD (4)
-
-# include <sysdep.h>
-
-# ifdef USE_AS_WCSLEN
-#  define VPCMPEQ      vpcmpeqd
-#  define VPTESTN      vptestnmd
-#  define VPMINU       vpminud
-#  define CHAR_SIZE    4
-# else
-#  define VPCMPEQ      vpcmpeqb
-#  define VPTESTN      vptestnmb
-#  define VPMINU       vpminub
-#  define CHAR_SIZE    1
-# endif
-
-# define PAGE_SIZE     4096
-# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
-
-       .section SECTION(.text),"ax",@progbits
-/* Aligning entry point to 64 byte, provides better performance for
-   one vector length string.  */
-ENTRY_P2ALIGN (STRNLEN, 6)
-       /* Check zero length.  */
-       test    %RSI_LP, %RSI_LP
-       jz      L(ret_max)
-#  ifdef __ILP32__
-       /* Clear the upper 32 bits.  */
-       movl    %esi, %esi
-#  endif
-
-       movl    %edi, %eax
-       vpxorq  %VMM_128(0), %VMM_128(0), %VMM_128(0)
-       sall    $20, %eax
-       cmpl    $((PAGE_SIZE - VEC_SIZE) << 20), %eax
-       ja      L(page_cross)
-
-       /* Compare [w]char for null, mask bit will be set for match.  */
-       VPCMPEQ (%rdi), %VMM(0), %k0
-       KMOV    %k0, %VRCX
-       /* Store max length in rax.  */
-       mov     %rsi, %rax
-       /* If rcx is 0, rax will have max length.  We can not use VRCX
-          and VRAX here for evex256 because, upper 32 bits may be
-          undefined for ecx and eax.  */
-       bsfq    %rcx, %rax
-       cmp     $CHAR_PER_VEC, %rax
-       ja      L(align_more)
-       cmpq    %rax, %rsi
-       cmovb   %esi, %eax
-       ret
-
-       /* At this point vector max length reached.  */
-       .p2align 4,,3
-L(ret_max):
-       movq    %rsi, %rax
-       ret
-
-L(align_more):
-       mov     %rdi, %rax
-       /* Align rax to VEC_SIZE.  */
-       andq    $-VEC_SIZE, %rax
-       movq    %rdi, %rdx
-       subq    %rax, %rdx
-#  ifdef USE_AS_WCSLEN
-       shr     $2, %VRDX
-#  endif
-       /* At this point rdx contains [w]chars already compared.  */
-       leaq    -CHAR_PER_VEC(%rsi, %rdx), %rdx
-       /* At this point rdx contains number of w[char] needs to go.
-          Now onwards rdx will keep decrementing with each compare.  */
-
-       /* Loop unroll 4 times for 4 vector loop.  */
-       VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0
-       subq    $-VEC_SIZE, %rax
-       KMOV    %k0, %VRCX
-       test    %VRCX, %VRCX
-       jnz     L(ret_vec_x1)
-
-       subq    $CHAR_PER_VEC, %rdx
-       jbe     L(ret_max)
-
-       VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0
-       KMOV    %k0, %VRCX
-       test    %VRCX, %VRCX
-       jnz     L(ret_vec_x2)
-
-       subq    $CHAR_PER_VEC, %rdx
-       jbe     L(ret_max)
-
-       VPCMPEQ (VEC_SIZE * 2)(%rax), %VMM(0), %k0
-       KMOV    %k0, %VRCX
-       test    %VRCX, %VRCX
-       jnz     L(ret_vec_x3)
-
-       subq    $CHAR_PER_VEC, %rdx
-       jbe     L(ret_max)
-
-       VPCMPEQ (VEC_SIZE * 3)(%rax), %VMM(0), %k0
-       KMOV    %k0, %VRCX
-       test    %VRCX, %VRCX
-       jnz     L(ret_vec_x4)
-
-       subq    $CHAR_PER_VEC, %rdx
-       jbe     L(ret_max)
-       /* Save pointer before 4 x VEC_SIZE alignment.  */
-       movq    %rax, %rcx
-
-       /* Align address to VEC_SIZE * 4 for loop.  */
-       andq    $-(VEC_SIZE * 4), %rax
-
-       subq    %rax, %rcx
-#  ifdef USE_AS_WCSLEN
-       shr     $2, %VRCX
-#  endif
-       /* rcx contains number of [w]char will be recompared due to
-          alignment fixes.  rdx must be incremented by rcx to offset
-          alignment adjustment.  */
-       addq    %rcx, %rdx
-       /* Need jump as we don't want to add/subtract rdx for first
-          iteration of 4 x VEC_SIZE aligned loop.  */
-
-       .p2align 4,,11
-L(loop):
-       /* VPMINU and VPCMP combination provide better performance as
-          compared to alternative combinations.  */
-       VMOVA   (VEC_SIZE * 4)(%rax), %VMM(1)
-       VPMINU  (VEC_SIZE * 5)(%rax), %VMM(1), %VMM(2)
-       VMOVA   (VEC_SIZE * 6)(%rax), %VMM(3)
-       VPMINU  (VEC_SIZE * 7)(%rax), %VMM(3), %VMM(4)
-
-       VPTESTN %VMM(2), %VMM(2), %k0
-       VPTESTN %VMM(4), %VMM(4), %k1
-
-       subq    $-(VEC_SIZE * 4), %rax
-       KORTEST %k0, %k1
-
-       jnz     L(loopend)
-       subq    $(CHAR_PER_VEC * 4), %rdx
-       ja      L(loop)
-       mov     %rsi, %rax
-       ret
-
-L(loopend):
-
-       VPTESTN %VMM(1), %VMM(1), %k2
-       KMOV    %k2, %VRCX
-       test    %VRCX, %VRCX
-       jnz     L(ret_vec_x1)
-
-       KMOV    %k0, %VRCX
-       /* At this point, if k0 is non zero, null char must be in the
-          second vector.  */
-       test    %VRCX, %VRCX
-       jnz     L(ret_vec_x2)
-
-       VPTESTN %VMM(3), %VMM(3), %k3
-       KMOV    %k3, %VRCX
-       test    %VRCX, %VRCX
-       jnz     L(ret_vec_x3)
-       /* At this point null [w]char must be in the fourth vector so no
-          need to check.  */
-       KMOV    %k1, %VRCX
-
-       /* Fourth, third, second vector terminating are pretty much
-          same, implemented this way to avoid branching and reuse code
-          from pre loop exit condition.  */
-L(ret_vec_x4):
-       bsf     %VRCX, %VRCX
-       subq    %rdi, %rax
-# ifdef USE_AS_WCSLEN
-       subq    $-(VEC_SIZE * 3), %rax
-       shrq    $2, %rax
-       addq    %rcx, %rax
-# else
-       leaq    (VEC_SIZE * 3)(%rcx, %rax), %rax
-# endif
-
-       cmpq    %rsi, %rax
-       cmovnb  %rsi, %rax
-       ret
-
-L(ret_vec_x3):
-       bsf     %VRCX, %VRCX
-       subq    %rdi, %rax
-# ifdef USE_AS_WCSLEN
-       subq    $-(VEC_SIZE * 2), %rax
-       shrq    $2, %rax
-       addq    %rcx, %rax
-# else
-       leaq    (VEC_SIZE * 2)(%rcx, %rax), %rax
-# endif
-       cmpq    %rsi, %rax
-       cmovnb  %rsi, %rax
-       ret
-
-L(ret_vec_x2):
-       subq    $-VEC_SIZE, %rax
-L(ret_vec_x1):
-       bsf     %VRCX, %VRCX
-       subq    %rdi, %rax
-# ifdef USE_AS_WCSLEN
-       shrq    $2, %rax
-# endif
-       addq    %rcx, %rax
-       cmpq    %rsi, %rax
-       cmovnb  %rsi, %rax
-       ret
-
-L(page_cross):
-       mov     %rdi, %rax
-       movl    %edi, %ecx
-       andl    $(VEC_SIZE - 1), %ecx
-# ifdef USE_AS_WCSLEN
-       sarl    $2, %ecx
-# endif
-       /* ecx contains number of w[char] to be skipped as a result
-          of address alignment.  */
-       andq    $-VEC_SIZE, %rax
-       VPCMPEQ (%rax), %VMM(0), %k0
-       KMOV    %k0, %VRDX
-       /* Ignore number of character for alignment adjustment.  */
-       shr     %cl, %VRDX
-       jnz     L(page_cross_end)
-       movl    $CHAR_PER_VEC, %eax
-       sub     %ecx, %eax
-       cmp     %rax, %rsi
-       ja      L(align_more)
-
-L(page_cross_end):
-       bsf     %VRDX, %VRAX
-       cmpq    %rsi, %rax
-       cmovnb  %esi, %eax
-       ret
-
-END (STRNLEN)
-#endif
+#include "strnlen-evex-base.S"
+\ No newline at end of file
author	Matthew Sterrett <matthew.sterrett@intel.com>
	Fri, 9 Aug 2024 22:05:09 +0000 (15:05 -0700)
committer	H.J. Lu <hjl.tools@gmail.com>
	Mon, 19 Aug 2024 14:34:02 +0000 (07:34 -0700)
sysdeps/x86_64/multiarch/strnlen-evex-base.S	[new file with mode: 0644]	patch \| blob
sysdeps/x86_64/multiarch/strnlen-evex.S		patch \| blob \| blame \| history
sysdeps/x86_64/multiarch/strnlen-evex512.S		patch \| blob \| blame \| history