--- /dev/null
+/* strnlen/wcsnlen optimized with 256/512-bit EVEX instructions.
+ Copyright (C) 2022-2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+
+#include <isa-level.h>
+
+#if ISA_SHOULD_BUILD (4)
+
+# include <sysdep.h>
+
+#ifdef USE_AS_WCSLEN
+# define VPCMPEQ vpcmpeqd
+# define VPTESTN vptestnmd
+# define VPMINU vpminud
+# define CHAR_SIZE 4
+#else
+# define VPCMPEQ vpcmpeqb
+# define VPTESTN vptestnmb
+# define VPMINU vpminub
+# define CHAR_SIZE 1
+#endif
+
+#define XZERO VMM_128(0)
+#define VZERO VMM(0)
+#define PAGE_SIZE 4096
+#define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
+
+#if CHAR_PER_VEC == 32
+# define SUB_SHORT(imm, reg) subb $(imm), %VGPR_SZ(reg, 8)
+#else
+# define SUB_SHORT(imm, reg) subl $(imm), %VGPR_SZ(reg, 32)
+#endif
+
+#ifdef USE_AS_WCSLEN
+/* For wide-character, we care more about limiting code size
+   than about optimally aligning targets, so just cap the nop
+   padding reasonably low.  */
+# define P2ALIGN(...) .p2align 4,, 6
+# define P2ALIGN_CLAMPED(...) P2ALIGN(__VA_ARGS__)
+#else
+# define P2ALIGN(x) .p2align x
+# define P2ALIGN_CLAMPED(x, y) .p2align x,, y
+#endif
+
+ .section SECTION(.text), "ax", @progbits
+	/* Aligning the entry point to 64 bytes provides better
+	   performance for strings that fit in one vector.  */
+ENTRY_P2ALIGN(STRNLEN, 6)
+	/* rdi is the pointer to the string, rsi is the maximum length in
+	   CHARs.  */
+
+ /* Check zero length. */
+ test %RSI_LP, %RSI_LP
+ jz L(zero)
+
+#ifdef __ILP32__
+ /* Clear the upper 32 bits. */
+ movl %esi, %esi
+#endif
+
+ vpxorq %XZERO, %XZERO, %XZERO
+
+ /* Check that we won't cross a page boundary with our first load. */
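+	/* Shifting left by 20 keeps only the low 12 bits of the address
+	   (the page offset) in the upper bits of eax, so the unsigned
+	   compare below is true exactly when a VEC_SIZE load from rdi
+	   would cross into the next page.  */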
+ movl %edi, %eax
+ shll $20, %eax
+ cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax
+ ja L(crosses_page_boundary)
+
+	/* Check the first VEC_SIZE bytes.  Each bit in K0 represents a
+	   null CHAR.  */
+ VPCMPEQ (%rdi), %VZERO, %k0
+ KMOV %k0, %VRCX
+
+	/* If src (rcx) is zero, bsf leaves the destination unchanged, so
+	   rax keeps the limit.  NB: Must use 64-bit bsf here so that the
+	   upper bits of len are not cleared.  */
+ movq %rsi, %rax
+ bsfq %rcx, %rax
+
+ /* If rax > CHAR_PER_VEC then rcx must have been zero (no null
+ CHAR) and rsi must be > CHAR_PER_VEC. */
+ cmpq $CHAR_PER_VEC, %rax
+ ja L(more_1x_vec)
+
+ /* Check if first match in bounds. */
+ cmpq %rax, %rsi
+ cmovb %esi, %eax
+ ret
+
+#if VEC_SIZE == 32
+ P2ALIGN_CLAMPED(4, 2)
+L(zero):
+L(max_0):
+ movl %esi, %eax
+ ret
+#endif
+
+ P2ALIGN_CLAMPED(4, 10)
+L(more_1x_vec):
+L(cross_page_continue):
+	/* After this calculation, rax stores the number of elements
+	   left to be processed.  The complexity comes from the fact
+	   that some elements get read twice due to alignment, and we
+	   need to be sure we don't count them twice (otherwise it
+	   would simply be rsi - CHAR_PER_VEC).  */
+
+#ifdef USE_AS_WCSLEN
+ /* Need to compute directly for wcslen as CHAR_SIZE * rsi can
+ overflow. */
+ movq %rdi, %rax
+ andq $(VEC_SIZE * -1), %rdi
+ subq %rdi, %rax
+ sarq $2, %rax
+ leaq -(CHAR_PER_VEC * 1)(%rax, %rsi), %rax
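+	/* rax = misalignment-in-CHARs + rsi - CHAR_PER_VEC, i.e. the
+	   count remaining when restarting at the aligned rdi + VEC_SIZE.  */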
+#else
+	/* Calculate ptr + N - VEC_SIZE, align ptr down to VEC_SIZE,
+	   then subtract the aligned ptr to get the adjusted remaining
+	   length.  */
+ leaq (VEC_SIZE * -1)(%rsi, %rdi), %rax
+ andq $(VEC_SIZE * -1), %rdi
+ subq %rdi, %rax
+#endif
+
+ VPCMPEQ VEC_SIZE(%rdi), %VZERO, %k0
+
+	/* Checking here is faster for 256-bit but not 512-bit.  */
+#if VEC_SIZE == 32
+	KMOV	%k0, %VRDX
+	test	%VRDX, %VRDX
+	jnz	L(last_vec_check)
+#endif
+
+ cmpq $(CHAR_PER_VEC * 2), %rax
+ ja L(more_2x_vec)
+
+L(last_2x_vec_or_less):
+
+	/* Checking here is faster for 512-bit but not 256-bit.  */
+#if VEC_SIZE != 32
+	KMOV	%k0, %VRDX
+	test	%VRDX, %VRDX
+	jnz	L(last_vec_check)
+#endif
+
+ /* Check for the end of data. */
+ SUB_SHORT (CHAR_PER_VEC, rax)
+ jbe L(max_0)
+
+ /* Check the final remaining vector. */
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0
+ KMOV %k0, %VRDX
+ test %VRDX, %VRDX
+#if VEC_SIZE == 32
+ jz L(max_0)
+#else
+ jnz L(last_vec_check)
+ P2ALIGN_CLAMPED(4, 2)
+L(zero):
+L(max_0):
+ movl %esi, %eax
+ ret
+
+#endif
+ P2ALIGN_CLAMPED(4, 4)
+L(last_vec_check):
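+	/* edx = index of the null within the vector whose mask is in
+	   VRDX; eax = CHARs remaining starting at that vector.  Return
+	   rsi - eax + edx, or rsi itself when the null is at or past
+	   the limit (cmovae fires when the sub does not borrow).  */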
+ bsf %VRDX, %VRDX
+ sub %eax, %edx
+ lea (%rsi, %rdx), %eax
+ cmovae %esi, %eax
+ ret
+
+
+#if VEC_SIZE == 32
+ P2ALIGN_CLAMPED(4, 8)
+#endif
+L(last_4x_vec_or_less):
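+	/* 4 * CHAR_PER_VEC < remaining (rax) <= 8 * CHAR_PER_VEC here.
+	   Step past the four vectors already checked and handle the
+	   tail with the 1x/2x logic above.  */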
+ addl $(CHAR_PER_VEC * -4), %eax
+ VPCMPEQ (VEC_SIZE * 5)(%rdi), %VZERO, %k0
+
+	/* Check the vector just compared.  The k0 tests in
+	   L(last_2x_vec_or_less) and L(more_2x_vec) are only done for
+	   512-bit, so this path must test it for both vector sizes.  */
+	KMOV	%k0, %VRDX
+	test	%VRDX, %VRDX
+	jnz	L(last_vec_check)
+
+ subq $(VEC_SIZE * -4), %rdi
+ cmpl $(CHAR_PER_VEC * 2), %eax
+ jbe L(last_2x_vec_or_less)
+
+ P2ALIGN_CLAMPED(4, 6)
+L(more_2x_vec):
+ /* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without
+ rechecking bounds. */
+
+	/* k0 was already checked in the 256-bit case.  */
+#if VEC_SIZE != 32
+	KMOV	%k0, %VRDX
+
+	test	%VRDX, %VRDX
+	jnz	L(first_vec_x1)
+#endif
+
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0
+ KMOV %k0, %VRDX
+
+ test %VRDX, %VRDX
+ jnz L(first_vec_x2)
+
+ cmpq $(CHAR_PER_VEC * 4), %rax
+ ja L(more_4x_vec)
+
+
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0
+ KMOV %k0, %VRDX
+ addl $(CHAR_PER_VEC * -2), %eax
+ test %VRDX, %VRDX
+ jnz L(last_vec_check)
+
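+	/* eax is at most 2 * CHAR_PER_VEC (<= 128) here, so an 8-bit
+	   subtract is safe and the upper bits of eax stay zero.  */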
+ subb $(CHAR_PER_VEC), %al
+ jbe L(max_1)
+
+ VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0
+ KMOV %k0, %VRDX
+
+ test %VRDX, %VRDX
+ jnz L(last_vec_check)
+L(max_1):
+ movl %esi, %eax
+ ret
+
+
+ P2ALIGN_CLAMPED(4, 14)
+L(first_vec_x2):
+#if VEC_SIZE == 64
+ /* If VEC_SIZE == 64 we can fit logic for full return label in
+ spare bytes before next cache line. */
+ bsf %VRDX, %VRDX
+ sub %eax, %esi
+ leal (CHAR_PER_VEC * 1)(%rsi, %rdx), %eax
+ ret
+ P2ALIGN_CLAMPED(4, 6)
+#else
+ addl $CHAR_PER_VEC, %esi
+#endif
+L(first_vec_x1):
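+	/* Return (esi - eax) + edx: the CHARs before the vector that
+	   produced the match plus the offset of the null within it.  No
+	   bound check is needed since the remaining length exceeds it.  */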
+ bsf %VRDX, %VRDX
+ sub %eax, %esi
+ leal (CHAR_PER_VEC * 0)(%rsi, %rdx), %eax
+ ret
+
+#if VEC_SIZE == 64
+ P2ALIGN_CLAMPED(4, 6)
+L(first_vec_x4):
+# if VEC_SIZE == 64
+ /* If VEC_SIZE == 64 we can fit logic for full return label in
+ spare bytes before next cache line. */
+ bsf %VRDX, %VRDX
+ sub %eax, %esi
+ leal (CHAR_PER_VEC * 3)(%rsi, %rdx), %eax
+ ret
+ P2ALIGN_CLAMPED(4, 6)
+# else
+ addl $CHAR_PER_VEC, %esi
+# endif
+L(first_vec_x3):
+ bsf %VRDX, %VRDX
+ sub %eax, %esi
+ leal (CHAR_PER_VEC * 2)(%rsi, %rdx), %eax
+ ret
+#endif
+
+ P2ALIGN_CLAMPED(6, 20)
+L(more_4x_vec):
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0
+ KMOV %k0, %VRDX
+ test %VRDX, %VRDX
+ jnz L(first_vec_x3)
+
+ VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0
+ KMOV %k0, %VRDX
+ test %VRDX, %VRDX
+ jnz L(first_vec_x4)
+
+	/* Check if at most 4x VEC remain unchecked before aligning for
+	   the loop.  */
+ cmpq $(CHAR_PER_VEC * 8), %rax
+ jbe L(last_4x_vec_or_less)
+
+
+	/* Recompute the remaining length relative to the pointer that
+	   will be aligned to VEC_SIZE * 4 for the loop.  */
+#ifdef USE_AS_WCSLEN
+ /* Need to compute directly for wcslen as CHAR_SIZE * rsi can
+ overflow. */
+ leaq (VEC_SIZE * -3)(%rdi), %rdx
+#else
+ leaq (VEC_SIZE * -3)(%rdi, %rax), %rax
+#endif
+
+ subq $(VEC_SIZE * -1), %rdi
+
+ /* Align data to VEC_SIZE * 4. */
+#if VEC_SIZE == 64
+	/* Saves code size.  No evex512 processor has partial register
+	   stalls.  If that changes, this can be replaced with `andq
+	   $-(VEC_SIZE * 4), %rdi`.  */
+ xorb %dil, %dil
+#else
+ andq $-(VEC_SIZE * 4), %rdi
+#endif
+
+#ifdef USE_AS_WCSLEN
+ subq %rdi, %rdx
+ sarq $2, %rdx
+ addq %rdx, %rax
+#else
+ subq %rdi, %rax
+#endif
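+	/* rax is now the number of CHARs from (VEC_SIZE * 4)(%rdi), the
+	   first vector the loop loads, to the end of the limit.  */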
+
+ P2ALIGN(6)
+L(loop):
+	/* The VPMINU and VPTESTN combination provides better
+	   performance than alternative instruction sequences.  */
+ VMOVA (VEC_SIZE * 4)(%rdi), %VMM(1)
+ VPMINU (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2)
+ VMOVA (VEC_SIZE * 6)(%rdi), %VMM(3)
+ VPMINU (VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4)
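+	/* VMM(2) = min(VEC[4], VEC[5]) and VMM(4) = min(VEC[6], VEC[7]),
+	   so one VPTESTN per pair detects a null CHAR in either vector.  */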
+
+ VPTESTN %VMM(2), %VMM(2), %k0
+ VPTESTN %VMM(4), %VMM(4), %k1
+
+ subq $-(VEC_SIZE * 4), %rdi
+ KORTEST %k0, %k1
+
+ jnz L(loopend)
+ subq $(CHAR_PER_VEC * 4), %rax
+ ja L(loop)
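+	/* The limit was reached without finding a null CHAR.  */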
+ mov %rsi, %rax
+ ret
+
+
+#if VEC_SIZE == 32
+ P2ALIGN_CLAMPED(4, 6)
+L(first_vec_x4):
+# if VEC_SIZE == 64
+ /* If VEC_SIZE == 64 we can fit logic for full return label in
+ spare bytes before next cache line. */
+ bsf %VRDX, %VRDX
+ sub %eax, %esi
+ leal (CHAR_PER_VEC * 3)(%rsi, %rdx), %eax
+ ret
+ P2ALIGN_CLAMPED(4, 6)
+# else
+ addl $CHAR_PER_VEC, %esi
+# endif
+L(first_vec_x3):
+ bsf %VRDX, %VRDX
+ sub %eax, %esi
+ leal (CHAR_PER_VEC * 2)(%rsi, %rdx), %eax
+ ret
+#endif
+
+
+ P2ALIGN_CLAMPED(4, 11)
+L(loopend):
+ /* We found a null terminator in one of the 4 vectors. */
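+	/* %rdi points at the first of the four vectors just examined
+	   and rax holds the number of CHARs from there to the end of
+	   the limit (it has not been decremented this iteration).  */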
+
+ /* Check the first vector. */
+ movq %rax, %r8
+ VPTESTN %VMM(1), %VMM(1), %k2
+ KMOV %k2, %VRCX
+ bsf %rcx, %r8
+
+ cmpq $(CHAR_PER_VEC), %r8
+ jbe L(end_vec)
+
+ /* Check the second vector. */
+ subq $(CHAR_PER_VEC), %rax
+ movq %rax, %r8
+ KMOV %k0, %VRCX
+ bsf %rcx, %r8
+
+ cmpq $(CHAR_PER_VEC), %r8
+ jbe L(end_vec)
+
+ /* Check the third vector. */
+ subq $(CHAR_PER_VEC), %rax
+ movq %rax, %r8
+ VPTESTN %VMM(3), %VMM(3), %k2
+ KMOV %k2, %VRCX
+ bsf %rcx, %r8
+
+ cmpq $(CHAR_PER_VEC), %r8
+ jbe L(end_vec)
+
+ /* It is in the fourth vector. */
+ subq $(CHAR_PER_VEC), %rax
+ movq %rax, %r8
+ KMOV %k1, %VRCX
+ bsf %rcx, %r8
+
+ P2ALIGN_CLAMPED(4, 3)
+L(end_vec):
+ /* Get the number that has been processed. */
+ movq %rsi, %rcx
+ subq %rax, %rcx
+
+ /* Add that to the offset we found the null terminator at. */
+ leaq (%r8, %rcx), %rax
+
+ /* Take the min of that and the limit. */
+ cmpq %rsi, %rax
+ cmovnb %rsi, %rax
+ ret
+
+ P2ALIGN_CLAMPED(4, 11)
+L(crosses_page_boundary):
+ /* Align data backwards to VEC_SIZE. */
+ shrl $20, %eax
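+	/* Undo the shift from the entry check: eax is now the byte
+	   offset of rdi within the page.  */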
+ movq %rdi, %rcx
+ andq $-VEC_SIZE, %rcx
+ VPCMPEQ (%rcx), %VZERO, %k0
+
+ KMOV %k0, %VRCX
+#ifdef USE_AS_WCSLEN
+ shrl $2, %eax
+ andl $(CHAR_PER_VEC - 1), %eax
+#endif
+	/* By this point rax contains the number of CHARs we need to
+	   skip (shrx ignores the higher bits of the count).  */
+ shrx %VRAX, %VRCX, %VRCX
+
+ /* Calculates CHAR_PER_VEC - eax and stores in eax. */
+ negl %eax
+ andl $(CHAR_PER_VEC - 1), %eax
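+	/* eax is now the number of CHARs covered by this first, partial
+	   vector.  */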
+
+ movq %rsi, %rdx
+ bsf %VRCX, %VRDX
+ cmpq %rax, %rdx
+ ja L(cross_page_continue)
+
+ /* The vector had a null terminator or we are at the limit. */
+ movl %edx, %eax
+ cmpq %rdx, %rsi
+ cmovb %esi, %eax
+ ret
+
+END(STRNLEN)
+#endif
-/* strnlen/wcsnlen optimized with 256-bit EVEX instructions.
- Copyright (C) 2022-2024 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#include <isa-level.h>
-#include <sysdep.h>
-
-#if ISA_SHOULD_BUILD (4)
-
-# ifndef VEC_SIZE
-# include "x86-evex256-vecs.h"
-# endif
-
-
-# ifndef STRNLEN
-# define STRNLEN __strnlen_evex
-# endif
-
-# ifdef USE_AS_WCSLEN
-# define VPCMPEQ vpcmpeqd
-# define VPCMPNEQ vpcmpneqd
-# define VPTESTN vptestnmd
-# define VPTEST vptestmd
-# define VPMINU vpminud
-# define CHAR_SIZE 4
-
-# else
-# define VPCMPEQ vpcmpeqb
-# define VPCMPNEQ vpcmpneqb
-# define VPTESTN vptestnmb
-# define VPTEST vptestmb
-# define VPMINU vpminub
-# define CHAR_SIZE 1
-
-# define REG_WIDTH VEC_SIZE
-# endif
-
-# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
-
-# include "reg-macros.h"
-
-# if CHAR_PER_VEC == 32
-# define SUB_SHORT(imm, reg) subb $(imm), %VGPR_SZ(reg, 8)
-# else
-# define SUB_SHORT(imm, reg) subl $(imm), %VGPR_SZ(reg, 32)
-# endif
-
-
-
-# if CHAR_PER_VEC == 64
-# define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 3)
-# else
-# define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 2)
-# endif
-
-
-# define XZERO VMM_128(0)
-# define VZERO VMM(0)
-# define PAGE_SIZE 4096
-
- .section SECTION(.text), "ax", @progbits
-ENTRY_P2ALIGN (STRNLEN, 6)
- /* Check zero length. */
- test %RSI_LP, %RSI_LP
- jz L(zero)
-# ifdef __ILP32__
- /* Clear the upper 32 bits. */
- movl %esi, %esi
-# endif
-
- movl %edi, %eax
- vpxorq %XZERO, %XZERO, %XZERO
- andl $(PAGE_SIZE - 1), %eax
- cmpl $(PAGE_SIZE - VEC_SIZE), %eax
- ja L(cross_page_boundary)
-
- /* Check the first VEC_SIZE bytes. Each bit in K0 represents a
- null byte. */
- VPCMPEQ (%rdi), %VZERO, %k0
-
- KMOV %k0, %VRCX
- movq %rsi, %rax
-
- /* If src (rcx) is zero, bsf does not change the result. NB:
- Must use 64-bit bsf here so that upper bits of len are not
- cleared. */
- bsfq %rcx, %rax
- /* If rax > CHAR_PER_VEC then rcx must have been zero (no null
- CHAR) and rsi must be > CHAR_PER_VEC. */
- cmpq $CHAR_PER_VEC, %rax
- ja L(more_1x_vec)
- /* Check if first match in bounds. */
- cmpq %rax, %rsi
- cmovb %esi, %eax
- ret
-
-
-# if CHAR_PER_VEC != 32
- .p2align 4,, 2
-L(zero):
-L(max_0):
- movl %esi, %eax
- ret
-# endif
-
- /* Aligned more for strnlen compares remaining length vs 2 *
- CHAR_PER_VEC, 4 * CHAR_PER_VEC, and 8 * CHAR_PER_VEC before
- going to the loop. */
- .p2align 4,, 10
-L(more_1x_vec):
-L(cross_page_continue):
- /* Compute number of words checked after aligning. */
-# ifdef USE_AS_WCSLEN
- /* Need to compute directly for wcslen as CHAR_SIZE * rsi can
- overflow. */
- movq %rdi, %rax
- andq $(VEC_SIZE * -1), %rdi
- subq %rdi, %rax
- sarq $2, %rax
- leaq -(CHAR_PER_VEC * 1)(%rax, %rsi), %rax
-# else
- leaq (VEC_SIZE * -1)(%rsi, %rdi), %rax
- andq $(VEC_SIZE * -1), %rdi
- subq %rdi, %rax
-# endif
-
-
- VPCMPEQ VEC_SIZE(%rdi), %VZERO, %k0
-
- cmpq $(CHAR_PER_VEC * 2), %rax
- ja L(more_2x_vec)
-
-L(last_2x_vec_or_less):
- KMOV %k0, %VRDX
- test %VRDX, %VRDX
- jnz L(last_vec_check)
-
- /* Check the end of data. */
- SUB_SHORT (CHAR_PER_VEC, rax)
- jbe L(max_0)
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0
- KMOV %k0, %VRDX
- test %VRDX, %VRDX
- jz L(max_0)
- /* Best place for LAST_VEC_CHECK if ZMM. */
- .p2align 4,, 8
-L(last_vec_check):
- bsf %VRDX, %VRDX
- sub %eax, %edx
- lea (%rsi, %rdx), %eax
- cmovae %esi, %eax
- ret
-
-# if CHAR_PER_VEC == 32
- .p2align 4,, 2
-L(zero):
-L(max_0):
- movl %esi, %eax
- ret
-# endif
-
- .p2align 4,, 8
-L(last_4x_vec_or_less):
- addl $(CHAR_PER_VEC * -4), %eax
- VPCMPEQ (VEC_SIZE * 5)(%rdi), %VZERO, %k0
- subq $(VEC_SIZE * -4), %rdi
- cmpl $(CHAR_PER_VEC * 2), %eax
- jbe L(last_2x_vec_or_less)
-
- .p2align 4,, 6
-L(more_2x_vec):
- /* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without
- rechecking bounds. */
-
- KMOV %k0, %VRDX
-
- test %VRDX, %VRDX
- jnz L(first_vec_x1)
-
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0
- KMOV %k0, %VRDX
- test %VRDX, %VRDX
- jnz L(first_vec_x2)
-
- cmpq $(CHAR_PER_VEC * 4), %rax
- ja L(more_4x_vec)
-
-
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0
- KMOV %k0, %VRDX
- addl $(CHAR_PER_VEC * -2), %eax
- test %VRDX, %VRDX
- jnz L(last_vec_check)
-
- subl $(CHAR_PER_VEC), %eax
- jbe L(max_1)
-
- VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0
- KMOV %k0, %VRDX
-
- test %VRDX, %VRDX
- jnz L(last_vec_check)
-L(max_1):
- movl %esi, %eax
- ret
-
- .p2align 4,, 3
-L(first_vec_x2):
-# if VEC_SIZE == 64
- /* If VEC_SIZE == 64 we can fit logic for full return label in
- spare bytes before next cache line. */
- bsf %VRDX, %VRDX
- sub %eax, %esi
- leal (CHAR_PER_VEC * 1)(%rsi, %rdx), %eax
- ret
- .p2align 4,, 6
-# else
- addl $CHAR_PER_VEC, %esi
-# endif
-L(first_vec_x1):
- bsf %VRDX, %VRDX
- sub %eax, %esi
- leal (CHAR_PER_VEC * 0)(%rsi, %rdx), %eax
- ret
-
-
- .p2align 4,, 6
-L(first_vec_x4):
-# if VEC_SIZE == 64
- /* If VEC_SIZE == 64 we can fit logic for full return label in
- spare bytes before next cache line. */
- bsf %VRDX, %VRDX
- sub %eax, %esi
- leal (CHAR_PER_VEC * 3)(%rsi, %rdx), %eax
- ret
- .p2align 4,, 6
-# else
- addl $CHAR_PER_VEC, %esi
-# endif
-L(first_vec_x3):
- bsf %VRDX, %VRDX
- sub %eax, %esi
- leal (CHAR_PER_VEC * 2)(%rsi, %rdx), %eax
- ret
-
- .p2align 4,, 5
-L(more_4x_vec):
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0
- KMOV %k0, %VRDX
- test %VRDX, %VRDX
- jnz L(first_vec_x3)
-
- VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0
- KMOV %k0, %VRDX
- test %VRDX, %VRDX
- jnz L(first_vec_x4)
-
- /* Check if at last VEC_SIZE * 4 length before aligning for the
- loop. */
- cmpq $(CHAR_PER_VEC * 8), %rax
- jbe L(last_4x_vec_or_less)
-
-
- /* Compute number of words checked after aligning. */
-# ifdef USE_AS_WCSLEN
- /* Need to compute directly for wcslen as CHAR_SIZE * rsi can
- overflow. */
- leaq (VEC_SIZE * -3)(%rdi), %rdx
-# else
- leaq (VEC_SIZE * -3)(%rdi, %rax), %rax
-# endif
-
- subq $(VEC_SIZE * -1), %rdi
-
- /* Align data to VEC_SIZE * 4. */
-# if VEC_SIZE == 64
- /* Saves code size. No evex512 processor has partial register
- stalls. If that change this can be replaced with `andq
- $-(VEC_SIZE * 4), %rdi`. */
- xorb %dil, %dil
-# else
- andq $-(VEC_SIZE * 4), %rdi
-# endif
-
-# ifdef USE_AS_WCSLEN
- subq %rdi, %rdx
- sarq $2, %rdx
- addq %rdx, %rax
-# else
- subq %rdi, %rax
-# endif
- /* Compare 4 * VEC at a time forward. */
- .p2align 4,, 11
-L(loop_4x_vec):
- VMOVA (VEC_SIZE * 4)(%rdi), %VMM(1)
- VPMINU (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2)
- VMOVA (VEC_SIZE * 6)(%rdi), %VMM(3)
- VPMINU (VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4)
- VPTESTN %VMM(2), %VMM(2), %k0
- VPTESTN %VMM(4), %VMM(4), %k2
- subq $-(VEC_SIZE * 4), %rdi
- /* Break if at end of length. */
- subq $(CHAR_PER_VEC * 4), %rax
- jbe L(loop_len_end)
-
-
- KORTEST %k0, %k2
- jz L(loop_4x_vec)
-
-
-L(loop_last_4x_vec):
- movq %rsi, %rcx
- subq %rax, %rsi
- VPTESTN %VMM(1), %VMM(1), %k1
- KMOV %k1, %VRDX
- test %VRDX, %VRDX
- jnz L(last_vec_x0)
-
- KMOV %k0, %VRDX
- test %VRDX, %VRDX
- jnz L(last_vec_x1)
-
- VPTESTN %VMM(3), %VMM(3), %k0
-
- /* Separate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for
- returning last 2x VEC. For VEC_SIZE == 64 we test each VEC
- individually, for VEC_SIZE == 32 we combine them in a single
- 64-bit GPR. */
-# if CHAR_PER_VEC == 64
- KMOV %k0, %VRDX
- test %VRDX, %VRDX
- jnz L(last_vec_x2)
- KMOV %k2, %VRDX
-# else
- /* We can only combine last 2x VEC masks if CHAR_PER_VEC <= 32.
- */
- kmovd %k2, %edx
- kmovd %k0, %eax
- salq $CHAR_PER_VEC, %rdx
- orq %rax, %rdx
-# endif
-
- /* first_vec_x3 for strlen-ZMM and first_vec_x2 for strlen-YMM.
- */
- bsfq %rdx, %rdx
- leaq (FALLTHROUGH_RETURN_OFFSET - CHAR_PER_VEC * 4)(%rsi, %rdx), %rax
- cmpq %rax, %rcx
- cmovb %rcx, %rax
- ret
-
- /* Handle last 4x VEC after loop. All VECs have been loaded. */
- .p2align 4,, 4
-L(loop_len_end):
- KORTEST %k0, %k2
- jnz L(loop_last_4x_vec)
- movq %rsi, %rax
- ret
-
-
-# if CHAR_PER_VEC == 64
- /* Since we can't combine the last 2x VEC for VEC_SIZE == 64
- need return label for it. */
- .p2align 4,, 8
-L(last_vec_x2):
- bsf %VRDX, %VRDX
- leaq (CHAR_PER_VEC * -2)(%rsi, %rdx), %rax
- cmpq %rax, %rcx
- cmovb %rcx, %rax
- ret
-# endif
-
-
- .p2align 4,, 10
-L(last_vec_x1):
- addq $CHAR_PER_VEC, %rsi
-L(last_vec_x0):
- bsf %VRDX, %VRDX
- leaq (CHAR_PER_VEC * -4)(%rsi, %rdx), %rax
- cmpq %rax, %rcx
- cmovb %rcx, %rax
- ret
-
-
- .p2align 4,, 8
-L(cross_page_boundary):
- /* Align data to VEC_SIZE. */
- movq %rdi, %rcx
- andq $-VEC_SIZE, %rcx
- VPCMPEQ (%rcx), %VZERO, %k0
-
- KMOV %k0, %VRCX
-# ifdef USE_AS_WCSLEN
- shrl $2, %eax
- andl $(CHAR_PER_VEC - 1), %eax
-# endif
- shrx %VRAX, %VRCX, %VRCX
-
- negl %eax
- andl $(CHAR_PER_VEC - 1), %eax
- movq %rsi, %rdx
- bsf %VRCX, %VRDX
- cmpq %rax, %rdx
- ja L(cross_page_continue)
- movl %edx, %eax
- cmpq %rdx, %rsi
- cmovb %esi, %eax
- ret
-END (STRNLEN)
+#ifndef STRNLEN
+#define STRNLEN __strnlen_evex
#endif
+
+#include "x86-evex256-vecs.h"
+#include "reg-macros.h"
+#include "strnlen-evex-base.S"
-/* Placeholder function, not used by any processor at the moment.
- Copyright (C) 2022-2024 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
#ifndef STRNLEN
#define STRNLEN __strnlen_evex512
#endif
#include "x86-evex512-vecs.h"
#include "reg-macros.h"
-
-#include <isa-level.h>
-
-#if ISA_SHOULD_BUILD (4)
-
-# include <sysdep.h>
-
-# ifdef USE_AS_WCSLEN
-# define VPCMPEQ vpcmpeqd
-# define VPTESTN vptestnmd
-# define VPMINU vpminud
-# define CHAR_SIZE 4
-# else
-# define VPCMPEQ vpcmpeqb
-# define VPTESTN vptestnmb
-# define VPMINU vpminub
-# define CHAR_SIZE 1
-# endif
-
-# define PAGE_SIZE 4096
-# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
-
- .section SECTION(.text),"ax",@progbits
-/* Aligning entry point to 64 byte, provides better performance for
- one vector length string. */
-ENTRY_P2ALIGN (STRNLEN, 6)
- /* Check zero length. */
- test %RSI_LP, %RSI_LP
- jz L(ret_max)
-# ifdef __ILP32__
- /* Clear the upper 32 bits. */
- movl %esi, %esi
-# endif
-
- movl %edi, %eax
- vpxorq %VMM_128(0), %VMM_128(0), %VMM_128(0)
- sall $20, %eax
- cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax
- ja L(page_cross)
-
- /* Compare [w]char for null, mask bit will be set for match. */
- VPCMPEQ (%rdi), %VMM(0), %k0
- KMOV %k0, %VRCX
- /* Store max length in rax. */
- mov %rsi, %rax
- /* If rcx is 0, rax will have max length. We can not use VRCX
- and VRAX here for evex256 because, upper 32 bits may be
- undefined for ecx and eax. */
- bsfq %rcx, %rax
- cmp $CHAR_PER_VEC, %rax
- ja L(align_more)
- cmpq %rax, %rsi
- cmovb %esi, %eax
- ret
-
- /* At this point vector max length reached. */
- .p2align 4,,3
-L(ret_max):
- movq %rsi, %rax
- ret
-
-L(align_more):
- mov %rdi, %rax
- /* Align rax to VEC_SIZE. */
- andq $-VEC_SIZE, %rax
- movq %rdi, %rdx
- subq %rax, %rdx
-# ifdef USE_AS_WCSLEN
- shr $2, %VRDX
-# endif
- /* At this point rdx contains [w]chars already compared. */
- leaq -CHAR_PER_VEC(%rsi, %rdx), %rdx
- /* At this point rdx contains number of w[char] needs to go.
- Now onwards rdx will keep decrementing with each compare. */
-
- /* Loop unroll 4 times for 4 vector loop. */
- VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0
- subq $-VEC_SIZE, %rax
- KMOV %k0, %VRCX
- test %VRCX, %VRCX
- jnz L(ret_vec_x1)
-
- subq $CHAR_PER_VEC, %rdx
- jbe L(ret_max)
-
- VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0
- KMOV %k0, %VRCX
- test %VRCX, %VRCX
- jnz L(ret_vec_x2)
-
- subq $CHAR_PER_VEC, %rdx
- jbe L(ret_max)
-
- VPCMPEQ (VEC_SIZE * 2)(%rax), %VMM(0), %k0
- KMOV %k0, %VRCX
- test %VRCX, %VRCX
- jnz L(ret_vec_x3)
-
- subq $CHAR_PER_VEC, %rdx
- jbe L(ret_max)
-
- VPCMPEQ (VEC_SIZE * 3)(%rax), %VMM(0), %k0
- KMOV %k0, %VRCX
- test %VRCX, %VRCX
- jnz L(ret_vec_x4)
-
- subq $CHAR_PER_VEC, %rdx
- jbe L(ret_max)
- /* Save pointer before 4 x VEC_SIZE alignment. */
- movq %rax, %rcx
-
- /* Align address to VEC_SIZE * 4 for loop. */
- andq $-(VEC_SIZE * 4), %rax
-
- subq %rax, %rcx
-# ifdef USE_AS_WCSLEN
- shr $2, %VRCX
-# endif
- /* rcx contains number of [w]char will be recompared due to
- alignment fixes. rdx must be incremented by rcx to offset
- alignment adjustment. */
- addq %rcx, %rdx
- /* Need jump as we don't want to add/subtract rdx for first
- iteration of 4 x VEC_SIZE aligned loop. */
-
- .p2align 4,,11
-L(loop):
- /* VPMINU and VPCMP combination provide better performance as
- compared to alternative combinations. */
- VMOVA (VEC_SIZE * 4)(%rax), %VMM(1)
- VPMINU (VEC_SIZE * 5)(%rax), %VMM(1), %VMM(2)
- VMOVA (VEC_SIZE * 6)(%rax), %VMM(3)
- VPMINU (VEC_SIZE * 7)(%rax), %VMM(3), %VMM(4)
-
- VPTESTN %VMM(2), %VMM(2), %k0
- VPTESTN %VMM(4), %VMM(4), %k1
-
- subq $-(VEC_SIZE * 4), %rax
- KORTEST %k0, %k1
-
- jnz L(loopend)
- subq $(CHAR_PER_VEC * 4), %rdx
- ja L(loop)
- mov %rsi, %rax
- ret
-
-L(loopend):
-
- VPTESTN %VMM(1), %VMM(1), %k2
- KMOV %k2, %VRCX
- test %VRCX, %VRCX
- jnz L(ret_vec_x1)
-
- KMOV %k0, %VRCX
- /* At this point, if k0 is non zero, null char must be in the
- second vector. */
- test %VRCX, %VRCX
- jnz L(ret_vec_x2)
-
- VPTESTN %VMM(3), %VMM(3), %k3
- KMOV %k3, %VRCX
- test %VRCX, %VRCX
- jnz L(ret_vec_x3)
- /* At this point null [w]char must be in the fourth vector so no
- need to check. */
- KMOV %k1, %VRCX
-
- /* Fourth, third, second vector terminating are pretty much
- same, implemented this way to avoid branching and reuse code
- from pre loop exit condition. */
-L(ret_vec_x4):
- bsf %VRCX, %VRCX
- subq %rdi, %rax
-# ifdef USE_AS_WCSLEN
- subq $-(VEC_SIZE * 3), %rax
- shrq $2, %rax
- addq %rcx, %rax
-# else
- leaq (VEC_SIZE * 3)(%rcx, %rax), %rax
-# endif
-
- cmpq %rsi, %rax
- cmovnb %rsi, %rax
- ret
-
-L(ret_vec_x3):
- bsf %VRCX, %VRCX
- subq %rdi, %rax
-# ifdef USE_AS_WCSLEN
- subq $-(VEC_SIZE * 2), %rax
- shrq $2, %rax
- addq %rcx, %rax
-# else
- leaq (VEC_SIZE * 2)(%rcx, %rax), %rax
-# endif
- cmpq %rsi, %rax
- cmovnb %rsi, %rax
- ret
-
-L(ret_vec_x2):
- subq $-VEC_SIZE, %rax
-L(ret_vec_x1):
- bsf %VRCX, %VRCX
- subq %rdi, %rax
-# ifdef USE_AS_WCSLEN
- shrq $2, %rax
-# endif
- addq %rcx, %rax
- cmpq %rsi, %rax
- cmovnb %rsi, %rax
- ret
-
-L(page_cross):
- mov %rdi, %rax
- movl %edi, %ecx
- andl $(VEC_SIZE - 1), %ecx
-# ifdef USE_AS_WCSLEN
- sarl $2, %ecx
-# endif
- /* ecx contains number of w[char] to be skipped as a result
- of address alignment. */
- andq $-VEC_SIZE, %rax
- VPCMPEQ (%rax), %VMM(0), %k0
- KMOV %k0, %VRDX
- /* Ignore number of character for alignment adjustment. */
- shr %cl, %VRDX
- jnz L(page_cross_end)
- movl $CHAR_PER_VEC, %eax
- sub %ecx, %eax
- cmp %rax, %rsi
- ja L(align_more)
-
-L(page_cross_end):
- bsf %VRDX, %VRAX
- cmpq %rsi, %rax
- cmovnb %esi, %eax
- ret
-
-END (STRNLEN)
-#endif
+#include "strnlen-evex-base.S"
\ No newline at end of file