[PATCH] x86: Unifies 'strlen-evex' and 'strlen-evex512' implementations.

Wed Dec 13 19:57:35 GMT 2023

This commit uses a common implementation 'strlen-evex-base.S' for both
'strlen-evex' and 'strlen-evex512'

The motivation is to reduce the number of implementations to maintain.
This incidentally gives a small performance improvement.

All tests pass on x86.

Benchmarks were taken on SKX.
https://www.intel.com/content/www/us/en/products/sku/123613/intel-core-i97900x-xseries-processor-13-75m-cache-up-to-4-30-ghz/specifications.html

Geometric mean for strlen-evex512 over all benchmarks (N=10) was (new/old) 0.939
Geometric mean for wcslen-evex512 over all benchmarks (N=10) was (new/old) 0.965

Code Size Changes:
    strlen-evex512.S    :  +24 bytes
    wcslen-evex512.S    :  +54 bytes
---
 sysdeps/x86_64/multiarch/strlen-evex-base.S | 385 ++++++++------------
 sysdeps/x86_64/multiarch/strlen-evex.S      | 250 +------------
 sysdeps/x86_64/multiarch/strnlen-evex512.S  | 266 +++++++++++++-
 sysdeps/x86_64/multiarch/wcslen-evex512.S   |   6 +-
 sysdeps/x86_64/multiarch/wcsnlen-evex512.S  |   9 +-
 5 files changed, 440 insertions(+), 476 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strlen-evex-base.S b/sysdeps/x86_64/multiarch/strlen-evex-base.S
index 7305b24e28..04a173c776 100644
--- a/sysdeps/x86_64/multiarch/strlen-evex-base.S
+++ b/sysdeps/x86_64/multiarch/strlen-evex-base.S
@@ -1,5 +1,5 @@
-/* Placeholder function, not used by any processor at the moment.
-   Copyright (C) 2022-2023 Free Software Foundation, Inc.
+/* strlen/strnlen/wcslen/wcsnlen optimized with 256-bit EVEX instructions.
+   Copyright (C) 2021-2023 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -16,282 +16,217 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-/* UNUSED. Exists purely as reference implementation.  */
-
+	
 #include <isa-level.h>
 
-#if ISA_SHOULD_BUILD (4)
-
 # include <sysdep.h>
 
 # ifdef USE_AS_WCSLEN
 #  define VPCMPEQ	vpcmpeqd
+#  define VPCMPNEQ	vpcmpneqd
 #  define VPTESTN	vptestnmd
+#  define VPTEST	vptestmd
 #  define VPMINU	vpminud
 #  define CHAR_SIZE	4
+#  define CHAR_SIZE_SHIFT_REG(reg)	sar $2, %reg
 # else
 #  define VPCMPEQ	vpcmpeqb
+#  define VPCMPNEQ	vpcmpneqb
 #  define VPTESTN	vptestnmb
+#  define VPTEST	vptestmb
 #  define VPMINU	vpminub
 #  define CHAR_SIZE	1
+#  define CHAR_SIZE_SHIFT_REG(reg)
+
+#  define REG_WIDTH	VEC_SIZE
 # endif
 
-# define PAGE_SIZE	4096
 # define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
 
-	.section SECTION(.text),"ax",@progbits
-/* Aligning entry point to 64 byte, provides better performance for
-   one vector length string.  */
-ENTRY_P2ALIGN (STRLEN, 6)
-# ifdef USE_AS_STRNLEN
-	/* Check zero length.  */
-	test	%RSI_LP, %RSI_LP
-	jz	L(ret_max)
-#  ifdef __ILP32__
-	/* Clear the upper 32 bits.  */
-	movl	%esi, %esi
-#  endif
+# include "reg-macros.h"
+
+# if CHAR_PER_VEC == 64
+
+#  define TAIL_RETURN_LBL	first_vec_x2
+#  define TAIL_RETURN_OFFSET	(CHAR_PER_VEC * 2)
+
+#  define FALLTHROUGH_RETURN_LBL	first_vec_x3
+#  define FALLTHROUGH_RETURN_OFFSET	(CHAR_PER_VEC * 3)
+
+# else
+
+#  define TAIL_RETURN_LBL	first_vec_x3
+#  define TAIL_RETURN_OFFSET	(CHAR_PER_VEC * 3)
+
+#  define FALLTHROUGH_RETURN_LBL	first_vec_x2
+#  define FALLTHROUGH_RETURN_OFFSET	(CHAR_PER_VEC * 2)
 # endif
 
+# define XZERO	VMM_128(0)
+# define VZERO	VMM(0)
+# define PAGE_SIZE	4096
+
+	.section SECTION(.text), "ax", @progbits
+ENTRY_P2ALIGN(STRLEN, 6)
 	movl	%edi, %eax
-	vpxorq	%VMM_128(0), %VMM_128(0), %VMM_128(0)
-	sall	$20, %eax
-	cmpl	$((PAGE_SIZE - VEC_SIZE) << 20), %eax
-	ja	L(page_cross)
-
-	/* Compare [w]char for null, mask bit will be set for match.  */
-	VPCMPEQ	(%rdi), %VMM(0), %k0
-# ifdef USE_AS_STRNLEN
-	KMOV	%k0, %VRCX
-	/* Store max length in rax.  */
-	mov	%rsi, %rax
-	/* If rcx is 0, rax will have max length.  We can not use VRCX
-	   and VRAX here for evex256 because, upper 32 bits may be
-	   undefined for ecx and eax.  */
-	bsfq	%rcx, %rax
-	cmp	$CHAR_PER_VEC, %rax
-	ja	L(align_more)
-	cmpq	%rax, %rsi
-	cmovb	%esi, %eax
-# else
+	vpxorq	%XZERO, %XZERO, %XZERO
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(cross_page_boundary)
+
+	/* Check the first VEC_SIZE bytes.  Each bit in K0 represents a
+	   null byte.  */
+	VPCMPEQ	(%rdi), %VZERO, %k0
 	KMOV	%k0, %VRAX
 	test	%VRAX, %VRAX
-	jz	L(align_more)
+	jz	L(aligned_more)
 	bsf	%VRAX, %VRAX
-# endif
 	ret
 
-	/* At this point vector max length reached.  */
-# ifdef USE_AS_STRNLEN
-	.p2align 4,,3
-L(ret_max):
-	movq	%rsi, %rax
+	.p2align 4,, 8
+L(first_vec_x4):
+	bsf	%VRAX, %VRAX
+	subl	%ecx, %edi
+	CHAR_SIZE_SHIFT_REG (edi)
+	leal	(CHAR_PER_VEC * 4)(%rdi, %rax), %eax
 	ret
-# endif
-
-L(align_more):
-	mov	%rdi, %rax
-	/* Align rax to VEC_SIZE.  */
-	andq	$-VEC_SIZE, %rax
-# ifdef USE_AS_STRNLEN
-	movq	%rdi, %rdx
-	subq	%rax, %rdx
-#  ifdef USE_AS_WCSLEN
-	shr	$2, %VRDX
-#  endif
-	/* At this point rdx contains [w]chars already compared.  */
-	leaq	-CHAR_PER_VEC(%rsi, %rdx), %rdx
-	/* At this point rdx contains number of w[char] needs to go.
-	   Now onwards rdx will keep decrementing with each compare.  */
-# endif
 
-	/* Loop unroll 4 times for 4 vector loop.  */
-	VPCMPEQ	VEC_SIZE(%rax), %VMM(0), %k0
-	subq	$-VEC_SIZE, %rax
-	KMOV	%k0, %VRCX
-	test	%VRCX, %VRCX
-	jnz	L(ret_vec_x1)
 
-# ifdef USE_AS_STRNLEN
-	subq	$CHAR_PER_VEC, %rdx
-	jbe	L(ret_max)
-# endif
 
-	VPCMPEQ	VEC_SIZE(%rax), %VMM(0), %k0
-	KMOV	%k0, %VRCX
-	test	%VRCX, %VRCX
-	jnz	L(ret_vec_x2)
+	/* Aligned more for strnlen compares remaining length vs 2 *
+	   CHAR_PER_VEC, 4 * CHAR_PER_VEC, and 8 * CHAR_PER_VEC before
+	   going to the loop.  */
+	.p2align 4,, 10
+L(aligned_more):
+	movq	%rdi, %rcx
+	andq	$(VEC_SIZE * -1), %rdi
+L(cross_page_continue):
+	/* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without
+	   rechecking bounds.  */
+	VPCMPEQ	(VEC_SIZE * 1)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
+	jnz	L(first_vec_x1)
 
-# ifdef USE_AS_STRNLEN
-	subq	$CHAR_PER_VEC, %rdx
-	jbe	L(ret_max)
-# endif
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
+	jnz	L(first_vec_x2)
 
-	VPCMPEQ	(VEC_SIZE * 2)(%rax), %VMM(0), %k0
-	KMOV	%k0, %VRCX
-	test	%VRCX, %VRCX
-	jnz	L(ret_vec_x3)
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
+	jnz	L(first_vec_x3)
 
-# ifdef USE_AS_STRNLEN
-	subq	$CHAR_PER_VEC, %rdx
-	jbe	L(ret_max)
-# endif
+	VPCMPEQ	(VEC_SIZE * 4)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
+	jnz	L(first_vec_x4)
 
-	VPCMPEQ	(VEC_SIZE * 3)(%rax), %VMM(0), %k0
-	KMOV	%k0, %VRCX
-	test	%VRCX, %VRCX
-	jnz	L(ret_vec_x4)
+	subq	$(VEC_SIZE * -1), %rdi
 
-# ifdef USE_AS_STRNLEN
-	subq	$CHAR_PER_VEC, %rdx
-	jbe	L(ret_max)
-	/* Save pointer before 4 x VEC_SIZE alignment.  */
-	movq	%rax, %rcx
+# if CHAR_PER_VEC == 64
+	/* No partial register stalls on processors that we use evex512
+	   on and this saves code size.  */
+	xorb	%dil, %dil
+# else
+	andq	$-(VEC_SIZE * 4), %rdi
 # endif
 
-	/* Align address to VEC_SIZE * 4 for loop.  */
-	andq	$-(VEC_SIZE * 4), %rax
-
-# ifdef USE_AS_STRNLEN
-	subq	%rax, %rcx
-#  ifdef USE_AS_WCSLEN
-	shr	$2, %VRCX
-#  endif
-	/* rcx contains number of [w]char will be recompared due to
-	   alignment fixes.  rdx must be incremented by rcx to offset
-	   alignment adjustment.  */
-	addq	%rcx, %rdx
-	/* Need jump as we don't want to add/subtract rdx for first
-	   iteration of 4 x VEC_SIZE aligned loop.  */
-# endif
 
-	.p2align 4,,11
-L(loop):
-	/* VPMINU and VPCMP combination provide better performance as
-	   compared to alternative combinations.  */
-	VMOVA	(VEC_SIZE * 4)(%rax), %VMM(1)
-	VPMINU	(VEC_SIZE * 5)(%rax), %VMM(1), %VMM(2)
-	VMOVA	(VEC_SIZE * 6)(%rax), %VMM(3)
-	VPMINU	(VEC_SIZE * 7)(%rax), %VMM(3), %VMM(4)
 
+	/* Compare 4 * VEC at a time forward.  */
+	.p2align 4
+L(loop_4x_vec):
+	VMOVA	(VEC_SIZE * 4)(%rdi), %VMM(1)
+	VPMINU	(VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2)
+	VMOVA	(VEC_SIZE * 6)(%rdi), %VMM(3)
+	VPMINU	(VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4)
 	VPTESTN	%VMM(2), %VMM(2), %k0
-	VPTESTN	%VMM(4), %VMM(4), %k1
+	VPTESTN	%VMM(4), %VMM(4), %k2
+
+	subq	$-(VEC_SIZE * 4), %rdi
+	KORTEST	%k0, %k2
+	jz	L(loop_4x_vec)
+
+	VPTESTN	%VMM(1), %VMM(1), %k1
+	KMOV	%k1, %VRAX
+	test	%VRAX, %VRAX
+	jnz	L(first_vec_x0)
 
-	subq	$-(VEC_SIZE * 4), %rax
-	KORTEST	%k0, %k1
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
+	jnz	L(first_vec_x1)
 
-# ifndef USE_AS_STRNLEN
-	jz      L(loop)
+	VPTESTN	%VMM(3), %VMM(3), %k0
+
+# if CHAR_PER_VEC == 64
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
+	jnz	L(first_vec_x2)
+	KMOV	%k2, %VRAX
 # else
-	jnz	L(loopend)
-	subq	$(CHAR_PER_VEC * 4), %rdx
-	ja	L(loop)
-	mov	%rsi, %rax
+	/* We can only combine last 2x VEC masks if CHAR_PER_VEC <= 32.  */
+	kmovd	%k2, %edx
+	kmovd	%k0, %eax
+	salq	$CHAR_PER_VEC, %rdx
+	orq	%rdx, %rax
+# endif
+
+	/* first_vec_x3 for strlen-ZMM and first_vec_x2 for strlen-YMM.  */
+	.p2align 4,, 2
+L(FALLTHROUGH_RETURN_LBL):
+	bsfq	%rax, %rax
+	subq	%rcx, %rdi
+	CHAR_SIZE_SHIFT_REG (rdi)
+	leaq	(FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax
 	ret
-# endif
 
-L(loopend):
-
-	VPTESTN	%VMM(1), %VMM(1), %k2
-	KMOV	%k2, %VRCX
-	test	%VRCX, %VRCX
-	jnz	L(ret_vec_x1)
-
-	KMOV	%k0, %VRCX
-	/* At this point, if k0 is non zero, null char must be in the
-	   second vector.  */
-	test	%VRCX, %VRCX
-	jnz	L(ret_vec_x2)
-
-	VPTESTN	%VMM(3), %VMM(3), %k3
-	KMOV	%k3, %VRCX
-	test	%VRCX, %VRCX
-	jnz	L(ret_vec_x3)
-	/* At this point null [w]char must be in the fourth vector so no
-	   need to check.  */
-	KMOV	%k1, %VRCX
-
-	/* Fourth, third, second vector terminating are pretty much
-	   same, implemented this way to avoid branching and reuse code
-	   from pre loop exit condition.  */
-L(ret_vec_x4):
-	bsf	%VRCX, %VRCX
-	subq	%rdi, %rax
-# ifdef USE_AS_WCSLEN
-	subq	$-(VEC_SIZE * 3), %rax
-	shrq	$2, %rax
-	addq	%rcx, %rax
-# else
-	leaq	(VEC_SIZE * 3)(%rcx, %rax), %rax
-# endif
-# ifdef USE_AS_STRNLEN
-	cmpq	%rsi, %rax
-	cmovnb	%rsi, %rax
-# endif
+	.p2align 4,, 8
+L(first_vec_x0):
+	bsf	%VRAX, %VRAX
+	sub	%rcx, %rdi
+	CHAR_SIZE_SHIFT_REG (rdi)
+	addq	%rdi, %rax
 	ret
 
-L(ret_vec_x3):
-	bsf	%VRCX, %VRCX
-	subq	%rdi, %rax
-# ifdef USE_AS_WCSLEN
-	subq	$-(VEC_SIZE * 2), %rax
-	shrq	$2, %rax
-	addq	%rcx, %rax
-# else
-	leaq	(VEC_SIZE * 2)(%rcx, %rax), %rax
-# endif
-# ifdef USE_AS_STRNLEN
-	cmpq	%rsi, %rax
-	cmovnb	%rsi, %rax
-# endif
+	.p2align 4,, 10
+L(first_vec_x1):
+	bsf	%VRAX, %VRAX
+	sub	%rcx, %rdi
+	CHAR_SIZE_SHIFT_REG (rdi)
+	leaq	(CHAR_PER_VEC)(%rdi, %rax), %rax
 	ret
 
-L(ret_vec_x2):
-	subq	$-VEC_SIZE, %rax
-L(ret_vec_x1):
-	bsf	%VRCX, %VRCX
-	subq	%rdi, %rax
-# ifdef USE_AS_WCSLEN
-	shrq	$2, %rax
-# endif
-	addq	%rcx, %rax
-# ifdef USE_AS_STRNLEN
-	cmpq	%rsi, %rax
-	cmovnb	%rsi, %rax
-# endif
+	.p2align 4,, 10
+	/* first_vec_x2 for strlen-ZMM and first_vec_x3 for strlen-YMM.  */
+L(TAIL_RETURN_LBL):
+	bsf	%VRAX, %VRAX
+	sub	%VRCX, %VRDI
+	CHAR_SIZE_SHIFT_REG (VRDI)
+	lea	(TAIL_RETURN_OFFSET)(%rdi, %rax), %VRAX
 	ret
 
-L(page_cross):
-	mov	%rdi, %rax
-	movl	%edi, %ecx
-	andl	$(VEC_SIZE - 1), %ecx
+	.p2align 4,, 8
+L(cross_page_boundary):
+	movq	%rdi, %rcx
+	/* Align data to VEC_SIZE.  */
+	andq	$-VEC_SIZE, %rdi
+
+	VPCMPEQ	(%rdi), %VZERO, %k0
+
+	KMOV	%k0, %VRAX
 # ifdef USE_AS_WCSLEN
-	sarl	$2, %ecx
-# endif
-	/* ecx contains number of w[char] to be skipped as a result
-	   of address alignment.  */
-	andq	$-VEC_SIZE, %rax
-	VPCMPEQ	(%rax), %VMM(0), %k0
-	KMOV	%k0, %VRDX
-	/* Ignore number of character for alignment adjustment.  */
-	shr	%cl, %VRDX
-# ifdef USE_AS_STRNLEN
-	jnz	L(page_cross_end)
-	movl    $CHAR_PER_VEC, %eax
-	sub     %ecx, %eax
-	cmp	%rax, %rsi
-	ja	L(align_more)
+	movl	%ecx, %edx
+	shrl	$2, %edx
+	andl	$(CHAR_PER_VEC - 1), %edx
+	shrx	%edx, %eax, %eax
+	testl	%eax, %eax
 # else
-	jz	L(align_more)
-# endif
-
-L(page_cross_end):
-	bsf	%VRDX, %VRAX
-# ifdef USE_AS_STRNLEN
-	cmpq	%rsi, %rax
-	cmovnb	%esi, %eax
+	shr	%cl, %VRAX
 # endif
+	jz	L(cross_page_continue)
+	bsf	%VRAX, %VRAX
 	ret
 
-END (STRLEN)
-#endif
+END(STRLEN)
diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S
index 364eeffff6..93ad15e356 100644
--- a/sysdeps/x86_64/multiarch/strlen-evex.S
+++ b/sysdeps/x86_64/multiarch/strlen-evex.S
@@ -1,245 +1,7 @@
-/* strlen/strnlen/wcslen/wcsnlen optimized with 256-bit EVEX instructions.
-   Copyright (C) 2021-2023 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#include <isa-level.h>
-
-#if ISA_SHOULD_BUILD (4)
-
-# include <sysdep.h>
-
-# ifndef STRLEN
-#  define STRLEN	__strlen_evex
-# endif
-
-# ifndef VEC_SIZE
-#  include "x86-evex256-vecs.h"
-# endif
-
-# ifdef USE_AS_WCSLEN
-#  define VPCMPEQ	vpcmpeqd
-#  define VPCMPNEQ	vpcmpneqd
-#  define VPTESTN	vptestnmd
-#  define VPTEST	vptestmd
-#  define VPMINU	vpminud
-#  define CHAR_SIZE	4
-#  define CHAR_SIZE_SHIFT_REG(reg)	sar $2, %reg
-# else
-#  define VPCMPEQ	vpcmpeqb
-#  define VPCMPNEQ	vpcmpneqb
-#  define VPTESTN	vptestnmb
-#  define VPTEST	vptestmb
-#  define VPMINU	vpminub
-#  define CHAR_SIZE	1
-#  define CHAR_SIZE_SHIFT_REG(reg)
-
-#  define REG_WIDTH	VEC_SIZE
-# endif
-
-# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
-
-# include "reg-macros.h"
-
-# if CHAR_PER_VEC == 64
-
-#  define TAIL_RETURN_LBL	first_vec_x2
-#  define TAIL_RETURN_OFFSET	(CHAR_PER_VEC * 2)
-
-#  define FALLTHROUGH_RETURN_LBL	first_vec_x3
-#  define FALLTHROUGH_RETURN_OFFSET	(CHAR_PER_VEC * 3)
-
-# else
-
-#  define TAIL_RETURN_LBL	first_vec_x3
-#  define TAIL_RETURN_OFFSET	(CHAR_PER_VEC * 3)
-
-#  define FALLTHROUGH_RETURN_LBL	first_vec_x2
-#  define FALLTHROUGH_RETURN_OFFSET	(CHAR_PER_VEC * 2)
-# endif
-
-# define XZERO	VMM_128(0)
-# define VZERO	VMM(0)
-# define PAGE_SIZE	4096
-
-	.section SECTION(.text), "ax", @progbits
-ENTRY_P2ALIGN (STRLEN, 6)
-	movl	%edi, %eax
-	vpxorq	%XZERO, %XZERO, %XZERO
-	andl	$(PAGE_SIZE - 1), %eax
-	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
-	ja	L(cross_page_boundary)
-
-	/* Check the first VEC_SIZE bytes.  Each bit in K0 represents a
-	   null byte.  */
-	VPCMPEQ	(%rdi), %VZERO, %k0
-	KMOV	%k0, %VRAX
-	test	%VRAX, %VRAX
-	jz	L(aligned_more)
-	bsf	%VRAX, %VRAX
-	ret
-
-	.p2align 4,, 8
-L(first_vec_x4):
-	bsf	%VRAX, %VRAX
-	subl	%ecx, %edi
-	CHAR_SIZE_SHIFT_REG (edi)
-	leal	(CHAR_PER_VEC * 4)(%rdi, %rax), %eax
-	ret
-
-
-
-	/* Aligned more for strnlen compares remaining length vs 2 *
-	   CHAR_PER_VEC, 4 * CHAR_PER_VEC, and 8 * CHAR_PER_VEC before
-	   going to the loop.  */
-	.p2align 4,, 10
-L(aligned_more):
-	movq	%rdi, %rcx
-	andq	$(VEC_SIZE * -1), %rdi
-L(cross_page_continue):
-	/* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without
-	   rechecking bounds.  */
-	VPCMPEQ	(VEC_SIZE * 1)(%rdi), %VZERO, %k0
-	KMOV	%k0, %VRAX
-	test	%VRAX, %VRAX
-	jnz	L(first_vec_x1)
-
-	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %VZERO, %k0
-	KMOV	%k0, %VRAX
-	test	%VRAX, %VRAX
-	jnz	L(first_vec_x2)
-
-	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %VZERO, %k0
-	KMOV	%k0, %VRAX
-	test	%VRAX, %VRAX
-	jnz	L(first_vec_x3)
-
-	VPCMPEQ	(VEC_SIZE * 4)(%rdi), %VZERO, %k0
-	KMOV	%k0, %VRAX
-	test	%VRAX, %VRAX
-	jnz	L(first_vec_x4)
-
-	subq	$(VEC_SIZE * -1), %rdi
-
-# if CHAR_PER_VEC == 64
-	/* No partial register stalls on processors that we use evex512
-	   on and this saves code size.  */
-	xorb	%dil, %dil
-# else
-	andq	$-(VEC_SIZE * 4), %rdi
-# endif
-
-
-
-	/* Compare 4 * VEC at a time forward.  */
-	.p2align 4
-L(loop_4x_vec):
-	VMOVA	(VEC_SIZE * 4)(%rdi), %VMM(1)
-	VPMINU	(VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2)
-	VMOVA	(VEC_SIZE * 6)(%rdi), %VMM(3)
-	VPMINU	(VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4)
-	VPTESTN	%VMM(2), %VMM(2), %k0
-	VPTESTN	%VMM(4), %VMM(4), %k2
-
-	subq	$-(VEC_SIZE * 4), %rdi
-	KORTEST %k0, %k2
-	jz	L(loop_4x_vec)
-
-	VPTESTN	%VMM(1), %VMM(1), %k1
-	KMOV	%k1, %VRAX
-	test	%VRAX, %VRAX
-	jnz	L(first_vec_x0)
-
-	KMOV	%k0, %VRAX
-	test	%VRAX, %VRAX
-	jnz	L(first_vec_x1)
-
-	VPTESTN	%VMM(3), %VMM(3), %k0
-
-# if CHAR_PER_VEC == 64
-	KMOV	%k0, %VRAX
-	test	%VRAX, %VRAX
-	jnz	L(first_vec_x2)
-	KMOV	%k2, %VRAX
-# else
-	/* We can only combine last 2x VEC masks if CHAR_PER_VEC <= 32.
-	 */
-	kmovd	%k2, %edx
-	kmovd	%k0, %eax
-	salq	$CHAR_PER_VEC, %rdx
-	orq	%rdx, %rax
-# endif
-
-	/* first_vec_x3 for strlen-ZMM and first_vec_x2 for strlen-YMM.
-	 */
-	.p2align 4,, 2
-L(FALLTHROUGH_RETURN_LBL):
-	bsfq	%rax, %rax
-	subq	%rcx, %rdi
-	CHAR_SIZE_SHIFT_REG (rdi)
-	leaq	(FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax
-	ret
-
-	.p2align 4,, 8
-L(first_vec_x0):
-	bsf	%VRAX, %VRAX
-	sub	%rcx, %rdi
-	CHAR_SIZE_SHIFT_REG (rdi)
-	addq	%rdi, %rax
-	ret
-
-	.p2align 4,, 10
-L(first_vec_x1):
-	bsf	%VRAX, %VRAX
-	sub	%rcx, %rdi
-	CHAR_SIZE_SHIFT_REG (rdi)
-	leaq	(CHAR_PER_VEC)(%rdi, %rax), %rax
-	ret
-
-	.p2align 4,, 10
-	/* first_vec_x2 for strlen-ZMM and first_vec_x3 for strlen-YMM.
-	 */
-L(TAIL_RETURN_LBL):
-	bsf	%VRAX, %VRAX
-	sub	%VRCX, %VRDI
-	CHAR_SIZE_SHIFT_REG (VRDI)
-	lea	(TAIL_RETURN_OFFSET)(%rdi, %rax), %VRAX
-	ret
-
-	.p2align 4,, 8
-L(cross_page_boundary):
-	movq	%rdi, %rcx
-	/* Align data to VEC_SIZE.  */
-	andq	$-VEC_SIZE, %rdi
-
-	VPCMPEQ	(%rdi), %VZERO, %k0
-
-	KMOV	%k0, %VRAX
-# ifdef USE_AS_WCSLEN
-	movl	%ecx, %edx
-	shrl	$2, %edx
-	andl	$(CHAR_PER_VEC - 1), %edx
-	shrx	%edx, %eax, %eax
-	testl	%eax, %eax
-# else
-	shr	%cl, %VRAX
-# endif
-	jz	L(cross_page_continue)
-	bsf	%VRAX, %VRAX
-	ret
-
-END (STRLEN)
+#ifndef STRLEN
+# define STRLEN		__strlen_evex
 #endif
+
+#include "x86-evex256-vecs.h"
+#include "reg-macros.h"
+#include "strlen-evex-base.S"
diff --git a/sysdeps/x86_64/multiarch/strnlen-evex512.S b/sysdeps/x86_64/multiarch/strnlen-evex512.S
index 0b7f220214..ebf22c259f 100644
--- a/sysdeps/x86_64/multiarch/strnlen-evex512.S
+++ b/sysdeps/x86_64/multiarch/strnlen-evex512.S
@@ -1,4 +1,264 @@
-#define STRLEN __strnlen_evex512
-#define USE_AS_STRNLEN 1
+/* Placeholder function, not used by any processor at the moment.
+   Copyright (C) 2022-2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
 
-#include "strlen-evex512.S"
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef STRNLEN
+#define STRNLEN __strnlen_evex512
+#endif
+
+#include "x86-evex512-vecs.h"
+#include "reg-macros.h"
+
+#include <isa-level.h>
+
+#if ISA_SHOULD_BUILD (4)
+
+# include <sysdep.h>
+
+# ifdef USE_AS_WCSLEN
+#  define VPCMPEQ	vpcmpeqd
+#  define VPTESTN	vptestnmd
+#  define VPMINU	vpminud
+#  define CHAR_SIZE	4
+# else
+#  define VPCMPEQ	vpcmpeqb
+#  define VPTESTN	vptestnmb
+#  define VPMINU	vpminub
+#  define CHAR_SIZE	1
+# endif
+
+# define PAGE_SIZE	4096
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+
+	.section SECTION(.text),"ax",@progbits
+/* Aligning entry point to 64 byte, provides better performance for
+   one vector length string.  */
+ENTRY_P2ALIGN (STRNLEN, 6)
+	/* Check zero length.  */
+	test	%RSI_LP, %RSI_LP
+	jz	L(ret_max)
+#  ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	movl	%esi, %esi
+#  endif
+
+	movl	%edi, %eax
+	vpxorq	%VMM_128(0), %VMM_128(0), %VMM_128(0)
+	sall	$20, %eax
+	cmpl	$((PAGE_SIZE - VEC_SIZE) << 20), %eax
+	ja	L(page_cross)
+
+	/* Compare [w]char for null, mask bit will be set for match.  */
+	VPCMPEQ	(%rdi), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+	/* Store max length in rax.  */
+	mov	%rsi, %rax
+	/* If rcx is 0, rax will have max length.  We can not use VRCX
+	   and VRAX here for evex256 because, upper 32 bits may be
+	   undefined for ecx and eax.  */
+	bsfq	%rcx, %rax
+	cmp	$CHAR_PER_VEC, %rax
+	ja	L(align_more)
+	cmpq	%rax, %rsi
+	cmovb	%esi, %eax
+	ret
+
+	/* At this point vector max length reached.  */
+	.p2align 4,,3
+L(ret_max):
+	movq	%rsi, %rax
+	ret
+
+L(align_more):
+	mov	%rdi, %rax
+	/* Align rax to VEC_SIZE.  */
+	andq	$-VEC_SIZE, %rax
+	movq	%rdi, %rdx
+	subq	%rax, %rdx
+#  ifdef USE_AS_WCSLEN
+	shr	$2, %VRDX
+#  endif
+	/* At this point rdx contains [w]chars already compared.  */
+	leaq	-CHAR_PER_VEC(%rsi, %rdx), %rdx
+	/* At this point rdx contains number of w[char] needs to go.
+	   Now onwards rdx will keep decrementing with each compare.  */
+
+	/* Loop unroll 4 times for 4 vector loop.  */
+	VPCMPEQ	VEC_SIZE(%rax), %VMM(0), %k0
+	subq	$-VEC_SIZE, %rax
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x1)
+
+	subq	$CHAR_PER_VEC, %rdx
+	jbe	L(ret_max)
+
+	VPCMPEQ	VEC_SIZE(%rax), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x2)
+
+	subq	$CHAR_PER_VEC, %rdx
+	jbe	L(ret_max)
+
+	VPCMPEQ	(VEC_SIZE * 2)(%rax), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x3)
+
+	subq	$CHAR_PER_VEC, %rdx
+	jbe	L(ret_max)
+
+	VPCMPEQ	(VEC_SIZE * 3)(%rax), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x4)
+
+	subq	$CHAR_PER_VEC, %rdx
+	jbe	L(ret_max)
+	/* Save pointer before 4 x VEC_SIZE alignment.  */
+	movq	%rax, %rcx
+
+	/* Align address to VEC_SIZE * 4 for loop.  */
+	andq	$-(VEC_SIZE * 4), %rax
+
+	subq	%rax, %rcx
+#  ifdef USE_AS_WCSLEN
+	shr	$2, %VRCX
+#  endif
+	/* rcx contains number of [w]char will be recompared due to
+	   alignment fixes.  rdx must be incremented by rcx to offset
+	   alignment adjustment.  */
+	addq	%rcx, %rdx
+	/* Need jump as we don't want to add/subtract rdx for first
+	   iteration of 4 x VEC_SIZE aligned loop.  */
+
+	.p2align 4,,11
+L(loop):
+	/* VPMINU and VPCMP combination provide better performance as
+	   compared to alternative combinations.  */
+	VMOVA	(VEC_SIZE * 4)(%rax), %VMM(1)
+	VPMINU	(VEC_SIZE * 5)(%rax), %VMM(1), %VMM(2)
+	VMOVA	(VEC_SIZE * 6)(%rax), %VMM(3)
+	VPMINU	(VEC_SIZE * 7)(%rax), %VMM(3), %VMM(4)
+
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	VPTESTN	%VMM(4), %VMM(4), %k1
+
+	subq	$-(VEC_SIZE * 4), %rax
+	KORTEST	%k0, %k1
+
+	jnz	L(loopend)
+	subq	$(CHAR_PER_VEC * 4), %rdx
+	ja	L(loop)
+	mov	%rsi, %rax
+	ret
+
+L(loopend):
+
+	VPTESTN	%VMM(1), %VMM(1), %k2
+	KMOV	%k2, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x1)
+
+	KMOV	%k0, %VRCX
+	/* At this point, if k0 is non zero, null char must be in the
+	   second vector.  */
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x2)
+
+	VPTESTN	%VMM(3), %VMM(3), %k3
+	KMOV	%k3, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x3)
+	/* At this point null [w]char must be in the fourth vector so no
+	   need to check.  */
+	KMOV	%k1, %VRCX
+
+	/* Fourth, third, second vector terminating are pretty much
+	   same, implemented this way to avoid branching and reuse code
+	   from pre loop exit condition.  */
+L(ret_vec_x4):
+	bsf	%VRCX, %VRCX
+	subq	%rdi, %rax
+# ifdef USE_AS_WCSLEN
+	subq	$-(VEC_SIZE * 3), %rax
+	shrq	$2, %rax
+	addq	%rcx, %rax
+# else
+	leaq	(VEC_SIZE * 3)(%rcx, %rax), %rax
+# endif
+
+	cmpq	%rsi, %rax
+	cmovnb	%rsi, %rax
+	ret
+
+L(ret_vec_x3):
+	bsf	%VRCX, %VRCX
+	subq	%rdi, %rax
+# ifdef USE_AS_WCSLEN
+	subq	$-(VEC_SIZE * 2), %rax
+	shrq	$2, %rax
+	addq	%rcx, %rax
+# else
+	leaq	(VEC_SIZE * 2)(%rcx, %rax), %rax
+# endif
+	cmpq	%rsi, %rax
+	cmovnb	%rsi, %rax
+	ret
+
+L(ret_vec_x2):
+	subq	$-VEC_SIZE, %rax
+L(ret_vec_x1):
+	bsf	%VRCX, %VRCX
+	subq	%rdi, %rax
+# ifdef USE_AS_WCSLEN
+	shrq	$2, %rax
+# endif
+	addq	%rcx, %rax
+	cmpq	%rsi, %rax
+	cmovnb	%rsi, %rax
+	ret
+
+L(page_cross):
+	mov	%rdi, %rax
+	movl	%edi, %ecx
+	andl	$(VEC_SIZE - 1), %ecx
+# ifdef USE_AS_WCSLEN
+	sarl	$2, %ecx
+# endif
+	/* ecx contains number of w[char] to be skipped as a result
+	   of address alignment.  */
+	andq	$-VEC_SIZE, %rax
+	VPCMPEQ	(%rax), %VMM(0), %k0
+	KMOV	%k0, %VRDX
+	/* Ignore number of character for alignment adjustment.  */
+	shr	%cl, %VRDX
+	jnz	L(page_cross_end)
+	movl    $CHAR_PER_VEC, %eax
+	sub     %ecx, %eax
+	cmp	%rax, %rsi
+	ja	L(align_more)
+
+L(page_cross_end):
+	bsf	%VRDX, %VRAX
+	cmpq	%rsi, %rax
+	cmovnb	%esi, %eax
+	ret
+
+END (STRNLEN)
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcslen-evex512.S b/sysdeps/x86_64/multiarch/wcslen-evex512.S
index f59c372b78..aff288a66b 100644
--- a/sysdeps/x86_64/multiarch/wcslen-evex512.S
+++ b/sysdeps/x86_64/multiarch/wcslen-evex512.S
@@ -1,4 +1,8 @@
-#define STRLEN __wcslen_evex512
+#ifndef WCSLEN
+# define WCSLEN	__wcslen_evex512
+#endif
+
+#define STRLEN WCSLEN
 #define USE_AS_WCSLEN 1
 
 #include "strlen-evex512.S"
diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex512.S b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
index 73dcf2f210..1c37d74fc9 100644
--- a/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
+++ b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
@@ -1,5 +1,8 @@
-#define STRLEN __wcsnlen_evex512
+#ifndef WCSNLEN
+# define WCSNLEN	__wcsnlen_evex512
+#endif
+
+#define STRNLEN	WCSNLEN
 #define USE_AS_WCSLEN 1
-#define USE_AS_STRNLEN 1
 
-#include "strlen-evex512.S"
+#include "strnlen-evex512.S"
-- 
2.37.2