[PATCH v1 21/23] x86: Add AVX2 optimized str{n}casecmp

Noah Goldstein goldstein.w.n@gmail.com
Wed Mar 23 21:57:42 GMT 2022


geometric_mean(N=40) of all benchmarks AVX2 / SSE42: .702

All string/memory tests pass.
---
Geometric Mean of N=40 runs; All functions page aligned
length, align1, align2, max_char, AVX2 Time / SSE42 Time
     1,      1,      1,      127,                  1.032
     2,      2,      2,      127,                  1.006
     3,      3,      3,      127,                  1.009
     4,      4,      4,      127,                  0.964
     5,      5,      5,      127,                  0.929
     6,      6,      6,      127,                   0.94
     7,      7,      7,      127,                  0.958
     8,      0,      0,      127,                  0.988
     9,      1,      1,      127,                   0.99
    10,      2,      2,      127,                  0.995
    11,      3,      3,      127,                  0.991
    12,      4,      4,      127,                  0.975
    13,      5,      5,      127,                  0.943
    14,      6,      6,      127,                  0.955
    15,      7,      7,      127,                  0.988
     4,      0,      0,      127,                  0.983
     4,      0,      0,      254,                  0.978
     8,      0,      0,      254,                  0.989
    16,      0,      0,      127,                  0.792
    16,      0,      0,      254,                  0.774
    32,      0,      0,      127,                  0.568
    32,      0,      0,      254,                  0.555
    64,      0,      0,      127,                  0.561
    64,      0,      0,      254,                  0.561
   128,      0,      0,      127,                  0.574
   128,      0,      0,      254,                  0.577
   256,      0,      0,      127,                  0.561
   256,      0,      0,      254,                  0.552
   512,      0,      0,      127,                   0.59
   512,      0,      0,      254,                  0.594
  1024,      0,      0,      127,                  0.528
  1024,      0,      0,      254,                  0.517
    16,      1,      2,      127,                  0.758
    16,      2,      1,      254,                  0.748
    32,      2,      4,      127,                  0.419
    32,      4,      2,      254,                  0.428
    64,      3,      6,      127,                  0.472
    64,      6,      3,      254,                  0.464
   128,      4,      0,      127,                  0.534
   128,      0,      4,      254,                   0.53
   256,      5,      2,      127,                  0.679
   256,      2,      5,      254,                  0.676
   512,      6,      4,      127,                  0.525
   512,      4,      6,      254,                  0.523
  1024,      7,      6,      127,                  0.518
  1024,      6,      7,      254,                  0.505

 sysdeps/x86_64/multiarch/Makefile             |   4 +
 sysdeps/x86_64/multiarch/ifunc-impl-list.c    |  28 +++
 sysdeps/x86_64/multiarch/ifunc-strcasecmp.h   |  12 +
 .../x86_64/multiarch/strcasecmp_l-avx2-rtm.S  |  15 ++
 sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S  |  23 ++
 sysdeps/x86_64/multiarch/strcmp-avx2.S        | 230 +++++++++++++++---
 .../x86_64/multiarch/strncase_l-avx2-rtm.S    |  16 ++
 sysdeps/x86_64/multiarch/strncase_l-avx2.S    |  27 ++
 8 files changed, 324 insertions(+), 31 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
 create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
 create mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
 create mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx2.S

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index e7b413edad..06e1848823 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -55,6 +55,8 @@ sysdep_routines += \
   stpncpy-sse2-unaligned \
   stpncpy-ssse3 \
   strcasecmp_l-avx \
+  strcasecmp_l-avx2 \
+  strcasecmp_l-avx2-rtm \
   strcasecmp_l-sse2 \
   strcasecmp_l-sse4_2 \
   strcasecmp_l-ssse3 \
@@ -93,6 +95,8 @@ sysdep_routines += \
   strlen-evex \
   strlen-sse2 \
   strncase_l-avx \
+  strncase_l-avx2 \
+  strncase_l-avx2-rtm \
   strncase_l-sse2 \
   strncase_l-sse4_2 \
   strncase_l-ssse3 \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index a594f4176e..3c556d07ac 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -436,6 +436,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c.  */
   IFUNC_IMPL (i, name, strcasecmp,
+	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+			      CPU_FEATURE_USABLE (AVX2),
+			      __strcasecmp_avx2)
+	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __strcasecmp_avx2_rtm)
 	      IFUNC_IMPL_ADD (array, i, strcasecmp,
 			      CPU_FEATURE_USABLE (AVX),
 			      __strcasecmp_avx)
@@ -449,6 +456,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c.  */
   IFUNC_IMPL (i, name, strcasecmp_l,
+	      IFUNC_IMPL_ADD (array, i, strcasecmp_l,
+			      CPU_FEATURE_USABLE (AVX2),
+			      __strcasecmp_l_avx2)
+	      IFUNC_IMPL_ADD (array, i, strcasecmp_l,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __strcasecmp_l_avx2_rtm)
 	      IFUNC_IMPL_ADD (array, i, strcasecmp_l,
 			      CPU_FEATURE_USABLE (AVX),
 			      __strcasecmp_l_avx)
@@ -576,6 +590,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strncase_l.c.  */
   IFUNC_IMPL (i, name, strncasecmp,
+	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+			      CPU_FEATURE_USABLE (AVX2),
+			      __strncasecmp_avx2)
+	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __strncasecmp_avx2_rtm)
 	      IFUNC_IMPL_ADD (array, i, strncasecmp,
 			      CPU_FEATURE_USABLE (AVX),
 			      __strncasecmp_avx)
@@ -590,6 +611,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strncase_l.c.  */
   IFUNC_IMPL (i, name, strncasecmp_l,
+	      IFUNC_IMPL_ADD (array, i, strncasecmp_l,
+			      CPU_FEATURE_USABLE (AVX2),
+			      __strncasecmp_l_avx2)
+	      IFUNC_IMPL_ADD (array, i, strncasecmp_l,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __strncasecmp_l_avx2_rtm)
 	      IFUNC_IMPL_ADD (array, i, strncasecmp_l,
 			      CPU_FEATURE_USABLE (AVX),
 			      __strncasecmp_l_avx)
diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
index 9e3cc61ac0..c4de111fd0 100644
--- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
+++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
@@ -23,12 +23,24 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
 
 static inline void *
 IFUNC_SELECTOR (void)
 {
   const struct cpu_features* cpu_features = __get_cpu_features ();
 
+  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+      && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+    {
+      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+        return OPTIMIZE (avx2_rtm);
+
+      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+        return OPTIMIZE (avx2);
+    }
+
   if (CPU_FEATURE_USABLE_P (cpu_features, AVX))
     return OPTIMIZE (avx);
 
diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
new file mode 100644
index 0000000000..09957fc3c5
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
@@ -0,0 +1,15 @@
+#ifndef STRCMP
+# define STRCMP	__strcasecmp_l_avx2_rtm
+#endif
+
+#define _GLABEL(x)	x ## _rtm
+#define GLABEL(x)	_GLABEL(x)
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN	\
+	ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN	jmp L(return_vzeroupper)
+
+#define SECTION(p)	p##.avx.rtm
+
+#include "strcasecmp_l-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
new file mode 100644
index 0000000000..e2762f2a22
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
@@ -0,0 +1,23 @@
+/* strcasecmp_l optimized with AVX2.
+   Copyright (C) 2017-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef STRCMP
+# define STRCMP	__strcasecmp_l_avx2
+#endif
+#define USE_AS_STRCASECMP_L
+#include "strcmp-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
index 86a86b68e3..eeb90a0da6 100644
--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
@@ -20,6 +20,10 @@
 
 # include <sysdep.h>
 
+# if defined USE_AS_STRCASECMP_L
+#  include "locale-defines.h"
+# endif
+
 # ifndef STRCMP
 #  define STRCMP	__strcmp_avx2
 # endif
@@ -74,13 +78,88 @@
 #  define VEC_OFFSET	(-VEC_SIZE)
 # endif
 
+# ifdef USE_AS_STRCASECMP_L
+#  define BYTE_LOOP_REG	OFFSET_REG
+# else
+#  define BYTE_LOOP_REG	ecx
+# endif
+
+# ifdef USE_AS_STRCASECMP_L
+#  ifdef USE_AS_STRNCMP
+#   define STRCASECMP	__strncasecmp_avx2
+#   define LOCALE_REG	rcx
+#   define LOCALE_REG_LP	RCX_LP
+#   define STRCASECMP_NONASCII	__strncasecmp_l_nonascii
+#  else
+#   define STRCASECMP	__strcasecmp_avx2
+#   define LOCALE_REG	rdx
+#   define LOCALE_REG_LP	RDX_LP
+#   define STRCASECMP_NONASCII	__strcasecmp_l_nonascii
+#  endif
+# endif
+
 # define xmmZERO	xmm15
 # define ymmZERO	ymm15
 
+# define LCASE_MIN_ymm	%ymm10
+# define LCASE_MAX_ymm	%ymm11
+# define CASE_ADD_ymm	%ymm12
+
+# define LCASE_MIN_xmm	%xmm10
+# define LCASE_MAX_xmm	%xmm11
+# define CASE_ADD_xmm	%xmm12
+
+	/* r11 is never used elsewhere so this is safe to maintain.  */
+# define TOLOWER_BASE	%r11
+
 # ifndef SECTION
 #  define SECTION(p)	p##.avx
 # endif
 
+# ifdef USE_AS_STRCASECMP_L
+#  define REG(x, y) x ## y
+#  define TOLOWER(reg1_in, reg1_out, reg2_in, reg2_out, ext)			\
+	vpaddb	REG(LCASE_MIN_, ext), reg1_in, REG(%ext, 8);				\
+	vpaddb	REG(LCASE_MIN_, ext), reg2_in, REG(%ext, 9);				\
+	vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 8), REG(%ext, 8);			\
+	vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 9), REG(%ext, 9);			\
+	vpandn	REG(CASE_ADD_, ext), REG(%ext, 8), REG(%ext, 8);			\
+	vpandn	REG(CASE_ADD_, ext), REG(%ext, 9), REG(%ext, 9);			\
+	vpaddb	REG(%ext, 8), reg1_in, reg1_out;							\
+	vpaddb	REG(%ext, 9), reg2_in, reg2_out
+
+#  define TOLOWER_gpr(src, dst)	movl (TOLOWER_BASE, src, 4), dst
+#  define TOLOWER_ymm(...)	TOLOWER(__VA_ARGS__, ymm)
+#  define TOLOWER_xmm(...)	TOLOWER(__VA_ARGS__, xmm)
+
+#  define CMP_R1_R2(s1_reg, s2_reg, scratch_reg, reg_out, ext)			\
+	TOLOWER	(s1_reg, scratch_reg, s2_reg, s2_reg, ext);					\
+	VPCMPEQ	scratch_reg, s2_reg, reg_out
+
+#  define CMP_R1_S2(s1_reg, s2_mem, scratch_reg, reg_out, ext)			\
+	VMOVU	s2_mem, reg_out;											\
+	CMP_R1_R2(s1_reg, reg_out, scratch_reg, reg_out, ext)
+
+#  define CMP_R1_R2_ymm(...) CMP_R1_R2(__VA_ARGS__, ymm)
+#  define CMP_R1_R2_xmm(...) CMP_R1_R2(__VA_ARGS__, xmm)
+
+#  define CMP_R1_S2_ymm(...) CMP_R1_S2(__VA_ARGS__, ymm)
+#  define CMP_R1_S2_xmm(...) CMP_R1_S2(__VA_ARGS__, xmm)
+
+# else
+#  define TOLOWER_gpr(...)
+#  define TOLOWER_ymm(...)
+#  define TOLOWER_xmm(...)
+
+#  define CMP_R1_R2_ymm(s1_reg, s2_reg, scratch_reg, reg_out)			\
+	VPCMPEQ	s2_reg, s1_reg, reg_out
+
+#  define CMP_R1_R2_xmm(...) CMP_R1_R2_ymm(__VA_ARGS__)
+
+#  define CMP_R1_S2_ymm(...) CMP_R1_R2_ymm(__VA_ARGS__)
+#  define CMP_R1_S2_xmm(...) CMP_R1_R2_xmm(__VA_ARGS__)
+# endif
+
 /* Warning!
            wcscmp/wcsncmp have to use SIGNED comparison for elements.
            strcmp/strncmp have to use UNSIGNED comparison for elements.
@@ -102,7 +181,45 @@
    returned.  */
 
 	.section SECTION(.text), "ax", @progbits
-ENTRY(STRCMP)
+	.align	16
+	.type	STRCMP, @function
+	.globl	STRCMP
+	.hidden	STRCMP
+
+# ifndef GLABEL
+#  define GLABEL(...)	__VA_ARGS__
+# endif
+
+# ifdef USE_AS_STRCASECMP_L
+ENTRY (GLABEL(STRCASECMP))
+	movq	__libc_tsd_LOCALE@gottpoff(%rip), %rax
+	mov	%fs:(%rax), %LOCALE_REG_LP
+
+	/* Either 1 or 5 bytes (depending on whether CET is enabled).  */
+	.p2align 4
+END (GLABEL(STRCASECMP))
+	/* FALLTHROUGH to strcasecmp/strncasecmp_l.  */
+# endif
+
+	.p2align 4
+STRCMP:
+	cfi_startproc
+	_CET_ENDBR
+	CALL_MCOUNT
+
+# if defined USE_AS_STRCASECMP_L
+	/* We have to fall back on the C implementation for locales with
+	   encodings not matching ASCII for single bytes.  */
+#  if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+	mov	LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP
+#  else
+	mov	(%LOCALE_REG), %RAX_LP
+#  endif
+	testl	$1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
+	jne	STRCASECMP_NONASCII
+	leaq	_nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
+# endif
+
 # ifdef USE_AS_STRNCMP
 #  ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
@@ -128,6 +245,30 @@ ENTRY(STRCMP)
 #  endif
 # endif
 	vpxor	%xmmZERO, %xmmZERO, %xmmZERO
+# if defined USE_AS_STRCASECMP_L
+	.section .rodata.cst32, "aM", @progbits, 32
+	.align	32
+L(lcase_min):
+	.quad	0x3f3f3f3f3f3f3f3f
+	.quad	0x3f3f3f3f3f3f3f3f
+	.quad	0x3f3f3f3f3f3f3f3f
+	.quad	0x3f3f3f3f3f3f3f3f
+L(lcase_max):
+	.quad	0x9999999999999999
+	.quad	0x9999999999999999
+	.quad	0x9999999999999999
+	.quad	0x9999999999999999
+L(case_add):
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+	.previous
+
+	vmovdqa	L(lcase_min)(%rip), LCASE_MIN_ymm
+	vmovdqa	L(lcase_max)(%rip), LCASE_MAX_ymm
+	vmovdqa	L(case_add)(%rip), CASE_ADD_ymm
+# endif
 	movl	%edi, %eax
 	orl	%esi, %eax
 	sall	$20, %eax
@@ -138,8 +279,10 @@ ENTRY(STRCMP)
 L(no_page_cross):
 	/* Safe to compare 4x vectors.  */
 	VMOVU	(%rdi), %ymm0
-	/* 1s where s1 and s2 equal.  */
-	VPCMPEQ	(%rsi), %ymm0, %ymm1
+	/* 1s where s1 and s2 equal. Just VPCMPEQ if it's not strcasecmp.
+	   Otherwise converts ymm0 and the load from rsi to lowercase. ymm2
+	   is scratch and ymm1 is the return.  */
+	CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1)
 	/* 1s at null CHAR.  */
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	/* 1s where s1 and s2 equal AND not null CHAR.  */
@@ -172,6 +315,8 @@ L(return_vec_0):
 # else
 	movzbl	(%rdi, %rcx), %eax
 	movzbl	(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 # endif
 L(ret0):
@@ -207,6 +352,8 @@ L(one_or_less):
 #  else
 	movzbl	(%rdi), %eax
 	movzbl	(%rsi), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 #  endif
 L(ret1):
@@ -234,6 +381,8 @@ L(return_vec_1):
 # else
 	movzbl	VEC_SIZE(%rdi, %rcx), %eax
 	movzbl	VEC_SIZE(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 # endif
 L(ret2):
@@ -265,6 +414,8 @@ L(return_vec_2):
 # else
 	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
 	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 # endif
 L(ret3):
@@ -285,6 +436,8 @@ L(return_vec_3):
 #  else
 	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
 	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 #  endif
 L(ret4):
@@ -295,7 +448,7 @@ L(ret4):
 L(more_3x_vec):
 	/* Safe to compare 4x vectors.  */
 	VMOVU	VEC_SIZE(%rdi), %ymm0
-	VPCMPEQ	VEC_SIZE(%rsi), %ymm0, %ymm1
+	CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1)
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	vpandn	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %ecx
@@ -308,7 +461,7 @@ L(more_3x_vec):
 # endif
 
 	VMOVU	(VEC_SIZE * 2)(%rdi), %ymm0
-	VPCMPEQ	(VEC_SIZE * 2)(%rsi), %ymm0, %ymm1
+	CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 2)(%rsi), %ymm2, %ymm1)
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	vpandn	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %ecx
@@ -316,7 +469,7 @@ L(more_3x_vec):
 	jnz	L(return_vec_2)
 
 	VMOVU	(VEC_SIZE * 3)(%rdi), %ymm0
-	VPCMPEQ	(VEC_SIZE * 3)(%rsi), %ymm0, %ymm1
+	CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 3)(%rsi), %ymm2, %ymm1)
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	vpandn	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %ecx
@@ -391,12 +544,10 @@ L(loop_skip_page_cross_check):
 	VMOVA	(VEC_SIZE * 3)(%rdi), %ymm6
 
 	/* ymm1 all 1s where s1 and s2 equal. All 0s otherwise.  */
-	VPCMPEQ	(VEC_SIZE * 0)(%rsi), %ymm0, %ymm1
-
-	VPCMPEQ	(VEC_SIZE * 1)(%rsi), %ymm2, %ymm3
-	VPCMPEQ	(VEC_SIZE * 2)(%rsi), %ymm4, %ymm5
-	VPCMPEQ	(VEC_SIZE * 3)(%rsi), %ymm6, %ymm7
-
+	CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 0)(%rsi), %ymm3, %ymm1)
+	CMP_R1_S2_ymm (%ymm2, (VEC_SIZE * 1)(%rsi), %ymm5, %ymm3)
+	CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5)
+	CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7)
 
 	/* If any mismatches or null CHAR then 0 CHAR, otherwise non-
 	   zero.  */
@@ -465,6 +616,8 @@ L(return_vec_2_3_end):
 # else
 	movzbl	(VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %eax
 	movzbl	(VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -508,6 +661,8 @@ L(return_vec_0_end):
 # else
 	movzbl	(%rdi, %rcx), %eax
 	movzbl	(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -530,6 +685,8 @@ L(return_vec_1_end):
 #  else
 	movzbl	VEC_SIZE(%rdi, %rcx), %eax
 	movzbl	VEC_SIZE(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -556,6 +713,8 @@ L(return_vec_2_end):
 # else
 	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
 	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -583,7 +742,7 @@ L(page_cross_during_loop):
 	jle	L(less_1x_vec_till_page_cross)
 
 	VMOVA	(%rdi), %ymm0
-	VPCMPEQ	(%rsi), %ymm0, %ymm1
+	CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1)
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	vpandn	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %ecx
@@ -605,7 +764,7 @@ L(less_1x_vec_till_page_cross):
 	   here, it means the previous page (rdi - VEC_SIZE) has already
 	   been loaded earlier so must be valid.  */
 	VMOVU	-VEC_SIZE(%rdi, %rax), %ymm0
-	VPCMPEQ	-VEC_SIZE(%rsi, %rax), %ymm0, %ymm1
+	CMP_R1_S2_ymm (%ymm0, -VEC_SIZE(%rsi, %rax), %ymm2, %ymm1)
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	vpandn	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %ecx
@@ -647,6 +806,8 @@ L(return_page_cross_cmp_mem):
 # else
 	movzbl	VEC_OFFSET(%rdi, %rcx), %eax
 	movzbl	VEC_OFFSET(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -673,7 +834,7 @@ L(more_2x_vec_till_page_cross):
 	   iteration here.  */
 
 	VMOVU	VEC_SIZE(%rdi), %ymm0
-	VPCMPEQ	VEC_SIZE(%rsi), %ymm0, %ymm1
+	CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1)
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	vpandn	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %ecx
@@ -689,7 +850,7 @@ L(more_2x_vec_till_page_cross):
 
 	/* Safe to include comparisons from lower bytes.  */
 	VMOVU	-(VEC_SIZE * 2)(%rdi, %rax), %ymm0
-	VPCMPEQ	-(VEC_SIZE * 2)(%rsi, %rax), %ymm0, %ymm1
+	CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 2)(%rsi, %rax), %ymm2, %ymm1)
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	vpandn	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %ecx
@@ -697,7 +858,7 @@ L(more_2x_vec_till_page_cross):
 	jnz	L(return_vec_page_cross_0)
 
 	VMOVU	-(VEC_SIZE * 1)(%rdi, %rax), %ymm0
-	VPCMPEQ	-(VEC_SIZE * 1)(%rsi, %rax), %ymm0, %ymm1
+	CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 1)(%rsi, %rax), %ymm2, %ymm1)
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	vpandn	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %ecx
@@ -715,8 +876,8 @@ L(more_2x_vec_till_page_cross):
 	VMOVA	(VEC_SIZE * 2)(%rdi), %ymm4
 	VMOVA	(VEC_SIZE * 3)(%rdi), %ymm6
 
-	VPCMPEQ	(VEC_SIZE * 2)(%rsi), %ymm4, %ymm5
-	VPCMPEQ	(VEC_SIZE * 3)(%rsi), %ymm6, %ymm7
+	CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5)
+	CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7)
 	vpand	%ymm4, %ymm5, %ymm5
 	vpand	%ymm6, %ymm7, %ymm7
 	VPMINU	%ymm5, %ymm7, %ymm7
@@ -767,6 +928,8 @@ L(return_vec_page_cross_1):
 # else
 	movzbl	VEC_OFFSET(%rdi, %rcx), %eax
 	movzbl	VEC_OFFSET(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -822,7 +985,7 @@ L(page_cross):
 L(page_cross_loop):
 
 	VMOVU	(%rdi, %OFFSET_REG64), %ymm0
-	VPCMPEQ	(%rsi, %OFFSET_REG64), %ymm0, %ymm1
+	CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1)
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	vpandn	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %ecx
@@ -840,11 +1003,11 @@ L(page_cross_loop):
 	subl	%eax, %OFFSET_REG
 	/* OFFSET_REG has distance to page cross - VEC_SIZE. Guranteed
 	   to not cross page so is safe to load. Since we have already
-	   loaded at least 1 VEC from rsi it is also guranteed to be safe.
-	 */
+	   loaded at least 1 VEC from rsi it is also guaranteed to be
+	   safe.  */
 
 	VMOVU	(%rdi, %OFFSET_REG64), %ymm0
-	VPCMPEQ	(%rsi, %OFFSET_REG64), %ymm0, %ymm1
+	CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1)
 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
 	vpandn	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %ecx
@@ -877,6 +1040,8 @@ L(ret_vec_page_cross_cont):
 # else
 	movzbl	(%rdi, %rcx), %eax
 	movzbl	(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -930,7 +1095,7 @@ L(less_1x_vec_till_page):
 	ja	L(less_16_till_page)
 
 	VMOVU	(%rdi), %xmm0
-	VPCMPEQ	(%rsi), %xmm0, %xmm1
+	CMP_R1_S2_xmm (%xmm0, (%rsi), %xmm2, %xmm1)
 	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
 	vpandn	%xmm1, %xmm2, %xmm1
 	vpmovmskb %ymm1, %ecx
@@ -948,7 +1113,7 @@ L(less_1x_vec_till_page):
 # endif
 
 	VMOVU	(%rdi, %OFFSET_REG64), %xmm0
-	VPCMPEQ	(%rsi, %OFFSET_REG64), %xmm0, %xmm1
+	CMP_R1_S2_xmm (%xmm0, (%rsi, %OFFSET_REG64), %xmm2, %xmm1)
 	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
 	vpandn	%xmm1, %xmm2, %xmm1
 	vpmovmskb %ymm1, %ecx
@@ -986,7 +1151,7 @@ L(less_16_till_page):
 	vmovq	(%rdi), %xmm0
 	vmovq	(%rsi), %xmm1
 	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
-	VPCMPEQ	%xmm1, %xmm0, %xmm1
+	CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
 	vpandn	%xmm1, %xmm2, %xmm1
 	vpmovmskb %ymm1, %ecx
 	incb	%cl
@@ -1006,7 +1171,7 @@ L(less_16_till_page):
 	vmovq	(%rdi, %OFFSET_REG64), %xmm0
 	vmovq	(%rsi, %OFFSET_REG64), %xmm1
 	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
-	VPCMPEQ	%xmm1, %xmm0, %xmm1
+	CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
 	vpandn	%xmm1, %xmm2, %xmm1
 	vpmovmskb %ymm1, %ecx
 	incb	%cl
@@ -1062,7 +1227,7 @@ L(ret_less_8_wcs):
 	vmovd	(%rdi), %xmm0
 	vmovd	(%rsi), %xmm1
 	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
-	VPCMPEQ	%xmm1, %xmm0, %xmm1
+	CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
 	vpandn	%xmm1, %xmm2, %xmm1
 	vpmovmskb %ymm1, %ecx
 	subl	$0xf, %ecx
@@ -1081,7 +1246,7 @@ L(ret_less_8_wcs):
 	vmovd	(%rdi, %OFFSET_REG64), %xmm0
 	vmovd	(%rsi, %OFFSET_REG64), %xmm1
 	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
-	VPCMPEQ	%xmm1, %xmm0, %xmm1
+	CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
 	vpandn	%xmm1, %xmm2, %xmm1
 	vpmovmskb %ymm1, %ecx
 	subl	$0xf, %ecx
@@ -1115,7 +1280,9 @@ L(less_4_till_page):
 L(less_4_loop):
 	movzbl	(%rdi), %eax
 	movzbl	(%rsi, %rdi), %ecx
-	subl	%ecx, %eax
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %BYTE_LOOP_REG)
+	subl	%BYTE_LOOP_REG, %eax
 	jnz	L(ret_less_4_loop)
 	testl	%ecx, %ecx
 	jz	L(ret_zero_4_loop)
@@ -1142,5 +1309,6 @@ L(ret_less_4_loop):
 	subl	%r8d, %eax
 	ret
 # endif
-END(STRCMP)
+	cfi_endproc
+	.size	STRCMP, .-STRCMP
 #endif
diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
new file mode 100644
index 0000000000..e194936c36
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
@@ -0,0 +1,16 @@
+#ifndef STRCMP
+# define STRCMP	__strncasecmp_l_avx2_rtm
+#endif
+
+#define _GLABEL(x)	x ## _rtm
+#define GLABEL(x)	_GLABEL(x)
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN	\
+	ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN	jmp L(return_vzeroupper)
+
+#define SECTION(p)	p##.avx.rtm
+#define OVERFLOW_STRCMP	__strcasecmp_avx2_rtm
+
+#include "strncase_l-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx2.S b/sysdeps/x86_64/multiarch/strncase_l-avx2.S
new file mode 100644
index 0000000000..29afccbcc5
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncase_l-avx2.S
@@ -0,0 +1,27 @@
+/* strncasecmp_l optimized with AVX2.
+   Copyright (C) 2017-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef STRCMP
+# define STRCMP	__strncasecmp_l_avx2
+#endif
+#define USE_AS_STRCASECMP_L
+#define USE_AS_STRNCMP
+#ifndef OVERFLOW_STRCMP
+# define OVERFLOW_STRCMP	__strcasecmp_avx2
+#endif
+#include "strcmp-avx2.S"
-- 
2.25.1



More information about the Libc-alpha mailing list