This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
[PATCH neleai/string-x64] Optimize strncmp with unaligned loads.
- From: Ondřej Bílka <neleai at seznam dot cz>
- To: libc-alpha at sourceware dot org
- Date: Sat, 20 Jun 2015 19:48:04 +0200
- Subject: [PATCH neleai/string-x64] Optimize strncmp with unaligned loads.
- Authentication-results: sourceware.org; auth=none
- References: <20150620083525 dot GA31992 at domone> <20150620102256 dot GA16801 at domone> <20150620103548 dot GA21670 at domone>
Hi,
This uses refactored strcmp to add strncmp functionality.
The basic idea is simple: at the start, initialize a register with 1<<(n-1) when n
< 64 in the unaligned header, and OR that with the masks. From what I tried, that
gives the smallest slowdown versus strcmp.
For larger sizes I modify the counter that triggers when we need to do an unaligned
load across a page boundary, so that it also triggers when we need to inspect fewer than 64 characters.
Performance with these tricks is nearly identical to that of strcmp.
OK to add this?
Profile graphs are here:
http://kam.mff.cuni.cz/~ondra/benchmark_string/strncmp_profile.html
* sysdeps/x86_64/multiarch/Makefile (routines): Add strncmp-avx2.S and
strncmp-sse2-unaligned.S.
* sysdeps/x86_64/multiarch/ifunc-impl-list.c: Add
__strncmp_sse2_unaligned and __strncmp_avx2.
* sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S: Add strncmp
functionality.
* sysdeps/x86_64/multiarch/strcmp.S: Adjust ifunc.
* sysdeps/x86_64/multiarch/strncmp-avx2.S: New file.
* sysdeps/x86_64/multiarch/strncmp-sse2-unaligned.S: Likewise.
---
sysdeps/x86_64/multiarch/Makefile | 4 +-
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 5 +-
sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S | 78 +++++++++++++++++++++--
sysdeps/x86_64/multiarch/strcmp.S | 37 ++++++-----
sysdeps/x86_64/multiarch/strncmp-avx2.S | 4 ++
sysdeps/x86_64/multiarch/strncmp-sse2-unaligned.S | 3 +
6 files changed, 104 insertions(+), 27 deletions(-)
create mode 100644 sysdeps/x86_64/multiarch/strncmp-avx2.S
create mode 100644 sysdeps/x86_64/multiarch/strncmp-sse2-unaligned.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index bf48283..95e0190 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -7,7 +7,7 @@ endif
ifeq ($(subdir),string)
sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
- strcmp-sse2-unaligned strncmp-ssse3 \
+ strcmp-sse2-unaligned strncmp-sse2-unaligned strncmp-ssse3 \
memcpy-ssse3 \
memcpy-sse2-unaligned mempcpy-ssse3 \
memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
@@ -30,7 +30,7 @@ CFLAGS-strspn-c.c += -msse4
endif
ifeq (yes,$(config-cflags-avx2))
-sysdep_routines += memset-avx2 strcpy-avx2 stpcpy-avx2 memcmp-avx2 strcmp-avx2
+sysdep_routines += memset-avx2 strcpy-avx2 stpcpy-avx2 memcmp-avx2 strcmp-avx2 strncmp-avx2
endif
endif
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 57ce237..51ff3ed 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -257,8 +257,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strncmp.S. */
IFUNC_IMPL (i, name, strncmp,
- IFUNC_IMPL_ADD (array, i, strncmp, HAS_SSE4_2,
- __strncmp_sse42)
+ IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_sse2_unaligned)
+ IFUNC_IMPL_ADD (array, i, strncmp, HAS_AVX2, __strncmp_avx2)
+
IFUNC_IMPL_ADD (array, i, strncmp, HAS_SSSE3,
__strncmp_ssse3)
IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_sse2))
diff --git a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
index 8258eb8..f3a0508 100644
--- a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
@@ -19,6 +19,14 @@
#include "sysdep.h"
ENTRY ( __strcmp_sse2_unaligned)
+#ifdef AS_STRNCMP
+ lea -1(%rdx), %r10
+ test %rdx, %rdx
+ je L(ret_zero)
+L(back_to_start):
+ xor %rdx, %rdx
+#endif
+
pxor %xmm7, %xmm7
movl %esi, %eax
andl $4095, %eax
@@ -29,20 +37,35 @@ ENTRY ( __strcmp_sse2_unaligned)
andl $4095, %eax
cmpl $4032, %eax
jg L(cross_page)
+#ifdef AS_STRNCMP
+ cmp $64, %r10
+ jae L(dont_set_mask)
+ bts %r10, %rdx
+L(dont_set_mask):
+#endif
+
movdqu (%rdi), %xmm1
movdqu (%rsi), %xmm0
pcmpeqb %xmm1, %xmm0
pminub %xmm1, %xmm0
pcmpeqb %xmm7, %xmm0
pmovmskb %xmm0, %eax
+#ifdef AS_STRNCMP
+ or %dx, %ax
+#else
test %eax, %eax
+#endif
je L(next_48_bytes)
bsf %eax, %edx
movzbl (%rdi, %rdx), %eax
movzbl (%rsi, %rdx), %edx
subl %edx, %eax
ret
-
+#ifdef AS_STRNCMP
+ L(ret_zero):
+ xor %eax, %eax
+ ret
+#endif
.p2align 4
L(next_48_bytes):
movdqu 16(%rdi), %xmm6
@@ -54,16 +77,19 @@ L(next_48_bytes):
pcmpeqb %xmm7, %xmm3
movdqu 48(%rdi), %xmm4
pcmpeqb %xmm5, %xmm2
- pmovmskb %xmm3, %edx
movdqu 48(%rsi), %xmm0
pminub %xmm5, %xmm2
pcmpeqb %xmm7, %xmm2
pcmpeqb %xmm4, %xmm0
pmovmskb %xmm2, %eax
+ salq $32, %rax
+#ifdef AS_STRNCMP
+ or %rdx, %rax
+#endif
+ pmovmskb %xmm3, %edx
sal $16, %edx
pminub %xmm4, %xmm0
pcmpeqb %xmm7, %xmm0
- salq $32, %rax
orq %rdx, %rax
pmovmskb %xmm0, %ecx
salq $48, %rcx
@@ -82,6 +108,10 @@ L(main_loop_header):
#endif
leaq 64(%rdi), %rdx
andq $-64, %rdx
+# ifdef AS_STRNCMP
+ addq %rdi, %r10
+ subq %rdx, %r10
+# endif
subq %rdi, %rdx
leaq (%rdi, %rdx), %rax
addq %rsi, %rdx
@@ -90,6 +120,15 @@ L(main_loop_header):
andl $4095, %ecx
sub %ecx, %esi
shr $6, %esi
+#ifdef AS_STRNCMP
+ mov %r10, %r9
+ addq %rdx, %r10
+ shr $6, %r9
+ cmp %r9, %rsi
+ jb L(dont_set_page_bound)
+ mov %r9, %rsi
+L(dont_set_page_bound):
+#endif
.p2align 4
L(loop):
@@ -111,7 +150,7 @@ L(back_to_loop):
addq $64, %rdx
vpmovmskb %ymm2, %edi
test %edi, %edi
- je .Lloop
+ je L(loop)
shl $32, %rdi
vpcmpeqb %ymm7, %ymm0, %ymm0
vpmovmskb %ymm0, %ecx
@@ -164,6 +203,14 @@ L(back_to_loop):
.p2align 4
L(loop_cross_page):
+#ifdef AS_STRNCMP
+ mov %r10, %r9
+ sub %rdx, %r9
+ cmp $64, %r9
+ jb L(prepare_back_to_start)
+#endif
+
+
mov %edx, %ecx
and $63, %ecx
neg %rcx
@@ -219,6 +266,14 @@ L(loop_cross_page):
#endif
mov %edx, %ecx
mov $63, %esi
+#ifdef AS_STRNCMP
+ shr $6, %r9
+ sub $1, %r9
+ cmp %r9, %rsi
+ jb L(dont_set_bound2)
+ mov %r9, %rsi
+L(dont_set_bound2):
+#endif
shrq %cl, %rdi
test %rdi, %rdi
je L(back_to_loop)
@@ -231,6 +286,18 @@ L(loop_cross_page):
subl %edx, %eax
ret
+#ifdef AS_STRNCMP
+L(prepare_back_to_start):
+# ifdef USE_AVX2
+ vzeroupper
+# endif
+ mov %r9, %r10
+ mov %rdx, %rsi
+ mov %rax, %rdi
+ jmp L(back_to_start)
+#endif
+
+
L(cross_page):
xorl %edx, %edx
jmp L(cross_page_loop_start)
@@ -244,6 +311,9 @@ L(cross_page_loop_start):
movzbl (%rsi, %rdx), %ecx
subl %ecx, %eax
jne L(different)
+ cmp %rdx, %r10
+ je L(different)
+
test %ecx, %ecx
jne L(cross_page_loop)
L(different):
diff --git a/sysdeps/x86_64/multiarch/strcmp.S b/sysdeps/x86_64/multiarch/strcmp.S
index 867e9d4..02d22d1 100644
--- a/sysdeps/x86_64/multiarch/strcmp.S
+++ b/sysdeps/x86_64/multiarch/strcmp.S
@@ -31,8 +31,8 @@
test %r9, %r9; \
je LABEL(strcmp_exitz); \
mov %r9, %r11
-
-# define STRCMP_SSE42 __strncmp_sse42
+# define STRCMP_AVX2 __strncmp_avx2
+# define STRCMP_SSE2_UNALIGNED __strncmp_sse2_unaligned
# define STRCMP_SSSE3 __strncmp_ssse3
# define STRCMP_SSE2 __strncmp_sse2
# define __GI_STRCMP __GI_strncmp
@@ -69,8 +69,9 @@
# define USE_AS_STRCMP
# define UPDATE_STRNCMP_COUNTER
# ifndef STRCMP
+# define STRCMP_AVX2 __strcmp_avx2
+# define STRCMP_SSE2_UNALIGNED __strcmp_sse2_unaligned
# define STRCMP strcmp
-# define STRCMP_SSE42 __strcmp_sse42
# define STRCMP_SSSE3 __strcmp_ssse3
# define STRCMP_SSE2 __strcmp_sse2
# define __GI_STRCMP __GI_strcmp
@@ -89,23 +90,23 @@ ENTRY(STRCMP)
jne 1f
call __init_cpu_features
1:
-#ifdef USE_AS_STRCMP
-# ifdef HAVE_AVX2_SUPPORT
+# if defined (USE_AS_STRCMP) || defined (USE_AS_STRNCMP)
+# ifdef HAVE_AVX2_SUPPORT
- leaq __strcmp_avx2(%rip), %rax
+ leaq STRCMP_AVX2(%rip), %rax
testl $bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip)
jnz 3f
-# endif
- leaq __strcmp_sse2_unaligned(%rip), %rax
+# endif
+ leaq STRCMP_SSE2_UNALIGNED(%rip), %rax
testl $bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip)
jnz 3f
-#else
+# else
testl $bit_Slow_SSE4_2, __cpu_features+FEATURE_OFFSET+index_Slow_SSE4_2(%rip)
jnz 2f
leaq STRCMP_SSE42(%rip), %rax
testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
jnz 3f
-#endif
+# endif
2: leaq STRCMP_SSSE3(%rip), %rax
testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
jnz 3f
@@ -166,15 +167,13 @@ END(__strncasecmp)
weak_alias (__strncasecmp, strncasecmp)
# endif
-# undef LABEL
-# define LABEL(l) .L##l##_sse42
-# define GLABEL(l) l##_sse42
-# define SECTION sse4.2
-# include "strcmp-sse42.S"
-
-
-# ifdef HAVE_AVX_SUPPORT
-# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# undef LABEL
+# define LABEL(l) .L##l##_sse42
+# define GLABEL(l) l##_sse42
+# define SECTION sse4.2
+# include "strcmp-sse42.S"
+# ifdef HAVE_AVX_SUPPORT
# define LABEL(l) .L##l##_avx
# define GLABEL(l) l##_avx
# define USE_AVX 1
diff --git a/sysdeps/x86_64/multiarch/strncmp-avx2.S b/sysdeps/x86_64/multiarch/strncmp-avx2.S
new file mode 100644
index 0000000..fe70abd
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncmp-avx2.S
@@ -0,0 +1,4 @@
+#define USE_AVX2
+#define AS_STRNCMP
+#define __strcmp_sse2_unaligned __strncmp_avx2
+#include "strcmp-sse2-unaligned.S"
diff --git a/sysdeps/x86_64/multiarch/strncmp-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strncmp-sse2-unaligned.S
new file mode 100644
index 0000000..d987b28
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncmp-sse2-unaligned.S
@@ -0,0 +1,3 @@
+#define AS_STRNCMP
+#define __strcmp_sse2_unaligned __strncmp_sse2_unaligned
+#include "strcmp-sse2-unaligned.S"
--
1.8.4.rc3