This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
[PATCH 1/2] Unaligned strcmp.
- From: Ondřej Bílka <neleai at seznam dot cz>
- To: libc-alpha at sourceware dot org
- Date: Sat, 31 Aug 2013 23:09:14 +0200
- Subject: [PATCH 1/2] Unaligned strcmp.
- Authentication-results: sourceware.org; auth=none
Hi,
This patch improves strcmp performance by around 20% for i7 processors
on gcc workloads, and 10% for bulldozer. For big sizes it is around 50%
faster than the sse4.2 variant.
Benchmarks are here:
http://kam.mff.cuni.cz/~ondra/benchmark_string/strcmp_profile.html
http://kam.mff.cuni.cz/~ondra/benchmark_string/strcmp_profile310813.tar.bz2
Also, on old athlons the unaligned implementation is faster than the
current sse2 one, so we should switch to it.
For machines with ssse3, the implementation also needs to be optimized. I
looked at the code size and we have 16 branches, each 288 bytes large, which
contain only 16-byte loops.
As my 64-byte loop fits into 115 bytes there is plenty of room for
improvement.
Also, we can reduce the number of branches from 16 to 9 by switching the
arguments and multiplying the result by 1/-1 at the end.
This is the first part of the patch; the second part would add strncmp macros.
The strcmp.S ifunc selection would then need rework, as I do not have an
effective strcasecmp yet. For now I add a macro to avoid sse4_2 for strcmp.
There is a possible improvement in the cross-page case, where I do a
byte-by-byte loop.
Passes tests, OK to commit?
Ondra
* sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S: New file.
* sysdeps/x86_64/multiarch/ifunc-impl-list.c (__libc_ifunc_impl_list):
Add ifunc.
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
strcmp-sse2-unaligned.
* sysdeps/x86_64/multiarch/strcmp.S (strcmp): Add ifunc.
---
sysdeps/x86_64/multiarch/Makefile | 6 +-
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 1 +
sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S | 215 +++++++++++++++++++++++
sysdeps/x86_64/multiarch/strcmp.S | 7 +
4 files changed, 227 insertions(+), 2 deletions(-)
create mode 100644 sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 203d16e..5ab950a 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -6,8 +6,10 @@ endif
ifeq ($(subdir),string)
-sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
- strend-sse4 memcmp-sse4 memcpy-ssse3 memcpy-sse2-unaligned mempcpy-ssse3 \
+sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
+ strcmp-sse2-unaligned strncmp-ssse3 \
+ strend-sse4 memcmp-sse4 memcpy-ssse3 \
+ memcpy-sse2-unaligned mempcpy-ssse3 \
memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \
strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index d0992e1..f8756d7 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -118,6 +118,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL (i, name, strcmp,
IFUNC_IMPL_ADD (array, i, strcmp, HAS_SSE4_2, __strcmp_sse42)
IFUNC_IMPL_ADD (array, i, strcmp, HAS_SSSE3, __strcmp_ssse3)
+ IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2))
/* Support sysdeps/x86_64/multiarch/strcpy.S. */
diff --git a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
new file mode 100644
index 0000000..87338f0
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
@@ -0,0 +1,215 @@
+/* strcmp with unaligned loads
+ Copyright (C) 2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define L(x) .L##x
+#define ALIGN(x) .p2align x
+#include "sysdep.h"
+
+ENTRY ( __strcmp_sse2_unaligned)
+ movl %edi, %eax
+ xorl %edx, %edx
+ pxor %xmm7, %xmm7
+ orl %esi, %eax
+ andl $4095, %eax
+ cmpl $4032, %eax
+ jg L(cross_page)
+ movdqu (%rdi), %xmm1
+ movdqu (%rsi), %xmm0
+ pcmpeqb %xmm1, %xmm0
+ pminub %xmm1, %xmm0
+ pxor %xmm1, %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ testq %rax, %rax
+ je L(next_48_bytes)
+L(return):
+ bsfq %rax, %rdx
+ movzbl (%rdi, %rdx), %eax
+ movzbl (%rsi, %rdx), %edx
+ subl %edx, %eax
+ ret
+
+ ALIGN (4)
+L(next_48_bytes):
+ movdqu 16(%rdi), %xmm6
+ movdqu 16(%rsi), %xmm3
+ movdqu 32(%rdi), %xmm5
+ pcmpeqb %xmm6, %xmm3
+ movdqu 32(%rsi), %xmm2
+ pminub %xmm6, %xmm3
+ pcmpeqb %xmm1, %xmm3
+ movdqu 48(%rdi), %xmm4
+ pcmpeqb %xmm5, %xmm2
+ pmovmskb %xmm3, %edx
+ movdqu 48(%rsi), %xmm0
+ pminub %xmm5, %xmm2
+ pcmpeqb %xmm1, %xmm2
+ pcmpeqb %xmm4, %xmm0
+ pmovmskb %xmm2, %eax
+ salq $16, %rdx
+ pminub %xmm4, %xmm0
+ pcmpeqb %xmm1, %xmm0
+ salq $32, %rax
+ orq %rdx, %rax
+ pmovmskb %xmm0, %ecx
+ movq %rcx, %rdx
+ salq $48, %rdx
+ orq %rdx, %rax
+ jne L(return)
+L(main_loop_header):
+ leaq 64(%rdi), %rdx
+ movl $4096, %ecx
+ pxor %xmm9, %xmm9
+ andq $-64, %rdx
+ subq %rdi, %rdx
+ leaq (%rdi, %rdx), %rax
+ addq %rsi, %rdx
+ movq %rdx, %rsi
+ andl $4095, %esi
+ subq %rsi, %rcx
+ shrq $6, %rcx
+ movq %rcx, %rsi
+ jmp L(loop_start)
+
+ ALIGN (4)
+L(loop):
+ addq $64, %rax
+ addq $64, %rdx
+L(loop_start):
+ testq %rsi, %rsi
+ leaq -1(%rsi), %rsi
+ je L(loop_cross_page)
+L(back_to_loop):
+ movdqu (%rdx), %xmm0
+ movdqu 16(%rdx), %xmm1
+ movdqa (%rax), %xmm2
+ movdqa 16(%rax), %xmm3
+ pcmpeqb %xmm2, %xmm0
+ movdqu 32(%rdx), %xmm5
+ pcmpeqb %xmm3, %xmm1
+ pminub %xmm2, %xmm0
+ movdqu 48(%rdx), %xmm6
+ pminub %xmm3, %xmm1
+ movdqa 32(%rax), %xmm2
+ pminub %xmm1, %xmm0
+ movdqa 48(%rax), %xmm3
+ pcmpeqb %xmm2, %xmm5
+ pcmpeqb %xmm3, %xmm6
+ pminub %xmm2, %xmm5
+ pminub %xmm3, %xmm6
+ pminub %xmm5, %xmm0
+ pminub %xmm6, %xmm0
+
+
+ pcmpeqb %xmm7, %xmm0
+ pmovmskb %xmm0, %ecx
+ testl %ecx, %ecx
+ je L(loop)
+ pcmpeqb %xmm7, %xmm5
+ movdqu (%rdx), %xmm0
+ pcmpeqb %xmm7, %xmm1
+ movdqa (%rax), %xmm2
+ pcmpeqb %xmm2, %xmm0
+ pminub %xmm2, %xmm0
+ pcmpeqb %xmm7, %xmm6
+ pcmpeqb %xmm7, %xmm0
+
+ pmovmskb %xmm1, %ecx
+ pmovmskb %xmm5, %r8d
+ pmovmskb %xmm0, %edi
+ salq $16, %rcx
+ salq $32, %r8
+ pmovmskb %xmm6, %esi
+ orq %r8, %rcx
+ orq %rdi, %rcx
+ salq $48, %rsi
+ orq %rsi, %rcx
+ bsfq %rcx, %rcx
+ movzbl (%rax, %rcx), %eax
+ movzbl (%rdx, %rcx), %edx
+ subl %edx, %eax
+ ret
+
+ ALIGN (4)
+L(loop_cross_page):
+ xor %r10, %r10
+ movq %rdx, %r9
+ and $63, %r9
+ subq %r9, %r10
+
+ movdqa (%rdx, %r10), %xmm0
+ movdqa 16(%rdx, %r10), %xmm1
+ movdqu (%rax, %r10), %xmm2
+ movdqu 16(%rax, %r10), %xmm3
+ pcmpeqb %xmm2, %xmm0
+ movdqa 32(%rdx, %r10), %xmm5
+ pcmpeqb %xmm3, %xmm1
+ pminub %xmm2, %xmm0
+ movdqa 48(%rdx, %r10), %xmm6
+ pminub %xmm3, %xmm1
+ movdqu 32(%rax, %r10), %xmm2
+ movdqu 48(%rax, %r10), %xmm3
+ pcmpeqb %xmm2, %xmm5
+ pcmpeqb %xmm3, %xmm6
+ pminub %xmm2, %xmm5
+ pminub %xmm3, %xmm6
+
+
+ pcmpeqb %xmm7, %xmm0
+ pcmpeqb %xmm7, %xmm1
+ pcmpeqb %xmm7, %xmm5
+ pcmpeqb %xmm7, %xmm6
+
+ pmovmskb %xmm1, %ecx
+ pmovmskb %xmm5, %r8d
+ pmovmskb %xmm0, %edi
+ salq $16, %rcx
+ salq $32, %r8
+ pmovmskb %xmm6, %esi
+ orq %r8, %rdi
+ orq %rcx, %rdi
+ salq $48, %rsi
+ orq %rsi, %rdi
+ movq %r9, %rcx
+ movq $63, %rsi
+ shrq %cl, %rdi
+ test %rdi, %rdi
+ je L(back_to_loop)
+ bsfq %rdi, %rcx
+ movzbl (%rax, %rcx), %eax
+ movzbl (%rdx, %rcx), %edx
+ subl %edx, %eax
+ ret
+
+ ALIGN (4)
+L(cross_page_loop):
+ cmpb %cl, %al
+ jne L(different)
+ addq $1, %rdx
+ cmpq $64, %rdx
+ je L(main_loop_header)
+L(cross_page):
+ movzbl (%rdi, %rdx), %eax
+ movzbl (%rsi, %rdx), %ecx
+ testb %al, %al
+ jne L(cross_page_loop)
+ xorl %eax, %eax
+L(different):
+ subl %ecx, %eax
+ ret
+END (__strcmp_sse2_unaligned)
diff --git a/sysdeps/x86_64/multiarch/strcmp.S b/sysdeps/x86_64/multiarch/strcmp.S
index 1d4d711..c5dcd1a 100644
--- a/sysdeps/x86_64/multiarch/strcmp.S
+++ b/sysdeps/x86_64/multiarch/strcmp.S
@@ -66,6 +66,7 @@
# define STRCMP_SSE2 __strncasecmp_l_sse2
# define __GI_STRCMP __GI___strncasecmp_l
#else
+# define USE_AS_STRCMP
# define UPDATE_STRNCMP_COUNTER
# ifndef STRCMP
# define STRCMP strcmp
@@ -88,11 +89,17 @@ ENTRY(STRCMP)
jne 1f
call __init_cpu_features
1:
+#ifdef USE_AS_STRCMP
+ leaq __strcmp_sse2_unaligned(%rip), %rax
+ testl $bit_Fast_Unaligned_Load, __cpu_features+CPUID_OFFSET+index_Fast_Unaligned_Load(%rip)
+ jnz 3f
+#else
testl $bit_Slow_SSE4_2, __cpu_features+CPUID_OFFSET+index_Slow_SSE4_2(%rip)
jnz 2f
leaq STRCMP_SSE42(%rip), %rax
testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
jnz 3f
+#endif
2: leaq STRCMP_SSSE3(%rip), %rax
testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
jnz 3f
--
1.8.3.2