[PATCH] Faster strchr implementation.
- From: Ondřej Bílka <neleai at seznam dot cz>
- To: libc-alpha at sourceware dot org
- Date: Wed, 7 Aug 2013 16:09:11 +0200
- Subject: [PATCH] Faster strchr implementation.
As far as functions using sse4.2 are concerned, the next one is strchr.
I wrote a better implementation about a year ago; now I am resubmitting it.
It uses a header similar to the one in strlen, which improves
performance on the gcc workload in most cases.
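To illustrate the header idea, here is a rough sketch in C with SSE2
intrinsics rather than the patch's assembly (the name and the plain
16-byte loop are illustrative only, and it assumes the string may be
over-read in 16-byte chunks; the real code first checks that a 64-byte
load cannot cross a page boundary and switches to a 64-byte aligned
loop after the header):

#include <emmintrin.h>

/* Sketch: scan 16 bytes at a time, folding the test for c and the
   test for the terminating NUL into a single mask.  */
static const char *
strchr_sketch (const char *s, int c)
{
  const __m128i vc = _mm_set1_epi8 ((char) c);
  const __m128i zero = _mm_setzero_si128 ();
  for (;; s += 16)
    {
      __m128i x = _mm_loadu_si128 ((const __m128i *) s);
      /* Bit i of mask is set where byte s[i] is c or is 0.  */
      int mask = _mm_movemask_epi8 (_mm_or_si128 (_mm_cmpeq_epi8 (x, vc),
                                                  _mm_cmpeq_epi8 (x, zero)));
      if (mask != 0)
        {
          const char *p = s + __builtin_ctz (mask);
          /* strchr returns NULL when the NUL comes first; strchrnul
             returns p either way, which is why the patch derives it
             from the same source via the AS_STRCHRNUL #define.  */
          return *p == (char) c ? p : NULL;
        }
    }
}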
Asymptotically, the new strchr wins on all tested architectures, often
by more than 33% over the next-best implementation.
On the Ivy Bridge and Bulldozer architectures my implementation is 50%
faster than the sse4_2 one for large inputs. Nehalem is an exception,
as we gain only about 3% there.
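The large-input win comes from the main loop, which tests 64 bytes per
iteration using pxor and pminub: for a block x, min (x ^ c, x) has a
zero byte exactly where x holds c (so x ^ c == 0) or holds the NUL
(x == 0).  A rough C equivalent of that test, again with SSE2
intrinsics and an illustrative name:

#include <emmintrin.h>

/* Sketch: nonzero iff the 64-byte aligned block at p contains c or a
   NUL byte.  A running unsigned byte-wise minimum collapses the four
   16-byte blocks so one pcmpeqb/pmovmskb decides for all 64 bytes.  */
static int
block64_hits (const char *p, __m128i vc)
{
  __m128i m = _mm_set1_epi8 ((char) 0xff);
  for (int i = 0; i < 4; i++)
    {
      __m128i x = _mm_load_si128 ((const __m128i *) (p + 16 * i));
      m = _mm_min_epu8 (m, _mm_min_epu8 (_mm_xor_si128 (x, vc), x));
    }
  return _mm_movemask_epi8 (_mm_cmpeq_epi8 (m, _mm_setzero_si128 ()));
}

Only when this reports a hit does the code re-examine the four blocks
to locate the exact byte, so the common case stays branch-light.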
I also included an avx2 version in the benchmark but do not have a
Haswell machine to get data.
I also do not have data for Atom, so I left strchr_sse2_no_bsf as it is.
Benchmarks are available here; data for Haswell/Atom are welcome:
http://kam.mff.cuni.cz/~ondra/benchmark_string/strchr_profile.html
http://kam.mff.cuni.cz/~ondra/strchr_profile070813.tar.bz2
No regressions on x64. OK to commit?
* sysdeps/x86_64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Remove: __strchr_sse42.
* sysdeps/x86_64/multiarch/strchr.S (__strchr_sse42): Remove.
(strchr): Remove __strchr_sse42 ifunc selection.
* sysdeps/x86_64/strchr.S (strchr): Use optimized implementation.
* sysdeps/x86_64/strchrnul.S: Include sysdeps/x86_64/strchr.S.
---
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 1 -
sysdeps/x86_64/multiarch/strchr.S | 126 ---------------------
sysdeps/x86_64/strchr.S | 176 +++++++++++++++++++++++------
sysdeps/x86_64/strchrnul.S | 44 +-------
4 files changed, 142 insertions(+), 205 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 28d3579..8486294 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -116,7 +116,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strchr.S. */
IFUNC_IMPL (i, name, strchr,
- IFUNC_IMPL_ADD (array, i, strchr, HAS_SSE4_2, __strchr_sse42)
IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2_no_bsf)
IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2))
diff --git a/sysdeps/x86_64/multiarch/strchr.S b/sysdeps/x86_64/multiarch/strchr.S
index f170238..9d2db7c 100644
--- a/sysdeps/x86_64/multiarch/strchr.S
+++ b/sysdeps/x86_64/multiarch/strchr.S
@@ -29,12 +29,6 @@ ENTRY(strchr)
jne 1f
call __init_cpu_features
1: leaq __strchr_sse2(%rip), %rax
- testl $bit_Slow_SSE4_2, __cpu_features+CPUID_OFFSET+index_Slow_SSE4_2(%rip)
- jnz 2f
- testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
- jz 2f
- leaq __strchr_sse42(%rip), %rax
- ret
2: testl $bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
jz 3f
leaq __strchr_sse2_no_bsf(%rip), %rax
@@ -42,126 +36,6 @@ ENTRY(strchr)
END(strchr)
-/*
- This implementation uses SSE4 instructions to compare up to 16 bytes
- at a time looking for the first occurrence of the character c in the
- string s:
-
- char *strchr (const char *s, int c);
-
- We use 0xa:
- _SIDD_SBYTE_OPS
- | _SIDD_CMP_EQUAL_EACH
- | _SIDD_LEAST_SIGNIFICANT
- on pcmpistri to compare xmm/mem128
-
- 0 1 2 3 4 5 6 7 8 9 A B C D E F
- X X X X X X X X X X X X X X X X
-
- against xmm
-
- 0 1 2 3 4 5 6 7 8 9 A B C D E F
- C C C C C C C C C C C C C C C C
-
- to find out if the first 16byte data element has a byte C and the
- offset of the first byte. There are 3 cases:
-
- 1. The first 16byte data element has the byte C at the offset X.
- 2. The first 16byte data element has EOS and doesn't have the byte C.
- 3. The first 16byte data element is valid and doesn't have the byte C.
-
- Here is the table of ECX, CFlag, ZFlag and SFlag for 3 cases:
-
- case ECX CFlag ZFlag SFlag
- 1 X 1 0/1 0
- 2 16 0 1 0
- 3 16 0 0 0
-
- We exit from the loop for cases 1 and 2 with jbe which branches
- when either CFlag or ZFlag is 1. If CFlag == 1, ECX has the offset
- X for case 1. */
-
- .section .text.sse4.2,"ax",@progbits
- .align 16
- .type __strchr_sse42, @function
- .globl __strchr_sse42
- .hidden __strchr_sse42
-__strchr_sse42:
- cfi_startproc
- CALL_MCOUNT
- testb %sil, %sil
- je __strend_sse4
- pxor %xmm2, %xmm2
- movd %esi, %xmm1
- movl %edi, %ecx
- pshufb %xmm2, %xmm1
- andl $15, %ecx
- movq %rdi, %r8
- je L(aligned_start)
-
-/* Handle unaligned string. */
- andq $-16, %r8
- movdqa (%r8), %xmm0
- pcmpeqb %xmm0, %xmm2
- pcmpeqb %xmm1, %xmm0
- /* Find where NULL is. */
- pmovmskb %xmm2, %edx
- /* Check if there is a match. */
- pmovmskb %xmm0, %esi
- /* Remove the leading bytes. */
- sarl %cl, %edx
- sarl %cl, %esi
- testl %esi, %esi
- je L(unaligned_no_match)
- /* Check which byte is a match. */
- bsfl %esi, %eax
- /* Is there a NULL? */
- testl %edx, %edx
- je L(unaligned_match)
- bsfl %edx, %esi
- cmpl %esi, %eax
- /* Return NULL if NULL comes first. */
- ja L(return_null)
-L(unaligned_match):
- addq %rdi, %rax
- ret
-
- .p2align 4
-L(unaligned_no_match):
- testl %edx, %edx
- jne L(return_null)
-
-/* Loop start on aligned string. */
-L(loop):
- addq $16, %r8
-L(aligned_start):
- pcmpistri $0x2, (%r8), %xmm1
- jbe L(wrap)
- addq $16, %r8
- pcmpistri $0x2, (%r8), %xmm1
- jbe L(wrap)
- addq $16, %r8
- pcmpistri $0x2, (%r8), %xmm1
- jbe L(wrap)
- addq $16, %r8
- pcmpistri $0x2, (%r8), %xmm1
- jbe L(wrap)
- jmp L(loop)
-L(wrap):
- jc L(loop_exit)
-
-/* Return NULL. */
-L(return_null):
- xorl %eax, %eax
- ret
-
-/* Loop exit. */
- .p2align 4
-L(loop_exit):
- leaq (%r8,%rcx), %rax
- ret
- cfi_endproc
- .size __strchr_sse42, .-__strchr_sse42
# undef ENTRY
diff --git a/sysdeps/x86_64/strchr.S b/sysdeps/x86_64/strchr.S
index d89f1eb..a6bcfdc 100644
--- a/sysdeps/x86_64/strchr.S
+++ b/sysdeps/x86_64/strchr.S
@@ -19,51 +19,153 @@
#include <sysdep.h>
-
+#ifndef ALIGN
+#define ALIGN(x) .p2align x
+#endif
.text
ENTRY (strchr)
+
movd %esi, %xmm1
- movq %rdi, %rcx
- punpcklbw %xmm1, %xmm1
- andq $~15, %rdi
- pxor %xmm2, %xmm2
- punpcklbw %xmm1, %xmm1
- orl $0xffffffff, %esi
- movdqa (%rdi), %xmm0
+ movl %edi, %eax
+ andl $4095, %eax
+ punpcklbw %xmm1, %xmm1
+ cmpl $4032, %eax
+ punpcklwd %xmm1, %xmm1
pshufd $0, %xmm1, %xmm1
- subq %rdi, %rcx
- movdqa %xmm0, %xmm3
- leaq 16(%rdi), %rdi
+ jg L(cross_cache)
+L(find_result):
+ movdqu (%rdi), %xmm0
+ pxor %xmm3, %xmm3
+ movdqa %xmm1, %xmm2
+ movdqa %xmm0, %xmm4
pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm2, %xmm3
- shl %cl, %esi
- pmovmskb %xmm0, %edx
- pmovmskb %xmm3, %ecx
- andl %esi, %edx
- andl %esi, %ecx
- orl %edx, %ecx
- jnz 1f
-
-2: movdqa (%rdi), %xmm0
- leaq 16(%rdi), %rdi
- movdqa %xmm0, %xmm3
+ pcmpeqb %xmm3, %xmm4
+ por %xmm4, %xmm0
+ pmovmskb %xmm0, %edx
+ bsfq %rdx, %rax
+ testq %rdx, %rdx
+ je L(next48_bytes)
+L(return):
+ movl $0, %edx
+ leaq (%rdi,%rax), %rax
+#ifndef AS_STRCHRNUL
+ cmpb %sil, (%rax)
+ cmovne %rdx, %rax
+#endif
+ ret
+ ALIGN(4)
+L(cross_cache):
+ movq %rdi, %rdx
+ pxor %xmm2, %xmm2
+ andq $-64, %rdx
+ movdqa %xmm1, %xmm0
+ movdqu (%rdx), %xmm3
+ movdqa %xmm3, %xmm4
+ pcmpeqb %xmm1, %xmm3
+ pcmpeqb %xmm2, %xmm4
+ por %xmm4, %xmm3
+ pmovmskb %xmm3, %r8d
+ movdqu 16(%rdx), %xmm3
+ movdqa %xmm3, %xmm4
+ movslq %r8d, %r8
+ pcmpeqb %xmm1, %xmm3
+ pcmpeqb %xmm2, %xmm4
+ por %xmm4, %xmm3
+ pmovmskb %xmm3, %eax
+ movdqu 32(%rdx), %xmm3
+ movdqa %xmm3, %xmm4
+ cltq
+ pcmpeqb %xmm1, %xmm3
+ salq $16, %rax
+ pcmpeqb %xmm2, %xmm4
+ por %xmm4, %xmm3
+ pmovmskb %xmm3, %r9d
+ movdqu 48(%rdx), %xmm3
+ pcmpeqb %xmm3, %xmm2
+ salq $32, %r9
+ pcmpeqb %xmm3, %xmm0
+ orq %r9, %rax
+ orq %r8, %rax
+ por %xmm2, %xmm0
+ pmovmskb %xmm0, %ecx
+ salq $48, %rcx
+ orq %rcx, %rax
+ movl %edi, %ecx
+ subb %dl, %cl
+ shrq %cl, %rax
+ testq %rax, %rax
+ jne L(return2)
+L(align64):
+ addq $64, %rdi
+ pxor %xmm6, %xmm6
+ andq $-64, %rdi
+ jmp L(loop_start)
+ ALIGN(4)
+L(loop):
+ addq $64, %rdi
+L(loop_start):
+ movdqa (%rdi), %xmm5
+ movdqa 16(%rdi), %xmm2
+ movdqa 32(%rdi), %xmm3
+ pxor %xmm1, %xmm5
+ movdqa 48(%rdi), %xmm4
+ pxor %xmm1, %xmm2
+ pxor %xmm1, %xmm3
+ pminub (%rdi), %xmm5
+ pxor %xmm1, %xmm4
+ pminub 16(%rdi), %xmm2
+ pminub 32(%rdi), %xmm3
+ pminub %xmm2, %xmm5
+ pminub 48(%rdi), %xmm4
+ pminub %xmm3, %xmm5
+ pminub %xmm4, %xmm5
+ pcmpeqb %xmm6, %xmm5
+ pmovmskb %xmm5, %eax
+ testl %eax, %eax
+ je L(loop)
+ jmp L(find_result)
+ ALIGN(4)
+L(return2):
+ bsfq %rax, %rax
+ jmp L(return)
+ ALIGN(4)
+L(next48_bytes):
+ movdqu 16(%rdi), %xmm0
+ movdqa %xmm0, %xmm4
pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm2, %xmm3
- pmovmskb %xmm0, %edx
- pmovmskb %xmm3, %ecx
- orl %edx, %ecx
- jz 2b
-
-1: bsfl %edx, %edx
- jz 4f
- bsfl %ecx, %ecx
- leaq -16(%rdi,%rdx), %rax
- cmpl %edx, %ecx
- je 5f
-4: xorl %eax, %eax
-5: ret
+ pcmpeqb %xmm3, %xmm4
+ por %xmm4, %xmm0
+ pmovmskb %xmm0, %ecx
+ movdqu 32(%rdi), %xmm0
+ movdqa %xmm0, %xmm4
+ pcmpeqb %xmm1, %xmm0
+ salq $16, %rcx
+ pcmpeqb %xmm3, %xmm4
+ por %xmm4, %xmm0
+ pmovmskb %xmm0, %eax
+ movdqu 48(%rdi), %xmm0
+ pcmpeqb %xmm0, %xmm3
+ salq $32, %rax
+ pcmpeqb %xmm0, %xmm2
+ orq %rcx, %rax
+ por %xmm3, %xmm2
+ pmovmskb %xmm2, %r8d
+ movq %r8, %rcx
+ salq $48, %rcx
+ orq %rcx, %rax
+ je L(align64)
+ bsfq %rax, %rax
+ leaq (%rdi,%rax), %rax
+#ifndef AS_STRCHRNUL
+ cmpb %sil, (%rax)
+ cmovne %rdx, %rax
+#endif
+ ret
END (strchr)
+#ifndef AS_STRCHRNUL
weak_alias (strchr, index)
+#endif
+
libc_hidden_builtin_def (strchr)
diff --git a/sysdeps/x86_64/strchrnul.S b/sysdeps/x86_64/strchrnul.S
index d8c345b..9aa9e32 100644
--- a/sysdeps/x86_64/strchrnul.S
+++ b/sysdeps/x86_64/strchrnul.S
@@ -17,46 +17,8 @@
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-
- .text
-ENTRY (__strchrnul)
- movd %esi, %xmm1
- movq %rdi, %rcx
- punpcklbw %xmm1, %xmm1
- andq $~15, %rdi
- pxor %xmm2, %xmm2
- punpcklbw %xmm1, %xmm1
- orl $0xffffffff, %esi
- movdqa (%rdi), %xmm0
- pshufd $0, %xmm1, %xmm1
- subq %rdi, %rcx
- movdqa %xmm0, %xmm3
- leaq 16(%rdi), %rdi
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm2, %xmm3
- shl %cl, %esi
- pmovmskb %xmm0, %edx
- pmovmskb %xmm3, %ecx
- orl %edx, %ecx
- andl %esi, %ecx
- jnz 1f
-
-2: movdqa (%rdi), %xmm0
- leaq 16(%rdi), %rdi
- movdqa %xmm0, %xmm3
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm2, %xmm3
- pmovmskb %xmm0, %edx
- pmovmskb %xmm3, %ecx
- orl %edx, %ecx
- jz 2b
-
-1: bsfl %ecx, %edx
- leaq -16(%rdi,%rdx), %rax
- ret
-END (__strchrnul)
+#define strchr __strchrnul
+#define AS_STRCHRNUL
+#include "strchr.S"
weak_alias (__strchrnul, strchrnul)
--
1.8.3.2