[PATCH] Faster strchr implementation.
- From: Ondřej Bílka <neleai at seznam dot cz>
- To: libc-alpha at sourceware dot org
- Date: Wed, 7 Aug 2013 16:09:11 +0200
- Subject: [PATCH] Faster strchr implementation.
As far as functions using sse4.2 are concerned, the next one is strchr.
I wrote a better implementation about a year ago; now I am resubmitting it.
It uses a header similar to the one in strlen, which improves
performance on the gcc workload in most cases.
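To illustrate the header idea, here is a rough sketch in C with SSE2
intrinsics rather than the patch's assembly (the name and the plain
16-byte loop are illustrative only, and it assumes the string may be
over-read in 16-byte chunks; the real code first checks that a 64-byte
load cannot cross a page boundary and switches to a 64-byte aligned
loop after the header):

#include <emmintrin.h>

/* Sketch: scan 16 bytes at a time, folding the test for c and the
   test for the terminating NUL into a single mask.  */
static const char *
strchr_sketch (const char *s, int c)
{
  const __m128i vc = _mm_set1_epi8 ((char) c);
  const __m128i zero = _mm_setzero_si128 ();
  for (;; s += 16)
    {
      __m128i x = _mm_loadu_si128 ((const __m128i *) s);
      /* Bit i of mask is set where byte s[i] is c or is 0.  */
      int mask = _mm_movemask_epi8 (_mm_or_si128 (_mm_cmpeq_epi8 (x, vc),
                                                  _mm_cmpeq_epi8 (x, zero)));
      if (mask != 0)
        {
          const char *p = s + __builtin_ctz (mask);
          /* strchr returns NULL when the NUL comes first; strchrnul
             returns p either way, which is why the patch derives it
             from the same source via the AS_STRCHRNUL #define.  */
          return *p == (char) c ? p : NULL;
        }
    }
}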
Asymptotically, the new strchr wins on all tested architectures, often
by more than 33% over the next-best implementation.
On the Ivy Bridge and Bulldozer architectures my implementation is 50%
faster than the sse4_2 one for large inputs. Nehalem is an exception,
as we gain only about 3% there.
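The large-input win comes from the main loop, which tests 64 bytes per
iteration using pxor and pminub: for a block x, min (x ^ c, x) has a
zero byte exactly where x holds c (so x ^ c == 0) or holds the NUL
(x == 0).  A rough C equivalent of that test, again with SSE2
intrinsics and an illustrative name:

#include <emmintrin.h>

/* Sketch: nonzero iff the 64-byte aligned block at p contains c or a
   NUL byte.  A running unsigned byte-wise minimum collapses the four
   16-byte blocks so one pcmpeqb/pmovmskb decides for all 64 bytes.  */
static int
block64_hits (const char *p, __m128i vc)
{
  __m128i m = _mm_set1_epi8 ((char) 0xff);
  for (int i = 0; i < 4; i++)
    {
      __m128i x = _mm_load_si128 ((const __m128i *) (p + 16 * i));
      m = _mm_min_epu8 (m, _mm_min_epu8 (_mm_xor_si128 (x, vc), x));
    }
  return _mm_movemask_epi8 (_mm_cmpeq_epi8 (m, _mm_setzero_si128 ()));
}

Only when this reports a hit does the code re-examine the four blocks
to locate the exact byte, so the common case stays branch-light.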
I also included an avx2 version in the benchmark but do not have a
Haswell machine to get data.
I also do not have data for Atom, so I left strchr_sse2_no_bsf as it is.
Benchmarks are available here; data for Haswell/Atom are welcome:
http://kam.mff.cuni.cz/~ondra/benchmark_string/strchr_profile.html
http://kam.mff.cuni.cz/~ondra/strchr_profile070813.tar.bz2
No regressions on x64. OK to commit?
* sysdeps/x86_64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Remove: __strchr_sse42.
* sysdeps/x86_64/multiarch/strchr.S (__strchr_sse42): Remove.
(strchr): Remove __strchr_sse42 ifunc selection.
* sysdeps/x86_64/strchr.S (strchr): Use optimized implementation.
* sysdeps/x86_64/strchrnul.S: Include sysdeps/x86_64/strchr.S.
---
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 1 -
sysdeps/x86_64/multiarch/strchr.S | 126 ---------------------
sysdeps/x86_64/strchr.S | 176 +++++++++++++++++++++++------
sysdeps/x86_64/strchrnul.S | 44 +-------
4 files changed, 142 insertions(+), 205 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 28d3579..8486294 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -116,7 +116,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strchr.S. */
IFUNC_IMPL (i, name, strchr,
- IFUNC_IMPL_ADD (array, i, strchr, HAS_SSE4_2, __strchr_sse42)
IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2_no_bsf)
IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2))
diff --git a/sysdeps/x86_64/multiarch/strchr.S b/sysdeps/x86_64/multiarch/strchr.S
index f170238..9d2db7c 100644
--- a/sysdeps/x86_64/multiarch/strchr.S
+++ b/sysdeps/x86_64/multiarch/strchr.S
@@ -29,12 +29,6 @@ ENTRY(strchr)
jne 1f
call __init_cpu_features
1: leaq __strchr_sse2(%rip), %rax
- testl $bit_Slow_SSE4_2, __cpu_features+CPUID_OFFSET+index_Slow_SSE4_2(%rip)
- jnz 2f
- testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
- jz 2f
- leaq __strchr_sse42(%rip), %rax
- ret
2: testl $bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
jz 3f
leaq __strchr_sse2_no_bsf(%rip), %rax
@@ -42,126 +36,6 @@ ENTRY(strchr)
END(strchr)
-/*
- This implementation uses SSE4 instructions to compare up to 16 bytes
- at a time looking for the first occurrence of the character c in the
- string s:
-
- char *strchr (const char *s, int c);
-
- We use 0xa:
- _SIDD_SBYTE_OPS
- | _SIDD_CMP_EQUAL_EACH
- | _SIDD_LEAST_SIGNIFICANT
- on pcmpistri to compare xmm/mem128
-
- 0 1 2 3 4 5 6 7 8 9 A B C D E F
- X X X X X X X X X X X X X X X X
-
- against xmm
-
- 0 1 2 3 4 5 6 7 8 9 A B C D E F
- C C C C C C C C C C C C C C C C
-
- to find out if the first 16byte data element has a byte C and the
- offset of the first byte. There are 3 cases:
-
- 1. The first 16byte data element has the byte C at the offset X.
- 2. The first 16byte data element has EOS and doesn't have the byte C.
- 3. The first 16byte data element is valid and doesn't have the byte C.
-
- Here is the table of ECX, CFlag, ZFlag and SFlag for 3 cases:
-
- case ECX CFlag ZFlag SFlag
- 1 X 1 0/1 0
- 2 16 0 1 0
- 3 16 0 0 0
-
- We exit from the loop for cases 1 and 2 with jbe which branches
- when either CFlag or ZFlag is 1. If CFlag == 1, ECX has the offset
- X for case 1. */
-
- .section .text.sse4.2,"ax",@progbits
- .align 16
- .type __strchr_sse42, @function
- .globl __strchr_sse42
- .hidden __strchr_sse42
-__strchr_sse42:
- cfi_startproc
- CALL_MCOUNT
- testb %sil, %sil
- je __strend_sse4
- pxor %xmm2, %xmm2
- movd %esi, %xmm1
- movl %edi, %ecx
- pshufb %xmm2, %xmm1
- andl $15, %ecx
- movq %rdi, %r8
- je L(aligned_start)
-
-/* Handle unaligned string. */
- andq $-16, %r8
- movdqa (%r8), %xmm0
- pcmpeqb %xmm0, %xmm2
- pcmpeqb %xmm1, %xmm0
- /* Find where NULL is. */
- pmovmskb %xmm2, %edx
- /* Check if there is a match. */
- pmovmskb %xmm0, %esi
- /* Remove the leading bytes. */
- sarl %cl, %edx
- sarl %cl, %esi
- testl %esi, %esi
- je L(unaligned_no_match)
- /* Check which byte is a match. */
- bsfl %esi, %eax
- /* Is there a NULL? */
- testl %edx, %edx
- je L(unaligned_match)
- bsfl %edx, %esi
- cmpl %esi, %eax
- /* Return NULL if NULL comes first. */
- ja L(return_null)
-L(unaligned_match):
- addq %rdi, %rax
- ret
-
- .p2align 4
-L(unaligned_no_match):
- testl %edx, %edx
- jne L(return_null)
-
-/* Loop start on aligned string. */
-L(loop):
- addq $16, %r8
-L(aligned_start):
- pcmpistri $0x2, (%r8), %xmm1
- jbe L(wrap)
- addq $16, %r8
- pcmpistri $0x2, (%r8), %xmm1
- jbe L(wrap)
- addq $16, %r8
- pcmpistri $0x2, (%r8), %xmm1
- jbe L(wrap)
- addq $16, %r8
- pcmpistri $0x2, (%r8), %xmm1
- jbe L(wrap)
- jmp L(loop)
-L(wrap):
- jc L(loop_exit)
-
-/* Return NULL. */
-L(return_null):
- xorl %eax, %eax
- ret
-
-/* Loop exit. */
- .p2align 4
-L(loop_exit):
- leaq (%r8,%rcx), %rax
- ret
- cfi_endproc
- .size __strchr_sse42, .-__strchr_sse42
# undef ENTRY
diff --git a/sysdeps/x86_64/strchr.S b/sysdeps/x86_64/strchr.S
index d89f1eb..a6bcfdc 100644
--- a/sysdeps/x86_64/strchr.S
+++ b/sysdeps/x86_64/strchr.S
@@ -19,51 +19,153 @@
#include <sysdep.h>
-
+#ifndef ALIGN
+#define ALIGN(x) .p2align x
+#endif
.text
ENTRY (strchr)
+
movd %esi, %xmm1
- movq %rdi, %rcx
- punpcklbw %xmm1, %xmm1
- andq $~15, %rdi
- pxor %xmm2, %xmm2
- punpcklbw %xmm1, %xmm1
- orl $0xffffffff, %esi
- movdqa (%rdi), %xmm0
+ movl %edi, %eax
+ andl $4095, %eax
+ punpcklbw %xmm1, %xmm1
+ cmpl $4032, %eax
+ punpcklwd %xmm1, %xmm1
pshufd $0, %xmm1, %xmm1
- subq %rdi, %rcx
- movdqa %xmm0, %xmm3
- leaq 16(%rdi), %rdi
+ jg L(cross_cache)
+L(find_result):
+ movdqu (%rdi), %xmm0
+ pxor %xmm3, %xmm3
+ movdqa %xmm1, %xmm2
+ movdqa %xmm0, %xmm4
pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm2, %xmm3
- shl %cl, %esi
- pmovmskb %xmm0, %edx
- pmovmskb %xmm3, %ecx
- andl %esi, %edx
- andl %esi, %ecx
- orl %edx, %ecx
- jnz 1f
-
-2: movdqa (%rdi), %xmm0
- leaq 16(%rdi), %rdi
- movdqa %xmm0, %xmm3
+ pcmpeqb %xmm3, %xmm4
+ por %xmm4, %xmm0
+ pmovmskb %xmm0, %edx
+ bsfq %rdx, %rax
+ testq %rdx, %rdx
+ je L(next48_bytes)
+L(return):
+ movl $0, %edx
+ leaq (%rdi,%rax), %rax
+#ifndef AS_STRCHRNUL
+ cmpb %sil, (%rax)
+ cmovne %rdx, %rax
+#endif
+ ret
+ ALIGN(4)
+L(cross_cache):
+ movq %rdi, %rdx
+ pxor %xmm2, %xmm2
+ andq $-64, %rdx
+ movdqa %xmm1, %xmm0
+ movdqu (%rdx), %xmm3
+ movdqa %xmm3, %xmm4
+ pcmpeqb %xmm1, %xmm3
+ pcmpeqb %xmm2, %xmm4
+ por %xmm4, %xmm3
+ pmovmskb %xmm3, %r8d
+ movdqu 16(%rdx), %xmm3
+ movdqa %xmm3, %xmm4
+ movslq %r8d, %r8
+ pcmpeqb %xmm1, %xmm3
+ pcmpeqb %xmm2, %xmm4
+ por %xmm4, %xmm3
+ pmovmskb %xmm3, %eax
+ movdqu 32(%rdx), %xmm3
+ movdqa %xmm3, %xmm4
+ cltq
+ pcmpeqb %xmm1, %xmm3
+ salq $16, %rax
+ pcmpeqb %xmm2, %xmm4
+ por %xmm4, %xmm3
+ pmovmskb %xmm3, %r9d
+ movdqu 48(%rdx), %xmm3
+ pcmpeqb %xmm3, %xmm2
+ salq $32, %r9
+ pcmpeqb %xmm3, %xmm0
+ orq %r9, %rax
+ orq %r8, %rax
+ por %xmm2, %xmm0
+ pmovmskb %xmm0, %ecx
+ salq $48, %rcx
+ orq %rcx, %rax
+ movl %edi, %ecx
+ subb %dl, %cl
+ shrq %cl, %rax
+ testq %rax, %rax
+ jne L(return2)
+L(align64):
+ addq $64, %rdi
+ pxor %xmm6, %xmm6
+ andq $-64, %rdi
+ jmp L(loop_start)
+ ALIGN(4)
+L(loop):
+ addq $64, %rdi
+L(loop_start):
+ movdqa (%rdi), %xmm5
+ movdqa 16(%rdi), %xmm2
+ movdqa 32(%rdi), %xmm3
+ pxor %xmm1, %xmm5
+ movdqa 48(%rdi), %xmm4
+ pxor %xmm1, %xmm2
+ pxor %xmm1, %xmm3
+ pminub (%rdi), %xmm5
+ pxor %xmm1, %xmm4
+ pminub 16(%rdi), %xmm2
+ pminub 32(%rdi), %xmm3
+ pminub %xmm2, %xmm5
+ pminub 48(%rdi), %xmm4
+ pminub %xmm3, %xmm5
+ pminub %xmm4, %xmm5
+ pcmpeqb %xmm6, %xmm5
+ pmovmskb %xmm5, %eax
+ testl %eax, %eax
+ je L(loop)
+ jmp L(find_result)
+ ALIGN(4)
+L(return2):
+ bsfq %rax, %rax
+ jmp L(return)
+ ALIGN(4)
+L(next48_bytes):
+ movdqu 16(%rdi), %xmm0
+ movdqa %xmm0, %xmm4
pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm2, %xmm3
- pmovmskb %xmm0, %edx
- pmovmskb %xmm3, %ecx
- orl %edx, %ecx
- jz 2b
-
-1: bsfl %edx, %edx
- jz 4f
- bsfl %ecx, %ecx
- leaq -16(%rdi,%rdx), %rax
- cmpl %edx, %ecx
- je 5f
-4: xorl %eax, %eax
-5: ret
+ pcmpeqb %xmm3, %xmm4
+ por %xmm4, %xmm0
+ pmovmskb %xmm0, %ecx
+ movdqu 32(%rdi), %xmm0
+ movdqa %xmm0, %xmm4
+ pcmpeqb %xmm1, %xmm0
+ salq $16, %rcx
+ pcmpeqb %xmm3, %xmm4
+ por %xmm4, %xmm0
+ pmovmskb %xmm0, %eax
+ movdqu 48(%rdi), %xmm0
+ pcmpeqb %xmm0, %xmm3
+ salq $32, %rax
+ pcmpeqb %xmm0, %xmm2
+ orq %rcx, %rax
+ por %xmm3, %xmm2
+ pmovmskb %xmm2, %r8d
+ movq %r8, %rcx
+ salq $48, %rcx
+ orq %rcx, %rax
+ je L(align64)
+ bsfq %rax, %rax
+ leaq (%rdi,%rax), %rax
+#ifndef AS_STRCHRNUL
+ cmpb %sil, (%rax)
+ cmovne %rdx, %rax
+#endif
+ ret
END (strchr)
+#ifndef AS_STRCHRNUL
weak_alias (strchr, index)
+#endif
+
libc_hidden_builtin_def (strchr)
diff --git a/sysdeps/x86_64/strchrnul.S b/sysdeps/x86_64/strchrnul.S
index d8c345b..9aa9e32 100644
--- a/sysdeps/x86_64/strchrnul.S
+++ b/sysdeps/x86_64/strchrnul.S
@@ -17,46 +17,8 @@
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-
- .text
-ENTRY (__strchrnul)
- movd %esi, %xmm1
- movq %rdi, %rcx
- punpcklbw %xmm1, %xmm1
- andq $~15, %rdi
- pxor %xmm2, %xmm2
- punpcklbw %xmm1, %xmm1
- orl $0xffffffff, %esi
- movdqa (%rdi), %xmm0
- pshufd $0, %xmm1, %xmm1
- subq %rdi, %rcx
- movdqa %xmm0, %xmm3
- leaq 16(%rdi), %rdi
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm2, %xmm3
- shl %cl, %esi
- pmovmskb %xmm0, %edx
- pmovmskb %xmm3, %ecx
- orl %edx, %ecx
- andl %esi, %ecx
- jnz 1f
-
-2: movdqa (%rdi), %xmm0
- leaq 16(%rdi), %rdi
- movdqa %xmm0, %xmm3
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm2, %xmm3
- pmovmskb %xmm0, %edx
- pmovmskb %xmm3, %ecx
- orl %edx, %ecx
- jz 2b
-
-1: bsfl %ecx, %edx
- leaq -16(%rdi,%rdx), %rax
- ret
-END (__strchrnul)
+#define strchr __strchrnul
+#define AS_STRCHRNUL
+#include "strchr.S"
weak_alias (__strchrnul, strchrnul)
--
1.8.3.2