This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH] Improve rawmemchr implementation.


The patch was accidentally filtered out, so here is the correct version.

Hi,

I looked at the rawmemchr implementation, and it can be improved by using
a header similar to the one used in strchr. The loop itself was already well
optimized, so we only gain around 20 cycles per call for sizes of 64 bytes and up.

The results show that this is an improvement on the unit tests, but I have
not yet found a program that actually calls rawmemchr.
http://kam.mff.cuni.cz/~ondra/benchmark_string/rawmemchr_profile.html
A benchmark is at
http://kam.mff.cuni.cz/~ondra/benchmark_string/rawmemchr_profile160813.tar.bz2

Passes the test suite. OK to commit?

	* sysdeps/x86_64/rawmemchr.S (rawmemchr): Optimize implementation.

---
 sysdeps/x86_64/rawmemchr.S | 252 +++++++++++++++------------------------------
 1 file changed, 85 insertions(+), 167 deletions(-)

diff --git a/sysdeps/x86_64/rawmemchr.S b/sysdeps/x86_64/rawmemchr.S
index f4d5591..93b8f4f 100644
--- a/sysdeps/x86_64/rawmemchr.S
+++ b/sysdeps/x86_64/rawmemchr.S
@@ -22,185 +22,103 @@
 
 	.text
 ENTRY (rawmemchr)
-	movd	%rsi, %xmm1
-	mov	%rdi, %rcx
-
-	punpcklbw %xmm1, %xmm1
-	punpcklbw %xmm1, %xmm1
-
-	and	$63, %rcx
+	movd	%esi, %xmm1
+	movq	%rdi, %rax
+	andl	$4095, %eax
+	punpcklbw	%xmm1, %xmm1
+	cmpl	$4031, %eax
+	punpcklwd	%xmm1, %xmm1
 	pshufd	$0, %xmm1, %xmm1
-
-	cmp	$48, %rcx
-	ja	L(crosscache)
-
+	jg	L(cross_page)
 	movdqu	(%rdi), %xmm0
 	pcmpeqb	%xmm1, %xmm0
-/* Check if there is a match.  */
-	pmovmskb %xmm0, %eax
-	test	%eax, %eax
-
-	jnz	L(matches)
-	add	$16, %rdi
-	and	$-16, %rdi
-	jmp	L(loop_prolog)
-
-	.p2align 4
-L(crosscache):
-	and	$15, %rcx
-	and	$-16, %rdi
-	movdqa	(%rdi), %xmm0
-
-	pcmpeqb	%xmm1, %xmm0
-/* Check if there is a match.  */
-	pmovmskb %xmm0, %eax
-/* Remove the leading bytes.  */
-	sar	%cl, %eax
+	pmovmskb	%xmm0, %eax
 	test	%eax, %eax
-	je	L(unaligned_no_match)
-/* Check which byte is a match.  */
-	bsf	%eax, %eax
-
-	add	%rdi, %rax
-	add	%rcx, %rax
-	ret
-
-	.p2align 4
-L(unaligned_no_match):
-	add	$16, %rdi
-
-	.p2align 4
-L(loop_prolog):
-	movdqa	(%rdi), %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb %xmm0, %eax
-	test	%eax, %eax
-	jnz	L(matches)
-
-	movdqa	16(%rdi), %xmm2
-	pcmpeqb	%xmm1, %xmm2
-	pmovmskb %xmm2, %eax
-	test	%eax, %eax
-	jnz	L(matches16)
-
-	movdqa	32(%rdi), %xmm3
+	jne	L(finish)
+	movdqu	16(%rdi), %xmm3
+	movdqu	32(%rdi), %xmm2
 	pcmpeqb	%xmm1, %xmm3
-	pmovmskb %xmm3, %eax
-	test	%eax, %eax
-	jnz	L(matches32)
-
-	movdqa	48(%rdi), %xmm4
-	pcmpeqb	%xmm1, %xmm4
-	add	$64, %rdi
-	pmovmskb %xmm4, %eax
-	test	%eax, %eax
-	jnz	L(matches0)
-
-	test	$0x3f, %rdi
-	jz	L(align64_loop)
-
-	movdqa	(%rdi), %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb %xmm0, %eax
-	test	%eax, %eax
-	jnz	L(matches)
-
-	movdqa	16(%rdi), %xmm2
+	movdqu	48(%rdi), %xmm0
 	pcmpeqb	%xmm1, %xmm2
-	pmovmskb %xmm2, %eax
-	test	%eax, %eax
-	jnz	L(matches16)
-
-	movdqa	32(%rdi), %xmm3
-	pcmpeqb	%xmm1, %xmm3
-	pmovmskb %xmm3, %eax
-	test	%eax, %eax
-	jnz	L(matches32)
-
-	movdqa	48(%rdi), %xmm3
-	pcmpeqb	%xmm1, %xmm3
-	pmovmskb %xmm3, %eax
-
-	add	$64, %rdi
-	test	%eax, %eax
-	jnz	L(matches0)
-
-	and	$-64, %rdi
-
-	.p2align 4
-L(align64_loop):
-	movdqa	(%rdi), %xmm0
-	movdqa	16(%rdi), %xmm2
-	movdqa	32(%rdi), %xmm3
-	movdqa	48(%rdi), %xmm4
-
+	pmovmskb	%xmm3, %edx
 	pcmpeqb	%xmm1, %xmm0
-	pcmpeqb	%xmm1, %xmm2
-	pcmpeqb	%xmm1, %xmm3
+	pmovmskb	%xmm2, %eax
+	pmovmskb	%xmm0, %ecx
+	salq	$16, %rdx
+	salq	$32, %rax
+	orq	%rdx, %rax
+	movq	%rcx, %rdx
+	salq	$48, %rdx
+	orq	%rdx, %rax
+	jne	L(finish)
+L(align):
+	andq	$-64, %rdi
+	.p2align 4
+L(loop64):
+	movdqa	64(%rdi), %xmm5
+	movdqa	80(%rdi), %xmm4
+	pcmpeqb	%xmm1, %xmm5
+	movdqa	96(%rdi), %xmm3
 	pcmpeqb	%xmm1, %xmm4
-
-	pmaxub	%xmm0, %xmm3
-	pmaxub	%xmm2, %xmm4
-	pmaxub	%xmm3, %xmm4
-	pmovmskb %xmm4, %eax
-
-	add	$64, %rdi
-
-	test	%eax, %eax
-	jz	L(align64_loop)
-
-	sub	$64, %rdi
-
-	pmovmskb %xmm0, %eax
-	test	%eax, %eax
-	jnz	L(matches)
-
-	pmovmskb %xmm2, %eax
-	test	%eax, %eax
-	jnz	L(matches16)
-
-	movdqa	32(%rdi), %xmm3
+	movdqa	112(%rdi), %xmm2
 	pcmpeqb	%xmm1, %xmm3
-
-	pcmpeqb	48(%rdi), %xmm1
-	pmovmskb %xmm3, %eax
-	test	%eax, %eax
-	jnz	L(matches32)
-
-	pmovmskb %xmm1, %eax
-	bsf	%eax, %eax
-	lea	48(%rdi, %rax), %rax
-	ret
-
-	.p2align 4
-L(matches0):
-	bsf	%eax, %eax
-	lea	-16(%rax, %rdi), %rax
-	ret
-
-	.p2align 4
-L(matches):
-	bsf	%eax, %eax
-	add	%rdi, %rax
-	ret
-
-	.p2align 4
-L(matches16):
-	bsf	%eax, %eax
-	lea	16(%rax, %rdi), %rax
-	ret
-
-	.p2align 4
-L(matches32):
-	bsf	%eax, %eax
-	lea	32(%rax, %rdi), %rax
+	pmaxub	%xmm4, %xmm5
+	pcmpeqb	%xmm1, %xmm2
+	pmaxub	%xmm3, %xmm5
+	pmaxub	%xmm2, %xmm5
+	addq	$64, %rdi
+	pmovmskb	%xmm5, %eax
+	testl	%eax, %eax
+	je	L(loop64)
+
+	movdqa	(%rdi), %xmm5
+	pcmpeqb	%xmm1, %xmm5
+	pmovmskb	%xmm5, %ecx
+	pmovmskb	%xmm4, %eax
+	pmovmskb	%xmm3, %esi
+	pmovmskb	%xmm2, %edx
+	salq	$32, %rsi
+	salq	$16, %rax
+	salq	$48, %rdx
+	orq	%rsi, %rax
+	orq	%rcx, %rax
+	orq	%rdx, %rax
+L(finish):
+	bsfq	%rax, %rax
+	addq	%rdi, %rax
 	ret
+	.p2align 4,,10
+	.p2align 3
+L(cross_page):
 
-	.p2align 4
-L(return_null):
-	xor	%rax, %rax
+	movq	%rdi, %rax
+	andq	$-64, %rax
+	movdqa	(%rax), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb	%xmm0, %esi
+	movdqa	16(%rax), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb	%xmm0, %edx
+	movdqa	32(%rax), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	salq	$16, %rdx
+	pmovmskb	%xmm0, %r8d
+	movdqa	48(%rax), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	salq	$32, %r8
+	orq	%r8, %rdx
+	pmovmskb	%xmm0, %ecx
+	orq	%rsi, %rdx
+	salq	$48, %rcx
+	orq	%rcx, %rdx
+	movl	%edi, %ecx
+	subl	%eax, %ecx
+	shrq	%cl, %rdx
+	testq	%rdx, %rdx
+	je	L(align)
+	bsfq	%rdx, %rax
+	addq	%rdi, %rax
 	ret
-
 END (rawmemchr)
 
 strong_alias (rawmemchr, __rawmemchr)
-- 
1.8.3.2



Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]