This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Re: [PING][PATCH v1.1] Improve rawmemchr implementation.


On Mon, Sep 02, 2013 at 11:55:30AM +0200, Ondřej Bílka wrote:
> Ping,
ping
> 
> I noticed that we use a strong alias instead of a weak one; is that intentional?
> 
> strong_alias (rawmemchr, __rawmemchr)
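
For reference, the two macros differ only in whether the alias symbol is
emitted weak; a weak definition can be overridden by a strong definition of
the same symbol at link time.  Simplified forms of the two macros, whose real
definitions live in glibc's include/libc-symbols.h, look roughly like this:

    /* Alias with normal (strong) binding.  */
    #define strong_alias(name, aliasname) \
      extern __typeof (name) aliasname __attribute__ ((alias (#name)));

    /* Alias with weak binding; a strong definition elsewhere wins.  */
    #define weak_alias(name, aliasname) \
      extern __typeof (name) aliasname __attribute__ ((weak, alias (#name)));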
> 
> There is one minor change: I had misclassified the case s % 4096 == 4032 as
> crossing a page boundary.  The following should improve performance a bit;
> a C sketch of the boundary check appears after the snippet.
> 
> -       cmpl    $4031, %eax
> +       cmpl    $4032, %eax
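
In C terms, the check asks whether an unaligned 64-byte read starting at s
stays inside its 4 KiB page; an offset of exactly 4032 still leaves exactly
64 bytes in the page, so it must not take the slow cross-page path.  A
minimal sketch of the test (the helper name is illustrative, not part of the
patch):

    #include <stdint.h>

    /* Would a 64-byte load starting at P run past the end of its 4 KiB page?
       Offsets 0..4032 leave at least 64 bytes, so only offsets above 4032
       cross into the next page.  */
    static int
    crosses_page (const void *p)
    {
      return ((uintptr_t) p & 4095) > 4096 - 64;
    }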
> 
> On Fri, Aug 16, 2013 at 02:12:56PM +0200, Ondřej Bílka wrote:
> > The patch got accidentally filtered, so here is the correct version.
> > 
> > Hi,
> > 
> > I looked at the rawmemchr implementation, and it can be improved by using
> > a header similar to the one used in strchr.  The loop itself was already
> > well optimized, so we only gain around 20 cycles per call for sizes from
> > 64 bytes up.  A C sketch of the header appears below.
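
Here is a rough C sketch, with SSE2 intrinsics, of what that header does.  The
function name, the scalar tail and the omission of the page-cross path are
simplifications for illustration, not part of the patch:

    #include <emmintrin.h>
    #include <stdint.h>

    /* Cover the first 64 bytes of S with four unaligned 16-byte loads, fold
       the per-vector match masks into one 64-bit mask, and answer as soon as
       that mask is nonzero.  */
    static void *
    rawmemchr_header_sketch (const void *s, int c)
    {
      const unsigned char *p = s;
      const __m128i vc = _mm_set1_epi8 ((char) c);
      uint64_t mask = 0;

      for (int i = 0; i < 4; i++)
        {
          __m128i v = _mm_loadu_si128 ((const __m128i *) (p + 16 * i));
          __m128i eq = _mm_cmpeq_epi8 (v, vc);
          mask |= (uint64_t) (uint16_t) _mm_movemask_epi8 (eq) << (16 * i);
        }
      if (mask != 0)
        return (void *) (p + __builtin_ctzll (mask));

      /* No match in the first 64 bytes: the assembly aligns p down to a
         64-byte boundary and continues 64 bytes per iteration; a plain byte
         loop stands in for that here.  */
      for (p += 64; *p != (unsigned char) c; p++)
        ;
      return (void *) p;
    }

The assembly builds the same 64-bit mask with shifts and ors (salq/orq) and
finds the match position with a single bsfq.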
> > 
> > The results below show that this is an improvement on the unit tests, but
> > I have not found a program that calls rawmemchr yet:
> > http://kam.mff.cuni.cz/~ondra/benchmark_string/rawmemchr_profile.html
> > A benchmark is at
> > http://kam.mff.cuni.cz/~ondra/benchmark_string/rawmemchr_profile160813.tar.bz2
> > 
> > Passes the tests; OK to commit?
> > 
> 	* sysdeps/x86_64/rawmemchr.S (rawmemchr): Optimize implementation.
> 
> ---
>  sysdeps/x86_64/rawmemchr.S | 252 +++++++++++++++------------------------------
>  1 file changed, 85 insertions(+), 167 deletions(-)
> 
> diff --git a/sysdeps/x86_64/rawmemchr.S b/sysdeps/x86_64/rawmemchr.S
> index f4d5591..93b8f4f 100644
> --- a/sysdeps/x86_64/rawmemchr.S
> +++ b/sysdeps/x86_64/rawmemchr.S
> @@ -22,185 +22,103 @@
>  
>  	.text
>  ENTRY (rawmemchr)
> -	movd	%rsi, %xmm1
> -	mov	%rdi, %rcx
> -
> -	punpcklbw %xmm1, %xmm1
> -	punpcklbw %xmm1, %xmm1
> -
> -	and	$63, %rcx
> +	movd	%esi, %xmm1
> +	movq	%rdi, %rax
> +	andl	$4095, %eax
> +	punpcklbw	%xmm1, %xmm1
> +	cmpl	$4032, %eax
> +	punpcklwd	%xmm1, %xmm1
>  	pshufd	$0, %xmm1, %xmm1
> -
> -	cmp	$48, %rcx
> -	ja	L(crosscache)
> -
> +	jg	L(cross_page)
>  	movdqu	(%rdi), %xmm0
>  	pcmpeqb	%xmm1, %xmm0
> -/* Check if there is a match.  */
> -	pmovmskb %xmm0, %eax
> -	test	%eax, %eax
> -
> -	jnz	L(matches)
> -	add	$16, %rdi
> -	and	$-16, %rdi
> -	jmp	L(loop_prolog)
> -
> -	.p2align 4
> -L(crosscache):
> -	and	$15, %rcx
> -	and	$-16, %rdi
> -	movdqa	(%rdi), %xmm0
> -
> -	pcmpeqb	%xmm1, %xmm0
> -/* Check if there is a match.  */
> -	pmovmskb %xmm0, %eax
> -/* Remove the leading bytes.  */
> -	sar	%cl, %eax
> +	pmovmskb	%xmm0, %eax
>  	test	%eax, %eax
> -	je	L(unaligned_no_match)
> -/* Check which byte is a match.  */
> -	bsf	%eax, %eax
> -
> -	add	%rdi, %rax
> -	add	%rcx, %rax
> -	ret
> -
> -	.p2align 4
> -L(unaligned_no_match):
> -	add	$16, %rdi
> -
> -	.p2align 4
> -L(loop_prolog):
> -	movdqa	(%rdi), %xmm0
> -	pcmpeqb	%xmm1, %xmm0
> -	pmovmskb %xmm0, %eax
> -	test	%eax, %eax
> -	jnz	L(matches)
> -
> -	movdqa	16(%rdi), %xmm2
> -	pcmpeqb	%xmm1, %xmm2
> -	pmovmskb %xmm2, %eax
> -	test	%eax, %eax
> -	jnz	L(matches16)
> -
> -	movdqa	32(%rdi), %xmm3
> +	jne	L(finish)
> +	movdqu	16(%rdi), %xmm3
> +	movdqu	32(%rdi), %xmm2
>  	pcmpeqb	%xmm1, %xmm3
> -	pmovmskb %xmm3, %eax
> -	test	%eax, %eax
> -	jnz	L(matches32)
> -
> -	movdqa	48(%rdi), %xmm4
> -	pcmpeqb	%xmm1, %xmm4
> -	add	$64, %rdi
> -	pmovmskb %xmm4, %eax
> -	test	%eax, %eax
> -	jnz	L(matches0)
> -
> -	test	$0x3f, %rdi
> -	jz	L(align64_loop)
> -
> -	movdqa	(%rdi), %xmm0
> -	pcmpeqb	%xmm1, %xmm0
> -	pmovmskb %xmm0, %eax
> -	test	%eax, %eax
> -	jnz	L(matches)
> -
> -	movdqa	16(%rdi), %xmm2
> +	movdqu	48(%rdi), %xmm0
>  	pcmpeqb	%xmm1, %xmm2
> -	pmovmskb %xmm2, %eax
> -	test	%eax, %eax
> -	jnz	L(matches16)
> -
> -	movdqa	32(%rdi), %xmm3
> -	pcmpeqb	%xmm1, %xmm3
> -	pmovmskb %xmm3, %eax
> -	test	%eax, %eax
> -	jnz	L(matches32)
> -
> -	movdqa	48(%rdi), %xmm3
> -	pcmpeqb	%xmm1, %xmm3
> -	pmovmskb %xmm3, %eax
> -
> -	add	$64, %rdi
> -	test	%eax, %eax
> -	jnz	L(matches0)
> -
> -	and	$-64, %rdi
> -
> -	.p2align 4
> -L(align64_loop):
> -	movdqa	(%rdi), %xmm0
> -	movdqa	16(%rdi), %xmm2
> -	movdqa	32(%rdi), %xmm3
> -	movdqa	48(%rdi), %xmm4
> -
> +	pmovmskb	%xmm3, %edx
>  	pcmpeqb	%xmm1, %xmm0
> -	pcmpeqb	%xmm1, %xmm2
> -	pcmpeqb	%xmm1, %xmm3
> +	pmovmskb	%xmm2, %eax
> +	pmovmskb	%xmm0, %ecx
> +	salq	$16, %rdx
> +	salq	$32, %rax
> +	orq	%rdx, %rax
> +	movq	%rcx, %rdx
> +	salq	$48, %rdx
> +	orq	%rdx, %rax
> +	jne	L(finish)
> +L(align):
> +	andq	$-64, %rdi
> +	.p2align 4
> +L(loop64):
> +	movdqa	64(%rdi), %xmm5
> +	movdqa	80(%rdi), %xmm4
> +	pcmpeqb	%xmm1, %xmm5
> +	movdqa	96(%rdi), %xmm3
>  	pcmpeqb	%xmm1, %xmm4
> -
> -	pmaxub	%xmm0, %xmm3
> -	pmaxub	%xmm2, %xmm4
> -	pmaxub	%xmm3, %xmm4
> -	pmovmskb %xmm4, %eax
> -
> -	add	$64, %rdi
> -
> -	test	%eax, %eax
> -	jz	L(align64_loop)
> -
> -	sub	$64, %rdi
> -
> -	pmovmskb %xmm0, %eax
> -	test	%eax, %eax
> -	jnz	L(matches)
> -
> -	pmovmskb %xmm2, %eax
> -	test	%eax, %eax
> -	jnz	L(matches16)
> -
> -	movdqa	32(%rdi), %xmm3
> +	movdqa	112(%rdi), %xmm2
>  	pcmpeqb	%xmm1, %xmm3
> -
> -	pcmpeqb	48(%rdi), %xmm1
> -	pmovmskb %xmm3, %eax
> -	test	%eax, %eax
> -	jnz	L(matches32)
> -
> -	pmovmskb %xmm1, %eax
> -	bsf	%eax, %eax
> -	lea	48(%rdi, %rax), %rax
> -	ret
> -
> -	.p2align 4
> -L(matches0):
> -	bsf	%eax, %eax
> -	lea	-16(%rax, %rdi), %rax
> -	ret
> -
> -	.p2align 4
> -L(matches):
> -	bsf	%eax, %eax
> -	add	%rdi, %rax
> -	ret
> -
> -	.p2align 4
> -L(matches16):
> -	bsf	%eax, %eax
> -	lea	16(%rax, %rdi), %rax
> -	ret
> -
> -	.p2align 4
> -L(matches32):
> -	bsf	%eax, %eax
> -	lea	32(%rax, %rdi), %rax
> +	pmaxub	%xmm4, %xmm5
> +	pcmpeqb	%xmm1, %xmm2
> +	pmaxub	%xmm3, %xmm5
> +	pmaxub	%xmm2, %xmm5
> +	addq	$64, %rdi
> +	pmovmskb	%xmm5, %eax
> +	testl	%eax, %eax
> +	je	L(loop64)
> +
> +	movdqa	(%rdi), %xmm5
> +	pcmpeqb	%xmm1, %xmm5
> +	pmovmskb	%xmm5, %ecx
> +	pmovmskb	%xmm4, %eax
> +	pmovmskb	%xmm3, %esi
> +	pmovmskb	%xmm2, %edx
> +	salq	$32, %rsi
> +	salq	$16, %rax
> +	salq	$48, %rdx
> +	orq	%rsi, %rax
> +	orq	%rcx, %rax
> +	orq	%rdx, %rax
> +L(finish):
> +	bsfq	%rax, %rax
> +	addq	%rdi, %rax
>  	ret
> +	.p2align 4,,10
> +	.p2align 3
> +L(cross_page):
>  
> -	.p2align 4
> -L(return_null):
> -	xor	%rax, %rax
> +	movq	%rdi, %rax
> +	andq	$-64, %rax
> +	movdqa	(%rax), %xmm0
> +	pcmpeqb	%xmm1, %xmm0
> +	pmovmskb	%xmm0, %esi
> +	movdqa	16(%rax), %xmm0
> +	pcmpeqb	%xmm1, %xmm0
> +	pmovmskb	%xmm0, %edx
> +	movdqa	32(%rax), %xmm0
> +	pcmpeqb	%xmm1, %xmm0
> +	salq	$16, %rdx
> +	pmovmskb	%xmm0, %r8d
> +	movdqa	48(%rax), %xmm0
> +	pcmpeqb	%xmm1, %xmm0
> +	salq	$32, %r8
> +	orq	%r8, %rdx
> +	pmovmskb	%xmm0, %ecx
> +	orq	%rsi, %rdx
> +	salq	$48, %rcx
> +	orq	%rcx, %rdx
> +	movl	%edi, %ecx
> +	subl	%eax, %ecx
> +	shrq	%cl, %rdx
> +	testq	%rdx, %rdx
> +	je	L(align)
> +	bsfq	%rdx, %rax
> +	addq	%rdi, %rax
>  	ret
> -
>  END (rawmemchr)
>  
>  strong_alias (rawmemchr, __rawmemchr)

-- 

Your processor does not develop enough heat.

