This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PING][PATCH v1.1] Improve rawmemchr implementation.


ping
On Sun, Sep 08, 2013 at 08:33:41PM +0200, Ondřej Bílka wrote:
> On Mon, Sep 02, 2013 at 11:55:30AM +0200, Ondřej Bílka wrote:
> > Ping,
> ping
> > 
> > I noticed that we use a strong alias instead of a weak one; is that intentional?
> > 
> > strong_alias (rawmemchr, __rawmemchr)
> > 
> > There is one minor change: I misclassified the case s % 4096 == 4032 as
> > crossing a page boundary. The following should improve performance a bit.
> > 
> > -       cmpl    $4031, %eax
> > +       cmpl    $4032, %eax
> > 
> > On Fri, Aug 16, 2013 at 02:12:56PM +0200, Ondřej Bílka wrote:
> > > A patch got accidentally filtered, so here is the correct version.
> > > 
> > > Hi,
> > > 
> > > I looked at the rawmemchr implementation, and it can be improved by using
> > > a header similar to the one used in strchr. The loop itself was well optimized,
> > > so we only gain around 20 cycles per call for sizes from 64 bytes.
> > > 
> > > Results at the link below show that this is an improvement for unit tests, but I
> > > have not yet found a program that calls rawmemchr.
> > > http://kam.mff.cuni.cz/~ondra/benchmark_string/rawmemchr_profile.html
> > > A benchmark is at
> > > http://kam.mff.cuni.cz/~ondra/benchmark_string/rawmemchr_profile160813.tar.bz2
> > > 
> > > Passes test, OK to commit?
> > > 
> > 	* sysdeps/x86_64/rawmemchr.S (rawmemchr): Optimize implementation.
> > 
> > ---
> >  sysdeps/x86_64/rawmemchr.S | 252 +++++++++++++++------------------------------
> >  1 file changed, 85 insertions(+), 167 deletions(-)
> > 
> > diff --git a/sysdeps/x86_64/rawmemchr.S b/sysdeps/x86_64/rawmemchr.S
> > index f4d5591..93b8f4f 100644
> > --- a/sysdeps/x86_64/rawmemchr.S
> > +++ b/sysdeps/x86_64/rawmemchr.S
> > @@ -22,185 +22,103 @@
> >  
> >  	.text
> >  ENTRY (rawmemchr)
> > -	movd	%rsi, %xmm1
> > -	mov	%rdi, %rcx
> > -
> > -	punpcklbw %xmm1, %xmm1
> > -	punpcklbw %xmm1, %xmm1
> > -
> > -	and	$63, %rcx
> > +	movd	%esi, %xmm1
> > +	movq	%rdi, %rax
> > +	andl	$4095, %eax
> > +	punpcklbw	%xmm1, %xmm1
> > +	cmpl	$4032, %eax
> > +	punpcklwd	%xmm1, %xmm1
> >  	pshufd	$0, %xmm1, %xmm1
> > -
> > -	cmp	$48, %rcx
> > -	ja	L(crosscache)
> > -
> > +	jg	L(cross_page)
> >  	movdqu	(%rdi), %xmm0
> >  	pcmpeqb	%xmm1, %xmm0
> > -/* Check if there is a match.  */
> > -	pmovmskb %xmm0, %eax
> > -	test	%eax, %eax
> > -
> > -	jnz	L(matches)
> > -	add	$16, %rdi
> > -	and	$-16, %rdi
> > -	jmp	L(loop_prolog)
> > -
> > -	.p2align 4
> > -L(crosscache):
> > -	and	$15, %rcx
> > -	and	$-16, %rdi
> > -	movdqa	(%rdi), %xmm0
> > -
> > -	pcmpeqb	%xmm1, %xmm0
> > -/* Check if there is a match.  */
> > -	pmovmskb %xmm0, %eax
> > -/* Remove the leading bytes.  */
> > -	sar	%cl, %eax
> > +	pmovmskb	%xmm0, %eax
> >  	test	%eax, %eax
> > -	je	L(unaligned_no_match)
> > -/* Check which byte is a match.  */
> > -	bsf	%eax, %eax
> > -
> > -	add	%rdi, %rax
> > -	add	%rcx, %rax
> > -	ret
> > -
> > -	.p2align 4
> > -L(unaligned_no_match):
> > -	add	$16, %rdi
> > -
> > -	.p2align 4
> > -L(loop_prolog):
> > -	movdqa	(%rdi), %xmm0
> > -	pcmpeqb	%xmm1, %xmm0
> > -	pmovmskb %xmm0, %eax
> > -	test	%eax, %eax
> > -	jnz	L(matches)
> > -
> > -	movdqa	16(%rdi), %xmm2
> > -	pcmpeqb	%xmm1, %xmm2
> > -	pmovmskb %xmm2, %eax
> > -	test	%eax, %eax
> > -	jnz	L(matches16)
> > -
> > -	movdqa	32(%rdi), %xmm3
> > +	jne	L(finish)
> > +	movdqu	16(%rdi), %xmm3
> > +	movdqu	32(%rdi), %xmm2
> >  	pcmpeqb	%xmm1, %xmm3
> > -	pmovmskb %xmm3, %eax
> > -	test	%eax, %eax
> > -	jnz	L(matches32)
> > -
> > -	movdqa	48(%rdi), %xmm4
> > -	pcmpeqb	%xmm1, %xmm4
> > -	add	$64, %rdi
> > -	pmovmskb %xmm4, %eax
> > -	test	%eax, %eax
> > -	jnz	L(matches0)
> > -
> > -	test	$0x3f, %rdi
> > -	jz	L(align64_loop)
> > -
> > -	movdqa	(%rdi), %xmm0
> > -	pcmpeqb	%xmm1, %xmm0
> > -	pmovmskb %xmm0, %eax
> > -	test	%eax, %eax
> > -	jnz	L(matches)
> > -
> > -	movdqa	16(%rdi), %xmm2
> > +	movdqu	48(%rdi), %xmm0
> >  	pcmpeqb	%xmm1, %xmm2
> > -	pmovmskb %xmm2, %eax
> > -	test	%eax, %eax
> > -	jnz	L(matches16)
> > -
> > -	movdqa	32(%rdi), %xmm3
> > -	pcmpeqb	%xmm1, %xmm3
> > -	pmovmskb %xmm3, %eax
> > -	test	%eax, %eax
> > -	jnz	L(matches32)
> > -
> > -	movdqa	48(%rdi), %xmm3
> > -	pcmpeqb	%xmm1, %xmm3
> > -	pmovmskb %xmm3, %eax
> > -
> > -	add	$64, %rdi
> > -	test	%eax, %eax
> > -	jnz	L(matches0)
> > -
> > -	and	$-64, %rdi
> > -
> > -	.p2align 4
> > -L(align64_loop):
> > -	movdqa	(%rdi), %xmm0
> > -	movdqa	16(%rdi), %xmm2
> > -	movdqa	32(%rdi), %xmm3
> > -	movdqa	48(%rdi), %xmm4
> > -
> > +	pmovmskb	%xmm3, %edx
> >  	pcmpeqb	%xmm1, %xmm0
> > -	pcmpeqb	%xmm1, %xmm2
> > -	pcmpeqb	%xmm1, %xmm3
> > +	pmovmskb	%xmm2, %eax
> > +	pmovmskb	%xmm0, %ecx
> > +	salq	$16, %rdx
> > +	salq	$32, %rax
> > +	orq	%rdx, %rax
> > +	movq	%rcx, %rdx
> > +	salq	$48, %rdx
> > +	orq	%rdx, %rax
> > +	jne	L(finish)
> > +L(align):
> > +	andq	$-64, %rdi
> > +	.p2align 4
> > +L(loop64):
> > +	movdqa	64(%rdi), %xmm5
> > +	movdqa	80(%rdi), %xmm4
> > +	pcmpeqb	%xmm1, %xmm5
> > +	movdqa	96(%rdi), %xmm3
> >  	pcmpeqb	%xmm1, %xmm4
> > -
> > -	pmaxub	%xmm0, %xmm3
> > -	pmaxub	%xmm2, %xmm4
> > -	pmaxub	%xmm3, %xmm4
> > -	pmovmskb %xmm4, %eax
> > -
> > -	add	$64, %rdi
> > -
> > -	test	%eax, %eax
> > -	jz	L(align64_loop)
> > -
> > -	sub	$64, %rdi
> > -
> > -	pmovmskb %xmm0, %eax
> > -	test	%eax, %eax
> > -	jnz	L(matches)
> > -
> > -	pmovmskb %xmm2, %eax
> > -	test	%eax, %eax
> > -	jnz	L(matches16)
> > -
> > -	movdqa	32(%rdi), %xmm3
> > +	movdqa	112(%rdi), %xmm2
> >  	pcmpeqb	%xmm1, %xmm3
> > -
> > -	pcmpeqb	48(%rdi), %xmm1
> > -	pmovmskb %xmm3, %eax
> > -	test	%eax, %eax
> > -	jnz	L(matches32)
> > -
> > -	pmovmskb %xmm1, %eax
> > -	bsf	%eax, %eax
> > -	lea	48(%rdi, %rax), %rax
> > -	ret
> > -
> > -	.p2align 4
> > -L(matches0):
> > -	bsf	%eax, %eax
> > -	lea	-16(%rax, %rdi), %rax
> > -	ret
> > -
> > -	.p2align 4
> > -L(matches):
> > -	bsf	%eax, %eax
> > -	add	%rdi, %rax
> > -	ret
> > -
> > -	.p2align 4
> > -L(matches16):
> > -	bsf	%eax, %eax
> > -	lea	16(%rax, %rdi), %rax
> > -	ret
> > -
> > -	.p2align 4
> > -L(matches32):
> > -	bsf	%eax, %eax
> > -	lea	32(%rax, %rdi), %rax
> > +	pmaxub	%xmm4, %xmm5
> > +	pcmpeqb	%xmm1, %xmm2
> > +	pmaxub	%xmm3, %xmm5
> > +	pmaxub	%xmm2, %xmm5
> > +	addq	$64, %rdi
> > +	pmovmskb	%xmm5, %eax
> > +	testl	%eax, %eax
> > +	je	L(loop64)
> > +
> > +	movdqa	(%rdi), %xmm5
> > +	pcmpeqb	%xmm1, %xmm5
> > +	pmovmskb	%xmm5, %ecx
> > +	pmovmskb	%xmm4, %eax
> > +	pmovmskb	%xmm3, %esi
> > +	pmovmskb	%xmm2, %edx
> > +	salq	$32, %rsi
> > +	salq	$16, %rax
> > +	salq	$48, %rdx
> > +	orq	%rsi, %rax
> > +	orq	%rcx, %rax
> > +	orq	%rdx, %rax
> > +L(finish):
> > +	bsfq	%rax, %rax
> > +	addq	%rdi, %rax
> >  	ret
> > +	.p2align 4,,10
> > +	.p2align 3
> > +L(cross_page):
> >  
> > -	.p2align 4
> > -L(return_null):
> > -	xor	%rax, %rax
> > +	movq	%rdi, %rax
> > +	andq	$-64, %rax
> > +	movdqa	(%rax), %xmm0
> > +	pcmpeqb	%xmm1, %xmm0
> > +	pmovmskb	%xmm0, %esi
> > +	movdqa	16(%rax), %xmm0
> > +	pcmpeqb	%xmm1, %xmm0
> > +	pmovmskb	%xmm0, %edx
> > +	movdqa	32(%rax), %xmm0
> > +	pcmpeqb	%xmm1, %xmm0
> > +	salq	$16, %rdx
> > +	pmovmskb	%xmm0, %r8d
> > +	movdqa	48(%rax), %xmm0
> > +	pcmpeqb	%xmm1, %xmm0
> > +	salq	$32, %r8
> > +	orq	%r8, %rdx
> > +	pmovmskb	%xmm0, %ecx
> > +	orq	%rsi, %rdx
> > +	salq	$48, %rcx
> > +	orq	%rcx, %rdx
> > +	movl	%edi, %ecx
> > +	subl	%eax, %ecx
> > +	shrq	%cl, %rdx
> > +	testq	%rdx, %rdx
> > +	je	L(align)
> > +	bsfq	%rdx, %rax
> > +	addq	%rdi, %rax
> >  	ret
> > -
> >  END (rawmemchr)
> >  
> >  strong_alias (rawmemchr, __rawmemchr)
> 
> -- 
> 
> Your processor does not develop enough heat.

-- 

Firmware update in the coffee machine


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]