This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
[PING][PATCH v1.1] Improve rawmemchr implementation.
- From: Ondřej Bílka <neleai at seznam dot cz>
- To: libc-alpha at sourceware dot org
- Date: Mon, 10 Feb 2014 18:22:51 +0100
- Subject: [PING][PATCH v1.1] Improve rawmemchr implementation.
- Authentication-results: sourceware.org; auth=none
- References: <20130816120314 dot GA25879 at domone dot kolej dot mff dot cuni dot cz> <20130816121256 dot GA26328 at domone dot kolej dot mff dot cuni dot cz> <20130902095530 dot GF11034 at domone dot kolej dot mff dot cuni dot cz> <20130908183341 dot GD27001 at domone dot kolej dot mff dot cuni dot cz>
ping
On Sun, Sep 08, 2013 at 08:33:41PM +0200, Ondřej Bílka wrote:
> On Mon, Sep 02, 2013 at 11:55:30AM +0200, Ondřej Bílka wrote:
> > Ping,
> ping
> >
> > I noticed that we use strong alias instead of weak, is that intentional?
> >
> > strong_alias (rawmemchr, __rawmemchr)
> >
> > There is one minor change: I misclassified the case s % 4096 == 4032 as
> > crossing a page boundary. The following should improve performance a bit.
> >
> > - cmpl $4031, %eax
> > + cmpl $4032, %eax
> >
> > On Fri, Aug 16, 2013 at 02:12:56PM +0200, Ondřej Bílka wrote:
> > > A patch got accidentally filtered, so here is the correct version.
> > >
> > > Hi,
> > >
> > > I looked at the rawmemchr implementation and it can be improved by using
> > > a header similar to the one used in strchr. The loop itself was already well
> > > optimized, so we only gain around 20 cycles per call for sizes from 64 bytes.
> > >
> > > The results at the link below show that this is an improvement for unit
> > > tests, but I have not yet found a program that calls rawmemchr.
> > > http://kam.mff.cuni.cz/~ondra/benchmark_string/rawmemchr_profile.html
> > > A benchmark is at
> > > http://kam.mff.cuni.cz/~ondra/benchmark_string/rawmemchr_profile160813.tar.bz2
> > >
> > > Passes test, OK to commit?
> > >
> > * sysdeps/x86_64/rawmemchr.S (rawmemchr): Optimize implementation.
> >
> > ---
> > sysdeps/x86_64/rawmemchr.S | 252 +++++++++++++++------------------------------
> > 1 file changed, 85 insertions(+), 167 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/rawmemchr.S b/sysdeps/x86_64/rawmemchr.S
> > index f4d5591..93b8f4f 100644
> > --- a/sysdeps/x86_64/rawmemchr.S
> > +++ b/sysdeps/x86_64/rawmemchr.S
> > @@ -22,185 +22,103 @@
> >
> > .text
> > ENTRY (rawmemchr)
> > - movd %rsi, %xmm1
> > - mov %rdi, %rcx
> > -
> > - punpcklbw %xmm1, %xmm1
> > - punpcklbw %xmm1, %xmm1
> > -
> > - and $63, %rcx
> > + movd %esi, %xmm1
> > + movq %rdi, %rax
> > + andl $4095, %eax
> > + punpcklbw %xmm1, %xmm1
> > + cmpl $4032, %eax
> > + punpcklwd %xmm1, %xmm1
> > pshufd $0, %xmm1, %xmm1
> > -
> > - cmp $48, %rcx
> > - ja L(crosscache)
> > -
> > + jg L(cross_page)
> > movdqu (%rdi), %xmm0
> > pcmpeqb %xmm1, %xmm0
> > -/* Check if there is a match. */
> > - pmovmskb %xmm0, %eax
> > - test %eax, %eax
> > -
> > - jnz L(matches)
> > - add $16, %rdi
> > - and $-16, %rdi
> > - jmp L(loop_prolog)
> > -
> > - .p2align 4
> > -L(crosscache):
> > - and $15, %rcx
> > - and $-16, %rdi
> > - movdqa (%rdi), %xmm0
> > -
> > - pcmpeqb %xmm1, %xmm0
> > -/* Check if there is a match. */
> > - pmovmskb %xmm0, %eax
> > -/* Remove the leading bytes. */
> > - sar %cl, %eax
> > + pmovmskb %xmm0, %eax
> > test %eax, %eax
> > - je L(unaligned_no_match)
> > -/* Check which byte is a match. */
> > - bsf %eax, %eax
> > -
> > - add %rdi, %rax
> > - add %rcx, %rax
> > - ret
> > -
> > - .p2align 4
> > -L(unaligned_no_match):
> > - add $16, %rdi
> > -
> > - .p2align 4
> > -L(loop_prolog):
> > - movdqa (%rdi), %xmm0
> > - pcmpeqb %xmm1, %xmm0
> > - pmovmskb %xmm0, %eax
> > - test %eax, %eax
> > - jnz L(matches)
> > -
> > - movdqa 16(%rdi), %xmm2
> > - pcmpeqb %xmm1, %xmm2
> > - pmovmskb %xmm2, %eax
> > - test %eax, %eax
> > - jnz L(matches16)
> > -
> > - movdqa 32(%rdi), %xmm3
> > + jne L(finish)
> > + movdqu 16(%rdi), %xmm3
> > + movdqu 32(%rdi), %xmm2
> > pcmpeqb %xmm1, %xmm3
> > - pmovmskb %xmm3, %eax
> > - test %eax, %eax
> > - jnz L(matches32)
> > -
> > - movdqa 48(%rdi), %xmm4
> > - pcmpeqb %xmm1, %xmm4
> > - add $64, %rdi
> > - pmovmskb %xmm4, %eax
> > - test %eax, %eax
> > - jnz L(matches0)
> > -
> > - test $0x3f, %rdi
> > - jz L(align64_loop)
> > -
> > - movdqa (%rdi), %xmm0
> > - pcmpeqb %xmm1, %xmm0
> > - pmovmskb %xmm0, %eax
> > - test %eax, %eax
> > - jnz L(matches)
> > -
> > - movdqa 16(%rdi), %xmm2
> > + movdqu 48(%rdi), %xmm0
> > pcmpeqb %xmm1, %xmm2
> > - pmovmskb %xmm2, %eax
> > - test %eax, %eax
> > - jnz L(matches16)
> > -
> > - movdqa 32(%rdi), %xmm3
> > - pcmpeqb %xmm1, %xmm3
> > - pmovmskb %xmm3, %eax
> > - test %eax, %eax
> > - jnz L(matches32)
> > -
> > - movdqa 48(%rdi), %xmm3
> > - pcmpeqb %xmm1, %xmm3
> > - pmovmskb %xmm3, %eax
> > -
> > - add $64, %rdi
> > - test %eax, %eax
> > - jnz L(matches0)
> > -
> > - and $-64, %rdi
> > -
> > - .p2align 4
> > -L(align64_loop):
> > - movdqa (%rdi), %xmm0
> > - movdqa 16(%rdi), %xmm2
> > - movdqa 32(%rdi), %xmm3
> > - movdqa 48(%rdi), %xmm4
> > -
> > + pmovmskb %xmm3, %edx
> > pcmpeqb %xmm1, %xmm0
> > - pcmpeqb %xmm1, %xmm2
> > - pcmpeqb %xmm1, %xmm3
> > + pmovmskb %xmm2, %eax
> > + pmovmskb %xmm0, %ecx
> > + salq $16, %rdx
> > + salq $32, %rax
> > + orq %rdx, %rax
> > + movq %rcx, %rdx
> > + salq $48, %rdx
> > + orq %rdx, %rax
> > + jne L(finish)
> > +L(align):
> > + andq $-64, %rdi
> > + .p2align 4
> > +L(loop64):
> > + movdqa 64(%rdi), %xmm5
> > + movdqa 80(%rdi), %xmm4
> > + pcmpeqb %xmm1, %xmm5
> > + movdqa 96(%rdi), %xmm3
> > pcmpeqb %xmm1, %xmm4
> > -
> > - pmaxub %xmm0, %xmm3
> > - pmaxub %xmm2, %xmm4
> > - pmaxub %xmm3, %xmm4
> > - pmovmskb %xmm4, %eax
> > -
> > - add $64, %rdi
> > -
> > - test %eax, %eax
> > - jz L(align64_loop)
> > -
> > - sub $64, %rdi
> > -
> > - pmovmskb %xmm0, %eax
> > - test %eax, %eax
> > - jnz L(matches)
> > -
> > - pmovmskb %xmm2, %eax
> > - test %eax, %eax
> > - jnz L(matches16)
> > -
> > - movdqa 32(%rdi), %xmm3
> > + movdqa 112(%rdi), %xmm2
> > pcmpeqb %xmm1, %xmm3
> > -
> > - pcmpeqb 48(%rdi), %xmm1
> > - pmovmskb %xmm3, %eax
> > - test %eax, %eax
> > - jnz L(matches32)
> > -
> > - pmovmskb %xmm1, %eax
> > - bsf %eax, %eax
> > - lea 48(%rdi, %rax), %rax
> > - ret
> > -
> > - .p2align 4
> > -L(matches0):
> > - bsf %eax, %eax
> > - lea -16(%rax, %rdi), %rax
> > - ret
> > -
> > - .p2align 4
> > -L(matches):
> > - bsf %eax, %eax
> > - add %rdi, %rax
> > - ret
> > -
> > - .p2align 4
> > -L(matches16):
> > - bsf %eax, %eax
> > - lea 16(%rax, %rdi), %rax
> > - ret
> > -
> > - .p2align 4
> > -L(matches32):
> > - bsf %eax, %eax
> > - lea 32(%rax, %rdi), %rax
> > + pmaxub %xmm4, %xmm5
> > + pcmpeqb %xmm1, %xmm2
> > + pmaxub %xmm3, %xmm5
> > + pmaxub %xmm2, %xmm5
> > + addq $64, %rdi
> > + pmovmskb %xmm5, %eax
> > + testl %eax, %eax
> > + je L(loop64)
> > +
> > + movdqa (%rdi), %xmm5
> > + pcmpeqb %xmm1, %xmm5
> > + pmovmskb %xmm5, %ecx
> > + pmovmskb %xmm4, %eax
> > + pmovmskb %xmm3, %esi
> > + pmovmskb %xmm2, %edx
> > + salq $32, %rsi
> > + salq $16, %rax
> > + salq $48, %rdx
> > + orq %rsi, %rax
> > + orq %rcx, %rax
> > + orq %rdx, %rax
> > +L(finish):
> > + bsfq %rax, %rax
> > + addq %rdi, %rax
> > ret
> > + .p2align 4,,10
> > + .p2align 3
> > +L(cross_page):
> >
> > - .p2align 4
> > -L(return_null):
> > - xor %rax, %rax
> > + movq %rdi, %rax
> > + andq $-64, %rax
> > + movdqa (%rax), %xmm0
> > + pcmpeqb %xmm1, %xmm0
> > + pmovmskb %xmm0, %esi
> > + movdqa 16(%rax), %xmm0
> > + pcmpeqb %xmm1, %xmm0
> > + pmovmskb %xmm0, %edx
> > + movdqa 32(%rax), %xmm0
> > + pcmpeqb %xmm1, %xmm0
> > + salq $16, %rdx
> > + pmovmskb %xmm0, %r8d
> > + movdqa 48(%rax), %xmm0
> > + pcmpeqb %xmm1, %xmm0
> > + salq $32, %r8
> > + orq %r8, %rdx
> > + pmovmskb %xmm0, %ecx
> > + orq %rsi, %rdx
> > + salq $48, %rcx
> > + orq %rcx, %rdx
> > + movl %edi, %ecx
> > + subl %eax, %ecx
> > + shrq %cl, %rdx
> > + testq %rdx, %rdx
> > + je L(align)
> > + bsfq %rdx, %rax
> > + addq %rdi, %rax
> > ret
> > -
> > END (rawmemchr)
> >
> > strong_alias (rawmemchr, __rawmemchr)
>
> --
>
> Your processor does not develop enough heat.
--
Firmware update in the coffee machine