This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
Re: [PING][PATCH v1.1] Improve rawmemchr implementation.
- From: Ondřej Bílka <neleai at seznam dot cz>
- To: libc-alpha at sourceware dot org
- Date: Sun, 8 Sep 2013 20:33:41 +0200
- Subject: Re: [PING][PATCH v1.1] Improve rawmemchr implementation.
- Authentication-results: sourceware.org; auth=none
- References: <20130816120314 dot GA25879 at domone dot kolej dot mff dot cuni dot cz> <20130816121256 dot GA26328 at domone dot kolej dot mff dot cuni dot cz> <20130902095530 dot GF11034 at domone dot kolej dot mff dot cuni dot cz>
On Mon, Sep 02, 2013 at 11:55:30AM +0200, Ondřej Bílka wrote:
> Ping,
ping
>
> I noticed that we use strong alias instead of weak, is that intentional?
>
> strong_alias (rawmemchr, __rawmemchr)
>
> There is one minor change, I misclassified case s % 4096 == 4032 as
> crossing page boundary. Following should improve performance a bit.
>
> - cmpl $4031, %eax
> + cmpl $4032, %eax
>
> On Fri, Aug 16, 2013 at 02:12:56PM +0200, Ondřej Bílka wrote:
> > The patch got accidentally filtered, so here is the correct version.
> >
> > Hi,
> >
> > I looked to rawmemchr implementation and it can be improved by using
> > similar header that is used in strchr. A loop itself was well optimized
> > so we only gain around 20 cycles per call for sizes from 64 bytes.
> >
> > The results show that this is an improvement on the unit tests, but I have
> > not yet found a program that calls rawmemchr.
> > http://kam.mff.cuni.cz/~ondra/benchmark_string/rawmemchr_profile.html
> > A benchmark is at
> > http://kam.mff.cuni.cz/~ondra/benchmark_string/rawmemchr_profile160813.tar.bz2
> >
> > Passes test, OK to commit?
> >
> * sysdeps/x86_64/rawmemchr.S (rawmemchr): Optimize implementation.
>
> ---
> sysdeps/x86_64/rawmemchr.S | 252 +++++++++++++++------------------------------
> 1 file changed, 85 insertions(+), 167 deletions(-)
>
> diff --git a/sysdeps/x86_64/rawmemchr.S b/sysdeps/x86_64/rawmemchr.S
> index f4d5591..93b8f4f 100644
> --- a/sysdeps/x86_64/rawmemchr.S
> +++ b/sysdeps/x86_64/rawmemchr.S
> @@ -22,185 +22,103 @@
>
> .text
> ENTRY (rawmemchr)
> - movd %rsi, %xmm1
> - mov %rdi, %rcx
> -
> - punpcklbw %xmm1, %xmm1
> - punpcklbw %xmm1, %xmm1
> -
> - and $63, %rcx
> + movd %esi, %xmm1
> + movq %rdi, %rax
> + andl $4095, %eax
> + punpcklbw %xmm1, %xmm1
> + cmpl $4032, %eax
> + punpcklwd %xmm1, %xmm1
> pshufd $0, %xmm1, %xmm1
> -
> - cmp $48, %rcx
> - ja L(crosscache)
> -
> + jg L(cross_page)
> movdqu (%rdi), %xmm0
> pcmpeqb %xmm1, %xmm0
> -/* Check if there is a match. */
> - pmovmskb %xmm0, %eax
> - test %eax, %eax
> -
> - jnz L(matches)
> - add $16, %rdi
> - and $-16, %rdi
> - jmp L(loop_prolog)
> -
> - .p2align 4
> -L(crosscache):
> - and $15, %rcx
> - and $-16, %rdi
> - movdqa (%rdi), %xmm0
> -
> - pcmpeqb %xmm1, %xmm0
> -/* Check if there is a match. */
> - pmovmskb %xmm0, %eax
> -/* Remove the leading bytes. */
> - sar %cl, %eax
> + pmovmskb %xmm0, %eax
> test %eax, %eax
> - je L(unaligned_no_match)
> -/* Check which byte is a match. */
> - bsf %eax, %eax
> -
> - add %rdi, %rax
> - add %rcx, %rax
> - ret
> -
> - .p2align 4
> -L(unaligned_no_match):
> - add $16, %rdi
> -
> - .p2align 4
> -L(loop_prolog):
> - movdqa (%rdi), %xmm0
> - pcmpeqb %xmm1, %xmm0
> - pmovmskb %xmm0, %eax
> - test %eax, %eax
> - jnz L(matches)
> -
> - movdqa 16(%rdi), %xmm2
> - pcmpeqb %xmm1, %xmm2
> - pmovmskb %xmm2, %eax
> - test %eax, %eax
> - jnz L(matches16)
> -
> - movdqa 32(%rdi), %xmm3
> + jne L(finish)
> + movdqu 16(%rdi), %xmm3
> + movdqu 32(%rdi), %xmm2
> pcmpeqb %xmm1, %xmm3
> - pmovmskb %xmm3, %eax
> - test %eax, %eax
> - jnz L(matches32)
> -
> - movdqa 48(%rdi), %xmm4
> - pcmpeqb %xmm1, %xmm4
> - add $64, %rdi
> - pmovmskb %xmm4, %eax
> - test %eax, %eax
> - jnz L(matches0)
> -
> - test $0x3f, %rdi
> - jz L(align64_loop)
> -
> - movdqa (%rdi), %xmm0
> - pcmpeqb %xmm1, %xmm0
> - pmovmskb %xmm0, %eax
> - test %eax, %eax
> - jnz L(matches)
> -
> - movdqa 16(%rdi), %xmm2
> + movdqu 48(%rdi), %xmm0
> pcmpeqb %xmm1, %xmm2
> - pmovmskb %xmm2, %eax
> - test %eax, %eax
> - jnz L(matches16)
> -
> - movdqa 32(%rdi), %xmm3
> - pcmpeqb %xmm1, %xmm3
> - pmovmskb %xmm3, %eax
> - test %eax, %eax
> - jnz L(matches32)
> -
> - movdqa 48(%rdi), %xmm3
> - pcmpeqb %xmm1, %xmm3
> - pmovmskb %xmm3, %eax
> -
> - add $64, %rdi
> - test %eax, %eax
> - jnz L(matches0)
> -
> - and $-64, %rdi
> -
> - .p2align 4
> -L(align64_loop):
> - movdqa (%rdi), %xmm0
> - movdqa 16(%rdi), %xmm2
> - movdqa 32(%rdi), %xmm3
> - movdqa 48(%rdi), %xmm4
> -
> + pmovmskb %xmm3, %edx
> pcmpeqb %xmm1, %xmm0
> - pcmpeqb %xmm1, %xmm2
> - pcmpeqb %xmm1, %xmm3
> + pmovmskb %xmm2, %eax
> + pmovmskb %xmm0, %ecx
> + salq $16, %rdx
> + salq $32, %rax
> + orq %rdx, %rax
> + movq %rcx, %rdx
> + salq $48, %rdx
> + orq %rdx, %rax
> + jne L(finish)
> +L(align):
> + andq $-64, %rdi
> + .p2align 4
> +L(loop64):
> + movdqa 64(%rdi), %xmm5
> + movdqa 80(%rdi), %xmm4
> + pcmpeqb %xmm1, %xmm5
> + movdqa 96(%rdi), %xmm3
> pcmpeqb %xmm1, %xmm4
> -
> - pmaxub %xmm0, %xmm3
> - pmaxub %xmm2, %xmm4
> - pmaxub %xmm3, %xmm4
> - pmovmskb %xmm4, %eax
> -
> - add $64, %rdi
> -
> - test %eax, %eax
> - jz L(align64_loop)
> -
> - sub $64, %rdi
> -
> - pmovmskb %xmm0, %eax
> - test %eax, %eax
> - jnz L(matches)
> -
> - pmovmskb %xmm2, %eax
> - test %eax, %eax
> - jnz L(matches16)
> -
> - movdqa 32(%rdi), %xmm3
> + movdqa 112(%rdi), %xmm2
> pcmpeqb %xmm1, %xmm3
> -
> - pcmpeqb 48(%rdi), %xmm1
> - pmovmskb %xmm3, %eax
> - test %eax, %eax
> - jnz L(matches32)
> -
> - pmovmskb %xmm1, %eax
> - bsf %eax, %eax
> - lea 48(%rdi, %rax), %rax
> - ret
> -
> - .p2align 4
> -L(matches0):
> - bsf %eax, %eax
> - lea -16(%rax, %rdi), %rax
> - ret
> -
> - .p2align 4
> -L(matches):
> - bsf %eax, %eax
> - add %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(matches16):
> - bsf %eax, %eax
> - lea 16(%rax, %rdi), %rax
> - ret
> -
> - .p2align 4
> -L(matches32):
> - bsf %eax, %eax
> - lea 32(%rax, %rdi), %rax
> + pmaxub %xmm4, %xmm5
> + pcmpeqb %xmm1, %xmm2
> + pmaxub %xmm3, %xmm5
> + pmaxub %xmm2, %xmm5
> + addq $64, %rdi
> + pmovmskb %xmm5, %eax
> + testl %eax, %eax
> + je L(loop64)
> +
> + movdqa (%rdi), %xmm5
> + pcmpeqb %xmm1, %xmm5
> + pmovmskb %xmm5, %ecx
> + pmovmskb %xmm4, %eax
> + pmovmskb %xmm3, %esi
> + pmovmskb %xmm2, %edx
> + salq $32, %rsi
> + salq $16, %rax
> + salq $48, %rdx
> + orq %rsi, %rax
> + orq %rcx, %rax
> + orq %rdx, %rax
> +L(finish):
> + bsfq %rax, %rax
> + addq %rdi, %rax
> ret
> + .p2align 4,,10
> + .p2align 3
> +L(cross_page):
>
> - .p2align 4
> -L(return_null):
> - xor %rax, %rax
> + movq %rdi, %rax
> + andq $-64, %rax
> + movdqa (%rax), %xmm0
> + pcmpeqb %xmm1, %xmm0
> + pmovmskb %xmm0, %esi
> + movdqa 16(%rax), %xmm0
> + pcmpeqb %xmm1, %xmm0
> + pmovmskb %xmm0, %edx
> + movdqa 32(%rax), %xmm0
> + pcmpeqb %xmm1, %xmm0
> + salq $16, %rdx
> + pmovmskb %xmm0, %r8d
> + movdqa 48(%rax), %xmm0
> + pcmpeqb %xmm1, %xmm0
> + salq $32, %r8
> + orq %r8, %rdx
> + pmovmskb %xmm0, %ecx
> + orq %rsi, %rdx
> + salq $48, %rcx
> + orq %rcx, %rdx
> + movl %edi, %ecx
> + subl %eax, %ecx
> + shrq %cl, %rdx
> + testq %rdx, %rdx
> + je L(align)
> + bsfq %rdx, %rax
> + addq %rdi, %rax
> ret
> -
> END (rawmemchr)
>
> strong_alias (rawmemchr, __rawmemchr)
--
Your processor does not develop enough heat.