This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.
[PING][PATCH v1.1] Optimize strrchr more.
- From: Ondřej Bílka <neleai at seznam dot cz>
- To: libc-alpha at sourceware dot org
- Date: Sat, 8 Feb 2014 01:35:28 +0100
- Subject: [PING][PATCH v1.1] Optimize strrchr more.
- Authentication-results: sourceware.org; auth=none
- References: <20131004201522 dot GA6269 at domone> <20131005063932 dot GA12657 at domone>
ping
On Sat, Oct 05, 2013 at 08:39:32AM +0200, Ondřej Bílka wrote:
> On Fri, Oct 04, 2013 at 10:15:22PM +0200, Ondřej Bílka wrote:
> > Hi,
> >
> > I played with my evolutionary algorithms to optimize various functions.
> > It helped a bit, but I got more by looking at the code again, because
> > I noticed several opportunities that I had missed.
> >
> > The first is that the best way to test whether bytes are zero is to
> > prepare a zero register in advance. Assume you have the code
> >
> > pxor %xmm3, %xmm3
> > movdqa %xmm1, %xmm2
> > pcmpeqb %xmm3, %xmm1 /*get zero mask */
> > # do something with xmm2
> >
> > This could be changed to the following, which saves the movdqa copy:
> >
> > pxor %xmm1, %xmm1
> > pcmpeqb %xmm2, %xmm1 /*get zero mask */
> > # do something with xmm2
> >
> > The second improvement is using 32-byte registers where beneficial.
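> >
> > As an illustrative sketch (not part of this patch), an AVX2 version of
> > the zero test above would cover 32 bytes at once:
> >
> > vpxor %ymm0, %ymm0, %ymm0 /* prepare zero register */
> > vmovdqu (%rdi), %ymm2 /* load 32 bytes */
> > vpcmpeqb %ymm2, %ymm0, %ymm3 /* get zero mask */
> > vpmovmskb %ymm3, %eax /* one mask bit per byte */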
> >
> > And last, in the previous iteration I did not thoroughly check the
> > loop generated by gcc. A CSE pass added four extra moves that are
> > useful only after exiting the loop. By removing these we gained around
> > 10% on most architectures for large inputs.
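> >
> > Schematically (an illustrative sketch, not the actual gcc output), the
> > loop kept copies alive like this:
> >
> > L(loop):
> > movdqa %xmm2, %xmm8 /* copy consumed only after the loop */
> > ...
> > jne L(loop)
> >
> > The patch drops such copies and instead recomputes the needed masks
> > once after leaving the loop.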
> >
> > Then there is the evolver itself, which rearranges the scheduling into
> > a faster one, but its effect is relatively small (1%-2%).
> >
> > OK to commit?
> >
> I introduced a typo during manual editing: I tested the wrong register
> at the exit from the loop.
>
> Here is the fixed version.
>
> * sysdeps/x86_64/strrchr.S: Optimize implementation.
>
> diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S
> index 514765b..d532206 100644
> --- a/sysdeps/x86_64/strrchr.S
> +++ b/sysdeps/x86_64/strrchr.S
> @@ -26,61 +26,72 @@
>
> .text
> ENTRY (strrchr)
> + pxor %xmm6, %xmm6
> movd %esi, %xmm1
> + pxor %xmm7, %xmm7
> movq %rdi, %rax
> +#ifdef USE_SSSE3
> andl $4095, %eax
> - punpcklbw %xmm1, %xmm1
> - cmpq $4032, %rax
> - punpcklwd %xmm1, %xmm1
> + pxor %xmm5, %xmm5
> + pxor %xmm4, %xmm4
> + cmp $4032, %eax
> + pshufb %xmm7, %xmm1
> +#else
> + punpcklbw %xmm1, %xmm1
> + andl $4095, %eax
> + pxor %xmm5, %xmm5
> + pxor %xmm4, %xmm4
> + cmp $4032, %eax
> + punpcklwd %xmm1, %xmm1
> pshufd $0, %xmm1, %xmm1
> +#endif
> ja L(cross_page)
> movdqu (%rdi), %xmm0
> - pxor %xmm2, %xmm2
> - movdqa %xmm0, %xmm3
> + pcmpeqb %xmm0, %xmm4
> pcmpeqb %xmm1, %xmm0
> - pcmpeqb %xmm2, %xmm3
> - pmovmskb %xmm0, %ecx
> - pmovmskb %xmm3, %edx
> - testq %rdx, %rdx
> + pmovmskb %xmm0, %ecx
> + pmovmskb %xmm4, %edx
> + test %edx, %edx
> je L(next_48_bytes)
> - leaq -1(%rdx), %rax
> - xorq %rdx, %rax
> - andq %rcx, %rax
> + lea -1(%edx), %eax
> + xor %edx, %eax
> + and %ecx, %eax
> je L(exit)
> - bsrq %rax, %rax
> + bsr %eax, %eax
> addq %rdi, %rax
> - ret
> + ret
> +
> +L(exit):
> + xorl %eax, %eax
> + ret
>
> - ALIGN(4)
> + .p2align 4
> L(next_48_bytes):
> - movdqu 16(%rdi), %xmm4
> - movdqa %xmm4, %xmm5
> movdqu 32(%rdi), %xmm3
> - pcmpeqb %xmm1, %xmm4
> + movdqu 48(%rdi), %xmm4
> + pcmpeqb %xmm4, %xmm7
> + movdqu 16(%rdi), %xmm2
> pcmpeqb %xmm2, %xmm5
> - movdqu 48(%rdi), %xmm0
> - pmovmskb %xmm5, %edx
> - movdqa %xmm3, %xmm5
> + pmovmskb %xmm5, %eax
> + pcmpeqb %xmm3, %xmm6
> + pcmpeqb %xmm1, %xmm2
> + pmovmskb %xmm2, %esi
> + salq $16, %rsi
> pcmpeqb %xmm1, %xmm3
> - pcmpeqb %xmm2, %xmm5
> - pcmpeqb %xmm0, %xmm2
> - salq $16, %rdx
> - pmovmskb %xmm3, %r8d
> - pmovmskb %xmm5, %eax
> - pmovmskb %xmm2, %esi
> + pcmpeqb %xmm1, %xmm4
> + pmovmskb %xmm4, %edx
> + orq %rcx, %rsi
> + pmovmskb %xmm6, %ecx
> + pmovmskb %xmm3, %r8d
> + salq $16, %rax
> + salq $32, %rcx
> salq $32, %r8
> - salq $32, %rax
> - pcmpeqb %xmm1, %xmm0
> - orq %rdx, %rax
> - movq %rsi, %rdx
> - pmovmskb %xmm4, %esi
> salq $48, %rdx
> - salq $16, %rsi
> + orq %rcx, %rax
> orq %r8, %rsi
> - orq %rcx, %rsi
> - pmovmskb %xmm0, %ecx
> - salq $48, %rcx
> - orq %rcx, %rsi
> + orq %rdx, %rsi
> + pmovmskb %xmm7, %edx
> + salq $48, %rdx
> orq %rdx, %rax
> je L(loop_header2)
> leaq -1(%rax), %rcx
> @@ -88,71 +99,69 @@ L(next_48_bytes):
> andq %rcx, %rsi
> je L(exit)
> bsrq %rsi, %rsi
> - leaq (%rdi,%rsi), %rax
> - ret
> + leaq (%rdi, %rsi), %rax
> + ret
>
> - ALIGN(4)
> + .p2align 3
> L(loop_header2):
> testq %rsi, %rsi
> movq %rdi, %rcx
> - je L(no_c_found)
> + jne L(loop_header)
> + movl $1, %esi /* Evaluates to null. */
> + xorl %ecx, %ecx
> L(loop_header):
> addq $64, %rdi
> pxor %xmm7, %xmm7
> andq $-64, %rdi
> jmp L(loop_entry)
>
> - ALIGN(4)
> + .p2align 3
> L(loop64):
> testq %rdx, %rdx
> cmovne %rdx, %rsi
> cmovne %rdi, %rcx
> addq $64, %rdi
> L(loop_entry):
> - movdqa 32(%rdi), %xmm3
> - pxor %xmm6, %xmm6
> - movdqa 48(%rdi), %xmm2
> - movdqa %xmm3, %xmm0
> - movdqa 16(%rdi), %xmm4
> - pminub %xmm2, %xmm0
> - movdqa (%rdi), %xmm5
> - pminub %xmm4, %xmm0
> - pminub %xmm5, %xmm0
> - pcmpeqb %xmm7, %xmm0
> - pmovmskb %xmm0, %eax
> - movdqa %xmm5, %xmm0
> - pcmpeqb %xmm1, %xmm0
> - pmovmskb %xmm0, %r9d
> - movdqa %xmm4, %xmm0
> - pcmpeqb %xmm1, %xmm0
> - pmovmskb %xmm0, %edx
> - movdqa %xmm3, %xmm0
> - pcmpeqb %xmm1, %xmm0
> + movdqa 16(%rdi), %xmm3
> + movdqa 48(%rdi), %xmm5
> + movdqa (%rdi), %xmm2
> + movdqa %xmm2, %xmm6
> + pminub %xmm3, %xmm6
> + pcmpeqb %xmm1, %xmm3
> + movdqa 32(%rdi), %xmm4
> + pminub %xmm4, %xmm6
> + pminub %xmm5, %xmm6
> + pmovmskb %xmm3, %edx
> + pcmpeqb %xmm7, %xmm6
> + pmovmskb %xmm6, %eax
> + pcmpeqb %xmm1, %xmm4
> + pmovmskb %xmm4, %r10d
> + pcmpeqb %xmm1, %xmm5
> + pcmpeqb %xmm1, %xmm2
> + pmovmskb %xmm5, %r8d
> salq $16, %rdx
> - pmovmskb %xmm0, %r10d
> - movdqa %xmm2, %xmm0
> - pcmpeqb %xmm1, %xmm0
> salq $32, %r10
> orq %r10, %rdx
> - pmovmskb %xmm0, %r8d
> + pmovmskb %xmm2, %r9d
> orq %r9, %rdx
> salq $48, %r8
> orq %r8, %rdx
> testl %eax, %eax
> je L(loop64)
> - pcmpeqb %xmm6, %xmm4
> - pcmpeqb %xmm6, %xmm3
> - pcmpeqb %xmm6, %xmm5
> - pmovmskb %xmm4, %eax
> - pmovmskb %xmm3, %r10d
> - pcmpeqb %xmm6, %xmm2
> - pmovmskb %xmm5, %r9d
> + movdqa 32(%rdi), %xmm4
> + salq $48, %rax
> + pcmpeqb %xmm7, %xmm4
> + movdqa 16(%rdi), %xmm3
> + pmovmskb %xmm4, %r10d
> + pcmpeqb %xmm7, %xmm3
> + movdqa (%rdi), %xmm2
> + pmovmskb %xmm3, %r9d
> salq $32, %r10
> - salq $16, %rax
> - pmovmskb %xmm2, %r8d
> + pcmpeqb %xmm7, %xmm2
> + pmovmskb %xmm2, %r8d
> orq %r10, %rax
> + salq $16, %r9
> orq %r9, %rax
> - salq $48, %r8
> orq %r8, %rax
> leaq -1(%rax), %r8
> xorq %rax, %r8
> @@ -160,59 +169,50 @@ L(loop_entry):
> cmovne %rdi, %rcx
> cmovne %rdx, %rsi
> bsrq %rsi, %rsi
> - leaq (%rcx,%rsi), %rax
> - ret
> + leaq (%rcx, %rsi), %rax
> + ret
>
> - ALIGN(4)
> -L(no_c_found):
> - movl $1, %esi
> - xorl %ecx, %ecx
> - jmp L(loop_header)
>
> - ALIGN(4)
> -L(exit):
> - xorl %eax, %eax
> - ret
>
> - ALIGN(4)
> + .p2align 2
> L(cross_page):
> - movq %rdi, %rax
> pxor %xmm0, %xmm0
> + movq %rdi, %rax
> andq $-64, %rax
> + movdqu 48(%rax), %xmm2
> movdqu (%rax), %xmm5
> - movdqa %xmm5, %xmm6
> movdqu 16(%rax), %xmm4
> - pcmpeqb %xmm1, %xmm5
> + movdqa %xmm5, %xmm6
> pcmpeqb %xmm0, %xmm6
> - movdqu 32(%rax), %xmm3
> - pmovmskb %xmm6, %esi
> + pmovmskb %xmm6, %esi
> movdqa %xmm4, %xmm6
> - movdqu 48(%rax), %xmm2
> - pcmpeqb %xmm1, %xmm4
> + movdqu 32(%rax), %xmm3
> pcmpeqb %xmm0, %xmm6
> - pmovmskb %xmm6, %edx
> + pmovmskb %xmm6, %edx
> movdqa %xmm3, %xmm6
> - pcmpeqb %xmm1, %xmm3
> + salq $16, %rdx
> pcmpeqb %xmm0, %xmm6
> + pcmpeqb %xmm1, %xmm3
> + pmovmskb %xmm6, %r8d
> pcmpeqb %xmm2, %xmm0
> - salq $16, %rdx
> - pmovmskb %xmm3, %r9d
> - pmovmskb %xmm6, %r8d
> - pmovmskb %xmm0, %ecx
> - salq $32, %r9
> + pmovmskb %xmm0, %ecx
> + pmovmskb %xmm3, %r9d
> + salq $48, %rcx
> salq $32, %r8
> - pcmpeqb %xmm1, %xmm2
> + pcmpeqb %xmm1, %xmm4
> orq %r8, %rdx
> - salq $48, %rcx
> - pmovmskb %xmm5, %r8d
> + pcmpeqb %xmm1, %xmm2
> orq %rsi, %rdx
> - pmovmskb %xmm4, %esi
> + pmovmskb %xmm4, %esi
> orq %rcx, %rdx
> - pmovmskb %xmm2, %ecx
> + pmovmskb %xmm2, %ecx
> + pcmpeqb %xmm1, %xmm5
> salq $16, %rsi
> - salq $48, %rcx
> + pmovmskb %xmm5, %r8d
> + salq $32, %r9
> orq %r9, %rsi
> orq %r8, %rsi
> + salq $48, %rcx
> orq %rcx, %rsi
> movl %edi, %ecx
> subl %eax, %ecx
> @@ -226,7 +226,7 @@ L(cross_page):
> je L(exit)
> bsrq %rsi, %rax
> addq %rdi, %rax
> - ret
> + ret
> END (strrchr)
>
> weak_alias (strrchr, rindex)
> --
> 1.8.4.rc3
>
>
--
broadcast packets on wrong frequency