This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.
[PING][PATCH] Optimize strchr more.
- From: Ondřej Bílka <neleai at seznam dot cz>
- To: libc-alpha at sourceware dot org
- Date: Sat, 8 Feb 2014 01:35:58 +0100
- Subject: [PING][PATCH] Optimize strchr more.
- References: <20131004211010 dot GA7538 at domone>
ping
On Fri, Oct 04, 2013 at 11:10:10PM +0200, Ondřej Bílka wrote:
> Hi,
> I improved strchr in a similar way to strrchr. One difference is that,
> as selection is done via ifunc, we can use pshufb to broadcast the
> byte where SSSE3 is available. This gives us a few cycles almost for
> free.
>
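> As a rough C-intrinsics sketch of that broadcast (illustrative only,
> the patch does this in assembly; USE_SSSE3 mirrors the macro in the
> patch below):
>
> #include <emmintrin.h>   /* SSE2 */
> #include <tmmintrin.h>   /* SSSE3: pshufb */
>
> static __m128i
> broadcast_byte (int c)
> {
>   __m128i b = _mm_cvtsi32_si128 (c);            /* movd */
> #ifdef USE_SSSE3
>   /* pshufb with an all-zero control copies byte 0 to all 16 lanes.  */
>   return _mm_shuffle_epi8 (b, _mm_setzero_si128 ());
> #else
>   /* SSE2 has to widen the byte step by step.  */
>   b = _mm_unpacklo_epi8 (b, b);                 /* punpcklbw */
>   b = _mm_unpacklo_epi16 (b, b);                /* punpcklwd */
>   return _mm_shuffle_epi32 (b, 0);              /* pshufd $0 */
> #endif
> }
>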
> I did the same optimizations as for strrchr, plus one more. In the
> loop we check whether any of four 16-byte blocks contains a match and
> calculate a mask for them. As x86-64 SSE2 instructions are destructive,
> it looked faster to recalculate one block, since otherwise the extra
> moves would slow us down, as here:
>
> uint64_t m0, m1, m2, m3;
> if (m0 | m1 | m2 | m3)
>   {
>     /* recalculate m0 */
>     mask = m0 | m1 << 16 | m2 << 32 | m3 << 48;
>     ...
>
> But this is not necessary when we are interested only in the first
> nonzero bit. We can use the combined mask for the final 16 bits: it
> only comes into play when m0, m1 and m2 are all zero, and in that case
> it equals m3, as in the following pseudocode:
>
> uint64_t m0, m1, m2, m3;
> if (m0 | m1 | m2 | m3)
>   {
>     mask = m0 | m1 << 16 | m2 << 32 | (m0 | m1 | m2 | m3) << 48;
>     ...
>
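> A minimal C sketch of the equivalence (names are illustrative, not
> from the patch; each mask is assumed to fit in 16 bits, and the guard
> above ensures at least one bit is set):
>
> #include <stdint.h>
>
> static int
> first_match (uint64_t m0, uint64_t m1, uint64_t m2, uint64_t m3)
> {
>   /* If any of m0, m1, m2 is nonzero, the low 48 bits are nonzero and
>      ctz never reaches the top 16 bits, so the extra bits there are
>      harmless.  If all three are zero, (m0|m1|m2|m3) == m3 and the
>      top 16 bits are exactly m3 << 48.  */
>   uint64_t mask = m0 | m1 << 16 | m2 << 32 | (m0 | m1 | m2 | m3) << 48;
>   return __builtin_ctzll (mask);       /* bsfq */
> }
>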
> These modifications give us around a 4% improvement; benchmarks are here:
>
> http://kam.mff.cuni.cz/~ondra/benchmark_string/strchr_profile.html
>
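> (For reference, the per-block match test that L(loop64) below performs
> with pxor/pminub, written as a C-intrinsics sketch rather than the
> assembly in the patch:)
>
> #include <emmintrin.h>
>
> /* Result has a zero byte exactly where BLOCK contains either the
>    broadcast character C1 or a terminating NUL; pcmpeqb against zero
>    plus pmovmskb then turns that into a bit mask.  */
> static __m128i
> block_matches (__m128i block, __m128i c1)
> {
>   __m128i x = _mm_xor_si128 (block, c1);  /* 0 where block == c1 */
>   return _mm_min_epu8 (x, block);         /* 0 also where block == 0 */
> }
>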
> * sysdeps/x86_64/multiarch/Makefile (sysdep_routines):
> Add strchr-ssse3.
> * sysdeps/x86_64/multiarch/strchr-ssse3.S: New file.
> * sysdeps/x86_64/multiarch/strchr.S: Add ifunc.
> * sysdeps/x86_64/strchr.S: Optimize implementation.
>
> ---
> sysdeps/x86_64/multiarch/Makefile | 2 +-
> sysdeps/x86_64/multiarch/strchr-ssse3.S | 3 +
> sysdeps/x86_64/multiarch/strchr.S | 4 +
> sysdeps/x86_64/strchr.S | 228 ++++++++++++++++----------------
> 4 files changed, 122 insertions(+), 115 deletions(-)
> create mode 100644 sysdeps/x86_64/multiarch/strchr-ssse3.S
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index f910fb2..042f501 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -16,7 +16,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
> strcpy-sse2-unaligned strncpy-sse2-unaligned \
> stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
> strcat-sse2-unaligned strncat-sse2-unaligned \
> - strchr-sse2-no-bsf memcmp-ssse3
> + strchr-sse2-no-bsf strchr-ssse3 memcmp-ssse3
> ifeq (yes,$(config-cflags-sse4))
> sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift
> CFLAGS-varshift.c += -msse4
> diff --git a/sysdeps/x86_64/multiarch/strchr-ssse3.S b/sysdeps/x86_64/multiarch/strchr-ssse3.S
> new file mode 100644
> index 0000000..6d09015
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strchr-ssse3.S
> @@ -0,0 +1,3 @@
> +#define USE_SSSE3
> +#define strchr __strchr_ssse3
> +#include "../strchr.S"
> diff --git a/sysdeps/x86_64/multiarch/strchr.S b/sysdeps/x86_64/multiarch/strchr.S
> index 3f0b2c5..2b67ca4 100644
> --- a/sysdeps/x86_64/multiarch/strchr.S
> +++ b/sysdeps/x86_64/multiarch/strchr.S
> @@ -29,6 +29,10 @@ ENTRY(strchr)
> jne 1f
> call __init_cpu_features
> 1: leaq __strchr_sse2(%rip), %rax
> + testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
> + je 2f
> + leaq __strchr_ssse3(%rip), %rax
> + ret
> 2: testl $bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
> jz 3f
> leaq __strchr_sse2_no_bsf(%rip), %rax
> diff --git a/sysdeps/x86_64/strchr.S b/sysdeps/x86_64/strchr.S
> index 1900b37..0bb45d2 100644
> --- a/sysdeps/x86_64/strchr.S
> +++ b/sysdeps/x86_64/strchr.S
> @@ -19,174 +19,174 @@
>
> #include <sysdep.h>
>
> -# ifndef ALIGN
> -# define ALIGN(n) .p2align n
> -# endif
> -
> -
> .text
> ENTRY (strchr)
> movd %esi, %xmm1
> movl %edi, %eax
> + pxor %xmm4, %xmm4
> andl $4095, %eax
> + pxor %xmm5, %xmm5
> +#ifdef USE_SSSE3
> + pshufb %xmm4, %xmm1
> +#else
> punpcklbw %xmm1, %xmm1
> - cmpl $4032, %eax
> punpcklwd %xmm1, %xmm1
> pshufd $0, %xmm1, %xmm1
> +#endif
> + cmpl $4032, %eax
> jg L(cross_page)
> movdqu (%rdi), %xmm0
> - pxor %xmm3, %xmm3
> - movdqa %xmm0, %xmm4
> + pcmpeqb %xmm0, %xmm4
> pcmpeqb %xmm1, %xmm0
> - pcmpeqb %xmm3, %xmm4
> por %xmm4, %xmm0
> pmovmskb %xmm0, %eax
> test %eax, %eax
> je L(next_48_bytes)
> - bsf %eax, %eax
> + bsf %eax, %eax
> #ifdef AS_STRCHRNUL
> - leaq (%rdi,%rax), %rax
> + addq %rdi, %rax
> #else
> - movl $0, %edx
> - leaq (%rdi,%rax), %rax
> + xor %edx, %edx
> + addq %rdi, %rax
> cmpb %sil, (%rax)
> cmovne %rdx, %rax
> #endif
> ret
>
> - ALIGN(3)
> - L(next_48_bytes):
> + .p2align 3
> +L(next_48_bytes):
> + pxor %xmm6, %xmm6
> + pxor %xmm7, %xmm7
> + movdqu 48(%rdi), %xmm3
> + movdqu 32(%rdi), %xmm2
> + pcmpeqb %xmm3, %xmm7
> movdqu 16(%rdi), %xmm0
> - movdqa %xmm0, %xmm4
> - pcmpeqb %xmm1, %xmm0
> - pcmpeqb %xmm3, %xmm4
> - por %xmm4, %xmm0
> - pmovmskb %xmm0, %ecx
> - movdqu 32(%rdi), %xmm0
> - movdqa %xmm0, %xmm4
> + movq %rdi, %r8
> + pcmpeqb %xmm1, %xmm3
> + andq $-64, %rdi
> + por %xmm7, %xmm3
> + pcmpeqb %xmm2, %xmm6
> + pcmpeqb %xmm1, %xmm2
> + pcmpeqb %xmm0, %xmm5
> pcmpeqb %xmm1, %xmm0
> - salq $16, %rcx
> - pcmpeqb %xmm3, %xmm4
> - por %xmm4, %xmm0
> + por %xmm6, %xmm2
> + pmovmskb %xmm3, %edx
> + pmovmskb %xmm2, %ecx
> + shlq $32, %rdx
> + shlq $16, %rcx
> + por %xmm5, %xmm0
> pmovmskb %xmm0, %eax
> - movdqu 48(%rdi), %xmm0
> - pcmpeqb %xmm0, %xmm3
> - salq $32, %rax
> - pcmpeqb %xmm1, %xmm0
> - orq %rcx, %rax
> - por %xmm3, %xmm0
> - pmovmskb %xmm0, %ecx
> - salq $48, %rcx
> orq %rcx, %rax
> - testq %rax, %rax
> - jne L(return)
> -L(loop_start):
> - /* We use this alignment to force loop be aligned to 8 but not
> - 16 bytes. This gives better sheduling on AMD processors. */
> - ALIGN(4)
> pxor %xmm6, %xmm6
> - andq $-64, %rdi
> - ALIGN(3)
> + orq %rdx, %rax
> + je L(loop64)
> + bsfq %rax, %rax
> +#ifdef AS_STRCHRNUL
> + leaq 16(%r8, %rax), %rax
> +#else
> + xor %edx, %edx
> + leaq 16(%r8, %rax), %rax
> + cmpb %sil, (%rax)
> + cmovne %rdx, %rax
> +#endif
> + ret
> +
> + .p2align 4
> L(loop64):
> - addq $64, %rdi
> - movdqa (%rdi), %xmm5
> - movdqa 16(%rdi), %xmm2
> - movdqa 32(%rdi), %xmm3
> + movdqa 96(%rdi), %xmm3
> + movdqa 64(%rdi), %xmm5
> pxor %xmm1, %xmm5
> - movdqa 48(%rdi), %xmm4
> - pxor %xmm1, %xmm2
> - pxor %xmm1, %xmm3
> - pminub (%rdi), %xmm5
> + movdqa 112(%rdi), %xmm4
> + movdqa 80(%rdi), %xmm2
> pxor %xmm1, %xmm4
> - pminub 16(%rdi), %xmm2
> - pminub 32(%rdi), %xmm3
> - pminub %xmm2, %xmm5
> - pminub 48(%rdi), %xmm4
> - pminub %xmm3, %xmm5
> - pminub %xmm4, %xmm5
> - pcmpeqb %xmm6, %xmm5
> - pmovmskb %xmm5, %eax
> -
> - testl %eax, %eax
> + pminub 112(%rdi), %xmm4
> + pminub 64(%rdi), %xmm5
> + pxor %xmm1, %xmm3
> + pxor %xmm1, %xmm2
> + pminub %xmm5, %xmm4
> + pminub 96(%rdi), %xmm3
> + pminub 80(%rdi), %xmm2
> + addq $64, %rdi
> + pminub %xmm3, %xmm4
> + pminub %xmm2, %xmm4
> + pcmpeqb %xmm6, %xmm4
> + pmovmskb %xmm4, %edx
> + testl %edx, %edx
> je L(loop64)
> -
> - movdqa (%rdi), %xmm5
> - movdqa %xmm5, %xmm0
> - pcmpeqb %xmm1, %xmm5
> - pcmpeqb %xmm6, %xmm0
> - por %xmm0, %xmm5
> - pcmpeqb %xmm6, %xmm2
> - pcmpeqb %xmm6, %xmm3
> - pcmpeqb %xmm6, %xmm4
> -
> - pmovmskb %xmm5, %ecx
> + pcmpeqb %xmm6, %xmm3
> + pmovmskb %xmm3, %r8d
> + pcmpeqb %xmm6, %xmm2
> + pcmpeqb %xmm6, %xmm5
> pmovmskb %xmm2, %eax
> + pmovmskb %xmm5, %ecx
> salq $16, %rax
> - pmovmskb %xmm3, %r8d
> - pmovmskb %xmm4, %edx
> salq $32, %r8
> + salq $48, %rdx
> orq %r8, %rax
> orq %rcx, %rax
> - salq $48, %rdx
> orq %rdx, %rax
> - ALIGN(3)
> -L(return):
> bsfq %rax, %rax
> #ifdef AS_STRCHRNUL
> - leaq (%rdi,%rax), %rax
> + leaq (%rdi, %rax), %rax
> #else
> - movl $0, %edx
> - leaq (%rdi,%rax), %rax
> + xor %edx, %edx
> + leaq (%rdi, %rax), %rax
> cmpb %sil, (%rax)
> cmovne %rdx, %rax
> #endif
> ret
> - ALIGN(4)
>
> + .p2align 4
> L(cross_page):
> - movq %rdi, %rdx
> - pxor %xmm2, %xmm2
> - andq $-64, %rdx
> - movdqa %xmm1, %xmm0
> - movdqa (%rdx), %xmm3
> - movdqa %xmm3, %xmm4
> - pcmpeqb %xmm1, %xmm3
> - pcmpeqb %xmm2, %xmm4
> - por %xmm4, %xmm3
> - pmovmskb %xmm3, %r8d
> - movdqa 16(%rdx), %xmm3
> - movdqa %xmm3, %xmm4
> - pcmpeqb %xmm1, %xmm3
> - pcmpeqb %xmm2, %xmm4
> - por %xmm4, %xmm3
> - pmovmskb %xmm3, %eax
> - movdqa 32(%rdx), %xmm3
> - movdqa %xmm3, %xmm4
> + pxor %xmm7, %xmm7
> + movq %rdi, %rcx
> + andq $-64, %rdi
> + movdqu 48(%rdi), %xmm3
> + movdqu (%rdi), %xmm8
> + movdqu 32(%rdi), %xmm2
> + pxor %xmm6, %xmm6
> + pcmpeqb %xmm8, %xmm4
> + movdqu 16(%rdi), %xmm0
> + pcmpeqb %xmm1, %xmm8
> + pcmpeqb %xmm0, %xmm5
> + pcmpeqb %xmm1, %xmm0
> + por %xmm5, %xmm0
> + por %xmm4, %xmm8
> + pcmpeqb %xmm2, %xmm6
> + pcmpeqb %xmm1, %xmm2
> + por %xmm6, %xmm2
> + pmovmskb %xmm8, %edx
> + pcmpeqb %xmm3, %xmm7
> pcmpeqb %xmm1, %xmm3
> - salq $16, %rax
> - pcmpeqb %xmm2, %xmm4
> - por %xmm4, %xmm3
> + pmovmskb %xmm0, %eax
> + pmovmskb %xmm2, %r8d
> + por %xmm7, %xmm3
> pmovmskb %xmm3, %r9d
> - movdqa 48(%rdx), %xmm3
> - pcmpeqb %xmm3, %xmm2
> - salq $32, %r9
> - pcmpeqb %xmm3, %xmm0
> - orq %r9, %rax
> + shlq $48, %r9
> + shlq $32, %r8
> + shlq $16, %rax
> + orq %rdx, %rax
> orq %r8, %rax
> - por %xmm2, %xmm0
> - pmovmskb %xmm0, %ecx
> - salq $48, %rcx
> - orq %rcx, %rax
> - movl %edi, %ecx
> - subb %dl, %cl
> + orq %r9, %rax
> + pxor %xmm6, %xmm6
> shrq %cl, %rax
> testq %rax, %rax
> - jne L(return)
> - jmp L(loop_start)
> -
> + je L(loop64)
> + bsfq %rax, %rax
> +#ifdef AS_STRCHRNUL
> + addq %rcx, %rax
> +#else
> + xor %edx, %edx
> + addq %rcx, %rax
> + cmpb %sil, (%rax)
> + cmovne %rdx, %rax
> +#endif
> + ret
> END (strchr)
>
> #ifndef AS_STRCHRNUL
> +# ifndef USE_SSSE3
> weak_alias (strchr, index)
> libc_hidden_builtin_def (strchr)
> +# endif
> #endif
> --
> 1.8.4.rc3
--
Incorrect time synchronization