This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
[PING][PATCH neleai/string-x64] Reoptimize strlen and strnlen
- From: Ondřej Bílka <neleai at seznam dot cz>
- To: libc-alpha at sourceware dot org
- Date: Fri, 3 Jul 2015 09:41:11 +0200
- Subject: [PING][PATCH neleai/string-x64] Reoptimize strlen and strnlen
- Authentication-results: sourceware.org; auth=none
- References: <20150626071254 dot GA1789 at domone>
On Fri, Jun 26, 2015 at 09:12:54AM +0200, Ondřej Bílka wrote:
> Hi,
>
> I optimized strlen long ago; back then my main focus was to improve
> performance on core2 and to have reasonable performance on athlons and
> old atoms.
>
> The main change is that I check the 16th-64th bytes unaligned instead of
> aligning these to 16 bytes. That improved performance on older processors,
> but now unaligned loads are better on i7. I don't remember whether last
> time I kept xoring the first four xmm registers when checking unaligned
> loads, or read from (%rax) instead of (%rdi), which increased latency; but
> now simple unaligned loads are faster on core2 as well.
>
> Then I made several micro-optimizations, like using edx instead of rdx to
> save space, or reordering instructions to improve scheduling.
>
> I also tested an avx2 version; again it doesn't help much. On haswell the
> performance difference is 0.2%, while the new sse2 version is 1% faster
> on haswell.
>
> Full graphs are at the link below; the only problem I could find is a
> 0.3% decrease on fx10.
>
> I could reintroduce ifunc to handle atom and avx2 but is that worth it?
>
> http://kam.mff.cuni.cz/~ondra/benchmark_string/strlen_profile.html
>
> Ok to commit this?
>
> * sysdeps/x86_64/strlen.S (strlen): Add microoptimizations.
> ---
> sysdeps/x86_64/strlen.S | 336 ++++++++++++++++++++++++------------------------
> 1 file changed, 169 insertions(+), 167 deletions(-)
>
> diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
> index c382c8d..3e8beb0 100644
> --- a/sysdeps/x86_64/strlen.S
> +++ b/sysdeps/x86_64/strlen.S
> @@ -1,5 +1,5 @@
> /* SSE2 version of strlen.
> - Copyright (C) 2012-2015 Free Software Foundation, Inc.
> + Copyright (C) 2012-2015 Free Software Foundation, Inc.
> This file is part of the GNU C Library.
>
> The GNU C Library is free software; you can redistribute it and/or
> @@ -18,222 +18,224 @@
>
> #include <sysdep.h>
>
> -/* Long lived register in strlen(s), strnlen(s, n) are:
> -
> - %xmm11 - zero
> - %rdi - s
> - %r10 (s+n) & (~(64-1))
> - %r11 s+n
> -*/
>
>
> .text
> ENTRY(strlen)
> -
> -/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */
> -#define FIND_ZERO \
> - pcmpeqb (%rax), %xmm8; \
> - pcmpeqb 16(%rax), %xmm9; \
> - pcmpeqb 32(%rax), %xmm10; \
> - pcmpeqb 48(%rax), %xmm11; \
> - pmovmskb %xmm8, %esi; \
> - pmovmskb %xmm9, %edx; \
> - pmovmskb %xmm10, %r8d; \
> - pmovmskb %xmm11, %ecx; \
> - salq $16, %rdx; \
> - salq $16, %rcx; \
> - orq %rsi, %rdx; \
> - orq %r8, %rcx; \
> - salq $32, %rcx; \
> - orq %rcx, %rdx;
> -
> #ifdef AS_STRNLEN
> -/* Do not read anything when n==0. */
> + mov %rsi, %r8
> + xor %edx, %edx
> test %rsi, %rsi
> - jne L(n_nonzero)
> - xor %rax, %rax
> - ret
> -L(n_nonzero):
> -
> -/* Initialize long lived registers. */
> -
> - add %rdi, %rsi
> - mov %rsi, %r10
> - and $-64, %r10
> - mov %rsi, %r11
> + je L(return_zero)
> + cmp $64, %rsi
> + jae L(dont_set)
> + bts %rsi, %rdx
> +L(dont_set):
> #endif
> -
> - pxor %xmm8, %xmm8
> - pxor %xmm9, %xmm9
> - pxor %xmm10, %xmm10
> - pxor %xmm11, %xmm11
> - movq %rdi, %rax
> - movq %rdi, %rcx
> - andq $4095, %rcx
> -/* Offsets 4032-4047 will be aligned into 4032 thus fit into page. */
> - cmpq $4047, %rcx
> -/* We cannot unify this branching as it would be ~6 cycles slower. */
> + pxor %xmm0, %xmm0
> + mov %edi, %ecx
> + and $4095, %ecx
> + cmp $4032, %ecx
> ja L(cross_page)
> -
> + movdqu (%rdi), %xmm4
> + pcmpeqb %xmm0, %xmm4
> + pmovmskb %xmm4, %ecx
> #ifdef AS_STRNLEN
> -/* Test if end is among first 64 bytes. */
> -# define STRNLEN_PROLOG \
> - mov %r11, %rsi; \
> - subq %rax, %rsi; \
> - andq $-64, %rax; \
> - testq $-64, %rsi; \
> - je L(strnlen_ret)
> + or %dx, %cx
> #else
> -# define STRNLEN_PROLOG andq $-64, %rax;
> + test %ecx, %ecx
> #endif
> -
> -/* Ignore bits in mask that come before start of string. */
> -#define PROLOG(lab) \
> - movq %rdi, %rcx; \
> - xorq %rax, %rcx; \
> - STRNLEN_PROLOG; \
> - sarq %cl, %rdx; \
> - test %rdx, %rdx; \
> - je L(lab); \
> - bsfq %rdx, %rax; \
> + je L(next48_bytes)
> + bsf %ecx, %eax
> ret
>
> #ifdef AS_STRNLEN
> - andq $-16, %rax
> - FIND_ZERO
> -#else
> - /* Test first 16 bytes unaligned. */
> - movdqu (%rax), %xmm12
> - pcmpeqb %xmm8, %xmm12
> - pmovmskb %xmm12, %edx
> - test %edx, %edx
> - je L(next48_bytes)
> - bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */
> +L(return_zero):
> + xor %eax, %eax
> ret
> -
> +L(return_noread):
> + add $64, %rax
> + sub %rdi, %rax
> + ret
> +#endif
> + .p2align 4
> L(next48_bytes):
> -/* Same as FIND_ZERO except we do not check first 16 bytes. */
> - andq $-16, %rax
> - pcmpeqb 16(%rax), %xmm9
> - pcmpeqb 32(%rax), %xmm10
> - pcmpeqb 48(%rax), %xmm11
> - pmovmskb %xmm9, %edx
> - pmovmskb %xmm10, %r8d
> - pmovmskb %xmm11, %ecx
> - salq $16, %rdx
> - salq $16, %rcx
> - orq %r8, %rcx
> + movdqu 16(%rdi), %xmm1
> + movdqu 32(%rdi), %xmm2
> + movdqu 48(%rdi), %xmm3
> + pcmpeqb %xmm0, %xmm1
> + pcmpeqb %xmm0, %xmm2
> + pcmpeqb %xmm0, %xmm3
> +#ifdef AS_STRNLEN
> + pmovmskb %xmm1, %ecx
> + sal $16, %ecx
> + or %rcx, %rdx
> +#else
> + pmovmskb %xmm1, %edx
> + sal $16, %edx
> +#endif
> + pmovmskb %xmm2, %esi
> + pmovmskb %xmm3, %ecx
> + sal $16, %ecx
> + or %esi, %ecx
> salq $32, %rcx
> orq %rcx, %rdx
> -#endif
> -
> - /* When no zero byte is found xmm9-11 are zero so we do not have to
> - zero them. */
> - PROLOG(loop)
> + je L(loop_init)
> + bsfq %rdx, %rax
> + ret
>
> .p2align 4
> L(cross_page):
> - andq $-64, %rax
> - FIND_ZERO
> - PROLOG(loop_init)
>
> + movq %rdi, %rax
> + pxor %xmm1, %xmm1
> + pxor %xmm2, %xmm2
> + pxor %xmm3, %xmm3
> #ifdef AS_STRNLEN
> -/* We must do this check to correctly handle strnlen (s, -1). */
> -L(strnlen_ret):
> - bts %rsi, %rdx
> + mov %rdx, %r9
> +#endif
> + andq $-64, %rax
> + pcmpeqb (%rax), %xmm0
> + pcmpeqb 16(%rax), %xmm1
> + pcmpeqb 32(%rax), %xmm2
> + pcmpeqb 48(%rax), %xmm3
> + pmovmskb %xmm0, %esi
> + pxor %xmm0, %xmm0
> + pmovmskb %xmm1, %edx
> + pmovmskb %xmm2, %r10d
> + pmovmskb %xmm3, %ecx
> + sal $16, %edx
> + sal $16, %ecx
> + or %esi, %edx
> + or %r10, %rcx
> + salq $32, %rcx
> + orq %rcx, %rdx
> + mov %edi, %ecx
> +#ifdef AS_STRNLEN
> + salq %cl, %r9
> + or %r9, %rdx
> +#endif
> sarq %cl, %rdx
> test %rdx, %rdx
> je L(loop_init)
> bsfq %rdx, %rax
> ret
> -#endif
> .p2align 4
> L(loop_init):
> - pxor %xmm9, %xmm9
> - pxor %xmm10, %xmm10
> - pxor %xmm11, %xmm11
> + movq %rdi, %rax
> + andq $-64, %rax
> #ifdef AS_STRNLEN
> + add %rdi, %r8
> + sub %rax, %r8
> + cmp $64, %r8
> + je L(return_noread)
> +#endif
> + pxor %xmm1, %xmm1
> + pxor %xmm2, %xmm2
> +#ifdef USE_AVX2
> + vpxor %xmm0, %xmm0, %xmm0
> +#endif
> .p2align 4
> L(loop):
> +#ifdef USE_AVX2
> + vmovdqa 64(%rax), %ymm1
> + vpminub 96(%rax), %ymm1, %ymm2
> + vpcmpeqb %ymm0, %ymm2, %ymm2
> + vpmovmskb %ymm2, %edx
> +#else
> + movdqa 64(%rax), %xmm5
> + pminub 80(%rax), %xmm5
> + pminub 96(%rax), %xmm5
> + pminub 112(%rax), %xmm5
> + pcmpeqb %xmm0, %xmm5
> + pmovmskb %xmm5, %edx
> +#endif
>
> - addq $64, %rax
> - cmpq %rax, %r10
> - je L(exit_end)
> -
> - movdqa (%rax), %xmm8
> - pminub 16(%rax), %xmm8
> - pminub 32(%rax), %xmm8
> - pminub 48(%rax), %xmm8
> - pcmpeqb %xmm11, %xmm8
> - pmovmskb %xmm8, %edx
> +#ifdef AS_STRNLEN
> + sub $64, %r8
> testl %edx, %edx
> - jne L(exit)
> - jmp L(loop)
> -
> - .p2align 4
> -L(exit_end):
> - cmp %rax, %r11
> - je L(first) /* Do not read when end is at page boundary. */
> - pxor %xmm8, %xmm8
> - FIND_ZERO
> -
> -L(first):
> - bts %r11, %rdx
> - bsfq %rdx, %rdx
> - addq %rdx, %rax
> - subq %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(exit):
> - pxor %xmm8, %xmm8
> - FIND_ZERO
> -
> - bsfq %rdx, %rdx
> - addq %rdx, %rax
> - subq %rdi, %rax
> - ret
> -
> + jne L(exit64)
> + cmp $64, %r8
> + jbe L(exit64_zero)
> #else
> -
> - /* Main loop. Unrolled twice to improve L2 cache performance on core2. */
> - .p2align 4
> -L(loop):
> -
> - movdqa 64(%rax), %xmm8
> - pminub 80(%rax), %xmm8
> - pminub 96(%rax), %xmm8
> - pminub 112(%rax), %xmm8
> - pcmpeqb %xmm11, %xmm8
> - pmovmskb %xmm8, %edx
> testl %edx, %edx
> jne L(exit64)
> +#endif
>
> subq $-128, %rax
> -
> - movdqa (%rax), %xmm8
> - pminub 16(%rax), %xmm8
> - pminub 32(%rax), %xmm8
> - pminub 48(%rax), %xmm8
> - pcmpeqb %xmm11, %xmm8
> - pmovmskb %xmm8, %edx
> +#ifdef USE_AVX2
> + vmovdqa (%rax), %ymm1
> + vpminub 32(%rax), %ymm1, %ymm2
> + vpcmpeqb %ymm0, %ymm2, %ymm2
> + vpmovmskb %ymm2, %edx
> +#else
> + movdqa (%rax), %xmm5
> + pminub 16(%rax), %xmm5
> + pminub 32(%rax), %xmm5
> + pminub 48(%rax), %xmm5
> + pcmpeqb %xmm0, %xmm5
> + pmovmskb %xmm5, %edx
> +#endif
> +#ifdef AS_STRNLEN
> + sub $64, %r8
> testl %edx, %edx
> jne L(exit0)
> + cmp $64, %r8
> + jbe L(exit0_zero)
> +#else
> + testl %edx, %edx
> + jne L(exit0)
> +#endif
> jmp L(loop)
>
> +#ifdef AS_STRNLEN
> + .p2align 4
> +L(exit64_zero):
> + addq $64, %rax
> +L(exit0_zero):
> + add %r8, %rax
> + sub %rdi, %rax
> + ret
> +#endif
> .p2align 4
> +
> +
> L(exit64):
> addq $64, %rax
> L(exit0):
> - pxor %xmm8, %xmm8
> - FIND_ZERO
> -
> +#ifdef USE_AVX2
> + sal $32, %rdx
> +#else
> + sal $48, %rdx
> +#endif
> +#ifdef AS_STRNLEN
> + cmp $64, %r8
> + jae L(dont_set2)
> + bts %r8, %rdx
> + L(dont_set2):
> +#endif
> +#ifdef USE_AVX2
> + subq %rdi, %rax
> + vpcmpeqb %ymm0, %ymm1, %ymm1
> + vpmovmskb %ymm1, %ecx
> + vzeroupper
> + or %rcx, %rdx
> +#else
> + pcmpeqb (%rax), %xmm0
> + pcmpeqb 16(%rax), %xmm1
> + pcmpeqb 32(%rax), %xmm2
> + subq %rdi, %rax
> + pmovmskb %xmm0, %esi
> + pmovmskb %xmm1, %ecx
> + pmovmskb %xmm2, %r8d
> + sal $16, %ecx
> + or %esi, %ecx
> + salq $32, %r8
> + orq %r8, %rcx
> + orq %rcx, %rdx
> +#endif
> bsfq %rdx, %rdx
> addq %rdx, %rax
> - subq %rdi, %rax
> ret
> -
> -#endif
> -
> END(strlen)
> libc_hidden_builtin_def (strlen)
> --
> 1.8.4.rc3
--
Your modem doesn't speak English.