This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.
Re: [RFC PATCH] Optimize memset with conditional moves.
- From: "H.J. Lu" <hjl dot tools at gmail dot com>
- To: Ondřej Bílka <neleai at seznam dot cz>
- Cc: GNU C Library <libc-alpha at sourceware dot org>
- Date: Thu, 27 Aug 2015 09:54:03 -0700
- Subject: Re: [RFC PATCH] Optimize memset with conditional moves.
- References: <20150827162836 dot GA28702 at domone>
On Thu, Aug 27, 2015 at 9:28 AM, Ondřej Bílka <neleai@seznam.cz> wrote:
> Hi,
>
> I got an idea, which I mentioned on the aarch64 thread, to improve
> memset and memcpy performance by using conditional moves to avoid the
> branch misprediction penalty. Previously, setting one byte was very
> slow: we needed a decision tree to determine the implementation for
> each size, some range had to take the slow path, and we selected this
> one as the unlikely case.
>
> The trick I use could be written in C as follows, to handle n less than 16:
>
> char stack[64];
> /* Each conditional below redirects the corresponding store into the
>    scratch buffer when its size bit in n is clear; compiled as
>    conditional moves, this avoids any branch on n.  */
> uint64_t *p1 = (uint64_t *) s;
> uint32_t *p2 = (uint32_t *) (s + (n & 8));
> uint16_t *p3 = (uint16_t *) (s + (n & 12));
> char *p4 = s + n - 1;
> if ((n & 8) == 0) p1 = (uint64_t *) stack;
> if ((n & 4) == 0) p2 = (uint32_t *) stack;
> if ((n & 2) == 0) p3 = (uint16_t *) stack;
> if ((n & 1) == 0) p4 = stack;
> *p1 = vc;
> *p2 = vc;
> *p3 = vc;
> *p4 = vc;
>
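> (For reference, vc here is the fill byte c replicated into every byte
> lane; one standard way to build it in C, not part of the patch, is
>
> uint64_t vc = 0x0101010101010101ULL * (unsigned char) c;
>
> which is what the punpcklbw/punpcklwd/pshufd sequence in the patch
> computes in %xmm0.)
>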
> and for the 32-128 byte range it is the same as what you could do for
> the 16-64 range with 8-byte moves, shown below; a 16-byte-store variant
> for 32-128 is sketched after the snippet.
>
> /* Overlapping stores: the first four cover the head, the last four the
>    tail; when n <= 32, add/sub collapse the extra stores onto bytes
>    already written.  */
> int add = n > 32 ? 16 : 0;
> int sub = n > 32 ? -16 : 0;
> *((uint64_t *)(s)) = vc;
> *((uint64_t *)(s + 8)) = vc;
> *((uint64_t *)(s + add)) = vc;
> *((uint64_t *)(s + 8 + add)) = vc;
> *((uint64_t *)(s + n - 16)) = vc;
> *((uint64_t *)(s + 8 + n - 16)) = vc;
> *((uint64_t *)(s + n - 16 + sub)) = vc;
> *((uint64_t *)(s + 8 + n - 16 + sub)) = vc;
>
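> The 16-byte-store version of the same pattern for the 32-128 range
> could look like this (SSE2 intrinsics; illustrative, not part of the
> patch; c is the fill byte):
>
> #include <emmintrin.h>
>
> __m128i xvc = _mm_set1_epi8 ((char) c);
> size_t add = n > 64 ? 32 : 0;
> _mm_storeu_si128 ((__m128i *) s, xvc);
> _mm_storeu_si128 ((__m128i *) (s + 16), xvc);
> _mm_storeu_si128 ((__m128i *) (s + add), xvc);
> _mm_storeu_si128 ((__m128i *) (s + 16 + add), xvc);
> _mm_storeu_si128 ((__m128i *) (s + n - 32), xvc);
> _mm_storeu_si128 ((__m128i *) (s + n - 16), xvc);
> _mm_storeu_si128 ((__m128i *) (s + n - 32 - add), xvc);
> _mm_storeu_si128 ((__m128i *) (s + n - 16 - add), xvc);
>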
> This could be used generically; I believe that on arm it could give a
> considerable speedup, but I would need access to a machine for testing.
>
> This gives on average a 5% improvement over the current implementation;
> see the profile results at the links below. I will resend an alternate
> rep stosq implementation later.
>
> http://kam.mff.cuni.cz/~ondra/benchmark_string/memset_profile_cmov.html
> http://kam.mff.cuni.cz/~ondra/benchmark_string/memset_profile_cmov270815.tar.bz2
>
> As I recall, that was where I left memset: I didn't know at which sizes
> to select it, as the threshold has to be set empirically and differs
> for each architecture. The problem is that on most architectures it is
> faster when the data are in L3 cache or main memory, and I don't know
> from what size that usually happens; it may be the case at a much
> smaller size than the cache-size limit.
>
> I also included an optimization of the counted loop, which helps the
> processor correctly predict the loop end for sizes up to 512 bytes; a C
> sketch of the counted form follows.
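>
> Roughly, in C (illustrative only; p is the 16-byte-aligned start, end
> the one-past-the-end pointer, and xvc the 16-byte fill pattern):
>
> #include <emmintrin.h>
>
> size_t iters = (size_t) (end - p) >> 6;  /* 64 bytes per iteration.  */
> do
>   {
>     _mm_store_si128 ((__m128i *) p, xvc);
>     _mm_store_si128 ((__m128i *) (p + 16), xvc);
>     _mm_store_si128 ((__m128i *) (p + 32), xvc);
>     _mm_store_si128 ((__m128i *) (p + 48), xvc);
>     p += 64;
>   }
> while (--iters != 0);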
>
> Second, I extended the header to handle sizes up to 256 bytes, which
> gives an additional one percent or so.
>
>
> There is an additional microoptimization possible: I could save a few
> cycles with SSSE3 pshufb. Should I add another variant there,
> surrounded by the standard #ifdef USE_SSSE3 pattern? A sketch of the
> pshufb broadcast follows.
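>
> For reference, the SSSE3 variant replaces the three-instruction
> punpcklbw/punpcklwd/pshufd broadcast with a single pshufb whose control
> is all zeros, so every lane selects byte 0; in intrinsics
> (illustrative):
>
> #include <tmmintrin.h>
>
> __m128i vc16 = _mm_shuffle_epi8 (_mm_cvtsi32_si128 (c),
>                                  _mm_setzero_si128 ());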
>
> I haven't done more performance testing yet, as I plan to use this
> pattern also for memcpy and strcpy.
>
> So comments?
>
> * sysdeps/x86_64/memset.S: Use conditional moves to improve
> performance.
>
> diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
> index e496254..bea402d 100644
> --- a/sysdeps/x86_64/memset.S
> +++ b/sysdeps/x86_64/memset.S
> @@ -24,7 +24,7 @@
> ENTRY(__bzero)
> movq %rdi, %rax /* Set return value. */
> movq %rsi, %rdx /* Set n. */
> - pxor %xmm8, %xmm8
> + pxor %xmm0, %xmm0
The %xmm8 -> %xmm0 changes are in the master branch now.
> jmp L(entry_from_bzero)
> END(__bzero)
> weak_alias (__bzero, bzero)
> @@ -33,10 +33,10 @@ weak_alias (__bzero, bzero)
> ENTRY(__memset_tail)
> movq %rcx, %rax /* Set return value. */
>
> - movd %esi, %xmm8
> - punpcklbw %xmm8, %xmm8
> - punpcklwd %xmm8, %xmm8
> - pshufd $0, %xmm8, %xmm8
> + movd %esi, %xmm0
> + punpcklbw %xmm0, %xmm0
> + punpcklwd %xmm0, %xmm0
> + pshufd $0, %xmm0, %xmm0
>
> jmp L(entry_from_bzero)
> END(__memset_tail)
> @@ -50,78 +50,116 @@ END_CHK (__memset_chk)
> #endif
>
> ENTRY (memset)
> - movd %esi, %xmm8
> + movd %esi, %xmm0
> movq %rdi, %rax
> - punpcklbw %xmm8, %xmm8
> - punpcklwd %xmm8, %xmm8
> - pshufd $0, %xmm8, %xmm8
> + punpcklbw %xmm0, %xmm0
> + punpcklwd %xmm0, %xmm0
> + pshufd $0, %xmm0, %xmm0
> L(entry_from_bzero):
> - cmpq $64, %rdx
> + cmpq $256, %rdx
> ja L(loop_start)
> - cmpq $16, %rdx
> - jbe L(less_16_bytes)
> - cmpq $32, %rdx
> - movdqu %xmm8, (%rdi)
> - movdqu %xmm8, -16(%rdi,%rdx)
> - ja L(between_32_64_bytes)
> -L(return):
> - rep
> + cmp $128, %edx
> + jae L(between_128_256_bytes)
> +
> + cmp $32, %edx
> + jb L(less_32_bytes)
> +
> + lea -32(%rdi), %rcx
> + lea 32(%rdi), %rsi
> +
> + cmp $64, %edx
> + cmovb %rdi, %rcx
> + cmovb %rdi, %rsi
> +
> + movdqu %xmm0, (%rdi)
> + movdqu %xmm0, 16(%rdi)
> + movdqu %xmm0, (%rsi)
> + movdqu %xmm0, 16(%rsi)
> +
> + movdqu %xmm0, -32(%rcx, %rdx)
> + movdqu %xmm0, -16(%rcx, %rdx)
> + movdqu %xmm0, -32(%rdi, %rdx)
> + movdqu %xmm0, -16(%rdi, %rdx)
> ret
> +
> .p2align 4
> -L(between_32_64_bytes):
> - movdqu %xmm8, 16(%rdi)
> - movdqu %xmm8, -32(%rdi,%rdx)
> +L(between_128_256_bytes):
> + lea (%rdi, %rdx), %rsi
> + movdqu %xmm0, (%rdi)
> + movdqu %xmm0, 16(%rdi)
> + movdqu %xmm0, 32(%rdi)
> + movdqu %xmm0, 48(%rdi)
> + movdqu %xmm0, 64(%rdi)
> + movdqu %xmm0, 80(%rdi)
> + movdqu %xmm0, 96(%rdi)
> + movdqu %xmm0, 112(%rdi)
> + movdqu %xmm0, -128(%rsi)
> + movdqu %xmm0, -112(%rsi)
> + movdqu %xmm0, -96(%rsi)
> + movdqu %xmm0, -80(%rsi)
> + movdqu %xmm0, -64(%rsi)
> + movdqu %xmm0, -48(%rsi)
> + movdqu %xmm0, -32(%rsi)
> + movdqu %xmm0, -16(%rsi)
> ret
> +
> .p2align 4
> L(loop_start):
> - leaq 64(%rdi), %rcx
> - movdqu %xmm8, (%rdi)
> - andq $-64, %rcx
> - movdqu %xmm8, -16(%rdi,%rdx)
> - movdqu %xmm8, 16(%rdi)
> - movdqu %xmm8, -32(%rdi,%rdx)
> - movdqu %xmm8, 32(%rdi)
> - movdqu %xmm8, -48(%rdi,%rdx)
> - movdqu %xmm8, 48(%rdi)
> - movdqu %xmm8, -64(%rdi,%rdx)
> - addq %rdi, %rdx
> - andq $-64, %rdx
> - cmpq %rdx, %rcx
> - je L(return)
> - .p2align 4
> + lea (%rdi, %rdx), %rsi
> + leaq 16(%rdi), %rcx
> + andq $-16, %rcx
> + movdqu %xmm0, -16(%rsi)
> + movdqu %xmm0, -32(%rsi)
> + movdqu %xmm0, -48(%rsi)
> + movdqu %xmm0, -64(%rsi)
> + movdqu %xmm0, (%rdi)
> + subq %rcx, %rsi
> + shrq $6, %rsi
> L(loop):
> - movdqa %xmm8, (%rcx)
> - movdqa %xmm8, 16(%rcx)
> - movdqa %xmm8, 32(%rcx)
> - movdqa %xmm8, 48(%rcx)
> + movdqa %xmm0, (%rcx)
> + movdqa %xmm0, 16(%rcx)
> + movdqa %xmm0, 32(%rcx)
> + movdqa %xmm0, 48(%rcx)
> addq $64, %rcx
> - cmpq %rcx, %rdx
> + subq $1, %rsi
> jne L(loop)
> - rep
> - ret
> -L(less_16_bytes):
> - movq %xmm8, %rcx
> - testb $24, %dl
> - jne L(between8_16bytes)
> - testb $4, %dl
> - jne L(between4_7bytes)
> - testb $1, %dl
> - je L(odd_byte)
> - movb %cl, (%rdi)
> -L(odd_byte):
> - testb $2, %dl
> - je L(return)
> - movw %cx, -2(%rax,%rdx)
> - ret
> -L(between4_7bytes):
> - movl %ecx, (%rdi)
> - movl %ecx, -4(%rdi,%rdx)
> - ret
> -L(between8_16bytes):
> - movq %rcx, (%rdi)
> - movq %rcx, -8(%rdi,%rdx)
> ret
>
> +L(less_32_bytes):
> + movq %xmm0, %rcx
> + lea -64(%rsp), %r10
> + sub %rax, %r10
> + lea 64(%rdi), %r9
> + lea 63(%rdi, %rdx), %rsi
> + mov %rdx, %r8
> + mov %rdx, %r11
> + mov %rdx, %rdi
> + and $16, %r8
> + and $24, %rdi
> + and $28, %r11
> +
> + test $16, %edx
> + cmove %rsp, %r9
> +
> + test $8, %edx
> + cmove %r10, %r8
> +
> + test $4, %edx
> + cmove %r10, %rdi
> +
> + test $2, %edx
> + cmove %r10, %r11
> +
> + test $1, %edx
> + cmove %rsp, %rsi
> +
> + movdqu %xmm0, -64(%r9)
> + movq %rcx, (%rax, %r8)
> + movl %ecx, (%rax, %rdi)
> + movw %cx, (%rax, %r11)
> + movb %cl, -64(%rsi)
> +
> + ret
> END (memset)
> libc_hidden_builtin_def (memset)
>
They look OK to me.
Thanks.
--
H.J.