This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.
Re: [RFC PATCH] Optimize memset with conditional moves.
- From: "H.J. Lu" <hjl dot tools at gmail dot com>
- To: Ondřej Bílka <neleai at seznam dot cz>
- Cc: GNU C Library <libc-alpha at sourceware dot org>
- Date: Thu, 27 Aug 2015 09:54:03 -0700
- Subject: Re: [RFC PATCH] Optimize memset with conditional moves.
- References: <20150827162836 dot GA28702 at domone>
On Thu, Aug 27, 2015 at 9:28 AM, Ondřej Bílka <neleai@seznam.cz> wrote:
> Hi,
>
> I got an idea, which I mentioned on the aarch64 thread, to improve
> memset and memcpy performance by using conditional moves to avoid the
> branch misprediction penalty. Previously, setting one byte was very
> slow: we needed a decision tree to determine the implementation for
> each size, some range had to take the slow path, and we selected this
> one as the unlikely case.
>
> The trick I use could be written in C as follows, to handle n less than 16:
>
> char stack[64];
> /* Each conditional below redirects the corresponding store into the
>    scratch buffer when its size bit in n is clear; compiled as
>    conditional moves, this avoids any branch on n.  */
> uint64_t *p1 = (uint64_t *) s;
> uint32_t *p2 = (uint32_t *) (s + (n & 8));
> uint16_t *p3 = (uint16_t *) (s + (n & 12));
> char *p4 = s + n - 1;
> if ((n & 8) == 0) p1 = (uint64_t *) stack;
> if ((n & 4) == 0) p2 = (uint32_t *) stack;
> if ((n & 2) == 0) p3 = (uint16_t *) stack;
> if ((n & 1) == 0) p4 = stack;
> *p1 = vc;
> *p2 = vc;
> *p3 = vc;
> *p4 = vc;
>
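> (For reference, vc here is the fill byte c replicated into every byte
> lane; one standard way to build it in C, not part of the patch, is
>
> uint64_t vc = 0x0101010101010101ULL * (unsigned char) c;
>
> which is what the punpcklbw/punpcklwd/pshufd sequence in the patch
> computes in %xmm0.)
>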
> and for the 32-128 byte range it is the same as what you could do for
> the 16-64 range with 8-byte moves, shown below; a 16-byte-store variant
> for 32-128 is sketched after the snippet.
>
> /* Overlapping stores: the first four cover the head, the last four the
>    tail; when n <= 32, add/sub collapse the extra stores onto bytes
>    already written.  */
> int add = n > 32 ? 16 : 0;
> int sub = n > 32 ? -16 : 0;
> *((uint64_t *)(s)) = vc;
> *((uint64_t *)(s + 8)) = vc;
> *((uint64_t *)(s + add)) = vc;
> *((uint64_t *)(s + 8 + add)) = vc;
> *((uint64_t *)(s + n - 16)) = vc;
> *((uint64_t *)(s + 8 + n - 16)) = vc;
> *((uint64_t *)(s + n - 16 + sub)) = vc;
> *((uint64_t *)(s + 8 + n - 16 + sub)) = vc;
>
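> The 16-byte-store version of the same pattern for the 32-128 range
> could look like this (SSE2 intrinsics; illustrative, not part of the
> patch; c is the fill byte):
>
> #include <emmintrin.h>
>
> __m128i xvc = _mm_set1_epi8 ((char) c);
> size_t add = n > 64 ? 32 : 0;
> _mm_storeu_si128 ((__m128i *) s, xvc);
> _mm_storeu_si128 ((__m128i *) (s + 16), xvc);
> _mm_storeu_si128 ((__m128i *) (s + add), xvc);
> _mm_storeu_si128 ((__m128i *) (s + 16 + add), xvc);
> _mm_storeu_si128 ((__m128i *) (s + n - 32), xvc);
> _mm_storeu_si128 ((__m128i *) (s + n - 16), xvc);
> _mm_storeu_si128 ((__m128i *) (s + n - 32 - add), xvc);
> _mm_storeu_si128 ((__m128i *) (s + n - 16 - add), xvc);
>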
> This could be used generically; I believe that on arm it could give a
> considerable speedup, but I would need access to a machine for testing.
>
> This gives on average a 5% improvement over the current implementation;
> see the profile results at the links below. I will resend an alternate
> rep stosq implementation later.
>
> http://kam.mff.cuni.cz/~ondra/benchmark_string/memset_profile_cmov.html
> http://kam.mff.cuni.cz/~ondra/benchmark_string/memset_profile_cmov270815.tar.bz2
>
> As I recall, that was where I left memset: I didn't know at which sizes
> to select it, as the threshold has to be set empirically and differs
> for each architecture. The problem is that on most architectures it is
> faster when the data are in L3 cache or main memory, and I don't know
> from what size that usually happens; it may be the case at a much
> smaller size than the cache-size limit.
>
> I also included an optimization of the counted loop, which helps the
> processor correctly predict the loop end for sizes up to 512 bytes; a C
> sketch of the counted form follows.
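>
> Roughly, in C (illustrative only; p is the 16-byte-aligned start, end
> the one-past-the-end pointer, and xvc the 16-byte fill pattern):
>
> #include <emmintrin.h>
>
> size_t iters = (size_t) (end - p) >> 6;  /* 64 bytes per iteration.  */
> do
>   {
>     _mm_store_si128 ((__m128i *) p, xvc);
>     _mm_store_si128 ((__m128i *) (p + 16), xvc);
>     _mm_store_si128 ((__m128i *) (p + 32), xvc);
>     _mm_store_si128 ((__m128i *) (p + 48), xvc);
>     p += 64;
>   }
> while (--iters != 0);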
>
> Second, I extended the header to handle sizes up to 256 bytes, which
> gives an additional one percent or so.
>
>
> There is an additional microoptimization possible: I could save a few
> cycles with SSSE3 pshufb. Should I add another variant there,
> surrounded by the standard #ifdef USE_SSSE3 pattern? A sketch of the
> pshufb broadcast follows.
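>
> For reference, the SSSE3 variant replaces the three-instruction
> punpcklbw/punpcklwd/pshufd broadcast with a single pshufb whose control
> is all zeros, so every lane selects byte 0; in intrinsics
> (illustrative):
>
> #include <tmmintrin.h>
>
> __m128i vc16 = _mm_shuffle_epi8 (_mm_cvtsi32_si128 (c),
>                                  _mm_setzero_si128 ());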
>
> I haven't done more performance testing yet, as I plan to use this
> pattern also for memcpy and strcpy.
>
> So comments?
>
> * sysdeps/x86_64/memset.S: Use conditional moves to improve
> performance.
>
> diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
> index e496254..bea402d 100644
> --- a/sysdeps/x86_64/memset.S
> +++ b/sysdeps/x86_64/memset.S
> @@ -24,7 +24,7 @@
> ENTRY(__bzero)
> movq %rdi, %rax /* Set return value. */
> movq %rsi, %rdx /* Set n. */
> - pxor %xmm8, %xmm8
> + pxor %xmm0, %xmm0
The %xmm8 -> %xmm0 changes are in the master branch now.
> jmp L(entry_from_bzero)
> END(__bzero)
> weak_alias (__bzero, bzero)
> @@ -33,10 +33,10 @@ weak_alias (__bzero, bzero)
> ENTRY(__memset_tail)
> movq %rcx, %rax /* Set return value. */
>
> - movd %esi, %xmm8
> - punpcklbw %xmm8, %xmm8
> - punpcklwd %xmm8, %xmm8
> - pshufd $0, %xmm8, %xmm8
> + movd %esi, %xmm0
> + punpcklbw %xmm0, %xmm0
> + punpcklwd %xmm0, %xmm0
> + pshufd $0, %xmm0, %xmm0
>
> jmp L(entry_from_bzero)
> END(__memset_tail)
> @@ -50,78 +50,116 @@ END_CHK (__memset_chk)
> #endif
>
> ENTRY (memset)
> - movd %esi, %xmm8
> + movd %esi, %xmm0
> movq %rdi, %rax
> - punpcklbw %xmm8, %xmm8
> - punpcklwd %xmm8, %xmm8
> - pshufd $0, %xmm8, %xmm8
> + punpcklbw %xmm0, %xmm0
> + punpcklwd %xmm0, %xmm0
> + pshufd $0, %xmm0, %xmm0
> L(entry_from_bzero):
> - cmpq $64, %rdx
> + cmpq $256, %rdx
> ja L(loop_start)
> - cmpq $16, %rdx
> - jbe L(less_16_bytes)
> - cmpq $32, %rdx
> - movdqu %xmm8, (%rdi)
> - movdqu %xmm8, -16(%rdi,%rdx)
> - ja L(between_32_64_bytes)
> -L(return):
> - rep
> + cmp $128, %edx
> + jae L(between_128_256_bytes)
> +
> + cmp $32, %edx
> + jb L(less_32_bytes)
> +
> + lea -32(%rdi), %rcx
> + lea 32(%rdi), %rsi
> +
> + cmp $64, %edx
> + cmovb %rdi, %rcx
> + cmovb %rdi, %rsi
> +
> + movdqu %xmm0, (%rdi)
> + movdqu %xmm0, 16(%rdi)
> + movdqu %xmm0, (%rsi)
> + movdqu %xmm0, 16(%rsi)
> +
> + movdqu %xmm0, -32(%rcx, %rdx)
> + movdqu %xmm0, -16(%rcx, %rdx)
> + movdqu %xmm0, -32(%rdi, %rdx)
> + movdqu %xmm0, -16(%rdi, %rdx)
> ret
> +
> .p2align 4
> -L(between_32_64_bytes):
> - movdqu %xmm8, 16(%rdi)
> - movdqu %xmm8, -32(%rdi,%rdx)
> +L(between_128_256_bytes):
> + lea (%rdi, %rdx), %rsi
> + movdqu %xmm0, (%rdi)
> + movdqu %xmm0, 16(%rdi)
> + movdqu %xmm0, 32(%rdi)
> + movdqu %xmm0, 48(%rdi)
> + movdqu %xmm0, 64(%rdi)
> + movdqu %xmm0, 80(%rdi)
> + movdqu %xmm0, 96(%rdi)
> + movdqu %xmm0, 112(%rdi)
> + movdqu %xmm0, -128(%rsi)
> + movdqu %xmm0, -112(%rsi)
> + movdqu %xmm0, -96(%rsi)
> + movdqu %xmm0, -80(%rsi)
> + movdqu %xmm0, -64(%rsi)
> + movdqu %xmm0, -48(%rsi)
> + movdqu %xmm0, -32(%rsi)
> + movdqu %xmm0, -16(%rsi)
> ret
> +
> .p2align 4
> L(loop_start):
> - leaq 64(%rdi), %rcx
> - movdqu %xmm8, (%rdi)
> - andq $-64, %rcx
> - movdqu %xmm8, -16(%rdi,%rdx)
> - movdqu %xmm8, 16(%rdi)
> - movdqu %xmm8, -32(%rdi,%rdx)
> - movdqu %xmm8, 32(%rdi)
> - movdqu %xmm8, -48(%rdi,%rdx)
> - movdqu %xmm8, 48(%rdi)
> - movdqu %xmm8, -64(%rdi,%rdx)
> - addq %rdi, %rdx
> - andq $-64, %rdx
> - cmpq %rdx, %rcx
> - je L(return)
> - .p2align 4
> + lea (%rdi, %rdx), %rsi
> + leaq 16(%rdi), %rcx
> + andq $-16, %rcx
> + movdqu %xmm0, -16(%rsi)
> + movdqu %xmm0, -32(%rsi)
> + movdqu %xmm0, -48(%rsi)
> + movdqu %xmm0, -64(%rsi)
> + movdqu %xmm0, (%rdi)
> + subq %rcx, %rsi
> + shrq $6, %rsi
> L(loop):
> - movdqa %xmm8, (%rcx)
> - movdqa %xmm8, 16(%rcx)
> - movdqa %xmm8, 32(%rcx)
> - movdqa %xmm8, 48(%rcx)
> + movdqa %xmm0, (%rcx)
> + movdqa %xmm0, 16(%rcx)
> + movdqa %xmm0, 32(%rcx)
> + movdqa %xmm0, 48(%rcx)
> addq $64, %rcx
> - cmpq %rcx, %rdx
> + subq $1, %rsi
> jne L(loop)
> - rep
> - ret
> -L(less_16_bytes):
> - movq %xmm8, %rcx
> - testb $24, %dl
> - jne L(between8_16bytes)
> - testb $4, %dl
> - jne L(between4_7bytes)
> - testb $1, %dl
> - je L(odd_byte)
> - movb %cl, (%rdi)
> -L(odd_byte):
> - testb $2, %dl
> - je L(return)
> - movw %cx, -2(%rax,%rdx)
> - ret
> -L(between4_7bytes):
> - movl %ecx, (%rdi)
> - movl %ecx, -4(%rdi,%rdx)
> - ret
> -L(between8_16bytes):
> - movq %rcx, (%rdi)
> - movq %rcx, -8(%rdi,%rdx)
> ret
>
> +L(less_32_bytes):
> + movq %xmm0, %rcx
> + lea -64(%rsp), %r10
> + sub %rax, %r10
> + lea 64(%rdi), %r9
> + lea 63(%rdi, %rdx), %rsi
> + mov %rdx, %r8
> + mov %rdx, %r11
> + mov %rdx, %rdi
> + and $16, %r8
> + and $24, %rdi
> + and $28, %r11
> +
> + test $16, %edx
> + cmove %rsp, %r9
> +
> + test $8, %edx
> + cmove %r10, %r8
> +
> + test $4, %edx
> + cmove %r10, %rdi
> +
> + test $2, %edx
> + cmove %r10, %r11
> +
> + test $1, %edx
> + cmove %rsp, %rsi
> +
> + movdqu %xmm0, -64(%r9)
> + movq %rcx, (%rax, %r8)
> + movl %ecx, (%rax, %rdi)
> + movw %cx, (%rax, %r11)
> + movb %cl, -64(%rsi)
> +
> + ret
> END (memset)
> libc_hidden_builtin_def (memset)
>
They look OK to me.
Thanks.
--
H.J.