This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.



Re: [PATCH RFC] Improve 64bit memset performance for Haswell CPU with AVX2 instruction


Hi Ondra,

Thanks for your comments. The changes are described below, and the new version
has been sent to you.

Regards
Ling

>
>> diff --git a/sysdeps/x86_64/multiarch/memset-avx2.S
>> b/sysdeps/x86_64/multiarch/memset-avx2.S
>> new file mode 100644
>> index 0000000..5d4a487
>> --- /dev/null
>> +++ b/sysdeps/x86_64/multiarch/memset-avx2.S
>> @@ -0,0 +1,192 @@
>> +/* memset with AVX2
>> +   Copyright (C) 2014 Free Software Foundation, Inc.
>> +   Contributed by Alibaba Group.
>
> We no longer add Contributed by.
Removed in new version.
>
>> +#include "asm-syntax.h"
>> +#ifndef ALIGN
>> +# define ALIGN(n)	.p2align n
>> +#endif
>
> Also, in the meantime we decided to remove the ALIGN macro, so remove that and
> s/ALIGN(3)/.p2align 4/, s/ALIGN(4)/.p2align 4/
Fixed in new version.
>
>> +
>> +ENTRY (MEMSET)
>> +	vpxor	%xmm0, %xmm0, %xmm0
>> +	vmovd %esi, %xmm1
>> +	lea	(%rdi, %rdx), %r8
>
> < snip >
>
>> +	vmovups %xmm0, 0x70(%rdi)
>> +	vmovups %xmm0, -0x80(%r8)
>
> I would globally replace %r8 with %rsi; this makes the instruction one byte
> shorter. %r9 is similar.
Fixed in new version; we also place branch instructions in different 16-byte
blocks to improve branch prediction accuracy.
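
For reference, the saved byte comes from the VEX prefix: a base register in
%r8-%r15 forces the 3-byte VEX form, while %rsi fits the 2-byte form. A sketch
of the two encodings (as objdump would show them):

	c5 f8 11 46 80		vmovups	%xmm0, -0x80(%rsi)	/* 2-byte VEX prefix */
	c4 c1 78 11 40 80	vmovups	%xmm0, -0x80(%r8)	/* 3-byte VEX prefix */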

>
>> +L(less_4bytes):
>> +	cmp	$2, %edx
>> +	jb	L(less_2bytes)
>> +	mov	%cx, (%rdi)
>> +	mov	%cx, -0x02(%r8)
>> +	ret
>> +	ALIGN(4)
>> +L(less_2bytes):
>> +	cmp	$1, %edx
>> +	jb	L(less_1bytes)
>> +	mov	%cl, (%rdi)
>> +L(less_1bytes):
>> +	ret
>> +
>
> Here the current implementation saves one comparison; the jb at L(less_2bytes)
> reuses the flags set by the cmp $1, %edx above:
>
Done in new version.
> L(less_4bytes):
>      cmp     $1, %edx
>      jbe     L(less_2bytes)
>      mov     %cx, (%rdi)
>      mov     %cx, -0x02(%r8)
>      ret
>      ALIGN(4)
> L(less_2bytes):
>      jb      L(less_1bytes)
>      mov     %cl, (%rdi)
> L(less_1bytes):
>      ret
>
>> +	ALIGN(4)
>> +L(256bytesormore):
>> +	vinserti128 $1, %xmm0, %ymm0, %ymm0
>> +	vmovups	%ymm0, (%rdi)
>> +	mov	%rdi, %r9
>> +	and	$-0x20, %rdi
>> +	add	$32, %rdi
>> +	sub	%rdi, %r9
>> +	add	%r9, %rdx
>> +	cmp	$4096, %rdx
>> +	ja	L(gobble_data)
>> +
>> +	sub	$0x80, %rdx
>> +L(gobble_128_loop):
>> +	vmovaps	%ymm0, (%rdi)
>> +	vmovaps	%ymm0, 0x20(%rdi)
>> +	vmovaps	%ymm0, 0x40(%rdi)
>> +	vmovaps	%ymm0, 0x60(%rdi)
>> +	lea	0x80(%rdi), %rdi
>> +	sub	$0x80, %rdx
>> +	jae	L(gobble_128_loop)
>> +	vmovups	%ymm0, -0x80(%r8)
>> +	vmovups	%ymm0, -0x60(%r8)
>> +	vmovups	%ymm0, -0x40(%r8)
>> +	vmovups	%ymm0, -0x20(%r8)
>> +	vzeroupper
>> +	ret
>> +
>
> I looked into this with objdump and the loop is misaligned by 5 bytes, which
> could be a problem if Haswell cannot handle that.
> If you align the loop as below, does that improve performance?
>
Fixed the issue; the loop is now aligned.
> 	.p2align 4
> 	ret; ret; ret; ret; ret
> L(256bytesormore):
>
>
> Also, in this pattern
>
>> +
>> +	sub	$0x80, %rdx
>
> GCC saves three bytes by using
> 	add	$-0x80, %rdx
>
Changed using a similar method.
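
For reference, the three bytes come from the immediate encoding: +0x80 does not
fit in a sign-extended 8-bit immediate, while -0x80 does. A sketch of the two
encodings:

	48 81 ea 80 00 00 00	sub	$0x80, %rdx	/* imm32 form */
	48 83 c2 80		add	$-0x80, %rdx	/* sign-extended imm8 form */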
> A third possible optimization is to move the vmovups before the loop, which
> improves latency, but it needs to be tested on Haswell.
>

Tested the variant below, but it hurt performance; the original code benefits
from the hardware prefetcher because of its sequential access pattern.

	vmovdqu	%ymm0, -0x80(%rsi)
	vmovdqu	%ymm0, -0x60(%rsi)
	vmovdqu	%ymm0, -0x40(%rsi)
	vmovdqu	%ymm0, -0x20(%rsi)
	sub	%ecx, %edx
L(gobble_128_loop):
	vmovdqa	%ymm0, (%rdi)
	vmovdqa	%ymm0, 0x20(%rdi)
	vmovdqa	%ymm0, 0x40(%rdi)
	vmovdqa	%ymm0, 0x60(%rdi)
	add	%rcx, %rdi
	sub	%ecx, %edx
	jae	L(gobble_128_loop)
....
>
>> +	ALIGN(4)
>> +L(gobble_data):
>> +#ifdef SHARED_CACHE_SIZE_HALF
>> +	mov	$SHARED_CACHE_SIZE_HALF, %r9
>> +#else
>> +	mov	__x86_shared_cache_size_half(%rip), %r9
>> +#endif
>
> typo here, __x86_64_shared_cache_size_half
__x86_64_shared_cache_size_half causes a crash, so I kept
__x86_shared_cache_size_half.

>
>> +	shl	$4, %r9
>> +	cmp	%r9, %rdx
>> +	ja	L(gobble_big_data)
>> +	mov	%rax, %r9
>> +	mov	%esi, %eax
>> +	mov	%rdx, %rcx
>> +	rep	stosb
>
> How does this compare with the stosq equivalent?
Yes, I tested that, but there was no improvement.
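The variant was roughly the following (a sketch, not the exact code tested):
broadcast the fill byte into %rax, store whole quadwords with rep stosq, and
let the trailing unaligned %ymm0 store cover the leftover 0-7 bytes.

	mov	%rax, %r9		/* preserve memset's return value */
	movzbl	%sil, %eax		/* zero-extend the fill byte */
	mov	$0x0101010101010101, %rcx
	imul	%rcx, %rax		/* broadcast the byte to all 8 bytes of %rax */
	mov	%rdx, %rcx
	shr	$3, %rcx		/* number of whole quadwords */
	rep	stosq
	vmovups	%ymm0, -0x20(%r8)	/* trailing 32-byte store covers the tail */
	mov	%r9, %rax
	vzeroupper
	ret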
>
>> +	mov	%r9, %rax
>> +	vzeroupper
>> +	ret
>> +
>
>

