This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.



[PING] [PATCH] faster memcpy on x64.


ping
On Sun, Apr 28, 2013 at 12:16:20AM +0200, Ondřej Bílka wrote:
> Hi,
> 
> I was occupied for the last few weeks analyzing memcpy and memset, and I
> have better implementations than the current ones. This patch is about memcpy.
> 
> Benchmark results are at 
> http://kam.mff.cuni.cz/~ondra/memcpy_profile.html
> or archived at 
> http://kam.mff.cuni.cz/~ondra/memcpy_profile_result27_04_13.tar.bz2
> 
> I tried to modify this for memmove and I found that the additional
> cost is close to zero when the buffers do not overlap.
> So this implementation can be aliased to memmove.
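> 
> A minimal C sketch of the kind of overlap test this relies on (the
> attached code folds its check into a single unsigned compare; the
> helper below is only an illustration):
> 
>   #include <stddef.h>
>   #include <stdint.h>
> 
>   /* Nonzero if [dest, dest+n) and [src, src+n) overlap.  */
>   static int
>   buffers_overlap (const char *dest, const char *src, size_t n)
>   {
>     uintptr_t d = (uintptr_t) dest, s = (uintptr_t) src;
>     return (d < s ? s - d : d - s) < n;
>   }
> 
> Only when the test fires does the code take the careful byte-wise
> path, so the common case pays one compare and one branch.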
> 
> An important part there is the test of memcpy in a hooked gcc, which
> shows a small but real speedup. memcpy_new 1) is faster on newer
> processors, while memcpy_new_small is faster on slower ones.
> 
> Could we test 2) on a wider range of use cases and report the results?
> 
> Here we hit the fact that strings in practice are small, so we mostly
> pay the latency of fetching the data.
> 
> The microbenchmark results look much better.
> 
> The main speedup is obtained by avoiding computed loops and simplifying
> the control flow for better speculative execution. 1)
> 
> This gives a 20% speedup for 32-1000 byte strings.
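> 
> To make "avoiding computed loops" concrete: for a mid-sized copy the
> first and last 16 bytes of the block are copied with (possibly
> overlapping) unaligned vector moves, so no length-dependent loop is
> needed. A minimal sketch of that step, mirroring the attached
> memcpy_new_u (the helper name is made up for illustration):
> 
>   #include <emmintrin.h>
>   #include <stddef.h>
> 
>   /* Copy 16 <= n <= 32 bytes: the two 16-byte moves overlap in the
>      middle instead of running a byte or word loop.  */
>   static void
>   copy_16_to_32 (char *dest, const char *src, size_t n)
>   {
>     __m128i head = _mm_loadu_si128 ((const __m128i *) src);
>     __m128i tail = _mm_loadu_si128 ((const __m128i *) (src + n - 16));
>     _mm_storeu_si128 ((__m128i *) dest, head);
>     _mm_storeu_si128 ((__m128i *) (dest + n - 16), tail);
>   }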
> 
> Second, the loop that I use is on most architectures asymptotically
> faster than the gcc one for data in the L1, L2, or L3 cache.
> When the data is in memory, then memory is the bottleneck and the
> choice of implementation can give at most 1%.
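> 
> The loop in question is the same pattern as the one in the attached
> file: unaligned 16-byte loads feeding 16-byte stores to a 64-byte
> aligned destination, 64 bytes per iteration. A minimal intrinsics
> sketch, assuming dest is already 64-byte aligned and n is a multiple
> of 64:
> 
>   #include <emmintrin.h>
>   #include <stddef.h>
> 
>   static void
>   copy_aligned_blocks (char *dest, const char *src, size_t n)
>   {
>     /* DEST is 64-byte aligned and N is a multiple of 64.  */
>     for (char *end = dest + n; dest != end; dest += 64, src += 64)
>       {
>         _mm_store_si128 ((__m128i *) dest,
>                          _mm_loadu_si128 ((const __m128i *) src));
>         _mm_store_si128 ((__m128i *) (dest + 16),
>                          _mm_loadu_si128 ((const __m128i *) (src + 16)));
>         _mm_store_si128 ((__m128i *) (dest + 32),
>                          _mm_loadu_si128 ((const __m128i *) (src + 32)));
>         _mm_store_si128 ((__m128i *) (dest + 48),
>                          _mm_loadu_si128 ((const __m128i *) (src + 48)));
>       }
>   }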
> 
> I tested an AVX version, which is slower on current processors due to
> the fact that it is faster to load the high and low halves separately.
> 
> 1) Except on core2 and athlon, where I need an even simpler control flow
> (memcpy_new_small) to get that speedup.
> 
> I attached the file from which I generated this patch. There are a few
> mistakes made by gcc; I could post a diff against the vanilla version.
> 
> I have not tried to optimize for atom yet, so I keep the ifunc for it.
> 
> Passes testsuite. OK for 2.18?
> 
> Ondra
> 
> 1) File variant/memcpy_new_small.s in 2)
> 2) http://kam.mff.cuni.cz/~ondra/memcpy_profile27_04_13.tar.bz2
> 
> 
> 	* sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S: New file.
> 	* sysdeps/x86_64/multiarch/memcpy.S (__new_memcpy): Add
> 	__memcpy_sse2_unaligned ifunc selection. 
> 	* sysdeps/x86_64/multiarch/Makefile (sysdep_routines):
> 	Add memcpy-sse2-unaligned.S.
> 	* sysdeps/x86_64/multiarch/ifunc-impl-list.c (__libc_ifunc_impl_list):
> 	Add __memcpy_sse2_unaligned.
> 
> ---
>  sysdeps/x86_64/multiarch/Makefile                |    2 +-
>  sysdeps/x86_64/multiarch/ifunc-impl-list.c       |    1 +
>  sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S |  176 ++++++++++++++++++++++
>  sysdeps/x86_64/multiarch/memcpy.S                |   15 +-
>  4 files changed, 186 insertions(+), 8 deletions(-)
>  create mode 100644 sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
> 
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index 86787ee..203d16e 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -7,7 +7,7 @@ endif
>  ifeq ($(subdir),string)
>  
>  sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
> -		   strend-sse4 memcmp-sse4 memcpy-ssse3 mempcpy-ssse3 \
> +		   strend-sse4 memcmp-sse4 memcpy-ssse3 memcpy-sse2-unaligned mempcpy-ssse3 \
>  		   memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
>  		   memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \
>  		   strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index 05315fd..28d3579 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -227,6 +227,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>  	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSSE3,
>  			      __memcpy_ssse3_back)
>  	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSSE3, __memcpy_ssse3)
> +	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned)
>  	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2))
>  
>    /* Support sysdeps/x86_64/multiarch/mempcpy_chk.S.  */
> diff --git a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
> new file mode 100644
> index 0000000..1b86c41
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
> @@ -0,0 +1,176 @@
> +/* memcpy with unaligned loads
> +   Copyright (C) 2013 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +
> +#include "asm-syntax.h"
> +
> +#ifndef ALIGN
> +# define ALIGN(n)	.p2align n
> +#endif
> +
> +
> +ENTRY(__memcpy_sse2_unaligned)
> +	movq	%rsi, %rax
> +	leaq	(%rdx,%rdx), %rcx
> +	subq	%rdi, %rax
> +	subq	%rdx, %rax
> +	cmpq	%rcx, %rax
> +	jb	L(overlapping)
> +	cmpq	$16, %rdx
> +	jbe	L(less_16)
> +	movdqu	(%rsi), %xmm8
> +	cmpq	$32, %rdx
> +	movdqu	%xmm8, (%rdi)
> +	movdqu	-16(%rsi,%rdx), %xmm8
> +	movdqu	%xmm8, -16(%rdi,%rdx)
> +	ja	.L31
> +L(return):
> +	movq	%rdi, %rax
> +	ret
> +	.p2align 4,,10
> +	ALIGN(4)
> +.L31:
> +	movdqu	16(%rsi), %xmm8
> +	cmpq	$64, %rdx
> +	movdqu	%xmm8, 16(%rdi)
> +	movdqu	-32(%rsi,%rdx), %xmm8
> +	movdqu	%xmm8, -32(%rdi,%rdx)
> +	jbe	L(return)
> +	movdqu	32(%rsi), %xmm8
> +	cmpq	$128, %rdx
> +	movdqu	%xmm8, 32(%rdi)
> +	movdqu	-48(%rsi,%rdx), %xmm8
> +	movdqu	%xmm8, -48(%rdi,%rdx)
> +	movdqu	48(%rsi), %xmm8
> +	movdqu	%xmm8, 48(%rdi)
> +	movdqu	-64(%rsi,%rdx), %xmm8
> +	movdqu	%xmm8, -64(%rdi,%rdx)
> +	jbe	L(return)
> +	leaq	64(%rdi), %rcx
> +	addq	%rdi, %rdx
> +	andq	$-64, %rdx
> +	andq	$-64, %rcx
> +	movq	%rcx, %rax
> +	subq	%rdi, %rax
> +	addq	%rax, %rsi
> +	cmpq	%rdx, %rcx
> +	je	L(return)
> +	movq	%rsi, %r10
> +	subq	%rcx, %r10
> +	leaq	16(%r10), %r9
> +	leaq	32(%r10), %r8
> +	leaq	48(%r10), %rax
> +	.p2align 4,,10
> +	ALIGN(4)
> +L(loop):
> +	movdqu	(%rcx,%r10), %xmm8
> +	movdqa	%xmm8, (%rcx)
> +	movdqu	(%rcx,%r9), %xmm8
> +	movdqa	%xmm8, 16(%rcx)
> +	movdqu	(%rcx,%r8), %xmm8
> +	movdqa	%xmm8, 32(%rcx)
> +	movdqu	(%rcx,%rax), %xmm8
> +	movdqa	%xmm8, 48(%rcx)
> +	addq	$64, %rcx
> +	cmpq	%rcx, %rdx
> +	jne	L(loop)
> +	jmp	L(return)
> +L(overlapping):
> +	cmpq	%rsi, %rdi
> +	jae	.L3
> +	testq	%rdx, %rdx
> +	.p2align 4,,5
> +	je	L(return)
> +	movq	%rdx, %r9
> +	leaq	16(%rsi), %rcx
> +	leaq	16(%rdi), %r8
> +	shrq	$4, %r9
> +	movq	%r9, %rax
> +	salq	$4, %rax
> +	cmpq	%rcx, %rdi
> +	setae	%cl
> +	cmpq	%r8, %rsi
> +	setae	%r8b
> +	orl	%r8d, %ecx
> +	cmpq	$15, %rdx
> +	seta	%r8b
> +	testb	%r8b, %cl
> +	je	.L16
> +	testq	%rax, %rax
> +	je	.L16
> +	xorl	%ecx, %ecx
> +	xorl	%r8d, %r8d
> +.L7:
> +	movdqu	(%rsi,%rcx), %xmm8
> +	addq	$1, %r8
> +	movdqu	%xmm8, (%rdi,%rcx)
> +	addq	$16, %rcx
> +	cmpq	%r8, %r9
> +	ja	.L7
> +	cmpq	%rax, %rdx
> +	je	L(return)
> +.L21:
> +	movzbl	(%rsi,%rax), %ecx
> +	movb	%cl, (%rdi,%rax)
> +	addq	$1, %rax
> +	cmpq	%rax, %rdx
> +	ja	.L21
> +	jmp	L(return)
> +L(less_16):
> +	testb	$24, %dl
> +	jne	L(between_9_16)
> +	testb	$4, %dl
> +	.p2align 4,,5
> +	jne	L(between_5_8)
> +	testq	%rdx, %rdx
> +	.p2align 4,,2
> +	je	L(return)
> +	movzbl	(%rsi), %eax
> +	testb	$2, %dl
> +	movb	%al, (%rdi)
> +	je	L(return)
> +	movzwl	-2(%rsi,%rdx), %eax
> +	movw	%ax, -2(%rdi,%rdx)
> +	jmp	L(return)
> +.L3:
> +	leaq	-1(%rdx), %rax
> +	.p2align 4,,10
> +	ALIGN(4)
> +.L11:
> +	movzbl	(%rsi,%rax), %edx
> +	movb	%dl, (%rdi,%rax)
> +	subq	$1, %rax
> +	jmp	.L11
> +L(between_9_16):
> +	movq	(%rsi), %rax
> +	movq	%rax, (%rdi)
> +	movq	-8(%rsi,%rdx), %rax
> +	movq	%rax, -8(%rdi,%rdx)
> +	jmp	L(return)
> +.L16:
> +	xorl	%eax, %eax
> +	jmp	.L21
> +L(between_5_8):
> +	movl	(%rsi), %eax
> +	movl	%eax, (%rdi)
> +	movl	-4(%rsi,%rdx), %eax
> +	movl	%eax, -4(%rdi,%rdx)
> +	jmp	L(return)
> +END(__memcpy_sse2_unaligned)
> +
> diff --git a/sysdeps/x86_64/multiarch/memcpy.S b/sysdeps/x86_64/multiarch/memcpy.S
> index b452f53..a1e5031 100644
> --- a/sysdeps/x86_64/multiarch/memcpy.S
> +++ b/sysdeps/x86_64/multiarch/memcpy.S
> @@ -33,13 +33,14 @@ ENTRY(__new_memcpy)
>  	jne	1f
>  	call	__init_cpu_features
>  1:	leaq	__memcpy_sse2(%rip), %rax
> -	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
> -	jz	2f
> -	leaq	__memcpy_ssse3(%rip), %rax
> -	testl	$bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
> -	jz	2f
> -	leaq	__memcpy_ssse3_back(%rip), %rax
> -2:	ret
> +	testl	$bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
> +	jnz	2f
> +	leaq	__memcpy_sse2_unaligned(%rip), %rax
> +	ret
> +2:	testl   $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
> +	jz 3f
> +	leaq    __memcpy_ssse3(%rip), %rax
> +3:	ret
>  END(__new_memcpy)
>  
>  # undef ENTRY
> -- 
> 1.7.4.4
> 

> #include <emmintrin.h>
> #include <stdint.h>
> #include <stdlib.h>
> typedef __m128i tp_vector;
> typedef uint64_t tp_mask;
> 
> #define LOAD(x) _mm_load_si128 ((tp_vector *) (x))
> #define LOADU(x) _mm_loadu_si128 ((tp_vector *) (x))
> #define STORE(x, y) _mm_store_si128 ((tp_vector *) (x), (y))
> #define STOREU(x, y) _mm_storeu_si128 ((tp_vector *) (x), (y))
> #define CONCAT(x, y, n) _mm_alignr_epi8 (y , x ,n)
> #define MIN _mm_min_epu8
> #define EQ  _mm_cmpeq_epi8
> #define OR  _mm_or_si128
> 
> #define BROADCAST(x) _mm_set1_epi8 (x)
> #define get_mask(x) ((uint64_t) _mm_movemask_epi8 (x))
> 
> /* Align VALUE down by ALIGN bytes.  */
> #define ALIGN_DOWN(value, align) \
>        ALIGN_DOWN_M1(value, align - 1)
> /* Align VALUE down by ALIGN_M1 + 1 bytes.
>    Useful if you have precomputed ALIGN - 1.  */
> #define ALIGN_DOWN_M1(value, align_m1) \
>        (void *)((uintptr_t)(value) \
>                 & ~(uintptr_t)(align_m1))
> 
> /* Align VALUE up by ALIGN bytes.  */
> #define ALIGN_UP(value, align) \
>        ALIGN_UP_M1(value, align - 1)
> /* Align VALUE up by ALIGN_M1 + 1 bytes.
>    Useful if you have precomputed ALIGN - 1.  */
> #define ALIGN_UP_M1(value, align_m1) \
>        (void *)(((uintptr_t)(value) + (uintptr_t)(align_m1)) \
>                 & ~(uintptr_t)(align_m1))
> 
> 
> static char *__memcpy_overlap (char *dest, char *src, size_t n)
> {
>   size_t i;
>   if (dest < src)
>     for (i = 0; i < n; i++)
>       dest[i] = src[i];
>   else
>     for (i = n; i-- > 0;)
>       dest[i] = src[i];
>   return dest;
> }
> char *memcpy_libc(char *x,char *y,size_t n);
> 
> static char *memcpy_small (char *dest, char *src, size_t no, char *ret);
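> 
> /* Copy N bytes from SRC to DEST.  Sizes up to 16 go through
>    memcpy_small; up to 128 bytes are handled by overlapping unaligned
>    16-byte head/tail moves; larger copies do a 64-byte unaligned head
>    and tail and then a loop of unaligned loads and aligned stores.  */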
> void *memcpy_new_u (char *dest, char *src, size_t n)
> {
>   char *from, *to;
>   char *dest2 = dest;
>   if (__builtin_expect ((uint64_t) ((src - dest) - n) < 2 * n, 0))
>     return __memcpy_overlap (dest, src, n);
>   if (__builtin_expect (n <= 128, 1))
>     {
>       if (n <= 16)
>         return memcpy_small (dest, src, n, dest2);
>       STOREU (dest, LOADU (src));
>       STOREU (dest + n - 16, LOADU (src + n - 16));
>       if (n <= 32) return dest2;
>       STOREU (dest + 16, LOADU (src + 16));
>       STOREU (dest + n - 32, LOADU (src + n - 32));
>       if (n <= 64) return dest2;
>       STOREU (dest + 32, LOADU (src + 32));
>       STOREU (dest + n - 48, LOADU (src + n - 48));
>       STOREU (dest + 48, LOADU (src + 48));
>       STOREU (dest + n - 64, LOADU (src + n - 64));
>       return dest2;
>     }
>   STOREU (dest, LOADU (src));
>   STOREU (dest + n - 16, LOADU (src + n - 16));
>   STOREU (dest + 16, LOADU (src + 16));
>   STOREU (dest + n - 32, LOADU (src + n - 32));
>   STOREU (dest + 32, LOADU (src + 32));
>   STOREU (dest + n - 48, LOADU (src + n - 48));
>   STOREU (dest + 48, LOADU (src + 48));
>   STOREU (dest + n - 64, LOADU (src + n - 64));
> 
>   from = ALIGN_DOWN (dest + 64, 64);
>   to   = ALIGN_DOWN (dest + n, 64);
>   src += from - dest;
>   while (from != to)
>     {
>       STORE (from, LOADU (src));
>       STORE (from + 16, LOADU (src + 16));
>       STORE (from + 32, LOADU (src + 32));
>       STORE (from + 48, LOADU (src + 48));
>       from += 64;
>       src += 64;
>     }
>   return dest2;
> }
> 
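> /* Copy NO <= 16 bytes.  The bit tests pick the widest head/tail pair
>    that covers NO: 8-byte moves for 8..16, 4-byte moves for 4..7, and a
>    single byte plus a 2-byte tail for 1..3.  RET is returned unchanged.  */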
> static char *memcpy_small (char *dest, char *src, size_t no, char *ret)
> {
>   if (no & (8 + 16))
>     {
>       ((uint64_t *) dest)[0] = ((uint64_t *) src)[0];
>       ((uint64_t *)(dest + no - 8))[0] = ((uint64_t *)(src + no - 8))[0];
>       return ret;
>     }
>   if (no & 4)
>     {
>       ((uint32_t *) dest)[0] = ((uint32_t *) src)[0];
>       ((uint32_t *)(dest + no - 4))[0] = ((uint32_t *)(src + no - 4))[0];
>       return ret;
>     }
>   if (no)
>     dest[0] = src[0];
>   if (no & 2)
>     {
>       ((uint16_t *)(dest + no - 2))[0] = ((uint16_t *)(src + no - 2))[0];
>       return ret;
>     }
>   return ret;
> }
> 


-- 

divide-by-zero error

