
[PATCH] Improve unaligned memcpy and memmove.


Hi,

This patch improves the unaligned memcpy by around 7% for the gcc workload
on Nehalem/Ivy Bridge:
http://kam.mff.cuni.cz/~ondra/benchmark_string/i7_ivy_bridge/memcpy_profile_loop/results_gcc/result.html

I applied similar tricks as in the ssse3 case to get this speedup.  One is
to use an explicit counter in the main loop, which makes the loop's exit
branch better predicted.
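For illustration only, a rough C sketch of that loop-shape change; copy64
and the two function names are made up for the example, the real loop is
the 64-byte SSE loop in the patch below:

  #include <stddef.h>
  #include <string.h>

  /* Stand-in for the four movdqu/movdqa pairs in the patch: copy one
     64-byte block.  */
  static void
  copy64 (char *dst, const char *src)
  {
    memcpy (dst, src, 64);
  }

  /* Old shape: the exit condition is a pointer comparison.  */
  static void
  loop_by_end_pointer (char *dst, const char *src, char *dst_end)
  {
    while (dst != dst_end)
      {
        copy64 (dst, src);
        dst += 64;
        src += 64;
      }
  }

  /* New shape: an explicit block count decremented to zero; this is the
     form the patch switches to so the exit branch is predicted.  */
  static void
  loop_by_counter (char *dst, const char *src, size_t nblocks)
  {
    while (nblocks-- != 0)
      {
        copy64 (dst, src);
        dst += 64;
        src += 64;
      }
  }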

The second is micro-optimizations of the header, i.e. the entry paths that
handle copies shorter than 128 bytes.
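Those paths copy a block from the start and a block from the end of the
buffer and let the two overlap, so one size class covers a whole range of
lengths without extra branches (as the patch comments, misprediction costs
more than copying some data twice).  A rough C sketch of the 16..31 byte
case, with memcpy standing in for the movdqu loads and stores and
copy_16_to_31 being a made-up name:

  #include <stddef.h>
  #include <string.h>

  /* Sketch only: handle any n in [16, 31] with one 16-byte head copy and
     one 16-byte tail copy; for n < 32 the two ranges overlap.  Both
     chunks are loaded before anything is stored, as in the assembly, so
     overlapping src/dst still works.  */
  static void
  copy_16_to_31 (char *dst, const char *src, size_t n)
  {
    char head[16], tail[16];

    memcpy (head, src, 16);
    memcpy (tail, src + n - 16, 16);
    memcpy (dst, head, 16);
    memcpy (dst + n - 16, tail, 16);
  }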

As in the ssse3 case, the overhead of handling the memmove case was under
1%, so I decided to alias memmove to memcpy.
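The memmove handling in the common path boils down to the one direction
test ("subq %rsi, %rcx; cmpq %rdx, %rcx; jb L(bwd)" below); roughly, in C
(must_copy_backward is just an illustrative name):

  #include <stddef.h>
  #include <stdint.h>

  /* If dst - src, taken as an unsigned value, is smaller than len, then
     dst lies inside the source range and a forward copy would clobber
     bytes it has not yet read, so the copy must run backward (the L(bwd)
     path).  Otherwise the forward loop is safe.  */
  static int
  must_copy_backward (const void *dst, const void *src, size_t len)
  {
    return (uintptr_t) dst - (uintptr_t) src < len;
  }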

Also, in a previous iteration I missed updating mempcpy; this patch fixes
that.
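For reference, mempcpy is just memcpy that returns the end of the
destination, which is why the new __mempcpy_sse2_unaligned entry only sets
up %rax = %rdi + %rdx and then shares the memcpy body; a trivial C model
(mempcpy_model is a made-up name):

  #include <stddef.h>
  #include <string.h>

  /* Same copy as memcpy, but the return value is dst + n.  */
  static void *
  mempcpy_model (void *dst, const void *src, size_t n)
  {
    return (char *) memcpy (dst, src, n) + n;
  }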

Passes tests, OK to commit?

Ondra
	* sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
	(__memcpy_sse2_unaligned): Optimize implementation.
	(__mempcpy_sse2_unaligned): New function.
	* sysdeps/x86_64/multiarch/memmove.c (__libc_memmove): Update ifunc.
	* sysdeps/x86_64/multiarch/mempcpy.S (__mempcpy): Likewise.

---
 sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S | 312 +++++++++++++----------
 sysdeps/x86_64/multiarch/memmove.c               |  10 +-
 sysdeps/x86_64/multiarch/mempcpy.S               |   6 +-
 3 files changed, 183 insertions(+), 145 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
index efdfea2..02a129b 100644
--- a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
@@ -21,155 +21,187 @@
 #include "asm-syntax.h"
 
 #ifndef ALIGN
-# define ALIGN(n)	.p2align n
+# define ALIGN(n) .p2align n
 #endif
 
+ENTRY (__mempcpy_sse2_unaligned)
+	movq	%rdi, %rax
+	addq	%rdx, %rax
+	cmpq	$128, %rdx
+	jbe	L(less_128_bytes)
+	jmp	L(from_mempcpy)
+END (__mempcpy_sse2_unaligned)
 
 ENTRY(__memcpy_sse2_unaligned)
-	movq	%rsi, %rax
-	leaq	(%rdx,%rdx), %rcx
-	subq	%rdi, %rax
-	subq	%rdx, %rax
-	cmpq	%rcx, %rax
-	jb	L(overlapping)
-	cmpq	$16, %rdx
-	jbe	L(less_16)
-	movdqu	(%rsi), %xmm8
-	cmpq	$32, %rdx
-	movdqu	%xmm8, (%rdi)
-	movdqu	-16(%rsi,%rdx), %xmm8
-	movdqu	%xmm8, -16(%rdi,%rdx)
-	ja	.L31
-L(return):
-	movq	%rdi, %rax
-	ret
-	.p2align 4,,10
 	ALIGN(4)
-.L31:
-	movdqu	16(%rsi), %xmm8
-	cmpq	$64, %rdx
-	movdqu	%xmm8, 16(%rdi)
-	movdqu	-32(%rsi,%rdx), %xmm8
-	movdqu	%xmm8, -32(%rdi,%rdx)
-	jbe	L(return)
-	movdqu	32(%rsi), %xmm8
+	movq	%rdi, %rax
 	cmpq	$128, %rdx
-	movdqu	%xmm8, 32(%rdi)
-	movdqu	-48(%rsi,%rdx), %xmm8
-	movdqu	%xmm8, -48(%rdi,%rdx)
-	movdqu	48(%rsi), %xmm8
-	movdqu	%xmm8, 48(%rdi)
-	movdqu	-64(%rsi,%rdx), %xmm8
-	movdqu	%xmm8, -64(%rdi,%rdx)
-	jbe	L(return)
-	leaq	64(%rdi), %rcx
-	addq	%rdi, %rdx
-	andq	$-64, %rdx
-	andq	$-64, %rcx
-	movq	%rcx, %rax
-	subq	%rdi, %rax
-	addq	%rax, %rsi
+	jbe	L(less_128_bytes)
+L(from_mempcpy):
+
+	movdqu	-16(%rsi, %rdx), %xmm4
+	movdqu	-32(%rsi, %rdx), %xmm5
+	movdqu	-48(%rsi, %rdx), %xmm6
+	movdqu	-64(%rsi, %rdx), %xmm7
+	lea	(%rdi, %rdx), %r10
+	movdqu	(%rsi), %xmm8
+
+	movq	%rdi, %rcx
+	subq	%rsi, %rcx
 	cmpq	%rdx, %rcx
-	je	L(return)
-	movq	%rsi, %r10
-	subq	%rcx, %r10
-	leaq	16(%r10), %r9
-	leaq	32(%r10), %r8
-	leaq	48(%r10), %rax
-	.p2align 4,,10
+	jb	L(bwd)
+
+	leaq	16(%rdi), %rdx
+	andq	$-16, %rdx
+	movq	%rdx, %rcx
+	subq	%rdi, %rcx
+	addq	%rcx, %rsi
+	movq	%r10, %rcx
+	subq	%rdx, %rcx
+	shrq	$6, %rcx
+
 	ALIGN(4)
 L(loop):
-	movdqu	(%rcx,%r10), %xmm8
-	movdqa	%xmm8, (%rcx)
-	movdqu	(%rcx,%r9), %xmm8
-	movdqa	%xmm8, 16(%rcx)
-	movdqu	(%rcx,%r8), %xmm8
-	movdqa	%xmm8, 32(%rcx)
-	movdqu	(%rcx,%rax), %xmm8
-	movdqa	%xmm8, 48(%rcx)
-	addq	$64, %rcx
-	cmpq	%rcx, %rdx
-	jne	L(loop)
-	jmp	L(return)
-L(overlapping):
-	cmpq	%rsi, %rdi
-	jae	.L3
-	testq	%rdx, %rdx
-	.p2align 4,,5
-	je	L(return)
+	movdqu	(%rsi), %xmm0
+	movdqu	16(%rsi), %xmm1
+	movdqu	32(%rsi), %xmm2
+	movdqu	48(%rsi), %xmm3
+	movdqa	%xmm0, (%rdx)
+	addq	$64, %rsi
+	movdqa	%xmm1, 16(%rdx)
+	movdqa	%xmm2, 32(%rdx)
+	movdqa	%xmm3, 48(%rdx)
+	addq	$64, %rdx
+	sub	$1, %rcx
+	jnz	L(loop)
+	movdqu	%xmm8, (%rdi)
+	movdqu	%xmm4, -16(%r10)
+	movdqu	%xmm5, -32(%r10)
+	movdqu	%xmm6, -48(%r10)
+	movdqu	%xmm7, -64(%r10)
+	ret
+
+	ALIGN(4)
+L(between_8_15_bytes):
+	movq	-8(%rsi, %rdx), %rcx
+	movq	(%rsi), %rsi
+	movq	%rsi, (%rdi)
+	movq	%rcx, -8(%rdi, %rdx)
+	ret
+
+	ALIGN(4)
+L(between_4_7_bytes):
+	movl	-4(%rsi, %rdx), %ecx
+	movl	(%rsi), %esi
+	movl	%esi, (%rdi)
+	movl	%ecx, -4(%rdi, %rdx)
+	ret
+
+	ALIGN(4)
+L(between_0_1_bytes):
+	jne	L(between_0_0_bytes)
+	movzbl	(%rsi), %edx
+	movb	%dl, (%rdi)
+L(between_0_0_bytes):
+	ret
+
+	ALIGN(4)
+L(less_16_bytes):
+	cmp	$8, %edx
+	jae	L(between_8_15_bytes)
+	cmp	$4, %edx
+	jae	L(between_4_7_bytes)
+	cmp	$1, %edx
+	jbe	L(between_0_1_bytes)
+	movzwl	-2(%rsi, %rdx), %ecx
+	movzwl	(%rsi), %esi
+	movw	%si, (%rdi)
+	movw	%cx, -2(%rdi, %rdx)
+	ret
+
+	/* Here misprediction costs more than copying data twice.  */
+	ALIGN(4)
+L(less_128_bytes):
+	cmp	$64, %edx
+	jae	L(between_64_128_bytes)
+	cmp	$32, %edx
+	jae	L(between_32_64_bytes)
+	cmp	$16, %edx
+	jb	L(less_16_bytes)
+	movdqu	(%rsi), %xmm1
+	movdqu	-16(%rsi, %rdx), %xmm0
+	movdqu	%xmm1, (%rdi)
+	movdqu	%xmm0, -16(%rdi, %rdx)
+	ret
+
+	ALIGN(4)
+L(between_32_64_bytes):
+	movdqu	(%rsi), %xmm3
+	movdqu	-16(%rsi, %rdx), %xmm2
+	movdqu	16(%rsi), %xmm1
+	movdqu	-32(%rsi, %rdx), %xmm0
+	movdqu	%xmm3, (%rdi)
+	movdqu	%xmm2, -16(%rdi, %rdx)
+	movdqu	%xmm1, 16(%rdi)
+	movdqu	%xmm0, -32(%rdi, %rdx)
+	ret
+
+	ALIGN(4)
+L(between_64_128_bytes):
+	movdqu	(%rsi), %xmm7
+	movdqu	-16(%rsi, %rdx), %xmm6
+	movdqu	16(%rsi), %xmm5
+	movdqu	-32(%rsi, %rdx), %xmm4
+	movdqu	32(%rsi), %xmm3
+	movdqu	-48(%rsi, %rdx), %xmm2
+	movdqu	48(%rsi), %xmm1
+	movdqu	-64(%rsi, %rdx), %xmm0
+	movdqu	%xmm7, (%rdi)
+	movdqu	%xmm6, -16(%rdi, %rdx)
+	movdqu	%xmm5, 16(%rdi)
+	movdqu	%xmm4, -32(%rdi, %rdx)
+	movdqu	%xmm3, 32(%rdi)
+	movdqu	%xmm2, -48(%rdi, %rdx)
+	movdqu	%xmm1, 48(%rdi)
+	movdqu	%xmm0, -64(%rdi, %rdx)
+	ret
+
+	ALIGN(4)
+L(bwd):
+	leaq	16(%rdi), %rdx
+	andq	$-16, %rdx
 	movq	%rdx, %r9
-	leaq	16(%rsi), %rcx
-	leaq	16(%rdi), %r8
-	shrq	$4, %r9
-	movq	%r9, %rax
-	salq	$4, %rax
-	cmpq	%rcx, %rdi
-	setae	%cl
-	cmpq	%r8, %rsi
-	setae	%r8b
-	orl	%r8d, %ecx
-	cmpq	$15, %rdx
-	seta	%r8b
-	testb	%r8b, %cl
-	je	.L16
-	testq	%rax, %rax
-	je	.L16
-	xorl	%ecx, %ecx
-	xorl	%r8d, %r8d
-.L7:
-	movdqu	(%rsi,%rcx), %xmm8
-	addq	$1, %r8
-	movdqu	%xmm8, (%rdi,%rcx)
-	addq	$16, %rcx
-	cmpq	%r8, %r9
-	ja	.L7
-	cmpq	%rax, %rdx
-	je	L(return)
-.L21:
-	movzbl	(%rsi,%rax), %ecx
-	movb	%cl, (%rdi,%rax)
-	addq	$1, %rax
-	cmpq	%rax, %rdx
-	ja	.L21
-	jmp	L(return)
-L(less_16):
-	testb	$24, %dl
-	jne	L(between_9_16)
-	testb	$4, %dl
-	.p2align 4,,5
-	jne	L(between_5_8)
-	testq	%rdx, %rdx
-	.p2align 4,,2
-	je	L(return)
-	movzbl	(%rsi), %eax
-	testb	$2, %dl
-	movb	%al, (%rdi)
-	je	L(return)
-	movzwl	-2(%rsi,%rdx), %eax
-	movw	%ax, -2(%rdi,%rdx)
-	jmp	L(return)
-.L3:
-	leaq	-1(%rdx), %rax
-	.p2align 4,,10
+	subq	%rdi, %r9
+	addq	%r9, %rsi
+	subq	%rdx, %rcx
+	shrq	$6, %rcx
+	movq	%rcx, %r9
+
+	shlq	$6, %r9
+	subq	$64, %r9
+	addq	%r9, %rsi
+	addq	%r9, %rdx
+
 	ALIGN(4)
-.L11:
-	movzbl	(%rsi,%rax), %edx
-	movb	%dl, (%rdi,%rax)
-	subq	$1, %rax
-	jmp	.L11
-L(between_9_16):
-	movq	(%rsi), %rax
-	movq	%rax, (%rdi)
-	movq	-8(%rsi,%rdx), %rax
-	movq	%rax, -8(%rdi,%rdx)
-	jmp	L(return)
-.L16:
-	xorl	%eax, %eax
-	jmp	.L21
-L(between_5_8):
-	movl	(%rsi), %eax
-	movl	%eax, (%rdi)
-	movl	-4(%rsi,%rdx), %eax
-	movl	%eax, -4(%rdi,%rdx)
-	jmp	L(return)
+L(bwd_loop):
+	movdqu	48(%rsi), %xmm3
+	movdqu	32(%rsi), %xmm2
+	movdqu	16(%rsi), %xmm1
+	movdqu	(%rsi), %xmm0
+	movdqa	%xmm3, 48(%rdx)
+	movdqa	%xmm2, 32(%rdx)
+	movdqa	%xmm1, 16(%rdx)
+	movdqa	%xmm0, (%rdx)
+	subq	$64, %rdx
+	subq	$64, %rsi
+	sub	$1, %rcx
+	jnz	L(bwd_loop)
+	movdqu	%xmm8, (%rdi)
+	movdqu	%xmm4, -16(%r10)
+	movdqu	%xmm5, -32(%r10)
+	movdqu	%xmm6, -48(%r10)
+	movdqu	%xmm7, -64(%r10)
+	ret
 END(__memcpy_sse2_unaligned)
+
+strong_alias(__memcpy_sse2_unaligned,__memmove_sse2_unaligned)
diff --git a/sysdeps/x86_64/multiarch/memmove.c b/sysdeps/x86_64/multiarch/memmove.c
index 8149c48..f59b00c 100644
--- a/sysdeps/x86_64/multiarch/memmove.c
+++ b/sysdeps/x86_64/multiarch/memmove.c
@@ -33,6 +33,7 @@
 # undef memmove
 
 extern __typeof (__redirect_memmove) __memmove_sse2 attribute_hidden;
+extern __typeof (__redirect_memmove) __memmove_sse2_unaligned attribute_hidden;
 extern __typeof (__redirect_memmove) __memmove_ssse3 attribute_hidden;
 extern __typeof (__redirect_memmove) __memmove_ssse3_back attribute_hidden;
 #endif
@@ -47,10 +48,11 @@ extern __typeof (__redirect_memmove) __memmove_ssse3_back attribute_hidden;
    ifunc symbol properly.  */
 extern __typeof (__redirect_memmove) __libc_memmove;
 libc_ifunc (__libc_memmove,
-	    HAS_SSSE3
-	    ? (HAS_FAST_COPY_BACKWARD
-	       ? __memmove_ssse3_back : __memmove_ssse3)
-	    : __memmove_sse2)
+	    HAS_FAST_UNALIGNED_LOAD ? __memmove_sse2_unaligned :
+	    ( HAS_SSSE3
+	      ? (HAS_FAST_COPY_BACKWARD
+	         ? __memmove_ssse3_back : __memmove_ssse3)
+	      : __memmove_sse2))
 
 strong_alias (__libc_memmove, memmove)
 
diff --git a/sysdeps/x86_64/multiarch/mempcpy.S b/sysdeps/x86_64/multiarch/mempcpy.S
index b8b7fcd..03d87e4 100644
--- a/sysdeps/x86_64/multiarch/mempcpy.S
+++ b/sysdeps/x86_64/multiarch/mempcpy.S
@@ -31,7 +31,11 @@ ENTRY(__mempcpy)
 	jne	1f
 	call	__init_cpu_features
 1:	leaq	__mempcpy_sse2(%rip), %rax
-	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+	testl	$bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip)
+	jz	3f
+	leaq	__mempcpy_sse2_unaligned(%rip), %rax
+	ret
+3:	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
 	jz	2f
 	leaq	__mempcpy_ssse3(%rip), %rax
 	testl	$bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
-- 
1.8.3.2

