[PATCH] Improve unaligned memcpy and memmove.
- From: Ondřej Bílka <neleai at seznam dot cz>
- To: libc-alpha at sourceware dot org
- Date: Mon, 19 Aug 2013 10:52:20 +0200
- Subject: [PATCH] Improve unaligned memcpy and memmove.
Hi,
This patch improves unaligned memcpy by around 7% for the gcc workload on
Nehalem/Ivy Bridge.
http://kam.mff.cuni.cz/~ondra/benchmark_string/i7_ivy_bridge/memcpy_profile_loop/results_gcc/result.html
I applied similar tricks as in the ssse3 case to get this speedup. One is to
use an explicit counter in the main loop, which makes the loop branch well
predicted. The second is a set of micro-optimizations in the header.
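To make the first trick concrete, here is a rough C sketch (mine, not the
actual code) of the new loop structure: the number of 64-byte iterations is
computed once (the shrq $6) and counted down to zero, so the loop-back branch
is a plain decrement-and-test instead of comparing the advancing destination
against an end address.

  #include <stddef.h>
  #include <string.h>

  /* Sketch only: the real body is four movdqu loads and four movdqa
     stores per iteration, with the head and tail copied separately.  */
  static void
  copy_fwd_sketch (char *dst, const char *src, size_t n)
  {
    size_t blocks = n >> 6;	/* iteration count, as in shrq $6, %rcx */
    while (blocks--)		/* sub $1, %rcx; jnz L(loop) */
      {
	memcpy (dst, src, 64);
	dst += 64;
	src += 64;
      }
  }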
As in the ssse3 case, the overhead of handling memmove was under 1%, so I
decided to alias memmove to memcpy.
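For reference, the reason the alias is safe: the entry computes dst - src and
compares it against the length, falling through to the forward loop only when
a forward copy cannot clobber unread source bytes and taking L(bwd) otherwise;
the first and last chunks are also loaded into registers before the main loop
stores anything. A byte-at-a-time sketch of that structure (not the SSE2
code, just the direction check):

  #include <stddef.h>
  #include <stdint.h>

  static void *
  overlap_safe_copy (void *dst, const void *src, size_t n)
  {
    unsigned char *d = dst;
    const unsigned char *s = src;
    /* Same test as cmpq %rdx, %rcx; jb L(bwd): copy backwards only
       when dst lands inside [src, src + n).  */
    if ((uintptr_t) d - (uintptr_t) s >= n)
      for (size_t i = 0; i < n; i++)
	d[i] = s[i];
    else
      for (size_t i = n; i-- > 0; )
	d[i] = s[i];
    return dst;
  }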
Also, in a previous iteration I missed updating mempcpy; this patch fixes
that.
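The only difference there is the return value: the new
__mempcpy_sse2_unaligned entry sets %rax to dst + n and reuses the memcpy
body, i.e. roughly (sketch, function name made up):

  #include <string.h>

  /* Sketch of the mempcpy semantics the new entry point provides.  */
  static void *
  mempcpy_sketch (void *dst, const void *src, size_t n)
  {
    memcpy (dst, src, n);
    return (unsigned char *) dst + n;
  }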
Passes testing, OK to commit?
Ondra
* sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
(__memcpy_sse2_unaligned): Optimize implementation.
(__mempcpy_sse2_unaligned): New function.
* sysdeps/x86_64/multiarch/memmove.c (__libc_memmove): Update ifunc.
* sysdeps/x86_64/multiarch/mempcpy.c (__mempcpy): Likewise.
---
sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S | 312 +++++++++++++----------
sysdeps/x86_64/multiarch/memmove.c | 10 +-
sysdeps/x86_64/multiarch/mempcpy.S | 6 +-
3 files changed, 183 insertions(+), 145 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
index efdfea2..02a129b 100644
--- a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
@@ -21,155 +21,187 @@
#include "asm-syntax.h"
#ifndef ALIGN
-# define ALIGN(n) .p2align n
+# define ALIGN(n) .p2align n
#endif
+ENTRY (__mempcpy_sse2_unaligned)
+ movq %rdi, %rax
+ addq %rdx, %rax
+ cmpq $128, %rdx
+ jbe L(less_128_bytes)
+ jmp L(from_mempcpy)
+END (__mempcpy_sse2_unaligned)
ENTRY(__memcpy_sse2_unaligned)
- movq %rsi, %rax
- leaq (%rdx,%rdx), %rcx
- subq %rdi, %rax
- subq %rdx, %rax
- cmpq %rcx, %rax
- jb L(overlapping)
- cmpq $16, %rdx
- jbe L(less_16)
- movdqu (%rsi), %xmm8
- cmpq $32, %rdx
- movdqu %xmm8, (%rdi)
- movdqu -16(%rsi,%rdx), %xmm8
- movdqu %xmm8, -16(%rdi,%rdx)
- ja .L31
-L(return):
- movq %rdi, %rax
- ret
- .p2align 4,,10
ALIGN(4)
-.L31:
- movdqu 16(%rsi), %xmm8
- cmpq $64, %rdx
- movdqu %xmm8, 16(%rdi)
- movdqu -32(%rsi,%rdx), %xmm8
- movdqu %xmm8, -32(%rdi,%rdx)
- jbe L(return)
- movdqu 32(%rsi), %xmm8
+ movq %rdi, %rax
cmpq $128, %rdx
- movdqu %xmm8, 32(%rdi)
- movdqu -48(%rsi,%rdx), %xmm8
- movdqu %xmm8, -48(%rdi,%rdx)
- movdqu 48(%rsi), %xmm8
- movdqu %xmm8, 48(%rdi)
- movdqu -64(%rsi,%rdx), %xmm8
- movdqu %xmm8, -64(%rdi,%rdx)
- jbe L(return)
- leaq 64(%rdi), %rcx
- addq %rdi, %rdx
- andq $-64, %rdx
- andq $-64, %rcx
- movq %rcx, %rax
- subq %rdi, %rax
- addq %rax, %rsi
+ jbe L(less_128_bytes)
+L(from_mempcpy):
+
+ movdqu -16(%rsi, %rdx), %xmm4
+ movdqu -32(%rsi, %rdx), %xmm5
+ movdqu -48(%rsi, %rdx), %xmm6
+ movdqu -64(%rsi, %rdx), %xmm7
+ lea (%rdi, %rdx), %r10
+ movdqu (%rsi), %xmm8
+
+ movq %rdi, %rcx
+ subq %rsi, %rcx
cmpq %rdx, %rcx
- je L(return)
- movq %rsi, %r10
- subq %rcx, %r10
- leaq 16(%r10), %r9
- leaq 32(%r10), %r8
- leaq 48(%r10), %rax
- .p2align 4,,10
+ jb L(bwd)
+
+ leaq 16(%rdi), %rdx
+ andq $-16, %rdx
+ movq %rdx, %rcx
+ subq %rdi, %rcx
+ addq %rcx, %rsi
+ movq %r10, %rcx
+ subq %rdx, %rcx
+ shrq $6, %rcx
+
ALIGN(4)
L(loop):
- movdqu (%rcx,%r10), %xmm8
- movdqa %xmm8, (%rcx)
- movdqu (%rcx,%r9), %xmm8
- movdqa %xmm8, 16(%rcx)
- movdqu (%rcx,%r8), %xmm8
- movdqa %xmm8, 32(%rcx)
- movdqu (%rcx,%rax), %xmm8
- movdqa %xmm8, 48(%rcx)
- addq $64, %rcx
- cmpq %rcx, %rdx
- jne L(loop)
- jmp L(return)
-L(overlapping):
- cmpq %rsi, %rdi
- jae .L3
- testq %rdx, %rdx
- .p2align 4,,5
- je L(return)
+ movdqu (%rsi), %xmm0
+ movdqu 16(%rsi), %xmm1
+ movdqu 32(%rsi), %xmm2
+ movdqu 48(%rsi), %xmm3
+ movdqa %xmm0, (%rdx)
+ addq $64, %rsi
+ movdqa %xmm1, 16(%rdx)
+ movdqa %xmm2, 32(%rdx)
+ movdqa %xmm3, 48(%rdx)
+ addq $64, %rdx
+ sub $1, %rcx
+ jnz L(loop)
+ movdqu %xmm8, (%rdi)
+ movdqu %xmm4, -16(%r10)
+ movdqu %xmm5, -32(%r10)
+ movdqu %xmm6, -48(%r10)
+ movdqu %xmm7, -64(%r10)
+ ret
+
+ALIGN(4)
+L(between_8_15_bytes):
+ movq -8(%rsi, %rdx), %rcx
+ movq (%rsi), %rsi
+ movq %rsi, (%rdi)
+ movq %rcx, -8(%rdi, %rdx)
+ ret
+
+ALIGN(4)
+L(between_4_7_bytes):
+ movl -4(%rsi, %rdx), %ecx
+ movl (%rsi), %esi
+ movl %esi, (%rdi)
+ movl %ecx, -4(%rdi, %rdx)
+ ret
+
+ALIGN(4)
+L(between_0_1_bytes):
+ jne L(between_0_0_bytes)
+ movzbl (%rsi), %edx
+ movb %dl, (%rdi)
+L(between_0_0_bytes):
+ ret
+
+ ALIGN(4)
+L(less_16_bytes):
+ cmp $8, %edx
+ jae L(between_8_15_bytes)
+ cmp $4, %edx
+ jae L(between_4_7_bytes)
+ cmp $1, %edx
+ jbe L(between_0_1_bytes)
+ movzwl -2(%rsi, %rdx), %ecx
+ movzwl (%rsi), %esi
+ movw %si, (%rdi)
+ movw %cx, -2(%rdi, %rdx)
+ ret
+
+ /* Here misprediction costs more than copying data twice. */
+ ALIGN(4)
+L(less_128_bytes):
+ cmp $64, %edx
+ jae L(between_64_128_bytes)
+ cmp $32, %edx
+ jae L(between_32_64_bytes)
+ cmp $16, %edx
+ jb L(less_16_bytes)
+ movdqu (%rsi), %xmm1
+ movdqu -16(%rsi, %rdx), %xmm0
+ movdqu %xmm1, (%rdi)
+ movdqu %xmm0, -16(%rdi, %rdx)
+ ret
+
+ ALIGN(4)
+L(between_32_64_bytes):
+ movdqu (%rsi), %xmm3
+ movdqu -16(%rsi, %rdx), %xmm2
+ movdqu 16(%rsi), %xmm1
+ movdqu -32(%rsi, %rdx), %xmm0
+ movdqu %xmm3, (%rdi)
+ movdqu %xmm2, -16(%rdi, %rdx)
+ movdqu %xmm1, 16(%rdi)
+ movdqu %xmm0, -32(%rdi, %rdx)
+ ret
+
+ALIGN(4)
+L(between_64_128_bytes):
+ movdqu (%rsi), %xmm7
+ movdqu -16(%rsi, %rdx), %xmm6
+ movdqu 16(%rsi), %xmm5
+ movdqu -32(%rsi, %rdx), %xmm4
+ movdqu 32(%rsi), %xmm3
+ movdqu -48(%rsi, %rdx), %xmm2
+ movdqu 48(%rsi), %xmm1
+ movdqu -64(%rsi, %rdx), %xmm0
+ movdqu %xmm7, (%rdi)
+ movdqu %xmm6, -16(%rdi, %rdx)
+ movdqu %xmm5, 16(%rdi)
+ movdqu %xmm4, -32(%rdi, %rdx)
+ movdqu %xmm3, 32(%rdi)
+ movdqu %xmm2, -48(%rdi, %rdx)
+ movdqu %xmm1, 48(%rdi)
+ movdqu %xmm0, -64(%rdi, %rdx)
+ ret
+
+ ALIGN(4)
+L(bwd):
+ leaq 16(%rdi), %rdx
+ andq $-16, %rdx
movq %rdx, %r9
- leaq 16(%rsi), %rcx
- leaq 16(%rdi), %r8
- shrq $4, %r9
- movq %r9, %rax
- salq $4, %rax
- cmpq %rcx, %rdi
- setae %cl
- cmpq %r8, %rsi
- setae %r8b
- orl %r8d, %ecx
- cmpq $15, %rdx
- seta %r8b
- testb %r8b, %cl
- je .L16
- testq %rax, %rax
- je .L16
- xorl %ecx, %ecx
- xorl %r8d, %r8d
-.L7:
- movdqu (%rsi,%rcx), %xmm8
- addq $1, %r8
- movdqu %xmm8, (%rdi,%rcx)
- addq $16, %rcx
- cmpq %r8, %r9
- ja .L7
- cmpq %rax, %rdx
- je L(return)
-.L21:
- movzbl (%rsi,%rax), %ecx
- movb %cl, (%rdi,%rax)
- addq $1, %rax
- cmpq %rax, %rdx
- ja .L21
- jmp L(return)
-L(less_16):
- testb $24, %dl
- jne L(between_9_16)
- testb $4, %dl
- .p2align 4,,5
- jne L(between_5_8)
- testq %rdx, %rdx
- .p2align 4,,2
- je L(return)
- movzbl (%rsi), %eax
- testb $2, %dl
- movb %al, (%rdi)
- je L(return)
- movzwl -2(%rsi,%rdx), %eax
- movw %ax, -2(%rdi,%rdx)
- jmp L(return)
-.L3:
- leaq -1(%rdx), %rax
- .p2align 4,,10
+ subq %rdi, %r9
+ addq %r9, %rsi
+ subq %rdx, %rcx
+ shrq $6, %rcx
+ movq %rcx, %r9
+
+ shlq $6, %r9
+ subq $64, %r9
+ addq %r9, %rsi
+ addq %r9, %rdx
+
ALIGN(4)
-.L11:
- movzbl (%rsi,%rax), %edx
- movb %dl, (%rdi,%rax)
- subq $1, %rax
- jmp .L11
-L(between_9_16):
- movq (%rsi), %rax
- movq %rax, (%rdi)
- movq -8(%rsi,%rdx), %rax
- movq %rax, -8(%rdi,%rdx)
- jmp L(return)
-.L16:
- xorl %eax, %eax
- jmp .L21
-L(between_5_8):
- movl (%rsi), %eax
- movl %eax, (%rdi)
- movl -4(%rsi,%rdx), %eax
- movl %eax, -4(%rdi,%rdx)
- jmp L(return)
+L(bwd_loop):
+ movdqu 48(%rsi), %xmm3
+ movdqu 32(%rsi), %xmm2
+ movdqu 16(%rsi), %xmm1
+ movdqu (%rsi), %xmm0
+ movdqa %xmm3, 48(%rdx)
+ movdqa %xmm2, 32(%rdx)
+ movdqa %xmm1, 16(%rdx)
+ movdqa %xmm0, (%rdx)
+ subq $64, %rdx
+ subq $64, %rsi
+ sub $1, %rcx
+ jnz L(bwd_loop)
+ movdqu %xmm8, (%rdi)
+ movdqu %xmm4, -16(%r10)
+ movdqu %xmm5, -32(%r10)
+ movdqu %xmm6, -48(%r10)
+ movdqu %xmm7, -64(%r10)
+ ret
END(__memcpy_sse2_unaligned)
+
+strong_alias(__memcpy_sse2_unaligned,__memmove_sse2_unaligned)
diff --git a/sysdeps/x86_64/multiarch/memmove.c b/sysdeps/x86_64/multiarch/memmove.c
index 8149c48..f59b00c 100644
--- a/sysdeps/x86_64/multiarch/memmove.c
+++ b/sysdeps/x86_64/multiarch/memmove.c
@@ -33,6 +33,7 @@
# undef memmove
extern __typeof (__redirect_memmove) __memmove_sse2 attribute_hidden;
+extern __typeof (__redirect_memmove) __memmove_sse2_unaligned attribute_hidden;
extern __typeof (__redirect_memmove) __memmove_ssse3 attribute_hidden;
extern __typeof (__redirect_memmove) __memmove_ssse3_back attribute_hidden;
#endif
@@ -47,10 +48,11 @@ extern __typeof (__redirect_memmove) __memmove_ssse3_back attribute_hidden;
ifunc symbol properly. */
extern __typeof (__redirect_memmove) __libc_memmove;
libc_ifunc (__libc_memmove,
- HAS_SSSE3
- ? (HAS_FAST_COPY_BACKWARD
- ? __memmove_ssse3_back : __memmove_ssse3)
- : __memmove_sse2)
+ HAS_FAST_UNALIGNED_LOAD ? __memmove_sse2_unaligned :
+ ( HAS_SSSE3
+ ? (HAS_FAST_COPY_BACKWARD
+ ? __memmove_ssse3_back : __memmove_ssse3)
+ : __memmove_sse2))
strong_alias (__libc_memmove, memmove)
diff --git a/sysdeps/x86_64/multiarch/mempcpy.S b/sysdeps/x86_64/multiarch/mempcpy.S
index b8b7fcd..03d87e4 100644
--- a/sysdeps/x86_64/multiarch/mempcpy.S
+++ b/sysdeps/x86_64/multiarch/mempcpy.S
@@ -31,7 +31,11 @@ ENTRY(__mempcpy)
jne 1f
call __init_cpu_features
1: leaq __mempcpy_sse2(%rip), %rax
- testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+ testl $bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip)
+ jz 3f
+ leaq __mempcpy_sse2_unaligned(%rip), %rax
+ ret
+3: testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
jz 2f
leaq __mempcpy_ssse3(%rip), %rax
testl $bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
--
1.8.3.2