This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.
Re: [PATCH v1.2] Improve unaligned memcpy and memmove.
- From: Ondřej Bílka <neleai at seznam dot cz>
- To: Liubov Dmitrieva <liubov dot dmitrieva at gmail dot com>
- Cc: libc-alpha at sourceware dot org
- Date: Fri, 4 Oct 2013 00:09:26 +0200
- Subject: Re: [PATCH v1.2] Improve unaligned memcpy and memmove.
- Authentication-results: sourceware.org; auth=none
- References: <20130819085220 dot GB19541 at domone> <20130829153829 dot GA6105 at domone dot kolej dot mff dot cuni dot cz>
On Thu, Aug 29, 2013 at 05:38:29PM +0200, Ondřej Bílka wrote:
> ping,
>
> The following version differs by fixing the formatting issues that Roland
> noted and removing trailing whitespace.
>
> On Mon, Aug 19, 2013 at 10:52:20AM +0200, Ondřej Bílka wrote:
> > Hi,
> >
> > This patch improves unaligned memcpy by around 7% for the gcc workload on
> > Nehalem/Ivy Bridge.
> > http://kam.mff.cuni.cz/~ondra/benchmark_string/i7_ivy_bridge/memcpy_profile_loop/results_gcc/result.html
> >
> > I applied similar tricks as in the ssse3 case to get this speedup. One is
> > to use an explicit counter in the loop, which makes the loop branch well
> > predicted.
> >
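For reference, the explicit-counter trick amounts to computing the number of
64-byte iterations up front and counting it down to zero, instead of comparing
the destination pointer against the end address on every pass. A rough C
equivalent (a simplified sketch only; copy_64byte_chunks is a made-up name,
and the real loop uses movdqu loads and movdqa stores rather than memcpy):

#include <stddef.h>
#include <string.h>

/* Model of the main copy loop: the iteration count is computed once and
   decremented, so the loop ends on a count-down branch instead of a
   pointer comparison.  Assumes bytes is a nonzero multiple of 64.  */
static void
copy_64byte_chunks (char *dst, const char *src, size_t bytes)
{
  size_t iters = bytes >> 6;      /* number of 64-byte chunks */
  while (iters--)
    {
      memcpy (dst, src, 64);      /* stands in for four 16-byte SSE moves */
      dst += 64;
      src += 64;
    }
}
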
I updated the patch with the following changes.
The first change is that on Silvermont a rep movsq is faster from 4000 bytes
onward, so I added an implementation that switches to it for large copies.
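
The switchover is roughly the following (a minimal sketch using GCC inline
asm on x86_64; copy_large_rep_movsq is a made-up name, and the real code in
memcpy-sse2-silvermont.S additionally aligns the destination and patches up
the first and last quadwords separately, which this sketch omits):

#include <stddef.h>

/* Copy with rep movsq; only handles sizes that are multiples of 8.
   rdi/rsi/rcx are both inputs and outputs of the instruction, hence the
   "+" constraints, and it writes memory, hence the clobber.  */
static void
copy_large_rep_movsq (void *dst, const void *src, size_t bytes)
{
  size_t qwords = bytes >> 3;     /* rep movsq moves 8 bytes per step */
  __asm__ __volatile__ ("rep movsq"
                        : "+D" (dst), "+S" (src), "+c" (qwords)
                        :
                        : "memory");
}
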
The second change is that I looked at whether I could gain something by
rearranging the code to decrease the time for sizes up to 16 bytes.
I made the header smaller by realizing that we only need to handle sizes
less than 80 until the loop takes over. This allows saving one branch by
handling sizes 16-48 with:

  memcpy (x, y, 16);
  memcpy (x + (n - 16) / 2, y + (n - 16) / 2, 16);
  memcpy (x + n - 16, y + n - 16, 16);
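
This covers any n between 16 and 48 because the middle copy starts at
(n - 16) / 2 <= 16 and ends at (n - 16) / 2 + 16 >= n - 16, so the three
16-byte stores together span the whole destination. A small standalone check
of that claim (hypothetical test code, not part of the patch):

#include <assert.h>
#include <string.h>

static void
copy_16_48 (char *x, const char *y, size_t n)
{
  memcpy (x, y, 16);
  memcpy (x + (n - 16) / 2, y + (n - 16) / 2, 16);
  memcpy (x + n - 16, y + n - 16, 16);
}

int
main (void)
{
  char src[48], want[48], got[48];
  for (int i = 0; i < 48; i++)
    src[i] = (char) (i + 1);
  for (size_t n = 16; n <= 48; n++)
    {
      memcpy (want, src, n);
      memset (got, 0, sizeof got);
      copy_16_48 (got, src, n);
      assert (memcmp (got, want, n) == 0);
    }
  return 0;
}
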
The third change is adding the new implementations to ifunc-impl-list.c,
which I missed in the previous iteration.
Results are here.
http://kam.mff.cuni.cz/~ondra/benchmark_string/memcpy_profile_loop.html
* sysdeps/x86_64/multiarch/init-arch.h: Add HAS_SLOW_SSE4_2.
* sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S:
Optimize implementation.
* sysdeps/x86_64/multiarch/memcpy-sse2-silvermont.S: New file.
* sysdeps/x86_64/multiarch/Makefile (sysdep-routines):
Add memcpy-sse2-silvermont.
* sysdeps/x86_64/multiarch/memcpy.S: Update ifunc.
* sysdeps/x86_64/multiarch/memmove.c: Likewise.
* sysdeps/x86_64/multiarch/mempcpy.S: Likewise.
* sysdeps/x86_64/multiarch/ifunc-impl-list.c: Update.
---
sysdeps/x86_64/multiarch/Makefile | 2 +-
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 5 +
sysdeps/x86_64/multiarch/init-arch.h | 1 +
sysdeps/x86_64/multiarch/memcpy-sse2-silvermont.S | 6 +
sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S | 329 ++++++++++++---------
sysdeps/x86_64/multiarch/memcpy.S | 3 +
sysdeps/x86_64/multiarch/memmove.c | 12 +-
sysdeps/x86_64/multiarch/mempcpy.S | 9 +-
8 files changed, 219 insertions(+), 148 deletions(-)
create mode 100644 sysdeps/x86_64/multiarch/memcpy-sse2-silvermont.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 9fd0fd6..74662c6 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -17,7 +17,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
strcpy-sse2-unaligned strncpy-sse2-unaligned \
stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
strcat-sse2-unaligned strncat-sse2-unaligned \
- strchr-sse2-no-bsf memcmp-ssse3
+ strchr-sse2-no-bsf memcmp-ssse3 memcpy-sse2-silvermont
ifeq (yes,$(config-cflags-sse4))
sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift
CFLAGS-varshift.c += -msse4
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 71beab8..ee59f0c 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -59,6 +59,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__memmove_ssse3_back)
IFUNC_IMPL_ADD (array, i, memmove, HAS_SSSE3,
__memmove_ssse3)
+ IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_sse2_unaligned)
+ IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_sse2_silvermont)
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_sse2))
/* Support sysdeps/x86_64/multiarch/stpncpy.S. */
@@ -215,6 +217,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSSE3,
__memcpy_ssse3_back)
IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSSE3, __memcpy_ssse3)
+ IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_silvermont)
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2))
@@ -233,6 +236,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__mempcpy_ssse3_back)
IFUNC_IMPL_ADD (array, i, mempcpy, HAS_SSSE3,
__mempcpy_ssse3)
+ IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_sse2_unaligned)
+ IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_sse2_silvermont)
IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_sse2))
/* Support sysdeps/x86_64/multiarch/strncmp.S. */
diff --git a/sysdeps/x86_64/multiarch/init-arch.h b/sysdeps/x86_64/multiarch/init-arch.h
index 0cb5f5b..41c23d5 100644
--- a/sysdeps/x86_64/multiarch/init-arch.h
+++ b/sysdeps/x86_64/multiarch/init-arch.h
@@ -170,6 +170,7 @@ extern const struct cpu_features *__get_cpu_features (void)
# define HAS_FAST_REP_STRING HAS_ARCH_FEATURE (Fast_Rep_String)
# define HAS_FAST_COPY_BACKWARD HAS_ARCH_FEATURE (Fast_Copy_Backward)
# define HAS_SLOW_BSF HAS_ARCH_FEATURE (Slow_BSF)
+# define HAS_SLOW_SSE4_2 HAS_ARCH_FEATURE (Slow_SSE4_2)
# define HAS_FAST_UNALIGNED_LOAD HAS_ARCH_FEATURE (Fast_Unaligned_Load)
# define HAS_AVX HAS_ARCH_FEATURE (AVX_Usable)
# define HAS_FMA HAS_ARCH_FEATURE (FMA_Usable)
diff --git a/sysdeps/x86_64/multiarch/memcpy-sse2-silvermont.S b/sysdeps/x86_64/multiarch/memcpy-sse2-silvermont.S
new file mode 100644
index 0000000..1c4bec6
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcpy-sse2-silvermont.S
@@ -0,0 +1,6 @@
+#define FOR_SILVERMONT
+#define __memcpy_sse2_unaligned __memcpy_sse2_silvermont
+#define __mempcpy_sse2_unaligned __mempcpy_sse2_silvermont
+#define __memmove_sse2_unaligned __memmove_sse2_silvermont
+
+#include "memcpy-sse2-unaligned.S"
diff --git a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
index efdfea2..6ed9af6 100644
--- a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
@@ -18,158 +18,203 @@
#include <sysdep.h>
-#include "asm-syntax.h"
-
#ifndef ALIGN
# define ALIGN(n) .p2align n
#endif
+ENTRY (__mempcpy_sse2_unaligned)
+ movq %rdi, %rax
+ addq %rdx, %rax
+ lea (%rdi, %rdx), %r10
+ lea (%rsi, %rdx), %r11
+ cmpq $16, %rdx
+ ja L(loop_start)
+ jmp L(from_mempcpy)
+END (__mempcpy_sse2_unaligned)
ENTRY(__memcpy_sse2_unaligned)
- movq %rsi, %rax
- leaq (%rdx,%rdx), %rcx
- subq %rdi, %rax
- subq %rdx, %rax
- cmpq %rcx, %rax
- jb L(overlapping)
+ movq %rdi, %rax
+ lea (%rdi, %rdx), %r10
+ lea (%rsi, %rdx), %r11
cmpq $16, %rdx
- jbe L(less_16)
+ ja L(loop_start)
+L(from_mempcpy):
+ cmp $8, %edx
+ jae L(between_8_16_bytes)
+ cmp $4, %edx
+ jae L(between_4_7_bytes)
+ cmp $1, %edx
+ jbe L(between_0_1_bytes)
+ movzwl -2(%r11), %ecx
+ movzwl (%rsi), %esi /* is movb better? */
+ movw %si, (%rdi)
+ movw %cx, -2(%r10)
+ ret
+
+L(between_0_1_bytes):
+ jne L(between_0_0_bytes)
+ movzbl (%rsi), %edx
+ movb %dl, (%rdi)
+L(between_0_0_bytes):
+ ret
+
+ALIGN(4)
+L(between_8_16_bytes):
+ movq -8(%r11), %rcx
+ movq (%rsi), %rsi
+ movq %rsi, (%rdi)
+ movq %rcx, -8(%r10)
+ ret
+
+ALIGN(4)
+L(between_4_7_bytes):
+ movl -4(%r11), %ecx
+ movl (%rsi), %esi
+ movl %esi, (%rdi)
+ movl %ecx, -4(%r10)
+ ret
+
+ALIGN(3)
+L(loop_start):
+ cmp $80, %rdx
+ jb L(between_16_80_bytes)
+#ifdef FOR_SILVERMONT
+ cmpq $4096, %rdx
+ jae L(rep_loop)
+#endif
+ movdqu -16(%r11), %xmm4
+ movdqu -32(%r11), %xmm5
+ movdqu -48(%r11), %xmm6
+ movdqu -64(%r11), %xmm7
movdqu (%rsi), %xmm8
- cmpq $32, %rdx
+
+ movq %rdi, %rcx
+ subq %rsi, %rcx
+ cmpq %rdx, %rcx
+ jb L(bwd)
+
+ leaq 16(%rdi), %rdx
+ andq $-16, %rdx
+ movq %rdx, %rcx
+ subq %rdi, %rcx
+ lea (%rcx, %rsi), %rsi
+ mov %r10, %rcx
+ subq %rdx, %rcx
+ shrq $6, %rcx
+L(loop):
+ movdqu (%rsi), %xmm0
+ movdqu 16(%rsi), %xmm1
+ movdqu 32(%rsi), %xmm2
+ movdqu 48(%rsi), %xmm3
+ movdqa %xmm0, (%rdx)
+ addq $64, %rsi
+ movdqa %xmm1, 16(%rdx)
+ movdqa %xmm2, 32(%rdx)
+ movdqa %xmm3, 48(%rdx)
+ addq $64, %rdx
+ sub $1, %rcx
+ jnz L(loop)
movdqu %xmm8, (%rdi)
- movdqu -16(%rsi,%rdx), %xmm8
- movdqu %xmm8, -16(%rdi,%rdx)
- ja .L31
-L(return):
- movq %rdi, %rax
+ movdqu %xmm7, -64(%r10)
+ movdqu %xmm6, -48(%r10)
+ movdqu %xmm5, -32(%r10)
+ movdqu %xmm4, -16(%r10)
ret
- .p2align 4,,10
- ALIGN(4)
-.L31:
- movdqu 16(%rsi), %xmm8
- cmpq $64, %rdx
- movdqu %xmm8, 16(%rdi)
- movdqu -32(%rsi,%rdx), %xmm8
- movdqu %xmm8, -32(%rdi,%rdx)
- jbe L(return)
- movdqu 32(%rsi), %xmm8
- cmpq $128, %rdx
- movdqu %xmm8, 32(%rdi)
- movdqu -48(%rsi,%rdx), %xmm8
- movdqu %xmm8, -48(%rdi,%rdx)
- movdqu 48(%rsi), %xmm8
- movdqu %xmm8, 48(%rdi)
- movdqu -64(%rsi,%rdx), %xmm8
- movdqu %xmm8, -64(%rdi,%rdx)
- jbe L(return)
- leaq 64(%rdi), %rcx
- addq %rdi, %rdx
- andq $-64, %rdx
- andq $-64, %rcx
- movq %rcx, %rax
- subq %rdi, %rax
- addq %rax, %rsi
+
+#ifdef FOR_SILVERMONT
+L(rep_loop):
+ movq %rdi, %rcx
+ subq %rsi, %rcx
cmpq %rdx, %rcx
- je L(return)
- movq %rsi, %r10
- subq %rcx, %r10
- leaq 16(%r10), %r9
- leaq 32(%r10), %r8
- leaq 48(%r10), %rax
- .p2align 4,,10
- ALIGN(4)
-L(loop):
- movdqu (%rcx,%r10), %xmm8
- movdqa %xmm8, (%rcx)
- movdqu (%rcx,%r9), %xmm8
- movdqa %xmm8, 16(%rcx)
- movdqu (%rcx,%r8), %xmm8
- movdqa %xmm8, 32(%rcx)
- movdqu (%rcx,%rax), %xmm8
- movdqa %xmm8, 48(%rcx)
- addq $64, %rcx
- cmpq %rcx, %rdx
- jne L(loop)
- jmp L(return)
-L(overlapping):
- cmpq %rsi, %rdi
- jae .L3
- testq %rdx, %rdx
- .p2align 4,,5
- je L(return)
- movq %rdx, %r9
- leaq 16(%rsi), %rcx
- leaq 16(%rdi), %r8
- shrq $4, %r9
- movq %r9, %rax
- salq $4, %rax
- cmpq %rcx, %rdi
- setae %cl
- cmpq %r8, %rsi
- setae %r8b
- orl %r8d, %ecx
- cmpq $15, %rdx
- seta %r8b
- testb %r8b, %cl
- je .L16
- testq %rax, %rax
- je .L16
- xorl %ecx, %ecx
- xorl %r8d, %r8d
-.L7:
- movdqu (%rsi,%rcx), %xmm8
- addq $1, %r8
- movdqu %xmm8, (%rdi,%rcx)
- addq $16, %rcx
- cmpq %r8, %r9
- ja .L7
- cmpq %rax, %rdx
- je L(return)
-.L21:
- movzbl (%rsi,%rax), %ecx
- movb %cl, (%rdi,%rax)
- addq $1, %rax
- cmpq %rax, %rdx
- ja .L21
- jmp L(return)
-L(less_16):
- testb $24, %dl
- jne L(between_9_16)
- testb $4, %dl
- .p2align 4,,5
- jne L(between_5_8)
- testq %rdx, %rdx
- .p2align 4,,2
- je L(return)
- movzbl (%rsi), %eax
- testb $2, %dl
- movb %al, (%rdi)
- je L(return)
- movzwl -2(%rsi,%rdx), %eax
- movw %ax, -2(%rdi,%rdx)
- jmp L(return)
-.L3:
- leaq -1(%rdx), %rax
- .p2align 4,,10
+ jb L(bwd_rep)
+ movq -8(%rsi, %rdx), %r8
+ addq %rdi, %rdx
+ movq (%rsi), %r10
+ movq %rdi, %r11
+
+ addq $8, %rdi
+ andq $-8, %rdi
+ addq %rdi, %rsi
+ subq %r11, %rsi
+
+ movq %rdx, %rcx
+ subq %rdi, %rcx
+ shrq $3, %rcx
+ rep ; movsq
+ movq %r8, -8(%rdx)
+ movq %r10, (%r11)
+ ret
+
+L(bwd_rep):
+ movdqu -16(%r11), %xmm4
+ movdqu -32(%r11), %xmm5
+ movdqu -48(%r11), %xmm6
+ movdqu -64(%r11), %xmm7
+ movdqu (%rsi), %xmm8
+ jmp L(bwd)
+#endif
+
+L(between_16_80_bytes):
+ cmp $48, %edx
+ jae L(between_48_80_bytes)
+ subq $16, %rdx
+ shrq $1, %rdx
+ movdqu (%rsi), %xmm1
+ movdqu (%rsi, %rdx), %xmm2
+ movdqu -16(%r11), %xmm0
+ movdqu %xmm1, (%rdi)
+ movdqu %xmm2, (%rdi, %rdx)
+ movdqu %xmm0, -16(%r10)
+ ret
+
ALIGN(4)
-.L11:
- movzbl (%rsi,%rax), %edx
- movb %dl, (%rdi,%rax)
- subq $1, %rax
- jmp .L11
-L(between_9_16):
- movq (%rsi), %rax
- movq %rax, (%rdi)
- movq -8(%rsi,%rdx), %rax
- movq %rax, -8(%rdi,%rdx)
- jmp L(return)
-.L16:
- xorl %eax, %eax
- jmp .L21
-L(between_5_8):
- movl (%rsi), %eax
- movl %eax, (%rdi)
- movl -4(%rsi,%rdx), %eax
- movl %eax, -4(%rdi,%rdx)
- jmp L(return)
+L(between_48_80_bytes):
+ movdqu (%rsi), %xmm7
+ movdqu -32(%r11), %xmm4
+ movdqu 16(%rsi), %xmm5
+ movdqu -16(%r11), %xmm6
+ movdqu 32(%rsi), %xmm3
+ movdqu %xmm7, (%rdi)
+ movdqu %xmm4, -32(%r10)
+ movdqu %xmm5, 16(%rdi)
+ movdqu %xmm6, -16(%r10)
+ movdqu %xmm3, 32(%rdi)
+ ret
+
+L(bwd):
+ leaq 16(%rdi), %rdx
+ andq $-16, %rdx
+ movq %rdx, %rcx
+ subq %rdi, %rcx
+ lea (%rcx, %rsi), %rsi
+ mov %r10, %rcx
+ subq %rdx, %rcx
+ shrq $6, %rcx
+
+ movq %rcx, %r9
+ shlq $6, %r9
+ subq $64, %r9
+ addq %r9, %rsi
+ addq %r9, %rdx
+L(bwd_loop):
+ movdqu 48(%rsi), %xmm3
+ movdqu 32(%rsi), %xmm2
+ movdqu 16(%rsi), %xmm1
+ movdqu (%rsi), %xmm0
+ movdqa %xmm3, 48(%rdx)
+ movdqa %xmm2, 32(%rdx)
+ movdqa %xmm1, 16(%rdx)
+ movdqa %xmm0, (%rdx)
+ subq $64, %rdx
+ subq $64, %rsi
+ sub $1, %rcx
+ jnz L(bwd_loop)
+ movdqu %xmm8, (%rdi)
+ movdqu %xmm4, -16(%r10)
+ movdqu %xmm5, -32(%r10)
+ movdqu %xmm6, -48(%r10)
+ movdqu %xmm7, -64(%r10)
+ ret
END(__memcpy_sse2_unaligned)
+
+strong_alias(__memcpy_sse2_unaligned,__memmove_sse2_unaligned)
diff --git a/sysdeps/x86_64/multiarch/memcpy.S b/sysdeps/x86_64/multiarch/memcpy.S
index a1e5031..0159159 100644
--- a/sysdeps/x86_64/multiarch/memcpy.S
+++ b/sysdeps/x86_64/multiarch/memcpy.S
@@ -35,6 +35,9 @@ ENTRY(__new_memcpy)
1: leaq __memcpy_sse2(%rip), %rax
testl $bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
jnz 2f
+ leaq __memcpy_sse2_silvermont(%rip), %rax
+ testl $bit_Slow_SSE4_2, __cpu_features+CPUID_OFFSET+index_Slow_SSE4_2(%rip)
+ jnz 3f
leaq __memcpy_sse2_unaligned(%rip), %rax
ret
2: testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
diff --git a/sysdeps/x86_64/multiarch/memmove.c b/sysdeps/x86_64/multiarch/memmove.c
index 8149c48..7a3d685 100644
--- a/sysdeps/x86_64/multiarch/memmove.c
+++ b/sysdeps/x86_64/multiarch/memmove.c
@@ -33,6 +33,8 @@
# undef memmove
extern __typeof (__redirect_memmove) __memmove_sse2 attribute_hidden;
+extern __typeof (__redirect_memmove) __memmove_sse2_unaligned attribute_hidden;
+extern __typeof (__redirect_memmove) __memmove_sse2_silvermont attribute_hidden;
extern __typeof (__redirect_memmove) __memmove_ssse3 attribute_hidden;
extern __typeof (__redirect_memmove) __memmove_ssse3_back attribute_hidden;
#endif
@@ -47,10 +49,12 @@ extern __typeof (__redirect_memmove) __memmove_ssse3_back attribute_hidden;
ifunc symbol properly. */
extern __typeof (__redirect_memmove) __libc_memmove;
libc_ifunc (__libc_memmove,
- HAS_SSSE3
- ? (HAS_FAST_COPY_BACKWARD
- ? __memmove_ssse3_back : __memmove_ssse3)
- : __memmove_sse2)
+ HAS_SLOW_SSE4_2 ? __memmove_sse2_silvermont :
+ ( HAS_FAST_UNALIGNED_LOAD ? __memmove_sse2_unaligned :
+ ( HAS_SSSE3
+ ? ( HAS_FAST_COPY_BACKWARD
+ ? __memmove_ssse3_back : __memmove_ssse3)
+ : __memmove_sse2)))
strong_alias (__libc_memmove, memmove)
diff --git a/sysdeps/x86_64/multiarch/mempcpy.S b/sysdeps/x86_64/multiarch/mempcpy.S
index b8b7fcd..f26318b 100644
--- a/sysdeps/x86_64/multiarch/mempcpy.S
+++ b/sysdeps/x86_64/multiarch/mempcpy.S
@@ -31,7 +31,14 @@ ENTRY(__mempcpy)
jne 1f
call __init_cpu_features
1: leaq __mempcpy_sse2(%rip), %rax
- testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+ testl $bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip)
+ jz 3f
+ leaq __mempcpy_sse2_silvermont(%rip), %rax
+ testl $bit_Slow_SSE4_2, __cpu_features+CPUID_OFFSET+index_Slow_SSE4_2(%rip)
+ jnz 2f
+ leaq __mempcpy_sse2_unaligned(%rip), %rax
+ ret
+3: testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
jz 2f
leaq __mempcpy_ssse3(%rip), %rax
testl $bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
--
1.7.10.4