[RFC] Improve strcpy: Faster ssse3 version.
- From: Ondřej Bílka <neleai@seznam.cz>
- To: Carlos O'Donell <carlos@redhat.com>
- Cc: Andreas Schwab <schwab@linux-m68k.org>, libc-alpha@sourceware.org
- Date: Tue, 10 Sep 2013 17:19:48 +0200
- Subject: [RFC] Improve strcpy: Faster ssse3 version.
- References: <20130909153051.GA23047@domone.kolej.mff.cuni.cz> <20130909161112.GB23047@domone.kolej.mff.cuni.cz> <mvmbo42dkiq.fsf@hawking.suse.de> <20130909171703.GA32141@domone.kolej.mff.cuni.cz> <87ob81c1yk.fsf@igel.home> <20130909191829.GA997@domone.kolej.mff.cuni.cz> <522E28E9.5000709@redhat.com> <20130910142117.GB6536@domone.kolej.mff.cuni.cz>
Hi,
I also wrote an SSSE3 version with the same optimized header.
On Core 2 and Xeon it performs about the same as the unaligned-load version for small inputs
and is slightly faster than the current SSSE3 implementation for large inputs.
http://kam.mff.cuni.cz/~ondra/benchmark_string/core2/strcpy_profile/results_rand/result.html
These factors make this implementation 20% faster when profiling in
block mode. The inputs there are a bit atypical, as most of the time is
spent in bash, which copies quite large strings; that makes the SSSE3
version faster than the unaligned one.
http://kam.mff.cuni.cz/~ondra/benchmark_string/core2/strcpy_profile/results_gcc/result.html
The change is in the loop and in the code that sets the loop up and cleans it up, so what is the best way to add this?
There could also be a third implementation: mechanically replacing palignr
with shifts avoids SSSE3 entirely. How should I incorporate that?
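To make that concrete, here is a rough sketch (not part of the patch) of how
one palignr from the loop below could be replaced by SSE2 shifts; using
%xmm9 as scratch is my assumption, any free register would do:

	/* SSE2 equivalent of "palignr $1, %xmm3, %xmm4".  */
	movdqa	%xmm3, %xmm9	/* copy the older chunk to scratch  */
	psrldq	$1, %xmm9	/* its top 15 bytes -> low 15 bytes of result  */
	pslldq	$15, %xmm4	/* low byte of %xmm4 -> top byte of result  */
	por	%xmm9, %xmm4	/* %xmm4 = low 16 bytes of (%xmm4:%xmm3) >> 1 byte  */

The same pattern, with the two shift counts summing to 16, covers the other
fifteen shift values.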
Currently I use separate files which are almost identical; the diff is below.
Comments?
--- sysdeps/x86_64/multiarch/strcpy-sse2-unaligned-v2.S 2013-09-10 16:44:21.486453410 +0200
+++ sysdeps/x86_64/multiarch/strcpy-ssse3-v2.S 2013-09-10 16:53:33.836485107 +0200
@@ -1,4 +1,4 @@
-/* strcpy with SSE2 and unaligned load
+/* strcpy with SSSE3
Copyright (C) 2013 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -21,22 +21,21 @@
# include <sysdep.h>
# ifndef STRCPY
-# define STRCPY __strcpy_sse2_unaligned
+# define STRCPY __strcpy_ssse3
# endif
# define ALIGN(x) .p2align x
#ifndef USE_AS_STPCPY
-ENTRY (__strcpy_sse2_unaligned_tail)
+ENTRY (__strcpy_ssse3_tail)
movq %rsi, %rdx
pxor %xmm4, %xmm4
jmp L(from_tail)
-END (__strcpy_sse2_unaligned_tail)
+END (__strcpy_ssse3_tail)
#endif
ENTRY (STRCPY)
movq %rsi, %rdx
pxor %xmm4, %xmm4
movq %rdi, %rax
@@ -46,6 +45,7 @@
pxor %xmm6, %xmm6
cmpq $4032, %rdx
ja L(cross_page)
+L(from_next_64_bytes):
movdqu (%rsi), %xmm1
pxor %xmm7, %xmm7
movdqu 16(%rsi), %xmm2
@@ -74,20 +74,102 @@
leaq 64(%rsi), %rdx
andq $-64, %rdx
addq %rdx, %rdi
- pxor %xmm5, %xmm5
+ pxor %xmm4, %xmm4
subq %rsi, %rdi
movq %rdx, %rsi
- jmp L(loop_entry)
- ALIGN (4)
-L(loop):
+ /* We need to read an additional 64 bytes to be sure that the loop can
+ write before the %rsi address. */
+ movdqa (%rsi), %xmm1
+ movdqa 16(%rsi), %xmm2
+ movdqa %xmm1, %xmm0
+ movdqa 32(%rsi), %xmm3
+ pminub %xmm2, %xmm0
+ movdqa 48(%rsi), %xmm8
+ pminub %xmm3, %xmm0
+ pminub %xmm8, %xmm0
+ pcmpeqb %xmm4, %xmm0
+ pmovmskb %xmm0, %edx
+ testl %edx, %edx
+ jne L(from_next_64_bytes)
movdqu %xmm1, (%rdi)
- addq $64, %rsi
movdqu %xmm2, 16(%rdi)
movdqu %xmm3, 32(%rdi)
- movdqu %xmm4, 48(%rdi)
+ movdqu %xmm8, 48(%rdi)
+ addq $64, %rsi
+ addq $64, %rdi
+
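+ /* Dispatch into one of the sixteen loop copies below based on the
+    new destination alignment: the copies are laid out 128 bytes apart,
+    so the entry point is L(entry0) - (%rdi & 15) * 128.  */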
+ movq %rdi, %r11 /* Compute jump address. */
+ andl $15, %r11d
+ subq %r11, %rdi
+ addq $16, %rdi
+ leaq L(entry0)(%rip), %rdx
+ movdqu -16(%rsi), %xmm0
+ movq %r11, %rcx
+ imul $128, %rcx
+ subq %rcx, %rdx
+ jmp *%rdx
+
+
+#define LOOP(shift, align) \
+ ALIGN (4) ;\
+L(loop##align): ;\
+ movdqa %xmm1, -16(%rdi) ;\
+ addq $64, %rsi ;\
+ movdqa %xmm2, (%rdi) ;\
+ movdqa %xmm6, %xmm0 ;\
+ movdqa %xmm3, 16(%rdi) ;\
+ movdqa %xmm4, 32(%rdi) ;\
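+ /* The bare "rep" below is apparently a one-byte pad keeping each LOOP copy at exactly 128 bytes.  */ ;\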
+ rep ; \
+ addq $64, %rdi ;\
+L(entry##align): ;\
+ movdqa (%rsi), %xmm5 ;\
+ movdqa %xmm5, %xmm1 ;\
+ movdqa 16(%rsi), %xmm2 ;\
+ pminub %xmm2, %xmm5 ;\
+ movdqa 32(%rsi), %xmm3 ;\
+ pminub %xmm3, %xmm5 ;\
+ movdqa 48(%rsi), %xmm4 ;\
+ pminub %xmm4, %xmm5 ;\
+ movdqa %xmm4, %xmm6 ;\
+ pcmpeqb %xmm7, %xmm5 ;\
+ pmovmskb %xmm5, %edx ;\
+ testl %edx, %edx ;\
+ jne L(return) ;\
+ palignr shift, %xmm3, %xmm4 ;\
+ palignr shift, %xmm2, %xmm3 ;\
+ palignr shift, %xmm1, %xmm2 ;\
+ palignr shift, %xmm0, %xmm1 ;\
+ jmp L(loop##align)
+
+
+ LOOP($1,15)
+ LOOP($2,14)
+ LOOP($3,13)
+ LOOP($4,12)
+ LOOP($5,11)
+ LOOP($6,10)
+ LOOP($7,9)
+ LOOP($8,8)
+ LOOP($9,7)
+ LOOP($10,6)
+ LOOP($11,5)
+ LOOP($12,4)
+ LOOP($13,3)
+ LOOP($14,2)
+ LOOP($15,1)
+
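+ /* The test below is unreachable; with the surrounding ALIGNs it
+    appears to pad the code so that L(entry0) falls in its 128-byte slot.  */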
+ ALIGN (4)
+ test %edx, %edx
+ ALIGN (4)
+L(loop0):
+ movdqa %xmm1, -16(%rdi)
+ addq $64, %rsi
+ movdqa %xmm2, (%rdi)
+ movdqa %xmm3, 16(%rdi)
+ movdqa %xmm4, 32(%rdi)
addq $64, %rdi
-L(loop_entry):
+ ALIGN (4)
+L(entry0):
movdqa 32(%rsi), %xmm3
movdqa 48(%rsi), %xmm4
movdqa %xmm3, %xmm0
@@ -99,8 +181,23 @@
pcmpeqb %xmm5, %xmm0
pmovmskb %xmm0, %edx
testq %rdx, %rdx
- je L(loop)
+ je L(loop0)
+L(return):
+ /* Restore the registers to their pre-loop state. */
+ pxor %xmm4, %xmm4
+ pxor %xmm5, %xmm5
+ pxor %xmm6, %xmm6
+ addq %r11, %rdi
+ subq $16, %rdi
+ movdqu -16(%rsi), %xmm0
+ movdqu %xmm0, -16(%rdi)
+
+ movdqu (%rsi), %xmm1
+ movdqu 16(%rsi), %xmm2
+ movdqu 32(%rsi), %xmm3
+ movdqu 48(%rsi), %xmm4
+
+ pcmpeqb %xmm5, %xmm4
+ pmovmskb %xmm4, %edx
salq $48, %rdx
pcmpeqb %xmm1, %xmm5
pcmpeqb %xmm2, %xmm6