
[RFC] Improve strcpy: Faster ssse3 version.


Hi, 

I also wrote an SSSE3 version with the same optimized header.

On Core 2 and Xeon it has performance similar to the unaligned-load version for small inputs
and is slightly faster than the current SSSE3 implementation for large inputs.
http://kam.mff.cuni.cz/~ondra/benchmark_string/core2/strcpy_profile/results_rand/result.html
These factors make this implementation 20% faster in block-mode profiling. The
inputs there are a bit atypical, as most of the time is spent in bash, which
copies quite large strings; that makes the SSSE3 version faster than the
unaligned one.
http://kam.mff.cuni.cz/~ondra/benchmark_string/core2/strcpy_profile/results_gcc/result.html

The change is in the loop and in the code that sets up and cleans up the loop, so what is the best way to add this?
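
Roughly, one instance of the new loop does the following (a sketch in C intrinsics,
simplified to 16 bytes per iteration instead of 64; the names, the fixed SHIFT and
the driver are only for illustration and not part of the patch, compile with gcc -mssse3):

#include <stdio.h>
#include <string.h>
#include <tmmintrin.h>   /* SSSE3: _mm_alignr_epi8 (palignr) */

#define SHIFT 3   /* example: source is SHIFT bytes past its aligned base */

/* Copy the string at s (with s % 16 == SHIFT) to 16-byte-aligned d.
   Precondition (in the patch this holds because the header code copies
   and checks the first 64 bytes before entering the loop): the first
   16 - SHIFT bytes of s contain no NUL.  */
static void
copy_realign (char *d, const char *s)
{
  const char *sa = s - SHIFT;   /* aligned source base; the load below
                                   stays inside the 16-byte block of s */
  __m128i prev = _mm_load_si128 ((const __m128i *) sa);
  for (;;)
    {
      __m128i next = _mm_load_si128 ((const __m128i *) (sa + 16));
      /* pcmpeqb/pmovmskb: stop before storing once the next aligned
         chunk contains the terminating NUL (the real loop folds four
         chunks with pminub before the single test).  */
      if (_mm_movemask_epi8 (_mm_cmpeq_epi8 (next, _mm_setzero_si128 ())))
        break;
      /* palignr: prev[SHIFT..15] followed by next[0..SHIFT-1], i.e. the
         next 16 string bytes, written with an aligned store.  */
      _mm_store_si128 ((__m128i *) d, _mm_alignr_epi8 (next, prev, SHIFT));
      prev = next;
      d += 16;
      sa += 16;
    }
  /* Scalar tail; the patch instead redoes the last chunks with
     unaligned loads/stores in L(return).  */
  const char *p = sa + SHIFT;
  while ((*d++ = *p++) != '\0')
    ;
}

int
main (void)
{
  static char src[64] __attribute__ ((aligned (16)));
  static char dst[64] __attribute__ ((aligned (16)));
  strcpy (src + SHIFT, "hello from the realigning copy loop");
  copy_realign (dst, src + SHIFT);
  puts (dst);
  return 0;
}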

There could also be a third implementation: mechanically replacing palignr
with shifts avoids SSSE3 entirely. How should I incorporate that?
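
The replacement works because palignr is just a byte-granularity funnel shift:
_mm_alignr_epi8 (hi, lo, N) produces the same bytes as OR-ing lo shifted right by
N bytes with hi shifted left by 16 - N bytes, and psrldq/pslldq/por are all plain
SSE2. A small sketch checking the equivalence (again only my illustration with a
fixed example shift; built with gcc -mssse3 so both variants can be compared):

#include <stdio.h>
#include <string.h>
#include <emmintrin.h>   /* SSE2: psrldq, pslldq, por */
#include <tmmintrin.h>   /* SSSE3: palignr, only for the comparison */

#define N 5   /* example shift; must be a compile-time constant, 1..15 */

/* SSSE3: low 16 bytes of the 32-byte value hi:lo shifted right by N bytes.  */
static __m128i
align_ssse3 (__m128i hi, __m128i lo)
{
  return _mm_alignr_epi8 (hi, lo, N);
}

/* SSE2-only replacement: shift lo down, hi up, then OR them together.  */
static __m128i
align_sse2 (__m128i hi, __m128i lo)
{
  return _mm_or_si128 (_mm_srli_si128 (lo, N),
                       _mm_slli_si128 (hi, 16 - N));
}

int
main (void)
{
  unsigned char buf[32];
  for (int i = 0; i < 32; i++)
    buf[i] = i;
  __m128i lo = _mm_loadu_si128 ((const __m128i *) buf);
  __m128i hi = _mm_loadu_si128 ((const __m128i *) (buf + 16));
  __m128i a = align_ssse3 (hi, lo);
  __m128i b = align_sse2 (hi, lo);
  puts (memcmp (&a, &b, 16) == 0 ? "match" : "mismatch");
  return 0;
}

In assembly the replacement also costs an extra movdqa and a scratch register per
palignr, since both byte shifts operate in place.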

Currently I use separate files that are almost identical; the diff is below.

Comments?

--- sysdeps/x86_64/multiarch/strcpy-sse2-unaligned-v2.S	2013-09-10 16:44:21.486453410 +0200
+++ sysdeps/x86_64/multiarch/strcpy-ssse3-v2.S	2013-09-10 16:53:33.836485107 +0200
@@ -1,4 +1,4 @@
-/* strcpy with SSE2 and unaligned load
+/* strcpy with SSSE3
    Copyright (C) 2013 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -21,22 +21,21 @@
 # include <sysdep.h>
 
 #  ifndef STRCPY
-#   define STRCPY  __strcpy_sse2_unaligned
+#  define STRCPY  __strcpy_ssse3
 #  endif
 
 # define ALIGN(x) .p2align x
 
 #ifndef USE_AS_STPCPY
-ENTRY (__strcpy_sse2_unaligned_tail)
+ENTRY (__strcpy_ssse3_tail)
 	movq  %rsi, %rdx
 	pxor  %xmm4, %xmm4
 	jmp L(from_tail)
-END (__strcpy_sse2_unaligned_tail)
+END (__strcpy_ssse3_tail)
 #endif
 
 ENTRY (STRCPY)
 	movq	%rsi, %rdx
 	pxor	%xmm4, %xmm4
 	movq	%rdi, %rax
@@ -46,6 +45,7 @@
 	pxor	%xmm6, %xmm6
 	cmpq	$4032, %rdx
 	ja	L(cross_page)
+L(from_next_64_bytes):
 	movdqu	(%rsi), %xmm1
 	pxor	%xmm7, %xmm7
 	movdqu	16(%rsi), %xmm2
@@ -74,20 +74,102 @@
 	leaq	64(%rsi), %rdx
 	andq	$-64, %rdx
 	addq	%rdx, %rdi
-	pxor	%xmm5, %xmm5
+	pxor	%xmm4, %xmm4
 	subq	%rsi, %rdi
 	movq	%rdx, %rsi
-	jmp	L(loop_entry)
 	
-	ALIGN (4)
-L(loop):
+	/* We need to read additional 64 bytes to be sure that loop can
+	   write before %rsi address.  */
+	movdqa	(%rsi), %xmm1
+	movdqa	16(%rsi), %xmm2
+	movdqa	%xmm1, %xmm0
+	movdqa	32(%rsi), %xmm3
+	pminub	%xmm2, %xmm0
+	movdqa	48(%rsi), %xmm8
+	pminub	%xmm3, %xmm0
+	pminub	%xmm8, %xmm0
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb	%xmm0, %edx
+	testl	%edx, %edx
+	jne	L(from_next_64_bytes)
 	movdqu	%xmm1, (%rdi)
-	addq	$64, %rsi
 	movdqu	%xmm2, 16(%rdi)
 	movdqu	%xmm3, 32(%rdi)
-	movdqu	%xmm4, 48(%rdi)
+	movdqu	%xmm8, 48(%rdi)
+	addq	$64, %rsi
+	addq	$64, %rdi
+
+	movq	%rdi, %r11 /* Compute jump address.  */
+	andl	$15, %r11d
+	subq	%r11, %rdi
+	addq	$16, %rdi
+	leaq	L(entry0)(%rip), %rdx
+	movdqu	-16(%rsi), %xmm0
+	movq	%r11, %rcx
+	imul	$128, %rcx
+	subq	%rcx, %rdx
+	jmp	*%rdx
+
+
+#define LOOP(shift, align) \
+	ALIGN (4) ;\
+L(loop##align): ;\
+	movdqa	%xmm1, -16(%rdi) ;\
+	addq	$64, %rsi ;\
+	movdqa	%xmm2, (%rdi) ;\
+	movdqa	%xmm6, %xmm0 ;\
+	movdqa	%xmm3, 16(%rdi) ;\
+	movdqa	%xmm4, 32(%rdi) ;\
+	rep ; \
+	addq	$64, %rdi ;\
+L(entry##align): ;\
+	movdqa	(%rsi), %xmm5 ;\
+	movdqa	%xmm5, %xmm1 ;\
+	movdqa	16(%rsi), %xmm2 ;\
+	pminub	%xmm2, %xmm5 ;\
+	movdqa	32(%rsi), %xmm3 ;\
+	pminub	%xmm3, %xmm5 ;\
+	movdqa	48(%rsi), %xmm4 ;\
+	pminub	%xmm4, %xmm5 ;\
+	movdqa	%xmm4, %xmm6 ;\
+	pcmpeqb	%xmm7, %xmm5 ;\
+	pmovmskb	%xmm5, %edx ;\
+	testl	%edx, %edx ;\
+	jne	L(return) ;\
+	palignr	shift, %xmm3, %xmm4 ;\
+	palignr	shift, %xmm2, %xmm3 ;\
+	palignr	shift, %xmm1, %xmm2 ;\
+	palignr	shift, %xmm0, %xmm1 ;\
+	jmp	L(loop##align)
+
+
+	LOOP($1,15)
+	LOOP($2,14)
+	LOOP($3,13)
+	LOOP($4,12)
+	LOOP($5,11)
+	LOOP($6,10)
+	LOOP($7,9)
+	LOOP($8,8)
+	LOOP($9,7)
+	LOOP($10,6)
+	LOOP($11,5)
+	LOOP($12,4)
+	LOOP($13,3)
+	LOOP($14,2)
+	LOOP($15,1)
+
+	ALIGN (4)
+	test	%edx, %edx
+	ALIGN (4)
+L(loop0):
+	movdqa	%xmm1, -16(%rdi)
+	addq	$64, %rsi
+	movdqa	%xmm2, (%rdi)
+	movdqa	%xmm3, 16(%rdi)
+	movdqa	%xmm4, 32(%rdi)
 	addq	$64, %rdi
-L(loop_entry):
+	ALIGN (4)
+L(entry0):
 	movdqa	32(%rsi), %xmm3
 	movdqa	48(%rsi), %xmm4
 	movdqa	%xmm3, %xmm0
@@ -99,8 +181,23 @@
 	pcmpeqb	%xmm5, %xmm0
 	pmovmskb	%xmm0, %edx
 	testq	%rdx, %rdx
-	je	L(loop)
+	je	L(loop0)
 
+L(return):
+	/* Restore to pre-loop state */
+	pxor	%xmm4, %xmm4 
+	pxor	%xmm5, %xmm5
+	pxor	%xmm6, %xmm6
+	addq	%r11, %rdi
+	subq	$16, %rdi
+	movdqu	-16(%rsi), %xmm0
+	movdqu	%xmm0, -16(%rdi)
+
+	movdqu	(%rsi), %xmm1
+	movdqu	16(%rsi), %xmm2
+	movdqu	32(%rsi), %xmm3
+	movdqu	48(%rsi), %xmm4
+
+	pcmpeqb	%xmm5, %xmm4
+	pmovmskb	%xmm4, %edx
 	salq	$48, %rdx
 	pcmpeqb	%xmm1, %xmm5
 	pcmpeqb	%xmm2, %xmm6

