This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH 3/3] Split strcmp into header/loop parts


The third change consists of splitting the strcmp function into header and
loop parts. This header also makes strcmp faster on older machines, but in
rare cases when you encounter big inputs it might be slower.

I will use the same system as I proposed for strcpy, which is to have
multiple loop files: one to handle unaligned loads, one to use ssse3 to do
alignments, and one that just aligns by sse2 shifts.

Comments?


	* sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S: Move loop
	implementation ...
	* sysdeps/x86_64/multiarch/strcmp-sse2-unaligned-loop.S ... here.

---
 .../x86_64/multiarch/strcmp-sse2-unaligned-loop.S  | 80 ++++++++++++++++++++++
 sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S   | 73 ++++----------------
 2 files changed, 93 insertions(+), 60 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/strcmp-sse2-unaligned-loop.S

diff --git a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned-loop.S b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned-loop.S
new file mode 100644
index 0000000..80a78e5
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned-loop.S
@@ -0,0 +1,80 @@
+/* strcmp with unaligned loads - loop
+   Copyright (C) 2013 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+
+	ALIGN	(4)
+L(loop):
+	addq	$64, %rax
+	addq	$64, %rdx
+L(loop_start):
+	testq	%rsi, %rsi /* TODO sub $1, %rsi; je L(loop_cross_page) */
+	leaq	-1(%rsi), %rsi
+	je	L(loop_cross_page)
+L(back_to_loop):
+	movdqu	48(%rdx), %xmm6
+	movdqa	48(%rax), %xmm3
+	pcmpeqb	%xmm3, %xmm6
+	pminub	%xmm3, %xmm6
+
+	movdqu	(%rdx), %xmm0
+	movdqa	(%rax), %xmm2
+	pcmpeqb	%xmm2, %xmm0
+	pminub	%xmm2, %xmm0
+	pminub	%xmm0, %xmm6
+
+	movdqu	16(%rdx), %xmm1
+	movdqa	16(%rax), %xmm8
+	pcmpeqb	%xmm8, %xmm1
+	pminub	%xmm8, %xmm1
+	pminub	%xmm1, %xmm6
+
+	movdqu	32(%rdx), %xmm2
+	movdqa	32(%rax), %xmm5
+	pcmpeqb	%xmm5, %xmm2
+	pminub	%xmm5, %xmm2
+	pminub	%xmm2, %xmm6
+	
+	pcmpeqb	%xmm7, %xmm6
+	pmovmskb %xmm6, %ecx
+	testl	%ecx, %ecx
+	je	L(loop)
+	pcmpeqb	%xmm7, %xmm0
+	pcmpeqb	%xmm7, %xmm1
+	pcmpeqb	%xmm7, %xmm2
+
+	pmovmskb %xmm0, %r8d
+	pmovmskb %xmm1, %esi
+	pmovmskb %xmm2, %edi
+	salq	$48, %rcx
+	orq	%r8, %rcx
+	salq	$16, %rsi
+	salq	$32, %rdi
+	orq	%rdi, %rcx
+	orq	%rsi, %rcx
+#ifndef AS_STRCASECMP
+	bsfq	%rcx, %rcx
+	movzbl	(%rax, %rcx), %eax
+	movzbl	(%rdx, %rcx), %edx
+	subl	%edx, %eax
+	ret
+#else
+	movq	%rax, %rdi
+	movq	%rdx, %rsi
+	movq	%rcx, %rax
+	jmp	L(return)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
index 05f90f9..87d9064 100644
--- a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
@@ -179,68 +179,21 @@ L(casecnt3):
 
 #endif
 
+	/* Now we use the loop implementation; the entry point is L(loop_start),
+	   where registers are set in the following way:
+	   xmm7 - set to 0
+	   rax - a, aligned to 64 bytes
+	   rdx - b, not aligned
+	   rsi - number of iterations until rdx crosses a page boundary
+	   r11 - tolower array
 
-	ALIGN	(4)
-L(loop):
-	addq	$64, %rax
-	addq	$64, %rdx
-L(loop_start):
-	testq	%rsi, %rsi /* TODO sub $1, %rsi; je L(loop_cross_page) */
-	leaq	-1(%rsi), %rsi
-	je	L(loop_cross_page)
-L(back_to_loop):
-	movdqu	48(%rdx), %xmm6
-	movdqa	48(%rax), %xmm3
-	pcmpeqb	%xmm3, %xmm6
-	pminub	%xmm3, %xmm6
-
-	movdqu	(%rdx), %xmm0
-	movdqa	(%rax), %xmm2
-	pcmpeqb	%xmm2, %xmm0
-	pminub	%xmm2, %xmm0
-	pminub	%xmm0, %xmm6
-
-	movdqu	16(%rdx), %xmm1
-	movdqa	16(%rax), %xmm8
-	pcmpeqb	%xmm8, %xmm1
-	pminub	%xmm8, %xmm1
-	pminub	%xmm1, %xmm6
+	   When rdx...rdx+64 crosses a page boundary we need to call
+	   L(loop_cross_page), which handles this case and returns control
+	   by jumping to L(back_to_loop).
+	   When a match is found, the implementation is required to jump to
+	   L(return).  */
 
-	movdqu	32(%rdx), %xmm2
-	movdqa	32(%rax), %xmm5
-	pcmpeqb	%xmm5, %xmm2
-	pminub	%xmm5, %xmm2
-	pminub	%xmm2, %xmm6
-	
-	pcmpeqb	%xmm7, %xmm6
-	pmovmskb %xmm6, %ecx
-	testl	%ecx, %ecx
-	je	L(loop)
-	pcmpeqb	%xmm7, %xmm0
-	pcmpeqb	%xmm7, %xmm1
-	pcmpeqb	%xmm7, %xmm2
-
-	pmovmskb %xmm0, %r8d
-	pmovmskb %xmm1, %esi
-	pmovmskb %xmm2, %edi
-	salq	$48, %rcx
-	orq	%r8, %rcx
-	salq	$16, %rsi
-	salq	$32, %rdi
-	orq	%rdi, %rcx
-	orq	%rsi, %rcx
-#ifndef AS_STRCASECMP
-	bsfq	%rcx, %rcx
-	movzbl	(%rax, %rcx), %eax
-	movzbl	(%rdx, %rcx), %edx
-	subl	%edx, %eax
-	ret
-#else
-	movq	%rax, %rdi
-	movq	%rdx, %rsi
-	movq	%rcx, %rax
-	jmp	L(return)
-#endif
+#include "strcmp-sse2-unaligned-loop.S"
 
 	ALIGN (4)
 L(loop_cross_page):
-- 
1.8.3.2


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]