[PATCH 3/3] Split strcmp to header/loop parts,
- From: Ondřej Bílka <neleai at seznam dot cz>
- To: libc-alpha at sourceware dot org
- Date: Mon, 16 Sep 2013 20:40:24 +0200
- Subject: [PATCH 3/3] Split strcmp to header/loop parts,
- Authentication-results: sourceware.org; auth=none
- References: <20130913200552 dot GA31992 at domone> <20130913205303 dot GA3620 at domone> <20130916123234 dot GA24928 at domone> <20130916131112 dot GA31508 at domone>
The third change consists of splitting the strcmp function into a header
part and a loop part. The header also makes strcmp faster on older
machines, but in rare cases, when you encounter big inputs, it might be
slower.
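To illustrate the split, here is a minimal C sketch with made-up names,
not the actual glibc code (a real header must also keep the unaligned
loads from running into an unmapped page, which is omitted here). The
header decides most short strings with a single 16-byte check; only long
inputs fall through to the loop, which this patch moves into its own file:

#include <emmintrin.h>

/* Return a bit mask of lanes, within the next 16 bytes, where a and
   b differ or a has a NUL; 0 means all 16 bytes match and none is
   NUL.  Uses the same pcmpeqb + pminub trick as the asm loop.  */
static unsigned
check16 (const char *a, const char *b)
{
  __m128i va = _mm_loadu_si128 ((const __m128i *) a);
  __m128i vb = _mm_loadu_si128 ((const __m128i *) b);
  __m128i eq = _mm_cmpeq_epi8 (va, vb);  /* 0xff where bytes match */
  /* min (eq, va) is 0 exactly where bytes differ or a byte of a is
     NUL, so one comparison against zero finds both events.  */
  __m128i hit = _mm_cmpeq_epi8 (_mm_min_epu8 (eq, va),
                                _mm_setzero_si128 ());
  return _mm_movemask_epi8 (hit);
}

static int
cmp_tail (const char *a, const char *b, unsigned mask)
{
  unsigned i = __builtin_ctz (mask);  /* first NUL or mismatch */
  return (unsigned char) a[i] - (unsigned char) b[i];
}

int
strcmp_sketch (const char *a, const char *b)
{
  /* Header part: one check decides most short strings.  */
  unsigned mask = check16 (a, b);
  if (mask != 0)
    return cmp_tail (a, b, mask);
  /* Loop part: only long inputs get here, so it can afford the
     heavier per-iteration setup the 64-byte asm loop uses.  */
  for (a += 16, b += 16;; a += 16, b += 16)
    {
      mask = check16 (a, b);
      if (mask != 0)
        return cmp_tail (a, b, mask);
    }
}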
I will use the same scheme that I proposed for strcpy, which is to have
multiple loop files: one that handles unaligned loads, one that uses
ssse3 to do the alignment, and one that just aligns by sse2 shifts.
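The mechanism is the one this patch already uses at the bottom of
strcmp-sse2-unaligned.S: the shared header simply #includes the loop
file, so each variant is built as one complete function with no indirect
jump in the hot path. A hedged sketch of how the header could pick among
the three proposed loops (the macro names and the ssse3/sse2-shift file
names are hypothetical; only the unaligned loop exists in this patch):

#if defined USE_STRCMP_LOOP_SSSE3
# include "strcmp-ssse3-loop.S"          /* realign with palignr */
#elif defined USE_STRCMP_LOOP_SSE2_SHIFT
# include "strcmp-sse2-shift-loop.S"     /* align by sse2 shifts */
#else
# include "strcmp-sse2-unaligned-loop.S" /* unaligned loads (this patch) */
#endif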
Comments?
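One note for reading the loop below: each iteration checks 64 bytes as
four 16-byte blocks and folds all four pminub results into xmm6, so the
no-hit path pays for a single test. Only after a hit are the per-block
masks recomputed and merged into one 64-bit mask by the salq/orq
sequence, and bsfq picks the first NUL or mismatch. A small C
illustration of the merge step (my names, not the glibc source):

#include <stdint.h>

/* The block at offset 16*i contributes bits 16*i .. 16*i+15;
   __builtin_ctzll (the counterpart of bsfq) then finds the first
   NUL or mismatch across all 64 bytes at once.  */
static int
first_hit (unsigned m0, unsigned m1, unsigned m2, unsigned m3)
{
  uint64_t mask = (uint64_t) m0
                  | ((uint64_t) m1 << 16)
                  | ((uint64_t) m2 << 32)
                  | ((uint64_t) m3 << 48);
  return __builtin_ctzll (mask);
}

(The asm actually reuses the folded xmm6 mask for the top 16 bits; any
stray bits there come from earlier blocks, which already set a lower
bit, so bsfq is unaffected.)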
* sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S: Move loop
implementation ...
* sysdeps/x86_64/multiarch/strcmp-sse2-unaligned-loop.S ... here.
---
.../x86_64/multiarch/strcmp-sse2-unaligned-loop.S | 80 ++++++++++++++++++++++
sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S | 73 ++++----------------
2 files changed, 93 insertions(+), 60 deletions(-)
create mode 100644 sysdeps/x86_64/multiarch/strcmp-sse2-unaligned-loop.S
diff --git a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned-loop.S b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned-loop.S
new file mode 100644
index 0000000..80a78e5
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned-loop.S
@@ -0,0 +1,80 @@
+/* strcmp with unaligned loads - loop
+ Copyright (C) 2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+
+ ALIGN (4)
+L(loop):
+ addq $64, %rax
+ addq $64, %rdx
+L(loop_start):
+ testq %rsi, %rsi /* TODO sub $1, %rsi; je L(loop_cross_page) */
+ leaq -1(%rsi), %rsi
+ je L(loop_cross_page)
+L(back_to_loop):
+ movdqu 48(%rdx), %xmm6
+ movdqa 48(%rax), %xmm3
+ pcmpeqb %xmm3, %xmm6 /* 0xff in lanes where a and b match.  */
+ pminub %xmm3, %xmm6 /* Now 0 where they differ or a has a NUL.  */
+
+ movdqu (%rdx), %xmm0
+ movdqa (%rax), %xmm2
+ pcmpeqb %xmm2, %xmm0
+ pminub %xmm2, %xmm0
+ pminub %xmm0, %xmm6
+
+ movdqu 16(%rdx), %xmm1
+ movdqa 16(%rax), %xmm8
+ pcmpeqb %xmm8, %xmm1
+ pminub %xmm8, %xmm1
+ pminub %xmm1, %xmm6
+
+ movdqu 32(%rdx), %xmm2
+ movdqa 32(%rax), %xmm5
+ pcmpeqb %xmm5, %xmm2
+ pminub %xmm5, %xmm2
+ pminub %xmm2, %xmm6
+
+ pcmpeqb %xmm7, %xmm6 /* xmm7 is 0: find the zeroed lanes.  */
+ pmovmskb %xmm6, %ecx
+ testl %ecx, %ecx /* No NUL or mismatch in these 64 bytes?  */
+ je L(loop)
+ pcmpeqb %xmm7, %xmm0
+ pcmpeqb %xmm7, %xmm1
+ pcmpeqb %xmm7, %xmm2
+
+ pmovmskb %xmm0, %r8d
+ pmovmskb %xmm1, %esi
+ pmovmskb %xmm2, %edi
+ salq $48, %rcx /* Merge per-block masks into one 64-bit mask; */
+ orq %r8, %rcx /* block 16*i lands in bits 16*i..16*i+15.  */
+ salq $16, %rsi
+ salq $32, %rdi
+ orq %rdi, %rcx
+ orq %rsi, %rcx
+#ifndef AS_STRCASECMP
+ bsfq %rcx, %rcx /* Index of the first NUL or mismatch.  */
+ movzbl (%rax, %rcx), %eax
+ movzbl (%rdx, %rcx), %edx
+ subl %edx, %eax /* Result is the byte difference.  */
+ ret
+#else
+ movq %rax, %rdi
+ movq %rdx, %rsi
+ movq %rcx, %rax
+ jmp L(return)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
index 05f90f9..87d9064 100644
--- a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
@@ -179,68 +179,21 @@ L(casecnt3):
#endif
+ /* Now we use the loop implementation; its entry point is
+    L(loop_start), where the registers are set up as follows:
+    xmm7 - set to 0
+    rax  - a, aligned to 64 bytes
+    rdx  - b, not aligned
+    rsi  - number of iterations until rdx crosses a page boundary
+    r11  - tolower array
- ALIGN (4)
-L(loop):
- addq $64, %rax
- addq $64, %rdx
-L(loop_start):
- testq %rsi, %rsi /* TODO sub $1, %rsi; je L(loop_cross_page) */
- leaq -1(%rsi), %rsi
- je L(loop_cross_page)
-L(back_to_loop):
- movdqu 48(%rdx), %xmm6
- movdqa 48(%rax), %xmm3
- pcmpeqb %xmm3, %xmm6
- pminub %xmm3, %xmm6
-
- movdqu (%rdx), %xmm0
- movdqa (%rax), %xmm2
- pcmpeqb %xmm2, %xmm0
- pminub %xmm2, %xmm0
- pminub %xmm0, %xmm6
-
- movdqu 16(%rdx), %xmm1
- movdqa 16(%rax), %xmm8
- pcmpeqb %xmm8, %xmm1
- pminub %xmm8, %xmm1
- pminub %xmm1, %xmm6
+    When rdx...rdx+64 crosses a page boundary, we need to call
+    L(loop_cross_page), which handles this case and returns control
+    by jumping to L(back_to_loop).  When a match is found, the
+    implementation is required to jump to L(return).
+ */
- movdqu 32(%rdx), %xmm2
- movdqa 32(%rax), %xmm5
- pcmpeqb %xmm5, %xmm2
- pminub %xmm5, %xmm2
- pminub %xmm2, %xmm6
-
- pcmpeqb %xmm7, %xmm6
- pmovmskb %xmm6, %ecx
- testl %ecx, %ecx
- je L(loop)
- pcmpeqb %xmm7, %xmm0
- pcmpeqb %xmm7, %xmm1
- pcmpeqb %xmm7, %xmm2
-
- pmovmskb %xmm0, %r8d
- pmovmskb %xmm1, %esi
- pmovmskb %xmm2, %edi
- salq $48, %rcx
- orq %r8, %rcx
- salq $16, %rsi
- salq $32, %rdi
- orq %rdi, %rcx
- orq %rsi, %rcx
-#ifndef AS_STRCASECMP
- bsfq %rcx, %rcx
- movzbl (%rax, %rcx), %eax
- movzbl (%rdx, %rcx), %edx
- subl %edx, %eax
- ret
-#else
- movq %rax, %rdi
- movq %rdx, %rsi
- movq %rcx, %rax
- jmp L(return)
-#endif
+#include "strcmp-sse2-unaligned-loop.S"
ALIGN (4)
L(loop_cross_page):
--
1.8.3.2