This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
Getting l->__ctype_tolower in assembly. unaligned strcmp
- From: Ondřej Bílka <neleai at seznam dot cz>
- To: libc-alpha at sourceware dot org
- Date: Sat, 14 Sep 2013 10:35:18 +0200
- Subject: Getting l->__ctype_tolower in assembly. unaligned strcmp
- Authentication-results: sourceware.org; auth=none
- References: <20130913200552 dot GA31992 at domone> <20130913205303 dot GA3620 at domone>
Hi, I have strcasecmp almost ready; I now just need to know how to get
__ctype_tolower table address in assembly. I could use workaround of
jumping from c implementation which is ugly.
strcasecmp_l_c(char *x,char *y, __locale_t l)
{
return strcasecmp_l_as(x,y,l->__ctype_tolower);
}
On Fri, Sep 13, 2013 at 10:53:03PM +0200, Ondřej Bílka wrote:
> If assumptions above are true in general then best course of action
> is to do minimal modifications in strcasecmp. A strcmp first produces
> bitmask of different characters, then returns difference of first
> characters:
>
> if (m) {
> return a[ffs (m)] - b[ffs (m)];
> }
>
> which I would replace by enumerating over all characters as it is likely
> we stop at second one.
>
> while (m) {
> if (tolower(a[ffs (m)]) != tolower(b[ffs (m)]))
> return tolower(a[ffs (m)]) - tolower(b[ffs (m)]);
> if (!a[ffs (m)]) return 0;
> m = m & (m-1);
> }
Which I now converted to assembly modulo getting tolower table to %r11.
---
sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S | 110 +++++++++++++++++++++++
1 file changed, 110 insertions(+)
diff --git a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
index eed8432..d3fc1f8 100644
--- a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
@@ -16,10 +16,29 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
+#ifdef AS_STRCASECMP
+# include "locale-defines.h"
+
+ENTRY (__strcasecmp_sse2_unaligned)
+ movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
+ mov %fs:(%rax),%RDX_LP
+
+ // XXX 5 byte should be before the function
+ /* 5-byte NOP. */
+ .byte 0x0f,0x1f,0x44,0x00,0x00
+END (__strcasecmp_sse2_unaligned)
+
+# define __strcmp_sse2_unaligned __strcasecmp_sse2_unaligned_l
+#endif
+
#include "sysdep.h"
#define ALIGN(x) .p2align x
ENTRY ( __strcmp_sse2_unaligned)
+
+#ifdef AS_STRCASECMP
+ mov /* l->__ctype_tolower */, %r11
+#endif
movl %edi, %eax
xorl %edx, %edx
pxor %xmm7, %xmm7
@@ -36,12 +55,16 @@ ENTRY ( __strcmp_sse2_unaligned)
pmovmskb %xmm0, %eax
testq %rax, %rax
je L(next_48_bytes)
+#ifndef AS_STRCASECMP
L(return):
bsfq %rax, %rdx
movzbl (%rdi, %rdx), %eax
movzbl (%rsi, %rdx), %edx
subl %edx, %eax
ret
+#else
+ jmp L(caseloop1)
+#endif
ALIGN (4)
L(next_48_bytes):
@@ -85,6 +108,74 @@ L(main_loop_header):
movq %rcx, %rsi
jmp L(loop_start)
+#ifdef AS_STRCASECMP
+L(caseloop1):
+ bsfq %rax, %rdx
+ leaq -1(%rax), %rcx
+ andq %rax, %rcx
+ movzbl (%rdi, %rdx), %eax
+ movzbl (%rsi, %rdx), %edx
+ testl %eax, %edx
+ je L(zero1)
+ movl (%r11, %rax, 4), %eax
+ movl (%r11, %rdx, 4), %edx
+ testl %edx, %eax
+ je L(casecnt1)
+L(zero1):
+ subl %edx, %eax
+ ret
+L(casecnt1):
+ testl %rcx, %rcx
+ je L(next_48_bytes)
+ movq %rcx, %rax
+ jmp L(caseloop1)
+
+L(return):
+L(caseloop2):
+ bsfq %rax, %rdx
+ leaq -1(%rax), %rcx
+ andq %rax, %rcx
+ movzbl (%rdi, %rdx), %eax
+ movzbl (%rsi, %rdx), %edx
+ testl %eax, %edx
+ je L(zero2)
+ movl (%r11, %rax, 4), %eax
+ movl (%r11, %rdx, 4), %edx
+ testl %edx, %eax
+ je L(casecnt2)
+L(zero2):
+ subl %edx, %eax
+ ret
+L(casecnt2):
+ testl %rcx, %rcx
+ je L(main_loop_header)
+ movq %rcx, %rax
+ jmp L(caseloop2)
+
+L(caseloop3):
+ bsfq %rax, %rdx
+ leaq -1(%rax), %rcx
+ andq %rax, %rcx
+ movzbl (%rdi, %rdx), %eax
+ movzbl (%rsi, %rdx), %edx
+ testl %eax, %edx
+ je L(zero3)
+ movl (%r11, %rax, 4), %eax
+ movl (%r11, %rdx, 4), %edx
+ testl %edx, %eax
+ je L(casecnt3)
+L(zero3):
+ subl %edx, %eax
+ ret
+L(casecnt3):
+ testl %rcx, %rcx
+ je L(back_to_loop)
+ movq %rcx, %rax
+ jmp L(caseloop3)
+
+#endif
+
+
ALIGN (4)
L(loop):
addq $64, %rax
@@ -135,11 +226,18 @@ L(back_to_loop):
orq %rdi, %rcx
salq $48, %rsi
orq %rsi, %rcx
+#ifndef AS_STRCASECMP
bsfq %rcx, %rcx
movzbl (%rax, %rcx), %eax
movzbl (%rdx, %rcx), %edx
subl %edx, %eax
ret
+#else
+ movq %rax, %rdi
+ movq %rdx, %rsi
+ movq %rcx, %rax
+ jmp L(return)
+#endif
ALIGN (4)
L(loop_cross_page):
@@ -185,11 +283,19 @@ L(loop_cross_page):
shrq %cl, %rdi
test %rdi, %rdi
je L(back_to_loop)
+#ifndef AS_STRCASECMP
bsfq %rdi, %rcx
movzbl (%rax, %rcx), %eax
movzbl (%rdx, %rcx), %edx
subl %edx, %eax
ret
+#else
+ movq %rdi, %rcx
+ movq %rax, %rdi
+ movq %rdx, %rsi
+ movq %rcx, %rax
+ jmp L(caseloop3)
+#endif
ALIGN (4)
L(cross_page_loop):
@@ -201,6 +307,10 @@ L(cross_page_loop):
L(cross_page):
movzbl (%rdi, %rdx), %eax
movzbl (%rsi, %rdx), %ecx
+#ifdef AS_STRCASECMP
+ movl (%r11, %rax, 4), %eax
+ movl (%r11, %rcx, 4), %ecx
+#endif
testb %al, %al
jne L(cross_page_loop)
xorl %eax, %eax
--
1.8.3.2