
Getting l->__ctype_tolower in assembly. unaligned strcmp


Hi, I have strcasecmp almost ready.  I now just need to know how to get
the __ctype_tolower table address in assembly.  I could use the
workaround of jumping in from a C implementation, which is ugly:

int
strcasecmp_l_c (const char *x, const char *y, __locale_t l)
{
  return strcasecmp_l_as (x, y, l->__ctype_tolower);
}
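
For reference, the pointer that needs to end up in %r11 is the one
newlocale stores in l->__ctype_tolower.  As far as I can tell it is
derived from the LC_CTYPE locale data roughly as follows (a sketch
against internal headers; ctype_tolower_table is just an illustrative
name):

#include <locale.h>
#include "localeinfo.h"	/* Internal: struct __locale_data, _NL_ITEM_INDEX.  */

/* Sketch of what newlocale puts into l->__ctype_tolower: the LC_CTYPE
   tolower array advanced by 128 entries.  The array covers indices
   -128..255 and the returned pointer sits at index 0, so indexing it
   with a zero-extended byte, as the patch does, should be fine.  */
static const int *
ctype_tolower_table (__locale_t l)
{
  return ((const int *) l->__locales[LC_CTYPE]
	    ->values[_NL_ITEM_INDEX (_NL_CTYPE_TOLOWER)].string) + 128;
}

In assembly that should amount to two dependent loads plus a fixed
128*4 displacement, using offsets from locale-defines.h (a constant for
_NL_CTYPE_TOLOWER may need adding there); alternatively, after checking
_NL_CTYPE_NONASCII_CASE the static _nl_C_LC_CTYPE_tolower table could
be used directly, which I believe is what the SSE4.2 strcasecmp does.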

On Fri, Sep 13, 2013 at 10:53:03PM +0200, Ondřej Bílka wrote:
> If the assumptions above are true in general, then the best course of
> action is to make minimal modifications for strcasecmp.  A strcmp
> first produces a bitmask of the differing characters, then returns the
> difference of the first differing pair:
> 
> if (m) {
>   return a[ffs (m) - 1] - b[ffs (m) - 1];
> }
>
> which I would replace by enumerating over all of the flagged
> characters, as it is likely that we stop at the second one:
> 
> while (m) {
>   if (tolower (a[ffs (m) - 1]) != tolower (b[ffs (m) - 1]))
>     return tolower (a[ffs (m) - 1]) - tolower (b[ffs (m) - 1]);
>   if (!a[ffs (m) - 1]) return 0;
>   m = m & (m - 1);
> }

I have now converted this to assembly, modulo getting the tolower table
into %r11.
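
Expressed as standalone C, that fixup loop corresponds to roughly the
following (a sketch; the names are mine, m stands for the pmovmskb mask
of byte positions that differ or hold a zero byte, and plain tolower is
used where the assembly indexes the locale table through %r11):

#include <ctype.h>
#include <strings.h>	/* ffs  */

/* Walk the set bits of M, one bit per flagged byte position.  Return
   the tolower difference at the first position that still differs,
   0 if a NUL terminator is reached first, and 0 as well when every
   flagged position turns out case-equal, in which case the caller
   continues with the next block.  */
static int
case_fixup (const unsigned char *a, const unsigned char *b, unsigned int m)
{
  while (m)
    {
      int i = ffs (m) - 1;
      if (tolower (a[i]) != tolower (b[i]))
	return tolower (a[i]) - tolower (b[i]);
      if (a[i] == '\0')
	return 0;
      m &= m - 1;	/* Clear the lowest set bit.  */
    }
  return 0;
}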


---
 sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S | 110 +++++++++++++++++++++++
 1 file changed, 110 insertions(+)

diff --git a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
index eed8432..d3fc1f8 100644
--- a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
@@ -16,10 +16,29 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
 #include "sysdep.h"
 #define ALIGN(x)	.p2align x
 
+#ifdef AS_STRCASECMP
+# include "locale-defines.h"
+
+ENTRY (__strcasecmp_sse2_unaligned)
+	movq	__libc_tsd_LOCALE@gottpoff(%rip),%rax
+	mov	%fs:(%rax),%RDX_LP
+
+	// XXX 5 byte should be before the function
+	/* 5-byte NOP.  */
+	.byte	0x0f,0x1f,0x44,0x00,0x00
+END (__strcasecmp_sse2_unaligned)
+
+# define  __strcmp_sse2_unaligned __strcasecmp_sse2_unaligned_l
+#endif
+
 ENTRY ( __strcmp_sse2_unaligned)
+
+#ifdef AS_STRCASECMP
+	mov /*  l->__ctype_tolower  */, %r11
+#endif
 	movl	%edi, %eax
 	xorl	%edx, %edx
 	pxor	%xmm7, %xmm7
@@ -36,12 +55,16 @@ ENTRY ( __strcmp_sse2_unaligned)
 	pmovmskb	%xmm0, %eax
 	testq	%rax, %rax
 	je	L(next_48_bytes)
+#ifndef AS_STRCASECMP
 L(return):
 	bsfq	%rax, %rdx
 	movzbl	(%rdi, %rdx), %eax
 	movzbl	(%rsi, %rdx), %edx
 	subl	%edx, %eax
 	ret
+#else
+	jmp L(caseloop1)
+#endif
 
 	ALIGN (4)
 L(next_48_bytes):
@@ -85,6 +108,74 @@ L(main_loop_header):
 	movq	%rcx, %rsi
 	jmp	L(loop_start)
 
+#ifdef AS_STRCASECMP
+L(caseloop1):
+	bsfq	%rax, %rdx		/* Index of first flagged byte.  */
+	leaq	-1(%rax), %rcx
+	andq	%rax, %rcx		/* Mask with lowest set bit cleared.  */
+	movzbl	(%rdi, %rdx), %eax
+	movzbl	(%rsi, %rdx), %edx
+	testl	%eax, %edx		/* NUL or no common bits: the raw  */
+	je	L(zero1)		/* difference decides.  */
+	movl	(%r11, %rax, 4), %eax	/* tolower (a[i])  */
+	movl	(%r11, %rdx, 4), %edx	/* tolower (b[i])  */
+	cmpl	%edx, %eax		/* Equal after tolower?  */
+	je	L(casecnt1)
+L(zero1):
+	subl	%edx, %eax
+	ret
+L(casecnt1):
+	testq	%rcx, %rcx		/* More flagged bytes left?  */
+	je	L(next_48_bytes)
+	movq	%rcx, %rax
+	jmp	L(caseloop1)
+
+L(return):
+L(caseloop2):
+	bsfq    %rax, %rdx
+	leaq	-1(%rax), %rcx
+	andq	%rax, %rcx
+	movzbl  (%rdi, %rdx), %eax
+        movzbl  (%rsi, %rdx), %edx
+	testl	%eax, %edx
+	je	L(zero2)
+	movl	(%r11, %rax, 4), %eax
+	movl	(%r11, %rdx, 4), %edx
+	cmpl	%edx, %eax
+	je	L(casecnt2)
+L(zero2):
+	subl    %edx, %eax
+	ret
+L(casecnt2):
+	testq	%rcx, %rcx
+	je	L(main_loop_header)
+	movq	%rcx, %rax
+	jmp L(caseloop2)
+
+L(caseloop3):
+        bsfq    %rax, %rdx
+        leaq    -1(%rax), %rcx
+        andq    %rax, %rcx
+        movzbl  (%rdi, %rdx), %eax
+        movzbl  (%rsi, %rdx), %edx
+        testl   %eax, %edx
+        je      L(zero3)
+        movl    (%r11, %rax, 4), %eax
+        movl    (%r11, %rdx, 4), %edx
+        cmpl    %edx, %eax
+        je      L(casecnt3)
+L(zero3):
+        subl    %edx, %eax
+        ret
+L(casecnt3):
+        testq   %rcx, %rcx
+        je      L(back_to_loop)
+        movq    %rcx, %rax
+        jmp L(caseloop3)
+
+#endif
+
+
 	ALIGN	(4)
 L(loop):
 	addq	$64, %rax
@@ -135,11 +226,18 @@ L(back_to_loop):
 	orq	%rdi, %rcx
 	salq	$48, %rsi
 	orq	%rsi, %rcx
+#ifndef AS_STRCASECMP
 	bsfq	%rcx, %rcx
 	movzbl	(%rax, %rcx), %eax
 	movzbl	(%rdx, %rcx), %edx
 	subl	%edx, %eax
 	ret
+#else
+	movq	%rax, %rdi
+	movq	%rdx, %rsi
+	movq	%rcx, %rax
+	jmp	L(return)
+#endif
 
 	ALIGN (4)
 L(loop_cross_page):
@@ -185,11 +283,19 @@ L(loop_cross_page):
 	shrq	%cl, %rdi
 	test	%rdi, %rdi
 	je	L(back_to_loop)
+#ifndef AS_STRCASECMP
 	bsfq	%rdi, %rcx
 	movzbl	(%rax, %rcx), %eax
 	movzbl	(%rdx, %rcx), %edx
 	subl	%edx, %eax
 	ret
+#else
+	movq	%rdi, %rcx
+	movq	%rax, %rdi
+	movq	%rdx, %rsi
+	movq	%rcx, %rax
+	jmp	L(caseloop3)
+#endif
 
 	ALIGN (4)
 L(cross_page_loop):
@@ -201,6 +307,10 @@ L(cross_page_loop):
 L(cross_page):
 	movzbl	(%rdi, %rdx), %eax
 	movzbl	(%rsi, %rdx), %ecx
+#ifdef AS_STRCASECMP
+	movl	(%r11, %rax, 4), %eax
+	movl	(%r11, %rcx, 4), %ecx
+#endif
 	testb	%al, %al
 	jne	L(cross_page_loop)
 	xorl	%eax, %eax
-- 
1.8.3.2
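
For completeness, this is roughly the sanity check I run against the
system strcasecmp (a sketch; __strcasecmp_sse2_unaligned would have to
be built and exported somehow for a standalone test, and only the sign
of the result is compared):

#include <stdio.h>
#include <strings.h>	/* strcasecmp  */

extern int __strcasecmp_sse2_unaligned (const char *, const char *);

static int
sign (int x)
{
  return (x > 0) - (x < 0);
}

int
main (void)
{
  static const char *const tests[][2] = {
    { "", "" }, { "a", "A" }, { "abc", "ABD" }, { "HELLO", "hello" },
    { "abc", "abcd" }, { "aX", "AY" }, { "Mixed CASE", "mixed case" },
  };
  int failures = 0;

  for (unsigned int i = 0; i < sizeof tests / sizeof tests[0]; ++i)
    {
      int want = sign (strcasecmp (tests[i][0], tests[i][1]));
      int got = sign (__strcasecmp_sse2_unaligned (tests[i][0], tests[i][1]));
      if (want != got)
	{
	  printf ("FAIL: \"%s\" vs \"%s\": want %d, got %d\n",
		  tests[i][0], tests[i][1], want, got);
	  ++failures;
	}
    }
  return failures != 0;
}

Longer inputs that exercise the 64-byte main loop and page-crossing
paths would of course need covering as well.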

