This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.
Re: [PATCH v4] faster strlen on x64
- From: Andreas Jaeger <aj at suse dot com>
- To: Ondřej Bílka <neleai at seznam dot cz>
- Cc: libc-alpha at sourceware dot org
- Date: Wed, 06 Mar 2013 22:54:11 +0100
- Subject: Re: [PATCH v4] faster strlen on x64
- References: <20130213113840.GA7781@domone.kolej.mff.cuni.cz>
On 02/13/2013 12:38 PM, Ondřej Bílka wrote:
Hello,
In the previous version I wrote that an unaligned read of the first 16 bytes
is a bad tradeoff. When I wrote the faster strcpy header I realized that this
was because I was doing a separate check for whether the read crosses a page.
When I only check that the next 64 bytes do not cross a page and do the
unaligned 16-byte load first, it causes only a small overhead for larger
strings. This makes my implementation faster for a wider family of workloads.
It speeds up the gcc benchmark and most other programs.
On the unit tests the revised version is somewhat slower than the previous
version. This is caused by the first-16-bytes path being taken only rarely,
which causes branch mispredictions.
I made two additional small improvements. The first is squashing in the
padding patch. The second is that the page-crossing test can be done as
x % 4096 < 4096 - 48 instead of x % 4096 <= 4096 - 64, because I align x to
16 bytes.
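For illustration, the check can be written roughly like this (a sketch only,
not code from the patch). Since the pointer is rounded down to a 16-byte
boundary before the 64-byte read, page offsets 4032..4047 all align down to
4032 and the read still ends at the page boundary, so only offsets above
4047 have to be excluded:

#include <stdbool.h>
#include <stdint.h>

/* Sketch of the page-crossing test: true when a 64-byte read starting
   at S rounded down to 16 bytes stays inside S's page.  */
static bool
read_64_stays_in_page (const char *s)
{
  return ((uintptr_t) s & 4095) < 4096 - 48;
}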
I updated the benchmarks; the difference between the new and revised versions is at
http://kam.mff.cuni.cz/~ondra/benchmark_string/strlen_profile.html
http://kam.mff.cuni.cz/~ondra/strlen_profile.tar.bz2
Ondra
2013-01-31 Ondrej Bilka <neleai@seznam.cz>
* sysdeps/x86_64/strlen.S: Replace with new SSE2 based
implementation which is faster on all x86_64 architectures.
Tested on AMD, Intel Nehalem, SNB, IVB.
* sysdeps/x86_64/strnlen.S: Likewise.
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines):
Remove all multiarch strlen and strnlen versions.
* sysdeps/x86_64/multiarch/ifunc-impl-list.c: Update.
Remove strlen and strnlen related parts.
* sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S: Update.
Inline strlen part.
* sysdeps/x86_64/multiarch/strcat-ssse3.S: Likewise.
* sysdeps/x86_64/multiarch/strlen.S: Remove.
* sysdeps/x86_64/multiarch/strlen-sse2-no-bsf.S: Remove.
* sysdeps/x86_64/multiarch/strlen-sse2-pminub.S: Remove.
* sysdeps/x86_64/multiarch/rtld-strlen.S: Remove.
* sysdeps/x86_64/multiarch/strlen-sse4.S: Remove.
* sysdeps/x86_64/multiarch/strnlen.S: Remove.
* sysdeps/x86_64/multiarch/strnlen-sse2-no-bsf.S: Remove.
From e7b469b93cf4cfd6475b644b0f2d72b8ae47170f Mon Sep 17 00:00:00 2001
From: Ondrej Bilka <neleai@seznam.cz>
Date: Wed, 30 Jan 2013 18:13:22 +0100
Subject: [PATCH] Faster strlen on x86-64.
---
sysdeps/x86_64/multiarch/Makefile | 6 +-
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 13 -
sysdeps/x86_64/multiarch/rtld-strlen.S | 1 -
sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S | 229 +++++++-
sysdeps/x86_64/multiarch/strcat-ssse3.S | 312 ++++++++++-
sysdeps/x86_64/multiarch/strlen-sse2-no-bsf.S | 685 ----------------------
sysdeps/x86_64/multiarch/strlen-sse2-pminub.S | 259 --------
sysdeps/x86_64/multiarch/strlen-sse4.S | 84 ---
sysdeps/x86_64/multiarch/strlen.S | 68 ---
sysdeps/x86_64/multiarch/strnlen-sse2-no-bsf.S | 3 -
sysdeps/x86_64/multiarch/strnlen.S | 57 --
sysdeps/x86_64/strlen.S | 265 +++++++--
sysdeps/x86_64/strnlen.S | 66 +--
13 files changed, 742 insertions(+), 1306 deletions(-)
delete mode 100644 sysdeps/x86_64/multiarch/rtld-strlen.S
delete mode 100644 sysdeps/x86_64/multiarch/strlen-sse2-no-bsf.S
delete mode 100644 sysdeps/x86_64/multiarch/strlen-sse2-pminub.S
delete mode 100644 sysdeps/x86_64/multiarch/strlen-sse4.S
delete mode 100644 sysdeps/x86_64/multiarch/strlen.S
delete mode 100644 sysdeps/x86_64/multiarch/strnlen-sse2-no-bsf.S
delete mode 100644 sysdeps/x86_64/multiarch/strnlen.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index dd6c27d..67686ad 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -10,14 +10,12 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
strend-sse4 memcmp-sse4 memcpy-ssse3 mempcpy-ssse3 \
memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \
- strncase_l-ssse3 strlen-sse4 strlen-sse2-no-bsf memset-x86-64 \
+ strncase_l-ssse3 memset-x86-64 strcat-ssse3 strncat-ssse3\
strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
strcpy-sse2-unaligned strncpy-sse2-unaligned \
stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
strcat-sse2-unaligned strncat-sse2-unaligned \
- strcat-ssse3 strncat-ssse3 strlen-sse2-pminub \
- strnlen-sse2-no-bsf strrchr-sse2-no-bsf strchr-sse2-no-bsf \
- memcmp-ssse3
+ strrchr-sse2-no-bsf strchr-sse2-no-bsf memcmp-ssse3
ifeq (yes,$(config-cflags-sse4))
sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift
CFLAGS-varshift.c += -msse4
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 643cb2d..848991e 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -187,11 +187,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__strncpy_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_sse2))
- /* Support sysdeps/x86_64/multiarch/strnlen.S. */
- IFUNC_IMPL (i, name, strnlen,
- IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2_no_bsf)
- IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2))
-
/* Support sysdeps/x86_64/multiarch/strpbrk.S. */
IFUNC_IMPL (i, name, strpbrk,
IFUNC_IMPL_ADD (array, i, strpbrk, HAS_SSE4_2,
@@ -262,14 +257,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__mempcpy_ssse3)
IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_sse2))
- /* Support sysdeps/x86_64/multiarch/strlen.S. */
- IFUNC_IMPL (i, name, strlen,
- IFUNC_IMPL_ADD (array, i, strlen, HAS_SSE4_2, __strlen_sse42)
- IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2_pminub)
- IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2_no_bsf)
- IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2)
- IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2))
-
/* Support sysdeps/x86_64/multiarch/strncmp.S. */
IFUNC_IMPL (i, name, strncmp,
IFUNC_IMPL_ADD (array, i, strncmp, HAS_SSE4_2,
diff --git a/sysdeps/x86_64/multiarch/rtld-strlen.S b/sysdeps/x86_64/multiarch/rtld-strlen.S
deleted file mode 100644
index 596e054..0000000
--- a/sysdeps/x86_64/multiarch/rtld-strlen.S
+++ /dev/null
@@ -1 +0,0 @@
-#include "../rtld-strlen.S"
diff --git a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
index 72bb609..6d9951e 100644
--- a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
@@ -34,10 +34,233 @@ ENTRY (STRCAT)
mov %rdx, %r8
# endif
-# define RETURN jmp L(StartStrcpyPart)
-# include "strlen-sse2-pminub.S"
-# undef RETURN
You say that you inline the strlen part in the ChangeLog entry, but I do
not see this in the function.
Please add comments here, it's not clear what this code does at all.
Explain at the beginning the algorithm used, and explain what's
happening in the code.
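To make the request concrete, here is how I read the code below, as a rough
C sketch with SSE2 intrinsics (my interpretation, not the author's
documentation). The real code unrolls the 16-byte checks and then switches to
a 64-byte pminub loop; the simple loop at the end of the sketch stands in for
both:

#include <emmintrin.h>
#include <stdint.h>
#include <stddef.h>

static size_t
strlen_sketch (const char *s)
{
  const __m128i zero = _mm_setzero_si128 ();
  const char *p = (const char *) ((uintptr_t) s & (uintptr_t) -16);
  unsigned int mask;

  if (((uintptr_t) s & 0x3f) <= 0x30)
    {
      /* Far enough from the end of a 64-byte block: a 16-byte
         unaligned load cannot cross into the next page.  */
      __m128i v = _mm_loadu_si128 ((const __m128i *) s);
      mask = _mm_movemask_epi8 (_mm_cmpeq_epi8 (v, zero));
      if (mask != 0)
        return __builtin_ctz (mask);
    }
  else
    {
      /* Otherwise load the containing aligned 16-byte block and mask
         off the bytes that precede S.  */
      __m128i v = _mm_load_si128 ((const __m128i *) p);
      mask = _mm_movemask_epi8 (_mm_cmpeq_epi8 (v, zero));
      mask &= (unsigned int) -1 << (s - p);
      if (mask != 0)
        return (size_t) (p + __builtin_ctz (mask) - s);
    }

  /* The real code checks 16 bytes at a time until 64-byte aligned and
     then runs a 64-byte pminub loop; this loop is a simplification.  */
  for (p += 16; ; p += 16)
    {
      __m128i v = _mm_load_si128 ((const __m128i *) p);
      mask = _mm_movemask_epi8 (_mm_cmpeq_epi8 (v, zero));
      if (mask != 0)
        return (size_t) (p + __builtin_ctz (mask) - s);
    }
}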
+ xor %rax, %rax
+ mov %edi, %ecx
+ and $0x3f, %ecx
+ pxor %xmm0, %xmm0
+ cmp $0x30, %ecx
+ ja L(next)
+ movdqu (%rdi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz L(exit_less16)
+ mov %rdi, %rax
+ and $-16, %rax
+ jmp L(align16_start)
+L(next):
+ mov %rdi, %rax
+ and $-16, %rax
+ pcmpeqb (%rax), %xmm0
+ mov $-1, %r10d
+ sub %rax, %rcx
+ shl %cl, %r10d
+ pmovmskb %xmm0, %edx
+ and %r10d, %edx
+ jnz L(exit)
+L(align16_start):
+ pxor %xmm0, %xmm0
+ pxor %xmm1, %xmm1
+ pxor %xmm2, %xmm2
+ pxor %xmm3, %xmm3
+ pcmpeqb 16(%rax), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz L(exit16)
+
+ pcmpeqb 32(%rax), %xmm1
+ pmovmskb %xmm1, %edx
+ test %edx, %edx
+ jnz L(exit32)
+
+ pcmpeqb 48(%rax), %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ jnz L(exit48)
+
+ pcmpeqb 64(%rax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ jnz L(exit64)
+
+ pcmpeqb 80(%rax), %xmm0
+ add $64, %rax
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz L(exit16)
+
+ pcmpeqb 32(%rax), %xmm1
+ pmovmskb %xmm1, %edx
+ test %edx, %edx
+ jnz L(exit32)
+
+ pcmpeqb 48(%rax), %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ jnz L(exit48)
+
+ pcmpeqb 64(%rax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ jnz L(exit64)
+
+ pcmpeqb 80(%rax), %xmm0
+ add $64, %rax
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz L(exit16)
+
+ pcmpeqb 32(%rax), %xmm1
+ pmovmskb %xmm1, %edx
+ test %edx, %edx
+ jnz L(exit32)
+
+ pcmpeqb 48(%rax), %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ jnz L(exit48)
+
+ pcmpeqb 64(%rax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ jnz L(exit64)
+
+ pcmpeqb 80(%rax), %xmm0
+ add $64, %rax
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz L(exit16)
+
+ pcmpeqb 32(%rax), %xmm1
+ pmovmskb %xmm1, %edx
+ test %edx, %edx
+ jnz L(exit32)
+
+ pcmpeqb 48(%rax), %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ jnz L(exit48)
+
+ pcmpeqb 64(%rax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ jnz L(exit64)
+
+ test $0x3f, %rax
+ jz L(align64_loop)
+
+ pcmpeqb 80(%rax), %xmm0
+ add $80, %rax
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz L(exit)
+
+ test $0x3f, %rax
+ jz L(align64_loop)
+
+ pcmpeqb 16(%rax), %xmm1
+ add $16, %rax
+ pmovmskb %xmm1, %edx
+ test %edx, %edx
+ jnz L(exit)
+
+ test $0x3f, %rax
+ jz L(align64_loop)
+
+ pcmpeqb 16(%rax), %xmm2
+ add $16, %rax
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ jnz L(exit)
+
+ test $0x3f, %rax
+ jz L(align64_loop)
+
+ pcmpeqb 16(%rax), %xmm3
+ add $16, %rax
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ jnz L(exit)
+
+ add $16, %rax
+ .p2align 4
+ L(align64_loop):
+ movaps (%rax), %xmm4
+ pminub 16(%rax), %xmm4
+ movaps 32(%rax), %xmm5
+ pminub 48(%rax), %xmm5
+ add $64, %rax
+ pminub %xmm4, %xmm5
+ pcmpeqb %xmm0, %xmm5
+ pmovmskb %xmm5, %edx
+ test %edx, %edx
+ jz L(align64_loop)
+
+ pcmpeqb -64(%rax), %xmm0
+ sub $80, %rax
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz L(exit16)
+
+ pcmpeqb 32(%rax), %xmm1
+ pmovmskb %xmm1, %edx
+ test %edx, %edx
+ jnz L(exit32)
+
+ pcmpeqb 48(%rax), %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ jnz L(exit48)
+
+ pcmpeqb 64(%rax), %xmm3
+ pmovmskb %xmm3, %edx
+ sub %rdi, %rax
+ bsf %rdx, %rdx
+ add %rdx, %rax
+ add $64, %rax
+ jmp L(StartStrcpyPart)
+
+ .p2align 4
+L(exit):
+ sub %rdi, %rax
+L(exit_less16):
+ bsf %rdx, %rdx
+ add %rdx, %rax
+ jmp L(StartStrcpyPart)
+
+ .p2align 4
+L(exit16):
+ sub %rdi, %rax
+ bsf %rdx, %rdx
+ add %rdx, %rax
+ add $16, %rax
+ jmp L(StartStrcpyPart)
+
+ .p2align 4
+L(exit32):
+ sub %rdi, %rax
+ bsf %rdx, %rdx
+ add %rdx, %rax
+ add $32, %rax
+ jmp L(StartStrcpyPart)
+
+ .p2align 4
+L(exit48):
+ sub %rdi, %rax
+ bsf %rdx, %rdx
+ add %rdx, %rax
+ add $48, %rax
+ jmp L(StartStrcpyPart)
+
+ .p2align 4
+L(exit64):
+ sub %rdi, %rax
+ bsf %rdx, %rdx
+ add %rdx, %rax
+ add $64, %rax
+
+ .p2align 4
L(StartStrcpyPart):
lea (%r9, %rax), %rdi
mov %rsi, %rcx
diff --git a/sysdeps/x86_64/multiarch/strcat-ssse3.S b/sysdeps/x86_64/multiarch/strcat-ssse3.S
index fea9d11..901e66f 100644
--- a/sysdeps/x86_64/multiarch/strcat-ssse3.S
+++ b/sysdeps/x86_64/multiarch/strcat-ssse3.S
@@ -33,11 +33,317 @@ ENTRY (STRCAT)
mov %rdx, %r8
# endif
-# define RETURN jmp L(StartStrcpyPart)
-# include "strlen-sse2-no-bsf.S"
Same here, this code needs comments.
+ xor %eax, %eax
+ cmpb $0, (%rdi)
+ jz L(exit_tail0)
+ cmpb $0, 1(%rdi)
+ jz L(exit_tail1)
+ cmpb $0, 2(%rdi)
+ jz L(exit_tail2)
+ cmpb $0, 3(%rdi)
+ jz L(exit_tail3)
+
+ cmpb $0, 4(%rdi)
+ jz L(exit_tail4)
+ cmpb $0, 5(%rdi)
+ jz L(exit_tail5)
+ cmpb $0, 6(%rdi)
+ jz L(exit_tail6)
+ cmpb $0, 7(%rdi)
+ jz L(exit_tail7)
+
+ cmpb $0, 8(%rdi)
+ jz L(exit_tail8)
+ cmpb $0, 9(%rdi)
+ jz L(exit_tail9)
+ cmpb $0, 10(%rdi)
+ jz L(exit_tail10)
+ cmpb $0, 11(%rdi)
+ jz L(exit_tail11)
+
+ cmpb $0, 12(%rdi)
+ jz L(exit_tail12)
+ cmpb $0, 13(%rdi)
+ jz L(exit_tail13)
+ cmpb $0, 14(%rdi)
+ jz L(exit_tail14)
+ cmpb $0, 15(%rdi)
+ jz L(exit_tail15)
+ pxor %xmm0, %xmm0
+ lea 16(%rdi), %rcx
+ lea 16(%rdi), %rax
+ and $-16, %rax
+
+ pcmpeqb (%rax), %xmm0
+ pmovmskb %xmm0, %edx
+ pxor %xmm1, %xmm1
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqb (%rax), %xmm1
+ pmovmskb %xmm1, %edx
+ pxor %xmm2, %xmm2
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqb (%rax), %xmm2
+ pmovmskb %xmm2, %edx
+ pxor %xmm3, %xmm3
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqb (%rax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqb (%rax), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqb (%rax), %xmm1
+ pmovmskb %xmm1, %edx
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqb (%rax), %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqb (%rax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqb (%rax), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqb (%rax), %xmm1
+ pmovmskb %xmm1, %edx
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqb (%rax), %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqb (%rax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqb (%rax), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqb (%rax), %xmm1
+ pmovmskb %xmm1, %edx
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqb (%rax), %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqb (%rax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ and $-0x40, %rax
-# undef RETURN
+ .p2align 4
+L(aligned_64):
+ pcmpeqb (%rax), %xmm0
+ pcmpeqb 16(%rax), %xmm1
+ pcmpeqb 32(%rax), %xmm2
+ pcmpeqb 48(%rax), %xmm3
+ pmovmskb %xmm0, %edx
+ pmovmskb %xmm1, %r11d
+ pmovmskb %xmm2, %r10d
+ pmovmskb %xmm3, %r9d
+ or %edx, %r9d
+ or %r11d, %r9d
+ or %r10d, %r9d
+ lea 64(%rax), %rax
+ jz L(aligned_64)
+
+ test %edx, %edx
+ jnz L(aligned_64_exit_16)
+ test %r11d, %r11d
+ jnz L(aligned_64_exit_32)
+ test %r10d, %r10d
+ jnz L(aligned_64_exit_48)
+
+L(aligned_64_exit_64):
+ pmovmskb %xmm3, %edx
+ jmp L(exit)
+
+L(aligned_64_exit_48):
+ lea -16(%rax), %rax
+ mov %r10d, %edx
+ jmp L(exit)
+
+L(aligned_64_exit_32):
+ lea -32(%rax), %rax
+ mov %r11d, %edx
+ jmp L(exit)
+
+L(aligned_64_exit_16):
+ lea -48(%rax), %rax
+
+L(exit):
+ sub %rcx, %rax
+ test %dl, %dl
+ jz L(exit_high)
+ test $0x01, %dl
+ jnz L(exit_tail0)
+
+ test $0x02, %dl
+ jnz L(exit_tail1)
+
+ test $0x04, %dl
+ jnz L(exit_tail2)
+
+ test $0x08, %dl
+ jnz L(exit_tail3)
+
+ test $0x10, %dl
+ jnz L(exit_tail4)
+
+ test $0x20, %dl
+ jnz L(exit_tail5)
+
+ test $0x40, %dl
+ jnz L(exit_tail6)
+ add $7, %eax
+L(exit_tail0):
+ jmp L(StartStrcpyPart)
+
+ .p2align 4
+L(exit_high):
+ add $8, %eax
+ test $0x01, %dh
+ jnz L(exit_tail0)
+
+ test $0x02, %dh
+ jnz L(exit_tail1)
+
+ test $0x04, %dh
+ jnz L(exit_tail2)
+
+ test $0x08, %dh
+ jnz L(exit_tail3)
+
+ test $0x10, %dh
+ jnz L(exit_tail4)
+
+ test $0x20, %dh
+ jnz L(exit_tail5)
+
+ test $0x40, %dh
+ jnz L(exit_tail6)
+ add $7, %eax
+ jmp L(StartStrcpyPart)
+
+ .p2align 4
+L(exit_tail1):
+ add $1, %eax
+ jmp L(StartStrcpyPart)
+
+ .p2align 4
+L(exit_tail2):
+ add $2, %eax
+ jmp L(StartStrcpyPart)
+
+ .p2align 4
+L(exit_tail3):
+ add $3, %eax
+ jmp L(StartStrcpyPart)
+
+ .p2align 4
+L(exit_tail4):
+ add $4, %eax
+ jmp L(StartStrcpyPart)
+
+ .p2align 4
+L(exit_tail5):
+ add $5, %eax
+ jmp L(StartStrcpyPart)
+
+ .p2align 4
+L(exit_tail6):
+ add $6, %eax
+ jmp L(StartStrcpyPart)
+
+ .p2align 4
+L(exit_tail7):
+ add $7, %eax
+ jmp L(StartStrcpyPart)
+
+ .p2align 4
+L(exit_tail8):
+ add $8, %eax
+ jmp L(StartStrcpyPart)
+
+ .p2align 4
+L(exit_tail9):
+ add $9, %eax
+ jmp L(StartStrcpyPart)
+
+ .p2align 4
+L(exit_tail10):
+ add $10, %eax
+ jmp L(StartStrcpyPart)
+
+ .p2align 4
+L(exit_tail11):
+ add $11, %eax
+ jmp L(StartStrcpyPart)
+
+ .p2align 4
+L(exit_tail12):
+ add $12, %eax
+ jmp L(StartStrcpyPart)
+
+ .p2align 4
+L(exit_tail13):
+ add $13, %eax
+ jmp L(StartStrcpyPart)
+ .p2align 4
+L(exit_tail14):
+ add $14, %eax
+ jmp L(StartStrcpyPart)
+
+ .p2align 4
+L(exit_tail15):
+ add $15, %eax
+
+ .p2align 4
L(StartStrcpyPart):
mov %rsi, %rcx
lea (%rdi, %rax), %rdx
diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
index 4bdca0a..c8ced10 100644
--- a/sysdeps/x86_64/strlen.S
+++ b/sysdeps/x86_64/strlen.S
@@ -1,6 +1,5 @@
-/* strlen(str) -- determine the length of the string STR.
- Copyright (C) 2009-2013 Free Software Foundation, Inc.
- Contributed by Ulrich Drepper <drepper@redhat.com>.
+/* SSE2 version of strlen.
+ Copyright (C) 2012, 2013 Free Software Foundation, Inc.
It's 2012-2013
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -19,83 +18,219 @@
#include <sysdep.h>
+/* Used in linker - use only %xmm8-%xmm15. */
Better expand
Say explicitly - if it's true: This is used inside the dynamic linker
during relocation, thus we cannot touch the usual registers. We can use
%xmm8 to %xmm15.
- .text
+/* Long lived register are
+ strlen(s), strnlen(s, n):
No line break here.
Rewrite this a bit. What about?
Long lived register usage for strlen(s), strnlen(s, n):
Still not perfect but I don't have a better idea.
+
+ %xmm11 - zero
+ %rdi - s
+ %r10 (s+n) & (~(64-1))
don't add a tab here, make it (s+n) & (~(64-1))
+ %r11 s+n
+*/
+
+
+.text
ENTRY(strlen)
+
This needs a comment. What is this macro doing?
+#define FIND_ZERO \
+ pcmpeqb (%rax), %xmm8; \
+ pcmpeqb 16(%rax), %xmm9; \
+ pcmpeqb 32(%rax), %xmm10; \
+ pcmpeqb 48(%rax), %xmm11; \
+ pmovmskb %xmm8, %esi; \
+ pmovmskb %xmm9, %edx; \
+ pmovmskb %xmm10, %r8d; \
+ pmovmskb %xmm11, %ecx; \
+ salq $16, %rdx; \
+ salq $16, %rcx; \
+ orq %rsi, %rdx; \
+ orq %r8, %rcx; \
+ salq $32, %rcx; \
+ orq %rcx, %rdx;
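For reference, my reading of what FIND_ZERO computes, as a C sketch with SSE2
intrinsics (assuming the %xmm registers are zero on entry, as the surrounding
code arranges; this is my interpretation, not the patch's documentation):

#include <emmintrin.h>
#include <stdint.h>

/* Return a 64-bit mask with bit i set iff p[i] == 0, for the 64 bytes
   at the 16-byte-aligned pointer P; this mirrors the four pcmpeqb /
   pmovmskb pairs and the shift/or sequence above.  */
static uint64_t
find_zero_sketch (const char *p)
{
  const __m128i zero = _mm_setzero_si128 ();
  uint64_t m0 = (unsigned int) _mm_movemask_epi8
    (_mm_cmpeq_epi8 (_mm_load_si128 ((const __m128i *) p), zero));
  uint64_t m1 = (unsigned int) _mm_movemask_epi8
    (_mm_cmpeq_epi8 (_mm_load_si128 ((const __m128i *) (p + 16)), zero));
  uint64_t m2 = (unsigned int) _mm_movemask_epi8
    (_mm_cmpeq_epi8 (_mm_load_si128 ((const __m128i *) (p + 32)), zero));
  uint64_t m3 = (unsigned int) _mm_movemask_epi8
    (_mm_cmpeq_epi8 (_mm_load_si128 ((const __m128i *) (p + 48)), zero));
  return m0 | (m1 << 16) | (m2 << 32) | (m3 << 48);
}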
+
+#ifdef AS_STRNLEN
+/* Do not read anything when n==0. */
+ test %rsi, %rsi
+ jne L(n_nonzero)
xor %rax, %rax
- mov %edi, %ecx
- and $0x3f, %ecx
- pxor %xmm0, %xmm0
- cmp $0x30, %ecx
+ ret
+L(n_nonzero):
+
+/* Initialize long lived registers. */
+
+ add %rdi, %rsi
+ mov %rsi, %r10
+ and $-64, %r10
+ mov %rsi, %r11
+#endif
+
+ pxor %xmm8, %xmm8
+ pxor %xmm9, %xmm9
+ pxor %xmm10, %xmm10
+ pxor %xmm11, %xmm11
+ movq %rdi, %rax
+ movq %rdi, %rcx
+ andq $4095, %rcx
+/* Offsets 4032-4047 will be aligned into 4032 thus fit into page. */
+ cmpq $4047, %rcx
+/* We cannot unify this branching as it would be ~6 cycles slower. */
ja L(next)
- movdqu (%rdi), %xmm1
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %edx
+
+#ifdef AS_STRNLEN
What is this prolog doing?
+# define STRNLEN_PROLOG \
+ mov %r11, %rsi; \
+ subq %rax, %rsi; \
+ andq $-64, %rax; \
+ testq $-64, %rsi; \
+ je L(strnlen_ret)
+#else
+# define STRNLEN_PROLOG andq $-64, %rax;
+#endif
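My reading of STRNLEN_PROLOG, as a sketch with hypothetical names (BLOCK is
the address the FIND_ZERO mask was taken for, END is s + maxlen; again my
interpretation, not the author's wording):

#include <stdbool.h>
#include <stdint.h>

/* True when END falls within the 64 bytes covered by the mask just
   computed; in that case the asm jumps to L(strnlen_ret) to fold the
   length limit into the mask instead of entering the main loop.
   Otherwise %rax is rounded down to 64 bytes for the loop.  */
static bool
limit_within_scanned_block (const char *block, const char *end)
{
  uint64_t remaining = (uintptr_t) end - (uintptr_t) block;  /* %rsi */
  return (remaining & ~(uint64_t) 63) == 0;  /* testq $-64, %rsi */
}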
+
And this one? Please document!
+#define PROLOG(lab) \
+ movq %rdi, %rcx; \
+ xorq %rax, %rcx; \
+ STRNLEN_PROLOG; \
+ sarq %cl, %rdx; \
+ test %rdx, %rdx; \
+ je L(lab); \
+ bsfq %rdx, %rax; \
+ ret
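And my reading of PROLOG for the plain strlen case (the strnlen limit
handling goes through STRNLEN_PROLOG above); a hedged sketch with a
hypothetical helper, not the patch's own documentation:

#include <stdbool.h>
#include <stdint.h>
#include <stddef.h>

/* MASK is the FIND_ZERO result for the block at BLOCK, S the original
   string pointer.  Drop the mask bits for bytes before S; if a zero
   byte remains, store its index relative to S in *LEN and return true
   (the asm then returns), else return false (the asm falls through to
   the given label, i.e. the main loop).  */
static bool
prolog_sketch (uint64_t mask, const char *s, const char *block,
               size_t *len)
{
  mask >>= (uintptr_t) s - (uintptr_t) block;   /* sarq %cl, %rdx */
  if (mask == 0)
    return false;
  *len = (size_t) __builtin_ctzll (mask);       /* bsfq %rdx, %rax */
  return true;
}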
+
+#ifdef AS_STRNLEN
+ andq $-16, %rax
+ FIND_ZERO
+#else
+ movdqu (%rax), %xmm12
+ pcmpeqb %xmm8, %xmm12
+ pmovmskb %xmm12, %edx
test %edx, %edx
- jnz L(exit_less16)
- mov %rdi, %rax
- and $-16, %rax
- jmp L(align16_start)
+ je L(next48_bytes)
+ bsfq %rdx, %rax
+ ret
+
+L(next48_bytes):
+/* Same as FIND_ZERO except we do not check first 16 bytes. */
+ andq $-16, %rax
+ pcmpeqb 16(%rax), %xmm9;
+ pcmpeqb 32(%rax), %xmm10;
+ pcmpeqb 48(%rax), %xmm11;
+ pmovmskb %xmm9, %edx;
+ pmovmskb %xmm10, %r8d;
+ pmovmskb %xmm11, %ecx;
+ salq $16, %rdx;
+ salq $16, %rcx;
+ orq %r8, %rcx;
+ salq $32, %rcx;
+ orq %rcx, %rdx;
+#endif
+
+ PROLOG(loop)
+
+ .p2align 4
L(next):
- mov %rdi, %rax
- and $-16, %rax
- pcmpeqb (%rax), %xmm0
- mov $-1, %esi
- sub %rax, %rcx
- shl %cl, %esi
- pmovmskb %xmm0, %edx
- and %esi, %edx
- jnz L(exit)
-L(align16_start):
- pxor %xmm0, %xmm0
- pxor %xmm1, %xmm1
- pxor %xmm2, %xmm2
- pxor %xmm3, %xmm3
+ andq $-64, %rax
+ FIND_ZERO
+ PROLOG(loop_init)
+
+#ifdef AS_STRNLEN
+/* We must do this check to correctly handle strnlen (s, -1). */
+L(strnlen_ret):
+ bts %rsi, %rdx
+ sarq %cl, %rdx
+ test %rdx, %rdx
+ je L(loop_init)
+ bsfq %rdx, %rax
+ ret
+#endif
.p2align 4
-L(align16_loop):
- pcmpeqb 16(%rax), %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz L(exit16)
+L(loop_init):
+ pxor %xmm9, %xmm9
+ pxor %xmm10, %xmm10
+ pxor %xmm11, %xmm11
+#ifdef AS_STRNLEN
+ .p2align 4
+L(loop):
- pcmpeqb 32(%rax), %xmm1
- pmovmskb %xmm1, %edx
- test %edx, %edx
- jnz L(exit32)
+ addq $64, %rax
+ cmpq %rax, %r10
+ je L(exit_end)
- pcmpeqb 48(%rax), %xmm2
- pmovmskb %xmm2, %edx
- test %edx, %edx
- jnz L(exit48)
+ movdqa (%rax), %xmm8
+ pminub 16(%rax), %xmm8
+ pminub 32(%rax), %xmm8
+ pminub 48(%rax), %xmm8
+ pcmpeqb %xmm11, %xmm8
+ pmovmskb %xmm8, %edx
+ testl %edx, %edx
+ jne L(exit)
+ jmp L(loop)
- pcmpeqb 64(%rax), %xmm3
- pmovmskb %xmm3, %edx
- lea 64(%rax), %rax
- test %edx, %edx
- jz L(align16_loop)
-L(exit):
- sub %rdi, %rax
-L(exit_less16):
- bsf %rdx, %rdx
- add %rdx, %rax
- ret
.p2align 4
-L(exit16):
- sub %rdi, %rax
- bsf %rdx, %rdx
- lea 16(%rdx,%rax), %rax
+L(exit_end):
+ cmp %rax, %r11
+ je L(first)
+ pxor %xmm8, %xmm8
+ FIND_ZERO
+
+L(first):
+ bts %r11, %rdx
+ bsfq %rdx, %rdx
+ addq %rdx, %rax
+ subq %rdi, %rax
ret
+
.p2align 4
-L(exit32):
- sub %rdi, %rax
- bsf %rdx, %rdx
- lea 32(%rdx,%rax), %rax
+L(exit):
+ pxor %xmm8, %xmm8
+ FIND_ZERO
+
+ bsfq %rdx, %rdx
+ addq %rdx, %rax
+ subq %rdi, %rax
ret
+
+#else
+ .p2align 4
+L(loop):
+
+ movdqa 64(%rax), %xmm8
+ pminub 80(%rax), %xmm8
+ pminub 96(%rax), %xmm8
+ pminub 112(%rax), %xmm8
+ pcmpeqb %xmm11, %xmm8
+ pmovmskb %xmm8, %edx
+ testl %edx, %edx
+ jne L(exit64)
+
+ subq $-128, %rax
+
+ movdqa (%rax), %xmm8
+ pminub 16(%rax), %xmm8
+ pminub 32(%rax), %xmm8
+ pminub 48(%rax), %xmm8
+ pcmpeqb %xmm11, %xmm8
+ pmovmskb %xmm8, %edx
+ testl %edx, %edx
+ jne L(exit0)
+ jmp L(loop)
+
.p2align 4
-L(exit48):
- sub %rdi, %rax
- bsf %rdx, %rdx
- lea 48(%rdx,%rax), %rax
+L(exit64):
+ addq $64, %rax
+L(exit0):
+ pxor %xmm8, %xmm8
+ FIND_ZERO
+
+ bsfq %rdx, %rdx
+ addq %rdx, %rax
+ subq %rdi, %rax
ret
+
+#endif
+
END(strlen)
+#ifndef AS_STRLEN
libc_hidden_builtin_def (strlen)
+#endif
diff --git a/sysdeps/x86_64/strnlen.S b/sysdeps/x86_64/strnlen.S
index 6e53503..d0694a5 100644
--- a/sysdeps/x86_64/strnlen.S
+++ b/sysdeps/x86_64/strnlen.S
@@ -1,63 +1,7 @@
-/* strnlen(str,maxlen) -- determine the length of the string STR up to MAXLEN.
- Copyright (C) 2010-2013 Free Software Foundation, Inc.
- Contributed by Ulrich Drepper <drepper@redhat.com>.
- This file is part of the GNU C Library.
Please add the usual copyright etc. headers - even for a trivial file.
+#define AS_STRNLEN
+#define strlen __strnlen
+#include "strlen.S"
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
+weak_alias (__strnlen, strnlen);
+libc_hidden_builtin_def (strnlen)
[...]
Please add more comments, especially where I noted it, and resend for
another review,
thanks,
Andreas
--
Andreas Jaeger aj@{suse.com,opensuse.org} Twitter/Identica: jaegerandi
SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg, Germany
GF: Jeff Hawn,Jennifer Guild,Felix Imendörffer,HRB16746 (AG Nürnberg)
GPG fingerprint = 93A3 365E CE47 B889 DF7F FED1 389A 563C C272 A126