This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
[PATCH] Faster strlen
- From: Ondřej Bílka <neleai at seznam dot cz>
- To: libc-alpha at sourceware dot org
- Date: Sun, 7 Oct 2012 19:27:53 +0200
- Subject: [PATCH] Faster strlen
Hello, I investigated strlen a bit more and improved the pminub variant.
I got up to 10% speedup by unrolling the main loop. I did not measure any
difference when I unrolled the loop further.
I also benchmarked atom and added variant which is identical to
strlen-sse2-pminub except bsf is replaced by table lookup.
Last addition is attempt to generate VEX encoded strlen. I need only to
pass -mavx flag when compiling strlen_avx.S but do not know how.
Benchmarks are at usual place. To fit all functions consider only random
alignment. I also increased granularity of sampling.
http://kam.mff.cuni.cz/~ondra/benchmark_string/
Results for this patch are
http://kam.mff.cuni.cz/~ondra/benchmark_string/benchmark_strlen_7_10_2012.tar.bz2
On sandy bridge
http://kam.mff.cuni.cz/~ondra/benchmark_string/i7_sandy_bridge/strlen/html/test_r.html
there is phase change around sizes 1500-2000. Do you know what caused it?
Another optimization is prefetching. Most of the time the prefetching variant is
slower than the non-prefetching one (as large strings are rare).
On sandy bridge prefetching is free. I need an additional ifunc flag to
indicate that.
I disabled prefetching in my patch.
On atom, ironically, strlen-sse2-no-bsf was slower than the pminub variant
except for strings less than 16 bytes long.
For the exit from the main loop of the no-bsf variant, using bsfq instead of a
binary search saves 10 cycles. Multiplication + table lookup is also slow on
atom because 64-bit multiplication is slow.
I used the pminub variant with the bsf instruction replaced by my table lookup.
This is about 8 cycles faster on atom.
I did not reschedule instructions for atom for easier review.
The sse2, pminub, no-bsf, and sse4 variants are everywhere slower than my patch,
so I removed them. pminub and no-bsf are used in strcat and will be removed in a
separate patch.
2012-10-07 Ondrej Bilka <neleai@seznam.cz>
* sysdeps/x86_64/strlen.S:
Use unrolled pminub variant by default.
* sysdeps/x86_64/multiarch/strlen_avx.S:
Recode default variant using VEX prefix.
* sysdeps/x86_64/multiarch/strlen_atom.S:
New variant tailored to atom.
* sysdeps/x86_64/multiarch/strlen.S: Updated function selection.
* sysdeps/x86_64/multiarch/strlen-sse4.S: deleted
* sysdeps/x86_64/multiarch/Makefile: updated
---
sysdeps/x86_64/multiarch/Makefile | 6 +-
sysdeps/x86_64/multiarch/strlen-sse4.S | 84 -------
sysdeps/x86_64/multiarch/strlen.S | 38 +---
sysdeps/x86_64/multiarch/strlen_atom.S | 398 ++++++++++++++++++++++++++++++++
sysdeps/x86_64/multiarch/strlen_avx.S | 4 +
sysdeps/x86_64/strlen.S | 156 +++++++------
6 files changed, 498 insertions(+), 188 deletions(-)
delete mode 100644 sysdeps/x86_64/multiarch/strlen-sse4.S
create mode 100644 sysdeps/x86_64/multiarch/strlen_atom.S
create mode 100644 sysdeps/x86_64/multiarch/strlen_avx.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index dd6c27d..22f1435 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -10,12 +10,12 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
strend-sse4 memcmp-sse4 memcpy-ssse3 mempcpy-ssse3 \
memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \
- strncase_l-ssse3 strlen-sse4 strlen-sse2-no-bsf memset-x86-64 \
+ strncase_l-ssse3 memset-x86-64 \
strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
strcpy-sse2-unaligned strncpy-sse2-unaligned \
stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
strcat-sse2-unaligned strncat-sse2-unaligned \
- strcat-ssse3 strncat-ssse3 strlen-sse2-pminub \
+ strcat-ssse3 strncat-ssse3 strlen_atom strlen_avx \
strnlen-sse2-no-bsf strrchr-sse2-no-bsf strchr-sse2-no-bsf \
memcmp-ssse3
ifeq (yes,$(config-cflags-sse4))
@@ -27,6 +27,8 @@ CFLAGS-strspn-c.c += -msse4
CFLAGS-strstr.c += -msse4
CFLAGS-strcasestr.c += -msse4
CFLAGS-strcasestr-nonascii.c += -msse4
+CFLAGS-strlen_avx.S += -mavx
+
endif
endif
diff --git a/sysdeps/x86_64/multiarch/strlen-sse4.S b/sysdeps/x86_64/multiarch/strlen-sse4.S
deleted file mode 100644
index ea5b783..0000000
--- a/sysdeps/x86_64/multiarch/strlen-sse4.S
+++ /dev/null
@@ -1,84 +0,0 @@
-/* strlen with SSE4
- Copyright (C) 2009, 2010 Free Software Foundation, Inc.
- Contributed by Ulrich Drepper <drepper@redhat.com>.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#if defined SHARED && !defined NOT_IN_libc
-
-#include <sysdep.h>
-
- .section .text.sse4.2,"ax",@progbits
-ENTRY (__strlen_sse42)
- pxor %xmm1, %xmm1
- movl %edi, %ecx
- movq %rdi, %r8
- andq $~15, %rdi
- xor %edi, %ecx
- pcmpeqb (%rdi), %xmm1
- pmovmskb %xmm1, %edx
- shrl %cl, %edx
- shll %cl, %edx
- andl %edx, %edx
- jnz L(less16bytes)
- pxor %xmm1, %xmm1
-
- .p2align 4
-L(more64bytes_loop):
- pcmpistri $0x08, 16(%rdi), %xmm1
- jz L(more32bytes)
-
- pcmpistri $0x08, 32(%rdi), %xmm1
- jz L(more48bytes)
-
- pcmpistri $0x08, 48(%rdi), %xmm1
- jz L(more64bytes)
-
- add $64, %rdi
- pcmpistri $0x08, (%rdi), %xmm1
- jnz L(more64bytes_loop)
- leaq (%rdi,%rcx), %rax
- subq %r8, %rax
- ret
-
- .p2align 4
-L(more32bytes):
- leaq 16(%rdi,%rcx, 1), %rax
- subq %r8, %rax
- ret
-
- .p2align 4
-L(more48bytes):
- leaq 32(%rdi,%rcx, 1), %rax
- subq %r8, %rax
- ret
-
- .p2align 4
-L(more64bytes):
- leaq 48(%rdi,%rcx, 1), %rax
- subq %r8, %rax
- ret
-
- .p2align 4
-L(less16bytes):
- subq %r8, %rdi
- bsfl %edx, %eax
- addq %rdi, %rax
- ret
-
-END (__strlen_sse42)
-
-#endif
diff --git a/sysdeps/x86_64/multiarch/strlen.S b/sysdeps/x86_64/multiarch/strlen.S
index 0c46b4f..cf1be3f 100644
--- a/sysdeps/x86_64/multiarch/strlen.S
+++ b/sysdeps/x86_64/multiarch/strlen.S
@@ -24,42 +24,26 @@
/* Define multiple versions only for the definition in libc and for
the DSO. In static binaries we need strlen before the initialization
happened. */
-#if defined SHARED && !defined NOT_IN_libc
+#if defined SHARED && !defined NOT_IN_libc
.text
ENTRY(strlen)
.type strlen, @gnu_indirect_function
cmpl $0, __cpu_features+KIND_OFFSET(%rip)
jne 1f
call __init_cpu_features
-1: leaq __strlen_sse2_pminub(%rip), %rax
- testl $bit_Prefer_PMINUB_for_stringop, __cpu_features+FEATURE_OFFSET+index_Prefer_PMINUB_for_stringop(%rip)
- jnz 2f
- leaq __strlen_sse2(%rip), %rax
- testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
- jz 2f
- leaq __strlen_sse42(%rip), %rax
- ret
-2: testl $bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
- jz 3f
- leaq __strlen_sse2_no_bsf(%rip), %rax
-3: ret
+1: leaq __strlen_avx(%rip), %rax
+ testl $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
+ jnz 3f
+ leaq __strlen_atom(%rip), %rax
+ testl $bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
+ jnz 3f
+ leaq __strlen_sse2(%rip), %rax
+ 3: ret
END(strlen)
-# undef ENTRY
-# define ENTRY(name) \
- .type __strlen_sse2, @function; \
- .align 16; \
- __strlen_sse2: cfi_startproc; \
- CALL_MCOUNT
-# undef END
-# define END(name) \
- cfi_endproc; .size __strlen_sse2, .-__strlen_sse2
-# undef libc_hidden_builtin_def
-/* It doesn't make sense to send libc-internal strlen calls through a PLT.
- The speedup we get from using SSE4.2 instruction is likely eaten away
- by the indirect call in the PLT. */
+
+# define strlen __strlen_sse2
+# undef libc_hidden_builtin_def
# define libc_hidden_builtin_def(name) \
.globl __GI_strlen; __GI_strlen = __strlen_sse2
#endif
-
#include "../strlen.S"
diff --git a/sysdeps/x86_64/multiarch/strlen_atom.S b/sysdeps/x86_64/multiarch/strlen_atom.S
new file mode 100644
index 0000000..d90a79e
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlen_atom.S
@@ -0,0 +1,398 @@
+/* strlen(str) -- determine the length of the string STR.
+ Copyright (C) 2009-2012 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+
+ .text
+ENTRY(__strlen_atom)
+ xor %rax, %rax
+ mov %edi, %ecx
+ pxor %xmm0, %xmm0
+ and $0x3f, %ecx
+ cmp $0x30, %ecx
+ ja .next
+ movdqu (%rdi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz .exit_less16
+ mov %rdi, %rax
+ and $-16, %rax
+ jmp .align16_start
+.next:
+ mov %rdi, %rax
+ and $-16, %rax
+ pcmpeqb (%rax), %xmm0
+ mov $-1, %r10d
+ sub %rax, %rcx
+ shl %cl, %r10d
+ pmovmskb %xmm0, %edx
+ and %r10d, %edx
+ jnz .exit
+.align16_start:
+ pxor %xmm0, %xmm0
+ pcmpeqb 16(%rax), %xmm0
+ pxor %xmm1, %xmm1
+ pmovmskb %xmm0, %edx
+ pxor %xmm2, %xmm2
+ pxor %xmm3, %xmm3
+ test %edx, %edx
+ jnz .exit16
+
+ pcmpeqb 32(%rax), %xmm1
+ pmovmskb %xmm1, %edx
+ test %edx, %edx
+ jnz .exit32
+
+ pcmpeqb 48(%rax), %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ jnz .exit48
+
+ pcmpeqb 64(%rax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ jnz .exit64
+
+ pcmpeqb 80(%rax), %xmm0
+ add $64, %rax
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz .exit16
+
+ pcmpeqb 32(%rax), %xmm1
+ pmovmskb %xmm1, %edx
+ test %edx, %edx
+ jnz .exit32
+
+ pcmpeqb 48(%rax), %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ jnz .exit48
+
+ pcmpeqb 64(%rax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ jnz .exit64
+
+ pcmpeqb 80(%rax), %xmm0
+ add $64, %rax
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz .exit16
+
+ pcmpeqb 32(%rax), %xmm1
+ pmovmskb %xmm1, %edx
+ test %edx, %edx
+ jnz .exit32
+
+ pcmpeqb 48(%rax), %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ jnz .exit48
+
+ pcmpeqb 64(%rax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ jnz .exit64
+
+ pcmpeqb 80(%rax), %xmm0
+ add $64, %rax
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz .exit16
+
+ pcmpeqb 32(%rax), %xmm1
+ pmovmskb %xmm1, %edx
+ test %edx, %edx
+ jnz .exit32
+
+ pcmpeqb 48(%rax), %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ jnz .exit48
+
+ pcmpeqb 64(%rax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ jnz .exit64
+
+
+ test $0x3f, %rax
+ jz .align64_loop
+
+ pcmpeqb 80(%rax), %xmm0
+ add $80, %rax
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz .exit
+
+ test $0x3f, %rax
+ jz .align64_loop
+
+ pcmpeqb 16(%rax), %xmm1
+ add $16, %rax
+ pmovmskb %xmm1, %edx
+ test %edx, %edx
+ jnz .exit
+
+ test $0x3f, %rax
+ jz .align64_loop
+
+ pcmpeqb 16(%rax), %xmm2
+ add $16, %rax
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ jnz .exit
+
+ test $0x3f, %rax
+ jz .align64_loop
+
+ pcmpeqb 16(%rax), %xmm3
+ add $16, %rax
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ jnz .exit
+
+ add $16, %rax
+ .p2align 4
+ .align64_loop:
+ movaps (%rax), %xmm4
+ pminub 16(%rax), %xmm4
+ movaps 32(%rax), %xmm5
+ pminub 48(%rax), %xmm5
+ add $64, %rax
+ pminub %xmm4, %xmm5
+ pcmpeqb %xmm0, %xmm5
+ pmovmskb %xmm5, %edx
+ test %edx, %edx
+ jz .align64_loop
+
+
+ pcmpeqb -64(%rax), %xmm0
+ sub $80, %rax
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz .exit16
+
+ pcmpeqb 32(%rax), %xmm1
+ pmovmskb %xmm1, %edx
+ test %edx, %edx
+ jnz .exit32
+
+ pcmpeqb 48(%rax), %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ jnz .exit48
+
+ pcmpeqb 64(%rax), %xmm3
+ pmovmskb %xmm3, %edx
+ sub %rdi, %rax
+ movq %rdx, %rcx
+ negq %rcx
+ andq %rdx, %rcx
+ leaq first_byte_idx(%rip), %rdx
+ movzbl (%rdx,%rcx), %edx
+ add %rdx, %rax
+ add $64, %rax
+ ret
+
+ .p2align 4
+.exit:
+ sub %rdi, %rax
+.exit_less16:
+ movq %rdx, %rcx
+ negq %rcx
+ andq %rdx, %rcx
+ leaq first_byte_idx(%rip), %rdx
+ movzbl (%rdx,%rcx), %edx
+ add %rdx, %rax
+ ret
+ .p2align 4
+.exit16:
+ sub %rdi, %rax
+ movq %rdx, %rcx
+ negq %rcx
+ andq %rdx, %rcx
+ leaq 5+first_byte_idx(%rip), %rdx
+ movzbl (%rdx,%rcx), %edx
+ add %rdx, %rax
+ ret
+ .p2align 4
+.exit32:
+ sub %rdi, %rax
+ movq %rdx, %rcx
+ negq %rcx
+ andq %rdx, %rcx
+ leaq 10+first_byte_idx(%rip), %rdx
+ movzbl (%rdx,%rcx), %edx
+ add %rdx, %rax
+ ret
+ .p2align 4
+.exit48:
+ sub %rdi, %rax
+ movq %rdx, %rcx
+ negq %rcx
+ andq %rdx, %rcx
+ leaq 23+first_byte_idx(%rip), %rdx
+ movzbl (%rdx,%rcx), %edx
+ add %rdx, %rax
+ ret
+ .p2align 4
+.exit64:
+ sub %rdi, %rax
+ movq %rdx, %rcx
+ negq %rcx
+ andq %rdx, %rcx
+ leaq first_byte_idx(%rip), %rdx
+ movzbl (%rdx,%rcx), %edx
+ add %rdx, %rax
+ add $64, %rax
+ ret
+
+END(__strlen_atom)
+libc_hidden_builtin_def (__strlen_atom)
+
+.section .rodata
+ .align 32
+ .type first_byte_idx, @object
+ .size first_byte_idx, 32792
+
+first_byte_idx:
+ .zero 1
+ .byte 0
+ .byte 1
+ .zero 1
+ .byte 2
+ .zero 1
+ .byte 16
+ .byte 17
+ .byte 3
+ .byte 18
+ .zero 1
+ .byte 32
+ .byte 33
+ .byte 19
+ .byte 34
+ .zero 1
+ .byte 4
+ .zero 1
+ .byte 35
+ .zero 2
+ .byte 20
+ .zero 2
+ .byte 48
+ .byte 49
+ .byte 36
+ .byte 50
+ .zero 3
+ .byte 51
+ .byte 5
+ .zero 4
+ .byte 21
+ .zero 1
+ .byte 52
+ .zero 2
+ .byte 37
+ .zero 12
+ .byte 53
+ .zero 8
+ .byte 6
+ .zero 4
+ .byte 22
+ .zero 4
+ .byte 38
+ .zero 12
+ .byte 54
+ .zero 40
+ .byte 7
+ .zero 4
+ .byte 23
+ .zero 4
+ .byte 39
+ .zero 12
+ .byte 55
+ .zero 104
+ .byte 8
+ .zero 4
+ .byte 24
+ .zero 4
+ .byte 40
+ .zero 12
+ .byte 56
+ .zero 232
+ .byte 9
+ .zero 4
+ .byte 25
+ .zero 4
+ .byte 41
+ .zero 12
+ .byte 57
+ .zero 488
+ .byte 10
+ .zero 4
+ .byte 26
+ .zero 4
+ .byte 42
+ .zero 12
+ .byte 58
+ .zero 1000
+ .byte 11
+ .zero 4
+ .byte 27
+ .zero 4
+ .byte 43
+ .zero 12
+ .byte 59
+ .zero 2024
+ .byte 12
+ .zero 4
+ .byte 28
+ .zero 4
+ .byte 44
+ .zero 12
+ .byte 60
+ .zero 4072
+ .byte 13
+ .zero 4
+ .byte 29
+ .zero 4
+ .byte 45
+ .zero 12
+ .byte 61
+ .zero 8168
+ .byte 14
+ .zero 4
+ .byte 30
+ .zero 4
+ .byte 46
+ .zero 12
+ .byte 62
+ .zero 16360
+ .byte 15
+ .zero 4
+ .byte 31
+ .zero 4
+ .byte 47
+ .zero 12
+ .byte 63
+
diff --git a/sysdeps/x86_64/multiarch/strlen_avx.S b/sysdeps/x86_64/multiarch/strlen_avx.S
new file mode 100644
index 0000000..d76f16d
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlen_avx.S
@@ -0,0 +1,4 @@
+/*recode strlen with VEX prefix*/
+#define strlen __strlen_avx
+#include "../strlen.S"
+
diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
index f83d857..ce0b150 100644
--- a/sysdeps/x86_64/strlen.S
+++ b/sysdeps/x86_64/strlen.S
@@ -1,6 +1,5 @@
/* strlen(str) -- determine the length of the string STR.
- Copyright (C) 2009, 2010 Free Software Foundation, Inc.
- Contributed by Ulrich Drepper <drepper@redhat.com>.
+ Copyright (C) 2012 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -22,80 +21,87 @@
.text
ENTRY(strlen)
- xor %rax, %rax
- mov %edi, %ecx
- and $0x3f, %ecx
- pxor %xmm0, %xmm0
- cmp $0x30, %ecx
- ja L(next)
- movdqu (%rdi), %xmm1
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz L(exit_less16)
- mov %rdi, %rax
- and $-16, %rax
- jmp L(align16_start)
-L(next):
- mov %rdi, %rax
- and $-16, %rax
- pcmpeqb (%rax), %xmm0
- mov $-1, %esi
- sub %rax, %rcx
- shl %cl, %esi
- pmovmskb %xmm0, %edx
- and %esi, %edx
- jnz L(exit)
-L(align16_start):
- pxor %xmm0, %xmm0
- pxor %xmm1, %xmm1
- pxor %xmm2, %xmm2
- pxor %xmm3, %xmm3
- .p2align 4
-L(align16_loop):
- pcmpeqb 16(%rax), %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz L(exit16)
-
- pcmpeqb 32(%rax), %xmm1
- pmovmskb %xmm1, %edx
- test %edx, %edx
- jnz L(exit32)
-
- pcmpeqb 48(%rax), %xmm2
- pmovmskb %xmm2, %edx
- test %edx, %edx
- jnz L(exit48)
-
- pcmpeqb 64(%rax), %xmm3
- pmovmskb %xmm3, %edx
- lea 64(%rax), %rax
- test %edx, %edx
- jz L(align16_loop)
-L(exit):
- sub %rdi, %rax
-L(exit_less16):
- bsf %rdx, %rdx
- add %rdx, %rax
- ret
- .p2align 4
-L(exit16):
- sub %rdi, %rax
- bsf %rdx, %rdx
- lea 16(%rdx,%rax), %rax
+/*
+ We replace bsf by table lookup like in the following code:
+
+unsigned char first_byte_idx[32791 +1]={
+[1]=0,[6]=16,[11]=32,[24]=48,[2]=1,[7]=17,[12]=33,[25]=49,[4]=2,[9]=18,[14]=34,[27]=50,[8]=3,[13]=19,[18]=35,[31]=51,[16]=4,[21]=20,[26]=36,[39]=52,[32]=5,[37]=21,[42]=37,[55]=53,[64]=6,[69]=22,[74]=38,[87]=54,[128]=7,[133]=23,[138]=39,[151]=55,[256]=8,[261]=24,[266]=40,[279]=56,[512]=9,[517]=25,[522]=41,[535]=57,[1024]=10,[1029]=26,[1034]=42,[1047]=58,[2048]=11,[2053]=27,[2058]=43,[2071]=59,[4096]=12,[4101]=28,[4106]=44,[4119]=60,[8192]=13,[8197]=29,[8202]=45,[8215]=61,[16384]=14,[16389]=30,[16394]=46,[16407]=62,[32768]=15,[32773]=31,[32778]=47,[32791]=63
+};
+int first_bit(uint64_t x)
+{
+ x=x&(-x);
+ if( (uint32_t) x){
+ if ((uint16_t) x)
+ return (first_byte_idx+0)[(uint16_t) x];
+ else
+ return (first_byte_idx+5)[(uint16_t) ( ((uint32_t) x)>>16)];
+ } else {
+ x=x>>32;
+ if ((uint16_t) x)
+ return (first_byte_idx+10)[(uint16_t) x];
+ else
+ return (first_byte_idx+23)[(uint16_t) ( ((uint32_t) x)>>16)];
+ }
+ }
+ */
+ movq %rdi, %rax
+ pxor %xmm5, %xmm5
+ andq $-64, %rax
+ pxor %xmm3,%xmm3
+ pxor %xmm2,%xmm2
+ pcmpeqb (%rax), %xmm3
+ pxor %xmm1,%xmm1
+ pcmpeqb 16(%rax), %xmm2
+ pmovmskb %xmm3, %r8d
+ pxor %xmm0,%xmm0
+ pcmpeqb 32(%rax), %xmm1
+ pmovmskb %xmm2, %edx
+ pcmpeqb 48(%rax), %xmm0
+ salq $16, %rdx
+ pmovmskb %xmm1, %esi
+ orq %r8, %rdx
+ pmovmskb %xmm0, %ecx
+ salq $16, %rcx
+ orq %rsi, %rcx
+ movq $-1, %rsi
+ salq $32, %rcx
+ orq %rcx, %rdx
+ movl %edi, %ecx
+ andl $63, %ecx
+ salq %cl, %rsi
+ andq %rsi, %rdx
+ je .L16
+ bsfq %rdx, %rdx
+ addq %rdx, %rax
+ subq %rdi, %rax
ret
- .p2align 4
-L(exit32):
- sub %rdi, %rax
- bsf %rdx, %rdx
- lea 32(%rdx,%rax), %rax
- ret
- .p2align 4
-L(exit48):
- sub %rdi, %rax
- bsf %rdx, %rdx
- lea 48(%rdx,%rax), %rax
+
+ .p2align 4,,10
+ .p2align 3
+.L19:
+ addq $64, %rax
+.L17:
+ pxor %xmm3,%xmm3
+ pcmpeqb (%rax),%xmm5
+ pxor %xmm2,%xmm2
+ pcmpeqb 16(%rax),%xmm3
+ pxor %xmm1,%xmm1
+ pmovmskb %xmm5, %r8d
+ pcmpeqb 32(%rax),%xmm2
+ pmovmskb %xmm3, %ecx
+ pcmpeqb 48(%rax),%xmm1
+ salq $16, %rcx
+ pmovmskb %xmm1, %edx
+ orq %r8, %rcx
+ salq $16, %rdx
+ pmovmskb %xmm2, %esi
+ orq %rsi, %rdx
+ salq $32, %rdx
+ orq %rcx, %rdx
+ bsfq %rdx, %rdx
+ addq %rdx, %rax
+ subq %rdi, %rax
ret
+
+ .p2align 4,,10
+ .p2align 3
+.L16:
+ #prefetcht0 576(%rax)
+ movdqa 64(%rax), %xmm0
+ pminub 80(%rax), %xmm0
+ pminub 96(%rax), %xmm0
+ pminub 112(%rax), %xmm0
+ pcmpeqb %xmm5, %xmm0
+ pmovmskb %xmm0, %edx
+ testl %edx, %edx
+ jne .L19
+ subq $-128, %rax
+ #prefetcht0 512(%rax)
+ movdqa (%rax), %xmm0
+ pminub 16(%rax), %xmm0
+ pminub 32(%rax), %xmm0
+ pminub 48(%rax), %xmm0
+ pcmpeqb %xmm5, %xmm0
+ pmovmskb %xmm0, %edx
+ testl %edx, %edx
+ je .L16
+ jmp .L17
END(strlen)
libc_hidden_builtin_def (strlen)
--
1.7.4.4