This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
Re: [PATCH] Faster strchr implementation.
- From: Liubov Dmitrieva <liubov dot dmitrieva at gmail dot com>
- To: Ondřej Bílka <neleai at seznam dot cz>
- Cc: GNU C Library <libc-alpha at sourceware dot org>
- Date: Thu, 8 Aug 2013 22:22:36 +0400
- Subject: Re: [PATCH] Faster strchr implementation.
- References: <20130807140911 dot GA31968 at domone dot kolej dot mff dot cuni dot cz>
Hello,
I will run your tests for atom, silvermont, haswell tomorrow morning.
I should check strcmp, strchr, strrchr you wrote about. Right? Or
something else?
--
Liubov
On Wed, Aug 7, 2013 at 6:09 PM, Ondřej Bílka <neleai@seznam.cz> wrote:
>
> As far as functions using sse4.2 are concerned, the next one is strchr. I wrote a
> better implementation about a year ago; now I am resubmitting it.
>
> It uses a similar header as in strlen which improves performance in gcc
> workload in most of cases.
>
> Asymptotically, on all tested architectures, strchr_new wins, often by
> more than 33% over the next best implementation.
>
> On ivy bridge and bulldozer architectures my implementation is 50%
> faster than the sse4_2 one for large inputs. Nehalem is an exception, as we
> gain only about 3% there.
>
> I also included avx2 version to benchmark but do not have haswell to get
> data.
> I also do not have data for atom, so I left strchr_sse2_no_bsf as it is.
>
> Benchmarks are available here and data for haswell/atom are welcome:
>
> http://kam.mff.cuni.cz/~ondra/benchmark_string/strchr_profile.html
>
> http://kam.mff.cuni.cz/~ondra/strchr_profile070813.tar.bz2
>
> No regressions on x64. OK to commit?
>
> * sysdeps/x86_64/multiarch/ifunc-impl-list.c
> (__libc_ifunc_impl_list): Remove: __strchr_sse42.
> * sysdeps/x86_64/multiarch/strchr.S (__strchr_sse42): Remove.
> (strchr): Remove __strchr_sse42 ifunc selection.
> * sysdeps/x86_64/strchr.S (strchr): Use optimized implementation.
> * sysdeps/x86_64/strchrnul.S: Include sysdeps/x86_64/strchr.S.
>
> ---
> sysdeps/x86_64/multiarch/ifunc-impl-list.c | 1 -
> sysdeps/x86_64/multiarch/strchr.S | 126 ---------------------
> sysdeps/x86_64/strchr.S | 176 +++++++++++++++++++++++------
> sysdeps/x86_64/strchrnul.S | 44 +-------
> 4 files changed, 142 insertions(+), 205 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index 28d3579..8486294 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -116,7 +116,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
> /* Support sysdeps/x86_64/multiarch/strchr.S. */
> IFUNC_IMPL (i, name, strchr,
> - IFUNC_IMPL_ADD (array, i, strchr, HAS_SSE4_2, __strchr_sse42)
> IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2_no_bsf)
> IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2))
>
> diff --git a/sysdeps/x86_64/multiarch/strchr.S b/sysdeps/x86_64/multiarch/strchr.S
> index f170238..9d2db7c 100644
> --- a/sysdeps/x86_64/multiarch/strchr.S
> +++ b/sysdeps/x86_64/multiarch/strchr.S
> @@ -29,12 +29,6 @@ ENTRY(strchr)
> jne 1f
> call __init_cpu_features
> 1: leaq __strchr_sse2(%rip), %rax
> - testl $bit_Slow_SSE4_2, __cpu_features+CPUID_OFFSET+index_Slow_SSE4_2(%rip)
> - jnz 2f
> - testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
> - jz 2f
> - leaq __strchr_sse42(%rip), %rax
> - ret
> 2: testl $bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
> jz 3f
> leaq __strchr_sse2_no_bsf(%rip), %rax
> @@ -42,126 +36,6 @@ ENTRY(strchr)
> END(strchr)
>
>
> -/*
> - This implementation uses SSE4 instructions to compare up to 16 bytes
> - at a time looking for the first occurrence of the character c in the
> - string s:
> -
> - char *strchr (const char *s, int c);
> -
> - We use 0xa:
> - _SIDD_SBYTE_OPS
> - | _SIDD_CMP_EQUAL_EACH
> - | _SIDD_LEAST_SIGNIFICANT
> - on pcmpistri to compare xmm/mem128
> -
> - 0 1 2 3 4 5 6 7 8 9 A B C D E F
> - X X X X X X X X X X X X X X X X
> -
> - against xmm
> -
> - 0 1 2 3 4 5 6 7 8 9 A B C D E F
> - C C C C C C C C C C C C C C C C
> -
> - to find out if the first 16byte data element has a byte C and the
> - offset of the first byte. There are 3 cases:
> -
> - 1. The first 16byte data element has the byte C at the offset X.
> - 2. The first 16byte data element has EOS and doesn't have the byte C.
> - 3. The first 16byte data element is valid and doesn't have the byte C.
> -
> - Here is the table of ECX, CFlag, ZFlag and SFlag for 3 cases:
> -
> - case ECX CFlag ZFlag SFlag
> - 1 X 1 0/1 0
> - 2 16 0 1 0
> - 3 16 0 0 0
> -
> - We exit from the loop for cases 1 and 2 with jbe which branches
> - when either CFlag or ZFlag is 1. If CFlag == 1, ECX has the offset
> - X for case 1. */
> -
> - .section .text.sse4.2,"ax",@progbits
> - .align 16
> - .type __strchr_sse42, @function
> - .globl __strchr_sse42
> - .hidden __strchr_sse42
> -__strchr_sse42:
> - cfi_startproc
> - CALL_MCOUNT
> - testb %sil, %sil
> - je __strend_sse4
> - pxor %xmm2, %xmm2
> - movd %esi, %xmm1
> - movl %edi, %ecx
> - pshufb %xmm2, %xmm1
> - andl $15, %ecx
> - movq %rdi, %r8
> - je L(aligned_start)
> -
> -/* Handle unaligned string. */
> - andq $-16, %r8
> - movdqa (%r8), %xmm0
> - pcmpeqb %xmm0, %xmm2
> - pcmpeqb %xmm1, %xmm0
> - /* Find where NULL is. */
> - pmovmskb %xmm2, %edx
> - /* Check if there is a match. */
> - pmovmskb %xmm0, %esi
> - /* Remove the leading bytes. */
> - sarl %cl, %edx
> - sarl %cl, %esi
> - testl %esi, %esi
> - je L(unaligned_no_match)
> - /* Check which byte is a match. */
> - bsfl %esi, %eax
> - /* Is there a NULL? */
> - testl %edx, %edx
> - je L(unaligned_match)
> - bsfl %edx, %esi
> - cmpl %esi, %eax
> - /* Return NULL if NULL comes first. */
> - ja L(return_null)
> -L(unaligned_match):
> - addq %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(unaligned_no_match):
> - testl %edx, %edx
> - jne L(return_null)
> -
> -/* Loop start on aligned string. */
> -L(loop):
> - addq $16, %r8
> -L(aligned_start):
> - pcmpistri $0x2, (%r8), %xmm1
> - jbe L(wrap)
> - addq $16, %r8
> - pcmpistri $0x2, (%r8), %xmm1
> - jbe L(wrap)
> - addq $16, %r8
> - pcmpistri $0x2, (%r8), %xmm1
> - jbe L(wrap)
> - addq $16, %r8
> - pcmpistri $0x2, (%r8), %xmm1
> - jbe L(wrap)
> - jmp L(loop)
> -L(wrap):
> - jc L(loop_exit)
> -
> -/* Return NULL. */
> -L(return_null):
> - xorl %eax, %eax
> - ret
> -
> -/* Loop exit. */
> - .p2align 4
> -L(loop_exit):
> - leaq (%r8,%rcx), %rax
> - ret
> - cfi_endproc
> - .size __strchr_sse42, .-__strchr_sse42
>
>
> # undef ENTRY
> diff --git a/sysdeps/x86_64/strchr.S b/sysdeps/x86_64/strchr.S
> index d89f1eb..a6bcfdc 100644
> --- a/sysdeps/x86_64/strchr.S
> +++ b/sysdeps/x86_64/strchr.S
> @@ -19,51 +19,153 @@
>
> #include <sysdep.h>
>
> -
> +#ifndef ALIGN
> +#define ALIGN(x) .p2align x
> +#endif
> .text
> ENTRY (strchr)
> +
> movd %esi, %xmm1
> - movq %rdi, %rcx
> - punpcklbw %xmm1, %xmm1
> - andq $~15, %rdi
> - pxor %xmm2, %xmm2
> - punpcklbw %xmm1, %xmm1
> - orl $0xffffffff, %esi
> - movdqa (%rdi), %xmm0
> + movl %edi, %eax
> + andl $4095, %eax
> + punpcklbw %xmm1, %xmm1
> + cmpl $4032, %eax
> + punpcklwd %xmm1, %xmm1
> pshufd $0, %xmm1, %xmm1
> - subq %rdi, %rcx
> - movdqa %xmm0, %xmm3
> - leaq 16(%rdi), %rdi
> + jg L(cross_cache)
> +L(find_result):
> + movdqu (%rdi), %xmm0
> + pxor %xmm3, %xmm3
> + movdqa %xmm1, %xmm2
> + movdqa %xmm0, %xmm4
> pcmpeqb %xmm1, %xmm0
> - pcmpeqb %xmm2, %xmm3
> - shl %cl, %esi
> - pmovmskb %xmm0, %edx
> - pmovmskb %xmm3, %ecx
> - andl %esi, %edx
> - andl %esi, %ecx
> - orl %edx, %ecx
> - jnz 1f
> -
> -2: movdqa (%rdi), %xmm0
> - leaq 16(%rdi), %rdi
> - movdqa %xmm0, %xmm3
> + pcmpeqb %xmm3, %xmm4
> + por %xmm4, %xmm0
> + pmovmskb %xmm0, %edx
> + bsfq %rdx, %rax
> + testq %rdx, %rdx
> + je L(next48_bytes)
> +L(return):
> + movl $0, %edx
> + leaq (%rdi,%rax), %rax
> +#ifndef AS_STRCHRNUL
> + cmpb %sil, (%rax)
> + cmovne %rdx, %rax
> +#endif
> + ret
> + ALIGN(4)
> +L(cross_cache):
> + movq %rdi, %rdx
> + pxor %xmm2, %xmm2
> + andq $-64, %rdx
> + movdqa %xmm1, %xmm0
> + movdqu (%rdx), %xmm3
> + movdqa %xmm3, %xmm4
> + pcmpeqb %xmm1, %xmm3
> + pcmpeqb %xmm2, %xmm4
> + por %xmm4, %xmm3
> + pmovmskb %xmm3, %r8d
> + movdqu 16(%rdx), %xmm3
> + movdqa %xmm3, %xmm4
> + movslq %r8d, %r8
> + pcmpeqb %xmm1, %xmm3
> + pcmpeqb %xmm2, %xmm4
> + por %xmm4, %xmm3
> + pmovmskb %xmm3, %eax
> + movdqu 32(%rdx), %xmm3
> + movdqa %xmm3, %xmm4
> + cltq
> + pcmpeqb %xmm1, %xmm3
> + salq $16, %rax
> + pcmpeqb %xmm2, %xmm4
> + por %xmm4, %xmm3
> + pmovmskb %xmm3, %r9d
> + movdqu 48(%rdx), %xmm3
> + pcmpeqb %xmm3, %xmm2
> + salq $32, %r9
> + pcmpeqb %xmm3, %xmm0
> + orq %r9, %rax
> + orq %r8, %rax
> + por %xmm2, %xmm0
> + pmovmskb %xmm0, %ecx
> + salq $48, %rcx
> + orq %rcx, %rax
> + movl %edi, %ecx
> + subb %dl, %cl
> + shrq %cl, %rax
> + testq %rax, %rax
> + jne L(return2)
> +L(align64):
> + addq $64, %rdi
> + pxor %xmm6, %xmm6
> + andq $-64, %rdi
> + jmp L(loop_start)
> + ALIGN(4)
> +L(loop):
> + addq $64, %rdi
> +L(loop_start):
> + movdqa (%rdi), %xmm5
> + movdqa 16(%rdi), %xmm2
> + movdqa 32(%rdi), %xmm3
> + pxor %xmm1, %xmm5
> + movdqa 48(%rdi), %xmm4
> + pxor %xmm1, %xmm2
> + pxor %xmm1, %xmm3
> + pminub (%rdi), %xmm5
> + pxor %xmm1, %xmm4
> + pminub 16(%rdi), %xmm2
> + pminub 32(%rdi), %xmm3
> + pminub %xmm2, %xmm5
> + pminub 48(%rdi), %xmm4
> + pminub %xmm3, %xmm5
> + pminub %xmm4, %xmm5
> + pcmpeqb %xmm6, %xmm5
> + pmovmskb %xmm5, %eax
> + testl %eax, %eax
> + je L(loop)
> + jmp L(find_result)
> + ALIGN(4)
> +L(return2):
> + bsfq %rax, %rax
> + jmp L(return)
> + ALIGN(4)
> +L(next48_bytes):
> + movdqu 16(%rdi), %xmm0
> + movdqa %xmm0, %xmm4
> pcmpeqb %xmm1, %xmm0
> - pcmpeqb %xmm2, %xmm3
> - pmovmskb %xmm0, %edx
> - pmovmskb %xmm3, %ecx
> - orl %edx, %ecx
> - jz 2b
> -
> -1: bsfl %edx, %edx
> - jz 4f
> - bsfl %ecx, %ecx
> - leaq -16(%rdi,%rdx), %rax
> - cmpl %edx, %ecx
> - je 5f
> -4: xorl %eax, %eax
> -5: ret
> + pcmpeqb %xmm3, %xmm4
> + por %xmm4, %xmm0
> + pmovmskb %xmm0, %ecx
> + movdqu 32(%rdi), %xmm0
> + movdqa %xmm0, %xmm4
> + pcmpeqb %xmm1, %xmm0
> + salq $16, %rcx
> + pcmpeqb %xmm3, %xmm4
> + por %xmm4, %xmm0
> + pmovmskb %xmm0, %eax
> + movdqu 48(%rdi), %xmm0
> + pcmpeqb %xmm0, %xmm3
> + salq $32, %rax
> + pcmpeqb %xmm0, %xmm2
> + orq %rcx, %rax
> + por %xmm3, %xmm2
> + pmovmskb %xmm2, %r8d
> + movq %r8, %rcx
> + salq $48, %rcx
> + orq %rcx, %rax
> + je L(align64)
> + bsfq %rax, %rax
> + leaq (%rdi,%rax), %rax
> +#ifndef AS_STRCHRNUL
> + cmpb %sil, (%rax)
> + cmovne %rdx, %rax
> +#endif
> + ret
> END (strchr)
>
> +#ifndef AS_STRCHRNUL
> weak_alias (strchr, index)
> +#endif
> +
> libc_hidden_builtin_def (strchr)
>
> diff --git a/sysdeps/x86_64/strchrnul.S b/sysdeps/x86_64/strchrnul.S
> index d8c345b..9aa9e32 100644
> --- a/sysdeps/x86_64/strchrnul.S
> +++ b/sysdeps/x86_64/strchrnul.S
> @@ -17,46 +17,8 @@
> You should have received a copy of the GNU Lesser General Public
> License along with the GNU C Library; if not, see
> <http://www.gnu.org/licenses/>. */
> -
> -#include <sysdep.h>
> -
> -
> - .text
> -ENTRY (__strchrnul)
> - movd %esi, %xmm1
> - movq %rdi, %rcx
> - punpcklbw %xmm1, %xmm1
> - andq $~15, %rdi
> - pxor %xmm2, %xmm2
> - punpcklbw %xmm1, %xmm1
> - orl $0xffffffff, %esi
> - movdqa (%rdi), %xmm0
> - pshufd $0, %xmm1, %xmm1
> - subq %rdi, %rcx
> - movdqa %xmm0, %xmm3
> - leaq 16(%rdi), %rdi
> - pcmpeqb %xmm1, %xmm0
> - pcmpeqb %xmm2, %xmm3
> - shl %cl, %esi
> - pmovmskb %xmm0, %edx
> - pmovmskb %xmm3, %ecx
> - orl %edx, %ecx
> - andl %esi, %ecx
> - jnz 1f
> -
> -2: movdqa (%rdi), %xmm0
> - leaq 16(%rdi), %rdi
> - movdqa %xmm0, %xmm3
> - pcmpeqb %xmm1, %xmm0
> - pcmpeqb %xmm2, %xmm3
> - pmovmskb %xmm0, %edx
> - pmovmskb %xmm3, %ecx
> - orl %edx, %ecx
> - jz 2b
> -
> -1: bsfl %ecx, %edx
> - leaq -16(%rdi,%rdx), %rax
> - ret
> -END (__strchrnul)
> +#define strchr __strchrnul
> +#define AS_STRCHRNUL
> +#include "strchr.S"
>
> weak_alias (__strchrnul, strchrnul)
> --
> 1.8.3.2
>