This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
Re: [PATCH v2] Faster strchr implementation.
- From: Ondřej Bílka <neleai at seznam dot cz>
- To: Liubov Dmitrieva <liubov dot dmitrieva at gmail dot com>
- Cc: GNU C Library <libc-alpha at sourceware dot org>
- Date: Tue, 10 Sep 2013 19:22:40 +0200
- Subject: Re: [PATCH v2] Faster strchr implementation.
- Authentication-results: sourceware.org; auth=none
- References: <20130807140911 dot GA31968 at domone dot kolej dot mff dot cuni dot cz> <CAHjhQ926EE-MYDJR5Eftf+DUefBg-Gox0pw57vZ7XUwsO3OPJg at mail dot gmail dot com> <20130816095908 dot GA15776 at domone dot kolej dot mff dot cuni dot cz> <20130816121533 dot GB26552 at domone dot kolej dot mff dot cuni dot cz> <20130902210152 dot GA19372 at domone dot kolej dot mff dot cuni dot cz>
ping
On Mon, Sep 02, 2013 at 11:01:52PM +0200, Ondřej Bílka wrote:
> ping,
>
> As this is similar to rawmemchr implementation these patches should be read
> together
> On Fri, Aug 16, 2013 at 02:15:33PM +0200, Ondřej Bílka wrote:
> >
> > A correct version is here.
> >
> > > Hello,
> > >
> > > I will run your tests for atom, silvermont, haswell tomorrow morning.
> > > I should check strcmp, strchr, strrchr you wrote about. Right? Or
> > > something else?
> > >
> > A strchr will need to rerun tests.
> >
> > Hi, I tuned my implementation to decrease loop overhead. It decreases
> > loop overhead by significant constant factor over previous
> > implementation.
> >
> > There are architectures that I do not cover,
> > haswell - an avx2 implementation that I posted is better and it is
> > better posted separately.
> >
> > atom - A loop caused big overhead for sizes around 64 bytes and we need
> > work on more effective header, we keep no-bsf implementation for now.
> >
> > silvermont - similar issues as atom but we need separate IFUNC casing to
> > choose no-bsf variant
> >
> > athlon, athlon x2 - Same situation and we also need a flag to choose other variant.
> >
> > I updated results of my profiler.
> > In my random test strchr would always find terminating zero. This does
> > not happen in practice so now strchr will find character with 50%
> > probability.
> >
> > http://kam.mff.cuni.cz/~ondra/benchmark_string/strchr_profile.html
> > http://kam.mff.cuni.cz/~ondra/benchmark_string/strchr_profile160813.tar.bz2
> >
> >
> > OK to commit this iteration?
> >
> > * sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > (__libc_ifunc_impl_list): Remove: __strchr_sse42.
> > * sysdeps/x86_64/multiarch/strchr.S (__strchr_sse42): Remove.
> > (strchr): Remove __strchr_sse42 ifunc selection.
> > * sysdeps/x86_64/strchr.S (strchr): Use optimized implementation.
> > * sysdeps/x86_64/strchrnul.S: Include sysdeps/x86_64/strchr.S.
> >
> >
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > index 28d3579..8486294 100644
> > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > @@ -116,7 +116,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >
> > /* Support sysdeps/x86_64/multiarch/strchr.S. */
> > IFUNC_IMPL (i, name, strchr,
> > - IFUNC_IMPL_ADD (array, i, strchr, HAS_SSE4_2, __strchr_sse42)
> > IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2_no_bsf)
> > IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2))
> >
> > diff --git a/sysdeps/x86_64/multiarch/strchr.S b/sysdeps/x86_64/multiarch/strchr.S
> > index f170238..3f0b2c5 100644
> > --- a/sysdeps/x86_64/multiarch/strchr.S
> > +++ b/sysdeps/x86_64/multiarch/strchr.S
> > @@ -29,12 +29,6 @@ ENTRY(strchr)
> > jne 1f
> > call __init_cpu_features
> > 1: leaq __strchr_sse2(%rip), %rax
> > - testl $bit_Slow_SSE4_2, __cpu_features+CPUID_OFFSET+index_Slow_SSE4_2(%rip)
> > - jnz 2f
> > - testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
> > - jz 2f
> > - leaq __strchr_sse42(%rip), %rax
> > - ret
> > 2: testl $bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
> > jz 3f
> > leaq __strchr_sse2_no_bsf(%rip), %rax
> > @@ -42,127 +36,6 @@ ENTRY(strchr)
> > END(strchr)
> >
> >
> > -/*
> > - This implementation uses SSE4 instructions to compare up to 16 bytes
> > - at a time looking for the first occurrence of the character c in the
> > - string s:
> > -
> > - char *strchr (const char *s, int c);
> > -
> > - We use 0xa:
> > - _SIDD_SBYTE_OPS
> > - | _SIDD_CMP_EQUAL_EACH
> > - | _SIDD_LEAST_SIGNIFICANT
> > - on pcmpistri to compare xmm/mem128
> > -
> > - 0 1 2 3 4 5 6 7 8 9 A B C D E F
> > - X X X X X X X X X X X X X X X X
> > -
> > - against xmm
> > -
> > - 0 1 2 3 4 5 6 7 8 9 A B C D E F
> > - C C C C C C C C C C C C C C C C
> > -
> > - to find out if the first 16byte data element has a byte C and the
> > - offset of the first byte. There are 3 cases:
> > -
> > - 1. The first 16byte data element has the byte C at the offset X.
> > - 2. The first 16byte data element has EOS and doesn't have the byte C.
> > - 3. The first 16byte data element is valid and doesn't have the byte C.
> > -
> > - Here is the table of ECX, CFlag, ZFlag and SFlag for 3 cases:
> > -
> > - case ECX CFlag ZFlag SFlag
> > - 1 X 1 0/1 0
> > - 2 16 0 1 0
> > - 3 16 0 0 0
> > -
> > - We exit from the loop for cases 1 and 2 with jbe which branches
> > - when either CFlag or ZFlag is 1. If CFlag == 1, ECX has the offset
> > - X for case 1. */
> > -
> > - .section .text.sse4.2,"ax",@progbits
> > - .align 16
> > - .type __strchr_sse42, @function
> > - .globl __strchr_sse42
> > - .hidden __strchr_sse42
> > -__strchr_sse42:
> > - cfi_startproc
> > - CALL_MCOUNT
> > - testb %sil, %sil
> > - je __strend_sse4
> > - pxor %xmm2, %xmm2
> > - movd %esi, %xmm1
> > - movl %edi, %ecx
> > - pshufb %xmm2, %xmm1
> > - andl $15, %ecx
> > - movq %rdi, %r8
> > - je L(aligned_start)
> > -
> > -/* Handle unaligned string. */
> > - andq $-16, %r8
> > - movdqa (%r8), %xmm0
> > - pcmpeqb %xmm0, %xmm2
> > - pcmpeqb %xmm1, %xmm0
> > - /* Find where NULL is. */
> > - pmovmskb %xmm2, %edx
> > - /* Check if there is a match. */
> > - pmovmskb %xmm0, %esi
> > - /* Remove the leading bytes. */
> > - sarl %cl, %edx
> > - sarl %cl, %esi
> > - testl %esi, %esi
> > - je L(unaligned_no_match)
> > - /* Check which byte is a match. */
> > - bsfl %esi, %eax
> > - /* Is there a NULL? */
> > - testl %edx, %edx
> > - je L(unaligned_match)
> > - bsfl %edx, %esi
> > - cmpl %esi, %eax
> > - /* Return NULL if NULL comes first. */
> > - ja L(return_null)
> > -L(unaligned_match):
> > - addq %rdi, %rax
> > - ret
> > -
> > - .p2align 4
> > -L(unaligned_no_match):
> > - testl %edx, %edx
> > - jne L(return_null)
> > -
> > -/* Loop start on aligned string. */
> > -L(loop):
> > - addq $16, %r8
> > -L(aligned_start):
> > - pcmpistri $0x2, (%r8), %xmm1
> > - jbe L(wrap)
> > - addq $16, %r8
> > - pcmpistri $0x2, (%r8), %xmm1
> > - jbe L(wrap)
> > - addq $16, %r8
> > - pcmpistri $0x2, (%r8), %xmm1
> > - jbe L(wrap)
> > - addq $16, %r8
> > - pcmpistri $0x2, (%r8), %xmm1
> > - jbe L(wrap)
> > - jmp L(loop)
> > -L(wrap):
> > - jc L(loop_exit)
> > -
> > -/* Return NULL. */
> > -L(return_null):
> > - xorl %eax, %eax
> > - ret
> > -
> > -/* Loop exit. */
> > - .p2align 4
> > -L(loop_exit):
> > - leaq (%r8,%rcx), %rax
> > - ret
> > - cfi_endproc
> > - .size __strchr_sse42, .-__strchr_sse42
> > -
> >
> > # undef ENTRY
> > # define ENTRY(name) \
> > diff --git a/sysdeps/x86_64/strchr.S b/sysdeps/x86_64/strchr.S
> > index d89f1eb..e6b705c 100644
> > --- a/sysdeps/x86_64/strchr.S
> > +++ b/sysdeps/x86_64/strchr.S
> > @@ -19,51 +19,174 @@
> >
> > #include <sysdep.h>
> >
> > +# ifndef ALIGN
> > +# define ALIGN(n) .p2align n
> > +# endif
> > +
> >
> > .text
> > ENTRY (strchr)
> > movd %esi, %xmm1
> > - movq %rdi, %rcx
> > - punpcklbw %xmm1, %xmm1
> > - andq $~15, %rdi
> > - pxor %xmm2, %xmm2
> > - punpcklbw %xmm1, %xmm1
> > - orl $0xffffffff, %esi
> > - movdqa (%rdi), %xmm0
> > + movl %edi, %eax
> > + andl $4095, %eax
> > + punpcklbw %xmm1, %xmm1
> > + cmpl $4031, %eax
> > + punpcklwd %xmm1, %xmm1
> > pshufd $0, %xmm1, %xmm1
> > - subq %rdi, %rcx
> > - movdqa %xmm0, %xmm3
> > - leaq 16(%rdi), %rdi
> > + jg L(cross_page)
> > + movdqu (%rdi), %xmm0
> > + pxor %xmm3, %xmm3
> > + movdqa %xmm0, %xmm4
> > pcmpeqb %xmm1, %xmm0
> > - pcmpeqb %xmm2, %xmm3
> > - shl %cl, %esi
> > - pmovmskb %xmm0, %edx
> > - pmovmskb %xmm3, %ecx
> > - andl %esi, %edx
> > - andl %esi, %ecx
> > - orl %edx, %ecx
> > - jnz 1f
> > + pcmpeqb %xmm3, %xmm4
> > + por %xmm4, %xmm0
> > + pmovmskb %xmm0, %eax
> > + test %eax, %eax
> > + je L(next_48_bytes)
> > + bsf %eax, %eax
> > +#ifdef AS_STRCHRNUL
> > + leaq (%rdi,%rax), %rax
> > +#else
> > + movl $0, %edx
> > + leaq (%rdi,%rax), %rax
> > + cmpb %sil, (%rax)
> > + cmovne %rdx, %rax
> > +#endif
> > + ret
> >
> > -2: movdqa (%rdi), %xmm0
> > - leaq 16(%rdi), %rdi
> > - movdqa %xmm0, %xmm3
> > + ALIGN(3)
> > + L(next_48_bytes):
> > + movdqu 16(%rdi), %xmm0
> > + movdqa %xmm0, %xmm4
> > + pcmpeqb %xmm1, %xmm0
> > + pcmpeqb %xmm3, %xmm4
> > + por %xmm4, %xmm0
> > + pmovmskb %xmm0, %ecx
> > + movdqu 32(%rdi), %xmm0
> > + movdqa %xmm0, %xmm4
> > pcmpeqb %xmm1, %xmm0
> > - pcmpeqb %xmm2, %xmm3
> > - pmovmskb %xmm0, %edx
> > - pmovmskb %xmm3, %ecx
> > - orl %edx, %ecx
> > - jz 2b
> > + salq $16, %rcx
> > + pcmpeqb %xmm3, %xmm4
> > + por %xmm4, %xmm0
> > + pmovmskb %xmm0, %eax
> > + movdqu 48(%rdi), %xmm0
> > + pcmpeqb %xmm0, %xmm3
> > + salq $32, %rax
> > + pcmpeqb %xmm1, %xmm0
> > + orq %rcx, %rax
> > + por %xmm3, %xmm0
> > + pmovmskb %xmm0, %ecx
> > + salq $48, %rcx
> > + orq %rcx, %rax
> > + testq %rax,%rax
> > + jne L(return)
> > +L(loop_start):
> > + /* We use this alignment to force loop be aligned to 8 but not
> > + 16 bytes. This gives better scheduling on AMD processors. */
> > + ALIGN(4)
> > + pxor %xmm6, %xmm6
> > + andq $-64, %rdi
> > + ALIGN(3)
> > +L(loop64):
> > + addq $64, %rdi
> > + movdqa (%rdi), %xmm5
> > + movdqa 16(%rdi), %xmm2
> > + movdqa 32(%rdi), %xmm3
> > + pxor %xmm1, %xmm5
> > + movdqa 48(%rdi), %xmm4
> > + pxor %xmm1, %xmm2
> > + pxor %xmm1, %xmm3
> > + pminub (%rdi), %xmm5
> > + pxor %xmm1, %xmm4
> > + pminub 16(%rdi), %xmm2
> > + pminub 32(%rdi), %xmm3
> > + pminub %xmm2, %xmm5
> > + pminub 48(%rdi), %xmm4
> > + pminub %xmm3, %xmm5
> > + pminub %xmm4, %xmm5
> > + pcmpeqb %xmm6, %xmm5
> > + pmovmskb %xmm5, %eax
> > +
> > + testl %eax, %eax
> > + je L(loop64)
> > +
> > + movdqa (%rdi), %xmm5
> > + movdqa %xmm5, %xmm0
> > + pcmpeqb %xmm1, %xmm5
> > + pcmpeqb %xmm6, %xmm0
> > + por %xmm0, %xmm5
> > + pcmpeqb %xmm6, %xmm2
> > + pcmpeqb %xmm6, %xmm3
> > + pcmpeqb %xmm6, %xmm4
> > +
> > + pmovmskb %xmm5, %ecx
> > + pmovmskb %xmm2, %eax
> > + salq $16, %rax
> > + pmovmskb %xmm3, %r8d
> > + pmovmskb %xmm4, %edx
> > + salq $32, %r8
> > + orq %r8, %rax
> > + orq %rcx, %rax
> > + salq $48, %rdx
> > + orq %rdx, %rax
> > + ALIGN(3)
> > +L(return):
> > + bsfq %rax, %rax
> > +#ifdef AS_STRCHRNUL
> > + leaq (%rdi,%rax), %rax
> > +#else
> > + movl $0, %edx
> > + leaq (%rdi,%rax), %rax
> > + cmpb %sil, (%rax)
> > + cmovne %rdx, %rax
> > +#endif
> > + ret
> > + ALIGN(4)
> > +
> > +L(cross_page):
> > + movq %rdi, %rdx
> > + pxor %xmm2, %xmm2
> > + andq $-64, %rdx
> > + movdqa %xmm1, %xmm0
> > + movdqa (%rdx), %xmm3
> > + movdqa %xmm3, %xmm4
> > + pcmpeqb %xmm1, %xmm3
> > + pcmpeqb %xmm2, %xmm4
> > + por %xmm4, %xmm3
> > + pmovmskb %xmm3, %r8d
> > + movdqa 16(%rdx), %xmm3
> > + movdqa %xmm3, %xmm4
> > + pcmpeqb %xmm1, %xmm3
> > + pcmpeqb %xmm2, %xmm4
> > + por %xmm4, %xmm3
> > + pmovmskb %xmm3, %eax
> > + movdqa 32(%rdx), %xmm3
> > + movdqa %xmm3, %xmm4
> > + pcmpeqb %xmm1, %xmm3
> > + salq $16, %rax
> > + pcmpeqb %xmm2, %xmm4
> > + por %xmm4, %xmm3
> > + pmovmskb %xmm3, %r9d
> > + movdqa 48(%rdx), %xmm3
> > + pcmpeqb %xmm3, %xmm2
> > + salq $32, %r9
> > + pcmpeqb %xmm3, %xmm0
> > + orq %r9, %rax
> > + orq %r8, %rax
> > + por %xmm2, %xmm0
> > + pmovmskb %xmm0, %ecx
> > + salq $48, %rcx
> > + orq %rcx, %rax
> > + movl %edi, %ecx
> > + subb %dl, %cl
> > + shrq %cl, %rax
> > + testq %rax, %rax
> > + jne L(return)
> > + jmp L(loop_start)
> >
> > -1: bsfl %edx, %edx
> > - jz 4f
> > - bsfl %ecx, %ecx
> > - leaq -16(%rdi,%rdx), %rax
> > - cmpl %edx, %ecx
> > - je 5f
> > -4: xorl %eax, %eax
> > -5: ret
> > END (strchr)
> >
> > +#ifndef AS_STRCHRNUL
> > weak_alias (strchr, index)
> > libc_hidden_builtin_def (strchr)
> > -
> > +#endif
> > diff --git a/sysdeps/x86_64/strchrnul.S b/sysdeps/x86_64/strchrnul.S
> > index d8c345b..bceeb61 100644
> > --- a/sysdeps/x86_64/strchrnul.S
> > +++ b/sysdeps/x86_64/strchrnul.S
> > @@ -20,43 +20,8 @@
> >
> > #include <sysdep.h>
> >
> > -
> > - .text
> > -ENTRY (__strchrnul)
> > - movd %esi, %xmm1
> > - movq %rdi, %rcx
> > - punpcklbw %xmm1, %xmm1
> > - andq $~15, %rdi
> > - pxor %xmm2, %xmm2
> > - punpcklbw %xmm1, %xmm1
> > - orl $0xffffffff, %esi
> > - movdqa (%rdi), %xmm0
> > - pshufd $0, %xmm1, %xmm1
> > - subq %rdi, %rcx
> > - movdqa %xmm0, %xmm3
> > - leaq 16(%rdi), %rdi
> > - pcmpeqb %xmm1, %xmm0
> > - pcmpeqb %xmm2, %xmm3
> > - shl %cl, %esi
> > - pmovmskb %xmm0, %edx
> > - pmovmskb %xmm3, %ecx
> > - orl %edx, %ecx
> > - andl %esi, %ecx
> > - jnz 1f
> > -
> > -2: movdqa (%rdi), %xmm0
> > - leaq 16(%rdi), %rdi
> > - movdqa %xmm0, %xmm3
> > - pcmpeqb %xmm1, %xmm0
> > - pcmpeqb %xmm2, %xmm3
> > - pmovmskb %xmm0, %edx
> > - pmovmskb %xmm3, %ecx
> > - orl %edx, %ecx
> > - jz 2b
> > -
> > -1: bsfl %ecx, %edx
> > - leaq -16(%rdi,%rdx), %rax
> > - ret
> > -END (__strchrnul)
> > +#define strchr __strchrnul
> > +#define AS_STRCHRNUL
> > +#include "strchr.S"
> >
> > weak_alias (__strchrnul, strchrnul)
--
system needs to be rebooted