This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
Re: Status of strchr
- From: Ondřej Bílka <neleai at seznam dot cz>
- To: Liubov Dmitrieva <liubov dot dmitrieva at gmail dot com>
- Cc: GNU C Library <libc-alpha at sourceware dot org>
- Date: Tue, 20 Aug 2013 18:34:35 +0200
- Subject: Re: Status of strchr
- References: <20130807140911 dot GA31968 at domone dot kolej dot mff dot cuni dot cz> <CAHjhQ926EE-MYDJR5Eftf+DUefBg-Gox0pw57vZ7XUwsO3OPJg at mail dot gmail dot com> <20130808190716 dot GA4589 at domone dot kolej dot mff dot cuni dot cz> <CAHjhQ92+C6uXyrUhTd3OWuoa6v2SeUaKLBuqaNX5Sqtn4ANBdg at mail dot gmail dot com> <CAHjhQ90S-1uBhwV44KODTcQkr=0U-P+_9Pu0O=RbYYY9e82JCA at mail dot gmail dot com> <20130809164420 dot GB4972 at domone dot kolej dot mff dot cuni dot cz> <CAHjhQ91rFwppQ4ixhPNuB9xe8FH9OrEoz3=eFrTQTscwOvSBCA at mail dot gmail dot com> <20130814203724 dot GA6769 at domone dot kolej dot mff dot cuni dot cz>
On Wed, Aug 14, 2013 at 10:37:24PM +0200, Ondřej Bílka wrote:
> On Wed, Aug 14, 2013 at 11:46:23AM +0400, Liubov Dmitrieva wrote:
>
> A problem here is that my 64-byte loop is fastest beyond 512 characters,
> but it has a big constant overhead that makes the 16-byte loop better in
> that interval.
>
> This is partly because I did not do much tuning of this
> implementation. I wrote strchr_new_v2, which decreases the overhead
> somewhat, but it has a catch.
>
>
> It is possible; another variant is to write a header that does not use
> unaligned loads.
>
> This is not only a problem on Atom; on old Athlons a no-bsf variant also
> looks 10% faster than both what is currently selected and my improvement.
> http://kam.mff.cuni.cz/~ondra/benchmark_string/athlon_x2/strchr_profile/results_gcc/result.html
>
> Silvermont is problematic, as the problem looks to be in the loop
> overhead. One possibility is to wait and see how well the 64-byte loop
> can be optimized.
>
That was v2.
> A second possibility is to also try a 32-byte loop and see how it fares.
>
Which I have now tried. It does have smaller overhead, except in the
32-64 byte range. I want to see if it helps Atom and Silvermont.
http://kam.mff.cuni.cz/~ondra/benchmark_string/strchr_profile.html
http://kam.mff.cuni.cz/~ondra/benchmark_string/strchr_profile200813.tar.bz2
It is also a few percent faster in practice. However, there is a drawback:
it is about 10% slower than the 64-byte loop for sizes of 8000 bytes and
more.
Now I am inclined to make this the default if you can cope with that
tradeoff.
# sse2 strchr_new_loop32 blue
#-----------------------------------------------------------------------
# strchr_new_loop32 -- find the first byte equal to a given byte in a
# zero-terminated byte string, scanning 32 bytes per loop iteration.
#
# Syntax: GAS, AT&T operand order (source first, destination last).
# ISA:    x86-64 with SSE2.
# ABI:    register usage matches System V AMD64; the C prototype is not
#         shown here, presumably char *f(const char *s, int c).
# In:     rdi = s   (start of the string)
#         sil = c   (only the low byte of esi is used)
# Out:    rax = address of the first byte equal to sil, or 0 when a zero
#         byte is reached first.  When sil is 0 the address of the
#         terminating zero byte is returned.
# Clobb:  rax, rcx, rdx, rdi, xmm0-xmm5, flags
# Stack:  not used.
#
# Note: whole 16/32-byte chunks are loaded, so bytes past the terminator
# may be read.  Loads never straddle a 4096-byte boundary (presumably the
# page size -- confirm for the target).
#-----------------------------------------------------------------------
	.file	"strchr_pair.c"
	.text
	.p2align 4,,15
	.globl	strchr_new_loop32
	.type	strchr_new_loop32, @function
strchr_new_loop32:
.LFB519:
	.cfi_startproc
	# Broadcast c into every byte of xmm3 while computing s mod 4096.
	movd	%esi, %xmm3
	movl	%edi, %eax
	andl	$4095, %eax			# eax = offset of s within its 4096-byte block
	punpcklbw	%xmm3, %xmm3
	cmpl	$4063, %eax			# offset > 4063: take the aligned-head path instead
	punpcklwd	%xmm3, %xmm3
	pshufd	$0, %xmm3, %xmm3		# xmm3 = c repeated 16 times (flags untouched)
	ja	.Lnear_boundary			# unsigned compare: the offset is in 0..4095

	# Head, common case: check s[0..31] with two unaligned loads.
	movdqu	(%rdi), %xmm1
	pxor	%xmm2, %xmm2			# xmm2 = all zero bytes
	movdqa	%xmm3, %xmm0
	movdqa	%xmm1, %xmm4
	pcmpeqb	%xmm3, %xmm1			# bytes 0..15 equal to c
	pcmpeqb	%xmm2, %xmm4			# bytes 0..15 equal to 0
	por	%xmm4, %xmm1
	pmovmskb	%xmm1, %edx		# edx = hit mask for bytes 0..15
	movdqu	16(%rdi), %xmm1
	pcmpeqb	%xmm1, %xmm2			# bytes 16..31 equal to 0
	pcmpeqb	%xmm1, %xmm0			# bytes 16..31 equal to c
	por	%xmm2, %xmm0
	pmovmskb	%xmm0, %eax
	salq	$16, %rax
	orq	%rdx, %rax			# rax = 32-bit hit mask, bit i <=> s[i] is c or 0
	jne	.Lfound				# rdi is still s, so bit index = offset from s

.Lenter_loop:
	# No hit in the head.  Round rdi down to 32 bytes; the loop advances
	# it first, so scanning resumes at the next 32-byte aligned address.
	pxor	%xmm5, %xmm5			# xmm5 = all zero bytes for the loop
	andq	$-32, %rdi
	.p2align 4,,10
	.p2align 3
.Lloop32:
	# Invariant: rdi is 32-byte aligned; no hit before rdi + 32.
	# For a byte x, min(x ^ c, x) is 0 exactly when x == c or x == 0.
	addq	$32, %rdi
	movdqa	(%rdi), %xmm1
	movdqa	16(%rdi), %xmm0
	pxor	%xmm3, %xmm1
	pxor	%xmm3, %xmm0
	pminub	(%rdi), %xmm1			# xmm1 = hit indicator for bytes 0..15
	pminub	16(%rdi), %xmm0
	pminub	%xmm1, %xmm0			# fold both halves into one register
	pcmpeqb	%xmm5, %xmm0
	pmovmskb	%xmm0, %eax
	testl	%eax, %eax
	je	.Lloop32

	# A hit lies in this 32-byte block; rebuild an exact per-byte mask.
	movdqa	16(%rdi), %xmm0
	pcmpeqb	%xmm5, %xmm1			# low half: reuse xmm1 from the loop
	pcmpeqb	%xmm0, %xmm3			# high half equal to c (xmm3 is consumed here)
	pcmpeqb	%xmm5, %xmm0			# high half equal to 0
	por	%xmm3, %xmm0
	pmovmskb	%xmm1, %edx
	pmovmskb	%xmm0, %eax
	salq	$16, %rax
	orq	%rdx, %rax			# rax = 32-bit hit mask relative to rdi

.Lfound:
	# In: rax = non-zero 32-bit hit mask, rdi = address that bit 0 refers to.
	bsf	%eax, %eax			# index of the first hit
	xorl	%edx, %edx			# rdx = 0, the not-found result (flags are dead here)
	leaq	(%rdi,%rax), %rax		# rax = address of the first hit
	cmpb	%sil, (%rax)			# was the hit c, or the terminating zero?
	cmovne	%rdx, %rax			# the terminator came first: return 0
	ret

	.p2align 4,,10
	.p2align 3
.Lnear_boundary:
	# Head, rare case: s is within the last 32 bytes before a 4096-byte
	# boundary.  Load the enclosing aligned 32-byte block instead and
	# discard the mask bits that belong to bytes before s.
	movq	%rdi, %rdx
	pxor	%xmm2, %xmm2
	andq	$-32, %rdx			# rdx = s rounded down to 32 bytes
	movdqa	%xmm3, %xmm0
	movdqa	(%rdx), %xmm1
	movdqa	%xmm1, %xmm4
	pcmpeqb	%xmm3, %xmm1
	pcmpeqb	%xmm2, %xmm4
	por	%xmm4, %xmm1
	pmovmskb	%xmm1, %ecx
	movdqa	16(%rdx), %xmm1
	pcmpeqb	%xmm1, %xmm2
	pcmpeqb	%xmm1, %xmm0
	por	%xmm2, %xmm0
	pmovmskb	%xmm0, %eax
	salq	$16, %rax
	orq	%rcx, %rax			# rax = hit mask relative to rdx
	movl	%edi, %ecx
	subb	%dl, %cl			# cl = s - rdx, in 0..31
	shrq	%cl, %rax			# drop bytes before s; bit 0 now refers to s
	testq	%rax, %rax			# needed: shr leaves flags unchanged when cl == 0
	je	.Lenter_loop
	jmp	.Lfound
	.cfi_endproc
.LFE519:
	.size	strchr_new_loop32, .-strchr_new_loop32
	.ident	"GCC: (Debian 4.5.3-12) 4.5.3"
	.section	.note.GNU-stack,"",@progbits