This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
Status of strrchr.
- From: Ondřej Bílka <neleai at seznam dot cz>
- To: Liubov Dmitrieva <liubov dot dmitrieva at gmail dot com>
- Cc: GNU C Library <libc-alpha at sourceware dot org>
- Date: Wed, 14 Aug 2013 18:43:59 +0200
- Subject: Status of strrchr.
- References: <20130807140911 dot GA31968 at domone dot kolej dot mff dot cuni dot cz> <CAHjhQ926EE-MYDJR5Eftf+DUefBg-Gox0pw57vZ7XUwsO3OPJg at mail dot gmail dot com> <20130808190716 dot GA4589 at domone dot kolej dot mff dot cuni dot cz> <CAHjhQ92+C6uXyrUhTd3OWuoa6v2SeUaKLBuqaNX5Sqtn4ANBdg at mail dot gmail dot com> <CAHjhQ90S-1uBhwV44KODTcQkr=0U-P+_9Pu0O=RbYYY9e82JCA at mail dot gmail dot com> <20130809164420 dot GB4972 at domone dot kolej dot mff dot cuni dot cz> <CAHjhQ91rFwppQ4ixhPNuB9xe8FH9OrEoz3=eFrTQTscwOvSBCA at mail dot gmail dot com>
I will reply separately to these issues. This will touch strrchr.
On Wed, Aug 14, 2013 at 11:46:23AM +0400, Liubov Dmitrieva wrote:
..
No, for now I am gathering data on what should be optimized. My concern was
to quickly verify that it improves performance in the 1-64 byte range and in
asymptotic behaviour. Now I will start tuning to decrease the constant
factors in various cases.
An ifunc will stay for Haswell, as AVX2 looks 20% faster for big sizes.
http://kam.mff.cuni.cz/~ondra/benchmark_string/haswell/strrchr_profile/results_rand/result.html
Not only for Atom but also for Athlons.
http://kam.mff.cuni.cz/~ondra/benchmark_string/athlon_x2/strrchr_profile/results_rand/result.html
I computed bsrq every time, in every 64-byte block that contained c. A
better idea is to just remember the mask and pointer and do the calculation
only at the end. This asymptotically improves performance by 50%.
Real-world performance is problematic: in the results_gcc run that I did
there are only a few calls, mostly from the makefile. A more representative
sample would be welcome.
I added this as the strrchr_new_v2 implementation, which is below.
>
I will fix that.
# sse2 strrchr_new_v2 cyan
.file "strrchr.c"
.text
# ----------------------------------------------------------------------
# strrchr_new_v2 -- SSE2 search for the LAST occurrence of a byte in a
# zero-terminated byte string.  Syntax: GAS, AT&T operand order.
# This is compiler output (see the .ident line); the #APP/#NO_APP
# regions are inline asm taken from line 47 of strrchr.c.
#
# NOTE(review): register usage matches the SysV AMD64 convention for
# char *strrchr(const char *s, int c) -- confirm against the C source.
#
# In:    rdi = s, address of the first byte to examine
#        esi = c, only its low byte is used (broadcast into xmm1)
# Out:   rax = address of the last byte equal to c that lies at or
#              before the first zero byte, or 0 when there is none
# Clobb: rcx, rdx, rsi, rdi, r8, r9, r10, xmm0-xmm7, flags
# Stack: not touched
#
# Loads may cover bytes beyond the terminator, but each load stays
# inside either the 4096-byte window checked on entry or a
# 64-byte-aligned block.
#
# Mask convention used throughout: bit i of a mask describes byte i
# counted from the base address the mask belongs to.
# ----------------------------------------------------------------------
.p2align 4,,15
.globl strrchr_new_v2
.type strrchr_new_v2, @function
strrchr_new_v2:
.LFB521:
.cfi_startproc
# Broadcast the low byte of c to all 16 lanes of xmm1 (movd, two
# unpacks, pshufd).  Interleaved with that: eax = s modulo 4096.
# When that offset is above 4032 (= 4096 - 64) a 64-byte read that
# starts at s would leave the 4096-byte window (presumably the page),
# so .L2 handles it with an aligned read instead.  The flags from cmpq
# survive until ja because the SSE instructions in between do not
# write flags.
movd %esi, %xmm1
movq %rdi, %rax
andl $4095, %eax
punpcklbw %xmm1, %xmm1
cmpq $4032, %rax
punpcklwd %xmm1, %xmm1
pshufd $0, %xmm1, %xmm1
ja .L2
# Fast path: examine bytes 0..15 of s with one unaligned load.
#   rcx = mask of bytes equal to c
#   rdx = mask of zero bytes
# xmm2 is zeroed here and is relied on to still be zero in .L3.
movdqu (%rdi), %xmm0
pxor %xmm2, %xmm2
movdqa %xmm0, %xmm3
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm3
pmovmskb %xmm0, %ecx
pmovmskb %xmm3, %edx
movslq %ecx, %rcx
movslq %edx, %rdx
# No zero byte in the first 16 bytes: continue with bytes 16..63.
testq %rdx, %rdx
je .L3
# The expression (rdx - 1) ^ rdx has every bit set up to and including
# the lowest set bit of rdx, i.e. it selects all positions up to and
# including the first zero byte.  Keep only the matches in that range;
# if none remain the answer is NULL (.L4).
leaq -1(%rdx), %rax
xorq %rdx, %rax
andq %rcx, %rax
je .L4
# The index of the highest set bit is the offset of the last match.
#APP
# 47 "strrchr.c" 1
bsrq %rax, %rax
# 0 "" 2
#NO_APP
addq %rdi, %rax
ret
.p2align 4,,10
.p2align 3
.L2:
# Window-end path: read the 64-byte-aligned block that contains s
# (rax = s rounded down to a multiple of 64), so no load extends past
# the end of that block.  The interleaved instructions below build two
# 64-bit masks relative to the block start:
#   rdx = zero bytes        rsi = bytes equal to c
# Each 16-byte quarter contributes 16 bits, shifted by 0/16/32/48.
# xmm0 serves as the zero vector until it is overwritten with the
# zero-compare of the last quarter.
movq %rdi, %rax
pxor %xmm0, %xmm0
andq $-64, %rax
movdqu (%rax), %xmm5
movdqa %xmm5, %xmm6
movdqu 16(%rax), %xmm4
pcmpeqb %xmm1, %xmm5
pcmpeqb %xmm0, %xmm6
movdqu 32(%rax), %xmm3
pmovmskb %xmm6, %esi
movdqa %xmm4, %xmm6
movdqu 48(%rax), %xmm2
pcmpeqb %xmm1, %xmm4
pcmpeqb %xmm0, %xmm6
movslq %esi, %rsi
pmovmskb %xmm6, %edx
movdqa %xmm3, %xmm6
pcmpeqb %xmm1, %xmm3
pcmpeqb %xmm0, %xmm6
movslq %edx, %rdx
pcmpeqb %xmm2, %xmm0
salq $16, %rdx
pmovmskb %xmm3, %r9d
pmovmskb %xmm6, %r8d
pmovmskb %xmm0, %ecx
salq $32, %r9
salq $32, %r8
pcmpeqb %xmm1, %xmm2
orq %r8, %rdx
salq $48, %rcx
pmovmskb %xmm5, %r8d
orq %rsi, %rdx
pmovmskb %xmm4, %esi
orq %rcx, %rdx
pmovmskb %xmm2, %ecx
movslq %r8d, %r8
movslq %esi, %rsi
salq $16, %rsi
salq $48, %rcx
orq %r9, %rsi
orq %r8, %rsi
orq %rcx, %rsi
# Shift both masks right by (s - block start) so that the bytes in
# front of s are dropped and bit 0 corresponds to s itself.
movl %edi, %ecx
subl %eax, %ecx
shrq %cl, %rdx
shrq %cl, %rsi
# No zero byte between s and the end of the block: go to the loop.
testq %rdx, %rdx
je .L6
# Zero byte found: keep matches up to and including it (same mask
# trick as in the fast path); none left means NULL.
leaq -1(%rdx), %rax
xorq %rdx, %rax
andq %rax, %rsi
je .L4
#APP
# 47 "strrchr.c" 1
bsrq %rsi, %rax
# 0 "" 2
#NO_APP
addq %rdi, %rax
ret
.p2align 4,,10
.p2align 3
.L6:
# Reached from .L2 and .L3 when no zero byte has been seen yet.
#   rsi = mask of matches seen so far, relative to rdi (may be 0)
#   rcx = base address that mask is relative to
# Together (rcx, rsi) form the "latest candidate".  movq does not
# write flags, so je still acts on the result of testq.
testq %rsi, %rsi
movq %rdi, %rcx
je .L30
.L7:
# Move rdi to (rdi + 64) rounded down to a multiple of 64.  When the
# entry came through .L3 this can be less than 64 bytes ahead, so some
# bytes already examined are scanned again.  xmm7 = zero vector for
# the zero test inside the loop.
addq $64, %rdi
pxor %xmm7, %xmm7
andq $-64, %rdi
jmp .L13
.p2align 4,,10
.p2align 3
.L10:
# Loop tail for a block without a zero byte: if the block contained
# matches (rdx != 0) it becomes the latest candidate; then step to the
# next 64-byte block.
testq %rdx, %rdx
cmovne %rdx, %rsi
cmovne %rdi, %rcx
addq $64, %rdi
.L13:
# Loop body: load 64 aligned bytes at rdi.  The byte-wise unsigned
# minimum of the four vectors contains a zero byte exactly when one of
# the 64 bytes is zero; eax = mask of that test (nonzero = terminator
# is in this block).  xmm6 is zeroed for the exact zero compare below.
# In parallel rdx = 64-bit mask of bytes equal to c in this block.
movdqa 32(%rdi), %xmm3
pxor %xmm6, %xmm6
movdqa 48(%rdi), %xmm2
movdqa %xmm3, %xmm0
movdqa 16(%rdi), %xmm4
pminub %xmm2, %xmm0
movdqa (%rdi), %xmm5
pminub %xmm4, %xmm0
pminub %xmm5, %xmm0
pcmpeqb %xmm7, %xmm0
pmovmskb %xmm0, %eax
movdqa %xmm5, %xmm0
pcmpeqb %xmm1, %xmm0
pmovmskb %xmm0, %r9d
movdqa %xmm4, %xmm0
pcmpeqb %xmm1, %xmm0
movslq %r9d, %r9
pmovmskb %xmm0, %edx
movdqa %xmm3, %xmm0
pcmpeqb %xmm1, %xmm0
movslq %edx, %rdx
salq $16, %rdx
pmovmskb %xmm0, %r10d
movdqa %xmm2, %xmm0
pcmpeqb %xmm1, %xmm0
salq $32, %r10
orq %r10, %rdx
pmovmskb %xmm0, %r8d
orq %r9, %rdx
salq $48, %r8
orq %r8, %rdx
testl %eax, %eax
je .L10
# The terminator is in this block.  Build the exact 64-bit zero mask
# in rax, then restrict the matches in rdx to positions up to and
# including the first zero byte.
pcmpeqb %xmm6, %xmm4
pcmpeqb %xmm6, %xmm3
pcmpeqb %xmm6, %xmm5
pmovmskb %xmm4, %eax
pmovmskb %xmm3, %r10d
pcmpeqb %xmm6, %xmm2
pmovmskb %xmm5, %r9d
cltq
salq $32, %r10
salq $16, %rax
pmovmskb %xmm2, %r8d
orq %r10, %rax
movslq %r9d, %r9
orq %r9, %rax
salq $48, %r8
orq %r8, %rax
leaq -1(%rax), %r8
xorq %rax, %r8
andq %r8, %rdx
# If this final block still has a match it replaces the saved
# candidate; otherwise the saved (rcx, rsi) pair is used.
cmovne %rdi, %rcx
cmovne %rdx, %rsi
# Result = candidate base + index of the highest set mask bit.  With
# the sentinel from .L30 (mask 1, base 0) this evaluates to 0, i.e.
# NULL, when no match was ever recorded.
#APP
# 47 "strrchr.c" 1
bsrq %rsi, %rsi
# 0 "" 2
#NO_APP
leaq (%rcx,%rsi), %rax
ret
.p2align 4,,10
.p2align 3
.L30:
# No match seen before the loop: seed the candidate with mask = 1 and
# base = 0 so the final bsrq/leaq yield 0 unless a later block
# replaces it.
movl $1, %esi
xorl %ecx, %ecx
jmp .L7
.p2align 4,,10
.p2align 3
.L3:
# Bytes 0..15 of s hold no zero byte.  Examine bytes 16..63 with
# unaligned loads (the entry check keeps them inside the 4096-byte
# window).  Carried over from the fast path: rcx = matches in bytes
# 0..15, xmm2 = zero vector (overwritten below by the last compare).
# Built here, relative to rdi:
#   rax, later OR-ed with rdx = zero-byte mask for bytes 16..63
#   rsi                       = match mask for bytes 0..63
movdqu 16(%rdi), %xmm4
movdqa %xmm4, %xmm5
movdqu 32(%rdi), %xmm3
pcmpeqb %xmm1, %xmm4
pcmpeqb %xmm2, %xmm5
movdqu 48(%rdi), %xmm0
pmovmskb %xmm5, %edx
movdqa %xmm3, %xmm5
pcmpeqb %xmm1, %xmm3
pcmpeqb %xmm2, %xmm5
movslq %edx, %rdx
pcmpeqb %xmm0, %xmm2
salq $16, %rdx
pmovmskb %xmm3, %r8d
pmovmskb %xmm5, %eax
pmovmskb %xmm2, %esi
salq $32, %r8
salq $32, %rax
pcmpeqb %xmm1, %xmm0
orq %rdx, %rax
movq %rsi, %rdx
pmovmskb %xmm4, %esi
salq $48, %rdx
movslq %esi, %rsi
salq $16, %rsi
orq %r8, %rsi
orq %rcx, %rsi
pmovmskb %xmm0, %ecx
salq $48, %rcx
orq %rcx, %rsi
# The final orq completes the zero mask and sets ZF for the branch:
# no zero byte in these 64 bytes means the main loop takes over.
orq %rdx, %rax
je .L6
# Zero byte found: keep matches up to and including it; none -> NULL.
leaq -1(%rax), %rcx
xorq %rax, %rcx
andq %rcx, %rsi
je .L4
#APP
# 47 "strrchr.c" 1
bsrq %rsi, %rsi
# 0 "" 2
#NO_APP
leaq (%rdi,%rsi), %rax
ret
.p2align 4,,10
.p2align 3
.L4:
# Shared exit for "no match before the terminator": return 0 (NULL).
xorl %eax, %eax
ret
.cfi_endproc
.LFE521:
.size strrchr_new_v2, .-strrchr_new_v2
.ident "GCC: (Debian 4.7.1-2) 4.7.1"
.section .note.GNU-stack,"",@progbits