This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
Re: [PATCH 1/2] Fix strrchr regression.
- From: Ondřej Bílka <neleai at seznam dot cz>
- To: Liubov Dmitrieva <liubov dot dmitrieva at gmail dot com>
- Cc: GNU C Library <libc-alpha at sourceware dot org>
- Date: Tue, 6 Aug 2013 22:29:53 +0200
- Subject: Re: [PATCH 1/2] Fix strrchr regression.
- References: <20130805173346 dot GA4978 at domone dot kolej dot mff dot cuni dot cz> <CAHjhQ937YxKG7UrLOfvwaD_PZuo0FNzA29NYZKnwCCLDVbYwLQ at mail dot gmail dot com> <20130805201140 dot GA23195 at domone dot kolej dot mff dot cuni dot cz>
On Mon, Aug 05, 2013 at 10:11:40PM +0200, Ondřej Bílka wrote:
> On Mon, Aug 05, 2013 at 07:54:14PM +0200, Liubov Dmitrieva wrote:
> > I see you didn't check for Haswell. Can you please contribute your
> > profiler to glibc to let everyone reproduce measurements of your good
> > benchmarks. I want to be able to get data for Haswell, Atom and Silvermont
> > for your benchmark to be sure that we didn't miss something. There was the
> > discussion already that it is important to improve glibc benchmarks.
> >
> Hi,
>
I made an obvious modification of the loop to compute the same expression with avx2 instead of sse2.
A profiler with the avx2 version added is here:
http://kam.mff.cuni.cz/~ondra/strrchr_profile060813_v2.tar.bz2
And the avx2 implementation is the following:
/*
 * strrchr_new -- find the last occurrence of a byte in a NUL-terminated
 * string, using VEX-encoded 128-bit compares for the first 64 bytes and a
 * 256-bit (AVX2) loop for the rest.  Compiler output (see .ident) from
 * strrchr_avx.c; syntax is AT&T / GAS.
 *
 * Presumed C signature (SysV AMD64 register usage; confirm against the
 * C source):  char *strrchr_new (const char *s, int c)
 *
 * In:    rdi = s, esi = c (only the low byte is broadcast and compared)
 * Out:   rax = address of the last byte equal to c at or before the
 *              terminating NUL (the NUL itself is included in the search),
 *              or 0 when there is no such byte
 * Clobb: rcx, rdx, rsi, rdi, r8, r9, r10, xmm0-xmm7 / ymm0-ymm5, flags
 *
 * Scheme used throughout: build a bit mask of NUL bytes (bit i = byte i
 * is 0) and a bit mask of bytes equal to c.  For a NUL mask z != 0,
 * (z - 1) ^ z has all bits up to and including the first NUL set; ANDing
 * it with the match mask discards matches behind the terminator, and bsr
 * then yields the index of the last remaining match.
 */
	.file	"strrchr_avx.c"
	.text
	.p2align 4,,15
	.globl	strrchr_new
	.type	strrchr_new, @function
strrchr_new:
.LFB990:
	.cfi_startproc
	movq	%rdi, %rax
	vmovd	%esi, %xmm0
	andl	$4095, %eax		# eax = s & 4095
	cmpq	$4032, %rax
	vpbroadcastb	%xmm0, %xmm0	# xmm0 = c replicated into all 16 bytes
	ja	.L2			# (s & 4095) > 4032: use the aligned-block head at .L2
	/* Head, common case: examine the 16 bytes starting at s. */
	vpxor	%xmm3, %xmm3, %xmm3	# xmm3 = 0
	vmovdqu	(%rdi), %xmm1
	vpcmpeqb	%xmm3, %xmm1, %xmm4
	vpcmpeqb	%xmm0, %xmm1, %xmm1
	vpmovmskb	%xmm4, %edx	# edx = NUL mask of bytes 0..15
	vpmovmskb	%xmm1, %ecx	# ecx = match mask of bytes 0..15
	movslq	%edx, %rdx		# 16-bit mask, so the sign extension never sets high bits
	testq	%rdx, %rdx
	movslq	%ecx, %rcx
	je	.L3			# no NUL in the first 16 bytes
	leaq	-1(%rdx), %rax
	xorq	%rdx, %rax		# rax = bits up to and including the first NUL
	andq	%rcx, %rax		# matches that are not behind the terminator
	je	.L4			# none: return 0
#APP
# 63 "header.h" 1
	bsrq %rax, %rax
# 0 "" 2
#NO_APP
	addq	%rdi, %rax		# s + index of last match
	ret
	.p2align 4,,10
	.p2align 3
.L2:
	/*
	 * Head for s close to the end of a 4 KiB page: read the 64-byte
	 * aligned block that contains s (presumably so that no load touches
	 * the following page), build 64-bit masks for it and shift out the
	 * bits that belong to bytes before s.
	 */
	movq	%rdi, %rcx
	vpxor	%xmm1, %xmm1, %xmm1	# xmm1 = 0
	andq	$-64, %rcx		# rcx = s rounded down to a multiple of 64
	vmovdqu	(%rcx), %xmm6
	vmovdqu	16(%rcx), %xmm5
	vpcmpeqb	%xmm1, %xmm6, %xmm7
	vpcmpeqb	%xmm0, %xmm6, %xmm6
	vmovdqu	32(%rcx), %xmm4
	vpmovmskb	%xmm7, %r9d
	vpcmpeqb	%xmm1, %xmm5, %xmm7
	vmovdqu	48(%rcx), %xmm3
	vpcmpeqb	%xmm0, %xmm5, %xmm5
	vpmovmskb	%xmm7, %eax
	vpcmpeqb	%xmm1, %xmm4, %xmm7
	vpcmpeqb	%xmm1, %xmm3, %xmm1
	cltq
	vpmovmskb	%xmm7, %r8d
	salq	$16, %rax
	vpmovmskb	%xmm1, %edx
	salq	$32, %r8
	vpcmpeqb	%xmm0, %xmm4, %xmm4
	orq	%r8, %rax
	movslq	%r9d, %r8
	salq	$48, %rdx
	orq	%r8, %rax
	vpmovmskb	%xmm4, %r9d
	vpmovmskb	%xmm6, %r8d
	orq	%rdx, %rax		# rax = NUL mask of the aligned 64-byte block
	vpmovmskb	%xmm5, %edx
	vpcmpeqb	%xmm0, %xmm3, %xmm0
	salq	$32, %r9
	movslq	%r8d, %r8
	movslq	%edx, %rdx
	salq	$16, %rdx
	vpmovmskb	%xmm0, %r10d
	orq	%r9, %rdx
	orq	%r8, %rdx
	movq	%r10, %r8
	salq	$48, %r8
	orq	%r8, %rdx		# rdx = match mask of the aligned 64-byte block
	movl	%edi, %r8d
	subl	%ecx, %r8d
	movl	%r8d, %ecx		# cl = s - aligned base
	shrq	%cl, %rax		# drop bytes before s; bit 0 now corresponds to s[0]
	shrq	%cl, %rdx
	testq	%rax, %rax
	je	.L6			# no NUL in the rest of this block
	leaq	-1(%rax), %rcx
	xorq	%rax, %rcx		# bits up to and including the first NUL
	andq	%rcx, %rdx
	je	.L4			# no match before the terminator: return 0
.L33:
	/* rdx = non-empty mask of valid matches relative to rdi. */
#APP
# 63 "header.h" 1
	bsrq %rdx, %rdx
# 0 "" 2
#NO_APP
	leaq	(%rdi,%rdx), %rax
	vzeroupper			# also reached from the ymm loop below
	ret
	.p2align 4,,10
	.p2align 3
.L6:
	/*
	 * The head contained no NUL.  rdx = match mask of the head relative
	 * to rdi.  rax becomes the best match so far (0 = none).
	 */
	xorl	%eax, %eax
	testq	%rdx, %rdx
	jne	.L35			# record the last match of the head, then come back to .L8
.L8:
	vmovd	%esi, %xmm2
	addq	$64, %rdi
	andq	$-64, %rdi		# rdi = next 64-byte boundary above the original s
	vpxor	%xmm5, %xmm5, %xmm5	# ymm5 = 0
	vpbroadcastb	%xmm2, %xmm2
	vinserti128	$1, %xmm2, %ymm2, %ymm2	# ymm2 = c replicated into all 32 bytes
	jmp	.L13
	.p2align 4,,10
	.p2align 3
.L11:
	/* Block without NUL: remember its last match, if it has one. */
	testq	%rdx, %rdx
	je	.L12
#APP
# 63 "header.h" 1
	bsrq %rdx, %rdx
# 0 "" 2
#NO_APP
	leaq	(%rdi,%rdx), %rax
.L12:
	addq	$64, %rdi
.L13:
	/* Main loop: one aligned 64-byte block at rdi per iteration. */
	vmovdqa	32(%rdi), %ymm0		# bytes 32..63
	vpxor	%xmm4, %xmm4, %xmm4	# ymm4 = 0
	vmovdqa	(%rdi), %ymm1		# bytes 0..31
	vpminub	%ymm0, %ymm1, %ymm3	# a byte of the minimum is 0 iff either half has a 0 there
	vpcmpeqb	%ymm5, %ymm3, %ymm3
	vpmovmskb	%ymm3, %ecx	# ecx != 0 iff the block contains a NUL
	vpcmpeqb	%ymm2, %ymm1, %ymm3
	/*
	 * Match mask of bytes 0..31.  Writing esi already zero-extends into
	 * rsi, and the mask must not be sign-extended: with a match at byte 31
	 * a sign extension would set bits 32..63 and report matches that do
	 * not exist in bytes 32..63.
	 */
	vpmovmskb	%ymm3, %esi
	vpcmpeqb	%ymm2, %ymm0, %ymm3
	vpmovmskb	%ymm3, %edx
	salq	$32, %rdx
	orq	%rsi, %rdx		# rdx = 64-bit match mask of the block
	testl	%ecx, %ecx
	je	.L11			# no NUL yet: keep scanning
	/* Final block: build the exact 64-bit NUL mask. */
	vpcmpeqb	%ymm4, %ymm1, %ymm1
	vpcmpeqb	%ymm4, %ymm0, %ymm0
	vpmovmskb	%ymm1, %ecx
	vpmovmskb	%ymm0, %esi
	movslq	%ecx, %rcx		# sign extension is harmless here: only the lowest set bit is used below
	salq	$32, %rsi
	orq	%rcx, %rsi
	leaq	-1(%rsi), %rcx
	xorq	%rsi, %rcx		# bits up to and including the first NUL
	andq	%rcx, %rdx
	jne	.L33			# a match in the final block wins
	vzeroupper
	ret				# otherwise return the best earlier match held in rax (or 0)
	.p2align 4,,10
	.p2align 3
.L3:
	/*
	 * Head, common case continued: bytes 0..15 hold no NUL (rcx still
	 * holds their match mask, xmm3 = 0).  Examine bytes 16..63 from s.
	 */
	vmovdqu	16(%rdi), %xmm5
	vmovdqu	32(%rdi), %xmm4
	vpcmpeqb	%xmm3, %xmm5, %xmm6
	vpcmpeqb	%xmm0, %xmm5, %xmm5
	vmovdqu	48(%rdi), %xmm1
	vpmovmskb	%xmm6, %edx
	vpcmpeqb	%xmm3, %xmm4, %xmm6
	vpcmpeqb	%xmm3, %xmm1, %xmm3
	movslq	%edx, %rdx
	vpmovmskb	%xmm6, %eax
	salq	$16, %rdx
	vpmovmskb	%xmm3, %r8d
	salq	$32, %rax
	vpcmpeqb	%xmm0, %xmm4, %xmm4
	orq	%rdx, %rax
	vpmovmskb	%xmm5, %edx
	movq	%r8, %r9
	vpmovmskb	%xmm4, %r8d
	salq	$48, %r9
	movslq	%edx, %rdx
	vpcmpeqb	%xmm0, %xmm1, %xmm0
	salq	$16, %rdx
	salq	$32, %r8
	orq	%r8, %rdx
	orq	%rcx, %rdx		# add the match mask of bytes 0..15
	vpmovmskb	%xmm0, %ecx
	salq	$48, %rcx
	orq	%rcx, %rdx		# rdx = match mask of bytes 0..63
	orq	%r9, %rax		# rax = NUL mask of bytes 16..63; ZF drives the branch
	je	.L6			# no NUL in the first 64 bytes
	leaq	-1(%rax), %rcx
	xorq	%rax, %rcx		# bits up to and including the first NUL
	andq	%rcx, %rdx
	jne	.L36
	.p2align 4,,10
	.p2align 3
.L4:
	/* No match before the terminator. */
	xorl	%eax, %eax
	ret
	.p2align 4,,10
	.p2align 3
.L35:
	/* Last match inside the head becomes the provisional result. */
#APP
# 63 "header.h" 1
	bsrq %rdx, %rax
# 0 "" 2
#NO_APP
	addq	%rdi, %rax
	jmp	.L8
	.p2align 4,,10
	.p2align 3
.L36:
	/* Terminator and a valid match both lie within the first 64 bytes. */
#APP
# 63 "header.h" 1
	bsrq %rdx, %rax
# 0 "" 2
#NO_APP
	addq	%rdi, %rax
	ret
	.cfi_endproc
.LFE990:
	.size	strrchr_new, .-strrchr_new
	.ident	"GCC: (Debian 4.7.1-2) 4.7.1"
	.section	.note.GNU-stack,"",@progbits