This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
Other format:	[Raw text]

Re: Status of strchr

From: Ondřej Bílka <neleai at seznam dot cz>
To: Liubov Dmitrieva <liubov dot dmitrieva at gmail dot com>
Cc: GNU C Library <libc-alpha at sourceware dot org>
Date: Tue, 20 Aug 2013 18:34:35 +0200
Subject: Re: Status of strchr
References: <20130807140911 dot GA31968 at domone dot kolej dot mff dot cuni dot cz> <CAHjhQ926EE-MYDJR5Eftf+DUefBg-Gox0pw57vZ7XUwsO3OPJg at mail dot gmail dot com> <20130808190716 dot GA4589 at domone dot kolej dot mff dot cuni dot cz> <CAHjhQ92+C6uXyrUhTd3OWuoa6v2SeUaKLBuqaNX5Sqtn4ANBdg at mail dot gmail dot com> <CAHjhQ90S-1uBhwV44KODTcQkr=0U-P+_9Pu0O=RbYYY9e82JCA at mail dot gmail dot com> <20130809164420 dot GB4972 at domone dot kolej dot mff dot cuni dot cz> <CAHjhQ91rFwppQ4ixhPNuB9xe8FH9OrEoz3=eFrTQTscwOvSBCA at mail dot gmail dot com> <20130814203724 dot GA6769 at domone dot kolej dot mff dot cuni dot cz>

On Wed, Aug 14, 2013 at 10:37:24PM +0200, Ondřej Bílka wrote:
> On Wed, Aug 14, 2013 at 11:46:23AM +0400, Liubov Dmitrieva wrote:
> 
> A problem here is that my 64-byte loop is after 512 characters fastest
> but there is big constant overhead that makes 16-byte loop better at
> that interval.
> 
> It is partially caused by that I did not do much tuning to this
> implementation. I wrote strchr_new_v2 that decreases overhead somewhat
> but it has a catch.
> 
> 
> It is possible; another variant is to write a header that does not use
> unaligned loads. 
> 
> It is not only a problem on Atom; on old Athlons, too, a no-bsf variant
> looks 10% faster than both what is currently selected and my improvement.
> http://kam.mff.cuni.cz/~ondra/benchmark_string/athlon_x2/strchr_profile/results_gcc/result.html
> 
> Silvermont is problematic, as the problem looks to be in the loop overhead.
> One possibility is to wait and see how optimized the 64-byte loop can be.
>
That was v2.
 
> A second possibility is also try a 32-byte loop and see how it fares.
> 
Which I have now tried. It has smaller overhead, except in the 32-64 byte
range. I want to see if it helps Atom and Silvermont.

 http://kam.mff.cuni.cz/~ondra/benchmark_string/strchr_profile.html
 http://kam.mff.cuni.cz/~ondra/benchmark_string/strchr_profile200813.tar.bz2

It is also a few percent faster in practice. However, there is a drawback:
for sizes of 8000 bytes and more it is about 10% slower than the
64-byte loop. 

I am now inclined to make this the default, if you can cope with that
tradeoff.

# sse2 strchr_new_loop32 blue
#
# strchr_new_loop32 -- SSE2 byte search that scans 32 bytes per loop iteration.
# Syntax: GAS / AT&T, x86-64.
#
# In:    %rdi = address of the bytes to scan (dereferenced below)
#        %sil = byte value to look for (low byte of %esi)
#        NOTE(review): this matches strchr(const char *s, int c) under the
#        System V AMD64 calling convention -- presumably the intended ABI.
# Out:   %rax = address of the first byte equal to %sil, or 0 when a zero
#        byte is reached first and %sil itself is not zero.
# Clobb: %rax, %rcx, %rdx, %rdi, %xmm0-%xmm5, flags.  No stack is used.
#
# Layout of the routine:
#   entry  - broadcast the byte, then pick a header depending on how close
#            the pointer is to the end of its 4096-byte region
#   header - test the first 32 bytes (unaligned loads), or .L11 (aligned
#            loads plus a mask shift) when too close to the region end
#   .L6    - main loop over 32-byte aligned blocks
#   .L9    - common exit: turn a hit mask into the return value
#
	.file	"strchr_pair.c"
	.text
	# Align the entry point to 16 bytes (up to 15 bytes of padding allowed).
	.p2align 4,,15
.globl strchr_new_loop32
	.type	strchr_new_loop32, @function
strchr_new_loop32:
.LFB519:
	.cfi_startproc
	# Broadcast the low byte of %esi into all 16 byte lanes of %xmm3
	# (movd + punpcklbw + punpcklwd + pshufd $0).  The address check is
	# interleaved with the broadcast.
	movd	%esi, %xmm3
	# eax = address bits 0..11, i.e. the offset inside a 4096-byte region
	# (presumably the page size -- TODO confirm for the target).
	movl	%edi, %eax
	andl	$4095, %eax
	punpcklbw	%xmm3, %xmm3
	# With an offset of at most 4063 the 32 bytes read by the unaligned
	# header end at offset 4094 or lower, so they stay inside the region.
	# Larger offsets take the aligned header at .L11 instead.
	cmpl	$4063, %eax
	punpcklwd	%xmm3, %xmm3
	pshufd	$0, %xmm3, %xmm3
	jg	.L11
	# Unaligned header: build a 32-bit mask in which bit i is set when
	# byte i (counted from %rdi) equals the searched byte or is zero.
	movdqu	(%rdi), %xmm1
	# xmm2 = all zero bytes, used to detect the terminator.
	pxor	%xmm2, %xmm2
	# xmm0 = copy of the broadcast byte for the second 16-byte half.
	movdqa	%xmm3, %xmm0
	movdqa	%xmm1, %xmm4
	# First half: (byte == c) | (byte == 0) -> edx bits 0..15.
	pcmpeqb	%xmm3, %xmm1
	pcmpeqb	%xmm2, %xmm4
	por	%xmm4, %xmm1
	pmovmskb	%xmm1, %edx
	# Second half: same test on bytes 16..31 -> eax bits 0..15.
	movdqu	16(%rdi), %xmm1
	pcmpeqb	%xmm1, %xmm2
	pcmpeqb	%xmm1, %xmm0
	por	%xmm2, %xmm0
	pmovmskb	%xmm0, %eax
	# rax = (second-half mask << 16) | first-half mask.
	salq	$16, %rax
	orq	%rdx, %rax
	# Any hit in the first 32 bytes: resolve it at .L9 (%rdi is still
	# the original pointer, which is the base the mask refers to).
	jne	.L9
.L3:
	# No hit in the header.  xmm5 = zero for the compare in the loop.
	pxor	%xmm5, %xmm5
	# Round %rdi down to a 32-byte boundary.  The loop adds 32 before it
	# loads, so the first block examined is the next aligned block; it
	# starts no more than 32 bytes past the original pointer, and every
	# byte before it was already covered by the header.
	andq	$-32, %rdi
	# Loop entry alignment: pad to 16 bytes if that costs at most 10
	# bytes, otherwise fall back to 8-byte alignment.
	.p2align 4,,10
	.p2align 3
.L6:
	addq	$32, %rdi
	movdqa	(%rdi), %xmm1
	movdqa	16(%rdi), %xmm0
	# x ^ c is zero exactly where x == c ...
	pxor	%xmm3, %xmm1
	pxor	%xmm3, %xmm0
	# ... and the unsigned minimum of (x ^ c) and x is zero exactly
	# where x == c or x == 0.
	pminub	(%rdi), %xmm1
	pminub  16(%rdi), %xmm0
	# Fold both halves: a lane of xmm0 is zero if either half has a hit
	# in that lane.  xmm1 keeps the first-half result for use after the
	# loop.
	pminub	%xmm1, %xmm0
	pcmpeqb	%xmm5, %xmm0
	pmovmskb	%xmm0, %eax
	# Keep looping while no lane was zero.
	testl	%eax, %eax
	je	.L6
	# A hit is somewhere in this 32-byte block; rebuild the exact
	# per-byte mask.  xmm0 was overwritten above, so the second half is
	# reloaded and tested explicitly.
	movdqa	16(%rdi), %xmm0
	# First half: lanes of xmm1 that are zero are the hits.
	pcmpeqb	%xmm5, %xmm1
	# Second half: (byte == c) | (byte == 0).  This overwrites the
	# broadcast in xmm3, which is not needed any more on this path.
	pcmpeqb %xmm0, %xmm3
	pcmpeqb %xmm5, %xmm0
	por	%xmm3, %xmm0
	pmovmskb	%xmm1, %edx
	pmovmskb	%xmm0, %eax
	# rax = (second-half mask << 16) | first-half mask, relative to the
	# aligned block address in %rdi.
	salq	$16, %rax
	orq	%rdx, %rax
.L9:
	# Common exit.  On entry here the mask in %eax is non-zero and bit i
	# corresponds to the byte at %rdi + i.
	# eax = index of the first hit.
	bsf	%eax, %eax
	# edx = 0, the value returned when the hit is the terminator.
	movl	$0, %edx
	# rax = address of the first hit.
	leaq	(%rdi,%rax), %rax
	# If that byte is not the searched byte it must be the zero byte,
	# so the search failed: return 0 instead of the address.
	cmpb	%sil, (%rax)
	cmovne	%rdx, %rax
	ret
	.p2align 4,,10
	.p2align 3
.L11:
	# Header used when the pointer is within the last 32 bytes of its
	# 4096-byte region.  It reads the 32-byte aligned block that
	# contains the pointer; an aligned block never straddles the region
	# boundary.
	movq	%rdi, %rdx
	pxor	%xmm2, %xmm2
	# rdx = start of the aligned block containing %rdi.
	andq	$-32, %rdx
	movdqa	%xmm3, %xmm0
	# First half of the block: (byte == c) | (byte == 0) -> ecx.
	movdqa	(%rdx), %xmm1
	movdqa	%xmm1, %xmm4
	pcmpeqb	%xmm3, %xmm1
	pcmpeqb	%xmm2, %xmm4
	por	%xmm4, %xmm1
	pmovmskb	%xmm1, %ecx
	# Second half of the block -> eax.
	movdqa	16(%rdx), %xmm1
	pcmpeqb	%xmm1, %xmm2
	pcmpeqb	%xmm1, %xmm0
	por	%xmm2, %xmm0
	pmovmskb	%xmm0, %eax
	salq	$16, %rax
	orq	%rcx, %rax
	# cl = pointer - block start (0..31).  Shifting the mask right by
	# that amount drops hits located before the pointer and makes bit 0
	# correspond to the byte at %rdi, as .L9 expects.
	movl	%edi, %ecx
	subb	%dl, %cl
	shrq	%cl, %rax
	# No hit from the pointer to the end of the block: enter the main
	# loop.  Otherwise resolve the hit with %rdi still the original
	# pointer.
	testq	%rax, %rax
	je	.L3
	jmp	.L9
	.cfi_endproc
.LFE519:
	.size	strchr_new_loop32, .-strchr_new_loop32
	.ident	"GCC: (Debian 4.5.3-12) 4.5.3"
	.section	.note.GNU-stack,"",@progbits

Follow-Ups:
- Re: Status of strchr
  - From: Liubov Dmitrieva

References:
- [PATCH] Faster strchr implementation.
  - From: Ondřej Bílka
- Re: [PATCH] Faster strchr implementation.
  - From: Liubov Dmitrieva
- Re: [PATCH] Faster strchr implementation.
  - From: Ondřej Bílka
- Re: [PATCH] Faster strchr implementation.
  - From: Liubov Dmitrieva
- Re: [PATCH] Faster strchr implementation.
  - From: Liubov Dmitrieva
- Re: [PATCH] Faster strchr implementation.
  - From: Ondřej Bílka
- Re: [PATCH] Faster strchr implementation.
  - From: Liubov Dmitrieva
- Status of strchr
  - From: Ondřej Bílka

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]