This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[RFC] faster strcmp by avoiding sse42.


Hi,

Continuing the effort of improving implementations that needlessly use sse4.2, we move on to strcmp. The strcmp_sse42 is actually faster than the existing
implementations, but this is mostly caused by a lack of unrolling in the
other implementations rather than by sse4 itself.

I decided to write a new implementation. It mirrors the ideas that I used in the
strlen implementation: the header consists of checking whether we cross a page
boundary, then checking the first 16 bytes unaligned, then the remaining 48 bytes unaligned.

The loop is nearly the same as in strlen; we exploit the fact that pcmpeqb
xmma,xmmb produces zeros exactly at the byte positions that differ.

The implementation is complete up to micro-tuning; notably, I currently handle
cross-page situations with a byte-by-byte loop.

I also did obvious conversion to avx2 but as I do not have haswell I do
not have performance data about it.

I used similar benchmark as in strrchr that is available here:

http://kam.mff.cuni.cz/~ondra/benchmark_string/strcmp_profile.html

http://kam.mff.cuni.cz/~ondra/strcmp_profile060813.tar.bz2


As far as real performance is concerned, my strcmp_new is between 10% and 20% faster than the
alternatives on all tested architectures.
The workload used is quite representative, as in most applications strcmp will
inspect at most 16 characters with high probability. One of the reasons is
that strcmp is used for sorting and searching, where a difference in the first
character is quite likely.

As far as asymptotic performance is concerned, I outperform sse4.2 by 50% on
architectures that support sse4.2. This allows me to replace the sse4.2
implementation with mine.

As for strcmp_sse2 and strcmp_ssse3, we need to replace their
headers with the strcmp_new header to improve performance. They remain
asymptotically faster on processors that are too old to support sse4.2
but new enough to support ssse3.

For old processors, unrolling will beat the strcmp_sse2 implementation
despite the fact that I use unaligned loads, which are a bit inefficient there. See
http://kam.mff.cuni.cz/~ondra/benchmark_string/athlon_x2/strcmp_profile/results_rand/result.html

Comments?

A new implementation is following:

	.file	"strcmp.c"
#-----------------------------------------------------------------------
# int strcmp_new (const char *s1, const char *s2)
# ABI:   SysV AMD64 (GCC-generated, AT&T syntax, SSE2 only)
# In:    rdi = s1, rsi = s2
# Out:   eax = <0 / 0 / >0 as s1 compares below / equal / above s2
#        (difference of the first mismatching bytes as signed chars)
#
# Strategy: if reading 64 bytes from both pointers cannot cross a 4 KiB
# page, scan the first 64 bytes with unaligned 16-byte SSE2 loads; then
# align s1 to 64 bytes and compare 64 bytes per loop iteration.  If the
# 64-byte header read might cross a page, fall back to a byte-by-byte
# loop for those first 64 bytes instead.
#-----------------------------------------------------------------------
	.text
	.p2align 4,,15
.globl strcmp_new
	.type	strcmp_new, @function
strcmp_new:
.LFB519:
	.cfi_startproc
	movl	%esi, %eax
	orl	%edi, %eax		# OR of the low bits >= page offset of either pointer
	andl	$4095, %eax		# offset within a 4 KiB page
	cmpl	$4031, %eax
	jle	.L15			# 64-byte reads stay in page -> SSE header
# --- cross-page fallback: compare the first 64 bytes one byte at a time
	movq	%rsi, %rcx		# rcx walks s2
	leaq	64(%rdi), %r9		# r9 = s1 + 64 = stop point
	movq	%rdi, %rdx		# rdx walks s1
	jmp	.L11
	.p2align 4,,10
	.p2align 3
.L8:
	movzbl	(%rcx), %r8d		# r8b = *s2
	cmpb	%r8b, %al
	jne	.L16			# bytes differ -> return their difference
	addq	$1, %rdx
	addq	$1, %rcx
	cmpq	%r9, %rdx
	je	.L7			# 64 bytes equal, no NUL -> enter aligned loop
.L11:
	movzbl	(%rdx), %eax		# al = *s1
	movq	%rdx, %r8
	subq	%rdi, %r8		# r8 = current offset into the strings
	testb	%al, %al
	jne	.L8			# *s1 != 0 -> keep comparing
	movzbl	(%rsi,%r8), %r8d	# *s1 == 0: result is 0 - (signed char)*s2
	xorl	%eax, %eax
	movsbl	%r8b, %r8d
	subl	%r8d, %eax
	ret
	.p2align 4,,10
	.p2align 3
.L15:
	movdqu	(%rsi), %xmm5		# first 16 bytes of s2 (also set on loop exit)
.L3:
# Produce a zero byte wherever the strings differ or s1 has a NUL:
# pcmpeqb yields 0x00 on mismatch; pminub with the s1 bytes also turns
# a NUL in s1 into 0x00; pcmpeqb against zero then marks those bytes.
	movdqu	(%rdi), %xmm0
	pcmpeqb	%xmm0, %xmm5
	pminub	%xmm0, %xmm5
	pxor	%xmm0, %xmm0
	pcmpeqb	%xmm0, %xmm5
	pmovmskb	%xmm5, %eax	# bit i set -> mismatch or NUL at byte i
	cltq				# mask is only 16 bits, so this just widens it
	testq	%rax, %rax
	je	.L4			# no hit in first 16 bytes -> check next 48
.L6:
	bsfq	%rax, %rax		# index of first mismatch / terminator
	movslq	%eax, %rdx
	movsbl	(%rdi,%rdx), %eax
	movsbl	(%rsi,%rdx), %edx
	subl	%edx, %eax		# (signed char)s1[i] - (signed char)s2[i]
	ret
	.p2align 4,,10
	.p2align 3
.L16:
	movsbl	%al, %eax		# byte loop found a difference
	movsbl	%r8b, %r8d
	subl	%r8d, %eax
	ret
	.p2align 4,,10
	.p2align 3
.L4:
# Bytes 16..63: same pcmpeqb/pminub trick per 16-byte chunk; the three
# 16-bit masks are shifted into one 64-bit mask so .L6 can bsf it.
# xmm0 is still all-zero here.
	movdqu	16(%rdi), %xmm4
	movdqu	16(%rsi), %xmm3
	pcmpeqb	%xmm4, %xmm3
	movdqu	32(%rsi), %xmm2
	pminub	%xmm4, %xmm3
	pcmpeqb	%xmm0, %xmm3
	movdqu	48(%rsi), %xmm1
	pmovmskb	%xmm3, %edx	# mask for bytes 16..31
	movdqu	32(%rdi), %xmm3
	pcmpeqb	%xmm3, %xmm2
	movslq	%edx, %rdx
	salq	$16, %rdx
	pminub	%xmm3, %xmm2
	pcmpeqb	%xmm0, %xmm2
	pmovmskb	%xmm2, %eax	# mask for bytes 32..47
	movdqu	48(%rdi), %xmm2
	pcmpeqb	%xmm2, %xmm1
	salq	$32, %rax
	orq	%rdx, %rax
	pminub	%xmm2, %xmm1
	pcmpeqb	%xmm0, %xmm1
	pmovmskb	%xmm1, %ecx	# mask for bytes 48..63
	movq	%rcx, %rdx
	salq	$48, %rdx
	orq	%rdx, %rax		# combined mask for bytes 16..63
	jne	.L6			# any mismatch/NUL -> compute result
	leaq	64(%rdi), %r9
.L7:
# Advance both pointers by the same amount so that s1 becomes 64-byte
# aligned: r9 = round_down(s1 + 64, 64) - s1, i.e. 1..64 bytes, all of
# which were already verified equal above.  s2 may stay unaligned.
	andq	$-64, %r9
	pxor	%xmm6, %xmm6		# persistent zero for the loop's pcmpeqb
	subq	%rdi, %r9
	movslq	%r9d, %r9		# advance is at most 64, widening is a no-op
	addq	%r9, %rdi
	addq	%r9, %rsi
	jmp	.L12
	.p2align 4,,10
	.p2align 3
.L17:
	addq	$64, %rdi
	addq	$64, %rsi
.L12:
# Main loop, 64 bytes per iteration.  Each 16-byte chunk folds
# min(eq-mask, s1 bytes) into xmm0, so xmm0 gains a zero byte iff some
# chunk had a mismatch or a NUL; one pcmpeqb/pmovmskb then tests all 64.
	movdqa	(%rdi), %xmm4		# s1 loads are 64-byte aligned from here on
	movdqu	(%rsi), %xmm5		# xmm5 is reused by .L3 if we exit the loop
	movdqa	%xmm4, %xmm0
	movdqu	16(%rsi), %xmm3
	pcmpeqb	%xmm5, %xmm0
	movdqu	32(%rsi), %xmm2
	pminub	%xmm4, %xmm0
	movdqa	16(%rdi), %xmm4
	pcmpeqb	%xmm4, %xmm3
	pminub	%xmm4, %xmm0
	movdqu	48(%rsi), %xmm1
	pminub	%xmm3, %xmm0
	movdqa	32(%rdi), %xmm3
	pcmpeqb	%xmm3, %xmm2
	pminub	%xmm3, %xmm0
	pminub	%xmm2, %xmm0
	movdqa	48(%rdi), %xmm2
	pcmpeqb	%xmm2, %xmm1
	pminub	%xmm2, %xmm0
	pminub	%xmm1, %xmm0
	pcmpeqb	%xmm6, %xmm0
	pmovmskb	%xmm0, %eax
	testl	%eax, %eax
	je	.L17			# all 64 bytes equal and non-NUL -> next block
	jmp	.L3			# rescan this block precisely via the header code
	.cfi_endproc
.LFE519:
	.size	strcmp_new, .-strcmp_new
	.ident	"GCC: (Debian 4.5.3-12) 4.5.3"
	.section	.note.GNU-stack,"",@progbits


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]