This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH] Fix rawmemchr regression on bulldozer


Ping

On Mon, Aug 05, 2013 at 02:26:24PM +0200, Ondřej Bílka wrote:
> Hi,
> 
> We now have a huge performance regression of rawmemchr on Bulldozer.
> We select __rawmemchr_sse42, which is about five times slower than
> the alternative __rawmemchr_sse2. Our benchtests can capture asymptotic
> speed relatively well, and their output on AMD FX(tm)-8150 is below.
> 
> 
> On Intel the gap between __rawmemchr_sse42 and __rawmemchr_sse2 is similar,
> but the bit_Prefer_PMINUB_for_stringop flag selects __rawmemchr_sse2.
> 
> After this, bit_Prefer_PMINUB_for_stringop will no longer be needed
> and can be removed in a separate cleanup.
> 
> OK to commit?
> 
>                       simple_rawmemchr  __rawmemchr_sse42 __rawmemchr_sse2
> Length   32, alignment  0:  264 108 75
> Length   64, alignment  1:  431 193 95
> Length   32, alignment  0:  254 138 62
> Length   64, alignment  1:  431 193 69
> Length   64, alignment  0:  419 193 85
> Length   64, alignment  2:  419 177 77
> Length   64, alignment  0:  419 177 85
> Length   64, alignment  2:  419 164 69
> Length  128, alignment  0:  748 298 93
> Length   64, alignment  3:  419 164 92
> Length  128, alignment  0:  751 298 118
> Length   64, alignment  3:  422 180 59
> Length  256, alignment  0:  1407  628 131
> Length   64, alignment  4:  419 192 72
> Length  256, alignment  0:  1409  642 131
> Length   64, alignment  4:  422 192 56
> Length  512, alignment  0:  2723  1111  183
> Length   64, alignment  5:  419 193 59
> Length  512, alignment  0:  2725  1111  190
> Length   64, alignment  5:  422 193 84
> Length 1024, alignment  0:  5356  2026  417
> Length   64, alignment  6:  419 192 84
> Length 1024, alignment  0:  5359  2026  388
> 
> 
> 
> 	* sysdeps/x86_64/multiarch/rawmemchr.S: Delete.
> 
> ---
>  sysdeps/x86_64/multiarch/rawmemchr.S | 103 -----------------------------------
>  1 file changed, 103 deletions(-)
>  delete mode 100644 sysdeps/x86_64/multiarch/rawmemchr.S
> 
> diff --git a/sysdeps/x86_64/multiarch/rawmemchr.S b/sysdeps/x86_64/multiarch/rawmemchr.S
> deleted file mode 100644
> index 50de38f..0000000
> --- a/sysdeps/x86_64/multiarch/rawmemchr.S
> +++ /dev/null
> @@ -1,103 +0,0 @@
> -/* Multiple versions of rawmemchr
> -   All versions must be listed in ifunc-impl-list.c.
> -   Copyright (C) 2009-2013 Free Software Foundation, Inc.
> -   Contributed by Ulrich Drepper <drepper@redhat.com>.
> -   This file is part of the GNU C Library.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <http://www.gnu.org/licenses/>.  */
> -
> -#include <sysdep.h>
> -#include <init-arch.h>
> -
> -
> -/* Define multiple versions only for the definition in lib.  */
> -#ifndef NOT_IN_libc
> -	.text
> -ENTRY(rawmemchr)
> -	.type	rawmemchr, @gnu_indirect_function
> -	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
> -	jne	1f
> -	call	__init_cpu_features
> -1:	testl	$bit_Prefer_PMINUB_for_stringop, __cpu_features+FEATURE_OFFSET+index_Prefer_PMINUB_for_stringop(%rip)
> -	jnz	2f
> -	testl	$bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
> -	jz	2f
> -	leaq	__rawmemchr_sse42(%rip), %rax
> -	ret
> -2:	leaq	__rawmemchr_sse2(%rip), %rax
> -	ret
> -
> -END(rawmemchr)
> -strong_alias (rawmemchr, __rawmemchr)
> -
> -
> -	.section .text.sse4.2,"ax",@progbits
> -	.align	16
> -	.type	__rawmemchr_sse42, @function
> -	.globl __rawmemchr_sse42
> -	.hidden __rawmemchr_sse42
> -__rawmemchr_sse42:
> -	cfi_startproc
> -	CALL_MCOUNT
> -	movd	%esi, %xmm1
> -	movq	%rdi, %rcx
> -	pxor	%xmm2, %xmm2
> -	andq	$~15, %rdi
> -	orl	$0xffffffff, %esi
> -	pshufb	%xmm2, %xmm1
> -	movdqa	(%rdi), %xmm0
> -	subq	%rdi, %rcx
> -	pcmpeqb	%xmm1, %xmm0
> -	shl	%cl, %esi
> -	pmovmskb %xmm0, %ecx
> -	movl	$16, %eax
> -	movl	$16, %edx
> -	andl	%esi, %ecx
> -	jnz	1f
> -
> -2:	pcmpestri $0x08, 16(%rdi), %xmm1
> -	leaq	16(%rdi), %rdi
> -	jnc	2b
> -
> -	leaq	(%rdi,%rcx), %rax
> -	ret
> -
> -1:	bsfl	%ecx, %eax
> -	addq	%rdi, %rax
> -	ret
> -	cfi_endproc
> -	.size	__rawmemchr_sse42, .-__rawmemchr_sse42
> -
> -
> -# undef ENTRY
> -# define ENTRY(name) \
> -	.type __rawmemchr_sse2, @function; \
> -	.align 16; \
> -	.globl __rawmemchr_sse2; \
> -	.hidden __rawmemchr_sse2; \
> -	__rawmemchr_sse2: cfi_startproc; \
> -	CALL_MCOUNT
> -# undef END
> -# define END(name) \
> -	cfi_endproc; .size __rawmemchr_sse2, .-__rawmemchr_sse2
> -# undef libc_hidden_builtin_def
> -/* It doesn't make sense to send libc-internal rawmemchr calls through a PLT.
> -   The speedup we get from using SSE4.2 instruction is likely eaten away
> -   by the indirect call in the PLT.  */
> -# define libc_hidden_builtin_def(name) \
> -	.globl __GI___rawmemchr; __GI___rawmemchr = __rawmemchr_sse2
> -#endif
> -
> -#include "../rawmemchr.S"
> -- 
> 1.8.3.2


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]