This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
Re: [PATCH] Fix rawmemchr regression on bulldozer
- From: Ondřej Bílka <neleai at seznam dot cz>
- To: libc-alpha at sourceware dot org
- Date: Mon, 26 Aug 2013 09:30:04 +0200
- Subject: Re: [PATCH] Fix rawmemchr regression on bulldozer
- Authentication-results: sourceware.org; auth=none
- References: <20130805122624 dot GA4682 at domone dot kolej dot mff dot cuni dot cz>
Ping
On Mon, Aug 05, 2013 at 02:26:24PM +0200, Ondřej Bílka wrote:
> Hi,
>
> We now have a huge performance regression in rawmemchr on Bulldozer.
> We select __rawmemchr_sse42, which is about five times slower than
> the alternative __rawmemchr_sse2. Our benchtests capture asymptotic
> speed relatively well; their output on an AMD FX(tm)-8150 is below.
>
>
> On Intel the gap between __rawmemchr_sse42 and __rawmemchr_sse2 is similar,
> but there the bit_Prefer_PMINUB_for_stringop flag selects __rawmemchr_sse2.
>
> After this change, bit_Prefer_PMINUB_for_stringop will no longer be needed
> and can be removed in a separate cleanup.
>
> OK to commit?
>
> simple_rawmemchr __rawmemchr_sse42 __rawmemchr_sse2
> Length 32, alignment 0: 264 108 75
> Length 64, alignment 1: 431 193 95
> Length 32, alignment 0: 254 138 62
> Length 64, alignment 1: 431 193 69
> Length 64, alignment 0: 419 193 85
> Length 64, alignment 2: 419 177 77
> Length 64, alignment 0: 419 177 85
> Length 64, alignment 2: 419 164 69
> Length 128, alignment 0: 748 298 93
> Length 64, alignment 3: 419 164 92
> Length 128, alignment 0: 751 298 118
> Length 64, alignment 3: 422 180 59
> Length 256, alignment 0: 1407 628 131
> Length 64, alignment 4: 419 192 72
> Length 256, alignment 0: 1409 642 131
> Length 64, alignment 4: 422 192 56
> Length 512, alignment 0: 2723 1111 183
> Length 64, alignment 5: 419 193 59
> Length 512, alignment 0: 2725 1111 190
> Length 64, alignment 5: 422 193 84
> Length 1024, alignment 0: 5356 2026 417
> Length 64, alignment 6: 419 192 84
> Length 1024, alignment 0: 5359 2026 388
>
>
>
> * sysdeps/x86_64/multiarch/rawmemchr.S: Delete.
>
> ---
> sysdeps/x86_64/multiarch/rawmemchr.S | 103 -----------------------------------
> 1 file changed, 103 deletions(-)
> delete mode 100644 sysdeps/x86_64/multiarch/rawmemchr.S
>
> diff --git a/sysdeps/x86_64/multiarch/rawmemchr.S b/sysdeps/x86_64/multiarch/rawmemchr.S
> deleted file mode 100644
> index 50de38f..0000000
> --- a/sysdeps/x86_64/multiarch/rawmemchr.S
> +++ /dev/null
> @@ -1,103 +0,0 @@
> -/* Multiple versions of rawmemchr
> - All versions must be listed in ifunc-impl-list.c.
> - Copyright (C) 2009-2013 Free Software Foundation, Inc.
> - Contributed by Ulrich Drepper <drepper@redhat.com>.
> - This file is part of the GNU C Library.
> -
> - The GNU C Library is free software; you can redistribute it and/or
> - modify it under the terms of the GNU Lesser General Public
> - License as published by the Free Software Foundation; either
> - version 2.1 of the License, or (at your option) any later version.
> -
> - The GNU C Library is distributed in the hope that it will be useful,
> - but WITHOUT ANY WARRANTY; without even the implied warranty of
> - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> - Lesser General Public License for more details.
> -
> - You should have received a copy of the GNU Lesser General Public
> - License along with the GNU C Library; if not, see
> - <http://www.gnu.org/licenses/>. */
> -
> -#include <sysdep.h>
> -#include <init-arch.h>
> -
> -
> -/* Define multiple versions only for the definition in lib. */
> -#ifndef NOT_IN_libc
> - .text
> -ENTRY(rawmemchr)
> - .type rawmemchr, @gnu_indirect_function
> - cmpl $0, __cpu_features+KIND_OFFSET(%rip)
> - jne 1f
> - call __init_cpu_features
> -1: testl $bit_Prefer_PMINUB_for_stringop, __cpu_features+FEATURE_OFFSET+index_Prefer_PMINUB_for_stringop(%rip)
> - jnz 2f
> - testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
> - jz 2f
> - leaq __rawmemchr_sse42(%rip), %rax
> - ret
> -2: leaq __rawmemchr_sse2(%rip), %rax
> - ret
> -
> -END(rawmemchr)
> -strong_alias (rawmemchr, __rawmemchr)
> -
> -
> - .section .text.sse4.2,"ax",@progbits
> - .align 16
> - .type __rawmemchr_sse42, @function
> - .globl __rawmemchr_sse42
> - .hidden __rawmemchr_sse42
> -__rawmemchr_sse42:
> - cfi_startproc
> - CALL_MCOUNT
> - movd %esi, %xmm1
> - movq %rdi, %rcx
> - pxor %xmm2, %xmm2
> - andq $~15, %rdi
> - orl $0xffffffff, %esi
> - pshufb %xmm2, %xmm1
> - movdqa (%rdi), %xmm0
> - subq %rdi, %rcx
> - pcmpeqb %xmm1, %xmm0
> - shl %cl, %esi
> - pmovmskb %xmm0, %ecx
> - movl $16, %eax
> - movl $16, %edx
> - andl %esi, %ecx
> - jnz 1f
> -
> -2: pcmpestri $0x08, 16(%rdi), %xmm1
> - leaq 16(%rdi), %rdi
> - jnc 2b
> -
> - leaq (%rdi,%rcx), %rax
> - ret
> -
> -1: bsfl %ecx, %eax
> - addq %rdi, %rax
> - ret
> - cfi_endproc
> - .size __rawmemchr_sse42, .-__rawmemchr_sse42
> -
> -
> -# undef ENTRY
> -# define ENTRY(name) \
> - .type __rawmemchr_sse2, @function; \
> - .align 16; \
> - .globl __rawmemchr_sse2; \
> - .hidden __rawmemchr_sse2; \
> - __rawmemchr_sse2: cfi_startproc; \
> - CALL_MCOUNT
> -# undef END
> -# define END(name) \
> - cfi_endproc; .size __rawmemchr_sse2, .-__rawmemchr_sse2
> -# undef libc_hidden_builtin_def
> -/* It doesn't make sense to send libc-internal rawmemchr calls through a PLT.
> - The speedup we get from using SSE4.2 instruction is likely eaten away
> - by the indirect call in the PLT. */
> -# define libc_hidden_builtin_def(name) \
> - .globl __GI___rawmemchr; __GI___rawmemchr = __rawmemchr_sse2
> -#endif
> -
> -#include "../rawmemchr.S"
> --
> 1.8.3.2