This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
Re: [PATCH][BZ #17801] Fix memcpy regression (five times slower on bulldozer.)
- From: "H.J. Lu" <hjl dot tools at gmail dot com>
- To: Ondřej Bílka <neleai at seznam dot cz>, GNU C Library <libc-alpha at sourceware dot org>
- Date: Fri, 30 Jan 2015 12:04:44 -0800
- Subject: Re: [PATCH][BZ #17801] Fix memcpy regression (five times slower on bulldozer.)
- Authentication-results: sourceware.org; auth=none
- References: <20150106142939 dot GB5835 at domone> <CAMe9rOo4tmQc0bJ1Z=pjURvPBBMndwQ-ynbHc=Mpz3eD=eqjbg at mail dot gmail dot com> <20150130145656 dot GA26219 at gmail dot com>
On Fri, Jan 30, 2015 at 6:56 AM, H.J. Lu <hjl.tools@gmail.com> wrote:
> On Tue, Jan 06, 2015 at 06:54:50AM -0800, H.J. Lu wrote:
>> On Tue, Jan 6, 2015 at 6:29 AM, Ondřej Bílka <neleai@seznam.cz> wrote:
>> > H. J, in this commit there slipped performance regression by review.
>> >
>> > commit 05f3633da4f9df870d04dd77336e793746e57ed4
>> > Author: Ling Ma <ling.ml@alibaba-inc.com>
>> > Date: Mon Jul 14 00:02:52 2014 -0400
>> >
>> > Improve 64bit memcpy performance for Haswell CPU with AVX
>> > instruction
>> >
>> >
>> > I seem to recall that I mentioned something about avx being typo and
>> > should be avx2 but did not look it further.
>> >
>> > As I assumed its avx2 only I was ok with that and haswell specific
>> > optimizations like using rep movsq. However ifunc checks for avx which
>> > is bad as we already know that avx loads/stores are slow on sandy
>> > bridge.
>> >
>> > Also testing on affected architectures would reveal that. Especially amd
>> > bulldozer where its five times slower on 2kb-16kb range, see
>> > http://kam.mff.cuni.cz/~ondra/benchmark_string/fx10/memcpy_profile_avx/results_rand/result.html
>> > because movsb is slow.
>> >
>> > On sandy bridge its only 20% regression on same range.
>> > http://kam.mff.cuni.cz/~ondra/benchmark_string/i7_ivy_bridge/memcpy_profile_avx/results_rand/result.html
>> >
>> >
>> > Also avx loop for 128-2024 bytes is slower there so there is no point
>> > using it.
>> >
>> > What about following change?
>> >
>> >
>> > * sysdeps/x86_64/multiarch/memcpy.S: Fix performance regression.
>> >
>> > diff --git a/sysdeps/x86_64/multiarch/memcpy.S b/sysdeps/x86_64/multiarch/memcpy.S
>> > index 992e40d..27f89e4 100644
>> > --- a/sysdeps/x86_64/multiarch/memcpy.S
>> > +++ b/sysdeps/x86_64/multiarch/memcpy.S
>> > @@ -32,10 +32,13 @@ ENTRY(__new_memcpy)
>> > cmpl $0, KIND_OFFSET+__cpu_features(%rip)
>> > jne 1f
>> > call __init_cpu_features
>> > +#ifdef HAVE_AVX2_SUPPORT
>> > 1: leaq __memcpy_avx_unaligned(%rip), %rax
>> > - testl $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
>> > + testl $bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
>> > +
>> > jz 1f
>> > ret
>> > +#endif
>> > 1: leaq __memcpy_sse2(%rip), %rax
>> > testl $bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
>> > jnz 2f
>>
>> Please add a new feature bit, bit_Fast_AVX_Unaligned_Load, and turn it
>> on together
>> with bit_AVX2_Usable.
>>
>
> I know we are in freeze. But I'd like to fix this regression in 2.21.
> OK for master?
Since this is a serious performance regression, I will check it in
before the end of the day unless I am told otherwise.
> Thanks.
>
>
> H.J.
> ---
> From 56d25c11b64a97255a115901d136d753c86de24e Mon Sep 17 00:00:00 2001
> From: "H.J. Lu" <hjl.tools@gmail.com>
> Date: Fri, 30 Jan 2015 06:50:20 -0800
> Subject: [PATCH] Use AVX unaligned memcpy only if AVX2 is available
>
> memcpy with unaligned 256-bit AVX register loads/stores are slow on older
> processors like Sandy Bridge. This patch adds bit_AVX_Fast_Unaligned_Load
> and sets it only when AVX2 is available.
>
> [BZ #17801]
> * sysdeps/x86_64/multiarch/init-arch.c (__init_cpu_features):
> Set the bit_AVX_Fast_Unaligned_Load bit for AVX2.
> * sysdeps/x86_64/multiarch/init-arch.h (bit_AVX_Fast_Unaligned_Load):
> New.
> (index_AVX_Fast_Unaligned_Load): Likewise.
> (HAS_AVX_FAST_UNALIGNED_LOAD): Likewise.
> * sysdeps/x86_64/multiarch/memcpy.S (__new_memcpy): Check the
> bit_AVX_Fast_Unaligned_Load bit instead of the bit_AVX_Usable bit.
> * sysdeps/x86_64/multiarch/memcpy_chk.S (__memcpy_chk): Likewise.
> * sysdeps/x86_64/multiarch/mempcpy.S (__mempcpy): Likewise.
> * sysdeps/x86_64/multiarch/mempcpy_chk.S (__mempcpy_chk): Likewise.
> * sysdeps/x86_64/multiarch/memmove.c (__libc_memmove): Replace
> HAS_AVX with HAS_AVX_FAST_UNALIGNED_LOAD.
> * sysdeps/x86_64/multiarch/memmove_chk.c (__memmove_chk): Likewise.
> ---
> ChangeLog | 18 ++++++++++++++++++
> sysdeps/x86_64/multiarch/init-arch.c | 9 +++++++--
> sysdeps/x86_64/multiarch/init-arch.h | 4 ++++
> sysdeps/x86_64/multiarch/memcpy.S | 2 +-
> sysdeps/x86_64/multiarch/memcpy_chk.S | 2 +-
> sysdeps/x86_64/multiarch/memmove.c | 2 +-
> sysdeps/x86_64/multiarch/memmove_chk.c | 2 +-
> sysdeps/x86_64/multiarch/mempcpy.S | 2 +-
> sysdeps/x86_64/multiarch/mempcpy_chk.S | 2 +-
> 9 files changed, 35 insertions(+), 8 deletions(-)
>
> diff --git a/ChangeLog b/ChangeLog
> index 26f7f3f..a696e39 100644
> --- a/ChangeLog
> +++ b/ChangeLog
> @@ -1,3 +1,21 @@
> +2015-01-30 H.J. Lu <hongjiu.lu@intel.com>
> +
> + [BZ #17801]
> + * sysdeps/x86_64/multiarch/init-arch.c (__init_cpu_features):
> + Set the bit_AVX_Fast_Unaligned_Load bit for AVX2.
> + * sysdeps/x86_64/multiarch/init-arch.h (bit_AVX_Fast_Unaligned_Load):
> + New.
> + (index_AVX_Fast_Unaligned_Load): Likewise.
> + (HAS_AVX_FAST_UNALIGNED_LOAD): Likewise.
> + * sysdeps/x86_64/multiarch/memcpy.S (__new_memcpy): Check the
> + bit_AVX_Fast_Unaligned_Load bit instead of the bit_AVX_Usable bit.
> + * sysdeps/x86_64/multiarch/memcpy_chk.S (__memcpy_chk): Likewise.
> + * sysdeps/x86_64/multiarch/mempcpy.S (__mempcpy): Likewise.
> + * sysdeps/x86_64/multiarch/mempcpy_chk.S (__mempcpy_chk): Likewise.
> + * sysdeps/x86_64/multiarch/memmove.c (__libc_memmove): Replace
> + HAS_AVX with HAS_AVX_FAST_UNALIGNED_LOAD.
> + * sysdeps/x86_64/multiarch/memmove_chk.c (__memmove_chk): Likewise.
> +
> 2015-01-29 Andreas Schwab <schwab@suse.de>
>
> * sysdeps/nptl/allocrtsig.c: Include <signal.h>.
> diff --git a/sysdeps/x86_64/multiarch/init-arch.c b/sysdeps/x86_64/multiarch/init-arch.c
> index 9299360..7dec218 100644
> --- a/sysdeps/x86_64/multiarch/init-arch.c
> +++ b/sysdeps/x86_64/multiarch/init-arch.c
> @@ -171,9 +171,14 @@ __init_cpu_features (void)
> /* Determine if AVX is usable. */
> if (CPUID_AVX)
> __cpu_features.feature[index_AVX_Usable] |= bit_AVX_Usable;
> - /* Determine if AVX2 is usable. */
> +#if index_AVX2_Usable != index_AVX_Fast_Unaligned_Load
> +# error index_AVX2_Usable != index_AVX_Fast_Unaligned_Load
> +#endif
> + /* Determine if AVX2 is usable. Unaligned load with 256-bit
> + AVX registers are faster on processors with AVX2. */
> if (CPUID_AVX2)
> - __cpu_features.feature[index_AVX2_Usable] |= bit_AVX2_Usable;
> + __cpu_features.feature[index_AVX2_Usable]
> + |= bit_AVX2_Usable | bit_AVX_Fast_Unaligned_Load;
> /* Determine if FMA is usable. */
> if (CPUID_FMA)
> __cpu_features.feature[index_FMA_Usable] |= bit_FMA_Usable;
> diff --git a/sysdeps/x86_64/multiarch/init-arch.h b/sysdeps/x86_64/multiarch/init-arch.h
> index 55f1c5b..e6b5ba5 100644
> --- a/sysdeps/x86_64/multiarch/init-arch.h
> +++ b/sysdeps/x86_64/multiarch/init-arch.h
> @@ -25,6 +25,7 @@
> #define bit_FMA4_Usable (1 << 8)
> #define bit_Slow_SSE4_2 (1 << 9)
> #define bit_AVX2_Usable (1 << 10)
> +#define bit_AVX_Fast_Unaligned_Load (1 << 11)
>
> /* CPUID Feature flags. */
>
> @@ -74,6 +75,7 @@
> # define index_FMA4_Usable FEATURE_INDEX_1*FEATURE_SIZE
> # define index_Slow_SSE4_2 FEATURE_INDEX_1*FEATURE_SIZE
> # define index_AVX2_Usable FEATURE_INDEX_1*FEATURE_SIZE
> +# define index_AVX_Fast_Unaligned_Load FEATURE_INDEX_1*FEATURE_SIZE
>
> #else /* __ASSEMBLER__ */
>
> @@ -169,6 +171,7 @@ extern const struct cpu_features *__get_cpu_features (void)
> # define index_FMA4_Usable FEATURE_INDEX_1
> # define index_Slow_SSE4_2 FEATURE_INDEX_1
> # define index_AVX2_Usable FEATURE_INDEX_1
> +# define index_AVX_Fast_Unaligned_Load FEATURE_INDEX_1
>
> # define HAS_ARCH_FEATURE(name) \
> ((__get_cpu_features ()->feature[index_##name] & (bit_##name)) != 0)
> @@ -181,5 +184,6 @@ extern const struct cpu_features *__get_cpu_features (void)
> # define HAS_AVX2 HAS_ARCH_FEATURE (AVX2_Usable)
> # define HAS_FMA HAS_ARCH_FEATURE (FMA_Usable)
> # define HAS_FMA4 HAS_ARCH_FEATURE (FMA4_Usable)
> +# define HAS_AVX_FAST_UNALIGNED_LOAD HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
>
> #endif /* __ASSEMBLER__ */
> diff --git a/sysdeps/x86_64/multiarch/memcpy.S b/sysdeps/x86_64/multiarch/memcpy.S
> index 992e40d..4e18cd3 100644
> --- a/sysdeps/x86_64/multiarch/memcpy.S
> +++ b/sysdeps/x86_64/multiarch/memcpy.S
> @@ -33,7 +33,7 @@ ENTRY(__new_memcpy)
> jne 1f
> call __init_cpu_features
> 1: leaq __memcpy_avx_unaligned(%rip), %rax
> - testl $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
> + testl $bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip)
> jz 1f
> ret
> 1: leaq __memcpy_sse2(%rip), %rax
> diff --git a/sysdeps/x86_64/multiarch/memcpy_chk.S b/sysdeps/x86_64/multiarch/memcpy_chk.S
> index 5e9cf00..1e756ea 100644
> --- a/sysdeps/x86_64/multiarch/memcpy_chk.S
> +++ b/sysdeps/x86_64/multiarch/memcpy_chk.S
> @@ -39,7 +39,7 @@ ENTRY(__memcpy_chk)
> testl $bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
> jz 2f
> leaq __memcpy_chk_ssse3_back(%rip), %rax
> - testl $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
> + testl $bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip)
> jz 2f
> leaq __memcpy_chk_avx_unaligned(%rip), %rax
> 2: ret
> diff --git a/sysdeps/x86_64/multiarch/memmove.c b/sysdeps/x86_64/multiarch/memmove.c
> index d93bfd0..dd153a3 100644
> --- a/sysdeps/x86_64/multiarch/memmove.c
> +++ b/sysdeps/x86_64/multiarch/memmove.c
> @@ -49,7 +49,7 @@ extern __typeof (__redirect_memmove) __memmove_avx_unaligned attribute_hidden;
> ifunc symbol properly. */
> extern __typeof (__redirect_memmove) __libc_memmove;
> libc_ifunc (__libc_memmove,
> - HAS_AVX
> + HAS_AVX_FAST_UNALIGNED_LOAD
> ? __memmove_avx_unaligned
> : (HAS_SSSE3
> ? (HAS_FAST_COPY_BACKWARD
> diff --git a/sysdeps/x86_64/multiarch/memmove_chk.c b/sysdeps/x86_64/multiarch/memmove_chk.c
> index 743ca2a..8b12d00 100644
> --- a/sysdeps/x86_64/multiarch/memmove_chk.c
> +++ b/sysdeps/x86_64/multiarch/memmove_chk.c
> @@ -30,7 +30,7 @@ extern __typeof (__memmove_chk) __memmove_chk_avx_unaligned attribute_hidden;
> #include "debug/memmove_chk.c"
>
> libc_ifunc (__memmove_chk,
> - HAS_AVX ? __memmove_chk_avx_unaligned :
> + HAS_AVX_FAST_UNALIGNED_LOAD ? __memmove_chk_avx_unaligned :
> (HAS_SSSE3
> ? (HAS_FAST_COPY_BACKWARD
> ? __memmove_chk_ssse3_back : __memmove_chk_ssse3)
> diff --git a/sysdeps/x86_64/multiarch/mempcpy.S b/sysdeps/x86_64/multiarch/mempcpy.S
> index cdf1dab..2eaacdf 100644
> --- a/sysdeps/x86_64/multiarch/mempcpy.S
> +++ b/sysdeps/x86_64/multiarch/mempcpy.S
> @@ -37,7 +37,7 @@ ENTRY(__mempcpy)
> testl $bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
> jz 2f
> leaq __mempcpy_ssse3_back(%rip), %rax
> - testl $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
> + testl $bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip)
> jz 2f
> leaq __mempcpy_avx_unaligned(%rip), %rax
> 2: ret
> diff --git a/sysdeps/x86_64/multiarch/mempcpy_chk.S b/sysdeps/x86_64/multiarch/mempcpy_chk.S
> index b7f9e89..17b8470 100644
> --- a/sysdeps/x86_64/multiarch/mempcpy_chk.S
> +++ b/sysdeps/x86_64/multiarch/mempcpy_chk.S
> @@ -39,7 +39,7 @@ ENTRY(__mempcpy_chk)
> testl $bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
> jz 2f
> leaq __mempcpy_chk_ssse3_back(%rip), %rax
> - testl $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
> + testl $bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip)
> jz 2f
> leaq __mempcpy_chk_avx_unaligned(%rip), %rax
> 2: ret
> --
> 2.1.0
>
--
H.J.