This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.



Re: [PATCH][BZ #17801] Fix memcpy regression (five times slower on bulldozer.)


On Fri, Jan 30, 2015 at 6:56 AM, H.J. Lu <hjl.tools@gmail.com> wrote:
> On Tue, Jan 06, 2015 at 06:54:50AM -0800, H.J. Lu wrote:
>> On Tue, Jan 6, 2015 at 6:29 AM, Ondřej Bílka <neleai@seznam.cz> wrote:
>> > H.J., a performance regression in this commit slipped through review.
>> >
>> > commit 05f3633da4f9df870d04dd77336e793746e57ed4
>> > Author: Ling Ma <ling.ml@alibaba-inc.com>
>> > Date:   Mon Jul 14 00:02:52 2014 -0400
>> >
>> >     Improve 64bit memcpy performance for Haswell CPU with AVX
>> > instruction
>> >
>> >
>> > I seem to recall that I mentioned avx being a typo that should be avx2,
>> > but I did not look into it further.
>> >
>> > As I assumed it was avx2-only, I was ok with that and with Haswell-specific
>> > optimizations like using rep movsq. However, the ifunc checks for avx, which
>> > is bad, as we already know that avx loads/stores are slow on Sandy
>> > Bridge.
>> >
>> > Also, testing on the affected architectures would have revealed that,
>> > especially AMD Bulldozer, where it is five times slower on the 2kb-16kb
>> > range because movsb is slow; see
>> > http://kam.mff.cuni.cz/~ondra/benchmark_string/fx10/memcpy_profile_avx/results_rand/result.html
>> >
>> > On Sandy Bridge it is only a 20% regression on the same range.
>> > http://kam.mff.cuni.cz/~ondra/benchmark_string/i7_ivy_bridge/memcpy_profile_avx/results_rand/result.html
>> >
>> >
>> > Also, the avx loop for 128-2024 bytes is slower there, so there is no
>> > point in using it.
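
For anyone who wants to spot-check this on their own hardware, a rough
standalone timing loop like the one below is enough to see the effect.  This
is not the harness behind the results linked above; the sizes, iteration
count, and compile command (something like gcc -std=gnu99 -O2 -fno-builtin,
plus -lrt on pre-2.17 glibc, so the call always goes through the installed
glibc memcpy) are only illustrative.

  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>
  #include <time.h>

  int
  main (void)
  {
    size_t max = 16384;
    char *src = malloc (max), *dst = malloc (max);
    memset (src, 1, max);
    /* Time memcpy over the 2kb-16kb range discussed above.  */
    for (size_t n = 2048; n <= max; n *= 2)
      {
        struct timespec t0, t1;
        clock_gettime (CLOCK_MONOTONIC, &t0);
        for (int i = 0; i < 100000; i++)
          memcpy (dst, src, n);
        clock_gettime (CLOCK_MONOTONIC, &t1);
        double ns = (t1.tv_sec - t0.tv_sec) * 1e9 + (t1.tv_nsec - t0.tv_nsec);
        printf ("%6zu bytes: %.1f ns per copy\n", n, ns / 100000);
      }
    free (src);
    free (dst);
    return 0;
  }
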
>> >
>> > What about the following change?
>> >
>> >
>> >         * sysdeps/x86_64/multiarch/memcpy.S: Fix performance regression.
>> >
>> > diff --git a/sysdeps/x86_64/multiarch/memcpy.S b/sysdeps/x86_64/multiarch/memcpy.S
>> > index 992e40d..27f89e4 100644
>> > --- a/sysdeps/x86_64/multiarch/memcpy.S
>> > +++ b/sysdeps/x86_64/multiarch/memcpy.S
>> > @@ -32,10 +32,13 @@ ENTRY(__new_memcpy)
>> >         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
>> >         jne     1f
>> >         call    __init_cpu_features
>> > +#ifdef HAVE_AVX2_SUPPORT
>> >  1:     leaq    __memcpy_avx_unaligned(%rip), %rax
>> > -       testl   $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
>> > +       testl   $bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
>> > +
>> >         jz 1f
>> >         ret
>> > +#endif
>> >  1:     leaq    __memcpy_sse2(%rip), %rax
>> >         testl   $bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
>> >         jnz     2f
>>
>> Please add a new feature bit, bit_Fast_AVX_Unaligned_Load, and turn it
>> on together
>> with bit_AVX2_Usable.
>>
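
A quick, self-contained way to see which camp a given machine falls into,
without poking at glibc's internal feature words, is the GCC/Clang
__builtin_cpu_supports builtin.  It does its own CPUID and XGETBV checks and
only approximates what __init_cpu_features computes, so treat the following
as a rough sketch rather than the glibc logic itself:

  #include <stdio.h>

  int
  main (void)
  {
    if (__builtin_cpu_supports ("avx2"))
      puts ("AVX2 usable: unaligned 256-bit copies expected to be a win");
    else if (__builtin_cpu_supports ("avx"))
      puts ("AVX but no AVX2 (e.g. Sandy Bridge, Bulldozer): AVX memcpy is a bad pick");
    else
      puts ("no AVX");
    return 0;
  }
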
>
> I know we are in freeze.  But I'd like to fix this regression in 2.21.
> OK for master?

Since this is a serious performance regression, I will check it in
before the end of the day unless I am told otherwise.
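
The code being adjusted here is an IFUNC resolver, and the whole fix amounts
to changing which feature bit that resolver tests.  A self-contained toy
version of the same idea, using the GCC ifunc attribute and
__builtin_cpu_supports instead of glibc's internal bits (all names below are
made up for the example; it builds on x86-64 GNU/Linux with GCC), looks like
this:

  #include <stdio.h>
  #include <string.h>

  typedef void *(*copy_fn) (void *, const void *, size_t);

  /* Two stand-ins for __memcpy_avx_unaligned and __memcpy_sse2; the
     names and bodies are made up for this example.  */
  static void *copy_avx2_path (void *d, const void *s, size_t n)
  { return memcpy (d, s, n); }

  static void *copy_baseline (void *d, const void *s, size_t n)
  { return memcpy (d, s, n); }

  /* The resolver runs once, during relocation, much like glibc's
     __new_memcpy stub; gate on AVX2, not on plain AVX.  */
  static copy_fn
  resolve_copy (void)
  {
    __builtin_cpu_init ();  /* needed: resolvers run before constructors */
    return __builtin_cpu_supports ("avx2") ? copy_avx2_path : copy_baseline;
  }

  void *my_copy (void *, const void *, size_t)
    __attribute__ ((ifunc ("resolve_copy")));

  int
  main (void)
  {
    char buf[4];
    my_copy (buf, "hi", 3);
    puts (buf);
    return 0;
  }

The real resolvers in memcpy.S and memmove.c in the patch below do the same
thing with the new bit_AVX_Fast_Unaligned_Load / HAS_AVX_FAST_UNALIGNED_LOAD
checks.
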


> Thanks.
>
>
> H.J.
> ---
> From 56d25c11b64a97255a115901d136d753c86de24e Mon Sep 17 00:00:00 2001
> From: "H.J. Lu" <hjl.tools@gmail.com>
> Date: Fri, 30 Jan 2015 06:50:20 -0800
> Subject: [PATCH] Use AVX unaligned memcpy only if AVX2 is available
>
> memcpy with unaligned 256-bit AVX register loads/stores is slow on older
> processors like Sandy Bridge.  This patch adds bit_AVX_Fast_Unaligned_Load
> and sets it only when AVX2 is available.
>
>         [BZ #17801]
>         * sysdeps/x86_64/multiarch/init-arch.c (__init_cpu_features):
>         Set the bit_AVX_Fast_Unaligned_Load bit for AVX2.
>         * sysdeps/x86_64/multiarch/init-arch.h (bit_AVX_Fast_Unaligned_Load):
>         New.
>         (index_AVX_Fast_Unaligned_Load): Likewise.
>         (HAS_AVX_FAST_UNALIGNED_LOAD): Likewise.
>         * sysdeps/x86_64/multiarch/memcpy.S (__new_memcpy): Check the
>         bit_AVX_Fast_Unaligned_Load bit instead of the bit_AVX_Usable bit.
>         * sysdeps/x86_64/multiarch/memcpy_chk.S (__memcpy_chk): Likewise.
>         * sysdeps/x86_64/multiarch/mempcpy.S (__mempcpy): Likewise.
>         * sysdeps/x86_64/multiarch/mempcpy_chk.S (__mempcpy_chk): Likewise.
>         * sysdeps/x86_64/multiarch/memmove.c (__libc_memmove): Replace
>         HAS_AVX with HAS_AVX_FAST_UNALIGNED_LOAD.
>         * sysdeps/x86_64/multiarch/memmove_chk.c (__memmove_chk): Likewise.
> ---
>  ChangeLog                              | 18 ++++++++++++++++++
>  sysdeps/x86_64/multiarch/init-arch.c   |  9 +++++++--
>  sysdeps/x86_64/multiarch/init-arch.h   |  4 ++++
>  sysdeps/x86_64/multiarch/memcpy.S      |  2 +-
>  sysdeps/x86_64/multiarch/memcpy_chk.S  |  2 +-
>  sysdeps/x86_64/multiarch/memmove.c     |  2 +-
>  sysdeps/x86_64/multiarch/memmove_chk.c |  2 +-
>  sysdeps/x86_64/multiarch/mempcpy.S     |  2 +-
>  sysdeps/x86_64/multiarch/mempcpy_chk.S |  2 +-
>  9 files changed, 35 insertions(+), 8 deletions(-)
>
> diff --git a/ChangeLog b/ChangeLog
> index 26f7f3f..a696e39 100644
> --- a/ChangeLog
> +++ b/ChangeLog
> @@ -1,3 +1,21 @@
> +2015-01-30  H.J. Lu  <hongjiu.lu@intel.com>
> +
> +       [BZ #17801]
> +       * sysdeps/x86_64/multiarch/init-arch.c (__init_cpu_features):
> +       Set the bit_AVX_Fast_Unaligned_Load bit for AVX2.
> +       * sysdeps/x86_64/multiarch/init-arch.h (bit_AVX_Fast_Unaligned_Load):
> +       New.
> +       (index_AVX_Fast_Unaligned_Load): Likewise.
> +       (HAS_AVX_FAST_UNALIGNED_LOAD): Likewise.
> +       * sysdeps/x86_64/multiarch/memcpy.S (__new_memcpy): Check the
> +       bit_AVX_Fast_Unaligned_Load bit instead of the bit_AVX_Usable bit.
> +       * sysdeps/x86_64/multiarch/memcpy_chk.S (__memcpy_chk): Likewise.
> +       * sysdeps/x86_64/multiarch/mempcpy.S (__mempcpy): Likewise.
> +       * sysdeps/x86_64/multiarch/mempcpy_chk.S (__mempcpy_chk): Likewise.
> +       * sysdeps/x86_64/multiarch/memmove.c (__libc_memmove): Replace
> +       HAS_AVX with HAS_AVX_FAST_UNALIGNED_LOAD.
> +       * sysdeps/x86_64/multiarch/memmove_chk.c (__memmove_chk): Likewise.
> +
>  2015-01-29  Andreas Schwab  <schwab@suse.de>
>
>         * sysdeps/nptl/allocrtsig.c: Include <signal.h>.
> diff --git a/sysdeps/x86_64/multiarch/init-arch.c b/sysdeps/x86_64/multiarch/init-arch.c
> index 9299360..7dec218 100644
> --- a/sysdeps/x86_64/multiarch/init-arch.c
> +++ b/sysdeps/x86_64/multiarch/init-arch.c
> @@ -171,9 +171,14 @@ __init_cpu_features (void)
>           /* Determine if AVX is usable.  */
>           if (CPUID_AVX)
>             __cpu_features.feature[index_AVX_Usable] |= bit_AVX_Usable;
> -         /* Determine if AVX2 is usable.  */
> +#if index_AVX2_Usable != index_AVX_Fast_Unaligned_Load
> +# error index_AVX2_Usable != index_AVX_Fast_Unaligned_Load
> +#endif
> +         /* Determine if AVX2 is usable.  Unaligned load with 256-bit
> +            AVX registers are faster on processors with AVX2.  */
>           if (CPUID_AVX2)
> -           __cpu_features.feature[index_AVX2_Usable] |= bit_AVX2_Usable;
> +           __cpu_features.feature[index_AVX2_Usable]
> +             |= bit_AVX2_Usable | bit_AVX_Fast_Unaligned_Load;
>           /* Determine if FMA is usable.  */
>           if (CPUID_FMA)
>             __cpu_features.feature[index_FMA_Usable] |= bit_FMA_Usable;
> diff --git a/sysdeps/x86_64/multiarch/init-arch.h b/sysdeps/x86_64/multiarch/init-arch.h
> index 55f1c5b..e6b5ba5 100644
> --- a/sysdeps/x86_64/multiarch/init-arch.h
> +++ b/sysdeps/x86_64/multiarch/init-arch.h
> @@ -25,6 +25,7 @@
>  #define bit_FMA4_Usable                        (1 << 8)
>  #define bit_Slow_SSE4_2                        (1 << 9)
>  #define bit_AVX2_Usable                        (1 << 10)
> +#define bit_AVX_Fast_Unaligned_Load    (1 << 11)
>
>  /* CPUID Feature flags.  */
>
> @@ -74,6 +75,7 @@
>  # define index_FMA4_Usable             FEATURE_INDEX_1*FEATURE_SIZE
>  # define index_Slow_SSE4_2             FEATURE_INDEX_1*FEATURE_SIZE
>  # define index_AVX2_Usable             FEATURE_INDEX_1*FEATURE_SIZE
> +# define index_AVX_Fast_Unaligned_Load FEATURE_INDEX_1*FEATURE_SIZE
>
>  #else  /* __ASSEMBLER__ */
>
> @@ -169,6 +171,7 @@ extern const struct cpu_features *__get_cpu_features (void)
>  # define index_FMA4_Usable             FEATURE_INDEX_1
>  # define index_Slow_SSE4_2             FEATURE_INDEX_1
>  # define index_AVX2_Usable             FEATURE_INDEX_1
> +# define index_AVX_Fast_Unaligned_Load FEATURE_INDEX_1
>
>  # define HAS_ARCH_FEATURE(name) \
>    ((__get_cpu_features ()->feature[index_##name] & (bit_##name)) != 0)
> @@ -181,5 +184,6 @@ extern const struct cpu_features *__get_cpu_features (void)
>  # define HAS_AVX2                      HAS_ARCH_FEATURE (AVX2_Usable)
>  # define HAS_FMA                       HAS_ARCH_FEATURE (FMA_Usable)
>  # define HAS_FMA4                      HAS_ARCH_FEATURE (FMA4_Usable)
> +# define HAS_AVX_FAST_UNALIGNED_LOAD   HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
>
>  #endif /* __ASSEMBLER__ */
> diff --git a/sysdeps/x86_64/multiarch/memcpy.S b/sysdeps/x86_64/multiarch/memcpy.S
> index 992e40d..4e18cd3 100644
> --- a/sysdeps/x86_64/multiarch/memcpy.S
> +++ b/sysdeps/x86_64/multiarch/memcpy.S
> @@ -33,7 +33,7 @@ ENTRY(__new_memcpy)
>         jne     1f
>         call    __init_cpu_features
>  1:     leaq    __memcpy_avx_unaligned(%rip), %rax
> -       testl   $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
> +       testl   $bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip)
>         jz 1f
>         ret
>  1:     leaq    __memcpy_sse2(%rip), %rax
> diff --git a/sysdeps/x86_64/multiarch/memcpy_chk.S b/sysdeps/x86_64/multiarch/memcpy_chk.S
> index 5e9cf00..1e756ea 100644
> --- a/sysdeps/x86_64/multiarch/memcpy_chk.S
> +++ b/sysdeps/x86_64/multiarch/memcpy_chk.S
> @@ -39,7 +39,7 @@ ENTRY(__memcpy_chk)
>         testl   $bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
>         jz      2f
>         leaq    __memcpy_chk_ssse3_back(%rip), %rax
> -       testl   $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
> +       testl   $bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip)
>         jz  2f
>         leaq    __memcpy_chk_avx_unaligned(%rip), %rax
>  2:     ret
> diff --git a/sysdeps/x86_64/multiarch/memmove.c b/sysdeps/x86_64/multiarch/memmove.c
> index d93bfd0..dd153a3 100644
> --- a/sysdeps/x86_64/multiarch/memmove.c
> +++ b/sysdeps/x86_64/multiarch/memmove.c
> @@ -49,7 +49,7 @@ extern __typeof (__redirect_memmove) __memmove_avx_unaligned attribute_hidden;
>     ifunc symbol properly.  */
>  extern __typeof (__redirect_memmove) __libc_memmove;
>  libc_ifunc (__libc_memmove,
> -           HAS_AVX
> +           HAS_AVX_FAST_UNALIGNED_LOAD
>             ? __memmove_avx_unaligned
>             : (HAS_SSSE3
>                ? (HAS_FAST_COPY_BACKWARD
> diff --git a/sysdeps/x86_64/multiarch/memmove_chk.c b/sysdeps/x86_64/multiarch/memmove_chk.c
> index 743ca2a..8b12d00 100644
> --- a/sysdeps/x86_64/multiarch/memmove_chk.c
> +++ b/sysdeps/x86_64/multiarch/memmove_chk.c
> @@ -30,7 +30,7 @@ extern __typeof (__memmove_chk) __memmove_chk_avx_unaligned attribute_hidden;
>  #include "debug/memmove_chk.c"
>
>  libc_ifunc (__memmove_chk,
> -           HAS_AVX ? __memmove_chk_avx_unaligned :
> +           HAS_AVX_FAST_UNALIGNED_LOAD ? __memmove_chk_avx_unaligned :
>             (HAS_SSSE3
>             ? (HAS_FAST_COPY_BACKWARD
>                ? __memmove_chk_ssse3_back : __memmove_chk_ssse3)
> diff --git a/sysdeps/x86_64/multiarch/mempcpy.S b/sysdeps/x86_64/multiarch/mempcpy.S
> index cdf1dab..2eaacdf 100644
> --- a/sysdeps/x86_64/multiarch/mempcpy.S
> +++ b/sysdeps/x86_64/multiarch/mempcpy.S
> @@ -37,7 +37,7 @@ ENTRY(__mempcpy)
>         testl   $bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
>         jz      2f
>         leaq    __mempcpy_ssse3_back(%rip), %rax
> -       testl   $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
> +       testl   $bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip)
>         jz      2f
>         leaq    __mempcpy_avx_unaligned(%rip), %rax
>  2:     ret
> diff --git a/sysdeps/x86_64/multiarch/mempcpy_chk.S b/sysdeps/x86_64/multiarch/mempcpy_chk.S
> index b7f9e89..17b8470 100644
> --- a/sysdeps/x86_64/multiarch/mempcpy_chk.S
> +++ b/sysdeps/x86_64/multiarch/mempcpy_chk.S
> @@ -39,7 +39,7 @@ ENTRY(__mempcpy_chk)
>         testl   $bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
>         jz      2f
>         leaq    __mempcpy_chk_ssse3_back(%rip), %rax
> -       testl   $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
> +       testl   $bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip)
>         jz      2f
>         leaq    __mempcpy_chk_avx_unaligned(%rip), %rax
>  2:     ret
> --
> 2.1.0
>



-- 
H.J.

