This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH] x86_64: memcpy/memmove family optimized with AVX512


On Tue, Jan 12, 2016 at 6:13 AM, Andrew Senkevich
<andrew.n.senkevich@gmail.com> wrote:
> Hi,
>
> here are AVX512 implementations of memcpy, mempcpy, memmove,
> memcpy_chk, mempcpy_chk, memmove_chk.
> It shows an average improvement of more than 30% over the AVX
> versions on KNL hardware; performance results are attached.
> Ok for trunk?
>
> 2016-01-12  Andrew Senkevich  <andrew.senkevich@intel.com>
>
>         * sysdeps/x86_64/multiarch/Makefile: Added new files.
>         * sysdeps/x86_64/multiarch/ifunc-impl-list.c: Added new tests.
>         * sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S: New file.
>         * sysdeps/x86_64/multiarch/mempcpy-avx512-no-vzeroupper.S: Likewise.
>         * sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S: Likewise.
>         * sysdeps/x86_64/multiarch/memcpy.S: Added new IFUNC branch.
>         * sysdeps/x86_64/multiarch/memcpy_chk.S: Likewise.
>         * sysdeps/x86_64/multiarch/memmove.c: Likewise.
>         * sysdeps/x86_64/multiarch/memmove_chk.c: Likewise.
>         * sysdeps/x86_64/multiarch/mempcpy.S: Likewise.
>         * sysdeps/x86_64/multiarch/mempcpy_chk.S: Likewise.
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile
> b/sysdeps/x86_64/multiarch/Makefile
> index b2e31ef..d234f4a 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -7,11 +7,12 @@ ifeq ($(subdir),string)
>
>  sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
>     strcmp-sse2-unaligned strncmp-ssse3 \
> -   memcmp-sse4 memcpy-ssse3 \
> -   memcpy-sse2-unaligned mempcpy-ssse3 \
> -   memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
> -   memmove-avx-unaligned memcpy-avx-unaligned mempcpy-avx-unaligned \
> -   memmove-ssse3-back strcasecmp_l-ssse3 \
> +   memcmp-sse4 memcpy-ssse3 memcpy-sse2-unaligned \
> +   memcpy-avx512-no-vzeroupper mempcpy-ssse3 memmove-ssse3 \
> +   memcpy-ssse3-back mempcpy-ssse3-back memmove-avx-unaligned \
> +   memcpy-avx-unaligned mempcpy-avx-unaligned \
> +   mempcpy-avx512-no-vzeroupper memmove-ssse3-back \
> +   memmove-avx512-no-vzeroupper strcasecmp_l-ssse3 \
>     strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
>     strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
>     strcpy-sse2-unaligned strncpy-sse2-unaligned \
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index 5f600dc..7746d79 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -24,7 +24,7 @@
>  #include "init-arch.h"
>
>  /* Maximum number of IFUNC implementations.  */
> -#define MAX_IFUNC 4
> +#define MAX_IFUNC 5
>
>  /* Fill ARRAY of MAX elements with IFUNC implementations for function
>     NAME supported on target machine and return the number of valid
> @@ -46,9 +46,12 @@ __libc_ifunc_impl_list (const char *name, struct
> libc_ifunc_impl *array,
>        __memcmp_ssse3)
>        IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2))
>
> -  /* Support sysdeps/x86_64/multiarch/memmove_chk.S.  */
> +  /* Support sysdeps/x86_64/multiarch/memmove_chk.c.  */
>    IFUNC_IMPL (i, name, __memmove_chk,
>        IFUNC_IMPL_ADD (array, i, __memmove_chk,
> +      HAS_ARCH_FEATURE (AVX512F_Usable),
> +      __memmove_chk_avx512_no_vzeroupper)
> +      IFUNC_IMPL_ADD (array, i, __memmove_chk,
>        HAS_ARCH_FEATURE (AVX_Usable),
>        __memmove_chk_avx_unaligned)
>        IFUNC_IMPL_ADD (array, i, __memmove_chk,
> @@ -65,6 +68,9 @@ __libc_ifunc_impl_list (const char *name, struct
> libc_ifunc_impl *array,
>        IFUNC_IMPL_ADD (array, i, memmove,
>        HAS_ARCH_FEATURE (AVX_Usable),
>        __memmove_avx_unaligned)
> +      IFUNC_IMPL_ADD (array, i, memmove,
> +      HAS_ARCH_FEATURE (AVX512F_Usable),
> +      __memmove_avx512_no_vzeroupper)
>        IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3),
>        __memmove_ssse3_back)
>        IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3),
> @@ -274,6 +280,9 @@ __libc_ifunc_impl_list (const char *name, struct
> libc_ifunc_impl *array,
>        __memcpy_ssse3_back)
>        IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3),
>        __memcpy_ssse3)
> +      IFUNC_IMPL_ADD (array, i, memcpy,
> +      HAS_ARCH_FEATURE (AVX512F_Usable),
> +      __memcpy_avx512_no_vzeroupper)
>        IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned)
>        IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2))
>
> @@ -294,6 +303,9 @@ __libc_ifunc_impl_list (const char *name, struct
> libc_ifunc_impl *array,
>    /* Support sysdeps/x86_64/multiarch/mempcpy.S.  */
>    IFUNC_IMPL (i, name, mempcpy,
>        IFUNC_IMPL_ADD (array, i, mempcpy,
> +      HAS_ARCH_FEATURE (AVX512F_Usable),
> +      __mempcpy_avx512_no_vzeroupper)
> +      IFUNC_IMPL_ADD (array, i, mempcpy,
>        HAS_ARCH_FEATURE (AVX_Usable),
>        __mempcpy_avx_unaligned)
>        IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),

Please add _chk tests.


> diff --git a/sysdeps/x86_64/multiarch/memcpy.S
> b/sysdeps/x86_64/multiarch/memcpy.S
> index 27fca29..64a1bcd 100644
> --- a/sysdeps/x86_64/multiarch/memcpy.S
> +++ b/sysdeps/x86_64/multiarch/memcpy.S
> @@ -30,19 +30,27 @@
>  ENTRY(__new_memcpy)
>   .type __new_memcpy, @gnu_indirect_function
>   LOAD_RTLD_GLOBAL_RO_RDX
> - leaq __memcpy_avx_unaligned(%rip), %rax
> +#ifdef HAVE_AVX512_ASM_SUPPORT
> + HAS_ARCH_FEATURE (AVX512F_Usable)
> + jz 1f
> + HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
> + jz 1f
> + leaq    __memcpy_avx512_no_vzeroupper(%rip), %rax
> + ret
> +#endif
> +1: leaq __memcpy_avx_unaligned(%rip), %rax
>   HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
> - jz 1f
> + jz 2f
>   ret
> -1: leaq __memcpy_sse2(%rip), %rax
> +2: leaq __memcpy_sse2(%rip), %rax
>   HAS_ARCH_FEATURE (Slow_BSF)
> - jnz 2f
> + jnz 3f
>   leaq __memcpy_sse2_unaligned(%rip), %rax
>   ret
> -2: HAS_CPU_FEATURE (SSSE3)
> - jz 3f
> +3: HAS_CPU_FEATURE (SSSE3)
> + jz 4f
>   leaq    __memcpy_ssse3(%rip), %rax
> -3: ret
> +4: ret
>  END(__new_memcpy)
>
>  # undef ENTRY

Please find a way not to re-order labels when adding a new
implementation next time.

Thanks.


-- 
H.J.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]