This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
Re: [PATCH] x86_64: memcpy/memmove family optimized with AVX512
- From: "H.J. Lu" <hjl dot tools at gmail dot com>
- To: Andrew Senkevich <andrew dot n dot senkevich at gmail dot com>
- Cc: libc-alpha <libc-alpha at sourceware dot org>
- Date: Wed, 13 Jan 2016 06:21:12 -0800
- Subject: Re: [PATCH] x86_64: memcpy/memmove family optimized with AVX512
- Authentication-results: sourceware.org; auth=none
- References: <CAMXFM3uGLiFE+pKPzFgWP6Sx4C3w2Ktd4w3+35O0Bj=B1s0naA at mail dot gmail dot com>
On Tue, Jan 12, 2016 at 6:13 AM, Andrew Senkevich
<andrew.n.senkevich@gmail.com> wrote:
> Hi,
>
> here is AVX512 implementations of memcpy, mempcpy, memmove,
> memcpy_chk, mempcpy_chk, memmove_chk.
> It shows an average improvement of more than 30% over the AVX versions on KNL
> hardware; performance results attached.
> Ok for trunk?
>
> 2016-01-12 Andrew Senkevich <andrew.senkevich@intel.com>
>
> * sysdeps/x86_64/multiarch/Makefile: Added new files.
> * sysdeps/x86_64/multiarch/ifunc-impl-list.c: Added new tests.
> * sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S: New file.
> * sysdeps/x86_64/multiarch/mempcpy-avx512-no-vzeroupper.S: Likewise.
> * sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S: Likewise.
> * sysdeps/x86_64/multiarch/memcpy.S: Added new IFUNC branch.
> * sysdeps/x86_64/multiarch/memcpy_chk.S: Likewise.
> * sysdeps/x86_64/multiarch/memmove.c: Likewise.
> * sysdeps/x86_64/multiarch/memmove_chk.c: Likewise.
> * sysdeps/x86_64/multiarch/mempcpy.S: Likewise.
> * sysdeps/x86_64/multiarch/mempcpy_chk.S: Likewise.
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile
> b/sysdeps/x86_64/multiarch/Makefile
> index b2e31ef..d234f4a 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -7,11 +7,12 @@ ifeq ($(subdir),string)
>
> sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
> strcmp-sse2-unaligned strncmp-ssse3 \
> - memcmp-sse4 memcpy-ssse3 \
> - memcpy-sse2-unaligned mempcpy-ssse3 \
> - memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
> - memmove-avx-unaligned memcpy-avx-unaligned mempcpy-avx-unaligned \
> - memmove-ssse3-back strcasecmp_l-ssse3 \
> + memcmp-sse4 memcpy-ssse3 memcpy-sse2-unaligned \
> + memcpy-avx512-no-vzeroupper mempcpy-ssse3 memmove-ssse3 \
> + memcpy-ssse3-back mempcpy-ssse3-back memmove-avx-unaligned \
> + memcpy-avx-unaligned mempcpy-avx-unaligned \
> + mempcpy-avx512-no-vzeroupper memmove-ssse3-back \
> + memmove-avx512-no-vzeroupper strcasecmp_l-ssse3 \
> strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
> strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
> strcpy-sse2-unaligned strncpy-sse2-unaligned \
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index 5f600dc..7746d79 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -24,7 +24,7 @@
> #include "init-arch.h"
>
> /* Maximum number of IFUNC implementations. */
> -#define MAX_IFUNC 4
> +#define MAX_IFUNC 5
>
> /* Fill ARRAY of MAX elements with IFUNC implementations for function
> NAME supported on target machine and return the number of valid
> @@ -46,9 +46,12 @@ __libc_ifunc_impl_list (const char *name, struct
> libc_ifunc_impl *array,
> __memcmp_ssse3)
> IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2))
>
> - /* Support sysdeps/x86_64/multiarch/memmove_chk.S. */
> + /* Support sysdeps/x86_64/multiarch/memmove_chk.c. */
> IFUNC_IMPL (i, name, __memmove_chk,
> IFUNC_IMPL_ADD (array, i, __memmove_chk,
> + HAS_ARCH_FEATURE (AVX512F_Usable),
> + __memmove_chk_avx512_no_vzeroupper)
> + IFUNC_IMPL_ADD (array, i, __memmove_chk,
> HAS_ARCH_FEATURE (AVX_Usable),
> __memmove_chk_avx_unaligned)
> IFUNC_IMPL_ADD (array, i, __memmove_chk,
> @@ -65,6 +68,9 @@ __libc_ifunc_impl_list (const char *name, struct
> libc_ifunc_impl *array,
> IFUNC_IMPL_ADD (array, i, memmove,
> HAS_ARCH_FEATURE (AVX_Usable),
> __memmove_avx_unaligned)
> + IFUNC_IMPL_ADD (array, i, memmove,
> + HAS_ARCH_FEATURE (AVX512F_Usable),
> + __memmove_avx512_no_vzeroupper)
> IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3),
> __memmove_ssse3_back)
> IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3),
> @@ -274,6 +280,9 @@ __libc_ifunc_impl_list (const char *name, struct
> libc_ifunc_impl *array,
> __memcpy_ssse3_back)
> IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3),
> __memcpy_ssse3)
> + IFUNC_IMPL_ADD (array, i, memcpy,
> + HAS_ARCH_FEATURE (AVX512F_Usable),
> + __memcpy_avx512_no_vzeroupper)
> IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned)
> IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2))
>
> @@ -294,6 +303,9 @@ __libc_ifunc_impl_list (const char *name, struct
> libc_ifunc_impl *array,
> /* Support sysdeps/x86_64/multiarch/mempcpy.S. */
> IFUNC_IMPL (i, name, mempcpy,
> IFUNC_IMPL_ADD (array, i, mempcpy,
> + HAS_ARCH_FEATURE (AVX512F_Usable),
> + __mempcpy_avx512_no_vzeroupper)
> + IFUNC_IMPL_ADD (array, i, mempcpy,
> HAS_ARCH_FEATURE (AVX_Usable),
> __mempcpy_avx_unaligned)
> IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
Please add _chk tests.
> diff --git a/sysdeps/x86_64/multiarch/memcpy.S
> b/sysdeps/x86_64/multiarch/memcpy.S
> index 27fca29..64a1bcd 100644
> --- a/sysdeps/x86_64/multiarch/memcpy.S
> +++ b/sysdeps/x86_64/multiarch/memcpy.S
> @@ -30,19 +30,27 @@
> ENTRY(__new_memcpy)
> .type __new_memcpy, @gnu_indirect_function
> LOAD_RTLD_GLOBAL_RO_RDX
> - leaq __memcpy_avx_unaligned(%rip), %rax
> +#ifdef HAVE_AVX512_ASM_SUPPORT
> + HAS_ARCH_FEATURE (AVX512F_Usable)
> + jz 1f
> + HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
> + jz 1f
> + leaq __memcpy_avx512_no_vzeroupper(%rip), %rax
> + ret
> +#endif
> +1: leaq __memcpy_avx_unaligned(%rip), %rax
> HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
> - jz 1f
> + jz 2f
> ret
> -1: leaq __memcpy_sse2(%rip), %rax
> +2: leaq __memcpy_sse2(%rip), %rax
> HAS_ARCH_FEATURE (Slow_BSF)
> - jnz 2f
> + jnz 3f
> leaq __memcpy_sse2_unaligned(%rip), %rax
> ret
> -2: HAS_CPU_FEATURE (SSSE3)
> - jz 3f
> +3: HAS_CPU_FEATURE (SSSE3)
> + jz 4f
> leaq __memcpy_ssse3(%rip), %rax
> -3: ret
> +4: ret
> END(__new_memcpy)
>
> # undef ENTRY
Please find a way not to re-order labels when adding a new
implementation next time.
Thanks.
--
H.J.