This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH] x86_64: memcpy/memmove family optimized with AVX512



On 13-01-2016 12:21, H.J. Lu wrote:
> On Tue, Jan 12, 2016 at 6:13 AM, Andrew Senkevich
> <andrew.n.senkevich@gmail.com> wrote:
>> Hi,
>>
>> here are AVX512 implementations of memcpy, mempcpy, memmove,
>> memcpy_chk, mempcpy_chk, memmove_chk.
>> They show an average improvement of more than 30% over the AVX versions
>> on KNL hardware; performance results are attached.
>> Ok for trunk?
>>
>> 2016-01-12  Andrew Senkevich  <andrew.senkevich@intel.com>
>>
>>         * sysdeps/x86_64/multiarch/Makefile: Added new files.
>>         * sysdeps/x86_64/multiarch/ifunc-impl-list.c: Added new tests.
>>         * sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S: New file.
>>         * sysdeps/x86_64/multiarch/mempcpy-avx512-no-vzeroupper.S: Likewise.
>>         * sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S: Likewise.
>>         * sysdeps/x86_64/multiarch/memcpy.S: Added new IFUNC branch.
>>         * sysdeps/x86_64/multiarch/memcpy_chk.S: Likewise.
>>         * sysdeps/x86_64/multiarch/memmove.c: Likewise.
>>         * sysdeps/x86_64/multiarch/memmove_chk.c: Likewise.
>>         * sysdeps/x86_64/multiarch/mempcpy.S: Likewise.
>>         * sysdeps/x86_64/multiarch/mempcpy_chk.S: Likewise.
>>
>> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
>> index b2e31ef..d234f4a 100644
>> --- a/sysdeps/x86_64/multiarch/Makefile
>> +++ b/sysdeps/x86_64/multiarch/Makefile
>> @@ -7,11 +7,12 @@ ifeq ($(subdir),string)
>>
>>  sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
>>     strcmp-sse2-unaligned strncmp-ssse3 \
>> -   memcmp-sse4 memcpy-ssse3 \
>> -   memcpy-sse2-unaligned mempcpy-ssse3 \
>> -   memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
>> -   memmove-avx-unaligned memcpy-avx-unaligned mempcpy-avx-unaligned \
>> -   memmove-ssse3-back strcasecmp_l-ssse3 \
>> +   memcmp-sse4 memcpy-ssse3 memcpy-sse2-unaligned \
>> +   memcpy-avx512-no-vzeroupper mempcpy-ssse3 memmove-ssse3 \
>> +   memcpy-ssse3-back mempcpy-ssse3-back memmove-avx-unaligned \
>> +   memcpy-avx-unaligned mempcpy-avx-unaligned \
>> +   mempcpy-avx512-no-vzeroupper memmove-ssse3-back \
>> +   memmove-avx512-no-vzeroupper strcasecmp_l-ssse3 \
>>     strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
>>     strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
>>     strcpy-sse2-unaligned strncpy-sse2-unaligned \
>> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
>> index 5f600dc..7746d79 100644
>> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
>> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
>> @@ -24,7 +24,7 @@
>>  #include "init-arch.h"
>>
>>  /* Maximum number of IFUNC implementations.  */
>> -#define MAX_IFUNC 4
>> +#define MAX_IFUNC 5
>>
>>  /* Fill ARRAY of MAX elements with IFUNC implementations for function
>>     NAME supported on target machine and return the number of valid
>> @@ -46,9 +46,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>>        __memcmp_ssse3)
>>        IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2))
>>
>> -  /* Support sysdeps/x86_64/multiarch/memmove_chk.S.  */
>> +  /* Support sysdeps/x86_64/multiarch/memmove_chk.c.  */
>>    IFUNC_IMPL (i, name, __memmove_chk,
>>        IFUNC_IMPL_ADD (array, i, __memmove_chk,
>> +      HAS_ARCH_FEATURE (AVX512F_Usable),
>> +      __memmove_chk_avx512_no_vzeroupper)
>> +      IFUNC_IMPL_ADD (array, i, __memmove_chk,
>>        HAS_ARCH_FEATURE (AVX_Usable),
>>        __memmove_chk_avx_unaligned)
>>        IFUNC_IMPL_ADD (array, i, __memmove_chk,
>> @@ -65,6 +68,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>>        IFUNC_IMPL_ADD (array, i, memmove,
>>        HAS_ARCH_FEATURE (AVX_Usable),
>>        __memmove_avx_unaligned)
>> +      IFUNC_IMPL_ADD (array, i, memmove,
>> +      HAS_ARCH_FEATURE (AVX512F_Usable),
>> +      __memmove_avx512_no_vzeroupper)
>>        IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3),
>>        __memmove_ssse3_back)
>>        IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3),
>> @@ -274,6 +280,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>>        __memcpy_ssse3_back)
>>        IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3),
>>        __memcpy_ssse3)
>> +      IFUNC_IMPL_ADD (array, i, memcpy,
>> +      HAS_ARCH_FEATURE (AVX512F_Usable),
>> +      __memcpy_avx512_no_vzeroupper)
>>        IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned)
>>        IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2))
>>
>> @@ -294,6 +303,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>>    /* Support sysdeps/x86_64/multiarch/mempcpy.S.  */
>>    IFUNC_IMPL (i, name, mempcpy,
>>        IFUNC_IMPL_ADD (array, i, mempcpy,
>> +      HAS_ARCH_FEATURE (AVX512F_Usable),
>> +      __mempcpy_avx512_no_vzeroupper)
>> +      IFUNC_IMPL_ADD (array, i, mempcpy,
>>        HAS_ARCH_FEATURE (AVX_Usable),
>>        __mempcpy_avx_unaligned)
>>        IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
> 
> Please add _chk tests.
> 
> 
>> diff --git a/sysdeps/x86_64/multiarch/memcpy.S b/sysdeps/x86_64/multiarch/memcpy.S
>> index 27fca29..64a1bcd 100644
>> --- a/sysdeps/x86_64/multiarch/memcpy.S
>> +++ b/sysdeps/x86_64/multiarch/memcpy.S
>> @@ -30,19 +30,27 @@
>>  ENTRY(__new_memcpy)
>>   .type __new_memcpy, @gnu_indirect_function
>>   LOAD_RTLD_GLOBAL_RO_RDX
>> - leaq __memcpy_avx_unaligned(%rip), %rax
>> +#ifdef HAVE_AVX512_ASM_SUPPORT
>> + HAS_ARCH_FEATURE (AVX512F_Usable)
>> + jz 1f
>> + HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
>> + jz 1f
>> + leaq    __memcpy_avx512_no_vzeroupper(%rip), %rax
>> + ret
>> +#endif
>> +1: leaq __memcpy_avx_unaligned(%rip), %rax
>>   HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
>> - jz 1f
>> + jz 2f
>>   ret
>> -1: leaq __memcpy_sse2(%rip), %rax
>> +2: leaq __memcpy_sse2(%rip), %rax
>>   HAS_ARCH_FEATURE (Slow_BSF)
>> - jnz 2f
>> + jnz 3f
>>   leaq __memcpy_sse2_unaligned(%rip), %rax
>>   ret
>> -2: HAS_CPU_FEATURE (SSSE3)
>> - jz 3f
>> +3: HAS_CPU_FEATURE (SSSE3)
>> + jz 4f
>>   leaq    __memcpy_ssse3(%rip), %rax
>> -3: ret
>> +4: ret
>>  END(__new_memcpy)
>>
>>  # undef ENTRY
> 
> Please find a way not to re-order labels when adding a new
> implementation next time.
> 

Maybe use 'libc_ifunc(...)' instead and let the compiler handle it?


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]