sourceware.org Git - glibc.git/commitdiff
Use AVX unaligned memcpy only if AVX2 is available
author	H.J. Lu <hjl.tools@gmail.com>	Fri, 30 Jan 2015 14:50:20 +0000 (06:50 -0800)
committer	H.J. Lu <hjl.tools@gmail.com>	Fri, 30 Jan 2015 23:37:58 +0000 (15:37 -0800)
memcpy with unaligned 256-bit AVX register loads/stores is slow on older
processors like Sandy Bridge.  This patch adds bit_AVX_Fast_Unaligned_Load
and sets it only when AVX2 is available.

[BZ #17801]
* sysdeps/x86_64/multiarch/init-arch.c (__init_cpu_features):
Set the bit_AVX_Fast_Unaligned_Load bit for AVX2.
* sysdeps/x86_64/multiarch/init-arch.h (bit_AVX_Fast_Unaligned_Load):
New.
(index_AVX_Fast_Unaligned_Load): Likewise.
(HAS_AVX_FAST_UNALIGNED_LOAD): Likewise.
* sysdeps/x86_64/multiarch/memcpy.S (__new_memcpy): Check the
bit_AVX_Fast_Unaligned_Load bit instead of the bit_AVX_Usable bit.
* sysdeps/x86_64/multiarch/memcpy_chk.S (__memcpy_chk): Likewise.
* sysdeps/x86_64/multiarch/mempcpy.S (__mempcpy): Likewise.
* sysdeps/x86_64/multiarch/mempcpy_chk.S (__mempcpy_chk): Likewise.
* sysdeps/x86_64/multiarch/memmove.c (__libc_memmove): Replace
HAS_AVX with HAS_AVX_FAST_UNALIGNED_LOAD.
* sysdeps/x86_64/multiarch/memmove_chk.c (__memmove_chk): Likewise.

ChangeLog
NEWS
sysdeps/x86_64/multiarch/init-arch.c
sysdeps/x86_64/multiarch/init-arch.h
sysdeps/x86_64/multiarch/memcpy.S
sysdeps/x86_64/multiarch/memcpy_chk.S
sysdeps/x86_64/multiarch/memmove.c
sysdeps/x86_64/multiarch/memmove_chk.c
sysdeps/x86_64/multiarch/mempcpy.S
sysdeps/x86_64/multiarch/mempcpy_chk.S
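
Before the per-file diffs, here is a minimal, self-contained C sketch of the dispatch change.  It is illustrative only: the feature word, the two copy stubs, and select_memcpy() are stand-ins, not glibc internals.  It shows the effect of the patch: the AVX unaligned memcpy is chosen only when the new bit_AVX_Fast_Unaligned_Load bit is set, which __init_cpu_features now ties to AVX2.

#include <stdio.h>
#include <string.h>

#define bit_AVX_Fast_Unaligned_Load (1 << 11)

/* Stand-in feature word; in glibc this lives in __cpu_features.feature[].  */
static unsigned int feature_word;

static void *
copy_avx_unaligned (void *d, const void *s, size_t n)
{
  puts ("avx_unaligned");
  return memcpy (d, s, n);
}

static void *
copy_sse2 (void *d, const void *s, size_t n)
{
  puts ("sse2");
  return memcpy (d, s, n);
}

typedef void *(*memcpy_fn) (void *, const void *, size_t);

/* Mirrors the ifunc selection after this patch: test the new bit rather
   than bit_AVX_Usable.  */
static memcpy_fn
select_memcpy (void)
{
  return (feature_word & bit_AVX_Fast_Unaligned_Load)
	 ? copy_avx_unaligned : copy_sse2;
}

int
main (void)
{
  char dst[16], src[16] = "hello";

  select_memcpy () (dst, src, sizeof src);	/* sse2: bit clear (e.g. Sandy Bridge)  */
  feature_word |= bit_AVX_Fast_Unaligned_Load;
  select_memcpy () (dst, src, sizeof src);	/* avx_unaligned: bit set (AVX2 CPUs)  */
  return 0;
}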

diff --git a/ChangeLog b/ChangeLog
index 26f7f3f3b188e5fbd2d2ffe05a83fe619d990214..a696e396b266e690c0f86a64c24bd8ffa68e0030 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,21 @@
+2015-01-30  H.J. Lu  <hongjiu.lu@intel.com>
+
+       [BZ #17801]
+       * sysdeps/x86_64/multiarch/init-arch.c (__init_cpu_features):
+       Set the bit_AVX_Fast_Unaligned_Load bit for AVX2.
+       * sysdeps/x86_64/multiarch/init-arch.h (bit_AVX_Fast_Unaligned_Load):
+       New.
+       (index_AVX_Fast_Unaligned_Load): Likewise.
+       (HAS_AVX_FAST_UNALIGNED_LOAD): Likewise.
+       * sysdeps/x86_64/multiarch/memcpy.S (__new_memcpy): Check the
+       bit_AVX_Fast_Unaligned_Load bit instead of the bit_AVX_Usable bit.
+       * sysdeps/x86_64/multiarch/memcpy_chk.S (__memcpy_chk): Likewise.
+       * sysdeps/x86_64/multiarch/mempcpy.S (__mempcpy): Likewise.
+       * sysdeps/x86_64/multiarch/mempcpy_chk.S (__mempcpy_chk): Likewise.
+       * sysdeps/x86_64/multiarch/memmove.c (__libc_memmove): Replace
+       HAS_AVX with HAS_AVX_FAST_UNALIGNED_LOAD.
+       * sysdeps/x86_64/multiarch/memmove_chk.c (__memmove_chk): Likewise.
+
 2015-01-29  Andreas Schwab  <schwab@suse.de>
 
        * sysdeps/nptl/allocrtsig.c: Include <signal.h>.
diff --git a/NEWS b/NEWS
index 8e2729bddd6a21bdc5a78a64bfd767de8e6f6eae..c91b9fc58a596a46aba37a515d9d48227cf06683 100644
--- a/NEWS
+++ b/NEWS
@@ -17,8 +17,8 @@ Version 2.21
   17601, 17608, 17616, 17625, 17630, 17633, 17634, 17635, 17647, 17653,
   17657, 17658, 17664, 17665, 17668, 17682, 17702, 17717, 17719, 17722,
   17723, 17724, 17725, 17732, 17733, 17744, 17745, 17746, 17747, 17748,
-  17775, 17777, 17780, 17781, 17782, 17791, 17793, 17796, 17797, 17803,
-  17806, 17834, 17844, 17848, 17868, 17869, 17870, 17885, 17892.
+  17775, 17777, 17780, 17781, 17782, 17791, 17793, 17796, 17797, 17801,
+  17803, 17806, 17834, 17844, 17848, 17868, 17869, 17870, 17885, 17892.
 
 * A new semaphore algorithm has been implemented in generic C code for all
   machines. Previous custom assembly implementations of semaphore were
diff --git a/sysdeps/x86_64/multiarch/init-arch.c b/sysdeps/x86_64/multiarch/init-arch.c
index 9299360612fc422a29f7b6aef113ae0275a24f74..7dec21884dca99337bbe7215feb6aa23b3736892 100644
--- a/sysdeps/x86_64/multiarch/init-arch.c
+++ b/sysdeps/x86_64/multiarch/init-arch.c
@@ -171,9 +171,14 @@ __init_cpu_features (void)
          /* Determine if AVX is usable.  */
          if (CPUID_AVX)
            __cpu_features.feature[index_AVX_Usable] |= bit_AVX_Usable;
-         /* Determine if AVX2 is usable.  */
+#if index_AVX2_Usable != index_AVX_Fast_Unaligned_Load
+# error index_AVX2_Usable != index_AVX_Fast_Unaligned_Load
+#endif
+         /* Determine if AVX2 is usable.  Unaligned load with 256-bit
+            AVX registers are faster on processors with AVX2.  */
          if (CPUID_AVX2)
-           __cpu_features.feature[index_AVX2_Usable] |= bit_AVX2_Usable;
+           __cpu_features.feature[index_AVX2_Usable]
+             |= bit_AVX2_Usable | bit_AVX_Fast_Unaligned_Load;
          /* Determine if FMA is usable.  */
          if (CPUID_FMA)
            __cpu_features.feature[index_FMA_Usable] |= bit_FMA_Usable;
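
The #error guard in this hunk exists because the two bits are OR'd into a single feature[] word.  The sketch below shows why index_AVX2_Usable and index_AVX_Fast_Unaligned_Load must name the same slot; the array and the cpu_has_avx2 flag are simulated values for illustration, not the real CPUID path.

#include <assert.h>

#define FEATURE_INDEX_1                0
#define bit_AVX2_Usable                (1 << 10)
#define bit_AVX_Fast_Unaligned_Load    (1 << 11)

#define index_AVX2_Usable              FEATURE_INDEX_1
#define index_AVX_Fast_Unaligned_Load  FEATURE_INDEX_1

int
main (void)
{
  unsigned int feature[1] = { 0 };
  int cpu_has_avx2 = 1;		/* pretend CPUID reported AVX2 */

  /* One store sets both bits, which only works because both index_*
     macros resolve to the same array slot.  */
  if (cpu_has_avx2)
    feature[index_AVX2_Usable]
      |= bit_AVX2_Usable | bit_AVX_Fast_Unaligned_Load;

  assert (feature[FEATURE_INDEX_1]
	  == (bit_AVX2_Usable | bit_AVX_Fast_Unaligned_Load));
  return 0;
}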
diff --git a/sysdeps/x86_64/multiarch/init-arch.h b/sysdeps/x86_64/multiarch/init-arch.h
index 55f1c5b34cb973796cb5376cb00036fc3d9ae731..e6b5ba5530f34ebb89780052a6c26a1df0d2383f 100644
--- a/sysdeps/x86_64/multiarch/init-arch.h
+++ b/sysdeps/x86_64/multiarch/init-arch.h
@@ -25,6 +25,7 @@
 #define bit_FMA4_Usable                        (1 << 8)
 #define bit_Slow_SSE4_2                        (1 << 9)
 #define bit_AVX2_Usable                        (1 << 10)
+#define bit_AVX_Fast_Unaligned_Load    (1 << 11)
 
 /* CPUID Feature flags.  */
 
@@ -74,6 +75,7 @@
 # define index_FMA4_Usable             FEATURE_INDEX_1*FEATURE_SIZE
 # define index_Slow_SSE4_2             FEATURE_INDEX_1*FEATURE_SIZE
 # define index_AVX2_Usable             FEATURE_INDEX_1*FEATURE_SIZE
+# define index_AVX_Fast_Unaligned_Load FEATURE_INDEX_1*FEATURE_SIZE
 
 #else  /* __ASSEMBLER__ */
 
@@ -169,6 +171,7 @@ extern const struct cpu_features *__get_cpu_features (void)
 # define index_FMA4_Usable             FEATURE_INDEX_1
 # define index_Slow_SSE4_2             FEATURE_INDEX_1
 # define index_AVX2_Usable             FEATURE_INDEX_1
+# define index_AVX_Fast_Unaligned_Load FEATURE_INDEX_1
 
 # define HAS_ARCH_FEATURE(name) \
   ((__get_cpu_features ()->feature[index_##name] & (bit_##name)) != 0)
@@ -181,5 +184,6 @@ extern const struct cpu_features *__get_cpu_features (void)
 # define HAS_AVX2                      HAS_ARCH_FEATURE (AVX2_Usable)
 # define HAS_FMA                       HAS_ARCH_FEATURE (FMA_Usable)
 # define HAS_FMA4                      HAS_ARCH_FEATURE (FMA4_Usable)
+# define HAS_AVX_FAST_UNALIGNED_LOAD   HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
 
 #endif /* __ASSEMBLER__ */
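
On the C side, the three additions compose through the existing HAS_ARCH_FEATURE helper.  Here is a small stand-alone sketch of that layering; the struct, get_cpu_features(), and fake_features are simplified stand-ins, not the real glibc definitions.

#include <stdio.h>

enum { FEATURE_INDEX_1 = 0, FEATURE_INDEX_MAX };

struct cpu_features
{
  unsigned int feature[FEATURE_INDEX_MAX];
};

static struct cpu_features fake_features;

/* Stand-in for glibc's accessor.  */
static const struct cpu_features *
get_cpu_features (void)
{
  return &fake_features;
}

#define bit_AVX_Fast_Unaligned_Load    (1 << 11)
#define index_AVX_Fast_Unaligned_Load  FEATURE_INDEX_1

#define HAS_ARCH_FEATURE(name) \
  ((get_cpu_features ()->feature[index_##name] & (bit_##name)) != 0)

#define HAS_AVX_FAST_UNALIGNED_LOAD  HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)

int
main (void)
{
  printf ("%d\n", HAS_AVX_FAST_UNALIGNED_LOAD);	/* 0: bit clear */
  fake_features.feature[FEATURE_INDEX_1] |= bit_AVX_Fast_Unaligned_Load;
  printf ("%d\n", HAS_AVX_FAST_UNALIGNED_LOAD);	/* 1: bit set */
  return 0;
}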
diff --git a/sysdeps/x86_64/multiarch/memcpy.S b/sysdeps/x86_64/multiarch/memcpy.S
index 992e40db81d58ef8c0435ac07e9f580fc72f9864..4e18cd30704bfe9312f53288d3d4439ef30ec54a 100644
--- a/sysdeps/x86_64/multiarch/memcpy.S
+++ b/sysdeps/x86_64/multiarch/memcpy.S
@@ -33,7 +33,7 @@ ENTRY(__new_memcpy)
        jne     1f
        call    __init_cpu_features
 1:     leaq    __memcpy_avx_unaligned(%rip), %rax
-       testl   $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
+       testl   $bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip)
        jz 1f
        ret
 1:     leaq    __memcpy_sse2(%rip), %rax
diff --git a/sysdeps/x86_64/multiarch/memcpy_chk.S b/sysdeps/x86_64/multiarch/memcpy_chk.S
index 5e9cf004b08edd63496b2f9480667e0b826d49e7..1e756ea0c23f684f46a928189401065efd8b76f8 100644
--- a/sysdeps/x86_64/multiarch/memcpy_chk.S
+++ b/sysdeps/x86_64/multiarch/memcpy_chk.S
@@ -39,7 +39,7 @@ ENTRY(__memcpy_chk)
        testl   $bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
        jz      2f
        leaq    __memcpy_chk_ssse3_back(%rip), %rax
-       testl   $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
+       testl   $bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip)
        jz  2f
        leaq    __memcpy_chk_avx_unaligned(%rip), %rax
 2:     ret
diff --git a/sysdeps/x86_64/multiarch/memmove.c b/sysdeps/x86_64/multiarch/memmove.c
index d93bfd05c0b1125c4bcf442250c46f4e010dd193..dd153a3eaa277060850c6f7ba3745d8b17a66c62 100644
--- a/sysdeps/x86_64/multiarch/memmove.c
+++ b/sysdeps/x86_64/multiarch/memmove.c
@@ -49,7 +49,7 @@ extern __typeof (__redirect_memmove) __memmove_avx_unaligned attribute_hidden;
    ifunc symbol properly.  */
 extern __typeof (__redirect_memmove) __libc_memmove;
 libc_ifunc (__libc_memmove,
-           HAS_AVX
+           HAS_AVX_FAST_UNALIGNED_LOAD
            ? __memmove_avx_unaligned
            : (HAS_SSSE3
               ? (HAS_FAST_COPY_BACKWARD
diff --git a/sysdeps/x86_64/multiarch/memmove_chk.c b/sysdeps/x86_64/multiarch/memmove_chk.c
index 743ca2a460f5ef5bc39677cfb9b7a44b75fc77b6..8b12d002dcbd7c60c8d89fc2c44bbf95ec28dd3e 100644
--- a/sysdeps/x86_64/multiarch/memmove_chk.c
+++ b/sysdeps/x86_64/multiarch/memmove_chk.c
@@ -30,7 +30,7 @@ extern __typeof (__memmove_chk) __memmove_chk_avx_unaligned attribute_hidden;
 #include "debug/memmove_chk.c"
 
 libc_ifunc (__memmove_chk,
-           HAS_AVX ? __memmove_chk_avx_unaligned :
+           HAS_AVX_FAST_UNALIGNED_LOAD ? __memmove_chk_avx_unaligned :
            (HAS_SSSE3
            ? (HAS_FAST_COPY_BACKWARD
               ? __memmove_chk_ssse3_back : __memmove_chk_ssse3)
diff --git a/sysdeps/x86_64/multiarch/mempcpy.S b/sysdeps/x86_64/multiarch/mempcpy.S
index cdf1dab62b6a013217cea46cc8fc238a3f85371a..2eaacdf0492d5167976f404ae55fc1c72c32c0a6 100644
--- a/sysdeps/x86_64/multiarch/mempcpy.S
+++ b/sysdeps/x86_64/multiarch/mempcpy.S
@@ -37,7 +37,7 @@ ENTRY(__mempcpy)
        testl   $bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
        jz      2f
        leaq    __mempcpy_ssse3_back(%rip), %rax
-       testl   $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
+       testl   $bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip)
        jz      2f
        leaq    __mempcpy_avx_unaligned(%rip), %rax
 2:     ret
diff --git a/sysdeps/x86_64/multiarch/mempcpy_chk.S b/sysdeps/x86_64/multiarch/mempcpy_chk.S
index b7f9e89ea2456228510e24a0399ab58c7ee6f0e1..17b84701b02454fef866425632a2514b17e63ddf 100644
--- a/sysdeps/x86_64/multiarch/mempcpy_chk.S
+++ b/sysdeps/x86_64/multiarch/mempcpy_chk.S
@@ -39,7 +39,7 @@ ENTRY(__mempcpy_chk)
        testl   $bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
        jz      2f
        leaq    __mempcpy_chk_ssse3_back(%rip), %rax
-       testl   $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
+       testl   $bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip)
        jz      2f
        leaq    __mempcpy_chk_avx_unaligned(%rip), %rax
 2:     ret