[x86] Add a feature bit: Fast_Unaligned_Copy (branch hjl/pr19583)
author H.J. Lu <hjl.tools@gmail.com>
Wed, 23 Mar 2016 17:33:19 +0000 (10:33 -0700)
committer H.J. Lu <hjl.tools@gmail.com>
Wed, 23 Mar 2016 17:56:38 +0000 (10:56 -0700)
On AMD processors, memcpy optimized with unaligned SSE load is
slower than memcpy optimized with aligned SSSE3, while other string
functions are faster with unaligned SSE load.  A feature bit,
Fast_Unaligned_Copy, is added to select memcpy optimized with
unaligned SSE load.
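
For context, each arch feature bit is stored in one word of the feature[]
array inside struct cpu_features: index_arch_* names the word and bit_arch_*
the mask within it, which is why the preprocessor checks in the diff below
insist that the indices are equal before several bits are OR-ed into the same
word.  The C sketch below only illustrates that mechanism; the bit/index names
and the (1 << 18) value come from this patch, but the Fast_Unaligned_Load
value, the trimmed-down struct and the printed strings are made up, and the
real selection in memcpy.S uses the HAS_ARCH_FEATURE assembly macro rather
than this helper.

#include <stdio.h>

#define bit_arch_Fast_Unaligned_Load   (1 << 4)   /* illustrative value only */
#define bit_arch_Fast_Unaligned_Copy   (1 << 18)  /* matches the header diff */
#define index_arch_Fast_Unaligned_Copy 0          /* FEATURE_INDEX_1 in sketch */

struct cpu_features_sketch
{
  unsigned int feature[1];
};

int
main (void)
{
  struct cpu_features_sketch cpu = { { 0 } };

  /* init_cpu_features (Intel case): both bits live in the same feature
     word, so they can be OR-ed in together -- hence the #if checks that
     the index_arch_* values are equal.  */
  cpu.feature[index_arch_Fast_Unaligned_Copy]
    |= bit_arch_Fast_Unaligned_Load | bit_arch_Fast_Unaligned_Copy;

  /* __new_memcpy dispatch: after this patch it tests Fast_Unaligned_Copy,
     not Fast_Unaligned_Load, before picking __memcpy_sse2_unaligned.  */
  if (cpu.feature[index_arch_Fast_Unaligned_Copy]
      & bit_arch_Fast_Unaligned_Copy)
    puts ("use __memcpy_sse2_unaligned");
  else
    puts ("fall back to the SSSE3/SSE2 memcpy variants");

  return 0;
}

On an AMD core covered by this patch, only Fast_Unaligned_Load (and, for
Excavator, Fast_Copy_Backward) would be set, so the same test falls through
to the aligned SSSE3/SSE2 path even though unaligned loads are fast there.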

[BZ #19583]
* sysdeps/x86/cpu-features.c (init_cpu_features): Set
Fast_Unaligned_Copy with Fast_Unaligned_Load for Intel
processors.  Set Fast_Copy_Backward for AMD Excavator
processors.
* sysdeps/x86/cpu-features.h (bit_arch_Fast_Unaligned_Copy):
New.
(index_arch_Fast_Unaligned_Copy): Likewise.
* sysdeps/x86_64/multiarch/memcpy.S (__new_memcpy): Check
Fast_Unaligned_Copy instead of Fast_Unaligned_Load.

sysdeps/x86/cpu-features.c
sysdeps/x86/cpu-features.h
sysdeps/x86_64/multiarch/memcpy.S

diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index c8f81efd03b66da96ca8097a963c2703f03ab874..de75c79cf987ce109580bfe22c8d10ed9d9f4de3 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -152,9 +152,13 @@ init_cpu_features (struct cpu_features *cpu_features)
 #endif
 #if index_arch_Fast_Unaligned_Load != index_arch_Slow_SSE4_2
 # error index_arch_Fast_Unaligned_Load != index_arch_Slow_SSE4_2
+#endif
+#if index_arch_Fast_Unaligned_Load != index_arch_Fast_Unaligned_Copy
+# error index_arch_Fast_Unaligned_Load != index_arch_Fast_Unaligned_Copy
 #endif
              cpu_features->feature[index_arch_Fast_Unaligned_Load]
                |= (bit_arch_Fast_Unaligned_Load
+                   | bit_arch_Fast_Unaligned_Copy
                    | bit_arch_Prefer_PMINUB_for_stringop
                    | bit_arch_Slow_SSE4_2);
              break;
@@ -182,11 +186,15 @@ init_cpu_features (struct cpu_features *cpu_features)
 #endif
 #if index_arch_Fast_Rep_String != index_arch_Prefer_PMINUB_for_stringop
 # error index_arch_Fast_Rep_String != index_arch_Prefer_PMINUB_for_stringop
+#endif
+#if index_arch_Fast_Rep_String != index_arch_Fast_Unaligned_Copy
+# error index_arch_Fast_Rep_String != index_arch_Fast_Unaligned_Copy
 #endif
              cpu_features->feature[index_arch_Fast_Rep_String]
                |= (bit_arch_Fast_Rep_String
                    | bit_arch_Fast_Copy_Backward
                    | bit_arch_Fast_Unaligned_Load
+                   | bit_arch_Fast_Unaligned_Copy
                    | bit_arch_Prefer_PMINUB_for_stringop);
              break;
            }
@@ -220,10 +228,14 @@ init_cpu_features (struct cpu_features *cpu_features)
 
       if (family == 0x15)
        {
+#if index_arch_Fast_Unaligned_Load != index_arch_Fast_Copy_Backward
+# error index_arch_Fast_Unaligned_Load != index_arch_Fast_Copy_Backward
+#endif
          /* "Excavator"   */
          if (model >= 0x60 && model <= 0x7f)
            cpu_features->feature[index_arch_Fast_Unaligned_Load]
-             |= bit_arch_Fast_Unaligned_Load;
+             |= (bit_arch_Fast_Unaligned_Load
+                 | bit_arch_Fast_Copy_Backward);
        }
     }
   else
diff --git a/sysdeps/x86/cpu-features.h b/sysdeps/x86/cpu-features.h
index e06eb7e41b838b1f10fe8c5c227fc0fc40c5413c..bfe1f4c68d5aedcc3ca37b71548422e997340305 100644
--- a/sysdeps/x86/cpu-features.h
+++ b/sysdeps/x86/cpu-features.h
@@ -35,6 +35,7 @@
 #define bit_arch_I686                          (1 << 15)
 #define bit_arch_Prefer_MAP_32BIT_EXEC         (1 << 16)
 #define bit_arch_Prefer_No_VZEROUPPER          (1 << 17)
+#define bit_arch_Fast_Unaligned_Copy           (1 << 18)
 
 /* CPUID Feature flags.  */
 
 # define index_arch_I686               FEATURE_INDEX_1*FEATURE_SIZE
 # define index_arch_Prefer_MAP_32BIT_EXEC FEATURE_INDEX_1*FEATURE_SIZE
 # define index_arch_Prefer_No_VZEROUPPER FEATURE_INDEX_1*FEATURE_SIZE
+# define index_arch_Fast_Unaligned_Copy        FEATURE_INDEX_1*FEATURE_SIZE
 
 
 # if defined (_LIBC) && !IS_IN (nonlib)
@@ -265,6 +267,7 @@ extern const struct cpu_features *__get_cpu_features (void)
 # define index_arch_I686               FEATURE_INDEX_1
 # define index_arch_Prefer_MAP_32BIT_EXEC FEATURE_INDEX_1
 # define index_arch_Prefer_No_VZEROUPPER FEATURE_INDEX_1
+# define index_arch_Fast_Unaligned_Copy        FEATURE_INDEX_1
 
 #endif /* !__ASSEMBLER__ */
 
diff --git a/sysdeps/x86_64/multiarch/memcpy.S b/sysdeps/x86_64/multiarch/memcpy.S
index 8882590e51196e87fa95cd7f29da20371d641ffa..5b045d7847d5c13cb3f8542b871097adcd3c8eeb 100644
--- a/sysdeps/x86_64/multiarch/memcpy.S
+++ b/sysdeps/x86_64/multiarch/memcpy.S
@@ -42,7 +42,7 @@ ENTRY(__new_memcpy)
        HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
        jnz     2f
        lea     __memcpy_sse2_unaligned(%rip), %RAX_LP
-       HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+       HAS_ARCH_FEATURE (Fast_Unaligned_Copy)
        jnz     2f
        lea     __memcpy_sse2(%rip), %RAX_LP
        HAS_CPU_FEATURE (SSSE3)