This is the mail archive of the
glibc-cvs@sourceware.org
mailing list for the glibc project.
GNU C Library master sources branch hjl/ifunc/master created. glibc-2.23-523-g638f63c
- From: hjl at sourceware dot org
- To: glibc-cvs at sourceware dot org
- Date: 30 Jun 2016 02:26:59 -0000
- Subject: GNU C Library master sources branch hjl/ifunc/master created. glibc-2.23-523-g638f63c
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".
The branch, hjl/ifunc/master has been created
at 638f63c0a15653d7a4ca85ac42df2585c0760c5e (commit)
- Log -----------------------------------------------------------------
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=638f63c0a15653d7a4ca85ac42df2585c0760c5e
commit 638f63c0a15653d7a4ca85ac42df2585c0760c5e
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Mon Jun 27 12:27:49 2016 -0700
Check ERMS in memmove/memcpy/mempcpy/memset
Although the Enhanced REP MOVSB/STOSB (ERMS) implementations of memmove,
memcpy, mempcpy and memset aren't used by the current processors, this
patch adds Prefer_ERMS check in memmove, memcpy, mempcpy and memset so
that glibc developers can experiment with it using GLIBC_IFUNC.
* sysdeps/x86/cpu-features.c (init_cpu_features): Also
check Prefer_ERMS.
* sysdeps/x86/cpu-features.h (bit_arch_Prefer_ERMS): New.
(index_arch_Prefer_ERMS): Likewise.
* sysdeps/x86_64/multiarch/memcpy.S (__new_memcpy): Return
__memcpy_erms for Prefer_ERMS.
* sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
(__memmove_erms): Enabled for libc.a.
* sysdeps/x86_64/multiarch/memmove.S (__libc_memmove): Return
__memmove_erms for Prefer_ERMS.
* sysdeps/x86_64/multiarch/mempcpy.S (__mempcpy): Return
__mempcpy_erms for Prefer_ERMS.
* sysdeps/x86_64/multiarch/memset.S (memset): Return
__memset_erms for Prefer_ERMS.
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index fcc74c9..2151d9c 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -475,6 +475,7 @@ no_cpuid:
case 13:
CHECK_GLIBC_IFUNC_ARCH_OFF (AVX2_Usable);
CHECK_GLIBC_IFUNC_ARCH_OFF (FMA4_Usable);
+ CHECK_GLIBC_IFUNC_ARCH_BOTH (Prefer_ERMS);
CHECK_GLIBC_IFUNC_ARCH_NEED_CPU_BOTH (Slow_SSE4_2,
SSE4_2);
break;
diff --git a/sysdeps/x86/cpu-features.h b/sysdeps/x86/cpu-features.h
index 2bd9371..97ffe76 100644
--- a/sysdeps/x86/cpu-features.h
+++ b/sysdeps/x86/cpu-features.h
@@ -36,6 +36,7 @@
#define bit_arch_Prefer_MAP_32BIT_EXEC (1 << 16)
#define bit_arch_Prefer_No_VZEROUPPER (1 << 17)
#define bit_arch_Fast_Unaligned_Copy (1 << 18)
+#define bit_arch_Prefer_ERMS (1 << 19)
/* CPUID Feature flags. */
@@ -105,6 +106,7 @@
# define index_arch_Prefer_MAP_32BIT_EXEC FEATURE_INDEX_1*FEATURE_SIZE
# define index_arch_Prefer_No_VZEROUPPER FEATURE_INDEX_1*FEATURE_SIZE
# define index_arch_Fast_Unaligned_Copy FEATURE_INDEX_1*FEATURE_SIZE
+# define index_arch_Prefer_ERMS FEATURE_INDEX_1*FEATURE_SIZE
# if defined (_LIBC) && !IS_IN (nonlib)
@@ -274,6 +276,7 @@ extern const struct cpu_features *__get_cpu_features (void)
# define index_arch_Prefer_MAP_32BIT_EXEC FEATURE_INDEX_1
# define index_arch_Prefer_No_VZEROUPPER FEATURE_INDEX_1
# define index_arch_Fast_Unaligned_Copy FEATURE_INDEX_1
+# define index_arch_Prefer_ERMS FEATURE_INDEX_1
#endif /* !__ASSEMBLER__ */
diff --git a/sysdeps/x86_64/multiarch/memcpy.S b/sysdeps/x86_64/multiarch/memcpy.S
index f6771a4..df7fbac 100644
--- a/sysdeps/x86_64/multiarch/memcpy.S
+++ b/sysdeps/x86_64/multiarch/memcpy.S
@@ -29,6 +29,9 @@
ENTRY(__new_memcpy)
.type __new_memcpy, @gnu_indirect_function
LOAD_RTLD_GLOBAL_RO_RDX
+ lea __memcpy_erms(%rip), %RAX_LP
+ HAS_ARCH_FEATURE (Prefer_ERMS)
+ jnz 2f
# ifdef HAVE_AVX512_ASM_SUPPORT
HAS_ARCH_FEATURE (AVX512F_Usable)
jz 1f
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index a2cce39..4893ea4 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -150,13 +150,15 @@ L(nop):
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMMOVE_SYMBOL (__memmove, unaligned))
-# if VEC_SIZE == 16 && defined SHARED
+# if VEC_SIZE == 16
+# if defined SHARED
/* Only used to measure performance of REP MOVSB. */
ENTRY (__mempcpy_erms)
movq %rdi, %rax
addq %rdx, %rax
jmp L(start_movsb)
END (__mempcpy_erms)
+# endif
ENTRY (__memmove_erms)
movq %rdi, %rax
@@ -181,7 +183,9 @@ L(movsb_backward):
cld
ret
END (__memmove_erms)
+# if defined SHARED
strong_alias (__memmove_erms, __memcpy_erms)
+# endif
# endif
# ifdef SHARED
diff --git a/sysdeps/x86_64/multiarch/memmove.S b/sysdeps/x86_64/multiarch/memmove.S
index 25c3586..8e1c6ac 100644
--- a/sysdeps/x86_64/multiarch/memmove.S
+++ b/sysdeps/x86_64/multiarch/memmove.S
@@ -27,6 +27,9 @@
ENTRY(__libc_memmove)
.type __libc_memmove, @gnu_indirect_function
LOAD_RTLD_GLOBAL_RO_RDX
+ lea __memmove_erms(%rip), %RAX_LP
+ HAS_ARCH_FEATURE (Prefer_ERMS)
+ jnz 2f
# ifdef HAVE_AVX512_ASM_SUPPORT
HAS_ARCH_FEATURE (AVX512F_Usable)
jz 1f
diff --git a/sysdeps/x86_64/multiarch/mempcpy.S b/sysdeps/x86_64/multiarch/mempcpy.S
index f9c6df3..4011a1a 100644
--- a/sysdeps/x86_64/multiarch/mempcpy.S
+++ b/sysdeps/x86_64/multiarch/mempcpy.S
@@ -29,6 +29,9 @@
ENTRY(__mempcpy)
.type __mempcpy, @gnu_indirect_function
LOAD_RTLD_GLOBAL_RO_RDX
+ lea __mempcpy_erms(%rip), %RAX_LP
+ HAS_ARCH_FEATURE (Prefer_ERMS)
+ jnz 2f
# ifdef HAVE_AVX512_ASM_SUPPORT
HAS_ARCH_FEATURE (AVX512F_Usable)
jz 1f
diff --git a/sysdeps/x86_64/multiarch/memset.S b/sysdeps/x86_64/multiarch/memset.S
index 4e52d8f..2b964a0 100644
--- a/sysdeps/x86_64/multiarch/memset.S
+++ b/sysdeps/x86_64/multiarch/memset.S
@@ -26,6 +26,9 @@
ENTRY(memset)
.type memset, @gnu_indirect_function
LOAD_RTLD_GLOBAL_RO_RDX
+ lea __memset_erms(%rip), %RAX_LP
+ HAS_ARCH_FEATURE (Prefer_ERMS)
+ jnz 2f
lea __memset_sse2_unaligned_erms(%rip), %RAX_LP
HAS_CPU_FEATURE (ERMS)
jnz 1f
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=4c0c381f8c7e4b37cf955a79720299c53bfe393c
commit 4c0c381f8c7e4b37cf955a79720299c53bfe393c
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Mon Jun 27 15:13:50 2016 -0700
Add GLIBC_IFUNC to control IFUNC selection
The current IFUNC selection is based on microbenchmarks in glibc. It
should give the best performance for most workloads. But other choices
may have better performance for a particular workload or on the hardware
which wasn't available when the selection was made. The environment
variable, GLIBC_IFUNC=xxx=0:yyy=1:zzz=0...., can be used to enable
CPU/ARCH feature yyy, disable CPU/ARCH feature xxx and zzz, where the
feature name is case-sensitive and has to match the ones in
cpu-features.h. It can be used by glibc developers to override the
IFUNC selection to improve performance for a particular workload or
tune for a new processor. It isn't intended for normal end users.
* sysdeps/i386/dl-machine.h (dl_platform_init): Pass the
array of environment strings to init_cpu_features.
* sysdeps/x86/libc-start.c (__libc_start_main): Likewise.
* sysdeps/x86/cpu-features.c (equal): New function.
(CHECK_GLIBC_IFUNC_CPU_OFF): New macro.
(CHECK_GLIBC_IFUNC_ARCH_OFF): Likewise.
(CHECK_GLIBC_IFUNC_ARCH_NEED_ARCH_BOTH): Likewise.
(CHECK_GLIBC_IFUNC_ARCH_NEED_CPU_BOTH): Likewise.
(init_cpu_features): Updated to take the array of environment
strings. Process GLIBC_IFUNC environment variable.
diff --git a/sysdeps/i386/dl-machine.h b/sysdeps/i386/dl-machine.h
index 4e3968a..7584931 100644
--- a/sysdeps/i386/dl-machine.h
+++ b/sysdeps/i386/dl-machine.h
@@ -240,7 +240,8 @@ dl_platform_init (void)
#ifdef SHARED
/* init_cpu_features has been called early from __libc_start_main in
static executable. */
- init_cpu_features (&GLRO(dl_x86_cpu_features));
+ init_cpu_features (&GLRO(dl_x86_cpu_features),
+ &_dl_argv[_dl_argc + 1]);
#endif
}
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index 9ce4b49..fcc74c9 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -91,8 +91,141 @@ get_common_indeces (struct cpu_features *cpu_features,
}
}
+#ifdef __x86_64__
+typedef long long op_t;
+#else
+typedef int op_t;
+#endif
+
+/* Return true if the first LEN bytes of strings A and B are the same
+ where LEN != 0. We don't use string/memory functions since they may
+ not be available here and this version is faster for its usage. */
+
+static bool
+equal (const char *a, const char *b, size_t len)
+{
+ size_t op_len = len % sizeof (op_t);
+ if (op_len)
+ {
+ switch (op_len)
+ {
+ case 1:
+ if (*(char *) a != *(char *) b)
+ return false;
+ break;
+ case 2:
+ if (*(short *) a != *(short *) b)
+ return false;
+ break;
+ case 3:
+ if (*(short *) a != *(short *) b
+ || *(char *) (a + 2) != *(char *) (b + 2))
+ return false;
+ break;
+#ifdef __x86_64__
+ case 4:
+ if (*(int *) a != *(int *) b)
+ return false;
+ break;
+ default:
+ if (*(int *) a != *(int *) b
+ || *(int *) (a + op_len - 4) != *(int *) (b + op_len - 4))
+ return false;
+ break;
+#else
+ default:
+ break;
+#endif
+ }
+ /* Align length to size of op_t. */
+ len -= op_len;
+ if (len == 0)
+ return true;
+ a += op_len;
+ b += op_len;
+ }
+
+ /* Compare one op_t at a time. */
+ do
+ {
+ if (*(op_t *) a != *(op_t *) b)
+ return false;
+ len -= sizeof (op_t);
+ if (len == 0)
+ return true;
+ a += sizeof (op_t);
+ b += sizeof (op_t);
+ }
+ while (1);
+}
+
+/* Disable a CPU feature by setting "name=0". We don't enable a CPU
+ feature which isn't availble. */
+#define CHECK_GLIBC_IFUNC_CPU_OFF(name) \
+ if (equal (p, #name "=", sizeof (#name))) \
+ { \
+ if (p[sizeof (#name)] == '0') \
+ cpu_features->cpuid[index_cpu_##name].reg_##name \
+ &= ~bit_cpu_##name; \
+ break; \
+ }
+
+/* Disable an ARCH feature by setting "name=0". We don't enable an
+ ARCH feature which isn't availble or has security implication. */
+#define CHECK_GLIBC_IFUNC_ARCH_OFF(name) \
+ if (equal (p, #name "=", sizeof (#name))) \
+ { \
+ if (p[sizeof (#name)] == '0') \
+ cpu_features->feature[index_arch_##name] \
+ &= ~bit_arch_##name; \
+ break; \
+ }
+
+/* Enable/disable an ARCH feature by setting "name=1"/"name=0". */
+#define CHECK_GLIBC_IFUNC_ARCH_BOTH(name) \
+ if (equal (p, #name "=", sizeof (#name))) \
+ { \
+ if (p[sizeof (#name)] == '0') \
+ cpu_features->feature[index_arch_##name] \
+ &= ~bit_arch_##name; \
+ else if (p[sizeof (#name)] == '1') \
+ cpu_features->feature[index_arch_##name] \
+ |= bit_arch_##name; \
+ break; \
+ }
+
+/* Disable an ARCH feature by setting "name=0". Enable an ARCH feature
+ by setting "name=1" if the ARCH feature NEED is also enabled. */
+#define CHECK_GLIBC_IFUNC_ARCH_NEED_ARCH_BOTH(name, need) \
+ if (equal (p, #name "=", sizeof (#name))) \
+ { \
+ if (p[sizeof (#name)] == '0') \
+ cpu_features->feature[index_arch_##name] \
+ &= ~bit_arch_##name; \
+ else if (p[sizeof (#name)] == '1' \
+ && CPU_FEATURES_ARCH_P (cpu_features, need)) \
+ cpu_features->feature[index_arch_##name] \
+ |= bit_arch_##name; \
+ break; \
+ }
+
+/* Disable an ARCH feature by setting "name=0". Enable an ARCH feature
+ by setting "name=1" if the CPU feature NEED is also enabled. */
+#define CHECK_GLIBC_IFUNC_ARCH_NEED_CPU_BOTH(name, need) \
+ if (equal (p, #name "=", sizeof (#name))) \
+ { \
+ if (p[sizeof (#name)] == '0') \
+ cpu_features->feature[index_arch_##name] \
+ &= ~bit_arch_##name; \
+ else if (p[sizeof (#name)] == '1' \
+ && CPU_FEATURES_CPU_P (cpu_features, need)) \
+ cpu_features->feature[index_arch_##name] \
+ |= bit_arch_##name; \
+ break; \
+ }
+
static inline void
-init_cpu_features (struct cpu_features *cpu_features)
+init_cpu_features (struct cpu_features *cpu_features, char **env)
{
unsigned int ebx, ecx, edx;
unsigned int family = 0;
@@ -268,4 +401,116 @@ no_cpuid:
cpu_features->family = family;
cpu_features->model = model;
cpu_features->kind = kind;
+
+ if (env == NULL)
+ return;
+
+ /* The current IFUNC selection is based on microbenchmarks in glibc.
+ It should give the best performance for most workloads. But other
+ choices may have better performance for a particular workload or on
+ the hardware which wasn't available at the selection was made. The
+ environment variable, GLIBC_IFUNC=xxx=0:yyy=1:zzz=0...., can be
+ used to enable CPU/ARCH feature yyy, disable CPU/ARCH feature yyy
+ and zzz, where the feature name is case-sensitive and has to match
+ the ones in cpu-features.h. It can be used by glibc developers to
+ override the IFUNC selection to improve performance for a particular
+ workload or tune for a new processor. */
+ while (*env != NULL)
+ {
+ const char *p, *end;
+ size_t len = sizeof ("GLIBC_IFUNC=");
+ end = *env;
+ for (p = end; *p != '\0'; p++)
+ if (--len == 0 && equal (end, "GLIBC_IFUNC=", 12))
+ {
+ len = strlen (p);
+ end = p + len;
+ do
+ {
+ const char *c;
+ for (c = p; *c != ':'; c++)
+ if (c >= end)
+ break;
+ len = c - p;
+ switch (len)
+ {
+ default:
+ break;
+ case 5:
+ CHECK_GLIBC_IFUNC_CPU_OFF (AVX);
+ CHECK_GLIBC_IFUNC_CPU_OFF (CX8);
+ CHECK_GLIBC_IFUNC_CPU_OFF (FMA);
+ CHECK_GLIBC_IFUNC_CPU_OFF (HTT);
+ CHECK_GLIBC_IFUNC_CPU_OFF (RTM);
+ break;
+ case 6:
+ CHECK_GLIBC_IFUNC_CPU_OFF (AVX2);
+ CHECK_GLIBC_IFUNC_CPU_OFF (CMOV);
+ CHECK_GLIBC_IFUNC_CPU_OFF (ERMS);
+ CHECK_GLIBC_IFUNC_CPU_OFF (FMA4);
+ CHECK_GLIBC_IFUNC_CPU_OFF (SSE2);
+ CHECK_GLIBC_IFUNC_ARCH_OFF (I586);
+ CHECK_GLIBC_IFUNC_ARCH_OFF (I686);
+ break;
+ case 7:
+ CHECK_GLIBC_IFUNC_CPU_OFF (SSSE3);
+ break;
+ case 8:
+ CHECK_GLIBC_IFUNC_CPU_OFF (SSE4_1);
+ CHECK_GLIBC_IFUNC_CPU_OFF (SSE4_2);
+ break;
+ case 9:
+ CHECK_GLIBC_IFUNC_CPU_OFF (AVX512F);
+ CHECK_GLIBC_IFUNC_CPU_OFF (OSXSAVE);
+ break;
+ case 10:
+ CHECK_GLIBC_IFUNC_CPU_OFF (AVX512DQ);
+ CHECK_GLIBC_IFUNC_CPU_OFF (POPCOUNT);
+ CHECK_GLIBC_IFUNC_ARCH_BOTH (Slow_BSF);
+ break;
+ case 12:
+ CHECK_GLIBC_IFUNC_ARCH_OFF (AVX_Usable);
+ CHECK_GLIBC_IFUNC_ARCH_OFF (FMA_Usable);
+ break;
+ case 13:
+ CHECK_GLIBC_IFUNC_ARCH_OFF (AVX2_Usable);
+ CHECK_GLIBC_IFUNC_ARCH_OFF (FMA4_Usable);
+ CHECK_GLIBC_IFUNC_ARCH_NEED_CPU_BOTH (Slow_SSE4_2,
+ SSE4_2);
+ break;
+ case 15:
+ CHECK_GLIBC_IFUNC_ARCH_OFF (AVX512F_Usable);
+ CHECK_GLIBC_IFUNC_ARCH_NEED_ARCH_BOTH
+ (AVX_Fast_Unaligned_Load, AVX_Usable);
+ break;
+ case 17:
+ CHECK_GLIBC_IFUNC_ARCH_OFF (AVX512DQ_Usable);
+ CHECK_GLIBC_IFUNC_ARCH_BOTH (Fast_Rep_String);
+ break;
+ case 20:
+ CHECK_GLIBC_IFUNC_ARCH_BOTH (Fast_Copy_Backward);
+ break;
+ case 21:
+ CHECK_GLIBC_IFUNC_ARCH_BOTH (Fast_Unaligned_Load);
+ CHECK_GLIBC_IFUNC_ARCH_BOTH (Fast_Unaligned_Copy);
+ break;
+ case 22:
+ CHECK_GLIBC_IFUNC_ARCH_NEED_ARCH_BOTH
+ (Prefer_No_VZEROUPPER, AVX_Usable);
+ break;
+ case 23:
+ CHECK_GLIBC_IFUNC_ARCH_OFF (Prefer_MAP_32BIT_EXEC);
+ break;
+ case 28:
+ CHECK_GLIBC_IFUNC_ARCH_NEED_CPU_BOTH
+ (Prefer_PMINUB_for_stringop, SSE2);
+ break;
+ }
+ p += len + 1;
+ }
+ while (p < end);
+ return;
+ }
+ env++;
+ }
}
diff --git a/sysdeps/x86/libc-start.c b/sysdeps/x86/libc-start.c
index 3b5ea6e..7dec1ca 100644
--- a/sysdeps/x86/libc-start.c
+++ b/sysdeps/x86/libc-start.c
@@ -34,7 +34,7 @@ __libc_start_main (int (*main) (int, char **, char ** MAIN_AUXVEC_DECL),
void (*fini) (void),
void (*rtld_fini) (void), void *stack_end)
{
- init_cpu_features (&_dl_x86_cpu_features);
+ init_cpu_features (&_dl_x86_cpu_features, &argv[argc + 1]);
return generic_start_main (main, argc, argv, init, fini, rtld_fini,
stack_end);
}
diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h
index ed0c1a8..071a2e1 100644
--- a/sysdeps/x86_64/dl-machine.h
+++ b/sysdeps/x86_64/dl-machine.h
@@ -227,7 +227,8 @@ dl_platform_init (void)
#ifdef SHARED
/* init_cpu_features has been called early from __libc_start_main in
static executable. */
- init_cpu_features (&GLRO(dl_x86_cpu_features));
+ init_cpu_features (&GLRO(dl_x86_cpu_features),
+ &_dl_argv[_dl_argc + 1]);
#endif
}
-----------------------------------------------------------------------
hooks/post-receive
--
GNU C Library master sources