This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
PATCH: Enable SSE2 memset for AMD'supcoming Orochi processor and bugfix
- From: Harsha Jagasia <harsha dot jagasia at amd dot com>
- To: <libc-alpha at sourceware dot org>, <harsha dot jagasia at amd dot com>
- Cc: Harsha Jagasia <harsha dot jagasia at amd dot com>
- Date: Wed, 2 Mar 2011 13:20:43 -0600
- Subject: PATCH: Enable SSE2 memset for AMD'supcoming Orochi processor and bugfix
Hi,
This patch enables SSE2 memset for AMD's upcoming Orochi processor.
This patch also fixes the following bug:
For misaligned blocks larger than > 144 Bytes, memset branches into
the integer code path depending on the value of misalignment even if
the startup code chooses the SSE2 code path upfront, when multiarch
is enabled.
-Harsha
2011-03-02 Harsha Jagasia <harsha.jagasia@amd.com>
* sysdeps/x86_64/cacheinfo.c (init_cacheinfo):
Set _x86_64_preferred_memory_instruction for AMD processsors.
* sysdeps/x86_64/multiarch/init-arch.c (__init_cpu_features):
Set bit_Prefer_SSE_for_memop for AMD processors.
* sysdeps/x86_64/memset.S: After aligning destination, code
branches to different locations depending on the value of
misalignment, when multiarch is enabled. Fix this.
diff --git a/sysdeps/x86_64/cacheinfo.c b/sysdeps/x86_64/cacheinfo.c
index eae54e7..92bc68c 100644
--- a/sysdeps/x86_64/cacheinfo.c
+++ b/sysdeps/x86_64/cacheinfo.c
@@ -621,6 +621,27 @@ init_cacheinfo (void)
long int core = handle_amd (_SC_LEVEL2_CACHE_SIZE);
shared = handle_amd (_SC_LEVEL3_CACHE_SIZE);
+ unsigned int ebx_1;
+
+#ifdef USE_MULTIARCH
+ eax = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].eax;
+ ebx_1 = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].ebx;
+ ecx = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].ecx;
+ edx = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].edx;
+#else
+ __cpuid (1, eax, ebx_1, ecx, edx);
+#endif
+
+#ifndef DISABLE_PREFERRED_MEMORY_INSTRUCTION
+ /* AMD prefers SSSE3 instructions for memory/string routines
+ if they are avaiable, otherwise it prefers integer
+ instructions. */
+ if ((ecx & 0x200))
+ __x86_64_preferred_memory_instruction = 3;
+ else
+ __x86_64_preferred_memory_instruction = 0;
+#endif
+
/* Get maximum extended function. */
__cpuid (0x80000000, max_cpuid_ex, ebx, ecx, edx);
diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
index f6eb71f..9aa55ff 100644
--- a/sysdeps/x86_64/memset.S
+++ b/sysdeps/x86_64/memset.S
@@ -592,8 +592,11 @@ L(A7Q0): mov %dl,-0x7(%rdi)
L(A6Q0): mov %dx,-0x6(%rdi)
mov %edx,-0x4(%rdi)
+ .balign 16
+L(aligned_now):
+
#ifndef USE_MULTIARCH
- jmp L(aligned_now)
+ jmp L(check_code_path)
L(SSE_pre):
#endif
@@ -1201,7 +1204,7 @@ L(SSExDx):
#endif /* !defined USE_MULTIARCH || defined USE_SSE2 */
.balign 16
-L(aligned_now):
+L(check_code_path):
#ifndef USE_MULTIARCH
cmpl $0x1,__x86_64_preferred_memory_instruction(%rip)
diff --git a/sysdeps/x86_64/multiarch/init-arch.c b/sysdeps/x86_64/multiarch/init-arch.c
index f0d2bb7..4716532 100644
--- a/sysdeps/x86_64/multiarch/init-arch.c
+++ b/sysdeps/x86_64/multiarch/init-arch.c
@@ -107,6 +107,18 @@ __init_cpu_features (void)
kind = arch_kind_amd;
get_common_indeces (&family, &model);
+
+ unsigned int ecx = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].ecx;
+
+ /* AMD processors prefer SSE instructions for memory/string routines
+ if they are avaiable, otherwise they prefer integer
+ instructions. */
+ if ((ecx & 0x200))
+ __cpu_features.feature[index_Prefer_SSE_for_memop]
+ |= bit_Prefer_SSE_for_memop;
+ else
+ __cpu_features.feature[index_Prefer_SSE_for_memop]
+ |= 0;
}
else
kind = arch_kind_other;