Bug 20195 - FMA4 detection requires CPUID execution with register eax=0x80000001
Summary: FMA4 detection requires CPUID execution with register eax=0x80000001
Status: RESOLVED FIXED
Alias: None
Product: glibc
Classification: Unclassified
Component: string (show other bugs)
Version: 2.24
: P2 normal
Target Milestone: 2.24
Assignee: Not yet assigned to anyone
URL:
Keywords:
Depends on:
Blocks:
 
Reported: 2016-06-03 07:21 UTC by Amit Pawar
Modified: 2016-06-07 15:34 UTC (History)
1 user (show)

See Also:
Host:
Target:
Build:
Last reconfirmed: 2016-06-06 00:00:00
fweimer: security-


Attachments
A patch (739 bytes, patch)
2016-06-06 21:01 UTC, H.J. Lu
Details | Diff

Note You need to log in before you can comment on or make changes to this bug.
Description Amit Pawar 2016-06-03 07:21:54 UTC
FMA4 detection is failing in GLIBC (from 2.23 release onwards) due to the fix for defect [BZ #19214].

Cause:
To detect for FMA4 support, CPUID needs to be called by setting the value 0x80000001 in $eax register. FMA4 support only found on AMD cpu's so this requires special handling and can't be detected through generic logic. Earlier detection logic was executed after the CPUID execution but now detection logic is executed before the CPUID execution.

GLIBC-2.23 release also fails to detect so I am filing this defect by selecting the version 2.23 and if this is not OK then will select version 2.24 for this defect.

The code diff is given below.

---------------------- DIFF ------------------------
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index c8f81ef..1787716 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -19,79 +19,23 @@
 #include <cpuid.h>
 #include <cpu-features.h>

-static void
+static inline void
 get_common_indeces (struct cpu_features *cpu_features,
                    unsigned int *family, unsigned int *model,
                    unsigned int *extended_model)
 {
-  if (family)
+  unsigned int eax;
+  __cpuid (1, eax, cpu_features->cpuid[COMMON_CPUID_INDEX_1].ebx,
+          cpu_features->cpuid[COMMON_CPUID_INDEX_1].ecx,
+          cpu_features->cpuid[COMMON_CPUID_INDEX_1].edx);
+  GLRO(dl_x86_cpu_features).cpuid[COMMON_CPUID_INDEX_1].eax = eax;
+  *family = (eax >> 8) & 0x0f;
+  *model = (eax >> 4) & 0x0f;
+  *extended_model = (eax >> 12) & 0xf0;
+  if (*family == 0x0f)
     {
-      unsigned int eax;
-      __cpuid (1, eax, cpu_features->cpuid[COMMON_CPUID_INDEX_1].ebx,
-              cpu_features->cpuid[COMMON_CPUID_INDEX_1].ecx,
-              cpu_features->cpuid[COMMON_CPUID_INDEX_1].edx);
-      cpu_features->cpuid[COMMON_CPUID_INDEX_1].eax = eax;
-      *family = (eax >> 8) & 0x0f;
-      *model = (eax >> 4) & 0x0f;
-      *extended_model = (eax >> 12) & 0xf0;
-      if (*family == 0x0f)
-       {
-         *family += (eax >> 20) & 0xff;
-         *model += *extended_model;
-       }
-    }
-
-  if (cpu_features->max_cpuid >= 7)
-    __cpuid_count (7, 0,
-                  cpu_features->cpuid[COMMON_CPUID_INDEX_7].eax,
-                  cpu_features->cpuid[COMMON_CPUID_INDEX_7].ebx,
-                  cpu_features->cpuid[COMMON_CPUID_INDEX_7].ecx,
-                  cpu_features->cpuid[COMMON_CPUID_INDEX_7].edx);
-
-  /* Can we call xgetbv?  */
-  if (CPU_FEATURES_CPU_P (cpu_features, OSXSAVE))
-    {
-      unsigned int xcrlow;
-      unsigned int xcrhigh;
-      asm ("xgetbv" : "=a" (xcrlow), "=d" (xcrhigh) : "c" (0));
-      /* Is YMM and XMM state usable?  */
-      if ((xcrlow & (bit_YMM_state | bit_XMM_state)) ==
-         (bit_YMM_state | bit_XMM_state))
-       {
-         /* Determine if AVX is usable.  */
-         if (CPU_FEATURES_CPU_P (cpu_features, AVX))
-           cpu_features->feature[index_arch_AVX_Usable]
-             |= bit_arch_AVX_Usable;
-         /* Determine if AVX2 is usable.  */
-         if (CPU_FEATURES_CPU_P (cpu_features, AVX2))
-           cpu_features->feature[index_arch_AVX2_Usable]
-             |= bit_arch_AVX2_Usable;
-         /* Check if OPMASK state, upper 256-bit of ZMM0-ZMM15 and
-            ZMM16-ZMM31 state are enabled.  */
-         if ((xcrlow & (bit_Opmask_state | bit_ZMM0_15_state
-                        | bit_ZMM16_31_state)) ==
-             (bit_Opmask_state | bit_ZMM0_15_state | bit_ZMM16_31_state))
-           {
-             /* Determine if AVX512F is usable.  */
-             if (CPU_FEATURES_CPU_P (cpu_features, AVX512F))
-               {
-                 cpu_features->feature[index_arch_AVX512F_Usable]
-                   |= bit_arch_AVX512F_Usable;
-                 /* Determine if AVX512DQ is usable.  */
-                 if (CPU_FEATURES_CPU_P (cpu_features, AVX512DQ))
-                   cpu_features->feature[index_arch_AVX512DQ_Usable]
-                     |= bit_arch_AVX512DQ_Usable;
-               }
-           }
-         /* Determine if FMA is usable.  */
-         if (CPU_FEATURES_CPU_P (cpu_features, FMA))
-           cpu_features->feature[index_arch_FMA_Usable]
-             |= bit_arch_FMA_Usable;
-         /* Determine if FMA4 is usable.  */
-         if (CPU_FEATURES_CPU_P (cpu_features, FMA4))
-           cpu_features->feature[index_arch_FMA4_Usable]
-             |= bit_arch_FMA4_Usable;
-       }
+      *family += (eax >> 20) & 0xff;
+      *model += *extended_model;
     }
 }
@@ -227,19 +165,73 @@ init_cpu_features (struct cpu_features *cpu_features)
        }
     }
   else
-    {
-      kind = arch_kind_other;
-      get_common_indeces (cpu_features, NULL, NULL, NULL);
-    }
+    kind = arch_kind_other;

   /* Support i586 if CX8 is available.  */
-  if (CPU_FEATURES_CPU_P (cpu_features, CX8))
+  if (HAS_CPU_FEATURE (CX8))
     cpu_features->feature[index_arch_I586] |= bit_arch_I586;

   /* Support i686 if CMOV is available.  */
-  if (CPU_FEATURES_CPU_P (cpu_features, CMOV))
+  if (HAS_CPU_FEATURE (CMOV))
     cpu_features->feature[index_arch_I686] |= bit_arch_I686;
+  if (cpu_features->max_cpuid >= 7)
+    __cpuid_count (7, 0,
+                  cpu_features->cpuid[COMMON_CPUID_INDEX_7].eax,
+                  cpu_features->cpuid[COMMON_CPUID_INDEX_7].ebx,
+                  cpu_features->cpuid[COMMON_CPUID_INDEX_7].ecx,
+                  cpu_features->cpuid[COMMON_CPUID_INDEX_7].edx);
+
+  /* Can we call xgetbv?  */
+  if (HAS_CPU_FEATURE (OSXSAVE))
+    {
+      unsigned int xcrlow;
+      unsigned int xcrhigh;
+      asm ("xgetbv" : "=a" (xcrlow), "=d" (xcrhigh) : "c" (0));
+      /* Is YMM and XMM state usable?  */
+      if ((xcrlow & (bit_YMM_state | bit_XMM_state)) ==
+         (bit_YMM_state | bit_XMM_state))
+       {
+         /* Determine if AVX is usable.  */
+         if (HAS_CPU_FEATURE (AVX))
+           cpu_features->feature[index_arch_AVX_Usable]
+             |= bit_arch_AVX_Usable;
+#if index_arch_AVX2_Usable != index_arch_AVX_Fast_Unaligned_Load
+# error index_arch_AVX2_Usable != index_arch_AVX_Fast_Unaligned_Load
+#endif
+         /* Determine if AVX2 is usable.  Unaligned load with 256-bit
+            AVX registers are faster on processors with AVX2.  */
+         if (HAS_CPU_FEATURE (AVX2))
+           cpu_features->feature[index_arch_AVX2_Usable]
+             |= bit_arch_AVX2_Usable | bit_arch_AVX_Fast_Unaligned_Load;
+         /* Check if OPMASK state, upper 256-bit of ZMM0-ZMM15 and
+            ZMM16-ZMM31 state are enabled.  */
+         if ((xcrlow & (bit_Opmask_state | bit_ZMM0_15_state
+                        | bit_ZMM16_31_state)) ==
+             (bit_Opmask_state | bit_ZMM0_15_state | bit_ZMM16_31_state))
+           {
+             /* Determine if AVX512F is usable.  */
+             if (HAS_CPU_FEATURE (AVX512F))
+               {
+                 cpu_features->feature[index_arch_AVX512F_Usable]
+                   |= bit_arch_AVX512F_Usable;
+                 /* Determine if AVX512DQ is usable.  */
+                 if (HAS_CPU_FEATURE (AVX512DQ))
+                   cpu_features->feature[index_arch_AVX512DQ_Usable]
+                     |= bit_arch_AVX512DQ_Usable;
+               }
+           }
+         /* Determine if FMA is usable.  */
+         if (HAS_CPU_FEATURE (FMA))
+           cpu_features->feature[index_arch_FMA_Usable]
+             |= bit_arch_FMA_Usable;
+         /* Determine if FMA4 is usable.  */
+         if (HAS_CPU_FEATURE (FMA4))
+           cpu_features->feature[index_arch_FMA4_Usable]
+             |= bit_arch_FMA4_Usable;
+       }
+    }
+
 #if !HAS_CPUID
 no_cpuid:
 #endif
Comment 1 H.J. Lu 2016-06-06 21:01:50 UTC
Created attachment 9325 [details]
A patch

Try this.
Comment 2 Amit Pawar 2016-06-07 05:48:42 UTC
This works fine. Can you please commit to the trunk.
Comment 3 Sourceware Commits 2016-06-07 15:32:18 UTC
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".

The branch, master has been updated
       via  91655fc307fec2d5a8d60446b4de11cc986b47fa (commit)
      from  c9bd40daaee18cf1d9824e4a7ebaebe321e0a5a8 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=91655fc307fec2d5a8d60446b4de11cc986b47fa

commit 91655fc307fec2d5a8d60446b4de11cc986b47fa
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Tue Jun 7 08:00:21 2016 -0700

    Check FMA after COMMON_CPUID_INDEX_80000001
    
    Since the FMA4 bit is in COMMON_CPUID_INDEX_80000001 and FMA4 requires
    AVX, determine if FMA4 is usable after COMMON_CPUID_INDEX_80000001 is
    available and if AVX is usable.
    
    	[BZ #20195]
    	* sysdeps/x86/cpu-features.c (get_common_indeces): Move FMA4
    	check to ...
    	(init_cpu_features): Here.

-----------------------------------------------------------------------

Summary of changes:
 ChangeLog                  |    7 +++++++
 sysdeps/x86/cpu-features.c |   13 +++++++++----
 2 files changed, 16 insertions(+), 4 deletions(-)
Comment 4 H.J. Lu 2016-06-07 15:34:47 UTC
Fixed.