[RFC PATCH 2/2] x86: Alternative <sys/platform/x86.h> implementation

H.J. Lu hjl.tools@gmail.com
Thu Dec 24 04:29:26 GMT 2020


On Wed, Dec 23, 2020 at 4:35 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Wed, Dec 23, 2020 at 11:15:01PM +0100, GNU C Library wrote:
> > This implementation uses a data symbol containing a pointer
> > to the bitmask array and an array length, not a function call.
> > The expectation is that this is the final link-time ABI for
> > this feature.  (The run-time ABI will have to change once more,
> > to support use of this facility in IFUNC resolvers.)
> >
> > The __libc_vars initialization mechanism is used to set up
> > the copy in libc.so.6 before relocation, so that it can be used
> > by IFUNC resolvers.
> >
> > Usage of the C preprocessor is greatly reduced, making it easier
> > to wrap this functionality in other languages.
> >
> > This is still a preview.  There are further cleanups possible,
> > including removal of the function symbol.  The manual still needs
> > updating, and there are a few overlong lines.  I'd like to receive
> > feedback if this is the direction in which we want to move.
> >
> > I think it should be possible to hack in IFUNC resolver support using a
> > custom dynamic section entry that points to a hidden __x86_cpu_array
> > variable.  It would be cleaner to use a new run-time-only relocation for
> > the initialization.  The dynamic section hack would not work with
> > --gc-sections, for instance.
> >
> >  76 files changed, 1032 insertions(+), 1355 deletions(-)
> >
> > diff --git a/sysdeps/i386/fpu/fclrexcpt.c b/sysdeps/i386/fpu/fclrexcpt.c
> > index 7dc357f2d6..79379f78ef 100644
> > --- a/sysdeps/i386/fpu/fclrexcpt.c
> > +++ b/sysdeps/i386/fpu/fclrexcpt.c
> > @@ -41,7 +41,7 @@ __feclearexcept (int excepts)
> >    __asm__ ("fldenv %0" : : "m" (*&temp));
> >
> >    /* If the CPU supports SSE, we clear the MXCSR as well.  */
> > -  if (CPU_FEATURE_USABLE (SSE))
> > +  if (x86_cpu_is_usable (x86_cpu_SSE))
>
> Can we do
>
> #define CPU_FEATURE_USABLE(name) x86_cpu_is_usable (x86_cpu_##name)
>
> instead?
>
> > diff --git a/sysdeps/x86/bits/platform/x86.h b/sysdeps/x86/bits/platform/x86.h
> > new file mode 100644
> > index 0000000000..dd9a273f5b
> > --- /dev/null
> > +++ b/sysdeps/x86/bits/platform/x86.h
> > @@ -0,0 +1,245 @@
> > +/* Constants for x86 CPU features and struct x86_cpu_array definition.
> > +   This file is part of the GNU C Library.
> > +   Copyright (C) 2008-2020 Free Software Foundation, Inc.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#ifndef _SYS_PLATFORM_X86_H
> > +# error "Never include <bits/platform/x86.h> directly; use <sys/platform/x86.h> instead."
> > +#endif
> > +
> > +struct x86_cpu_array
> > +{
> > +  /* Pointer to an array of __x86_count 32-bit values.  */
> > +  const unsigned int *__x86_word;
> > +  unsigned int __x86_count;
> > +#ifdef __LP64__
> > +  unsigned int __x86_padding;
> > +#endif
> > +};
> > +
> > +enum
> > +{
> > +  /* CPUID.01H:ECX.  */
> > +  x86_cpu_SSE3                =  0u * 64u + 0u,
> > +  x86_cpu_PCLMULQDQ           =  0u * 64u + 1u,
> > +  x86_cpu_DTES64              =  0u * 64u + 2u,
> > +  x86_cpu_MONITOR             =  0u * 64u + 3u,
> > +  x86_cpu_DS_CPL              =  0u * 64u + 4u,
> > +  x86_cpu_VMX                 =  0u * 64u + 5u,
> > +  x86_cpu_SMX                 =  0u * 64u + 6u,
> > +  x86_cpu_EIST                =  0u * 64u + 7u,
> > +  x86_cpu_TM2                 =  0u * 64u + 8u,
> > +  x86_cpu_SSSE3               =  0u * 64u + 9u,
> > +  x86_cpu_CNXT_ID             =  0u * 64u + 10u,
> > +  x86_cpu_SDBG                =  0u * 64u + 11u,
> > +  x86_cpu_FMA                 =  0u * 64u + 12u,
> > +  x86_cpu_CMPXCHG16B          =  0u * 64u + 13u,
> > +  x86_cpu_XTPRUPDCTRL         =  0u * 64u + 14u,
> > +  x86_cpu_PDCM                =  0u * 64u + 15u,
> > +  x86_cpu_INDEX_1_ECX_16      =  0u * 64u + 16u,
> > +  x86_cpu_PCID                =  0u * 64u + 17u,
> > +  x86_cpu_DCA                 =  0u * 64u + 18u,
> > +  x86_cpu_SSE4_1              =  0u * 64u + 19u,
> > +  x86_cpu_SSE4_2              =  0u * 64u + 20u,
> > +  x86_cpu_X2APIC              =  0u * 64u + 21u,
> > +  x86_cpu_MOVBE               =  0u * 64u + 22u,
> > +  x86_cpu_POPCNT              =  0u * 64u + 23u,
> > +  x86_cpu_TSC_DEADLINE        =  0u * 64u + 24u,
> > +  x86_cpu_AES                 =  0u * 64u + 25u,
> > +  x86_cpu_XSAVE               =  0u * 64u + 26u,
> > +  x86_cpu_OSXSAVE             =  0u * 64u + 27u,
> > +  x86_cpu_AVX                 =  0u * 64u + 28u,
> > +  x86_cpu_F16C                =  0u * 64u + 29u,
> > +  x86_cpu_RDRAND              =  0u * 64u + 30u,
> > +  x86_cpu_INDEX_1_ECX_31      =  0u * 64u + 31u,
> > +
> > +  /* CPUID.01H:EDX.  */
> > +  x86_cpu_FPU                 =  1u * 64u + 0u,
> > +  x86_cpu_VME                 =  1u * 64u + 1u,
> > +  x86_cpu_DE                  =  1u * 64u + 2u,
> > +  x86_cpu_PSE                 =  1u * 64u + 3u,
> > +  x86_cpu_TSC                 =  1u * 64u + 4u,
> > +  x86_cpu_MSR                 =  1u * 64u + 5u,
> > +  x86_cpu_PAE                 =  1u * 64u + 6u,
> > +  x86_cpu_MCE                 =  1u * 64u + 7u,
> > +  x86_cpu_CX8                 =  1u * 64u + 8u,
> > +  x86_cpu_APIC                =  1u * 64u + 9u,
> > +  x86_cpu_INDEX_1_EDX_10      =  1u * 64u + 10u,
> > +  x86_cpu_SEP                 =  1u * 64u + 11u,
> > +  x86_cpu_MTRR                =  1u * 64u + 12u,
> > +  x86_cpu_PGE                 =  1u * 64u + 13u,
> > +  x86_cpu_MCA                 =  1u * 64u + 14u,
> > +  x86_cpu_CMOV                =  1u * 64u + 15u,
> > +  x86_cpu_PAT                 =  1u * 64u + 16u,
> > +  x86_cpu_PSE_36              =  1u * 64u + 17u,
> > +  x86_cpu_PSN                 =  1u * 64u + 18u,
> > +  x86_cpu_CLFSH               =  1u * 64u + 19u,
> > +  x86_cpu_INDEX_1_EDX_20      =  1u * 64u + 20u,
> > +  x86_cpu_DS                  =  1u * 64u + 21u,
> > +  x86_cpu_ACPI                =  1u * 64u + 22u,
> > +  x86_cpu_MMX                 =  1u * 64u + 23u,
> > +  x86_cpu_FXSR                =  1u * 64u + 24u,
> > +  x86_cpu_SSE                 =  1u * 64u + 25u,
> > +  x86_cpu_SSE2                =  1u * 64u + 26u,
> > +  x86_cpu_SS                  =  1u * 64u + 27u,
> > +  x86_cpu_HTT                 =  1u * 64u + 28u,
> > +  x86_cpu_TM                  =  1u * 64u + 29u,
> > +  x86_cpu_INDEX_1_EDX_30      =  1u * 64u + 30u,
> > +  x86_cpu_PBE                 =  1u * 64u + 31u,
> > +
> > +  /* CPUID.07H.0H:EBX.  */
> > +  x86_cpu_FSGSBASE            =  2u * 64u + 0u,
> > +  x86_cpu_TSC_ADJUST          =  2u * 64u + 1u,
> > +  x86_cpu_SGX                 =  2u * 64u + 2u,
> > +  x86_cpu_BMI1                =  2u * 64u + 3u,
> > +  x86_cpu_HLE                 =  2u * 64u + 4u,
> > +  x86_cpu_AVX2                =  2u * 64u + 5u,
> > +  x86_cpu_INDEX_7_EBX_6       =  2u * 64u + 6u,
> > +  x86_cpu_SMEP                =  2u * 64u + 7u,
> > +  x86_cpu_BMI2                =  2u * 64u + 8u,
> > +  x86_cpu_ERMS                =  2u * 64u + 9u,
> > +  x86_cpu_INVPCID             =  2u * 64u + 10u,
> > +  x86_cpu_RTM                 =  2u * 64u + 11u,
> > +  x86_cpu_RDT_M               =  2u * 64u + 12u,
> > +  x86_cpu_DEPR_FPU_CS_DS      =  2u * 64u + 13u,
> > +  x86_cpu_MPX                 =  2u * 64u + 14u,
> > +  x86_cpu_RDT_A               =  2u * 64u + 15u,
> > +  x86_cpu_AVX512F             =  2u * 64u + 16u,
> > +  x86_cpu_AVX512DQ            =  2u * 64u + 17u,
> > +  x86_cpu_RDSEED              =  2u * 64u + 18u,
> > +  x86_cpu_ADX                 =  2u * 64u + 19u,
> > +  x86_cpu_SMAP                =  2u * 64u + 20u,
> > +  x86_cpu_AVX512_IFMA         =  2u * 64u + 21u,
> > +  x86_cpu_INDEX_7_EBX_22      =  2u * 64u + 22u,
> > +  x86_cpu_CLFLUSHOPT          =  2u * 64u + 23u,
> > +  x86_cpu_CLWB                =  2u * 64u + 24u,
> > +  x86_cpu_TRACE               =  2u * 64u + 25u,
> > +  x86_cpu_AVX512PF            =  2u * 64u + 26u,
> > +  x86_cpu_AVX512ER            =  2u * 64u + 27u,
> > +  x86_cpu_AVX512CD            =  2u * 64u + 28u,
> > +  x86_cpu_SHA                 =  2u * 64u + 29u,
> > +  x86_cpu_AVX512BW            =  2u * 64u + 30u,
> > +  x86_cpu_AVX512VL            =  2u * 64u + 31u,
> > +
> > +  /* CPUID.07H.0H:ECX.  */
> > +  x86_cpu_PREFETCHWT1         =  3u * 64u + 0u,
> > +  x86_cpu_AVX512_VBMI         =  3u * 64u + 1u,
> > +  x86_cpu_UMIP                =  3u * 64u + 2u,
> > +  x86_cpu_PKU                 =  3u * 64u + 3u,
> > +  x86_cpu_OSPKE               =  3u * 64u + 4u,
> > +  x86_cpu_WAITPKG             =  3u * 64u + 5u,
> > +  x86_cpu_AVX512_VBMI2        =  3u * 64u + 6u,
> > +  x86_cpu_SHSTK               =  3u * 64u + 7u,
> > +  x86_cpu_GFNI                =  3u * 64u + 8u,
> > +  x86_cpu_VAES                =  3u * 64u + 9u,
> > +  x86_cpu_VPCLMULQDQ          =  3u * 64u + 10u,
> > +  x86_cpu_AVX512_VNNI         =  3u * 64u + 11u,
> > +  x86_cpu_AVX512_BITALG       =  3u * 64u + 12u,
> > +  x86_cpu_INDEX_7_ECX_13      =  3u * 64u + 13u,
> > +  x86_cpu_AVX512_VPOPCNTDQ    =  3u * 64u + 14u,
> > +  x86_cpu_INDEX_7_ECX_15      =  3u * 64u + 15u,
> > +  x86_cpu_INDEX_7_ECX_16      =  3u * 64u + 16u,
> > +  /* Note: Bits 17-21: The value of MAWAU used by the BNDLDX and
> > +     BNDSTX instructions in 64-bit mode.  */
> > +  x86_cpu_RDPID               =  3u * 64u + 22u,
> > +  x86_cpu_KL                  =  3u * 64u + 23u,
> > +  x86_cpu_INDEX_7_ECX_24      =  3u * 64u + 24u,
> > +  x86_cpu_CLDEMOTE            =  3u * 64u + 25u,
> > +  x86_cpu_INDEX_7_ECX_26      =  3u * 64u + 26u,
> > +  x86_cpu_MOVDIRI             =  3u * 64u + 27u,
> > +  x86_cpu_MOVDIR64B           =  3u * 64u + 28u,
> > +  x86_cpu_ENQCMD              =  3u * 64u + 29u,
> > +  x86_cpu_SGX_LC              =  3u * 64u + 30u,
> > +  x86_cpu_PKS                 =  3u * 64u + 31u,
> > +
> > +  /* CPUID.07H.0H:EDX.  */
> > +  x86_cpu_INDEX_7_EDX_0       =  4u * 64u + 0u,
> > +  x86_cpu_INDEX_7_EDX_1       =  4u * 64u + 1u,
> > +  x86_cpu_AVX512_4VNNIW       =  4u * 64u + 2u,
> > +  x86_cpu_AVX512_4FMAPS       =  4u * 64u + 3u,
> > +  x86_cpu_FSRM                =  4u * 64u + 4u,
> > +  x86_cpu_UINTR               =  4u * 64u + 5u,
> > +  x86_cpu_INDEX_7_EDX_6       =  4u * 64u + 6u,
> > +  x86_cpu_INDEX_7_EDX_7       =  4u * 64u + 7u,
> > +  x86_cpu_AVX512_VP2INTERSECT =  4u * 64u + 8u,
> > +  x86_cpu_INDEX_7_EDX_9       =  4u * 64u + 9u,
> > +  x86_cpu_MD_CLEAR            =  4u * 64u + 10u,
> > +  x86_cpu_INDEX_7_EDX_11      =  4u * 64u + 11u,
> > +  x86_cpu_INDEX_7_EDX_12      =  4u * 64u + 12u,
> > +  x86_cpu_INDEX_7_EDX_13      =  4u * 64u + 13u,
> > +  x86_cpu_SERIALIZE           =  4u * 64u + 14u,
> > +  x86_cpu_HYBRID              =  4u * 64u + 15u,
> > +  x86_cpu_TSXLDTRK            =  4u * 64u + 16u,
> > +  x86_cpu_INDEX_7_EDX_17      =  4u * 64u + 17u,
> > +  x86_cpu_PCONFIG             =  4u * 64u + 18u,
> > +  x86_cpu_INDEX_7_EDX_19      =  4u * 64u + 19u,
> > +  x86_cpu_IBT                 =  4u * 64u + 20u,
> > +  x86_cpu_INDEX_7_EDX_21      =  4u * 64u + 21u,
> > +  x86_cpu_AMX_BF16            =  4u * 64u + 22u,
> > +  x86_cpu_AVX512_FP16         =  4u * 64u + 23u,
> > +  x86_cpu_AMX_TILE            =  4u * 64u + 24u,
> > +  x86_cpu_AMX_INT8            =  4u * 64u + 25u,
> > +  x86_cpu_IBRS_IBPB           =  4u * 64u + 26u,
> > +  x86_cpu_STIBP               =  4u * 64u + 27u,
> > +  x86_cpu_L1D_FLUSH           =  4u * 64u + 28u,
> > +  x86_cpu_ARCH_CAPABILITIES   =  4u * 64u + 29u,
> > +  x86_cpu_CORE_CAPABILITIES   =  4u * 64u + 30u,
> > +  x86_cpu_SSBD                =  4u * 64u + 31u,
> > +
> > +  /* CPUID.80000001H:ECX.  */
> > +  x86_cpu_LAHF64_SAHF64       =  5u * 64u + 0u,
> > +  x86_cpu_SVM                 =  5u * 64u + 2u,
> > +  x86_cpu_LZCNT               =  5u * 64u + 5u,
> > +  x86_cpu_SSE4A               =  5u * 64u + 6u,
> > +  x86_cpu_PREFETCHW           =  5u * 64u + 8u,
> > +  x86_cpu_XOP                 =  5u * 64u + 11u,
> > +  x86_cpu_LWP                 =  5u * 64u + 15u,
> > +  x86_cpu_FMA4                =  5u * 64u + 16u,
> > +  x86_cpu_TBM                 =  5u * 64u + 21u,
> > +
> > +  /* CPUID.80000001H:EDX.  */
> > +  x86_cpu_SYSCALL_SYSRET      =  6u * 64u + 11u,
> > +  x86_cpu_NX                  =  6u * 64u + 20u,
> > +  x86_cpu_PAGE1GB             =  6u * 64u + 26u,
> > +  x86_cpu_RDTSCP              =  6u * 64u + 27u,
> > +  x86_cpu_LM                  =  6u * 64u + 29u,
> > +
> > +  /* CPUID.(EAX=0DH,ECX=1):EAX.  */
> > +  x86_cpu_XSAVEOPT            =  7u * 64u + 0u,
> > +  x86_cpu_XSAVEC              =  7u * 64u + 1u,
> > +  x86_cpu_XGETBV_ECX_1        =  7u * 64u + 2u,
> > +  x86_cpu_XSAVES              =  7u * 64u + 3u,
> > +  x86_cpu_XFD                 =  7u * 64u + 4u,
> > +
> > +  /* CPUID.80000007H:EDX.  */
> > +  x86_cpu_INVARIANT_TSC       =  8u * 64u + 8u,
> > +
> > +  /* CPUID.80000008H:EBX.  */
> > +  x86_cpu_WBNOINVD            =  9u * 64u + 9u,
> > +
> > +  /* CPUID.(EAX=07H,ECX=1):EAX.  */
> > +  x86_cpu_AVX_VNNI            = 10u * 64u + 4u,
> > +  x86_cpu_AVX512_BF16         = 10u * 64u + 5u,
> > +  x86_cpu_FZLRM               = 10u * 64u + 10u,
> > +  x86_cpu_FSRS                = 10u * 64u + 11u,
> > +  x86_cpu_FSRCS               = 10u * 64u + 12u,
> > +  x86_cpu_HRESET              = 10u * 64u + 22u,
> > +  x86_cpu_LAM                 = 10u * 64u + 26u,
> > +
> > +  /* CPUID.19H:EBX.  */
> > +  x86_cpu_AESKLE              = 11u * 64u + 0u,
> > +  x86_cpu_WIDE_KL             = 11u * 64u + 2u,
> > +};
>
> Can we change
>
> struct cpuid_registers
> {
>   unsigned int eax;
>   unsigned int ebx;
>   unsigned int ecx;
>   unsigned int edx;
> };
>
> to an array
>
> struct cpuid_registers
> {
>   unsigned int cpuid_registers[4];
> } cpuid;
>
> define x86_cpu_AVX_XXX to the bit position into the cpuid_registers array?
>
> > diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
> > index 0aec0e2875..4a68072f3d 100644
> > --- a/sysdeps/x86/cacheinfo.h
> > +++ b/sysdeps/x86/cacheinfo.h
> > @@ -91,7 +91,7 @@ get_common_cache_info (long int *shared_ptr, unsigned int *threads_ptr,
> >
> >    /* A value of 0 for the HTT bit indicates there is only a single
> >       logical processor.  */
> > -  if (HAS_CPU_FEATURE (HTT))
> > +  if (x86_cpu_has_feature (x86_cpu_HTT))
> >      {
> >        /* Figure out the number of logical threads that share the
> >           highest cache level.  */
> > @@ -236,12 +236,12 @@ get_common_cache_info (long int *shared_ptr, unsigned int *threads_ptr,
> >          }
> >        else
> >          {
> > -intel_bug_no_cache_info:
> > +intel_bug_no_cache_info:;
>
> Is this change needed?
>
> >            /* Assume that all logical threads share the highest cache
> >               level.  */
> > -          threads
> > -            = ((cpu_features->features[COMMON_CPUID_INDEX_1].cpuid.ebx
> > -                >> 16) & 0xff);
> > +       unsigned int eax, ebx, ecx, edx;
> > +       __cpuid (1, eax, ebx, ecx, edx);
>
> We can avoid __cpuid.
>
> > +          threads = (ebx >> 16) & 0xff;
> >          }
> >
> >          /* Cap usage of highest cache level to the number of supported
> > @@ -401,7 +401,7 @@ init_cacheinfo (void)
> >    unsigned int minimum_rep_movsb_threshold;
> >    /* NB: The default REP MOVSB threshold is 2048 * (VEC_SIZE / 16).  */
> >    unsigned int rep_movsb_threshold;
> > -  if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
> > +  if (x86_cpu_is_usable (x86_cpu_AVX512F)
> >        && !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512))
> >      {
> >        rep_movsb_threshold = 2048 * (64 / 16);
> > diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
> > index fe080b63b2..f9cf6bbfba 100644
> > --- a/sysdeps/x86/cpu-features.c
> > +++ b/sysdeps/x86/cpu-features.c
> > @@ -46,65 +46,80 @@ extern void TUNABLE_CALLBACK (set_x86_shstk) (tunable_val_t *)
> >  # include <dl-cet.h>
> >  #endif
> >
> > +/* Copy one indexed bit from the features word to the usable word.  */
> > +static inline void
> > +copy_usable_1 (unsigned int index)
> > +{
> > +  __x86_cpu_array_private[(index >> 5) + 1]
> > +    |= __x86_cpu_array_private[index >> 5] & (1u << (index % 32));
> > +}
> > +
> > +/* Mark one feature as usable.  */
> > +static inline void
> > +set_usable_1 (unsigned int index)
> > +{
> > +  __x86_cpu_array_private[(index >> 5) + 1] |= 1u << (index % 32);
> > +}
> > +
>
> We can do
>
> static inline _Bool
> x86_cpu_is_usable (unsigned int __index)
> {
>   unsigned int index1 = __index / (8 * sizeof (struct cpuid_registers));
>   unsigned int index2 = __index & ((8 * sizeof (struct cpuid_registers)) - 1);
>   unsigned int shift = index2 & ((8 * sizeof (unsigned int)) - 1);
>   index2 = index2 / (8 * sizeof (unsigned int));
>
>   return usable[index1][index2] & (1 << shift);
> }
>

Please take a look at users/hjl/cpuid/master branch:

https://gitlab.com/x86-glibc/glibc/-/commits/users/hjl/cpuid/master

with

#define CPU_FEATURE_INDEX(name) \
  (index_cpu_##name * 8 * sizeof (unsigned int) * 4 \
   + reg_##name * 8 * sizeof (unsigned int) + bit_cpu_##name)

to compute the feature index.  Of course, you can expand it explicitly
for each feature.


-- 
H.J.


More information about the Libc-alpha mailing list