[RFC PATCH 2/2] x86: Alternative <sys/platform/x86.h> implementation
H.J. Lu
hjl.tools@gmail.com
Thu Dec 24 04:29:26 GMT 2020
On Wed, Dec 23, 2020 at 4:35 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Wed, Dec 23, 2020 at 11:15:01PM +0100, GNU C Library wrote:
> > This implementation uses a data symbol containing a pointer
> > to the bitmask array and an array length, not a function call.
> > The expectation is that this is the final link-time ABI for
> > this feature. (The run-time ABI will have to change once more,
> > to support use of this facility in IFUNC resolvers.)
> >
> > The __libc_vars initialization mechanism is used to set up
> > the copy in libc.so.6 before relocation, so that it can be used
> > by IFUNC resolvers.
> >
> > Usage of the C preprocessor is greatly reduced, making it easier
> > to wrap this functionality in other languages.
> >
> > This is still a preview. There are further cleanups possible,
> > including removal of the function symbol. The manual still needs
> > updating, and there are a few overlong lines. I'd like to receive
> > feedback if this is the direction in which we want to move.
> >
> > I think it should be possible to hack in IFUNC resolver support using a
> > custom dynamic section entry that points to a hidden __x86_cpu_array
> > variable. It would be cleaner to use a new run-time-only relocation for
> > the initialization. The dynamic section hack would not work with
> > --gc-sections, for instance.
> >
> > 76 files changed, 1032 insertions(+), 1355 deletions(-)
> >
> > diff --git a/sysdeps/i386/fpu/fclrexcpt.c b/sysdeps/i386/fpu/fclrexcpt.c
> > index 7dc357f2d6..79379f78ef 100644
> > --- a/sysdeps/i386/fpu/fclrexcpt.c
> > +++ b/sysdeps/i386/fpu/fclrexcpt.c
> > @@ -41,7 +41,7 @@ __feclearexcept (int excepts)
> > __asm__ ("fldenv %0" : : "m" (*&temp));
> >
> > /* If the CPU supports SSE, we clear the MXCSR as well. */
> > - if (CPU_FEATURE_USABLE (SSE))
> > + if (x86_cpu_is_usable (x86_cpu_SSE))
>
> Can we do
>
> #define CPU_FEATURE_USABLE(name) x86_cpu_is_usable (x86_cpu_##name)
>
> instead?
>
> > diff --git a/sysdeps/x86/bits/platform/x86.h b/sysdeps/x86/bits/platform/x86.h
> > new file mode 100644
> > index 0000000000..dd9a273f5b
> > --- /dev/null
> > +++ b/sysdeps/x86/bits/platform/x86.h
> > @@ -0,0 +1,245 @@
> > +/* Constants for x86 CPU features and struct x86_cpu_array definition.
> > + This file is part of the GNU C Library.
> > + Copyright (C) 2008-2020 Free Software Foundation, Inc.
> > +
> > + The GNU C Library is free software; you can redistribute it and/or
> > + modify it under the terms of the GNU Lesser General Public
> > + License as published by the Free Software Foundation; either
> > + version 2.1 of the License, or (at your option) any later version.
> > +
> > + The GNU C Library is distributed in the hope that it will be useful,
> > + but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > + Lesser General Public License for more details.
> > +
> > + You should have received a copy of the GNU Lesser General Public
> > + License along with the GNU C Library; if not, see
> > + <https://www.gnu.org/licenses/>. */
> > +
> > +#ifndef _SYS_PLATFORM_X86_H
> > +# error "Never include <bits/platform/x86.h> directly; use <sys/platform/x86.h> instead."
> > +#endif
> > +
> > +struct x86_cpu_array
> > +{
> > + /* Pointer to an array of __x86_count 32-bit values. */
> > + const unsigned int *__x86_word;
> > + unsigned int __x86_count;
> > +#ifdef __LP64__
> > + unsigned int __x86_padding;
> > +#endif
> > +};
> > +
> > +enum
> > +{
> > + /* CPUID.01H:ECX. */
> > + x86_cpu_SSE3 = 0u * 64u + 0u,
> > + x86_cpu_PCLMULQDQ = 0u * 64u + 1u,
> > + x86_cpu_DTES64 = 0u * 64u + 2u,
> > + x86_cpu_MONITOR = 0u * 64u + 3u,
> > + x86_cpu_DS_CPL = 0u * 64u + 4u,
> > + x86_cpu_VMX = 0u * 64u + 5u,
> > + x86_cpu_SMX = 0u * 64u + 6u,
> > + x86_cpu_EIST = 0u * 64u + 7u,
> > + x86_cpu_TM2 = 0u * 64u + 8u,
> > + x86_cpu_SSSE3 = 0u * 64u + 9u,
> > + x86_cpu_CNXT_ID = 0u * 64u + 10u,
> > + x86_cpu_SDBG = 0u * 64u + 11u,
> > + x86_cpu_FMA = 0u * 64u + 12u,
> > + x86_cpu_CMPXCHG16B = 0u * 64u + 13u,
> > + x86_cpu_XTPRUPDCTRL = 0u * 64u + 14u,
> > + x86_cpu_PDCM = 0u * 64u + 15u,
> > + x86_cpu_INDEX_1_ECX_16 = 0u * 64u + 16u,
> > + x86_cpu_PCID = 0u * 64u + 17u,
> > + x86_cpu_DCA = 0u * 64u + 18u,
> > + x86_cpu_SSE4_1 = 0u * 64u + 19u,
> > + x86_cpu_SSE4_2 = 0u * 64u + 20u,
> > + x86_cpu_X2APIC = 0u * 64u + 21u,
> > + x86_cpu_MOVBE = 0u * 64u + 22u,
> > + x86_cpu_POPCNT = 0u * 64u + 23u,
> > + x86_cpu_TSC_DEADLINE = 0u * 64u + 24u,
> > + x86_cpu_AES = 0u * 64u + 25u,
> > + x86_cpu_XSAVE = 0u * 64u + 26u,
> > + x86_cpu_OSXSAVE = 0u * 64u + 27u,
> > + x86_cpu_AVX = 0u * 64u + 28u,
> > + x86_cpu_F16C = 0u * 64u + 29u,
> > + x86_cpu_RDRAND = 0u * 64u + 30u,
> > + x86_cpu_INDEX_1_ECX_31 = 0u * 64u + 31u,
> > +
> > + /* CPUID.01H:EDX. */
> > + x86_cpu_FPU = 1u * 64u + 0u,
> > + x86_cpu_VME = 1u * 64u + 1u,
> > + x86_cpu_DE = 1u * 64u + 2u,
> > + x86_cpu_PSE = 1u * 64u + 3u,
> > + x86_cpu_TSC = 1u * 64u + 4u,
> > + x86_cpu_MSR = 1u * 64u + 5u,
> > + x86_cpu_PAE = 1u * 64u + 6u,
> > + x86_cpu_MCE = 1u * 64u + 7u,
> > + x86_cpu_CX8 = 1u * 64u + 8u,
> > + x86_cpu_APIC = 1u * 64u + 9u,
> > + x86_cpu_INDEX_1_EDX_10 = 1u * 64u + 10u,
> > + x86_cpu_SEP = 1u * 64u + 11u,
> > + x86_cpu_MTRR = 1u * 64u + 12u,
> > + x86_cpu_PGE = 1u * 64u + 13u,
> > + x86_cpu_MCA = 1u * 64u + 14u,
> > + x86_cpu_CMOV = 1u * 64u + 15u,
> > + x86_cpu_PAT = 1u * 64u + 16u,
> > + x86_cpu_PSE_36 = 1u * 64u + 17u,
> > + x86_cpu_PSN = 1u * 64u + 18u,
> > + x86_cpu_CLFSH = 1u * 64u + 19u,
> > + x86_cpu_INDEX_1_EDX_20 = 1u * 64u + 20u,
> > + x86_cpu_DS = 1u * 64u + 21u,
> > + x86_cpu_ACPI = 1u * 64u + 22u,
> > + x86_cpu_MMX = 1u * 64u + 23u,
> > + x86_cpu_FXSR = 1u * 64u + 24u,
> > + x86_cpu_SSE = 1u * 64u + 25u,
> > + x86_cpu_SSE2 = 1u * 64u + 26u,
> > + x86_cpu_SS = 1u * 64u + 27u,
> > + x86_cpu_HTT = 1u * 64u + 28u,
> > + x86_cpu_TM = 1u * 64u + 29u,
> > + x86_cpu_INDEX_1_EDX_30 = 1u * 64u + 30u,
> > + x86_cpu_PBE = 1u * 64u + 31u,
> > +
> > + /* CPUID.07H.0H:EBX. */
> > + x86_cpu_FSGSBASE = 2u * 64u + 0u,
> > + x86_cpu_TSC_ADJUST = 2u * 64u + 1u,
> > + x86_cpu_SGX = 2u * 64u + 2u,
> > + x86_cpu_BMI1 = 2u * 64u + 3u,
> > + x86_cpu_HLE = 2u * 64u + 4u,
> > + x86_cpu_AVX2 = 2u * 64u + 5u,
> > + x86_cpu_INDEX_7_EBX_6 = 2u * 64u + 6u,
> > + x86_cpu_SMEP = 2u * 64u + 7u,
> > + x86_cpu_BMI2 = 2u * 64u + 8u,
> > + x86_cpu_ERMS = 2u * 64u + 9u,
> > + x86_cpu_INVPCID = 2u * 64u + 10u,
> > + x86_cpu_RTM = 2u * 64u + 11u,
> > + x86_cpu_RDT_M = 2u * 64u + 12u,
> > + x86_cpu_DEPR_FPU_CS_DS = 2u * 64u + 13u,
> > + x86_cpu_MPX = 2u * 64u + 14u,
> > + x86_cpu_RDT_A = 2u * 64u + 15u,
> > + x86_cpu_AVX512F = 2u * 64u + 16u,
> > + x86_cpu_AVX512DQ = 2u * 64u + 17u,
> > + x86_cpu_RDSEED = 2u * 64u + 18u,
> > + x86_cpu_ADX = 2u * 64u + 19u,
> > + x86_cpu_SMAP = 2u * 64u + 20u,
> > + x86_cpu_AVX512_IFMA = 2u * 64u + 21u,
> > + x86_cpu_INDEX_7_EBX_22 = 2u * 64u + 22u,
> > + x86_cpu_CLFLUSHOPT = 2u * 64u + 23u,
> > + x86_cpu_CLWB = 2u * 64u + 24u,
> > + x86_cpu_TRACE = 2u * 64u + 25u,
> > + x86_cpu_AVX512PF = 2u * 64u + 26u,
> > + x86_cpu_AVX512ER = 2u * 64u + 27u,
> > + x86_cpu_AVX512CD = 2u * 64u + 28u,
> > + x86_cpu_SHA = 2u * 64u + 29u,
> > + x86_cpu_AVX512BW = 2u * 64u + 30u,
> > + x86_cpu_AVX512VL = 2u * 64u + 31u,
> > +
> > + /* CPUID.07H.0H:ECX. */
> > + x86_cpu_PREFETCHWT1 = 3u * 64u + 0u,
> > + x86_cpu_AVX512_VBMI = 3u * 64u + 1u,
> > + x86_cpu_UMIP = 3u * 64u + 2u,
> > + x86_cpu_PKU = 3u * 64u + 3u,
> > + x86_cpu_OSPKE = 3u * 64u + 4u,
> > + x86_cpu_WAITPKG = 3u * 64u + 5u,
> > + x86_cpu_AVX512_VBMI2 = 3u * 64u + 6u,
> > + x86_cpu_SHSTK = 3u * 64u + 7u,
> > + x86_cpu_GFNI = 3u * 64u + 8u,
> > + x86_cpu_VAES = 3u * 64u + 9u,
> > + x86_cpu_VPCLMULQDQ = 3u * 64u + 10u,
> > + x86_cpu_AVX512_VNNI = 3u * 64u + 11u,
> > + x86_cpu_AVX512_BITALG = 3u * 64u + 12u,
> > + x86_cpu_INDEX_7_ECX_13 = 3u * 64u + 13u,
> > + x86_cpu_AVX512_VPOPCNTDQ = 3u * 64u + 14u,
> > + x86_cpu_INDEX_7_ECX_15 = 3u * 64u + 15u,
> > + x86_cpu_INDEX_7_ECX_16 = 3u * 64u + 16u,
> > + /* Note: Bits 17-21: The value of MAWAU used by the BNDLDX and
> > + BNDSTX instructions in 64-bit mode. */
> > + x86_cpu_RDPID = 3u * 64u + 22u,
> > + x86_cpu_KL = 3u * 64u + 23u,
> > + x86_cpu_INDEX_7_ECX_24 = 3u * 64u + 24u,
> > + x86_cpu_CLDEMOTE = 3u * 64u + 25u,
> > + x86_cpu_INDEX_7_ECX_26 = 3u * 64u + 26u,
> > + x86_cpu_MOVDIRI = 3u * 64u + 27u,
> > + x86_cpu_MOVDIR64B = 3u * 64u + 28u,
> > + x86_cpu_ENQCMD = 3u * 64u + 29u,
> > + x86_cpu_SGX_LC = 3u * 64u + 30u,
> > + x86_cpu_PKS = 3u * 64u + 31u,
> > +
> > + /* CPUID.07H.0H:EDX. */
> > + x86_cpu_INDEX_7_EDX_0 = 4u * 64u + 0u,
> > + x86_cpu_INDEX_7_EDX_1 = 4u * 64u + 1u,
> > + x86_cpu_AVX512_4VNNIW = 4u * 64u + 2u,
> > + x86_cpu_AVX512_4FMAPS = 4u * 64u + 3u,
> > + x86_cpu_FSRM = 4u * 64u + 4u,
> > + x86_cpu_UINTR = 4u * 64u + 5u,
> > + x86_cpu_INDEX_7_EDX_6 = 4u * 64u + 6u,
> > + x86_cpu_INDEX_7_EDX_7 = 4u * 64u + 7u,
> > + x86_cpu_AVX512_VP2INTERSECT = 4u * 64u + 8u,
> > + x86_cpu_INDEX_7_EDX_9 = 4u * 64u + 9u,
> > + x86_cpu_MD_CLEAR = 4u * 64u + 10u,
> > + x86_cpu_INDEX_7_EDX_11 = 4u * 64u + 11u,
> > + x86_cpu_INDEX_7_EDX_12 = 4u * 64u + 12u,
> > + x86_cpu_INDEX_7_EDX_13 = 4u * 64u + 13u,
> > + x86_cpu_SERIALIZE = 4u * 64u + 14u,
> > + x86_cpu_HYBRID = 4u * 64u + 15u,
> > + x86_cpu_TSXLDTRK = 4u * 64u + 16u,
> > + x86_cpu_INDEX_7_EDX_17 = 4u * 64u + 17u,
> > + x86_cpu_PCONFIG = 4u * 64u + 18u,
> > + x86_cpu_INDEX_7_EDX_19 = 4u * 64u + 19u,
> > + x86_cpu_IBT = 4u * 64u + 20u,
> > + x86_cpu_INDEX_7_EDX_21 = 4u * 64u + 21u,
> > + x86_cpu_AMX_BF16 = 4u * 64u + 22u,
> > + x86_cpu_AVX512_FP16 = 4u * 64u + 23u,
> > + x86_cpu_AMX_TILE = 4u * 64u + 24u,
> > + x86_cpu_AMX_INT8 = 4u * 64u + 25u,
> > + x86_cpu_IBRS_IBPB = 4u * 64u + 26u,
> > + x86_cpu_STIBP = 4u * 64u + 27u,
> > + x86_cpu_L1D_FLUSH = 4u * 64u + 28u,
> > + x86_cpu_ARCH_CAPABILITIES = 4u * 64u + 29u,
> > + x86_cpu_CORE_CAPABILITIES = 4u * 64u + 30u,
> > + x86_cpu_SSBD = 4u * 64u + 31u,
> > +
> > + /* CPUID.80000001H:ECX. */
> > + x86_cpu_LAHF64_SAHF64 = 5u * 64u + 0u,
> > + x86_cpu_SVM = 5u * 64u + 2u,
> > + x86_cpu_LZCNT = 5u * 64u + 5u,
> > + x86_cpu_SSE4A = 5u * 64u + 6u,
> > + x86_cpu_PREFETCHW = 5u * 64u + 8u,
> > + x86_cpu_XOP = 5u * 64u + 11u,
> > + x86_cpu_LWP = 5u * 64u + 15u,
> > + x86_cpu_FMA4 = 5u * 64u + 16u,
> > + x86_cpu_TBM = 5u * 64u + 21u,
> > +
> > + /* CPUID.80000001H:EDX. */
> > + x86_cpu_SYSCALL_SYSRET = 6u * 64u + 11u,
> > + x86_cpu_NX = 6u * 64u + 20u,
> > + x86_cpu_PAGE1GB = 6u * 64u + 26u,
> > + x86_cpu_RDTSCP = 6u * 64u + 27u,
> > + x86_cpu_LM = 6u * 64u + 29u,
> > +
> > + /* CPUID.(EAX=0DH,ECX=1):EAX. */
> > + x86_cpu_XSAVEOPT = 7u * 64u + 0u,
> > + x86_cpu_XSAVEC = 7u * 64u + 1u,
> > + x86_cpu_XGETBV_ECX_1 = 7u * 64u + 2u,
> > + x86_cpu_XSAVES = 7u * 64u + 3u,
> > + x86_cpu_XFD = 7u * 64u + 4u,
> > +
> > + /* CPUID.80000007H:EDX. */
> > + x86_cpu_INVARIANT_TSC = 8u * 64u + 8u,
> > +
> > + /* CPUID.80000008H:EBX. */
> > + x86_cpu_WBNOINVD = 9u * 64u + 9u,
> > +
> > + /* CPUID.(EAX=07H,ECX=1):EAX. */
> > + x86_cpu_AVX_VNNI = 10u * 64u + 4u,
> > + x86_cpu_AVX512_BF16 = 10u * 64u + 5u,
> > + x86_cpu_FZLRM = 10u * 64u + 10u,
> > + x86_cpu_FSRS = 10u * 64u + 11u,
> > + x86_cpu_FSRCS = 10u * 64u + 12u,
> > + x86_cpu_HRESET = 10u * 64u + 22u,
> > + x86_cpu_LAM = 10u * 64u + 26u,
> > +
> > + /* CPUID.19H:EBX. */
> > + x86_cpu_AESKLE = 11u * 64u + 0u,
> > + x86_cpu_WIDE_KL = 11u * 64u + 2u,
> > +};
>
> Can we change
>
> struct cpuid_registers
> {
> unsigned int eax;
> unsigned int ebx;
> unsigned int ecx;
> unsigned int edx;
> };
>
> to an array
>
> struct cpuid_registers
> {
> unsigned int cpuid_registers[4];
> } cpuid;
>
> define x86_cpu_AVX_XXX to the bit position into the cpuid_registers array?
>
> > diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
> > index 0aec0e2875..4a68072f3d 100644
> > --- a/sysdeps/x86/cacheinfo.h
> > +++ b/sysdeps/x86/cacheinfo.h
> > @@ -91,7 +91,7 @@ get_common_cache_info (long int *shared_ptr, unsigned int *threads_ptr,
> >
> > /* A value of 0 for the HTT bit indicates there is only a single
> > logical processor. */
> > - if (HAS_CPU_FEATURE (HTT))
> > + if (x86_cpu_has_feature (x86_cpu_HTT))
> > {
> > /* Figure out the number of logical threads that share the
> > highest cache level. */
> > @@ -236,12 +236,12 @@ get_common_cache_info (long int *shared_ptr, unsigned int *threads_ptr,
> > }
> > else
> > {
> > -intel_bug_no_cache_info:
> > +intel_bug_no_cache_info:;
>
> Is this change needed?
>
> > /* Assume that all logical threads share the highest cache
> > level. */
> > - threads
> > - = ((cpu_features->features[COMMON_CPUID_INDEX_1].cpuid.ebx
> > - >> 16) & 0xff);
> > + unsigned int eax, ebx, ecx, edx;
> > + __cpuid (1, eax, ebx, ecx, edx);
>
> We can avoid __cpuid.
>
> > + threads = (ebx >> 16) & 0xff;
> > }
> >
> > /* Cap usage of highest cache level to the number of supported
> > @@ -401,7 +401,7 @@ init_cacheinfo (void)
> > unsigned int minimum_rep_movsb_threshold;
> > /* NB: The default REP MOVSB threshold is 2048 * (VEC_SIZE / 16). */
> > unsigned int rep_movsb_threshold;
> > - if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
> > + if (x86_cpu_is_usable (x86_cpu_AVX512F)
> > && !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512))
> > {
> > rep_movsb_threshold = 2048 * (64 / 16);
> > diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
> > index fe080b63b2..f9cf6bbfba 100644
> > --- a/sysdeps/x86/cpu-features.c
> > +++ b/sysdeps/x86/cpu-features.c
> > @@ -46,65 +46,80 @@ extern void TUNABLE_CALLBACK (set_x86_shstk) (tunable_val_t *)
> > # include <dl-cet.h>
> > #endif
> >
> > +/* Copy one indexed bit from the features word to the usable word. */
> > +static inline void
> > +copy_usable_1 (unsigned int index)
> > +{
> > + __x86_cpu_array_private[(index >> 5) + 1]
> > + |= __x86_cpu_array_private[index >> 5] & (1u << (index % 32));
> > +}
> > +
> > +/* Mark one feature as usable. */
> > +static inline void
> > +set_usable_1 (unsigned int index)
> > +{
> > + __x86_cpu_array_private[(index >> 5) + 1] |= 1u << (index % 32);
> > +}
> > +
>
> We can do
>
> static inline _Bool
> x86_cpu_is_usable (unsigned int __index)
> {
> unsigned int index1 = __index / (8 * sizeof (struct cpuid_registers));
> unsigned int index2 = __index & ((8 * sizeof (struct cpuid_registers)) - 1);
> unsigned int shift = index2 & ((8 * sizeof (unsigned int)) - 1);
> index2 = index2 / (8 * sizeof (unsigned int));
>
> return usable[index1][index2] & (1 << shift);
> }
>
Please take a look at users/hjl/cpuid/master branch:
https://gitlab.com/x86-glibc/glibc/-/commits/users/hjl/cpuid/master
with
#define CPU_FEATURE_INDEX(name) \
(index_cpu_##name * 8 * sizeof (unsigned int) * 4 \
+ reg_##name * 8 * sizeof (unsigned int) + bit_cpu_##name)
to compute the feature index. Of course, you can expand it explicitly
for each feature.
--
H.J.
More information about the Libc-alpha
mailing list