Re: [PATCH v2 16/28] arm64/sve: Probe SVE capabilities and usable vector lengths


Dave Martin <Dave.Martin@arm.com> writes:

> This patch uses the cpufeatures framework to determine common SVE
> capabilities and vector lengths, and configures the runtime SVE
> support code appropriately.
>
> ZCR_ELx is not really a feature register, but it is convenient to
> use it as a template for recording the maximum vector length
> supported by a CPU, using the LEN field.  This field is similar to
> a feature field in that it is a contiguous bitfield for which we
> want to determine the minimum system-wide value.  This patch adds
> ZCR as a pseudo-register in cpuinfo/cpufeatures, with appropriate
> custom code to populate it.  Finding the minimum supported value of
> the LEN field is left to the cpufeatures framework in the usual
> way.
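
Just to check my own understanding of the LEN/VQ/VL relationship the
pseudo-register relies on (a sketch only, not a suggested change;
sve_vq_from_vl() and friends are this series' helpers, the arithmetic
below is just how I read them):

  /*
   * VQ counts 128-bit quadwords per vector, so VL in bytes is
   * 16 * VQ, and ZCR_ELx.LEN encodes VQ - 1.
   */
  unsigned int vq = (zcr & ZCR_ELx_LEN_MASK) + 1;  /* LEN 0 -> VQ 1     */
  unsigned int vl = vq * 16;                       /* VQ 1 -> 16 bytes  */

so, if I've got that right, taking the system-wide minimum of LEN via
cpufeatures gives the largest VL that every CPU can support.
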
>
> The meaning of ID_AA64ZFR0_EL1 is not architecturally defined yet,
> so for now we just require it to be zero.
>
> Note that much of this code is dormant and SVE still won't be used
> yet, since system_supports_sve() remains hardwired to false.
>
> Signed-off-by: Dave Martin <Dave.Martin@arm.com>
> Cc: Alex Bennée <alex.bennee@linaro.org>
> Cc: Suzuki K Poulose <Suzuki.Poulose@arm.com>
>
> ---
>
> Changes since v1
> ----------------
>
> Requested by Alex Bennée:
>
> * Thin out BUG_ON()s:
> Redundant BUG_ON()s and ones that just check invariants are removed.
> Important sanity-checks are migrated to WARN_ON()s, with some
> minimal best-effort patch-up code.
>
> Other changes related to Alex Bennée's comments:
>
> * Migrate away from magic numbers for converting VL to VQ.
>
> Requested by Suzuki Poulose:
>
> * Make sve_vq_map __ro_after_init.
>
> Other changes related to Suzuki Poulose's comments:
>
> * Rely on cpufeatures for not attempting to update the vq map after boot.
> ---
>  arch/arm64/include/asm/cpu.h        |   4 ++
>  arch/arm64/include/asm/cpufeature.h |  29 ++++++++++
>  arch/arm64/include/asm/fpsimd.h     |  10 ++++
>  arch/arm64/kernel/cpufeature.c      |  50 +++++++++++++++++
>  arch/arm64/kernel/cpuinfo.c         |   6 ++
>  arch/arm64/kernel/fpsimd.c          | 106 +++++++++++++++++++++++++++++++++++-
>  6 files changed, 202 insertions(+), 3 deletions(-)
>
> diff --git a/arch/arm64/include/asm/cpu.h b/arch/arm64/include/asm/cpu.h
> index 889226b..8839227 100644
> --- a/arch/arm64/include/asm/cpu.h
> +++ b/arch/arm64/include/asm/cpu.h
> @@ -41,6 +41,7 @@ struct cpuinfo_arm64 {
>  	u64		reg_id_aa64mmfr2;
>  	u64		reg_id_aa64pfr0;
>  	u64		reg_id_aa64pfr1;
> +	u64		reg_id_aa64zfr0;
>
>  	u32		reg_id_dfr0;
>  	u32		reg_id_isar0;
> @@ -59,6 +60,9 @@ struct cpuinfo_arm64 {
>  	u32		reg_mvfr0;
>  	u32		reg_mvfr1;
>  	u32		reg_mvfr2;
> +
> +	/* pseudo-ZCR for recording maximum ZCR_EL1 LEN value: */
> +	u64		reg_zcr;
>  };
>
>  DECLARE_PER_CPU(struct cpuinfo_arm64, cpu_data);
> diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
> index 4ea3441..d98e7ba 100644
> --- a/arch/arm64/include/asm/cpufeature.h
> +++ b/arch/arm64/include/asm/cpufeature.h
> @@ -10,7 +10,9 @@
>  #define __ASM_CPUFEATURE_H
>
>  #include <asm/cpucaps.h>
> +#include <asm/fpsimd.h>
>  #include <asm/hwcap.h>
> +#include <asm/sigcontext.h>
>  #include <asm/sysreg.h>
>
>  /*
> @@ -223,6 +225,13 @@ static inline bool id_aa64pfr0_32bit_el0(u64 pfr0)
>  	return val == ID_AA64PFR0_EL0_32BIT_64BIT;
>  }
>
> +static inline bool id_aa64pfr0_sve(u64 pfr0)
> +{
> +	u32 val = cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_SVE_SHIFT);
> +
> +	return val > 0;
> +}
> +
>  void __init setup_cpu_features(void);
>
>  void update_cpu_capabilities(const struct arm64_cpu_capabilities *caps,
> @@ -267,6 +276,26 @@ static inline bool system_supports_sve(void)
>  	return false;
>  }
>
> +/*
> + * Read the pseudo-ZCR used by cpufeatures to identify the supported SVE
> + * vector length.
> + * Use only if SVE is present.  This function clobbers the SVE vector length.
> + */

Nit: whitespace formatting.

> +static u64 __maybe_unused read_zcr_features(void)
> +{
> +	u64 zcr;
> +	unsigned int vq_max;
> +
> +	write_sysreg_s(ZCR_ELx_LEN_MASK, SYS_ZCR_EL1);

I'm confused: why are we writing something here? You mention clobbering
the SVE vector length, but what is the point of the write?
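
My guess is that writing all-ones to LEN is asking the CPU for the
largest vector length it implements, so that the sve_get_vl() below
reports the hardware maximum, i.e. (my reading only, happy to be
corrected):

  /* request the maximum VL; the CPU clamps LEN to what it supports */
  write_sysreg_s(ZCR_ELx_LEN_MASK, SYS_ZCR_EL1);
  /* ... and sve_get_vl() then tells us which VL we actually got */
  vq_max = sve_vq_from_vl(sve_get_vl());

If that is the intent, a comment in the function saying so would help.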

> +
> +	zcr = read_sysreg_s(SYS_ZCR_EL1);
> +	zcr &= ~(u64)ZCR_ELx_LEN_MASK;
> +	vq_max = sve_vq_from_vl(sve_get_vl());
> +	zcr |= vq_max - 1;
> +
> +	return zcr;
> +}
> +
>  #endif /* __ASSEMBLY__ */
>
>  #endif
> diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h
> index 32c8e19..6c22624 100644
> --- a/arch/arm64/include/asm/fpsimd.h
> +++ b/arch/arm64/include/asm/fpsimd.h
> @@ -92,12 +92,22 @@ extern void fpsimd_dup_sve(struct task_struct *dst,
>  extern int sve_set_vector_length(struct task_struct *task,
>  				 unsigned long vl, unsigned long flags);
>
> +extern void __init sve_init_vq_map(void);
> +extern void sve_update_vq_map(void);
> +extern int sve_verify_vq_map(void);
> +extern void __init sve_setup(void);
> +
>  #else /* ! CONFIG_ARM64_SVE */
>
>  static void __maybe_unused sve_alloc(struct task_struct *task) { }
>  static void __maybe_unused fpsimd_release_thread(struct task_struct *task) { }
>  static void __maybe_unused fpsimd_dup_sve(struct task_struct *dst,
>  					  struct task_struct const *src) { }
> +static void __maybe_unused sve_init_vq_map(void) { }
> +static void __maybe_unused sve_update_vq_map(void) { }
> +static int __maybe_unused sve_verify_vq_map(void) { return 0; }
> +static void __maybe_unused sve_setup(void) { }
> +
>  #endif /* ! CONFIG_ARM64_SVE */
>
>  /* For use by EFI runtime services calls only */
> diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
> index 43ba8df..c30bb6b 100644
> --- a/arch/arm64/kernel/cpufeature.c
> +++ b/arch/arm64/kernel/cpufeature.c
> @@ -27,6 +27,7 @@
>  #include <asm/cpu.h>
>  #include <asm/cpufeature.h>
>  #include <asm/cpu_ops.h>
> +#include <asm/fpsimd.h>
>  #include <asm/mmu_context.h>
>  #include <asm/processor.h>
>  #include <asm/sysreg.h>
> @@ -283,6 +284,12 @@ static const struct arm64_ftr_bits ftr_id_dfr0[] = {
>  	ARM64_FTR_END,
>  };
>
> +static const struct arm64_ftr_bits ftr_zcr[] = {
> +	ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE,
> +		ZCR_ELx_LEN_SHIFT, ZCR_ELx_LEN_SIZE, 0),	/* LEN */
> +	ARM64_FTR_END,
> +};
> +
>  /*
>   * Common ftr bits for a 32bit register with all hidden, strict
>   * attributes, with 4bit feature fields and a default safe value of
> @@ -349,6 +356,7 @@ static const struct __ftr_reg_entry {
>  	/* Op1 = 0, CRn = 0, CRm = 4 */
>  	ARM64_FTR_REG(SYS_ID_AA64PFR0_EL1, ftr_id_aa64pfr0),
>  	ARM64_FTR_REG(SYS_ID_AA64PFR1_EL1, ftr_raz),
> +	ARM64_FTR_REG(SYS_ID_AA64ZFR0_EL1, ftr_raz),
>
>  	/* Op1 = 0, CRn = 0, CRm = 5 */
>  	ARM64_FTR_REG(SYS_ID_AA64DFR0_EL1, ftr_id_aa64dfr0),
> @@ -363,6 +371,9 @@ static const struct __ftr_reg_entry {
>  	ARM64_FTR_REG(SYS_ID_AA64MMFR1_EL1, ftr_id_aa64mmfr1),
>  	ARM64_FTR_REG(SYS_ID_AA64MMFR2_EL1, ftr_id_aa64mmfr2),
>
> +	/* Op1 = 0, CRn = 1, CRm = 2 */
> +	ARM64_FTR_REG(SYS_ZCR_EL1, ftr_zcr),
> +
>  	/* Op1 = 3, CRn = 0, CRm = 0 */
>  	{ SYS_CTR_EL0, &arm64_ftr_reg_ctrel0 },
>  	ARM64_FTR_REG(SYS_DCZID_EL0, ftr_dczid),
> @@ -500,6 +511,7 @@ void __init init_cpu_features(struct cpuinfo_arm64 *info)
>  	init_cpu_ftr_reg(SYS_ID_AA64MMFR2_EL1, info->reg_id_aa64mmfr2);
>  	init_cpu_ftr_reg(SYS_ID_AA64PFR0_EL1, info->reg_id_aa64pfr0);
>  	init_cpu_ftr_reg(SYS_ID_AA64PFR1_EL1, info->reg_id_aa64pfr1);
> +	init_cpu_ftr_reg(SYS_ID_AA64ZFR0_EL1, info->reg_id_aa64zfr0);
>
>  	if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) {
>  		init_cpu_ftr_reg(SYS_ID_DFR0_EL1, info->reg_id_dfr0);
> @@ -520,6 +532,10 @@ void __init init_cpu_features(struct cpuinfo_arm64 *info)
>  		init_cpu_ftr_reg(SYS_MVFR2_EL1, info->reg_mvfr2);
>  	}
>
> +	if (id_aa64pfr0_sve(info->reg_id_aa64pfr0)) {
> +		init_cpu_ftr_reg(SYS_ZCR_EL1, info->reg_zcr);
> +		sve_init_vq_map();
> +	}
>  }
>
>  static void update_cpu_ftr_reg(struct arm64_ftr_reg *reg, u64 new)
> @@ -623,6 +639,9 @@ void update_cpu_features(int cpu,
>  	taint |= check_update_ftr_reg(SYS_ID_AA64PFR1_EL1, cpu,
>  				      info->reg_id_aa64pfr1, boot->reg_id_aa64pfr1);
>
> +	taint |= check_update_ftr_reg(SYS_ID_AA64ZFR0_EL1, cpu,
> +				      info->reg_id_aa64zfr0, boot->reg_id_aa64zfr0);
> +
>  	/*
>  	 * If we have AArch32, we care about 32-bit features for compat.
>  	 * If the system doesn't support AArch32, don't update them.
> @@ -670,6 +689,14 @@ void update_cpu_features(int cpu,
>  					info->reg_mvfr2, boot->reg_mvfr2);
>  	}
>
> +	if (id_aa64pfr0_sve(info->reg_id_aa64pfr0)) {
> +		taint |= check_update_ftr_reg(SYS_ZCR_EL1, cpu,
> +					info->reg_zcr, boot->reg_zcr);
> +
> +		if (!sys_caps_initialised)
> +			sve_update_vq_map();
> +	}
> +
>  	/*
>  	 * Mismatched CPU features are a recipe for disaster. Don't even
>  	 * pretend to support them.
> @@ -1097,6 +1124,23 @@ verify_local_cpu_features(const struct arm64_cpu_capabilities *caps)
>  	}
>  }
>
> +static void verify_sve_features(void)
> +{
> +	u64 safe_zcr = read_sanitised_ftr_reg(SYS_ZCR_EL1);
> +	u64 zcr = read_zcr_features();
> +
> +	unsigned int safe_len = safe_zcr & ZCR_ELx_LEN_MASK;
> +	unsigned int len = zcr & ZCR_ELx_LEN_MASK;
> +
> +	if (len < safe_len || sve_verify_vq_map()) {
> +		pr_crit("CPU%d: SVE: required vector length(s) missing\n",
> +			smp_processor_id());
> +		cpu_die_early();
> +	}
> +
> +	/* Add checks on other ZCR bits here if necessary */
> +}
> +
>  /*
>   * Run through the enabled system capabilities and enable() it on this CPU.
>   * The capabilities were decided based on the available CPUs at the boot time.
> @@ -1110,8 +1154,12 @@ static void verify_local_cpu_capabilities(void)
>  	verify_local_cpu_errata_workarounds();
>  	verify_local_cpu_features(arm64_features);
>  	verify_local_elf_hwcaps(arm64_elf_hwcaps);
> +
>  	if (system_supports_32bit_el0())
>  		verify_local_elf_hwcaps(compat_elf_hwcaps);
> +
> +	if (system_supports_sve())
> +		verify_sve_features();
>  }
>
>  void check_local_cpu_capabilities(void)
> @@ -1189,6 +1237,8 @@ void __init setup_cpu_features(void)
>  	if (system_supports_32bit_el0())
>  		setup_elf_hwcaps(compat_elf_hwcaps);
>
> +	sve_setup();
> +
>  	/* Advertise that we have computed the system capabilities */
>  	set_sys_caps_initialised();
>
> diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c
> index 3118859..be260e8 100644
> --- a/arch/arm64/kernel/cpuinfo.c
> +++ b/arch/arm64/kernel/cpuinfo.c
> @@ -19,6 +19,7 @@
>  #include <asm/cpu.h>
>  #include <asm/cputype.h>
>  #include <asm/cpufeature.h>
> +#include <asm/fpsimd.h>
>
>  #include <linux/bitops.h>
>  #include <linux/bug.h>
> @@ -326,6 +327,7 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info)
>  	info->reg_id_aa64mmfr2 = read_cpuid(ID_AA64MMFR2_EL1);
>  	info->reg_id_aa64pfr0 = read_cpuid(ID_AA64PFR0_EL1);
>  	info->reg_id_aa64pfr1 = read_cpuid(ID_AA64PFR1_EL1);
> +	info->reg_id_aa64zfr0 = read_cpuid(ID_AA64ZFR0_EL1);
>
>  	/* Update the 32bit ID registers only if AArch32 is implemented */
>  	if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) {
> @@ -348,6 +350,10 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info)
>  		info->reg_mvfr2 = read_cpuid(MVFR2_EL1);
>  	}
>
> +	if (IS_ENABLED(CONFIG_ARM64_SVE) &&
> +	    id_aa64pfr0_sve(info->reg_id_aa64pfr0))
> +		info->reg_zcr = read_zcr_features();
> +
>  	cpuinfo_detect_icache_policy(info);
>  }
>
> diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
> index 713476e..cea05a7 100644
> --- a/arch/arm64/kernel/fpsimd.c
> +++ b/arch/arm64/kernel/fpsimd.c
> @@ -110,19 +110,19 @@
>  static DEFINE_PER_CPU(struct fpsimd_state *, fpsimd_last_state);
>
>  /* Default VL for tasks that don't set it explicitly: */
> -static int sve_default_vl = SVE_VL_MIN;
> +static int sve_default_vl = -1;
>
>  #ifdef CONFIG_ARM64_SVE
>
>  /* Maximum supported vector length across all CPUs (initially poisoned) */
>  int __ro_after_init sve_max_vl = -1;
>  /* Set of available vector lengths, as vq_to_bit(vq): */
> -static DECLARE_BITMAP(sve_vq_map, SVE_VQ_MAX);
> +static __ro_after_init DECLARE_BITMAP(sve_vq_map, SVE_VQ_MAX);
>
>  #else /* ! CONFIG_ARM64_SVE */
>
>  /* Dummy declaration for code that will be optimised out: */
> -extern DECLARE_BITMAP(sve_vq_map, SVE_VQ_MAX);
> +extern __ro_after_init DECLARE_BITMAP(sve_vq_map, SVE_VQ_MAX);
>
>  #endif /* ! CONFIG_ARM64_SVE */
>
> @@ -387,6 +387,103 @@ int sve_set_vector_length(struct task_struct *task,
>  	return 0;
>  }
>
> +static unsigned long *sve_alloc_vq_map(void)
> +{
> +	return kzalloc(BITS_TO_LONGS(SVE_VQ_MAX) * sizeof(unsigned long),
> +		       GFP_KERNEL);
> +}
> +
> +static void sve_probe_vqs(DECLARE_BITMAP(map, SVE_VQ_MAX))
> +{
> +	unsigned int vq, vl;
> +	unsigned long zcr;
> +
> +	zcr = ZCR_ELx_LEN_MASK;
> +	zcr = read_sysreg_s(SYS_ZCR_EL1) & ~zcr;
> +
> +	for (vq = SVE_VQ_MAX; vq >= SVE_VQ_MIN; --vq) {
> +		write_sysreg_s(zcr | (vq - 1), SYS_ZCR_EL1); /* self-syncing */
> +		vl = sve_get_vl();
> +		vq = sve_vq_from_vl(vl); /* skip intervening lengths */
> +		set_bit(vq_to_bit(vq), map);
> +	}
> +}
> +
> +void __init sve_init_vq_map(void)
> +{
> +	sve_probe_vqs(sve_vq_map);
> +}
> +
> +/*
> + * If we haven't committed to the set of supported VQs yet, filter out
> + * those not supported by the current CPU.
> + */
> +void sve_update_vq_map(void)
> +{
> +	unsigned long *map;
> +
> +	map = sve_alloc_vq_map();
> +	sve_probe_vqs(map);
> +	bitmap_and(sve_vq_map, sve_vq_map, map, SVE_VQ_MAX);
> +	kfree(map);
> +}
> +
> +/* Check whether the current CPU supports all VQs in the committed set */
> +int sve_verify_vq_map(void)
> +{
> +	int ret = 0;
> +	unsigned long *map = sve_alloc_vq_map();
> +
> +	sve_probe_vqs(map);
> +	bitmap_andnot(map, sve_vq_map, map, SVE_VQ_MAX);
> +	if (!bitmap_empty(map, SVE_VQ_MAX)) {
> +		pr_warn("SVE: cpu%d: Required vector length(s) missing\n",
> +			smp_processor_id());
> +		ret = -EINVAL;
> +	}
> +
> +	kfree(map);
> +
> +	return ret;
> +}
> +
> +void __init sve_setup(void)
> +{
> +	u64 zcr;
> +
> +	if (!system_supports_sve())
> +		return;
> +
> +	/*
> +	 * The SVE architecture mandates support for 128-bit vectors,
> +	 * so sve_vq_map must have at least SVE_VQ_MIN set.
> +	 * If something went wrong, at least try to patch it up:
> +	 */
> +	if (WARN_ON(!test_bit(vq_to_bit(SVE_VQ_MIN), sve_vq_map)))
> +		set_bit(vq_to_bit(SVE_VQ_MIN), sve_vq_map);
> +
> +	zcr = read_sanitised_ftr_reg(SYS_ZCR_EL1);
> +	sve_max_vl = sve_vl_from_vq((zcr & ZCR_ELx_LEN_MASK) + 1);
> +
> +	/*
> +	 * Sanity-check that the max VL we determined through CPU features
> +	 * corresponds properly to sve_vq_map.  If not, do our best:
> +	 */
> +	if (WARN_ON(sve_max_vl != find_supported_vector_length(sve_max_vl)))
> +		sve_max_vl = find_supported_vector_length(sve_max_vl);
> +
> +	/*
> +	 * For the default VL, pick the maximum supported value <= 64.
> +	 * VL == 64 is guaranteed not to grow the signal frame.
> +	 */
> +	sve_default_vl = find_supported_vector_length(64);
> +
> +	pr_info("SVE: maximum available vector length %u bytes per vector\n",
> +		sve_max_vl);
> +	pr_info("SVE: default vector length %u bytes per vector\n",
> +		sve_default_vl);
> +}
> +
>  void fpsimd_release_thread(struct task_struct *dead_task)
>  {
>  	sve_free(dead_task);
> @@ -502,6 +599,9 @@ void fpsimd_flush_thread(void)
>  		 * This is where we ensure that all user tasks have a valid
>  		 * vector length configured: no kernel task can become a user
>  		 * task without an exec and hence a call to this function.
> +		 * By the time the first call to this function is made, all
> +		 * early hardware probing is complete, so sve_default_vl
> +		 * should be valid.
>  		 * If a bug causes this to go wrong, we make some noise and
>  		 * try to fudge thread.sve_vl to a safe value here.
>  		 */


Otherwise:

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>

--
Alex Bennée

