[PATCH v1 0/5][Binutils] aarch64: Add support for sme2 and sve2 BFloat16 feature.

Fri Aug 16 18:09:13 GMT 2024

On Thu, Jul 18, 2024 at 11:27:49AM +0100, Srinath Parvathaneni wrote:
> Hi,
> 
> In the current assembler, SVE2 Bfloat16 instructions are implemented with tick
> FEAT_B16B16 and command line flag "+b16b16" and this feature was suspended
> due to incomplete support.
> 
> In the new spec available here(SVE[1], SME[2]), FEAT_B16B16 is replaced with
> FEAT_SVE_B16B16 and FEAT_SME_B16B16 and command line flag "+b16b16" is replaced 
> with "+sve-b16b16" and "+sme-b16b16".
> 
> More details about the Bflaot16 are provided below:
> The Bfloat16 feature in sve2 and sme2 is divided into 3 combinations.
> * SVE Z-targeting non-widening BFloat16 instructions under FEAT_SVE_B16B16
>   implemented with command line flag "+sve-b16b16+sve2".
> * SME Z-targeting multi-vector non-widening BFloat16 instructions under
>   FEAT_SVE_B16B16 implemented with command line flag "+sve-b16b16+sme2".
> * SME ZA-targeting non-widening BFloat16 instructions under FEAT_SME_B16B16
>   implemented with command line flag "+sme-b16b16".
> 
> This following 5 patch series add support for above combinations and instructions:
> Srinath Parvathaneni (5):
>   aarch64: Add support for FEAT_SVE_B16B16 feature.
>   aarch64: Add support for FEAT_SVE_B16B16 min and max instructions.
>   aarch64: Add support for FEAT_SVE_B16B16 min and max instructions (autogenerated files).
>   aarch64: Add support for FEAT_SME_B16B16 feature.
>   aarch64: Add support for FEAT_SME_B16B16 feature (autogenerated files).
> 
> SVE[1]: https://developer.arm.com/documentation/ddi0602/2024-06/SVE-Instructions?lang=en
> SME[2]: https://developer.arm.com/documentation/ddi0602/2024-06/SME-Instructions?lang=en
> 
> Regression testing for aarch64-none-elf target and found no regressions.
> 
> Ok for binutils-master?
> 
> Regards,
> Srinath.

I only have comments on feature dependencies for now; I haven't checked the
rest of the patch series yet.

I've compiled an extract of the squashed patch series below, since I think that
will make my comments clearer.

> diff --git a/gas/config/tc-aarch64.c b/gas/config/tc-aarch64.c
> index 929ee3d0245..67bde6da131 100644
> --- a/gas/config/tc-aarch64.c
> +++ b/gas/config/tc-aarch64.c
> @@ -10706,9 +10706,7 @@ static const struct aarch64_option_cpu_value_table aarch64_features[] = {
>    {"rasv2",            AARCH64_FEATURE (RASv2), AARCH64_FEATURE (RAS)},
>    {"ite",              AARCH64_FEATURE (ITE), AARCH64_NO_FEATURES},
>    {"d128",             AARCH64_FEATURE (D128), D128_FEATURE_DEPS},
> -  // Feature b16b16 is currently incomplete.
> -  // TODO: finish implementation and enable relevant flags.
> -  //{"b16b16",         AARCH64_FEATURE (B16B16), AARCH64_FEATURE (SVE2)},
> +  {"sve-b16b16",       AARCH64_FEATURE (SVE_B16B16), AARCH64_NO_FEATURES},
>    {"sme2p1",           AARCH64_FEATURE (SME2p1), AARCH64_FEATURE (SME2)},
>    {"sve2p1",           AARCH64_FEATURE (SVE2p1), AARCH64_FEATURE (SVE2)},
>    {"rcpc3",            AARCH64_FEATURE (RCPC3), AARCH64_FEATURE (RCPC2)},
> @@ -10730,6 +10730,8 @@ static const struct aarch64_option_cpu_value_table aarch64_features[] = {
>    {"sme-f8f16",                AARCH64_FEATURE (SME_F8F16),
>                         AARCH64_FEATURE (SME_F8F32)},
>    {"sme-f16f16",       AARCH64_FEATURE (SME_F16F16), AARCH64_FEATURE (SME2)},
> +  {"sme-b16b16",       AARCH64_FEATURES (3, SME_B16B16, SVE_B16B16, SME2),
> +                       AARCH64_NO_FEATURES},

This entry is incorrect. It should list the feature bit and the dependencies separately - i.e.:
 {"sme-b16b16",       AARCH64_FEATURE (SME_B16B16), AARCH64_FEATURES (2, SVE_B16B16, SME2)},

>    {NULL,               AARCH64_NO_FEATURES, AARCH64_NO_FEATURES},
>  };
> 
> @@ -10752,6 +10752,9 @@ static const struct aarch64_virtual_dependency_table aarch64_dependencies[] = {
>    {AARCH64_FEATURE (SSVE_FP8DOT2), AARCH64_FEATURE (FP8DOT2_SVE)},
>    {AARCH64_FEATURE (SME_F16F16), AARCH64_FEATURE (SME_F16F16_F8F16)},
>    {AARCH64_FEATURE (SME_F8F16), AARCH64_FEATURE (SME_F16F16_F8F16)},
> +  {AARCH64_FEATURES (2, SVE_B16B16, SVE2), AARCH64_FEATURE (SVE_SVE2_B16B16)},
> +  {AARCH64_FEATURES (2, SVE_B16B16, SME2), AARCH64_FEATURES (2, SVE_SME2_B16B16,
> +                                                            SVE_SVE2_B16B16)},

It isn't necessary to use virtual feature bits and dependencies for
SVE_SVE2_B16B16 because there is only one enabling condition.

It isn't necessary to use virtual features for SVE_B16B16 either, since we
currently assume in Binutils that SME2 depends upon SVE2.  (If we were to
remove this dependency, then it would be better to introduce and "SVE2 or SME2"
feature bit instead, and then that single bit could be used for other features
as well).

>  };
> diff --git a/include/opcode/aarch64.h b/include/opcode/aarch64.h
> index e2e4c6d0a09..e5fe9ea3c88 100644
> --- a/include/opcode/aarch64.h
> +++ b/include/opcode/aarch64.h
> @@ -226,8 +226,6 @@ enum aarch64_feature_bit {
>    AARCH64_FEATURE_SPMU2,
>    /* Performance Monitors Synchronous-Exception-Based Event Extension.  */
>    AARCH64_FEATURE_SEBEP,
> -  /* SVE2.1 and SME2.1 non-widening BFloat16 instructions.  */
> -  AARCH64_FEATURE_B16B16,
>    /* SME2.1 instructions.  */
>    AARCH64_FEATURE_SME2p1,
>    /* SVE2.1 instructions.  */
> @@ -266,6 +264,10 @@ enum aarch64_feature_bit {
>    AARCH64_FEATURE_SME_F8F16,
>    /* Non-widening half-precision FP16 to FP16 arithmetic for SME2.  */
>    AARCH64_FEATURE_SME_F16F16,
> +  /* SVE2 non-widening BFloat16 instructions.  */
> +  AARCH64_FEATURE_SVE_B16B16,
> +  /* SME2 non-widening BFloat16 instructions.  */
> +  AARCH64_FEATURE_SME_B16B16,
> 
>    /* Virtual features.  These are used to gate instructions that are enabled
>       by either of two (or more) sets of command line flags.  */
> @@ -279,6 +281,10 @@ enum aarch64_feature_bit {
>    AARCH64_FEATURE_SME_F16F16_F8F16,
>    /* Armv9.5-A processors.  */
>    AARCH64_FEATURE_V9_5A,
> +  /* +sve-b16b16+sve2.  */
> +  AARCH64_FEATURE_SVE_SVE2_B16B16,
> +  /* +sve-b16b16+sme2.  */
> +  AARCH64_FEATURE_SVE_SME2_B16B16,
>    AARCH64_NUM_FEATURES
>  };
> diff --git a/opcodes/aarch64-tbl.h b/opcodes/aarch64-tbl.h
> index 149571994ff..935dcaedd15 100644
> --- a/opcodes/aarch64-tbl.h
> +++ b/opcodes/aarch64-tbl.h
> @@ -2819,8 +2819,12 @@ static const aarch64_feature_set aarch64_feature_the =
>    AARCH64_FEATURE (THE);
>  static const aarch64_feature_set aarch64_feature_d128_the =
>    AARCH64_FEATURES (2, D128, THE);
> -static const aarch64_feature_set aarch64_feature_b16b16_sve2 =
> -  AARCH64_FEATURES (2, B16B16, SVE2);
> +static const aarch64_feature_set aarch64_feature_sve_sve2_b16b16 =
> +  AARCH64_FEATURES (3, SVE_B16B16, SVE2, SVE_SVE2_B16B16);

You could reduce the diff in the opcode table by retaining the same feature set
name (or, at least, the same macro name) for the existing instrutions; I think
that might be slightly clearer.

> +static const aarch64_feature_set aarch64_feature_sve_sme2_b16b16 =
> +  AARCH64_FEATURES (3, SVE_B16B16, SME2, SVE_SME2_B16B16);
> +static const aarch64_feature_set aarch64_feature_sme_b16b16 =
> +  AARCH64_FEATURES (2, SME_B16B16, SME2);
>  static const aarch64_feature_set aarch64_feature_sme2p1 =
>    AARCH64_FEATURE (SME2p1);
>  static const aarch64_feature_set aarch64_feature_sve2p1 =