[PATCH 2/8] Support APX GPR32 with extend evex prefix
Jan Beulich
jbeulich@suse.com
Fri Sep 22 10:12:15 GMT 2023
On 19.09.2023 17:25, Cui, Lili wrote:
> --- a/gas/config/tc-i386.c
> +++ b/gas/config/tc-i386.c
> @@ -1945,6 +1945,30 @@ cpu_flags_match (const insn_template *t)
> && (!x.bitfield.cpuvpclmulqdq || cpu.bitfield.cpuvpclmulqdq))
> match |= CPU_FLAGS_ARCH_MATCH;
> }
> + else if (x.bitfield.cpuapx_f)
> + {
> + if (cpu.bitfield.cpuapx_f
> + && (!x.bitfield.cpumovbe || cpu.bitfield.cpumovbe)
> + && (!x.bitfield.cpuept || cpu.bitfield.cpuept)
> + && (!x.bitfield.cpuinvpcid || cpu.bitfield.cpuinvpcid)
> + && (!x.bitfield.cpusse4_2 || cpu.bitfield.cpusse4_2)
> + && (!x.bitfield.cpubmi2 || cpu.bitfield.cpubmi2)
> + && (!x.bitfield.cpubmi || cpu.bitfield.cpubmi)
> + && (!x.bitfield.cpuadx || cpu.bitfield.cpuadx)
> + && (!x.bitfield.cpusha || cpu.bitfield.cpusha)
> + && (!x.bitfield.cpuavx512bw || cpu.bitfield.cpuavx512bw)
> + && (!x.bitfield.cpuavx512dq || cpu.bitfield.cpuavx512dq)
> + && (!x.bitfield.cpuavx512f || cpu.bitfield.cpuavx512f)
> + && (!x.bitfield.cpushstk || cpu.bitfield.cpushstk)
> + && (!x.bitfield.cpumovdir64b || cpu.bitfield.cpumovdir64b)
> + && (!x.bitfield.cpumovdiri || cpu.bitfield.cpumovdiri)
> + && (!x.bitfield.cpuenqcmd || cpu.bitfield.cpuenqcmd)
> + && (!x.bitfield.cpukl || cpu.bitfield.cpukl)
> + && (!x.bitfield.cpuwidekl || cpu.bitfield.cpuwidekl)
> + && (!x.bitfield.cpucmpccxadd || cpu.bitfield.cpucmpccxadd)
> + && (!x.bitfield.cpurao_int || cpu.bitfield.cpurao_int))
> + match |= CPU_FLAGS_ARCH_MATCH;
> + }
> else
> match |= CPU_FLAGS_ARCH_MATCH;
>
This is getting unwieldy, so I think we will need to think of a better way
of expressing both "multiple ISAs need to be enabled" and "one of a set of
ISAs needs to be enabled". It's only the mix of these expressed in a
uniform way in the insn table that requires these extra conditionals. With
the size of i386_cpu_attr greatly shrunk as of recently, I wonder if we
couldn't simply add a 2nd instance of it to insn_template. One would be
"all of these are required", while the other would be "any one of these is
sufficient".
> @@ -3850,7 +3874,10 @@ is_any_vex_encoding (const insn_template *t)
> static INLINE bool
> is_any_apx_encoding (void)
> {
> - return i.rex2 || i.rex2_encoding;
> + return i.rex2
> + || i.rex2_encoding
> + || (i.vex.register_specifier
> + && i.vex.register_specifier->reg_flags & RegRex2);
Nit: For readability as well as for consistency this wants indenting
differently:
return i.rex2
|| i.rex2_encoding
|| (i.vex.register_specifier
&& i.vex.register_specifier->reg_flags & RegRex2);
or possibly (slightly shorter)
return i.rex2 || i.rex2_encoding
|| (i.vex.register_specifier
&& i.vex.register_specifier->reg_flags & RegRex2);
In any event you want to avoid trailing blanks on any line.
> @@ -3859,6 +3886,12 @@ is_any_apx_rex2_encoding (void)
> return (i.rex2 && i.vex.length == 2) || i.rex2_encoding;
> }
>
> +static INLINE bool
> +is_any_apx_evex_encoding (void)
> +{
> + return i.rex2 && i.vex.length == 4;
> +}
This doesn't feel right: {evex} use would demand this encoding even if
i.rex2 is still zero.
Also - what is "any" in the name (also of the earlier predicate) intending
to express? is_any_vex_encoding() is named the way it is because it covers
both VEX and EVEX.
> @@ -4129,6 +4162,50 @@ build_rex2_prefix (void)
> | (i.rex2 << 4) | i.rex);
> }
>
> +/* Build the EVEX prefix (4-byte) for evex insn
> + | 62h |
> + | `R`X`B`R' | B'mmm |
> + | W | v`v`v`v | `x' | pp |
> + | z| L'L | b | `v | aaa |
> +*/
> +static void
> +build_evex_insns_with_extend_evex_prefix (void)
The name is somewhat odd and doesn't fit that of other similar functions.
In particular this function doesn't build an entire insn, but still just
the prefix. So perhaps build_apx_evex_prefix()?
> +{
> + build_evex_prefix ();
> + if (i.rex2 & REX_R)
> + i.vex.bytes[1] &= 0xef;
> + if (i.vex.register_specifier
> + && register_number (i.vex.register_specifier) > 0xf)
> + i.vex.bytes[3] &=0xf7;
Nit: Missing blank.
But: Is this needed? Doesn't build_evex_prefix() fill this bit already,
which isn't new in APX?
> + if (i.rex2 & REX_B)
> + i.vex.bytes[1] |= 0x08;
> + if (i.rex2 & REX_X)
> + i.vex.bytes[2] &= 0xfb;
> +}
> +
> +/* Build the EVEX prefix (4-byte) for legacy insn
> + | 62h |
> + | `R`X`B`R' | B'100 |
> + | W | v`v`v`v | `x' | pp |
> + | 000 | ND | `v | NF | 00 |
> + For legacy insn without ndd nor nf, [vvvvv] must be all zero. */
> +static void
> +build_legacy_insns_with_apx_encoding (void)
As per above, maybe build_extended_evex_prefix()? Or, ...
> +{
> + /* map{0,1} of legacy space without ndd or nf could use rex2 prefix. */
> + if (i.tm.opcode_space <= SPACE_0F
> + && !i.vex.register_specifier && !i.has_nf && !i.has_zero_upper)
> + return build_rex2_prefix ();
... because of this, build_apx_prefix()? Yet I think the call to this
function might better remain in the caller.
> + if (i.prefix[DATA_PREFIX] != 0)
> + {
> + i.tm.opcode_modifier.opcodeprefix = PREFIX_0X66;
> + i.prefix[DATA_PREFIX] = 0;
> + }
While this looks to be correct for the case when the prefix was derived
from an insn template and the use of 16-bit operands, I don't think it
is uniformly correct when "data16" was used as a prefix explicitly. In
such a case either REX2 encoding needs to be used, or an error needs
emitting.
You may further want to assert that i.tm.opcode_modifier.opcodeprefix
is still zero ahead of the assignment.
> @@ -10057,7 +10136,7 @@ output_insn (void)
>
> /* Since the VEX/EVEX prefix contains the implicit prefix, we
> don't need the explicit prefix. */
> - if (!is_any_vex_encoding (&i.tm))
> + if (!is_any_vex_encoding (&i.tm) && !is_any_apx_evex_encoding ())
> {
> switch (i.tm.opcode_modifier.opcodeprefix)
I'm not convinced the use of this predicate is appropriate here. I'd
generally have expected is_any_vex_encoding() to be extended to also
detect all cases of EVEX encodings in APX.
> --- a/opcodes/i386-dis-evex-len.h
> +++ b/opcodes/i386-dis-evex-len.h
As for the earlier patch, I'll look at the disassembler changes separately.
> @@ -1121,6 +1122,15 @@ process_i386_opcode_modifier (FILE *table, char *mod, unsigned int space,
> fprintf (stderr,
> "%s: %d: W modifier without Word/Dword/Qword operand(s)\n",
> filename, lineno);
> + if (modifiers[Vex].value
> + || (space > SPACE_0F
> + && !(space == SPACE_EVEXMAP4
> + || modifiers[EVex].value
> + || modifiers[Disp8MemShift].value
> + || modifiers[Broadcast].value
> + || modifiers[Masking].value
> + || modifiers[SAE].value)))
First of all, this wants simplifying to
if (modifiers[Vex].value
|| (space > SPACE_0F
&& space != SPACE_EVEXMAP4
&& !modifiers[EVex].value
&& !modifiers[Disp8MemShift].value
&& !modifiers[Broadcast].value
&& !modifiers[Masking].value
&& !modifiers[SAE].value))
which helps readability and makes more obvious that this parallels
tc-i386.c:is_evex_encoding(). Such a connection, where updates need
to be made in sync, needs pointing out in code comments at both sites.
Yet of course this condition won't hold anymore for combined VEX/EVEX
templates.
> + modifiers[No_egpr].value = 1;
> }
And then - shouldn't at least part of this already be put in place in
patch 1?
Finally, to avoid the split between where this attribute gets set,
wouldn't it be possible to also handle the XSAVE/XRSTOR variants here
rather than directly in the opcode table?
> @@ -187,6 +188,7 @@ mov, 0xf24, i386|No64, D|RegMem|IgnoreSize|No_bSuf|No_wSuf|No_sSuf|No_qSuf, { Te
>
> // Move after swapping the bytes
> movbe, 0x0f38f0, Movbe, D|Modrm|CheckOperandSize|No_bSuf|No_sSuf, { Word|Dword|Qword|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }
> +movbe, 0x60, Movbe|APX_F|x64, D|Modrm|CheckOperandSize|No_bSuf|No_sSuf|EVex128|EVexMap4, { Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }
In new code please omit redundant Word, Dword, and alike.
I further wonder if it wouldn't help if i386-gen inserted the x64 for
all APX templates, rather than open-coding that on every single template.
Or alternatively put
#define APX_F APX_F|x64
earlier in the file.
> @@ -300,6 +302,9 @@ sbb, 0x18, 0, D|W|CheckOperandSize|Modrm|No_sSuf|HLEPrefixLock, { Reg8|Reg16|Reg
> sbb, 0x83/3, 0, Modrm|No_bSuf|No_sSuf|HLEPrefixLock, { Imm8S, Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex }
> sbb, 0x1c, 0, W|No_sSuf, { Imm8|Imm16|Imm32|Imm32S, Acc|Byte|Word|Dword|Qword }
> sbb, 0x80/3, 0, W|Modrm|No_sSuf|HLEPrefixLock, { Imm8|Imm16|Imm32|Imm32S, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
> +sbb, 0x18, APX_F|x64, D|W|CheckOperandSize|Modrm|EVex128|EVexMap4|No_sSuf, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
> +sbb, 0x83/3, APX_F|x64, Modrm|EVex128|EVexMap4|No_bSuf|No_sSuf, { Imm8S, Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex }
> +sbb, 0x80/3, APX_F|x64, W|Modrm|EVex128|EVexMap4|No_sSuf, { Imm8|Imm16|Imm32|Imm32S, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
>
> cmp, 0x38, 0, D|W|CheckOperandSize|Modrm|No_sSuf, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
> cmp, 0x83/7, 0, Modrm|No_bSuf|No_sSuf, { Imm8S, Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex }
> @@ -332,9 +337,14 @@ adc, 0x10, 0, D|W|CheckOperandSize|Modrm|No_sSuf|HLEPrefixLock, { Reg8|Reg16|Reg
> adc, 0x83/2, 0, Modrm|No_bSuf|No_sSuf|HLEPrefixLock, { Imm8S, Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex }
> adc, 0x14, 0, W|No_sSuf, { Imm8|Imm16|Imm32|Imm32S, Acc|Byte|Word|Dword|Qword }
> adc, 0x80/2, 0, W|Modrm|No_sSuf|HLEPrefixLock, { Imm8|Imm16|Imm32|Imm32S, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
> +adc, 0x10, APX_F|x64, D|W|CheckOperandSize|Modrm|EVex128|EVexMap4|No_sSuf, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
> +adc, 0x83/2, APX_F|x64, Modrm|EVex128|EVexMap4|No_bSuf|No_sSuf, { Imm8S, Reg16|Reg32|Reg64|Word|Dword|Qword|Unspecified|BaseIndex }
> +adc, 0x80/2, APX_F|x64, W|Modrm|EVex128|EVexMap4|No_sSuf, { Imm8|Imm16|Imm32|Imm32S, Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
>
> neg, 0xf6/3, 0, W|Modrm|No_sSuf|HLEPrefixLock, { Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
> +
> not, 0xf6/2, 0, W|Modrm|No_sSuf|HLEPrefixLock, { Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
> +not, 0xf6/2, APX_F|x64, W|Modrm|No_sSuf|EVex128|EVexMap4, { Reg8|Reg16|Reg32|Reg64|Byte|Word|Dword|Qword|Unspecified|BaseIndex }
Looking at just the additions up to here, I'm getting the impression that
in this patch - despite its title - you only add non-ND, non-NF insn forms
for previously non-VEX-encoded insns. This could do with clarifying, by
both making the title more concise and by stating the exact scope of the
work done in the description.
> @@ -1312,13 +1330,16 @@ getsec, 0xf37, SMX, NoSuf, {}
>
> invept, 0x660f3880, EPT|No64, Modrm|IgnoreSize|NoSuf, { Oword|Unspecified|BaseIndex, Reg32 }
> invept, 0x660f3880, EPT|x64, Modrm|NoSuf|NoRex64, { Oword|Unspecified|BaseIndex, Reg64 }
> +invept, 0xf3f0, APX_F|EPT|x64, Modrm|NoSuf|NoRex64|EVex128|EVexMap4, { Oword|Unspecified|BaseIndex, Reg64 }
> invvpid, 0x660f3881, EPT|No64, Modrm|IgnoreSize|NoSuf, { Oword|Unspecified|BaseIndex, Reg32 }
> invvpid, 0x660f3881, EPT|x64, Modrm|NoSuf|NoRex64, { Oword|Unspecified|BaseIndex, Reg64 }
> +invvpid, 0xf3f1, APX_F|EPT|x64, Modrm|NoSuf|NoRex64|EVex128|EVexMap4, { Oword|Unspecified|BaseIndex, Reg64 }
>
> // INVPCID instruction
>
> invpcid, 0x660f3882, INVPCID|No64, Modrm|IgnoreSize|NoSuf, { Oword|Unspecified|BaseIndex, Reg32 }
> invpcid, 0x660f3882, INVPCID|x64, Modrm|NoSuf|NoRex64, { Oword|Unspecified|BaseIndex, Reg64 }
> +invpcid, 0xf3f2, APX_F|INVPCID|x64, Modrm|NoSuf|NoRex64|EVex128|EVexMap4, { Oword|Unspecified|BaseIndex, Reg64 }
I don't think NoRex64 belongs in any EVEX template.
> @@ -1418,7 +1439,9 @@ pcmpestrm, 0x660f3a60, SSE4_2|x64, Modrm|IgnoreSize|No_bSuf|No_wSuf|No_sSuf, { I
> pcmpistri<sse42>, 0x660f3a63, <sse42:cpu>, Modrm|<sse42:attr>|NoSuf, { Imm8, RegXMM|Unspecified|BaseIndex, RegXMM }
> pcmpistrm<sse42>, 0x660f3a62, <sse42:cpu>, Modrm|<sse42:attr>|NoSuf, { Imm8, RegXMM|Unspecified|BaseIndex, RegXMM }
> crc32, 0xf20f38f0, SSE4_2, W|Modrm|No_sSuf|No_qSuf, { Reg8|Reg16|Reg32|Unspecified|BaseIndex, Reg32 }
> +crc32, 0xf0, APX_F|x64, W|Modrm|No_sSuf|No_qSuf|EVex128|EVexMap4, { Reg8|Reg16|Reg32|Unspecified|BaseIndex, Reg32 }
> crc32, 0xf20f38f0, SSE4_2|x64, W|Modrm|No_wSuf|No_lSuf|No_sSuf, { Reg8|Reg64|Unspecified|BaseIndex, Reg64 }
> +crc32, 0xf0, APX_F|x64, W|Modrm|No_wSuf|No_lSuf|No_sSuf|EVex128|EVexMap4, { Reg8|Reg64|Unspecified|BaseIndex, Reg64 }
There's quite a bit of logic in tc-i386.c to get CRC32 right. I wonder
if you can really get away without adjusting that logic to also take
effect on the EVEX encodings.
> @@ -3408,3 +3487,4 @@ erets, 0xf20f01ca, FRED|x64, NoSuf, {}
> eretu, 0xf30f01ca, FRED|x64, NoSuf, {}
>
> // FRED instructions end.
> +
Nit: Stray change.
Jan
More information about the Binutils
mailing list