[PATCH 2/2] x86/APX: V{BROADCAST, EXTRACT, INSERT}{F, I}128 can also be expressed
Jan Beulich
jbeulich@suse.com
Fri Feb 2 10:40:58 GMT 2024
Interestingly unlike VROUND{P,S}{S,D} and VPERM{F,I}128 they weren't
even present in the x86-64-apx-egpr-inval testcase, hence why I
overlooked that these can actually be encoded, (again) using suitable
AVX512 counterparts.
While there also "modernize" the adjacent AVX/AVX2 entries.
---
Here and also for the VROUND{P,S}{S,D} surrogates: If we omitted the
APX_F part of the conditions, dual VEX/EVEX templates ought to be
possible to use. Afaics the only (further) "anomaly" that would result
is that then {evex} prefixes also become usable with these mnemonics
even when APX is disabled. Of course for consistency the restriction to
memory-only forms should then also be lifted. (As to "further": The
other one is that these insns permit use of the high 16 XMM/YMM
registers along with the [intended] permitting of the eGPR-s.) Thoughts?
Some VPERM{F,I}128 forms could be expressed by VSHUF{F,I}32x4, others by
VPERM{PD,Q}, but perhaps this isn't worth it overall, first and foremost
because we wouldn't be able to reach full coverage.
--- a/gas/testsuite/gas/i386/x86-64-apx-evex-promoted-intel.d
+++ b/gas/testsuite/gas/i386/x86-64-apx-evex-promoted-intel.d
@@ -158,6 +158,12 @@ Disassembly of section \.text:
[ ]*[a-f0-9]+:[ ]*62 da 7f 08 4b b4 87 23 01 00 00[ ]+tileloadd tmm6,\[r31\+rax\*4\+0x123\]
[ ]*[a-f0-9]+:[ ]*62 da 7d 08 4b b4 87 23 01 00 00[ ]+tileloaddt1 tmm6,\[r31\+rax\*4\+0x123\]
[ ]*[a-f0-9]+:[ ]*62 da 7e 08 4b b4 87 23 01 00 00[ ]+tilestored[ ]+\[r31\+rax\*4\+0x123\],tmm6
+[ ]*[a-f0-9]+:[ ]*62 fa 7d 28 1a 18[ ]+vbroadcastf32x4 ymm3,XMMWORD PTR \[r16\]
+[ ]*[a-f0-9]+:[ ]*62 fa 7d 28 5a 18[ ]+vbroadcasti32x4 ymm3,XMMWORD PTR \[r16\]
+[ ]*[a-f0-9]+:[ ]*62 fb 7d 28 19 18 01[ ]+vextractf32x4 XMMWORD PTR \[r16\],ymm3,(0x)?1
+[ ]*[a-f0-9]+:[ ]*62 fb 7d 28 39 18 01[ ]+vextracti32x4 XMMWORD PTR \[r16\],ymm3,(0x)?1
+[ ]*[a-f0-9]+:[ ]*62 7b 65 28 18 00 01[ ]+vinsertf32x4 ymm8,ymm3,XMMWORD PTR \[r16\],(0x)?1
+[ ]*[a-f0-9]+:[ ]*62 7b 65 28 38 00 01[ ]+vinserti32x4 ymm8,ymm3,XMMWORD PTR \[r16\],(0x)?1
[ ]*[a-f0-9]+:[ ]*62 db fd 08 09 30 01[ ]+vrndscalepd xmm6,XMMWORD PTR \[r24\],(0x)?1
[ ]*[a-f0-9]+:[ ]*62 db 7d 08 08 30 02[ ]+vrndscaleps xmm6,XMMWORD PTR \[r24\],(0x)?2
[ ]*[a-f0-9]+:[ ]*62 db cd 08 0b 18 03[ ]+vrndscalesd xmm3,xmm6,QWORD PTR \[r24\],(0x)?3
--- a/gas/testsuite/gas/i386/x86-64-apx-evex-promoted.d
+++ b/gas/testsuite/gas/i386/x86-64-apx-evex-promoted.d
@@ -158,6 +158,12 @@ Disassembly of section \.text:
[ ]*[a-f0-9]+:[ ]*62 da 7f 08 4b b4 87 23 01 00 00[ ]+tileloadd[ ]+0x123\(%r31,%rax,4\),%tmm6
[ ]*[a-f0-9]+:[ ]*62 da 7d 08 4b b4 87 23 01 00 00[ ]+tileloaddt1[ ]+0x123\(%r31,%rax,4\),%tmm6
[ ]*[a-f0-9]+:[ ]*62 da 7e 08 4b b4 87 23 01 00 00[ ]+tilestored[ ]+%tmm6,0x123\(%r31,%rax,4\)
+[ ]*[a-f0-9]+:[ ]*62 fa 7d 28 1a 18[ ]+vbroadcastf32x4 \(%r16\),%ymm3
+[ ]*[a-f0-9]+:[ ]*62 fa 7d 28 5a 18[ ]+vbroadcasti32x4 \(%r16\),%ymm3
+[ ]*[a-f0-9]+:[ ]*62 fb 7d 28 19 18 01[ ]+vextractf32x4 \$(0x)?1,%ymm3,\(%r16\)
+[ ]*[a-f0-9]+:[ ]*62 fb 7d 28 39 18 01[ ]+vextracti32x4 \$(0x)?1,%ymm3,\(%r16\)
+[ ]*[a-f0-9]+:[ ]*62 7b 65 28 18 00 01[ ]+vinsertf32x4 \$(0x)?1,\(%r16\),%ymm3,%ymm8
+[ ]*[a-f0-9]+:[ ]*62 7b 65 28 38 00 01[ ]+vinserti32x4 \$(0x)?1,\(%r16\),%ymm3,%ymm8
[ ]*[a-f0-9]+:[ ]*62 db fd 08 09 30 01[ ]+vrndscalepd \$0x1,\(%r24\),%xmm6
[ ]*[a-f0-9]+:[ ]*62 db 7d 08 08 30 02[ ]+vrndscaleps \$0x2,\(%r24\),%xmm6
[ ]*[a-f0-9]+:[ ]*62 db cd 08 0b 18 03[ ]+vrndscalesd \$0x3,\(%r24\),%xmm6,%xmm3
--- a/gas/testsuite/gas/i386/x86-64-apx-evex-promoted.s
+++ b/gas/testsuite/gas/i386/x86-64-apx-evex-promoted.s
@@ -152,6 +152,12 @@ _start:
tileloadd 0x123(%r31,%rax,4),%tmm6
tileloaddt1 0x123(%r31,%rax,4),%tmm6
tilestored %tmm6,0x123(%r31,%rax,4)
+ vbroadcastf128 (%r16),%ymm3
+ vbroadcasti128 (%r16),%ymm3
+ vextractf128 $1,%ymm3,(%r16)
+ vextracti128 $1,%ymm3,(%r16)
+ vinsertf128 $1,(%r16),%ymm3,%ymm8
+ vinserti128 $1,(%r16),%ymm3,%ymm8
vroundpd $1,(%r24),%xmm6
vroundps $2,(%r24),%xmm6
vroundsd $3,(%r24),%xmm6,%xmm3
--- a/opcodes/i386-opc.tbl
+++ b/opcodes/i386-opc.tbl
@@ -1588,7 +1588,9 @@ vandnp<sd>, 0x<sd:ppfx>55, AVX, Modrm|Ve
vandp<sd>, 0x<sd:ppfx>54, AVX, Modrm|C|Vex|Space0F|VexVVVV|VexWIG|CheckOperandSize|NoSuf, { Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM, RegXMM|RegYMM }
vblendp<sd>, 0x660c | <sd:opc>, AVX, Modrm|Vex|Space0F3A|VexVVVV|VexWIG|CheckOperandSize|NoSuf, { Imm8|Imm8S, Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM, RegXMM|RegYMM }
vblendvp<sd>, 0x664a | <sd:opc>, AVX, Modrm|Vex|Space0F3A|VexVVVV|VexW0|CheckOperandSize|NoSuf, { RegXMM|RegYMM, Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM, RegXMM|RegYMM }
-vbroadcastf128, 0x661a, AVX, Modrm|Vex=2|Space0F38|VexW=1|NoSuf, { Xmmword|Unspecified|BaseIndex, RegYMM }
+vbroadcastf128, 0x661a, AVX, Modrm|Vex256|Space0F38|VexW0|NoSuf, { Xmmword|Unspecified|BaseIndex, RegYMM }
+// vbroadcastf32x4 in disguise (see vround{p,s}{s,d} comment)
+vbroadcastf128, 0x661a, APX_F&AVX512VL, Modrm|EVex256|Space0F38|VexW0|Disp8MemShift=4|NoSuf, { Xmmword|Unspecified|BaseIndex, RegYMM }
vbroadcastsd, 0x6619, AVX, Modrm|Vex256|Space0F38|VexW0|NoSuf, { Qword|Unspecified|BaseIndex, RegYMM }
vbroadcastss, 0x6618, AVX, Modrm|Vex128|Space0F38|VexW0|NoSuf, { Dword|Unspecified|BaseIndex, RegXMM|RegYMM }
vcmp<frel>p<sd>, 0x<sd:ppfx>c2/0x<frel:imm>, AVX, Modrm|<frel:comm>|Vex|Space0F|VexVVVV|VexWIG|CheckOperandSize|NoSuf|ImmExt, { RegXMM|RegYMM|Unspecified|BaseIndex, RegXMM|RegYMM, RegXMM|RegYMM }
@@ -1616,7 +1618,9 @@ vdivp<sd>, 0x<sd:ppfx>5e, AVX, Modrm|Vex
vdivs<sd>, 0x<sd:spfx>5e, AVX, Modrm|VexLIG|Space0F|VexVVVV|VexWIG|NoSuf, { <sd:elem>|Unspecified|BaseIndex|RegXMM, RegXMM, RegXMM }
vdppd, 0x6641, AVX, Modrm|Vex|Space0F3A|VexVVVV|VexWIG|NoSuf, { Imm8|Imm8S, Unspecified|BaseIndex|RegXMM, RegXMM, RegXMM }
vdpps, 0x6640, AVX, Modrm|Vex|Space0F3A|VexVVVV|VexWIG|CheckOperandSize|NoSuf, { Imm8|Imm8S, Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM, RegXMM|RegYMM }
-vextractf128, 0x6619, AVX, Modrm|Vex=2|Space0F3A|VexW=1|NoSuf, { Imm8, RegYMM, Unspecified|BaseIndex|RegXMM }
+vextractf128, 0x6619, AVX, Modrm|Vex256|Space0F3A|VexW0|NoSuf, { Imm8, RegYMM, Unspecified|BaseIndex|RegXMM }
+// vextractf32x4 in disguise (see vround{p,s}{s,d} comment)
+vextractf128, 0x6619, APX_F&AVX512VL, Modrm|EVex256|Space0F3A|VexW0|Disp8MemShift=4|NoSuf, { Imm8, RegYMM, Xmmword|Unspecified|BaseIndex }
vextractps, 0x6617, AVX|AVX512F, Modrm|Vex128|EVex128|Space0F3A|VexWIG|Disp8MemShift=2|NoSuf, { Imm8, RegXMM, Reg32|Unspecified|BaseIndex }
vextractps, 0x6617, x64&(AVX|AVX512F), RegMem|Vex128|EVex128|Space0F3A|VexWIG|NoSuf, { Imm8, RegXMM, Reg64 }
vhaddpd, 0x667c, AVX, Modrm|Vex|Space0F|VexVVVV|VexWIG|CheckOperandSize|NoSuf, { Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM, RegXMM|RegYMM }
@@ -1624,6 +1628,8 @@ vhaddps, 0xf27c, AVX, Modrm|Vex|Space0F|
vhsubpd, 0x667d, AVX, Modrm|Vex|Space0F|VexVVVV|VexWIG|CheckOperandSize|NoSuf, { Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM, RegXMM|RegYMM }
vhsubps, 0xf27d, AVX, Modrm|Vex|Space0F|VexVVVV|VexWIG|CheckOperandSize|NoSuf, { Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM, RegXMM|RegYMM }
vinsertf128, 0x6618, AVX, Modrm|Vex256|Space0F3A|VexVVVV|VexW0|NoSuf, { Imm8, Unspecified|BaseIndex|RegXMM, RegYMM, RegYMM }
+// vinsertf32x4 in disguise (see vround{p,s}{s,d} comment)
+vinsertf128, 0x6618, APX_F&AVX512VL, Modrm|EVex256|Space0F3A|VexVVVV|VexW0|Disp8MemShift=4|NoSuf, { Imm8, Xmmword|Unspecified|BaseIndex, RegYMM, RegYMM }
vinsertps, 0x6621, AVX, Modrm|Vex|Space0F3A|VexVVVV|VexWIG|NoSuf, { Imm8, Dword|Unspecified|BaseIndex|RegXMM, RegXMM, RegXMM }
vlddqu, 0xf2f0, AVX, Modrm|Vex|Space0F|VexWIG|CheckOperandSize|NoSuf, { Xmmword|Ymmword|Unspecified|BaseIndex, RegXMM|RegYMM }
vldmxcsr, 0xae/2, AVX, Modrm|Vex128|Space0F|VexWIG|NoSuf, { Dword|Unspecified|BaseIndex }
@@ -1830,7 +1836,9 @@ vpmovzxwq, 0x6634, AVX2|AVX512VL, Modrm|
// New AVX2 instructions.
-vbroadcasti128, 0x665A, AVX2, Modrm|Vex=2|Space0F38|VexW=1|NoSuf, { Xmmword|Unspecified|BaseIndex, RegYMM }
+vbroadcasti128, 0x665A, AVX2, Modrm|Vex256|Space0F38|VexW0|NoSuf, { Xmmword|Unspecified|BaseIndex, RegYMM }
+// vbroadcasti32x4 in disguise (see vround{p,s}{s,d} comment)
+vbroadcasti128, 0x665a, APX_F&AVX512VL, Modrm|EVex256|Space0F38|VexW0|Disp8MemShift=4|NoSuf, { Xmmword|Unspecified|BaseIndex, RegYMM }
vbroadcastsd, 0x6619, AVX2, Modrm|Vex=2|Space0F38|VexW=1|NoSuf, { RegXMM, RegYMM }
vbroadcastss, 0x6618, AVX2|AVX512F, Modrm|Vex|EVexDYN|Masking|Space0F38|VexW0|Disp8MemShift=2|NoSuf, { RegXMM|Dword|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM }
vpblendd, 0x6602, AVX2, Modrm|Vex|Space0F3A|VexVVVV|VexW0|CheckOperandSize|NoSuf, { Imm8|Imm8S, Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM, RegXMM|RegYMM }
@@ -1842,8 +1850,12 @@ vpermd, 0x6636, AVX2|AVX512F, Modrm|Vex2
vpermpd, 0x6601, AVX2|AVX512F, Modrm|Vex256|EVexDYN|Masking|Space0F3A|VexW1|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf, { Imm8|Imm8S, RegYMM|RegZMM|Qword|Unspecified|BaseIndex, RegYMM|RegZMM }
vpermps, 0x6616, AVX2|AVX512F, Modrm|Vex256|EVexDYN|Masking|Space0F38|VexVVVV|VexW0|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf, { RegYMM|RegZMM|Dword|Unspecified|BaseIndex, RegYMM|RegZMM, RegYMM|RegZMM }
vpermq, 0x6600, AVX2|AVX512F, Modrm|Vex256|EVexDYN|Masking|Space0F3A|VexW1|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf, { Imm8|Imm8S, RegYMM|RegZMM|Qword|Unspecified|BaseIndex, RegYMM|RegZMM }
-vextracti128, 0x6639, AVX2, Modrm|Vex=2|Space0F3A|VexW=1|NoSuf, { Imm8, RegYMM, Unspecified|BaseIndex|RegXMM }
+vextracti128, 0x6639, AVX2, Modrm|Vex256|Space0F3A|VexW0|NoSuf, { Imm8, RegYMM, Unspecified|BaseIndex|RegXMM }
+// vextracti32x4 in disguise (see vround{p,s}{s,d} comment)
+vextracti128, 0x6639, APX_F&AVX512VL, Modrm|EVex256|Space0F3A|VexW0|Disp8MemShift=4|NoSuf, { Imm8, RegYMM, Xmmword|Unspecified|BaseIndex }
vinserti128, 0x6638, AVX2, Modrm|Vex256|Space0F3A|VexVVVV|VexW0|NoSuf, { Imm8, Unspecified|BaseIndex|RegXMM, RegYMM, RegYMM }
+// vinserti32x4 in disguise (see vround{p,s}{s,d} comment)
+vinserti128, 0x6638, APX_F&AVX512VL, Modrm|EVex256|Space0F3A|VexVVVV|VexW0|Disp8MemShift=4|NoSuf, { Imm8, Xmmword|Unspecified|BaseIndex, RegYMM, RegYMM }
vpmaskmov<dq>, 0x668e, AVX2, Modrm|Vex|Space0F38|VexVVVV|<dq:vexw>|CheckOperandSize|NoSuf, { RegXMM|RegYMM, RegXMM|RegYMM, Xmmword|Ymmword|Unspecified|BaseIndex }
vpmaskmov<dq>, 0x668c, AVX2, Modrm|Vex|Space0F38|VexVVVV|<dq:vexw>|CheckOperandSize|NoSuf, { Xmmword|Ymmword|Unspecified|BaseIndex, RegXMM|RegYMM, RegXMM|RegYMM }
vpsllv<dq>, 0x6647, AVX2|AVX512F, Modrm|Vex|EVexDYN|Masking|Space0F38|VexVVVV|<dq:vexw>|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf, { RegXMM|RegYMM|RegZMM|<dq:elem>|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegXMM|RegYMM|RegZMM }
More information about the Binutils
mailing list