[PATCH] Support Intel AVX10.2 minmax, vector copy and compare instructions
Jan Beulich
jbeulich@suse.com
Tue Dec 10 07:57:01 GMT 2024
On 04.12.2024 08:34, Haochen Jiang wrote:
> --- /dev/null
> +++ b/gas/testsuite/gas/i386/avx10_2-256-miscs.s
> @@ -0,0 +1,183 @@
> +# Check 32bit AVX10.2/256 instructions
> +
> + .arch generic32
> + .arch .avx10.2/256
> + .text
> +_start:
> + vminmaxpbf16 $123, %xmm4, %xmm5, %xmm6
> + vminmaxpbf16 $123, %ymm4, %ymm5, %ymm6
> + vminmaxpbf16 $123, 0x10000000(%esp, %esi, 8), %ymm5, %ymm6{%k7}
> + vminmaxpbf16 $123, (%ecx){1to16}, %ymm5, %ymm6
> + vminmaxpbf16 $123, 4064(%ecx), %ymm5, %ymm6
> + vminmaxpbf16 $123, -256(%edx){1to16}, %ymm5, %ymm6{%k7}{z}
> + vminmaxpbf16 $123, 0x10000000(%esp, %esi, 8), %xmm5, %xmm6{%k7}
> + vminmaxpbf16 $123, (%ecx){1to8}, %xmm5, %xmm6
> + vminmaxpbf16 $123, 2032(%ecx), %xmm5, %xmm6
> + vminmaxpbf16 $123, -256(%edx){1to8}, %xmm5, %xmm6{%k7}{z}
> + vminmaxpd $123, %xmm4, %xmm5, %xmm6
> + vminmaxpd $123, %ymm4, %ymm5, %ymm6
> + vminmaxpd $123, {sae}, %ymm4, %ymm5, %ymm6
> + vminmaxpd $123, 0x10000000(%esp, %esi, 8), %ymm5, %ymm6{%k7}
> + vminmaxpd $123, (%ecx){1to4}, %ymm5, %ymm6
> + vminmaxpd $123, 4064(%ecx), %ymm5, %ymm6
> + vminmaxpd $123, -1024(%edx){1to4}, %ymm5, %ymm6{%k7}{z}
> + vminmaxpd $123, 0x10000000(%esp, %esi, 8), %xmm5, %xmm6{%k7}
> + vminmaxpd $123, (%ecx){1to2}, %xmm5, %xmm6
> + vminmaxpd $123, 2032(%ecx), %xmm5, %xmm6
> + vminmaxpd $123, -1024(%edx){1to2}, %xmm5, %xmm6{%k7}{z}
> + vminmaxph $123, %xmm4, %xmm5, %xmm6
> + vminmaxph $123, %ymm4, %ymm5, %ymm6
> + vminmaxph $123, {sae}, %ymm4, %ymm5, %ymm6
> + vminmaxph $123, 0x10000000(%esp, %esi, 8), %ymm5, %ymm6{%k7}
> + vminmaxph $123, (%ecx){1to16}, %ymm5, %ymm6
> + vminmaxph $123, 4064(%ecx), %ymm5, %ymm6
> + vminmaxph $123, -256(%edx){1to16}, %ymm5, %ymm6{%k7}{z}
> + vminmaxph $123, 0x10000000(%esp, %esi, 8), %xmm5, %xmm6{%k7}
> + vminmaxph $123, (%ecx){1to8}, %xmm5, %xmm6
> + vminmaxph $123, 2032(%ecx), %xmm5, %xmm6
> + vminmaxph $123, -256(%edx){1to8}, %xmm5, %xmm6{%k7}{z}
> + vminmaxps $123, %xmm4, %xmm5, %xmm6
> + vminmaxps $123, %ymm4, %ymm5, %ymm6
> + vminmaxps $123, {sae}, %ymm4, %ymm5, %ymm6
> + vminmaxps $123, 0x10000000(%esp, %esi, 8), %ymm5, %ymm6{%k7}
> + vminmaxps $123, (%ecx){1to8}, %ymm5, %ymm6
> + vminmaxps $123, 4064(%ecx), %ymm5, %ymm6
> + vminmaxps $123, -512(%edx){1to8}, %ymm5, %ymm6{%k7}{z}
> + vminmaxps $123, 0x10000000(%esp, %esi, 8), %xmm5, %xmm6{%k7}
> + vminmaxps $123, (%ecx){1to4}, %xmm5, %xmm6
> + vminmaxps $123, 2032(%ecx), %xmm5, %xmm6
> + vminmaxps $123, -512(%edx){1to4}, %xmm5, %xmm6{%k7}{z}
> + vminmaxsd $123, %xmm4, %xmm5, %xmm6
> + vminmaxsd $123, {sae}, %xmm4, %xmm5, %xmm6
> + vminmaxsd $123, 0x10000000(%esp, %esi, 8), %xmm5, %xmm6{%k7}
> + vminmaxsd $123, (%ecx), %xmm5, %xmm6
> + vminmaxsd $123, 1016(%ecx), %xmm5, %xmm6
> + vminmaxsd $123, -1024(%edx), %xmm5, %xmm6{%k7}{z}
> + vminmaxsh $123, %xmm4, %xmm5, %xmm6
> + vminmaxsh $123, {sae}, %xmm4, %xmm5, %xmm6
> + vminmaxsh $123, 0x10000000(%esp, %esi, 8), %xmm5, %xmm6{%k7}
> + vminmaxsh $123, (%ecx), %xmm5, %xmm6
> + vminmaxsh $123, 254(%ecx), %xmm5, %xmm6
> + vminmaxsh $123, -256(%edx), %xmm5, %xmm6{%k7}{z}
> + vminmaxss $123, %xmm4, %xmm5, %xmm6
> + vminmaxss $123, {sae}, %xmm4, %xmm5, %xmm6
> + vminmaxss $123, 0x10000000(%esp, %esi, 8), %xmm5, %xmm6{%k7}
> + vminmaxss $123, (%ecx), %xmm5, %xmm6
> + vminmaxss $123, 508(%ecx), %xmm5, %xmm6
> + vminmaxss $123, -512(%edx), %xmm5, %xmm6{%k7}{z}
Did you consider using .irp for the above as well, like you already do ...
> + vmovd %xmm5, %xmm6
> + vmovd.s %xmm5, %xmm6
> + vmovw %xmm5, %xmm6
> + vmovw.s %xmm5, %xmm6
> +
> + .irp m, "", u
> + v\m\()comxsd %xmm5, %xmm6
... here? I realize the broadcast forms would need pulling out, yet that
would still be a fair reduction of redundancy.
> --- a/opcodes/i386-dis-evex-len.h
> +++ b/opcodes/i386-dis-evex-len.h
> @@ -1,4 +1,14 @@
> static const struct dis386 evex_len_table[][3] = {
> + /* EVEX_LEN_0F7E_P_1_W_1 */
> + {
> + { "vmovd", { XMScalar, EXd }, 0 },
> + },
Isn't this ..._W_0? (There's also an extra blank there.)
> --- a/opcodes/i386-opc.tbl
> +++ b/opcodes/i386-opc.tbl
> @@ -1930,10 +1930,10 @@ vcvtps2ph, 0x661d, F16C, Modrm|Vex=2|Space0F3A|VexW=1|NoSuf, { Imm8, RegYMM, Uns
>
> <fma:opc, 132:10, 213:20, 231:30>
>
> -<sdh:cpu:cpudq:fma:ppfx:spfx:pfx:spc1:spc2:opc:vex:vexlig:vexw:elem, +
> - s:AVX512F:AVX512DQ:FMA|AVX512F::f3:66:Space0F:Space0F38:0:Vex|EVexDYN:VexLIG|EVexLIG:VexW0:Dword, +
> - d:AVX512F:AVX512DQ:FMA|AVX512F:66:f2:66:Space0F:Space0F38:1:Vex|EVexDYN:VexLIG|EVexLIG:VexW1:Qword, +
> - h:AVX512_FP16:AVX512_FP16:AVX512_FP16::f3::Map5:Map6:0::EVexLIG:VexW0:Word>
> +<sdh:cpu:cpudq:fma:ppfx:spfx:pfx:spc1:spc2:opc:vex:vexlig:vexw:elem:sdisp8, +
> + s:AVX512F:AVX512DQ:FMA|AVX512F::f3:66:Space0F:Space0F38:0:Vex|EVexDYN:VexLIG|EVexLIG:VexW0:Dword:Disp8MemShift=2, +
> + d:AVX512F:AVX512DQ:FMA|AVX512F:66:f2:66:Space0F:Space0F38:1:Vex|EVexDYN:VexLIG|EVexLIG:VexW1:Qword:Disp8MemShift=3, +
> + h:AVX512_FP16:AVX512_FP16:AVX512_FP16::f3::Map5:Map6:0::EVexLIG:VexW0:Word:Disp8MemShift=1>
I don't think this is needed; see vcomis<sdh> and vucomis<sdh>. They simply
use Disp8MemShift without a value, and the same ought to work here.
Jan
More information about the Binutils mailing list