[PATCH] Support Intel AVX10.2 minmax, vector copy and compare instructions

Jan Beulich jbeulich@suse.com
Tue Dec 10 07:57:01 GMT 2024


On 04.12.2024 08:34, Haochen Jiang wrote:
> --- /dev/null
> +++ b/gas/testsuite/gas/i386/avx10_2-256-miscs.s
> @@ -0,0 +1,183 @@
> +# Check 32bit AVX10.2/256 instructions
> +
> +	.arch generic32
> +	.arch .avx10.2/256
> +	.text
> +_start:
> +	vminmaxpbf16	$123, %xmm4, %xmm5, %xmm6
> +	vminmaxpbf16	$123, %ymm4, %ymm5, %ymm6
> +	vminmaxpbf16	$123, 0x10000000(%esp, %esi, 8), %ymm5, %ymm6{%k7}
> +	vminmaxpbf16	$123, (%ecx){1to16}, %ymm5, %ymm6
> +	vminmaxpbf16	$123, 4064(%ecx), %ymm5, %ymm6
> +	vminmaxpbf16	$123, -256(%edx){1to16}, %ymm5, %ymm6{%k7}{z}
> +	vminmaxpbf16	$123, 0x10000000(%esp, %esi, 8), %xmm5, %xmm6{%k7}
> +	vminmaxpbf16	$123, (%ecx){1to8}, %xmm5, %xmm6
> +	vminmaxpbf16	$123, 2032(%ecx), %xmm5, %xmm6
> +	vminmaxpbf16	$123, -256(%edx){1to8}, %xmm5, %xmm6{%k7}{z}
> +	vminmaxpd	$123, %xmm4, %xmm5, %xmm6
> +	vminmaxpd	$123, %ymm4, %ymm5, %ymm6
> +	vminmaxpd	$123, {sae}, %ymm4, %ymm5, %ymm6
> +	vminmaxpd	$123, 0x10000000(%esp, %esi, 8), %ymm5, %ymm6{%k7}
> +	vminmaxpd	$123, (%ecx){1to4}, %ymm5, %ymm6
> +	vminmaxpd	$123, 4064(%ecx), %ymm5, %ymm6
> +	vminmaxpd	$123, -1024(%edx){1to4}, %ymm5, %ymm6{%k7}{z}
> +	vminmaxpd	$123, 0x10000000(%esp, %esi, 8), %xmm5, %xmm6{%k7}
> +	vminmaxpd	$123, (%ecx){1to2}, %xmm5, %xmm6
> +	vminmaxpd	$123, 2032(%ecx), %xmm5, %xmm6
> +	vminmaxpd	$123, -1024(%edx){1to2}, %xmm5, %xmm6{%k7}{z}
> +	vminmaxph	$123, %xmm4, %xmm5, %xmm6
> +	vminmaxph	$123, %ymm4, %ymm5, %ymm6
> +	vminmaxph	$123, {sae}, %ymm4, %ymm5, %ymm6
> +	vminmaxph	$123, 0x10000000(%esp, %esi, 8), %ymm5, %ymm6{%k7}
> +	vminmaxph	$123, (%ecx){1to16}, %ymm5, %ymm6
> +	vminmaxph	$123, 4064(%ecx), %ymm5, %ymm6
> +	vminmaxph	$123, -256(%edx){1to16}, %ymm5, %ymm6{%k7}{z}
> +	vminmaxph	$123, 0x10000000(%esp, %esi, 8), %xmm5, %xmm6{%k7}
> +	vminmaxph	$123, (%ecx){1to8}, %xmm5, %xmm6
> +	vminmaxph	$123, 2032(%ecx), %xmm5, %xmm6
> +	vminmaxph	$123, -256(%edx){1to8}, %xmm5, %xmm6{%k7}{z}
> +	vminmaxps	$123, %xmm4, %xmm5, %xmm6
> +	vminmaxps	$123, %ymm4, %ymm5, %ymm6
> +	vminmaxps	$123, {sae}, %ymm4, %ymm5, %ymm6
> +	vminmaxps	$123, 0x10000000(%esp, %esi, 8), %ymm5, %ymm6{%k7}
> +	vminmaxps	$123, (%ecx){1to8}, %ymm5, %ymm6
> +	vminmaxps	$123, 4064(%ecx), %ymm5, %ymm6
> +	vminmaxps	$123, -512(%edx){1to8}, %ymm5, %ymm6{%k7}{z}
> +	vminmaxps	$123, 0x10000000(%esp, %esi, 8), %xmm5, %xmm6{%k7}
> +	vminmaxps	$123, (%ecx){1to4}, %xmm5, %xmm6
> +	vminmaxps	$123, 2032(%ecx), %xmm5, %xmm6
> +	vminmaxps	$123, -512(%edx){1to4}, %xmm5, %xmm6{%k7}{z}
> +	vminmaxsd	$123, %xmm4, %xmm5, %xmm6
> +	vminmaxsd	$123, {sae}, %xmm4, %xmm5, %xmm6
> +	vminmaxsd	$123, 0x10000000(%esp, %esi, 8), %xmm5, %xmm6{%k7}
> +	vminmaxsd	$123, (%ecx), %xmm5, %xmm6
> +	vminmaxsd	$123, 1016(%ecx), %xmm5, %xmm6
> +	vminmaxsd	$123, -1024(%edx), %xmm5, %xmm6{%k7}{z}
> +	vminmaxsh	$123, %xmm4, %xmm5, %xmm6
> +	vminmaxsh	$123, {sae}, %xmm4, %xmm5, %xmm6
> +	vminmaxsh	$123, 0x10000000(%esp, %esi, 8), %xmm5, %xmm6{%k7}
> +	vminmaxsh	$123, (%ecx), %xmm5, %xmm6
> +	vminmaxsh	$123, 254(%ecx), %xmm5, %xmm6
> +	vminmaxsh	$123, -256(%edx), %xmm5, %xmm6{%k7}{z}
> +	vminmaxss	$123, %xmm4, %xmm5, %xmm6
> +	vminmaxss	$123, {sae}, %xmm4, %xmm5, %xmm6
> +	vminmaxss	$123, 0x10000000(%esp, %esi, 8), %xmm5, %xmm6{%k7}
> +	vminmaxss	$123, (%ecx), %xmm5, %xmm6
> +	vminmaxss	$123, 508(%ecx), %xmm5, %xmm6
> +	vminmaxss	$123, -512(%edx), %xmm5, %xmm6{%k7}{z}

Did you consider using .irp for the above as well, like you already do ...

> +	vmovd	%xmm5, %xmm6
> +	vmovd.s	%xmm5, %xmm6
> +	vmovw	%xmm5, %xmm6
> +	vmovw.s	%xmm5, %xmm6
> +
> +	.irp m, "", u
> +	v\m\()comxsd	%xmm5, %xmm6

... here? I realize the broadcast forms would need pulling out, yet that
would still be a fair reduction of redundancy.

> --- a/opcodes/i386-dis-evex-len.h
> +++ b/opcodes/i386-dis-evex-len.h
> @@ -1,4 +1,14 @@
>  static const struct dis386 evex_len_table[][3] = {
> +  /* EVEX_LEN_0F7E_P_1_W_1  */
> +  {
> +    { "vmovd",       { XMScalar, EXd }, 0 },
> +  },

Isn't this ..._W_0? (There's also an extra blank there.)

> --- a/opcodes/i386-opc.tbl
> +++ b/opcodes/i386-opc.tbl
> @@ -1930,10 +1930,10 @@ vcvtps2ph, 0x661d, F16C, Modrm|Vex=2|Space0F3A|VexW=1|NoSuf, { Imm8, RegYMM, Uns
>  
>  <fma:opc, 132:10, 213:20, 231:30>
>  
> -<sdh:cpu:cpudq:fma:ppfx:spfx:pfx:spc1:spc2:opc:vex:vexlig:vexw:elem, +
> -    s:AVX512F:AVX512DQ:FMA|AVX512F::f3:66:Space0F:Space0F38:0:Vex|EVexDYN:VexLIG|EVexLIG:VexW0:Dword, +
> -    d:AVX512F:AVX512DQ:FMA|AVX512F:66:f2:66:Space0F:Space0F38:1:Vex|EVexDYN:VexLIG|EVexLIG:VexW1:Qword, +
> -    h:AVX512_FP16:AVX512_FP16:AVX512_FP16::f3::Map5:Map6:0::EVexLIG:VexW0:Word>
> +<sdh:cpu:cpudq:fma:ppfx:spfx:pfx:spc1:spc2:opc:vex:vexlig:vexw:elem:sdisp8, +
> +    s:AVX512F:AVX512DQ:FMA|AVX512F::f3:66:Space0F:Space0F38:0:Vex|EVexDYN:VexLIG|EVexLIG:VexW0:Dword:Disp8MemShift=2, +
> +    d:AVX512F:AVX512DQ:FMA|AVX512F:66:f2:66:Space0F:Space0F38:1:Vex|EVexDYN:VexLIG|EVexLIG:VexW1:Qword:Disp8MemShift=3, +
> +    h:AVX512_FP16:AVX512_FP16:AVX512_FP16::f3::Map5:Map6:0::EVexLIG:VexW0:Word:Disp8MemShift=1>

I don't think this is needed; see vcomis<sdh> and vucomis<sdh>. They simply
use Disp8MemShift without a value, and the same ought to work here.

Jan


More information about the Binutils mailing list