[PATCH 5/8] Support APX NDD optimized encoding.

Cui, Lili lili.cui@intel.com
Tue Sep 19 15:25:24 GMT 2023


From: "Hu, Lin1" <lin1.hu@intel.com>

This patch aims to optimize:

add %r16, %r15, %r15 -> add %r16, %r15

gas/ChangeLog:

	* config/tc-i386.c (optimize_NDD_to_nonNDD): New function.
	(match_template): If we can optimzie APX NDD insns, so rematch
	template.
	* testsuite/gas/i386/x86-64.exp: Add test.
	* testsuite/gas/i386/x86-64-apx-ndd-optimize.d: New test.
	* testsuite/gas/i386/x86-64-apx-ndd-optimize.s: Ditto.
---
 gas/config/tc-i386.c                          |  49 +++++++
 .../gas/i386/x86-64-apx-ndd-optimize.d        | 120 ++++++++++++++++++
 .../gas/i386/x86-64-apx-ndd-optimize.s        | 115 +++++++++++++++++
 gas/testsuite/gas/i386/x86-64.exp             |   1 +
 4 files changed, 285 insertions(+)
 create mode 100644 gas/testsuite/gas/i386/x86-64-apx-ndd-optimize.d
 create mode 100644 gas/testsuite/gas/i386/x86-64-apx-ndd-optimize.s

diff --git a/gas/config/tc-i386.c b/gas/config/tc-i386.c
index 381e389bb04..fba97ae37d8 100644
--- a/gas/config/tc-i386.c
+++ b/gas/config/tc-i386.c
@@ -7091,6 +7091,46 @@ check_EgprOperands (const insn_template *t)
   return 0;
 }
 
+/* Optimize APX NDD insns to non-NDD insns.  */
+
+static int
+optimize_NDD_to_nonNDD (const insn_template *t)
+{
+  if (t->opcode_modifier.vexvvvv
+      && t->opcode_space == SPACE_EVEXMAP4
+      && i.reg_operands >= 2
+      && (i.types[i.operands - 1].bitfield.dword
+	  || i.types[i.operands - 1].bitfield.qword))
+    {
+      int tmp_flag = -1;
+      int dest = i.operands - 1;
+      int src1 = (i.operands > 2) ? i.operands - 2 : 0;
+      int src2 = (i.operands > 3) ? i.operands - 3 : 0;
+
+      if (i.op[src1].regs == i.op[dest].regs)
+	tmp_flag = src2;
+      /* adcx and adox don't have D bit.  */
+      else if (i.op[src2].regs == i.op[dest].regs
+	       && (t->opcode_modifier.d
+		   || t->mnem_off == MN_adcx
+		   || t->mnem_off == MN_adox)
+	       && (t->mnem_off != MN_sub)
+	       && (t->mnem_off != MN_sbb))
+	tmp_flag = src1;
+      if (tmp_flag != -1)
+	{
+	  --i.operands;
+	  --i.reg_operands;
+	  --i.tm.operands;
+
+	  if (tmp_flag != src2)
+	      swap_2_operands (tmp_flag, src2);
+	  return 1;
+	}
+    }
+  return 0;
+}
+
 /* Helper function for the progress() macro in match_template().  */
 static INLINE enum i386_error progress (enum i386_error new,
 					enum i386_error last,
@@ -7562,6 +7602,15 @@ match_template (char mnem_suffix)
 	     slip through to break.  */
 	}
 
+      /* If we can optimize a NDD insn to non-NDD insn, like
+	 add %r16, %r8, %r8 -> add %r16, %r8, then rematch template.  */
+      if (optimize_NDD_to_nonNDD (t))
+	{
+	  t = current_templates->start;
+	  --t;
+	  continue;
+	}
+
       /* Check if VEX/EVEX encoding requirements can be satisfied.  */
       if (VEX_check_encoding (t))
 	{
diff --git a/gas/testsuite/gas/i386/x86-64-apx-ndd-optimize.d b/gas/testsuite/gas/i386/x86-64-apx-ndd-optimize.d
new file mode 100644
index 00000000000..173b368da71
--- /dev/null
+++ b/gas/testsuite/gas/i386/x86-64-apx-ndd-optimize.d
@@ -0,0 +1,120 @@
+#objdump: -drw
+#name: x86-64 APX NDD optimized encoding
+#source: x86-64-apx-ndd-optimize.s
+
+.*: +file format .*
+
+
+Disassembly of section .text:
+
+0+ <_start>:
+\s*[a-f0-9]+:\s*62 f4 7d 18 ff c0\s+inc\s+%ax,%ax
+\s*[a-f0-9]+:\s*ff c0\s+inc\s+%eax
+\s*[a-f0-9]+:\s*48 ff c0\s+inc\s+%rax
+\s*[a-f0-9]+:\s*d5 18 ff c0\s+inc\s+%r16
+\s*[a-f0-9]+:\s*62 44 3c 18 00 f8\s+add\s+%r31b,%r8b,%r8b
+\s*[a-f0-9]+:\s*62 44 04 10 00 f8\s+add\s+%r31b,%r8b,%r31b
+\s*[a-f0-9]+:\s*62 44 3c 18 00 f8\s+add\s+%r31b,%r8b,%r8b
+\s*[a-f0-9]+:\s*d5 4d 01 f8\s+add\s+%r31,%r8
+\s*[a-f0-9]+:\s*d5 4d 01 f8\s+add\s+%r31,%r8
+\s*[a-f0-9]+:\s*d5 45 01 f8\s+add\s+%r31d,%r8d
+\s*[a-f0-9]+:\s*d5 45 01 f8\s+add\s+%r31d,%r8d
+\s*[a-f0-9]+:\s*62 44 3d 18 01 f8\s+add\s+%r31w,%r8w,%r8w
+\s*[a-f0-9]+:\s*62 44 3d 18 01 f8\s+add\s+%r31w,%r8w,%r8w
+\s*[a-f0-9]+:\s*d5 4d 01 f8\s+add\s+%r31,%r8
+\s*[a-f0-9]+:\s*d5 1d 03 c7\s+add\s+%r31,%r8
+\s*[a-f0-9]+:\s*d5 4d 03 38\s+add\s+\(%r8\),%r31
+\s*[a-f0-9]+:\s*d5 1d 03 07\s+add\s+\(%r31\),%r8
+\s*[a-f0-9]+:\s*49 81 c7 33 44 34 12\s+add\s+\$0x12344433,%r15
+\s*[a-f0-9]+:\s*49 81 c0 11 22 33 f4\s+add\s+\$0xfffffffff4332211,%r8
+\s*[a-f0-9]+:\s*d5 18 ff c9\s+dec\s+%r17
+\s*[a-f0-9]+:\s*d5 18 f7 d1\s+not\s+%r17
+\s*[a-f0-9]+:\s*d5 18 f7 d9\s+neg\s+%r17
+\s*[a-f0-9]+:\s*d5 1c 29 f9\s+sub\s+%r15,%r17
+\s*[a-f0-9]+:\s*62 54 04 18 29 38\s+sub\s+%r15d,\(%r8\),%r15d
+\s*[a-f0-9]+:\s*d5 49 2b 04 07\s+sub\s+\(%r15,%rax,1\),%r16
+\s*[a-f0-9]+:\s*d5 19 81 ee 34 12 00 00\s+sub\s+\$0x1234,%r30
+\s*[a-f0-9]+:\s*d5 1c 19 f9\s+sbb\s+%r15,%r17
+\s*[a-f0-9]+:\s*62 54 84 18 19 38\s+sbb\s+%r15,\(%r8\),%r15
+\s*[a-f0-9]+:\s*d5 49 1b 04 07\s+sbb\s+\(%r15,%rax,1\),%r16
+\s*[a-f0-9]+:\s*d5 19 81 de 34 12 00 00\s+sbb\s+\$0x1234,%r30
+\s*[a-f0-9]+:\s*d5 1c 11 f9\s+adc\s+%r15,%r17
+\s*[a-f0-9]+:\s*45 13 38\s+adc\s+\(%r8\),%r15d
+\s*[a-f0-9]+:\s*d5 49 13 04 07\s+adc\s+\(%r15,%rax,1\),%r16
+\s*[a-f0-9]+:\s*d5 19 81 d6 34 12 00 00\s+adc\s+\$0x1234,%r30
+\s*[a-f0-9]+:\s*d5 1c 09 f9\s+or\s+ %r15,%r17
+\s*[a-f0-9]+:\s*45 0b 38\s+or\s+ \(%r8\),%r15d
+\s*[a-f0-9]+:\s*d5 49 0b 04 07\s+or\s+\(%r15,%rax,1\),%r16
+\s*[a-f0-9]+:\s*d5 19 81 ce 34 12 00 00\s+or\s+\$0x1234,%r30
+\s*[a-f0-9]+:\s*d5 1c 31 f9\s+xor\s+%r15,%r17
+\s*[a-f0-9]+:\s*45 33 38\s+xor\s+\(%r8\),%r15d
+\s*[a-f0-9]+:\s*d5 49 33 04 07\s+xor\s+\(%r15,%rax,1\),%r16
+\s*[a-f0-9]+:\s*d5 19 81 f6 34 12 00 00\s+xor\s+\$0x1234,%r30
+\s*[a-f0-9]+:\s*d5 1c 21 f9\s+and\s+%r15,%r17
+\s*[a-f0-9]+:\s*45 23 38\s+and\s+\(%r8\),%r15d
+\s*[a-f0-9]+:\s*d5 49 23 04 07\s+and\s+\(%r15,%rax,1\),%r16
+\s*[a-f0-9]+:\s*d5 19 81 e6 34 12 00 00\s+and\s+\$0x1234,%r30
+\s*[a-f0-9]+:\s*d5 19 81 e6 34 12 00 00\s+and\s+\$0x1234,%r30
+\s*[a-f0-9]+:\s*d5 19 d1 cf\s+ror\s+%r31
+\s*[a-f0-9]+:\s*62 dc 04 10 d0 cf\s+ror\s+%r31b,%r31b
+\s*[a-f0-9]+:\s*49 c1 cc 02\s+ror\s+\$0x2,%r12
+\s*[a-f0-9]+:\s*d5 19 d1 c7\s+rol\s+%r31
+\s*[a-f0-9]+:\s*62 dc 04 10 d0 c7\s+rol\s+%r31b,%r31b
+\s*[a-f0-9]+:\s*49 c1 c4 02\s+rol\s+\$0x2,%r12
+\s*[a-f0-9]+:\s*d5 19 d1 df\s+rcr\s+%r31
+\s*[a-f0-9]+:\s*62 dc 04 10 d0 df\s+rcr\s+%r31b,%r31b
+\s*[a-f0-9]+:\s*62 d4 1c 18 c0 dc 02\s+rcr\s+\$0x2,%r12b,%r12b
+\s*[a-f0-9]+:\s*49 c1 dc 02\s+rcr\s+\$0x2,%r12
+\s*[a-f0-9]+:\s*d5 19 d1 d7\s+rcl\s+%r31
+\s*[a-f0-9]+:\s*62 dc 04 10 d0 d7\s+rcl\s+%r31b,%r31b
+\s*[a-f0-9]+:\s*62 d4 1c 18 c0 d4 02\s+rcl\s+\$0x2,%r12b,%r12b
+\s*[a-f0-9]+:\s*49 c1 d4 02\s+rcl\s+\$0x2,%r12
+\s*[a-f0-9]+:\s*d5 19 d1 e7\s+shl\s+%r31
+\s*[a-f0-9]+:\s*62 dc 04 10 d0 e7 \s+shl\s+%r31b,%r31b
+\s*[a-f0-9]+:\s*62 d4 1c 18 c0 e4 02\s+shl\s+\$0x2,%r12b,%r12b
+\s*[a-f0-9]+:\s*49 c1 e4 02\s+shl\s+\$0x2,%r12
+\s*[a-f0-9]+:\s*d5 19 d1 ff\s+sar\s+%r31
+\s*[a-f0-9]+:\s*62 dc 04 10 d0 ff\s+sar\s+%r31b,%r31b
+\s*[a-f0-9]+:\s*62 d4 1c 18 c0 fc 02\s+sar\s+\$0x2,%r12b,%r12b
+\s*[a-f0-9]+:\s*49 c1 fc 02\s+sar\s+\$0x2,%r12
+\s*[a-f0-9]+:\s*d5 19 d1 e7\s+shl\s+%r31
+\s*[a-f0-9]+:\s*62 dc 04 10 d0 e7\s+shl\s+%r31b,%r31b
+\s*[a-f0-9]+:\s*62 d4 1c 18 c0 e4 02\s+shl\s+\$0x2,%r12b,%r12b
+\s*[a-f0-9]+:\s*49 c1 e4 02\s+shl\s+\$0x2,%r12
+\s*[a-f0-9]+:\s*d5 19 d1 ef\s+shr\s+%r31
+\s*[a-f0-9]+:\s*62 dc 04 10 d0 ef\s+shr\s+%r31b,%r31b
+\s*[a-f0-9]+:\s*62 d4 1c 18 c0 ec 02\s+shr\s+\$0x2,%r12b,%r12b
+\s*[a-f0-9]+:\s*49 c1 ec 02\s+shr\s+\$0x2,%r12
+\s*[a-f0-9]+:\s*62 74 9c 18 24 20 01\s+shld\s+\$0x1,%r12,\(%rax\),%r12
+\s*[a-f0-9]+:\s*62 54 1d 18 24 c4 02\s+shld\s+\$0x2,%r8w,%r12w,%r12w
+\s*[a-f0-9]+:\s*62 74 35 18 a5 08\s+shld\s+ %cl,%r9w,\(%rax\),%r9w
+\s*[a-f0-9]+:\s*d5 9c a5 e0\s+shld\s+%cl,%r12,%r16
+\s*[a-f0-9]+:\s*62 7c 15 18 a5 2c 83\s+shld\s+%cl,%r13w,\(%r19,%rax,4\),%r13w
+\s*[a-f0-9]+:\s*62 74 9c 18 2c 20 01\s+shrd\s+\$0x1,%r12,\(%rax\),%r12
+\s*[a-f0-9]+:\s*4d 0f ac ec 01\s+shrd\s+\$0x1,%r13,%r12
+\s*[a-f0-9]+:\s*62 54 1d 18 2c c4 02\s+shrd\s+\$0x2,%r8w,%r12w,%r12w
+\s*[a-f0-9]+:\s*62 74 35 18 ad 08\s+shrd\s+%cl,%r9w,\(%rax\),%r9w
+\s*[a-f0-9]+:\s*d5 9c ad e0\s+shrd\s+%cl,%r12,%r16
+\s*[a-f0-9]+:\s*62 7c 15 18 ad 2c 83\s+shrd\s+%cl,%r13w,\(%r19,%rax,4\),%r13w
+\s*[a-f0-9]+:\s*66 45 0f 38 f6 c7\s+adcx\s+%r15d,%r8d
+\s*[a-f0-9]+:\s*62 14 79 08 66 04 3f\s+adcx\s+\(%r15,%r31,1\),%r8d
+\s*[a-f0-9]+:\s*f3 45 0f 38 f6 c7\s+adox\s+%r15d,%r8d
+\s*[a-f0-9]+:\s*62 14 7a 08 66 04 3f\s+adox\s+\(%r15,%r31,1\),%r8d
+\s*[a-f0-9]+:\s*67 0f 40 90 90 90 90 90\s+cmovo\s+-0x6f6f6f70\(%eax\),%edx
+\s*[a-f0-9]+:\s*67 0f 41 90 90 90 90 90\s+cmovno\s+-0x6f6f6f70\(%eax\),%edx
+\s*[a-f0-9]+:\s*67 0f 42 90 90 90 90 90\s+cmovb\s+-0x6f6f6f70\(%eax\),%edx
+\s*[a-f0-9]+:\s*67 0f 43 90 90 90 90 90\s+cmovae\s+-0x6f6f6f70\(%eax\),%edx
+\s*[a-f0-9]+:\s*67 0f 44 90 90 90 90 90\s+cmove\s+-0x6f6f6f70\(%eax\),%edx
+\s*[a-f0-9]+:\s*67 0f 45 90 90 90 90 90\s+cmovne\s+-0x6f6f6f70\(%eax\),%edx
+\s*[a-f0-9]+:\s*67 0f 46 90 90 90 90 90\s+cmovbe\s+-0x6f6f6f70\(%eax\),%edx
+\s*[a-f0-9]+:\s*67 0f 47 90 90 90 90 90\s+cmova\s+-0x6f6f6f70\(%eax\),%edx
+\s*[a-f0-9]+:\s*67 0f 48 90 90 90 90 90\s+cmovs\s+-0x6f6f6f70\(%eax\),%edx
+\s*[a-f0-9]+:\s*67 0f 49 90 90 90 90 90\s+cmovns\s+-0x6f6f6f70\(%eax\),%edx
+\s*[a-f0-9]+:\s*67 0f 4a 90 90 90 90 90\s+cmovp\s+-0x6f6f6f70\(%eax\),%edx
+\s*[a-f0-9]+:\s*67 0f 4b 90 90 90 90 90\s+cmovnp\s+-0x6f6f6f70\(%eax\),%edx
+\s*[a-f0-9]+:\s*67 0f 4c 90 90 90 90 90\s+cmovl\s+-0x6f6f6f70\(%eax\),%edx
+\s*[a-f0-9]+:\s*67 0f 4d 90 90 90 90 90\s+cmovge\s+-0x6f6f6f70\(%eax\),%edx
+\s*[a-f0-9]+:\s*67 0f 4e 90 90 90 90 90\s+cmovle\s+-0x6f6f6f70\(%eax\),%edx
+\s*[a-f0-9]+:\s*67 0f 4f 90 90 90 90 90\s+cmovg\s+-0x6f6f6f70\(%eax\),%edx
+\s*[a-f0-9]+:\s*67 0f af 90 09 09 09 00\s+imul\s+0x90909\(%eax\),%edx
+\s*[a-f0-9]+:\s*d5 aa af 94 f8 09 09 00 00\s+imul\s+0x909\(%rax,%r31,8\),%rdx
diff --git a/gas/testsuite/gas/i386/x86-64-apx-ndd-optimize.s b/gas/testsuite/gas/i386/x86-64-apx-ndd-optimize.s
new file mode 100644
index 00000000000..139d875d5a7
--- /dev/null
+++ b/gas/testsuite/gas/i386/x86-64-apx-ndd-optimize.s
@@ -0,0 +1,115 @@
+# Check 64bit APX NDD instructions with optimized encoding
+
+	.allow_index_reg
+	.text
+_start:
+inc    %ax,%ax
+inc    %eax,%eax
+inc    %rax,%rax
+inc    %r16,%r16
+add    %r31b,%r8b,%r8b
+add    %r31b,%r8b,%r31b
+addb    %r31b,%r8b,%r8b
+add    %r31,%r8,%r8
+addq    %r31,%r8,%r8
+add    %r31d,%r8d,%r8d
+addl    %r31d,%r8d,%r8d
+add    %r31w,%r8w,%r8w
+addw    %r31w,%r8w,%r8w
+{store} add    %r31,%r8,%r8
+{load}  add    %r31,%r8,%r8
+add    %r31,(%r8),%r31
+add    (%r31),%r8,%r8
+add    $0x12344433,%r15,%r15
+add    $0xfffffffff4332211,%r8,%r8
+dec    %r17,%r17
+not    %r17,%r17
+neg    %r17,%r17
+sub    %r15,%r17,%r17
+sub    %r15d,(%r8),%r15d
+sub    (%r15,%rax,1),%r16,%r16
+sub    $0x1234,%r30,%r30
+sbb    %r15,%r17,%r17
+sbb    %r15,(%r8),%r15
+sbb    (%r15,%rax,1),%r16,%r16
+sbb    $0x1234,%r30,%r30
+adc    %r15,%r17,%r17
+adc    %r15d,(%r8),%r15d
+adc    (%r15,%rax,1),%r16,%r16
+adc    $0x1234,%r30,%r30
+or    %r15,%r17,%r17
+or    %r15d,(%r8),%r15d
+or    (%r15,%rax,1),%r16,%r16
+or    $0x1234,%r30,%r30
+xor    %r15,%r17,%r17
+xor    %r15d,(%r8),%r15d
+xor    (%r15,%rax,1),%r16,%r16
+xor    $0x1234,%r30,%r30
+and    %r15,%r17,%r17
+and    %r15d,(%r8),%r15d
+and    (%r15,%rax,1),%r16,%r16
+and    $0x1234,%r30,%r30
+and    $0x1234,%r30
+ror    %r31,%r31
+rorb   %r31b,%r31b
+ror    $0x2,%r12,%r12
+rol    %r31,%r31
+rolb   %r31b,%r31b
+rol    $0x2,%r12,%r12
+rcr    %r31,%r31
+rcrb   %r31b,%r31b
+rcr    $0x2,%r12b,%r12b
+rcr    $0x2,%r12,%r12
+rcl    %r31,%r31
+rclb   %r31b,%r31b
+rcl    $0x2,%r12b,%r12b
+rcl    $0x2,%r12,%r12
+shl    %r31,%r31
+shlb   %r31b,%r31b
+shl    $0x2,%r12b,%r12b
+shl    $0x2,%r12,%r12
+sar    %r31,%r31
+sarb   %r31b,%r31b
+sar    $0x2,%r12b,%r12b
+sar    $0x2,%r12,%r12
+shl    %r31,%r31
+shlb   %r31b,%r31b
+shl    $0x2,%r12b,%r12b
+shl    $0x2,%r12,%r12
+shr    %r31,%r31
+shrb   %r31b,%r31b
+shr    $0x2,%r12b,%r12b
+shr    $0x2,%r12,%r12
+shld   $0x1,%r12,(%rax),%r12
+shld   $0x2,%r8w,%r12w,%r12w
+shld   %cl,%r9w,(%rax),%r9w
+shld   %cl,%r12,%r16,%r16
+shld   %cl,%r13w,(%r19,%rax,4),%r13w
+shrd   $0x1,%r12,(%rax),%r12
+shrd   $0x1,%r13,%r12,%r12
+shrd   $0x2,%r8w,%r12w,%r12w
+shrd   %cl,%r9w,(%rax),%r9w
+shrd   %cl,%r12,%r16,%r16
+shrd   %cl,%r13w,(%r19,%rax,4),%r13w
+adcx   %r15d,%r8d,%r8d
+adcx   (%r15,%r31,1),%r8d,%r8d
+adox   %r15d,%r8d,%r8d
+adox   (%r15,%r31,1),%r8d,%r8d
+cmovo  0x90909090(%eax),%edx,%edx
+cmovno 0x90909090(%eax),%edx,%edx
+cmovb  0x90909090(%eax),%edx,%edx
+cmovae 0x90909090(%eax),%edx,%edx
+cmove  0x90909090(%eax),%edx,%edx
+cmovne 0x90909090(%eax),%edx,%edx
+cmovbe 0x90909090(%eax),%edx,%edx
+cmova  0x90909090(%eax),%edx,%edx
+cmovs  0x90909090(%eax),%edx,%edx
+cmovns 0x90909090(%eax),%edx,%edx
+cmovp  0x90909090(%eax),%edx,%edx
+cmovnp 0x90909090(%eax),%edx,%edx
+cmovl  0x90909090(%eax),%edx,%edx
+cmovge 0x90909090(%eax),%edx,%edx
+cmovle 0x90909090(%eax),%edx,%edx
+cmovg  0x90909090(%eax),%edx,%edx
+imul   0x90909(%eax),%edx,%edx
+imul   0x909(%rax,%r31,8),%rdx,%rdx
diff --git a/gas/testsuite/gas/i386/x86-64.exp b/gas/testsuite/gas/i386/x86-64.exp
index ca1583c6f88..c48430ca7cb 100644
--- a/gas/testsuite/gas/i386/x86-64.exp
+++ b/gas/testsuite/gas/i386/x86-64.exp
@@ -549,6 +549,7 @@ run_dump_test "x86-64-optimize-6"
 run_list_test "x86-64-optimize-7a" "-I${srcdir}/$subdir -march=+noavx -al"
 run_dump_test "x86-64-optimize-7b"
 run_list_test "x86-64-optimize-8" "-I${srcdir}/$subdir -march=+noavx2 -al"
+run_dump_test "x86-64-apx-ndd-optimize"
 run_dump_test "x86-64-align-branch-1a"
 run_dump_test "x86-64-align-branch-1b"
 run_dump_test "x86-64-align-branch-1c"
-- 
2.25.1



More information about the Binutils mailing list