From eb4031cb20aa710834be891f8638e04dbba81edc Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 4 Jul 2023 17:07:26 +0200 Subject: [PATCH] x86: optimize 128-bit VPBROADCASTQ to VPUNPCKLQDQ The alternative is 1 byte shorter when the source is %xmm0-7, as a 2-byte VEX prefix can then be used. --- gas/config/tc-i386.c | 27 +++++++++++++++++++++ gas/testsuite/gas/i386/optimize-2.d | 1 + gas/testsuite/gas/i386/optimize-2.s | 2 ++ gas/testsuite/gas/i386/optimize-2b.d | 1 + gas/testsuite/gas/i386/x86-64-optimize-3.d | 2 ++ gas/testsuite/gas/i386/x86-64-optimize-3.s | 3 +++ gas/testsuite/gas/i386/x86-64-optimize-3b.d | 2 ++ opcodes/i386-opc.tbl | 2 +- opcodes/i386-tbl.h | 4 +-- 9 files changed, 41 insertions(+), 3 deletions(-) diff --git a/gas/config/tc-i386.c b/gas/config/tc-i386.c index 8ebeaf1ab84..bc02f8e0abf 100644 --- a/gas/config/tc-i386.c +++ b/gas/config/tc-i386.c @@ -4620,6 +4620,33 @@ optimize_encoding (void) i.op[1].regs = i.op[0].regs; } } + else if (optimize_for_space + && i.tm.base_opcode == 0x59 + && i.tm.opcode_space == SPACE_0F38 + && i.operands == i.reg_operands + && i.tm.opcode_modifier.vex + && !(i.op[0].regs->reg_flags & RegRex) + && i.op[0].regs->reg_type.bitfield.xmmword + && i.vec_encoding != vex_encoding_vex3) + { + /* Optimize: -Os: + vpbroadcastq %xmmN, %xmmM -> vpunpcklqdq %xmmN, %xmmN, %xmmM (N < 8) + */ + i.tm.opcode_space = SPACE_0F; + i.tm.base_opcode = 0x6c; + i.tm.opcode_modifier.vexvvvv = 1; + + ++i.operands; + ++i.reg_operands; + ++i.tm.operands; + + i.op[2].regs = i.op[0].regs; + i.types[2] = i.types[0]; + i.flags[2] = i.flags[0]; + i.tm.operand_types[2] = i.tm.operand_types[0]; + + swap_2_operands (1, 2); + } } /* Return non-zero for load instruction. */ diff --git a/gas/testsuite/gas/i386/optimize-2.d b/gas/testsuite/gas/i386/optimize-2.d index 60a9069c6a7..41056fb6ef1 100644 --- a/gas/testsuite/gas/i386/optimize-2.d +++ b/gas/testsuite/gas/i386/optimize-2.d @@ -164,4 +164,5 @@ Disassembly of section .text: +[a-f0-9]+: 66 .* pcmpeqd %xmm2,%xmm2 +[a-f0-9]+: c5 .* vpcmpeqd %xmm2,%xmm2,%xmm0 +[a-f0-9]+: c5 .* vpcmpeqd %ymm2,%ymm2,%ymm0 + +[a-f0-9]+: c5 .* vpunpcklqdq %xmm2,%xmm2,%xmm0 #pass diff --git a/gas/testsuite/gas/i386/optimize-2.s b/gas/testsuite/gas/i386/optimize-2.s index 3cf41b1f59c..29399ae8473 100644 --- a/gas/testsuite/gas/i386/optimize-2.s +++ b/gas/testsuite/gas/i386/optimize-2.s @@ -184,3 +184,5 @@ _start: pcmpeqq %xmm2, %xmm2 vpcmpeqq %xmm2, %xmm2, %xmm0 vpcmpeqq %ymm2, %ymm2, %ymm0 + + vpbroadcastq %xmm2, %xmm0 diff --git a/gas/testsuite/gas/i386/optimize-2b.d b/gas/testsuite/gas/i386/optimize-2b.d index 0624cb482d5..d9e83198fb6 100644 --- a/gas/testsuite/gas/i386/optimize-2b.d +++ b/gas/testsuite/gas/i386/optimize-2b.d @@ -165,4 +165,5 @@ Disassembly of section .text: +[a-f0-9]+: 66 .* pcmpeqq %xmm2,%xmm2 +[a-f0-9]+: c4 .* vpcmpeqq %xmm2,%xmm2,%xmm0 +[a-f0-9]+: c4 .* vpcmpeqq %ymm2,%ymm2,%ymm0 + +[a-f0-9]+: c4 .* vpbroadcastq %xmm2,%xmm0 #pass diff --git a/gas/testsuite/gas/i386/x86-64-optimize-3.d b/gas/testsuite/gas/i386/x86-64-optimize-3.d index fc5ec5e43d0..23b9305d25c 100644 --- a/gas/testsuite/gas/i386/x86-64-optimize-3.d +++ b/gas/testsuite/gas/i386/x86-64-optimize-3.d @@ -205,4 +205,6 @@ Disassembly of section .text: +[a-f0-9]+: 66 .* pcmpeqd %xmm12,%xmm12 +[a-f0-9]+: c4 .* vpcmpeqq %xmm12,%xmm12,%xmm0 +[a-f0-9]+: c4 .* vpcmpeqq %ymm12,%ymm12,%ymm0 + +[a-f0-9]+: c5 .* vpunpcklqdq %xmm2,%xmm2,%xmm0 + +[a-f0-9]+: c4 .* vpbroadcastq %xmm12,%xmm0 #pass diff --git a/gas/testsuite/gas/i386/x86-64-optimize-3.s b/gas/testsuite/gas/i386/x86-64-optimize-3.s index d2bcbce3bf4..bab99cf1eae 100644 --- a/gas/testsuite/gas/i386/x86-64-optimize-3.s +++ b/gas/testsuite/gas/i386/x86-64-optimize-3.s @@ -229,3 +229,6 @@ _start: pcmpeqq %xmm12, %xmm12 vpcmpeqq %xmm12, %xmm12, %xmm0 vpcmpeqq %ymm12, %ymm12, %ymm0 + + vpbroadcastq %xmm2, %xmm0 + vpbroadcastq %xmm12, %xmm0 diff --git a/gas/testsuite/gas/i386/x86-64-optimize-3b.d b/gas/testsuite/gas/i386/x86-64-optimize-3b.d index abd48dce3a8..929c6f70e8b 100644 --- a/gas/testsuite/gas/i386/x86-64-optimize-3b.d +++ b/gas/testsuite/gas/i386/x86-64-optimize-3b.d @@ -206,4 +206,6 @@ Disassembly of section .text: +[a-f0-9]+: 66 .* pcmpeqq %xmm12,%xmm12 +[a-f0-9]+: c4 .* vpcmpeqq %xmm12,%xmm12,%xmm0 +[a-f0-9]+: c4 .* vpcmpeqq %ymm12,%ymm12,%ymm0 + +[a-f0-9]+: c4 .* vpbroadcastq %xmm2,%xmm0 + +[a-f0-9]+: c4 .* vpbroadcastq %xmm12,%xmm0 #pass diff --git a/opcodes/i386-opc.tbl b/opcodes/i386-opc.tbl index f9abffb4b9f..b6263f88605 100644 --- a/opcodes/i386-opc.tbl +++ b/opcodes/i386-opc.tbl @@ -1734,7 +1734,7 @@ vbroadcastsd, 0x6619, AVX2, Modrm|Vex=2|Space0F38|VexW=1|NoSuf, { RegXMM, RegYMM vbroadcastss, 0x6618, AVX2, Modrm|Vex|Space0F38|VexW=1|NoSuf, { RegXMM, RegXMM|RegYMM } vpblendd, 0x6602, AVX2, Modrm|Vex|Space0F3A|VexVVVV|VexW0|CheckOperandSize|NoSuf, { Imm8|Imm8S, Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM, RegXMM|RegYMM } vpbroadcast, 0x6678 | , AVX2, Modrm|Vex|Space0F38|VexW0|NoSuf, { |Unspecified|BaseIndex|RegXMM, RegXMM|RegYMM } -vpbroadcast, 0x6658 | , AVX2, Modrm|Vex|Space0F38|VexW0|NoSuf, { |Unspecified|BaseIndex|RegXMM, RegXMM|RegYMM } +vpbroadcast, 0x6658 | , AVX2, Modrm|Vex|Space0F38|VexW0|NoSuf|Optimize, { |Unspecified|BaseIndex|RegXMM, RegXMM|RegYMM } vperm2i128, 0x6646, AVX2, Modrm|Vex=2|Space0F3A|VexVVVV|VexW0|NoSuf, { Imm8|Imm8S, Unspecified|BaseIndex|RegYMM, RegYMM, RegYMM } vpermd, 0x6636, AVX2, Modrm|Vex256|Space0F38|VexVVVV|VexW0|NoSuf, { Unspecified|BaseIndex|RegYMM, RegYMM, RegYMM } vpermpd, 0x6601, AVX2, Modrm|Vex=2|Space0F3A|VexW1|NoSuf, { Imm8|Imm8S, Unspecified|BaseIndex|RegYMM, RegYMM } diff --git a/opcodes/i386-tbl.h b/opcodes/i386-tbl.h index 4779cc958db..43bcd688852 100644 --- a/opcodes/i386-tbl.h +++ b/opcodes/i386-tbl.h @@ -34844,7 +34844,7 @@ static const insn_template i386_optab[] = 1, 1, 1, 0, 0, 0 } } } }, { MN_vpbroadcastd, 0x58 | 0, 2, SPACE_0F38, None, { 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, - 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0 }, { { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -34886,7 +34886,7 @@ static const insn_template i386_optab[] = 1, 1, 1, 0, 0, 0 } } } }, { MN_vpbroadcastq, 0x58 | 1, 2, SPACE_0F38, None, { 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, - 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0 }, { { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -- 2.30.2