From 1e4e17fbd9296cc5064aabdb351a894d10190cb6 Mon Sep 17 00:00:00 2001 From: Matt Turner Date: Mon, 11 May 2015 09:29:56 -0700 Subject: [PATCH] i965/fs: Lower integer multiplication after optimizations. 32-bit x 32-bit integer multiplication requires multiple instructions until Broadwell. This patch just lets us treat the MUL instruction in the FS backend like it operates on Broadwell, and after optimizations we lower it into a sequence of instructions on older platforms. Doing this will allow us to some extra optimization on integer multiplies. Reviewed-by: Jason Ekstrand --- src/mesa/drivers/dri/i965/brw_fs.cpp | 66 ++++++++++++++++++++ src/mesa/drivers/dri/i965/brw_fs.h | 1 + src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 36 +---------- src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 31 +-------- 4 files changed, 70 insertions(+), 64 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index b63ca23e3d8..cb13fcb1cc8 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -3523,6 +3523,71 @@ fs_visitor::lower_load_payload() return progress; } +bool +fs_visitor::lower_integer_multiplication() +{ + bool progress = false; + + /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit operation + * directly, but Cherryview cannot. + */ + if (devinfo->gen >= 8 && !devinfo->is_cherryview) + return false; + + foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { + if (inst->opcode != BRW_OPCODE_MUL || + inst->dst.is_accumulator() || + (inst->dst.type != BRW_REGISTER_TYPE_D && + inst->dst.type != BRW_REGISTER_TYPE_UD)) + continue; + +#define insert(instr) inst->insert_before(block, instr) + + /* The MUL instruction isn't commutative. On Gen <= 6, only the low + * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of + * src1 are used. + * + * If multiplying by an immediate value that fits in 16-bits, do a + * single MUL instruction with that value in the proper location. + */ + if (inst->src[1].file == IMM && + inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) { + if (devinfo->gen < 7) { + fs_reg imm(GRF, alloc.allocate(dispatch_width / 8), + inst->dst.type, dispatch_width); + insert(MOV(imm, inst->src[1])); + insert(MUL(inst->dst, imm, inst->src[0])); + } else { + insert(MUL(inst->dst, inst->src[0], inst->src[1])); + } + } else { + if (devinfo->gen >= 7) + no16("SIMD16 integer multiply unsupported\n"); + + const unsigned channels = dispatch_width; + const enum brw_reg_type type = inst->dst.type; + const fs_reg acc(retype(brw_acc_reg(channels), type)); + const fs_reg null(retype(brw_null_vec(channels), type)); + + const fs_reg &src0 = inst->src[0]; + const fs_reg &src1 = inst->src[1]; + + insert(MUL(acc, src0, src1)); + insert(MACH(null, src0, src1)); + insert(MOV(inst->dst, acc)); + } +#undef insert + + inst->remove(block); + progress = true; + } + + if (progress) + invalidate_live_intervals(); + + return progress; +} + void fs_visitor::dump_instructions() { @@ -4001,6 +4066,7 @@ fs_visitor::optimize() } OPT(opt_combine_constants); + OPT(lower_integer_multiplication); lower_uniform_pull_constant_loads(); } diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h index 991cff96325..f2aa0ae9576 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.h +++ b/src/mesa/drivers/dri/i965/brw_fs.h @@ -241,6 +241,7 @@ public: void no16(const char *msg, ...); void lower_uniform_pull_constant_loads(); bool lower_load_payload(); + bool lower_integer_multiplication(); bool opt_combine_constants(); void emit_dummy_fs(); diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp index 9cfd0e792a2..5dd8363b91e 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp @@ -780,41 +780,9 @@ fs_visitor::nir_emit_alu(nir_alu_instr *instr) inst->saturate = instr->dest.saturate; break; - case nir_op_imul: { - if (devinfo->gen >= 8) { - emit(MUL(result, op[0], op[1])); - break; - } else { - nir_const_value *value0 = nir_src_as_const_value(instr->src[0].src); - nir_const_value *value1 = nir_src_as_const_value(instr->src[1].src); - - if (value0 && value0->u[0] < (1 << 16)) { - if (devinfo->gen < 7) { - emit(MUL(result, op[0], op[1])); - } else { - emit(MUL(result, op[1], op[0])); - } - break; - } else if (value1 && value1->u[0] < (1 << 16)) { - if (devinfo->gen < 7) { - emit(MUL(result, op[1], op[0])); - } else { - emit(MUL(result, op[0], op[1])); - } - break; - } - } - - if (devinfo->gen >= 7) - no16("SIMD16 explicit accumulator operands unsupported\n"); - - struct brw_reg acc = retype(brw_acc_reg(dispatch_width), result.type); - - emit(MUL(acc, op[0], op[1])); - emit(MACH(reg_null_d, op[0], op[1])); - emit(MOV(result, fs_reg(acc))); + case nir_op_imul: + emit(MUL(result, op[0], op[1])); break; - } case nir_op_imul_high: case nir_op_umul_high: { diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp index abaea5f4e13..ead77686640 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp @@ -873,36 +873,7 @@ fs_visitor::visit(ir_expression *ir) unreachable("not reached: should be handled by ir_sub_to_add_neg"); case ir_binop_mul: - if (devinfo->gen < 8 && ir->type->is_integer()) { - /* For integer multiplication, the MUL uses the low 16 bits - * of one of the operands (src0 on gen6, src1 on gen7). The - * MACH accumulates in the contribution of the upper 16 bits - * of that operand. - */ - if (ir->operands[0]->is_uint16_constant()) { - if (devinfo->gen < 7) - emit(MUL(this->result, op[0], op[1])); - else - emit(MUL(this->result, op[1], op[0])); - } else if (ir->operands[1]->is_uint16_constant()) { - if (devinfo->gen < 7) - emit(MUL(this->result, op[1], op[0])); - else - emit(MUL(this->result, op[0], op[1])); - } else { - if (devinfo->gen >= 7) - no16("SIMD16 explicit accumulator operands unsupported\n"); - - struct brw_reg acc = retype(brw_acc_reg(dispatch_width), - this->result.type); - - emit(MUL(acc, op[0], op[1])); - emit(MACH(reg_null_d, op[0], op[1])); - emit(MOV(this->result, fs_reg(acc))); - } - } else { - emit(MUL(this->result, op[0], op[1])); - } + emit(MUL(this->result, op[0], op[1])); break; case ir_binop_imul_high: { if (devinfo->gen >= 7) -- 2.30.2