i965/fs: Lower integer multiplication after optimizations.
author Matt Turner <mattst88@gmail.com>
Mon, 11 May 2015 16:29:56 +0000 (09:29 -0700)
committer Matt Turner <mattst88@gmail.com>
Mon, 18 May 2015 17:11:36 +0000 (10:11 -0700)
32-bit x 32-bit integer multiplication requires multiple instructions
until Broadwell. This patch just lets us treat the MUL instruction in
the FS backend like it operates on Broadwell, and after optimizations
we lower it into a sequence of instructions on older platforms.

Doing this will allow us to do some extra optimization on integer
multiplies.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
src/mesa/drivers/dri/i965/brw_fs.cpp
src/mesa/drivers/dri/i965/brw_fs.h
src/mesa/drivers/dri/i965/brw_fs_nir.cpp
src/mesa/drivers/dri/i965/brw_fs_visitor.cpp

index b63ca23e3d89c73144a89e3470941ad3b29c6345..cb13fcb1cc81a1dd920edd7d00bb77e733c3892b 100644 (file)
@@ -3523,6 +3523,71 @@ fs_visitor::lower_load_payload()
    return progress;
 }
 
+bool
+fs_visitor::lower_integer_multiplication()
+{
+   bool progress = false;
+
+   /* Lowering pass: replace every MUL whose destination type is D or UD
+    * (and whose destination is not the accumulator) with an instruction
+    * sequence that the target hardware can execute.  Returns true if at
+    * least one instruction was replaced, false otherwise.
+    *
+    * Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit operation
+    * directly, but Cherryview cannot.
+    */
+   if (devinfo->gen >= 8 && !devinfo->is_cherryview)
+      return false;
+
+   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
+      if (inst->opcode != BRW_OPCODE_MUL ||
+          inst->dst.is_accumulator() ||
+          (inst->dst.type != BRW_REGISTER_TYPE_D &&
+           inst->dst.type != BRW_REGISTER_TYPE_UD))
+         continue; /* only the destination type is inspected here */
+
+#define insert(instr) inst->insert_before(block, instr)
+
+      /* The MUL instruction isn't commutative. On Gen <= 6, only the low
+       * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
+       * src1 are used.
+       *
+       * If multiplying by an immediate value that fits in 16-bits, do a
+       * single MUL instruction with that value in the proper location.
+       *
+       * On Gen < 7 the immediate is first copied into a newly allocated
+       * GRF temporary so that it can be passed as src0 (presumably because
+       * an immediate cannot be used directly in that position -- TODO
+       * confirm).  On Gen >= 7 the operands are already in the required
+       * order, so an equivalent MUL is simply re-emitted.
+       *
+       * Only src[1] is tested for being an immediate; a small constant in
+       * src[0] takes the general path below (presumably earlier passes
+       * have already moved immediates into src[1] -- verify).
+       *
+       * Every replacement instruction is inserted before the original MUL,
+       * which is removed at the bottom of the loop body.
+       */
+      if (inst->src[1].file == IMM &&
+          inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) {
+         if (devinfo->gen < 7) {
+            fs_reg imm(GRF, alloc.allocate(dispatch_width / 8),
+                       inst->dst.type, dispatch_width);
+            insert(MOV(imm, inst->src[1]));
+            insert(MUL(inst->dst, imm, inst->src[0]));
+         } else {
+            insert(MUL(inst->dst, inst->src[0], inst->src[1]));
+         }
+      } else { /* General case: compute the product through the accumulator.
+                * The MUL uses the low 16 bits of one of the operands, the
+                * MACH accumulates in the contribution of the upper 16 bits
+                * of that operand, and the final MOV copies the accumulator
+                * into the real destination.
+                */
+         if (devinfo->gen >= 7)
+            no16("SIMD16 integer multiply unsupported\n");
+
+         const unsigned channels = dispatch_width;
+         const enum brw_reg_type type = inst->dst.type;
+         const fs_reg acc(retype(brw_acc_reg(channels), type));
+         const fs_reg null(retype(brw_null_vec(channels), type));
+
+         const fs_reg &src0 = inst->src[0];
+         const fs_reg &src1 = inst->src[1];
+
+         insert(MUL(acc, src0, src1));
+         insert(MACH(null, src0, src1)); /* result is discarded (null dst) */
+         insert(MOV(inst->dst, acc));
+      }
+#undef insert
+
+      inst->remove(block); /* NOTE(review): the replacements are built from
+                            * dst/src only; nothing here copies other state
+                            * of the original instruction (e.g. predication,
+                            * saturate or conditional modifier) -- confirm
+                            * none can be set on a MUL reaching this pass.
+                            */
+      progress = true;
+   }
+
+   if (progress)
+      invalidate_live_intervals(); /* the instruction stream was modified */
+
+   return progress;
+}
+
 void
 fs_visitor::dump_instructions()
 {
@@ -4001,6 +4066,7 @@ fs_visitor::optimize()
    }
 
    OPT(opt_combine_constants);
+   OPT(lower_integer_multiplication);
 
    lower_uniform_pull_constant_loads();
 }
index 991cff96325a7600bb9264ccc72e6eec2e3c6998..f2aa0ae957642bce27eaa9b42c17100b443add2b 100644 (file)
@@ -241,6 +241,7 @@ public:
    void no16(const char *msg, ...);
    void lower_uniform_pull_constant_loads();
    bool lower_load_payload();
+   bool lower_integer_multiplication();
    bool opt_combine_constants();
 
    void emit_dummy_fs();
index 9cfd0e792a2b917961fbb851ed43e1bee27781a4..5dd8363b91eda1f01f89317e544a72aed31cea4f 100644 (file)
@@ -780,41 +780,9 @@ fs_visitor::nir_emit_alu(nir_alu_instr *instr)
       inst->saturate = instr->dest.saturate;
       break;
 
-   case nir_op_imul: {
-      if (devinfo->gen >= 8) {
-         emit(MUL(result, op[0], op[1]));
-         break;
-      } else {
-         nir_const_value *value0 = nir_src_as_const_value(instr->src[0].src);
-         nir_const_value *value1 = nir_src_as_const_value(instr->src[1].src);
-
-         if (value0 && value0->u[0] < (1 << 16)) {
-            if (devinfo->gen < 7) {
-               emit(MUL(result, op[0], op[1]));
-            } else {
-               emit(MUL(result, op[1], op[0]));
-            }
-            break;
-         } else if (value1 && value1->u[0] < (1 << 16)) {
-            if (devinfo->gen < 7) {
-               emit(MUL(result, op[1], op[0]));
-            } else {
-               emit(MUL(result, op[0], op[1]));
-            }
-            break;
-         }
-      }
-
-      if (devinfo->gen >= 7)
-         no16("SIMD16 explicit accumulator operands unsupported\n");
-
-      struct brw_reg acc = retype(brw_acc_reg(dispatch_width), result.type);
-
-      emit(MUL(acc, op[0], op[1]));
-      emit(MACH(reg_null_d, op[0], op[1]));
-      emit(MOV(result, fs_reg(acc)));
+   case nir_op_imul:
+      emit(MUL(result, op[0], op[1]));
       break;
-   }
 
    case nir_op_imul_high:
    case nir_op_umul_high: {
index abaea5f4e134350dfa2d642051ea658ddd2cd9fd..ead77686640edbbf4364c00dc01c77395fd441a9 100644 (file)
@@ -873,36 +873,7 @@ fs_visitor::visit(ir_expression *ir)
       unreachable("not reached: should be handled by ir_sub_to_add_neg");
 
    case ir_binop_mul:
-      if (devinfo->gen < 8 && ir->type->is_integer()) {
-        /* For integer multiplication, the MUL uses the low 16 bits
-         * of one of the operands (src0 on gen6, src1 on gen7).  The
-         * MACH accumulates in the contribution of the upper 16 bits
-         * of that operand.
-          */
-         if (ir->operands[0]->is_uint16_constant()) {
-            if (devinfo->gen < 7)
-               emit(MUL(this->result, op[0], op[1]));
-            else
-               emit(MUL(this->result, op[1], op[0]));
-         } else if (ir->operands[1]->is_uint16_constant()) {
-            if (devinfo->gen < 7)
-               emit(MUL(this->result, op[1], op[0]));
-            else
-               emit(MUL(this->result, op[0], op[1]));
-         } else {
-            if (devinfo->gen >= 7)
-               no16("SIMD16 explicit accumulator operands unsupported\n");
-
-            struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
-                                        this->result.type);
-
-            emit(MUL(acc, op[0], op[1]));
-            emit(MACH(reg_null_d, op[0], op[1]));
-            emit(MOV(this->result, fs_reg(acc)));
-         }
-      } else {
-        emit(MUL(this->result, op[0], op[1]));
-      }
+      emit(MUL(this->result, op[0], op[1]));
       break;
    case ir_binop_imul_high: {
       if (devinfo->gen >= 7)