i965: Add writes_accumulator flag
authorJuha-Pekka Heikkila <juhapekka.heikkila@gmail.com>
Fri, 4 Apr 2014 13:51:59 +0000 (16:51 +0300)
committerMatt Turner <mattst88@gmail.com>
Thu, 17 Apr 2014 05:46:45 +0000 (22:46 -0700)
Our hardware has an "accumulator" register, which can be used to store
intermediate results across multiple instructions.  Many instructions
can implicitly write a value to the accumulator in addition to their
normal destination register.  This is enabled by the "AccWrEn" flag.

This patch introduces a new flag, inst->writes_accumulator, which
allows us to express the AccWrEn notion in the IR.  It also creates a
n ALU2_ACC macro to easily define emitters for instructions that
implicitly write the accumulator.

Previously, we only supported implicit accumulator writes from the
ADDC, SUBB, and MACH instructions.  We always enabled them on those
instructions, and left them disabled for other instructions.

To take advantage of the MAC (multiply-accumulate) instruction, we
need to be able to set AccWrEn on other types of instructions.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Matt Turner <mattst88@gmail.com>
Signed-off-by: Juha-Pekka Heikkila <juhapekka.heikkila@gmail.com>
src/mesa/drivers/dri/i965/brw_fs.cpp
src/mesa/drivers/dri/i965/brw_fs_dead_code_eliminate.cpp
src/mesa/drivers/dri/i965/brw_fs_generator.cpp
src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
src/mesa/drivers/dri/i965/brw_shader.h
src/mesa/drivers/dri/i965/brw_vec4.cpp
src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp

index bff38f0d6e7848140d88a1318d9aa2c1634f82fa..075857f742561cc574bbc3129045d731884b7ab9 100644 (file)
@@ -64,6 +64,8 @@ fs_inst::init()
 
    /* This will be the case for almost all instructions. */
    this->regs_written = 1;
+
+   this->writes_accumulator = false;
 }
 
 fs_inst::fs_inst()
@@ -151,6 +153,15 @@ fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
    }
 
+#define ALU2_ACC(op)                                                    \
+   fs_inst *                                                            \
+   fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1)                 \
+   {                                                                    \
+      fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
+      inst->writes_accumulator = true;                                  \
+      return inst;                                                      \
+   }
+
 #define ALU3(op)                                                        \
    fs_inst *                                                            \
    fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2)    \
@@ -166,7 +177,7 @@ ALU1(RNDE)
 ALU1(RNDZ)
 ALU2(ADD)
 ALU2(MUL)
-ALU2(MACH)
+ALU2_ACC(MACH)
 ALU2(AND)
 ALU2(OR)
 ALU2(XOR)
@@ -182,8 +193,8 @@ ALU1(FBH)
 ALU1(FBL)
 ALU1(CBIT)
 ALU3(MAD)
-ALU2(ADDC)
-ALU2(SUBB)
+ALU2_ACC(ADDC)
+ALU2_ACC(SUBB)
 ALU2(SEL)
 
 /** Gen4 predicated IF. */
index 6672f840fc529bfe05f7e9357d05d065e104914f..dfeceb006194ae6e78ccf51ac08eda12a2227956 100644 (file)
@@ -72,13 +72,9 @@ fs_visitor::dead_code_eliminate()
             if (!result_live) {
                progress = true;
 
-               switch (inst->opcode) {
-               case BRW_OPCODE_ADDC:
-               case BRW_OPCODE_SUBB:
-               case BRW_OPCODE_MACH:
+               if (inst->writes_accumulator) {
                   inst->dst = fs_reg(retype(brw_null_reg(), inst->dst.type));
-                  break;
-               default:
+               } else {
                   inst->opcode = BRW_OPCODE_NOP;
                   continue;
                }
index e590bdf4c58fb6cb174742401553f53f97febe6a..1cf35b40ad22d0270ad6e744a6d7a99f631ee849 100644 (file)
@@ -1411,6 +1411,7 @@ fs_generator::generate_code(exec_list *instructions, FILE *dump_file)
       brw_set_flag_reg(p, 0, inst->flag_subreg);
       brw_set_saturate(p, inst->saturate);
       brw_set_mask_control(p, inst->force_writemask_all);
+      brw_set_acc_write_control(p, inst->writes_accumulator);
 
       if (inst->force_uncompressed || dispatch_width == 8) {
         brw_set_compression_control(p, BRW_COMPRESSION_NONE);
@@ -1434,9 +1435,7 @@ fs_generator::generate_code(exec_list *instructions, FILE *dump_file)
         brw_AVG(p, dst, src[0], src[1]);
         break;
       case BRW_OPCODE_MACH:
-        brw_set_acc_write_control(p, 1);
         brw_MACH(p, dst, src[0], src[1]);
-        brw_set_acc_write_control(p, 0);
         break;
 
       case BRW_OPCODE_MAD:
@@ -1540,15 +1539,11 @@ fs_generator::generate_code(exec_list *instructions, FILE *dump_file)
          break;
       case BRW_OPCODE_ADDC:
          assert(brw->gen >= 7);
-         brw_set_acc_write_control(p, 1);
          brw_ADDC(p, dst, src[0], src[1]);
-         brw_set_acc_write_control(p, 0);
          break;
       case BRW_OPCODE_SUBB:
          assert(brw->gen >= 7);
-         brw_set_acc_write_control(p, 1);
          brw_SUBB(p, dst, src[0], src[1]);
-         brw_set_acc_write_control(p, 0);
          break;
 
       case BRW_OPCODE_BFE:
index a9514594e86f24219f9a538fb24ceb69bbeafd7f..5e4f2fe747879e1240907f3af41b2a5a1e89551e 100644 (file)
@@ -742,6 +742,8 @@ fs_instruction_scheduler::is_compressed(fs_inst *inst)
 void
 fs_instruction_scheduler::calculate_deps()
 {
+   const bool gen6plus = v->brw->gen >= 6;
+
    /* Pre-register-allocation, this tracks the last write per VGRF (so
     * different reg_offsets within it can interfere when they shouldn't).
     * After register allocation, reg_offsets are gone and we track individual
@@ -750,6 +752,7 @@ fs_instruction_scheduler::calculate_deps()
    schedule_node *last_grf_write[grf_count];
    schedule_node *last_mrf_write[BRW_MAX_MRF];
    schedule_node *last_conditional_mod[2] = { NULL, NULL };
+   schedule_node *last_accumulator_write = NULL;
    /* Fixed HW registers are assumed to be separate from the virtual
     * GRFs, so they can be tracked separately.  We don't really write
     * to fixed GRFs much, so don't bother tracking them on a more
@@ -800,6 +803,8 @@ fs_instruction_scheduler::calculate_deps()
             } else {
                add_dep(last_fixed_grf_write, n);
             }
+         } else if (inst->src[i].is_accumulator() && gen6plus) {
+            add_dep(last_accumulator_write, n);
         } else if (inst->src[i].file != BAD_FILE &&
                    inst->src[i].file != IMM &&
                    inst->src[i].file != UNIFORM) {
@@ -822,6 +827,14 @@ fs_instruction_scheduler::calculate_deps()
         add_dep(last_conditional_mod[inst->flag_subreg], n);
       }
 
+      if (inst->reads_accumulator_implicitly()) {
+         if (gen6plus) {
+            add_dep(last_accumulator_write, n);
+         } else {
+            add_barrier_deps(n);
+         }
+      }
+
       /* write-after-write deps. */
       if (inst->dst.file == GRF) {
          if (post_reg_alloc) {
@@ -854,6 +867,9 @@ fs_instruction_scheduler::calculate_deps()
          } else {
             last_fixed_grf_write = n;
          }
+      } else if (inst->dst.is_accumulator() && gen6plus) {
+         add_dep(last_accumulator_write, n);
+         last_accumulator_write = n;
       } else if (inst->dst.file != BAD_FILE) {
         add_barrier_deps(n);
       }
@@ -869,12 +885,22 @@ fs_instruction_scheduler::calculate_deps()
         add_dep(last_conditional_mod[inst->flag_subreg], n, 0);
         last_conditional_mod[inst->flag_subreg] = n;
       }
+
+      if (inst->writes_accumulator) {
+         if (gen6plus) {
+            add_dep(last_accumulator_write, n);
+            last_accumulator_write = n;
+         } else {
+            add_barrier_deps(n);
+         }
+      }
    }
 
    /* bottom-to-top dependencies: WAR */
    memset(last_grf_write, 0, sizeof(last_grf_write));
    memset(last_mrf_write, 0, sizeof(last_mrf_write));
    memset(last_conditional_mod, 0, sizeof(last_conditional_mod));
+   last_accumulator_write = NULL;
    last_fixed_grf_write = NULL;
 
    exec_node *node;
@@ -906,6 +932,8 @@ fs_instruction_scheduler::calculate_deps()
             } else {
                add_dep(n, last_fixed_grf_write);
             }
+         } else if (inst->src[i].is_accumulator() && gen6plus) {
+            add_dep(n, last_accumulator_write);
          } else if (inst->src[i].file != BAD_FILE &&
                    inst->src[i].file != IMM &&
                    inst->src[i].file != UNIFORM) {
@@ -928,6 +956,14 @@ fs_instruction_scheduler::calculate_deps()
         add_dep(n, last_conditional_mod[inst->flag_subreg]);
       }
 
+      if (inst->reads_accumulator_implicitly()) {
+         if (gen6plus) {
+            add_dep(n, last_accumulator_write);
+         } else {
+            add_barrier_deps(n);
+         }
+      }
+
       /* Update the things this instruction wrote, so earlier reads
        * can mark this as WAR dependency.
        */
@@ -959,6 +995,8 @@ fs_instruction_scheduler::calculate_deps()
          } else {
             last_fixed_grf_write = n;
          }
+      } else if (inst->dst.is_accumulator() && gen6plus) {
+         last_accumulator_write = n;
       } else if (inst->dst.file != BAD_FILE) {
         add_barrier_deps(n);
       }
@@ -972,15 +1010,26 @@ fs_instruction_scheduler::calculate_deps()
       if (inst->writes_flag()) {
         last_conditional_mod[inst->flag_subreg] = n;
       }
+
+      if (inst->writes_accumulator) {
+         if (gen6plus) {
+            last_accumulator_write = n;
+         } else {
+            add_barrier_deps(n);
+         }
+      }
    }
 }
 
 void
 vec4_instruction_scheduler::calculate_deps()
 {
+   const bool gen6plus = v->brw->gen >= 6;
+
    schedule_node *last_grf_write[grf_count];
    schedule_node *last_mrf_write[BRW_MAX_MRF];
    schedule_node *last_conditional_mod = NULL;
+   schedule_node *last_accumulator_write = NULL;
    /* Fixed HW registers are assumed to be separate from the virtual
     * GRFs, so they can be tracked separately.  We don't really write
     * to fixed GRFs much, so don't bother tracking them on a more
@@ -1016,6 +1065,9 @@ vec4_instruction_scheduler::calculate_deps()
                     (inst->src[i].fixed_hw_reg.file ==
                      BRW_GENERAL_REGISTER_FILE)) {
             add_dep(last_fixed_grf_write, n);
+         } else if (inst->src[i].is_accumulator() && gen6plus) {
+            assert(last_accumulator_write);
+            add_dep(last_accumulator_write, n);
          } else if (inst->src[i].file != BAD_FILE &&
                     inst->src[i].file != IMM &&
                     inst->src[i].file != UNIFORM) {
@@ -1039,6 +1091,15 @@ vec4_instruction_scheduler::calculate_deps()
          add_dep(last_conditional_mod, n);
       }
 
+      if (inst->reads_accumulator_implicitly()) {
+         if (gen6plus) {
+            assert(last_accumulator_write);
+            add_dep(last_accumulator_write, n);
+         } else {
+            add_barrier_deps(n);
+         }
+      }
+
       /* write-after-write deps. */
       if (inst->dst.file == GRF) {
          add_dep(last_grf_write[inst->dst.reg], n);
@@ -1049,6 +1110,9 @@ vec4_instruction_scheduler::calculate_deps()
      } else if (inst->dst.file == HW_REG &&
                  inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
          last_fixed_grf_write = n;
+      } else if (inst->dst.is_accumulator() && gen6plus) {
+         add_dep(last_accumulator_write, n);
+         last_accumulator_write = n;
       } else if (inst->dst.file != BAD_FILE) {
          add_barrier_deps(n);
       }
@@ -1064,12 +1128,22 @@ vec4_instruction_scheduler::calculate_deps()
          add_dep(last_conditional_mod, n, 0);
          last_conditional_mod = n;
       }
+
+      if (inst->writes_accumulator) {
+         if (gen6plus) {
+            add_dep(last_accumulator_write, n);
+            last_accumulator_write = n;
+         } else {
+            add_barrier_deps(n);
+         }
+      }
    }
 
    /* bottom-to-top dependencies: WAR */
    memset(last_grf_write, 0, sizeof(last_grf_write));
    memset(last_mrf_write, 0, sizeof(last_mrf_write));
    last_conditional_mod = NULL;
+   last_accumulator_write = NULL;
    last_fixed_grf_write = NULL;
 
    exec_node *node;
@@ -1088,6 +1162,8 @@ vec4_instruction_scheduler::calculate_deps()
                     (inst->src[i].fixed_hw_reg.file ==
                      BRW_GENERAL_REGISTER_FILE)) {
             add_dep(n, last_fixed_grf_write);
+         } else if (inst->src[i].is_accumulator() && gen6plus) {
+            add_dep(n, last_accumulator_write);
          } else if (inst->src[i].file != BAD_FILE &&
                     inst->src[i].file != IMM &&
                     inst->src[i].file != UNIFORM) {
@@ -1109,6 +1185,14 @@ vec4_instruction_scheduler::calculate_deps()
          add_dep(n, last_conditional_mod);
       }
 
+      if (inst->reads_accumulator_implicitly()) {
+         if (gen6plus) {
+            add_dep(n, last_accumulator_write);
+         } else {
+            add_barrier_deps(n);
+         }
+      }
+
       /* Update the things this instruction wrote, so earlier reads
        * can mark this as WAR dependency.
        */
@@ -1119,6 +1203,8 @@ vec4_instruction_scheduler::calculate_deps()
       } else if (inst->dst.file == HW_REG &&
                  inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
          last_fixed_grf_write = n;
+      } else if (inst->dst.is_accumulator() && gen6plus) {
+         last_accumulator_write = n;
       } else if (inst->dst.file != BAD_FILE) {
          add_barrier_deps(n);
       }
@@ -1132,6 +1218,14 @@ vec4_instruction_scheduler::calculate_deps()
       if (inst->writes_flag()) {
          last_conditional_mod = n;
       }
+
+      if (inst->writes_accumulator) {
+         if (gen6plus) {
+            last_accumulator_write = n;
+         } else {
+            add_barrier_deps(n);
+         }
+      }
    }
 }
 
index 9ef08e58456a413cfbbf456cbc257377c7a37ca5..e730ed02b18cb0cc136ac656b4f943406aab4495 100644 (file)
@@ -60,6 +60,7 @@ public:
 
    uint8_t predicate;
    bool predicate_inverse;
+   bool writes_accumulator; /**< instruction implicitly writes accumulator */
 };
 
 enum instruction_scheduler_mode {
index 8aa746d36308930dd906b1aa80102f46b64d1fc5..daff36411199d240755eb2ee9a80cdc3a288c031 100644 (file)
@@ -350,19 +350,12 @@ try_eliminate_instruction(vec4_instruction *inst, int new_writemask,
        * accumulator as a side-effect. Instead just set the destination
        * to the null register to free it.
        */
-      switch (inst->opcode) {
-      case BRW_OPCODE_ADDC:
-      case BRW_OPCODE_SUBB:
-      case BRW_OPCODE_MACH:
+      if (inst->writes_accumulator || inst->writes_flag()) {
          inst->dst = dst_reg(retype(brw_null_reg(), inst->dst.type));
-         break;
-      default:
-         if (inst->writes_flag()) {
-            inst->dst = dst_reg(retype(brw_null_reg(), inst->dst.type));
-         } else {
-            inst->remove();
-         }
+      } else {
+         inst->remove();
       }
+
       return true;
    } else if (inst->dst.writemask != new_writemask) {
       switch (inst->opcode) {
index a74514f512c6b8e5069d1647ffe89b63dbdb2758..5f85d315c7120ade392e3afac6abc5cfbc32c002 100644 (file)
@@ -971,9 +971,7 @@ vec4_generator::generate_vec4_instruction(vec4_instruction *instruction,
       brw_MUL(p, dst, src[0], src[1]);
       break;
    case BRW_OPCODE_MACH:
-      brw_set_acc_write_control(p, 1);
       brw_MACH(p, dst, src[0], src[1]);
-      brw_set_acc_write_control(p, 0);
       break;
 
    case BRW_OPCODE_MAD:
@@ -1077,15 +1075,11 @@ vec4_generator::generate_vec4_instruction(vec4_instruction *instruction,
       break;
    case BRW_OPCODE_ADDC:
       assert(brw->gen >= 7);
-      brw_set_acc_write_control(p, 1);
       brw_ADDC(p, dst, src[0], src[1]);
-      brw_set_acc_write_control(p, 0);
       break;
    case BRW_OPCODE_SUBB:
       assert(brw->gen >= 7);
-      brw_set_acc_write_control(p, 1);
       brw_SUBB(p, dst, src[0], src[1]);
-      brw_set_acc_write_control(p, 0);
       break;
 
    case BRW_OPCODE_BFE:
@@ -1317,6 +1311,7 @@ vec4_generator::generate_code(exec_list *instructions)
       brw_set_predicate_inverse(p, inst->predicate_inverse);
       brw_set_saturate(p, inst->saturate);
       brw_set_mask_control(p, inst->force_writemask_all);
+      brw_set_acc_write_control(p, inst->writes_accumulator);
 
       unsigned pre_emit_nr_insn = p->nr_insn;
 
index edace108f19152964127526784fe67a0acf5898e..3a764424df8f7c11fd61c055ac37a7074b56e05b 100644 (file)
@@ -42,6 +42,7 @@ vec4_instruction::vec4_instruction(vec4_visitor *v,
    this->force_writemask_all = false;
    this->no_dd_clear = false;
    this->no_dd_check = false;
+   this->writes_accumulator = false;
    this->conditional_mod = BRW_CONDITIONAL_NONE;
    this->sampler = 0;
    this->texture_offset = 0;
@@ -124,6 +125,16 @@ vec4_visitor::emit(enum opcode opcode)
                                           src0, src1);                 \
    }
 
+#define ALU2_ACC(op)                                                   \
+   vec4_instruction *                                                  \
+   vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1)           \
+   {                                                                   \
+      vec4_instruction *inst = new(mem_ctx) vec4_instruction(this,     \
+                       BRW_OPCODE_##op, dst, src0, src1);              \
+      inst->writes_accumulator = true;                                 \
+      return inst;                                                     \
+   }
+
 #define ALU3(op)                                                       \
    vec4_instruction *                                                  \
    vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1, src_reg src2)\
@@ -143,7 +154,7 @@ ALU1(F32TO16)
 ALU1(F16TO32)
 ALU2(ADD)
 ALU2(MUL)
-ALU2(MACH)
+ALU2_ACC(MACH)
 ALU2(AND)
 ALU2(OR)
 ALU2(XOR)
@@ -162,8 +173,8 @@ ALU1(FBH)
 ALU1(FBL)
 ALU1(CBIT)
 ALU3(MAD)
-ALU2(ADDC)
-ALU2(SUBB)
+ALU2_ACC(ADDC)
+ALU2_ACC(SUBB)
 
 /** Gen4 predicated IF. */
 vec4_instruction *