i965/fs: Set up gen7 UBO loads as sends from GRFs.
author Eric Anholt <eric@anholt.net>
Wed, 5 Dec 2012 08:06:30 +0000 (00:06 -0800)
committer Eric Anholt <eric@anholt.net>
Fri, 14 Dec 2012 23:18:05 +0000 (15:18 -0800)
This gives the instruction scheduler a chance to schedule between the
loads, whereas before it was restricted due to the dependencies between
the MRFs for setting them up.

For one shader in gles3conform, it goes from getting stuck in register
allocation for as long as anybody's bothered to leave it running down
to 23 seconds, thanks to the LIFO scheduling.

Acked-by: Kenneth Graunke <kenneth@whitecape.org>
src/mesa/drivers/dri/i965/brw_defines.h
src/mesa/drivers/dri/i965/brw_fs.cpp
src/mesa/drivers/dri/i965/brw_fs.h
src/mesa/drivers/dri/i965/brw_fs_emit.cpp
src/mesa/drivers/dri/i965/brw_fs_visitor.cpp

index 40571a4d54d6dbceb6d420b6181ba104f0ded662..ab206d1920f504216d0178895219397e5ee735e4 100644 (file)
@@ -676,10 +676,12 @@ enum opcode {
    FS_OPCODE_SPILL,
    FS_OPCODE_UNSPILL,
    FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
+   FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7,
    FS_OPCODE_VARYING_PULL_CONSTANT_LOAD,
    FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7,
    FS_OPCODE_MOV_DISPATCH_TO_FLAGS,
    FS_OPCODE_DISCARD_JUMP,
+   FS_OPCODE_SET_GLOBAL_OFFSET,
 
    VS_OPCODE_URB_WRITE,
    VS_OPCODE_SCRATCH_READ,
index 9a18410ac5f97c9de62af0abd3dd1bfd95638698..83128117328c031f6c4d5e7ac435da119cf4e6fe 100644 (file)
@@ -330,7 +330,9 @@ fs_inst::is_math()
 bool
 fs_inst::is_send_from_grf()
 {
-   return opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
+   return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
+           (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
+            src[1].file == GRF)); /* True for the gen7 varying pull constant
+    * load, or for a uniform pull constant load whose second source
+    * (src[1]) is in the GRF file.
+    *
+    * NOTE(review): the gen7 path added to fs_visitor::visit(ir_expression *)
+    * emits FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7 with the GRF payload
+    * as src[1], which this condition does not match -- confirm whether
+    * that opcode is meant to be covered here as well.
+    */
 }
 
 bool
index b75314cd665121ec245179500c9f7514f8a42005..87257123f27d2a3a52555d3405a166abca37b748 100644 (file)
@@ -529,6 +529,10 @@ private:
    void generate_uniform_pull_constant_load(fs_inst *inst, struct brw_reg dst,
                                             struct brw_reg index,
                                             struct brw_reg offset);
+   void generate_uniform_pull_constant_load_gen7(fs_inst *inst,
+                                                 struct brw_reg dst,
+                                                 struct brw_reg surf_index,
+                                                 struct brw_reg offset);
    void generate_varying_pull_constant_load(fs_inst *inst, struct brw_reg dst,
                                             struct brw_reg index);
    void generate_varying_pull_constant_load_gen7(fs_inst *inst,
@@ -536,6 +540,10 @@ private:
                                                  struct brw_reg index,
                                                  struct brw_reg offset);
    void generate_mov_dispatch_to_flags(fs_inst *inst);
+   void generate_set_global_offset(fs_inst *inst,
+                                   struct brw_reg dst,
+                                   struct brw_reg src,
+                                   struct brw_reg offset);
    void generate_discard_jump(fs_inst *inst);
 
    void patch_discard_jumps_to_fb_writes();
index 9a891414e623088310893e25b3493613d7b8dd75..63f09fe79415de5c0e21db1ea9908a1faedf46a8 100644 (file)
@@ -665,6 +665,44 @@ fs_generator::generate_uniform_pull_constant_load(fs_inst *inst,
    }
 }
 
+void
+fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst,
+                                                       struct brw_reg dst,
+                                                       struct brw_reg index,
+                                                       struct brw_reg offset)
+{
+   assert(inst->mlen == 0);
+
+   assert(index.file == BRW_IMMEDIATE_VALUE &&
+         index.type == BRW_REGISTER_TYPE_UD);
+   uint32_t surf_index = index.dw1.ud;
+
+   assert(offset.file == BRW_GENERAL_REGISTER_FILE);
+
+   brw_push_insn_state(p);
+   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+   brw_set_mask_control(p, BRW_MASK_DISABLE);
+   struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
+   brw_pop_insn_state(p);
+
+   brw_set_dest(p, send, dst);
+   brw_set_src0(p, send, offset);
+   if (intel->gen < 6)
+      send->header.destreg__conditionalmod = inst->base_mrf;
+
+   uint32_t msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
+   uint32_t msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ;
+   bool header_present = true;
+   brw_set_dp_read_message(p, send,
+                           surf_index,
+                           msg_control,
+                           msg_type,
+                           BRW_DATAPORT_READ_TARGET_DATA_CACHE,
+                           1,
+                           header_present,
+                           1);
+}
+
 void
 fs_generator::generate_varying_pull_constant_load(fs_inst *inst,
                                                   struct brw_reg dst,
@@ -852,6 +890,35 @@ brw_reg_from_fs_reg(fs_reg *reg)
    return brw_reg;
 }
 
+/**
+ * Writes an immediate into one dword of a vgrf for gen7+ message setup.
+ *
+ * For setting up gen7 messages in VGRFs, we need to be able to set a single
+ * dword for some payloads where in the MRF world we'd have just used
+ * brw_message_reg().  We don't want to bake it into the send message's code
+ * generation because that means we don't get a chance to schedule the
+ * instructions.
+ *
+ * The MOV destination is built with brw_vec1_reg(dst.file, dst.nr, 2), i.e.
+ * sub-register index 2.  NOTE(review): this comment originally said "the
+ * second dword"; presumably index 2 is dword 2 counting from zero -- confirm
+ * against brw_vec1_reg().
+ *
+ * The MOV is emitted with compression control BRW_COMPRESSION_NONE and mask
+ * control BRW_MASK_DISABLE, between a push and a pop of the instruction
+ * state.
+ *
+ * \param inst   the FS IR instruction being generated (not read here)
+ * \param dst    register whose file and nr select the register written
+ * \param src    must have the same file and register number as \p dst
+ * \param value  immediate to store; its type is also used for the MOV
+ *               destination (the class declaration names this parameter
+ *               "offset")
+ */
+void
+fs_generator::generate_set_global_offset(fs_inst *inst,
+                                         struct brw_reg dst,
+                                         struct brw_reg src,
+                                         struct brw_reg value)
+{
+   /* We use a matching src and dst to get the information on how this
+    * instruction works exposed to various optimization passes that would
+    * otherwise treat it as completely overwriting the dst.  The src operand
+    * is only checked by the assert below; it is not used by the MOV.
+    */
+   assert(src.file == dst.file && src.nr == dst.nr);
+   assert(value.file == BRW_IMMEDIATE_VALUE);
+
+   brw_push_insn_state(p);
+   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+   brw_set_mask_control(p, BRW_MASK_DISABLE);
+   brw_MOV(p, retype(brw_vec1_reg(dst.file, dst.nr, 2), value.type), value);
+   brw_pop_insn_state(p);
+}
+
 void
 fs_generator::generate_code(exec_list *instructions)
 {
@@ -1127,6 +1194,10 @@ fs_generator::generate_code(exec_list *instructions)
         generate_uniform_pull_constant_load(inst, dst, src[0], src[1]);
         break;
 
+      case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
+        generate_uniform_pull_constant_load_gen7(inst, dst, src[0], src[1]);
+        break;
+
       case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
         generate_varying_pull_constant_load(inst, dst, src[0]);
         break;
@@ -1151,6 +1222,10 @@ fs_generator::generate_code(exec_list *instructions)
          brw_shader_time_add(p, inst->base_mrf, SURF_INDEX_WM_SHADER_TIME);
          break;
 
+      case FS_OPCODE_SET_GLOBAL_OFFSET:
+         generate_set_global_offset(inst, dst, src[0], src[1]);
+         break;
+
       default:
         if (inst->opcode < (int) ARRAY_SIZE(opcode_descs)) {
            _mesa_problem(ctx, "Unsupported opcode `%s' in FS",
index ccf905ebc62f6765ef76f1069626ec28a8927b56..6a39f98509e4ad0adde1c31e37a20c13175bc7f1 100644 (file)
@@ -581,12 +581,32 @@ fs_visitor::visit(ir_expression *ir)
       if (const_offset) {
          fs_reg packed_consts = fs_reg(this, glsl_type::float_type);
          packed_consts.type = result.type;
-         fs_inst *pull = emit(fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
-                                      packed_consts,
-                                      surf_index,
-                                      fs_reg(const_offset->value.u[0])));
-         pull->base_mrf = 14;
-         pull->mlen = 1;
+
+         if (intel->gen >= 7) {
+            fs_reg const_offset_reg = fs_reg(const_offset->value.u[0] / 16);
+            fs_reg payload = fs_reg(this, glsl_type::uint_type);
+            struct brw_reg g0 = retype(brw_vec8_grf(0, 0),
+                                       BRW_REGISTER_TYPE_UD);
+            fs_inst *setup = emit(MOV(payload, fs_reg(g0)));
+            setup->force_writemask_all = true;
+            /* We don't need the second half of this vgrf to be filled with g1
+             * in the 16-wide case, but if we use force_uncompressed then live
+             * variable analysis won't consider this a def!
+             */
+
+            emit(FS_OPCODE_SET_GLOBAL_OFFSET, payload,
+                 payload, const_offset_reg);
+            emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7, packed_consts,
+                 surf_index, payload);
+         } else {
+            fs_reg const_offset_reg = fs_reg(const_offset->value.u[0]);
+            fs_inst *pull = emit(fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
+                                         packed_consts,
+                                         surf_index,
+                                         const_offset_reg));
+            pull->base_mrf = 14;
+            pull->mlen = 1;
+         }
 
          packed_consts.smear = const_offset->value.u[0] % 16 / 4;
          for (int i = 0; i < ir->type->vector_elements; i++) {