i965/fs: Delay setup of uniform loads until after pre-regalloc scheduling.
author Eric Anholt <eric@anholt.net>
Sat, 16 Feb 2013 03:26:48 +0000 (19:26 -0800)
committer Eric Anholt <eric@anholt.net>
Tue, 19 Feb 2013 18:33:32 +0000 (10:33 -0800)
This should fix the register allocation explosion on the GLES 3.0 test
on gen6.  It also gives us an instruction that will fit our CSE handling.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
NOTE: This is a candidate for the 9.1 branch.

src/mesa/drivers/dri/i965/brw_fs.cpp
src/mesa/drivers/dri/i965/brw_fs.h
src/mesa/drivers/dri/i965/brw_fs_visitor.cpp

index 35cdc6a02e48005cb4105195c85aa3193720ed3e..f3232b292719dc6b80edc2da0d9f13af24f77433 100644 (file)
@@ -1710,8 +1710,6 @@ fs_visitor::setup_pull_constants()
                                  dst, index, offset);
         pull->ir = inst->ir;
         pull->annotation = inst->annotation;
-        pull->base_mrf = 14;
-        pull->mlen = 1;
 
         inst->insert_before(pull);
 
@@ -2447,6 +2445,66 @@ fs_visitor::insert_gen4_send_dependency_workarounds()
    }
 }
 
+/**
+ * Lowers each generic, expression-style uniform pull constant load into its
+ * hardware-specific form: on gen7+ a payload setup (MOV of g0, then
+ * SET_GLOBAL_OFFSET) feeding the _GEN7 opcode; otherwise an MRF assignment.
+ *
+ * Keeping the expression style until now lets the CSE pass before this
+ * optimize out repeated loads from the same offset and gives the
+ * pre-register-allocation scheduling full flexibility, while the conversion
+ * to native instructions allows the post-register-allocation scheduler the
+ * best information possible.
+ */
+void
+fs_visitor::lower_uniform_pull_constant_loads()
+{
+   foreach_list(node, &this->instructions) {
+      fs_inst *inst = (fs_inst *)node;
+
+      if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
+         continue;
+
+      if (intel->gen >= 7) {
+         fs_reg const_offset_reg = inst->src[1];
+         assert(const_offset_reg.file == IMM &&
+                const_offset_reg.type == BRW_REGISTER_TYPE_UD);
+         const_offset_reg.imm.u /= 16;
+         fs_reg payload = fs_reg(this, glsl_type::uint_type);
+         struct brw_reg g0 = retype(brw_vec8_grf(0, 0),
+                                    BRW_REGISTER_TYPE_UD);
+
+         fs_inst *setup1 = MOV(payload, fs_reg(g0));
+         setup1->force_writemask_all = true;
+         /* The MOV is deliberately left compressed: the second half of this
+          * vgrf doesn't need to be filled with g1 in the 16-wide case, but with
+          * force_uncompressed live variable analysis won't consider this a def!
+          */
+
+         fs_inst *setup2 = new(mem_ctx) fs_inst(FS_OPCODE_SET_GLOBAL_OFFSET,
+                                                payload, payload,
+                                                const_offset_reg);
+
+         setup1->ir = inst->ir;
+         setup1->annotation = inst->annotation;
+         inst->insert_before(setup1);
+         setup2->ir = inst->ir;
+         setup2->annotation = inst->annotation;
+         inst->insert_before(setup2);
+         inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
+         inst->src[1] = payload;
+      } else {
+         /* Before register allocation, we didn't tell the scheduler about the
+          * MRF we use.  We know it's safe to use this MRF because nothing
+          * else does except for register spill/unspill, which generates and
+          * uses its MRF within a single IR instruction.
+          */
+         inst->base_mrf = 14;
+         inst->mlen = 1;
+      }
+   }
+}
+
 void
 fs_visitor::dump_instruction(fs_inst *inst)
 {
@@ -2748,6 +2806,8 @@ fs_visitor::run()
 
       schedule_instructions(false);
 
+      lower_uniform_pull_constant_loads();
+
       assign_curb_setup();
       assign_urb_setup();
 
index d5ebd515cbb8c261f3c40ab8a05da0a4fc37b18d..d1bb111bf5fddcf8f34a674ae813b37dcc30c702 100644 (file)
@@ -334,6 +334,7 @@ public:
    void insert_gen4_pre_send_dependency_workarounds(fs_inst *inst);
    void insert_gen4_post_send_dependency_workarounds(fs_inst *inst);
    void fail(const char *msg, ...);
+   void lower_uniform_pull_constant_loads();
 
    void push_force_uncompressed();
    void pop_force_uncompressed();
index d4f6fc9ca7eaba5e10f9f7a465938e4947f13c3d..573921cf8cc671c0ae30266b9c4a47becbbee0c6 100644 (file)
@@ -597,31 +597,9 @@ fs_visitor::visit(ir_expression *ir)
          fs_reg packed_consts = fs_reg(this, glsl_type::float_type);
          packed_consts.type = result.type;
 
-         if (intel->gen >= 7) {
-            fs_reg const_offset_reg = fs_reg(const_offset->value.u[0] / 16);
-            fs_reg payload = fs_reg(this, glsl_type::uint_type);
-            struct brw_reg g0 = retype(brw_vec8_grf(0, 0),
-                                       BRW_REGISTER_TYPE_UD);
-            fs_inst *setup = emit(MOV(payload, fs_reg(g0)));
-            setup->force_writemask_all = true;
-            /* We don't need the second half of this vgrf to be filled with g1
-             * in the 16-wide case, but if we use force_uncompressed then live
-             * variable analysis won't consider this a def!
-             */
-
-            emit(FS_OPCODE_SET_GLOBAL_OFFSET, payload,
-                 payload, const_offset_reg);
-            emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7, packed_consts,
-                 surf_index, payload);
-         } else {
-            fs_reg const_offset_reg = fs_reg(const_offset->value.u[0]);
-            fs_inst *pull = emit(fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
-                                         packed_consts,
-                                         surf_index,
-                                         const_offset_reg));
-            pull->base_mrf = 14;
-            pull->mlen = 1;
-         }
+         fs_reg const_offset_reg = fs_reg(const_offset->value.u[0] & ~15);
+         emit(fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
+                      packed_consts, surf_index, const_offset_reg));
 
          packed_consts.smear = const_offset->value.u[0] % 16 / 4;
          for (int i = 0; i < ir->type->vector_elements; i++) {