From 483f5b348b0f3c0ca7082fd2047c354e8af285e7 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Mon, 22 Aug 2011 10:35:24 -0700
Subject: [PATCH] i965/vs: Add support for pull constant loads for uniform
 arrays.

v2: reworked the instruction emit and made use of gen6_resolve_implied_move,
    from Ken's review
---
 src/mesa/drivers/dri/i965/brw_defines.h       |   1 +
 src/mesa/drivers/dri/i965/brw_eu.h            |   3 +
 src/mesa/drivers/dri/i965/brw_eu_emit.c       |   2 +-
 src/mesa/drivers/dri/i965/brw_vec4.h          |  11 ++
 src/mesa/drivers/dri/i965/brw_vec4_emit.cpp   |  46 +++++-
 .../drivers/dri/i965/brw_vec4_visitor.cpp     | 137 +++++++++++++++++-
 6 files changed, 195 insertions(+), 5 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index d1799c0ab4f..5f349392ae9 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -643,6 +643,7 @@ enum opcode {
    VS_OPCODE_URB_WRITE,
    VS_OPCODE_SCRATCH_READ,
    VS_OPCODE_SCRATCH_WRITE,
+   VS_OPCODE_PULL_CONSTANT_LOAD,
 };
 
 #define BRW_PREDICATE_NONE             0
diff --git a/src/mesa/drivers/dri/i965/brw_eu.h b/src/mesa/drivers/dri/i965/brw_eu.h
index af50305fc2b..31334ce8c49 100644
--- a/src/mesa/drivers/dri/i965/brw_eu.h
+++ b/src/mesa/drivers/dri/i965/brw_eu.h
@@ -807,6 +807,9 @@ void brw_set_dest(struct brw_compile *p, struct brw_instruction *insn,
 void brw_set_src0(struct brw_compile *p, struct brw_instruction *insn,
 		  struct brw_reg reg);
 
+void gen6_resolve_implied_move(struct brw_compile *p,
+			       struct brw_reg *src,
+			       GLuint msg_reg_nr);
 
 /* Helpers for regular instructions:
  */
diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c
index e8d09983405..f9f8d49a0d0 100644
--- a/src/mesa/drivers/dri/i965/brw_eu_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c
@@ -58,7 +58,7 @@ static void guess_execution_size(struct brw_compile *p,
  * On Sandybridge, this is no longer the case.  This function performs the
  * explicit move; it should be called before emitting a SEND instruction.
  */
-static void
+void
 gen6_resolve_implied_move(struct brw_compile *p,
 			  struct brw_reg *src,
 			  GLuint msg_reg_nr)
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h
index c6071544f61..1bb15016b52 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4.h
@@ -370,6 +370,7 @@ public:
     */
    dst_reg output_reg[VERT_RESULT_MAX];
    int uniform_size[MAX_UNIFORMS];
+   int uniform_vector_size[MAX_UNIFORMS];
    int uniforms;
 
    struct hash_table *variable_ht;
@@ -386,6 +387,7 @@ public:
    void reg_allocate_trivial();
    void reg_allocate();
    void move_grf_array_access_to_scratch();
+   void move_uniform_array_access_to_pull_constants();
    void calculate_live_intervals();
    bool dead_code_eliminate();
    bool virtual_grf_interferes(int a, int b);
@@ -447,6 +449,8 @@ public:
 
    src_reg get_scratch_offset(vec4_instruction *inst,
 			      src_reg *reladdr, int reg_offset);
+   src_reg get_pull_constant_offset(vec4_instruction *inst,
+				    src_reg *reladdr, int reg_offset);
    void emit_scratch_read(vec4_instruction *inst,
 			  dst_reg dst,
 			  src_reg orig_src,
@@ -455,6 +459,10 @@ public:
 			   src_reg temp,
 			   dst_reg orig_dst,
 			   int base_offset);
+   void emit_pull_constant_load(vec4_instruction *inst,
+				dst_reg dst,
+				src_reg orig_src,
+				int base_offset);
 
    GLboolean try_emit_sat(ir_expression *ir);
 
@@ -490,6 +498,9 @@ public:
    void generate_scratch_read(vec4_instruction *inst,
 			      struct brw_reg dst,
 			      struct brw_reg index);
+   void generate_pull_constant_load(vec4_instruction *inst,
+				    struct brw_reg dst,
+				    struct brw_reg index);
 };
 
 } /* namespace brw */
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp b/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp
index 11d0278ccd0..49514070f34 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp
@@ -114,7 +114,7 @@ vec4_visitor::setup_uniforms(int reg)
     * matter what, or the GPU would hang.
     */
    if (intel->gen < 6 && this->uniforms == 0) {
-      this->uniform_size[this->uniforms] = 1;
+      this->uniform_vector_size[this->uniforms] = 1;
 
       for (unsigned int i = 0; i < 4; i++) {
 	 unsigned int slot = this->uniforms * 4 + i;
@@ -229,6 +229,9 @@ vec4_instruction::get_src(int i)
 	 brw_reg = brw_abs(brw_reg);
       if (src[i].negate)
 	 brw_reg = negate(brw_reg);
+
+      /* This should have been moved to pull constants. */
+      assert(!src[i].reladdr);
       break;
 
    case HW_REG:
@@ -487,6 +490,42 @@ vec4_visitor::generate_scratch_write(vec4_instruction *inst,
 			    false /* commit */);
 }
 
+void
+vec4_visitor::generate_pull_constant_load(vec4_instruction *inst,
+					  struct brw_reg dst,
+					  struct brw_reg index)
+{
+   struct brw_reg header = brw_vec8_grf(0, 0);
+   /* g0 is the message header; on gen6+ the move to base_mrf is explicit. */
+   gen6_resolve_implied_move(p, &header, inst->base_mrf);
+   /* The register after the header (base_mrf + 1) carries the index. */
+   brw_MOV(p, retype(brw_message_reg(inst->base_mrf + 1), BRW_REGISTER_TYPE_D),
+	   index);
+
+   uint32_t msg_type;
+   /* Pick the OWord dual block read message type for this generation. */
+   if (intel->gen >= 6)
+      msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
+   else if (intel->gen == 5 || intel->is_g4x)
+      msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
+   else
+      msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
+   /* Read from the vertex constant buffer surface into dst.  Each of the
+    * 8 channel enables is considered for whether each dword is written.
+    * The message is two registers long: the header plus the index.
+    */
+   struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
+   brw_set_dest(p, send, dst);
+   brw_set_src0(p, send, header);
+   brw_set_dp_read_message(p, send,
+			   SURF_INDEX_VERT_CONST_BUFFER,
+			   BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
+			   msg_type,
+			   BRW_DATAPORT_READ_TARGET_DATA_CACHE,
+			   2, /* mlen */
+			   1 /* rlen */);
+}
+
 void
 vec4_visitor::generate_vs_instruction(vec4_instruction *instruction,
 				      struct brw_reg dst,
@@ -529,6 +568,10 @@ vec4_visitor::generate_vs_instruction(vec4_instruction *instruction,
       generate_scratch_write(inst, dst, src[0], src[1]);
       break;
 
+   case VS_OPCODE_PULL_CONSTANT_LOAD:
+      generate_pull_constant_load(inst, dst, src[0]);
+      break;
+
    default:
       if (inst->opcode < (int)ARRAY_SIZE(brw_opcodes)) {
 	 fail("unsupported opcode in `%s' in VS\n",
@@ -556,6 +599,7 @@ vec4_visitor::run()
     * often do repeated subexpressions for those.
     */
    move_grf_array_access_to_scratch();
+   move_uniform_array_access_to_pull_constants();
 
    bool progress;
    do {
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
index 912145538a6..c4a3bbadd40 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
@@ -381,7 +381,10 @@ vec4_visitor::setup_uniform_values(int loc, const glsl_type *type)
 	 c->prog_data.param[this->uniforms * 4 + i] = &zero;
       }
 
-      this->uniform_size[this->uniforms] = type->vector_elements;
+      /* Track the size of this uniform vector, for future packing of
+       * uniforms.
+       */
+      this->uniform_vector_size[this->uniforms] = type->vector_elements;
       this->uniforms++;
 
       return 1;
@@ -429,7 +432,7 @@ vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
 					    (gl_state_index *)slots[i].tokens);
       float *values = &this->vp->Base.Parameters->ParameterValues[index][0].f;
 
-      this->uniform_size[this->uniforms] = 0;
+      this->uniform_vector_size[this->uniforms] = 0;
       /* Add each of the unique swizzled channels of the element.
        * This will end up matching the size of the glsl_type of this field.
        */
@@ -440,7 +443,7 @@ vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
 
 	 c->prog_data.param[this->uniforms * 4 + j] = &values[swiz];
 	 if (swiz <= last_swiz)
-	    this->uniform_size[this->uniforms]++;
+	    this->uniform_vector_size[this->uniforms]++;
       }
       this->uniforms++;
    }
@@ -677,6 +680,11 @@ vec4_visitor::visit(ir_variable *ir)
    case ir_var_uniform:
       reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
 
+      /* Track how big the whole uniform variable is, in case we need to put a
+       * copy of its data into pull constants for array access.
+       */
+      this->uniform_size[this->uniforms] = type_size(ir->type);
+
       if (!strncmp(ir->name, "gl_", 3)) {
 	 setup_builtin_uniform_values(ir);
       } else {
@@ -1947,6 +1955,42 @@ vec4_visitor::get_scratch_offset(vec4_instruction *inst,
    }
 }
 
+src_reg
+vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
+				       src_reg *reladdr, int reg_offset)
+{
+   /* Pre-gen6, the message header uses byte offsets instead of vec4
+    * (16-byte) offset units, so offsets get scaled by 16 there.
+    */
+   const bool byte_offsets = intel->gen < 6;
+
+   /* Constant offsets need no instructions; fold the scale in. */
+   if (!reladdr)
+      return src_reg(reg_offset * (byte_offsets ? 16 : 1));
+
+   /* Otherwise compute reladdr + reg_offset into a fresh temporary ahead
+    * of @inst, inheriting its IR pointer and annotation.
+    */
+   src_reg offset = src_reg(this, glsl_type::int_type);
+   vec4_instruction *sum =
+      new(mem_ctx) vec4_instruction(this, BRW_OPCODE_ADD, dst_reg(offset),
+				    *reladdr, src_reg(reg_offset));
+   sum->ir = inst->ir;
+   sum->annotation = inst->annotation;
+   inst->insert_before(sum);
+
+   if (byte_offsets) {
+      vec4_instruction *scale =
+	 new(mem_ctx) vec4_instruction(this, BRW_OPCODE_MUL, dst_reg(offset),
+				       offset, src_reg(16));
+      scale->ir = inst->ir;
+      scale->annotation = inst->annotation;
+      inst->insert_before(scale);
+   }
+
+   return offset;
+}
+
 /**
  * Emits an instruction before @inst to load the value named by @orig_src
  * from scratch space at @base_offset to @temp.
@@ -2072,6 +2116,93 @@ vec4_visitor::move_grf_array_access_to_scratch()
    }
 }
 
+/**
+ * Inserts a VS_OPCODE_PULL_CONSTANT_LOAD ahead of @inst that reads the value
+ * named by @orig_src from the pull constant buffer at @base_offset into @temp.
+ */
+void
+vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
+				      dst_reg temp, src_reg orig_src,
+				      int base_offset)
+{
+   src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr,
+					     base_offset + orig_src.reg_offset);
+
+   vec4_instruction *pull =
+      new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
+				    temp, offset);
+   pull->ir = inst->ir;
+   pull->annotation = inst->annotation;
+   pull->base_mrf = 14;
+   pull->mlen = 1;
+   inst->insert_before(pull);
+}
+
+/**
+ * Implements variably-indexed array access of uniforms by inserting a
+ * PULL_CONSTANT_LOAD instruction ahead of each such access.
+ *
+ * Unlike temporary GRF array access (where we don't support it due to
+ * the difficulty of doing relative addressing on instruction
+ * destinations), we could potentially do array access of uniforms
+ * that were loaded in GRF space as push constants.  In real-world
+ * usage we've seen, though, the arrays being used are always larger
+ * than we could load as push constants, so just always move all
+ * uniform array access out to a pull constant buffer.
+ */
+void
+vec4_visitor::move_uniform_array_access_to_pull_constants()
+{
+   int pull_constant_loc[this->uniforms];
+
+   for (int i = 0; i < this->uniforms; i++) {
+      pull_constant_loc[i] = -1;
+   }
+
+   /* Walk through and find array access of uniforms.  Put a copy of that
+    * uniform in the pull constant buffer.
+    *
+    * Note that we don't move constant-indexed accesses to arrays.  No
+    * testing has been done of the performance impact of this choice.
+    */
+   foreach_list_safe(node, &this->instructions) {
+      vec4_instruction *inst = (vec4_instruction *)node;
+
+      for (int i = 0 ; i < 3; i++) {
+	 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
+	    continue;
+
+	 int uniform = inst->src[i].reg;
+
+	 /* If this array isn't already present in the pull constant buffer,
+	  * add it.  nr_pull_params counts floats; loads address vec4 slots.
+	  */
+	 if (pull_constant_loc[uniform] == -1) {
+	    const float **values = &prog_data->param[uniform * 4];
+
+	    pull_constant_loc[uniform] = prog_data->nr_pull_params / 4;
+
+	    for (int j = 0; j < uniform_size[uniform] * 4; j++) {
+	       prog_data->pull_param[prog_data->nr_pull_params++] = values[j];
+	    }
+	 }
+
+	 /* Set up the annotation tracking for new generated instructions. */
+	 base_ir = inst->ir;
+	 current_annotation = inst->annotation;
+
+	 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
+
+	 emit_pull_constant_load(inst, temp, inst->src[i],
+				 pull_constant_loc[uniform]);
+
+	 inst->src[i].file = temp.file;
+	 inst->src[i].reg = temp.reg;
+	 inst->src[i].reg_offset = temp.reg_offset;
+	 inst->src[i].reladdr = NULL;
+      }
+   }
+}
 
 vec4_visitor::vec4_visitor(struct brw_vs_compile *c,
 			   struct gl_shader_program *prog,
-- 
2.30.2