From 4c1fdae0a01b3f92ec03b61aac1d3df500d51fc6 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Wed, 6 Mar 2013 14:47:22 -0800
Subject: [PATCH] i965/fs: Switch to using sampler LD messages for uniform pull
 constants.

When forcing the compiler to always generate pull constants instead of
push constants (in order to have an easy to use testcase), improves
performance of my old GLSL demo 23.3553% +/- 1.42968% (n=7).

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=60866
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_defines.h   |  2 +-
 src/mesa/drivers/dri/i965/brw_fs.cpp      | 39 ++++++++--------
 src/mesa/drivers/dri/i965/brw_fs.h        |  7 ++-
 src/mesa/drivers/dri/i965/brw_fs_emit.cpp | 54 +++++++++++------------
 4 files changed, 50 insertions(+), 52 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index d9b7f9aeb15..6414e69892d 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -727,7 +727,7 @@ enum opcode {
    FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7,
    FS_OPCODE_MOV_DISPATCH_TO_FLAGS,
    FS_OPCODE_DISCARD_JUMP,
-   FS_OPCODE_SET_GLOBAL_OFFSET,
+   FS_OPCODE_SET_SIMD4X2_OFFSET,
    FS_OPCODE_PACK_HALF_2x16_SPLIT,
    FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X,
    FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y,
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index b97a19e0510..5380abfe2f4 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -2461,6 +2461,11 @@ fs_visitor::insert_gen4_send_dependency_workarounds()
  * scheduling full flexibility, while the conversion to native instructions
  * allows the post-register-allocation scheduler the best information
  * possible.
+ *
+ * Note that execution masking for setting up pull constant loads is special:
+ * the channels that need to be written are unrelated to the current execution
+ * mask, since a later instruction will use one of the result channels as a
+ * source operand for all 8 or 16 of its channels.
  */
 void
 fs_visitor::lower_uniform_pull_constant_loads()
@@ -2477,26 +2482,24 @@ fs_visitor::lower_uniform_pull_constant_loads()
                 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
          const_offset_reg.imm.u /= 16;
          fs_reg payload = fs_reg(this, glsl_type::uint_type);
-         struct brw_reg g0 = retype(brw_vec8_grf(0, 0),
-                                    BRW_REGISTER_TYPE_UD);
-
-         fs_inst *setup1 = MOV(payload, fs_reg(g0));
-         setup1->force_writemask_all = true;
-         /* We don't need the second half of this vgrf to be filled with g1
-          * in the 16-wide case, but if we use force_uncompressed then live
-          * variable analysis won't consider this a def!
+
+         /* This is actually going to be a MOV, but since only the first dword
+          * is accessed, we have a special opcode to do just that one.  Note
+          * that this needs to be an operation that will be considered a def
+          * by live variable analysis, or register allocation will explode.
           */
+         fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
+                                               payload, const_offset_reg);
+         setup->force_writemask_all = true;
 
-         fs_inst *setup2 = new(mem_ctx) fs_inst(FS_OPCODE_SET_GLOBAL_OFFSET,
-                                                payload, payload,
-                                                const_offset_reg);
+         setup->ir = inst->ir;
+         setup->annotation = inst->annotation;
+         inst->insert_before(setup);
 
-         setup1->ir = inst->ir;
-         setup1->annotation = inst->annotation;
-         inst->insert_before(setup1);
-         setup2->ir = inst->ir;
-         setup2->annotation = inst->annotation;
-         inst->insert_before(setup2);
+         /* Similarly, this will only populate the first 4 channels of the
+          * result register (since we only use smear values from 0-3), but we
+          * don't tell the optimizer.
+          */
          inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
          inst->src[1] = payload;
 
@@ -2533,7 +2536,7 @@ fs_visitor::dump_instruction(fs_inst *inst)
       case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
          printf("uniform_pull_const_gen7");
          break;
-      case FS_OPCODE_SET_GLOBAL_OFFSET:
+      case FS_OPCODE_SET_SIMD4X2_OFFSET:
          printf("set_global_offset");
          break;
       default:
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index f7ccc7909e2..febd56bfe2e 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -546,10 +546,9 @@ private:
                                                  struct brw_reg index,
                                                  struct brw_reg offset);
    void generate_mov_dispatch_to_flags(fs_inst *inst);
-   void generate_set_global_offset(fs_inst *inst,
-                                   struct brw_reg dst,
-                                   struct brw_reg src,
-                                   struct brw_reg offset);
+   void generate_set_simd4x2_offset(fs_inst *inst,
+                                    struct brw_reg dst,
+                                    struct brw_reg offset);
    void generate_discard_jump(fs_inst *inst);
 
    void generate_pack_half_2x16_split(fs_inst *inst,
diff --git a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
index 2391ad12026..712fef6e093 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
@@ -647,6 +647,8 @@ fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst,
    uint32_t surf_index = index.dw1.ud;
 
    assert(offset.file == BRW_GENERAL_REGISTER_FILE);
+   /* Reference just the dword we need, to avoid angering validate_reg(). */
+   offset = brw_vec1_grf(offset.nr, 0);
 
    brw_push_insn_state(p);
    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
@@ -654,20 +656,22 @@ fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst,
    struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
    brw_pop_insn_state(p);
 
+   /* We use the SIMD4x2 mode because we want to end up with 4 components in
+    * the destination loaded consecutively from the same offset (which appears
+    * in the first component, and the rest are ignored).
+    */
+   dst.width = BRW_WIDTH_4;
    brw_set_dest(p, send, dst);
    brw_set_src0(p, send, offset);
-
-   uint32_t msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
-   uint32_t msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ;
-   bool header_present = true;
-   brw_set_dp_read_message(p, send,
+   brw_set_sampler_message(p, send,
                            surf_index,
-                           msg_control,
-                           msg_type,
-                           BRW_DATAPORT_READ_TARGET_DATA_CACHE,
-                           1,
-                           header_present,
-                           1);
+                           0, /* LD message ignores sampler unit */
+                           GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
+                           1, /* rlen */
+                           1, /* mlen */
+                           false, /* no header */
+                           BRW_SAMPLER_SIMD_MODE_SIMD4X2,
+                           0);
 }
 
 void
@@ -858,31 +862,23 @@ brw_reg_from_fs_reg(fs_reg *reg)
 }
 
 /**
- * Sets the second dword of a vgrf for gen7+ message setup.
+ * Sets the first word of a vgrf for gen7+ simd4x2 uniform pull constant
+ * sampler LD messages.
  *
- * For setting up gen7 messages in VGRFs, we need to be able to set the second
- * dword for some payloads where in the MRF world we'd have just used
- * brw_message_reg().  We don't want to bake it into the send message's code
- * generation because that means we don't get a chance to schedule the
- * instructions.
+ * We don't want to bake it into the send message's code generation because
+ * that means we don't get a chance to schedule the instructions.
  */
 void
-fs_generator::generate_set_global_offset(fs_inst *inst,
-                                         struct brw_reg dst,
-                                         struct brw_reg src,
-                                         struct brw_reg value)
+fs_generator::generate_set_simd4x2_offset(fs_inst *inst,
+                                          struct brw_reg dst,
+                                          struct brw_reg value)
 {
-   /* We use a matching src and dst to get the information on how this
-    * instruction works exposed to various optimization passes that would
-    * otherwise treat it as completely overwriting the dst.
-    */
-   assert(src.file == dst.file && src.nr == dst.nr);
    assert(value.file == BRW_IMMEDIATE_VALUE);
 
    brw_push_insn_state(p);
    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
    brw_set_mask_control(p, BRW_MASK_DISABLE);
-   brw_MOV(p, retype(brw_vec1_reg(dst.file, dst.nr, 2), value.type), value);
+   brw_MOV(p, retype(brw_vec1_reg(dst.file, dst.nr, 0), value.type), value);
    brw_pop_insn_state(p);
 }
 
@@ -1298,8 +1294,8 @@ fs_generator::generate_code(exec_list *instructions)
          brw_shader_time_add(p, inst->base_mrf, SURF_INDEX_WM_SHADER_TIME);
          break;
 
-      case FS_OPCODE_SET_GLOBAL_OFFSET:
-         generate_set_global_offset(inst, dst, src[0], src[1]);
+      case FS_OPCODE_SET_SIMD4X2_OFFSET:
+         generate_set_simd4x2_offset(inst, dst, src[0]);
          break;
 
       case FS_OPCODE_PACK_HALF_2x16_SPLIT:
-- 
2.30.2