i965/fs: Improve performance of varying-index uniform loads on IVB.
author: Eric Anholt <eric@anholt.net>
Wed, 13 Mar 2013 21:48:55 +0000 (14:48 -0700)
committer: Eric Anholt <eric@anholt.net>
Mon, 1 Apr 2013 23:17:25 +0000 (16:17 -0700)
Like we have done for the VS and for constant-index uniform loads, we use
the sampler engine to get caching in front of the L3 to avoid tickling the
IVB L3 bug.  This is also a bit of a functional change, as we're now
loading a vec4 instead of a single dword, though we're not taking
advantage of the other 3 components of the vec4 (yet).

With the driver hacked to always take the varying-index path for all
uniforms, improves performance of my old GLSL demo by 315% +/- 2% (n=4).
This is a major fix for some blur shaders in compositors from the
varying-index uniforms support I introduced in 9.1.

v2: Move old offset computation into the pre-gen7 path.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=61554
NOTE: This is a candidate for the 9.1 branch.

src/mesa/drivers/dri/i965/brw_fs.cpp
src/mesa/drivers/dri/i965/brw_fs_emit.cpp

index c60d041867806029ed6028ccb94aa8cfc59a689a..703c3c5d8b4fe255303a57818ce6ba0419c8a5fd 100644 (file)
@@ -235,14 +235,33 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
    exec_list instructions;
    fs_inst *inst;
 
-   fs_reg offset = fs_reg(this, glsl_type::uint_type);
-   instructions.push_tail(ADD(offset, varying_offset, fs_reg(const_offset)));
-
    if (intel->gen >= 7) {
+      /* We have our constant surface use a pitch of 4 bytes, so our index can
+       * be any component of a vector, and then we load 4 contiguous
+       * components starting from that.
+       *
+       * We break down the const_offset to a portion added to the variable
+       * offset and a portion done using reg_offset, which means that if you
+       * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
+       * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
+       * CSE can later notice that those loads are all the same and eliminate
+       * the redundant ones.
+       */
+      fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
+      instructions.push_tail(ADD(vec4_offset,
+                                 varying_offset, const_offset & ~3));
+
+      fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4), dst.type);
       inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7,
-                                  dst, surf_index, offset);
+                                  vec4_result, surf_index, vec4_offset);
       instructions.push_tail(inst);
+
+      vec4_result.reg_offset += const_offset & 3;
+      instructions.push_tail(MOV(dst, vec4_result));
    } else {
+      fs_reg offset = fs_reg(this, glsl_type::uint_type);
+      instructions.push_tail(ADD(offset, varying_offset, fs_reg(const_offset)));
+
       int base_mrf = 13;
       bool header_present = true;
 
@@ -313,7 +332,7 @@ fs_inst::equals(fs_inst *inst)
 int
 fs_inst::regs_written()
 {
-   if (is_tex())
+   if (is_tex() || opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7)
       return 4;
 
    /* The SINCOS and INT_DIV_QUOTIENT_AND_REMAINDER math functions return 2,
index a729569c8404aaa2c7650ce3c261b0e8d9a311bb..bc1fef16b018f4d18d5366ae69796910ed8f6a1b 100644 (file)
@@ -734,28 +734,29 @@ fs_generator::generate_varying_pull_constant_load_gen7(fs_inst *inst,
          index.type == BRW_REGISTER_TYPE_UD);
    uint32_t surf_index = index.dw1.ud;
 
-   uint32_t msg_control, rlen, mlen;
+   uint32_t simd_mode, rlen, mlen;
    if (dispatch_width == 16) {
-      msg_control = BRW_DATAPORT_DWORD_SCATTERED_BLOCK_16DWORDS;
-      mlen = rlen = 2;
+      mlen = 2;
+      rlen = 8;
+      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
    } else {
-      msg_control = BRW_DATAPORT_DWORD_SCATTERED_BLOCK_8DWORDS;
-      mlen = rlen = 1;
+      mlen = 1;
+      rlen = 4;
+      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
    }
 
    struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
    brw_set_dest(p, send, dst);
    brw_set_src0(p, send, offset);
-   if (intel->gen < 6)
-      send->header.destreg__conditionalmod = inst->base_mrf;
-   brw_set_dp_read_message(p, send,
+   brw_set_sampler_message(p, send,
                            surf_index,
-                           msg_control,
-                           GEN7_DATAPORT_DC_DWORD_SCATTERED_READ,
-                           BRW_DATAPORT_READ_TARGET_DATA_CACHE,
+                           0, /* LD message ignores sampler unit */
+                           GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
+                           rlen,
                            mlen,
-                           inst->header_present,
-                           rlen);
+                           false, /* no header */
+                           simd_mode,
+                           0);
 }
 
 /**