From 75afe17b7954984ea5b55c2a6d5d124f5eb03328 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Fri, 26 Sep 2014 14:47:03 -0700
Subject: [PATCH] i965/fs: Manually generate the meta fast-clear shader
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Previously, we were generating the fast-clear shader from GLSL.  The
problem is that fast clears require that we use a replicated write rather
than a regular write instruction.  In order to get this we had a
complicated and somewhat fragile optimization pass that looked for places
where we can use a replicated write and used it.  Since replicated writes
have a lot of restrictions, we only ever use them for fast-clear
operations.

This commit replaces the optimization pass with a function that just
generates the shader we want.  The new approach a) needs less code, b) is
less fragile than the optimization pass, and c) generates a more efficient
shader.

Signed-off-by: Jason Ekstrand <jason.ekstrand@intel.com>
Reviewed-by: Kristian Høgsberg <krh@bitplanet.net>
Acked-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp | 122 ++++++++-------------------
 src/mesa/drivers/dri/i965/brw_fs.h   |   3 +-
 2 files changed, 35 insertions(+), 90 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index ffe8ba89497..f3c39e7cdb4 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -2329,98 +2329,44 @@ fs_visitor::compute_to_mrf()
  * instructions to FS_OPCODE_REP_FB_WRITE.
  */
 void
-fs_visitor::try_rep_send()
+fs_visitor::emit_repclear_shader()
 {
-   int i, count;
-   fs_inst *start = NULL;
-   bblock_t *mov_block;
+   int base_mrf = 1;
+   int color_mrf = base_mrf + 2;
 
-   /* From the Ivybridge PRM, Volume 4 Part 1, section 3.9.11.2
-    * ("Message Descriptor - Render Target Write"):
-    *
-    * "SIMD16_REPDATA message must not be used in SIMD8 pixel-shaders."
-    */
-   if (dispatch_width != 16)
-      return;
-
-   /* The constant color write message can't handle anything but the 4 color
-    * values.  We could do MRT, but the loops below would need to understand
-    * handling the header being enabled or disabled on different messages.  It
-    * also requires that the render target be tiled, which might not be the
-    * case for some EGLImage paths or if we some day do rendering to PBOs.
-    */
-   if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH) ||
-       payload.aa_dest_stencil_reg ||
-       payload.dest_depth_reg ||
-       dual_src_output.file != BAD_FILE)
-      return;
+   fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
+                           fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
+   mov->force_writemask_all = true;
+   mov->force_uncompressed = true;
 
-   /* The optimization is implemented as one pass through the instruction
-    * list.  We keep track of the most recent block of MOVs into sequential
-    * MRFs from single, sequential float registers (ie uniforms).  Then when
-    * we find an FB_WRITE opcode, we see if the payload registers match the
-    * destination registers in our block of MOVs.
-    */
-   count = 0;
-   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
-      if (count == 0) {
-         start = inst;
-         mov_block = block;
-      }
-      if (inst->opcode == BRW_OPCODE_MOV &&
-	  inst->dst.file == MRF &&
-          inst->dst.reg == start->dst.reg + 2 * count &&
-          inst->src[0].file == HW_REG &&
-          inst->src[0].reg_offset == start->src[0].reg_offset + count) {
-         if (count == 0) {
-            start = inst;
-            mov_block = block;
-         }
-         count++;
+   fs_inst *write;
+   if (key->nr_color_regions == 1) {
+      write = emit(FS_OPCODE_REP_FB_WRITE);
+      write->saturate = key->clamp_fragment_color;
+      write->base_mrf = color_mrf;
+      write->target = 0;
+      write->header_present = false;
+      write->mlen = 1;
+   } else {
+      for (int i = 0; i < key->nr_color_regions; ++i) {
+         write = emit(FS_OPCODE_REP_FB_WRITE);
+         write->saturate = key->clamp_fragment_color;
+         write->base_mrf = base_mrf;
+         write->target = i;
+         write->header_present = true;
+         write->mlen = 3;
       }
+   }
+   write->eot = true;
 
-      if (inst->opcode == FS_OPCODE_FB_WRITE &&
-          count == 4 &&
-          (inst->base_mrf == start->dst.reg ||
-           (inst->base_mrf + 2 == start->dst.reg && inst->header_present))) {
-         fs_inst *mov = MOV(start->dst, start->src[0]);
+   calculate_cfg();
 
-         /* Make a MOV that moves the four floats into the replicated write
-          * payload.  Since we're running at the very end of code generation
-          * we can use hw registers and generate the stride and offsets we
-          * need for this MOV.  We use the first of the eight registers
-          * allocated for the SIMD16 payload for the four floats.
-          */
-         mov->dst.fixed_hw_reg =
-            brw_vec4_reg(BRW_MESSAGE_REGISTER_FILE,
-                         start->dst.reg, 0);
-         mov->dst.file = HW_REG;
-         mov->dst.type = mov->dst.fixed_hw_reg.type;
-
-         mov->src[0].fixed_hw_reg =
-            brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
-         mov->src[0].file = HW_REG;
-         mov->src[0].type = mov->src[0].fixed_hw_reg.type;
-         mov->force_writemask_all = true;
-         mov->dst.type = BRW_REGISTER_TYPE_F;
-
-         /* Replace the four MOVs with the new vec4 MOV. */
-         start->insert_before(mov_block, mov);
-         for (i = 0; i < 4; i++)
-            ((fs_inst *) mov->next)->remove(mov_block);
-
-         /* Finally, adjust the message length and set the opcode to
-          * REP_FB_WRITE for the send, so that the generator will use the
-          * replicated data mesage type.  Then reset count so we'll start
-          * looking for a new block in case we're in a MRT shader.
-          */
-         inst->opcode = FS_OPCODE_REP_FB_WRITE;
-         inst->mlen -= 7;
-         count = 0;
-      }
-   }
+   assign_constant_locations();
+   assign_curb_setup();
 
-   return;
+   /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
+   assert(mov->src[0].file == HW_REG);
+   mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
 }
 
 /**
@@ -3199,6 +3145,9 @@ fs_visitor::run()
 
    if (0) {
       emit_dummy_fs();
+   } else if (brw->use_rep_send && dispatch_width == 16) {
+      emit_repclear_shader();
+      allocated_without_spills = true;
    } else {
       if (INTEL_DEBUG & DEBUG_SHADER_TIME)
          emit_shader_time_begin();
@@ -3379,9 +3328,6 @@ fs_visitor::run()
       prog_data->total_scratch = brw_get_scratch_size(last_scratch);
    }
 
-   if (brw->use_rep_send)
-      try_rep_send();
-
    if (stage == MESA_SHADER_FRAGMENT) {
       brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
       if (dispatch_width == 8)
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index f8bc46c710a..108e5b34d9f 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -358,12 +358,11 @@ public:
    void lower_uniform_pull_constant_loads();
    bool lower_load_payload();
 
-   void try_rep_send();
-
    void push_force_uncompressed();
    void pop_force_uncompressed();
 
    void emit_dummy_fs();
+   void emit_repclear_shader();
    fs_reg *emit_fragcoord_interpolation(ir_variable *ir);
    fs_inst *emit_linterp(const fs_reg &attr, const fs_reg &interp,
                          glsl_interp_qualifier interpolation_mode,
-- 
2.30.2