i965: Add optimization pass to let us use the replicate data message

author Kristian Høgsberg <krh@bitplanet.net>

Mon, 7 Jul 2014 22:27:17 +0000 (15:27 -0700)

committer Kristian Høgsberg <krh@bitplanet.net>

Fri, 15 Aug 2014 18:25:47 +0000 (11:25 -0700)
author Kristian Høgsberg <krh@bitplanet.net>
Mon, 7 Jul 2014 22:27:17 +0000 (15:27 -0700)
committer Kristian Høgsberg <krh@bitplanet.net>
Fri, 15 Aug 2014 18:25:47 +0000 (11:25 -0700)
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h

index 19331827ce5099194756d73219a1db4fc37db9a6..a8f5d0ffc7b4b9886291547da3e8f6469c30519c 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -1034,6 +1034,7 @@ struct brw_context
     bool has_negative_rhw_bug;
     bool has_pln;
     bool no_simd8;
+   bool use_rep_send;
  
     /**
      * Some versions of Gen hardware don't do centroid interpolation correctly
diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h

index 88f205785bb8909442cba3fc04b4eaa6caa1e1f3..248a866377d7c7d114abda761e64339c4da1fa41 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -850,6 +850,7 @@ enum opcode {
      */
     FS_OPCODE_FB_WRITE = 128,
     FS_OPCODE_BLORP_FB_WRITE,
+   FS_OPCODE_REP_FB_WRITE,
     SHADER_OPCODE_RCP,
     SHADER_OPCODE_RSQ,
     SHADER_OPCODE_SQRT,
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp

index 565189ba3aa0e6d2ac914bb4740b9d6748fba271..f1d3fb8ec679e25977a97c59dce247238b5b62f5 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -2283,6 +2283,100 @@ fs_visitor::compute_to_mrf()
     return progress;
  }
  
+/**
+ * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
+ * instructions to FS_OPCODE_REP_FB_WRITE.
+ *
+ * The pass bails out early unless dispatch_width is 16 and the shader has no
+ * depth output, no aa_dest_stencil_reg / dest_depth_reg payload and no
+ * dual-source output.  Otherwise it walks the instruction list once, and when
+ * an FB_WRITE is reached with a tracked block of four MOVs whose first MRF
+ * destination matches the write's payload, it:
+ *
+ *  - inserts a single vec4 MOV in front of the block and removes four
+ *    instructions after it,
+ *  - switches the write's opcode to FS_OPCODE_REP_FB_WRITE, and
+ *  - shrinks the write's mlen by 7.
+ *
+ * The instruction list is modified in place; nothing is returned.
+ */
+void
+fs_visitor::try_rep_send()
+{
+   int i, count;
+   fs_inst *start = NULL;
+
+   /* From the Ivybridge PRM, Volume 4 Part 1, section 3.9.11.2
+    * ("Message Descriptor - Render Target Write"):
+    *
+    * "SIMD16_REPDATA message must not be used in SIMD8 pixel-shaders."
+    */
+   if (dispatch_width != 16)
+      return;
+
+   /* The constant color write message can't handle anything but the 4 color
+    * values.  We could do MRT, but the loops below would need to understand
+    * handling the header being enabled or disabled on different messages.  It
+    * also requires that the render target be tiled, which might not be the
+    * case for some EGLImage paths or if we some day do rendering to PBOs.
+    */
+   if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH) ||
+       payload.aa_dest_stencil_reg ||
+       payload.dest_depth_reg ||
+       dual_src_output.file != BAD_FILE)
+      return;
+
+   /* The optimization is implemented as one pass through the instruction
+    * list.  We keep track of the most recent block of MOVs into sequential
+    * MRFs from single, sequential float registers (ie uniforms).  Then when
+    * we find an FB_WRITE opcode, we see if the payload registers match the
+    * destination registers in our block of MOVs.
+    *
+    * "count" is the number of MOVs accepted into the current block and
+    * "start" is the first instruction of that block.
+    */
+   count = 0;
+   foreach_in_list_safe(fs_inst, inst, &this->instructions) {
+      /* No block in progress: compare this instruction against itself so
+       * that any qualifying MOV can open a new block.
+       */
+      if (count == 0)
+         start = inst;
+
+      /* The MOV extends the block when its MRF destination is 2 * count
+       * registers past the block's first destination and its HW_REG source
+       * has a reg_offset that is count past the block's first source.
+       *
+       * NOTE(review): count is not reset when an instruction fails this
+       * test, so the tracked MOVs are not required to be adjacent in the
+       * list, while the removal loop further down drops the four
+       * instructions that follow the new MOV.  Confirm that the shaders this
+       * pass is enabled for always emit the four MOVs back to back.
+       */
+      if (inst->opcode == BRW_OPCODE_MOV &&
+         inst->dst.file == MRF &&
+          inst->dst.reg == start->dst.reg + 2 * count &&
+          inst->src[0].file == HW_REG &&
+          inst->src[0].reg_offset == start->src[0].reg_offset + count) {
+         /* start already equals inst here when count == 0 (set above), so
+          * this assignment does not change anything.
+          */
+         if (count == 0)
+            start = inst;
+         count++;
+      }
+
+      /* Only convert a write whose payload begins at the block's first MRF:
+       * either directly at base_mrf, or two registers past base_mrf when the
+       * message carries a header.
+       */
+      if (inst->opcode == FS_OPCODE_FB_WRITE &&
+          count == 4 &&
+          (inst->base_mrf == start->dst.reg ||
+           (inst->base_mrf + 2 == start->dst.reg && inst->header_present))) {
+         fs_inst *mov = MOV(start->dst, start->src[0]);
+
+         /* Make a MOV that moves the four floats into the replicated write
+          * payload.  Since we're running at the very end of code generation
+          * we can use hw registers and generate the stride and offsets we
+          * need for this MOV.  We use the first of the eight registers
+          * allocated for the SIMD16 payload for the four floats.
+          */
+         mov->dst.fixed_hw_reg =
+            brw_vec4_reg(BRW_MESSAGE_REGISTER_FILE,
+                         start->dst.reg, 0);
+         mov->dst.file = HW_REG;
+         /* Overwritten with BRW_REGISTER_TYPE_F a few lines below. */
+         mov->dst.type = mov->dst.fixed_hw_reg.type;
+
+         /* Re-describe the source as a vec4 region starting at the same
+          * hardware register number as the first MOV's source.
+          */
+         mov->src[0].fixed_hw_reg =
+            brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
+         mov->src[0].file = HW_REG;
+         mov->src[0].type = mov->src[0].fixed_hw_reg.type;
+         mov->force_writemask_all = true;
+         mov->dst.type = BRW_REGISTER_TYPE_F;
+
+         /* Replace the four MOVs with the new vec4 MOV.  The new MOV is
+          * linked in front of "start", so each iteration removes whatever
+          * instruction currently follows it.
+          */
+         start->insert_before(mov);
+         for (i = 0; i < 4; i++)
+            mov->next->remove();
+
+         /* Finally, adjust the message length and set the opcode to
+          * REP_FB_WRITE for the send, so that the generator will use the
+          * replicated data message type.  Then reset count so we'll start
+          * looking for a new block in case we're in a MRT shader.
+          */
+         inst->opcode = FS_OPCODE_REP_FB_WRITE;
+         inst->mlen -= 7;
+         count = 0;
+      }
+   }
+
+   return;
+}
+
  /**
   * Walks through basic blocks, looking for repeated MRF writes and
   * removing the later ones.
@@ -3226,6 +3320,9 @@ fs_visitor::run()
        prog_data->total_scratch = brw_get_scratch_size(last_scratch);
     }
  
+   if (brw->use_rep_send)
+      try_rep_send();
+
     if (dispatch_width == 8)
        prog_data->reg_blocks = brw_register_blocks(grf_used);
     else
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h

index 0f8fb2d8c551420d4b3c632e3e29cd8138472958..9e5b5d7eff38037962cb48504d0c736d645c573c 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -355,6 +355,8 @@ public:
     void lower_uniform_pull_constant_loads();
     bool lower_load_payload();
  
+   void try_rep_send();
+
     void push_force_uncompressed();
     void pop_force_uncompressed();
  
@@ -590,6 +592,7 @@ private:
                        GLuint nr);
     void generate_fb_write(fs_inst *inst);
     void generate_blorp_fb_write(fs_inst *inst);
+   void generate_rep_fb_write(fs_inst *inst);
     void generate_pixel_xy(struct brw_reg dst, bool is_x);
     void generate_linterp(fs_inst *inst, struct brw_reg dst,
                          struct brw_reg *src);
diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp

index 1cf5a886fa2dedb5909099388986069286b22cce..a2430034086079947eb8fb38079653c7aa451ef8 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -115,7 +115,9 @@ fs_generator::fire_fb_write(fs_inst *inst,
        brw_pop_insn_state(p);
     }
  
-   if (prog_data->dual_src_blend)
+   if (inst->opcode == FS_OPCODE_REP_FB_WRITE)
+      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED;
+   else if (prog_data->dual_src_blend)
        msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
     else if (dispatch_width == 16)
        msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
@@ -1839,6 +1841,7 @@ fs_generator::generate_code(exec_list *instructions)
          generate_varying_pull_constant_load_gen7(inst, dst, src[0], src[1]);
          break;
  
+      case FS_OPCODE_REP_FB_WRITE:
        case FS_OPCODE_FB_WRITE:
          generate_fb_write(inst);
          break;
author	Kristian Høgsberg <krh@bitplanet.net>
	Mon, 7 Jul 2014 22:27:17 +0000 (15:27 -0700)
committer	Kristian Høgsberg <krh@bitplanet.net>
	Fri, 15 Aug 2014 18:25:47 +0000 (11:25 -0700)
src/mesa/drivers/dri/i965/brw_context.h		patch \| blob \| history
src/mesa/drivers/dri/i965/brw_defines.h		patch \| blob \| history
src/mesa/drivers/dri/i965/brw_fs.cpp		patch \| blob \| history
src/mesa/drivers/dri/i965/brw_fs.h		patch \| blob \| history
src/mesa/drivers/dri/i965/brw_fs_generator.cpp		patch \| blob \| history