i965/fs: Combine tex/fb_write operations (opt)

author Ben Widawsky <benjamin.widawsky@intel.com>

Sun, 8 Feb 2015 21:59:57 +0000 (13:59 -0800)

committer Ben Widawsky <benjamin.widawsky@intel.com>

Tue, 14 Apr 2015 22:22:47 +0000 (15:22 -0700)
author Ben Widawsky <benjamin.widawsky@intel.com>
Sun, 8 Feb 2015 21:59:57 +0000 (13:59 -0800)
committer Ben Widawsky <benjamin.widawsky@intel.com>
Tue, 14 Apr 2015 22:22:47 +0000 (15:22 -0700)
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp

index f04fb59f69faa8a63809d0a701feb27ad20a5386..7cc88eade45482550875410286d457d826481d6d 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -2555,6 +2555,94 @@ fs_visitor::opt_algebraic()
     return progress;
  }
  
+/**
+ * Optimize sample messages which are followed by the final RT write.
+ *
+ * CHV, and GEN9+ can mark a texturing SEND instruction with EOT to have its
+ * results sent directly to the framebuffer, bypassing the EU.  Recognize the
+ * final texturing results copied to the framebuffer write payload and modify
+ * them to write to the framebuffer directly.
+ */
+bool
+fs_visitor::opt_sampler_eot()
+{
+   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
+
+   if (brw->gen < 9 && !brw->is_cherryview)
+      return false;
+
+   /* FINISHME: It should be possible to implement this optimization when there
+    * are multiple drawbuffers.
+    */
+   if (key->nr_color_regions != 1)
+      return false;
+
+   /* Look for a texturing instruction immediately before the final FB_WRITE. */
+   fs_inst *fb_write = (fs_inst *) cfg->blocks[cfg->num_blocks - 1]->end();
+   assert(fb_write->eot);
+   assert(fb_write->opcode == FS_OPCODE_FB_WRITE);
+
+   fs_inst *tex_inst = (fs_inst *) fb_write->prev;
+
+   /* There wasn't one; nothing to do. */
+   if (unlikely(tex_inst->is_head_sentinel()) || !tex_inst->is_tex())
+      return false;
+
+   /* If there's no header present, we need to munge the LOAD_PAYLOAD as well.
+    * It's very likely to be the previous instruction.
+    */
+   fs_inst *load_payload = (fs_inst *) tex_inst->prev;
+   if (load_payload->is_head_sentinel() ||
+       load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
+      return false;
+
+   assert(!tex_inst->eot); /* We can't get here twice */
+   assert((tex_inst->offset & (0xff << 24)) == 0);
+
+   tex_inst->offset |= fb_write->target << 24;
+   tex_inst->eot = true;
+   fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
+
+   /* If a header is present, marking the eot is sufficient. Otherwise, we need
+    * to create a new LOAD_PAYLOAD command with the same sources and a space
+    * saved for the header. Using a new destination register not only makes sure
+    * we have enough space, but it will make sure the dead code eliminator kills
+    * the instruction that this will replace.
+    */
+   if (tex_inst->header_present)
+      return true;
+
+   fs_reg send_header = vgrf(load_payload->sources + 1);
+   fs_reg *new_sources =
+      ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);
+
+   new_sources[0] = fs_reg();
+   for (int i = 0; i < load_payload->sources; i++)
+      new_sources[i+1] = load_payload->src[i];
+
+   /* The LOAD_PAYLOAD helper seems like the obvious choice here. However, it
+    * requires a lot of information about the sources to appropriately figure
+    * out the number of registers needed to be used. Given this stage in our
+    * optimization, we may not have the appropriate GRFs required by
+    * LOAD_PAYLOAD at this point (copy propagation). Therefore, we need to
+    * manually emit the instruction.
+    */
+   fs_inst *new_load_payload = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD,
+                                                    load_payload->exec_size,
+                                                    send_header,
+                                                    new_sources,
+                                                    load_payload->sources + 1);
+
+   new_load_payload->regs_written = load_payload->regs_written + 1;
+   tex_inst->mlen++;
+   tex_inst->header_present = true;
+   tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload);
+   tex_inst->src[0] = send_header;
+   tex_inst->dst = reg_null_ud;
+
+   return true;
+}
+
  bool
  fs_visitor::opt_register_renaming()
  {
@@ -3761,6 +3849,8 @@ fs_visitor::optimize()
  
     pass_num = 0;
  
+   OPT(opt_sampler_eot);
+
     if (OPT(lower_load_payload)) {
        split_virtual_grfs();
        OPT(register_coalesce);
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h

index cfdbf555d62c06f50948925b9ab48acaac044b2c..32063f01b8c5034294c1e584eb2fc7a8bb976644 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -231,6 +231,8 @@ public:
     bool compute_to_mrf();
     bool dead_code_eliminate();
     bool remove_duplicate_mrf_writes();
+
+   bool opt_sampler_eot();
     bool virtual_grf_interferes(int a, int b);
     void schedule_instructions(instruction_scheduler_mode mode);
     void insert_gen4_send_dependency_workarounds();
diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp

index 7c000206d0954b967323dcdcafa4ff40747ac17e..b06a947519c26245f55e82385c973a7ed469e2ce 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -517,6 +517,7 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src
     int rlen = 4;
     uint32_t simd_mode;
     uint32_t return_format;
+   bool is_combined_send = inst->eot;
  
     switch (dst.type) {
     case BRW_REGISTER_TYPE_D:
@@ -688,6 +689,11 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src
        dst = vec16(dst);
     }
  
+   if (is_combined_send) {
+      assert(brw->gen >= 9 || brw->is_cherryview);
+      rlen = 0;
+   }
+
     assert(brw->gen < 7 || !inst->header_present ||
            src.file == BRW_GENERAL_REGISTER_FILE);
  
@@ -793,6 +799,11 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src
         * so has already done marking.
         */
     }
+
+   if (is_combined_send) {
+      brw_inst_set_eot(brw, brw_last_inst, true);
+      brw_inst_set_opcode(brw, brw_last_inst, BRW_OPCODE_SENDC);
+   }
  }
author	Ben Widawsky <benjamin.widawsky@intel.com>
	Sun, 8 Feb 2015 21:59:57 +0000 (13:59 -0800)
committer	Ben Widawsky <benjamin.widawsky@intel.com>
	Tue, 14 Apr 2015 22:22:47 +0000 (15:22 -0700)
src/mesa/drivers/dri/i965/brw_fs.cpp		patch \| blob \| history
src/mesa/drivers/dri/i965/brw_fs.h		patch \| blob \| history
src/mesa/drivers/dri/i965/brw_fs_generator.cpp		patch \| blob \| history