i965/fs: Improve performance of shaders that start out with a discard.

author Eric Anholt <eric@anholt.net>

Thu, 6 Dec 2012 18:15:08 +0000 (10:15 -0800)

committer Eric Anholt <eric@anholt.net>

Tue, 11 Dec 2012 18:13:15 +0000 (10:13 -0800)
author Eric Anholt <eric@anholt.net>
Thu, 6 Dec 2012 18:15:08 +0000 (10:15 -0800)
committer Eric Anholt <eric@anholt.net>
Tue, 11 Dec 2012 18:13:15 +0000 (10:13 -0800)
diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h

index 2b77ae62ff394240d260da6d72319799d22f9f64..40571a4d54d6dbceb6d420b6181ba104f0ded662 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -679,6 +679,7 @@ enum opcode {
     FS_OPCODE_VARYING_PULL_CONSTANT_LOAD,
     FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7,
     FS_OPCODE_MOV_DISPATCH_TO_FLAGS,
+   FS_OPCODE_DISCARD_JUMP,
  
     VS_OPCODE_URB_WRITE,
     VS_OPCODE_SCRATCH_READ,
diff --git a/src/mesa/drivers/dri/i965/brw_eu.h b/src/mesa/drivers/dri/i965/brw_eu.h

index adefcfd9badd39190f1ffe4742b2aa3a3f4f2d6e..b2fa44864db4035d2f24c55707ebf05879e6d855 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_eu.h
+++ b/src/mesa/drivers/dri/i965/brw_eu.h
@@ -1032,6 +1032,7 @@ struct brw_instruction *brw_WHILE(struct brw_compile *p);
  struct brw_instruction *brw_BREAK(struct brw_compile *p);
  struct brw_instruction *brw_CONT(struct brw_compile *p);
  struct brw_instruction *gen6_CONT(struct brw_compile *p);
+struct brw_instruction *gen6_HALT(struct brw_compile *p);
  /* Forward jumps:
   */
  void brw_land_fwd_jump(struct brw_compile *p, int jmp_insn_idx);
diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c

index fb1255f728cebe8ea24138f2b7ed77976b365ca2..dd91a3087c8151af47f14deaa7c59d43b2b12fe8 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_eu_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c
@@ -1461,6 +1461,24 @@ struct brw_instruction *brw_CONT(struct brw_compile *p)
     return insn;
  }
  
+struct brw_instruction *gen6_HALT(struct brw_compile *p)
+{
+   struct brw_instruction *insn;
+
+   insn = next_insn(p, BRW_OPCODE_HALT);
+   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+   brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+   brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
+
+   if (p->compressed) {
+      insn->header.execution_size = BRW_EXECUTE_16;
+   } else {
+      insn->header.compression_control = BRW_COMPRESSION_NONE;
+      insn->header.execution_size = BRW_EXECUTE_8;
+   }
+   return insn;
+}
+
  /* DO/WHILE loop:
   *
   * The DO/WHILE is just an unterminated loop -- break or continue are
@@ -2302,8 +2320,8 @@ brw_find_next_block_end(struct brw_compile *p, int start)
          return ip;
        }
     }
-   assert(!"not reached");
-   return start + 1;
+
+   return 0;
  }
  
  /* There is no DO instruction on gen6, so to find the end of the loop
@@ -2336,7 +2354,7 @@ brw_find_loop_end(struct brw_compile *p, int start)
  }
  
  /* After program generation, go back and update the UIP and JIP of
- * BREAK and CONT instructions to their correct locations.
+ * BREAK, CONT, and HALT instructions to their correct locations.
   */
  void
  brw_set_uip_jip(struct brw_compile *p)
@@ -2360,24 +2378,45 @@ brw_set_uip_jip(struct brw_compile *p)
          continue;
        }
  
+      int block_end_ip = brw_find_next_block_end(p, ip);
        switch (insn->header.opcode) {
        case BRW_OPCODE_BREAK:
-        insn->bits3.break_cont.jip =
-            (brw_find_next_block_end(p, ip) - ip) / scale;
+         assert(block_end_ip != 0);
+        insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
          /* Gen7 UIP points to WHILE; Gen6 points just after it */
          insn->bits3.break_cont.uip =
             (brw_find_loop_end(p, ip) - ip +
               (intel->gen == 6 ? 16 : 0)) / scale;
          break;
        case BRW_OPCODE_CONTINUE:
-        insn->bits3.break_cont.jip =
-            (brw_find_next_block_end(p, ip) - ip) / scale;
+         assert(block_end_ip != 0);
+        insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
          insn->bits3.break_cont.uip =
              (brw_find_loop_end(p, ip) - ip) / scale;
  
          assert(insn->bits3.break_cont.uip != 0);
          assert(insn->bits3.break_cont.jip != 0);
          break;
+      case BRW_OPCODE_HALT:
+        /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
+         *
+         *    "In case of the halt instruction not inside any conditional
+         *     code block, the value of <JIP> and <UIP> should be the
+         *     same. In case of the halt instruction inside conditional code
+         *     block, the <UIP> should be the end of the program, and the
+         *     <JIP> should be end of the most inner conditional code block."
+         *
+         * The uip will have already been set by whoever set up the
+         * instruction.
+         */
+        if (block_end_ip == 0) {
+           insn->bits3.break_cont.jip = insn->bits3.break_cont.uip;
+        } else {
+           insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
+        }
+        assert(insn->bits3.break_cont.uip != 0);
+        assert(insn->bits3.break_cont.jip != 0);
+        break;
        }
     }
  }
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h

index b00755f953d95c7c12125a33c14a51cbab2a34bd..d4ddb478fd25099724d53d9457627ddcefce5ed0 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -129,6 +129,26 @@ static const fs_reg reg_undef;
  static const fs_reg reg_null_f(ARF, BRW_ARF_NULL, BRW_REGISTER_TYPE_F);
  static const fs_reg reg_null_d(ARF, BRW_ARF_NULL, BRW_REGISTER_TYPE_D);
  
+class ip_record : public exec_node {
+public:
+   static void* operator new(size_t size, void *ctx)
+   {
+      void *node;
+
+      node = rzalloc_size(ctx, size);
+      assert(node != NULL);
+
+      return node;
+   }
+
+   ip_record(int ip)
+   {
+      this->ip = ip;
+   }
+
+   int ip;
+};
+
  class fs_inst : public backend_instruction {
  public:
     /* Callers of this ralloc-based new need not call delete. It's
@@ -516,6 +536,9 @@ private:
                                                   struct brw_reg index,
                                                   struct brw_reg offset);
     void generate_mov_dispatch_to_flags(fs_inst *inst);
+   void generate_discard_jump(fs_inst *inst);
+
+   void patch_discard_jumps_to_fb_writes();
  
     struct brw_context *brw;
     struct intel_context *intel;
@@ -530,6 +553,7 @@ private:
  
     unsigned dispatch_width; /**< 8 or 16 */
  
+   exec_list discard_halt_patches;
     bool dual_source_output;
     void *mem_ctx;
  };
diff --git a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp

index f185eb52104d85b57da2dbe1ed1673512710de4f..9a891414e623088310893e25b3493613d7b8dd75 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
@@ -60,6 +60,41 @@ fs_generator::~fs_generator()
  {
  }
  
+void
+fs_generator::patch_discard_jumps_to_fb_writes()
+{
+   if (intel->gen < 6 || this->discard_halt_patches.is_empty())
+      return;
+
+   /* There is a somewhat strange undocumented requirement of using
+    * HALT, according to the simulator.  If some channel has HALTed to
+    * a particular UIP, then by the end of the program, every channel
+    * must have HALTed to that UIP.  Furthermore, the tracking is a
+    * stack, so you can't do the final halt of a UIP after starting
+    * halting to a new UIP.
+    *
+    * Symptoms of not emitting this instruction on actual hardware
+    * included GPU hangs and sparkly rendering on the piglit discard
+    * tests.
+    */
+   struct brw_instruction *last_halt = gen6_HALT(p);
+   last_halt->bits3.break_cont.uip = 2;
+   last_halt->bits3.break_cont.jip = 2;
+
+   int ip = p->nr_insn;
+
+   foreach_list(node, &this->discard_halt_patches) {
+      ip_record *patch_ip = (ip_record *)node;
+      struct brw_instruction *patch = &p->store[patch_ip->ip];
+
+      assert(patch->header.opcode == BRW_OPCODE_HALT);
+      /* HALT takes a half-instruction distance from the pre-incremented IP. */
+      patch->bits3.break_cont.uip = (ip - patch_ip->ip) * 2;
+   }
+
+   this->discard_halt_patches.make_empty();
+}
+
  void
  fs_generator::generate_fb_write(fs_inst *inst)
  {
@@ -67,6 +102,12 @@ fs_generator::generate_fb_write(fs_inst *inst)
     struct brw_reg implied_header;
     uint32_t msg_control;
  
+   /* Note that the jumps emitted to this point mean that the g0 ->
+    * base_mrf setup must be inside of this function, so that we jump
+    * to a point containing it.
+    */
+   patch_discard_jumps_to_fb_writes();
+
     /* Header is 2 regs, g0 and g1 are the contents. g0 will be implied
      * move, here's g1.
      */
@@ -524,6 +565,23 @@ fs_generator::generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src
        brw_ADD(p, dst, src0, negate(src1));
  }
  
+void
+fs_generator::generate_discard_jump(fs_inst *inst)
+{
+   assert(intel->gen >= 6);
+
+   /* This HALT will be patched up at FB write time to point UIP at the end of
+    * the program, and at brw_uip_jip() JIP will be set to the end of the
+    * current block (or the program).
+    */
+   this->discard_halt_patches.push_tail(new(mem_ctx) ip_record(p->nr_insn));
+
+   brw_push_insn_state(p);
+   brw_set_mask_control(p, BRW_MASK_DISABLE);
+   gen6_HALT(p);
+   brw_pop_insn_state(p);
+}
+
  void
  fs_generator::generate_spill(fs_inst *inst, struct brw_reg src)
  {
@@ -1085,6 +1143,10 @@ fs_generator::generate_code(exec_list *instructions)
           generate_mov_dispatch_to_flags(inst);
           break;
  
+      case FS_OPCODE_DISCARD_JUMP:
+         generate_discard_jump(inst);
+         break;
+
        case SHADER_OPCODE_SHADER_TIME_ADD:
           brw_shader_time_add(p, inst->base_mrf, SURF_INDEX_WM_SHADER_TIME);
           break;
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp

index 98cd064aceb0f121266a91b6fa2d28346793fd82..56d43309f9d744e7a82a5686b9c101eae7433cb0 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -1446,6 +1446,20 @@ fs_visitor::visit(ir_discard *ir)
                             BRW_CONDITIONAL_NZ));
     cmp->predicate = BRW_PREDICATE_NORMAL;
     cmp->flag_subreg = 1;
+
+   if (intel->gen >= 6) {
+      /* For performance, after a discard, jump to the end of the shader.
+       * However, many people will do foliage by discarding based on a
+       * texture's alpha mask, and then continue on to texture with the
+       * remaining pixels.  To avoid trashing the derivatives for those
+       * texture samples, we'll only jump if all of the pixels in the subspan
+       * have been discarded.
+       */
+      fs_inst *discard_jump = emit(FS_OPCODE_DISCARD_JUMP);
+      discard_jump->flag_subreg = 1;
+      discard_jump->predicate = BRW_PREDICATE_ALIGN1_ANY4H;
+      discard_jump->predicate_inverse = true;
+   }
  }
  
  void
author	Eric Anholt <eric@anholt.net>
	Thu, 6 Dec 2012 18:15:08 +0000 (10:15 -0800)
committer	Eric Anholt <eric@anholt.net>
	Tue, 11 Dec 2012 18:13:15 +0000 (10:13 -0800)
src/mesa/drivers/dri/i965/brw_defines.h		patch \| blob \| history
src/mesa/drivers/dri/i965/brw_eu.h		patch \| blob \| history
src/mesa/drivers/dri/i965/brw_eu_emit.c		patch \| blob \| history
src/mesa/drivers/dri/i965/brw_fs.h		patch \| blob \| history
src/mesa/drivers/dri/i965/brw_fs_emit.cpp		patch \| blob \| history
src/mesa/drivers/dri/i965/brw_fs_visitor.cpp		patch \| blob \| history