i965: Note that write-after-write dependencies are blocking.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs_schedule_instructions.cpp
index 0ea4e5c36f091b0e47b982c472a0c3b6dbb71a9a..72b3729d5d0c490cada9881d67875991a125429d 100644 (file)
@@ -26,9 +26,9 @@
  */
 
 #include "brw_fs.h"
-#include "../glsl/glsl_types.h"
-#include "../glsl/ir_optimization.h"
-#include "../glsl/ir_print_visitor.h"
+#include "glsl/glsl_types.h"
+#include "glsl/ir_optimization.h"
+#include "glsl/ir_print_visitor.h"
 
 /** @file brw_fs_schedule_instructions.cpp
  *
@@ -36,7 +36,7 @@
  *
  * The basic model of the list scheduler is to take a basic block,
  * compute a DAG of the dependencies (RAW ordering with latency, WAW
- * ordering, WAR ordering), and make a list of the DAG heads.
+ * ordering with latency, WAR ordering), and make a list of the DAG heads.
  * Heuristically pick a DAG head, then put all the children that are
  * now DAG heads into the list of things to schedule.
  *
  * from in picking among those.
  */
 
+static bool debug = false;
+
 class schedule_node : public exec_node
 {
 public:
-   schedule_node(fs_inst *inst)
+   schedule_node(fs_inst *inst, int gen)
    {
       this->inst = inst;
       this->child_array_size = 0;
@@ -65,39 +67,15 @@ public:
       this->parent_count = 0;
       this->unblocked_time = 0;
 
-      int chans = 8;
-      int math_latency = 22;
-
-      switch (inst->opcode) {
-      case SHADER_OPCODE_RCP:
-        this->latency = 1 * chans * math_latency;
-        break;
-      case SHADER_OPCODE_RSQ:
-        this->latency = 2 * chans * math_latency;
-        break;
-      case SHADER_OPCODE_SQRT:
-      case SHADER_OPCODE_LOG2:
-        /* full precision log.  partial is 2. */
-        this->latency = 3 * chans * math_latency;
-        break;
-      case SHADER_OPCODE_EXP2:
-        /* full precision.  partial is 3, same throughput. */
-        this->latency = 4 * chans * math_latency;
-        break;
-      case SHADER_OPCODE_POW:
-        this->latency = 8 * chans * math_latency;
-        break;
-      case SHADER_OPCODE_SIN:
-      case SHADER_OPCODE_COS:
-        /* minimum latency, max is 12 rounds. */
-        this->latency = 5 * chans * math_latency;
-        break;
-      default:
-        this->latency = 2;
-        break;
-      }
+      if (gen >= 7)
+         set_latency_gen7();
+      else
+         set_latency_gen4();
    }
 
+   void set_latency_gen4();
+   void set_latency_gen7();
+
    fs_inst *inst;
    schedule_node **children;
    int *child_latency;
@@ -108,15 +86,227 @@ public:
    int latency;
 };
 
+void
+schedule_node::set_latency_gen4()
+{
+   int chans = 8;
+   int math_latency = 22;
+
+   switch (inst->opcode) {
+   case SHADER_OPCODE_RCP:
+      this->latency = 1 * chans * math_latency;
+      break;
+   case SHADER_OPCODE_RSQ:
+      this->latency = 2 * chans * math_latency;
+      break;
+   case SHADER_OPCODE_INT_QUOTIENT:
+   case SHADER_OPCODE_SQRT:
+   case SHADER_OPCODE_LOG2:
+      /* full precision log.  partial is 2. */
+      this->latency = 3 * chans * math_latency;
+      break;
+   case SHADER_OPCODE_INT_REMAINDER:
+   case SHADER_OPCODE_EXP2:
+      /* full precision.  partial is 3, same throughput. */
+      this->latency = 4 * chans * math_latency;
+      break;
+   case SHADER_OPCODE_POW:
+      this->latency = 8 * chans * math_latency;
+      break;
+   case SHADER_OPCODE_SIN:
+   case SHADER_OPCODE_COS:
+      /* minimum latency, max is 12 rounds. */
+      this->latency = 5 * chans * math_latency;
+      break;
+   default:
+      this->latency = 2;
+      break;
+   }
+}
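/* To give a sense of scale for the gen4 numbers above: with chans = 8 and
 * math_latency = 22, a SHADER_OPCODE_POW node is assigned 8 * 8 * 22 = 1408
 * cycles of latency, SIN/COS get 5 * 8 * 22 = 880, while an ordinary ALU
 * instruction falls through to the default of just 2 cycles.
 */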
+
+void
+schedule_node::set_latency_gen7()
+{
+   switch (inst->opcode) {
+   case BRW_OPCODE_MAD:
+      /* 3 cycles (this is said to be 4 cycles sometimes depending on the
+       * register numbers in the sources):
+       * mad(8) g4<1>F g2.2<4,1,1>F.x  g2<4,1,1>F.x g2.1<4,1,1>F.x { align16 WE_normal 1Q };
+       *
+       * 20 cycles:
+       * mad(8) g4<1>F g2.2<4,1,1>F.x  g2<4,1,1>F.x g2.1<4,1,1>F.x { align16 WE_normal 1Q };
+       * mov(8) null   g4<4,4,1>F                     { align16 WE_normal 1Q };
+       */
+      latency = 17;
+      break;
+
+   case SHADER_OPCODE_RCP:
+   case SHADER_OPCODE_RSQ:
+   case SHADER_OPCODE_SQRT:
+   case SHADER_OPCODE_LOG2:
+   case SHADER_OPCODE_EXP2:
+   case SHADER_OPCODE_SIN:
+   case SHADER_OPCODE_COS:
+      /* 2 cycles:
+       * math inv(8) g4<1>F g2<0,1,0>F      null       { align1 WE_normal 1Q };
+       *
+       * 18 cycles:
+       * math inv(8) g4<1>F g2<0,1,0>F      null       { align1 WE_normal 1Q };
+       * mov(8)      null   g4<8,8,1>F                 { align1 WE_normal 1Q };
+       *
+       * Same for exp2, log2, rsq, sqrt, sin, cos.
+       */
+      latency = 16;
+      break;
+
+   case SHADER_OPCODE_POW:
+      /* 2 cycles:
+       * math pow(8) g4<1>F g2<0,1,0>F   g2.1<0,1,0>F  { align1 WE_normal 1Q };
+       *
+       * 26 cycles:
+       * math pow(8) g4<1>F g2<0,1,0>F   g2.1<0,1,0>F  { align1 WE_normal 1Q };
+       * mov(8)      null   g4<8,8,1>F                 { align1 WE_normal 1Q };
+       */
+      latency = 24;
+      break;
+
+   case SHADER_OPCODE_TEX:
+   case SHADER_OPCODE_TXD:
+   case SHADER_OPCODE_TXF:
+   case SHADER_OPCODE_TXL:
+      /* 18 cycles:
+       * mov(8)  g115<1>F   0F                         { align1 WE_normal 1Q };
+       * mov(8)  g114<1>F   0F                         { align1 WE_normal 1Q };
+       * send(8) g4<1>UW    g114<8,8,1>F
+       *   sampler (10, 0, 0, 1) mlen 2 rlen 4         { align1 WE_normal 1Q };
+       *
+       * 697 +/- 49 cycles (min 610, n=26):
+       * mov(8)  g115<1>F   0F                         { align1 WE_normal 1Q };
+       * mov(8)  g114<1>F   0F                         { align1 WE_normal 1Q };
+       * send(8) g4<1>UW    g114<8,8,1>F
+       *   sampler (10, 0, 0, 1) mlen 2 rlen 4         { align1 WE_normal 1Q };
+       * mov(8)  null       g4<8,8,1>F                 { align1 WE_normal 1Q };
+       *
+       * So the latency of our first texture load of the batchbuffer is
+       * ~700 cycles, since the caches are cold at that point.
+       *
+       * 840 +/- 92 cycles (min 720, n=25):
+       * mov(8)  g115<1>F   0F                         { align1 WE_normal 1Q };
+       * mov(8)  g114<1>F   0F                         { align1 WE_normal 1Q };
+       * send(8) g4<1>UW    g114<8,8,1>F
+       *   sampler (10, 0, 0, 1) mlen 2 rlen 4         { align1 WE_normal 1Q };
+       * mov(8)  null       g4<8,8,1>F                 { align1 WE_normal 1Q };
+       * send(8) g4<1>UW    g114<8,8,1>F
+       *   sampler (10, 0, 0, 1) mlen 2 rlen 4         { align1 WE_normal 1Q };
+       * mov(8)  null       g4<8,8,1>F                 { align1 WE_normal 1Q };
+       *
+       * On the second load, it takes just an extra ~140 cycles, and after
+       * accounting for the 14 cycles of the MOV's latency, that makes ~130.
+       *
+       * 683 +/- 49 cycles (min = 602, n=47):
+       * mov(8)  g115<1>F   0F                         { align1 WE_normal 1Q };
+       * mov(8)  g114<1>F   0F                         { align1 WE_normal 1Q };
+       * send(8) g4<1>UW    g114<8,8,1>F
+       *   sampler (10, 0, 0, 1) mlen 2 rlen 4         { align1 WE_normal 1Q };
+       * send(8) g50<1>UW   g114<8,8,1>F
+       *   sampler (10, 0, 0, 1) mlen 2 rlen 4         { align1 WE_normal 1Q };
+       * mov(8)  null       g4<8,8,1>F                 { align1 WE_normal 1Q };
+       *
+       * The unit appears to be pipelined, since this matches up with the
+       * cache-cold case, despite there being two loads here.  If you replace
+       * the g4 in the MOV to null with g50, it's still 693 +/- 52 (n=39).
+       *
+       * So, take some number between the cache-hot 140 cycles and the
+       * cache-cold 700 cycles.  No particular tuning was done on this.
+       *
+       * I haven't done significant testing of the non-TEX opcodes.  TXL at
+       * least looked about the same as TEX.
+       */
+      latency = 200;
+      break;
+
+   case SHADER_OPCODE_TXS:
+      /* Testing textureSize(sampler2D, 0), one load was 420 +/- 41
+       * cycles (n=15):
+       * mov(8)   g114<1>UD  0D                        { align1 WE_normal 1Q };
+       * send(8)  g6<1>UW    g114<8,8,1>F
+       *   sampler (10, 0, 10, 1) mlen 1 rlen 4        { align1 WE_normal 1Q };
+       * mov(16)  g6<1>F     g6<8,8,1>D                { align1 WE_normal 1Q };
+       *
+       * Two loads was 535 +/- 30 cycles (n=19):
+       * mov(16)   g114<1>UD  0D                       { align1 WE_normal 1H };
+       * send(16)  g6<1>UW    g114<8,8,1>F
+       *   sampler (10, 0, 10, 2) mlen 2 rlen 8        { align1 WE_normal 1H };
+       * mov(16)   g114<1>UD  0D                       { align1 WE_normal 1H };
+       * mov(16)   g6<1>F     g6<8,8,1>D               { align1 WE_normal 1H };
+       * send(16)  g8<1>UW    g114<8,8,1>F
+       *   sampler (10, 0, 10, 2) mlen 2 rlen 8        { align1 WE_normal 1H };
+       * mov(16)   g8<1>F     g8<8,8,1>D               { align1 WE_normal 1H };
+       * add(16)   g6<1>F     g6<8,8,1>F   g8<8,8,1>F  { align1 WE_normal 1H };
+       *
+       * Since the only cache that should matter is the instruction/state
+       * cache containing the surface state, assume that we always have hot
+       * caches.
+       */
+      latency = 100;
+      break;
+
+   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
+   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
+      /* testing using varying-index pull constants:
+       *
+       * 16 cycles:
+       * mov(8)  g4<1>D  g2.1<0,1,0>F                  { align1 WE_normal 1Q };
+       * send(8) g4<1>F  g4<8,8,1>D
+       *   data (9, 2, 3) mlen 1 rlen 1                { align1 WE_normal 1Q };
+       *
+       * ~480 cycles:
+       * mov(8)  g4<1>D  g2.1<0,1,0>F                  { align1 WE_normal 1Q };
+       * send(8) g4<1>F  g4<8,8,1>D
+       *   data (9, 2, 3) mlen 1 rlen 1                { align1 WE_normal 1Q };
+       * mov(8)  null    g4<8,8,1>F                    { align1 WE_normal 1Q };
+       *
+       * ~620 cycles:
+       * mov(8)  g4<1>D  g2.1<0,1,0>F                  { align1 WE_normal 1Q };
+       * send(8) g4<1>F  g4<8,8,1>D
+       *   data (9, 2, 3) mlen 1 rlen 1                { align1 WE_normal 1Q };
+       * mov(8)  null    g4<8,8,1>F                    { align1 WE_normal 1Q };
+       * send(8) g4<1>F  g4<8,8,1>D
+       *   data (9, 2, 3) mlen 1 rlen 1                { align1 WE_normal 1Q };
+       * mov(8)  null    g4<8,8,1>F                    { align1 WE_normal 1Q };
+       *
+       * So, if it's cache-hot, it's about 140.  If it's cache-cold, it's
+       * about 460.  We expect to mostly be cache-hot, so pick something more
+       * in that direction.
+       */
+      latency = 200;
+      break;
+
+   default:
+      /* 2 cycles:
+       * mul(8) g4<1>F g2<0,1,0>F      0.5F            { align1 WE_normal 1Q };
+       *
+       * 16 cycles:
+       * mul(8) g4<1>F g2<0,1,0>F      0.5F            { align1 WE_normal 1Q };
+       * mov(8) null   g4<8,8,1>F                      { align1 WE_normal 1Q };
+       */
+      latency = 14;
+      break;
+   }
+}
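/* Rough context for how these gen7 numbers get used (the exact bookkeeping
 * is in schedule_instructions() further down): each latency becomes the
 * default weight of the DAG edges leaving that node, so if a texture send
 * (latency 200) issues at clock t, an instruction reading its result isn't
 * treated as ready until about t + 200, whereas a plain ALU producer only
 * pushes its consumers out by the default 14 cycles.
 */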
+
 class instruction_scheduler {
 public:
-   instruction_scheduler(fs_visitor *v, void *mem_ctx, int virtual_grf_count)
+   instruction_scheduler(fs_visitor *v, void *mem_ctx, int grf_count,
+                         bool post_reg_alloc)
    {
       this->v = v;
       this->mem_ctx = ralloc_context(mem_ctx);
-      this->virtual_grf_count = virtual_grf_count;
+      this->grf_count = grf_count;
       this->instructions.make_empty();
       this->instructions_to_schedule = 0;
+      this->post_reg_alloc = post_reg_alloc;
    }
 
    ~instruction_scheduler()
@@ -135,8 +325,9 @@ public:
 
    void *mem_ctx;
 
+   bool post_reg_alloc;
    int instructions_to_schedule;
-   int virtual_grf_count;
+   int grf_count;
    exec_list instructions;
    fs_visitor *v;
 };
@@ -144,7 +335,7 @@ public:
 void
 instruction_scheduler::add_inst(fs_inst *inst)
 {
-   schedule_node *n = new(mem_ctx) schedule_node(inst);
+   schedule_node *n = new(mem_ctx) schedule_node(inst, v->intel->gen);
 
    assert(!inst->is_head_sentinel());
    assert(!inst->is_tail_sentinel());
@@ -237,7 +428,7 @@ instruction_scheduler::add_barrier_deps(schedule_node *n)
 bool
 instruction_scheduler::is_compressed(fs_inst *inst)
 {
-   return (v->c->dispatch_width == 16 &&
+   return (v->dispatch_width == 16 &&
           !inst->force_uncompressed &&
           !inst->force_sechalf);
 }
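/* For reference in the dependency tracking below: at dispatch_width == 16 an
 * instruction is normally compressed (unless force_uncompressed or
 * force_sechalf splits it), it costs 4 cycles to dispatch instead of 2, and
 * reg_width works out to 16 / 8 == 2, so after register allocation each
 * logical register spans two consecutive hardware GRFs, both of which have
 * to be tracked in last_grf_write[].
 */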
@@ -245,15 +436,21 @@ instruction_scheduler::is_compressed(fs_inst *inst)
 void
 instruction_scheduler::calculate_deps()
 {
-   schedule_node *last_grf_write[virtual_grf_count];
+   /* Pre-register-allocation, this tracks the last write per VGRF (so
+    * different reg_offsets within it can interfere when they shouldn't).
+    * After register allocation, reg_offsets are gone and we track individual
+    * GRF registers.
+    */
+   schedule_node *last_grf_write[grf_count];
    schedule_node *last_mrf_write[BRW_MAX_MRF];
-   schedule_node *last_conditional_mod = NULL;
+   schedule_node *last_conditional_mod[2] = { NULL, NULL };
    /* Fixed HW registers are assumed to be separate from the virtual
     * GRFs, so they can be tracked separately.  We don't really write
     * to fixed GRFs much, so don't bother tracking them on a more
     * granular level.
     */
    schedule_node *last_fixed_grf_write = NULL;
+   int reg_width = v->dispatch_width / 8;
 
    /* The last instruction always needs to still be the last
     * instruction.  Either it's flow control (IF, ELSE, ENDIF, DO,
@@ -275,11 +472,21 @@ instruction_scheduler::calculate_deps()
       /* read-after-write deps. */
       for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF) {
-           add_dep(last_grf_write[inst->src[i].reg], n);
+            if (post_reg_alloc) {
+               for (int r = 0; r < reg_width; r++)
+                  add_dep(last_grf_write[inst->src[i].reg + r], n);
+            } else {
+               add_dep(last_grf_write[inst->src[i].reg], n);
+            }
         } else if (inst->src[i].file == FIXED_HW_REG &&
                    (inst->src[i].fixed_hw_reg.file ==
                     BRW_GENERAL_REGISTER_FILE)) {
-           add_dep(last_fixed_grf_write, n);
+           if (post_reg_alloc) {
+               for (int r = 0; r < reg_width; r++)
+                  add_dep(last_grf_write[inst->src[i].fixed_hw_reg.nr + r], n);
+            } else {
+               add_dep(last_fixed_grf_write, n);
+            }
         } else if (inst->src[i].file != BAD_FILE &&
                    inst->src[i].file != IMM &&
                    inst->src[i].file != UNIFORM) {
@@ -296,15 +503,21 @@ instruction_scheduler::calculate_deps()
         add_dep(last_mrf_write[inst->base_mrf + i], n);
       }
 
-      if (inst->predicated) {
-        assert(last_conditional_mod);
-        add_dep(last_conditional_mod, n);
+      if (inst->predicate) {
+        add_dep(last_conditional_mod[inst->flag_subreg], n);
       }
 
       /* write-after-write deps. */
       if (inst->dst.file == GRF) {
-        add_dep(last_grf_write[inst->dst.reg], n);
-        last_grf_write[inst->dst.reg] = n;
+         if (post_reg_alloc) {
+            for (int r = 0; r < inst->regs_written() * reg_width; r++) {
+               add_dep(last_grf_write[inst->dst.reg + r], n);
+               last_grf_write[inst->dst.reg + r] = n;
+            }
+         } else {
+            add_dep(last_grf_write[inst->dst.reg], n);
+            last_grf_write[inst->dst.reg] = n;
+         }
       } else if (inst->dst.file == MRF) {
         int reg = inst->dst.reg & ~BRW_MRF_COMPR4;
 
@@ -320,7 +533,12 @@ instruction_scheduler::calculate_deps()
         }
       } else if (inst->dst.file == FIXED_HW_REG &&
                 inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
-        last_fixed_grf_write = n;
+         if (post_reg_alloc) {
+            for (int r = 0; r < reg_width; r++)
+               last_grf_write[inst->dst.fixed_hw_reg.nr + r] = n;
+         } else {
+            last_fixed_grf_write = n;
+         }
       } else if (inst->dst.file != BAD_FILE) {
         add_barrier_deps(n);
       }
@@ -332,16 +550,20 @@ instruction_scheduler::calculate_deps()
         }
       }
 
-      if (inst->conditional_mod) {
-        add_dep(last_conditional_mod, n, 0);
-        last_conditional_mod = n;
+      /* Treat FS_OPCODE_MOV_DISPATCH_TO_FLAGS as though it had a
+       * conditional_mod, because it sets the flag register.
+       */
+      if (inst->conditional_mod ||
+          inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
+        add_dep(last_conditional_mod[inst->flag_subreg], n, 0);
+        last_conditional_mod[inst->flag_subreg] = n;
       }
    }
 
    /* bottom-to-top dependencies: WAR */
    memset(last_grf_write, 0, sizeof(last_grf_write));
    memset(last_mrf_write, 0, sizeof(last_mrf_write));
-   last_conditional_mod = NULL;
+   memset(last_conditional_mod, 0, sizeof(last_conditional_mod));
    last_fixed_grf_write = NULL;
 
    exec_node *node;
@@ -355,12 +577,22 @@ instruction_scheduler::calculate_deps()
       /* write-after-read deps. */
       for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF) {
-           add_dep(n, last_grf_write[inst->src[i].reg]);
+            if (post_reg_alloc) {
+               for (int r = 0; r < reg_width; r++)
+                  add_dep(n, last_grf_write[inst->src[i].reg + r]);
+            } else {
+               add_dep(n, last_grf_write[inst->src[i].reg]);
+            }
         } else if (inst->src[i].file == FIXED_HW_REG &&
                    (inst->src[i].fixed_hw_reg.file ==
                     BRW_GENERAL_REGISTER_FILE)) {
-           add_dep(n, last_fixed_grf_write);
-        } else if (inst->src[i].file != BAD_FILE &&
+           if (post_reg_alloc) {
+               for (int r = 0; r < reg_width; r++)
+                  add_dep(n, last_grf_write[inst->src[i].fixed_hw_reg.nr + r]);
+            } else {
+               add_dep(n, last_fixed_grf_write);
+            }
+         } else if (inst->src[i].file != BAD_FILE &&
                    inst->src[i].file != IMM &&
                    inst->src[i].file != UNIFORM) {
            assert(inst->src[i].file != MRF);
@@ -376,15 +608,20 @@ instruction_scheduler::calculate_deps()
         add_dep(n, last_mrf_write[inst->base_mrf + i], 2);
       }
 
-      if (inst->predicated) {
-        add_dep(n, last_conditional_mod);
+      if (inst->predicate) {
+        add_dep(n, last_conditional_mod[inst->flag_subreg]);
       }
 
       /* Update the things this instruction wrote, so earlier reads
        * can mark this as WAR dependency.
        */
       if (inst->dst.file == GRF) {
-        last_grf_write[inst->dst.reg] = n;
+         if (post_reg_alloc) {
+            for (int r = 0; r < inst->regs_written() * reg_width; r++)
+               last_grf_write[inst->dst.reg + r] = n;
+         } else {
+            last_grf_write[inst->dst.reg] = n;
+         }
       } else if (inst->dst.file == MRF) {
         int reg = inst->dst.reg & ~BRW_MRF_COMPR4;
 
@@ -400,7 +637,12 @@ instruction_scheduler::calculate_deps()
         }
       } else if (inst->dst.file == FIXED_HW_REG &&
                 inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
-        last_fixed_grf_write = n;
+         if (post_reg_alloc) {
+            for (int r = 0; r < reg_width; r++)
+               last_grf_write[inst->dst.fixed_hw_reg.nr + r] = n;
+         } else {
+            last_fixed_grf_write = n;
+         }
       } else if (inst->dst.file != BAD_FILE) {
         add_barrier_deps(n);
       }
@@ -411,8 +653,13 @@ instruction_scheduler::calculate_deps()
         }
       }
 
-      if (inst->conditional_mod)
-        last_conditional_mod = n;
+      /* Treat FS_OPCODE_MOV_DISPATCH_TO_FLAGS as though it had a
+       * conditional_mod, because it sets the flag register.
+       */
+      if (inst->conditional_mod ||
+          inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
+        last_conditional_mod[inst->flag_subreg] = n;
+      }
    }
 }
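/* A small made-up sequence (register numbers are hypothetical) illustrating
 * the edges calculate_deps() records:
 *
 *    add(8) g10  g2   g3     // A: writes g10
 *    mul(8) g11  g10  g4     // B: reads g10    -> RAW edge A->B, with A's latency
 *    mov(8) g10  g5          // C: rewrites g10 -> WAW edge A->C, also blocking
 *                            //    on A's latency
 *
 * The top-to-bottom pass above adds the RAW and WAW edges; the bottom-to-top
 * pass then adds a WAR edge B->C so that C can never be scheduled ahead of
 * the read in B.
 */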
 
@@ -432,13 +679,48 @@ instruction_scheduler::schedule_instructions(fs_inst *next_block_header)
       schedule_node *chosen = NULL;
       int chosen_time = 0;
 
-      foreach_list(node, &instructions) {
-        schedule_node *n = (schedule_node *)node;
-
-        if (!chosen || n->unblocked_time < chosen_time) {
-           chosen = n;
-           chosen_time = n->unblocked_time;
-        }
+      if (post_reg_alloc) {
+         /* Of the instructions ready to execute, or those closest to being
+          * ready, choose the oldest one.
+          */
+         foreach_list(node, &instructions) {
+            schedule_node *n = (schedule_node *)node;
+
+            if (!chosen || n->unblocked_time < chosen_time) {
+               chosen = n;
+               chosen_time = n->unblocked_time;
+            }
+         }
+      } else {
+         /* Before register allocation, we don't care about the latencies of
+          * instructions.  All we care about is reducing live intervals of
+          * variables so that we can avoid register spilling, or get 16-wide
+          * shaders which naturally do a better job of hiding instruction
+          * latency.
+          *
+          * To do so, schedule our instructions in a roughly LIFO/depth-first
+          * order: when new instructions become available as a result of
+          * scheduling something, choose those first so that our result
+          * hopefully is consumed quickly.
+          *
+          * The exception is messages that generate more than one result
+          * register (AKA texturing).  In those cases, the LIFO search would
+          * normally tend to choose them quickly (because scheduling the
+          * previous message not only unblocked the children using its result,
+          * but also the MRF setup for the next sampler message, which in turn
+          * unblocks the next sampler message).  Issuing those sends back to
+          * back piles up their multi-register results and drives up register
+          * pressure, so skip over them here unless nothing else is available.
+          */
+         for (schedule_node *node = (schedule_node *)instructions.get_tail();
+              node != instructions.get_head()->prev;
+              node = (schedule_node *)node->prev) {
+            schedule_node *n = (schedule_node *)node;
+
+            chosen = n;
+            if (chosen->inst->regs_written() <= 1)
+               break;
+         }
+
+         chosen_time = chosen->unblocked_time;
       }
 
       /* Schedule this instruction. */
@@ -447,10 +729,27 @@ instruction_scheduler::schedule_instructions(fs_inst *next_block_header)
       next_block_header->insert_before(chosen->inst);
       instructions_to_schedule--;
 
-      /* Bump the clock.  If we expected a delay for scheduling, then
-       * bump the clock to reflect that.
+      /* Bump the clock.  Instructions in gen hardware are handled one simd4
+       * vector at a time, with 1 cycle per vector dispatched.  Thus 8-wide
+       * pixel shaders take 2 cycles to dispatch and 16-wide (compressed)
+       * instructions take 4.
+       */
+      if (is_compressed(chosen->inst))
+         time += 4;
+      else
+         time += 2;
+
+      /* If we expected a delay for scheduling, then bump the clock to reflect
+       * that as well.  In reality, the hardware will switch to another
+       * hyperthread and may not return to dispatching our thread for a while
+       * even after we're unblocked.
        */
-      time = MAX2(time + 1, chosen_time);
+      time = MAX2(time, chosen_time);
+
+      if (debug) {
+         printf("clock %4d, scheduled: ", time);
+         v->dump_instruction(chosen->inst);
+      }
 
       /* Now that we've scheduled a new instruction, some of its
        * children can be promoted to the list of instructions ready to
@@ -465,14 +764,18 @@ instruction_scheduler::schedule_instructions(fs_inst *next_block_header)
 
         child->parent_count--;
         if (child->parent_count == 0) {
+            if (debug) {
+               printf("now available: ");
+               v->dump_instruction(child->inst);
+            }
            instructions.push_tail(child);
         }
       }
 
-      /* Shared resource: the mathbox.  There's one per EU (on later
-       * generations, it's even more limited pre-gen6), so if we send
-       * something off to it then the next math isn't going to make
-       * progress until the first is done.
+      /* Shared resource: the mathbox.  There's one mathbox per EU on Gen6+
+       * but it's more limited pre-gen6, so if we send something off to it then
+       * the next math instruction isn't going to make progress until the first
+       * is done.
        */
       if (chosen->inst->is_math()) {
         foreach_list(node, &instructions) {
@@ -489,10 +792,22 @@ instruction_scheduler::schedule_instructions(fs_inst *next_block_header)
 }
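/* A self-contained sketch (hypothetical names, not code from this file) of
 * the clock bookkeeping performed by the loop above: the issue cost is 2 or
 * 4 cycles depending on compression, the clock never runs ahead of the
 * chosen node's unblocked time, and each dependent gets pushed out by the
 * edge latency once its producer issues.  MAX2 is the same macro used above.
 */
#ifndef MAX2
#define MAX2(a, b) ((a) > (b) ? (a) : (b))
#endif

struct sketch_node {
   int unblocked_time;   /* earliest clock at which this node may issue */
   bool compressed;      /* true for 16-wide (compressed) instructions */
};

static int
sketch_issue(int clock, const sketch_node *chosen,
             sketch_node **children, const int *child_latency, int child_count)
{
   clock += chosen->compressed ? 4 : 2;          /* dispatch cost */
   clock = MAX2(clock, chosen->unblocked_time);  /* stall if still blocked */

   for (int i = 0; i < child_count; i++) {
      children[i]->unblocked_time =
         MAX2(children[i]->unblocked_time, clock + child_latency[i]);
   }
   return clock;
}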
 
 void
-fs_visitor::schedule_instructions()
+fs_visitor::schedule_instructions(bool post_reg_alloc)
 {
    fs_inst *next_block_header = (fs_inst *)instructions.head;
-   instruction_scheduler sched(this, mem_ctx, this->virtual_grf_next);
+
+   int grf_count;
+   if (post_reg_alloc)
+      grf_count = grf_used;
+   else
+      grf_count = virtual_grf_count;
+
+   if (debug) {
+      printf("\nInstructions before scheduling (reg_alloc %d)\n", post_reg_alloc);
+      dump_instructions();
+   }
+
+   instruction_scheduler sched(this, mem_ctx, grf_count, post_reg_alloc);
 
    while (!next_block_header->is_tail_sentinel()) {
       /* Add things to be scheduled until we get to a new BB. */
@@ -501,19 +816,17 @@ fs_visitor::schedule_instructions()
         next_block_header = (fs_inst *)next_block_header->next;
 
         sched.add_inst(inst);
-        if (inst->opcode == BRW_OPCODE_IF ||
-            inst->opcode == BRW_OPCODE_ELSE ||
-            inst->opcode == BRW_OPCODE_ENDIF ||
-            inst->opcode == BRW_OPCODE_DO ||
-            inst->opcode == BRW_OPCODE_WHILE ||
-            inst->opcode == BRW_OPCODE_BREAK ||
-            inst->opcode == BRW_OPCODE_CONTINUE) {
+         if (inst->is_control_flow())
            break;
-        }
       }
       sched.calculate_deps();
       sched.schedule_instructions(next_block_header);
    }
 
+   if (debug) {
+      printf("\nInstructions after scheduling (reg_alloc %d)\n", post_reg_alloc);
+      dump_instructions();
+   }
+
    this->live_intervals_valid = false;
 }
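/* How a caller would plausibly drive the two passes this file now supports
 * (the real call sites live in the fs_visitor compile path, outside this
 * diff; this is only an illustration): a latency-blind pre-RA pass to shrink
 * live intervals, then a latency-aware post-RA pass once physical GRF
 * numbers exist and grf_used is known.
 */
static void
sketch_schedule_both_passes(fs_visitor *v)
{
   v->schedule_instructions(false);   /* pre-RA: reduce register pressure */

   /* ... register allocation assigns hardware GRFs here ... */

   v->schedule_instructions(true);    /* post-RA: hide instruction latency */
}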