i965: Add support for Broadwell's new register types.
diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
index ccedee3ba58f87cd64bcc003ead22cd73f5b78bd..baf67fb1ea26252f110148cd52a31c5853a35a43 100644
--- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
@@ -29,7 +29,6 @@
 #include "brw_vec4.h"
 #include "glsl/glsl_types.h"
 #include "glsl/ir_optimization.h"
-#include "glsl/ir_print_visitor.h"
 
 using namespace brw;
 
@@ -57,28 +56,12 @@ using namespace brw;
 
 static bool debug = false;
 
+class instruction_scheduler;
+
 class schedule_node : public exec_node
 {
 public:
-   schedule_node(backend_instruction *inst, const struct intel_context *intel)
-   {
-      this->inst = inst;
-      this->child_array_size = 0;
-      this->children = NULL;
-      this->child_latency = NULL;
-      this->child_count = 0;
-      this->parent_count = 0;
-      this->unblocked_time = 0;
-
-      /* We can't measure Gen6 timings directly but expect them to be much
-       * closer to Gen7 than Gen4.
-       */
-      if (intel->gen >= 6)
-         set_latency_gen7(intel->is_haswell);
-      else
-         set_latency_gen4();
-   }
-
+   schedule_node(backend_instruction *inst, instruction_scheduler *sched);
    void set_latency_gen4();
    void set_latency_gen7(bool is_haswell);
 
@@ -90,6 +73,18 @@ public:
    int child_array_size;
    int unblocked_time;
    int latency;
+
+   /**
+    * The iteration of pushing groups of children onto the candidates list
+    * in which this node became a candidate.
+    */
+   unsigned cand_generation;
+
+   /**
+    * This is the instruction's latency plus the maximum delay of its
+    * children, or just its issue_time if it is a leaf node.
+    */
+   int delay;
 };
 
 void
@@ -330,6 +325,59 @@ schedule_node::set_latency_gen7(bool is_haswell)
       latency = 200;
       break;
 
+   case SHADER_OPCODE_GEN7_SCRATCH_READ:
+      /* Testing a load from offset 0 that had previously been written:
+       *
+       * send(8) g114<1>UW g0<8,8,1>F data (0, 0, 0) mlen 1 rlen 1 { align1 WE_normal 1Q };
+       * mov(8)  null      g114<8,8,1>F { align1 WE_normal 1Q };
+       *
+       * The cycles spent seemed to be grouped around 40-50 (as low as 38),
+       * then around 140.  Presumably this is cache hit vs miss.
+       */
+      latency = 50;
+      break;
+
+   case SHADER_OPCODE_UNTYPED_ATOMIC:
+      /* Test code:
+       *   mov(8)    g112<1>ud       0x00000000ud       { align1 WE_all 1Q };
+       *   mov(1)    g112.7<1>ud     g1.7<0,1,0>ud      { align1 WE_all };
+       *   mov(8)    g113<1>ud       0x00000000ud       { align1 WE_normal 1Q };
+       *   send(8)   g4<1>ud         g112<8,8,1>ud
+       *             data (38, 5, 6) mlen 2 rlen 1      { align1 WE_normal 1Q };
+       *
+       * Running it 100 times as a fragment shader on a 128x128 quad
+       * gives an average latency of 13867 cycles per atomic op,
+       * standard deviation 3%.  Note that this is a rather
+       * pessimistic estimate: in cases with few collisions between
+       * threads and favorable pipelining, the actual latency has been
+       * seen to drop by a factor of 100.
+       */
+      latency = 14000;
+      break;
+
+   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
+      /* Test code:
+       *   mov(8)    g112<1>UD       0x00000000UD       { align1 WE_all 1Q };
+       *   mov(1)    g112.7<1>UD     g1.7<0,1,0>UD      { align1 WE_all };
+       *   mov(8)    g113<1>UD       0x00000000UD       { align1 WE_normal 1Q };
+       *   send(8)   g4<1>UD         g112<8,8,1>UD
+       *             data (38, 6, 5) mlen 2 rlen 1      { align1 WE_normal 1Q };
+       *   .
+       *   . [repeats 8 times]
+       *   .
+       *   mov(8)    g112<1>UD       0x00000000UD       { align1 WE_all 1Q };
+       *   mov(1)    g112.7<1>UD     g1.7<0,1,0>UD      { align1 WE_all };
+       *   mov(8)    g113<1>UD       0x00000000UD       { align1 WE_normal 1Q };
+       *   send(8)   g4<1>UD         g112<8,8,1>UD
+       *             data (38, 6, 5) mlen 2 rlen 1      { align1 WE_normal 1Q };
+       *
+       * Running it 100 times as a fragment shader on a 128x128 quad
+       * gives an average latency of 583 cycles per surface read,
+       * standard deviation 0.9%.
+       */
+      latency = is_haswell ? 300 : 600;
+      break;
+
    default:
       /* 2 cycles:
        * mul(8) g4<1>F g2<0,1,0>F      0.5F            { align1 WE_normal 1Q };
@@ -345,15 +393,24 @@ schedule_node::set_latency_gen7(bool is_haswell)
 
 class instruction_scheduler {
 public:
-   instruction_scheduler(backend_visitor *v, int grf_count, bool post_reg_alloc)
+   instruction_scheduler(backend_visitor *v, int grf_count,
+                         instruction_scheduler_mode mode)
    {
       this->bv = v;
-      this->mem_ctx = ralloc_context(v->mem_ctx);
+      this->mem_ctx = ralloc_context(NULL);
       this->grf_count = grf_count;
       this->instructions.make_empty();
       this->instructions_to_schedule = 0;
-      this->post_reg_alloc = post_reg_alloc;
+      this->post_reg_alloc = (mode == SCHEDULE_POST);
+      this->mode = mode;
       this->time = 0;
+      if (!post_reg_alloc) {
+         this->remaining_grf_uses = rzalloc_array(mem_ctx, int, grf_count);
+         this->grf_active = rzalloc_array(mem_ctx, bool, grf_count);
+      } else {
+         this->remaining_grf_uses = NULL;
+         this->grf_active = NULL;
+      }
    }
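
The constructor now takes an instruction_scheduler_mode instead of a bool.  The enum itself is not part of this file's diff; presumably it is declared in a shared header elsewhere in the series (brw_shader.h would be the natural place).  A sketch of what that declaration would look like, inferred only from the values used in this file:

```cpp
/* Assumed declaration: SCHEDULE_PRE, SCHEDULE_PRE_LIFO and SCHEDULE_POST are
 * the only values referenced in this file.  The explicit
 * mode == SCHEDULE_PRE_LIFO check inside the pressure-based selection branch
 * suggests a non-LIFO pre-RA variant may be declared alongside these as well.
 */
enum instruction_scheduler_mode {
   SCHEDULE_PRE,        /* before RA: pick the oldest ready instruction    */
   SCHEDULE_PRE_LIFO,   /* before RA: register-pressure/LIFO heuristics    */
   SCHEDULE_POST,       /* after RA: uses the measured Gen4/Gen7 latencies */
};
```
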
 
    ~instruction_scheduler()
@@ -366,6 +423,7 @@ public:
 
    void run(exec_list *instructions);
    void add_inst(backend_instruction *inst);
+   void compute_delay(schedule_node *node);
    virtual void calculate_deps() = 0;
    virtual schedule_node *choose_instruction_to_schedule() = 0;
 
@@ -378,6 +436,10 @@ public:
     */
    virtual int issue_time(backend_instruction *inst) = 0;
 
+   virtual void count_remaining_grf_uses(backend_instruction *inst) = 0;
+   virtual void update_register_pressure(backend_instruction *inst) = 0;
+   virtual int get_register_pressure_benefit(backend_instruction *inst) = 0;
+
    void schedule_instructions(backend_instruction *next_block_header);
 
    void *mem_ctx;
@@ -388,27 +450,116 @@ public:
    int time;
    exec_list instructions;
    backend_visitor *bv;
+
+   instruction_scheduler_mode mode;
+
+   /**
+    * Number of instructions left to schedule that reference each vgrf.
+    *
+    * Used so that we can prefer scheduling instructions that will end the
+    * live intervals of multiple variables, to reduce register pressure.
+    */
+   int *remaining_grf_uses;
+
+   /**
+    * Tracks whether each VGRF has had an instruction scheduled that uses it.
+    *
+    * This is used to estimate whether scheduling a new instruction will
+    * increase register pressure.
+    */
+   bool *grf_active;
 };
 
 class fs_instruction_scheduler : public instruction_scheduler
 {
 public:
-   fs_instruction_scheduler(fs_visitor *v, int grf_count, bool post_reg_alloc);
+   fs_instruction_scheduler(fs_visitor *v, int grf_count,
+                            instruction_scheduler_mode mode);
    void calculate_deps();
    bool is_compressed(fs_inst *inst);
    schedule_node *choose_instruction_to_schedule();
    int issue_time(backend_instruction *inst);
    fs_visitor *v;
+
+   void count_remaining_grf_uses(backend_instruction *inst);
+   void update_register_pressure(backend_instruction *inst);
+   int get_register_pressure_benefit(backend_instruction *inst);
 };
 
 fs_instruction_scheduler::fs_instruction_scheduler(fs_visitor *v,
                                                    int grf_count,
-                                                   bool post_reg_alloc)
-   : instruction_scheduler(v, grf_count, post_reg_alloc),
+                                                   instruction_scheduler_mode mode)
+   : instruction_scheduler(v, grf_count, mode),
      v(v)
 {
 }
 
+void
+fs_instruction_scheduler::count_remaining_grf_uses(backend_instruction *be)
+{
+   fs_inst *inst = (fs_inst *)be;
+
+   if (!remaining_grf_uses)
+      return;
+
+   if (inst->dst.file == GRF)
+      remaining_grf_uses[inst->dst.reg]++;
+
+   for (int i = 0; i < 3; i++) {
+      if (inst->src[i].file != GRF)
+         continue;
+
+      remaining_grf_uses[inst->src[i].reg]++;
+   }
+}
+
+void
+fs_instruction_scheduler::update_register_pressure(backend_instruction *be)
+{
+   fs_inst *inst = (fs_inst *)be;
+
+   if (!remaining_grf_uses)
+      return;
+
+   if (inst->dst.file == GRF) {
+      remaining_grf_uses[inst->dst.reg]--;
+      grf_active[inst->dst.reg] = true;
+   }
+
+   for (int i = 0; i < 3; i++) {
+      if (inst->src[i].file == GRF) {
+         remaining_grf_uses[inst->src[i].reg]--;
+         grf_active[inst->src[i].reg] = true;
+      }
+   }
+}
+
+int
+fs_instruction_scheduler::get_register_pressure_benefit(backend_instruction *be)
+{
+   fs_inst *inst = (fs_inst *)be;
+   int benefit = 0;
+
+   if (inst->dst.file == GRF) {
+      if (remaining_grf_uses[inst->dst.reg] == 1)
+         benefit += v->virtual_grf_sizes[inst->dst.reg];
+      if (!grf_active[inst->dst.reg])
+         benefit -= v->virtual_grf_sizes[inst->dst.reg];
+   }
+
+   for (int i = 0; i < 3; i++) {
+      if (inst->src[i].file != GRF)
+         continue;
+
+      if (remaining_grf_uses[inst->src[i].reg] == 1)
+         benefit += v->virtual_grf_sizes[inst->src[i].reg];
+      if (!grf_active[inst->src[i].reg])
+         benefit -= v->virtual_grf_sizes[inst->src[i].reg];
+   }
+
+   return benefit;
+}
+
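
The three callbacks above (count_remaining_grf_uses(), update_register_pressure() and get_register_pressure_benefit()) are the whole register-pressure model: a per-VGRF count of remaining references plus an "already live" flag, weighted by VGRF size.  A minimal stand-alone sketch of the same bookkeeping, using toy types rather than mesa's fs_inst/fs_visitor:

```cpp
#include <vector>

/* Hypothetical stand-ins for mesa's structures, for illustration only. */
struct toy_inst {
   int dst = -1;            /* VGRF written, or -1 for none */
   std::vector<int> srcs;   /* VGRFs read                   */
};

struct pressure_tracker {
   std::vector<int>  remaining_uses; /* instructions left that reference each VGRF */
   std::vector<bool> active;         /* has the VGRF been referenced yet?          */
   std::vector<int>  size;           /* registers each VGRF occupies               */

   /* Mirrors get_register_pressure_benefit(): positive when scheduling the
    * instruction ends a live interval (it is the last remaining reference to
    * a VGRF), negative when it would start a new one (first reference).
    */
   int benefit(const toy_inst &inst) const {
      int b = 0;
      auto score = [&](int grf) {
         if (remaining_uses[grf] == 1)
            b += size[grf];
         if (!active[grf])
            b -= size[grf];
      };
      if (inst.dst >= 0)
         score(inst.dst);
      for (int s : inst.srcs)
         score(s);
      return b;
   }

   /* Mirrors update_register_pressure(): call once per scheduled instruction. */
   void schedule(const toy_inst &inst) {
      auto touch = [&](int grf) { remaining_uses[grf]--; active[grf] = true; };
      if (inst.dst >= 0)
         touch(inst.dst);
      for (int s : inst.srcs)
         touch(s);
   }
};
```

In the real pass, remaining_grf_uses is filled by a whole-program pre-pass in run(), so a count of 1 really does mean the last reference in the shader.
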
 class vec4_instruction_scheduler : public instruction_scheduler
 {
 public:
@@ -417,19 +568,65 @@ public:
    schedule_node *choose_instruction_to_schedule();
    int issue_time(backend_instruction *inst);
    vec4_visitor *v;
+
+   void count_remaining_grf_uses(backend_instruction *inst);
+   void update_register_pressure(backend_instruction *inst);
+   int get_register_pressure_benefit(backend_instruction *inst);
 };
 
 vec4_instruction_scheduler::vec4_instruction_scheduler(vec4_visitor *v,
                                                        int grf_count)
-   : instruction_scheduler(v, grf_count, true),
+   : instruction_scheduler(v, grf_count, SCHEDULE_POST),
      v(v)
 {
 }
 
+void
+vec4_instruction_scheduler::count_remaining_grf_uses(backend_instruction *be)
+{
+}
+
+void
+vec4_instruction_scheduler::update_register_pressure(backend_instruction *be)
+{
+}
+
+int
+vec4_instruction_scheduler::get_register_pressure_benefit(backend_instruction *be)
+{
+   return 0;
+}
+
+schedule_node::schedule_node(backend_instruction *inst,
+                             instruction_scheduler *sched)
+{
+   struct brw_context *brw = sched->bv->brw;
+
+   this->inst = inst;
+   this->child_array_size = 0;
+   this->children = NULL;
+   this->child_latency = NULL;
+   this->child_count = 0;
+   this->parent_count = 0;
+   this->unblocked_time = 0;
+   this->cand_generation = 0;
+   this->delay = 0;
+
+   /* We can't measure Gen6 timings directly but expect them to be much
+    * closer to Gen7 than Gen4.
+    */
+   if (!sched->post_reg_alloc)
+      this->latency = 1;
+   else if (brw->gen >= 6)
+      set_latency_gen7(brw->is_haswell);
+   else
+      set_latency_gen4();
+}
+
 void
 instruction_scheduler::add_inst(backend_instruction *inst)
 {
-   schedule_node *n = new(mem_ctx) schedule_node(inst, bv->intel);
+   schedule_node *n = new(mem_ctx) schedule_node(inst, this);
 
    assert(!inst->is_head_sentinel());
    assert(!inst->is_tail_sentinel());
@@ -440,6 +637,21 @@ instruction_scheduler::add_inst(backend_instruction *inst)
    instructions.push_tail(n);
 }
 
+/** Recursive computation of the delay member of a node. */
+void
+instruction_scheduler::compute_delay(schedule_node *n)
+{
+   if (!n->child_count) {
+      n->delay = issue_time(n->inst);
+   } else {
+      for (int i = 0; i < n->child_count; i++) {
+         if (!n->children[i]->delay)
+            compute_delay(n->children[i]);
+         n->delay = MAX2(n->delay, n->latency + n->children[i]->delay);
+      }
+   }
+}
+
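
compute_delay() is a memoized depth-first walk over the dependency DAG: a leaf's delay is its issue time, and every other node's delay is its latency plus the largest delay among its children, i.e. the length of the longest latency path from that node to the end of the block, which choose_instruction_to_schedule() later uses as a tie-breaker.  A self-contained toy version (made-up node type, not mesa's schedule_node) showing the recurrence and the numbers it produces:

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

/* Toy DAG node for illustration; not mesa's schedule_node. */
struct toy_node {
   int latency = 0;
   int issue_time = 0;
   int delay = 0;                     /* 0 means "not computed yet" */
   std::vector<toy_node *> children;
};

/* Same recurrence as instruction_scheduler::compute_delay() above. */
static void compute_delay(toy_node *n)
{
   if (n->children.empty()) {
      n->delay = n->issue_time;
      return;
   }
   for (toy_node *child : n->children) {
      if (!child->delay)
         compute_delay(child);
      n->delay = std::max(n->delay, n->latency + child->delay);
   }
}

int main()
{
   toy_node mov, send;
   mov.latency = 16;   mov.issue_time = 2;    /* consumer of the result   */
   send.latency = 200; send.issue_time = 2;   /* e.g. a texturing message */
   send.children.push_back(&mov);

   compute_delay(&send);
   /* Prints "mov delay = 2, send delay = 202": the long send latency puts
    * the send on the critical path to the end of the block.
    */
   printf("mov delay = %d, send delay = %d\n", mov.delay, send.delay);
   return 0;
}
```
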
 /**
  * Add a dependency between two instruction nodes.
  *
@@ -563,14 +775,15 @@ fs_instruction_scheduler::calculate_deps()
       schedule_node *n = (schedule_node *)node;
       fs_inst *inst = (fs_inst *)n->inst;
 
-      if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT)
+      if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT ||
+         inst->has_side_effects())
          add_barrier_deps(n);
 
       /* read-after-write deps. */
       for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF) {
             if (post_reg_alloc) {
-               for (int r = 0; r < reg_width; r++)
+               for (int r = 0; r < reg_width * inst->regs_read(v, i); r++)
                   add_dep(last_grf_write[inst->src[i].reg + r], n);
             } else {
                add_dep(last_grf_write[inst->src[i].reg], n);
@@ -595,15 +808,17 @@ fs_instruction_scheduler::calculate_deps()
         }
       }
 
-      for (int i = 0; i < inst->mlen; i++) {
-        /* It looks like the MRF regs are released in the send
-         * instruction once it's sent, not when the result comes
-         * back.
-         */
-        add_dep(last_mrf_write[inst->base_mrf + i], n);
+      if (inst->base_mrf != -1) {
+        for (int i = 0; i < inst->mlen; i++) {
+           /* It looks like the MRF regs are released in the send
+            * instruction once it's sent, not when the result comes
+            * back.
+            */
+           add_dep(last_mrf_write[inst->base_mrf + i], n);
+        }
       }
 
-      if (inst->predicate) {
+      if (inst->reads_flag()) {
         add_dep(last_conditional_mod[inst->flag_subreg], n);
       }
 
@@ -643,18 +858,14 @@ fs_instruction_scheduler::calculate_deps()
         add_barrier_deps(n);
       }
 
-      if (inst->mlen > 0) {
+      if (inst->mlen > 0 && inst->base_mrf != -1) {
         for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
            add_dep(last_mrf_write[inst->base_mrf + i], n);
            last_mrf_write[inst->base_mrf + i] = n;
         }
       }
 
-      /* Treat FS_OPCODE_MOV_DISPATCH_TO_FLAGS as though it had a
-       * conditional_mod, because it sets the flag register.
-       */
-      if (inst->conditional_mod ||
-          inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
+      if (inst->writes_flag()) {
         add_dep(last_conditional_mod[inst->flag_subreg], n, 0);
         last_conditional_mod[inst->flag_subreg] = n;
       }
@@ -678,7 +889,7 @@ fs_instruction_scheduler::calculate_deps()
       for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF) {
             if (post_reg_alloc) {
-               for (int r = 0; r < reg_width; r++)
+               for (int r = 0; r < reg_width * inst->regs_read(v, i); r++)
                   add_dep(n, last_grf_write[inst->src[i].reg + r]);
             } else {
                add_dep(n, last_grf_write[inst->src[i].reg]);
@@ -703,15 +914,17 @@ fs_instruction_scheduler::calculate_deps()
         }
       }
 
-      for (int i = 0; i < inst->mlen; i++) {
-        /* It looks like the MRF regs are released in the send
-         * instruction once it's sent, not when the result comes
-         * back.
-         */
-        add_dep(n, last_mrf_write[inst->base_mrf + i], 2);
+      if (inst->base_mrf != -1) {
+        for (int i = 0; i < inst->mlen; i++) {
+           /* It looks like the MRF regs are released in the send
+            * instruction once it's sent, not when the result comes
+            * back.
+            */
+           add_dep(n, last_mrf_write[inst->base_mrf + i], 2);
+        }
       }
 
-      if (inst->predicate) {
+      if (inst->reads_flag()) {
         add_dep(n, last_conditional_mod[inst->flag_subreg]);
       }
 
@@ -750,17 +963,13 @@ fs_instruction_scheduler::calculate_deps()
         add_barrier_deps(n);
       }
 
-      if (inst->mlen > 0) {
+      if (inst->mlen > 0 && inst->base_mrf != -1) {
         for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
            last_mrf_write[inst->base_mrf + i] = n;
         }
       }
 
-      /* Treat FS_OPCODE_MOV_DISPATCH_TO_FLAGS as though it had a
-       * conditional_mod, because it sets the flag register.
-       */
-      if (inst->conditional_mod ||
-          inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
+      if (inst->writes_flag()) {
         last_conditional_mod[inst->flag_subreg] = n;
       }
    }
@@ -796,6 +1005,9 @@ vec4_instruction_scheduler::calculate_deps()
       schedule_node *n = (schedule_node *)node;
       vec4_instruction *inst = (vec4_instruction *)n->inst;
 
+      if (inst->has_side_effects())
+         add_barrier_deps(n);
+
       /* read-after-write deps. */
       for (int i = 0; i < 3; i++) {
          if (inst->src[i].file == GRF) {
@@ -822,7 +1034,7 @@ vec4_instruction_scheduler::calculate_deps()
          add_dep(last_mrf_write[inst->base_mrf + i], n);
       }
 
-      if (inst->predicate) {
+      if (inst->depends_on_flags()) {
          assert(last_conditional_mod);
          add_dep(last_conditional_mod, n);
       }
@@ -893,7 +1105,7 @@ vec4_instruction_scheduler::calculate_deps()
          add_dep(n, last_mrf_write[inst->base_mrf + i], 2);
       }
 
-      if (inst->predicate) {
+      if (inst->depends_on_flags()) {
          add_dep(n, last_conditional_mod);
       }
 
@@ -928,10 +1140,10 @@ fs_instruction_scheduler::choose_instruction_to_schedule()
 {
    schedule_node *chosen = NULL;
 
-   if (post_reg_alloc) {
+   if (mode == SCHEDULE_PRE || mode == SCHEDULE_POST) {
       int chosen_time = 0;
 
-      /* Of the instructions closest ready to execute or the closest to
+      /* Of the instructions ready to execute or the closest to
        * being ready, choose the oldest one.
        */
       foreach_list(node, &instructions) {
@@ -948,28 +1160,87 @@ fs_instruction_scheduler::choose_instruction_to_schedule()
        * variables so that we can avoid register spilling, or get 16-wide
        * shaders which naturally do a better job of hiding instruction
        * latency.
-       *
-       * To do so, schedule our instructions in a roughly LIFO/depth-first
-       * order: when new instructions become available as a result of
-       * scheduling something, choose those first so that our result
-       * hopefully is consumed quickly.
-       *
-       * The exception is messages that generate more than one result
-       * register (AKA texturing).  In those cases, the LIFO search would
-       * normally tend to choose them quickly (because scheduling the
-       * previous message not only unblocked the children using its result,
-       * but also the MRF setup for the next sampler message, which in turn
-       * unblocks the next sampler message).
        */
-      for (schedule_node *node = (schedule_node *)instructions.get_tail();
-           node != instructions.get_head()->prev;
-           node = (schedule_node *)node->prev) {
+      foreach_list(node, &instructions) {
          schedule_node *n = (schedule_node *)node;
          fs_inst *inst = (fs_inst *)n->inst;
 
-         chosen = n;
-         if (inst->regs_written <= 1)
-            break;
+         if (!chosen) {
+            chosen = n;
+            continue;
+         }
+
+         /* Most important: If we can definitely reduce register pressure, do
+          * so immediately.
+          */
+         int register_pressure_benefit = get_register_pressure_benefit(n->inst);
+         int chosen_register_pressure_benefit =
+            get_register_pressure_benefit(chosen->inst);
+
+         if (register_pressure_benefit > 0 &&
+             register_pressure_benefit > chosen_register_pressure_benefit) {
+            chosen = n;
+            continue;
+         } else if (chosen_register_pressure_benefit > 0 &&
+                    (register_pressure_benefit <
+                     chosen_register_pressure_benefit)) {
+            continue;
+         }
+
+         if (mode == SCHEDULE_PRE_LIFO) {
+            /* Prefer instructions that recently became available for
+             * scheduling.  These are the things that are most likely to
+             * (eventually) make a variable dead and reduce register pressure.
+             * Typical register pressure estimates don't work for us because
+             * most of our pressure comes from texturing, where no single
+             * instruction to schedule will make a vec4 value dead.
+             */
+            if (n->cand_generation > chosen->cand_generation) {
+               chosen = n;
+               continue;
+            } else if (n->cand_generation < chosen->cand_generation) {
+               continue;
+            }
+
+            /* On MRF-using chips, prefer non-SEND instructions.  If we don't
+             * do this, then because we prefer instructions that just became
+             * candidates, we'll end up in a pattern of scheduling a SEND,
+             * then the MRFs for the next SEND, then the next SEND, then the
+             * MRFs, etc., without ever consuming the results of a send.
+             */
+            if (v->brw->gen < 7) {
+               fs_inst *chosen_inst = (fs_inst *)chosen->inst;
+
+               /* We use regs_written > 1 as our test for the kind of send
+                * instruction to avoid -- only sends generate many regs, and a
+                * single-result send is probably actually reducing register
+                * pressure.
+                */
+               if (inst->regs_written <= 1 && chosen_inst->regs_written > 1) {
+                  chosen = n;
+                  continue;
+               } else if (inst->regs_written > chosen_inst->regs_written) {
+                  continue;
+               }
+            }
+         }
+
+         /* Among instructions pushed onto the candidates list at the same
+          * time, prefer the one with the largest delay to the end of the
+          * program.  Its results are the most likely to be consumable first
+          * (e.g. a large tree of lowered UBO loads, which appears reversed in
+          * the instruction stream with respect to when the loads can be used).
+          */
+         if (n->delay > chosen->delay) {
+            chosen = n;
+            continue;
+         } else if (n->delay < chosen->delay) {
+            continue;
+         }
+
+         /* If all other metrics are equal, we prefer the first instruction in
+          * the list (program execution order).
+          */
       }
    }
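
Pulled apart, the pre-RA branch above is a lexicographic preference: register-pressure benefit first, then candidate generation and SEND avoidance (SCHEDULE_PRE_LIFO only, and the latter only on MRF-using parts), then the delay metric, with program order as the final tie-break.  A hedged restatement of that ordering as a stand-alone comparator; the names are mine, and the real loop expresses the same thing as a chain of continue statements with a slightly finer regs_written refinement in step 3:

```cpp
/* Toy summary of a ready instruction as seen by the pre-RA heuristics. */
struct toy_candidate {
   int      pressure_benefit;  /* get_register_pressure_benefit()        */
   unsigned cand_generation;   /* iteration in which it became ready     */
   int      regs_written;      /* > 1 roughly means "SEND-style result"  */
   int      delay;             /* longest latency path to end of block   */
   int      list_position;     /* position among the ready instructions  */
};

/* True if a should be scheduled in preference to b. */
static bool prefer(const toy_candidate &a, const toy_candidate &b)
{
   /* 1. A definite register-pressure win trumps everything else. */
   if ((a.pressure_benefit > 0 || b.pressure_benefit > 0) &&
       a.pressure_benefit != b.pressure_benefit)
      return a.pressure_benefit > b.pressure_benefit;

   /* 2. LIFO: prefer whatever just became available, since it is the most
    *    likely to (eventually) let a value die.
    */
   if (a.cand_generation != b.cand_generation)
      return a.cand_generation > b.cand_generation;

   /* 3. Avoid scheduling SEND after SEND on MRF chips. */
   bool a_send = a.regs_written > 1, b_send = b.regs_written > 1;
   if (a_send != b_send)
      return !a_send;

   /* 4. Prefer the longer critical path to the end of the block. */
   if (a.delay != b.delay)
      return a.delay > b.delay;

   /* 5. Otherwise keep program order. */
   return a.list_position < b.list_position;
}
```

Note that when neither candidate actually frees registers (both benefits non-positive), pressure does not decide and the later criteria take over, matching the asymmetric continue logic in the real loop.
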
 
@@ -1025,6 +1296,7 @@ instruction_scheduler::schedule_instructions(backend_instruction *next_block_hea
         n->remove();
    }
 
+   unsigned cand_generation = 1;
    while (!instructions.is_empty()) {
       schedule_node *chosen = choose_instruction_to_schedule();
 
@@ -1033,6 +1305,7 @@ instruction_scheduler::schedule_instructions(backend_instruction *next_block_hea
       chosen->remove();
       next_block_header->insert_before(chosen->inst);
       instructions_to_schedule--;
+      update_register_pressure(chosen->inst);
 
       /* Update the clock for how soon an instruction could start after the
        * chosen one.
@@ -1056,21 +1329,27 @@ instruction_scheduler::schedule_instructions(backend_instruction *next_block_hea
        * be scheduled.  Update the children's unblocked time for this
        * DAG edge as we do so.
        */
-      for (int i = 0; i < chosen->child_count; i++) {
+      for (int i = chosen->child_count - 1; i >= 0; i--) {
         schedule_node *child = chosen->children[i];
 
         child->unblocked_time = MAX2(child->unblocked_time,
                                      time + chosen->child_latency[i]);
 
+         if (debug) {
+            printf("\tchild %d, %d parents: ", i, child->parent_count);
+            bv->dump_instruction(child->inst);
+         }
+
+         child->cand_generation = cand_generation;
         child->parent_count--;
         if (child->parent_count == 0) {
             if (debug) {
-               printf("now available: ");
-               bv->dump_instruction(child->inst);
+               printf("\t\tnow available\n");
             }
-           instructions.push_tail(child);
+           instructions.push_head(child);
         }
       }
+      cand_generation++;
 
       /* Shared resource: the mathbox.  There's one mathbox per EU on Gen6+
        * but it's more limited pre-gen6, so if we send something off to it then
@@ -1102,6 +1381,15 @@ instruction_scheduler::run(exec_list *all_instructions)
       bv->dump_instructions();
    }
 
+   /* Populate the remaining GRF uses array to improve the pre-regalloc
+    * scheduling.
+    */
+   if (remaining_grf_uses) {
+      foreach_list(node, all_instructions) {
+         count_remaining_grf_uses((backend_instruction *)node);
+      }
+   }
+
    while (!next_block_header->is_tail_sentinel()) {
       /* Add things to be scheduled until we get to a new BB. */
       while (!next_block_header->is_tail_sentinel()) {
@@ -1113,6 +1401,12 @@ instruction_scheduler::run(exec_list *all_instructions)
            break;
       }
       calculate_deps();
+
+      foreach_list(node, &instructions) {
+         schedule_node *n = (schedule_node *)node;
+         compute_delay(n);
+      }
+
       schedule_instructions(next_block_header);
    }
 
@@ -1123,23 +1417,23 @@ instruction_scheduler::run(exec_list *all_instructions)
 }
 
 void
-fs_visitor::schedule_instructions(bool post_reg_alloc)
+fs_visitor::schedule_instructions(instruction_scheduler_mode mode)
 {
    int grf_count;
-   if (post_reg_alloc)
+   if (mode == SCHEDULE_POST)
       grf_count = grf_used;
    else
       grf_count = virtual_grf_count;
 
-   fs_instruction_scheduler sched(this, grf_count, post_reg_alloc);
+   fs_instruction_scheduler sched(this, grf_count, mode);
    sched.run(&instructions);
 
-   if (unlikely(INTEL_DEBUG & DEBUG_WM) && post_reg_alloc) {
+   if (unlikely(INTEL_DEBUG & DEBUG_WM) && mode == SCHEDULE_POST) {
       printf("fs%d estimated execution time: %d cycles\n",
              dispatch_width, sched.time);
    }
 
-   this->live_intervals_valid = false;
+   invalidate_live_intervals();
 }
 
 void
@@ -1152,5 +1446,5 @@ vec4_visitor::opt_schedule_instructions()
       printf("vec4 estimated execution time: %d cycles\n", sched.time);
    }
 
-   this->live_intervals_valid = false;
+   invalidate_live_intervals();
 }