i965: Use MESA_FORMAT_B8G8R8X8_SRGB for RGB visuals

[mesa.git] / src / mesa / drivers / dri / i965 / brw_schedule_instructions.cpp
diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp

index 3e86cb05cce0751223716a2c2ef3b61ef5437c17..60f7fd9cfcd63d040cde3f7bee4b2eb153dc90f7 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
@@ -26,11 +26,10 @@
   */
  
  #include "brw_fs.h"
+#include "brw_fs_live_variables.h"
  #include "brw_vec4.h"
  #include "brw_cfg.h"
  #include "brw_shader.h"
-#include "glsl/nir/glsl_types.h"
-#include "glsl/ir_optimization.h"
  
  using namespace brw;
  
@@ -400,22 +399,49 @@ schedule_node::set_latency_gen7(bool is_haswell)
  class instruction_scheduler {
  public:
     instruction_scheduler(backend_shader *s, int grf_count,
+                         int hw_reg_count, int block_count,
                           instruction_scheduler_mode mode)
     {
        this->bs = s;
        this->mem_ctx = ralloc_context(NULL);
        this->grf_count = grf_count;
+      this->hw_reg_count = hw_reg_count;
        this->instructions.make_empty();
        this->instructions_to_schedule = 0;
        this->post_reg_alloc = (mode == SCHEDULE_POST);
        this->mode = mode;
        this->time = 0;
        if (!post_reg_alloc) {
-         this->remaining_grf_uses = rzalloc_array(mem_ctx, int, grf_count);
-         this->grf_active = rzalloc_array(mem_ctx, bool, grf_count);
+         this->reg_pressure_in = rzalloc_array(mem_ctx, int, block_count);
+
+         this->livein = ralloc_array(mem_ctx, BITSET_WORD *, block_count);
+         for (int i = 0; i < block_count; i++)
+            this->livein[i] = rzalloc_array(mem_ctx, BITSET_WORD,
+                                            BITSET_WORDS(grf_count));
+
+         this->liveout = ralloc_array(mem_ctx, BITSET_WORD *, block_count);
+         for (int i = 0; i < block_count; i++)
+            this->liveout[i] = rzalloc_array(mem_ctx, BITSET_WORD,
+                                             BITSET_WORDS(grf_count));
+
+         this->hw_liveout = ralloc_array(mem_ctx, BITSET_WORD *, block_count);
+         for (int i = 0; i < block_count; i++)
+            this->hw_liveout[i] = rzalloc_array(mem_ctx, BITSET_WORD,
+                                                BITSET_WORDS(hw_reg_count));
+
+         this->written = rzalloc_array(mem_ctx, bool, grf_count);
+
+         this->reads_remaining = rzalloc_array(mem_ctx, int, grf_count);
+
+         this->hw_reads_remaining = rzalloc_array(mem_ctx, int, hw_reg_count);
        } else {
-         this->remaining_grf_uses = NULL;
-         this->grf_active = NULL;
+         this->reg_pressure_in = NULL;
+         this->livein = NULL;
+         this->liveout = NULL;
+         this->hw_liveout = NULL;
+         this->written = NULL;
+         this->reads_remaining = NULL;
+         this->hw_reads_remaining = NULL;
        }
     }
  
@@ -442,7 +468,8 @@ public:
      */
     virtual int issue_time(backend_instruction *inst) = 0;
  
-   virtual void count_remaining_grf_uses(backend_instruction *inst) = 0;
+   virtual void count_reads_remaining(backend_instruction *inst) = 0;
+   virtual void setup_liveness(cfg_t *cfg) = 0;
     virtual void update_register_pressure(backend_instruction *inst) = 0;
     virtual int get_register_pressure_benefit(backend_instruction *inst) = 0;
  
@@ -453,33 +480,63 @@ public:
     bool post_reg_alloc;
     int instructions_to_schedule;
     int grf_count;
+   int hw_reg_count;
     int time;
+   int reg_pressure;
+   int block_idx;
     exec_list instructions;
     backend_shader *bs;
  
     instruction_scheduler_mode mode;
  
-   /**
-    * Number of instructions left to schedule that reference each vgrf.
-    *
-    * Used so that we can prefer scheduling instructions that will end the
-    * live intervals of multiple variables, to reduce register pressure.
+   /*
+    * The register pressure at the beginning of each basic block.
      */
-   int *remaining_grf_uses;
  
-   /**
-    * Tracks whether each VGRF has had an instruction scheduled that uses it.
-    *
-    * This is used to estimate whether scheduling a new instruction will
-    * increase register pressure.
+   int *reg_pressure_in;
+
+   /*
+    * The virtual GRFs whose range overlaps the beginning of each basic block.
+    */
+
+   BITSET_WORD **livein;
+
+   /*
+    * The virtual GRFs whose range overlaps the end of each basic block.
+    */
+
+   BITSET_WORD **liveout;
+
+   /*
+    * The hardware GRFs whose range overlaps the end of each basic block.
+    */
+
+   BITSET_WORD **hw_liveout;
+
+   /*
+    * Whether we've scheduled a write for this virtual GRF yet.
+    */
+
+   bool *written;
+
+   /*
+    * How many reads we haven't scheduled for this virtual GRF yet.
      */
-   bool *grf_active;
+
+   int *reads_remaining;
+
+   /*
+    * How many reads we haven't scheduled for this hardware GRF yet.
+    */
+
+   int *hw_reads_remaining;
  };
  
  class fs_instruction_scheduler : public instruction_scheduler
  {
  public:
-   fs_instruction_scheduler(fs_visitor *v, int grf_count,
+   fs_instruction_scheduler(fs_visitor *v, int grf_count, int hw_reg_count,
+                            int block_count,
                              instruction_scheduler_mode mode);
     void calculate_deps();
     bool is_compressed(fs_inst *inst);
@@ -487,35 +544,108 @@ public:
     int issue_time(backend_instruction *inst);
     fs_visitor *v;
  
-   void count_remaining_grf_uses(backend_instruction *inst);
+   void count_reads_remaining(backend_instruction *inst);
+   void setup_liveness(cfg_t *cfg);
     void update_register_pressure(backend_instruction *inst);
     int get_register_pressure_benefit(backend_instruction *inst);
  };
  
  fs_instruction_scheduler::fs_instruction_scheduler(fs_visitor *v,
-                                                   int grf_count,
+                                                   int grf_count, int hw_reg_count,
+                                                   int block_count,
                                                     instruction_scheduler_mode mode)
-   : instruction_scheduler(v, grf_count, mode),
+   : instruction_scheduler(v, grf_count, hw_reg_count, block_count, mode),
       v(v)
  {
  }
  
+static bool
+is_src_duplicate(fs_inst *inst, int src)
+{
+   for (int i = 0; i < src; i++)
+     if (inst->src[i].equals(inst->src[src]))
+       return true;
+
+  return false;
+}
+
  void
-fs_instruction_scheduler::count_remaining_grf_uses(backend_instruction *be)
+fs_instruction_scheduler::count_reads_remaining(backend_instruction *be)
  {
     fs_inst *inst = (fs_inst *)be;
  
-   if (!remaining_grf_uses)
+   if (!reads_remaining)
        return;
  
-   if (inst->dst.file == GRF)
-      remaining_grf_uses[inst->dst.reg]++;
-
     for (int i = 0; i < inst->sources; i++) {
-      if (inst->src[i].file != GRF)
+      if (is_src_duplicate(inst, i))
+         continue;
+
+      if (inst->src[i].file == VGRF) {
+         reads_remaining[inst->src[i].nr]++;
+      } else if (inst->src[i].file == FIXED_GRF) {
+         if (inst->src[i].nr >= hw_reg_count)
+            continue;
+
+         for (int j = 0; j < inst->regs_read(i); j++)
+            hw_reads_remaining[inst->src[i].nr + j]++;
+      }
+   }
+}
+
+void
+fs_instruction_scheduler::setup_liveness(cfg_t *cfg)
+{
+   /* First, compute liveness on a per-GRF level using the in/out sets from
+    * liveness calculation.
+    */
+   for (int block = 0; block < cfg->num_blocks; block++) {
+      for (int i = 0; i < v->live_intervals->num_vars; i++) {
+         if (BITSET_TEST(v->live_intervals->block_data[block].livein, i)) {
+            int vgrf = v->live_intervals->vgrf_from_var[i];
+            if (!BITSET_TEST(livein[block], vgrf)) {
+               reg_pressure_in[block] += v->alloc.sizes[vgrf];
+               BITSET_SET(livein[block], vgrf);
+            }
+         }
+
+         if (BITSET_TEST(v->live_intervals->block_data[block].liveout, i))
+            BITSET_SET(liveout[block], v->live_intervals->vgrf_from_var[i]);
+      }
+   }
+
+   /* Now, extend the live in/live out sets for when a range crosses a block
+    * boundary, which matches what our register allocator/interference code
+    * does to account for force_writemask_all and incompatible exec_masks.
+    */
+   for (int block = 0; block < cfg->num_blocks - 1; block++) {
+      for (int i = 0; i < grf_count; i++) {
+         if (v->virtual_grf_start[i] <= cfg->blocks[block]->end_ip &&
+             v->virtual_grf_end[i] >= cfg->blocks[block + 1]->start_ip) {
+            if (!BITSET_TEST(livein[block + 1], i)) {
+                reg_pressure_in[block + 1] += v->alloc.sizes[i];
+                BITSET_SET(livein[block + 1], i);
+            }
+
+            BITSET_SET(liveout[block], i);
+         }
+      }
+   }
+
+   int payload_last_use_ip[hw_reg_count];
+   v->calculate_payload_ranges(hw_reg_count, payload_last_use_ip);
+
+   for (int i = 0; i < hw_reg_count; i++) {
+      if (payload_last_use_ip[i] == -1)
           continue;
  
-      remaining_grf_uses[inst->src[i].reg]++;
+      for (int block = 0; block < cfg->num_blocks; block++) {
+         if (cfg->blocks[block]->start_ip <= payload_last_use_ip[i])
+            reg_pressure_in[block]++;
+
+         if (cfg->blocks[block]->end_ip <= payload_last_use_ip[i])
+            BITSET_SET(hw_liveout[block], i);
+      }
     }
  }
  
@@ -524,18 +654,23 @@ fs_instruction_scheduler::update_register_pressure(backend_instruction *be)
  {
     fs_inst *inst = (fs_inst *)be;
  
-   if (!remaining_grf_uses)
+   if (!reads_remaining)
        return;
  
-   if (inst->dst.file == GRF) {
-      remaining_grf_uses[inst->dst.reg]--;
-      grf_active[inst->dst.reg] = true;
+   if (inst->dst.file == VGRF) {
+      written[inst->dst.nr] = true;
     }
  
     for (int i = 0; i < inst->sources; i++) {
-      if (inst->src[i].file == GRF) {
-         remaining_grf_uses[inst->src[i].reg]--;
-         grf_active[inst->src[i].reg] = true;
+      if (is_src_duplicate(inst, i))
+          continue;
+
+      if (inst->src[i].file == VGRF) {
+         reads_remaining[inst->src[i].nr]--;
+      } else if (inst->src[i].file == FIXED_GRF &&
+                 inst->src[i].nr < hw_reg_count) {
+         for (int off = 0; off < inst->regs_read(i); off++)
+            hw_reads_remaining[inst->src[i].nr + off]--;
        }
     }
  }
@@ -546,21 +681,31 @@ fs_instruction_scheduler::get_register_pressure_benefit(backend_instruction *be)
     fs_inst *inst = (fs_inst *)be;
     int benefit = 0;
  
-   if (inst->dst.file == GRF) {
-      if (remaining_grf_uses[inst->dst.reg] == 1)
-         benefit += v->alloc.sizes[inst->dst.reg];
-      if (!grf_active[inst->dst.reg])
-         benefit -= v->alloc.sizes[inst->dst.reg];
+   if (inst->dst.file == VGRF) {
+      if (!BITSET_TEST(livein[block_idx], inst->dst.nr) &&
+          !written[inst->dst.nr])
+         benefit -= v->alloc.sizes[inst->dst.nr];
     }
  
     for (int i = 0; i < inst->sources; i++) {
-      if (inst->src[i].file != GRF)
+      if (is_src_duplicate(inst, i))
           continue;
  
-      if (remaining_grf_uses[inst->src[i].reg] == 1)
-         benefit += v->alloc.sizes[inst->src[i].reg];
-      if (!grf_active[inst->src[i].reg])
-         benefit -= v->alloc.sizes[inst->src[i].reg];
+      if (inst->src[i].file == VGRF &&
+          !BITSET_TEST(liveout[block_idx], inst->src[i].nr) &&
+          reads_remaining[inst->src[i].nr] == 1)
+         benefit += v->alloc.sizes[inst->src[i].nr];
+
+      if (inst->src[i].file == FIXED_GRF &&
+          inst->src[i].nr < hw_reg_count) {
+         for (int off = 0; off < inst->regs_read(i); off++) {
+            int reg = inst->src[i].nr + off;
+            if (!BITSET_TEST(hw_liveout[block_idx], reg) &&
+                hw_reads_remaining[reg] == 1) {
+               benefit++;
+            }
+         }
+      }
     }
  
     return benefit;
@@ -575,20 +720,26 @@ public:
     int issue_time(backend_instruction *inst);
     vec4_visitor *v;
  
-   void count_remaining_grf_uses(backend_instruction *inst);
+   void count_reads_remaining(backend_instruction *inst);
+   void setup_liveness(cfg_t *cfg);
     void update_register_pressure(backend_instruction *inst);
     int get_register_pressure_benefit(backend_instruction *inst);
  };
  
  vec4_instruction_scheduler::vec4_instruction_scheduler(vec4_visitor *v,
                                                         int grf_count)
-   : instruction_scheduler(v, grf_count, SCHEDULE_POST),
+   : instruction_scheduler(v, grf_count, 0, 0, SCHEDULE_POST),
       v(v)
  {
  }
  
  void
-vec4_instruction_scheduler::count_remaining_grf_uses(backend_instruction *be)
+vec4_instruction_scheduler::count_reads_remaining(backend_instruction *be)
+{
+}
+
+void
+vec4_instruction_scheduler::setup_liveness(cfg_t *cfg)
  {
  }
  
@@ -771,7 +922,6 @@ fs_instruction_scheduler::calculate_deps()
      * granular level.
      */
     schedule_node *last_fixed_grf_write = NULL;
-   int reg_width = v->dispatch_width / 8;
  
     /* The last instruction always needs to still be the last
      * instruction.  Either it's flow control (IF, ELSE, ENDIF, DO,
@@ -795,24 +945,19 @@ fs_instruction_scheduler::calculate_deps()
  
        /* read-after-write deps. */
        for (int i = 0; i < inst->sources; i++) {
-         if (inst->src[i].file == GRF) {
+         if (inst->src[i].file == VGRF) {
              if (post_reg_alloc) {
                 for (int r = 0; r < inst->regs_read(i); r++)
-                  add_dep(last_grf_write[inst->src[i].reg + r], n);
+                  add_dep(last_grf_write[inst->src[i].nr + r], n);
              } else {
                 for (int r = 0; r < inst->regs_read(i); r++) {
-                  add_dep(last_grf_write[inst->src[i].reg * 16 + inst->src[i].reg_offset + r], n);
+                  add_dep(last_grf_write[inst->src[i].nr * 16 + inst->src[i].reg_offset + r], n);
                 }
              }
-         } else if (inst->src[i].file == HW_REG &&
-                    (inst->src[i].fixed_hw_reg.file ==
-                     BRW_GENERAL_REGISTER_FILE)) {
+         } else if (inst->src[i].file == FIXED_GRF) {
              if (post_reg_alloc) {
-               int size = reg_width;
-               if (inst->src[i].fixed_hw_reg.vstride == BRW_VERTICAL_STRIDE_0)
-                  size = 1;
-               for (int r = 0; r < size; r++)
-                  add_dep(last_grf_write[inst->src[i].fixed_hw_reg.nr + r], n);
+               for (int r = 0; r < inst->regs_read(i); r++)
+                  add_dep(last_grf_write[inst->src[i].nr + r], n);
              } else {
                 add_dep(last_fixed_grf_write, n);
              }
@@ -820,9 +965,7 @@ fs_instruction_scheduler::calculate_deps()
              add_dep(last_accumulator_write, n);
           } else if (inst->src[i].file != BAD_FILE &&
                      inst->src[i].file != IMM &&
-                    inst->src[i].file != UNIFORM &&
-                    (inst->src[i].file != HW_REG ||
-                     inst->src[i].fixed_hw_reg.file != BRW_IMMEDIATE_VALUE)) {
+                    inst->src[i].file != UNIFORM) {
              assert(inst->src[i].file != MRF);
              add_barrier_deps(n);
           }
@@ -847,36 +990,35 @@ fs_instruction_scheduler::calculate_deps()
        }
  
        /* write-after-write deps. */
-      if (inst->dst.file == GRF) {
+      if (inst->dst.file == VGRF) {
           if (post_reg_alloc) {
              for (int r = 0; r < inst->regs_written; r++) {
-               add_dep(last_grf_write[inst->dst.reg + r], n);
-               last_grf_write[inst->dst.reg + r] = n;
+               add_dep(last_grf_write[inst->dst.nr + r], n);
+               last_grf_write[inst->dst.nr + r] = n;
              }
           } else {
              for (int r = 0; r < inst->regs_written; r++) {
-               add_dep(last_grf_write[inst->dst.reg * 16 + inst->dst.reg_offset + r], n);
-               last_grf_write[inst->dst.reg * 16 + inst->dst.reg_offset + r] = n;
+               add_dep(last_grf_write[inst->dst.nr * 16 + inst->dst.reg_offset + r], n);
+               last_grf_write[inst->dst.nr * 16 + inst->dst.reg_offset + r] = n;
              }
           }
        } else if (inst->dst.file == MRF) {
-         int reg = inst->dst.reg & ~BRW_MRF_COMPR4;
+         int reg = inst->dst.nr & ~BRW_MRF_COMPR4;
  
           add_dep(last_mrf_write[reg], n);
           last_mrf_write[reg] = n;
           if (is_compressed(inst)) {
-            if (inst->dst.reg & BRW_MRF_COMPR4)
+            if (inst->dst.nr & BRW_MRF_COMPR4)
                 reg += 4;
              else
                 reg++;
              add_dep(last_mrf_write[reg], n);
              last_mrf_write[reg] = n;
           }
-      } else if (inst->dst.file == HW_REG &&
-                 inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
+      } else if (inst->dst.file == FIXED_GRF) {
           if (post_reg_alloc) {
-            for (int r = 0; r < reg_width; r++)
-               last_grf_write[inst->dst.fixed_hw_reg.nr + r] = n;
+            for (int r = 0; r < inst->regs_written; r++)
+               last_grf_write[inst->dst.nr + r] = n;
           } else {
              last_fixed_grf_write = n;
           }
@@ -924,34 +1066,27 @@ fs_instruction_scheduler::calculate_deps()
  
        /* write-after-read deps. */
        for (int i = 0; i < inst->sources; i++) {
-         if (inst->src[i].file == GRF) {
+         if (inst->src[i].file == VGRF) {
              if (post_reg_alloc) {
                 for (int r = 0; r < inst->regs_read(i); r++)
-                  add_dep(n, last_grf_write[inst->src[i].reg + r]);
+                  add_dep(n, last_grf_write[inst->src[i].nr + r], 0);
              } else {
                 for (int r = 0; r < inst->regs_read(i); r++) {
-                  add_dep(n, last_grf_write[inst->src[i].reg * 16 + inst->src[i].reg_offset + r]);
+                  add_dep(n, last_grf_write[inst->src[i].nr * 16 + inst->src[i].reg_offset + r], 0);
                 }
              }
-         } else if (inst->src[i].file == HW_REG &&
-                    (inst->src[i].fixed_hw_reg.file ==
-                     BRW_GENERAL_REGISTER_FILE)) {
+         } else if (inst->src[i].file == FIXED_GRF) {
              if (post_reg_alloc) {
-               int size = reg_width;
-               if (inst->src[i].fixed_hw_reg.vstride == BRW_VERTICAL_STRIDE_0)
-                  size = 1;
-               for (int r = 0; r < size; r++)
-                  add_dep(n, last_grf_write[inst->src[i].fixed_hw_reg.nr + r]);
+               for (int r = 0; r < inst->regs_read(i); r++)
+                  add_dep(n, last_grf_write[inst->src[i].nr + r], 0);
              } else {
-               add_dep(n, last_fixed_grf_write);
+               add_dep(n, last_fixed_grf_write, 0);
              }
           } else if (inst->src[i].is_accumulator()) {
-            add_dep(n, last_accumulator_write);
+            add_dep(n, last_accumulator_write, 0);
           } else if (inst->src[i].file != BAD_FILE &&
                      inst->src[i].file != IMM &&
-                    inst->src[i].file != UNIFORM &&
-                    (inst->src[i].file != HW_REG ||
-                     inst->src[i].fixed_hw_reg.file != BRW_IMMEDIATE_VALUE)) {
+                    inst->src[i].file != UNIFORM) {
              assert(inst->src[i].file != MRF);
              add_barrier_deps(n);
           }
@@ -978,33 +1113,32 @@ fs_instruction_scheduler::calculate_deps()
        /* Update the things this instruction wrote, so earlier reads
         * can mark this as WAR dependency.
         */
-      if (inst->dst.file == GRF) {
+      if (inst->dst.file == VGRF) {
           if (post_reg_alloc) {
              for (int r = 0; r < inst->regs_written; r++)
-               last_grf_write[inst->dst.reg + r] = n;
+               last_grf_write[inst->dst.nr + r] = n;
           } else {
              for (int r = 0; r < inst->regs_written; r++) {
-               last_grf_write[inst->dst.reg * 16 + inst->dst.reg_offset + r] = n;
+               last_grf_write[inst->dst.nr * 16 + inst->dst.reg_offset + r] = n;
              }
           }
        } else if (inst->dst.file == MRF) {
-         int reg = inst->dst.reg & ~BRW_MRF_COMPR4;
+         int reg = inst->dst.nr & ~BRW_MRF_COMPR4;
  
           last_mrf_write[reg] = n;
  
           if (is_compressed(inst)) {
-            if (inst->dst.reg & BRW_MRF_COMPR4)
+            if (inst->dst.nr & BRW_MRF_COMPR4)
                 reg += 4;
              else
                 reg++;
  
              last_mrf_write[reg] = n;
           }
-      } else if (inst->dst.file == HW_REG &&
-                 inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
+      } else if (inst->dst.file == FIXED_GRF) {
           if (post_reg_alloc) {
-            for (int r = 0; r < reg_width; r++)
-               last_grf_write[inst->dst.fixed_hw_reg.nr + r] = n;
+            for (int r = 0; r < inst->regs_written; r++)
+               last_grf_write[inst->dst.nr + r] = n;
           } else {
              last_fixed_grf_write = n;
           }
@@ -1066,21 +1200,17 @@ vec4_instruction_scheduler::calculate_deps()
  
        /* read-after-write deps. */
        for (int i = 0; i < 3; i++) {
-         if (inst->src[i].file == GRF) {
+         if (inst->src[i].file == VGRF) {
              for (unsigned j = 0; j < inst->regs_read(i); ++j)
-               add_dep(last_grf_write[inst->src[i].reg + j], n);
-         } else if (inst->src[i].file == HW_REG &&
-                    (inst->src[i].fixed_hw_reg.file ==
-                     BRW_GENERAL_REGISTER_FILE)) {
+               add_dep(last_grf_write[inst->src[i].nr + j], n);
+         } else if (inst->src[i].file == FIXED_GRF) {
              add_dep(last_fixed_grf_write, n);
           } else if (inst->src[i].is_accumulator()) {
              assert(last_accumulator_write);
              add_dep(last_accumulator_write, n);
           } else if (inst->src[i].file != BAD_FILE &&
                      inst->src[i].file != IMM &&
-                    inst->src[i].file != UNIFORM &&
-                    (inst->src[i].file != HW_REG ||
-                     inst->src[i].fixed_hw_reg.file != BRW_IMMEDIATE_VALUE)) {
+                    inst->src[i].file != UNIFORM) {
              /* No reads from MRF, and ATTR is already translated away */
              assert(inst->src[i].file != MRF &&
                     inst->src[i].file != ATTR);
@@ -1109,16 +1239,15 @@ vec4_instruction_scheduler::calculate_deps()
        }
  
        /* write-after-write deps. */
-      if (inst->dst.file == GRF) {
+      if (inst->dst.file == VGRF) {
           for (unsigned j = 0; j < inst->regs_written; ++j) {
-            add_dep(last_grf_write[inst->dst.reg + j], n);
-            last_grf_write[inst->dst.reg + j] = n;
+            add_dep(last_grf_write[inst->dst.nr + j], n);
+            last_grf_write[inst->dst.nr + j] = n;
           }
        } else if (inst->dst.file == MRF) {
-         add_dep(last_mrf_write[inst->dst.reg], n);
-         last_mrf_write[inst->dst.reg] = n;
-     } else if (inst->dst.file == HW_REG &&
-                 inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
+         add_dep(last_mrf_write[inst->dst.nr], n);
+         last_mrf_write[inst->dst.nr] = n;
+     } else if (inst->dst.file == FIXED_GRF) {
           last_fixed_grf_write = n;
        } else if (inst->dst.is_accumulator()) {
           add_dep(last_accumulator_write, n);
@@ -1164,20 +1293,16 @@ vec4_instruction_scheduler::calculate_deps()
  
        /* write-after-read deps. */
        for (int i = 0; i < 3; i++) {
-         if (inst->src[i].file == GRF) {
+         if (inst->src[i].file == VGRF) {
              for (unsigned j = 0; j < inst->regs_read(i); ++j)
-               add_dep(n, last_grf_write[inst->src[i].reg + j]);
-         } else if (inst->src[i].file == HW_REG &&
-                    (inst->src[i].fixed_hw_reg.file ==
-                     BRW_GENERAL_REGISTER_FILE)) {
+               add_dep(n, last_grf_write[inst->src[i].nr + j]);
+         } else if (inst->src[i].file == FIXED_GRF) {
              add_dep(n, last_fixed_grf_write);
           } else if (inst->src[i].is_accumulator()) {
              add_dep(n, last_accumulator_write);
           } else if (inst->src[i].file != BAD_FILE &&
                      inst->src[i].file != IMM &&
-                    inst->src[i].file != UNIFORM &&
-                    (inst->src[i].file != HW_REG ||
-                     inst->src[i].fixed_hw_reg.file != BRW_IMMEDIATE_VALUE)) {
+                    inst->src[i].file != UNIFORM) {
              assert(inst->src[i].file != MRF &&
                     inst->src[i].file != ATTR);
              add_barrier_deps(n);
@@ -1205,13 +1330,12 @@ vec4_instruction_scheduler::calculate_deps()
        /* Update the things this instruction wrote, so earlier reads
         * can mark this as WAR dependency.
         */
-      if (inst->dst.file == GRF) {
+      if (inst->dst.file == VGRF) {
           for (unsigned j = 0; j < inst->regs_written; ++j)
-            last_grf_write[inst->dst.reg + j] = n;
+            last_grf_write[inst->dst.nr + j] = n;
        } else if (inst->dst.file == MRF) {
-         last_mrf_write[inst->dst.reg] = n;
-      } else if (inst->dst.file == HW_REG &&
-                 inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
+         last_mrf_write[inst->dst.nr] = n;
+      } else if (inst->dst.file == FIXED_GRF) {
           last_fixed_grf_write = n;
        } else if (inst->dst.is_accumulator()) {
           last_accumulator_write = n;
@@ -1387,6 +1511,9 @@ instruction_scheduler::schedule_instructions(bblock_t *block)
     const struct brw_device_info *devinfo = bs->devinfo;
     backend_instruction *inst = block->end();
     time = 0;
+   if (!post_reg_alloc)
+      reg_pressure = reg_pressure_in[block->num];
+   block_idx = block->num;
  
     /* Remove non-DAG heads from the list. */
     foreach_in_list_safe(schedule_node, n, &instructions) {
@@ -1403,7 +1530,11 @@ instruction_scheduler::schedule_instructions(bblock_t *block)
        chosen->remove();
        inst->insert_before(block, chosen->inst);
        instructions_to_schedule--;
-      update_register_pressure(chosen->inst);
+
+      if (!post_reg_alloc) {
+         reg_pressure -= get_register_pressure_benefit(chosen->inst);
+         update_register_pressure(chosen->inst);
+      }
  
        /* If we expected a delay for scheduling, then bump the clock to reflect
         * that.  In reality, the hardware will switch to another hyperthread
@@ -1421,6 +1552,8 @@ instruction_scheduler::schedule_instructions(bblock_t *block)
        if (debug) {
           fprintf(stderr, "clock %4d, scheduled: ", time);
           bs->dump_instruction(chosen->inst);
+         if (!post_reg_alloc)
+            fprintf(stderr, "(register pressure %d)\n", reg_pressure);
        }
  
        /* Now that we've scheduled a new instruction, some of its
@@ -1467,30 +1600,53 @@ instruction_scheduler::schedule_instructions(bblock_t *block)
     if (block->end()->opcode == BRW_OPCODE_NOP)
        block->end()->remove(block);
     assert(instructions_to_schedule == 0);
+
+   block->cycle_count = time;
+}
+
+static unsigned get_cycle_count(cfg_t *cfg)
+{
+   unsigned count = 0, multiplier = 1;
+   foreach_block(block, cfg) {
+      if (block->start()->opcode == BRW_OPCODE_DO)
+         multiplier *= 10; /* assume that loops execute ~10 times */
+
+      count += block->cycle_count * multiplier;
+
+      if (block->end()->opcode == BRW_OPCODE_WHILE)
+         multiplier /= 10;
+   }
+
+   return count;
  }
  
  void
  instruction_scheduler::run(cfg_t *cfg)
  {
-   if (debug) {
+   if (debug && !post_reg_alloc) {
        fprintf(stderr, "\nInstructions before scheduling (reg_alloc %d)\n",
                post_reg_alloc);
-      bs->dump_instructions();
+         bs->dump_instructions();
     }
  
-   /* Populate the remaining GRF uses array to improve the pre-regalloc
-    * scheduling.
-    */
-   if (remaining_grf_uses) {
-      foreach_block_and_inst(block, backend_instruction, inst, cfg) {
-         count_remaining_grf_uses(inst);
-      }
-   }
+   if (!post_reg_alloc)
+      setup_liveness(cfg);
  
     foreach_block(block, cfg) {
        if (block->end_ip - block->start_ip <= 1)
           continue;
  
+      if (reads_remaining) {
+         memset(reads_remaining, 0,
+                grf_count * sizeof(*reads_remaining));
+         memset(hw_reads_remaining, 0,
+                hw_reg_count * sizeof(*hw_reads_remaining));
+         memset(written, 0, grf_count * sizeof(*written));
+
+         foreach_inst_in_block(fs_inst, inst, block)
+            count_reads_remaining(inst);
+      }
+
        add_insts_from_block(block);
  
        calculate_deps();
@@ -1502,23 +1658,29 @@ instruction_scheduler::run(cfg_t *cfg)
        schedule_instructions(block);
     }
  
-   if (debug) {
+   if (debug && !post_reg_alloc) {
        fprintf(stderr, "\nInstructions after scheduling (reg_alloc %d)\n",
                post_reg_alloc);
        bs->dump_instructions();
     }
+
+   cfg->cycle_count = get_cycle_count(cfg);
  }
  
  void
  fs_visitor::schedule_instructions(instruction_scheduler_mode mode)
  {
+   if (mode != SCHEDULE_POST)
+      calculate_live_intervals();
+
     int grf_count;
     if (mode == SCHEDULE_POST)
        grf_count = grf_used;
     else
        grf_count = alloc.count;
  
-   fs_instruction_scheduler sched(this, grf_count, mode);
+   fs_instruction_scheduler sched(this, grf_count, first_non_payload_grf,
+                                  cfg->num_blocks, mode);
     sched.run(cfg);
  
     if (unlikely(debug_enabled) && mode == SCHEDULE_POST) {