}
}
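+/* Encapsulates a single register-allocation attempt on an fs_visitor:
+ * interference-graph construction, spill-candidate selection and the
+ * spill/unspill rewrite all live here instead of on the visitor.
+ */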
+class fs_reg_alloc {
+public:
+ fs_reg_alloc(fs_visitor *fs):
+ fs(fs), devinfo(fs->devinfo), compiler(fs->compiler), g(NULL)
+ {
+ mem_ctx = ralloc_context(NULL);
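+      /* fs_reg_sets[] is indexed by the log2 of the register width: set 0
+       * serves SIMD8 (one GRF per node), set 1 serves SIMD16 (GRF pairs).
+       */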
+ int reg_width = fs->dispatch_width / 8;
+ rsi = _mesa_logbase2(reg_width);
+ }
+
+ ~fs_reg_alloc()
+ {
+ ralloc_free(mem_ctx);
+ }
+
+ bool assign_regs(bool allow_spilling, bool spill_all);
+
+private:
+ void setup_payload_interference(int payload_node_count,
+ int first_payload_node);
+ void setup_mrf_hack_interference(int first_mrf_node,
+ int *first_used_mrf);
+ void build_interference_graph(bool allow_spilling);
+
+ int choose_spill_reg();
+ void spill_reg(unsigned spill_reg);
+
+ void *mem_ctx;
+ fs_visitor *fs;
+ const gen_device_info *devinfo;
+ const brw_compiler *compiler;
+
+ /* Which compiler->fs_reg_sets[] to use */
+ int rsi;
+
+ ra_graph *g;
+};
+
/**
* Sets up interference between thread payload registers and the virtual GRFs
* (note that in SIMD16, a node is two registers).
*/
void
-fs_visitor::setup_payload_interference(struct ra_graph *g,
- int payload_node_count,
- int first_payload_node)
+fs_reg_alloc::setup_payload_interference(int payload_node_count,
+ int first_payload_node)
{
int payload_last_use_ip[payload_node_count];
- calculate_payload_ranges(payload_node_count, payload_last_use_ip);
+ fs->calculate_payload_ranges(payload_node_count, payload_last_use_ip);
for (int i = 0; i < payload_node_count; i++) {
if (payload_last_use_ip[i] == -1)
* live between the start of the program and our last use of the payload
* node.
*/
- for (unsigned j = 0; j < this->alloc.count; j++) {
+ for (unsigned j = 0; j < fs->alloc.count; j++) {
/* Note that we use a <= comparison, unlike virtual_grf_interferes(),
* in order to not have to worry about the uniform issue described in
* calculate_live_intervals().
*/
- if (this->virtual_grf_start[j] <= payload_last_use_ip[i]) {
+ if (fs->virtual_grf_start[j] <= payload_last_use_ip[i]) {
ra_add_node_interference(g, first_payload_node + i, j);
}
}
* The alternative would be to have per-physical-register classes, which
* would just be silly.
*/
- if (devinfo->gen <= 5 && dispatch_width >= 16) {
+ if (devinfo->gen <= 5 && fs->dispatch_width >= 16) {
/* We have to divide by 2 here because we only have even numbered
* registers. Some of the payload registers will be odd, but
* that's ok because their physical register numbers have already
}
}
+namespace {
+ /**
+ * Maximum spill block size we expect to encounter in 32B units.
+ *
+ * This is somewhat arbitrary and doesn't necessarily limit the maximum
+ * variable size that can be spilled -- A higher value will allow a
+ * variable of a given size to be spilled more efficiently with a smaller
+ * number of scratch messages, but will increase the likelihood of a
+ * collision between the MRFs reserved for spilling and other MRFs used by
+ * the program (and possibly increase GRF register pressure on platforms
+    * without hardware MRFs), which could cause register allocation to fail.
+ *
+ * For the moment reserve just enough space so a register of 32 bit
+ * component type and natural region width can be spilled without splitting
+ * into multiple (force_writemask_all) scratch messages.
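+    * (i.e. dispatch_width / 8 GRFs: one in SIMD8, two in SIMD16).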
+ */
+ unsigned
+ spill_max_size(const backend_shader *s)
+ {
+ /* FINISHME - On Gen7+ it should be possible to avoid this limit
+ * altogether by spilling directly from the temporary GRF
+ * allocated to hold the result of the instruction (and the
+ * scratch write header).
+ */
+ /* FINISHME - The shader's dispatch width probably belongs in
+ * backend_shader (or some nonexistent fs_shader class?)
+ * rather than in the visitor class.
+ */
+ return static_cast<const fs_visitor *>(s)->dispatch_width / 8;
+ }
+
+ /**
+ * First MRF register available for spilling.
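+    *
+    * E.g. on Gen7 (BRW_MAX_MRF == 16) in SIMD16 this is 16 - 2 - 1 = m13,
+    * which lines up with the "MRFs 13-15" note in spill_reg() below.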
+ */
+ unsigned
+ spill_base_mrf(const backend_shader *s)
+ {
+ return BRW_MAX_MRF(s->devinfo->gen) - spill_max_size(s) - 1;
+ }
+}
+
/**
* Sets interference between virtual GRFs and usage of the high GRFs for SEND
* messages (treated as MRFs in code generation).
*/
-static void
-setup_mrf_hack_interference(fs_visitor *v, struct ra_graph *g,
- int first_mrf_node, int *first_used_mrf)
+void
+fs_reg_alloc::setup_mrf_hack_interference(int first_mrf_node,
+ int *first_used_mrf)
{
- bool mrf_used[BRW_MAX_MRF(v->devinfo->gen)];
- get_used_mrfs(v, mrf_used);
-
- *first_used_mrf = BRW_MAX_MRF(v->devinfo->gen);
- for (int i = 0; i < BRW_MAX_MRF(v->devinfo->gen); i++) {
+ *first_used_mrf = spill_base_mrf(fs);
+ for (int i = spill_base_mrf(fs); i < BRW_MAX_MRF(devinfo->gen); i++) {
/* Mark each MRF reg node as being allocated to its physical register.
*
* The alternative would be to have per-physical-register classes, which
*/
ra_set_node_reg(g, first_mrf_node + i, GEN7_MRF_HACK_START + i);
- /* Since we don't have any live/dead analysis on the MRFs, just mark all
- * that are used as conflicting with all virtual GRFs.
- */
- if (mrf_used[i]) {
- if (i < *first_used_mrf)
- *first_used_mrf = i;
-
- for (unsigned j = 0; j < v->alloc.count; j++) {
- ra_add_node_interference(g, first_mrf_node + i, j);
- }
- }
+ for (unsigned j = 0; j < fs->alloc.count; j++)
+ ra_add_node_interference(g, first_mrf_node + i, j);
}
}
-bool
-fs_visitor::assign_regs(bool allow_spilling, bool spill_all)
+void
+fs_reg_alloc::build_interference_graph(bool allow_spilling)
{
+ const gen_device_info *devinfo = fs->devinfo;
+ const brw_compiler *compiler = fs->compiler;
+
/* Most of this allocation was written for a reg_width of 1
* (dispatch_width == 8). In extending to SIMD16, the code was
* left in place and it was converted to have the hardware
* registers it's allocating be contiguous physical pairs of regs
* for reg_width == 2.
*/
- int reg_width = dispatch_width / 8;
- unsigned hw_reg_mapping[this->alloc.count];
- int payload_node_count = ALIGN(this->first_non_payload_grf, reg_width);
- int rsi = _mesa_logbase2(reg_width); /* Which compiler->fs_reg_sets[] to use */
- calculate_live_intervals();
+ int reg_width = fs->dispatch_width / 8;
+ int payload_node_count = ALIGN(fs->first_non_payload_grf, reg_width);
+
+ fs->calculate_live_intervals();
- int node_count = this->alloc.count;
+ int node_count = fs->alloc.count;
int first_payload_node = node_count;
node_count += payload_node_count;
int first_mrf_hack_node = node_count;
int grf127_send_hack_node = node_count;
if (devinfo->gen >= 8)
node_count ++;
- struct ra_graph *g =
- ra_alloc_interference_graph(compiler->fs_reg_sets[rsi].regs, node_count);
- for (unsigned i = 0; i < this->alloc.count; i++) {
- unsigned size = this->alloc.sizes[i];
+ assert(g == NULL);
+ g = ra_alloc_interference_graph(compiler->fs_reg_sets[rsi].regs, node_count);
+ ralloc_steal(mem_ctx, g);
+
+ for (unsigned i = 0; i < fs->alloc.count; i++) {
+ unsigned size = fs->alloc.sizes[i];
int c;
assert(size <= ARRAY_SIZE(compiler->fs_reg_sets[rsi].classes) &&
* second operand of a PLN instruction needs to be an
* even-numbered register, so we have a special register class
* wm_aligned_pairs_class to handle this case. pre-GEN6 always
- * uses this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL] as the
+ * uses fs->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL] as the
* second operand of a PLN instruction (since it doesn't support
* any other interpolation modes). So all we need to do is find
* that register and set it to the appropriate class.
*/
if (compiler->fs_reg_sets[rsi].aligned_pairs_class >= 0 &&
- this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL].file == VGRF &&
- this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL].nr == i) {
+ fs->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL].file == VGRF &&
+ fs->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL].nr == i) {
c = compiler->fs_reg_sets[rsi].aligned_pairs_class;
}
ra_set_node_class(g, i, c);
for (unsigned j = 0; j < i; j++) {
- if (virtual_grf_interferes(i, j)) {
+ if (fs->virtual_grf_interferes(i, j)) {
ra_add_node_interference(g, i, j);
}
}
/* Certain instructions can't safely use the same register for their
* sources and destination. Add interference.
*/
- foreach_block_and_inst(block, fs_inst, inst, cfg) {
+ foreach_block_and_inst(block, fs_inst, inst, fs->cfg) {
if (inst->dst.file == VGRF && inst->has_source_and_destination_hazard()) {
- for (unsigned i = 0; i < 3; i++) {
+ for (unsigned i = 0; i < inst->sources; i++) {
if (inst->src[i].file == VGRF) {
ra_add_node_interference(g, inst->dst.nr, inst->src[i].nr);
}
}
}
- setup_payload_interference(g, payload_node_count, first_payload_node);
+ setup_payload_interference(payload_node_count, first_payload_node);
if (devinfo->gen >= 7) {
int first_used_mrf = BRW_MAX_MRF(devinfo->gen);
- setup_mrf_hack_interference(this, g, first_mrf_hack_node,
- &first_used_mrf);
+ if (allow_spilling)
+ setup_mrf_hack_interference(first_mrf_hack_node, &first_used_mrf);
- foreach_block_and_inst(block, fs_inst, inst, cfg) {
+ foreach_block_and_inst(block, fs_inst, inst, fs->cfg) {
/* When we do send-from-GRF for FB writes, we need to ensure that
* the last write instruction sends from a high register. This is
* because the vertex fetcher wants to start filling the low
if (inst->eot) {
const int vgrf = inst->opcode == SHADER_OPCODE_SEND ?
inst->src[2].nr : inst->src[0].nr;
- int size = alloc.sizes[vgrf];
+ int size = fs->alloc.sizes[vgrf];
int reg = compiler->fs_reg_sets[rsi].class_to_ra_reg_range[size] - 1;
/* If something happened to spill, we want to push the EOT send
* about this level of granularity, we simply make the source and
* destination interfere.
*/
- foreach_block_and_inst(block, fs_inst, inst, cfg) {
+ foreach_block_and_inst(block, fs_inst, inst, fs->cfg) {
if (inst->exec_size < 16 || inst->dst.file != VGRF)
continue;
* any register overlap between sources and destination.
*/
ra_set_node_reg(g, grf127_send_hack_node, 127);
- foreach_block_and_inst(block, fs_inst, inst, cfg) {
+ foreach_block_and_inst(block, fs_inst, inst, fs->cfg) {
if (inst->exec_size < 16 && inst->is_send_from_grf() &&
inst->dst.file == VGRF)
ra_add_node_interference(g, inst->dst.nr, grf127_send_hack_node);
}
- if (spilled_any_registers) {
- foreach_block_and_inst(block, fs_inst, inst, cfg) {
+ if (fs->spilled_any_registers) {
+ foreach_block_and_inst(block, fs_inst, inst, fs->cfg) {
         /* Spilling instructions are generated as SEND messages from MRF,
          * but as Gen7+ supports sending from GRF the driver will map
          * these MRF registers to a GRF. Implementations reuse
* interference here.
*/
if (devinfo->gen >= 9) {
- foreach_block_and_inst(block, fs_inst, inst, cfg) {
+ foreach_block_and_inst(block, fs_inst, inst, fs->cfg) {
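+         /* ra_add_node_interference() takes node indices, and the first
+          * alloc.count nodes are whole VGRFs, so one interference between
+          * the two payload VGRFs is enough; offsetting the node index by
+          * mlen/ex_mlen as the old code did would hit unrelated nodes.
+          */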
if (inst->opcode == SHADER_OPCODE_SEND && inst->ex_mlen > 0 &&
inst->src[2].file == VGRF &&
inst->src[3].file == VGRF &&
- inst->src[2].nr != inst->src[3].nr) {
- for (unsigned i = 0; i < inst->mlen; i++) {
- for (unsigned j = 0; j < inst->ex_mlen; j++) {
- ra_add_node_interference(g, inst->src[2].nr + i,
- inst->src[3].nr + j);
- }
- }
- }
- }
- }
-
- /* Debug of register spilling: Go spill everything. */
- if (unlikely(spill_all)) {
- int reg = choose_spill_reg(g);
-
- if (reg != -1) {
- spill_reg(reg);
- ralloc_free(g);
- return false;
- }
- }
-
- if (!ra_allocate(g)) {
- /* Failed to allocate registers. Spill a reg, and the caller will
- * loop back into here to try again.
- */
- int reg = choose_spill_reg(g);
-
- if (reg == -1) {
- fail("no register to spill:\n");
- dump_instructions(NULL);
- } else if (allow_spilling) {
- spill_reg(reg);
- }
-
- ralloc_free(g);
-
- return false;
- }
-
- /* Get the chosen virtual registers for each node, and map virtual
- * regs in the register classes back down to real hardware reg
- * numbers.
- */
- this->grf_used = payload_node_count;
- for (unsigned i = 0; i < this->alloc.count; i++) {
- int reg = ra_get_node_reg(g, i);
-
- hw_reg_mapping[i] = compiler->fs_reg_sets[rsi].ra_reg_to_grf[reg];
- this->grf_used = MAX2(this->grf_used,
- hw_reg_mapping[i] + this->alloc.sizes[i]);
- }
-
- foreach_block_and_inst(block, fs_inst, inst, cfg) {
- assign_reg(hw_reg_mapping, &inst->dst);
- for (int i = 0; i < inst->sources; i++) {
- assign_reg(hw_reg_mapping, &inst->src[i]);
+ inst->src[2].nr != inst->src[3].nr)
+ ra_add_node_interference(g, inst->src[2].nr,
+ inst->src[3].nr);
}
}
-
- this->alloc.count = this->grf_used;
-
- ralloc_free(g);
-
- return true;
-}
-
-namespace {
- /**
- * Maximum spill block size we expect to encounter in 32B units.
- *
- * This is somewhat arbitrary and doesn't necessarily limit the maximum
- * variable size that can be spilled -- A higher value will allow a
- * variable of a given size to be spilled more efficiently with a smaller
- * number of scratch messages, but will increase the likelihood of a
- * collision between the MRFs reserved for spilling and other MRFs used by
- * the program (and possibly increase GRF register pressure on platforms
- * without hardware MRFs), what could cause register allocation to fail.
- *
- * For the moment reserve just enough space so a register of 32 bit
- * component type and natural region width can be spilled without splitting
- * into multiple (force_writemask_all) scratch messages.
- */
- unsigned
- spill_max_size(const backend_shader *s)
- {
- /* FINISHME - On Gen7+ it should be possible to avoid this limit
- * altogether by spilling directly from the temporary GRF
- * allocated to hold the result of the instruction (and the
- * scratch write header).
- */
- /* FINISHME - The shader's dispatch width probably belongs in
- * backend_shader (or some nonexistent fs_shader class?)
- * rather than in the visitor class.
- */
- return static_cast<const fs_visitor *>(s)->dispatch_width / 8;
- }
-
- /**
- * First MRF register available for spilling.
- */
- unsigned
- spill_base_mrf(const backend_shader *s)
- {
- return BRW_MAX_MRF(s->devinfo->gen) - spill_max_size(s) - 1;
- }
}
static void
}
int
-fs_visitor::choose_spill_reg(struct ra_graph *g)
+fs_reg_alloc::choose_spill_reg()
{
float block_scale = 1.0;
- float spill_costs[this->alloc.count];
- bool no_spill[this->alloc.count];
+ float spill_costs[fs->alloc.count];
+ bool no_spill[fs->alloc.count];
- for (unsigned i = 0; i < this->alloc.count; i++) {
+ for (unsigned i = 0; i < fs->alloc.count; i++) {
spill_costs[i] = 0.0;
no_spill[i] = false;
}
* spill/unspill we'll have to do, and guess that the insides of
* loops run 10 times.
*/
- foreach_block_and_inst(block, fs_inst, inst, cfg) {
+ foreach_block_and_inst(block, fs_inst, inst, fs->cfg) {
for (unsigned int i = 0; i < inst->sources; i++) {
if (inst->src[i].file == VGRF)
spill_costs[inst->src[i].nr] += regs_read(inst, i) * block_scale;
}
}
- for (unsigned i = 0; i < this->alloc.count; i++) {
+ for (unsigned i = 0; i < fs->alloc.count; i++) {
+ int live_length = fs->virtual_grf_end[i] - fs->virtual_grf_start[i];
+ if (live_length <= 0)
+ continue;
+
+ /* Divide the cost (in number of spills/fills) by the log of the length
+ * of the live range of the register. This will encourage spill logic
+ * to spill long-living things before spilling short-lived things where
+ * spilling is less likely to actually do us any good. We use the log
+ * of the length because it will fall off very quickly and not cause us
+ * to spill medium length registers with more uses.
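+       * (logf(10) is about 2.3 while logf(1000) is about 6.9, so a live
+       * range 100x longer only triples the divisor.)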
+ */
+ float adjusted_cost = spill_costs[i] / logf(live_length);
if (!no_spill[i])
- ra_set_node_spill_cost(g, i, spill_costs[i]);
+ ra_set_node_spill_cost(g, i, adjusted_cost);
}
return ra_get_best_spill_node(g);
}
void
-fs_visitor::spill_reg(unsigned spill_reg)
+fs_reg_alloc::spill_reg(unsigned spill_reg)
{
- int size = alloc.sizes[spill_reg];
- unsigned int spill_offset = last_scratch;
+ int size = fs->alloc.sizes[spill_reg];
+ unsigned int spill_offset = fs->last_scratch;
assert(ALIGN(spill_offset, 16) == spill_offset); /* oword read/write req. */
/* Spills may use MRFs 13-15 in the SIMD16 case. Our texturing is done
* depth), starting from m1. In summary: We may not be able to spill in
* SIMD16 mode, because we'd stomp the FB writes.
*/
- if (!spilled_any_registers) {
+ if (!fs->spilled_any_registers) {
bool mrf_used[BRW_MAX_MRF(devinfo->gen)];
- get_used_mrfs(this, mrf_used);
+ get_used_mrfs(fs, mrf_used);
- for (int i = spill_base_mrf(this); i < BRW_MAX_MRF(devinfo->gen); i++) {
+ for (int i = spill_base_mrf(fs); i < BRW_MAX_MRF(devinfo->gen); i++) {
if (mrf_used[i]) {
- fail("Register spilling not supported with m%d used", i);
+ fs->fail("Register spilling not supported with m%d used", i);
return;
}
}
- spilled_any_registers = true;
+ fs->spilled_any_registers = true;
}
- last_scratch += size * REG_SIZE;
+ fs->last_scratch += size * REG_SIZE;
/* Generate spill/unspill instructions for the objects being
* spilled. Right now, we spill or unspill the whole thing to a
* virtual grf of the same size. For most instructions, though, we
* could just spill/unspill the GRF being accessed.
*/
- foreach_block_and_inst (block, fs_inst, inst, cfg) {
- const fs_builder ibld = fs_builder(this, block, inst);
+ foreach_block_and_inst (block, fs_inst, inst, fs->cfg) {
+ const fs_builder ibld = fs_builder(fs, block, inst);
for (unsigned int i = 0; i < inst->sources; i++) {
if (inst->src[i].file == VGRF &&
int count = regs_read(inst, i);
int subset_spill_offset = spill_offset +
ROUND_DOWN_TO(inst->src[i].offset, REG_SIZE);
- fs_reg unspill_dst(VGRF, alloc.allocate(count));
+ fs_reg unspill_dst(VGRF, fs->alloc.allocate(count));
inst->src[i].nr = unspill_dst.nr;
inst->src[i].offset %= REG_SIZE;
inst->dst.nr == spill_reg) {
int subset_spill_offset = spill_offset +
ROUND_DOWN_TO(inst->dst.offset, REG_SIZE);
- fs_reg spill_src(VGRF, alloc.allocate(regs_written(inst)));
+ fs_reg spill_src(VGRF, fs->alloc.allocate(regs_written(inst)));
inst->dst.nr = spill_src.nr;
inst->dst.offset %= REG_SIZE;
*/
const unsigned width = 8 * MIN2(
DIV_ROUND_UP(inst->dst.component_size(inst->exec_size), REG_SIZE),
- spill_max_size(this));
+ spill_max_size(fs));
/* Spills should only write data initialized by the instruction for
       * whichever channels are enabled in the execution mask. If that's
* write, there should be no need for the unspill since the
* instruction will be overwriting the whole destination in any case.
*/
         if (inst->is_partial_reg_write() ||
+ if (inst->is_partial_write() ||
(!inst->force_writemask_all && !per_channel))
emit_unspill(ubld, spill_src, subset_spill_offset,
regs_written(inst));
}
}
- invalidate_live_intervals();
+ fs->invalidate_live_intervals();
+}
+
+bool
+fs_reg_alloc::assign_regs(bool allow_spilling, bool spill_all)
+{
+ while (1) {
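+      /* Rebuild the graph from scratch on every iteration: passing
+       * spilled_any_registers means the spill-MRF interference from
+       * setup_mrf_hack_interference() is only added once something has
+       * actually spilled.
+       */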
+ build_interference_graph(fs->spilled_any_registers);
+
+ /* Debug of register spilling: Go spill everything. */
+ if (unlikely(spill_all)) {
+ int reg = choose_spill_reg();
+ if (reg != -1) {
+ spill_reg(reg);
+ ralloc_free(g);
+ g = NULL;
+ continue;
+ }
+ }
+
+ if (ra_allocate(g))
+ break;
+
+ if (!allow_spilling)
+ return false;
+
+      /* Failed to allocate registers.  Spill a reg, and loop back around
+       * to try again.
+       */
+ int reg = choose_spill_reg();
+ if (reg == -1)
+ return false;
+
+ spill_reg(reg);
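+
+      /* Spilling rewrote the program and allocated fresh VGRFs, so the
+       * interference graph is stale; free it and rebuild next time around.
+       */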
+ ralloc_free(g);
+ g = NULL;
+ }
+
+ /* Get the chosen virtual registers for each node, and map virtual
+ * regs in the register classes back down to real hardware reg
+ * numbers.
+ */
+ unsigned hw_reg_mapping[fs->alloc.count];
+ fs->grf_used = fs->first_non_payload_grf;
+ for (unsigned i = 0; i < fs->alloc.count; i++) {
+ int reg = ra_get_node_reg(g, i);
+
+ hw_reg_mapping[i] = compiler->fs_reg_sets[rsi].ra_reg_to_grf[reg];
+ fs->grf_used = MAX2(fs->grf_used,
+ hw_reg_mapping[i] + fs->alloc.sizes[i]);
+ }
+
+ foreach_block_and_inst(block, fs_inst, inst, fs->cfg) {
+ assign_reg(hw_reg_mapping, &inst->dst);
+ for (int i = 0; i < inst->sources; i++) {
+ assign_reg(hw_reg_mapping, &inst->src[i]);
+ }
+ }
+
+ fs->alloc.count = fs->grf_used;
+
+ return true;
+}
+
+bool
+fs_visitor::assign_regs(bool allow_spilling, bool spill_all)
+{
+ fs_reg_alloc alloc(this);
+ bool success = alloc.assign_regs(allow_spilling, spill_all);
+ if (!success && allow_spilling) {
+ fail("no register to spill:\n");
+ dump_instructions(NULL);
+ }
+ return success;
}