i965: Define method to check whether a backend_reg is inside a given range.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
index b9bd94c3070d275ba116678fe97497732114d569..a57f501a37e2b1ed17a62ed37e96ba7e9482ef16 100644 (file)
@@ -28,8 +28,6 @@
  * from the LIR.
  */
 
-extern "C" {
-
 #include <sys/types.h>
 
 #include "util/hash_table.h"
@@ -39,28 +37,30 @@ extern "C" {
 #include "program/prog_parameter.h"
 #include "program/prog_print.h"
 #include "util/register_allocate.h"
-#include "program/sampler.h"
 #include "program/hash_table.h"
 #include "brw_context.h"
 #include "brw_eu.h"
 #include "brw_wm.h"
-}
 #include "brw_fs.h"
 #include "brw_cfg.h"
 #include "brw_dead_control_flow.h"
 #include "main/uniforms.h"
 #include "brw_fs_live_variables.h"
 #include "glsl/glsl_types.h"
+#include "program/sampler.h"
 
 void
 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
-              fs_reg *src, int sources)
+              const fs_reg *src, unsigned sources)
 {
    memset(this, 0, sizeof(*this));
 
+   this->src = new fs_reg[MAX2(sources, 3)];
+   for (unsigned i = 0; i < sources; i++)
+      this->src[i] = src[i];
+
    this->opcode = opcode;
    this->dst = dst;
-   this->src = src;
    this->sources = sources;
    this->exec_size = exec_size;
 
@@ -75,8 +75,8 @@ fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
       if (dst.file == GRF) {
          this->exec_size = dst.width;
       } else {
-         for (int i = 0; i < sources; ++i) {
-            if (src[i].file != GRF)
+         for (unsigned i = 0; i < sources; ++i) {
+            if (src[i].file != GRF && src[i].file != ATTR)
                continue;
 
             if (this->exec_size <= 1)
@@ -90,13 +90,14 @@ fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
    }
    assert(this->exec_size != 0);
 
-   for (int i = 0; i < sources; ++i) {
+   for (unsigned i = 0; i < sources; ++i) {
       switch (this->src[i].file) {
       case BAD_FILE:
          this->src[i].effective_width = 8;
          break;
       case GRF:
       case HW_REG:
+      case ATTR:
          assert(this->src[i].width > 0);
          if (this->src[i].width == 1) {
             this->src[i].effective_width = this->exec_size;
@@ -121,7 +122,9 @@ fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
    case GRF:
    case HW_REG:
    case MRF:
-      this->regs_written = (dst.width * dst.stride * type_sz(dst.type) + 31) / 32;
+   case ATTR:
+      this->regs_written =
+         DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
       break;
    case BAD_FILE:
       this->regs_written = 0;
@@ -138,82 +141,68 @@ fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
 
 fs_inst::fs_inst()
 {
-   fs_reg *src = ralloc_array(this, fs_reg, 3);
-   init(BRW_OPCODE_NOP, 8, dst, src, 0);
+   init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
 }
 
 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
 {
-   fs_reg *src = ralloc_array(this, fs_reg, 3);
-   init(opcode, exec_size, reg_undef, src, 0);
+   init(opcode, exec_size, reg_undef, NULL, 0);
 }
 
 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
 {
-   fs_reg *src = ralloc_array(this, fs_reg, 3);
-   init(opcode, 0, dst, src, 0);
+   init(opcode, 0, dst, NULL, 0);
 }
 
 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
                  const fs_reg &src0)
 {
-   fs_reg *src = ralloc_array(this, fs_reg, 3);
-   src[0] = src0;
+   const fs_reg src[1] = { src0 };
    init(opcode, exec_size, dst, src, 1);
 }
 
 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
 {
-   fs_reg *src = ralloc_array(this, fs_reg, 3);
-   src[0] = src0;
+   const fs_reg src[1] = { src0 };
    init(opcode, 0, dst, src, 1);
 }
 
 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
                  const fs_reg &src0, const fs_reg &src1)
 {
-   fs_reg *src = ralloc_array(this, fs_reg, 3);
-   src[0] = src0;
-   src[1] = src1;
+   const fs_reg src[2] = { src0, src1 };
    init(opcode, exec_size, dst, src, 2);
 }
 
 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
                  const fs_reg &src1)
 {
-   fs_reg *src = ralloc_array(this, fs_reg, 3);
-   src[0] = src0;
-   src[1] = src1;
+   const fs_reg src[2] = { src0, src1 };
    init(opcode, 0, dst, src, 2);
 }
 
 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
                  const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
 {
-   fs_reg *src = ralloc_array(this, fs_reg, 3);
-   src[0] = src0;
-   src[1] = src1;
-   src[2] = src2;
+   const fs_reg src[3] = { src0, src1, src2 };
    init(opcode, exec_size, dst, src, 3);
 }
 
 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
                  const fs_reg &src1, const fs_reg &src2)
 {
-   fs_reg *src = ralloc_array(this, fs_reg, 3);
-   src[0] = src0;
-   src[1] = src1;
-   src[2] = src2;
+   const fs_reg src[3] = { src0, src1, src2 };
    init(opcode, 0, dst, src, 3);
 }
 
-fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, fs_reg src[], int sources)
+fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
+                 const fs_reg src[], unsigned sources)
 {
    init(opcode, 0, dst, src, sources);
 }
 
 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
-                 fs_reg src[], int sources)
+                 const fs_reg src[], unsigned sources)
 {
    init(opcode, exec_width, dst, src, sources);
 }
@@ -222,17 +211,28 @@ fs_inst::fs_inst(const fs_inst &that)
 {
    memcpy(this, &that, sizeof(that));
 
-   this->src = ralloc_array(this, fs_reg, that.sources);
+   this->src = new fs_reg[MAX2(that.sources, 3)];
 
-   for (int i = 0; i < that.sources; i++)
+   for (unsigned i = 0; i < that.sources; i++)
       this->src[i] = that.src[i];
 }
 
+fs_inst::~fs_inst()
+{
+   delete[] src; /* new[]'d by init(), the copy ctor and resize_sources() */
+}
+
 void
 fs_inst::resize_sources(uint8_t num_sources)
 {
    if (this->sources != num_sources) {
-      this->src = reralloc(this, this->src, fs_reg, num_sources);
+      fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
+
+      for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
+         src[i] = this->src[i];
+
+      delete[] this->src;
+      this->src = src;
       this->sources = num_sources;
    }
 }
@@ -337,17 +337,13 @@ fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
     *
     * Original gen4 does type conversion to the destination type before
     * comparison, producing garbage results for floating point comparisons.
-    * gen5 does the comparison on the execution type (resolved source types),
-    * so dst type doesn't matter.  gen6 does comparison and then uses the
-    * result as if it was the dst type with no conversion, which happens to
-    * mostly work out for float-interpreted-as-int since our comparisons are
-    * for >0, =0, <0.
+    *
+    * The destination type doesn't matter on newer generations, so we set the
+    * type to match src0 so we can compact the instruction.
     */
-   if (brw->gen == 4) {
-      dst.type = src0.type;
-      if (dst.file == HW_REG)
-        dst.fixed_hw_reg.type = dst.type;
-   }
+   dst.type = src0.type;
+   if (dst.file == HW_REG)
+      dst.fixed_hw_reg.type = dst.type;
 
    resolve_ud_negate(&src0);
    resolve_ud_negate(&src1);
@@ -376,7 +372,7 @@ fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
        * dealing with whole registers.  If this ever changes, we can deal
        * with it later.
        */
-      int size = src[i].effective_width * type_sz(src[i].type);
+      int size = inst->src[i].effective_width * type_sz(src[i].type);
       assert(size % 32 == 0);
       inst->regs_written += (size + 31) / 32;
    }
@@ -404,7 +400,7 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
     * CSE can later notice that those loads are all the same and eliminate
     * the redundant ones.
     */
-   fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
+   fs_reg vec4_offset = vgrf(glsl_type::int_type);
    instructions.push_tail(ADD(vec4_offset,
                               varying_offset, fs_reg(const_offset & ~3)));
 
@@ -426,7 +422,7 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
 
    assert(dst.width % 8 == 0);
    int regs_written = 4 * (dst.width / 8) * scale;
-   fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(regs_written),
+   fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
                                dst.type, dst.width);
    inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
    inst->regs_written = regs_written;
@@ -491,10 +487,7 @@ fs_inst::equals(fs_inst *inst) const
 bool
 fs_inst::overwrites_reg(const fs_reg &reg) const
 {
-   return (reg.file == dst.file &&
-           reg.reg == dst.reg &&
-           reg.reg_offset >= dst.reg_offset  &&
-           reg.reg_offset < dst.reg_offset + regs_written);
+   return reg.in_range(dst, regs_written);
 }
 
 bool
@@ -509,6 +502,7 @@ fs_inst::is_send_from_grf() const
    case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
    case SHADER_OPCODE_UNTYPED_ATOMIC:
    case SHADER_OPCODE_UNTYPED_SURFACE_READ:
+   case SHADER_OPCODE_URB_WRITE_SIMD8:
       return true;
    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
       return src[1].file == GRF;
@@ -581,6 +575,27 @@ fs_reg::fs_reg(uint32_t u)
    this->width = 1;
 }
 
+/** IMM register of type VF; copies sizeof(unsigned) bytes from \p vf. */
+fs_reg::fs_reg(uint8_t vf[4])
+{
+   init();
+   file = IMM;
+   type = BRW_REGISTER_TYPE_VF;
+   memcpy(&fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
+}
+
+/** Vector float immediate from four bytes; vf0 is the lowest byte of the UD. */
+fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
+{
+   init();
+   this->file = IMM;
+   this->type = BRW_REGISTER_TYPE_VF;
+   /* Widen before shifting: a uint8_t promotes to int, and vf3 << 24 would
+    * shift into the sign bit for vf3 >= 0x80. */
+   this->fixed_hw_reg.dw1.ud = ((uint32_t)vf0 <<  0) | ((uint32_t)vf1 <<  8) |
+                               ((uint32_t)vf2 << 16) | ((uint32_t)vf3 << 24);
+}
+
 /** Fixed brw_reg. */
 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
 {
@@ -607,16 +622,6 @@ fs_reg::equals(const fs_reg &r) const
            stride == r.stride);
 }
 
-fs_reg &
-fs_reg::apply_stride(unsigned stride)
-{
-   assert((this->stride * stride) <= 4 &&
-          (is_power_of_two(stride) || stride == 0) &&
-          file != HW_REG && file != IMM);
-   this->stride *= stride;
-   return *this;
-}
-
 fs_reg &
 fs_reg::set_smear(unsigned subreg)
 {
@@ -632,12 +637,6 @@ fs_reg::is_contiguous() const
    return stride == 1;
 }
 
-bool
-fs_reg::is_valid_3src() const
-{
-   return file == GRF || file == UNIFORM;
-}
-
 int
 fs_visitor::type_size(const struct glsl_type *type)
 {
@@ -668,30 +667,36 @@ fs_visitor::type_size(const struct glsl_type *type)
    case GLSL_TYPE_VOID:
    case GLSL_TYPE_ERROR:
    case GLSL_TYPE_INTERFACE:
+   case GLSL_TYPE_DOUBLE:
       unreachable("not reached");
    }
 
    return 0;
 }
 
+/**
+ * Create a MOV to read the timestamp register.
+ *
+ * The caller is responsible for emitting the MOV.  The return value is
+ * the destination of the MOV, with extra parameters set.
+ */
 fs_reg
-fs_visitor::get_timestamp()
+fs_visitor::get_timestamp(fs_inst **out_mov)
 {
    assert(brw->gen >= 7);
 
-   fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
+   fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                           BRW_ARF_TIMESTAMP,
                                           0),
                              BRW_REGISTER_TYPE_UD));
 
-   fs_reg dst = fs_reg(this, glsl_type::uint_type);
+   fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
 
-   fs_inst *mov = emit(MOV(dst, ts));
-   /* We want to read the 3 fields we care about (mostly field 0, but also 2)
-    * even if it's not enabled in the dispatch.
+   fs_inst *mov = MOV(dst, ts);
+   /* We want to read the 3 fields we care about even if it's not enabled in
+    * the dispatch.
     */
    mov->force_writemask_all = true;
-   mov->exec_size = 8;
 
    /* The caller wants the low 32 bits of the timestamp.  Since it's running
     * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
@@ -705,6 +710,7 @@ fs_visitor::get_timestamp()
     */
    dst.set_smear(0);
 
+   *out_mov = mov;
    return dst;
 }
 
@@ -712,7 +718,9 @@ void
 fs_visitor::emit_shader_time_begin()
 {
    current_annotation = "shader time start";
-   shader_start_time = get_timestamp();
+   fs_inst *mov;
+   shader_start_time = get_timestamp(&mov);
+   emit(mov);
 }
 
 void
@@ -721,52 +729,77 @@ fs_visitor::emit_shader_time_end()
    current_annotation = "shader time end";
 
    enum shader_time_shader_type type, written_type, reset_type;
-   if (dispatch_width == 8) {
-      type = ST_FS8;
-      written_type = ST_FS8_WRITTEN;
-      reset_type = ST_FS8_RESET;
-   } else {
-      assert(dispatch_width == 16);
-      type = ST_FS16;
-      written_type = ST_FS16_WRITTEN;
-      reset_type = ST_FS16_RESET;
+   switch (stage) {
+   case MESA_SHADER_VERTEX:
+      type = ST_VS;
+      written_type = ST_VS_WRITTEN;
+      reset_type = ST_VS_RESET;
+      break;
+   case MESA_SHADER_GEOMETRY:
+      type = ST_GS;
+      written_type = ST_GS_WRITTEN;
+      reset_type = ST_GS_RESET;
+      break;
+   case MESA_SHADER_FRAGMENT:
+      if (dispatch_width == 8) {
+         type = ST_FS8;
+         written_type = ST_FS8_WRITTEN;
+         reset_type = ST_FS8_RESET;
+      } else {
+         assert(dispatch_width == 16);
+         type = ST_FS16;
+         written_type = ST_FS16_WRITTEN;
+         reset_type = ST_FS16_RESET;
+      }
+      break;
+   default:
+      unreachable("fs_visitor::emit_shader_time_end missing code");
    }
 
-   fs_reg shader_end_time = get_timestamp();
+   /* Insert our code just before the final SEND with EOT. */
+   exec_node *end = this->instructions.get_tail();
+   assert(end && ((fs_inst *) end)->eot);
+
+   fs_inst *tm_read;
+   fs_reg shader_end_time = get_timestamp(&tm_read);
+   end->insert_before(tm_read);
 
    /* Check that there weren't any timestamp reset events (assuming these
     * were the only two timestamp reads that happened).
     */
    fs_reg reset = shader_end_time;
    reset.set_smear(2);
-   fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
+   fs_inst *test = AND(reg_null_d, reset, fs_reg(1u));
    test->conditional_mod = BRW_CONDITIONAL_Z;
-   emit(IF(BRW_PREDICATE_NORMAL));
+   test->force_writemask_all = true;
+   end->insert_before(test);
+   end->insert_before(IF(BRW_PREDICATE_NORMAL));
 
-   push_force_uncompressed();
    fs_reg start = shader_start_time;
    start.negate = true;
-   fs_reg diff = fs_reg(this, glsl_type::uint_type);
-   emit(ADD(diff, start, shader_end_time));
+   fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
+   diff.set_smear(0);
+   fs_inst *add = ADD(diff, start, shader_end_time);
+   add->force_writemask_all = true;
+   end->insert_before(add);
 
    /* If there were no instructions between the two timestamp gets, the diff
     * is 2 cycles.  Remove that overhead, so I can forget about that when
     * trying to determine the time taken for single instructions.
     */
-   emit(ADD(diff, diff, fs_reg(-2u)));
-
-   emit_shader_time_write(type, diff);
-   emit_shader_time_write(written_type, fs_reg(1u));
-   emit(BRW_OPCODE_ELSE);
-   emit_shader_time_write(reset_type, fs_reg(1u));
-   emit(BRW_OPCODE_ENDIF);
-
-   pop_force_uncompressed();
+   add = ADD(diff, diff, fs_reg(-2u));
+   add->force_writemask_all = true;
+   end->insert_before(add);
+
+   end->insert_before(SHADER_TIME_ADD(type, diff));
+   end->insert_before(SHADER_TIME_ADD(written_type, fs_reg(1u)));
+   end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ELSE, dispatch_width));
+   end->insert_before(SHADER_TIME_ADD(reset_type, fs_reg(1u)));
+   end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ENDIF, dispatch_width));
 }
 
-void
-fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
-                                   fs_reg value)
+fs_inst *
+fs_visitor::SHADER_TIME_ADD(enum shader_time_shader_type type, fs_reg value)
 {
    int shader_time_index =
       brw_get_shader_time_index(brw, shader_prog, prog, type);
@@ -774,12 +807,12 @@ fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
 
    fs_reg payload;
    if (dispatch_width == 8)
-      payload = fs_reg(this, glsl_type::uvec2_type);
+      payload = vgrf(glsl_type::uvec2_type);
    else
-      payload = fs_reg(this, glsl_type::uint_type);
+      payload = vgrf(glsl_type::uint_type);
 
-   emit(new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
-                             fs_reg(), payload, offset, value));
+   return new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
+                               fs_reg(), payload, offset, value);
 }
 
 void
@@ -793,11 +826,11 @@ fs_visitor::vfail(const char *format, va_list va)
    failed = true;
 
    msg = ralloc_vasprintf(mem_ctx, format, va);
-   msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
+   msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
 
    this->fail_msg = msg;
 
-   if (INTEL_DEBUG & DEBUG_WM) {
+   if (debug_enabled) {
       fprintf(stderr, "%s",  msg);
    }
 }
@@ -883,19 +916,6 @@ fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
    return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
 }
 
-void
-fs_visitor::push_force_uncompressed()
-{
-   force_uncompressed_stack++;
-}
-
-void
-fs_visitor::pop_force_uncompressed()
-{
-   force_uncompressed_stack--;
-   assert(force_uncompressed_stack >= 0);
-}
-
 /**
  * Returns true if the instruction has a flag that means it won't
  * update an entire destination register.
@@ -913,16 +933,20 @@ fs_inst::is_partial_write() const
 }
 
 int
-fs_inst::regs_read(fs_visitor *v, int arg) const
+fs_inst::regs_read(int arg) const
 {
    if (is_tex() && arg == 0 && src[0].file == GRF) {
       return mlen;
    } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
       return mlen;
+   } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
+      return mlen;
    } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
       return mlen;
    } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
       return mlen;
+   } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
+      return mlen;
    }
 
    switch (src[arg].file) {
@@ -954,7 +978,9 @@ fs_inst::reads_flag() const
 bool
 fs_inst::writes_flag() const
 {
-   return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
+   return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
+                               opcode != BRW_OPCODE_IF &&
+                               opcode != BRW_OPCODE_WHILE)) ||
           opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
 }
 
@@ -1009,6 +1035,7 @@ fs_visitor::implied_mrf_writes(fs_inst *inst)
       return 2;
    case SHADER_OPCODE_UNTYPED_ATOMIC:
    case SHADER_OPCODE_UNTYPED_SURFACE_READ:
+   case SHADER_OPCODE_URB_WRITE_SIMD8:
    case FS_OPCODE_INTERPOLATE_AT_CENTROID:
    case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
    case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
@@ -1019,19 +1046,20 @@ fs_visitor::implied_mrf_writes(fs_inst *inst)
    }
 }
 
-int
-fs_visitor::virtual_grf_alloc(int size)
+fs_reg
+fs_visitor::vgrf(const glsl_type *const type)
 {
-   if (virtual_grf_array_size <= virtual_grf_count) {
-      if (virtual_grf_array_size == 0)
-        virtual_grf_array_size = 16;
-      else
-        virtual_grf_array_size *= 2;
-      virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
-                                  virtual_grf_array_size);
-   }
-   virtual_grf_sizes[virtual_grf_count] = size;
-   return virtual_grf_count++;
+   int reg_width = dispatch_width / 8;
+   return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
+                 brw_type_for_base_type(type), dispatch_width);
+}
+
+fs_reg
+fs_visitor::vgrf(int num_components)
+{
+   int reg_width = dispatch_width / 8;
+   return fs_reg(GRF, alloc.allocate(num_components * reg_width),
+                 BRW_REGISTER_TYPE_F, dispatch_width);
 }
 
 /** Fixed HW reg constructor. */
@@ -1079,20 +1107,6 @@ fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
    this->width = width;
 }
 
-/** Automatic reg constructor. */
-fs_reg::fs_reg(fs_visitor *v, const struct glsl_type *type)
-{
-   init();
-   int reg_width = v->dispatch_width / 8;
-
-   this->file = GRF;
-   this->reg = v->virtual_grf_alloc(v->type_size(type) * reg_width);
-   this->reg_offset = 0;
-   this->type = brw_type_for_base_type(type);
-   this->width = v->dispatch_width;
-   assert(this->width == 8 || this->width == 16);
-}
-
 fs_reg *
 fs_visitor::variable_storage(ir_variable *var)
 {
@@ -1177,10 +1191,10 @@ fs_visitor::setup_uniform_values(ir_variable *ir)
 void
 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
 {
-   const ir_state_slot *const slots = ir->state_slots;
-   assert(ir->state_slots != NULL);
+   const ir_state_slot *const slots = ir->get_state_slots();
+   assert(slots != NULL);
 
-   for (unsigned int i = 0; i < ir->num_state_slots; i++) {
+   for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
       /* This state reference has already been setup by ir_to_mesa, but we'll
        * get the same index back here.
        */
@@ -1205,16 +1219,17 @@ fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
 }
 
 fs_reg *
-fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
+fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
+                                         bool origin_upper_left)
 {
    assert(stage == MESA_SHADER_FRAGMENT);
    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
-   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
+   fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
    fs_reg wpos = *reg;
-   bool flip = !ir->data.origin_upper_left ^ key->render_to_fbo;
+   bool flip = !origin_upper_left ^ key->render_to_fbo;
 
    /* gl_FragCoord.x */
-   if (ir->data.pixel_center_integer) {
+   if (pixel_center_integer) {
       emit(MOV(wpos, this->pixel_x));
    } else {
       emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
@@ -1222,11 +1237,11 @@ fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
    wpos = offset(wpos, 1);
 
    /* gl_FragCoord.y */
-   if (!flip && ir->data.pixel_center_integer) {
+   if (!flip && pixel_center_integer) {
       emit(MOV(wpos, this->pixel_y));
    } else {
       fs_reg pixel_y = this->pixel_y;
-      float offset = (ir->data.pixel_center_integer ? 0.0 : 0.5);
+      float offset = (pixel_center_integer ? 0.0 : 0.5);
 
       if (flip) {
         pixel_y.negate = true;
@@ -1289,35 +1304,41 @@ fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
                this->delta_y[barycoord_mode], interp);
 }
 
-fs_reg *
-fs_visitor::emit_general_interpolation(ir_variable *ir)
+void
+fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
+                                       const glsl_type *type,
+                                       glsl_interp_qualifier interpolation_mode,
+                                       int location, bool mod_centroid,
+                                       bool mod_sample)
 {
-   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
-   reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
-   fs_reg attr = *reg;
+   attr.type = brw_type_for_base_type(type->get_scalar_type());
 
    assert(stage == MESA_SHADER_FRAGMENT);
    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
 
    unsigned int array_elements;
-   const glsl_type *type;
 
-   if (ir->type->is_array()) {
-      array_elements = ir->type->length;
+   if (type->is_array()) {
+      array_elements = type->length;
       if (array_elements == 0) {
-        fail("dereferenced array '%s' has length 0\n", ir->name);
+         fail("dereferenced array '%s' has length 0\n", name);
       }
-      type = ir->type->fields.array;
+      type = type->fields.array;
    } else {
       array_elements = 1;
-      type = ir->type;
    }
 
-   glsl_interp_qualifier interpolation_mode =
-      ir->determine_interpolation_mode(key->flat_shade);
+   if (interpolation_mode == INTERP_QUALIFIER_NONE) {
+      bool is_gl_Color =
+         location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
+      if (key->flat_shade && is_gl_Color) {
+         interpolation_mode = INTERP_QUALIFIER_FLAT;
+      } else {
+         interpolation_mode = INTERP_QUALIFIER_SMOOTH;
+      }
+   }
 
-   int location = ir->data.location;
    for (unsigned int i = 0; i < array_elements; i++) {
       for (unsigned int j = 0; j < type->matrix_columns; j++) {
         if (prog_data->urb_setup[location] == -1) {
@@ -1337,7 +1358,7 @@ fs_visitor::emit_general_interpolation(ir_variable *ir)
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               struct brw_reg interp = interp_reg(location, k);
               interp = suboffset(interp, 3);
-               interp.type = reg->type;
+               interp.type = attr.type;
               emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
               attr = offset(attr, 1);
            }
@@ -1345,7 +1366,7 @@ fs_visitor::emit_general_interpolation(ir_variable *ir)
            /* Smooth/noperspective interpolation case. */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
                struct brw_reg interp = interp_reg(location, k);
-               if (brw->needs_unlit_centroid_workaround && ir->data.centroid) {
+               if (brw->needs_unlit_centroid_workaround && mod_centroid) {
                   /* Get the pixel/sample mask into f0 so that we know
                    * which pixels are lit.  Then, for each channel that is
                    * unlit, replace the centroid data with non-centroid
@@ -1362,8 +1383,8 @@ fs_visitor::emit_general_interpolation(ir_variable *ir)
                      inst->no_dd_clear = true;
 
                   inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
-                                      ir->data.centroid && !key->persample_shading,
-                                      ir->data.sample || key->persample_shading);
+                                      mod_centroid && !key->persample_shading,
+                                      mod_sample || key->persample_shading);
                   inst->predicate = BRW_PREDICATE_NORMAL;
                   inst->predicate_inverse = false;
                   if (brw->has_pln)
@@ -1371,8 +1392,8 @@ fs_visitor::emit_general_interpolation(ir_variable *ir)
 
                } else {
                   emit_linterp(attr, fs_reg(interp), interpolation_mode,
-                               ir->data.centroid && !key->persample_shading,
-                               ir->data.sample || key->persample_shading);
+                               mod_centroid && !key->persample_shading,
+                               mod_sample || key->persample_shading);
                }
                if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
                   emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
@@ -1384,14 +1405,12 @@ fs_visitor::emit_general_interpolation(ir_variable *ir)
         location++;
       }
    }
-
-   return reg;
 }
 
 fs_reg *
 fs_visitor::emit_frontfacing_interpolation()
 {
-   fs_reg *reg = new(this->mem_ctx) fs_reg(this, glsl_type::bool_type);
+   fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
 
    if (brw->gen >= 6) {
       /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
@@ -1418,15 +1437,12 @@ fs_visitor::emit_frontfacing_interpolation()
        * instruction only operates on UD (or D with an abs source modifier)
        * sources without negation.
        *
-       * Instead, use ASR (which will give ~0/true or 0/false) followed by an
-       * AND 1.
+       * Instead, use ASR (which will give ~0/true or 0/false).
        */
-      fs_reg asr = fs_reg(this, glsl_type::bool_type);
       fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
       g1_6.negate = true;
 
-      emit(ASR(asr, g1_6, fs_reg(31)));
-      emit(AND(*reg, asr, fs_reg(1)));
+      emit(ASR(*reg, g1_6, fs_reg(31)));
    }
 
    return reg;
@@ -1461,10 +1477,10 @@ fs_visitor::emit_samplepos_setup()
    assert(brw->gen >= 6);
 
    this->current_annotation = "compute sample position";
-   fs_reg *reg = new(this->mem_ctx) fs_reg(this, glsl_type::vec2_type);
+   fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
    fs_reg pos = *reg;
-   fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
-   fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);
+   fs_reg int_sample_x = vgrf(glsl_type::int_type);
+   fs_reg int_sample_y = vgrf(glsl_type::int_type);
 
    /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
     * mode will be enabled.
@@ -1505,18 +1521,18 @@ fs_visitor::emit_samplepos_setup()
 }
 
 fs_reg *
-fs_visitor::emit_sampleid_setup(ir_variable *ir)
+fs_visitor::emit_sampleid_setup()
 {
    assert(stage == MESA_SHADER_FRAGMENT);
    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
    assert(brw->gen >= 6);
 
    this->current_annotation = "compute sample id";
-   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
+   fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
 
    if (key->compute_sample_id) {
-      fs_reg t1 = fs_reg(this, glsl_type::int_type);
-      fs_reg t2 = fs_reg(this, glsl_type::int_type);
+      fs_reg t1 = vgrf(glsl_type::int_type);
+      fs_reg t2 = vgrf(glsl_type::int_type);
       t2.type = BRW_REGISTER_TYPE_UW;
 
       /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
@@ -1563,6 +1579,17 @@ fs_visitor::emit_sampleid_setup(ir_variable *ir)
    return reg;
 }
 
+/* If *src has abs/negate set, MOV it to a fresh temporary and use that. */
+void
+fs_visitor::resolve_source_modifiers(fs_reg *src)
+{
+   if (src->abs || src->negate) {
+      fs_reg resolved = retype(vgrf(1), src->type);
+      emit(MOV(resolved, *src));
+      *src = resolved;
+   }
+}
+
 fs_reg
 fs_visitor::fix_math_operand(fs_reg src)
 {
@@ -1584,7 +1611,7 @@ fs_visitor::fix_math_operand(fs_reg src)
    if (brw->gen >= 7 && src.file != IMM)
       return src;
 
-   fs_reg expanded = fs_reg(this, glsl_type::float_type);
+   fs_reg expanded = vgrf(glsl_type::float_type);
    expanded.type = src.type;
    emit(BRW_OPCODE_MOV, expanded, src);
    return expanded;
@@ -1663,6 +1690,21 @@ fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
    return inst;
 }
 
+void
+fs_visitor::emit_discard_jump()
+{
+   /* Once every relevant channel has been discarded there is nothing left
+    * to do, so for performance skip straight to the end of the shader.
+    */
+   fs_inst *jump = emit(FS_OPCODE_DISCARD_JUMP);
+   const bool simd8 = dispatch_width == 8;
+
+   jump->flag_subreg = 1;
+   jump->predicate_inverse = true;
+   jump->predicate =
+      simd8 ? BRW_PREDICATE_ALIGN1_ANY8H : BRW_PREDICATE_ALIGN1_ANY16H;
+}
+
 void
 fs_visitor::assign_curb_setup()
 {
@@ -1822,6 +1864,61 @@ fs_visitor::assign_urb_setup()
       urb_start + prog_data->num_varying_inputs * 2;
 }
 
+void
+fs_visitor::assign_vs_urb_setup() /* Size the VS URB read and rewrite ATTR-file sources to fixed hardware GRFs. */
+{
+   brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
+   int grf, count, slot, channel, attr;
+
+   assert(stage == MESA_SHADER_VERTEX);
+   count = _mesa_bitcount_64(vs_prog_data->inputs_read); /* one slot per bit set in inputs_read */
+   if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid) /* plus one shared slot if either ID is used */
+      count++;
+
+   /* Each attribute is 4 regs; they follow the payload and push-constant (curb) regs. */
+   this->first_non_payload_grf =
+      payload.num_regs + prog_data->curb_read_length + count * 4;
+
+   unsigned vue_entries =
+      MAX2(count, vs_prog_data->base.vue_map.num_slots);
+
+   vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4; /* NOTE(review): ALIGN presumably rounds up to a multiple of 4 - confirm */
+   vs_prog_data->base.urb_read_length = (count + 1) / 2; /* count / 2, rounded up */
+
+   assert(vs_prog_data->base.urb_read_length <= 15);
+
+   /* Rewrite all ATTR file references to the hw grf that they land in. */
+   foreach_block_and_inst(block, fs_inst, inst, cfg) {
+      for (int i = 0; i < inst->sources; i++) {
+         if (inst->src[i].file == ATTR) {
+
+            if (inst->src[i].reg == VERT_ATTRIB_MAX) { /* NOTE(review): presumably the vertex/instance ID pseudo-attribute - confirm */
+               slot = count - 1; /* it takes the last slot */
+            } else {
+               /* Attributes come in in a contiguous block, ordered by their
+                * gl_vert_attrib value.  That means we can compute the slot
+                * number for an attribute by masking out the enabled
+                * attributes before it and counting the bits.
+                */
+               attr = inst->src[i].reg + inst->src[i].reg_offset / 4; /* every 4 units of reg_offset advance one attribute */
+               slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
+                                        BITFIELD64_MASK(attr));
+            }
+
+            channel = inst->src[i].reg_offset & 3; /* low two bits pick one of the attribute's 4 regs */
+
+            grf = payload.num_regs +
+               prog_data->curb_read_length +
+               slot * 4 + channel;
+
+            inst->src[i].file = HW_REG; /* the source is no longer an ATTR reference */
+            inst->src[i].fixed_hw_reg =
+               retype(brw_vec8_grf(grf, 0), inst->src[i].type); /* keeps the source's original type */
+         }
+      }
+   }
+}
+
 /**
  * Split large virtual GRFs into separate components if we can.
  *
@@ -1843,14 +1940,14 @@ fs_visitor::assign_urb_setup()
 void
 fs_visitor::split_virtual_grfs()
 {
-   int num_vars = this->virtual_grf_count;
+   int num_vars = this->alloc.count;
 
    /* Count the total number of registers */
    int reg_count = 0;
    int vgrf_to_reg[num_vars];
    for (int i = 0; i < num_vars; i++) {
       vgrf_to_reg[i] = reg_count;
-      reg_count += virtual_grf_sizes[i];
+      reg_count += alloc.sizes[i];
    }
 
    /* An array of "split points".  For each register slot, this indicates
@@ -1866,14 +1963,14 @@ fs_visitor::split_virtual_grfs()
    foreach_block_and_inst(block, fs_inst, inst, cfg) {
       if (inst->dst.file == GRF) {
          int reg = vgrf_to_reg[inst->dst.reg];
-         for (int j = 1; j < this->virtual_grf_sizes[inst->dst.reg]; j++)
+         for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
             split_points[reg + j] = true;
       }
 
       for (int i = 0; i < inst->sources; i++) {
          if (inst->src[i].file == GRF) {
             int reg = vgrf_to_reg[inst->src[i].reg];
-            for (int j = 1; j < this->virtual_grf_sizes[inst->src[i].reg]; j++)
+            for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
                split_points[reg + j] = true;
          }
       }
@@ -1899,7 +1996,7 @@ fs_visitor::split_virtual_grfs()
       for (int i = 0; i < inst->sources; i++) {
          if (inst->src[i].file == GRF) {
             int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
-            for (int j = 1; j < inst->regs_read(this, i); j++)
+            for (int j = 1; j < inst->regs_read(i); j++)
                split_points[reg + j] = false;
          }
       }
@@ -1919,12 +2016,13 @@ fs_visitor::split_virtual_grfs()
       int offset = 1;
 
       /* j > 0 case */
-      for (int j = 1; j < virtual_grf_sizes[i]; j++) {
+      for (unsigned j = 1; j < alloc.sizes[i]; j++) {
          /* If this is a split point, reset the offset to 0 and allocate a
           * new virtual GRF for the previous offset many registers
           */
          if (split_points[reg]) {
-            int grf = virtual_grf_alloc(offset);
+            assert(offset <= MAX_VGRF_SIZE);
+            int grf = alloc.allocate(offset);
             for (int k = reg - offset; k < reg; k++)
                new_virtual_grf[k] = grf;
             offset = 0;
@@ -1935,7 +2033,8 @@ fs_visitor::split_virtual_grfs()
       }
 
       /* The last one gets the original register number */
-      virtual_grf_sizes[i] = offset;
+      assert(offset <= MAX_VGRF_SIZE);
+      alloc.sizes[i] = offset;
       for (int k = reg - offset; k < reg; k++)
          new_virtual_grf[k] = i;
    }
@@ -1946,14 +2045,14 @@ fs_visitor::split_virtual_grfs()
          reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
          inst->dst.reg = new_virtual_grf[reg];
          inst->dst.reg_offset = new_reg_offset[reg];
-         assert(new_reg_offset[reg] < virtual_grf_sizes[new_virtual_grf[reg]]);
+         assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
       }
       for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == GRF) {
             reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
             inst->src[i].reg = new_virtual_grf[reg];
             inst->src[i].reg_offset = new_reg_offset[reg];
-            assert(new_reg_offset[reg] < virtual_grf_sizes[new_virtual_grf[reg]]);
+            assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
          }
       }
    }
@@ -1973,7 +2072,7 @@ bool
 fs_visitor::compact_virtual_grfs()
 {
    bool progress = false;
-   int remap_table[this->virtual_grf_count];
+   int remap_table[this->alloc.count];
    memset(remap_table, -1, sizeof(remap_table));
 
    /* Mark which virtual GRFs are used. */
@@ -1989,7 +2088,7 @@ fs_visitor::compact_virtual_grfs()
 
    /* Compact the GRF arrays. */
    int new_index = 0;
-   for (int i = 0; i < this->virtual_grf_count; i++) {
+   for (unsigned i = 0; i < this->alloc.count; i++) {
       if (remap_table[i] == -1) {
          /* We just found an unused register.  This means that we are
           * actually going to compact something.
@@ -1997,13 +2096,13 @@ fs_visitor::compact_virtual_grfs()
          progress = true;
       } else {
          remap_table[i] = new_index;
-         virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
+         alloc.sizes[new_index] = alloc.sizes[i];
          invalidate_live_intervals();
          ++new_index;
       }
    }
 
-   this->virtual_grf_count = new_index;
+   this->alloc.count = new_index;
 
    /* Patch all the instructions to use the newly renumbered registers */
    foreach_block_and_inst(block, fs_inst, inst, cfg) {
@@ -2194,8 +2293,13 @@ fs_visitor::demote_pull_constants()
         if (inst->src[i].file != UNIFORM)
            continue;
 
-         int pull_index = pull_constant_loc[inst->src[i].reg +
-                                            inst->src[i].reg_offset];
+         int pull_index;
+         unsigned location = inst->src[i].reg + inst->src[i].reg_offset;
+         if (location >= uniforms) /* Out of bounds access */
+            pull_index = -1;
+         else
+            pull_index = pull_constant_loc[location];
+
          if (pull_index == -1)
            continue;
 
@@ -2204,7 +2308,7 @@ fs_visitor::demote_pull_constants()
          current_annotation = inst->annotation;
 
          fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
-         fs_reg dst = fs_reg(this, glsl_type::float_type);
+         fs_reg dst = vgrf(glsl_type::float_type);
 
          /* Generate a pull load into dst. */
          if (inst->src[i].reladdr) {
@@ -2240,6 +2344,22 @@ fs_visitor::opt_algebraic()
 
    foreach_block_and_inst(block, fs_inst, inst, cfg) {
       switch (inst->opcode) {
+      case BRW_OPCODE_MOV:
+         if (inst->src[0].file != IMM)
+            break;
+
+         if (inst->saturate) {
+            if (inst->dst.type != inst->src[0].type)
+               assert(!"unimplemented: saturate mixed types");
+
+            if (brw_saturate_immediate(inst->dst.type,
+                                       &inst->src[0].fixed_hw_reg)) {
+               inst->saturate = false;
+               progress = true;
+            }
+         }
+         break;
+
       case BRW_OPCODE_MUL:
         if (inst->src[1].file != IMM)
            continue;
@@ -2252,6 +2372,15 @@ fs_visitor::opt_algebraic()
            break;
         }
 
+         /* a * -1.0 = -a */
+         if (inst->src[1].is_negative_one()) {
+            inst->opcode = BRW_OPCODE_MOV;
+            inst->src[0].negate = !inst->src[0].negate;
+            inst->src[1] = reg_undef;
+            progress = true;
+            break;
+         }
+
          /* a * 0.0 = 0.0 */
          if (inst->src[1].is_zero()) {
             inst->opcode = BRW_OPCODE_MOV;
@@ -2261,6 +2390,14 @@ fs_visitor::opt_algebraic()
             break;
          }
 
+         if (inst->src[0].file == IMM) {
+            assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
+            inst->opcode = BRW_OPCODE_MOV;
+            inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
+            inst->src[1] = reg_undef;
+            progress = true;
+            break;
+         }
         break;
       case BRW_OPCODE_ADD:
          if (inst->src[1].file != IMM)
@@ -2273,6 +2410,15 @@ fs_visitor::opt_algebraic()
             progress = true;
             break;
          }
+
+         if (inst->src[0].file == IMM) {
+            assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
+            inst->opcode = BRW_OPCODE_MOV;
+            inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
+            inst->src[1] = reg_undef;
+            progress = true;
+            break;
+         }
          break;
       case BRW_OPCODE_OR:
          if (inst->src[0].equals(inst->src[1])) {
@@ -2292,6 +2438,18 @@ fs_visitor::opt_algebraic()
             break;
          }
          break;
+      case BRW_OPCODE_CMP:
+         if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
+             inst->src[0].abs &&
+             inst->src[0].negate &&
+             inst->src[1].is_zero()) {
+            inst->src[0].abs = false;
+            inst->src[0].negate = false;
+            inst->conditional_mod = BRW_CONDITIONAL_Z;
+            progress = true;
+            break;
+         }
+         break;
       case BRW_OPCODE_SEL:
          if (inst->src[0].equals(inst->src[1])) {
             inst->opcode = BRW_OPCODE_MOV;
@@ -2308,6 +2466,7 @@ fs_visitor::opt_algebraic()
                   if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
                      inst->opcode = BRW_OPCODE_MOV;
                      inst->src[1] = reg_undef;
+                     inst->conditional_mod = BRW_CONDITIONAL_NONE;
                      progress = true;
                   }
                   break;
@@ -2334,11 +2493,57 @@ fs_visitor::opt_algebraic()
             }
          }
          break;
+      case BRW_OPCODE_MAD:
+         if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
+            inst->opcode = BRW_OPCODE_MOV;
+            inst->src[1] = reg_undef;
+            inst->src[2] = reg_undef;
+            progress = true;
+         } else if (inst->src[0].is_zero()) {
+            inst->opcode = BRW_OPCODE_MUL;
+            inst->src[0] = inst->src[2];
+            inst->src[2] = reg_undef;
+            progress = true;
+         } else if (inst->src[1].is_one()) {
+            inst->opcode = BRW_OPCODE_ADD;
+            inst->src[1] = inst->src[2];
+            inst->src[2] = reg_undef;
+            progress = true;
+         } else if (inst->src[2].is_one()) {
+            inst->opcode = BRW_OPCODE_ADD;
+            inst->src[2] = reg_undef;
+            progress = true;
+         } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
+            inst->opcode = BRW_OPCODE_ADD;
+            inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
+            inst->src[2] = reg_undef;
+            progress = true;
+         }
+         break;
+      case SHADER_OPCODE_RCP: {
+         fs_inst *prev = (fs_inst *)inst->prev;
+         if (prev->opcode == SHADER_OPCODE_SQRT) {
+            if (inst->src[0].equals(prev->dst)) {
+               inst->opcode = SHADER_OPCODE_RSQ;
+               inst->src[0] = prev->src[0];
+               progress = true;
+            }
+         }
+         break;
+      }
       default:
         break;
       }
-   }
 
+      /* Swap if src[0] is immediate. */
+      if (progress && inst->is_commutative()) {
+         if (inst->src[0].file == IMM) {
+            fs_reg tmp = inst->src[1];
+            inst->src[1] = inst->src[0];
+            inst->src[0] = tmp;
+         }
+      }
+   }
    return progress;
 }
 
@@ -2348,8 +2553,8 @@ fs_visitor::opt_register_renaming()
    bool progress = false;
    int depth = 0;
 
-   int remap[virtual_grf_count];
-   memset(remap, -1, sizeof(int) * virtual_grf_count);
+   int remap[alloc.count];
+   memset(remap, -1, sizeof(int) * alloc.count);
 
    foreach_block_and_inst(block, fs_inst, inst, cfg) {
       if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
@@ -2373,12 +2578,12 @@ fs_visitor::opt_register_renaming()
 
       if (depth == 0 &&
           inst->dst.file == GRF &&
-          virtual_grf_sizes[inst->dst.reg] == inst->dst.width / 8 &&
+          alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
           !inst->is_partial_write()) {
          if (remap[dst] == -1) {
             remap[dst] = dst;
          } else {
-            remap[dst] = virtual_grf_alloc(inst->dst.width / 8);
+            remap[dst] = alloc.allocate(inst->dst.width / 8);
             inst->dst.reg = remap[dst];
             progress = true;
          }
@@ -2408,12 +2613,57 @@ fs_visitor::opt_register_renaming()
    return progress;
 }
 
+/**
+ * Remove redundant or useless discard jumps; returns true if any was removed.
+ *
+ * For example, we can eliminate jumps in the following sequence:
+ *
+ * discard-jump       (redundant with the next jump)
+ * discard-jump       (useless; jumps to the next instruction)
+ * placeholder-halt
+ */
+bool
+fs_visitor::opt_redundant_discard_jumps()
+{
+   bool progress = false;
+
+   bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1]; /* only the final block of the CFG is examined */
+
+   fs_inst *placeholder_halt = NULL;
+   foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) { /* walk backwards to find the placeholder halt */
+      if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
+         placeholder_halt = inst;
+         break;
+      }
+   }
+
+   if (!placeholder_halt) /* no placeholder halt in the last block: nothing to remove */
+      return false;
+
+   /* Delete any discard jumps (HALTs) immediately before the placeholder halt. */
+   for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
+        !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
+        prev = (fs_inst *) placeholder_halt->prev) { /* re-read ->prev: the old predecessor was just removed */
+      prev->remove(last_bblock);
+      progress = true;
+   }
+
+   if (progress) /* live intervals are invalidated only if an instruction was removed */
+      invalidate_live_intervals();
+
+   return progress;
+}
+
 bool
 fs_visitor::compute_to_mrf()
 {
    bool progress = false;
    int next_ip = 0;
 
+   /* No MRFs on Gen >= 7. */
+   if (brw->gen >= 7)
+      return false;
+
    calculate_live_intervals();
 
    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
@@ -2588,6 +2838,7 @@ fs_visitor::emit_repclear_shader()
       write->header_present = false;
       write->mlen = 1;
    } else {
+      assume(key->nr_color_regions > 0);
       for (int i = 0; i < key->nr_color_regions; ++i) {
          write = emit(FS_OPCODE_REP_FB_WRITE);
          write->saturate = key->clamp_fragment_color;
@@ -2656,7 +2907,7 @@ fs_visitor::remove_duplicate_mrf_writes()
 
       /* Clear out any MRF move records whose sources got overwritten. */
       if (inst->dst.file == GRF) {
-        for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
+        for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
            if (last_mrf_move[i] &&
                last_mrf_move[i]->src[0].reg == inst->dst.reg) {
               last_mrf_move[i] = NULL;
@@ -2679,8 +2930,7 @@ fs_visitor::remove_duplicate_mrf_writes()
 }
 
 static void
-clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
-                        int first_grf, int grf_len)
+clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
 {
    /* Clear the flag for registers that actually got read (as expected). */
    for (int i = 0; i < inst->sources; i++) {
@@ -2731,8 +2981,7 @@ fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
    memset(needs_dep, false, sizeof(needs_dep));
    memset(needs_dep, true, write_len);
 
-   clear_deps_for_inst_src(inst, dispatch_width,
-                           needs_dep, first_write_grf, write_len);
+   clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
 
    /* Walk backwards looking for writes to registers we're writing which
     * aren't read since being written.  If we hit the start of the program,
@@ -2772,8 +3021,7 @@ fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
       }
 
       /* Clear the flag for registers that actually got read (as expected). */
-      clear_deps_for_inst_src(scan_inst, dispatch_width,
-                              needs_dep, first_write_grf, write_len);
+      clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
 
       /* Continue the loop only if we haven't resolved all the dependencies */
       int i;
@@ -2818,8 +3066,7 @@ fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_ins
       }
 
       /* Clear the flag for registers that actually got read (as expected). */
-      clear_deps_for_inst_src(scan_inst, dispatch_width,
-                              needs_dep, first_write_grf, write_len);
+      clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
 
       /* We insert our reads as late as possible since they're reading the
        * result of a SEND, which has massive latency.
@@ -2841,16 +3088,6 @@ fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_ins
       if (i == write_len)
          return;
    }
-
-   /* If we hit the end of the program, resolve all remaining dependencies out
-    * of paranoia.
-    */
-   fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
-   assert(last_inst->eot);
-   for (int i = 0; i < write_len; i++) {
-      if (needs_dep[i])
-         last_inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
-   }
 }
 
 void
@@ -2908,7 +3145,15 @@ fs_visitor::lower_uniform_pull_constant_loads()
          assert(const_offset_reg.file == IMM &&
                 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
          const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
-         fs_reg payload = fs_reg(this, glsl_type::uint_type);
+         fs_reg payload = fs_reg(GRF, alloc.allocate(1));
+
+         /* We have to use a message header on Skylake to get SIMD4x2 mode.
+          * Reserve space for the register.
+          */
+         if (brw->gen >= 9) {
+            payload.reg_offset++;
+            alloc.sizes[payload.reg] = 2;
+         }
 
          /* This is actually going to be a MOV, but since only the first dword
           * is accessed, we have a special opcode to do just that one.  Note
@@ -2948,11 +3193,11 @@ fs_visitor::lower_load_payload()
 {
    bool progress = false;
 
-   int vgrf_to_reg[virtual_grf_count];
-   int reg_count = 16; /* Leave room for MRF */
-   for (int i = 0; i < virtual_grf_count; ++i) {
+   int vgrf_to_reg[alloc.count];
+   int reg_count = 0;
+   for (unsigned i = 0; i < alloc.count; ++i) {
       vgrf_to_reg[i] = reg_count;
-      reg_count += virtual_grf_sizes[i];
+      reg_count += alloc.sizes[i];
    }
 
    struct {
@@ -2963,17 +3208,13 @@ fs_visitor::lower_load_payload()
    memset(metadata, 0, sizeof(metadata));
 
    foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
-      int dst_reg;
-      if (inst->dst.file == MRF) {
-         dst_reg = inst->dst.reg;
-      } else if (inst->dst.file == GRF) {
-         dst_reg = vgrf_to_reg[inst->dst.reg];
-      }
-
-      if (inst->dst.file == MRF || inst->dst.file == GRF) {
-         bool force_sechalf = inst->force_sechalf;
+      if (inst->dst.file == GRF) {
+         const int dst_reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
+         bool force_sechalf = inst->force_sechalf &&
+                              !inst->force_writemask_all;
          bool toggle_sechalf = inst->dst.width == 16 &&
-                               type_sz(inst->dst.type) == 4;
+                               type_sz(inst->dst.type) == 4 &&
+                               !inst->force_writemask_all;
          for (int i = 0; i < inst->regs_written; ++i) {
             metadata[dst_reg + i].written = true;
             metadata[dst_reg + i].force_sechalf = force_sechalf;
@@ -3014,22 +3255,28 @@ fs_visitor::lower_load_payload()
                                 inst->src[i].reg_offset;
                   mov->force_sechalf = metadata[src_reg].force_sechalf;
                   mov->force_writemask_all = metadata[src_reg].force_writemask_all;
-                  metadata[dst_reg] = metadata[src_reg];
-                  if (dst.width * type_sz(dst.type) > 32) {
-                     assert((!metadata[src_reg].written ||
-                             !metadata[src_reg].force_sechalf) &&
-                            (!metadata[src_reg + 1].written ||
-                             metadata[src_reg + 1].force_sechalf));
-                     metadata[dst_reg + 1] = metadata[src_reg + 1];
-                  }
                } else {
-                  metadata[dst_reg].force_writemask_all = false;
-                  metadata[dst_reg].force_sechalf = false;
-                  if (dst.width == 16) {
-                     metadata[dst_reg + 1].force_writemask_all = false;
-                     metadata[dst_reg + 1].force_sechalf = true;
+                  /* We don't have any useful metadata for immediates or
+                   * uniforms.  Assume that any of the channels of the
+                   * destination may be used.
+                   */
+                  assert(inst->src[i].file == IMM ||
+                         inst->src[i].file == UNIFORM);
+                  mov->force_writemask_all = true;
+               }
+
+               if (dst.file == GRF) {
+                  const int dst_reg = vgrf_to_reg[dst.reg] + dst.reg_offset;
+                  const bool force_writemask = mov->force_writemask_all;
+                  metadata[dst_reg].force_writemask_all = force_writemask;
+                  metadata[dst_reg].force_sechalf = mov->force_sechalf;
+                  if (dst.width * type_sz(dst.type) > 32) {
+                     assert(!mov->force_sechalf);
+                     metadata[dst_reg + 1].force_writemask_all = force_writemask;
+                     metadata[dst_reg + 1].force_sechalf = !force_writemask;
                   }
                }
+
                inst->insert_before(block, mov);
             }
 
@@ -3056,7 +3303,6 @@ fs_visitor::dump_instructions()
 void
 fs_visitor::dump_instructions(const char *name)
 {
-   calculate_register_pressure();
    FILE *file = stderr;
    if (name && geteuid() != 0) {
       file = fopen(name, "w");
@@ -3064,14 +3310,23 @@ fs_visitor::dump_instructions(const char *name)
          file = stderr;
    }
 
-   int ip = 0, max_pressure = 0;
-   foreach_block_and_inst(block, backend_instruction, inst, cfg) {
-      max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
-      fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
-      dump_instruction(inst, file);
-      ++ip;
+   if (cfg) {
+      calculate_register_pressure();
+      int ip = 0, max_pressure = 0;
+      foreach_block_and_inst(block, backend_instruction, inst, cfg) {
+         max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
+         fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
+         dump_instruction(inst, file);
+         ip++;
+      }
+      fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
+   } else {
+      int ip = 0;
+      foreach_in_list(backend_instruction, inst, &instructions) {
+         fprintf(file, "%4d: ", ip++);
+         dump_instruction(inst, file);
+      }
    }
-   fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
 
    if (file != stderr) {
       fclose(file);
@@ -3115,7 +3370,7 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
       fprintf(file, "vgrf%d", inst->dst.reg);
       if (inst->dst.width != dispatch_width)
          fprintf(file, "@%d", inst->dst.width);
-      if (virtual_grf_sizes[inst->dst.reg] != inst->dst.width / 8 ||
+      if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
           inst->dst.subreg_offset)
          fprintf(file, "+%d.%d",
                  inst->dst.reg_offset, inst->dst.subreg_offset);
@@ -3129,6 +3384,9 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
    case UNIFORM:
       fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
       break;
+   case ATTR:
+      fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
+      break;
    case HW_REG:
       if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
          switch (inst->dst.fixed_hw_reg.nr) {
@@ -3172,7 +3430,7 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
          fprintf(file, "vgrf%d", inst->src[i].reg);
          if (inst->src[i].width != dispatch_width)
             fprintf(file, "@%d", inst->src[i].width);
-         if (virtual_grf_sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
+         if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
              inst->src[i].subreg_offset)
             fprintf(file, "+%d.%d", inst->src[i].reg_offset,
                     inst->src[i].subreg_offset);
@@ -3180,6 +3438,9 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
       case MRF:
          fprintf(file, "***m%d***", inst->src[i].reg);
          break;
+      case ATTR:
+         fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
+         break;
       case UNIFORM:
          fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
          if (inst->src[i].reladdr) {
@@ -3197,12 +3458,21 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
          case BRW_REGISTER_TYPE_F:
             fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
             break;
+         case BRW_REGISTER_TYPE_W:
          case BRW_REGISTER_TYPE_D:
             fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
             break;
+         case BRW_REGISTER_TYPE_UW:
          case BRW_REGISTER_TYPE_UD:
             fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
             break;
+         case BRW_REGISTER_TYPE_VF:
+            fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
+                    brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >>  0) & 0xff),
+                    brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >>  8) & 0xff),
+                    brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
+                    brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
+            break;
          default:
             fprintf(file, "???");
             break;
@@ -3377,6 +3647,13 @@ fs_visitor::setup_payload_gen6()
    }
 }
 
+void
+fs_visitor::setup_vs_payload() /* Set the vertex shader payload size to its fixed two registers. */
+{
+   /* R0: thread header, R1: urb handles */
+   payload.num_regs = 2; /* counts exactly the two registers listed above */
+}
+
 void
 fs_visitor::assign_binding_table_offsets()
 {
@@ -3400,50 +3677,248 @@ fs_visitor::calculate_register_pressure()
    invalidate_live_intervals();
    calculate_live_intervals();
 
-   unsigned num_instructions = instructions.length();
+   unsigned num_instructions = 0;
+   foreach_block(block, cfg)
+      num_instructions += block->instructions.length();
 
    regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
 
-   for (int reg = 0; reg < virtual_grf_count; reg++) {
+   for (unsigned reg = 0; reg < alloc.count; reg++) {
       for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
-         regs_live_at_ip[ip] += virtual_grf_sizes[reg];
+         regs_live_at_ip[ip] += alloc.sizes[reg];
+   }
+}
+
+/**
+ * Run the optimization pipeline over the instruction stream.
+ *
+ * Before the loop, virtual GRFs are split and the uniform / pull-constant
+ * setup calls run once.  The main pass list is then repeated until a full
+ * iteration reports no progress.  Afterwards load_payload lowering (with its
+ * follow-up cleanup passes when it made progress), constant combining and
+ * uniform pull-constant load lowering each run once.
+ *
+ * When INTEL_DEBUG has DEBUG_OPTIMIZER set, the instructions are dumped to a
+ * file before the first pass and again after every pass that made progress.
+ */
+void
+fs_visitor::optimize()
+{
+   /* Prefix used in the optimizer dump file names below. */
+   const char *stage_name = stage == MESA_SHADER_VERTEX ? "vs" : "fs";
+
+   split_virtual_grfs();
+
+   move_uniform_array_access_to_pull_constants();
+   assign_constant_locations();
+   demote_pull_constants();
+
+   /* OPT(pass, args...) runs a single pass and evaluates to that pass's
+    * return value (it is a statement expression), so it can be used as a
+    * condition.  It also increments pass_num, ORs the result into
+    * `progress`, and dumps the instructions when DEBUG_OPTIMIZER is set and
+    * the pass made progress.  The macro refers to `progress`, `iteration`
+    * and `pass_num`, which are declared further down and must be in scope
+    * wherever it is expanded.
+    */
+#define OPT(pass, args...) ({                                           \
+      pass_num++;                                                       \
+      bool this_progress = pass(args);                                  \
+                                                                        \
+      if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) {   \
+         char filename[64];                                             \
+         snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass,              \
+                  stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
+                                                                        \
+         backend_visitor::dump_instructions(filename);                  \
+      }                                                                 \
+                                                                        \
+      progress = progress || this_progress;                             \
+      this_progress;                                                    \
+   })
+
+   /* Dump the unoptimized program as the "00-start" file. */
+   if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
+      char filename[64];
+      snprintf(filename, 64, "%s%d-%04d-00-start",
+               stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0);
+
+      backend_visitor::dump_instructions(filename);
    }
+
+   bool progress;
+   int iteration = 0;
+   int pass_num = 0;
+   /* Repeat the whole pass list until no pass in an iteration reports
+    * progress.  pass_num restarts at zero on every iteration.
+    */
+   do {
+      progress = false;
+      pass_num = 0;
+      iteration++;
+
+      OPT(remove_duplicate_mrf_writes);
+
+      OPT(opt_algebraic);
+      OPT(opt_cse);
+      OPT(opt_copy_propagate);
+      OPT(opt_peephole_predicated_break);
+      OPT(opt_cmod_propagation);
+      OPT(dead_code_eliminate);
+      OPT(opt_peephole_sel);
+      OPT(dead_control_flow_eliminate, this);
+      OPT(opt_register_renaming);
+      OPT(opt_redundant_discard_jumps);
+      OPT(opt_saturate_propagation);
+      OPT(register_coalesce);
+      OPT(compute_to_mrf);
+
+      OPT(compact_virtual_grfs);
+   } while (progress);
+
+   /* The passes below are numbered from 1 again; `iteration` keeps the
+    * value it had in the last loop iteration.
+    */
+   pass_num = 0;
+
+   /* The cleanup passes only run when lower_load_payload made progress. */
+   if (OPT(lower_load_payload)) {
+      split_virtual_grfs();
+      OPT(register_coalesce);
+      OPT(compute_to_mrf);
+      OPT(dead_code_eliminate);
+   }
+
+   OPT(opt_combine_constants);
+
+   lower_uniform_pull_constant_loads();
 }
 
 /**
- * Look for repeated FS_OPCODE_MOV_DISPATCH_TO_FLAGS and drop the later ones.
- *
- * The needs_unlit_centroid_workaround ends up producing one of these per
- * channel of centroid input, so it's good to clean them up.
- *
- * An assumption here is that nothing ever modifies the dispatched pixels
- * value that FS_OPCODE_MOV_DISPATCH_TO_FLAGS reads from, but the hardware
- * dictates that anyway.
+ * Three-source instructions must have a GRF/MRF destination register.
+ * ARF NULL is not allowed.  Fix that up by allocating a temporary GRF; the
+ * replacement destination keeps the type of the original null destination.
  */
 void
-fs_visitor::opt_drop_redundant_mov_to_flags()
+fs_visitor::fixup_3src_null_dest()
 {
-   bool flag_mov_found[2] = {false};
+   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
+      if (inst->is_3src() && inst->dst.is_null()) {
+         inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
+                            inst->dst.type);
+      }
+   }
+}
 
-   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
-      if (inst->is_control_flow()) {
-         memset(flag_mov_found, 0, sizeof(flag_mov_found));
-      } else if (inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
-         if (!flag_mov_found[inst->flag_subreg])
-            flag_mov_found[inst->flag_subreg] = true;
-         else
-            inst->remove(block);
-      } else if (inst->writes_flag()) {
-         flag_mov_found[inst->flag_subreg] = false;
+/**
+ * Schedule the instructions and assign registers, spilling if necessary.
+ *
+ * Each pre-allocation scheduling mode is tried in turn with
+ * assign_regs(false).  If none of them allocates, a dispatch width of 16
+ * reports a failure via fail(), any other width emits a perf_debug message,
+ * and assign_regs(true) is then called repeatedly until it returns true or
+ * `failed` is set.
+ *
+ * The gen4 send dependency workarounds are inserted afterwards regardless of
+ * the outcome.  If compilation has not failed, a post-allocation scheduling
+ * pass runs when the allocation was not spill-free, and
+ * prog_data->total_scratch is set when last_scratch is non-zero.
+ *
+ * NOTE(review): the boolean passed to assign_regs() presumably controls
+ * whether spilling is allowed -- confirm against its definition.
+ */
+void
+fs_visitor::allocate_registers()
+{
+   bool allocated_without_spills;
+
+   static const enum instruction_scheduler_mode pre_modes[] = {
+      SCHEDULE_PRE,
+      SCHEDULE_PRE_NON_LIFO,
+      SCHEDULE_PRE_LIFO,
+   };
+
+   /* Try each scheduling heuristic to see if it can successfully register
+    * allocate without spilling.  They should be ordered by decreasing
+    * performance but increasing likelihood of allocating.
+    */
+   for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
+      schedule_instructions(pre_modes[i]);
+
+      /* The trivial allocator branch is compiled out by the constant 0
+       * condition; flip it by hand to use assign_regs_trivial() instead.
+       */
+      if (0) {
+         assign_regs_trivial();
+         allocated_without_spills = true;
+      } else {
+         allocated_without_spills = assign_regs(false);
+      }
+      if (allocated_without_spills)
+         break;
+   }
+
+   if (!allocated_without_spills) {
+      const char *stage_name = stage == MESA_SHADER_VERTEX ?
+         "Vertex" : "Fragment";
+
+      /* We assume that any spilling is worse than just dropping back to
+       * SIMD8.  There's probably actually some intermediate point where
+       * SIMD16 with a couple of spills is still better.
+       */
+      if (dispatch_width == 16) {
+         fail("Failure to register allocate.  Reduce number of "
+              "live scalar values to avoid this.");
+      } else {
+         perf_debug("%s shader triggered register spilling.  "
+                    "Try reducing the number of live scalar values to "
+                    "improve performance.\n", stage_name);
+      }
+
+      /* Since we're out of heuristics, just go spill registers until we
+       * get an allocation.
+       */
+      while (!assign_regs(true)) {
+         if (failed)
+            break;
       }
    }
+
+   /* This must come after all optimization and register allocation, since
+    * it inserts dead code that happens to have side effects, and it does
+    * so based on the actual physical registers in use.
+    */
+   insert_gen4_send_dependency_workarounds();
+
+   if (failed)
+      return;
+
+   /* Only reschedule after allocation when the allocation was not
+    * spill-free.
+    */
+   if (!allocated_without_spills)
+      schedule_instructions(SCHEDULE_POST);
+
+   if (last_scratch > 0)
+      prog_data->total_scratch = brw_get_scratch_size(last_scratch);
+}
+
+/**
+ * Read the environment variable var_name and interpret it as a boolean.
+ *
+ * "1", "true" and "yes" yield true; "0", "false" and "no" yield false.  The
+ * words are compared case-insensitively.  If the variable is not set, or
+ * holds any other string, default_value is returned.
+ */
+static bool
+env_var_as_boolean(const char *var_name, bool default_value)
+{
+   const char *str = getenv(var_name);
+   if (str == NULL)
+      return default_value;
+
+   if (strcmp(str, "1") == 0 ||
+       strcasecmp(str, "true") == 0 ||
+       strcasecmp(str, "yes") == 0) {
+      return true;
+   } else if (strcmp(str, "0") == 0 ||
+              strcasecmp(str, "false") == 0 ||
+              strcasecmp(str, "no") == 0) {
+      return false;
+   } else {
+      /* Unrecognized value: fall back to the caller's default. */
+      return default_value;
+   }
+}
+
+/**
+ * Compile a vertex shader with this visitor.
+ *
+ * Sets up binding table offsets and the VS payload, emits code from either
+ * NIR or the shader's IR instruction list, appends the URB writes, and then
+ * builds the CFG, optimizes, assigns CURB / VS URB setup, fixes up
+ * three-source null destinations and allocates registers.
+ *
+ * Returns false when `failed` is set, either right after code emission or
+ * at the end of the function; returns true otherwise.
+ */
+bool
+fs_visitor::run_vs()
+{
+   assert(stage == MESA_SHADER_VERTEX);
+
+   assign_common_binding_table_offsets(0);
+   setup_vs_payload();
+
+   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
+      emit_shader_time_begin();
+
+   /* The INTEL_USE_NIR environment variable selects the NIR path; it
+    * defaults to off, in which case every instruction in shader->base.ir
+    * is visited in order.
+    */
+   if (env_var_as_boolean("INTEL_USE_NIR", false)) {
+      emit_nir_code();
+   } else {
+      foreach_in_list(ir_instruction, ir, shader->base.ir) {
+         base_ir = ir;
+         this->result = reg_undef;
+         ir->accept(this);
+      }
+      base_ir = NULL;
+   }
+
+   /* Stop before the URB writes and CFG construction if code emission
+    * already failed.
+    */
+   if (failed)
+      return false;
+
+   emit_urb_writes();
+
+   calculate_cfg();
+
+   optimize();
+
+   assign_curb_setup();
+   assign_vs_urb_setup();
+
+   fixup_3src_null_dest();
+   allocate_registers();
+
+   return !failed;
 }
 
 bool
-fs_visitor::run()
+fs_visitor::run_fs()
 {
+   brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
+   brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
+
+   assert(stage == MESA_SHADER_FRAGMENT);
+
    sanity_param_count = prog->Parameters->NumParameters;
-   bool allocated_without_spills;
 
    assign_binding_table_offsets();
 
@@ -3456,7 +3931,6 @@ fs_visitor::run()
       emit_dummy_fs();
    } else if (brw->use_rep_send && dispatch_width == 16) {
       emit_repclear_shader();
-      allocated_without_spills = true;
    } else {
       if (INTEL_DEBUG & DEBUG_SHADER_TIME)
          emit_shader_time_begin();
@@ -3472,13 +3946,7 @@ fs_visitor::run()
       /* We handle discards by keeping track of the still-live pixels in f0.1.
        * Initialize it with the dispatched pixels.
        */
-      bool uses_kill =
-         (stage == MESA_SHADER_FRAGMENT) &&
-         ((brw_wm_prog_data*) this->prog_data)->uses_kill;
-      bool alpha_test_func =
-         (stage == MESA_SHADER_FRAGMENT) &&
-         ((brw_wm_prog_key*) this->key)->alpha_test_func;
-      if (uses_kill || alpha_test_func) {
+      if (wm_prog_data->uses_kill) {
          fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
          discard_init->flag_subreg = 1;
       }
@@ -3487,10 +3955,14 @@ fs_visitor::run()
        * functions called "main").
        */
       if (shader) {
-         foreach_in_list(ir_instruction, ir, shader->base.ir) {
-            base_ir = ir;
-            this->result = reg_undef;
-            ir->accept(this);
+         if (env_var_as_boolean("INTEL_USE_NIR", false)) {
+            emit_nir_code();
+         } else {
+            foreach_in_list(ir_instruction, ir, shader->base.ir) {
+               base_ir = ir;
+               this->result = reg_undef;
+               ir->accept(this);
+            }
          }
       } else {
          emit_fragment_program_code();
@@ -3501,151 +3973,32 @@ fs_visitor::run()
 
       emit(FS_OPCODE_PLACEHOLDER_HALT);
 
-      if (alpha_test_func)
+      if (wm_key->alpha_test_func)
          emit_alpha_test();
 
       emit_fb_writes();
 
-      calculate_cfg();
+      if (INTEL_DEBUG & DEBUG_SHADER_TIME)
+         emit_shader_time_end();
 
-      split_virtual_grfs();
+      calculate_cfg();
 
-      move_uniform_array_access_to_pull_constants();
-      assign_constant_locations();
-      demote_pull_constants();
-
-      opt_drop_redundant_mov_to_flags();
-
-#define OPT(pass, args...) do {                                            \
-      pass_num++;                                                          \
-      bool this_progress = pass(args);                                     \
-                                                                           \
-      if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) {      \
-         char filename[64];                                                \
-         snprintf(filename, 64, "fs%d-%04d-%02d-%02d-" #pass,              \
-                  dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
-                                                                           \
-         backend_visitor::dump_instructions(filename);                     \
-      }                                                                    \
-                                                                           \
-      progress = progress || this_progress;                                \
-   } while (false)
-
-      if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
-         char filename[64];
-         snprintf(filename, 64, "fs%d-%04d-00-start",
-                  dispatch_width, shader_prog ? shader_prog->Name : 0);
-
-         backend_visitor::dump_instructions(filename);
-      }
-
-      bool progress;
-      int iteration = 0;
-      do {
-        progress = false;
-         iteration++;
-         int pass_num = 0;
-
-         OPT(remove_duplicate_mrf_writes);
-
-         OPT(opt_algebraic);
-         OPT(opt_cse);
-         OPT(opt_copy_propagate);
-         OPT(opt_peephole_predicated_break);
-         OPT(dead_code_eliminate);
-         OPT(opt_peephole_sel);
-         OPT(dead_control_flow_eliminate, this);
-         OPT(opt_register_renaming);
-         OPT(opt_saturate_propagation);
-         OPT(register_coalesce);
-         OPT(compute_to_mrf);
-
-         OPT(compact_virtual_grfs);
-      } while (progress);
-
-      if (lower_load_payload()) {
-         split_virtual_grfs();
-         register_coalesce();
-         compute_to_mrf();
-         dead_code_eliminate();
-      }
-
-      lower_uniform_pull_constant_loads();
+      optimize();
 
       assign_curb_setup();
       assign_urb_setup();
 
-      static enum instruction_scheduler_mode pre_modes[] = {
-         SCHEDULE_PRE,
-         SCHEDULE_PRE_NON_LIFO,
-         SCHEDULE_PRE_LIFO,
-      };
+      fixup_3src_null_dest();
+      allocate_registers();
 
-      /* Try each scheduling heuristic to see if it can successfully register
-       * allocate without spilling.  They should be ordered by decreasing
-       * performance but increasing likelihood of allocating.
-       */
-      for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
-         schedule_instructions(pre_modes[i]);
-
-         if (0) {
-            assign_regs_trivial();
-            allocated_without_spills = true;
-         } else {
-            allocated_without_spills = assign_regs(false);
-         }
-         if (allocated_without_spills)
-            break;
-      }
-
-      if (!allocated_without_spills) {
-         /* We assume that any spilling is worse than just dropping back to
-          * SIMD8.  There's probably actually some intermediate point where
-          * SIMD16 with a couple of spills is still better.
-          */
-         if (dispatch_width == 16) {
-            fail("Failure to register allocate.  Reduce number of "
-                 "live scalar values to avoid this.");
-         } else {
-            perf_debug("Fragment shader triggered register spilling.  "
-                       "Try reducing the number of live scalar values to "
-                       "improve performance.\n");
-         }
-
-         /* Since we're out of heuristics, just go spill registers until we
-          * get an allocation.
-          */
-         while (!assign_regs(true)) {
-            if (failed)
-               break;
-         }
-      }
-   }
-   assert(force_uncompressed_stack == 0);
-
-   /* This must come after all optimization and register allocation, since
-    * it inserts dead code that happens to have side effects, and it does
-    * so based on the actual physical registers in use.
-    */
-   insert_gen4_send_dependency_workarounds();
-
-   if (failed)
-      return false;
-
-   if (!allocated_without_spills)
-      schedule_instructions(SCHEDULE_POST);
-
-   if (last_scratch > 0) {
-      prog_data->total_scratch = brw_get_scratch_size(last_scratch);
+      if (failed)
+         return false;
    }
 
-   if (stage == MESA_SHADER_FRAGMENT) {
-      brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
-      if (dispatch_width == 8)
-         prog_data->reg_blocks = brw_register_blocks(grf_used);
-      else
-         prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
-   }
+   if (dispatch_width == 8)
+      wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
+   else
+      wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
 
    /* If any state parameters were appended, then ParameterValues could have
     * been realloced, in which case the driver uniform storage set up by
@@ -3680,12 +4033,12 @@ brw_wm_fs_emit(struct brw_context *brw,
       shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
 
    if (unlikely(INTEL_DEBUG & DEBUG_WM))
-      brw_dump_ir(brw, "fragment", prog, &shader->base, &fp->Base);
+      brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
 
    /* Now the main event: Visit the shader IR and generate our FS IR for it.
     */
    fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
-   if (!v.run()) {
+   if (!v.run_fs()) {
       if (prog) {
          prog->LinkStatus = false;
          ralloc_strcat(&prog->InfoLog, v.fail_msg);
@@ -3704,7 +4057,7 @@ brw_wm_fs_emit(struct brw_context *brw,
       if (!v.simd16_unsupported) {
          /* Try a SIMD16 compile */
          v2.import_uniforms(&v);
-         if (!v2.run()) {
+         if (!v2.run_fs()) {
             perf_debug("SIMD16 shader failed to compile, falling back to "
                        "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
          } else {
@@ -3726,11 +4079,25 @@ brw_wm_fs_emit(struct brw_context *brw,
       prog_data->no_8 = false;
    }
 
-   const unsigned *assembly = NULL;
-   fs_generator g(brw, mem_ctx, key, prog_data, prog, fp,
-                  v.runtime_check_aads_emit, INTEL_DEBUG & DEBUG_WM);
-   assembly = g.generate_assembly(simd8_cfg, simd16_cfg,
-                                  final_assembly_size);
+   fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
+                  &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS");
+
+   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
+      char *name;
+      if (prog)
+         name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
+                                prog->Label ? prog->Label : "unnamed",
+                                prog->Name);
+      else
+         name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
+
+      g.enable_debug(name);
+   }
+
+   if (simd8_cfg)
+      g.generate_code(simd8_cfg, 8);
+   if (simd16_cfg)
+      prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
 
    if (unlikely(brw->perf_debug) && shader) {
       if (shader->compiled_once)
@@ -3743,20 +4110,18 @@ brw_wm_fs_emit(struct brw_context *brw,
       }
    }
 
-   return assembly;
+   return g.get_assembly(final_assembly_size);
 }
 
-bool
-brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
+extern "C" bool
+brw_fs_precompile(struct gl_context *ctx,
+                  struct gl_shader_program *shader_prog,
+                  struct gl_program *prog)
 {
    struct brw_context *brw = brw_context(ctx);
    struct brw_wm_prog_key key;
 
-   if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
-      return true;
-
-   struct gl_fragment_program *fp = (struct gl_fragment_program *)
-      prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
+   struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
    struct brw_fragment_program *bfp = brw_fragment_program(fp);
    bool program_uses_dfdy = fp->UsesDFdy;
 
@@ -3778,9 +4143,10 @@ brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
                                          BRW_FS_VARYING_INPUT_MASK) > 16)
       key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
 
+   const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
    unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
    for (unsigned i = 0; i < sampler_count; i++) {
-      if (fp->Base.ShadowSamplers & (1 << i)) {
+      if (!has_shader_channel_select && (fp->Base.ShadowSamplers & (1 << i))) {
          /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
          key.tex.swizzles[i] =
             MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
@@ -3803,18 +4169,12 @@ brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
                           key.nr_color_regions > 1;
    }
 
-   /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE.  The
-    * quality of the derivatives is likely to be determined by the driconf
-    * option.
-    */
-   key.high_quality_derivatives = brw->disable_derivative_optimization;
-
    key.program_string_id = bfp->id;
 
    uint32_t old_prog_offset = brw->wm.base.prog_offset;
    struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
 
-   bool success = do_wm_prog(brw, prog, bfp, &key);
+   bool success = do_wm_prog(brw, shader_prog, bfp, &key);
 
    brw->wm.base.prog_offset = old_prog_offset;
    brw->wm.prog_data = old_prog_data;