i965/fs: Lower DDY instructions to SIMD8 at SIMD lowering time
[mesa.git] src/mesa/drivers/dri/i965/brw_fs.cpp
index a5d403474f92ffb69e6044380717de454f3c49d2..c2dd9da5a4923e497c50b091a67effe3a315447e 100644
@@ -39,6 +39,7 @@
 #include "brw_program.h"
 #include "brw_dead_control_flow.h"
 #include "compiler/glsl_types.h"
+#include "program/prog_parameter.h"
 
 using namespace brw;
 
@@ -174,40 +175,28 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
     * CSE can later notice that those loads are all the same and eliminate
     * the redundant ones.
     */
-   fs_reg vec4_offset = vgrf(glsl_type::int_type);
+   fs_reg vec4_offset = vgrf(glsl_type::uint_type);
    bld.ADD(vec4_offset, varying_offset, brw_imm_ud(const_offset & ~0xf));
 
-   int scale = 1;
-   if (devinfo->gen == 4 && bld.dispatch_width() == 8) {
-      /* Pre-gen5, we can either use a SIMD8 message that requires (header,
-       * u, v, r) as parameters, or we can just use the SIMD16 message
-       * consisting of (header, u).  We choose the second, at the cost of a
-       * longer return length.
-       */
-      scale = 2;
-   }
-
-   enum opcode op;
-   if (devinfo->gen >= 7)
-      op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
-   else
-      op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
-
-   int regs_written = 4 * (bld.dispatch_width() / 8) * scale;
-   fs_reg vec4_result = fs_reg(VGRF, alloc.allocate(regs_written), dst.type);
-   fs_inst *inst = bld.emit(op, vec4_result, surf_index, vec4_offset);
-   inst->regs_written = regs_written;
+   /* The pull load message will load a vec4 (16 bytes). If we are loading
+    * a double, this means we are only loading 2 elements worth of data.
+    * We also want to use a 32-bit data type for the dst of the load operation
+    * so other parts of the driver don't get confused about the size of the
+    * result.
+    */
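+   /* For illustration (not part of the original change): with
+    * const_offset = 24 and a float destination, vec4_offset ends up at
+    * varying_offset + 16 and the MOV below selects component
+    * (24 & 0xf) / 4 = 2 of the loaded vec4; with a double destination the
+    * shuffle pass packs the 32-bit halves first and the MOV selects
+    * component (24 & 0xf) / 8 = 1.
+    */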
+   fs_reg vec4_result = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
+   fs_inst *inst = bld.emit(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL,
+                            vec4_result, surf_index, vec4_offset);
+   inst->regs_written = 4 * bld.dispatch_width() / 8;
 
-   if (devinfo->gen < 7) {
-      inst->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen);
-      inst->header_size = 1;
-      if (devinfo->gen == 4)
-         inst->mlen = 3;
-      else
-         inst->mlen = 1 + bld.dispatch_width() / 8;
+   if (type_sz(dst.type) == 8) {
+      shuffle_32bit_load_result_to_64bit_data(
+         bld, retype(vec4_result, dst.type), vec4_result, 2);
    }
 
-   bld.MOV(dst, offset(vec4_result, bld, ((const_offset & 0xf) / 4) * scale));
+   vec4_result.type = dst.type;
+   bld.MOV(dst, offset(vec4_result, bld,
+                       (const_offset & 0xf) / type_sz(vec4_result.type)));
 }
 
 /**
@@ -374,7 +363,7 @@ fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const
       if (i < this->header_size) {
          reg.reg_offset += 1;
       } else {
-         reg.reg_offset += this->exec_size / 8;
+         reg = horiz_offset(reg, this->exec_size);
       }
    }
 
@@ -433,7 +422,6 @@ fs_reg::fs_reg(struct ::brw_reg reg) :
 {
    this->reg_offset = 0;
    this->subreg_offset = 0;
-   this->reladdr = NULL;
    this->stride = 1;
    if (this->file == IMM &&
        (this->type != BRW_REGISTER_TYPE_V &&
@@ -448,7 +436,6 @@ fs_reg::equals(const fs_reg &r) const
 {
    return (this->backend_reg::equals(r) &&
            subreg_offset == r.subreg_offset &&
-           !reladdr && !r.reladdr &&
            stride == r.stride);
 }
 
@@ -487,6 +474,8 @@ type_size_scalar(const struct glsl_type *type)
    case GLSL_TYPE_FLOAT:
    case GLSL_TYPE_BOOL:
       return type->components();
+   case GLSL_TYPE_DOUBLE:
+      return type->components() * 2;
    case GLSL_TYPE_ARRAY:
       return type_size_scalar(type->fields.array) * type->length;
    case GLSL_TYPE_STRUCT:
@@ -509,7 +498,7 @@ type_size_scalar(const struct glsl_type *type)
    case GLSL_TYPE_VOID:
    case GLSL_TYPE_ERROR:
    case GLSL_TYPE_INTERFACE:
-   case GLSL_TYPE_DOUBLE:
+   case GLSL_TYPE_FUNCTION:
       unreachable("not reached");
    }
 
@@ -529,6 +518,19 @@ type_size_vec4_times_4(const struct glsl_type *type)
    return 4 * type_size_vec4(type);
 }
 
+/* Attribute arrays are loaded as one vec4 per element (or matrix column),
+ * except for double-precision types, which are loaded as one dvec4 per
+ * element (or matrix column).
+ */
+extern "C" int
+type_size_vs_input(const struct glsl_type *type)
+{
+   if (type->is_double()) {
+      return type_size_dvec4(type);
+   } else {
+      return type_size_vec4(type);
+   }
+}
+
 /**
  * Create a MOV to read the timestamp register.
  *
@@ -700,7 +702,8 @@ fs_inst::is_partial_write() const
 {
    return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
            (this->exec_size * type_sz(this->dst.type)) < 32 ||
-           !this->dst.is_contiguous());
+           !this->dst.is_contiguous() ||
+           this->dst.subreg_offset > 0);
 }
 
 unsigned
@@ -739,6 +742,7 @@ fs_inst::components_read(unsigned i) const
    case SHADER_OPCODE_LOD_LOGICAL:
    case SHADER_OPCODE_TG4_LOGICAL:
    case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
+   case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
       assert(src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM &&
              src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM);
       /* Texture coordinates. */
@@ -852,7 +856,10 @@ fs_inst::regs_read(int arg) const
          assert(src[2].file == IMM);
          unsigned region_length = src[2].ud;
 
-         if (src[0].file == FIXED_GRF) {
+         if (src[0].file == UNIFORM) {
+            assert(region_length % 4 == 0);
+            return region_length / 4;
+         } else if (src[0].file == FIXED_GRF) {
             /* If the start of the region is not register aligned, then
              * there's some portion of the register that's technically
              * unread at the beginning.
@@ -866,7 +873,7 @@ fs_inst::regs_read(int arg) const
              * unread portion at the beginning.
              */
             if (src[0].subnr)
-               region_length += src[0].subnr * type_sz(src[0].type);
+               region_length += src[0].subnr;
 
             return DIV_ROUND_UP(region_length, REG_SIZE);
          } else {
@@ -947,12 +954,14 @@ fs_visitor::implied_mrf_writes(fs_inst *inst)
    case FS_OPCODE_TXB:
    case SHADER_OPCODE_TXD:
    case SHADER_OPCODE_TXF:
+   case SHADER_OPCODE_TXF_LZ:
    case SHADER_OPCODE_TXF_CMS:
    case SHADER_OPCODE_TXF_CMS_W:
    case SHADER_OPCODE_TXF_MCS:
    case SHADER_OPCODE_TG4:
    case SHADER_OPCODE_TG4_OFFSET:
    case SHADER_OPCODE_TXL:
+   case SHADER_OPCODE_TXL_LZ:
    case SHADER_OPCODE_TXS:
    case SHADER_OPCODE_LOD:
    case SHADER_OPCODE_SAMPLEINFO:
@@ -963,7 +972,7 @@ fs_visitor::implied_mrf_writes(fs_inst *inst)
    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
    case SHADER_OPCODE_GEN4_SCRATCH_READ:
       return 1;
-   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
+   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4:
       return inst->mlen;
    case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
       return inst->mlen;
@@ -1022,41 +1031,21 @@ fs_visitor::import_uniforms(fs_visitor *v)
    this->push_constant_loc = v->push_constant_loc;
    this->pull_constant_loc = v->pull_constant_loc;
    this->uniforms = v->uniforms;
-   this->param_size = v->param_size;
 }
 
 fs_reg *
-fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
-                                         bool origin_upper_left)
+fs_visitor::emit_fragcoord_interpolation()
 {
    assert(stage == MESA_SHADER_FRAGMENT);
-   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
    fs_reg wpos = *reg;
-   bool flip = !origin_upper_left ^ key->render_to_fbo;
 
    /* gl_FragCoord.x */
-   if (pixel_center_integer) {
-      bld.MOV(wpos, this->pixel_x);
-   } else {
-      bld.ADD(wpos, this->pixel_x, brw_imm_f(0.5f));
-   }
+   bld.MOV(wpos, this->pixel_x);
    wpos = offset(wpos, bld, 1);
 
    /* gl_FragCoord.y */
-   if (!flip && pixel_center_integer) {
-      bld.MOV(wpos, this->pixel_y);
-   } else {
-      fs_reg pixel_y = this->pixel_y;
-      float offset = (pixel_center_integer ? 0.0f : 0.5f);
-
-      if (flip) {
-        pixel_y.negate = true;
-        offset += key->drawable_height - 1.0f;
-      }
-
-      bld.ADD(wpos, pixel_y, brw_imm_f(offset));
-   }
+   bld.MOV(wpos, this->pixel_y);
    wpos = offset(wpos, bld, 1);
 
    /* gl_FragCoord.z */
@@ -1191,8 +1180,8 @@ fs_visitor::emit_general_interpolation(fs_reg *attr, const char *name,
                   inst->no_dd_clear = true;
 
                inst = emit_linterp(*attr, fs_reg(interp), interpolation_mode,
-                                   mod_centroid && !key->persample_shading,
-                                   mod_sample || key->persample_shading);
+                                   mod_centroid && !key->persample_interp,
+                                   mod_sample || key->persample_interp);
                inst->predicate = BRW_PREDICATE_NORMAL;
                inst->predicate_inverse = false;
                if (devinfo->has_pln)
@@ -1200,8 +1189,8 @@ fs_visitor::emit_general_interpolation(fs_reg *attr, const char *name,
 
             } else {
                emit_linterp(*attr, fs_reg(interp), interpolation_mode,
-                            mod_centroid && !key->persample_shading,
-                            mod_sample || key->persample_shading);
+                            mod_centroid && !key->persample_interp,
+                            mod_sample || key->persample_interp);
             }
             if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
                bld.MUL(*attr, *attr, this->pixel_w);
@@ -1258,10 +1247,10 @@ void
 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
 {
    assert(stage == MESA_SHADER_FRAGMENT);
-   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
+   brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
    assert(dst.type == BRW_REGISTER_TYPE_F);
 
-   if (key->compute_pos_offset) {
+   if (wm_prog_data->persample_dispatch) {
       /* Convert int_sample_pos to floating point */
       bld.MOV(dst, int_sample_pos);
       /* Scale to the range [0, 1] */
@@ -1336,7 +1325,48 @@ fs_visitor::emit_sampleid_setup()
    const fs_builder abld = bld.annotate("compute sample id");
    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
 
-   if (key->compute_sample_id) {
+   if (!key->multisample_fbo) {
+      /* As per GL_ARB_sample_shading specification:
+       * "When rendering to a non-multisample buffer, or if multisample
+       *  rasterization is disabled, gl_SampleID will always be zero."
+       */
+      abld.MOV(*reg, brw_imm_d(0));
+   } else if (devinfo->gen >= 8) {
+      /* Sample ID comes in as 4-bit numbers in g1.0:
+       *
+       *    15:12 Slot 3 SampleID (only used in SIMD16)
+       *     11:8 Slot 2 SampleID (only used in SIMD16)
+       *      7:4 Slot 1 SampleID
+       *      3:0 Slot 0 SampleID
+       *
+       * Each slot corresponds to four channels, so we want to replicate each
+       * half-byte value to 4 channels in a row:
+       *
+       *    dst+0:    .7    .6    .5    .4    .3    .2    .1    .0
+       *             7:4   7:4   7:4   7:4   3:0   3:0   3:0   3:0
+       *
+       *    dst+1:    .7    .6    .5    .4    .3    .2    .1    .0  (if SIMD16)
+       *           15:12 15:12 15:12 15:12  11:8  11:8  11:8  11:8
+       *
+       * First, we read g1.0 with a <1,8,0>UB region, causing the first 8
+       * channels to read the first byte (7:0), and the second group of 8
+       * channels to read the second byte (15:8).  Then, we shift right by
+       * a vector immediate of <4, 4, 4, 4, 0, 0, 0, 0>, moving the slot 1 / 3
+       * values into place.  Finally, we AND with 0xf to keep the low nibble.
+       *
+       *    shr(16) tmp<1>W g1.0<1,8,0>B 0x44440000:V
+       *    and(16) dst<1>D tmp<8,8,1>W  0xf:W
+       *
+       * TODO: These payload bits exist on Gen7 too, but they appear to always
+       *       be zero, so this code fails to work.  We should find out why.
+       */
+      fs_reg tmp(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_W);
+
+      abld.SHR(tmp, fs_reg(stride(retype(brw_vec1_grf(1, 0),
+                                         BRW_REGISTER_TYPE_B), 1, 8, 0)),
+                    brw_imm_v(0x44440000));
+      abld.AND(*reg, tmp, brw_imm_w(0xf));
+   } else {
       fs_reg t1(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_D);
       t1.set_smear(0);
       fs_reg t2(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_W);
@@ -1364,32 +1394,63 @@ fs_visitor::emit_sampleid_setup()
       /* SKL+ has an extra bit for the Starting Sample Pair Index to
        * accommodate 16x MSAA.
        */
-      unsigned sspi_mask = devinfo->gen >= 9 ? 0x1c0 : 0xc0;
-
       abld.exec_all().group(1, 0)
           .AND(t1, fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
-               brw_imm_ud(sspi_mask));
+               brw_imm_ud(0xc0));
       abld.exec_all().group(1, 0).SHR(t1, t1, brw_imm_d(5));
 
       /* This works for both SIMD8 and SIMD16 */
-      abld.exec_all().group(4, 0)
-          .MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210));
+      abld.exec_all().group(4, 0).MOV(t2, brw_imm_v(0x3210));
 
       /* This special instruction takes care of setting vstride=1,
        * width=4, hstride=0 of t2 during an ADD instruction.
        */
       abld.emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
-   } else {
-      /* As per GL_ARB_sample_shading specification:
-       * "When rendering to a non-multisample buffer, or if multisample
-       *  rasterization is disabled, gl_SampleID will always be zero."
-       */
-      abld.MOV(*reg, brw_imm_d(0));
    }
 
    return reg;
 }
 
+fs_reg *
+fs_visitor::emit_samplemaskin_setup()
+{
+   assert(stage == MESA_SHADER_FRAGMENT);
+   brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
+   assert(devinfo->gen >= 6);
+
+   fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
+
+   fs_reg coverage_mask(retype(brw_vec8_grf(payload.sample_mask_in_reg, 0),
+                               BRW_REGISTER_TYPE_D));
+
+   if (wm_prog_data->persample_dispatch) {
+      /* gl_SampleMaskIn[] comes from two sources: the input coverage mask,
+       * and a mask representing which sample is being processed by the
+       * current shader invocation.
+       *
+       * From the OES_sample_variables specification:
+       * "When per-sample shading is active due to the use of a fragment input
+       *  qualified by "sample" or due to the use of the gl_SampleID or
+       *  gl_SamplePosition variables, only the bit for the current sample is
+       *  set in gl_SampleMaskIn."
+       */
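+      /* Illustrative example (not from the original change): with an input
+       * coverage mask of 0xd and gl_SampleID = 2, the invocation sees
+       * gl_SampleMaskIn[0] = (1 << 2) & 0xd = 0x4, i.e. only the bit for the
+       * sample it is processing.
+       */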
+      const fs_builder abld = bld.annotate("compute gl_SampleMaskIn");
+
+      if (nir_system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE)
+         nir_system_values[SYSTEM_VALUE_SAMPLE_ID] = *emit_sampleid_setup();
+
+      fs_reg one = vgrf(glsl_type::int_type);
+      fs_reg enabled_mask = vgrf(glsl_type::int_type);
+      abld.MOV(one, brw_imm_d(1));
+      abld.SHL(enabled_mask, one, nir_system_values[SYSTEM_VALUE_SAMPLE_ID]);
+      abld.AND(*reg, enabled_mask, coverage_mask);
+   } else {
+      /* In per-pixel mode, the coverage mask is sufficient. */
+      *reg = coverage_mask;
+   }
+   return reg;
+}
+
 fs_reg
 fs_visitor::resolve_source_modifiers(const fs_reg &src)
 {
@@ -1473,20 +1534,6 @@ fs_visitor::emit_gs_thread_end()
 void
 fs_visitor::assign_curb_setup()
 {
-   if (dispatch_width == 8) {
-      prog_data->dispatch_grf_start_reg = payload.num_regs;
-   } else {
-      if (stage == MESA_SHADER_FRAGMENT) {
-         brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
-         prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
-      } else if (stage == MESA_SHADER_COMPUTE) {
-         brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
-         prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
-      } else {
-         unreachable("Unsupported shader type!");
-      }
-   }
-
    prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
 
    /* Map the offsets in the UNIFORM file to fixed HW regs. */
@@ -1652,11 +1699,28 @@ fs_visitor::convert_attr_sources_to_hw_regs(fs_inst *inst)
                    inst->src[i].nr +
                    inst->src[i].reg_offset;
 
-         unsigned width = inst->src[i].stride == 0 ? 1 : inst->exec_size;
+         /* As explained in brw_reg_from_fs_reg(), from the Haswell PRM:
+          *
+          *    "VertStride must be used to cross GRF register boundaries.
+          *     This rule implies that elements within a 'Width' cannot cross
+          *     GRF boundaries."
+          *
+          * So, for registers that are large enough, we have to split the exec
+          * size in two and trust the compression state to sort it out.
+          */
+         unsigned total_size = inst->exec_size *
+                               inst->src[i].stride *
+                               type_sz(inst->src[i].type);
+
+         assert(total_size <= 2 * REG_SIZE);
+         const unsigned exec_size =
+            (total_size <= REG_SIZE) ? inst->exec_size : inst->exec_size / 2;
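+         /* E.g. (illustrative): a SIMD8 source of packed doubles spans
+          * 8 * 1 * 8 = 64 bytes = 2 GRFs, so exec_size drops to 4 and the
+          * region built below becomes <4;4,1> for each compressed half.
+          */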
+
+         unsigned width = inst->src[i].stride == 0 ? 1 : exec_size;
          struct brw_reg reg =
             stride(byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
                                inst->src[i].subreg_offset),
-                   inst->exec_size * inst->src[i].stride,
+                   exec_size * inst->src[i].stride,
                    width, inst->src[i].stride);
          reg.abs = inst->src[i].abs;
          reg.negate = inst->src[i].negate;
@@ -1674,7 +1738,7 @@ fs_visitor::assign_vs_urb_setup()
    assert(stage == MESA_SHADER_VERTEX);
 
    /* Each attribute is 4 regs. */
-   this->first_non_payload_grf += 4 * vs_prog_data->nr_attributes;
+   this->first_non_payload_grf += 4 * vs_prog_data->nr_attribute_slots;
 
    assert(vs_prog_data->base.urb_read_length <= 15);
 
@@ -1684,6 +1748,17 @@ fs_visitor::assign_vs_urb_setup()
    }
 }
 
+void
+fs_visitor::assign_tcs_single_patch_urb_setup()
+{
+   assert(stage == MESA_SHADER_TESS_CTRL);
+
+   /* Rewrite all ATTR file references to HW_REGs. */
+   foreach_block_and_inst(block, fs_inst, inst, cfg) {
+      convert_attr_sources_to_hw_regs(inst);
+   }
+}
+
 void
 fs_visitor::assign_tes_urb_setup()
 {
@@ -1918,6 +1993,45 @@ fs_visitor::compact_virtual_grfs()
    return progress;
 }
 
+static void
+set_push_pull_constant_loc(unsigned uniform, int *chunk_start, bool contiguous,
+                           int *push_constant_loc, int *pull_constant_loc,
+                           unsigned *num_push_constants,
+                           unsigned *num_pull_constants,
+                           const unsigned max_push_components,
+                           const unsigned max_chunk_size,
+                           struct brw_stage_prog_data *stage_prog_data)
+{
+   /* This is the first live uniform in the chunk */
+   if (*chunk_start < 0)
+      *chunk_start = uniform;
+
+   /* If this element does not need to be contiguous with the next, we
+    * split at this point and everything between chunk_start and uniform
+    * forms a single chunk.
+    */
+   if (!contiguous) {
+      unsigned chunk_size = uniform - *chunk_start + 1;
+
+      /* Decide whether we should push or pull this parameter.  In the
+       * Vulkan driver, push constants are explicitly exposed via the API
+       * so we push everything.  In GL, we only push small arrays.
+       */
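+      /* Illustrative example (not part of the original change): in GL, with
+       * max_chunk_size = 16, an indirectly addressed mat4 (16 floats) can
+       * still be pushed, while a float[32] array is demoted to the pull
+       * buffer; the Vulkan driver (pull_param == NULL) pushes everything.
+       */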
+      if (stage_prog_data->pull_param == NULL ||
+          (*num_push_constants + chunk_size <= max_push_components &&
+           chunk_size <= max_chunk_size)) {
+         assert(*num_push_constants + chunk_size <= max_push_components);
+         for (unsigned j = *chunk_start; j <= uniform; j++)
+            push_constant_loc[j] = (*num_push_constants)++;
+      } else {
+         for (unsigned j = *chunk_start; j <= uniform; j++)
+            pull_constant_loc[j] = (*num_pull_constants)++;
+      }
+
+      *chunk_start = -1;
+   }
+}
+
 /**
  * Assign UNIFORM file registers to either push constants or pull constants.
  *
@@ -1925,31 +2039,32 @@ fs_visitor::compact_virtual_grfs()
  * maximum number of fragment shader uniform components (64).  If
  * there are too many of these, they'd fill up all of register space.
  * So, this will push some of them out to the pull constant buffer and
- * update the program to load them.  We also use pull constants for all
- * indirect constant loads because we don't support indirect accesses in
- * registers yet.
+ * update the program to load them.
  */
 void
 fs_visitor::assign_constant_locations()
 {
-   /* Only the first compile (SIMD8 mode) gets to decide on locations. */
-   if (dispatch_width != 8)
+   /* Only the first compile gets to decide on locations. */
+   if (dispatch_width != min_dispatch_width)
       return;
 
-   unsigned int num_pull_constants = 0;
-
-   pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
-   memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
-
    bool is_live[uniforms];
    memset(is_live, 0, sizeof(is_live));
+   bool is_live_64bit[uniforms];
+   memset(is_live_64bit, 0, sizeof(is_live_64bit));
+
+   /* For each uniform slot, a value of true indicates that the given slot and
+    * the next slot must remain contiguous.  This is used to keep us from
+    * splitting arrays apart.
+    */
+   bool contiguous[uniforms];
+   memset(contiguous, 0, sizeof(contiguous));
 
    /* First, we walk through the instructions and do two things:
     *
     *  1) Figure out which uniforms are live.
     *
-    *  2) Find all indirect access of uniform arrays and flag them as needing
-    *     to go into the pull constant buffer.
+    *  2) Mark any indirectly used ranges of registers as contiguous.
     *
     * Note that we don't move constant-indexed accesses to arrays.  No
     * testing has been done of the performance impact of this choice.
@@ -1959,22 +2074,32 @@ fs_visitor::assign_constant_locations()
          if (inst->src[i].file != UNIFORM)
             continue;
 
-         if (inst->src[i].reladdr) {
-            int uniform = inst->src[i].nr;
+         int constant_nr = inst->src[i].nr + inst->src[i].reg_offset;
 
-            /* If this array isn't already present in the pull constant buffer,
-             * add it.
-             */
-            if (pull_constant_loc[uniform] == -1) {
-               assert(param_size[uniform]);
-               for (int j = 0; j < param_size[uniform]; j++)
-                  pull_constant_loc[uniform + j] = num_pull_constants++;
+         if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) {
+            assert(inst->src[2].ud % 4 == 0);
+            unsigned last = constant_nr + (inst->src[2].ud / 4) - 1;
+            assert(last < uniforms);
+
+            for (unsigned j = constant_nr; j < last; j++) {
+               is_live[j] = true;
+               contiguous[j] = true;
+               if (type_sz(inst->src[i].type) == 8) {
+                  is_live_64bit[j] = true;
+               }
             }
+            is_live[last] = true;
          } else {
-            /* Mark the the one accessed uniform as live */
-            int constant_nr = inst->src[i].nr + inst->src[i].reg_offset;
-            if (constant_nr >= 0 && constant_nr < (int) uniforms)
-               is_live[constant_nr] = true;
+            if (constant_nr >= 0 && constant_nr < (int) uniforms) {
+               int regs_read = inst->components_read(i) *
+                  type_sz(inst->src[i].type) / 4;
+               for (int j = 0; j < regs_read; j++) {
+                  is_live[constant_nr + j] = true;
+                  if (type_sz(inst->src[i].type) == 8) {
+                     is_live_64bit[constant_nr + j] = true;
+                  }
+               }
+            }
          }
       }
    }
@@ -1987,32 +2112,58 @@ fs_visitor::assign_constant_locations()
     * If changing this value, note the limitation about total_regs in
     * brw_curbe.c.
     */
-   unsigned int max_push_components = 16 * 8;
+   const unsigned int max_push_components = 16 * 8;
+
+   /* We push small arrays, but no bigger than 16 floats.  This is big enough
+    * for a vec4 but hopefully not large enough to push out other stuff.  We
+    * should probably use a better heuristic at some point.
+    */
+   const unsigned int max_chunk_size = 16;
+
    unsigned int num_push_constants = 0;
+   unsigned int num_pull_constants = 0;
 
    push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
+   pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
 
-   for (unsigned int i = 0; i < uniforms; i++) {
-      if (!is_live[i] || pull_constant_loc[i] != -1) {
-         /* This UNIFORM register is either dead, or has already been demoted
-          * to a pull const.  Mark it as no longer living in the param[] array.
-          */
-         push_constant_loc[i] = -1;
+   /* Default to -1 meaning no location */
+   memset(push_constant_loc, -1, uniforms * sizeof(*push_constant_loc));
+   memset(pull_constant_loc, -1, uniforms * sizeof(*pull_constant_loc));
+
+   int chunk_start = -1;
+
+   /* First push 64-bit uniforms to ensure they are properly aligned */
+   for (unsigned u = 0; u < uniforms; u++) {
+      if (!is_live[u] || !is_live_64bit[u])
          continue;
-      }
 
-      if (num_push_constants < max_push_components) {
-         /* Retain as a push constant.  Record the location in the params[]
-          * array.
-          */
-         push_constant_loc[i] = num_push_constants++;
-      } else {
-         /* Demote to a pull constant. */
-         push_constant_loc[i] = -1;
-         pull_constant_loc[i] = num_pull_constants++;
-      }
+      set_push_pull_constant_loc(u, &chunk_start, contiguous[u],
+                                 push_constant_loc, pull_constant_loc,
+                                 &num_push_constants, &num_pull_constants,
+                                 max_push_components, max_chunk_size,
+                                 stage_prog_data);
+
+   }
+
+   /* Then push the rest of uniforms */
+   for (unsigned u = 0; u < uniforms; u++) {
+      if (!is_live[u] || is_live_64bit[u])
+         continue;
+
+      set_push_pull_constant_loc(u, &chunk_start, contiguous[u],
+                                 push_constant_loc, pull_constant_loc,
+                                 &num_push_constants, &num_pull_constants,
+                                 max_push_components, max_chunk_size,
+                                 stage_prog_data);
    }
 
+   /* As the uniforms are going to be reordered, take the data from a temporary
+    * copy of the original param[].
+    */
+   gl_constant_value **param = ralloc_array(NULL, gl_constant_value*,
+                                            stage_prog_data->nr_params);
+   memcpy(param, stage_prog_data->param,
+          sizeof(gl_constant_value*) * stage_prog_data->nr_params);
    stage_prog_data->nr_params = num_push_constants;
    stage_prog_data->nr_pull_params = num_pull_constants;
 
@@ -2025,7 +2176,7 @@ fs_visitor::assign_constant_locations()
     * having to make a copy.
     */
    for (unsigned int i = 0; i < uniforms; i++) {
-      const gl_constant_value *value = stage_prog_data->param[i];
+      const gl_constant_value *value = param[i];
 
       if (pull_constant_loc[i] != -1) {
          stage_prog_data->pull_param[pull_constant_loc[i]] = value;
@@ -2033,6 +2184,7 @@ fs_visitor::assign_constant_locations()
          stage_prog_data->param[push_constant_loc[i]] = value;
       }
    }
+   ralloc_free(param);
 }
 
 /**
@@ -2040,51 +2192,75 @@ fs_visitor::assign_constant_locations()
  * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
  */
 void
-fs_visitor::demote_pull_constants()
+fs_visitor::lower_constant_loads()
 {
-   foreach_block_and_inst (block, fs_inst, inst, cfg) {
+   const unsigned index = stage_prog_data->binding_table.pull_constants_start;
+
+   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
+      /* Set up the annotation tracking for new generated instructions. */
+      const fs_builder ibld(this, block, inst);
+
       for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;
 
-         int pull_index;
+         /* We'll handle this case later */
+         if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0)
+            continue;
+
          unsigned location = inst->src[i].nr + inst->src[i].reg_offset;
-         if (location >= uniforms) /* Out of bounds access */
-            pull_index = -1;
-         else
-            pull_index = pull_constant_loc[location];
+         if (location >= uniforms)
+            continue; /* Out of bounds access */
+
+         int pull_index = pull_constant_loc[location];
 
          if (pull_index == -1)
            continue;
 
-         /* Set up the annotation tracking for new generated instructions. */
-         const fs_builder ibld(this, block, inst);
          const unsigned index = stage_prog_data->binding_table.pull_constants_start;
-         fs_reg dst = vgrf(glsl_type::float_type);
+         fs_reg dst;
+
+         if (type_sz(inst->src[i].type) <= 4)
+            dst = vgrf(glsl_type::float_type);
+         else
+            dst = vgrf(glsl_type::double_type);
 
          assert(inst->src[i].stride == 0);
 
-         /* Generate a pull load into dst. */
-         if (inst->src[i].reladdr) {
-            VARYING_PULL_CONSTANT_LOAD(ibld, dst,
-                                       brw_imm_ud(index),
-                                       *inst->src[i].reladdr,
-                                       pull_index * 4);
-            inst->src[i].reladdr = NULL;
-            inst->src[i].stride = 1;
-         } else {
-            const fs_builder ubld = ibld.exec_all().group(8, 0);
-            struct brw_reg offset = brw_imm_ud((unsigned)(pull_index * 4) & ~15);
-            ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
-                      dst, brw_imm_ud(index), offset);
-            inst->src[i].set_smear(pull_index & 3);
-         }
-         brw_mark_surface_used(prog_data, index);
+         const fs_builder ubld = ibld.exec_all().group(8, 0);
+         struct brw_reg offset = brw_imm_ud((unsigned)(pull_index * 4) & ~15);
+         ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
+                   dst, brw_imm_ud(index), offset);
 
          /* Rewrite the instruction to use the temporary VGRF. */
          inst->src[i].file = VGRF;
          inst->src[i].nr = dst.nr;
          inst->src[i].reg_offset = 0;
+         inst->src[i].set_smear((pull_index & 3) * 4 /
+                                type_sz(inst->src[i].type));
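+         /* Illustrative example: for a float uniform at pull_index = 6, the
+          * pull load above fetched the 16-byte block at offset 16, and the
+          * smear selects component (6 & 3) * 4 / 4 = 2 of it; for a double at
+          * the same slot the smear would be (6 & 3) * 4 / 8 = 1.
+          */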
+
+         brw_mark_surface_used(prog_data, index);
+      }
+
+      if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT &&
+          inst->src[0].file == UNIFORM) {
+
+         unsigned location = inst->src[0].nr + inst->src[0].reg_offset;
+         if (location >= uniforms)
+            continue; /* Out of bounds access */
+
+         int pull_index = pull_constant_loc[location];
+
+         if (pull_index == -1)
+           continue;
+
+         VARYING_PULL_CONSTANT_LOAD(ibld, inst->dst,
+                                    brw_imm_ud(index),
+                                    inst->src[1],
+                                    pull_index * 4);
+         inst->remove(block);
+
+         brw_mark_surface_used(prog_data, index);
       }
    }
    invalidate_live_intervals();
@@ -2273,17 +2449,6 @@ fs_visitor::opt_algebraic()
             progress = true;
          }
          break;
-      case SHADER_OPCODE_RCP: {
-         fs_inst *prev = (fs_inst *)inst->prev;
-         if (prev->opcode == SHADER_OPCODE_SQRT) {
-            if (inst->src[0].equals(prev->dst)) {
-               inst->opcode = SHADER_OPCODE_RSQ;
-               inst->src[0] = prev->src[0];
-               progress = true;
-            }
-         }
-         break;
-      }
       case SHADER_OPCODE_BROADCAST:
          if (is_uniform(inst->src[0])) {
             inst->opcode = BRW_OPCODE_MOV;
@@ -2432,6 +2597,7 @@ fs_visitor::opt_sampler_eot()
    tex_inst->offset |= fb_write->target << 24;
    tex_inst->eot = true;
    tex_inst->dst = ibld.null_reg_ud();
+   tex_inst->regs_written = 0;
    fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
 
    /* If a header is present, marking the eot is sufficient. Otherwise, we need
@@ -2440,8 +2606,10 @@ fs_visitor::opt_sampler_eot()
     * we have enough space, but it will make sure the dead code eliminator kills
     * the instruction that this will replace.
     */
-   if (tex_inst->header_size != 0)
+   if (tex_inst->header_size != 0) {
+      invalidate_live_intervals();
       return true;
+   }
 
    fs_reg send_header = ibld.vgrf(BRW_REGISTER_TYPE_F,
                                   load_payload->sources + 1);
@@ -2472,6 +2640,7 @@ fs_visitor::opt_sampler_eot()
    tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload);
    tex_inst->src[0] = send_header;
 
+   invalidate_live_intervals();
    return true;
 }
 
@@ -2792,12 +2961,23 @@ void
 fs_visitor::emit_repclear_shader()
 {
    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
-   int base_mrf = 1;
+   int base_mrf = 0;
    int color_mrf = base_mrf + 2;
+   fs_inst *mov;
 
-   fs_inst *mov = bld.exec_all().group(4, 0)
-                     .MOV(brw_message_reg(color_mrf),
-                          fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F));
+   if (uniforms > 0) {
+      mov = bld.exec_all().group(4, 0)
+               .MOV(brw_message_reg(color_mrf),
+                    fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F));
+   } else {
+      struct brw_reg reg =
+         brw_reg(BRW_GENERAL_REGISTER_FILE, 2, 3, 0, 0, BRW_REGISTER_TYPE_F,
+                 BRW_VERTICAL_STRIDE_8, BRW_WIDTH_2, BRW_HORIZONTAL_STRIDE_4,
+                 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
+
+      mov = bld.exec_all().group(4, 0)
+               .MOV(vec4(brw_message_reg(color_mrf)), fs_reg(reg));
+   }
 
    fs_inst *write;
    if (key->nr_color_regions == 1) {
@@ -2826,8 +3006,10 @@ fs_visitor::emit_repclear_shader()
    assign_curb_setup();
 
    /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
-   assert(mov->src[0].file == FIXED_GRF);
-   mov->src[0] = brw_vec4_grf(mov->src[0].nr, 0);
+   if (uniforms > 0) {
+      assert(mov->src[0].file == FIXED_GRF);
+      mov->src[0] = brw_vec4_grf(mov->src[0].nr, 0);
+   }
 }
 
 /**
@@ -3475,6 +3657,36 @@ fs_visitor::lower_integer_multiplication()
    return progress;
 }
 
+bool
+fs_visitor::lower_minmax()
+{
+   assert(devinfo->gen < 6);
+
+   bool progress = false;
+
+   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
+      const fs_builder ibld(this, block, inst);
+
+      if (inst->opcode == BRW_OPCODE_SEL &&
+          inst->predicate == BRW_PREDICATE_NONE) {
+         /* FIXME: Using CMP doesn't preserve the NaN propagation semantics of
+          *        the original SEL.L/GE instruction
+          */
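+         /* The net effect (illustrative): SEL.L dst, a, b becomes
+          *
+          *    CMP.L null, a, b
+          *    (+f0) SEL dst, a, b
+          *
+          * i.e. the comparison writes the flag register and the SEL is
+          * predicated on it instead of carrying a conditional modifier.
+          */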
+         ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1],
+                  inst->conditional_mod);
+         inst->predicate = BRW_PREDICATE_NORMAL;
+         inst->conditional_mod = BRW_CONDITIONAL_NONE;
+
+         progress = true;
+      }
+   }
+
+   if (progress)
+      invalidate_live_intervals();
+
+   return progress;
+}
+
 static void
 setup_color_payload(const fs_builder &bld, const brw_wm_prog_key *key,
                     fs_reg *dst, fs_reg color, unsigned components)
@@ -3885,6 +4097,7 @@ lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op,
 
    if (op == SHADER_OPCODE_TG4 || op == SHADER_OPCODE_TG4_OFFSET ||
        offset_value.file != BAD_FILE ||
+       op == SHADER_OPCODE_SAMPLEINFO ||
        is_high_sampler(devinfo, sampler)) {
       /* For general texture offsets (no txf workaround), we need a header to
        * put them in.  Note that we're only reserving space for it in the
@@ -3899,6 +4112,16 @@ lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op,
       header_size = 1;
       sources[0] = fs_reg();
       length++;
+
+      /* If we're requesting fewer than four channels worth of response,
+       * and we have an explicit header, we need to set up the sampler
+       * writemask.  It's reversed from normal: 1 means "don't write".
+       */
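+      /* E.g. (illustrative): an instruction that only writes one channel per
+       * lane has regs_written == reg_width, so the mask below is
+       * ~0x1 & 0xf = 0xe, telling the sampler not to return y/z/w.
+       */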
+      if (inst->regs_written != 4 * reg_width) {
+         assert((inst->regs_written % reg_width) == 0);
+         unsigned mask = ~((1 << (inst->regs_written / reg_width)) - 1) & 0xf;
+         inst->offset |= mask << 12;
+      }
    }
 
    if (shadow_c.file != BAD_FILE) {
@@ -3922,6 +4145,10 @@ lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op,
    switch (op) {
    case FS_OPCODE_TXB:
    case SHADER_OPCODE_TXL:
+      if (devinfo->gen >= 9 && op == SHADER_OPCODE_TXL && lod.is_zero()) {
+         op = SHADER_OPCODE_TXL_LZ;
+         break;
+      }
       bld.MOV(sources[length], lod);
       length++;
       break;
@@ -3973,8 +4200,12 @@ lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op,
          length++;
       }
 
-      bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), lod);
-      length++;
+      if (devinfo->gen >= 9 && lod.is_zero()) {
+         op = SHADER_OPCODE_TXF_LZ;
+      } else {
+         bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), lod);
+         length++;
+      }
 
       for (unsigned i = devinfo->gen >= 9 ? 2 : 1; i < coord_components; i++) {
          bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
@@ -3984,6 +4215,7 @@ lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op,
 
       coordinate_done = true;
       break;
+
    case SHADER_OPCODE_TXF_CMS:
    case SHADER_OPCODE_TXF_CMS_W:
    case SHADER_OPCODE_TXF_UMS:
@@ -4182,6 +4414,58 @@ lower_surface_logical_send(const fs_builder &bld, fs_inst *inst, opcode op,
    delete[] components;
 }
 
+static void
+lower_varying_pull_constant_logical_send(const fs_builder &bld, fs_inst *inst)
+{
+   const brw_device_info *devinfo = bld.shader->devinfo;
+
+   if (devinfo->gen >= 7) {
+      inst->opcode = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
+
+   } else {
+      const fs_reg payload(MRF, FIRST_PULL_LOAD_MRF(devinfo->gen),
+                           BRW_REGISTER_TYPE_UD);
+
+      bld.MOV(byte_offset(payload, REG_SIZE), inst->src[1]);
+
+      inst->opcode = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4;
+      inst->resize_sources(1);
+      inst->base_mrf = payload.nr;
+      inst->header_size = 1;
+      inst->mlen = 1 + inst->exec_size / 8;
+   }
+}
+
+static void
+lower_math_logical_send(const fs_builder &bld, fs_inst *inst)
+{
+   assert(bld.shader->devinfo->gen < 6);
+
+   inst->base_mrf = 2;
+   inst->mlen = inst->sources * inst->exec_size / 8;
+
+   if (inst->sources > 1) {
+      /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
+       * "Message Payload":
+       *
+       * "Operand0[7].  For the INT DIV functions, this operand is the
+       *  denominator."
+       *  ...
+       * "Operand1[7].  For the INT DIV functions, this operand is the
+       *  numerator."
+       */
+      const bool is_int_div = inst->opcode != SHADER_OPCODE_POW;
+      const fs_reg src0 = is_int_div ? inst->src[1] : inst->src[0];
+      const fs_reg src1 = is_int_div ? inst->src[0] : inst->src[1];
+
+      inst->resize_sources(1);
+      inst->src[0] = src0;
+
+      assert(inst->exec_size == 8);
+      bld.MOV(fs_reg(MRF, inst->base_mrf + 1, src1.type), src1);
+   }
+}
+
 bool
 fs_visitor::lower_logical_sends()
 {
@@ -4251,6 +4535,10 @@ fs_visitor::lower_logical_sends()
          lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4_OFFSET);
          break;
 
+      case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_SAMPLEINFO);
+         break;
+
       case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
          lower_surface_logical_send(ibld, inst,
                                     SHADER_OPCODE_UNTYPED_SURFACE_READ,
@@ -4287,6 +4575,35 @@ fs_visitor::lower_logical_sends()
                                     ibld.sample_mask_reg());
          break;
 
+      case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
+         lower_varying_pull_constant_logical_send(ibld, inst);
+         break;
+
+      case SHADER_OPCODE_RCP:
+      case SHADER_OPCODE_RSQ:
+      case SHADER_OPCODE_SQRT:
+      case SHADER_OPCODE_EXP2:
+      case SHADER_OPCODE_LOG2:
+      case SHADER_OPCODE_SIN:
+      case SHADER_OPCODE_COS:
+      case SHADER_OPCODE_POW:
+      case SHADER_OPCODE_INT_QUOTIENT:
+      case SHADER_OPCODE_INT_REMAINDER:
+         /* The math opcodes are overloaded for the send-like and
+          * expression-like instructions which seems kind of icky.  Gen6+ has
+          * a native (but rather quirky) MATH instruction so we don't need to
+          * do anything here.  On Gen4-5 we'll have to lower the Gen6-like
+          * logical instructions (which we can easily recognize because they
+          * have mlen = 0) into send-like virtual instructions.
+          */
+         if (devinfo->gen < 6 && inst->mlen == 0) {
+            lower_math_logical_send(ibld, inst);
+            break;
+
+         } else {
+            continue;
+         }
+
       default:
          continue;
       }
@@ -4300,6 +4617,108 @@ fs_visitor::lower_logical_sends()
    return progress;
 }
 
+/**
+ * Get the closest allowed SIMD width for instruction \p inst accounting for
+ * some common regioning and execution control restrictions that apply to FPU
+ * instructions.  These restrictions don't necessarily have any relevance to
+ * instructions not executed by the FPU pipeline like extended math, control
+ * flow or send message instructions.
+ *
+ * For virtual opcodes it's really up to the instruction -- in some cases
+ * (e.g. where a virtual instruction unrolls into a simple sequence of FPU
+ * instructions) it may simplify virtual instruction lowering if we can
+ * enforce FPU-like regioning restrictions already on the virtual instruction,
+ * in other cases (e.g. virtual send-like instructions) this may be
+ * excessively restrictive.
+ */
+static unsigned
+get_fpu_lowered_simd_width(const struct brw_device_info *devinfo,
+                           const fs_inst *inst)
+{
+   /* Maximum execution size representable in the instruction controls. */
+   unsigned max_width = MIN2(32, inst->exec_size);
+
+   /* According to the PRMs:
+    *  "A. In Direct Addressing mode, a source cannot span more than 2
+    *      adjacent GRF registers.
+    *   B. A destination cannot span more than 2 adjacent GRF registers."
+    *
+    * Look for the source or destination with the largest register region
+    * which is the one that is going to limit the overall execution size of
+    * the instruction due to this rule.
+    */
+   unsigned reg_count = inst->regs_written;
+
+   for (unsigned i = 0; i < inst->sources; i++)
+      reg_count = MAX2(reg_count, (unsigned)inst->regs_read(i));
+
+   /* Calculate the maximum execution size of the instruction based on the
+    * factor by which it goes over the hardware limit of 2 GRFs.
+    */
+   if (reg_count > 2)
+      max_width = MIN2(max_width, inst->exec_size / DIV_ROUND_UP(reg_count, 2));
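+   /* E.g. (illustrative): a SIMD16 operation on packed doubles spans
+    * 16 * 8 = 128 bytes = 4 GRFs, so max_width drops to 16 / 2 = 8.
+    */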
+
+   /* According to the IVB PRMs:
+    *  "When destination spans two registers, the source MUST span two
+    *   registers. The exception to the above rule:
+    *
+    *    - When source is scalar, the source registers are not incremented.
+    *    - When source is packed integer Word and destination is packed
+    *      integer DWord, the source register is not incremented but the
+    *      source sub register is incremented."
+    *
+    * The hardware specs from Gen4 to Gen7.5 mention similar regioning
+    * restrictions.  The code below intentionally doesn't check whether the
+    * destination type is integer because empirically the hardware doesn't
+    * seem to care what the actual type is as long as it's dword-aligned.
+    */
+   if (devinfo->gen < 8) {
+      for (unsigned i = 0; i < inst->sources; i++) {
+         if (inst->regs_written == 2 &&
+             inst->regs_read(i) != 0 && inst->regs_read(i) != 2 &&
+             !is_uniform(inst->src[i]) &&
+             !(type_sz(inst->dst.type) == 4 && inst->dst.stride == 1 &&
+               type_sz(inst->src[i].type) == 2 && inst->src[i].stride == 1))
+            max_width = MIN2(max_width, inst->exec_size /
+                             inst->regs_written);
+      }
+   }
+
+   /* From the IVB PRMs:
+    *  "When an instruction is SIMD32, the low 16 bits of the execution mask
+    *   are applied for both halves of the SIMD32 instruction. If different
+    *   execution mask channels are required, split the instruction into two
+    *   SIMD16 instructions."
+    *
+    * There is similar text in the HSW PRMs.  Gen4-6 don't even implement
+    * 32-wide control flow support in hardware and will behave similarly.
+    */
+   if (devinfo->gen < 8 && !inst->force_writemask_all)
+      max_width = MIN2(max_width, 16);
+
+   /* From the IVB PRMs (applies to HSW too):
+    *  "Instructions with condition modifiers must not use SIMD32."
+    *
+    * From the BDW PRMs (applies to later hardware too):
+    *  "Ternary instruction with condition modifiers must not use SIMD32."
+    */
+   if (inst->conditional_mod && (devinfo->gen < 8 || inst->is_3src(devinfo)))
+      max_width = MIN2(max_width, 16);
+
+   /* From the IVB PRMs (applies to other devices that don't have the
+    * brw_device_info::supports_simd16_3src flag set):
+    *  "In Align16 access mode, SIMD16 is not allowed for DW operations and
+    *   SIMD8 is not allowed for DF operations."
+    */
+   if (inst->is_3src(devinfo) && !devinfo->supports_simd16_3src)
+      max_width = MIN2(max_width, inst->exec_size / reg_count);
+
+   /* Only power-of-two execution sizes are representable in the instruction
+    * control fields.
+    */
+   return 1 << _mesa_logbase2(max_width);
+}
+
 /**
  * Get the closest native SIMD width supported by the hardware for instruction
  * \p inst.  The instruction will be left untouched by
@@ -4320,15 +4739,12 @@ get_lowered_simd_width(const struct brw_device_info *devinfo,
    case BRW_OPCODE_SHR:
    case BRW_OPCODE_SHL:
    case BRW_OPCODE_ASR:
-   case BRW_OPCODE_CMP:
    case BRW_OPCODE_CMPN:
    case BRW_OPCODE_CSEL:
    case BRW_OPCODE_F32TO16:
    case BRW_OPCODE_F16TO32:
    case BRW_OPCODE_BFREV:
    case BRW_OPCODE_BFE:
-   case BRW_OPCODE_BFI1:
-   case BRW_OPCODE_BFI2:
    case BRW_OPCODE_ADD:
    case BRW_OPCODE_MUL:
    case BRW_OPCODE_AVG:
@@ -4344,40 +4760,106 @@ get_lowered_simd_width(const struct brw_device_info *devinfo,
    case BRW_OPCODE_SAD2:
    case BRW_OPCODE_MAD:
    case BRW_OPCODE_LRP:
+   case FS_OPCODE_PACK:
+      return get_fpu_lowered_simd_width(devinfo, inst);
+
+   case BRW_OPCODE_CMP: {
+      /* The Ivybridge/BayTrail WaCMPInstFlagDepClearedEarly workaround says that
+       * when the destination is a GRF the dependency-clear bit on the flag
+       * register is cleared early.
+       *
+       * Suggested workarounds are to disable coissuing CMP instructions
+       * or to split CMP(16) instructions into two CMP(8) instructions.
+       *
+       * We choose to split into CMP(8) instructions since disabling
+       * coissuing would affect CMP instructions not otherwise affected by
+       * the errata.
+       */
+      const unsigned max_width = (devinfo->gen == 7 && !devinfo->is_haswell &&
+                                  !inst->dst.is_null() ? 8 : ~0);
+      return MIN2(max_width, get_fpu_lowered_simd_width(devinfo, inst));
+   }
+   case BRW_OPCODE_BFI1:
+   case BRW_OPCODE_BFI2:
+      /* The Haswell WaForceSIMD8ForBFIInstruction workaround says that we
+       * should
+       *  "Force BFI instructions to be executed always in SIMD8."
+       */
+      return MIN2(devinfo->is_haswell ? 8 : ~0u,
+                  get_fpu_lowered_simd_width(devinfo, inst));
+
+   case BRW_OPCODE_IF:
+      assert(inst->src[0].file == BAD_FILE || inst->exec_size <= 16);
+      return inst->exec_size;
+
    case SHADER_OPCODE_RCP:
    case SHADER_OPCODE_RSQ:
    case SHADER_OPCODE_SQRT:
    case SHADER_OPCODE_EXP2:
    case SHADER_OPCODE_LOG2:
+   case SHADER_OPCODE_SIN:
+   case SHADER_OPCODE_COS:
+      /* Unary extended math instructions are limited to SIMD8 on Gen4 and
+       * Gen6.
+       */
+      return (devinfo->gen >= 7 ? MIN2(16, inst->exec_size) :
+              devinfo->gen == 5 || devinfo->is_g4x ? MIN2(16, inst->exec_size) :
+              MIN2(8, inst->exec_size));
+
    case SHADER_OPCODE_POW:
+      /* SIMD16 is only allowed on Gen7+. */
+      return (devinfo->gen >= 7 ? MIN2(16, inst->exec_size) :
+              MIN2(8, inst->exec_size));
+
    case SHADER_OPCODE_INT_QUOTIENT:
    case SHADER_OPCODE_INT_REMAINDER:
-   case SHADER_OPCODE_SIN:
-   case SHADER_OPCODE_COS: {
-      /* According to the PRMs:
-       *  "A. In Direct Addressing mode, a source cannot span more than 2
-       *      adjacent GRF registers.
-       *   B. A destination cannot span more than 2 adjacent GRF registers."
-       *
-       * Look for the source or destination with the largest register region
-       * which is the one that is going to limit the overal execution size of
-       * the instruction due to this rule.
+      /* Integer division is limited to SIMD8 on all generations. */
+      return MIN2(8, inst->exec_size);
+
+   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
+      /* Pre-ILK hardware doesn't have a SIMD8 variant of the texel fetch
+       * message used to implement varying pull constant loads, so expand it
+       * to SIMD16.  An alternative with longer message payload length but
+       * shorter return payload would be to use the SIMD8 sampler message that
+       * takes (header, u, v, r) as parameters instead of (header, u).
        */
-      unsigned reg_count = inst->regs_written;
-
-      for (unsigned i = 0; i < inst->sources; i++)
-         reg_count = MAX2(reg_count, (unsigned)inst->regs_read(i));
+      return (devinfo->gen == 4 ? 16 : MIN2(16, inst->exec_size));
 
-      /* Calculate the maximum execution size of the instruction based on the
-       * factor by which it goes over the hardware limit of 2 GRFs.
+   case FS_OPCODE_DDY_FINE:
+      /* The implementation of this virtual opcode may require emitting
+       * compressed Align16 instructions, which are severely limited on some
+       * generations.
+       *
+       * From the Ivy Bridge PRM, volume 4 part 3, section 3.3.9 (Register
+       * Region Restrictions):
+       *
+       *  "In Align16 access mode, SIMD16 is not allowed for DW operations
+       *   and SIMD8 is not allowed for DF operations."
+       *
+       * In this context, "DW operations" means "operations acting on 32-bit
+       * values", so it includes operations on floats.
+       *
+       * Gen4 has a similar restriction.  From the i965 PRM, section 11.5.3
+       * (Instruction Compression -> Rules and Restrictions):
+       *
+       *  "A compressed instruction must be in Align1 access mode. Align16
+       *   mode instructions cannot be compressed."
+       *
+       * Similar text exists in the g45 PRM.
+       *
+       * Empirically, compressed align16 instructions using odd register
+       * numbers don't appear to work on Sandybridge either.
        */
-      return inst->exec_size / DIV_ROUND_UP(reg_count, 2);
-   }
+      return (devinfo->gen == 4 || devinfo->gen == 6 ||
+              (devinfo->gen == 7 && !devinfo->is_haswell) ?
+              MIN2(8, inst->exec_size) : MIN2(16, inst->exec_size));
+
    case SHADER_OPCODE_MULH:
       /* MULH is lowered to the MUL/MACH sequence using the accumulator, which
        * is 8-wide on Gen7+.
        */
-      return (devinfo->gen >= 7 ? 8 : inst->exec_size);
+      return (devinfo->gen >= 7 ? 8 :
+              get_fpu_lowered_simd_width(devinfo, inst));
 
    case FS_OPCODE_FB_WRITE_LOGICAL:
       /* Gen6 doesn't support SIMD16 depth writes but we cannot handle them
@@ -4390,6 +4872,9 @@ get_lowered_simd_width(const struct brw_device_info *devinfo,
       return (inst->src[FB_WRITE_LOGICAL_SRC_COLOR1].file != BAD_FILE ?
               8 : inst->exec_size);
 
+   case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
+      return MIN2(16, inst->exec_size);
+
    case SHADER_OPCODE_TXD_LOGICAL:
       /* TXD is unsupported in SIMD16 mode. */
       return 8;
@@ -4444,34 +4929,17 @@ get_lowered_simd_width(const struct brw_device_info *devinfo,
    case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
       return 8;
 
+   case SHADER_OPCODE_MOV_INDIRECT:
+      /* Prior to Broadwell, we only have 8 address subregisters */
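+      /* E.g. (illustrative): for a packed 64-bit destination the second term
+       * below is 2 * REG_SIZE / (1 * 8) = 8, so indirect moves of doubles
+       * stay SIMD8 even on Gen8+.
+       */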
+      return MIN3(devinfo->gen >= 8 ? 16 : 8,
+                  2 * REG_SIZE / (inst->dst.stride * type_sz(inst->dst.type)),
+                  inst->exec_size);
+
    default:
       return inst->exec_size;
    }
 }
 
-/**
- * The \p rows array of registers represents a \p num_rows by \p num_columns
- * matrix in row-major order, write it in column-major order into the register
- * passed as destination.  \p stride gives the separation between matrix
- * elements in the input in fs_builder::dispatch_width() units.
- */
-static void
-emit_transpose(const fs_builder &bld,
-               const fs_reg &dst, const fs_reg *rows,
-               unsigned num_rows, unsigned num_columns, unsigned stride)
-{
-   fs_reg *const components = new fs_reg[num_rows * num_columns];
-
-   for (unsigned i = 0; i < num_columns; ++i) {
-      for (unsigned j = 0; j < num_rows; ++j)
-         components[num_rows * i + j] = offset(rows[j], bld, stride * i);
-   }
-
-   bld.LOAD_PAYLOAD(dst, components, num_rows * num_columns, 0);
-
-   delete[] components;
-}
-
 bool
 fs_visitor::lower_simd_width()
 {
@@ -4517,21 +4985,24 @@ fs_visitor::lower_simd_width()
              * instruction.
              */
             const fs_builder lbld = ibld.group(lower_width, i);
+            const fs_builder cbld = lbld.group(copy_width, 0);
 
             for (unsigned j = 0; j < inst->sources; j++) {
                if (inst->src[j].file != BAD_FILE &&
-                   !is_uniform(inst->src[j])) {
+                   !is_periodic(inst->src[j], lower_width)) {
                   /* Get the i-th copy_width-wide chunk of the source. */
-                  const fs_reg src = horiz_offset(inst->src[j], copy_width * i);
+                  const fs_reg src = offset(inst->src[j], cbld, i);
                   const unsigned src_size = inst->components_read(j);
 
-                  /* Use a trivial transposition to copy one every n
-                   * copy_width-wide components of the register into a
-                   * temporary passed as source to the lowered instruction.
+                  /* Copy one every n copy_width-wide components of the
+                   * register into a temporary passed as source to the lowered
+                   * instruction.
                    */
                   split_inst.src[j] = lbld.vgrf(inst->src[j].type, src_size);
-                  emit_transpose(lbld.group(copy_width, 0),
-                                 split_inst.src[j], &src, 1, src_size, n);
+
+                  for (unsigned k = 0; k < src_size; ++k)
+                     cbld.MOV(offset(split_inst.src[j], lbld, k),
+                              offset(src, cbld, n * k));
                }
             }
 
@@ -4542,28 +5013,36 @@ fs_visitor::lower_simd_width()
                split_inst.dst = dsts[i] =
                   lbld.vgrf(inst->dst.type, dst_size);
                split_inst.regs_written =
-                  DIV_ROUND_UP(inst->regs_written * lower_width,
-                               inst->exec_size);
+                  DIV_ROUND_UP(type_sz(inst->dst.type) * dst_size * lower_width,
+                               REG_SIZE);
+
+               if (inst->predicate) {
+                  /* Handle predication by copying the original contents of
+                   * the destination into the temporary before emitting the
+                   * lowered instruction.
+                   */
+                  for (unsigned k = 0; k < dst_size; ++k)
+                     cbld.MOV(offset(split_inst.dst, lbld, k),
+                              offset(inst->dst, cbld, n * k + i));
+               }
             }
 
             lbld.emit(split_inst);
          }
 
          if (inst->regs_written) {
-            /* Distance between useful channels in the temporaries, skipping
-             * garbage if the lowered instruction is wider than the original.
-             */
-            const unsigned m = lower_width / copy_width;
+            const fs_builder lbld = ibld.group(lower_width, 0);
 
             /* Interleave the components of the result from the lowered
-             * instructions.  We need to set exec_all() when copying more than
-             * one half per component, because LOAD_PAYLOAD (in terms of which
-             * emit_transpose is implemented) can only use the same channel
-             * enable signals for all of its non-header sources.
+             * instructions.
              */
-            emit_transpose(ibld.exec_all(inst->exec_size > copy_width)
-                               .group(copy_width, 0),
-                           inst->dst, dsts, n, dst_size, m);
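+            /* copy_width-wide chunk j of component i of the original
+             * destination comes from component i of the j-th lowered
+             * instruction's result.
+             */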
+            for (unsigned i = 0; i < dst_size; ++i) {
+               for (unsigned j = 0; j < n; ++j) {
+                  const fs_builder cbld = ibld.group(copy_width, j);
+                  cbld.MOV(offset(inst->dst, cbld, n * i + j),
+                           offset(dsts[j], lbld, i));
+               }
+            }
          }
 
          inst->remove(block);
@@ -4633,7 +5112,7 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
              inst->flag_subreg);
    }
 
-   fprintf(file, "%s", brw_instruction_name(inst->opcode));
+   fprintf(file, "%s", brw_instruction_name(devinfo, inst->opcode));
    if (inst->saturate)
       fprintf(file, ".sat");
    if (inst->conditional_mod) {
@@ -4726,9 +5205,7 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
          break;
       case UNIFORM:
          fprintf(file, "u%d", inst->src[i].nr + inst->src[i].reg_offset);
-         if (inst->src[i].reladdr) {
-            fprintf(file, "+reladdr");
-         } else if (inst->src[i].subreg_offset) {
+         if (inst->src[i].subreg_offset) {
             fprintf(file, "+%d.%d", inst->src[i].reg_offset,
                     inst->src[i].subreg_offset);
          }
@@ -4741,6 +5218,9 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
          case BRW_REGISTER_TYPE_F:
             fprintf(file, "%-gf", inst->src[i].f);
             break;
+         case BRW_REGISTER_TYPE_DF:
+            fprintf(file, "%fdf", inst->src[i].df);
+            break;
          case BRW_REGISTER_TYPE_W:
          case BRW_REGISTER_TYPE_D:
             fprintf(file, "%dd", inst->src[i].d);
@@ -4839,7 +5319,6 @@ fs_visitor::get_instruction_generating_reg(fs_inst *start,
 {
    if (end == start ||
        end->is_partial_write() ||
-       reg.reladdr ||
        !reg.equals(end->dst)) {
       return NULL;
    } else {
@@ -4852,10 +5331,7 @@ fs_visitor::setup_fs_payload_gen6()
 {
    assert(stage == MESA_SHADER_FRAGMENT);
    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
-   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
 
-   bool uses_depth =
-      (nir->info.inputs_read & (1 << VARYING_SLOT_POS)) != 0;
    unsigned barycentric_interp_modes =
       (stage == MESA_SHADER_FRAGMENT) ?
       ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
@@ -4884,7 +5360,9 @@ fs_visitor::setup_fs_payload_gen6()
    }
 
    /* R27: interpolated depth if uses source depth */
-   if (uses_depth) {
+   prog_data->uses_src_depth =
+      (nir->info.inputs_read & (1 << VARYING_SLOT_POS)) != 0;
+   if (prog_data->uses_src_depth) {
       payload.source_depth_reg = payload.num_regs;
       payload.num_regs++;
       if (dispatch_width == 16) {
@@ -4892,8 +5370,11 @@ fs_visitor::setup_fs_payload_gen6()
          payload.num_regs++;
       }
    }
+
    /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
-   if (uses_depth) {
+   prog_data->uses_src_w =
+      (nir->info.inputs_read & (1 << VARYING_SLOT_POS)) != 0;
+   if (prog_data->uses_src_w) {
       payload.source_w_reg = payload.num_regs;
       payload.num_regs++;
       if (dispatch_width == 16) {
@@ -4902,15 +5383,27 @@ fs_visitor::setup_fs_payload_gen6()
       }
    }
 
-   prog_data->uses_pos_offset = key->compute_pos_offset;
    /* R31: MSAA position offsets. */
-   if (prog_data->uses_pos_offset) {
+   if (prog_data->persample_dispatch &&
+       (nir->info.system_values_read & SYSTEM_BIT_SAMPLE_POS)) {
+      /* From the Ivy Bridge PRM documentation for 3DSTATE_PS:
+       *
+       *    "MSDISPMODE_PERSAMPLE is required in order to select
+       *    POSOFFSET_SAMPLE"
+       *
+       * So we can only really get sample positions if we are doing real
+       * per-sample dispatch.  If we need gl_SamplePosition and we don't have
+       * persample dispatch, we hard-code it to 0.5.
+       */
+      prog_data->uses_pos_offset = true;
       payload.sample_pos_reg = payload.num_regs;
       payload.num_regs++;
    }
 
    /* R32: MSAA input coverage mask */
-   if (nir->info.system_values_read & SYSTEM_BIT_SAMPLE_MASK_IN) {
+   prog_data->uses_sample_mask =
+      (nir->info.system_values_read & SYSTEM_BIT_SAMPLE_MASK_IN) != 0;
+   if (prog_data->uses_sample_mask) {
       assert(devinfo->gen >= 7);
       payload.sample_mask_in_reg = payload.num_regs;
       payload.num_regs++;
@@ -4978,8 +5471,8 @@ fs_visitor::setup_gs_payload()
       payload.num_regs++;
    }
 
-   /* Use a maximum of 32 registers for push-model inputs. */
-   const unsigned max_push_components = 32;
+   /* Use a maximum of 24 registers for push-model inputs. */
+   const unsigned max_push_components = 24;
 
    /* If pushing our inputs would take too many registers, reduce the URB read
     * length (which is in HWords, or 8 registers), and resort to pulling.
@@ -5032,6 +5525,44 @@ fs_visitor::calculate_register_pressure()
    }
 }
 
+/**
+ * Look for repeated FS_OPCODE_MOV_DISPATCH_TO_FLAGS and drop the later ones.
+ *
+ * The needs_unlit_centroid_workaround ends up producing one of these per
+ * channel of centroid input, so it's good to clean them up.
+ *
+ * An assumption here is that nothing ever modifies the dispatched pixels
+ * value that FS_OPCODE_MOV_DISPATCH_TO_FLAGS reads from, but the hardware
+ * dictates that anyway.
+ */
+bool
+fs_visitor::opt_drop_redundant_mov_to_flags()
+{
+   bool flag_mov_found[2] = {false};
+   bool progress = false;
+
+   /* The instructions this pass removes are only ever emitted when this
+    * workaround is needed.
+    */
+   if (!devinfo->needs_unlit_centroid_workaround)
+      return false;
+
+   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
+      if (inst->is_control_flow()) {
+         memset(flag_mov_found, 0, sizeof(flag_mov_found));
+      } else if (inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
+         if (!flag_mov_found[inst->flag_subreg]) {
+            flag_mov_found[inst->flag_subreg] = true;
+         } else {
+            inst->remove(block);
+            progress = true;
+         }
+      } else if (inst->writes_flag()) {
+         flag_mov_found[inst->flag_subreg] = false;
+      }
+   }
+
+   return progress;
+}
+
 void
 fs_visitor::optimize()
 {
@@ -5052,7 +5583,7 @@ fs_visitor::optimize()
    bld = fs_builder(this, 64);
 
    assign_constant_locations();
-   demote_pull_constants();
+   lower_constant_loads();
 
    validate();
 
@@ -5079,7 +5610,7 @@ fs_visitor::optimize()
 
    if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
       char filename[64];
-      snprintf(filename, 64, "%s%d-%s-00-start",
+      snprintf(filename, 64, "%s%d-%s-00-00-start",
                stage_abbrev, dispatch_width, nir->info.name);
 
       backend_shader::dump_instructions(filename);
@@ -5089,6 +5620,8 @@ fs_visitor::optimize()
    int iteration = 0;
    int pass_num = 0;
 
+   OPT(opt_drop_redundant_mov_to_flags);
+
    OPT(lower_simd_width);
    OPT(lower_logical_sends);
 
@@ -5129,9 +5662,26 @@ fs_visitor::optimize()
       OPT(dead_code_eliminate);
    }
 
+   if (OPT(lower_pack)) {
+      OPT(register_coalesce);
+      OPT(dead_code_eliminate);
+   }
+
+   if (OPT(lower_d2x)) {
+      OPT(opt_copy_propagate);
+      OPT(dead_code_eliminate);
+   }
+
    OPT(opt_combine_constants);
    OPT(lower_integer_multiplication);
 
+   if (devinfo->gen <= 5 && OPT(lower_minmax)) {
+      OPT(opt_cmod_propagation);
+      OPT(opt_cse);
+      OPT(opt_copy_propagate);
+      OPT(dead_code_eliminate);
+   }
+
    lower_uniform_pull_constant_loads();
 
    validate();
@@ -5144,16 +5694,22 @@ fs_visitor::optimize()
 void
 fs_visitor::fixup_3src_null_dest()
 {
+   bool progress = false;
+
    foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
-      if (inst->is_3src() && inst->dst.is_null()) {
+      if (inst->is_3src(devinfo) && inst->dst.is_null()) {
          inst->dst = fs_reg(VGRF, alloc.allocate(dispatch_width / 8),
                             inst->dst.type);
+         progress = true;
       }
    }
+
+   if (progress)
+      invalidate_live_intervals();
 }
 
 void
-fs_visitor::allocate_registers()
+fs_visitor::allocate_registers(bool allow_spilling)
 {
    bool allocated_without_spills;
 
@@ -5163,6 +5719,8 @@ fs_visitor::allocate_registers()
       SCHEDULE_PRE_LIFO,
    };
 
+   bool spill_all = allow_spilling && (INTEL_DEBUG & DEBUG_SPILL_FS);
+
    /* Try each scheduling heuristic to see if it can successfully register
     * allocate without spilling.  They should be ordered by decreasing
     * performance but increasing likelihood of allocating.
@@ -5174,7 +5732,7 @@ fs_visitor::allocate_registers()
          assign_regs_trivial();
          allocated_without_spills = true;
       } else {
-         allocated_without_spills = assign_regs(false);
+         allocated_without_spills = assign_regs(false, spill_all);
       }
       if (allocated_without_spills)
          break;
@@ -5185,7 +5743,7 @@ fs_visitor::allocate_registers()
        * SIMD8.  There's probably actually some intermediate point where
        * SIMD16 with a couple of spills is still better.
        */
-      if (dispatch_width == 16) {
+      if (dispatch_width == 16 && min_dispatch_width <= 8) {
          fail("Failure to register allocate.  Reduce number of "
               "live scalar values to avoid this.");
       } else {
@@ -5199,12 +5757,14 @@ fs_visitor::allocate_registers()
       /* Since we're out of heuristics, just go spill registers until we
        * get an allocation.
        */
-      while (!assign_regs(true)) {
+      while (!assign_regs(true, spill_all)) {
          if (failed)
             break;
       }
    }
 
+   assert(last_scratch == 0 || allow_spilling);
+
    /* This must come after all optimization and register allocation, since
     * it inserts dead code that happens to have side effects, and it does
     * so based on the actual physical registers in use.
@@ -5250,7 +5810,89 @@ fs_visitor::run_vs(gl_clip_plane *clip_planes)
    assign_vs_urb_setup();
 
    fixup_3src_null_dest();
-   allocate_registers();
+   allocate_registers(true);
+
+   return !failed;
+}
+
+bool
+fs_visitor::run_tcs_single_patch()
+{
+   assert(stage == MESA_SHADER_TESS_CTRL);
+
+   struct brw_tcs_prog_data *tcs_prog_data =
+      (struct brw_tcs_prog_data *) prog_data;
+
+   /* r1-r4 contain the ICP handles. */
+   payload.num_regs = 5;
+
+   if (shader_time_index >= 0)
+      emit_shader_time_begin();
+
+   /* Initialize gl_InvocationID */
+   fs_reg channels_uw = bld.vgrf(BRW_REGISTER_TYPE_UW);
+   fs_reg channels_ud = bld.vgrf(BRW_REGISTER_TYPE_UD);
+   bld.MOV(channels_uw, fs_reg(brw_imm_uv(0x76543210)));
+   bld.MOV(channels_ud, channels_uw);
+
+   if (tcs_prog_data->instances == 1) {
+      invocation_id = channels_ud;
+   } else {
+      invocation_id = bld.vgrf(BRW_REGISTER_TYPE_UD);
+
+      /* Get instance number from g0.2 bits 23:17, and multiply it by 8. */
+      fs_reg t = bld.vgrf(BRW_REGISTER_TYPE_UD);
+      fs_reg instance_times_8 = bld.vgrf(BRW_REGISTER_TYPE_UD);
+      bld.AND(t, fs_reg(retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD)),
+              brw_imm_ud(INTEL_MASK(23, 17)));
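+      /* Shifting right by 17 would give the raw instance number; shifting
+       * by (17 - 3) instead leaves it multiplied by 8, i.e. one SIMD8
+       * group of invocations per instance.
+       */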
+      bld.SHR(instance_times_8, t, brw_imm_ud(17 - 3));
+
+      bld.ADD(invocation_id, instance_times_8, channels_ud);
+   }
+
+   /* Fix the dispatch mask: if the number of output vertices is not a
+    * multiple of 8, disable the extra invocations by wrapping the shader
+    * body in an IF on gl_InvocationID.
+    */
+   if (nir->info.tcs.vertices_out % 8) {
+      bld.CMP(bld.null_reg_ud(), invocation_id,
+              brw_imm_ud(nir->info.tcs.vertices_out), BRW_CONDITIONAL_L);
+      bld.IF(BRW_PREDICATE_NORMAL);
+   }
+
+   emit_nir_code();
+
+   if (nir->info.tcs.vertices_out % 8) {
+      bld.emit(BRW_OPCODE_ENDIF);
+   }
+
+   /* Emit EOT write; set TR DS Cache bit */
+   fs_reg srcs[3] = {
+      fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
+      fs_reg(brw_imm_ud(WRITEMASK_X << 16)),
+      fs_reg(brw_imm_ud(0)),
+   };
+   fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 3);
+   bld.LOAD_PAYLOAD(payload, srcs, 3, 2);
+
+   fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_SIMD8_MASKED,
+                            bld.null_reg_ud(), payload);
+   inst->mlen = 3;
+   inst->base_mrf = -1;
+   inst->eot = true;
+
+   if (shader_time_index >= 0)
+      emit_shader_time_end();
+
+   if (failed)
+      return false;
+
+   calculate_cfg();
+
+   optimize();
+
+   assign_curb_setup();
+   assign_tcs_single_patch_urb_setup();
+
+   fixup_3src_null_dest();
+   allocate_registers(true);
 
    return !failed;
 }
@@ -5284,7 +5926,7 @@ fs_visitor::run_tes()
    assign_tes_urb_setup();
 
    fixup_3src_null_dest();
-   allocate_registers();
+   allocate_registers(true);
 
    return !failed;
 }
@@ -5333,13 +5975,13 @@ fs_visitor::run_gs()
    assign_gs_urb_setup();
 
    fixup_3src_null_dest();
-   allocate_registers();
+   allocate_registers(true);
 
    return !failed;
 }
 
 bool
-fs_visitor::run_fs(bool do_rep_send)
+fs_visitor::run_fs(bool allow_spilling, bool do_rep_send)
 {
    brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
    brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
@@ -5403,17 +6045,12 @@ fs_visitor::run_fs(bool do_rep_send)
       assign_urb_setup();
 
       fixup_3src_null_dest();
-      allocate_registers();
+      allocate_registers(allow_spilling);
 
       if (failed)
          return false;
    }
 
-   if (dispatch_width == 8)
-      wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
-   else
-      wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
-
    return !failed;
 }
 
@@ -5427,6 +6064,13 @@ fs_visitor::run_cs()
    if (shader_time_index >= 0)
       emit_shader_time_begin();
 
+   if (devinfo->is_haswell && prog_data->total_shared > 0) {
+      /* Move SLM index from g0.0[27:24] to sr0.1[11:8] */
+      const fs_builder abld = bld.exec_all().group(1, 0);
+      abld.MOV(retype(suboffset(brw_sr0_reg(), 1), BRW_REGISTER_TYPE_UW),
+               suboffset(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW), 1));
+   }
+
    emit_nir_code();
 
    if (failed)
@@ -5444,7 +6088,7 @@ fs_visitor::run_cs()
    assign_curb_setup();
 
    fixup_3src_null_dest();
-   allocate_registers();
+   allocate_registers(true);
 
    if (failed)
       return false;
@@ -5517,6 +6161,31 @@ brw_compute_barycentric_interp_modes(const struct brw_device_info *devinfo,
    return barycentric_interp_modes;
 }
 
+static void
+brw_compute_flat_inputs(struct brw_wm_prog_data *prog_data,
+                        bool shade_model_flat, const nir_shader *shader)
+{
+   prog_data->flat_inputs = 0;
+
+   nir_foreach_variable(var, &shader->inputs) {
+      enum glsl_interp_qualifier interp_qualifier =
+         (enum glsl_interp_qualifier)var->data.interpolation;
+      bool is_gl_Color = (var->data.location == VARYING_SLOT_COL0) ||
+                         (var->data.location == VARYING_SLOT_COL1);
+
+      int input_index = prog_data->urb_setup[var->data.location];
+
+      if (input_index < 0)
+         continue;
+
+      /* flat shading */
+      if (interp_qualifier == INTERP_QUALIFIER_FLAT ||
+          (shade_model_flat && is_gl_Color &&
+           interp_qualifier == INTERP_QUALIFIER_NONE))
+         prog_data->flat_inputs |= (1 << input_index);
+   }
+}
+
 static uint8_t
 computed_depth_mode(const nir_shader *shader)
 {
@@ -5544,6 +6213,7 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
                const nir_shader *src_shader,
                struct gl_program *prog,
                int shader_time_index8, int shader_time_index16,
+               bool allow_spilling,
                bool use_rep_send,
                unsigned *final_assembly_size,
                char **error_str)
@@ -5551,66 +6221,109 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
    nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
    shader = brw_nir_apply_sampler_key(shader, compiler->devinfo, &key->tex,
                                       true);
+   brw_nir_lower_fs_inputs(shader);
+   brw_nir_lower_fs_outputs(shader);
    shader = brw_postprocess_nir(shader, compiler->devinfo, true);
 
    /* key->alpha_test_func means simulating alpha testing via discards,
     * so the shader definitely kills pixels.
     */
    prog_data->uses_kill = shader->info.fs.uses_discard || key->alpha_test_func;
-   prog_data->uses_omask =
+   prog_data->uses_omask = key->multisample_fbo &&
       shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK);
    prog_data->computed_depth_mode = computed_depth_mode(shader);
    prog_data->computed_stencil =
       shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL);
 
+   prog_data->persample_dispatch =
+      key->multisample_fbo &&
+      (key->persample_interp ||
+       (shader->info.system_values_read & (SYSTEM_BIT_SAMPLE_ID |
+                                           SYSTEM_BIT_SAMPLE_POS)) ||
+       shader->info.fs.uses_sample_qualifier);
+
    prog_data->early_fragment_tests = shader->info.fs.early_fragment_tests;
 
    prog_data->barycentric_interp_modes =
       brw_compute_barycentric_interp_modes(compiler->devinfo,
                                            key->flat_shade,
-                                           key->persample_shading,
+                                           key->persample_interp,
                                            shader);
 
-   fs_visitor v(compiler, log_data, mem_ctx, key,
-                &prog_data->base, prog, shader, 8,
-                shader_time_index8);
-   if (!v.run_fs(false /* do_rep_send */)) {
+   cfg_t *simd8_cfg = NULL, *simd16_cfg = NULL;
+   uint8_t simd8_grf_start = 0, simd16_grf_start = 0;
+   unsigned simd8_grf_used = 0, simd16_grf_used = 0;
+
+   fs_visitor v8(compiler, log_data, mem_ctx, key,
+                 &prog_data->base, prog, shader, 8,
+                 shader_time_index8);
+   if (!v8.run_fs(allow_spilling, false /* do_rep_send */)) {
       if (error_str)
-         *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
+         *error_str = ralloc_strdup(mem_ctx, v8.fail_msg);
 
       return NULL;
+   } else if (likely(!(INTEL_DEBUG & DEBUG_NO8))) {
+      simd8_cfg = v8.cfg;
+      simd8_grf_start = v8.payload.num_regs;
+      simd8_grf_used = v8.grf_used;
    }
 
-   cfg_t *simd16_cfg = NULL;
-   fs_visitor v2(compiler, log_data, mem_ctx, key,
-                 &prog_data->base, prog, shader, 16,
-                 shader_time_index16);
-   if (likely(!(INTEL_DEBUG & DEBUG_NO16) || use_rep_send)) {
-      if (!v.simd16_unsupported) {
-         /* Try a SIMD16 compile */
-         v2.import_uniforms(&v);
-         if (!v2.run_fs(use_rep_send)) {
-            compiler->shader_perf_log(log_data,
-                                      "SIMD16 shader failed to compile: %s",
-                                      v2.fail_msg);
-         } else {
-            simd16_cfg = v2.cfg;
-         }
+   if (!v8.simd16_unsupported &&
+       likely(!(INTEL_DEBUG & DEBUG_NO16) || use_rep_send)) {
+      /* Try a SIMD16 compile */
+      fs_visitor v16(compiler, log_data, mem_ctx, key,
+                     &prog_data->base, prog, shader, 16,
+                     shader_time_index16);
+      v16.import_uniforms(&v8);
+      if (!v16.run_fs(allow_spilling, use_rep_send)) {
+         compiler->shader_perf_log(log_data,
+                                   "SIMD16 shader failed to compile: %s",
+                                   v16.fail_msg);
+      } else {
+         simd16_cfg = v16.cfg;
+         simd16_grf_start = v16.payload.num_regs;
+         simd16_grf_used = v16.grf_used;
       }
    }
 
-   cfg_t *simd8_cfg;
-   int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || use_rep_send;
-   if ((no_simd8 || compiler->devinfo->gen < 5) && simd16_cfg) {
+   /* When the caller requests a repclear shader, they want SIMD16-only */
+   if (use_rep_send)
       simd8_cfg = NULL;
-      prog_data->no_8 = true;
-   } else {
-      simd8_cfg = v.cfg;
-      prog_data->no_8 = false;
+
+   /* Prior to Iron Lake, the PS had a single shader offset with a jump table
+    * at the top to select the shader.  We've never implemented that.
+    * Instead, we just give them exactly one shader and we pick the widest one
+    * available.
+    */
+   if (compiler->devinfo->gen < 5 && simd16_cfg)
+      simd8_cfg = NULL;
+
+   if (prog_data->persample_dispatch) {
+      /* Starting with SandyBridge (where we first get MSAA), the different
+       * pixel dispatch combinations are grouped into classifications A
+       * through F (SNB PRM Vol. 2 Part 1 Section 7.7.1).  On all hardware
+       * generations, the only configurations supporting persample dispatch
+       * are those in which only one dispatch width is enabled.
+       *
+       * If computed depth is enabled, SNB only allows SIMD8, while IVB+
+       * allows SIMD8 or SIMD16, so we choose SIMD16 if available.
+       */
+      if (compiler->devinfo->gen == 6 &&
+          prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF) {
+         simd16_cfg = NULL;
+      } else if (simd16_cfg) {
+         simd8_cfg = NULL;
+      }
    }
 
+   /* We have to compute the flat inputs after the visitor is finished running
+    * because it relies on prog_data->urb_setup which is computed in
+    * fs_visitor::calculate_urb_setup().
+    */
+   brw_compute_flat_inputs(prog_data, key->flat_shade, shader);
+
    fs_generator g(compiler, log_data, mem_ctx, (void *) key, &prog_data->base,
-                  v.promoted_constants, v.runtime_check_aads_emit,
+                  v8.promoted_constants, v8.runtime_check_aads_emit,
                   MESA_SHADER_FRAGMENT);
 
    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
@@ -5620,10 +6333,24 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
                                      shader->info.name));
    }
 
-   if (simd8_cfg)
+   if (simd8_cfg) {
+      prog_data->dispatch_8 = true;
       g.generate_code(simd8_cfg, 8);
-   if (simd16_cfg)
-      prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
+      prog_data->base.dispatch_grf_start_reg = simd8_grf_start;
+      prog_data->reg_blocks_0 = brw_register_blocks(simd8_grf_used);
+
+      if (simd16_cfg) {
+         prog_data->dispatch_16 = true;
+         prog_data->prog_offset_2 = g.generate_code(simd16_cfg, 16);
+         prog_data->dispatch_grf_start_reg_2 = simd16_grf_start;
+         prog_data->reg_blocks_2 = brw_register_blocks(simd16_grf_used);
+      }
+   } else if (simd16_cfg) {
+      prog_data->dispatch_16 = true;
+      g.generate_code(simd16_cfg, 16);
+      prog_data->base.dispatch_grf_start_reg = simd16_grf_start;
+      prog_data->reg_blocks_0 = brw_register_blocks(simd16_grf_used);
+   }
 
    return g.get_assembly(final_assembly_size);
 }
@@ -5678,6 +6405,8 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
    nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
    shader = brw_nir_apply_sampler_key(shader, compiler->devinfo, &key->tex,
                                       true);
+   brw_nir_lower_cs_shared(shader);
+   prog_data->base.total_shared += shader->num_shared;
    shader = brw_postprocess_nir(shader, compiler->devinfo, true);
 
    prog_data->local_size[0] = shader->info.cs.local_size[0];
@@ -5688,6 +6417,7 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
       shader->info.cs.local_size[2];
 
    unsigned max_cs_threads = compiler->devinfo->max_cs_threads;
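+   /* The smallest SIMD width that still fits the whole work-group in the
+    * available hardware threads: a work-group larger than
+    * 8 * max_cs_threads invocations cannot be dispatched SIMD8.
+    */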
+   unsigned simd_required = DIV_ROUND_UP(local_workgroup_size, max_cs_threads);
 
    cfg_t *cfg = NULL;
    const char *fail_msg = NULL;
@@ -5697,11 +6427,14 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
    fs_visitor v8(compiler, log_data, mem_ctx, key, &prog_data->base,
                  NULL, /* Never used in core profile */
                  shader, 8, shader_time_index);
-   if (!v8.run_cs()) {
-      fail_msg = v8.fail_msg;
-   } else if (local_workgroup_size <= 8 * max_cs_threads) {
-      cfg = v8.cfg;
-      prog_data->simd_size = 8;
+   if (simd_required <= 8) {
+      if (!v8.run_cs()) {
+         fail_msg = v8.fail_msg;
+      } else {
+         cfg = v8.cfg;
+         prog_data->simd_size = 8;
+         prog_data->base.dispatch_grf_start_reg = v8.payload.num_regs;
+      }
    }
 
    fs_visitor v16(compiler, log_data, mem_ctx, key, &prog_data->base,
@@ -5711,7 +6444,8 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
        !fail_msg && !v8.simd16_unsupported &&
        local_workgroup_size <= 16 * max_cs_threads) {
       /* Try a SIMD16 compile */
-      v16.import_uniforms(&v8);
+      if (simd_required <= 8)
+         v16.import_uniforms(&v8);
       if (!v16.run_cs()) {
          compiler->shader_perf_log(log_data,
                                    "SIMD16 shader failed to compile: %s",
@@ -5724,6 +6458,7 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
       } else {
          cfg = v16.cfg;
          prog_data->simd_size = 16;
+         prog_data->dispatch_grf_start_reg_16 = v16.payload.num_regs;
       }
    }