Merge remote-tracking branch 'public/master' into vulkan

[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp

index 72a21587a4fdac1061e4e1ce881d629158b64bce..3f307f4ef70cb38b70b44ac6d4c80a1f5a4cf3f7 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -28,28 +28,17 @@
   * from the LIR.
   */
  
-#include <sys/types.h>
-
-#include "util/hash_table.h"
  #include "main/macros.h"
-#include "main/shaderobj.h"
-#include "main/fbobject.h"
-#include "program/prog_parameter.h"
-#include "program/prog_print.h"
-#include "util/register_allocate.h"
-#include "program/hash_table.h"
  #include "brw_context.h"
  #include "brw_eu.h"
-#include "brw_wm.h"
  #include "brw_fs.h"
  #include "brw_cs.h"
+#include "brw_nir.h"
  #include "brw_vec4_gs_visitor.h"
  #include "brw_cfg.h"
+#include "brw_program.h"
  #include "brw_dead_control_flow.h"
-#include "main/uniforms.h"
-#include "brw_fs_live_variables.h"
-#include "glsl/nir/glsl_types.h"
-#include "program/sampler.h"
+#include "compiler/glsl_types.h"
  
  using namespace brw;
  
@@ -185,8 +174,8 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
      * CSE can later notice that those loads are all the same and eliminate
      * the redundant ones.
      */
-   fs_reg vec4_offset = vgrf(glsl_type::int_type);
-   bld.ADD(vec4_offset, varying_offset, fs_reg(const_offset & ~3));
+   fs_reg vec4_offset = vgrf(glsl_type::uint_type);
+   bld.ADD(vec4_offset, varying_offset, brw_imm_ud(const_offset & ~0xf));
  
     int scale = 1;
     if (devinfo->gen == 4 && bld.dispatch_width() == 8) {
@@ -218,7 +207,7 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
           inst->mlen = 1 + bld.dispatch_width() / 8;
     }
  
-   bld.MOV(dst, offset(vec4_result, bld, (const_offset & 3) * scale));
+   bld.MOV(dst, offset(vec4_result, bld, ((const_offset & 0xf) / 4) * scale));
  }
  
  /**
@@ -299,6 +288,71 @@ fs_inst::is_send_from_grf() const
     }
  }
  
+/**
+ * Returns true if this instruction's sources and destinations cannot
+ * safely be the same register.
+ *
+ * In most cases, a register can be written over safely by the same
+ * instruction that is its last use.  For a single instruction, the
+ * sources are dereferenced before writing of the destination starts
+ * (naturally).
+ *
+ * However, there are a few cases where this can be problematic:
+ *
+ * - Virtual opcodes that translate to multiple instructions in the
+ *   code generator: if src == dst and one instruction writes the
+ *   destination before a later instruction reads the source, then
+ *   src will have been clobbered.
+ *
+ * - SIMD16 compressed instructions with certain regioning (see below).
+ *
+ * The register allocator uses this information to set up conflicts between
+ * GRF sources and the destination.
+ */
+bool
+fs_inst::has_source_and_destination_hazard() const
+{
+   switch (opcode) {
+   case FS_OPCODE_PACK_HALF_2x16_SPLIT:
+      /* Multiple partial writes to the destination */
+      return true;
+   default:
+      /* The SIMD16 compressed instruction
+       *
+       * add(16)      g4<1>F      g4<8,8,1>F   g6<8,8,1>F
+       *
+       * is actually decoded in hardware as:
+       *
+       * add(8)       g4<1>F      g4<8,8,1>F   g6<8,8,1>F
+       * add(8)       g5<1>F      g5<8,8,1>F   g7<8,8,1>F
+       *
+       * Which is safe.  However, if we have uniform accesses
+       * happening, we get into trouble:
+       *
+       * add(8)       g4<1>F      g4<0,1,0>F   g6<8,8,1>F
+       * add(8)       g5<1>F      g4<0,1,0>F   g7<8,8,1>F
+       *
+       * Now our destination for the first instruction overwrote the
+       * second instruction's src0, and we get garbage for those 8
+       * pixels.  There's a similar issue for the pre-gen6
+       * pixel_x/pixel_y, which are registers of 16-bit values and thus
+       * would get stomped by the first decode as well.
+       */
+      if (exec_size == 16) {
+         for (int i = 0; i < sources; i++) {
+            if (src[i].file == VGRF && (src[i].stride == 0 ||
+                                        src[i].type == BRW_REGISTER_TYPE_UW ||
+                                        src[i].type == BRW_REGISTER_TYPE_W ||
+                                        src[i].type == BRW_REGISTER_TYPE_UB ||
+                                        src[i].type == BRW_REGISTER_TYPE_B)) {
+               return true;
+            }
+         }
+      }
+      return false;
+   }
+}
+
  bool
  fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const
  {
@@ -374,60 +428,11 @@ fs_reg::fs_reg()
     this->file = BAD_FILE;
  }
  
-/** Immediate value constructor. */
-fs_reg::fs_reg(float f)
-{
-   init();
-   this->file = IMM;
-   this->type = BRW_REGISTER_TYPE_F;
-   this->stride = 0;
-   this->f = f;
-}
-
-/** Immediate value constructor. */
-fs_reg::fs_reg(int32_t i)
-{
-   init();
-   this->file = IMM;
-   this->type = BRW_REGISTER_TYPE_D;
-   this->stride = 0;
-   this->d = i;
-}
-
-/** Immediate value constructor. */
-fs_reg::fs_reg(uint32_t u)
-{
-   init();
-   this->file = IMM;
-   this->type = BRW_REGISTER_TYPE_UD;
-   this->stride = 0;
-   this->ud = u;
-}
-
-/** Vector float immediate value constructor. */
-fs_reg::fs_reg(uint8_t vf[4])
-{
-   init();
-   this->file = IMM;
-   this->type = BRW_REGISTER_TYPE_VF;
-   memcpy(&this->ud, vf, sizeof(unsigned));
-}
-
-/** Vector float immediate value constructor. */
-fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
-{
-   init();
-   this->file = IMM;
-   this->type = BRW_REGISTER_TYPE_VF;
-   this->ud = (vf0 <<  0) | (vf1 <<  8) | (vf2 << 16) | (vf3 << 24);
-}
-
-fs_reg::fs_reg(struct brw_reg reg) :
+fs_reg::fs_reg(struct ::brw_reg reg) :
     backend_reg(reg)
  {
     this->reg_offset = 0;
     this->subreg_offset = 0;
-   this->reladdr = NULL;
     this->stride = 1;
     if (this->file == IMM &&
         (this->type != BRW_REGISTER_TYPE_V &&
@@ -440,10 +445,8 @@ fs_reg::fs_reg(struct brw_reg reg) :
  bool
  fs_reg::equals(const fs_reg &r) const
  {
-   return (memcmp((brw_reg *)this, (brw_reg *)&r, sizeof(brw_reg)) == 0 &&
-           reg_offset == r.reg_offset &&
+   return (this->backend_reg::equals(r) &&
             subreg_offset == r.subreg_offset &&
-           !reladdr && !r.reladdr &&
             stride == r.stride);
  }
  
@@ -505,6 +508,7 @@ type_size_scalar(const struct glsl_type *type)
     case GLSL_TYPE_ERROR:
     case GLSL_TYPE_INTERFACE:
     case GLSL_TYPE_DOUBLE:
+   case GLSL_TYPE_FUNCTION:
        unreachable("not reached");
     }
  
@@ -590,7 +594,7 @@ fs_visitor::emit_shader_time_end()
     fs_reg reset = shader_end_time;
     reset.set_smear(2);
     set_condmod(BRW_CONDITIONAL_Z,
-               ibld.AND(ibld.null_reg_ud(), reset, fs_reg(1u)));
+               ibld.AND(ibld.null_reg_ud(), reset, brw_imm_ud(1u)));
     ibld.IF(BRW_PREDICATE_NORMAL);
  
     fs_reg start = shader_start_time;
@@ -605,11 +609,11 @@ fs_visitor::emit_shader_time_end()
      * is 2 cycles.  Remove that overhead, so I can forget about that when
      * trying to determine the time taken for single instructions.
      */
-   cbld.ADD(diff, diff, fs_reg(-2u));
+   cbld.ADD(diff, diff, brw_imm_ud(-2u));
     SHADER_TIME_ADD(cbld, 0, diff);
-   SHADER_TIME_ADD(cbld, 1, fs_reg(1u));
+   SHADER_TIME_ADD(cbld, 1, brw_imm_ud(1u));
     ibld.emit(BRW_OPCODE_ELSE);
-   SHADER_TIME_ADD(cbld, 2, fs_reg(1u));
+   SHADER_TIME_ADD(cbld, 2, brw_imm_ud(1u));
     ibld.emit(BRW_OPCODE_ENDIF);
  }
  
@@ -619,7 +623,7 @@ fs_visitor::SHADER_TIME_ADD(const fs_builder &bld,
                              fs_reg value)
  {
     int index = shader_time_index * 3 + shader_time_subindex;
-   fs_reg offset = fs_reg(index * SHADER_TIME_STRIDE);
+   struct brw_reg offset = brw_imm_d(index * SHADER_TIME_STRIDE);
  
     fs_reg payload;
     if (dispatch_width == 8)
@@ -734,18 +738,20 @@ fs_inst::components_read(unsigned i) const
     case SHADER_OPCODE_LOD_LOGICAL:
     case SHADER_OPCODE_TG4_LOGICAL:
     case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
-      assert(src[8].file == IMM && src[9].file == IMM);
+      assert(src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM &&
+             src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM);
        /* Texture coordinates. */
-      if (i == 0)
-         return src[8].ud;
+      if (i == TEX_LOGICAL_SRC_COORDINATE)
+         return src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
        /* Texture derivatives. */
-      else if ((i == 2 || i == 3) && opcode == SHADER_OPCODE_TXD_LOGICAL)
-         return src[9].ud;
+      else if ((i == TEX_LOGICAL_SRC_LOD || i == TEX_LOGICAL_SRC_LOD2) &&
+               opcode == SHADER_OPCODE_TXD_LOGICAL)
+         return src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
        /* Texture offset. */
-      else if (i == 7)
+      else if (i == TEX_LOGICAL_SRC_OFFSET_VALUE)
           return 2;
        /* MCS */
-      else if (i == 5 && opcode == SHADER_OPCODE_TXF_CMS_W_LOGICAL)
+      else if (i == TEX_LOGICAL_SRC_MCS && opcode == SHADER_OPCODE_TXF_CMS_W_LOGICAL)
           return 2;
        else
           return 1;
@@ -845,7 +851,10 @@ fs_inst::regs_read(int arg) const
           assert(src[2].file == IMM);
           unsigned region_length = src[2].ud;
  
-         if (src[0].file == FIXED_GRF) {
+         if (src[0].file == UNIFORM) {
+            assert(region_length % 4 == 0);
+            return region_length / 4;
+         } else if (src[0].file == FIXED_GRF) {
              /* If the start of the region is not register aligned, then
               * there's some portion of the register that's technically
               * unread at the beginning.
@@ -859,7 +868,7 @@ fs_inst::regs_read(int arg) const
               * unread portion at the beginning.
               */
              if (src[0].subnr)
-               region_length += src[0].subnr * type_sz(src[0].type);
+               region_length += src[0].subnr;
  
              return DIV_ROUND_UP(region_length, REG_SIZE);
           } else {
@@ -1015,7 +1024,6 @@ fs_visitor::import_uniforms(fs_visitor *v)
     this->push_constant_loc = v->push_constant_loc;
     this->pull_constant_loc = v->pull_constant_loc;
     this->uniforms = v->uniforms;
-   this->param_size = v->param_size;
  }
  
  fs_reg *
@@ -1032,7 +1040,7 @@ fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
     if (pixel_center_integer) {
        bld.MOV(wpos, this->pixel_x);
     } else {
-      bld.ADD(wpos, this->pixel_x, fs_reg(0.5f));
+      bld.ADD(wpos, this->pixel_x, brw_imm_f(0.5f));
     }
     wpos = offset(wpos, bld, 1);
  
@@ -1048,7 +1056,7 @@ fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
          offset += key->drawable_height - 1.0f;
        }
  
-      bld.ADD(wpos, pixel_y, fs_reg(offset));
+      bld.ADD(wpos, pixel_y, brw_imm_f(offset));
     }
     wpos = offset(wpos, bld, 1);
  
@@ -1103,33 +1111,19 @@ fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
  }
  
  void
-fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
+fs_visitor::emit_general_interpolation(fs_reg *attr, const char *name,
                                         const glsl_type *type,
                                         glsl_interp_qualifier interpolation_mode,
-                                       int location, bool mod_centroid,
+                                       int *location, bool mod_centroid,
                                         bool mod_sample)
  {
-   attr.type = brw_type_for_base_type(type->get_scalar_type());
-
     assert(stage == MESA_SHADER_FRAGMENT);
     brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
     brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
  
-   unsigned int array_elements;
-
-   if (type->is_array()) {
-      array_elements = type->arrays_of_arrays_size();
-      if (array_elements == 0) {
-         fail("dereferenced array '%s' has length 0\n", name);
-      }
-      type = type->without_array();
-   } else {
-      array_elements = 1;
-   }
-
     if (interpolation_mode == INTERP_QUALIFIER_NONE) {
        bool is_gl_Color =
-         location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
+         *location == VARYING_SLOT_COL0 || *location == VARYING_SLOT_COL1;
        if (key->flat_shade && is_gl_Color) {
           interpolation_mode = INTERP_QUALIFIER_FLAT;
        } else {
@@ -1137,71 +1131,86 @@ fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
        }
     }
  
-   for (unsigned int i = 0; i < array_elements; i++) {
-      for (unsigned int j = 0; j < type->matrix_columns; j++) {
-        if (prog_data->urb_setup[location] == -1) {
-           /* If there's no incoming setup data for this slot, don't
-            * emit interpolation for it.
-            */
-           attr = offset(attr, bld, type->vector_elements);
-           location++;
-           continue;
-        }
+   if (type->is_array() || type->is_matrix()) {
+      const glsl_type *elem_type = glsl_get_array_element(type);
+      const unsigned length = glsl_get_length(type);
  
-        if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
-           /* Constant interpolation (flat shading) case. The SF has
-            * handed us defined values in only the constant offset
-            * field of the setup reg.
-            */
-           for (unsigned int k = 0; k < type->vector_elements; k++) {
-              struct brw_reg interp = interp_reg(location, k);
-              interp = suboffset(interp, 3);
-               interp.type = attr.type;
-               bld.emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
-              attr = offset(attr, bld, 1);
-           }
-        } else {
-           /* Smooth/noperspective interpolation case. */
-           for (unsigned int k = 0; k < type->vector_elements; k++) {
-               struct brw_reg interp = interp_reg(location, k);
-               if (devinfo->needs_unlit_centroid_workaround && mod_centroid) {
-                  /* Get the pixel/sample mask into f0 so that we know
-                   * which pixels are lit.  Then, for each channel that is
-                   * unlit, replace the centroid data with non-centroid
-                   * data.
-                   */
-                  bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
-
-                  fs_inst *inst;
-                  inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
-                                      false, false);
-                  inst->predicate = BRW_PREDICATE_NORMAL;
-                  inst->predicate_inverse = true;
-                  if (devinfo->has_pln)
-                     inst->no_dd_clear = true;
-
-                  inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
-                                      mod_centroid && !key->persample_shading,
-                                      mod_sample || key->persample_shading);
-                  inst->predicate = BRW_PREDICATE_NORMAL;
-                  inst->predicate_inverse = false;
-                  if (devinfo->has_pln)
-                     inst->no_dd_check = true;
+      for (unsigned i = 0; i < length; i++) {
+         emit_general_interpolation(attr, name, elem_type, interpolation_mode,
+                                    location, mod_centroid, mod_sample);
+      }
+   } else if (type->is_record()) {
+      for (unsigned i = 0; i < type->length; i++) {
+         const glsl_type *field_type = type->fields.structure[i].type;
+         emit_general_interpolation(attr, name, field_type, interpolation_mode,
+                                    location, mod_centroid, mod_sample);
+      }
+   } else {
+      assert(type->is_scalar() || type->is_vector());
  
-               } else {
-                  emit_linterp(attr, fs_reg(interp), interpolation_mode,
-                               mod_centroid && !key->persample_shading,
-                               mod_sample || key->persample_shading);
-               }
-               if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
-                  bld.MUL(attr, attr, this->pixel_w);
-               }
-              attr = offset(attr, bld, 1);
-           }
+      if (prog_data->urb_setup[*location] == -1) {
+         /* If there's no incoming setup data for this slot, don't
+          * emit interpolation for it.
+          */
+         *attr = offset(*attr, bld, type->vector_elements);
+         (*location)++;
+         return;
+      }
  
-        }
-        location++;
+      attr->type = brw_type_for_base_type(type->get_scalar_type());
+
+      if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
+         /* Constant interpolation (flat shading) case. The SF has
+          * handed us defined values in only the constant offset
+          * field of the setup reg.
+          */
+         for (unsigned int i = 0; i < type->vector_elements; i++) {
+            struct brw_reg interp = interp_reg(*location, i);
+            interp = suboffset(interp, 3);
+            interp.type = attr->type;
+            bld.emit(FS_OPCODE_CINTERP, *attr, fs_reg(interp));
+            *attr = offset(*attr, bld, 1);
+         }
+      } else {
+         /* Smooth/noperspective interpolation case. */
+         for (unsigned int i = 0; i < type->vector_elements; i++) {
+            struct brw_reg interp = interp_reg(*location, i);
+            if (devinfo->needs_unlit_centroid_workaround && mod_centroid) {
+               /* Get the pixel/sample mask into f0 so that we know
+                * which pixels are lit.  Then, for each channel that is
+                * unlit, replace the centroid data with non-centroid
+                * data.
+                */
+               bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
+
+               fs_inst *inst;
+               inst = emit_linterp(*attr, fs_reg(interp), interpolation_mode,
+                                   false, false);
+               inst->predicate = BRW_PREDICATE_NORMAL;
+               inst->predicate_inverse = true;
+               if (devinfo->has_pln)
+                  inst->no_dd_clear = true;
+
+               inst = emit_linterp(*attr, fs_reg(interp), interpolation_mode,
+                                   mod_centroid && !key->persample_shading,
+                                   mod_sample || key->persample_shading);
+               inst->predicate = BRW_PREDICATE_NORMAL;
+               inst->predicate_inverse = false;
+               if (devinfo->has_pln)
+                  inst->no_dd_check = true;
+
+            } else {
+               emit_linterp(*attr, fs_reg(interp), interpolation_mode,
+                            mod_centroid && !key->persample_shading,
+                            mod_sample || key->persample_shading);
+            }
+            if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
+               bld.MUL(*attr, *attr, this->pixel_w);
+            }
+            *attr = offset(*attr, bld, 1);
+         }
        }
+      (*location)++;
     }
  }
  
@@ -1225,7 +1234,7 @@ fs_visitor::emit_frontfacing_interpolation()
        fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
        g0.negate = true;
  
-      bld.ASR(*reg, g0, fs_reg(15));
+      bld.ASR(*reg, g0, brw_imm_d(15));
     } else {
        /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
         * a boolean result from this (1/true or 0/false).
@@ -1240,7 +1249,7 @@ fs_visitor::emit_frontfacing_interpolation()
        fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
        g1_6.negate = true;
  
-      bld.ASR(*reg, g1_6, fs_reg(31));
+      bld.ASR(*reg, g1_6, brw_imm_d(31));
     }
  
     return reg;
@@ -1257,7 +1266,7 @@ fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
        /* Convert int_sample_pos to floating point */
        bld.MOV(dst, int_sample_pos);
        /* Scale to the range [0, 1] */
-      bld.MUL(dst, dst, fs_reg(1 / 16.0f));
+      bld.MUL(dst, dst, brw_imm_f(1 / 16.0f));
     }
     else {
        /* From ARB_sample_shading specification:
@@ -1265,7 +1274,7 @@ fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
         *  rasterization is disabled, gl_SamplePosition will always be
         *  (0.5, 0.5).
         */
-      bld.MOV(dst, fs_reg(0.5f));
+      bld.MOV(dst, brw_imm_f(0.5f));
     }
  }
  
@@ -1360,8 +1369,8 @@ fs_visitor::emit_sampleid_setup()
  
        abld.exec_all().group(1, 0)
            .AND(t1, fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
-               fs_reg(sspi_mask));
-      abld.exec_all().group(1, 0).SHR(t1, t1, fs_reg(5));
+               brw_imm_ud(sspi_mask));
+      abld.exec_all().group(1, 0).SHR(t1, t1, brw_imm_d(5));
  
        /* This works for both SIMD8 and SIMD16 */
        abld.exec_all().group(4, 0)
@@ -1376,7 +1385,7 @@ fs_visitor::emit_sampleid_setup()
         * "When rendering to a non-multisample buffer, or if multisample
         *  rasterization is disabled, gl_SampleID will always be zero."
         */
-      abld.MOV(*reg, fs_reg(0));
+      abld.MOV(*reg, brw_imm_d(0));
     }
  
     return reg;
@@ -1664,9 +1673,6 @@ fs_visitor::assign_vs_urb_setup()
     brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
  
     assert(stage == MESA_SHADER_VERTEX);
-   int count = _mesa_bitcount_64(vs_prog_data->inputs_read);
-   if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
-      count++;
  
     /* Each attribute is 4 regs. */
     this->first_non_payload_grf += 4 * vs_prog_data->nr_attributes;
@@ -1679,6 +1685,21 @@ fs_visitor::assign_vs_urb_setup()
     }
  }
  
+void
+fs_visitor::assign_tes_urb_setup()
+{
+   assert(stage == MESA_SHADER_TESS_EVAL);
+
+   brw_vue_prog_data *vue_prog_data = (brw_vue_prog_data *) prog_data;
+
+   first_non_payload_grf += 8 * vue_prog_data->urb_read_length;
+
+   /* Rewrite all ATTR file references to HW_REGs. */
+   foreach_block_and_inst(block, fs_inst, inst, cfg) {
+      convert_attr_sources_to_hw_regs(inst);
+   }
+}
+
  void
  fs_visitor::assign_gs_urb_setup()
  {
@@ -1905,31 +1926,30 @@ fs_visitor::compact_virtual_grfs()
   * maximum number of fragment shader uniform components (64).  If
   * there are too many of these, they'd fill up all of register space.
   * So, this will push some of them out to the pull constant buffer and
- * update the program to load them.  We also use pull constants for all
- * indirect constant loads because we don't support indirect accesses in
- * registers yet.
+ * update the program to load them.
   */
  void
  fs_visitor::assign_constant_locations()
  {
-   /* Only the first compile (SIMD8 mode) gets to decide on locations. */
-   if (dispatch_width != 8)
+   /* Only the first compile gets to decide on locations. */
+   if (dispatch_width != min_dispatch_width)
        return;
  
-   unsigned int num_pull_constants = 0;
-
-   pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
-   memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
-
     bool is_live[uniforms];
     memset(is_live, 0, sizeof(is_live));
  
+   /* For each uniform slot, a value of true indicates that the given slot and
+    * the next slot must remain contiguous.  This is used to keep us from
+    * splitting arrays apart.
+    */
+   bool contiguous[uniforms];
+   memset(contiguous, 0, sizeof(contiguous));
+
     /* First, we walk through the instructions and do two things:
      *
      *  1) Figure out which uniforms are live.
      *
-    *  2) Find all indirect access of uniform arrays and flag them as needing
-    *     to go into the pull constant buffer.
+    *  2) Mark any indirectly used ranges of registers as contiguous.
      *
      * Note that we don't move constant-indexed accesses to arrays.  No
      * testing has been done of the performance impact of this choice.
@@ -1939,20 +1959,19 @@ fs_visitor::assign_constant_locations()
           if (inst->src[i].file != UNIFORM)
              continue;
  
-         if (inst->src[i].reladdr) {
-            int uniform = inst->src[i].nr;
+         int constant_nr = inst->src[i].nr + inst->src[i].reg_offset;
  
-            /* If this array isn't already present in the pull constant buffer,
-             * add it.
-             */
-            if (pull_constant_loc[uniform] == -1) {
-               assert(param_size[uniform]);
-               for (int j = 0; j < param_size[uniform]; j++)
-                  pull_constant_loc[uniform + j] = num_pull_constants++;
+         if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) {
+            assert(inst->src[2].ud % 4 == 0);
+            unsigned last = constant_nr + (inst->src[2].ud / 4) - 1;
+            assert(last < uniforms);
+
+            for (unsigned j = constant_nr; j < last; j++) {
+               is_live[j] = true;
+               contiguous[j] = true;
              }
+            is_live[last] = true;
           } else {
-            /* Mark the the one accessed uniform as live */
-            int constant_nr = inst->src[i].nr + inst->src[i].reg_offset;
              if (constant_nr >= 0 && constant_nr < (int) uniforms)
                 is_live[constant_nr] = true;
           }
@@ -1967,29 +1986,48 @@ fs_visitor::assign_constant_locations()
      * If changing this value, note the limitation about total_regs in
      * brw_curbe.c.
      */
-   unsigned int max_push_components = 16 * 8;
+   const unsigned int max_push_components = 16 * 8;
+
+   /* For Vulkan we don't limit the max_chunk_size. We set it to 32 floats =
+    * 128 bytes, which is the maximum Vulkan push constant size.
+    */
+   const unsigned int max_chunk_size = 32;
+
     unsigned int num_push_constants = 0;
+   unsigned int num_pull_constants = 0;
  
     push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
+   pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
  
-   for (unsigned int i = 0; i < uniforms; i++) {
-      if (!is_live[i] || pull_constant_loc[i] != -1) {
-         /* This UNIFORM register is either dead, or has already been demoted
-          * to a pull const.  Mark it as no longer living in the param[] array.
-          */
-         push_constant_loc[i] = -1;
+   int chunk_start = -1;
+   for (unsigned u = 0; u < uniforms; u++) {
+      push_constant_loc[u] = -1;
+      pull_constant_loc[u] = -1;
+
+      if (!is_live[u])
           continue;
-      }
  
-      if (num_push_constants < max_push_components) {
-         /* Retain as a push constant.  Record the location in the params[]
-          * array.
-          */
-         push_constant_loc[i] = num_push_constants++;
-      } else {
-         /* Demote to a pull constant. */
-         push_constant_loc[i] = -1;
-         pull_constant_loc[i] = num_pull_constants++;
+      /* This is the first live uniform in the chunk */
+      if (chunk_start < 0)
+         chunk_start = u;
+
+      /* If this element does not need to be contiguous with the next, we
+       * split at this point and everything between chunk_start and u forms a
+       * single chunk.
+       */
+      if (!contiguous[u]) {
+         unsigned chunk_size = u - chunk_start + 1;
+
+         if (num_push_constants + chunk_size <= max_push_components &&
+             chunk_size <= max_chunk_size) {
+            for (unsigned j = chunk_start; j <= u; j++)
+               push_constant_loc[j] = num_push_constants++;
+         } else {
+            for (unsigned j = chunk_start; j <= u; j++)
+               pull_constant_loc[j] = num_pull_constants++;
+         }
+
+         chunk_start = -1;
        }
     }
  
@@ -2020,51 +2058,67 @@ fs_visitor::assign_constant_locations()
   * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
   */
  void
-fs_visitor::demote_pull_constants()
+fs_visitor::lower_constant_loads()
  {
-   foreach_block_and_inst (block, fs_inst, inst, cfg) {
+   const unsigned index = stage_prog_data->binding_table.pull_constants_start;
+
+   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
+      /* Set up the annotation tracking for new generated instructions. */
+      const fs_builder ibld(this, block, inst);
+
        for (int i = 0; i < inst->sources; i++) {
          if (inst->src[i].file != UNIFORM)
             continue;
  
-         int pull_index;
+         /* We'll handle this case later */
+         if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0)
+            continue;
+
           unsigned location = inst->src[i].nr + inst->src[i].reg_offset;
-         if (location >= uniforms) /* Out of bounds access */
-            pull_index = -1;
-         else
-            pull_index = pull_constant_loc[location];
+         if (location >= uniforms)
+            continue; /* Out of bounds access */
+
+         int pull_index = pull_constant_loc[location];
  
           if (pull_index == -1)
             continue;
  
-         /* Set up the annotation tracking for new generated instructions. */
-         const fs_builder ibld(this, block, inst);
-         const unsigned index = stage_prog_data->binding_table.pull_constants_start;
-         fs_reg dst = vgrf(glsl_type::float_type);
-
           assert(inst->src[i].stride == 0);
  
-         /* Generate a pull load into dst. */
-         if (inst->src[i].reladdr) {
-            VARYING_PULL_CONSTANT_LOAD(ibld, dst,
-                                       fs_reg(index),
-                                       *inst->src[i].reladdr,
-                                       pull_index);
-            inst->src[i].reladdr = NULL;
-            inst->src[i].stride = 1;
-         } else {
-            const fs_builder ubld = ibld.exec_all().group(8, 0);
-            fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
-            ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
-                      dst, fs_reg(index), offset);
-            inst->src[i].set_smear(pull_index & 3);
-         }
-         brw_mark_surface_used(prog_data, index);
+         fs_reg dst = vgrf(glsl_type::float_type);
+         const fs_builder ubld = ibld.exec_all().group(8, 0);
+         struct brw_reg offset = brw_imm_ud((unsigned)(pull_index * 4) & ~15);
+         ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
+                   dst, brw_imm_ud(index), offset);
  
           /* Rewrite the instruction to use the temporary VGRF. */
           inst->src[i].file = VGRF;
           inst->src[i].nr = dst.nr;
           inst->src[i].reg_offset = 0;
+         inst->src[i].set_smear(pull_index & 3);
+
+         brw_mark_surface_used(prog_data, index);
+      }
+
+      if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT &&
+          inst->src[0].file == UNIFORM) {
+
+         unsigned location = inst->src[0].nr + inst->src[0].reg_offset;
+         if (location >= uniforms)
+            continue; /* Out of bounds access */
+
+         int pull_index = pull_constant_loc[location];
+
+         if (pull_index == -1)
+           continue;
+
+         VARYING_PULL_CONSTANT_LOAD(ibld, inst->dst,
+                                    brw_imm_ud(index),
+                                    inst->src[1],
+                                    pull_index * 4);
+         inst->remove(block);
+
+         brw_mark_surface_used(prog_data, index);
        }
     }
     invalidate_live_intervals();
@@ -2085,7 +2139,8 @@ fs_visitor::opt_algebraic()
              if (inst->dst.type != inst->src[0].type)
                 assert(!"unimplemented: saturate mixed types");
  
-            if (brw_saturate_immediate(inst->dst.type, &inst->src[0])) {
+            if (brw_saturate_immediate(inst->dst.type,
+                                       &inst->src[0].as_brw_reg())) {
                 inst->saturate = false;
                 progress = true;
              }
@@ -2252,17 +2307,6 @@ fs_visitor::opt_algebraic()
              progress = true;
           }
           break;
-      case SHADER_OPCODE_RCP: {
-         fs_inst *prev = (fs_inst *)inst->prev;
-         if (prev->opcode == SHADER_OPCODE_SQRT) {
-            if (inst->src[0].equals(prev->dst)) {
-               inst->opcode = SHADER_OPCODE_RSQ;
-               inst->src[0] = prev->src[0];
-               progress = true;
-            }
-         }
-         break;
-      }
        case SHADER_OPCODE_BROADCAST:
           if (is_uniform(inst->src[0])) {
              inst->opcode = BRW_OPCODE_MOV;
@@ -2419,8 +2463,10 @@ fs_visitor::opt_sampler_eot()
      * we have enough space, but it will make sure the dead code eliminator kills
      * the instruction that this will replace.
      */
-   if (tex_inst->header_size != 0)
+   if (tex_inst->header_size != 0) {
+      invalidate_live_intervals();
        return true;
+   }
  
     fs_reg send_header = ibld.vgrf(BRW_REGISTER_TYPE_F,
                                    load_payload->sources + 1);
@@ -2451,6 +2497,7 @@ fs_visitor::opt_sampler_eot()
     tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload);
     tex_inst->src[0] = send_header;
  
+   invalidate_live_intervals();
     return true;
  }
  
@@ -2748,7 +2795,7 @@ fs_visitor::eliminate_find_live_channel()
        case SHADER_OPCODE_FIND_LIVE_CHANNEL:
           if (depth == 0) {
              inst->opcode = BRW_OPCODE_MOV;
-            inst->src[0] = fs_reg(0u);
+            inst->src[0] = brw_imm_ud(0u);
              inst->sources = 1;
              inst->force_writemask_all = true;
              progress = true;
@@ -2773,10 +2820,21 @@ fs_visitor::emit_repclear_shader()
     brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
     int base_mrf = 1;
     int color_mrf = base_mrf + 2;
+   fs_inst *mov;
  
-   fs_inst *mov = bld.exec_all().group(4, 0)
-                     .MOV(brw_message_reg(color_mrf),
-                          fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F));
+   if (uniforms > 0) {
+      mov = bld.exec_all().group(4, 0)
+               .MOV(brw_message_reg(color_mrf),
+                    fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F));
+   } else {
+      struct brw_reg reg =
+         brw_reg(BRW_GENERAL_REGISTER_FILE, 2, 3, 0, 0, BRW_REGISTER_TYPE_F,
+                 BRW_VERTICAL_STRIDE_8, BRW_WIDTH_2, BRW_HORIZONTAL_STRIDE_4,
+                 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
+
+      mov = bld.exec_all().group(4, 0)
+               .MOV(vec4(brw_message_reg(color_mrf)), fs_reg(reg));
+   }
  
     fs_inst *write;
     if (key->nr_color_regions == 1) {
@@ -2805,8 +2863,10 @@ fs_visitor::emit_repclear_shader()
     assign_curb_setup();
  
     /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
-   assert(mov->src[0].file == FIXED_GRF);
-   mov->src[0] = brw_vec4_grf(mov->src[0].nr, 0);
+   if (uniforms > 0) {
+      assert(mov->src[0].file == FIXED_GRF);
+      mov->src[0] = brw_vec4_grf(mov->src[0].nr, 0);
+   }
  }
  
  /**
@@ -3085,13 +3145,11 @@ fs_visitor::lower_uniform_pull_constant_loads()
           continue;
  
        if (devinfo->gen >= 7) {
-         /* The offset arg before was a vec4-aligned byte offset.  We need to
-          * turn it into a dword offset.
-          */
+         /* The offset arg is a vec4-aligned immediate byte offset. */
           fs_reg const_offset_reg = inst->src[1];
           assert(const_offset_reg.file == IMM &&
                  const_offset_reg.type == BRW_REGISTER_TYPE_UD);
-         const_offset_reg.ud /= 4;
+         assert(const_offset_reg.ud % 16 == 0);
  
           fs_reg payload, offset;
           if (devinfo->gen >= 9) {
@@ -3416,8 +3474,7 @@ fs_visitor::lower_integer_multiplication()
               */
              assert(mul->src[1].type == BRW_REGISTER_TYPE_D ||
                     mul->src[1].type == BRW_REGISTER_TYPE_UD);
-            mul->src[1].type = (type_is_signed(mul->src[1].type) ?
-                                BRW_REGISTER_TYPE_W : BRW_REGISTER_TYPE_UW);
+            mul->src[1].type = BRW_REGISTER_TYPE_UW;
              mul->src[1].stride *= 2;
  
           } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
@@ -3457,6 +3514,36 @@ fs_visitor::lower_integer_multiplication()
     return progress;
  }
  
+/**
+ * Rewrite every unpredicated SEL instruction as a CMP followed by a
+ * predicated SEL.
+ *
+ * For each instruction whose opcode is BRW_OPCODE_SEL and whose predicate is
+ * BRW_PREDICATE_NONE, a CMP of the SEL's two sources is emitted with the
+ * SEL's conditional modifier and a null destination.  The SEL is then
+ * switched to BRW_PREDICATE_NORMAL and its conditional modifier is cleared.
+ *
+ * Asserts that devinfo->gen < 6.  Live intervals are invalidated if any
+ * instruction was rewritten.
+ *
+ * \return true if at least one instruction was changed, false otherwise.
+ */
+bool
+fs_visitor::lower_minmax()
+{
+   assert(devinfo->gen < 6);
+
+   bool progress = false;
+
+   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
+      /* Builder tied to the current block and instruction; presumably it
+       * inserts new instructions right before inst -- confirm against
+       * fs_builder.
+       */
+      const fs_builder ibld(this, block, inst);
+
+      if (inst->opcode == BRW_OPCODE_SEL &&
+          inst->predicate == BRW_PREDICATE_NONE) {
+         /* FIXME: Using CMP doesn't preserve the NaN propagation semantics of
+          *        the original SEL.L/GE instruction
+          */
+         ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1],
+                  inst->conditional_mod);
+         /* The comparison now lives in the CMP, so the SEL only needs to be
+          * predicated and no longer carries a conditional modifier.
+          */
+         inst->predicate = BRW_PREDICATE_NORMAL;
+         inst->conditional_mod = BRW_CONDITIONAL_NONE;
+
+         progress = true;
+      }
+   }
+
+   if (progress)
+      invalidate_live_intervals();
+
+   return progress;
+}
+
  static void
  setup_color_payload(const fs_builder &bld, const brw_wm_prog_key *key,
                      fs_reg *dst, fs_reg color, unsigned components)
@@ -3635,6 +3722,7 @@ lower_sampler_logical_send_gen4(const fs_builder &bld, fs_inst *inst, opcode op,
                                  const fs_reg &coordinate,
                                  const fs_reg &shadow_c,
                                  const fs_reg &lod, const fs_reg &lod2,
+                                const fs_reg &surface,
                                  const fs_reg &sampler,
                                  unsigned coord_components,
                                  unsigned grad_components)
@@ -3660,7 +3748,7 @@ lower_sampler_logical_send_gen4(const fs_builder &bld, fs_inst *inst, opcode op,
         (has_lod || shadow_c.file != BAD_FILE ||
          (op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8))) {
        for (unsigned i = coord_components; i < 3; i++)
-         bld.MOV(offset(msg_end, bld, i), fs_reg(0.0f));
+         bld.MOV(offset(msg_end, bld, i), brw_imm_f(0.0f));
  
        msg_end = offset(msg_end, bld, 3 - coord_components);
     }
@@ -3717,7 +3805,7 @@ lower_sampler_logical_send_gen4(const fs_builder &bld, fs_inst *inst, opcode op,
           /* There's no plain shadow compare message, so we use shadow
            * compare with a bias of 0.0.
            */
-         bld.MOV(msg_end, fs_reg(0.0f));
+         bld.MOV(msg_end, brw_imm_f(0.0f));
           msg_end = offset(msg_end, bld, 1);
        }
  
@@ -3727,8 +3815,9 @@ lower_sampler_logical_send_gen4(const fs_builder &bld, fs_inst *inst, opcode op,
  
     inst->opcode = op;
     inst->src[0] = reg_undef;
-   inst->src[1] = sampler;
-   inst->resize_sources(2);
+   inst->src[1] = surface;
+   inst->src[2] = sampler;
+   inst->resize_sources(3);
     inst->base_mrf = msg_begin.nr;
     inst->mlen = msg_end.nr - msg_begin.nr;
     inst->header_size = 1;
@@ -3740,6 +3829,7 @@ lower_sampler_logical_send_gen5(const fs_builder &bld, fs_inst *inst, opcode op,
                                  const fs_reg &shadow_c,
                                  fs_reg lod, fs_reg lod2,
                                  const fs_reg &sample_index,
+                                const fs_reg &surface,
                                  const fs_reg &sampler,
                                  const fs_reg &offset_value,
                                  unsigned coord_components,
@@ -3811,7 +3901,7 @@ lower_sampler_logical_send_gen5(const fs_builder &bld, fs_inst *inst, opcode op,
     case SHADER_OPCODE_TXF_CMS:
        msg_lod = offset(msg_coords, bld, 3);
        /* lod */
-      bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), fs_reg(0u));
+      bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u));
        /* sample index */
        bld.MOV(retype(offset(msg_lod, bld, 1), BRW_REGISTER_TYPE_UD), sample_index);
        msg_end = offset(msg_lod, bld, 2);
@@ -3822,8 +3912,9 @@ lower_sampler_logical_send_gen5(const fs_builder &bld, fs_inst *inst, opcode op,
  
     inst->opcode = op;
     inst->src[0] = reg_undef;
-   inst->src[1] = sampler;
-   inst->resize_sources(2);
+   inst->src[1] = surface;
+   inst->src[2] = sampler;
+   inst->resize_sources(3);
     inst->base_mrf = message.nr;
     inst->mlen = msg_end.nr - message.nr;
     inst->header_size = header_size;
@@ -3847,7 +3938,9 @@ lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op,
                                  const fs_reg &shadow_c,
                                  fs_reg lod, fs_reg lod2,
                                  const fs_reg &sample_index,
-                                const fs_reg &mcs, const fs_reg &sampler,
+                                const fs_reg &mcs,
+                                const fs_reg &surface,
+                                const fs_reg &sampler,
                                  fs_reg offset_value,
                                  unsigned coord_components,
                                  unsigned grad_components)
@@ -3891,7 +3984,7 @@ lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op,
     if (bld.shader->stage != MESA_SHADER_FRAGMENT &&
         op == SHADER_OPCODE_TEX) {
        op = SHADER_OPCODE_TXL;
-      lod = fs_reg(0.0f);
+      lod = brw_imm_f(0.0f);
     }
  
     /* Set up the LOD info */
@@ -4050,8 +4143,9 @@ lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op,
     /* Generate the SEND. */
     inst->opcode = op;
     inst->src[0] = src_payload;
-   inst->src[1] = sampler;
-   inst->resize_sources(2);
+   inst->src[1] = surface;
+   inst->src[2] = sampler;
+   inst->resize_sources(3);
     inst->base_mrf = -1;
     inst->mlen = mlen;
     inst->header_size = header_size;
@@ -4064,31 +4158,34 @@ static void
  lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op)
  {
     const brw_device_info *devinfo = bld.shader->devinfo;
-   const fs_reg &coordinate = inst->src[0];
-   const fs_reg &shadow_c = inst->src[1];
-   const fs_reg &lod = inst->src[2];
-   const fs_reg &lod2 = inst->src[3];
-   const fs_reg &sample_index = inst->src[4];
-   const fs_reg &mcs = inst->src[5];
-   const fs_reg &sampler = inst->src[6];
-   const fs_reg &offset_value = inst->src[7];
-   assert(inst->src[8].file == IMM && inst->src[9].file == IMM);
-   const unsigned coord_components = inst->src[8].ud;
-   const unsigned grad_components = inst->src[9].ud;
+   const fs_reg &coordinate = inst->src[TEX_LOGICAL_SRC_COORDINATE];
+   const fs_reg &shadow_c = inst->src[TEX_LOGICAL_SRC_SHADOW_C];
+   const fs_reg &lod = inst->src[TEX_LOGICAL_SRC_LOD];
+   const fs_reg &lod2 = inst->src[TEX_LOGICAL_SRC_LOD2];
+   const fs_reg &sample_index = inst->src[TEX_LOGICAL_SRC_SAMPLE_INDEX];
+   const fs_reg &mcs = inst->src[TEX_LOGICAL_SRC_MCS];
+   const fs_reg &surface = inst->src[TEX_LOGICAL_SRC_SURFACE];
+   const fs_reg &sampler = inst->src[TEX_LOGICAL_SRC_SAMPLER];
+   const fs_reg &offset_value = inst->src[TEX_LOGICAL_SRC_OFFSET_VALUE];
+   assert(inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM);
+   const unsigned coord_components = inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
+   assert(inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM);
+   const unsigned grad_components = inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
  
     if (devinfo->gen >= 7) {
        lower_sampler_logical_send_gen7(bld, inst, op, coordinate,
                                        shadow_c, lod, lod2, sample_index,
-                                      mcs, sampler, offset_value,
+                                      mcs, surface, sampler, offset_value,
                                        coord_components, grad_components);
     } else if (devinfo->gen >= 5) {
        lower_sampler_logical_send_gen5(bld, inst, op, coordinate,
                                        shadow_c, lod, lod2, sample_index,
-                                      sampler, offset_value,
+                                      surface, sampler, offset_value,
                                        coord_components, grad_components);
     } else {
        lower_sampler_logical_send_gen4(bld, inst, op, coordinate,
-                                      shadow_c, lod, lod2, sampler,
+                                      shadow_c, lod, lod2,
+                                      surface, sampler,
                                        coord_components, grad_components);
     }
  }
@@ -4102,7 +4199,7 @@ emit_surface_header(const fs_builder &bld, const fs_reg &sample_mask)
  {
     fs_builder ubld = bld.exec_all().group(8, 0);
     const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD);
-   ubld.MOV(dst, fs_reg(0));
+   ubld.MOV(dst, brw_imm_d(0));
     ubld.MOV(component(dst, 7), sample_mask);
     return dst;
  }
@@ -4244,7 +4341,7 @@ fs_visitor::lower_logical_sends()
        case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
           lower_surface_logical_send(ibld, inst,
                                      SHADER_OPCODE_TYPED_SURFACE_READ,
-                                    fs_reg(0xffff));
+                                    brw_imm_d(0xffff));
           break;
  
        case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
@@ -4368,7 +4465,7 @@ get_lowered_simd_width(const struct brw_device_info *devinfo,
  
     case SHADER_OPCODE_TG4_OFFSET_LOGICAL: {
        /* gather4_po_c is unsupported in SIMD16 mode. */
-      const fs_reg &shadow_c = inst->src[1];
+      const fs_reg &shadow_c = inst->src[TEX_LOGICAL_SRC_SHADOW_C];
        return (shadow_c.file != BAD_FILE ? 8 : inst->exec_size);
     }
     case SHADER_OPCODE_TXL_LOGICAL:
@@ -4377,7 +4474,7 @@ get_lowered_simd_width(const struct brw_device_info *devinfo,
         * Gen4-6 can't support TXL and TXB with shadow comparison in SIMD16
         * mode because the message exceeds the maximum length of 11.
         */
-      const fs_reg &shadow_c = inst->src[1];
+      const fs_reg &shadow_c = inst->src[TEX_LOGICAL_SRC_SHADOW_C];
        if (devinfo->gen == 4 && shadow_c.file == BAD_FILE)
           return 16;
        else if (devinfo->gen < 7 && shadow_c.file != BAD_FILE)
@@ -4400,7 +4497,8 @@ get_lowered_simd_width(const struct brw_device_info *devinfo,
         * circumstances it can end up with a message that is too long in SIMD16
         * mode.
         */
-      const unsigned coord_components = inst->src[8].ud;
+      const unsigned coord_components =
+         inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
        /* First three arguments are the sample index and the two arguments for
         * the MCS data.
         */
@@ -4415,6 +4513,10 @@ get_lowered_simd_width(const struct brw_device_info *devinfo,
     case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
        return 8;
  
+   case SHADER_OPCODE_MOV_INDIRECT:
+      /* Prior to Broadwell, we only have 8 address subregisters */
+      return devinfo->gen < 8 ? 8 : inst->exec_size;
+
     default:
        return inst->exec_size;
     }
@@ -4669,6 +4771,8 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
     case IMM:
        unreachable("not reached");
     }
+   if (inst->dst.stride != 1)
+      fprintf(file, "<%u>", inst->dst.stride);
     fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
  
     for (int i = 0; i < inst->sources; i++) {
@@ -4695,9 +4799,7 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
           break;
        case UNIFORM:
           fprintf(file, "u%d", inst->src[i].nr + inst->src[i].reg_offset);
-         if (inst->src[i].reladdr) {
-            fprintf(file, "+reladdr");
-         } else if (inst->src[i].subreg_offset) {
+         if (inst->src[i].subreg_offset) {
              fprintf(file, "+%d.%d", inst->src[i].reg_offset,
                      inst->src[i].subreg_offset);
           }
@@ -4708,7 +4810,7 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
        case IMM:
           switch (inst->src[i].type) {
           case BRW_REGISTER_TYPE_F:
-            fprintf(file, "%ff", inst->src[i].f);
+            fprintf(file, "%-gf", inst->src[i].f);
              break;
           case BRW_REGISTER_TYPE_W:
           case BRW_REGISTER_TYPE_D:
@@ -4756,6 +4858,16 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
           fprintf(file, "|");
  
        if (inst->src[i].file != IMM) {
+         unsigned stride;
+         if (inst->src[i].file == ARF || inst->src[i].file == FIXED_GRF) {
+            unsigned hstride = inst->src[i].hstride;
+            stride = (hstride == 0 ? 0 : (1 << (hstride - 1)));
+         } else {
+            stride = inst->src[i].stride;
+         }
+         if (stride != 1)
+            fprintf(file, "<%u>", stride);
+
           fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
        }
  
@@ -4798,7 +4910,6 @@ fs_visitor::get_instruction_generating_reg(fs_inst *start,
  {
     if (end == start ||
         end->is_partial_write() ||
-       reg.reladdr ||
         !reg.equals(end->dst)) {
        return NULL;
     } else {
@@ -4807,10 +4918,12 @@ fs_visitor::get_instruction_generating_reg(fs_inst *start,
  }
  
  void
-fs_visitor::setup_payload_gen6()
+fs_visitor::setup_fs_payload_gen6()
  {
-   bool uses_depth =
-      (nir->info.inputs_read & (1 << VARYING_SLOT_POS)) != 0;
+   assert(stage == MESA_SHADER_FRAGMENT);
+   brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
+   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
+
     unsigned barycentric_interp_modes =
        (stage == MESA_SHADER_FRAGMENT) ?
        ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
@@ -4839,7 +4952,9 @@ fs_visitor::setup_payload_gen6()
     }
  
     /* R27: interpolated depth if uses source depth */
-   if (uses_depth) {
+   prog_data->uses_src_depth =
+      (nir->info.inputs_read & (1 << VARYING_SLOT_POS)) != 0;
+   if (prog_data->uses_src_depth) {
        payload.source_depth_reg = payload.num_regs;
        payload.num_regs++;
        if (dispatch_width == 16) {
@@ -4847,8 +4962,11 @@ fs_visitor::setup_payload_gen6()
           payload.num_regs++;
        }
     }
+
     /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
-   if (uses_depth) {
+   prog_data->uses_src_w =
+      (nir->info.inputs_read & (1 << VARYING_SLOT_POS)) != 0;
+   if (prog_data->uses_src_w) {
        payload.source_w_reg = payload.num_regs;
        payload.num_regs++;
        if (dispatch_width == 16) {
@@ -4857,19 +4975,17 @@ fs_visitor::setup_payload_gen6()
        }
     }
  
-   if (stage == MESA_SHADER_FRAGMENT) {
-      brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
-      brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
-      prog_data->uses_pos_offset = key->compute_pos_offset;
-      /* R31: MSAA position offsets. */
-      if (prog_data->uses_pos_offset) {
-         payload.sample_pos_reg = payload.num_regs;
-         payload.num_regs++;
-      }
+   prog_data->uses_pos_offset = key->compute_pos_offset;
+   /* R31: MSAA position offsets. */
+   if (prog_data->uses_pos_offset) {
+      payload.sample_pos_reg = payload.num_regs;
+      payload.num_regs++;
     }
  
     /* R32: MSAA input coverage mask */
-   if (nir->info.system_values_read & SYSTEM_BIT_SAMPLE_MASK_IN) {
+   prog_data->uses_sample_mask =
+      (nir->info.system_values_read & SYSTEM_BIT_SAMPLE_MASK_IN) != 0;
+   if (prog_data->uses_sample_mask) {
        assert(devinfo->gen >= 7);
        payload.sample_mask_in_reg = payload.num_regs;
        payload.num_regs++;
@@ -5011,7 +5127,7 @@ fs_visitor::optimize()
     bld = fs_builder(this, 64);
  
     assign_constant_locations();
-   demote_pull_constants();
+   lower_constant_loads();
  
     validate();
  
@@ -5038,7 +5154,7 @@ fs_visitor::optimize()
  
     if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
        char filename[64];
-      snprintf(filename, 64, "%s%d-%s-00-start",
+      snprintf(filename, 64, "%s%d-%s-00-00-start",
                 stage_abbrev, dispatch_width, nir->info.name);
  
        backend_shader::dump_instructions(filename);
@@ -5091,6 +5207,13 @@ fs_visitor::optimize()
     OPT(opt_combine_constants);
     OPT(lower_integer_multiplication);
  
+   if (devinfo->gen <= 5 && OPT(lower_minmax)) {
+      OPT(opt_cmod_propagation);
+      OPT(opt_cse);
+      OPT(opt_copy_propagate);
+      OPT(dead_code_eliminate);
+   }
+
     lower_uniform_pull_constant_loads();
  
     validate();
@@ -5103,12 +5226,18 @@ fs_visitor::optimize()
  void
  fs_visitor::fixup_3src_null_dest()
  {
+   /* Give every three-source instruction whose destination is null a newly
+    * allocated VGRF destination of size dispatch_width / 8, keeping the
+    * original destination type.
+    */
+   bool progress = false;
+
     foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
        if (inst->is_3src() && inst->dst.is_null()) {
           inst->dst = fs_reg(VGRF, alloc.allocate(dispatch_width / 8),
                              inst->dst.type);
+         progress = true;
        }
     }
+
+   /* Only invalidate live intervals when a destination was rewritten. */
+   if (progress)
+      invalidate_live_intervals();
  }
  
  void
@@ -5144,7 +5273,7 @@ fs_visitor::allocate_registers()
         * SIMD8.  There's probably actually some intermediate point where
         * SIMD16 with a couple of spills is still better.
         */
-      if (dispatch_width == 16) {
+      if (dispatch_width == 16 && min_dispatch_width <= 8) {
           fail("Failure to register allocate.  Reduce number of "
                "live scalar values to avoid this.");
        } else {
@@ -5214,6 +5343,40 @@ fs_visitor::run_vs(gl_clip_plane *clip_planes)
     return !failed;
  }
  
+/**
+ * Run the visitor for a tessellation evaluation shader.
+ *
+ * Asserts that stage is MESA_SHADER_TESS_EVAL, fixes the payload size at
+ * five registers, then calls, in order: emit_nir_code(), emit_urb_writes(),
+ * calculate_cfg(), optimize(), assign_curb_setup(), assign_tes_urb_setup(),
+ * fixup_3src_null_dest() and allocate_registers().  When shader_time_index
+ * is non-negative the emitted code is bracketed by emit_shader_time_begin()
+ * and emit_shader_time_end().
+ *
+ * \return false if the `failed` flag is set after emit_nir_code() or at the
+ *         end of the sequence, true otherwise.
+ */
+bool
+fs_visitor::run_tes()
+{
+   assert(stage == MESA_SHADER_TESS_EVAL);
+
+   /* R0: thread header, R1-3: gl_TessCoord.xyz, R4: URB handles */
+   payload.num_regs = 5;
+
+   if (shader_time_index >= 0)
+      emit_shader_time_begin();
+
+   emit_nir_code();
+
+   /* Bail out before emitting URB writes if code emission already failed. */
+   if (failed)
+      return false;
+
+   emit_urb_writes();
+
+   if (shader_time_index >= 0)
+      emit_shader_time_end();
+
+   calculate_cfg();
+
+   optimize();
+
+   assign_curb_setup();
+   assign_tes_urb_setup();
+
+   fixup_3src_null_dest();
+   allocate_registers();
+
+   return !failed;
+}
+
  bool
  fs_visitor::run_gs()
  {
@@ -5233,7 +5396,7 @@ fs_visitor::run_gs()
         */
        if (gs_compile->control_data_header_size_bits <= 32) {
           const fs_builder abld = bld.annotate("initialize control data bits");
-         abld.MOV(this->control_data_bits, fs_reg(0u));
+         abld.MOV(this->control_data_bits, brw_imm_ud(0u));
        }
     }
  
@@ -5272,9 +5435,9 @@ fs_visitor::run_fs(bool do_rep_send)
     assert(stage == MESA_SHADER_FRAGMENT);
  
     if (devinfo->gen >= 6)
-      setup_payload_gen6();
+      setup_fs_payload_gen6();
     else
-      setup_payload_gen4();
+      setup_fs_payload_gen4();
  
     if (0) {
        emit_dummy_fs();
@@ -5352,6 +5515,13 @@ fs_visitor::run_cs()
     if (shader_time_index >= 0)
        emit_shader_time_begin();
  
+   if (devinfo->is_haswell && prog_data->total_shared > 0) {
+      /* Move SLM index from g0.0[27:24] to sr0.1[11:8] */
+      const fs_builder abld = bld.exec_all().group(1, 0);
+      abld.MOV(retype(suboffset(brw_sr0_reg(), 1), BRW_REGISTER_TYPE_UW),
+               suboffset(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW), 1));
+   }
+
     emit_nir_code();
  
     if (failed)
@@ -5442,6 +5612,31 @@ brw_compute_barycentric_interp_modes(const struct brw_device_info *devinfo,
     return barycentric_interp_modes;
  }
  
+/**
+ * Fill in prog_data->flat_inputs, a bitmask indexed by the values stored in
+ * prog_data->urb_setup.
+ *
+ * The mask is reset to zero and then, for every input variable of \p shader
+ * whose urb_setup entry is non-negative, the corresponding bit is set when
+ * either:
+ *  - the variable's interpolation qualifier is INTERP_QUALIFIER_FLAT, or
+ *  - \p shade_model_flat is true, the variable is located at
+ *    VARYING_SLOT_COL0 or VARYING_SLOT_COL1, and its interpolation
+ *    qualifier is INTERP_QUALIFIER_NONE.
+ *
+ * \param prog_data         program data; urb_setup is read, flat_inputs is
+ *                          written.
+ * \param shade_model_flat  whether flat shading applies to the color inputs
+ *                          that have no explicit qualifier.
+ * \param shader            NIR shader whose input variables are walked.
+ */
+static void
+brw_compute_flat_inputs(struct brw_wm_prog_data *prog_data,
+                        bool shade_model_flat, const nir_shader *shader)
+{
+   prog_data->flat_inputs = 0;
+
+   nir_foreach_variable(var, &shader->inputs) {
+      enum glsl_interp_qualifier interp_qualifier =
+         (enum glsl_interp_qualifier)var->data.interpolation;
+      bool is_gl_Color = (var->data.location == VARYING_SLOT_COL0) ||
+                         (var->data.location == VARYING_SLOT_COL1);
+
+      int input_index = prog_data->urb_setup[var->data.location];
+
+      /* A negative urb_setup entry gets no bit in the mask. */
+      if (input_index < 0)
+        continue;
+
+      /* flat shading */
+      if (interp_qualifier == INTERP_QUALIFIER_FLAT ||
+          (shade_model_flat && is_gl_Color &&
+           interp_qualifier == INTERP_QUALIFIER_NONE))
+         prog_data->flat_inputs |= (1 << input_index);
+   }
+}
+
  static uint8_t
  computed_depth_mode(const nir_shader *shader)
  {
@@ -5466,13 +5661,20 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
                 void *mem_ctx,
                 const struct brw_wm_prog_key *key,
                 struct brw_wm_prog_data *prog_data,
-               const nir_shader *shader,
+               const nir_shader *src_shader,
                 struct gl_program *prog,
                 int shader_time_index8, int shader_time_index16,
                 bool use_rep_send,
                 unsigned *final_assembly_size,
                 char **error_str)
  {
+   nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
+   shader = brw_nir_apply_sampler_key(shader, compiler->devinfo, &key->tex,
+                                      true);
+   brw_nir_lower_fs_inputs(shader);
+   brw_nir_lower_fs_outputs(shader);
+   shader = brw_postprocess_nir(shader, compiler->devinfo, true);
+
     /* key->alpha_test_func means simulating alpha testing via discards,
      * so the shader definitely kills pixels.
      */
@@ -5519,6 +5721,12 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
        }
     }
  
+   /* We have to compute the flat inputs after the visitor is finished running
+    * because it relies on prog_data->urb_setup which is computed in
+    * fs_visitor::calculate_urb_setup().
+    */
+   brw_compute_flat_inputs(prog_data, key->flat_shade, shader);
+
     cfg_t *simd8_cfg;
     int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || use_rep_send;
     if ((no_simd8 || compiler->devinfo->gen < 5) && simd16_cfg) {
@@ -5530,7 +5738,8 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
     }
  
     fs_generator g(compiler, log_data, mem_ctx, (void *) key, &prog_data->base,
-                  v.promoted_constants, v.runtime_check_aads_emit, "FS");
+                  v.promoted_constants, v.runtime_check_aads_emit,
+                  MESA_SHADER_FRAGMENT);
  
     if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
        g.enable_debug(ralloc_asprintf(mem_ctx, "%s fragment shader %s",
@@ -5547,42 +5756,6 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
     return g.get_assembly(final_assembly_size);
  }
  
-void
-brw_cs_fill_local_id_payload(const struct brw_cs_prog_data *prog_data,
-                             void *buffer, uint32_t threads, uint32_t stride)
-{
-   if (prog_data->local_invocation_id_regs == 0)
-      return;
-
-   /* 'stride' should be an integer number of registers, that is, a multiple
-    * of 32 bytes.
-    */
-   assert(stride % 32 == 0);
-
-   unsigned x = 0, y = 0, z = 0;
-   for (unsigned t = 0; t < threads; t++) {
-      uint32_t *param = (uint32_t *) buffer + stride * t / 4;
-
-      for (unsigned i = 0; i < prog_data->simd_size; i++) {
-         param[0 * prog_data->simd_size + i] = x;
-         param[1 * prog_data->simd_size + i] = y;
-         param[2 * prog_data->simd_size + i] = z;
-
-         x++;
-         if (x == prog_data->local_size[0]) {
-            x = 0;
-            y++;
-            if (y == prog_data->local_size[1]) {
-               y = 0;
-               z++;
-               if (z == prog_data->local_size[2])
-                  z = 0;
-            }
-         }
-      }
-   }
-}
-
  fs_reg *
  fs_visitor::emit_cs_local_invocation_id_setup()
  {
@@ -5625,11 +5798,18 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
                 void *mem_ctx,
                 const struct brw_cs_prog_key *key,
                 struct brw_cs_prog_data *prog_data,
-               const nir_shader *shader,
+               const nir_shader *src_shader,
                 int shader_time_index,
                 unsigned *final_assembly_size,
                 char **error_str)
  {
+   nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
+   shader = brw_nir_apply_sampler_key(shader, compiler->devinfo, &key->tex,
+                                      true);
+   brw_nir_lower_cs_shared(shader);
+   prog_data->base.total_shared += shader->num_shared;
+   shader = brw_postprocess_nir(shader, compiler->devinfo, true);
+
     prog_data->local_size[0] = shader->info.cs.local_size[0];
     prog_data->local_size[1] = shader->info.cs.local_size[1];
     prog_data->local_size[2] = shader->info.cs.local_size[2];
@@ -5638,6 +5818,7 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
        shader->info.cs.local_size[2];
  
     unsigned max_cs_threads = compiler->devinfo->max_cs_threads;
+   unsigned simd_required = DIV_ROUND_UP(local_workgroup_size, max_cs_threads);
  
     cfg_t *cfg = NULL;
     const char *fail_msg = NULL;
@@ -5647,11 +5828,13 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
     fs_visitor v8(compiler, log_data, mem_ctx, key, &prog_data->base,
                   NULL, /* Never used in core profile */
                   shader, 8, shader_time_index);
-   if (!v8.run_cs()) {
-      fail_msg = v8.fail_msg;
-   } else if (local_workgroup_size <= 8 * max_cs_threads) {
-      cfg = v8.cfg;
-      prog_data->simd_size = 8;
+   if (simd_required <= 8) {
+      if (!v8.run_cs()) {
+         fail_msg = v8.fail_msg;
+      } else {
+         cfg = v8.cfg;
+         prog_data->simd_size = 8;
+      }
     }
  
     fs_visitor v16(compiler, log_data, mem_ctx, key, &prog_data->base,
@@ -5661,7 +5844,8 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
         !fail_msg && !v8.simd16_unsupported &&
         local_workgroup_size <= 16 * max_cs_threads) {
        /* Try a SIMD16 compile */
-      v16.import_uniforms(&v8);
+      if (simd_required <= 8)
+         v16.import_uniforms(&v8);
        if (!v16.run_cs()) {
           compiler->shader_perf_log(log_data,
                                     "SIMD16 shader failed to compile: %s",
@@ -5686,7 +5870,8 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
     }
  
     fs_generator g(compiler, log_data, mem_ctx, (void*) key, &prog_data->base,
-                  v8.promoted_constants, v8.runtime_check_aads_emit, "CS");
+                  v8.promoted_constants, v8.runtime_check_aads_emit,
+                  MESA_SHADER_COMPUTE);
     if (INTEL_DEBUG & DEBUG_CS) {
        char *name = ralloc_asprintf(mem_ctx, "%s compute shader %s",
                                     shader->info.label ? shader->info.label :
@@ -5699,3 +5884,39 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
  
     return g.get_assembly(final_assembly_size);
  }
+
+/**
+ * Write the per-thread local invocation ID values into \p buffer.
+ *
+ * Returns immediately when prog_data->local_invocation_id_regs is zero.
+ * Otherwise, for each of the \p threads threads, three consecutive arrays
+ * of prog_data->simd_size 32-bit values are written (x, then y, then z),
+ * starting \p stride bytes after the previous thread's data.
+ *
+ * The x/y/z counters run continuously across threads: x is incremented for
+ * every channel and wraps at local_size[0], carrying into y, which wraps at
+ * local_size[1] and carries into z, which wraps at local_size[2].
+ *
+ * \param prog_data  supplies local_invocation_id_regs, simd_size and
+ *                   local_size[].
+ * \param buffer     destination memory, written as uint32_t values.
+ * \param threads    number of per-thread records to write.
+ * \param stride     distance in bytes between consecutive thread records;
+ *                   asserted to be a multiple of 32.
+ */
+void
+brw_cs_fill_local_id_payload(const struct brw_cs_prog_data *prog_data,
+                             void *buffer, uint32_t threads, uint32_t stride)
+{
+   if (prog_data->local_invocation_id_regs == 0)
+      return;
+
+   /* 'stride' should be an integer number of registers, that is, a multiple
+    * of 32 bytes.
+    */
+   assert(stride % 32 == 0);
+
+   unsigned x = 0, y = 0, z = 0;
+   for (unsigned t = 0; t < threads; t++) {
+      /* stride is in bytes; dividing by 4 converts it to uint32_t units. */
+      uint32_t *param = (uint32_t *) buffer + stride * t / 4;
+
+      for (unsigned i = 0; i < prog_data->simd_size; i++) {
+         param[0 * prog_data->simd_size + i] = x;
+         param[1 * prog_data->simd_size + i] = y;
+         param[2 * prog_data->simd_size + i] = z;
+
+         /* Advance to the next invocation ID, x varying fastest. */
+         x++;
+         if (x == prog_data->local_size[0]) {
+            x = 0;
+            y++;
+            if (y == prog_data->local_size[1]) {
+               y = 0;
+               z++;
+               if (z == prog_data->local_size[2])
+                  z = 0;
+            }
+         }
+      }
+   }
+}