i965: Avoid int64 induced warnings

[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp

index f80425a5b00f273cbcbb48ca698a619f8ca0fc08..f376618b3cdd2d121a510f9a84b20cbf13277758 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
@@ -23,10 +23,8 @@
  
  #include "brw_vec4.h"
  #include "brw_cfg.h"
-#include "glsl/ir_uniform.h"
-#include "program/sampler.h"
-
-#define FIRST_SPILL_MRF(gen) (gen == 6 ? 21 : 13)
+#include "brw_eu.h"
+#include "brw_program.h"
  
  namespace brw {
  
@@ -48,7 +46,6 @@ vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
     this->predicate = BRW_PREDICATE_NONE;
     this->predicate_inverse = false;
     this->target = 0;
-   this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
     this->shadow_compare = false;
     this->ir = NULL;
     this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
@@ -57,6 +54,10 @@ vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
     this->mlen = 0;
     this->base_mrf = 0;
     this->offset = 0;
+   this->exec_size = 8;
+   this->group = 0;
+   this->size_written = (dst.file == BAD_FILE ?
+                         0 : this->exec_size * type_sz(dst.type));
     this->annotation = NULL;
  }
  
@@ -185,6 +186,7 @@ ALU3(MAD)
  ALU2_ACC(ADDC)
  ALU2_ACC(SUBB)
  ALU2(MAC)
+ALU1(DIM)
  
  /** Gen4 predicated IF. */
  vec4_instruction *
@@ -239,8 +241,6 @@ vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
      * type to match src0 so we can compact the instruction.
      */
     dst.type = src0.type;
-   if (dst.file == HW_REG)
-      dst.fixed_hw_reg.type = dst.type;
  
     resolve_ud_negate(&src0);
     resolve_ud_negate(&src1);
@@ -278,16 +278,6 @@ vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
     return inst;
  }
  
-void
-vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
-{
-   static enum opcode dot_opcodes[] = {
-      BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
-   };
-
-   emit(dot_opcodes[elements - 2], dst, src0, src1);
-}
-
  src_reg
  vec4_visitor::fix_3src_operand(const src_reg &src)
  {
@@ -422,7 +412,7 @@ vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
      * You should inspect the disasm output in order to verify that the MOV is
      * not optimized away.
      */
-   emit(MOV(tmp_dst, src_reg(0x12345678u)));
+   emit(MOV(tmp_dst, brw_imm_ud(0x12345678u)));
  #endif
  
     /* Give tmp the form below, where "." means untouched.
@@ -441,7 +431,7 @@ vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
      *   0xhhhh0000
      */
     tmp_src.swizzle = BRW_SWIZZLE_YYYY;
-   emit(SHL(dst, tmp_src, src_reg(16u)));
+   emit(SHL(dst, tmp_src, brw_imm_ud(16u)));
  
     /* Finally, give the write-channels of dst the form of packHalf2x16's
      * output:
@@ -480,10 +470,10 @@ vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
     src_reg tmp_src(tmp_dst);
  
     tmp_dst.writemask = WRITEMASK_X;
-   emit(AND(tmp_dst, src0, src_reg(0xffffu)));
+   emit(AND(tmp_dst, src0, brw_imm_ud(0xffffu)));
  
     tmp_dst.writemask = WRITEMASK_Y;
-   emit(SHR(tmp_dst, src0, src_reg(16u)));
+   emit(SHR(tmp_dst, src0, brw_imm_ud(16u)));
  
     dst.writemask = WRITEMASK_XY;
     emit(F16TO32(dst, tmp_src));
@@ -498,7 +488,7 @@ vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
      * vector float and a type-converting MOV.
      */
     dst_reg shift(this, glsl_type::uvec4_type);
-   emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
+   emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
  
     dst_reg shifted(this, glsl_type::uvec4_type);
     src0.swizzle = BRW_SWIZZLE_XXXX;
@@ -508,7 +498,7 @@ vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
     dst_reg f(this, glsl_type::vec4_type);
     emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
  
-   emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
+   emit(MUL(dst, src_reg(f), brw_imm_f(1.0f / 255.0f)));
  }
  
  void
@@ -520,7 +510,7 @@ vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
      * vector float and a type-converting MOV.
      */
     dst_reg shift(this, glsl_type::uvec4_type);
-   emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
+   emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
  
     dst_reg shifted(this, glsl_type::uvec4_type);
     src0.swizzle = BRW_SWIZZLE_XXXX;
@@ -531,11 +521,11 @@ vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
     emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
  
     dst_reg scaled(this, glsl_type::vec4_type);
-   emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
+   emit(MUL(scaled, src_reg(f), brw_imm_f(1.0f / 127.0f)));
  
     dst_reg max(this, glsl_type::vec4_type);
-   emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), src_reg(-1.0f));
-   emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
+   emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), brw_imm_f(-1.0f));
+   emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), brw_imm_f(1.0f));
  }
  
  void
@@ -546,7 +536,7 @@ vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
     inst->saturate = true;
  
     dst_reg scaled(this, glsl_type::vec4_type);
-   emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
+   emit(MUL(scaled, src_reg(saturated), brw_imm_f(255.0f)));
  
     dst_reg rounded(this, glsl_type::vec4_type);
     emit(RNDE(rounded, src_reg(scaled)));
@@ -562,13 +552,13 @@ void
  vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
  {
     dst_reg max(this, glsl_type::vec4_type);
-   emit_minmax(BRW_CONDITIONAL_GE, max, src0, src_reg(-1.0f));
+   emit_minmax(BRW_CONDITIONAL_GE, max, src0, brw_imm_f(-1.0f));
  
     dst_reg min(this, glsl_type::vec4_type);
-   emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
+   emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), brw_imm_f(1.0f));
  
     dst_reg scaled(this, glsl_type::vec4_type);
-   emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
+   emit(MUL(scaled, src_reg(min), brw_imm_f(127.0f)));
  
     dst_reg rounded(this, glsl_type::vec4_type);
     emit(RNDE(rounded, src_reg(scaled)));
@@ -580,18 +570,12 @@ vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
     emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
  }
  
-/**
- * Returns the minimum number of vec4 elements needed to pack a type.
- *
- * For simple types, it will return 1 (a single vec4); for matrices, the
- * number of columns; for array and struct, the sum of the vec4_size of
- * each of its elements; and for sampler and atomic, zero.
- *
- * This method is useful to calculate how much register space is needed to
- * store a particular type.
+/*
+ * Returns the minimum number of vec4 (as_vec4 == true) or dvec4 (as_vec4 ==
+ * false) elements needed to pack a type.
   */
-extern "C" int
-type_size_vec4(const struct glsl_type *type)
+static int
+type_size_xvec4(const struct glsl_type *type, bool as_vec4)
  {
     unsigned int i;
     int size;
@@ -601,23 +585,29 @@ type_size_vec4(const struct glsl_type *type)
     case GLSL_TYPE_INT:
     case GLSL_TYPE_FLOAT:
     case GLSL_TYPE_BOOL:
+   case GLSL_TYPE_DOUBLE:
+   case GLSL_TYPE_UINT64:
+   case GLSL_TYPE_INT64:
        if (type->is_matrix()) {
-        return type->matrix_columns;
+         const glsl_type *col_type = type->column_type();
+         unsigned col_slots =
+            (as_vec4 && col_type->is_dual_slot()) ? 2 : 1;
+         return type->matrix_columns * col_slots;
        } else {
-        /* Regardless of size of vector, it gets a vec4. This is bad
-         * packing for things like floats, but otherwise arrays become a
-         * mess.  Hopefully a later pass over the code can pack scalars
-         * down if appropriate.
-         */
-        return 1;
+         /* Regardless of size of vector, it gets a vec4. This is bad
+          * packing for things like floats, but otherwise arrays become a
+          * mess.  Hopefully a later pass over the code can pack scalars
+          * down if appropriate.
+          */
+         return (as_vec4 && type->is_dual_slot()) ? 2 : 1;
        }
     case GLSL_TYPE_ARRAY:
        assert(type->length > 0);
-      return type_size_vec4(type->fields.array) * type->length;
+      return type_size_xvec4(type->fields.array, as_vec4) * type->length;
     case GLSL_TYPE_STRUCT:
        size = 0;
        for (i = 0; i < type->length; i++) {
-        size += type_size_vec4(type->fields.structure[i].type);
+        size += type_size_xvec4(type->fields.structure[i].type, as_vec4);
        }
        return size;
     case GLSL_TYPE_SUBROUTINE:
@@ -633,7 +623,6 @@ type_size_vec4(const struct glsl_type *type)
     case GLSL_TYPE_IMAGE:
        return DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
     case GLSL_TYPE_VOID:
-   case GLSL_TYPE_DOUBLE:
     case GLSL_TYPE_ERROR:
     case GLSL_TYPE_INTERFACE:
     case GLSL_TYPE_FUNCTION:
@@ -643,12 +632,53 @@ type_size_vec4(const struct glsl_type *type)
     return 0;
  }
  
+/**
+ * Returns the minimum number of vec4 elements needed to pack a type.
+ *
+ * For simple types, it returns 1 (a single vec4), or 2 for dual-slot types;
+ * for matrices, the number of columns times the slots per column; for array
+ * and struct, the sum of its elements' vec4 sizes; for sampler and atomic, 0.
+ *
+ * This method is useful to calculate how much register space is needed to
+ * store a particular type.
+ */
+extern "C" int
+type_size_vec4(const struct glsl_type *type)
+{
+   return type_size_xvec4(type, true);
+}
+
+/**
+ * Returns the minimum number of dvec4 elements needed to pack a type.
+ *
+ * For simple types, it will return 1 (a single dvec4); for matrices, the
+ * number of columns; for array and struct, the sum of the dvec4_size of
+ * each of its elements; and for sampler and atomic, zero.
+ *
+ * This method is useful to calculate how much register space is needed to
+ * store a particular type.
+ *
+ * Measuring double-precision vertex inputs as dvec4 is required because
+ * ARB_vertex_attrib_64bit states that these use the same number of locations
+ * as the single-precision version. That is, two consecutive dvec4s would be
+ * located in location "x" and location "x+1", not "x+2".
+ *
+ * In order to map vec4/dvec4 vertex inputs to the proper ATTRs,
+ * remap_vs_attrs() will take into account both the location and whether the
+ * type fits in one or two vec4 slots.
+ */
+extern "C" int
+type_size_dvec4(const struct glsl_type *type)
+{
+   return type_size_xvec4(type, false);
+}
+
  src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
  {
     init();
  
-   this->file = GRF;
-   this->reg = v->alloc.allocate(type_size_vec4(type));
+   this->file = VGRF;
+   this->nr = v->alloc.allocate(type_size_vec4(type));
  
     if (type->is_array() || type->is_record()) {
        this->swizzle = BRW_SWIZZLE_NOOP;
@@ -665,8 +695,8 @@ src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
  
     init();
  
-   this->file = GRF;
-   this->reg = v->alloc.allocate(type_size_vec4(type) * size);
+   this->file = VGRF;
+   this->nr = v->alloc.allocate(type_size_vec4(type) * size);
  
     this->swizzle = BRW_SWIZZLE_NOOP;
  
@@ -677,8 +707,8 @@ dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
  {
     init();
  
-   this->file = GRF;
-   this->reg = v->alloc.allocate(type_size_vec4(type));
+   this->file = VGRF;
+   this->nr = v->alloc.allocate(type_size_vec4(type));
  
     if (type->is_array() || type->is_record()) {
        this->writemask = WRITEMASK_XYZW;
@@ -693,18 +723,8 @@ vec4_instruction *
  vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
                            src_reg src0, src_reg src1)
  {
-   vec4_instruction *inst;
-
-   if (devinfo->gen >= 6) {
-      inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
-      inst->conditional_mod = conditionalmod;
-   } else {
-      emit(CMP(dst, src0, src1, conditionalmod));
-
-      inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
-      inst->predicate = BRW_PREDICATE_NORMAL;
-   }
-
+   vec4_instruction *inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
+   inst->conditional_mod = conditionalmod;
     return inst;
  }
  
@@ -730,7 +750,7 @@ vec4_visitor::emit_lrp(const dst_reg &dst,
        x_times_one_minus_a.writemask = dst.writemask;
  
        emit(MUL(y_times_a, y, a));
-      emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
+      emit(ADD(one_minus_a, negate(a), brw_imm_f(1.0f)));
        emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
        return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
     }
@@ -766,7 +786,7 @@ vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
        else
           emit(pull);
  
-      dst_reg index_reg = retype(offset(dst_reg(header), 1),
+      dst_reg index_reg = retype(byte_offset(dst_reg(header), REG_SIZE),
                                   offset_reg.type);
        pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
  
@@ -782,7 +802,7 @@ vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
        pull->mlen = 2;
        pull->header_size = 1;
     } else if (devinfo->gen >= 7) {
-      dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
+      dst_reg grf_offset = dst_reg(this, glsl_type::uint_type);
  
        grf_offset.type = offset_reg.type;
  
@@ -803,7 +823,7 @@ vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
                                             dst,
                                             surf_index,
                                             offset_reg);
-      pull->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
+      pull->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1;
        pull->mlen = 1;
     }
  
@@ -830,13 +850,14 @@ vec4_visitor::emit_uniformize(const src_reg &src)
  
  src_reg
  vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
-                             src_reg coordinate, src_reg sampler)
+                             src_reg coordinate, src_reg surface)
  {
     vec4_instruction *inst =
        new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
                                      dst_reg(this, glsl_type::uvec4_type));
     inst->base_mrf = 2;
-   inst->src[1] = sampler;
+   inst->src[1] = surface;
+   inst->src[2] = surface;
  
     int param_base;
  
@@ -864,7 +885,7 @@ vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
              coordinate));
  
     emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
-            src_reg(0)));
+            brw_imm_d(0)));
  
     emit(inst);
     return src_reg(inst->dst);
@@ -876,7 +897,7 @@ vec4_visitor::is_high_sampler(src_reg sampler)
     if (devinfo->gen < 8 && !devinfo->is_haswell)
        return false;
  
-   return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
+   return sampler.file != IMM || sampler.ud >= 16;
  }
  
  void
@@ -885,23 +906,36 @@ vec4_visitor::emit_texture(ir_texture_opcode op,
                             const glsl_type *dest_type,
                             src_reg coordinate,
                             int coord_components,
-                           src_reg shadow_comparitor,
+                           src_reg shadow_comparator,
                             src_reg lod, src_reg lod2,
                             src_reg sample_index,
                             uint32_t constant_offset,
                             src_reg offset_value,
                             src_reg mcs,
-                           bool is_cube_array,
-                           uint32_t sampler,
+                           uint32_t surface,
+                           src_reg surface_reg,
                             src_reg sampler_reg)
  {
+   /* The sampler can only meaningfully compute LOD for fragment shader
+    * messages. For all other stages, we change the opcode to TXL and hardcode
+    * the LOD to 0.
+    *
+    * textureQueryLevels() is implemented in terms of TXS so we need to pass a
+    * valid LOD argument.
+    */
+   if (op == ir_tex || op == ir_query_levels) {
+      assert(lod.file == BAD_FILE);
+      lod = brw_imm_f(0.0f);
+   }
+
     enum opcode opcode;
     switch (op) {
     case ir_tex: opcode = SHADER_OPCODE_TXL; break;
     case ir_txl: opcode = SHADER_OPCODE_TXL; break;
     case ir_txd: opcode = SHADER_OPCODE_TXD; break;
     case ir_txf: opcode = SHADER_OPCODE_TXF; break;
-   case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
+   case ir_txf_ms: opcode = (devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W :
+                             SHADER_OPCODE_TXF_CMS); break;
     case ir_txs: opcode = SHADER_OPCODE_TXS; break;
     case ir_tg4: opcode = offset_value.file != BAD_FILE
                           ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
@@ -911,12 +945,18 @@ vec4_visitor::emit_texture(ir_texture_opcode op,
        unreachable("TXB is not valid for vertex shaders.");
     case ir_lod:
        unreachable("LOD is not valid for vertex shaders.");
+   case ir_samples_identical: {
+      /* There are some challenges implementing this for vec4, and it seems
+       * unlikely to be used anyway.  For now, just always return false.
+       */
+      emit(MOV(dest, brw_imm_ud(0u)));
+      return;
+   }
     default:
        unreachable("Unrecognized tex op");
     }
  
-   vec4_instruction *inst = new(mem_ctx) vec4_instruction(
-      opcode, dst_reg(this, dest_type));
+   vec4_instruction *inst = new(mem_ctx) vec4_instruction(opcode, dest);
  
     inst->offset = constant_offset;
  
@@ -936,9 +976,10 @@ vec4_visitor::emit_texture(ir_texture_opcode op,
     inst->base_mrf = 2;
     inst->mlen = inst->header_size;
     inst->dst.writemask = WRITEMASK_XYZW;
-   inst->shadow_compare = shadow_comparitor.file != BAD_FILE;
+   inst->shadow_compare = shadow_comparator.file != BAD_FILE;
  
-   inst->src[1] = sampler_reg;
+   inst->src[1] = surface_reg;
+   inst->src[2] = sampler_reg;
  
     /* MRF for the first parameter */
     int param_base = inst->base_mrf + inst->header_size;
@@ -961,13 +1002,13 @@ vec4_visitor::emit_texture(ir_texture_opcode op,
  
        if (zero_mask != 0) {
           emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
-                  src_reg(0)));
+                  brw_imm_d(0)));
        }
-      /* Load the shadow comparitor */
-      if (shadow_comparitor.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
-        emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparitor.type,
+      /* Load the shadow comparator */
+      if (shadow_comparator.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
+        emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparator.type,
                           WRITEMASK_X),
-                 shadow_comparitor));
+                 shadow_comparator));
          inst->mlen++;
        }
  
@@ -976,7 +1017,7 @@ vec4_visitor::emit_texture(ir_texture_opcode op,
          int mrf, writemask;
          if (devinfo->gen >= 5) {
             mrf = param_base + 1;
-           if (shadow_comparitor.file != BAD_FILE) {
+           if (shadow_comparator.file != BAD_FILE) {
                writemask = WRITEMASK_Y;
                /* mlen already incremented */
             } else {
@@ -993,7 +1034,16 @@ vec4_visitor::emit_texture(ir_texture_opcode op,
        } else if (op == ir_txf_ms) {
           emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
                    sample_index));
-         if (devinfo->gen >= 7) {
+         if (opcode == SHADER_OPCODE_TXF_CMS_W) {
+            /* MCS data is stored in the first two channels of `mcs`, but we
+             * need to get it into the .y and .z channels of the second vec4
+             * of params.
+             */
+            mcs.swizzle = BRW_SWIZZLE4(0, 0, 1, 1);
+            emit(MOV(dst_reg(MRF, param_base + 1,
+                             glsl_type::uint_type, WRITEMASK_YZ),
+                     mcs));
+         } else if (devinfo->gen >= 7) {
              /* MCS data is in the first channel of `mcs`, but we need to get it into
               * the .y channel of the second vec4 of params, so replicate .x across
               * the whole vec4 and then mask off everything except .y
@@ -1013,17 +1063,17 @@ vec4_visitor::emit_texture(ir_texture_opcode op,
             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
             inst->mlen++;
  
-           if (dest_type->vector_elements == 3 || shadow_comparitor.file != BAD_FILE) {
+           if (dest_type->vector_elements == 3 || shadow_comparator.file != BAD_FILE) {
                lod.swizzle = BRW_SWIZZLE_ZZZZ;
                lod2.swizzle = BRW_SWIZZLE_ZZZZ;
                emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
                emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
                inst->mlen++;
  
-               if (shadow_comparitor.file != BAD_FILE) {
+               if (shadow_comparator.file != BAD_FILE) {
                    emit(MOV(dst_reg(MRF, param_base + 2,
-                                   shadow_comparitor.type, WRITEMASK_Z),
-                           shadow_comparitor));
+                                   shadow_comparator.type, WRITEMASK_Z),
+                           shadow_comparator));
                 }
             }
          } else /* devinfo->gen == 4 */ {
@@ -1032,9 +1082,9 @@ vec4_visitor::emit_texture(ir_texture_opcode op,
             inst->mlen += 2;
          }
        } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
-         if (shadow_comparitor.file != BAD_FILE) {
-            emit(MOV(dst_reg(MRF, param_base, shadow_comparitor.type, WRITEMASK_W),
-                     shadow_comparitor));
+         if (shadow_comparator.file != BAD_FILE) {
+            emit(MOV(dst_reg(MRF, param_base, shadow_comparator.type, WRITEMASK_W),
+                     shadow_comparator));
           }
  
           emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
@@ -1048,18 +1098,23 @@ vec4_visitor::emit_texture(ir_texture_opcode op,
     /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
      * spec requires layers.
      */
-   if (op == ir_txs && is_cube_array) {
-      emit_math(SHADER_OPCODE_INT_QUOTIENT,
-                writemask(inst->dst, WRITEMASK_Z),
-                src_reg(inst->dst), src_reg(6));
+   if (op == ir_txs && devinfo->gen < 7) {
+      /* Gen4-6 return 0 instead of 1 for single layer surfaces. */
+      emit_minmax(BRW_CONDITIONAL_GE, writemask(inst->dst, WRITEMASK_Z),
+                  src_reg(inst->dst), brw_imm_d(1));
     }
  
     if (devinfo->gen == 6 && op == ir_tg4) {
-      emit_gen6_gather_wa(key_tex->gen6_gather_wa[sampler], inst->dst);
+      emit_gen6_gather_wa(key_tex->gen6_gather_wa[surface], inst->dst);
     }
  
-   swizzle_result(op, dest,
-                  src_reg(inst->dst), sampler, dest_type);
+   if (op == ir_query_levels) {
+      /* # levels is in .w */
+      src_reg swizzled(dest);
+      swizzled.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W,
+                                      SWIZZLE_W, SWIZZLE_W);
+      emit(MOV(dest, swizzled));
+   }
  }
  
  /**
@@ -1076,7 +1131,7 @@ vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
     dst_f.type = BRW_REGISTER_TYPE_F;
  
     /* Convert from UNORM to UINT */
-   emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
+   emit(MUL(dst_f, src_reg(dst_f), brw_imm_f((float)((1 << width) - 1))));
     emit(MOV(dst, src_reg(dst_f)));
  
     if (wa & WA_SIGN) {
@@ -1084,94 +1139,13 @@ vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
         * shifting the sign bit into place, then shifting back
         * preserving sign.
         */
-      emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
-      emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
-   }
-}
-
-/**
- * Set up the gather channel based on the swizzle, for gather4.
- */
-uint32_t
-vec4_visitor::gather_channel(unsigned gather_component, uint32_t sampler)
-{
-   int swiz = GET_SWZ(key_tex->swizzles[sampler], gather_component);
-   switch (swiz) {
-      case SWIZZLE_X: return 0;
-      case SWIZZLE_Y:
-         /* gather4 sampler is broken for green channel on RG32F --
-          * we must ask for blue instead.
-          */
-         if (key_tex->gather_channel_quirk_mask & (1 << sampler))
-            return 2;
-         return 1;
-      case SWIZZLE_Z: return 2;
-      case SWIZZLE_W: return 3;
-      default:
-         unreachable("Not reached"); /* zero, one swizzles handled already */
-   }
-}
-
-void
-vec4_visitor::swizzle_result(ir_texture_opcode op, dst_reg dest,
-                             src_reg orig_val, uint32_t sampler,
-                             const glsl_type *dest_type)
-{
-   int s = key_tex->swizzles[sampler];
-
-   dst_reg swizzled_result = dest;
-
-   if (op == ir_query_levels) {
-      /* # levels is in .w */
-      orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
-      emit(MOV(swizzled_result, orig_val));
-      return;
-   }
-
-   if (op == ir_txs || dest_type == glsl_type::float_type
-                       || s == SWIZZLE_NOOP || op == ir_tg4) {
-      emit(MOV(swizzled_result, orig_val));
-      return;
-   }
-
-
-   int zero_mask = 0, one_mask = 0, copy_mask = 0;
-   int swizzle[4] = {0};
-
-   for (int i = 0; i < 4; i++) {
-      switch (GET_SWZ(s, i)) {
-      case SWIZZLE_ZERO:
-        zero_mask |= (1 << i);
-        break;
-      case SWIZZLE_ONE:
-        one_mask |= (1 << i);
-        break;
-      default:
-        copy_mask |= (1 << i);
-        swizzle[i] = GET_SWZ(s, i);
-        break;
-      }
-   }
-
-   if (copy_mask) {
-      orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
-      swizzled_result.writemask = copy_mask;
-      emit(MOV(swizzled_result, orig_val));
-   }
-
-   if (zero_mask) {
-      swizzled_result.writemask = zero_mask;
-      emit(MOV(swizzled_result, src_reg(0.0f)));
-   }
-
-   if (one_mask) {
-      swizzled_result.writemask = one_mask;
-      emit(MOV(swizzled_result, src_reg(1.0f)));
+      emit(SHL(dst, src_reg(dst), brw_imm_d(32 - width)));
+      emit(ASR(dst, src_reg(dst), brw_imm_d(32 - width)));
     }
  }
  
  void
-vec4_visitor::gs_emit_vertex(int stream_id)
+vec4_visitor::gs_emit_vertex(int /* stream_id */)
  {
     unreachable("not reached");
  }
@@ -1182,64 +1156,19 @@ vec4_visitor::gs_end_primitive()
     unreachable("not reached");
  }
  
-void
-vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
-                                  dst_reg dst, src_reg offset,
-                                  src_reg src0, src_reg src1)
-{
-   unsigned mlen = 0;
-
-   /* Set the atomic operation offset. */
-   emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
-   mlen++;
-
-   /* Set the atomic operation arguments. */
-   if (src0.file != BAD_FILE) {
-      emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
-      mlen++;
-   }
-
-   if (src1.file != BAD_FILE) {
-      emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
-      mlen++;
-   }
-
-   /* Emit the instruction.  Note that this maps to the normal SIMD8
-    * untyped atomic message on Ivy Bridge, but that's OK because
-    * unused channels will be masked out.
-    */
-   vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
-                                 brw_message_reg(0),
-                                 src_reg(surf_index), src_reg(atomic_op));
-   inst->mlen = mlen;
-}
-
-void
-vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
-                                        src_reg offset)
-{
-   /* Set the surface read offset. */
-   emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
-
-   /* Emit the instruction.  Note that this maps to the normal SIMD8
-    * untyped surface read message, but that's OK because unused
-    * channels will be masked out.
-    */
-   vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst,
-                                 brw_message_reg(0),
-                                 src_reg(surf_index), src_reg(1));
-   inst->mlen = 1;
-}
-
  void
  vec4_visitor::emit_ndc_computation()
  {
+   if (output_reg[VARYING_SLOT_POS][0].file == BAD_FILE)
+      return;
+
     /* Get the position */
-   src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
+   src_reg pos = src_reg(output_reg[VARYING_SLOT_POS][0]);
  
     /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
     dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
-   output_reg[BRW_VARYING_SLOT_NDC] = ndc;
+   output_reg[BRW_VARYING_SLOT_NDC][0] = ndc;
+   output_num_components[BRW_VARYING_SLOT_NDC][0] = 4;
  
     current_annotation = "NDC";
     dst_reg ndc_w = ndc;
@@ -1259,34 +1188,34 @@ vec4_visitor::emit_psiz_and_flags(dst_reg reg)
  {
     if (devinfo->gen < 6 &&
         ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
-        output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE ||
+        output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE ||
          devinfo->has_negative_rhw_bug)) {
        dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
        dst_reg header1_w = header1;
        header1_w.writemask = WRITEMASK_W;
  
-      emit(MOV(header1, 0u));
+      emit(MOV(header1, brw_imm_ud(0u)));
  
        if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
-        src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
+        src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);
  
          current_annotation = "Point size";
-        emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
-        emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
+        emit(MUL(header1_w, psiz, brw_imm_f((float)(1 << 11))));
+        emit(AND(header1_w, src_reg(header1_w), brw_imm_d(0x7ff << 8)));
        }
  
-      if (output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE) {
+      if (output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE) {
           current_annotation = "Clipping flags";
           dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
           dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
  
-         emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
-         emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
+         emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
+         emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, brw_imm_d(0));
           emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
  
-         emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
-         emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
-         emit(SHL(flags1, src_reg(flags1), src_reg(4)));
+         emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
+         emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, brw_imm_d(0));
+         emit(SHL(flags1, src_reg(flags1), brw_imm_d(4)));
           emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
        }
  
@@ -1299,27 +1228,28 @@ vec4_visitor::emit_psiz_and_flags(dst_reg reg)
         * Later, clipping will detect ucp[6] and ensure the primitive is
         * clipped against all fixed planes.
         */
-      if (devinfo->has_negative_rhw_bug) {
-         src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
+      if (devinfo->has_negative_rhw_bug &&
+          output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE) {
+         src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC][0]);
           ndc_w.swizzle = BRW_SWIZZLE_WWWW;
-         emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
+         emit(CMP(dst_null_f(), ndc_w, brw_imm_f(0.0f), BRW_CONDITIONAL_L));
           vec4_instruction *inst;
-         inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
+         inst = emit(OR(header1_w, src_reg(header1_w), brw_imm_ud(1u << 6)));
           inst->predicate = BRW_PREDICATE_NORMAL;
-         output_reg[BRW_VARYING_SLOT_NDC].type = BRW_REGISTER_TYPE_F;
-         inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
+         output_reg[BRW_VARYING_SLOT_NDC][0].type = BRW_REGISTER_TYPE_F;
+         inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC][0], brw_imm_f(0.0f)));
           inst->predicate = BRW_PREDICATE_NORMAL;
        }
  
        emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
     } else if (devinfo->gen < 6) {
-      emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
+      emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u)));
     } else {
-      emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
+      emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), brw_imm_d(0)));
        if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
           dst_reg reg_w = reg;
           reg_w.writemask = WRITEMASK_W;
-         src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ]);
+         src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);
           reg_as_src.type = reg_w.type;
           reg_as_src.swizzle = brw_swizzle_for_size(1);
           emit(MOV(reg_w, reg_as_src));
@@ -1328,34 +1258,45 @@ vec4_visitor::emit_psiz_and_flags(dst_reg reg)
           dst_reg reg_y = reg;
           reg_y.writemask = WRITEMASK_Y;
           reg_y.type = BRW_REGISTER_TYPE_D;
-         output_reg[VARYING_SLOT_LAYER].type = reg_y.type;
-         emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
+         output_reg[VARYING_SLOT_LAYER][0].type = reg_y.type;
+         emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER][0])));
        }
        if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
           dst_reg reg_z = reg;
           reg_z.writemask = WRITEMASK_Z;
           reg_z.type = BRW_REGISTER_TYPE_D;
-         output_reg[VARYING_SLOT_VIEWPORT].type = reg_z.type;
-         emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
+         output_reg[VARYING_SLOT_VIEWPORT][0].type = reg_z.type;
+         emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT][0])));
        }
     }
  }
  
  /**
   * Emit the MOV that copies output_reg[varying][component] into reg.
   *
   * With this change the function works per component slot: the source is
   * swizzled with BRW_SWZ_COMP_OUTPUT(component) and reg's writemask is
   * replaced by brw_writemask_for_component_packing(num_comps, component).
   *
   * Returns the emitted instruction, or NULL when the component count for
   * this slot is zero or the source register's file is BAD_FILE.
   */
  vec4_instruction *
-vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
+vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying, int component)
  {
     assert(varying < VARYING_SLOT_MAX);
-   assert(output_reg[varying].type == reg.type);
+   /* A zero component count means there is nothing to copy for this slot. */
+   unsigned num_comps = output_num_components[varying][component];
+   if (num_comps == 0)
+      return NULL;
+
+   assert(output_reg[varying][component].type == reg.type);
     current_annotation = output_reg_annotation[varying];
-   /* Copy the register, saturating if necessary */
-   return emit(MOV(reg, src_reg(output_reg[varying])));
+   if (output_reg[varying][component].file != BAD_FILE) {
+      src_reg src = src_reg(output_reg[varying][component]);
+      src.swizzle = BRW_SWZ_COMP_OUTPUT(component);
+      reg.writemask =
+         brw_writemask_for_component_packing(num_comps, component);
+      return emit(MOV(reg, src));
+   }
+   return NULL;
  }
  
  void
  vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
  {
     reg.type = BRW_REGISTER_TYPE_F;
-   output_reg[varying].type = reg.type;
+   output_reg[varying][0].type = reg.type;
  
     switch (varying) {
     case VARYING_SLOT_PSIZ:
@@ -1367,11 +1308,13 @@ vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
     }
     case BRW_VARYING_SLOT_NDC:
        current_annotation = "NDC";
-      emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
+      if (output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE)
+         emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC][0])));
        break;
     case VARYING_SLOT_POS:
        current_annotation = "gl_Position";
-      emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
+      if (output_reg[VARYING_SLOT_POS][0].file != BAD_FILE)
+         emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS][0])));
        break;
     case VARYING_SLOT_EDGE:
        /* This is present when doing unfilled polygons.  We're supposed to copy
@@ -1388,13 +1331,15 @@ vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
        /* No need to write to this slot */
        break;
     default:
-      emit_generic_urb_slot(reg, varying);
+      for (int i = 0; i < 4; i++) {
+         emit_generic_urb_slot(reg, varying, i);
+      }
        break;
     }
  }
  
  static int
-align_interleaved_urb_mlen(const struct brw_device_info *devinfo, int mlen)
+align_interleaved_urb_mlen(const struct gen_device_info *devinfo, int mlen)
  {
     if (devinfo->gen >= 6) {
        /* URB data written (does not include the message header reg) must
@@ -1500,45 +1445,26 @@ vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
        message_header_scale *= 16;
  
     if (reladdr) {
-      src_reg index = src_reg(this, glsl_type::int_type);
-
-      emit_before(block, inst, ADD(dst_reg(index), *reladdr,
-                                   src_reg(reg_offset)));
-      emit_before(block, inst, MUL(dst_reg(index), index,
-                                   src_reg(message_header_scale)));
-
-      return index;
-   } else {
-      return src_reg(reg_offset * message_header_scale);
-   }
-}
-
-src_reg
-vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
-                                      src_reg *reladdr, int reg_offset)
-{
-   if (reladdr) {
-      src_reg index = src_reg(this, glsl_type::int_type);
-
-      emit_before(block, inst, ADD(dst_reg(index), *reladdr,
-                                   src_reg(reg_offset)));
-
-      /* Pre-gen6, the message header uses byte offsets instead of vec4
-       * (16-byte) offset units.
+      /* A vec4 is 16 bytes and a dvec4 is 32 bytes so for doubles we have
+       * to multiply the reladdr by 2. Notice that the reg_offset part
+       * is in units of 16 bytes and is used to select the low/high 16-byte
+       * chunk of a full dvec4, so we don't want to multiply that part.
         */
-      if (devinfo->gen < 6) {
-         emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
+      src_reg index = src_reg(this, glsl_type::int_type);
+      if (type_sz(inst->dst.type) < 8) {
+         emit_before(block, inst, ADD(dst_reg(index), *reladdr,
+                                      brw_imm_d(reg_offset)));
+         emit_before(block, inst, MUL(dst_reg(index), index,
+                                      brw_imm_d(message_header_scale)));
+      } else {
+         emit_before(block, inst, MUL(dst_reg(index), *reladdr,
+                                      brw_imm_d(message_header_scale * 2)));
+         emit_before(block, inst, ADD(dst_reg(index), index,
+                                      brw_imm_d(reg_offset * message_header_scale)));
        }
-
        return index;
-   } else if (devinfo->gen >= 8) {
-      /* Store the offset in a GRF so we can send-from-GRF. */
-      src_reg offset = src_reg(this, glsl_type::int_type);
-      emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
-      return offset;
     } else {
-      int message_header_scale = devinfo->gen < 6 ? 16 : 1;
-      return src_reg(reg_offset * message_header_scale);
+      return brw_imm_d(reg_offset * message_header_scale);
     }
  }
  
@@ -1553,11 +1479,23 @@ vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
                                 dst_reg temp, src_reg orig_src,
                                 int base_offset)
  {
-   int reg_offset = base_offset + orig_src.reg_offset;
+   assert(orig_src.offset % REG_SIZE == 0);
+   int reg_offset = base_offset + orig_src.offset / REG_SIZE;
     src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
                                        reg_offset);
  
-   emit_before(block, inst, SCRATCH_READ(temp, index));
+   if (type_sz(orig_src.type) < 8) {
+      emit_before(block, inst, SCRATCH_READ(temp, index));
+   } else {
+      dst_reg shuffled = dst_reg(this, glsl_type::dvec4_type);
+      dst_reg shuffled_float = retype(shuffled, BRW_REGISTER_TYPE_F);
+      emit_before(block, inst, SCRATCH_READ(shuffled_float, index));
+      index = get_scratch_offset(block, inst, orig_src.reladdr, reg_offset + 1);
+      vec4_instruction *last_read =
+         SCRATCH_READ(byte_offset(shuffled_float, REG_SIZE), index);
+      emit_before(block, inst, last_read);
+      shuffle_64bit_data(temp, src_reg(shuffled), false, block, last_read);
+   }
  }
  
  /**
@@ -1570,7 +1508,8 @@ void
  vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
                                   int base_offset)
  {
-   int reg_offset = base_offset + inst->dst.reg_offset;
+   assert(inst->dst.offset % REG_SIZE == 0);
+   int reg_offset = base_offset + inst->dst.offset / REG_SIZE;
     src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
                                        reg_offset);
  
@@ -1581,21 +1520,67 @@ vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
      * weren't initialized, it will confuse live interval analysis, which will
      * make spilling fail to make progress.
      */
-   const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
+   bool is_64bit = type_sz(inst->dst.type) == 8;
+   const glsl_type *alloc_type =
+      is_64bit ? glsl_type::dvec4_type : glsl_type::vec4_type;
+   const src_reg temp = swizzle(retype(src_reg(this, alloc_type),
                                         inst->dst.type),
                                  brw_swizzle_for_mask(inst->dst.writemask));
-   dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
-                                      inst->dst.writemask));
-   vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
-   if (inst->opcode != BRW_OPCODE_SEL)
-      write->predicate = inst->predicate;
-   write->ir = inst->ir;
-   write->annotation = inst->annotation;
-   inst->insert_after(block, write);
+
+   if (!is_64bit) {
+      dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
+                                         inst->dst.writemask));
+      vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
+      if (inst->opcode != BRW_OPCODE_SEL)
+         write->predicate = inst->predicate;
+      write->ir = inst->ir;
+      write->annotation = inst->annotation;
+      inst->insert_after(block, write);
+   } else {
+      dst_reg shuffled = dst_reg(this, alloc_type);
+      vec4_instruction *last =
+         shuffle_64bit_data(shuffled, temp, true, block, inst);
+      src_reg shuffled_float = src_reg(retype(shuffled, BRW_REGISTER_TYPE_F));
+
+      uint8_t mask = 0;
+      if (inst->dst.writemask & WRITEMASK_X)
+         mask |= WRITEMASK_XY;
+      if (inst->dst.writemask & WRITEMASK_Y)
+         mask |= WRITEMASK_ZW;
+      if (mask) {
+         dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask));
+
+         vec4_instruction *write = SCRATCH_WRITE(dst, shuffled_float, index);
+         if (inst->opcode != BRW_OPCODE_SEL)
+            write->predicate = inst->predicate;
+         write->ir = inst->ir;
+         write->annotation = inst->annotation;
+         last->insert_after(block, write);
+      }
+
+      mask = 0;
+      if (inst->dst.writemask & WRITEMASK_Z)
+         mask |= WRITEMASK_XY;
+      if (inst->dst.writemask & WRITEMASK_W)
+         mask |= WRITEMASK_ZW;
+      if (mask) {
+         dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask));
+
+         src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
+                                            reg_offset + 1);
+         vec4_instruction *write =
+            SCRATCH_WRITE(dst, byte_offset(shuffled_float, REG_SIZE), index);
+         if (inst->opcode != BRW_OPCODE_SEL)
+            write->predicate = inst->predicate;
+         write->ir = inst->ir;
+         write->annotation = inst->annotation;
+         last->insert_after(block, write);
+      }
+   }
  
     inst->dst.file = temp.file;
-   inst->dst.reg = temp.reg;
-   inst->dst.reg_offset = temp.reg_offset;
+   inst->dst.nr = temp.nr;
+   inst->dst.offset %= REG_SIZE;
     inst->dst.reladdr = NULL;
  }
  
@@ -1621,11 +1606,12 @@ vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
                                            *src.reladdr);
  
     /* Now handle scratch access on src */
-   if (src.file == GRF && scratch_loc[src.reg] != -1) {
-      dst_reg temp = dst_reg(this, glsl_type::vec4_type);
-      emit_scratch_read(block, inst, temp, src, scratch_loc[src.reg]);
-      src.reg = temp.reg;
-      src.reg_offset = temp.reg_offset;
+   if (src.file == VGRF && scratch_loc[src.nr] != -1) {
+      dst_reg temp = dst_reg(this, type_sz(src.type) == 8 ?
+         glsl_type::dvec4_type : glsl_type::vec4_type);
+      emit_scratch_read(block, inst, temp, src, scratch_loc[src.nr]);
+      src.nr = temp.nr;
+      src.offset %= REG_SIZE;
        src.reladdr = NULL;
     }
  
@@ -1649,18 +1635,18 @@ vec4_visitor::move_grf_array_access_to_scratch()
      * scratch.
      */
     foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
-      if (inst->dst.file == GRF && inst->dst.reladdr) {
-         if (scratch_loc[inst->dst.reg] == -1) {
-            scratch_loc[inst->dst.reg] = last_scratch;
-            last_scratch += this->alloc.sizes[inst->dst.reg];
+      if (inst->dst.file == VGRF && inst->dst.reladdr) {
+         if (scratch_loc[inst->dst.nr] == -1) {
+            scratch_loc[inst->dst.nr] = last_scratch;
+            last_scratch += this->alloc.sizes[inst->dst.nr];
           }
  
           for (src_reg *iter = inst->dst.reladdr;
                iter->reladdr;
                iter = iter->reladdr) {
-            if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
-               scratch_loc[iter->reg] = last_scratch;
-               last_scratch += this->alloc.sizes[iter->reg];
+            if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
+               scratch_loc[iter->nr] = last_scratch;
+               last_scratch += this->alloc.sizes[iter->nr];
              }
           }
        }
@@ -1669,9 +1655,9 @@ vec4_visitor::move_grf_array_access_to_scratch()
           for (src_reg *iter = &inst->src[i];
                iter->reladdr;
                iter = iter->reladdr) {
-            if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
-               scratch_loc[iter->reg] = last_scratch;
-               last_scratch += this->alloc.sizes[iter->reg];
+            if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
+               scratch_loc[iter->nr] = last_scratch;
+               last_scratch += this->alloc.sizes[iter->nr];
              }
           }
        }
@@ -1697,8 +1683,8 @@ vec4_visitor::move_grf_array_access_to_scratch()
        /* Now that we have handled any (possibly recursive) reladdr scratch
         * accesses for dst we can safely do the scratch write for dst itself
         */
-      if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1)
-         emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
+      if (inst->dst.file == VGRF && scratch_loc[inst->dst.nr] != -1)
+         emit_scratch_write(block, inst, scratch_loc[inst->dst.nr]);
  
        /* Now handle scratch access on any src. In this case, since inst->src[i]
         * already is a src_reg, we can just call emit_resolve_reladdr with
@@ -1718,18 +1704,57 @@ vec4_visitor::move_grf_array_access_to_scratch()
   */
  void
  vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
-                                     dst_reg temp, src_reg orig_src,
-                                     int base_offset)
+                                      dst_reg temp, src_reg orig_src,
+                                      int base_offset, src_reg indirect)
  {
-   int reg_offset = base_offset + orig_src.reg_offset;
-   src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
-   src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
-                                             reg_offset);
-
-   emit_pull_constant_load_reg(temp,
-                               index,
-                               offset,
-                               block, inst);
+   assert(orig_src.offset % 16 == 0);
+   const unsigned index = prog_data->base.binding_table.pull_constants_start;
+
+   /* For 64-bit loads we need to emit two 32-bit load messages, and we also
+    * need to shuffle the 32-bit data result into proper 64-bit data. To do
+    * that we emit the 32-bit loads into a temporary and then shuffle the
+    * result into the original destination (saved in orig_temp below).
+    */
+   dst_reg orig_temp = temp;
+   bool is_64bit = type_sz(orig_src.type) == 8;
+   if (is_64bit) {
+      assert(type_sz(temp.type) == 8);
+      dst_reg temp_df = dst_reg(this, glsl_type::dvec4_type);
+      temp = retype(temp_df, BRW_REGISTER_TYPE_F);
+   }
+   /* Emit one load per pass: two passes when is_64bit, otherwise one. */
+   src_reg src = orig_src;
+   for (int i = 0; i < (is_64bit ? 2 : 1); i++) {
+      int reg_offset = base_offset + src.offset / 16;
+      /* Build the offset operand; all three paths use reg_offset * 16. */
+      src_reg offset;
+      if (indirect.file != BAD_FILE) {
+         offset = src_reg(this, glsl_type::uint_type);
+         emit_before(block, inst, ADD(dst_reg(offset), indirect,
+                                      brw_imm_ud(reg_offset * 16)));
+      } else if (devinfo->gen >= 8) {
+         /* Store the offset in a GRF so we can send-from-GRF. */
+         offset = src_reg(this, glsl_type::uint_type);
+         emit_before(block, inst, MOV(dst_reg(offset),
+                                      brw_imm_ud(reg_offset * 16)));
+      } else {
+         offset = brw_imm_d(reg_offset * 16);
+      }
+
+      emit_pull_constant_load_reg(byte_offset(temp, i * REG_SIZE),
+                                  brw_imm_ud(index),
+                                  offset,
+                                  block, inst);
+
+      src = byte_offset(src, 16);
+   }
+
+   brw_mark_surface_used(&prog_data->base, index);
+
+   if (is_64bit) {
+      temp = retype(temp, BRW_REGISTER_TYPE_DF);
+      shuffle_64bit_data(orig_temp, src_reg(temp), false, block, inst);
+   }
  }
  
  /**
@@ -1747,61 +1772,65 @@ vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
  void
  vec4_visitor::move_uniform_array_access_to_pull_constants()
  {
+   /* The Vulkan driver doesn't support pull constants other than UBOs so
+    * everything has to be pushed regardless.
+    */
+   if (stage_prog_data->pull_param == NULL) {
+      split_uniform_registers();
+      return;
+   }
+
     int pull_constant_loc[this->uniforms];
     memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
-   bool nested_reladdr;
  
-   /* Walk through and find array access of uniforms.  Put a copy of that
-    * uniform in the pull constant buffer.
-    *
-    * Note that we don't move constant-indexed accesses to arrays.  No
-    * testing has been done of the performance impact of this choice.
+   /* First, walk through the instructions and determine which things need to
+    * be pulled.  We mark something as needing to be pulled by setting
+    * pull_constant_loc to 0.
      */
-   do {
-      nested_reladdr = false;
-
-      foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
-         for (int i = 0 ; i < 3; i++) {
-            if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
-               continue;
+   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
+      /* We only care about MOV_INDIRECT of a uniform */
+      if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
+          inst->src[0].file != UNIFORM)
+         continue;
  
-            int uniform = inst->src[i].reg;
+      int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;
  
-            if (inst->src[i].reladdr->reladdr)
-               nested_reladdr = true;  /* will need another pass */
+      for (unsigned j = 0; j < DIV_ROUND_UP(inst->src[2].ud, 16); j++)
+         pull_constant_loc[uniform_nr + j] = 0;
+   }
  
-            /* If this array isn't already present in the pull constant buffer,
-             * add it.
-             */
-            if (pull_constant_loc[uniform] == -1) {
-               const gl_constant_value **values =
-                  &stage_prog_data->param[uniform * 4];
+   /* Next, we walk the list of uniforms and assign real pull constant
+    * locations and set their corresponding entries in pull_param.
+    */
+   for (int j = 0; j < this->uniforms; j++) {
+      if (pull_constant_loc[j] < 0)
+         continue;
  
-               pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
+      pull_constant_loc[j] = stage_prog_data->nr_pull_params / 4;
  
-               assert(uniform < uniform_array_size);
-               for (int j = 0; j < uniform_size[uniform] * 4; j++) {
-                  stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
-                     = values[j];
-               }
-            }
+      for (int i = 0; i < 4; i++) {
+         stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
+            = stage_prog_data->param[j * 4 + i];
+      }
+   }
  
-            /* Set up the annotation tracking for new generated instructions. */
-            base_ir = inst->ir;
-            current_annotation = inst->annotation;
+   /* Finally, we can walk through the instructions and lower MOV_INDIRECT
+    * instructions to actual uniform pulls.
+    */
+   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
+      /* We only care about MOV_INDIRECT of a uniform */
+      if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
+          inst->src[0].file != UNIFORM)
+         continue;
  
-            dst_reg temp = dst_reg(this, glsl_type::vec4_type);
+      int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;
  
-            emit_pull_constant_load(block, inst, temp, inst->src[i],
-                                    pull_constant_loc[uniform]);
+      assert(inst->src[0].swizzle == BRW_SWIZZLE_NOOP);
  
-            inst->src[i].file = temp.file;
-            inst->src[i].reg = temp.reg;
-            inst->src[i].reg_offset = temp.reg_offset;
-            inst->src[i].reladdr = NULL;
-         }
-      }
-   } while (nested_reladdr);
+      emit_pull_constant_load(block, inst, inst->dst, inst->src[0],
+                              pull_constant_loc[uniform_nr], inst->src[1]);
+      inst->remove(block);
+   }
  
     /* Now there are no accesses of the UNIFORM file with a reladdr, so
      * no need to track them as larger-than-vec4 objects.  This will be
@@ -1827,7 +1856,7 @@ vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
                             void *log_data,
                             const struct brw_sampler_prog_key_data *key_tex,
                             struct brw_vue_prog_data *prog_data,
-                           nir_shader *shader,
+                           const nir_shader *shader,
                            void *mem_ctx,
                             bool no_spills,
                             int shader_time_index)
@@ -1847,6 +1876,8 @@ vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
     this->current_annotation = NULL;
     memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
  
+   memset(this->output_num_components, 0, sizeof(this->output_num_components));
+
     this->virtual_grf_start = NULL;
     this->virtual_grf_end = NULL;
     this->live_intervals = NULL;
@@ -1854,17 +1885,6 @@ vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
     this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
  
     this->uniforms = 0;
-
-   /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
-    * at least one. See setup_uniforms() in brw_vec4.cpp.
-    */
-   this->uniform_array_size = 1;
-   if (prog_data) {
-      this->uniform_array_size =
-         MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
-   }
-
-   this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
  }
  
  vec4_visitor::~vec4_visitor()