intel/fs: Use new shuffle_32bit_write for all 64-bit storage writes
[mesa.git] / src / intel / compiler / brw_fs_nir.cpp
index d90eaad32ba6aeedfebff08bbbebfd942baa8801..d7a0b7d6849ebf12705892e52c4b1b35a85d2217 100644 (file)
@@ -53,14 +53,27 @@ fs_visitor::nir_setup_outputs()
    if (stage == MESA_SHADER_TESS_CTRL || stage == MESA_SHADER_FRAGMENT)
       return;
 
+   unsigned vec4s[VARYING_SLOT_TESS_MAX] = { 0, };
+
+   /* Calculate the size of output registers in a separate pass, before
+    * allocating them.  With ARB_enhanced_layouts, multiple output variables
+    * may occupy the same slot, but have different type sizes.
+    */
    nir_foreach_variable(var, &nir->outputs) {
-      const unsigned vec4s =
+      const int loc = var->data.driver_location;
+      const unsigned var_vec4s =
          var->data.compact ? DIV_ROUND_UP(glsl_get_length(var->type), 4)
                            : type_size_vec4(var->type);
-      fs_reg reg = bld.vgrf(BRW_REGISTER_TYPE_F, 4 * vec4s);
-      for (unsigned i = 0; i < vec4s; i++) {
-         if (outputs[var->data.driver_location + i].file == BAD_FILE)
-            outputs[var->data.driver_location + i] = offset(reg, bld, 4 * i);
+      vec4s[loc] = MAX2(vec4s[loc], var_vec4s);
+   }
+
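+   /* Allocate a single contiguous VGRF per slot, sized for the largest
+    * variable found above.  Subsequent variables (and slots) that share the
+    * allocation are skipped because their entry is no longer BAD_FILE.
+    */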
+   nir_foreach_variable(var, &nir->outputs) {
+      const int loc = var->data.driver_location;
+      if (outputs[loc].file == BAD_FILE) {
+         fs_reg reg = bld.vgrf(BRW_REGISTER_TYPE_F, 4 * vec4s[loc]);
+         for (unsigned i = 0; i < vec4s[loc]; i++) {
+            outputs[loc + i] = offset(reg, bld, 4 * i);
+         }
       }
    }
 }
@@ -68,10 +81,23 @@ fs_visitor::nir_setup_outputs()
 void
 fs_visitor::nir_setup_uniforms()
 {
-   if (dispatch_width != min_dispatch_width)
+   /* Only the first compile gets to set up uniforms. */
+   if (push_constant_loc) {
+      assert(pull_constant_loc);
       return;
+   }
 
    uniforms = nir->num_uniforms / 4;
+
+   if (stage == MESA_SHADER_COMPUTE) {
+      /* Add a uniform for the thread local id.  It must be the last uniform
+       * on the list.
+       */
+      assert(uniforms == prog_data->nr_params);
+      uint32_t *param = brw_stage_prog_data_add_params(prog_data, 1);
+      *param = BRW_PARAM_BUILTIN_SUBGROUP_ID;
+      subgroup_id = fs_reg(UNIFORM, uniforms++, BRW_REGISTER_TYPE_UD);
+   }
 }
 
 static bool
@@ -86,42 +112,16 @@ emit_system_values_block(nir_block *block, fs_visitor *v)
       nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
       switch (intrin->intrinsic) {
       case nir_intrinsic_load_vertex_id:
-         unreachable("should be lowered by lower_vertex_id().");
-
-      case nir_intrinsic_load_vertex_id_zero_base:
-         assert(v->stage == MESA_SHADER_VERTEX);
-         reg = &v->nir_system_values[SYSTEM_VALUE_VERTEX_ID_ZERO_BASE];
-         if (reg->file == BAD_FILE)
-            *reg = *v->emit_vs_system_value(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE);
-         break;
-
       case nir_intrinsic_load_base_vertex:
-         assert(v->stage == MESA_SHADER_VERTEX);
-         reg = &v->nir_system_values[SYSTEM_VALUE_BASE_VERTEX];
-         if (reg->file == BAD_FILE)
-            *reg = *v->emit_vs_system_value(SYSTEM_VALUE_BASE_VERTEX);
-         break;
+         unreachable("should be lowered by nir_lower_system_values().");
 
+      case nir_intrinsic_load_vertex_id_zero_base:
+      case nir_intrinsic_load_is_indexed_draw:
+      case nir_intrinsic_load_first_vertex:
       case nir_intrinsic_load_instance_id:
-         assert(v->stage == MESA_SHADER_VERTEX);
-         reg = &v->nir_system_values[SYSTEM_VALUE_INSTANCE_ID];
-         if (reg->file == BAD_FILE)
-            *reg = *v->emit_vs_system_value(SYSTEM_VALUE_INSTANCE_ID);
-         break;
-
       case nir_intrinsic_load_base_instance:
-         assert(v->stage == MESA_SHADER_VERTEX);
-         reg = &v->nir_system_values[SYSTEM_VALUE_BASE_INSTANCE];
-         if (reg->file == BAD_FILE)
-            *reg = *v->emit_vs_system_value(SYSTEM_VALUE_BASE_INSTANCE);
-         break;
-
       case nir_intrinsic_load_draw_id:
-         assert(v->stage == MESA_SHADER_VERTEX);
-         reg = &v->nir_system_values[SYSTEM_VALUE_DRAW_ID];
-         if (reg->file == BAD_FILE)
-            *reg = *v->emit_vs_system_value(SYSTEM_VALUE_DRAW_ID);
-         break;
+         unreachable("should be lowered by brw_nir_lower_vs_inputs().");
 
       case nir_intrinsic_load_invocation_id:
          if (v->stage == MESA_SHADER_TESS_CTRL)
@@ -233,6 +233,24 @@ fs_visitor::nir_emit_system_values()
       nir_system_values[i] = fs_reg();
    }
 
+   /* Always emit SUBGROUP_INVOCATION.  Dead code will clean it up if we
+    * never end up using it.
+    */
+   {
+      const fs_builder abld = bld.annotate("gl_SubgroupInvocation", NULL);
+      fs_reg &reg = nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
+      reg = abld.vgrf(BRW_REGISTER_TYPE_UW);
+
+      const fs_builder allbld8 = abld.group(8, 0).exec_all();
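+      /* brw_imm_v packs eight 4-bit values, so this MOV writes the lane
+       * indices 0..7 into the first eight channels; the ADDs below extend
+       * the sequence to 8..15 and 16..31 for wider dispatch widths.
+       */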
+      allbld8.MOV(reg, brw_imm_v(0x76543210));
+      if (dispatch_width > 8)
+         allbld8.ADD(byte_offset(reg, 16), reg, brw_imm_uw(8u));
+      if (dispatch_width > 16) {
+         const fs_builder allbld16 = abld.group(16, 0).exec_all();
+         allbld16.ADD(byte_offset(reg, 32), reg, brw_imm_uw(16u));
+      }
+   }
+
    nir_foreach_function(function, nir) {
       assert(strcmp(function->name, "main") == 0);
       assert(function->impl);
@@ -242,6 +260,65 @@ fs_visitor::nir_emit_system_values()
    }
 }
 
+/*
+ * Returns a type based on a reference_type (word, float, half-float) and a
+ * given bit_size.
+ *
+ * Reference BRW_REGISTER_TYPEs are HF, F, DF, W, D, Q, UW, UD and UQ.
+ *
+ * @FIXME: 64-bit return types are always DF on integer types to maintain
+ * compatibility with uses of DF prior to the introduction of int64
+ * support.
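+ *
+ * For example, brw_reg_type_from_bit_size(16, BRW_REGISTER_TYPE_F) returns
+ * BRW_REGISTER_TYPE_HF, and brw_reg_type_from_bit_size(64,
+ * BRW_REGISTER_TYPE_UD) returns BRW_REGISTER_TYPE_UQ.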
+ */
+static brw_reg_type
+brw_reg_type_from_bit_size(const unsigned bit_size,
+                           const brw_reg_type reference_type)
+{
+   switch (reference_type) {
+   case BRW_REGISTER_TYPE_HF:
+   case BRW_REGISTER_TYPE_F:
+   case BRW_REGISTER_TYPE_DF:
+      switch (bit_size) {
+      case 16:
+         return BRW_REGISTER_TYPE_HF;
+      case 32:
+         return BRW_REGISTER_TYPE_F;
+      case 64:
+         return BRW_REGISTER_TYPE_DF;
+      default:
+         unreachable("Invalid bit size");
+      }
+   case BRW_REGISTER_TYPE_W:
+   case BRW_REGISTER_TYPE_D:
+   case BRW_REGISTER_TYPE_Q:
+      switch (bit_size) {
+      case 16:
+         return BRW_REGISTER_TYPE_W;
+      case 32:
+         return BRW_REGISTER_TYPE_D;
+      case 64:
+         return BRW_REGISTER_TYPE_Q;
+      default:
+         unreachable("Invalid bit size");
+      }
+   case BRW_REGISTER_TYPE_UW:
+   case BRW_REGISTER_TYPE_UD:
+   case BRW_REGISTER_TYPE_UQ:
+      switch (bit_size) {
+      case 16:
+         return BRW_REGISTER_TYPE_UW;
+      case 32:
+         return BRW_REGISTER_TYPE_UD;
+      case 64:
+         return BRW_REGISTER_TYPE_UQ;
+      default:
+         unreachable("Invalid bit size");
+      }
+   default:
+      unreachable("Unknown type");
+   }
+}
+
 void
 fs_visitor::nir_emit_impl(nir_function_impl *impl)
 {
@@ -255,7 +332,7 @@ fs_visitor::nir_emit_impl(nir_function_impl *impl)
          reg->num_array_elems == 0 ? 1 : reg->num_array_elems;
       unsigned size = array_elems * reg->num_components;
       const brw_reg_type reg_type =
-         reg->bit_size == 32 ? BRW_REGISTER_TYPE_F : BRW_REGISTER_TYPE_DF;
+         brw_reg_type_from_bit_size(reg->bit_size, BRW_REGISTER_TYPE_F);
       nir_locals[reg->index] = bld.vgrf(reg_type, size);
    }
 
@@ -546,6 +623,18 @@ emit_find_msb_using_lzd(const fs_builder &bld,
    inst->src[0].negate = true;
 }
 
+static brw_rnd_mode
+brw_rnd_mode_from_nir_op(const nir_op op)
+{
+   switch (op) {
+   case nir_op_f2f16_rtz:
+      return BRW_RND_MODE_RTZ;
+   case nir_op_f2f16_rtne:
+      return BRW_RND_MODE_RTNE;
+   default:
+      unreachable("Operation doesn't support rounding mode");
+   }
+}
+
 void
 fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
 {
@@ -649,9 +738,34 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
       inst->saturate = instr->dest.saturate;
       break;
 
+   case nir_op_f2f16_rtne:
+   case nir_op_f2f16_rtz:
+      bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
+               brw_imm_d(brw_rnd_mode_from_nir_op(instr->op)));
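+      /* SHADER_OPCODE_RND_MODE updates the floating-point rounding mode
+       * control in cr0, so the MOV emitted below (shared with f2f16_undef
+       * via the fallthrough) converts with the requested rounding.
+       */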
+      /* fallthrough */
+
+      /* In theory, it would be better to use BRW_OPCODE_F32TO16. Depending
+       * on the HW gen, it is a special hw opcode or just a MOV, and
+       * brw_F32TO16 (in brw_eu_emit) would do the work of choosing.
+       *
+       * But if we want to use that opcode, we would need to add support for
+       * it in the various optimizations and lowerings. Since HF support is
+       * currently gen8+ only, it is simpler to use the MOV directly, and to
+       * switch to BRW_OPCODE_F32TO16 when/if we add HF support for gen7.
+       */
+
+   case nir_op_f2f16_undef:
+      inst = bld.MOV(result, op[0]);
+      inst->saturate = instr->dest.saturate;
+      break;
+
    case nir_op_f2f64:
+   case nir_op_f2i64:
+   case nir_op_f2u64:
    case nir_op_i2f64:
+   case nir_op_i2i64:
    case nir_op_u2f64:
+   case nir_op_u2u64:
       /* CHV PRM, vol07, 3D Media GPGPU Engine, Register Region Restrictions:
        *
        *    "When source or destination is 64b (...), regioning in Align1
@@ -661,13 +775,13 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
        *        the same qword.
        *     (...)"
        *
-       * This means that 32-bit to 64-bit conversions need to have the 32-bit
-       * data elements aligned to 64-bit. This restriction does not apply to
-       * BDW and later.
+       * This means that conversions from bit-sizes smaller than 64-bit to
+       * 64-bit need to have the source data elements aligned to 64-bit.
+       * This restriction does not apply to BDW and later.
        */
       if (nir_dest_bit_size(instr->dest.dest) == 64 &&
-          nir_src_bit_size(instr->src[0].src) == 32 &&
-          (devinfo->is_cherryview || devinfo->is_broxton)) {
+          nir_src_bit_size(instr->src[0].src) < 64 &&
+          (devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) {
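+         /* Allocate a temporary with 64-bit-wide elements and write the
+          * source into the low part of each element, so the data feeding
+          * the conversion is qword-aligned as the quote above requires.
+          */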
          fs_reg tmp = bld.vgrf(result.type, 1);
          tmp = subscript(tmp, op[0].type, 0);
          inst = bld.MOV(tmp, op[0]);
@@ -679,12 +793,14 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
    case nir_op_f2f32:
    case nir_op_f2i32:
    case nir_op_f2u32:
-   case nir_op_f2i64:
-   case nir_op_f2u64:
+   case nir_op_f2i16:
+   case nir_op_f2u16:
    case nir_op_i2i32:
-   case nir_op_i2i64:
    case nir_op_u2u32:
-   case nir_op_u2u64:
+   case nir_op_i2i16:
+   case nir_op_u2u16:
+   case nir_op_i2f16:
+   case nir_op_u2f16:
       inst = bld.MOV(result, op[0]);
       inst->saturate = instr->dest.saturate;
       break;
@@ -721,53 +837,21 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
           *
           * - 2-src instructions can't operate with 64-bit immediates
           * - The sign is encoded in the high 32-bit of each DF
-          * - CMP with DF requires special handling in SIMD16
           * - We need to produce a DF result.
           */
 
-         /* 2-src instructions can't have 64-bit immediates, so put 0.0 in
-          * a register and compare with that.
-          */
-         fs_reg tmp = vgrf(glsl_type::double_type);
-         bld.MOV(tmp, setup_imm_df(bld, 0.0));
-
-         /* A direct DF CMP using the flag register (null dst) won't work in
-          * SIMD16 because the CMP will be split in two by lower_simd_width,
-          * resulting in two CMP instructions with the same dst (NULL),
-          * leading to dead code elimination of the first one. In SIMD8,
-          * however, there is no need to split the CMP and we can save some
-          * work.
-          */
-         fs_reg dst_tmp = vgrf(glsl_type::double_type);
-         bld.CMP(dst_tmp, op[0], tmp, BRW_CONDITIONAL_NZ);
-
-         /* In SIMD16 we want to avoid using a NULL dst register with DF CMP,
-          * so we store the result of the comparison in a vgrf instead and
-          * then we generate a UD comparison from that that won't have to
-          * be split by lower_simd_width. This is what NIR does to handle
-          * double comparisons in the general case.
-          */
-         if (bld.dispatch_width() == 16 ) {
-            fs_reg dst_tmp_ud = retype(dst_tmp, BRW_REGISTER_TYPE_UD);
-            bld.MOV(dst_tmp_ud, subscript(dst_tmp, BRW_REGISTER_TYPE_UD, 0));
-            bld.CMP(bld.null_reg_ud(),
-                    dst_tmp_ud, brw_imm_ud(0), BRW_CONDITIONAL_NZ);
-         }
-
-         /* Get the high 32-bit of each double component where the sign is */
-         fs_reg result_int = retype(result, BRW_REGISTER_TYPE_UD);
-         bld.MOV(result_int, subscript(op[0], BRW_REGISTER_TYPE_UD, 1));
+         fs_reg zero = vgrf(glsl_type::double_type);
+         bld.MOV(zero, setup_imm_df(bld, 0.0));
+         bld.CMP(bld.null_reg_df(), op[0], zero, BRW_CONDITIONAL_NZ);
 
-         /* Get the sign bit */
-         bld.AND(result_int, result_int, brw_imm_ud(0x80000000u));
+         bld.MOV(result, zero);
 
-         /* Add 1.0 to the sign, predicated to skip the case of op[0] == 0.0 */
-         inst = bld.OR(result_int, result_int, brw_imm_ud(0x3f800000u));
-         inst->predicate = BRW_PREDICATE_NORMAL;
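+         /* The high dword of a DF holds its sign and exponent.  Copy the
+          * sign bit of op[0] into the high dword of the zeroed result, then
+          * OR in 0x3ff00000 (the high dword of 1.0) predicated on the
+          * comparison above so that fsign(0.0) stays 0.0.
+          */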
+         fs_reg r = subscript(result, BRW_REGISTER_TYPE_UD, 1);
+         bld.AND(r, subscript(op[0], BRW_REGISTER_TYPE_UD, 1),
+                 brw_imm_ud(0x80000000u));
 
-         /* Convert from 32-bit float to 64-bit double */
-         result.type = BRW_REGISTER_TYPE_DF;
-         inst = bld.MOV(result, retype(result_int, BRW_REGISTER_TYPE_F));
+         set_predicate(BRW_PREDICATE_NORMAL,
+                       bld.OR(r, r, brw_imm_ud(0x3ff00000u)));
 
          if (instr->dest.saturate) {
             inst = bld.MOV(result, result);
@@ -777,17 +861,24 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
       break;
    }
 
-   case nir_op_isign:
+   case nir_op_isign: {
       /*  ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
        *               -> non-negative val generates 0x00000000.
        *  Predicated OR sets 1 if val is positive.
        */
-      assert(nir_dest_bit_size(instr->dest.dest) < 64);
-      bld.CMP(bld.null_reg_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_G);
-      bld.ASR(result, op[0], brw_imm_d(31));
-      inst = bld.OR(result, result, brw_imm_d(1));
+      uint32_t bit_size = nir_dest_bit_size(instr->dest.dest);
+      assert(bit_size == 32 || bit_size == 16);
+
+      fs_reg zero = bit_size == 32 ? brw_imm_d(0) : brw_imm_w(0);
+      fs_reg one = bit_size == 32 ? brw_imm_d(1) : brw_imm_w(1);
+      fs_reg shift = bit_size == 32 ? brw_imm_d(31) : brw_imm_w(15);
+
+      bld.CMP(bld.null_reg_d(), op[0], zero, BRW_CONDITIONAL_G);
+      bld.ASR(result, op[0], shift);
+      inst = bld.OR(result, result, one);
       inst->predicate = BRW_PREDICATE_NORMAL;
       break;
+   }
 
    case nir_op_frcp:
       inst = bld.emit(SHADER_OPCODE_RCP, result, op[0]);
@@ -927,9 +1018,11 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
    case nir_op_feq:
    case nir_op_fne: {
       fs_reg dest = result;
-      if (nir_src_bit_size(instr->src[0].src) > 32) {
-         dest = bld.vgrf(BRW_REGISTER_TYPE_DF, 1);
-      }
+
+      const uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
+      if (bit_size != 32)
+         dest = bld.vgrf(op[0].type, 1);
+
       brw_conditional_mod cond;
       switch (instr->op) {
       case nir_op_flt:
@@ -947,9 +1040,19 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
       default:
          unreachable("bad opcode");
       }
+
       bld.CMP(dest, op[0], op[1], cond);
-      if (nir_src_bit_size(instr->src[0].src) > 32) {
+
+      if (bit_size > 32) {
          bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0));
+      } else if (bit_size < 32) {
+         /* When we convert the result to 32-bit we need to be careful and do
+          * it as a signed conversion so the 16-bit true value (0xffff) is
+          * sign-extended to the 32-bit true value (0xffffffff).
+          */
+         const brw_reg_type src_type =
+            brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_D);
+
+         bld.MOV(retype(result, BRW_REGISTER_TYPE_D), retype(dest, src_type));
       }
       break;
    }
@@ -961,9 +1064,10 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
    case nir_op_ieq:
    case nir_op_ine: {
       fs_reg dest = result;
-      if (nir_src_bit_size(instr->src[0].src) > 32) {
-         dest = bld.vgrf(BRW_REGISTER_TYPE_UQ, 1);
-      }
+
+      const uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
+      if (bit_size != 32)
+         dest = bld.vgrf(op[0].type, 1);
 
       brw_conditional_mod cond;
       switch (instr->op) {
@@ -985,8 +1089,17 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
          unreachable("bad opcode");
       }
       bld.CMP(dest, op[0], op[1], cond);
-      if (nir_src_bit_size(instr->src[0].src) > 32) {
+
+      if (bit_size > 32) {
          bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0));
+      } else if (bit_size < 32) {
+         /* When we convert the result to 32-bit we need to be careful and do
+          * it as a signed conversion so the 16-bit true value (0xffff) is
+          * sign-extended to the 32-bit true value (0xffffffff).
+          */
+         const brw_reg_type src_type =
+            brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_D);
+
+         bld.MOV(retype(result, BRW_REGISTER_TYPE_D), retype(dest, src_type));
       }
       break;
    }
@@ -1073,8 +1186,9 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
       break;
 
    case nir_op_i2b:
-   case nir_op_f2b:
-      if (nir_src_bit_size(instr->src[0].src) == 64) {
+   case nir_op_f2b: {
+      uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
+      if (bit_size == 64) {
          /* two-argument instructions can't take 64-bit immediates */
          fs_reg zero;
          fs_reg tmp;
@@ -1082,12 +1196,13 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
          if (instr->op == nir_op_f2b) {
             zero = vgrf(glsl_type::double_type);
             tmp = vgrf(glsl_type::double_type);
+            bld.MOV(zero, setup_imm_df(bld, 0.0));
          } else {
             zero = vgrf(glsl_type::int64_t_type);
             tmp = vgrf(glsl_type::int64_t_type);
+            bld.MOV(zero, brw_imm_q(0));
          }
 
-         bld.MOV(zero, setup_imm_df(bld, 0.0));
          /* A SIMD16 execution needs to be split in two instructions, so use
           * a vgrf instead of the flag register as dst so instruction splitting
           * works
@@ -1095,13 +1210,18 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
          bld.CMP(tmp, op[0], zero, BRW_CONDITIONAL_NZ);
          bld.MOV(result, subscript(tmp, BRW_REGISTER_TYPE_UD, 0));
       } else {
-         if (instr->op == nir_op_f2b) {
-            bld.CMP(result, op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ);
+         fs_reg zero;
+         if (bit_size == 32) {
+            zero = instr->op == nir_op_f2b ? brw_imm_f(0.0f) : brw_imm_d(0);
          } else {
-            bld.CMP(result, op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ);
+            assert(bit_size == 16);
+            zero = instr->op == nir_op_f2b ?
+               retype(brw_imm_w(0), BRW_REGISTER_TYPE_HF) : brw_imm_w(0);
          }
+         bld.CMP(result, op[0], zero, BRW_CONDITIONAL_NZ);
       }
       break;
+   }
 
    case nir_op_ftrunc:
       inst = bld.RNDZ(result, op[0]);
@@ -1194,6 +1314,7 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
       break;
 
    case nir_op_pack_64_2x32_split:
+   case nir_op_pack_32_2x16_split:
       bld.emit(FS_OPCODE_PACK, result, op[0], op[1]);
       break;
 
@@ -1206,6 +1327,15 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
       break;
    }
 
+   case nir_op_unpack_32_2x16_split_x:
+   case nir_op_unpack_32_2x16_split_y: {
+      if (instr->op == nir_op_unpack_32_2x16_split_x)
+         bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UW, 0));
+      else
+         bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UW, 1));
+      break;
+   }
+
    case nir_op_fpow:
       inst = bld.emit(SHADER_OPCODE_POW, result, op[0], op[1]);
       inst->saturate = instr->dest.saturate;
@@ -1295,14 +1425,36 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
       unreachable("not reached: should have been lowered");
 
    case nir_op_ishl:
-      bld.SHL(result, op[0], op[1]);
-      break;
    case nir_op_ishr:
-      bld.ASR(result, op[0], op[1]);
-      break;
-   case nir_op_ushr:
-      bld.SHR(result, op[0], op[1]);
+   case nir_op_ushr: {
+      fs_reg shift_count = op[1];
+
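+      /* On CHV and gen9 LP parts the Align1 regioning restrictions quoted
+       * above for 64-bit operations also apply here, so a variable 32-bit
+       * shift count is first copied into a stride-2 UD register to keep its
+       * elements qword-aligned.
+       */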
+      if (devinfo->is_cherryview || gen_device_info_is_9lp(devinfo)) {
+         if (op[1].file == VGRF &&
+             (result.type == BRW_REGISTER_TYPE_Q ||
+              result.type == BRW_REGISTER_TYPE_UQ)) {
+            shift_count = fs_reg(VGRF, alloc.allocate(dispatch_width / 4),
+                                 BRW_REGISTER_TYPE_UD);
+            shift_count.stride = 2;
+            bld.MOV(shift_count, op[1]);
+         }
+      }
+
+      switch (instr->op) {
+      case nir_op_ishl:
+         bld.SHL(result, op[0], shift_count);
+         break;
+      case nir_op_ishr:
+         bld.ASR(result, op[0], shift_count);
+         break;
+      case nir_op_ushr:
+         bld.SHR(result, op[0], shift_count);
+         break;
+      default:
+         unreachable("not reached");
+      }
       break;
+   }
 
    case nir_op_pack_half_2x16_split:
       bld.emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, result, op[0], op[1]);
@@ -1329,10 +1481,31 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
 
    case nir_op_extract_u8:
    case nir_op_extract_i8: {
-      const brw_reg_type type = brw_int_type(1, instr->op == nir_op_extract_i8);
       nir_const_value *byte = nir_src_as_const_value(instr->src[1].src);
       assert(byte != NULL);
-      bld.MOV(result, subscript(op[0], type, byte->u32[0]));
+
+      /* The PRMs say:
+       *
+       *    BDW+
+       *    There is no direct conversion from B/UB to Q/UQ or Q/UQ to B/UB.
+       *    Use two instructions and a word or DWord intermediate integer type.
+       */
+      if (nir_dest_bit_size(instr->dest.dest) == 64) {
+         const brw_reg_type type = brw_int_type(1, instr->op == nir_op_extract_i8);
+
+         if (instr->op == nir_op_extract_i8) {
+            /* If we need to sign extend, extract to a word first */
+            fs_reg w_temp = bld.vgrf(BRW_REGISTER_TYPE_W);
+            bld.MOV(w_temp, subscript(op[0], type, byte->u32[0]));
+            bld.MOV(result, w_temp);
+         } else if (byte->u32[0] & 1) {
+            /* Extract the high byte from the word containing the desired
+             * byte offset.
+             */
+            bld.SHR(result,
+                    subscript(op[0], BRW_REGISTER_TYPE_UW, byte->u32[0] / 2),
+                    brw_imm_uw(8));
+         } else {
+            /* Otherwise use an AND with 0xff and a word type */
+            bld.AND(result,
+                    subscript(op[0], BRW_REGISTER_TYPE_UW, byte->u32[0] / 2),
+                    brw_imm_uw(0xff));
+         }
+      } else {
+         const brw_reg_type type = brw_int_type(1, instr->op == nir_op_extract_i8);
+         bld.MOV(result, subscript(op[0], type, byte->u32[0]));
+      }
       break;
    }
 
@@ -1366,19 +1539,32 @@ fs_visitor::nir_emit_load_const(const fs_builder &bld,
                                 nir_load_const_instr *instr)
 {
    const brw_reg_type reg_type =
-      instr->def.bit_size == 32 ? BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_DF;
+      brw_reg_type_from_bit_size(instr->def.bit_size, BRW_REGISTER_TYPE_D);
    fs_reg reg = bld.vgrf(reg_type, instr->def.num_components);
 
    switch (instr->def.bit_size) {
+   case 16:
+      for (unsigned i = 0; i < instr->def.num_components; i++)
+         bld.MOV(offset(reg, bld, i), brw_imm_w(instr->value.i16[i]));
+      break;
+
    case 32:
       for (unsigned i = 0; i < instr->def.num_components; i++)
          bld.MOV(offset(reg, bld, i), brw_imm_d(instr->value.i32[i]));
       break;
 
    case 64:
-      for (unsigned i = 0; i < instr->def.num_components; i++)
-         bld.MOV(offset(reg, bld, i),
-                 setup_imm_df(bld, instr->value.f64[i]));
+      assert(devinfo->gen >= 7);
+      if (devinfo->gen == 7) {
+         /* We don't get 64-bit integer types until gen8 */
+         for (unsigned i = 0; i < instr->def.num_components; i++) {
+            bld.MOV(retype(offset(reg, bld, i), BRW_REGISTER_TYPE_DF),
+                    setup_imm_df(bld, instr->value.f64[i]));
+         }
+      } else {
+         for (unsigned i = 0; i < instr->def.num_components; i++)
+            bld.MOV(offset(reg, bld, i), brw_imm_q(instr->value.i64[i]));
+      }
       break;
 
    default:
@@ -1394,8 +1580,8 @@ fs_visitor::get_nir_src(const nir_src &src)
    fs_reg reg;
    if (src.is_ssa) {
       if (src.ssa->parent_instr->type == nir_instr_type_ssa_undef) {
-         const brw_reg_type reg_type = src.ssa->bit_size == 32 ?
-            BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_DF;
+         const brw_reg_type reg_type =
+            brw_reg_type_from_bit_size(src.ssa->bit_size, BRW_REGISTER_TYPE_D);
          reg = bld.vgrf(reg_type, src.ssa->num_components);
       } else {
          reg = nir_ssa_values[src.ssa->index];
@@ -1407,20 +1593,35 @@ fs_visitor::get_nir_src(const nir_src &src)
                    src.reg.base_offset * src.reg.reg->num_components);
    }
 
-   /* to avoid floating-point denorm flushing problems, set the type by
-    * default to D - instructions that need floating point semantics will set
-    * this to F if they need to
-    */
-   return retype(reg, BRW_REGISTER_TYPE_D);
+   if (nir_src_bit_size(src) == 64 && devinfo->gen == 7) {
+      /* The only 64-bit type available on gen7 is DF, so use that. */
+      reg.type = BRW_REGISTER_TYPE_DF;
+   } else {
+      /* To avoid floating-point denorm flushing problems, set the type by
+       * default to an integer type - instructions that need floating point
+       * semantics will set this to F if they need to
+       */
+      reg.type = brw_reg_type_from_bit_size(nir_src_bit_size(src),
+                                            BRW_REGISTER_TYPE_D);
+   }
+
+   return reg;
 }
 
 /**
  * Return an IMM for constants; otherwise call get_nir_src() as normal.
+ *
+ * This function should not be called on any value which may be 64 bits.
+ * We could theoretically support 64-bit on gen8+ but we choose not to
+ * because it wouldn't work in general (no gen7 support) and there are
+ * enough restrictions in 64-bit immediates that you can't take the return
+ * value and treat it the same as the result of get_nir_src().
  */
 fs_reg
 fs_visitor::get_nir_src_imm(const nir_src &src)
 {
    nir_const_value *val = nir_src_as_const_value(src);
+   assert(nir_src_bit_size(src) == 32);
    return val ? fs_reg(brw_imm_d(val->i32[0])) : get_nir_src(src);
 }
 
@@ -1429,7 +1630,7 @@ fs_visitor::get_nir_dest(const nir_dest &dest)
 {
    if (dest.is_ssa) {
       const brw_reg_type reg_type =
-         dest.ssa.bit_size == 32 ? BRW_REGISTER_TYPE_F : BRW_REGISTER_TYPE_DF;
+         brw_reg_type_from_bit_size(dest.ssa.bit_size, BRW_REGISTER_TYPE_F);
       nir_ssa_values[dest.ssa.index] =
          bld.vgrf(reg_type, dest.ssa.num_components);
       return nir_ssa_values[dest.ssa.index];
@@ -1544,23 +1745,23 @@ static unsigned
 get_image_atomic_op(nir_intrinsic_op op, const glsl_type *type)
 {
    switch (op) {
-   case nir_intrinsic_image_atomic_add:
+   case nir_intrinsic_image_var_atomic_add:
       return BRW_AOP_ADD;
-   case nir_intrinsic_image_atomic_min:
+   case nir_intrinsic_image_var_atomic_min:
       return (get_image_base_type(type) == BRW_REGISTER_TYPE_D ?
               BRW_AOP_IMIN : BRW_AOP_UMIN);
-   case nir_intrinsic_image_atomic_max:
+   case nir_intrinsic_image_var_atomic_max:
       return (get_image_base_type(type) == BRW_REGISTER_TYPE_D ?
               BRW_AOP_IMAX : BRW_AOP_UMAX);
-   case nir_intrinsic_image_atomic_and:
+   case nir_intrinsic_image_var_atomic_and:
       return BRW_AOP_AND;
-   case nir_intrinsic_image_atomic_or:
+   case nir_intrinsic_image_var_atomic_or:
       return BRW_AOP_OR;
-   case nir_intrinsic_image_atomic_xor:
+   case nir_intrinsic_image_var_atomic_xor:
       return BRW_AOP_XOR;
-   case nir_intrinsic_image_atomic_exchange:
+   case nir_intrinsic_image_var_atomic_exchange:
       return BRW_AOP_MOV;
-   case nir_intrinsic_image_atomic_comp_swap:
+   case nir_intrinsic_image_var_atomic_comp_swap:
       return BRW_AOP_CMPWR;
    default:
       unreachable("Not reachable.");
@@ -1804,7 +2005,7 @@ fs_visitor::set_gs_stream_control_data_bits(const fs_reg &vertex_count,
    assert(gs_compile->control_data_bits_per_vertex == 2);
 
    /* Must be a valid stream */
-   assert(stream_id >= 0 && stream_id < MAX_VERTEX_STREAMS);
+   assert(stream_id < MAX_VERTEX_STREAMS);
 
    /* Control data bits are initialized to 0 so we don't have to set any
     * bits when sending vertices to stream 0.
@@ -1942,33 +2143,23 @@ fs_visitor::emit_gs_input_load(const fs_reg &dst,
    nir_const_value *offset_const = nir_src_as_const_value(offset_src);
    const unsigned push_reg_count = gs_prog_data->base.urb_read_length * 8;
 
-   /* Offset 0 is the VUE header, which contains VARYING_SLOT_LAYER [.y],
-    * VARYING_SLOT_VIEWPORT [.z], and VARYING_SLOT_PSIZ [.w].  Only
-    * gl_PointSize is available as a GS input, however, so it must be that.
-    */
-   const bool is_point_size = (base_offset == 0);
-
    /* TODO: figure out push input layout for invocations == 1 */
+   /* TODO: make this work with 64-bit inputs */
    if (gs_prog_data->invocations == 1 &&
+       type_sz(dst.type) <= 4 &&
        offset_const != NULL && vertex_const != NULL &&
        4 * (base_offset + offset_const->u32[0]) < push_reg_count) {
       int imm_offset = (base_offset + offset_const->u32[0]) * 4 +
                        vertex_const->u32[0] * push_reg_count;
-      /* This input was pushed into registers. */
-      if (is_point_size) {
-         /* gl_PointSize comes in .w */
-         bld.MOV(dst, fs_reg(ATTR, imm_offset + 3, dst.type));
-      } else {
-         for (unsigned i = 0; i < num_components; i++) {
-            bld.MOV(offset(dst, bld, i),
-                    fs_reg(ATTR, imm_offset + i + first_component, dst.type));
-         }
+      for (unsigned i = 0; i < num_components; i++) {
+         bld.MOV(offset(dst, bld, i),
+                 fs_reg(ATTR, imm_offset + i + first_component, dst.type));
       }
       return;
    }
 
    /* Resort to the pull model.  Ensure the VUE handles are provided. */
-   gs_prog_data->base.include_vue_handles = true;
+   assert(gs_prog_data->base.include_vue_handles);
 
    unsigned first_icp_handle = gs_prog_data->include_primitive_id ? 3 : 2;
    fs_reg icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
@@ -1991,7 +2182,7 @@ fs_visitor::emit_gs_input_load(const fs_reg &dst,
           * by 32 (shifting by 5), and add the two together.  This is
           * the final indirect byte offset.
           */
-         fs_reg sequence = bld.vgrf(BRW_REGISTER_TYPE_W, 1);
+         fs_reg sequence = bld.vgrf(BRW_REGISTER_TYPE_UW, 1);
          fs_reg channel_offsets = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
          fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
          fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
@@ -2114,11 +2305,11 @@ fs_visitor::emit_gs_input_load(const fs_reg &dst,
       }
 
       if (type_sz(dst.type) == 8) {
-         shuffle_32bit_load_result_to_64bit_data(
-            bld, tmp_dst, retype(tmp_dst, BRW_REGISTER_TYPE_F), num_components);
-
-         for (unsigned c = 0; c < num_components; c++)
-            bld.MOV(offset(dst, bld, iter * 2 + c), offset(tmp_dst, bld, c));
+         shuffle_from_32bit_read(bld,
+                                 offset(dst, bld, iter * 2),
+                                 retype(tmp_dst, BRW_REGISTER_TYPE_D),
+                                 0,
+                                 num_components);
       }
 
       if (num_iterations > 1) {
@@ -2132,14 +2323,6 @@ fs_visitor::emit_gs_input_load(const fs_reg &dst,
          }
       }
    }
-
-   if (is_point_size) {
-      /* Read the whole VUE header (because of alignment) and read .w. */
-      fs_reg tmp = bld.vgrf(dst.type, 4);
-      inst->dst = tmp;
-      inst->size_written = 4 * REG_SIZE;
-      bld.MOV(dst, offset(tmp, bld, 3));
-   }
 }
 
 fs_reg
@@ -2167,7 +2350,53 @@ do_untyped_vector_read(const fs_builder &bld,
                        const fs_reg offset_reg,
                        unsigned num_components)
 {
-   if (type_sz(dest.type) == 4) {
+   if (type_sz(dest.type) <= 2) {
+      assert(dest.stride == 1);
+      bool is_const_offset = offset_reg.file == BRW_IMMEDIATE_VALUE;
+
+      if (is_const_offset) {
+         uint32_t start = offset_reg.ud & ~3;
+         uint32_t end = offset_reg.ud + num_components * type_sz(dest.type);
+         end = ALIGN(end, 4);
+         assert(end - start <= 16);
+
+         /* At this point we have 16-bit components with a constant offset,
+          * rounded down/up to a 4-byte aligned [start, end) range that can
+          * be fetched with untyped_read, since that message requires 32-bit
+          * aligned offsets.
+          */
+         unsigned first_component = (offset_reg.ud & 3) / type_sz(dest.type);
+         unsigned num_components_32bit = (end - start) / 4;
+
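+         /* For example, a 3-component 16-bit read at byte offset 2 rounds
+          * down to start = 0 and up to end = 8, reading two dwords with
+          * first_component = 1.
+          */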
+         fs_reg read_result =
+            emit_untyped_read(bld, surf_index, brw_imm_ud(start),
+                              1 /* dims */,
+                              num_components_32bit,
+                              BRW_PREDICATE_NONE);
+         shuffle_from_32bit_read(bld, dest, read_result, first_component,
+                                 num_components);
+      } else {
+         fs_reg read_offset = bld.vgrf(BRW_REGISTER_TYPE_UD);
+         for (unsigned i = 0; i < num_components; i++) {
+            if (i == 0) {
+               bld.MOV(read_offset, offset_reg);
+            } else {
+               bld.ADD(read_offset, offset_reg,
+                       brw_imm_ud(i * type_sz(dest.type)));
+            }
+            /* Non-constant offsets are not guaranteed to be 32-bit aligned,
+             * so each component is read with its own byte_scattered_read
+             * message.
+             */
+            fs_reg read_result =
+               emit_byte_scattered_read(bld, surf_index, read_offset,
+                                        1 /* dims */, 1,
+                                        type_sz(dest.type) * 8 /* bit_size */,
+                                        BRW_PREDICATE_NONE);
+            bld.MOV(offset(dest, bld, i),
+                    subscript(read_result, dest.type, 0));
+         }
+      }
+   } else if (type_sz(dest.type) == 4) {
       fs_reg read_result = emit_untyped_read(bld, surf_index, offset_reg,
                                              1 /* dims */,
                                              num_components,
@@ -2205,16 +2434,8 @@ do_untyped_vector_read(const fs_builder &bld,
                                                 BRW_PREDICATE_NONE);
 
          /* Shuffle the 32-bit load result into valid 64-bit data */
-         const fs_reg packed_result = bld.vgrf(dest.type, iter_components);
-         shuffle_32bit_load_result_to_64bit_data(
-            bld, packed_result, read_result, iter_components);
-
-         /* Move each component to its destination */
-         read_result = retype(read_result, BRW_REGISTER_TYPE_DF);
-         for (int c = 0; c < iter_components; c++) {
-            bld.MOV(offset(dest, bld, it * 2 + c),
-                    offset(packed_result, bld, c));
-         }
+         shuffle_from_32bit_read(bld, offset(dest, bld, it * 2),
+                                 read_result, 0, iter_components);
 
          bld.ADD(read_offset, read_offset, brw_imm_ud(16));
       }
@@ -2235,10 +2456,10 @@ fs_visitor::nir_emit_vs_intrinsic(const fs_builder &bld,
 
    switch (instr->intrinsic) {
    case nir_intrinsic_load_vertex_id:
-      unreachable("should be lowered by lower_vertex_id()");
+   case nir_intrinsic_load_base_vertex:
+      unreachable("should be lowered by nir_lower_system_values()");
 
    case nir_intrinsic_load_vertex_id_zero_base:
-   case nir_intrinsic_load_base_vertex:
    case nir_intrinsic_load_instance_id:
    case nir_intrinsic_load_base_instance:
    case nir_intrinsic_load_draw_id: {
@@ -2254,33 +2475,26 @@ fs_visitor::nir_emit_vs_intrinsic(const fs_builder &bld,
       fs_reg src = fs_reg(ATTR, nir_intrinsic_base(instr) * 4, dest.type);
       unsigned first_component = nir_intrinsic_component(instr);
       unsigned num_components = instr->num_components;
-      enum brw_reg_type type = dest.type;
 
       nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
       assert(const_offset && "Indirect input loads not allowed");
       src = offset(src, bld, const_offset->u32[0]);
 
-      for (unsigned j = 0; j < num_components; j++) {
-         bld.MOV(offset(dest, bld, j), offset(src, bld, j + first_component));
-      }
+      if (type_sz(dest.type) == 8)
+         first_component /= 2;
 
-      if (type == BRW_REGISTER_TYPE_DF) {
-         /* Once the double vector is read, set again its original register
-          * type to continue with normal execution.
-          */
-         src = retype(src, type);
-         dest = retype(dest, type);
-      }
-
-      if (type_sz(src.type) == 8) {
-         shuffle_32bit_load_result_to_64bit_data(bld,
-                                                 dest,
-                                                 retype(dest, BRW_REGISTER_TYPE_F),
-                                                 instr->num_components);
-      }
+      /* For 16-bit support a temporary may be needed to copy from the ATTR
+       * file.
+       */
+      shuffle_from_32bit_read(bld, dest, retype(src, BRW_REGISTER_TYPE_D),
+                              first_component, num_components);
       break;
    }
 
+   case nir_intrinsic_load_first_vertex:
+   case nir_intrinsic_load_is_indexed_draw:
+      unreachable("lowered by brw_nir_lower_vs_inputs");
+
    default:
       nir_emit_intrinsic(bld, instr);
       break;
@@ -2451,13 +2665,10 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
           * or SSBOs.
           */
          if (type_sz(dst.type) == 8) {
-            shuffle_32bit_load_result_to_64bit_data(
-               bld, dst, retype(dst, BRW_REGISTER_TYPE_F), num_components);
-
-            for (unsigned c = 0; c < num_components; c++) {
-               bld.MOV(offset(orig_dst, bld, iter * 2 + c),
-                       offset(dst, bld, c));
-            }
+            shuffle_from_32bit_read(bld,
+                                    offset(orig_dst, bld, iter * 2),
+                                    retype(dst, BRW_REGISTER_TYPE_D),
+                                    0, num_components);
          }
 
          /* Copy the temporary to the destination to deal with writemasking.
@@ -2552,7 +2763,6 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
          instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size) == 64;
       fs_reg indirect_offset = get_indirect_offset(instr);
       unsigned imm_offset = instr->const_index[0];
-      unsigned swiz = BRW_SWIZZLE_XYZW;
       unsigned mask = instr->const_index[1];
       unsigned header_regs = 0;
       fs_reg srcs[7];
@@ -2582,13 +2792,6 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
          }
       }
 
-      /* 64-bit data needs to me shuffled before we can write it to the URB.
-       * We will use this temporary to shuffle the components in each
-       * iteration.
-       */
-      fs_reg tmp =
-         fs_reg(VGRF, alloc.allocate(2 * iter_components), value.type);
-
       mask = mask << first_component;
 
       for (unsigned iter = 0; iter < num_iterations; iter++) {
@@ -2632,26 +2835,17 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
                continue;
 
             if (!is_64bit) {
-               srcs[header_regs + i + first_component] =
-                  offset(value, bld, BRW_GET_SWZ(swiz, i));
+               srcs[header_regs + i + first_component] = offset(value, bld, i);
             } else {
                /* We need to shuffle the 64-bit data to match the layout
                 * expected by our 32-bit URB write messages. We use a temporary
                 * for that.
                 */
-               unsigned channel = BRW_GET_SWZ(swiz, iter * 2 + i);
-               shuffle_64bit_data_for_32bit_write(bld,
-                  retype(offset(tmp, bld, 2 * i), BRW_REGISTER_TYPE_F),
-                  retype(offset(value, bld, 2 * channel), BRW_REGISTER_TYPE_DF),
-                  1);
-
-               /* Now copy the data to the destination */
-               fs_reg dest = fs_reg(VGRF, alloc.allocate(2), value.type);
-               unsigned idx = 2 * i;
-               bld.MOV(dest, offset(tmp, bld, idx));
-               bld.MOV(offset(dest, bld, 1), offset(tmp, bld, idx + 1));
-               srcs[header_regs + idx + first_component * 2] = dest;
-               srcs[header_regs + idx + 1 + first_component * 2] =
+               unsigned channel = iter * 2 + i;
+               fs_reg dest = shuffle_for_32bit_write(bld, value, channel, 1);
+
+               srcs[header_regs + (i + first_component) * 2] = dest;
+               srcs[header_regs + (i + first_component) * 2 + 1] =
                   offset(dest, bld, 1);
             }
          }
@@ -2721,17 +2915,22 @@ fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld,
          /* Arbitrarily only push up to 32 vec4 slots worth of data,
           * which is 16 registers (since each holds 2 vec4 slots).
           */
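+         /* A 64-bit input with more than two components (dvec3/dvec4)
+          * occupies two vec4 slots.
+          */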
+         unsigned slot_count = 1;
+         if (type_sz(dest.type) == 8 && instr->num_components > 2)
+            slot_count++;
+
          const unsigned max_push_slots = 32;
-         if (imm_offset < max_push_slots) {
+         if (imm_offset + slot_count <= max_push_slots) {
             fs_reg src = fs_reg(ATTR, imm_offset / 2, dest.type);
             for (int i = 0; i < instr->num_components; i++) {
                unsigned comp = 16 / type_sz(dest.type) * (imm_offset % 2) +
                   i + first_component;
                bld.MOV(offset(dest, bld, i), component(src, comp));
             }
+
             tes_prog_data->base.urb_read_length =
                MAX2(tes_prog_data->base.urb_read_length,
-                    DIV_ROUND_UP(imm_offset + 1, 2));
+                    DIV_ROUND_UP(imm_offset + slot_count, 2));
          } else {
             /* Replicate the patch handle to all enabled channels */
             const fs_reg srcs[] = {
@@ -2811,13 +3010,10 @@ fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld,
              * or SSBOs.
              */
             if (type_sz(dest.type) == 8) {
-               shuffle_32bit_load_result_to_64bit_data(
-                  bld, dest, retype(dest, BRW_REGISTER_TYPE_F), num_components);
-
-               for (unsigned c = 0; c < num_components; c++) {
-                  bld.MOV(offset(orig_dest, bld, iter * 2 + c),
-                          offset(dest, bld, c));
-               }
+               shuffle_from_32bit_read(bld,
+                                       offset(orig_dest, bld, iter * 2),
+                                       retype(dest, BRW_REGISTER_TYPE_D),
+                                       0, num_components);
             }
 
             /* If we are loading double data and we need a second read message
@@ -3154,15 +3350,16 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
    case nir_intrinsic_load_input: {
       /* load_input is only used for flat inputs */
       unsigned base = nir_intrinsic_base(instr);
-      unsigned component = nir_intrinsic_component(instr);
+      unsigned comp = nir_intrinsic_component(instr);
       unsigned num_components = instr->num_components;
+      fs_reg orig_dest = dest;
       enum brw_reg_type type = dest.type;
 
       /* Special case fields in the VUE header */
       if (base == VARYING_SLOT_LAYER)
-         component = 1;
+         comp = 1;
       else if (base == VARYING_SLOT_VIEWPORT)
-         component = 2;
+         comp = 2;
 
       if (nir_dest_bit_size(instr->dest) == 64) {
          /* const_index is in 32-bit type size units that could not be aligned
@@ -3171,20 +3368,17 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
           */
          type = BRW_REGISTER_TYPE_F;
          num_components *= 2;
+         dest = bld.vgrf(type, num_components);
       }
 
       for (unsigned int i = 0; i < num_components; i++) {
-         struct brw_reg interp = interp_reg(base, component + i);
-         interp = suboffset(interp, 3);
-         bld.emit(FS_OPCODE_CINTERP, offset(retype(dest, type), bld, i),
-                  retype(fs_reg(interp), type));
+         bld.MOV(offset(retype(dest, type), bld, i),
+                 retype(component(interp_reg(base, comp + i), 3), type));
       }
 
       if (nir_dest_bit_size(instr->dest) == 64) {
-         shuffle_32bit_load_result_to_64bit_data(bld,
-                                                 dest,
-                                                 retype(dest, type),
-                                                 instr->num_components);
+         shuffle_from_32bit_read(bld, orig_dest, dest, 0,
+                                 instr->num_components);
       }
       break;
    }
@@ -3350,8 +3544,8 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
 
       for (unsigned int i = 0; i < instr->num_components; i++) {
          fs_reg interp =
-            fs_reg(interp_reg(nir_intrinsic_base(instr),
-                              nir_intrinsic_component(instr) + i));
+            component(interp_reg(nir_intrinsic_base(instr),
+                                 nir_intrinsic_component(instr) + i), 0);
          interp.type = BRW_REGISTER_TYPE_F;
          dest.type = BRW_REGISTER_TYPE_F;
 
@@ -3389,6 +3583,10 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld,
       cs_prog_data->uses_barrier = true;
       break;
 
+   case nir_intrinsic_load_subgroup_id:
+      bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), subgroup_id);
+      break;
+
    case nir_intrinsic_load_local_invocation_id:
    case nir_intrinsic_load_work_group_id: {
       gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
@@ -3496,18 +3694,10 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld,
        * expected by our 32-bit write messages.
        */
       unsigned type_size = 4;
-      unsigned bit_size = instr->src[0].is_ssa ?
-         instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size;
-      if (bit_size == 64) {
+      if (nir_src_bit_size(instr->src[0]) == 64) {
          type_size = 8;
-         fs_reg tmp =
-           fs_reg(VGRF, alloc.allocate(alloc.sizes[val_reg.nr]), val_reg.type);
-         shuffle_64bit_data_for_32bit_write(
-            bld,
-            retype(tmp, BRW_REGISTER_TYPE_F),
-            retype(val_reg, BRW_REGISTER_TYPE_DF),
-            instr->num_components);
-         val_reg = tmp;
+         val_reg = shuffle_for_32bit_write(bld, val_reg, 0,
+                                           instr->num_components);
       }
 
       unsigned type_slots = type_size / 4;
@@ -3560,6 +3750,71 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld,
    }
 }
 
+static fs_reg
+brw_nir_reduction_op_identity(const fs_builder &bld,
+                              nir_op op, brw_reg_type type)
+{
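+   /* nir_alu_binop_identity() supplies the operation's identity value, e.g.
+    * 0 for iadd, 1 for imul, ~0 for iand and +INF for fmin; it is then
+    * materialized as an immediate of the requested register type.
+    */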
+   nir_const_value value = nir_alu_binop_identity(op, type_sz(type) * 8);
+   switch (type_sz(type)) {
+   case 2:
+      assert(type != BRW_REGISTER_TYPE_HF);
+      return retype(brw_imm_uw(value.u16[0]), type);
+   case 4:
+      return retype(brw_imm_ud(value.u32[0]), type);
+   case 8:
+      if (type == BRW_REGISTER_TYPE_DF)
+         return setup_imm_df(bld, value.f64[0]);
+      else
+         return retype(brw_imm_u64(value.u64[0]), type);
+   default:
+      unreachable("Invalid type size");
+   }
+}
+
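+/* Translate a NIR reduction opcode to the hardware opcode that implements a
+ * single step of it.  min/max map to SEL, with the companion function below
+ * supplying the conditional mod (L or GE) that makes the SEL behave as a min
+ * or a max.
+ */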
+static opcode
+brw_op_for_nir_reduction_op(nir_op op)
+{
+   switch (op) {
+   case nir_op_iadd: return BRW_OPCODE_ADD;
+   case nir_op_fadd: return BRW_OPCODE_ADD;
+   case nir_op_imul: return BRW_OPCODE_MUL;
+   case nir_op_fmul: return BRW_OPCODE_MUL;
+   case nir_op_imin: return BRW_OPCODE_SEL;
+   case nir_op_umin: return BRW_OPCODE_SEL;
+   case nir_op_fmin: return BRW_OPCODE_SEL;
+   case nir_op_imax: return BRW_OPCODE_SEL;
+   case nir_op_umax: return BRW_OPCODE_SEL;
+   case nir_op_fmax: return BRW_OPCODE_SEL;
+   case nir_op_iand: return BRW_OPCODE_AND;
+   case nir_op_ior:  return BRW_OPCODE_OR;
+   case nir_op_ixor: return BRW_OPCODE_XOR;
+   default:
+      unreachable("Invalid reduction operation");
+   }
+}
+
+static brw_conditional_mod
+brw_cond_mod_for_nir_reduction_op(nir_op op)
+{
+   switch (op) {
+   case nir_op_iadd: return BRW_CONDITIONAL_NONE;
+   case nir_op_fadd: return BRW_CONDITIONAL_NONE;
+   case nir_op_imul: return BRW_CONDITIONAL_NONE;
+   case nir_op_fmul: return BRW_CONDITIONAL_NONE;
+   case nir_op_imin: return BRW_CONDITIONAL_L;
+   case nir_op_umin: return BRW_CONDITIONAL_L;
+   case nir_op_fmin: return BRW_CONDITIONAL_L;
+   case nir_op_imax: return BRW_CONDITIONAL_GE;
+   case nir_op_umax: return BRW_CONDITIONAL_GE;
+   case nir_op_fmax: return BRW_CONDITIONAL_GE;
+   case nir_op_iand: return BRW_CONDITIONAL_NONE;
+   case nir_op_ior:  return BRW_CONDITIONAL_NONE;
+   case nir_op_ixor: return BRW_CONDITIONAL_NONE;
+   default:
+      unreachable("Invalid reduction operation");
+   }
+}
+
 void
 fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr)
 {
@@ -3568,67 +3823,20 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       dest = get_nir_dest(instr->dest);
 
    switch (instr->intrinsic) {
-   case nir_intrinsic_atomic_counter_inc:
-   case nir_intrinsic_atomic_counter_dec:
-   case nir_intrinsic_atomic_counter_read:
-   case nir_intrinsic_atomic_counter_add:
-   case nir_intrinsic_atomic_counter_min:
-   case nir_intrinsic_atomic_counter_max:
-   case nir_intrinsic_atomic_counter_and:
-   case nir_intrinsic_atomic_counter_or:
-   case nir_intrinsic_atomic_counter_xor:
-   case nir_intrinsic_atomic_counter_exchange:
-   case nir_intrinsic_atomic_counter_comp_swap: {
-      if (stage == MESA_SHADER_FRAGMENT &&
-          instr->intrinsic != nir_intrinsic_atomic_counter_read)
-         brw_wm_prog_data(prog_data)->has_side_effects = true;
-
-      /* Get some metadata from the image intrinsic. */
-      const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
-
-      /* Get the arguments of the atomic intrinsic. */
-      const fs_reg offset = get_nir_src(instr->src[0]);
-      const unsigned surface = (stage_prog_data->binding_table.abo_start +
-                                instr->const_index[0]);
-      const fs_reg src0 = (info->num_srcs >= 2
-                           ? get_nir_src(instr->src[1]) : fs_reg());
-      const fs_reg src1 = (info->num_srcs >= 3
-                           ? get_nir_src(instr->src[2]) : fs_reg());
-      fs_reg tmp;
-
-      assert(info->num_srcs <= 3);
-
-      /* Emit a surface read or atomic op. */
-      if (instr->intrinsic == nir_intrinsic_atomic_counter_read) {
-         tmp = emit_untyped_read(bld, brw_imm_ud(surface), offset, 1, 1);
-      } else {
-         tmp = emit_untyped_atomic(bld, brw_imm_ud(surface), offset, src0,
-                                   src1, 1, 1,
-                                   get_atomic_counter_op(instr->intrinsic));
-      }
-
-      /* Assign the result. */
-      bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), tmp);
-
-      /* Mark the surface as used. */
-      brw_mark_surface_used(stage_prog_data, surface);
-      break;
-   }
-
-   case nir_intrinsic_image_load:
-   case nir_intrinsic_image_store:
-   case nir_intrinsic_image_atomic_add:
-   case nir_intrinsic_image_atomic_min:
-   case nir_intrinsic_image_atomic_max:
-   case nir_intrinsic_image_atomic_and:
-   case nir_intrinsic_image_atomic_or:
-   case nir_intrinsic_image_atomic_xor:
-   case nir_intrinsic_image_atomic_exchange:
-   case nir_intrinsic_image_atomic_comp_swap: {
+   case nir_intrinsic_image_var_load:
+   case nir_intrinsic_image_var_store:
+   case nir_intrinsic_image_var_atomic_add:
+   case nir_intrinsic_image_var_atomic_min:
+   case nir_intrinsic_image_var_atomic_max:
+   case nir_intrinsic_image_var_atomic_and:
+   case nir_intrinsic_image_var_atomic_or:
+   case nir_intrinsic_image_var_atomic_xor:
+   case nir_intrinsic_image_var_atomic_exchange:
+   case nir_intrinsic_image_var_atomic_comp_swap: {
       using namespace image_access;
 
       if (stage == MESA_SHADER_FRAGMENT &&
-          instr->intrinsic != nir_intrinsic_image_load)
+          instr->intrinsic != nir_intrinsic_image_var_load)
          brw_wm_prog_data(prog_data)->has_side_effects = true;
 
       /* Get the referenced image variable and type. */
@@ -3641,6 +3849,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       const unsigned arr_dims = type->sampler_array ? 1 : 0;
       const unsigned surf_dims = type->coordinate_components() - arr_dims;
       const unsigned format = var->data.image.format;
+      const unsigned dest_components = nir_intrinsic_dest_components(instr);
 
       /* Get the arguments of the image intrinsic. */
       const fs_reg image = get_nir_image_deref(instr->variables[0]);
@@ -3655,22 +3864,23 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       fs_reg tmp;
 
       /* Emit an image load, store or atomic op. */
-      if (instr->intrinsic == nir_intrinsic_image_load)
+      if (instr->intrinsic == nir_intrinsic_image_var_load)
          tmp = emit_image_load(bld, image, addr, surf_dims, arr_dims, format);
 
-      else if (instr->intrinsic == nir_intrinsic_image_store)
+      else if (instr->intrinsic == nir_intrinsic_image_var_store)
          emit_image_store(bld, image, addr, src0, surf_dims, arr_dims,
                           var->data.image.write_only ? GL_NONE : format);
 
       else
          tmp = emit_image_atomic(bld, image, addr, src0, src1,
-                                 surf_dims, arr_dims, info->dest_components,
+                                 surf_dims, arr_dims, dest_components,
                                  get_image_atomic_op(instr->intrinsic, type));
 
       /* Assign the result. */
-      for (unsigned c = 0; c < info->dest_components; ++c)
+      for (unsigned c = 0; c < dest_components; ++c) {
          bld.MOV(offset(retype(dest, base_type), bld, c),
-                 offset(tmp, bld, c));
+                 offset(tmp, bld, c));
+      }
       break;
    }
 
@@ -3717,7 +3927,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       break;
    }
 
-   case nir_intrinsic_image_size: {
+   case nir_intrinsic_image_var_size: {
       /* Get the referenced image variable and type. */
       const nir_variable *var = instr->variables[0]->var;
       const glsl_type *type = var->type->without_array();
@@ -3761,22 +3971,27 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       break;
    }
 
-   case nir_intrinsic_image_samples:
+   case nir_intrinsic_image_var_samples:
       /* The driver does not support multi-sampled images. */
       bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), brw_imm_d(1));
       break;
 
    case nir_intrinsic_load_uniform: {
-      /* Offsets are in bytes but they should always be multiples of 4 */
-      assert(instr->const_index[0] % 4 == 0);
+      /* Offsets are in bytes but they should always be aligned to the
+       * type size.
+       */
+      assert(instr->const_index[0] % 4 == 0 ||
+             instr->const_index[0] % type_sz(dest.type) == 0);
 
       fs_reg src(UNIFORM, instr->const_index[0] / 4, dest.type);
 
       nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
       if (const_offset) {
-         /* Offsets are in bytes but they should always be multiples of 4 */
-         assert(const_offset->u32[0] % 4 == 0);
-         src.offset = const_offset->u32[0];
+         assert(const_offset->u32[0] % type_sz(dest.type) == 0);
+         /* For 16-bit types we add const_index[0] modulo 4 so that elements
+          * that are not 32-bit aligned can be accessed.
+          */
+         src.offset = const_offset->u32[0] + instr->const_index[0] % 4;
 
          for (unsigned j = 0; j < instr->num_components; j++) {
             bld.MOV(offset(dest, bld, j), offset(src, bld, j));
@@ -3796,7 +4011,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
             (instr->num_components - 1) * type_sz(dest.type);
 
          bool supports_64bit_indirects =
-            !devinfo->is_cherryview && !devinfo->is_broxton;
+            !devinfo->is_cherryview && !gen_device_info_is_9lp(devinfo);
 
          if (type_sz(dest.type) != 8 || supports_64bit_indirects) {
             for (unsigned j = 0; j < instr->num_components; j++) {
@@ -3870,6 +4085,34 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
           * and we have to split it if necessary.
           */
          const unsigned type_size = type_sz(dest.type);
+
+         /* See if we've selected this as a push constant candidate */
+         if (const_index) {
+            const unsigned ubo_block = const_index->u32[0];
+            const unsigned offset_256b = const_offset->u32[0] / 32;
+
+            fs_reg push_reg;
+            for (int i = 0; i < 4; i++) {
+               const struct brw_ubo_range *range = &prog_data->ubo_ranges[i];
+               if (range->block == ubo_block &&
+                   offset_256b >= range->start &&
+                   offset_256b < range->start + range->length) {
+
+                  push_reg = fs_reg(UNIFORM, UBO_START + i, dest.type);
+                  push_reg.offset = const_offset->u32[0] - 32 * range->start;
+                  break;
+               }
+            }
+
+            if (push_reg.file != BAD_FILE) {
+               for (unsigned i = 0; i < instr->num_components; i++) {
+                  bld.MOV(offset(dest, bld, i),
+                          byte_offset(push_reg, i * type_size));
+               }
+               break;
+            }
+         }
+
          const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
          const fs_builder ubld = bld.exec_all().group(block_sz / 4, 0);
          const fs_reg packed_consts = ubld.vgrf(BRW_REGISTER_TYPE_UD);
@@ -3927,7 +4170,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       if (const_offset) {
          offset_reg = brw_imm_ud(const_offset->u32[0]);
       } else {
-         offset_reg = get_nir_src(instr->src[1]);
+         offset_reg = retype(get_nir_src(instr->src[1]), BRW_REGISTER_TYPE_UD);
       }
 
       /* Read the vector */
@@ -3974,40 +4217,72 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
        * Also, we have to shuffle 64-bit data to be in the appropriate layout
        * expected by our 32-bit write messages.
        */
-      unsigned type_size = 4;
-      unsigned bit_size = instr->src[0].is_ssa ?
-         instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size;
-      if (bit_size == 64) {
-         type_size = 8;
-         fs_reg tmp =
-           fs_reg(VGRF, alloc.allocate(alloc.sizes[val_reg.nr]), val_reg.type);
-         shuffle_64bit_data_for_32bit_write(bld,
-            retype(tmp, BRW_REGISTER_TYPE_F),
-            retype(val_reg, BRW_REGISTER_TYPE_DF),
-            instr->num_components);
-         val_reg = tmp;
-      }
-
-      unsigned type_slots = type_size / 4;
+      unsigned bit_size = nir_src_bit_size(instr->src[0]);
+      unsigned type_size = bit_size / 8;
 
       /* Combine groups of consecutive enabled channels in one write
        * message. We use ffs to find the first enabled channel and then ffs on
-       * the bit-inverse, down-shifted writemask to determine the length of
-       * the block of enabled bits.
+       * the bit-inverse, down-shifted writemask to determine the
+       * num_components of the block of enabled bits.
        */
       while (writemask) {
          unsigned first_component = ffs(writemask) - 1;
-         unsigned length = ffs(~(writemask >> first_component)) - 1;
+         unsigned num_components = ffs(~(writemask >> first_component)) - 1;
+         fs_reg write_src = offset(val_reg, bld, first_component);
 
-         /* We can't write more than 2 64-bit components at once. Limit the
-          * length of the write to what we can do and let the next iteration
-          * handle the rest
-          */
-         if (type_size > 4)
-            length = MIN2(2, length);
+         nir_const_value *const_offset = nir_src_as_const_value(instr->src[2]);
+
+         if (type_size > 4) {
+            /* We can't write more than 2 64-bit components at once. Limit
+             * the num_components of the write to what we can do and let the
+             * next iteration handle the rest.
+             */
+            num_components = MIN2(2, num_components);
+            write_src = shuffle_for_32bit_write(bld, write_src, 0,
+                                                num_components);
+         } else if (type_size < 4) {
+            assert(type_size == 2);
+            /* For 16-bit types we pack two consecutive values into a 32-bit
+             * word and use an untyped write message. For single values or
+             * non-32-bit-aligned offsets we need to use byte-scattered
+             * writes, because untyped writes only work with 32-bit
+             * components at 32-bit alignment, and byte_scattered_write
+             * messages only support one 16-bit component at a time. Since
+             * VK_KHR_relaxed_block_layout may be enabled, we cannot
+             * guarantee that offsets, even constant ones, are 32-bit
+             * aligned for 16-bit types; for example, an array of 16-bit
+             * vec3s has an array element stride of 6 bytes.
+             *
+             * In the case of 32-bit-aligned constant offsets, if there is
+             * a 3-component vector we submit one untyped-write message
+             * of 32 bits (the first two components) and one byte-scattered
+             * write message (the last component).
+             */
+
+            if (!const_offset ||
+                ((const_offset->u32[0] + type_size * first_component) % 4)) {
+               /* If we use a .yz writemask we also need to emit 2
+                * byte-scattered write messages because the y component is
+                * not 32-bit aligned.
+                */
+               num_components = 1;
+            } else if (num_components > 2 && (num_components % 2)) {
+               /* If there is an odd number of consecutive components we
+                * leave the unpaired component for a following emit with
+                * num_components == 1 using byte_scattered_write.
+                */
+               num_components--;
+            }
+            /* For num_components == 1 we are also shuffling the component
+             * because byte-scattered writes of 16-bit values need them to be
+             * dword aligned. Shuffling only one component would be the same
+             * as striding it.
+             */
+            write_src = shuffle_for_32bit_write(bld, write_src, 0,
+                                                num_components);
+         }
 
          fs_reg offset_reg;
-         nir_const_value *const_offset = nir_src_as_const_value(instr->src[2]);
+
          if (const_offset) {
             offset_reg = brw_imm_ud(const_offset->u32[0] +
                                     type_size * first_component);
@@ -4018,16 +4293,35 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
                     brw_imm_ud(type_size * first_component));
          }
 
-
-         emit_untyped_write(bld, surf_index, offset_reg,
-                            offset(val_reg, bld, first_component * type_slots),
-                            1 /* dims */, length * type_slots,
-                            BRW_PREDICATE_NONE);
+         if (type_size < 4 && num_components == 1) {
+            assert(type_size == 2);
+            /* Untyped surface messages have a fixed 32-bit size, so we need
+             * to rely on byte-scattered writes in order to write 16-bit
+             * elements. The byte_scattered_write message requires every
+             * written 16-bit value to be 32-bit aligned (stride = 2).
+             */
+            emit_byte_scattered_write(bld, surf_index, offset_reg,
+                                      write_src,
+                                      1 /* dims */, 1,
+                                      bit_size,
+                                      BRW_PREDICATE_NONE);
+         } else {
+            assert(num_components * type_size <= 16);
+            assert((num_components * type_size) % 4 == 0);
+            assert(offset_reg.file != BRW_IMMEDIATE_VALUE ||
+                   offset_reg.ud % 4 == 0);
+            unsigned num_slots = (num_components * type_size) / 4;
+
+            emit_untyped_write(bld, surf_index, offset_reg,
+                               write_src,
+                               1 /* dims */, num_slots,
+                               BRW_PREDICATE_NONE);
+         }
 
          /* Clear the bits in the writemask that we just wrote, then try
           * again to see if more channels are left.
           */
-         writemask &= (15 << (first_component + length));
+         writemask &= (15 << (first_component + num_components));
       }
       break;
    }
@@ -4037,23 +4331,16 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
 
       nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
       assert(const_offset && "Indirect output stores not allowed");
-      fs_reg new_dest = retype(offset(outputs[instr->const_index[0]], bld,
-                                      4 * const_offset->u32[0]), src.type);
 
       unsigned num_components = instr->num_components;
       unsigned first_component = nir_intrinsic_component(instr);
-      unsigned bit_size = instr->src[0].is_ssa ?
-         instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size;
-      if (bit_size == 64) {
-         fs_reg tmp =
-            fs_reg(VGRF, alloc.allocate(2 * num_components),
-                   BRW_REGISTER_TYPE_F);
-         shuffle_64bit_data_for_32bit_write(
-            bld, tmp, retype(src, BRW_REGISTER_TYPE_DF), num_components);
-         src = retype(tmp, src.type);
+      if (nir_src_bit_size(instr->src[0]) == 64) {
+         src = shuffle_for_32bit_write(bld, src, 0, num_components);
          num_components *= 2;
       }
 
+      fs_reg new_dest = retype(offset(outputs[instr->const_index[0]], bld,
+                                      4 * const_offset->u32[0]), src.type);
       for (unsigned j = 0; j < num_components; j++) {
          bld.MOV(offset(new_dest, bld, j + first_component),
                  offset(src, bld, j));
@@ -4112,29 +4399,417 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       ubld.MOV(src_payload, brw_imm_d(0));
 
       const unsigned index = prog_data->binding_table.ssbo_start + ssbo_index;
-      fs_inst *inst = ubld.emit(FS_OPCODE_GET_BUFFER_SIZE, ret_payload,
+      fs_inst *inst = ubld.emit(SHADER_OPCODE_GET_BUFFER_SIZE, ret_payload,
                                 src_payload, brw_imm_ud(index));
       inst->header_size = 0;
       inst->mlen = 1;
       inst->size_written = 4 * REG_SIZE;
 
-      bld.MOV(retype(dest, ret_payload.type), component(ret_payload, 0));
+      /* SKL PRM, vol07, 3D Media GPGPU Engine, Bounds Checking and Faulting:
+       *
+       * "Out-of-bounds checking is always performed at a DWord granularity. If
+       * any part of the DWord is out-of-bounds then the whole DWord is
+       * considered out-of-bounds."
+       *
+       * This implies that types smaller than 4 bytes need to be padded if
+       * they don't complete the last dword of the buffer. But since we need
+       * to return the original size in order to know the number of elements
+       * of an unsized array, we reverse the padding calculation here. The
+       * padding needed for the buffer is stored in the two low bits of the
+       * surface size, so we recover the original buffer_size by reversing
+       * the surface_size calculation:
+       *
+       * surface_size = isl_align(buffer_size, 4) +
+       *                (isl_align(buffer_size, 4) - buffer_size)
+       *
+       * buffer_size = (surface_size & ~3) - (surface_size & 3)
+       */
+
+      fs_reg size_aligned4 = ubld.vgrf(BRW_REGISTER_TYPE_UD);
+      fs_reg size_padding = ubld.vgrf(BRW_REGISTER_TYPE_UD);
+      fs_reg buffer_size = ubld.vgrf(BRW_REGISTER_TYPE_UD);
+
+      ubld.AND(size_padding, ret_payload, brw_imm_ud(3));
+      ubld.AND(size_aligned4, ret_payload, brw_imm_ud(~3));
+      ubld.ADD(buffer_size, size_aligned4, negate(size_padding));
+
+      bld.MOV(retype(dest, ret_payload.type), component(buffer_size, 0));
+
       brw_mark_surface_used(prog_data, index);
       break;
    }
 
-   case nir_intrinsic_load_channel_num: {
-      fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UW);
-      dest = retype(dest, BRW_REGISTER_TYPE_UD);
-      const fs_builder allbld8 = bld.group(8, 0).exec_all();
-      allbld8.MOV(tmp, brw_imm_v(0x76543210));
-      if (dispatch_width > 8)
-         allbld8.ADD(byte_offset(tmp, 16), tmp, brw_imm_uw(8u));
-      if (dispatch_width > 16) {
-         const fs_builder allbld16 = bld.group(16, 0).exec_all();
-         allbld16.ADD(byte_offset(tmp, 32), tmp, brw_imm_uw(16u));
+   case nir_intrinsic_load_subgroup_invocation:
+      bld.MOV(retype(dest, BRW_REGISTER_TYPE_D),
+              nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]);
+      break;
+
+   case nir_intrinsic_load_subgroup_eq_mask:
+   case nir_intrinsic_load_subgroup_ge_mask:
+   case nir_intrinsic_load_subgroup_gt_mask:
+   case nir_intrinsic_load_subgroup_le_mask:
+   case nir_intrinsic_load_subgroup_lt_mask:
+      unreachable("not reached");
+
+   case nir_intrinsic_vote_any: {
+      const fs_builder ubld = bld.exec_all().group(1, 0);
+
+      /* The any/all predicates do not consider channel enables. To prevent
+       * dead channels from affecting the result, we initialize the flag with
+       * the identity value for the logical operation.
+       */
+      if (dispatch_width == 32) {
+         /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
+         ubld.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD),
+                         brw_imm_ud(0));
+      } else {
+         ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0));
+      }
+      bld.CMP(bld.null_reg_d(), get_nir_src(instr->src[0]), brw_imm_d(0),
+              BRW_CONDITIONAL_NZ);
+
+      /* For some reason, the any/all predicates don't work properly with
+       * SIMD32.  In particular, it appears that a SEL with a QtrCtrl of 2H
+       * doesn't read the correct subset of the flag register and you end up
+       * getting garbage in the second half.  Work around this by using a pair
+       * of 1-wide MOVs and scattering the result.
+       */
+      fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D);
+      ubld.MOV(res1, brw_imm_d(0));
+      set_predicate(dispatch_width == 8  ? BRW_PREDICATE_ALIGN1_ANY8H :
+                    dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ANY16H :
+                                           BRW_PREDICATE_ALIGN1_ANY32H,
+                    ubld.MOV(res1, brw_imm_d(-1)));
+
+      bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0));
+      break;
+   }
+   case nir_intrinsic_vote_all: {
+      const fs_builder ubld = bld.exec_all().group(1, 0);
+
+      /* The any/all predicates do not consider channel enables. To prevent
+       * dead channels from affecting the result, we initialize the flag with
+       * the identity value for the logical operation.
+       */
+      if (dispatch_width == 32) {
+         /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
+         ubld.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD),
+                         brw_imm_ud(0xffffffff));
+      } else {
+         ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0xffff));
+      }
+      bld.CMP(bld.null_reg_d(), get_nir_src(instr->src[0]), brw_imm_d(0),
+              BRW_CONDITIONAL_NZ);
+
+      /* For some reason, the any/all predicates don't work properly with
+       * SIMD32.  In particular, it appears that a SEL with a QtrCtrl of 2H
+       * doesn't read the correct subset of the flag register and you end up
+       * getting garbage in the second half.  Work around this by using a pair
+       * of 1-wide MOVs and scattering the result.
+       */
+      fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D);
+      ubld.MOV(res1, brw_imm_d(0));
+      set_predicate(dispatch_width == 8  ? BRW_PREDICATE_ALIGN1_ALL8H :
+                    dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ALL16H :
+                                           BRW_PREDICATE_ALIGN1_ALL32H,
+                    ubld.MOV(res1, brw_imm_d(-1)));
+
+      bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0));
+      break;
+   }
+   case nir_intrinsic_vote_feq:
+   case nir_intrinsic_vote_ieq: {
+      fs_reg value = get_nir_src(instr->src[0]);
+      if (instr->intrinsic == nir_intrinsic_vote_feq) {
+         const unsigned bit_size = nir_src_bit_size(instr->src[0]);
+         value.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_F);
+      }
+
+      fs_reg uniformized = bld.emit_uniformize(value);
+      const fs_builder ubld = bld.exec_all().group(1, 0);
+
+      /* The any/all predicates do not consider channel enables. To prevent
+       * dead channels from affecting the result, we initialize the flag with
+       * the identity value for the logical operation.
+       */
+      if (dispatch_width == 32) {
+         /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
+         ubld.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD),
+                         brw_imm_ud(0xffffffff));
+      } else {
+         ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0xffff));
       }
-      bld.MOV(dest, tmp);
+      bld.CMP(bld.null_reg_d(), value, uniformized, BRW_CONDITIONAL_Z);
+
+      /* For some reason, the any/all predicates don't work properly with
+       * SIMD32.  In particular, it appears that a SEL with a QtrCtrl of 2H
+       * doesn't read the correct subset of the flag register and you end up
+       * getting garbage in the second half.  Work around this by using a pair
+       * of 1-wide MOVs and scattering the result.
+       */
+      fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D);
+      ubld.MOV(res1, brw_imm_d(0));
+      set_predicate(dispatch_width == 8  ? BRW_PREDICATE_ALIGN1_ALL8H :
+                    dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ALL16H :
+                                           BRW_PREDICATE_ALIGN1_ALL32H,
+                    ubld.MOV(res1, brw_imm_d(-1)));
+
+      bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0));
+      break;
+   }
+
+   case nir_intrinsic_ballot: {
+      const fs_reg value = retype(get_nir_src(instr->src[0]),
+                                  BRW_REGISTER_TYPE_UD);
+      struct brw_reg flag = brw_flag_reg(0, 0);
+      /* FIXME: For SIMD32 programs, this causes us to stomp on f0.1 as well
+       * as f0.0.  This is a problem for fragment programs as we currently use
+       * f0.1 for discards.  Fortunately, we don't support SIMD32 fragment
+       * programs yet so this isn't a problem.  When we do, something will
+       * have to change.
+       */
+      if (dispatch_width == 32)
+         flag.type = BRW_REGISTER_TYPE_UD;
+
+      bld.exec_all().group(1, 0).MOV(flag, brw_imm_ud(0u));
+      bld.CMP(bld.null_reg_ud(), value, brw_imm_ud(0u), BRW_CONDITIONAL_NZ);
+
+      if (instr->dest.ssa.bit_size > 32) {
+         dest.type = BRW_REGISTER_TYPE_UQ;
+      } else {
+         dest.type = BRW_REGISTER_TYPE_UD;
+      }
+      bld.MOV(dest, flag);
+      break;
+   }
+
+   case nir_intrinsic_read_invocation: {
+      const fs_reg value = get_nir_src(instr->src[0]);
+      const fs_reg invocation = get_nir_src(instr->src[1]);
+      fs_reg tmp = bld.vgrf(value.type);
+
+      bld.exec_all().emit(SHADER_OPCODE_BROADCAST, tmp, value,
+                          bld.emit_uniformize(invocation));
+
+      bld.MOV(retype(dest, value.type), fs_reg(component(tmp, 0)));
+      break;
+   }
+
+   case nir_intrinsic_read_first_invocation: {
+      const fs_reg value = get_nir_src(instr->src[0]);
+      bld.MOV(retype(dest, value.type), bld.emit_uniformize(value));
+      break;
+   }
+
+   case nir_intrinsic_shuffle: {
+      const fs_reg value = get_nir_src(instr->src[0]);
+      const fs_reg index = get_nir_src(instr->src[1]);
+
+      bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, index);
+      break;
+   }
+
+   case nir_intrinsic_first_invocation: {
+      fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
+      bld.exec_all().emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, tmp);
+      bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD),
+              fs_reg(component(tmp, 0)));
+      break;
+   }
+
+   case nir_intrinsic_quad_broadcast: {
+      const fs_reg value = get_nir_src(instr->src[0]);
+      nir_const_value *index = nir_src_as_const_value(instr->src[1]);
+      assert(nir_src_bit_size(instr->src[1]) == 32);
+
+      bld.emit(SHADER_OPCODE_CLUSTER_BROADCAST, retype(dest, value.type),
+               value, brw_imm_ud(index->u32[0]), brw_imm_ud(4));
+      break;
+   }
+
+   case nir_intrinsic_quad_swap_horizontal: {
+      const fs_reg value = get_nir_src(instr->src[0]);
+      const fs_reg tmp = bld.vgrf(value.type);
+      const fs_builder ubld = bld.exec_all().group(dispatch_width / 2, 0);
+
+      const fs_reg src_left = horiz_stride(value, 2);
+      const fs_reg src_right = horiz_stride(horiz_offset(value, 1), 2);
+      const fs_reg tmp_left = horiz_stride(tmp, 2);
+      const fs_reg tmp_right = horiz_stride(horiz_offset(tmp, 1), 2);
+
+      /* From the Cherryview PRM Vol. 7, "Register Region Restrictions":
+       *
+       *    "When source or destination datatype is 64b or operation is
+       *    integer DWord multiply, regioning in Align1 must follow
+       *    these rules:
+       *
+       *    [...]
+       *
+       *    3. Source and Destination offset must be the same, except
+       *       the case of scalar source."
+       *
+       * In order to work around this, we have to emit two 32-bit MOVs instead
+       * of a single 64-bit MOV to do the shuffle.
+       */
+      if (type_sz(value.type) > 4 &&
+          (devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) {
+         ubld.MOV(subscript(tmp_left, BRW_REGISTER_TYPE_D, 0),
+                  subscript(src_right, BRW_REGISTER_TYPE_D, 0));
+         ubld.MOV(subscript(tmp_left, BRW_REGISTER_TYPE_D, 1),
+                  subscript(src_right, BRW_REGISTER_TYPE_D, 1));
+         ubld.MOV(subscript(tmp_right, BRW_REGISTER_TYPE_D, 0),
+                  subscript(src_left, BRW_REGISTER_TYPE_D, 0));
+         ubld.MOV(subscript(tmp_right, BRW_REGISTER_TYPE_D, 1),
+                  subscript(src_left, BRW_REGISTER_TYPE_D, 1));
+      } else {
+         ubld.MOV(tmp_left, src_right);
+         ubld.MOV(tmp_right, src_left);
+      }
+      bld.MOV(retype(dest, value.type), tmp);
+      break;
+   }
+
+   case nir_intrinsic_quad_swap_vertical: {
+      const fs_reg value = get_nir_src(instr->src[0]);
+      if (nir_src_bit_size(instr->src[0]) == 32) {
+         /* For 32-bit, we can use a SIMD4x2 instruction to do this easily */
+         const fs_reg tmp = bld.vgrf(value.type);
+         const fs_builder ubld = bld.exec_all();
+         ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value,
+                   brw_imm_ud(BRW_SWIZZLE4(2,3,0,1)));
+         bld.MOV(retype(dest, value.type), tmp);
+      } else {
+         /* For larger data types, we have to either emit dispatch_width many
+          * MOVs or else fall back to doing indirects.
+          */
+         fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W);
+         bld.XOR(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
+                      brw_imm_w(0x2));
+         bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx);
+      }
+      break;
+   }
+
+   case nir_intrinsic_quad_swap_diagonal: {
+      const fs_reg value = get_nir_src(instr->src[0]);
+      if (nir_src_bit_size(instr->src[0]) == 32) {
+         /* For 32-bit, we can use a SIMD4x2 instruction to do this easily */
+         const fs_reg tmp = bld.vgrf(value.type);
+         const fs_builder ubld = bld.exec_all();
+         ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value,
+                   brw_imm_ud(BRW_SWIZZLE4(3,2,1,0)));
+         bld.MOV(retype(dest, value.type), tmp);
+      } else {
+         /* For larger data types, we have to either emit dispatch_width many
+          * MOVs or else fall back to doing indirects.
+          */
+         fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W);
+         bld.XOR(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
+                      brw_imm_w(0x3));
+         bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx);
+      }
+      break;
+   }
+
+   case nir_intrinsic_reduce: {
+      fs_reg src = get_nir_src(instr->src[0]);
+      nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr);
+      unsigned cluster_size = nir_intrinsic_cluster_size(instr);
+      if (cluster_size == 0 || cluster_size > dispatch_width)
+         cluster_size = dispatch_width;
+
+      /* Figure out the source type */
+      src.type = brw_type_for_nir_type(devinfo,
+         (nir_alu_type)(nir_op_infos[redop].input_types[0] |
+                        nir_src_bit_size(instr->src[0])));
+
+      fs_reg identity = brw_nir_reduction_op_identity(bld, redop, src.type);
+      opcode brw_op = brw_op_for_nir_reduction_op(redop);
+      brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop);
+
+      /* Set up a register for all of our scratching around and initialize it
+       * to the reduction operation's identity value.
+       */
+      fs_reg scan = bld.vgrf(src.type);
+      bld.exec_all().emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity);
+
+      bld.emit_scan(brw_op, scan, cluster_size, cond_mod);
+
+      dest.type = src.type;
+      if (cluster_size * type_sz(src.type) >= REG_SIZE * 2) {
+         /* In this case, CLUSTER_BROADCAST instruction isn't needed because
+          * the distance between clusters is at least 2 GRFs.  In this case,
+          * we don't need the weird striding of the CLUSTER_BROADCAST
+          * instruction and can just do regular MOVs.
+          */
+         assert((cluster_size * type_sz(src.type)) % (REG_SIZE * 2) == 0);
+         const unsigned groups =
+            (dispatch_width * type_sz(src.type)) / (REG_SIZE * 2);
+         const unsigned group_size = dispatch_width / groups;
+         for (unsigned i = 0; i < groups; i++) {
+            const unsigned cluster = (i * group_size) / cluster_size;
+            const unsigned comp = cluster * cluster_size + (cluster_size - 1);
+            bld.group(group_size, i).MOV(horiz_offset(dest, i * group_size),
+                                         component(scan, comp));
+         }
+      } else {
+         bld.emit(SHADER_OPCODE_CLUSTER_BROADCAST, dest, scan,
+                  brw_imm_ud(cluster_size - 1), brw_imm_ud(cluster_size));
+      }
+      break;
+   }
+
+   case nir_intrinsic_inclusive_scan:
+   case nir_intrinsic_exclusive_scan: {
+      fs_reg src = get_nir_src(instr->src[0]);
+      nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr);
+
+      /* Figure out the source type */
+      src.type = brw_type_for_nir_type(devinfo,
+         (nir_alu_type)(nir_op_infos[redop].input_types[0] |
+                        nir_src_bit_size(instr->src[0])));
+
+      fs_reg identity = brw_nir_reduction_op_identity(bld, redop, src.type);
+      opcode brw_op = brw_op_for_nir_reduction_op(redop);
+      brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop);
+
+      /* Set up a register for all of our scratching around and initialize it
+       * to the reduction operation's identity value.
+       */
+      fs_reg scan = bld.vgrf(src.type);
+      const fs_builder allbld = bld.exec_all();
+      allbld.emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity);
+
+      if (instr->intrinsic == nir_intrinsic_exclusive_scan) {
+         /* Exclusive scan is a bit harder because we have to do an annoying
+          * shift of the contents before we can begin.  To make things worse,
+          * we can't do this with a normal stride; we have to use indirects.
+          */
+         fs_reg shifted = bld.vgrf(src.type);
+         fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W);
+         allbld.ADD(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
+                         brw_imm_w(-1));
+         allbld.emit(SHADER_OPCODE_SHUFFLE, shifted, scan, idx);
+         allbld.group(1, 0).MOV(component(shifted, 0), identity);
+         scan = shifted;
+      }
+
+      bld.emit_scan(brw_op, scan, dispatch_width, cond_mod);
+
+      bld.MOV(retype(dest, src.type), scan);
+      break;
+   }
+
+   case nir_intrinsic_begin_invocation_interlock: {
+      const fs_builder ubld = bld.group(8, 0);
+      const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
+
+      ubld.emit(SHADER_OPCODE_INTERLOCK, tmp)->size_written = 2 * REG_SIZE;
+
+      break;
+   }
+
+   case nir_intrinsic_end_invocation_interlock: {
+      /* We don't need to do anything here */
       break;
    }
 
@@ -4446,15 +5121,6 @@ fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
       unreachable("unknown texture opcode");
    }
 
-   /* TXS and TXL require a LOD but not everything we implement using those
-    * two opcodes provides one.  Provide a default LOD of 0.
-    */
-   if ((opcode == SHADER_OPCODE_TXS_LOGICAL ||
-        opcode == SHADER_OPCODE_TXL_LOGICAL) &&
-       srcs[TEX_LOGICAL_SRC_LOD].file == BAD_FILE) {
-      srcs[TEX_LOGICAL_SRC_LOD] = brw_imm_ud(0u);
-   }
-
    if (instr->op == nir_texop_tg4) {
       if (instr->component == 1 &&
           key_tex->gather_channel_quirk_mask & (1 << texture)) {
@@ -4524,59 +5190,6 @@ fs_visitor::nir_emit_jump(const fs_builder &bld, nir_jump_instr *instr)
    }
 }
 
-/**
- * This helper takes the result of a load operation that reads 32-bit elements
- * in this format:
- *
- * x x x x x x x x
- * y y y y y y y y
- * z z z z z z z z
- * w w w w w w w w
- *
- * and shuffles the data to get this:
- *
- * x y x y x y x y
- * x y x y x y x y
- * z w z w z w z w
- * z w z w z w z w
- *
- * Which is exactly what we want if the load is reading 64-bit components
- * like doubles, where x represents the low 32-bit of the x double component
- * and y represents the high 32-bit of the x double component (likewise with
- * z and w for double component y). The parameter @components represents
- * the number of 64-bit components present in @src. This would typically be
- * 2 at most, since we can only fit 2 double elements in the result of a
- * vec4 load.
- *
- * Notice that @dst and @src can be the same register.
- */
-void
-shuffle_32bit_load_result_to_64bit_data(const fs_builder &bld,
-                                        const fs_reg &dst,
-                                        const fs_reg &src,
-                                        uint32_t components)
-{
-   assert(type_sz(src.type) == 4);
-   assert(type_sz(dst.type) == 8);
-
-   /* A temporary that we will use to shuffle the 32-bit data of each
-    * component in the vector into valid 64-bit data. We can't write directly
-    * to dst because dst can be (and would usually be) the same as src
-    * and in that case the first MOV in the loop below would overwrite the
-    * data read in the second MOV.
-    */
-   fs_reg tmp = bld.vgrf(dst.type);
-
-   for (unsigned i = 0; i < components; i++) {
-      const fs_reg component_i = offset(src, bld, 2 * i);
-
-      bld.MOV(subscript(tmp, src.type, 0), component_i);
-      bld.MOV(subscript(tmp, src.type, 1), offset(component_i, bld, 1));
-
-      bld.MOV(offset(dst, bld, i), tmp);
-   }
-}
-
 /**
  * This helper does the inverse operation of
  * SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA.
@@ -4591,24 +5204,166 @@ shuffle_32bit_load_result_to_64bit_data(const fs_builder &bld,
  * 64-bit data they are about to write. Because of this the function checks
  * that the src and dst regions involved in the operation do not overlap.
  */
-void
+fs_reg
 shuffle_64bit_data_for_32bit_write(const fs_builder &bld,
-                                   const fs_reg &dst,
                                    const fs_reg &src,
                                    uint32_t components)
 {
    assert(type_sz(src.type) == 8);
-   assert(type_sz(dst.type) == 4);
 
-   assert(!regions_overlap(
-             dst, 2 * components * dst.component_size(bld.dispatch_width()),
-             src, components * src.component_size(bld.dispatch_width())));
+   fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_D, 2 * components);
 
    for (unsigned i = 0; i < components; i++) {
       const fs_reg component_i = offset(src, bld, i);
       bld.MOV(offset(dst, bld, 2 * i), subscript(component_i, dst.type, 0));
       bld.MOV(offset(dst, bld, 2 * i + 1), subscript(component_i, dst.type, 1));
    }
+
+   return dst;
+}
+
+/*
+ * This helper takes a source register and un/shuffles it into the destination
+ * register.
+ *
+ * If the source type size is smaller than the destination type size, the
+ * operation needed is a component shuffle. The opposite case would be an
+ * unshuffle. If the source/destination type sizes are equal, the shuffle is
+ * equivalent to a simple MOV.
+ *
+ * For example, if the source is a 16-bit type and the destination is 32-bit,
+ * a 3-component .xyz 16-bit vector on SIMD8 would be:
+ *
+ *    |x1|x2|x3|x4|x5|x6|x7|x8|y1|y2|y3|y4|y5|y6|y7|y8|
+ *    |z1|z2|z3|z4|z5|z6|z7|z8|  |  |  |  |  |  |  |  |
+ *
+ * This helper will produce the following 2 32-bit components with the
+ * 16-bit values shuffled:
+ *
+ *    |x1 y1|x2 y2|x3 y3|x4 y4|x5 y5|x6 y6|x7 y7|x8 y8|
+ *    |z1   |z2   |z3   |z4   |z5   |z6   |z7   |z8   |
+ *
+ * For an unshuffle the example would be the opposite: a 64-bit source type
+ * and a 32-bit destination. A 2-component .xy 64-bit vector on SIMD8
+ * would be:
+ *
+ *    | x1l   x1h | x2l   x2h | x3l   x3h | x4l   x4h |
+ *    | x5l   x5h | x6l   x6h | x7l   x7h | x8l   x8h |
+ *    | y1l   y1h | y2l   y2h | y3l   y3h | y4l   y4h |
+ *    | y5l   y5h | y6l   y6h | y7l   y7h | y8l   y8h |
+ *
+ * The result would be the following 4 32-bit components, unshuffled:
+ *
+ *    | x1l | x2l | x3l | x4l | x5l | x6l | x7l | x8l |
+ *    | x1h | x2h | x3h | x4h | x5h | x6h | x7h | x8h |
+ *    | y1l | y2l | y3l | y4l | y5l | y6l | y7l | y8l |
+ *    | y1h | y2h | y3h | y4h | y5h | y6h | y7h | y8h |
+ *
+ * - The source and destination registers must not overlap.
+ * - The components are measured in units of the smaller type between
+ *   source and destination, because we are un/shuffling the smaller
+ *   components from/into the bigger ones.
+ * - The first_component parameter allows skipping source components.
+ */
+void
+shuffle_src_to_dst(const fs_builder &bld,
+                   const fs_reg &dst,
+                   const fs_reg &src,
+                   uint32_t first_component,
+                   uint32_t components)
+{
+   if (type_sz(src.type) == type_sz(dst.type)) {
+      assert(!regions_overlap(dst,
+         type_sz(dst.type) * bld.dispatch_width() * components,
+         offset(src, bld, first_component),
+         type_sz(src.type) * bld.dispatch_width() * components));
+      for (unsigned i = 0; i < components; i++) {
+         bld.MOV(retype(offset(dst, bld, i), src.type),
+                 offset(src, bld, i + first_component));
+      }
+   } else if (type_sz(src.type) < type_sz(dst.type)) {
+      /* Source is shuffled into destination */
+      unsigned size_ratio = type_sz(dst.type) / type_sz(src.type);
+      assert(!regions_overlap(dst,
+         type_sz(dst.type) * bld.dispatch_width() *
+         DIV_ROUND_UP(components, size_ratio),
+         offset(src, bld, first_component),
+         type_sz(src.type) * bld.dispatch_width() * components));
+
+      brw_reg_type shuffle_type =
+         brw_reg_type_from_bit_size(8 * type_sz(src.type),
+                                    BRW_REGISTER_TYPE_D);
+      for (unsigned i = 0; i < components; i++) {
+         fs_reg shuffle_component_i =
+            subscript(offset(dst, bld, i / size_ratio),
+                      shuffle_type, i % size_ratio);
+         bld.MOV(shuffle_component_i,
+                 retype(offset(src, bld, i + first_component), shuffle_type));
+      }
+   } else {
+      /* Source is unshuffled into destination */
+      unsigned size_ratio = type_sz(src.type) / type_sz(dst.type);
+      assert(!regions_overlap(dst,
+         type_sz(dst.type) * bld.dispatch_width() * components,
+         offset(src, bld, first_component / size_ratio),
+         type_sz(src.type) * bld.dispatch_width() *
+         DIV_ROUND_UP(components + (first_component % size_ratio),
+                      size_ratio)));
+
+      brw_reg_type shuffle_type =
+         brw_reg_type_from_bit_size(8 * type_sz(dst.type),
+                                    BRW_REGISTER_TYPE_D);
+      for (unsigned i = 0; i < components; i++) {
+         fs_reg shuffle_component_i =
+            subscript(offset(src, bld, (first_component + i) / size_ratio),
+                      shuffle_type, (first_component + i) % size_ratio);
+         bld.MOV(retype(offset(dst, bld, i), shuffle_type),
+                 shuffle_component_i);
+      }
+   }
+}
+
+void
+shuffle_from_32bit_read(const fs_builder &bld,
+                        const fs_reg &dst,
+                        const fs_reg &src,
+                        uint32_t first_component,
+                        uint32_t components)
+{
+   assert(type_sz(src.type) == 4);
+
+   /* This function takes components in units of the destination type while
+    * shuffle_src_to_dst takes components in units of the smallest type
+    */
+   if (type_sz(dst.type) > 4) {
+      assert(type_sz(dst.type) == 8);
+      first_component *= 2;
+      components *= 2;
+   }
+
+   shuffle_src_to_dst(bld, dst, src, first_component, components);
+}
+
+fs_reg
+shuffle_for_32bit_write(const fs_builder &bld,
+                        const fs_reg &src,
+                        uint32_t first_component,
+                        uint32_t components)
+{
+   fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_D,
+                         DIV_ROUND_UP(components * type_sz(src.type), 4));
+   /* This function takes components in units of the source type while
+    * shuffle_src_to_dst takes components in units of the smallest type
+    */
+   if (type_sz(src.type) > 4) {
+      assert(type_sz(src.type) == 8);
+      first_component *= 2;
+      components *= 2;
+   }
+
+   shuffle_src_to_dst(bld, dst, src, first_component, components);
+
+   return dst;
 }
 
 fs_reg