i965/fs: Fix extract_i8/u8 to a 64-bit destination
[mesa.git] / src / intel / compiler / brw_fs_nir.cpp
index bc1ccfba3d21695b7fd95dc7cab622b327c289e4..ca3707db90f76ed4b955d10d96d162a486c4e5a0 100644 (file)
@@ -53,14 +53,27 @@ fs_visitor::nir_setup_outputs()
    if (stage == MESA_SHADER_TESS_CTRL || stage == MESA_SHADER_FRAGMENT)
       return;
 
+   unsigned vec4s[VARYING_SLOT_TESS_MAX] = { 0, };
+
+   /* Calculate the size of output registers in a separate pass, before
+    * allocating them.  With ARB_enhanced_layouts, multiple output variables
+    * may occupy the same slot, but have different type sizes.
+    */
    nir_foreach_variable(var, &nir->outputs) {
-      const unsigned vec4s =
+      const int loc = var->data.driver_location;
+      const unsigned var_vec4s =
          var->data.compact ? DIV_ROUND_UP(glsl_get_length(var->type), 4)
                            : type_size_vec4(var->type);
-      fs_reg reg = bld.vgrf(BRW_REGISTER_TYPE_F, 4 * vec4s);
-      for (unsigned i = 0; i < vec4s; i++) {
-         if (outputs[var->data.driver_location + i].file == BAD_FILE)
-            outputs[var->data.driver_location + i] = offset(reg, bld, 4 * i);
+      vec4s[loc] = MAX2(vec4s[loc], var_vec4s);
+   }
+
+   nir_foreach_variable(var, &nir->outputs) {
+      const int loc = var->data.driver_location;
+      if (outputs[loc].file == BAD_FILE) {
+         fs_reg reg = bld.vgrf(BRW_REGISTER_TYPE_F, 4 * vec4s[loc]);
+         for (unsigned i = 0; i < vec4s[loc]; i++) {
+            outputs[loc + i] = offset(reg, bld, 4 * i);
+         }
       }
    }
 }
@@ -68,10 +81,23 @@ fs_visitor::nir_setup_outputs()
 void
 fs_visitor::nir_setup_uniforms()
 {
-   if (dispatch_width != min_dispatch_width)
+   /* Only the first compile gets to set up uniforms. */
+   if (push_constant_loc) {
+      assert(pull_constant_loc);
       return;
+   }
 
    uniforms = nir->num_uniforms / 4;
+
+   if (stage == MESA_SHADER_COMPUTE) {
+      /* Add a uniform for the thread local id.  It must be the last uniform
+       * on the list.
+       */
+      assert(uniforms == prog_data->nr_params);
+      uint32_t *param = brw_stage_prog_data_add_params(prog_data, 1);
+      *param = BRW_PARAM_BUILTIN_SUBGROUP_ID;
+      subgroup_id = fs_reg(UNIFORM, uniforms++, BRW_REGISTER_TYPE_UD);
+   }
 }
 
 static bool
@@ -89,39 +115,11 @@ emit_system_values_block(nir_block *block, fs_visitor *v)
          unreachable("should be lowered by lower_vertex_id().");
 
       case nir_intrinsic_load_vertex_id_zero_base:
-         assert(v->stage == MESA_SHADER_VERTEX);
-         reg = &v->nir_system_values[SYSTEM_VALUE_VERTEX_ID_ZERO_BASE];
-         if (reg->file == BAD_FILE)
-            *reg = *v->emit_vs_system_value(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE);
-         break;
-
       case nir_intrinsic_load_base_vertex:
-         assert(v->stage == MESA_SHADER_VERTEX);
-         reg = &v->nir_system_values[SYSTEM_VALUE_BASE_VERTEX];
-         if (reg->file == BAD_FILE)
-            *reg = *v->emit_vs_system_value(SYSTEM_VALUE_BASE_VERTEX);
-         break;
-
       case nir_intrinsic_load_instance_id:
-         assert(v->stage == MESA_SHADER_VERTEX);
-         reg = &v->nir_system_values[SYSTEM_VALUE_INSTANCE_ID];
-         if (reg->file == BAD_FILE)
-            *reg = *v->emit_vs_system_value(SYSTEM_VALUE_INSTANCE_ID);
-         break;
-
       case nir_intrinsic_load_base_instance:
-         assert(v->stage == MESA_SHADER_VERTEX);
-         reg = &v->nir_system_values[SYSTEM_VALUE_BASE_INSTANCE];
-         if (reg->file == BAD_FILE)
-            *reg = *v->emit_vs_system_value(SYSTEM_VALUE_BASE_INSTANCE);
-         break;
-
       case nir_intrinsic_load_draw_id:
-         assert(v->stage == MESA_SHADER_VERTEX);
-         reg = &v->nir_system_values[SYSTEM_VALUE_DRAW_ID];
-         if (reg->file == BAD_FILE)
-            *reg = *v->emit_vs_system_value(SYSTEM_VALUE_DRAW_ID);
-         break;
+         unreachable("should be lowered by brw_nir_lower_vs_inputs().");
 
       case nir_intrinsic_load_invocation_id:
          if (v->stage == MESA_SHADER_TESS_CTRL)
@@ -233,6 +231,24 @@ fs_visitor::nir_emit_system_values()
       nir_system_values[i] = fs_reg();
    }
 
+   /* Always emit SUBGROUP_INVOCATION.  Dead code will clean it up if we
+    * never end up using it.
+    */
+   {
+      const fs_builder abld = bld.annotate("gl_SubgroupInvocation", NULL);
+      fs_reg &reg = nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
+      reg = abld.vgrf(BRW_REGISTER_TYPE_W);
+
+      const fs_builder allbld8 = abld.group(8, 0).exec_all();
+      allbld8.MOV(reg, brw_imm_v(0x76543210));
+      if (dispatch_width > 8)
+         allbld8.ADD(byte_offset(reg, 16), reg, brw_imm_uw(8u));
+      if (dispatch_width > 16) {
+         const fs_builder allbld16 = abld.group(16, 0).exec_all();
+         allbld16.ADD(byte_offset(reg, 32), reg, brw_imm_uw(16u));
+      }
+   }
+
    nir_foreach_function(function, nir) {
       assert(strcmp(function->name, "main") == 0);
       assert(function->impl);
@@ -242,6 +258,65 @@ fs_visitor::nir_emit_system_values()
    }
 }
 
+/*
+ * Returns a type based on a reference_type (word, float, half-float) and a
+ * given bit_size.
+ *
+ * Reference BRW_REGISTER_TYPE are HF,F,DF,W,D,UW,UD.
+ *
+ * @FIXME: 64-bit return types are always DF on integer types to maintain
+ * compability with uses of DF previously to the introduction of int64
+ * support.
+ */
+static brw_reg_type
+brw_reg_type_from_bit_size(const unsigned bit_size,
+                           const brw_reg_type reference_type)
+{
+   switch(reference_type) {
+   case BRW_REGISTER_TYPE_HF:
+   case BRW_REGISTER_TYPE_F:
+   case BRW_REGISTER_TYPE_DF:
+      switch(bit_size) {
+      case 16:
+         return BRW_REGISTER_TYPE_HF;
+      case 32:
+         return BRW_REGISTER_TYPE_F;
+      case 64:
+         return BRW_REGISTER_TYPE_DF;
+      default:
+         unreachable("Invalid bit size");
+      }
+   case BRW_REGISTER_TYPE_W:
+   case BRW_REGISTER_TYPE_D:
+   case BRW_REGISTER_TYPE_Q:
+      switch(bit_size) {
+      case 16:
+         return BRW_REGISTER_TYPE_W;
+      case 32:
+         return BRW_REGISTER_TYPE_D;
+      case 64:
+         return BRW_REGISTER_TYPE_Q;
+      default:
+         unreachable("Invalid bit size");
+      }
+   case BRW_REGISTER_TYPE_UW:
+   case BRW_REGISTER_TYPE_UD:
+   case BRW_REGISTER_TYPE_UQ:
+      switch(bit_size) {
+      case 16:
+         return BRW_REGISTER_TYPE_UW;
+      case 32:
+         return BRW_REGISTER_TYPE_UD;
+      case 64:
+         return BRW_REGISTER_TYPE_UQ;
+      default:
+         unreachable("Invalid bit size");
+      }
+   default:
+      unreachable("Unknown type");
+   }
+}
+
 void
 fs_visitor::nir_emit_impl(nir_function_impl *impl)
 {
@@ -255,7 +330,7 @@ fs_visitor::nir_emit_impl(nir_function_impl *impl)
          reg->num_array_elems == 0 ? 1 : reg->num_array_elems;
       unsigned size = array_elems * reg->num_components;
       const brw_reg_type reg_type =
-         reg->bit_size == 32 ? BRW_REGISTER_TYPE_F : BRW_REGISTER_TYPE_DF;
+         brw_reg_type_from_bit_size(reg->bit_size, BRW_REGISTER_TYPE_F);
       nir_locals[reg->index] = bld.vgrf(reg_type, size);
    }
 
@@ -650,8 +725,12 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
       break;
 
    case nir_op_f2f64:
+   case nir_op_f2i64:
+   case nir_op_f2u64:
    case nir_op_i2f64:
+   case nir_op_i2i64:
    case nir_op_u2f64:
+   case nir_op_u2u64:
       /* CHV PRM, vol07, 3D Media GPGPU Engine, Register Region Restrictions:
        *
        *    "When source or destination is 64b (...), regioning in Align1
@@ -667,7 +746,7 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
        */
       if (nir_dest_bit_size(instr->dest.dest) == 64 &&
           nir_src_bit_size(instr->src[0].src) == 32 &&
-          (devinfo->is_cherryview || devinfo->is_broxton)) {
+          (devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) {
          fs_reg tmp = bld.vgrf(result.type, 1);
          tmp = subscript(tmp, op[0].type, 0);
          inst = bld.MOV(tmp, op[0]);
@@ -679,12 +758,8 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
    case nir_op_f2f32:
    case nir_op_f2i32:
    case nir_op_f2u32:
-   case nir_op_f2i64:
-   case nir_op_f2u64:
    case nir_op_i2i32:
-   case nir_op_i2i64:
    case nir_op_u2u32:
-   case nir_op_u2u64:
       inst = bld.MOV(result, op[0]);
       inst->saturate = instr->dest.saturate;
       break;
@@ -721,53 +796,21 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
           *
           * - 2-src instructions can't operate with 64-bit immediates
           * - The sign is encoded in the high 32-bit of each DF
-          * - CMP with DF requires special handling in SIMD16
           * - We need to produce a DF result.
           */
 
-         /* 2-src instructions can't have 64-bit immediates, so put 0.0 in
-          * a register and compare with that.
-          */
-         fs_reg tmp = vgrf(glsl_type::double_type);
-         bld.MOV(tmp, setup_imm_df(bld, 0.0));
-
-         /* A direct DF CMP using the flag register (null dst) won't work in
-          * SIMD16 because the CMP will be split in two by lower_simd_width,
-          * resulting in two CMP instructions with the same dst (NULL),
-          * leading to dead code elimination of the first one. In SIMD8,
-          * however, there is no need to split the CMP and we can save some
-          * work.
-          */
-         fs_reg dst_tmp = vgrf(glsl_type::double_type);
-         bld.CMP(dst_tmp, op[0], tmp, BRW_CONDITIONAL_NZ);
-
-         /* In SIMD16 we want to avoid using a NULL dst register with DF CMP,
-          * so we store the result of the comparison in a vgrf instead and
-          * then we generate a UD comparison from that that won't have to
-          * be split by lower_simd_width. This is what NIR does to handle
-          * double comparisons in the general case.
-          */
-         if (bld.dispatch_width() == 16 ) {
-            fs_reg dst_tmp_ud = retype(dst_tmp, BRW_REGISTER_TYPE_UD);
-            bld.MOV(dst_tmp_ud, subscript(dst_tmp, BRW_REGISTER_TYPE_UD, 0));
-            bld.CMP(bld.null_reg_ud(),
-                    dst_tmp_ud, brw_imm_ud(0), BRW_CONDITIONAL_NZ);
-         }
+         fs_reg zero = vgrf(glsl_type::double_type);
+         bld.MOV(zero, setup_imm_df(bld, 0.0));
+         bld.CMP(bld.null_reg_df(), op[0], zero, BRW_CONDITIONAL_NZ);
 
-         /* Get the high 32-bit of each double component where the sign is */
-         fs_reg result_int = retype(result, BRW_REGISTER_TYPE_UD);
-         bld.MOV(result_int, subscript(op[0], BRW_REGISTER_TYPE_UD, 1));
+         bld.MOV(result, zero);
 
-         /* Get the sign bit */
-         bld.AND(result_int, result_int, brw_imm_ud(0x80000000u));
+         fs_reg r = subscript(result, BRW_REGISTER_TYPE_UD, 1);
+         bld.AND(r, subscript(op[0], BRW_REGISTER_TYPE_UD, 1),
+                 brw_imm_ud(0x80000000u));
 
-         /* Add 1.0 to the sign, predicated to skip the case of op[0] == 0.0 */
-         inst = bld.OR(result_int, result_int, brw_imm_ud(0x3f800000u));
-         inst->predicate = BRW_PREDICATE_NORMAL;
-
-         /* Convert from 32-bit float to 64-bit double */
-         result.type = BRW_REGISTER_TYPE_DF;
-         inst = bld.MOV(result, retype(result_int, BRW_REGISTER_TYPE_F));
+         set_predicate(BRW_PREDICATE_NORMAL,
+                       bld.OR(r, r, brw_imm_ud(0x3ff00000u)));
 
          if (instr->dest.saturate) {
             inst = bld.MOV(result, result);
@@ -1082,12 +1125,13 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
          if (instr->op == nir_op_f2b) {
             zero = vgrf(glsl_type::double_type);
             tmp = vgrf(glsl_type::double_type);
+            bld.MOV(zero, setup_imm_df(bld, 0.0));
          } else {
             zero = vgrf(glsl_type::int64_t_type);
             tmp = vgrf(glsl_type::int64_t_type);
+            bld.MOV(zero, brw_imm_q(0));
          }
 
-         bld.MOV(zero, setup_imm_df(bld, 0.0));
          /* A SIMD16 execution needs to be split in two instructions, so use
           * a vgrf instead of the flag register as dst so instruction splitting
           * works
@@ -1295,14 +1339,36 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
       unreachable("not reached: should have been lowered");
 
    case nir_op_ishl:
-      bld.SHL(result, op[0], op[1]);
-      break;
    case nir_op_ishr:
-      bld.ASR(result, op[0], op[1]);
-      break;
-   case nir_op_ushr:
-      bld.SHR(result, op[0], op[1]);
+   case nir_op_ushr: {
+      fs_reg shift_count = op[1];
+
+      if (devinfo->is_cherryview || gen_device_info_is_9lp(devinfo)) {
+         if (op[1].file == VGRF &&
+             (result.type == BRW_REGISTER_TYPE_Q ||
+              result.type == BRW_REGISTER_TYPE_UQ)) {
+            shift_count = fs_reg(VGRF, alloc.allocate(dispatch_width / 4),
+                                 BRW_REGISTER_TYPE_UD);
+            shift_count.stride = 2;
+            bld.MOV(shift_count, op[1]);
+         }
+      }
+
+      switch (instr->op) {
+      case nir_op_ishl:
+         bld.SHL(result, op[0], shift_count);
+         break;
+      case nir_op_ishr:
+         bld.ASR(result, op[0], shift_count);
+         break;
+      case nir_op_ushr:
+         bld.SHR(result, op[0], shift_count);
+         break;
+      default:
+         unreachable("not reached");
+      }
       break;
+   }
 
    case nir_op_pack_half_2x16_split:
       bld.emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, result, op[0], op[1]);
@@ -1329,10 +1395,31 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
 
    case nir_op_extract_u8:
    case nir_op_extract_i8: {
-      const brw_reg_type type = brw_int_type(1, instr->op == nir_op_extract_i8);
       nir_const_value *byte = nir_src_as_const_value(instr->src[1].src);
       assert(byte != NULL);
-      bld.MOV(result, subscript(op[0], type, byte->u32[0]));
+
+      /* The PRMs say:
+       *
+       *    BDW+
+       *    There is no direct conversion from B/UB to Q/UQ or Q/UQ to B/UB.
+       *    Use two instructions and a word or DWord intermediate integer type.
+       */
+      if (nir_dest_bit_size(instr->dest.dest) == 64) {
+         const brw_reg_type type = brw_int_type(2, instr->op == nir_op_extract_i8);
+
+         if (instr->op == nir_op_extract_i8) {
+            /* If we need to sign extend, extract to a word first */
+            fs_reg w_temp = bld.vgrf(BRW_REGISTER_TYPE_W);
+            bld.MOV(w_temp, subscript(op[0], type, byte->u32[0]));
+            bld.MOV(result, w_temp);
+         } else {
+            /* Otherwise use an AND with 0xff and a word type */
+            bld.AND(result, subscript(op[0], type, byte->u32[0] / 2), brw_imm_uw(0xff));
+         }
+      } else {
+         const brw_reg_type type = brw_int_type(1, instr->op == nir_op_extract_i8);
+         bld.MOV(result, subscript(op[0], type, byte->u32[0]));
+      }
       break;
    }
 
@@ -1366,7 +1453,7 @@ fs_visitor::nir_emit_load_const(const fs_builder &bld,
                                 nir_load_const_instr *instr)
 {
    const brw_reg_type reg_type =
-      instr->def.bit_size == 32 ? BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_DF;
+      brw_reg_type_from_bit_size(instr->def.bit_size, BRW_REGISTER_TYPE_D);
    fs_reg reg = bld.vgrf(reg_type, instr->def.num_components);
 
    switch (instr->def.bit_size) {
@@ -1376,9 +1463,17 @@ fs_visitor::nir_emit_load_const(const fs_builder &bld,
       break;
 
    case 64:
-      for (unsigned i = 0; i < instr->def.num_components; i++)
-         bld.MOV(offset(reg, bld, i),
-                 setup_imm_df(bld, instr->value.f64[i]));
+      assert(devinfo->gen >= 7);
+      if (devinfo->gen == 7) {
+         /* We don't get 64-bit integer types until gen8 */
+         for (unsigned i = 0; i < instr->def.num_components; i++) {
+            bld.MOV(retype(offset(reg, bld, i), BRW_REGISTER_TYPE_DF),
+                    setup_imm_df(bld, instr->value.f64[i]));
+         }
+      } else {
+         for (unsigned i = 0; i < instr->def.num_components; i++)
+            bld.MOV(offset(reg, bld, i), brw_imm_q(instr->value.i64[i]));
+      }
       break;
 
    default:
@@ -1394,8 +1489,8 @@ fs_visitor::get_nir_src(const nir_src &src)
    fs_reg reg;
    if (src.is_ssa) {
       if (src.ssa->parent_instr->type == nir_instr_type_ssa_undef) {
-         const brw_reg_type reg_type = src.ssa->bit_size == 32 ?
-            BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_DF;
+         const brw_reg_type reg_type =
+            brw_reg_type_from_bit_size(src.ssa->bit_size, BRW_REGISTER_TYPE_D);
          reg = bld.vgrf(reg_type, src.ssa->num_components);
       } else {
          reg = nir_ssa_values[src.ssa->index];
@@ -1407,20 +1502,35 @@ fs_visitor::get_nir_src(const nir_src &src)
                    src.reg.base_offset * src.reg.reg->num_components);
    }
 
-   /* to avoid floating-point denorm flushing problems, set the type by
-    * default to D - instructions that need floating point semantics will set
-    * this to F if they need to
-    */
-   return retype(reg, BRW_REGISTER_TYPE_D);
+   if (nir_src_bit_size(src) == 64 && devinfo->gen == 7) {
+      /* The only 64-bit type available on gen7 is DF, so use that. */
+      reg.type = BRW_REGISTER_TYPE_DF;
+   } else {
+      /* To avoid floating-point denorm flushing problems, set the type by
+       * default to an integer type - instructions that need floating point
+       * semantics will set this to F if they need to
+       */
+      reg.type = brw_reg_type_from_bit_size(nir_src_bit_size(src),
+                                            BRW_REGISTER_TYPE_D);
+   }
+
+   return reg;
 }
 
 /**
  * Return an IMM for constants; otherwise call get_nir_src() as normal.
+ *
+ * This function should not be called on any value which may be 64 bits.
+ * We could theoretically support 64-bit on gen8+ but we choose not to
+ * because it wouldn't work in general (no gen7 support) and there are
+ * enough restrictions in 64-bit immediates that you can't take the return
+ * value and treat it the same as the result of get_nir_src().
  */
 fs_reg
 fs_visitor::get_nir_src_imm(const nir_src &src)
 {
    nir_const_value *val = nir_src_as_const_value(src);
+   assert(nir_src_bit_size(src) == 32);
    return val ? fs_reg(brw_imm_d(val->i32[0])) : get_nir_src(src);
 }
 
@@ -1429,7 +1539,7 @@ fs_visitor::get_nir_dest(const nir_dest &dest)
 {
    if (dest.is_ssa) {
       const brw_reg_type reg_type =
-         dest.ssa.bit_size == 32 ? BRW_REGISTER_TYPE_F : BRW_REGISTER_TYPE_DF;
+         brw_reg_type_from_bit_size(dest.ssa.bit_size, BRW_REGISTER_TYPE_F);
       nir_ssa_values[dest.ssa.index] =
          bld.vgrf(reg_type, dest.ssa.num_components);
       return nir_ssa_values[dest.ssa.index];
@@ -1804,7 +1914,7 @@ fs_visitor::set_gs_stream_control_data_bits(const fs_reg &vertex_count,
    assert(gs_compile->control_data_bits_per_vertex == 2);
 
    /* Must be a valid stream */
-   assert(stream_id >= 0 && stream_id < MAX_VERTEX_STREAMS);
+   assert(stream_id < MAX_VERTEX_STREAMS);
 
    /* Control data bits are initialized to 0 so we don't have to set any
     * bits when sending vertices to stream 0.
@@ -1853,7 +1963,7 @@ fs_visitor::emit_gs_vertex(const nir_src &vertex_count_nir_src,
     * be recorded by transform feedback, we can simply discard all geometry
     * bound to these streams when transform feedback is disabled.
     */
-   if (stream_id > 0 && !nir->info->has_transform_feedback_varyings)
+   if (stream_id > 0 && !nir->info.has_transform_feedback_varyings)
       return;
 
    /* If we're outputting 32 control data bits or less, then we can wait
@@ -1942,33 +2052,23 @@ fs_visitor::emit_gs_input_load(const fs_reg &dst,
    nir_const_value *offset_const = nir_src_as_const_value(offset_src);
    const unsigned push_reg_count = gs_prog_data->base.urb_read_length * 8;
 
-   /* Offset 0 is the VUE header, which contains VARYING_SLOT_LAYER [.y],
-    * VARYING_SLOT_VIEWPORT [.z], and VARYING_SLOT_PSIZ [.w].  Only
-    * gl_PointSize is available as a GS input, however, so it must be that.
-    */
-   const bool is_point_size = (base_offset == 0);
-
    /* TODO: figure out push input layout for invocations == 1 */
+   /* TODO: make this work with 64-bit inputs */
    if (gs_prog_data->invocations == 1 &&
+       type_sz(dst.type) <= 4 &&
        offset_const != NULL && vertex_const != NULL &&
        4 * (base_offset + offset_const->u32[0]) < push_reg_count) {
       int imm_offset = (base_offset + offset_const->u32[0]) * 4 +
                        vertex_const->u32[0] * push_reg_count;
-      /* This input was pushed into registers. */
-      if (is_point_size) {
-         /* gl_PointSize comes in .w */
-         bld.MOV(dst, fs_reg(ATTR, imm_offset + 3, dst.type));
-      } else {
-         for (unsigned i = 0; i < num_components; i++) {
-            bld.MOV(offset(dst, bld, i),
-                    fs_reg(ATTR, imm_offset + i + first_component, dst.type));
-         }
+      for (unsigned i = 0; i < num_components; i++) {
+         bld.MOV(offset(dst, bld, i),
+                 fs_reg(ATTR, imm_offset + i + first_component, dst.type));
       }
       return;
    }
 
    /* Resort to the pull model.  Ensure the VUE handles are provided. */
-   gs_prog_data->base.include_vue_handles = true;
+   assert(gs_prog_data->base.include_vue_handles);
 
    unsigned first_icp_handle = gs_prog_data->include_primitive_id ? 3 : 2;
    fs_reg icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
@@ -2008,12 +2108,12 @@ fs_visitor::emit_gs_input_load(const fs_reg &dst,
 
          /* Use first_icp_handle as the base offset.  There is one register
           * of URB handles per vertex, so inform the register allocator that
-          * we might read up to nir->info->gs.vertices_in registers.
+          * we might read up to nir->info.gs.vertices_in registers.
           */
          bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
                   retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type),
                   fs_reg(icp_offset_bytes),
-                  brw_imm_ud(nir->info->gs.vertices_in * REG_SIZE));
+                  brw_imm_ud(nir->info.gs.vertices_in * REG_SIZE));
       }
    } else {
       assert(gs_prog_data->invocations > 1);
@@ -2039,12 +2139,12 @@ fs_visitor::emit_gs_input_load(const fs_reg &dst,
 
          /* Use first_icp_handle as the base offset.  There is one DWord
           * of URB handles per vertex, so inform the register allocator that
-          * we might read up to ceil(nir->info->gs.vertices_in / 8) registers.
+          * we might read up to ceil(nir->info.gs.vertices_in / 8) registers.
           */
          bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
                   retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type),
                   fs_reg(icp_offset_bytes),
-                  brw_imm_ud(DIV_ROUND_UP(nir->info->gs.vertices_in, 8) *
+                  brw_imm_ud(DIV_ROUND_UP(nir->info.gs.vertices_in, 8) *
                              REG_SIZE));
       }
    }
@@ -2132,14 +2232,6 @@ fs_visitor::emit_gs_input_load(const fs_reg &dst,
          }
       }
    }
-
-   if (is_point_size) {
-      /* Read the whole VUE header (because of alignment) and read .w. */
-      fs_reg tmp = bld.vgrf(dst.type, 4);
-      inst->dst = tmp;
-      inst->size_written = 4 * REG_SIZE;
-      bld.MOV(dst, offset(tmp, bld, 3));
-   }
 }
 
 fs_reg
@@ -2251,7 +2343,7 @@ fs_visitor::nir_emit_vs_intrinsic(const fs_builder &bld,
    }
 
    case nir_intrinsic_load_input: {
-      fs_reg src = fs_reg(ATTR, instr->const_index[0], dest.type);
+      fs_reg src = fs_reg(ATTR, nir_intrinsic_base(instr) * 4, dest.type);
       unsigned first_component = nir_intrinsic_component(instr);
       unsigned num_components = instr->num_components;
       enum brw_reg_type type = dest.type;
@@ -2552,7 +2644,6 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
          instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size) == 64;
       fs_reg indirect_offset = get_indirect_offset(instr);
       unsigned imm_offset = instr->const_index[0];
-      unsigned swiz = BRW_SWIZZLE_XYZW;
       unsigned mask = instr->const_index[1];
       unsigned header_regs = 0;
       fs_reg srcs[7];
@@ -2582,13 +2673,6 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
          }
       }
 
-      /* 64-bit data needs to me shuffled before we can write it to the URB.
-       * We will use this temporary to shuffle the components in each
-       * iteration.
-       */
-      fs_reg tmp =
-         fs_reg(VGRF, alloc.allocate(2 * iter_components), value.type);
-
       mask = mask << first_component;
 
       for (unsigned iter = 0; iter < num_iterations; iter++) {
@@ -2632,26 +2716,18 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
                continue;
 
             if (!is_64bit) {
-               srcs[header_regs + i + first_component] =
-                  offset(value, bld, BRW_GET_SWZ(swiz, i));
+               srcs[header_regs + i + first_component] = offset(value, bld, i);
             } else {
                /* We need to shuffle the 64-bit data to match the layout
                 * expected by our 32-bit URB write messages. We use a temporary
                 * for that.
                 */
-               unsigned channel = BRW_GET_SWZ(swiz, iter * 2 + i);
-               shuffle_64bit_data_for_32bit_write(bld,
-                  retype(offset(tmp, bld, 2 * i), BRW_REGISTER_TYPE_F),
-                  retype(offset(value, bld, 2 * channel), BRW_REGISTER_TYPE_DF),
-                  1);
-
-               /* Now copy the data to the destination */
-               fs_reg dest = fs_reg(VGRF, alloc.allocate(2), value.type);
-               unsigned idx = 2 * i;
-               bld.MOV(dest, offset(tmp, bld, idx));
-               bld.MOV(offset(dest, bld, 1), offset(tmp, bld, idx + 1));
-               srcs[header_regs + idx + first_component * 2] = dest;
-               srcs[header_regs + idx + 1 + first_component * 2] =
+               unsigned channel = iter * 2 + i;
+               fs_reg dest = shuffle_64bit_data_for_32bit_write(bld,
+                  offset(value, bld, channel), 1);
+
+               srcs[header_regs + (i + first_component) * 2] = dest;
+               srcs[header_regs + (i + first_component) * 2 + 1] =
                   offset(dest, bld, 1);
             }
          }
@@ -2721,17 +2797,22 @@ fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld,
          /* Arbitrarily only push up to 32 vec4 slots worth of data,
           * which is 16 registers (since each holds 2 vec4 slots).
           */
+         unsigned slot_count = 1;
+         if (type_sz(dest.type) == 8 && instr->num_components > 2)
+            slot_count++;
+
          const unsigned max_push_slots = 32;
-         if (imm_offset < max_push_slots) {
+         if (imm_offset + slot_count <= max_push_slots) {
             fs_reg src = fs_reg(ATTR, imm_offset / 2, dest.type);
             for (int i = 0; i < instr->num_components; i++) {
                unsigned comp = 16 / type_sz(dest.type) * (imm_offset % 2) +
                   i + first_component;
                bld.MOV(offset(dest, bld, i), component(src, comp));
             }
+
             tes_prog_data->base.urb_read_length =
                MAX2(tes_prog_data->base.urb_read_length,
-                    DIV_ROUND_UP(imm_offset + 1, 2));
+                    DIV_ROUND_UP(imm_offset + slot_count, 2));
          } else {
             /* Replicate the patch handle to all enabled channels */
             const fs_reg srcs[] = {
@@ -3389,6 +3470,10 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld,
       cs_prog_data->uses_barrier = true;
       break;
 
+   case nir_intrinsic_load_subgroup_id:
+      bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), subgroup_id);
+      break;
+
    case nir_intrinsic_load_local_invocation_id:
    case nir_intrinsic_load_work_group_id: {
       gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
@@ -3496,18 +3581,10 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld,
        * expected by our 32-bit write messages.
        */
       unsigned type_size = 4;
-      unsigned bit_size = instr->src[0].is_ssa ?
-         instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size;
-      if (bit_size == 64) {
+      if (nir_src_bit_size(instr->src[0]) == 64) {
          type_size = 8;
-         fs_reg tmp =
-           fs_reg(VGRF, alloc.allocate(alloc.sizes[val_reg.nr]), val_reg.type);
-         shuffle_64bit_data_for_32bit_write(
-            bld,
-            retype(tmp, BRW_REGISTER_TYPE_F),
-            retype(val_reg, BRW_REGISTER_TYPE_DF),
-            instr->num_components);
-         val_reg = tmp;
+         val_reg = shuffle_64bit_data_for_32bit_write(bld,
+            val_reg, instr->num_components);
       }
 
       unsigned type_slots = type_size / 4;
@@ -3796,7 +3873,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
             (instr->num_components - 1) * type_sz(dest.type);
 
          bool supports_64bit_indirects =
-            !devinfo->is_cherryview && !devinfo->is_broxton;
+            !devinfo->is_cherryview && !gen_device_info_is_9lp(devinfo);
 
          if (type_sz(dest.type) != 8 || supports_64bit_indirects) {
             for (unsigned j = 0; j < instr->num_components; j++) {
@@ -3849,7 +3926,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
           */
          brw_mark_surface_used(prog_data,
                                stage_prog_data->binding_table.ubo_start +
-                               nir->info->num_ubos - 1);
+                               nir->info.num_ubos - 1);
       }
 
       nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
@@ -3870,6 +3947,34 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
           * and we have to split it if necessary.
           */
          const unsigned type_size = type_sz(dest.type);
+
+         /* See if we've selected this as a push constant candidate */
+         if (const_index) {
+            const unsigned ubo_block = const_index->u32[0];
+            const unsigned offset_256b = const_offset->u32[0] / 32;
+
+            fs_reg push_reg;
+            for (int i = 0; i < 4; i++) {
+               const struct brw_ubo_range *range = &prog_data->ubo_ranges[i];
+               if (range->block == ubo_block &&
+                   offset_256b >= range->start &&
+                   offset_256b < range->start + range->length) {
+
+                  push_reg = fs_reg(UNIFORM, UBO_START + i, dest.type);
+                  push_reg.offset = const_offset->u32[0] - 32 * range->start;
+                  break;
+               }
+            }
+
+            if (push_reg.file != BAD_FILE) {
+               for (unsigned i = 0; i < instr->num_components; i++) {
+                  bld.MOV(offset(dest, bld, i),
+                          byte_offset(push_reg, i * type_size));
+               }
+               break;
+            }
+         }
+
          const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
          const fs_builder ubld = bld.exec_all().group(block_sz / 4, 0);
          const fs_reg packed_consts = ubld.vgrf(BRW_REGISTER_TYPE_UD);
@@ -3919,7 +4024,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
           */
          brw_mark_surface_used(prog_data,
                                stage_prog_data->binding_table.ssbo_start +
-                               nir->info->num_ssbos - 1);
+                               nir->info.num_ssbos - 1);
       }
 
       fs_reg offset_reg;
@@ -3959,7 +4064,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
 
          brw_mark_surface_used(prog_data,
                                stage_prog_data->binding_table.ssbo_start +
-                               nir->info->num_ssbos - 1);
+                               nir->info.num_ssbos - 1);
       }
 
       /* Value */
@@ -3975,17 +4080,10 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
        * expected by our 32-bit write messages.
        */
       unsigned type_size = 4;
-      unsigned bit_size = instr->src[0].is_ssa ?
-         instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size;
-      if (bit_size == 64) {
+      if (nir_src_bit_size(instr->src[0]) == 64) {
          type_size = 8;
-         fs_reg tmp =
-           fs_reg(VGRF, alloc.allocate(alloc.sizes[val_reg.nr]), val_reg.type);
-         shuffle_64bit_data_for_32bit_write(bld,
-            retype(tmp, BRW_REGISTER_TYPE_F),
-            retype(val_reg, BRW_REGISTER_TYPE_DF),
-            instr->num_components);
-         val_reg = tmp;
+         val_reg = shuffle_64bit_data_for_32bit_write(bld,
+            val_reg, instr->num_components);
       }
 
       unsigned type_slots = type_size / 4;
@@ -4037,23 +4135,16 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
 
       nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
       assert(const_offset && "Indirect output stores not allowed");
-      fs_reg new_dest = retype(offset(outputs[instr->const_index[0]], bld,
-                                      4 * const_offset->u32[0]), src.type);
 
       unsigned num_components = instr->num_components;
       unsigned first_component = nir_intrinsic_component(instr);
-      unsigned bit_size = instr->src[0].is_ssa ?
-         instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size;
-      if (bit_size == 64) {
-         fs_reg tmp =
-            fs_reg(VGRF, alloc.allocate(2 * num_components),
-                   BRW_REGISTER_TYPE_F);
-         shuffle_64bit_data_for_32bit_write(
-            bld, tmp, retype(src, BRW_REGISTER_TYPE_DF), num_components);
-         src = retype(tmp, src.type);
+      if (nir_src_bit_size(instr->src[0]) == 64) {
+         src = shuffle_64bit_data_for_32bit_write(bld, src, num_components);
          num_components *= 2;
       }
 
+      fs_reg new_dest = retype(offset(outputs[instr->const_index[0]], bld,
+                                      4 * const_offset->u32[0]), src.type);
       for (unsigned j = 0; j < num_components; j++) {
          bld.MOV(offset(new_dest, bld, j + first_component),
                  offset(src, bld, j));
@@ -4123,18 +4214,157 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       break;
    }
 
-   case nir_intrinsic_load_channel_num: {
-      fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UW);
-      dest = retype(dest, BRW_REGISTER_TYPE_UD);
-      const fs_builder allbld8 = bld.group(8, 0).exec_all();
-      allbld8.MOV(tmp, brw_imm_v(0x76543210));
-      if (dispatch_width > 8)
-         allbld8.ADD(byte_offset(tmp, 16), tmp, brw_imm_uw(8u));
-      if (dispatch_width > 16) {
-         const fs_builder allbld16 = bld.group(16, 0).exec_all();
-         allbld16.ADD(byte_offset(tmp, 32), tmp, brw_imm_uw(16u));
+   case nir_intrinsic_load_subgroup_invocation:
+      bld.MOV(retype(dest, BRW_REGISTER_TYPE_D),
+              nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]);
+      break;
+
+   case nir_intrinsic_load_subgroup_eq_mask:
+   case nir_intrinsic_load_subgroup_ge_mask:
+   case nir_intrinsic_load_subgroup_gt_mask:
+   case nir_intrinsic_load_subgroup_le_mask:
+   case nir_intrinsic_load_subgroup_lt_mask:
+      unreachable("not reached");
+
+   case nir_intrinsic_vote_any: {
+      const fs_builder ubld = bld.exec_all().group(1, 0);
+
+      /* The any/all predicates do not consider channel enables. To prevent
+       * dead channels from affecting the result, we initialize the flag with
+       * with the identity value for the logical operation.
+       */
+      if (dispatch_width == 32) {
+         /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
+         ubld.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD),
+                         brw_imm_ud(0));
+      } else {
+         ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0));
       }
-      bld.MOV(dest, tmp);
+      bld.CMP(bld.null_reg_d(), get_nir_src(instr->src[0]), brw_imm_d(0), BRW_CONDITIONAL_NZ);
+
+      /* For some reason, the any/all predicates don't work properly with
+       * SIMD32.  In particular, it appears that a SEL with a QtrCtrl of 2H
+       * doesn't read the correct subset of the flag register and you end up
+       * getting garbage in the second half.  Work around this by using a pair
+       * of 1-wide MOVs and scattering the result.
+       */
+      fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D);
+      ubld.MOV(res1, brw_imm_d(0));
+      set_predicate(dispatch_width == 8  ? BRW_PREDICATE_ALIGN1_ANY8H :
+                    dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ANY16H :
+                                           BRW_PREDICATE_ALIGN1_ANY32H,
+                    ubld.MOV(res1, brw_imm_d(-1)));
+
+      bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0));
+      break;
+   }
+   case nir_intrinsic_vote_all: {
+      const fs_builder ubld = bld.exec_all().group(1, 0);
+
+      /* The any/all predicates do not consider channel enables. To prevent
+       * dead channels from affecting the result, we initialize the flag with
+       * with the identity value for the logical operation.
+       */
+      if (dispatch_width == 32) {
+         /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
+         ubld.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD),
+                         brw_imm_ud(0xffffffff));
+      } else {
+         ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0xffff));
+      }
+      bld.CMP(bld.null_reg_d(), get_nir_src(instr->src[0]), brw_imm_d(0), BRW_CONDITIONAL_NZ);
+
+      /* For some reason, the any/all predicates don't work properly with
+       * SIMD32.  In particular, it appears that a SEL with a QtrCtrl of 2H
+       * doesn't read the correct subset of the flag register and you end up
+       * getting garbage in the second half.  Work around this by using a pair
+       * of 1-wide MOVs and scattering the result.
+       */
+      fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D);
+      ubld.MOV(res1, brw_imm_d(0));
+      set_predicate(dispatch_width == 8  ? BRW_PREDICATE_ALIGN1_ALL8H :
+                    dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ALL16H :
+                                           BRW_PREDICATE_ALIGN1_ALL32H,
+                    ubld.MOV(res1, brw_imm_d(-1)));
+
+      bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0));
+      break;
+   }
+   case nir_intrinsic_vote_eq: {
+      fs_reg value = get_nir_src(instr->src[0]);
+      fs_reg uniformized = bld.emit_uniformize(value);
+      const fs_builder ubld = bld.exec_all().group(1, 0);
+
+      /* The any/all predicates do not consider channel enables. To prevent
+       * dead channels from affecting the result, we initialize the flag with
+       * with the identity value for the logical operation.
+       */
+      if (dispatch_width == 32) {
+         /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
+         ubld.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD),
+                         brw_imm_ud(0xffffffff));
+      } else {
+         ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0xffff));
+      }
+      bld.CMP(bld.null_reg_d(), value, uniformized, BRW_CONDITIONAL_Z);
+
+      /* For some reason, the any/all predicates don't work properly with
+       * SIMD32.  In particular, it appears that a SEL with a QtrCtrl of 2H
+       * doesn't read the correct subset of the flag register and you end up
+       * getting garbage in the second half.  Work around this by using a pair
+       * of 1-wide MOVs and scattering the result.
+       */
+      fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D);
+      ubld.MOV(res1, brw_imm_d(0));
+      set_predicate(dispatch_width == 8  ? BRW_PREDICATE_ALIGN1_ALL8H :
+                    dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ALL16H :
+                                           BRW_PREDICATE_ALIGN1_ALL32H,
+                    ubld.MOV(res1, brw_imm_d(-1)));
+
+      bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0));
+      break;
+   }
+
+   case nir_intrinsic_ballot: {
+      const fs_reg value = retype(get_nir_src(instr->src[0]),
+                                  BRW_REGISTER_TYPE_UD);
+      struct brw_reg flag = brw_flag_reg(0, 0);
+      /* FIXME: For SIMD32 programs, this causes us to stomp on f0.1 as well
+       * as f0.0.  This is a problem for fragment programs as we currently use
+       * f0.1 for discards.  Fortunately, we don't support SIMD32 fragment
+       * programs yet so this isn't a problem.  When we do, something will
+       * have to change.
+       */
+      if (dispatch_width == 32)
+         flag.type = BRW_REGISTER_TYPE_UD;
+
+      bld.exec_all().group(1, 0).MOV(flag, brw_imm_ud(0u));
+      bld.CMP(bld.null_reg_ud(), value, brw_imm_ud(0u), BRW_CONDITIONAL_NZ);
+
+      if (instr->dest.ssa.bit_size > 32) {
+         dest.type = BRW_REGISTER_TYPE_UQ;
+      } else {
+         dest.type = BRW_REGISTER_TYPE_UD;
+      }
+      bld.MOV(dest, flag);
+      break;
+   }
+
+   case nir_intrinsic_read_invocation: {
+      const fs_reg value = get_nir_src(instr->src[0]);
+      const fs_reg invocation = get_nir_src(instr->src[1]);
+      fs_reg tmp = bld.vgrf(value.type);
+
+      bld.exec_all().emit(SHADER_OPCODE_BROADCAST, tmp, value,
+                          bld.emit_uniformize(invocation));
+
+      bld.MOV(retype(dest, value.type), fs_reg(component(tmp, 0)));
+      break;
+   }
+
+   case nir_intrinsic_read_first_invocation: {
+      const fs_reg value = get_nir_src(instr->src[0]);
+      bld.MOV(retype(dest, value.type), bld.emit_uniformize(value));
       break;
    }
 
@@ -4171,7 +4401,7 @@ fs_visitor::nir_emit_ssbo_atomic(const fs_builder &bld,
        */
       brw_mark_surface_used(prog_data,
                             stage_prog_data->binding_table.ssbo_start +
-                            nir->info->num_ssbos - 1);
+                            nir->info.num_ssbos - 1);
    }
 
    fs_reg offset = get_nir_src(instr->src[1]);
@@ -4380,15 +4610,6 @@ fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
    srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(instr->coord_components);
    srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(lod_components);
 
-   if (instr->op == nir_texop_query_levels ||
-       (instr->op == nir_texop_tex && stage != MESA_SHADER_FRAGMENT)) {
-      /* textureQueryLevels() and texture() are implemented in terms of TXS
-       * and TXL respectively, so we need to pass a valid LOD argument.
-       */
-      assert(srcs[TEX_LOGICAL_SRC_LOD].file == BAD_FILE);
-      srcs[TEX_LOGICAL_SRC_LOD] = brw_imm_ud(0u);
-   }
-
    enum opcode opcode;
    switch (instr->op) {
    case nir_texop_tex:
@@ -4591,24 +4812,22 @@ shuffle_32bit_load_result_to_64bit_data(const fs_builder &bld,
  * 64-bit data they are about to write. Because of this the function checks
  * that the src and dst regions involved in the operation do not overlap.
  */
-void
+fs_reg
 shuffle_64bit_data_for_32bit_write(const fs_builder &bld,
-                                   const fs_reg &dst,
                                    const fs_reg &src,
                                    uint32_t components)
 {
    assert(type_sz(src.type) == 8);
-   assert(type_sz(dst.type) == 4);
 
-   assert(!regions_overlap(
-             dst, 2 * components * dst.component_size(bld.dispatch_width()),
-             src, components * src.component_size(bld.dispatch_width())));
+   fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_D, 2 * components);
 
    for (unsigned i = 0; i < components; i++) {
       const fs_reg component_i = offset(src, bld, i);
       bld.MOV(offset(dst, bld, 2 * i), subscript(component_i, dst.type, 0));
       bld.MOV(offset(dst, bld, 2 * i + 1), subscript(component_i, dst.type, 1));
    }
+
+   return dst;
 }
 
 fs_reg