intel/fs: Use new shuffle_32bit_write for all 64-bit storage writes
[mesa.git] / src / intel / compiler / brw_fs_nir.cpp
index 5f8f7cd6a66a151d10d2747e06bd188fff30a3ca..d7a0b7d6849ebf12705892e52c4b1b35a85d2217 100644 (file)
@@ -775,12 +775,12 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
        *        the same qword.
        *     (...)"
        *
-       * This means that 32-bit to 64-bit conversions need to have the 32-bit
-       * data elements aligned to 64-bit. This restriction does not apply to
-       * BDW and later.
+       * This means that conversions from bit-sizes smaller than 64-bit to
+       * 64-bit need to have the source data elements aligned to 64-bit.
+       * This restriction does not apply to BDW and later.
        */
       if (nir_dest_bit_size(instr->dest.dest) == 64 &&
-          nir_src_bit_size(instr->src[0].src) == 32 &&
+          nir_src_bit_size(instr->src[0].src) < 64 &&
           (devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) {
          fs_reg tmp = bld.vgrf(result.type, 1);
          tmp = subscript(tmp, op[0].type, 0);
@@ -1018,9 +1018,11 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
    case nir_op_feq:
    case nir_op_fne: {
       fs_reg dest = result;
-      if (nir_src_bit_size(instr->src[0].src) > 32) {
-         dest = bld.vgrf(BRW_REGISTER_TYPE_DF, 1);
-      }
+
+      const uint32_t bit_size =  nir_src_bit_size(instr->src[0].src);
+      if (bit_size != 32)
+         dest = bld.vgrf(op[0].type, 1);
+
       brw_conditional_mod cond;
       switch (instr->op) {
       case nir_op_flt:
@@ -1038,9 +1040,19 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
       default:
          unreachable("bad opcode");
       }
+
       bld.CMP(dest, op[0], op[1], cond);
-      if (nir_src_bit_size(instr->src[0].src) > 32) {
+
+      if (bit_size > 32) {
          bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0));
+      } else if(bit_size < 32) {
+         /* When we convert the result to 32-bit we need to be careful and do
+          * it as a signed conversion to get sign extension (for 32-bit true)
+          */
+         const brw_reg_type src_type =
+            brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_D);
+
+         bld.MOV(retype(result, BRW_REGISTER_TYPE_D), retype(dest, src_type));
       }
       break;
    }
@@ -1052,9 +1064,10 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
    case nir_op_ieq:
    case nir_op_ine: {
       fs_reg dest = result;
-      if (nir_src_bit_size(instr->src[0].src) > 32) {
-         dest = bld.vgrf(BRW_REGISTER_TYPE_UQ, 1);
-      }
+
+      const uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
+      if (bit_size != 32)
+         dest = bld.vgrf(op[0].type, 1);
 
       brw_conditional_mod cond;
       switch (instr->op) {
@@ -1076,8 +1089,17 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
          unreachable("bad opcode");
       }
       bld.CMP(dest, op[0], op[1], cond);
-      if (nir_src_bit_size(instr->src[0].src) > 32) {
+
+      if (bit_size > 32) {
          bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0));
+      } else if (bit_size < 32) {
+         /* When we convert the result to 32-bit we need to be careful and do
+          * it as a signed conversion to get sign extension (for 32-bit true)
+          */
+         const brw_reg_type src_type =
+            brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_D);
+
+         bld.MOV(retype(result, BRW_REGISTER_TYPE_D), retype(dest, src_type));
       }
       break;
    }
@@ -1292,6 +1314,7 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
       break;
 
    case nir_op_pack_64_2x32_split:
+   case nir_op_pack_32_2x16_split:
       bld.emit(FS_OPCODE_PACK, result, op[0], op[1]);
       break;
 
@@ -1304,6 +1327,15 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
       break;
    }
 
+   case nir_op_unpack_32_2x16_split_x:
+   case nir_op_unpack_32_2x16_split_y: {
+      if (instr->op == nir_op_unpack_32_2x16_split_x)
+         bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UW, 0));
+      else
+         bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UW, 1));
+      break;
+   }
+
    case nir_op_fpow:
       inst = bld.emit(SHADER_OPCODE_POW, result, op[0], op[1]);
       inst->saturate = instr->dest.saturate;
@@ -2273,11 +2305,11 @@ fs_visitor::emit_gs_input_load(const fs_reg &dst,
       }
 
       if (type_sz(dst.type) == 8) {
-         shuffle_32bit_load_result_to_64bit_data(
-            bld, tmp_dst, retype(tmp_dst, BRW_REGISTER_TYPE_F), num_components);
-
-         for (unsigned c = 0; c < num_components; c++)
-            bld.MOV(offset(dst, bld, iter * 2 + c), offset(tmp_dst, bld, c));
+         shuffle_from_32bit_read(bld,
+                                 offset(dst, bld, iter * 2),
+                                 retype(tmp_dst, BRW_REGISTER_TYPE_D),
+                                 0,
+                                 num_components);
       }
 
       if (num_iterations > 1) {
@@ -2340,10 +2372,8 @@ do_untyped_vector_read(const fs_builder &bld,
                               1 /* dims */,
                               num_components_32bit,
                               BRW_PREDICATE_NONE);
-         shuffle_32bit_load_result_to_16bit_data(bld,
-               retype(dest, BRW_REGISTER_TYPE_W),
-               retype(read_result, BRW_REGISTER_TYPE_D),
-               first_component, num_components);
+         shuffle_from_32bit_read(bld, dest, read_result, first_component,
+                                 num_components);
       } else {
          fs_reg read_offset = bld.vgrf(BRW_REGISTER_TYPE_UD);
          for (unsigned i = 0; i < num_components; i++) {
@@ -2404,16 +2434,8 @@ do_untyped_vector_read(const fs_builder &bld,
                                                 BRW_PREDICATE_NONE);
 
          /* Shuffle the 32-bit load result into valid 64-bit data */
-         const fs_reg packed_result = bld.vgrf(dest.type, iter_components);
-         shuffle_32bit_load_result_to_64bit_data(
-            bld, packed_result, read_result, iter_components);
-
-         /* Move each component to its destination */
-         read_result = retype(read_result, BRW_REGISTER_TYPE_DF);
-         for (int c = 0; c < iter_components; c++) {
-            bld.MOV(offset(dest, bld, it * 2 + c),
-                    offset(packed_result, bld, c));
-         }
+         shuffle_from_32bit_read(bld, offset(dest, bld, it * 2),
+                                 read_result, 0, iter_components);
 
          bld.ADD(read_offset, read_offset, brw_imm_ud(16));
       }
@@ -2461,16 +2483,11 @@ fs_visitor::nir_emit_vs_intrinsic(const fs_builder &bld,
       if (type_sz(dest.type) == 8)
          first_component /= 2;
 
-      for (unsigned j = 0; j < num_components; j++) {
-         bld.MOV(offset(dest, bld, j), offset(src, bld, j + first_component));
-      }
-
-      if (type_sz(dest.type) == 8) {
-         shuffle_32bit_load_result_to_64bit_data(bld,
-                                                 dest,
-                                                 retype(dest, BRW_REGISTER_TYPE_F),
-                                                 instr->num_components);
-      }
+      /* For 16-bit support maybe a temporary will be needed to copy from
+       * the ATTR file.
+       */
+      shuffle_from_32bit_read(bld, dest, retype(src, BRW_REGISTER_TYPE_D),
+                              first_component, num_components);
       break;
    }
 
@@ -2648,13 +2665,10 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
           * or SSBOs.
           */
          if (type_sz(dst.type) == 8) {
-            shuffle_32bit_load_result_to_64bit_data(
-               bld, dst, retype(dst, BRW_REGISTER_TYPE_F), num_components);
-
-            for (unsigned c = 0; c < num_components; c++) {
-               bld.MOV(offset(orig_dst, bld, iter * 2 + c),
-                       offset(dst, bld, c));
-            }
+            shuffle_from_32bit_read(bld,
+                                    offset(orig_dst, bld, iter * 2),
+                                    retype(dst, BRW_REGISTER_TYPE_D),
+                                    0, num_components);
          }
 
          /* Copy the temporary to the destination to deal with writemasking.
@@ -2828,8 +2842,7 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
                 * for that.
                 */
                unsigned channel = iter * 2 + i;
-               fs_reg dest = shuffle_64bit_data_for_32bit_write(bld,
-                  offset(value, bld, channel), 1);
+               fs_reg dest = shuffle_for_32bit_write(bld, value, channel, 1);
 
                srcs[header_regs + (i + first_component) * 2] = dest;
                srcs[header_regs + (i + first_component) * 2 + 1] =
@@ -2997,13 +3010,10 @@ fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld,
              * or SSBOs.
              */
             if (type_sz(dest.type) == 8) {
-               shuffle_32bit_load_result_to_64bit_data(
-                  bld, dest, retype(dest, BRW_REGISTER_TYPE_F), num_components);
-
-               for (unsigned c = 0; c < num_components; c++) {
-                  bld.MOV(offset(orig_dest, bld, iter * 2 + c),
-                          offset(dest, bld, c));
-               }
+               shuffle_from_32bit_read(bld,
+                                       offset(orig_dest, bld, iter * 2),
+                                       retype(dest, BRW_REGISTER_TYPE_D),
+                                       0, num_components);
             }
 
             /* If we are loading double data and we need a second read message
@@ -3340,15 +3350,16 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
    case nir_intrinsic_load_input: {
       /* load_input is only used for flat inputs */
       unsigned base = nir_intrinsic_base(instr);
-      unsigned component = nir_intrinsic_component(instr);
+      unsigned comp = nir_intrinsic_component(instr);
       unsigned num_components = instr->num_components;
+      fs_reg orig_dest = dest;
       enum brw_reg_type type = dest.type;
 
       /* Special case fields in the VUE header */
       if (base == VARYING_SLOT_LAYER)
-         component = 1;
+         comp = 1;
       else if (base == VARYING_SLOT_VIEWPORT)
-         component = 2;
+         comp = 2;
 
       if (nir_dest_bit_size(instr->dest) == 64) {
          /* const_index is in 32-bit type size units that could not be aligned
@@ -3357,20 +3368,17 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
           */
          type = BRW_REGISTER_TYPE_F;
          num_components *= 2;
+         dest = bld.vgrf(type, num_components);
       }
 
       for (unsigned int i = 0; i < num_components; i++) {
-         struct brw_reg interp = interp_reg(base, component + i);
-         interp = suboffset(interp, 3);
-         bld.emit(FS_OPCODE_CINTERP, offset(retype(dest, type), bld, i),
-                  retype(fs_reg(interp), type));
+         bld.MOV(offset(retype(dest, type), bld, i),
+                 retype(component(interp_reg(base, comp + i), 3), type));
       }
 
       if (nir_dest_bit_size(instr->dest) == 64) {
-         shuffle_32bit_load_result_to_64bit_data(bld,
-                                                 dest,
-                                                 retype(dest, type),
-                                                 instr->num_components);
+         shuffle_from_32bit_read(bld, orig_dest, dest, 0,
+                                 instr->num_components);
       }
       break;
    }
@@ -3536,8 +3544,8 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
 
       for (unsigned int i = 0; i < instr->num_components; i++) {
          fs_reg interp =
-            fs_reg(interp_reg(nir_intrinsic_base(instr),
-                              nir_intrinsic_component(instr) + i));
+            component(interp_reg(nir_intrinsic_base(instr),
+                                 nir_intrinsic_component(instr) + i), 0);
          interp.type = BRW_REGISTER_TYPE_F;
          dest.type = BRW_REGISTER_TYPE_F;
 
@@ -3688,8 +3696,8 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld,
       unsigned type_size = 4;
       if (nir_src_bit_size(instr->src[0]) == 64) {
          type_size = 8;
-         val_reg = shuffle_64bit_data_for_32bit_write(bld,
-            val_reg, instr->num_components);
+         val_reg = shuffle_for_32bit_write(bld, val_reg, 0,
+                                           instr->num_components);
       }
 
       unsigned type_slots = type_size / 4;
@@ -4230,8 +4238,8 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
              * iteration handle the rest.
              */
             num_components = MIN2(2, num_components);
-            write_src = shuffle_64bit_data_for_32bit_write(bld, write_src,
-                                                           num_components);
+            write_src = shuffle_for_32bit_write(bld, write_src, 0,
+                                                num_components);
          } else if (type_size < 4) {
             assert(type_size == 2);
             /* For 16-bit types we pack two consecutive values into a 32-bit
@@ -4269,11 +4277,8 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
              * aligned. Shuffling only one component would be the same as
              * striding it.
              */
-            fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_D,
-                                  DIV_ROUND_UP(num_components, 2));
-            shuffle_16bit_data_for_32bit_write(bld, tmp, write_src,
-                                               num_components);
-            write_src = tmp;
+            write_src = shuffle_for_32bit_write(bld, write_src, 0,
+                                                num_components);
          }
 
          fs_reg offset_reg;
@@ -4330,7 +4335,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       unsigned num_components = instr->num_components;
       unsigned first_component = nir_intrinsic_component(instr);
       if (nir_src_bit_size(instr->src[0]) == 64) {
-         src = shuffle_64bit_data_for_32bit_write(bld, src, num_components);
+         src = shuffle_for_32bit_write(bld, src, 0, num_components);
          num_components *= 2;
       }
 
@@ -4793,6 +4798,21 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       break;
    }
 
+   case nir_intrinsic_begin_invocation_interlock: {
+      const fs_builder ubld = bld.group(8, 0);
+      const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
+
+      ubld.emit(SHADER_OPCODE_INTERLOCK, tmp)->size_written = 2 *
+         REG_SIZE;
+
+      break;
+   }
+
+   case nir_intrinsic_end_invocation_interlock: {
+      /* We don't need to do anything here */
+      break;
+   }
+
    default:
       unreachable("unknown intrinsic");
    }
@@ -5170,93 +5190,6 @@ fs_visitor::nir_emit_jump(const fs_builder &bld, nir_jump_instr *instr)
    }
 }
 
-/**
- * This helper takes the result of a load operation that reads 32-bit elements
- * in this format:
- *
- * x x x x x x x x
- * y y y y y y y y
- * z z z z z z z z
- * w w w w w w w w
- *
- * and shuffles the data to get this:
- *
- * x y x y x y x y
- * x y x y x y x y
- * z w z w z w z w
- * z w z w z w z w
- *
- * Which is exactly what we want if the load is reading 64-bit components
- * like doubles, where x represents the low 32-bit of the x double component
- * and y represents the high 32-bit of the x double component (likewise with
- * z and w for double component y). The parameter @components represents
- * the number of 64-bit components present in @src. This would typically be
- * 2 at most, since we can only fit 2 double elements in the result of a
- * vec4 load.
- *
- * Notice that @dst and @src can be the same register.
- */
-void
-shuffle_32bit_load_result_to_64bit_data(const fs_builder &bld,
-                                        const fs_reg &dst,
-                                        const fs_reg &src,
-                                        uint32_t components)
-{
-   assert(type_sz(src.type) == 4);
-   assert(type_sz(dst.type) == 8);
-
-   /* A temporary that we will use to shuffle the 32-bit data of each
-    * component in the vector into valid 64-bit data. We can't write directly
-    * to dst because dst can be (and would usually be) the same as src
-    * and in that case the first MOV in the loop below would overwrite the
-    * data read in the second MOV.
-    */
-   fs_reg tmp = bld.vgrf(dst.type);
-
-   for (unsigned i = 0; i < components; i++) {
-      const fs_reg component_i = offset(src, bld, 2 * i);
-
-      bld.MOV(subscript(tmp, src.type, 0), component_i);
-      bld.MOV(subscript(tmp, src.type, 1), offset(component_i, bld, 1));
-
-      bld.MOV(offset(dst, bld, i), tmp);
-   }
-}
-
-void
-shuffle_32bit_load_result_to_16bit_data(const fs_builder &bld,
-                                        const fs_reg &dst,
-                                        const fs_reg &src,
-                                        uint32_t first_component,
-                                        uint32_t components)
-{
-   assert(type_sz(src.type) == 4);
-   assert(type_sz(dst.type) == 2);
-
-   /* A temporary is used to un-shuffle the 32-bit data of each component in
-    * into a valid 16-bit vector. We can't write directly to dst because it
-    * can be the same register as src and in that case the first MOV in the
-    * loop below would overwrite the data read in the second MOV.
-    */
-   fs_reg tmp = retype(bld.vgrf(src.type), dst.type);
-
-   for (unsigned i = 0; i < components; i++) {
-      const fs_reg component_i =
-         subscript(offset(src, bld, (first_component + i) / 2), dst.type,
-                   (first_component + i) % 2);
-
-      bld.MOV(offset(tmp, bld, i % 2), component_i);
-
-      if (i % 2) {
-         bld.MOV(offset(dst, bld, i -1), offset(tmp, bld, 0));
-         bld.MOV(offset(dst, bld, i), offset(tmp, bld, 1));
-      }
-   }
-   if (components % 2) {
-      bld.MOV(offset(dst, bld, components - 1), tmp);
-   }
-}
-
 /**
  * This helper does the inverse operation of
  * SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA.
@@ -5289,32 +5222,148 @@ shuffle_64bit_data_for_32bit_write(const fs_builder &bld,
    return dst;
 }
 
+/*
+ * This helper takes a source register and un/shuffles it into the destination
+ * register.
+ *
+ * If source type size is smaller than destination type size the operation
+ * needed is a component shuffle. The opposite case would be an unshuffle. If
+ * source/destination type size is equal a shuffle is done that would be
+ * equivalent to a simple MOV.
+ *
+ * For example, if source is a 16-bit type and destination is 32-bit. A 3
+ * components .xyz 16-bit vector on SIMD8 would be.
+ *
+ *    |x1|x2|x3|x4|x5|x6|x7|x8|y1|y2|y3|y4|y5|y6|y7|y8|
+ *    |z1|z2|z3|z4|z5|z6|z7|z8|  |  |  |  |  |  |  |  |
+ *
+ * This helper will return the following 2 32-bit components with the 16-bit
+ * values shuffled:
+ *
+ *    |x1 y1|x2 y2|x3 y3|x4 y4|x5 y5|x6 y6|x7 y7|x8 y8|
+ *    |z1   |z2   |z3   |z4   |z5   |z6   |z7   |z8   |
+ *
+ * For unshuffle, the example would be the opposite, a 64-bit type source
+ * and a 32-bit destination. A 2 component .xy 64-bit vector on SIMD8
+ * would be:
+ *
+ *    | x1l   x1h | x2l   x2h | x3l   x3h | x4l   x4h |
+ *    | x5l   x5h | x6l   x6h | x7l   x7h | x8l   x8h |
+ *    | y1l   y1h | y2l   y2h | y3l   y3h | y4l   y4h |
+ *    | y5l   y5h | y6l   y6h | y7l   y7h | y8l   y8h |
+ *
+ * The returned result would be the following 4 32-bit components unshuffled:
+ *
+ *    | x1l | x2l | x3l | x4l | x5l | x6l | x7l | x8l |
+ *    | x1h | x2h | x3h | x4h | x5h | x6h | x7h | x8h |
+ *    | y1l | y2l | y3l | y4l | y5l | y6l | y7l | y8l |
+ *    | y1h | y2h | y3h | y4h | y5h | y6h | y7h | y8h |
+ *
+ * - Source and destination register must not be overlapped.
+ * - components units are measured in terms of the smaller type between
+ *   source and destination because we are un/shuffling the smaller
+ *   components from/into the bigger ones.
+ * - first_component parameter allows skipping source components.
+ */
 void
-shuffle_16bit_data_for_32bit_write(const fs_builder &bld,
-                                   const fs_reg &dst,
-                                   const fs_reg &src,
-                                   uint32_t components)
+shuffle_src_to_dst(const fs_builder &bld,
+                   const fs_reg &dst,
+                   const fs_reg &src,
+                   uint32_t first_component,
+                   uint32_t components)
 {
-   assert(type_sz(src.type) == 2);
-   assert(type_sz(dst.type) == 4);
+   if (type_sz(src.type) == type_sz(dst.type)) {
+      assert(!regions_overlap(dst,
+         type_sz(dst.type) * bld.dispatch_width() * components,
+         offset(src, bld, first_component),
+         type_sz(src.type) * bld.dispatch_width() * components));
+      for (unsigned i = 0; i < components; i++) {
+         bld.MOV(retype(offset(dst, bld, i), src.type),
+                 offset(src, bld, i + first_component));
+      }
+   } else if (type_sz(src.type) < type_sz(dst.type)) {
+      /* Source is shuffled into destination */
+      unsigned size_ratio = type_sz(dst.type) / type_sz(src.type);
+      assert(!regions_overlap(dst,
+         type_sz(dst.type) * bld.dispatch_width() *
+         DIV_ROUND_UP(components, size_ratio),
+         offset(src, bld, first_component),
+         type_sz(src.type) * bld.dispatch_width() * components));
+
+      brw_reg_type shuffle_type =
+         brw_reg_type_from_bit_size(8 * type_sz(src.type),
+                                    BRW_REGISTER_TYPE_D);
+      for (unsigned i = 0; i < components; i++) {
+         fs_reg shuffle_component_i =
+            subscript(offset(dst, bld, i / size_ratio),
+                      shuffle_type, i % size_ratio);
+         bld.MOV(shuffle_component_i,
+                 retype(offset(src, bld, i + first_component), shuffle_type));
+      }
+   } else {
+      /* Source is unshuffled into destination */
+      unsigned size_ratio = type_sz(src.type) / type_sz(dst.type);
+      assert(!regions_overlap(dst,
+         type_sz(dst.type) * bld.dispatch_width() * components,
+         offset(src, bld, first_component / size_ratio),
+         type_sz(src.type) * bld.dispatch_width() *
+         DIV_ROUND_UP(components + (first_component % size_ratio),
+                      size_ratio)));
+
+      brw_reg_type shuffle_type =
+         brw_reg_type_from_bit_size(8 * type_sz(dst.type),
+                                    BRW_REGISTER_TYPE_D);
+      for (unsigned i = 0; i < components; i++) {
+         fs_reg shuffle_component_i =
+            subscript(offset(src, bld, (first_component + i) / size_ratio),
+                      shuffle_type, (first_component + i) % size_ratio);
+         bld.MOV(retype(offset(dst, bld, i), shuffle_type),
+                 shuffle_component_i);
+      }
+   }
+}
 
-   /* A temporary is used to shuffle the 16-bit data of each component in the
-    * 32-bit data vector. We can't write directly to dst because it can be the
-    * same register as src and in that case the first MOV in the loop below
-    * would overwrite the data read in the second MOV.
-    */
-   fs_reg tmp = bld.vgrf(dst.type);
+void
+shuffle_from_32bit_read(const fs_builder &bld,
+                        const fs_reg &dst,
+                        const fs_reg &src,
+                        uint32_t first_component,
+                        uint32_t components)
+{
+   assert(type_sz(src.type) == 4);
 
-   for (unsigned i = 0; i < components; i++) {
-      const fs_reg component_i = offset(src, bld, i);
-      bld.MOV(subscript(tmp, src.type, i % 2), component_i);
-      if (i % 2) {
-         bld.MOV(offset(dst, bld, i / 2), tmp);
-      }
+   /* This function takes components in units of the destination type while
+    * shuffle_src_to_dst takes components in units of the smallest type
+    */
+   if (type_sz(dst.type) > 4) {
+      assert(type_sz(dst.type) == 8);
+      first_component *= 2;
+      components *= 2;
    }
-   if (components % 2) {
-      bld.MOV(offset(dst, bld, components / 2), tmp);
+
+   shuffle_src_to_dst(bld, dst, src, first_component, components);
+}
+
+fs_reg
+shuffle_for_32bit_write(const fs_builder &bld,
+                        const fs_reg &src,
+                        uint32_t first_component,
+                        uint32_t components)
+{
+   fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_D,
+                         DIV_ROUND_UP (components * type_sz(src.type), 4));
+   /* This function takes components in units of the source type while
+    * shuffle_src_to_dst takes components in units of the smallest type
+    */
+   if (type_sz(src.type) > 4) {
+      assert(type_sz(src.type) == 8);
+      first_component *= 2;
+      components *= 2;
    }
+
+   shuffle_src_to_dst(bld, dst, src, first_component, components);
+
+   return dst;
 }
 
 fs_reg