intel/fs: Use shuffle_from_32bit_read for 64-bit gs_input_load
[mesa.git] / src / intel / compiler / brw_fs_nir.cpp
index 9a0de3ae92a1ddd874efb8c448fc3bc4fcb95fe0..f45e123cc1b590f71a3b38ff90852dbceacdcbfd 100644 (file)
@@ -2305,11 +2305,11 @@ fs_visitor::emit_gs_input_load(const fs_reg &dst,
       }
 
       if (type_sz(dst.type) == 8) {
-         shuffle_32bit_load_result_to_64bit_data(
-            bld, tmp_dst, retype(tmp_dst, BRW_REGISTER_TYPE_F), num_components);
-
-         for (unsigned c = 0; c < num_components; c++)
-            bld.MOV(offset(dst, bld, iter * 2 + c), offset(tmp_dst, bld, c));
+         shuffle_from_32bit_read(bld,
+                                 offset(dst, bld, iter * 2),
+                                 retype(tmp_dst, BRW_REGISTER_TYPE_D),
+                                 0,
+                                 num_components);
       }
 
       if (num_iterations > 1) {
@@ -2372,10 +2372,8 @@ do_untyped_vector_read(const fs_builder &bld,
                               1 /* dims */,
                               num_components_32bit,
                               BRW_PREDICATE_NONE);
-         shuffle_32bit_load_result_to_16bit_data(bld,
-               retype(dest, BRW_REGISTER_TYPE_W),
-               retype(read_result, BRW_REGISTER_TYPE_D),
-               first_component, num_components);
+         shuffle_from_32bit_read(bld, dest, read_result, first_component,
+                                 num_components);
       } else {
          fs_reg read_offset = bld.vgrf(BRW_REGISTER_TYPE_UD);
          for (unsigned i = 0; i < num_components; i++) {
@@ -2436,16 +2434,8 @@ do_untyped_vector_read(const fs_builder &bld,
                                                 BRW_PREDICATE_NONE);
 
          /* Shuffle the 32-bit load result into valid 64-bit data */
-         const fs_reg packed_result = bld.vgrf(dest.type, iter_components);
-         shuffle_32bit_load_result_to_64bit_data(
-            bld, packed_result, read_result, iter_components);
-
-         /* Move each component to its destination */
-         read_result = retype(read_result, BRW_REGISTER_TYPE_DF);
-         for (int c = 0; c < iter_components; c++) {
-            bld.MOV(offset(dest, bld, it * 2 + c),
-                    offset(packed_result, bld, c));
-         }
+         shuffle_from_32bit_read(bld, offset(dest, bld, it * 2),
+                                 read_result, 0, iter_components);
 
          bld.ADD(read_offset, read_offset, brw_imm_ud(16));
       }
@@ -4299,11 +4289,8 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
              * aligned. Shuffling only one component would be the same as
              * striding it.
              */
-            fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_D,
-                                  DIV_ROUND_UP(num_components, 2));
-            shuffle_16bit_data_for_32bit_write(bld, tmp, write_src,
-                                               num_components);
-            write_src = tmp;
+            write_src = shuffle_for_32bit_write(bld, write_src, 0,
+                                                num_components);
          }
 
          fs_reg offset_reg;
@@ -5268,40 +5255,6 @@ shuffle_32bit_load_result_to_64bit_data(const fs_builder &bld,
    }
 }
 
-void
-shuffle_32bit_load_result_to_16bit_data(const fs_builder &bld,
-                                        const fs_reg &dst,
-                                        const fs_reg &src,
-                                        uint32_t first_component,
-                                        uint32_t components)
-{
-   assert(type_sz(src.type) == 4);
-   assert(type_sz(dst.type) == 2);
-
-   /* A temporary is used to un-shuffle the 32-bit data of each component in
-    * into a valid 16-bit vector. We can't write directly to dst because it
-    * can be the same register as src and in that case the first MOV in the
-    * loop below would overwrite the data read in the second MOV.
-    */
-   fs_reg tmp = retype(bld.vgrf(src.type), dst.type);
-
-   for (unsigned i = 0; i < components; i++) {
-      const fs_reg component_i =
-         subscript(offset(src, bld, (first_component + i) / 2), dst.type,
-                   (first_component + i) % 2);
-
-      bld.MOV(offset(tmp, bld, i % 2), component_i);
-
-      if (i % 2) {
-         bld.MOV(offset(dst, bld, i -1), offset(tmp, bld, 0));
-         bld.MOV(offset(dst, bld, i), offset(tmp, bld, 1));
-      }
-   }
-   if (components % 2) {
-      bld.MOV(offset(dst, bld, components - 1), tmp);
-   }
-}
-
 /**
  * This helper does the inverse operation of
  * SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA.
@@ -5334,34 +5287,6 @@ shuffle_64bit_data_for_32bit_write(const fs_builder &bld,
    return dst;
 }
 
-void
-shuffle_16bit_data_for_32bit_write(const fs_builder &bld,
-                                   const fs_reg &dst,
-                                   const fs_reg &src,
-                                   uint32_t components)
-{
-   assert(type_sz(src.type) == 2);
-   assert(type_sz(dst.type) == 4);
-
-   /* A temporary is used to shuffle the 16-bit data of each component in the
-    * 32-bit data vector. We can't write directly to dst because it can be the
-    * same register as src and in that case the first MOV in the loop below
-    * would overwrite the data read in the second MOV.
-    */
-   fs_reg tmp = bld.vgrf(dst.type);
-
-   for (unsigned i = 0; i < components; i++) {
-      const fs_reg component_i = offset(src, bld, i);
-      bld.MOV(subscript(tmp, src.type, i % 2), component_i);
-      if (i % 2) {
-         bld.MOV(offset(dst, bld, i / 2), tmp);
-      }
-   }
-   if (components % 2) {
-      bld.MOV(offset(dst, bld, components / 2), tmp);
-   }
-}
-
 /*
  * This helper takes a source register and un/shuffles it into the destination
  * register.