}
if (type_sz(dst.type) == 8) {
- shuffle_32bit_load_result_to_64bit_data(
- bld, tmp_dst, retype(tmp_dst, BRW_REGISTER_TYPE_F), num_components);
-
- for (unsigned c = 0; c < num_components; c++)
- bld.MOV(offset(dst, bld, iter * 2 + c), offset(tmp_dst, bld, c));
+ shuffle_from_32bit_read(bld,
+ offset(dst, bld, iter * 2),
+ retype(tmp_dst, BRW_REGISTER_TYPE_D),
+ 0,
+ num_components);
}
if (num_iterations > 1) {
1 /* dims */,
num_components_32bit,
BRW_PREDICATE_NONE);
- shuffle_32bit_load_result_to_16bit_data(bld,
- retype(dest, BRW_REGISTER_TYPE_W),
- retype(read_result, BRW_REGISTER_TYPE_D),
- first_component, num_components);
+ shuffle_from_32bit_read(bld, dest, read_result, first_component,
+ num_components);
} else {
fs_reg read_offset = bld.vgrf(BRW_REGISTER_TYPE_UD);
for (unsigned i = 0; i < num_components; i++) {
BRW_PREDICATE_NONE);
/* Shuffle the 32-bit load result into valid 64-bit data */
- const fs_reg packed_result = bld.vgrf(dest.type, iter_components);
- shuffle_32bit_load_result_to_64bit_data(
- bld, packed_result, read_result, iter_components);
-
- /* Move each component to its destination */
- read_result = retype(read_result, BRW_REGISTER_TYPE_DF);
- for (int c = 0; c < iter_components; c++) {
- bld.MOV(offset(dest, bld, it * 2 + c),
- offset(packed_result, bld, c));
- }
+ shuffle_from_32bit_read(bld, offset(dest, bld, it * 2),
+ read_result, 0, iter_components);
bld.ADD(read_offset, read_offset, brw_imm_ud(16));
}
* aligned. Shuffling only one component would be the same as
* striding it.
*/
- fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_D,
- DIV_ROUND_UP(num_components, 2));
- shuffle_16bit_data_for_32bit_write(bld, tmp, write_src,
- num_components);
- write_src = tmp;
+ write_src = shuffle_for_32bit_write(bld, write_src, 0,
+ num_components);
}
fs_reg offset_reg;
}
}
-void
-shuffle_32bit_load_result_to_16bit_data(const fs_builder &bld,
- const fs_reg &dst,
- const fs_reg &src,
- uint32_t first_component,
- uint32_t components)
-{
- assert(type_sz(src.type) == 4);
- assert(type_sz(dst.type) == 2);
-
- /* A temporary is used to un-shuffle the 32-bit data of each component in
- * into a valid 16-bit vector. We can't write directly to dst because it
- * can be the same register as src and in that case the first MOV in the
- * loop below would overwrite the data read in the second MOV.
- */
- fs_reg tmp = retype(bld.vgrf(src.type), dst.type);
-
- for (unsigned i = 0; i < components; i++) {
- const fs_reg component_i =
- subscript(offset(src, bld, (first_component + i) / 2), dst.type,
- (first_component + i) % 2);
-
- bld.MOV(offset(tmp, bld, i % 2), component_i);
-
- if (i % 2) {
- bld.MOV(offset(dst, bld, i -1), offset(tmp, bld, 0));
- bld.MOV(offset(dst, bld, i), offset(tmp, bld, 1));
- }
- }
- if (components % 2) {
- bld.MOV(offset(dst, bld, components - 1), tmp);
- }
-}
-
/**
* This helper does the inverse operation of
* SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA.
return dst;
}
-void
-shuffle_16bit_data_for_32bit_write(const fs_builder &bld,
- const fs_reg &dst,
- const fs_reg &src,
- uint32_t components)
-{
- assert(type_sz(src.type) == 2);
- assert(type_sz(dst.type) == 4);
-
- /* A temporary is used to shuffle the 16-bit data of each component in the
- * 32-bit data vector. We can't write directly to dst because it can be the
- * same register as src and in that case the first MOV in the loop below
- * would overwrite the data read in the second MOV.
- */
- fs_reg tmp = bld.vgrf(dst.type);
-
- for (unsigned i = 0; i < components; i++) {
- const fs_reg component_i = offset(src, bld, i);
- bld.MOV(subscript(tmp, src.type, i % 2), component_i);
- if (i % 2) {
- bld.MOV(offset(dst, bld, i / 2), tmp);
- }
- }
- if (components % 2) {
- bld.MOV(offset(dst, bld, components / 2), tmp);
- }
-}
-
/*
* This helper takes a source register and un/shuffles it into the destination
* register.