i965/vec4: Fix UBO loads for 64-bit data
author Iago Toral Quiroga <itoral@igalia.com>
Wed, 13 Jul 2016 10:10:18 +0000 (12:10 +0200)
committer Samuel Iglesias Gonsálvez <siglesias@igalia.com>
Tue, 3 Jan 2017 10:26:51 +0000 (11:26 +0100)
We need to emit two 32-bit load messages to load a full dvec4. If only
1 or 2 double components are needed, dead-code elimination will remove
the second load message.

We also need to shuffle the result of the 32-bit messages to form
valid 64-bit SIMD4x2 data.

v2:
 - use byte_offset() instead of offset() (Iago)
 - keep the const. offset as an immediate like the original code did (Juan)

Reviewed-by: Matt Turner <mattst88@gmail.com>
src/mesa/drivers/dri/i965/brw_vec4_nir.cpp

index 14d0546c5c09e4f12c1e56076f12c562526dc8ae..65decb49b3b4b1fa95117f3af959f588bf91fb5f 100644 (file)
@@ -822,31 +822,50 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
                                nir->info->num_ubos - 1);
       }
 
-      src_reg offset;
+      src_reg offset_reg;
       nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
       if (const_offset) {
-         offset = brw_imm_ud(const_offset->u32[0] & ~15);
+         offset_reg = brw_imm_ud(const_offset->u32[0] & ~15);
       } else {
-         offset = get_nir_src(instr->src[1], nir_type_uint32, 1);
+         offset_reg = get_nir_src(instr->src[1], nir_type_uint32, 1);
       }
 
-      src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
-      packed_consts.type = dest.type;
-
-      emit_pull_constant_load_reg(dst_reg(packed_consts),
-                                  surf_index,
-                                  offset,
-                                  NULL, NULL /* before_block/inst */);
+      src_reg packed_consts;
+      if (nir_dest_bit_size(instr->dest) == 32) {
+         packed_consts = src_reg(this, glsl_type::vec4_type);
+         emit_pull_constant_load_reg(dst_reg(packed_consts),
+                                     surf_index,
+                                     offset_reg,
+                                     NULL, NULL /* before_block/inst */);
+      } else {
+         src_reg temp = src_reg(this, glsl_type::dvec4_type);
+         src_reg temp_float = retype(temp, BRW_REGISTER_TYPE_F);
+
+         emit_pull_constant_load_reg(dst_reg(temp_float),
+                                     surf_index, offset_reg, NULL, NULL);
+         if (offset_reg.file == IMM)
+            offset_reg.ud += 16;
+         else
+            emit(ADD(dst_reg(offset_reg), offset_reg, brw_imm_ud(16u)));
+         emit_pull_constant_load_reg(dst_reg(byte_offset(temp_float, REG_SIZE)),
+                                     surf_index, offset_reg, NULL, NULL);
+
+         packed_consts = src_reg(this, glsl_type::dvec4_type);
+         shuffle_64bit_data(dst_reg(packed_consts), temp, false);
+      }
 
       packed_consts.swizzle = brw_swizzle_for_size(instr->num_components);
       if (const_offset) {
-         packed_consts.swizzle += BRW_SWIZZLE4(const_offset->u32[0] % 16 / 4,
-                                               const_offset->u32[0] % 16 / 4,
-                                               const_offset->u32[0] % 16 / 4,
-                                               const_offset->u32[0] % 16 / 4);
+         unsigned type_size = type_sz(dest.type);
+         packed_consts.swizzle +=
+            BRW_SWIZZLE4(const_offset->u32[0] % 16 / type_size,
+                         const_offset->u32[0] % 16 / type_size,
+                         const_offset->u32[0] % 16 / type_size,
+                         const_offset->u32[0] % 16 / type_size);
       }
 
-      emit(MOV(dest, packed_consts));
+      emit(MOV(dest, retype(packed_consts, dest.type)));
+
       break;
    }