return progress;
}
+
+static nir_ssa_def *
+r600_lower_ubo_to_align16_impl(nir_builder *b, nir_instr *instr, void *_options)
+{
+ b->cursor = nir_before_instr(instr);
+
+ nir_intrinsic_instr *op = nir_instr_as_intrinsic(instr);
+ assert(op->intrinsic == nir_intrinsic_load_ubo);
+
+ bool const_address = (nir_src_is_const(op->src[1]) && nir_src_is_const(op->src[0]));
+
+ nir_ssa_def *offset = op->src[1].ssa;
+
+ /* This is ugly: with const addressing we can actually set a proper fetch target mask,
+ * but for that we need the component encoded, so we don't shift here and do the
+ * decoding in the backend instead. Otherwise we shift the offset right by four and
+ * resolve the component here.
+ * (TODO: encode the start component in the intrinsic when the offset base is
+ * non-constant but a multiple of 16) */
+
+ nir_ssa_def *new_offset = offset;
+ if (!const_address)
+ new_offset = nir_ishr(b, offset, nir_imm_int(b, 4));
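+ /* Example: a byte offset of 24 becomes vec4 slot 1 (24 >> 4); its start
+ * component, dword 2 ((24 >> 2) & 3), is resolved after the load below. */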
+
+ nir_intrinsic_instr *load = nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_ubo_r600);
+ load->num_components = const_address ? op->num_components : 4;
+ load->src[0] = op->src[0];
+ load->src[1] = nir_src_for_ssa(new_offset);
+ nir_intrinsic_set_align(load, nir_intrinsic_align_mul(op), nir_intrinsic_align_offset(op));
+
+ nir_ssa_dest_init(&load->instr, &load->dest, load->num_components, 32, NULL);
+ nir_builder_instr_insert(b, &load->instr);
+
+ /* When four components are loaded, or both the buffer index and the offset
+ * are constant, the backend can handle the load better, so return it as is. */
+ if (op->num_components == 4 || const_address)
+ return &load->dest.ssa;
+
+ /* What follows is a performance disaster when the offset is not constant,
+ * because then any component of the fetched vec4 can be the first one and we
+ * have to pick the result manually. */
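+ /* (offset >> 2) & 3 is the dword index of the first requested component
+ * within the fetched 16-byte slot. */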
+ nir_ssa_def *first_comp = nir_iand(b, nir_ishr(b, offset, nir_imm_int(b, 2)),
+ nir_imm_int(b, 3));
+
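+ /* component_select is true in exactly one lane: the one matching first_comp. */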
+ const unsigned swz_000[4] = {0, 0, 0, 0};
+ nir_ssa_def *component_select = nir_ieq(b, nir_imm_ivec4(b, 0, 1, 2, 3),
+ nir_swizzle(b, first_comp, swz_000, 4));
+
+ const unsigned szw_0[1] = {0};
+ const unsigned szw_1[1] = {1};
+ const unsigned szw_2[1] = {2};
+
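+ /* Each bcsel cascade below selects the num_components wide window of the
+ * fetched vec4 that starts at first_comp. */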
+ if (op->num_components == 1) {
+ const unsigned szw_3[1] = {3};
+ nir_ssa_def *check0 = nir_bcsel(b, nir_swizzle(b, component_select, szw_0, 1),
+ nir_swizzle(b, &load->dest.ssa, szw_0, 1),
+ nir_swizzle(b, &load->dest.ssa, szw_3, 1));
+ nir_ssa_def *check1 = nir_bcsel(b, nir_swizzle(b, component_select, szw_1, 1),
+ nir_swizzle(b, &load->dest.ssa, szw_1, 1),
+ check0);
+ return nir_bcsel(b, nir_swizzle(b, component_select, szw_2, 1),
+ nir_swizzle(b, &load->dest.ssa, szw_2, 1),
+ check1);
+ } else if (op->num_components == 2) {
+ const unsigned szw_01[2] = {0, 1};
+ const unsigned szw_12[2] = {1, 2};
+ const unsigned szw_23[2] = {2, 3};
+
+ nir_ssa_def *check0 = nir_bcsel(b, nir_swizzle(b, component_select, szw_0, 1),
+ nir_swizzle(b, &load->dest.ssa, szw_01, 2),
+ nir_swizzle(b, &load->dest.ssa, szw_23, 2));
+ return nir_bcsel(b, nir_swizzle(b, component_select, szw_1, 1),
+ nir_swizzle(b, &load->dest.ssa, szw_12, 2),
+ check0);
+ } else {
+ const unsigned szw_012[3] = {0, 1, 2};
+ const unsigned szw_123[3] = {1, 2, 3};
+ return nir_bcsel(b, nir_swizzle(b, component_select, szw_0, 1),
+ nir_swizzle(b, &load->dest.ssa, szw_012, 3),
+ nir_swizzle(b, &load->dest.ssa, szw_123, 3));
+ }
+}
+
+bool r600_lower_ubo_to_align16_filter(const nir_instr *instr, const void *_options)
+{
+ if (instr->type != nir_instr_type_intrinsic)
+ return false;
+
+ nir_intrinsic_instr *op = nir_instr_as_intrinsic(instr);
+ return op->intrinsic == nir_intrinsic_load_ubo;
+}
+
+
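+/* Lower all load_ubo intrinsics to 16-byte aligned load_ubo_r600 fetches. */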
+bool r600_lower_ubo_to_align16(nir_shader *shader)
+{
+ return nir_shader_lower_instructions(shader,
+ r600_lower_ubo_to_align16_filter,
+ r600_lower_ubo_to_align16_impl,
+ nullptr);
+}
+
}
using r600::r600_nir_lower_int_tg4;
using r600::r600_nir_lower_pack_unpack_2x16;
using r600::r600_lower_scratch_addresses;
using r600::r600_lower_fs_out_to_vector;
+using r600::r600_lower_ubo_to_align16;
int
r600_glsl_type_size(const struct glsl_type *type, bool is_bindless)
const nir_function *func = reinterpret_cast<const nir_function *>(exec_list_get_head_const(&sel->nir->functions));
bool optimize = func->impl->registers.length() == 0 && !has_saturate(func);
-
+ if (optimize) {
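+ /* Run one optimization round first so constant UBO offsets are folded
+ * before the lowering below, which handles constant addresses much better. */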
+ optimize_once(sel->nir);
+ NIR_PASS_V(sel->nir, r600_lower_ubo_to_align16);
+ }
/* It seems the output of this optimization is cached somewhere, and
* when there are registers, then we can no longer copy propagate, so
* skip the optimization then. (There is probably a better way, but yeah)