const bool needs_scalar =
   intrin->intrinsic == nir_intrinsic_load_scratch;
assert(intrin->dest.is_ssa);
- if (intrin->dest.ssa.bit_size == 32 &&
- (!needs_scalar || intrin->num_components == 1))
- return false;
-
const unsigned bit_size = intrin->dest.ssa.bit_size;
const unsigned num_components = intrin->dest.ssa.num_components;
const unsigned bytes_read = num_components * (bit_size / 8);
const unsigned align = nir_intrinsic_align(intrin);
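+ /* A 32-bit load of at most 4 components with adequate alignment needs no
+ * lowering. Scratch loads are scalar, so they must also be a single
+ * component to take this early-out.
+ */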
+ if (bit_size == 32 && align >= 32 && intrin->num_components <= 4 &&
+ (!needs_scalar || intrin->num_components == 1))
+ return false;
+
nir_ssa_def *result;
nir_src *offset_src = nir_get_io_offset_src(intrin);
if (bit_size < 32 && nir_src_is_const(*offset_src)) {
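/* The offset is constant, so we can emit a single 32-bit load and pull
 * the narrow components out of it with nir_extract_bits(). */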
result = nir_extract_bits(b, &load, 1, load_offset * 8,
num_components, bit_size);
} else {
- /* Otherwise, we have to break it into smaller loads */
- nir_ssa_def *loads[8];
+ /* Otherwise, we have to break it into smaller loads. We could end up
+ * with as many as 32 loads if we're loading a u64vec16 from scratch.
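+ * (16 components * 8 bytes = 128 bytes, at 4 bytes per scalar load.)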
+ */
+ nir_ssa_def *loads[32];
unsigned num_loads = 0;
int load_offset = 0;
while (load_offset < bytes_read) {
assert(writemask < (1 << num_components));
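+ /* No lowering needed: either a single scalar of at most 32 bits, or an
+ * adequately aligned 32-bit vector of at most 4 components with every
+ * component written. Scratch stores are scalar-only.
+ */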
if ((value->bit_size <= 32 && num_components == 1) ||
- (value->bit_size == 32 &&
+ (value->bit_size == 32 && num_components <= 4 && align >= 32 &&
writemask == (1 << num_components) - 1 &&
!needs_scalar))
return false;
if (progress) {
nir_metadata_preserve(impl, nir_metadata_block_index |
nir_metadata_dominance);
+ } else {
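+ /* The pass made no changes, so all analysis results are still valid. */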
+ nir_metadata_preserve(impl, nir_metadata_all);
}
return progress;