return;
if (ctx->allocated_vec.find(vec_src.id()) != ctx->allocated_vec.end())
return;
- aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_components)};
- split->operands[0] = Operand(vec_src);
- std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
RegClass rc;
if (num_components > vec_src.size()) {
- if (vec_src.type() == RegType::sgpr)
+ if (vec_src.type() == RegType::sgpr) {
+ /* should still help get_alu_src() */
+ emit_split_vector(ctx, vec_src, vec_src.size());
return;
-
+ }
/* sub-dword split */
- assert(vec_src.type() == RegType::vgpr);
rc = RegClass(RegType::vgpr, vec_src.bytes() / num_components).as_subdword();
} else {
rc = RegClass(vec_src.type(), vec_src.size() / num_components);
}
+ aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_components)};
+ split->operands[0] = Operand(vec_src);
+ std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
for (unsigned i = 0; i < num_components; i++) {
elems[i] = {ctx->program->allocateId(), rc};
split->definitions[i] = Definition(elems[i]);
return vec;
Temp dst{ctx->program->allocateId(), s1};
- aco_ptr<SOP2_instruction> bfe{create_instruction<SOP2_instruction>(aco_opcode::s_bfe_u32, Format::SOP2, 2, 1)};
+ aco_ptr<SOP2_instruction> bfe{create_instruction<SOP2_instruction>(aco_opcode::s_bfe_u32, Format::SOP2, 2, 2)};
bfe->operands[0] = Operand(vec);
bfe->operands[1] = Operand(uint32_t((src.src.ssa->bit_size << 16) | (src.src.ssa->bit_size * swizzle)));
bfe->definitions[0] = Definition(dst);
+ bfe->definitions[1] = Definition(ctx->program->allocateId(), scc, s1);
ctx->block->instructions.emplace_back(std::move(bfe));
return dst;
}
if (instr->dest.dest.ssa.bit_size >= 32 || dst.type() == RegType::vgpr) {
aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.dest.ssa.num_components, 1)};
- for (unsigned i = 0; i < num; ++i)
- vec->operands[i] = Operand{elems[i]};
+ RegClass elem_rc = RegClass::get(RegType::vgpr, instr->dest.dest.ssa.bit_size / 8u);
+ for (unsigned i = 0; i < num; ++i) {
+ if (elems[i].type() == RegType::sgpr && elem_rc.is_subdword())
+ vec->operands[i] = Operand(emit_extract_vector(ctx, elems[i], 0, elem_rc));
+ else
+ vec->operands[i] = Operand{elems[i]};
+ }
vec->definitions[0] = Definition(dst);
ctx->block->instructions.emplace_back(std::move(vec));
ctx->allocated_vec.emplace(dst.id(), elems);
return dst;
}
-Temp extract_subvector(isel_context *ctx, Temp data, unsigned start, unsigned size, RegType type)
+/* Splits `src` into `count` temporaries of register type `dst_type` and
+ * writes them to `dst`. Piece i starts at byte `offsets[i]` of `src` and ends
+ * at `offsets[i + 1]`; the last piece ends at the end of `src`. `offsets` is
+ * therefore expected to be ascending. Sub-dword pieces are only accepted for
+ * VGPR destinations (asserted).
+ *
+ * If `src` has an entry in ctx->allocated_vec whose elements cover `src`
+ * exactly and line up with every requested piece, the pieces are built from
+ * those elements. Otherwise a single p_split_vector of `src` is emitted.
+ */
+void split_store_data(isel_context *ctx, RegType dst_type, unsigned count, Temp *dst, unsigned *offsets, Temp src)
{
- if (start == 0 && size == data.size())
- return type == RegType::vgpr ? as_vgpr(ctx, data) : data;
+ if (!count)
+ return;
- unsigned size_hint = 1;
- auto it = ctx->allocated_vec.find(data.id());
- if (it != ctx->allocated_vec.end())
- size_hint = it->second[0].size();
- if (size % size_hint || start % size_hint)
- size_hint = 1;
+ Builder bld(ctx->program, ctx->block);
- start /= size_hint;
- size /= size_hint;
+ /* a piece is sub-dword if it starts or ends off a 4-byte boundary */
+ ASSERTED bool is_subdword = false;
+ for (unsigned i = 0; i < count; i++)
+ is_subdword |= offsets[i] % 4;
+ is_subdword |= (src.bytes() - offsets[count - 1]) % 4;
+ assert(!is_subdword || dst_type == RegType::vgpr);
- Temp elems[size];
- for (unsigned i = 0; i < size; i++)
- elems[i] = emit_extract_vector(ctx, data, start + i, RegClass(type, size_hint));
+ /* count == 1 fast path */
+ if (count == 1) {
+ if (dst_type == RegType::sgpr)
+ dst[0] = bld.as_uniform(src);
+ else
+ dst[0] = as_vgpr(ctx, src);
+ return;
+ }
- if (size == 1)
- return type == RegType::vgpr ? as_vgpr(ctx, elems[0]) : elems[0];
+ /* allocate the destination temporaries; their sizes follow from the offsets */
+ for (unsigned i = 0; i < count - 1; i++)
+ dst[i] = bld.tmp(RegClass::get(dst_type, offsets[i + 1] - offsets[i]));
+ dst[count - 1] = bld.tmp(RegClass::get(dst_type, src.bytes() - offsets[count - 1]));
- aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, size, 1)};
- for (unsigned i = 0; i < size; i++)
- vec->operands[i] = Operand(elems[i]);
- Temp res = {ctx->program->allocateId(), RegClass(type, size * size_hint)};
- vec->definitions[0] = Definition(res);
- ctx->block->instructions.emplace_back(std::move(vec));
- return res;
+ if (is_subdword && src.type() == RegType::sgpr) {
+ src = as_vgpr(ctx, src);
+ } else {
+ /* use allocated_vec if possible */
+ auto it = ctx->allocated_vec.find(src.id());
+ if (it != ctx->allocated_vec.end()) {
+ unsigned total_size = 0;
+ /* The index has to be range-checked before the element is read:
+ * a fully populated array has no zero-sized terminator element. */
+ for (unsigned i = 0; (i < NIR_MAX_VEC_COMPONENTS) && it->second[i].bytes(); i++)
+ total_size += it->second[i].bytes();
+ if (total_size != src.bytes())
+ goto split;
+
+ unsigned elem_size = it->second[0].bytes();
+
+ /* every piece has to start and end on an element boundary */
+ for (unsigned i = 0; i < count; i++) {
+ if (offsets[i] % elem_size || dst[i].bytes() % elem_size)
+ goto split;
+ }
+
+ for (unsigned i = 0; i < count; i++) {
+ unsigned start_idx = offsets[i] / elem_size;
+ unsigned op_count = dst[i].bytes() / elem_size;
+ if (op_count == 1) {
+ /* a single element is used directly instead of the temporary allocated above */
+ if (dst_type == RegType::sgpr)
+ dst[i] = bld.as_uniform(it->second[start_idx]);
+ else
+ dst[i] = as_vgpr(ctx, it->second[start_idx]);
+ continue;
+ }
+
+ aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, op_count, 1)};
+ for (unsigned j = 0; j < op_count; j++) {
+ Temp tmp = it->second[start_idx + j];
+ if (dst_type == RegType::sgpr)
+ tmp = bld.as_uniform(tmp);
+ vec->operands[j] = Operand(tmp);
+ }
+ vec->definitions[0] = Definition(dst[i]);
+ bld.insert(std::move(vec));
+ }
+ return;
+ }
+ }
+
+ if (dst_type == RegType::sgpr)
+ src = bld.as_uniform(src);
+
+ split:
+ /* just split it */
+ aco_ptr<Instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, count)};
+ split->operands[0] = Operand(src);
+ for (unsigned i = 0; i < count; i++)
+ split->definitions[i] = Definition(dst[i]);
+ bld.insert(std::move(split));
+}
+
+/* Looks at the lowest bit still set in `todo_mask` and reports, through
+ * `start` and `count`, the consecutive bit range beginning there that is
+ * either entirely set or entirely clear in `mask` (restricted to
+ * `todo_mask`). Returns true when that range is set in `mask`, i.e. has to
+ * be written, and false when it is a gap to skip.
+ */
+bool scan_write_mask(uint32_t mask, uint32_t todo_mask,
+                     int *start, int *count)
+{
+   const unsigned first_todo = ffs(todo_mask) - 1;
+   const bool is_written = (mask & (1 << first_todo)) != 0;
+
+   /* For a gap, scan the complement so the run of cleared bits is found. */
+   uint32_t range_bits = (is_written ? mask : ~mask) & todo_mask;
+   u_bit_scan_consecutive_range(&range_bits, start, count);
+
+   return is_written;
}
-void ds_write_helper(isel_context *ctx, Operand m, Temp address, Temp data, unsigned data_start, unsigned total_size, unsigned offset0, unsigned offset1, unsigned align)
+void advance_write_mask(uint32_t *todo_mask, int start, int count)
{
+   /* The complement is taken before the shift, so besides the `count` bits
+    * starting at `start`, every bit below `start` is dropped as well. */
+   const uint32_t remaining = (~u_bit_consecutive(0, count)) << start;
+   *todo_mask = *todo_mask & remaining;
+}
+
+/* Emits ds_write* instructions that store the parts of `data` selected by
+ * `wrmask` to LDS at `address` plus `base_offset`.
+ *
+ * `wrmask` is given per element of `elem_size_bytes` bytes and is widened
+ * with widen_mask() (presumably to one bit per byte -- confirm against
+ * widen_mask()). `align` is the known power-of-two alignment used to pick
+ * store sizes.
+ *
+ * The function works in three steps:
+ *  1. walk the mask and record, for each consecutive range, either a gap
+ *     (opcode num_opcodes) or the widest store opcode allowed by the
+ *     remaining size, the alignment and the chip,
+ *  2. split `data` at the recorded offsets into VGPR pieces,
+ *  3. emit the stores, pairing two equally sized b32/b64 stores into a
+ *     ds_write2 where the chip allows it.
+ */
+void store_lds(isel_context *ctx, unsigned elem_size_bytes, Temp data, uint32_t wrmask,
+ Temp address, unsigned base_offset, unsigned align)
+{
+ assert(util_is_power_of_two_nonzero(align));
+ assert(util_is_power_of_two_nonzero(elem_size_bytes) && elem_size_bytes <= 8);
+
Builder bld(ctx->program, ctx->block);
- unsigned bytes_written = 0;
bool large_ds_write = ctx->options->chip_class >= GFX7;
bool usable_write2 = ctx->options->chip_class >= GFX7;
- while (bytes_written < total_size * 4) {
- unsigned todo = total_size * 4 - bytes_written;
- bool aligned8 = bytes_written % 8 == 0 && align % 8 == 0;
- bool aligned16 = bytes_written % 16 == 0 && align % 16 == 0;
+ unsigned write_count = 0;
+ Temp write_datas[32];
+ unsigned offsets[32];
+ aco_opcode opcodes[32];
+
+ wrmask = widen_mask(wrmask, elem_size_bytes);
+
+ /* step 1: decide offset and opcode of every store (and every gap) */
+ uint32_t todo = u_bit_consecutive(0, data.bytes());
+ while (todo) {
+ int offset, bytes;
+ if (!scan_write_mask(wrmask, todo, &offset, &bytes)) {
+ /* gap: recorded so that split_store_data() sees contiguous offsets */
+ offsets[write_count] = offset;
+ opcodes[write_count] = aco_opcode::num_opcodes;
+ write_count++;
+ advance_write_mask(&todo, offset, bytes);
+ continue;
+ }
+
+ bool aligned2 = offset % 2 == 0 && align % 2 == 0;
+ bool aligned4 = offset % 4 == 0 && align % 4 == 0;
+ bool aligned8 = offset % 8 == 0 && align % 8 == 0;
+ bool aligned16 = offset % 16 == 0 && align % 16 == 0;
- aco_opcode op = aco_opcode::last_opcode;
- bool write2 = false;
- unsigned size = 0;
- if (todo >= 16 && aligned16 && large_ds_write) {
+ //TODO: use ds_write_b8_d16_hi/ds_write_b16_d16_hi if beneficial
+ aco_opcode op = aco_opcode::num_opcodes;
+ if (bytes >= 16 && aligned16 && large_ds_write) {
op = aco_opcode::ds_write_b128;
- size = 4;
- } else if (todo >= 16 && aligned8 && usable_write2) {
- op = aco_opcode::ds_write2_b64;
- write2 = true;
- size = 4;
- } else if (todo >= 12 && aligned16 && large_ds_write) {
+ bytes = 16;
+ } else if (bytes >= 12 && aligned16 && large_ds_write) {
op = aco_opcode::ds_write_b96;
- size = 3;
- } else if (todo >= 8 && aligned8) {
+ bytes = 12;
+ } else if (bytes >= 8 && aligned8) {
op = aco_opcode::ds_write_b64;
- size = 2;
- } else if (todo >= 8 && usable_write2) {
- op = aco_opcode::ds_write2_b32;
- write2 = true;
- size = 2;
- } else if (todo >= 4) {
+ bytes = 8;
+ } else if (bytes >= 4 && aligned4) {
op = aco_opcode::ds_write_b32;
- size = 1;
+ bytes = 4;
+ } else if (bytes >= 2 && aligned2) {
+ op = aco_opcode::ds_write_b16;
+ bytes = 2;
+ } else if (bytes >= 1) {
+ op = aco_opcode::ds_write_b8;
+ bytes = 1;
} else {
assert(false);
}
- unsigned offset = offset0 + offset1 + bytes_written;
- unsigned max_offset = write2 ? 1020 : 65535;
- Temp address_offset = address;
- if (offset > max_offset) {
- address_offset = bld.vadd32(bld.def(v1), Operand(offset0), address_offset);
- offset = offset1 + bytes_written;
- }
- assert(offset <= max_offset); /* offset1 shouldn't be large enough for this to happen */
+ offsets[write_count] = offset;
+ opcodes[write_count] = op;
+ write_count++;
+ advance_write_mask(&todo, offset, bytes);
+ }
- if (write2) {
- Temp val0 = extract_subvector(ctx, data, data_start + (bytes_written >> 2), size / 2, RegType::vgpr);
- Temp val1 = extract_subvector(ctx, data, data_start + (bytes_written >> 2) + 1, size / 2, RegType::vgpr);
- bld.ds(op, address_offset, val0, val1, m, offset / size / 2, (offset / size / 2) + 1);
- } else {
- Temp val = extract_subvector(ctx, data, data_start + (bytes_written >> 2), size, RegType::vgpr);
- bld.ds(op, address_offset, val, m, offset);
- }
+ Operand m = load_lds_size_m0(bld);
- bytes_written += size * 4;
- }
-}
+ /* step 2: cut `data` into one VGPR piece per recorded range */
+ split_store_data(ctx, RegType::vgpr, write_count, write_datas, offsets, data);
-void store_lds(isel_context *ctx, unsigned elem_size_bytes, Temp data, uint32_t wrmask,
- Temp address, unsigned base_offset, unsigned align)
-{
- assert(util_is_power_of_two_nonzero(align) && align >= 4);
- assert(elem_size_bytes == 4 || elem_size_bytes == 8);
+ /* step 3: emit the stores; entries marked num_opcodes are gaps or were
+ * already merged into an earlier ds_write2 */
+ for (unsigned i = 0; i < write_count; i++) {
+ aco_opcode op = opcodes[i];
+ if (op == aco_opcode::num_opcodes)
+ continue;
- Builder bld(ctx->program, ctx->block);
- Operand m = load_lds_size_m0(bld);
+ Temp data = write_datas[i];
- /* we need at most two stores, assuming that the writemask is at most 4 bits wide */
- assert(wrmask <= 0x0f);
- int start[2], count[2];
- u_bit_scan_consecutive_range(&wrmask, &start[0], &count[0]);
- u_bit_scan_consecutive_range(&wrmask, &start[1], &count[1]);
- assert(wrmask == 0);
+ unsigned second = write_count;
+ if (usable_write2 && (op == aco_opcode::ds_write_b32 || op == aco_opcode::ds_write_b64)) {
+ for (second = i + 1; second < write_count; second++) {
+ if (opcodes[second] == op && (offsets[second] - offsets[i]) % data.bytes() == 0) {
+ op = data.bytes() == 4 ? aco_opcode::ds_write2_b32 : aco_opcode::ds_write2_b64;
+ opcodes[second] = aco_opcode::num_opcodes;
+ break;
+ }
+ }
+ }
+
+ bool write2 = op == aco_opcode::ds_write2_b32 || op == aco_opcode::ds_write2_b64;
+ /* `second` is only a valid index when a partner store was found; without
+ * one it equals write_count and offsets[second] must not be read. */
+ unsigned write2_off = write2 ? (offsets[second] - offsets[i]) / data.bytes() : 0;
- /* one combined store is sufficient */
- if (count[0] == count[1] && (align % elem_size_bytes) == 0 && (base_offset % elem_size_bytes) == 0) {
+ unsigned inline_offset = base_offset + offsets[i];
+ unsigned max_offset = write2 ? (255 - write2_off) * data.bytes() : 65535;
Temp address_offset = address;
- if ((base_offset / elem_size_bytes) + start[1] > 255) {
+ if (inline_offset > max_offset) {
address_offset = bld.vadd32(bld.def(v1), Operand(base_offset), address_offset);
- base_offset = 0;
+ inline_offset = offsets[i];
}
+ assert(inline_offset <= max_offset); /* offsets[i] shouldn't be large enough for this to happen */
- assert(count[0] == 1);
- RegClass xtract_rc(RegType::vgpr, elem_size_bytes / 4);
-
- Temp val0 = emit_extract_vector(ctx, data, start[0], xtract_rc);
- Temp val1 = emit_extract_vector(ctx, data, start[1], xtract_rc);
- aco_opcode op = elem_size_bytes == 4 ? aco_opcode::ds_write2_b32 : aco_opcode::ds_write2_b64;
- base_offset = base_offset / elem_size_bytes;
- bld.ds(op, address_offset, val0, val1, m,
- base_offset + start[0], base_offset + start[1]);
- return;
- }
-
- for (unsigned i = 0; i < 2; i++) {
- if (count[i] == 0)
- continue;
-
- unsigned elem_size_words = elem_size_bytes / 4;
- ds_write_helper(ctx, m, address, data, start[i] * elem_size_words, count[i] * elem_size_words,
- base_offset, start[i] * elem_size_bytes, align);
+ if (write2) {
+ /* the two ds_write2 offsets are passed in units of the store size */
+ Temp second_data = write_datas[second];
+ inline_offset /= data.bytes();
+ bld.ds(op, address_offset, data, second_data, m, inline_offset, inline_offset + write2_off);
+ } else {
+ bld.ds(op, address_offset, data, m, inline_offset);
+ }
}
- return;
}
unsigned calculate_lds_alignment(isel_context *ctx, unsigned const_offset)
}
+/* Maps a store size in bytes to the matching buffer store opcode:
+ * s_buffer_store_* when `smem` is set, buffer_store_* otherwise.
+ * 1-, 2- and 12-byte stores only have a buffer_store_* form here, which is
+ * asserted. Sizes other than 1, 2, 4, 8, 12 and 16 are unreachable.
+ */
+aco_opcode get_buffer_store_op(bool smem, unsigned bytes)
+{
+   if (bytes == 1) {
+      assert(!smem);
+      return aco_opcode::buffer_store_byte;
+   }
+   if (bytes == 2) {
+      assert(!smem);
+      return aco_opcode::buffer_store_short;
+   }
+   if (bytes == 4) {
+      if (smem)
+         return aco_opcode::s_buffer_store_dword;
+      return aco_opcode::buffer_store_dword;
+   }
+   if (bytes == 8) {
+      if (smem)
+         return aco_opcode::s_buffer_store_dwordx2;
+      return aco_opcode::buffer_store_dwordx2;
+   }
+   if (bytes == 12) {
+      assert(!smem);
+      return aco_opcode::buffer_store_dwordx3;
+   }
+   if (bytes == 16) {
+      if (smem)
+         return aco_opcode::s_buffer_store_dwordx4;
+      return aco_opcode::buffer_store_dwordx4;
+   }
+
+   unreachable("Unexpected store size");
+   return aco_opcode::num_opcodes;
+}
+
+/* Decides how a buffer store of `data` has to be broken into individual
+ * store instructions and splits `data` accordingly.
+ *
+ * instr                 store intrinsic providing the alignment info, or
+ *                       NULL to assume dword alignment
+ * smem                  true if the stores will be SMEM stores
+ * dst_type              register type of the resulting pieces
+ * writemask             mask of the parts of `data` to store; scanned
+ *                       against one bit per byte of `data`
+ * swizzle_element_size  upper bound in bytes for a single store
+ * write_count           in/out: number of entries already in
+ *                       write_datas/offsets; incremented per emitted piece
+ *                       (callers in this file pass it as 0)
+ * write_datas, offsets  out: the pieces to store and their byte offsets
+ *                       within `data`
+ */
+void split_buffer_store(isel_context *ctx, nir_intrinsic_instr *instr, bool smem, RegType dst_type,
+                        Temp data, unsigned writemask, int swizzle_element_size,
+                        unsigned *write_count, Temp *write_datas, unsigned *offsets)
+{
+   unsigned write_count_with_skips = 0;
+   /* Sized like the 32-entry arrays the callers pass: `todo` is a 32-bit
+    * mask and every iteration below is expected to clear at least one bit. */
+   bool skips[32];
+
+   /* determine how to split the data */
+   unsigned todo = u_bit_consecutive(0, data.bytes());
+   while (todo) {
+      int offset, bytes;
+      skips[write_count_with_skips] = !scan_write_mask(writemask, todo, &offset, &bytes);
+      offsets[write_count_with_skips] = offset;
+      if (skips[write_count_with_skips]) {
+         /* gaps are recorded too, so split_store_data() sees contiguous offsets */
+         advance_write_mask(&todo, offset, bytes);
+         write_count_with_skips++;
+         continue;
+      }
+
+      /* only supported sizes are 1, 2, 4, 8, 12 and 16 bytes and can't be
+       * larger than swizzle_element_size */
+      bytes = MIN2(bytes, swizzle_element_size);
+      if (bytes % 4)
+         bytes = bytes > 4 ? bytes & ~0x3 : MIN2(bytes, 2);
+
+      /* SMEM and GFX6 VMEM can't emit 12-byte stores */
+      if ((ctx->program->chip_class == GFX6 || smem) && bytes == 12)
+         bytes = 8;
+
+      /* dword or larger stores have to be dword-aligned; the address is
+       * known to be `align_offset` bytes past a multiple of `align_mul` */
+      unsigned align_mul = instr ? nir_intrinsic_align_mul(instr) : 4;
+      unsigned align_offset = instr ? nir_intrinsic_align_offset(instr) : 0;
+      bool dword_aligned = (align_offset + offset) % 4 == 0 && align_mul % 4 == 0;
+      if (bytes >= 4 && !dword_aligned)
+         bytes = MIN2(bytes, 2);
+
+      advance_write_mask(&todo, offset, bytes);
+      write_count_with_skips++;
+   }
+
+   /* actually split data */
+   split_store_data(ctx, dst_type, write_count_with_skips, write_datas, offsets, data);
+
+   /* remove skips */
+   for (unsigned i = 0; i < write_count_with_skips; i++) {
+      if (skips[i])
+         continue;
+      write_datas[*write_count] = write_datas[i];
+      offsets[*write_count] = offsets[i];
+      (*write_count)++;
+   }
+}
+
Temp create_vec_from_array(isel_context *ctx, Temp arr[], unsigned cnt, RegType reg_type, unsigned elem_size_bytes,
unsigned split_cnt = 0u, Temp dst = Temp())
{
assert(vdata.size() >= 1 && vdata.size() <= 4);
Builder bld(ctx->program, ctx->block);
- aco_opcode op = (aco_opcode) ((unsigned) aco_opcode::buffer_store_dword + vdata.size() - 1);
+ aco_opcode op = get_buffer_store_op(false, vdata.bytes());
const_offset = resolve_excess_vmem_const_offset(bld, voffset, const_offset);
Operand voffset_op = voffset.id() ? Operand(as_vgpr(ctx, voffset)) : Operand(v1);
Builder bld(ctx->program, ctx->block);
assert(elem_size_bytes == 4 || elem_size_bytes == 8);
assert(write_mask);
+ write_mask = widen_mask(write_mask, elem_size_bytes);
- if (elem_size_bytes == 8) {
- elem_size_bytes = 4;
- write_mask = widen_mask(write_mask, 2);
- }
-
- while (write_mask) {
- int start = 0;
- int count = 0;
- u_bit_scan_consecutive_range(&write_mask, &start, &count);
- assert(count > 0);
- assert(start >= 0);
-
- while (count > 0) {
- unsigned sub_count = allow_combining ? MIN2(count, 4) : 1;
- unsigned const_offset = (unsigned) start * elem_size_bytes + base_const_offset;
+ unsigned write_count = 0;
+ Temp write_datas[32];
+ unsigned offsets[32];
+ split_buffer_store(ctx, NULL, false, RegType::vgpr, src, write_mask,
+ allow_combining ? 16 : 4, &write_count, write_datas, offsets);
- /* GFX6 doesn't have buffer_store_dwordx3, so make sure not to emit that here either. */
- if (unlikely(ctx->program->chip_class == GFX6 && sub_count == 3))
- sub_count = 2;
-
- Temp elem = extract_subvector(ctx, src, start, sub_count, RegType::vgpr);
- emit_single_mubuf_store(ctx, descriptor, voffset, soffset, elem, const_offset, reorder, slc);
-
- count -= sub_count;
- start += sub_count;
- }
-
- assert(count == 0);
+ for (unsigned i = 0; i < write_count; i++) {
+ unsigned const_offset = offsets[i] + base_const_offset;
+ emit_single_mubuf_store(ctx, descriptor, voffset, soffset, write_datas[i], const_offset, reorder, slc);
}
}
Builder bld(ctx->program, ctx->block);
uint32_t input_patch_size = ctx->args->options->key.tcs.input_vertices * ctx->tcs_num_inputs * 16;
- uint32_t num_tcs_outputs = util_last_bit64(ctx->args->shader_info->tcs.outputs_written);
- uint32_t num_tcs_patch_outputs = util_last_bit64(ctx->args->shader_info->tcs.patch_outputs_written);
- uint32_t output_vertex_size = num_tcs_outputs * 16;
+ uint32_t output_vertex_size = ctx->tcs_num_outputs * 16;
uint32_t pervertex_output_patch_size = ctx->shader->info.tess.tcs_vertices_out * output_vertex_size;
- uint32_t output_patch_stride = pervertex_output_patch_size + num_tcs_patch_outputs * 16;
+ uint32_t output_patch_stride = pervertex_output_patch_size + ctx->tcs_num_patch_outputs * 16;
std::pair<Temp, unsigned> offs = instr
? get_intrinsic_io_basic_offset(ctx, instr, 4u)
{
Builder bld(ctx->program, ctx->block);
- unsigned num_tcs_outputs = ctx->shader->info.stage == MESA_SHADER_TESS_CTRL
- ? util_last_bit64(ctx->args->shader_info->tcs.outputs_written)
- : ctx->args->options->key.tes.tcs_num_outputs;
-
- unsigned output_vertex_size = num_tcs_outputs * 16;
+ unsigned output_vertex_size = ctx->tcs_num_outputs * 16;
unsigned per_vertex_output_patch_size = ctx->shader->info.tess.tcs_vertices_out * output_vertex_size;
unsigned per_patch_data_offset = per_vertex_output_patch_size * ctx->tcs_num_patches;
unsigned attr_stride = ctx->tcs_num_patches;
/* GFX6-8: VS runs on LS stage when tessellation is used, but LS shares LDS space with HS.
* GFX9+: LS is merged into HS, but still uses the same LDS layout.
*/
- unsigned num_tcs_inputs = util_last_bit64(ctx->args->shader_info->vs.ls_outputs_written);
Temp vertex_idx = get_arg(ctx, ctx->args->rel_auto_id);
- lds_base = bld.v_mul24_imm(bld.def(v1), vertex_idx, num_tcs_inputs * 16u);
+ lds_base = bld.v_mul24_imm(bld.def(v1), vertex_idx, ctx->tcs_num_inputs * 16u);
} else {
unreachable("Invalid LS or ES stage");
}
ctx->block->instructions.emplace_back(std::move(load));
Operand sample_index4;
- if (sample_index.isConstant() && sample_index.constantValue() < 16) {
- sample_index4 = Operand(sample_index.constantValue() << 2);
+ if (sample_index.isConstant()) {
+ if (sample_index.constantValue() < 16) {
+ sample_index4 = Operand(sample_index.constantValue() << 2);
+ } else {
+ sample_index4 = Operand(0u);
+ }
} else if (sample_index.regClass() == s1) {
sample_index4 = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), sample_index, Operand(2u));
} else {
Builder bld(ctx->program, ctx->block);
Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
- unsigned writemask = nir_intrinsic_write_mask(instr);
+ unsigned writemask = widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
Temp offset = get_ssa_temp(ctx, instr->src[2].ssa);
Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
offset = bld.as_uniform(offset);
bool smem_nonfs = smem && ctx->stage != fragment_fs;
- while (writemask) {
- int start, count;
- u_bit_scan_consecutive_range(&writemask, &start, &count);
- if (count == 3 && (smem || ctx->options->chip_class == GFX6)) {
- /* GFX6 doesn't support storing vec3, split it. */
- writemask |= 1u << (start + 2);
- count = 2;
- }
- int num_bytes = count * elem_size_bytes;
+ unsigned write_count = 0;
+ Temp write_datas[32];
+ unsigned offsets[32];
+ split_buffer_store(ctx, instr, smem, smem_nonfs ? RegType::sgpr : (smem ? data.type() : RegType::vgpr),
+ data, writemask, 16, &write_count, write_datas, offsets);
- /* dword or larger stores have to be dword-aligned */
- if (elem_size_bytes < 4 && num_bytes > 2) {
- // TODO: improve alignment check of sub-dword stores
- unsigned count_new = 2 / elem_size_bytes;
- writemask |= ((1 << (count - count_new)) - 1) << (start + count_new);
- count = count_new;
- num_bytes = 2;
- }
-
- if (num_bytes > 16) {
- assert(elem_size_bytes == 8);
- writemask |= (((count - 2) << 1) - 1) << (start + 2);
- count = 2;
- num_bytes = 16;
- }
-
- Temp write_data;
- if (elem_size_bytes < 4) {
- if (data.type() == RegType::sgpr) {
- data = as_vgpr(ctx, data);
- emit_split_vector(ctx, data, 4 * data.size() / elem_size_bytes);
- }
- RegClass rc = RegClass(RegType::vgpr, elem_size_bytes).as_subdword();
- aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
- for (int i = 0; i < count; i++)
- vec->operands[i] = Operand(emit_extract_vector(ctx, data, start + i, rc));
- write_data = bld.tmp(RegClass(RegType::vgpr, num_bytes).as_subdword());
- vec->definitions[0] = Definition(write_data);
- bld.insert(std::move(vec));
- } else if (count != instr->num_components) {
- aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
- for (int i = 0; i < count; i++) {
- Temp elem = emit_extract_vector(ctx, data, start + i, RegClass(data.type(), elem_size_bytes / 4));
- vec->operands[i] = Operand(smem_nonfs ? bld.as_uniform(elem) : elem);
- }
- write_data = bld.tmp(!smem ? RegType::vgpr : smem_nonfs ? RegType::sgpr : data.type(), count * elem_size_bytes / 4);
- vec->definitions[0] = Definition(write_data);
- ctx->block->instructions.emplace_back(std::move(vec));
- } else if (!smem && data.type() != RegType::vgpr) {
- assert(num_bytes % 4 == 0);
- write_data = bld.copy(bld.def(RegType::vgpr, num_bytes / 4), data);
- } else if (smem_nonfs && data.type() == RegType::vgpr) {
- assert(num_bytes % 4 == 0);
- write_data = bld.as_uniform(data);
- } else {
- write_data = data;
- }
-
- aco_opcode vmem_op, smem_op = aco_opcode::last_opcode;
- switch (num_bytes) {
- case 1:
- vmem_op = aco_opcode::buffer_store_byte;
- break;
- case 2:
- vmem_op = aco_opcode::buffer_store_short;
- break;
- case 4:
- vmem_op = aco_opcode::buffer_store_dword;
- smem_op = aco_opcode::s_buffer_store_dword;
- break;
- case 8:
- vmem_op = aco_opcode::buffer_store_dwordx2;
- smem_op = aco_opcode::s_buffer_store_dwordx2;
- break;
- case 12:
- vmem_op = aco_opcode::buffer_store_dwordx3;
- assert(!smem && ctx->options->chip_class > GFX6);
- break;
- case 16:
- vmem_op = aco_opcode::buffer_store_dwordx4;
- smem_op = aco_opcode::s_buffer_store_dwordx4;
- break;
- default:
- unreachable("Store SSBO not implemented for this size.");
- }
- if (ctx->stage == fragment_fs)
- smem_op = aco_opcode::p_fs_buffer_store_smem;
+ for (unsigned i = 0; i < write_count; i++) {
+ aco_opcode op = get_buffer_store_op(smem, write_datas[i].bytes());
+ if (smem && ctx->stage == fragment_fs)
+ op = aco_opcode::p_fs_buffer_store_smem;
if (smem) {
- aco_ptr<SMEM_instruction> store{create_instruction<SMEM_instruction>(smem_op, Format::SMEM, 3, 0)};
+ aco_ptr<SMEM_instruction> store{create_instruction<SMEM_instruction>(op, Format::SMEM, 3, 0)};
store->operands[0] = Operand(rsrc);
- if (start) {
+ if (offsets[i]) {
Temp off = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
- offset, Operand(start * elem_size_bytes));
+ offset, Operand(offsets[i]));
store->operands[1] = Operand(off);
} else {
store->operands[1] = Operand(offset);
}
- if (smem_op != aco_opcode::p_fs_buffer_store_smem)
+ if (op != aco_opcode::p_fs_buffer_store_smem)
store->operands[1].setFixed(m0);
- store->operands[2] = Operand(write_data);
+ store->operands[2] = Operand(write_datas[i]);
store->glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
store->dlc = false;
store->disable_wqm = true;
store->barrier = barrier_buffer;
ctx->block->instructions.emplace_back(std::move(store));
ctx->program->wb_smem_l1_on_end = true;
- if (smem_op == aco_opcode::p_fs_buffer_store_smem) {
+ if (op == aco_opcode::p_fs_buffer_store_smem) {
ctx->block->kind |= block_kind_needs_lowering;
ctx->program->needs_exact = true;
}
} else {
- aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(vmem_op, Format::MUBUF, 4, 0)};
+ aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, 0)};
store->operands[0] = Operand(rsrc);
store->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
store->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
- store->operands[3] = Operand(write_data);
- store->offset = start * elem_size_bytes;
+ store->operands[3] = Operand(write_datas[i]);
+ store->offset = offsets[i];
store->offen = (offset.type() == RegType::vgpr);
store->glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
store->dlc = false;
{
Builder bld(ctx->program, ctx->block);
unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
+ unsigned writemask = widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
Temp addr = get_ssa_temp(ctx, instr->src[1].ssa);
+ bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
if (ctx->options->chip_class >= GFX7)
addr = as_vgpr(ctx, addr);
- unsigned writemask = nir_intrinsic_write_mask(instr);
- while (writemask) {
- int start, count;
- u_bit_scan_consecutive_range(&writemask, &start, &count);
- if (count == 3 && ctx->options->chip_class == GFX6) {
- /* GFX6 doesn't support storing vec3, split it. */
- writemask |= 1u << (start + 2);
- count = 2;
- }
- unsigned num_bytes = count * elem_size_bytes;
-
- Temp write_data = data;
- if (count != instr->num_components) {
- aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
- for (int i = 0; i < count; i++)
- vec->operands[i] = Operand(emit_extract_vector(ctx, data, start + i, v1));
- write_data = bld.tmp(RegType::vgpr, count);
- vec->definitions[0] = Definition(write_data);
- ctx->block->instructions.emplace_back(std::move(vec));
- }
-
- bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
- unsigned offset = start * elem_size_bytes;
+ unsigned write_count = 0;
+ Temp write_datas[32];
+ unsigned offsets[32];
+ split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask,
+ 16, &write_count, write_datas, offsets);
+ for (unsigned i = 0; i < write_count; i++) {
if (ctx->options->chip_class >= GFX7) {
+ unsigned offset = offsets[i];
+ Temp store_addr = addr;
if (offset > 0 && ctx->options->chip_class < GFX9) {
Temp addr0 = bld.tmp(v1), addr1 = bld.tmp(v1);
Temp new_addr0 = bld.tmp(v1), new_addr1 = bld.tmp(v1);
Operand(0u), addr1,
carry).def(1).setHint(vcc);
- addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_addr0, new_addr1);
+ store_addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_addr0, new_addr1);
offset = 0;
}
bool global = ctx->options->chip_class >= GFX9;
aco_opcode op;
- switch (num_bytes) {
+ switch (write_datas[i].bytes()) {
+ case 1:
+ op = global ? aco_opcode::global_store_byte : aco_opcode::flat_store_byte;
+ break;
+ case 2:
+ op = global ? aco_opcode::global_store_short : aco_opcode::flat_store_short;
+ break;
case 4:
op = global ? aco_opcode::global_store_dword : aco_opcode::flat_store_dword;
break;
}
aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 3, 0)};
- flat->operands[0] = Operand(addr);
+ flat->operands[0] = Operand(store_addr);
flat->operands[1] = Operand(s1);
- flat->operands[2] = Operand(data);
+ flat->operands[2] = Operand(write_datas[i]);
flat->glc = glc;
flat->dlc = false;
flat->offset = offset;
} else {
assert(ctx->options->chip_class == GFX6);
- aco_opcode op;
- switch (num_bytes) {
- case 4:
- op = aco_opcode::buffer_store_dword;
- break;
- case 8:
- op = aco_opcode::buffer_store_dwordx2;
- break;
- case 16:
- op = aco_opcode::buffer_store_dwordx4;
- break;
- default:
- unreachable("store_global not implemented for this size.");
- }
+ aco_opcode op = get_buffer_store_op(false, write_datas[i].bytes());
Temp rsrc = get_gfx6_global_rsrc(bld, addr);
mubuf->operands[0] = Operand(rsrc);
mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);
mubuf->operands[2] = Operand(0u);
- mubuf->operands[3] = Operand(write_data);
+ mubuf->operands[3] = Operand(write_datas[i]);
mubuf->glc = glc;
mubuf->dlc = false;
- mubuf->offset = offset;
+ mubuf->offset = offsets[i];
mubuf->addr64 = addr.type() == RegType::vgpr;
mubuf->disable_wqm = true;
mubuf->barrier = barrier_buffer;
{
// TODO: implement sparse reads using ds_read2_b32 and nir_ssa_def_components_read()
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
- assert(instr->dest.ssa.bit_size >= 32 && "Bitsize not supported in load_shared.");
Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
Builder bld(ctx->program, ctx->block);
Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
- assert(elem_size_bytes >= 4 && "Only 32bit & 64bit store_shared currently supported.");
unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
store_lds(ctx, elem_size_bytes, data, writemask, address, nir_intrinsic_base(instr), align);
}
void visit_store_scratch(isel_context *ctx, nir_intrinsic_instr *instr) { /* Emits the MUBUF stores for a store_scratch intrinsic: src[0] is the data, src[1] the offset. */
- assert(instr->src[0].ssa->bit_size == 32 || instr->src[0].ssa->bit_size == 64);
Builder bld(ctx->program, ctx->block);
Temp rsrc = get_scratch_resource(ctx);
Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
- unsigned writemask = nir_intrinsic_write_mask(instr);
-
- while (writemask) {
- int start, count;
- u_bit_scan_consecutive_range(&writemask, &start, &count);
- int num_bytes = count * elem_size_bytes;
-
- if (num_bytes > 16) {
- assert(elem_size_bytes == 8);
- writemask |= (((count - 2) << 1) - 1) << (start + 2);
- count = 2;
- num_bytes = 16;
- }
-
- // TODO: check alignment of sub-dword stores
- // TODO: split 3 bytes. there is no store instruction for that
+ unsigned writemask = widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes); /* NOTE(review): presumably widens the per-component mask to one bit per byte -- confirm against widen_mask() */
- Temp write_data;
- if (count != instr->num_components) {
- aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
- for (int i = 0; i < count; i++) {
- Temp elem = emit_extract_vector(ctx, data, start + i, RegClass(RegType::vgpr, elem_size_bytes / 4));
- vec->operands[i] = Operand(elem);
- }
- write_data = bld.tmp(RegClass(RegType::vgpr, count * elem_size_bytes / 4));
- vec->definitions[0] = Definition(write_data);
- ctx->block->instructions.emplace_back(std::move(vec));
- } else {
- write_data = data;
- }
-
- aco_opcode op;
- switch (num_bytes) {
- case 4:
- op = aco_opcode::buffer_store_dword;
- break;
- case 8:
- op = aco_opcode::buffer_store_dwordx2;
- break;
- case 12:
- op = aco_opcode::buffer_store_dwordx3;
- break;
- case 16:
- op = aco_opcode::buffer_store_dwordx4;
- break;
- default:
- unreachable("Invalid data size for nir_intrinsic_store_scratch.");
- }
+ unsigned write_count = 0; /* split_buffer_store() counts the produced pieces through this */
+ Temp write_datas[32];
+ unsigned offsets[32];
+ split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, /* smem = false, pieces are returned as VGPRs */
+ 16, &write_count, write_datas, offsets);
- bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_data, start * elem_size_bytes, true);
+ for (unsigned i = 0; i < write_count; i++) {
+ aco_opcode op = get_buffer_store_op(false, write_datas[i].bytes()); /* opcode is chosen from the piece's size in bytes */
+ bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_datas[i], offsets[i], true);
}
}
Temp samples_log2 = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, Operand(16u | 4u<<16));
Temp samples = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), Operand(1u), samples_log2);
Temp type = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, Operand(28u | 4u<<16 /* offset=28, width=4 */));
- Temp is_msaa = bld.sopc(aco_opcode::s_cmp_ge_u32, bld.def(s1, scc), type, Operand(14u));
+ Operand default_sample = Operand(1u);
+ if (ctx->options->robust_buffer_access) {
+ /* Extract the second dword of the descriptor, if it's
+ * all zero, then it's a null descriptor.
+ */
+ Temp dword1 = emit_extract_vector(ctx, resource, 1, s1);
+ Temp is_non_null_descriptor = bld.sopc(aco_opcode::s_cmp_gt_u32, bld.def(s1, scc), dword1, Operand(0u));
+ default_sample = Operand(is_non_null_descriptor);
+ }
+
+ Temp is_msaa = bld.sopc(aco_opcode::s_cmp_ge_u32, bld.def(s1, scc), type, Operand(14u));
bld.sop2(aco_opcode::s_cselect_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
- samples, Operand(1u), bld.scc(is_msaa));
+ samples, default_sample, bld.scc(is_msaa));
return;
}
visit_cf_list(ctx, &if_stmt->else_list);
end_uniform_if(ctx, &ic);
-
- return !ctx->cf_info.has_branch;
} else { /* non-uniform condition */
/**
* To maintain a logical and linear CFG without critical edges,
visit_cf_list(ctx, &if_stmt->else_list);
end_divergent_if(ctx, &ic);
-
- return true;
}
+
+ return !ctx->cf_info.has_branch && !ctx->block->logical_preds.empty();
}
static bool visit_cf_list(isel_context *ctx,