+ uint32_t rsrc_conf = S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
+ S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
+
+ if (addr.type() == RegType::vgpr)
+ return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), Operand(0u), Operand(0u), Operand(-1u), Operand(rsrc_conf));
+ return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), addr, Operand(-1u), Operand(rsrc_conf));
+}
+
+/* emit_load callback that emits one load from global memory of up to 16
+ * bytes.  The encoding depends on the chip generation:
+ *   GFX6           -> MUBUF through an addr64 buffer descriptor
+ *   GFX7/GFX8      -> FLAT
+ *   GFX9 and newer -> GLOBAL
+ * `offset` is the address (SGPR pair or VGPR pair).  align_ and const_offset
+ * are unused here -- presumably already folded into `offset` by the generic
+ * emit_load wrapper; confirm against emit_load, which is not in this chunk.
+ * Returns a VGPR temp holding the loaded value (bytes_needed rounded up to a
+ * whole number of dwords). */
+Temp global_load_callback(Builder& bld, const LoadEmitInfo *info,
+ Temp offset, unsigned bytes_needed,
+ unsigned align_, unsigned const_offset,
+ Temp dst_hint)
+{
+ unsigned bytes_size = 0;
+ bool mubuf = bld.program->chip_class == GFX6;
+ bool global = bld.program->chip_class >= GFX9;
+ aco_opcode op;
+ /* Pick the widest load that fits bytes_needed.  The 12-byte (dwordx3)
+  * form is skipped on the MUBUF (GFX6) path. */
+ if (bytes_needed == 1) {
+ bytes_size = 1;
+ op = mubuf ? aco_opcode::buffer_load_ubyte : global ? aco_opcode::global_load_ubyte : aco_opcode::flat_load_ubyte;
+ } else if (bytes_needed == 2) {
+ bytes_size = 2;
+ op = mubuf ? aco_opcode::buffer_load_ushort : global ? aco_opcode::global_load_ushort : aco_opcode::flat_load_ushort;
+ } else if (bytes_needed <= 4) {
+ bytes_size = 4;
+ op = mubuf ? aco_opcode::buffer_load_dword : global ? aco_opcode::global_load_dword : aco_opcode::flat_load_dword;
+ } else if (bytes_needed <= 8) {
+ bytes_size = 8;
+ op = mubuf ? aco_opcode::buffer_load_dwordx2 : global ? aco_opcode::global_load_dwordx2 : aco_opcode::flat_load_dwordx2;
+ } else if (bytes_needed <= 12 && !mubuf) {
+ bytes_size = 12;
+ op = global ? aco_opcode::global_load_dwordx3 : aco_opcode::flat_load_dwordx3;
+ } else {
+ bytes_size = 16;
+ op = mubuf ? aco_opcode::buffer_load_dwordx4 : global ? aco_opcode::global_load_dwordx4 : aco_opcode::flat_load_dwordx4;
+ }
+ /* result register class: bytes_size rounded up to whole dwords */
+ RegClass rc = RegClass::get(RegType::vgpr, align(bytes_size, 4));
+ /* reuse the caller's destination hint when its class matches */
+ Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
+ if (mubuf) {
+ /* GFX6: emit a MUBUF load through a synthesized "whole memory" rsrc.
+  * A VGPR address goes into the VADDR operand with addr64 set; an SGPR
+  * address is folded into the descriptor by get_gfx6_global_rsrc(). */
+ aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
+ mubuf->operands[0] = Operand(get_gfx6_global_rsrc(bld, offset));
+ mubuf->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
+ mubuf->operands[2] = Operand(0u);
+ mubuf->glc = info->glc;
+ mubuf->dlc = false;
+ mubuf->offset = 0;
+ mubuf->addr64 = offset.type() == RegType::vgpr;
+ mubuf->disable_wqm = false;
+ mubuf->barrier = info->barrier;
+ mubuf->definitions[0] = Definition(val);
+ bld.insert(std::move(mubuf));
+ } else {
+ /* FLAT/GLOBAL addresses must live in VGPRs; copy an SGPR pair over */
+ offset = offset.regClass() == s2 ? bld.copy(bld.def(v2), offset) : offset;
+
+ aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 2, 1)};
+ flat->operands[0] = Operand(offset);
+ /* undefined SGPR operand -- presumably "no SADDR"; confirm against the
+  * FLAT_instruction operand convention */
+ flat->operands[1] = Operand(s1);
+ flat->glc = info->glc;
+ /* dlc must be set together with glc on GFX10+ */
+ flat->dlc = info->glc && bld.program->chip_class >= GFX10;
+ flat->barrier = info->barrier;
+ flat->offset = 0u;
+ flat->definitions[0] = Definition(val);
+ bld.insert(std::move(flat));
+ }
+
+ return val;
+}
+
+static auto emit_global_load = emit_load<global_load_callback, true, true, 1>;
+
+/* Load `dst.bytes()` bytes from LDS at `address` + base_offset into `dst`.
+ * `align` must be a nonzero power of two; the actual emission is delegated
+ * to the generic emit_lds_load() helper. */
+Temp load_lds(isel_context *ctx, unsigned elem_size_bytes, Temp dst,
+              Temp address, unsigned base_offset, unsigned align)
+{
+   assert(util_is_power_of_two_nonzero(align));
+
+   Builder bld(ctx->program, ctx->block);
+
+   const unsigned component_count = dst.bytes() / elem_size_bytes;
+
+   /* LDS addressing needs a VGPR address */
+   LoadEmitInfo info = {Operand(as_vgpr(ctx, address)), dst, component_count, elem_size_bytes};
+   info.const_offset = base_offset;
+   info.align_mul = align;
+   info.align_offset = 0;
+   /* LDS accesses must not be reordered across shared-memory barriers */
+   info.can_reorder = false;
+   info.barrier = barrier_shared;
+
+   emit_lds_load(ctx, bld, &info);
+
+   return dst;
+}
+
+/* Split `src` into `count` pieces, written to dst[0..count-1].  dst[i]
+ * covers the bytes [offsets[i], offsets[i+1]) of src (the last piece runs to
+ * src.bytes()).  All pieces are produced in `dst_type` registers.  Pieces
+ * whose offset or size is not a multiple of 4 bytes ("subdword") are only
+ * representable in VGPRs.  Where possible the split reuses the temps
+ * recorded in ctx->allocated_vec instead of emitting a p_split_vector. */
+void split_store_data(isel_context *ctx, RegType dst_type, unsigned count, Temp *dst, unsigned *offsets, Temp src)
+{
+   if (!count)
+      return;
+
+   Builder bld(ctx->program, ctx->block);
+
+   /* subdword pieces require the VGPR file */
+   ASSERTED bool is_subdword = false;
+   for (unsigned i = 0; i < count; i++)
+      is_subdword |= offsets[i] % 4;
+   is_subdword |= (src.bytes() - offsets[count - 1]) % 4;
+   assert(!is_subdword || dst_type == RegType::vgpr);
+
+   /* count == 1 fast path: nothing to split, just move src to the right
+    * register file */
+   if (count == 1) {
+      if (dst_type == RegType::sgpr)
+         dst[0] = bld.as_uniform(src);
+      else
+         dst[0] = as_vgpr(ctx, src);
+      return;
+   }
+
+   /* allocate the destination temps from the offsets */
+   for (unsigned i = 0; i < count - 1; i++)
+      dst[i] = bld.tmp(RegClass::get(dst_type, offsets[i + 1] - offsets[i]));
+   dst[count - 1] = bld.tmp(RegClass::get(dst_type, src.bytes() - offsets[count - 1]));
+
+   if (is_subdword && src.type() == RegType::sgpr) {
+      src = as_vgpr(ctx, src);
+   } else {
+      /* use allocated_vec if possible */
+      auto it = ctx->allocated_vec.find(src.id());
+      if (it != ctx->allocated_vec.end()) {
+         unsigned total_size = 0;
+         /* fixed: check the index bound BEFORE dereferencing it->second[i].
+          * The original tested it->second[i].bytes() first, reading one
+          * element past the array when all NIR_MAX_VEC_COMPONENTS slots
+          * were populated. */
+         for (unsigned i = 0; i < NIR_MAX_VEC_COMPONENTS && it->second[i].bytes(); i++)
+            total_size += it->second[i].bytes();
+         if (total_size != src.bytes())
+            goto split;
+
+         unsigned elem_size = it->second[0].bytes();
+
+         /* every piece must start and end on an element boundary */
+         for (unsigned i = 0; i < count; i++) {
+            if (offsets[i] % elem_size || dst[i].bytes() % elem_size)
+               goto split;
+         }
+
+         for (unsigned i = 0; i < count; i++) {
+            unsigned start_idx = offsets[i] / elem_size;
+            unsigned op_count = dst[i].bytes() / elem_size;
+            if (op_count == 1) {
+               if (dst_type == RegType::sgpr)
+                  dst[i] = bld.as_uniform(it->second[start_idx]);
+               else
+                  dst[i] = as_vgpr(ctx, it->second[start_idx]);
+               continue;
+            }
+
+            /* piece spans several recorded elements: re-assemble them */
+            aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, op_count, 1)};
+            for (unsigned j = 0; j < op_count; j++) {
+               Temp tmp = it->second[start_idx + j];
+               if (dst_type == RegType::sgpr)
+                  tmp = bld.as_uniform(tmp);
+               vec->operands[j] = Operand(tmp);
+            }
+            vec->definitions[0] = Definition(dst[i]);
+            bld.insert(std::move(vec));
+         }
+         return;
+      }
+   }
+
+   /* NOTE(review): the `goto split`s above bypass this conversion; confirm
+    * those paths can only be reached with a source that is already uniform
+    * when dst_type is sgpr. */
+   if (dst_type == RegType::sgpr)
+      src = bld.as_uniform(src);
+
+   split:
+   /* just split it */
+   aco_ptr<Instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, count)};
+   split->operands[0] = Operand(src);
+   for (unsigned i = 0; i < count; i++)
+      split->definitions[i] = Definition(dst[i]);
+   bld.insert(std::move(split));
+}
+
+/* Examine the run of consecutive bytes at the bottom of todo_mask that share
+ * the same state in `mask` (all enabled or all disabled).  Stores the run's
+ * first bit index in *start and its length in *count, and returns true when
+ * the run is enabled (i.e. those bytes should actually be written). */
+bool scan_write_mask(uint32_t mask, uint32_t todo_mask,
+                     int *start, int *count)
+{
+   const unsigned first_bit = ffs(todo_mask) - 1;
+   const bool enabled = mask & (1u << first_bit);
+
+   /* scan with whichever polarity the lowest outstanding byte has */
+   uint32_t run = (enabled ? mask : ~mask) & todo_mask;
+   u_bit_scan_consecutive_range(&run, start, count);
+
+   return enabled;
+}
+
+/* Remove the `count` bits starting at `start` from *todo_mask.  Mind the
+ * operator precedence: `~` binds tighter than `<<`, so the expression is
+ * (~u_bit_consecutive(0, count)) << start, which actually clears every bit
+ * below start + count.  That is safe because callers always obtain `start`
+ * from scan_write_mask(), which begins at the lowest set bit of todo_mask,
+ * so the bits below `start` are already clear. */
+void advance_write_mask(uint32_t *todo_mask, int start, int count)
+{
+ *todo_mask &= ~u_bit_consecutive(0, count) << start;
+}
+
+/* Store `data` to LDS at `address` + base_offset, honoring the per-element
+ * write mask `wrmask` (elements of elem_size_bytes each).  The data is cut
+ * into contiguous runs that each become one ds_write_b8/16/32/64/96/128, and
+ * matching dword/qword writes are later paired into ds_write2 where the
+ * hardware supports it. */
+void store_lds(isel_context *ctx, unsigned elem_size_bytes, Temp data, uint32_t wrmask,
+ Temp address, unsigned base_offset, unsigned align)
+{
+ assert(util_is_power_of_two_nonzero(align));
+ assert(util_is_power_of_two_nonzero(elem_size_bytes) && elem_size_bytes <= 8);
+
+ Builder bld(ctx->program, ctx->block);
+ /* ds_write_b96/b128 and the ds_write2 pairing are only used on GFX7+ */
+ bool large_ds_write = ctx->options->chip_class >= GFX7;
+ bool usable_write2 = ctx->options->chip_class >= GFX7;
+
+ unsigned write_count = 0;
+ Temp write_datas[32];
+ unsigned offsets[32];
+ aco_opcode opcodes[32];
+
+ /* widen the per-element mask into a per-byte mask */
+ wrmask = widen_mask(wrmask, elem_size_bytes);
+
+ /* first pass: partition data.bytes() into runs and pick an opcode for
+  * each.  A run of masked-out bytes is recorded with aco_opcode::num_opcodes
+  * as a placeholder so the offsets stay aligned with the split pieces. */
+ uint32_t todo = u_bit_consecutive(0, data.bytes());
+ while (todo) {
+ int offset, bytes;
+ if (!scan_write_mask(wrmask, todo, &offset, &bytes)) {
+ offsets[write_count] = offset;
+ opcodes[write_count] = aco_opcode::num_opcodes;
+ write_count++;
+ advance_write_mask(&todo, offset, bytes);
+ continue;
+ }
+
+ /* alignment of this run: both the run offset and the overall align
+  * must satisfy the instruction's requirement */
+ bool aligned2 = offset % 2 == 0 && align % 2 == 0;
+ bool aligned4 = offset % 4 == 0 && align % 4 == 0;
+ bool aligned8 = offset % 8 == 0 && align % 8 == 0;
+ bool aligned16 = offset % 16 == 0 && align % 16 == 0;
+
+ //TODO: use ds_write_b8_d16_hi/ds_write_b16_d16_hi if beneficial
+ /* pick the widest ds_write the run's size and alignment allow */
+ aco_opcode op = aco_opcode::num_opcodes;
+ if (bytes >= 16 && aligned16 && large_ds_write) {
+ op = aco_opcode::ds_write_b128;
+ bytes = 16;
+ } else if (bytes >= 12 && aligned16 && large_ds_write) {
+ op = aco_opcode::ds_write_b96;
+ bytes = 12;
+ } else if (bytes >= 8 && aligned8) {
+ op = aco_opcode::ds_write_b64;
+ bytes = 8;
+ } else if (bytes >= 4 && aligned4) {
+ op = aco_opcode::ds_write_b32;
+ bytes = 4;
+ } else if (bytes >= 2 && aligned2) {
+ op = aco_opcode::ds_write_b16;
+ bytes = 2;
+ } else if (bytes >= 1) {
+ op = aco_opcode::ds_write_b8;
+ bytes = 1;
+ } else {
+ assert(false);
+ }
+
+ offsets[write_count] = offset;
+ opcodes[write_count] = op;
+ write_count++;
+ advance_write_mask(&todo, offset, bytes);
+ }
+
+ /* m0 operand required by the DS instructions */
+ Operand m = load_lds_size_m0(bld);
+
+ /* cut the data into one temp per recorded run */
+ split_store_data(ctx, RegType::vgpr, write_count, write_datas, offsets, data);
+
+ /* second pass: emit the writes, pairing dword/qword writes into ds_write2
+  * where possible */
+ for (unsigned i = 0; i < write_count; i++) {
+ aco_opcode op = opcodes[i];
+ /* placeholder for a masked-out run, or a write already consumed as the
+  * second half of a ds_write2 */
+ if (op == aco_opcode::num_opcodes)
+ continue;
+
+ Temp data = write_datas[i];
+
+ /* look for a later write of the same width whose distance is a multiple
+  * of the data size, so both fit one ds_write2 */
+ unsigned second = write_count;
+ if (usable_write2 && (op == aco_opcode::ds_write_b32 || op == aco_opcode::ds_write_b64)) {
+ for (second = i + 1; second < write_count; second++) {
+ if (opcodes[second] == op && (offsets[second] - offsets[i]) % data.bytes() == 0) {
+ op = data.bytes() == 4 ? aco_opcode::ds_write2_b32 : aco_opcode::ds_write2_b64;
+ opcodes[second] = aco_opcode::num_opcodes;
+ break;
+ }
+ }
+ }
+
+ bool write2 = op == aco_opcode::ds_write2_b32 || op == aco_opcode::ds_write2_b64;
+ /* ds_write2 offsets are expressed in units of the data size */
+ unsigned write2_off = (offsets[second] - offsets[i]) / data.bytes();
+
+ /* ds_write2 offsets are 8-bit (in data-size units); plain ds_write has a
+  * 16-bit byte offset.  If the total offset does not fit, add base_offset
+  * to the address instead. */
+ unsigned inline_offset = base_offset + offsets[i];
+ unsigned max_offset = write2 ? (255 - write2_off) * data.bytes() : 65535;
+ Temp address_offset = address;
+ if (inline_offset > max_offset) {
+ address_offset = bld.vadd32(bld.def(v1), Operand(base_offset), address_offset);
+ inline_offset = offsets[i];
+ }
+ assert(inline_offset <= max_offset); /* offsets[i] shouldn't be large enough for this to happen */
+
+ if (write2) {
+ Temp second_data = write_datas[second];
+ inline_offset /= data.bytes();
+ bld.ds(op, address_offset, data, second_data, m, inline_offset, inline_offset + write2_off);
+ } else {
+ bld.ds(op, address_offset, data, m, inline_offset);
+ }
+ }
+}
+
+/* Compute the alignment guaranteed by an LDS constant offset: the offset's
+ * lowest set bit, capped at 16 bytes.  A zero offset guarantees the full
+ * 16-byte alignment.  ctx is unused. */
+unsigned calculate_lds_alignment(isel_context *ctx, unsigned const_offset)
+{
+   if (!const_offset)
+      return 16;
+
+   return std::min(16u, 1u << (ffs(const_offset) - 1));
+}
+
+
+/* Map a store size in bytes to the corresponding buffer-store opcode.
+ * `smem` selects the scalar (s_buffer_store_*) variants, which only exist
+ * for 4, 8 and 16 bytes; sub-dword and 12-byte stores are VMEM-only. */
+aco_opcode get_buffer_store_op(bool smem, unsigned bytes)
+{
+   if (bytes == 1) {
+      assert(!smem);
+      return aco_opcode::buffer_store_byte;
+   } else if (bytes == 2) {
+      assert(!smem);
+      return aco_opcode::buffer_store_short;
+   } else if (bytes == 4) {
+      return smem ? aco_opcode::s_buffer_store_dword : aco_opcode::buffer_store_dword;
+   } else if (bytes == 8) {
+      return smem ? aco_opcode::s_buffer_store_dwordx2 : aco_opcode::buffer_store_dwordx2;
+   } else if (bytes == 12) {
+      assert(!smem);
+      return aco_opcode::buffer_store_dwordx3;
+   } else if (bytes == 16) {
+      return smem ? aco_opcode::s_buffer_store_dwordx4 : aco_opcode::buffer_store_dwordx4;
+   }
+   unreachable("Unexpected store size");
+   return aco_opcode::num_opcodes;
+}
+
+/* Split a buffer store's `data` according to `writemask` into chunks the
+ * hardware can emit: sizes 1, 2, 4, 8, 12 or 16 bytes, no larger than
+ * swizzle_element_size, with dword-or-larger chunks dword-aligned.  The
+ * resulting temps and their byte offsets are appended to
+ * write_datas/offsets, and *write_count is advanced accordingly.  `instr`
+ * (may be NULL) supplies the NIR alignment information. */
+void split_buffer_store(isel_context *ctx, nir_intrinsic_instr *instr, bool smem, RegType dst_type,
+                        Temp data, unsigned writemask, int swizzle_element_size,
+                        unsigned *write_count, Temp *write_datas, unsigned *offsets)
+{
+   unsigned write_count_with_skips = 0;
+   bool skips[16];
+
+   /* determine how to split the data */
+   unsigned todo = u_bit_consecutive(0, data.bytes());
+   while (todo) {
+      int offset, bytes;
+      /* scan_write_mask() returns false for a run of disabled bytes, which
+       * is recorded as a skip so offsets stay aligned with the split */
+      skips[write_count_with_skips] = !scan_write_mask(writemask, todo, &offset, &bytes);
+      offsets[write_count_with_skips] = offset;
+      if (skips[write_count_with_skips]) {
+         advance_write_mask(&todo, offset, bytes);
+         write_count_with_skips++;
+         continue;
+      }
+
+      /* only supported sizes are 1, 2, 4, 8, 12 and 16 bytes and can't be
+       * larger than swizzle_element_size */
+      bytes = MIN2(bytes, swizzle_element_size);
+      if (bytes % 4)
+         bytes = bytes > 4 ? bytes & ~0x3 : MIN2(bytes, 2);
+
+      /* SMEM and GFX6 VMEM can't emit 12-byte stores */
+      if ((ctx->program->chip_class == GFX6 || smem) && bytes == 12)
+         bytes = 8;
+
+      /* dword or larger stores have to be dword-aligned */
+      unsigned align_mul = instr ? nir_intrinsic_align_mul(instr) : 4;
+      /* fixed: this previously read nir_intrinsic_align_mul() again instead
+       * of nir_intrinsic_align_offset(), ignoring the intrinsic's alignment
+       * offset when checking dword alignment */
+      unsigned align_offset = instr ? nir_intrinsic_align_offset(instr) : 0;
+      bool dword_aligned = (align_offset + offset) % 4 == 0 && align_mul % 4 == 0;
+      if (bytes >= 4 && !dword_aligned)
+         bytes = MIN2(bytes, 2);
+
+      advance_write_mask(&todo, offset, bytes);
+      write_count_with_skips++;
+   }
+
+   /* actually split data */
+   split_store_data(ctx, dst_type, write_count_with_skips, write_datas, offsets, data);
+
+   /* remove skips */
+   for (unsigned i = 0; i < write_count_with_skips; i++) {
+      if (skips[i])
+         continue;
+      write_datas[*write_count] = write_datas[i];
+      offsets[*write_count] = offsets[i];
+      (*write_count)++;
+   }
+}
+
+Temp create_vec_from_array(isel_context *ctx, Temp arr[], unsigned cnt, RegType reg_type, unsigned elem_size_bytes,
+ unsigned split_cnt = 0u, Temp dst = Temp())
+{
+ Builder bld(ctx->program, ctx->block);
+ unsigned dword_size = elem_size_bytes / 4;