+ /* concatenate components and p_as_uniform() result if needed */
+ if (info->dst.type() == RegType::vgpr || !has_vgprs)
+ ctx->allocated_vec.emplace(info->dst.id(), allocated_vec);
+
+ int padding_bytes = MAX2((int)info->dst.bytes() - int(allocated_vec[0].bytes() * info->num_components), 0);
+
+ aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
+ aco_opcode::p_create_vector, Format::PSEUDO, info->num_components + !!padding_bytes, 1)};
+ for (unsigned i = 0; i < info->num_components; i++)
+ vec->operands[i] = Operand(allocated_vec[i]);
+ if (padding_bytes)
+ vec->operands[info->num_components] = Operand(RegClass::get(RegType::vgpr, padding_bytes));
+ if (info->dst.type() == RegType::sgpr && has_vgprs) {
+ Temp tmp = bld.tmp(RegType::vgpr, info->dst.size());
+ vec->definitions[0] = Definition(tmp);
+ bld.insert(std::move(vec));
+ bld.pseudo(aco_opcode::p_as_uniform, Definition(info->dst), tmp);
+ } else {
+ vec->definitions[0] = Definition(info->dst);
+ bld.insert(std::move(vec));
+ }
+}
+
+/* Materializes 0xffff into the m0 register and returns it as an operand.
+ * DS instructions in this file take the result as their m0 operand, which
+ * bounds LDS addressing.
+ */
+Operand load_lds_size_m0(Builder& bld)
+{
+   /* TODO: m0 does not need to be initialized on GFX9+ */
+   Temp lds_limit = bld.sopk(aco_opcode::s_movk_i32, bld.def(s1, m0), 0xffff);
+   return bld.m0(lds_limit);
+}
+
+/* Callback for emit_load: emits a single DS read covering as much of
+ * bytes_needed as the alignment allows and returns the result in a VGPR
+ * temp. Sub-dword reads (u8/u16) zero-extend into a dword; the extra bytes
+ * are trimmed off before returning.
+ *
+ * align is the guaranteed alignment of (address + const_offset) in bytes.
+ */
+Temp lds_load_callback(Builder& bld, const LoadEmitInfo *info,
+                       Temp offset, unsigned bytes_needed,
+                       unsigned align, unsigned const_offset,
+                       Temp dst_hint)
+{
+   /* DS instructions take their address in a VGPR. */
+   offset = offset.regClass() == s1 ? bld.copy(bld.def(v1), offset) : offset;
+
+   Operand m = load_lds_size_m0(bld);
+
+   /* GFX7+ gains the 96/128-bit single reads and read2. */
+   bool large_ds_read = bld.program->chip_class >= GFX7;
+   bool usable_read2 = bld.program->chip_class >= GFX7;
+
+   /* Pick the widest opcode the alignment (and, for read2, the constant
+    * offset's element alignment) permits. */
+   bool read2 = false;
+   unsigned size = 0;
+   aco_opcode op;
+   //TODO: use ds_read_u8_d16_hi/ds_read_u16_d16_hi if beneficial
+   if (bytes_needed >= 16 && align % 16 == 0 && large_ds_read) {
+      size = 16;
+      op = aco_opcode::ds_read_b128;
+   } else if (bytes_needed >= 16 && align % 8 == 0 && const_offset % 8 == 0 && usable_read2) {
+      size = 16;
+      read2 = true;
+      op = aco_opcode::ds_read2_b64;
+   } else if (bytes_needed >= 12 && align % 16 == 0 && large_ds_read) {
+      size = 12;
+      op = aco_opcode::ds_read_b96;
+   } else if (bytes_needed >= 8 && align % 8 == 0) {
+      size = 8;
+      op = aco_opcode::ds_read_b64;
+   } else if (bytes_needed >= 8 && align % 4 == 0 && const_offset % 4 == 0) {
+      size = 8;
+      read2 = true;
+      op = aco_opcode::ds_read2_b32;
+   } else if (bytes_needed >= 4 && align % 4 == 0) {
+      size = 4;
+      op = aco_opcode::ds_read_b32;
+   } else if (bytes_needed >= 2 && align % 2 == 0) {
+      size = 2;
+      op = aco_opcode::ds_read_u16;
+   } else {
+      size = 1;
+      op = aco_opcode::ds_read_u8;
+   }
+
+   /* If const_offset does not fit into the instruction's immediate offset
+    * field, fold the part that doesn't fit into the address VGPR. For read2
+    * the folded amount must stay a multiple of the element stride (size / 2)
+    * so that the per-element offsets computed below remain exact;
+    * 255 * stride is the largest such multiple for which
+    * offset1 = offset0 + 1 still fits the 8-bit field. */
+   unsigned max_offset_plus_one = read2 ? 255 * (size / 2u) : 65536;
+   if (const_offset >= max_offset_plus_one) {
+      /* Add the entire excess in bytes. Previously only the quotient
+       * (const_offset / max_offset_plus_one) was added, which computed a
+       * wrong address whenever this path was taken. */
+      unsigned excess = const_offset / max_offset_plus_one * max_offset_plus_one;
+      offset = bld.vadd32(bld.def(v1), offset, Operand(excess));
+      const_offset %= max_offset_plus_one;
+   }
+
+   /* read2 immediate offsets are counted in elements, not bytes. */
+   if (read2)
+      const_offset /= (size / 2u);
+
+   RegClass rc = RegClass(RegType::vgpr, DIV_ROUND_UP(size, 4));
+   Temp val = rc == info->dst.regClass() && dst_hint.id() ? dst_hint : bld.tmp(rc);
+   if (read2)
+      bld.ds(op, Definition(val), offset, m, const_offset, const_offset + 1);
+   else
+      bld.ds(op, Definition(val), offset, m, const_offset);
+
+   /* u8/u16 reads fill a whole dword; extract just the bytes requested. */
+   if (size < 4)
+      val = bld.pseudo(aco_opcode::p_extract_vector, bld.def(RegClass::get(RegType::vgpr, size)), val, Operand(0u));
+
+   return val;
+}
+
+/* emit_load specialization for LDS. The non-type template arguments
+ * presumably select SMEM usability (false), sub-dword support (true) and the
+ * maximum supported constant offset (unbounded) -- confirm against
+ * emit_load's declaration. */
+static auto emit_lds_load = emit_load<lds_load_callback, false, true, UINT32_MAX>;
+
+/* Callback for emit_load: emits a single SMEM load (s_load_* with a raw
+ * address, or s_buffer_load_* when info->resource holds a descriptor) of the
+ * smallest dword count (1/2/4/8/16) covering bytes_needed, and returns the
+ * loaded value in an SGPR temp.
+ */
+Temp smem_load_callback(Builder& bld, const LoadEmitInfo *info,
+                        Temp offset, unsigned bytes_needed,
+                        unsigned align, unsigned const_offset,
+                        Temp dst_hint)
+{
+   const bool buffer = info->resource.id() != 0;
+
+   /* Round bytes_needed up to the next supported dword count. */
+   unsigned num_dwords;
+   aco_opcode op;
+   if (bytes_needed <= 4) {
+      num_dwords = 1;
+      op = buffer ? aco_opcode::s_buffer_load_dword : aco_opcode::s_load_dword;
+   } else if (bytes_needed <= 8) {
+      num_dwords = 2;
+      op = buffer ? aco_opcode::s_buffer_load_dwordx2 : aco_opcode::s_load_dwordx2;
+   } else if (bytes_needed <= 16) {
+      num_dwords = 4;
+      op = buffer ? aco_opcode::s_buffer_load_dwordx4 : aco_opcode::s_load_dwordx4;
+   } else if (bytes_needed <= 32) {
+      num_dwords = 8;
+      op = buffer ? aco_opcode::s_buffer_load_dwordx8 : aco_opcode::s_load_dwordx8;
+   } else {
+      num_dwords = 16;
+      op = buffer ? aco_opcode::s_buffer_load_dwordx16 : aco_opcode::s_load_dwordx16;
+   }
+
+   aco_ptr<SMEM_instruction> load{create_instruction<SMEM_instruction>(op, Format::SMEM, 2, 1)};
+   if (buffer) {
+      load->operands[0] = Operand(info->resource);
+      load->operands[1] = Operand(offset);
+   } else {
+      load->operands[0] = Operand(offset);
+      load->operands[1] = Operand(0u);
+   }
+
+   /* Reuse the destination hint if it matches the result register class. */
+   RegClass rc(RegType::sgpr, num_dwords);
+   Temp result = dst_hint.id() && dst_hint.regClass() == rc ? dst_hint : bld.tmp(rc);
+   load->definitions[0] = Definition(result);
+   load->glc = info->glc;
+   /* dlc follows glc on GFX10+. */
+   load->dlc = info->glc && bld.program->chip_class >= GFX10;
+   load->barrier = info->barrier;
+   load->can_reorder = false; // FIXME: currently, it doesn't seem beneficial due to how our scheduler works
+   bld.insert(std::move(load));
+   return result;
+}
+
+/* emit_load specialization for SMEM. The non-type template arguments
+ * presumably select SMEM usability (true), sub-dword support (false) and a
+ * maximum constant offset of 1024 -- confirm against emit_load's
+ * declaration. */
+static auto emit_smem_load = emit_load<smem_load_callback, true, false, 1024>;
+
+/* Callback for emit_load: emits a single MUBUF buffer load of 1-16 bytes
+ * (byte/short/dword/x2/x3/x4; dwordx3 requires GFX7+) and returns the loaded
+ * value in a VGPR temp. Sub-dword results are extracted from the dword the
+ * hardware writes.
+ */
+Temp mubuf_load_callback(Builder& bld, const LoadEmitInfo *info,
+                         Temp offset, unsigned bytes_needed,
+                         unsigned align_, unsigned const_offset,
+                         Temp dst_hint)
+{
+   /* Route the offset into vaddr (VGPR) or soffset (SGPR) by its type. */
+   Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
+   Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
+
+   if (info->soffset.id()) {
+      /* info->soffset claims the SGPR slot, so an SGPR offset is copied
+       * into the VGPR address instead.
+       * NOTE(review): "offen" below still tests offset.type(), which stays
+       * sgpr in this case -- verify the copied vaddr is actually consumed. */
+      if (soffset.isTemp())
+         vaddr = bld.copy(bld.def(v1), soffset);
+      soffset = Operand(info->soffset);
+   }
+
+   /* Pick the smallest opcode covering bytes_needed. */
+   unsigned bytes_size = 0;
+   aco_opcode op;
+   if (bytes_needed == 1) {
+      bytes_size = 1;
+      op = aco_opcode::buffer_load_ubyte;
+   } else if (bytes_needed == 2) {
+      bytes_size = 2;
+      op = aco_opcode::buffer_load_ushort;
+   } else if (bytes_needed <= 4) {
+      bytes_size = 4;
+      op = aco_opcode::buffer_load_dword;
+   } else if (bytes_needed <= 8) {
+      bytes_size = 8;
+      op = aco_opcode::buffer_load_dwordx2;
+   } else if (bytes_needed <= 12 && bld.program->chip_class > GFX6) {
+      bytes_size = 12;
+      op = aco_opcode::buffer_load_dwordx3;
+   } else {
+      bytes_size = 16;
+      op = aco_opcode::buffer_load_dwordx4;
+   }
+   aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
+   mubuf->operands[0] = Operand(info->resource);
+   mubuf->operands[1] = vaddr;
+   mubuf->operands[2] = soffset;
+   mubuf->offen = (offset.type() == RegType::vgpr);
+   mubuf->glc = info->glc;
+   /* dlc follows glc on GFX10+. */
+   mubuf->dlc = info->glc && bld.program->chip_class >= GFX10;
+   mubuf->barrier = info->barrier;
+   mubuf->can_reorder = info->can_reorder;
+   mubuf->offset = const_offset;
+   /* Allocate a dword-rounded temp since bytes_size may be sub-dword. */
+   RegClass rc = RegClass::get(RegType::vgpr, align(bytes_size, 4));
+   Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
+   mubuf->definitions[0] = Definition(val);
+   bld.insert(std::move(mubuf));
+
+   /* Trim byte/short loads down to the requested size. */
+   if (bytes_size < 4)
+      val = bld.pseudo(aco_opcode::p_extract_vector, bld.def(RegClass::get(RegType::vgpr, bytes_size)), val, Operand(0u));
+
+   return val;
+}
+
+/* emit_load specialization for MUBUF. The non-type template arguments are
+ * presumably (SMEM-usable, sub-dword support, max constant offset 4096) --
+ * confirm against emit_load's declaration. */
+static auto emit_mubuf_load = emit_load<mubuf_load_callback, true, true, 4096>;
+
+/* Builds the s4 buffer resource descriptor used to emulate global memory
+ * access on GFX6 via MUBUF. The config dwords select a raw 32-bit data
+ * format; Operand(-1u) presumably fills num_records to disable bounds
+ * checking -- confirm against the SRD layout. A scalar 64-bit address is
+ * placed in the descriptor's first two dwords; for a VGPR address (addr64
+ * mode) that base is left as zero.
+ */
+Temp get_gfx6_global_rsrc(Builder& bld, Temp addr)
+{
+   const uint32_t desc_conf = S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
+                              S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
+
+   if (addr.type() != RegType::vgpr)
+      return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), addr, Operand(-1u), Operand(desc_conf));
+
+   return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), Operand(0u), Operand(0u), Operand(-1u), Operand(desc_conf));
+}
+
+/* Callback for emit_load: emits a single 1-16 byte load from a global
+ * address. Uses GLOBAL on GFX9+, FLAT on GFX7-8 and, on GFX6, MUBUF through
+ * a descriptor synthesized by get_gfx6_global_rsrc(). const_offset is never
+ * used here (emit_global_load's max constant offset is 1).
+ */
+Temp global_load_callback(Builder& bld, const LoadEmitInfo *info,
+                          Temp offset, unsigned bytes_needed,
+                          unsigned align_, unsigned const_offset,
+                          Temp dst_hint)
+{
+   unsigned bytes_size = 0;
+   bool mubuf = bld.program->chip_class == GFX6;
+   bool global = bld.program->chip_class >= GFX9;
+   aco_opcode op;
+   /* Pick the smallest opcode covering bytes_needed; the MUBUF path has no
+    * dwordx3. */
+   if (bytes_needed == 1) {
+      bytes_size = 1;
+      op = mubuf ? aco_opcode::buffer_load_ubyte : global ? aco_opcode::global_load_ubyte : aco_opcode::flat_load_ubyte;
+   } else if (bytes_needed == 2) {
+      bytes_size = 2;
+      op = mubuf ? aco_opcode::buffer_load_ushort : global ? aco_opcode::global_load_ushort : aco_opcode::flat_load_ushort;
+   } else if (bytes_needed <= 4) {
+      bytes_size = 4;
+      op = mubuf ? aco_opcode::buffer_load_dword : global ? aco_opcode::global_load_dword : aco_opcode::flat_load_dword;
+   } else if (bytes_needed <= 8) {
+      bytes_size = 8;
+      op = mubuf ? aco_opcode::buffer_load_dwordx2 : global ? aco_opcode::global_load_dwordx2 : aco_opcode::flat_load_dwordx2;
+   } else if (bytes_needed <= 12 && !mubuf) {
+      bytes_size = 12;
+      op = global ? aco_opcode::global_load_dwordx3 : aco_opcode::flat_load_dwordx3;
+   } else {
+      bytes_size = 16;
+      op = mubuf ? aco_opcode::buffer_load_dwordx4 : global ? aco_opcode::global_load_dwordx4 : aco_opcode::flat_load_dwordx4;
+   }
+   /* Allocate a dword-rounded temp since bytes_size may be sub-dword. */
+   RegClass rc = RegClass::get(RegType::vgpr, align(bytes_size, 4));
+   Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
+   if (mubuf) {
+      /* NOTE(review): this local shadows the bool 'mubuf' above. */
+      aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
+      mubuf->operands[0] = Operand(get_gfx6_global_rsrc(bld, offset));
+      /* addr64: a VGPR address goes into vaddr; an SGPR address was placed
+       * in the descriptor base by get_gfx6_global_rsrc() instead. */
+      mubuf->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
+      mubuf->operands[2] = Operand(0u);
+      mubuf->glc = info->glc;
+      mubuf->dlc = false;
+      mubuf->offset = 0;
+      mubuf->addr64 = offset.type() == RegType::vgpr;
+      mubuf->disable_wqm = false;
+      mubuf->barrier = info->barrier;
+      mubuf->definitions[0] = Definition(val);
+      bld.insert(std::move(mubuf));
+   } else {
+      /* FLAT/GLOBAL take the 64-bit address in VGPRs. */
+      offset = offset.regClass() == s2 ? bld.copy(bld.def(v2), offset) : offset;
+
+      aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 2, 1)};
+      flat->operands[0] = Operand(offset);
+      /* Second operand left as an undefined s1 (no SGPR address). */
+      flat->operands[1] = Operand(s1);
+      flat->glc = info->glc;
+      /* dlc follows glc on GFX10+. */
+      flat->dlc = info->glc && bld.program->chip_class >= GFX10;
+      flat->barrier = info->barrier;
+      flat->offset = 0u;
+      flat->definitions[0] = Definition(val);
+      bld.insert(std::move(flat));
+   }
+
+   /* Trim byte/short loads down to the requested size. */
+   if (bytes_size < 4)
+      val = bld.pseudo(aco_opcode::p_extract_vector, bld.def(RegClass::get(RegType::vgpr, bytes_size)), val, Operand(0u));
+
+   return val;
+}
+
+/* emit_load specialization for global memory. The final template argument of
+ * 1 effectively disables constant offsets, consistent with
+ * global_load_callback ignoring const_offset -- presumed meaning, confirm
+ * against emit_load's declaration. */
+static auto emit_global_load = emit_load<global_load_callback, true, true, 1>;
+
+Temp load_lds(isel_context *ctx, unsigned elem_size_bytes, Temp dst,
+ Temp address, unsigned base_offset, unsigned align)
+{
+ assert(util_is_power_of_two_nonzero(align));
+
+ Builder bld(ctx->program, ctx->block);
+
+ unsigned num_components = dst.bytes() / elem_size_bytes;
+ LoadEmitInfo info = {Operand(as_vgpr(ctx, address)), dst, num_components, elem_size_bytes};
+ info.align_mul = align;
+ info.align_offset = 0;
+ info.barrier = barrier_shared;
+ info.can_reorder = false;
+ info.const_offset = base_offset;
+ emit_lds_load(ctx, bld, &info);