+unsigned calculate_lds_alignment(isel_context *ctx, unsigned const_offset)
+{
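+ /* The alignment is at most 16 bytes, and a non-zero constant offset
+ * further limits it to its lowest set bit. */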
+ unsigned align = 16;
+ if (const_offset)
+ align = std::min(align, 1u << (ffs(const_offset) - 1));
+
+ return align;
+}
+
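+/* Create a vector (p_create_vector) from an array of element temporaries.
+ * Elements without a valid id are replaced by zero constants. When split_cnt
+ * is set, the new vector is also split back into split_cnt components.
+ */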
+Temp create_vec_from_array(isel_context *ctx, Temp arr[], unsigned cnt, RegType reg_type, unsigned elem_size_bytes,
+ unsigned split_cnt = 0u, Temp dst = Temp())
+{
+ Builder bld(ctx->program, ctx->block);
+ unsigned dword_size = elem_size_bytes / 4;
+
+ if (!dst.id())
+ dst = bld.tmp(RegClass(reg_type, cnt * dword_size));
+
+ std::array<Temp, NIR_MAX_VEC_COMPONENTS> allocated_vec;
+ aco_ptr<Pseudo_instruction> instr {create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, cnt, 1)};
+ instr->definitions[0] = Definition(dst);
+
+ for (unsigned i = 0; i < cnt; ++i) {
+ if (arr[i].id()) {
+ assert(arr[i].size() == dword_size);
+ allocated_vec[i] = arr[i];
+ instr->operands[i] = Operand(arr[i]);
+ } else {
+ Temp zero = bld.copy(bld.def(RegClass(reg_type, dword_size)), Operand(0u, dword_size == 2));
+ allocated_vec[i] = zero;
+ instr->operands[i] = Operand(zero);
+ }
+ }
+
+ bld.insert(std::move(instr));
+
+ if (split_cnt)
+ emit_split_vector(ctx, dst, split_cnt);
+ else
+ ctx->allocated_vec.emplace(dst.id(), allocated_vec); /* when splitting, emit_split_vector already does this */
+
+ return dst;
+}
+
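+/* The MUBUF immediate offset field is only 12 bits, so it can hold constant
+ * offsets up to 4095. Move any excess multiple of 4096 into voffset and
+ * return the remaining constant offset.
+ */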
+inline unsigned resolve_excess_vmem_const_offset(Builder &bld, Temp &voffset, unsigned const_offset)
+{
+ if (const_offset >= 4096) {
+ unsigned excess_const_offset = const_offset / 4096u * 4096u;
+ const_offset %= 4096u;
+
+ if (!voffset.id())
+ voffset = bld.copy(bld.def(v1), Operand(excess_const_offset));
+ else if (unlikely(voffset.regClass() == s1))
+ voffset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), Operand(excess_const_offset), Operand(voffset));
+ else if (likely(voffset.regClass() == v1))
+ voffset = bld.vadd32(bld.def(v1), Operand(voffset), Operand(excess_const_offset));
+ else
+ unreachable("Unsupported register class of voffset");
+ }
+
+ return const_offset;
+}
+
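+/* Emit a single buffer_store_dword[x2|x3|x4] for up to 4 dwords of vdata. */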
+void emit_single_mubuf_store(isel_context *ctx, Temp descriptor, Temp voffset, Temp soffset, Temp vdata,
+ unsigned const_offset = 0u, bool allow_reorder = true, bool slc = false)
+{
+ assert(vdata.id());
+ assert(vdata.size() != 3 || ctx->program->chip_class != GFX6);
+ assert(vdata.size() >= 1 && vdata.size() <= 4);
+
+ Builder bld(ctx->program, ctx->block);
+ aco_opcode op = (aco_opcode) ((unsigned) aco_opcode::buffer_store_dword + vdata.size() - 1);
+ const_offset = resolve_excess_vmem_const_offset(bld, voffset, const_offset);
+
+ Operand voffset_op = voffset.id() ? Operand(as_vgpr(ctx, voffset)) : Operand(v1);
+ Operand soffset_op = soffset.id() ? Operand(soffset) : Operand(0u);
+ Builder::Result r = bld.mubuf(op, Operand(descriptor), voffset_op, soffset_op, Operand(vdata), const_offset,
+ /* offen */ !voffset_op.isUndefined(), /* idxen */ false, /* addr64 */ false,
+ /* disable_wqm */ false, /* glc */ true, /* dlc */ false, /* slc */ slc);
+
+ static_cast<MUBUF_instruction *>(r.instr)->can_reorder = allow_reorder;
+}
+
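+/* Store to VMEM according to a component write mask. 64-bit elements are
+ * split into dwords first, then each consecutive run of enabled components
+ * is written using as few MUBUF stores as possible.
+ */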
+void store_vmem_mubuf(isel_context *ctx, Temp src, Temp descriptor, Temp voffset, Temp soffset,
+ unsigned base_const_offset, unsigned elem_size_bytes, unsigned write_mask,
+ bool allow_combining = true, bool reorder = true, bool slc = false)
+{
+ Builder bld(ctx->program, ctx->block);
+ assert(elem_size_bytes == 4 || elem_size_bytes == 8);
+ assert(write_mask);
+
+ if (elem_size_bytes == 8) {
+ elem_size_bytes = 4;
+ write_mask = widen_mask(write_mask, 2);
+ }
+
+ while (write_mask) {
+ int start = 0;
+ int count = 0;
+ u_bit_scan_consecutive_range(&write_mask, &start, &count);
+ assert(count > 0);
+ assert(start >= 0);
+
+ while (count > 0) {
+ unsigned sub_count = allow_combining ? MIN2(count, 4) : 1;
+ unsigned const_offset = (unsigned) start * elem_size_bytes + base_const_offset;
+
+ /* GFX6 doesn't have buffer_store_dwordx3, so make sure not to emit that here either. */
+ if (unlikely(ctx->program->chip_class == GFX6 && sub_count == 3))
+ sub_count = 2;
+
+ Temp elem = extract_subvector(ctx, src, start, sub_count, RegType::vgpr);
+ emit_single_mubuf_store(ctx, descriptor, voffset, soffset, elem, const_offset, reorder, slc);
+
+ count -= sub_count;
+ start += sub_count;
+ }
+
+ assert(count == 0);
+ }
+}
+
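+/* Emit a single buffer_load_dword[x2|x3|x4] and return the loaded value. */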
+Temp emit_single_mubuf_load(isel_context *ctx, Temp descriptor, Temp voffset, Temp soffset,
+ unsigned const_offset, unsigned size_dwords, bool allow_reorder = true)
+{
+ assert(size_dwords != 3 || ctx->program->chip_class != GFX6);
+ assert(size_dwords >= 1 && size_dwords <= 4);
+
+ Builder bld(ctx->program, ctx->block);
+ Temp vdata = bld.tmp(RegClass(RegType::vgpr, size_dwords));
+ aco_opcode op = (aco_opcode) ((unsigned) aco_opcode::buffer_load_dword + size_dwords - 1);
+ const_offset = resolve_excess_vmem_const_offset(bld, voffset, const_offset);
+
+ Operand voffset_op = voffset.id() ? Operand(as_vgpr(ctx, voffset)) : Operand(v1);
+ Operand soffset_op = soffset.id() ? Operand(soffset) : Operand(0u);
+ Builder::Result r = bld.mubuf(op, Definition(vdata), Operand(descriptor), voffset_op, soffset_op, const_offset,
+ /* offen */ !voffset_op.isUndefined(), /* idxen */ false, /* addr64 */ false,
+ /* disable_wqm */ false, /* glc */ true,
+ /* dlc */ ctx->program->chip_class >= GFX10, /* slc */ false);
+
+ static_cast<MUBUF_instruction *>(r.instr)->can_reorder = allow_reorder;
+
+ return vdata;
+}
+
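+/* Load num_components elements from VMEM into dst. Exactly one of stride and
+ * allow_combining must be used: either adjacent components are combined into
+ * dwordx2/x3/x4 loads, or each component is loaded separately with the given
+ * stride between them.
+ */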
+void load_vmem_mubuf(isel_context *ctx, Temp dst, Temp descriptor, Temp voffset, Temp soffset,
+ unsigned base_const_offset, unsigned elem_size_bytes, unsigned num_components,
+ unsigned stride = 0u, bool allow_combining = true, bool allow_reorder = true)
+{
+ assert(elem_size_bytes == 4 || elem_size_bytes == 8);
+ assert((num_components * elem_size_bytes / 4) == dst.size());
+ assert(!!stride != allow_combining);
+
+ Builder bld(ctx->program, ctx->block);
+ unsigned split_cnt = num_components;
+
+ if (elem_size_bytes == 8) {
+ elem_size_bytes = 4;
+ num_components *= 2;
+ }
+
+ if (!stride)
+ stride = elem_size_bytes;
+
+ unsigned load_size = 1;
+ if (allow_combining) {
+ if ((num_components % 4) == 0)
+ load_size = 4;
+ else if ((num_components % 3) == 0 && ctx->program->chip_class != GFX6)
+ load_size = 3;
+ else if ((num_components % 2) == 0)
+ load_size = 2;
+ }
+
+ unsigned num_loads = num_components / load_size;
+ std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
+
+ for (unsigned i = 0; i < num_loads; ++i) {
+ unsigned const_offset = i * stride * load_size + base_const_offset;
+ elems[i] = emit_single_mubuf_load(ctx, descriptor, voffset, soffset, const_offset, load_size, allow_reorder);
+ }
+
+ create_vec_from_array(ctx, elems.data(), num_loads, RegType::vgpr, load_size * 4u, split_cnt, dst);
+}
+
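+/* Add a NIR source, scaled by stride, to a (variable, constant) offset pair.
+ * Constant sources are folded into the constant part; variable sources are
+ * multiplied by the stride and added to the variable part.
+ */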
+std::pair<Temp, unsigned> offset_add_from_nir(isel_context *ctx, const std::pair<Temp, unsigned> &base_offset, nir_src *off_src, unsigned stride = 1u)
+{
+ Builder bld(ctx->program, ctx->block);
+ Temp offset = base_offset.first;
+ unsigned const_offset = base_offset.second;
+
+ if (!nir_src_is_const(*off_src)) {
+ Temp indirect_offset_arg = get_ssa_temp(ctx, off_src->ssa);
+ Temp with_stride;
+
+ /* Calculate indirect offset with stride */
+ if (likely(indirect_offset_arg.regClass() == v1))
+ with_stride = bld.v_mul_imm(bld.def(v1), indirect_offset_arg, stride);
+ else if (indirect_offset_arg.regClass() == s1)
+ with_stride = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(stride), indirect_offset_arg);
+ else
+ unreachable("Unsupported register class of indirect offset");
+
+ /* Add to the supplied base offset */
+ if (offset.id() == 0)
+ offset = with_stride;
+ else if (unlikely(offset.regClass() == s1 && with_stride.regClass() == s1))
+ offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), with_stride, offset);
+ else if (offset.size() == 1 && with_stride.size() == 1)
+ offset = bld.vadd32(bld.def(v1), with_stride, offset);
+ else
+ unreachable("Unsupported register class of indirect offset");
+ } else {
+ unsigned const_offset_arg = nir_src_as_uint(*off_src);
+ const_offset += const_offset_arg * stride;
+ }
+
+ return std::make_pair(offset, const_offset);
+}
+
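+/* Add two (variable, constant) offset pairs. */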
+std::pair<Temp, unsigned> offset_add(isel_context *ctx, const std::pair<Temp, unsigned> &off1, const std::pair<Temp, unsigned> &off2)
+{
+ Builder bld(ctx->program, ctx->block);
+ Temp offset;
+
+ if (off1.first.id() && off2.first.id()) {
+ if (unlikely(off1.first.regClass() == s1 && off2.first.regClass() == s1))
+ offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), off1.first, off2.first);
+ else if (off1.first.size() == 1 && off2.first.size() == 1)
+ offset = bld.vadd32(bld.def(v1), off1.first, off2.first);
+ else
+ unreachable("Unsupported register class of indirect offset");
+ } else {
+ offset = off1.first.id() ? off1.first : off2.first;
+ }
+
+ return std::make_pair(offset, off1.second + off2.second);
+}
+
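+/* Multiply both parts of a (variable, constant) offset pair by a constant. */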
+std::pair<Temp, unsigned> offset_mul(isel_context *ctx, const std::pair<Temp, unsigned> &offs, unsigned multiplier)
+{
+ Builder bld(ctx->program, ctx->block);
+ unsigned const_offset = offs.second * multiplier;
+
+ if (!offs.first.id())
+ return std::make_pair(offs.first, const_offset);
+
+ Temp offset = unlikely(offs.first.regClass() == s1)
+ ? bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(multiplier), offs.first)
+ : bld.v_mul_imm(bld.def(v1), offs.first, multiplier);
+
+ return std::make_pair(offset, const_offset);
+}
+
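+/* Get the offset of an I/O intrinsic as a (variable, constant) pair:
+ * the base and component scaled by the given strides, plus the
+ * intrinsic's offset source.
+ */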
+std::pair<Temp, unsigned> get_intrinsic_io_basic_offset(isel_context *ctx, nir_intrinsic_instr *instr, unsigned base_stride, unsigned component_stride)
+{
+ Builder bld(ctx->program, ctx->block);
+
+ /* base is the driver_location, which is already multiplied by 4, so is in dwords */
+ unsigned const_offset = nir_intrinsic_base(instr) * base_stride;
+ /* component is in bytes */
+ const_offset += nir_intrinsic_component(instr) * component_stride;
+
+ /* offset should be interpreted in relation to the base, so the instruction
+ * effectively reads/writes another input/output when it has an offset
+ */
+ nir_src *off_src = nir_get_io_offset_src(instr);
+ return offset_add_from_nir(ctx, std::make_pair(Temp(), const_offset), off_src, 4u * base_stride);
+}
+
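+/* Overload that uses the same stride for the base and the component. */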
+std::pair<Temp, unsigned> get_intrinsic_io_basic_offset(isel_context *ctx, nir_intrinsic_instr *instr, unsigned stride = 1u)
+{
+ return get_intrinsic_io_basic_offset(ctx, instr, stride, stride);
+}
+
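+/* Get the current invocation's patch ID, relative to its threadgroup. */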
+Temp get_tess_rel_patch_id(isel_context *ctx)
+{
+ Builder bld(ctx->program, ctx->block);
+
+ switch (ctx->shader->info.stage) {
+ case MESA_SHADER_TESS_CTRL:
+ return bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0xffu),
+ get_arg(ctx, ctx->args->ac.tcs_rel_ids));
+ case MESA_SHADER_TESS_EVAL:
+ return get_arg(ctx, ctx->args->tes_rel_patch_id);
+ default:
+ unreachable("Unsupported stage in get_tess_rel_patch_id");
+ }
+}
+
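+/* Get the LDS offset (in bytes) of a TCS per-vertex input.
+ * The offset is accumulated in dwords and converted to bytes at the end.
+ */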
+std::pair<Temp, unsigned> get_tcs_per_vertex_input_lds_offset(isel_context *ctx, nir_intrinsic_instr *instr)
+{
+ assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
+ Builder bld(ctx->program, ctx->block);
+
+ uint32_t tcs_in_patch_stride = ctx->args->options->key.tcs.input_vertices * ctx->tcs_num_inputs * 4;
+ uint32_t tcs_in_vertex_stride = ctx->tcs_num_inputs * 4;
+
+ std::pair<Temp, unsigned> offs = get_intrinsic_io_basic_offset(ctx, instr);
+
+ nir_src *vertex_index_src = nir_get_io_vertex_index_src(instr);
+ offs = offset_add_from_nir(ctx, offs, vertex_index_src, tcs_in_vertex_stride);
+
+ Temp rel_patch_id = get_tess_rel_patch_id(ctx);
+ Temp tcs_in_current_patch_offset = bld.v_mul24_imm(bld.def(v1), rel_patch_id, tcs_in_patch_stride);
+ offs = offset_add(ctx, offs, std::make_pair(tcs_in_current_patch_offset, 0u));
+
+ return offset_mul(ctx, offs, 4u);
+}
+
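+/* Get the LDS offset (in bytes) of a TCS output.
+ * Outputs are laid out after the inputs of all patches; within each patch's
+ * region, the per-vertex outputs come before the per-patch outputs.
+ */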
+std::pair<Temp, unsigned> get_tcs_output_lds_offset(isel_context *ctx, nir_intrinsic_instr *instr = nullptr, bool per_vertex = false)
+{
+ assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
+ Builder bld(ctx->program, ctx->block);
+
+ uint32_t input_patch_size = ctx->args->options->key.tcs.input_vertices * ctx->tcs_num_inputs * 16;
+ uint32_t num_tcs_outputs = util_last_bit64(ctx->args->shader_info->tcs.outputs_written);
+ uint32_t num_tcs_patch_outputs = util_last_bit64(ctx->args->shader_info->tcs.patch_outputs_written);
+ uint32_t output_vertex_size = num_tcs_outputs * 16;
+ uint32_t pervertex_output_patch_size = ctx->shader->info.tess.tcs_vertices_out * output_vertex_size;
+ uint32_t output_patch_stride = pervertex_output_patch_size + num_tcs_patch_outputs * 16;
+
+ std::pair<Temp, unsigned> offs = instr
+ ? get_intrinsic_io_basic_offset(ctx, instr, 4u)
+ : std::make_pair(Temp(), 0u);
+
+ Temp rel_patch_id = get_tess_rel_patch_id(ctx);
+ Temp patch_off = bld.v_mul24_imm(bld.def(v1), rel_patch_id, output_patch_stride);
+
+ if (per_vertex) {
+ assert(instr);
+
+ nir_src *vertex_index_src = nir_get_io_vertex_index_src(instr);
+ offs = offset_add_from_nir(ctx, offs, vertex_index_src, output_vertex_size);
+
+ uint32_t output_patch0_offset = (input_patch_size * ctx->tcs_num_patches);
+ offs = offset_add(ctx, offs, std::make_pair(patch_off, output_patch0_offset));
+ } else {
+ uint32_t output_patch0_patch_data_offset = (input_patch_size * ctx->tcs_num_patches + pervertex_output_patch_size);
+ offs = offset_add(ctx, offs, std::make_pair(patch_off, output_patch0_patch_data_offset));
+ }
+
+ return offs;
+}
+
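+/* Get the VMEM offset of a TCS per-vertex output. The layout is attribute
+ * major: each attribute is stored for every vertex of every patch before
+ * the next attribute begins.
+ */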
+std::pair<Temp, unsigned> get_tcs_per_vertex_output_vmem_offset(isel_context *ctx, nir_intrinsic_instr *instr)
+{
+ Builder bld(ctx->program, ctx->block);
+
+ unsigned vertices_per_patch = ctx->shader->info.tess.tcs_vertices_out;
+ unsigned attr_stride = vertices_per_patch * ctx->tcs_num_patches;
+
+ std::pair<Temp, unsigned> offs = get_intrinsic_io_basic_offset(ctx, instr, attr_stride * 4u, 4u);
+
+ Temp rel_patch_id = get_tess_rel_patch_id(ctx);
+ Temp patch_off = bld.v_mul24_imm(bld.def(v1), rel_patch_id, vertices_per_patch * 16u);
+ offs = offset_add(ctx, offs, std::make_pair(patch_off, 0u));
+
+ nir_src *vertex_index_src = nir_get_io_vertex_index_src(instr);
+ offs = offset_add_from_nir(ctx, offs, vertex_index_src, 16u);
+
+ return offs;
+}
+
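+/* Get the VMEM offset of a TCS per-patch output. Per-patch outputs are
+ * stored after all per-vertex outputs, also in an attribute major layout.
+ */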
+std::pair<Temp, unsigned> get_tcs_per_patch_output_vmem_offset(isel_context *ctx, nir_intrinsic_instr *instr = nullptr, unsigned const_base_offset = 0u)
+{
+ Builder bld(ctx->program, ctx->block);
+
+ unsigned num_tcs_outputs = ctx->shader->info.stage == MESA_SHADER_TESS_CTRL
+ ? util_last_bit64(ctx->args->shader_info->tcs.outputs_written)
+ : ctx->args->options->key.tes.tcs_num_outputs;
+
+ unsigned output_vertex_size = num_tcs_outputs * 16;
+ unsigned per_vertex_output_patch_size = ctx->shader->info.tess.tcs_vertices_out * output_vertex_size;
+ unsigned per_patch_data_offset = per_vertex_output_patch_size * ctx->tcs_num_patches;
+ unsigned attr_stride = ctx->tcs_num_patches;
+
+ std::pair<Temp, unsigned> offs = instr
+ ? get_intrinsic_io_basic_offset(ctx, instr, attr_stride * 4u, 4u)
+ : std::make_pair(Temp(), 0u);
+
+ if (const_base_offset)
+ offs.second += const_base_offset * attr_stride;
+
+ Temp rel_patch_id = get_tess_rel_patch_id(ctx);
+ Temp patch_off = bld.v_mul_imm(bld.def(v1), rel_patch_id, 16u);
+ offs = offset_add(ctx, offs, std::make_pair(patch_off, per_patch_data_offset));
+
+ return offs;
+}
+
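+/* Check whether an I/O intrinsic with a constant offset addresses one of the
+ * slots in the given API mask. When the intrinsic uses an indirect offset,
+ * no match is possible and *indirect is set to true.
+ */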
+bool tcs_driver_location_matches_api_mask(isel_context *ctx, nir_intrinsic_instr *instr, bool per_vertex, uint64_t mask, bool *indirect)
+{
+ unsigned off = nir_intrinsic_base(instr) * 4u;
+ nir_src *off_src = nir_get_io_offset_src(instr);
+
+ if (!nir_src_is_const(*off_src)) {
+ *indirect = true;
+ return false;
+ }
+
+ *indirect = false;
+ off += nir_src_as_uint(*off_src) * 16u;
+
+ while (mask) {
+ unsigned slot = u_bit_scan64(&mask) + (per_vertex ? 0 : VARYING_SLOT_PATCH0);
+ if (off == shader_io_get_unique_index((gl_varying_slot) slot) * 16u)
+ return true;
+ }
+
+ return false;
+}
+
+bool store_output_to_temps(isel_context *ctx, nir_intrinsic_instr *instr)