+std::pair<Temp, unsigned> get_intrinsic_io_basic_offset(isel_context *ctx, nir_intrinsic_instr *instr, unsigned stride = 1u)
+{
+ return get_intrinsic_io_basic_offset(ctx, instr, stride, stride);
+}
+
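+/* Returns the zero-based index of the current patch within the threadgroup.
+ * For the TCS it is packed into the low 8 bits of tcs_rel_ids (hence the
+ * mask); for the TES the driver passes it as a separate argument.
+ */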
+Temp get_tess_rel_patch_id(isel_context *ctx)
+{
+ Builder bld(ctx->program, ctx->block);
+
+ switch (ctx->shader->info.stage) {
+ case MESA_SHADER_TESS_CTRL:
+ return bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0xffu),
+ get_arg(ctx, ctx->args->ac.tcs_rel_ids));
+ case MESA_SHADER_TESS_EVAL:
+ return get_arg(ctx, ctx->args->tes_rel_patch_id);
+ default:
+ unreachable("Unsupported stage in get_tess_rel_patch_id");
+ }
+}
+
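+/* Computes the LDS offset of a TCS per-vertex input as a (variable, constant)
+ * pair. Inputs are laid out patch-major in LDS: the strides here are in
+ * dwords and the final offset_mul scales the result to bytes. E.g. with
+ * 3 input vertices and 8 input slots, the vertex stride is 8 * 4 = 32 dwords
+ * and the patch stride is 3 * 32 = 96 dwords (384 bytes).
+ */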
+std::pair<Temp, unsigned> get_tcs_per_vertex_input_lds_offset(isel_context *ctx, nir_intrinsic_instr *instr)
+{
+ assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
+ Builder bld(ctx->program, ctx->block);
+
+ uint32_t tcs_in_patch_stride = ctx->args->options->key.tcs.input_vertices * ctx->tcs_num_inputs * 4;
+ uint32_t tcs_in_vertex_stride = ctx->tcs_num_inputs * 4;
+
+ std::pair<Temp, unsigned> offs = get_intrinsic_io_basic_offset(ctx, instr);
+
+ nir_src *vertex_index_src = nir_get_io_vertex_index_src(instr);
+ offs = offset_add_from_nir(ctx, offs, vertex_index_src, tcs_in_vertex_stride);
+
+ Temp rel_patch_id = get_tess_rel_patch_id(ctx);
+ Temp tcs_in_current_patch_offset = bld.v_mul24_imm(bld.def(v1), rel_patch_id, tcs_in_patch_stride);
+ offs = offset_add(ctx, offs, std::make_pair(tcs_in_current_patch_offset, 0u));
+
+ return offset_mul(ctx, offs, 4u);
+}
+
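+/* Computes the LDS offset of a TCS output in bytes. The LDS layout assumed
+ * here is: the inputs of all patches first (input_patch_size *
+ * tcs_num_patches bytes), then the per-vertex outputs of every patch, then
+ * the per-patch outputs. With instr == NULL, the offset of the start of the
+ * current patch's per-patch output data is returned.
+ */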
+std::pair<Temp, unsigned> get_tcs_output_lds_offset(isel_context *ctx, nir_intrinsic_instr *instr = nullptr, bool per_vertex = false)
+{
+ assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
+ Builder bld(ctx->program, ctx->block);
+
+ uint32_t input_patch_size = ctx->args->options->key.tcs.input_vertices * ctx->tcs_num_inputs * 16;
+ uint32_t output_vertex_size = ctx->tcs_num_outputs * 16;
+ uint32_t pervertex_output_patch_size = ctx->shader->info.tess.tcs_vertices_out * output_vertex_size;
+ uint32_t output_patch_stride = pervertex_output_patch_size + ctx->tcs_num_patch_outputs * 16;
+
+ std::pair<Temp, unsigned> offs = instr
+ ? get_intrinsic_io_basic_offset(ctx, instr, 4u)
+ : std::make_pair(Temp(), 0u);
+
+ Temp rel_patch_id = get_tess_rel_patch_id(ctx);
+ Temp patch_off = bld.v_mul24_imm(bld.def(v1), rel_patch_id, output_patch_stride);
+
+ if (per_vertex) {
+ assert(instr);
+
+ nir_src *vertex_index_src = nir_get_io_vertex_index_src(instr);
+ offs = offset_add_from_nir(ctx, offs, vertex_index_src, output_vertex_size);
+
+ uint32_t output_patch0_offset = (input_patch_size * ctx->tcs_num_patches);
+ offs = offset_add(ctx, offs, std::make_pair(patch_off, output_patch0_offset));
+ } else {
+ uint32_t output_patch0_patch_data_offset = (input_patch_size * ctx->tcs_num_patches + pervertex_output_patch_size);
+ offs = offset_add(ctx, offs, std::make_pair(patch_off, output_patch0_patch_data_offset));
+ }
+
+ return offs;
+}
+
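+/* Computes the VMEM offset of a TCS per-vertex output in the off-chip
+ * buffer. The layout appears to be attribute-major: each attribute slot
+ * holds the values of every vertex of every patch, hence the
+ * vertices_per_patch * tcs_num_patches attribute stride.
+ */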
+std::pair<Temp, unsigned> get_tcs_per_vertex_output_vmem_offset(isel_context *ctx, nir_intrinsic_instr *instr)
+{
+ Builder bld(ctx->program, ctx->block);
+
+ unsigned vertices_per_patch = ctx->shader->info.tess.tcs_vertices_out;
+ unsigned attr_stride = vertices_per_patch * ctx->tcs_num_patches;
+
+ std::pair<Temp, unsigned> offs = get_intrinsic_io_basic_offset(ctx, instr, attr_stride * 4u, 4u);
+
+ Temp rel_patch_id = get_tess_rel_patch_id(ctx);
+ Temp patch_off = bld.v_mul24_imm(bld.def(v1), rel_patch_id, vertices_per_patch * 16u);
+ offs = offset_add(ctx, offs, std::make_pair(patch_off, 0u));
+
+ nir_src *vertex_index_src = nir_get_io_vertex_index_src(instr);
+ offs = offset_add_from_nir(ctx, offs, vertex_index_src, 16u);
+
+ return offs;
+}
+
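+/* Computes the VMEM offset of a TCS per-patch output. Per-patch data starts
+ * after the per-vertex outputs of all patches (per_patch_data_offset) and is
+ * likewise attribute-major, so the attribute stride is just tcs_num_patches.
+ */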
+std::pair<Temp, unsigned> get_tcs_per_patch_output_vmem_offset(isel_context *ctx, nir_intrinsic_instr *instr = nullptr, unsigned const_base_offset = 0u)
+{
+ Builder bld(ctx->program, ctx->block);
+
+ unsigned output_vertex_size = ctx->tcs_num_outputs * 16;
+ unsigned per_vertex_output_patch_size = ctx->shader->info.tess.tcs_vertices_out * output_vertex_size;
+ unsigned per_patch_data_offset = per_vertex_output_patch_size * ctx->tcs_num_patches;
+ unsigned attr_stride = ctx->tcs_num_patches;
+
+ std::pair<Temp, unsigned> offs = instr
+ ? get_intrinsic_io_basic_offset(ctx, instr, attr_stride * 4u, 4u)
+ : std::make_pair(Temp(), 0u);
+
+ if (const_base_offset)
+ offs.second += const_base_offset * attr_stride;
+
+ Temp rel_patch_id = get_tess_rel_patch_id(ctx);
+ Temp patch_off = bld.v_mul24_imm(bld.def(v1), rel_patch_id, 16u);
+ offs = offset_add(ctx, offs, std::make_pair(patch_off, per_patch_data_offset));
+
+ return offs;
+}
+
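+/* Checks whether the output slot written by the given intrinsic is set in
+ * the given API-level slot mask. When the I/O offset is not a compile-time
+ * constant, the target slot is unknown, so this returns false and reports
+ * the access through *indirect instead.
+ */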
+bool tcs_driver_location_matches_api_mask(isel_context *ctx, nir_intrinsic_instr *instr, bool per_vertex, uint64_t mask, bool *indirect)
+{
+ assert(per_vertex || ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
+
+ if (mask == 0)
+ return false;
+
+ unsigned drv_loc = nir_intrinsic_base(instr);
+ nir_src *off_src = nir_get_io_offset_src(instr);
+
+ if (!nir_src_is_const(*off_src)) {
+ *indirect = true;
+ return false;
+ }
+
+ *indirect = false;
+ uint64_t slot = per_vertex
+ ? ctx->output_drv_loc_to_var_slot[ctx->shader->info.stage][drv_loc / 4]
+ : (ctx->output_tcs_patch_drv_loc_to_var_slot[drv_loc / 4] - VARYING_SLOT_PATCH0);
+ return (((uint64_t) 1) << slot) & mask;
+}
+
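+/* Tries to record the components written by a store_output in ctx->outputs
+ * so they can later be read back from temporaries instead of memory.
+ * Only possible when the offset is a compile-time constant.
+ */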
+bool store_output_to_temps(isel_context *ctx, nir_intrinsic_instr *instr)
+{
+ unsigned write_mask = nir_intrinsic_write_mask(instr);
+ unsigned component = nir_intrinsic_component(instr);
+ unsigned idx = nir_intrinsic_base(instr) + component;
+
+ nir_instr *off_instr = instr->src[1].ssa->parent_instr;
+ if (off_instr->type != nir_instr_type_load_const)
+ return false;
+
+ Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
+ idx += nir_src_as_uint(instr->src[1]) * 4u;
+
+ if (instr->src[0].ssa->bit_size == 64)
+ write_mask = widen_mask(write_mask, 2);
+
+ for (unsigned i = 0; i < 8; ++i) {
+ if (write_mask & (1 << i)) {
+ ctx->outputs.mask[idx / 4u] |= 1 << (idx % 4u);
+ ctx->outputs.temps[idx] = emit_extract_vector(ctx, src, i, v1);
+ }
+ idx++;
+ }
+
+ return true;
+}
+
+bool load_input_from_temps(isel_context *ctx, nir_intrinsic_instr *instr, Temp dst)
+{
+ /* Only TCS per-vertex inputs are supported by this function.
+ * A VS output can only be reused as a TCS per-vertex input when the vertex
+ * index matches the TCS invocation id, which requires the VS and TCS
+ * invocation counts to be equal.
+ */
+ if (ctx->shader->info.stage != MESA_SHADER_TESS_CTRL || !ctx->tcs_in_out_eq)
+ return false;
+
+ nir_src *off_src = nir_get_io_offset_src(instr);
+ nir_src *vertex_index_src = nir_get_io_vertex_index_src(instr);
+ nir_instr *vertex_index_instr = vertex_index_src->ssa->parent_instr;
+ bool can_use_temps = nir_src_is_const(*off_src) &&
+ vertex_index_instr->type == nir_instr_type_intrinsic &&
+ nir_instr_as_intrinsic(vertex_index_instr)->intrinsic == nir_intrinsic_load_invocation_id;
+
+ if (!can_use_temps)
+ return false;
+
+ unsigned idx = nir_intrinsic_base(instr) + nir_intrinsic_component(instr) + 4 * nir_src_as_uint(*off_src);
+ Temp *src = &ctx->inputs.temps[idx];
+ create_vec_from_array(ctx, src, dst.size(), dst.regClass().type(), 4u, 0, dst);
+
+ return true;
+}
+
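+/* Stores an LS or ES stage output: separate ES waves (GFX6-8) write to the
+ * ESGS ring in VMEM, while LS outputs and merged ES outputs (GFX9+) go
+ * through LDS.
+ */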
+void visit_store_ls_or_es_output(isel_context *ctx, nir_intrinsic_instr *instr)
+{
+ Builder bld(ctx->program, ctx->block);
+
+ if (ctx->tcs_in_out_eq && store_output_to_temps(ctx, instr)) {
+ /* When the TCS only reads this output directly, and only for the vertices matching its invocation id, there is no need to store the VS output to LDS. */
+ bool indirect_write;
+ bool temp_only_input = tcs_driver_location_matches_api_mask(ctx, instr, true, ctx->tcs_temp_only_inputs, &indirect_write);
+ if (temp_only_input && !indirect_write)
+ return;
+ }
+
+ std::pair<Temp, unsigned> offs = get_intrinsic_io_basic_offset(ctx, instr, 4u);
+ Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
+ unsigned write_mask = nir_intrinsic_write_mask(instr);
+ unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8u;
+
+ if (ctx->stage == vertex_es || ctx->stage == tess_eval_es) {
+ /* GFX6-8: ES stage is not merged into GS, data is passed from ES to GS in VMEM. */
+ Temp esgs_ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_ESGS_VS * 16u));
+ Temp es2gs_offset = get_arg(ctx, ctx->args->es2gs_offset);
+ store_vmem_mubuf(ctx, src, esgs_ring, offs.first, es2gs_offset, offs.second, elem_size_bytes, write_mask, false, true, true);
+ } else {
+ Temp lds_base;
+
+ if (ctx->stage == vertex_geometry_gs || ctx->stage == tess_eval_geometry_gs) {
+ /* GFX9+: ES stage is merged into GS, data is passed between them using LDS. */
+ unsigned itemsize = ctx->stage == vertex_geometry_gs
+ ? ctx->program->info->vs.es_info.esgs_itemsize
+ : ctx->program->info->tes.es_info.esgs_itemsize;
+ Temp thread_id = emit_mbcnt(ctx, bld.def(v1));
+ Temp wave_idx = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), get_arg(ctx, ctx->args->merged_wave_info), Operand(4u << 16 | 24));
+ Temp vertex_idx = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), thread_id,
+ bld.v_mul24_imm(bld.def(v1), as_vgpr(ctx, wave_idx), ctx->program->wave_size));
+ lds_base = bld.v_mul24_imm(bld.def(v1), vertex_idx, itemsize);
+ } else if (ctx->stage == vertex_ls || ctx->stage == vertex_tess_control_hs) {
+ /* GFX6-8: VS runs on LS stage when tessellation is used, but LS shares LDS space with HS.
+ * GFX9+: LS is merged into HS, but still uses the same LDS layout.
+ */
+ Temp vertex_idx = get_arg(ctx, ctx->args->rel_auto_id);
+ lds_base = bld.v_mul24_imm(bld.def(v1), vertex_idx, ctx->tcs_num_inputs * 16u);
+ } else {
+ unreachable("Invalid LS or ES stage");
+ }
+
+ offs = offset_add(ctx, offs, std::make_pair(lds_base, 0u));
+ unsigned lds_align = calculate_lds_alignment(ctx, offs.second);
+ store_lds(ctx, elem_size_bytes, src, write_mask, offs.first, offs.second, lds_align);
+ }
+}
+
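+/* Whether the given TCS output is one of the tessellation levels.
+ * These are only kept in LDS (presumably so they can be read back when the
+ * tess factors are emitted at the end of the shader) and never stored to
+ * the off-chip buffer here.
+ */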
+bool tcs_output_is_tess_factor(isel_context *ctx, nir_intrinsic_instr *instr, bool per_vertex)
+{
+ if (per_vertex)
+ return false;
+
+ unsigned off = nir_intrinsic_base(instr) * 4u;
+ return off == ctx->tcs_tess_lvl_out_loc ||
+ off == ctx->tcs_tess_lvl_in_loc;
+}
+
+bool tcs_output_is_read_by_tes(isel_context *ctx, nir_intrinsic_instr *instr, bool per_vertex)
+{
+ uint64_t mask = per_vertex
+ ? ctx->program->info->tcs.tes_inputs_read
+ : ctx->program->info->tcs.tes_patch_inputs_read;
+
+ bool indirect_write = false;
+ bool output_read_by_tes = tcs_driver_location_matches_api_mask(ctx, instr, per_vertex, mask, &indirect_write);
+ return indirect_write || output_read_by_tes;
+}
+
+bool tcs_output_is_read_by_tcs(isel_context *ctx, nir_intrinsic_instr *instr, bool per_vertex)
+{
+ uint64_t mask = per_vertex
+ ? ctx->shader->info.outputs_read
+ : ctx->shader->info.patch_outputs_read;
+
+ bool indirect_write = false;
+ bool output_read = tcs_driver_location_matches_api_mask(ctx, instr, per_vertex, mask, &indirect_write);
+ return indirect_write || output_read;
+}
+
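+/* Stores a TCS output to VMEM and/or LDS: to the off-chip buffer when the
+ * TES reads the output, and to LDS when the TCS itself reads it back or it
+ * is a tess factor. Indirectly addressed outputs conservatively take both
+ * paths (except that tess factors never go to VMEM here).
+ */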
+void visit_store_tcs_output(isel_context *ctx, nir_intrinsic_instr *instr, bool per_vertex)
+{
+ assert(ctx->stage == tess_control_hs || ctx->stage == vertex_tess_control_hs);
+ assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
+
+ Builder bld(ctx->program, ctx->block);
+
+ Temp store_val = get_ssa_temp(ctx, instr->src[0].ssa);
+ unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
+ unsigned write_mask = nir_intrinsic_write_mask(instr);
+
+ bool is_tess_factor = tcs_output_is_tess_factor(ctx, instr, per_vertex);
+ bool write_to_vmem = !is_tess_factor && tcs_output_is_read_by_tes(ctx, instr, per_vertex);
+ bool write_to_lds = is_tess_factor || tcs_output_is_read_by_tcs(ctx, instr, per_vertex);
+
+ if (write_to_vmem) {
+ std::pair<Temp, unsigned> vmem_offs = per_vertex
+ ? get_tcs_per_vertex_output_vmem_offset(ctx, instr)
+ : get_tcs_per_patch_output_vmem_offset(ctx, instr);
+
+ Temp hs_ring_tess_offchip = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_HS_TESS_OFFCHIP * 16u));
+ Temp oc_lds = get_arg(ctx, ctx->args->oc_lds);
+ store_vmem_mubuf(ctx, store_val, hs_ring_tess_offchip, vmem_offs.first, oc_lds, vmem_offs.second, elem_size_bytes, write_mask, true, false);
+ }
+
+ if (write_to_lds) {
+ std::pair<Temp, unsigned> lds_offs = get_tcs_output_lds_offset(ctx, instr, per_vertex);
+ unsigned lds_align = calculate_lds_alignment(ctx, lds_offs.second);
+ store_lds(ctx, elem_size_bytes, store_val, write_mask, lds_offs.first, lds_offs.second, lds_align);
+ }
+}
+
+void visit_load_tcs_output(isel_context *ctx, nir_intrinsic_instr *instr, bool per_vertex)
+{
+ assert(ctx->stage == tess_control_hs || ctx->stage == vertex_tess_control_hs);
+ assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
+
+ Builder bld(ctx->program, ctx->block);
+
+ Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+ std::pair<Temp, unsigned> lds_offs = get_tcs_output_lds_offset(ctx, instr, per_vertex);
+ unsigned lds_align = calculate_lds_alignment(ctx, lds_offs.second);
+ unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8;
+
+ load_lds(ctx, elem_size_bytes, dst, lds_offs.first, lds_offs.second, lds_align);
+}
+
+void visit_store_output(isel_context *ctx, nir_intrinsic_instr *instr)
+{
+ if (ctx->stage == vertex_vs ||
+ ctx->stage == tess_eval_vs ||
+ ctx->stage == fragment_fs ||
+ ctx->stage == ngg_vertex_gs ||
+ ctx->stage == ngg_tess_eval_gs ||
+ ctx->shader->info.stage == MESA_SHADER_GEOMETRY) {
+ bool stored_to_temps = store_output_to_temps(ctx, instr);
+ if (!stored_to_temps) {
+ fprintf(stderr, "Unimplemented output offset instruction:\n");
+ nir_print_instr(instr->src[1].ssa->parent_instr, stderr);
+ fprintf(stderr, "\n");
+ abort();
+ }
+ } else if (ctx->stage == vertex_es ||
+ ctx->stage == vertex_ls ||
+ ctx->stage == tess_eval_es ||
+ (ctx->stage == vertex_tess_control_hs && ctx->shader->info.stage == MESA_SHADER_VERTEX) ||
+ (ctx->stage == vertex_geometry_gs && ctx->shader->info.stage == MESA_SHADER_VERTEX) ||
+ (ctx->stage == tess_eval_geometry_gs && ctx->shader->info.stage == MESA_SHADER_TESS_EVAL)) {
+ visit_store_ls_or_es_output(ctx, instr);
+ } else if (ctx->shader->info.stage == MESA_SHADER_TESS_CTRL) {
+ visit_store_tcs_output(ctx, instr, false);
+ } else {
+ unreachable("Shader stage not implemented");
+ }
+}
+
+void visit_load_output(isel_context *ctx, nir_intrinsic_instr *instr)
+{
+ visit_load_tcs_output(ctx, instr, false);
+}
+
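+/* Emits the two-pass fragment input interpolation: v_interp_p1 produces a
+ * partial result from the first barycentric coordinate and v_interp_p2
+ * completes it with the second. On chips with 16-bank LDS, the coordinate
+ * operand of v_interp_p1 must not share a register with its result, which
+ * the lateKill flag enforces.
+ */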
+void emit_interp_instr(isel_context *ctx, unsigned idx, unsigned component, Temp src, Temp dst, Temp prim_mask)
+{
+ Temp coord1 = emit_extract_vector(ctx, src, 0, v1);
+ Temp coord2 = emit_extract_vector(ctx, src, 1, v1);
+
+ Builder bld(ctx->program, ctx->block);
+ Builder::Result interp_p1 = bld.vintrp(aco_opcode::v_interp_p1_f32, bld.def(v1), coord1, bld.m0(prim_mask), idx, component);
+ if (ctx->program->has_16bank_lds)
+ interp_p1.instr->operands[0].setLateKill(true);
+ bld.vintrp(aco_opcode::v_interp_p2_f32, Definition(dst), coord2, bld.m0(prim_mask), interp_p1, idx, component);
+}
+
+void emit_load_frag_coord(isel_context *ctx, Temp dst, unsigned num_components)
+{
+ aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1));
+ for (unsigned i = 0; i < num_components; i++)
+ vec->operands[i] = Operand(get_arg(ctx, ctx->args->ac.frag_pos[i]));
+ if (G_0286CC_POS_W_FLOAT_ENA(ctx->program->config->spi_ps_input_ena)) {
+ assert(num_components == 4);
+ Builder bld(ctx->program, ctx->block);
+ vec->operands[3] = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), get_arg(ctx, ctx->args->ac.frag_pos[3]));
+ }
+
+ for (Operand& op : vec->operands)
+ op = op.isUndefined() ? Operand(0u) : op;
+
+ vec->definitions[0] = Definition(dst);
+ ctx->block->instructions.emplace_back(std::move(vec));
+ emit_split_vector(ctx, dst, num_components);
+}
+
+void visit_load_interpolated_input(isel_context *ctx, nir_intrinsic_instr *instr)
+{
+ Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+ Temp coords = get_ssa_temp(ctx, instr->src[0].ssa);
+ unsigned idx = nir_intrinsic_base(instr);
+ unsigned component = nir_intrinsic_component(instr);
+ Temp prim_mask = get_arg(ctx, ctx->args->ac.prim_mask);
+
+ nir_const_value* offset = nir_src_as_const_value(instr->src[1]);
+ if (offset) {
+ assert(offset->u32 == 0);
+ } else {
+ /* The lower 15 bits of the prim_mask contain the offset into LDS,
+ * while the upper bits contain the number of primitives */
+ Temp offset_src = get_ssa_temp(ctx, instr->src[1].ssa);
+ assert(offset_src.regClass() == s1 && "TODO: divergent offsets...");
+ Builder bld(ctx->program, ctx->block);
+ Temp stride = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), prim_mask, Operand(16u));
+ stride = bld.sop1(aco_opcode::s_bcnt1_i32_b32, bld.def(s1), bld.def(s1, scc), stride);
+ stride = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, Operand(48u));
+ offset_src = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, offset_src);
+ prim_mask = bld.sop2(aco_opcode::s_add_i32, bld.def(s1, m0), bld.def(s1, scc), offset_src, prim_mask);
+ }
+
+ if (instr->dest.ssa.num_components == 1) {
+ emit_interp_instr(ctx, idx, component, coords, dst, prim_mask);
+ } else {
+ aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.ssa.num_components, 1));
+ for (unsigned i = 0; i < instr->dest.ssa.num_components; i++)
+ {
+ Temp tmp = {ctx->program->allocateId(), v1};
+ emit_interp_instr(ctx, idx, component+i, coords, tmp, prim_mask);
+ vec->operands[i] = Operand(tmp);
+ }
+ vec->definitions[0] = Definition(dst);
+ ctx->block->instructions.emplace_back(std::move(vec));
+ }
+}
+
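+/* Checks whether a vertex fetch of the given channel count is usable here:
+ * 3-channel formats only exist with 4-byte channels, and GFX6 and GFX10
+ * additionally require the fetch to be naturally aligned with respect to
+ * both the attribute offset and the binding stride.
+ */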
+bool check_vertex_fetch_size(isel_context *ctx, const ac_data_format_info *vtx_info,
+ unsigned offset, unsigned stride, unsigned channels)
+{
+ unsigned vertex_byte_size = vtx_info->chan_byte_size * channels;
+ if (vtx_info->chan_byte_size != 4 && channels == 3)
+ return false;
+ return (ctx->options->chip_class != GFX6 && ctx->options->chip_class != GFX10) ||
+ (offset % vertex_byte_size == 0 && stride % vertex_byte_size == 0);
+}
+
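+/* Picks a buffer data format for a vertex fetch. When the requested size
+ * doesn't pass check_vertex_fetch_size, it first tries widening the fetch
+ * (fewer loads), and failing that, narrows it (more loads), shrinking
+ * *channels presumably so the caller can split the fetch accordingly.
+ */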
+uint8_t get_fetch_data_format(isel_context *ctx, const ac_data_format_info *vtx_info,
+ unsigned offset, unsigned stride, unsigned *channels)
+{
+ if (!vtx_info->chan_byte_size) {
+ *channels = vtx_info->num_channels;
+ return vtx_info->chan_format;
+ }
+
+ unsigned num_channels = *channels;
+ if (!check_vertex_fetch_size(ctx, vtx_info, offset, stride, *channels)) {
+ unsigned new_channels = num_channels + 1;
+ /* first, assume that more loads are worse and try using a larger data format */
+ while (new_channels <= 4 && !check_vertex_fetch_size(ctx, vtx_info, offset, stride, new_channels)) {
+ new_channels++;
+ /* don't make the attribute potentially out-of-bounds */
+ if (offset + new_channels * vtx_info->chan_byte_size > stride)
+ new_channels = 5;
+ }
+
+ if (new_channels == 5) {
+ /* then try decreasing load size (at the cost of more loads) */
+ new_channels = *channels;
+ while (new_channels > 1 && !check_vertex_fetch_size(ctx, vtx_info, offset, stride, new_channels))
+ new_channels--;
+ }
+
+ if (new_channels < *channels)
+ *channels = new_channels;
+ num_channels = new_channels;
+ }
+
+ switch (vtx_info->chan_format) {
+ case V_008F0C_BUF_DATA_FORMAT_8:
+ return (uint8_t[]){V_008F0C_BUF_DATA_FORMAT_8, V_008F0C_BUF_DATA_FORMAT_8_8,
+ V_008F0C_BUF_DATA_FORMAT_INVALID, V_008F0C_BUF_DATA_FORMAT_8_8_8_8}[num_channels - 1];
+ case V_008F0C_BUF_DATA_FORMAT_16:
+ return (uint8_t[]){V_008F0C_BUF_DATA_FORMAT_16, V_008F0C_BUF_DATA_FORMAT_16_16,
+ V_008F0C_BUF_DATA_FORMAT_INVALID, V_008F0C_BUF_DATA_FORMAT_16_16_16_16}[num_channels - 1];
+ case V_008F0C_BUF_DATA_FORMAT_32:
+ return (uint8_t[]){V_008F0C_BUF_DATA_FORMAT_32, V_008F0C_BUF_DATA_FORMAT_32_32,
+ V_008F0C_BUF_DATA_FORMAT_32_32_32, V_008F0C_BUF_DATA_FORMAT_32_32_32_32}[num_channels - 1];
+ }
+ unreachable("shouldn't reach here");
+ return V_008F0C_BUF_DATA_FORMAT_INVALID;
+}
+
+/* For 2_10_10_10 formats, the alpha channel is handled as unsigned by
+ * pre-Vega HW, so we may need to fix it up here. */
+Temp adjust_vertex_fetch_alpha(isel_context *ctx, unsigned adjustment, Temp alpha)
+{
+ Builder bld(ctx->program, ctx->block);
+
+ if (adjustment == RADV_ALPHA_ADJUST_SSCALED)
+ alpha = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), alpha);
+
+ /* For the integer-like cases, do a natural sign extension.
+ *
+ * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
+ * and happen to contain 0, 1, 2, 3 as the two LSBs of the
+ * exponent.
+ */
+ alpha = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(adjustment == RADV_ALPHA_ADJUST_SNORM ? 7u : 30u), alpha);
+ alpha = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(30u), alpha);
+
+ /* Convert back to the right type. */
+ if (adjustment == RADV_ALPHA_ADJUST_SNORM) {
+ alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha);
+ Temp clamp = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0xbf800000u), alpha);
+ alpha = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xbf800000u), alpha, clamp);
+ } else if (adjustment == RADV_ALPHA_ADJUST_SSCALED) {
+ alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha);
+ }
+
+ return alpha;
+}
+
+void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr)
+{
+ Builder bld(ctx->program, ctx->block);
+ Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+ if (ctx->shader->info.stage == MESA_SHADER_VERTEX) {
+
+ nir_instr *off_instr = instr->src[0].ssa->parent_instr;
+ if (off_instr->type != nir_instr_type_load_const) {
+ fprintf(stderr, "Unimplemented nir_intrinsic_load_input offset\n");
+ nir_print_instr(off_instr, stderr);
+ fprintf(stderr, "\n");
+ }
+ uint32_t offset = nir_instr_as_load_const(off_instr)->value[0].u32;
+
+ Temp vertex_buffers = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->vertex_buffers));
+
+ unsigned location = nir_intrinsic_base(instr) / 4 - VERT_ATTRIB_GENERIC0 + offset;
+ unsigned component = nir_intrinsic_component(instr);
+ unsigned attrib_binding = ctx->options->key.vs.vertex_attribute_bindings[location];