From bb78f9b4e499d8048eaff08dcf7ba9c3de851cad Mon Sep 17 00:00:00 2001 From: Connor Abbott Date: Wed, 13 Nov 2019 13:30:52 +0100 Subject: [PATCH] aco: Use common argument handling MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Reviewed-by: Daniel Schürmann --- src/amd/common/ac_shader_args.h | 1 + .../compiler/aco_instruction_selection.cpp | 185 +++-- .../aco_instruction_selection_setup.cpp | 651 +++--------------- src/amd/compiler/aco_interface.cpp | 3 +- src/amd/compiler/aco_ir.h | 4 +- src/amd/vulkan/radv_shader_args.c | 4 +- 6 files changed, 211 insertions(+), 637 deletions(-) diff --git a/src/amd/common/ac_shader_args.h b/src/amd/common/ac_shader_args.h index e188c2ef12e..0cd4b6aac90 100644 --- a/src/amd/common/ac_shader_args.h +++ b/src/amd/common/ac_shader_args.h @@ -58,6 +58,7 @@ struct ac_shader_args { enum ac_arg_regfile file; uint8_t offset; uint8_t size; + bool skip; } args[AC_MAX_ARGS]; uint8_t arg_count; diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 856e73366db..ce8a7b02c77 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -2911,12 +2911,11 @@ void emit_load_frag_coord(isel_context *ctx, Temp dst, unsigned num_components) { aco_ptr vec(create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)); for (unsigned i = 0; i < num_components; i++) - vec->operands[i] = Operand(ctx->fs_inputs[fs_input::frag_pos_0 + i]); - - if (ctx->fs_vgpr_args[fs_input::frag_pos_3]) { + vec->operands[i] = Operand(get_arg(ctx, ctx->args->ac.frag_pos[i])); + if (G_0286CC_POS_W_FLOAT_ENA(ctx->program->config->spi_ps_input_ena)) { assert(num_components == 4); Builder bld(ctx->program, ctx->block); - vec->operands[3] = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), ctx->fs_inputs[fs_input::frag_pos_3]); + vec->operands[3] = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), get_arg(ctx, ctx->args->ac.frag_pos[3])); } for (Operand& op : vec->operands) @@ -2934,7 +2933,7 @@ void visit_load_interpolated_input(isel_context *ctx, nir_intrinsic_instr *instr Temp coords = get_ssa_temp(ctx, instr->src[0].ssa); unsigned idx = nir_intrinsic_base(instr); unsigned component = nir_intrinsic_component(instr); - Temp prim_mask = ctx->prim_mask; + Temp prim_mask = get_arg(ctx, ctx->args->ac.prim_mask); nir_const_value* offset = nir_src_as_const_value(instr->src[1]); if (offset) { @@ -3039,7 +3038,7 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) } uint32_t offset = nir_instr_as_load_const(off_instr)->value[0].u32; - Temp vertex_buffers = convert_pointer_to_64_bit(ctx, ctx->vertex_buffers); + Temp vertex_buffers = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->vertex_buffers)); unsigned location = nir_intrinsic_base(instr) / 4 - VERT_ATTRIB_GENERIC0 + offset; unsigned component = nir_intrinsic_component(instr); @@ -3064,21 +3063,24 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) Temp index; if (ctx->options->key.vs.instance_rate_inputs & (1u << location)) { uint32_t divisor = ctx->options->key.vs.instance_rate_divisors[location]; + Temp start_instance = get_arg(ctx, ctx->args->ac.start_instance); if (divisor) { ctx->needs_instance_id = true; - + Temp instance_id = get_arg(ctx, ctx->args->ac.instance_id); if (divisor != 1) { Temp divided = bld.tmp(v1); - emit_v_div_u32(ctx, divided, as_vgpr(ctx, ctx->instance_id), divisor); - index = bld.vadd32(bld.def(v1), ctx->start_instance, divided); + emit_v_div_u32(ctx, divided, as_vgpr(ctx, instance_id), divisor); + index = bld.vadd32(bld.def(v1), start_instance, divided); } else { - index = bld.vadd32(bld.def(v1), ctx->start_instance, ctx->instance_id); + index = bld.vadd32(bld.def(v1), start_instance, instance_id); } } else { - index = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), ctx->start_instance); + index = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), start_instance); } } else { - index = bld.vadd32(bld.def(v1), ctx->base_vertex, ctx->vertex_id); + index = bld.vadd32(bld.def(v1), + get_arg(ctx, ctx->args->ac.base_vertex), + get_arg(ctx, ctx->args->ac.vertex_id)); } if (attrib_stride != 0 && attrib_offset > attrib_stride) { @@ -3165,7 +3167,7 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) fprintf(stderr, "\n"); } - Temp prim_mask = ctx->prim_mask; + Temp prim_mask = get_arg(ctx, ctx->args->ac.prim_mask); nir_const_value* offset = nir_src_as_const_value(instr->src[0]); if (offset) { assert(offset->u32 == 0); @@ -3204,11 +3206,11 @@ Temp load_desc_ptr(isel_context *ctx, unsigned desc_set) { if (ctx->program->info->need_indirect_descriptor_sets) { Builder bld(ctx->program, ctx->block); - Temp ptr64 = convert_pointer_to_64_bit(ctx, ctx->descriptor_sets[0]); + Temp ptr64 = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->descriptor_sets[0])); return bld.smem(aco_opcode::s_load_dword, bld.def(s1), ptr64, Operand(desc_set << 2));//, false, false, false); } - return ctx->descriptor_sets[desc_set]; + return get_arg(ctx, ctx->args->descriptor_sets[desc_set]); } @@ -3229,7 +3231,7 @@ void visit_load_resource(isel_context *ctx, nir_intrinsic_instr *instr) if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC || layout->binding[binding].type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) { unsigned idx = pipeline_layout->set[desc_set].dynamic_offset_start + layout->binding[binding].dynamic_offset_offset; - desc_ptr = ctx->push_constants; + desc_ptr = get_arg(ctx, ctx->args->ac.push_constants); offset = pipeline_layout->push_constant_size + 16 * idx; stride = 16; } else { @@ -3473,12 +3475,12 @@ void visit_load_push_constant(isel_context *ctx, nir_intrinsic_instr *instr) unsigned count = instr->dest.ssa.num_components; unsigned start = (offset + index_cv->u32) / 4u; - start -= ctx->base_inline_push_consts; - if (start + count <= ctx->num_inline_push_consts) { + start -= ctx->args->ac.base_inline_push_consts; + if (start + count <= ctx->args->ac.num_inline_push_consts) { std::array elems; aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)}; for (unsigned i = 0; i < count; ++i) { - elems[i] = ctx->inline_push_consts[start + i]; + elems[i] = get_arg(ctx, ctx->args->ac.inline_push_consts[start + i]); vec->operands[i] = Operand{elems[i]}; } vec->definitions[0] = Definition(dst); @@ -3491,7 +3493,7 @@ void visit_load_push_constant(isel_context *ctx, nir_intrinsic_instr *instr) Temp index = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa)); if (offset != 0) // TODO check if index != 0 as well index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), index); - Temp ptr = convert_pointer_to_64_bit(ctx, ctx->push_constants); + Temp ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ac.push_constants)); Temp vec = dst; bool trim = false; aco_opcode op; @@ -5091,11 +5093,12 @@ void visit_load_sample_mask_in(isel_context *ctx, nir_intrinsic_instr *instr) { Builder bld(ctx->program, ctx->block); - Temp sample_id = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), ctx->fs_inputs[fs_input::ancillary], Operand(8u), Operand(4u)); + Temp sample_id = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), + get_arg(ctx, ctx->args->ac.ancillary), Operand(8u), Operand(4u)); Temp ps_iter_mask = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(ps_iter_masks[log2_ps_iter_samples])); Temp mask = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), sample_id, ps_iter_mask); Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); - bld.vop2(aco_opcode::v_and_b32, Definition(dst), mask, ctx->fs_inputs[fs_input::sample_coverage]); + bld.vop2(aco_opcode::v_and_b32, Definition(dst), mask, get_arg(ctx, ctx->args->ac.sample_coverage)); } Temp emit_boolean_reduce(isel_context *ctx, nir_op op, unsigned cluster_size, Temp src) @@ -5239,8 +5242,9 @@ void emit_uniform_subgroup(isel_context *ctx, nir_intrinsic_instr *instr, Temp s void emit_interp_center(isel_context *ctx, Temp dst, Temp pos1, Temp pos2) { Builder bld(ctx->program, ctx->block); - Temp p1 = ctx->fs_inputs[fs_input::persp_center_p1]; - Temp p2 = ctx->fs_inputs[fs_input::persp_center_p2]; + Temp persp_center = get_arg(ctx, ctx->args->ac.persp_center); + Temp p1 = emit_extract_vector(ctx, persp_center, 0, v1); + Temp p2 = emit_extract_vector(ctx, persp_center, 1, v1); /* Build DD X/Y */ Temp tl_1 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p1, dpp_quad_perm(0, 0, 0, 0)); @@ -5271,17 +5275,33 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) case nir_intrinsic_load_barycentric_pixel: case nir_intrinsic_load_barycentric_centroid: { glsl_interp_mode mode = (glsl_interp_mode)nir_intrinsic_interp_mode(instr); - fs_input input = get_interp_input(instr->intrinsic, mode); - - Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); - if (input == fs_input::max_inputs) { - bld.pseudo(aco_opcode::p_create_vector, Definition(dst), - Operand(0u), Operand(0u)); - } else { - bld.pseudo(aco_opcode::p_create_vector, Definition(dst), - ctx->fs_inputs[input], - ctx->fs_inputs[input + 1]); + Temp bary = Temp(0, s2); + switch (mode) { + case INTERP_MODE_SMOOTH: + case INTERP_MODE_NONE: + if (instr->intrinsic == nir_intrinsic_load_barycentric_pixel) + bary = get_arg(ctx, ctx->args->ac.persp_center); + else if (instr->intrinsic == nir_intrinsic_load_barycentric_centroid) + bary = ctx->persp_centroid; + else if (instr->intrinsic == nir_intrinsic_load_barycentric_sample) + bary = get_arg(ctx, ctx->args->ac.persp_sample); + break; + case INTERP_MODE_NOPERSPECTIVE: + if (instr->intrinsic == nir_intrinsic_load_barycentric_pixel) + bary = get_arg(ctx, ctx->args->ac.linear_center); + else if (instr->intrinsic == nir_intrinsic_load_barycentric_centroid) + bary = ctx->linear_centroid; + else if (instr->intrinsic == nir_intrinsic_load_barycentric_sample) + bary = get_arg(ctx, ctx->args->ac.linear_sample); + break; + default: + break; } + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + Temp p1 = emit_extract_vector(ctx, bary, 0, v1); + Temp p2 = emit_extract_vector(ctx, bary, 1, v1); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), + Operand(p1), Operand(p2)); emit_split_vector(ctx, dst, 2); break; } @@ -5352,20 +5372,20 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) } case nir_intrinsic_load_front_face: { bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), - Operand(0u), ctx->fs_inputs[fs_input::front_face]).def(0).setHint(vcc); + Operand(0u), get_arg(ctx, ctx->args->ac.front_face)).def(0).setHint(vcc); break; } case nir_intrinsic_load_view_index: case nir_intrinsic_load_layer_id: { if (instr->intrinsic == nir_intrinsic_load_view_index && (ctx->stage & sw_vs)) { Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); - bld.copy(Definition(dst), Operand(ctx->view_index)); + bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.view_index))); break; } unsigned idx = nir_intrinsic_base(instr); bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), - Operand(2u), bld.m0(ctx->prim_mask), idx, 0); + Operand(2u), bld.m0(get_arg(ctx, ctx->args->ac.prim_mask)), idx, 0); break; } case nir_intrinsic_load_frag_coord: { @@ -5373,8 +5393,8 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) break; } case nir_intrinsic_load_sample_pos: { - Temp posx = ctx->fs_inputs[fs_input::frag_pos_0]; - Temp posy = ctx->fs_inputs[fs_input::frag_pos_1]; + Temp posx = get_arg(ctx, ctx->args->ac.frag_pos[0]); + Temp posy = get_arg(ctx, ctx->args->ac.frag_pos[1]); bld.pseudo(aco_opcode::p_create_vector, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), posx.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posx) : Operand(0u), posy.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posy) : Operand(0u)); @@ -5496,36 +5516,38 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) break; case nir_intrinsic_load_num_work_groups: { Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); - bld.copy(Definition(dst), Operand(ctx->num_workgroups)); + bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.num_work_groups))); emit_split_vector(ctx, dst, 3); break; } case nir_intrinsic_load_local_invocation_id: { Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); - bld.copy(Definition(dst), Operand(ctx->local_invocation_ids)); + bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.local_invocation_ids))); emit_split_vector(ctx, dst, 3); break; } case nir_intrinsic_load_work_group_id: { Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); - Temp* ids = ctx->workgroup_ids; + struct ac_arg *args = ctx->args->ac.workgroup_ids; bld.pseudo(aco_opcode::p_create_vector, Definition(dst), - ids[0].id() ? Operand(ids[0]) : Operand(1u), - ids[1].id() ? Operand(ids[1]) : Operand(1u), - ids[2].id() ? Operand(ids[2]) : Operand(1u)); + args[0].used ? Operand(get_arg(ctx, args[0])) : Operand(1u), + args[1].used ? Operand(get_arg(ctx, args[1])) : Operand(1u), + args[2].used ? Operand(get_arg(ctx, args[2])) : Operand(1u)); emit_split_vector(ctx, dst, 3); break; } case nir_intrinsic_load_local_invocation_index: { Temp id = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), Operand((uint32_t) -1), bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), Operand((uint32_t) -1), Operand(0u))); - Temp tg_num = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0xfc0u), ctx->tg_size); + Temp tg_num = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0xfc0u), + get_arg(ctx, ctx->args->ac.tg_size)); bld.vop2(aco_opcode::v_or_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), tg_num, id); break; } case nir_intrinsic_load_subgroup_id: { if (ctx->stage == compute_cs) { - Temp tg_num = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0xfc0u), ctx->tg_size); + Temp tg_num = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0xfc0u), + get_arg(ctx, ctx->args->ac.tg_size)); bld.sop2(aco_opcode::s_lshr_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), bld.def(s1, scc), tg_num, Operand(0x6u)); } else { bld.sop1(aco_opcode::s_mov_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand(0x0u)); @@ -5539,7 +5561,8 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) } case nir_intrinsic_load_num_subgroups: { if (ctx->stage == compute_cs) - bld.sop2(aco_opcode::s_and_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), bld.def(s1, scc), Operand(0x3fu), ctx->tg_size); + bld.sop2(aco_opcode::s_and_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), bld.def(s1, scc), Operand(0x3fu), + get_arg(ctx, ctx->args->ac.tg_size)); else bld.sop1(aco_opcode::s_mov_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand(0x1u)); break; @@ -5601,7 +5624,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) } case nir_intrinsic_load_sample_id: { bld.vop3(aco_opcode::v_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), - ctx->fs_inputs[ancillary], Operand(8u), Operand(4u)); + get_arg(ctx, ctx->args->ac.ancillary), Operand(8u), Operand(4u)); break; } case nir_intrinsic_load_sample_mask_in: { @@ -5939,27 +5962,27 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) break; case nir_intrinsic_load_vertex_id_zero_base: { Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); - bld.copy(Definition(dst), ctx->vertex_id); + bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.vertex_id)); break; } case nir_intrinsic_load_first_vertex: { Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); - bld.copy(Definition(dst), ctx->base_vertex); + bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.base_vertex)); break; } case nir_intrinsic_load_base_instance: { Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); - bld.copy(Definition(dst), ctx->start_instance); + bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.start_instance)); break; } case nir_intrinsic_load_instance_id: { Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); - bld.copy(Definition(dst), ctx->instance_id); + bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.instance_id)); break; } case nir_intrinsic_load_draw_id: { Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); - bld.copy(Definition(dst), ctx->draw_id); + bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.draw_id)); break; } default: @@ -7470,12 +7493,12 @@ static void create_vs_exports(isel_context *ctx) if (outinfo->export_prim_id) { ctx->vs_output.mask[VARYING_SLOT_PRIMITIVE_ID] |= 0x1; - ctx->vs_output.outputs[VARYING_SLOT_PRIMITIVE_ID][0] = ctx->vs_prim_id; + ctx->vs_output.outputs[VARYING_SLOT_PRIMITIVE_ID][0] = get_arg(ctx, ctx->args->vs_prim_id); } if (ctx->options->key.has_multiview_view_index) { ctx->vs_output.mask[VARYING_SLOT_LAYER] |= 0x1; - ctx->vs_output.outputs[VARYING_SLOT_LAYER][0] = as_vgpr(ctx, ctx->view_index); + ctx->vs_output.outputs[VARYING_SLOT_LAYER][0] = as_vgpr(ctx, get_arg(ctx, ctx->args->ac.view_index)); } /* the order these position exports are created is important */ @@ -7579,7 +7602,7 @@ static void emit_streamout(isel_context *ctx, unsigned stream) Builder bld(ctx->program, ctx->block); Temp so_buffers[4]; - Temp buf_ptr = convert_pointer_to_64_bit(ctx, ctx->streamout_buffers); + Temp buf_ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->streamout_buffers)); for (unsigned i = 0; i < 4; i++) { unsigned stride = ctx->program->info->so.strides[i]; if (!stride) @@ -7589,7 +7612,7 @@ static void emit_streamout(isel_context *ctx, unsigned stream) } Temp so_vtx_count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), - ctx->streamout_config, Operand(0x70010u)); + get_arg(ctx, ctx->args->streamout_config), Operand(0x70010u)); Temp tid = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), Operand((uint32_t) -1), bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), Operand((uint32_t) -1), Operand(0u))); @@ -7601,7 +7624,7 @@ static void emit_streamout(isel_context *ctx, unsigned stream) bld.reset(ctx->block); - Temp so_write_index = bld.vadd32(bld.def(v1), ctx->streamout_write_idx, tid); + Temp so_write_index = bld.vadd32(bld.def(v1), get_arg(ctx, ctx->args->streamout_write_idx), tid); Temp so_write_offset[4]; @@ -7612,13 +7635,15 @@ static void emit_streamout(isel_context *ctx, unsigned stream) if (stride == 1) { Temp offset = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), - ctx->streamout_write_idx, ctx->streamout_offset[i]); + get_arg(ctx, ctx->args->streamout_write_idx), + get_arg(ctx, ctx->args->streamout_offset[i])); Temp new_offset = bld.vadd32(bld.def(v1), offset, tid); so_write_offset[i] = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), new_offset); } else { Temp offset = bld.v_mul_imm(bld.def(v1), so_write_index, stride * 4u); - Temp offset2 = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(4u), ctx->streamout_offset[i]); + Temp offset2 = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(4u), + get_arg(ctx, ctx->args->streamout_offset[i])); so_write_offset[i] = bld.vadd32(bld.def(v1), offset, offset2); } } @@ -7658,27 +7683,38 @@ void handle_bc_optimize(isel_context *ctx) uint32_t spi_ps_input_ena = ctx->program->config->spi_ps_input_ena; bool uses_center = G_0286CC_PERSP_CENTER_ENA(spi_ps_input_ena) || G_0286CC_LINEAR_CENTER_ENA(spi_ps_input_ena); bool uses_centroid = G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena) || G_0286CC_LINEAR_CENTROID_ENA(spi_ps_input_ena); + ctx->persp_centroid = get_arg(ctx, ctx->args->ac.persp_centroid); + ctx->linear_centroid = get_arg(ctx, ctx->args->ac.linear_centroid); if (uses_center && uses_centroid) { - Temp sel = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(s2)), ctx->prim_mask, Operand(0u)); + Temp sel = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(s2)), + get_arg(ctx, ctx->args->ac.prim_mask), Operand(0u)); if (G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena)) { + Temp new_coord[2]; for (unsigned i = 0; i < 2; i++) { - Temp new_coord = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), - ctx->fs_inputs[fs_input::persp_centroid_p1 + i], - ctx->fs_inputs[fs_input::persp_center_p1 + i], - sel); - ctx->fs_inputs[fs_input::persp_centroid_p1 + i] = new_coord; + Temp persp_centroid = emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.persp_centroid), i, v1); + Temp persp_center = emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.persp_center), i, v1); + new_coord[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), + persp_centroid, persp_center, sel); } + ctx->persp_centroid = bld.tmp(v2); + bld.pseudo(aco_opcode::p_create_vector, Definition(ctx->persp_centroid), + Operand(new_coord[0]), Operand(new_coord[1])); + emit_split_vector(ctx, ctx->persp_centroid, 2); } if (G_0286CC_LINEAR_CENTROID_ENA(spi_ps_input_ena)) { + Temp new_coord[2]; for (unsigned i = 0; i < 2; i++) { - Temp new_coord = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), - ctx->fs_inputs[fs_input::linear_centroid_p1 + i], - ctx->fs_inputs[fs_input::linear_center_p1 + i], - sel); - ctx->fs_inputs[fs_input::linear_centroid_p1 + i] = new_coord; + Temp linear_centroid = emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.linear_centroid), i, v1); + Temp linear_center = emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.linear_center), i, v1); + new_coord[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), + linear_centroid, linear_center, sel); } + ctx->linear_centroid = bld.tmp(v2); + bld.pseudo(aco_opcode::p_create_vector, Definition(ctx->linear_centroid), + Operand(new_coord[0]), Operand(new_coord[1])); + emit_split_vector(ctx, ctx->linear_centroid, 2); } } } @@ -7737,10 +7773,9 @@ void select_program(Program *program, unsigned shader_count, struct nir_shader *const *shaders, ac_shader_config* config, - struct radv_shader_info *info, - const struct radv_nir_compiler_options *options) + struct radv_shader_args *args) { - isel_context ctx = setup_isel_context(program, shader_count, shaders, config, info, options); + isel_context ctx = setup_isel_context(program, shader_count, shaders, config, args); for (unsigned i = 0; i < shader_count; i++) { nir_shader *nir = shaders[i]; diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp index fbab89417cd..16b53725408 100644 --- a/src/amd/compiler/aco_instruction_selection_setup.cpp +++ b/src/amd/compiler/aco_instruction_selection_setup.cpp @@ -28,6 +28,7 @@ #include "nir.h" #include "vulkan/radv_shader.h" #include "vulkan/radv_descriptor_set.h" +#include "vulkan/radv_shader_args.h" #include "sid.h" #include "ac_exp_param.h" #include "ac_shader_util.h" @@ -38,32 +39,6 @@ namespace aco { -enum fs_input { - persp_sample_p1, - persp_sample_p2, - persp_center_p1, - persp_center_p2, - persp_centroid_p1, - persp_centroid_p2, - persp_pull_model, - linear_sample_p1, - linear_sample_p2, - linear_center_p1, - linear_center_p2, - linear_centroid_p1, - linear_centroid_p2, - line_stipple, - frag_pos_0, - frag_pos_1, - frag_pos_2, - frag_pos_3, - front_face, - ancillary, - sample_coverage, - fixed_pt, - max_inputs, -}; - struct vs_output_state { uint8_t mask[VARYING_SLOT_VAR31 + 1]; Temp outputs[VARYING_SLOT_VAR31 + 1][4]; @@ -71,6 +46,7 @@ struct vs_output_state { struct isel_context { const struct radv_nir_compiler_options *options; + struct radv_shader_args *args; Program *program; nir_shader *shader; uint32_t constant_data_offset; @@ -95,51 +71,30 @@ struct isel_context { bool exec_potentially_empty = false; } cf_info; + Temp arg_temps[AC_MAX_ARGS]; + /* inputs common for merged stages */ Temp merged_wave_info = Temp(0, s1); /* FS inputs */ - bool fs_vgpr_args[fs_input::max_inputs]; - Temp fs_inputs[fs_input::max_inputs]; - Temp prim_mask = Temp(0, s1); - Temp descriptor_sets[MAX_SETS]; - Temp push_constants = Temp(0, s1); - Temp inline_push_consts[MAX_INLINE_PUSH_CONSTS]; - unsigned num_inline_push_consts = 0; - unsigned base_inline_push_consts = 0; + Temp persp_centroid, linear_centroid; /* VS inputs */ - Temp vertex_buffers = Temp(0, s1); - Temp base_vertex = Temp(0, s1); - Temp start_instance = Temp(0, s1); - Temp draw_id = Temp(0, s1); - Temp view_index = Temp(0, s1); - Temp es2gs_offset = Temp(0, s1); - Temp vertex_id = Temp(0, v1); - Temp rel_auto_id = Temp(0, v1); - Temp instance_id = Temp(0, v1); - Temp vs_prim_id = Temp(0, v1); bool needs_instance_id; - /* CS inputs */ - Temp num_workgroups = Temp(0, s3); - Temp workgroup_ids[3] = {Temp(0, s1), Temp(0, s1), Temp(0, s1)}; - Temp tg_size = Temp(0, s1); - Temp local_invocation_ids = Temp(0, v3); - /* VS output information */ unsigned num_clip_distances; unsigned num_cull_distances; vs_output_state vs_output; - - /* Streamout */ - Temp streamout_buffers = Temp(0, s1); - Temp streamout_write_idx = Temp(0, s1); - Temp streamout_config = Temp(0, s1); - Temp streamout_offset[4] = {Temp(0, s1), Temp(0, s1), Temp(0, s1), Temp(0, s1)}; }; -fs_input get_interp_input(nir_intrinsic_op intrin, enum glsl_interp_mode interp) +Temp get_arg(isel_context *ctx, struct ac_arg arg) +{ + assert(arg.used); + return ctx->arg_temps[arg.arg_index]; +} + +unsigned get_interp_input(nir_intrinsic_op intrin, enum glsl_interp_mode interp) { switch (interp) { case INTERP_MODE_SMOOTH: @@ -147,24 +102,24 @@ fs_input get_interp_input(nir_intrinsic_op intrin, enum glsl_interp_mode interp) if (intrin == nir_intrinsic_load_barycentric_pixel || intrin == nir_intrinsic_load_barycentric_at_sample || intrin == nir_intrinsic_load_barycentric_at_offset) - return fs_input::persp_center_p1; + return S_0286CC_PERSP_CENTER_ENA(1); else if (intrin == nir_intrinsic_load_barycentric_centroid) - return fs_input::persp_centroid_p1; + return S_0286CC_PERSP_CENTROID_ENA(1); else if (intrin == nir_intrinsic_load_barycentric_sample) - return fs_input::persp_sample_p1; + return S_0286CC_PERSP_SAMPLE_ENA(1); break; case INTERP_MODE_NOPERSPECTIVE: if (intrin == nir_intrinsic_load_barycentric_pixel) - return fs_input::linear_center_p1; + return S_0286CC_LINEAR_CENTER_ENA(1); else if (intrin == nir_intrinsic_load_barycentric_centroid) - return fs_input::linear_centroid_p1; + return S_0286CC_LINEAR_CENTROID_ENA(1); else if (intrin == nir_intrinsic_load_barycentric_sample) - return fs_input::linear_sample_p1; + return S_0286CC_LINEAR_SAMPLE_ENA(1); break; default: break; } - return fs_input::max_inputs; + return 0; } void init_context(isel_context *ctx, nir_shader *shader) @@ -175,7 +130,8 @@ void init_context(isel_context *ctx, nir_shader *shader) ctx->divergent_vals = nir_divergence_analysis(shader, nir_divergence_view_index_uniform); std::unique_ptr allocated{new Temp[impl->ssa_alloc]()}; - memset(&ctx->fs_vgpr_args, false, sizeof(ctx->fs_vgpr_args)); + + unsigned spi_ps_inputs = 0; bool done = false; while (!done) { @@ -457,28 +413,28 @@ void init_context(isel_context *ctx, nir_shader *shader) case nir_intrinsic_load_barycentric_at_sample: case nir_intrinsic_load_barycentric_at_offset: { glsl_interp_mode mode = (glsl_interp_mode)nir_intrinsic_interp_mode(intrinsic); - ctx->fs_vgpr_args[get_interp_input(intrinsic->intrinsic, mode)] = true; + spi_ps_inputs |= get_interp_input(intrinsic->intrinsic, mode); break; } case nir_intrinsic_load_front_face: - ctx->fs_vgpr_args[fs_input::front_face] = true; + spi_ps_inputs |= S_0286CC_FRONT_FACE_ENA(1); break; case nir_intrinsic_load_frag_coord: case nir_intrinsic_load_sample_pos: { uint8_t mask = nir_ssa_def_components_read(&intrinsic->dest.ssa); for (unsigned i = 0; i < 4; i++) { if (mask & (1 << i)) - ctx->fs_vgpr_args[fs_input::frag_pos_0 + i] = true; + spi_ps_inputs |= S_0286CC_POS_X_FLOAT_ENA(1) << i; } break; } case nir_intrinsic_load_sample_id: - ctx->fs_vgpr_args[fs_input::ancillary] = true; + spi_ps_inputs |= S_0286CC_ANCILLARY_ENA(1); break; case nir_intrinsic_load_sample_mask_in: - ctx->fs_vgpr_args[fs_input::ancillary] = true; - ctx->fs_vgpr_args[fs_input::sample_coverage] = true; + spi_ps_inputs |= S_0286CC_ANCILLARY_ENA(1); + spi_ps_inputs |= S_0286CC_SAMPLE_COVERAGE_ENA(1); break; default: break; @@ -555,479 +511,81 @@ void init_context(isel_context *ctx, nir_shader *shader) } } - for (unsigned i = 0; i < impl->ssa_alloc; i++) - allocated[i] = Temp(ctx->program->allocateId(), allocated[i].regClass()); - - ctx->allocated.reset(allocated.release()); -} - -struct user_sgpr_info { - uint8_t num_sgpr; - uint8_t remaining_sgprs; - uint8_t user_sgpr_idx; - bool need_ring_offsets; - bool indirect_all_descriptor_sets; -}; - -static void allocate_inline_push_consts(isel_context *ctx, - user_sgpr_info& user_sgpr_info) -{ - uint8_t remaining_sgprs = user_sgpr_info.remaining_sgprs; - - /* Only supported if shaders use push constants. */ - if (ctx->program->info->min_push_constant_used == UINT8_MAX) - return; - - /* Only supported if shaders don't have indirect push constants. */ - if (ctx->program->info->has_indirect_push_constants) - return; - - /* Only supported for 32-bit push constants. */ - //TODO: it's possible that some day, the load/store vectorization could make this inaccurate - if (!ctx->program->info->has_only_32bit_push_constants) - return; - - uint8_t num_push_consts = - (ctx->program->info->max_push_constant_used - - ctx->program->info->min_push_constant_used) / 4; - - /* Check if the number of user SGPRs is large enough. */ - if (num_push_consts < remaining_sgprs) { - ctx->program->info->num_inline_push_consts = num_push_consts; - } else { - ctx->program->info->num_inline_push_consts = remaining_sgprs; - } - - /* Clamp to the maximum number of allowed inlined push constants. */ - if (ctx->program->info->num_inline_push_consts > MAX_INLINE_PUSH_CONSTS) - ctx->program->info->num_inline_push_consts = MAX_INLINE_PUSH_CONSTS; - - if (ctx->program->info->num_inline_push_consts == num_push_consts && - !ctx->program->info->loads_dynamic_offsets) { - /* Disable the default push constants path if all constants are - * inlined and if shaders don't use dynamic descriptors. - */ - ctx->program->info->loads_push_constants = false; - user_sgpr_info.num_sgpr--; - user_sgpr_info.remaining_sgprs++; - } - - ctx->program->info->base_inline_push_consts = - ctx->program->info->min_push_constant_used / 4; - - user_sgpr_info.num_sgpr += ctx->program->info->num_inline_push_consts; - user_sgpr_info.remaining_sgprs -= ctx->program->info->num_inline_push_consts; -} - -static void allocate_user_sgprs(isel_context *ctx, - bool needs_view_index, user_sgpr_info& user_sgpr_info) -{ - memset(&user_sgpr_info, 0, sizeof(struct user_sgpr_info)); - uint32_t user_sgpr_count = 0; - - /* until we sort out scratch/global buffers always assign ring offsets for gs/vs/es */ - if (ctx->stage != fragment_fs && - ctx->stage != compute_cs - /*|| ctx->is_gs_copy_shader */) - user_sgpr_info.need_ring_offsets = true; - - if (ctx->stage == fragment_fs && - ctx->program->info->ps.needs_sample_positions) - user_sgpr_info.need_ring_offsets = true; - - /* 2 user sgprs will nearly always be allocated for scratch/rings */ - user_sgpr_count += 2; - - switch (ctx->stage) { - case vertex_vs: - /* if (!ctx->is_gs_copy_shader) */ { - if (ctx->program->info->vs.has_vertex_buffers) - user_sgpr_count++; - user_sgpr_count += ctx->program->info->vs.needs_draw_id ? 3 : 2; - } - break; - case fragment_fs: - //user_sgpr_count += ctx->program->info->ps.needs_sample_positions; - break; - case compute_cs: - if (ctx->program->info->cs.uses_grid_size) - user_sgpr_count += 3; - break; - default: - unreachable("Shader stage not implemented"); - } - - if (needs_view_index) - user_sgpr_count++; - - if (ctx->program->info->loads_push_constants) - user_sgpr_count += 1; /* we use 32bit pointers */ - - if (ctx->program->info->so.num_outputs) - user_sgpr_count += 1; /* we use 32bit pointers */ - - uint32_t available_sgprs = ctx->options->chip_class >= GFX9 && !(ctx->stage & hw_cs) ? 32 : 16; - uint32_t remaining_sgprs = available_sgprs - user_sgpr_count; - uint32_t num_desc_set = util_bitcount(ctx->program->info->desc_set_used_mask); - - if (available_sgprs < user_sgpr_count + num_desc_set) { - user_sgpr_info.indirect_all_descriptor_sets = true; - user_sgpr_info.num_sgpr = user_sgpr_count + 1; - user_sgpr_info.remaining_sgprs = remaining_sgprs - 1; - } else { - user_sgpr_info.num_sgpr = user_sgpr_count + num_desc_set; - user_sgpr_info.remaining_sgprs = remaining_sgprs - num_desc_set; - } - - allocate_inline_push_consts(ctx, user_sgpr_info); -} - -#define MAX_ARGS 64 -struct arg_info { - RegClass types[MAX_ARGS]; - Temp *assign[MAX_ARGS]; - PhysReg reg[MAX_ARGS]; - unsigned array_params_mask; - uint8_t count; - uint8_t sgpr_count; - uint8_t num_sgprs_used; - uint8_t num_vgprs_used; -}; - -static void -add_arg(arg_info *info, RegClass rc, Temp *param_ptr, unsigned reg) -{ - assert(info->count < MAX_ARGS); - - info->assign[info->count] = param_ptr; - info->types[info->count] = rc; - - if (rc.type() == RegType::sgpr) { - info->num_sgprs_used += rc.size(); - info->sgpr_count++; - info->reg[info->count] = PhysReg{reg}; - } else { - assert(rc.type() == RegType::vgpr); - info->num_vgprs_used += rc.size(); - info->reg[info->count] = PhysReg{reg + 256}; - } - info->count++; -} - -static void -set_loc(struct radv_userdata_info *ud_info, uint8_t *sgpr_idx, uint8_t num_sgprs) -{ - ud_info->sgpr_idx = *sgpr_idx; - ud_info->num_sgprs = num_sgprs; - *sgpr_idx += num_sgprs; -} - -static void -set_loc_shader(isel_context *ctx, int idx, uint8_t *sgpr_idx, - uint8_t num_sgprs) -{ - struct radv_userdata_info *ud_info = &ctx->program->info->user_sgprs_locs.shader_data[idx]; - assert(ud_info); - - set_loc(ud_info, sgpr_idx, num_sgprs); -} - -static void -set_loc_shader_ptr(isel_context *ctx, int idx, uint8_t *sgpr_idx) -{ - bool use_32bit_pointers = idx != AC_UD_SCRATCH_RING_OFFSETS; - - set_loc_shader(ctx, idx, sgpr_idx, use_32bit_pointers ? 1 : 2); -} - -static void -set_loc_desc(isel_context *ctx, int idx, uint8_t *sgpr_idx) -{ - struct radv_userdata_locations *locs = &ctx->program->info->user_sgprs_locs; - struct radv_userdata_info *ud_info = &locs->descriptor_sets[idx]; - assert(ud_info); - - set_loc(ud_info, sgpr_idx, 1); - locs->descriptor_sets_enabled |= 1 << idx; -} - -static void -declare_global_input_sgprs(isel_context *ctx, - /* bool has_previous_stage, gl_shader_stage previous_stage, */ - user_sgpr_info *user_sgpr_info, - struct arg_info *args, - Temp *desc_sets) -{ - /* 1 for each descriptor set */ - if (!user_sgpr_info->indirect_all_descriptor_sets) { - uint32_t mask = ctx->program->info->desc_set_used_mask; - while (mask) { - int i = u_bit_scan(&mask); - add_arg(args, s1, &desc_sets[i], user_sgpr_info->user_sgpr_idx); - set_loc_desc(ctx, i, &user_sgpr_info->user_sgpr_idx); - } - /* NIR->LLVM might have set this to true if RADV_DEBUG=compiletime */ - ctx->program->info->need_indirect_descriptor_sets = false; - } else { - add_arg(args, s1, desc_sets, user_sgpr_info->user_sgpr_idx); - set_loc_shader_ptr(ctx, AC_UD_INDIRECT_DESCRIPTOR_SETS, &user_sgpr_info->user_sgpr_idx); - ctx->program->info->need_indirect_descriptor_sets = true; - } - - if (ctx->program->info->loads_push_constants) { - /* 1 for push constants and dynamic descriptors */ - add_arg(args, s1, &ctx->push_constants, user_sgpr_info->user_sgpr_idx); - set_loc_shader_ptr(ctx, AC_UD_PUSH_CONSTANTS, &user_sgpr_info->user_sgpr_idx); - } - - if (ctx->program->info->num_inline_push_consts) { - unsigned count = ctx->program->info->num_inline_push_consts; - for (unsigned i = 0; i < count; i++) - add_arg(args, s1, &ctx->inline_push_consts[i], user_sgpr_info->user_sgpr_idx + i); - set_loc_shader(ctx, AC_UD_INLINE_PUSH_CONSTANTS, &user_sgpr_info->user_sgpr_idx, count); - - ctx->num_inline_push_consts = ctx->program->info->num_inline_push_consts; - ctx->base_inline_push_consts = ctx->program->info->base_inline_push_consts; - } - - if (ctx->program->info->so.num_outputs) { - add_arg(args, s1, &ctx->streamout_buffers, user_sgpr_info->user_sgpr_idx); - set_loc_shader_ptr(ctx, AC_UD_STREAMOUT_BUFFERS, &user_sgpr_info->user_sgpr_idx); - } -} - -static void -declare_vs_input_vgprs(isel_context *ctx, struct arg_info *args) -{ - unsigned vgpr_idx = 0; - add_arg(args, v1, &ctx->vertex_id, vgpr_idx++); - if (ctx->options->chip_class >= GFX10) { - add_arg(args, v1, NULL, vgpr_idx++); /* unused */ - add_arg(args, v1, &ctx->vs_prim_id, vgpr_idx++); - add_arg(args, v1, &ctx->instance_id, vgpr_idx++); - } else { - if (ctx->options->key.vs.out.as_ls) { - add_arg(args, v1, &ctx->rel_auto_id, vgpr_idx++); - add_arg(args, v1, &ctx->instance_id, vgpr_idx++); - } else { - add_arg(args, v1, &ctx->instance_id, vgpr_idx++); - add_arg(args, v1, &ctx->vs_prim_id, vgpr_idx++); - } - add_arg(args, v1, NULL, vgpr_idx); /* unused */ + if (G_0286CC_POS_W_FLOAT_ENA(spi_ps_inputs)) { + /* If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be enabled too */ + spi_ps_inputs |= S_0286CC_PERSP_CENTER_ENA(1); } -} - -static void -declare_streamout_sgprs(isel_context *ctx, struct arg_info *args, unsigned *idx) -{ - /* Streamout SGPRs. */ - if (ctx->program->info->so.num_outputs) { - assert(ctx->stage & hw_vs); - - if (ctx->stage != tess_eval_vs) { - add_arg(args, s1, &ctx->streamout_config, (*idx)++); - } else { - args->assign[args->count - 1] = &ctx->streamout_config; - args->types[args->count - 1] = s1; - } - add_arg(args, s1, &ctx->streamout_write_idx, (*idx)++); + if (!(spi_ps_inputs & 0x7F)) { + /* At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled */ + spi_ps_inputs |= S_0286CC_PERSP_CENTER_ENA(1); } - /* A streamout buffer offset is loaded if the stride is non-zero. */ - for (unsigned i = 0; i < 4; i++) { - if (!ctx->program->info->so.strides[i]) - continue; + ctx->program->config->spi_ps_input_ena = spi_ps_inputs; + ctx->program->config->spi_ps_input_addr = spi_ps_inputs; - add_arg(args, s1, &ctx->streamout_offset[i], (*idx)++); - } -} - -static bool needs_view_index_sgpr(isel_context *ctx) -{ - switch (ctx->stage) { - case vertex_vs: - return ctx->program->info->needs_multiview_view_index || ctx->options->key.has_multiview_view_index; - case tess_eval_vs: - return ctx->program->info->needs_multiview_view_index && ctx->options->key.has_multiview_view_index; - case vertex_ls: - case vertex_es: - case vertex_tess_control_hs: - case vertex_geometry_gs: - case tess_control_hs: - case tess_eval_es: - case tess_eval_geometry_gs: - case geometry_gs: - return ctx->program->info->needs_multiview_view_index; - default: - return false; - } -} - -static inline bool -add_fs_arg(isel_context *ctx, arg_info *args, unsigned &vgpr_idx, fs_input input, unsigned value, bool enable_next = false, RegClass rc = v1) -{ - if (!ctx->fs_vgpr_args[input]) - return false; - - add_arg(args, rc, &ctx->fs_inputs[input], vgpr_idx); - vgpr_idx += rc.size(); - - if (enable_next) { - add_arg(args, rc, &ctx->fs_inputs[input + 1], vgpr_idx); - vgpr_idx += rc.size(); - } + for (unsigned i = 0; i < impl->ssa_alloc; i++) + allocated[i] = Temp(ctx->program->allocateId(), allocated[i].regClass()); - ctx->program->config->spi_ps_input_addr |= value; - ctx->program->config->spi_ps_input_ena |= value; - return true; + ctx->allocated.reset(allocated.release()); } Pseudo_instruction *add_startpgm(struct isel_context *ctx) { - user_sgpr_info user_sgpr_info; - bool needs_view_index = needs_view_index_sgpr(ctx); - allocate_user_sgprs(ctx, needs_view_index, user_sgpr_info); - arg_info args = {}; - - /* this needs to be in sgprs 0 and 1 */ - add_arg(&args, s2, &ctx->program->private_segment_buffer, 0); - set_loc_shader_ptr(ctx, AC_UD_SCRATCH_RING_OFFSETS, &user_sgpr_info.user_sgpr_idx); - - unsigned vgpr_idx = 0; - switch (ctx->stage) { - case vertex_vs: { - declare_global_input_sgprs(ctx, &user_sgpr_info, &args, ctx->descriptor_sets); - if (ctx->program->info->vs.has_vertex_buffers) { - add_arg(&args, s1, &ctx->vertex_buffers, user_sgpr_info.user_sgpr_idx); - set_loc_shader_ptr(ctx, AC_UD_VS_VERTEX_BUFFERS, &user_sgpr_info.user_sgpr_idx); - } - add_arg(&args, s1, &ctx->base_vertex, user_sgpr_info.user_sgpr_idx); - add_arg(&args, s1, &ctx->start_instance, user_sgpr_info.user_sgpr_idx + 1); - if (ctx->program->info->vs.needs_draw_id) { - add_arg(&args, s1, &ctx->draw_id, user_sgpr_info.user_sgpr_idx + 2); - set_loc_shader(ctx, AC_UD_VS_BASE_VERTEX_START_INSTANCE, &user_sgpr_info.user_sgpr_idx, 3); - } else - set_loc_shader(ctx, AC_UD_VS_BASE_VERTEX_START_INSTANCE, &user_sgpr_info.user_sgpr_idx, 2); - - if (needs_view_index) { - add_arg(&args, s1, &ctx->view_index, user_sgpr_info.user_sgpr_idx); - set_loc_shader(ctx, AC_UD_VIEW_INDEX, &user_sgpr_info.user_sgpr_idx, 1); - } - - assert(user_sgpr_info.user_sgpr_idx == user_sgpr_info.num_sgpr); - unsigned idx = user_sgpr_info.user_sgpr_idx; - if (ctx->options->key.vs.out.as_es) - add_arg(&args, s1, &ctx->es2gs_offset, idx++); - else - declare_streamout_sgprs(ctx, &args, &idx); - - add_arg(&args, s1, &ctx->program->scratch_offset, idx++); - - declare_vs_input_vgprs(ctx, &args); - break; - } - case fragment_fs: { - declare_global_input_sgprs(ctx, &user_sgpr_info, &args, ctx->descriptor_sets); - - assert(user_sgpr_info.user_sgpr_idx == user_sgpr_info.num_sgpr); - add_arg(&args, s1, &ctx->prim_mask, user_sgpr_info.user_sgpr_idx); - - add_arg(&args, s1, &ctx->program->scratch_offset, user_sgpr_info.user_sgpr_idx + 1); - - ctx->program->config->spi_ps_input_addr = 0; - ctx->program->config->spi_ps_input_ena = 0; - - bool has_interp_mode = false; - - has_interp_mode |= add_fs_arg(ctx, &args, vgpr_idx, fs_input::persp_sample_p1, S_0286CC_PERSP_SAMPLE_ENA(1), true); - has_interp_mode |= add_fs_arg(ctx, &args, vgpr_idx, fs_input::persp_center_p1, S_0286CC_PERSP_CENTER_ENA(1), true); - has_interp_mode |= add_fs_arg(ctx, &args, vgpr_idx, fs_input::persp_centroid_p1, S_0286CC_PERSP_CENTROID_ENA(1), true); - has_interp_mode |= add_fs_arg(ctx, &args, vgpr_idx, fs_input::persp_pull_model, S_0286CC_PERSP_PULL_MODEL_ENA(1), false, v3); - - if (!has_interp_mode && ctx->fs_vgpr_args[fs_input::frag_pos_3]) { - /* If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be enabled too */ - ctx->fs_vgpr_args[fs_input::persp_center_p1] = true; - has_interp_mode = add_fs_arg(ctx, &args, vgpr_idx, fs_input::persp_center_p1, S_0286CC_PERSP_CENTER_ENA(1), true); - } - - has_interp_mode |= add_fs_arg(ctx, &args, vgpr_idx, fs_input::linear_sample_p1, S_0286CC_LINEAR_SAMPLE_ENA(1), true); - has_interp_mode |= add_fs_arg(ctx, &args, vgpr_idx, fs_input::linear_center_p1, S_0286CC_LINEAR_CENTER_ENA(1), true); - has_interp_mode |= add_fs_arg(ctx, &args, vgpr_idx, fs_input::linear_centroid_p1, S_0286CC_LINEAR_CENTROID_ENA(1), true); - has_interp_mode |= add_fs_arg(ctx, &args, vgpr_idx, fs_input::line_stipple, S_0286CC_LINE_STIPPLE_TEX_ENA(1)); - - if (!has_interp_mode) { - /* At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled */ - ctx->fs_vgpr_args[fs_input::persp_center_p1] = true; - has_interp_mode = add_fs_arg(ctx, &args, vgpr_idx, fs_input::persp_center_p1, S_0286CC_PERSP_CENTER_ENA(1), true); - } - - add_fs_arg(ctx, &args, vgpr_idx, fs_input::frag_pos_0, S_0286CC_POS_X_FLOAT_ENA(1)); - add_fs_arg(ctx, &args, vgpr_idx, fs_input::frag_pos_1, S_0286CC_POS_Y_FLOAT_ENA(1)); - add_fs_arg(ctx, &args, vgpr_idx, fs_input::frag_pos_2, S_0286CC_POS_Z_FLOAT_ENA(1)); - add_fs_arg(ctx, &args, vgpr_idx, fs_input::frag_pos_3, S_0286CC_POS_W_FLOAT_ENA(1)); - - add_fs_arg(ctx, &args, vgpr_idx, fs_input::front_face, S_0286CC_FRONT_FACE_ENA(1)); - add_fs_arg(ctx, &args, vgpr_idx, fs_input::ancillary, S_0286CC_ANCILLARY_ENA(1)); - add_fs_arg(ctx, &args, vgpr_idx, fs_input::sample_coverage, S_0286CC_SAMPLE_COVERAGE_ENA(1)); - add_fs_arg(ctx, &args, vgpr_idx, fs_input::fixed_pt, S_0286CC_POS_FIXED_PT_ENA(1)); - - ASSERTED bool unset_interp_mode = !(ctx->program->config->spi_ps_input_addr & 0x7F) || - (G_0286CC_POS_W_FLOAT_ENA(ctx->program->config->spi_ps_input_addr) - && !(ctx->program->config->spi_ps_input_addr & 0xF)); - - assert(has_interp_mode); - assert(!unset_interp_mode); - break; - } - case compute_cs: { - declare_global_input_sgprs(ctx, &user_sgpr_info, &args, ctx->descriptor_sets); + unsigned arg_count = ctx->args->ac.arg_count; + if (ctx->stage == fragment_fs) { + /* LLVM optimizes away unused FS inputs and computes spi_ps_input_addr + * itself and then communicates the results back via the ELF binary. + * Mirror what LLVM does by re-mapping the VGPR arguments here. + * + * TODO: If we made the FS input scanning code into a separate pass that + * could run before argument setup, then this wouldn't be necessary + * anymore. + */ + struct ac_shader_args *args = &ctx->args->ac; + arg_count = 0; + for (unsigned i = 0, vgpr_arg = 0, vgpr_reg = 0; i < args->arg_count; i++) { + if (args->args[i].file != AC_ARG_VGPR) { + arg_count++; + continue; + } - if (ctx->program->info->cs.uses_grid_size) { - add_arg(&args, s3, &ctx->num_workgroups, user_sgpr_info.user_sgpr_idx); - set_loc_shader(ctx, AC_UD_CS_GRID_SIZE, &user_sgpr_info.user_sgpr_idx, 3); - } - assert(user_sgpr_info.user_sgpr_idx == user_sgpr_info.num_sgpr); - unsigned idx = user_sgpr_info.user_sgpr_idx; - for (unsigned i = 0; i < 3; i++) { - if (ctx->program->info->cs.uses_block_id[i]) - add_arg(&args, s1, &ctx->workgroup_ids[i], idx++); + if (!(ctx->program->config->spi_ps_input_addr & (1 << vgpr_arg))) { + args->args[i].skip = true; + } else { + args->args[i].offset = vgpr_reg; + vgpr_reg += args->args[i].size; + arg_count++; + } + vgpr_arg++; } - - if (ctx->program->info->cs.uses_local_invocation_idx) - add_arg(&args, s1, &ctx->tg_size, idx++); - add_arg(&args, s1, &ctx->program->scratch_offset, idx++); - - add_arg(&args, v3, &ctx->local_invocation_ids, vgpr_idx++); - break; } - default: - unreachable("Shader stage not implemented"); - } - - ctx->program->info->num_input_vgprs = 0; - ctx->program->info->num_input_sgprs = args.num_sgprs_used; - ctx->program->info->num_user_sgprs = user_sgpr_info.num_sgpr; - ctx->program->info->num_input_vgprs = args.num_vgprs_used; - if (ctx->stage == fragment_fs) { - /* Verify that we have a correct assumption about input VGPR count */ - ASSERTED unsigned input_vgpr_cnt = ac_get_fs_input_vgpr_cnt(ctx->program->config, nullptr, nullptr); - assert(input_vgpr_cnt == ctx->program->info->num_input_vgprs); - } + aco_ptr startpgm{create_instruction(aco_opcode::p_startpgm, Format::PSEUDO, 0, arg_count + 1)}; + for (unsigned i = 0, arg = 0; i < ctx->args->ac.arg_count; i++) { + if (ctx->args->ac.args[i].skip) + continue; - aco_ptr startpgm{create_instruction(aco_opcode::p_startpgm, Format::PSEUDO, 0, args.count + 1)}; - for (unsigned i = 0; i < args.count; i++) { - if (args.assign[i]) { - *args.assign[i] = Temp{ctx->program->allocateId(), args.types[i]}; - startpgm->definitions[i] = Definition(*args.assign[i]); - startpgm->definitions[i].setFixed(args.reg[i]); - } - } - startpgm->definitions[args.count] = Definition{ctx->program->allocateId(), exec, s2}; + enum ac_arg_regfile file = ctx->args->ac.args[i].file; + unsigned size = ctx->args->ac.args[i].size; + unsigned reg = ctx->args->ac.args[i].offset; + RegClass type = RegClass(file == AC_ARG_SGPR ? RegType::sgpr : RegType::vgpr, size); + Temp dst = Temp{ctx->program->allocateId(), type}; + ctx->arg_temps[i] = dst; + startpgm->definitions[arg] = Definition(dst); + startpgm->definitions[arg].setFixed(PhysReg{file == AC_ARG_SGPR ? reg : reg + 256}); + arg++; + } + startpgm->definitions[arg_count] = Definition{ctx->program->allocateId(), exec, s2}; Pseudo_instruction *instr = startpgm.get(); ctx->block->instructions.push_back(std::move(startpgm)); + /* Stash these in the program so that they can be accessed later when + * handling spilling. + */ + ctx->program->private_segment_buffer = get_arg(ctx, ctx->args->ring_offsets); + ctx->program->scratch_offset = get_arg(ctx, ctx->args->scratch_offset); + return instr; } @@ -1168,8 +726,7 @@ setup_isel_context(Program* program, unsigned shader_count, struct nir_shader *const *shaders, ac_shader_config* config, - radv_shader_info *info, - const radv_nir_compiler_options *options) + struct radv_shader_args *args) { program->stage = 0; for (unsigned i = 0; i < shader_count; i++) { @@ -1206,23 +763,23 @@ setup_isel_context(Program* program, unreachable("Shader stage not implemented"); program->config = config; - program->info = info; - program->chip_class = options->chip_class; - program->family = options->family; - program->wave_size = info->wave_size; + program->info = args->shader_info; + program->chip_class = args->options->chip_class; + program->family = args->options->family; + program->wave_size = args->shader_info->wave_size; - program->lds_alloc_granule = options->chip_class >= GFX7 ? 512 : 256; - program->lds_limit = options->chip_class >= GFX7 ? 65536 : 32768; + program->lds_alloc_granule = args->options->chip_class >= GFX7 ? 512 : 256; + program->lds_limit = args->options->chip_class >= GFX7 ? 65536 : 32768; program->vgpr_limit = 256; - if (options->chip_class >= GFX10) { + if (args->options->chip_class >= GFX10) { program->physical_sgprs = 2560; /* doesn't matter as long as it's at least 128 * 20 */ program->sgpr_alloc_granule = 127; program->sgpr_limit = 106; } else if (program->chip_class >= GFX8) { program->physical_sgprs = 800; program->sgpr_alloc_granule = 15; - if (options->family == CHIP_TONGA || options->family == CHIP_ICELAND) + if (args->options->family == CHIP_TONGA || args->options->family == CHIP_ICELAND) program->sgpr_limit = 94; /* workaround hardware bug */ else program->sgpr_limit = 102; @@ -1234,28 +791,12 @@ setup_isel_context(Program* program, /* TODO: we don't have to allocate VCC if we don't need it */ program->needs_vcc = true; - for (unsigned i = 0; i < MAX_SETS; ++i) - program->info->user_sgprs_locs.descriptor_sets[i].sgpr_idx = -1; - for (unsigned i = 0; i < AC_UD_MAX_UD; ++i) - program->info->user_sgprs_locs.shader_data[i].sgpr_idx = -1; - isel_context ctx = {}; ctx.program = program; - ctx.options = options; + ctx.args = args; + ctx.options = args->options; ctx.stage = program->stage; - for (unsigned i = 0; i < fs_input::max_inputs; ++i) - ctx.fs_inputs[i] = Temp(0, v1); - ctx.fs_inputs[fs_input::persp_pull_model] = Temp(0, v3); - for (unsigned i = 0; i < MAX_SETS; ++i) - ctx.descriptor_sets[i] = Temp(0, s1); - for (unsigned i = 0; i < MAX_INLINE_PUSH_CONSTS; ++i) - ctx.inline_push_consts[i] = Temp(0, s1); - for (unsigned i = 0; i <= VARYING_SLOT_VAR31; ++i) { - for (unsigned j = 0; j < 4; ++j) - ctx.vs_output.outputs[i][j] = Temp(0, v1); - } - for (unsigned i = 0; i < shader_count; i++) { nir_shader *nir = shaders[i]; @@ -1339,7 +880,7 @@ setup_isel_context(Program* program, nir_function_impl *func = nir_shader_get_entrypoint(nir); nir_index_ssa_defs(func); - if (options->dump_preoptir) { + if (args->options->dump_preoptir) { fprintf(stderr, "NIR shader before instruction selection:\n"); nir_print_shader(nir, stderr); } diff --git a/src/amd/compiler/aco_interface.cpp b/src/amd/compiler/aco_interface.cpp index fe22b964725..802adcefb1b 100644 --- a/src/amd/compiler/aco_interface.cpp +++ b/src/amd/compiler/aco_interface.cpp @@ -65,8 +65,7 @@ void aco_compile_shader(unsigned shader_count, std::unique_ptr program{new aco::Program}; /* Instruction Selection */ - aco::select_program(program.get(), shader_count, shaders, &config, - args->shader_info, args->options); + aco::select_program(program.get(), shader_count, shaders, &config, args); if (args->options->dump_preoptir) { std::cerr << "After Instruction Selection:\n"; aco_print_program(program.get(), stderr); diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index 370aa5a03c2..a0b5698bb67 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -37,6 +37,7 @@ #include "aco_util.h" struct radv_nir_compiler_options; +struct radv_shader_args; struct radv_shader_info; namespace aco { @@ -1208,8 +1209,7 @@ void select_program(Program *program, unsigned shader_count, struct nir_shader *const *shaders, ac_shader_config* config, - struct radv_shader_info *info, - const struct radv_nir_compiler_options *options); + struct radv_shader_args *args); void lower_wqm(Program* program, live& live_vars, const struct radv_nir_compiler_options *options); diff --git a/src/amd/vulkan/radv_shader_args.c b/src/amd/vulkan/radv_shader_args.c index bcec3e9d2e7..f79d0b2d2ef 100644 --- a/src/amd/vulkan/radv_shader_args.c +++ b/src/amd/vulkan/radv_shader_args.c @@ -695,9 +695,7 @@ radv_declare_shader_args(struct radv_shader_args *args, args->shader_info->num_input_vgprs = 0; args->shader_info->num_input_sgprs = 2; args->shader_info->num_input_sgprs += args->ac.num_sgprs_used; - - if (stage != MESA_SHADER_FRAGMENT) - args->shader_info->num_input_vgprs = args->ac.num_vgprs_used; + args->shader_info->num_input_vgprs = args->ac.num_vgprs_used; uint8_t user_sgpr_idx = 0; -- 2.30.2