From f8f7712666b738fc9ebd4a6390563e44db46b68f Mon Sep 17 00:00:00 2001
From: Rhys Perry
Date: Fri, 15 Nov 2019 11:31:03 +0000
Subject: [PATCH] aco: implement GS copy shaders
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

v5: rebase on float_controls changes
v7: rebase after shader args MR and load/store vectorizer MR

Signed-off-by: Rhys Perry
Reviewed-by: Daniel Schürmann
Part-of:
---
 .../compiler/aco_instruction_selection.cpp    | 180 +++++++++++-
 .../aco_instruction_selection_setup.cpp       | 265 +++++++++---------
 src/amd/compiler/aco_interface.cpp            |   7 +-
 src/amd/compiler/aco_ir.h                     |  23 +-
 4 files changed, 327 insertions(+), 148 deletions(-)

diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index 1792e831222..bbdc2dbf3da 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -25,6 +25,7 @@
 
 #include <algorithm>
 #include <array>
+#include <stack>
 #include <map>
 
 #include "ac_shader_util.h"
@@ -8534,7 +8535,7 @@ static void create_vs_exports(isel_context *ctx)
    if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
       export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, true, &next_pos);
 
-   if (ctx->options->key.vs_common_out.export_clip_dists) {
+   if (ctx->export_clip_dists) {
       if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
          export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, false, &next_pos);
       if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
@@ -8568,7 +8569,7 @@ static void emit_stream_output(isel_context *ctx,
 
    Temp out[4];
    bool all_undef = true;
-   assert(ctx->stage == vertex_vs);
+   assert(ctx->stage == vertex_vs || ctx->stage == gs_copy_vs);
    for (unsigned i = 0; i < num_comps; i++) {
       out[i] = ctx->vsgs_output.outputs[loc][start + i];
       all_undef = all_undef && !out[i].id();
@@ -8804,13 +8805,24 @@ void setup_fp_mode(isel_context *ctx, nir_shader *shader)
    ctx->block->fp_mode = program->next_fp_mode;
 }
 
+void cleanup_cfg(Program *program)
+{
+   /* create linear_succs/logical_succs */
+   for (Block& BB : program->blocks) {
+      for (unsigned idx : BB.linear_preds)
+         program->blocks[idx].linear_succs.emplace_back(BB.index);
+      for (unsigned idx : BB.logical_preds)
+         program->blocks[idx].logical_succs.emplace_back(BB.index);
+   }
+}
+
 void select_program(Program *program,
                     unsigned shader_count,
                     struct nir_shader *const *shaders,
                     ac_shader_config* config,
                     struct radv_shader_args *args)
 {
-   isel_context ctx = setup_isel_context(program, shader_count, shaders, config, args);
+   isel_context ctx = setup_isel_context(program, shader_count, shaders, config, args, false);
 
    for (unsigned i = 0; i < shader_count; i++) {
       nir_shader *nir = shaders[i];
@@ -8879,12 +8891,162 @@ void select_program(Program *program,
       bld.smem(aco_opcode::s_dcache_wb, false);
 
    bld.sopp(aco_opcode::s_endpgm);
 
-   /* cleanup CFG */
-   for (Block& BB : program->blocks) {
-      for (unsigned idx : BB.linear_preds)
-         program->blocks[idx].linear_succs.emplace_back(BB.index);
-      for (unsigned idx : BB.logical_preds)
-         program->blocks[idx].logical_succs.emplace_back(BB.index);
-   }
+   cleanup_cfg(program);
+}
+
+void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
+                           ac_shader_config* config,
+                           struct radv_shader_args *args)
+{
+   isel_context ctx = setup_isel_context(program, 1, &gs_shader, config, args, true);
+
+   program->next_fp_mode.preserve_signed_zero_inf_nan32 = false;
+   program->next_fp_mode.preserve_signed_zero_inf_nan16_64 = false;
+   program->next_fp_mode.must_flush_denorms32 = false;
+   program->next_fp_mode.must_flush_denorms16_64 = false;
+   program->next_fp_mode.care_about_round32 = false;
+   program->next_fp_mode.care_about_round16_64 = false;
+   program->next_fp_mode.denorm16_64 = fp_denorm_keep;
+   program->next_fp_mode.denorm32 = 0;
+   program->next_fp_mode.round32 = fp_round_ne;
+   program->next_fp_mode.round16_64 = fp_round_ne;
+   ctx.block->fp_mode = program->next_fp_mode;
+
+   add_startpgm(&ctx);
+   append_logical_start(ctx.block);
+
+   Builder bld(ctx.program, ctx.block);
+
+   Temp gsvs_ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), program->private_segment_buffer, Operand(RING_GSVS_VS * 16u));
+
+   Operand stream_id(0u);
+   if (args->shader_info->so.num_outputs)
+      stream_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
+                           get_arg(&ctx, ctx.args->streamout_config), Operand(0x20018u));
+
+   Temp vtx_offset = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), get_arg(&ctx, ctx.args->ac.vertex_id));
+
+   std::stack<Block> endif_blocks;
+
+   for (unsigned stream = 0; stream < 4; stream++) {
+      if (stream_id.isConstant() && stream != stream_id.constantValue())
+         continue;
+
+      unsigned num_components = args->shader_info->gs.num_stream_output_components[stream];
+      if (stream > 0 && (!num_components || !args->shader_info->so.num_outputs))
+         continue;
+
+      memset(ctx.vsgs_output.mask, 0, sizeof(ctx.vsgs_output.mask));
+
+      unsigned BB_if_idx = ctx.block->index;
+      Block BB_endif = Block();
+      if (!stream_id.isConstant()) {
+         /* begin IF */
+         Temp cond = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), stream_id, Operand(stream));
+         append_logical_end(ctx.block);
+         ctx.block->kind |= block_kind_uniform;
+         bld.branch(aco_opcode::p_cbranch_z, cond);
+
+         BB_endif.kind |= ctx.block->kind & block_kind_top_level;
+
+         ctx.block = ctx.program->create_and_insert_block();
+         add_edge(BB_if_idx, ctx.block);
+         bld.reset(ctx.block);
+         append_logical_start(ctx.block);
+      }
+
+      unsigned offset = 0;
+      for (unsigned i = 0; i <= VARYING_SLOT_VAR31; ++i) {
+         if (args->shader_info->gs.output_streams[i] != stream)
+            continue;
+
+         unsigned output_usage_mask = args->shader_info->gs.output_usage_mask[i];
+         unsigned length = util_last_bit(output_usage_mask);
+         for (unsigned j = 0; j < length; ++j) {
+            if (!(output_usage_mask & (1 << j)))
+               continue;
+
+            unsigned const_offset = offset * args->shader_info->gs.vertices_out * 16 * 4;
+            Temp voffset = vtx_offset;
+            if (const_offset >= 4096u) {
+               voffset = bld.vadd32(bld.def(v1), Operand(const_offset / 4096u * 4096u), voffset);
+               const_offset %= 4096u;
+            }
+
+            aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(aco_opcode::buffer_load_dword, Format::MUBUF, 3, 1)};
+            mubuf->definitions[0] = bld.def(v1);
+            mubuf->operands[0] = Operand(voffset);
+            mubuf->operands[1] = Operand(gsvs_ring);
+            mubuf->operands[2] = Operand(0u);
+            mubuf->offen = true;
+            mubuf->offset = const_offset;
+            mubuf->glc = true;
+            mubuf->slc = true;
+            mubuf->dlc = args->options->chip_class >= GFX10;
+            mubuf->barrier = barrier_none;
+            mubuf->can_reorder = true;
+
+            ctx.vsgs_output.mask[i] |= 1 << j;
+            ctx.vsgs_output.outputs[i][j] = mubuf->definitions[0].getTemp();
+
+            bld.insert(std::move(mubuf));
+
+            offset++;
+         }
+      }
+
+      if (args->shader_info->so.num_outputs) {
+         emit_streamout(&ctx, stream);
+         bld.reset(ctx.block);
+      }
+
+      if (stream == 0) {
+         create_vs_exports(&ctx);
+         ctx.block->kind |= block_kind_export_end;
+      }
+
+      if (!stream_id.isConstant()) {
+         append_logical_end(ctx.block);
+
+         /* branch from then block to endif block */
+         bld.branch(aco_opcode::p_branch);
+         add_edge(ctx.block->index, &BB_endif);
+         ctx.block->kind |= block_kind_uniform;
+
+         /* emit else block */
+         ctx.block = ctx.program->create_and_insert_block();
+         add_edge(BB_if_idx, ctx.block);
+         bld.reset(ctx.block);
+         append_logical_start(ctx.block);
+
+         endif_blocks.push(std::move(BB_endif));
+      }
+   }
+
+   while (!endif_blocks.empty()) {
+      Block BB_endif = std::move(endif_blocks.top());
+      endif_blocks.pop();
+
+      Block *BB_else = ctx.block;
+
+      append_logical_end(BB_else);
+      /* branch from else block to endif block */
+      bld.branch(aco_opcode::p_branch);
+      add_edge(BB_else->index, &BB_endif);
+      BB_else->kind |= block_kind_uniform;
+
+      /* emit endif merge block */
+      ctx.block = program->insert_block(std::move(BB_endif));
+      bld.reset(ctx.block);
+      append_logical_start(ctx.block);
+   }
+
+   program->config->float_mode = program->blocks[0].fp_mode.val;
+
+   append_logical_end(ctx.block);
+   ctx.block->kind |= block_kind_uniform;
+   bld.sopp(aco_opcode::s_endpgm);
+
+   cleanup_cfg(program);
 }
 }
diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp
index 7c53a0ecd3e..2ad39180e2c 100644
--- a/src/amd/compiler/aco_instruction_selection_setup.cpp
+++ b/src/amd/compiler/aco_instruction_selection_setup.cpp
@@ -85,6 +85,7 @@ struct isel_context {
    uint64_t output_masks[MESA_SHADER_COMPUTE];
 
    /* VS output information */
+   bool export_clip_dists;
    unsigned num_clip_distances;
    unsigned num_cull_distances;
@@ -661,6 +662,54 @@ mem_vectorize_callback(unsigned align, unsigned bit_size,
    return false;
 }
 
+void
+setup_vs_output_info(isel_context *ctx, nir_shader *nir,
+                     bool export_prim_id, bool export_clip_dists,
+                     radv_vs_output_info *outinfo)
+{
+   memset(outinfo->vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
+          sizeof(outinfo->vs_output_param_offset));
+
+   outinfo->param_exports = 0;
+   int pos_written = 0x1;
+   if (outinfo->writes_pointsize || outinfo->writes_viewport_index || outinfo->writes_layer)
+      pos_written |= 1 << 1;
+
+   uint64_t mask = ctx->output_masks[nir->info.stage];
+   while (mask) {
+      int idx = u_bit_scan64(&mask);
+      if (idx >= VARYING_SLOT_VAR0 || idx == VARYING_SLOT_LAYER || idx == VARYING_SLOT_PRIMITIVE_ID ||
+          ((idx == VARYING_SLOT_CLIP_DIST0 || idx == VARYING_SLOT_CLIP_DIST1) && export_clip_dists)) {
+         if (outinfo->vs_output_param_offset[idx] == AC_EXP_PARAM_UNDEFINED)
+            outinfo->vs_output_param_offset[idx] = outinfo->param_exports++;
+      }
+   }
+   if (outinfo->writes_layer &&
+       outinfo->vs_output_param_offset[VARYING_SLOT_LAYER] == AC_EXP_PARAM_UNDEFINED) {
+      /* when ctx->options->key.has_multiview_view_index = true, the layer
+       * variable isn't declared in NIR and it's isel's job to get the layer */
+      outinfo->vs_output_param_offset[VARYING_SLOT_LAYER] = outinfo->param_exports++;
+   }
+
+   if (export_prim_id) {
+      assert(outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] == AC_EXP_PARAM_UNDEFINED);
+      outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] = outinfo->param_exports++;
+   }
+
+   ctx->export_clip_dists = export_clip_dists;
+   ctx->num_clip_distances = util_bitcount(outinfo->clip_dist_mask);
+   ctx->num_cull_distances = util_bitcount(outinfo->cull_dist_mask);
+
+   assert(ctx->num_clip_distances + ctx->num_cull_distances <= 8);
+
+   if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
+      pos_written |= 1 << 2;
+   if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
+      pos_written |= 1 << 3;
+
+   outinfo->pos_exports = util_bitcount(pos_written);
+}
+
 void
 setup_vs_variables(isel_context *ctx, nir_shader *nir)
 {
@@ -681,49 +730,8 @@ setup_vs_variables(isel_context *ctx, nir_shader *nir)
 
    if (ctx->stage == vertex_vs) {
       radv_vs_output_info *outinfo = &ctx->program->info->vs.outinfo;
-
-      memset(outinfo->vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
-             sizeof(outinfo->vs_output_param_offset));
-
-      bool export_clip_dists = ctx->options->key.vs_common_out.export_clip_dists;
-
-      outinfo->param_exports = 0;
-      int pos_written = 0x1;
-      if (outinfo->writes_pointsize || outinfo->writes_viewport_index || outinfo->writes_layer)
-         pos_written |= 1 << 1;
-
-      uint64_t mask = ctx->output_masks[nir->info.stage];
-      while (mask) {
-         int idx = u_bit_scan64(&mask);
-         if (idx >= VARYING_SLOT_VAR0 || idx == VARYING_SLOT_LAYER || idx == VARYING_SLOT_PRIMITIVE_ID ||
-             ((idx == VARYING_SLOT_CLIP_DIST0 || idx == VARYING_SLOT_CLIP_DIST1) && export_clip_dists)) {
-            if (outinfo->vs_output_param_offset[idx] == AC_EXP_PARAM_UNDEFINED)
-               outinfo->vs_output_param_offset[idx] = outinfo->param_exports++;
-         }
-      }
-      if (outinfo->writes_layer &&
-          outinfo->vs_output_param_offset[VARYING_SLOT_LAYER] == AC_EXP_PARAM_UNDEFINED) {
-         /* when ctx->options->key.has_multiview_view_index = true, the layer
-          * variable isn't declared in NIR and it's isel's job to get the layer */
-         outinfo->vs_output_param_offset[VARYING_SLOT_LAYER] = outinfo->param_exports++;
-      }
-
-      if (outinfo->export_prim_id) {
-         assert(outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] == AC_EXP_PARAM_UNDEFINED);
-         outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] = outinfo->param_exports++;
-      }
-
-      ctx->num_clip_distances = util_bitcount(outinfo->clip_dist_mask);
-      ctx->num_cull_distances = util_bitcount(outinfo->cull_dist_mask);
-
-      assert(ctx->num_clip_distances + ctx->num_cull_distances <= 8);
-
-      if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
-         pos_written |= 1 << 2;
-      if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
-         pos_written |= 1 << 3;
-
-      outinfo->pos_exports = util_bitcount(pos_written);
+      setup_vs_output_info(ctx, nir, outinfo->export_prim_id,
+                           ctx->options->key.vs_common_out.export_clip_dists, outinfo);
    } else if (ctx->stage == vertex_geometry_gs || ctx->stage == vertex_es) {
       /* TODO: radv_nir_shader_info_pass() already sets this but it's larger
        * than it needs to be in order to set it better, we have to improve
@@ -824,12 +832,80 @@ get_io_masks(isel_context *ctx, unsigned shader_count, struct nir_shader *const
    }
 }
 
+void
+setup_nir(isel_context *ctx, nir_shader *nir)
+{
+   Program *program = ctx->program;
+
+   /* align and copy constant data */
+   while (program->constant_data.size() % 4u)
+      program->constant_data.push_back(0);
+   ctx->constant_data_offset = program->constant_data.size();
+   program->constant_data.insert(program->constant_data.end(),
+                                 (uint8_t*)nir->constant_data,
+                                 (uint8_t*)nir->constant_data + nir->constant_data_size);
+
+   /* the variable setup has to be done before lower_io / CSE */
+   setup_variables(ctx, nir);
+
+   /* optimize and lower memory operations */
+   bool lower_to_scalar = false;
+   bool lower_pack = false;
+   if (nir_opt_load_store_vectorize(nir,
+                                    (nir_variable_mode)(nir_var_mem_ssbo | nir_var_mem_ubo |
+                                                        nir_var_mem_push_const | nir_var_mem_shared),
+                                    mem_vectorize_callback)) {
+      lower_to_scalar = true;
+      lower_pack = true;
+   }
+   if (nir->info.stage != MESA_SHADER_COMPUTE)
+      nir_lower_io(nir, (nir_variable_mode)(nir_var_shader_in | nir_var_shader_out), type_size, (nir_lower_io_options)0);
+   nir_lower_explicit_io(nir, nir_var_mem_global, nir_address_format_64bit_global);
+
+   if (lower_to_scalar)
+      nir_lower_alu_to_scalar(nir, NULL, NULL);
+   if (lower_pack)
+      nir_lower_pack(nir);
+
+   /* lower ALU operations */
+   // TODO: implement logic64 in aco, it's more effective for sgprs
+   nir_lower_int64(nir, nir->options->lower_int64_options);
+
+   nir_opt_idiv_const(nir, 32);
+   nir_lower_idiv(nir, nir_lower_idiv_precise);
+
+   /* optimize the lowered ALU operations */
+   bool more_algebraic = true;
+   while (more_algebraic) {
+      more_algebraic = false;
+      NIR_PASS_V(nir, nir_copy_prop);
+      NIR_PASS_V(nir, nir_opt_dce);
+      NIR_PASS_V(nir, nir_opt_constant_folding);
+      NIR_PASS(more_algebraic, nir, nir_opt_algebraic);
+   }
+
+   /* cleanup passes */
+   nir_lower_load_const_to_scalar(nir);
+   nir_opt_shrink_load(nir);
+   nir_move_options move_opts = (nir_move_options)(
+      nir_move_const_undef | nir_move_load_ubo | nir_move_load_input | nir_move_comparisons);
+   nir_opt_sink(nir, move_opts);
+   nir_opt_move(nir, move_opts);
+   nir_convert_to_lcssa(nir, true, false);
+   nir_lower_phis_to_scalar(nir);
+
+   nir_function_impl *func = nir_shader_get_entrypoint(nir);
+   nir_index_ssa_defs(func);
+   nir_metadata_require(func, nir_metadata_block_index);
+}
+
 isel_context
 setup_isel_context(Program* program,
                    unsigned shader_count,
                    struct nir_shader *const *shaders,
                    ac_shader_config* config,
-                   struct radv_shader_args *args)
+                   struct radv_shader_args *args,
+                   bool is_gs_copy_shader)
 {
    program->stage = 0;
    for (unsigned i = 0; i < shader_count; i++) {
@@ -844,7 +920,7 @@ setup_isel_context(Program* program,
          program->stage |= sw_tes;
          break;
       case MESA_SHADER_GEOMETRY:
-         program->stage |= sw_gs;
+         program->stage |= is_gs_copy_shader ? sw_gs_copy : sw_gs;
         break;
       case MESA_SHADER_FRAGMENT:
         program->stage |= sw_fs;
@@ -868,6 +944,8 @@ setup_isel_context(Program* program,
      program->stage |= hw_fs;
   else if (program->stage == sw_cs)
      program->stage |= hw_cs;
+   else if (program->stage == sw_gs_copy)
+      program->stage |= hw_vs;
   else if (program->stage == (sw_vs | sw_gs) && gfx9_plus && !ngg)
      program->stage |= hw_gs;
   else
@@ -918,94 +996,25 @@ setup_isel_context(Program* program,
 
    get_io_masks(&ctx, shader_count, shaders);
 
-   for (unsigned i = 0; i < shader_count; i++) {
-      nir_shader *nir = shaders[i];
-
-      /* align and copy constant data */
-      while (program->constant_data.size() % 4u)
-         program->constant_data.push_back(0);
-      ctx.constant_data_offset = program->constant_data.size();
-      program->constant_data.insert(program->constant_data.end(),
-                                    (uint8_t*)nir->constant_data,
-                                    (uint8_t*)nir->constant_data + nir->constant_data_size);
-
-      /* the variable setup has to be done before lower_io / CSE */
-      setup_variables(&ctx, nir);
-
-      /* optimize and lower memory operations */
-      bool lower_to_scalar = false;
-      bool lower_pack = false;
-      if (nir_opt_load_store_vectorize(nir,
-                                       (nir_variable_mode)(nir_var_mem_ssbo | nir_var_mem_ubo |
-                                                           nir_var_mem_push_const | nir_var_mem_shared),
-                                       mem_vectorize_callback)) {
-         lower_to_scalar = true;
-         lower_pack = true;
-      }
-      if (nir->info.stage != MESA_SHADER_COMPUTE)
-         nir_lower_io(nir, (nir_variable_mode)(nir_var_shader_in | nir_var_shader_out), type_size, (nir_lower_io_options)0);
-      nir_lower_explicit_io(nir, nir_var_mem_global, nir_address_format_64bit_global);
-
-      if (lower_to_scalar)
-         nir_lower_alu_to_scalar(nir, NULL, NULL);
-      if (lower_pack)
-         nir_lower_pack(nir);
-
-      /* lower ALU operations */
-      // TODO: implement logic64 in aco, it's more effective for sgprs
-      nir_lower_int64(nir, nir->options->lower_int64_options);
-
-      nir_opt_idiv_const(nir, 32);
-      nir_lower_idiv(nir, nir_lower_idiv_precise);
-
-      /* optimize the lowered ALU operations */
-      bool more_algebraic = true;
-      while (more_algebraic) {
-         more_algebraic = false;
-         NIR_PASS_V(nir, nir_copy_prop);
-         NIR_PASS_V(nir, nir_opt_dce);
-         NIR_PASS_V(nir, nir_opt_constant_folding);
-         NIR_PASS(more_algebraic, nir, nir_opt_algebraic);
-      }
+   unsigned scratch_size = 0;
+   if (program->stage == gs_copy_vs) {
+      assert(shader_count == 1);
+      setup_vs_output_info(&ctx, shaders[0], false, true, &args->shader_info->vs.outinfo);
+   } else {
+      for (unsigned i = 0; i < shader_count; i++) {
+         nir_shader *nir = shaders[i];
+         setup_nir(&ctx, nir);
 
-      /* Do late algebraic optimization to turn add(a, neg(b)) back into
-       * subs, then the mandatory cleanup after algebraic. Note that it may
-       * produce fnegs, and if so then we need to keep running to squash
-       * fneg(fneg(a)).
-       */
-      bool more_late_algebraic = true;
-      while (more_late_algebraic) {
-         more_late_algebraic = false;
-         NIR_PASS(more_late_algebraic, nir, nir_opt_algebraic_late);
-         NIR_PASS_V(nir, nir_opt_constant_folding);
-         NIR_PASS_V(nir, nir_copy_prop);
-         NIR_PASS_V(nir, nir_opt_dce);
-         NIR_PASS_V(nir, nir_opt_cse);
+         if (args->options->dump_preoptir) {
+            fprintf(stderr, "NIR shader before instruction selection:\n");
+            nir_print_shader(nir, stderr);
+         }
       }
 
-      /* cleanup passes */
-      nir_lower_load_const_to_scalar(nir);
-      nir_opt_shrink_load(nir);
-      nir_move_options move_opts = (nir_move_options)(
-         nir_move_const_undef | nir_move_load_ubo | nir_move_load_input | nir_move_comparisons);
-      nir_opt_sink(nir, move_opts);
-      nir_opt_move(nir, move_opts);
-      nir_convert_to_lcssa(nir, true, false);
-      nir_lower_phis_to_scalar(nir);
-
-      nir_function_impl *func = nir_shader_get_entrypoint(nir);
-      nir_index_ssa_defs(func);
-      nir_metadata_require(func, nir_metadata_block_index);
-
-      if (args->options->dump_preoptir) {
-         fprintf(stderr, "NIR shader before instruction selection:\n");
-         nir_print_shader(nir, stderr);
-      }
+      for (unsigned i = 0; i < shader_count; i++)
+         scratch_size = std::max(scratch_size, shaders[i]->scratch_size);
    }
 
-   unsigned scratch_size = 0;
-   for (unsigned i = 0; i < shader_count; i++)
-      scratch_size = std::max(scratch_size, shaders[i]->scratch_size);
    ctx.program->config->scratch_bytes_per_wave = align(scratch_size * ctx.program->wave_size, 1024);
 
    ctx.block = ctx.program->create_and_insert_block();
diff --git a/src/amd/compiler/aco_interface.cpp b/src/amd/compiler/aco_interface.cpp
index f951c4fdc5f..686fdca14e9 100644
--- a/src/amd/compiler/aco_interface.cpp
+++ b/src/amd/compiler/aco_interface.cpp
@@ -65,7 +65,10 @@ void aco_compile_shader(unsigned shader_count,
    std::unique_ptr<aco::Program> program{new aco::Program};
 
    /* Instruction Selection */
-   aco::select_program(program.get(), shader_count, shaders, &config, args);
+   if (args->is_gs_copy_shader)
+      aco::select_gs_copy_shader(program.get(), shaders[0], &config, args);
+   else
+      aco::select_program(program.get(), shader_count, shaders, &config, args);
    if (args->options->dump_preoptir) {
       std::cerr << "After Instruction Selection:\n";
       aco_print_program(program.get(), stderr);
@@ -162,7 +165,7 @@ void aco_compile_shader(unsigned shader_count,
 
    legacy_binary->base.type = RADV_BINARY_TYPE_LEGACY;
    legacy_binary->base.stage = shaders[shader_count-1]->info.stage;
-   legacy_binary->base.is_gs_copy_shader = false;
+   legacy_binary->base.is_gs_copy_shader = args->is_gs_copy_shader;
    legacy_binary->base.total_size = size;
 
    memcpy(legacy_binary->data, code.data(), code.size() * sizeof(uint32_t));
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
index d3ebecc081e..3f38e6aadae 100644
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -1106,23 +1106,25 @@ static constexpr Stage sw_tcs = 1 << 2;
 static constexpr Stage sw_tes = 1 << 3;
 static constexpr Stage sw_fs = 1 << 4;
 static constexpr Stage sw_cs = 1 << 5;
-static constexpr Stage sw_mask = 0x3f;
+static constexpr Stage sw_gs_copy = 1 << 6;
+static constexpr Stage sw_mask = 0x7f;
 
 /* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
-static constexpr Stage hw_vs = 1 << 6;
-static constexpr Stage hw_es = 1 << 7; /* not on GFX9. combined into GS on GFX9 (and GFX10/legacy). */
-static constexpr Stage hw_gs = 1 << 8;
-static constexpr Stage hw_ls = 1 << 9; /* not on GFX9. combined into HS on GFX9 (and GFX10/legacy). */
-static constexpr Stage hw_hs = 1 << 10;
-static constexpr Stage hw_fs = 1 << 11;
-static constexpr Stage hw_cs = 1 << 12;
-static constexpr Stage hw_mask = 0x7f << 6;
+static constexpr Stage hw_vs = 1 << 7;
+static constexpr Stage hw_es = 1 << 8; /* not on GFX9. combined into GS on GFX9 (and GFX10/legacy). */
+static constexpr Stage hw_gs = 1 << 9;
+static constexpr Stage hw_ls = 1 << 10; /* not on GFX9. combined into HS on GFX9 (and GFX10/legacy). */
+static constexpr Stage hw_hs = 1 << 11;
+static constexpr Stage hw_fs = 1 << 12;
+static constexpr Stage hw_cs = 1 << 13;
+static constexpr Stage hw_mask = 0x7f << 7;
 
 /* possible settings of Program::stage */
 static constexpr Stage vertex_vs = sw_vs | hw_vs;
 static constexpr Stage fragment_fs = sw_fs | hw_fs;
 static constexpr Stage compute_cs = sw_cs | hw_cs;
 static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
+static constexpr Stage gs_copy_vs = sw_gs_copy | hw_vs;
 /* GFX10/NGG */
 static constexpr Stage ngg_vertex_gs = sw_vs | hw_gs;
 static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
@@ -1219,6 +1221,9 @@ void select_program(Program *program,
                     struct nir_shader *const *shaders,
                     ac_shader_config* config,
                     struct radv_shader_args *args);
+void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
+                           ac_shader_config* config,
+                           struct radv_shader_args *args);
 
 void lower_wqm(Program* program, live& live_vars,
                const struct radv_nir_compiler_options *options);
-- 
2.30.2
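
---

Editor's note on the GSVS ring addressing in select_gs_copy_shader(): each enabled
output component is read from a slice of `vertices_out * 16 * 4` bytes in the ring,
indexed per vertex by `vertex_id * 4` (the `v_lshlrev_b32` above), and the MUBUF
immediate offset is only 12 bits, so the patch folds multiples of 4096 into the VGPR
offset. The sketch below is not part of the patch; it is a hypothetical standalone
model of that arithmetic (names like `gsvs_address` are invented for illustration):

```cpp
// Hypothetical model of the offset split done before each buffer_load_dword
// in select_gs_copy_shader() -- not code from the patch.
#include <cassert>
#include <cstdio>

struct GsvsAddress {
   unsigned const_offset; /* 12-bit MUBUF immediate, must stay < 4096 */
   unsigned voffset;      /* per-thread byte offset held in a VGPR */
};

GsvsAddress gsvs_address(unsigned component_slot, unsigned vertices_out,
                         unsigned vertex_id)
{
   /* slice base for the Nth enabled component, as in the patch:
    * const_offset = offset * vertices_out * 16 * 4 */
   unsigned const_offset = component_slot * vertices_out * 16 * 4;
   /* per-vertex dword offset, as produced by v_lshlrev_b32(2, vertex_id) */
   unsigned voffset = vertex_id << 2;

   /* fold whole multiples of 4096 into the VGPR offset, since the MUBUF
    * immediate offset field cannot encode values >= 4096 */
   if (const_offset >= 4096u) {
      voffset += const_offset / 4096u * 4096u;
      const_offset %= 4096u;
   }
   assert(const_offset < 4096u);
   return {const_offset, voffset};
}

int main()
{
   /* e.g. the 70th enabled component of a GS emitting 3 vertices */
   GsvsAddress a = gsvs_address(70, 3, 5);
   printf("imm=%u voffset=%u\n", a.const_offset, a.voffset);
   return 0;
}
```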