From: Timur Kristóf Date: Thu, 12 Mar 2020 18:54:16 +0000 (+0100) Subject: aco: Skip 2nd read of merged wave info when TCS in/out vertices are equal. X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=17c779ab9e2bb9329f07299e327ac2c1c81f3cb3;p=mesa.git aco: Skip 2nd read of merged wave info when TCS in/out vertices are equal. When TCS has an equal number of input and output vertices, it means that the number of VS and TCS invocations (LS and HS) is the same, and that the HS invocations operate on the same vertices as the LS. When this is the case, this commit removes the else-if between the merged VS and TCS halves, making it possible to schedule and optimize the code across the two halves. Totals: SGPRS: 5577367 -> 5581735 (0.08 %) VGPRS: 3958592 -> 3960752 (0.05 %) Code Size: 254867144 -> 254838244 (-0.01 %) bytes Max Waves: 1053887 -> 1053747 (-0.01 %) Totals from affected shaders: SGPRS: 29032 -> 33400 (15.05 %) VGPRS: 35664 -> 37824 (6.06 %) Code Size: 1979028 -> 1950128 (-1.46 %) bytes Max Waves: 7310 -> 7170 (-1.92 %) Signed-off-by: Timur Kristóf Reviewed-by: Rhys Perry Part-of: --- diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 340c6383646..b6add9ae7f3 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -9752,6 +9752,7 @@ void select_program(Program *program, struct radv_shader_args *args) { isel_context ctx = setup_isel_context(program, shader_count, shaders, config, args, false); + if_context ic_merged_wave_info; for (unsigned i = 0; i < shader_count; i++) { nir_shader *nir = shaders[i]; @@ -9778,14 +9779,15 @@ void select_program(Program *program, (nir->info.stage == MESA_SHADER_TESS_EVAL && ctx.stage == tess_eval_geometry_gs)); - if_context ic; - if (shader_count >= 2 && !empty_shader) { + bool check_merged_wave_info = ctx.tcs_in_out_eq ? 
i == 0 : (shader_count >= 2 && !empty_shader); + bool endif_merged_wave_info = ctx.tcs_in_out_eq ? i == 1 : check_merged_wave_info; + if (check_merged_wave_info) { Builder bld(ctx.program, ctx.block); Temp count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), get_arg(&ctx, args->merged_wave_info), Operand((8u << 16) | (i * 8u))); Temp thread_id = emit_mbcnt(&ctx, bld.def(v1)); Temp cond = bld.vopc(aco_opcode::v_cmp_gt_u32, bld.hint_vcc(bld.def(bld.lm)), count, thread_id); - begin_divergent_if_then(&ctx, &ic, cond); + begin_divergent_if_then(&ctx, &ic_merged_wave_info, cond); } if (i) { @@ -9821,9 +9823,9 @@ void select_program(Program *program, if (ctx.stage == fragment_fs) create_fs_exports(&ctx); - if (shader_count >= 2 && !empty_shader) { - begin_divergent_if_else(&ctx, &ic); - end_divergent_if(&ctx, &ic); + if (endif_merged_wave_info) { + begin_divergent_if_else(&ctx, &ic_merged_wave_info); + end_divergent_if(&ctx, &ic_merged_wave_info); } ralloc_free(ctx.divergent_vals); diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp index ff32aa3dc84..4ff1de72113 100644 --- a/src/amd/compiler/aco_instruction_selection_setup.cpp +++ b/src/amd/compiler/aco_instruction_selection_setup.cpp @@ -100,6 +100,7 @@ struct isel_context { unsigned tcs_tess_lvl_in_loc; uint32_t tcs_num_inputs; uint32_t tcs_num_patches; + bool tcs_in_out_eq = false; /* VS, FS or GS output information */ output_state outputs; @@ -899,6 +900,15 @@ setup_tcs_variables(isel_context *ctx, nir_shader *nir) unreachable("Unsupported TCS shader stage"); } + /* When the number of TCS input and output vertices are the same (typically 3): + * - There is an equal amount of LS and HS invocations + * - In case of merged LSHS shaders, the LS and HS halves of the shader + * always process the exact same vertex. We can use this knowledge to optimize them. 
+ */ + ctx->tcs_in_out_eq = + ctx->stage == vertex_tess_control_hs && + ctx->args->options->key.tcs.input_vertices == nir->info.tess.tcs_vertices_out; + ctx->tcs_num_patches = get_tcs_num_patches( ctx->args->options->key.tcs.input_vertices, nir->info.tess.tcs_vertices_out,