From 6129541d6c4b44dc0dd7a1744b51a73c1a39cfd1 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Sun, 6 Sep 2020 11:23:13 -0400 Subject: [PATCH] radeonsi: remove redundant GS variables in si_shader_selector Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- .../drivers/radeonsi/gfx10_shader_ngg.c | 32 +++++++-------- src/gallium/drivers/radeonsi/si_shader.h | 3 -- .../drivers/radeonsi/si_shader_llvm_gs.c | 8 ++-- .../drivers/radeonsi/si_state_shaders.c | 40 +++++++++---------- 4 files changed, 38 insertions(+), 45 deletions(-) diff --git a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c index 28c913676fe..9679508e5df 100644 --- a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c +++ b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c @@ -1439,7 +1439,7 @@ static LLVMValueRef ngg_gs_get_vertex_storage(struct si_shader_context *ctx) * is in emit order; that is: * - during the epilogue, N is the threadidx (relative to the entire threadgroup) * - during vertex emit, i.e. while the API GS shader invocation is running, - * N = threadidx * gs_max_out_vertices + emitidx + * N = threadidx * gs.vertices_out + emitidx * * Goals of the LDS memory layout: * 1. Eliminate bank conflicts on write for geometry shaders that have all emits @@ -1458,7 +1458,7 @@ static LLVMValueRef ngg_gs_get_vertex_storage(struct si_shader_context *ctx) * * Swizzling is required to satisfy points 1 and 2 simultaneously. * - * Vertices are stored in export order (gsthread * gs_max_out_vertices + emitidx). + * Vertices are stored in export order (gsthread * gs.vertices_out + emitidx). * Indices are swizzled in groups of 32, which ensures point 1 without * disturbing point 2. * @@ -1470,8 +1470,8 @@ static LLVMValueRef ngg_gs_vertex_ptr(struct si_shader_context *ctx, LLVMValueRe LLVMBuilderRef builder = ctx->ac.builder; LLVMValueRef storage = ngg_gs_get_vertex_storage(ctx); - /* gs_max_out_vertices = 2^(write_stride_2exp) * some odd number */ - unsigned write_stride_2exp = ffs(sel->gs_max_out_vertices) - 1; + /* gs.vertices_out = 2^(write_stride_2exp) * some odd number */ + unsigned write_stride_2exp = ffs(sel->info.base.gs.vertices_out) - 1; if (write_stride_2exp) { LLVMValueRef row = LLVMBuildLShr(builder, vertexidx, LLVMConstInt(ctx->ac.i32, 5, false), ""); LLVMValueRef swizzle = LLVMBuildAnd( @@ -1489,7 +1489,7 @@ static LLVMValueRef ngg_gs_emit_vertex_ptr(struct si_shader_context *ctx, LLVMVa LLVMBuilderRef builder = ctx->ac.builder; LLVMValueRef tmp; - tmp = LLVMConstInt(ctx->ac.i32, sel->gs_max_out_vertices, false); + tmp = LLVMConstInt(ctx->ac.i32, sel->info.base.gs.vertices_out, false); tmp = LLVMBuildMul(builder, tmp, gsthread, ""); const LLVMValueRef vertexidx = LLVMBuildAdd(builder, tmp, emitidx, ""); return ngg_gs_vertex_ptr(ctx, vertexidx); @@ -1531,7 +1531,7 @@ void gfx10_ngg_gs_emit_vertex(struct si_shader_context *ctx, unsigned stream, LL */ const LLVMValueRef can_emit = LLVMBuildICmp(builder, LLVMIntULT, vertexidx, - LLVMConstInt(ctx->ac.i32, sel->gs_max_out_vertices, false), ""); + LLVMConstInt(ctx->ac.i32, sel->info.base.gs.vertices_out, false), ""); tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, ""); tmp = LLVMBuildSelect(builder, can_emit, tmp, vertexidx, ""); @@ -1557,7 +1557,7 @@ void gfx10_ngg_gs_emit_vertex(struct si_shader_context *ctx, unsigned stream, LL /* Determine and store whether this vertex completed a primitive. */ const LLVMValueRef curverts = LLVMBuildLoad(builder, ctx->gs_curprim_verts[stream], ""); - tmp = LLVMConstInt(ctx->ac.i32, u_vertices_per_prim(sel->gs_output_prim) - 1, false); + tmp = LLVMConstInt(ctx->ac.i32, u_vertices_per_prim(sel->info.base.gs.output_primitive) - 1, false); const LLVMValueRef iscompleteprim = LLVMBuildICmp(builder, LLVMIntUGE, curverts, tmp, ""); /* Since the geometry shader emits triangle strips, we need to @@ -1565,7 +1565,7 @@ void gfx10_ngg_gs_emit_vertex(struct si_shader_context *ctx, unsigned stream, LL * the correct vertex order. */ LLVMValueRef is_odd = ctx->ac.i1false; - if (stream == 0 && u_vertices_per_prim(sel->gs_output_prim) == 3) { + if (stream == 0 && u_vertices_per_prim(sel->info.base.gs.output_primitive) == 3) { tmp = LLVMBuildAnd(builder, curverts, ctx->ac.i32_1, ""); is_odd = LLVMBuildICmp(builder, LLVMIntEQ, tmp, ctx->ac.i32_1, ""); } @@ -1615,7 +1615,7 @@ void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx) { const struct si_shader_selector *sel = ctx->shader->selector; const struct si_shader_info *info = &sel->info; - const unsigned verts_per_prim = u_vertices_per_prim(sel->gs_output_prim); + const unsigned verts_per_prim = u_vertices_per_prim(sel->info.base.gs.output_primitive); LLVMBuilderRef builder = ctx->ac.builder; LLVMValueRef i8_0 = LLVMConstInt(ctx->ac.i8, 0, false); LLVMValueRef tmp, tmp2; @@ -1637,7 +1637,7 @@ void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx) const LLVMValueRef vertexidx = LLVMBuildLoad(builder, ctx->gs_next_vertex[stream], ""); tmp = LLVMBuildICmp(builder, LLVMIntUGE, vertexidx, - LLVMConstInt(ctx->ac.i32, sel->gs_max_out_vertices, false), ""); + LLVMConstInt(ctx->ac.i32, sel->info.base.gs.vertices_out, false), ""); ac_build_ifcc(&ctx->ac, tmp, 5101); ac_build_break(&ctx->ac); ac_build_endif(&ctx->ac, 5101); @@ -1905,7 +1905,7 @@ bool gfx10_ngg_calculate_subgroup_info(struct si_shader *shader) const struct si_shader_selector *es_sel = shader->previous_stage_sel ? shader->previous_stage_sel : gs_sel; const gl_shader_stage gs_stage = gs_sel->info.stage; - const unsigned gs_num_invocations = MAX2(gs_sel->gs_num_invocations, 1); + const unsigned gs_num_invocations = MAX2(gs_sel->info.base.gs.invocations, 1); const unsigned input_prim = si_get_input_prim(gs_sel); const bool use_adjacency = input_prim >= PIPE_PRIM_LINES_ADJACENCY && input_prim <= PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY; @@ -1946,7 +1946,7 @@ bool gfx10_ngg_calculate_subgroup_info(struct si_shader *shader) if (gs_stage == MESA_SHADER_GEOMETRY) { bool force_multi_cycling = false; - unsigned max_out_verts_per_gsprim = gs_sel->gs_max_out_vertices * gs_num_invocations; + unsigned max_out_verts_per_gsprim = gs_sel->info.base.gs.vertices_out * gs_num_invocations; retry_select_mode: if (max_out_verts_per_gsprim <= 256 && !force_multi_cycling) { @@ -1959,7 +1959,7 @@ retry_select_mode: * tessellation. */ max_vert_out_per_gs_instance = true; max_gsprims_base = 1; - max_out_verts_per_gsprim = gs_sel->gs_max_out_vertices; + max_out_verts_per_gsprim = gs_sel->info.base.gs.vertices_out; } esvert_lds_size = es_sel->esgs_itemsize / 4; @@ -2050,9 +2050,9 @@ retry_select_mode: unsigned max_out_vertices = max_vert_out_per_gs_instance - ? gs_sel->gs_max_out_vertices + ? gs_sel->info.base.gs.vertices_out : gs_stage == MESA_SHADER_GEOMETRY - ? max_gsprims * gs_num_invocations * gs_sel->gs_max_out_vertices + ? max_gsprims * gs_num_invocations * gs_sel->info.base.gs.vertices_out : max_esverts; assert(max_out_vertices <= 256); @@ -2060,7 +2060,7 @@ retry_select_mode: if (gs_stage == MESA_SHADER_GEOMETRY) { /* Number of output primitives per GS input primitive after * GS instancing. */ - prim_amp_factor = gs_sel->gs_max_out_vertices; + prim_amp_factor = gs_sel->info.base.gs.vertices_out; } /* The GE only checks against the maximum number of ES verts after diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index 353b3f0f0e3..d60f4667e85 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -433,11 +433,8 @@ struct si_shader_selector { uint16_t lshs_vertex_stride; /* GS parameters. */ - uint16_t gs_max_out_vertices; uint16_t gsvs_vertex_size; ubyte gs_input_verts_per_prim; - ubyte gs_output_prim; - ubyte gs_num_invocations; ubyte max_gs_stream; /* count - 1 */ unsigned max_gsvs_emit_size; uint16_t enabled_streamout_buffer_mask; diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c index c7e331114ac..3bbb18019f3 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c @@ -295,7 +295,7 @@ static void si_llvm_emit_vertex(struct ac_shader_abi *abi, unsigned stream, LLVM */ can_emit = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, gs_next_vertex, - LLVMConstInt(ctx->ac.i32, shader->selector->gs_max_out_vertices, 0), ""); + LLVMConstInt(ctx->ac.i32, shader->selector->info.base.gs.vertices_out, 0), ""); bool use_kill = !info->base.writes_memory; if (use_kill) { @@ -313,7 +313,7 @@ static void si_llvm_emit_vertex(struct ac_shader_abi *abi, unsigned stream, LLVM LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], ""); LLVMValueRef voffset = - LLVMConstInt(ctx->ac.i32, offset * shader->selector->gs_max_out_vertices, 0); + LLVMConstInt(ctx->ac.i32, offset * shader->selector->info.base.gs.vertices_out, 0); offset++; voffset = LLVMBuildAdd(ctx->ac.builder, voffset, gs_next_vertex, ""); @@ -402,7 +402,7 @@ void si_preload_gs_rings(struct si_shader_context *ctx) if (!num_components) continue; - stride = 4 * num_components * sel->gs_max_out_vertices; + stride = 4 * num_components * sel->info.base.gs.vertices_out; /* Limit on the stride field for <= GFX7. */ assert(stride < (1 << 14)); @@ -535,7 +535,7 @@ struct si_shader *si_generate_gs_copy_shader(struct si_screen *sscreen, } LLVMValueRef soffset = - LLVMConstInt(ctx.ac.i32, offset * gs_selector->gs_max_out_vertices * 16 * 4, 0); + LLVMConstInt(ctx.ac.i32, offset * gs_selector->info.base.gs.vertices_out * 16 * 4, 0); offset++; outputs[i].values[chan] = diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 61535c133f5..ef442bde331 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -617,7 +617,7 @@ static void si_shader_es(struct si_screen *sscreen, struct si_shader *shader) void gfx9_get_gs_info(struct si_shader_selector *es, struct si_shader_selector *gs, struct gfx9_gs_info *out) { - unsigned gs_num_invocations = MAX2(gs->gs_num_invocations, 1); + unsigned gs_num_invocations = MAX2(gs->info.base.gs.invocations, 1); unsigned input_prim = gs->info.base.gs.input_primitive; bool uses_adjacency = input_prim >= PIPE_PRIM_LINES_ADJACENCY && input_prim <= PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY; @@ -644,9 +644,9 @@ void gfx9_get_gs_info(struct si_shader_selector *es, struct si_shader_selector * /* MAX_PRIMS_PER_SUBGROUP = gs_prims * max_vert_out * gs_invocations. * Make sure we don't go over the maximum value. */ - if (gs->gs_max_out_vertices > 0) { + if (gs->info.base.gs.vertices_out > 0) { max_gs_prims = - MIN2(max_gs_prims, max_out_prims / (gs->gs_max_out_vertices * gs_num_invocations)); + MIN2(max_gs_prims, max_out_prims / (gs->info.base.gs.vertices_out * gs_num_invocations)); } assert(max_gs_prims > 0); @@ -701,7 +701,7 @@ void gfx9_get_gs_info(struct si_shader_selector *es, struct si_shader_selector * out->es_verts_per_subgroup = es_verts; out->gs_prims_per_subgroup = gs_prims; out->gs_inst_prims_in_subgroup = gs_prims * gs_num_invocations; - out->max_prims_per_subgroup = out->gs_inst_prims_in_subgroup * gs->gs_max_out_vertices; + out->max_prims_per_subgroup = out->gs_inst_prims_in_subgroup * gs->info.base.gs.vertices_out; out->esgs_ring_size = esgs_lds_size; assert(out->max_prims_per_subgroup <= max_out_prims); @@ -772,7 +772,7 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader) { struct si_shader_selector *sel = shader->selector; const ubyte *num_components = sel->info.num_stream_output_components; - unsigned gs_num_invocations = sel->gs_num_invocations; + unsigned gs_num_invocations = sel->info.base.gs.invocations; struct si_pm4_state *pm4; uint64_t va; unsigned max_stream = sel->max_gs_stream; @@ -784,25 +784,25 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader) pm4->atom.emit = si_emit_shader_gs; - offset = num_components[0] * sel->gs_max_out_vertices; + offset = num_components[0] * sel->info.base.gs.vertices_out; shader->ctx_reg.gs.vgt_gsvs_ring_offset_1 = offset; if (max_stream >= 1) - offset += num_components[1] * sel->gs_max_out_vertices; + offset += num_components[1] * sel->info.base.gs.vertices_out; shader->ctx_reg.gs.vgt_gsvs_ring_offset_2 = offset; if (max_stream >= 2) - offset += num_components[2] * sel->gs_max_out_vertices; + offset += num_components[2] * sel->info.base.gs.vertices_out; shader->ctx_reg.gs.vgt_gsvs_ring_offset_3 = offset; if (max_stream >= 3) - offset += num_components[3] * sel->gs_max_out_vertices; + offset += num_components[3] * sel->info.base.gs.vertices_out; shader->ctx_reg.gs.vgt_gsvs_ring_itemsize = offset; /* The GSVS_RING_ITEMSIZE register takes 15 bits */ assert(offset < (1 << 15)); - shader->ctx_reg.gs.vgt_gs_max_vert_out = sel->gs_max_out_vertices; + shader->ctx_reg.gs.vgt_gs_max_vert_out = sel->info.base.gs.vertices_out; shader->ctx_reg.gs.vgt_gs_vert_itemsize = num_components[0]; shader->ctx_reg.gs.vgt_gs_vert_itemsize_1 = (max_stream >= 1) ? num_components[1] : 0; @@ -1067,7 +1067,7 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader bool window_space = gs_info->stage == MESA_SHADER_VERTEX ? gs_info->base.vs.window_space_position : 0; bool es_enable_prim_id = shader->key.mono.u.vs_export_prim_id || es_info->uses_primid; - unsigned gs_num_invocations = MAX2(gs_sel->gs_num_invocations, 1); + unsigned gs_num_invocations = MAX2(gs_sel->info.base.gs.invocations, 1); unsigned input_prim = si_get_input_prim(gs_sel); bool break_wave_at_eoi = false; struct si_pm4_state *pm4 = si_get_shader_pm4_state(shader); @@ -1187,7 +1187,7 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader if (gs_stage == MESA_SHADER_GEOMETRY) { shader->ctx_reg.ngg.vgt_esgs_ring_itemsize = es_sel->esgs_itemsize / 4; - shader->ctx_reg.ngg.vgt_gs_max_vert_out = gs_sel->gs_max_out_vertices; + shader->ctx_reg.ngg.vgt_gs_max_vert_out = gs_sel->info.base.gs.vertices_out; } else { shader->ctx_reg.ngg.vgt_esgs_ring_itemsize = 1; } @@ -1375,7 +1375,7 @@ static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader, shader->ctx_reg.vs.vgt_primitiveid_en = enable_prim_id; } else { shader->ctx_reg.vs.vgt_gs_mode = - ac_vgt_gs_mode(gs->gs_max_out_vertices, sscreen->info.chip_class); + ac_vgt_gs_mode(gs->info.base.gs.vertices_out, sscreen->info.chip_class); shader->ctx_reg.vs.vgt_primitiveid_en = 0; } @@ -2629,17 +2629,13 @@ static void *si_create_shader_selector(struct pipe_context *ctx, switch (sel->info.stage) { case MESA_SHADER_GEOMETRY: - sel->gs_output_prim = sel->info.base.gs.output_primitive; - /* Only possibilities: POINTS, LINE_STRIP, TRIANGLES */ - sel->rast_prim = sel->gs_output_prim; + sel->rast_prim = sel->info.base.gs.output_primitive; if (util_rast_prim_is_triangles(sel->rast_prim)) sel->rast_prim = PIPE_PRIM_TRIANGLES; - sel->gs_max_out_vertices = sel->info.base.gs.vertices_out; - sel->gs_num_invocations = sel->info.base.gs.invocations; sel->gsvs_vertex_size = sel->info.num_outputs * 16; - sel->max_gsvs_emit_size = sel->gsvs_vertex_size * sel->gs_max_out_vertices; + sel->max_gsvs_emit_size = sel->gsvs_vertex_size * sel->info.base.gs.vertices_out; sel->max_gs_stream = 0; for (i = 0; i < sel->so.num_outputs; i++) @@ -2650,12 +2646,12 @@ static void *si_create_shader_selector(struct pipe_context *ctx, /* EN_MAX_VERT_OUT_PER_GS_INSTANCE does not work with tesselation so * we can't split workgroups. Disable ngg if any of the following conditions is true: - * - num_invocations * gs_max_out_vertices > 256 + * - num_invocations * gs.vertices_out > 256 * - LDS usage is too high */ sel->tess_turns_off_ngg = sscreen->info.chip_class >= GFX10 && - (sel->gs_num_invocations * sel->gs_max_out_vertices > 256 || - sel->gs_num_invocations * sel->gs_max_out_vertices * + (sel->info.base.gs.invocations * sel->info.base.gs.vertices_out > 256 || + sel->info.base.gs.invocations * sel->info.base.gs.vertices_out * (sel->info.num_outputs * 4 + 1) > 6500 /* max dw per GS primitive */); break; -- 2.30.2