From 0f30223cf4f8dd23211052669576a4bf59631a8b Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Sat, 6 Jul 2019 00:12:26 -0400 Subject: [PATCH] radeonsi/gfx10: combine hw edgeflags with user edgeflags for correct behavior Acked-by: Pierre-Eric Pelloux-Prayer Reviewed-by: Samuel Pitoiset --- .../drivers/radeonsi/gfx10_shader_ngg.c | 58 +++++++++++++++++-- src/gallium/drivers/radeonsi/si_shader.c | 6 +- src/gallium/drivers/radeonsi/si_shader.h | 2 + .../drivers/radeonsi/si_state_shaders.c | 26 +++++---- 4 files changed, 73 insertions(+), 19 deletions(-) diff --git a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c index 5c410d79ec1..6b3c1017fb2 100644 --- a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c +++ b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c @@ -500,8 +500,12 @@ static unsigned ngg_nogs_vertex_size(struct si_shader *shader) { unsigned lds_vertex_size = 0; + /* The edgeflag is always stored in the last element that's also + * used for padding to reduce LDS bank conflicts. */ if (shader->selector->so.num_outputs) lds_vertex_size = 4 * shader->selector->info.num_outputs + 1; + if (shader->selector->ngg_writes_edgeflag) + lds_vertex_size = MAX2(lds_vertex_size, 1); return lds_vertex_size; } @@ -541,7 +545,7 @@ void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi, LLVMValueRef vertex_ptr = NULL; - if (sel->so.num_outputs) + if (sel->so.num_outputs || sel->ngg_writes_edgeflag) vertex_ptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx)); for (unsigned i = 0; i < info->num_outputs; i++) { @@ -563,6 +567,19 @@ void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi, LLVMBuildStore(builder, tmp2, tmp); } } + + /* Store the edgeflag at the end (if streamout is enabled) */ + if (info->output_semantic_name[i] == TGSI_SEMANTIC_EDGEFLAG && + sel->ngg_writes_edgeflag) { + LLVMValueRef edgeflag = LLVMBuildLoad(builder, addrs[4 * i], ""); + /* The output is a float, but the hw expects a 1-bit integer. */ + edgeflag = LLVMBuildFPToUI(ctx->ac.builder, edgeflag, ctx->i32, ""); + edgeflag = ac_build_umin(&ctx->ac, edgeflag, ctx->i32_1); + + tmp = LLVMConstInt(ctx->i32, ngg_nogs_vertex_size(ctx->shader) - 1, 0); + tmp = ac_build_gep0(&ctx->ac, vertex_ptr, tmp); + LLVMBuildStore(builder, edgeflag, tmp); + } } lp_build_endif(&ctx->merged_wrap_if_state); @@ -623,13 +640,35 @@ void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi, emitted_prims = nggso.emit[0]; } + LLVMValueRef user_edgeflags[3] = {}; + + if (sel->ngg_writes_edgeflag) { + /* Streamout already inserted the barrier, so don't insert it again. */ + if (!sel->so.num_outputs) + ac_build_s_barrier(&ctx->ac); + + ac_build_ifcc(&ctx->ac, is_gs_thread, 5400); + /* Load edge flags from ES threads and store them into VGPRs in GS threads. */ + for (unsigned i = 0; i < num_vertices; i++) { + tmp = ngg_nogs_vertex_ptr(ctx, vtxindex[i]); + tmp2 = LLVMConstInt(ctx->i32, ngg_nogs_vertex_size(ctx->shader) - 1, 0); + tmp = ac_build_gep0(&ctx->ac, tmp, tmp2); + tmp = LLVMBuildLoad(builder, tmp, ""); + tmp = LLVMBuildTrunc(builder, tmp, ctx->i1, ""); + + user_edgeflags[i] = ac_build_alloca_undef(&ctx->ac, ctx->i1, ""); + LLVMBuildStore(builder, tmp, user_edgeflags[i]); + } + ac_build_endif(&ctx->ac, 5400); + } + /* Copy Primitive IDs from GS threads to the LDS address corresponding * to the ES thread of the provoking vertex. */ if (ctx->type == PIPE_SHADER_VERTEX && ctx->shader->key.mono.u.vs_export_prim_id) { - /* Streamout uses LDS. We need to wait for it before we can reuse it. */ - if (sel->so.num_outputs) + /* Streamout and edge flags use LDS. Make it idle, so that we can reuse it. */ + if (sel->so.num_outputs || sel->ngg_writes_edgeflag) ac_build_s_barrier(&ctx->ac); ac_build_ifcc(&ctx->ac, is_gs_thread, 5400); @@ -647,8 +686,6 @@ void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi, ac_build_endif(&ctx->ac, 5400); } - /* TODO: primitive culling */ - build_sendmsg_gs_alloc_req(ctx, ngg_get_vtx_cnt(ctx), ngg_get_prim_cnt(ctx)); /* Update query buffer */ @@ -711,9 +748,20 @@ void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi, memcpy(prim.index, vtxindex, sizeof(vtxindex[0]) * 3); for (unsigned i = 0; i < num_vertices; ++i) { + if (ctx->type != PIPE_SHADER_VERTEX) { + prim.edgeflag[i] = ctx->i1false; + continue; + } + tmp = LLVMBuildLShr(builder, ctx->abi.gs_invocation_id, LLVMConstInt(ctx->ac.i32, 8 + i, false), ""); prim.edgeflag[i] = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); + + if (sel->ngg_writes_edgeflag) { + tmp2 = LLVMBuildLoad(builder, user_edgeflags[i], ""); + prim.edgeflag[i] = LLVMBuildAnd(builder, prim.edgeflag[i], + tmp2, ""); + } } build_export_prim(ctx, &prim); diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 75f2c241d33..a3578ebf720 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -2964,11 +2964,11 @@ void si_llvm_export_vs(struct si_shader_context *ctx, /* Write the misc vector (point size, edgeflag, layer, viewport). */ if (shader->selector->info.writes_psize || - shader->selector->info.writes_edgeflag || + shader->selector->pos_writes_edgeflag || shader->selector->info.writes_viewport_index || shader->selector->info.writes_layer) { pos_args[1].enabled_channels = shader->selector->info.writes_psize | - (shader->selector->info.writes_edgeflag << 1) | + (shader->selector->pos_writes_edgeflag << 1) | (shader->selector->info.writes_layer << 2); pos_args[1].valid_mask = 0; /* EXEC mask */ @@ -2983,7 +2983,7 @@ void si_llvm_export_vs(struct si_shader_context *ctx, if (shader->selector->info.writes_psize) pos_args[1].out[0] = psize_value; - if (shader->selector->info.writes_edgeflag) { + if (shader->selector->pos_writes_edgeflag) { /* The output is a float, but the hw expects an integer * with the first bit containing the edge flag. */ edgeflag_value = LLVMBuildFPToUI(ctx->ac.builder, diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index 070ccd2ad17..211e2db1697 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -351,6 +351,8 @@ struct si_shader_selector { bool vs_needs_prolog; bool force_correct_derivs_after_kill; bool prim_discard_cs_allowed; + bool ngg_writes_edgeflag; + bool pos_writes_edgeflag; unsigned pa_cl_vs_out_cntl; ubyte clipdist_mask; ubyte culldist_mask; diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index b381b3953bc..52b3489509d 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -1216,17 +1216,13 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader S_028B90_EN_MAX_VERT_OUT_PER_GS_INSTANCE( shader->ngg.max_vert_out_per_gs_instance); - /* User edge flags are set by the pos exports. If user edge flags are - * not used, we must use hw-generated edge flags and pass them via - * the prim export to prevent drawing lines on internal edges of - * decomposed primitives (such as quads) with polygon mode = lines. - * - * TODO: We should combine hw-generated edge flags with user edge - * flags in the shader. + /* Always output hw-generated edge flags and pass them via the prim + * export to prevent drawing lines on internal edges of decomposed + * primitives (such as quads) with polygon mode = lines. Only VS needs + * this. */ shader->ctx_reg.ngg.pa_cl_ngg_cntl = - S_028838_INDEX_BUF_EDGE_FLAG_ENA(gs_type == PIPE_SHADER_VERTEX && - !gs_info->writes_edgeflag); + S_028838_INDEX_BUF_EDGE_FLAG_ENA(gs_type == PIPE_SHADER_VERTEX); shader->ge_cntl = S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) | @@ -2671,6 +2667,14 @@ static void *si_create_shader_selector(struct pipe_context *ctx, !sel->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION] && !sel->so.num_outputs; + if (sel->type == PIPE_SHADER_VERTEX && + sel->info.writes_edgeflag) { + if (sscreen->info.chip_class >= GFX10) + sel->ngg_writes_edgeflag = true; + else + sel->pos_writes_edgeflag = true; + } + /* Set which opcode uses which (i,j) pair. */ if (sel->info.uses_persp_opcode_interp_centroid) sel->info.uses_persp_centroid = true; @@ -2817,11 +2821,11 @@ static void *si_create_shader_selector(struct pipe_context *ctx, /* PA_CL_VS_OUT_CNTL */ bool misc_vec_ena = - sel->info.writes_psize || sel->info.writes_edgeflag || + sel->info.writes_psize || sel->pos_writes_edgeflag || sel->info.writes_layer || sel->info.writes_viewport_index; sel->pa_cl_vs_out_cntl = S_02881C_USE_VTX_POINT_SIZE(sel->info.writes_psize) | - S_02881C_USE_VTX_EDGE_FLAG(sel->info.writes_edgeflag) | + S_02881C_USE_VTX_EDGE_FLAG(sel->pos_writes_edgeflag) | S_02881C_USE_VTX_RENDER_TARGET_INDX(sel->info.writes_layer) | S_02881C_USE_VTX_VIEWPORT_INDX(sel->info.writes_viewport_index) | S_02881C_VS_OUT_MISC_VEC_ENA(misc_vec_ena) | -- 2.30.2