From: Marek Olšák Date: Tue, 16 Jun 2020 18:52:19 +0000 (-0400) Subject: radeonsi: fix NGG culling for Wave64 X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=8fff9beb44b18f6f3077f0460d383aebcf77d176;p=mesa.git radeonsi: fix NGG culling for Wave64 Acked-by: Pierre-Eric Pelloux-Prayer Part-of: --- diff --git a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c index 01db3b2cd24..36e84ec12e0 100644 --- a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c +++ b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c @@ -713,14 +713,22 @@ static void update_thread_counts(struct si_shader_context *ctx, LLVMValueRef *ne * Also return the position, which is passed to the shader as an input, * so that we don't compute it twice. */ -void gfx10_emit_ngg_culling_epilogue_4x_wave32(struct ac_shader_abi *abi, unsigned max_outputs, - LLVMValueRef *addrs) +void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, + LLVMValueRef *addrs) { struct si_shader_context *ctx = si_shader_context_from_abi(abi); struct si_shader *shader = ctx->shader; struct si_shader_selector *sel = shader->selector; struct si_shader_info *info = &sel->info; LLVMBuilderRef builder = ctx->ac.builder; + unsigned max_waves = ctx->ac.wave_size == 64 ? 2 : 4; + LLVMValueRef ngg_scratch = ctx->gs_ngg_scratch; + + if (ctx->ac.wave_size == 64) { + ngg_scratch = LLVMBuildPointerCast(builder, ngg_scratch, + LLVMPointerType(LLVMArrayType(ctx->ac.i64, max_waves), + AC_ADDR_SPACE_LDS), ""); + } assert(shader->key.opt.ngg_culling); assert(shader->key.as_ngg); @@ -799,19 +807,20 @@ void gfx10_emit_ngg_culling_epilogue_4x_wave32(struct ac_shader_abi *abi, unsign LLVMValueRef tid = ac_get_thread_id(&ctx->ac); - /* Initialize the last 3 gs_ngg_scratch dwords to 0, because we may have less - * than 4 waves, but we always read all 4 values. This is where the thread - * bitmasks of unculled threads will be stored. + /* Initialize all but the first element of ngg_scratch to 0, because we may have less + * than the maximum number of waves, but we always read all values. This is where + * the thread bitmasks of unculled threads will be stored. * - * gs_ngg_scratch layout: esmask[0..3] + * ngg_scratch layout: iN_wavemask esmask[0..n] */ ac_build_ifcc(&ctx->ac, LLVMBuildICmp(builder, LLVMIntULT, get_thread_id_in_tg(ctx), - LLVMConstInt(ctx->ac.i32, 3, 0), ""), + LLVMConstInt(ctx->ac.i32, max_waves - 1, 0), ""), 16101); { LLVMValueRef index = LLVMBuildAdd(builder, tid, ctx->ac.i32_1, ""); - LLVMBuildStore(builder, ctx->ac.i32_0, ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, index)); + LLVMBuildStore(builder, LLVMConstInt(ctx->ac.iN_wavemask, 0, 0), + ac_build_gep0(&ctx->ac, ngg_scratch, index)); } ac_build_endif(&ctx->ac, 16101); ac_build_s_barrier(&ctx->ac); @@ -952,7 +961,7 @@ void gfx10_emit_ngg_culling_epilogue_4x_wave32(struct ac_shader_abi *abi, unsign ac_build_ifcc(&ctx->ac, LLVMBuildICmp(builder, LLVMIntEQ, tid, ctx->ac.i32_0, ""), 16008); { LLVMBuildStore(builder, es_mask, - ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, get_wave_id_in_tg(ctx))); + ac_build_gep0(&ctx->ac, ngg_scratch, get_wave_id_in_tg(ctx))); } ac_build_endif(&ctx->ac, 16008); } @@ -961,7 +970,7 @@ void gfx10_emit_ngg_culling_epilogue_4x_wave32(struct ac_shader_abi *abi, unsign /* Load the vertex masks and compute the new ES thread count. */ LLVMValueRef es_mask[2], new_num_es_threads, kill_wave; - load_bitmasks_2x64(ctx, ctx->gs_ngg_scratch, 0, es_mask, &new_num_es_threads); + load_bitmasks_2x64(ctx, ngg_scratch, 0, es_mask, &new_num_es_threads); new_num_es_threads = ac_build_readlane_no_opt_barrier(&ctx->ac, new_num_es_threads, NULL); /* ES threads compute their prefix sum, which is the new ES thread ID. diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h b/src/gallium/drivers/radeonsi/si_shader_internal.h index f5d6d629f35..4386e07cacc 100644 --- a/src/gallium/drivers/radeonsi/si_shader_internal.h +++ b/src/gallium/drivers/radeonsi/si_shader_internal.h @@ -213,8 +213,8 @@ bool gfx10_ngg_export_prim_early(struct si_shader *shader); void gfx10_ngg_build_sendmsg_gs_alloc_req(struct si_shader_context *ctx); void gfx10_ngg_build_export_prim(struct si_shader_context *ctx, LLVMValueRef user_edgeflags[3], LLVMValueRef prim_passthrough); -void gfx10_emit_ngg_culling_epilogue_4x_wave32(struct ac_shader_abi *abi, unsigned max_outputs, - LLVMValueRef *addrs); +void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, + LLVMValueRef *addrs); void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs); void gfx10_ngg_gs_emit_vertex(struct si_shader_context *ctx, unsigned stream, LLVMValueRef *addrs); void gfx10_ngg_gs_emit_prologue(struct si_shader_context *ctx); diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c b/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c index 5dba9859988..627e0ffdd5d 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c @@ -1149,7 +1149,7 @@ void si_llvm_init_tes_callbacks(struct si_shader_context *ctx, bool ngg_cull_sha if (ctx->shader->key.as_es) ctx->abi.emit_outputs = si_llvm_emit_es_epilogue; else if (ngg_cull_shader) - ctx->abi.emit_outputs = gfx10_emit_ngg_culling_epilogue_4x_wave32; + ctx->abi.emit_outputs = gfx10_emit_ngg_culling_epilogue; else if (ctx->shader->key.as_ngg) ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue; else diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c index 69022cd7234..a168cec39d7 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c @@ -1013,7 +1013,7 @@ void si_llvm_init_vs_callbacks(struct si_shader_context *ctx, bool ngg_cull_shad else if (shader->key.opt.vs_as_prim_discard_cs) ctx->abi.emit_outputs = si_llvm_emit_prim_discard_cs_epilogue; else if (ngg_cull_shader) - ctx->abi.emit_outputs = gfx10_emit_ngg_culling_epilogue_4x_wave32; + ctx->abi.emit_outputs = gfx10_emit_ngg_culling_epilogue; else if (shader->key.as_ngg) ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue; else