ac: add a bug workaround for the 100% NGG culling case
author Marek Olšák <marek.olsak@amd.com>
Wed, 4 Mar 2020 00:01:17 +0000 (19:01 -0500)
committer Marek Olšák <marek.olsak@amd.com>
Mon, 9 Mar 2020 20:08:11 +0000 (16:08 -0400)
Fixes: 8db00a51f85 - radeonsi/gfx10: implement NGG culling for 4x wave32 subgroups
Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4079>

src/amd/llvm/ac_llvm_build.c

index 69d8f4710ca4b964ae893f3767fad399ee860b8d..760d9123c623c4afe6e9a54808b69aa133808593 100644 (file)
@@ -4799,6 +4799,21 @@ void ac_build_sendmsg_gs_alloc_req(struct ac_llvm_context *ctx, LLVMValueRef wav
 {
        LLVMBuilderRef builder = ctx->builder;
        LLVMValueRef tmp;
+       bool export_dummy_prim = false;
+
+       /* HW workaround for a GPU hang with 100% culling.
+        * We always have to export at least 1 primitive.
+        * Export a degenerate triangle using vertex 0 for all 3 vertices.
+        */
+       if (prim_cnt == ctx->i32_0 &&
+           (ctx->family == CHIP_NAVI10 ||
+            ctx->family == CHIP_NAVI12 ||
+            ctx->family == CHIP_NAVI14)) {
+               assert(vtx_cnt == ctx->i32_0);
+               prim_cnt = ctx->i32_1;
+               vtx_cnt = ctx->i32_1;
+               export_dummy_prim = true;
+       }
 
        ac_build_ifcc(ctx, LLVMBuildICmp(builder, LLVMIntEQ, wave_id, ctx->i32_0, ""), 5020);
 
@@ -4806,6 +4821,24 @@ void ac_build_sendmsg_gs_alloc_req(struct ac_llvm_context *ctx, LLVMValueRef wav
        tmp = LLVMBuildOr(builder, tmp, vtx_cnt, "");
        ac_build_sendmsg(ctx, AC_SENDMSG_GS_ALLOC_REQ, tmp);
 
+       if (export_dummy_prim) {
+               struct ac_ngg_prim prim = {};
+               /* The vertex indices are 0,0,0. */
+               prim.passthrough = ctx->i32_0;
+
+               struct ac_export_args pos = {};
+               pos.out[0] = pos.out[1] = pos.out[2] = pos.out[3] = ctx->f32_0;
+               pos.target = V_008DFC_SQ_EXP_POS;
+               pos.enabled_channels = 0xf;
+               pos.done = true;
+
+               ac_build_ifcc(ctx, LLVMBuildICmp(builder, LLVMIntEQ, ac_get_thread_id(ctx),
+                                                ctx->i32_0, ""), 5021);
+               ac_build_export_prim(ctx, &prim);
+               ac_build_export(ctx, &pos);
+               ac_build_endif(ctx, 5021);
+       }
+
        ac_build_endif(ctx, 5020);
 }