radeonsi: fix NGG culling for Wave64

author Marek Olšák <marek.olsak@amd.com>

Tue, 16 Jun 2020 18:52:19 +0000 (14:52 -0400)

committer Marge Bot <eric+marge@anholt.net>

Tue, 30 Jun 2020 10:56:41 +0000 (10:56 +0000)
author Marek Olšák <marek.olsak@amd.com>
Tue, 16 Jun 2020 18:52:19 +0000 (14:52 -0400)
committer Marge Bot <eric+marge@anholt.net>
Tue, 30 Jun 2020 10:56:41 +0000 (10:56 +0000)
diff --git a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c

index 01db3b2cd24a58aca919e8e2f7eba433771e3aa3..36e84ec12e0a009a54fef120c51d85e002ccf0ed 100644 (file)
--- a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
+++ b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
@@ -713,14 +713,22 @@ static void update_thread_counts(struct si_shader_context *ctx, LLVMValueRef *ne
   * Also return the position, which is passed to the shader as an input,
   * so that we don't compute it twice.
   */
-void gfx10_emit_ngg_culling_epilogue_4x_wave32(struct ac_shader_abi *abi, unsigned max_outputs,
-                                               LLVMValueRef *addrs)
+void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi, unsigned max_outputs,
+                                     LLVMValueRef *addrs)
  {
     struct si_shader_context *ctx = si_shader_context_from_abi(abi);
     struct si_shader *shader = ctx->shader;
     struct si_shader_selector *sel = shader->selector;
     struct si_shader_info *info = &sel->info;
     LLVMBuilderRef builder = ctx->ac.builder;
+   unsigned max_waves = ctx->ac.wave_size == 64 ? 2 : 4;
+   LLVMValueRef ngg_scratch = ctx->gs_ngg_scratch;
+
+   if (ctx->ac.wave_size == 64) {
+      ngg_scratch =  LLVMBuildPointerCast(builder, ngg_scratch,
+                                          LLVMPointerType(LLVMArrayType(ctx->ac.i64, max_waves),
+                                                          AC_ADDR_SPACE_LDS), "");
+   }
  
     assert(shader->key.opt.ngg_culling);
     assert(shader->key.as_ngg);
@@ -799,19 +807,20 @@ void gfx10_emit_ngg_culling_epilogue_4x_wave32(struct ac_shader_abi *abi, unsign
  
     LLVMValueRef tid = ac_get_thread_id(&ctx->ac);
  
-   /* Initialize the last 3 gs_ngg_scratch dwords to 0, because we may have less
-    * than 4 waves, but we always read all 4 values. This is where the thread
-    * bitmasks of unculled threads will be stored.
+   /* Initialize all but the first element of ngg_scratch to 0, because we may have fewer
+    * than the maximum number of waves, but we always read all values. This is where
+    * the thread bitmasks of unculled threads will be stored.
      *
-    * gs_ngg_scratch layout: esmask[0..3]
+    * ngg_scratch layout: iN_wavemask esmask[0..max_waves-1]
      */
     ac_build_ifcc(&ctx->ac,
                   LLVMBuildICmp(builder, LLVMIntULT, get_thread_id_in_tg(ctx),
-                               LLVMConstInt(ctx->ac.i32, 3, 0), ""),
+                               LLVMConstInt(ctx->ac.i32, max_waves - 1, 0), ""),
                   16101);
     {
        LLVMValueRef index = LLVMBuildAdd(builder, tid, ctx->ac.i32_1, "");
-      LLVMBuildStore(builder, ctx->ac.i32_0, ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, index));
+      LLVMBuildStore(builder, LLVMConstInt(ctx->ac.iN_wavemask, 0, 0),
+                     ac_build_gep0(&ctx->ac, ngg_scratch, index));
     }
     ac_build_endif(&ctx->ac, 16101);
     ac_build_s_barrier(&ctx->ac);
@@ -952,7 +961,7 @@ void gfx10_emit_ngg_culling_epilogue_4x_wave32(struct ac_shader_abi *abi, unsign
        ac_build_ifcc(&ctx->ac, LLVMBuildICmp(builder, LLVMIntEQ, tid, ctx->ac.i32_0, ""), 16008);
        {
           LLVMBuildStore(builder, es_mask,
-                        ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, get_wave_id_in_tg(ctx)));
+                        ac_build_gep0(&ctx->ac, ngg_scratch, get_wave_id_in_tg(ctx)));
        }
        ac_build_endif(&ctx->ac, 16008);
     }
@@ -961,7 +970,7 @@ void gfx10_emit_ngg_culling_epilogue_4x_wave32(struct ac_shader_abi *abi, unsign
  
     /* Load the vertex masks and compute the new ES thread count. */
     LLVMValueRef es_mask[2], new_num_es_threads, kill_wave;
-   load_bitmasks_2x64(ctx, ctx->gs_ngg_scratch, 0, es_mask, &new_num_es_threads);
+   load_bitmasks_2x64(ctx, ngg_scratch, 0, es_mask, &new_num_es_threads);
     new_num_es_threads = ac_build_readlane_no_opt_barrier(&ctx->ac, new_num_es_threads, NULL);
  
     /* ES threads compute their prefix sum, which is the new ES thread ID.
diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h b/src/gallium/drivers/radeonsi/si_shader_internal.h

index f5d6d629f3555b5e85dd0f7347b5e80747522281..4386e07caccbeaad073a89ee7a51120bd22bb6b5 100644 (file)
--- a/src/gallium/drivers/radeonsi/si_shader_internal.h
+++ b/src/gallium/drivers/radeonsi/si_shader_internal.h
@@ -213,8 +213,8 @@ bool gfx10_ngg_export_prim_early(struct si_shader *shader);
  void gfx10_ngg_build_sendmsg_gs_alloc_req(struct si_shader_context *ctx);
  void gfx10_ngg_build_export_prim(struct si_shader_context *ctx, LLVMValueRef user_edgeflags[3],
                                   LLVMValueRef prim_passthrough);
-void gfx10_emit_ngg_culling_epilogue_4x_wave32(struct ac_shader_abi *abi, unsigned max_outputs,
-                                               LLVMValueRef *addrs);
+void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi, unsigned max_outputs,
+                                     LLVMValueRef *addrs);
  void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs);
  void gfx10_ngg_gs_emit_vertex(struct si_shader_context *ctx, unsigned stream, LLVMValueRef *addrs);
  void gfx10_ngg_gs_emit_prologue(struct si_shader_context *ctx);
diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c b/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c

index 5dba9859988b25912ee2688ec3ea089a2d89672c..627e0ffdd5d16b5fe289697b422e4140f862fbc7 100644 (file)
--- a/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c
+++ b/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c
@@ -1149,7 +1149,7 @@ void si_llvm_init_tes_callbacks(struct si_shader_context *ctx, bool ngg_cull_sha
     if (ctx->shader->key.as_es)
        ctx->abi.emit_outputs = si_llvm_emit_es_epilogue;
     else if (ngg_cull_shader)
-      ctx->abi.emit_outputs = gfx10_emit_ngg_culling_epilogue_4x_wave32;
+      ctx->abi.emit_outputs = gfx10_emit_ngg_culling_epilogue;
     else if (ctx->shader->key.as_ngg)
        ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue;
     else
diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c

index 69022cd723483a6fecf01e5d902bdae7fa217360..a168cec39d74d68146c4f3eb9ba4cb91a1f574e4 100644 (file)
--- a/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c
+++ b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c
@@ -1013,7 +1013,7 @@ void si_llvm_init_vs_callbacks(struct si_shader_context *ctx, bool ngg_cull_shad
     else if (shader->key.opt.vs_as_prim_discard_cs)
        ctx->abi.emit_outputs = si_llvm_emit_prim_discard_cs_epilogue;
     else if (ngg_cull_shader)
-      ctx->abi.emit_outputs = gfx10_emit_ngg_culling_epilogue_4x_wave32;
+      ctx->abi.emit_outputs = gfx10_emit_ngg_culling_epilogue;
     else if (shader->key.as_ngg)
        ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue;
     else
author	Marek Olšák <marek.olsak@amd.com>
	Tue, 16 Jun 2020 18:52:19 +0000 (14:52 -0400)
committer	Marge Bot <eric+marge@anholt.net>
	Tue, 30 Jun 2020 10:56:41 +0000 (10:56 +0000)
src/gallium/drivers/radeonsi/gfx10_shader_ngg.c		patch \| blob \| history
src/gallium/drivers/radeonsi/si_shader_internal.h		patch \| blob \| history
src/gallium/drivers/radeonsi/si_shader_llvm_tess.c		patch \| blob \| history
src/gallium/drivers/radeonsi/si_shader_llvm_vs.c		patch \| blob \| history