radeonsi: always use Wave32 for GS fast launch, because Wave64 hangs
author Marek Olšák <marek.olsak@amd.com>
Wed, 17 Jun 2020 15:45:16 +0000 (11:45 -0400)
committer Marge Bot <eric+marge@anholt.net>
Tue, 30 Jun 2020 10:56:41 +0000 (10:56 +0000)
Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/5524>

src/gallium/drivers/radeonsi/si_pipe.h
src/gallium/drivers/radeonsi/si_shader.c
src/gallium/drivers/radeonsi/si_shader_llvm_gs.c
src/gallium/drivers/radeonsi/si_state_shaders.c

index f56762a982096742eff036b521bc565f1111c2bf..9ad14cab96a28f44729f26dcfff71bd83400283e 100644 (file)
@@ -1880,12 +1880,14 @@ static inline bool si_compute_prim_discard_enabled(struct si_context *sctx)
 
 static inline unsigned si_get_wave_size(struct si_screen *sscreen,
                                         enum pipe_shader_type shader_type, bool ngg, bool es,
-                                        bool prim_discard_cs)
+                                        bool gs_fast_launch, bool prim_discard_cs)
 {
    if (shader_type == PIPE_SHADER_COMPUTE)
       return sscreen->compute_wave_size;
    else if (shader_type == PIPE_SHADER_FRAGMENT)
       return sscreen->ps_wave_size;
+   else if (gs_fast_launch)
+      return 32; /* GS fast launch hangs with Wave64, so always use Wave32. */
    else if ((shader_type == PIPE_SHADER_VERTEX && prim_discard_cs) || /* only Wave64 implemented */
             (shader_type == PIPE_SHADER_VERTEX && es && !ngg) ||
             (shader_type == PIPE_SHADER_TESS_EVAL && es && !ngg) ||
@@ -1898,7 +1900,9 @@ static inline unsigned si_get_wave_size(struct si_screen *sscreen,
 static inline unsigned si_get_shader_wave_size(struct si_shader *shader)
 {
    return si_get_wave_size(shader->selector->screen, shader->selector->type, shader->key.as_ngg,
-                           shader->key.as_es, shader->key.opt.vs_as_prim_discard_cs);
+                           shader->key.as_es,
+                           shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL,
+                           shader->key.opt.vs_as_prim_discard_cs);
 }
 
 #define PRINT_ERR(fmt, args...)                                                                    \
index cec837d6ebaa6bed1f9bb68647a922110bce9e66..60ff8388d8b6e8121a17d0b3555ff0742a6e46f9 100644 (file)
@@ -1967,6 +1967,9 @@ si_get_shader_part(struct si_screen *sscreen, struct si_shader_part **list,
       shader.key.as_ls = key->vs_prolog.as_ls;
       shader.key.as_es = key->vs_prolog.as_es;
       shader.key.as_ngg = key->vs_prolog.as_ngg;
+      shader.key.opt.ngg_culling =
+         (key->vs_prolog.gs_fast_launch_tri_list ? SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST : 0) |
+         (key->vs_prolog.gs_fast_launch_tri_strip ? SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP : 0);
       shader.key.opt.vs_as_prim_discard_cs = key->vs_prolog.as_prim_discard_cs;
       break;
    case PIPE_SHADER_TESS_CTRL:
@@ -1990,6 +1993,7 @@ si_get_shader_part(struct si_screen *sscreen, struct si_shader_part **list,
    struct si_shader_context ctx;
    si_llvm_context_init(&ctx, sscreen, compiler,
                         si_get_wave_size(sscreen, type, shader.key.as_ngg, shader.key.as_es,
+                                         shader.key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL,
                                          shader.key.opt.vs_as_prim_discard_cs));
    ctx.shader = &shader;
    ctx.type = type;
index 2a609572d841c1c17b75edb5e68f7efee2460beb..fc14b642b1be5a650986a7498552b1d57692118f 100644 (file)
@@ -474,7 +474,8 @@ struct si_shader *si_generate_gs_copy_shader(struct si_screen *sscreen,
    shader->is_gs_copy_shader = true;
 
    si_llvm_context_init(&ctx, sscreen, compiler,
-                        si_get_wave_size(sscreen, PIPE_SHADER_VERTEX, false, false, false));
+                        si_get_wave_size(sscreen, PIPE_SHADER_VERTEX,
+                                         false, false, false, false));
    ctx.shader = shader;
    ctx.type = PIPE_SHADER_VERTEX;
 
index 5945a47b16727da2a6ba4a2cfcd783df2c9842d4..b4e95b78549429eab26d2c91217bbd2f89da31fa 100644 (file)
@@ -69,7 +69,7 @@ void si_get_ir_cache_key(struct si_shader_selector *sel, bool ngg, bool es,
       shader_variant_flags |= 1 << 0;
    if (sel->nir)
       shader_variant_flags |= 1 << 1;
-   if (si_get_wave_size(sel->screen, sel->type, ngg, es, false) == 32)
+   if (si_get_wave_size(sel->screen, sel->type, ngg, es, false, false) == 32)
       shader_variant_flags |= 1 << 2;
    if (sel->type == PIPE_SHADER_FRAGMENT && sel->info.uses_derivatives && sel->info.uses_kill &&
        sel->screen->debug_flags & DBG(FS_CORRECT_DERIVS_AFTER_KILL))
@@ -1120,11 +1120,13 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
    else
       gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, 1 */
 
+   unsigned wave_size = si_get_shader_wave_size(shader);
+
    si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
    si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, va >> 40);
    si_pm4_set_reg(
       pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS,
-      S_00B228_VGPRS((shader->config.num_vgprs - 1) / (sscreen->ge_wave_size == 32 ? 8 : 4)) |
+      S_00B228_VGPRS((shader->config.num_vgprs - 1) / (wave_size == 32 ? 8 : 4)) |
          S_00B228_FLOAT_MODE(shader->config.float_mode) | S_00B228_DX10_CLAMP(1) |
          S_00B228_MEM_ORDERED(1) | S_00B228_WGP_MODE(1) |
          S_00B228_GS_VGPR_COMP_CNT(gs_vgpr_comp_cnt));
@@ -3692,7 +3694,9 @@ static struct si_pm4_state *si_build_vgt_shader_config(struct si_screen *screen,
    if (screen->info.chip_class >= GFX9)
       stages |= S_028B54_MAX_PRIMGRP_IN_WAVE(2);
 
-   if (screen->info.chip_class >= GFX10 && screen->ge_wave_size == 32) {
+   if (screen->info.chip_class >= GFX10 &&
+       /* GS fast launch hangs with Wave64, so always use Wave32. */
+       (screen->ge_wave_size == 32 || (key.u.ngg && key.u.ngg_gs_fast_launch))) {
       stages |= S_028B54_HS_W32_EN(1) |
                 S_028B54_GS_W32_EN(key.u.ngg) | /* legacy GS only supports Wave64 */
                 S_028B54_VS_W32_EN(1);