From 9049e39804c876e58b3f9496afed7c055a67e9ee Mon Sep 17 00:00:00 2001
From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?=
Date: Wed, 17 Jun 2020 11:45:16 -0400
Subject: [PATCH] radeonsi: always use Wave32 for GS fast launch, because Wave64 hangs

Acked-by: Pierre-Eric Pelloux-Prayer
Part-of:
---
 src/gallium/drivers/radeonsi/si_pipe.h           |  8 ++++++--
 src/gallium/drivers/radeonsi/si_shader.c         |  4 ++++
 src/gallium/drivers/radeonsi/si_shader_llvm_gs.c |  3 ++-
 src/gallium/drivers/radeonsi/si_state_shaders.c  | 10 +++++++---
 4 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index f56762a9820..9ad14cab96a 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -1880,12 +1880,14 @@ static inline bool si_compute_prim_discard_enabled(struct si_context *sctx)
 
 static inline unsigned si_get_wave_size(struct si_screen *sscreen,
                                         enum pipe_shader_type shader_type, bool ngg, bool es,
-                                        bool prim_discard_cs)
+                                        bool gs_fast_launch, bool prim_discard_cs)
 {
    if (shader_type == PIPE_SHADER_COMPUTE)
       return sscreen->compute_wave_size;
    else if (shader_type == PIPE_SHADER_FRAGMENT)
       return sscreen->ps_wave_size;
+   else if (gs_fast_launch)
+      return 32; /* GS fast launch hangs with Wave64, so always use Wave32. */
    else if ((shader_type == PIPE_SHADER_VERTEX && prim_discard_cs) || /* only Wave64 implemented */
             (shader_type == PIPE_SHADER_VERTEX && es && !ngg) ||
             (shader_type == PIPE_SHADER_TESS_EVAL && es && !ngg) ||
@@ -1898,7 +1900,9 @@ static inline unsigned si_get_wave_size(struct si_screen *sscreen,
 static inline unsigned si_get_shader_wave_size(struct si_shader *shader)
 {
    return si_get_wave_size(shader->selector->screen, shader->selector->type, shader->key.as_ngg,
-                           shader->key.as_es, shader->key.opt.vs_as_prim_discard_cs);
+                           shader->key.as_es,
+                           shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL,
+                           shader->key.opt.vs_as_prim_discard_cs);
 }
 
 #define PRINT_ERR(fmt, args...)                                                                    \
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index cec837d6eba..60ff8388d8b 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -1967,6 +1967,9 @@ si_get_shader_part(struct si_screen *sscreen, struct si_shader_part **list,
       shader.key.as_ls = key->vs_prolog.as_ls;
       shader.key.as_es = key->vs_prolog.as_es;
       shader.key.as_ngg = key->vs_prolog.as_ngg;
+      shader.key.opt.ngg_culling =
+         (key->vs_prolog.gs_fast_launch_tri_list ? SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST : 0) |
+         (key->vs_prolog.gs_fast_launch_tri_strip ? SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP : 0);
       shader.key.opt.vs_as_prim_discard_cs = key->vs_prolog.as_prim_discard_cs;
       break;
    case PIPE_SHADER_TESS_CTRL:
@@ -1990,6 +1993,7 @@ si_get_shader_part(struct si_screen *sscreen, struct si_shader_part **list,
    struct si_shader_context ctx;
    si_llvm_context_init(&ctx, sscreen, compiler,
                         si_get_wave_size(sscreen, type, shader.key.as_ngg, shader.key.as_es,
+                                         shader.key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL,
                                          shader.key.opt.vs_as_prim_discard_cs));
    ctx.shader = &shader;
    ctx.type = type;
diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c
index 2a609572d84..fc14b642b1b 100644
--- a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c
+++ b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c
@@ -474,7 +474,8 @@ struct si_shader *si_generate_gs_copy_shader(struct si_screen *sscreen,
    shader->is_gs_copy_shader = true;
 
    si_llvm_context_init(&ctx, sscreen, compiler,
-                        si_get_wave_size(sscreen, PIPE_SHADER_VERTEX, false, false, false));
+                        si_get_wave_size(sscreen, PIPE_SHADER_VERTEX,
+                                         false, false, false, false));
    ctx.shader = shader;
    ctx.type = PIPE_SHADER_VERTEX;
 
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 5945a47b167..b4e95b78549 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -69,7 +69,7 @@ void si_get_ir_cache_key(struct si_shader_selector *sel, bool ngg, bool es,
       shader_variant_flags |= 1 << 0;
    if (sel->nir)
       shader_variant_flags |= 1 << 1;
-   if (si_get_wave_size(sel->screen, sel->type, ngg, es, false) == 32)
+   if (si_get_wave_size(sel->screen, sel->type, ngg, es, false, false) == 32)
       shader_variant_flags |= 1 << 2;
    if (sel->type == PIPE_SHADER_FRAGMENT && sel->info.uses_derivatives && sel->info.uses_kill &&
        sel->screen->debug_flags & DBG(FS_CORRECT_DERIVS_AFTER_KILL))
@@ -1120,11 +1120,13 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
    else
       gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, 1 */
 
+   unsigned wave_size = si_get_shader_wave_size(shader);
+
    si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
    si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, va >> 40);
    si_pm4_set_reg(
       pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS,
-      S_00B228_VGPRS((shader->config.num_vgprs - 1) / (sscreen->ge_wave_size == 32 ? 8 : 4)) |
+      S_00B228_VGPRS((shader->config.num_vgprs - 1) / (wave_size == 32 ? 8 : 4)) |
          S_00B228_FLOAT_MODE(shader->config.float_mode) | S_00B228_DX10_CLAMP(1) |
          S_00B228_MEM_ORDERED(1) | S_00B228_WGP_MODE(1) |
          S_00B228_GS_VGPR_COMP_CNT(gs_vgpr_comp_cnt));
@@ -3692,7 +3694,9 @@ static struct si_pm4_state *si_build_vgt_shader_config(struct si_screen *screen,
    if (screen->info.chip_class >= GFX9)
       stages |= S_028B54_MAX_PRIMGRP_IN_WAVE(2);
 
-   if (screen->info.chip_class >= GFX10 && screen->ge_wave_size == 32) {
+   if (screen->info.chip_class >= GFX10 &&
+       /* GS fast launch hangs with Wave64, so always use Wave32. */
+       (screen->ge_wave_size == 32 || (key.u.ngg && key.u.ngg_gs_fast_launch))) {
       stages |= S_028B54_HS_W32_EN(1) |
                 S_028B54_GS_W32_EN(key.u.ngg) | /* legacy GS only supports Wave64 */
                 S_028B54_VS_W32_EN(1);
-- 
2.30.2