From c377f45c1833052f3d0d9d4ac341ee9917f9184c Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Thu, 9 Jan 2020 16:09:47 -0500 Subject: [PATCH] radeonsi/gfx10: rewrite late alloc computation - Use conservative late alloc when the number of CUs <= 6. - Move the late alloc GS register to the GS shader state, so that it can be tuned for NGG culling. Acked-by: Pierre-Eric Pelloux-Prayer --- src/gallium/drivers/radeonsi/si_state.c | 72 +++++++++---------- .../drivers/radeonsi/si_state_shaders.c | 26 +++++++ 2 files changed, 59 insertions(+), 39 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index 8cd56fd49d6..86c2daff0de 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -5565,46 +5565,46 @@ static void si_init_config(struct si_context *sctx) /* Compute LATE_ALLOC_VS.LIMIT. */ unsigned num_cu_per_sh = sscreen->info.num_good_cu_per_sh; - unsigned late_alloc_limit; /* The limit is per SH. */ - - if (sctx->family == CHIP_KABINI) { - late_alloc_limit = 0; /* Potential hang on Kabini. */ - } else if (num_cu_per_sh <= 4) { - /* Too few available compute units per SH. Disallowing - * VS to run on one CU could hurt us more than late VS - * allocation would help. - * - * 2 is the highest safe number that allows us to keep - * all CUs enabled. - */ - late_alloc_limit = 2; - } else { - /* This is a good initial value, allowing 1 late_alloc - * wave per SIMD on num_cu - 2. - */ - late_alloc_limit = (num_cu_per_sh - 2) * 4; - } - - unsigned late_alloc_limit_gs = late_alloc_limit; + unsigned late_alloc_wave64 = 0; /* The limit is per SH. */ unsigned cu_mask_vs = 0xffff; unsigned cu_mask_gs = 0xffff; - if (late_alloc_limit > 2) { - if (sctx->chip_class >= GFX10) { + if (sctx->chip_class >= GFX10) { + /* For Wave32, the hw will launch twice the number of late + * alloc waves, so 1 == 2x wave32. + */ + if (num_cu_per_sh <= 6) { + late_alloc_wave64 = num_cu_per_sh - 2; + } else { + late_alloc_wave64 = (num_cu_per_sh - 2) * 4; + /* CU2 & CU3 disabled because of the dual CU design */ + /* Late alloc is not used for NGG on Navi14 due to a hw bug. */ cu_mask_vs = 0xfff3; - cu_mask_gs = 0xfff3; /* NGG only */ + cu_mask_gs = sscreen->use_ngg && + sctx->family != CHIP_NAVI14 ? 0xfff3 : 0xffff; + } + } else { + if (sctx->family == CHIP_KABINI) { + late_alloc_wave64 = 0; /* Potential hang on Kabini. */ + } else if (num_cu_per_sh <= 4) { + /* Too few available compute units per SH. Disallowing + * VS to run on one CU could hurt us more than late VS + * allocation would help. + * + * 2 is the highest safe number that allows us to keep + * all CUs enabled. + */ + late_alloc_wave64 = 2; } else { - cu_mask_vs = 0xfffe; /* 1 CU disabled */ + /* This is a good initial value, allowing 1 late_alloc + * wave per SIMD on num_cu - 2. + */ + late_alloc_wave64 = (num_cu_per_sh - 2) * 4; } - } - /* Don't use late alloc for NGG on Navi14 due to a hw bug. - * If NGG is never used, enable all CUs. - */ - if (!sscreen->use_ngg || sctx->family == CHIP_NAVI14) { - late_alloc_limit_gs = 0; - cu_mask_gs = 0xffff; + if (late_alloc_wave64 > 2) + cu_mask_vs = 0xfffe; /* 1 CU disabled */ } /* VS can't execute on one CU if the limit is > 2. */ @@ -5612,17 +5612,11 @@ static void si_init_config(struct si_context *sctx) S_00B118_CU_EN(cu_mask_vs) | S_00B118_WAVE_LIMIT(0x3F)); si_pm4_set_reg(pm4, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, - S_00B11C_LIMIT(late_alloc_limit)); + S_00B11C_LIMIT(late_alloc_wave64)); si_pm4_set_reg(pm4, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, S_00B21C_CU_EN(cu_mask_gs) | S_00B21C_WAVE_LIMIT(0x3F)); - if (sctx->chip_class >= GFX10) { - si_pm4_set_reg(pm4, R_00B204_SPI_SHADER_PGM_RSRC4_GS, - S_00B204_CU_EN(0xffff) | - S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_limit_gs)); - } - si_pm4_set_reg(pm4, R_00B01C_SPI_SHADER_PGM_RSRC3_PS, S_00B01C_CU_EN(0xffff) | S_00B01C_WAVE_LIMIT(0x3F)); } diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 36dbfe9df6f..1b8450c0a8e 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -934,6 +934,12 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader) si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS, rsrc1); si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, rsrc2); + if (sscreen->info.chip_class >= GFX10) { + si_pm4_set_reg(pm4, R_00B204_SPI_SHADER_PGM_RSRC4_GS, + S_00B204_CU_EN(0xffff) | + S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(0)); + } + shader->ctx_reg.gs.vgt_gs_onchip_cntl = S_028A44_ES_VERTS_PER_SUBGRP(shader->gs_info.es_verts_per_subgroup) | S_028A44_GS_PRIMS_PER_SUBGRP(shader->gs_info.gs_prims_per_subgroup) | @@ -1215,6 +1221,26 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader S_00B22C_OC_LDS_EN(es_type == PIPE_SHADER_TESS_EVAL) | S_00B22C_LDS_SIZE(shader->config.lds_size)); + /* Determine LATE_ALLOC_GS. */ + unsigned num_cu_per_sh = sscreen->info.num_good_cu_per_sh; + unsigned late_alloc_wave64; /* The limit is per SH. */ + + /* For Wave32, the hw will launch twice the number of late + * alloc waves, so 1 == 2x wave32. + * + * Don't use late alloc for NGG on Navi14 due to a hw bug. + */ + if (sscreen->info.family == CHIP_NAVI14) + late_alloc_wave64 = 0; + else if (num_cu_per_sh <= 6) + late_alloc_wave64 = num_cu_per_sh - 2; /* All CUs enabled */ + else + late_alloc_wave64 = (num_cu_per_sh - 2) * 4; + + si_pm4_set_reg(pm4, R_00B204_SPI_SHADER_PGM_RSRC4_GS, + S_00B204_CU_EN(0xffff) | + S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64)); + nparams = MAX2(shader->info.nr_param_exports, 1); shader->ctx_reg.ngg.spi_vs_out_config = S_0286C4_VS_EXPORT_COUNT(nparams - 1) | -- 2.30.2