radeonsi/gfx10: rewrite late alloc computation
author Marek Olšák <marek.olsak@amd.com>
Thu, 9 Jan 2020 21:09:47 +0000 (16:09 -0500)
committer Marek Olšák <marek.olsak@amd.com>
Mon, 20 Jan 2020 21:16:11 +0000 (16:16 -0500)
- Use conservative late alloc when the number of CUs <= 6.
- Move the late alloc GS register to the GS shader state, so that it can be
  tuned for NGG culling.

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
src/gallium/drivers/radeonsi/si_state.c
src/gallium/drivers/radeonsi/si_state_shaders.c

index 8cd56fd49d6911888fc62b196ed5159580d80d06..86c2daff0de32bc802734693c1461ee72884db62 100644 (file)
@@ -5565,46 +5565,46 @@ static void si_init_config(struct si_context *sctx)
 
                /* Compute LATE_ALLOC_VS.LIMIT. */
                unsigned num_cu_per_sh = sscreen->info.num_good_cu_per_sh;
-               unsigned late_alloc_limit; /* The limit is per SH. */
-
-               if (sctx->family == CHIP_KABINI) {
-                       late_alloc_limit = 0; /* Potential hang on Kabini. */
-               } else if (num_cu_per_sh <= 4) {
-                       /* Too few available compute units per SH. Disallowing
-                        * VS to run on one CU could hurt us more than late VS
-                        * allocation would help.
-                        *
-                        * 2 is the highest safe number that allows us to keep
-                        * all CUs enabled.
-                        */
-                       late_alloc_limit = 2;
-               } else {
-                       /* This is a good initial value, allowing 1 late_alloc
-                        * wave per SIMD on num_cu - 2.
-                        */
-                       late_alloc_limit = (num_cu_per_sh - 2) * 4;
-               }
-
-               unsigned late_alloc_limit_gs = late_alloc_limit;
+               unsigned late_alloc_wave64 = 0; /* The limit is per SH. */
                unsigned cu_mask_vs = 0xffff;
                unsigned cu_mask_gs = 0xffff;
 
-               if (late_alloc_limit > 2) {
-                       if (sctx->chip_class >= GFX10) {
+               if (sctx->chip_class >= GFX10) {
+                       /* For Wave32, the hw will launch twice the number of late
+                        * alloc waves, so 1 == 2x wave32.
+                        */
+                       if (num_cu_per_sh <= 6) {
+                               late_alloc_wave64 = num_cu_per_sh - 2;
+                       } else {
+                               late_alloc_wave64 = (num_cu_per_sh - 2) * 4;
+
                                /* CU2 & CU3 disabled because of the dual CU design */
+                               /* Late alloc is not used for NGG on Navi14 due to a hw bug. */
                                cu_mask_vs = 0xfff3;
-                               cu_mask_gs = 0xfff3; /* NGG only */
+                               cu_mask_gs = sscreen->use_ngg &&
+                                            sctx->family != CHIP_NAVI14 ? 0xfff3 : 0xffff;
+                       }
+               } else {
+                       if (sctx->family == CHIP_KABINI) {
+                               late_alloc_wave64 = 0; /* Potential hang on Kabini. */
+                       } else if (num_cu_per_sh <= 4) {
+                               /* Too few available compute units per SH. Disallowing
+                                * VS to run on one CU could hurt us more than late VS
+                                * allocation would help.
+                                *
+                                * 2 is the highest safe number that allows us to keep
+                                * all CUs enabled.
+                                */
+                               late_alloc_wave64 = 2;
                        } else {
-                               cu_mask_vs = 0xfffe; /* 1 CU disabled */
+                               /* This is a good initial value, allowing 1 late_alloc
+                                * wave per SIMD on num_cu - 2.
+                                */
+                               late_alloc_wave64 = (num_cu_per_sh - 2) * 4;
                        }
-               }
 
-               /* Don't use late alloc for NGG on Navi14 due to a hw bug.
-                * If NGG is never used, enable all CUs.
-                */
-               if (!sscreen->use_ngg || sctx->family == CHIP_NAVI14) {
-                       late_alloc_limit_gs = 0;
-                       cu_mask_gs = 0xffff;
+                       if (late_alloc_wave64 > 2)
+                               cu_mask_vs = 0xfffe; /* 1 CU disabled */
                }
 
                /* VS can't execute on one CU if the limit is > 2. */
@@ -5612,17 +5612,11 @@ static void si_init_config(struct si_context *sctx)
                        S_00B118_CU_EN(cu_mask_vs) |
                        S_00B118_WAVE_LIMIT(0x3F));
                si_pm4_set_reg(pm4, R_00B11C_SPI_SHADER_LATE_ALLOC_VS,
-                       S_00B11C_LIMIT(late_alloc_limit));
+                       S_00B11C_LIMIT(late_alloc_wave64));
 
                si_pm4_set_reg(pm4, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
                               S_00B21C_CU_EN(cu_mask_gs) | S_00B21C_WAVE_LIMIT(0x3F));
 
-               if (sctx->chip_class >= GFX10) {
-                       si_pm4_set_reg(pm4, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
-                                      S_00B204_CU_EN(0xffff) |
-                                      S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_limit_gs));
-               }
-
                si_pm4_set_reg(pm4, R_00B01C_SPI_SHADER_PGM_RSRC3_PS,
                               S_00B01C_CU_EN(0xffff) | S_00B01C_WAVE_LIMIT(0x3F));
        }
index 36dbfe9df6f108925ee8524f47b71a4068017ca5..1b8450c0a8ea10057c3dbe84b4cfd54089b9eddc 100644 (file)
@@ -934,6 +934,12 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
                si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS, rsrc1);
                si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, rsrc2);
 
+               if (sscreen->info.chip_class >= GFX10) {
+                       si_pm4_set_reg(pm4, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
+                                      S_00B204_CU_EN(0xffff) |
+                                      S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(0));
+               }
+
                shader->ctx_reg.gs.vgt_gs_onchip_cntl =
                        S_028A44_ES_VERTS_PER_SUBGRP(shader->gs_info.es_verts_per_subgroup) |
                        S_028A44_GS_PRIMS_PER_SUBGRP(shader->gs_info.gs_prims_per_subgroup) |
@@ -1215,6 +1221,26 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
                       S_00B22C_OC_LDS_EN(es_type == PIPE_SHADER_TESS_EVAL) |
                       S_00B22C_LDS_SIZE(shader->config.lds_size));
 
+       /* Determine LATE_ALLOC_GS. */
+       unsigned num_cu_per_sh = sscreen->info.num_good_cu_per_sh;
+       unsigned late_alloc_wave64; /* The limit is per SH. */
+
+       /* For Wave32, the hw will launch twice the number of late
+        * alloc waves, so 1 == 2x wave32.
+        *
+        * Don't use late alloc for NGG on Navi14 due to a hw bug.
+        */
+       if (sscreen->info.family == CHIP_NAVI14)
+               late_alloc_wave64 = 0;
+       else if (num_cu_per_sh <= 6)
+               late_alloc_wave64 = num_cu_per_sh - 2; /* All CUs enabled */
+       else
+               late_alloc_wave64 = (num_cu_per_sh - 2) * 4;
+
+       si_pm4_set_reg(pm4, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
+                      S_00B204_CU_EN(0xffff) |
+                      S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64));
+
        nparams = MAX2(shader->info.nr_param_exports, 1);
        shader->ctx_reg.ngg.spi_vs_out_config =
                S_0286C4_VS_EXPORT_COUNT(nparams - 1) |