radeonsi: enable late VS allocation (v3)
[mesa.git] / src / gallium / drivers / radeonsi / si_state.c
index 76d5017a82d4f02d3a6d5e7e0622003d80536f1a..c010998feab0e8638957efcdf2bed59c664ff010 100644 (file)
@@ -3729,12 +3729,32 @@ static void si_init_config(struct si_context *sctx)
        si_pm4_set_reg(pm4, R_028408_VGT_INDX_OFFSET, 0);
 
        if (sctx->b.chip_class >= CIK) {
-               si_pm4_set_reg(pm4, R_00B51C_SPI_SHADER_PGM_RSRC3_LS, S_00B51C_CU_EN(0xffff));
                si_pm4_set_reg(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, 0);
                si_pm4_set_reg(pm4, R_00B31C_SPI_SHADER_PGM_RSRC3_ES, S_00B31C_CU_EN(0xffff));
                si_pm4_set_reg(pm4, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, S_00B21C_CU_EN(0xffff));
-               si_pm4_set_reg(pm4, R_00B118_SPI_SHADER_PGM_RSRC3_VS, S_00B118_CU_EN(0xffff));
-               si_pm4_set_reg(pm4, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, S_00B11C_LIMIT(0));
+
+               if (sscreen->b.info.num_good_compute_units /
+                   (sscreen->b.info.max_se * sscreen->b.info.max_sh_per_se) <= 4) {
+                       /* Too few available compute units per SH. Disallowing
+                        * VS to run on CU0 could hurt us more than late VS
+                        * allocation would help.
+                        *
+                        * LATE_ALLOC_VS = 2 is the highest safe number.
+                        */
+                       si_pm4_set_reg(pm4, R_00B51C_SPI_SHADER_PGM_RSRC3_LS, S_00B51C_CU_EN(0xffff));
+                       si_pm4_set_reg(pm4, R_00B118_SPI_SHADER_PGM_RSRC3_VS, S_00B118_CU_EN(0xffff));
+                       si_pm4_set_reg(pm4, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, S_00B11C_LIMIT(2));
+               } else {
+                       /* Set LATE_ALLOC_VS == 31. It should be less than
+                        * the number of scratch waves. Limitations:
+                        * - VS can't execute on CU0.
+                        * - If HS writes outputs to LDS, LS can't execute on CU0.
+                        */
+                       si_pm4_set_reg(pm4, R_00B51C_SPI_SHADER_PGM_RSRC3_LS, S_00B51C_CU_EN(0xfffe));
+                       si_pm4_set_reg(pm4, R_00B118_SPI_SHADER_PGM_RSRC3_VS, S_00B118_CU_EN(0xfffe));
+                       si_pm4_set_reg(pm4, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, S_00B11C_LIMIT(31));
+               }
+
                si_pm4_set_reg(pm4, R_00B01C_SPI_SHADER_PGM_RSRC3_PS, S_00B01C_CU_EN(0xffff));
        }