radeonsi: remove the NGG hack decreasing LDS usage to deal with overflows

[mesa.git] / src / gallium / drivers / radeonsi / si_pipe.c
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c

index 7364277c96253d3a567f683d0bc030aab7fa5157..c323f33dd8f6a267a8900e87f312064f8481c374 100644 (file)
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -33,6 +33,7 @@
  #include "si_public.h"
  #include "si_shader_internal.h"
  #include "sid.h"
+#include "ac_shadowed_regs.h"
  #include "util/disk_cache.h"
  #include "util/u_log.h"
  #include "util/u_memory.h"
@@ -90,11 +91,13 @@ static const struct debug_named_value debug_options[] = {
     {"check_vm", DBG(CHECK_VM), "Check VM faults and dump debug info."},
     {"reserve_vmid", DBG(RESERVE_VMID), "Force VMID reservation per context."},
     {"zerovram", DBG(ZERO_VRAM), "Clear VRAM allocations."},
+   {"shadowregs", DBG(SHADOW_REGS), "Enable CP register shadowing."},
  
     /* 3D engine options: */
     {"nogfx", DBG(NO_GFX), "Disable graphics. Only multimedia compute paths can be used."},
     {"nongg", DBG(NO_NGG), "Disable NGG and use the legacy pipeline."},
-   {"nggc", DBG(ALWAYS_NGG_CULLING), "Always use NGG culling even when it can hurt."},
+   {"nggc", DBG(ALWAYS_NGG_CULLING_ALL), "Always use NGG culling even when it can hurt."},
+   {"nggctess", DBG(ALWAYS_NGG_CULLING_TESS), "Always use NGG culling for tessellation."},
     {"nonggc", DBG(NO_NGG_CULLING), "Disable NGG culling."},
     {"alwayspd", DBG(ALWAYS_PD), "Always enable the primitive discard compute shader."},
     {"pd", DBG(PD), "Enable the primitive discard compute shader for large draw calls."},
@@ -194,7 +197,8 @@ static void si_destroy_context(struct pipe_context *context)
     si_resource_reference(&sctx->wait_mem_scratch, NULL);
     si_resource_reference(&sctx->small_prim_cull_info_buf, NULL);
  
-   si_pm4_free_state(sctx, sctx->cs_preamble_state, ~0);
+   if (sctx->cs_preamble_state)
+      si_pm4_free_state(sctx, sctx->cs_preamble_state, ~0);
     if (sctx->cs_preamble_gs_rings)
        si_pm4_free_state(sctx, sctx->cs_preamble_gs_rings, ~0);
     for (i = 0; i < ARRAY_SIZE(sctx->vgt_shader_config); i++)
@@ -295,6 +299,7 @@ static void si_destroy_context(struct pipe_context *context)
     si_resource_reference(&sctx->index_ring, NULL);
     si_resource_reference(&sctx->barrier_buf, NULL);
     si_resource_reference(&sctx->last_ib_barrier_buf, NULL);
+   si_resource_reference(&sctx->shadowed_regs, NULL);
     pb_reference(&sctx->gds, NULL);
     pb_reference(&sctx->gds_oa, NULL);
  
@@ -478,6 +483,14 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign
          *    https://gitlab.freedesktop.org/mesa/mesa/-/issues/1889
          */
         (sctx->chip_class != GFX8 || sscreen->debug_flags & DBG(FORCE_SDMA)) &&
+       /* SDMA causes corruption on gfx9 APUs:
+        *    https://gitlab.freedesktop.org/mesa/mesa/-/issues/2814
+        *
+        * While we could keep buffer copies and clears enabled, let's disable
+        * everything, because neither gfx8 nor gfx10 enables SDMA, and it's not
+        * easy to test.
+        */
+       (sctx->chip_class != GFX9 || sscreen->debug_flags & DBG(FORCE_SDMA)) &&
         /* SDMA timeouts sometimes on gfx10 so disable it for now. See:
          *    https://bugs.freedesktop.org/show_bug.cgi?id=111481
          *    https://gitlab.freedesktop.org/mesa/mesa/-/issues/1907
@@ -550,7 +563,6 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign
        si_init_msaa_functions(sctx);
        si_init_shader_functions(sctx);
        si_init_state_functions(sctx);
-      si_init_cs_preamble_state(sctx);
        si_init_streamout_functions(sctx);
        si_init_viewport_functions(sctx);
  
@@ -667,7 +679,12 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign
  
     /* The remainder of this function initializes the gfx CS and must be last. */
     assert(sctx->gfx_cs->current.cdw == 0);
-   si_begin_new_gfx_cs(sctx);
+
+   if (sctx->has_graphics) {
+      si_init_cp_reg_shadowing(sctx);
+   }
+
+   si_begin_new_gfx_cs(sctx, true);
     assert(sctx->gfx_cs->current.cdw == sctx->initial_gfx_cs_size);
  
     /* Initialize per-context buffers. */
@@ -1080,7 +1097,7 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws,
     unsigned max_offchip_buffers_per_se;
  
     if (sscreen->info.chip_class >= GFX10)
-      max_offchip_buffers_per_se = 256;
+      max_offchip_buffers_per_se = 128;
     /* Only certain chips can use the maximum value. */
     else if (sscreen->info.family == CHIP_VEGA12 || sscreen->info.family == CHIP_VEGA20)
        max_offchip_buffers_per_se = double_offchip_buffers ? 128 : 64;
@@ -1138,8 +1155,10 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws,
     sscreen->use_ngg = sscreen->info.chip_class >= GFX10 && sscreen->info.family != CHIP_NAVI14 &&
                        !(sscreen->debug_flags & DBG(NO_NGG));
     sscreen->use_ngg_culling = sscreen->use_ngg && !(sscreen->debug_flags & DBG(NO_NGG_CULLING));
-   sscreen->always_use_ngg_culling =
-      sscreen->use_ngg_culling && sscreen->debug_flags & DBG(ALWAYS_NGG_CULLING);
+   sscreen->always_use_ngg_culling_all =
+      sscreen->use_ngg_culling && sscreen->debug_flags & DBG(ALWAYS_NGG_CULLING_ALL);
+   sscreen->always_use_ngg_culling_tess =
+      sscreen->use_ngg_culling && sscreen->debug_flags & DBG(ALWAYS_NGG_CULLING_TESS);
     sscreen->use_ngg_streamout = false;
  
     /* Only enable primitive binning on APUs by default. */
@@ -1215,11 +1234,15 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws,
     sscreen->compute_wave_size = 64;
  
     if (sscreen->info.chip_class >= GFX10) {
-      /* Pixels shaders: Wave64 is recommended.
-       * Compute shaders: There are piglit failures with Wave32.
+      /* Pixel shaders: Wave64 is always fastest.
+       * Vertex shaders: Wave64 is probably better, because:
+       * - greater chance of L0 cache hits, because more threads are assigned
+       *   to the same CU
+       * - scalar instructions are only executed once for 64 threads instead of twice
+       * - VGPR allocation granularity is half of Wave32, so 1 Wave64 can
+       *   sometimes use fewer VGPRs than 2 Wave32
+       * - TessMark X64 with NGG culling is faster with Wave64
         */
-      sscreen->ge_wave_size = 32;
-
        if (sscreen->debug_flags & DBG(W32_GE))
           sscreen->ge_wave_size = 32;
        if (sscreen->debug_flags & DBG(W32_PS))
@@ -1267,6 +1290,8 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws,
                                      RADEON_DOMAIN_OA);
     }
  
+   ac_print_shadowed_regs(&sscreen->info);
+
     STATIC_ASSERT(sizeof(union si_vgt_stages_key) == 4);
     return &sscreen->b;
  }