radeonsi: set optimal VGT_HS_OFFCHIP_PARAM
author Marek Olšák <marek.olsak@amd.com>
Tue, 28 Jun 2016 12:11:12 +0000 (14:11 +0200)
committer Marek Olšák <marek.olsak@amd.com>
Wed, 29 Jun 2016 14:34:22 +0000 (16:34 +0200)
ported from Vulkan

Reviewed-by: Edward O'Callaghan <funfunctor@folklore1984.net>
Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
src/gallium/drivers/radeonsi/si_pipe.c
src/gallium/drivers/radeonsi/si_pipe.h
src/gallium/drivers/radeonsi/si_state.h
src/gallium/drivers/radeonsi/si_state_draw.c
src/gallium/drivers/radeonsi/si_state_shaders.c

index d83568150e1dde015b2fc63b7bac8331eefc56c2..f38ecc15ab4d65820d328a514106e5f0f71fdbc6 100644 (file)
@@ -706,6 +706,12 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws)
        if (!debug_get_bool_option("RADEON_DISABLE_PERFCOUNTERS", false))
                si_init_perfcounters(sscreen);
 
+       /* Hawaii has a bug with offchip buffers > 256 that can be worked
+        * around by setting 4K granularity.
+        */
+       sscreen->tess_offchip_block_dw_size =
+               sscreen->b.family == CHIP_HAWAII ? 4096 : 8192;
+
        sscreen->b.has_cp_dma = true;
        sscreen->b.has_streamout = true;
        pipe_mutex_init(sscreen->shader_parts_mutex);
index d1819058b926d561131189c15ca87aca7ac99441..ee64ecc6fd11a7fc0e89947d5b36b2dbf111f6da 100644 (file)
@@ -82,6 +82,7 @@ struct u_suballocator;
 struct si_screen {
        struct r600_common_screen       b;
        unsigned                        gs_table_depth;
+       unsigned                        tess_offchip_block_dw_size;
 
        /* Whether shaders are monolithic (1-part) or separate (3-part). */
        bool                            use_monolithic_shaders;
index 2e4923d7255fe2d236a34b1d4d4ad1fb6a3cb327..9361849f78144fe5055f4e8a4411a927752f5c0b 100644 (file)
@@ -40,8 +40,6 @@
 #define SI_NUM_IMAGES                  16
 #define SI_NUM_SHADER_BUFFERS          16
 
-#define SI_TESS_OFFCHIP_BLOCK_SIZE     (8192 * 4)
-
 struct si_screen;
 struct si_shader;
 
index b9a7c144acea9b3dbeb45cd3c6b93ab03cf1ae2b..35585107cd3809832060008ace66471e68c36056 100644 (file)
@@ -147,8 +147,9 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
                                                               output_patch_size));
 
        /* Make sure the output data fits in the offchip buffer */
-       *num_patches = MIN2(*num_patches, SI_TESS_OFFCHIP_BLOCK_SIZE /
-                                         output_patch_size);
+       *num_patches = MIN2(*num_patches,
+                           (sctx->screen->tess_offchip_block_dw_size * 4) /
+                           output_patch_size);
 
        /* Not necessary for correctness, but improves performance. The
         * specific value is taken from the proprietary driver.
index 89490bd0c29bb81b32c21873ce5a7c8a54b20d3f..9aa4a7c82334b3a30021dba6480e67277e8a644c 100644 (file)
@@ -1798,9 +1798,38 @@ static bool si_update_spi_tmpring_size(struct si_context *sctx)
 
 static void si_init_tess_factor_ring(struct si_context *sctx)
 {
-       unsigned offchip_blocks = sctx->b.chip_class >= CIK ? 256 : 64;
-       assert(!sctx->tf_ring);
+       bool double_offchip_buffers = sctx->b.chip_class >= CIK;
+       unsigned max_offchip_buffers_per_se = double_offchip_buffers ? 128 : 64;
+       unsigned max_offchip_buffers = max_offchip_buffers_per_se *
+                                      sctx->screen->b.info.max_se;
+       unsigned offchip_granularity;
+
+       switch (sctx->screen->tess_offchip_block_dw_size) {
+       default:
+               assert(0);
+               /* fall through */
+       case 8192:
+               offchip_granularity = V_03093C_X_8K_DWORDS;
+               break;
+       case 4096:
+               offchip_granularity = V_03093C_X_4K_DWORDS;
+               break;
+       }
 
+       switch (sctx->b.chip_class) {
+       case SI:
+               max_offchip_buffers = MIN2(max_offchip_buffers, 126);
+               break;
+       case CIK:
+               max_offchip_buffers = MIN2(max_offchip_buffers, 508);
+               break;
+       case VI:
+       default:
+               max_offchip_buffers = MIN2(max_offchip_buffers, 512);
+               break;
+       }
+
+       assert(!sctx->tf_ring);
        sctx->tf_ring = pipe_buffer_create(sctx->b.b.screen, PIPE_BIND_CUSTOM,
                                           PIPE_USAGE_DEFAULT,
                                           32768 * sctx->screen->b.info.max_se);
@@ -1812,8 +1841,8 @@ static void si_init_tess_factor_ring(struct si_context *sctx)
        sctx->tess_offchip_ring = pipe_buffer_create(sctx->b.b.screen,
                                                     PIPE_BIND_CUSTOM,
                                                     PIPE_USAGE_DEFAULT,
-                                                    offchip_blocks *
-                                                    SI_TESS_OFFCHIP_BLOCK_SIZE);
+                                                    max_offchip_buffers *
+                                                    sctx->screen->tess_offchip_block_dw_size * 4);
        if (!sctx->tess_offchip_ring)
                return;
 
@@ -1821,24 +1850,24 @@ static void si_init_tess_factor_ring(struct si_context *sctx)
 
        /* Append these registers to the init config state. */
        if (sctx->b.chip_class >= CIK) {
-               unsigned offchip_buffering = offchip_blocks;
-               if(sctx->b.chip_class >= VI)
-                       --offchip_buffering;
+               if (sctx->b.chip_class >= VI)
+                       --max_offchip_buffers;
 
                si_pm4_set_reg(sctx->init_config, R_030938_VGT_TF_RING_SIZE,
                               S_030938_SIZE(sctx->tf_ring->width0 / 4));
                si_pm4_set_reg(sctx->init_config, R_030940_VGT_TF_MEMORY_BASE,
                               r600_resource(sctx->tf_ring)->gpu_address >> 8);
                si_pm4_set_reg(sctx->init_config, R_03093C_VGT_HS_OFFCHIP_PARAM,
-                            S_03093C_OFFCHIP_BUFFERING(offchip_buffering) |
-                            S_03093C_OFFCHIP_GRANULARITY(V_03093C_X_8K_DWORDS));
+                            S_03093C_OFFCHIP_BUFFERING(max_offchip_buffers) |
+                            S_03093C_OFFCHIP_GRANULARITY(offchip_granularity));
        } else {
+               assert(offchip_granularity == V_03093C_X_8K_DWORDS);
                si_pm4_set_reg(sctx->init_config, R_008988_VGT_TF_RING_SIZE,
                               S_008988_SIZE(sctx->tf_ring->width0 / 4));
                si_pm4_set_reg(sctx->init_config, R_0089B8_VGT_TF_MEMORY_BASE,
                               r600_resource(sctx->tf_ring)->gpu_address >> 8);
                si_pm4_set_reg(sctx->init_config, R_0089B0_VGT_HS_OFFCHIP_PARAM,
-                              S_0089B0_OFFCHIP_BUFFERING(offchip_blocks));
+                              S_0089B0_OFFCHIP_BUFFERING(max_offchip_buffers));
        }
 
        /* Flush the context to re-emit the init_config state.