From dd56d04568ab1a563a29d2900cca0ebc4cf13f77 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Tue, 28 Jun 2016 14:11:12 +0200 Subject: [PATCH] radeonsi: set optimal VGT_HS_OFFCHIP_PARAM MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit ported from Vulkan Reviewed-by: Edward O'Callaghan Reviewed-by: Nicolai Hähnle --- src/gallium/drivers/radeonsi/si_pipe.c | 6 +++ src/gallium/drivers/radeonsi/si_pipe.h | 1 + src/gallium/drivers/radeonsi/si_state.h | 2 - src/gallium/drivers/radeonsi/si_state_draw.c | 5 +- .../drivers/radeonsi/si_state_shaders.c | 49 +++++++++++++++---- 5 files changed, 49 insertions(+), 14 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index d83568150e1..f38ecc15ab4 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -706,6 +706,12 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws) if (!debug_get_bool_option("RADEON_DISABLE_PERFCOUNTERS", false)) si_init_perfcounters(sscreen); + /* Hawaii has a bug with offchip buffers > 256 that can be worked + * around by setting 4K granularity. + */ + sscreen->tess_offchip_block_dw_size = + sscreen->b.family == CHIP_HAWAII ? 4096 : 8192; + sscreen->b.has_cp_dma = true; sscreen->b.has_streamout = true; pipe_mutex_init(sscreen->shader_parts_mutex); diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index d1819058b92..ee64ecc6fd1 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -82,6 +82,7 @@ struct u_suballocator; struct si_screen { struct r600_common_screen b; unsigned gs_table_depth; + unsigned tess_offchip_block_dw_size; /* Whether shaders are monolithic (1-part) or separate (3-part). */ bool use_monolithic_shaders; diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index 2e4923d7255..9361849f781 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -40,8 +40,6 @@ #define SI_NUM_IMAGES 16 #define SI_NUM_SHADER_BUFFERS 16 -#define SI_TESS_OFFCHIP_BLOCK_SIZE (8192 * 4) - struct si_screen; struct si_shader; diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c index b9a7c144ace..35585107cd3 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -147,8 +147,9 @@ static void si_emit_derived_tess_state(struct si_context *sctx, output_patch_size)); /* Make sure the output data fits in the offchip buffer */ - *num_patches = MIN2(*num_patches, SI_TESS_OFFCHIP_BLOCK_SIZE / - output_patch_size); + *num_patches = MIN2(*num_patches, + (sctx->screen->tess_offchip_block_dw_size * 4) / + output_patch_size); /* Not necessary for correctness, but improves performance. The * specific value is taken from the proprietary driver. diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 89490bd0c29..9aa4a7c8233 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -1798,9 +1798,38 @@ static bool si_update_spi_tmpring_size(struct si_context *sctx) static void si_init_tess_factor_ring(struct si_context *sctx) { - unsigned offchip_blocks = sctx->b.chip_class >= CIK ? 256 : 64; - assert(!sctx->tf_ring); + bool double_offchip_buffers = sctx->b.chip_class >= CIK; + unsigned max_offchip_buffers_per_se = double_offchip_buffers ? 128 : 64; + unsigned max_offchip_buffers = max_offchip_buffers_per_se * + sctx->screen->b.info.max_se; + unsigned offchip_granularity; + + switch (sctx->screen->tess_offchip_block_dw_size) { + default: + assert(0); + /* fall through */ + case 8192: + offchip_granularity = V_03093C_X_8K_DWORDS; + break; + case 4096: + offchip_granularity = V_03093C_X_4K_DWORDS; + break; + } + switch (sctx->b.chip_class) { + case SI: + max_offchip_buffers = MIN2(max_offchip_buffers, 126); + break; + case CIK: + max_offchip_buffers = MIN2(max_offchip_buffers, 508); + break; + case VI: + default: + max_offchip_buffers = MIN2(max_offchip_buffers, 512); + break; + } + + assert(!sctx->tf_ring); sctx->tf_ring = pipe_buffer_create(sctx->b.b.screen, PIPE_BIND_CUSTOM, PIPE_USAGE_DEFAULT, 32768 * sctx->screen->b.info.max_se); @@ -1812,8 +1841,8 @@ static void si_init_tess_factor_ring(struct si_context *sctx) sctx->tess_offchip_ring = pipe_buffer_create(sctx->b.b.screen, PIPE_BIND_CUSTOM, PIPE_USAGE_DEFAULT, - offchip_blocks * - SI_TESS_OFFCHIP_BLOCK_SIZE); + max_offchip_buffers * + sctx->screen->tess_offchip_block_dw_size * 4); if (!sctx->tess_offchip_ring) return; @@ -1821,24 +1850,24 @@ static void si_init_tess_factor_ring(struct si_context *sctx) /* Append these registers to the init config state. */ if (sctx->b.chip_class >= CIK) { - unsigned offchip_buffering = offchip_blocks; - if(sctx->b.chip_class >= VI) - --offchip_buffering; + if (sctx->b.chip_class >= VI) + --max_offchip_buffers; si_pm4_set_reg(sctx->init_config, R_030938_VGT_TF_RING_SIZE, S_030938_SIZE(sctx->tf_ring->width0 / 4)); si_pm4_set_reg(sctx->init_config, R_030940_VGT_TF_MEMORY_BASE, r600_resource(sctx->tf_ring)->gpu_address >> 8); si_pm4_set_reg(sctx->init_config, R_03093C_VGT_HS_OFFCHIP_PARAM, - S_03093C_OFFCHIP_BUFFERING(offchip_buffering) | - S_03093C_OFFCHIP_GRANULARITY(V_03093C_X_8K_DWORDS)); + S_03093C_OFFCHIP_BUFFERING(max_offchip_buffers) | + S_03093C_OFFCHIP_GRANULARITY(offchip_granularity)); } else { + assert(offchip_granularity == V_03093C_X_8K_DWORDS); si_pm4_set_reg(sctx->init_config, R_008988_VGT_TF_RING_SIZE, S_008988_SIZE(sctx->tf_ring->width0 / 4)); si_pm4_set_reg(sctx->init_config, R_0089B8_VGT_TF_MEMORY_BASE, r600_resource(sctx->tf_ring)->gpu_address >> 8); si_pm4_set_reg(sctx->init_config, R_0089B0_VGT_HS_OFFCHIP_PARAM, - S_0089B0_OFFCHIP_BUFFERING(offchip_blocks)); + S_0089B0_OFFCHIP_BUFFERING(max_offchip_buffers)); } /* Flush the context to re-emit the init_config state. -- 2.30.2