From 26b69ad250ee23e70831626a88f70f6ddf2e1bcc Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Wed, 8 Jun 2016 14:34:11 +0200 Subject: [PATCH] radeonsi: improve the computation and comment of scratch_waves MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit 2% isn't much. If you think the number should be decreased, please speak up. Reviewed-by: Nicolai Hähnle --- src/gallium/drivers/radeonsi/si_pipe.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index 5d7d2f322cf..0c601da9e39 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -236,11 +236,25 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, R600_COHERENCY_SHADER); } - /* XXX: This is the maximum value allowed. I'm not sure how to compute - * this for non-cs shaders. Using the wrong value here can result in - * GPU lockups, but the maximum value seems to always work. + uint64_t max_threads_per_block; + screen->get_compute_param(screen, PIPE_SHADER_IR_TGSI, + PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK, + &max_threads_per_block); + + /* The maximum number of scratch waves. Scratch space isn't divided + * evenly between CUs. The number is only a function of the number of CUs. + * We can decrease the constant to decrease the scratch buffer size. + * + * sctx->scratch_waves must be >= the maximum possible size of + * 1 threadgroup, so that the hw doesn't hang from being unable + * to start any. + * + * The recommended value is 4 per CU at most. Higher numbers don't + * bring much benefit, but they still occupy chip resources (think + * async compute). I've seen ~2% performance difference between 4 and 32. 
*/ - sctx->scratch_waves = 32 * sscreen->b.info.num_good_compute_units; + sctx->scratch_waves = MAX2(32 * sscreen->b.info.num_good_compute_units, + max_threads_per_block / 64); /* Initialize LLVM TargetMachine */ r600_target = radeon_llvm_get_r600_target(triple); -- 2.30.2