From ba1f66a73d493c49a3a329d9f3882ddbae4147b0 Mon Sep 17 00:00:00 2001
From: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Date: Sat, 2 Apr 2016 13:04:18 +0200
Subject: [PATCH] radeonsi: rework compute scratch buffer
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Instead of having a scratch buffer per program, have one per
context.

Also removed the per kernel wave count calculations, but
that only helped if the total number of waves in the dispatch
was smaller than sctx->scratch_waves.

v2: Fix style issue.

Signed-off-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Reviewed-by: Marek OlÅ¡Ã¡k <marek.olsak@amd.com>
Reviewed-by: Nicolai HÃ¤hnle <nicolai.haehnle@amd.com>
---
 src/gallium/drivers/radeonsi/si_compute.c | 137 +++++++---------------
 src/gallium/drivers/radeonsi/si_pipe.c    |   1 +
 src/gallium/drivers/radeonsi/si_pipe.h    |   2 +
 3 files changed, 47 insertions(+), 93 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index 2ce9306d069..fe6a7a6f77a 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -46,47 +46,6 @@ struct si_compute {
 	struct pipe_resource *global_buffers[MAX_GLOBAL_BUFFERS];
 };
 
-static void init_scratch_buffer(struct si_context *sctx, struct si_compute *program)
-{
-	unsigned scratch_bytes = 0;
-	uint64_t scratch_buffer_va;
-	unsigned i;
-
-	/* Compute the scratch buffer size using the maximum number of waves.
-	 * This way we don't need to recompute it for each kernel launch. */
-	unsigned scratch_waves = 32 * sctx->screen->b.info.num_good_compute_units;
-	for (i = 0; i < program->shader.binary.global_symbol_count; i++) {
-		unsigned offset =
-				program->shader.binary.global_symbol_offsets[i];
-		unsigned scratch_bytes_needed;
-
-		si_shader_binary_read_config(&program->shader.binary,
-					     &program->shader.config, offset);
-		scratch_bytes_needed = program->shader.config.scratch_bytes_per_wave;
-		scratch_bytes = MAX2(scratch_bytes, scratch_bytes_needed);
-	}
-
-	if (scratch_bytes == 0)
-		return;
-
-	program->shader.scratch_bo =
-				si_resource_create_custom(sctx->b.b.screen,
-				PIPE_USAGE_DEFAULT,
-				scratch_bytes * scratch_waves);
-
-	scratch_buffer_va = program->shader.scratch_bo->gpu_address;
-
-	/* apply_scratch_relocs needs scratch_bytes_per_wave to be set
-	 * to the maximum bytes needed, so it can compute the stride
-	 * correctly.
-	 */
-	program->shader.config.scratch_bytes_per_wave = scratch_bytes;
-
-	/* Patch the shader with the scratch buffer address. */
-	si_shader_apply_scratch_relocs(sctx,
-				&program->shader, scratch_buffer_va);
-}
-
 static void *si_create_compute_state(
 	struct pipe_context *ctx,
 	const struct pipe_compute_state *cso)
@@ -144,11 +103,6 @@ static void *si_create_compute_state(
 		code = cso->prog + sizeof(struct pipe_llvm_program_header);
 
 		radeon_elf_read(code, header->num_bytes, &program->shader.binary);
-		/* init_scratch_buffer patches the shader code with the scratch address,
-		* so we need to call it before si_shader_binary_read() which uploads
-		* the shader code to the GPU.
-		*/
-		init_scratch_buffer(sctx, program);
 		si_shader_binary_read_config(&program->shader.binary,
 			     &program->shader.config, 0);
 	}
@@ -193,43 +147,6 @@ static void si_set_global_binding(
 	}
 }
 
-/**
- * This function computes the value for R_00B860_COMPUTE_TMPRING_SIZE.WAVES
- * /p block_layout is the number of threads in each work group.
- * /p grid layout is the number of work groups.
- */
-static unsigned compute_num_waves_for_scratch(
-		const struct radeon_info *info,
-		const uint *block_layout,
-		const uint *grid_layout)
-{
-	unsigned num_sh = MAX2(info->max_sh_per_se, 1);
-	unsigned num_se = MAX2(info->max_se, 1);
-	unsigned num_blocks = 1;
-	unsigned threads_per_block = 1;
-	unsigned waves_per_block;
-	unsigned waves_per_sh;
-	unsigned waves;
-	unsigned scratch_waves;
-	unsigned i;
-
-	for (i = 0; i < 3; i++) {
-		threads_per_block *= block_layout[i];
-		num_blocks *= grid_layout[i];
-	}
-
-	waves_per_block = align(threads_per_block, 64) / 64;
-	waves = waves_per_block * num_blocks;
-	waves_per_sh = align(waves, num_sh * num_se) / (num_sh * num_se);
-	scratch_waves = waves_per_sh * num_sh * num_se;
-
-	if (waves_per_block > waves_per_sh) {
-		scratch_waves = waves_per_block * num_sh * num_se;
-	}
-
-	return scratch_waves;
-}
-
 static void si_initialize_compute(struct si_context *sctx)
 {
 	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
@@ -271,6 +188,44 @@ static void si_initialize_compute(struct si_context *sctx)
 	sctx->cs_shader_state.initialized = true;
 }
 
+static bool si_setup_compute_scratch_buffer(struct si_context *sctx,
+                                            struct si_shader *shader,
+                                            struct si_shader_config *config)
+{
+	uint64_t scratch_bo_size, scratch_needed;
+	scratch_bo_size = 0;
+	scratch_needed = config->scratch_bytes_per_wave * sctx->scratch_waves;
+	if (sctx->compute_scratch_buffer)
+		scratch_bo_size = sctx->compute_scratch_buffer->b.b.width0;
+
+	if (scratch_bo_size < scratch_needed) {
+		pipe_resource_reference(
+			(struct pipe_resource**)&sctx->compute_scratch_buffer,
+			NULL);
+
+		sctx->compute_scratch_buffer =
+				si_resource_create_custom(&sctx->screen->b.b,
+                                PIPE_USAGE_DEFAULT, scratch_needed);
+
+		if (!sctx->compute_scratch_buffer)
+			return false;
+	}
+
+	if (sctx->compute_scratch_buffer != shader->scratch_bo && scratch_needed) {
+		uint64_t scratch_va = sctx->compute_scratch_buffer->gpu_address;
+
+		si_shader_apply_scratch_relocs(sctx, shader, scratch_va);
+
+		if (si_shader_binary_upload(sctx->screen, shader))
+			return false;
+
+		r600_resource_reference(&shader->scratch_bo,
+		                        sctx->compute_scratch_buffer);
+	}
+
+	return true;
+}
+
 static void si_upload_compute_input(struct si_context *sctx,
                                   const struct pipe_grid_info *info)
 {
@@ -331,7 +286,6 @@ static void si_launch_grid(
 	unsigned i;
 	struct si_shader *shader = &program->shader;
 	unsigned lds_blocks;
-	unsigned num_waves_for_scratch;
 
 	si_need_cs_space(sctx);
 
@@ -351,19 +305,19 @@ static void si_launch_grid(
 	/* Read the config information */
 	si_shader_binary_read_config(&shader->binary, &shader->config, info->pc);
 
+	if (!si_setup_compute_scratch_buffer(sctx, shader, &shader->config))
+		return;
+
 	if (program->input_size || program->ir_type == PIPE_SHADER_IR_NATIVE)
 		si_upload_compute_input(sctx, info);
 
-	num_waves_for_scratch =	compute_num_waves_for_scratch(
-		&sctx->screen->b.info, info->block, info->grid);
-
 	if (shader->config.scratch_bytes_per_wave > 0) {
 
 		COMPUTE_DBG(sctx->screen, "Waves: %u; Scratch per wave: %u bytes; "
-		            "Total Scratch: %u bytes\n", num_waves_for_scratch,
+		            "Total Scratch: %u bytes\n", sctx->scratch_waves,
 			    shader->config.scratch_bytes_per_wave,
 			    shader->config.scratch_bytes_per_wave *
-			    num_waves_for_scratch);
+			    sctx->scratch_waves);
 
 		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
 					  shader->scratch_bo,
@@ -419,15 +373,12 @@ static void si_launch_grid(
 
 	si_pm4_set_reg(pm4, R_00B84C_COMPUTE_PGM_RSRC2, shader->config.rsrc2);
 
-	num_waves_for_scratch =
-		MIN2(num_waves_for_scratch,
-		     32 * sctx->screen->b.info.num_good_compute_units);
 	si_pm4_set_reg(pm4, R_00B860_COMPUTE_TMPRING_SIZE,
 		/* The maximum value for WAVES is 32 * num CU.
 		 * If you program this value incorrectly, the GPU will hang if
 		 * COMPUTE_PGM_RSRC2.SCRATCH_EN is enabled.
 		 */
-		S_00B860_WAVES(num_waves_for_scratch)
+		S_00B860_WAVES(sctx->scratch_waves)
 		| S_00B860_WAVESIZE(shader->config.scratch_bytes_per_wave >> 10))
 		;
 
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index ca07331187f..dabd28a4bc2 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -52,6 +52,7 @@ static void si_destroy_context(struct pipe_context *context)
 	r600_resource_reference(&sctx->border_color_buffer, NULL);
 	free(sctx->border_color_table);
 	r600_resource_reference(&sctx->scratch_buffer, NULL);
+	r600_resource_reference(&sctx->compute_scratch_buffer, NULL);
 	sctx->b.ws->fence_reference(&sctx->last_gfx_fence, NULL);
 
 	si_pm4_free_state(sctx, sctx->init_config, ~0);
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 91ccbea6ba3..d752c0c7fbd 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -306,6 +306,8 @@ struct si_context {
 	unsigned		scratch_waves;
 	unsigned		spi_tmpring_size;
 
+	struct r600_resource	*compute_scratch_buffer;
+
 	/* Emitted derived tessellation state. */
 	struct si_shader	*last_ls; /* local shader (VS) */
 	struct si_shader_selector *last_tcs;
-- 
2.30.2