From f0c9ef410adb52167509e62b5d4a4dfeb602c163 Mon Sep 17 00:00:00 2001
From: Bas Nieuwenhuizen
Date: Sun, 14 Jan 2018 21:20:20 +0100
Subject: [PATCH] radv: Add PM4 pregeneration for compute pipelines.

Reviewed-by: Dave Airlie
Reviewed-by: Samuel Pitoiset
---
 src/amd/vulkan/radv_cmd_buffer.c | 60 +----------------------------
 src/amd/vulkan/radv_pipeline.c   | 66 ++++++++++++++++++++++++++++++++
 2 files changed, 68 insertions(+), 58 deletions(-)

diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index f3f765a96e0..1280a186525 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -2180,76 +2180,20 @@ VkResult radv_EndCommandBuffer(
 static void
 radv_emit_compute_pipeline(struct radv_cmd_buffer *cmd_buffer)
 {
-	struct radv_shader_variant *compute_shader;
 	struct radv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
-	struct radv_device *device = cmd_buffer->device;
-	unsigned compute_resource_limits;
-	unsigned waves_per_threadgroup;
-	uint64_t va;
 
 	if (!pipeline || pipeline == cmd_buffer->state.emitted_compute_pipeline)
 		return;
 
 	cmd_buffer->state.emitted_compute_pipeline = pipeline;
 
-	compute_shader = pipeline->shaders[MESA_SHADER_COMPUTE];
-	va = radv_buffer_get_va(compute_shader->bo) + compute_shader->bo_offset;
-
-	MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws,
-							   cmd_buffer->cs, 19);
-
-	radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B830_COMPUTE_PGM_LO, 2);
-	radeon_emit(cmd_buffer->cs, va >> 8);
-	radeon_emit(cmd_buffer->cs, va >> 40);
-
-	radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B848_COMPUTE_PGM_RSRC1, 2);
-	radeon_emit(cmd_buffer->cs, compute_shader->rsrc1);
-	radeon_emit(cmd_buffer->cs, compute_shader->rsrc2);
-
+	radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, pipeline->cs.cdw);
+	radeon_emit_array(cmd_buffer->cs, pipeline->cs.buf, pipeline->cs.cdw);
 	cmd_buffer->compute_scratch_size_needed =
 		MAX2(cmd_buffer->compute_scratch_size_needed,
 		     pipeline->max_waves *
 		     pipeline->scratch_bytes_per_wave);
 
-	/* change these once we have scratch support */
-	radeon_set_sh_reg(cmd_buffer->cs, R_00B860_COMPUTE_TMPRING_SIZE,
-			  S_00B860_WAVES(pipeline->max_waves) |
-			  S_00B860_WAVESIZE(pipeline->scratch_bytes_per_wave >> 10));
-
-	/* Calculate best compute resource limits. */
-	waves_per_threadgroup =
-		DIV_ROUND_UP(compute_shader->info.cs.block_size[0] *
-			     compute_shader->info.cs.block_size[1] *
-			     compute_shader->info.cs.block_size[2], 64);
-	compute_resource_limits =
-		S_00B854_SIMD_DEST_CNTL(waves_per_threadgroup % 4 == 0);
-
-	if (device->physical_device->rad_info.chip_class >= CIK) {
-		unsigned num_cu_per_se =
-			device->physical_device->rad_info.num_good_compute_units /
-			device->physical_device->rad_info.max_se;
-
-		/* Force even distribution on all SIMDs in CU if the workgroup
-		 * size is 64. This has shown some good improvements if # of
-		 * CUs per SE is not a multiple of 4.
-		 */
-		if (num_cu_per_se % 4 && waves_per_threadgroup == 1)
-			compute_resource_limits |= S_00B854_FORCE_SIMD_DIST(1);
-	}
-
-	radeon_set_sh_reg(cmd_buffer->cs, R_00B854_COMPUTE_RESOURCE_LIMITS,
-			  compute_resource_limits);
-
-	radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
-	radeon_emit(cmd_buffer->cs,
-		    S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[0]));
-	radeon_emit(cmd_buffer->cs,
-		    S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[1]));
-	radeon_emit(cmd_buffer->cs,
-		    S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[2]));
-
-	assert(cmd_buffer->cs->cdw <= cdw_max);
-
 	if (unlikely(cmd_buffer->device->trace_bo))
 		radv_save_pipeline(cmd_buffer, pipeline, RING_COMPUTE);
 }
diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
index 1c5d994af1c..8627b1122bc 100644
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -3277,6 +3277,69 @@ VkResult radv_CreateGraphicsPipelines(
 
 	return result;
 }
+
+static void
+radv_compute_generate_pm4(struct radv_pipeline *pipeline)
+{
+	struct radv_shader_variant *compute_shader;
+	struct radv_device *device = pipeline->device;
+	unsigned compute_resource_limits;
+	unsigned waves_per_threadgroup;
+	uint64_t va;
+
+	pipeline->cs.buf = malloc(20 * 4);
+	pipeline->cs.max_dw = 20;
+
+	compute_shader = pipeline->shaders[MESA_SHADER_COMPUTE];
+	va = radv_buffer_get_va(compute_shader->bo) + compute_shader->bo_offset;
+
+	radeon_set_sh_reg_seq(&pipeline->cs, R_00B830_COMPUTE_PGM_LO, 2);
+	radeon_emit(&pipeline->cs, va >> 8);
+	radeon_emit(&pipeline->cs, va >> 40);
+
+	radeon_set_sh_reg_seq(&pipeline->cs, R_00B848_COMPUTE_PGM_RSRC1, 2);
+	radeon_emit(&pipeline->cs, compute_shader->rsrc1);
+	radeon_emit(&pipeline->cs, compute_shader->rsrc2);
+
+	radeon_set_sh_reg(&pipeline->cs, R_00B860_COMPUTE_TMPRING_SIZE,
+			  S_00B860_WAVES(pipeline->max_waves) |
+			  S_00B860_WAVESIZE(pipeline->scratch_bytes_per_wave >> 10));
+
+	/* Calculate best compute resource limits. */
+	waves_per_threadgroup =
+		DIV_ROUND_UP(compute_shader->info.cs.block_size[0] *
+			     compute_shader->info.cs.block_size[1] *
+			     compute_shader->info.cs.block_size[2], 64);
+	compute_resource_limits =
+		S_00B854_SIMD_DEST_CNTL(waves_per_threadgroup % 4 == 0);
+
+	if (device->physical_device->rad_info.chip_class >= CIK) {
+		unsigned num_cu_per_se =
+			device->physical_device->rad_info.num_good_compute_units /
+			device->physical_device->rad_info.max_se;
+
+		/* Force even distribution on all SIMDs in CU if the workgroup
+		 * size is 64. This has shown some good improvements if # of
+		 * CUs per SE is not a multiple of 4.
+		 */
+		if (num_cu_per_se % 4 && waves_per_threadgroup == 1)
+			compute_resource_limits |= S_00B854_FORCE_SIMD_DIST(1);
+	}
+
+	radeon_set_sh_reg(&pipeline->cs, R_00B854_COMPUTE_RESOURCE_LIMITS,
+			  compute_resource_limits);
+
+	radeon_set_sh_reg_seq(&pipeline->cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
+	radeon_emit(&pipeline->cs,
+		    S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[0]));
+	radeon_emit(&pipeline->cs,
+		    S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[1]));
+	radeon_emit(&pipeline->cs,
+		    S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[2]));
+
+	assert(pipeline->cs.cdw <= pipeline->cs.max_dw);
+}
+
 static VkResult radv_compute_pipeline_create(
 	VkDevice                                    _device,
 	VkPipelineCache                             _cache,
@@ -3310,6 +3373,8 @@ static VkResult radv_compute_pipeline_create(
 		return result;
 	}
 
+	radv_compute_generate_pm4(pipeline);
+
 	*pPipeline = radv_pipeline_to_handle(pipeline);
 
 	if (device->instance->debug_flags & RADV_DEBUG_DUMP_SHADER_STATS) {
@@ -3317,6 +3382,7 @@ static VkResult radv_compute_pipeline_create(
 	}
 	return VK_SUCCESS;
 }
+
 VkResult radv_CreateComputePipelines(
 	VkDevice                                    _device,
 	VkPipelineCache                             pipelineCache,
-- 
2.30.2
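For context: after this change, binding a compute pipeline at record time reduces to a bounds check (radeon_check_space) plus a copy of pregenerated dwords (radeon_emit_array), while the register encoding runs once in radv_compute_generate_pm4() at pipeline creation. The sketch below, not part of the patch, is a minimal self-contained C illustration of that generate-once, replay-many pattern; struct cs, its helper functions, and the dword values are hypothetical stand-ins for illustration, not the RADV API.

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/* A command stream with the same shape as the pipeline->cs the
 * patch fills at pipeline-creation time. */
struct cs {
	uint32_t *buf;   /* command dwords */
	unsigned cdw;    /* dwords written so far */
	unsigned max_dw; /* allocated capacity in dwords */
};

/* Append one dword, with the kind of overflow assert the patch
 * relies on (assert(pipeline->cs.cdw <= pipeline->cs.max_dw)). */
static void cs_emit(struct cs *cs, uint32_t dw)
{
	assert(cs->cdw < cs->max_dw);
	cs->buf[cs->cdw++] = dw;
}

/* Record-time replay: copy the pregenerated dwords instead of
 * re-encoding every register write. This is the role
 * radeon_emit_array() plays in radv_emit_compute_pipeline(). */
static void cs_emit_array(struct cs *dst, const struct cs *src)
{
	assert(dst->cdw + src->cdw <= dst->max_dw);
	memcpy(dst->buf + dst->cdw, src->buf, src->cdw * sizeof(uint32_t));
	dst->cdw += src->cdw;
}

int main(void)
{
	/* Creation time: encode once into a buffer of known size,
	 * mirroring the fixed 20-dword allocation in
	 * radv_compute_generate_pm4(). The dword values here are
	 * placeholders, not a real PM4 packet encoding. */
	struct cs pipeline_cs = { malloc(20 * sizeof(uint32_t)), 0, 20 };
	assert(pipeline_cs.buf);
	cs_emit(&pipeline_cs, 0xdeadbeef); /* stand-in for a packet header */
	cs_emit(&pipeline_cs, 0xcafef00d); /* stand-in for a register value */

	/* Record time: every command buffer that binds the pipeline
	 * just copies the finished dwords. */
	struct cs cmd_cs = { malloc(256 * sizeof(uint32_t)), 0, 256 };
	assert(cmd_cs.buf);
	cs_emit_array(&cmd_cs, &pipeline_cs);

	free(pipeline_cs.buf);
	free(cmd_cs.buf);
	return 0;
}

The trade-off is the usual one for pregeneration: because the register sequence for a compute pipeline has a fixed, known size, it can be encoded and size-validated once at creation and then replayed with a memcpy on every bind, moving work out of the command-recording hot path.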