From f0c9ef410adb52167509e62b5d4a4dfeb602c163 Mon Sep 17 00:00:00 2001
From: Bas Nieuwenhuizen
Date: Sun, 14 Jan 2018 21:20:20 +0100
Subject: [PATCH] radv: Add PM4 pregeneration for compute pipelines.

Reviewed-by: Dave Airlie
Reviewed-by: Samuel Pitoiset
---
 src/amd/vulkan/radv_cmd_buffer.c | 60 +----------------------------
 src/amd/vulkan/radv_pipeline.c   | 66 ++++++++++++++++++++++++++++++++
 2 files changed, 68 insertions(+), 58 deletions(-)

diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index f3f765a96e0..1280a186525 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -2180,76 +2180,20 @@ VkResult radv_EndCommandBuffer(
 static void
 radv_emit_compute_pipeline(struct radv_cmd_buffer *cmd_buffer)
 {
-	struct radv_shader_variant *compute_shader;
 	struct radv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
-	struct radv_device *device = cmd_buffer->device;
-	unsigned compute_resource_limits;
-	unsigned waves_per_threadgroup;
-	uint64_t va;
 
 	if (!pipeline || pipeline == cmd_buffer->state.emitted_compute_pipeline)
 		return;
 
 	cmd_buffer->state.emitted_compute_pipeline = pipeline;
 
-	compute_shader = pipeline->shaders[MESA_SHADER_COMPUTE];
-	va = radv_buffer_get_va(compute_shader->bo) + compute_shader->bo_offset;
-
-	MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws,
-							   cmd_buffer->cs, 19);
-
-	radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B830_COMPUTE_PGM_LO, 2);
-	radeon_emit(cmd_buffer->cs, va >> 8);
-	radeon_emit(cmd_buffer->cs, va >> 40);
-
-	radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B848_COMPUTE_PGM_RSRC1, 2);
-	radeon_emit(cmd_buffer->cs, compute_shader->rsrc1);
-	radeon_emit(cmd_buffer->cs, compute_shader->rsrc2);
-
+	radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, pipeline->cs.cdw);
+	radeon_emit_array(cmd_buffer->cs, pipeline->cs.buf, pipeline->cs.cdw);
 	cmd_buffer->compute_scratch_size_needed =
 		MAX2(cmd_buffer->compute_scratch_size_needed,
 		     pipeline->max_waves *
 		     pipeline->scratch_bytes_per_wave);
 
-	/* change these once we have scratch support */
-	radeon_set_sh_reg(cmd_buffer->cs, R_00B860_COMPUTE_TMPRING_SIZE,
-			  S_00B860_WAVES(pipeline->max_waves) |
-			  S_00B860_WAVESIZE(pipeline->scratch_bytes_per_wave >> 10));
-
-	/* Calculate best compute resource limits. */
-	waves_per_threadgroup =
-		DIV_ROUND_UP(compute_shader->info.cs.block_size[0] *
-			     compute_shader->info.cs.block_size[1] *
-			     compute_shader->info.cs.block_size[2], 64);
-	compute_resource_limits =
-		S_00B854_SIMD_DEST_CNTL(waves_per_threadgroup % 4 == 0);
-
-	if (device->physical_device->rad_info.chip_class >= CIK) {
-		unsigned num_cu_per_se =
-			device->physical_device->rad_info.num_good_compute_units /
-			device->physical_device->rad_info.max_se;
-
-		/* Force even distribution on all SIMDs in CU if the workgroup
-		 * size is 64. This has shown some good improvements if # of
-		 * CUs per SE is not a multiple of 4.
-		 */
-		if (num_cu_per_se % 4 && waves_per_threadgroup == 1)
-			compute_resource_limits |= S_00B854_FORCE_SIMD_DIST(1);
-	}
-
-	radeon_set_sh_reg(cmd_buffer->cs, R_00B854_COMPUTE_RESOURCE_LIMITS,
-			  compute_resource_limits);
-
-	radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
-	radeon_emit(cmd_buffer->cs,
-		    S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[0]));
-	radeon_emit(cmd_buffer->cs,
-		    S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[1]));
-	radeon_emit(cmd_buffer->cs,
-		    S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[2]));
-
-	assert(cmd_buffer->cs->cdw <= cdw_max);
-
 	if (unlikely(cmd_buffer->device->trace_bo))
 		radv_save_pipeline(cmd_buffer, pipeline, RING_COMPUTE);
 }
diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
index 1c5d994af1c..8627b1122bc 100644
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -3277,6 +3277,69 @@ VkResult radv_CreateGraphicsPipelines(
 
 	return result;
 }
+
+static void
+radv_compute_generate_pm4(struct radv_pipeline *pipeline)
+{
+	struct radv_shader_variant *compute_shader;
+	struct radv_device *device = pipeline->device;
+	unsigned compute_resource_limits;
+	unsigned waves_per_threadgroup;
+	uint64_t va;
+
+	pipeline->cs.buf = malloc(20 * 4);
+	pipeline->cs.max_dw = 20;
+
+	compute_shader = pipeline->shaders[MESA_SHADER_COMPUTE];
+	va = radv_buffer_get_va(compute_shader->bo) + compute_shader->bo_offset;
+
+	radeon_set_sh_reg_seq(&pipeline->cs, R_00B830_COMPUTE_PGM_LO, 2);
+	radeon_emit(&pipeline->cs, va >> 8);
+	radeon_emit(&pipeline->cs, va >> 40);
+
+	radeon_set_sh_reg_seq(&pipeline->cs, R_00B848_COMPUTE_PGM_RSRC1, 2);
+	radeon_emit(&pipeline->cs, compute_shader->rsrc1);
+	radeon_emit(&pipeline->cs, compute_shader->rsrc2);
+
+	radeon_set_sh_reg(&pipeline->cs, R_00B860_COMPUTE_TMPRING_SIZE,
+			  S_00B860_WAVES(pipeline->max_waves) |
+			  S_00B860_WAVESIZE(pipeline->scratch_bytes_per_wave >> 10));
+
+	/* Calculate best compute resource limits. */
+	waves_per_threadgroup =
+		DIV_ROUND_UP(compute_shader->info.cs.block_size[0] *
+			     compute_shader->info.cs.block_size[1] *
+			     compute_shader->info.cs.block_size[2], 64);
+	compute_resource_limits =
+		S_00B854_SIMD_DEST_CNTL(waves_per_threadgroup % 4 == 0);
+
+	if (device->physical_device->rad_info.chip_class >= CIK) {
+		unsigned num_cu_per_se =
+			device->physical_device->rad_info.num_good_compute_units /
+			device->physical_device->rad_info.max_se;
+
+		/* Force even distribution on all SIMDs in CU if the workgroup
+		 * size is 64. This has shown some good improvements if # of
+		 * CUs per SE is not a multiple of 4.
+		 */
+		if (num_cu_per_se % 4 && waves_per_threadgroup == 1)
+			compute_resource_limits |= S_00B854_FORCE_SIMD_DIST(1);
+	}
+
+	radeon_set_sh_reg(&pipeline->cs, R_00B854_COMPUTE_RESOURCE_LIMITS,
+			  compute_resource_limits);
+
+	radeon_set_sh_reg_seq(&pipeline->cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
+	radeon_emit(&pipeline->cs,
+		    S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[0]));
+	radeon_emit(&pipeline->cs,
+		    S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[1]));
+	radeon_emit(&pipeline->cs,
+		    S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[2]));
+
+	assert(pipeline->cs.cdw <= pipeline->cs.max_dw);
+}
+
 static VkResult radv_compute_pipeline_create(
 	VkDevice                                    _device,
 	VkPipelineCache                             _cache,
@@ -3310,6 +3373,8 @@ static VkResult radv_compute_pipeline_create(
 		return result;
 	}
 
+	radv_compute_generate_pm4(pipeline);
+
 	*pPipeline = radv_pipeline_to_handle(pipeline);
 
 	if (device->instance->debug_flags & RADV_DEBUG_DUMP_SHADER_STATS) {
@@ -3317,6 +3382,7 @@ static VkResult radv_compute_pipeline_create(
 	}
 	return VK_SUCCESS;
 }
+
 VkResult radv_CreateComputePipelines(
 	VkDevice                                    _device,
 	VkPipelineCache                             pipelineCache,
-- 
2.30.2
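For context: after this change, binding a compute pipeline at record time reduces to a bounds check (radeon_check_space) plus a copy of pregenerated dwords (radeon_emit_array), while the register encoding runs once in radv_compute_generate_pm4() at pipeline creation. The sketch below, not part of the patch, is a minimal self-contained C illustration of that generate-once, replay-many pattern; struct cs, its helper functions, and the dword values are hypothetical stand-ins for illustration, not the RADV API.

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/* A command stream with the same shape as the pipeline->cs the
 * patch fills at pipeline-creation time. */
struct cs {
	uint32_t *buf;   /* command dwords */
	unsigned cdw;    /* dwords written so far */
	unsigned max_dw; /* allocated capacity in dwords */
};

/* Append one dword, with the kind of overflow assert the patch
 * relies on (assert(pipeline->cs.cdw <= pipeline->cs.max_dw)). */
static void cs_emit(struct cs *cs, uint32_t dw)
{
	assert(cs->cdw < cs->max_dw);
	cs->buf[cs->cdw++] = dw;
}

/* Record-time replay: copy the pregenerated dwords instead of
 * re-encoding every register write. This is the role
 * radeon_emit_array() plays in radv_emit_compute_pipeline(). */
static void cs_emit_array(struct cs *dst, const struct cs *src)
{
	assert(dst->cdw + src->cdw <= dst->max_dw);
	memcpy(dst->buf + dst->cdw, src->buf, src->cdw * sizeof(uint32_t));
	dst->cdw += src->cdw;
}

int main(void)
{
	/* Creation time: encode once into a buffer of known size,
	 * mirroring the fixed 20-dword allocation in
	 * radv_compute_generate_pm4(). The dword values here are
	 * placeholders, not a real PM4 packet encoding. */
	struct cs pipeline_cs = { malloc(20 * sizeof(uint32_t)), 0, 20 };
	assert(pipeline_cs.buf);
	cs_emit(&pipeline_cs, 0xdeadbeef); /* stand-in for a packet header */
	cs_emit(&pipeline_cs, 0xcafef00d); /* stand-in for a register value */

	/* Record time: every command buffer that binds the pipeline
	 * just copies the finished dwords. */
	struct cs cmd_cs = { malloc(256 * sizeof(uint32_t)), 0, 256 };
	assert(cmd_cs.buf);
	cs_emit_array(&cmd_cs, &pipeline_cs);

	free(pipeline_cs.buf);
	free(cmd_cs.buf);
	return 0;
}

The trade-off is the usual one for pregeneration: because the register sequence for a compute pipeline has a fixed, known size, it can be encoded and size-validated once at creation and then replayed with a memcpy on every bind, moving work out of the command-recording hot path.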