From ea385650115728270e21fb7ff1ec73a8cbaf298c Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Tue, 30 Jul 2019 18:32:42 +0200 Subject: [PATCH] radv/gfx10: add Wave32 support for compute shaders It can be enabled with RADV_PERFTEST=cswave32. Signed-off-by: Samuel Pitoiset Reviewed-by: Bas Nieuwenhuizen --- src/amd/vulkan/radv_debug.h | 1 + src/amd/vulkan/radv_device.c | 12 +++++++++++- src/amd/vulkan/radv_nir_to_llvm.c | 14 +++++++++++++- src/amd/vulkan/radv_pipeline.c | 3 ++- src/amd/vulkan/radv_private.h | 3 +++ src/amd/vulkan/radv_shader.c | 25 ++++++++++++++++++++++--- src/amd/vulkan/radv_shader.h | 1 + 7 files changed, 53 insertions(+), 6 deletions(-) diff --git a/src/amd/vulkan/radv_debug.h b/src/amd/vulkan/radv_debug.h index 723fabda57f..6414e882676 100644 --- a/src/amd/vulkan/radv_debug.h +++ b/src/amd/vulkan/radv_debug.h @@ -64,6 +64,7 @@ enum { RADV_PERFTEST_BO_LIST = 0x20, RADV_PERFTEST_SHADER_BALLOT = 0x40, RADV_PERFTEST_TC_COMPAT_CMASK = 0x80, + RADV_PERFTEST_CS_WAVE_32 = 0x100, }; bool diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c index 65e3ccf91ad..29be192443a 100644 --- a/src/amd/vulkan/radv_device.c +++ b/src/amd/vulkan/radv_device.c @@ -383,6 +383,14 @@ radv_physical_device_init(struct radv_physical_device *device, device->use_shader_ballot = device->instance->perftest_flags & RADV_PERFTEST_SHADER_BALLOT; + /* Determine the number of threads per wave for all stages. */ + device->cs_wave_size = 64; + + if (device->rad_info.chip_class >= GFX10) { + if (device->instance->perftest_flags & RADV_PERFTEST_CS_WAVE_32) + device->cs_wave_size = 32; + } + radv_physical_device_init_mem_types(device); radv_fill_device_extension_table(device, &device->supported_extensions); @@ -494,6 +502,7 @@ static const struct debug_control radv_perftest_options[] = { {"bolist", RADV_PERFTEST_BO_LIST}, {"shader_ballot", RADV_PERFTEST_SHADER_BALLOT}, {"tccompatcmask", RADV_PERFTEST_TC_COMPAT_CMASK}, + {"cswave32", RADV_PERFTEST_CS_WAVE_32}, {NULL, 0} }; @@ -1930,7 +1939,8 @@ VkResult radv_CreateDevice( device->scratch_waves = MAX2(32 * physical_device->rad_info.num_good_compute_units, max_threads_per_block / 64); - device->dispatch_initiator = S_00B800_COMPUTE_SHADER_EN(1); + device->dispatch_initiator = S_00B800_COMPUTE_SHADER_EN(1) | + S_00B800_CS_W32_EN(device->physical_device->cs_wave_size == 32); if (device->physical_device->rad_info.chip_class >= GFX7) { /* If the KMD allows it (there is a KMD hw register for it), diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c index b0ca63f833a..bb78bcccf0e 100644 --- a/src/amd/vulkan/radv_nir_to_llvm.c +++ b/src/amd/vulkan/radv_nir_to_llvm.c @@ -4317,6 +4317,15 @@ static void declare_esgs_ring(struct radv_shader_context *ctx) LLVMSetAlignment(ctx->esgs_ring, 64 * 1024); } +static uint8_t +radv_nir_shader_wave_size(struct nir_shader *const *shaders, int shader_count, + const struct radv_nir_compiler_options *options) +{ + if (shaders[0]->info.stage == MESA_SHADER_COMPUTE) + return options->cs_wave_size; + return 64; +} + static LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, struct nir_shader *const *shaders, @@ -4333,8 +4342,11 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, options->unsafe_math ? AC_FLOAT_MODE_UNSAFE_FP_MATH : AC_FLOAT_MODE_DEFAULT; + uint8_t wave_size = radv_nir_shader_wave_size(shaders, + shader_count, options); + ac_llvm_context_init(&ctx.ac, ac_llvm, options->chip_class, - options->family, float_mode, 64); + options->family, float_mode, wave_size); ctx.context = ctx.ac.context; radv_nir_shader_info_init(&shader_info->info); diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c index e11196bd82e..5c913f29a5a 100644 --- a/src/amd/vulkan/radv_pipeline.c +++ b/src/amd/vulkan/radv_pipeline.c @@ -4649,7 +4649,8 @@ radv_compute_generate_pm4(struct radv_pipeline *pipeline) threads_per_threadgroup = compute_shader->info.cs.block_size[0] * compute_shader->info.cs.block_size[1] * compute_shader->info.cs.block_size[2]; - waves_per_threadgroup = DIV_ROUND_UP(threads_per_threadgroup, 64); + waves_per_threadgroup = DIV_ROUND_UP(threads_per_threadgroup, + device->physical_device->cs_wave_size); if (device->physical_device->rad_info.chip_class >= GFX10 && waves_per_threadgroup == 1) diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h index 0700dbcfa49..143c09811c8 100644 --- a/src/amd/vulkan/radv_private.h +++ b/src/amd/vulkan/radv_private.h @@ -301,6 +301,9 @@ struct radv_physical_device { /* Whether DISABLE_CONSTANT_ENCODE_REG is supported. */ bool has_dcc_constant_encode; + /* Number of threads per wave. */ + uint8_t cs_wave_size; + /* This is the drivers on-disk cache used as a fallback as opposed to * the pipeline cache defined by apps. */ diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c index 748fc3a4253..9c88ab551bb 100644 --- a/src/amd/vulkan/radv_shader.c +++ b/src/amd/vulkan/radv_shader.c @@ -667,6 +667,16 @@ radv_get_shader_binary_size(size_t code_size) return code_size + DEBUGGER_NUM_MARKERS * 4; } +static uint8_t +radv_get_shader_wave_size(const struct radv_physical_device *pdevice, + gl_shader_stage stage) +{ + if (stage == MESA_SHADER_COMPUTE) + return pdevice->cs_wave_size; + + return 64; +} + static void radv_postprocess_config(const struct radv_physical_device *pdevice, const struct ac_shader_config *config_in, const struct radv_shader_variant_info *info, @@ -674,6 +684,7 @@ static void radv_postprocess_config(const struct radv_physical_device *pdevice, struct ac_shader_config *config_out) { bool scratch_enabled = config_in->scratch_bytes_per_wave > 0; + uint8_t wave_size = radv_get_shader_wave_size(pdevice, stage); unsigned vgpr_comp_cnt = 0; unsigned num_input_vgprs = info->num_input_vgprs; @@ -743,7 +754,8 @@ static void radv_postprocess_config(const struct radv_physical_device *pdevice, S_00B12C_SO_BASE3_EN(!!info->info.so.strides[3]) | S_00B12C_SO_EN(!!info->info.so.num_outputs); - config_out->rsrc1 = S_00B848_VGPRS((num_vgprs - 1) / 4) | + config_out->rsrc1 = S_00B848_VGPRS((num_vgprs - 1) / + (wave_size == 32 ? 8 : 4)) | S_00B848_DX10_CLAMP(1) | S_00B848_FLOAT_MODE(config_out->float_mode); @@ -1009,10 +1021,15 @@ radv_shader_variant_create(struct radv_device *device, if (binary->variant_info.is_ngg) sym->size -= 32; } + + uint8_t wave_size = + radv_get_shader_wave_size(device->physical_device, + binary->stage); + struct ac_rtld_open_info open_info = { .info = &device->physical_device->rad_info, .shader_type = binary->stage, - .wave_size = 64, + .wave_size = wave_size, .num_parts = 1, .elf_ptrs = &elf_data, .elf_sizes = &elf_size, @@ -1124,6 +1141,7 @@ shader_variant_compile(struct radv_device *device, options->check_ir = device->instance->debug_flags & RADV_DEBUG_CHECKIR; options->tess_offchip_block_dw_size = device->tess_offchip_block_dw_size; options->address32_hi = device->physical_device->rad_info.address32_hi; + options->cs_wave_size = device->physical_device->cs_wave_size; if (options->supports_spill) tm_options |= AC_TM_SUPPORTS_SPILL; @@ -1273,6 +1291,7 @@ generate_shader_stats(struct radv_device *device, { enum chip_class chip_class = device->physical_device->rad_info.chip_class; unsigned lds_increment = chip_class >= GFX7 ? 512 : 256; + uint8_t wave_size = radv_get_shader_wave_size(device->physical_device, stage); struct ac_shader_config *conf; unsigned max_simd_waves; unsigned lds_per_wave = 0; @@ -1289,7 +1308,7 @@ generate_shader_stats(struct radv_device *device, unsigned max_workgroup_size = radv_nir_get_max_workgroup_size(chip_class, stage, variant->nir); lds_per_wave = (conf->lds_size * lds_increment) / - DIV_ROUND_UP(max_workgroup_size, 64); + DIV_ROUND_UP(max_workgroup_size, wave_size); } if (conf->num_sgprs) diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h index 35560631c46..92ae2a7259d 100644 --- a/src/amd/vulkan/radv_shader.h +++ b/src/amd/vulkan/radv_shader.h @@ -128,6 +128,7 @@ struct radv_nir_compiler_options { enum chip_class chip_class; uint32_t tess_offchip_block_dw_size; uint32_t address32_hi; + uint8_t cs_wave_size; }; enum radv_ud_index { -- 2.30.2