radv/gfx10: add Wave32 support for compute shaders
authorSamuel Pitoiset <samuel.pitoiset@gmail.com>
Tue, 30 Jul 2019 16:32:42 +0000 (18:32 +0200)
committerSamuel Pitoiset <samuel.pitoiset@gmail.com>
Wed, 31 Jul 2019 07:35:04 +0000 (09:35 +0200)
It can be enabled with RADV_PERFTEST=cswave32.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
src/amd/vulkan/radv_debug.h
src/amd/vulkan/radv_device.c
src/amd/vulkan/radv_nir_to_llvm.c
src/amd/vulkan/radv_pipeline.c
src/amd/vulkan/radv_private.h
src/amd/vulkan/radv_shader.c
src/amd/vulkan/radv_shader.h

index 723fabda57f2b709fd0b052287f41762baf34e63..6414e882676b2c9a53833f92e8b341b5ee20515f 100644 (file)
@@ -64,6 +64,7 @@ enum {
        RADV_PERFTEST_BO_LIST        =  0x20,
        RADV_PERFTEST_SHADER_BALLOT  =  0x40,
        RADV_PERFTEST_TC_COMPAT_CMASK = 0x80,
+       RADV_PERFTEST_CS_WAVE_32     = 0x100,
 };
 
 bool
index 65e3ccf91ad410c9cd6bfd86b60735515cef31b3..29be192443a9f6927e03963a0fff6ac33cb76a17 100644 (file)
@@ -383,6 +383,14 @@ radv_physical_device_init(struct radv_physical_device *device,
 
        device->use_shader_ballot = device->instance->perftest_flags & RADV_PERFTEST_SHADER_BALLOT;
 
+       /* Determine the number of threads per wave for all stages. */
+       device->cs_wave_size = 64;
+
+       if (device->rad_info.chip_class >= GFX10) {
+               if (device->instance->perftest_flags & RADV_PERFTEST_CS_WAVE_32)
+                       device->cs_wave_size = 32;
+       }
+
        radv_physical_device_init_mem_types(device);
        radv_fill_device_extension_table(device, &device->supported_extensions);
 
@@ -494,6 +502,7 @@ static const struct debug_control radv_perftest_options[] = {
        {"bolist", RADV_PERFTEST_BO_LIST},
        {"shader_ballot", RADV_PERFTEST_SHADER_BALLOT},
        {"tccompatcmask", RADV_PERFTEST_TC_COMPAT_CMASK},
+       {"cswave32", RADV_PERFTEST_CS_WAVE_32},
        {NULL, 0}
 };
 
@@ -1930,7 +1939,8 @@ VkResult radv_CreateDevice(
        device->scratch_waves = MAX2(32 * physical_device->rad_info.num_good_compute_units,
                                     max_threads_per_block / 64);
 
-       device->dispatch_initiator = S_00B800_COMPUTE_SHADER_EN(1);
+       device->dispatch_initiator = S_00B800_COMPUTE_SHADER_EN(1) |
+                                    S_00B800_CS_W32_EN(device->physical_device->cs_wave_size == 32);
 
        if (device->physical_device->rad_info.chip_class >= GFX7) {
                /* If the KMD allows it (there is a KMD hw register for it),
index b0ca63f833a31b7b01b575c5e67770833689b1d8..bb78bcccf0e1ebe409fb6b06a5a7ec24c40e5ff2 100644 (file)
@@ -4317,6 +4317,15 @@ static void declare_esgs_ring(struct radv_shader_context *ctx)
        LLVMSetAlignment(ctx->esgs_ring, 64 * 1024);
 }
 
+static uint8_t
+radv_nir_shader_wave_size(struct nir_shader *const *shaders, int shader_count,
+                         const struct radv_nir_compiler_options *options)
+{
+       if (shaders[0]->info.stage == MESA_SHADER_COMPUTE)
+               return options->cs_wave_size;
+       return 64;
+}
+
 static
 LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
                                        struct nir_shader *const *shaders,
@@ -4333,8 +4342,11 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
                options->unsafe_math ? AC_FLOAT_MODE_UNSAFE_FP_MATH :
                                       AC_FLOAT_MODE_DEFAULT;
 
+       uint8_t wave_size = radv_nir_shader_wave_size(shaders,
+                                                     shader_count, options);
+
        ac_llvm_context_init(&ctx.ac, ac_llvm, options->chip_class,
-                            options->family, float_mode, 64);
+                            options->family, float_mode, wave_size);
        ctx.context = ctx.ac.context;
 
        radv_nir_shader_info_init(&shader_info->info);
index e11196bd82efe84582f45a5f589d0163e787f60e..5c913f29a5a01a063b7a700a94a447b010cba602 100644 (file)
@@ -4649,7 +4649,8 @@ radv_compute_generate_pm4(struct radv_pipeline *pipeline)
        threads_per_threadgroup = compute_shader->info.cs.block_size[0] *
                                  compute_shader->info.cs.block_size[1] *
                                  compute_shader->info.cs.block_size[2];
-       waves_per_threadgroup = DIV_ROUND_UP(threads_per_threadgroup, 64);
+       waves_per_threadgroup = DIV_ROUND_UP(threads_per_threadgroup,
+                                            device->physical_device->cs_wave_size);
 
        if (device->physical_device->rad_info.chip_class >= GFX10 &&
            waves_per_threadgroup == 1)
index 0700dbcfa496c6cd2c44bd06b59471681869e067..143c09811c8fe4c952666cbb8af47b00b4a0ff61 100644 (file)
@@ -301,6 +301,9 @@ struct radv_physical_device {
        /* Whether DISABLE_CONSTANT_ENCODE_REG is supported. */
        bool has_dcc_constant_encode;
 
+       /* Number of threads per wave. */
+       uint8_t cs_wave_size;
+
        /* This is the drivers on-disk cache used as a fallback as opposed to
         * the pipeline cache defined by apps.
         */
index 748fc3a42536ea81cf916bea702f0fa15afeef7c..9c88ab551bb4d3ed16c2deb03a6f0a3af7c79251 100644 (file)
@@ -667,6 +667,16 @@ radv_get_shader_binary_size(size_t code_size)
        return code_size + DEBUGGER_NUM_MARKERS * 4;
 }
 
+static uint8_t
+radv_get_shader_wave_size(const struct radv_physical_device *pdevice,
+                         gl_shader_stage stage)
+{
+       if (stage == MESA_SHADER_COMPUTE)
+               return pdevice->cs_wave_size;
+
+       return 64;
+}
+
 static void radv_postprocess_config(const struct radv_physical_device *pdevice,
                                    const struct ac_shader_config *config_in,
                                    const struct radv_shader_variant_info *info,
@@ -674,6 +684,7 @@ static void radv_postprocess_config(const struct radv_physical_device *pdevice,
                                    struct ac_shader_config *config_out)
 {
        bool scratch_enabled = config_in->scratch_bytes_per_wave > 0;
+       uint8_t wave_size = radv_get_shader_wave_size(pdevice, stage);
        unsigned vgpr_comp_cnt = 0;
        unsigned num_input_vgprs = info->num_input_vgprs;
 
@@ -743,7 +754,8 @@ static void radv_postprocess_config(const struct radv_physical_device *pdevice,
                            S_00B12C_SO_BASE3_EN(!!info->info.so.strides[3]) |
                            S_00B12C_SO_EN(!!info->info.so.num_outputs);
 
-       config_out->rsrc1 = S_00B848_VGPRS((num_vgprs - 1) / 4) |
+       config_out->rsrc1 = S_00B848_VGPRS((num_vgprs - 1) /
+                                          (wave_size == 32 ? 8 : 4)) |
                            S_00B848_DX10_CLAMP(1) |
                            S_00B848_FLOAT_MODE(config_out->float_mode);
 
@@ -1009,10 +1021,15 @@ radv_shader_variant_create(struct radv_device *device,
                        if (binary->variant_info.is_ngg)
                                sym->size -= 32;
                }
+
+               uint8_t wave_size =
+                       radv_get_shader_wave_size(device->physical_device,
+                                                 binary->stage);
+
                struct ac_rtld_open_info open_info = {
                        .info = &device->physical_device->rad_info,
                        .shader_type = binary->stage,
-                       .wave_size = 64,
+                       .wave_size = wave_size,
                        .num_parts = 1,
                        .elf_ptrs = &elf_data,
                        .elf_sizes = &elf_size,
@@ -1124,6 +1141,7 @@ shader_variant_compile(struct radv_device *device,
        options->check_ir = device->instance->debug_flags & RADV_DEBUG_CHECKIR;
        options->tess_offchip_block_dw_size = device->tess_offchip_block_dw_size;
        options->address32_hi = device->physical_device->rad_info.address32_hi;
+       options->cs_wave_size = device->physical_device->cs_wave_size;
 
        if (options->supports_spill)
                tm_options |= AC_TM_SUPPORTS_SPILL;
@@ -1273,6 +1291,7 @@ generate_shader_stats(struct radv_device *device,
 {
        enum chip_class chip_class = device->physical_device->rad_info.chip_class;
        unsigned lds_increment = chip_class >= GFX7 ? 512 : 256;
+       uint8_t wave_size = radv_get_shader_wave_size(device->physical_device, stage);
        struct ac_shader_config *conf;
        unsigned max_simd_waves;
        unsigned lds_per_wave = 0;
@@ -1289,7 +1308,7 @@ generate_shader_stats(struct radv_device *device,
                unsigned max_workgroup_size =
                        radv_nir_get_max_workgroup_size(chip_class, stage, variant->nir);
                lds_per_wave = (conf->lds_size * lds_increment) /
-                              DIV_ROUND_UP(max_workgroup_size, 64);
+                              DIV_ROUND_UP(max_workgroup_size, wave_size);
        }
 
        if (conf->num_sgprs)
index 35560631c4630751f357bc17f5925100ee846310..92ae2a7259d29b6ec6da8528507568f38c2f09d2 100644 (file)
@@ -128,6 +128,7 @@ struct radv_nir_compiler_options {
        enum chip_class chip_class;
        uint32_t tess_offchip_block_dw_size;
        uint32_t address32_hi;
+       uint8_t cs_wave_size;
 };
 
 enum radv_ud_index {