It can be enabled with RADV_PERFTEST=cswave32.
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
RADV_PERFTEST_BO_LIST = 0x20,
RADV_PERFTEST_SHADER_BALLOT = 0x40,
RADV_PERFTEST_TC_COMPAT_CMASK = 0x80,
+ RADV_PERFTEST_CS_WAVE_32 = 0x100,
};
bool
device->use_shader_ballot = device->instance->perftest_flags & RADV_PERFTEST_SHADER_BALLOT;
+ /* Determine the number of threads per wave for all stages. */
+ device->cs_wave_size = 64;
+
+ if (device->rad_info.chip_class >= GFX10) {
+ if (device->instance->perftest_flags & RADV_PERFTEST_CS_WAVE_32)
+ device->cs_wave_size = 32;
+ }
+
radv_physical_device_init_mem_types(device);
radv_fill_device_extension_table(device, &device->supported_extensions);
{"bolist", RADV_PERFTEST_BO_LIST},
{"shader_ballot", RADV_PERFTEST_SHADER_BALLOT},
{"tccompatcmask", RADV_PERFTEST_TC_COMPAT_CMASK},
+ {"cswave32", RADV_PERFTEST_CS_WAVE_32},
{NULL, 0}
};
device->scratch_waves = MAX2(32 * physical_device->rad_info.num_good_compute_units,
max_threads_per_block / 64);
- device->dispatch_initiator = S_00B800_COMPUTE_SHADER_EN(1);
+ device->dispatch_initiator = S_00B800_COMPUTE_SHADER_EN(1) |
+ S_00B800_CS_W32_EN(device->physical_device->cs_wave_size == 32);
if (device->physical_device->rad_info.chip_class >= GFX7) {
/* If the KMD allows it (there is a KMD hw register for it),
LLVMSetAlignment(ctx->esgs_ring, 64 * 1024);
}
+static uint8_t
+radv_nir_shader_wave_size(struct nir_shader *const *shaders, int shader_count,
+ const struct radv_nir_compiler_options *options)
+{
+ if (shaders[0]->info.stage == MESA_SHADER_COMPUTE)
+ return options->cs_wave_size;
+ return 64;
+}
+
static
LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
struct nir_shader *const *shaders,
options->unsafe_math ? AC_FLOAT_MODE_UNSAFE_FP_MATH :
AC_FLOAT_MODE_DEFAULT;
+ uint8_t wave_size = radv_nir_shader_wave_size(shaders,
+ shader_count, options);
+
ac_llvm_context_init(&ctx.ac, ac_llvm, options->chip_class,
- options->family, float_mode, 64);
+ options->family, float_mode, wave_size);
ctx.context = ctx.ac.context;
radv_nir_shader_info_init(&shader_info->info);
threads_per_threadgroup = compute_shader->info.cs.block_size[0] *
compute_shader->info.cs.block_size[1] *
compute_shader->info.cs.block_size[2];
- waves_per_threadgroup = DIV_ROUND_UP(threads_per_threadgroup, 64);
+ waves_per_threadgroup = DIV_ROUND_UP(threads_per_threadgroup,
+ device->physical_device->cs_wave_size);
if (device->physical_device->rad_info.chip_class >= GFX10 &&
waves_per_threadgroup == 1)
/* Whether DISABLE_CONSTANT_ENCODE_REG is supported. */
bool has_dcc_constant_encode;
+ /* Number of threads per wave. */
+ uint8_t cs_wave_size;
+
/* This is the drivers on-disk cache used as a fallback as opposed to
* the pipeline cache defined by apps.
*/
return code_size + DEBUGGER_NUM_MARKERS * 4;
}
+static uint8_t
+radv_get_shader_wave_size(const struct radv_physical_device *pdevice,
+ gl_shader_stage stage)
+{
+ if (stage == MESA_SHADER_COMPUTE)
+ return pdevice->cs_wave_size;
+
+ return 64;
+}
+
static void radv_postprocess_config(const struct radv_physical_device *pdevice,
const struct ac_shader_config *config_in,
const struct radv_shader_variant_info *info,
struct ac_shader_config *config_out)
{
bool scratch_enabled = config_in->scratch_bytes_per_wave > 0;
+ uint8_t wave_size = radv_get_shader_wave_size(pdevice, stage);
unsigned vgpr_comp_cnt = 0;
unsigned num_input_vgprs = info->num_input_vgprs;
S_00B12C_SO_BASE3_EN(!!info->info.so.strides[3]) |
S_00B12C_SO_EN(!!info->info.so.num_outputs);
- config_out->rsrc1 = S_00B848_VGPRS((num_vgprs - 1) / 4) |
+ config_out->rsrc1 = S_00B848_VGPRS((num_vgprs - 1) /
+ (wave_size == 32 ? 8 : 4)) |
S_00B848_DX10_CLAMP(1) |
S_00B848_FLOAT_MODE(config_out->float_mode);
if (binary->variant_info.is_ngg)
sym->size -= 32;
}
+
+ uint8_t wave_size =
+ radv_get_shader_wave_size(device->physical_device,
+ binary->stage);
+
struct ac_rtld_open_info open_info = {
.info = &device->physical_device->rad_info,
.shader_type = binary->stage,
- .wave_size = 64,
+ .wave_size = wave_size,
.num_parts = 1,
.elf_ptrs = &elf_data,
.elf_sizes = &elf_size,
options->check_ir = device->instance->debug_flags & RADV_DEBUG_CHECKIR;
options->tess_offchip_block_dw_size = device->tess_offchip_block_dw_size;
options->address32_hi = device->physical_device->rad_info.address32_hi;
+ options->cs_wave_size = device->physical_device->cs_wave_size;
if (options->supports_spill)
tm_options |= AC_TM_SUPPORTS_SPILL;
{
enum chip_class chip_class = device->physical_device->rad_info.chip_class;
unsigned lds_increment = chip_class >= GFX7 ? 512 : 256;
+ uint8_t wave_size = radv_get_shader_wave_size(device->physical_device, stage);
struct ac_shader_config *conf;
unsigned max_simd_waves;
unsigned lds_per_wave = 0;
unsigned max_workgroup_size =
radv_nir_get_max_workgroup_size(chip_class, stage, variant->nir);
lds_per_wave = (conf->lds_size * lds_increment) /
- DIV_ROUND_UP(max_workgroup_size, 64);
+ DIV_ROUND_UP(max_workgroup_size, wave_size);
}
if (conf->num_sgprs)
enum chip_class chip_class;
uint32_t tess_offchip_block_dw_size;
uint32_t address32_hi;
+ uint8_t cs_wave_size;
};
enum radv_ud_index {