From: Marek Olšák Date: Wed, 28 Aug 2019 21:38:50 +0000 (-0400) Subject: radeonsi/gfx10: fix wave occupancy computations X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=d95afd8b9e7f9b3880813203292257bf0ed7babf;p=mesa.git radeonsi/gfx10: fix wave occupancy computations Cc: 19.2 Reviewed-by: Pierre-Eric Pelloux-Prayer --- diff --git a/src/amd/common/ac_gpu_info.h b/src/amd/common/ac_gpu_info.h index c850da22d4e..9986d58202a 100644 --- a/src/amd/common/ac_gpu_info.h +++ b/src/amd/common/ac_gpu_info.h @@ -187,7 +187,7 @@ unsigned ac_get_compute_resource_limits(struct radeon_info *info, unsigned max_waves_per_sh, unsigned threadgroups_per_cu); -static inline unsigned ac_get_max_simd_waves(enum radeon_family family) +static inline unsigned ac_get_max_wave64_per_simd(enum radeon_family family) { switch (family) { @@ -202,10 +202,26 @@ static inline unsigned ac_get_max_simd_waves(enum radeon_family family) } } +static inline unsigned ac_get_num_physical_vgprs(enum chip_class chip_class, + unsigned wave_size) +{ + /* The number is per SIMD. */ + if (chip_class >= GFX10) + return wave_size == 32 ? 1024 : 512; + else + return 256; +} + static inline uint32_t -ac_get_num_physical_sgprs(enum chip_class chip_class) +ac_get_num_physical_sgprs(const struct radeon_info *info) { - return chip_class >= GFX8 ? 800 : 512; + /* The number is per SIMD. There is enough SGPRs for the maximum number + * of Wave32, which is double the number for Wave64. + */ + if (info->chip_class >= GFX10) + return 128 * ac_get_max_wave64_per_simd(info->family) * 2; + + return info->chip_class >= GFX8 ? 800 : 512; } #ifdef __cplusplus diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c index d0de4715947..beeec37e54a 100644 --- a/src/amd/vulkan/radv_device.c +++ b/src/amd/vulkan/radv_device.c @@ -1274,7 +1274,7 @@ void radv_GetPhysicalDeviceProperties2( /* SGPR. */ properties->sgprsPerSimd = - ac_get_num_physical_sgprs(pdevice->rad_info.chip_class); + ac_get_num_physical_sgprs(&pdevice->rad_info); properties->minSgprAllocation = pdevice->rad_info.chip_class >= GFX8 ? 16 : 8; properties->maxSgprAllocation = diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c index c99e2615fca..082bdde04ab 100644 --- a/src/amd/vulkan/radv_shader.c +++ b/src/amd/vulkan/radv_shader.c @@ -1249,7 +1249,7 @@ radv_get_max_waves(struct radv_device *device, unsigned max_simd_waves; unsigned lds_per_wave = 0; - max_simd_waves = ac_get_max_simd_waves(device->physical_device->rad_info.family); + max_simd_waves = ac_get_max_wave64_per_simd(device->physical_device->rad_info.family); if (stage == MESA_SHADER_FRAGMENT) { lds_per_wave = conf->lds_size * lds_increment + @@ -1265,7 +1265,8 @@ radv_get_max_waves(struct radv_device *device, if (conf->num_sgprs) max_simd_waves = MIN2(max_simd_waves, - ac_get_num_physical_sgprs(chip_class) / conf->num_sgprs); + ac_get_num_physical_sgprs(&device->physical_device->rad_info) / + conf->num_sgprs); if (conf->num_vgprs) max_simd_waves = @@ -1362,7 +1363,7 @@ radv_GetShaderInfoAMD(VkDevice _device, VkShaderStatisticsInfoAMD statistics = {}; statistics.shaderStageMask = shaderStage; statistics.numPhysicalVgprs = RADV_NUM_PHYSICAL_VGPRS; - statistics.numPhysicalSgprs = ac_get_num_physical_sgprs(device->physical_device->rad_info.chip_class); + statistics.numPhysicalSgprs = ac_get_num_physical_sgprs(&device->physical_device->rad_info); statistics.numAvailableSgprs = statistics.numPhysicalSgprs; if (stage == MESA_SHADER_COMPUTE) { diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 86b87566b42..cbe393d03c7 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -5420,7 +5420,7 @@ static void si_calculate_max_simd_waves(struct si_shader *shader) unsigned lds_per_wave = 0; unsigned max_simd_waves; - max_simd_waves = ac_get_max_simd_waves(sscreen->info.family); + max_simd_waves = ac_get_max_wave64_per_simd(sscreen->info.family); /* Compute LDS usage for PS. */ switch (shader->selector->type) { @@ -5454,16 +5454,25 @@ static void si_calculate_max_simd_waves(struct si_shader *shader) if (conf->num_sgprs) { max_simd_waves = MIN2(max_simd_waves, - ac_get_num_physical_sgprs(sscreen->info.chip_class) / conf->num_sgprs); + ac_get_num_physical_sgprs(&sscreen->info) / conf->num_sgprs); } - if (conf->num_vgprs) - max_simd_waves = MIN2(max_simd_waves, 256 / conf->num_vgprs); + if (conf->num_vgprs) { + /* Always print wave limits as Wave64, so that we can compare + * Wave32 and Wave64 with shader-db fairly. */ + unsigned max_vgprs = ac_get_num_physical_vgprs(sscreen->info.chip_class, 64); + max_simd_waves = MIN2(max_simd_waves, max_vgprs / conf->num_vgprs); + } - /* LDS is 64KB per CU (4 SIMDs), which is 16KB per SIMD (usage above - * 16KB makes some SIMDs unoccupied). */ + /* LDS is 64KB per CU (4 SIMDs) on GFX6-9, which is 16KB per SIMD (usage above + * 16KB makes some SIMDs unoccupied). + * + * LDS is 128KB in WGP mode and 64KB in CU mode. Assume the WGP mode is used. + */ + unsigned max_lds_size = sscreen->info.chip_class >= GFX10 ? 128*1024 : 64*1024; + unsigned max_lds_per_simd = max_lds_size / 4; if (lds_per_wave) - max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave); + max_simd_waves = MIN2(max_simd_waves, max_lds_per_simd / lds_per_wave); shader->info.max_simd_waves = max_simd_waves; } @@ -7167,15 +7176,17 @@ int si_compile_tgsi_shader(struct si_screen *sscreen, */ if (sel->type == PIPE_SHADER_COMPUTE) { unsigned wave_size = sscreen->compute_wave_size; - unsigned max_vgprs = 256; - unsigned max_sgprs = sscreen->info.chip_class >= GFX8 ? 800 : 512; + unsigned max_vgprs = ac_get_num_physical_vgprs(sscreen->info.chip_class, + wave_size); + unsigned max_sgprs = ac_get_num_physical_sgprs(&sscreen->info); unsigned max_sgprs_per_wave = 128; - unsigned max_block_threads = si_get_max_workgroup_size(shader); - unsigned min_waves_per_cu = DIV_ROUND_UP(max_block_threads, wave_size); - unsigned min_waves_per_simd = DIV_ROUND_UP(min_waves_per_cu, 4); + unsigned simds_per_tg = 4; /* assuming WGP mode on gfx10 */ + unsigned threads_per_tg = si_get_max_workgroup_size(shader); + unsigned waves_per_tg = DIV_ROUND_UP(threads_per_tg, wave_size); + unsigned waves_per_simd = DIV_ROUND_UP(waves_per_tg, simds_per_tg); - max_vgprs = max_vgprs / min_waves_per_simd; - max_sgprs = MIN2(max_sgprs / min_waves_per_simd, max_sgprs_per_wave); + max_vgprs = max_vgprs / waves_per_simd; + max_sgprs = MIN2(max_sgprs / waves_per_simd, max_sgprs_per_wave); if (shader->config.num_sgprs > max_sgprs || shader->config.num_vgprs > max_vgprs) {