radeonsi/gfx10: fix wave occupancy computations
authorMarek Olšák <marek.olsak@amd.com>
Wed, 28 Aug 2019 21:38:50 +0000 (17:38 -0400)
committerMarek Olšák <marek.olsak@amd.com>
Tue, 10 Sep 2019 03:43:03 +0000 (23:43 -0400)
Cc: 19.2 <mesa-stable@lists.freedesktop.org>
Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
src/amd/common/ac_gpu_info.h
src/amd/vulkan/radv_device.c
src/amd/vulkan/radv_shader.c
src/gallium/drivers/radeonsi/si_shader.c

index c850da22d4ec053b1a5f3b1ff79b717bf16d6360..9986d58202a662bf969fc840a6f2c49524f344aa 100644 (file)
@@ -187,7 +187,7 @@ unsigned ac_get_compute_resource_limits(struct radeon_info *info,
                                        unsigned max_waves_per_sh,
                                        unsigned threadgroups_per_cu);
 
-static inline unsigned ac_get_max_simd_waves(enum radeon_family family)
+static inline unsigned ac_get_max_wave64_per_simd(enum radeon_family family)
 {
 
        switch (family) {
@@ -202,10 +202,26 @@ static inline unsigned ac_get_max_simd_waves(enum radeon_family family)
        }
 }
 
+static inline unsigned ac_get_num_physical_vgprs(enum chip_class chip_class,
+                                                unsigned wave_size)
+{
+       /* The number is per SIMD. */
+       if (chip_class >= GFX10)
+               return wave_size == 32 ? 1024 : 512;
+       else
+               return 256;
+}
+
 static inline uint32_t
-ac_get_num_physical_sgprs(enum chip_class chip_class)
+ac_get_num_physical_sgprs(const struct radeon_info *info)
 {
-       return chip_class >= GFX8 ? 800 : 512;
+       /* The number is per SIMD. There is enough SGPRs for the maximum number
+        * of Wave32, which is double the number for Wave64.
+        */
+       if (info->chip_class >= GFX10)
+               return 128 * ac_get_max_wave64_per_simd(info->family) * 2;
+
+       return info->chip_class >= GFX8 ? 800 : 512;
 }
 
 #ifdef __cplusplus
index d0de471594731040a91bb27e3c08e17412ff1df5..beeec37e54a3c84c029f47056ccea9775773d780 100644 (file)
@@ -1274,7 +1274,7 @@ void radv_GetPhysicalDeviceProperties2(
 
                        /* SGPR. */
                        properties->sgprsPerSimd =
-                               ac_get_num_physical_sgprs(pdevice->rad_info.chip_class);
+                               ac_get_num_physical_sgprs(&pdevice->rad_info);
                        properties->minSgprAllocation =
                                pdevice->rad_info.chip_class >= GFX8 ? 16 : 8;
                        properties->maxSgprAllocation =
index c99e2615fca054464a1f9b329e83f5aaf3675caf..082bdde04abffd410739299b10a255ac05199f72 100644 (file)
@@ -1249,7 +1249,7 @@ radv_get_max_waves(struct radv_device *device,
        unsigned max_simd_waves;
        unsigned lds_per_wave = 0;
 
-       max_simd_waves = ac_get_max_simd_waves(device->physical_device->rad_info.family);
+       max_simd_waves = ac_get_max_wave64_per_simd(device->physical_device->rad_info.family);
 
        if (stage == MESA_SHADER_FRAGMENT) {
                lds_per_wave = conf->lds_size * lds_increment +
@@ -1265,7 +1265,8 @@ radv_get_max_waves(struct radv_device *device,
        if (conf->num_sgprs)
                max_simd_waves =
                        MIN2(max_simd_waves,
-                            ac_get_num_physical_sgprs(chip_class) / conf->num_sgprs);
+                            ac_get_num_physical_sgprs(&device->physical_device->rad_info) /
+                            conf->num_sgprs);
 
        if (conf->num_vgprs)
                max_simd_waves =
@@ -1362,7 +1363,7 @@ radv_GetShaderInfoAMD(VkDevice _device,
                        VkShaderStatisticsInfoAMD statistics = {};
                        statistics.shaderStageMask = shaderStage;
                        statistics.numPhysicalVgprs = RADV_NUM_PHYSICAL_VGPRS;
-                       statistics.numPhysicalSgprs = ac_get_num_physical_sgprs(device->physical_device->rad_info.chip_class);
+                       statistics.numPhysicalSgprs = ac_get_num_physical_sgprs(&device->physical_device->rad_info);
                        statistics.numAvailableSgprs = statistics.numPhysicalSgprs;
 
                        if (stage == MESA_SHADER_COMPUTE) {
index 86b87566b42de54ab98c17edc21083601f6e9f5f..cbe393d03c74ef11db40ff727a5bd9df6a6f5a14 100644 (file)
@@ -5420,7 +5420,7 @@ static void si_calculate_max_simd_waves(struct si_shader *shader)
        unsigned lds_per_wave = 0;
        unsigned max_simd_waves;
 
-       max_simd_waves = ac_get_max_simd_waves(sscreen->info.family);
+       max_simd_waves = ac_get_max_wave64_per_simd(sscreen->info.family);
 
        /* Compute LDS usage for PS. */
        switch (shader->selector->type) {
@@ -5454,16 +5454,25 @@ static void si_calculate_max_simd_waves(struct si_shader *shader)
        if (conf->num_sgprs) {
                max_simd_waves =
                        MIN2(max_simd_waves,
-                            ac_get_num_physical_sgprs(sscreen->info.chip_class) / conf->num_sgprs);
+                            ac_get_num_physical_sgprs(&sscreen->info) / conf->num_sgprs);
        }
 
-       if (conf->num_vgprs)
-               max_simd_waves = MIN2(max_simd_waves, 256 / conf->num_vgprs);
+       if (conf->num_vgprs) {
+               /* Always print wave limits as Wave64, so that we can compare
+                * Wave32 and Wave64 with shader-db fairly. */
+               unsigned max_vgprs = ac_get_num_physical_vgprs(sscreen->info.chip_class, 64);
+               max_simd_waves = MIN2(max_simd_waves, max_vgprs / conf->num_vgprs);
+       }
 
-       /* LDS is 64KB per CU (4 SIMDs), which is 16KB per SIMD (usage above
-        * 16KB makes some SIMDs unoccupied). */
+       /* LDS is 64KB per CU (4 SIMDs) on GFX6-9, which is 16KB per SIMD (usage above
+        * 16KB makes some SIMDs unoccupied).
+        *
+        * LDS is 128KB in WGP mode and 64KB in CU mode. Assume the WGP mode is used.
+        */
+       unsigned max_lds_size = sscreen->info.chip_class >= GFX10 ? 128*1024 : 64*1024;
+       unsigned max_lds_per_simd = max_lds_size / 4;
        if (lds_per_wave)
-               max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave);
+               max_simd_waves = MIN2(max_simd_waves, max_lds_per_simd / lds_per_wave);
 
        shader->info.max_simd_waves = max_simd_waves;
 }
@@ -7167,15 +7176,17 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
         */
        if (sel->type == PIPE_SHADER_COMPUTE) {
                unsigned wave_size = sscreen->compute_wave_size;
-               unsigned max_vgprs = 256;
-               unsigned max_sgprs = sscreen->info.chip_class >= GFX8 ? 800 : 512;
+               unsigned max_vgprs = ac_get_num_physical_vgprs(sscreen->info.chip_class,
+                                                              wave_size);
+               unsigned max_sgprs = ac_get_num_physical_sgprs(&sscreen->info);
                unsigned max_sgprs_per_wave = 128;
-               unsigned max_block_threads = si_get_max_workgroup_size(shader);
-               unsigned min_waves_per_cu = DIV_ROUND_UP(max_block_threads, wave_size);
-               unsigned min_waves_per_simd = DIV_ROUND_UP(min_waves_per_cu, 4);
+               unsigned simds_per_tg = 4; /* assuming WGP mode on gfx10 */
+               unsigned threads_per_tg = si_get_max_workgroup_size(shader);
+               unsigned waves_per_tg = DIV_ROUND_UP(threads_per_tg, wave_size);
+               unsigned waves_per_simd = DIV_ROUND_UP(waves_per_tg, simds_per_tg);
 
-               max_vgprs = max_vgprs / min_waves_per_simd;
-               max_sgprs = MIN2(max_sgprs / min_waves_per_simd, max_sgprs_per_wave);
+               max_vgprs = max_vgprs / waves_per_simd;
+               max_sgprs = MIN2(max_sgprs / waves_per_simd, max_sgprs_per_wave);
 
                if (shader->config.num_sgprs > max_sgprs ||
                    shader->config.num_vgprs > max_vgprs) {