From e510c5ee3b8c7f0d1d9afff28760469f43c24c02 Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Fri, 12 Jul 2019 12:17:11 +0200 Subject: [PATCH] ac: import ac_get_compute_resource_limits() from RadeonSI Signed-off-by: Samuel Pitoiset Reviewed-by: Bas Nieuwenhuizen --- src/amd/common/ac_gpu_info.c | 32 +++++++++++++++++ src/amd/common/ac_gpu_info.h | 4 +++ src/gallium/drivers/radeonsi/si_compute.c | 35 ++----------------- .../radeonsi/si_compute_prim_discard.c | 6 ++-- src/gallium/drivers/radeonsi/si_pipe.h | 4 --- 5 files changed, 42 insertions(+), 39 deletions(-) diff --git a/src/amd/common/ac_gpu_info.c b/src/amd/common/ac_gpu_info.c index 596a9ebe508..a501d840b25 100644 --- a/src/amd/common/ac_gpu_info.c +++ b/src/amd/common/ac_gpu_info.c @@ -895,3 +895,35 @@ ac_get_harvested_configs(struct radeon_info *info, } } } + +unsigned ac_get_compute_resource_limits(struct radeon_info *info, + unsigned waves_per_threadgroup, + unsigned max_waves_per_sh, + unsigned threadgroups_per_cu) +{ + unsigned compute_resource_limits = + S_00B854_SIMD_DEST_CNTL(waves_per_threadgroup % 4 == 0); + + if (info->chip_class >= GFX7) { + unsigned num_cu_per_se = info->num_good_compute_units / + info->max_se; + + /* Force even distribution on all SIMDs in CU if the workgroup + * size is 64. This has shown some good improvements if # of CUs + * per SE is not a multiple of 4. + */ + if (num_cu_per_se % 4 && waves_per_threadgroup == 1) + compute_resource_limits |= S_00B854_FORCE_SIMD_DIST(1); + + assert(threadgroups_per_cu >= 1 && threadgroups_per_cu <= 8); + compute_resource_limits |= S_00B854_WAVES_PER_SH(max_waves_per_sh) | + S_00B854_CU_GROUP_COUNT(threadgroups_per_cu - 1); + } else { + /* GFX6 */ + if (max_waves_per_sh) { + unsigned limit_div16 = DIV_ROUND_UP(max_waves_per_sh, 16); + compute_resource_limits |= S_00B854_WAVES_PER_SH_SI(limit_div16); + } + } + return compute_resource_limits; +} diff --git a/src/amd/common/ac_gpu_info.h b/src/amd/common/ac_gpu_info.h index c42548f8352..3ec3e44d665 100644 --- a/src/amd/common/ac_gpu_info.h +++ b/src/amd/common/ac_gpu_info.h @@ -167,6 +167,10 @@ void ac_get_harvested_configs(struct radeon_info *info, unsigned raster_config, unsigned *cik_raster_config_1_p, unsigned *raster_config_se); +unsigned ac_get_compute_resource_limits(struct radeon_info *info, + unsigned waves_per_threadgroup, + unsigned max_waves_per_sh, + unsigned threadgroups_per_cu); static inline unsigned ac_get_max_simd_waves(enum radeon_family family) { diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c index 0989181aba4..07b1293049f 100644 --- a/src/gallium/drivers/radeonsi/si_compute.c +++ b/src/gallium/drivers/radeonsi/si_compute.c @@ -772,38 +772,6 @@ static void si_setup_tgsi_user_data(struct si_context *sctx, } } -unsigned si_get_compute_resource_limits(struct si_screen *sscreen, - unsigned waves_per_threadgroup, - unsigned max_waves_per_sh, - unsigned threadgroups_per_cu) -{ - unsigned compute_resource_limits = - S_00B854_SIMD_DEST_CNTL(waves_per_threadgroup % 4 == 0); - - if (sscreen->info.chip_class >= GFX7) { - unsigned num_cu_per_se = sscreen->info.num_good_compute_units / - sscreen->info.max_se; - - /* Force even distribution on all SIMDs in CU if the workgroup - * size is 64. This has shown some good improvements if # of CUs - * per SE is not a multiple of 4. - */ - if (num_cu_per_se % 4 && waves_per_threadgroup == 1) - compute_resource_limits |= S_00B854_FORCE_SIMD_DIST(1); - - assert(threadgroups_per_cu >= 1 && threadgroups_per_cu <= 8); - compute_resource_limits |= S_00B854_WAVES_PER_SH(max_waves_per_sh) | - S_00B854_CU_GROUP_COUNT(threadgroups_per_cu - 1); - } else { - /* GFX6 */ - if (max_waves_per_sh) { - unsigned limit_div16 = DIV_ROUND_UP(max_waves_per_sh, 16); - compute_resource_limits |= S_00B854_WAVES_PER_SH_SI(limit_div16); - } - } - return compute_resource_limits; -} - static void si_emit_dispatch_packets(struct si_context *sctx, const struct pipe_grid_info *info) { @@ -820,7 +788,8 @@ static void si_emit_dispatch_packets(struct si_context *sctx, threadgroups_per_cu = 2; radeon_set_sh_reg(cs, R_00B854_COMPUTE_RESOURCE_LIMITS, - si_get_compute_resource_limits(sscreen, waves_per_threadgroup, + ac_get_compute_resource_limits(&sscreen->info, + waves_per_threadgroup, sctx->cs_max_waves_per_sh, threadgroups_per_cu)); diff --git a/src/gallium/drivers/radeonsi/si_compute_prim_discard.c b/src/gallium/drivers/radeonsi/si_compute_prim_discard.c index e16c0791a27..ad33c8de1c5 100644 --- a/src/gallium/drivers/radeonsi/si_compute_prim_discard.c +++ b/src/gallium/drivers/radeonsi/si_compute_prim_discard.c @@ -1426,8 +1426,10 @@ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx, S_00B84C_LDS_SIZE(shader->config.lds_size)); radeon_set_sh_reg(cs, R_00B854_COMPUTE_RESOURCE_LIMITS, - si_get_compute_resource_limits(sctx->screen, WAVES_PER_TG, - MAX_WAVES_PER_SH, THREADGROUPS_PER_CU)); + ac_get_compute_resource_limits(&sctx->screen->info, + WAVES_PER_TG, + MAX_WAVES_PER_SH, + THREADGROUPS_PER_CU)); sctx->compute_ib_last_shader = shader; } diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index cd8fb5d5df4..96a7fa4ebf2 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -1396,10 +1396,6 @@ unsigned si_end_counter(struct si_screen *sscreen, unsigned type, /* si_compute.c */ void si_emit_initial_compute_regs(struct si_context *sctx, struct radeon_cmdbuf *cs); -unsigned si_get_compute_resource_limits(struct si_screen *sscreen, - unsigned waves_per_threadgroup, - unsigned max_waves_per_sh, - unsigned threadgroups_per_cu); void si_init_compute_functions(struct si_context *sctx); /* si_compute_prim_discard.c */ -- 2.30.2