From 77c81164bc1cd9ec98b32c40753f590791450434 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Nicolai=20H=C3=A4hnle?= Date: Fri, 9 Sep 2016 10:08:11 +0200 Subject: [PATCH] radeonsi: support ARB_compute_variable_group_size MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Not sure if it's possible to avoid programming the block size twice (once for the userdata and once for the dispatch). Reviewed-by: Edward O'Callaghan Reviewed-by: Marek Olšák --- docs/features.txt | 2 +- docs/relnotes/12.1.0.html | 2 +- src/gallium/drivers/radeon/r600_pipe_common.c | 10 ++++- src/gallium/drivers/radeon/r600_pipe_common.h | 2 + src/gallium/drivers/radeonsi/si_compute.c | 10 ++++- src/gallium/drivers/radeonsi/si_shader.c | 44 +++++++++++++------ src/gallium/drivers/radeonsi/si_shader.h | 4 +- 7 files changed, 55 insertions(+), 19 deletions(-) diff --git a/docs/features.txt b/docs/features.txt index e91ef6caa50..533971fe1cb 100644 --- a/docs/features.txt +++ b/docs/features.txt @@ -279,7 +279,7 @@ Khronos, ARB, and OES extensions that are not part of any OpenGL or OpenGL ES ve GL_ARB_bindless_texture started (airlied) GL_ARB_cl_event not started - GL_ARB_compute_variable_group_size DONE (nvc0) + GL_ARB_compute_variable_group_size DONE (nvc0, radeonsi) GL_ARB_ES3_2_compatibility DONE (i965/gen8+) GL_ARB_fragment_shader_interlock not started GL_ARB_gl_spirv not started diff --git a/docs/relnotes/12.1.0.html b/docs/relnotes/12.1.0.html index 9ddd99c8382..2e4b669d8ba 100644 --- a/docs/relnotes/12.1.0.html +++ b/docs/relnotes/12.1.0.html @@ -49,7 +49,7 @@ Note: some of the new features are only available with certain drivers.
  • GL_ARB_ES3_1_compatibility on i965
  • GL_ARB_ES3_2_compatibility on i965/gen8+
  • GL_ARB_clear_texture on r600, radeonsi
- • GL_ARB_compute_variable_group_size on nvc0
+ • GL_ARB_compute_variable_group_size on nvc0, radeonsi
  • GL_ARB_cull_distance on radeonsi
  • GL_ARB_enhanced_layouts on i965
  • GL_ARB_indirect_parameters on radeonsi
  • diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c index 44863eeca65..3dbcbc67247 100644 --- a/src/gallium/drivers/radeon/r600_pipe_common.c +++ b/src/gallium/drivers/radeon/r600_pipe_common.c @@ -1037,7 +1037,15 @@ static int r600_get_compute_param(struct pipe_screen *screen, } return sizeof(uint32_t); case PIPE_COMPUTE_CAP_MAX_VARIABLE_THREADS_PER_BLOCK: - return 0; + if (ret) { + uint64_t *max_variable_threads_per_block = ret; + if (rscreen->chip_class >= SI && HAVE_LLVM >= 0x309 && + ir_type == PIPE_SHADER_IR_TGSI) + *max_variable_threads_per_block = SI_MAX_VARIABLE_THREADS_PER_BLOCK; + else + *max_variable_threads_per_block = 0; + } + return sizeof(uint64_t); } fprintf(stderr, "unknown PIPE_COMPUTE_CAP %d\n", param); diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h index 54991e87f4c..290b228b73f 100644 --- a/src/gallium/drivers/radeon/r600_pipe_common.h +++ b/src/gallium/drivers/radeon/r600_pipe_common.h @@ -106,6 +106,8 @@ #define R600_MAP_BUFFER_ALIGNMENT 64 #define R600_MAX_VIEWPORTS 16 +#define SI_MAX_VARIABLE_THREADS_PER_BLOCK 1024 + enum r600_coherency { R600_COHERENCY_NONE, /* no cache flushes needed */ R600_COHERENCY_SHADER, diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c index 1d1df2f6b70..e59bafeb9b6 100644 --- a/src/gallium/drivers/radeonsi/si_compute.c +++ b/src/gallium/drivers/radeonsi/si_compute.c @@ -601,11 +601,19 @@ static void si_setup_tgsi_grid(struct si_context *sctx, radeon_emit(cs, 0); } } else { + struct si_compute *program = sctx->cs_shader_state.program; + bool variable_group_size = + program->shader.selector->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0; - radeon_set_sh_reg_seq(cs, grid_size_reg, 3); + radeon_set_sh_reg_seq(cs, grid_size_reg, variable_group_size ? 
6 : 3); radeon_emit(cs, info->grid[0]); radeon_emit(cs, info->grid[1]); radeon_emit(cs, info->grid[2]); + if (variable_group_size) { + radeon_emit(cs, info->block[0]); + radeon_emit(cs, info->block[1]); + radeon_emit(cs, info->block[2]); + } } } diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index ff51c8bd799..49d41216501 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -1770,16 +1770,21 @@ static void declare_system_value( LLVMValueRef values[3]; unsigned i; unsigned *properties = ctx->shader->selector->info.properties; - unsigned sizes[3] = { - properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH], - properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT], - properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH] - }; - for (i = 0; i < 3; ++i) - values[i] = lp_build_const_int32(gallivm, sizes[i]); + if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] != 0) { + unsigned sizes[3] = { + properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH], + properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT], + properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH] + }; + + for (i = 0; i < 3; ++i) + values[i] = lp_build_const_int32(gallivm, sizes[i]); - value = lp_build_gather_values(gallivm, values, 3); + value = lp_build_gather_values(gallivm, values, 3); + } else { + value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_BLOCK_SIZE); + } break; } @@ -5680,6 +5685,7 @@ static void create_function(struct si_shader_context *ctx) case PIPE_SHADER_COMPUTE: params[SI_PARAM_GRID_SIZE] = v3i32; + params[SI_PARAM_BLOCK_SIZE] = v3i32; params[SI_PARAM_BLOCK_ID] = v3i32; last_sgpr = SI_PARAM_BLOCK_ID; @@ -5716,7 +5722,12 @@ static void create_function(struct si_shader_context *ctx) properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] * properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]; - assert(max_work_group_size); + if (!max_work_group_size) { + /* This is a variable group size compute shader, + * compile it for the maximum possible group 
size. + */ + max_work_group_size = SI_MAX_VARIABLE_THREADS_PER_BLOCK; + } radeon_llvm_add_attribute(ctx->radeon_bld.main_fn, "amdgpu-max-work-group-size", @@ -6653,11 +6664,16 @@ int si_compile_tgsi_shader(struct si_screen *sscreen, unsigned max_vgprs = 256; unsigned max_sgprs = sscreen->b.chip_class >= VI ? 800 : 512; unsigned max_sgprs_per_wave = 128; - unsigned min_waves_per_cu = - DIV_ROUND_UP(props[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] * - props[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] * - props[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH], - wave_size); + unsigned max_block_threads; + + if (props[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH]) + max_block_threads = props[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] * + props[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] * + props[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]; + else + max_block_threads = SI_MAX_VARIABLE_THREADS_PER_BLOCK; + + unsigned min_waves_per_cu = DIV_ROUND_UP(max_block_threads, wave_size); unsigned min_waves_per_simd = DIV_ROUND_UP(min_waves_per_cu, 4); max_vgprs = max_vgprs / min_waves_per_simd; diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index 67cb67d2934..f2618acf198 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -129,7 +129,8 @@ enum { /* CS only */ SI_SGPR_GRID_SIZE = SI_NUM_RESOURCE_SGPRS, - SI_CS_NUM_USER_SGPR = SI_SGPR_GRID_SIZE + 3 + SI_SGPR_BLOCK_SIZE = SI_SGPR_GRID_SIZE + 3, + SI_CS_NUM_USER_SGPR = SI_SGPR_BLOCK_SIZE + 3 }; /* LLVM function parameter indices */ @@ -219,6 +220,7 @@ enum { /* CS only parameters */ SI_PARAM_GRID_SIZE = SI_NUM_RESOURCE_PARAMS, + SI_PARAM_BLOCK_SIZE, SI_PARAM_BLOCK_ID, SI_PARAM_THREAD_ID, -- 2.30.2