From 33c61eb2f10526c0b90c5ad376e5b0433aec296d Mon Sep 17 00:00:00 2001 From: Caio Marcelo de Oliveira Filho Date: Tue, 28 Apr 2020 14:03:47 -0700 Subject: [PATCH] iris: Implement ARB_compute_variable_group_size Reviewed-by: Kenneth Graunke Reviewed-by: Jordan Justen Part-of: --- docs/relnotes/new_features.txt | 1 + src/gallium/drivers/iris/iris_context.c | 1 + src/gallium/drivers/iris/iris_context.h | 3 +++ src/gallium/drivers/iris/iris_draw.c | 6 +++++ src/gallium/drivers/iris/iris_program.c | 21 +++++++++++++++ src/gallium/drivers/iris/iris_screen.c | 34 ++++++++++++++++++++++--- src/gallium/drivers/iris/iris_screen.h | 2 ++ src/gallium/drivers/iris/iris_state.c | 4 +++ 8 files changed, 69 insertions(+), 3 deletions(-) diff --git a/docs/relnotes/new_features.txt b/docs/relnotes/new_features.txt index e69de29bb2d..de4cd0a5317 100644 --- a/docs/relnotes/new_features.txt +++ b/docs/relnotes/new_features.txt @@ -0,0 +1 @@ +GL_ARB_compute_variable_group_size on Iris. diff --git a/src/gallium/drivers/iris/iris_context.c b/src/gallium/drivers/iris/iris_context.c index fcc71f739bb..dbe3ec09562 100644 --- a/src/gallium/drivers/iris/iris_context.c +++ b/src/gallium/drivers/iris/iris_context.c @@ -96,6 +96,7 @@ iris_lost_context_state(struct iris_batch *batch) ice->state.dirty = ~0ull; ice->state.current_hash_scale = 0; + memset(ice->state.last_block, 0, sizeof(ice->state.last_block)); memset(ice->state.last_grid, 0, sizeof(ice->state.last_grid)); batch->last_surface_base_address = ~0ull; batch->last_aux_map_state = 0; diff --git a/src/gallium/drivers/iris/iris_context.h b/src/gallium/drivers/iris/iris_context.h index 405de213012..4acf2390e2f 100644 --- a/src/gallium/drivers/iris/iris_context.h +++ b/src/gallium/drivers/iris/iris_context.h @@ -639,6 +639,9 @@ struct iris_context { bool window_space_position; + /** The last compute group size */ + uint32_t last_block[3]; + /** The last compute grid size */ uint32_t last_grid[3]; /** Reference to the BO containing the compute 
grid size */ diff --git a/src/gallium/drivers/iris/iris_draw.c b/src/gallium/drivers/iris/iris_draw.c index 76bf55ebee2..d59f6e59e50 100644 --- a/src/gallium/drivers/iris/iris_draw.c +++ b/src/gallium/drivers/iris/iris_draw.c @@ -355,6 +355,12 @@ iris_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info *grid) iris_update_compiled_compute_shader(ice); + if (memcmp(ice->state.last_block, grid->block, sizeof(grid->block)) != 0) { + memcpy(ice->state.last_block, grid->block, sizeof(grid->block)); + ice->state.dirty |= IRIS_DIRTY_CONSTANTS_CS; + ice->state.shaders[MESA_SHADER_COMPUTE].sysvals_need_upload = true; + } + iris_update_grid_size_resource(ice, grid); iris_binder_reserve_compute(ice); diff --git a/src/gallium/drivers/iris/iris_program.c b/src/gallium/drivers/iris/iris_program.c index 8cf15c6ced9..b9131b872da 100644 --- a/src/gallium/drivers/iris/iris_program.c +++ b/src/gallium/drivers/iris/iris_program.c @@ -393,6 +393,7 @@ iris_setup_uniforms(const struct brw_compiler *compiler, unsigned patch_vert_idx = -1; unsigned ucp_idx[IRIS_MAX_CLIP_PLANES]; unsigned img_idx[PIPE_MAX_SHADER_IMAGES]; + unsigned variable_group_size_idx = -1; memset(ucp_idx, -1, sizeof(ucp_idx)); memset(img_idx, -1, sizeof(img_idx)); @@ -516,6 +517,21 @@ iris_setup_uniforms(const struct brw_compiler *compiler, nir_intrinsic_base(intrin) * 16)); break; } + case nir_intrinsic_load_local_group_size: { + assert(nir->info.cs.local_size_variable); + if (variable_group_size_idx == -1) { + variable_group_size_idx = num_system_values; + num_system_values += 3; + for (int i = 0; i < 3; i++) { + system_values[variable_group_size_idx + i] = + BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X + i; + } + } + + b.cursor = nir_before_instr(instr); + offset = nir_imm_int(&b, variable_group_size_idx * sizeof(uint32_t)); + break; + } default: continue; } @@ -1947,6 +1963,11 @@ iris_compile_cs(struct iris_context *ice, nir_shader *nir = nir_shader_clone(mem_ctx, ish->nir); + if 
(nir->info.cs.local_size_variable) { + nir->info.cs.max_variable_local_size = + iris_get_max_var_invocations(screen); + } + NIR_PASS_V(nir, brw_nir_lower_cs_intrinsics); iris_setup_uniforms(compiler, mem_ctx, nir, prog_data, &system_values, diff --git a/src/gallium/drivers/iris/iris_screen.c b/src/gallium/drivers/iris/iris_screen.c index ae486adc945..94ee60eb016 100644 --- a/src/gallium/drivers/iris/iris_screen.c +++ b/src/gallium/drivers/iris/iris_screen.c @@ -443,6 +443,32 @@ iris_get_shader_param(struct pipe_screen *pscreen, } } +static unsigned +get_max_threads(const struct gen_device_info *devinfo) +{ + /* Limit max_threads to 64 for the GPGPU_WALKER command. */ + return MIN2(64, devinfo->max_cs_threads); +} + +uint32_t +iris_get_max_var_invocations(const struct iris_screen *screen) +{ + const unsigned max_threads = get_max_threads(&screen->devinfo); + + /* Constants used for ARB_compute_variable_group_size. The compiler will + * use the maximum to decide which SIMDs can be used. If we cap this like + * max_invocations, that would prevent SIMD8 / SIMD16 from being considered. + * + * TODO: To avoid the trade off above between having the lower maximum + * vs. always using SIMD32, keep all three shader variants (for each SIMD) + * and select a suitable one at dispatch time. + */ + const uint32_t max_var_invocations = + (max_threads >= 64 ? 8 : (max_threads >= 32 ? 
16 : 32)) * max_threads; + assert(max_var_invocations >= 512); + return max_var_invocations; +} + static int iris_get_compute_param(struct pipe_screen *pscreen, enum pipe_shader_ir ir_type, @@ -450,9 +476,8 @@ iris_get_compute_param(struct pipe_screen *pscreen, void *ret) { struct iris_screen *screen = (struct iris_screen *)pscreen; - const struct gen_device_info *devinfo = &screen->devinfo; - const unsigned max_threads = MIN2(64, devinfo->max_cs_threads); + const unsigned max_threads = get_max_threads(&screen->devinfo); const uint32_t max_invocations = 32 * max_threads; #define RET(x) do { \ @@ -494,13 +519,16 @@ iris_get_compute_param(struct pipe_screen *pscreen, case PIPE_COMPUTE_CAP_SUBGROUP_SIZE: RET((uint32_t []) { BRW_SUBGROUP_SIZE }); + case PIPE_COMPUTE_CAP_MAX_VARIABLE_THREADS_PER_BLOCK: + /* MaxComputeVariableGroupInvocations */ + RET((uint64_t []) { iris_get_max_var_invocations(screen) }); + case PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE: case PIPE_COMPUTE_CAP_MAX_CLOCK_FREQUENCY: case PIPE_COMPUTE_CAP_MAX_COMPUTE_UNITS: case PIPE_COMPUTE_CAP_MAX_GLOBAL_SIZE: case PIPE_COMPUTE_CAP_MAX_PRIVATE_SIZE: case PIPE_COMPUTE_CAP_MAX_INPUT_SIZE: - case PIPE_COMPUTE_CAP_MAX_VARIABLE_THREADS_PER_BLOCK: // XXX: I think these are for Clover... 
return 0; diff --git a/src/gallium/drivers/iris/iris_screen.h b/src/gallium/drivers/iris/iris_screen.h index 6d5d1c58074..5ec58f6c216 100644 --- a/src/gallium/drivers/iris/iris_screen.h +++ b/src/gallium/drivers/iris/iris_screen.h @@ -227,4 +227,6 @@ iris_is_format_supported(struct pipe_screen *pscreen, void iris_disk_cache_init(struct iris_screen *screen); +uint32_t iris_get_max_var_invocations(const struct iris_screen *screen); + #endif diff --git a/src/gallium/drivers/iris/iris_state.c b/src/gallium/drivers/iris/iris_state.c index a5b825f1baf..ee643c2ce84 100644 --- a/src/gallium/drivers/iris/iris_state.c +++ b/src/gallium/drivers/iris/iris_state.c @@ -3214,6 +3214,10 @@ upload_sysvals(struct iris_context *ice, value = fui(ice->state.default_inner_level[0]); } else if (sysval == BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_Y) { value = fui(ice->state.default_inner_level[1]); + } else if (sysval >= BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X && + sysval <= BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_Z) { + unsigned i = sysval - BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X; + value = ice->state.last_block[i]; } else { assert(!"unhandled system value"); } -- 2.30.2