From 5664bd6db383984192cf362884dd9fb17d8ed3a3 Mon Sep 17 00:00:00 2001 From: Plamena Manolova Date: Mon, 12 Nov 2018 16:29:51 +0200 Subject: [PATCH] i965: Implement ARB_compute_variable_group_size This patch adds the implementation of ARB_compute_variable_group_size for i965. We do this by storing the local group size in a push constant. Additional changes made by Caio Marcelo de Oliveira Filho. Signed-off-by: Plamena Manolova Reviewed-by: Caio Marcelo de Oliveira Filho Reviewed-by: Jordan Justen Reviewed-by: Paulo Zanoni Part-of: --- docs/features.txt | 2 +- docs/relnotes/new_features.txt | 1 + src/mesa/drivers/dri/i965/brw_compute.c | 18 ++++++++++ src/mesa/drivers/dri/i965/brw_context.c | 18 ++++++++++ src/mesa/drivers/dri/i965/brw_context.h | 5 +++ src/mesa/drivers/dri/i965/brw_cs.c | 33 +++++++++++++++++++ src/mesa/drivers/dri/i965/brw_cs.h | 3 ++ .../drivers/dri/i965/gen6_constant_state.c | 12 +++++-- src/mesa/drivers/dri/i965/genX_state_upload.c | 19 +++++++---- src/mesa/drivers/dri/i965/intel_extensions.c | 1 + 10 files changed, 103 insertions(+), 9 deletions(-) diff --git a/docs/features.txt b/docs/features.txt index 4db525b9da8..6394245a6df 100644 --- a/docs/features.txt +++ b/docs/features.txt @@ -299,7 +299,7 @@ Khronos, ARB, and OES extensions that are not part of any OpenGL or OpenGL ES ve GL_ARB_bindless_texture DONE (nvc0, radeonsi) GL_ARB_cl_event not started - GL_ARB_compute_variable_group_size DONE (nvc0, radeonsi) + GL_ARB_compute_variable_group_size DONE (i965/gen7+, nvc0, radeonsi) GL_ARB_ES3_2_compatibility DONE (i965/gen8+, radeonsi, virgl) GL_ARB_fragment_shader_interlock DONE (i965) GL_ARB_gpu_shader_int64 DONE (i965/gen8+, nvc0, radeonsi, softpipe, llvmpipe) diff --git a/docs/relnotes/new_features.txt b/docs/relnotes/new_features.txt index e31b7e68385..41631399737 100644 --- a/docs/relnotes/new_features.txt +++ b/docs/relnotes/new_features.txt @@ -1,3 +1,4 @@ +GL_ARB_compute_variable_group_size on i965. 
GL_EXT_texture_shadow_lod on radeonsi. GL_NV_copy_image on all gallium drivers. VK_KHR_shader_non_semantic_info on Intel, RADV. diff --git a/src/mesa/drivers/dri/i965/brw_compute.c b/src/mesa/drivers/dri/i965/brw_compute.c index 09df08d7bc0..852db6dd64b 100644 --- a/src/mesa/drivers/dri/i965/brw_compute.c +++ b/src/mesa/drivers/dri/i965/brw_compute.c @@ -101,6 +101,7 @@ brw_dispatch_compute(struct gl_context *ctx, const GLuint *num_groups) { brw->compute.num_work_groups_bo = NULL; brw->compute.num_work_groups = num_groups; + brw->compute.group_size = NULL; ctx->NewDriverState |= BRW_NEW_CS_WORK_GROUPS; brw_dispatch_compute_common(ctx); @@ -120,6 +121,22 @@ brw_dispatch_compute_indirect(struct gl_context *ctx, GLintptr indirect) brw->compute.num_work_groups_bo = bo; brw->compute.num_work_groups_offset = indirect; brw->compute.num_work_groups = indirect_group_counts; + brw->compute.group_size = NULL; + ctx->NewDriverState |= BRW_NEW_CS_WORK_GROUPS; + + brw_dispatch_compute_common(ctx); +} + +static void +brw_dispatch_compute_group_size(struct gl_context *ctx, + const GLuint *num_groups, + const GLuint *group_size) +{ + struct brw_context *brw = brw_context(ctx); + + brw->compute.num_work_groups_bo = NULL; + brw->compute.num_work_groups = num_groups; + brw->compute.group_size = group_size; ctx->NewDriverState |= BRW_NEW_CS_WORK_GROUPS; brw_dispatch_compute_common(ctx); @@ -130,4 +147,5 @@ brw_init_compute_functions(struct dd_function_table *functions) { functions->DispatchCompute = brw_dispatch_compute; functions->DispatchComputeIndirect = brw_dispatch_compute_indirect; + functions->DispatchComputeGroupSize = brw_dispatch_compute_group_size; } diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c index 5082657deac..e5b66a78d0c 100644 --- a/src/mesa/drivers/dri/i965/brw_context.c +++ b/src/mesa/drivers/dri/i965/brw_context.c @@ -843,6 +843,24 @@ brw_initialize_cs_context_constants(struct brw_context *brw) 
ctx->Const.MaxComputeWorkGroupSize[2] = max_invocations; ctx->Const.MaxComputeWorkGroupInvocations = max_invocations; ctx->Const.MaxComputeSharedMemorySize = 64 * 1024; + + /* Constants used for ARB_compute_variable_group_size. The compiler will + * use the maximum to decide which SIMDs can be used. If we cap this at + * max_invocations, that would keep SIMD8 / SIMD16 from being considered. + * + * TODO: To avoid the trade-off above between having the lower maximum + * vs. always using SIMD32, keep all three shader variants (for each SIMD) + * and select a suitable one at dispatch time. + */ + if (devinfo->gen >= 7) { + const uint32_t max_var_invocations = + (max_threads >= 64 ? 8 : (max_threads >= 32 ? 16 : 32)) * max_threads; + assert(max_var_invocations >= 512); + ctx->Const.MaxComputeVariableGroupSize[0] = max_var_invocations; + ctx->Const.MaxComputeVariableGroupSize[1] = max_var_invocations; + ctx->Const.MaxComputeVariableGroupSize[2] = max_var_invocations; + ctx->Const.MaxComputeVariableGroupInvocations = max_var_invocations; + } } /** diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h index 36dc53cc9d5..6f731f23712 100644 --- a/src/mesa/drivers/dri/i965/brw_context.h +++ b/src/mesa/drivers/dri/i965/brw_context.h @@ -925,6 +925,11 @@ struct brw_context struct brw_bo *num_work_groups_bo; GLintptr num_work_groups_offset; const GLuint *num_work_groups; + /** + * This is only used alongside ARB_compute_variable_group_size when the + * local work group size is variable, otherwise it's NULL. 
+ */ + const GLuint *group_size; } compute; struct { diff --git a/src/mesa/drivers/dri/i965/brw_cs.c b/src/mesa/drivers/dri/i965/brw_cs.c index cf9340f15b6..5b4ff456a0d 100644 --- a/src/mesa/drivers/dri/i965/brw_cs.c +++ b/src/mesa/drivers/dri/i965/brw_cs.c @@ -32,6 +32,27 @@ #include "brw_program.h" #include "compiler/glsl/ir_uniform.h" +uint32_t +brw_cs_group_size(const struct brw_context *brw) +{ + assert(brw->cs.base.prog_data); + struct brw_cs_prog_data *cs_prog_data = + brw_cs_prog_data(brw->cs.base.prog_data); + + if (brw->compute.group_size) { + /* With ARB_compute_variable_group_size the group size is set at + * dispatch time, so we can't use the one provided by the compiler. + */ + return brw->compute.group_size[0] * + brw->compute.group_size[1] * + brw->compute.group_size[2]; + } else { + return cs_prog_data->local_size[0] * + cs_prog_data->local_size[1] * + cs_prog_data->local_size[2]; + } +} + static void assign_cs_binding_table_offsets(const struct gen_device_info *devinfo, const struct gl_program *prog, @@ -58,6 +79,7 @@ brw_codegen_cs_prog(struct brw_context *brw, struct brw_cs_prog_data prog_data; bool start_busy = false; double start_time = 0; + struct gl_context *gl_ctx = &brw->ctx; nir_shader *nir = nir_shader_clone(mem_ctx, cp->program.nir); memset(&prog_data, 0, sizeof(prog_data)); @@ -88,6 +110,17 @@ brw_codegen_cs_prog(struct brw_context *brw, if (INTEL_DEBUG & DEBUG_SHADER_TIME) st_index = brw_get_shader_time_index(brw, &cp->program, ST_CS, true); + /* If the work group size is variable we set it to the maximum here since + * the actual size is not known until the dispatch command is issued. 
+ */ + if (nir->info.cs.local_size_variable) { + prog_data.uses_variable_group_size = true; + nir->info.cs.max_variable_local_size = + gl_ctx->Const.MaxComputeWorkGroupSize[2]; + } else { + prog_data.uses_variable_group_size = false; + } + char *error_str; program = brw_compile_cs(brw->screen->compiler, brw, mem_ctx, key, &prog_data, nir, st_index, NULL, &error_str); diff --git a/src/mesa/drivers/dri/i965/brw_cs.h b/src/mesa/drivers/dri/i965/brw_cs.h index a0d43abaae4..9b0262000b6 100644 --- a/src/mesa/drivers/dri/i965/brw_cs.h +++ b/src/mesa/drivers/dri/i965/brw_cs.h @@ -29,6 +29,9 @@ extern "C" { #endif +uint32_t +brw_cs_group_size(const struct brw_context *brw); + void brw_upload_cs_prog(struct brw_context *brw); diff --git a/src/mesa/drivers/dri/i965/gen6_constant_state.c b/src/mesa/drivers/dri/i965/gen6_constant_state.c index 50e34fc9c8f..a4d82884b01 100644 --- a/src/mesa/drivers/dri/i965/gen6_constant_state.c +++ b/src/mesa/drivers/dri/i965/gen6_constant_state.c @@ -22,6 +22,7 @@ */ #include "brw_context.h" +#include "brw_cs.h" #include "brw_state.h" #include "brw_defines.h" #include "brw_program.h" @@ -62,6 +63,10 @@ brw_param_value(struct brw_context *brw, return f_as_u32(ctx->TessCtrlProgram.patch_default_inner_level[0]); } else if (param == BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_Y) { return f_as_u32(ctx->TessCtrlProgram.patch_default_inner_level[1]); + } else if (param >= BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X && + param <= BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_Z) { + unsigned i = param - BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X; + return brw->compute.group_size[i]; } else { unreachable("Invalid param builtin"); } @@ -303,8 +308,11 @@ brw_upload_cs_push_constants(struct brw_context *brw, /* XXX: Should this happen somewhere before to get our state flag set? 
*/ _mesa_load_state_parameters(ctx, prog->Parameters); + const unsigned threads = + DIV_ROUND_UP(brw_cs_group_size(brw), cs_prog_data->simd_size); const unsigned push_const_size = - brw_cs_push_const_total_size(cs_prog_data, cs_prog_data->threads); + brw_cs_push_const_total_size(cs_prog_data, threads); + if (push_const_size == 0) { stage_state->push_const_size = 0; return; @@ -330,7 +338,7 @@ brw_upload_cs_push_constants(struct brw_context *brw, } if (cs_prog_data->push.per_thread.size > 0) { - for (unsigned t = 0; t < cs_prog_data->threads; t++) { + for (unsigned t = 0; t < threads; t++) { unsigned dst = 8 * (cs_prog_data->push.per_thread.regs * t + cs_prog_data->push.cross_thread.regs); diff --git a/src/mesa/drivers/dri/i965/genX_state_upload.c b/src/mesa/drivers/dri/i965/genX_state_upload.c index fed5eda8e48..53c57074279 100644 --- a/src/mesa/drivers/dri/i965/genX_state_upload.c +++ b/src/mesa/drivers/dri/i965/genX_state_upload.c @@ -37,6 +37,7 @@ #include "genX_boilerplate.h" #include "brw_context.h" +#include "brw_cs.h" #include "brw_draw.h" #include "brw_multisample_state.h" #include "brw_state.h" @@ -4263,6 +4264,12 @@ genX(upload_cs_state)(struct brw_context *brw) struct brw_cs_prog_data *cs_prog_data = brw_cs_prog_data(prog_data); const struct gen_device_info *devinfo = &brw->screen->devinfo; + const unsigned threads = + DIV_ROUND_UP(brw_cs_group_size(brw), cs_prog_data->simd_size); + + if (!cs_prog_data->uses_variable_group_size) + assert(cs_prog_data->threads == threads); + if (INTEL_DEBUG & DEBUG_SHADER_TIME) { brw_emit_buffer_surface_state( brw, &stage_state->surf_offset[ @@ -4353,13 +4360,13 @@ genX(upload_cs_state)(struct brw_context *brw) vfe.URBEntryAllocationSize = GEN_GEN >= 8 ? 
2 : 0; const uint32_t vfe_curbe_allocation = - ALIGN(cs_prog_data->push.per_thread.regs * cs_prog_data->threads + + ALIGN(cs_prog_data->push.per_thread.regs * threads + cs_prog_data->push.cross_thread.regs, 2); vfe.CURBEAllocationSize = vfe_curbe_allocation; } const unsigned push_const_size = - brw_cs_push_const_total_size(cs_prog_data, cs_prog_data->threads); + brw_cs_push_const_total_size(cs_prog_data, threads); if (push_const_size > 0) { brw_batch_emit(brw, GENX(MEDIA_CURBE_LOAD), curbe) { curbe.CURBETotalDataLength = ALIGN(push_const_size, 64); @@ -4378,7 +4385,7 @@ genX(upload_cs_state)(struct brw_context *brw) DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4), .BindingTablePointer = stage_state->bind_bo_offset, .ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs, - .NumberofThreadsinGPGPUThreadGroup = cs_prog_data->threads, + .NumberofThreadsinGPGPUThreadGroup = threads, .SharedLocalMemorySize = encode_slm_size(GEN_GEN, prog_data->total_shared), .BarrierEnable = cs_prog_data->uses_barrier, @@ -4484,9 +4491,9 @@ genX(emit_gpgpu_walker)(struct brw_context *brw) if (indirect) prepare_indirect_gpgpu_walker(brw); + const unsigned group_size = brw_cs_group_size(brw); const unsigned simd_size = prog_data->simd_size; - unsigned group_size = prog_data->local_size[0] * - prog_data->local_size[1] * prog_data->local_size[2]; + unsigned thread_width_max = DIV_ROUND_UP(group_size, simd_size); uint32_t right_mask = 0xffffffffu >> (32 - simd_size); const unsigned right_non_aligned = group_size & (simd_size - 1); @@ -4499,7 +4506,7 @@ genX(emit_gpgpu_walker)(struct brw_context *brw) ggw.SIMDSize = prog_data->simd_size / 16; ggw.ThreadDepthCounterMaximum = 0; ggw.ThreadHeightCounterMaximum = 0; - ggw.ThreadWidthCounterMaximum = prog_data->threads - 1; + ggw.ThreadWidthCounterMaximum = thread_width_max - 1; ggw.ThreadGroupIDXDimension = num_groups[0]; ggw.ThreadGroupIDYDimension = num_groups[1]; ggw.ThreadGroupIDZDimension = num_groups[2]; diff --git 
a/src/mesa/drivers/dri/i965/intel_extensions.c b/src/mesa/drivers/dri/i965/intel_extensions.c index 2bf6708a152..276f7aa3d07 100644 --- a/src/mesa/drivers/dri/i965/intel_extensions.c +++ b/src/mesa/drivers/dri/i965/intel_extensions.c @@ -271,6 +271,7 @@ intelInitExtensions(struct gl_context *ctx) ctx->Extensions.ARB_ES3_1_compatibility = devinfo->gen >= 8 || devinfo->is_haswell; ctx->Extensions.NV_compute_shader_derivatives = true; + ctx->Extensions.ARB_compute_variable_group_size = true; } if (can_do_predicate_writes(brw->screen)) { -- 2.30.2