From ee0fc0f6dcf6093f4e3ff0796ace3cb1590a72ea Mon Sep 17 00:00:00 2001 From: Caio Marcelo de Oliveira Filho Date: Thu, 21 May 2020 00:51:44 -0700 Subject: [PATCH 1/1] i965: Use new helper functions to pick SIMD variant for CS Also expand the existing i965 helper to return the other CS related parameters. Reviewed-by: Jason Ekstrand Part-of: --- src/mesa/drivers/dri/i965/brw_cs.c | 25 ++++++++++----- src/mesa/drivers/dri/i965/brw_cs.h | 10 ++++-- .../drivers/dri/i965/gen6_constant_state.c | 7 ++-- src/mesa/drivers/dri/i965/genX_state_upload.c | 32 +++++++++---------- 4 files changed, 43 insertions(+), 31 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_cs.c b/src/mesa/drivers/dri/i965/brw_cs.c index d6aceccd9f0..ef6b80edf62 100644 --- a/src/mesa/drivers/dri/i965/brw_cs.c +++ b/src/mesa/drivers/dri/i965/brw_cs.c @@ -32,25 +32,34 @@ #include "brw_program.h" #include "compiler/glsl/ir_uniform.h" -uint32_t -brw_cs_group_size(const struct brw_context *brw) +struct brw_cs_parameters +brw_cs_get_parameters(const struct brw_context *brw) { assert(brw->cs.base.prog_data); struct brw_cs_prog_data *cs_prog_data = brw_cs_prog_data(brw->cs.base.prog_data); + struct brw_cs_parameters params = {}; + if (brw->compute.group_size) { /* With ARB_compute_variable_group_size the group size is set at * dispatch time, so we can't use the one provided by the compiler. 
*/ - return brw->compute.group_size[0] * - brw->compute.group_size[1] * - brw->compute.group_size[2]; + params.group_size = brw->compute.group_size[0] * + brw->compute.group_size[1] * + brw->compute.group_size[2]; } else { - return cs_prog_data->local_size[0] * - cs_prog_data->local_size[1] * - cs_prog_data->local_size[2]; + params.group_size = cs_prog_data->local_size[0] * + cs_prog_data->local_size[1] * + cs_prog_data->local_size[2]; } + + params.simd_size = + brw_cs_simd_size_for_group_size(&brw->screen->devinfo, + cs_prog_data, params.group_size); + params.threads = DIV_ROUND_UP(params.group_size, params.simd_size); + + return params; } static void diff --git a/src/mesa/drivers/dri/i965/brw_cs.h b/src/mesa/drivers/dri/i965/brw_cs.h index 9b0262000b6..9d34af6def4 100644 --- a/src/mesa/drivers/dri/i965/brw_cs.h +++ b/src/mesa/drivers/dri/i965/brw_cs.h @@ -29,8 +29,14 @@ extern "C" { #endif -uint32_t -brw_cs_group_size(const struct brw_context *brw); +struct brw_cs_parameters { + unsigned group_size; + unsigned simd_size; + unsigned threads; +}; + +struct brw_cs_parameters +brw_cs_get_parameters(const struct brw_context *brw); void brw_upload_cs_prog(struct brw_context *brw); diff --git a/src/mesa/drivers/dri/i965/gen6_constant_state.c b/src/mesa/drivers/dri/i965/gen6_constant_state.c index a4d82884b01..c0e1f2fa353 100644 --- a/src/mesa/drivers/dri/i965/gen6_constant_state.c +++ b/src/mesa/drivers/dri/i965/gen6_constant_state.c @@ -308,10 +308,9 @@ brw_upload_cs_push_constants(struct brw_context *brw, /* XXX: Should this happen somewhere before to get our state flag set? 
*/ _mesa_load_state_parameters(ctx, prog->Parameters); - const unsigned threads = - DIV_ROUND_UP(brw_cs_group_size(brw), cs_prog_data->simd_size); + const struct brw_cs_parameters cs_params = brw_cs_get_parameters(brw); const unsigned push_const_size = - brw_cs_push_const_total_size(cs_prog_data, threads); + brw_cs_push_const_total_size(cs_prog_data, cs_params.threads); if (push_const_size == 0) { stage_state->push_const_size = 0; @@ -338,7 +337,7 @@ brw_upload_cs_push_constants(struct brw_context *brw, } if (cs_prog_data->push.per_thread.size > 0) { - for (unsigned t = 0; t < threads; t++) { + for (unsigned t = 0; t < cs_params.threads; t++) { unsigned dst = 8 * (cs_prog_data->push.per_thread.regs * t + cs_prog_data->push.cross_thread.regs); diff --git a/src/mesa/drivers/dri/i965/genX_state_upload.c b/src/mesa/drivers/dri/i965/genX_state_upload.c index ac83337b691..2c34daedb80 100644 --- a/src/mesa/drivers/dri/i965/genX_state_upload.c +++ b/src/mesa/drivers/dri/i965/genX_state_upload.c @@ -4264,8 +4264,7 @@ genX(upload_cs_state)(struct brw_context *brw) struct brw_cs_prog_data *cs_prog_data = brw_cs_prog_data(prog_data); const struct gen_device_info *devinfo = &brw->screen->devinfo; - const unsigned threads = - DIV_ROUND_UP(brw_cs_group_size(brw), cs_prog_data->simd_size); + const struct brw_cs_parameters cs_params = brw_cs_get_parameters(brw); if (INTEL_DEBUG & DEBUG_SHADER_TIME) { brw_emit_buffer_surface_state( @@ -4357,13 +4356,13 @@ genX(upload_cs_state)(struct brw_context *brw) vfe.URBEntryAllocationSize = GEN_GEN >= 8 ? 
2 : 0; const uint32_t vfe_curbe_allocation = - ALIGN(cs_prog_data->push.per_thread.regs * threads + + ALIGN(cs_prog_data->push.per_thread.regs * cs_params.threads + cs_prog_data->push.cross_thread.regs, 2); vfe.CURBEAllocationSize = vfe_curbe_allocation; } const unsigned push_const_size = - brw_cs_push_const_total_size(cs_prog_data, threads); + brw_cs_push_const_total_size(cs_prog_data, cs_params.threads); if (push_const_size > 0) { brw_batch_emit(brw, GENX(MEDIA_CURBE_LOAD), curbe) { curbe.CURBETotalDataLength = ALIGN(push_const_size, 64); @@ -4374,15 +4373,18 @@ genX(upload_cs_state)(struct brw_context *brw) /* BRW_NEW_SURFACES and BRW_NEW_*_CONSTBUF */ memcpy(bind, stage_state->surf_offset, prog_data->binding_table.size_bytes); + const uint64_t ksp = brw->cs.base.prog_offset + + brw_cs_prog_data_prog_offset(cs_prog_data, + cs_params.simd_size); const struct GENX(INTERFACE_DESCRIPTOR_DATA) idd = { - .KernelStartPointer = brw->cs.base.prog_offset, + .KernelStartPointer = ksp, .SamplerStatePointer = stage_state->sampler_offset, /* WA_1606682166 */ .SamplerCount = GEN_GEN == 11 ? 
0 : DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4), .BindingTablePointer = stage_state->bind_bo_offset, .ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs, - .NumberofThreadsinGPGPUThreadGroup = threads, + .NumberofThreadsinGPGPUThreadGroup = cs_params.threads, .SharedLocalMemorySize = encode_slm_size(GEN_GEN, prog_data->total_shared), .BarrierEnable = cs_prog_data->uses_barrier, @@ -4479,31 +4481,27 @@ prepare_indirect_gpgpu_walker(struct brw_context *brw) static void genX(emit_gpgpu_walker)(struct brw_context *brw) { - const struct brw_cs_prog_data *prog_data = - brw_cs_prog_data(brw->cs.base.prog_data); - const GLuint *num_groups = brw->compute.num_work_groups; bool indirect = brw->compute.num_work_groups_bo != NULL; if (indirect) prepare_indirect_gpgpu_walker(brw); - const unsigned group_size = brw_cs_group_size(brw); - const unsigned simd_size = prog_data->simd_size; - unsigned thread_width_max = DIV_ROUND_UP(group_size, simd_size); + const struct brw_cs_parameters cs_params = brw_cs_get_parameters(brw); - uint32_t right_mask = 0xffffffffu >> (32 - simd_size); - const unsigned right_non_aligned = group_size & (simd_size - 1); + uint32_t right_mask = 0xffffffffu >> (32 - cs_params.simd_size); + const unsigned right_non_aligned = + cs_params.group_size & (cs_params.simd_size - 1); if (right_non_aligned != 0) - right_mask >>= (simd_size - right_non_aligned); + right_mask >>= (cs_params.simd_size - right_non_aligned); brw_batch_emit(brw, GENX(GPGPU_WALKER), ggw) { ggw.IndirectParameterEnable = indirect; ggw.PredicateEnable = GEN_GEN <= 7 && indirect; - ggw.SIMDSize = prog_data->simd_size / 16; + ggw.SIMDSize = cs_params.simd_size / 16; ggw.ThreadDepthCounterMaximum = 0; ggw.ThreadHeightCounterMaximum = 0; - ggw.ThreadWidthCounterMaximum = thread_width_max - 1; + ggw.ThreadWidthCounterMaximum = cs_params.threads - 1; ggw.ThreadGroupIDXDimension = num_groups[0]; ggw.ThreadGroupIDYDimension = num_groups[1]; ggw.ThreadGroupIDZDimension = 
num_groups[2]; -- 2.30.2