From: Jordan Justen Date: Thu, 26 May 2016 20:49:07 +0000 (-0700) Subject: i965: Store number of threads in brw_cs_prog_data X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=1b79e7ebbd77a7e714fafadd91459059aacf2407;p=mesa.git i965: Store number of threads in brw_cs_prog_data Cc: "12.0" Signed-off-by: Jordan Justen Reviewed-by: Jason Ekstrand --- diff --git a/src/intel/vulkan/anv_cmd_buffer.c b/src/intel/vulkan/anv_cmd_buffer.c index 4d0fd7c67ff..63d096c6e8b 100644 --- a/src/intel/vulkan/anv_cmd_buffer.c +++ b/src/intel/vulkan/anv_cmd_buffer.c @@ -1076,9 +1076,8 @@ anv_cmd_buffer_cs_push_constants(struct anv_cmd_buffer *cmd_buffer) if (reg_aligned_constant_size == 0) return (struct anv_state) { .offset = 0 }; - const unsigned threads = pipeline->cs_thread_width_max; const unsigned total_push_constants_size = - reg_aligned_constant_size * threads; + reg_aligned_constant_size * cs_prog_data->threads; const unsigned push_constant_alignment = cmd_buffer->device->info.gen < 8 ? 32 : 64; const unsigned aligned_total_push_constants_size = @@ -1091,7 +1090,7 @@ anv_cmd_buffer_cs_push_constants(struct anv_cmd_buffer *cmd_buffer) /* Walk through the param array and fill the buffer with data */ uint32_t *u32_map = state.map; - brw_cs_fill_local_id_payload(cs_prog_data, u32_map, threads, + brw_cs_fill_local_id_payload(cs_prog_data, u32_map, cs_prog_data->threads, reg_aligned_constant_size); /* Setup uniform data for the first thread */ @@ -1102,7 +1101,7 @@ anv_cmd_buffer_cs_push_constants(struct anv_cmd_buffer *cmd_buffer) /* Copy uniform data from the first thread to every other thread */ const size_t uniform_data_size = prog_data->nr_params * sizeof(uint32_t); - for (unsigned t = 1; t < threads; t++) { + for (unsigned t = 1; t < cs_prog_data->threads; t++) { memcpy(&u32_map[t * param_aligned_count + local_id_dwords], &u32_map[local_id_dwords], uniform_data_size); diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index 7325f3f5e61..26ffbd65666 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -1474,7 +1474,6 @@ struct anv_pipeline { bool primitive_restart; uint32_t topology; - uint32_t cs_thread_width_max; uint32_t cs_right_mask; struct { diff --git a/src/intel/vulkan/gen7_cmd_buffer.c b/src/intel/vulkan/gen7_cmd_buffer.c index 331275e5a88..40ab0089908 100644 --- a/src/intel/vulkan/gen7_cmd_buffer.c +++ b/src/intel/vulkan/gen7_cmd_buffer.c @@ -271,7 +271,7 @@ flush_compute_descriptor_set(struct anv_cmd_buffer *cmd_buffer) .BarrierEnable = cs_prog_data->uses_barrier, .SharedLocalMemorySize = slm_size, .NumberofThreadsinGPGPUThreadGroup = - pipeline->cs_thread_width_max); + cs_prog_data->threads); const uint32_t size = GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t); anv_batch_emit(&cmd_buffer->batch, diff --git a/src/intel/vulkan/gen8_cmd_buffer.c b/src/intel/vulkan/gen8_cmd_buffer.c index 547fedd1eab..e139e8af961 100644 --- a/src/intel/vulkan/gen8_cmd_buffer.c +++ b/src/intel/vulkan/gen8_cmd_buffer.c @@ -356,7 +356,7 @@ flush_compute_descriptor_set(struct anv_cmd_buffer *cmd_buffer) .BarrierEnable = cs_prog_data->uses_barrier, .SharedLocalMemorySize = slm_size, .NumberofThreadsinGPGPUThreadGroup = - pipeline->cs_thread_width_max); + cs_prog_data->threads); uint32_t size = GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t); anv_batch_emit(&cmd_buffer->batch, diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c index e7d322c3f30..d9acf58517c 100644 --- a/src/intel/vulkan/genX_cmd_buffer.c +++ b/src/intel/vulkan/genX_cmd_buffer.c @@ -773,7 +773,7 @@ void genX(CmdDispatch)( ggw.SIMDSize = prog_data->simd_size / 16; ggw.ThreadDepthCounterMaximum = 0; ggw.ThreadHeightCounterMaximum = 0; - ggw.ThreadWidthCounterMaximum = pipeline->cs_thread_width_max - 1; + ggw.ThreadWidthCounterMaximum = prog_data->threads - 1; ggw.ThreadGroupIDXDimension = x; ggw.ThreadGroupIDYDimension = y; ggw.ThreadGroupIDZDimension = z; @@ -874,7 +874,7 @@ void genX(CmdDispatchIndirect)( ggw.SIMDSize = prog_data->simd_size / 16; ggw.ThreadDepthCounterMaximum = 0; ggw.ThreadHeightCounterMaximum = 0; - ggw.ThreadWidthCounterMaximum = pipeline->cs_thread_width_max - 1; + ggw.ThreadWidthCounterMaximum = prog_data->threads - 1; ggw.RightExecutionMask = pipeline->cs_right_mask; ggw.BottomExecutionMask = 0xffffffff; } diff --git a/src/intel/vulkan/genX_pipeline.c b/src/intel/vulkan/genX_pipeline.c index 918a9a4f03b..1776577d0a2 100644 --- a/src/intel/vulkan/genX_pipeline.c +++ b/src/intel/vulkan/genX_pipeline.c @@ -97,8 +97,6 @@ genX(compute_pipeline_create)( uint32_t group_size = cs_prog_data->local_size[0] * cs_prog_data->local_size[1] * cs_prog_data->local_size[2]; - pipeline->cs_thread_width_max = - DIV_ROUND_UP(group_size, cs_prog_data->simd_size); uint32_t remainder = group_size & (cs_prog_data->simd_size - 1); if (remainder > 0) @@ -107,7 +105,7 @@ genX(compute_pipeline_create)( pipeline->cs_right_mask = ~0u >> (32 - cs_prog_data->simd_size); const uint32_t vfe_curbe_allocation = - push_constant_regs * pipeline->cs_thread_width_max; + push_constant_regs * cs_prog_data->threads; anv_batch_emit(&pipeline->batch, GENX(MEDIA_VFE_STATE), vfe) { vfe.ScratchSpaceBasePointer = pipeline->scratch_start[MESA_SHADER_COMPUTE]; diff --git a/src/mesa/drivers/dri/i965/brw_compiler.h b/src/mesa/drivers/dri/i965/brw_compiler.h index bed969cf18a..f1f9e5614e2 100644 --- a/src/mesa/drivers/dri/i965/brw_compiler.h +++ b/src/mesa/drivers/dri/i965/brw_compiler.h @@ -430,6 +430,7 @@ struct brw_cs_prog_data { GLuint dispatch_grf_start_reg_16; unsigned local_size[3]; unsigned simd_size; + unsigned threads; bool uses_barrier; bool uses_num_work_groups; unsigned local_invocation_id_regs; diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index 975ac9eb6ba..14b0b428d21 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -6569,6 +6569,15 @@ fs_visitor::emit_cs_work_group_id_setup() return reg; } +static void +cs_set_simd_size(struct brw_cs_prog_data *cs_prog_data, unsigned size) +{ + cs_prog_data->simd_size = size; + unsigned group_size = cs_prog_data->local_size[0] * + cs_prog_data->local_size[1] * cs_prog_data->local_size[2]; + cs_prog_data->threads = (group_size + size - 1) / size; +} + const unsigned * brw_compile_cs(const struct brw_compiler *compiler, void *log_data, void *mem_ctx, @@ -6625,7 +6634,7 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data, fail_msg = v8.fail_msg; } else { cfg = v8.cfg; - prog_data->simd_size = 8; + cs_set_simd_size(prog_data, 8); prog_data->base.dispatch_grf_start_reg = v8.payload.num_regs; } } @@ -6650,7 +6659,7 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data, } } else { cfg = v16.cfg; - prog_data->simd_size = 16; + cs_set_simd_size(prog_data, 16); prog_data->dispatch_grf_start_reg_16 = v16.payload.num_regs; } } @@ -6677,7 +6686,7 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data, } } else { cfg = v32.cfg; - prog_data->simd_size = 32; + cs_set_simd_size(prog_data, 32); } } diff --git a/src/mesa/drivers/dri/i965/gen7_cs_state.c b/src/mesa/drivers/dri/i965/gen7_cs_state.c index 7f484dd1586..619edfb0acc 100644 --- a/src/mesa/drivers/dri/i965/gen7_cs_state.c +++ b/src/mesa/drivers/dri/i965/gen7_cs_state.c @@ -33,17 +33,6 @@ #include "program/prog_statevars.h" #include "compiler/glsl/ir_uniform.h" -static unsigned -get_cs_thread_count(const struct brw_cs_prog_data *cs_prog_data) -{ - const unsigned simd_size = cs_prog_data->simd_size; - unsigned group_size = cs_prog_data->local_size[0] * - cs_prog_data->local_size[1] * cs_prog_data->local_size[2]; - - return (group_size + simd_size - 1) / simd_size; -} - - static void brw_upload_cs_state(struct brw_context *brw) { @@ -79,7 +68,6 @@ brw_upload_cs_state(struct brw_context *brw) (prog_data->nr_params + local_id_dwords) * sizeof(gl_constant_value); unsigned reg_aligned_constant_size = ALIGN(push_constant_data_size, 32); unsigned push_constant_regs = reg_aligned_constant_size / 32; - unsigned threads = get_cs_thread_count(cs_prog_data); uint32_t dwords = brw->gen < 8 ? 8 : 9; BEGIN_BATCH(dwords); @@ -129,7 +117,8 @@ brw_upload_cs_state(struct brw_context *brw) * * Note: The constant data is built in brw_upload_cs_push_constants below. */ - const uint32_t vfe_curbe_allocation = push_constant_regs * threads; + const uint32_t vfe_curbe_allocation = + push_constant_regs * cs_prog_data->threads; OUT_BATCH(SET_FIELD(vfe_urb_allocation, MEDIA_VFE_STATE_URB_ALLOC) | SET_FIELD(vfe_curbe_allocation, MEDIA_VFE_STATE_CURBE_ALLOC)); OUT_BATCH(0); @@ -141,7 +130,7 @@ brw_upload_cs_state(struct brw_context *brw) BEGIN_BATCH(4); OUT_BATCH(MEDIA_CURBE_LOAD << 16 | (4 - 2)); OUT_BATCH(0); - OUT_BATCH(ALIGN(reg_aligned_constant_size * threads, 64)); + OUT_BATCH(ALIGN(reg_aligned_constant_size * cs_prog_data->threads, 64)); OUT_BATCH(stage_state->push_const_offset); ADVANCE_BATCH(); } @@ -163,9 +152,9 @@ brw_upload_cs_state(struct brw_context *brw) desc[dw++] = SET_FIELD(push_constant_regs, MEDIA_CURBE_READ_LENGTH); const uint32_t media_threads = brw->gen >= 8 ? - SET_FIELD(threads, GEN8_MEDIA_GPGPU_THREAD_COUNT) : - SET_FIELD(threads, MEDIA_GPGPU_THREAD_COUNT); - assert(threads <= brw->max_cs_threads); + SET_FIELD(cs_prog_data->threads, GEN8_MEDIA_GPGPU_THREAD_COUNT) : + SET_FIELD(cs_prog_data->threads, MEDIA_GPGPU_THREAD_COUNT); + assert(cs_prog_data->threads <= brw->max_cs_threads); assert(prog_data->total_shared <= 64 * 1024); uint32_t slm_size = 0; @@ -247,21 +236,20 @@ brw_upload_cs_push_constants(struct brw_context *brw, const unsigned param_aligned_count = reg_aligned_constant_size / sizeof(*param); - unsigned threads = get_cs_thread_count(cs_prog_data); - param = (gl_constant_value*) brw_state_batch(brw, type, - ALIGN(reg_aligned_constant_size * threads, 64), + ALIGN(reg_aligned_constant_size * + cs_prog_data->threads, 64), 64, &stage_state->push_const_offset); assert(param); STATIC_ASSERT(sizeof(gl_constant_value) == sizeof(float)); - brw_cs_fill_local_id_payload(cs_prog_data, param, threads, + brw_cs_fill_local_id_payload(cs_prog_data, param, cs_prog_data->threads, reg_aligned_constant_size); /* _NEW_PROGRAM_CONSTANTS */ - for (t = 0; t < threads; t++) { + for (t = 0; t < cs_prog_data->threads; t++) { gl_constant_value *next_param = ¶m[t * param_aligned_count + local_id_dwords]; for (i = 0; i < prog_data->nr_params; i++) {