From 3ba9594f32239031ddeff764e9896d48d05125d0 Mon Sep 17 00:00:00 2001 From: Jordan Justen Date: Fri, 27 May 2016 00:53:27 -0700 Subject: [PATCH] anv: Support new local ID generation & cross-thread constants The cross thread constant support appears on Haswell. It allows us to upload a set of uniform data for all threads without duplicating it per thread. We also support per-thread data which allows us to store a per-thread ID in one of the uniforms that can be used to calculate the gl_LocalInvocationIndex and gl_LocalInvocationID variables. v4: * Support the old local ID push constant layout as well (Jason) Cc: "12.0" Signed-off-by: Jordan Justen Reviewed-by: Jason Ekstrand --- src/intel/vulkan/anv_cmd_buffer.c | 54 +++++++++++++++++------------- src/intel/vulkan/gen7_cmd_buffer.c | 13 +++---- src/intel/vulkan/gen8_cmd_buffer.c | 13 +++---- src/intel/vulkan/genX_pipeline.c | 10 ++---- 4 files changed, 42 insertions(+), 48 deletions(-) diff --git a/src/intel/vulkan/anv_cmd_buffer.c b/src/intel/vulkan/anv_cmd_buffer.c index 63d096c6e8b..edaaa3d2efa 100644 --- a/src/intel/vulkan/anv_cmd_buffer.c +++ b/src/intel/vulkan/anv_cmd_buffer.c @@ -1065,23 +1065,14 @@ anv_cmd_buffer_cs_push_constants(struct anv_cmd_buffer *cmd_buffer) const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline); const struct brw_stage_prog_data *prog_data = &cs_prog_data->base; - const unsigned local_id_dwords = cs_prog_data->local_invocation_id_regs * 8; - const unsigned push_constant_data_size = - (local_id_dwords + prog_data->nr_params) * 4; - const unsigned reg_aligned_constant_size = ALIGN(push_constant_data_size, 32); - const unsigned param_aligned_count = - reg_aligned_constant_size / sizeof(uint32_t); - /* If we don't actually have any push constants, bail. */ - if (reg_aligned_constant_size == 0) + if (cs_prog_data->push.total.size == 0) return (struct anv_state) { .offset = 0 }; - const unsigned total_push_constants_size = - reg_aligned_constant_size * cs_prog_data->threads; const unsigned push_constant_alignment = cmd_buffer->device->info.gen < 8 ? 32 : 64; const unsigned aligned_total_push_constants_size = - ALIGN(total_push_constants_size, push_constant_alignment); + ALIGN(cs_prog_data->push.total.size, push_constant_alignment); struct anv_state state = anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, aligned_total_push_constants_size, @@ -1090,21 +1081,36 @@ anv_cmd_buffer_cs_push_constants(struct anv_cmd_buffer *cmd_buffer) /* Walk through the param array and fill the buffer with data */ uint32_t *u32_map = state.map; - brw_cs_fill_local_id_payload(cs_prog_data, u32_map, cs_prog_data->threads, - reg_aligned_constant_size); - - /* Setup uniform data for the first thread */ - for (unsigned i = 0; i < prog_data->nr_params; i++) { - uint32_t offset = (uintptr_t)prog_data->param[i]; - u32_map[local_id_dwords + i] = *(uint32_t *)((uint8_t *)data + offset); + if (cs_prog_data->push.cross_thread.size > 0) { + assert(cs_prog_data->thread_local_id_index < 0 || + cs_prog_data->thread_local_id_index >= + cs_prog_data->push.cross_thread.dwords); + for (unsigned i = 0; + i < cs_prog_data->push.cross_thread.dwords; + i++) { + uint32_t offset = (uintptr_t)prog_data->param[i]; + u32_map[i] = *(uint32_t *)((uint8_t *)data + offset); + } } - /* Copy uniform data from the first thread to every other thread */ - const size_t uniform_data_size = prog_data->nr_params * sizeof(uint32_t); - for (unsigned t = 1; t < cs_prog_data->threads; t++) { - memcpy(&u32_map[t * param_aligned_count + local_id_dwords], - &u32_map[local_id_dwords], - uniform_data_size); + if (cs_prog_data->push.per_thread.size > 0) { + brw_cs_fill_local_id_payload(cs_prog_data, u32_map, cs_prog_data->threads, + cs_prog_data->push.per_thread.size); + for (unsigned t = 0; t < cs_prog_data->threads; t++) { + unsigned dst = + 8 * (cs_prog_data->push.per_thread.regs * t + + cs_prog_data->push.cross_thread.regs + + cs_prog_data->local_invocation_id_regs); + unsigned src = cs_prog_data->push.cross_thread.dwords; + for ( ; src < prog_data->nr_params; src++, dst++) { + if (src != cs_prog_data->thread_local_id_index) { + uint32_t offset = (uintptr_t)prog_data->param[src]; + u32_map[dst] = *(uint32_t *)((uint8_t *)data + offset); + } else { + u32_map[dst] = t * cs_prog_data->simd_size; + } + } + } } if (!cmd_buffer->device->info.has_llc) diff --git a/src/intel/vulkan/gen7_cmd_buffer.c b/src/intel/vulkan/gen7_cmd_buffer.c index 40ab0089908..478122b9c0f 100644 --- a/src/intel/vulkan/gen7_cmd_buffer.c +++ b/src/intel/vulkan/gen7_cmd_buffer.c @@ -234,12 +234,6 @@ flush_compute_descriptor_set(struct anv_cmd_buffer *cmd_buffer) const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline); const struct brw_stage_prog_data *prog_data = &cs_prog_data->base; - unsigned local_id_dwords = cs_prog_data->local_invocation_id_regs * 8; - unsigned push_constant_data_size = - (prog_data->nr_params + local_id_dwords) * 4; - unsigned reg_aligned_constant_size = ALIGN(push_constant_data_size, 32); - unsigned push_constant_regs = reg_aligned_constant_size / 32; - if (push_state.alloc_size) { anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_CURBE_LOAD), curbe) { curbe.CURBETotalDataLength = push_state.alloc_size; @@ -264,8 +258,11 @@ flush_compute_descriptor_set(struct anv_cmd_buffer *cmd_buffer) .BindingTablePointer = surfaces.offset, .SamplerStatePointer = samplers.offset, .ConstantURBEntryReadLength = - push_constant_regs, -#if !GEN_IS_HASWELL + cs_prog_data->push.per_thread.regs, +#if GEN_IS_HASWELL + .CrossThreadConstantDataReadLength = + cs_prog_data->push.cross_thread.regs, +#else .ConstantURBEntryReadOffset = 0, #endif .BarrierEnable = cs_prog_data->uses_barrier, diff --git a/src/intel/vulkan/gen8_cmd_buffer.c b/src/intel/vulkan/gen8_cmd_buffer.c index e139e8af961..1fdc7d29ea0 100644 --- a/src/intel/vulkan/gen8_cmd_buffer.c +++ b/src/intel/vulkan/gen8_cmd_buffer.c @@ -319,12 +319,6 @@ flush_compute_descriptor_set(struct anv_cmd_buffer *cmd_buffer) const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline); const struct brw_stage_prog_data *prog_data = &cs_prog_data->base; - unsigned local_id_dwords = cs_prog_data->local_invocation_id_regs * 8; - unsigned push_constant_data_size = - (prog_data->nr_params + local_id_dwords) * 4; - unsigned reg_aligned_constant_size = ALIGN(push_constant_data_size, 32); - unsigned push_constant_regs = reg_aligned_constant_size / 32; - if (push_state.alloc_size) { anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_CURBE_LOAD), curbe) { curbe.CURBETotalDataLength = push_state.alloc_size; @@ -351,12 +345,15 @@ flush_compute_descriptor_set(struct anv_cmd_buffer *cmd_buffer) .BindingTableEntryCount = 0, .SamplerStatePointer = samplers.offset, .SamplerCount = 0, - .ConstantIndirectURBEntryReadLength = push_constant_regs, + .ConstantIndirectURBEntryReadLength = + cs_prog_data->push.per_thread.regs, .ConstantURBEntryReadOffset = 0, .BarrierEnable = cs_prog_data->uses_barrier, .SharedLocalMemorySize = slm_size, .NumberofThreadsinGPGPUThreadGroup = - cs_prog_data->threads); + cs_prog_data->threads, + .CrossThreadConstantDataReadLength = + cs_prog_data->push.cross_thread.regs); uint32_t size = GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t); anv_batch_emit(&cmd_buffer->batch, diff --git a/src/intel/vulkan/genX_pipeline.c b/src/intel/vulkan/genX_pipeline.c index 1776577d0a2..458e80c82b1 100644 --- a/src/intel/vulkan/genX_pipeline.c +++ b/src/intel/vulkan/genX_pipeline.c @@ -87,13 +87,6 @@ genX(compute_pipeline_create)( anv_setup_pipeline_l3_config(pipeline); const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline); - const struct brw_stage_prog_data *prog_data = &cs_prog_data->base; - - unsigned local_id_dwords = cs_prog_data->local_invocation_id_regs * 8; - unsigned push_constant_data_size = - (prog_data->nr_params + local_id_dwords) * 4; - unsigned reg_aligned_constant_size = ALIGN(push_constant_data_size, 32); - unsigned push_constant_regs = reg_aligned_constant_size / 32; uint32_t group_size = cs_prog_data->local_size[0] * cs_prog_data->local_size[1] * cs_prog_data->local_size[2]; @@ -105,7 +98,8 @@ genX(compute_pipeline_create)( pipeline->cs_right_mask = ~0u >> (32 - cs_prog_data->simd_size); const uint32_t vfe_curbe_allocation = - push_constant_regs * cs_prog_data->threads; + ALIGN(cs_prog_data->push.per_thread.regs * cs_prog_data->threads + + cs_prog_data->push.cross_thread.regs, 2); anv_batch_emit(&pipeline->batch, GENX(MEDIA_VFE_STATE), vfe) { vfe.ScratchSpaceBasePointer = pipeline->scratch_start[MESA_SHADER_COMPUTE]; -- 2.30.2