From d1c4e64a69e49c64148529024ecb700d18d3c1c8 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand
Date: Thu, 31 Oct 2019 15:57:52 -0500
Subject: [PATCH] intel/compiler: Add a flag to avoid compacting push
 constants

In vec4, we can just not run the pass.  In fs, things are a bit more
deeply intertwined.

Reviewed-by: Lionel Landwerlin
---
 src/gallium/drivers/iris/iris_screen.c   |   1 +
 src/intel/compiler/brw_compiler.h        |   6 +
 src/intel/compiler/brw_fs.cpp            | 303 ++++++++++++-----------
 src/intel/compiler/brw_vec4.cpp          |   3 +
 src/intel/vulkan/anv_device.c            |   1 +
 src/mesa/drivers/dri/i965/intel_screen.c |   1 +
 6 files changed, 170 insertions(+), 145 deletions(-)

diff --git a/src/gallium/drivers/iris/iris_screen.c b/src/gallium/drivers/iris/iris_screen.c
index f1bb03de8f9..cbefa9dfb13 100644
--- a/src/gallium/drivers/iris/iris_screen.c
+++ b/src/gallium/drivers/iris/iris_screen.c
@@ -673,6 +673,7 @@ iris_screen_create(int fd, const struct pipe_screen_config *config)
    screen->compiler->shader_perf_log = iris_shader_perf_log;
    screen->compiler->supports_pull_constants = false;
    screen->compiler->supports_shader_constants = true;
+   screen->compiler->compact_params = false;
 
    iris_disk_cache_init(screen);
 
diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h
index dd2de2094e7..6e6610a26af 100644
--- a/src/intel/compiler/brw_compiler.h
+++ b/src/intel/compiler/brw_compiler.h
@@ -119,6 +119,12 @@ struct brw_compiler {
     * whether nir_opt_large_constants will be run.
     */
    bool supports_shader_constants;
+
+   /**
+    * Whether or not the driver wants uniform params to be compacted by the
+    * back-end compiler.
+    */
+   bool compact_params;
 };
 
 /**
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 37a90d0d489..0d3d6137058 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -2307,159 +2307,190 @@ fs_visitor::assign_constant_locations()
       return;
    }
 
-   struct uniform_slot_info slots[uniforms];
-   memset(slots, 0, sizeof(slots));
+   if (compiler->compact_params) {
+      struct uniform_slot_info slots[uniforms];
+      memset(slots, 0, sizeof(slots));
 
-   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
-      for (int i = 0 ; i < inst->sources; i++) {
-         if (inst->src[i].file != UNIFORM)
-            continue;
+      foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
+         for (int i = 0 ; i < inst->sources; i++) {
+            if (inst->src[i].file != UNIFORM)
+               continue;
 
-         /* NIR tightly packs things so the uniform number might not be
-          * aligned (if we have a double right after a float, for instance).
-          * This is fine because the process of re-arranging them will ensure
-          * that things are properly aligned.  The offset into that uniform,
-          * however, must be aligned.
-          *
-          * In Vulkan, we have explicit offsets but everything is crammed
-          * into a single "variable" so inst->src[i].nr will always be 0.
-          * Everything will be properly aligned relative to that one base.
-          */
-         assert(inst->src[i].offset % type_sz(inst->src[i].type) == 0);
+            /* NIR tightly packs things so the uniform number might not be
+             * aligned (if we have a double right after a float, for
+             * instance).  This is fine because the process of re-arranging
+             * them will ensure that things are properly aligned.  The offset
+             * into that uniform, however, must be aligned.
+             *
+             * In Vulkan, we have explicit offsets but everything is crammed
+             * into a single "variable" so inst->src[i].nr will always be 0.
+             * Everything will be properly aligned relative to that one base.
+             */
+            assert(inst->src[i].offset % type_sz(inst->src[i].type) == 0);
 
-         unsigned u = inst->src[i].nr +
-                      inst->src[i].offset / UNIFORM_SLOT_SIZE;
+            unsigned u = inst->src[i].nr +
+                         inst->src[i].offset / UNIFORM_SLOT_SIZE;
 
-         if (u >= uniforms)
-            continue;
+            if (u >= uniforms)
+               continue;
 
-         unsigned slots_read;
-         if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) {
-            slots_read = DIV_ROUND_UP(inst->src[2].ud, UNIFORM_SLOT_SIZE);
-         } else {
-            unsigned bytes_read = inst->components_read(i) *
-                                  type_sz(inst->src[i].type);
-            slots_read = DIV_ROUND_UP(bytes_read, UNIFORM_SLOT_SIZE);
-         }
+            unsigned slots_read;
+            if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) {
+               slots_read = DIV_ROUND_UP(inst->src[2].ud, UNIFORM_SLOT_SIZE);
+            } else {
+               unsigned bytes_read = inst->components_read(i) *
+                                     type_sz(inst->src[i].type);
+               slots_read = DIV_ROUND_UP(bytes_read, UNIFORM_SLOT_SIZE);
+            }
 
-         assert(u + slots_read <= uniforms);
-         mark_uniform_slots_read(&slots[u], slots_read,
-                                 type_sz(inst->src[i].type));
+            assert(u + slots_read <= uniforms);
+            mark_uniform_slots_read(&slots[u], slots_read,
+                                    type_sz(inst->src[i].type));
+         }
       }
-   }
 
-   int subgroup_id_index = get_subgroup_id_param_index(stage_prog_data);
+      int subgroup_id_index = get_subgroup_id_param_index(stage_prog_data);
 
-   /* Only allow 16 registers (128 uniform components) as push constants.
-    *
-    * Just demote the end of the list.  We could probably do better
-    * here, demoting things that are rarely used in the program first.
-    *
-    * If changing this value, note the limitation about total_regs in
-    * brw_curbe.c.
-    */
-   unsigned int max_push_components = 16 * 8;
-   if (subgroup_id_index >= 0)
-      max_push_components--; /* Save a slot for the thread ID */
+      /* Only allow 16 registers (128 uniform components) as push constants.
+       *
+       * Just demote the end of the list.  We could probably do better
+       * here, demoting things that are rarely used in the program first.
+       *
+       * If changing this value, note the limitation about total_regs in
+       * brw_curbe.c.
+       */
+      unsigned int max_push_components = 16 * 8;
+      if (subgroup_id_index >= 0)
+         max_push_components--; /* Save a slot for the thread ID */
 
-   /* We push small arrays, but no bigger than 16 floats.  This is big enough
-    * for a vec4 but hopefully not large enough to push out other stuff.  We
-    * should probably use a better heuristic at some point.
-    */
-   const unsigned int max_chunk_size = 16;
+      /* We push small arrays, but no bigger than 16 floats.  This is big
+       * enough for a vec4 but hopefully not large enough to push out other
+       * stuff.  We should probably use a better heuristic at some point.
+       */
+      const unsigned int max_chunk_size = 16;
 
-   unsigned int num_push_constants = 0;
-   unsigned int num_pull_constants = 0;
+      unsigned int num_push_constants = 0;
+      unsigned int num_pull_constants = 0;
 
-   push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
-   pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
+      push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
+      pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
 
-   /* Default to -1 meaning no location */
-   memset(push_constant_loc, -1, uniforms * sizeof(*push_constant_loc));
-   memset(pull_constant_loc, -1, uniforms * sizeof(*pull_constant_loc));
+      /* Default to -1 meaning no location */
+      memset(push_constant_loc, -1, uniforms * sizeof(*push_constant_loc));
+      memset(pull_constant_loc, -1, uniforms * sizeof(*pull_constant_loc));
 
-   int chunk_start = -1;
-   struct cplx_align align;
-   for (unsigned u = 0; u < uniforms; u++) {
-      if (!slots[u].is_live) {
-         assert(chunk_start == -1);
-         continue;
-      }
+      int chunk_start = -1;
+      struct cplx_align align;
+      for (unsigned u = 0; u < uniforms; u++) {
+         if (!slots[u].is_live) {
+            assert(chunk_start == -1);
+            continue;
+         }
 
-      /* Skip subgroup_id_index to put it in the last push register. */
-      if (subgroup_id_index == (int)u)
-         continue;
+         /* Skip subgroup_id_index to put it in the last push register. */
+         if (subgroup_id_index == (int)u)
+            continue;
 
-      if (chunk_start == -1) {
-         chunk_start = u;
-         align = slots[u].align;
-      } else {
-         /* Offset into the chunk */
-         unsigned chunk_offset = (u - chunk_start) * UNIFORM_SLOT_SIZE;
+         if (chunk_start == -1) {
+            chunk_start = u;
+            align = slots[u].align;
+         } else {
+            /* Offset into the chunk */
+            unsigned chunk_offset = (u - chunk_start) * UNIFORM_SLOT_SIZE;
 
-         /* Shift the slot alignment down by the chunk offset so it is
-          * comparable with the base chunk alignment.
-          */
-         struct cplx_align slot_align = slots[u].align;
-         slot_align.offset =
-            (slot_align.offset - chunk_offset) & (align.mul - 1);
+            /* Shift the slot alignment down by the chunk offset so it is
+             * comparable with the base chunk alignment.
+             */
+            struct cplx_align slot_align = slots[u].align;
+            slot_align.offset =
+               (slot_align.offset - chunk_offset) & (align.mul - 1);
 
-         align = cplx_align_combine(align, slot_align);
-      }
+            align = cplx_align_combine(align, slot_align);
+         }
 
-      /* Sanity check the alignment */
-      cplx_align_assert_sane(align);
+         /* Sanity check the alignment */
+         cplx_align_assert_sane(align);
 
-      if (slots[u].contiguous)
-         continue;
+         if (slots[u].contiguous)
+            continue;
 
-      /* Adjust the alignment to be in terms of slots, not bytes */
-      assert((align.mul & (UNIFORM_SLOT_SIZE - 1)) == 0);
-      assert((align.offset & (UNIFORM_SLOT_SIZE - 1)) == 0);
-      align.mul /= UNIFORM_SLOT_SIZE;
-      align.offset /= UNIFORM_SLOT_SIZE;
-
-      unsigned push_start_align = cplx_align_apply(align, num_push_constants);
-      unsigned chunk_size = u - chunk_start + 1;
-      if ((!compiler->supports_pull_constants && u < UBO_START) ||
-          (chunk_size < max_chunk_size &&
-           push_start_align + chunk_size <= max_push_components)) {
-         /* Align up the number of push constants */
-         num_push_constants = push_start_align;
-         for (unsigned i = 0; i < chunk_size; i++)
-            push_constant_loc[chunk_start + i] = num_push_constants++;
-      } else {
-         /* We need to pull this one */
-         num_pull_constants = cplx_align_apply(align, num_pull_constants);
-         for (unsigned i = 0; i < chunk_size; i++)
-            pull_constant_loc[chunk_start + i] = num_pull_constants++;
+         /* Adjust the alignment to be in terms of slots, not bytes */
+         assert((align.mul & (UNIFORM_SLOT_SIZE - 1)) == 0);
+         assert((align.offset & (UNIFORM_SLOT_SIZE - 1)) == 0);
+         align.mul /= UNIFORM_SLOT_SIZE;
+         align.offset /= UNIFORM_SLOT_SIZE;
+
+         unsigned push_start_align = cplx_align_apply(align, num_push_constants);
+         unsigned chunk_size = u - chunk_start + 1;
+         if ((!compiler->supports_pull_constants && u < UBO_START) ||
+             (chunk_size < max_chunk_size &&
+              push_start_align + chunk_size <= max_push_components)) {
+            /* Align up the number of push constants */
+            num_push_constants = push_start_align;
+            for (unsigned i = 0; i < chunk_size; i++)
+               push_constant_loc[chunk_start + i] = num_push_constants++;
+         } else {
+            /* We need to pull this one */
+            num_pull_constants = cplx_align_apply(align, num_pull_constants);
+            for (unsigned i = 0; i < chunk_size; i++)
+               pull_constant_loc[chunk_start + i] = num_pull_constants++;
+         }
+
+         /* Reset the chunk and start again */
+         chunk_start = -1;
       }
 
-      /* Reset the chunk and start again */
-      chunk_start = -1;
-   }
+      /* Add the CS local thread ID uniform at the end of the push constants */
+      if (subgroup_id_index >= 0)
+         push_constant_loc[subgroup_id_index] = num_push_constants++;
 
-   /* Add the CS local thread ID uniform at the end of the push constants */
-   if (subgroup_id_index >= 0)
-      push_constant_loc[subgroup_id_index] = num_push_constants++;
+      /* As the uniforms are going to be reordered, stash the old array and
+       * create two new arrays for push/pull params.
+       */
+      uint32_t *param = stage_prog_data->param;
+      stage_prog_data->nr_params = num_push_constants;
+      if (num_push_constants) {
+         stage_prog_data->param = rzalloc_array(mem_ctx, uint32_t,
+                                                num_push_constants);
+      } else {
+         stage_prog_data->param = NULL;
+      }
+      assert(stage_prog_data->nr_pull_params == 0);
+      assert(stage_prog_data->pull_param == NULL);
+      if (num_pull_constants > 0) {
+         stage_prog_data->nr_pull_params = num_pull_constants;
+         stage_prog_data->pull_param = rzalloc_array(mem_ctx, uint32_t,
+                                                     num_pull_constants);
+      }
 
-   /* As the uniforms are going to be reordered, stash the old array and
-    * create two new arrays for push/pull params.
-    */
-   uint32_t *param = stage_prog_data->param;
-   stage_prog_data->nr_params = num_push_constants;
-   if (num_push_constants) {
-      stage_prog_data->param = rzalloc_array(mem_ctx, uint32_t,
-                                             num_push_constants);
+      /* Up until now, the param[] array has been indexed by reg + offset
+       * of UNIFORM registers.  Move pull constants into pull_param[] and
+       * condense param[] to only contain the uniforms we chose to push.
+       *
+       * NOTE: Because we are condensing the params[] array, we know that
+       * push_constant_loc[i] <= i and we can do it in one smooth loop without
+       * having to make a copy.
+       */
+      for (unsigned int i = 0; i < uniforms; i++) {
+         uint32_t value = param[i];
+         if (pull_constant_loc[i] != -1) {
+            stage_prog_data->pull_param[pull_constant_loc[i]] = value;
+         } else if (push_constant_loc[i] != -1) {
+            stage_prog_data->param[push_constant_loc[i]] = value;
+         }
+      }
+      ralloc_free(param);
    } else {
-      stage_prog_data->param = NULL;
-   }
-   assert(stage_prog_data->nr_pull_params == 0);
-   assert(stage_prog_data->pull_param == NULL);
-   if (num_pull_constants > 0) {
-      stage_prog_data->nr_pull_params = num_pull_constants;
-      stage_prog_data->pull_param = rzalloc_array(mem_ctx, uint32_t,
-                                                  num_pull_constants);
+      /* If we don't want to compact anything, just set up dummy push/pull
+       * arrays.  All the rest of the compiler cares about are these arrays.
+       */
+      push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
+      pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
+
+      for (unsigned u = 0; u < uniforms; u++)
+         push_constant_loc[u] = u;
+
+      memset(pull_constant_loc, -1, uniforms * sizeof(*pull_constant_loc));
    }
 
    /* Now that we know how many regular uniforms we'll push, reduce the
@@ -2475,24 +2506,6 @@ fs_visitor::assign_constant_locations()
       push_length += range->length;
    }
    assert(push_length <= 64);
-
-   /* Up until now, the param[] array has been indexed by reg + offset
-    * of UNIFORM registers.  Move pull constants into pull_param[] and
-    * condense param[] to only contain the uniforms we chose to push.
-    *
-    * NOTE: Because we are condensing the params[] array, we know that
-    * push_constant_loc[i] <= i and we can do it in one smooth loop without
-    * having to make a copy.
-    */
-   for (unsigned int i = 0; i < uniforms; i++) {
-      uint32_t value = param[i];
-      if (pull_constant_loc[i] != -1) {
-         stage_prog_data->pull_param[pull_constant_loc[i]] = value;
-      } else if (push_constant_loc[i] != -1) {
-         stage_prog_data->param[push_constant_loc[i]] = value;
-      }
-   }
-   ralloc_free(param);
 }
 
 bool
diff --git a/src/intel/compiler/brw_vec4.cpp b/src/intel/compiler/brw_vec4.cpp
index 9fa946ccdb4..fb4cf365cb4 100644
--- a/src/intel/compiler/brw_vec4.cpp
+++ b/src/intel/compiler/brw_vec4.cpp
@@ -633,6 +633,9 @@ set_push_constant_loc(const int nr_uniforms, int *new_uniform_count,
 void
 vec4_visitor::pack_uniform_registers()
 {
+   if (!compiler->compact_params)
+      return;
+
    uint8_t chans_used[this->uniforms];
    int new_loc[this->uniforms];
    int new_chan[this->uniforms];
diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c
index 28a652867d9..0fc507c12a8 100644
--- a/src/intel/vulkan/anv_device.c
+++ b/src/intel/vulkan/anv_device.c
@@ -560,6 +560,7 @@ anv_physical_device_init(struct anv_physical_device *device,
    device->compiler->constant_buffer_0_is_relative =
       device->info.gen < 8 || !device->has_context_isolation;
    device->compiler->supports_shader_constants = true;
+   device->compiler->compact_params = true;
 
    /* Broadwell PRM says:
     *
diff --git a/src/mesa/drivers/dri/i965/intel_screen.c b/src/mesa/drivers/dri/i965/intel_screen.c
index 6040161d8f9..c690a8ede7c 100644
--- a/src/mesa/drivers/dri/i965/intel_screen.c
+++ b/src/mesa/drivers/dri/i965/intel_screen.c
@@ -2799,6 +2799,7 @@ __DRIconfig **intelInitScreen2(__DRIscreen *dri_screen)
       !(screen->kernel_features & KERNEL_ALLOWS_CONTEXT_ISOLATION);
 
    screen->compiler->supports_pull_constants = true;
+   screen->compiler->compact_params = true;
 
    screen->has_exec_fence =
       intel_get_boolean(screen, I915_PARAM_HAS_EXEC_FENCE);
-- 
2.30.2
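Editorial note (not part of the patch or of Mesa): a toy sketch of the
behavioral difference the new flag selects.  With compact_params set, the
back-end packs live uniform slots into dense push-constant locations (and
may demote some to pull constants); with it cleared,
assign_constant_locations() now sets up the identity layout seen in the new
else branch, and vec4's pack_uniform_registers() simply returns.  The helper
below is hypothetical and ignores alignment, chunking, MOV_INDIRECT sizing,
pull demotion, and the subgroup-ID slot that the real pass handles.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical helper mirroring the two paths: dense packing of live
 * slots when compacting, identity mapping when not. */
static void
layout_params(unsigned uniforms, const bool *live, bool compact_params,
              int *push_constant_loc)
{
   if (compact_params) {
      /* Compacting: live slots get consecutive push locations, dead
       * slots get -1 (the real pass may also demote chunks to pulls). */
      int num_push_constants = 0;
      for (unsigned u = 0; u < uniforms; u++)
         push_constant_loc[u] = live[u] ? num_push_constants++ : -1;
   } else {
      /* Not compacting: identity mapping, exactly like the new else
       * branch in brw_fs.cpp above. */
      for (unsigned u = 0; u < uniforms; u++)
         push_constant_loc[u] = u;
   }
}

int
main(void)
{
   const bool live[6] = { true, false, true, true, false, true };
   int loc[6];

   layout_params(6, live, true, loc);
   printf("compacted:  ");
   for (int i = 0; i < 6; i++)
      printf(" %d", loc[i]);
   printf("\n");                     /* compacted:   0 -1 1 2 -1 3 */

   layout_params(6, live, false, loc);
   printf("uncompacted:");
   for (int i = 0; i < 6; i++)
      printf(" %d", loc[i]);
   printf("\n");                     /* uncompacted: 0 1 2 3 4 5 */

   return 0;
}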
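A second editorial sketch, for the NOTE kept in the moved comment: because
compaction only ever moves a value to an index no greater than its own
(push_constant_loc[i] <= i), a single forward pass is safe even when source
and destination storage alias; no slot is overwritten before it has been
read.  The arrays below are invented for illustration and are not Mesa data
structures.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
   uint32_t param[5] = { 10, 11, 12, 13, 14 };
   /* Slots 1 and 4 are dead; the live ones compact downward, so every
    * destination index is <= its source index. */
   const int push_loc[5] = { 0, -1, 1, 2, -1 };

   for (int i = 0; i < 5; i++) {
      uint32_t value = param[i];
      if (push_loc[i] != -1) {
         assert(push_loc[i] <= i);   /* the invariant that makes this safe */
         param[push_loc[i]] = value; /* in place, no temporary copy needed */
      }
   }

   printf("%u %u %u\n", param[0], param[1], param[2]);  /* prints: 10 12 13 */
   return 0;
}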