From 16f2770eb40e7e11d149b4551de27d8d663f4e22 Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Tue, 19 Mar 2019 09:58:14 -0700 Subject: [PATCH] v3d: Upload all of UBO[0] if any indirect load occurs. The idea was that we could skip uploading the constant-indexed uniform data and just upload the uniforms that are variably-indexed. However, since the VS bin and render shaders may have a different set of uniforms used, this meant that we had to upload the UBO for each of them. The first case is generally a fairly small impact (usually the uniform array is the most space, other than a couple of FSes in shader-db), while the second is a larger impact: 3DMMES2 was uploading 38k/frame of uniforms instead of 18k. Given that the optimization is of dubious value, has a big downside, and is quite a bit of code, just drop it. No change in shader-db. No change on 3DMMES2 (n=15). --- src/broadcom/compiler/nir_to_vir.c | 65 +------------------------- src/broadcom/compiler/v3d_compiler.h | 29 ------------ src/broadcom/compiler/vir.c | 36 -------------- src/gallium/drivers/v3d/v3d_uniforms.c | 57 ++++++++-------------- 4 files changed, 20 insertions(+), 167 deletions(-) diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c index 846a2a704af..2c411b86ed1 100644 --- a/src/broadcom/compiler/nir_to_vir.c +++ b/src/broadcom/compiler/nir_to_vir.c @@ -231,31 +231,7 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, struct qreg offset; if (instr->intrinsic == nir_intrinsic_load_uniform) { - /* Find what variable in the default uniform block this - * uniform load is coming from. - */ - uint32_t base = nir_intrinsic_base(instr); - int i; - struct v3d_ubo_range *range = NULL; - for (i = 0; i < c->num_ubo_ranges; i++) { - range = &c->ubo_ranges[i]; - if (base >= range->src_offset && - base < range->src_offset + range->size) { - break; - } - } - /* The driver-location-based offset always has to be within a - * declared uniform range. - */ - assert(i != c->num_ubo_ranges); - if (!c->ubo_range_used[i]) { - c->ubo_range_used[i] = true; - range->dst_offset = c->next_ubo_dst_offset; - c->next_ubo_dst_offset += range->size; - } - - const_offset += base - range->src_offset + range->dst_offset; - + const_offset += nir_intrinsic_base(instr); offset = vir_uniform(c, QUNIFORM_UBO_ADDR, v3d_unit_data_create(0, const_offset)); const_offset = 0; @@ -668,27 +644,6 @@ add_output(struct v3d_compile *c, v3d_slot_from_slot_and_component(slot, swizzle); } -static void -declare_uniform_range(struct v3d_compile *c, uint32_t start, uint32_t size) -{ - unsigned array_id = c->num_ubo_ranges++; - if (array_id >= c->ubo_ranges_array_size) { - c->ubo_ranges_array_size = MAX2(c->ubo_ranges_array_size * 2, - array_id + 1); - c->ubo_ranges = reralloc(c, c->ubo_ranges, - struct v3d_ubo_range, - c->ubo_ranges_array_size); - c->ubo_range_used = reralloc(c, c->ubo_range_used, - bool, - c->ubo_ranges_array_size); - } - - c->ubo_ranges[array_id].dst_offset = 0; - c->ubo_ranges[array_id].src_offset = start; - c->ubo_ranges[array_id].size = size; - c->ubo_range_used[array_id] = false; -} - /** * If compare_instr is a valid comparison instruction, emits the * compare_instr's comparison and returns the sel_instr's return value based @@ -1536,23 +1491,6 @@ ntq_setup_outputs(struct v3d_compile *c) } } -static void -ntq_setup_uniforms(struct v3d_compile *c) -{ - nir_foreach_variable(var, &c->s->uniforms) { - uint32_t vec4_count = glsl_count_attribute_slots(var->type, - false); - unsigned vec4_size = 4 * sizeof(float); - - if (var->data.mode != nir_var_uniform) - continue; - - declare_uniform_range(c, var->data.driver_location * vec4_size, - vec4_count * vec4_size); - - } -} - /** * Sets up the mapping from nir_register to struct qreg *. * @@ -2361,7 +2299,6 @@ nir_to_vir(struct v3d_compile *c) ntq_setup_vpm_inputs(c); ntq_setup_outputs(c); - ntq_setup_uniforms(c); ntq_setup_registers(c, &c->s->registers); /* Find the main function and emit the body. */ diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h index d8d916fb14b..155e112d8cd 100644 --- a/src/broadcom/compiler/v3d_compiler.h +++ b/src/broadcom/compiler/v3d_compiler.h @@ -318,25 +318,6 @@ static inline uint8_t v3d_slot_get_component(struct v3d_varying_slot slot) return slot.slot_and_component & 3; } -struct v3d_ubo_range { - /** - * offset in bytes from the start of the ubo where this range is - * uploaded. - * - * Only set once used is set. - */ - uint32_t dst_offset; - - /** - * offset in bytes from the start of the gallium uniforms where the - * data comes from. - */ - uint32_t src_offset; - - /** size in bytes of this ubo range */ - uint32_t size; -}; - struct v3d_key { void *shader_state; struct { @@ -533,13 +514,6 @@ struct v3d_compile { bool uses_center_w; bool writes_z; - struct v3d_ubo_range *ubo_ranges; - bool *ubo_range_used; - uint32_t ubo_ranges_array_size; - /** Number of uniform areas tracked in ubo_ranges. */ - uint32_t num_ubo_ranges; - uint32_t next_ubo_dst_offset; - /* State for whether we're executing on each channel currently. 0 if * yes, otherwise a block number + 1 that the channel jumped to. */ @@ -674,9 +648,6 @@ struct v3d_uniform_list { struct v3d_prog_data { struct v3d_uniform_list uniforms; - struct v3d_ubo_range *ubo_ranges; - uint32_t num_ubo_ranges; - uint32_t ubo_size; uint32_t spill_size; uint8_t threads; diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c index bb04c82d777..8963dd1e350 100644 --- a/src/broadcom/compiler/vir.c +++ b/src/broadcom/compiler/vir.c @@ -582,41 +582,6 @@ v3d_set_prog_data_uniforms(struct v3d_compile *c, count * sizeof(*ulist->contents)); } -/* Copy the compiler UBO range state to the compiled shader, dropping out - * arrays that were never referenced by an indirect load. - * - * (Note that QIR dead code elimination of an array access still leaves that - * array alive, though) - */ -static void -v3d_set_prog_data_ubo(struct v3d_compile *c, - struct v3d_prog_data *prog_data) -{ - if (!c->num_ubo_ranges) - return; - - prog_data->num_ubo_ranges = 0; - prog_data->ubo_ranges = ralloc_array(prog_data, struct v3d_ubo_range, - c->num_ubo_ranges); - for (int i = 0; i < c->num_ubo_ranges; i++) { - if (!c->ubo_range_used[i]) - continue; - - struct v3d_ubo_range *range = &c->ubo_ranges[i]; - prog_data->ubo_ranges[prog_data->num_ubo_ranges++] = *range; - prog_data->ubo_size += range->size; - } - - if (prog_data->ubo_size) { - if (V3D_DEBUG & V3D_DEBUG_SHADERDB) { - fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d UBO uniforms\n", - vir_get_stage_name(c), - c->program_id, c->variant_id, - prog_data->ubo_size / 4); - } - } -} - static void v3d_vs_set_prog_data(struct v3d_compile *c, struct v3d_vs_prog_data *prog_data) @@ -713,7 +678,6 @@ v3d_set_prog_data(struct v3d_compile *c, prog_data->spill_size = c->spill_size; v3d_set_prog_data_uniforms(c, prog_data); - v3d_set_prog_data_ubo(c, prog_data); if (c->s->info.stage == MESA_SHADER_VERTEX) { v3d_vs_set_prog_data(c, (struct v3d_vs_prog_data *)prog_data); diff --git a/src/gallium/drivers/v3d/v3d_uniforms.c b/src/gallium/drivers/v3d/v3d_uniforms.c index b48f6526d61..a5532bdf2b4 100644 --- a/src/gallium/drivers/v3d/v3d_uniforms.c +++ b/src/gallium/drivers/v3d/v3d_uniforms.c @@ -22,6 +22,7 @@ */ #include "util/u_pack_color.h" +#include "util/u_upload_mgr.h" #include "util/format_srgb.h" #include "v3d_context.h" @@ -95,28 +96,6 @@ get_image_size(struct v3d_shaderimg_stateobj *shaderimg, } } -static struct v3d_bo * -v3d_upload_ubo(struct v3d_context *v3d, - struct v3d_compiled_shader *shader, - const uint32_t *gallium_uniforms) -{ - if (!shader->prog_data.base->ubo_size) - return NULL; - - struct v3d_bo *ubo = v3d_bo_alloc(v3d->screen, - shader->prog_data.base->ubo_size, - "ubo"); - void *data = v3d_bo_map(ubo); - for (uint32_t i = 0; i < shader->prog_data.base->num_ubo_ranges; i++) { - memcpy(data + shader->prog_data.base->ubo_ranges[i].dst_offset, - ((const void *)gallium_uniforms + - shader->prog_data.base->ubo_ranges[i].src_offset), - shader->prog_data.base->ubo_ranges[i].size); - } - - return ubo; -} - /** * Writes the V3D 3.x P0 (CFG_MODE=1) texture parameter. * @@ -235,7 +214,6 @@ v3d_write_uniforms(struct v3d_context *v3d, struct v3d_compiled_shader *shader, struct v3d_uniform_list *uinfo = &shader->prog_data.base->uniforms; struct v3d_job *job = v3d->job; const uint32_t *gallium_uniforms = cb->cb[0].user_buffer; - struct v3d_bo *ubo = v3d_upload_ubo(v3d, shader, gallium_uniforms); /* We always need to return some space for uniforms, because the HW * will be prefetching, even if we don't read any in the program. @@ -329,21 +307,26 @@ v3d_write_uniforms(struct v3d_context *v3d, struct v3d_compiled_shader *shader, v3d->zsa->base.alpha.ref_value); break; - case QUNIFORM_UBO_ADDR: - if (data == 0) { - cl_aligned_reloc(&job->indirect, &uniforms, - ubo, 0); - } else { - int ubo_index = v3d_unit_data_get_unit(data); - struct v3d_resource *rsc = - v3d_resource(cb->cb[ubo_index].buffer); - - cl_aligned_reloc(&job->indirect, &uniforms, - rsc->bo, - cb->cb[ubo_index].buffer_offset + - v3d_unit_data_get_offset(data)); + case QUNIFORM_UBO_ADDR: { + uint32_t unit = v3d_unit_data_get_unit(data); + /* Constant buffer 0 may be a system memory pointer, + * in which case we want to upload a shadow copy to + * the GPU. + */ + if (!cb->cb[unit].buffer) { + u_upload_data(v3d->uploader, 0, + cb->cb[unit].buffer_size, 16, + cb->cb[unit].user_buffer, + &cb->cb[unit].buffer_offset, + &cb->cb[unit].buffer); } + + cl_aligned_reloc(&job->indirect, &uniforms, + v3d_resource(cb->cb[unit].buffer)->bo, + cb->cb[unit].buffer_offset + + v3d_unit_data_get_offset(data)); break; + } case QUNIFORM_SSBO_OFFSET: { struct pipe_shader_buffer *sb = @@ -397,8 +380,6 @@ v3d_write_uniforms(struct v3d_context *v3d, struct v3d_compiled_shader *shader, cl_end(&job->indirect, uniforms); - v3d_bo_unreference(&ubo); - return uniform_stream; } -- 2.30.2