From: Eric Anholt Date: Thu, 1 Dec 2016 20:15:10 +0000 (-0800) Subject: vc4: Upload CS/VS UBO uniforms together. X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=afad1f7d625d9402a0f7fe57287d536d7d9b4b5a;p=mesa.git vc4: Upload CS/VS UBO uniforms together. Same as I did for V3D, drop all this code trying to GC the non-indirectly-loaded uniforms from the UBO that's used for indirect access of gallium cb[0]. While it does successfully drop some of those, it came at the cost of uploading the VS's indirect uniforms twice, for the bin and render versions of the shader. With the UBO loads simplified, I was also able to easily backport V3D's change to pack a UBO offset into the uniform_data[] field so that we don't need to do the add of the uniform base in the shader. As a bonus, now vc4 doesn't depend on mesa/st type_size functions. total uniforms in shared programs: 25514 -> 25490 (-0.09%) total instructions in shared programs: 77019 -> 76836 (-0.24%) --- diff --git a/src/gallium/drivers/vc4/vc4_context.h b/src/gallium/drivers/vc4/vc4_context.h index e7cb831774c..f02992f07ee 100644 --- a/src/gallium/drivers/vc4/vc4_context.h +++ b/src/gallium/drivers/vc4/vc4_context.h @@ -121,25 +121,6 @@ struct vc4_uncompiled_shader { struct pipe_shader_state base; }; -struct vc4_ubo_range { - /** - * offset in bytes from the start of the ubo where this range is - * uploaded. - * - * Only set once used is set. - */ - uint32_t dst_offset; - - /** - * offset in bytes from the start of the gallium uniforms where the - * data comes from. - */ - uint32_t src_offset; - - /** size in bytes of this ubo range */ - uint32_t size; -}; - struct vc4_fs_inputs { /** * Array of the meanings of the VPM inputs this shader needs. 
@@ -157,9 +138,6 @@ struct vc4_compiled_shader { struct vc4_shader_uniform_info uniforms; - struct vc4_ubo_range *ubo_ranges; - uint32_t num_ubo_ranges; - uint32_t ubo_size; /** * VC4_DIRTY_* flags that, when set in vc4->dirty, mean that the * uniforms have to be rewritten (and therefore the shader state diff --git a/src/gallium/drivers/vc4/vc4_draw.c b/src/gallium/drivers/vc4/vc4_draw.c index 06785516cae..df95c313f34 100644 --- a/src/gallium/drivers/vc4/vc4_draw.c +++ b/src/gallium/drivers/vc4/vc4_draw.c @@ -343,6 +343,7 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) vc4_emit_state(pctx); bool needs_drawarrays_shader_state = false; + if ((vc4->dirty & (VC4_DIRTY_VTXBUF | VC4_DIRTY_VTXSTATE | VC4_DIRTY_PRIM_MODE | diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c index 4c284b6cd7c..7d13544ab28 100644 --- a/src/gallium/drivers/vc4/vc4_program.c +++ b/src/gallium/drivers/vc4/vc4_program.c @@ -38,7 +38,6 @@ #include "vc4_context.h" #include "vc4_qpu.h" #include "vc4_qir.h" -#include "mesa/state_tracker/st_glsl_types.h" static struct qreg ntq_get_src(struct vc4_compile *c, nir_src src, int i); @@ -51,12 +50,6 @@ type_size(const struct glsl_type *type) return glsl_count_attribute_slots(type, false); } -static int -uniforms_type_size(const struct glsl_type *type) -{ - return st_glsl_storage_type_size(type, false); -} - static void resize_qreg_array(struct vc4_compile *c, struct qreg **regs, @@ -99,43 +92,17 @@ static struct qreg indirect_uniform_load(struct vc4_compile *c, nir_intrinsic_instr *intr) { struct qreg indirect_offset = ntq_get_src(c, intr->src[0], 0); - uint32_t offset = nir_intrinsic_base(intr); - struct vc4_compiler_ubo_range *range = NULL; - unsigned i; - for (i = 0; i < c->num_uniform_ranges; i++) { - range = &c->ubo_ranges[i]; - if (offset >= range->src_offset && - offset < range->src_offset + range->size) { - break; - } - } - /* The driver-location-based offset always has to be 
within a declared - * uniform range. - */ - assert(range); - if (!range->used) { - range->used = true; - range->dst_offset = c->next_ubo_dst_offset; - c->next_ubo_dst_offset += range->size; - c->num_ubo_ranges++; - } - - offset -= range->src_offset; - - /* Adjust for where we stored the TGSI register base. */ - indirect_offset = qir_ADD(c, indirect_offset, - qir_uniform_ui(c, (range->dst_offset + - offset))); /* Clamp to [0, array size). Note that MIN/MAX are signed. */ + uint32_t range = nir_intrinsic_range(intr); indirect_offset = qir_MAX(c, indirect_offset, qir_uniform_ui(c, 0)); indirect_offset = qir_MIN_NOIMM(c, indirect_offset, - qir_uniform_ui(c, (range->dst_offset + - range->size - 4))); + qir_uniform_ui(c, range - 4)); qir_ADD_dest(c, qir_reg(QFILE_TEX_S_DIRECT, 0), indirect_offset, - qir_uniform(c, QUNIFORM_UBO0_ADDR, 0)); + qir_uniform(c, QUNIFORM_UBO0_ADDR, + nir_intrinsic_base(intr))); c->num_texture_samples++; @@ -859,24 +826,6 @@ add_output(struct vc4_compile *c, c->output_slots[decl_offset].swizzle = swizzle; } -static void -declare_uniform_range(struct vc4_compile *c, uint32_t start, uint32_t size) -{ - unsigned array_id = c->num_uniform_ranges++; - if (array_id >= c->ubo_ranges_array_size) { - c->ubo_ranges_array_size = MAX2(c->ubo_ranges_array_size * 2, - array_id + 1); - c->ubo_ranges = reralloc(c, c->ubo_ranges, - struct vc4_compiler_ubo_range, - c->ubo_ranges_array_size); - } - - c->ubo_ranges[array_id].dst_offset = 0; - c->ubo_ranges[array_id].src_offset = start; - c->ubo_ranges[array_id].size = size; - c->ubo_ranges[array_id].used = false; -} - static bool ntq_src_is_only_ssa_def_user(nir_src *src) { @@ -1698,19 +1647,6 @@ ntq_setup_outputs(struct vc4_compile *c) } } -static void -ntq_setup_uniforms(struct vc4_compile *c) -{ - nir_foreach_variable(var, &c->s->uniforms) { - uint32_t vec4_count = uniforms_type_size(var->type); - unsigned vec4_size = 4 * sizeof(float); - - declare_uniform_range(c, var->data.driver_location * vec4_size, - 
vec4_count * vec4_size); - - } -} - /** * Sets up the mapping from nir_register to struct qreg *. * @@ -2216,7 +2152,6 @@ nir_to_qir(struct vc4_compile *c) ntq_setup_inputs(c); ntq_setup_outputs(c); - ntq_setup_uniforms(c); /* Find the main function and emit the body. */ nir_foreach_function(function, c->s) { @@ -2677,39 +2612,6 @@ vc4_get_compiled_shader(struct vc4_context *vc4, enum qstage stage, shader->fs_threaded = c->fs_threaded; - /* Copy the compiler UBO range state to the compiled shader, dropping - * out arrays that were never referenced by an indirect load. - * - * (Note that QIR dead code elimination of an array access still - * leaves that array alive, though) - */ - if (c->num_ubo_ranges) { - shader->num_ubo_ranges = c->num_ubo_ranges; - shader->ubo_ranges = ralloc_array(shader, struct vc4_ubo_range, - c->num_ubo_ranges); - uint32_t j = 0; - for (int i = 0; i < c->num_uniform_ranges; i++) { - struct vc4_compiler_ubo_range *range = - &c->ubo_ranges[i]; - if (!range->used) - continue; - - shader->ubo_ranges[j].dst_offset = range->dst_offset; - shader->ubo_ranges[j].src_offset = range->src_offset; - shader->ubo_ranges[j].size = range->size; - shader->ubo_size += c->ubo_ranges[i].size; - j++; - } - } - if (shader->ubo_size) { - if (vc4_debug & VC4_DEBUG_SHADERDB) { - fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d UBO uniforms\n", - qir_get_stage_name(c->stage), - c->program_id, c->variant_id, - shader->ubo_size / 4); - } - } - if ((vc4_debug & VC4_DEBUG_SHADERDB) && stage == QSTAGE_FRAG) { fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d FS threads\n", qir_get_stage_name(c->stage), diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h index 17a0d0febc0..4d8bf60cf44 100644 --- a/src/gallium/drivers/vc4/vc4_qir.h +++ b/src/gallium/drivers/vc4/vc4_qir.h @@ -300,31 +300,6 @@ struct vc4_varying_slot { uint8_t swizzle; }; -struct vc4_compiler_ubo_range { - /** - * offset in bytes from the start of the ubo where this range is - * 
uploaded. - * - * Only set once used is set. - */ - uint32_t dst_offset; - - /** - * offset in bytes from the start of the gallium uniforms where the - * data comes from. - */ - uint32_t src_offset; - - /** size in bytes of this ubo range */ - uint32_t size; - - /** - * Set if this range is used by the shader for indirect uniforms - * access. - */ - bool used; -}; - struct vc4_key { struct vc4_uncompiled_shader *shader_state; struct { @@ -441,14 +416,6 @@ struct vc4_compile { uint32_t outputs_array_size; uint32_t uniforms_array_size; - struct vc4_compiler_ubo_range *ubo_ranges; - uint32_t ubo_ranges_array_size; - /** Number of uniform areas declared in ubo_ranges. */ - uint32_t num_uniform_ranges; - /** Number of uniform areas used for indirect addressed loads. */ - uint32_t num_ubo_ranges; - uint32_t next_ubo_dst_offset; - /* State for whether we're executing on each channel currently. 0 if * yes, otherwise a block number + 1 that the channel jumped to. */ diff --git a/src/gallium/drivers/vc4/vc4_uniforms.c b/src/gallium/drivers/vc4/vc4_uniforms.c index d12f5667045..dd07487ab16 100644 --- a/src/gallium/drivers/vc4/vc4_uniforms.c +++ b/src/gallium/drivers/vc4/vc4_uniforms.c @@ -22,6 +22,7 @@ */ #include "util/u_pack_color.h" +#include "util/u_upload_mgr.h" #include "util/format_srgb.h" #include "vc4_context.h" @@ -186,26 +187,6 @@ get_texrect_scale(struct vc4_texture_stateobj *texstate, return fui(1.0f / dim); } -static struct vc4_bo * -vc4_upload_ubo(struct vc4_context *vc4, - struct vc4_compiled_shader *shader, - const uint32_t *gallium_uniforms) -{ - if (!shader->ubo_size) - return NULL; - - struct vc4_bo *ubo = vc4_bo_alloc(vc4->screen, shader->ubo_size, "ubo"); - void *data = vc4_bo_map(ubo); - for (uint32_t i = 0; i < shader->num_ubo_ranges; i++) { - memcpy(data + shader->ubo_ranges[i].dst_offset, - ((const void *)gallium_uniforms + - shader->ubo_ranges[i].src_offset), - shader->ubo_ranges[i].size); - } - - return ubo; -} - void vc4_write_uniforms(struct 
vc4_context *vc4, struct vc4_compiled_shader *shader, struct vc4_constbuf_stateobj *cb, @@ -214,7 +195,6 @@ vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader, struct vc4_shader_uniform_info *uinfo = &shader->uniforms; struct vc4_job *job = vc4->job; const uint32_t *gallium_uniforms = cb->cb[0].user_buffer; - struct vc4_bo *ubo = vc4_upload_ubo(vc4, shader, gallium_uniforms); cl_ensure_space(&job->uniforms, (uinfo->count + uinfo->num_texture_samples) * 4); @@ -272,8 +252,23 @@ vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader, break; case QUNIFORM_UBO0_ADDR: + /* Constant buffer 0 may be a system memory pointer, + * in which case we want to upload a shadow copy to + * the GPU. + */ + if (!cb->cb[0].buffer) { + u_upload_data(vc4->uploader, 0, + cb->cb[0].buffer_size, 16, + cb->cb[0].user_buffer, + &cb->cb[0].buffer_offset, + &cb->cb[0].buffer); + } + cl_aligned_reloc(job, &job->uniforms, - &uniforms, ubo, data); + &uniforms, + vc4_resource(cb->cb[0].buffer)->bo, + cb->cb[0].buffer_offset + + data); break; case QUNIFORM_UBO1_ADDR: { @@ -374,8 +369,6 @@ vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader, } cl_end(&job->uniforms, uniforms); - - vc4_bo_unreference(&ubo); } void