From 112c65825fddd00a2136c75e09982e1878c944a4 Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Mon, 20 Apr 2020 16:39:54 -0700 Subject: [PATCH] freedreno/a6xx: Use LDC for UBO loads. It saves addressing math, but may cause multiple loads to be done and bcseled due to NIR not giving us good address alignment information currently. I don't have any workloads I know of using non-const-uploaded UBOs, so I don't have perf numbers for it. This makes us match the GLES blob's behavior, and turnip (other than being bindful). Part-of: --- src/freedreno/ir3/ir3_nir.c | 10 ++- src/freedreno/ir3/ir3_nir.h | 2 +- src/freedreno/ir3/ir3_nir_lower_io_offsets.c | 23 +++--- src/freedreno/ir3/ir3_shader.h | 6 ++ .../drivers/freedreno/a6xx/fd6_const.c | 76 ++++++++++--------- .../drivers/freedreno/a6xx/fd6_program.c | 2 +- 6 files changed, 65 insertions(+), 54 deletions(-) diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c index 07acb27c6b3..b3f784a557e 100644 --- a/src/freedreno/ir3/ir3_nir.c +++ b/src/freedreno/ir3/ir3_nir.c @@ -317,7 +317,7 @@ ir3_optimize_nir(struct ir3_shader *shader, nir_shader *s, const bool ubo_progress = !key && OPT(s, ir3_nir_analyze_ubo_ranges, shader); const bool idiv_progress = OPT(s, nir_lower_idiv, nir_lower_idiv_fast); /* UBO offset lowering has to come after we've decided what will be left as load_ubo */ - OPT_V(s, ir3_nir_lower_io_offsets); + OPT_V(s, ir3_nir_lower_io_offsets, shader->compiler->gpu_id); if (ubo_progress || idiv_progress) ir3_optimize_loop(s); @@ -449,7 +449,13 @@ ir3_setup_const_state(struct ir3_shader *shader, nir_shader *nir) MAX2(const_state->num_driver_params, IR3_DP_VTXCNT_MAX + 1); } - const_state->num_ubos = nir->info.num_ubos; + /* On a6xx, we use UBO descriptors and LDC instead of UBO pointers in the + * constbuf.
+ */ + if (compiler->gpu_id >= 600) + shader->num_ubos = nir->info.num_ubos; + else + const_state->num_ubos = nir->info.num_ubos; /* num_driver_params is scalar, align to vec4: */ const_state->num_driver_params = align(const_state->num_driver_params, 4); diff --git a/src/freedreno/ir3/ir3_nir.h b/src/freedreno/ir3/ir3_nir.h index 80cd0870374..bd29da7c6c6 100644 --- a/src/freedreno/ir3/ir3_nir.h +++ b/src/freedreno/ir3/ir3_nir.h @@ -36,7 +36,7 @@ bool ir3_nir_apply_trig_workarounds(nir_shader *shader); bool ir3_nir_lower_imul(nir_shader *shader); bool ir3_nir_lower_tg4_to_tex(nir_shader *shader); -bool ir3_nir_lower_io_offsets(nir_shader *shader); +bool ir3_nir_lower_io_offsets(nir_shader *shader, int gpu_id); bool ir3_nir_lower_load_barycentric_at_sample(nir_shader *shader); bool ir3_nir_lower_load_barycentric_at_offset(nir_shader *shader); bool ir3_nir_move_varying_inputs(nir_shader *shader); diff --git a/src/freedreno/ir3/ir3_nir_lower_io_offsets.c b/src/freedreno/ir3/ir3_nir_lower_io_offsets.c index 429cde5bfd7..2d035eedd23 100644 --- a/src/freedreno/ir3/ir3_nir_lower_io_offsets.c +++ b/src/freedreno/ir3/ir3_nir_lower_io_offsets.c @@ -255,15 +255,12 @@ lower_offset_for_ssbo(nir_intrinsic_instr *intrinsic, nir_builder *b, } static bool -lower_offset_for_ubo(nir_intrinsic_instr *intrinsic, nir_builder *b) +lower_offset_for_ubo(nir_intrinsic_instr *intrinsic, nir_builder *b, int gpu_id) { - /* We only need to lower offset if using LDC. Currently, we only use LDC - * in the bindless mode. Also, LDC is introduced on A6xx, but currently we - * only use bindless in turnip which is A6xx only. - * - * TODO: We should be using LDC always on A6xx+. + /* We only need to lower offset if using LDC, which takes an offset in + * vec4 units and has the start component baked into the instruction. 
*/ - if (!ir3_bindless_resource(intrinsic->src[0])) + if (gpu_id < 600) return false; /* TODO handle other bitsizes, including non-dword-aligned loads */ @@ -333,7 +330,7 @@ lower_offset_for_ubo(nir_intrinsic_instr *intrinsic, nir_builder *b) } static bool -lower_io_offsets_block(nir_block *block, nir_builder *b, void *mem_ctx) +lower_io_offsets_block(nir_block *block, nir_builder *b, void *mem_ctx, int gpu_id) { bool progress = false; @@ -345,7 +342,7 @@ lower_io_offsets_block(nir_block *block, nir_builder *b, void *mem_ctx) /* UBO */ if (intr->intrinsic == nir_intrinsic_load_ubo) { - progress |= lower_offset_for_ubo(intr, b); + progress |= lower_offset_for_ubo(intr, b, gpu_id); continue; } @@ -364,7 +361,7 @@ lower_io_offsets_block(nir_block *block, nir_builder *b, void *mem_ctx) } static bool -lower_io_offsets_func(nir_function_impl *impl) +lower_io_offsets_func(nir_function_impl *impl, int gpu_id) { void *mem_ctx = ralloc_parent(impl); nir_builder b; @@ -372,7 +369,7 @@ lower_io_offsets_func(nir_function_impl *impl) bool progress = false; nir_foreach_block_safe (block, impl) { - progress |= lower_io_offsets_block(block, &b, mem_ctx); + progress |= lower_io_offsets_block(block, &b, mem_ctx, gpu_id); } if (progress) { @@ -384,13 +381,13 @@ lower_io_offsets_func(nir_function_impl *impl) } bool -ir3_nir_lower_io_offsets(nir_shader *shader) +ir3_nir_lower_io_offsets(nir_shader *shader, int gpu_id) { bool progress = false; nir_foreach_function (function, shader) { if (function->impl) - progress |= lower_io_offsets_func(function->impl); + progress |= lower_io_offsets_func(function->impl, gpu_id); } return progress; diff --git a/src/freedreno/ir3/ir3_shader.h b/src/freedreno/ir3/ir3_shader.h index e401498612f..d623cf4fdaf 100644 --- a/src/freedreno/ir3/ir3_shader.h +++ b/src/freedreno/ir3/ir3_shader.h @@ -607,6 +607,12 @@ struct ir3_shader { struct ir3_compiler *compiler; struct ir3_ubo_analysis_state ubo_state; + + /* Number of UBOs loaded by LDC, as opposed to LDG 
through pointers in + * ubo_state. + */ + unsigned num_ubos; + struct ir3_const_state const_state; struct nir_shader *nir; diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_const.c b/src/gallium/drivers/freedreno/a6xx/fd6_const.c index 86ea1d1e41c..1d24d8aafe2 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_const.c +++ b/src/gallium/drivers/freedreno/a6xx/fd6_const.c @@ -104,39 +104,6 @@ fd6_emit_const(struct fd_ringbuffer *ring, gl_shader_stage type, } } -static void -fd6_emit_const_bo(struct fd_ringbuffer *ring, gl_shader_stage type, - uint32_t regid, uint32_t num, struct pipe_resource **prscs, uint32_t *offsets) -{ - uint32_t anum = align(num, 2); - uint32_t i; - - debug_assert((regid % 4) == 0); - - OUT_PKT7(ring, fd6_stage2opcode(type), 3 + (2 * anum)); - OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(regid/4) | - CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS)| - CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) | - CP_LOAD_STATE6_0_STATE_BLOCK(fd6_stage2shadersb(type)) | - CP_LOAD_STATE6_0_NUM_UNIT(anum/2)); - OUT_RING(ring, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0)); - OUT_RING(ring, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0)); - - for (i = 0; i < num; i++) { - if (prscs[i]) { - OUT_RELOC(ring, fd_resource(prscs[i])->bo, offsets[i], 0, 0); - } else { - OUT_RING(ring, 0xbad00000 | (i << 16)); - OUT_RING(ring, 0xbad00000 | (i << 16)); - } - } - - for (; i < anum; i++) { - OUT_RING(ring, 0xffffffff); - OUT_RING(ring, 0xffffffff); - } -} - static bool is_stateobj(struct fd_ringbuffer *ring) { @@ -160,9 +127,7 @@ emit_const_bo(struct fd_ringbuffer *ring, const struct ir3_shader_variant *v, uint32_t dst_offset, uint32_t num, struct pipe_resource **prscs, uint32_t *offsets) { - /* TODO inline this */ - assert(dst_offset + num < v->constlen * 4); - fd6_emit_const_bo(ring, v->type, dst_offset, num, prscs, offsets); + unreachable("shouldn't be called on a6xx"); } static void @@ -262,6 +227,42 @@ emit_tess_consts(struct fd6_emit *emit) fd6_emit_take_group(emit, constobj, FD6_GROUP_PRIMITIVE_PARAMS, 
ENABLE_ALL); } +static void +fd6_emit_ubos(const struct ir3_shader_variant *v, + struct fd_ringbuffer *ring, struct fd_constbuf_stateobj *constbuf) +{ + if (!v->shader->num_ubos) + return; + + int num_ubos = v->shader->num_ubos; + + OUT_PKT7(ring, fd6_stage2opcode(v->type), 3 + (2 * num_ubos)); + OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(0) | + CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO)| + CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) | + CP_LOAD_STATE6_0_STATE_BLOCK(fd6_stage2shadersb(v->type)) | + CP_LOAD_STATE6_0_NUM_UNIT(num_ubos)); + OUT_RING(ring, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0)); + OUT_RING(ring, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0)); + + for (int i = 0; i < num_ubos; i++) { + /* Note: gallium constbuf 0 was always lowered to hardware constbuf, + * and UBO load indices decremented by one. + */ + struct pipe_constant_buffer *cb = &constbuf->cb[i + 1]; + if (cb->buffer) { + int size_vec4s = DIV_ROUND_UP(cb->buffer_size, 16); + OUT_RELOC(ring, fd_resource(cb->buffer)->bo, + cb->buffer_offset, + (uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32, + 0); + } else { + OUT_RING(ring, 0xbad00000 | (i << 16)); + OUT_RING(ring, 0xbad00000 | (i << 16)); + } + } +} + static void emit_user_consts(struct fd6_emit *emit) { @@ -288,7 +289,7 @@ emit_user_consts(struct fd6_emit *emit) if (!variants[i]) continue; ir3_emit_user_consts(ctx->screen, variants[i], constobj, &ctx->constbuf[types[i]]); - ir3_emit_ubos(ctx->screen, variants[i], constobj, &ctx->constbuf[types[i]]); + fd6_emit_ubos(variants[i], constobj, &ctx->constbuf[types[i]]); } fd6_emit_take_group(emit, constobj, FD6_GROUP_CONST, ENABLE_ALL); @@ -335,6 +336,7 @@ fd6_emit_cs_consts(const struct ir3_shader_variant *v, struct fd_ringbuffer *rin struct fd_context *ctx, const struct pipe_grid_info *info) { ir3_emit_cs_consts(v, ring, ctx, info); + fd6_emit_ubos(v, ring, &ctx->constbuf[PIPE_SHADER_COMPUTE]); } void diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_program.c b/src/gallium/drivers/freedreno/a6xx/fd6_program.c index 
135adfa59c8..58613d105c4 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_program.c +++ b/src/gallium/drivers/freedreno/a6xx/fd6_program.c @@ -1022,7 +1022,7 @@ fd6_shader_state_create(struct pipe_context *pctx, const struct pipe_shader_stat /* also account for UBO addresses: */ packets += 1; - size += 2 * align(shader->const_state.num_ubos, 2); + size += 2 * shader->num_ubos; unsigned sizedwords = (4 * packets) + size; shader->ubo_state.cmdstream_size = sizedwords * 4; -- 2.30.2