From fc10dc9fdea6ad7d04dfcdb8fd2e2d59ea67f68b Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Wed, 21 Dec 2016 22:43:52 -0500 Subject: [PATCH] freedreno/ir3: rework location of driver constants Rework how we lay out driver constants (driver-params, UBO/TFBO buffer addresses, immediates) for more flexibility. For a5xx+ we need to deal with the fact that gpu ptrs are 64b instead of 32b, which makes the fixed offset scheme not work so well. While we are dealing with that we might also make the layout more dynamic to account for varying # of UBOs, etc. Signed-off-by: Rob Clark --- src/gallium/drivers/freedreno/a3xx/fd3_emit.c | 9 ++-- src/gallium/drivers/freedreno/a4xx/fd4_emit.c | 9 ++-- .../drivers/freedreno/ir3/ir3_compiler_nir.c | 54 +++++++++++++------ src/gallium/drivers/freedreno/ir3/ir3_cp.c | 2 +- .../drivers/freedreno/ir3/ir3_shader.c | 16 +++--- .../drivers/freedreno/ir3/ir3_shader.h | 38 ++++++------- 6 files changed, 75 insertions(+), 53 deletions(-) diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c index 95e6d26591c..6c3458a3b08 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c @@ -96,16 +96,16 @@ static void fd3_emit_const_bo(struct fd_ringbuffer *ring, enum shader_t type, boolean write, uint32_t regid, uint32_t num, struct pipe_resource **prscs, uint32_t *offsets) { + uint32_t anum = align(num, 4); uint32_t i; debug_assert((regid % 4) == 0); - debug_assert((num % 4) == 0); - OUT_PKT3(ring, CP_LOAD_STATE, 2 + num); + OUT_PKT3(ring, CP_LOAD_STATE, 2 + anum); OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(regid/2) | CP_LOAD_STATE_0_STATE_SRC(SS_DIRECT) | CP_LOAD_STATE_0_STATE_BLOCK(sb[type]) | - CP_LOAD_STATE_0_NUM_UNIT(num/2)); + CP_LOAD_STATE_0_NUM_UNIT(anum/2)); OUT_RING(ring, CP_LOAD_STATE_1_EXT_SRC_ADDR(0) | CP_LOAD_STATE_1_STATE_TYPE(ST_CONSTANTS)); @@ -120,6 +120,9 @@ fd3_emit_const_bo(struct fd_ringbuffer *ring, enum shader_t type, boolean 
write, OUT_RING(ring, 0xbad00000 | (i << 16)); } } + + for (; i < anum; i++) + OUT_RING(ring, 0xffffffff); } #define VERT_TEX_OFF 0 diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c index 9231823cb7f..2f3e0a6981b 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c @@ -96,16 +96,16 @@ static void fd4_emit_const_bo(struct fd_ringbuffer *ring, enum shader_t type, boolean write, uint32_t regid, uint32_t num, struct pipe_resource **prscs, uint32_t *offsets) { + uint32_t anum = align(num, 4); uint32_t i; debug_assert((regid % 4) == 0); - debug_assert((num % 4) == 0); - OUT_PKT3(ring, CP_LOAD_STATE, 2 + num); + OUT_PKT3(ring, CP_LOAD_STATE, 2 + anum); OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(regid/4) | CP_LOAD_STATE_0_STATE_SRC(SS_DIRECT) | CP_LOAD_STATE_0_STATE_BLOCK(sb[type]) | - CP_LOAD_STATE_0_NUM_UNIT(num/4)); + CP_LOAD_STATE_0_NUM_UNIT(anum/4)); OUT_RING(ring, CP_LOAD_STATE_1_EXT_SRC_ADDR(0) | CP_LOAD_STATE_1_STATE_TYPE(ST_CONSTANTS)); @@ -120,6 +120,9 @@ fd4_emit_const_bo(struct fd_ringbuffer *ring, enum shader_t type, boolean write, OUT_RING(ring, 0xbad00000 | (i << 16)); } } + + for (; i < anum; i++) + OUT_RING(ring, 0xffffffff); } static void diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c index ac6840cd609..e0fc2aa49bd 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c @@ -119,6 +119,11 @@ struct ir3_compile { bool error; }; +/* gpu pointer size in units of 32bit registers/slots */ +static unsigned pointer_size(struct ir3_compile *ctx) +{ + return (ctx->compiler->gpu_id >= 500) ? 
2 : 1; +} static struct ir3_instruction * create_immed(struct ir3_block *block, uint32_t val); static struct ir3_block * get_block(struct ir3_compile *ctx, nir_block *nblock); @@ -181,31 +186,46 @@ compile_init(struct ir3_compiler *compiler, nir_print_shader(ctx->s, stdout); } - so->first_driver_param = so->first_immediate = align(ctx->s->num_uniforms, 4); + so->num_uniforms = ctx->s->num_uniforms; + so->num_ubos = ctx->s->info->num_ubos; - /* Layout of constant registers: + /* Layout of constant registers, each section aligned to vec4. Note + * that pointer size (ubo, etc) changes depending on generation. * - * num_uniform * vec4 - user consts - * 4 * vec4 - UBO addresses + * user consts + * UBO addresses * if (vertex shader) { - * N * vec4 - driver params (IR3_DP_*) - * 1 * vec4 - stream-out addresses + * driver params (IR3_DP_*) + * if (stream_output.num_outputs > 0) + * stream-out addresses * } + * immediates * - * TODO this could be made more dynamic, to at least skip sections - * that we don't need.. + * Immediates go last mostly because they are inserted in the CP pass + * after the nir -> ir3 frontend. 
*/ + unsigned constoff = align(ctx->s->num_uniforms, 4); + unsigned ptrsz = pointer_size(ctx); - /* reserve 4 (vec4) slots for ubo base addresses: */ - so->first_immediate += 4; + memset(&so->constbase, ~0, sizeof(so->constbase)); + + if (so->num_ubos > 0) { + so->constbase.ubo = constoff; + constoff += align(ctx->s->info->num_ubos * ptrsz, 4) / 4; + } if (so->type == SHADER_VERTEX) { - /* driver params (see ir3_driver_param): */ - so->first_immediate += IR3_DP_COUNT/4; /* convert to vec4 */ - /* one (vec4) slot for stream-output base addresses: */ - so->first_immediate++; + so->constbase.driver_param = constoff; + constoff += align(IR3_DP_COUNT, 4) / 4; + + if (so->shader->stream_output.num_outputs > 0) { + so->constbase.tfbo = constoff; + constoff += align(PIPE_MAX_SO_BUFFERS * ptrsz, 4) / 4; + } } + so->constbase.immediate = constoff; + return ctx; } @@ -576,7 +596,7 @@ create_driver_param(struct ir3_compile *ctx, enum ir3_driver_param dp) { /* first four vec4 sysval's reserved for UBOs: */ /* NOTE: dp is in scalar, but there can be >4 dp components: */ - unsigned n = ctx->so->first_driver_param + IR3_DRIVER_PARAM_OFF; + unsigned n = ctx->so->constbase.driver_param; unsigned r = regid(n + dp / 4, dp % 4); return create_uniform(ctx, r); } @@ -975,7 +995,7 @@ emit_intrinsic_load_ubo(struct ir3_compile *ctx, nir_intrinsic_instr *intr, struct ir3_instruction *addr, *src0, *src1; nir_const_value *const_offset; /* UBO addresses are the first driver params: */ - unsigned ubo = regid(ctx->so->first_driver_param + IR3_UBOS_OFF, 0); + unsigned ubo = regid(ctx->so->constbase.ubo, 0); int off = 0; /* First src is ubo index, which could either be an immed or not: */ @@ -1905,7 +1925,7 @@ emit_stream_out(struct ir3_compile *ctx) unsigned stride = strmout->stride[i]; struct ir3_instruction *base, *off; - base = create_uniform(ctx, regid(v->first_driver_param + IR3_TFBOS_OFF, i)); + base = create_uniform(ctx, regid(v->constbase.tfbo, i)); /* 24-bit should be enough: */ off = 
ir3_MUL_U(ctx->block, vtxcnt, 0, diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cp.c b/src/gallium/drivers/freedreno/ir3/ir3_cp.c index 57c37e26372..71e02615c75 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_cp.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_cp.c @@ -296,7 +296,7 @@ lower_immed(struct ir3_cp_ctx *ctx, struct ir3_register *reg, unsigned new_flags new_flags &= ~IR3_REG_IMMED; new_flags |= IR3_REG_CONST; reg->flags = new_flags; - reg->num = i + (4 * ctx->so->first_immediate); + reg->num = i + (4 * ctx->so->constbase.immediate); return reg; } diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.c b/src/gallium/drivers/freedreno/ir3/ir3_shader.c index 8920225be13..4da7246a0cf 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_shader.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.c @@ -366,7 +366,7 @@ ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin) } for (i = 0; i < so->immediates_count; i++) { - debug_printf("@const(c%d.x)\t", so->first_immediate + i); + debug_printf("@const(c%d.x)\t", so->constbase.immediate + i); debug_printf("0x%08x, 0x%08x, 0x%08x, 0x%08x\n", so->immediates[i].val[0], so->immediates[i].val[1], @@ -503,7 +503,7 @@ emit_user_consts(struct fd_context *ctx, const struct ir3_shader_variant *v, * the user consts early to avoid HLSQ lockup caused by * writing too many consts */ - uint32_t max_const = MIN2(v->first_driver_param, v->constlen); + uint32_t max_const = MIN2(v->num_uniforms, v->constlen); // I expect that size should be a multiple of vec4's: assert(size == align(size, 4)); @@ -527,9 +527,9 @@ static void emit_ubos(struct fd_context *ctx, const struct ir3_shader_variant *v, struct fd_ringbuffer *ring, struct fd_constbuf_stateobj *constbuf) { - uint32_t offset = v->first_driver_param + IR3_UBOS_OFF; + uint32_t offset = v->constbase.ubo; if (v->constlen > offset) { - uint32_t params = MIN2(4, v->constlen - offset) * 4; + uint32_t params = v->num_ubos; uint32_t offsets[params]; struct pipe_resource 
*prscs[params]; @@ -557,7 +557,7 @@ emit_immediates(struct fd_context *ctx, const struct ir3_shader_variant *v, struct fd_ringbuffer *ring) { int size = v->immediates_count; - uint32_t base = v->first_immediate; + uint32_t base = v->constbase.immediate; /* truncate size to avoid writing constants that shader * does not use: @@ -581,7 +581,7 @@ emit_tfbos(struct fd_context *ctx, const struct ir3_shader_variant *v, struct fd_ringbuffer *ring) { /* streamout addresses after driver-params: */ - uint32_t offset = v->first_driver_param + IR3_TFBOS_OFF; + uint32_t offset = v->constbase.tfbo; if (v->constlen > offset) { struct fd_streamout_stateobj *so = &ctx->streamout; struct pipe_stream_output_info *info = &v->shader->stream_output; @@ -680,8 +680,8 @@ ir3_emit_consts(const struct ir3_shader_variant *v, struct fd_ringbuffer *ring, /* emit driver params every time: */ /* TODO skip emit if shader doesn't use driver params to avoid WFI.. */ if (info && (v->type == SHADER_VERTEX)) { - uint32_t offset = v->first_driver_param + IR3_DRIVER_PARAM_OFF; - if (v->constlen >= offset) { + uint32_t offset = v->constbase.driver_param; + if (v->constlen > offset) { uint32_t vertex_params[IR3_DP_COUNT] = { [IR3_DP_VTXID_BASE] = info->indexed ? info->index_bias : info->start, diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.h b/src/gallium/drivers/freedreno/ir3/ir3_shader.h index c603168a04b..7a0ff982e24 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_shader.h +++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.h @@ -47,22 +47,6 @@ enum ir3_driver_param { IR3_DP_COUNT = 36 /* must be aligned to vec4 */ }; -/* Layout of constant registers: - * - * num_uniform * vec4 - user consts - * 4 * vec4 - UBO addresses - * if (vertex shader) { - * N * vec4 - driver params (IR3_DP_*) - * 1 * vec4 - stream-out addresses - * } - * - * TODO this could be made more dynamic, to at least skip sections - * that we don't need.. 
- */ -#define IR3_UBOS_OFF 0 /* UBOs after user consts */ -#define IR3_DRIVER_PARAM_OFF 4 /* driver params after UBOs */ -#define IR3_TFBOS_OFF (IR3_DRIVER_PARAM_OFF + IR3_DP_COUNT/4) - /* Configuration key used to identify a shader variant.. different * shader variants can be used to implement features not supported * in hw (two sided color), binning-pass vertex shader, etc. @@ -143,6 +127,12 @@ struct ir3_shader_variant { */ unsigned constlen; + /* number of uniforms (in vec4), not including built-in compiler + * constants, etc. + */ + unsigned num_uniforms; + unsigned num_ubos; + /* About Linkage: * + Let the frag shader determine the position/compmask for the * varyings, since it is the place where we know if the varying @@ -211,12 +201,18 @@ struct ir3_shader_variant { /* do we have kill instructions: */ bool has_kill; - /* const reg # of first immediate, ie. 1 == c1 - * (not regid, because TGSI thinks in terms of vec4 registers, - * not scalar registers) + /* Layout of constant registers, each section (in vec4). Pointer size + * is 32b (a3xx, a4xx), or 64b (a5xx+), which affects the size of the + * UBO and stream-out consts. */ - unsigned first_driver_param; - unsigned first_immediate; + struct { + /* user consts start at zero */ + unsigned ubo; + unsigned driver_param; + unsigned tfbo; + unsigned immediate; + } constbase; + unsigned immediates_count; struct { uint32_t val[4]; -- 2.30.2