From 882d53d8e36592a39cde947e890969a81b2b1226 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Fri, 2 Aug 2019 14:07:47 -0700 Subject: [PATCH] freedreno/ir3+a6xx: same VBO state for draw/binning Worth ~+20% on gl_driver2 Signed-off-by: Rob Clark --- src/freedreno/ir3/ir3.h | 2 +- src/freedreno/ir3/ir3_compiler_nir.c | 46 ++++++++-- src/freedreno/ir3/ir3_ra.c | 91 ++++++++++++++++++- src/freedreno/ir3/ir3_shader.c | 14 ++- src/freedreno/ir3/ir3_shader.h | 5 +- src/gallium/drivers/freedreno/a6xx/fd6_emit.c | 5 +- src/gallium/drivers/freedreno/a6xx/fd6_emit.h | 1 - .../drivers/freedreno/a6xx/fd6_program.c | 8 ++ 8 files changed, 150 insertions(+), 22 deletions(-) diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h index cbbb9bb61b0..872a6fb0fc2 100644 --- a/src/freedreno/ir3/ir3.h +++ b/src/freedreno/ir3/ir3.h @@ -1081,7 +1081,7 @@ void ir3_a6xx_fixup_atomic_dests(struct ir3 *ir, struct ir3_shader_variant *so); /* register assignment: */ struct ir3_ra_reg_set * ir3_ra_alloc_reg_set(struct ir3_compiler *compiler); -int ir3_ra(struct ir3 *ir3); +int ir3_ra(struct ir3_shader_variant *v); /* legalize: */ void ir3_legalize(struct ir3 *ir, bool *has_ssbo, bool *need_pixlod, int *max_bary); diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c index dca55f33b38..3f4a0f43c99 100644 --- a/src/freedreno/ir3/ir3_compiler_nir.c +++ b/src/freedreno/ir3/ir3_compiler_nir.c @@ -2906,6 +2906,32 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, if (so->binning_pass && (ctx->compiler->gpu_id >= 600)) fixup_binning_pass(ctx); + /* for a6xx+, binning and draw pass VS use same VBO state, so we + * need to make sure not to remove any inputs that are used by + * the nonbinning VS. 
+ */ + if (ctx->compiler->gpu_id >= 600 && so->binning_pass) { + debug_assert(so->type == MESA_SHADER_VERTEX); + for (int i = 0; i < ir->ninputs; i++) { + struct ir3_instruction *in = ir->inputs[i]; + + if (!in) + continue; + + unsigned n = i / 4; + unsigned c = i % 4; + + debug_assert(n < so->nonbinning->inputs_count); + + if (so->nonbinning->inputs[n].sysval) + continue; + + /* be sure to keep inputs, even if only used in VS */ + if (so->nonbinning->inputs[n].compmask & (1 << c)) + array_insert(in->block, in->block->keeps, in); + } + } + /* Insert mov if there's same instruction for each output. * eg. dEQP-GLES31.functional.shaders.opaque_type_indexing.sampler.const_expression.vertex.sampler2dshadow */ @@ -2962,7 +2988,7 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, ir3_print(ir); } - ret = ir3_ra(ir); + ret = ir3_ra(so); if (ret) { DBG("RA failed!"); goto out; @@ -3003,13 +3029,17 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, for (j = 0; j < 4; j++) { struct ir3_instruction *in = inputs[(i*4) + j]; - if (in && !(in->flags & IR3_INSTR_UNUSED)) { - reg = in->regs[0]->num - j; - if (half) { - compile_assert(ctx, in->regs[0]->flags & IR3_REG_HALF); - } else { - half = !!(in->regs[0]->flags & IR3_REG_HALF); - } + if (!in) + continue; + + if (in->flags & IR3_INSTR_UNUSED) + continue; + + reg = in->regs[0]->num - j; + if (half) { + compile_assert(ctx, in->regs[0]->flags & IR3_REG_HALF); + } else { + half = !!(in->regs[0]->flags & IR3_REG_HALF); } } so->inputs[i].regid = reg; diff --git a/src/freedreno/ir3/ir3_ra.c b/src/freedreno/ir3/ir3_ra.c index 980cd62c48b..a641661a441 100644 --- a/src/freedreno/ir3/ir3_ra.c +++ b/src/freedreno/ir3/ir3_ra.c @@ -330,6 +330,7 @@ struct ir3_ra_instr_data { /* register-assign context, per-shader */ struct ir3_ra_ctx { + struct ir3_shader_variant *v; struct ir3 *ir; struct ir3_ra_reg_set *set; @@ -1091,6 +1092,60 @@ ra_block_alloc(struct ir3_ra_ctx *ctx, struct ir3_block *block) static int ra_alloc(struct 
ir3_ra_ctx *ctx) { + /* Pre-assign VS inputs on a6xx+ binning pass shader, to align + * with draw pass VS, so binning and draw pass can both use the + * same VBO state. + * + * Note that VS inputs are expected to be full precision. + */ + bool pre_assign_inputs = (ctx->ir->compiler->gpu_id >= 600) && + (ctx->ir->type == MESA_SHADER_VERTEX) && + ctx->v->binning_pass; + + if (pre_assign_inputs) { + for (unsigned i = 0; i < ctx->ir->ninputs; i++) { + struct ir3_instruction *instr = ctx->ir->inputs[i]; + + if (!instr) + continue; + + debug_assert(!(instr->regs[0]->flags & (IR3_REG_HALF | IR3_REG_HIGH))); + + struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; + + /* only consider the first component: */ + if (id->off > 0) + continue; + + unsigned name = ra_name(ctx, id); + + unsigned n = i / 4; + unsigned c = i % 4; + + /* 'base' is in scalar (class 0) but we need to map that + * to the conflicting register of the appropriate class (i.e. + * input could be vec2/vec3/etc) + * + * Note that the higher class (larger than scalar) regs + * are set up to conflict with others in the same class, + * so for example, R1 (scalar) is also the first component + * of D1 (vec2/double): + * + * Single (base) | Double + * --------------+--------------- + * R0 | D0 + * R1 | D0 D1 + * R2 | D1 D2 + * R3 | D2 + * .. and so on.. 
+ */ + unsigned reg = ctx->set->gpr_to_ra_reg[id->cls] + [ctx->v->nonbinning->inputs[n].regid + c]; + + ra_set_node_reg(ctx->g, name, reg); + } + } + /* pre-assign array elements: */ list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) { @@ -1118,6 +1173,35 @@ retry: } } + /* also need to not conflict with any pre-assigned inputs: */ + if (pre_assign_inputs) { + for (unsigned i = 0; i < ctx->ir->ninputs; i++) { + struct ir3_instruction *instr = ctx->ir->inputs[i]; + + if (!instr) + continue; + + struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; + + /* only consider the first component: */ + if (id->off > 0) + continue; + + unsigned name = ra_name(ctx, id); + + /* Check if array intersects with live range AND register + * range of the input: + */ + if (intersects(arr->start_ip, arr->end_ip, + ctx->def[name], ctx->use[name]) && + intersects(base, base + arr->length, + i, i + class_sizes[id->cls])) { + base = MAX2(base, i + class_sizes[id->cls]); + goto retry; + } + } + } + arr->reg = base; for (unsigned i = 0; i < arr->length; i++) { @@ -1140,11 +1224,12 @@ retry: return 0; } -int ir3_ra(struct ir3 *ir) +int ir3_ra(struct ir3_shader_variant *v) { struct ir3_ra_ctx ctx = { - .ir = ir, - .set = ir->compiler->set, + .v = v, + .ir = v->ir, + .set = v->ir->compiler->set, }; int ret; diff --git a/src/freedreno/ir3/ir3_shader.c b/src/freedreno/ir3/ir3_shader.c index 7c686f0ee2a..aae7baeb2e0 100644 --- a/src/freedreno/ir3/ir3_shader.c +++ b/src/freedreno/ir3/ir3_shader.c @@ -178,9 +178,14 @@ assemble_variant(struct ir3_shader_variant *v) v->ir = NULL; } +/* + * For creating normal shader variants, 'nonbinning' is NULL. For + * creating a binning pass shader, it is a link to the corresponding normal + * (non-binning) variant. 
+ */ static struct ir3_shader_variant * create_variant(struct ir3_shader *shader, struct ir3_shader_key *key, - bool binning_pass) + struct ir3_shader_variant *nonbinning) { struct ir3_shader_variant *v = CALLOC_STRUCT(ir3_shader_variant); int ret; @@ -190,7 +195,8 @@ create_variant(struct ir3_shader *shader, struct ir3_shader_key *key, v->id = ++shader->variant_count; v->shader = shader; - v->binning_pass = binning_pass; + v->binning_pass = !!nonbinning; + v->nonbinning = nonbinning; v->key = *key; v->type = shader->type; @@ -226,7 +232,7 @@ shader_variant(struct ir3_shader *shader, struct ir3_shader_key *key, return v; /* compile new variant if it doesn't exist already: */ - v = create_variant(shader, key, false); + v = create_variant(shader, key, NULL); if (v) { v->next = shader->variants; shader->variants = v; @@ -246,7 +252,7 @@ ir3_shader_get_variant(struct ir3_shader *shader, struct ir3_shader_key *key, if (v && binning_pass) { if (!v->binning) { - v->binning = create_variant(shader, key, true); + v->binning = create_variant(shader, key, v); *created = true; } mtx_unlock(&shader->variants_lock); diff --git a/src/freedreno/ir3/ir3_shader.h b/src/freedreno/ir3/ir3_shader.h index 53889c7f2ed..f6896c3526b 100644 --- a/src/freedreno/ir3/ir3_shader.h +++ b/src/freedreno/ir3/ir3_shader.h @@ -391,7 +391,10 @@ struct ir3_shader_variant { * which is pointed to by so->binning: */ bool binning_pass; - struct ir3_shader_variant *binning; +// union { + struct ir3_shader_variant *binning; + struct ir3_shader_variant *nonbinning; +// }; struct ir3_info info; struct ir3 *ir; diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_emit.c b/src/gallium/drivers/freedreno/a6xx/fd6_emit.c index ef584177d16..59e0a9780e0 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_emit.c +++ b/src/gallium/drivers/freedreno/a6xx/fd6_emit.c @@ -791,10 +791,7 @@ fd6_emit_state(struct fd_ringbuffer *ring, struct fd6_emit *emit) struct fd_ringbuffer *state; state = build_vbo_state(emit, emit->vs); - 
fd6_emit_take_group(emit, state, FD6_GROUP_VBO, 0x6); - - state = build_vbo_state(emit, emit->bs); - fd6_emit_take_group(emit, state, FD6_GROUP_VBO_BINNING, 0x1); + fd6_emit_take_group(emit, state, FD6_GROUP_VBO, 0x7); } if (dirty & FD_DIRTY_ZSA) { diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_emit.h b/src/gallium/drivers/freedreno/a6xx/fd6_emit.h index 2ffb76c3900..bc66884fb5a 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_emit.h +++ b/src/gallium/drivers/freedreno/a6xx/fd6_emit.h @@ -49,7 +49,6 @@ enum fd6_state_id { FD6_GROUP_LRZ, FD6_GROUP_LRZ_BINNING, FD6_GROUP_VBO, - FD6_GROUP_VBO_BINNING, FD6_GROUP_VS_CONST, FD6_GROUP_FS_CONST, FD6_GROUP_VS_TEX, diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_program.c b/src/gallium/drivers/freedreno/a6xx/fd6_program.c index 3aa91c312b3..a2acaa7b5c2 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_program.c +++ b/src/gallium/drivers/freedreno/a6xx/fd6_program.c @@ -703,6 +703,14 @@ fd6_program_create(void *data, struct ir3_shader_variant *bs, state->binning_stateobj = fd_ringbuffer_new_object(ctx->pipe, 0x1000); state->stateobj = fd_ringbuffer_new_object(ctx->pipe, 0x1000); +#ifdef DEBUG + for (unsigned i = 0; i < bs->inputs_count; i++) { + if (vs->inputs[i].sysval) + continue; + debug_assert(bs->inputs[i].regid == vs->inputs[i].regid); + } +#endif + setup_config_stateobj(state->config_stateobj, state); setup_stateobj(state->binning_stateobj, ctx->screen, state, key, true); setup_stateobj(state->stateobj, ctx->screen, state, key, false); -- 2.30.2