From de7d90ef53d585ee3efd165df1bf38b20794b3e6 Mon Sep 17 00:00:00 2001 From: Connor Abbott Date: Wed, 18 Mar 2020 18:06:41 +0100 Subject: [PATCH] ir3: Plumb through support for a1.x This will need to be used in some cases for the upcoming bindless support, plus ldc.k instructions which push data from a UBO to const registers. Part-of: --- src/freedreno/ir3/ir3.c | 9 ++- src/freedreno/ir3/ir3.h | 22 +++++-- src/freedreno/ir3/ir3_compiler_nir.c | 17 +++--- src/freedreno/ir3/ir3_context.c | 53 +++++++++++++---- src/freedreno/ir3/ir3_context.h | 11 +++- src/freedreno/ir3/ir3_delay.c | 2 +- src/freedreno/ir3/ir3_depth.c | 12 +++- src/freedreno/ir3/ir3_postsched.c | 16 ++---- src/freedreno/ir3/ir3_ra.c | 2 +- src/freedreno/ir3/ir3_ra.h | 2 +- src/freedreno/ir3/ir3_sched.c | 85 +++++++++++++++++++--------- 11 files changed, 164 insertions(+), 67 deletions(-) diff --git a/src/freedreno/ir3/ir3.c b/src/freedreno/ir3/ir3.c index 7bdf8a39ba8..7af46055c02 100644 --- a/src/freedreno/ir3/ir3.c +++ b/src/freedreno/ir3/ir3.c @@ -1086,7 +1086,14 @@ ir3_instr_set_address(struct ir3_instruction *instr, debug_assert(instr->block == addr->block); instr->address = addr; - array_insert(ir, ir->indirects, instr); + debug_assert(reg_num(addr->regs[0]) == REG_A0); + unsigned comp = reg_comp(addr->regs[0]); + if (comp == 0) { + array_insert(ir, ir->a0_users, instr); + } else { + debug_assert(comp == 1); + array_insert(ir, ir->a1_users, instr); + } } } diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h index ed51189ae3d..114df1a1ff7 100644 --- a/src/freedreno/ir3/ir3.h +++ b/src/freedreno/ir3/ir3.h @@ -469,7 +469,10 @@ struct ir3 { * convenient list of instructions that reference some address * register simplifies this. */ - DECLARE_ARRAY(struct ir3_instruction *, indirects); + DECLARE_ARRAY(struct ir3_instruction *, a0_users); + + /* same for a1.x: */ + DECLARE_ARRAY(struct ir3_instruction *, a1_users); /* and same for instructions that consume predicate register: */ DECLARE_ARRAY(struct ir3_instruction *, predicates); @@ -695,10 +698,10 @@ static inline bool is_same_type_mov(struct ir3_instruction *instr) dst = instr->regs[0]; - /* mov's that write to a0.x or p0.x are special: */ + /* mov's that write to a0 or p0.x are special: */ if (dst->num == regid(REG_P0, 0)) return false; - if (dst->num == regid(REG_A0, 0)) + if (reg_num(dst) == REG_A0) return false; if (dst->flags & (IR3_REG_RELATIV | IR3_REG_ARRAY)) @@ -848,11 +851,20 @@ static inline unsigned dest_regs(struct ir3_instruction *instr) return util_last_bit(instr->regs[0]->wrmask); } -static inline bool writes_addr(struct ir3_instruction *instr) +static inline bool writes_addr0(struct ir3_instruction *instr) +{ + if (instr->regs_count > 0) { + struct ir3_register *dst = instr->regs[0]; + return dst->num == regid(REG_A0, 0); + } + return false; +} + +static inline bool writes_addr1(struct ir3_instruction *instr) { if (instr->regs_count > 0) { struct ir3_register *dst = instr->regs[0]; - return reg_num(dst) == REG_A0; + return dst->num == regid(REG_A0, 1); } return false; } diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c index a24605bd714..3a3b939a9f6 100644 --- a/src/freedreno/ir3/ir3_compiler_nir.c +++ b/src/freedreno/ir3/ir3_compiler_nir.c @@ -744,8 +744,8 @@ emit_intrinsic_load_ubo(struct ir3_context *ctx, nir_intrinsic_instr *intr, base_lo = create_uniform(b, ubo + (src0->regs[1]->iim_val * ptrsz)); base_hi = create_uniform(b, ubo + (src0->regs[1]->iim_val * ptrsz) + 1); } else { - base_lo = create_uniform_indirect(b, ubo, ir3_get_addr(ctx, src0, ptrsz)); - base_hi = create_uniform_indirect(b, ubo + 1, ir3_get_addr(ctx, src0, ptrsz)); + base_lo = create_uniform_indirect(b, ubo, ir3_get_addr0(ctx, src0, ptrsz)); + base_hi = create_uniform_indirect(b, ubo + 1, ir3_get_addr0(ctx, src0, ptrsz)); /* NOTE: since relative addressing is used, make sure constlen is * at least big enough to cover all the UBO addresses, since the @@ -1362,7 +1362,7 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr) src = ir3_get_src(ctx, &intr->src[0]); for (int i = 0; i < intr->num_components; i++) { dst[i] = create_uniform_indirect(b, idx + i, - ir3_get_addr(ctx, src[0], 1)); + ir3_get_addr0(ctx, src[0], 1)); } /* NOTE: if relative addressing is used, we set * constlen in the compiler (to worst-case value) @@ -1558,7 +1558,7 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr) src = ir3_get_src(ctx, &intr->src[0]); struct ir3_instruction *collect = ir3_create_collect(ctx, ctx->ir->inputs, ctx->ninputs); - struct ir3_instruction *addr = ir3_get_addr(ctx, src[0], 4); + struct ir3_instruction *addr = ir3_get_addr0(ctx, src[0], 4); for (int i = 0; i < intr->num_components; i++) { unsigned n = idx * 4 + i + comp; dst[i] = create_indirect_load(ctx, ctx->ninputs, @@ -2424,11 +2424,14 @@ emit_block(struct ir3_context *ctx, nir_block *nblock) list_addtail(&block->node, &ctx->ir->block_list); /* re-emit addr register in each block if needed: */ - for (int i = 0; i < ARRAY_SIZE(ctx->addr_ht); i++) { - _mesa_hash_table_destroy(ctx->addr_ht[i], NULL); - ctx->addr_ht[i] = NULL; + for (int i = 0; i < ARRAY_SIZE(ctx->addr0_ht); i++) { + _mesa_hash_table_destroy(ctx->addr0_ht[i], NULL); + ctx->addr0_ht[i] = NULL; } + _mesa_hash_table_u64_destroy(ctx->addr1_ht, NULL); + ctx->addr1_ht = NULL; + nir_foreach_instr (instr, nblock) { ctx->cur_instr = instr; emit_instr(ctx, instr); diff --git a/src/freedreno/ir3/ir3_context.c b/src/freedreno/ir3/ir3_context.c index 706316d897c..6050f3f69d3 100644 --- a/src/freedreno/ir3/ir3_context.c +++ b/src/freedreno/ir3/ir3_context.c @@ -184,7 +184,7 @@ ir3_get_src(struct ir3_context *ctx, nir_src *src) ralloc_array(ctx, struct ir3_instruction *, num_components); if (src->reg.indirect) - addr = ir3_get_addr(ctx, ir3_get_src(ctx, src->reg.indirect)[0], + addr = ir3_get_addr0(ctx, ir3_get_src(ctx, src->reg.indirect)[0], reg->num_components); for (unsigned i = 0; i < num_components; i++) { @@ -230,7 +230,7 @@ ir3_put_dst(struct ir3_context *ctx, nir_dest *dst) struct ir3_instruction *addr = NULL; if (dst->reg.indirect) - addr = ir3_get_addr(ctx, ir3_get_src(ctx, dst->reg.indirect)[0], + addr = ir3_get_addr0(ctx, ir3_get_src(ctx, dst->reg.indirect)[0], reg->num_components); for (unsigned i = 0; i < num_components; i++) { @@ -378,7 +378,7 @@ ir3_context_error(struct ir3_context *ctx, const char *format, ...) } static struct ir3_instruction * -create_addr(struct ir3_block *block, struct ir3_instruction *src, int align) +create_addr0(struct ir3_block *block, struct ir3_instruction *src, int align) { struct ir3_instruction *instr, *immed; @@ -433,29 +433,62 @@ create_addr(struct ir3_block *block, struct ir3_instruction *src, int align) return instr; } +static struct ir3_instruction * +create_addr1(struct ir3_block *block, unsigned const_val) +{ + + struct ir3_instruction *immed = create_immed(block, const_val); + struct ir3_instruction *instr = ir3_MOV(block, immed, TYPE_S16); + instr->regs[0]->num = regid(REG_A0, 1); + instr->regs[0]->flags &= ~IR3_REG_SSA; + instr->regs[0]->flags |= IR3_REG_HALF; + instr->regs[1]->flags |= IR3_REG_HALF; + return instr; +} + /* caches addr values to avoid generating multiple cov/shl/mova * sequences for each use of a given NIR level src as address */ struct ir3_instruction * -ir3_get_addr(struct ir3_context *ctx, struct ir3_instruction *src, int align) +ir3_get_addr0(struct ir3_context *ctx, struct ir3_instruction *src, int align) { struct ir3_instruction *addr; unsigned idx = align - 1; - compile_assert(ctx, idx < ARRAY_SIZE(ctx->addr_ht)); + compile_assert(ctx, idx < ARRAY_SIZE(ctx->addr0_ht)); - if (!ctx->addr_ht[idx]) { - ctx->addr_ht[idx] = _mesa_hash_table_create(ctx, + if (!ctx->addr0_ht[idx]) { + ctx->addr0_ht[idx] = _mesa_hash_table_create(ctx, _mesa_hash_pointer, _mesa_key_pointer_equal); } else { struct hash_entry *entry; - entry = _mesa_hash_table_search(ctx->addr_ht[idx], src); + entry = _mesa_hash_table_search(ctx->addr0_ht[idx], src); if (entry) return entry->data; } - addr = create_addr(ctx->block, src, align); - _mesa_hash_table_insert(ctx->addr_ht[idx], src, addr); + addr = create_addr0(ctx->block, src, align); + _mesa_hash_table_insert(ctx->addr0_ht[idx], src, addr); + + return addr; +} + +/* Similar to ir3_get_addr0, but for a1.x. */ +struct ir3_instruction * +ir3_get_addr1(struct ir3_context *ctx, unsigned const_val) +{ + struct ir3_instruction *addr; + + if (!ctx->addr1_ht) { + ctx->addr1_ht = _mesa_hash_table_u64_create(ctx); + } else { + addr = _mesa_hash_table_u64_search(ctx->addr1_ht, const_val); + if (addr) + return addr; + } + + addr = create_addr1(ctx->block, const_val); + _mesa_hash_table_u64_insert(ctx->addr1_ht, const_val, addr); return addr; } diff --git a/src/freedreno/ir3/ir3_context.h b/src/freedreno/ir3/ir3_context.h index c5845d4e2c6..838635d68fb 100644 --- a/src/freedreno/ir3/ir3_context.h +++ b/src/freedreno/ir3/ir3_context.h @@ -120,7 +120,12 @@ struct ir3_context { * src used for an array of vec1 cannot be also used for an * array of vec4. */ - struct hash_table *addr_ht[4]; + struct hash_table *addr0_ht[4]; + + /* The same for a1.x. We only support immediate values for a1.x, as this + * is the only use so far. + */ + struct hash_table_u64 *addr1_ht; /* last dst array, for indirect we need to insert a var-store. */ @@ -176,8 +181,10 @@ NORETURN void ir3_context_error(struct ir3_context *ctx, const char *format, ... if (!(cond)) ir3_context_error((ctx), "failed assert: "#cond"\n"); \ } while (0) -struct ir3_instruction * ir3_get_addr(struct ir3_context *ctx, +struct ir3_instruction * ir3_get_addr0(struct ir3_context *ctx, struct ir3_instruction *src, int align); +struct ir3_instruction * ir3_get_addr1(struct ir3_context *ctx, + unsigned const_val); struct ir3_instruction * ir3_get_predicate(struct ir3_context *ctx, struct ir3_instruction *src); diff --git a/src/freedreno/ir3/ir3_delay.c b/src/freedreno/ir3/ir3_delay.c index 5839128a4c6..3fc4d911f14 100644 --- a/src/freedreno/ir3/ir3_delay.c +++ b/src/freedreno/ir3/ir3_delay.c @@ -82,7 +82,7 @@ ir3_delayslots(struct ir3_instruction *assigner, if (is_meta(assigner) || is_meta(consumer)) return 0; - if (writes_addr(assigner)) + if (writes_addr0(assigner) || writes_addr1(assigner)) return 6; /* On a6xx, it takes the number of delay slots to get a SFU result diff --git a/src/freedreno/ir3/ir3_depth.c b/src/freedreno/ir3/ir3_depth.c index a400406a1eb..6d389772570 100644 --- a/src/freedreno/ir3/ir3_depth.c +++ b/src/freedreno/ir3/ir3_depth.c @@ -201,10 +201,16 @@ compute_depth_and_remove_unused(struct ir3 *ir, struct ir3_shader_variant *so) /* note that we can end up with unused indirects, but we should * not end up with unused predicates. */ - for (i = 0; i < ir->indirects_count; i++) { - struct ir3_instruction *instr = ir->indirects[i]; + for (i = 0; i < ir->a0_users_count; i++) { + struct ir3_instruction *instr = ir->a0_users[i]; if (instr && (instr->flags & IR3_INSTR_UNUSED)) - ir->indirects[i] = NULL; + ir->a0_users[i] = NULL; + } + + for (i = 0; i < ir->a1_users_count; i++) { + struct ir3_instruction *instr = ir->a1_users[i]; + if (instr && (instr->flags & IR3_INSTR_UNUSED)) + ir->a1_users[i] = NULL; } /* cleanup unused inputs: */ diff --git a/src/freedreno/ir3/ir3_postsched.c b/src/freedreno/ir3/ir3_postsched.c index b283af486fb..2049cca479e 100644 --- a/src/freedreno/ir3/ir3_postsched.c +++ b/src/freedreno/ir3/ir3_postsched.c @@ -392,7 +392,6 @@ static void calculate_deps(struct ir3_postsched_deps_state *state, struct ir3_postsched_node *node) { - static const struct ir3_register half_reg = { .flags = IR3_REG_HALF }; struct ir3_register *reg; int b; @@ -400,12 +399,6 @@ calculate_deps(struct ir3_postsched_deps_state *state, * in the reverse direction) wrote any of our src registers: */ foreach_src_n (reg, i, node->instr) { - /* NOTE: relative access for a src can be either const or gpr: */ - if (reg->flags & IR3_REG_RELATIV) { - /* also reads a0.x: */ - add_reg_dep(state, node, &half_reg, regid(REG_A0, 0), false); - } - if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED)) continue; @@ -428,6 +421,12 @@ calculate_deps(struct ir3_postsched_deps_state *state, } } + if (node->instr->address) { + add_reg_dep(state, node, node->instr->address->regs[0], + node->instr->address->regs[0]->num, + false); + } + if (dest_regs(node->instr) == 0) return; @@ -441,9 +440,6 @@ calculate_deps(struct ir3_postsched_deps_state *state, for (unsigned i = 0; i < arr->length; i++) { add_reg_dep(state, node, reg, arr->reg + i, true); } - - /* also reads a0.x: */ - add_reg_dep(state, node, &half_reg, regid(REG_A0, 0), false); } else { foreach_bit (b, reg->wrmask) { add_reg_dep(state, node, reg, reg->num + b, true); diff --git a/src/freedreno/ir3/ir3_ra.c b/src/freedreno/ir3/ir3_ra.c index 40ea78609ce..7999bb34f71 100644 --- a/src/freedreno/ir3/ir3_ra.c +++ b/src/freedreno/ir3/ir3_ra.c @@ -264,7 +264,7 @@ ra_block_find_definers(struct ir3_ra_ctx *ctx, struct ir3_block *block) if (instr->regs_count == 0) continue; /* couple special cases: */ - if (writes_addr(instr) || writes_pred(instr)) { + if (writes_addr0(instr) || writes_addr1(instr) || writes_pred(instr)) { id->cls = -1; } else if (instr->regs[0]->flags & IR3_REG_ARRAY) { id->cls = total_class_count; diff --git a/src/freedreno/ir3/ir3_ra.h b/src/freedreno/ir3/ir3_ra.h index bffe7dff155..43b5726648d 100644 --- a/src/freedreno/ir3/ir3_ra.h +++ b/src/freedreno/ir3/ir3_ra.h @@ -199,7 +199,7 @@ writes_gpr(struct ir3_instruction *instr) /* is dest a normal temp register: */ struct ir3_register *reg = instr->regs[0]; debug_assert(!(reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))); - if ((reg->num == regid(REG_A0, 0)) || + if ((reg_num(reg) == REG_A0) || (reg->num == regid(REG_P0, 0))) return false; return true; diff --git a/src/freedreno/ir3/ir3_sched.c b/src/freedreno/ir3/ir3_sched.c index b8ebe0d26cb..9d0bf69d193 100644 --- a/src/freedreno/ir3/ir3_sched.c +++ b/src/freedreno/ir3/ir3_sched.c @@ -68,7 +68,8 @@ struct ir3_sched_ctx { struct ir3_block *block; /* the current block */ struct list_head depth_list; /* depth sorted unscheduled instrs */ struct ir3_instruction *scheduled; /* last scheduled instr XXX remove*/ - struct ir3_instruction *addr; /* current a0.x user, if any */ + struct ir3_instruction *addr0; /* current a0.x user, if any */ + struct ir3_instruction *addr1; /* current a1.x user, if any */ struct ir3_instruction *pred; /* current p0.x user, if any */ int live_values; /* estimate of current live values */ int half_live_values; /* estimate of current half precision live values */ @@ -225,9 +226,14 @@ schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr) */ list_delinit(&instr->node); - if (writes_addr(instr)) { - debug_assert(ctx->addr == NULL); - ctx->addr = instr; + if (writes_addr0(instr)) { + debug_assert(ctx->addr0 == NULL); + ctx->addr0 = instr; + } + + if (writes_addr1(instr)) { + debug_assert(ctx->addr1 == NULL); + ctx->addr1 = instr; } if (writes_pred(instr)) { @@ -244,7 +250,7 @@ schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr) update_live_values(ctx, instr); - if (writes_addr(instr) || writes_pred(instr) || is_input(instr)) { + if (writes_addr0(instr) || writes_addr1(instr) || writes_pred(instr) || is_input(instr)) { clear_cache(ctx, NULL); } else { /* invalidate only the necessary entries.. */ @@ -281,7 +287,7 @@ struct ir3_sched_notes { /* there is at least one instruction that could be scheduled, * except for conflicting address/predicate register usage: */ - bool addr_conflict, pred_conflict; + bool addr0_conflict, addr1_conflict, pred_conflict; }; /* could an instruction be scheduled if specified ssa src was scheduled? */ @@ -314,11 +320,28 @@ check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes, * TODO if any instructions use pred register and have other * src args, we would need to do the same for writes_pred().. */ - if (writes_addr(instr)) { + if (writes_addr0(instr)) { struct ir3 *ir = instr->block->shader; bool ready = false; - for (unsigned i = 0; (i < ir->indirects_count) && !ready; i++) { - struct ir3_instruction *indirect = ir->indirects[i]; + for (unsigned i = 0; (i < ir->a0_users_count) && !ready; i++) { + struct ir3_instruction *indirect = ir->a0_users[i]; + if (!indirect) + continue; + if (indirect->address != instr) + continue; + ready = could_sched(indirect, instr); + } + + /* nothing could be scheduled, so keep looking: */ + if (!ready) + return false; + } + + if (writes_addr1(instr)) { + struct ir3 *ir = instr->block->shader; + bool ready = false; + for (unsigned i = 0; (i < ir->a1_users_count) && !ready; i++) { + struct ir3_instruction *indirect = ir->a1_users[i]; if (!indirect) continue; if (indirect->address != instr) @@ -335,9 +358,15 @@ check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes, * register is currently in use, we need to defer until it is * free: */ - if (writes_addr(instr) && ctx->addr) { - debug_assert(ctx->addr != instr); - notes->addr_conflict = true; + if (writes_addr0(instr) && ctx->addr0) { + debug_assert(ctx->addr0 != instr); + notes->addr0_conflict = true; + return false; + } + + if (writes_addr1(instr) && ctx->addr1) { + debug_assert(ctx->addr1 != instr); + notes->addr1_conflict = true; return false; } @@ -585,23 +614,21 @@ split_instr(struct ir3_sched_ctx *ctx, struct ir3_instruction *orig_instr) return new_instr; } -/* "spill" the address register by remapping any unscheduled +/* "spill" the address registers by remapping any unscheduled * instructions which depend on the current address register * to a clone of the instruction which wrote the address reg. */ static struct ir3_instruction * -split_addr(struct ir3_sched_ctx *ctx) +split_addr(struct ir3_sched_ctx *ctx, struct ir3_instruction **addr, + struct ir3_instruction **users, unsigned users_count) { - struct ir3 *ir; struct ir3_instruction *new_addr = NULL; unsigned i; - debug_assert(ctx->addr); - - ir = ctx->addr->block->shader; + debug_assert(*addr); - for (i = 0; i < ir->indirects_count; i++) { - struct ir3_instruction *indirect = ir->indirects[i]; + for (i = 0; i < users_count; i++) { + struct ir3_instruction *indirect = users[i]; if (!indirect) continue; @@ -613,9 +640,9 @@ split_addr(struct ir3_sched_ctx *ctx) /* remap remaining instructions using current addr * to new addr: */ - if (indirect->address == ctx->addr) { + if (indirect->address == *addr) { if (!new_addr) { - new_addr = split_instr(ctx, ctx->addr); + new_addr = split_instr(ctx, *addr); /* original addr is scheduled, but new one isn't: */ new_addr->flags &= ~IR3_INSTR_MARK; } @@ -625,7 +652,7 @@ split_addr(struct ir3_sched_ctx *ctx) } /* all remaining indirects remapped to new addr: */ - ctx->addr = NULL; + *addr = NULL; return new_addr; } @@ -682,7 +709,8 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block) ctx->block = block; /* addr/pred writes are per-block: */ - ctx->addr = NULL; + ctx->addr0 = NULL; + ctx->addr1 = NULL; ctx->pred = NULL; /* move all instructions to the unscheduled list, and @@ -740,14 +768,19 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block) schedule(ctx, instr); } else { struct ir3_instruction *new_instr = NULL; + struct ir3 *ir = block->shader; /* nothing available to schedule.. if we are blocked on * address/predicate register conflict, then break the * deadlock by cloning the instruction that wrote that * reg: */ - if (notes.addr_conflict) { - new_instr = split_addr(ctx); + if (notes.addr0_conflict) { + new_instr = split_addr(ctx, &ctx->addr0, + ir->a0_users, ir->a0_users_count); + } else if (notes.addr1_conflict) { + new_instr = split_addr(ctx, &ctx->addr1, + ir->a1_users, ir->a1_users_count); } else if (notes.pred_conflict) { new_instr = split_pred(ctx); } else { -- 2.30.2