From: Rob Clark Date: Mon, 21 Jul 2014 19:24:30 +0000 (-0400) Subject: freedreno/a3xx/compiler: const file relative addressing X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=a5ac36a75f7ceda1093dd982a7fc0c07faae7590;p=mesa.git freedreno/a3xx/compiler: const file relative addressing Teach new compiler scheduling and register assignment how to deal with relative addressing. This gets us what we need to avoid falling back to old compiler for CONST[ADDR[0].x+n]. It is also a prerequisite for temp file relative addressing, although that is going to also need some cleverness in register assignment to keep arrays grouped together. NOTE: doing address calculation in full precision and then narrowing to s16 in the mov to addr reg seems to sometimes cause lockups (and sometimes work?!). It seems more reliable to do the address calculation in s16, like the blob does. Which means teaching RA how to deal with mixed half and full precision allocation. Fortunately that didn't turn out to be too hard, so that is a nice bonus which we could probably take better advantage of elsewhere. 
Signed-off-by: Rob Clark --- diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c b/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c index cee446a9fa8..1138ec9be34 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c @@ -192,8 +192,7 @@ compile_init(struct fd3_compile_context *ctx, struct fd3_shader_variant *so, #define FM(x) (1 << TGSI_FILE_##x) /* optimize can't deal with relative addressing: */ - if (info->indirect_files & (FM(TEMPORARY) | FM(INPUT) | - FM(OUTPUT) | FM(IMMEDIATE) | FM(CONSTANT))) + if (info->indirect_files & (FM(TEMPORARY) | FM(INPUT) | FM(OUTPUT))) return TGSI_PARSE_ERROR; /* Immediates go after constants: */ @@ -414,14 +413,7 @@ block_temporary(struct ir3_block *block, unsigned n) static struct ir3_instruction * create_immed(struct fd3_compile_context *ctx, float val) { - /* this can happen when registers (or components of a TGSI - * register) are used as src before they have been assigned - * (undefined contents). To avoid confusing the rest of the - * compiler, and to generally keep things peachy, substitute - * an instruction that sets the src to 0.0. Or to keep - * things undefined, I could plug in a random number? :-P - * - * NOTE: *don't* use instr_create() here! + /* NOTE: *don't* use instr_create() here! 
*/ struct ir3_instruction *instr; instr = ir3_instr_create(ctx->block, 1, 0); @@ -464,6 +456,12 @@ ssa_dst(struct fd3_compile_context *ctx, struct ir3_instruction *instr, ctx->output_updates[idx].instr = instr; ctx->num_output_updates++; break; + case TGSI_FILE_ADDRESS: + compile_assert(ctx, n < 1); + ctx->output_updates[idx].instrp = &ctx->block->address; + ctx->output_updates[idx].instr = instr; + ctx->num_output_updates++; + break; } } @@ -526,7 +524,8 @@ add_dst_reg_wrmask(struct fd3_compile_context *ctx, /* uses SSA */ break; case TGSI_FILE_ADDRESS: - num = REG_A0; + flags |= IR3_REG_ADDR; + /* uses SSA */ break; default: compile_error(ctx, "unsupported dst register file: %s\n", @@ -553,7 +552,8 @@ add_dst_reg_wrmask(struct fd3_compile_context *ctx, if (!ctx->atomic) ssa_dst(ctx, instr, dst, chan); } else if ((dst->File == TGSI_FILE_TEMPORARY) || - (dst->File == TGSI_FILE_OUTPUT)) { + (dst->File == TGSI_FILE_OUTPUT) || + (dst->File == TGSI_FILE_ADDRESS)) { unsigned i; /* if instruction writes multiple, we need to create @@ -591,6 +591,7 @@ add_src_reg_wrmask(struct fd3_compile_context *ctx, { unsigned flags = 0, num = 0; struct ir3_register *reg; + struct ir3_instruction *orig = NULL; /* TODO we need to use a mov to temp for const >= 64.. or maybe * we could use relative addressing.. 
@@ -628,9 +629,21 @@ add_src_reg_wrmask(struct fd3_compile_context *ctx, flags |= IR3_REG_ABS; if (src->Negate) flags |= IR3_REG_NEGATE; - if (src->Indirect) + + if (src->Indirect) { flags |= IR3_REG_RELATIV; + /* shouldn't happen, and we can't cope with it below: */ + compile_assert(ctx, wrmask == 0x1); + + /* wrap in a meta-deref to track both the src and address: */ + orig = instr; + + instr = ir3_instr_create(ctx->block, -1, OPC_META_DEREF); + ir3_reg_create(instr, 0, 0); + ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = ctx->block->address; + } + reg = ir3_reg_create(instr, regid(num, chan), flags); reg->wrmask = wrmask; @@ -643,6 +656,8 @@ add_src_reg_wrmask(struct fd3_compile_context *ctx, struct ir3_instruction *collect; unsigned i; + compile_assert(ctx, !src->Indirect); + /* if instruction reads multiple, we need to create * some place-holder collect the registers: */ @@ -666,6 +681,10 @@ add_src_reg_wrmask(struct fd3_compile_context *ctx, reg->instr = collect; } + if (src->Indirect) { + reg = ir3_reg_create(orig, 0, flags | IR3_REG_SSA); + reg->instr = instr; + } return reg; } @@ -718,36 +737,6 @@ get_internal_temp(struct fd3_compile_context *ctx, return tmp_src; } -/* Get internal half-precision temp src/dst to use for a sequence of - * instructions generated by a single TGSI op. 
- */ -static struct tgsi_src_register * -get_internal_temp_hr(struct fd3_compile_context *ctx, - struct tgsi_dst_register *tmp_dst) -{ - struct tgsi_src_register *tmp_src; - int n; - - tmp_dst->File = TGSI_FILE_TEMPORARY; - tmp_dst->WriteMask = TGSI_WRITEMASK_XYZW; - tmp_dst->Indirect = 0; - tmp_dst->Dimension = 0; - - /* assign next temporary: */ - n = ctx->num_internal_temps++; - compile_assert(ctx, n < ARRAY_SIZE(ctx->internal_temps)); - tmp_src = &ctx->internal_temps[n]; - - /* just use hr0 because no one else should be using half- - * precision regs: - */ - tmp_dst->Index = 0; - - src_from_dst(tmp_src, tmp_dst); - - return tmp_src; -} - static inline bool is_const(struct tgsi_src_register *src) { @@ -1049,11 +1038,18 @@ trans_arl(const struct instr_translater *t, struct tgsi_dst_register *dst = &inst->Dst[0].Register; struct tgsi_src_register *src = &inst->Src[0].Register; unsigned chan = src->SwizzleX; + compile_assert(ctx, dst->File == TGSI_FILE_ADDRESS); - tmp_src = get_internal_temp_hr(ctx, &tmp_dst); + /* NOTE: we allocate a temporary from a flat register + * namespace (ignoring half vs full). It turns out + * not to really matter since registers get reassigned + * later in ir3_ra which (hopefully!) can deal a bit + * better with mixed half and full precision. 
+ */ + tmp_src = get_internal_temp(ctx, &tmp_dst); - /* cov.{f32,f16}s16 Rtmp, Rsrc */ + /* cov.f{32,16}s16 Rtmp, Rsrc */ instr = instr_create(ctx, 1, 0); instr->cat1.src_type = get_ftype(ctx); instr->cat1.dst_type = TYPE_S16; diff --git a/src/gallium/drivers/freedreno/a3xx/instr-a3xx.h b/src/gallium/drivers/freedreno/a3xx/instr-a3xx.h index a79998ef56f..c67f1037ced 100644 --- a/src/gallium/drivers/freedreno/a3xx/instr-a3xx.h +++ b/src/gallium/drivers/freedreno/a3xx/instr-a3xx.h @@ -204,6 +204,8 @@ typedef enum { /* branches/flow control */ OPC_META_FLOW = 4, OPC_META_PHI = 5, + /* relative addressing */ + OPC_META_DEREF = 6, } opc_t; @@ -244,6 +246,16 @@ static inline int type_float(type_t type) return (type == TYPE_F32) || (type == TYPE_F16); } +static inline int type_uint(type_t type) +{ + return (type == TYPE_U32) || (type == TYPE_U16) || (type == TYPE_U8); +} + +static inline int type_sint(type_t type) +{ + return (type == TYPE_S32) || (type == TYPE_S16) || (type == TYPE_S8); +} + typedef union PACKED { /* normal gpr or const src register: */ struct PACKED { diff --git a/src/gallium/drivers/freedreno/a3xx/ir3.h b/src/gallium/drivers/freedreno/a3xx/ir3.h index 872f47883bb..9ec05da6ae4 100644 --- a/src/gallium/drivers/freedreno/a3xx/ir3.h +++ b/src/gallium/drivers/freedreno/a3xx/ir3.h @@ -70,6 +70,7 @@ struct ir3_register { */ IR3_REG_SSA = 0x1000, /* 'instr' is ptr to assigning instr */ IR3_REG_IA = 0x2000, /* meta-input dst is "assigned" */ + IR3_REG_ADDR = 0x4000, /* register is a0.x */ } flags; union { /* normal registers: @@ -232,6 +233,8 @@ struct ir3_block { struct ir3_instruction **temporaries; struct ir3_instruction **inputs; struct ir3_instruction **outputs; + /* only a single address register: */ + struct ir3_instruction *address; struct ir3_block *parent; struct ir3_instruction *head; }; @@ -351,10 +354,24 @@ static inline bool is_meta(struct ir3_instruction *instr) return (instr->category == -1); } +static inline bool is_deref(struct ir3_instruction 
*instr) +{ + return is_meta(instr) && (instr->opc == OPC_META_DEREF); +} + +static inline bool writes_addr(struct ir3_instruction *instr) +{ + if (instr->regs_count > 0) { + struct ir3_register *dst = instr->regs[0]; + return !!(dst->flags & IR3_REG_ADDR); + } + return false; +} + /* TODO combine is_gpr()/reg_gpr().. */ static inline bool reg_gpr(struct ir3_register *r) { - if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_RELATIV | IR3_REG_SSA)) + if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_RELATIV | IR3_REG_SSA | IR3_REG_ADDR)) return false; if ((reg_num(r) == REG_A0) || (reg_num(r) == REG_P0)) return false; diff --git a/src/gallium/drivers/freedreno/a3xx/ir3_cp.c b/src/gallium/drivers/freedreno/a3xx/ir3_cp.c index 81f6c902816..0faed89c25e 100644 --- a/src/gallium/drivers/freedreno/a3xx/ir3_cp.c +++ b/src/gallium/drivers/freedreno/a3xx/ir3_cp.c @@ -43,10 +43,13 @@ static bool is_eligible_mov(struct ir3_instruction *instr) { if ((instr->category == 1) && (instr->cat1.src_type == instr->cat1.dst_type)) { + struct ir3_register *dst = instr->regs[0]; struct ir3_register *src = instr->regs[1]; + if (dst->flags & IR3_REG_ADDR) + return false; if ((src->flags & IR3_REG_SSA) && /* TODO: propagate abs/neg modifiers if possible */ - !(src->flags & (IR3_REG_ABS | IR3_REG_NEGATE))) + !(src->flags & (IR3_REG_ABS | IR3_REG_NEGATE | IR3_REG_RELATIV))) return true; } return false; diff --git a/src/gallium/drivers/freedreno/a3xx/ir3_depth.c b/src/gallium/drivers/freedreno/a3xx/ir3_depth.c index 1715f1917f0..b84629b2e07 100644 --- a/src/gallium/drivers/freedreno/a3xx/ir3_depth.c +++ b/src/gallium/drivers/freedreno/a3xx/ir3_depth.c @@ -63,6 +63,9 @@ int ir3_delayslots(struct ir3_instruction *assigner, if (is_meta(assigner)) return 0; + if (writes_addr(assigner)) + return 6; + /* handled via sync flags: */ if (is_sfu(assigner) || is_tex(assigner)) return 0; diff --git a/src/gallium/drivers/freedreno/a3xx/ir3_dump.c b/src/gallium/drivers/freedreno/a3xx/ir3_dump.c 
index 3984cd60e6e..a186d62a819 100644 --- a/src/gallium/drivers/freedreno/a3xx/ir3_dump.c +++ b/src/gallium/drivers/freedreno/a3xx/ir3_dump.c @@ -58,6 +58,9 @@ static void dump_instr_name(struct ir3_dump_ctx *ctx, case OPC_META_PHI: fprintf(ctx->f, "Φ"); break; + case OPC_META_DEREF: + fprintf(ctx->f, "(*)"); + break; default: /* shouldn't hit here.. just for debugging: */ switch (instr->opc) { @@ -66,7 +69,6 @@ static void dump_instr_name(struct ir3_dump_ctx *ctx, case OPC_META_FO: fprintf(ctx->f, "_meta:fo"); break; case OPC_META_FI: fprintf(ctx->f, "_meta:fi"); break; case OPC_META_FLOW: fprintf(ctx->f, "_meta:flow"); break; - case OPC_META_PHI: fprintf(ctx->f, "_meta:phi"); break; default: fprintf(ctx->f, "_meta:%d", instr->opc); break; } @@ -162,7 +164,8 @@ static void dump_instr(struct ir3_dump_ctx *ctx, ir3_block_dump(ctx, instr->flow.else_block, "else"); if (reg->flags & IR3_REG_SSA) dump_instr(ctx, reg->instr); - } else if (instr->opc == OPC_META_PHI) { + } else if ((instr->opc == OPC_META_PHI) || + (instr->opc == OPC_META_DEREF)) { /* treat like a normal instruction: */ ir3_instr_dump(ctx, instr); } @@ -228,7 +231,8 @@ static void dump_link2(struct ir3_dump_ctx *ctx, printdef(ctx, defer, "output%lx::w -> %s", PTRID(instr->inout.block), instr->regs[0]->num, target); - } else if (instr->opc == OPC_META_PHI) { + } else if ((instr->opc == OPC_META_PHI) || + (instr->opc == OPC_META_DEREF)) { /* treat like a normal instruction: */ printdef(ctx, defer, "instr%lx: -> %s", PTRID(instr), target); } diff --git a/src/gallium/drivers/freedreno/a3xx/ir3_ra.c b/src/gallium/drivers/freedreno/a3xx/ir3_ra.c index 57c68c729c5..a9a510f3bc2 100644 --- a/src/gallium/drivers/freedreno/a3xx/ir3_ra.c +++ b/src/gallium/drivers/freedreno/a3xx/ir3_ra.c @@ -82,8 +82,8 @@ static struct ir3_ra_assignment ra_calc(struct ir3_instruction *instr); * Register Allocation: */ -#define REG(n, wm) (struct ir3_register){ \ - /*.flags = ((so)->half_precision) ? 
IR3_REG_HALF : 0,*/ \ +#define REG(n, wm, f) (struct ir3_register){ \ + .flags = (f), \ .num = (n), \ .wrmask = TGSI_WRITEMASK_ ## wm, \ } @@ -145,7 +145,7 @@ static void compute_liveregs(struct ir3_ra_ctx *ctx, /* be sure to account for output registers too: */ for (i = 0; i < block->noutputs; i++) { - struct ir3_register reg = REG(output_base(ctx) + i, X); + struct ir3_register reg = REG(output_base(ctx) + i, X, 0); regmask_set_if_not(liveregs, &reg, &written); } } @@ -212,14 +212,15 @@ static bool compute_clobbers(struct ir3_ra_ctx *ctx, return live || was_live; } -static int find_available(regmask_t *liveregs, int size) +static int find_available(regmask_t *liveregs, int size, bool half) { unsigned i; + unsigned f = half ? IR3_REG_HALF : 0; for (i = 0; i < MAX_REG - size; i++) { - if (!regmask_get(liveregs, &REG(i, X))) { + if (!regmask_get(liveregs, &REG(i, X, f))) { unsigned start = i++; for (; (i < MAX_REG) && ((i - start) < size); i++) - if (regmask_get(liveregs, &REG(i, X))) + if (regmask_get(liveregs, &REG(i, X, f))) break; if ((i - start) >= size) return start; @@ -240,7 +241,9 @@ static int alloc_block(struct ir3_ra_ctx *ctx, */ return 0; } else { + struct ir3_register *dst = instr->regs[0]; regmask_t liveregs; + compute_liveregs(ctx, instr, &liveregs); // XXX XXX XXX XXX XXX XXX XXX XXX XXX @@ -257,7 +260,9 @@ static int alloc_block(struct ir3_ra_ctx *ctx, } else // XXX XXX XXX XXX XXX XXX XXX XXX XXX compute_clobbers(ctx, instr->next, instr, &liveregs); - return find_available(&liveregs, size); + + return find_available(&liveregs, size, + !!(dst->flags & IR3_REG_HALF)); } } @@ -547,24 +552,32 @@ static void ra_assign(struct ir3_ra_ctx *ctx, static void ir3_instr_ra(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr) { - struct ir3_ra_assignment a; + struct ir3_register *dst; unsigned num; /* skip over nop's */ if (instr->regs_count == 0) return; - /* skip writes to a0, p0, etc */ - if (!reg_gpr(instr->regs[0])) - return; + dst = instr->regs[0]; /* if we've 
already visited this instruction, bail now: */ if (instr->flags & IR3_INSTR_MARK) return; /* allocate register(s): */ - a = ra_calc(instr); - num = alloc_block(ctx, instr, a.num) + a.off; + if (is_deref(instr)) { + num = instr->regs[2]->num; + } else if (reg_gpr(dst)) { + struct ir3_ra_assignment a; + a = ra_calc(instr); + num = alloc_block(ctx, instr, a.num) + a.off; + } else if (dst->flags & IR3_REG_ADDR) { + dst->flags &= ~IR3_REG_ADDR; + num = regid(REG_A0, 0) | REG_HALF; + } else { + assert(0); + } ra_assign(ctx, instr, num); } @@ -578,6 +591,7 @@ static void legalize(struct ir3_ra_ctx *ctx, struct ir3_block *block) struct ir3_instruction *end = ir3_instr_create(block, 0, OPC_END); struct ir3_instruction *last_input = NULL; + struct ir3_instruction *last_rel = NULL; regmask_t needs_ss_war; /* write after read */ regmask_t needs_ss; regmask_t needs_sy; @@ -614,6 +628,13 @@ static void legalize(struct ir3_ra_ctx *ctx, struct ir3_block *block) regmask_init(&needs_sy); } } + + /* TODO: is it valid to have address reg loaded from a + * relative src (ie. mova a0, c)? If so, the + * last_rel check below should be moved ahead of this: + */ + if (reg->flags & IR3_REG_RELATIV) + last_rel = n; } if (n->regs_count > 0) { @@ -622,6 +643,11 @@ static void legalize(struct ir3_ra_ctx *ctx, struct ir3_block *block) n->flags |= IR3_INSTR_SS; regmask_init(&needs_ss_war); // ??? I assume? 
} + + if (last_rel && (reg->num == regid(REG_A0, 0))) { + last_rel->flags |= IR3_INSTR_UL; + last_rel = NULL; + } } /* cat5+ does not have an (ss) bit, if needed we need to @@ -685,6 +711,9 @@ static void legalize(struct ir3_ra_ctx *ctx, struct ir3_block *block) if (last_input) last_input->regs[0]->flags |= IR3_REG_EI; + if (last_rel) + last_rel->flags |= IR3_INSTR_UL; + shader->instrs[shader->instrs_count++] = end; shader->instrs[0]->flags |= IR3_INSTR_SS | IR3_INSTR_SY; diff --git a/src/gallium/drivers/freedreno/a3xx/ir3_sched.c b/src/gallium/drivers/freedreno/a3xx/ir3_sched.c index 5e585271f92..4fd3da58b46 100644 --- a/src/gallium/drivers/freedreno/a3xx/ir3_sched.c +++ b/src/gallium/drivers/freedreno/a3xx/ir3_sched.c @@ -31,6 +31,11 @@ #include "ir3.h" +enum { + SCHEDULED = -1, + DELAYED = -2, +}; + /* * Instruction Scheduling: * @@ -46,7 +51,8 @@ */ struct ir3_sched_ctx { - struct ir3_instruction *scheduled; + struct ir3_instruction *scheduled; /* last scheduled instr */ + struct ir3_instruction *deref; /* current deref, if any */ unsigned cnt; }; @@ -123,6 +129,11 @@ static void schedule(struct ir3_sched_ctx *ctx, block->head = instr->next; } + if (writes_addr(instr)) { + assert(ctx->deref == NULL); + ctx->deref = instr; + } + instr->flags |= IR3_INSTR_MARK; instr->next = ctx->scheduled; @@ -210,13 +221,19 @@ static int trysched(struct ir3_sched_ctx *ctx, * we have enough delay slots to schedule ourself: */ delay = delay_calc(ctx, instr); + if (delay) + return delay; - if (!delay) { - schedule(ctx, instr, true); - return -1; + /* if this is a write to address register, and addr register + * is currently in use, we need to defer until it is free: + */ + if (writes_addr(instr) && ctx->deref) { + assert(ctx->deref != instr); + return DELAYED; } - return delay; + schedule(ctx, instr, true); + return SCHEDULED; } static struct ir3_instruction * reverse(struct ir3_instruction *instr) @@ -231,6 +248,56 @@ static struct ir3_instruction * reverse(struct ir3_instruction 
*instr) return reversed; } +static bool uses_current_deref(struct ir3_sched_ctx *ctx, + struct ir3_instruction *instr) +{ + unsigned i; + for (i = 1; i < instr->regs_count; i++) { + struct ir3_register *reg = instr->regs[i]; + if (reg->flags & IR3_REG_SSA) { + if (is_deref(reg->instr)) { + struct ir3_instruction *deref; + deref = reg->instr->regs[1]->instr; /* the mova */ + if (ctx->deref == deref) + return true; + } + } + } + return false; +} + +/* when we encounter an instruction that writes to the address register + * when it is in use, we delay that instruction and try to schedule all + * other instructions using the current address register: + */ +static int block_sched_undelayed(struct ir3_sched_ctx *ctx, + struct ir3_block *block) +{ + struct ir3_instruction *instr = block->head; + bool in_use = false; + unsigned cnt = ~0; + + while (instr) { + struct ir3_instruction *next = instr->next; + + if (uses_current_deref(ctx, instr)) { + int ret = trysched(ctx, instr); + if (ret == SCHEDULED) + cnt = 0; + else if (ret > 0) + cnt = MIN2(cnt, ret); + in_use = true; + } + + instr = next; + } + + if (!in_use) + ctx->deref = NULL; + + return cnt; +} + static void block_sched(struct ir3_sched_ctx *ctx, struct ir3_block *block) { struct ir3_instruction *instr; @@ -255,6 +322,10 @@ static void block_sched(struct ir3_sched_ctx *ctx, struct ir3_block *block) */ struct ir3_instruction *next = instr->next; int cnt = trysched(ctx, instr); + + if (cnt == DELAYED) + cnt = block_sched_undelayed(ctx, block); + /* -1 is signal to return up stack, but to us means same as 0: */ cnt = MAX2(0, cnt); cnt += ctx->cnt;