From 0a8292e096bc37eeb225bf7d3854b6b6edc4bceb Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Wed, 1 Sep 2010 17:54:56 +0200 Subject: [PATCH] nv50: attempt at making more complicated loops work Nested loops, and loops with multiple exits (BREAK, CONT). --- src/gallium/drivers/nv50/nv50_pc.c | 20 +++-- src/gallium/drivers/nv50/nv50_pc.h | 6 ++ src/gallium/drivers/nv50/nv50_pc_optimize.c | 14 ++-- src/gallium/drivers/nv50/nv50_pc_regalloc.c | 85 +++++++++++++-------- src/gallium/drivers/nv50/nv50_tgsi_to_nc.c | 70 +++++++++++++---- 5 files changed, 138 insertions(+), 57 deletions(-) diff --git a/src/gallium/drivers/nv50/nv50_pc.c b/src/gallium/drivers/nv50/nv50_pc.c index 1c12fe1b9e7..b03f5b27f60 100644 --- a/src/gallium/drivers/nv50/nv50_pc.c +++ b/src/gallium/drivers/nv50/nv50_pc.c @@ -220,6 +220,7 @@ edge_name(ubyte type) case CFG_EDGE_BACK: return "back"; case CFG_EDGE_LOOP_ENTER: return "loop"; case CFG_EDGE_LOOP_LEAVE: return "break"; + case CFG_EDGE_FAKE: return "fake"; default: return "?"; } @@ -247,6 +248,7 @@ nv_pc_pass_in_order(struct nv_basic_block *root, nv_pc_pass_func f, void *priv) case CFG_EDGE_BACK: continue; case CFG_EDGE_FORWARD: + case CFG_EDGE_FAKE: if (++b->out[j]->priv == b->out[j]->num_in) bb[p++] = b->out[j]; break; @@ -264,9 +266,11 @@ nv_pc_pass_in_order(struct nv_basic_block *root, nv_pc_pass_func f, void *priv) f(priv, b); - if (!p) - while (pp > 0) - bb[p++] = bbb[--pp]; + if (!p) { + p = pp; + for (; pp > 0; --pp) + bb[pp - 1] = bbb[pp - 1]; + } } } @@ -366,11 +370,17 @@ nv50_generate_code(struct nv50_translation_info *ti) ret = nv_pc_exec_pass0(pc); if (ret) goto out; +#ifdef NV50PC_DEBUG + nv_print_program(pc->root); +#endif /* register allocation */ ret = nv_pc_exec_pass1(pc); if (ret) goto out; +#ifdef NV50PC_DEBUG + nv_print_program(pc->root); +#endif /* prepare for emission */ ret = nv_pc_exec_pass2(pc); @@ -580,10 +590,10 @@ nvbb_reachable_by(struct nv_basic_block *bf, struct nv_basic_block *bp, if (bp == bt) return FALSE; - if (bp->out[0] && bp->out_kind[0] != CFG_EDGE_BACK && + if (bp->out[0] && !IS_WALL_EDGE(bp->out_kind[0]) && nvbb_reachable_by(bf, bp->out[0], bt)) return TRUE; - if (bp->out[1] && bp->out_kind[1] != CFG_EDGE_BACK && + if (bp->out[1] && !IS_WALL_EDGE(bp->out_kind[1]) && nvbb_reachable_by(bf, bp->out[1], bt)) return TRUE; return FALSE; diff --git a/src/gallium/drivers/nv50/nv50_pc.h b/src/gallium/drivers/nv50/nv50_pc.h index 48918f46d5b..2bb3ea4374c 100644 --- a/src/gallium/drivers/nv50/nv50_pc.h +++ b/src/gallium/drivers/nv50/nv50_pc.h @@ -257,6 +257,12 @@ struct nv_instruction { #define CFG_EDGE_BACK 1 #define CFG_EDGE_LOOP_ENTER 2 #define CFG_EDGE_LOOP_LEAVE 4 +#define CFG_EDGE_FAKE 8 + +/* 'WALL' edge means where reachability check doesn't follow */ +/* 'LOOP' edge means just having to do with loops */ +#define IS_LOOP_EDGE(k) ((k) & 7) +#define IS_WALL_EDGE(k) ((k) & 9) struct nv_basic_block { struct nv_instruction *entry; /* first non-phi instruction */ diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c index 4b1cd56fc13..1d2710a8aca 100644 --- a/src/gallium/drivers/nv50/nv50_pc_optimize.c +++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c @@ -362,6 +362,9 @@ nv_pass_fold_loads(struct nv_pass *ctx, struct nv_basic_block *b) nv_reference(ctx->pc, &nvi->src[j], ld->src[0]->value); if (ld->src[4]) nv_reference(ctx->pc, &nvi->src[4], ld->src[4]->value); + + if (!nv_nvi_refcount(ld)) + nv_nvi_delete(ld); } } DESCEND_ARBITRARY(j, nv_pass_fold_loads); @@ -504,7 +507,7 @@ constant_expression(struct nv_pc *pc, struct nv_instruction *nvi, u1.u32 = src1->reg.imm.u32; modifiers_apply(&u0.u32, type, nvi->src[0]->mod); - modifiers_apply(&u0.u32, type, nvi->src[1]->mod); + modifiers_apply(&u1.u32, type, nvi->src[1]->mod); switch (nvi->opcode) { case NV_OP_MAD: @@ -951,7 +954,9 @@ nv_pass_flatten(struct nv_pass *ctx, struct nv_basic_block *b) if (b->exit && b->exit->opcode == NV_OP_JOINAT) nv_nvi_delete(b->exit); - if ((nvi = b->out[0]->out[0]->entry)) { + i = (b->out[0]->out_kind[0] == CFG_EDGE_LOOP_LEAVE) ? 1 : 0; + + if ((nvi = b->out[0]->out[i]->entry)) { nvi->is_join = 0; if (nvi->opcode == NV_OP_JOIN) nv_nvi_delete(nvi); @@ -980,7 +985,8 @@ nv_pass_cse(struct nv_pass *ctx, struct nv_basic_block *b) if (ir->opcode != ik->opcode) continue; - if (ik->opcode == NV_OP_LDA || + if (!ir->def[0] || !ik->def[0] || + ik->opcode == NV_OP_LDA || ik->opcode == NV_OP_STA || ik->opcode == NV_OP_MOV || nv_is_vector_op(ik->opcode)) @@ -993,8 +999,6 @@ nv_pass_cse(struct nv_pass *ctx, struct nv_basic_block *b) ik->flags_def || ir->flags_def) continue; /* and also not with flags, for now */ - assert(ik->def[0] && ir->def[0]); - if (ik->def[0]->reg.file == NV_FILE_OUT || ir->def[0]->reg.file == NV_FILE_OUT || !values_equal(ik->def[0], ir->def[0])) diff --git a/src/gallium/drivers/nv50/nv50_pc_regalloc.c b/src/gallium/drivers/nv50/nv50_pc_regalloc.c index 59462cc11e6..81decf8d4a5 100644 --- a/src/gallium/drivers/nv50/nv50_pc_regalloc.c +++ b/src/gallium/drivers/nv50/nv50_pc_regalloc.c @@ -22,6 +22,10 @@ /* #define NV50PC_DEBUG */ +/* #define NV50_RA_DEBUG_LIVEI */ +/* #define NV50_RA_DEBUG_LIVE_SETS */ +/* #define NV50_RA_DEBUG_JOIN */ + #include "nv50_context.h" #include "nv50_pc.h" @@ -119,7 +123,7 @@ add_range(struct nv_value *val, struct nv_basic_block *b, int end) add_range_ex(val, bgn, end, NULL); } -#ifdef NV50_RA_DEBUG_JOIN +#if defined(NV50_RA_DEBUG_JOIN) || defined(NV50_RA_DEBUG_LIVEI) static void livei_print(struct nv_value *a) { @@ -359,16 +363,37 @@ need_new_else_block(struct nv_basic_block *b, struct nv_basic_block *p) int i = 0, n = 0; for (; i < 2; ++i) - if (p->out[i] && p->out_kind[i] != CFG_EDGE_LOOP_LEAVE) + if (p->out[i] && !IS_LOOP_EDGE(p->out_kind[i])) ++n; return (b->num_in > 1) && (n == 2); } +static int +phi_opnd_for_bb(struct nv_instruction *phi, struct nv_basic_block *b, + struct nv_basic_block *tb) +{ + int i, j; + + for (j = -1, i = 0; i < 4 && phi->src[i]; ++i) { + if (!nvbb_reachable_by(b, phi->src[i]->value->insn->bb, tb)) + continue; + /* NOTE: back-edges are ignored by the reachable-by check */ + if (j < 0 || !nvbb_reachable_by(phi->src[j]->value->insn->bb, + phi->src[i]->value->insn->bb, tb)) + j = i; + } + return j; +} + /* For each operand of each PHI in b, generate a new value by inserting a MOV * at the end of the block it is coming from and replace the operand with its * result. This eliminates liveness conflicts and enables us to let values be * copied to the right register if such a conflict exists nonetheless. + * + * These MOVs are also crucial in making sure the live intervals of phi srces + * are extended until the end of the loop, since they are not included in the + * live-in sets. */ static int pass_generate_phi_movs(struct nv_pc_pass *ctx, struct nv_basic_block *b) @@ -404,14 +429,17 @@ pass_generate_phi_movs(struct nv_pc_pass *ctx, struct nv_basic_block *b) ctx->pc->current_block = pn; for (i = b->phi; i && i->opcode == NV_OP_PHI; i = i->next) { - for (j = 0; j < 4 && i->src[j]; ++j) { - if (nvbb_reachable_by(p, i->src[j]->value->insn->bb, b)) - break; - } - if (j >= 4 || !i->src[j]) + if ((j = phi_opnd_for_bb(i, p, b)) < 0) continue; val = i->src[j]->value; + if (i->src[j]->flags) { + val = val->insn->src[0]->value; + while (j < 4 && i->src[j]) + ++j; + assert(j < 4); + } + ni = new_instruction(ctx->pc, NV_OP_MOV); /* TODO: insert instruction at correct position in the first place */ @@ -423,6 +451,8 @@ pass_generate_phi_movs(struct nv_pc_pass *ctx, struct nv_basic_block *b) ni->src[0] = new_ref(ctx->pc, val); nv_reference(ctx->pc, &i->src[j], ni->def[0]); + + i->src[j]->flags = 1; } if (pn != p && pn->exit) { @@ -452,8 +482,8 @@ pass_join_values(struct nv_pc_pass *ctx, int iter) case NV_OP_PHI: if (!iter) continue; - try_join_values(ctx, i->src[0]->value, i->src[1]->value); - try_join_values(ctx, i->def[0], i->src[0]->value); + for (c = 0; c < 4 && i->src[c]; ++c) + try_join_values(ctx, i->def[0], i->src[c]->value); break; case NV_OP_MOV: if (iter && i->src[0]->value->insn && @@ -576,22 +606,6 @@ pass_build_live_sets(struct nv_pc_pass *ctx, struct nv_basic_block *b) for (j = 0; j < (ctx->pc->num_values + 31) / 32; ++j) b->live_set[j] |= b->out[n]->live_set[j]; } - - /* Kick values out of our live set that are created in incoming - * blocks of our successors that are not us. - */ - for (i = b->out[n]->phi; i && i->opcode == NV_OP_PHI; i = i->next) { - for (j = 0; j < 4; ++j) { - if (!i->src[j]) - break; - assert(i->src[j]->value->insn); - - if (nvbb_reachable_by(b, i->src[j]->value->insn->bb, b->out[n])) - live_set_add(b, i->src[j]->value); - else - live_set_rem(b, i->src[j]->value); - } - } } if (!b->entry) @@ -599,7 +613,7 @@ pass_build_live_sets(struct nv_pc_pass *ctx, struct nv_basic_block *b) bb_live_set_print(ctx->pc, b); - for (i = b->exit; i; i = i->prev) { + for (i = b->exit; i != b->entry->prev; i = i->prev) { for (j = 0; j < 4; j++) { if (!i->def[j]) break; @@ -617,6 +631,9 @@ pass_build_live_sets(struct nv_pc_pass *ctx, struct nv_basic_block *b) if (i->flags_src) live_set_add(b, i->flags_src->value); } + for (i = b->phi; i && i->opcode == NV_OP_PHI; i = i->next) + live_set_rem(b, i->def[0]); + bb_live_set_print(ctx->pc, b); return 0; @@ -680,10 +697,12 @@ pass_build_intervals(struct nv_pc_pass *ctx, struct nv_basic_block *b) for (j = 0; j < ctx->pc->num_values; ++j) { if (!(b->live_set[j / 32] & (1 << (j % 32)))) continue; + add_range(&ctx->pc->values[j], b, b->exit->serial + 1); #ifdef NV50_RA_DEBUG_LIVEI - debug_printf("adding range for live value %i\n", j); + debug_printf("adding range for live value %i: ", j); + livei_print(&ctx->pc->values[j]); #endif - add_range(&ctx->pc->values[j], b, b->exit->serial + 1); + } } @@ -702,20 +721,22 @@ pass_build_intervals(struct nv_pc_pass *ctx, struct nv_basic_block *b) for (j = 0; j < 5; ++j) { if (i->src[j] && !live_set_test(b, i->src[j])) { live_set_add(b, i->src[j]->value); + add_range(i->src[j]->value, b, i->serial); #ifdef NV50_RA_DEBUG_LIVEI - debug_printf("adding range for source that ends living: %i\n", + debug_printf("adding range for source %i (ends living): ", i->src[j]->value->n); + livei_print(i->src[j]->value); #endif - add_range(i->src[j]->value, b, i->serial); } } if (i->flags_src && !live_set_test(b, i->flags_src)) { live_set_add(b, i->flags_src->value); + add_range(i->flags_src->value, b, i->serial); #ifdef NV50_RA_DEBUG_LIVEI - debug_printf("adding range for source that ends living: %i\n", + debug_printf("adding range for source %i (ends living): ", i->flags_src->value->n); + livei_print(i->flags_src->value); #endif - add_range(i->flags_src->value, b, i->serial); } } diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c index 115b5df9391..8b18a9c0252 100644 --- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c +++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c @@ -120,6 +120,8 @@ struct bld_context { struct nv_basic_block *brkt_bb[BLD_MAX_LOOP_NESTING]; int loop_lvl; + ubyte out_kind; /* CFG_EDGE_FORWARD, or FAKE in case of BREAK/CONT */ + struct bld_value_stack tvs[BLD_MAX_TEMPS][4]; /* TGSI_FILE_TEMPORARY */ struct bld_value_stack avs[BLD_MAX_ADDRS][4]; /* TGSI_FILE_ADDRESS */ struct bld_value_stack pvs[BLD_MAX_PREDS][4]; /* TGSI_FILE_PREDICATE */ @@ -268,7 +270,7 @@ fetch_by_bb(struct bld_value_stack *stack, return; } for (i = 0; i < b->num_in; ++i) - if (b->in_kind[i] != CFG_EDGE_BACK) + if (!IS_WALL_EDGE(b->in_kind[i])) fetch_by_bb(stack, vals, n, b->in[i]); } @@ -362,18 +364,31 @@ bld_phi(struct bld_context *bld, struct nv_basic_block *b, return phi->def[0]; } +/* Insert a phi function in the loop header. + * For nested loops, we need to insert phi functions in all the outer + * loop headers if they don't have one yet. + * + * @def: redefinition from inside loop, or NULL if to be replaced later + */ static struct nv_value * bld_loop_phi(struct bld_context *bld, struct bld_value_stack *stack, struct nv_value *def) { - struct nv_basic_block *bb = bld->pc->current_block; struct nv_instruction *phi; - struct nv_value *val; + struct nv_basic_block *bb = bld->pc->current_block; + struct nv_value *val = NULL; - val = bld_phi(bld, bld->pc->current_block, stack); + if (bld->loop_lvl > 1) { + --bld->loop_lvl; + if (!((stack->loop_def | stack->loop_use) & (1 << bld->loop_lvl))) + val = bld_loop_phi(bld, stack, NULL); + ++bld->loop_lvl; + } + + if (!val) + val = bld_phi(bld, bld->pc->current_block, stack); /* old definition */ if (!val) { bld->pc->current_block = bld->loop_bb[bld->loop_lvl - 1]->in[0]; - val = bld_undef(bld, bld_stack_file(bld, stack)); } @@ -449,10 +464,11 @@ bld_replace_value(struct nv_pc *, struct nv_basic_block *, struct nv_value *, static void bld_loop_end(struct bld_context *bld, struct nv_basic_block *bb) { + struct nv_basic_block *save = bld->pc->current_block; struct nv_instruction *phi, *next; struct nv_value *val; struct bld_value_stack *stk; - int s; + int i, s, n; for (phi = bb->phi; phi && phi->opcode == NV_OP_PHI; phi = next) { next = phi->next; @@ -460,19 +476,33 @@ bld_loop_end(struct bld_context *bld, struct nv_basic_block *bb) stk = (struct bld_value_stack *)phi->target; phi->target = NULL; - val = bld_fetch_global(bld, stk); + for (s = 1, n = 0; n < bb->num_in; ++n) { + if (bb->in_kind[n] != CFG_EDGE_BACK) + continue; - nv_reference(bld->pc, &phi->src[1], val); + assert(s < 4); + bld->pc->current_block = bb->in[n]; + val = bld_fetch_global(bld, stk); + + for (i = 0; i < 4; ++i) + if (phi->src[i] && phi->src[i]->value == val) + break; + if (i == 4) + nv_reference(bld->pc, &phi->src[s++], val); + } + bld->pc->current_block = save; - s = -1; if (phi->src[0]->value == phi->def[0] || phi->src[0]->value == phi->src[1]->value) s = 1; else if (phi->src[1]->value == phi->def[0]) s = 0; + else + continue; if (s >= 0) { + /* eliminate the phi */ bld_vals_del_val(stk, phi->def[0]); ++bld->pc->pass_seq; @@ -915,6 +945,8 @@ bld_new_block(struct bld_context *bld, struct nv_basic_block *b) for (i = 0; i < 128; ++i) bld->saved_inputs[i] = NULL; + + bld->out_kind = CFG_EDGE_FORWARD; } static struct nv_value * @@ -1366,7 +1398,7 @@ bld_instruction(struct bld_context *bld, struct nv_basic_block *b = new_basic_block(bld->pc); --bld->cond_lvl; - nvbb_attach_block(bld->pc->current_block, b, CFG_EDGE_FORWARD); + nvbb_attach_block(bld->pc->current_block, b, bld->out_kind); nvbb_attach_block(bld->cond_bb[bld->cond_lvl], b, CFG_EDGE_FORWARD); bld->cond_bb[bld->cond_lvl]->exit->target = b; @@ -1407,8 +1439,10 @@ bld_instruction(struct bld_context *bld, bld_flow(bld, NV_OP_BREAK, NV_CC_TR, NULL, bb, FALSE); - /* XXX: don't do this for redundant BRKs */ - nvbb_attach_block(bld->pc->current_block, bb, CFG_EDGE_LOOP_LEAVE); + if (bld->out_kind == CFG_EDGE_FORWARD) /* else we already had BRK/CONT */ + nvbb_attach_block(bld->pc->current_block, bb, CFG_EDGE_LOOP_LEAVE); + + bld->out_kind = CFG_EDGE_FAKE; } break; case TGSI_OPCODE_CONT: @@ -1418,11 +1452,17 @@ bld_instruction(struct bld_context *bld, bld_flow(bld, NV_OP_BRA, NV_CC_TR, NULL, bb, FALSE); nvbb_attach_block(bld->pc->current_block, bb, CFG_EDGE_BACK); + + if ((bb = bld->join_bb[bld->cond_lvl - 1])) { + bld->join_bb[bld->cond_lvl - 1] = NULL; + nv_nvi_delete(bb->exit->prev); + } + bld->out_kind = CFG_EDGE_FAKE; } break; case TGSI_OPCODE_ENDLOOP: { - struct nv_basic_block *bb = bld->loop_bb[--bld->loop_lvl]; + struct nv_basic_block *bb = bld->loop_bb[bld->loop_lvl - 1]; bld_flow(bld, NV_OP_BRA, NV_CC_TR, NULL, bb, FALSE); @@ -1430,7 +1470,7 @@ bld_instruction(struct bld_context *bld, bld_loop_end(bld, bb); /* replace loop-side operand of the phis */ - bld_new_block(bld, bld->brkt_bb[bld->loop_lvl]); + bld_new_block(bld, bld->brkt_bb[--bld->loop_lvl]); } break; case TGSI_OPCODE_ABS: @@ -1651,7 +1691,7 @@ bld_replace_value(struct nv_pc *pc, struct nv_basic_block *b, { struct nv_instruction *nvi; - for (nvi = b->entry; nvi; nvi = nvi->next) { + for (nvi = b->phi ? b->phi : b->entry; nvi; nvi = nvi->next) { int s; for (s = 0; s < 5; ++s) { if (!nvi->src[s]) -- 2.30.2