From 2bb4dc4054337129cccaf9163adadbf87d98d027 Mon Sep 17 00:00:00 2001
From: Alyssa Rosenzweig
Date: Thu, 15 Aug 2019 14:53:56 -0700
Subject: [PATCH] pan/midgard: Compute liveness per-block
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Rather than using a regalloc based on live intervals, computed hastily
with repeated invocations of a forward-analysis pass, we switch to
computing liveness information on a per-block basis.

Within a given basic block, we compute liveness backwards with a
linear-time algorithm; for common shaders, this may help RA terminate
quicker. Across blocks, we use a work list (really a work set) and
check if we're making progress. This isn't terribly efficient, but it
gets the job done. Point is, we get the live_in/live_out for each
block. From there, it's simple to rerun the linear-time update
algorithm to compute the interference graph.

The benefit of this technique is the ability to ignore "gaps" in
liveness across intermediate blocks that are never executed. On simple
shaders like the loops in glmark, this results in a minor reduction in
register pressure. The motivation was a complex shader in Krita that
failed register allocation due to an unfortunate interaction between
texture pipeline registers and control flow. This shader now compiles
successfully.

total instructions in shared programs: 3439 -> 3438 (-0.03%)
instructions in affected programs: 22 -> 21 (-4.55%)
helped: 1
HURT: 0

total bundles in shared programs: 2077 -> 2076 (-0.05%)
bundles in affected programs: 12 -> 11 (-8.33%)
helped: 1
HURT: 0

total quadwords in shared programs: 3457 -> 3456 (-0.03%)
quadwords in affected programs: 20 -> 19 (-5.00%)
helped: 1
HURT: 0

total registers in shared programs: 341 -> 338 (-0.88%)
registers in affected programs: 9 -> 6 (-33.33%)
helped: 3
HURT: 0

helped stats (abs) min: 1 max: 1 x̄: 1.00 x̃: 1
helped stats (rel) min: 33.33% max: 33.33% x̄: 33.33% x̃: 33.33%

Signed-off-by: Alyssa Rosenzweig
---
 src/panfrost/midgard/compiler.h   |   8 ++
 src/panfrost/midgard/midgard_ra.c | 231 +++++++++++++++++++++---------
 2 files changed, 169 insertions(+), 70 deletions(-)

diff --git a/src/panfrost/midgard/compiler.h b/src/panfrost/midgard/compiler.h
index 1cbebdbef2e..edf0c105a19 100644
--- a/src/panfrost/midgard/compiler.h
+++ b/src/panfrost/midgard/compiler.h
@@ -184,6 +184,14 @@ typedef struct midgard_block {
          * boolean for passes to use as they see fit, provided they
          * clean up later */
         bool visited;
+
+        /* In liveness analysis, these are live masks (per-component) for
+         * indices for the block. Scalar compilers have the luxury of using
+         * simple bit fields, but for us, liveness is a vector idea. We use
+         * 8-bit to allow finegrained tracking up to vec8. If you're
+         * implementing vec16 on Panfrost... I'm sorry. */
+        uint8_t *live_in;
+        uint8_t *live_out;
 } midgard_block;
 
 typedef struct midgard_bundle {
diff --git a/src/panfrost/midgard/midgard_ra.c b/src/panfrost/midgard/midgard_ra.c
index ebd085cd9a3..9d38054f691 100644
--- a/src/panfrost/midgard/midgard_ra.c
+++ b/src/panfrost/midgard/midgard_ra.c
@@ -26,6 +26,7 @@
 #include "midgard_ops.h"
 #include "util/register_allocate.h"
 #include "util/u_math.h"
+#include "util/u_memory.h"
 
 /* For work registers, we can subdivide in various ways. So we create
  * classes for the various sizes and conflict accordingly, keeping in
@@ -516,6 +517,165 @@ mir_lower_special_reads(compiler_context *ctx)
         free(texw);
 }
 
+/* Routines for liveness analysis */
+
+static void
+liveness_gen(uint8_t *live, unsigned node, unsigned max, unsigned mask)
+{
+        if ((node < 0) || (node >= max))
+                return;
+
+        live[node] |= mask;
+}
+
+static void
+liveness_kill(uint8_t *live, unsigned node, unsigned max, unsigned mask)
+{
+        if ((node < 0) || (node >= max))
+                return;
+
+        live[node] &= ~mask;
+}
+
+/* Updates live_in for a single instruction */
+
+static void
+liveness_ins_update(uint8_t *live, midgard_instruction *ins, unsigned max)
+{
+        /* live_in[s] = GEN[s] + (live_out[s] - KILL[s]) */
+
+        liveness_kill(live, ins->ssa_args.dest, max, ins->mask);
+
+        mir_foreach_src(ins, src) {
+                unsigned node = ins->ssa_args.src[src];
+                unsigned mask = mir_mask_of_read_components(ins, node);
+
+                liveness_gen(live, node, max, mask);
+        }
+}
+
+/* live_out[s] = sum { p in succ[s] } ( live_in[p] ) */
+
+static void
+liveness_block_live_out(compiler_context *ctx, midgard_block *blk)
+{
+        mir_foreach_successor(blk, succ) {
+                for (unsigned i = 0; i < ctx->temp_count; ++i)
+                        blk->live_out[i] |= succ->live_in[i];
+        }
+}
+
+/* Liveness analysis is a backwards-may dataflow analysis pass. Within a block,
+ * we compute live_out from live_in. The intrablock pass is linear-time. It
+ * returns whether progress was made. */
+
+static bool
+liveness_block_update(compiler_context *ctx, midgard_block *blk)
+{
+        bool progress = false;
+
+        liveness_block_live_out(ctx, blk);
+
+        uint8_t *live = mem_dup(blk->live_out, ctx->temp_count);
+
+        mir_foreach_instr_in_block_rev(blk, ins)
+                liveness_ins_update(live, ins, ctx->temp_count);
+
+        /* To figure out progress, diff live_in */
+
+        for (unsigned i = 0; (i < ctx->temp_count) && !progress; ++i)
+                progress |= (blk->live_in[i] != live[i]);
+
+        free(blk->live_in);
+        blk->live_in = live;
+
+        return progress;
+}
+
+/* Globally, liveness analysis uses a fixed-point algorithm based on a
+ * worklist. We initialize a work list with the exit block. We iterate the work
+ * list to compute live_in from live_out for each block on the work list,
+ * adding the predecessors of the block to the work list if we made progress.
+ */
+
+static void
+mir_compute_liveness(
+                compiler_context *ctx,
+                struct ra_graph *g)
+{
+        /* List of midgard_block */
+        struct set *work_list;
+
+        work_list = _mesa_set_create(ctx,
+                        _mesa_hash_pointer,
+                        _mesa_key_pointer_equal);
+
+        /* Allocate */
+
+        mir_foreach_block(ctx, block) {
+                block->live_in = calloc(ctx->temp_count, 1);
+                block->live_out = calloc(ctx->temp_count, 1);
+        }
+
+        /* Initialize the work list with the exit block */
+        struct set_entry *cur;
+
+        midgard_block *exit = mir_exit_block(ctx);
+        cur = _mesa_set_add(work_list, exit);
+
+        /* Iterate the work list */
+
+        do {
+                /* Pop off a block */
+                midgard_block *blk = (struct midgard_block *) cur->key;
+                _mesa_set_remove(work_list, cur);
+
+                /* Update its liveness information */
+                bool progress = liveness_block_update(ctx, blk);
+
+                /* If we made progress, we need to process the predecessors */
+
+                if (progress || (blk == exit)) {
+                        mir_foreach_predecessor(blk, pred)
+                                _mesa_set_add(work_list, pred);
+                }
+        } while((cur = _mesa_set_next_entry(work_list, NULL)) != NULL);
+
+        /* Now that every block has live_in/live_out computed, we can determine
+         * interference by walking each block linearly. Take live_out at the
+         * end of each block and walk the block backwards. */
+
+        mir_foreach_block(ctx, blk) {
+                uint8_t *live = calloc(ctx->temp_count, 1);
+
+                mir_foreach_successor(blk, succ) {
+                        for (unsigned i = 0; i < ctx->temp_count; ++i)
+                                live[i] |= succ->live_in[i];
+                }
+
+                mir_foreach_instr_in_block_rev(blk, ins) {
+                        /* Mark all registers live after the instruction as
+                         * interfering with the destination */
+
+                        unsigned dest = ins->ssa_args.dest;
+
+                        if (dest >= 0 && dest < ctx->temp_count) {
+                                for (unsigned i = 0; i < ctx->temp_count; ++i)
+                                        if (live[i])
+                                                ra_add_node_interference(g, dest, i);
+                        }
+
+                        /* Update live_in */
+                        liveness_ins_update(live, ins, ctx->temp_count);
+                }
+        }
+
+        mir_foreach_block(ctx, blk) {
+                free(blk->live_in);
+                free(blk->live_out);
+        }
+}
+
 /* This routine performs the actual register allocation. It should be succeeded
  * by install_registers */
@@ -605,76 +765,7 @@ allocate_registers(compiler_context *ctx, bool *spilled)
                 ra_set_node_class(g, i, classes[class]);
         }
 
-        /* Determine liveness */
-
-        int *live_start = malloc(nodes * sizeof(int));
-        int *live_end = malloc(nodes * sizeof(int));
-
-        /* Initialize as non-existent */
-
-        for (int i = 0; i < nodes; ++i) {
-                live_start[i] = live_end[i] = -1;
-        }
-
-        int d = 0;
-
-        mir_foreach_block(ctx, block) {
-                mir_foreach_instr_in_block(block, ins) {
-                        if (ins->ssa_args.dest < SSA_FIXED_MINIMUM) {
-                                /* If this destination is not yet live, it is
-                                 * now since we just wrote it */
-
-                                int dest = ins->ssa_args.dest;
-
-                                if (dest >= 0 && live_start[dest] == -1)
-                                        live_start[dest] = d;
-                        }
-
-                        /* Since we just used a source, the source might be
-                         * dead now. Scan the rest of the block for
-                         * invocations, and if there are none, the source dies
-                         * */
-
-                        for (int src = 0; src < ARRAY_SIZE(ins->ssa_args.src); ++src) {
-                                int s = ins->ssa_args.src[src];
-
-                                if (s < 0) continue;
-
-                                if (s >= SSA_FIXED_MINIMUM) continue;
-
-                                if (!mir_is_live_after(ctx, block, ins, s)) {
-                                        live_end[s] = d;
-                                }
-                        }
-
-                        ++d;
-                }
-        }
-
-        /* If a node still hasn't been killed, kill it now */
-
-        for (int i = 0; i < nodes; ++i) {
-                /* live_start == -1 most likely indicates a pinned output */
-
-                if (live_end[i] == -1)
-                        live_end[i] = d;
-        }
-
-        /* Setup interference between nodes that are live at the same time */
-
-        for (int i = 0; i < nodes; ++i) {
-                for (int j = i + 1; j < nodes; ++j) {
-                        bool j_overlaps_i = live_start[j] < live_end[i];
-                        bool i_overlaps_j = live_end[j] < live_start[i];
-
-                        if (i_overlaps_j || j_overlaps_i)
-                                ra_add_node_interference(g, i, j);
-                }
-        }
-
-        /* Cleanup */
-        free(live_start);
-        free(live_end);
+        mir_compute_liveness(ctx, g);
 
         if (!ra_allocate(g)) {
                 *spilled = true;
-- 
2.30.2
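
For readers who want to try the dataflow formulation from the commit message
outside of Mesa, the following is a minimal, self-contained C sketch of the
same idea: per-component liveness masks, live_in = GEN + (live_out - KILL)
applied bottom-up within a block, live_out as the union of the successors'
live_in, and iteration to a fixed point. Every name in it (the block and
instruction structs, the toy two-block program, MAX_TEMPS, etc.) is a
hypothetical stand-in, not the Midgard IR or any Mesa API.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define MAX_TEMPS 4
#define MAX_SUCCS 2

struct ins {
        int dest;               /* -1 if the instruction writes nothing */
        uint8_t dest_mask;      /* components written (KILL) */
        int src[2];             /* -1 if the slot is unused */
        uint8_t src_mask[2];    /* components read (GEN) */
};

struct block {
        struct ins ins[4];
        unsigned num_ins;
        int succ[MAX_SUCCS];    /* successor indices, -1 if none */
        uint8_t live_in[MAX_TEMPS];
        uint8_t live_out[MAX_TEMPS];
};

/* live = GEN + (live - KILL), applied bottom-up per instruction */
static void
ins_update(uint8_t *live, const struct ins *i)
{
        if (i->dest >= 0)
                live[i->dest] &= ~i->dest_mask;

        for (unsigned s = 0; s < 2; ++s)
                if (i->src[s] >= 0)
                        live[i->src[s]] |= i->src_mask[s];
}

/* One intrablock pass; returns whether live_in changed (progress) */
static bool
block_update(struct block *blocks, unsigned b)
{
        struct block *blk = &blocks[b];
        uint8_t live[MAX_TEMPS];

        /* live_out = union of the successors' live_in */
        memset(blk->live_out, 0, sizeof(blk->live_out));
        for (unsigned s = 0; s < MAX_SUCCS; ++s)
                if (blk->succ[s] >= 0)
                        for (unsigned t = 0; t < MAX_TEMPS; ++t)
                                blk->live_out[t] |= blocks[blk->succ[s]].live_in[t];

        memcpy(live, blk->live_out, sizeof(live));

        for (int n = (int) blk->num_ins - 1; n >= 0; --n)
                ins_update(live, &blk->ins[n]);

        bool progress = memcmp(blk->live_in, live, sizeof(live)) != 0;
        memcpy(blk->live_in, live, sizeof(live));
        return progress;
}

int
main(void)
{
        /* Block 0 writes all of t0 and falls through to block 1, which
         * reads only t0.xy (mask 0x3). */
        struct block blocks[2] = {
                { .ins = { { .dest = 0, .dest_mask = 0xF, .src = { -1, -1 } } },
                  .num_ins = 1, .succ = { 1, -1 } },
                { .ins = { { .dest = -1, .src = { 0, -1 }, .src_mask = { 0x3, 0 } } },
                  .num_ins = 1, .succ = { -1, -1 } },
        };

        /* Fixed point: with a CFG this small we can simply sweep all blocks
         * (exit block first) until no live_in changes; the work-set
         * formulation in the patch converges to the same result. */
        bool progress;
        do {
                progress = false;
                for (int b = 1; b >= 0; --b)
                        progress |= block_update(blocks, (unsigned) b);
        } while (progress);

        printf("block 0 live_out[t0] = 0x%x\n", blocks[0].live_out[0]); /* 0x3 */
        return 0;
}

Note how only the two components actually read downstream (0x3) end up live
across the block boundary, which is the per-component precision the patch
relies on to reduce register pressure.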