From 1cd1cce03523dcf1ca680b753ea0e32b0f8bc549 Mon Sep 17 00:00:00 2001 From: Connor Abbott Date: Fri, 13 Sep 2019 13:23:56 +0700 Subject: [PATCH] lima/gpir: Use registers for values live in multiple blocks This commit adds the framework for cross-basic-block register allocation. Like ARM's compiler, we assume that the value registers aren't usable across branches, which means we have to use physical registers to store any value that crosses a basic block. There are three parts to this: 1. When translating from NIR, we rely on the NIR out-of-ssa pass to coalesce values into registers. We insert store_reg instructions for values used in more than one basic block, and load_reg instructions for values not defined in the same basic block (or defined after their use, for loops). So by the time we've translated out of NIR we've already split things into values (which are only used in the same basic block) and registers (which are only used in different basic blocks than where they're defined). 2. We allocate the registers at the same time that we allocate the values, before the final scheduler. Unlike the values, where the assigned color is fake, we assign the actual physical index & component to physregs at this stage. load_reg and store_reg are treated as moves in the allocator and when creating write-after-read dependencies. 3. Finally, in the main scheduler we have to avoid overwriting existing live physregs when spilling. First, we have to tell the scheduler which physical registers are live at the end of each block, to avoid overwriting those. If a register is only live at the beginning, we can reuse it for spilling after the last original use in the final program happens, i.e. before any original use is scheduled, but we have to be careful to add the proper dependencies so that the spill write is scheduled before the original reads. To handle this we repurpose reg_link for uses to be used by the scheduler. A few register-related things copied over from NIR or from other drivers can be dropped. Reviewed-by: Vasily Khoruzhick --- src/gallium/drivers/lima/ir/gp/gpir.h | 39 +- src/gallium/drivers/lima/ir/gp/lower.c | 5 +- src/gallium/drivers/lima/ir/gp/nir.c | 122 +++-- src/gallium/drivers/lima/ir/gp/node.c | 11 - .../drivers/lima/ir/gp/reduce_scheduler.c | 52 +- src/gallium/drivers/lima/ir/gp/regalloc.c | 504 ++++++++++++++++-- src/gallium/drivers/lima/ir/gp/scheduler.c | 71 ++- 7 files changed, 648 insertions(+), 156 deletions(-) diff --git a/src/gallium/drivers/lima/ir/gp/gpir.h b/src/gallium/drivers/lima/ir/gp/gpir.h index 6cbd406032e..4d11fb1eaa7 100644 --- a/src/gallium/drivers/lima/ir/gp/gpir.h +++ b/src/gallium/drivers/lima/ir/gp/gpir.h @@ -211,11 +211,6 @@ typedef struct { typedef struct { int index; struct list_head list; - - struct list_head defs_list; - struct list_head uses_list; - - int start, end; } gpir_reg; typedef struct { @@ -236,7 +231,6 @@ typedef struct gpir_store_node { gpir_node *child; gpir_reg *reg; - struct list_head reg_link; } gpir_store_node; enum gpir_instr_slot { @@ -350,6 +344,26 @@ typedef struct gpir_block { struct list_head predecessors; struct list_head predecessors_node; + /* for regalloc */ + + /* The set of live registers, i.e. registers whose value may be used + * eventually, at the beginning of the block. + */ + BITSET_WORD *live_in; + + /* Set of live registers at the end of the block. */ + BITSET_WORD *live_out; + + /* Set of registers that may have a value defined at the end of the + * block. 
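+    *
+    * Note this is a may-be-defined set (a union over all paths into the
+    * block), so it also contains registers that are only written on some
+    * paths; calc_interference() masks live_out by def_out so that such
+    * partially-defined registers aren't treated as live everywhere.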
+ */ + BITSET_WORD *def_out; + + /* After register allocation, the set of live physical registers at the end + * of the block. Needed for scheduling. + */ + uint64_t live_out_phys; + /* For codegen, the offset in the final program. */ unsigned instr_offset; @@ -380,8 +394,17 @@ typedef struct gpir_compiler { struct list_head block_list; int cur_index; - /* array for searching ssa node */ - gpir_node **var_nodes; + /* Find the gpir node for a given NIR SSA def. */ + gpir_node **node_for_ssa; + + /* Find the gpir node for a given NIR register. */ + gpir_node **node_for_reg; + + /* Find the gpir register for a given NIR SSA def. */ + gpir_reg **reg_for_ssa; + + /* Find the gpir register for a given NIR register. */ + gpir_reg **reg_for_reg; /* gpir block for NIR block. */ gpir_block **blocks; diff --git a/src/gallium/drivers/lima/ir/gp/lower.c b/src/gallium/drivers/lima/ir/gp/lower.c index eaeeeb8f1eb..296f141216e 100644 --- a/src/gallium/drivers/lima/ir/gp/lower.c +++ b/src/gallium/drivers/lima/ir/gp/lower.c @@ -109,10 +109,7 @@ static bool gpir_lower_load(gpir_compiler *comp) gpir_load_node *nload = gpir_node_to_load(new); nload->index = load->index; nload->component = load->component; - if (load->reg) { - nload->reg = load->reg; - list_addtail(&nload->reg_link, &load->reg->uses_list); - } + nload->reg = load->reg; gpir_node_replace_pred(dep, new); gpir_node_replace_child(succ, node, new); diff --git a/src/gallium/drivers/lima/ir/gp/nir.c b/src/gallium/drivers/lima/ir/gp/nir.c index e2dc939f1a0..8d9a5beb98a 100644 --- a/src/gallium/drivers/lima/ir/gp/nir.c +++ b/src/gallium/drivers/lima/ir/gp/nir.c @@ -30,37 +30,80 @@ #include "gpir.h" #include "lima_context.h" +gpir_reg *gpir_create_reg(gpir_compiler *comp) +{ + gpir_reg *reg = ralloc(comp, gpir_reg); + reg->index = comp->cur_reg++; + list_addtail(®->list, &comp->reg_list); + return reg; +} + +static gpir_reg *reg_for_nir_reg(gpir_compiler *comp, nir_register *nir_reg) +{ + unsigned index = nir_reg->index; + gpir_reg *reg = comp->reg_for_reg[index]; + if (reg) + return reg; + reg = gpir_create_reg(comp); + comp->reg_for_reg[index] = reg; + return reg; +} -static inline void *gpir_node_create_ssa(gpir_block *block, gpir_op op, nir_ssa_def *ssa) +static inline gpir_node *gpir_node_create_ssa(gpir_block *block, gpir_op op, nir_ssa_def *ssa) { int index = ssa->index; gpir_node *node = gpir_node_create(block, op); - block->comp->var_nodes[index] = node; + block->comp->node_for_ssa[index] = node; snprintf(node->name, sizeof(node->name), "ssa%d", index); list_addtail(&node->list, &block->node_list); + + /* If any uses are outside the current block, we'll need to create a store + * instruction for them. 
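+    *
+    * For example, in this hypothetical NIR, ssa_1 needs a register
+    * because it is used in a different block than the one defining it:
+    *
+    *    block block_0:
+    *       vec1 32 ssa_1 = fadd ssa_0, ssa_0
+    *    block block_1:
+    *       vec1 32 ssa_2 = fmul ssa_1, ssa_1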
+ */ + bool needs_register = false; + nir_foreach_use(use, ssa) { + if (use->parent_instr->block != ssa->parent_instr->block) { + needs_register = true; + break; + } + } + + if (!needs_register) { + nir_foreach_if_use(use, ssa) { + if (nir_cf_node_prev(&use->parent_if->cf_node) != + &ssa->parent_instr->block->cf_node) { + needs_register = true; + break; + } + } + } + + if (needs_register) { + gpir_store_node *store = gpir_node_create(block, gpir_op_store_reg); + store->child = node; + store->reg = gpir_create_reg(block->comp); + gpir_node_add_dep(&store->node, node, GPIR_DEP_INPUT); + list_addtail(&store->node.list, &block->node_list); + block->comp->reg_for_ssa[ssa->index] = store->reg; + } + return node; } -static inline void *gpir_node_create_reg(gpir_block *block, gpir_op op, nir_reg_dest *reg) +static inline void *gpir_node_create_reg(gpir_block *block, gpir_op op, nir_reg_dest *nir_reg) { - int index = reg->reg->index; + int index = nir_reg->reg->index; gpir_node *node = gpir_node_create(block, op); + block->comp->node_for_reg[index] = node; gpir_store_node *store = gpir_node_create(block, gpir_op_store_reg); snprintf(node->name, sizeof(node->name), "reg%d", index); store->child = node; + store->reg = reg_for_nir_reg(block->comp, nir_reg->reg); gpir_node_add_dep(&store->node, node, GPIR_DEP_INPUT); - list_for_each_entry(gpir_reg, reg, &block->comp->reg_list, list) { - if (reg->index == index) { - store->reg = reg; - list_addtail(&store->reg_link, ®->defs_list); - break; - } - } - list_addtail(&node->list, &block->node_list); list_addtail(&store->node.list, &block->node_list); return node; @@ -77,35 +120,34 @@ static void *gpir_node_create_dest(gpir_block *block, gpir_op op, nir_dest *dest static gpir_node *gpir_node_find(gpir_block *block, gpir_node *succ, nir_src *src, int channel) { + gpir_reg *reg = NULL; gpir_node *pred = NULL; - if (src->is_ssa) { if (src->ssa->num_components > 1) { for (int i = 0; i < GPIR_VECTOR_SSA_NUM; i++) { if (block->comp->vector_ssa[i].ssa == src->ssa->index) { - pred = block->comp->vector_ssa[i].nodes[channel]; - break; + return block->comp->vector_ssa[i].nodes[channel]; } } - } else - pred = block->comp->var_nodes[src->ssa->index]; - - assert(pred); - } - else { - pred = gpir_node_create(block, gpir_op_load_reg); - list_addtail(&pred->list, &succ->list); - - gpir_load_node *load = gpir_node_to_load(pred); - list_for_each_entry(gpir_reg, reg, &block->comp->reg_list, list) { - if (reg->index == src->reg.reg->index) { - load->reg = reg; - list_addtail(&load->reg_link, ®->uses_list); - break; - } + } else { + gpir_node *pred = block->comp->node_for_ssa[src->ssa->index]; + if (pred->block == block) + return pred; + reg = block->comp->reg_for_ssa[src->ssa->index]; } + } else { + pred = block->comp->node_for_reg[src->reg.reg->index]; + if (pred && pred->block == block && pred != succ) + return pred; + reg = reg_for_nir_reg(block->comp, src->reg.reg); } + assert(reg); + pred = gpir_node_create(block, gpir_op_load_reg); + gpir_load_node *load = gpir_node_to_load(pred); + load->reg = reg; + list_addtail(&pred->list, &succ->list); + return pred; } @@ -345,16 +387,6 @@ static bool gpir_emit_function(gpir_compiler *comp, nir_function_impl *impl) return true; } -gpir_reg *gpir_create_reg(gpir_compiler *comp) -{ - gpir_reg *reg = ralloc(comp, gpir_reg); - reg->index = comp->cur_reg++; - list_addtail(®->list, &comp->reg_list); - list_inithead(®->defs_list); - list_inithead(®->uses_list); - return reg; -} - static gpir_compiler *gpir_compiler_create(void *prog, 
unsigned num_reg, unsigned num_ssa) { gpir_compiler *comp = rzalloc(prog, gpir_compiler); @@ -362,13 +394,13 @@ static gpir_compiler *gpir_compiler_create(void *prog, unsigned num_reg, unsigne list_inithead(&comp->block_list); list_inithead(&comp->reg_list); - for (int i = 0; i < num_reg; i++) - gpir_create_reg(comp); - for (int i = 0; i < GPIR_VECTOR_SSA_NUM; i++) comp->vector_ssa[i].ssa = -1; - comp->var_nodes = rzalloc_array(comp, gpir_node *, num_ssa); + comp->node_for_ssa = rzalloc_array(comp, gpir_node *, num_ssa); + comp->node_for_reg = rzalloc_array(comp, gpir_node *, num_reg); + comp->reg_for_ssa = rzalloc_array(comp, gpir_reg *, num_ssa); + comp->reg_for_reg = rzalloc_array(comp, gpir_reg *, num_reg); comp->prog = prog; return comp; } diff --git a/src/gallium/drivers/lima/ir/gp/node.c b/src/gallium/drivers/lima/ir/gp/node.c index e62512890b3..78f7bd130ea 100644 --- a/src/gallium/drivers/lima/ir/gp/node.c +++ b/src/gallium/drivers/lima/ir/gp/node.c @@ -433,17 +433,6 @@ void gpir_node_delete(gpir_node *node) ralloc_free(dep); } - if (node->type == gpir_node_type_store) { - gpir_store_node *store = gpir_node_to_store(node); - if (store->reg) - list_del(&store->reg_link); - } - else if (node->type == gpir_node_type_load) { - gpir_load_node *load = gpir_node_to_load(node); - if (load->reg) - list_del(&load->reg_link); - } - list_del(&node->list); ralloc_free(node); } diff --git a/src/gallium/drivers/lima/ir/gp/reduce_scheduler.c b/src/gallium/drivers/lima/ir/gp/reduce_scheduler.c index a5013a59dbf..d51fc355f2b 100644 --- a/src/gallium/drivers/lima/ir/gp/reduce_scheduler.c +++ b/src/gallium/drivers/lima/ir/gp/reduce_scheduler.c @@ -190,21 +190,47 @@ static void schedule_block(gpir_block *block) schedule_ready_list(block, &ready_list); } -bool gpir_reduce_reg_pressure_schedule_prog(gpir_compiler *comp) +/* Due to how we translate from NIR, we never read a register written in the + * same block (we just pass the node through instead), so we don't have to + * worry about read-after-write dependencies. We do have to worry about + * write-after-read though, so we add those dependencies now. For example in a + * loop like this we need a dependency between the write and the read of i: + * + * i = ... + * while (...) { + * ... = i; + * i = i + 1; + * } + */ + +static void add_false_dependencies(gpir_compiler *comp) { - /* No need to build physical reg load/store dependency here, - * because we just exit SSA form, there should be at most - * one load and one store pair for a physical reg within a - * block, and the store must be after load with the output - * of load as input after some calculation. So we don't need to - * insert extra write-after-read or read-after-write dependecy - * for load/store nodes to maintain the right sequence before - * scheduling. - * - * Also no need to handle SSA def/use in difference block, - * because we'll load/store SSA to a physical reg if def/use - * are not in the same block. + /* Make sure we allocate this only once, in case there are many values and + * many blocks. 
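+    * Register indices are global to the program, so a single array of
+    * comp->cur_reg entries can be shared by all blocks; the
+    * store->block == block check below ignores stale entries left over
+    * from previously-visited blocks.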
*/ + gpir_node **last_written = calloc(comp->cur_reg, sizeof(gpir_node *)); + + list_for_each_entry(gpir_block, block, &comp->block_list, list) { + list_for_each_entry_rev(gpir_node, node, &block->node_list, list) { + if (node->op == gpir_op_load_reg) { + gpir_load_node *load = gpir_node_to_load(node); + gpir_node *store = last_written[load->reg->index]; + if (store && store->block == block) { + gpir_node_add_dep(store, node, GPIR_DEP_WRITE_AFTER_READ); + } + } else if (node->op == gpir_op_store_reg) { + gpir_store_node *store = gpir_node_to_store(node); + last_written[store->reg->index] = node; + } + } + } + + free(last_written); +} + +bool gpir_reduce_reg_pressure_schedule_prog(gpir_compiler *comp) +{ + add_false_dependencies(comp); list_for_each_entry(gpir_block, block, &comp->block_list, list) { block->rsched.node_index = 0; diff --git a/src/gallium/drivers/lima/ir/gp/regalloc.c b/src/gallium/drivers/lima/ir/gp/regalloc.c index c145bfbee81..91590d7364f 100644 --- a/src/gallium/drivers/lima/ir/gp/regalloc.c +++ b/src/gallium/drivers/lima/ir/gp/regalloc.c @@ -23,69 +23,435 @@ */ #include "gpir.h" +#include "util/u_dynarray.h" -/* Register allocation - * - * TODO: This needs to be rewritten when we support multiple basic blocks. We - * need to do proper liveness analysis, combined with either linear scan, - * graph coloring, or SSA-based allocation. We should also support spilling to - * temporaries. - * - * For now, this only assigns fake registers to values, used to build the fake - * dependencies that the scheduler relies on. In the future we should also be - * assigning actual physreg numbers to load_reg/store_reg nodes. - */ +/* Per-register information */ + +struct reg_info { + BITSET_WORD *conflicts; + struct util_dynarray conflict_list; + + /* Number of conflicts that must be allocated to physical registers. + */ + unsigned phys_conflicts; + + unsigned node_conflicts; + + /* Number of conflicts that can be allocated to either. 
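+    * This equals phys_conflicts + node_conflicts: add_interference()
+    * increments total_conflicts together with exactly one of the two
+    * counters above.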
*/ + unsigned total_conflicts; + + int assigned_color; + + bool visited; +}; -static void regalloc_block(gpir_block *block) +struct regalloc_ctx { + unsigned bitset_words, num_nodes_and_regs; + struct reg_info *registers; + + /* Reusable scratch liveness array */ + BITSET_WORD *live; + + unsigned *worklist; + unsigned worklist_start, worklist_end; + + unsigned *stack; + unsigned stack_size; + + gpir_compiler *comp; + void *mem_ctx; +}; + +/* Liveness analysis */ + +static void propagate_liveness_instr(gpir_node *node, BITSET_WORD *live, + gpir_compiler *comp) { - /* build each node sequence index in the block node list */ - int index = 0; - list_for_each_entry(gpir_node, node, &block->node_list, list) { - node->vreg.index = index++; + /* KILL */ + if (node->type == gpir_node_type_store) { + if (node->op == gpir_op_store_reg) { + gpir_store_node *store = gpir_node_to_store(node); + BITSET_CLEAR(live, store->reg->index); + } } - /* find the last successor of each node by the sequence index */ - list_for_each_entry(gpir_node, node, &block->node_list, list) { - node->vreg.last = NULL; - gpir_node_foreach_succ(node, dep) { - gpir_node *succ = dep->succ; - if (!node->vreg.last || node->vreg.last->vreg.index < succ->vreg.index) - node->vreg.last = succ; + /* GEN */ + if (node->type == gpir_node_type_load) { + if (node->op == gpir_op_load_reg) { + gpir_load_node *load = gpir_node_to_load(node); + BITSET_SET(live, load->reg->index); } } +} + +static bool propagate_liveness_block(gpir_block *block, struct regalloc_ctx *ctx) +{ + for (unsigned i = 0; i < 2; i++) { + if (block->successors[i]) { + for (unsigned j = 0; j < ctx->bitset_words; j++) + block->live_out[j] |= block->successors[i]->live_in[j]; + } + } + + memcpy(ctx->live, block->live_out, ctx->bitset_words * sizeof(BITSET_WORD)); + + list_for_each_entry_rev(gpir_node, node, &block->node_list, list) { + propagate_liveness_instr(node, ctx->live, block->comp); + } - /* do linear scan regalloc */ - int reg_search_start = 0; - gpir_node *active[GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM] = {0}; + bool changed = false; + for (unsigned i = 0; i < ctx->bitset_words; i++) { + changed |= (block->live_in[i] != ctx->live[i]); + block->live_in[i] = ctx->live[i]; + } + return changed; +} + +static void calc_def_block(gpir_block *block) +{ list_for_each_entry(gpir_node, node, &block->node_list, list) { - /* if some reg is expired */ - gpir_node_foreach_pred(node, dep) { - gpir_node *pred = dep->pred; - if (pred->vreg.last == node) - active[pred->value_reg] = NULL; - } - - /* no need to alloc value reg for root node */ - if (gpir_node_is_root(node)) { - node->value_reg = -1; - continue; - } - - /* find a free reg for this node */ - int i; - for (i = 0; i < GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM; i++) { - /* round robin reg select to reduce false dep when schedule */ - int reg = (reg_search_start + i) % (GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM); - if (!active[reg]) { - active[reg] = node; - node->value_reg = reg; - reg_search_start++; + if (node->op == gpir_op_store_reg) { + gpir_store_node *store = gpir_node_to_store(node); + BITSET_SET(block->def_out, store->reg->index); + } + } +} + +static void calc_liveness(struct regalloc_ctx *ctx) +{ + bool changed = true; + while (changed) { + changed = false; + list_for_each_entry_rev(gpir_block, block, &ctx->comp->block_list, list) { + changed |= propagate_liveness_block(block, ctx); + } + } + + list_for_each_entry(gpir_block, block, &ctx->comp->block_list, list) { + calc_def_block(block); + } + + changed = 
true;
+   while (changed) {
+      changed = false;
+      list_for_each_entry(gpir_block, block, &ctx->comp->block_list, list) {
+         for (unsigned i = 0; i < 2; i++) {
+            gpir_block *succ = block->successors[i];
+            if (!succ)
+               continue;
+
+            for (unsigned j = 0; j < ctx->bitset_words; j++) {
+               BITSET_WORD new = block->def_out[j] & ~succ->def_out[j];
+               changed |= (new != 0);
+               succ->def_out[j] |= block->def_out[j];
+            }
+         }
+      }
+   }
+}
+
+/* Interference calculation */
+
+static void add_interference(struct regalloc_ctx *ctx, unsigned i, unsigned j)
+{
+   if (i == j)
+      return;
+
+   struct reg_info *a = &ctx->registers[i];
+   struct reg_info *b = &ctx->registers[j];
+
+   if (BITSET_TEST(a->conflicts, j))
+      return;
+
+   BITSET_SET(a->conflicts, j);
+   BITSET_SET(b->conflicts, i);
+
+   a->total_conflicts++;
+   b->total_conflicts++;
+   if (j < ctx->comp->cur_reg)
+      a->phys_conflicts++;
+   else
+      a->node_conflicts++;
+
+   if (i < ctx->comp->cur_reg)
+      b->phys_conflicts++;
+   else
+      b->node_conflicts++;
+
+   util_dynarray_append(&a->conflict_list, unsigned, j);
+   util_dynarray_append(&b->conflict_list, unsigned, i);
+}
+
+/* Make the register or node "i" interfere with all the other live registers
+ * and nodes.
+ */
+static void add_all_interferences(struct regalloc_ctx *ctx,
+                                  unsigned i,
+                                  BITSET_WORD *live_nodes,
+                                  BITSET_WORD *live_regs)
+{
+   /* In the conflict graph, node n lives at index cur_reg + n. */
+   int live_node;
+   BITSET_WORD tmp;
+   BITSET_FOREACH_SET(live_node, tmp, live_nodes, ctx->comp->cur_index) {
+      add_interference(ctx, i,
+                       live_node + ctx->comp->cur_reg);
+   }
+
+   /* Registers occupy indices [0, cur_reg) in the conflict graph. */
+   int live_reg;
+   BITSET_FOREACH_SET(live_reg, tmp, live_regs, ctx->comp->cur_reg) {
+      add_interference(ctx, i, live_reg);
+   }
+}
+
+static void print_liveness(struct regalloc_ctx *ctx,
+                           BITSET_WORD *live_reg, BITSET_WORD *live_val)
+{
+   if (!(lima_debug & LIMA_DEBUG_GP))
+      return;
+
+   int live_idx;
+   BITSET_WORD tmp;
+   BITSET_FOREACH_SET(live_idx, tmp, live_reg, ctx->comp->cur_reg) {
+      printf("reg%d ", live_idx);
+   }
+   BITSET_FOREACH_SET(live_idx, tmp, live_val, ctx->comp->cur_index) {
+      printf("%d ", live_idx);
+   }
+   printf("\n");
+}
+
+static void calc_interference(struct regalloc_ctx *ctx)
+{
+   BITSET_WORD *live_nodes =
+      rzalloc_array(ctx->mem_ctx, BITSET_WORD, ctx->comp->cur_index);
+
+   list_for_each_entry(gpir_block, block, &ctx->comp->block_list, list) {
+      /* Initialize liveness at the end of the block, but exclude values that
+       * definitely aren't defined by the end. This helps out with
+       * partially-defined registers, like:
+       *
+       * if (condition) {
+       *    foo = ...;
+       * }
+       * if (condition) {
+       *    ... = foo;
+       * }
+       *
+       * If we naively propagated liveness backwards, foo would be live from
+       * the beginning of the program, but if we're not inside a loop then
+       * its value is undefined before the first if and we don't have to
+       * consider it live. Mask out registers like foo here. 
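+       *
+       * (Inside a loop the masking is still safe: the forward def_out
+       * propagation in calc_liveness() carries a definition around the
+       * back edge, so foo would remain in def_out, and hence live, at
+       * the loop header.)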
+ */ + for (unsigned i = 0; i < ctx->bitset_words; i++) { + ctx->live[i] = block->live_out[i] & block->def_out[i]; + } + + list_for_each_entry_rev(gpir_node, node, &block->node_list, list) { + gpir_debug("processing node %d\n", node->index); + print_liveness(ctx, ctx->live, live_nodes); + if (node->type != gpir_node_type_store && + node->type != gpir_node_type_branch) { + add_all_interferences(ctx, node->index + ctx->comp->cur_reg, + live_nodes, ctx->live); + + /* KILL */ + BITSET_CLEAR(live_nodes, node->index); + } else if (node->op == gpir_op_store_reg) { + gpir_store_node *store = gpir_node_to_store(node); + add_all_interferences(ctx, store->reg->index, + live_nodes, ctx->live); + + /* KILL */ + BITSET_CLEAR(ctx->live, store->reg->index); + } + + /* GEN */ + if (node->type == gpir_node_type_store) { + gpir_store_node *store = gpir_node_to_store(node); + BITSET_SET(live_nodes, store->child->index); + } else if (node->type == gpir_node_type_alu) { + gpir_alu_node *alu = gpir_node_to_alu(node); + for (int i = 0; i < alu->num_child; i++) + BITSET_SET(live_nodes, alu->children[i]->index); + } else if (node->type == gpir_node_type_branch) { + gpir_branch_node *branch = gpir_node_to_branch(node); + BITSET_SET(live_nodes, branch->cond->index); + } else if (node->op == gpir_op_load_reg) { + gpir_load_node *load = gpir_node_to_load(node); + BITSET_SET(ctx->live, load->reg->index); + } + } + } +} + +/* Register allocation */ + +static bool can_simplify(struct regalloc_ctx *ctx, unsigned i) +{ + struct reg_info *info = &ctx->registers[i]; + if (i < ctx->comp->cur_reg) { + /* Physical regs. */ + return info->phys_conflicts + info->node_conflicts < GPIR_PHYSICAL_REG_NUM; + } else { + /* Nodes: if we manage to allocate all of its conflicting physical + * registers, they will take up at most GPIR_PHYSICAL_REG_NUM colors, so + * we can ignore any more than that. 
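+       *
+       * For example, a node with 100 physical-register conflicts and 3
+       * node conflicts sees at most GPIR_PHYSICAL_REG_NUM + 3 blocked
+       * colors out of GPIR_PHYSICAL_REG_NUM + GPIR_VALUE_REG_NUM, so it
+       * is simplifiable as long as 3 < GPIR_VALUE_REG_NUM.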
+ */ + return MIN2(info->phys_conflicts, GPIR_PHYSICAL_REG_NUM) + + info->node_conflicts < GPIR_PHYSICAL_REG_NUM + GPIR_VALUE_REG_NUM; + } +} + +static void push_stack(struct regalloc_ctx *ctx, unsigned i) +{ + ctx->stack[ctx->stack_size++] = i; + if (i < ctx->comp->cur_reg) + gpir_debug("pushing reg%u\n", i); + else + gpir_debug("pushing %d\n", i - ctx->comp->cur_reg); + + struct reg_info *info = &ctx->registers[i]; + assert(info->visited); + + util_dynarray_foreach(&info->conflict_list, unsigned, conflict) { + struct reg_info *conflict_info = &ctx->registers[*conflict]; + if (i < ctx->comp->cur_reg) { + assert(conflict_info->phys_conflicts > 0); + conflict_info->phys_conflicts--; + } else { + assert(conflict_info->node_conflicts > 0); + conflict_info->node_conflicts--; + } + if (!ctx->registers[*conflict].visited && can_simplify(ctx, *conflict)) { + ctx->worklist[ctx->worklist_end++] = *conflict; + ctx->registers[*conflict].visited = true; + } + } +} + +static bool do_regalloc(struct regalloc_ctx *ctx) +{ + ctx->worklist_start = 0; + ctx->worklist_end = 0; + ctx->stack_size = 0; + + /* Step 1: find the initially simplifiable registers */ + for (int i = 0; i < ctx->comp->cur_reg + ctx->comp->cur_index; i++) { + if (can_simplify(ctx, i)) { + ctx->worklist[ctx->worklist_end++] = i; + ctx->registers[i].visited = true; + } + } + + while (true) { + /* Step 2: push onto the stack whatever we can */ + while (ctx->worklist_start != ctx->worklist_end) { + push_stack(ctx, ctx->worklist[ctx->worklist_start++]); + } + + if (ctx->stack_size < ctx->num_nodes_and_regs) { + /* If there are still unsimplifiable nodes left, we need to + * optimistically push a node onto the stack. Choose the one with + * the smallest number of current neighbors, since that's the most + * likely to succeed. 
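+          *
+          * This is the usual optimistic (Briggs-style) push, and is
+          * effectively step 3: the node is not guaranteed a color, but
+          * it may still get one in step 4 below if its remaining
+          * neighbors end up sharing colors.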
+ */ + unsigned min_conflicts = UINT_MAX; + unsigned best_reg = 0; + for (unsigned reg = 0; reg < ctx->num_nodes_and_regs; reg++) { + struct reg_info *info = &ctx->registers[reg]; + if (info->visited) + continue; + if (info->phys_conflicts + info->node_conflicts < min_conflicts) { + best_reg = reg; + min_conflicts = info->phys_conflicts + info->node_conflicts; + } + } + gpir_debug("optimistic triggered\n"); + ctx->registers[best_reg].visited = true; + push_stack(ctx, best_reg); + } else { + break; + } + } + + /* Step 4: pop off the stack and assign colors */ + for (int i = ctx->num_nodes_and_regs - 1; i >= 0; i--) { + unsigned idx = ctx->stack[i]; + struct reg_info *reg = &ctx->registers[idx]; + + unsigned num_available_regs; + if (idx < ctx->comp->cur_reg) { + num_available_regs = GPIR_PHYSICAL_REG_NUM; + } else { + num_available_regs = GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM; + } + + bool found = false; + unsigned start = i % num_available_regs; + for (unsigned j = 0; j < num_available_regs; j++) { + unsigned candidate = (j + start) % num_available_regs; + bool available = true; + util_dynarray_foreach(®->conflict_list, unsigned, conflict_idx) { + struct reg_info *conflict = &ctx->registers[*conflict_idx]; + if (conflict->assigned_color >= 0 && + conflict->assigned_color == (int) candidate) { + available = false; + break; + } + } + + if (available) { + reg->assigned_color = candidate; + found = true; break; } } - /* TODO: spill */ - assert(i != GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM); + /* TODO: spilling */ + if (!found) { + gpir_error("Failed to allocate registers\n"); + return false; + } + } + + return true; +} + +static void assign_regs(struct regalloc_ctx *ctx) +{ + list_for_each_entry(gpir_block, block, &ctx->comp->block_list, list) { + list_for_each_entry(gpir_node, node, &block->node_list, list) { + if (node->index >= 0) { + node->value_reg = + ctx->registers[ctx->comp->cur_reg + node->index].assigned_color; + } + + if (node->op == gpir_op_load_reg) { + gpir_load_node *load = gpir_node_to_load(node); + unsigned color = ctx->registers[load->reg->index].assigned_color; + load->index = color / 4; + load->component = color % 4; + } + + if (node->op == gpir_op_store_reg) { + gpir_store_node *store = gpir_node_to_store(node); + unsigned color = ctx->registers[store->reg->index].assigned_color; + store->index = color / 4; + store->component = color % 4; + node->value_reg = color; + } + } + + block->live_out_phys = 0; + + int reg_idx; + BITSET_WORD tmp; + BITSET_FOREACH_SET(reg_idx, tmp, block->live_out, ctx->comp->cur_reg) { + if (BITSET_TEST(block->def_out, reg_idx)) { + block->live_out_phys |= (1ull << ctx->registers[reg_idx].assigned_color); + } + } } } @@ -104,6 +470,14 @@ static void regalloc_print_result(gpir_compiler *comp) gpir_node *pred = dep->pred; printf(" %d/%d", pred->index, pred->value_reg); } + if (node->op == gpir_op_load_reg) { + gpir_load_node *load = gpir_node_to_load(node); + printf(" -/%d", 4 * load->index + load->component); + printf(" (%d)", load->reg->index); + } else if (node->op == gpir_op_store_reg) { + gpir_store_node *store = gpir_node_to_store(node); + printf(" (%d)", store->reg->index); + } printf("\n"); } printf("----------------------------\n"); @@ -112,10 +486,38 @@ static void regalloc_print_result(gpir_compiler *comp) bool gpir_regalloc_prog(gpir_compiler *comp) { + struct regalloc_ctx ctx; + + ctx.mem_ctx = ralloc_context(NULL); + ctx.num_nodes_and_regs = comp->cur_reg + comp->cur_index; + ctx.bitset_words = 
BITSET_WORDS(ctx.num_nodes_and_regs); + ctx.live = ralloc_array(ctx.mem_ctx, BITSET_WORD, ctx.bitset_words); + ctx.worklist = ralloc_array(ctx.mem_ctx, unsigned, ctx.num_nodes_and_regs); + ctx.stack = ralloc_array(ctx.mem_ctx, unsigned, ctx.num_nodes_and_regs); + ctx.comp = comp; + + ctx.registers = rzalloc_array(ctx.mem_ctx, struct reg_info, ctx.num_nodes_and_regs); + for (unsigned i = 0; i < ctx.num_nodes_and_regs; i++) { + ctx.registers[i].conflicts = rzalloc_array(ctx.mem_ctx, BITSET_WORD, + ctx.bitset_words); + util_dynarray_init(&ctx.registers[i].conflict_list, ctx.mem_ctx); + } + list_for_each_entry(gpir_block, block, &comp->block_list, list) { - regalloc_block(block); + block->live_out = rzalloc_array(ctx.mem_ctx, BITSET_WORD, ctx.bitset_words); + block->live_in = rzalloc_array(ctx.mem_ctx, BITSET_WORD, ctx.bitset_words); + block->def_out = rzalloc_array(ctx.mem_ctx, BITSET_WORD, ctx.bitset_words); + } + + calc_liveness(&ctx); + calc_interference(&ctx); + if (!do_regalloc(&ctx)) { + ralloc_free(ctx.mem_ctx); + return false; } + assign_regs(&ctx); regalloc_print_result(comp); + ralloc_free(ctx.mem_ctx); return true; } diff --git a/src/gallium/drivers/lima/ir/gp/scheduler.c b/src/gallium/drivers/lima/ir/gp/scheduler.c index 3b490974fc0..ac132776d43 100644 --- a/src/gallium/drivers/lima/ir/gp/scheduler.c +++ b/src/gallium/drivers/lima/ir/gp/scheduler.c @@ -215,6 +215,14 @@ typedef struct { * schedule the instruction. */ int total_spill_needed; + + /* For each physical register, a linked list of loads associated with it in + * this block. When we spill a value to a given register, and there are + * existing loads associated with it that haven't been scheduled yet, we + * have to make sure that the corresponding unspill happens after the last + * original use has happened, i.e. is scheduled before. + */ + struct list_head physreg_reads[GPIR_PHYSICAL_REG_NUM]; } sched_ctx; static int gpir_min_dist_alu(gpir_dep *dep) @@ -535,6 +543,19 @@ static bool _try_place_node(sched_ctx *ctx, gpir_instr *instr, gpir_node *node) } } + if (node->op == gpir_op_store_reg) { + /* This register may be loaded in the next basic block, in which case + * there still needs to be a 2 instruction gap. We do what the blob + * seems to do and simply disable stores in the last two instructions of + * the basic block. + * + * TODO: We may be able to do better than this, but we have to check + * first if storing a register works across branches. + */ + if (instr->index < 2) + return false; + } + node->sched.instr = instr; int max_node_spill_needed = INT_MAX; @@ -1009,10 +1030,6 @@ static bool try_spill_node(sched_ctx *ctx, gpir_node *node) ctx->live_physregs |= (1ull << physreg); - /* TODO: when we support multiple basic blocks, there may be register - * loads/stores to this register other than this one that haven't been - * scheduled yet so we may need to insert write-after-read dependencies. 
- */ gpir_store_node *store = gpir_node_create(ctx->block, gpir_op_store_reg); store->index = physreg / 4; store->component = physreg % 4; @@ -1030,6 +1047,16 @@ static bool try_spill_node(sched_ctx *ctx, gpir_node *node) } node->sched.physreg_store = store; gpir_node_add_dep(&store->node, node, GPIR_DEP_INPUT); + + list_for_each_entry(gpir_load_node, load, + &ctx->physreg_reads[physreg], reg_link) { + gpir_node_add_dep(&store->node, &load->node, GPIR_DEP_WRITE_AFTER_READ); + if (load->node.sched.ready) { + list_del(&load->node.list); + load->node.sched.ready = false; + } + } + node->sched.ready = false; schedule_insert_ready_list(ctx, &store->node); } @@ -1556,13 +1583,21 @@ static bool schedule_block(gpir_block *block) list_inithead(&ctx.ready_list); ctx.block = block; ctx.ready_list_slots = 0; - /* TODO initialize with block live out once we have proper liveness - * tracking - */ - ctx.live_physregs = 0; + ctx.live_physregs = block->live_out_phys; + + for (unsigned i = 0; i < GPIR_PHYSICAL_REG_NUM; i++) { + list_inithead(&ctx.physreg_reads[i]); + } /* construct the ready list from root nodes */ list_for_each_entry_safe(gpir_node, node, &block->node_list, list) { + /* Add to physreg_reads */ + if (node->op == gpir_op_load_reg) { + gpir_load_node *load = gpir_node_to_load(node); + unsigned index = 4 * load->index + load->component; + list_addtail(&load->reg_link, &ctx.physreg_reads[index]); + } + if (gpir_node_is_root(node)) schedule_insert_ready_list(&ctx, node); } @@ -1623,22 +1658,6 @@ static void schedule_build_dependency(gpir_block *block) } } - /* Forward dependencies. We only need to add these for register loads, - * since value registers already have an input dependency. - */ - list_for_each_entry(gpir_node, node, &block->node_list, list) { - if (node->op == gpir_op_load_reg) { - gpir_load_node *load = gpir_node_to_load(node); - unsigned index = 4 * load->index + load->component; - if (last_written[index]) { - gpir_node_add_dep(node, last_written[index], GPIR_DEP_READ_AFTER_WRITE); - } - } - - if (node->value_reg >= 0) - last_written[node->value_reg] = node; - } - memset(last_written, 0, sizeof(last_written)); /* False dependencies. For value registers, these exist only to make sure @@ -1651,6 +1670,10 @@ static void schedule_build_dependency(gpir_block *block) if (last_written[index]) { gpir_node_add_dep(last_written[index], node, GPIR_DEP_WRITE_AFTER_READ); } + } else if (node->op == gpir_op_store_reg) { + gpir_store_node *store = gpir_node_to_store(node); + unsigned index = 4 * store->index + store->component; + last_written[index] = node; } else { add_fake_dep(node, node, last_written); } -- 2.30.2