From 71e5ba9c011939c962018af7f3ca78b600c95148 Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Mon, 22 Sep 2014 12:16:16 -0700 Subject: [PATCH] vc4: Switch to using Mesa's register allocator. This will let me more reliably allocate a-file registers, which are going to be even more in demand when I start using a-file unpacks. Also fixes a bug where the reservation of payload registers (FRAG_Z/W) was off by one; simply fixing that off-by-one in the old allocator caused register allocation to fail entirely. --- src/gallium/drivers/vc4/Makefile.am | 1 + src/gallium/drivers/vc4/vc4_context.h | 8 +- src/gallium/drivers/vc4/vc4_program.c | 12 +- src/gallium/drivers/vc4/vc4_qpu_emit.c | 4 +- .../drivers/vc4/vc4_register_allocate.c | 210 +++++++++--------- 5 files changed, 121 insertions(+), 114 deletions(-) diff --git a/src/gallium/drivers/vc4/Makefile.am b/src/gallium/drivers/vc4/Makefile.am index 0b8279d1763..774463138d0 100644 --- a/src/gallium/drivers/vc4/Makefile.am +++ b/src/gallium/drivers/vc4/Makefile.am @@ -31,6 +31,7 @@ AM_CFLAGS = \ $(LIBDRM_CFLAGS) \ $(GALLIUM_DRIVER_CFLAGS) \ $(SIM_CFLAGS) \ + -I$(top_srcdir)/src/mesa/ \ $() noinst_LTLIBRARIES = libvc4.la diff --git a/src/gallium/drivers/vc4/vc4_context.h b/src/gallium/drivers/vc4/vc4_context.h index 549becdbf66..1dc0f97a775 100644 --- a/src/gallium/drivers/vc4/vc4_context.h +++ b/src/gallium/drivers/vc4/vc4_context.h @@ -165,6 +165,10 @@ struct vc4_context { struct util_hash_table *fs_cache, *vs_cache; + struct ra_regs *regs; + unsigned int reg_class_any; + unsigned int reg_class_a; + /** @{ Current pipeline state objects */ struct pipe_scissor_state scissor; struct pipe_blend_state *blend; @@ -237,8 +241,8 @@ void vc4_write_uniforms(struct vc4_context *vc4, void vc4_flush(struct pipe_context *pctx); void vc4_flush_for_bo(struct pipe_context *pctx, struct vc4_bo *bo); void vc4_emit_state(struct pipe_context *pctx); -void vc4_generate_code(struct vc4_compile *c); -struct qpu_reg *vc4_register_allocate(struct vc4_compile *c); 
+void vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c); +struct qpu_reg *vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c); void vc4_update_compiled_shaders(struct vc4_context *vc4, uint8_t prim_mode); bool vc4_rt_format_supported(enum pipe_format f); diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c index 38d0608480c..60d9ce92935 100644 --- a/src/gallium/drivers/vc4/vc4_program.c +++ b/src/gallium/drivers/vc4/vc4_program.c @@ -1373,7 +1373,8 @@ emit_coord_end(struct vc4_compile *c) } static struct vc4_compile * -vc4_shader_tgsi_to_qir(struct vc4_compiled_shader *shader, enum qstage stage, +vc4_shader_tgsi_to_qir(struct vc4_context *vc4, + struct vc4_compiled_shader *shader, enum qstage stage, struct vc4_key *key) { struct vc4_compile *c = qir_compile_init(); @@ -1454,7 +1455,7 @@ vc4_shader_tgsi_to_qir(struct vc4_compiled_shader *shader, enum qstage stage, qir_dump(c); } qir_reorder_uniforms(c); - vc4_generate_code(c); + vc4_generate_code(vc4, c); if (vc4_debug & VC4_DEBUG_SHADERDB) { fprintf(stderr, "SHADER-DB: %s: %d instructions\n", @@ -1501,7 +1502,8 @@ static void vc4_fs_compile(struct vc4_context *vc4, struct vc4_compiled_shader *shader, struct vc4_fs_key *key) { - struct vc4_compile *c = vc4_shader_tgsi_to_qir(shader, QSTAGE_FRAG, + struct vc4_compile *c = vc4_shader_tgsi_to_qir(vc4, shader, + QSTAGE_FRAG, &key->base); shader->num_inputs = c->num_inputs; copy_uniform_state_to_shader(shader, 0, c); @@ -1516,12 +1518,12 @@ static void vc4_vs_compile(struct vc4_context *vc4, struct vc4_compiled_shader *shader, struct vc4_vs_key *key) { - struct vc4_compile *vs_c = vc4_shader_tgsi_to_qir(shader, + struct vc4_compile *vs_c = vc4_shader_tgsi_to_qir(vc4, shader, QSTAGE_VERT, &key->base); copy_uniform_state_to_shader(shader, 0, vs_c); - struct vc4_compile *cs_c = vc4_shader_tgsi_to_qir(shader, + struct vc4_compile *cs_c = vc4_shader_tgsi_to_qir(vc4, shader, QSTAGE_COORD, &key->base); 
copy_uniform_state_to_shader(shader, 1, cs_c); diff --git a/src/gallium/drivers/vc4/vc4_qpu_emit.c b/src/gallium/drivers/vc4/vc4_qpu_emit.c index 2fa131f134a..9a5dfa47a5b 100644 --- a/src/gallium/drivers/vc4/vc4_qpu_emit.c +++ b/src/gallium/drivers/vc4/vc4_qpu_emit.c @@ -208,9 +208,9 @@ serialize_insts(struct vc4_compile *c) } void -vc4_generate_code(struct vc4_compile *c) +vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) { - struct qpu_reg *temp_registers = vc4_register_allocate(c); + struct qpu_reg *temp_registers = vc4_register_allocate(vc4, c); bool discard = false; make_empty_list(&c->qpu_inst_list); diff --git a/src/gallium/drivers/vc4/vc4_register_allocate.c b/src/gallium/drivers/vc4/vc4_register_allocate.c index 1f9623aa7d2..204c080467e 100644 --- a/src/gallium/drivers/vc4/vc4_register_allocate.c +++ b/src/gallium/drivers/vc4/vc4_register_allocate.c @@ -21,8 +21,8 @@ * IN THE SOFTWARE. */ -#include - +#include "util/ralloc.h" +#include "util/register_allocate.h" #include "vc4_context.h" #include "vc4_qir.h" #include "vc4_qpu.h" @@ -104,135 +104,135 @@ static const struct qpu_reg vc4_regs[] = { #define A_INDEX (ACC_INDEX + 5) #define B_INDEX (A_INDEX + 32) +static void +vc4_alloc_reg_set(struct vc4_context *vc4) +{ + assert(vc4_regs[A_INDEX].addr == 0); + assert(vc4_regs[B_INDEX].addr == 0); + STATIC_ASSERT(ARRAY_SIZE(vc4_regs) == B_INDEX + 32); + + if (vc4->regs) + return; + + vc4->regs = ra_alloc_reg_set(vc4, ARRAY_SIZE(vc4_regs)); + + vc4->reg_class_any = ra_alloc_reg_class(vc4->regs); + for (uint32_t i = 0; i < ARRAY_SIZE(vc4_regs); i++) { + /* Reserve r3 for now, since we're using it for spilling-like + * operations in vc4_qpu_emit.c + */ + if (vc4_regs[i].mux == QPU_MUX_R3) + continue; + + /* R4 can't be written as a general purpose register. (it's + * TMU_NOSWAP as a write address). 
+ */ + if (vc4_regs[i].mux == QPU_MUX_R4) + continue; + + ra_class_add_reg(vc4->regs, vc4->reg_class_any, i); + } + + vc4->reg_class_a = ra_alloc_reg_class(vc4->regs); + for (uint32_t i = A_INDEX; i < A_INDEX + 32; i++) + ra_class_add_reg(vc4->regs, vc4->reg_class_a, i); + + ra_set_finalize(vc4->regs, NULL); +} + /** * Returns a mapping from QFILE_TEMP indices to struct qpu_regs. * * The return value should be freed by the caller. */ struct qpu_reg * -vc4_register_allocate(struct vc4_compile *c) +vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) { struct simple_node *node; - bool reg_in_use[ARRAY_SIZE(vc4_regs)]; - int *reg_allocated = calloc(c->num_temps, sizeof(*reg_allocated)); - int *reg_uses_remaining = - calloc(c->num_temps, sizeof(*reg_uses_remaining)); + uint32_t def[c->num_temps]; + uint32_t use[c->num_temps]; struct qpu_reg *temp_registers = calloc(c->num_temps, sizeof(*temp_registers)); - - for (int i = 0; i < ARRAY_SIZE(reg_in_use); i++) - reg_in_use[i] = false; - for (int i = 0; i < c->num_temps; i++) - reg_allocated[i] = -1; + memset(def, 0, sizeof(def)); + memset(use, 0, sizeof(use)); /* If things aren't ever written (undefined values), just read from * r0. */ - for (int i = 0; i < c->num_temps; i++) + for (uint32_t i = 0; i < c->num_temps; i++) temp_registers[i] = qpu_rn(0); - /* Reserve r3 for spilling-like operations in vc4_qpu_emit.c */ - reg_in_use[ACC_INDEX + 3] = true; + vc4_alloc_reg_set(vc4); + + struct ra_graph *g = ra_alloc_interference_graph(vc4->regs, + c->num_temps); + + for (uint32_t i = 0; i < c->num_temps; i++) + ra_set_node_class(g, i, vc4->reg_class_any); + /* Compute the live ranges so we can figure out interference, and + * figure out our register classes and preallocated registers. 
+ */ + uint32_t ip = 0; foreach(node, &c->instructions) { - struct qinst *qinst = (struct qinst *)node; + struct qinst *inst = (struct qinst *)node; - if (qinst->dst.file == QFILE_TEMP) - reg_uses_remaining[qinst->dst.index]++; - for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) { - if (qinst->src[i].file == QFILE_TEMP) - reg_uses_remaining[qinst->src[i].index]++; + if (inst->dst.file == QFILE_TEMP) { + def[inst->dst.index] = ip; + use[inst->dst.index] = ip; } - if (qinst->op == QOP_FRAG_Z) - reg_in_use[3 + 32 + QPU_R_FRAG_PAYLOAD_ZW] = true; - if (qinst->op == QOP_FRAG_W) - reg_in_use[3 + QPU_R_FRAG_PAYLOAD_ZW] = true; - } - foreach(node, &c->instructions) { - struct qinst *qinst = (struct qinst *)node; - - for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) { - int index = qinst->src[i].index; - - if (qinst->src[i].file != QFILE_TEMP) - continue; - - if (reg_allocated[index] == -1) { - fprintf(stderr, "undefined reg use: "); - qir_dump_inst(qinst); - fprintf(stderr, "\n"); - } else { - reg_uses_remaining[index]--; - if (reg_uses_remaining[index] == 0) - reg_in_use[reg_allocated[index]] = false; - } + for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) { + if (inst->src[i].file == QFILE_TEMP) + use[inst->src[i].index] = ip; } - if (qinst->dst.file == QFILE_TEMP) { - if (reg_allocated[qinst->dst.index] == -1) { - int alloc; - for (alloc = 0; - alloc < ARRAY_SIZE(reg_in_use); - alloc++) { - struct qpu_reg reg = vc4_regs[alloc]; - - switch (qinst->op) { - case QOP_PACK_SCALED: - /* The pack flags require an - * A-file register. - */ - if (reg.mux != QPU_MUX_A) - continue; - break; - case QOP_TEX_RESULT: - case QOP_TLB_COLOR_READ: - /* Only R4-generating - * instructions get to store - * values in R4 for now, until - * we figure out how to do - * interference. 
- */ - if (reg.mux != QPU_MUX_R4) - continue; - break; - case QOP_FRAG_Z: - if (reg.mux != QPU_MUX_B || - reg.addr != QPU_R_FRAG_PAYLOAD_ZW) { - continue; - } - break; - case QOP_FRAG_W: - if (reg.mux != QPU_MUX_A || - reg.addr != QPU_R_FRAG_PAYLOAD_ZW) { - continue; - } - break; - default: - if (reg.mux == QPU_MUX_R4) - continue; - break; - } - - if (!reg_in_use[alloc]) - break; - } - assert(alloc != ARRAY_SIZE(reg_in_use) && "need better reg alloc"); - reg_in_use[alloc] = true; - reg_allocated[qinst->dst.index] = alloc; - temp_registers[qinst->dst.index] = vc4_regs[alloc]; - } - - reg_uses_remaining[qinst->dst.index]--; - if (reg_uses_remaining[qinst->dst.index] == 0) { - reg_in_use[reg_allocated[qinst->dst.index]] = - false; - } + switch (inst->op) { + case QOP_FRAG_Z: + def[inst->dst.index] = 0; + ra_set_node_reg(g, inst->dst.index, + B_INDEX + QPU_R_FRAG_PAYLOAD_ZW); + break; + + case QOP_FRAG_W: + def[inst->dst.index] = 0; + ra_set_node_reg(g, inst->dst.index, + A_INDEX + QPU_R_FRAG_PAYLOAD_ZW); + break; + + case QOP_TEX_RESULT: + case QOP_TLB_COLOR_READ: + assert(vc4_regs[ACC_INDEX + 4].mux == QPU_MUX_R4); + ra_set_node_reg(g, inst->dst.index, + ACC_INDEX + 4); + break; + + case QOP_PACK_SCALED: + /* The pack flags require an A-file dst register. */ + ra_set_node_class(g, inst->dst.index, vc4->reg_class_a); + break; + + default: + break; } + + ip++; } - free(reg_allocated); - free(reg_uses_remaining); + for (uint32_t i = 0; i < c->num_temps; i++) { + for (uint32_t j = i + 1; j < c->num_temps; j++) { + if (!(def[i] >= use[j] || def[j] >= use[i])) + ra_add_node_interference(g, i, j); + } + } + + bool ok = ra_allocate(g); + assert(ok); + + for (uint32_t i = 0; i < c->num_temps; i++) + temp_registers[i] = vc4_regs[ra_get_node_reg(g, i)]; + + ralloc_free(g); return temp_registers; } -- 2.30.2