X-Git-Url: https://git.libre-soc.org/?p=mesa.git;a=blobdiff_plain;f=src%2Fbroadcom%2Fcompiler%2Fvir_register_allocate.c;h=d88a8df1141d6f7622c8a421f068eac981750f9b;hp=4ec5f232643ec1fb9b6ea65690a49e4ec05522af;hb=36aed70b591f7f4f642b26f46f7928be6d137e7b;hpb=facc3c6f58de88ac3707a1b8435b7fc655d13124

diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c
index 4ec5f232643..d88a8df1141 100644
--- a/src/broadcom/compiler/vir_register_allocate.c
+++ b/src/broadcom/compiler/vir_register_allocate.c
@@ -29,28 +29,44 @@
 #define QPU_R(i) { .magic = false, .index = i }
 
 #define ACC_INDEX 0
-#define ACC_COUNT 5
+#define ACC_COUNT 6
 #define PHYS_INDEX (ACC_INDEX + ACC_COUNT)
 #define PHYS_COUNT 64
 
+static inline bool
+qinst_writes_tmu(struct qinst *inst)
+{
+        return (inst->dst.file == QFILE_MAGIC &&
+                v3d_qpu_magic_waddr_is_tmu(inst->dst.index));
+}
+
 static bool
 is_last_ldtmu(struct qinst *inst, struct qblock *block)
 {
-        list_for_each_entry_from(struct qinst, scan_inst, inst,
+        list_for_each_entry_from(struct qinst, scan_inst, inst->link.next,
                                  &block->instructions, link) {
-                if (inst->qpu.sig.ldtmu)
+                if (scan_inst->qpu.sig.ldtmu)
                         return false;
-                if (v3d_qpu_writes_tmu(&inst->qpu))
+                if (qinst_writes_tmu(scan_inst))
                         return true;
         }
 
         return true;
 }
 
+static bool
+vir_is_mov_uniform(struct v3d_compile *c, int temp)
+{
+        struct qinst *def = c->defs[temp];
+
+        return def && def->qpu.sig.ldunif;
+}
+
 static int
 v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g,
                       uint32_t *temp_to_node)
 {
+        const float tmu_scale = 5;
         float block_scale = 1.0;
         float spill_costs[c->num_temps];
         bool in_tmu_operation = false;
@@ -75,25 +91,40 @@ v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g,
                                         continue;
 
                                 int temp = inst->src[i].index;
-                                if (no_spilling) {
-                                        BITSET_CLEAR(c->spillable,
-                                                     temp);
-                                } else {
+                                if (vir_is_mov_uniform(c, temp)) {
                                         spill_costs[temp] += block_scale;
+                                } else if (!no_spilling) {
+                                        spill_costs[temp] += (block_scale *
+                                                              tmu_scale);
+                                } else {
+                                        BITSET_CLEAR(c->spillable, temp);
                                 }
                         }
 
                         if (inst->dst.file == QFILE_TEMP) {
                                 int temp = inst->dst.index;
 
-                                if (no_spilling) {
-                                        BITSET_CLEAR(c->spillable,
-                                                     temp);
+                                if (vir_is_mov_uniform(c, temp)) {
+                                        /* We just rematerialize the uniform
+                                         * later.
+                                         */
+                                } else if (!no_spilling) {
+                                        spill_costs[temp] += (block_scale *
+                                                              tmu_scale);
                                 } else {
-                                        spill_costs[temp] += block_scale;
+                                        BITSET_CLEAR(c->spillable, temp);
                                 }
                         }
 
+                        /* Refuse to spill a ldvary's dst, because that means
+                         * that ldvary's r5 would end up being used across a
+                         * thrsw.
+                         */
+                        if (inst->qpu.sig.ldvary) {
+                                assert(inst->dst.file == QFILE_TEMP);
+                                BITSET_CLEAR(c->spillable, inst->dst.index);
+                        }
+
                         if (inst->is_last_thrsw)
                                 started_last_seg = true;
 
@@ -102,7 +133,7 @@ v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g,
                                 started_last_seg = true;
 
                         /* Track when we're in between a TMU setup and the
-                         * final LDTMU from that TMU setup.  We can't
+                         * final LDTMU or TMUWT from that TMU setup.  We can't
                          * spill/fill any temps during that time, because that
                          * involves inserting a new TMU setup/LDTMU sequence.
                         */
@@ -110,7 +141,11 @@ v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g,
                             is_last_ldtmu(inst, block))
                                 in_tmu_operation = false;
 
-                        if (v3d_qpu_writes_tmu(&inst->qpu))
+                        if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
+                            inst->qpu.alu.add.op == V3D_QPU_A_TMUWT)
+                                in_tmu_operation = false;
+
+                        if (qinst_writes_tmu(inst))
                                 in_tmu_operation = true;
                 }
         }
@@ -128,7 +163,7 @@ v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g,
 /* The spill offset for this thread takes a bit of setup, so do it once at
  * program start.
  */
-static void
+void
 v3d_setup_spill_base(struct v3d_compile *c)
 {
         c->cursor = vir_before_block(vir_entry_block(c));
@@ -157,6 +192,8 @@ v3d_setup_spill_base(struct v3d_compile *c)
         /* Make sure that we don't spill the spilling setup instructions. */
         for (int i = start_num_temps; i < c->num_temps; i++)
                 BITSET_CLEAR(c->spillable, i);
+
+        c->cursor = vir_after_block(c->cur_block);
 }
 
 static void
@@ -171,18 +208,30 @@ v3d_emit_spill_tmua(struct v3d_compile *c, uint32_t spill_offset)
 static void
 v3d_spill_reg(struct v3d_compile *c, int spill_temp)
 {
-        uint32_t spill_offset = c->spill_size;
-        c->spill_size += 16 * sizeof(uint32_t);
+        bool is_uniform = vir_is_mov_uniform(c, spill_temp);
 
-        if (spill_offset == 0)
-                v3d_setup_spill_base(c);
+        uint32_t spill_offset = 0;
+
+        if (!is_uniform) {
+                spill_offset = c->spill_size;
+                c->spill_size += V3D_CHANNELS * sizeof(uint32_t);
+
+                if (spill_offset == 0)
+                        v3d_setup_spill_base(c);
+        }
 
         struct qinst *last_thrsw = c->last_thrsw;
         assert(!last_thrsw || last_thrsw->is_last_thrsw);
 
         int start_num_temps = c->num_temps;
 
-        vir_for_each_inst_inorder(inst, c) {
+        int uniform_index = ~0;
+        if (is_uniform) {
+                struct qinst *orig_unif = c->defs[spill_temp];
+                uniform_index = orig_unif->uniform;
+        }
+
+        vir_for_each_inst_inorder_safe(inst, c) {
                 for (int i = 0; i < vir_get_nsrc(inst); i++) {
                         if (inst->src[i].file != QFILE_TEMP ||
                             inst->src[i].index != spill_temp) {
@@ -191,22 +240,38 @@ v3d_spill_reg(struct v3d_compile *c, int spill_temp)
 
                         c->cursor = vir_before_inst(inst);
 
-                        v3d_emit_spill_tmua(c, spill_offset);
-                        vir_emit_thrsw(c);
-                        inst->src[i] = vir_LDTMU(c);
-                        c->fills++;
+                        if (is_uniform) {
+                                struct qreg unif =
+                                        vir_uniform(c,
+                                                    c->uniform_contents[uniform_index],
+                                                    c->uniform_data[uniform_index]);
+                                inst->src[i] = unif;
+                        } else {
+                                v3d_emit_spill_tmua(c, spill_offset);
+                                vir_emit_thrsw(c);
+                                inst->src[i] = vir_LDTMU(c);
+                                c->fills++;
+                        }
                 }
 
                 if (inst->dst.file == QFILE_TEMP &&
                     inst->dst.index == spill_temp) {
-                        c->cursor = vir_after_inst(inst);
-
-                        inst->dst.index = c->num_temps++;
-                        vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
-                                     inst->dst);
-                        v3d_emit_spill_tmua(c, spill_offset);
-                        vir_emit_thrsw(c);
-                        c->spills++;
+                        if (is_uniform) {
+                                c->cursor.link = NULL;
+                                vir_remove_instruction(c, inst);
+                        } else {
+                                c->cursor = vir_after_inst(inst);
+
+                                inst->dst.index = c->num_temps++;
+                                vir_MOV_dest(c, vir_reg(QFILE_MAGIC,
+                                                        V3D_QPU_WADDR_TMUD),
+                                             inst->dst);
+                                v3d_emit_spill_tmua(c, spill_offset);
+                                vir_emit_thrsw(c);
+                                vir_TMUWT(c);
+                                c->spills++;
+                                c->tmu_dirty_rcl = true;
+                        }
                 }
 
                 /* If we didn't have a last-thrsw inserted by nir_to_vir and
@@ -214,7 +279,7 @@ v3d_spill_reg(struct v3d_compile *c, int spill_temp)
                  * right before we start the vpm/tlb sequence for the last
                  * thread segment.
                  */
-                if (!last_thrsw && c->last_thrsw &&
+                if (!is_uniform && !last_thrsw && c->last_thrsw &&
                     (v3d_qpu_writes_vpm(&inst->qpu) ||
                      v3d_qpu_uses_tlb(&inst->qpu))) {
                         c->cursor = vir_before_inst(inst);
@@ -238,6 +303,51 @@ v3d_spill_reg(struct v3d_compile *c, int spill_temp)
                 BITSET_CLEAR(c->spillable, i);
 }
 
+struct v3d_ra_select_callback_data {
+        uint32_t next_acc;
+        uint32_t next_phys;
+};
+
+static unsigned int
+v3d_ra_select_callback(unsigned int n, BITSET_WORD *regs, void *data)
+{
+        struct v3d_ra_select_callback_data *v3d_ra = data;
+        int r5 = ACC_INDEX + 5;
+
+        /* Choose r5 for our ldunifs if possible (nobody else can load to that
+         * reg, and it keeps the QPU cond field free from being occupied by
+         * ldunifrf).
+         */
+        if (BITSET_TEST(regs, r5))
+                return r5;
+
+        /* Choose an accumulator if possible (I think it's lower power than
+         * phys regs), but round-robin through them to give post-RA
+         * instruction selection more options.
+         */
+        for (int i = 0; i < ACC_COUNT; i++) {
+                int acc_off = (v3d_ra->next_acc + i) % ACC_COUNT;
+                int acc = ACC_INDEX + acc_off;
+
+                if (BITSET_TEST(regs, acc)) {
+                        v3d_ra->next_acc = acc_off + 1;
+                        return acc;
+                }
+        }
+
+        for (int i = 0; i < PHYS_COUNT; i++) {
+                int phys_off = (v3d_ra->next_phys + i) % PHYS_COUNT;
+                int phys = PHYS_INDEX + phys_off;
+
+                if (BITSET_TEST(regs, phys)) {
+                        v3d_ra->next_phys = phys_off + 1;
+                        return phys;
+                }
+        }
+
+        unreachable("RA must pass us at least one possible reg.");
+}
+
 bool
 vir_init_reg_sets(struct v3d_compiler *compiler)
 {
@@ -252,6 +362,10 @@ vir_init_reg_sets(struct v3d_compiler *compiler)
                 return false;
 
         for (int threads = 0; threads < max_thread_index; threads++) {
+                compiler->reg_class_any[threads] =
+                        ra_alloc_reg_class(compiler->regs);
+                compiler->reg_class_r5[threads] =
+                        ra_alloc_reg_class(compiler->regs);
                 compiler->reg_class_phys_or_acc[threads] =
                         ra_alloc_reg_class(compiler->regs);
                 compiler->reg_class_phys[threads] =
@@ -263,12 +377,25 @@ vir_init_reg_sets(struct v3d_compiler *compiler)
                                          compiler->reg_class_phys_or_acc[threads], i);
                         ra_class_add_reg(compiler->regs,
                                          compiler->reg_class_phys[threads], i);
+                        ra_class_add_reg(compiler->regs,
+                                         compiler->reg_class_any[threads], i);
                 }
 
-                for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT; i++) {
+                for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT - 1; i++) {
                         ra_class_add_reg(compiler->regs,
                                          compiler->reg_class_phys_or_acc[threads], i);
+                        ra_class_add_reg(compiler->regs,
+                                         compiler->reg_class_any[threads], i);
                 }
+                /* r5 can only store a single 32-bit value, so not much can
+                 * use it.
+                 */
+                ra_class_add_reg(compiler->regs,
+                                 compiler->reg_class_r5[threads],
+                                 ACC_INDEX + 5);
+                ra_class_add_reg(compiler->regs,
+                                 compiler->reg_class_any[threads],
+                                 ACC_INDEX + 5);
         }
 
         ra_set_finalize(compiler->regs, NULL);
@@ -291,9 +418,11 @@ node_to_temp_priority(const void *in_a, const void *in_b)
 }
 
 #define CLASS_BIT_PHYS                  (1 << 0)
-#define CLASS_BIT_R0_R2                 (1 << 1)
-#define CLASS_BIT_R3                    (1 << 2)
-#define CLASS_BIT_R4                    (1 << 3)
+#define CLASS_BIT_ACC                   (1 << 1)
+#define CLASS_BIT_R5                    (1 << 4)
+#define CLASS_BITS_ANY                  (CLASS_BIT_PHYS | \
+                                         CLASS_BIT_ACC | \
+                                         CLASS_BIT_R5)
 
 /**
  * Returns a mapping from QFILE_TEMP indices to struct qpu_regs.
@@ -306,9 +435,14 @@ v3d_register_allocate(struct v3d_compile *c, bool *spilled)
         struct node_to_temp_map map[c->num_temps];
         uint32_t temp_to_node[c->num_temps];
         uint8_t class_bits[c->num_temps];
-        struct qpu_reg *temp_registers = calloc(c->num_temps,
-                                                sizeof(*temp_registers));
         int acc_nodes[ACC_COUNT];
+        struct v3d_ra_select_callback_data callback_data = {
+                .next_acc = 0,
+                /* Start at RF3, to try to keep the TLB writes from using
+                 * RF0-2.
+                 */
+                .next_phys = 3,
+        };
 
         *spilled = false;
 
@@ -328,6 +462,7 @@ v3d_register_allocate(struct v3d_compile *c, bool *spilled)
         struct ra_graph *g = ra_alloc_interference_graph(c->compiler->regs,
                                                          c->num_temps +
                                                          ARRAY_SIZE(acc_nodes));
+        ra_set_select_reg_callback(g, v3d_ra_select_callback, &callback_data);
 
         /* Make some fixed nodes for the accumulators, which we will need to
          * interfere with when ops have implied r3/r4 writes or for the thread
@@ -353,9 +488,7 @@ v3d_register_allocate(struct v3d_compile *c, bool *spilled)
          * start with any temp being able to be in any file, then instructions
          * incrementally remove bits that the temp definitely can't be in.
          */
-        memset(class_bits,
-               CLASS_BIT_PHYS | CLASS_BIT_R0_R2 | CLASS_BIT_R3 | CLASS_BIT_R4,
-               sizeof(class_bits));
+        memset(class_bits, CLASS_BITS_ANY, sizeof(class_bits));
 
         int ip = 0;
         vir_for_each_inst_inorder(inst, c) {
@@ -400,6 +533,19 @@ v3d_register_allocate(struct v3d_compile *c, bool *spilled)
                                 class_bits[inst->dst.index] &= CLASS_BIT_PHYS;
                                 break;
 
+                        case V3D_QPU_A_RECIP:
+                        case V3D_QPU_A_RSQRT:
+                        case V3D_QPU_A_EXP:
+                        case V3D_QPU_A_LOG:
+                        case V3D_QPU_A_SIN:
+                        case V3D_QPU_A_RSQRT2:
+                                /* The SFU instructions write directly to the
+                                 * phys regfile.
+                                 */
+                                assert(inst->dst.file == QFILE_TEMP);
+                                class_bits[inst->dst.index] &= CLASS_BIT_PHYS;
+                                break;
+
                         default:
                                 break;
                         }
@@ -410,6 +556,7 @@ v3d_register_allocate(struct v3d_compile *c, bool *spilled)
                         case 0:
                         case 1:
                         case 2:
+                        case 3:
                                 /* Payload setup instructions: Force allocate
                                  * the dst to the given register (so the MOV
                                  * will disappear).
                                  */
@@ -424,6 +571,24 @@ v3d_register_allocate(struct v3d_compile *c, bool *spilled)
                         }
                 }
+                if (inst->dst.file == QFILE_TEMP) {
+                        /* Only a ldunif gets to write to R5, which only has a
+                         * single 32-bit channel of storage.
+                         */
+                        if (!inst->qpu.sig.ldunif) {
+                                class_bits[inst->dst.index] &= ~CLASS_BIT_R5;
+                        } else {
+                                /* Until V3D 4.x, we could only load a uniform
+                                 * to r5, so we'll need to spill if uniform
+                                 * loads interfere with each other.
+                                 */
+                                if (c->devinfo->ver < 40) {
+                                        class_bits[inst->dst.index] &=
+                                                CLASS_BIT_R5;
+                                }
+                        }
+                }
+
                 if (inst->qpu.sig.thrsw) {
                         /* All accumulators are invalidated across a thread
                          * switch.
@@ -441,13 +606,16 @@ v3d_register_allocate(struct v3d_compile *c, bool *spilled) if (class_bits[i] == CLASS_BIT_PHYS) { ra_set_node_class(g, temp_to_node[i], c->compiler->reg_class_phys[thread_index]); - } else { - assert(class_bits[i] == (CLASS_BIT_PHYS | - CLASS_BIT_R0_R2 | - CLASS_BIT_R3 | - CLASS_BIT_R4)); + } else if (class_bits[i] == (CLASS_BIT_R5)) { + ra_set_node_class(g, temp_to_node[i], + c->compiler->reg_class_r5[thread_index]); + } else if (class_bits[i] == (CLASS_BIT_PHYS | CLASS_BIT_ACC)) { ra_set_node_class(g, temp_to_node[i], c->compiler->reg_class_phys_or_acc[thread_index]); + } else { + assert(class_bits[i] == CLASS_BITS_ANY); + ra_set_node_class(g, temp_to_node[i], + c->compiler->reg_class_any[thread_index]); } } @@ -462,26 +630,44 @@ v3d_register_allocate(struct v3d_compile *c, bool *spilled) } } + /* Debug code to force a bit of register spilling, for running across + * conformance tests to make sure that spilling works. + */ + int force_register_spills = 0; + if (c->spill_size < + V3D_CHANNELS * sizeof(uint32_t) * force_register_spills) { + int node = v3d_choose_spill_node(c, g, temp_to_node); + if (node != -1) { + v3d_spill_reg(c, map[node].temp); + ralloc_free(g); + *spilled = true; + return NULL; + } + } + bool ok = ra_allocate(g); if (!ok) { - /* Try to spill, if we can't reduce threading first. */ - if (thread_index == 0) { - int node = v3d_choose_spill_node(c, g, temp_to_node); + int node = v3d_choose_spill_node(c, g, temp_to_node); - if (node != -1) { - v3d_spill_reg(c, map[node].temp); - ralloc_free(g); + /* Don't emit spills using the TMU until we've dropped thread + * conut first. + */ + if (node != -1 && + (vir_is_mov_uniform(c, map[node].temp) || + thread_index == 0)) { + v3d_spill_reg(c, map[node].temp); - /* Ask the outer loop to call back in. */ - *spilled = true; - return NULL; - } + /* Ask the outer loop to call back in. */ + *spilled = true; } - free(temp_registers); + ralloc_free(g); return NULL; } + struct qpu_reg *temp_registers = calloc(c->num_temps, + sizeof(*temp_registers)); + for (uint32_t i = 0; i < c->num_temps; i++) { int ra_reg = ra_get_node_reg(g, temp_to_node[i]); if (ra_reg < PHYS_INDEX) { @@ -504,17 +690,5 @@ v3d_register_allocate(struct v3d_compile *c, bool *spilled) ralloc_free(g); - if (V3D_DEBUG & V3D_DEBUG_SHADERDB) { - fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d spills\n", - vir_get_stage_name(c), - c->program_id, c->variant_id, - c->spills); - - fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d fills\n", - vir_get_stage_name(c), - c->program_id, c->variant_id, - c->fills); - } - return temp_registers; }