From facc3c6f58de88ac3707a1b8435b7fc655d13124 Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Tue, 13 Mar 2018 15:13:00 -0700 Subject: [PATCH] broadcom/vc5: Add support for register spilling. Our register spilling support is nice to have since vc4 couldn't at all, but we're still very restricted due to needing to not spill during a TMU operation, or during the last segment of the program (which would be nice to spill a value of, when there's a long-lived value being passed through with little modification from the start to the end). We could do better by emitting unspills for the last-segment values just before the last thrsw, since the last segment is probably not the maximum interference area. Fixes GTF uniform_buffer_object_arrays_of_all_valid_basic_types and 3 others. --- src/broadcom/compiler/nir_to_vir.c | 11 +- src/broadcom/compiler/v3d_compiler.h | 24 +- src/broadcom/compiler/vir.c | 8 + src/broadcom/compiler/vir_register_allocate.c | 244 +++++++++++++++++- src/gallium/drivers/vc5/vc5_context.h | 3 + src/gallium/drivers/vc5/vc5_program.c | 15 ++ src/gallium/drivers/vc5/vc5_uniforms.c | 12 + 7 files changed, 306 insertions(+), 11 deletions(-) diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c index c1ba1e3049d..75e35067f27 100644 --- a/src/broadcom/compiler/nir_to_vir.c +++ b/src/broadcom/compiler/nir_to_vir.c @@ -1919,12 +1919,11 @@ vir_remove_thrsw(struct v3d_compile *c) vir_remove_instruction(c, inst); } } - vir_calculate_live_intervals(c); c->last_thrsw = NULL; } -static void +void vir_emit_last_thrsw(struct v3d_compile *c) { /* On V3D before 4.1, we need a TMU op to be outstanding when thread @@ -2012,16 +2011,16 @@ v3d_nir_to_vir(struct v3d_compile *c) fprintf(stderr, "\n"); } - /* Compute the live ranges so we can figure out interference. */ - vir_calculate_live_intervals(c); - /* Attempt to allocate registers for the temporaries. If we fail, * reduce thread count and try again. 
*/ int min_threads = (c->devinfo->ver >= 41) ? 2 : 1; struct qpu_reg *temp_registers; while (true) { - temp_registers = v3d_register_allocate(c); + bool spilled; + temp_registers = v3d_register_allocate(c, &spilled); + if (spilled) + continue; if (temp_registers) break; diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h index fdf1b131978..84cc4d290a0 100644 --- a/src/broadcom/compiler/v3d_compiler.h +++ b/src/broadcom/compiler/v3d_compiler.h @@ -248,6 +248,12 @@ enum quniform_contents { QUNIFORM_ALPHA_REF, QUNIFORM_SAMPLE_MASK, + + /** + * Returns the offset of the scratch buffer for register spilling. + */ + QUNIFORM_SPILL_OFFSET, + QUNIFORM_SPILL_SIZE_PER_THREAD, }; struct v3d_varying_slot { @@ -506,6 +512,20 @@ struct v3d_compile { uint8_t vattr_sizes[V3D_MAX_VS_INPUTS]; uint32_t num_vpm_writes; + /* Size in bytes of registers that have been spilled. This is how much + * space needs to be available in the spill BO per thread per QPU. + */ + uint32_t spill_size; + /* Shader-db stats for register spilling. */ + uint32_t spills, fills; + /** + * Register spilling's per-thread base address, shared between each + * spill/fill's addressing calculations. + */ + struct qreg spill_base; + /* Bit vector of which temps may be spilled */ + BITSET_WORD *spillable; + /** * Array of the VARYING_SLOT_* of all FS QFILE_VARY reads. 
* @@ -600,6 +620,7 @@ struct v3d_prog_data { struct v3d_ubo_range *ubo_ranges; uint32_t num_ubo_ranges; uint32_t ubo_size; + uint32_t spill_size; uint8_t num_inputs; uint8_t threads; @@ -697,6 +718,7 @@ void vir_set_unpack(struct qinst *inst, int src, enum v3d_qpu_input_unpack unpack); struct qreg vir_get_temp(struct v3d_compile *c); +void vir_emit_last_thrsw(struct v3d_compile *c); void vir_calculate_live_intervals(struct v3d_compile *c); bool vir_has_implicit_uniform(struct qinst *inst); int vir_get_implicit_uniform_src(struct qinst *inst); @@ -746,7 +768,7 @@ void v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr); void v3d_vir_to_qpu(struct v3d_compile *c, struct qpu_reg *temp_registers); uint32_t v3d_qpu_schedule_instructions(struct v3d_compile *c); void qpu_validate(struct v3d_compile *c); -struct qpu_reg *v3d_register_allocate(struct v3d_compile *c); +struct qpu_reg *v3d_register_allocate(struct v3d_compile *c, bool *spilled); bool vir_init_reg_sets(struct v3d_compiler *compiler); void vir_PF(struct v3d_compile *c, struct qreg src, enum v3d_qpu_pf pf); diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c index 6a315dd4823..0cbdc986d3f 100644 --- a/src/broadcom/compiler/vir.c +++ b/src/broadcom/compiler/vir.c @@ -354,10 +354,17 @@ vir_get_temp(struct v3d_compile *c) if (c->num_temps > c->defs_array_size) { uint32_t old_size = c->defs_array_size; c->defs_array_size = MAX2(old_size * 2, 16); + c->defs = reralloc(c, c->defs, struct qinst *, c->defs_array_size); memset(&c->defs[old_size], 0, sizeof(c->defs[0]) * (c->defs_array_size - old_size)); + + c->spillable = reralloc(c, c->spillable, + BITSET_WORD, + BITSET_WORDS(c->defs_array_size)); + for (int i = old_size; i < c->defs_array_size; i++) + BITSET_SET(c->spillable, i); } return reg; @@ -653,6 +660,7 @@ v3d_set_prog_data(struct v3d_compile *c, { prog_data->threads = c->threads; prog_data->single_seg = !c->last_thrsw; + prog_data->spill_size = c->spill_size; 
v3d_set_prog_data_uniforms(c, prog_data); v3d_set_prog_data_ubo(c, prog_data); diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c index ab3a4e257ff..4ec5f232643 100644 --- a/src/broadcom/compiler/vir_register_allocate.c +++ b/src/broadcom/compiler/vir_register_allocate.c @@ -33,6 +33,211 @@ #define PHYS_INDEX (ACC_INDEX + ACC_COUNT) #define PHYS_COUNT 64 +static bool +is_last_ldtmu(struct qinst *inst, struct qblock *block) +{ /* True when no later ldtmu precedes the next TMU write (or the end of the block); the scan starts after inst, which is the ldtmu being tested. */ + list_for_each_entry_from(struct qinst, scan_inst, inst->link.next, + &block->instructions, link) { + if (scan_inst->qpu.sig.ldtmu) /* another ldtmu is still pending */ + return false; + if (v3d_qpu_writes_tmu(&scan_inst->qpu)) /* a new TMU setup begins first */ + return true; + } + + return true; +} + +static int +v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g, + uint32_t *temp_to_node) +{ + float block_scale = 1.0; + float spill_costs[c->num_temps]; + bool in_tmu_operation = false; + bool started_last_seg = false; + + for (unsigned i = 0; i < c->num_temps; i++) + spill_costs[i] = 0.0; + + /* XXX: Scale the cost up when inside of a loop. */ + vir_for_each_block(block, c) { + vir_for_each_inst(inst, block) { + /* We can't insert a new TMU operation while currently + * in a TMU operation, and we can't insert new thread + * switches after starting output writes. 
+ */ + bool no_spilling = + (in_tmu_operation || + (c->threads > 1 && started_last_seg)); + + for (int i = 0; i < vir_get_nsrc(inst); i++) { + if (inst->src[i].file != QFILE_TEMP) + continue; + + int temp = inst->src[i].index; + if (no_spilling) { + BITSET_CLEAR(c->spillable, + temp); + } else { + spill_costs[temp] += block_scale; + } + } + + if (inst->dst.file == QFILE_TEMP) { + int temp = inst->dst.index; + + if (no_spilling) { + BITSET_CLEAR(c->spillable, + temp); + } else { + spill_costs[temp] += block_scale; + } + } + + if (inst->is_last_thrsw) + started_last_seg = true; + + if (v3d_qpu_writes_vpm(&inst->qpu) || + v3d_qpu_uses_tlb(&inst->qpu)) + started_last_seg = true; + + /* Track when we're in between a TMU setup and the + * final LDTMU from that TMU setup. We can't + * spill/fill any temps during that time, because that + * involves inserting a new TMU setup/LDTMU sequence. + */ + if (inst->qpu.sig.ldtmu && + is_last_ldtmu(inst, block)) + in_tmu_operation = false; + + if (v3d_qpu_writes_tmu(&inst->qpu)) + in_tmu_operation = true; + } + } + + for (unsigned i = 0; i < c->num_temps; i++) { + int node = temp_to_node[i]; + + if (BITSET_TEST(c->spillable, i)) + ra_set_node_spill_cost(g, node, spill_costs[i]); + } + + return ra_get_best_spill_node(g); +} + +/* The spill offset for this thread takes a bit of setup, so do it once at + * program start. + */ +static void +v3d_setup_spill_base(struct v3d_compile *c) +{ + c->cursor = vir_before_block(vir_entry_block(c)); + + int start_num_temps = c->num_temps; + + /* Each thread wants to be in a separate region of the scratch space + * so that the QPUs aren't fighting over cache lines. We have the + * driver keep a single global spill BO rather than + * per-spilling-program BOs, so we need a uniform from the driver for + * what the per-thread scale is. 
+ */ + struct qreg thread_offset = + vir_UMUL(c, + vir_TIDX(c), + vir_uniform(c, QUNIFORM_SPILL_SIZE_PER_THREAD, 0)); + + /* Each channel in a reg is 4 bytes, so scale them up by that. */ + struct qreg element_offset = vir_SHL(c, vir_EIDX(c), + vir_uniform_ui(c, 2)); + + c->spill_base = vir_ADD(c, + vir_ADD(c, thread_offset, element_offset), + vir_uniform(c, QUNIFORM_SPILL_OFFSET, 0)); + + /* Make sure that we don't spill the spilling setup instructions. */ + for (int i = start_num_temps; i < c->num_temps; i++) + BITSET_CLEAR(c->spillable, i); +} + +static void +v3d_emit_spill_tmua(struct v3d_compile *c, uint32_t spill_offset) +{ + vir_ADD_dest(c, vir_reg(QFILE_MAGIC, + V3D_QPU_WADDR_TMUA), + c->spill_base, + vir_uniform_ui(c, spill_offset)); +} + +static void +v3d_spill_reg(struct v3d_compile *c, int spill_temp) +{ + uint32_t spill_offset = c->spill_size; + c->spill_size += 16 * sizeof(uint32_t); + + if (spill_offset == 0) + v3d_setup_spill_base(c); + + struct qinst *last_thrsw = c->last_thrsw; + assert(!last_thrsw || last_thrsw->is_last_thrsw); + + int start_num_temps = c->num_temps; + + vir_for_each_inst_inorder(inst, c) { + for (int i = 0; i < vir_get_nsrc(inst); i++) { + if (inst->src[i].file != QFILE_TEMP || + inst->src[i].index != spill_temp) { + continue; + } + + c->cursor = vir_before_inst(inst); + + v3d_emit_spill_tmua(c, spill_offset); + vir_emit_thrsw(c); + inst->src[i] = vir_LDTMU(c); + c->fills++; + } + + if (inst->dst.file == QFILE_TEMP && + inst->dst.index == spill_temp) { + c->cursor = vir_after_inst(inst); + + inst->dst.index = c->num_temps++; + vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD), + inst->dst); + v3d_emit_spill_tmua(c, spill_offset); + vir_emit_thrsw(c); + c->spills++; + } + + /* If we didn't have a last-thrsw inserted by nir_to_vir and + * we've been inserting thrsws, then insert a new last_thrsw + * right before we start the vpm/tlb sequence for the last + * thread segment. 
+ */ + if (!last_thrsw && c->last_thrsw && + (v3d_qpu_writes_vpm(&inst->qpu) || + v3d_qpu_uses_tlb(&inst->qpu))) { + c->cursor = vir_before_inst(inst); + vir_emit_thrsw(c); + + last_thrsw = c->last_thrsw; + last_thrsw->is_last_thrsw = true; + } + } + + /* Make sure c->last_thrsw is the actual last thrsw, not just one we + * inserted in our most recent unspill. + */ + if (last_thrsw) + c->last_thrsw = last_thrsw; + + /* Don't allow spilling of our spilling instructions. There's no way + * they can help get things colored. + */ + for (int i = start_num_temps; i < c->num_temps; i++) + BITSET_CLEAR(c->spillable, i); +} + bool vir_init_reg_sets(struct v3d_compiler *compiler) { @@ -96,7 +301,7 @@ node_to_temp_priority(const void *in_a, const void *in_b) * The return value should be freed by the caller. */ struct qpu_reg * -v3d_register_allocate(struct v3d_compile *c) +v3d_register_allocate(struct v3d_compile *c, bool *spilled) { struct node_to_temp_map map[c->num_temps]; uint32_t temp_to_node[c->num_temps]; @@ -105,9 +310,10 @@ v3d_register_allocate(struct v3d_compile *c) sizeof(*temp_registers)); int acc_nodes[ACC_COUNT]; - struct ra_graph *g = ra_alloc_interference_graph(c->compiler->regs, - c->num_temps + - ARRAY_SIZE(acc_nodes)); + *spilled = false; + + vir_calculate_live_intervals(c); + /* Convert 1, 2, 4 threads to 0, 1, 2 index. * * V3D 4.x has double the physical register space, so 64 physical regs @@ -119,6 +325,10 @@ v3d_register_allocate(struct v3d_compile *c) thread_index--; } + struct ra_graph *g = ra_alloc_interference_graph(c->compiler->regs, + c->num_temps + + ARRAY_SIZE(acc_nodes)); + /* Make some fixed nodes for the accumulators, which we will need to * interfere with when ops have implied r3/r4 writes or for the thread * switches. We could represent these as classes for the nodes to @@ -254,6 +464,20 @@ v3d_register_allocate(struct v3d_compile *c) bool ok = ra_allocate(g); if (!ok) { + /* Try to spill, if we can't reduce threading first. 
*/ + if (thread_index == 0) { + int node = v3d_choose_spill_node(c, g, temp_to_node); + + if (node != -1) { + v3d_spill_reg(c, map[node].temp); + ralloc_free(g); + free(temp_registers); /* not returned on this path, so release it like the failure path below */ + /* Ask the outer loop to call back in. */ + *spilled = true; + return NULL; + } + } + free(temp_registers); return NULL; } @@ -280,5 +504,17 @@ v3d_register_allocate(struct v3d_compile *c) ralloc_free(g); + if (V3D_DEBUG & V3D_DEBUG_SHADERDB) { + fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d spills\n", + vir_get_stage_name(c), + c->program_id, c->variant_id, + c->spills); + + fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d fills\n", + vir_get_stage_name(c), + c->program_id, c->variant_id, + c->fills); + } + return temp_registers; } diff --git a/src/gallium/drivers/vc5/vc5_context.h b/src/gallium/drivers/vc5/vc5_context.h index 18fc27c5147..28b2e165a9d 100644 --- a/src/gallium/drivers/vc5/vc5_context.h +++ b/src/gallium/drivers/vc5/vc5_context.h @@ -154,6 +154,9 @@ struct vc5_compiled_shader { struct vc5_program_stateobj { struct vc5_uncompiled_shader *bind_vs, *bind_fs; struct vc5_compiled_shader *cs, *vs, *fs; + + struct vc5_bo *spill_bo; + int spill_size_per_thread; }; struct vc5_constbuf_stateobj { diff --git a/src/gallium/drivers/vc5/vc5_program.c b/src/gallium/drivers/vc5/vc5_program.c index eb5b6105455..ae3850a64b3 100644 --- a/src/gallium/drivers/vc5/vc5_program.c +++ b/src/gallium/drivers/vc5/vc5_program.c @@ -267,6 +267,21 @@ vc5_get_compiled_shader(struct vc5_context *vc5, struct v3d_key *key) memcpy(dup_key, key, key_size); _mesa_hash_table_insert(ht, dup_key, shader); + if (shader->prog_data.base->spill_size > + vc5->prog.spill_size_per_thread) { + /* Max 4 QPUs per slice, 3 slices per core. We only do single + * core so far. This overallocates memory on smaller cores. 
+ */ + int total_spill_size = + 4 * 3 * shader->prog_data.base->spill_size; + + vc5_bo_unreference(&vc5->prog.spill_bo); + vc5->prog.spill_bo = vc5_bo_alloc(vc5->screen, + total_spill_size, "spill"); + vc5->prog.spill_size_per_thread = + shader->prog_data.base->spill_size; + } + return shader; } diff --git a/src/gallium/drivers/vc5/vc5_uniforms.c b/src/gallium/drivers/vc5/vc5_uniforms.c index faf49dbc359..03b6d8381c1 100644 --- a/src/gallium/drivers/vc5/vc5_uniforms.c +++ b/src/gallium/drivers/vc5/vc5_uniforms.c @@ -389,6 +389,16 @@ vc5_write_uniforms(struct vc5_context *vc5, struct vc5_compiled_shader *shader, /* XXX */ break; + case QUNIFORM_SPILL_OFFSET: + cl_aligned_reloc(&job->indirect, &uniforms, + vc5->prog.spill_bo, 0); + break; + + case QUNIFORM_SPILL_SIZE_PER_THREAD: + cl_aligned_u32(&uniforms, + vc5->prog.spill_size_per_thread); + break; + default: assert(quniform_contents_is_texture_p0(uinfo->contents[i])); @@ -451,6 +461,8 @@ vc5_set_shader_uniform_dirty_flags(struct vc5_compiled_shader *shader) case QUNIFORM_TEXTURE_DEPTH: case QUNIFORM_TEXTURE_ARRAY_SIZE: case QUNIFORM_TEXTURE_LEVELS: + case QUNIFORM_SPILL_OFFSET: + case QUNIFORM_SPILL_SIZE_PER_THREAD: /* We could flag this on just the stage we're * compiling for, but it's not passed in. */ -- 2.30.2