From 97566efe5cac0ff11b23d8f27001fc98c7cea2af Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Mon, 25 Feb 2019 15:36:26 -0800 Subject: [PATCH] v3d: Rematerialize MOVs of uniforms instead of spilling them. If we have a MOV of a uniform value available to spill, that's one of our best choices. We can just not spill the value, and emit a new load of the uniform as the fill. This saves bothering the TMU and the thrsw, and is the same cost in uniforms (since the spill offset is a uniform anyway). This doesn't have a huge impact on shader-db, since there aren't a whole lot of spills and we usually copy-prop the uniforms at the VIR level such that the only uniform MOVs are from vir_lower_uniforms: total instructions in shared programs: 6430292 -> 6430279 (<.01%) total uniforms in shared programs: 2386023 -> 2385787 (<.01%) total spills in shared programs: 4961 -> 4960 (-0.02%) total fills in shared programs: 6352 -> 6350 (-0.03%) However, I'm interested in dropping the uniforms copy-prop in the backend, since it would be cheaper to not load repeated uniforms if we have the registers to spare. 
This also saves many spills on dEQP-GLES31.functional.ubo.random.all_per_block_buffers.20, which is what motivated a bunch of my recent backend work in the first place: before: 46 spills, 106 fills, 3062 instructions after: 0 spills, 0 fills, 2611 instructions --- src/broadcom/compiler/v3d_compiler.h | 4 + src/broadcom/compiler/vir_register_allocate.c | 91 +++++++++++++------ 2 files changed, 68 insertions(+), 27 deletions(-) diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h index 3995bb1e74f..5984d3ef5fe 100644 --- a/src/broadcom/compiler/v3d_compiler.h +++ b/src/broadcom/compiler/v3d_compiler.h @@ -1152,4 +1152,8 @@ vir_BRANCH(struct v3d_compile *c, enum v3d_qpu_branch_cond cond) vir_for_each_block(_block, c) \ vir_for_each_inst(inst, _block) +#define vir_for_each_inst_inorder_safe(inst, c) \ + vir_for_each_block(_block, c) \ + vir_for_each_inst_safe(inst, _block) + #endif /* V3D_COMPILER_H */ diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c index 79ab5acd764..91cce71e0ac 100644 --- a/src/broadcom/compiler/vir_register_allocate.c +++ b/src/broadcom/compiler/vir_register_allocate.c @@ -47,10 +47,21 @@ is_last_ldtmu(struct qinst *inst, struct qblock *block) return true; } +static bool +vir_is_mov_uniform(struct v3d_compile *c, int temp) +{ + struct qinst *def = c->defs[temp]; + + return (def && + vir_is_raw_mov(def) && + def->src[0].file == QFILE_UNIF); +} + static int v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g, uint32_t *temp_to_node) { + const float tmu_scale = 5; float block_scale = 1.0; float spill_costs[c->num_temps]; bool in_tmu_operation = false; @@ -75,22 +86,28 @@ v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g, continue; int temp = inst->src[i].index; - if (no_spilling) { - BITSET_CLEAR(c->spillable, - temp); - } else { + if (vir_is_mov_uniform(c, temp)) { spill_costs[temp] += block_scale; + } else if (!no_spilling) { + 
spill_costs[temp] += (block_scale * + tmu_scale); + } else { + BITSET_CLEAR(c->spillable, temp); } } if (inst->dst.file == QFILE_TEMP) { int temp = inst->dst.index; - if (no_spilling) { - BITSET_CLEAR(c->spillable, - temp); + if (vir_is_mov_uniform(c, temp)) { + /* We just rematerialize the uniform + * later. + */ + } else if (!no_spilling) { + spill_costs[temp] += (block_scale * + tmu_scale); } else { - spill_costs[temp] += block_scale; + BITSET_CLEAR(c->spillable, temp); } } @@ -184,18 +201,28 @@ v3d_emit_spill_tmua(struct v3d_compile *c, uint32_t spill_offset) static void v3d_spill_reg(struct v3d_compile *c, int spill_temp) { - uint32_t spill_offset = c->spill_size; - c->spill_size += 16 * sizeof(uint32_t); + bool is_uniform = vir_is_mov_uniform(c, spill_temp); + + uint32_t spill_offset = 0; - if (spill_offset == 0) - v3d_setup_spill_base(c); + if (!is_uniform) { + spill_offset = c->spill_size; + c->spill_size += 16 * sizeof(uint32_t); + + if (spill_offset == 0) + v3d_setup_spill_base(c); + } struct qinst *last_thrsw = c->last_thrsw; assert(!last_thrsw || last_thrsw->is_last_thrsw); int start_num_temps = c->num_temps; - vir_for_each_inst_inorder(inst, c) { + struct qreg uniform_src = c->undef; + if (is_uniform) + uniform_src = c->defs[spill_temp]->src[0]; + + vir_for_each_inst_inorder_safe(inst, c) { for (int i = 0; i < vir_get_nsrc(inst); i++) { if (inst->src[i].file != QFILE_TEMP || inst->src[i].index != spill_temp) { @@ -204,23 +231,33 @@ v3d_spill_reg(struct v3d_compile *c, int spill_temp) c->cursor = vir_before_inst(inst); - v3d_emit_spill_tmua(c, spill_offset); - vir_emit_thrsw(c); - inst->src[i] = vir_LDTMU(c); - c->fills++; + if (is_uniform) { + inst->src[i] = vir_MOV(c, uniform_src); + } else { + v3d_emit_spill_tmua(c, spill_offset); + vir_emit_thrsw(c); + inst->src[i] = vir_LDTMU(c); + c->fills++; + } } if (inst->dst.file == QFILE_TEMP && inst->dst.index == spill_temp) { - c->cursor = vir_after_inst(inst); - - inst->dst.index = c->num_temps++; 
- vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD), - inst->dst); - v3d_emit_spill_tmua(c, spill_offset); - vir_emit_thrsw(c); - vir_TMUWT(c); - c->spills++; + if (is_uniform) { + c->cursor.link = NULL; + vir_remove_instruction(c, inst); + } else { + c->cursor = vir_after_inst(inst); + + inst->dst.index = c->num_temps++; + vir_MOV_dest(c, vir_reg(QFILE_MAGIC, + V3D_QPU_WADDR_TMUD), + inst->dst); + v3d_emit_spill_tmua(c, spill_offset); + vir_emit_thrsw(c); + vir_TMUWT(c); + c->spills++; + } } /* If we didn't have a last-thrsw inserted by nir_to_vir and @@ -228,7 +265,7 @@ v3d_spill_reg(struct v3d_compile *c, int spill_temp) * right before we start the vpm/tlb sequence for the last * thread segment. */ - if (!last_thrsw && c->last_thrsw && + if (!is_uniform && !last_thrsw && c->last_thrsw && (v3d_qpu_writes_vpm(&inst->qpu) || v3d_qpu_uses_tlb(&inst->qpu))) { c->cursor = vir_before_inst(inst); -- 2.30.2