From 6b73a97f84f86f4c3d3bbbbadf963c20b8e52b57 Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Fri, 20 Jul 2018 14:27:09 -0700 Subject: [PATCH] v3d: Implement a small immediates optimization, based on VC4's. We can do one per instruction, and we have to be careful not to overwrite raddr_b, but this greatly reduces the pressure on uniform loads (particularly around ldvpm/stvpm instructions). total instructions in shared programs: 90768 -> 88220 (-2.81%) instructions in affected programs: 82711 -> 80163 (-3.08%) --- src/broadcom/Makefile.sources | 1 + src/broadcom/compiler/meson.build | 1 + src/broadcom/compiler/qpu_schedule.c | 3 +- src/broadcom/compiler/v3d_compiler.h | 1 + src/broadcom/compiler/vir.c | 1 + src/broadcom/compiler/vir_dump.c | 27 +++-- .../compiler/vir_opt_small_immediates.c | 112 ++++++++++++++++++ src/broadcom/compiler/vir_to_qpu.c | 16 ++- 8 files changed, 143 insertions(+), 19 deletions(-) create mode 100644 src/broadcom/compiler/vir_opt_small_immediates.c diff --git a/src/broadcom/Makefile.sources b/src/broadcom/Makefile.sources index 05e4588446a..5955acdefd5 100644 --- a/src/broadcom/Makefile.sources +++ b/src/broadcom/Makefile.sources @@ -28,6 +28,7 @@ BROADCOM_FILES = \ compiler/vir_lower_uniforms.c \ compiler/vir_opt_copy_propagate.c \ compiler/vir_opt_dead_code.c \ + compiler/vir_opt_small_immediates.c \ compiler/vir_register_allocate.c \ compiler/vir_to_qpu.c \ compiler/qpu_schedule.c \ diff --git a/src/broadcom/compiler/meson.build b/src/broadcom/compiler/meson.build index 0cdf8a5f364..1bd43bf2c2d 100644 --- a/src/broadcom/compiler/meson.build +++ b/src/broadcom/compiler/meson.build @@ -26,6 +26,7 @@ libbroadcom_compiler_files = files( 'vir_lower_uniforms.c', 'vir_opt_copy_propagate.c', 'vir_opt_dead_code.c', + 'vir_opt_small_immediates.c', 'vir_register_allocate.c', 'vir_to_qpu.c', 'qpu_schedule.c', diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c index af0b9b86b1c..441b6327825 100644 --- 
a/src/broadcom/compiler/qpu_schedule.c +++ b/src/broadcom/compiler/qpu_schedule.c @@ -670,7 +670,8 @@ qpu_merge_inst(const struct v3d_device_info *devinfo, if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_B)) { if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_B) && - a->raddr_b != b->raddr_b) { + (a->raddr_b != b->raddr_b || + a->sig.small_imm != b->sig.small_imm)) { return false; } merge.raddr_b = b->raddr_b; diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h index 9dc19248aa6..a02b5a6404a 100644 --- a/src/broadcom/compiler/v3d_compiler.h +++ b/src/broadcom/compiler/v3d_compiler.h @@ -115,6 +115,7 @@ static inline struct qreg vir_reg(enum qfile file, uint32_t index) */ struct qpu_reg { bool magic; + bool smimm; int index; }; diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c index 5a4bf80f93d..b5539b6ef50 100644 --- a/src/broadcom/compiler/vir.c +++ b/src/broadcom/compiler/vir.c @@ -1001,6 +1001,7 @@ vir_optimize(struct v3d_compile *c) OPTPASS(vir_opt_copy_propagate); OPTPASS(vir_opt_dead_code); + OPTPASS(vir_opt_small_immediates); if (!progress) break; diff --git a/src/broadcom/compiler/vir_dump.c b/src/broadcom/compiler/vir_dump.c index 88b5dc90ac7..c43578302b5 100644 --- a/src/broadcom/compiler/vir_dump.c +++ b/src/broadcom/compiler/vir_dump.c @@ -25,7 +25,8 @@ #include "v3d_compiler.h" static void -vir_print_reg(struct v3d_compile *c, struct qreg reg) +vir_print_reg(struct v3d_compile *c, const struct qinst *inst, + struct qreg reg) { static const char *files[] = { [QFILE_TEMP] = "t", @@ -58,12 +59,20 @@ vir_print_reg(struct v3d_compile *c, struct qreg reg) fprintf(stderr, "%s", v3d_qpu_magic_waddr_name(reg.index)); break; - case QFILE_SMALL_IMM: - if ((int)reg.index >= -16 && (int)reg.index <= 15) - fprintf(stderr, "%d", reg.index); + case QFILE_SMALL_IMM: { + uint32_t unpacked; + bool ok = v3d_qpu_small_imm_unpack(c->devinfo, + inst->qpu.raddr_b, + &unpacked); + assert(ok); (void) ok; + /* raddr_b holds the packed encoding; classify by the decoded value. */ + if ((int)unpacked >= -16 && +
(int)unpacked <= 15) + fprintf(stderr, "%d", (int)unpacked); else - fprintf(stderr, "%f", uif(reg.index)); + fprintf(stderr, "%f", uif(unpacked)); break; + } case QFILE_VPM: fprintf(stderr, "vpm%d.%d", @@ -220,7 +229,7 @@ vir_dump_alu(struct v3d_compile *c, struct qinst *inst) fprintf(stderr, "%s", v3d_qpu_uf_name(instr->flags.auf)); fprintf(stderr, " "); - vir_print_reg(c, inst->dst); + vir_print_reg(c, inst, inst->dst); fprintf(stderr, "%s", v3d_qpu_pack_name(instr->alu.add.output_pack)); unpack[0] = instr->alu.add.a_unpack; @@ -232,7 +241,7 @@ vir_dump_alu(struct v3d_compile *c, struct qinst *inst) fprintf(stderr, "%s", v3d_qpu_uf_name(instr->flags.muf)); fprintf(stderr, " "); - vir_print_reg(c, inst->dst); + vir_print_reg(c, inst, inst->dst); fprintf(stderr, "%s", v3d_qpu_pack_name(instr->alu.mul.output_pack)); unpack[0] = instr->alu.mul.a_unpack; @@ -241,7 +250,7 @@ vir_dump_alu(struct v3d_compile *c, struct qinst *inst) for (int i = 0; i < sideband_nsrc; i++) { fprintf(stderr, ", "); - vir_print_reg(c, inst->src[i]); + vir_print_reg(c, inst, inst->src[i]); if (i < nsrc) fprintf(stderr, "%s", v3d_qpu_unpack_name(unpack[i])); } @@ -307,7 +316,7 @@ vir_dump_inst(struct v3d_compile *c, struct qinst *inst) if (vir_has_implicit_uniform(inst)) { fprintf(stderr, " "); - vir_print_reg(c, inst->src[vir_get_implicit_uniform_src(inst)]); + vir_print_reg(c, inst, inst->src[vir_get_implicit_uniform_src(inst)]); } break; diff --git a/src/broadcom/compiler/vir_opt_small_immediates.c b/src/broadcom/compiler/vir_opt_small_immediates.c new file mode 100644 index 00000000000..5491f9c24c8 --- /dev/null +++ b/src/broadcom/compiler/vir_opt_small_immediates.c @@ -0,0 +1,112 @@ +/* + * Copyright © 2014 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify,
merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** + * @file vir_opt_small_immediates.c + * + * Turns references to small constant uniform values into small immediates + * fields. + */ + +#include "v3d_compiler.h" + +static bool debug; + +bool +vir_opt_small_immediates(struct v3d_compile *c) +{ + bool progress = false; + + vir_for_each_inst_inorder(inst, c) { + if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU) + continue; + + /* The small immediate value sits in the raddr B field, so we + * can't have 2 small immediates in one instruction (unless + * they're the same value, but that should be optimized away + * elsewhere). + */ + bool uses_small_imm = false; + for (int i = 0; i < vir_get_nsrc(inst); i++) { + if (inst->src[i].file == QFILE_SMALL_IMM) + uses_small_imm = true; + } + if (uses_small_imm) + continue; + + for (int i = 0; i < vir_get_nsrc(inst); i++) { + struct qreg src = vir_follow_movs(c, inst->src[i]); + + if (src.file != QFILE_UNIF || + c->uniform_contents[src.index] != + QUNIFORM_CONSTANT) { + continue; + } + + if (vir_has_implicit_uniform(inst) && + i == vir_get_implicit_uniform_src(inst)) { + /* No turning the implicit uniform read into + * an immediate.
+ */ + continue; + } + + /* Check if the uniform is suitable as a small + * immediate. + */ + uint32_t imm = c->uniform_data[src.index]; + uint32_t packed; + if (!v3d_qpu_small_imm_pack(c->devinfo, imm, &packed)) + continue; + + /* Check that we don't have any other signals already + * that would be incompatible with small_imm. + */ + struct v3d_qpu_sig new_sig = inst->qpu.sig; + uint32_t sig_packed; + new_sig.small_imm = true; + if (!v3d_qpu_sig_pack(c->devinfo, &new_sig, &sig_packed)) + continue; + + if (debug) { + fprintf(stderr, "opt_small_immediate() from: "); + vir_dump_inst(c, inst); + fprintf(stderr, "\n"); + } + inst->qpu.sig.small_imm = true; + inst->qpu.raddr_b = packed; + + inst->src[i].file = QFILE_SMALL_IMM; + inst->src[i].index = imm; + if (debug) { + fprintf(stderr, "to: "); + vir_dump_inst(c, inst); + fprintf(stderr, "\n"); + } + progress = true; + break; + } + } + + return progress; +} diff --git a/src/broadcom/compiler/vir_to_qpu.c b/src/broadcom/compiler/vir_to_qpu.c index 2c0349bb3b7..b5a7b841ef6 100644 --- a/src/broadcom/compiler/vir_to_qpu.c +++ b/src/broadcom/compiler/vir_to_qpu.c @@ -109,6 +109,12 @@ new_ldunif_instr(struct qinst *inst, int i) static void set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src) { + if (src.smimm) { + assert(instr->sig.small_imm); + *mux = V3D_QPU_MUX_B; + return; + } + if (src.magic) { assert(src.index >= V3D_QPU_WADDR_R0 && src.index <= V3D_QPU_WADDR_R5); @@ -244,15 +250,7 @@ v3d_generate_code_block(struct v3d_compile *c, src[i] = qpu_acc(5); break; case QFILE_SMALL_IMM: - abort(); /* XXX */ -#if 0 - src[i].mux = QPU_MUX_SMALL_IMM; - src[i].addr = qpu_encode_small_immediate(qinst->src[i].index); - /* This should only have returned a valid - * small immediate field, not ~0 for failure. - */ - assert(src[i].addr <= 47); -#endif + src[i].smimm = true; break; case QFILE_VPM: -- 2.30.2