From e103b52aec773537d2821d8acc42ac9caa2a4b17 Mon Sep 17 00:00:00 2001 From: Varad Gautam Date: Tue, 8 Mar 2016 01:01:59 +0530 Subject: [PATCH] vc4: Coalesce instructions using VPM reads into the VPM read. This is done instead of copy propagating the VPM reads into the instructions using them, because VPM reads have to stay in order. shader-db results: total instructions in shared programs: 78509 -> 78114 (-0.50%) instructions in affected programs: 5203 -> 4808 (-7.59%) total estimated cycles in shared programs: 234670 -> 234318 (-0.15%) estimated cycles in affected programs: 5345 -> 4993 (-6.59%) Signed-off-by: Varad Gautam Reviewed-by: Eric Anholt Tested-by: Rhys Kidd --- src/gallium/drivers/vc4/vc4_opt_vpm.c | 74 +++++++++++++++++++++++++-- src/gallium/drivers/vc4/vc4_qir.c | 2 +- src/gallium/drivers/vc4/vc4_qir.h | 2 +- 3 files changed, 71 insertions(+), 7 deletions(-) diff --git a/src/gallium/drivers/vc4/vc4_opt_vpm.c b/src/gallium/drivers/vc4/vc4_opt_vpm.c index 0fcf1e5c6dd..d15b0c1a39f 100644 --- a/src/gallium/drivers/vc4/vc4_opt_vpm.c +++ b/src/gallium/drivers/vc4/vc4_opt_vpm.c @@ -24,14 +24,16 @@ /** * @file vc4_opt_vpm.c * - * This modifies instructions that generate the value consumed by a VPM write - * to write directly into the VPM. + * This modifies instructions that: + * 1. exclusively consume a value read from the VPM to directly read the VPM if + * other operands allow it. + * 2. generate the value consumed by a VPM write to write directly into the VPM. */ #include "vc4_qir.h" bool -qir_opt_vpm_writes(struct vc4_compile *c) +qir_opt_vpm(struct vc4_compile *c) { if (c->stage == QSTAGE_FRAG) return false; @@ -52,8 +54,70 @@ qir_opt_vpm_writes(struct vc4_compile *c) } for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) { - if (inst->src[i].file == QFILE_TEMP) - use_count[inst->src[i].index]++; + if (inst->src[i].file == QFILE_TEMP) { + uint32_t temp = inst->src[i].index; + use_count[temp]++; + } + } + } + + /* For instructions reading from a temporary that contains a VPM read + * result, try to move the instruction up in place of the VPM read. + */ + list_for_each_entry(struct qinst, inst, &c->instructions, link) { + if (!inst || qir_is_multi_instruction(inst)) + continue; + + if (qir_depends_on_flags(inst) || inst->sf) + continue; + + if (qir_has_side_effects(c, inst) || + qir_has_side_effect_reads(c, inst) || + qir_is_tex(inst)) + continue; + + for (int j = 0; j < qir_get_op_nsrc(inst->op); j++) { + if (inst->src[j].file != QFILE_TEMP || + inst->src[j].pack) + continue; + + uint32_t temp = inst->src[j].index; + + /* Since VPM reads pull from a FIFO, we only get to + * read each VPM entry once (unless we reset the read + * pointer). That means we can't copy-propagate a VPM + * read to multiple locations. + */ + if (use_count[temp] != 1) + continue; + + struct qinst *mov = c->defs[temp]; + if (!mov || + (mov->op != QOP_MOV && + mov->op != QOP_FMOV && + mov->op != QOP_MMOV) || + mov->src[0].file != QFILE_VPM) { + continue; + } + + uint32_t temps = 0; + for (int k = 0; k < qir_get_op_nsrc(inst->op); k++) { + if (inst->src[k].file == QFILE_TEMP) + temps++; + } + + /* The instruction is safe to reorder if its other + * sources are independent of previous instructions + */ + if (temps == 1) { + list_del(&inst->link); + inst->src[j] = mov->src[0]; + list_replace(&mov->link, &inst->link); + c->defs[temp] = NULL; + free(mov); + progress = true; + break; + } } } diff --git a/src/gallium/drivers/vc4/vc4_qir.c b/src/gallium/drivers/vc4/vc4_qir.c index f9eb0e151c5..65f0067c61e 100644 --- a/src/gallium/drivers/vc4/vc4_qir.c +++ b/src/gallium/drivers/vc4/vc4_qir.c @@ -526,7 +526,7 @@ qir_optimize(struct vc4_compile *c) OPTPASS(qir_opt_copy_propagation); OPTPASS(qir_opt_dead_code); OPTPASS(qir_opt_small_immediates); - OPTPASS(qir_opt_vpm_writes); + OPTPASS(qir_opt_vpm); if (!progress) break; diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h index bae31768bd8..4f39d72f552 100644 --- a/src/gallium/drivers/vc4/vc4_qir.h +++ b/src/gallium/drivers/vc4/vc4_qir.h @@ -484,7 +484,7 @@ bool qir_opt_copy_propagation(struct vc4_compile *c); bool qir_opt_cse(struct vc4_compile *c); bool qir_opt_dead_code(struct vc4_compile *c); bool qir_opt_small_immediates(struct vc4_compile *c); -bool qir_opt_vpm_writes(struct vc4_compile *c); +bool qir_opt_vpm(struct vc4_compile *c); void vc4_nir_lower_blend(struct vc4_compile *c); void vc4_nir_lower_io(struct vc4_compile *c); nir_ssa_def *vc4_nir_get_state_uniform(struct nir_builder *b, -- 2.30.2