From 3359ad6cda49fb977d837eb00e8ae4d781d95c2a Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Wed, 5 Aug 2015 20:05:56 -0700 Subject: [PATCH] vc4: Add support for copy propagation with unpack flags present. total instructions in shared programs: 89251 -> 87862 (-1.56%) instructions in affected programs: 52971 -> 51582 (-2.62%) --- .../drivers/vc4/vc4_opt_copy_propagation.c | 84 +++++++++++++++---- src/gallium/drivers/vc4/vc4_qpu_emit.c | 61 ++++++++++---- 2 files changed, 109 insertions(+), 36 deletions(-) diff --git a/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c b/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c index b46be24ad0c..0eee5c34e1d 100644 --- a/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c +++ b/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c @@ -41,29 +41,77 @@ qir_opt_copy_propagation(struct vc4_compile *c) bool debug = false; list_for_each_entry(struct qinst, inst, &c->instructions, link) { - for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) { - int index = inst->src[i].index; - if (inst->src[i].file == QFILE_TEMP && - c->defs[index] && - qir_is_raw_mov(c->defs[index]) && - (c->defs[index]->src[0].file == QFILE_TEMP || - c->defs[index]->src[0].file == QFILE_UNIF)) { - if (debug) { - fprintf(stderr, "Copy propagate: "); - qir_dump_inst(c, inst); - fprintf(stderr, "\n"); - } + int nsrc = qir_get_op_nsrc(inst->op); + for (int i = 0; i < nsrc; i++) { + if (inst->src[i].file != QFILE_TEMP) + continue; + + struct qinst *mov = c->defs[inst->src[i].index]; + if (!mov || + (mov->op != QOP_MOV && + mov->op != QOP_FMOV && + mov->op != QOP_MMOV)) { + continue; + } - inst->src[i] = c->defs[index]->src[0]; + if (mov->src[0].file != QFILE_TEMP && + mov->src[0].file != QFILE_UNIF) { + continue; + } + + if (mov->dst.pack) + continue; + + uint8_t unpack; + if (mov->src[0].pack) { + /* Make sure that the meaning of the unpack + * would be the same between the two + * instructions. + */ + if (qir_is_float_input(inst) != + qir_is_float_input(mov)) { + continue; + } - if (debug) { - fprintf(stderr, "to: "); - qir_dump_inst(c, inst); - fprintf(stderr, "\n"); + /* There's only one unpack field, so make sure + * this instruction doesn't already use it. + */ + bool already_has_unpack = false; + for (int j = 0; j < nsrc; j++) { + if (inst->src[j].pack) + already_has_unpack = true; } + if (already_has_unpack) + continue; - progress = true; + /* A destination pack requires the PM bit to + * be set to a specific value already, which + * may be different from ours. + */ + if (inst->dst.pack) + continue; + + unpack = mov->src[0].pack; + } else { + unpack = inst->src[i].pack; + } + + if (debug) { + fprintf(stderr, "Copy propagate: "); + qir_dump_inst(c, inst); + fprintf(stderr, "\n"); } + + inst->src[i] = mov->src[0]; + inst->src[i].pack = unpack; + + if (debug) { + fprintf(stderr, "to: "); + qir_dump_inst(c, inst); + fprintf(stderr, "\n"); + } + + progress = true; } } return progress; diff --git a/src/gallium/drivers/vc4/vc4_qpu_emit.c b/src/gallium/drivers/vc4/vc4_qpu_emit.c index d06f8b27d29..133e1385178 100644 --- a/src/gallium/drivers/vc4/vc4_qpu_emit.c +++ b/src/gallium/drivers/vc4/vc4_qpu_emit.c @@ -101,7 +101,8 @@ swap_file(struct qpu_reg *src) static void fixup_raddr_conflict(struct vc4_compile *c, struct qpu_reg dst, - struct qpu_reg *src0, struct qpu_reg *src1) + struct qpu_reg *src0, struct qpu_reg *src1, + struct qinst *inst, uint64_t *unpack) { uint32_t mux0 = src0->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src0->mux; uint32_t mux1 = src1->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src1->mux; @@ -117,7 +118,21 @@ fixup_raddr_conflict(struct vc4_compile *c, return; if (mux0 == QPU_MUX_A) { - queue(c, qpu_a_MOV(qpu_rb(31), *src0)); + /* Make sure we use the same type of MOV as the instruction, + * in case of unpacks. + */ + if (qir_is_float_input(inst)) + queue(c, qpu_a_FMAX(qpu_rb(31), *src0, *src0)); + else + queue(c, qpu_a_MOV(qpu_rb(31), *src0)); + + /* If we had an unpack on this A-file source, we need to put + * it into this MOV, not into the later move from regfile B. + */ + if (inst->src[0].pack) { + *last_inst(c) |= *unpack; + *unpack = 0; + } *src0 = qpu_rb(31); } else { queue(c, qpu_a_MOV(qpu_ra(31), *src0)); @@ -296,7 +311,7 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) case QOP_SEL_X_0_ZC: case QOP_SEL_X_0_NS: case QOP_SEL_X_0_NC: - queue(c, qpu_a_MOV(dst, src[0])); + queue(c, qpu_a_MOV(dst, src[0]) | unpack); set_last_cond_add(c, qinst->op - QOP_SEL_X_0_ZS + QPU_COND_ZS); @@ -310,10 +325,14 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) case QOP_SEL_X_Y_NS: case QOP_SEL_X_Y_NC: queue(c, qpu_a_MOV(dst, src[0])); + if (qinst->src[0].pack) + *(last_inst(c)) |= unpack; set_last_cond_add(c, qinst->op - QOP_SEL_X_Y_ZS + QPU_COND_ZS); queue(c, qpu_a_MOV(dst, src[1])); + if (qinst->src[1].pack) + *(last_inst(c)) |= unpack; set_last_cond_add(c, ((qinst->op - QOP_SEL_X_Y_ZS) ^ 1) + QPU_COND_ZS); @@ -326,19 +345,19 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) switch (qinst->op) { case QOP_RCP: queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIP), - src[0])); + src[0]) | unpack); break; case QOP_RSQ: queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIPSQRT), - src[0])); + src[0]) | unpack); break; case QOP_EXP2: queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_EXP), - src[0])); + src[0]) | unpack); break; case QOP_LOG2: queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_LOG), - src[0])); + src[0]) | unpack); break; default: abort(); @@ -373,16 +392,19 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) case QOP_TLB_DISCARD_SETUP: discard = true; - queue(c, qpu_a_MOV(src[0], src[0])); + queue(c, qpu_a_MOV(src[0], src[0]) | unpack); *last_inst(c) |= QPU_SF; break; case QOP_TLB_STENCIL_SETUP: - queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_STENCIL_SETUP), src[0])); + assert(!unpack); + queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_STENCIL_SETUP), + src[0]) | unpack); break; case QOP_TLB_Z_WRITE: - queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_Z), src[0])); + queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_Z), + src[0]) | unpack); if (discard) { set_last_cond_add(c, QPU_COND_ZS); } @@ -398,14 +420,14 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) break; case QOP_TLB_COLOR_WRITE: - queue(c, qpu_a_MOV(qpu_tlbc(), src[0])); + queue(c, qpu_a_MOV(qpu_tlbc(), src[0]) | unpack); if (discard) { set_last_cond_add(c, QPU_COND_ZS); } break; case QOP_VARY_ADD_C: - queue(c, qpu_a_FADD(dst, src[0], qpu_r5())); + queue(c, qpu_a_FADD(dst, src[0], qpu_r5()) | unpack); break; case QOP_TEX_S: @@ -414,12 +436,14 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) case QOP_TEX_B: queue(c, qpu_a_MOV(qpu_rb(QPU_W_TMU0_S + (qinst->op - QOP_TEX_S)), - src[0])); + src[0]) | unpack); break; case QOP_TEX_DIRECT: - fixup_raddr_conflict(c, dst, &src[0], &src[1]); - queue(c, qpu_a_ADD(qpu_rb(QPU_W_TMU0_S), src[0], src[1])); + fixup_raddr_conflict(c, dst, &src[0], &src[1], + qinst, &unpack); + queue(c, qpu_a_ADD(qpu_rb(QPU_W_TMU0_S), + src[0], src[1]) | unpack); break; case QOP_TEX_RESULT: @@ -447,16 +471,17 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) if (qir_get_op_nsrc(qinst->op) == 1) src[1] = src[0]; - fixup_raddr_conflict(c, dst, &src[0], &src[1]); + fixup_raddr_conflict(c, dst, &src[0], &src[1], + qinst, &unpack); if (qir_is_mul(qinst)) { queue(c, qpu_m_alu2(translate[qinst->op].op, dst, - src[0], src[1])); + src[0], src[1]) | unpack); } else { queue(c, qpu_a_alu2(translate[qinst->op].op, dst, - src[0], src[1])); + src[0], src[1]) | unpack); } set_last_dst_pack(c, qinst); -- 2.30.2