From 874dfa8b2ecccf3c9a73453d7ccc6638363a59bd Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Sun, 24 Aug 2014 16:51:32 -0700 Subject: [PATCH] vc4: Expose compares at a lower level in QIR. Before, we had some special opcodes like CMP and SNE that emitted multiple instructions. Now, we reduce those operations significantly, giving optimization more to look at for reducing redundant operations. The downside is that QOP_SF is pretty special -- we're going to have to track it separately when we're doing instruction scheduling, and we want to peephole it into the instruction generating the destination write in most cases (and probably not allocate the destination reg, unless it's used for some other purpose as well). --- src/gallium/drivers/vc4/vc4_opt_algebraic.c | 9 +- src/gallium/drivers/vc4/vc4_program.c | 104 +++++++++++++++----- src/gallium/drivers/vc4/vc4_qir.c | 14 ++- src/gallium/drivers/vc4/vc4_qir.h | 37 ++++--- src/gallium/drivers/vc4/vc4_qpu_emit.c | 63 ++++++------ 5 files changed, 148 insertions(+), 79 deletions(-) diff --git a/src/gallium/drivers/vc4/vc4_opt_algebraic.c b/src/gallium/drivers/vc4/vc4_opt_algebraic.c index 2bf474ccef9..f8ed6218adc 100644 --- a/src/gallium/drivers/vc4/vc4_opt_algebraic.c +++ b/src/gallium/drivers/vc4/vc4_opt_algebraic.c @@ -45,9 +45,12 @@ qir_opt_algebraic(struct qcompile *c) struct qinst *inst = (struct qinst *)node; switch (inst->op) { - case QOP_CMP: - /* Turn "dst = (a < 0) ? b : b)" into "dst = b" */ - if (qir_reg_equals(inst->src[1], inst->src[2])) { + case QOP_SEL_X_Y_ZS: + case QOP_SEL_X_Y_ZC: + case QOP_SEL_X_Y_NS: + case QOP_SEL_X_Y_NC: + /* Turn "dst = (sf == x) ? 
a : a" into "dst = a" */ + if (qir_reg_equals(inst->src[0], inst->src[1])) { if (debug) { fprintf(stderr, "optimizing: "); qir_dump_inst(inst); diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c index d404047e4bb..aaa7eb346f3 100644 --- a/src/gallium/drivers/vc4/vc4_program.c +++ b/src/gallium/drivers/vc4/vc4_program.c @@ -248,6 +248,58 @@ tgsi_to_qir_alu(struct tgsi_to_qir *trans, return dst; } +static struct qreg +tgsi_to_qir_seq(struct tgsi_to_qir *trans, + struct tgsi_full_instruction *tgsi_inst, + enum qop op, struct qreg *src, int i) +{ + struct qcompile *c = trans->c; + qir_SF(c, qir_FSUB(c, src[0 * 4 + i], src[1 * 4 + i])); + return qir_SEL_X_0_ZS(c, qir_uniform_f(trans, 1.0)); +} + +static struct qreg +tgsi_to_qir_sne(struct tgsi_to_qir *trans, + struct tgsi_full_instruction *tgsi_inst, + enum qop op, struct qreg *src, int i) +{ + struct qcompile *c = trans->c; + qir_SF(c, qir_FSUB(c, src[0 * 4 + i], src[1 * 4 + i])); + return qir_SEL_X_0_ZC(c, qir_uniform_f(trans, 1.0)); +} + +static struct qreg +tgsi_to_qir_slt(struct tgsi_to_qir *trans, + struct tgsi_full_instruction *tgsi_inst, + enum qop op, struct qreg *src, int i) +{ + struct qcompile *c = trans->c; + qir_SF(c, qir_FSUB(c, src[0 * 4 + i], src[1 * 4 + i])); + return qir_SEL_X_0_NS(c, qir_uniform_f(trans, 1.0)); +} + +static struct qreg +tgsi_to_qir_sge(struct tgsi_to_qir *trans, + struct tgsi_full_instruction *tgsi_inst, + enum qop op, struct qreg *src, int i) +{ + struct qcompile *c = trans->c; + qir_SF(c, qir_FSUB(c, src[0 * 4 + i], src[1 * 4 + i])); + return qir_SEL_X_0_NC(c, qir_uniform_f(trans, 1.0)); +} + +static struct qreg +tgsi_to_qir_cmp(struct tgsi_to_qir *trans, + struct tgsi_full_instruction *tgsi_inst, + enum qop op, struct qreg *src, int i) +{ + struct qcompile *c = trans->c; + qir_SF(c, src[0 * 4 + i]); + return qir_SEL_X_Y_NS(c, + src[1 * 4 + i], + src[2 * 4 + i]); +} + static struct qreg tgsi_to_qir_mad(struct tgsi_to_qir *trans, struct 
tgsi_full_instruction *tgsi_inst, @@ -280,16 +332,15 @@ tgsi_to_qir_lit(struct tgsi_to_qir *trans, case 2: { struct qreg zero = qir_uniform_f(trans, 0.0); + qir_SF(c, x); /* XXX: Clamp w to -128..128 */ - return qir_CMP(c, - x, - zero, - qir_EXP2(c, qir_FMUL(c, - w, - qir_LOG2(c, - qir_FMAX(c, - y, - zero))))); + return qir_SEL_X_0_NC(c, + qir_EXP2(c, qir_FMUL(c, + w, + qir_LOG2(c, + qir_FMAX(c, + y, + zero))))); } default: assert(!"not reached"); @@ -415,10 +466,10 @@ tgsi_to_qir_frc(struct tgsi_to_qir *trans, struct qcompile *c = trans->c; struct qreg trunc = qir_ITOF(c, qir_FTOI(c, src[0 * 4 + i])); struct qreg diff = qir_FSUB(c, src[0 * 4 + i], trunc); - return qir_CMP(c, - diff, - qir_FADD(c, diff, qir_uniform_f(trans, 1.0)), - diff); + qir_SF(c, diff); + return qir_SEL_X_Y_NS(c, + qir_FADD(c, diff, qir_uniform_f(trans, 1.0)), + diff); } /** @@ -436,12 +487,11 @@ tgsi_to_qir_flr(struct tgsi_to_qir *trans, /* This will be < 0 if we truncated and the truncation was of a value * that was < 0 in the first place. 
*/ - struct qreg diff = qir_FSUB(c, src[0 * 4 + i], trunc); + qir_SF(c, qir_FSUB(c, src[0 * 4 + i], trunc)); - return qir_CMP(c, - diff, - qir_FSUB(c, trunc, qir_uniform_f(trans, 1.0)), - trunc); + return qir_SEL_X_Y_NS(c, + qir_FSUB(c, trunc, qir_uniform_f(trans, 1.0)), + trunc); } static struct qreg @@ -613,10 +663,10 @@ tgsi_to_qir_kill_if(struct tgsi_to_qir *trans, struct qreg *src, int i) if (trans->discard.file == QFILE_NULL) trans->discard = qir_uniform_f(trans, 0.0); - trans->discard = qir_CMP(c, - src[0 * 4 + i], - qir_uniform_f(trans, 1.0), - trans->discard); + qir_SF(c, src[0 * 4 + i]); + trans->discard = qir_SEL_X_Y_NS(c, + qir_uniform_f(trans, 1.0), + trans->discard); } static void @@ -705,11 +755,11 @@ emit_tgsi_instruction(struct tgsi_to_qir *trans, [TGSI_OPCODE_MIN] = { QOP_FMIN, tgsi_to_qir_alu }, [TGSI_OPCODE_MAX] = { QOP_FMAX, tgsi_to_qir_alu }, [TGSI_OPCODE_RSQ] = { QOP_RSQ, tgsi_to_qir_alu }, - [TGSI_OPCODE_SEQ] = { QOP_SEQ, tgsi_to_qir_alu }, - [TGSI_OPCODE_SNE] = { QOP_SNE, tgsi_to_qir_alu }, - [TGSI_OPCODE_SGE] = { QOP_SGE, tgsi_to_qir_alu }, - [TGSI_OPCODE_SLT] = { QOP_SLT, tgsi_to_qir_alu }, - [TGSI_OPCODE_CMP] = { QOP_CMP, tgsi_to_qir_alu }, + [TGSI_OPCODE_SEQ] = { 0, tgsi_to_qir_seq }, + [TGSI_OPCODE_SNE] = { 0, tgsi_to_qir_sne }, + [TGSI_OPCODE_SGE] = { 0, tgsi_to_qir_sge }, + [TGSI_OPCODE_SLT] = { 0, tgsi_to_qir_slt }, + [TGSI_OPCODE_CMP] = { 0, tgsi_to_qir_cmp }, [TGSI_OPCODE_MAD] = { 0, tgsi_to_qir_mad }, [TGSI_OPCODE_DP2] = { 0, tgsi_to_qir_dp2 }, [TGSI_OPCODE_DP3] = { 0, tgsi_to_qir_dp3 }, diff --git a/src/gallium/drivers/vc4/vc4_qir.c b/src/gallium/drivers/vc4/vc4_qir.c index 0b0d2c11cf1..72149908422 100644 --- a/src/gallium/drivers/vc4/vc4_qir.c +++ b/src/gallium/drivers/vc4/vc4_qir.c @@ -43,11 +43,15 @@ static const struct qir_op_info qir_op_info[] = { [QOP_FMINABS] = { "fminabs", 1, 2 }, [QOP_FMAXABS] = { "fmaxabs", 1, 2 }, - [QOP_SEQ] = { "seq", 1, 2 }, - [QOP_SNE] = { "sne", 1, 2 }, - [QOP_SGE] = { "sge", 1, 2 }, - [QOP_SLT] 
= { "slt", 1, 2 }, - [QOP_CMP] = { "cmp", 1, 3 }, + [QOP_SF] = { "sf", 0, 1 }, + [QOP_SEL_X_0_NS] = { "fsel_x_0_ns", 1, 1 }, + [QOP_SEL_X_0_NC] = { "fsel_x_0_nc", 1, 1 }, + [QOP_SEL_X_0_ZS] = { "fsel_x_0_zs", 1, 1 }, + [QOP_SEL_X_0_ZC] = { "fsel_x_0_zc", 1, 1 }, + [QOP_SEL_X_Y_NS] = { "fsel_x_y_ns", 1, 2 }, + [QOP_SEL_X_Y_NC] = { "fsel_x_y_nc", 1, 2 }, + [QOP_SEL_X_Y_ZS] = { "fsel_x_y_zs", 1, 2 }, + [QOP_SEL_X_Y_ZC] = { "fsel_x_y_zc", 1, 2 }, [QOP_FTOI] = { "ftoi", 1, 1 }, [QOP_ITOF] = { "itof", 1, 1 }, diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h index 77b5f1af903..99df99c1a07 100644 --- a/src/gallium/drivers/vc4/vc4_qir.h +++ b/src/gallium/drivers/vc4/vc4_qir.h @@ -54,11 +54,21 @@ enum qop { QOP_FMINABS, QOP_FMAXABS, - QOP_SEQ, - QOP_SNE, - QOP_SGE, - QOP_SLT, - QOP_CMP, + /* Sets the flag register according to src. */ + QOP_SF, + + /* Note: Orderings of these compares must be the same as in + * qpu_defines.h. Selects src[0] if the flag condition named by the + * opcode (ZS/ZC/NS/NC) holds, otherwise 0. */ + QOP_SEL_X_0_ZS, + QOP_SEL_X_0_ZC, + QOP_SEL_X_0_NS, + QOP_SEL_X_0_NC, + /* Selects src[0] if the flag condition named by the opcode holds, otherwise src[1]. 
*/ + QOP_SEL_X_Y_ZS, + QOP_SEL_X_Y_ZC, + QOP_SEL_X_Y_NS, + QOP_SEL_X_Y_NC, QOP_FTOI, QOP_ITOF, @@ -260,6 +270,15 @@ QIR_ALU1(MOV) QIR_ALU2(FADD) QIR_ALU2(FSUB) QIR_ALU2(FMUL) +QIR_NODST_1(SF) +QIR_ALU1(SEL_X_0_ZS) +QIR_ALU1(SEL_X_0_ZC) +QIR_ALU1(SEL_X_0_NS) +QIR_ALU1(SEL_X_0_NC) +QIR_ALU2(SEL_X_Y_ZS) +QIR_ALU2(SEL_X_Y_ZC) +QIR_ALU2(SEL_X_Y_NS) +QIR_ALU2(SEL_X_Y_NC) QIR_ALU2(FMIN) QIR_ALU2(FMAX) QIR_ALU2(FMINABS) @@ -283,14 +302,6 @@ QIR_ALU0(FRAG_Z) QIR_ALU0(FRAG_RCP_W) QIR_NODST_1(TLB_DISCARD_SETUP) -static inline struct qreg -qir_CMP(struct qcompile *c, struct qreg cmp, struct qreg a, struct qreg b) -{ - struct qreg t = qir_get_temp(c); - qir_emit(c, qir_inst4(QOP_CMP, t, cmp, a, b, c->undef)); - return t; -} - static inline struct qreg qir_R4_UNPACK(struct qcompile *c, int i) { diff --git a/src/gallium/drivers/vc4/vc4_qpu_emit.c b/src/gallium/drivers/vc4/vc4_qpu_emit.c index 4e28ff7c3b8..6d2c34f2d1f 100644 --- a/src/gallium/drivers/vc4/vc4_qpu_emit.c +++ b/src/gallium/drivers/vc4/vc4_qpu_emit.c @@ -60,6 +60,12 @@ last_inst(struct qcompile *c) return &q->inst; } +static void +set_last_cond_add(struct qcompile *c, uint32_t cond) +{ + *last_inst(c) = qpu_set_cond_add(*last_inst(c), cond); +} + /** * This is used to resolve the fact that we might register-allocate two * different operands of an instruction to the same physical register file @@ -278,13 +284,6 @@ vc4_generate_code(struct qcompile *c) M(FMUL), }; - static const uint32_t compareflags[] = { - [QOP_SEQ - QOP_SEQ] = QPU_COND_ZS, - [QOP_SNE - QOP_SEQ] = QPU_COND_ZC, - [QOP_SLT - QOP_SEQ] = QPU_COND_NS, - [QOP_SGE - QOP_SEQ] = QPU_COND_NC, - }; - struct qpu_reg src[4]; for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) { int index = qinst->src[i].index; @@ -365,32 +364,36 @@ vc4_generate_code(struct qcompile *c) } break; - case QOP_CMP: + case QOP_SF: + fixup_raddr_conflict(c, src[0], &src[1]); queue(c, qpu_a_MOV(qpu_ra(QPU_W_NOP), src[0])); *last_inst(c) |= QPU_SF; - - queue(c, qpu_a_MOV(dst, src[1])); - 
*last_inst(c) = qpu_set_cond_add(*last_inst(c), - QPU_COND_NS); - - queue(c, qpu_a_MOV(dst, src[2])); - *last_inst(c) = qpu_set_cond_add(*last_inst(c), - QPU_COND_NC); break; - case QOP_SEQ: - case QOP_SNE: - case QOP_SGE: - case QOP_SLT: - fixup_raddr_conflict(c, src[0], &src[1]); - queue(c, qpu_a_FSUB(qpu_ra(QPU_W_NOP), src[0], src[1])); - *last_inst(c) |= QPU_SF; + case QOP_SEL_X_0_ZS: + case QOP_SEL_X_0_ZC: + case QOP_SEL_X_0_NS: + case QOP_SEL_X_0_NC: + queue(c, qpu_a_MOV(dst, src[0])); + set_last_cond_add(c, qinst->op - QOP_SEL_X_0_ZS + + QPU_COND_ZS); + + queue(c, qpu_a_XOR(dst, qpu_r0(), qpu_r0())); + set_last_cond_add(c, ((qinst->op - QOP_SEL_X_0_ZS) ^ + 1) + QPU_COND_ZS); + break; - queue(c, qpu_load_imm_f(dst, 0.0)); - queue(c, qpu_load_imm_f(dst, 1.0)); - *last_inst(c) = qpu_set_cond_add(*last_inst(c), - compareflags[qinst->op - QOP_SEQ]); + case QOP_SEL_X_Y_ZS: + case QOP_SEL_X_Y_ZC: + case QOP_SEL_X_Y_NS: + case QOP_SEL_X_Y_NC: + queue(c, qpu_a_MOV(dst, src[0])); + set_last_cond_add(c, qinst->op - QOP_SEL_X_Y_ZS + + QPU_COND_ZS); + queue(c, qpu_a_MOV(dst, src[1])); + set_last_cond_add(c, ((qinst->op - QOP_SEL_X_Y_ZS) ^ + 1) + QPU_COND_ZS); break; @@ -475,8 +478,7 @@ vc4_generate_code(struct qcompile *c) queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_Z), qpu_rb(QPU_R_FRAG_PAYLOAD_ZW))); if (discard) { - *last_inst(c) = qpu_set_cond_add(*last_inst(c), - QPU_COND_ZS); + set_last_cond_add(c, QPU_COND_ZS); } break; @@ -490,8 +492,7 @@ vc4_generate_code(struct qcompile *c) case QOP_TLB_COLOR_WRITE: queue(c, qpu_a_MOV(qpu_tlbc(), src[0])); if (discard) { - *last_inst(c) = qpu_set_cond_add(*last_inst(c), - QPU_COND_ZS); + set_last_cond_add(c, QPU_COND_ZS); } break; -- 2.30.2