From 72cb6619cb75a92901d372d687505a747a384571 Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Thu, 8 Jan 2015 18:32:29 -0800 Subject: [PATCH] vc4: Restructure color packing as a series of channel replacements. I'm using this in some WIP commits for doing blending in 8888 instead of vec4. But it also gives us these results immediately, thanks to allowing more uniforms/immediates in the arguments: total instructions in shared programs: 41027 -> 40960 (-0.16%) instructions in affected programs: 4381 -> 4314 (-1.53%) --- src/gallium/drivers/vc4/vc4_program.c | 34 +++++++----------- src/gallium/drivers/vc4/vc4_qir.c | 6 +++- src/gallium/drivers/vc4/vc4_qir.h | 19 +++++++++- src/gallium/drivers/vc4/vc4_qpu_emit.c | 50 +++++++++++++------------- 4 files changed, 60 insertions(+), 49 deletions(-) diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c index bba02ca93f2..6bad1560b2f 100644 --- a/src/gallium/drivers/vc4/vc4_program.c +++ b/src/gallium/drivers/vc4/vc4_program.c @@ -1843,32 +1843,22 @@ emit_frag_end(struct vc4_compile *c) qir_TLB_Z_WRITE(c, z); } - bool color_written = false; + struct qreg packed_color = c->undef; for (int i = 0; i < 4; i++) { - if (swizzled_outputs[i].file != QFILE_NULL) - color_written = true; - } - - struct qreg packed_color; - if (color_written) { - /* Fill in any undefined colors. The simulator will assertion - * fail if we read something that wasn't written, and I don't - * know what hardware does. 
- */ - for (int i = 0; i < 4; i++) { - if (swizzled_outputs[i].file == QFILE_NULL) - swizzled_outputs[i] = qir_uniform_f(c, 0.0); + if (swizzled_outputs[i].file == QFILE_NULL) + continue; + if (packed_color.file == QFILE_NULL) { + packed_color = qir_PACK_8888_F(c, swizzled_outputs[i]); + } else { + packed_color = qir_PACK_8_F(c, + packed_color, + swizzled_outputs[i], + i); } - packed_color = qir_get_temp(c); - qir_emit(c, qir_inst4(QOP_PACK_COLORS, packed_color, - swizzled_outputs[0], - swizzled_outputs[1], - swizzled_outputs[2], - swizzled_outputs[3])); - } else { - packed_color = qir_uniform_ui(c, 0); } + if (packed_color.file == QFILE_NULL) + packed_color = qir_uniform_ui(c, 0); if (c->fs_key->logicop_func != PIPE_LOGICOP_COPY) { packed_color = vc4_logicop(c, packed_color, packed_dst_color); diff --git a/src/gallium/drivers/vc4/vc4_qir.c b/src/gallium/drivers/vc4/vc4_qir.c index 3fd39413222..5f3b8ddc445 100644 --- a/src/gallium/drivers/vc4/vc4_qir.c +++ b/src/gallium/drivers/vc4/vc4_qir.c @@ -73,7 +73,11 @@ static const struct qir_op_info qir_op_info[] = { [QOP_RSQ] = { "rsq", 1, 1, false, true }, [QOP_EXP2] = { "exp2", 1, 2, false, true }, [QOP_LOG2] = { "log2", 1, 2, false, true }, - [QOP_PACK_COLORS] = { "pack_colors", 1, 4, false, true }, + [QOP_PACK_8888_F] = { "pack_8888_f", 1, 1, false, true }, + [QOP_PACK_8A_F] = { "pack_8a_f", 1, 2, false, true }, + [QOP_PACK_8B_F] = { "pack_8b_f", 1, 2, false, true }, + [QOP_PACK_8C_F] = { "pack_8c_f", 1, 2, false, true }, + [QOP_PACK_8D_F] = { "pack_8d_f", 1, 2, false, true }, [QOP_PACK_SCALED] = { "pack_scaled", 1, 2, false, true }, [QOP_VPM_READ] = { "vpm_read", 0, 1, true }, [QOP_TLB_DISCARD_SETUP] = { "discard", 0, 1, true }, diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h index f7d59a80dac..6dac00fbbd8 100644 --- a/src/gallium/drivers/vc4/vc4_qir.h +++ b/src/gallium/drivers/vc4/vc4_qir.h @@ -100,7 +100,11 @@ enum qop { QOP_VW_SETUP, QOP_VR_SETUP, QOP_PACK_SCALED, - 
QOP_PACK_COLORS, + QOP_PACK_8888_F, + QOP_PACK_8A_F, + QOP_PACK_8B_F, + QOP_PACK_8C_F, + QOP_PACK_8D_F, QOP_VPM_READ, QOP_TLB_DISCARD_SETUP, QOP_TLB_STENCIL_SETUP, @@ -473,6 +477,11 @@ QIR_ALU1(RSQ) QIR_ALU1(EXP2) QIR_ALU1(LOG2) QIR_ALU2(PACK_SCALED) +QIR_ALU1(PACK_8888_F) +QIR_ALU2(PACK_8A_F) +QIR_ALU2(PACK_8B_F) +QIR_ALU2(PACK_8C_F) +QIR_ALU2(PACK_8D_F) QIR_ALU1(VARY_ADD_C) QIR_NODST_2(TEX_S) QIR_NODST_2(TEX_T) @@ -538,6 +547,14 @@ qir_UNPACK_16_I(struct vc4_compile *c, struct qreg src, int i) return t; } +static inline struct qreg +qir_PACK_8_F(struct vc4_compile *c, struct qreg rest, struct qreg val, int chan) /* Emits one QOP_PACK_8{A,B,C,D}_F instruction with sources (rest, val) writing a fresh temp, and returns that temp. NOTE(review): chan is presumably 0..3 (A..D); nothing here range-checks it - confirm callers. */ +{ + struct qreg t = qir_get_temp(c); + qir_emit(c, qir_inst(QOP_PACK_8A_F + chan, t, rest, val)); /* op is chosen by offsetting from QOP_PACK_8A_F, so the 8A..8D enumerators must stay consecutive in enum qop */ + return t; +} + static inline struct qreg qir_POW(struct vc4_compile *c, struct qreg x, struct qreg y) { diff --git a/src/gallium/drivers/vc4/vc4_qpu_emit.c b/src/gallium/drivers/vc4/vc4_qpu_emit.c index 503f32a4c05..857d56e0f44 100644 --- a/src/gallium/drivers/vc4/vc4_qpu_emit.c +++ b/src/gallium/drivers/vc4/vc4_qpu_emit.c @@ -347,40 +347,40 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) break; - case QOP_PACK_COLORS: { - /* We have to be careful not to start writing over one - * of our source values when incrementally writing the - * destination. So, if the dst is one of the srcs, we - * pack that one first (and we pack 4 channels at once - * for the first pack). 
- */ - struct qpu_reg first_pack = src[0]; - for (int i = 0; i < 4; i++) { - if (src[i].mux == dst.mux && - src[i].addr == dst.addr) { - first_pack = dst; - break; - } - } - queue(c, qpu_m_MOV(dst, first_pack)); + case QOP_PACK_8888_F: + queue(c, qpu_m_MOV(dst, src[0])); *last_inst(c) |= QPU_PM; *last_inst(c) |= QPU_SET_FIELD(QPU_PACK_MUL_8888, QPU_PACK); + break; - for (int i = 0; i < 4; i++) { - if (src[i].mux == first_pack.mux && - src[i].addr == first_pack.addr) { - continue; + case QOP_PACK_8A_F: + case QOP_PACK_8B_F: + case QOP_PACK_8C_F: + case QOP_PACK_8D_F: + /* If dst doesn't happen to already contain src[0], + * then we have to move it in. + */ + if (qinst->src[0].file != QFILE_NULL && + (src[0].mux != dst.mux || src[0].addr != dst.addr)) { + /* Don't overwrite src1 while setting up + * the dst! + */ + if (dst.mux == src[1].mux && + dst.addr == src[1].addr) { + queue(c, qpu_m_MOV(qpu_rb(31), src[1])); + src[1] = qpu_rb(31); } - queue(c, qpu_m_MOV(dst, src[i])); - *last_inst(c) |= QPU_PM; - *last_inst(c) |= QPU_SET_FIELD(QPU_PACK_MUL_8A + i, - QPU_PACK); + queue(c, qpu_m_MOV(dst, src[0])); } + queue(c, qpu_m_MOV(dst, src[1])); + *last_inst(c) |= QPU_PM; + *last_inst(c) |= QPU_SET_FIELD(QPU_PACK_MUL_8A + + qinst->op - QOP_PACK_8A_F, + QPU_PACK); break; - } case QOP_FRAG_X: queue(c, qpu_a_ITOF(dst, -- 2.30.2