From f505f66cd5a266dc70ad12e2b015e6c631651aec Mon Sep 17 00:00:00 2001
From: Eric Anholt
Date: Wed, 27 Apr 2016 16:01:24 -0700
Subject: [PATCH] vc4: Add support for storing to NIR registers in a non-SSA
 fashion.

Previously, there were occasionally NIR registers in our programs, but
they were always actually used SSA-only. Now that we're trying to
support control flow, we need to actually conditionally move to
registers based on whether channels are active or not.
---
 src/gallium/drivers/vc4/vc4_program.c | 217 ++++++++++++++++----------
 src/gallium/drivers/vc4/vc4_qir.h     |  12 ++
 2 files changed, 144 insertions(+), 85 deletions(-)

diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index dec14453203..f87a9b21261 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -151,6 +151,43 @@ ntq_init_ssa_def(struct vc4_compile *c, nir_ssa_def *def)
         return qregs;
 }
 
+static void
+ntq_store_dest(struct vc4_compile *c, nir_dest *dest, int chan,
+               struct qreg result)
+{
+        if (dest->is_ssa) {
+                assert(chan < dest->ssa.num_components);
+
+                struct qreg *qregs;
+                struct hash_entry *entry =
+                        _mesa_hash_table_search(c->def_ht, &dest->ssa);
+
+                if (entry)
+                        qregs = entry->data;
+                else
+                        qregs = ntq_init_ssa_def(c, &dest->ssa);
+
+                qregs[chan] = result;
+        } else {
+                nir_register *reg = dest->reg.reg;
+                assert(dest->reg.base_offset == 0);
+                assert(reg->num_array_elems == 0);
+                struct hash_entry *entry =
+                        _mesa_hash_table_search(c->def_ht, reg);
+                struct qreg *qregs = entry->data;
+
+                /* Conditionally move the result to the destination if the
+                 * channel is active.
+                 */
+                if (c->execute.file != QFILE_NULL) {
+                        qir_SF(c, c->execute);
+                        qir_MOV_cond(c, QPU_COND_ZS, qregs[chan], result);
+                } else {
+                        qir_MOV_dest(c, qregs[chan], result);
+                }
+        }
+}
+
 static struct qreg *
 ntq_get_dest(struct vc4_compile *c, nir_dest *dest)
 {
@@ -300,7 +337,7 @@ ntq_emit_txf(struct vc4_compile *c, nir_tex_instr *instr)
         struct qreg tex = qir_TEX_RESULT(c);
         c->num_texture_samples++;
 
-        struct qreg *dest = ntq_get_dest(c, &instr->dest);
+        struct qreg dest[4];
         enum pipe_format format = c->key->tex[unit].format;
         if (util_format_is_depth_or_stencil(format)) {
                 struct qreg scaled = ntq_scale_depth_texture(c, tex);
@@ -310,6 +347,9 @@
                 for (int i = 0; i < 4; i++)
                         dest[i] = qir_UNPACK_8_F(c, tex, i);
         }
+
+        for (int i = 0; i < 4; i++)
+                ntq_store_dest(c, &instr->dest, i, dest[i]);
 }
 
 static void
@@ -731,10 +771,10 @@ ntq_emit_pack_unorm_4x8(struct vc4_compile *c, nir_alu_instr *instr)
         if (instr->src[0].swizzle[0] == instr->src[0].swizzle[1] &&
             instr->src[0].swizzle[0] == instr->src[0].swizzle[2] &&
             instr->src[0].swizzle[0] == instr->src[0].swizzle[3]) {
-                struct qreg *dest = ntq_get_dest(c, &instr->dest.dest);
-                *dest = qir_PACK_8888_F(c,
-                                        ntq_get_src(c, instr->src[0].src,
-                                                    instr->src[0].swizzle[0]));
+                struct qreg rep = ntq_get_src(c,
+                                              instr->src[0].src,
+                                              instr->src[0].swizzle[0]);
+                ntq_store_dest(c, &instr->dest.dest, 0, qir_PACK_8888_F(c, rep));
                 return;
         }
 
@@ -764,8 +804,7 @@
                 qir_PACK_8_F(c, result, src, i);
         }
 
-        struct qreg *dest = ntq_get_dest(c, &instr->dest.dest);
-        *dest = result;
+        ntq_store_dest(c, &instr->dest.dest, 0, result);
 }
 
 /** Handles sign-extended bitfield extracts for 16 bits. */
@@ -901,6 +940,9 @@ out:
 static void
 ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
 {
+        /* This should always be lowered to ALU operations for VC4. */
+        assert(!instr->dest.saturate);
+
         /* Vectors are special in that they have non-scalarized writemasks,
          * and just take the first swizzle channel for each argument in order
          * into each writemask channel.
@@ -912,9 +954,8 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
                 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
                         srcs[i] = ntq_get_src(c, instr->src[i].src,
                                               instr->src[i].swizzle[0]);
-                struct qreg *dest = ntq_get_dest(c, &instr->dest.dest);
                 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
-                        dest[i] = srcs[i];
+                        ntq_store_dest(c, &instr->dest.dest, i, srcs[i]);
                 return;
         }
 
@@ -926,10 +967,10 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
         if (instr->op == nir_op_unpack_unorm_4x8) {
                 struct qreg src = ntq_get_src(c, instr->src[0].src,
                                               instr->src[0].swizzle[0]);
-                struct qreg *dest = ntq_get_dest(c, &instr->dest.dest);
                 for (int i = 0; i < 4; i++) {
                         if (instr->dest.write_mask & (1 << i))
-                                dest[i] = qir_UNPACK_8_F(c, src, i);
+                                ntq_store_dest(c, &instr->dest.dest, i,
+                                               qir_UNPACK_8_F(c, src, i));
                 }
                 return;
         }
@@ -940,91 +981,87 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
                 src[i] = ntq_get_alu_src(c, instr, i);
         }
 
-        /* Pick the channel to store the output in. */
-        assert(!instr->dest.saturate);
-        struct qreg *dest = ntq_get_dest(c, &instr->dest.dest);
-        assert(util_is_power_of_two(instr->dest.write_mask));
-        dest += ffs(instr->dest.write_mask) - 1;
+        struct qreg result;
 
         switch (instr->op) {
         case nir_op_fmov:
         case nir_op_imov:
-                *dest = qir_MOV(c, src[0]);
+                result = qir_MOV(c, src[0]);
                 break;
         case nir_op_fmul:
-                *dest = qir_FMUL(c, src[0], src[1]);
+                result = qir_FMUL(c, src[0], src[1]);
                 break;
         case nir_op_fadd:
-                *dest = qir_FADD(c, src[0], src[1]);
+                result = qir_FADD(c, src[0], src[1]);
                 break;
         case nir_op_fsub:
-                *dest = qir_FSUB(c, src[0], src[1]);
+                result = qir_FSUB(c, src[0], src[1]);
                 break;
         case nir_op_fmin:
-                *dest = qir_FMIN(c, src[0], src[1]);
+                result = qir_FMIN(c, src[0], src[1]);
                 break;
         case nir_op_fmax:
-                *dest = qir_FMAX(c, src[0], src[1]);
+                result = qir_FMAX(c, src[0], src[1]);
                 break;
 
         case nir_op_f2i:
         case nir_op_f2u:
-                *dest = qir_FTOI(c, src[0]);
+                result = qir_FTOI(c, src[0]);
                 break;
         case nir_op_i2f:
         case nir_op_u2f:
-                *dest = qir_ITOF(c, src[0]);
+                result = qir_ITOF(c, src[0]);
                 break;
         case nir_op_b2f:
-                *dest = qir_AND(c, src[0], qir_uniform_f(c, 1.0));
+                result = qir_AND(c, src[0], qir_uniform_f(c, 1.0));
                 break;
         case nir_op_b2i:
-                *dest = qir_AND(c, src[0], qir_uniform_ui(c, 1));
+                result = qir_AND(c, src[0], qir_uniform_ui(c, 1));
                 break;
         case nir_op_i2b:
         case nir_op_f2b:
                 qir_SF(c, src[0]);
-                *dest = qir_SEL(c, QPU_COND_ZC,
-                                qir_uniform_ui(c, ~0),
-                                qir_uniform_ui(c, 0));
+                result = qir_SEL(c, QPU_COND_ZC,
+                                 qir_uniform_ui(c, ~0),
+                                 qir_uniform_ui(c, 0));
                 break;
 
         case nir_op_iadd:
-                *dest = qir_ADD(c, src[0], src[1]);
+                result = qir_ADD(c, src[0], src[1]);
                 break;
         case nir_op_ushr:
-                *dest = qir_SHR(c, src[0], src[1]);
+                result = qir_SHR(c, src[0], src[1]);
                 break;
         case nir_op_isub:
-                *dest = qir_SUB(c, src[0], src[1]);
+                result = qir_SUB(c, src[0], src[1]);
                 break;
         case nir_op_ishr:
-                *dest = qir_ASR(c, src[0], src[1]);
+                result = qir_ASR(c, src[0], src[1]);
                 break;
         case nir_op_ishl:
-                *dest = qir_SHL(c, src[0], src[1]);
+                result = qir_SHL(c, src[0], src[1]);
                 break;
         case nir_op_imin:
-                *dest = qir_MIN(c, src[0], src[1]);
+                result = qir_MIN(c, src[0], src[1]);
                 break;
         case nir_op_imax:
-                *dest = qir_MAX(c, src[0], src[1]);
+                result = qir_MAX(c, src[0], src[1]);
                 break;
         case nir_op_iand:
-                *dest = qir_AND(c, src[0], src[1]);
+                result = qir_AND(c, src[0], src[1]);
                 break;
         case nir_op_ior:
-                *dest = qir_OR(c, src[0], src[1]);
+                result = qir_OR(c, src[0], src[1]);
                 break;
         case nir_op_ixor:
-                *dest = qir_XOR(c, src[0], src[1]);
+                result = qir_XOR(c, src[0], src[1]);
                 break;
         case nir_op_inot:
-                *dest = qir_NOT(c, src[0]);
+                result = qir_NOT(c, src[0]);
                 break;
         case nir_op_imul:
-                *dest = ntq_umul(c, src[0], src[1]);
+                result = ntq_umul(c, src[0], src[1]);
                 break;
 
         case nir_op_seq:
@@ -1040,90 +1077,90 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
         case nir_op_ige:
         case nir_op_uge:
         case nir_op_ilt:
-                if (!ntq_emit_comparison(c, dest, instr, instr)) {
+                if (!ntq_emit_comparison(c, &result, instr, instr)) {
                         fprintf(stderr, "Bad comparison instruction\n");
                 }
                 break;
 
         case nir_op_bcsel:
-                *dest = ntq_emit_bcsel(c, instr, src);
+                result = ntq_emit_bcsel(c, instr, src);
                 break;
 
         case nir_op_fcsel:
                 qir_SF(c, src[0]);
-                *dest = qir_SEL(c, QPU_COND_ZC, src[1], src[2]);
+                result = qir_SEL(c, QPU_COND_ZC, src[1], src[2]);
                 break;
 
         case nir_op_frcp:
-                *dest = ntq_rcp(c, src[0]);
+                result = ntq_rcp(c, src[0]);
                 break;
         case nir_op_frsq:
-                *dest = ntq_rsq(c, src[0]);
+                result = ntq_rsq(c, src[0]);
                 break;
         case nir_op_fexp2:
-                *dest = qir_EXP2(c, src[0]);
+                result = qir_EXP2(c, src[0]);
                 break;
         case nir_op_flog2:
-                *dest = qir_LOG2(c, src[0]);
+                result = qir_LOG2(c, src[0]);
                 break;
 
         case nir_op_ftrunc:
-                *dest = qir_ITOF(c, qir_FTOI(c, src[0]));
+                result = qir_ITOF(c, qir_FTOI(c, src[0]));
                 break;
         case nir_op_fceil:
-                *dest = ntq_fceil(c, src[0]);
+                result = ntq_fceil(c, src[0]);
                 break;
         case nir_op_ffract:
-                *dest = ntq_ffract(c, src[0]);
+                result = ntq_ffract(c, src[0]);
                 break;
         case nir_op_ffloor:
-                *dest = ntq_ffloor(c, src[0]);
+                result = ntq_ffloor(c, src[0]);
                 break;
 
         case nir_op_fsin:
-                *dest = ntq_fsin(c, src[0]);
+                result = ntq_fsin(c, src[0]);
                 break;
         case nir_op_fcos:
-                *dest = ntq_fcos(c, src[0]);
+                result = ntq_fcos(c, src[0]);
                 break;
 
         case nir_op_fsign:
-                *dest = ntq_fsign(c, src[0]);
+                result = ntq_fsign(c, src[0]);
                 break;
 
         case nir_op_fabs:
-                *dest = qir_FMAXABS(c, src[0], src[0]);
+                result = qir_FMAXABS(c, src[0], src[0]);
                 break;
         case nir_op_iabs:
-                *dest = qir_MAX(c, src[0],
+                result = qir_MAX(c, src[0],
                                 qir_SUB(c, qir_uniform_ui(c, 0), src[0]));
                 break;
 
         case nir_op_ibitfield_extract:
-                *dest = ntq_emit_ibfe(c, src[0], src[1], src[2]);
+                result = ntq_emit_ibfe(c, src[0], src[1], src[2]);
                 break;
         case nir_op_ubitfield_extract:
-                *dest = ntq_emit_ubfe(c, src[0], src[1], src[2]);
+                result = ntq_emit_ubfe(c, src[0], src[1], src[2]);
                 break;
 
         case nir_op_usadd_4x8:
-                *dest = qir_V8ADDS(c, src[0], src[1]);
+                result = qir_V8ADDS(c, src[0], src[1]);
                 break;
 
         case nir_op_ussub_4x8:
-                *dest = qir_V8SUBS(c, src[0], src[1]);
+                result = qir_V8SUBS(c, src[0], src[1]);
                 break;
 
         case nir_op_umin_4x8:
-                *dest = qir_V8MIN(c, src[0], src[1]);
+                result = qir_V8MIN(c, src[0], src[1]);
                 break;
 
         case nir_op_umax_4x8:
-                *dest = qir_V8MAX(c, src[0], src[1]);
+                result = qir_V8MAX(c, src[0], src[1]);
                 break;
 
         case nir_op_umul_unorm_4x8:
-                *dest = qir_V8MULD(c, src[0], src[1]);
+                result = qir_V8MULD(c, src[0], src[1]);
                 break;
 
         default:
@@ -1132,6 +1169,13 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
                 fprintf(stderr, "\n");
                 abort();
         }
+
+        /* We have a scalar result, so the instruction should only have a
+         * single channel written to.
+         */
+        assert(util_is_power_of_two(instr->dest.write_mask));
+        ntq_store_dest(c, &instr->dest.dest,
+                       ffs(instr->dest.write_mask) - 1, result);
 }
 
 static void
@@ -1473,7 +1517,7 @@ ntq_setup_registers(struct vc4_compile *c, struct exec_list *list)
                 _mesa_hash_table_insert(c->def_ht, nir_reg, qregs);
 
                 for (int i = 0; i < array_len * nir_reg->num_components; i++)
-                        qregs[i] = qir_uniform_ui(c, 0);
+                        qregs[i] = qir_get_temp(c);
         }
 }
 
@@ -1502,14 +1546,8 @@ ntq_emit_ssa_undef(struct vc4_compile *c, nir_ssa_undef_instr *instr)
 static void
 ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr)
 {
-        const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
         nir_const_value *const_offset;
         unsigned offset;
-        struct qreg *dest = NULL;
-
-        if (info->has_dest) {
-                dest = ntq_get_dest(c, &instr->dest);
-        }
 
         switch (instr->intrinsic) {
         case nir_intrinsic_load_uniform:
@@ -1521,36 +1559,43 @@ ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr)
                         /* We need dwords */
                         offset = offset / 4;
                         if (offset < VC4_NIR_STATE_UNIFORM_OFFSET) {
-                                *dest = qir_uniform(c, QUNIFORM_UNIFORM,
-                                                    offset);
+                                ntq_store_dest(c, &instr->dest, 0,
+                                               qir_uniform(c, QUNIFORM_UNIFORM,
+                                                           offset));
                         } else {
-                                *dest = qir_uniform(c, offset -
-                                                    VC4_NIR_STATE_UNIFORM_OFFSET,
-                                                    0);
+                                ntq_store_dest(c, &instr->dest, 0,
+                                               qir_uniform(c, offset -
+                                                           VC4_NIR_STATE_UNIFORM_OFFSET,
+                                                           0));
                         }
                 } else {
-                        *dest = indirect_uniform_load(c, instr);
+                        ntq_store_dest(c, &instr->dest, 0,
+                                       indirect_uniform_load(c, instr));
                 }
                 break;
 
         case nir_intrinsic_load_user_clip_plane:
                 for (int i = 0; i < instr->num_components; i++) {
-                        dest[i] = qir_uniform(c, QUNIFORM_USER_CLIP_PLANE,
-                                              instr->const_index[0] * 4 + i);
+                        ntq_store_dest(c, &instr->dest, i,
+                                       qir_uniform(c, QUNIFORM_USER_CLIP_PLANE,
+                                                   instr->const_index[0] * 4 +
+                                                   i));
                 }
                 break;
 
         case nir_intrinsic_load_sample_mask_in:
-                *dest = qir_uniform(c, QUNIFORM_SAMPLE_MASK, 0);
+                ntq_store_dest(c, &instr->dest, 0,
+                               qir_uniform(c, QUNIFORM_SAMPLE_MASK, 0));
                 break;
 
         case nir_intrinsic_load_front_face:
                 /* The register contains 0 (front) or 1 (back), and we need to
                  * turn it into a NIR bool where true means front.
                  */
-                *dest = qir_ADD(c,
-                                qir_uniform_ui(c, -1),
-                                qir_reg(QFILE_FRAG_REV_FLAG, 0));
+                ntq_store_dest(c, &instr->dest, 0,
+                               qir_ADD(c,
+                                       qir_uniform_ui(c, -1),
+                                       qir_reg(QFILE_FRAG_REV_FLAG, 0)));
                 break;
 
         case nir_intrinsic_load_input:
@@ -1570,10 +1615,12 @@ ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr)
                                         qir_TLB_COLOR_READ(c);
                                 }
                         }
-                        *dest = c->color_reads[sample_index];
+                        ntq_store_dest(c, &instr->dest, 0,
+                                       c->color_reads[sample_index]);
                 } else {
                         offset = instr->const_index[0] + const_offset->u32[0];
-                        *dest = c->inputs[offset];
+                        ntq_store_dest(c, &instr->dest, 0,
+                                       c->inputs[offset]);
                 }
                 break;
 
diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h
index ad784bb987b..e284ed58b65 100644
--- a/src/gallium/drivers/vc4/vc4_qir.h
+++ b/src/gallium/drivers/vc4/vc4_qir.h
@@ -408,6 +408,11 @@ struct vc4_compile {
         uint32_t num_ubo_ranges;
         uint32_t next_ubo_dst_offset;
 
+        /* State for whether we're executing on each channel currently. 0 if
+         * yes, otherwise a block number + 1 that the channel jumped to.
+         */
+        struct qreg execute;
+
         struct qreg line_x, point_x, point_y;
         struct qreg discard;
         struct qreg payload_FRAG_Z;
@@ -760,6 +765,13 @@ qir_LOAD_IMM(struct vc4_compile *c, uint32_t val)
                                        qir_reg(QFILE_LOAD_IMM, val), c->undef));
 }
 
+static inline void
+qir_MOV_cond(struct vc4_compile *c, uint8_t cond,
+             struct qreg dest, struct qreg src)
+{
+        qir_MOV_dest(c, dest, src)->cond = cond;
+}
+
 static inline struct qinst *
 qir_BRANCH(struct vc4_compile *c, uint8_t cond)
 {
-- 
2.30.2
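
[Editor's note, appended after the patch so the diff above still applies
cleanly: the core idea of this change is that once control flow exists, a
non-SSA register store must become a conditional move, so channels that
have jumped away keep their old values. The C sketch below models that
rule in isolation. It is illustrative only: struct channel, store_masked,
and NUM_CHANNELS are hypothetical stand-ins, not vc4 driver code. It
follows the convention from the vc4_qir.h comment: execute is 0 on an
active channel, otherwise the block number + 1 that the channel jumped
to.]

/* Standalone model of the conditional-store rule from ntq_store_dest():
 * a write to a NIR register only lands on channels whose execute value
 * is 0 (active).  Illustrative stand-in code, not part of the driver.
 */
#include <stdio.h>
#include <stdint.h>

#define NUM_CHANNELS 4 /* hypothetical SIMD width for this model */

struct channel {
        uint32_t execute; /* 0 = active; else (block number + 1) it jumped to */
        uint32_t reg;     /* backing storage for one NIR register component */
};

/* Mirrors the two paths in ntq_store_dest(): outside control flow the
 * move is unconditional; inside control flow it is masked per channel.
 */
static void
store_masked(struct channel *ch, int n, uint32_t result, int in_control_flow)
{
        for (int i = 0; i < n; i++) {
                if (!in_control_flow || ch[i].execute == 0)
                        ch[i].reg = result;
        }
}

int main(void)
{
        struct channel ch[NUM_CHANNELS] = {
                { 0, 1 }, { 2, 1 }, { 0, 1 }, { 3, 1 },
        };

        store_masked(ch, NUM_CHANNELS, 42, 1);

        /* Only channels 0 and 2 (execute == 0) receive 42; channels 1
         * and 3 keep their previous value, as inactive channels must.
         */
        for (int i = 0; i < NUM_CHANNELS; i++)
                printf("channel %d: reg = %u\n", i, ch[i].reg);
        return 0;
}

[On the QPU itself the mask is not a loop: qir_SF(c, c->execute) sets the
condition flags once per channel, and the QPU_COND_ZS move in the patch
retires only where the zero flag is set, which is exactly the
execute == 0 test modeled above.]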