From 7830e465a5f446616ce49a7f8219256a5503a68b Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Thu, 30 Jul 2015 11:16:13 -0700 Subject: [PATCH] vc4: Lower uniform loads to scalar in NIR. This also moves the vec4-to-byte-addressing math into NIR, so that algebraic has a chance at it. --- src/gallium/drivers/vc4/vc4_nir_lower_io.c | 86 ++++++++++++++++++---- src/gallium/drivers/vc4/vc4_program.c | 26 +++---- 2 files changed, 81 insertions(+), 31 deletions(-) diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_io.c b/src/gallium/drivers/vc4/vc4_nir_lower_io.c index fa06c893cfb..ffc120e8865 100644 --- a/src/gallium/drivers/vc4/vc4_nir_lower_io.c +++ b/src/gallium/drivers/vc4/vc4_nir_lower_io.c @@ -29,10 +29,29 @@ * Walks the NIR generated by TGSI-to-NIR to lower its io intrinsics into * something amenable to the VC4 architecture. * - * Currently, it split inputs and outputs into scalars, and drops any - * non-position outputs in coordinate shaders. + * Currently, it splits inputs, outputs, and uniforms into scalars, drops any + * non-position outputs in coordinate shaders, and fixes up the addressing on + * indirect uniform loads. */ +static void +replace_intrinsic_with_vec4(nir_builder *b, nir_intrinsic_instr *intr, + nir_ssa_def **comps) +{ + + /* Batch things back together into a vec4. This will get split by the + * later ALU scalarization pass. + */ + nir_ssa_def *vec = nir_vec4(b, comps[0], comps[1], comps[2], comps[3]); + + /* Replace the old intrinsic with a reference to our reconstructed + * vec4. + */ + nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(vec), + ralloc_parent(b->impl)); + nir_instr_remove(&intr->instr); +} + static void vc4_nir_lower_input(struct vc4_compile *c, nir_builder *b, nir_intrinsic_instr *intr) @@ -102,18 +121,7 @@ vc4_nir_lower_input(struct vc4_compile *c, nir_builder *b, break; } - /* Batch things back together into a vec4. This will get split by the - * later ALU scalarization pass. 
- */ - nir_ssa_def *vec_instr = nir_vec4(b, dests[0], dests[1], - dests[2], dests[3]); - - /* Replace the old intrinsic with a reference to our reconstructed - * vec4. - */ - nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(vec_instr), - ralloc_parent(b->impl)); - nir_instr_remove(&intr->instr); + replace_intrinsic_with_vec4(b, intr, dests); } static void @@ -158,6 +166,51 @@ vc4_nir_lower_output(struct vc4_compile *c, nir_builder *b, nir_instr_remove(&intr->instr); } +static void +vc4_nir_lower_uniform(struct vc4_compile *c, nir_builder *b, + nir_intrinsic_instr *intr) +{ + /* All TGSI-to-NIR uniform loads are vec4. */ + assert(intr->num_components == 4); + + nir_builder_insert_before_instr(b, &intr->instr); + + /* Generate scalar loads equivalent to the original VEC4. */ + nir_ssa_def *dests[4]; + for (unsigned i = 0; i < intr->num_components; i++) { + nir_intrinsic_instr *intr_comp = + nir_intrinsic_instr_create(c->s, intr->intrinsic); + intr_comp->num_components = 1; + nir_ssa_dest_init(&intr_comp->instr, &intr_comp->dest, 1, NULL); + + if (intr->intrinsic == nir_intrinsic_load_uniform_indirect) { + /* Convert the variable TGSI register index to a byte + * offset. + */ + intr_comp->src[0] = + nir_src_for_ssa(nir_ishl(b, + intr->src[0].ssa, + nir_imm_int(b, 4))); + + /* Convert the offset to be a byte index, too. */ + intr_comp->const_index[0] = (intr->const_index[0] * 16 + + i * 4); + } else { + /* We want a dword index for non-indirect uniform + * loads. 
+ */ + intr_comp->const_index[0] = (intr->const_index[0] * 4 + + i); + } + + dests[i] = &intr_comp->dest.ssa; + + nir_builder_instr_insert(b, &intr_comp->instr); + } + + replace_intrinsic_with_vec4(b, intr, dests); +} + static void vc4_nir_lower_io_instr(struct vc4_compile *c, nir_builder *b, struct nir_instr *instr) @@ -175,6 +228,11 @@ vc4_nir_lower_io_instr(struct vc4_compile *c, nir_builder *b, vc4_nir_lower_output(c, b, intr); break; + case nir_intrinsic_load_uniform: + case nir_intrinsic_load_uniform_indirect: + vc4_nir_lower_uniform(c, b, intr); + break; + default: break; } diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c index ddc997003b2..f2742986beb 100644 --- a/src/gallium/drivers/vc4/vc4_program.c +++ b/src/gallium/drivers/vc4/vc4_program.c @@ -42,6 +42,9 @@ #include "simpenrose/simpenrose.h" #endif +static struct qreg +ntq_get_src(struct vc4_compile *c, nir_src src, int i); + static void resize_qreg_array(struct vc4_compile *c, struct qreg **regs, @@ -64,10 +67,10 @@ resize_qreg_array(struct vc4_compile *c, } static struct qreg -indirect_uniform_load(struct vc4_compile *c, - struct qreg indirect_offset, - unsigned offset) +indirect_uniform_load(struct vc4_compile *c, nir_intrinsic_instr *intr) { + struct qreg indirect_offset = ntq_get_src(c, intr->src[0], 0); + uint32_t offset = intr->const_index[0]; struct vc4_compiler_ubo_range *range = NULL; unsigned i; for (i = 0; i < c->num_uniform_ranges; i++) { @@ -89,10 +92,6 @@ indirect_uniform_load(struct vc4_compile *c, }; offset -= range->src_offset; - /* Translate the user's TGSI register index from the TGSI register - * base to a byte offset. - */ - indirect_offset = qir_SHL(c, indirect_offset, qir_uniform_ui(c, 4)); /* Adjust for where we stored the TGSI register base. 
*/ indirect_offset = qir_ADD(c, indirect_offset, @@ -1793,19 +1792,12 @@ ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr) switch (instr->intrinsic) { case nir_intrinsic_load_uniform: - for (int i = 0; i < instr->num_components; i++) { - dest[i] = qir_uniform(c, QUNIFORM_UNIFORM, - instr->const_index[0] * 4 + i); - } + assert(instr->num_components == 1); + *dest = qir_uniform(c, QUNIFORM_UNIFORM, instr->const_index[0]); break; case nir_intrinsic_load_uniform_indirect: - for (int i = 0; i < instr->num_components; i++) { - dest[i] = indirect_uniform_load(c, - ntq_get_src(c, instr->src[0], 0), - (instr->const_index[0] * - 4 + i) * sizeof(float)); - } + *dest = indirect_uniform_load(c, instr); break; -- 2.30.2