From 921feb8782bdc3c459922858bee6d55919467436 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Fri, 31 Jul 2015 20:58:57 -0700
Subject: [PATCH] vc4: Switch our vertex attr lowering to being NIR-based.

This exposes more information to NIR's optimization, and should be
particularly useful when we do range-based optimization.

total uniforms in shared programs: 32066 -> 32065 (-0.00%)
uniforms in affected programs:     21 -> 20 (-4.76%)
total instructions in shared programs: 93104 -> 92630 (-0.51%)
instructions in affected programs:     31901 -> 31427 (-1.49%)
---
 src/gallium/drivers/vc4/vc4_nir_lower_io.c | 233 ++++++++++++++++++---
 src/gallium/drivers/vc4/vc4_program.c      | 110 +---------
 2 files changed, 200 insertions(+), 143 deletions(-)

diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_io.c b/src/gallium/drivers/vc4/vc4_nir_lower_io.c
index 761e2c819c5..caf706aa2a6 100644
--- a/src/gallium/drivers/vc4/vc4_nir_lower_io.c
+++ b/src/gallium/drivers/vc4/vc4_nir_lower_io.c
@@ -23,6 +23,7 @@
 
 #include "vc4_qir.h"
 #include "glsl/nir/nir_builder.h"
+#include "util/u_format.h"
 
 /**
  * Walks the NIR generated by TGSI-to-NIR to lower its io intrinsics into
@@ -50,14 +51,182 @@ replace_intrinsic_with_vec4(nir_builder *b, nir_intrinsic_instr *intr,
         nir_instr_remove(&intr->instr);
 }
 
+static nir_ssa_def *
+vc4_nir_unpack_8i(nir_builder *b, nir_ssa_def *src, unsigned chan)
+{
+        return nir_ubitfield_extract(b,
+                                     src,
+                                     nir_imm_int(b, 8 * chan),
+                                     nir_imm_int(b, 8));
+}
+
+/** Returns 16-bit field chan of src as a sign-extended 32-bit value. */
+static nir_ssa_def *
+vc4_nir_unpack_16i(nir_builder *b, nir_ssa_def *src, unsigned chan)
+{
+        return nir_ibitfield_extract(b,
+                                     src,
+                                     nir_imm_int(b, 16 * chan),
+                                     nir_imm_int(b, 16));
+}
+
+/** Returns 16-bit field chan (0 = low, else high) of src, zero-extended. */
+static nir_ssa_def *
+vc4_nir_unpack_16u(nir_builder *b, nir_ssa_def *src, unsigned chan)
+{
+        if (chan == 0) {
+                return nir_iand(b, src, nir_imm_int(b, 0xffff));
+        } else {
+                return nir_ushr(b, src, nir_imm_int(b, 16));
+        }
+}
+
+static nir_ssa_def *
+vc4_nir_unpack_8f(nir_builder *b, nir_ssa_def *src, unsigned chan)
+{
+        return nir_swizzle(b, nir_unpack_unorm_4x8(b, src), &chan, 1, false);
+}
+
+static nir_ssa_def *
+vc4_nir_get_vattr_channel_vpm(struct vc4_compile *c,
+                              nir_builder *b,
+                              nir_ssa_def **vpm_reads,
+                              uint8_t swiz,
+                              const struct util_format_description *desc)
+{
+        const struct util_format_channel_description *chan =
+                &desc->channel[swiz];
+        nir_ssa_def *temp;
+
+        if (swiz > UTIL_FORMAT_SWIZZLE_W) {
+                return vc4_nir_get_swizzled_channel(b, vpm_reads, swiz);
+        } else if (chan->size == 32 && chan->type == UTIL_FORMAT_TYPE_FLOAT) {
+                return vc4_nir_get_swizzled_channel(b, vpm_reads, swiz);
+        } else if (chan->size == 32 && chan->type == UTIL_FORMAT_TYPE_SIGNED) {
+                if (chan->normalized) {
+                        return nir_fmul(b,
+                                        nir_i2f(b, vpm_reads[swiz]),
+                                        nir_imm_float(b,
+                                                      1.0 / 0x7fffffff));
+                } else {
+                        return nir_i2f(b, vpm_reads[swiz]);
+                }
+        } else if (chan->size == 8 &&
+                   (chan->type == UTIL_FORMAT_TYPE_UNSIGNED ||
+                    chan->type == UTIL_FORMAT_TYPE_SIGNED)) {
+                nir_ssa_def *vpm = vpm_reads[0];
+                if (chan->type == UTIL_FORMAT_TYPE_SIGNED) {
+                        temp = nir_ixor(b, vpm, nir_imm_int(b, 0x80808080));
+                        if (chan->normalized) {
+                                return nir_fsub(b, nir_fmul(b,
+                                                            vc4_nir_unpack_8f(b, temp, swiz),
+                                                            nir_imm_float(b, 2.0)),
+                                                nir_imm_float(b, 1.0));
+                        } else {
+                                return nir_fadd(b,
+                                                nir_i2f(b,
+                                                        vc4_nir_unpack_8i(b, temp,
+                                                                          swiz)),
+                                                nir_imm_float(b, -128.0));
+                        }
+                } else {
+                        if (chan->normalized) {
+                                return vc4_nir_unpack_8f(b, vpm, swiz);
+                        } else {
+                                return nir_i2f(b, vc4_nir_unpack_8i(b, vpm, swiz));
+                        }
+                }
+        } else if (chan->size == 16 &&
+                   (chan->type == UTIL_FORMAT_TYPE_UNSIGNED ||
+                    chan->type == UTIL_FORMAT_TYPE_SIGNED)) {
+                nir_ssa_def *vpm = vpm_reads[swiz / 2];
+
+                /* Each VPM dword packs two 16-bit integer channels (not half
+                 * floats), so extract the field as an int and then i2f it.
+                 */
+                if (chan->type == UTIL_FORMAT_TYPE_SIGNED) {
+                        temp = nir_i2f(b, vc4_nir_unpack_16i(b, vpm, swiz & 1));
+                        if (chan->normalized) {
+                                return nir_fmul(b, temp,
+                                                nir_imm_float(b, 1/32768.0f));
+                        } else {
+                                return temp;
+                        }
+                } else {
+                        temp = nir_i2f(b, vc4_nir_unpack_16u(b, vpm, swiz & 1));
+                        if (chan->normalized) {
+                                return nir_fmul(b, temp,
+                                                nir_imm_float(b, 1 / 65535.0));
+                        } else {
+                                return temp;
+                        }
+                }
+        } else {
+                return NULL;
+        }
+}
+
+static void
+vc4_nir_lower_vertex_attr(struct vc4_compile *c, nir_builder *b,
+                          nir_intrinsic_instr *intr)
+{
+        b->cursor = nir_before_instr(&intr->instr);
+
+        int attr = intr->const_index[0];
+        enum pipe_format format = c->vs_key->attr_formats[attr];
+        uint32_t attr_size = util_format_get_blocksize(format);
+
+        /* All TGSI-to-NIR inputs are vec4. */
+        assert(intr->num_components == 4);
+
+        /* Generate dword loads for the VPM values.  (Since these intrinsics
+         * may be reordered, the actual reads will be generated at the top of
+         * the shader by ntq_setup_inputs().)
+         */
+        nir_ssa_def *vpm_reads[4];
+        for (int i = 0; i < align(attr_size, 4) / 4; i++) {
+                nir_intrinsic_instr *intr_comp =
+                        nir_intrinsic_instr_create(c->s,
+                                                   nir_intrinsic_load_input);
+                intr_comp->num_components = 1;
+                intr_comp->const_index[0] = intr->const_index[0] * 4 + i;
+                nir_ssa_dest_init(&intr_comp->instr, &intr_comp->dest, 1, NULL);
+                nir_builder_instr_insert(b, &intr_comp->instr);
+
+                vpm_reads[i] = &intr_comp->dest.ssa;
+        }
+
+        bool format_warned = false;
+        const struct util_format_description *desc =
+                util_format_description(format);
+
+        nir_ssa_def *dests[4];
+        for (int i = 0; i < 4; i++) {
+                uint8_t swiz = desc->swizzle[i];
+                dests[i] = vc4_nir_get_vattr_channel_vpm(c, b, vpm_reads, swiz,
+                                                         desc);
+
+                if (!dests[i]) {
+                        if (!format_warned) {
+                                fprintf(stderr,
+                                        "vtx element %d unsupported type: %s\n",
+                                        attr, util_format_name(format));
+                                format_warned = true;
+                        }
+                        dests[i] = nir_imm_float(b, 0.0);
+                }
+        }
+
+        replace_intrinsic_with_vec4(b, intr, dests);
+}
+
 static void
-vc4_nir_lower_input(struct vc4_compile *c, nir_builder *b,
-                    nir_intrinsic_instr *intr)
+vc4_nir_lower_fs_input(struct vc4_compile *c, nir_builder *b,
+                       nir_intrinsic_instr *intr)
 {
         b->cursor = nir_before_instr(&intr->instr);
 
-        if (c->stage == QSTAGE_FRAG && intr->const_index[0] ==
-            VC4_NIR_TLB_COLOR_READ_INPUT) {
+        if (intr->const_index[0] == VC4_NIR_TLB_COLOR_READ_INPUT) {
                 /* This doesn't need any lowering. */
                 return;
         }
@@ -87,38 +256,31 @@ vc4_nir_lower_input(struct vc4_compile *c, nir_builder *b,
                 dests[i] = &intr_comp->dest.ssa;
         }
 
-        switch (c->stage) {
-        case QSTAGE_FRAG:
-                if (input_var->data.location == VARYING_SLOT_FACE) {
-                        dests[0] = nir_fsub(b,
-                                            nir_imm_float(b, 1.0),
-                                            nir_fmul(b,
-                                                     nir_i2f(b, dests[0]),
-                                                     nir_imm_float(b, 2.0)));
-                        dests[1] = nir_imm_float(b, 0.0);
+        if (input_var->data.location == VARYING_SLOT_FACE) {
+                dests[0] = nir_fsub(b,
+                                    nir_imm_float(b, 1.0),
+                                    nir_fmul(b,
+                                             nir_i2f(b, dests[0]),
+                                             nir_imm_float(b, 2.0)));
+                dests[1] = nir_imm_float(b, 0.0);
+                dests[2] = nir_imm_float(b, 0.0);
+                dests[3] = nir_imm_float(b, 1.0);
+        } else if (input_var->data.location >= VARYING_SLOT_VAR0) {
+                if (c->fs_key->point_sprite_mask &
+                    (1 << (input_var->data.location -
+                           VARYING_SLOT_VAR0))) {
+                        if (!c->fs_key->is_points) {
+                                dests[0] = nir_imm_float(b, 0.0);
+                                dests[1] = nir_imm_float(b, 0.0);
+                        }
+                        if (c->fs_key->point_coord_upper_left) {
+                                dests[1] = nir_fsub(b,
+                                                    nir_imm_float(b, 1.0),
+                                                    dests[1]);
+                        }
                         dests[2] = nir_imm_float(b, 0.0);
                         dests[3] = nir_imm_float(b, 1.0);
-                } else if (input_var->data.location >= VARYING_SLOT_VAR0) {
-                        if (c->fs_key->point_sprite_mask &
-                            (1 << (input_var->data.location -
-                                   VARYING_SLOT_VAR0))) {
-                                if (!c->fs_key->is_points) {
-                                        dests[0] = nir_imm_float(b, 0.0);
-                                        dests[1] = nir_imm_float(b, 0.0);
-                                }
-                                if (c->fs_key->point_coord_upper_left) {
-                                        dests[1] = nir_fsub(b,
-                                                            nir_imm_float(b, 1.0),
-                                                            dests[1]);
-                                }
-                                dests[2] = nir_imm_float(b, 0.0);
-                                dests[3] = nir_imm_float(b, 1.0);
-                        }
                 }
-                break;
-        case QSTAGE_COORD:
-        case QSTAGE_VERT:
-                break;
         }
 
         replace_intrinsic_with_vec4(b, intr, dests);
@@ -232,7 +394,10 @@ vc4_nir_lower_io_instr(struct vc4_compile *c, nir_builder *b,
 
         switch (intr->intrinsic) {
         case nir_intrinsic_load_input:
-                vc4_nir_lower_input(c, b, intr);
+                if (c->stage == QSTAGE_FRAG)
+                        vc4_nir_lower_fs_input(c, b, intr);
+                else
+                        vc4_nir_lower_vertex_attr(c, b, intr);
                 break;
 
         case nir_intrinsic_store_output:
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index d3e856a8530..6e9ec6530c6 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -602,126 +602,18 @@ ntq_fsign(struct vc4_compile *c, struct qreg src)
                               qir_uniform_f(c, -1.0));
 }
 
-static struct qreg
-get_channel_from_vpm(struct vc4_compile *c,
-                     struct qreg *vpm_reads,
-                     uint8_t swiz,
-                     const struct util_format_description *desc)
-{
-        const struct util_format_channel_description *chan =
-                &desc->channel[swiz];
-        struct qreg temp;
-
-        if (swiz > UTIL_FORMAT_SWIZZLE_W)
-                return get_swizzled_channel(c, vpm_reads, swiz);
-        else if (chan->size == 32 &&
-                 chan->type == UTIL_FORMAT_TYPE_FLOAT) {
-                return get_swizzled_channel(c, vpm_reads, swiz);
-        } else if (chan->size == 32 &&
-                   chan->type == UTIL_FORMAT_TYPE_SIGNED) {
-                if (chan->normalized) {
-                        return qir_FMUL(c,
-                                        qir_ITOF(c, vpm_reads[swiz]),
-                                        qir_uniform_f(c,
-                                                      1.0 / 0x7fffffff));
-                } else {
-                        return qir_ITOF(c, vpm_reads[swiz]);
-                }
-        } else if (chan->size == 8 &&
-                   (chan->type == UTIL_FORMAT_TYPE_UNSIGNED ||
-                    chan->type == UTIL_FORMAT_TYPE_SIGNED)) {
-                struct qreg vpm = vpm_reads[0];
-                if (chan->type == UTIL_FORMAT_TYPE_SIGNED) {
-                        temp = qir_XOR(c, vpm, qir_uniform_ui(c, 0x80808080));
-                        if (chan->normalized) {
-                                return qir_FSUB(c, qir_FMUL(c,
-                                                            qir_UNPACK_8_F(c, temp, swiz),
-                                                            qir_uniform_f(c, 2.0)),
-                                                qir_uniform_f(c, 1.0));
-                        } else {
-                                return qir_FADD(c,
-                                                qir_ITOF(c,
-                                                         qir_UNPACK_8_I(c, temp,
-                                                                        swiz)),
-                                                qir_uniform_f(c, -128.0));
-                        }
-                } else {
-                        if (chan->normalized) {
-                                return qir_UNPACK_8_F(c, vpm, swiz);
-                        } else {
-                                return qir_ITOF(c, qir_UNPACK_8_I(c, vpm, swiz));
-                        }
-                }
-        } else if (chan->size == 16 &&
-                   (chan->type == UTIL_FORMAT_TYPE_UNSIGNED ||
-                    chan->type == UTIL_FORMAT_TYPE_SIGNED)) {
-                struct qreg vpm = vpm_reads[swiz / 2];
-
-                /* Note that UNPACK_16F eats a half float, not ints, so we use
-                 * UNPACK_16_I for all of these.
-                 */
-                if (chan->type == UTIL_FORMAT_TYPE_SIGNED) {
-                        temp = qir_ITOF(c, qir_UNPACK_16_I(c, vpm, swiz % 2));
-                        if (chan->normalized) {
-                                return qir_FMUL(c, temp,
-                                                qir_uniform_f(c, 1/32768.0f));
-                        } else {
-                                return temp;
-                        }
-                } else {
-                        /* UNPACK_16I sign-extends, so we have to emit ANDs. */
-                        temp = vpm;
-                        if (swiz == 1 || swiz == 3)
-                                temp = qir_UNPACK_16_I(c, temp, 1);
-                        temp = qir_AND(c, temp, qir_uniform_ui(c, 0xffff));
-                        temp = qir_ITOF(c, temp);
-
-                        if (chan->normalized) {
-                                return qir_FMUL(c, temp,
-                                                qir_uniform_f(c, 1 / 65535.0));
-                        } else {
-                                return temp;
-                        }
-                }
-        } else {
-                return c->undef;
-        }
-}
-
 static void
 emit_vertex_input(struct vc4_compile *c, int attr)
 {
         enum pipe_format format = c->vs_key->attr_formats[attr];
         uint32_t attr_size = util_format_get_blocksize(format);
-        struct qreg vpm_reads[4];
 
         c->vattr_sizes[attr] = align(attr_size, 4);
         for (int i = 0; i < align(attr_size, 4) / 4; i++) {
                 struct qreg vpm = { QFILE_VPM, attr * 4 + i };
-                vpm_reads[i] = qir_MOV(c, vpm);
+                c->inputs[attr * 4 + i] = qir_MOV(c, vpm);
                 c->num_inputs++;
         }
-
-        bool format_warned = false;
-        const struct util_format_description *desc =
-                util_format_description(format);
-
-        for (int i = 0; i < 4; i++) {
-                uint8_t swiz = desc->swizzle[i];
-                struct qreg result = get_channel_from_vpm(c, vpm_reads,
-                                                          swiz, desc);
-
-                if (result.file == QFILE_NULL) {
-                        if (!format_warned) {
-                                fprintf(stderr,
-                                        "vtx element %d unsupported type: %s\n",
-                                        attr, util_format_name(format));
-                                format_warned = true;
-                        }
-                        result = qir_uniform_f(c, 0.0);
-                }
-                c->inputs[attr * 4 + i] = result;
-        }
 }
 
 static void
-- 
2.30.2