From 292c24ddac5acc35676424f05291c101fcd47b3e Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Fri, 24 Feb 2017 12:57:03 -0800
Subject: [PATCH] vc4: Lazily emit our FS/VS input loads.

This reduces register pressure in both types of shaders by reordering the
input loads from var->data.driver_location order to the order in which
they first appear in the NIR shader.  These instructions aren't
reorderable at our QIR scheduling level because the FS takes two in
lockstep to do an interpolation, and the VS takes multiple read
instructions in a row to get a whole vec4-level attribute read.

shader-db impact:
total instructions in shared programs: 76666 -> 76590 (-0.10%)
instructions in affected programs:     42945 -> 42869 (-0.18%)
total max temps in shared programs: 9395 -> 9208 (-1.99%)
max temps in affected programs:     2951 -> 2764 (-6.34%)

Some programs get their max temps hurt, depending on the order in which
the load_input intrinsics appear, because we end up being unable to
copy-propagate an older VPM read into its only use.
---
 src/gallium/drivers/vc4/vc4_context.h |   8 +-
 src/gallium/drivers/vc4/vc4_draw.c    |   4 +-
 src/gallium/drivers/vc4/vc4_program.c | 149 ++++++++++++++------------
 src/gallium/drivers/vc4/vc4_qir.h     |   7 ++
 4 files changed, 93 insertions(+), 75 deletions(-)

diff --git a/src/gallium/drivers/vc4/vc4_context.h b/src/gallium/drivers/vc4/vc4_context.h
index 6bd2424ec79..f346474abe0 100644
--- a/src/gallium/drivers/vc4/vc4_context.h
+++ b/src/gallium/drivers/vc4/vc4_context.h
@@ -174,10 +174,10 @@ struct vc4_compiled_shader {
 
         uint8_t num_inputs;
 
-        /* Byte offsets for the start of the vertex attributes 0-7, and the
-         * total size as "attribute" 8.
-         */
-        uint8_t vattr_offsets[9];
+        /** Byte offsets for the start of the vertex attributes. */
+        uint8_t vattr_offsets[8];
+        /** Total size of the vertex inputs, in bytes. */
+        uint8_t vattr_total_size;
         uint8_t vattrs_live;
 
         const struct vc4_fs_inputs *fs_inputs;
diff --git a/src/gallium/drivers/vc4/vc4_draw.c b/src/gallium/drivers/vc4/vc4_draw.c
index ebd080298a4..9f3765db1af 100644
--- a/src/gallium/drivers/vc4/vc4_draw.c
+++ b/src/gallium/drivers/vc4/vc4_draw.c
@@ -170,14 +170,14 @@ vc4_emit_gl_shader_state(struct vc4_context *vc4,
         /* VC4_DIRTY_COMPILED_VS */
         cl_u16(&shader_rec, 0); /* vs num uniforms */
         cl_u8(&shader_rec, vc4->prog.vs->vattrs_live);
-        cl_u8(&shader_rec, vc4->prog.vs->vattr_offsets[8]);
+        cl_u8(&shader_rec, vc4->prog.vs->vattr_total_size);
         cl_reloc(job, &job->shader_rec, &shader_rec, vc4->prog.vs->bo, 0);
         cl_u32(&shader_rec, 0); /* UBO offset written by kernel */
 
         /* VC4_DIRTY_COMPILED_CS */
         cl_u16(&shader_rec, 0); /* cs num uniforms */
         cl_u8(&shader_rec, vc4->prog.cs->vattrs_live);
-        cl_u8(&shader_rec, vc4->prog.cs->vattr_offsets[8]);
+        cl_u8(&shader_rec, vc4->prog.cs->vattr_total_size);
         cl_reloc(job, &job->shader_rec, &shader_rec, vc4->prog.cs->bo, 0);
         cl_u32(&shader_rec, 0); /* UBO offset written by kernel */
 
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index 05e596e733b..21753439cf6 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -733,11 +733,14 @@ emit_vertex_input(struct vc4_compile *c, int attr)
 {
         enum pipe_format format = c->vs_key->attr_formats[attr];
         uint32_t attr_size = util_format_get_blocksize(format);
+        uint32_t vpm_attr = c->next_vpm_input++;
 
-        c->vattr_sizes[attr] = align(attr_size, 4);
+        c->vpm_input_order[vpm_attr] = attr;
+
+        c->vattr_sizes[vpm_attr] = align(attr_size, 4);
         for (int i = 0; i < align(attr_size, 4) / 4; i++) {
                 c->inputs[attr * 4 + i] =
-                        qir_MOV(c, qir_reg(QFILE_VPM, attr * 4 + i));
+                        qir_MOV(c, qir_reg(QFILE_VPM, vpm_attr * 4 + i));
                 c->num_inputs++;
         }
 }
@@ -1466,6 +1469,7 @@ emit_stub_vpm_read(struct vc4_compile *c)
         if (c->num_inputs)
                 return;
 
+        c->next_vpm_input++;
         c->vattr_sizes[0] = 4;
         (void)qir_MOV(c, qir_reg(QFILE_VPM, 0));
         c->num_inputs++;
@@ -1552,64 +1556,6 @@ vc4_optimize_nir(struct nir_shader *s)
         } while (progress);
 }
 
-static int
-driver_location_compare(const void *in_a, const void *in_b)
-{
-        const nir_variable *const *a = in_a;
-        const nir_variable *const *b = in_b;
-
-        return (*a)->data.driver_location - (*b)->data.driver_location;
-}
-
-static void
-ntq_setup_inputs(struct vc4_compile *c)
-{
-        unsigned num_entries = 0;
-        nir_foreach_variable(var, &c->s->inputs)
-                num_entries++;
-
-        nir_variable *vars[num_entries];
-
-        unsigned i = 0;
-        nir_foreach_variable(var, &c->s->inputs)
-                vars[i++] = var;
-
-        /* Sort the variables so that we emit the input setup in
-         * driver_location order.  This is required for VPM reads, whose data
-         * is fetched into the VPM in driver_location (TGSI register index)
-         * order.
-         */
-        qsort(&vars, num_entries, sizeof(*vars), driver_location_compare);
-
-        for (unsigned i = 0; i < num_entries; i++) {
-                nir_variable *var = vars[i];
-                unsigned array_len = MAX2(glsl_get_length(var->type), 1);
-                unsigned loc = var->data.driver_location;
-
-                assert(array_len == 1);
-                (void)array_len;
-                resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
-                                  (loc + 1) * 4);
-
-                if (c->stage == QSTAGE_FRAG) {
-                        if (var->data.location == VARYING_SLOT_POS) {
-                                emit_fragcoord_input(c, loc);
-                        } else if (var->data.location == VARYING_SLOT_PNTC ||
-                                   (var->data.location >= VARYING_SLOT_VAR0 &&
-                                    (c->fs_key->point_sprite_mask &
-                                     (1 << (var->data.location -
-                                            VARYING_SLOT_VAR0))))) {
-                                c->inputs[loc * 4 + 0] = c->point_x;
-                                c->inputs[loc * 4 + 1] = c->point_y;
-                        } else {
-                                emit_fragment_input(c, loc, var->data.location);
-                        }
-                } else {
-                        emit_vertex_input(c, loc);
-                }
-        }
-}
-
 static void
 ntq_setup_outputs(struct vc4_compile *c)
 {
@@ -1740,10 +1686,73 @@ ntq_emit_load_input(struct vc4_compile *c, nir_intrinsic_instr *instr)
                 return;
         }
 
-        uint32_t offset = nir_intrinsic_base(instr) + const_offset->u32[0];
+        /* Size our inputs array as far as this input.  Input arrays are
+         * small, and we don't have a shader_info field that tells us up front
+         * what the maximum driver_location is.
+         */
+        uint32_t loc = nir_intrinsic_base(instr) + const_offset->u32[0];
+        if ((loc + 1) * 4 > c->inputs_array_size) {
+                resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
+                                  (loc + 1) * 4);
+        }
+
+        /* If we've already loaded this input, just return it.  This would
+         * happen for VPM loads, where we load an entire vertex attribute at
+         * once, or possibly also in the FS if we haven't CSEed away repeated
+         * loads.
+         */
         int comp = nir_intrinsic_component(instr);
+        if (c->inputs[loc * 4 + comp].file != QFILE_NULL) {
+                ntq_store_dest(c, &instr->dest, 0,
+                               qir_MOV(c, c->inputs[loc * 4 + comp]));
+                return;
+        }
+
+        /* In the FS, we always have to fully drain our FS FIFO before
+         * terminating the shader.  For the VS we only have to drain whatever
+         * VPM setup we configure, but vc4_qpu_emit.c configures it for the
+         * entire vertex attribute space.  Because of this, we emit our lazy
+         * varying/VPM loads at the last top level basic block.
+         */
+        struct qblock *saved_cur_block = c->cur_block;
+        c->cur_block = c->last_top_block;
+
+        /* Look up the NIR variable for this input, so we can see how big the
+         * input is, or what sort of interpolation is necessary.
+         */
+        nir_variable *var = NULL;
+        nir_foreach_variable(search_var, &c->s->inputs) {
+                unsigned search_len = MAX2(glsl_get_length(search_var->type), 1);
+                unsigned search_loc = search_var->data.driver_location;
+
+                if (loc >= search_loc && loc < search_loc + search_len) {
+                        var = search_var;
+                        break;
+                }
+        }
+        assert(var);
+
+        if (c->stage == QSTAGE_FRAG) {
+                if (var->data.location == VARYING_SLOT_POS) {
+                        emit_fragcoord_input(c, loc);
+                } else if (var->data.location == VARYING_SLOT_PNTC ||
+                           (var->data.location >= VARYING_SLOT_VAR0 &&
+                            (c->fs_key->point_sprite_mask &
+                             (1 << (var->data.location -
+                                    VARYING_SLOT_VAR0))))) {
+                        c->inputs[loc * 4 + 0] = c->point_x;
+                        c->inputs[loc * 4 + 1] = c->point_y;
+                } else {
+                        emit_fragment_input(c, loc, var->data.location);
+                }
+        } else {
+                emit_vertex_input(c, loc);
+        }
+
+        c->cur_block = saved_cur_block;
+
         ntq_store_dest(c, &instr->dest, 0,
-                       qir_MOV(c, c->inputs[offset * 4 + comp]));
+                       qir_MOV(c, c->inputs[loc * 4 + comp]));
 }
 
 static void
@@ -2161,7 +2170,6 @@ nir_to_qir(struct vc4_compile *c)
         if (c->stage == QSTAGE_FRAG && c->s->info->fs.uses_discard)
                 c->discard = qir_MOV(c, qir_uniform_ui(c, 0));
 
-        ntq_setup_inputs(c);
         ntq_setup_outputs(c);
         ntq_setup_uniforms(c);
         ntq_setup_registers(c, &c->s->registers);
@@ -2587,14 +2595,17 @@ vc4_get_compiled_shader(struct vc4_context *vc4, enum qstage stage,
         } else {
                 shader->num_inputs = c->num_inputs;
 
-                shader->vattr_offsets[0] = 0;
-                for (int i = 0; i < 8; i++) {
-                        shader->vattr_offsets[i + 1] =
-                                shader->vattr_offsets[i] + c->vattr_sizes[i];
+                uint8_t next_vattr_offset = 0;
+                for (int i = 0; i < c->next_vpm_input; i++) {
+                        if (!c->vattr_sizes[i])
+                                continue;
 
-                        if (c->vattr_sizes[i])
-                                shader->vattrs_live |= (1 << i);
+                        uint32_t nir_attr = c->vpm_input_order[i];
+                        shader->vattr_offsets[nir_attr] = next_vattr_offset;
+                        next_vattr_offset += c->vattr_sizes[i];
+                        shader->vattrs_live |= (1 << nir_attr);
                 }
+                shader->vattr_total_size = next_vattr_offset;
         }
 
         shader->failed = c->failed;
diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h
index 6469e51b051..fe86232aeb2 100644
--- a/src/gallium/drivers/vc4/vc4_qir.h
+++ b/src/gallium/drivers/vc4/vc4_qir.h
@@ -461,6 +461,13 @@ struct vc4_compile {
 
         uint8_t vattr_sizes[8];
 
+        /**
+         * Order in which the vattrs were loaded by the program, to arrange
+         * vattr_offsets[] in the program data appropriately.
+         */
+        uint8_t vpm_input_order[8];
+        uint8_t next_vpm_input;
+
         /**
          * Array of the VARYING_SLOT_* of all FS QFILE_VARY reads.
          *
-- 
2.30.2