vc4: Lazily emit our FS/VS input loads.
authorEric Anholt <eric@anholt.net>
Fri, 24 Feb 2017 20:57:03 +0000 (12:57 -0800)
committerEric Anholt <eric@anholt.net>
Sat, 25 Feb 2017 01:01:29 +0000 (17:01 -0800)
This reduces register pressure in both types of shaders, by reordering the
input loads from the var->data.driver_location order to whatever order
they appear first in the NIR shader.  These instructions aren't
reorderable at our QIR scheduling level because the FS takes two instructions in
lockstep to do an interpolation, and the VS takes multiple read
instructions in a row to get a whole vec4-level attribute read.

shader-db impact:
total instructions in shared programs: 76666 -> 76590 (-0.10%)
instructions in affected programs:     42945 -> 42869 (-0.18%)
total max temps in shared programs: 9395 -> 9208 (-1.99%)
max temps in affected programs:     2951 -> 2764 (-6.34%)

Some programs get their max temps hurt, depending on the order that the
load_input intrinsics appear, because we end up being unable to copy
propagate an older VPM read into its only use.

src/gallium/drivers/vc4/vc4_context.h
src/gallium/drivers/vc4/vc4_draw.c
src/gallium/drivers/vc4/vc4_program.c
src/gallium/drivers/vc4/vc4_qir.h

index 6bd2424ec797a2ac90a6491f980b189ba2b99dc4..f346474abe0a48ef9f5cfb42088db5ca91f534ea 100644 (file)
@@ -174,10 +174,10 @@ struct vc4_compiled_shader {
 
         uint8_t num_inputs;
 
-        /* Byte offsets for the start of the vertex attributes 0-7, and the
-         * total size as "attribute" 8.
-         */
-        uint8_t vattr_offsets[9];
+        /** Byte offsets for the start of the vertex attributes. */
+        uint8_t vattr_offsets[8];
+        /** Total size of the vertex inputs, in bytes. */
+        uint8_t vattr_total_size;
         uint8_t vattrs_live;
 
         const struct vc4_fs_inputs *fs_inputs;
index ebd080298a4b37cff6ad329a950d321c1fda584b..9f3765db1af9895a6c16f88f0317ba003720869c 100644 (file)
@@ -170,14 +170,14 @@ vc4_emit_gl_shader_state(struct vc4_context *vc4,
         /* VC4_DIRTY_COMPILED_VS */
         cl_u16(&shader_rec, 0); /* vs num uniforms */
         cl_u8(&shader_rec, vc4->prog.vs->vattrs_live);
-        cl_u8(&shader_rec, vc4->prog.vs->vattr_offsets[8]);
+        cl_u8(&shader_rec, vc4->prog.vs->vattr_total_size);
         cl_reloc(job, &job->shader_rec, &shader_rec, vc4->prog.vs->bo, 0);
         cl_u32(&shader_rec, 0); /* UBO offset written by kernel */
 
         /* VC4_DIRTY_COMPILED_CS */
         cl_u16(&shader_rec, 0); /* cs num uniforms */
         cl_u8(&shader_rec, vc4->prog.cs->vattrs_live);
-        cl_u8(&shader_rec, vc4->prog.cs->vattr_offsets[8]);
+        cl_u8(&shader_rec, vc4->prog.cs->vattr_total_size);
         cl_reloc(job, &job->shader_rec, &shader_rec, vc4->prog.cs->bo, 0);
         cl_u32(&shader_rec, 0); /* UBO offset written by kernel */
 
index 05e596e733b290fb54f8e252b6b6c679becff640..21753439cf67d7ec45bc8e078190781a4ec2f271 100644 (file)
@@ -733,11 +733,14 @@ emit_vertex_input(struct vc4_compile *c, int attr)
 {
         enum pipe_format format = c->vs_key->attr_formats[attr];
         uint32_t attr_size = util_format_get_blocksize(format);
+        uint32_t vpm_attr = c->next_vpm_input++;
 
-        c->vattr_sizes[attr] = align(attr_size, 4);
+        c->vpm_input_order[vpm_attr] = attr;
+
+        c->vattr_sizes[vpm_attr] = align(attr_size, 4);
         for (int i = 0; i < align(attr_size, 4) / 4; i++) {
                 c->inputs[attr * 4 + i] =
-                        qir_MOV(c, qir_reg(QFILE_VPM, attr * 4 + i));
+                        qir_MOV(c, qir_reg(QFILE_VPM, vpm_attr * 4 + i));
                 c->num_inputs++;
         }
 }
@@ -1466,6 +1469,7 @@ emit_stub_vpm_read(struct vc4_compile *c)
         if (c->num_inputs)
                 return;
 
+        c->next_vpm_input++;
         c->vattr_sizes[0] = 4;
         (void)qir_MOV(c, qir_reg(QFILE_VPM, 0));
         c->num_inputs++;
@@ -1552,64 +1556,6 @@ vc4_optimize_nir(struct nir_shader *s)
         } while (progress);
 }
 
-static int
-driver_location_compare(const void *in_a, const void *in_b)
-{
-        const nir_variable *const *a = in_a;
-        const nir_variable *const *b = in_b;
-
-        return (*a)->data.driver_location - (*b)->data.driver_location;
-}
-
-static void
-ntq_setup_inputs(struct vc4_compile *c)
-{
-        unsigned num_entries = 0;
-        nir_foreach_variable(var, &c->s->inputs)
-                num_entries++;
-
-        nir_variable *vars[num_entries];
-
-        unsigned i = 0;
-        nir_foreach_variable(var, &c->s->inputs)
-                vars[i++] = var;
-
-        /* Sort the variables so that we emit the input setup in
-         * driver_location order.  This is required for VPM reads, whose data
-         * is fetched into the VPM in driver_location (TGSI register index)
-         * order.
-         */
-        qsort(&vars, num_entries, sizeof(*vars), driver_location_compare);
-
-        for (unsigned i = 0; i < num_entries; i++) {
-                nir_variable *var = vars[i];
-                unsigned array_len = MAX2(glsl_get_length(var->type), 1);
-                unsigned loc = var->data.driver_location;
-
-                assert(array_len == 1);
-                (void)array_len;
-                resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
-                                  (loc + 1) * 4);
-
-                if (c->stage == QSTAGE_FRAG) {
-                        if (var->data.location == VARYING_SLOT_POS) {
-                                emit_fragcoord_input(c, loc);
-                        } else if (var->data.location == VARYING_SLOT_PNTC ||
-                                   (var->data.location >= VARYING_SLOT_VAR0 &&
-                                    (c->fs_key->point_sprite_mask &
-                                     (1 << (var->data.location -
-                                            VARYING_SLOT_VAR0))))) {
-                                c->inputs[loc * 4 + 0] = c->point_x;
-                                c->inputs[loc * 4 + 1] = c->point_y;
-                        } else {
-                                emit_fragment_input(c, loc, var->data.location);
-                        }
-                } else {
-                        emit_vertex_input(c, loc);
-                }
-        }
-}
-
 static void
 ntq_setup_outputs(struct vc4_compile *c)
 {
@@ -1740,10 +1686,73 @@ ntq_emit_load_input(struct vc4_compile *c, nir_intrinsic_instr *instr)
                 return;
         }
 
-        uint32_t offset = nir_intrinsic_base(instr) + const_offset->u32[0];
+        /* Size our inputs array as far as this input.  Input arrays are
+         * small, and we don't have a shader_info field that tells us up front
+         * what the maximum driver_location is.
+         */
+        uint32_t loc = nir_intrinsic_base(instr) + const_offset->u32[0];
+        if ((loc + 1) * 4 > c->inputs_array_size) {
+                resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
+                                  (loc + 1) * 4);
+        }
+
+        /* If we've already loaded this input, just return it.  This would
+         * happen for VPM loads, where we load an entire vertex attribute at
+         * once, or possibly also in the FS if we haven't CSEed away repeated
+         * loads.
+         */
         int comp = nir_intrinsic_component(instr);
+        if (c->inputs[loc * 4 + comp].file != QFILE_NULL) {
+                ntq_store_dest(c, &instr->dest, 0,
+                               qir_MOV(c, c->inputs[loc * 4 + comp]));
+                return;
+        }
+
+        /* In the FS, we always have to fully drain our FS FIFO before
+         * terminating the shader.  For the VS we only have to drain whatever
+         * VPM setup we configure, but vc4_qpu_emit.c configures it for the
+         * entire vertex attribute space.  Because of this, we emit our lazy
+         * varying/VPM loads at the last top level basic block.
+         */
+        struct qblock *saved_cur_block = c->cur_block;
+        c->cur_block = c->last_top_block;
+
+        /* Look up the NIR variable for this input, so we can see how big the
+         * input is, or what sort of interpolation is necessary.
+         */
+        nir_variable *var = NULL;
+        nir_foreach_variable(search_var, &c->s->inputs) {
+                unsigned search_len = MAX2(glsl_get_length(search_var->type), 1);
+                unsigned search_loc = search_var->data.driver_location;
+
+                if (loc >= search_loc && loc < search_loc + search_len) {
+                        var = search_var;
+                        break;
+                }
+        }
+        assert(var);
+
+        if (c->stage == QSTAGE_FRAG) {
+                if (var->data.location == VARYING_SLOT_POS) {
+                        emit_fragcoord_input(c, loc);
+                } else if (var->data.location == VARYING_SLOT_PNTC ||
+                           (var->data.location >= VARYING_SLOT_VAR0 &&
+                            (c->fs_key->point_sprite_mask &
+                             (1 << (var->data.location -
+                                    VARYING_SLOT_VAR0))))) {
+                        c->inputs[loc * 4 + 0] = c->point_x;
+                        c->inputs[loc * 4 + 1] = c->point_y;
+                } else {
+                        emit_fragment_input(c, loc, var->data.location);
+                }
+        } else {
+                emit_vertex_input(c, loc);
+        }
+
+        c->cur_block = saved_cur_block;
+
         ntq_store_dest(c, &instr->dest, 0,
-                       qir_MOV(c, c->inputs[offset * 4 + comp]));
+                       qir_MOV(c, c->inputs[loc * 4 + comp]));
 }
 
 static void
@@ -2161,7 +2170,6 @@ nir_to_qir(struct vc4_compile *c)
         if (c->stage == QSTAGE_FRAG && c->s->info->fs.uses_discard)
                 c->discard = qir_MOV(c, qir_uniform_ui(c, 0));
 
-        ntq_setup_inputs(c);
         ntq_setup_outputs(c);
         ntq_setup_uniforms(c);
         ntq_setup_registers(c, &c->s->registers);
@@ -2587,14 +2595,17 @@ vc4_get_compiled_shader(struct vc4_context *vc4, enum qstage stage,
         } else {
                 shader->num_inputs = c->num_inputs;
 
-                shader->vattr_offsets[0] = 0;
-                for (int i = 0; i < 8; i++) {
-                        shader->vattr_offsets[i + 1] =
-                                shader->vattr_offsets[i] + c->vattr_sizes[i];
+                uint8_t next_vattr_offset = 0;
+                for (int i = 0; i < c->next_vpm_input; i++) {
+                        if (!c->vattr_sizes[i])
+                                continue;
 
-                        if (c->vattr_sizes[i])
-                                shader->vattrs_live |= (1 << i);
+                        uint32_t nir_attr = c->vpm_input_order[i];
+                        shader->vattr_offsets[nir_attr] = next_vattr_offset;
+                        next_vattr_offset += c->vattr_sizes[i];
+                        shader->vattrs_live |= (1 << nir_attr);
                 }
+                shader->vattr_total_size = next_vattr_offset;
         }
 
         shader->failed = c->failed;
index 6469e51b051cd0bdd7195089863109e40a0b2736..fe86232aeb27a3930a7c3af17e5d926085096323 100644 (file)
@@ -461,6 +461,13 @@ struct vc4_compile {
 
         uint8_t vattr_sizes[8];
 
+        /**
+         * Order in which the vattrs were loaded by the program, to arrange
+         * vattr_offsets[] in the program data appropriately.
+         */
+        uint8_t vpm_input_order[8];
+        uint8_t next_vpm_input;
+
         /**
          * Array of the VARYING_SLOT_* of all FS QFILE_VARY reads.
          *