From 29927e7524b07d491c555b8ed06c9b89cd0856f8 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Fri, 14 Dec 2018 14:46:48 -0800
Subject: [PATCH] v3d: Drop in a bunch of notes about performance improvement
 opportunities.

These have all been floating in my head, and while I've thought about
encoding them in issues on gitlab once they're enabled, they also make
sense to just have in the area of the code you'll need to work in.
---
 src/broadcom/compiler/nir_to_vir.c   | 35 +++++++++++++++++++++++++++-
 src/broadcom/compiler/qpu_schedule.c | 13 +++++++++++
 src/broadcom/compiler/v3d40_tex.c    | 14 +++++++++++
 src/gallium/drivers/v3d/v3dx_draw.c  |  9 +++++++
 src/gallium/drivers/v3d/v3dx_rcl.c   |  5 +++-
 5 files changed, 74 insertions(+), 2 deletions(-)

diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
index 446ac53e95f..484dc050368 100644
--- a/src/broadcom/compiler/nir_to_vir.c
+++ b/src/broadcom/compiler/nir_to_vir.c
@@ -850,6 +850,9 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
                 break;
 
         case nir_op_unpack_half_2x16_split_x:
+                /* XXX perf: It would be good to be able to merge this unpack
+                 * with whatever uses our result.
+                 */
                 result = vir_FMOV(c, src[0]);
                 vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L);
                 break;
@@ -1489,6 +1492,10 @@ ntq_setup_registers(struct v3d_compile *c, struct exec_list *list)
 static void
 ntq_emit_load_const(struct v3d_compile *c, nir_load_const_instr *instr)
 {
+        /* XXX perf: Experiment with using immediate loads to avoid having
+         * these end up in the uniform stream.  Watch out for breaking the
+         * small immediates optimization in the process!
+         */
         struct qreg *qregs = ntq_init_ssa_def(c, &instr->def);
         for (int i = 0; i < instr->def.num_components; i++)
                 qregs[i] = vir_uniform_ui(c, instr->value.u32[i]);
@@ -1535,6 +1542,11 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
                 for (int i = 0; i < instr->num_components; i++) {
                         int ubo = nir_src_as_uint(instr->src[0]);
 
+                        /* XXX perf: On V3D 4.x with uniform offsets, we
+                         * should probably try setting UBOs up in the A
+                         * register file and doing a sequence of loads that
+                         * way.
+                         */
                         /* Adjust for where we stored the TGSI register base. */
                         vir_ADD_dest(c,
                                      vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA),
@@ -1669,6 +1681,12 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
 
 /* Clears (activates) the execute flags for any channels whose jump target
  * matches this block.
+ *
+ * XXX perf: Could we be using flpush/flpop somehow for our execution channel
+ * enabling?
+ *
+ * XXX perf: For uniform control flow, we should be able to skip c->execute
+ * handling entirely.
  */
 static void
 ntq_activate_execute_for_block(struct v3d_compile *c)
@@ -1704,6 +1722,10 @@ ntq_emit_if(struct v3d_compile *c, nir_if *if_stmt)
         /* Set A for executing (execute == 0) and jumping (if->condition ==
          * 0) channels, and then update execute flags for those to point to
          * the ELSE block.
+         *
+         * XXX perf: we could reuse ntq_emit_comparison() to generate our if
+         * condition, and the .uf field to ignore non-executing channels, to
+         * reduce the overhead of if statements.
          */
         vir_PF(c, vir_OR(c,
                          c->execute,
@@ -1925,6 +1947,10 @@ nir_to_vir(struct v3d_compile *c)
                 c->payload_w_centroid = vir_MOV(c, vir_reg(QFILE_REG, 1));
                 c->payload_z = vir_MOV(c, vir_reg(QFILE_REG, 2));
 
+                /* XXX perf: We could set the "disable implicit point/line
+                 * varyings" field in the shader record and not emit these, if
+                 * they're not going to be used.
+                 */
                 if (c->fs_key->is_points) {
                         c->point_x = emit_fragment_varying(c, NULL, 0);
                         c->point_y = emit_fragment_varying(c, NULL, 0);
@@ -2119,7 +2145,14 @@ v3d_nir_to_vir(struct v3d_compile *c)
 
         vir_check_payload_w(c);
 
-        /* XXX: vir_schedule_instructions(c); */
+        /* XXX perf: On VC4, we do a VIR-level instruction scheduling here.
+         * We used that on that platform to pipeline TMU writes and reduce the
+         * number of thread switches, as well as try (mostly successfully) to
+         * reduce maximum register pressure to allow more threads.  We should
+         * do something of that sort for V3D -- either instruction scheduling
+         * here, or delay the the THRSW and LDTMUs from our texture
+         * instructions until the results are needed.
+         */
 
         if (V3D_DEBUG & (V3D_DEBUG_VIR |
                          v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
index 365aebdbd6d..7662c8f6f08 100644
--- a/src/broadcom/compiler/qpu_schedule.c
+++ b/src/broadcom/compiler/qpu_schedule.c
@@ -195,6 +195,9 @@ process_waddr_deps(struct schedule_state *state, struct schedule_node *n,
         if (!magic) {
                 add_write_dep(state, &state->last_rf[waddr], n);
         } else if (v3d_qpu_magic_waddr_is_tmu(waddr)) {
+                /* XXX perf: For V3D 4.x, we could reorder TMU writes other
+                 * than the TMUS/TMUD/TMUA to improve scheduling flexibility.
+                 */
                 add_write_dep(state, &state->last_tmu_write, n);
                 switch (waddr) {
                 case V3D_QPU_WADDR_TMUS:
@@ -590,6 +593,10 @@ get_instruction_priority(const struct v3d_qpu_instr *inst)
                 return next_score;
         next_score++;
 
+        /* XXX perf: We should schedule SFU ALU ops so that the reader is 2
+         * instructions after the producer if possible, not just 1.
+         */
+
         /* Default score for things that aren't otherwise special. */
         baseline_score = next_score;
         next_score++;
@@ -784,6 +791,12 @@ choose_instruction_to_schedule(const struct v3d_device_info *devinfo,
                  * sooner.  If the ldvary's r5 wasn't used, then ldunif might
                  * otherwise get scheduled so ldunif and ldvary try to update
                  * r5 in the same tick.
+                 *
+                 * XXX perf: To get good pipelining of a sequence of varying
+                 * loads, we need to figure out how to pair the ldvary signal
+                 * up to the instruction before the last r5 user in the
+                 * previous ldvary sequence.  Currently, it usually pairs with
+                 * the last r5 user.
                  */
                 if ((inst->sig.ldunif || inst->sig.ldunifa) &&
                     scoreboard->tick == scoreboard->last_ldvary_tick + 1) {
diff --git a/src/broadcom/compiler/v3d40_tex.c b/src/broadcom/compiler/v3d40_tex.c
index 9f1fd9a0d20..3cb96e1204d 100644
--- a/src/broadcom/compiler/v3d40_tex.c
+++ b/src/broadcom/compiler/v3d40_tex.c
@@ -34,6 +34,9 @@ static void
 vir_TMU_WRITE(struct v3d_compile *c, enum v3d_qpu_waddr waddr, struct qreg val,
               int *tmu_writes)
 {
+        /* XXX perf: We should figure out how to merge ALU operations
+         * producing the val with this MOV, when possible.
+         */
         vir_MOV_dest(c, vir_reg(QFILE_MAGIC, waddr), val);
 
         (*tmu_writes)++;
@@ -147,6 +150,10 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
 
         /* Limit the number of channels returned to both how many the NIR
          * instruction writes and how many the instruction could produce.
+         *
+         * XXX perf: Can we also limit to the number of channels that are
+         * actually read by the users of this NIR dest, so that we don't need
+         * to emit unused LDTMUs?
          */
         uint32_t instr_return_channels = nir_tex_instr_dest_size(instr);
         if (!p1_unpacked.output_type_32_bit)
@@ -187,6 +194,7 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
         p1_packed |= unit << 24;
 
         vir_WRTMUC(c, QUNIFORM_TMU_CONFIG_P0, p0_packed);
+        /* XXX perf: Can we skip p1 setup for txf ops? */
         vir_WRTMUC(c, QUNIFORM_TMU_CONFIG_P1, p1_packed);
         if (memcmp(&p2_unpacked, &p2_unpacked_default, sizeof(p2_unpacked)) != 0)
                 vir_WRTMUC(c, QUNIFORM_CONSTANT, p2_packed);
@@ -226,6 +234,12 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
                         STATIC_ASSERT(PIPE_SWIZZLE_X == 0);
                         chan = return_values[i / 2];
 
+                        /* XXX perf: We should move this unpacking into NIR.
+                         * That would give us exposure of these types to NIR
+                         * optimization, so that (for example) a repacking of
+                         * half-float samples to the half-float render target
+                         * could be eliminated.
+                         */
                         if (nir_alu_type_get_base_type(instr->dest_type) ==
                             nir_type_float) {
                                 enum v3d_qpu_input_unpack unpack;
diff --git a/src/gallium/drivers/v3d/v3dx_draw.c b/src/gallium/drivers/v3d/v3dx_draw.c
index 692f1fe3c04..46e629d0c64 100644
--- a/src/gallium/drivers/v3d/v3dx_draw.c
+++ b/src/gallium/drivers/v3d/v3dx_draw.c
@@ -124,6 +124,11 @@ v3d_predraw_check_stage_inputs(struct pipe_context *pctx,
 {
         struct v3d_context *v3d = v3d_context(pctx);
 
+        /* XXX perf: If we're reading from the output of TF in this job, we
+         * should instead be using the wait for transform feedback
+         * functionality.
+         */
+
         /* Flush writes to textures we're sampling. */
         for (int i = 0; i < v3d->tex[s].num_textures; i++) {
                 struct pipe_sampler_view *pview = v3d->tex[s].textures[i];
@@ -175,6 +180,10 @@ v3d_emit_gl_shader_state(struct v3d_context *v3d,
                                     cl_packet_length(GL_SHADER_STATE_ATTRIBUTE_RECORD),
                                     32);
 
+        /* XXX perf: We should move most of the SHADER_STATE_RECORD setup to
+         * compile time, so that we mostly just have to OR the VS and FS
+         * records together at draw time.
+         */
         cl_emit(&job->indirect, GL_SHADER_STATE_RECORD, shader) {
                 shader.enable_clipping = true;
                 /* VC5_DIRTY_PRIM_MODE | VC5_DIRTY_RASTERIZER */
diff --git a/src/gallium/drivers/v3d/v3dx_rcl.c b/src/gallium/drivers/v3d/v3dx_rcl.c
index 01a907b0a86..17b30465c9d 100644
--- a/src/gallium/drivers/v3d/v3dx_rcl.c
+++ b/src/gallium/drivers/v3d/v3dx_rcl.c
@@ -761,7 +761,10 @@ v3dX(emit_rcl)(struct v3d_job *job)
 
         v3d_rcl_emit_generic_per_tile_list(job, nr_cbufs - 1);
 
-        /* XXX: Use Morton order */
+        /* XXX perf: We should expose GL_MESA_tile_raster_order to improve X11
+         * performance, but we should use Morton order otherwise to improve
+         * cache locality.
+         */
         uint32_t supertile_w_in_pixels = job->tile_width * supertile_w;
         uint32_t supertile_h_in_pixels = job->tile_height * supertile_h;
         uint32_t min_x_supertile = job->draw_min_x / supertile_w_in_pixels;
-- 
2.30.2