break;
case nir_op_unpack_half_2x16_split_x:
+ /* XXX perf: It would be good to be able to merge this unpack
+ * with whatever uses our result.
+ */
result = vir_FMOV(c, src[0]);
vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L);
break;
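Aside (illustrative sketch, not part of the patch): what the V3D_QPU_UNPACK_L
modifier produces here is the low 16 bits of the source reinterpreted as an
IEEE binary16 value and widened to binary32, i.e. the .x half of GLSL
unpackHalf2x16. A self-contained reference in plain C, assuming nothing about
the driver:

#include <stdint.h>
#include <string.h>

static float
unpack_half_low_ref(uint32_t src)
{
        uint32_t h = src & 0xffff;
        uint32_t sign = (h & 0x8000) << 16;
        uint32_t exp = (h >> 10) & 0x1f;
        uint32_t mant = h & 0x3ff;
        uint32_t bits;

        if (exp == 0 && mant == 0) {
                bits = sign;                             /* +/- 0.0 */
        } else if (exp == 0) {
                /* Denormal: renormalize the mantissa. */
                int e = -1;
                do {
                        e++;
                        mant <<= 1;
                } while (!(mant & 0x400));
                bits = sign | ((112 - e) << 23) | ((mant & 0x3ff) << 13);
        } else if (exp == 0x1f) {
                bits = sign | (0xff << 23) | (mant << 13); /* Inf/NaN */
        } else {
                bits = sign | ((exp + 112) << 23) | (mant << 13);
        }

        float f;
        memcpy(&f, &bits, sizeof(f));
        return f;
}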
static void
ntq_emit_load_const(struct v3d_compile *c, nir_load_const_instr *instr)
{
+ /* XXX perf: Experiment with using immediate loads to avoid having
+ * these end up in the uniform stream. Watch out for breaking the
+ * small immediates optimization in the process!
+ */
struct qreg *qregs = ntq_init_ssa_def(c, &instr->def);
for (int i = 0; i < instr->def.num_components; i++)
qregs[i] = vir_uniform_ui(c, instr->value.u32[i]);
for (int i = 0; i < instr->num_components; i++) {
int ubo = nir_src_as_uint(instr->src[0]);
+ /* XXX perf: On V3D 4.x with uniform offsets, we
+ * should probably try setting UBOs up in the A
+ * register file and doing a sequence of loads that
+ * way.
+ */
/* Adjust for where we stored the TGSI register base. */
vir_ADD_dest(c,
vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA),
/* Clears (activates) the execute flags for any channels whose jump target
* matches this block.
+ *
+ * XXX perf: Could we be using flpush/flpop somehow for our execution channel
+ * enabling?
+ *
+ * XXX perf: For uniform control flow, we should be able to skip c->execute
+ * handling entirely.
*/
static void
ntq_activate_execute_for_block(struct v3d_compile *c)
/* Set A for executing (execute == 0) and jumping (if->condition ==
* 0) channels, and then update execute flags for those to point to
* the ELSE block.
+ *
+ * XXX perf: We could reuse ntq_emit_comparison() to generate our if
+ * condition, and the .uf field to ignore non-executing channels, to
+ * reduce the overhead of if statements.
*/
vir_PF(c, vir_OR(c,
c->execute,
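Aside (illustrative sketch, not part of the patch): a small CPU-side model of
the per-channel execute value described above may make the scheme easier to
follow. execute == 0 marks a channel as running the current block, and a
non-zero value records the index of the block that channel is waiting to
reach. The types and helpers below are hypothetical, not driver code:

#include <stdbool.h>
#include <stdint.h>

#define NUM_CHANNELS 16

struct exec_model {
        /* 0 = channel is executing the current block, otherwise the
         * index of the block this channel is waiting to reach. */
        uint32_t execute[NUM_CHANNELS];
};

/* Rough equivalent of the if-statement handling above: active channels
 * that fail the condition get redirected to the ELSE block. */
static void
model_if(struct exec_model *m, const bool *cond, uint32_t else_block)
{
        for (int ch = 0; ch < NUM_CHANNELS; ch++) {
                if (m->execute[ch] == 0 && !cond[ch])
                        m->execute[ch] = else_block;
        }
}

/* Rough equivalent of ntq_activate_execute_for_block(): re-activate any
 * channel whose recorded jump target is the block about to be emitted. */
static void
model_activate_for_block(struct exec_model *m, uint32_t block)
{
        for (int ch = 0; ch < NUM_CHANNELS; ch++) {
                if (m->execute[ch] == block)
                        m->execute[ch] = 0;
        }
}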
c->payload_w_centroid = vir_MOV(c, vir_reg(QFILE_REG, 1));
c->payload_z = vir_MOV(c, vir_reg(QFILE_REG, 2));
+ /* XXX perf: We could set the "disable implicit point/line
+ * varyings" field in the shader record and not emit these, if
+ * they're not going to be used.
+ */
if (c->fs_key->is_points) {
c->point_x = emit_fragment_varying(c, NULL, 0);
c->point_y = emit_fragment_varying(c, NULL, 0);
vir_check_payload_w(c);
- /* XXX: vir_schedule_instructions(c); */
+ /* XXX perf: On VC4, we do a VIR-level instruction scheduling here.
+ * We used that on that platform to pipeline TMU writes and reduce the
+ * number of thread switches, as well as try (mostly successfully) to
+ * reduce maximum register pressure to allow more threads. We should
+ * do something of that sort for V3D -- either instruction scheduling
+ * here, or delay the THRSW and LDTMUs from our texture
+ * instructions until the results are needed.
+ */
if (V3D_DEBUG & (V3D_DEBUG_VIR |
v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
if (!magic) {
add_write_dep(state, &state->last_rf[waddr], n);
} else if (v3d_qpu_magic_waddr_is_tmu(waddr)) {
+ /* XXX perf: For V3D 4.x, we could reorder TMU writes other
+ * than the TMUS/TMUD/TMUA to improve scheduling flexibility.
+ */
add_write_dep(state, &state->last_tmu_write, n);
switch (waddr) {
case V3D_QPU_WADDR_TMUS:
return next_score;
next_score++;
+ /* XXX perf: We should schedule SFU ALU ops so that the reader is 2
+ * instructions after the producer if possible, not just 1.
+ */
+
/* Default score for things that aren't otherwise special. */
baseline_score = next_score;
next_score++;
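Aside (illustrative sketch, not part of the patch): the kind of scoring tweak
the SFU note has in mind could look like the helper below. The tick bookkeeping
and the reads_sfu_result flag are hypothetical, not existing scheduler state:

#include <stdbool.h>

/* De-prioritize a candidate that reads the result of an SFU ALU op issued
 * fewer than 2 instructions ago, so the reader lands at least 2
 * instructions after the producer when possible. */
static int
sfu_latency_penalty(int tick, int last_sfu_write_tick, bool reads_sfu_result)
{
        if (reads_sfu_result && tick - last_sfu_write_tick < 2)
                return -100;
        return 0;
}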
* sooner. If the ldvary's r5 wasn't used, then ldunif might
* otherwise get scheduled so ldunif and ldvary try to update
* r5 in the same tick.
+ *
+ * XXX perf: To get good pipelining of a sequence of varying
+ * loads, we need to figure out how to pair the ldvary signal
+ * up to the instruction before the last r5 user in the
+ * previous ldvary sequence. Currently, it usually pairs with
+ * the last r5 user.
*/
if ((inst->sig.ldunif || inst->sig.ldunifa) &&
scoreboard->tick == scoreboard->last_ldvary_tick + 1) {
vir_TMU_WRITE(struct v3d_compile *c, enum v3d_qpu_waddr waddr, struct qreg val,
int *tmu_writes)
{
+ /* XXX perf: We should figure out how to merge ALU operations
+ * producing the val with this MOV, when possible.
+ */
vir_MOV_dest(c, vir_reg(QFILE_MAGIC, waddr), val);
(*tmu_writes)++;
/* Limit the number of channels returned to both how many the NIR
* instruction writes and how many the instruction could produce.
+ *
+ * XXX perf: Can we also limit to the number of channels that are
+ * actually read by the users of this NIR dest, so that we don't need
+ * to emit unused LDTMUs?
*/
uint32_t instr_return_channels = nir_tex_instr_dest_size(instr);
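Aside (illustrative sketch, not part of the patch): the clamping that question
suggests would only need a bitmask of the destination components that actually
have readers (e.g. from something like NIR's nir_ssa_def_components_read();
treated as an assumption here):

#include <stdint.h>

/* Return the number of leading TMU return channels actually needed,
 * given a bitmask of which destination components are read. */
static uint32_t
used_return_channels(uint32_t dest_size, uint32_t components_read)
{
        uint32_t used = 0;
        for (uint32_t i = 0; i < dest_size; i++) {
                if (components_read & (1u << i))
                        used = i + 1;
        }
        return used;    /* 0 means no LDTMU results are needed at all */
}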
if (!p1_unpacked.output_type_32_bit)
p1_packed |= unit << 24;
vir_WRTMUC(c, QUNIFORM_TMU_CONFIG_P0, p0_packed);
+ /* XXX perf: Can we skip p1 setup for txf ops? */
vir_WRTMUC(c, QUNIFORM_TMU_CONFIG_P1, p1_packed);
if (memcmp(&p2_unpacked, &p2_unpacked_default, sizeof(p2_unpacked)) != 0)
vir_WRTMUC(c, QUNIFORM_CONSTANT, p2_packed);
STATIC_ASSERT(PIPE_SWIZZLE_X == 0);
chan = return_values[i / 2];
+ /* XXX perf: We should move this unpacking into NIR.
+ * That would give us exposure of these types to NIR
+ * optimization, so that (for example) a repacking of
+ * half-float samples to the half-float render target
+ * could be eliminated.
+ */
if (nir_alu_type_get_base_type(instr->dest_type) ==
nir_type_float) {
enum v3d_qpu_input_unpack unpack;
{
struct v3d_context *v3d = v3d_context(pctx);
+ /* XXX perf: If we're reading from the output of TF in this job, we
+ * should instead be using the wait for transform feedback
+ * functionality.
+ */
+
/* Flush writes to textures we're sampling. */
for (int i = 0; i < v3d->tex[s].num_textures; i++) {
struct pipe_sampler_view *pview = v3d->tex[s].textures[i];
cl_packet_length(GL_SHADER_STATE_ATTRIBUTE_RECORD),
32);
+ /* XXX perf: We should move most of the SHADER_STATE_RECORD setup to
+ * compile time, so that we mostly just have to OR the VS and FS
+ * records together at draw time.
+ */
cl_emit(&job->indirect, GL_SHADER_STATE_RECORD, shader) {
shader.enable_clipping = true;
/* VC5_DIRTY_PRIM_MODE | VC5_DIRTY_RASTERIZER */
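Aside (illustrative sketch, not part of the patch): the idea in the
SHADER_STATE_RECORD note, with a hypothetical record layout and size rather
than the real packet packing:

#include <stdint.h>

#define SHADER_RECORD_DWORDS 9          /* placeholder, not the real size */

struct precompiled_shader_record {
        uint32_t dwords[SHADER_RECORD_DWORDS];
};

/* At draw time, OR together the halves that were packed when each shader
 * variant was compiled; only genuinely draw-time fields would still need
 * to be patched in afterwards. */
static void
emit_shader_record(uint32_t *out,
                   const struct precompiled_shader_record *vs,
                   const struct precompiled_shader_record *fs)
{
        for (int i = 0; i < SHADER_RECORD_DWORDS; i++)
                out[i] = vs->dwords[i] | fs->dwords[i];
}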
v3d_rcl_emit_generic_per_tile_list(job, nr_cbufs - 1);
- /* XXX: Use Morton order */
+ /* XXX perf: We should expose GL_MESA_tile_raster_order to improve X11
+ * performance, but we should use Morton order otherwise to improve
+ * cache locality.
+ */
uint32_t supertile_w_in_pixels = job->tile_width * supertile_w;
uint32_t supertile_h_in_pixels = job->tile_height * supertile_h;
uint32_t min_x_supertile = job->draw_min_x / supertile_w_in_pixels;
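Aside (illustrative sketch, not part of the patch): the Morton (Z-order)
supertile index mentioned above can be computed with standard bit interleaving;
this assumes coordinates fit in 16 bits and is not the render-list emission
code:

#include <stdint.h>

/* Spread the low 16 bits of v out to the even bit positions. */
static uint32_t
part1by1(uint32_t v)
{
        v &= 0x0000ffff;
        v = (v | (v << 8)) & 0x00ff00ff;
        v = (v | (v << 4)) & 0x0f0f0f0f;
        v = (v | (v << 2)) & 0x33333333;
        v = (v | (v << 1)) & 0x55555555;
        return v;
}

/* Z-order index of a supertile: x in the even bits, y in the odd bits,
 * so nearby indices map to nearby tiles in both dimensions. */
static uint32_t
supertile_morton_index(uint32_t x, uint32_t y)
{
        return part1by1(x) | (part1by1(y) << 1);
}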