broadcom/vc5: Add support for register spilling.

author Eric Anholt <eric@anholt.net>

Tue, 13 Mar 2018 22:13:00 +0000 (15:13 -0700)

committer Eric Anholt <eric@anholt.net>

Mon, 19 Mar 2018 23:44:06 +0000 (16:44 -0700)
author Eric Anholt <eric@anholt.net>
Tue, 13 Mar 2018 22:13:00 +0000 (15:13 -0700)
committer Eric Anholt <eric@anholt.net>
Mon, 19 Mar 2018 23:44:06 +0000 (16:44 -0700)
diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c

index c1ba1e3049da74b82e2d82f760710ae98bef7f01..75e35067f27da4edfb467e00521e906576234542 100644 (file)
--- a/src/broadcom/compiler/nir_to_vir.c
+++ b/src/broadcom/compiler/nir_to_vir.c
@@ -1919,12 +1919,11 @@ vir_remove_thrsw(struct v3d_compile *c)
                                  vir_remove_instruction(c, inst);
                  }
          }
-        vir_calculate_live_intervals(c);
  
          c->last_thrsw = NULL;
  }
  
-static void
+void
  vir_emit_last_thrsw(struct v3d_compile *c)
  {
          /* On V3D before 4.1, we need a TMU op to be outstanding when thread
@@ -2012,16 +2011,16 @@ v3d_nir_to_vir(struct v3d_compile *c)
                  fprintf(stderr, "\n");
          }
  
-        /* Compute the live ranges so we can figure out interference. */
-        vir_calculate_live_intervals(c);
-
          /* Attempt to allocate registers for the temporaries.  If we fail,
           * reduce thread count and try again.
           */
          int min_threads = (c->devinfo->ver >= 41) ? 2 : 1;
          struct qpu_reg *temp_registers;
          while (true) {
-                temp_registers = v3d_register_allocate(c);
+                bool spilled;
+                temp_registers = v3d_register_allocate(c, &spilled);
+                if (spilled)
+                        continue;
  
                  if (temp_registers)
                          break;
diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h

index fdf1b131978fae420ba3b5776690a089789e82e1..84cc4d290a0461477de53cf7b254b18b0a12ceaf 100644 (file)
--- a/src/broadcom/compiler/v3d_compiler.h
+++ b/src/broadcom/compiler/v3d_compiler.h
@@ -248,6 +248,12 @@ enum quniform_contents {
  
          QUNIFORM_ALPHA_REF,
          QUNIFORM_SAMPLE_MASK,
+
+        /**
+         * Returns the offset of the scratch buffer for register spilling.
+         */
+        QUNIFORM_SPILL_OFFSET,
+        QUNIFORM_SPILL_SIZE_PER_THREAD,
  };
  
  struct v3d_varying_slot {
@@ -506,6 +512,20 @@ struct v3d_compile {
          uint8_t vattr_sizes[V3D_MAX_VS_INPUTS];
          uint32_t num_vpm_writes;
  
+        /* Size in bytes of registers that have been spilled. This is how much
+         * space needs to be available in the spill BO per thread per QPU.
+         */
+        uint32_t spill_size;
+        /* Shader-db stats for register spilling. */
+        uint32_t spills, fills;
+        /**
+         * Register spilling's per-thread base address, shared between each
+         * spill/fill's addressing calculations.
+         */
+        struct qreg spill_base;
+        /* Bit vector of which temps may be spilled */
+        BITSET_WORD *spillable;
+
          /**
           * Array of the VARYING_SLOT_* of all FS QFILE_VARY reads.
           *
@@ -600,6 +620,7 @@ struct v3d_prog_data {
          struct v3d_ubo_range *ubo_ranges;
          uint32_t num_ubo_ranges;
          uint32_t ubo_size;
+        uint32_t spill_size;
  
          uint8_t num_inputs;
          uint8_t threads;
@@ -697,6 +718,7 @@ void vir_set_unpack(struct qinst *inst, int src,
                      enum v3d_qpu_input_unpack unpack);
  
  struct qreg vir_get_temp(struct v3d_compile *c);
+void vir_emit_last_thrsw(struct v3d_compile *c);
  void vir_calculate_live_intervals(struct v3d_compile *c);
  bool vir_has_implicit_uniform(struct qinst *inst);
  int vir_get_implicit_uniform_src(struct qinst *inst);
@@ -746,7 +768,7 @@ void v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr);
  void v3d_vir_to_qpu(struct v3d_compile *c, struct qpu_reg *temp_registers);
  uint32_t v3d_qpu_schedule_instructions(struct v3d_compile *c);
  void qpu_validate(struct v3d_compile *c);
-struct qpu_reg *v3d_register_allocate(struct v3d_compile *c);
+struct qpu_reg *v3d_register_allocate(struct v3d_compile *c, bool *spilled);
  bool vir_init_reg_sets(struct v3d_compiler *compiler);
  
  void vir_PF(struct v3d_compile *c, struct qreg src, enum v3d_qpu_pf pf);
diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c

index 6a315dd482329265ea77850b04d1fa82b21aedd7..0cbdc986d3fa844be0f5cbe1195cb5fe8fa1a1f5 100644 (file)
--- a/src/broadcom/compiler/vir.c
+++ b/src/broadcom/compiler/vir.c
@@ -354,10 +354,17 @@ vir_get_temp(struct v3d_compile *c)
          if (c->num_temps > c->defs_array_size) {
                  uint32_t old_size = c->defs_array_size;
                  c->defs_array_size = MAX2(old_size * 2, 16);
+
                  c->defs = reralloc(c, c->defs, struct qinst *,
                                     c->defs_array_size);
                  memset(&c->defs[old_size], 0,
                         sizeof(c->defs[0]) * (c->defs_array_size - old_size));
+
+                c->spillable = reralloc(c, c->spillable,
+                                        BITSET_WORD,
+                                        BITSET_WORDS(c->defs_array_size));
+                for (int i = old_size; i < c->defs_array_size; i++)
+                        BITSET_SET(c->spillable, i);
          }
  
          return reg;
@@ -653,6 +660,7 @@ v3d_set_prog_data(struct v3d_compile *c,
  {
          prog_data->threads = c->threads;
          prog_data->single_seg = !c->last_thrsw;
+        prog_data->spill_size = c->spill_size;
  
          v3d_set_prog_data_uniforms(c, prog_data);
          v3d_set_prog_data_ubo(c, prog_data);
diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c

index ab3a4e257ff6f0b3f9735411ba82934bcfa09729..4ec5f232643ec1fb9b6ea65690a49e4ec05522af 100644 (file)
--- a/src/broadcom/compiler/vir_register_allocate.c
+++ b/src/broadcom/compiler/vir_register_allocate.c
@@ -33,6 +33,211 @@
  #define PHYS_INDEX    (ACC_INDEX + ACC_COUNT)
  #define PHYS_COUNT    64
  
+/* Returns true if no other LDTMU follows inst before the next TMU write. */
+static bool
+is_last_ldtmu(struct qinst *inst, struct qblock *block)
+{
+        list_for_each_entry_from(struct qinst, scan_inst, inst->link.next,
+                                 &block->instructions, link) {
+                if (scan_inst->qpu.sig.ldtmu)
+                        return false;
+                if (v3d_qpu_writes_tmu(&scan_inst->qpu))
+                        return true;
+        }
+        return true;
+}
+
+/* Sets spill costs on g for spillable temps; returns the best spill node. */
+static int
+v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g,
+                      uint32_t *temp_to_node)
+{
+        float block_scale = 1.0;
+        float spill_costs[c->num_temps];
+        bool in_tmu_operation = false;
+        bool started_last_seg = false;
+
+        for (unsigned i = 0; i < c->num_temps; i++)
+                spill_costs[i] = 0.0;
+
+        /* XXX: Scale the cost up when inside of a loop. */
+        vir_for_each_block(block, c) {
+                vir_for_each_inst(inst, block) {
+                        /* We can't insert a new TMU operation while currently
+                         * in a TMU operation, and we can't insert new thread
+                         * switches after starting output writes.
+                         */
+                        bool no_spilling =
+                                (in_tmu_operation ||
+                                 (c->threads > 1 && started_last_seg));
+
+                        for (int i = 0; i < vir_get_nsrc(inst); i++) {
+                                if (inst->src[i].file != QFILE_TEMP)
+                                        continue;
+
+                                int temp = inst->src[i].index;
+                                if (no_spilling) {
+                                        BITSET_CLEAR(c->spillable,
+                                                     temp);
+                                } else {
+                                        spill_costs[temp] += block_scale;
+                                }
+                        }
+
+                        if (inst->dst.file == QFILE_TEMP) {
+                                int temp = inst->dst.index;
+
+                                if (no_spilling) {
+                                        BITSET_CLEAR(c->spillable,
+                                                     temp);
+                                } else {
+                                        spill_costs[temp] += block_scale;
+                                }
+                        }
+
+                        if (inst->is_last_thrsw)
+                                started_last_seg = true;
+
+                        if (v3d_qpu_writes_vpm(&inst->qpu) ||
+                            v3d_qpu_uses_tlb(&inst->qpu))
+                                started_last_seg = true;
+
+                        /* Track when we're in between a TMU setup and the
+                         * final LDTMU from that TMU setup.  We can't
+                         * spill/fill any temps during that time, because that
+                         * involves inserting a new TMU setup/LDTMU sequence.
+                         */
+                        if (inst->qpu.sig.ldtmu &&
+                            is_last_ldtmu(inst, block))
+                                in_tmu_operation = false;
+
+                        if (v3d_qpu_writes_tmu(&inst->qpu))
+                                in_tmu_operation = true;
+                }
+        }
+
+        for (unsigned i = 0; i < c->num_temps; i++) {
+                int node = temp_to_node[i];
+
+                if (BITSET_TEST(c->spillable, i))
+                        ra_set_node_spill_cost(g, node, spill_costs[i]);
+        }
+        return ra_get_best_spill_node(g);
+}
+
+/* The spill base for this thread takes a bit of setup, so emit it once at
+ * program start: TIDX * per-thread size + EIDX * 4 + scratch buffer offset.
+ */
+static void
+v3d_setup_spill_base(struct v3d_compile *c)
+{
+        c->cursor = vir_before_block(vir_entry_block(c));
+
+        int start_num_temps = c->num_temps;
+
+        /* Each thread wants to be in a separate region of the scratch space
+         * so that the QPUs aren't fighting over cache lines.  We have the
+         * driver keep a single global spill BO rather than
+         * per-spilling-program BOs, so we need a uniform from the driver for
+         * what the per-thread scale is.
+         */
+        struct qreg thread_offset =
+                vir_UMUL(c,
+                         vir_TIDX(c),
+                         vir_uniform(c, QUNIFORM_SPILL_SIZE_PER_THREAD, 0));
+
+        /* Each channel in a reg is 4 bytes, so scale them up by that. */
+        struct qreg element_offset = vir_SHL(c, vir_EIDX(c),
+                                             vir_uniform_ui(c, 2));
+
+        c->spill_base = vir_ADD(c,
+                                vir_ADD(c, thread_offset, element_offset),
+                                vir_uniform(c, QUNIFORM_SPILL_OFFSET, 0));
+
+        /* Make sure that we don't spill the spilling setup instructions. */
+        for (int i = start_num_temps; i < c->num_temps; i++)
+                BITSET_CLEAR(c->spillable, i);
+}
+
+/* Emits an ADD of c->spill_base and spill_offset into the TMUA register. */
+static void
+v3d_emit_spill_tmua(struct v3d_compile *c, uint32_t spill_offset)
+{
+        struct qreg tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA);
+
+        vir_ADD_dest(c, tmua, c->spill_base, vir_uniform_ui(c, spill_offset));
+}
+
+/* Replaces each read/write of spill_temp with a TMU fill/spill of scratch. */
+static void
+v3d_spill_reg(struct v3d_compile *c, int spill_temp)
+{
+        uint32_t spill_offset = c->spill_size;
+        c->spill_size += 16 * sizeof(uint32_t);
+        if (spill_offset == 0)
+                v3d_setup_spill_base(c);
+
+        struct qinst *last_thrsw = c->last_thrsw;
+        assert(!last_thrsw || last_thrsw->is_last_thrsw);
+
+        int start_num_temps = c->num_temps;
+
+        vir_for_each_inst_inorder(inst, c) {
+                for (int i = 0; i < vir_get_nsrc(inst); i++) {
+                        if (inst->src[i].file != QFILE_TEMP ||
+                            inst->src[i].index != spill_temp) {
+                                continue;
+                        }
+
+                        c->cursor = vir_before_inst(inst);
+
+                        v3d_emit_spill_tmua(c, spill_offset);
+                        vir_emit_thrsw(c);
+                        inst->src[i] = vir_LDTMU(c);
+                        c->fills++;
+                }
+
+                if (inst->dst.file == QFILE_TEMP &&
+                    inst->dst.index == spill_temp) {
+                        c->cursor = vir_after_inst(inst);
+                        /* vir_get_temp() keeps c->defs/c->spillable sized. */
+                        inst->dst.index = vir_get_temp(c).index;
+                        vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
+                                     inst->dst);
+                        v3d_emit_spill_tmua(c, spill_offset);
+                        vir_emit_thrsw(c);
+                        c->spills++;
+                }
+
+                /* If we didn't have a last-thrsw inserted by nir_to_vir and
+                 * we've been inserting thrsws, then insert a new last_thrsw
+                 * right before we start the vpm/tlb sequence for the last
+                 * thread segment.
+                 */
+                if (!last_thrsw && c->last_thrsw &&
+                    (v3d_qpu_writes_vpm(&inst->qpu) ||
+                     v3d_qpu_uses_tlb(&inst->qpu))) {
+                        c->cursor = vir_before_inst(inst);
+                        vir_emit_thrsw(c);
+
+                        last_thrsw = c->last_thrsw;
+                        last_thrsw->is_last_thrsw = true;
+                }
+        }
+
+        /* Make sure c->last_thrsw is the actual last thrsw, not just one we
+         * inserted in our most recent unspill.
+         */
+        if (last_thrsw)
+                c->last_thrsw = last_thrsw;
+
+        /* Don't allow spilling of our spilling instructions.  There's no way
+         * they can help get things colored.
+         */
+        for (int i = start_num_temps; i < c->num_temps; i++)
+                BITSET_CLEAR(c->spillable, i);
+}
+
  bool
  vir_init_reg_sets(struct v3d_compiler *compiler)
  {
@@ -96,7 +301,7 @@ node_to_temp_priority(const void *in_a, const void *in_b)
   * The return value should be freed by the caller.
   */
  struct qpu_reg *
-v3d_register_allocate(struct v3d_compile *c)
+v3d_register_allocate(struct v3d_compile *c, bool *spilled)
  {
          struct node_to_temp_map map[c->num_temps];
          uint32_t temp_to_node[c->num_temps];
@@ -105,9 +310,10 @@ v3d_register_allocate(struct v3d_compile *c)
                                                  sizeof(*temp_registers));
          int acc_nodes[ACC_COUNT];
  
-        struct ra_graph *g = ra_alloc_interference_graph(c->compiler->regs,
-                                                         c->num_temps +
-                                                         ARRAY_SIZE(acc_nodes));
+        *spilled = false;
+
+        vir_calculate_live_intervals(c);
+
          /* Convert 1, 2, 4 threads to 0, 1, 2 index.
           *
           * V3D 4.x has double the physical register space, so 64 physical regs
@@ -119,6 +325,10 @@ v3d_register_allocate(struct v3d_compile *c)
                          thread_index--;
          }
  
+        struct ra_graph *g = ra_alloc_interference_graph(c->compiler->regs,
+                                                         c->num_temps +
+                                                         ARRAY_SIZE(acc_nodes));
+
          /* Make some fixed nodes for the accumulators, which we will need to
           * interfere with when ops have implied r3/r4 writes or for the thread
           * switches.  We could represent these as classes for the nodes to
@@ -254,6 +464,20 @@ v3d_register_allocate(struct v3d_compile *c)
  
          bool ok = ra_allocate(g);
          if (!ok) {
+                /* Try to spill, if we can't reduce threading first. */
+                if (thread_index == 0) {
+                        int node = v3d_choose_spill_node(c, g, temp_to_node);
+
+                        if (node != -1) {
+                                v3d_spill_reg(c, map[node].temp);
+                                ralloc_free(g);
+
+                                /* Ask the outer loop to call back in. */
+                                *spilled = true;
+                                return NULL;
+                        }
+                }
+
                  free(temp_registers);
                  return NULL;
          }
@@ -280,5 +504,17 @@ v3d_register_allocate(struct v3d_compile *c)
  
          ralloc_free(g);
  
+        if (V3D_DEBUG & V3D_DEBUG_SHADERDB) {
+                fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d spills\n",
+                        vir_get_stage_name(c),
+                        c->program_id, c->variant_id,
+                        c->spills);
+
+                fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d fills\n",
+                        vir_get_stage_name(c),
+                        c->program_id, c->variant_id,
+                        c->fills);
+        }
+
          return temp_registers;
  }
diff --git a/src/gallium/drivers/vc5/vc5_context.h b/src/gallium/drivers/vc5/vc5_context.h

index 18fc27c5147f0fc0427fec9ced1d0807168acfaa..28b2e165a9d7d8d1c7546ff2c8bfdfddc97c1f90 100644 (file)
--- a/src/gallium/drivers/vc5/vc5_context.h
+++ b/src/gallium/drivers/vc5/vc5_context.h
@@ -154,6 +154,9 @@ struct vc5_compiled_shader {
  struct vc5_program_stateobj {
          struct vc5_uncompiled_shader *bind_vs, *bind_fs;
          struct vc5_compiled_shader *cs, *vs, *fs;
+
+        struct vc5_bo *spill_bo;
+        int spill_size_per_thread;
  };
  
  struct vc5_constbuf_stateobj {
diff --git a/src/gallium/drivers/vc5/vc5_program.c b/src/gallium/drivers/vc5/vc5_program.c

index eb5b61054555e4d6974d38e6aa67c1494fba6b59..ae3850a64b370bb4921924eb080ddca375801a5d 100644 (file)
--- a/src/gallium/drivers/vc5/vc5_program.c
+++ b/src/gallium/drivers/vc5/vc5_program.c
@@ -267,6 +267,21 @@ vc5_get_compiled_shader(struct vc5_context *vc5, struct v3d_key *key)
          memcpy(dup_key, key, key_size);
          _mesa_hash_table_insert(ht, dup_key, shader);
  
+        if (shader->prog_data.base->spill_size >
+            vc5->prog.spill_size_per_thread) {
+                /* Max 4 QPUs per slice, 3 slices per core. We only do single
+                 * core so far.  This overallocates memory on smaller cores.
+                 */
+                int total_spill_size =
+                        4 * 3 * shader->prog_data.base->spill_size;
+
+                vc5_bo_unreference(&vc5->prog.spill_bo);
+                vc5->prog.spill_bo = vc5_bo_alloc(vc5->screen,
+                                                  total_spill_size, "spill");
+                vc5->prog.spill_size_per_thread =
+                        shader->prog_data.base->spill_size;
+        }
+
          return shader;
  }
  
diff --git a/src/gallium/drivers/vc5/vc5_uniforms.c b/src/gallium/drivers/vc5/vc5_uniforms.c

index faf49dbc359c069567e896dac9210093fd507b0d..03b6d8381c17485da7532f94c0c69d3579153d64 100644 (file)
--- a/src/gallium/drivers/vc5/vc5_uniforms.c
+++ b/src/gallium/drivers/vc5/vc5_uniforms.c
@@ -389,6 +389,16 @@ vc5_write_uniforms(struct vc5_context *vc5, struct vc5_compiled_shader *shader,
                          /* XXX */
                          break;
  
+                case QUNIFORM_SPILL_OFFSET:
+                        cl_aligned_reloc(&job->indirect, &uniforms,
+                                         vc5->prog.spill_bo, 0);
+                        break;
+
+                case QUNIFORM_SPILL_SIZE_PER_THREAD:
+                        cl_aligned_u32(&uniforms,
+                                       vc5->prog.spill_size_per_thread);
+                        break;
+
                  default:
                          assert(quniform_contents_is_texture_p0(uinfo->contents[i]));
  
@@ -451,6 +461,8 @@ vc5_set_shader_uniform_dirty_flags(struct vc5_compiled_shader *shader)
                  case QUNIFORM_TEXTURE_DEPTH:
                  case QUNIFORM_TEXTURE_ARRAY_SIZE:
                  case QUNIFORM_TEXTURE_LEVELS:
+                case QUNIFORM_SPILL_OFFSET:
+                case QUNIFORM_SPILL_SIZE_PER_THREAD:
                          /* We could flag this on just the stage we're
                           * compiling for, but it's not passed in.
                           */
author	Eric Anholt <eric@anholt.net>
	Tue, 13 Mar 2018 22:13:00 +0000 (15:13 -0700)
committer	Eric Anholt <eric@anholt.net>
	Mon, 19 Mar 2018 23:44:06 +0000 (16:44 -0700)
src/broadcom/compiler/nir_to_vir.c		patch \| blob \| history
src/broadcom/compiler/v3d_compiler.h		patch \| blob \| history
src/broadcom/compiler/vir.c		patch \| blob \| history
src/broadcom/compiler/vir_register_allocate.c		patch \| blob \| history
src/gallium/drivers/vc5/vc5_context.h		patch \| blob \| history
src/gallium/drivers/vc5/vc5_program.c		patch \| blob \| history
src/gallium/drivers/vc5/vc5_uniforms.c		patch \| blob \| history