broadcom/vc5: Use THRSW to enable multi-threaded shaders.

author Eric Anholt <eric@anholt.net>

Mon, 8 Jan 2018 19:55:31 +0000 (11:55 -0800)

committer Eric Anholt <eric@anholt.net>

Sat, 13 Jan 2018 05:55:30 +0000 (21:55 -0800)
author Eric Anholt <eric@anholt.net>
Mon, 8 Jan 2018 19:55:31 +0000 (11:55 -0800)
committer Eric Anholt <eric@anholt.net>
Sat, 13 Jan 2018 05:55:30 +0000 (21:55 -0800)
diff --git a/src/broadcom/cle/v3d_packet_v33.xml b/src/broadcom/cle/v3d_packet_v33.xml

index 094ee00cf127b4276a5172c23aa126dd33ebc643..6be632112a2dfd170d01396a3b90c8f3be16093a 100644 (file)
--- a/src/broadcom/cle/v3d_packet_v33.xml
+++ b/src/broadcom/cle/v3d_packet_v33.xml
@@ -700,13 +700,17 @@
      <field name="Vertex Shader input VPM segment size" size="8" start="7b" type="uint"/>
      <field name="Address of default attribute values" size="32" start="8b" type="address"/>
      <field name="Fragment Shader Code Address" size="29" start="99" type="address"/>
-    <field name="2-way threadable" size="1" start="96" type="bool"/>
-    <field name="4-way threadable" size="1" start="97" type="bool"/>
+    <field name="Fragment Shader 2-way threadable" size="1" start="96" type="bool"/>
+    <field name="Fragment Shader 4-way threadable" size="1" start="97" type="bool"/>
      <field name="Propagate NaNs" size="1" start="98" type="bool"/>
      <field name="Fragment Shader Uniforms Address" size="32" start="16b" type="address"/>
      <field name="Vertex Shader Code Address" size="32" start="20b" type="address"/>
+    <field name="Vertex Shader 2-way threadable" size="1" start="160" type="bool"/>
+    <field name="Vertex Shader 4-way threadable" size="1" start="161" type="bool"/>
      <field name="Vertex Shader Uniforms Address" size="32" start="24b" type="address"/>
      <field name="Coordinate Shader Code Address" size="32" start="28b" type="address"/>
+    <field name="Coordinate Shader 2-way threadable" size="1" start="224" type="bool"/>
+    <field name="Coordinate Shader 4-way threadable" size="1" start="225" type="bool"/>
      <field name="Coordinate Shader Uniforms Address" size="32" start="32b" type="address"/>
    </struct>
  
diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c

index 1882c5ace7e443a9b187385b3fcf1d82fbef71f3..0400a683b714b78e5ea167a88c9e4e64e942bae9 100644 (file)
--- a/src/broadcom/compiler/nir_to_vir.c
+++ b/src/broadcom/compiler/nir_to_vir.c
@@ -65,6 +65,23 @@ resize_qreg_array(struct v3d_compile *c,
                  (*regs)[i] = c->undef;
  }
  
+static void
+vir_emit_thrsw(struct v3d_compile *c)
+{
+        if (c->threads == 1)
+                return;
+
+        /* Emit a NOP carrying the THRSW signal and remember it in
+         * c->last_thrsw as the most recent thread switch.  Single-threaded
+         * compiles (c->threads == 1) returned above without emitting
+         * anything.
+         *
+         * Always thread switch after each texture operation for now.
+         *
+         * We could do better by batching a bunch of texture fetches up and
+         * then doing one thread switch and collecting all their results
+         * afterward.
+         *
+         * last_thrsw_at_top_level records whether c->execute.file was
+         * QFILE_NULL when this switch was emitted; vir_emit_last_thrsw()
+         * consults it later to decide whether this THRSW can be flagged as
+         * the last one.
+         */
+        c->last_thrsw = vir_NOP(c);
+        c->last_thrsw->qpu.sig.thrsw = true;
+        c->last_thrsw_at_top_level = (c->execute.file == QFILE_NULL);
+}
+
  static struct qreg
  vir_SFU(struct v3d_compile *c, int waddr, struct qreg src)
  {
@@ -118,6 +135,7 @@ indirect_uniform_load(struct v3d_compile *c, nir_intrinsic_instr *intr)
                       vir_uniform(c, QUNIFORM_UBO_ADDR, 0),
                       indirect_offset);
  
+        vir_emit_thrsw(c);
          return vir_LDTMU(c);
  }
  
@@ -488,6 +506,8 @@ ntq_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
                  }
          }
  
+        vir_emit_thrsw(c);
+
          struct qreg return_values[4];
          for (int i = 0; i < 4; i++) {
                  /* Swizzling .zw of an RG texture should give undefined
@@ -1685,6 +1705,8 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
                                               ntq_get_src(c, instr->src[1], 0),
                                               vir_uniform_ui(c, i * 4)));
  
+                        vir_emit_thrsw(c);
+
                          ntq_store_dest(c, &instr->dest, i, vir_LDTMU(c));
                  }
                  break;
@@ -2124,6 +2146,62 @@ count_nir_instrs(nir_shader *nir)
  }
  #endif
  
+/**
+ * When demoting a shader down to single-threaded, removes the THRSW
+ * instructions (one will still be inserted at v3d_vir_to_qpu() for the
+ * program end).
+ *
+ * Walks every block of the program, removes each instruction that has the
+ * THRSW signal set, and then clears c->last_thrsw.
+ */
+static void
+vir_remove_thrsw(struct v3d_compile *c)
+{
+        vir_for_each_block(block, c) {
+                vir_for_each_inst_safe(inst, block) {
+                        if (inst->qpu.sig.thrsw)
+                                vir_remove_instruction(c, inst);
+                }
+        }
+
+        c->last_thrsw = NULL;
+}
+
+static void
+vir_emit_last_thrsw(struct v3d_compile *c)
+{
+        /* Called from v3d_nir_to_vir() once the shader body has been
+         * emitted, to settle which THRSW (if any) is the final one.
+         *
+         * On V3D before 4.1, we need a TMU op to be outstanding when thread
+         * switching, so disable threads if we didn't do any TMU ops (each of
+         * which would have emitted a THRSW).
+         *
+         * The test is on last_thrsw_at_top_level, so this path is also taken
+         * when the most recent THRSW was emitted with c->execute.file set
+         * (conditional code); in that case the existing THRSWs are removed.
+         */
+        if (!c->last_thrsw_at_top_level && c->devinfo->ver < 41) {
+                c->threads = 1;
+                if (c->last_thrsw)
+                        vir_remove_thrsw(c);
+                return;
+        }
+
+        /* If we're threaded and the last THRSW was in conditional code, then
+         * we need to emit another one so that we can flag it as the last
+         * thrsw.  The early return above already handled pre-4.1 hardware,
+         * hence the assert.
+         */
+        if (c->last_thrsw && !c->last_thrsw_at_top_level) {
+                assert(c->devinfo->ver >= 41);
+                vir_emit_thrsw(c);
+        }
+
+        /* If we're threaded, then we need to mark the last THRSW instruction
+         * so we can emit a pair of them at QPU emit time.
+         *
+         * For V3D 4.x, we can spawn the non-fragment shaders already in the
+         * post-last-THRSW state, so we can skip this; only a fragment shader
+         * that has no THRSW yet gets one emitted here.
+         *
+         * vir_emit_thrsw() emits nothing when c->threads == 1, so
+         * c->last_thrsw may still be NULL afterwards; the is_last_thrsw flag
+         * is therefore only set when a THRSW actually exists.
+         */
+        if (!c->last_thrsw && c->s->info.stage == MESA_SHADER_FRAGMENT) {
+                assert(c->devinfo->ver >= 41);
+                vir_emit_thrsw(c);
+        }
+
+        if (c->last_thrsw)
+                c->last_thrsw->is_last_thrsw = true;
+}
+
  void
  v3d_nir_to_vir(struct v3d_compile *c)
  {
@@ -2137,6 +2215,9 @@ v3d_nir_to_vir(struct v3d_compile *c)
  
          nir_to_vir(c);
  
+        /* Emit the last THRSW before STVPM and TLB writes. */
+        vir_emit_last_thrsw(c);
+
          switch (c->s->info.stage) {
          case MESA_SHADER_FRAGMENT:
                  emit_frag_end(c);
@@ -2171,5 +2252,33 @@ v3d_nir_to_vir(struct v3d_compile *c)
                  fprintf(stderr, "\n");
          }
  
-        v3d_vir_to_qpu(c);
+        /* Compute the live ranges so we can figure out interference. */
+        vir_calculate_live_intervals(c);
+
+        /* Attempt to allocate registers for the temporaries.  If we fail,
+         * reduce thread count and try again.
+         */
+        int min_threads = (c->devinfo->ver >= 41) ? 2 : 1;
+        struct qpu_reg *temp_registers;
+        while (true) {
+                temp_registers = v3d_register_allocate(c);
+
+                if (temp_registers)
+                        break;
+
+                if (c->threads == min_threads) {
+                        fprintf(stderr, "Failed to register allocate at %d threads:\n",
+                                c->threads);
+                        vir_dump(c);
+                        c->failed = true;
+                        return;
+                }
+
+                c->threads /= 2;
+
+                if (c->threads == 1)
+                        vir_remove_thrsw(c);
+        }
+
+        v3d_vir_to_qpu(c, temp_registers);
  }
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c

index fdec5252b1f72afbf13d287cfb847bc043cb50fa..c3b88c345d1ce9059996f3dbf2f4275009ffe6e3 100644 (file)
--- a/src/broadcom/compiler/qpu_schedule.c
+++ b/src/broadcom/compiler/qpu_schedule.c
@@ -1097,13 +1097,30 @@ qpu_instruction_valid_in_thrend_slot(struct v3d_compile *c,
  }
  
  static bool
-valid_thrend_sequence(struct v3d_compile *c,
-                      struct qinst *qinst, int instructions_in_sequence)
+valid_thrsw_sequence(struct v3d_compile *c,
+                     struct qinst *qinst, int instructions_in_sequence,
+                     bool is_thrend)
  {
          for (int slot = 0; slot < instructions_in_sequence; slot++) {
-                if (!qpu_instruction_valid_in_thrend_slot(c, qinst, slot))
+                /* No scheduling SFU when the result would land in the other
+                 * thread.  The simulator complains for safety, though it
+                 * would only occur for dead code in our case.
+                 */
+                if (slot > 0 &&
+                    qinst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
+                    (v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.add.waddr) ||
+                     v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.mul.waddr))) {
+                        return false;
+                }
+
+                if (slot > 0 && qinst->qpu.sig.ldvary)
                          return false;
  
+                if (is_thrend &&
+                    !qpu_instruction_valid_in_thrend_slot(c, qinst, slot)) {
+                        return false;
+                }
+
                  /* Note that the list is circular, so we can only do this up
                   * to instructions_in_sequence.
                   */
@@ -1121,7 +1138,8 @@ static int
  emit_thrsw(struct v3d_compile *c,
             struct qblock *block,
             struct choose_scoreboard *scoreboard,
-           struct qinst *inst)
+           struct qinst *inst,
+           bool is_thrend)
  {
          int time = 0;
  
@@ -1143,20 +1161,25 @@ emit_thrsw(struct v3d_compile *c,
                  if (!v3d_qpu_sig_pack(c->devinfo, &sig, &packed_sig))
                          break;
  
-                if (!valid_thrend_sequence(c, prev_inst, slots_filled + 1))
+                if (!valid_thrsw_sequence(c, prev_inst, slots_filled + 1,
+                                          is_thrend)) {
                          break;
+                }
  
                  merge_inst = prev_inst;
                  if (++slots_filled == 3)
                          break;
          }
  
+        bool needs_free = false;
          if (merge_inst) {
                  merge_inst->qpu.sig.thrsw = true;
+                needs_free = true;
          } else {
                  insert_scheduled_instruction(c, block, scoreboard, inst);
                  time++;
                  slots_filled++;
+                merge_inst = inst;
          }
  
          /* Insert any extra delay slot NOPs we need. */
@@ -1165,10 +1188,19 @@ emit_thrsw(struct v3d_compile *c,
                  time++;
          }
  
+        /* If we're emitting the last THRSW (other than program end), then
+         * signal that to the HW by emitting two THRSWs in a row.
+         */
+        if (inst->is_last_thrsw) {
+                struct qinst *second_inst =
+                        (struct qinst *)merge_inst->link.next;
+                second_inst->qpu.sig.thrsw = true;
+        }
+
          /* If we put our THRSW into another instruction, free up the
           * instruction that didn't end up scheduled into the list.
           */
-        if (merge_inst)
+        if (needs_free)
                  free(inst);
  
          return time;
@@ -1293,40 +1325,24 @@ schedule_instructions(struct v3d_compile *c,
                          free(merge->inst);
                  }
  
-                if (0 && inst->sig.thrsw) {
-                        /* XXX emit_thrsw(c, scoreboard, qinst); */
+                if (inst->sig.thrsw) {
+                        time += emit_thrsw(c, block, scoreboard, qinst, false);
                  } else {
-                        c->qpu_inst_count++;
-                        list_addtail(&qinst->link, &block->instructions);
-                        update_scoreboard_for_chosen(scoreboard, inst);
-                }
-
-                scoreboard->tick++;
-                time++;
-
-                if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH ||
-                    inst->sig.thrsw /* XXX */) {
-                        block->branch_qpu_ip = c->qpu_inst_count - 1;
-                        /* Fill the delay slots.
-                         *
-                         * We should fill these with actual instructions,
-                         * instead, but that will probably need to be done
-                         * after this, once we know what the leading
-                         * instructions of the successors are (so we can
-                         * handle A/B register file write latency)
-                        */
-                        /* XXX: scoreboard */
-                        int slots = (inst->type == V3D_QPU_INSTR_TYPE_BRANCH ?
-                                     3 : 2);
-                        for (int i = 0; i < slots; i++) {
-                                struct qinst *nop = vir_nop();
-                                list_addtail(&nop->link, &block->instructions);
-
-                                update_scoreboard_for_chosen(scoreboard,
-                                                             &nop->qpu);
-                                c->qpu_inst_count++;
-                                scoreboard->tick++;
-                                time++;
+                        insert_scheduled_instruction(c, block,
+                                                     scoreboard, qinst);
+
+                        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
+                                block->branch_qpu_ip = c->qpu_inst_count - 1;
+                                /* Fill the delay slots.
+                                 *
+                                 * We should fill these with actual instructions,
+                                 * instead, but that will probably need to be done
+                                 * after this, once we know what the leading
+                                 * instructions of the successors are (so we can
+                                 * handle A/B register file write latency)
+                                 */
+                                for (int i = 0; i < 3; i++)
+                                        emit_nop(c, block, scoreboard);
                          }
                  }
          }
@@ -1488,7 +1504,7 @@ v3d_qpu_schedule_instructions(struct v3d_compile *c)
          /* Emit the program-end THRSW instruction. */;
          struct qinst *thrsw = vir_nop();
          thrsw->qpu.sig.thrsw = true;
-        emit_thrsw(c, end_block, &scoreboard, thrsw);
+        emit_thrsw(c, end_block, &scoreboard, thrsw, true);
  
          qpu_set_branch_targets(c);
  
diff --git a/src/broadcom/compiler/qpu_validate.c b/src/broadcom/compiler/qpu_validate.c

index 3b2c10eabc6852dee4a2681a8fe0d01af5f0f356..4ef587c1d52bdab30ce7afceeb09de1c01e36592 100644 (file)
--- a/src/broadcom/compiler/qpu_validate.c
+++ b/src/broadcom/compiler/qpu_validate.c
@@ -39,6 +39,10 @@ struct v3d_qpu_validate_state {
          const struct v3d_qpu_instr *last;
          int ip;
          int last_sfu_write;
+        int last_branch_ip;
+        int last_thrsw_ip;
+        bool last_thrsw_found;
+        int thrsw_count;
  };
  
  static void
@@ -62,6 +66,18 @@ fail_instr(struct v3d_qpu_validate_state *state, const char *msg)
          abort();
  }
  
+static bool
+in_branch_delay_slots(struct v3d_qpu_validate_state *state)
+{       /* True when the instruction at state->ip lies fewer than 3
+         * instructions after the last branch recorded in
+         * state->last_branch_ip (which qpu_validate() starts at -10, so the
+         * first instructions of a program do not match).
+         *
+         * NOTE(review): schedule_instructions() emits 3 NOPs after a branch,
+         * so the third delay slot has ip - last_branch_ip == 3 and is not
+         * covered by "< 3" -- confirm whether "<= 3" was intended.
+         */
+        return (state->ip - state->last_branch_ip) < 3;
+}
+
+static bool
+in_thrsw_delay_slots(struct v3d_qpu_validate_state *state)
+{       /* True when the instruction at state->ip lies fewer than 3
+         * instructions after the last THRSW recorded in
+         * state->last_thrsw_ip (initialized to -10 by qpu_validate()).
+         *
+         * qpu_validate_inst() does not update last_thrsw_ip for the second
+         * THRSW of a back-to-back pair, so the window is always measured
+         * from the first THRSW of such a pair.
+         */
+        return (state->ip - state->last_thrsw_ip) < 3;
+}
+
  static bool
  qpu_magic_waddr_matches(const struct v3d_qpu_instr *inst,
                          bool (*predicate)(enum v3d_qpu_waddr waddr))
@@ -136,6 +152,19 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst)
                  }
          }
  
+        if (in_thrsw_delay_slots(state)) {
+                /* There's no way you want to start SFU during the THRSW delay
+                 * slots, since the result would land in the other thread.
+                 */
+                if (sfu_writes) {
+                        fail_instr(state,
+                                   "SFU write started during THRSW delay slots ");
+                }
+
+                if (inst->sig.ldvary)
+                        fail_instr(state, "LDVARY during THRSW delay slots");
+        }
+
          (void)qpu_magic_waddr_matches; /* XXX */
  
          /* SFU r4 results come back two instructions later.  No doing
@@ -170,6 +199,35 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst)
  
          if (sfu_writes)
                  state->last_sfu_write = state->ip;
+
+        if (inst->sig.thrsw) {
+                if (in_branch_delay_slots(state))
+                        fail_instr(state, "THRSW in a branch delay slot.");
+
+                if (state->last_thrsw_ip == state->ip - 1) {
+                        /* If it's the second THRSW in a row, then it's just a
+                         * last-thrsw signal.
+                         */
+                        if (state->last_thrsw_found)
+                                fail_instr(state, "Two last-THRSW signals");
+                        state->last_thrsw_found = true;
+                } else {
+                        if (in_thrsw_delay_slots(state)) {
+                                fail_instr(state,
+                                           "THRSW too close to another THRSW.");
+                        }
+                        state->thrsw_count++;
+                        state->last_thrsw_ip = state->ip;
+                }
+        }
+
+        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
+                if (in_branch_delay_slots(state))
+                        fail_instr(state, "branch in a branch delay slot.");
+                if (in_thrsw_delay_slots(state))
+                        fail_instr(state, "branch in a THRSW delay slot.");
+                state->last_branch_ip = state->ip;
+        }
  }
  
  static void
@@ -201,10 +259,22 @@ qpu_validate(struct v3d_compile *c)
          struct v3d_qpu_validate_state state = {
                  .c = c,
                  .last_sfu_write = -10,
+                .last_thrsw_ip = -10,
+                .last_branch_ip = -10,
                  .ip = 0,
          };
  
          vir_for_each_block(block, c) {
                  qpu_validate_block(&state, block);
          }
+
+        if (state.thrsw_count > 1 && !state.last_thrsw_found) {
+                fail_instr(&state,
+                           "thread switch found without last-THRSW in program");
+        }
+
+        if (state.thrsw_count == 0 ||
+            (state.last_thrsw_found && state.thrsw_count == 1)) {
+                fail_instr(&state, "No program-end THRSW found");
+        }
  }
diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h

index e17a108233f7115143676470a4bd012d277a41ae..cb3614edcb65e60f996ffbaa8c70f78725200dbb 100644 (file)
--- a/src/broadcom/compiler/v3d_compiler.h
+++ b/src/broadcom/compiler/v3d_compiler.h
@@ -134,6 +134,7 @@ struct qinst {
          struct qreg src[3];
          bool cond_is_exec_mask;
          bool has_implicit_uniform;
+        bool is_last_thrsw;
  
          /* After vir_to_qpu.c: If instr reads a uniform, which uniform from
           * the uncompiled stream it is.
@@ -522,12 +523,16 @@ struct v3d_compile {
          uint32_t program_id;
          uint32_t variant_id;
  
-        /* Set to compile program in threaded FS mode, where SIG_THREAD_SWITCH
-         * is used to hide texturing latency at the cost of limiting ourselves
-         * to the bottom half of physical reg space.
+        /* Set to compile program in 1x, 2x, or 4x threaded mode, where
+         * SIG_THREAD_SWITCH is used to hide texturing latency at the cost of
+         * limiting ourselves to part of the physical reg space.
+         *
+         * On V3D 3.x, 2x or 4x divide the physical reg space by 2x or 4x.  On
+         * V3D 4.x, all shaders are 2x threaded, and 4x only divides the
+         * physical reg space in half.
           */
-        bool fs_threaded;
-
+        uint8_t threads;
+        struct qinst *last_thrsw;
          bool last_thrsw_at_top_level;
  
          bool failed;
@@ -547,7 +552,12 @@ struct v3d_prog_data {
          uint32_t ubo_size;
  
          uint8_t num_inputs;
+        uint8_t threads;
  
+        /* For threads > 1, whether the program should be dispatched in the
+         * after-final-THRSW state.
+         */
+        bool single_seg;
  };
  
  struct v3d_vs_prog_data {
@@ -674,7 +684,7 @@ void v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c);
  void v3d_nir_lower_txf_ms(nir_shader *s, struct v3d_compile *c);
  void vir_lower_uniforms(struct v3d_compile *c);
  
-void v3d_vir_to_qpu(struct v3d_compile *c);
+void v3d_vir_to_qpu(struct v3d_compile *c, struct qpu_reg *temp_registers);
  uint32_t v3d_qpu_schedule_instructions(struct v3d_compile *c);
  void qpu_validate(struct v3d_compile *c);
  struct qpu_reg *v3d_register_allocate(struct v3d_compile *c);
diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c

index da4ece2cffe73ecaee65140ddfc7670f56633f26..a063ebc5d53bcc50f9b578dce42393525a03e9e8 100644 (file)
--- a/src/broadcom/compiler/vir.c
+++ b/src/broadcom/compiler/vir.c
@@ -109,7 +109,7 @@ vir_has_side_effects(struct v3d_compile *c, struct qinst *inst)
                  }
          }
  
-        if (inst->qpu.sig.ldtmu)
+        if (inst->qpu.sig.ldtmu || inst->qpu.sig.thrsw)
                  return true;
  
          return false;
@@ -528,6 +528,7 @@ vir_compile_init(const struct v3d_compiler *compiler,
          c->key = key;
          c->program_id = program_id;
          c->variant_id = variant_id;
+        c->threads = 4;
  
          s = nir_shader_clone(c, s);
          c->s = s;
@@ -637,6 +638,9 @@ static void
  v3d_set_prog_data(struct v3d_compile *c,
                    struct v3d_prog_data *prog_data)
  {
+        prog_data->threads = c->threads;
+        prog_data->single_seg = !c->last_thrsw;
+
          v3d_set_prog_data_uniforms(c, prog_data);
          v3d_set_prog_data_ubo(c, prog_data);
  }
diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c

index ff30101ce41d49069702b1ae74eea7e386412190..ab3a4e257ff6f0b3f9735411ba82934bcfa09729 100644 (file)
--- a/src/broadcom/compiler/vir_register_allocate.c
+++ b/src/broadcom/compiler/vir_register_allocate.c
@@ -23,6 +23,7 @@
  
  #include "util/ralloc.h"
  #include "util/register_allocate.h"
+#include "common/v3d_device_info.h"
  #include "v3d_compiler.h"
  
  #define QPU_R(i) { .magic = false, .index = i }
@@ -35,15 +36,17 @@
  bool
  vir_init_reg_sets(struct v3d_compiler *compiler)
  {
+        /* Allocate up to 3 regfile classes, for the ways the physical
+         * register file can be divided up for fragment shader threading.
+         */
+        int max_thread_index = (compiler->devinfo->ver >= 40 ? 2 : 3);
+
          compiler->regs = ra_alloc_reg_set(compiler, PHYS_INDEX + PHYS_COUNT,
                                            true);
          if (!compiler->regs)
                  return false;
  
-        /* Allocate 3 regfile classes, for the ways the physical register file
-         * can be divided up for fragment shader threading.
-         */
-        for (int threads = 0; threads < 3; threads++) {
+        for (int threads = 0; threads < max_thread_index; threads++) {
                  compiler->reg_class_phys_or_acc[threads] =
                          ra_alloc_reg_class(compiler->regs);
                  compiler->reg_class_phys[threads] =
@@ -105,6 +108,16 @@ v3d_register_allocate(struct v3d_compile *c)
          struct ra_graph *g = ra_alloc_interference_graph(c->compiler->regs,
                                                           c->num_temps +
                                                           ARRAY_SIZE(acc_nodes));
+        /* Convert 1, 2, 4 threads to 0, 1, 2 index.
+         *
+         * V3D 4.x has double the physical register space, so 64 physical regs
+         * are available at both 1x and 2x threading, and 4x has 32.
+         */
+        int thread_index = ffs(c->threads) - 1;
+        if (c->devinfo->ver >= 40) {
+                if (thread_index >= 1)
+                        thread_index--;
+        }
  
          /* Make some fixed nodes for the accumulators, which we will need to
           * interfere with when ops have implied r3/r4 writes or for the thread
@@ -117,9 +130,6 @@ v3d_register_allocate(struct v3d_compile *c)
                  ra_set_node_reg(g, acc_nodes[i], ACC_INDEX + i);
          }
  
-        /* Compute the live ranges so we can figure out interference. */
-        vir_calculate_live_intervals(c);
-
          for (uint32_t i = 0; i < c->num_temps; i++) {
                  map[i].temp = i;
                  map[i].priority = c->temp_end[i] - c->temp_start[i];
@@ -204,23 +214,15 @@ v3d_register_allocate(struct v3d_compile *c)
                          }
                  }
  
-#if 0
-                switch (inst->op) {
-                case QOP_THRSW:
+                if (inst->qpu.sig.thrsw) {
                          /* All accumulators are invalidated across a thread
                           * switch.
                           */
                          for (int i = 0; i < c->num_temps; i++) {
                                  if (c->temp_start[i] < ip && c->temp_end[i] > ip)
-                                        class_bits[i] &= ~(CLASS_BIT_R0_R3 |
-                                                           CLASS_BIT_R4);
+                                        class_bits[i] &= CLASS_BIT_PHYS;
                          }
-                        break;
-
-                default:
-                        break;
                  }
-#endif
  
                  ip++;
          }
@@ -228,14 +230,14 @@ v3d_register_allocate(struct v3d_compile *c)
          for (uint32_t i = 0; i < c->num_temps; i++) {
                  if (class_bits[i] == CLASS_BIT_PHYS) {
                          ra_set_node_class(g, temp_to_node[i],
-                                          c->compiler->reg_class_phys[c->fs_threaded]);
+                                          c->compiler->reg_class_phys[thread_index]);
                  } else {
                          assert(class_bits[i] == (CLASS_BIT_PHYS |
                                                   CLASS_BIT_R0_R2 |
                                                   CLASS_BIT_R3 |
                                                   CLASS_BIT_R4));
                          ra_set_node_class(g, temp_to_node[i],
-                                          c->compiler->reg_class_phys_or_acc[c->fs_threaded]);
+                                          c->compiler->reg_class_phys_or_acc[thread_index]);
                  }
          }
  
@@ -252,12 +254,6 @@ v3d_register_allocate(struct v3d_compile *c)
  
          bool ok = ra_allocate(g);
          if (!ok) {
-                if (!c->fs_threaded) {
-                        fprintf(stderr, "Failed to register allocate:\n");
-                        vir_dump(c);
-                }
-
-                c->failed = true;
                  free(temp_registers);
                  return NULL;
          }
diff --git a/src/broadcom/compiler/vir_to_qpu.c b/src/broadcom/compiler/vir_to_qpu.c

index 955eb96a87e7be750f2b954b72792fc840d24f36..9229fa5ba47122acf8aae657156266e6633d5b99 100644 (file)
--- a/src/broadcom/compiler/vir_to_qpu.c
+++ b/src/broadcom/compiler/vir_to_qpu.c
@@ -319,10 +319,8 @@ v3d_dump_qpu(struct v3d_compile *c)
  }
  
  void
-v3d_vir_to_qpu(struct v3d_compile *c)
+v3d_vir_to_qpu(struct v3d_compile *c, struct qpu_reg *temp_registers)
  {
-        struct qpu_reg *temp_registers = v3d_register_allocate(c);
-
          /* Reset the uniform count to how many will be actually loaded by the
           * generated QPU code.
           */
diff --git a/src/gallium/drivers/vc5/vc5_draw.c b/src/gallium/drivers/vc5/vc5_draw.c

index 95a857f1abc8baaff74b42711e304ff9588988c1..39378280823eb023a1fe02bb646eb0ef88a440a3 100644 (file)
--- a/src/gallium/drivers/vc5/vc5_draw.c
+++ b/src/gallium/drivers/vc5/vc5_draw.c
@@ -209,9 +209,32 @@ vc5_emit_gl_shader_state(struct vc5_context *vc5,
                  shader.fragment_shader_uniforms_address = fs_uniforms;
  
  #if V3D_VERSION >= 41
-                shader.coordinate_shader_start_in_final_thread_section = true;
-                shader.vertex_shader_start_in_final_thread_section = true;
-                shader.fragment_shader_start_in_final_thread_section = true;
+                shader.coordinate_shader_4_way_threadable =
+                        vc5->prog.cs->prog_data.vs->base.threads == 4;
+                shader.vertex_shader_4_way_threadable =
+                        vc5->prog.vs->prog_data.vs->base.threads == 4;
+                shader.fragment_shader_4_way_threadable =
+                        vc5->prog.fs->prog_data.fs->base.threads == 4;
+
+                shader.coordinate_shader_start_in_final_thread_section =
+                        vc5->prog.cs->prog_data.vs->base.single_seg;
+                shader.vertex_shader_start_in_final_thread_section =
+                        vc5->prog.vs->prog_data.vs->base.single_seg;
+                shader.fragment_shader_start_in_final_thread_section =
+                        vc5->prog.fs->prog_data.fs->base.single_seg;
+#else
+                shader.coordinate_shader_4_way_threadable =
+                        vc5->prog.cs->prog_data.vs->base.threads == 4;
+                shader.coordinate_shader_2_way_threadable =
+                        vc5->prog.cs->prog_data.vs->base.threads == 2;
+                shader.vertex_shader_4_way_threadable =
+                        vc5->prog.vs->prog_data.vs->base.threads == 4;
+                shader.vertex_shader_2_way_threadable =
+                        vc5->prog.vs->prog_data.vs->base.threads == 2;
+                shader.fragment_shader_4_way_threadable =
+                        vc5->prog.fs->prog_data.fs->base.threads == 4;
+                shader.fragment_shader_2_way_threadable =
+                        vc5->prog.fs->prog_data.fs->base.threads == 2;
  #endif
  
                  shader.vertex_id_read_by_coordinate_shader =
author	Eric Anholt <eric@anholt.net>
	Mon, 8 Jan 2018 19:55:31 +0000 (11:55 -0800)
committer	Eric Anholt <eric@anholt.net>
	Sat, 13 Jan 2018 05:55:30 +0000 (21:55 -0800)
src/broadcom/cle/v3d_packet_v33.xml		patch \| blob \| history
src/broadcom/compiler/nir_to_vir.c		patch \| blob \| history
src/broadcom/compiler/qpu_schedule.c		patch \| blob \| history
src/broadcom/compiler/qpu_validate.c		patch \| blob \| history
src/broadcom/compiler/v3d_compiler.h		patch \| blob \| history
src/broadcom/compiler/vir.c		patch \| blob \| history
src/broadcom/compiler/vir_register_allocate.c		patch \| blob \| history
src/broadcom/compiler/vir_to_qpu.c		patch \| blob \| history
src/gallium/drivers/vc5/vc5_draw.c		patch \| blob \| history