util/hash_table: update users to use new optimal integer hash functions
[mesa.git] / src/gallium/drivers/vc4/vc4_qir_schedule.c
index 903c6108824032cb7280cf4bddfc83a73ea459f3..fdd922fde4d5fca5cf4feef8b7d8618ace0123ee 100644
  */
 
 #include "vc4_qir.h"
+#include "util/dag.h"
 
 static bool debug;
 
 struct schedule_node {
+        struct dag_node dag;
         struct list_head link;
         struct qinst *inst;
 
-        struct schedule_node **children;
-        uint32_t child_count;
-        uint32_t child_array_size;
-        uint32_t parent_count;
-
         /* Length of the longest (latency) chain from a DAG head to this
          * instruction.
          */
@@ -61,11 +58,7 @@ struct schedule_node {
 };
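
Embedding the dag_node as the first member is what lets the scheduler
downcast the generic util/dag pointers later in this patch with a plain
cast. A minimal sketch of the pattern (to_schedule_node is illustrative,
not part of the patch):

    struct schedule_node {
            struct dag_node dag;    /* must stay first for the cast */
            struct qinst *inst;
    };

    static struct schedule_node *
    to_schedule_node(struct dag_node *node)
    {
            /* Valid because 'dag' sits at offset zero, so a dag_node
             * pointer and its schedule_node share an address.
             */
            return (struct schedule_node *)node;
    }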
 
 struct schedule_state {
-        /* List of struct schedule_node *.  This starts out with all
-         * instructions, and after dependency updates it's trimmed to be just
-         * the DAG heads.
-         */
-        struct list_head worklist;
+        struct dag *dag;
 
         uint32_t time;
 
@@ -103,21 +96,7 @@ add_dep(enum direction dir,
                 after = t;
         }
 
-        for (int i = 0; i < after->child_count; i++) {
-                if (after->children[i] == before)
-                        return;
-        }
-
-        if (after->child_array_size <= after->child_count) {
-                after->child_array_size = MAX2(after->child_array_size * 2, 16);
-                after->children = reralloc(after, after->children,
-                                           struct schedule_node *,
-                                           after->child_array_size);
-        }
-
-        after->children[after->child_count] = before;
-        after->child_count++;
-        before->parent_count++;
+        dag_add_edge(&after->dag, &before->dag, NULL);
 }
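
The removed duplicate-edge filter and hand-grown child array both move
into util/dag. A hedged sketch of what dag_add_edge(parent, child, data)
is relied on to do here, paraphrased from the behavior this patch
depends on rather than quoted from util/dag.c:

    void
    dag_add_edge(struct dag_node *parent, struct dag_node *child, void *data)
    {
            /* Skip edges that already exist, as the deleted loop did. */
            util_dynarray_foreach(&parent->edges, struct dag_edge, edge) {
                    if (edge->child == child)
                            return;
            }

            /* A node with a parent is no longer a DAG head. */
            if (child->parent_count == 0)
                    list_delinit(&child->link);
            child->parent_count++;

            util_dynarray_append(&parent->edges, struct dag_edge,
                                 ((struct dag_edge){ child, data }));
    }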
 
 static void
@@ -138,6 +117,7 @@ struct schedule_setup_state {
         struct schedule_node *last_tex_coord;
         struct schedule_node *last_tex_result;
         struct schedule_node *last_tlb;
+        struct schedule_node *last_uniforms_reset;
         enum direction dir;
 
        /**
@@ -186,7 +166,7 @@ calculate_deps(struct schedule_setup_state *state, struct schedule_node *n)
          * ignore uniforms accesses, because qir_reorder_uniforms() happens
          * after this.
          */
-        for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
+        for (int i = 0; i < qir_get_nsrc(inst); i++) {
                 switch (inst->src[i].file) {
                 case QFILE_TEMP:
                         add_dep(dir,
@@ -211,23 +191,35 @@ calculate_deps(struct schedule_setup_state *state, struct schedule_node *n)
                 add_dep(dir, state->last_vary_read, n);
                 break;
 
-        case QOP_TEX_S:
-        case QOP_TEX_T:
-        case QOP_TEX_R:
-        case QOP_TEX_B:
-        case QOP_TEX_DIRECT:
-                /* Texturing setup gets scheduled in order, because
-                 * the uniforms referenced by them have to land in a
-                 * specific order.
-                 */
-                add_write_dep(dir, &state->last_tex_coord, n);
-                break;
-
         case QOP_TEX_RESULT:
                 /* Results have to be fetched in order. */
                 add_write_dep(dir, &state->last_tex_result, n);
                 break;
 
+        case QOP_THRSW:
+                /* After a new THRSW, one must collect all texture samples
+                 * queued since the previous THRSW/program start.  For now, we
+                 * have one THRSW in between each texture setup and its
+                 * results collection as our input, and we just make sure that
+                 * that ordering is maintained.
+                 */
+                add_write_dep(dir, &state->last_tex_coord, n);
+                add_write_dep(dir, &state->last_tex_result, n);
+
+                /* Accumulators and flags are lost across thread switches. */
+                add_write_dep(dir, &state->last_sf, n);
+
+                /* Setup, like the varyings, will need to be drained before we
+                 * thread switch.
+                 */
+                add_write_dep(dir, &state->last_vary_read, n);
+
+                /* The TLB-locking operations have to stay after the last
+                 * thread switch.
+                 */
+                add_write_dep(dir, &state->last_tlb, n);
+                break;
+
         case QOP_TLB_COLOR_READ:
         case QOP_MS_MASK:
                 add_write_dep(dir, &state->last_tlb, n);
@@ -253,6 +245,18 @@ calculate_deps(struct schedule_setup_state *state, struct schedule_node *n)
                 add_write_dep(dir, &state->last_tlb, n);
                 break;
 
+        case QFILE_TEX_S_DIRECT:
+        case QFILE_TEX_S:
+        case QFILE_TEX_T:
+        case QFILE_TEX_R:
+        case QFILE_TEX_B:
+                /* Texturing setup gets scheduled in order, because
+                 * the uniforms referenced by them have to land in a
+                 * specific order.
+                 */
+                add_write_dep(dir, &state->last_tex_coord, n);
+                break;
+
         default:
                 break;
         }
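
For context, add_write_dep() (unchanged by this patch, so not shown in
the hunks) follows the usual last-writer pattern: order the new node
after whatever last touched the resource, then record it as the new
last toucher:

    static void
    add_write_dep(enum direction dir,
                  struct schedule_node **before,
                  struct schedule_node *after)
    {
            add_dep(dir, *before, after); /* serialize against last writer */
            *before = after;              /* become the new last writer */
    }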
@@ -280,26 +284,69 @@ calculate_forward_deps(struct vc4_compile *c, void *mem_ctx,
 
                 calculate_deps(&state, n);
 
-                switch (inst->op) {
-                case QOP_TEX_S:
-                case QOP_TEX_T:
-                case QOP_TEX_R:
-                case QOP_TEX_B:
-                case QOP_TEX_DIRECT:
-                        /* If the texture coordinate fifo is full,
-                         * block this on the last QOP_TEX_RESULT.
+                for (int i = 0; i < qir_get_nsrc(inst); i++) {
+                        switch (inst->src[i].file) {
+                        case QFILE_UNIF:
+                                add_dep(state.dir, state.last_uniforms_reset, n);
+                                break;
+                        default:
+                                break;
+                        }
+                }
+
+                switch (inst->dst.file) {
+                case QFILE_TEX_S_DIRECT:
+                case QFILE_TEX_S:
+                case QFILE_TEX_T:
+                case QFILE_TEX_R:
+                case QFILE_TEX_B:
+                        /* From the VC4 spec:
+                         *
+                         *     "The TFREQ input FIFO holds two full lots of s,
+                         *      t, r, b data, plus associated setup data, per
+                         *      QPU, that is, there are eight data slots. For
+                         *      each texture request, slots are only consumed
+                         *      for the components of s, t, r, and b actually
+                         *      written. Thus the FIFO can hold four requests
+                         *      of just (s, t) data, or eight requests of just
+                         *      s data (for direct addressed data lookups).
+                         *
+                         *      Note that there is one FIFO per QPU, and the
+                         *      FIFO has no concept of threads - that is,
+                         *      multi-threaded shaders must be careful to use
+                         *      only 1/2 the FIFO depth before reading
+                         *      back. Multi-threaded programs must also
+                         *      therefore always thread switch on texture
+                         *      fetch as the other thread may have data
+                         *      waiting in the FIFO."
+                         *
+                         * If the texture coordinate fifo is full, block this
+                         * on the last QOP_TEX_RESULT.
                          */
-                        if (state.tfreq_count == 8) {
+                        if (state.tfreq_count == (c->fs_threaded ? 4 : 8)) {
                                 block_until_tex_result(&state, n);
                         }
 
-                        /* If the texture result fifo is full, block
-                         * adding any more to it until the last
-                         * QOP_TEX_RESULT.
+                        /* From the VC4 spec:
+                         *
+                         *     "Since the maximum number of texture requests
+                         *      in the input (TFREQ) FIFO is four lots of (s,
+                         *      t) data, the output (TFRCV) FIFO is sized to
+                         *      holds four lots of max-size color data per
+                         *      QPU. For non-float color, reads are packed
+                         *      RGBA8888 data (one read per pixel). For 16-bit
+                         *      float color, two reads are necessary per
+                         *      pixel, with reads packed as RG1616 then
+                         *      BA1616. So per QPU there are eight color slots
+                         *      in the TFRCV FIFO."
+                         *
+                         * If the texture result fifo is full, block adding
+                         * any more to it until the last QOP_TEX_RESULT.
                          */
-                        if (inst->op == QOP_TEX_S ||
-                            inst->op == QOP_TEX_DIRECT) {
-                                if (state.tfrcv_count == 4)
+                        if (inst->dst.file == QFILE_TEX_S ||
+                            inst->dst.file == QFILE_TEX_S_DIRECT) {
+                                if (state.tfrcv_count ==
+                                    (c->fs_threaded ? 2 : 4))
                                         block_until_tex_result(&state, n);
                                 state.tfrcv_count++;
                         }
@@ -308,6 +355,11 @@ calculate_forward_deps(struct vc4_compile *c, void *mem_ctx,
                         state.tfreq_count++;
                         break;
 
+                default:
+                        break;
+                }
+
+                switch (inst->op) {
                 case QOP_TEX_RESULT:
                         /* Results have to be fetched after the
                          * coordinate setup.  Note that we're assuming
@@ -324,8 +376,12 @@ calculate_forward_deps(struct vc4_compile *c, void *mem_ctx,
                         memset(&state.tex_fifo[state.tex_fifo_pos], 0,
                                sizeof(state.tex_fifo[0]));
                         break;
+
+                case QOP_UNIFORMS_RESET:
+                        add_write_dep(state.dir, &state.last_uniforms_reset, n);
+                        break;
+
                 default:
-                        assert(!qir_is_tex(inst));
                         break;
                 }
         }
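
Working the quoted numbers through the counters above: every coordinate
write costs one TFREQ slot, so an (s, t) lookup costs two and the
single-threaded limit of eight slots allows four outstanding lookups
(the threaded limit of four allows two). On the receive side the driver
caps outstanding requests at four (two threaded), since a max-size f16
result occupies two of the eight hardware TFRCV color slots.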
@@ -356,11 +412,21 @@ get_register_pressure_cost(struct schedule_state *state, struct qinst *inst)
             state->temp_writes[inst->dst.index] == 1)
                 cost--;
 
-        for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
-                if (inst->src[i].file == QFILE_TEMP &&
-                    !BITSET_TEST(state->temp_live, inst->src[i].index)) {
-                        cost++;
+        for (int i = 0; i < qir_get_nsrc(inst); i++) {
+                if (inst->src[i].file != QFILE_TEMP ||
+                    BITSET_TEST(state->temp_live, inst->src[i].index)) {
+                        continue;
                 }
+
+                bool already_counted = false;
+                for (int j = 0; j < i; j++) {
+                        if (inst->src[i].file == inst->src[j].file &&
+                            inst->src[i].index == inst->src[j].index) {
+                                already_counted = true;
+                        }
+                }
+                if (!already_counted)
+                        cost++;
         }
 
         return cost;
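
The inner loop added here stops the same temporary from being counted
once per source slot; reading a value twice still makes only one value
live. An illustrative case (qir_FMUL is the existing QIR helper; the
temp names are made up):

    /* t = x * x reads the temp 'x' in both source slots, but
     * scheduling it makes exactly one new value live, so it should
     * add 1 to the register-pressure cost, not 2.
     */
    struct qreg t = qir_FMUL(c, x, x);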
@@ -387,7 +453,8 @@ choose_instruction(struct schedule_state *state)
 {
         struct schedule_node *chosen = NULL;
 
-        list_for_each_entry(struct schedule_node, n, &state->worklist, link) {
+        list_for_each_entry(struct schedule_node, n, &state->dag->heads,
+                            dag.link) {
                 /* The branches aren't being tracked as dependencies.  Make
                  * sure that they stay scheduled as the last instruction of
                  * the block, which is to say the first one we choose to
@@ -462,17 +529,20 @@ static void
 dump_state(struct vc4_compile *c, struct schedule_state *state)
 {
         uint32_t i = 0;
-        list_for_each_entry(struct schedule_node, n, &state->worklist, link) {
+        list_for_each_entry(struct schedule_node, n, &state->dag->heads,
+                            dag.link) {
                 fprintf(stderr, "%3d: ", i++);
                 qir_dump_inst(c, n->inst);
                 fprintf(stderr, " (%d cost)\n",
                         get_register_pressure_cost(state, n->inst));
 
-                for (int i = 0; i < n->child_count; i++) {
-                        struct schedule_node *child = n->children[i];
+                util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
+                        struct schedule_node *child =
+                                (struct schedule_node *)edge->child;
                         fprintf(stderr, "   - ");
                         qir_dump_inst(c, child->inst);
-                        fprintf(stderr, " (%d parents)\n", child->parent_count);
+                        fprintf(stderr, " (%d parents)\n",
+                                child->dag.parent_count);
                 }
         }
 }
@@ -487,37 +557,58 @@ dump_state(struct vc4_compile *c, struct schedule_state *state)
 static uint32_t
 latency_between(struct schedule_node *before, struct schedule_node *after)
 {
-        if ((before->inst->op == QOP_TEX_S ||
-             before->inst->op == QOP_TEX_DIRECT) &&
+        if ((before->inst->dst.file == QFILE_TEX_S ||
+             before->inst->dst.file == QFILE_TEX_S_DIRECT) &&
             after->inst->op == QOP_TEX_RESULT)
                 return 100;
 
+        switch (before->inst->op) {
+        case QOP_RCP:
+        case QOP_RSQ:
+        case QOP_EXP2:
+        case QOP_LOG2:
+                for (int i = 0; i < qir_get_nsrc(after->inst); i++) {
+                        if (after->inst->src[i].file ==
+                            before->inst->dst.file &&
+                            after->inst->src[i].index ==
+                            before->inst->dst.index) {
+                                /* There are two QPU delay slots before we can
+                                 * read a math result, which could be up to 4
+                                 * QIR instructions if they packed well.
+                                 */
+                                return 4;
+                        }
+                }
+                break;
+        default:
+                break;
+        }
+
         return 1;
 }
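
These latencies feed the unblocked_time bookkeeping in
schedule_instructions() below. Since this scheduler emits the block
bottom-up (list_add() prepends each chosen instruction), holding a
producer back by the edge latency leaves that many slots between it and
its consumer in the final order, roughly:

    child->unblocked_time = MAX2(child->unblocked_time,
                                 state->time + latency_between(child, chosen));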
 
 /** Bottom-up computation of the delay member of a node. */
 static void
-compute_delay(struct schedule_node *n)
+compute_delay(struct dag_node *node, void *state)
 {
-        if (!n->child_count) {
-                /* The color read needs to be scheduled late, to avoid locking
-                 * the scoreboard early.  This is our best tool for
-                 * encouraging that.  The other scoreboard locking ops will
-                 * have this happen by default, since they are generally the
-                 * DAG heads or close to them.
-                 */
-                if (n->inst->op == QOP_TLB_COLOR_READ)
-                        n->delay = 1000;
-                else
-                        n->delay = 1;
-        } else {
-                for (int i = 0; i < n->child_count; i++) {
-                        if (!n->children[i]->delay)
-                                compute_delay(n->children[i]);
-                        n->delay = MAX2(n->delay,
-                                        n->children[i]->delay +
-                                        latency_between(n, n->children[i]));
-                }
+        struct schedule_node *n = (struct schedule_node *)node;
+
+        /* The color read needs to be scheduled late, to avoid locking the
+         * scoreboard early.  This is our best tool for encouraging that.  The
+         * other scoreboard locking ops will have this happen by default,
+         * since they are generally the DAG heads or close to them.
+         */
+        if (n->inst->op == QOP_TLB_COLOR_READ)
+                n->delay = 1000;
+        else
+                n->delay = 1;
+
+        util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
+                struct schedule_node *child =
+                        (struct schedule_node *)edge->child;
+
+                n->delay = MAX2(n->delay, (child->delay +
+                                           latency_between(child, n)));
         }
 }
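
dag_traverse_bottom_up() visits every child before its parents, so each
child->delay is final by the time a parent's callback runs, and the old
explicit recursion is unnecessary. The callback computes the critical
path, roughly:

    /* Seeded with 1 (or 1000 for QOP_TLB_COLOR_READ), then: */
    delay(n) = MAX over children c of (delay(c) + latency_between(c, n))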
 
@@ -530,15 +621,8 @@ schedule_instructions(struct vc4_compile *c,
                 dump_state(c, state);
         }
 
-        /* Remove non-DAG heads from the list. */
-        list_for_each_entry_safe(struct schedule_node, n,
-                                 &state->worklist, link) {
-                if (n->parent_count != 0)
-                        list_del(&n->link);
-        }
-
         state->time = 0;
-        while (!list_empty(&state->worklist)) {
+        while (!list_is_empty(&state->dag->heads)) {
                 struct schedule_node *chosen = choose_instruction(state);
                 struct qinst *inst = chosen->inst;
 
@@ -554,7 +638,6 @@ schedule_instructions(struct vc4_compile *c,
                 state->time = MAX2(state->time, chosen->unblocked_time);
 
                 /* Schedule this instruction back onto the QIR list. */
-                list_del(&chosen->link);
                 list_add(&inst->link, &block->instructions);
 
                 /* Now that we've scheduled a new instruction, some of its
@@ -562,20 +645,20 @@ schedule_instructions(struct vc4_compile *c,
                  * be scheduled.  Update the children's unblocked time for this
                  * DAG edge as we do so.
                  */
-                for (int i = chosen->child_count - 1; i >= 0; i--) {
-                        struct schedule_node *child = chosen->children[i];
+                util_dynarray_foreach(&chosen->dag.edges,
+                                      struct dag_edge, edge) {
+                        struct schedule_node *child =
+                                (struct schedule_node *)edge->child;
 
                         child->unblocked_time = MAX2(child->unblocked_time,
                                                      state->time +
-                                                     latency_between(chosen,
-                                                                     child));
-                        child->parent_count--;
-                        if (child->parent_count == 0)
-                                list_add(&child->link, &state->worklist);
+                                                     latency_between(child,
+                                                                     chosen));
                 }
+                dag_prune_head(state->dag, &chosen->dag);
 
                 /* Update our tracking of register pressure. */
-                for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
+                for (int i = 0; i < qir_get_nsrc(inst); i++) {
                         if (inst->src[i].file == QFILE_TEMP)
                                 BITSET_SET(state->temp_live, inst->src[i].index);
                 }
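
dag_prune_head() absorbs the parent_count bookkeeping the deleted lines
did by hand: drop the scheduled node from the heads list, decrement each
child's parent_count, and move children that hit zero onto dag->heads so
they become schedulable. A simplified sketch of that contract (see
util/dag.c for the real implementation):

    void
    dag_prune_head(struct dag *dag, struct dag_node *node)
    {
            list_del(&node->link); /* node was a DAG head */

            util_dynarray_foreach(&node->edges, struct dag_edge, edge) {
                    edge->child->parent_count--;
                    if (edge->child->parent_count == 0)
                            list_addtail(&edge->child->link, &dag->heads);
            }
    }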
@@ -593,37 +676,39 @@ static void
 qir_schedule_instructions_block(struct vc4_compile *c,
                                 struct qblock *block)
 {
-        void *mem_ctx = ralloc_context(NULL);
-        struct schedule_state state = { { 0 } };
+        struct schedule_state *state = rzalloc(NULL, struct schedule_state);
+
+        state->temp_writes = rzalloc_array(state, uint32_t, c->num_temps);
+        state->temp_live = rzalloc_array(state, BITSET_WORD,
+                                         BITSET_WORDS(c->num_temps));
+        state->dag = dag_create(state);
 
-        state.temp_writes = rzalloc_array(mem_ctx, uint32_t, c->num_temps);
-        state.temp_live = rzalloc_array(mem_ctx, BITSET_WORD,
-                                        BITSET_WORDS(c->num_temps));
-        list_inithead(&state.worklist);
+        struct list_head setup_list;
+        list_inithead(&setup_list);
 
         /* Wrap each instruction in a scheduler structure. */
         qir_for_each_inst_safe(inst, block) {
-                struct schedule_node *n = rzalloc(mem_ctx, struct schedule_node);
+                struct schedule_node *n = rzalloc(state, struct schedule_node);
 
                 n->inst = inst;
                 list_del(&inst->link);
-                list_addtail(&n->link, &state.worklist);
+                list_addtail(&n->link, &setup_list);
+                dag_init_node(state->dag, &n->dag);
 
                 if (inst->dst.file == QFILE_TEMP)
-                        state.temp_writes[inst->dst.index]++;
+                        state->temp_writes[inst->dst.index]++;
         }
 
         /* Dependencies tracked top-to-bottom. */
-        calculate_forward_deps(c, mem_ctx, &state.worklist);
+        calculate_forward_deps(c, state, &setup_list);
         /* Dependencies tracked bottom-to-top. */
-        calculate_reverse_deps(c, mem_ctx, &state.worklist);
+        calculate_reverse_deps(c, state, &setup_list);
 
-        list_for_each_entry(struct schedule_node, n, &state.worklist, link)
-                compute_delay(n);
+        dag_traverse_bottom_up(state->dag, compute_delay, NULL);
 
-        schedule_instructions(c, block, &state);
+        schedule_instructions(c, block, state);
 
-        ralloc_free(mem_ctx);
+        ralloc_free(state);
 }
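
Allocating everything against the schedule_state keeps teardown to the
single ralloc_free() above: the temp arrays, the DAG from
dag_create(state), and every schedule_node are ralloc children of
state. Condensed (hypothetical standalone form):

    struct schedule_state *state = rzalloc(NULL, struct schedule_state);
    state->dag = dag_create(state);     /* DAG owned by state */

    struct schedule_node *n = rzalloc(state, struct schedule_node);
    dag_init_node(state->dag, &n->dag); /* starts life as a DAG head */

    /* ... add edges, schedule ... */

    ralloc_free(state);                 /* frees DAG, nodes, and arrays */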
 
 void