vc4: Fix latency handling for QPU texture scheduling.

author Eric Anholt <eric@anholt.net>

Fri, 18 Dec 2015 19:30:30 +0000 (11:30 -0800)

committer Eric Anholt <eric@anholt.net>

Sat, 19 Dec 2015 01:09:03 +0000 (17:09 -0800)
author Eric Anholt <eric@anholt.net>
Fri, 18 Dec 2015 19:30:30 +0000 (11:30 -0800)
committer Eric Anholt <eric@anholt.net>
Sat, 19 Dec 2015 01:09:03 +0000 (17:09 -0800)
diff --git a/src/gallium/drivers/vc4/vc4_qpu_schedule.c b/src/gallium/drivers/vc4/vc4_qpu_schedule.c

index 76cad2e03fefb1a80ebc249ce700c6d979ca4167..09164b7493277608cadeaef272c5d00c258f10f8 100644 (file)
--- a/src/gallium/drivers/vc4/vc4_qpu_schedule.c
+++ b/src/gallium/drivers/vc4/vc4_qpu_schedule.c
@@ -50,7 +50,7 @@ struct schedule_node {
          uint32_t child_array_size;
          uint32_t parent_count;
  
-        /* Longest cycles + n->latency of any parent of this node. */
+        /* Longest cycles + instruction_latency() of any parent of this node. */
          uint32_t unblocked_time;
  
          /**
@@ -624,6 +624,46 @@ dump_state(struct list_head *schedule_list)
          }
  }
  
+static uint32_t waddr_latency(uint32_t waddr, uint64_t after)
+{
+        if (waddr < 32)
+                return 2;
+
+        /* Apply some huge latency between texture fetch requests and getting
+         * their results back.
+         */
+        if (waddr == QPU_W_TMU0_S) {
+                if (QPU_GET_FIELD(after, QPU_SIG) == QPU_SIG_LOAD_TMU0)
+                        return 100;
+        }
+        if (waddr == QPU_W_TMU1_S) {
+                if (QPU_GET_FIELD(after, QPU_SIG) == QPU_SIG_LOAD_TMU1)
+                        return 100;
+        }
+
+        switch(waddr) {
+        case QPU_W_SFU_RECIP:
+        case QPU_W_SFU_RECIPSQRT:
+        case QPU_W_SFU_EXP:
+        case QPU_W_SFU_LOG:
+                return 3;
+        default:
+                return 1;
+        }
+}
+
+static uint32_t
+instruction_latency(struct schedule_node *before, struct schedule_node *after)
+{
+        uint64_t before_inst = before->inst->inst;
+        uint64_t after_inst = after->inst->inst;
+
+        return MAX2(waddr_latency(QPU_GET_FIELD(before_inst, QPU_WADDR_ADD),
+                                  after_inst),
+                    waddr_latency(QPU_GET_FIELD(before_inst, QPU_WADDR_MUL),
+                                  after_inst));
+}
+
  /** Recursive computation of the delay member of a node. */
  static void
  compute_delay(struct schedule_node *n)
@@ -635,7 +675,8 @@ compute_delay(struct schedule_node *n)
                          if (!n->children[i].node->delay)
                                  compute_delay(n->children[i].node);
                          n->delay = MAX2(n->delay,
-                                        n->children[i].node->delay + n->latency);
+                                        n->children[i].node->delay +
+                                        instruction_latency(n, n->children[i].node));
                  }
          }
  }
@@ -664,9 +705,14 @@ mark_instruction_scheduled(struct list_head *schedule_list,
                   * immediately after (or paired with!) the thing reading the
                   * destination.
                   */
-                int latency_from_previous = war_only ? 0 : node->latency;
+                uint32_t latency = 0;
+                if (!war_only) {
+                        latency = instruction_latency(node,
+                                                      node->children[i].node);
+                }
+
                  child->unblocked_time = MAX2(child->unblocked_time,
-                                             time + latency_from_previous);
+                                             time + latency);
                  child->parent_count--;
                  if (child->parent_count == 0)
                          list_add(&child->link, schedule_list);
@@ -799,33 +845,6 @@ schedule_instructions(struct vc4_compile *c, struct list_head *schedule_list)
          return time;
  }
  
-static uint32_t waddr_latency(uint32_t waddr)
-{
-        if (waddr < 32)
-                return 2;
-
-        /* Some huge number, really. */
-        if (waddr >= QPU_W_TMU0_S && waddr <= QPU_W_TMU1_B)
-                return 100;
-
-        switch(waddr) {
-        case QPU_W_SFU_RECIP:
-        case QPU_W_SFU_RECIPSQRT:
-        case QPU_W_SFU_EXP:
-        case QPU_W_SFU_LOG:
-                return 3;
-        default:
-                return 1;
-        }
-}
-
-static uint32_t
-instruction_latency(uint64_t inst)
-{
-        return MAX2(waddr_latency(QPU_GET_FIELD(inst, QPU_WADDR_ADD)),
-                    waddr_latency(QPU_GET_FIELD(inst, QPU_WADDR_MUL)));
-}
-
  uint32_t
  qpu_schedule_instructions(struct vc4_compile *c)
  {
@@ -852,7 +871,6 @@ qpu_schedule_instructions(struct vc4_compile *c)
                  struct schedule_node *n = rzalloc(mem_ctx, struct schedule_node);
  
                  n->inst = inst;
-                n->latency = instruction_latency(inst->inst);
  
                  if (reads_uniform(inst->inst)) {
                          n->uniform = next_uniform++;
author	Eric Anholt <eric@anholt.net>
	Fri, 18 Dec 2015 19:30:30 +0000 (11:30 -0800)
committer	Eric Anholt <eric@anholt.net>
	Sat, 19 Dec 2015 01:09:03 +0000 (17:09 -0800)