lima/gp: Fix problem with complex moves

author Connor Abbott <cwabbott0@gmail.com>

Sat, 11 May 2019 16:43:30 +0000 (18:43 +0200)

committer Connor Abbott <cwabbott0@gmail.com>

Thu, 18 Jul 2019 12:33:23 +0000 (14:33 +0200)
author Connor Abbott <cwabbott0@gmail.com>
Sat, 11 May 2019 16:43:30 +0000 (18:43 +0200)
committer Connor Abbott <cwabbott0@gmail.com>
Thu, 18 Jul 2019 12:33:23 +0000 (14:33 +0200)
diff --git a/src/gallium/drivers/lima/ir/gp/gpir.h b/src/gallium/drivers/lima/ir/gp/gpir.h

index 6a1d533977b310d674916322a7640338b878606e..de571c3c1c4e6d36aa9148468338ed5ac7fb018f 100644 (file)
--- a/src/gallium/drivers/lima/ir/gp/gpir.h
+++ b/src/gallium/drivers/lima/ir/gp/gpir.h
@@ -174,6 +174,7 @@ typedef struct gpir_node {
           bool ready;
           bool inserted;
           bool max_node, next_max_node;
+         bool complex_allowed;
        } sched;
        struct {
           int parent_index;
@@ -288,16 +289,22 @@ typedef struct gpir_instr {
      * (3) There is a store instruction scheduled, but not its child.
      *
      * The complex slot cannot be used for a move in case (1), since it only
-    * has a FIFO depth of 1, but it can be used for (2) and (3). In order to
-    * ensure that we have enough space for all three, we maintain the
-    * following invariants:
+    * has a FIFO depth of 1, but it can be used for (2) as well as (3) as long
+    * as the uses aren't in certain slots. It turns out that we don't have to
+    * worry about nodes that can't use the complex slot for (2), since there
+    * are at most 4 uses 1 cycle ago that can't use the complex slot, but we
+    * do have to worry about (3). This means tracking stores whose children
+    * cannot be in the complex slot. In order to ensure that we have enough
+    * space for all three, we maintain the following invariants:
      *
      * (1) alu_num_slot_free >= alu_num_slot_needed_by_store +
      *       alu_num_slot_needed_by_max +
      *       alu_num_slot_needed_by_next_max
-    * (2) alu_non_cplx_slot_free >= alu_num_slot_needed_by_max
+    * (2) alu_non_cplx_slot_free >= alu_num_slot_needed_by_max +
+    *       alu_num_slot_neede_by_non_cplx_store
      */
     int alu_num_slot_needed_by_store;
+   int alu_num_slot_needed_by_non_cplx_store;
     int alu_num_slot_needed_by_max;
     int alu_num_slot_needed_by_next_max;
  
diff --git a/src/gallium/drivers/lima/ir/gp/instr.c b/src/gallium/drivers/lima/ir/gp/instr.c

index 228d145928081bd16fce622d280956deeb5dd737..e07a2c9b7c266b175b82d84d8b5a355403da7c79 100644 (file)
--- a/src/gallium/drivers/lima/ir/gp/instr.c
+++ b/src/gallium/drivers/lima/ir/gp/instr.c
@@ -85,10 +85,15 @@ static bool gpir_instr_insert_alu_check(gpir_instr *instr, gpir_node *node)
     if (!gpir_instr_check_acc_same_op(instr, node, node->sched.pos))
        return false;
  
+   if (node->sched.next_max_node && !node->sched.complex_allowed &&
+       node->sched.pos == GPIR_INSTR_SLOT_COMPLEX)
+      return false;
+
     int consume_slot = gpir_instr_get_consume_slot(instr, node);
     int non_cplx_consume_slot =
        node->sched.pos == GPIR_INSTR_SLOT_COMPLEX ? 0 : consume_slot;
     int store_reduce_slot = 0;
+   int non_cplx_store_reduce_slot = 0;
     int max_reduce_slot = node->sched.max_node ? 1 : 0;
     int next_max_reduce_slot = node->sched.next_max_node ? 1 : 0;
  
@@ -100,6 +105,8 @@ static bool gpir_instr_insert_alu_check(gpir_instr *instr, gpir_node *node)
        gpir_store_node *s = gpir_node_to_store(instr->slots[i]);
        if (s && s->child == node) {
           store_reduce_slot = 1;
+         if (node->sched.next_max_node && !node->sched.complex_allowed)
+            non_cplx_store_reduce_slot = 1;
           break;
        }
     }
@@ -118,7 +125,8 @@ static bool gpir_instr_insert_alu_check(gpir_instr *instr, gpir_node *node)
     }
  
     int non_cplx_slot_difference =
-       instr->alu_num_slot_needed_by_max - max_reduce_slot -
+       instr->alu_num_slot_needed_by_max - max_reduce_slot +
+       instr->alu_num_slot_needed_by_non_cplx_store - non_cplx_store_reduce_slot -
         (instr->alu_non_cplx_slot_free - non_cplx_consume_slot);
     if (non_cplx_slot_difference > 0) {
        gpir_debug("failed %d because of alu slot\n", node->index);
@@ -131,6 +139,7 @@ static bool gpir_instr_insert_alu_check(gpir_instr *instr, gpir_node *node)
     instr->alu_num_slot_free -= consume_slot;
     instr->alu_non_cplx_slot_free -= non_cplx_consume_slot;
     instr->alu_num_slot_needed_by_store -= store_reduce_slot;
+   instr->alu_num_slot_needed_by_non_cplx_store -= non_cplx_store_reduce_slot;
     instr->alu_num_slot_needed_by_max -= max_reduce_slot;
     instr->alu_num_slot_needed_by_next_max -= next_max_reduce_slot;
     return true;
@@ -144,6 +153,8 @@ static void gpir_instr_remove_alu(gpir_instr *instr, gpir_node *node)
        gpir_store_node *s = gpir_node_to_store(instr->slots[i]);
        if (s && s->child == node) {
           instr->alu_num_slot_needed_by_store++;
+         if (node->sched.next_max_node && !node->sched.complex_allowed)
+            instr->alu_num_slot_needed_by_non_cplx_store++;
           break;
        }
     }
@@ -296,7 +307,7 @@ static bool gpir_instr_insert_store_check(gpir_instr *instr, gpir_node *node)
     }
  
     /* Check the invariants documented in gpir.h, similar to the ALU case.
-    * Since the only thing that changes is alu_num_slot_needed_by_store, we
+    * When the only thing that changes is alu_num_slot_needed_by_store, we
      * can get away with just checking the first one.
      */
     int slot_difference = instr->alu_num_slot_needed_by_store + 1
@@ -308,6 +319,25 @@ static bool gpir_instr_insert_store_check(gpir_instr *instr, gpir_node *node)
        return false;
     }
  
+   if (store->child->sched.next_max_node &&
+       !store->child->sched.complex_allowed) {
+      /* The child of the store is already partially ready, and has a use one
+       * cycle ago that disqualifies it (or a move replacing it) from being
+       * put in the complex slot. Therefore we have to check the non-complex
+       * invariant.
+       */
+      int non_cplx_slot_difference =
+          instr->alu_num_slot_needed_by_max +
+          instr->alu_num_slot_needed_by_non_cplx_store + 1 -
+          instr->alu_non_cplx_slot_free;
+      if (non_cplx_slot_difference > 0) {
+         instr->non_cplx_slot_difference = non_cplx_slot_difference;
+         return false;
+      }
+
+      instr->alu_num_slot_needed_by_non_cplx_store++;
+   }
+
     instr->alu_num_slot_needed_by_store++;
  
  out:
@@ -346,6 +376,11 @@ static void gpir_instr_remove_store(gpir_instr *instr, gpir_node *node)
  
     instr->alu_num_slot_needed_by_store--;
  
+   if (store->child->sched.next_max_node &&
+       !store->child->sched.complex_allowed) {
+      instr->alu_num_slot_needed_by_non_cplx_store--;
+   }
+
  out:
     if (!instr->slots[other_slot])
        instr->store_content[component >> 1] = GPIR_INSTR_STORE_NONE;
diff --git a/src/gallium/drivers/lima/ir/gp/scheduler.c b/src/gallium/drivers/lima/ir/gp/scheduler.c

index 60076daa3c4166f7f4a0fcf6e48532b5f37fde3d..7149e1dd213a4b8e5ef97902a978903ef9112601 100644 (file)
--- a/src/gallium/drivers/lima/ir/gp/scheduler.c
+++ b/src/gallium/drivers/lima/ir/gp/scheduler.c
@@ -640,6 +640,7 @@ static gpir_node *create_move(sched_ctx *ctx, gpir_node *node)
     move->node.sched.dist = node->sched.dist;
     move->node.sched.max_node = node->sched.max_node;
     move->node.sched.next_max_node = node->sched.next_max_node;
+   move->node.sched.complex_allowed = node->sched.complex_allowed;
  
     gpir_debug("create move %d for %d\n", move->node.index, node->index);
  
@@ -974,6 +975,7 @@ static bool try_spill_node(sched_ctx *ctx, gpir_node *node)
        store->child = node;
        store->node.sched.max_node = false;
        store->node.sched.next_max_node = false;
+      store->node.sched.complex_allowed = false;
        store->node.sched.pos = -1;
        store->node.sched.instr = NULL;
        store->node.sched.inserted = false;
@@ -1086,6 +1088,44 @@ static int gpir_get_min_end_as_move(gpir_node *node)
     return min;
  }
  
+/* The second source for add0, add1, mul0, and mul1 units cannot be complex.
+ * The hardware overwrites the add second sources with 0 and mul second
+ * sources with 1. This can be a problem if we need to insert more next-max
+ * moves but we only have values that can't use the complex unit for moves.
+ *
+ * Fortunately, we only need to insert a next-max move if there are more than
+ * 5 next-max nodes, but there are only 4 sources in the previous instruction
+ * that make values not complex-capable, which means there can be at most 4
+ * non-complex-capable values. Hence there will always be at least two values
+ * that can be rewritten to use a move in the complex slot. However, we have
+ * to be careful not to waste those values by putting both of them in a
+ * non-complex slot. This is handled for us by gpir_instr, which will reject
+ * such instructions. We just need to tell it which nodes can use complex, and
+ * it will do the accounting to figure out what is safe.
+ */
+
+static bool can_use_complex(gpir_node *node)
+{
+   gpir_node_foreach_succ(node, dep) {
+      if (dep->type != GPIR_DEP_INPUT)
+         continue;
+
+      gpir_node *succ = dep->succ;
+      if (succ->type != gpir_node_type_alu)
+         continue;
+
+      gpir_alu_node *alu = gpir_node_to_alu(succ);
+      if (alu->num_child >= 2 && alu->children[1] == node)
+         return false;
+
+      /* complex1 puts its third source in the fourth slot */
+      if (alu->node.op == gpir_op_complex1 && alu->children[2] == node)
+         return false;
+   }
+
+   return true;
+}
+
  /* Initialize node->sched.max_node and node->sched.next_max_node for every
   * input node on the ready list. We should only need to do this once per
   * instruction, at the beginning, since we never add max nodes to the ready
@@ -1104,6 +1144,8 @@ static void sched_find_max_nodes(sched_ctx *ctx)
        int min_end_move = gpir_get_min_end_as_move(node);
        node->sched.max_node = (min_end_move == ctx->instr->index);
        node->sched.next_max_node = (min_end_move == ctx->instr->index + 1);
+      if (node->sched.next_max_node)
+         node->sched.complex_allowed = can_use_complex(node);
  
        if (node->sched.max_node)
           ctx->instr->alu_num_slot_needed_by_max++;
@@ -1120,6 +1162,7 @@ static void verify_max_nodes(sched_ctx *ctx)
     int alu_num_slot_needed_by_max = 0;
     int alu_num_slot_needed_by_next_max = -5;
     int alu_num_slot_needed_by_store = 0;
+   int alu_num_slot_needed_by_non_cplx_store = 0;
  
     list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
        if (!gpir_is_input_node(node))
@@ -1129,15 +1172,20 @@ static void verify_max_nodes(sched_ctx *ctx)
           alu_num_slot_needed_by_max++;
        if (node->sched.next_max_node)
           alu_num_slot_needed_by_next_max++;
-      if (used_by_store(node, ctx->instr))
+      if (used_by_store(node, ctx->instr)) {
           alu_num_slot_needed_by_store++;
+         if (node->sched.next_max_node && !node->sched.complex_allowed)
+            alu_num_slot_needed_by_non_cplx_store++;
+      }
     }
  
     assert(ctx->instr->alu_num_slot_needed_by_max == alu_num_slot_needed_by_max);
     assert(ctx->instr->alu_num_slot_needed_by_next_max == alu_num_slot_needed_by_next_max);
     assert(ctx->instr->alu_num_slot_needed_by_store == alu_num_slot_needed_by_store);
-   assert(ctx->instr->alu_num_slot_free >= alu_num_slot_needed_by_store + alu_num_slot_needed_by_max + alu_num_slot_needed_by_next_max);
-   assert(ctx->instr->alu_non_cplx_slot_free >= alu_num_slot_needed_by_max);
+   assert(ctx->instr->alu_num_slot_needed_by_non_cplx_store ==
+          alu_num_slot_needed_by_non_cplx_store);
+   assert(ctx->instr->alu_num_slot_free >= alu_num_slot_needed_by_store + alu_num_slot_needed_by_max + MAX2(alu_num_slot_needed_by_next_max, 0));
+   assert(ctx->instr->alu_non_cplx_slot_free >= alu_num_slot_needed_by_max + alu_num_slot_needed_by_non_cplx_store);
  }
  
  static bool try_node(sched_ctx *ctx)
@@ -1207,6 +1255,22 @@ static void place_move(sched_ctx *ctx, gpir_node *node)
     assert(score != INT_MIN);
  }
  
+/* For next-max nodes, not every node can be offloaded to a move in the
+ * complex slot. If we run out of non-complex slots, then such nodes cannot
+ * have moves placed for them. There should always be sufficient
+ * complex-capable nodes so that this isn't a problem.
+ */
+static bool can_place_move(sched_ctx *ctx, gpir_node *node)
+{
+   if (!node->sched.next_max_node)
+      return true;
+
+   if (node->sched.complex_allowed)
+      return true;
+
+   return ctx->instr->alu_non_cplx_slot_free > 0;
+}
+
  static bool sched_move(sched_ctx *ctx)
  {
     list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
@@ -1250,6 +1314,9 @@ static bool sched_move(sched_ctx *ctx)
      */
     if (ctx->instr->alu_num_slot_free > 0) {
        list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
+         if (!can_place_move(ctx, node))
+            continue;
+
           if (node->sched.next_max_node && node->op == gpir_op_complex1 &&
               node->sched.ready) {
              bool skip = true;
@@ -1284,6 +1351,9 @@ static bool sched_move(sched_ctx *ctx)
      */
     if (ctx->instr->alu_num_slot_free > 0) {
        list_for_each_entry_rev(gpir_node, node, &ctx->ready_list, list) {
+         if (!can_place_move(ctx, node))
+            continue;
+
           if (node->sched.next_max_node &&
               !(node->op == gpir_op_complex1 && node->sched.ready)) {
              place_move(ctx, node);
@@ -1298,6 +1368,9 @@ static bool sched_move(sched_ctx *ctx)
  
     if (ctx->instr->alu_num_slot_needed_by_next_max > 0) {
        list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
+         if (!can_place_move(ctx, node))
+            continue;
+
           if (node->sched.next_max_node) {
              place_move(ctx, node);
              return true;
@@ -1546,6 +1619,7 @@ bool gpir_schedule_prog(gpir_compiler *comp)
           node->sched.physreg_store = NULL;
           node->sched.ready = false;
           node->sched.inserted = false;
+         node->sched.complex_allowed = false;
           node->sched.max_node = false;
           node->sched.next_max_node = false;
        }
author	Connor Abbott <cwabbott0@gmail.com>
	Sat, 11 May 2019 16:43:30 +0000 (18:43 +0200)
committer	Connor Abbott <cwabbott0@gmail.com>
	Thu, 18 Jul 2019 12:33:23 +0000 (14:33 +0200)
src/gallium/drivers/lima/ir/gp/gpir.h		patch \| blob \| history
src/gallium/drivers/lima/ir/gp/instr.c		patch \| blob \| history
src/gallium/drivers/lima/ir/gp/scheduler.c		patch \| blob \| history