From c2f48d8f324a7577a63f7f4ad4628564f02687b0 Mon Sep 17 00:00:00 2001
From: Connor Abbott
Date: Sun, 28 Jul 2019 01:13:10 +0200
Subject: [PATCH] lima/gpir: Always schedule complex2 and *_impl right after
 complex1

See https://gitlab.freedesktop.org/lima/mesa/issues/94 for the gory
details of why this is needed. For *_impl this is easy, since it never
increases register pressure and it goes in the complex slot hence it
never counts against max nodes. It's a bit more challenging for
complex2, since it does count against max nodes, so we need to change
the reservation logic to reserve an extra slot for complex2 when
scheduling complex1. This second part isn't strictly necessary yet, but
it will be for exp2.

Signed-off-by: Connor Abbott
Acked-by: Qiang Yu
---
 src/gallium/drivers/lima/ir/gp/gpir.h      | 11 ++++++--
 src/gallium/drivers/lima/ir/gp/instr.c     | 15 +++++++---
 src/gallium/drivers/lima/ir/gp/node.c      |  3 ++
 src/gallium/drivers/lima/ir/gp/scheduler.c | 33 ++++++++++++++++------
 4 files changed, 47 insertions(+), 15 deletions(-)

diff --git a/src/gallium/drivers/lima/ir/gp/gpir.h b/src/gallium/drivers/lima/ir/gp/gpir.h
index de571c3c1c4..e7707814b7c 100644
--- a/src/gallium/drivers/lima/ir/gp/gpir.h
+++ b/src/gallium/drivers/lima/ir/gp/gpir.h
@@ -120,6 +120,7 @@ typedef struct {
    int *slots;
    gpir_node_type type;
    bool spillless;
+   bool schedule_first;
    bool may_consume_two_slots;
 } gpir_op_info;
 
@@ -299,14 +300,20 @@ typedef struct gpir_instr {
     *
     * (1) alu_num_slot_free >= alu_num_slot_needed_by_store +
     *                          alu_num_slot_needed_by_max +
-    *                          alu_num_slot_needed_by_next_max
+    *                          max(alu_num_unscheduled_next_max - alu_max_allowed_next_max, 0)
     * (2) alu_non_cplx_slot_free >= alu_num_slot_needed_by_max +
     *                               alu_num_slot_neede_by_non_cplx_store
+    *
+    * alu_max_allowed_next_max is normally 5 (since there can be at most 5 max
+    * nodes for the next instruction) but when there is a complex1 node in
+    * this instruction it reduces to 4 to reserve a slot for complex2 in the
+    * next instruction.
     */
    int alu_num_slot_needed_by_store;
    int alu_num_slot_needed_by_non_cplx_store;
    int alu_num_slot_needed_by_max;
-   int alu_num_slot_needed_by_next_max;
+   int alu_num_unscheduled_next_max;
+   int alu_max_allowed_next_max;
 
    /* Used to communicate to the scheduler how many slots need to be cleared
     * up in order to satisfy the invariants.
diff --git a/src/gallium/drivers/lima/ir/gp/instr.c b/src/gallium/drivers/lima/ir/gp/instr.c
index e07a2c9b7c2..45e9d817143 100644
--- a/src/gallium/drivers/lima/ir/gp/instr.c
+++ b/src/gallium/drivers/lima/ir/gp/instr.c
@@ -37,6 +37,7 @@ gpir_instr *gpir_instr_create(gpir_block *block)
    instr->index = block->sched.instr_index++;
    instr->alu_num_slot_free = 6;
    instr->alu_non_cplx_slot_free = 5;
+   instr->alu_max_allowed_next_max = 5;
 
    list_add(&instr->list, &block->instr_list);
    return instr;
@@ -96,6 +97,8 @@ static bool gpir_instr_insert_alu_check(gpir_instr *instr, gpir_node *node)
    int non_cplx_store_reduce_slot = 0;
    int max_reduce_slot = node->sched.max_node ? 1 : 0;
    int next_max_reduce_slot = node->sched.next_max_node ? 1 : 0;
+   int alu_new_max_allowed_next_max =
+      node->op == gpir_op_complex1 ? 4 : instr->alu_max_allowed_next_max;
 
    /* check if this node is child of one store node.
    * complex1 won't be any of this instr's store node's child,
@@ -117,7 +120,8 @@ static bool gpir_instr_insert_alu_check(gpir_instr *instr, gpir_node *node)
 
    int slot_difference = instr->alu_num_slot_needed_by_store - store_reduce_slot +
       instr->alu_num_slot_needed_by_max - max_reduce_slot +
-      MAX2(instr->alu_num_slot_needed_by_next_max - next_max_reduce_slot, 0) -
+      MAX2(instr->alu_num_unscheduled_next_max - next_max_reduce_slot -
+           alu_new_max_allowed_next_max, 0) -
       (instr->alu_num_slot_free - consume_slot);
    if (slot_difference > 0) {
       gpir_debug("failed %d because of alu slot\n", node->index);
@@ -141,7 +145,8 @@ static bool gpir_instr_insert_alu_check(gpir_instr *instr, gpir_node *node)
    instr->alu_num_slot_needed_by_store -= store_reduce_slot;
    instr->alu_num_slot_needed_by_non_cplx_store -= non_cplx_store_reduce_slot;
    instr->alu_num_slot_needed_by_max -= max_reduce_slot;
-   instr->alu_num_slot_needed_by_next_max -= next_max_reduce_slot;
+   instr->alu_num_unscheduled_next_max -= next_max_reduce_slot;
+   instr->alu_max_allowed_next_max = alu_new_max_allowed_next_max;
 
    return true;
 }
@@ -165,7 +170,9 @@ static void gpir_instr_remove_alu(gpir_instr *instr, gpir_node *node)
    if (node->sched.max_node)
       instr->alu_num_slot_needed_by_max++;
    if (node->sched.next_max_node)
-      instr->alu_num_slot_needed_by_next_max++;
+      instr->alu_num_unscheduled_next_max++;
+   if (node->op == gpir_op_complex1)
+      instr->alu_max_allowed_next_max = 5;
 }
 
 static bool gpir_instr_insert_reg0_check(gpir_instr *instr, gpir_node *node)
@@ -312,7 +319,7 @@ static bool gpir_instr_insert_store_check(gpir_instr *instr, gpir_node *node)
     */
    int slot_difference = instr->alu_num_slot_needed_by_store + 1 +
       instr->alu_num_slot_needed_by_max +
-      MAX2(instr->alu_num_slot_needed_by_next_max, 0) -
+      MAX2(instr->alu_num_unscheduled_next_max - instr->alu_max_allowed_next_max, 0) -
       instr->alu_num_slot_free;
    if (slot_difference > 0) {
       instr->slot_difference = slot_difference;
diff --git a/src/gallium/drivers/lima/ir/gp/node.c b/src/gallium/drivers/lima/ir/gp/node.c
index decda5f1246..a8706627f38 100644
--- a/src/gallium/drivers/lima/ir/gp/node.c
+++ b/src/gallium/drivers/lima/ir/gp/node.c
@@ -58,6 +58,7 @@ const gpir_op_info gpir_op_infos[] = {
       .name = "complex2",
       .slots = (int []) { GPIR_INSTR_SLOT_MUL0, GPIR_INSTR_SLOT_END },
       .spillless = true,
+      .schedule_first = true,
    },
    [gpir_op_add] = {
       .name = "add",
@@ -154,11 +155,13 @@ const gpir_op_info gpir_op_infos[] = {
       .name = "rcp_impl",
       .slots = (int []) { GPIR_INSTR_SLOT_COMPLEX, GPIR_INSTR_SLOT_END },
       .spillless = true,
+      .schedule_first = true,
    },
    [gpir_op_rsqrt_impl] = {
       .name = "rsqrt_impl",
       .slots = (int []) { GPIR_INSTR_SLOT_COMPLEX, GPIR_INSTR_SLOT_END },
       .spillless = true,
+      .schedule_first = true,
    },
    [gpir_op_load_uniform] = {
       .name = "ld_uni",
diff --git a/src/gallium/drivers/lima/ir/gp/scheduler.c b/src/gallium/drivers/lima/ir/gp/scheduler.c
index 35925a1af51..f06089b7992 100644
--- a/src/gallium/drivers/lima/ir/gp/scheduler.c
+++ b/src/gallium/drivers/lima/ir/gp/scheduler.c
@@ -441,7 +441,8 @@ static void schedule_insert_ready_list(sched_ctx *ctx,
    struct list_head *insert_pos = &ctx->ready_list;
 
    list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
-      if (insert_node->sched.dist > node->sched.dist) {
+      if (insert_node->sched.dist > node->sched.dist ||
+          gpir_op_infos[insert_node->op].schedule_first) {
          insert_pos = &node->list;
          break;
       }
@@ -916,7 +917,7 @@ static void spill_node(sched_ctx *ctx, gpir_node *node, gpir_store_node *store)
       }
       if (node->sched.next_max_node) {
          node->sched.next_max_node = false;
-         ctx->instr->alu_num_slot_needed_by_next_max--;
+         ctx->instr->alu_num_unscheduled_next_max--;
       }
    }
 }
@@ -1153,7 +1154,7 @@ static bool can_use_complex(gpir_node *node)
 
 static void sched_find_max_nodes(sched_ctx *ctx)
 {
-   ctx->instr->alu_num_slot_needed_by_next_max = -5;
+   ctx->instr->alu_num_unscheduled_next_max = 0;
    ctx->instr->alu_num_slot_needed_by_max = 0;
 
    list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
@@ -1169,7 +1170,7 @@ static void sched_find_max_nodes(sched_ctx *ctx)
       if (node->sched.max_node)
         ctx->instr->alu_num_slot_needed_by_max++;
       if (node->sched.next_max_node)
-        ctx->instr->alu_num_slot_needed_by_next_max++;
+        ctx->instr->alu_num_unscheduled_next_max++;
    }
 }
 
@@ -1179,9 +1180,10 @@ static void sched_find_max_nodes(sched_ctx *ctx)
 static void verify_max_nodes(sched_ctx *ctx)
 {
    int alu_num_slot_needed_by_max = 0;
-   int alu_num_slot_needed_by_next_max = -5;
+   int alu_num_unscheduled_next_max = 0;
    int alu_num_slot_needed_by_store = 0;
    int alu_num_slot_needed_by_non_cplx_store = 0;
+   int alu_max_allowed_next_max = 5;
 
    list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
       if (!gpir_is_input_node(node))
@@ -1190,7 +1192,7 @@ static void sched_find_max_nodes(sched_ctx *ctx)
       if (node->sched.max_node)
          alu_num_slot_needed_by_max++;
       if (node->sched.next_max_node)
-         alu_num_slot_needed_by_next_max++;
+         alu_num_unscheduled_next_max++;
       if (used_by_store(node, ctx->instr)) {
          alu_num_slot_needed_by_store++;
         if (node->sched.next_max_node && !node->sched.complex_allowed)
@@ -1198,12 +1200,17 @@ static void sched_find_max_nodes(sched_ctx *ctx)
       }
    }
 
+   if (ctx->instr->slots[GPIR_INSTR_SLOT_MUL0] &&
+       ctx->instr->slots[GPIR_INSTR_SLOT_MUL0]->op == gpir_op_complex1)
+      alu_max_allowed_next_max = 4;
+
    assert(ctx->instr->alu_num_slot_needed_by_max == alu_num_slot_needed_by_max);
-   assert(ctx->instr->alu_num_slot_needed_by_next_max == alu_num_slot_needed_by_next_max);
+   assert(ctx->instr->alu_num_unscheduled_next_max == alu_num_unscheduled_next_max);
+   assert(ctx->instr->alu_max_allowed_next_max == alu_max_allowed_next_max);
    assert(ctx->instr->alu_num_slot_needed_by_store == alu_num_slot_needed_by_store);
    assert(ctx->instr->alu_num_slot_needed_by_non_cplx_store == alu_num_slot_needed_by_non_cplx_store);
 
-   assert(ctx->instr->alu_num_slot_free >= alu_num_slot_needed_by_store + alu_num_slot_needed_by_max + MAX2(alu_num_slot_needed_by_next_max, 0));
+   assert(ctx->instr->alu_num_slot_free >= alu_num_slot_needed_by_store + alu_num_slot_needed_by_max + MAX2(alu_num_unscheduled_next_max - alu_max_allowed_next_max, 0));
    assert(ctx->instr->alu_non_cplx_slot_free >= alu_num_slot_needed_by_max + alu_num_slot_needed_by_non_cplx_store);
 }
 
@@ -1237,6 +1244,13 @@ static bool try_node(sched_ctx *ctx)
          score = schedule_try_node(ctx, node, true);
       }
 
+      /* schedule_first nodes must be scheduled if possible */
+      if (gpir_op_infos[node->op].schedule_first && score != INT_MIN) {
+         best_node = node;
+         best_score = score;
+         break;
+      }
+
       if (score > best_score) {
          best_score = score;
         best_node = node;
@@ -1382,7 +1396,8 @@ static bool sched_move(sched_ctx *ctx)
     * need to insert the move.
     */
 
-   if (ctx->instr->alu_num_slot_needed_by_next_max > 0) {
+   if (ctx->instr->alu_num_unscheduled_next_max >
+       ctx->instr->alu_max_allowed_next_max) {
       list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
          if (!can_place_move(ctx, node))
             continue;
-- 
2.30.2
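A quick illustration of the reservation arithmetic the patch introduces (this sketch is not taken from the patch: slots_reserved_for_next_max, has_complex1 and the locally defined MAX2 are made-up names for the example). The number of ALU slots that must stay free for the next instruction's max nodes is the unscheduled next-max count minus the allowed cap, clamped at zero, and the cap drops from 5 to 4 whenever complex1 occupies the current instruction:

#include <stdio.h>

#define MAX2(a, b) ((a) > (b) ? (a) : (b))

/* Slots that must stay free for the *next* instruction's max nodes:
 * unscheduled next-max count minus the allowed cap, clamped at zero.
 * The cap is normally 5, but drops to 4 when complex1 is in the current
 * instruction, reserving one slot for complex2. */
static int slots_reserved_for_next_max(int unscheduled_next_max,
                                       int has_complex1)
{
   int max_allowed_next_max = has_complex1 ? 4 : 5;
   return MAX2(unscheduled_next_max - max_allowed_next_max, 0);
}

int main(void)
{
   printf("%d\n", slots_reserved_for_next_max(6, 0)); /* prints 1 */
   printf("%d\n", slots_reserved_for_next_max(6, 1)); /* prints 2 */
   return 0;
}

With six pending next-max nodes, a cap of 5 keeps one ALU slot free; once complex1 is scheduled the cap of 4 keeps two slots free, so complex2 is guaranteed to fit in the following instruction.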