From 7b2f195d0b34332c2ed0d6550f839cb1922caf0c Mon Sep 17 00:00:00 2001
From: Erico Nunes <nunes.erico@gmail.com>
Date: Wed, 28 Aug 2019 01:07:55 +0200
Subject: [PATCH] lima/ppir: optimizations in regalloc spilling code

Avoid creating unnecessary instructions for the load/store temp nodes
when not required, to further reduce register pressure.

The store_temp operation seems to be unable to do any spilling.
At least the offline shader seems to never output instructions accessing
swizzled components, and attempting to output that in ppir results in
errors. So, force spilled registers to allocate a full vec4 register.
This seems to be the optimal way as it is possible to always keep stores
and temps in a single instruction that can be pipelined.

Signed-off-by: Erico Nunes <nunes.erico@gmail.com>
Reviewed-by: Vasily Khoruzhick <anarsoul@gmail.com>
---
 src/gallium/drivers/lima/ir/pp/regalloc.c | 178 +++++++++++-----------
 1 file changed, 88 insertions(+), 90 deletions(-)

diff --git a/src/gallium/drivers/lima/ir/pp/regalloc.c b/src/gallium/drivers/lima/ir/pp/regalloc.c
index f6740c0ebb0..70a178deff8 100644
--- a/src/gallium/drivers/lima/ir/pp/regalloc.c
+++ b/src/gallium/drivers/lima/ir/pp/regalloc.c
@@ -249,35 +249,49 @@ static bool create_new_instr_before(ppir_block *block, ppir_instr *ref,
    return true;
 }
 
-static ppir_alu_node* ppir_update_spilled_src(ppir_compiler *comp,
-                                              ppir_block *block,
-                                              ppir_node *node, ppir_src *src,
-                                              ppir_alu_node *move_alu)
+static bool ppir_update_spilled_src(ppir_compiler *comp, ppir_block *block,
+                                    ppir_node *node, ppir_src *src,
+                                    ppir_node **fill_node)
 {
-   /* alu nodes may have multiple references to the same value.
-    * try to avoid unnecessary loads for the same alu node by
+   /* nodes might have multiple references to the same value.
+    * avoid creating unnecessary loads for the same fill by
     * saving the node resulting from the temporary load */
-   if (move_alu)
+   if (*fill_node)
       goto update_src;
 
+   int num_components = src->reg->num_components;
+
    /* alloc new node to load value */
    ppir_node *load_node = ppir_node_create(block, ppir_op_load_temp, -1, 0);
    if (!load_node)
-      return NULL;
+      return false;
    list_addtail(&load_node->list, &node->list);
    comp->num_fills++;
 
    ppir_load_node *load = ppir_node_to_load(load_node);
 
    load->index = -comp->prog->stack_size; /* index sizes are negative */
-   load->num_components = 4;
+   load->num_components = num_components;
 
    ppir_dest *ld_dest = &load->dest;
    ld_dest->type = ppir_target_pipeline;
    ld_dest->pipeline = ppir_pipeline_reg_uniform;
-   ld_dest->write_mask = 0xf;
+   ld_dest->write_mask = u_bit_consecutive(0, num_components);
+
+   /* If the uniform slot is empty, we can insert the load_temp
+    * there and use it directly. Exceptionally, if the node is in the
+    * varying or texld slot, this doesn't work. */
+   if (!node->instr->slots[PPIR_INSTR_SLOT_UNIFORM] &&
+        node->instr_pos != PPIR_INSTR_SLOT_VARYING &&
+        node->instr_pos != PPIR_INSTR_SLOT_TEXLD) {
+      ppir_node_target_assign(src, load_node);
+      *fill_node = load_node;
+      return ppir_instr_insert_node(node->instr, load_node);
+   }
 
-   create_new_instr_before(block, node->instr, load_node);
+   /* Uniform slot was taken, so fall back to a new instruction with a mov */
+   if (!create_new_instr_before(block, node->instr, load_node))
+      return false;
 
    /* Create move node */
    ppir_node *move_node = ppir_node_create(block, ppir_op_mov, -1 , 0);
@@ -285,7 +299,7 @@ static ppir_alu_node* ppir_update_spilled_src(ppir_compiler *comp,
       return false;
    list_addtail(&move_node->list, &node->list);
 
-   move_alu = ppir_node_to_alu(move_node);
+   ppir_alu_node *move_alu = ppir_node_to_alu(move_node);
 
    move_alu->num_src = 1;
    move_alu->src->type = ppir_target_pipeline;
@@ -295,11 +309,11 @@ static ppir_alu_node* ppir_update_spilled_src(ppir_compiler *comp,
 
    ppir_dest *alu_dest = &move_alu->dest;
    alu_dest->type = ppir_target_ssa;
-   alu_dest->ssa.num_components = 4;
+   alu_dest->ssa.num_components = num_components;
    alu_dest->ssa.live_in = INT_MAX;
    alu_dest->ssa.live_out = 0;
    alu_dest->ssa.spilled = true;
-   alu_dest->write_mask = 0xf;
+   alu_dest->write_mask = u_bit_consecutive(0, num_components);
 
    list_addtail(&alu_dest->ssa.list, &comp->reg_list);
 
@@ -315,44 +329,23 @@ static ppir_alu_node* ppir_update_spilled_src(ppir_compiler *comp,
    ppir_node_add_dep(node, move_node);
    ppir_node_add_dep(move_node, load_node);
 
-update_src:
-   /* switch node src to use the new ssa instead */
-   src->type = ppir_target_ssa;
-   src->ssa = &move_alu->dest.ssa;
-
-   return move_alu;
-}
+   *fill_node = move_node;
 
-static ppir_reg *create_reg(ppir_compiler *comp, int num_components)
-{
-   ppir_reg *r = rzalloc(comp, ppir_reg);
-   if (!r)
-      return NULL;
-
-   r->num_components = num_components;
-   r->live_in = INT_MAX;
-   r->live_out = 0;
-   r->is_head = false;
-   list_addtail(&r->list, &comp->reg_list);
+update_src:
+   /* switch node src to use the fill node dest */
+   ppir_node_target_assign(src, *fill_node);
 
-   return r;
+   return true;
 }
 
-static bool ppir_update_spilled_dest(ppir_compiler *comp, ppir_block *block,
-                                     ppir_node *node, ppir_dest *dest)
+static bool ppir_update_spilled_dest_load(ppir_compiler *comp, ppir_block *block,
+                                          ppir_node *node)
 {
+   ppir_dest *dest = ppir_node_get_dest(node);
    assert(dest != NULL);
-   ppir_reg *reg = NULL;
-   if (dest->type == ppir_target_register) {
-      reg = dest->reg;
-      reg->num_components = 4;
-      reg->spilled = true;
-   }
-   else {
-      reg = create_reg(comp, 4);
-      reg->spilled = true;
-      list_del(&dest->ssa.list);
-   }
+   assert(dest->type == ppir_target_register);
+   ppir_reg *reg = dest->reg;
+   int num_components = reg->num_components;
 
    /* alloc new node to load value */
    ppir_node *load_node = ppir_node_create(block, ppir_op_load_temp, -1, 0);
@@ -364,13 +357,16 @@ static bool ppir_update_spilled_dest(ppir_compiler *comp, ppir_block *block,
    ppir_load_node *load = ppir_node_to_load(load_node);
 
    load->index = -comp->prog->stack_size; /* index sizes are negative */
-   load->num_components = 4;
+   load->num_components = num_components;
 
    load->dest.type = ppir_target_pipeline;
    load->dest.pipeline = ppir_pipeline_reg_uniform;
-   load->dest.write_mask = 0xf;
+   load->dest.write_mask = u_bit_consecutive(0, num_components);
 
-   create_new_instr_before(block, node->instr, load_node);
+   /* New instruction is needed since we're updating a dest register
+    * and we can't write to the uniform pipeline reg */
+   if (!create_new_instr_before(block, node->instr, load_node))
+      return false;
 
    /* Create move node */
    ppir_node *move_node = ppir_node_create(block, ppir_op_mov, -1 , 0);
@@ -388,7 +384,7 @@ static bool ppir_update_spilled_dest(ppir_compiler *comp, ppir_block *block,
 
    move_alu->dest.type = ppir_target_register;
    move_alu->dest.reg = reg;
-   move_alu->dest.write_mask = 0x0f;
+   move_alu->dest.write_mask = u_bit_consecutive(0, num_components);
 
    if (!ppir_instr_insert_node(load_node->instr, move_node))
       return false;
@@ -401,8 +397,15 @@ static bool ppir_update_spilled_dest(ppir_compiler *comp, ppir_block *block,
    ppir_node_add_dep(node, move_node);
    ppir_node_add_dep(move_node, load_node);
 
-   dest->type = ppir_target_register;
-   dest->reg = reg;
+   return true;
+}
+
+static bool ppir_update_spilled_dest(ppir_compiler *comp, ppir_block *block,
+                                     ppir_node *node)
+{
+   ppir_dest *dest = ppir_node_get_dest(node);
+   assert(dest != NULL);
+   ppir_reg *reg = ppir_dest_get_reg(dest);
 
    /* alloc new node to store value */
    ppir_node *store_node = ppir_node_create(block, ppir_op_store_temp, -1, 0);
@@ -414,10 +417,10 @@ static bool ppir_update_spilled_dest(ppir_compiler *comp, ppir_block *block,
    ppir_store_node *store = ppir_node_to_store(store_node);
 
    store->index = -comp->prog->stack_size; /* index sizes are negative */
-   store->num_components = 4;
+   store->num_components = reg->num_components;
 
-   store->src.type = ppir_target_register;
-   store->src.reg = dest->reg;
+   store->src.type = dest->type;
+   store->src.reg = reg;
 
    /* insert the new node as successor */
    ppir_node_foreach_succ_safe(node, dep) {
@@ -427,9 +430,15 @@ static bool ppir_update_spilled_dest(ppir_compiler *comp, ppir_block *block,
    }
    ppir_node_add_dep(store_node, node);
 
-   create_new_instr_after(block, node->instr, store_node);
+   /* If the store temp slot is empty, we can insert the store_temp
+    * there and use it directly. Exceptionally, if the node is in the
+    * combine slot, this doesn't work. */
+   if (!node->instr->slots[PPIR_INSTR_SLOT_STORE_TEMP] &&
+        node->instr_pos != PPIR_INSTR_SLOT_ALU_COMBINE)
+      return ppir_instr_insert_node(node->instr, store_node);
 
-   return true;
+   /* Not possible to merge store, so fall back to a new instruction */
+   return create_new_instr_after(block, node->instr, store_node);
 }
 
 static bool ppir_regalloc_spill_reg(ppir_compiler *comp, ppir_reg *chosen)
@@ -438,41 +447,28 @@ static bool ppir_regalloc_spill_reg(ppir_compiler *comp, ppir_reg *chosen)
       list_for_each_entry(ppir_node, node, &block->node_list, list) {
 
          ppir_dest *dest = ppir_node_get_dest(node);
-         ppir_reg *reg = NULL;
-         if (dest) {
-            reg = ppir_dest_get_reg(dest);
-            if (reg == chosen)
-               ppir_update_spilled_dest(comp, block, node, dest);
-         }
-
-         switch (node->type) {
-         case ppir_node_type_alu:
-         {
-            /* alu nodes may have multiple references to the same value.
-             * try to avoid unnecessary loads for the same alu node by
-             * saving the node resulting from the temporary load */
-            ppir_alu_node *move_alu = NULL;
-            ppir_alu_node *alu = ppir_node_to_alu(node);
-            for (int i = 0; i < alu->num_src; i++) {
-               reg = ppir_src_get_reg(alu->src + i);
-               if (reg == chosen) {
-                  move_alu = ppir_update_spilled_src(comp, block, node,
-                                                     alu->src + i, move_alu);
-               }
+         if (dest && ppir_dest_get_reg(dest) == chosen) {
+            /* If dest is a register, it might be updating only some its
+             * components, so need to load the existing value first */
+            if (dest->type == ppir_target_register) {
+               if (!ppir_update_spilled_dest_load(comp, block, node))
+                  return false;
             }
-            break;
+            if (!ppir_update_spilled_dest(comp, block, node))
+               return false;
          }
-         default:
-         {
-            for (int i = 0; i < ppir_node_get_src_num(node); i++) {
-               ppir_src *src = ppir_node_get_src(node, i);
-               reg = ppir_src_get_reg(src);
-               if (reg == chosen) {
-                  ppir_update_spilled_src(comp, block, node, src, NULL);
-               }
+
+         ppir_node *fill_node = NULL;
+         /* nodes might have multiple references to the same value.
+          * avoid creating unnecessary loads for the same fill by
+          * saving the node resulting from the temporary load */
+         for (int i = 0; i < ppir_node_get_src_num(node); i++) {
+            ppir_src *src = ppir_node_get_src(node, i);
+            ppir_reg *reg = ppir_src_get_reg(src);
+            if (reg == chosen) {
+               if (!ppir_update_spilled_src(comp, block, node, src, &fill_node))
+                  return false;
             }
-            break;
-         }
          }
       }
    }
@@ -512,6 +508,7 @@ static ppir_reg *ppir_regalloc_choose_spill_node(ppir_compiler *comp,
    }
    assert(chosen);
    chosen->spilled = true;
+   chosen->is_head = true; /* store_temp unable to do swizzle */
 
    return chosen;
 }
@@ -599,7 +596,8 @@ static bool ppir_regalloc_prog_try(ppir_compiler *comp, bool *spilled)
           * It is also be used in the spilling code, as negative indices
           * starting from -1, to create stack addresses. */
          comp->prog->stack_size++;
-         ppir_regalloc_spill_reg(comp, chosen);
+         if (!ppir_regalloc_spill_reg(comp, chosen))
+            goto err_out;
          /* Ask the outer loop to call back in. */
          *spilled = true;
 
-- 
2.30.2