From d699a17475b5d123e6a22778e8ac6e005774ce92 Mon Sep 17 00:00:00 2001
From: Alyssa Rosenzweig
Date: Fri, 30 Aug 2019 12:56:55 -0700
Subject: [PATCH] pan/midgard: Schedule before RA

This is a tradeoff. Scheduling before RA means we don't do RA on
what-will-become pipeline registers. Importantly, it means the
scheduler is able to reorder instructions, as registers have not been
decided yet.

Unfortunately, it also complicates register spilling, since the spills
themselves won't get bundled optimally and we can only spill once per
ALU bundle (using r27 as well would allow two spills per bundle, but
that is not implemented here). It also prevents us from eliminating
dead moves introduced by register allocation, as they are not dead
before RA.

The shader-db regressions are from poor spilling choices introduced by
the new bundling requirements. These could be solved by combining a
post-scheduler (to pack adjacent spills into bundles) with a VLIW-aware
spill cost calculation. Nevertheless, the change is small enough that I
feel it's worth it to eat a tiny shader-db regression for the sake of
flexibility.

Signed-off-by: Alyssa Rosenzweig
---
 src/panfrost/midgard/midgard_schedule.c | 56 +++++++++++++------------
 1 file changed, 29 insertions(+), 27 deletions(-)

diff --git a/src/panfrost/midgard/midgard_schedule.c b/src/panfrost/midgard/midgard_schedule.c
index 6693a1b725b..8f86701e33f 100644
--- a/src/panfrost/midgard/midgard_schedule.c
+++ b/src/panfrost/midgard/midgard_schedule.c
@@ -798,11 +798,26 @@ static void mir_spill_register(
                 ra_set_node_spill_cost(g, i, 1.0);
         }
 
-        mir_foreach_instr_global(ctx, ins) {
-                if (ins->no_spill &&
-                    ins->dest >= 0 &&
-                    ins->dest < ctx->temp_count)
-                        ra_set_node_spill_cost(g, ins->dest, -1.0);
+        /* We can't spill any bundles that contain unspills. This could be
+         * optimized to allow use of r27 to spill twice per bundle, but if
+         * you're at the point of optimizing spilling, it's too late. */
+
+        mir_foreach_block(ctx, block) {
+                mir_foreach_bundle_in_block(block, bun) {
+                        bool no_spill = false;
+
+                        for (unsigned i = 0; i < bun->instruction_count; ++i)
+                                no_spill |= bun->instructions[i]->no_spill;
+
+                        if (!no_spill)
+                                continue;
+
+                        for (unsigned i = 0; i < bun->instruction_count; ++i) {
+                                unsigned dest = bun->instructions[i]->dest;
+                                if (dest < ctx->temp_count)
+                                        ra_set_node_spill_cost(g, dest, -1.0);
+                        }
+                }
         }
 
         int spill_node = ra_get_best_spill_node(g);
@@ -831,7 +846,8 @@ static void mir_spill_register(
         if (is_special_w)
                 spill_slot = spill_index++;
 
-        mir_foreach_instr_global_safe(ctx, ins) {
+        mir_foreach_block(ctx, block) {
+        mir_foreach_instr_in_block_safe(block, ins) {
                 if (ins->dest != spill_node) continue;
 
                 midgard_instruction st;
@@ -841,17 +857,19 @@ static void mir_spill_register(
                         st.no_spill = true;
                 } else {
                         ins->dest = SSA_FIXED_REGISTER(26);
+                        ins->no_spill = true;
                         st = v_load_store_scratch(ins->dest, spill_slot, true, ins->mask);
                 }
 
                 /* Hint: don't rewrite this node */
                 st.hint = true;
 
-                mir_insert_instruction_before(ctx, mir_next_op(ins), st);
+                mir_insert_instruction_after_scheduled(ctx, block, ins, st);
 
                 if (!is_special)
                         ctx->spills++;
         }
+        }
 }
 
 /* For special reads, figure out how many components we need */
@@ -915,7 +933,7 @@ static void mir_spill_register(
                         st.mask = read_mask;
 
-                        mir_insert_instruction_before(ctx, before, st);
+                        mir_insert_instruction_before_scheduled(ctx, block, before, st);
 
                         // consecutive_skip = true;
                 } else {
                         /* Special writes already have their move spilled in */
@@ -962,8 +980,11 @@ schedule_program(compiler_context *ctx)
 
         mir_foreach_block(ctx, block) {
                 midgard_opt_dead_move_eliminate(ctx, block);
+                schedule_block(ctx, block);
         }
 
+        mir_create_pipeline_registers(ctx);
+
         do {
                 if (spilled)
                         mir_spill_register(ctx, g, &spill_count);
@@ -974,25 +995,6 @@
                 g = allocate_registers(ctx, &spilled);
         } while(spilled && ((iter_count--) > 0));
 
-        /* We can simplify a bit after RA */
-
-        mir_foreach_block(ctx, block) {
-                midgard_opt_post_move_eliminate(ctx, block, g);
-        }
-
-        /* After RA finishes, we schedule all at once */
-
-        mir_foreach_block(ctx, block) {
-                schedule_block(ctx, block);
-        }
-
-        /* Finally, we create pipeline registers as a peephole pass after
-         * scheduling. This isn't totally optimal, since there are cases where
-         * the usage of pipeline registers can eliminate spills, but it does
-         * save some power */
-
-        mir_create_pipeline_registers(ctx);
-
         if (iter_count <= 0) {
                 fprintf(stderr, "panfrost: Gave up allocating registers, rendering will be incomplete\n");
                 assert(0);
-- 
2.30.2
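
Note, outside the patch itself: the heart of the new spill-cost pass is "if any
instruction in a bundle is an unspill, mark every destination that bundle writes
as unspillable". Below is a minimal, stand-alone C sketch of that rule for
readers without the Mesa tree at hand. toy_instruction, toy_bundle, forbid_spill()
and the flat spill_cost array are simplified stand-ins for midgard_instruction,
midgard_bundle and ra_set_node_spill_cost(); they are not the real interfaces.

#include <stdbool.h>
#include <stdio.h>

struct toy_instruction {
        unsigned dest;     /* index of the temporary this instruction writes */
        bool no_spill;     /* set on unspill moves inserted by the spiller */
};

struct toy_bundle {
        unsigned instruction_count;
        struct toy_instruction *instructions[4];
};

/* Stand-in for ra_set_node_spill_cost(g, dest, -1.0): a negative cost
 * means "never pick this node as the spill node". */
static void forbid_spill(float *spill_cost, unsigned dest)
{
        spill_cost[dest] = -1.0f;
}

/* If any instruction in a bundle is marked no_spill (i.e. it is an
 * unspill), every destination written by that bundle is excluded from
 * spilling, since a second spill cannot be packed into the bundle. */
static void
mark_unspillable_bundles(struct toy_bundle *bundles, unsigned bundle_count,
                         float *spill_cost, unsigned temp_count)
{
        for (unsigned b = 0; b < bundle_count; ++b) {
                struct toy_bundle *bun = &bundles[b];
                bool no_spill = false;

                for (unsigned i = 0; i < bun->instruction_count; ++i)
                        no_spill |= bun->instructions[i]->no_spill;

                if (!no_spill)
                        continue;

                for (unsigned i = 0; i < bun->instruction_count; ++i) {
                        unsigned dest = bun->instructions[i]->dest;
                        if (dest < temp_count)
                                forbid_spill(spill_cost, dest);
                }
        }
}

int main(void)
{
        /* One bundle holding an unspill plus an ALU op: both destinations
         * become ineligible for spilling. */
        struct toy_instruction unspill = { .dest = 0, .no_spill = true };
        struct toy_instruction alu = { .dest = 1, .no_spill = false };
        struct toy_bundle bun = {
                .instruction_count = 2,
                .instructions = { &unspill, &alu },
        };

        float spill_cost[2] = { 1.0f, 1.0f };
        mark_unspillable_bundles(&bun, 1, spill_cost, 2);

        printf("cost[0] = %.1f, cost[1] = %.1f\n", spill_cost[0], spill_cost[1]);
        return 0;
}

With the inputs above this prints cost[0] = -1.0, cost[1] = -1.0, mirroring how
the patch removes whole bundles from the spill candidate set rather than
individual instructions.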