From 7b0a4f977b98cc49df5b2233e1674e63b05a7b25 Mon Sep 17 00:00:00 2001
From: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Date: Tue, 16 Jun 2020 19:06:21 -0400
Subject: [PATCH] pan/mdg: Schedule based on liveness
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

By estimating liveness in the scheduler and choosing instructions likely
to reduce register pressure, on average we can decrease pressure given a
sufficiently larger window. On the other hand, decreasing pressure
instead of leaning too heavily on the search window enables us to use a
much larger search window without inflating pressure too much. So by
doing both in lockstep, we benefit pretty well.

total instructions in shared programs: 49458 -> 48540 (-1.86%)
instructions in affected programs: 26931 -> 26013 (-3.41%)
helped: 221
HURT: 15
helped stats (abs) min: 1 max: 36 xÌ: 4.37 xÌ: 2
helped stats (rel) min: 0.31% max: 16.90% xÌ: 4.97% xÌ: 3.85%
HURT stats (abs)   min: 1 max: 4 xÌ: 3.13 xÌ: 3
HURT stats (rel)   min: 0.50% max: 7.14% xÌ: 4.53% xÌ: 4.55%
95% mean confidence interval for instructions value: -4.65 -3.13
95% mean confidence interval for instructions %-change: -4.94% -3.81%
Instructions are helped.

total bundles in shared programs: 25199 -> 23446 (-6.96%)
bundles in affected programs: 21600 -> 19847 (-8.12%)
helped: 277
HURT: 170
helped stats (abs) min: 1 max: 45 xÌ: 7.33 xÌ: 6
helped stats (rel) min: 1.06% max: 33.83% xÌ: 11.01% xÌ: 8.57%
HURT stats (abs)   min: 1 max: 6 xÌ: 1.63 xÌ: 1
HURT stats (rel)   min: 1.19% max: 40.00% xÌ: 13.36% xÌ: 11.11%
95% mean confidence interval for bundles value: -4.61 -3.23
95% mean confidence interval for bundles %-change: -3.00% -0.49%
Bundles are helped.

total quadwords in shared programs: 40269 -> 39652 (-1.53%)
quadwords in affected programs: 35881 -> 35264 (-1.72%)
helped: 242
HURT: 244
helped stats (abs) min: 1 max: 36 xÌ: 4.61 xÌ: 3
helped stats (rel) min: 0.39% max: 16.33% xÌ: 5.33% xÌ: 5.13%
HURT stats (abs)   min: 1 max: 20 xÌ: 2.04 xÌ: 1
HURT stats (rel)   min: 0.81% max: 21.74% xÌ: 7.57% xÌ: 6.25%
95% mean confidence interval for quadwords value: -1.71 -0.83
95% mean confidence interval for quadwords %-change: 0.46% 1.82%
Inconclusive result (value mean confidence interval and %-change mean confidence interval disagree).

total registers in shared programs: 3786 -> 3336 (-11.89%)
registers in affected programs: 2161 -> 1711 (-20.82%)
helped: 262
HURT: 35
helped stats (abs) min: 1 max: 7 xÌ: 1.87 xÌ: 1
helped stats (rel) min: 6.25% max: 66.67% xÌ: 28.91% xÌ: 25.00%
HURT stats (abs)   min: 1 max: 3 xÌ: 1.11 xÌ: 1
HURT stats (rel)   min: 7.69% max: 100.00% xÌ: 19.76% xÌ: 12.50%
95% mean confidence interval for registers value: -1.70 -1.33
95% mean confidence interval for registers %-change: -25.56% -20.79%
Registers are helped.

total threads in shared programs: 2453 -> 2592 (5.67%)
threads in affected programs: 160 -> 299 (86.87%)
helped: 79
HURT: 6
helped stats (abs) min: 1 max: 2 xÌ: 1.85 xÌ: 2
helped stats (rel) min: 100.00% max: 100.00% xÌ: 100.00% xÌ: 100.00%
HURT stats (abs)   min: 1 max: 2 xÌ: 1.17 xÌ: 1
HURT stats (rel)   min: 50.00% max: 50.00% xÌ: 50.00% xÌ: 50.00%
95% mean confidence interval for threads value: 1.45 1.82
95% mean confidence interval for threads %-change: 81.08% 97.75%
Threads are [helped].

total spills in shared programs: 168 -> 17 (-89.88%)
spills in affected programs: 167 -> 16 (-90.42%)
helped: 13
HURT: 0

total fills in shared programs: 186 -> 35 (-81.18%)
fills in affected programs: 186 -> 35 (-81.18%)
helped: 14

HURT: 0
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/5513>
---
 src/panfrost/midgard/midgard_schedule.c | 107 +++++++++++++++++++-----
 1 file changed, 84 insertions(+), 23 deletions(-)

diff --git a/src/panfrost/midgard/midgard_schedule.c b/src/panfrost/midgard/midgard_schedule.c
index 3aee91222ef..c5a4dea67f9 100644
--- a/src/panfrost/midgard/midgard_schedule.c
+++ b/src/panfrost/midgard/midgard_schedule.c
@@ -572,9 +572,56 @@ mir_has_unit(midgard_instruction *ins, unsigned unit)
         return false;
 }
 
+/* Net change in liveness if an instruction were scheduled. Loosely based on
+ * ir3's scheduler. */
+
+static int
+mir_live_effect(uint16_t *liveness, midgard_instruction *ins, bool destructive)
+{
+        /* TODO: what if dest is used multiple times? */
+        int free_live = 0;
+
+        if (ins->dest < SSA_FIXED_MINIMUM) {
+                unsigned bytemask = mir_bytemask(ins);
+                bytemask = util_next_power_of_two(bytemask + 1) - 1;
+                free_live += util_bitcount(liveness[ins->dest] & bytemask);
+
+                if (destructive)
+                        liveness[ins->dest] &= ~bytemask;
+        }
+
+        int new_live = 0;
+
+        mir_foreach_src(ins, s) {
+                unsigned S = ins->src[s];
+
+                bool dupe = false;
+
+                for (unsigned q = 0; q < s; ++q)
+                        dupe |= (ins->src[q] == S);
+
+                if (dupe)
+                        continue;
+
+                if (S < SSA_FIXED_MINIMUM) {
+                        unsigned bytemask = mir_bytemask_of_read_components(ins, S);
+                        bytemask = util_next_power_of_two(bytemask + 1) - 1;
+
+                        /* Count only the new components */
+                        new_live += util_bitcount(bytemask & ~(liveness[S]));
+
+                        if (destructive)
+                                liveness[S] |= bytemask;
+                }
+        }
+
+        return new_live - free_live;
+}
+
 static midgard_instruction *
 mir_choose_instruction(
                 midgard_instruction **instructions,
+                uint16_t *liveness,
                 BITSET_WORD *worklist, unsigned count,
                 struct midgard_predicate *predicate)
 {
@@ -595,6 +642,7 @@ mir_choose_instruction(
         unsigned i;
 
         signed best_index = -1;
+        signed best_effect = INT_MAX;
         bool best_conditional = false;
 
         /* Enforce a simple metric limiting distance to keep down register
@@ -602,7 +650,7 @@ mir_choose_instruction(
          * results */
 
         unsigned max_active = 0;
-        unsigned max_distance = 6;
+        unsigned max_distance = 36;
 
         BITSET_FOREACH_SET(i, worklist, count) {
                 max_active = MAX2(max_active, i);
@@ -655,15 +703,19 @@ mir_choose_instruction(
                 if (conditional && no_cond)
                         continue;
 
-                /* Simulate in-order scheduling */
-                if ((signed) i < best_index)
+                int effect = mir_live_effect(liveness, instructions[i], false);
+
+                if (effect > best_effect)
+                        continue;
+
+                if (effect == best_effect && (signed) i < best_index)
                         continue;
 
+                best_effect = effect;
                 best_index = i;
                 best_conditional = conditional;
         }
 
-
         /* Did we find anything?  */
 
         if (best_index < 0)
@@ -686,6 +738,7 @@ mir_choose_instruction(
 
                 /* Once we schedule a conditional, we can't again */
                 predicate->no_cond |= best_conditional;
+                mir_live_effect(liveness, instructions[best_index], true);
         }
 
         return instructions[best_index];
@@ -697,6 +750,7 @@ mir_choose_instruction(
 static unsigned
 mir_choose_bundle(
                 midgard_instruction **instructions,
+                uint16_t *liveness,
                 BITSET_WORD *worklist, unsigned count)
 {
         /* At the moment, our algorithm is very simple - use the bundle of the
@@ -709,7 +763,7 @@ mir_choose_bundle(
                 .exclude = ~0
         };
 
-        midgard_instruction *chosen = mir_choose_instruction(instructions, worklist, count, &predicate);
+        midgard_instruction *chosen = mir_choose_instruction(instructions, liveness, worklist, count, &predicate);
 
         if (chosen)
                 return chosen->type;
@@ -721,6 +775,7 @@ mir_choose_bundle(
 static void
 mir_choose_alu(midgard_instruction **slot,
                 midgard_instruction **instructions,
+                uint16_t *liveness,
                 BITSET_WORD *worklist, unsigned len,
                 struct midgard_predicate *predicate,
                 unsigned unit)
@@ -731,7 +786,7 @@ mir_choose_alu(midgard_instruction **slot,
 
         /* Try to schedule something, if not */
         predicate->unit = unit;
-        *slot = mir_choose_instruction(instructions, worklist, len, predicate);
+        *slot = mir_choose_instruction(instructions, liveness, worklist, len, predicate);
 
         /* Store unit upon scheduling */
         if (*slot && !((*slot)->compact_branch))
@@ -898,6 +953,7 @@ mir_schedule_condition(compiler_context *ctx,
 static midgard_bundle
 mir_schedule_texture(
                 midgard_instruction **instructions,
+                uint16_t *liveness,
                 BITSET_WORD *worklist, unsigned len,
                 bool is_vertex)
 {
@@ -908,7 +964,7 @@ mir_schedule_texture(
         };
 
         midgard_instruction *ins =
-                mir_choose_instruction(instructions, worklist, len, &predicate);
+                mir_choose_instruction(instructions, liveness, worklist, len, &predicate);
 
         mir_update_worklist(worklist, len, instructions, ins);
 
@@ -926,6 +982,7 @@ mir_schedule_texture(
 static midgard_bundle
 mir_schedule_ldst(
                 midgard_instruction **instructions,
+                uint16_t *liveness,
                 BITSET_WORD *worklist, unsigned len)
 {
         struct midgard_predicate predicate = {
@@ -937,10 +994,10 @@ mir_schedule_ldst(
         /* Try to pick two load/store ops. Second not gauranteed to exist */
 
         midgard_instruction *ins =
-                mir_choose_instruction(instructions, worklist, len, &predicate);
+                mir_choose_instruction(instructions, liveness, worklist, len, &predicate);
 
         midgard_instruction *pair =
-                mir_choose_instruction(instructions, worklist, len, &predicate);
+                mir_choose_instruction(instructions, liveness, worklist, len, &predicate);
 
         struct midgard_bundle out = {
                 .tag = TAG_LOAD_STORE_4,
@@ -962,6 +1019,7 @@ mir_schedule_zs_write(
                 compiler_context *ctx,
                 struct midgard_predicate *predicate,
                 midgard_instruction **instructions,
+                uint16_t *liveness,
                 BITSET_WORD *worklist, unsigned len,
                 midgard_instruction *branch,
                 midgard_instruction **smul,
@@ -985,7 +1043,7 @@ mir_schedule_zs_write(
 
                 predicate->unit = unit_names[i];
                 midgard_instruction *ins =
-                        mir_choose_instruction(instructions, worklist, len, predicate);
+                        mir_choose_instruction(instructions, liveness, worklist, len, predicate);
 
                 if (ins) {
                         ins->unit = unit_names[i];
@@ -1028,6 +1086,7 @@ static midgard_bundle
 mir_schedule_alu(
                 compiler_context *ctx,
                 midgard_instruction **instructions,
+                uint16_t *liveness,
                 BITSET_WORD *worklist, unsigned len)
 {
         struct midgard_bundle bundle = {};
@@ -1048,7 +1107,7 @@ mir_schedule_alu(
         midgard_instruction *sadd = NULL;
         midgard_instruction *branch = NULL;
 
-        mir_choose_alu(&branch, instructions, worklist, len, &predicate, ALU_ENAB_BR_COMPACT);
+        mir_choose_alu(&branch, instructions, liveness, worklist, len, &predicate, ALU_ENAB_BR_COMPACT);
         mir_update_worklist(worklist, len, instructions, branch);
         unsigned writeout = branch ? branch->writeout : 0;
 
@@ -1123,19 +1182,19 @@ mir_schedule_alu(
         }
 
         if (writeout & PAN_WRITEOUT_Z)
-                mir_schedule_zs_write(ctx, &predicate, instructions, worklist, len, branch, &smul, &vadd, &vlut, false);
+                mir_schedule_zs_write(ctx, &predicate, instructions, liveness, worklist, len, branch, &smul, &vadd, &vlut, false);
 
         if (writeout & PAN_WRITEOUT_S)
-                mir_schedule_zs_write(ctx, &predicate, instructions, worklist, len, branch, &smul, &vadd, &vlut, true);
+                mir_schedule_zs_write(ctx, &predicate, instructions, liveness, worklist, len, branch, &smul, &vadd, &vlut, true);
 
-        mir_choose_alu(&smul, instructions, worklist, len, &predicate, UNIT_SMUL);
+        mir_choose_alu(&smul, instructions, liveness, worklist, len, &predicate, UNIT_SMUL);
 
         for (unsigned moves = 0; moves < 2; ++moves) {
                 predicate.moves = moves;
                 predicate.no_mask = writeout ? (1 << 3) : 0;
-                mir_choose_alu(&vlut, instructions, worklist, len, &predicate, UNIT_VLUT);
+                mir_choose_alu(&vlut, instructions, liveness, worklist, len, &predicate, UNIT_VLUT);
                 predicate.no_mask = 0;
-                mir_choose_alu(&vadd, instructions, worklist, len, &predicate, UNIT_VADD);
+                mir_choose_alu(&vadd, instructions, liveness, worklist, len, &predicate, UNIT_VADD);
         }
 
         mir_update_worklist(worklist, len, instructions, vlut);
@@ -1158,7 +1217,7 @@ mir_schedule_alu(
         }
 
         /* Stage 2, let's schedule sadd before vmul for writeout */
-        mir_choose_alu(&sadd, instructions, worklist, len, &predicate, UNIT_SADD);
+        mir_choose_alu(&sadd, instructions, liveness, worklist, len, &predicate, UNIT_SADD);
 
         /* Check if writeout reads its own register */
 
@@ -1191,7 +1250,7 @@ mir_schedule_alu(
                         predicate.mask = writeout_mask ^ full_mask;
 
                         struct midgard_instruction *peaked =
-                                mir_choose_instruction(instructions, worklist, len, &predicate);
+                                mir_choose_instruction(instructions, liveness, worklist, len, &predicate);
 
                         if (peaked) {
                                 vmul = peaked;
@@ -1224,7 +1283,7 @@ mir_schedule_alu(
                 }
         }
 
-        mir_choose_alu(&vmul, instructions, worklist, len, &predicate, UNIT_VMUL);
+        mir_choose_alu(&vmul, instructions, liveness, worklist, len, &predicate, UNIT_VMUL);
 
         mir_update_worklist(worklist, len, instructions, vmul);
         mir_update_worklist(worklist, len, instructions, sadd);
@@ -1298,6 +1357,7 @@ schedule_block(compiler_context *ctx, midgard_block *block)
         /* Allocate the worklist */
         size_t sz = BITSET_WORDS(len) * sizeof(BITSET_WORD);
         BITSET_WORD *worklist = calloc(sz, 1);
+        uint16_t *liveness = calloc(node_count, 2);
         mir_initialize_worklist(worklist, instructions, len);
 
         struct util_dynarray bundles;
@@ -1307,15 +1367,15 @@ schedule_block(compiler_context *ctx, midgard_block *block)
         unsigned blend_offset = 0;
 
         for (;;) {
-                unsigned tag = mir_choose_bundle(instructions, worklist, len);
+                unsigned tag = mir_choose_bundle(instructions, liveness, worklist, len);
                 midgard_bundle bundle;
 
                 if (tag == TAG_TEXTURE_4)
-                        bundle = mir_schedule_texture(instructions, worklist, len, ctx->stage != MESA_SHADER_FRAGMENT);
+                        bundle = mir_schedule_texture(instructions, liveness, worklist, len, ctx->stage != MESA_SHADER_FRAGMENT);
                 else if (tag == TAG_LOAD_STORE_4)
-                        bundle = mir_schedule_ldst(instructions, worklist, len);
+                        bundle = mir_schedule_ldst(instructions, liveness, worklist, len);
                 else if (tag == TAG_ALU_4)
-                        bundle = mir_schedule_alu(ctx, instructions, worklist, len);
+                        bundle = mir_schedule_alu(ctx, instructions, liveness, worklist, len);
                 else
                         break;
 
@@ -1360,6 +1420,7 @@ schedule_block(compiler_context *ctx, midgard_block *block)
 
 	free(instructions); /* Allocated by flatten_mir() */
 	free(worklist);
+        free(liveness);
 }
 
 void
-- 
2.30.2