From 7b0a4f977b98cc49df5b2233e1674e63b05a7b25 Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Tue, 16 Jun 2020 19:06:21 -0400 Subject: [PATCH] pan/mdg: Schedule based on liveness MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit By estimating liveness in the scheduler and choosing instructions likely to reduce register pressure, on average we can decrease pressure given a sufficiently large window. On the other hand, decreasing pressure instead of leaning too heavily on the search window enables us to use a much larger search window without inflating pressure too much. So by doing both in lockstep, we benefit pretty well. total instructions in shared programs: 49458 -> 48540 (-1.86%) instructions in affected programs: 26931 -> 26013 (-3.41%) helped: 221 HURT: 15 helped stats (abs) min: 1 max: 36 x̄: 4.37 x̃: 2 helped stats (rel) min: 0.31% max: 16.90% x̄: 4.97% x̃: 3.85% HURT stats (abs) min: 1 max: 4 x̄: 3.13 x̃: 3 HURT stats (rel) min: 0.50% max: 7.14% x̄: 4.53% x̃: 4.55% 95% mean confidence interval for instructions value: -4.65 -3.13 95% mean confidence interval for instructions %-change: -4.94% -3.81% Instructions are helped. total bundles in shared programs: 25199 -> 23446 (-6.96%) bundles in affected programs: 21600 -> 19847 (-8.12%) helped: 277 HURT: 170 helped stats (abs) min: 1 max: 45 x̄: 7.33 x̃: 6 helped stats (rel) min: 1.06% max: 33.83% x̄: 11.01% x̃: 8.57% HURT stats (abs) min: 1 max: 6 x̄: 1.63 x̃: 1 HURT stats (rel) min: 1.19% max: 40.00% x̄: 13.36% x̃: 11.11% 95% mean confidence interval for bundles value: -4.61 -3.23 95% mean confidence interval for bundles %-change: -3.00% -0.49% Bundles are helped. 
total quadwords in shared programs: 40269 -> 39652 (-1.53%) quadwords in affected programs: 35881 -> 35264 (-1.72%) helped: 242 HURT: 244 helped stats (abs) min: 1 max: 36 x̄: 4.61 x̃: 3 helped stats (rel) min: 0.39% max: 16.33% x̄: 5.33% x̃: 5.13% HURT stats (abs) min: 1 max: 20 x̄: 2.04 x̃: 1 HURT stats (rel) min: 0.81% max: 21.74% x̄: 7.57% x̃: 6.25% 95% mean confidence interval for quadwords value: -1.71 -0.83 95% mean confidence interval for quadwords %-change: 0.46% 1.82% Inconclusive result (value mean confidence interval and %-change mean confidence interval disagree). total registers in shared programs: 3786 -> 3336 (-11.89%) registers in affected programs: 2161 -> 1711 (-20.82%) helped: 262 HURT: 35 helped stats (abs) min: 1 max: 7 x̄: 1.87 x̃: 1 helped stats (rel) min: 6.25% max: 66.67% x̄: 28.91% x̃: 25.00% HURT stats (abs) min: 1 max: 3 x̄: 1.11 x̃: 1 HURT stats (rel) min: 7.69% max: 100.00% x̄: 19.76% x̃: 12.50% 95% mean confidence interval for registers value: -1.70 -1.33 95% mean confidence interval for registers %-change: -25.56% -20.79% Registers are helped. total threads in shared programs: 2453 -> 2592 (5.67%) threads in affected programs: 160 -> 299 (86.87%) helped: 79 HURT: 6 helped stats (abs) min: 1 max: 2 x̄: 1.85 x̃: 2 helped stats (rel) min: 100.00% max: 100.00% x̄: 100.00% x̃: 100.00% HURT stats (abs) min: 1 max: 2 x̄: 1.17 x̃: 1 HURT stats (rel) min: 50.00% max: 50.00% x̄: 50.00% x̃: 50.00% 95% mean confidence interval for threads value: 1.45 1.82 95% mean confidence interval for threads %-change: 81.08% 97.75% Threads are [helped]. 
total spills in shared programs: 168 -> 17 (-89.88%) spills in affected programs: 167 -> 16 (-90.42%) helped: 13 HURT: 0 total fills in shared programs: 186 -> 35 (-81.18%) fills in affected programs: 186 -> 35 (-81.18%) helped: 14 HURT: 0 Part-of: --- src/panfrost/midgard/midgard_schedule.c | 107 +++++++++++++++++++----- 1 file changed, 84 insertions(+), 23 deletions(-) diff --git a/src/panfrost/midgard/midgard_schedule.c b/src/panfrost/midgard/midgard_schedule.c index 3aee91222ef..c5a4dea67f9 100644 --- a/src/panfrost/midgard/midgard_schedule.c +++ b/src/panfrost/midgard/midgard_schedule.c @@ -572,9 +572,56 @@ mir_has_unit(midgard_instruction *ins, unsigned unit) return false; } +/* Net change in liveness if an instruction were scheduled. Loosely based on + * ir3's scheduler. */ + +static int +mir_live_effect(uint16_t *liveness, midgard_instruction *ins, bool destructive) +{ + /* TODO: what if dest is used multiple times? */ + int free_live = 0; + + if (ins->dest < SSA_FIXED_MINIMUM) { + unsigned bytemask = mir_bytemask(ins); + bytemask = util_next_power_of_two(bytemask + 1) - 1; + free_live += util_bitcount(liveness[ins->dest] & bytemask); + + if (destructive) + liveness[ins->dest] &= ~bytemask; + } + + int new_live = 0; + + mir_foreach_src(ins, s) { + unsigned S = ins->src[s]; + + bool dupe = false; + + for (unsigned q = 0; q < s; ++q) + dupe |= (ins->src[q] == S); + + if (dupe) + continue; + + if (S < SSA_FIXED_MINIMUM) { + unsigned bytemask = mir_bytemask_of_read_components(ins, S); + bytemask = util_next_power_of_two(bytemask + 1) - 1; + + /* Count only the new components */ + new_live += util_bitcount(bytemask & ~(liveness[S])); + + if (destructive) + liveness[S] |= bytemask; + } + } + + return new_live - free_live; +} + static midgard_instruction * mir_choose_instruction( midgard_instruction **instructions, + uint16_t *liveness, BITSET_WORD *worklist, unsigned count, struct midgard_predicate *predicate) { @@ -595,6 +642,7 @@ mir_choose_instruction( 
unsigned i; signed best_index = -1; + signed best_effect = INT_MAX; bool best_conditional = false; /* Enforce a simple metric limiting distance to keep down register @@ -602,7 +650,7 @@ mir_choose_instruction( * results */ unsigned max_active = 0; - unsigned max_distance = 6; + unsigned max_distance = 36; BITSET_FOREACH_SET(i, worklist, count) { max_active = MAX2(max_active, i); @@ -655,15 +703,19 @@ mir_choose_instruction( if (conditional && no_cond) continue; - /* Simulate in-order scheduling */ - if ((signed) i < best_index) + int effect = mir_live_effect(liveness, instructions[i], false); + + if (effect > best_effect) + continue; + + if (effect == best_effect && (signed) i < best_index) continue; + best_effect = effect; best_index = i; best_conditional = conditional; } - /* Did we find anything? */ if (best_index < 0) @@ -686,6 +738,7 @@ mir_choose_instruction( /* Once we schedule a conditional, we can't again */ predicate->no_cond |= best_conditional; + mir_live_effect(liveness, instructions[best_index], true); } return instructions[best_index]; @@ -697,6 +750,7 @@ mir_choose_instruction( static unsigned mir_choose_bundle( midgard_instruction **instructions, + uint16_t *liveness, BITSET_WORD *worklist, unsigned count) { /* At the moment, our algorithm is very simple - use the bundle of the @@ -709,7 +763,7 @@ mir_choose_bundle( .exclude = ~0 }; - midgard_instruction *chosen = mir_choose_instruction(instructions, worklist, count, &predicate); + midgard_instruction *chosen = mir_choose_instruction(instructions, liveness, worklist, count, &predicate); if (chosen) return chosen->type; @@ -721,6 +775,7 @@ mir_choose_bundle( static void mir_choose_alu(midgard_instruction **slot, midgard_instruction **instructions, + uint16_t *liveness, BITSET_WORD *worklist, unsigned len, struct midgard_predicate *predicate, unsigned unit) @@ -731,7 +786,7 @@ mir_choose_alu(midgard_instruction **slot, /* Try to schedule something, if not */ predicate->unit = unit; - *slot = 
mir_choose_instruction(instructions, worklist, len, predicate); + *slot = mir_choose_instruction(instructions, liveness, worklist, len, predicate); /* Store unit upon scheduling */ if (*slot && !((*slot)->compact_branch)) @@ -898,6 +953,7 @@ mir_schedule_condition(compiler_context *ctx, static midgard_bundle mir_schedule_texture( midgard_instruction **instructions, + uint16_t *liveness, BITSET_WORD *worklist, unsigned len, bool is_vertex) { @@ -908,7 +964,7 @@ mir_schedule_texture( }; midgard_instruction *ins = - mir_choose_instruction(instructions, worklist, len, &predicate); + mir_choose_instruction(instructions, liveness, worklist, len, &predicate); mir_update_worklist(worklist, len, instructions, ins); @@ -926,6 +982,7 @@ mir_schedule_texture( static midgard_bundle mir_schedule_ldst( midgard_instruction **instructions, + uint16_t *liveness, BITSET_WORD *worklist, unsigned len) { struct midgard_predicate predicate = { @@ -937,10 +994,10 @@ mir_schedule_ldst( /* Try to pick two load/store ops. 
Second not gauranteed to exist */ midgard_instruction *ins = - mir_choose_instruction(instructions, worklist, len, &predicate); + mir_choose_instruction(instructions, liveness, worklist, len, &predicate); midgard_instruction *pair = - mir_choose_instruction(instructions, worklist, len, &predicate); + mir_choose_instruction(instructions, liveness, worklist, len, &predicate); struct midgard_bundle out = { .tag = TAG_LOAD_STORE_4, @@ -962,6 +1019,7 @@ mir_schedule_zs_write( compiler_context *ctx, struct midgard_predicate *predicate, midgard_instruction **instructions, + uint16_t *liveness, BITSET_WORD *worklist, unsigned len, midgard_instruction *branch, midgard_instruction **smul, @@ -985,7 +1043,7 @@ mir_schedule_zs_write( predicate->unit = unit_names[i]; midgard_instruction *ins = - mir_choose_instruction(instructions, worklist, len, predicate); + mir_choose_instruction(instructions, liveness, worklist, len, predicate); if (ins) { ins->unit = unit_names[i]; @@ -1028,6 +1086,7 @@ static midgard_bundle mir_schedule_alu( compiler_context *ctx, midgard_instruction **instructions, + uint16_t *liveness, BITSET_WORD *worklist, unsigned len) { struct midgard_bundle bundle = {}; @@ -1048,7 +1107,7 @@ mir_schedule_alu( midgard_instruction *sadd = NULL; midgard_instruction *branch = NULL; - mir_choose_alu(&branch, instructions, worklist, len, &predicate, ALU_ENAB_BR_COMPACT); + mir_choose_alu(&branch, instructions, liveness, worklist, len, &predicate, ALU_ENAB_BR_COMPACT); mir_update_worklist(worklist, len, instructions, branch); unsigned writeout = branch ? 
branch->writeout : 0; @@ -1123,19 +1182,19 @@ mir_schedule_alu( } if (writeout & PAN_WRITEOUT_Z) - mir_schedule_zs_write(ctx, &predicate, instructions, worklist, len, branch, &smul, &vadd, &vlut, false); + mir_schedule_zs_write(ctx, &predicate, instructions, liveness, worklist, len, branch, &smul, &vadd, &vlut, false); if (writeout & PAN_WRITEOUT_S) - mir_schedule_zs_write(ctx, &predicate, instructions, worklist, len, branch, &smul, &vadd, &vlut, true); + mir_schedule_zs_write(ctx, &predicate, instructions, liveness, worklist, len, branch, &smul, &vadd, &vlut, true); - mir_choose_alu(&smul, instructions, worklist, len, &predicate, UNIT_SMUL); + mir_choose_alu(&smul, instructions, liveness, worklist, len, &predicate, UNIT_SMUL); for (unsigned moves = 0; moves < 2; ++moves) { predicate.moves = moves; predicate.no_mask = writeout ? (1 << 3) : 0; - mir_choose_alu(&vlut, instructions, worklist, len, &predicate, UNIT_VLUT); + mir_choose_alu(&vlut, instructions, liveness, worklist, len, &predicate, UNIT_VLUT); predicate.no_mask = 0; - mir_choose_alu(&vadd, instructions, worklist, len, &predicate, UNIT_VADD); + mir_choose_alu(&vadd, instructions, liveness, worklist, len, &predicate, UNIT_VADD); } mir_update_worklist(worklist, len, instructions, vlut); @@ -1158,7 +1217,7 @@ mir_schedule_alu( } /* Stage 2, let's schedule sadd before vmul for writeout */ - mir_choose_alu(&sadd, instructions, worklist, len, &predicate, UNIT_SADD); + mir_choose_alu(&sadd, instructions, liveness, worklist, len, &predicate, UNIT_SADD); /* Check if writeout reads its own register */ @@ -1191,7 +1250,7 @@ mir_schedule_alu( predicate.mask = writeout_mask ^ full_mask; struct midgard_instruction *peaked = - mir_choose_instruction(instructions, worklist, len, &predicate); + mir_choose_instruction(instructions, liveness, worklist, len, &predicate); if (peaked) { vmul = peaked; @@ -1224,7 +1283,7 @@ mir_schedule_alu( } } - mir_choose_alu(&vmul, instructions, worklist, len, &predicate, UNIT_VMUL); + 
mir_choose_alu(&vmul, instructions, liveness, worklist, len, &predicate, UNIT_VMUL); mir_update_worklist(worklist, len, instructions, vmul); mir_update_worklist(worklist, len, instructions, sadd); @@ -1298,6 +1357,7 @@ schedule_block(compiler_context *ctx, midgard_block *block) /* Allocate the worklist */ size_t sz = BITSET_WORDS(len) * sizeof(BITSET_WORD); BITSET_WORD *worklist = calloc(sz, 1); + uint16_t *liveness = calloc(node_count, 2); mir_initialize_worklist(worklist, instructions, len); struct util_dynarray bundles; @@ -1307,15 +1367,15 @@ schedule_block(compiler_context *ctx, midgard_block *block) unsigned blend_offset = 0; for (;;) { - unsigned tag = mir_choose_bundle(instructions, worklist, len); + unsigned tag = mir_choose_bundle(instructions, liveness, worklist, len); midgard_bundle bundle; if (tag == TAG_TEXTURE_4) - bundle = mir_schedule_texture(instructions, worklist, len, ctx->stage != MESA_SHADER_FRAGMENT); + bundle = mir_schedule_texture(instructions, liveness, worklist, len, ctx->stage != MESA_SHADER_FRAGMENT); else if (tag == TAG_LOAD_STORE_4) - bundle = mir_schedule_ldst(instructions, worklist, len); + bundle = mir_schedule_ldst(instructions, liveness, worklist, len); else if (tag == TAG_ALU_4) - bundle = mir_schedule_alu(ctx, instructions, worklist, len); + bundle = mir_schedule_alu(ctx, instructions, liveness, worklist, len); else break; @@ -1360,6 +1420,7 @@ schedule_block(compiler_context *ctx, midgard_block *block) free(instructions); /* Allocated by flatten_mir() */ free(worklist); + free(liveness); } void -- 2.30.2