pan/midgard: Track shader quadword count while scheduling
[mesa.git] / src / panfrost / midgard / midgard_schedule.c
/*
 * Copyright (C) 2018-2019 Alyssa Rosenzweig <alyssa@rosenzweig.io>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "compiler.h"
#include "midgard_ops.h"
#include "util/u_memory.h"
#include "util/register_allocate.h"

/* Scheduling for Midgard is complicated, to say the least. ALU instructions
 * must be grouped into VLIW bundles according to the following model:
 *
 *  [VMUL] [SADD]
 *  [VADD] [SMUL] [VLUT]
 *
 * A given instruction can execute on some subset of the units (or a few can
 * execute on all). Instructions can be either vector or scalar; only scalar
 * instructions can execute on SADD/SMUL units. Units on a given line execute
 * in parallel. Subsequent lines execute separately and can pass results
 * directly via pipeline registers r24/r25, bypassing the register file.
 *
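 * For example (illustrative mnemonics only; exact encodings differ), one
 * bundle could run a vector multiply on VMUL and a scalar add on SADD in the
 * first stage, then a vector add on VADD in the second stage that consumes
 * both results through the pipeline registers:
 *
 *   [ vmul.fmul r24,  r1,  r2  ] [ sadd.fadd r25.x, r3.x, r4.x ]
 *   [ vadd.fadd  r5,  r24, r25 ]
 *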
 * A bundle can optionally have 128-bits of embedded constants, shared across
 * all of the instructions within a bundle.
 *
 * Instructions consuming conditionals (branches and conditional selects)
 * require their condition to be written into the conditional register (r31)
 * within the same bundle they are consumed.
 *
 * Fragment writeout requires its argument to be written in full within the
 * same bundle as the branch, with no hanging dependencies.
 *
 * Load/store instructions are bundled in pairs of at most two instructions,
 * and texture instructions are not bundled at all.
 *
 * -------------------------------------------------------------------------
 *
 */

/* Create a mask of accessed components from a swizzle to figure out vector
 * dependencies */

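/* For example, a swizzle of .xxyy packs as 0b01010000; it reads only the X
 * and Y source components, so the resulting access mask is 0x3 */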
static unsigned
swizzle_to_access_mask(unsigned swizzle)
{
        unsigned component_mask = 0;

        for (int i = 0; i < 4; ++i) {
                unsigned c = (swizzle >> (2 * i)) & 3;
                component_mask |= (1 << c);
        }

        return component_mask;
}

/* Does the mask cover exactly one component, i.e. is the access scalar? */

static bool
is_single_component_mask(unsigned mask)
{
        int components = 0;

        for (int c = 0; c < 8; ++c) {
                if (mask & (1 << c))
                        components++;
        }

        return components == 1;
}

/* Checks for an SSA data hazard between two adjacent instructions, keeping in
 * mind that we are a vector architecture and we can write to different
 * components simultaneously */

static bool
can_run_concurrent_ssa(midgard_instruction *first, midgard_instruction *second)
{
        /* Writeout has its own rules anyway */
        if (first->compact_branch || second->compact_branch)
                return true;

        /* Each instruction reads some registers and writes to a register. See
         * where the first writes */

        int source = first->dest;
        int source_mask = first->mask;

        /* As long as the second doesn't read from the first, we're okay */
        for (unsigned i = 0; i < ARRAY_SIZE(second->src); ++i) {
                if (second->src[i] != source)
                        continue;

                if (first->type != TAG_ALU_4)
                        return false;

                /* Figure out which components we just read from */

                int q = (i == 0) ? second->alu.src1 : second->alu.src2;
                midgard_vector_alu_src *m = (midgard_vector_alu_src *) &q;

                /* Check if there are components in common, and fail if so */
                if (swizzle_to_access_mask(m->swizzle) & source_mask)
                        return false;
        }

        /* Otherwise, it's safe in that regard. Another data hazard is both
         * writing to the same place, of course */

        if (second->dest == source) {
                /* ...but only if the components overlap */

                if (second->mask & source_mask)
                        return false;
        }

        /* ...That's it */
        return true;
}

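/* Returns whether ains would conflict (per can_run_concurrent_ssa) with any
 * instruction already scheduled into the given segment */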
static bool
midgard_has_hazard(
        midgard_instruction **segment, unsigned segment_size,
        midgard_instruction *ains)
{
        for (int s = 0; s < segment_size; ++s)
                if (!can_run_concurrent_ssa(segment[s], ains))
                        return true;

        return false;
}

/* Fragment writeout (of r0) is allowed when:
 *
 *  - All components of r0 are written in the bundle
 *  - No components of r0 are written in VLUT
 *  - Non-pipelined dependencies of r0 are not written in the bundle
 *
 * This function checks if these requirements are satisfied given the content
 * of a scheduled bundle.
 */

static bool
can_writeout_fragment(compiler_context *ctx, midgard_instruction **bundle, unsigned count, unsigned node_count)
{
        /* First scan for which components of r0 are written out. Initially
         * none are written */

        uint8_t r0_written_mask = 0x0;

        /* Simultaneously we scan for the set of dependencies */

        size_t sz = sizeof(BITSET_WORD) * BITSET_WORDS(node_count);
        BITSET_WORD *dependencies = alloca(sz);
        memset(dependencies, 0, sz);

        for (unsigned i = 0; i < count; ++i) {
                midgard_instruction *ins = bundle[i];

                if (ins->dest != SSA_FIXED_REGISTER(0))
                        continue;

                /* Record written out mask */
                r0_written_mask |= ins->mask;

                /* Record dependencies, but only if they won't become pipeline
                 * registers. We know we can't be live after this, because
                 * we're writeout at the very end of the shader. So check if
                 * they were written before us. */

                unsigned src0 = ins->src[0];
                unsigned src1 = ins->src[1];

                if (!mir_is_written_before(ctx, bundle[0], src0))
                        src0 = ~0;

                if (!mir_is_written_before(ctx, bundle[0], src1))
                        src1 = ~0;

                if (src0 < node_count)
                        BITSET_SET(dependencies, src0);

                if (src1 < node_count)
                        BITSET_SET(dependencies, src1);

                /* Requirement 2 */
                if (ins->unit == UNIT_VLUT)
                        return false;
        }

        /* Requirement 1 */
        if ((r0_written_mask & 0xF) != 0xF)
                return false;

        /* Requirement 3 */

        for (unsigned i = 0; i < count; ++i) {
                unsigned dest = bundle[i]->dest;

                if (dest < node_count && BITSET_TEST(dependencies, dest))
                        return false;
        }

        /* Otherwise, we're good to go */
        return true;
}

/* Helpers for scheduling */

static bool
mir_is_scalar(midgard_instruction *ains)
{
        /* Does the op support scalar units? */
        if (!(alu_opcode_props[ains->alu.op].props & UNITS_SCALAR))
                return false;

        /* Do we try to use it as a vector op? */
        if (!is_single_component_mask(ains->mask))
                return false;

        /* Otherwise, check mode hazards */
        bool could_scalar = true;

        /* Only 16/32-bit can run on a scalar unit */
        could_scalar &= ains->alu.reg_mode != midgard_reg_mode_8;
        could_scalar &= ains->alu.reg_mode != midgard_reg_mode_64;
        could_scalar &= ains->alu.dest_override == midgard_dest_override_none;

        if (ains->alu.reg_mode == midgard_reg_mode_16) {
                /* If we're running in 16-bit mode, we can't have any 8-bit
                 * sources on the scalar unit (since the scalar unit doesn't
                 * understand 8-bit) */

                midgard_vector_alu_src s1 =
                        vector_alu_from_unsigned(ains->alu.src1);

                could_scalar &= !s1.half;

                midgard_vector_alu_src s2 =
                        vector_alu_from_unsigned(ains->alu.src2);

                could_scalar &= !s2.half;
        }

        return could_scalar;
}

/* How many bytes does this ALU instruction add to the bundle? */

static unsigned
bytes_for_instruction(midgard_instruction *ains)
{
        if (ains->unit & UNITS_ANY_VECTOR)
                return sizeof(midgard_reg_info) + sizeof(midgard_vector_alu);
        else if (ains->unit == ALU_ENAB_BRANCH)
                return sizeof(midgard_branch_extended);
        else if (ains->compact_branch)
                return sizeof(ains->br_compact);
        else
                return sizeof(midgard_reg_info) + sizeof(midgard_scalar_alu);
}

/* Schedules, but does not emit, a single basic block. After scheduling, the
 * final tag and size of the block are known, which are necessary for
 * branching */

static midgard_bundle
schedule_bundle(compiler_context *ctx, midgard_block *block, midgard_instruction *ins, int *skip)
{
        int instructions_emitted = 0, packed_idx = 0;
        midgard_bundle bundle = { 0 };

        midgard_instruction *scheduled[5] = { NULL };

        uint8_t tag = ins->type;

        /* Default to the instruction's tag */
        bundle.tag = tag;

        switch (ins->type) {
        case TAG_ALU_4: {
                uint32_t control = 0;
                size_t bytes_emitted = sizeof(control);

                /* TODO: Constant combining */
                int index = 0, last_unit = 0;

                /* Previous instructions, for the purpose of parallelism */
                midgard_instruction *segment[4] = {0};
                int segment_size = 0;

                instructions_emitted = -1;
                midgard_instruction *pins = ins;

                unsigned constant_count = 0;

                for (;;) {
                        midgard_instruction *ains = pins;

                        /* Advance instruction pointer */
                        if (index) {
                                ains = mir_next_op(pins);
                                pins = ains;
                        }

                        /* Out-of-work condition */
                        if ((struct list_head *) ains == &block->instructions)
                                break;

                        /* Ensure that the chain can continue */
                        if (ains->type != TAG_ALU_4) break;

                        /* If there's already something in the bundle and we
                         * have weird scheduler constraints, break now */
                        if (ains->precede_break && index) break;

                        /* According to the presentation "The ARM
                         * Mali-T880 Mobile GPU" from HotChips 27,
                         * there are two pipeline stages. Branching
                         * position determined experimentally. Lines
                         * are executed in parallel:
                         *
                         * [ VMUL ] [ SADD ]
                         * [ VADD ] [ SMUL ] [ LUT ] [ BRANCH ]
                         *
                         * Verify that there are no ordering dependencies here.
                         *
                         * TODO: Allow for parallelism!!!
                         */

                        /* Pick a unit for it if it doesn't force a particular unit */

                        int unit = ains->unit;

                        if (!unit) {
                                int op = ains->alu.op;
                                int units = alu_opcode_props[op].props;
                                bool scalar = mir_is_scalar(ains);

                                if (!scalar) {
                                        if (last_unit >= UNIT_VADD) {
                                                if (units & UNIT_VLUT)
                                                        unit = UNIT_VLUT;
                                                else
                                                        break;
                                        } else {
                                                if ((units & UNIT_VMUL) && last_unit < UNIT_VMUL)
                                                        unit = UNIT_VMUL;
                                                else if ((units & UNIT_VADD) && !(control & UNIT_VADD))
                                                        unit = UNIT_VADD;
                                                else if (units & UNIT_VLUT)
                                                        unit = UNIT_VLUT;
                                                else
                                                        break;
                                        }
                                } else {
                                        if (last_unit >= UNIT_VADD) {
                                                if ((units & UNIT_SMUL) && !(control & UNIT_SMUL))
                                                        unit = UNIT_SMUL;
                                                else if (units & UNIT_VLUT)
                                                        unit = UNIT_VLUT;
                                                else
                                                        break;
                                        } else {
                                                if ((units & UNIT_VMUL) && (last_unit < UNIT_VMUL))
                                                        unit = UNIT_VMUL;
                                                else if ((units & UNIT_SADD) && !(control & UNIT_SADD) && !midgard_has_hazard(segment, segment_size, ains))
                                                        unit = UNIT_SADD;
                                                else if (units & UNIT_VADD)
                                                        unit = UNIT_VADD;
                                                else if (units & UNIT_SMUL)
                                                        unit = UNIT_SMUL;
                                                else if (units & UNIT_VLUT)
                                                        unit = UNIT_VLUT;
                                                else
                                                        break;
                                        }
                                }

                                assert(unit & units);
                        }

                        /* Late unit check, this time for encoding (not parallelism) */
                        if (unit <= last_unit) break;

                        /* Clear the segment */
                        if (last_unit < UNIT_VADD && unit >= UNIT_VADD)
                                segment_size = 0;

                        if (midgard_has_hazard(segment, segment_size, ains))
                                break;

                        /* We're good to go -- emit the instruction */
                        ains->unit = unit;

                        segment[segment_size++] = ains;

                        /* We try to reuse constants if possible, by adjusting
                         * the swizzle */

                        if (ains->has_blend_constant) {
                                /* Everything conflicts with the blend constant */
                                if (bundle.has_embedded_constants)
                                        break;

                                bundle.has_blend_constant = 1;
                                bundle.has_embedded_constants = 1;
                        } else if (ains->has_constants && ains->alu.reg_mode == midgard_reg_mode_16) {
                                /* TODO: DRY with the analysis pass */

                                if (bundle.has_blend_constant)
                                        break;

                                if (constant_count)
                                        break;

                                /* TODO: Fix packing XXX */
                                uint16_t *bundles = (uint16_t *) bundle.constants;
                                uint32_t *constants = (uint32_t *) ains->constants;

                                /* Copy them wholesale */
                                for (unsigned i = 0; i < 4; ++i)
                                        bundles[i] = constants[i];

                                bundle.has_embedded_constants = true;
                                constant_count = 4;
                        } else if (ains->has_constants) {
                                /* By definition, blend constants conflict with
                                 * everything, so if there are already
                                 * constants we break the bundle *now* */

                                if (bundle.has_blend_constant)
                                        break;

                                /* For anything but blend constants, we can do
                                 * proper analysis, however */

                                /* TODO: Mask by which are used */
                                uint32_t *constants = (uint32_t *) ains->constants;
                                uint32_t *bundles = (uint32_t *) bundle.constants;

                                uint32_t indices[4] = { 0 };
                                bool break_bundle = false;

                                for (unsigned i = 0; i < 4; ++i) {
                                        uint32_t cons = constants[i];
                                        bool constant_found = false;

                                        /* Search for the constant */
                                        for (unsigned j = 0; j < constant_count; ++j) {
                                                if (bundles[j] != cons)
                                                        continue;

                                                /* We found it, reuse */
                                                indices[i] = j;
                                                constant_found = true;
                                                break;
                                        }

                                        if (constant_found)
                                                continue;

                                        /* We didn't find it, so allocate it */
                                        unsigned idx = constant_count++;

                                        if (idx >= 4) {
                                                /* Uh-oh, out of space */
                                                break_bundle = true;
                                                break;
                                        }

                                        /* We have space, copy it in! */
                                        bundles[idx] = cons;
                                        indices[i] = idx;
                                }

                                if (break_bundle)
                                        break;

                                /* Cool, we have it in. So use indices as a
                                 * swizzle */

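                                /* e.g. if this op's constants landed in
                                 * embedded slots { 1, 0, 2, 2 }, its constant
                                 * sources are reswizzled to .yxzz */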
                                unsigned swizzle = SWIZZLE_FROM_ARRAY(indices);
                                unsigned r_constant = SSA_FIXED_REGISTER(REGISTER_CONSTANT);

                                if (ains->src[0] == r_constant)
                                        ains->alu.src1 = vector_alu_apply_swizzle(ains->alu.src1, swizzle);

                                if (ains->src[1] == r_constant)
                                        ains->alu.src2 = vector_alu_apply_swizzle(ains->alu.src2, swizzle);

                                bundle.has_embedded_constants = true;
                        }

                        if (ains->compact_branch) {
                                /* All of r0 has to be written out along with
                                 * the branch writeout */

                                if (ains->writeout && !can_writeout_fragment(ctx, scheduled, index, ctx->temp_count)) {
                                        /* We only work on full moves
                                         * at the beginning. We could
                                         * probably do better */
                                        if (index != 0)
                                                break;

                                        /* Inject a move */
                                        midgard_instruction ins = v_mov(0, blank_alu_src, SSA_FIXED_REGISTER(0));
                                        ins.unit = UNIT_VMUL;
                                        control |= ins.unit;

                                        /* TODO don't leak */
                                        midgard_instruction *move =
                                                mem_dup(&ins, sizeof(midgard_instruction));
                                        bytes_emitted += bytes_for_instruction(move);
                                        bundle.instructions[packed_idx++] = move;
                                }
                        }

                        bytes_emitted += bytes_for_instruction(ains);

                        /* Defer marking until after writing to allow for break */
                        scheduled[index] = ains;
                        control |= ains->unit;
                        last_unit = ains->unit;
                        ++instructions_emitted;
                        ++index;
                }

                int padding = 0;

                /* Pad ALU op to nearest quadword */

                if (bytes_emitted & 15) {
                        padding = 16 - (bytes_emitted & 15);
                        bytes_emitted += padding;
                }

                /* Constants must always be quadwords */
                if (bundle.has_embedded_constants)
                        bytes_emitted += 16;

                /* Size ALU instruction for tag */
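                /* e.g. a bundle totalling 28 bytes pads to 32 bytes (two
                 * quadwords) and is tagged TAG_ALU_8 */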
                bundle.tag = (TAG_ALU_4) + (bytes_emitted / 16) - 1;
                bundle.padding = padding;
                bundle.control = bundle.tag | control;

                break;
        }

        case TAG_LOAD_STORE_4: {
                /* A load/store bundle holds two instruction words at once. If
                 * we only have one instruction queued up, we need to NOP pad.
                 * Otherwise, we pack both in succession to save space and
                 * cycles -- letting them go in parallel -- and skip the next
                 * instruction. The usefulness of this optimisation is greatly
                 * dependent on the quality of the instruction scheduler.
                 */

                midgard_instruction *next_op = mir_next_op(ins);

                if ((struct list_head *) next_op != &block->instructions && next_op->type == TAG_LOAD_STORE_4) {
                        /* TODO: Concurrency check */
                        instructions_emitted++;
                }

                break;
        }

        case TAG_TEXTURE_4: {
                /* Which tag we use depends on the shader stage */
                bool in_frag = ctx->stage == MESA_SHADER_FRAGMENT;
                bundle.tag = in_frag ? TAG_TEXTURE_4 : TAG_TEXTURE_4_VTX;
                break;
        }

        default:
                unreachable("Unknown tag");
                break;
        }

        /* Copy the instructions into the bundle */
        bundle.instruction_count = instructions_emitted + 1 + packed_idx;

        midgard_instruction *uins = ins;
        for (; packed_idx < bundle.instruction_count; ++packed_idx) {
                assert(&uins->link != &block->instructions);
                bundle.instructions[packed_idx] = uins;
                uins = mir_next_op(uins);
        }

        *skip = instructions_emitted;

        return bundle;
}

/* Schedule a single block by iterating its instructions to create bundles.
 * As we go, tally up the bundle sizes to compute the block size. */

static void
schedule_block(compiler_context *ctx, midgard_block *block)
{
        util_dynarray_init(&block->bundles, NULL);

        block->quadword_count = 0;

        mir_foreach_instr_in_block(block, ins) {
                int skip;
                midgard_bundle bundle = schedule_bundle(ctx, block, ins, &skip);
                util_dynarray_append(&block->bundles, midgard_bundle, bundle);

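                /* The blend constant is patched into the last quadword of
                 * this bundle, so record its byte offset: (quadwords before
                 * the bundle + the bundle's own size - 1) * 16 */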
                if (bundle.has_blend_constant) {
                        unsigned offset = ctx->quadword_count + block->quadword_count + quadword_size(bundle.tag) - 1;
                        ctx->blend_constant_offset = offset * 0x10;
                }

                while(skip--)
                        ins = mir_next_op(ins);

                block->quadword_count += quadword_size(bundle.tag);
        }

        block->is_scheduled = true;
        ctx->quadword_count += block->quadword_count;
}

/* The following passes reorder MIR instructions to enable better scheduling */

static void
midgard_pair_load_store(compiler_context *ctx, midgard_block *block)
{
        mir_foreach_instr_in_block_safe(block, ins) {
                if (ins->type != TAG_LOAD_STORE_4) continue;

                /* We've found a load/store op. Check if next is also load/store. */
                midgard_instruction *next_op = mir_next_op(ins);
                if (&next_op->link != &block->instructions) {
                        if (next_op->type == TAG_LOAD_STORE_4) {
                                /* If so, we're done since we're a pair */
                                ins = mir_next_op(ins);
                                continue;
                        }

                        /* Maximum search distance to pair, to avoid register pressure disasters */
                        int search_distance = 8;

                        /* Otherwise, we have an orphaned load/store -- search for another load */
                        mir_foreach_instr_in_block_from(block, c, mir_next_op(ins)) {
                                /* Terminate search if necessary */
                                if (!(search_distance--)) break;

                                if (c->type != TAG_LOAD_STORE_4) continue;

                                /* We can only reorder if there are no sources */

                                bool deps = false;

                                for (unsigned s = 0; s < ARRAY_SIZE(ins->src); ++s)
                                        deps |= (c->src[s] != ~0);

                                if (deps)
                                        continue;

                                /* We found one! Move it up to pair and remove it from the old location */

                                mir_insert_instruction_before(ctx, ins, *c);
                                mir_remove_instruction(c);

                                break;
                        }
                }
        }
}

/* When we're 'squeezing down' the values in the IR, we maintain a hash
 * as such */

static unsigned
find_or_allocate_temp(compiler_context *ctx, unsigned hash)
{
        if (hash >= SSA_FIXED_MINIMUM)
                return hash;

        unsigned temp = (uintptr_t) _mesa_hash_table_u64_search(
                                ctx->hash_to_temp, hash + 1);

        if (temp)
                return temp - 1;

        /* If no temp is found, allocate one */
        temp = ctx->temp_count++;
        ctx->max_hash = MAX2(ctx->max_hash, hash);

        _mesa_hash_table_u64_insert(ctx->hash_to_temp,
                                    hash + 1, (void *) ((uintptr_t) temp + 1));

        return temp;
}

/* Reassigns numbering to get rid of gaps in the indices */

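/* For example, live indices { 2, 5, 9 } are remapped to { 0, 1, 2 }, leaving
 * ctx->temp_count equal to the number of distinct values in use */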
static void
mir_squeeze_index(compiler_context *ctx)
{
        /* Reset */
        ctx->temp_count = 0;
        /* TODO don't leak old hash_to_temp */
        ctx->hash_to_temp = _mesa_hash_table_u64_create(NULL);

        mir_foreach_instr_global(ctx, ins) {
                ins->dest = find_or_allocate_temp(ctx, ins->dest);

                for (unsigned i = 0; i < ARRAY_SIZE(ins->src); ++i)
                        ins->src[i] = find_or_allocate_temp(ctx, ins->src[i]);
        }
}

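/* Creates a load from (or store to) a 128-bit Thread Local Storage slot,
 * used to spill and unspill work registers */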
static midgard_instruction
v_load_store_scratch(
        unsigned srcdest,
        unsigned index,
        bool is_store,
        unsigned mask)
{
        /* We index by 32-bit vec4s */
        unsigned byte = (index * 4 * 4);

        midgard_instruction ins = {
                .type = TAG_LOAD_STORE_4,
                .mask = mask,
                .dest = ~0,
                .src = { ~0, ~0, ~0 },
                .load_store = {
                        .op = is_store ? midgard_op_st_int4 : midgard_op_ld_int4,
                        .swizzle = SWIZZLE_XYZW,

                        /* For register spilling - to thread local storage */
                        .arg_1 = 0xEA,
                        .arg_2 = 0x1E,

                        /* Splattered across, TODO combine logically */
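                        /* e.g. spill slot 3 starts at byte 48, giving
                         * varying_parameters = 96 and address = 0 */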
                        .varying_parameters = (byte & 0x1FF) << 1,
                        .address = (byte >> 9)
                },

                /* If we spill an unspill, RA goes into an infinite loop */
                .no_spill = true
        };

        if (is_store) {
                /* r0 = r26, r1 = r27 */
                assert(srcdest == SSA_FIXED_REGISTER(26) || srcdest == SSA_FIXED_REGISTER(27));
                ins.src[0] = srcdest;
        } else {
                ins.dest = srcdest;
        }

        return ins;
}

/* If register allocation fails, find the best spill node and spill it to fix
 * whatever the issue was. This spill node could be a work register (spilling
 * to thread local storage), but it could also simply be a special register
 * that needs to spill to become a work register. */

static void mir_spill_register(
                compiler_context *ctx,
                struct ra_graph *g,
                unsigned *spill_count)
{
        unsigned spill_index = ctx->temp_count;

        /* Our first step is to calculate spill cost to figure out the best
         * spill node. All nodes are equal in spill cost, but we can't spill
         * nodes written to from an unspill */

        for (unsigned i = 0; i < ctx->temp_count; ++i) {
                ra_set_node_spill_cost(g, i, 1.0);
        }

        mir_foreach_instr_global(ctx, ins) {
                if (ins->no_spill &&
                    ins->dest >= 0 &&
                    ins->dest < ctx->temp_count)
                        ra_set_node_spill_cost(g, ins->dest, -1.0);
        }

        int spill_node = ra_get_best_spill_node(g);

        if (spill_node < 0) {
                mir_print_shader(ctx);
                assert(0);
        }

        /* We have a spill node, so check the class. Work registers
         * legitimately spill to TLS, but special registers just spill to work
         * registers */

        unsigned class = ra_get_node_class(g, spill_node);
        bool is_special = (class >> 2) != REG_CLASS_WORK;
        bool is_special_w = (class >> 2) == REG_CLASS_TEXW;

        /* Allocate TLS slot (maybe) */
        unsigned spill_slot = !is_special ? (*spill_count)++ : 0;

        /* For TLS, replace all stores to the spilled node. For
         * special reads, just keep as-is; the class will be demoted
         * implicitly. For special writes, spill to a work register */

        if (!is_special || is_special_w) {
                if (is_special_w)
                        spill_slot = spill_index++;

                mir_foreach_instr_global_safe(ctx, ins) {
                        if (ins->dest != spill_node) continue;

                        midgard_instruction st;

                        if (is_special_w) {
                                st = v_mov(spill_node, blank_alu_src, spill_slot);
                                st.no_spill = true;
                        } else {
                                ins->dest = SSA_FIXED_REGISTER(26);
                                st = v_load_store_scratch(ins->dest, spill_slot, true, ins->mask);
                        }

                        /* Hint: don't rewrite this node */
                        st.hint = true;

                        mir_insert_instruction_before(ctx, mir_next_op(ins), st);

                        if (!is_special)
                                ctx->spills++;
                }
        }

        /* For special reads, figure out how many components we need */
        unsigned read_mask = 0;

        mir_foreach_instr_global_safe(ctx, ins) {
                read_mask |= mir_mask_of_read_components(ins, spill_node);
        }

        /* Insert a load from TLS before the first consecutive
         * use of the node, rewriting to use spilled indices to
         * break up the live range. Or, for special, insert a
         * move. Ironically the latter *increases* register
         * pressure, but the two uses of the spilling mechanism
         * are somewhat orthogonal. (special spilling is to use
         * work registers to back special registers; TLS
         * spilling is to use memory to back work registers) */

        mir_foreach_block(ctx, block) {
                bool consecutive_skip = false;
                unsigned consecutive_index = 0;

                mir_foreach_instr_in_block(block, ins) {
                        /* We can't rewrite the moves used to spill in the
                         * first place. These moves are hinted. */
                        if (ins->hint) continue;

                        if (!mir_has_arg(ins, spill_node)) {
                                consecutive_skip = false;
                                continue;
                        }

                        if (consecutive_skip) {
                                /* Rewrite */
                                mir_rewrite_index_src_single(ins, spill_node, consecutive_index);
                                continue;
                        }

                        if (!is_special_w) {
                                consecutive_index = ++spill_index;

                                midgard_instruction *before = ins;

                                /* For a csel, go back one more so as not to break up the bundle */
                                if (ins->type == TAG_ALU_4 && OP_IS_CSEL(ins->alu.op))
                                        before = mir_prev_op(before);

                                midgard_instruction st;

                                if (is_special) {
                                        /* Move */
                                        st = v_mov(spill_node, blank_alu_src, consecutive_index);
                                        st.no_spill = true;
                                } else {
                                        /* TLS load */
                                        st = v_load_store_scratch(consecutive_index, spill_slot, false, 0xF);
                                }

                                /* Mask the load based on the component count
                                 * actually needed to prevent RA loops */

                                st.mask = read_mask;

                                mir_insert_instruction_before(ctx, before, st);
                                // consecutive_skip = true;
                        } else {
                                /* Special writes already have their move spilled in */
                                consecutive_index = spill_slot;
                        }

                        /* Rewrite to use */
                        mir_rewrite_index_src_single(ins, spill_node, consecutive_index);

                        if (!is_special)
                                ctx->fills++;
                }
        }

        /* Reset hints */

        mir_foreach_instr_global(ctx, ins) {
                ins->hint = false;
        }
}

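/* Runs the pre-RA lowering passes, allocates registers (spilling until
 * allocation succeeds), and then schedules each block into bundles */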
void
schedule_program(compiler_context *ctx)
{
        struct ra_graph *g = NULL;
        bool spilled = false;
        int iter_count = 1000; /* max iterations */

        /* Number of 128-bit slots in memory we've spilled into */
        unsigned spill_count = 0;

        midgard_promote_uniforms(ctx, 16);

        mir_foreach_block(ctx, block) {
                midgard_pair_load_store(ctx, block);
        }

        /* Must be lowered right before RA */
        mir_squeeze_index(ctx);
        mir_lower_special_reads(ctx);

        /* Lowering can introduce some dead moves */

        mir_foreach_block(ctx, block) {
                midgard_opt_dead_move_eliminate(ctx, block);
        }

        do {
                if (spilled)
                        mir_spill_register(ctx, g, &spill_count);

                mir_squeeze_index(ctx);

                g = NULL;
                g = allocate_registers(ctx, &spilled);
        } while(spilled && ((iter_count--) > 0));

        /* We can simplify a bit after RA */

        mir_foreach_block(ctx, block) {
                midgard_opt_post_move_eliminate(ctx, block, g);
        }

        /* After RA finishes, we schedule all at once */

        mir_foreach_block(ctx, block) {
                schedule_block(ctx, block);
        }

        /* Finally, we create pipeline registers as a peephole pass after
         * scheduling. This isn't totally optimal, since there are cases where
         * the usage of pipeline registers can eliminate spills, but it does
         * save some power */

        mir_create_pipeline_registers(ctx);

        if (iter_count <= 0) {
                fprintf(stderr, "panfrost: Gave up allocating registers, rendering will be incomplete\n");
                assert(0);
        }

        /* Report spilling information. spill_count is in 128-bit slots (vec4 x
         * fp32), but tls_size is in bytes, so multiply by 16 */

        ctx->tls_size = spill_count * 16;

        install_registers(ctx, g);
}