src/gallium/drivers/lima/ir/gp/scheduler.c
1 /*
2 * Copyright (c) 2017 Lima Project
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sub license,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the
12 * next paragraph) shall be included in all copies or substantial portions
13 * of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
22 *
23 */
24
25 #include <limits.h>
26
27 #include "gpir.h"
28
29 /*
30 * GP scheduling algorithm (by Connor Abbott <cwabbott0@gmail.com>)
31 *
32 * The GP pipeline has three main stages:
33 *
34 * --------------------------------------------------------
35 * | |
36 * | Register/Attr/Temp Fetch |
37 * | |
38 * --------------------------------------------------------
39 * | | | | | | |
40 * | Mul0 | Mul1 | Add0 | Add1 | Cplx | Pass |
41 * | | | | | | |
42 * --------------------------------------------------------
43 * | | | |
44 * | Complex1 | Temp/Register/Varying | Pass |
45 * | Stage 2 | Store | Stage 2 |
46 * | | | |
47 * --------------------------------------------------------
48 *
49 * Because of this setup, storing a register has a latency of three cycles.
50 * Also, the register file is organized into 4-component vectors, and the
51 * load stage can only load two vectors at a time. Aside from these highly
52 * constrained register load/store units, there is an explicit bypass
53  * network, where each unit (mul0/mul1/etc.) can access the result of
54  * any unit from the previous two cycles directly, except for the complex
55 * unit whose result can only be accessed for one cycle (since it's expected
56 * to be used directly by the complex2 instruction in the following cycle).
57 *
58 * Because of the very restricted register file, and because only rarely are
59 * all the units in use at the same time, it can be very beneficial to use
60 * the unused units to "thread" a value from source to destination by using
61 * moves in the otherwise-unused units, without involving the register file
62 * at all. It's very difficult to fully exploit this with a traditional
63 * scheduler, so we need to do something a little un-traditional. The 512
64 * instruction limit means that for more complex shaders, we need to do as
65 * well as possible or else the app won't even work.
66 *
67 * The scheduler works by considering the bypass network as a kind of
68 * register file. It's a quite unusual register file, since registers have to
69 * be assigned "on the fly" as we schedule operations, but with some care, we
70 * can use something conceptually similar to a linear-scan allocator to
71 * successfully schedule nodes to instructions without running into
72 * conflicts.
73 *
74 * Values in the IR are separated into normal values, or "value registers",
75 * which is what normal nodes like add, mul, etc. produce, and which only
76 * live inside one basic block, and registers, which can span multiple basic
77 * blocks but have to be accessed via special load_reg/store_reg nodes. RA
78 * assigns physical registers to both value registers and normal registers,
79 * treating load_reg/store_reg as a move instruction, but these are only used
80 * directly for normal registers -- the physreg assigned to a value register
81 * is "fake," and is only used inside the scheduler. Before scheduling we
82  * insert read-after-write and write-after-read dependencies, even for value
83  * registers, as if we're going to use those, but then we throw them away.
84  * For example, if we had something like:
85 *
86 * (*)r2 = add (*)r1, (*)r2
87 * (*)r1 = load_reg r0
88 *
89 * we'd insert a write-after-read dependency between the add and load_reg,
90 * even though the starred registers aren't actually used by the scheduler
91 * after this step. This step is crucial since it guarantees that during any
92 * point in the schedule, the number of live registers + live value registers
93 * will never exceed the capacity of the register file and the bypass network
94 * combined. This is because each live register/value register will have a
95 * different fake number, thanks to the fake dependencies inserted before
96 * scheduling. This allows us to not have to worry about spilling to
97 * temporaries, which is only done ahead of time.
98 *
99 * The scheduler is a bottom-up scheduler. It keeps track of each live value
100 * register, and decides on-the-fly which value registers to keep in the
101 * bypass network and which to "spill" to registers. Of particular importance
102 * is the "ready list," which consists of "input nodes" (nodes that produce a
103 * value that can be consumed via the bypass network), both "partially ready"
104 * (only some of the uses have been scheduled) and "fully ready" (all uses
105 * have been scheduled), as well as other non-input nodes like register
106 * stores. Each input node on the ready list represents a live value register
107 * before the current instruction. There must be at most 11 such input nodes
108 * at all times, since there are only 11 slots in the next two instructions
109 * which can reach the current instruction.
110 *
111 * An input node is a "max node" if it has a use two cycles ago, which must be
112 * connected to a definition this cycle. Otherwise it may be a "next max node"
113 * if it will be a max node on the next instruction (i.e. it has a use at most
114 * one cycle ago), or it may be neither if all of its uses are this cycle. As
115 * we keep adding instructions to the front, input nodes graduate from
116 * neither, to next max, to max, unless we decide to insert a move to keep it
117 * alive longer, at which point any uses after the current instruction are
118 * rewritten to be uses of the move so that the original node returns to
119 * neither. The scheduler decides which nodes to try freely, but we have to
120 * reserve slots for two different reasons: (1) out of the 5 non-complex
121 * slots, we reserve a slot for each max node, so that we can connect a
122 * definition to the use 2 cycles ago. (2) Out of all 6 slots, we reserve a
123 * slot for every next-max node above 5, so that for the next instruction
124 * there are no more than 5 max nodes. When a max or next-max node gets
125 * scheduled, the corresponding reservation is reduced by one. At the end, we
126 * insert moves for every slot that was reserved. The reservation is actually
127  * managed by gpir_instr, and all we have to do is tell it how many to reserve
128 * at the beginning and then tell it which nodes are max/next-max nodes. When
129 * we start scheduling an instruction, there will be at most 5 max nodes
130 * thanks to the previous instruction's next-max reservation/move insertion.
131 * Since there are at most 11 total input nodes, if there are N max nodes,
132 * there are at most 11 - N next-max nodes, and therefore at most 11 - N - 5 =
133 * 6 - N slots need to be reserved for next-max nodes, and so at most
134 * 6 - N + N = 6 slots need to be reserved in total, exactly the total number
135 * of slots. So, thanks to the total input node restriction, we will never
136 * need to reserve too many slots.
137 *
138 * It sometimes happens that scheduling a given node will violate this total
139 * input node restriction, or that a reservation will mean that we can't
140 * schedule it. We first schedule a node "speculatively" to see if this is a
141 * problem. If some of the node's sources are loads, then we can schedule
142 * the node and its dependent loads in one swoop to avoid going over the
143 * pressure limit. If that fails, we can try to spill a ready or
144 * partially-ready input node to a register by rewriting all of its uses to
145 * refer to a register load. This removes it from the list of ready and
146 * partially ready input nodes as all of its uses are now unscheduled. If
147 * successful, we can then proceed with scheduling the original node. All of
148 * this happens "speculatively," meaning that afterwards the node is removed
149 * and the entire state of the scheduler is reverted to before it was tried, to
150 * ensure that we never get into an invalid state and run out of spots for
151 * moves. In try_nodes(), we try to schedule each node speculatively on the
152 * ready list, keeping only the nodes that could be successfully scheduled, so
153 * that when we finally decide which node to actually schedule, we know it
154 * will succeed. This is how we decide on the fly which values go in
155 * registers and which go in the bypass network. Note that "unspilling" a
156 * value is simply a matter of scheduling the store_reg instruction created
157 * when we spill.
158 *
159 * The careful accounting of live value registers, reservations for moves, and
160 * speculative scheduling guarantee that we never run into a failure case
161 * while scheduling. However, we need to make sure that this scheduler will
162 * not get stuck in an infinite loop, i.e. that we'll always make forward
163 * progress by eventually scheduling a non-move node. If we run out of value
164 * registers, then we may have to spill a node to a register. If we
165  * were to schedule one of the fully-ready nodes, then we'd have 11 + N live
166  * value registers (for some N > 0) before the current instruction. But since
167  * there are at most 64 + 11 live registers and value registers in total,
168  * thanks to the fake dependencies we inserted before scheduling, there are at
169  * most 64 - N live physical registers, and therefore at least N registers are
170  * available for spilling. Not all of these will be available immediately, since
171 * in order to spill a node to a given register we have to ensure that there
172 * are slots available to rewrite every use to a load instruction, and that
173 * may not be the case. There may also be intervening writes which prevent
174 * some registers from being used. However, these are all temporary problems,
175 * since as we create each instruction, we create additional register load
176 * slots that can be freely used for spilling, and we create more move nodes
177 * which means that the uses of the nodes we're trying to spill keep moving
178 * forward. This means that eventually, these problems will go away, at which
179 * point we'll be able to spill a node successfully, so eventually we'll be
180 * able to schedule the first node on the ready list.
181 */
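
/*
 * To make the reservation arithmetic above concrete, here is a small worked
 * example. This is an illustrative sketch only, not code used by the
 * scheduler; MAX2 is the usual mesa helper macro.
 *
 *    int max_nodes = 2;                       // N: nodes with a use 2 cycles ago
 *    int next_max_nodes = 11 - max_nodes;     // worst case: 9
 *    int reserved_for_max = max_nodes;        // one non-complex slot each
 *    int reserved_for_next_max =
 *       MAX2(next_max_nodes - 5, 0);          // 9 - 5 = 4
 *
 *    // 2 + 4 = 6: never more than the six ALU slots that exist.
 *    assert(reserved_for_max + reserved_for_next_max <= 6);
 */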
182
183 typedef struct {
184 /* This is the list of ready and partially-ready nodes. A partially-ready
185 * node must have at least one input dependency already scheduled.
186 */
187 struct list_head ready_list;
188
189 /* The number of ready or partially-ready nodes with at least one input
190 * dependency already scheduled. In other words, the number of live value
191 * registers. This must be at most 11.
192 */
193 int ready_list_slots;
194
195 /* The physical registers live into the current instruction. */
196 uint64_t live_physregs;
197
198 /* The current instruction. */
199 gpir_instr *instr;
200
201 /* The current basic block. */
202 gpir_block *block;
203
204 /* True if at least one node failed to schedule due to lack of available
205 * value registers.
206 */
207 bool try_spill_all;
208
209    /* The number of max nodes that need to be spilled in order to
210     * successfully schedule the instruction.
211     */
212 int max_node_spill_needed;
213
214    /* The number of max and next-max nodes that need to be spilled in order
215     * to successfully schedule the instruction.
216     */
217 int total_spill_needed;
218 } sched_ctx;
219
220 static int gpir_min_dist_alu(gpir_dep *dep)
221 {
222 switch (dep->pred->op) {
223 case gpir_op_load_uniform:
224 case gpir_op_load_temp:
225 case gpir_op_load_reg:
226 case gpir_op_load_attribute:
227 return 0;
228
229 case gpir_op_complex1:
230 return 2;
231
232 default:
233 return 1;
234 }
235 }
236
237 static int gpir_get_min_dist(gpir_dep *dep)
238 {
239 switch (dep->type) {
240 case GPIR_DEP_INPUT:
241 switch (dep->succ->op) {
242 case gpir_op_store_temp:
243 case gpir_op_store_reg:
244 case gpir_op_store_varying:
245 /* Stores must use an alu node as input. Also, complex1 takes two
246 * cycles, which means that its result cannot be stored to a register
247 * as part of the normal path, and therefore it must also have a move
248 * inserted.
249 */
250 if (dep->pred->type == gpir_node_type_load ||
251 dep->pred->op == gpir_op_complex1)
252 return INT_MAX >> 2;
253 else
254 return 0;
255
256 default:
257 return gpir_min_dist_alu(dep);
258 }
259
260 case GPIR_DEP_OFFSET:
261 assert(dep->succ->op == gpir_op_store_temp);
262 return gpir_min_dist_alu(dep);
263
264 case GPIR_DEP_READ_AFTER_WRITE:
265 if (dep->succ->op == gpir_op_load_temp &&
266 dep->pred->op == gpir_op_store_temp) {
267 return 4;
268 } else if (dep->succ->op == gpir_op_load_reg &&
269 dep->pred->op == gpir_op_store_reg) {
270 return 3;
271 } else if ((dep->pred->op == gpir_op_store_temp_load_off0 ||
272 dep->pred->op == gpir_op_store_temp_load_off1 ||
273 dep->pred->op == gpir_op_store_temp_load_off2) &&
274 dep->succ->op == gpir_op_load_uniform) {
275 return 4;
276 } else {
277 /* Fake dependency */
278 return 0;
279 }
280
281 case GPIR_DEP_WRITE_AFTER_READ:
282 return 0;
283 }
284
285 return 0;
286 }
287
288 static int gpir_max_dist_alu(gpir_dep *dep)
289 {
290 switch (dep->pred->op) {
291 case gpir_op_load_uniform:
292 case gpir_op_load_temp:
293 return 0;
294 case gpir_op_load_attribute:
295 return 1;
296 case gpir_op_load_reg:
297 if (dep->pred->sched.pos < GPIR_INSTR_SLOT_REG0_LOAD0 ||
298 dep->pred->sched.pos > GPIR_INSTR_SLOT_REG0_LOAD3)
299 return 0;
300 else
301 return 1;
302 case gpir_op_exp2_impl:
303 case gpir_op_log2_impl:
304 case gpir_op_rcp_impl:
305 case gpir_op_rsqrt_impl:
306 case gpir_op_store_temp_load_off0:
307 case gpir_op_store_temp_load_off1:
308 case gpir_op_store_temp_load_off2:
309 return 1;
310 case gpir_op_mov:
311 if (dep->pred->sched.pos == GPIR_INSTR_SLOT_COMPLEX)
312 return 1;
313 else
314 return 2;
315 default:
316 return 2;
317 }
318 }
319
320 static int gpir_get_max_dist(gpir_dep *dep)
321 {
322 switch (dep->type) {
323 case GPIR_DEP_INPUT:
324 switch (dep->succ->op) {
325 case gpir_op_store_temp:
326 case gpir_op_store_reg:
327 case gpir_op_store_varying:
328 return 0;
329
330 default:
331 return gpir_max_dist_alu(dep);
332 }
333
334 case GPIR_DEP_OFFSET:
335 assert(dep->succ->op == gpir_op_store_temp);
336 return gpir_max_dist_alu(dep);
337
338 default:
339 return INT_MAX >> 2; /* Don't want to overflow... */
340 }
341 }
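
/*
 * Taken together, gpir_get_min_dist() and gpir_get_max_dist() bound the
 * window of instruction indices in which a predecessor may be placed
 * relative to an already-scheduled successor (see the index checks in
 * _try_place_node() below). A rough sketch of that check, with illustrative
 * variable names, assuming a plain ALU value feeding another ALU op
 * (min dist 1, max dist 2) and remembering that indices grow as we move
 * earlier in the program since we schedule bottom-up:
 *
 *    int lo = succ_instr_index + 1;   // gpir_get_min_dist() for an ALU dep
 *    int hi = succ_instr_index + 2;   // gpir_get_max_dist(): bypass network
 *    bool ok = pred_instr_index >= lo && pred_instr_index <= hi;
 */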
342
343 static void schedule_update_distance(gpir_node *node)
344 {
345 if (gpir_node_is_leaf(node)) {
346 node->sched.dist = 0;
347 return;
348 }
349
350 gpir_node_foreach_pred(node, dep) {
351 gpir_node *pred = dep->pred;
352
353 if (pred->sched.dist < 0)
354 schedule_update_distance(pred);
355
356 int dist = pred->sched.dist + gpir_min_dist_alu(dep);
357 if (node->sched.dist < dist)
358 node->sched.dist = dist;
359 }
360 }
361
362 static bool gpir_is_input_node(gpir_node *node)
363 {
364 gpir_node_foreach_succ(node, dep) {
365 if (dep->type == GPIR_DEP_INPUT)
366 return true;
367 }
368 return false;
369 }
370
371
372 /* Get the number of slots required for a node on the ready list.
373 */
374 static int gpir_get_slots_required(gpir_node *node)
375 {
376 if (!gpir_is_input_node(node))
377 return 0;
378
379 /* Note that we assume every node only consumes one slot, even dual-slot
380 * instructions. While dual-slot instructions may consume more than one
381 * slot, we can always safely insert a move if it turns out that there
382 * isn't enough space for them. There's the risk that we get stuck in an
383 * infinite loop if all the fully ready nodes are dual-slot nodes, but we
384 * rely on spilling to registers to save us here.
385 */
386 return 1;
387 }
388
389 static void verify_ready_list(sched_ctx *ctx)
390 {
391 list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
392 if (!gpir_is_input_node(node)) {
393 assert(node->sched.ready);
394 }
395
396 if (node->sched.ready) {
397 /* Every successor must have been scheduled */
398 gpir_node_foreach_succ(node, dep) {
399 assert(dep->succ->sched.instr);
400 }
401 } else {
402 /* There must be at least one successor that's not scheduled. */
403 bool unscheduled = false;
404 gpir_node_foreach_succ(node, dep) {
405 unscheduled |= !(dep->succ->sched.instr);
406 }
407
408 assert(unscheduled);
409 }
410 }
411 }
412
413 static void schedule_insert_ready_list(sched_ctx *ctx,
414 gpir_node *insert_node)
415 {
416    /* A node can be fully ready or partially ready:
417     *   fully ready: all of its successors have been scheduled
418     *   partially ready: some of its input successors have been scheduled
419     *
420     * Both fully-ready and partially-ready nodes need to be inserted into
421     * the ready list, but we only schedule a move node for a partially-ready
422     * node.
423     */
424 bool ready = true, insert = false;
425 gpir_node_foreach_succ(insert_node, dep) {
426 gpir_node *succ = dep->succ;
427 if (succ->sched.instr) {
428 if (dep->type == GPIR_DEP_INPUT)
429 insert = true;
430 }
431 else
432 ready = false;
433 }
434
435 insert_node->sched.ready = ready;
436 /* for root node */
437 insert |= ready;
438
439 if (!insert || insert_node->sched.inserted)
440 return;
441
442 struct list_head *insert_pos = &ctx->ready_list;
443 list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
444 if (insert_node->sched.dist > node->sched.dist ||
445 gpir_op_infos[insert_node->op].schedule_first) {
446 insert_pos = &node->list;
447 break;
448 }
449 }
450
451 list_addtail(&insert_node->list, insert_pos);
452 insert_node->sched.inserted = true;
453 ctx->ready_list_slots += gpir_get_slots_required(insert_node);
454 }
455
456 static int gpir_get_max_start(gpir_node *node)
457 {
458 int max_start = 0;
459
460    /* find the max start instr constrained by all successors */
461 gpir_node_foreach_succ(node, dep) {
462 gpir_node *succ = dep->succ;
463 if (!succ->sched.instr)
464 continue;
465
466 int start = succ->sched.instr->index + gpir_get_min_dist(dep);
467 if (start > max_start)
468 max_start = start;
469 }
470
471 return max_start;
472 }
473
474 static int gpir_get_min_end(gpir_node *node)
475 {
476 int min_end = INT_MAX;
477
478    /* find the min end instr constrained by all successors */
479 gpir_node_foreach_succ(node, dep) {
480 gpir_node *succ = dep->succ;
481 if (!succ->sched.instr)
482 continue;
483
484 int end = succ->sched.instr->index + gpir_get_max_dist(dep);
485 if (end < min_end)
486 min_end = end;
487 }
488
489 return min_end;
490 }
491
492 static gpir_node *gpir_sched_instr_has_load(gpir_instr *instr, gpir_node *node)
493 {
494 gpir_load_node *load = gpir_node_to_load(node);
495
496 for (int i = GPIR_INSTR_SLOT_REG0_LOAD0; i <= GPIR_INSTR_SLOT_MEM_LOAD3; i++) {
497 if (!instr->slots[i])
498 continue;
499
500 gpir_load_node *iload = gpir_node_to_load(instr->slots[i]);
501 if (load->node.op == iload->node.op &&
502 load->index == iload->index &&
503 load->component == iload->component)
504 return &iload->node;
505 }
506 return NULL;
507 }
508
509 /* Simply place the node into the given instruction without trying to deal
510  * with liveness or the ready list. This will only fail if the node cannot
511  * be placed due to a lack of available slots. In addition to normal
512 * node placement, this is also used for placing loads when spilling to
513 * registers.
514 */
515 static bool _try_place_node(sched_ctx *ctx, gpir_instr *instr, gpir_node *node)
516 {
517 if (node->type == gpir_node_type_load) {
518 gpir_node *load = gpir_sched_instr_has_load(instr, node);
519 if (load) {
520 /* This node may have a store as a successor, in which case we have to
521 * fail it exactly like below in order to later create a move node in
522 * between.
523 */
524 if (instr->index < gpir_get_max_start(node))
525 return false;
526
527 gpir_debug("same load %d in instr %d for node %d\n",
528 load->index, instr->index, node->index);
529
530        /* Not a real merge of the two nodes; just pretend this one was scheduled in the same place. */
531 node->sched.instr = load->sched.instr;
532 node->sched.pos = load->sched.pos;
533 return true;
534 }
535 }
536
537 node->sched.instr = instr;
538
539 int max_node_spill_needed = INT_MAX;
540 int total_spill_needed = INT_MAX;
541 int *slots = gpir_op_infos[node->op].slots;
542 for (int i = 0; slots[i] != GPIR_INSTR_SLOT_END; i++) {
543 node->sched.pos = slots[i];
544 if (instr->index >= gpir_get_max_start(node) &&
545 instr->index <= gpir_get_min_end(node) &&
546 gpir_instr_try_insert_node(instr, node))
547 return true;
548 if (ctx->instr->non_cplx_slot_difference ||
549 ctx->instr->slot_difference) {
550 /* If one of these fields is non-zero, then we could insert the node
551 * here after spilling. To get an accurate count of how many nodes we
552 * need to spill, we need to choose one of the positions where there
553 * were nonzero slot differences, preferably one with the smallest
554 * difference (so we don't have to spill as much).
555 */
556 if (ctx->instr->non_cplx_slot_difference < max_node_spill_needed ||
557 ctx->instr->slot_difference < total_spill_needed) {
558 max_node_spill_needed = ctx->instr->non_cplx_slot_difference;
559 total_spill_needed = ctx->instr->slot_difference;
560 }
561 }
562 }
563
564 if (max_node_spill_needed != INT_MAX) {
565 /* Indicate how many spill nodes are needed. */
566 ctx->max_node_spill_needed = MAX2(ctx->max_node_spill_needed,
567 max_node_spill_needed);
568 ctx->total_spill_needed = MAX2(ctx->total_spill_needed,
569 total_spill_needed);
570 }
571 node->sched.instr = NULL;
572 node->sched.pos = -1;
573 return false;
574 }
575
576 /* Try to place just the node given, updating the ready list. If "speculative"
577  * is true, then this is part of the pre-commit phase. If false, then we have
578 * committed to placing this node, so update liveness and ready list
579 * information.
580 */
581
582 static bool schedule_try_place_node(sched_ctx *ctx, gpir_node *node,
583 bool speculative)
584 {
585 if (!_try_place_node(ctx, ctx->instr, node)) {
586 if (!speculative)
587 gpir_debug("failed to place %d\n", node->index);
588 return false;
589 }
590
591 ctx->ready_list_slots -= gpir_get_slots_required(node);
592
593 if (!speculative) {
594 gpir_debug("placed node %d\n", node->index);
595
596 /* We assume here that writes are placed before reads. If this changes,
597 * then this needs to be updated.
598 */
599 if (node->op == gpir_op_store_reg) {
600 gpir_store_node *store = gpir_node_to_store(node);
601 ctx->live_physregs &=
602 ~(1ull << (4 * store->index + store->component));
603 if (store->child->sched.physreg_store == store)
604 store->child->sched.physreg_store = NULL;
605 }
606
607 if (node->op == gpir_op_load_reg) {
608 gpir_load_node *load = gpir_node_to_load(node);
609 ctx->live_physregs |=
610 (1ull << (4 * load->index + load->component));
611 }
612
613 list_del(&node->list);
614 list_add(&node->list, &ctx->block->node_list);
615 gpir_node_foreach_pred(node, dep) {
616 gpir_node *pred = dep->pred;
617 schedule_insert_ready_list(ctx, pred);
618 }
619 } else {
620 gpir_node_foreach_pred(node, dep) {
621 gpir_node *pred = dep->pred;
622 if (!pred->sched.inserted && dep->type == GPIR_DEP_INPUT)
623 ctx->ready_list_slots += gpir_get_slots_required(pred);
624 }
625 }
626
627 return true;
628 }
629
630 /* Create a new node with "node" as the child, replace all uses of "node" with
631 * this new node, and replace "node" with it in the ready list.
632 */
633 static gpir_node *create_replacement(sched_ctx *ctx, gpir_node *node,
634 gpir_op op)
635 {
636
637 gpir_alu_node *new_node = gpir_node_create(node->block, op);
638 if (unlikely(!new_node))
639 return NULL;
640
641 new_node->children[0] = node;
642 new_node->num_child = 1;
643
644 new_node->node.sched.instr = NULL;
645 new_node->node.sched.pos = -1;
646 new_node->node.sched.dist = node->sched.dist;
647 new_node->node.sched.max_node = node->sched.max_node;
648 new_node->node.sched.next_max_node = node->sched.next_max_node;
649 new_node->node.sched.complex_allowed = node->sched.complex_allowed;
650
651 ctx->ready_list_slots--;
652 list_del(&node->list);
653 node->sched.max_node = false;
654 node->sched.next_max_node = false;
655 node->sched.ready = false;
656 node->sched.inserted = false;
657 gpir_node_replace_succ(&new_node->node, node);
658 gpir_node_add_dep(&new_node->node, node, GPIR_DEP_INPUT);
659 schedule_insert_ready_list(ctx, &new_node->node);
660 return &new_node->node;
661 }
662
663 static gpir_node *create_move(sched_ctx *ctx, gpir_node *node)
664 {
665 gpir_node *move = create_replacement(ctx, node, gpir_op_mov);
666 gpir_debug("create move %d for %d\n", move->index, node->index);
667 return move;
668 }
669
670 static gpir_node *create_postlog2(sched_ctx *ctx, gpir_node *node)
671 {
672 assert(node->op == gpir_op_complex1);
673 gpir_node *postlog2 = create_replacement(ctx, node, gpir_op_postlog2);
674 gpir_debug("create postlog2 %d for %d\n", postlog2->index, node->index);
675 return postlog2;
676 }
677
678 /* Once we schedule the successor, would the predecessor be fully ready? */
679 static bool pred_almost_ready(gpir_dep *dep)
680 {
681 bool fully_ready = true;
682 gpir_node_foreach_succ(dep->pred, other_dep) {
683 gpir_node *succ = other_dep->succ;
684 if (!succ->sched.instr && dep->succ != other_dep->succ) {
685 fully_ready = false;
686 break;
687 }
688 }
689
690 return fully_ready;
691 }
692
693 /* Recursively try to schedule a node and all its dependent nodes that can fit
694 * in the same instruction. There is a simple heuristic scoring system to try
695 * to group together nodes that load different components of the same input,
696 * to avoid bottlenecking for operations like matrix multiplies that are
697 * mostly input-bound.
698 */
699
700 static int _schedule_try_node(sched_ctx *ctx, gpir_node *node, bool speculative)
701 {
702 if (!schedule_try_place_node(ctx, node, speculative))
703 return INT_MIN;
704
705 int score = 0;
706
707 gpir_node_foreach_pred(node, dep) {
708 if (!gpir_is_input_node(dep->pred))
709 continue;
710
711 int pred_score = INT_MIN;
712 if (pred_almost_ready(dep)) {
713 if (dep->pred->type == gpir_node_type_load ||
714 node->type == gpir_node_type_store) {
715 pred_score = _schedule_try_node(ctx, dep->pred, speculative);
716 }
717 }
718 if (dep->pred->type == gpir_node_type_load ||
719 node->type == gpir_node_type_store) {
720 if (pred_score == INT_MIN) {
721 if (node->op == gpir_op_mov) {
722 /* The only moves on the ready list are for loads that we
723 * couldn't schedule immediately, as created below. If we
724 * couldn't schedule the load, there's no point scheduling
725 * the move. The normal move threading logic will ensure
726 * that another move is created if we're about to go too far
727 * from the uses of this move.
728 */
729 assert(speculative);
730 return INT_MIN;
731 } else if (!speculative && dep->pred->type == gpir_node_type_load) {
732 /* We couldn't schedule the load right away, so it will have
733 * to happen in some earlier instruction and then be moved
734 * into a value register and threaded to the use by "node".
735 * We create the move right away, so that later we'll fail
736 * to schedule it if there isn't a slot for a move
737 * available.
738 */
739 create_move(ctx, dep->pred);
740 }
741 /* Penalize nodes whose dependent ops we couldn't schedule.
742 */
743 score--;
744 } else {
745 score += pred_score;
746 continue;
747 }
748 }
749 }
750
751 return score;
752 }
753
754 /* If we speculatively tried a node, undo everything.
755 */
756
757 static void schedule_undo_node(sched_ctx *ctx, gpir_node *node)
758 {
759 gpir_instr_remove_node(ctx->instr, node);
760
761 gpir_node_foreach_pred(node, dep) {
762 gpir_node *pred = dep->pred;
763 if (pred->sched.instr) {
764 schedule_undo_node(ctx, pred);
765 }
766 }
767 }
768
769 /* Try to schedule a node. We also try to schedule any predecessors that can
770 * be part of the same instruction. If "speculative" is true, then we don't
771 * actually change any state, only returning the score were the node to be
772 * scheduled, with INT_MIN meaning "cannot be scheduled at all".
773 */
774 static int schedule_try_node(sched_ctx *ctx, gpir_node *node, bool speculative)
775 {
776 int prev_slots = ctx->ready_list_slots;
777
778 int score = _schedule_try_node(ctx, node, speculative);
779
780 if (ctx->ready_list_slots > GPIR_VALUE_REG_NUM) {
781 assert(speculative);
782 ctx->total_spill_needed = MAX2(ctx->total_spill_needed,
783 ctx->ready_list_slots - GPIR_VALUE_REG_NUM);
784 score = INT_MIN;
785 }
786
787 if (speculative) {
788 ctx->ready_list_slots = prev_slots;
789 if (node->sched.instr)
790 schedule_undo_node(ctx, node);
791 }
792
793 return score;
794 }
795
796 /* This is called when we want to spill "node" by inserting loads at its uses.
797 * It returns all the possible registers we can use so that all the loads will
798  * successfully be inserted. It also returns the first instruction we'll
799  * need to insert a load for.
800 */
801
802 static uint64_t get_available_regs(sched_ctx *ctx, gpir_node *node,
803 int *min_index)
804 {
805 uint64_t available = ~0ull;
806 gpir_node_foreach_succ(node, dep) {
807 if (dep->type != GPIR_DEP_INPUT)
808 continue;
809
810 gpir_node *use = dep->succ;
811 gpir_instr *instr = use->sched.instr;
812
813 if (!instr) {
814 /* This use isn't scheduled, so no need to spill it. */
815 continue;
816 }
817
818 if (use->type == gpir_node_type_store) {
819 /* We're trying to spill something that was recently stored... just
820 * bail out.
821 */
822 return 0;
823 }
824
825 if (use->op == gpir_op_mov && instr == ctx->instr) {
826 /* We try to spill the sources of this move, so we can free up space
827 * in the current instruction.
828 *
829 * TODO: should we go back further? It might let us schedule the
830 * write earlier in some cases, but then we might fail to spill.
831 */
832 available &= get_available_regs(ctx, use, min_index);
833 } else {
834 if (instr->index < *min_index)
835 *min_index = instr->index;
836
837 uint64_t use_available = 0;
838
839 if (instr->reg0_use_count == 0)
840 use_available = ~0ull;
841 else if (!instr->reg0_is_attr)
842             use_available = 0xfull << (4 * instr->reg0_index);
843
844 if (instr->reg1_use_count == 0)
845 use_available = ~0ull;
846 else
847             use_available |= 0xfull << (4 * instr->reg1_index);
848
849 available &= use_available;
850 }
851 }
852
853 return available;
854 }
855
856 /* Using "min_index" returned by get_available_regs(), figure out which
857 * registers are killed by a write after or during the current instruction and
858  * hence can't be used for spilling. Writes that haven't been scheduled yet
859 * should be reflected in live_physregs.
860 */
861
862 static uint64_t get_killed_regs(sched_ctx *ctx, int min_index)
863 {
864 uint64_t killed = 0;
865
866 list_for_each_entry(gpir_instr, instr, &ctx->block->instr_list, list) {
867 if (instr->index <= min_index)
868 break;
869
870 for (int slot = GPIR_INSTR_SLOT_STORE0; slot <= GPIR_INSTR_SLOT_STORE3;
871 slot++) {
872 if (!instr->slots[slot])
873 continue;
874
875 gpir_store_node *store = gpir_node_to_store(instr->slots[slot]);
876 if (store->node.op != gpir_op_store_reg)
877 continue;
878
879 killed |= 1ull << (4 * store->index + store->component);
880 }
881 }
882
883 return killed;
884 }
885
886 /* Actually spill a node so that it is no longer in the ready list. Note that
887 * this must exactly follow the logic of get_available_regs() or else the
888 * loads could fail to schedule.
889 */
890
891 static void spill_node(sched_ctx *ctx, gpir_node *node, gpir_store_node *store)
892 {
893 gpir_node_foreach_succ_safe(node, dep) {
894 if (dep->type != GPIR_DEP_INPUT)
895 continue;
896
897 gpir_node *use = dep->succ;
898 gpir_instr *instr = use->sched.instr;
899
900 if (!instr)
901 continue;
902
903 if (use->op == gpir_op_mov && instr == ctx->instr) {
904 spill_node(ctx, use, store);
905 } else {
906 gpir_load_node *load = gpir_node_create(ctx->block, gpir_op_load_reg);
907 load->index = store->index;
908 load->component = store->component;
909 list_add(&load->node.list, &ctx->block->node_list);
910 gpir_node_replace_child(dep->succ, dep->pred, &load->node);
911 gpir_node_replace_pred(dep, &load->node);
912 gpir_node_add_dep(&load->node, &store->node, GPIR_DEP_READ_AFTER_WRITE);
913 gpir_debug("spilling use %d of node %d to load node %d\n",
914 use->index, node->index, load->node.index);
915 ASSERTED bool result = _try_place_node(ctx, use->sched.instr, &load->node);
916 assert(result);
917 }
918 }
919
920 if (node->op == gpir_op_mov) {
921 /* We replaced all the uses of the move, so it's dead now. */
922 gpir_instr_remove_node(node->sched.instr, node);
923 gpir_node_delete(node);
924 } else {
925 /* We deleted all the uses of the node except the store, so it's not
926 * live anymore.
927 */
928 list_del(&node->list);
929 node->sched.inserted = false;
930 ctx->ready_list_slots--;
931 if (node->sched.max_node) {
932 node->sched.max_node = false;
933 ctx->instr->alu_num_slot_needed_by_max--;
934 }
935 if (node->sched.next_max_node) {
936 node->sched.next_max_node = false;
937 ctx->instr->alu_num_unscheduled_next_max--;
938 }
939 }
940 }
941
942 static bool used_by_store(gpir_node *node, gpir_instr *instr)
943 {
944 gpir_node_foreach_succ(node, dep) {
945 if (dep->type != GPIR_DEP_INPUT)
946 continue;
947
948 if (dep->succ->type == gpir_node_type_store &&
949 dep->succ->sched.instr == instr)
950 return true;
951 }
952
953 return false;
954 }
955
956 static gpir_node *consuming_postlog2(gpir_node *node)
957 {
958 if (node->op != gpir_op_complex1)
959 return NULL;
960
961 gpir_node_foreach_succ(node, dep) {
962 if (dep->type != GPIR_DEP_INPUT)
963 continue;
964 if (dep->succ->op == gpir_op_postlog2)
965 return dep->succ;
966 else
967 return NULL;
968 }
969
970 return NULL;
971 }
972
973 static bool try_spill_node(sched_ctx *ctx, gpir_node *node)
974 {
975 assert(node->op != gpir_op_mov);
976
977 if (used_by_store(node, ctx->instr))
978 return false;
979
980 gpir_debug("trying to spill %d\n", node->index);
981
982 int min_instr = INT_MAX;
983 uint64_t available = get_available_regs(ctx, node, &min_instr);
984 available &= ~get_killed_regs(ctx, min_instr);
985
986 if (node->sched.physreg_store) {
987 gpir_store_node *store = node->sched.physreg_store;
988 if (!(available & (1ull << (4 * store->index + store->component))))
989 return false;
990 } else {
991 available &= ~ctx->live_physregs;
992
993 if (available == 0)
994 return false;
995
996       /* Don't spill complex1 if it's used by a postlog2; instead, turn the
997        * postlog2 into a move, replace the complex1 with a new postlog2, and
998        * spill that. The store needs a move anyway, so the postlog2 is usually free.
999 */
1000 gpir_node *postlog2 = consuming_postlog2(node);
1001 if (postlog2) {
1002 postlog2->op = gpir_op_mov;
1003 node = create_postlog2(ctx, node);
1004 }
1005
1006 /* TODO: use a better heuristic for choosing an available register? */
1007 int physreg = ffsll(available) - 1;
1008
1009 ctx->live_physregs |= (1ull << physreg);
1010
1011 /* TODO: when we support multiple basic blocks, there may be register
1012 * loads/stores to this register other than this one that haven't been
1013 * scheduled yet so we may need to insert write-after-read dependencies.
1014 */
1015 gpir_store_node *store = gpir_node_create(ctx->block, gpir_op_store_reg);
1016 store->index = physreg / 4;
1017 store->component = physreg % 4;
1018 store->child = node;
1019 store->node.sched.max_node = false;
1020 store->node.sched.next_max_node = false;
1021 store->node.sched.complex_allowed = false;
1022 store->node.sched.pos = -1;
1023 store->node.sched.instr = NULL;
1024 store->node.sched.inserted = false;
1025 store->node.sched.dist = node->sched.dist;
1026 if (node->op == gpir_op_complex1) {
1027 /* Complex1 cannot be directly stored, and has a latency of 2 */
1028 store->node.sched.dist += 2;
1029 }
1030 node->sched.physreg_store = store;
1031 gpir_node_add_dep(&store->node, node, GPIR_DEP_INPUT);
1032 node->sched.ready = false;
1033 schedule_insert_ready_list(ctx, &store->node);
1034 }
1035
1036 gpir_debug("spilling %d to $%d.%c, store %d\n", node->index,
1037 node->sched.physreg_store->index,
1038 "xyzw"[node->sched.physreg_store->component],
1039 node->sched.physreg_store->node.index);
1040
1041 spill_node(ctx, node, node->sched.physreg_store);
1042
1043 return true;
1044 }
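
/*
 * For reference, a small illustrative sketch (not used by the scheduler) of
 * how live_physregs and the spill code above pack the 16 vec4 physical
 * registers into a single 64-bit mask, one bit per component, i.e.
 * physreg = 4 * index + component. For example, bit 13 corresponds to $3.y:
 *
 *    int physreg = 13;
 *    int index = physreg / 4;         // 3
 *    int component = physreg % 4;     // 1, i.e. "xyzw"[1] == 'y'
 *    uint64_t bit = 1ull << physreg;  // the bit set/tested in live_physregs
 */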
1045
1046 static bool try_spill_nodes(sched_ctx *ctx, gpir_node *orig_node)
1047 {
1048 /* First, try to spill max nodes. */
1049 list_for_each_entry_safe_rev(gpir_node, node, &ctx->ready_list, list) {
1050 if (ctx->max_node_spill_needed <= 0)
1051 break;
1052
1053 /* orig_node is the node we're trying to schedule, so spilling it makes
1054 * no sense. Also don't try to spill any nodes in front of it, since
1055 * they might be scheduled instead.
1056 */
1057 if (node == orig_node)
1058 break;
1059
1060 if (node->op == gpir_op_mov) {
1061 /* Don't try to spill loads, since that only adds another load and
1062 * store which is likely pointless.
1063 */
1064 continue;
1065 }
1066
1067 if (!gpir_is_input_node(node) || !node->sched.max_node)
1068 continue;
1069
1070 if (try_spill_node(ctx, node)) {
1071 ctx->max_node_spill_needed--;
1072 ctx->total_spill_needed--;
1073 }
1074 }
1075
1076 /* Now, try to spill the remaining nodes. */
1077 list_for_each_entry_safe_rev(gpir_node, node, &ctx->ready_list, list) {
1078 if (ctx->total_spill_needed <= 0)
1079 break;
1080
1081 if (node == orig_node)
1082 break;
1083
1084 if (node->op == gpir_op_mov)
1085 continue;
1086
1087 if (!gpir_is_input_node(node) ||
1088 !(node->sched.max_node || node->sched.next_max_node))
1089 continue;
1090
1091 if (try_spill_node(ctx, node))
1092 ctx->total_spill_needed--;
1093 }
1094
1095 return ctx->total_spill_needed <= 0 && ctx->max_node_spill_needed <= 0;
1096 }
1097
1098 static int gpir_get_curr_ready_list_slots(sched_ctx *ctx)
1099 {
1100 int total = 0;
1101 list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
1102 total += gpir_get_slots_required(node);
1103 }
1104
1105 return total;
1106 }
1107
1108 /* What gpir_get_min_end() would return if node were replaced with a move
1109 * instruction not in the complex slot. Normally this is 2 + min_end, except
1110 * for some store instructions which must have the move node in the same
1111 * instruction.
1112 */
1113 static int gpir_get_min_end_as_move(gpir_node *node)
1114 {
1115 int min = INT_MAX;
1116 gpir_node_foreach_succ(node, dep) {
1117 gpir_node *succ = dep->succ;
1118 if (succ->sched.instr && dep->type == GPIR_DEP_INPUT) {
1119 switch (succ->op) {
1120 case gpir_op_store_temp:
1121 case gpir_op_store_reg:
1122 case gpir_op_store_varying:
1123 continue;
1124 default:
1125 break;
1126 }
1127 if (min > succ->sched.instr->index + 2)
1128 min = succ->sched.instr->index + 2;
1129 }
1130 }
1131 return min;
1132 }
1133
1134 /* The second source for add0, add1, mul0, and mul1 units cannot be complex.
1135 * The hardware overwrites the add second sources with 0 and mul second
1136 * sources with 1. This can be a problem if we need to insert more next-max
1137 * moves but we only have values that can't use the complex unit for moves.
1138 *
1139 * Fortunately, we only need to insert a next-max move if there are more than
1140 * 5 next-max nodes, but there are only 4 sources in the previous instruction
1141 * that make values not complex-capable, which means there can be at most 4
1142 * non-complex-capable values. Hence there will always be at least two values
1143 * that can be rewritten to use a move in the complex slot. However, we have
1144 * to be careful not to waste those values by putting both of them in a
1145 * non-complex slot. This is handled for us by gpir_instr, which will reject
1146 * such instructions. We just need to tell it which nodes can use complex, and
1147 * it will do the accounting to figure out what is safe.
1148 */
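
/*
 * A quick sanity check of the counting argument above, with illustrative
 * numbers only:
 *
 *    int next_max_nodes = 6;       // a next-max move is only needed above 5
 *    int non_complex_capable = 4;  // at most 4 "second source" uses
 *    int complex_capable = next_max_nodes - non_complex_capable;
 *    assert(complex_capable >= 2); // something is always left for the complex slot
 */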
1149
1150 static bool can_use_complex(gpir_node *node)
1151 {
1152 gpir_node_foreach_succ(node, dep) {
1153 if (dep->type != GPIR_DEP_INPUT)
1154 continue;
1155
1156 gpir_node *succ = dep->succ;
1157 if (succ->type != gpir_node_type_alu)
1158 continue;
1159
1160 /* Note: this must be consistent with gpir_codegen_{mul,add}_slot{0,1}
1161 */
1162 gpir_alu_node *alu = gpir_node_to_alu(succ);
1163 switch (alu->node.op) {
1164 case gpir_op_complex1:
1165 /* complex1 puts its third source in the fourth slot */
1166 if (alu->children[1] == node || alu->children[2] == node)
1167 return false;
1168 break;
1169 case gpir_op_complex2:
1170 /* complex2 has its source duplicated, since it actually takes two
1171 * sources but we only ever use it with both sources the same. Hence
1172 * its source can never be the complex slot.
1173 */
1174 return false;
1175 case gpir_op_select:
1176 /* Select has its sources rearranged */
1177 if (alu->children[0] == node)
1178 return false;
1179 break;
1180 default:
1181 assert(alu->num_child <= 2);
1182 if (alu->num_child == 2 && alu->children[1] == node)
1183 return false;
1184 break;
1185 }
1186 }
1187
1188 return true;
1189 }
1190
1191 /* Initialize node->sched.max_node and node->sched.next_max_node for every
1192 * input node on the ready list. We should only need to do this once per
1193 * instruction, at the beginning, since we never add max nodes to the ready
1194 * list.
1195 */
1196
1197 static void sched_find_max_nodes(sched_ctx *ctx)
1198 {
1199 ctx->instr->alu_num_unscheduled_next_max = 0;
1200 ctx->instr->alu_num_slot_needed_by_max = 0;
1201
1202 list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
1203 if (!gpir_is_input_node(node))
1204 continue;
1205
1206 int min_end_move = gpir_get_min_end_as_move(node);
1207 node->sched.max_node = (min_end_move == ctx->instr->index);
1208 node->sched.next_max_node = (min_end_move == ctx->instr->index + 1);
1209 if (node->sched.next_max_node)
1210 node->sched.complex_allowed = can_use_complex(node);
1211
1212 if (node->sched.max_node)
1213 ctx->instr->alu_num_slot_needed_by_max++;
1214 if (node->sched.next_max_node)
1215 ctx->instr->alu_num_unscheduled_next_max++;
1216 }
1217 }
1218
1219 /* Verify the invariants described in gpir.h, as well as making sure the
1220 * counts are correct.
1221 */
1222 static void verify_max_nodes(sched_ctx *ctx)
1223 {
1224 int alu_num_slot_needed_by_max = 0;
1225 int alu_num_unscheduled_next_max = 0;
1226 int alu_num_slot_needed_by_store = 0;
1227 int alu_num_slot_needed_by_non_cplx_store = 0;
1228 int alu_max_allowed_next_max = 5;
1229
1230 list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
1231 if (!gpir_is_input_node(node))
1232 continue;
1233
1234 if (node->sched.max_node)
1235 alu_num_slot_needed_by_max++;
1236 if (node->sched.next_max_node)
1237 alu_num_unscheduled_next_max++;
1238 if (used_by_store(node, ctx->instr)) {
1239 alu_num_slot_needed_by_store++;
1240 if (node->sched.next_max_node && !node->sched.complex_allowed)
1241 alu_num_slot_needed_by_non_cplx_store++;
1242 }
1243 }
1244
1245 if (ctx->instr->slots[GPIR_INSTR_SLOT_MUL0] &&
1246 ctx->instr->slots[GPIR_INSTR_SLOT_MUL0]->op == gpir_op_complex1)
1247 alu_max_allowed_next_max = 4;
1248
1249 assert(ctx->instr->alu_num_slot_needed_by_max == alu_num_slot_needed_by_max);
1250 assert(ctx->instr->alu_num_unscheduled_next_max == alu_num_unscheduled_next_max);
1251 assert(ctx->instr->alu_max_allowed_next_max == alu_max_allowed_next_max);
1252 assert(ctx->instr->alu_num_slot_needed_by_store == alu_num_slot_needed_by_store);
1253 assert(ctx->instr->alu_num_slot_needed_by_non_cplx_store ==
1254 alu_num_slot_needed_by_non_cplx_store);
1255 assert(ctx->instr->alu_num_slot_free >= alu_num_slot_needed_by_store + alu_num_slot_needed_by_max + MAX2(alu_num_unscheduled_next_max - alu_max_allowed_next_max, 0));
1256 assert(ctx->instr->alu_non_cplx_slot_free >= alu_num_slot_needed_by_max + alu_num_slot_needed_by_non_cplx_store);
1257 }
1258
1259 static bool try_node(sched_ctx *ctx)
1260 {
1261 gpir_node *best_node = NULL;
1262 int best_score = INT_MIN;
1263
1264 /* Spilling will delete arbitrary nodes after the current one in the ready
1265 * list, which means that we always need to look up the next node in the
1266 * list at the end of each iteration. While list_for_each_entry() works for
1267 * this purpose, its sanity checking assumes that you don't want to modify
1268 * the list at all. We know better here, so we have to open-code
1269 * list_for_each_entry() without the check in order to not assert.
1270 */
1271 for (gpir_node *node = LIST_ENTRY(gpir_node, ctx->ready_list.next, list);
1272 &node->list != &ctx->ready_list;
1273 node = LIST_ENTRY(gpir_node, node->list.next, list)) {
1274 if (best_score != INT_MIN) {
1275 if (node->sched.dist < best_node->sched.dist)
1276 break;
1277 }
1278
1279 if (node->sched.ready) {
1280 ctx->total_spill_needed = 0;
1281 ctx->max_node_spill_needed = 0;
1282 int score = schedule_try_node(ctx, node, true);
1283 if (score == INT_MIN && !best_node &&
1284 ctx->total_spill_needed > 0 &&
1285 try_spill_nodes(ctx, node)) {
1286 score = schedule_try_node(ctx, node, true);
1287 }
1288
1289 /* schedule_first nodes must be scheduled if possible */
1290 if (gpir_op_infos[node->op].schedule_first && score != INT_MIN) {
1291 best_node = node;
1292 best_score = score;
1293 break;
1294 }
1295
1296 if (score > best_score) {
1297 best_score = score;
1298 best_node = node;
1299 }
1300 }
1301 }
1302
1303 if (best_node) {
1304 gpir_debug("scheduling %d (score = %d)%s\n", best_node->index,
1305 best_score, best_node->sched.max_node ? " (max)" : "");
1306 ASSERTED int score = schedule_try_node(ctx, best_node, false);
1307 assert(score != INT_MIN);
1308 return true;
1309 }
1310
1311 return false;
1312 }
1313
1314 static void place_move(sched_ctx *ctx, gpir_node *node)
1315 {
1316 gpir_node *move = create_move(ctx, node);
1317 gpir_node_foreach_succ_safe(move, dep) {
1318 gpir_node *succ = dep->succ;
1319 if (!succ->sched.instr ||
1320 ctx->instr->index < succ->sched.instr->index + gpir_get_min_dist(dep)) {
1321 gpir_node_replace_pred(dep, node);
1322 if (dep->type == GPIR_DEP_INPUT)
1323 gpir_node_replace_child(succ, move, node);
1324 }
1325 }
1326 ASSERTED int score = schedule_try_node(ctx, move, false);
1327 assert(score != INT_MIN);
1328 }
1329
1330 /* For next-max nodes, not every node can be offloaded to a move in the
1331 * complex slot. If we run out of non-complex slots, then such nodes cannot
1332 * have moves placed for them. There should always be sufficient
1333 * complex-capable nodes so that this isn't a problem.
1334 */
1335 static bool can_place_move(sched_ctx *ctx, gpir_node *node)
1336 {
1337 if (!node->sched.next_max_node)
1338 return true;
1339
1340 if (node->sched.complex_allowed)
1341 return true;
1342
1343 return ctx->instr->alu_non_cplx_slot_free > 0;
1344 }
1345
1346 static bool sched_move(sched_ctx *ctx)
1347 {
1348 list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
1349 if (node->sched.max_node) {
1350 /* For complex1 that is consumed by a postlog2, we cannot allow any
1351 * moves in between. Convert the postlog2 to a move and insert a new
1352 * postlog2, and try to schedule it again in try_node().
1353 */
1354 gpir_node *postlog2 = consuming_postlog2(node);
1355 if (postlog2) {
1356 postlog2->op = gpir_op_mov;
1357 create_postlog2(ctx, node);
1358 } else {
1359 place_move(ctx, node);
1360 }
1361 return true;
1362 }
1363 }
1364
1365 if (ctx->instr->alu_num_slot_needed_by_store > 0) {
1366 list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
1367 if (used_by_store(node, ctx->instr)) {
1368 place_move(ctx, node);
1369 /* If we have a store of a load, then we need to make sure that we
1370 * immediately schedule the dependent load, or create a move
1371 * instruction for it, like we would with a normal instruction.
1372 * The rest of the code isn't set up to handle load nodes in the
1373 * ready list -- see the comments in _schedule_try_node().
1374 */
1375 if (node->type == gpir_node_type_load) {
1376 if (!schedule_try_place_node(ctx, node, false)) {
1377 create_move(ctx, node);
1378 }
1379 }
1380 return true;
1381 }
1382 }
1383 }
1384
1385    /* complex1 is a bit of a special case, since it has a latency of 2 cycles.
1386 * Once it is fully ready, we need to group all its uses in the same
1387 * instruction, and then we need to avoid creating any moves in the next
1388 * cycle in order to get it scheduled. Failing to do any of these things
1389 * could result in a cycle penalty, or even worse, an infinite loop of
1390 * inserting moves. If it is a next-max node and ready, then it has a use
1391 * in the previous cycle. If it has a use in the current cycle as well,
1392 * then we want to insert a move node to make it ready in two cycles -- if
1393 * we don't, then there will be at least a one cycle penalty. Otherwise, it
1394 * will be ready next cycle, and we shouldn't insert a move node, or else
1395 * we'll also have a one cycle penalty.
1396 */
1397 if (ctx->instr->alu_num_slot_free > 0) {
1398 list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
1399 if (!can_place_move(ctx, node))
1400 continue;
1401
1402 if (node->sched.next_max_node && node->op == gpir_op_complex1 &&
1403 node->sched.ready) {
1404 bool skip = true;
1405 gpir_node_foreach_succ(node, dep) {
1406 if (dep->type != GPIR_DEP_INPUT)
1407 continue;
1408
1409 gpir_node *succ = dep->succ;
1410
1411 if (!succ->sched.instr ||
1412 succ->sched.instr->index != ctx->instr->index - 1) {
1413 skip = false;
1414 break;
1415 }
1416 }
1417
1418 if (skip)
1419 continue;
1420
1421 place_move(ctx, node);
1422 return true;
1423 }
1424 }
1425 }
1426
1427 /* Once we've made all the required moves, we're free to use any extra
1428 * slots to schedule more moves for next max nodes. Besides sometimes being
1429 * necessary, this can free up extra space in the next instruction. We walk
1430     * from back to front so that we first pick the nodes least likely to be
1431     * scheduled next -- an extra move would be unnecessary there. But make sure
1432     * not to re-handle the complex1 case already handled above.
1433 */
1434 if (ctx->instr->alu_num_slot_free > 0) {
1435 list_for_each_entry_rev(gpir_node, node, &ctx->ready_list, list) {
1436 if (!can_place_move(ctx, node))
1437 continue;
1438
1439 if (node->sched.next_max_node &&
1440 !(node->op == gpir_op_complex1 && node->sched.ready)) {
1441 place_move(ctx, node);
1442 return true;
1443 }
1444 }
1445 }
1446
1447 /* We may have skipped complex1 above, but if we run out of space, we still
1448 * need to insert the move.
1449 */
1450
1451 if (ctx->instr->alu_num_unscheduled_next_max >
1452 ctx->instr->alu_max_allowed_next_max) {
1453 list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
1454 if (!can_place_move(ctx, node))
1455 continue;
1456
1457 if (node->sched.next_max_node) {
1458 place_move(ctx, node);
1459 return true;
1460 }
1461 }
1462 }
1463
1464
1465 return false;
1466 }
1467
1468 static bool gpir_sched_instr_pass(sched_ctx *ctx)
1469 {
1470 if (try_node(ctx))
1471 return true;
1472
1473 if (sched_move(ctx))
1474 return true;
1475
1476 return false;
1477 }
1478
1479 static void schedule_print_pre_one_instr(sched_ctx *ctx)
1480 {
1481 if (!(lima_debug & LIMA_DEBUG_GP))
1482 return;
1483
1484 printf("instr %d for ready list:", ctx->instr->index);
1485 list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
1486 printf(" %d/%c (%d, %d, %s)", node->index, node->sched.ready ? 'r' : 'p',
1487 node->sched.dist, gpir_get_slots_required(node),
1488 node->sched.max_node ? "max" : (node->sched.next_max_node ? "next" : "none"));
1489 }
1490 printf("\nlive physregs: ");
1491 for (unsigned i = 0; i < 16; i++) {
1492 if (ctx->live_physregs & (0xfull << (4 * i))) {
1493 printf("$%d.", i);
1494 for (unsigned j = 0; j < 4; j++) {
1495 if (ctx->live_physregs & (1ull << (4 * i + j)))
1496 printf("%c", "xyzw"[j]);
1497 }
1498 printf(" ");
1499 }
1500 }
1501 printf("\n");
1502 }
1503
1504 static void schedule_print_post_one_instr(gpir_instr *instr)
1505 {
1506 if (!(lima_debug & LIMA_DEBUG_GP))
1507 return;
1508
1509 printf("post schedule instr");
1510 for (int i = 0; i < GPIR_INSTR_SLOT_NUM; i++) {
1511 if (instr->slots[i])
1512 printf(" %d/%d", i, instr->slots[i]->index);
1513 }
1514 printf("\n");
1515 }
1516
1517
1518 static bool schedule_one_instr(sched_ctx *ctx)
1519 {
1520 gpir_instr *instr = gpir_instr_create(ctx->block);
1521 if (unlikely(!instr))
1522 return false;
1523
1524 ctx->instr = instr;
1525
1526 sched_find_max_nodes(ctx);
1527 schedule_print_pre_one_instr(ctx);
1528
1529 while (gpir_sched_instr_pass(ctx)) {
1530 assert(ctx->ready_list_slots == gpir_get_curr_ready_list_slots(ctx));
1531 #ifndef NDEBUG
1532 verify_max_nodes(ctx);
1533 verify_ready_list(ctx);
1534 #endif
1535 }
1536
1537 schedule_print_post_one_instr(instr);
1538 return true;
1539 }
1540
1541 static bool schedule_block(gpir_block *block)
1542 {
1543 /* calculate distance */
1544 list_for_each_entry(gpir_node, node, &block->node_list, list) {
1545 if (gpir_node_is_root(node))
1546 schedule_update_distance(node);
1547 }
1548
1549 sched_ctx ctx;
1550 list_inithead(&ctx.ready_list);
1551 ctx.block = block;
1552 ctx.ready_list_slots = 0;
1553 /* TODO initialize with block live out once we have proper liveness
1554 * tracking
1555 */
1556 ctx.live_physregs = 0;
1557
1558 /* construct the ready list from root nodes */
1559 list_for_each_entry_safe(gpir_node, node, &block->node_list, list) {
1560 if (gpir_node_is_root(node))
1561 schedule_insert_ready_list(&ctx, node);
1562 }
1563
1564 list_inithead(&block->node_list);
1565 while (!list_empty(&ctx.ready_list)) {
1566 if (!schedule_one_instr(&ctx))
1567 return false;
1568 }
1569
1570 return true;
1571 }
1572
1573 static void schedule_build_dependency(gpir_block *block)
1574 {
1575 gpir_node *last_written[GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM] = {0};
1576
1577    /* Merge dummy_f/dummy_m nodes back into the node they were created from. */
1578 list_for_each_entry_safe(gpir_node, node, &block->node_list, list) {
1579 if (node->op == gpir_op_dummy_m) {
1580 gpir_alu_node *alu = gpir_node_to_alu(node);
1581 gpir_node *origin = alu->children[0];
1582 gpir_node *dummy_f = alu->children[1];
1583
1584 gpir_node_foreach_succ(node, dep) {
1585 gpir_node *succ = dep->succ;
1586             /* origin and node may have the same succ (by VREG/INPUT or
1587 * VREG/VREG dep), so use gpir_node_add_dep() instead of
1588 * gpir_node_replace_pred() */
1589 gpir_node_add_dep(succ, origin, dep->type);
1590 gpir_node_replace_child(succ, node, origin);
1591 }
1592 gpir_node_delete(dummy_f);
1593 gpir_node_delete(node);
1594 }
1595 }
1596
1597 /* Forward dependencies. We only need to add these for register loads,
1598 * since value registers already have an input dependency.
1599 */
1600 list_for_each_entry(gpir_node, node, &block->node_list, list) {
1601 if (node->op == gpir_op_load_reg) {
1602 gpir_load_node *load = gpir_node_to_load(node);
1603 unsigned index = 4 * load->index + load->component;
1604 if (last_written[index]) {
1605 gpir_node_add_dep(node, last_written[index], GPIR_DEP_READ_AFTER_WRITE);
1606 }
1607 }
1608
1609 if (node->value_reg >= 0)
1610 last_written[node->value_reg] = node;
1611 }
1612
1613 memset(last_written, 0, sizeof(last_written));
1614
1615 /* False dependencies. For value registers, these exist only to make sure
1616 * that the maximum pressure isn't exceeded and are hence "fake".
1617 */
1618 list_for_each_entry_rev(gpir_node, node, &block->node_list, list) {
1619 if (node->op == gpir_op_load_reg) {
1620 gpir_load_node *load = gpir_node_to_load(node);
1621 unsigned index = 4 * load->index + load->component;
1622 if (last_written[index]) {
1623 gpir_node_add_dep(last_written[index], node, GPIR_DEP_WRITE_AFTER_READ);
1624 }
1625 } else {
1626 gpir_node_foreach_pred(node, dep) {
1627 if (dep->type == GPIR_DEP_INPUT) {
1628 int index = dep->pred->value_reg;
1629 if (index >= 0 && last_written[index]) {
1630 gpir_node_add_dep(last_written[index], node,
1631 GPIR_DEP_WRITE_AFTER_READ);
1632 }
1633 }
1634 }
1635 }
1636
1637 if (node->value_reg >= 0)
1638 last_written[node->value_reg] = node;
1639 }
1640 }
1641
1642 static void print_statistic(gpir_compiler *comp, int save_index)
1643 {
1644 int num_nodes[gpir_op_num] = {0};
1645 int num_created_nodes[gpir_op_num] = {0};
1646
1647 list_for_each_entry(gpir_block, block, &comp->block_list, list) {
1648 list_for_each_entry(gpir_node, node, &block->node_list, list) {
1649 num_nodes[node->op]++;
1650 if (node->index >= save_index)
1651 num_created_nodes[node->op]++;
1652 }
1653 }
1654
1655 printf("====== gpir scheduler statistic ======\n");
1656 printf("---- how many nodes are scheduled ----\n");
1657 int n = 0, l = 0;
1658 for (int i = 0; i < gpir_op_num; i++) {
1659 if (num_nodes[i]) {
1660 printf("%10s:%-6d", gpir_op_infos[i].name, num_nodes[i]);
1661 n += num_nodes[i];
1662 if (!(++l % 4))
1663 printf("\n");
1664 }
1665 }
1666 if (l % 4)
1667 printf("\n");
1668 printf("\ntotal: %d\n", n);
1669
1670 printf("---- how many nodes are created ----\n");
1671 n = l = 0;
1672 for (int i = 0; i < gpir_op_num; i++) {
1673 if (num_created_nodes[i]) {
1674 printf("%10s:%-6d", gpir_op_infos[i].name, num_created_nodes[i]);
1675 n += num_created_nodes[i];
1676 if (!(++l % 4))
1677 printf("\n");
1678 }
1679 }
1680 if (l % 4)
1681 printf("\n");
1682 printf("\ntotal: %d\n", n);
1683 printf("------------------------------------\n");
1684 }
1685
1686 bool gpir_schedule_prog(gpir_compiler *comp)
1687 {
1688 int save_index = comp->cur_index;
1689
1690 /* init schedule info */
1691 int index = 0;
1692 list_for_each_entry(gpir_block, block, &comp->block_list, list) {
1693 block->sched.instr_index = 0;
1694 list_for_each_entry(gpir_node, node, &block->node_list, list) {
1695 node->sched.instr = NULL;
1696 node->sched.pos = -1;
1697 node->sched.index = index++;
1698 node->sched.dist = -1;
1699 /* TODO when we support multiple basic blocks, we need a way to keep
1700 * track of this for physregs allocated before the scheduler.
1701 */
1702 node->sched.physreg_store = NULL;
1703 node->sched.ready = false;
1704 node->sched.inserted = false;
1705 node->sched.complex_allowed = false;
1706 node->sched.max_node = false;
1707 node->sched.next_max_node = false;
1708 }
1709 }
1710
1711 /* build dependency */
1712 list_for_each_entry(gpir_block, block, &comp->block_list, list) {
1713 schedule_build_dependency(block);
1714 }
1715
1716 //gpir_debug("after scheduler build reg dependency\n");
1717 //gpir_node_print_prog_dep(comp);
1718
1719 list_for_each_entry(gpir_block, block, &comp->block_list, list) {
1720 if (!schedule_block(block)) {
1721 gpir_error("fail schedule block\n");
1722 return false;
1723 }
1724 }
1725
1726 if (lima_debug & LIMA_DEBUG_GP) {
1727 print_statistic(comp, save_index);
1728 gpir_instr_print_prog(comp);
1729 }
1730
1731 return true;
1732 }