vc4: Use the intrinsic's first_component for vattr VPM index.
/*
 * Copyright © 2010 Intel Corporation
 * Copyright © 2014-2015 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/**
 * @file vc4_qir_schedule.c
 *
 * The basic model of the list scheduler is to take a basic block, compute a
 * DAG of the dependencies from the bottom up, and make a list of the DAG
 * heads. Heuristically pick a DAG head and schedule (remove) it, then put
 * all the parents that are now DAG heads into the list of things to
 * schedule.
 *
 * The goal of scheduling here, before register allocation and conversion to
 * QPU instructions, is to reduce register pressure by reordering instructions
 * so that values are consumed soon after they are produced.
 */

#include "vc4_qir.h"

static bool debug;

struct schedule_node {
        struct list_head link;
        struct qinst *inst;

        struct schedule_node **children;
        uint32_t child_count;
        uint32_t child_array_size;
        uint32_t parent_count;

        /* Length of the longest (latency) chain from a DAG head to this
         * instruction.
         */
        uint32_t delay;

        /* Longest time + latency_between(parent, this) of any parent of this
         * node.
         */
        uint32_t unblocked_time;
};

struct schedule_state {
        /* List of struct schedule_node *. This starts out with all
         * instructions, and after dependency updates it's trimmed to be just
         * the DAG heads.
         */
        struct list_head worklist;

        uint32_t time;

        uint32_t *temp_writes;

        BITSET_WORD *temp_live;
};

/* When walking the instructions in reverse, we need to swap before/after in
 * add_dep().
 */
enum direction { F, R };

/**
 * Marks a dependency between two instructions, that \p after must appear
 * after \p before.
 *
 * Our dependencies are tracked as a DAG. Since we're scheduling bottom-up,
 * the latest instructions with nothing left to schedule are the DAG heads,
 * and their inputs are their children.
 */
static void
add_dep(enum direction dir,
        struct schedule_node *before,
        struct schedule_node *after)
{
        if (!before || !after)
                return;

        assert(before != after);

        if (dir == R) {
                struct schedule_node *t = before;
                before = after;
                after = t;
        }

        /* Don't record the same edge twice. */
        for (int i = 0; i < after->child_count; i++) {
                if (after->children[i] == before)
                        return;
        }

        if (after->child_array_size <= after->child_count) {
                after->child_array_size = MAX2(after->child_array_size * 2, 16);
                after->children = reralloc(after, after->children,
                                           struct schedule_node *,
                                           after->child_array_size);
        }

        after->children[after->child_count] = before;
        after->child_count++;
        before->parent_count++;
}

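/**
 * Records a new write to a shared resource: orders it after the previous
 * write tracked in \p before, then makes \p after the new "last write".
 */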
static void
add_write_dep(enum direction dir,
              struct schedule_node **before,
              struct schedule_node *after)
{
        add_dep(dir, *before, after);
        *before = after;
}

struct schedule_setup_state {
        struct schedule_node **last_temp_write;
        struct schedule_node *last_sf;
        struct schedule_node *last_vary_read;
        struct schedule_node *last_vpm_read;
        struct schedule_node *last_vpm_write;
        struct schedule_node *last_tex_coord;
        struct schedule_node *last_tex_result;
        struct schedule_node *last_tlb;
        struct schedule_node *last_uniforms_reset;
        enum direction dir;

        /**
         * Texture FIFO tracking. This is done top-to-bottom, and is used to
         * track the QOP_TEX_RESULTs and add dependencies on previous ones
         * when trying to submit texture coords with TFREQ full or new
         * texture fetches with TFRCV full.
         */
        struct {
                struct schedule_node *node;
                int coords;
        } tex_fifo[8];
        int tfreq_count; /**< Number of texture coords outstanding. */
        int tfrcv_count; /**< Number of texture results outstanding. */
        int tex_fifo_pos;
};

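/**
 * Pops the oldest outstanding texture fetch off the FIFO tracking and makes
 * \p n depend on its QOP_TEX_RESULT, freeing up TFREQ/TFRCV space.
 */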
static void
block_until_tex_result(struct schedule_setup_state *state, struct schedule_node *n)
{
        add_dep(state->dir, state->tex_fifo[0].node, n);

        state->tfreq_count -= state->tex_fifo[0].coords;
        state->tfrcv_count--;

        memmove(&state->tex_fifo[0],
                &state->tex_fifo[1],
                state->tex_fifo_pos * sizeof(state->tex_fifo[0]));
        state->tex_fifo_pos--;
}

/**
 * Common code for dependencies that need to be tracked both forward and
 * backward.
 *
 * This is for things like "all VPM reads have to happen in order."
 */
static void
calculate_deps(struct schedule_setup_state *state, struct schedule_node *n)
{
        struct qinst *inst = n->inst;
        enum direction dir = state->dir;

        /* Add deps for temp register and varying accesses. Note that we
         * ignore uniform accesses, because qir_reorder_uniforms() happens
         * after this.
         */
        for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
                switch (inst->src[i].file) {
                case QFILE_TEMP:
                        add_dep(dir,
                                state->last_temp_write[inst->src[i].index], n);
                        break;

                case QFILE_VARY:
                        add_write_dep(dir, &state->last_vary_read, n);
                        break;

                case QFILE_VPM:
                        add_write_dep(dir, &state->last_vpm_read, n);
                        break;

                default:
                        break;
                }
        }

        switch (inst->op) {
        case QOP_VARY_ADD_C:
                add_dep(dir, state->last_vary_read, n);
                break;

        case QOP_TEX_S:
        case QOP_TEX_T:
        case QOP_TEX_R:
        case QOP_TEX_B:
        case QOP_TEX_DIRECT:
                /* Texturing setup gets scheduled in order, because
                 * the uniforms referenced by them have to land in a
                 * specific order.
                 */
                add_write_dep(dir, &state->last_tex_coord, n);
                break;

        case QOP_TEX_RESULT:
                /* Results have to be fetched in order. */
                add_write_dep(dir, &state->last_tex_result, n);
                break;

        case QOP_TLB_COLOR_READ:
        case QOP_MS_MASK:
                add_write_dep(dir, &state->last_tlb, n);
                break;

        default:
                break;
        }

        switch (inst->dst.file) {
        case QFILE_VPM:
                add_write_dep(dir, &state->last_vpm_write, n);
                break;

        case QFILE_TEMP:
                add_write_dep(dir, &state->last_temp_write[inst->dst.index], n);
                break;

        case QFILE_TLB_COLOR_WRITE:
        case QFILE_TLB_COLOR_WRITE_MS:
        case QFILE_TLB_Z_WRITE:
        case QFILE_TLB_STENCIL_SETUP:
                add_write_dep(dir, &state->last_tlb, n);
                break;

        default:
                break;
        }

        if (qir_depends_on_flags(inst))
                add_dep(dir, state->last_sf, n);

        if (inst->sf)
                add_write_dep(dir, &state->last_sf, n);
}

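/**
 * Walks the block top-to-bottom, adding the shared dependencies plus the
 * ones that only make sense in program order: uniform stream ordering and
 * the texture FIFO limits (8 outstanding coordinate writes, 4 outstanding
 * results).
 */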
static void
calculate_forward_deps(struct vc4_compile *c, void *mem_ctx,
                       struct list_head *schedule_list)
{
        struct schedule_setup_state state;

        memset(&state, 0, sizeof(state));
        state.last_temp_write = rzalloc_array(mem_ctx, struct schedule_node *,
                                              c->num_temps);
        state.dir = F;

        list_for_each_entry(struct schedule_node, n, schedule_list, link) {
                struct qinst *inst = n->inst;

                calculate_deps(&state, n);

                for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
                        switch (inst->src[i].file) {
                        case QFILE_UNIF:
                                add_dep(state.dir, state.last_uniforms_reset, n);
                                break;
                        default:
                                break;
                        }
                }

                switch (inst->op) {
                case QOP_TEX_S:
                case QOP_TEX_T:
                case QOP_TEX_R:
                case QOP_TEX_B:
                case QOP_TEX_DIRECT:
                        /* If the texture coordinate fifo is full,
                         * block this on the last QOP_TEX_RESULT.
                         */
                        if (state.tfreq_count == 8) {
                                block_until_tex_result(&state, n);
                        }

                        /* If the texture result fifo is full, block
                         * adding any more to it until the last
                         * QOP_TEX_RESULT.
                         */
                        if (inst->op == QOP_TEX_S ||
                            inst->op == QOP_TEX_DIRECT) {
                                if (state.tfrcv_count == 4)
                                        block_until_tex_result(&state, n);
                                state.tfrcv_count++;
                        }

                        state.tex_fifo[state.tex_fifo_pos].coords++;
                        state.tfreq_count++;
                        break;

                case QOP_TEX_RESULT:
                        /* Results have to be fetched after the
                         * coordinate setup. Note that we're assuming
                         * here that our input shader has the texture
                         * coord setup and result fetch in order,
                         * which is true initially but not of our
                         * instruction stream after this pass.
                         */
                        add_dep(state.dir, state.last_tex_coord, n);

                        state.tex_fifo[state.tex_fifo_pos].node = n;

                        state.tex_fifo_pos++;
                        memset(&state.tex_fifo[state.tex_fifo_pos], 0,
                               sizeof(state.tex_fifo[0]));
                        break;

                case QOP_UNIFORMS_RESET:
                        add_write_dep(state.dir, &state.last_uniforms_reset, n);
                        break;

                default:
                        assert(!qir_is_tex(inst));
                        break;
                }
        }
}

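/**
 * Walks the block bottom-to-top with dir == R, so that write-after-read
 * orderings (e.g. a VPM read staying ahead of the next VPM write) also end
 * up in the DAG.
 */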
static void
calculate_reverse_deps(struct vc4_compile *c, void *mem_ctx,
                       struct list_head *schedule_list)
{
        struct schedule_setup_state state;

        memset(&state, 0, sizeof(state));
        state.dir = R;
        state.last_temp_write = rzalloc_array(mem_ctx, struct schedule_node *,
                                              c->num_temps);

        list_for_each_entry_rev(struct schedule_node, n, schedule_list, link) {
                calculate_deps(&state, n);
        }
}

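/**
 * Returns the estimated change in live temps if \p inst were scheduled now:
 * -1 for retiring the last remaining write of its destination, +1 for each
 * source temp that isn't live yet (since we schedule bottom-up, "not live"
 * means no already-scheduled instruction reads it).
 */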
static int
get_register_pressure_cost(struct schedule_state *state, struct qinst *inst)
{
        int cost = 0;

        if (inst->dst.file == QFILE_TEMP &&
            state->temp_writes[inst->dst.index] == 1)
                cost--;

        for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
                if (inst->src[i].file == QFILE_TEMP &&
                    !BITSET_TEST(state->temp_live, inst->src[i].index)) {
                        cost++;
                }
        }

        return cost;
}

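/* Whether this instruction accesses the TLB, which requires holding the
 * per-tile scoreboard lock on the QPU.
 */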
static bool
locks_scoreboard(struct qinst *inst)
{
        if (inst->op == QOP_TLB_COLOR_READ)
                return true;

        switch (inst->dst.file) {
        case QFILE_TLB_Z_WRITE:
        case QFILE_TLB_COLOR_WRITE:
        case QFILE_TLB_COLOR_WRITE_MS:
                return true;
        default:
                return false;
        }
}

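/**
 * Picks the next node to schedule from the worklist of DAG heads, in
 * decreasing priority: branches (so they stay at the end of the block),
 * scoreboard-locking instructions, least stalling on unblocked_time, best
 * register pressure cost, and finally deepest delay chain.
 */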
static struct schedule_node *
choose_instruction(struct schedule_state *state)
{
        struct schedule_node *chosen = NULL;

        list_for_each_entry(struct schedule_node, n, &state->worklist, link) {
                /* The branches aren't being tracked as dependencies. Make
                 * sure that they stay scheduled as the last instruction of
                 * the block, which is to say the first one we choose to
                 * schedule.
                 */
                if (n->inst->op == QOP_BRANCH)
                        return n;

                if (!chosen) {
                        chosen = n;
                        continue;
                }

                /* Prefer scheduling things that lock the scoreboard, so that
                 * they appear late in the program and we get more parallelism
                 * between shaders on multiple QPUs hitting the same fragment.
                 */
                if (locks_scoreboard(n->inst) &&
                    !locks_scoreboard(chosen->inst)) {
                        chosen = n;
                        continue;
                } else if (!locks_scoreboard(n->inst) &&
                           locks_scoreboard(chosen->inst)) {
                        continue;
                }

                /* If we would block on the previously chosen node, but would
                 * block less on this one, then prefer it.
                 */
                if (chosen->unblocked_time > state->time &&
                    n->unblocked_time < chosen->unblocked_time) {
                        chosen = n;
                        continue;
                } else if (n->unblocked_time > state->time &&
                           n->unblocked_time > chosen->unblocked_time) {
                        continue;
                }

                /* If we can definitely reduce register pressure, do so
                 * immediately.
                 */
                int register_pressure_cost =
                        get_register_pressure_cost(state, n->inst);
                int chosen_register_pressure_cost =
                        get_register_pressure_cost(state, chosen->inst);

                if (register_pressure_cost < chosen_register_pressure_cost) {
                        chosen = n;
                        continue;
                } else if (register_pressure_cost >
                           chosen_register_pressure_cost) {
                        continue;
                }

                /* Otherwise, prefer instructions with the deepest chain to
                 * the end of the program. This avoids the problem of
                 * "everything generates a temp, nothing finishes freeing one,
                 * guess I'll just keep emitting varying mul/adds".
                 */
                if (n->delay > chosen->delay) {
                        chosen = n;
                        continue;
                } else if (n->delay < chosen->delay) {
                        continue;
                }
        }

        return chosen;
}

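/* Debug dump of the current worklist, showing each candidate's register
 * pressure cost and its children's remaining parent counts.
 */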
static void
dump_state(struct vc4_compile *c, struct schedule_state *state)
{
        uint32_t i = 0;
        list_for_each_entry(struct schedule_node, n, &state->worklist, link) {
                fprintf(stderr, "%3d: ", i++);
                qir_dump_inst(c, n->inst);
                fprintf(stderr, " (%d cost)\n",
                        get_register_pressure_cost(state, n->inst));

                for (int i = 0; i < n->child_count; i++) {
                        struct schedule_node *child = n->children[i];
                        fprintf(stderr, " - ");
                        qir_dump_inst(c, child->inst);
                        fprintf(stderr, " (%d parents)\n", child->parent_count);
                }
        }
}

/* Estimate of how many instructions we should schedule between operations.
 *
 * These aren't in real cycle counts, because we're just estimating cycle
 * times anyway. QIR instructions will get paired up when turned into QPU
 * instructions, or extra NOP delays will have to be added due to register
 * allocation choices.
 */
static uint32_t
latency_between(struct schedule_node *before, struct schedule_node *after)
{
        if ((before->inst->op == QOP_TEX_S ||
             before->inst->op == QOP_TEX_DIRECT) &&
            after->inst->op == QOP_TEX_RESULT)
                return 100;

        return 1;
}

/** Recursive computation of the delay member of a node. */
static void
compute_delay(struct schedule_node *n)
{
        if (!n->child_count) {
                /* The color read needs to be scheduled late, to avoid locking
                 * the scoreboard early. This is our best tool for
                 * encouraging that. The other scoreboard locking ops will
                 * have this happen by default, since they are generally the
                 * DAG heads or close to them.
                 */
                if (n->inst->op == QOP_TLB_COLOR_READ)
                        n->delay = 1000;
                else
                        n->delay = 1;
        } else {
                for (int i = 0; i < n->child_count; i++) {
                        if (!n->children[i]->delay)
                                compute_delay(n->children[i]);
                        n->delay = MAX2(n->delay,
                                        n->children[i]->delay +
                                        latency_between(n, n->children[i]));
                }
        }
}

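/* Bottom-up list scheduling of a single block. We repeatedly pick from the
 * DAG heads and prepend the chosen instruction to block->instructions, so
 * the block comes out in forward order when we're done.
 */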
static void
schedule_instructions(struct vc4_compile *c,
                      struct qblock *block, struct schedule_state *state)
{
        if (debug) {
                fprintf(stderr, "initial deps:\n");
                dump_state(c, state);
        }

        /* Remove non-DAG heads from the list. */
        list_for_each_entry_safe(struct schedule_node, n,
                                 &state->worklist, link) {
                if (n->parent_count != 0)
                        list_del(&n->link);
        }

        state->time = 0;
        while (!list_empty(&state->worklist)) {
                struct schedule_node *chosen = choose_instruction(state);
                struct qinst *inst = chosen->inst;

                if (debug) {
                        fprintf(stderr, "current list:\n");
                        dump_state(c, state);
                        fprintf(stderr, "chose: ");
                        qir_dump_inst(c, inst);
                        fprintf(stderr, " (%d cost)\n",
                                get_register_pressure_cost(state, inst));
                }

                state->time = MAX2(state->time, chosen->unblocked_time);

                /* Schedule this instruction back onto the QIR list. */
                list_del(&chosen->link);
                list_add(&inst->link, &block->instructions);

                /* Now that we've scheduled a new instruction, some of its
                 * children can be promoted to the list of instructions ready
                 * to be scheduled. Update the children's unblocked time for
                 * this DAG edge as we do so.
                 */
                for (int i = chosen->child_count - 1; i >= 0; i--) {
                        struct schedule_node *child = chosen->children[i];

                        child->unblocked_time = MAX2(child->unblocked_time,
                                                     state->time +
                                                     latency_between(chosen,
                                                                     child));
                        child->parent_count--;
                        if (child->parent_count == 0)
                                list_add(&child->link, &state->worklist);
                }

                /* Update our tracking of register pressure. */
                for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
                        if (inst->src[i].file == QFILE_TEMP)
                                BITSET_SET(state->temp_live, inst->src[i].index);
                }
                if (inst->dst.file == QFILE_TEMP) {
                        state->temp_writes[inst->dst.index]--;
                        if (state->temp_writes[inst->dst.index] == 0)
                                BITSET_CLEAR(state->temp_live, inst->dst.index);
                }

                state->time++;
        }
}

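/* Builds the dependency DAG for one block, computes per-node delays, and
 * then hands the worklist to the scheduler.
 */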
static void
qir_schedule_instructions_block(struct vc4_compile *c,
                                struct qblock *block)
{
        void *mem_ctx = ralloc_context(NULL);
        struct schedule_state state = { { 0 } };

        state.temp_writes = rzalloc_array(mem_ctx, uint32_t, c->num_temps);
        state.temp_live = rzalloc_array(mem_ctx, BITSET_WORD,
                                        BITSET_WORDS(c->num_temps));
        list_inithead(&state.worklist);

        /* Wrap each instruction in a scheduler structure. */
        qir_for_each_inst_safe(inst, block) {
                struct schedule_node *n = rzalloc(mem_ctx, struct schedule_node);

                n->inst = inst;
                list_del(&inst->link);
                list_addtail(&n->link, &state.worklist);

                if (inst->dst.file == QFILE_TEMP)
                        state.temp_writes[inst->dst.index]++;
        }

        /* Dependencies tracked top-to-bottom. */
        calculate_forward_deps(c, mem_ctx, &state.worklist);
        /* Dependencies tracked bottom-to-top. */
        calculate_reverse_deps(c, mem_ctx, &state.worklist);

        list_for_each_entry(struct schedule_node, n, &state.worklist, link)
                compute_delay(n);

        schedule_instructions(c, block, &state);

        ralloc_free(mem_ctx);
}

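/* Pass entry point: schedules each block of the program independently. */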
void
qir_schedule_instructions(struct vc4_compile *c)
{
        if (debug) {
                fprintf(stderr, "Pre-schedule instructions\n");
                qir_dump(c);
        }

        qir_for_each_block(block, c)
                qir_schedule_instructions_block(c, block);

        if (debug) {
                fprintf(stderr, "Post-schedule instructions\n");
                qir_dump(c);
        }
}