intel/fs/gen12: Introduce software scoreboard lowering pass.
[mesa.git] / src / intel / compiler / brw_fs_scoreboard.cpp
1 /*
2 * Copyright © 2019 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs_scoreboard.cpp
25 *
26 * Gen12+ hardware lacks the register scoreboard logic that used to guarantee
27 * data coherency between register reads and writes in previous generations.
28 * This lowering pass runs after register allocation in order to make up for
29 * it.
30 *
31 * It works by performing global dataflow analysis in order to determine the
32 * set of potential dependencies of every instruction in the shader, and then
33 * inserts any required SWSB annotations and additional SYNC instructions in
34 * order to guarantee data coherency.
35 *
36 * WARNING - Access of the following (rarely used) ARF registers is not
37 * tracked here, and require the RegDist SWSB annotation to be set
38 * to 1 by the generator in order to avoid data races:
39 *
40 * - sp stack pointer
41 * - sr0 state register
42 * - cr0 control register
43 * - ip instruction pointer
44 * - tm0 timestamp register
45 * - dbg0 debug register
46 *
47 * The following ARF registers don't need to be tracked here because data
48 * coherency is still provided transparently by the hardware:
49 *
50 * - f0-1 flag registers
51 * - n0 notification register
52 * - tdr0 thread dependency register
53 */
54
55 #include <tuple>
56 #include <vector>
57
58 #include "brw_fs.h"
59 #include "brw_cfg.h"
60
61 using namespace brw;
62
63 namespace {
64 /**
65 * In-order instruction accounting.
66 * @{
67 */
68
69 /**
70 * Number of in-order hardware instructions contained in this IR
71 * instruction. This determines the increment applied to the RegDist
72 * counter calculated for any ordered dependency that crosses this
73 * instruction.
74 */
75 unsigned
76 ordered_unit(const fs_inst *inst)
77 {
78 switch (inst->opcode) {
79 case BRW_OPCODE_SYNC:
80 case BRW_OPCODE_DO:
81 case SHADER_OPCODE_UNDEF:
82 case FS_OPCODE_PLACEHOLDER_HALT:
83 return 0;
84 default:
85 /* Note that the following is inaccurate for virtual instructions
86 * that expand to more in-order instructions than assumed here, but
87 * that can only lead to suboptimal execution ordering, data
88 * coherency won't be impacted. Providing exact RegDist counts for
89 * each virtual instruction would allow better ALU performance, but
90 * it would require keeping this switch statement in perfect sync
91 * with the generator in order to avoid data corruption. Lesson is
92 * (again) don't use virtual instructions if you want optimal
93 * scheduling.
94 */
95 return is_unordered(inst) ? 0 : 1;
96 }
97 }
98
   /**
    * Type for an instruction counter that increments for in-order
    * instructions only, arbitrarily denoted 'jp' throughout this lowering
    * pass in order to distinguish it from the regular instruction counter.
    *
    * Signed so that INT_MIN can serve as a "no known address" sentinel
    * (see dependency() and transport()).
    */
   typedef int ordered_address;
105
   /**
    * Calculate the local ordered_address instruction counter at every
    * instruction of the shader for subsequent constant-time look-up.
    *
    * The result is indexed by the linear instruction counter 'ip': jps[ip]
    * is the number of in-order hardware instructions preceding instruction
    * ip in the program.
    */
   std::vector<ordered_address>
   ordered_inst_addresses(const fs_visitor *shader)
   {
      std::vector<ordered_address> jps;
      ordered_address jp = 0;

      /* Record the counter *before* each instruction, then advance it by
       * the number of in-order hardware instructions this IR instruction
       * expands to (zero for unordered and pseudo instructions).
       */
      foreach_block_and_inst(block, fs_inst, inst, shader->cfg) {
         jps.push_back(jp);
         jp += ordered_unit(inst);
      }

      return jps;
   }
123
124 /**
125 * Synchronization mode required for data manipulated by in-order
126 * instructions.
127 *
128 * Similar to tgl_sbid_mode, but without SET mode. Defined as a separate
129 * enum for additional type safety. The hardware doesn't provide control
130 * over the synchronization mode for RegDist annotations, this is only used
131 * internally in this pass in order to optimize out redundant read
132 * dependencies where possible.
133 */
134 enum tgl_regdist_mode {
135 TGL_REGDIST_NULL = 0,
136 TGL_REGDIST_SRC = 1,
137 TGL_REGDIST_DST = 2
138 };
139
140 /**
141 * Allow bitwise arithmetic of tgl_regdist_mode enums.
142 */
143 tgl_regdist_mode
144 operator|(tgl_regdist_mode x, tgl_regdist_mode y)
145 {
146 return tgl_regdist_mode(unsigned(x) | unsigned(y));
147 }
148
149 tgl_regdist_mode
150 operator&(tgl_regdist_mode x, tgl_regdist_mode y)
151 {
152 return tgl_regdist_mode(unsigned(x) & unsigned(y));
153 }
154
155 tgl_regdist_mode &
156 operator|=(tgl_regdist_mode &x, tgl_regdist_mode y)
157 {
158 return x = x | y;
159 }
160
161 tgl_regdist_mode &
162 operator&=(tgl_regdist_mode &x, tgl_regdist_mode y)
163 {
164 return x = x & y;
165 }
166
167 /** @} */
168
169 /**
170 * Representation of an equivalence relation among the set of unsigned
171 * integers.
172 *
173 * Its initial state is the identity relation '~' such that i ~ j if and
174 * only if i == j for every pair of unsigned integers i and j.
175 */
176 struct equivalence_relation {
177 /**
178 * Return equivalence class index of the specified element. Effectively
179 * this is the numeric value of an arbitrary representative from the
180 * equivalence class.
181 *
182 * Allows the evaluation of the equivalence relation according to the
183 * rule that i ~ j if and only if lookup(i) == lookup(j).
184 */
185 unsigned
186 lookup(unsigned i) const
187 {
188 if (i < is.size() && is[i] != i)
189 return lookup(is[i]);
190 else
191 return i;
192 }
193
194 /**
195 * Create an array with the results of the lookup() method for
196 * constant-time evaluation.
197 */
198 std::vector<unsigned>
199 flatten() const {
200 std::vector<unsigned> ids;
201
202 for (const auto i : is)
203 ids.push_back(lookup(i));
204
205 return ids;
206 }
207
208 /**
209 * Mutate the existing equivalence relation minimally by imposing the
210 * additional requirement that i ~ j.
211 *
212 * The algorithm updates the internal representation recursively in
213 * order to guarantee transitivity while preserving the previously
214 * specified equivalence requirements.
215 */
216 unsigned
217 link(unsigned i, unsigned j)
218 {
219 const unsigned k = lookup(i);
220 assign(i, k);
221 assign(j, k);
222 return k;
223 }
224
225 private:
226 /**
227 * Assign the representative of \p from to be equivalent to \p to.
228 *
229 * At the same time the data structure is partially flattened as much as
230 * it's possible without increasing the number of recursive calls.
231 */
232 void
233 assign(unsigned from, unsigned to)
234 {
235 if (from != to) {
236 if (from < is.size() && is[from] != from)
237 assign(is[from], to);
238
239 for (unsigned i = is.size(); i <= from; i++)
240 is.push_back(i);
241
242 is[from] = to;
243 }
244 }
245
246 std::vector<unsigned> is;
247 };
248
249 /**
250 * Representation of a data dependency between two instructions in the
251 * program.
252 * @{
253 */
254 struct dependency {
255 /**
256 * No dependency information.
257 */
258 dependency() : ordered(TGL_REGDIST_NULL), jp(INT_MIN),
259 unordered(TGL_SBID_NULL), id(0) {}
260
261 /**
262 * Construct a dependency on the in-order instruction with the provided
263 * ordered_address instruction counter.
264 */
265 dependency(tgl_regdist_mode mode, ordered_address jp) :
266 ordered(mode), jp(jp), unordered(TGL_SBID_NULL), id(0) {}
267
268 /**
269 * Construct a dependency on the out-of-order instruction with the
270 * specified synchronization token.
271 */
272 dependency(tgl_sbid_mode mode, unsigned id) :
273 ordered(TGL_REGDIST_NULL), jp(INT_MIN), unordered(mode), id(id) {}
274
275 /**
276 * Synchronization mode of in-order dependency, or zero if no in-order
277 * dependency is present.
278 */
279 tgl_regdist_mode ordered;
280
281 /**
282 * Instruction counter of in-order dependency.
283 *
284 * For a dependency part of a different block in the program, this is
285 * relative to the specific control flow path taken between the
286 * dependency and the current block: It is the ordered_address such that
287 * the difference between it and the ordered_address of the first
288 * instruction of the current block is exactly the number of in-order
289 * instructions across that control flow path. It is not guaranteed to
290 * be equal to the local ordered_address of the generating instruction
291 * [as returned by ordered_inst_addresses()], except for block-local
292 * dependencies.
293 */
294 ordered_address jp;
295
296 /**
297 * Synchronization mode of unordered dependency, or zero if no unordered
298 * dependency is present.
299 */
300 tgl_sbid_mode unordered;
301
302 /** Synchronization token of out-of-order dependency. */
303 unsigned id;
304
305 /**
306 * Trivial in-order dependency that's always satisfied.
307 *
308 * Note that unlike a default-constructed dependency() which is also
309 * trivially satisfied, this is considered to provide dependency
310 * information and can be used to clear a previously pending dependency
311 * via shadow().
312 */
313 static const dependency done;
314
315 friend bool
316 operator==(const dependency &dep0, const dependency &dep1)
317 {
318 return dep0.ordered == dep1.ordered &&
319 dep0.jp == dep1.jp &&
320 dep0.unordered == dep1.unordered &&
321 dep0.id == dep1.id;
322 }
323
324 friend bool
325 operator!=(const dependency &dep0, const dependency &dep1)
326 {
327 return !(dep0 == dep1);
328 }
329 };
330
331 const dependency dependency::done = dependency(TGL_REGDIST_SRC, INT_MIN);
332
333 /**
334 * Return whether \p dep contains any dependency information.
335 */
336 bool
337 is_valid(const dependency &dep)
338 {
339 return dep.ordered || dep.unordered;
340 }
341
342 /**
343 * Combine \p dep0 and \p dep1 into a single dependency object that is only
344 * satisfied when both original dependencies are satisfied. This might
345 * involve updating the equivalence relation \p eq in order to make sure
346 * that both out-of-order dependencies are assigned the same hardware SBID
347 * as synchronization token.
348 */
349 dependency
350 merge(equivalence_relation &eq,
351 const dependency &dep0, const dependency &dep1)
352 {
353 dependency dep;
354
355 if (dep0.ordered || dep1.ordered) {
356 dep.ordered = dep0.ordered | dep1.ordered;
357 dep.jp = MAX2(dep0.jp, dep1.jp);
358 }
359
360 if (dep0.unordered || dep1.unordered) {
361 dep.unordered = dep0.unordered | dep1.unordered;
362 dep.id = eq.link(dep0.unordered ? dep0.id : dep1.id,
363 dep1.unordered ? dep1.id : dep0.id);
364 }
365
366 return dep;
367 }
368
369 /**
370 * Override dependency information of \p dep0 with that of \p dep1.
371 */
372 dependency
373 shadow(const dependency &dep0, const dependency &dep1)
374 {
375 return is_valid(dep1) ? dep1 : dep0;
376 }
377
378 /**
379 * Translate dependency information across the program.
380 *
381 * This returns a dependency on the same instruction translated to the
382 * ordered_address space of a different block. The correct shift for
383 * transporting a dependency across an edge of the CFG is the difference
384 * between the local ordered_address of the first instruction of the target
385 * block and the local ordered_address of the instruction immediately after
386 * the end of the origin block.
387 */
388 dependency
389 transport(dependency dep, int delta)
390 {
391 if (dep.ordered && dep.jp > INT_MIN)
392 dep.jp += delta;
393
394 return dep;
395 }
396
397 /**
398 * Return simplified dependency removing any synchronization modes not
399 * applicable to an instruction reading the same register location.
400 */
401 dependency
402 dependency_for_read(dependency dep)
403 {
404 dep.ordered &= TGL_REGDIST_DST;
405 return dep;
406 }
407
408 /**
409 * Return simplified dependency removing any synchronization modes not
410 * applicable to an instruction \p inst writing the same register location.
411 */
412 dependency
413 dependency_for_write(const fs_inst *inst, dependency dep)
414 {
415 if (!is_unordered(inst))
416 dep.ordered &= TGL_REGDIST_DST;
417 return dep;
418 }
419
420 /** @} */
421
   /**
    * Scoreboard representation.  This keeps track of the data dependencies of
    * registers with GRF granularity.
    */
   class scoreboard {
   public:
      /**
       * Look up the most current data dependency for register \p r.
       *
       * Returns a default (invalid) dependency for register locations this
       * pass doesn't track (see dep() below).
       */
      dependency
      get(const fs_reg &r) const
      {
         /* dep() is non-const since it hands back a mutable pointer into the
          * tracking arrays; the const_cast is safe because only reads are
          * performed through the result here.
          */
         if (const dependency *p = const_cast<scoreboard *>(this)->dep(r))
            return *p;
         else
            return dependency();
      }

      /**
       * Specify the most current data dependency for register \p r.
       *
       * Writes to untracked register locations are silently dropped.
       */
      void
      set(const fs_reg &r, const dependency &d)
      {
         if (dependency *p = dep(r))
            *p = d;
      }

      /**
       * Component-wise merge() of corresponding dependencies from two
       * scoreboard objects.  \sa merge().
       */
      friend scoreboard
      merge(equivalence_relation &eq,
            const scoreboard &sb0, const scoreboard &sb1)
      {
         scoreboard sb;

         for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
            sb.grf_deps[i] = merge(eq, sb0.grf_deps[i], sb1.grf_deps[i]);

         sb.addr_dep = merge(eq, sb0.addr_dep, sb1.addr_dep);

         for (unsigned i = 0; i < ARRAY_SIZE(sb.accum_deps); i++)
            sb.accum_deps[i] = merge(eq, sb0.accum_deps[i], sb1.accum_deps[i]);

         return sb;
      }

      /**
       * Component-wise shadow() of corresponding dependencies from two
       * scoreboard objects.  \sa shadow().
       */
      friend scoreboard
      shadow(const scoreboard &sb0, const scoreboard &sb1)
      {
         scoreboard sb;

         for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
            sb.grf_deps[i] = shadow(sb0.grf_deps[i], sb1.grf_deps[i]);

         sb.addr_dep = shadow(sb0.addr_dep, sb1.addr_dep);

         for (unsigned i = 0; i < ARRAY_SIZE(sb.accum_deps); i++)
            sb.accum_deps[i] = shadow(sb0.accum_deps[i], sb1.accum_deps[i]);

         return sb;
      }

      /**
       * Component-wise transport() of dependencies from a scoreboard
       * object.  \sa transport().
       */
      friend scoreboard
      transport(const scoreboard &sb0, int delta)
      {
         scoreboard sb;

         for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
            sb.grf_deps[i] = transport(sb0.grf_deps[i], delta);

         sb.addr_dep = transport(sb0.addr_dep, delta);

         for (unsigned i = 0; i < ARRAY_SIZE(sb.accum_deps); i++)
            sb.accum_deps[i] = transport(sb0.accum_deps[i], delta);

         return sb;
      }

      friend bool
      operator==(const scoreboard &sb0, const scoreboard &sb1)
      {
         for (unsigned i = 0; i < ARRAY_SIZE(sb0.grf_deps); i++) {
            if (sb0.grf_deps[i] != sb1.grf_deps[i])
               return false;
         }

         if (sb0.addr_dep != sb1.addr_dep)
            return false;

         for (unsigned i = 0; i < ARRAY_SIZE(sb0.accum_deps); i++) {
            if (sb0.accum_deps[i] != sb1.accum_deps[i])
               return false;
         }

         return true;
      }

      friend bool
      operator!=(const scoreboard &sb0, const scoreboard &sb1)
      {
         return !(sb0 == sb1);
      }

   private:
      /* Per-GRF dependencies; MRF registers are aliased into this array at
       * offset GEN7_MRF_HACK_START (see dep()).
       */
      dependency grf_deps[BRW_MAX_GRF];
      /* Dependency for the address register. */
      dependency addr_dep;
      /* Dependencies for the accumulator registers (ARF range between
       * BRW_ARF_ACCUMULATOR and BRW_ARF_FLAG).
       */
      dependency accum_deps[10];

      /**
       * Return a pointer to the tracking slot for register \p r, or NULL if
       * \p r belongs to a register file this pass doesn't track.
       */
      dependency *
      dep(const fs_reg &r)
      {
         /* VGRFs are addressed by virtual register number plus register-sized
          * offset; everything else by its physical register offset.
          */
         const unsigned reg = (r.file == VGRF ? r.nr + r.offset / REG_SIZE :
                               reg_offset(r) / REG_SIZE);

         return (r.file == VGRF || r.file == FIXED_GRF ? &grf_deps[reg] :
                 r.file == MRF ? &grf_deps[GEN7_MRF_HACK_START + reg] :
                 r.file == ARF && reg >= BRW_ARF_ADDRESS &&
                 reg < BRW_ARF_ACCUMULATOR ? &addr_dep :
                 r.file == ARF && reg >= BRW_ARF_ACCUMULATOR &&
                 reg < BRW_ARF_FLAG ? &accum_deps[
                    reg - BRW_ARF_ACCUMULATOR] :
                 NULL);
      }
   };
557
558 /**
559 * Dependency list handling.
560 * @{
561 */
562
563 /**
564 * Add dependency \p dep to the list of dependencies of an instruction
565 * \p deps.
566 */
567 void
568 add_dependency(const std::vector<unsigned> &ids,
569 std::vector<dependency> &deps, dependency dep)
570 {
571 if (is_valid(dep)) {
572 /* Translate the unordered dependency token first in order to keep
573 * the list minimally redundant.
574 */
575 if (dep.unordered && dep.id < ids.size())
576 dep.id = ids[dep.id];
577
578 /* Try to combine the specified dependency with any existing ones. */
579 for (auto &dep1 : deps) {
580 if (dep.ordered && dep1.ordered) {
581 dep1.jp = MAX2(dep1.jp, dep.jp);
582 dep1.ordered |= dep.ordered;
583 dep.ordered = TGL_REGDIST_NULL;
584 }
585
586 if (dep.unordered && dep1.unordered && dep1.id == dep.id) {
587 dep1.unordered |= dep.unordered;
588 dep.unordered = TGL_SBID_NULL;
589 }
590 }
591
592 /* Add it to the end of the list if necessary. */
593 if (is_valid(dep))
594 deps.push_back(dep);
595 }
596 }
597
598 /**
599 * Construct a tgl_swsb annotation encoding any ordered dependencies from
600 * the dependency list \p deps of an instruction with ordered_address
601 * \p jp.
602 */
603 tgl_swsb
604 ordered_dependency_swsb(const std::vector<dependency> &deps,
605 const ordered_address &jp)
606 {
607 unsigned min_dist = ~0u;
608
609 for (const auto &dep : deps) {
610 if (dep.ordered) {
611 const unsigned dist = jp - dep.jp;
612 const unsigned max_dist = 10;
613 assert(jp > dep.jp);
614 if (dist <= max_dist)
615 min_dist = MIN3(min_dist, dist, 7);
616 }
617 }
618
619 return { min_dist == ~0u ? 0 : min_dist };
620 }
621
622 /**
623 * Return whether the dependency list \p deps of an instruction with
624 * ordered_address \p jp has any non-trivial ordered dependencies.
625 */
626 bool
627 find_ordered_dependency(const std::vector<dependency> &deps,
628 const ordered_address &jp)
629 {
630 return ordered_dependency_swsb(deps, jp).regdist;
631 }
632
633 /**
634 * Return the full tgl_sbid_mode bitset for the first unordered dependency
635 * on the list \p deps that matches the specified tgl_sbid_mode, or zero if
636 * no such dependency is present.
637 */
638 tgl_sbid_mode
639 find_unordered_dependency(const std::vector<dependency> &deps,
640 tgl_sbid_mode unordered)
641 {
642 if (unordered) {
643 for (const auto &dep : deps) {
644 if (unordered & dep.unordered)
645 return dep.unordered;
646 }
647 }
648
649 return TGL_SBID_NULL;
650 }
651
652 /**
653 * Return the tgl_sbid_mode bitset of an unordered dependency from the list
654 * \p deps that can be represented directly in the SWSB annotation of the
655 * instruction without additional SYNC instructions, or zero if no such
656 * dependency is present.
657 */
658 tgl_sbid_mode
659 baked_unordered_dependency_mode(const fs_inst *inst,
660 const std::vector<dependency> &deps,
661 const ordered_address &jp)
662 {
663 const bool has_ordered = find_ordered_dependency(deps, jp);
664
665 if (find_unordered_dependency(deps, TGL_SBID_SET))
666 return find_unordered_dependency(deps, TGL_SBID_SET);
667 else if (has_ordered && is_unordered(inst))
668 return TGL_SBID_NULL;
669 else if (find_unordered_dependency(deps, TGL_SBID_DST) &&
670 (!has_ordered || !is_unordered(inst)))
671 return find_unordered_dependency(deps, TGL_SBID_DST);
672 else if (!has_ordered)
673 return find_unordered_dependency(deps, TGL_SBID_SRC);
674 else
675 return TGL_SBID_NULL;
676 }
677
678 /** @} */
679
680 /**
681 * Shader instruction dependency calculation.
682 * @{
683 */
684
   /**
    * Update scoreboard object \p sb to account for the execution of
    * instruction \p inst.
    *
    * \param jps Local ordered_address of every instruction, indexed by the
    *            linear instruction counter \p ip.
    */
   void
   update_inst_scoreboard(const fs_visitor *shader,
                          const std::vector<ordered_address> &jps,
                          const fs_inst *inst, unsigned ip, scoreboard &sb)
   {
      /* Track any source registers that may be fetched asynchronously by this
       * instruction, otherwise clear the dependency in order to avoid
       * subsequent redundant synchronization.
       */
      for (unsigned i = 0; i < inst->sources; i++) {
         /* Payload and math sources may be read asynchronously (tracked by
          * SBID token \p ip); other in-order reads are tracked by RegDist;
          * anything else resolves the pending dependency (dependency::done).
          */
         const dependency rd_dep =
            inst->is_payload(i) || inst->is_math() ? dependency(TGL_SBID_SRC, ip) :
            ordered_unit(inst) ? dependency(TGL_REGDIST_SRC, jps[ip]) :
            dependency::done;

         for (unsigned j = 0; j < regs_read(inst, i); j++)
            sb.set(byte_offset(inst->src[i], REG_SIZE * j), rd_dep);
      }

      /* Legacy SEND payloads delivered through the MRF file are read
       * asynchronously as well.
       */
      if (is_send(inst) && inst->base_mrf != -1) {
         const dependency rd_dep = dependency(TGL_SBID_SRC, ip);

         for (unsigned j = 0; j < inst->mlen; j++)
            sb.set(brw_uvec_mrf(8, inst->base_mrf + j, 0), rd_dep);
      }

      /* Track any destination registers of this instruction. */
      const dependency wr_dep =
         is_unordered(inst) ? dependency(TGL_SBID_DST, ip) :
         ordered_unit(inst) ? dependency(TGL_REGDIST_DST, jps[ip]) :
         dependency();

      if (is_valid(wr_dep) && inst->dst.file != BAD_FILE &&
          !inst->dst.is_null()) {
         for (unsigned j = 0; j < regs_written(inst); j++)
            sb.set(byte_offset(inst->dst, REG_SIZE * j), wr_dep);
      }
   }
727
   /**
    * Calculate scoreboard objects locally that represent any pending (and
    * unconditionally resolved) dependencies at the end of each block of the
    * program.
    *
    * \return One scoreboard per basic block, indexed by block number.
    */
   std::vector<scoreboard>
   gather_block_scoreboards(const fs_visitor *shader,
                            const std::vector<ordered_address> &jps)
   {
      std::vector<scoreboard> sbs(shader->cfg->num_blocks);
      /* Linear instruction counter, shared across all blocks, used as
       * virtual SBID token by update_inst_scoreboard().
       */
      unsigned ip = 0;

      foreach_block_and_inst(block, fs_inst, inst, shader->cfg)
         update_inst_scoreboard(shader, jps, inst, ip++, sbs[block->num]);

      return sbs;
   }
745
   /**
    * Propagate data dependencies globally through the control flow graph
    * until a fixed point is reached.
    *
    * Calculates the set of dependencies potentially pending at the beginning
    * of each block, and returns it as an array of scoreboard objects.  Also
    * returns the flattened SBID token equivalence relation accumulated while
    * merging scoreboards at control flow joins.
    */
   std::pair<std::vector<scoreboard>, std::vector<unsigned>>
   propagate_block_scoreboards(const fs_visitor *shader,
                               const std::vector<ordered_address> &jps)
   {
      const std::vector<scoreboard> delta_sbs =
         gather_block_scoreboards(shader, jps);
      std::vector<scoreboard> in_sbs(shader->cfg->num_blocks);
      std::vector<scoreboard> out_sbs(shader->cfg->num_blocks);
      equivalence_relation eq;

      /* Forward dataflow iteration: recompute each block's output scoreboard
       * from its input until no output changes anymore.
       */
      for (bool progress = true; progress;) {
         progress = false;

         foreach_block(block, shader->cfg) {
            /* Dependencies generated within the block shadow (override) any
             * dependency still pending on entry to it.
             */
            const scoreboard sb = shadow(in_sbs[block->num],
                                         delta_sbs[block->num]);

            if (sb != out_sbs[block->num]) {
               foreach_list_typed(bblock_link, child_link, link,
                                  &block->children) {
                  scoreboard &in_sb = in_sbs[child_link->block->num];
                  /* Shift the ordered_address counters into the successor
                   * block's frame of reference.  \sa transport().
                   */
                  const int delta =
                     jps[child_link->block->start_ip] - jps[block->end_ip]
                     - ordered_unit(static_cast<const fs_inst *>(block->end()));

                  in_sb = merge(eq, in_sb, transport(sb, delta));
               }

               out_sbs[block->num] = sb;
               progress = true;
            }
         }
      }

      return { std::move(in_sbs), eq.flatten() };
   }
789
   /**
    * Return the list of potential dependencies of each instruction in the
    * shader based on the result of global dependency analysis.
    *
    * The result is indexed by the linear instruction counter 'ip'.
    */
   std::vector<std::vector<dependency>>
   gather_inst_dependencies(const fs_visitor *shader,
                            const std::vector<ordered_address> &jps)
   {
      std::vector<scoreboard> sbs;
      std::vector<unsigned> ids;
      std::vector<std::vector<dependency>> deps;
      unsigned ip = 0;

      std::tie(sbs, ids) = propagate_block_scoreboards(shader, jps);

      foreach_block_and_inst(block, fs_inst, inst, shader->cfg) {
         /* Scoreboard of dependencies pending right before this instruction;
          * mutated in place as we walk the block.
          */
         scoreboard &sb = sbs[block->num];
         std::vector<dependency> inst_deps;

         /* Dependencies on every register location read by the instruction
          * (only write components are relevant, see dependency_for_read()).
          */
         for (unsigned i = 0; i < inst->sources; i++) {
            for (unsigned j = 0; j < regs_read(inst, i); j++)
               add_dependency(ids, inst_deps, dependency_for_read(
                  sb.get(byte_offset(inst->src[i], REG_SIZE * j))));
         }

         /* Legacy SEND payload read through the MRF file. */
         if (is_send(inst) && inst->base_mrf != -1) {
            for (unsigned j = 0; j < inst->mlen; j++)
               add_dependency(ids, inst_deps, dependency_for_read(
                  sb.get(brw_uvec_mrf(8, inst->base_mrf + j, 0))));
         }

         /* An out-of-order instruction gets a SET dependency on its own
          * token \p ip, which marks the token for allocation.
          */
         if (is_unordered(inst))
            add_dependency(ids, inst_deps, dependency(TGL_SBID_SET, ip));

         if (!inst->no_dd_check) {
            /* Dependencies on every register location overwritten by the
             * instruction (see dependency_for_write()).
             */
            if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
               for (unsigned j = 0; j < regs_written(inst); j++) {
                  add_dependency(ids, inst_deps, dependency_for_write(inst,
                     sb.get(byte_offset(inst->dst, REG_SIZE * j))));
               }
            }

            if (is_send(inst) && inst->base_mrf != -1) {
               for (int j = 0; j < shader->implied_mrf_writes(inst); j++)
                  add_dependency(ids, inst_deps, dependency_for_write(inst,
                     sb.get(brw_uvec_mrf(8, inst->base_mrf + j, 0))));
            }
         }

         deps.push_back(inst_deps);
         /* Account for this instruction's own reads and writes before moving
          * on to the next instruction.
          */
         update_inst_scoreboard(shader, jps, inst, ip, sb);
         ip++;
      }

      return deps;
   }
846
847 /** @} */
848
849 /**
850 * Allocate SBID tokens to track the execution of every out-of-order
851 * instruction of the shader.
852 */
853 std::vector<std::vector<dependency>>
854 allocate_inst_dependencies(const fs_visitor *shader,
855 const std::vector<std::vector<dependency>> &deps0)
856 {
857 /* XXX - Use bin-packing algorithm to assign hardware SBIDs optimally in
858 * shaders with a large number of SEND messages.
859 */
860 std::vector<std::vector<dependency>> deps1;
861 std::vector<unsigned> ids(deps0.size(), ~0u);
862 unsigned next_id = 0;
863
864 for (const auto &inst_deps0 : deps0) {
865 std::vector<dependency> inst_deps1;
866
867 for (const auto &dep : inst_deps0) {
868 if (dep.unordered && ids[dep.id] == ~0u)
869 ids[dep.id] = (next_id++) & 0xf;
870
871 add_dependency(ids, inst_deps1, dep);
872 }
873
874 deps1.push_back(inst_deps1);
875 }
876
877 return deps1;
878 }
879
   /**
    * Emit dependency information provided by \p deps into the shader,
    * inserting additional SYNC instructions for dependencies that can't be
    * represented directly by annotating existing instructions.
    */
   void
   emit_inst_dependencies(fs_visitor *shader,
                          const std::vector<ordered_address> &jps,
                          const std::vector<std::vector<dependency>> &deps)
   {
      unsigned ip = 0;

      foreach_block_and_inst_safe(block, fs_inst, inst, shader->cfg) {
         /* RegDist annotation covering every in-order dependency of the
          * instruction.
          */
         tgl_swsb swsb = ordered_dependency_swsb(deps[ip], jps[ip]);
         /* At most one out-of-order dependency can be baked into the
          * instruction's own SWSB field; any others need SYNC instructions.
          */
         const tgl_sbid_mode unordered_mode =
            baked_unordered_dependency_mode(inst, deps[ip], jps[ip]);

         for (const auto &dep : deps[ip]) {
            if (dep.unordered) {
               if (unordered_mode == dep.unordered && !swsb.mode) {
                  /* Bake unordered dependency into the instruction's SWSB if
                   * possible.
                   */
                  swsb.sbid = dep.id;
                  swsb.mode = dep.unordered;
               } else {
                  /* Emit dependency into the SWSB of an extra SYNC
                   * instruction.
                   */
                  const fs_builder ibld = fs_builder(shader, block, inst)
                                          .exec_all().group(1, 0);
                  fs_inst *sync = ibld.emit(BRW_OPCODE_SYNC, ibld.null_reg_ud(),
                                            brw_imm_ud(TGL_SYNC_NOP));
                  sync->sched.sbid = dep.id;
                  sync->sched.mode = dep.unordered;
                  /* SET mode belongs to the out-of-order instruction itself,
                   * never to a SYNC emitted in front of it.
                   */
                  assert(!(sync->sched.mode & TGL_SBID_SET));
               }
            }
         }

         /* Update the IR. */
         inst->sched = swsb;
         inst->no_dd_check = inst->no_dd_clear = false;
         ip++;
      }
   }
926 }
927
928 bool
929 fs_visitor::lower_scoreboard()
930 {
931 if (devinfo->gen >= 12) {
932 const std::vector<ordered_address> jps = ordered_inst_addresses(this);
933 emit_inst_dependencies(this, jps,
934 allocate_inst_dependencies(this,
935 gather_inst_dependencies(this, jps)));
936 }
937
938 return true;
939 }