intel/eu/validate/gen12: Fix validation of SYNC instruction.
[mesa.git] / src/intel/compiler/brw_fs_scoreboard.cpp
/*
 * Copyright © 2019 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs_scoreboard.cpp
 *
 * Gen12+ hardware lacks the register scoreboard logic that used to guarantee
 * data coherency between register reads and writes in previous generations.
 * This lowering pass runs after register allocation in order to make up for
 * it.
 *
 * It works by performing global dataflow analysis in order to determine the
 * set of potential dependencies of every instruction in the shader, and then
 * inserts any required SWSB annotations and additional SYNC instructions in
 * order to guarantee data coherency.
 *
 * WARNING - Accesses of the following (rarely used) ARF registers are not
 *           tracked here, and require the RegDist SWSB annotation to be set
 *           to 1 by the generator in order to avoid data races:
 *
 * - sp stack pointer
 * - sr0 state register
 * - cr0 control register
 * - ip instruction pointer
 * - tm0 timestamp register
 * - dbg0 debug register
 *
 * The following ARF registers don't need to be tracked here because data
 * coherency is still provided transparently by the hardware:
 *
 * - f0-1 flag registers
 * - n0 notification register
 * - tdr0 thread dependency register
 */
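
/* For illustration only (hypothetical Gen12 assembly, not verbatim output of
 * this pass): a RegDist annotation "@N" stalls an instruction until the
 * in-order instruction issued N in-order slots earlier has completed, while
 * "$n"/"$n.dst"/"$n.src" allocate and wait on the SBID token of an
 * out-of-order instruction:
 *
 *    add(16)   g10  g2   g4
 *    send(16)  g20  g12  ...   {$2}       <- allocates SBID token 2
 *    mul(16)   g30  g10  g6    {@1}       <- RAW on g10, 1 in-order inst back
 *    mov(16)   g40  g20        {$2.dst}   <- RAW on g20 written by the SEND
 *
 * The SEND doesn't advance the in-order instruction counter, which is why
 * the MUL's RegDist is 1 rather than 2.
 */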

#include "brw_fs.h"
#include "brw_cfg.h"

using namespace brw;

namespace {
   /**
    * In-order instruction accounting.
    * @{
    */

   /**
    * Number of in-order hardware instructions contained in this IR
    * instruction.  This determines the increment applied to the RegDist
    * counter calculated for any ordered dependency that crosses this
    * instruction.
    */
   unsigned
   ordered_unit(const fs_inst *inst)
   {
      switch (inst->opcode) {
      case BRW_OPCODE_SYNC:
      case BRW_OPCODE_DO:
      case SHADER_OPCODE_UNDEF:
      case FS_OPCODE_PLACEHOLDER_HALT:
         return 0;
      default:
         /* Note that the following is inaccurate for virtual instructions
          * that expand to more in-order instructions than assumed here, but
          * that can only lead to suboptimal execution ordering; data
          * coherency won't be impacted.  Providing exact RegDist counts for
          * each virtual instruction would allow better ALU performance, but
          * it would require keeping this switch statement in perfect sync
          * with the generator in order to avoid data corruption.  The lesson
          * is (again): don't use virtual instructions if you want optimal
          * scheduling.
          */
         return is_unordered(inst) ? 0 : 1;
      }
   }

   /**
    * Type for an instruction counter that increments for in-order
    * instructions only, arbitrarily denoted 'jp' throughout this lowering
    * pass in order to distinguish it from the regular instruction counter.
    */
   typedef int ordered_address;

   /**
    * Return the number of instructions in the program.
    */
   unsigned
   num_instructions(const backend_shader *shader)
   {
      return shader->cfg->blocks[shader->cfg->num_blocks - 1]->end_ip + 1;
   }

   /**
    * Calculate the local ordered_address instruction counter at every
    * instruction of the shader for subsequent constant-time look-up.
    */
   ordered_address *
   ordered_inst_addresses(const fs_visitor *shader)
   {
      ordered_address *jps = new ordered_address[num_instructions(shader)];
      ordered_address jp = 0;
      unsigned ip = 0;

      foreach_block_and_inst(block, fs_inst, inst, shader->cfg) {
         jps[ip] = jp;
         jp += ordered_unit(inst);
         ip++;
      }

      return jps;
   }
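
   /* As a worked example (instruction mnemonics illustrative only), an
    * out-of-order SEND between two ALU instructions doesn't advance 'jp':
    *
    *    ip  inst   ordered_unit  jps[ip]
    *    --  -----  ------------  -------
    *     0  add         1           0
    *     1  send        0           1
    *     2  mul         1           1
    *
    * so an ordered dependency of the MUL on the ADD would get RegDist
    * jps[2] - jps[0] = 1.
    */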

   /**
    * Synchronization mode required for data manipulated by in-order
    * instructions.
    *
    * Similar to tgl_sbid_mode, but without SET mode.  Defined as a separate
    * enum for additional type safety.  The hardware doesn't provide control
    * over the synchronization mode for RegDist annotations; this is only
    * used internally in this pass in order to optimize out redundant read
    * dependencies where possible.
    */
   enum tgl_regdist_mode {
      TGL_REGDIST_NULL = 0,
      TGL_REGDIST_SRC = 1,
      TGL_REGDIST_DST = 2
   };

   /**
    * Allow bitwise arithmetic of tgl_regdist_mode enums.
    */
   tgl_regdist_mode
   operator|(tgl_regdist_mode x, tgl_regdist_mode y)
   {
      return tgl_regdist_mode(unsigned(x) | unsigned(y));
   }

   tgl_regdist_mode
   operator&(tgl_regdist_mode x, tgl_regdist_mode y)
   {
      return tgl_regdist_mode(unsigned(x) & unsigned(y));
   }

   tgl_regdist_mode &
   operator|=(tgl_regdist_mode &x, tgl_regdist_mode y)
   {
      return x = x | y;
   }

   tgl_regdist_mode &
   operator&=(tgl_regdist_mode &x, tgl_regdist_mode y)
   {
      return x = x & y;
   }

   /** @} */

   /**
    * Representation of an equivalence relation among the set of unsigned
    * integers.
    *
    * Its initial state is the identity relation '~' such that i ~ j if and
    * only if i == j for every pair of unsigned integers i and j.
    */
   struct equivalence_relation {
      equivalence_relation(unsigned n) : is(new unsigned[n]), n(n)
      {
         for (unsigned i = 0; i < n; i++)
            is[i] = i;
      }

      ~equivalence_relation()
      {
         delete[] is;
      }

      /**
       * Return the equivalence class index of the specified element.
       * Effectively this is the numeric value of an arbitrary representative
       * from the equivalence class.
       *
       * Allows the evaluation of the equivalence relation according to the
       * rule that i ~ j if and only if lookup(i) == lookup(j).
       */
      unsigned
      lookup(unsigned i) const
      {
         if (i < n && is[i] != i)
            return lookup(is[i]);
         else
            return i;
      }

      /**
       * Create an array with the results of the lookup() method for
       * constant-time evaluation.
       */
      unsigned *
      flatten() const
      {
         unsigned *ids = new unsigned[n];

         for (unsigned i = 0; i < n; i++)
            ids[i] = lookup(i);

         return ids;
      }

      /**
       * Mutate the existing equivalence relation minimally by imposing the
       * additional requirement that i ~ j.
       *
       * The algorithm updates the internal representation recursively in
       * order to guarantee transitivity while preserving the previously
       * specified equivalence requirements.
       */
      unsigned
      link(unsigned i, unsigned j)
      {
         const unsigned k = lookup(i);
         assign(i, k);
         assign(j, k);
         return k;
      }

   private:
      equivalence_relation(const equivalence_relation &);

      equivalence_relation &
      operator=(const equivalence_relation &);

      /**
       * Assign the representative of \p from to be equivalent to \p to.
       *
       * At the same time the data structure is partially flattened as much
       * as possible without increasing the number of recursive calls.
       */
      void
      assign(unsigned from, unsigned to)
      {
         if (from != to) {
            assert(from < n);

            if (is[from] != from)
               assign(is[from], to);

            is[from] = to;
         }
      }

      unsigned *is;
      unsigned n;
   };
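
   /* A minimal usage sketch: this is effectively a union-find structure
    * with path compression (via assign()) but no union-by-rank.  After
    *
    *    equivalence_relation eq(4);
    *    eq.link(0, 1);
    *    eq.link(2, 3);
    *    eq.link(1, 2);
    *
    * lookup() returns the same representative for all four elements, so
    * 0 ~ 1 ~ 2 ~ 3 holds transitively.
    */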

   /**
    * Representation of a data dependency between two instructions in the
    * program.
    * @{
    */
   struct dependency {
      /**
       * No dependency information.
       */
      dependency() : ordered(TGL_REGDIST_NULL), jp(INT_MIN),
                     unordered(TGL_SBID_NULL), id(0) {}

      /**
       * Construct a dependency on the in-order instruction with the provided
       * ordered_address instruction counter.
       */
      dependency(tgl_regdist_mode mode, ordered_address jp) :
         ordered(mode), jp(jp), unordered(TGL_SBID_NULL), id(0) {}

      /**
       * Construct a dependency on the out-of-order instruction with the
       * specified synchronization token.
       */
      dependency(tgl_sbid_mode mode, unsigned id) :
         ordered(TGL_REGDIST_NULL), jp(INT_MIN), unordered(mode), id(id) {}

      /**
       * Synchronization mode of the in-order dependency, or zero if no
       * in-order dependency is present.
       */
      tgl_regdist_mode ordered;

      /**
       * Instruction counter of the in-order dependency.
       *
       * For a dependency that is part of a different block in the program,
       * this is relative to the specific control flow path taken between the
       * dependency and the current block: It is the ordered_address such
       * that the difference between it and the ordered_address of the first
       * instruction of the current block is exactly the number of in-order
       * instructions across that control flow path.  It is not guaranteed to
       * be equal to the local ordered_address of the generating instruction
       * [as returned by ordered_inst_addresses()], except for block-local
       * dependencies.
       */
      ordered_address jp;

      /**
       * Synchronization mode of the unordered dependency, or zero if no
       * unordered dependency is present.
       */
      tgl_sbid_mode unordered;

      /** Synchronization token of the out-of-order dependency. */
      unsigned id;

      /**
       * Trivial in-order dependency that's always satisfied.
       *
       * Note that unlike a default-constructed dependency() which is also
       * trivially satisfied, this is considered to provide dependency
       * information and can be used to clear a previously pending dependency
       * via shadow().
       */
      static const dependency done;

      friend bool
      operator==(const dependency &dep0, const dependency &dep1)
      {
         return dep0.ordered == dep1.ordered &&
                dep0.jp == dep1.jp &&
                dep0.unordered == dep1.unordered &&
                dep0.id == dep1.id;
      }

      friend bool
      operator!=(const dependency &dep0, const dependency &dep1)
      {
         return !(dep0 == dep1);
      }
   };

   const dependency dependency::done = dependency(TGL_REGDIST_SRC, INT_MIN);

   /**
    * Return whether \p dep contains any dependency information.
    */
   bool
   is_valid(const dependency &dep)
   {
      return dep.ordered || dep.unordered;
   }

   /**
    * Combine \p dep0 and \p dep1 into a single dependency object that is
    * only satisfied when both original dependencies are satisfied.  This
    * might involve updating the equivalence relation \p eq in order to make
    * sure that both out-of-order dependencies are assigned the same hardware
    * SBID as synchronization token.
    */
   dependency
   merge(equivalence_relation &eq,
         const dependency &dep0, const dependency &dep1)
   {
      dependency dep;

      if (dep0.ordered || dep1.ordered) {
         dep.ordered = dep0.ordered | dep1.ordered;
         dep.jp = MAX2(dep0.jp, dep1.jp);
      }

      if (dep0.unordered || dep1.unordered) {
         dep.unordered = dep0.unordered | dep1.unordered;
         dep.id = eq.link(dep0.unordered ? dep0.id : dep1.id,
                          dep1.unordered ? dep1.id : dep0.id);
      }

      return dep;
   }
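
   /* E.g. merging an in-order read dependency {SRC, jp=4} with an in-order
    * write dependency {DST, jp=7} (values hypothetical) yields
    * {SRC | DST, jp=7}: keeping the later of the two ordered addresses is
    * sufficient because in-order instructions complete in program order.
    * Merging two unordered dependencies with different IDs links their IDs
    * in \p eq, so SBID allocation later hands them the same hardware token.
    */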

   /**
    * Override dependency information of \p dep0 with that of \p dep1.
    */
   dependency
   shadow(const dependency &dep0, const dependency &dep1)
   {
      return is_valid(dep1) ? dep1 : dep0;
   }

   /**
    * Translate dependency information across the program.
    *
    * This returns a dependency on the same instruction translated to the
    * ordered_address space of a different block.  The correct shift for
    * transporting a dependency across an edge of the CFG is the difference
    * between the local ordered_address of the first instruction of the
    * target block and the local ordered_address of the instruction
    * immediately after the end of the origin block.
    */
   dependency
   transport(dependency dep, int delta)
   {
      if (dep.ordered && dep.jp > INT_MIN)
         dep.jp += delta;

      return dep;
   }

   /**
    * Return a simplified dependency with any synchronization modes removed
    * that aren't applicable to an instruction reading the same register
    * location.
    */
   dependency
   dependency_for_read(dependency dep)
   {
      dep.ordered &= TGL_REGDIST_DST;
      return dep;
   }

   /**
    * Return a simplified dependency with any synchronization modes removed
    * that aren't applicable to an instruction \p inst writing the same
    * register location.
    */
   dependency
   dependency_for_write(const fs_inst *inst, dependency dep)
   {
      if (!is_unordered(inst))
         dep.ordered &= TGL_REGDIST_DST;
      return dep;
   }
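
   /* Rationale for the two helpers above: a read of a location only
    * conflicts with a previous write (RAW), so SRC-mode RegDist annotations
    * tracking previous in-order reads can be dropped -- read-after-read
    * requires no synchronization.  A write conflicts with previous reads
    * (WAR) and writes (WAW), but when the writing instruction is itself
    * in-order the pipeline already executes it after any earlier in-order
    * instruction has fetched its sources, so only the DST mode needs a
    * RegDist; an out-of-order write (e.g. a SEND) gets no such guarantee
    * and keeps both modes.
    */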

   /** @} */

   /**
    * Scoreboard representation.  This keeps track of the data dependencies
    * of registers with GRF granularity.
    */
   class scoreboard {
   public:
      /**
       * Look up the most current data dependency for register \p r.
       */
      dependency
      get(const fs_reg &r) const
      {
         if (const dependency *p = const_cast<scoreboard *>(this)->dep(r))
            return *p;
         else
            return dependency();
      }

      /**
       * Specify the most current data dependency for register \p r.
       */
      void
      set(const fs_reg &r, const dependency &d)
      {
         if (dependency *p = dep(r))
            *p = d;
      }

      /**
       * Component-wise merge() of corresponding dependencies from two
       * scoreboard objects.  \sa merge().
       */
      friend scoreboard
      merge(equivalence_relation &eq,
            const scoreboard &sb0, const scoreboard &sb1)
      {
         scoreboard sb;

         for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
            sb.grf_deps[i] = merge(eq, sb0.grf_deps[i], sb1.grf_deps[i]);

         sb.addr_dep = merge(eq, sb0.addr_dep, sb1.addr_dep);

         for (unsigned i = 0; i < ARRAY_SIZE(sb.accum_deps); i++)
            sb.accum_deps[i] = merge(eq, sb0.accum_deps[i], sb1.accum_deps[i]);

         return sb;
      }

      /**
       * Component-wise shadow() of corresponding dependencies from two
       * scoreboard objects.  \sa shadow().
       */
      friend scoreboard
      shadow(const scoreboard &sb0, const scoreboard &sb1)
      {
         scoreboard sb;

         for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
            sb.grf_deps[i] = shadow(sb0.grf_deps[i], sb1.grf_deps[i]);

         sb.addr_dep = shadow(sb0.addr_dep, sb1.addr_dep);

         for (unsigned i = 0; i < ARRAY_SIZE(sb.accum_deps); i++)
            sb.accum_deps[i] = shadow(sb0.accum_deps[i], sb1.accum_deps[i]);

         return sb;
      }

      /**
       * Component-wise transport() of dependencies from a scoreboard
       * object.  \sa transport().
       */
      friend scoreboard
      transport(const scoreboard &sb0, int delta)
      {
         scoreboard sb;

         for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
            sb.grf_deps[i] = transport(sb0.grf_deps[i], delta);

         sb.addr_dep = transport(sb0.addr_dep, delta);

         for (unsigned i = 0; i < ARRAY_SIZE(sb.accum_deps); i++)
            sb.accum_deps[i] = transport(sb0.accum_deps[i], delta);

         return sb;
      }

      friend bool
      operator==(const scoreboard &sb0, const scoreboard &sb1)
      {
         for (unsigned i = 0; i < ARRAY_SIZE(sb0.grf_deps); i++) {
            if (sb0.grf_deps[i] != sb1.grf_deps[i])
               return false;
         }

         if (sb0.addr_dep != sb1.addr_dep)
            return false;

         for (unsigned i = 0; i < ARRAY_SIZE(sb0.accum_deps); i++) {
            if (sb0.accum_deps[i] != sb1.accum_deps[i])
               return false;
         }

         return true;
      }

      friend bool
      operator!=(const scoreboard &sb0, const scoreboard &sb1)
      {
         return !(sb0 == sb1);
      }

   private:
      dependency grf_deps[BRW_MAX_GRF];
      dependency addr_dep;
      dependency accum_deps[10];

      dependency *
      dep(const fs_reg &r)
      {
         const unsigned reg = (r.file == VGRF ? r.nr + r.offset / REG_SIZE :
                               reg_offset(r) / REG_SIZE);

         return (r.file == VGRF || r.file == FIXED_GRF ? &grf_deps[reg] :
                 r.file == MRF ? &grf_deps[GEN7_MRF_HACK_START + reg] :
                 r.file == ARF && reg >= BRW_ARF_ADDRESS &&
                                  reg < BRW_ARF_ACCUMULATOR ? &addr_dep :
                 r.file == ARF && reg >= BRW_ARF_ACCUMULATOR &&
                                  reg < BRW_ARF_FLAG ? &accum_deps[
                                     reg - BRW_ARF_ACCUMULATOR] :
                 NULL);
      }
   };
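
   /* The tracking granularity is one dependency slot per 32B GRF: e.g. a
    * SIMD16 float write to a VGRF spanning two registers records the same
    * write dependency in two consecutive grf_deps[] slots, while accesses of
    * the address and accumulator ARFs are tracked in dedicated slots.
    * Registers for which dep() returns NULL (immediates, the null register,
    * the flag ARFs) are simply not tracked, consistent with the file-level
    * comment above.
    */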

   /**
    * Dependency list handling.
    * @{
    */
   struct dependency_list {
      dependency_list() : deps(NULL), n(0) {}

      ~dependency_list()
      {
         free(deps);
      }

      void
      push_back(const dependency &dep)
      {
         deps = (dependency *)realloc(deps, (n + 1) * sizeof(*deps));
         deps[n++] = dep;
      }

      unsigned
      size() const
      {
         return n;
      }

      const dependency &
      operator[](unsigned i) const
      {
         assert(i < n);
         return deps[i];
      }

      dependency &
      operator[](unsigned i)
      {
         assert(i < n);
         return deps[i];
      }

   private:
      dependency_list(const dependency_list &);
      dependency_list &
      operator=(const dependency_list &);

      dependency *deps;
      unsigned n;
   };

   /**
    * Add dependency \p dep to the dependency list \p deps of an
    * instruction.
    */
   void
   add_dependency(const unsigned *ids, dependency_list &deps, dependency dep)
   {
      if (is_valid(dep)) {
         /* Translate the unordered dependency token first in order to keep
          * the list minimally redundant.
          */
         if (dep.unordered)
            dep.id = ids[dep.id];

         /* Try to combine the specified dependency with any existing ones. */
         for (unsigned i = 0; i < deps.size(); i++) {
            if (dep.ordered && deps[i].ordered) {
               deps[i].jp = MAX2(deps[i].jp, dep.jp);
               deps[i].ordered |= dep.ordered;
               dep.ordered = TGL_REGDIST_NULL;
            }

            if (dep.unordered && deps[i].unordered && deps[i].id == dep.id) {
               deps[i].unordered |= dep.unordered;
               dep.unordered = TGL_SBID_NULL;
            }
         }

         /* Add it to the end of the list if necessary. */
         if (is_valid(dep))
            deps.push_back(dep);
      }
   }

   /**
    * Construct a tgl_swsb annotation encoding any ordered dependencies from
    * the dependency list \p deps of an instruction with ordered_address
    * \p jp.
    */
   tgl_swsb
   ordered_dependency_swsb(const dependency_list &deps,
                           const ordered_address &jp)
   {
      unsigned min_dist = ~0u;

      for (unsigned i = 0; i < deps.size(); i++) {
         if (deps[i].ordered) {
            const unsigned dist = jp - deps[i].jp;
            const unsigned max_dist = 10;
            assert(jp > deps[i].jp);
            if (dist <= max_dist)
               min_dist = MIN3(min_dist, dist, 7);
         }
      }

      return { min_dist == ~0u ? 0 : min_dist };
   }
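
   /* Two non-obvious constants above: dependencies more than 10 in-order
    * instructions away are dropped entirely (the assumption being that the
    * in-order pipeline is shallow enough for them to be guaranteed complete
    * by then), and the encoded distance is clamped to the representable
    * RegDist maximum of 7.  Clamping is safe because in-order instructions
    * complete in program order: e.g. a true distance of 9 encoded as
    * RegDist 7 waits on a younger instruction, which implies the older one
    * has completed as well.
    */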

   /**
    * Return whether the dependency list \p deps of an instruction with
    * ordered_address \p jp has any non-trivial ordered dependencies.
    */
   bool
   find_ordered_dependency(const dependency_list &deps,
                           const ordered_address &jp)
   {
      return ordered_dependency_swsb(deps, jp).regdist;
   }

   /**
    * Return the full tgl_sbid_mode bitset for the first unordered dependency
    * on the list \p deps that matches the specified tgl_sbid_mode, or zero
    * if no such dependency is present.
    */
   tgl_sbid_mode
   find_unordered_dependency(const dependency_list &deps,
                             tgl_sbid_mode unordered)
   {
      if (unordered) {
         for (unsigned i = 0; i < deps.size(); i++) {
            if (unordered & deps[i].unordered)
               return deps[i].unordered;
         }
      }

      return TGL_SBID_NULL;
   }

   /**
    * Return the tgl_sbid_mode bitset of an unordered dependency from the
    * list \p deps that can be represented directly in the SWSB annotation of
    * the instruction without additional SYNC instructions, or zero if no
    * such dependency is present.
    */
   tgl_sbid_mode
   baked_unordered_dependency_mode(const fs_inst *inst,
                                   const dependency_list &deps,
                                   const ordered_address &jp)
   {
      const bool has_ordered = find_ordered_dependency(deps, jp);

      if (find_unordered_dependency(deps, TGL_SBID_SET))
         return find_unordered_dependency(deps, TGL_SBID_SET);
      else if (has_ordered && is_unordered(inst))
         return TGL_SBID_NULL;
      else if (find_unordered_dependency(deps, TGL_SBID_DST) &&
               (!has_ordered || !is_unordered(inst)))
         return find_unordered_dependency(deps, TGL_SBID_DST);
      else if (!has_ordered)
         return find_unordered_dependency(deps, TGL_SBID_SRC);
      else
         return TGL_SBID_NULL;
   }
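
   /* Intuition for the case analysis above (an informal reading, not
    * authoritative): a SET (allocation of the instruction's own SBID)
    * always has to live in the instruction's SWSB field, so it wins
    * unconditionally.  The combined RegDist+SBID encoding is interpreted
    * differently depending on the instruction -- on an out-of-order
    * instruction it is taken to be RegDist+SET, so a wait on some other
    * token cannot be baked there and must go into a separate SYNC, while on
    * an in-order instruction it is taken to be RegDist plus a .dst wait.
    * A .src wait can therefore only be baked when there is no ordered
    * dependency at all.
    */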

   /** @} */

   /**
    * Shader instruction dependency calculation.
    * @{
    */

   /**
    * Update scoreboard object \p sb to account for the execution of
    * instruction \p inst.
    */
   void
   update_inst_scoreboard(const fs_visitor *shader, const ordered_address *jps,
                          const fs_inst *inst, unsigned ip, scoreboard &sb)
   {
      /* Track any source registers that may be fetched asynchronously by
       * this instruction, otherwise clear the dependency in order to avoid
       * subsequent redundant synchronization.
       */
      for (unsigned i = 0; i < inst->sources; i++) {
         const dependency rd_dep =
            inst->is_payload(i) || inst->is_math() ? dependency(TGL_SBID_SRC, ip) :
            ordered_unit(inst) ? dependency(TGL_REGDIST_SRC, jps[ip]) :
            dependency::done;

         for (unsigned j = 0; j < regs_read(inst, i); j++)
            sb.set(byte_offset(inst->src[i], REG_SIZE * j), rd_dep);
      }

      if (is_send(inst) && inst->base_mrf != -1) {
         const dependency rd_dep = dependency(TGL_SBID_SRC, ip);

         for (unsigned j = 0; j < inst->mlen; j++)
            sb.set(brw_uvec_mrf(8, inst->base_mrf + j, 0), rd_dep);
      }

      /* Track any destination registers of this instruction. */
      const dependency wr_dep =
         is_unordered(inst) ? dependency(TGL_SBID_DST, ip) :
         ordered_unit(inst) ? dependency(TGL_REGDIST_DST, jps[ip]) :
         dependency();

      if (is_valid(wr_dep) && inst->dst.file != BAD_FILE &&
          !inst->dst.is_null()) {
         for (unsigned j = 0; j < regs_written(inst); j++)
            sb.set(byte_offset(inst->dst, REG_SIZE * j), wr_dep);
      }
   }
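
   /* Note that at this point SBID dependencies are keyed by the IR
    * instruction counter 'ip' of the instruction that owns them; they are
    * only translated to actual hardware SBIDs (modulo 16) later, in
    * allocate_inst_dependencies().  E.g. a SEND at ip=42 (hypothetical)
    * leaves {TGL_SBID_DST, id=42} on every GRF it writes, and any
    * instruction that later touches those GRFs picks the token up from the
    * scoreboard.
    */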

   /**
    * Calculate scoreboard objects locally that represent any pending (and
    * unconditionally resolved) dependencies at the end of each block of the
    * program.
    */
   scoreboard *
   gather_block_scoreboards(const fs_visitor *shader,
                            const ordered_address *jps)
   {
      scoreboard *sbs = new scoreboard[shader->cfg->num_blocks];
      unsigned ip = 0;

      foreach_block_and_inst(block, fs_inst, inst, shader->cfg)
         update_inst_scoreboard(shader, jps, inst, ip++, sbs[block->num]);

      return sbs;
   }

   /**
    * Propagate data dependencies globally through the control flow graph
    * until a fixed point is reached.
    *
    * Calculates the set of dependencies potentially pending at the beginning
    * of each block, and returns it as an array of scoreboard objects.
    */
   scoreboard *
   propagate_block_scoreboards(const fs_visitor *shader,
                               const ordered_address *jps,
                               equivalence_relation &eq)
   {
      const scoreboard *delta_sbs = gather_block_scoreboards(shader, jps);
      scoreboard *in_sbs = new scoreboard[shader->cfg->num_blocks];
      scoreboard *out_sbs = new scoreboard[shader->cfg->num_blocks];

      for (bool progress = true; progress;) {
         progress = false;

         foreach_block(block, shader->cfg) {
            const scoreboard sb = shadow(in_sbs[block->num],
                                         delta_sbs[block->num]);

            if (sb != out_sbs[block->num]) {
               foreach_list_typed(bblock_link, child_link, link,
                                  &block->children) {
                  scoreboard &in_sb = in_sbs[child_link->block->num];
                  const int delta =
                     jps[child_link->block->start_ip] - jps[block->end_ip]
                     - ordered_unit(static_cast<const fs_inst *>(block->end()));

                  in_sb = merge(eq, in_sb, transport(sb, delta));
               }

               out_sbs[block->num] = sb;
               progress = true;
            }
         }
      }

      delete[] delta_sbs;
      delete[] out_sbs;

      return in_sbs;
   }

   /**
    * Return the list of potential dependencies of each instruction in the
    * shader based on the result of global dependency analysis.
    */
   dependency_list *
   gather_inst_dependencies(const fs_visitor *shader,
                            const ordered_address *jps)
   {
      equivalence_relation eq(num_instructions(shader));
      scoreboard *sbs = propagate_block_scoreboards(shader, jps, eq);
      const unsigned *ids = eq.flatten();
      dependency_list *deps = new dependency_list[num_instructions(shader)];
      unsigned ip = 0;

      foreach_block_and_inst(block, fs_inst, inst, shader->cfg) {
         scoreboard &sb = sbs[block->num];

         for (unsigned i = 0; i < inst->sources; i++) {
            for (unsigned j = 0; j < regs_read(inst, i); j++)
               add_dependency(ids, deps[ip], dependency_for_read(
                  sb.get(byte_offset(inst->src[i], REG_SIZE * j))));
         }

         if (is_send(inst) && inst->base_mrf != -1) {
            for (unsigned j = 0; j < inst->mlen; j++)
               add_dependency(ids, deps[ip], dependency_for_read(
                  sb.get(brw_uvec_mrf(8, inst->base_mrf + j, 0))));
         }

         if (is_unordered(inst))
            add_dependency(ids, deps[ip], dependency(TGL_SBID_SET, ip));

         if (!inst->no_dd_check) {
            if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
               for (unsigned j = 0; j < regs_written(inst); j++) {
                  add_dependency(ids, deps[ip], dependency_for_write(inst,
                     sb.get(byte_offset(inst->dst, REG_SIZE * j))));
               }
            }

            if (is_send(inst) && inst->base_mrf != -1) {
               for (int j = 0; j < shader->implied_mrf_writes(inst); j++)
                  add_dependency(ids, deps[ip], dependency_for_write(inst,
                     sb.get(brw_uvec_mrf(8, inst->base_mrf + j, 0))));
            }
         }

         update_inst_scoreboard(shader, jps, inst, ip, sb);
         ip++;
      }

      delete[] sbs;
      delete[] ids;

      return deps;
   }

   /** @} */

   /**
    * Allocate SBID tokens to track the execution of every out-of-order
    * instruction of the shader.
    */
   dependency_list *
   allocate_inst_dependencies(const fs_visitor *shader,
                              const dependency_list *deps0)
   {
      /* XXX - Use a bin-packing algorithm to assign hardware SBIDs
       *       optimally in shaders with a large number of SEND messages.
       */

      /* Allocate an unordered dependency ID to hardware SBID translation
       * table with as many entries as there are instructions in the shader,
       * which is the maximum number of unordered IDs we can find in the
       * program.
       */
      unsigned *ids = new unsigned[num_instructions(shader)];
      for (unsigned ip = 0; ip < num_instructions(shader); ip++)
         ids[ip] = ~0u;

      dependency_list *deps1 = new dependency_list[num_instructions(shader)];
      unsigned next_id = 0;

      for (unsigned ip = 0; ip < num_instructions(shader); ip++) {
         for (unsigned i = 0; i < deps0[ip].size(); i++) {
            const dependency &dep = deps0[ip][i];

            if (dep.unordered && ids[dep.id] == ~0u)
               ids[dep.id] = (next_id++) & 0xf;

            add_dependency(ids, deps1[ip], dep);
         }
      }

      delete[] ids;

      return deps1;
   }

   /**
    * Emit dependency information provided by \p deps into the shader,
    * inserting additional SYNC instructions for dependencies that can't be
    * represented directly by annotating existing instructions.
    */
   void
   emit_inst_dependencies(fs_visitor *shader,
                          const ordered_address *jps,
                          const dependency_list *deps)
   {
      unsigned ip = 0;

      foreach_block_and_inst_safe(block, fs_inst, inst, shader->cfg) {
         tgl_swsb swsb = ordered_dependency_swsb(deps[ip], jps[ip]);
         const tgl_sbid_mode unordered_mode =
            baked_unordered_dependency_mode(inst, deps[ip], jps[ip]);

         for (unsigned i = 0; i < deps[ip].size(); i++) {
            const dependency &dep = deps[ip][i];

            if (dep.unordered) {
               if (unordered_mode == dep.unordered && !swsb.mode) {
                  /* Bake unordered dependency into the instruction's SWSB if
                   * possible.
                   */
                  swsb.sbid = dep.id;
                  swsb.mode = dep.unordered;
               } else {
                  /* Emit dependency into the SWSB of an extra SYNC
                   * instruction.
                   */
                  const fs_builder ibld = fs_builder(shader, block, inst)
                                          .exec_all().group(1, 0);
                  fs_inst *sync = ibld.emit(BRW_OPCODE_SYNC, ibld.null_reg_ud(),
                                            brw_imm_ud(TGL_SYNC_NOP));
                  sync->sched.sbid = dep.id;
                  sync->sched.mode = dep.unordered;
                  assert(!(sync->sched.mode & TGL_SBID_SET));
               }
            }
         }

         /* Update the IR. */
         inst->sched = swsb;
         inst->no_dd_check = inst->no_dd_clear = false;
         ip++;
      }
   }
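
   /* For instance (hypothetical assembly), an instruction needing a
    * RegDist, a .dst wait and a .src wait on two different tokens can bake
    * the first two into its own SWSB but gets a SYNC nop for the third:
    *
    *    sync.nop  null                   {$3.src}
    *    add(16)   g10  g2  g4            {@2, $1.dst}
    *
    * The assert above reflects that a SET is never delegated to a SYNC:
    * it is always representable in the owning instruction's own SWSB.
    */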
}

bool
fs_visitor::lower_scoreboard()
{
   if (devinfo->gen >= 12) {
      const ordered_address *jps = ordered_inst_addresses(this);
      const dependency_list *deps0 = gather_inst_dependencies(this, jps);
      const dependency_list *deps1 = allocate_inst_dependencies(this, deps0);
      emit_inst_dependencies(this, jps, deps1);
      delete[] deps1;
      delete[] deps0;
      delete[] jps;
   }

   return true;
}