intel/compiler: Fix pointer arithmetic when reading shader assembly
[mesa.git] src/intel/compiler/brw_fs_scoreboard.cpp
/*
 * Copyright © 2019 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs_scoreboard.cpp
 *
 * Gen12+ hardware lacks the register scoreboard logic that used to guarantee
 * data coherency between register reads and writes in previous generations.
 * This lowering pass runs after register allocation in order to make up for
 * it.
 *
 * It works by performing global dataflow analysis in order to determine the
 * set of potential dependencies of every instruction in the shader, and then
 * inserts any required SWSB annotations and additional SYNC instructions in
 * order to guarantee data coherency.
 *
 * WARNING - Access of the following (rarely used) ARF registers is not
 *           tracked here and requires the RegDist SWSB annotation to be set
 *           to 1 by the generator in order to avoid data races:
 *
 *  - sp stack pointer
 *  - sr0 state register
 *  - cr0 control register
 *  - ip instruction pointer
 *  - tm0 timestamp register
 *  - dbg0 debug register
 *
 * The following ARF registers don't need to be tracked here because data
 * coherency is still provided transparently by the hardware:
 *
 *  - f0-1 flag registers
 *  - n0 notification register
 *  - tdr0 thread dependency register
 */

#include "brw_fs.h"
#include "brw_cfg.h"

using namespace brw;

namespace {
   /**
    * In-order instruction accounting.
    * @{
    */

   /**
    * Number of in-order hardware instructions contained in this IR
    * instruction. This determines the increment applied to the RegDist
    * counter calculated for any ordered dependency that crosses this
    * instruction.
    */
   unsigned
   ordered_unit(const fs_inst *inst)
   {
      switch (inst->opcode) {
      case BRW_OPCODE_SYNC:
      case BRW_OPCODE_DO:
      case SHADER_OPCODE_UNDEF:
      case FS_OPCODE_PLACEHOLDER_HALT:
         return 0;
      default:
         /* Note that the following is inaccurate for virtual instructions
          * that expand to more in-order instructions than assumed here, but
          * that can only lead to suboptimal execution ordering; data
          * coherency won't be impacted. Providing exact RegDist counts for
          * each virtual instruction would allow better ALU performance, but
          * it would require keeping this switch statement in perfect sync
          * with the generator in order to avoid data corruption. Lesson is
          * (again): don't use virtual instructions if you want optimal
          * scheduling.
          */
         return is_unordered(inst) ? 0 : 1;
      }
   }
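
   /* Illustrative sketch (not part of the pass): given the instruction
    * sequence
    *
    *    add(8)  r10, r11, r12   -> ordered,   jp = 0
    *    send(8) r20, ...        -> unordered, jp = 1 (contributes no unit)
    *    mul(8)  r30, r31, r32   -> ordered,   jp = 1
    *    mov(8)  r40, r10        -> ordered,   jp = 2
    *
    * the MOV's read of r10 depends on the ADD at jp = 0, so the RegDist
    * annotation encoding that dependency would be 2 - 0 = 2 in-order
    * instructions.
    */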

   /**
    * Type for an instruction counter that increments for in-order
    * instructions only, arbitrarily denoted 'jp' throughout this lowering
    * pass in order to distinguish it from the regular instruction counter.
    */
   typedef int ordered_address;

   /**
    * Return the number of instructions in the program.
    */
   unsigned
   num_instructions(const backend_shader *shader)
   {
      return shader->cfg->blocks[shader->cfg->num_blocks - 1]->end_ip + 1;
   }

   /**
    * Calculate the local ordered_address instruction counter at every
    * instruction of the shader for subsequent constant-time look-up.
    */
   ordered_address *
   ordered_inst_addresses(const fs_visitor *shader)
   {
      ordered_address *jps = new ordered_address[num_instructions(shader)];
      ordered_address jp = 0;
      unsigned ip = 0;

      foreach_block_and_inst(block, fs_inst, inst, shader->cfg) {
         jps[ip] = jp;
         jp += ordered_unit(inst);
         ip++;
      }

      return jps;
   }

   /**
    * Synchronization mode required for data manipulated by in-order
    * instructions.
    *
    * Similar to tgl_sbid_mode, but without SET mode. Defined as a separate
    * enum for additional type safety. The hardware doesn't provide control
    * over the synchronization mode for RegDist annotations; this is only used
    * internally in this pass in order to optimize out redundant read
    * dependencies where possible.
    */
   enum tgl_regdist_mode {
      TGL_REGDIST_NULL = 0,
      TGL_REGDIST_SRC = 1,
      TGL_REGDIST_DST = 2
   };

   /**
    * Allow bitwise arithmetic of tgl_regdist_mode enums.
    */
   tgl_regdist_mode
   operator|(tgl_regdist_mode x, tgl_regdist_mode y)
   {
      return tgl_regdist_mode(unsigned(x) | unsigned(y));
   }

   tgl_regdist_mode
   operator&(tgl_regdist_mode x, tgl_regdist_mode y)
   {
      return tgl_regdist_mode(unsigned(x) & unsigned(y));
   }

   tgl_regdist_mode &
   operator|=(tgl_regdist_mode &x, tgl_regdist_mode y)
   {
      return x = x | y;
   }

   tgl_regdist_mode &
   operator&=(tgl_regdist_mode &x, tgl_regdist_mode y)
   {
      return x = x & y;
   }

   /** @} */

   /**
    * Representation of an equivalence relation among the set of unsigned
    * integers.
    *
    * Its initial state is the identity relation '~' such that i ~ j if and
    * only if i == j for every pair of unsigned integers i and j.
    */
   struct equivalence_relation {
      equivalence_relation(unsigned n) : is(new unsigned[n]), n(n)
      {
         for (unsigned i = 0; i < n; i++)
            is[i] = i;
      }

      ~equivalence_relation()
      {
         delete[] is;
      }

      /**
       * Return equivalence class index of the specified element. Effectively
       * this is the numeric value of an arbitrary representative from the
       * equivalence class.
       *
       * Allows the evaluation of the equivalence relation according to the
       * rule that i ~ j if and only if lookup(i) == lookup(j).
       */
      unsigned
      lookup(unsigned i) const
      {
         if (i < n && is[i] != i)
            return lookup(is[i]);
         else
            return i;
      }

      /**
       * Create an array with the results of the lookup() method for
       * constant-time evaluation.
       */
      unsigned *
      flatten() const
      {
         unsigned *ids = new unsigned[n];

         for (unsigned i = 0; i < n; i++)
            ids[i] = lookup(i);

         return ids;
      }

      /**
       * Mutate the existing equivalence relation minimally by imposing the
       * additional requirement that i ~ j.
       *
       * The algorithm updates the internal representation recursively in
       * order to guarantee transitivity while preserving the previously
       * specified equivalence requirements.
       */
      unsigned
      link(unsigned i, unsigned j)
      {
         const unsigned k = lookup(i);
         assign(i, k);
         assign(j, k);
         return k;
      }

   private:
      equivalence_relation(const equivalence_relation &);

      equivalence_relation &
      operator=(const equivalence_relation &);

      /**
       * Assign the representative of \p from to be equivalent to \p to.
       *
       * At the same time the data structure is partially flattened, as much
       * as possible without increasing the number of recursive calls.
       */
      void
      assign(unsigned from, unsigned to)
      {
         if (from != to) {
            assert(from < n);

            if (is[from] != from)
               assign(is[from], to);

            is[from] = to;
         }
      }

      unsigned *is;
      unsigned n;
   };
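
   /* Usage sketch (illustrative): after
    *
    *    equivalence_relation eq(4);
    *    eq.link(0, 1);
    *    eq.link(1, 2);
    *
    * lookup(0), lookup(1) and lookup(2) all return the same representative
    * (here 0), while lookup(3) still returns 3. This is the classic
    * union-find data structure, with assign() performing partial path
    * compression.
    */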

   /**
    * Representation of a data dependency between two instructions in the
    * program.
    * @{
    */
   struct dependency {
      /**
       * No dependency information.
       */
      dependency() : ordered(TGL_REGDIST_NULL), jp(INT_MIN),
                     unordered(TGL_SBID_NULL), id(0),
                     exec_all(false) {}

      /**
       * Construct a dependency on the in-order instruction with the provided
       * ordered_address instruction counter.
       */
      dependency(tgl_regdist_mode mode, ordered_address jp, bool exec_all) :
         ordered(mode), jp(jp), unordered(TGL_SBID_NULL), id(0),
         exec_all(exec_all) {}

      /**
       * Construct a dependency on the out-of-order instruction with the
       * specified synchronization token.
       */
      dependency(tgl_sbid_mode mode, unsigned id, bool exec_all) :
         ordered(TGL_REGDIST_NULL), jp(INT_MIN), unordered(mode), id(id),
         exec_all(exec_all) {}

      /**
       * Synchronization mode of in-order dependency, or zero if no in-order
       * dependency is present.
       */
      tgl_regdist_mode ordered;

      /**
       * Instruction counter of in-order dependency.
       *
       * For a dependency part of a different block in the program, this is
       * relative to the specific control flow path taken between the
       * dependency and the current block: It is the ordered_address such that
       * the difference between it and the ordered_address of the first
       * instruction of the current block is exactly the number of in-order
       * instructions across that control flow path. It is not guaranteed to
       * be equal to the local ordered_address of the generating instruction
       * [as returned by ordered_inst_addresses()], except for block-local
       * dependencies.
       */
      ordered_address jp;

      /**
       * Synchronization mode of unordered dependency, or zero if no unordered
       * dependency is present.
       */
      tgl_sbid_mode unordered;

      /** Synchronization token of out-of-order dependency. */
      unsigned id;

      /**
       * Whether the dependency could be run with execution masking disabled,
       * which might lead to the unwanted execution of the generating
       * instruction in cases where a BB is executed with all channels
       * disabled due to hardware bug GEN:BUG:1407528679.
       */
      bool exec_all;

      /**
       * Trivial in-order dependency that's always satisfied.
       *
       * Note that unlike a default-constructed dependency() which is also
       * trivially satisfied, this is considered to provide dependency
       * information and can be used to clear a previously pending dependency
       * via shadow().
       */
      static const dependency done;

      friend bool
      operator==(const dependency &dep0, const dependency &dep1)
      {
         return dep0.ordered == dep1.ordered &&
                dep0.jp == dep1.jp &&
                dep0.unordered == dep1.unordered &&
                dep0.id == dep1.id &&
                dep0.exec_all == dep1.exec_all;
      }

      friend bool
      operator!=(const dependency &dep0, const dependency &dep1)
      {
         return !(dep0 == dep1);
      }
   };

   const dependency dependency::done = dependency(TGL_REGDIST_SRC, INT_MIN, false);

   /**
    * Return whether \p dep contains any dependency information.
    */
   bool
   is_valid(const dependency &dep)
   {
      return dep.ordered || dep.unordered;
   }

   /**
    * Combine \p dep0 and \p dep1 into a single dependency object that is only
    * satisfied when both original dependencies are satisfied. This might
    * involve updating the equivalence relation \p eq in order to make sure
    * that both out-of-order dependencies are assigned the same hardware SBID
    * as synchronization token.
    */
   dependency
   merge(equivalence_relation &eq,
         const dependency &dep0, const dependency &dep1)
   {
      dependency dep;

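      /* Note that taking the maximum jp is safe: in-order instructions
       * retire in program order, so a wait on the most recent ordered
       * dependency implies that any earlier ordered dependency has
       * completed as well.
       */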
      if (dep0.ordered || dep1.ordered) {
         dep.ordered = dep0.ordered | dep1.ordered;
         dep.jp = MAX2(dep0.jp, dep1.jp);
      }

      if (dep0.unordered || dep1.unordered) {
         dep.unordered = dep0.unordered | dep1.unordered;
         dep.id = eq.link(dep0.unordered ? dep0.id : dep1.id,
                          dep1.unordered ? dep1.id : dep0.id);
      }

      dep.exec_all = dep0.exec_all || dep1.exec_all;

      return dep;
   }

   /**
    * Override dependency information of \p dep0 with that of \p dep1.
    */
   dependency
   shadow(const dependency &dep0, const dependency &dep1)
   {
      return is_valid(dep1) ? dep1 : dep0;
   }

   /**
    * Translate dependency information across the program.
    *
    * This returns a dependency on the same instruction translated to the
    * ordered_address space of a different block. The correct shift for
    * transporting a dependency across an edge of the CFG is the difference
    * between the local ordered_address of the first instruction of the target
    * block and the local ordered_address of the instruction immediately after
    * the end of the origin block.
    */
   dependency
   transport(dependency dep, int delta)
   {
      if (dep.ordered && dep.jp > INT_MIN)
         dep.jp += delta;

      return dep;
   }

   /**
    * Return simplified dependency removing any synchronization modes not
    * applicable to an instruction reading the same register location.
    */
   dependency
   dependency_for_read(dependency dep)
   {
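      /* Reads only conflict with previous writes, so a pending SRC-mode
       * (read) dependency can be dropped here: read-after-read requires no
       * synchronization.
       */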
      dep.ordered &= TGL_REGDIST_DST;
      return dep;
   }

   /**
    * Return simplified dependency removing any synchronization modes not
    * applicable to an instruction \p inst writing the same register location.
    */
   dependency
   dependency_for_write(const fs_inst *inst, dependency dep)
   {
      if (!is_unordered(inst))
         dep.ordered &= TGL_REGDIST_DST;
      return dep;
   }

   /** @} */

   /**
    * Scoreboard representation. This keeps track of the data dependencies of
    * registers with GRF granularity.
    */
   class scoreboard {
   public:
      /**
       * Look up the most current data dependency for register \p r.
       */
      dependency
      get(const fs_reg &r) const
      {
         if (const dependency *p = const_cast<scoreboard *>(this)->dep(r))
            return *p;
         else
            return dependency();
      }

      /**
       * Specify the most current data dependency for register \p r.
       */
      void
      set(const fs_reg &r, const dependency &d)
      {
         if (dependency *p = dep(r))
            *p = d;
      }

      /**
       * Component-wise merge() of corresponding dependencies from two
       * scoreboard objects. \sa merge().
       */
      friend scoreboard
      merge(equivalence_relation &eq,
            const scoreboard &sb0, const scoreboard &sb1)
      {
         scoreboard sb;

         for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
            sb.grf_deps[i] = merge(eq, sb0.grf_deps[i], sb1.grf_deps[i]);

         sb.addr_dep = merge(eq, sb0.addr_dep, sb1.addr_dep);

         for (unsigned i = 0; i < ARRAY_SIZE(sb.accum_deps); i++)
            sb.accum_deps[i] = merge(eq, sb0.accum_deps[i], sb1.accum_deps[i]);

         return sb;
      }

      /**
       * Component-wise shadow() of corresponding dependencies from two
       * scoreboard objects. \sa shadow().
       */
      friend scoreboard
      shadow(const scoreboard &sb0, const scoreboard &sb1)
      {
         scoreboard sb;

         for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
            sb.grf_deps[i] = shadow(sb0.grf_deps[i], sb1.grf_deps[i]);

         sb.addr_dep = shadow(sb0.addr_dep, sb1.addr_dep);

         for (unsigned i = 0; i < ARRAY_SIZE(sb.accum_deps); i++)
            sb.accum_deps[i] = shadow(sb0.accum_deps[i], sb1.accum_deps[i]);

         return sb;
      }

      /**
       * Component-wise transport() of dependencies from a scoreboard
       * object. \sa transport().
       */
      friend scoreboard
      transport(const scoreboard &sb0, int delta)
      {
         scoreboard sb;

         for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
            sb.grf_deps[i] = transport(sb0.grf_deps[i], delta);

         sb.addr_dep = transport(sb0.addr_dep, delta);

         for (unsigned i = 0; i < ARRAY_SIZE(sb.accum_deps); i++)
            sb.accum_deps[i] = transport(sb0.accum_deps[i], delta);

         return sb;
      }

      friend bool
      operator==(const scoreboard &sb0, const scoreboard &sb1)
      {
         for (unsigned i = 0; i < ARRAY_SIZE(sb0.grf_deps); i++) {
            if (sb0.grf_deps[i] != sb1.grf_deps[i])
               return false;
         }

         if (sb0.addr_dep != sb1.addr_dep)
            return false;

         for (unsigned i = 0; i < ARRAY_SIZE(sb0.accum_deps); i++) {
            if (sb0.accum_deps[i] != sb1.accum_deps[i])
               return false;
         }

         return true;
      }

      friend bool
      operator!=(const scoreboard &sb0, const scoreboard &sb1)
      {
         return !(sb0 == sb1);
      }

   private:
      dependency grf_deps[BRW_MAX_GRF];
      dependency addr_dep;
      dependency accum_deps[10];

      dependency *
      dep(const fs_reg &r)
      {
         const unsigned reg = (r.file == VGRF ? r.nr + r.offset / REG_SIZE :
                               reg_offset(r) / REG_SIZE);

         return (r.file == VGRF || r.file == FIXED_GRF ? &grf_deps[reg] :
                 r.file == MRF ? &grf_deps[GEN7_MRF_HACK_START + reg] :
                 r.file == ARF && reg >= BRW_ARF_ADDRESS &&
                    reg < BRW_ARF_ACCUMULATOR ? &addr_dep :
                 r.file == ARF && reg >= BRW_ARF_ACCUMULATOR &&
                    reg < BRW_ARF_FLAG ?
                    &accum_deps[reg - BRW_ARF_ACCUMULATOR] :
                 NULL);
      }
   };

   /**
    * Dependency list handling.
    * @{
    */
   struct dependency_list {
      dependency_list() : deps(NULL), n(0) {}

      ~dependency_list()
      {
         free(deps);
      }

      void
      push_back(const dependency &dep)
      {
         deps = (dependency *)realloc(deps, (n + 1) * sizeof(*deps));
         deps[n++] = dep;
      }

      unsigned
      size() const
      {
         return n;
      }

      const dependency &
      operator[](unsigned i) const
      {
         assert(i < n);
         return deps[i];
      }

      dependency &
      operator[](unsigned i)
      {
         assert(i < n);
         return deps[i];
      }

   private:
      dependency_list(const dependency_list &);
      dependency_list &
      operator=(const dependency_list &);

      dependency *deps;
      unsigned n;
   };

   /**
    * Add dependency \p dep to the list of dependencies of an instruction
    * \p deps.
    */
   void
   add_dependency(const unsigned *ids, dependency_list &deps, dependency dep)
   {
      if (is_valid(dep)) {
         /* Translate the unordered dependency token first in order to keep
          * the list minimally redundant.
          */
         if (dep.unordered)
            dep.id = ids[dep.id];

         /* Try to combine the specified dependency with any existing ones. */
         for (unsigned i = 0; i < deps.size(); i++) {
            /* Don't combine otherwise matching dependencies if there is an
             * exec_all mismatch which would cause a SET dependency to gain an
             * exec_all flag, since that would prevent it from being baked
             * into the instruction we want to allocate an SBID for.
             */
            if (deps[i].exec_all != dep.exec_all &&
                (!deps[i].exec_all || (dep.unordered & TGL_SBID_SET)) &&
                (!dep.exec_all || (deps[i].unordered & TGL_SBID_SET)))
               continue;

            if (dep.ordered && deps[i].ordered) {
               deps[i].jp = MAX2(deps[i].jp, dep.jp);
               deps[i].ordered |= dep.ordered;
               deps[i].exec_all |= dep.exec_all;
               dep.ordered = TGL_REGDIST_NULL;
            }

            if (dep.unordered && deps[i].unordered && deps[i].id == dep.id) {
               deps[i].unordered |= dep.unordered;
               deps[i].exec_all |= dep.exec_all;
               dep.unordered = TGL_SBID_NULL;
            }
         }

         /* Add it to the end of the list if necessary. */
         if (is_valid(dep))
            deps.push_back(dep);
      }
   }

   /**
    * Construct a tgl_swsb annotation encoding any ordered dependencies from
    * the dependency list \p deps of an instruction with ordered_address \p
    * jp. If \p exec_all is false only dependencies known to be executed with
    * channel masking applied will be considered in the calculation.
    */
   tgl_swsb
   ordered_dependency_swsb(const dependency_list &deps,
                           const ordered_address &jp,
                           bool exec_all)
   {
      unsigned min_dist = ~0u;

      for (unsigned i = 0; i < deps.size(); i++) {
         if (deps[i].ordered && exec_all >= deps[i].exec_all) {
            const unsigned dist = jp - deps[i].jp;
            const unsigned max_dist = 10;
            assert(jp > deps[i].jp);
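            /* Dependencies more than max_dist in-order instructions in the
             * past are treated as already satisfied, while closer ones are
             * clamped to 7 since the SWSB RegDist field is only 3 bits
             * wide. The clamp is safe because waiting on a more recent
             * in-order instruction also waits on everything issued before
             * it.
             */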
            if (dist <= max_dist)
               min_dist = MIN3(min_dist, dist, 7);
         }
      }

      return { min_dist == ~0u ? 0 : min_dist };
   }

   /**
    * Return whether the dependency list \p deps of an instruction with
    * ordered_address \p jp has any non-trivial ordered dependencies. If \p
    * exec_all is false only dependencies known to be executed with channel
    * masking applied will be considered in the calculation.
    */
   bool
   find_ordered_dependency(const dependency_list &deps,
                           const ordered_address &jp,
                           bool exec_all)
   {
      return ordered_dependency_swsb(deps, jp, exec_all).regdist;
   }

   /**
    * Return the full tgl_sbid_mode bitset for the first unordered dependency
    * on the list \p deps that matches the specified tgl_sbid_mode, or zero if
    * no such dependency is present. If \p exec_all is false only
    * dependencies known to be executed with channel masking applied will be
    * considered in the calculation.
    */
   tgl_sbid_mode
   find_unordered_dependency(const dependency_list &deps,
                             tgl_sbid_mode unordered,
                             bool exec_all)
   {
      if (unordered) {
         for (unsigned i = 0; i < deps.size(); i++) {
            if ((unordered & deps[i].unordered) &&
                exec_all >= deps[i].exec_all)
               return deps[i].unordered;
         }
      }

      return TGL_SBID_NULL;
   }

   /**
    * Return the tgl_sbid_mode bitset of an unordered dependency from the list
    * \p deps that can be represented directly in the SWSB annotation of the
    * instruction without additional SYNC instructions, or zero if no such
    * dependency is present.
    */
   tgl_sbid_mode
   baked_unordered_dependency_mode(const fs_inst *inst,
                                   const dependency_list &deps,
                                   const ordered_address &jp)
   {
      const bool exec_all = inst->force_writemask_all;
      const bool has_ordered = find_ordered_dependency(deps, jp, exec_all);

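      /* A SET dependency must always be baked into the SWSB annotation of
       * the out-of-order instruction itself, since that's what allocates the
       * SBID token in the first place.
       */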
      if (find_unordered_dependency(deps, TGL_SBID_SET, exec_all))
         return find_unordered_dependency(deps, TGL_SBID_SET, exec_all);
      else if (has_ordered && is_unordered(inst))
         return TGL_SBID_NULL;
      else if (find_unordered_dependency(deps, TGL_SBID_DST, exec_all) &&
               (!has_ordered || !is_unordered(inst)))
         return find_unordered_dependency(deps, TGL_SBID_DST, exec_all);
      else if (!has_ordered)
         return find_unordered_dependency(deps, TGL_SBID_SRC, exec_all);
      else
         return TGL_SBID_NULL;
   }

   /** @} */

   /**
    * Shader instruction dependency calculation.
    * @{
    */

   /**
    * Update scoreboard object \p sb to account for the execution of
    * instruction \p inst.
    */
   void
   update_inst_scoreboard(const ordered_address *jps,
                          const fs_inst *inst, unsigned ip, scoreboard &sb)
   {
      const bool exec_all = inst->force_writemask_all;

      /* Track any source registers that may be fetched asynchronously by this
       * instruction, otherwise clear the dependency in order to avoid
       * subsequent redundant synchronization.
       */
      for (unsigned i = 0; i < inst->sources; i++) {
         const dependency rd_dep =
            (inst->is_payload(i) ||
             inst->is_math()) ? dependency(TGL_SBID_SRC, ip, exec_all) :
            ordered_unit(inst) ? dependency(TGL_REGDIST_SRC, jps[ip], exec_all) :
            dependency::done;

         for (unsigned j = 0; j < regs_read(inst, i); j++)
            sb.set(byte_offset(inst->src[i], REG_SIZE * j), rd_dep);
      }

      if (is_send(inst) && inst->base_mrf != -1) {
         const dependency rd_dep = dependency(TGL_SBID_SRC, ip, exec_all);

         for (unsigned j = 0; j < inst->mlen; j++)
            sb.set(brw_uvec_mrf(8, inst->base_mrf + j, 0), rd_dep);
      }

      /* Track any destination registers of this instruction. */
      const dependency wr_dep =
         is_unordered(inst) ? dependency(TGL_SBID_DST, ip, exec_all) :
         ordered_unit(inst) ? dependency(TGL_REGDIST_DST, jps[ip], exec_all) :
         dependency();

      if (is_valid(wr_dep) && inst->dst.file != BAD_FILE &&
          !inst->dst.is_null()) {
         for (unsigned j = 0; j < regs_written(inst); j++)
            sb.set(byte_offset(inst->dst, REG_SIZE * j), wr_dep);
      }
   }

   /**
    * Calculate scoreboard objects locally that represent any pending (and
    * unconditionally resolved) dependencies at the end of each block of the
    * program.
    */
   scoreboard *
   gather_block_scoreboards(const fs_visitor *shader,
                            const ordered_address *jps)
   {
      scoreboard *sbs = new scoreboard[shader->cfg->num_blocks];
      unsigned ip = 0;

      foreach_block_and_inst(block, fs_inst, inst, shader->cfg)
         update_inst_scoreboard(jps, inst, ip++, sbs[block->num]);

      return sbs;
   }

   /**
    * Propagate data dependencies globally through the control flow graph
    * until a fixed point is reached.
    *
    * Calculates the set of dependencies potentially pending at the beginning
    * of each block, and returns it as an array of scoreboard objects.
    */
   scoreboard *
   propagate_block_scoreboards(const fs_visitor *shader,
                               const ordered_address *jps,
                               equivalence_relation &eq)
   {
      const scoreboard *delta_sbs = gather_block_scoreboards(shader, jps);
      scoreboard *in_sbs = new scoreboard[shader->cfg->num_blocks];
      scoreboard *out_sbs = new scoreboard[shader->cfg->num_blocks];

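      /* Standard forward dataflow fixed-point iteration: keep merging each
       * block's outgoing dependency state into its successors until no
       * block's output changes anymore.
       */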
      for (bool progress = true; progress;) {
         progress = false;

         foreach_block(block, shader->cfg) {
            const scoreboard sb = shadow(in_sbs[block->num],
                                         delta_sbs[block->num]);

            if (sb != out_sbs[block->num]) {
               foreach_list_typed(bblock_link, child_link, link,
                                  &block->children) {
                  scoreboard &in_sb = in_sbs[child_link->block->num];
                  const int delta =
                     jps[child_link->block->start_ip] - jps[block->end_ip]
                     - ordered_unit(static_cast<const fs_inst *>(block->end()));

                  in_sb = merge(eq, in_sb, transport(sb, delta));
               }

               out_sbs[block->num] = sb;
               progress = true;
            }
         }
      }

      delete[] delta_sbs;
      delete[] out_sbs;

      return in_sbs;
   }

   /**
    * Return the list of potential dependencies of each instruction in the
    * shader based on the result of global dependency analysis.
    */
   dependency_list *
   gather_inst_dependencies(const fs_visitor *shader,
                            const ordered_address *jps)
   {
      equivalence_relation eq(num_instructions(shader));
      scoreboard *sbs = propagate_block_scoreboards(shader, jps, eq);
      const unsigned *ids = eq.flatten();
      dependency_list *deps = new dependency_list[num_instructions(shader)];
      unsigned ip = 0;

      foreach_block_and_inst(block, fs_inst, inst, shader->cfg) {
         const bool exec_all = inst->force_writemask_all;
         scoreboard &sb = sbs[block->num];

         for (unsigned i = 0; i < inst->sources; i++) {
            for (unsigned j = 0; j < regs_read(inst, i); j++)
               add_dependency(ids, deps[ip], dependency_for_read(
                  sb.get(byte_offset(inst->src[i], REG_SIZE * j))));
         }

         if (is_send(inst) && inst->base_mrf != -1) {
            for (unsigned j = 0; j < inst->mlen; j++)
               add_dependency(ids, deps[ip], dependency_for_read(
                  sb.get(brw_uvec_mrf(8, inst->base_mrf + j, 0))));
         }

         if (is_unordered(inst))
            add_dependency(ids, deps[ip],
                           dependency(TGL_SBID_SET, ip, exec_all));

         if (!inst->no_dd_check) {
            if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
               for (unsigned j = 0; j < regs_written(inst); j++) {
                  add_dependency(ids, deps[ip], dependency_for_write(inst,
                     sb.get(byte_offset(inst->dst, REG_SIZE * j))));
               }
            }

            if (is_send(inst) && inst->base_mrf != -1) {
               for (unsigned j = 0; j < inst->implied_mrf_writes(); j++)
                  add_dependency(ids, deps[ip], dependency_for_write(inst,
                     sb.get(brw_uvec_mrf(8, inst->base_mrf + j, 0))));
            }
         }

         update_inst_scoreboard(jps, inst, ip, sb);
         ip++;
      }

      delete[] sbs;
      delete[] ids;

      return deps;
   }

   /** @} */

   /**
    * Allocate SBID tokens to track the execution of every out-of-order
    * instruction of the shader.
    */
   dependency_list *
   allocate_inst_dependencies(const fs_visitor *shader,
                              const dependency_list *deps0)
   {
      /* XXX - Use bin-packing algorithm to assign hardware SBIDs optimally in
       *       shaders with a large number of SEND messages.
       */

      /* Allocate an unordered dependency ID to hardware SBID translation
       * table with as many entries as there are instructions in the shader,
       * which is the maximum number of unordered IDs we can find in the
       * program.
       */
      unsigned *ids = new unsigned[num_instructions(shader)];
      for (unsigned ip = 0; ip < num_instructions(shader); ip++)
         ids[ip] = ~0u;

      dependency_list *deps1 = new dependency_list[num_instructions(shader)];
      unsigned next_id = 0;

      for (unsigned ip = 0; ip < num_instructions(shader); ip++) {
         for (unsigned i = 0; i < deps0[ip].size(); i++) {
            const dependency &dep = deps0[ip][i];

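            /* The SWSB SBID field is four bits wide, so only 16 hardware
             * tokens are available; hand them out round-robin, wrapping
             * around once they run out.
             */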
            if (dep.unordered && ids[dep.id] == ~0u)
               ids[dep.id] = (next_id++) & 0xf;

            add_dependency(ids, deps1[ip], dep);
         }
      }

      delete[] ids;

      return deps1;
   }

   /**
    * Emit dependency information provided by \p deps into the shader,
    * inserting additional SYNC instructions for dependencies that can't be
    * represented directly by annotating existing instructions.
    */
   void
   emit_inst_dependencies(fs_visitor *shader,
                          const ordered_address *jps,
                          const dependency_list *deps)
   {
      unsigned ip = 0;

      foreach_block_and_inst_safe(block, fs_inst, inst, shader->cfg) {
         const bool exec_all = inst->force_writemask_all;
         tgl_swsb swsb = ordered_dependency_swsb(deps[ip], jps[ip], exec_all);
         const tgl_sbid_mode unordered_mode =
            baked_unordered_dependency_mode(inst, deps[ip], jps[ip]);

         for (unsigned i = 0; i < deps[ip].size(); i++) {
            const dependency &dep = deps[ip][i];

            if (dep.unordered) {
               if (unordered_mode == dep.unordered &&
                   exec_all >= dep.exec_all && !swsb.mode) {
                  /* Bake unordered dependency into the instruction's SWSB if
                   * possible, except in cases where the current instruction
                   * isn't marked NoMask but the dependency is, since that
                   * might lead to data coherency issues due to
                   * GEN:BUG:1407528679.
                   */
                  swsb.sbid = dep.id;
                  swsb.mode = dep.unordered;
               } else {
                  /* Emit dependency into the SWSB of an extra SYNC
                   * instruction.
                   */
                  const fs_builder ibld = fs_builder(shader, block, inst)
                                          .exec_all().group(1, 0);
                  fs_inst *sync = ibld.emit(BRW_OPCODE_SYNC, ibld.null_reg_ud(),
                                            brw_imm_ud(TGL_SYNC_NOP));
                  sync->sched.sbid = dep.id;
                  sync->sched.mode = dep.unordered;
                  assert(!(sync->sched.mode & TGL_SBID_SET));
               }
            }
         }

         for (unsigned i = 0; i < deps[ip].size(); i++) {
            const dependency &dep = deps[ip][i];

            if (dep.ordered && dep.exec_all > exec_all &&
                find_ordered_dependency(deps[ip], jps[ip], true)) {
               /* If the current instruction is not marked NoMask but an
                * ordered dependency is, perform the synchronization as a
                * separate NoMask SYNC instruction in order to avoid data
                * coherency issues due to GEN:BUG:1407528679. The similar
                * scenario with unordered dependencies should have been
                * handled above.
                */
               const fs_builder ibld = fs_builder(shader, block, inst)
                                       .exec_all().group(1, 0);
               fs_inst *sync = ibld.emit(BRW_OPCODE_SYNC, ibld.null_reg_ud(),
                                         brw_imm_ud(TGL_SYNC_NOP));
               sync->sched = ordered_dependency_swsb(deps[ip], jps[ip], true);
               break;
            }
         }

         /* Update the IR. */
         inst->sched = swsb;
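         /* The NoDDClr/NoDDChk flags have no meaning on Gen12+, where the
          * SWSB annotation assigned above takes over their role, so clear
          * them in the IR.
          */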
         inst->no_dd_check = inst->no_dd_clear = false;
         ip++;
      }
   }
}

bool
fs_visitor::lower_scoreboard()
{
   if (devinfo->gen >= 12) {
      const ordered_address *jps = ordered_inst_addresses(this);
      const dependency_list *deps0 = gather_inst_dependencies(this, jps);
      const dependency_list *deps1 = allocate_inst_dependencies(this, deps0);
      emit_inst_dependencies(this, jps, deps1);
      delete[] deps1;
      delete[] deps0;
      delete[] jps;
   }

   return true;
}