// mesa.git: src/gallium/drivers/r600/sb/sb_gcm.cpp
/*
 * Copyright 2013 Vadim Girlin <vadimgirlin@gmail.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *      Vadim Girlin
 */

#define GCM_DEBUG 0

#if GCM_DEBUG
#define GCM_DUMP(a) do { a } while (0)
#else
#define GCM_DUMP(a)
#endif

#include <map>

#include "sb_bc.h"
#include "sb_shader.h"
#include "sb_pass.h"

namespace r600_sb {

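// Global Code Motion (GCM) scheduling pass, roughly following the usual
// global code motion approach:
//   * an "early", top-down phase (sched_early/td_*): movable ops are collected
//     into `pending`, an op becomes ready once all ops defining its source
//     values have been placed, and it is appended to the earliest basic block
//     where its inputs are available (recorded as op_info::top_bb);
//   * a "late", bottom-up phase (sched_late/bu_*): blocks are visited in
//     reverse, an op becomes ready once all uses of its results have been
//     placed, and it sinks to the block chosen by bu_find_best_bb between
//     top_bb and the current block (preferring the lowest loop nesting level),
//     where it is also packed into ALU/TEX/VTX clauses.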
int gcm::run() {

	GCM_DUMP( sblog << "==== GCM ==== \n"; sh.dump_ir(); );

	collect_instructions(sh.root, true);

	init_def_count(uses, pending);

	for (node_iterator N, I = pending.begin(), E = pending.end();
			I != E; I = N) {
		N = I;
		++N;
		node *o = *I;

		GCM_DUMP(
			sblog << "pending : ";
			dump::dump_op(o);
			sblog << "\n";
		);

		if (td_is_ready(o)) {

			GCM_DUMP(
				sblog << " ready: ";
				dump::dump_op(o);
				sblog << "\n";
			);
			pending.remove_node(o);
			ready.push_back(o);
		}
	}

	sched_early(sh.root);

	if (!pending.empty()) {
		sblog << "##### gcm_sched_early_pass: unscheduled ops:\n";
		dump::dump_op(pending.front());
	}

	assert(pending.empty());

	GCM_DUMP( sh.dump_ir(); );

	GCM_DUMP( sblog << "\n\n ############## gcm late\n\n"; );

	collect_instructions(sh.root, false);

	init_use_count(uses, pending);

	sched_late(sh.root);
	if (!pending.empty()) {
		sblog << "##### gcm_sched_late_pass: unscheduled ops:\n";
		dump::dump_op(pending.front());
	}

	assert(ucs_level == 0);
	assert(pending.empty());

	return 0;
}

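// Gather the contents of all basic blocks into the `pending` list for the
// current phase. On the early pass, ops marked NF_DONT_MOVE are pinned to
// their original block by setting both top_bb and bottom_bb up front.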
void gcm::collect_instructions(container_node *c, bool early_pass) {
	if (c->is_bb()) {

		if (early_pass) {
			for (node_iterator I = c->begin(), E = c->end(); I != E; ++I) {
				node *n = *I;
				if (n->flags & NF_DONT_MOVE) {
					op_info &o = op_map[n];
					o.top_bb = o.bottom_bb = static_cast<bb_node*>(c);
				}
			}
		}

		pending.append_from(c);
		return;
	}

	for (node_iterator I = c->begin(), E = c->end(); I != E; ++I) {
		if (I->is_container()) {
			collect_instructions(static_cast<container_node*>(*I), early_pass);
		}
	}
}

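// Top-down walk for the early phase: for a region, loop phis are handled
// first, then the body, then the regular phis; phi results are released
// immediately (the phi nodes themselves are not moved), and each basic block
// is filled from the `ready` queue by td_sched_bb.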
void gcm::sched_early(container_node *n) {

	region_node *r =
			(n->type == NT_REGION) ? static_cast<region_node*>(n) : NULL;

	if (r && r->loop_phi) {
		sched_early(r->loop_phi);
	}

	for (node_iterator I = n->begin(), E = n->end(); I != E; ++I) {
		if (I->type == NT_OP) {
			node *op = *I;
			if (op->subtype == NST_PHI) {
				td_release_uses(op->dst);
			}
		} else if (I->is_container()) {
			if (I->subtype == NST_BB) {
				bb_node* bb = static_cast<bb_node*>(*I);
				td_sched_bb(bb);
			} else {
				sched_early(static_cast<container_node*>(*I));
			}
		}
	}

	if (r && r->phi) {
		sched_early(r->phi);
	}
}

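// Early-phase helpers: td_schedule appends a ready op to the current block and
// releases the uses of its results; td_sched_bb drains the `ready` queue into
// a block until no more ops become ready; td_release_val decrements the
// remaining-definition counter of every pending consumer of a value and moves
// consumers that reach zero from `pending` to `ready`.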
void gcm::td_schedule(bb_node *bb, node *n) {
	GCM_DUMP(
		sblog << "scheduling : ";
		dump::dump_op(n);
		sblog << "\n";
	);
	td_release_uses(n->dst);

	bb->push_back(n);

	op_map[n].top_bb = bb;
}

void gcm::td_sched_bb(bb_node* bb) {
	GCM_DUMP(
		sblog << "td scheduling BB_" << bb->id << "\n";
	);

	while (!ready.empty()) {
		for (sq_iterator N, I = ready.begin(), E = ready.end(); I != E;
				I = N) {
			N = I; ++N;
			td_schedule(bb, *I);
			ready.erase(I);
		}
	}
}

bool gcm::td_is_ready(node* n) {
	return uses[n] == 0;
}

void gcm::td_release_val(value *v) {

	GCM_DUMP(
		sblog << "td checking uses: ";
		dump::dump_val(v);
		sblog << "\n";
	);

	use_info *u = v->uses;
	while (u) {
		if (u->op->parent != &pending) {
			u = u->next;
			continue;
		}

		GCM_DUMP(
			sblog << "td used in ";
			dump::dump_op(u->op);
			sblog << "\n";
		);

		if (--uses[u->op] == 0) {
			GCM_DUMP(
				sblog << "td released : ";
				dump::dump_op(u->op);
				sblog << "\n";
			);

			pending.remove_node(u->op);
			ready.push_back(u->op);
		}
		u = u->next;
	}
}

void gcm::td_release_uses(vvec& v) {
	for (vvec::iterator I = v.begin(), E = v.end(); I != E; ++I) {
		value *val = *I;
		if (!val)
			continue;

		if (val->is_rel())
			td_release_uses(val->mdef);
		else
			td_release_val(val);
	}
}

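// Bottom-up walk for the late phase. Children are visited in reverse order;
// depart/repeat nodes push a fresh use-count level and release the phi
// operands belonging to their edge (dep_id/rep_id); IF conditions and loop phi
// inputs are queued in pending_defs so they are released when the enclosing
// block is scheduled.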
void gcm::sched_late(container_node *n) {

	bool stack_pushed = false;

	if (n->is_depart()) {
		depart_node *d = static_cast<depart_node*>(n);
		push_uc_stack();
		stack_pushed = true;
		bu_release_phi_defs(d->target->phi, d->dep_id);
	} else if (n->is_repeat()) {
		repeat_node *r = static_cast<repeat_node*>(n);
		assert(r->target->loop_phi);
		push_uc_stack();
		stack_pushed = true;
		bu_release_phi_defs(r->target->loop_phi, r->rep_id);
	}

	for (node_riterator I = n->rbegin(), E = n->rend(); I != E; ++I) {
		if (I->is_container()) {
			if (I->subtype == NST_BB) {
				bb_node* bb = static_cast<bb_node*>(*I);
				bu_sched_bb(bb);
			} else {
				sched_late(static_cast<container_node*>(*I));
			}
		}
	}

	if (n->type == NT_IF) {
		if_node *f = static_cast<if_node*>(n);
		if (f->cond)
			pending_defs.push_back(f->cond);
	} else if (n->type == NT_REGION) {
		region_node *r = static_cast<region_node*>(n);
		if (r->loop_phi)
			bu_release_phi_defs(r->loop_phi, 0);
	}

	if (stack_pushed)
		pop_uc_stack();
}

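// Late-phase scheduling of a single basic block, built bottom-up (ops and
// clauses are pushed to the front of the block). Ready ops are kept in
// per-queue lists: bu_ready (schedulable now), bu_ready_next (deferred to the
// next round), bu_ready_early (NF_SCHEDULE_EARLY ops, used only when nothing
// else is ready), and ready_above (ops whose best block is further up the
// CFG). Non-CF ops are wrapped in ALU/TEX/VTX clauses; clause size limits
// (ctx.max_fetch, CF batching) and a live-value threshold (rp_threshold)
// balance clause length against register pressure.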
void gcm::bu_sched_bb(bb_node* bb) {
	GCM_DUMP(
		sblog << "bu scheduling BB_" << bb->id << "\n";
	);

	bu_bb = bb;

	if (!pending_nodes.empty()) {
		GCM_DUMP(
			sblog << "pending nodes:\n";
		);

		// TODO: consider sorting the exports by array_base;
		// it may improve performance

		for (node_list::iterator I = pending_nodes.begin(),
				E = pending_nodes.end(); I != E; ++I) {
			bu_release_op(*I);
		}
		pending_nodes.clear();
		GCM_DUMP(
			sblog << "pending nodes processed...\n";
		);
	}

	if (!pending_defs.empty()) {
		for (vvec::iterator I = pending_defs.begin(), E = pending_defs.end();
				I != E; ++I) {
			bu_release_val(*I);
		}
		pending_defs.clear();
	}

	for (sched_queue::iterator N, I = ready_above.begin(), E = ready_above.end();
			I != E; I = N) {
		N = I;
		++N;
		node *n = *I;
		if (op_map[n].bottom_bb == bb) {
			add_ready(*I);
			ready_above.erase(I);
		}
	}

	unsigned cnt_ready[SQ_NUM];

	container_node *clause = NULL;
	unsigned last_inst_type = ~0;
	unsigned last_count = 0;

	bool s = true;
	while (s) {
		node *n;

		s = false;

		unsigned ready_mask = 0;

		for (unsigned sq = SQ_CF; sq < SQ_NUM; ++sq) {
			if (!bu_ready[sq].empty() || !bu_ready_next[sq].empty())
				ready_mask |= (1 << sq);
		}

		if (!ready_mask) {
			for (unsigned sq = SQ_CF; sq < SQ_NUM; ++sq) {
				if (!bu_ready_early[sq].empty()) {
					node *n = bu_ready_early[sq].front();
					bu_ready_early[sq].pop_front();
					bu_ready[sq].push_back(n);
					break;
				}
			}
		}

		for (unsigned sq = SQ_CF; sq < SQ_NUM; ++sq) {

			// if a PRED_SET that updates the exec mask is still pending,
			// skip the CF queue for this round and process the ALU queue first
			if (sq == SQ_CF && pending_exec_mask_update) {
				pending_exec_mask_update = false;
				sq = SQ_ALU;
				--sq;
				continue;
			}

			if (!bu_ready_next[sq].empty())
				bu_ready[sq].splice(bu_ready[sq].end(), bu_ready_next[sq]);

			cnt_ready[sq] = bu_ready[sq].size();

			if ((sq == SQ_TEX || sq == SQ_VTX) && live_count <= rp_threshold &&
					cnt_ready[sq] < ctx.max_fetch/2 &&
					!bu_ready_next[SQ_ALU].empty()) {
				sq = SQ_ALU;
				--sq;
				continue;
			}

			while (!bu_ready[sq].empty()) {

				if (last_inst_type != sq) {
					clause = NULL;
					last_count = 0;
					last_inst_type = sq;
				}

				// simple heuristic to limit register pressure: while too many
				// values are live, prefer ready fetch instructions over ALU
				if (sq == SQ_ALU && live_count > rp_threshold &&
						(!bu_ready[SQ_TEX].empty() ||
						!bu_ready[SQ_VTX].empty() ||
						!bu_ready_next[SQ_TEX].empty() ||
						!bu_ready_next[SQ_VTX].empty())) {
					GCM_DUMP( sblog << "switching to fetch (regpressure)\n"; );
					break;
				}

				n = bu_ready[sq].front();

				// real count (e.g. SAMPLE_G will be expanded to 3 instructions,
				// 2 SET_GRAD_ + 1 SAMPLE_G)
				unsigned ncnt = 1;
				if (n->is_fetch_inst() && n->src.size() == 12) {
					ncnt = 3;
				}

				if ((sq == SQ_TEX || sq == SQ_VTX) &&
						((last_count >= ctx.max_fetch/2 &&
						check_alu_ready_count(24)) ||
						last_count + ncnt > ctx.max_fetch))
					break;
				else if (sq == SQ_CF && last_count > 4 &&
						check_alu_ready_count(24))
					break;

				bu_ready[sq].pop_front();

				if (sq != SQ_CF) {
					if (!clause) {
						clause = sh.create_clause(sq == SQ_ALU ?
								NST_ALU_CLAUSE :
								sq == SQ_TEX ? NST_TEX_CLAUSE :
										NST_VTX_CLAUSE);
						bb->push_front(clause);
					}
				} else {
					clause = bb;
				}

				bu_schedule(clause, n);
				s = true;
				last_count += ncnt;
			}
		}
	}

	bu_bb = NULL;

	GCM_DUMP(
		sblog << "bu finished scheduling BB_" << bb->id << "\n";
	);
}

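// Walk an operand vector after an op has been scheduled: for sources, release
// the values (which may make their defining ops ready); for destinations,
// remove the values from the live set so live_count tracks the current
// register pressure estimate.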
void gcm::bu_release_defs(vvec& v, bool src) {
	for (vvec::reverse_iterator I = v.rbegin(), E = v.rend(); I != E; ++I) {
		value *val = *I;
		if (!val || val->is_readonly())
			continue;

		if (val->is_rel()) {
			if (!val->rel->is_readonly())
				bu_release_val(val->rel);
			bu_release_defs(val->muse, true);
		} else if (src)
			bu_release_val(val);
		else {
			if (live.remove_val(val)) {
				--live_count;
			}
		}
	}
}

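// Use counts for the late phase are kept on a small stack of maps (nuc_stk),
// one level per nested depart/repeat path, so uses seen on divergent paths are
// only merged into the parent level when the path is popped; an op is ready
// once the accumulated count equals its total use count from init_use_count.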
void gcm::push_uc_stack() {
	GCM_DUMP(
		sblog << "pushing use count stack prev_level " << ucs_level
			<< " new level " << (ucs_level + 1) << "\n";
	);
	++ucs_level;
	if (ucs_level == nuc_stk.size()) {
		nuc_stk.resize(ucs_level + 1);
	} else {
		nuc_stk[ucs_level].clear();
	}
}

bool gcm::bu_is_ready(node* n) {
	nuc_map &cm = nuc_stk[ucs_level];
	nuc_map::iterator F = cm.find(n);
	unsigned uc = (F == cm.end() ? 0 : F->second);
	return uc == uses[n];
}

void gcm::bu_schedule(container_node* c, node* n) {
	GCM_DUMP(
		sblog << "bu scheduling : ";
		dump::dump_op(n);
		sblog << "\n";
	);

	assert(op_map[n].bottom_bb == bu_bb);

	bu_release_defs(n->src, true);
	bu_release_defs(n->dst, false);

	c->push_front(n);
}

void gcm::dump_uc_stack() {
	sblog << "##### uc_stk start ####\n";
	for (unsigned l = 0; l <= ucs_level; ++l) {
		nuc_map &m = nuc_stk[l];

		sblog << "nuc_stk[" << l << "] : @" << &m << "\n";

		for (nuc_map::iterator I = m.begin(), E = m.end(); I != E; ++I) {
			sblog << " uc " << I->second << " for ";
			dump::dump_op(I->first);
			sblog << "\n";
		}
	}
	sblog << "##### uc_stk end ####\n";
}

void gcm::pop_uc_stack() {
	nuc_map &pm = nuc_stk[ucs_level];
	--ucs_level;
	nuc_map &cm = nuc_stk[ucs_level];

	GCM_DUMP(
		sblog << "merging use stack from level " << (ucs_level+1)
			<< " to " << ucs_level << "\n";
	);

	for (nuc_map::iterator N, I = pm.begin(), E = pm.end(); I != E; ++I) {
		node *n = I->first;

		GCM_DUMP(
			sblog << " " << cm[n] << " += " << I->second << " for ";
			dump::dump_op(n);
			sblog << "\n";
		);

		unsigned uc = cm[n] += I->second;

		if (n->parent == &pending && uc == uses[n]) {
			cm.erase(n);
			pending_nodes.push_back(n);
			GCM_DUMP(
				sblog << "pushed pending_node due to stack pop ";
				dump::dump_op(n);
				sblog << "\n";
			);
		}
	}
}

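// Choose the block where an op will finally be placed: starting from the
// current block and walking back towards its top_bb, pick the block with the
// lowest loop nesting level, so ops are hoisted out of loops when their inputs
// allow it (see the FIXME below for a known limitation).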
void gcm::bu_find_best_bb(node *n, op_info &oi) {

	GCM_DUMP(
		sblog << " find best bb : ";
		dump::dump_op(n);
		sblog << "\n";
	);

	if (oi.bottom_bb)
		return;

	// don't hoist generated copies
	if (n->flags & NF_DONT_HOIST) {
		oi.bottom_bb = bu_bb;
		return;
	}

	bb_node* best_bb = bu_bb;
	bb_node* top_bb = oi.top_bb;
	assert(oi.top_bb && !oi.bottom_bb);

	node *c = best_bb;

	// FIXME: top_bb may be located inside a loop, so we'll never enter it
	// in the loop below, and the instruction will be incorrectly placed at the
	// beginning of the shader.
	// For now, just check if top_bb's loop_level is higher than the current
	// bb's and abort the search for a better bb in that case, but this problem
	// may require a more complete (and more expensive) fix.
	if (top_bb->loop_level <= best_bb->loop_level) {
		while (c && c != top_bb) {

			if (c->prev) {
				c = c->prev;
			} else {
				c = c->parent;
				if (!c)
					break;
				continue;
			}

			if (c->subtype == NST_BB) {
				bb_node *bb = static_cast<bb_node*>(c);
				if (bb->loop_level < best_bb->loop_level)
					best_bb = bb;
			}
		}
	}

	oi.bottom_bb = best_bb;
}

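// Put a released op on the appropriate ready queue: NF_SCHEDULE_EARLY ops wait
// in bu_ready_early, copy MOVs jump to the front of the ALU queue, and
// everything else goes to bu_ready_next for the following round; a PRED_SET
// that writes the exec mask also sets pending_exec_mask_update, which makes
// bu_sched_bb handle the ALU queue before taking more CF ops in that round.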
void gcm::add_ready(node *n) {
	sched_queue_id sq = sh.get_queue_id(n);
	if (n->flags & NF_SCHEDULE_EARLY)
		bu_ready_early[sq].push_back(n);
	else if (sq == SQ_ALU && n->is_copy_mov())
		bu_ready[sq].push_front(n);
	else if (n->is_alu_inst()) {
		alu_node *a = static_cast<alu_node*>(n);
		if (a->bc.op_ptr->flags & AF_PRED && a->dst[2]) {
			// PRED_SET instruction that updates exec mask
			pending_exec_mask_update = true;
		}
		bu_ready_next[sq].push_back(n);
	} else
		bu_ready_next[sq].push_back(n);
}

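// All uses of this op's results have been scheduled: take it out of `pending`,
// decide its final block, and either make it ready in the current block or
// park it in ready_above until scheduling reaches its bottom_bb.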
void gcm::bu_release_op(node * n) {
	op_info &oi = op_map[n];

	GCM_DUMP(
		sblog << " bu release op ";
		dump::dump_op(n);
	);

	nuc_stk[ucs_level].erase(n);
	pending.remove_node(n);

	bu_find_best_bb(n, oi);

	if (oi.bottom_bb == bu_bb) {
		GCM_DUMP( sblog << " ready\n";);
		add_ready(n);
	} else {
		GCM_DUMP( sblog << " ready_above\n";);
		ready_above.push_back(n);
	}
}

void gcm::bu_release_phi_defs(container_node* p, unsigned op)
{
	for (node_riterator I = p->rbegin(), E = p->rend(); I != E; ++I) {
		node *o = *I;
		value *v = o->src[op];
		if (v && !v->is_readonly())
			pending_defs.push_back(o->src[op]);
	}
}

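// get_uc_vec sums the use counts of all destination values of an op (relative
// destinations count the uses of their mdef values instead); init_use_count
// records that total per op for the late phase, and ops with no uses at all
// (e.g. exports and other side-effect-only ops) go straight to pending_nodes.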
unsigned gcm::get_uc_vec(vvec &vv) {
	unsigned c = 0;
	for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) {
		value *v = *I;
		if (!v)
			continue;

		if (v->is_rel())
			c += get_uc_vec(v->mdef);
		else
			c += v->use_count();
	}
	return c;
}

void gcm::init_use_count(nuc_map& m, container_node &s) {
	m.clear();
	for (node_iterator I = s.begin(), E = s.end(); I != E; ++I) {
		node *n = *I;
		unsigned uc = get_uc_vec(n->dst);
		GCM_DUMP(
			sblog << "uc " << uc << " ";
			dump::dump_op(n);
			sblog << "\n";
		);
		if (!uc) {
			pending_nodes.push_back(n);
			GCM_DUMP(
				sblog << "pushed pending_node in init ";
				dump::dump_op(n);
				sblog << "\n";
			);
		} else
			m[n] = uc;
	}
}

void gcm::bu_release_val(value* v) {
	node *n = v->any_def();

	if (n && n->parent == &pending) {
		nuc_map &m = nuc_stk[ucs_level];
		unsigned uc = ++m[n];
		unsigned uc2 = uses[n];

		if (live.add_val(v)) {
			++live_count;
			GCM_DUMP( sblog << "live_count: " << live_count << "\n"; );
		}

		GCM_DUMP(
			sblog << "release val ";
			dump::dump_val(v);
			sblog << " for node ";
			dump::dump_op(n);
			sblog << " new uc=" << uc << ", total " << uc2 << "\n";
		);

		if (uc == uc2)
			bu_release_op(n);
	}
}

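// For the early phase, each op's counter is the number of definitions it
// waits for: get_dc_vec counts the definitions reaching its source values and
// the values used by relative addressing on either side, so td_is_ready simply
// checks that this counter has dropped to zero.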
void gcm::init_def_count(nuc_map& m, container_node& s) {
	m.clear();
	for (node_iterator I = s.begin(), E = s.end(); I != E; ++I) {
		node *n = *I;
		unsigned dc = get_dc_vec(n->src, true) + get_dc_vec(n->dst, false);
		m[n] = dc;

		GCM_DUMP(
			sblog << "dc " << dc << " ";
			dump::dump_op(n);
			sblog << "\n";
		);
	}
}

unsigned gcm::get_dc_vec(vvec& vv, bool src) {
	unsigned c = 0;
	for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) {
		value *v = *I;
		if (!v || v->is_readonly())
			continue;

		if (v->is_rel()) {
			c += v->rel->def != NULL;
			c += get_dc_vec(v->muse, true);
		} else if (src) {
			c += v->def != NULL;
			c += v->adef != NULL;
		}
	}
	return c;
}

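// Estimate how many ALU instructions are really waiting in a queue, capped at
// `max`: copy MOVs from a GPR are skipped (presumably because they are cheap
// and likely to be coalesced later), and packed ALU nodes count as the number
// of sub-instructions they contain; check_alu_ready_count applies this to
// bu_ready and bu_ready_next to decide when enough ALU work has piled up to
// close a fetch or CF run.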
unsigned gcm::real_alu_count(sched_queue& q, unsigned max) {
	sq_iterator I(q.begin()), E(q.end());
	unsigned c = 0;

	while (I != E && c < max) {
		node *n = *I;
		if (n->is_alu_inst()) {
			if (!n->is_copy_mov() || !n->src[0]->is_any_gpr())
				++c;
		} else if (n->is_alu_packed()) {
			c += static_cast<container_node*>(n)->count();
		}
		++I;
	}

	return c;
}

bool gcm::check_alu_ready_count(unsigned threshold) {
	unsigned r = real_alu_count(bu_ready[SQ_ALU], threshold);
	if (r >= threshold)
		return true;
	r += real_alu_count(bu_ready_next[SQ_ALU], threshold - r);
	return r >= threshold;
}

} // namespace r600_sb