r600g/sb: fix kcache handling on r6xx
[mesa.git] / src / gallium / drivers / r600 / sb / sb_gcm.cpp
/*
 * Copyright 2013 Vadim Girlin <vadimgirlin@gmail.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *      Vadim Girlin
 */

#define GCM_DEBUG 0

#if GCM_DEBUG
#define GCM_DUMP(a) do { a } while(0);
#else
#define GCM_DUMP(a)
#endif

#include <iostream>
#include <map>

#include "sb_bc.h"
#include "sb_shader.h"

#include "sb_pass.h"

namespace r600_sb {

using std::cerr;

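// Global code motion entry point. The pass works in two phases over the
// same instruction set: an early top-down pass (collect_instructions +
// init_def_count + sched_early) that moves every op into `pending` and
// schedules it as soon as all of its defs are placed, and a late bottom-up
// pass (collect_instructions + init_use_count + sched_late) that sinks each
// op towards its uses and packs the result into CF/ALU/TEX/VTX clauses.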
int gcm::run() {

    GCM_DUMP( cerr << "==== GCM ==== \n"; sh.dump_ir(); );

    collect_instructions(sh.root, true);

    init_def_count(uses, pending);

    for (node_iterator N, I = pending.begin(), E = pending.end();
            I != E; I = N) {
        N = I;
        ++N;
        node *o = *I;

        GCM_DUMP(
            cerr << "pending : ";
            dump::dump_op(o);
            cerr << "\n";
        );

        if (td_is_ready(o)) {

            GCM_DUMP(
                cerr << " ready: ";
                dump::dump_op(o);
                cerr << "\n";
            );
            pending.remove_node(o);
            ready.push_back(o);
        } else {
        }
    }

    sched_early(sh.root);

    if (!pending.empty()) {
        cerr << "##### gcm_sched_early_pass: unscheduled ops:\n";
        dump::dump_op(pending.front());
    }

    assert(pending.empty());

    GCM_DUMP( sh.dump_ir(); );

    GCM_DUMP( cerr << "\n\n ############## gcm late\n\n"; );

    collect_instructions(sh.root, false);

    init_use_count(uses, pending);

    sched_late(sh.root);
    if (!pending.empty()) {
        cerr << "##### gcm_sched_late_pass: unscheduled ops:\n";
        dump::dump_op(pending.front());
    }

    assert(ucs_level == 0);
    assert(pending.empty());

    return 0;
}


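// Detach all ops from their basic blocks into the `pending` list so the
// scheduler can place them again. On the early pass, ops flagged with
// NF_DONT_MOVE are pinned: their top_bb/bottom_bb are fixed to the block
// they currently live in.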
void gcm::collect_instructions(container_node *c, bool early_pass) {
    if (c->is_bb()) {

        if (early_pass) {
            for (node_iterator I = c->begin(), E = c->end(); I != E; ++I) {
                node *n = *I;
                if (n->flags & NF_DONT_MOVE) {
                    op_info &o = op_map[n];
                    o.top_bb = o.bottom_bb = static_cast<bb_node*>(c);
                }
            }
        }

        pending.append_from(c);
        return;
    }

    for (node_iterator I = c->begin(), E = c->end(); I != E; ++I) {
        if (I->is_container()) {
            collect_instructions(static_cast<container_node*>(*I), early_pass);
        }
    }
}

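// Top-down walk of the region tree. For a region, loop phis are processed
// before the body and normal phis after it; releasing a phi's destinations
// may make its pending users ready. Each basic block is then filled by
// td_sched_bb() from the `ready` queue.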
void gcm::sched_early(container_node *n) {

    region_node *r =
            (n->type == NT_REGION) ? static_cast<region_node*>(n) : NULL;

    if (r && r->loop_phi) {
        sched_early(r->loop_phi);
    }

    for (node_iterator I = n->begin(), E = n->end(); I != E; ++I) {
        if (I->type == NT_OP) {
            node *op = *I;
            if (op->subtype == NST_PHI) {
                td_release_uses(op->dst);
            }
        } else if (I->is_container()) {
            if (I->subtype == NST_BB) {
                bb_node* bb = static_cast<bb_node*>(*I);
                td_sched_bb(bb);
            } else {
                sched_early(static_cast<container_node*>(*I));
            }
        }
    }

    if (r && r->phi) {
        sched_early(r->phi);
    }
}

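// Append a ready op to the current block, record the block as the op's
// top_bb (its earliest placement), and release its destination values so
// that consumers waiting in `pending` can become ready. td_sched_bb()
// keeps draining the `ready` queue until no more ops can be placed in
// this block.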
void gcm::td_schedule(bb_node *bb, node *n) {
    GCM_DUMP(
        cerr << "scheduling : ";
        dump::dump_op(n);
        cerr << "\n";
    );
    td_release_uses(n->dst);

    bb->push_back(n);

    op_map[n].top_bb = bb;

}

void gcm::td_sched_bb(bb_node* bb) {
    GCM_DUMP(
        cerr << "td scheduling BB_" << bb->id << "\n";
    );

    while (!ready.empty()) {
        for (sq_iterator N, I = ready.begin(), E = ready.end(); I != E;
                I = N) {
            N = I; ++N;
            td_schedule(bb, *I);
            ready.erase(I);
        }
    }
}

bool gcm::td_is_ready(node* n) {
    return uses[n] == 0;
}

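// A value defined by a just-scheduled op satisfies one dependency of every
// pending user: decrement each user's remaining def count and move it from
// `pending` to `ready` when the count reaches zero.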
void gcm::td_release_val(value *v) {

    GCM_DUMP(
        cerr << "td checking uses: ";
        dump::dump_val(v);
        cerr << "\n";
    );

    use_info *u = v->uses;
    while (u) {
        if (u->op->parent != &pending) {
            u = u->next;
            continue;
        }

        GCM_DUMP(
            cerr << "td used in ";
            dump::dump_op(u->op);
            cerr << "\n";
        );

        if (--uses[u->op] == 0) {
            GCM_DUMP(
                cerr << "td released : ";
                dump::dump_op(u->op);
                cerr << "\n";
            );

            pending.remove_node(u->op);
            ready.push_back(u->op);
        }
        u = u->next;
    }

}

void gcm::td_release_uses(vvec& v) {
    for (vvec::iterator I = v.begin(), E = v.end(); I != E; ++I) {
        value *v = *I;
        if (!v)
            continue;

        if (v->is_rel())
            td_release_uses(v->mdef);
        else
            td_release_val(v);
    }
}

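// Bottom-up (late) scheduling. The region tree is walked in reverse order;
// depart/repeat nodes push a fresh use-count frame and release the phi
// operands they feed, if-conditions and loop-phi inputs are queued as
// pending defs for the enclosing block, and each basic block is scheduled
// by bu_sched_bb().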
void gcm::sched_late(container_node *n) {

    bool stack_pushed = false;

    if (n->is_depart()) {
        depart_node *d = static_cast<depart_node*>(n);
        push_uc_stack();
        stack_pushed = true;
        bu_release_phi_defs(d->target->phi, d->dep_id);
    } else if (n->is_repeat()) {
        repeat_node *r = static_cast<repeat_node*>(n);
        assert(r->target->loop_phi);
        push_uc_stack();
        stack_pushed = true;
        bu_release_phi_defs(r->target->loop_phi, r->rep_id);
    }

    for (node_riterator I = n->rbegin(), E = n->rend(); I != E; ++I) {
        if (I->is_container()) {
            if (I->subtype == NST_BB) {
                bb_node* bb = static_cast<bb_node*>(*I);
                bu_sched_bb(bb);
            } else {
                sched_late(static_cast<container_node*>(*I));
            }
        }
    }

    if (n->type == NT_IF) {
        if_node *f = static_cast<if_node*>(n);
        if (f->cond)
            pending_defs.push_back(f->cond);
    } else if (n->type == NT_REGION) {
        region_node *r = static_cast<region_node*>(n);
        if (r->loop_phi)
            bu_release_phi_defs(r->loop_phi, 0);
    }

    if (stack_pushed)
        pop_uc_stack();

}

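// Schedule one basic block bottom-up. Pending nodes and defs accumulated by
// the region walk are released first, then ops whose best block is this one
// are pulled in from `ready_above`. The main loop picks ops from the
// per-queue ready lists (CF/ALU/TEX/VTX), creating a new clause whenever
// the instruction type changes and limiting clause size and register
// pressure with the heuristics below (max_fetch, rp_threshold,
// check_alu_ready_count).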
void gcm::bu_sched_bb(bb_node* bb) {
    GCM_DUMP(
        cerr << "bu scheduling BB_" << bb->id << "\n";
    );

    bu_bb = bb;

    if (!pending_nodes.empty()) {
        GCM_DUMP(
            cerr << "pending nodes:\n";
        );

        // TODO consider sorting the exports by array_base,
        // possibly it can improve performance

        for (node_list::iterator I = pending_nodes.begin(),
                E = pending_nodes.end(); I != E; ++I) {
            bu_release_op(*I);
        }
        pending_nodes.clear();
        GCM_DUMP(
            cerr << "pending nodes processed...\n";
        );
    }


    if (!pending_defs.empty()) {
        for (vvec::iterator I = pending_defs.begin(), E = pending_defs.end();
                I != E; ++I) {
            bu_release_val(*I);
        }
        pending_defs.clear();
    }

    for (sched_queue::iterator N, I = ready_above.begin(), E = ready_above.end();
            I != E; I = N) {
        N = I;
        ++N;
        node *n = *I;
        if (op_map[n].bottom_bb == bb) {
            add_ready(*I);
            ready_above.erase(I);
        }
    }

    unsigned cnt_ready[SQ_NUM];

    container_node *clause = NULL;
    unsigned last_inst_type = ~0;
    unsigned last_count = 0;

    bool s = true;
    while (s) {
        node *n;

        s = false;

        unsigned ready_mask = 0;

        for (unsigned sq = SQ_CF; sq < SQ_NUM; ++sq) {
            if (!bu_ready[sq].empty() || !bu_ready_next[sq].empty())
                ready_mask |= (1 << sq);
        }

        if (!ready_mask) {
            for (unsigned sq = SQ_CF; sq < SQ_NUM; ++sq) {
                if (!bu_ready_early[sq].empty()) {
                    node *n = bu_ready_early[sq].front();
                    bu_ready_early[sq].pop_front();
                    bu_ready[sq].push_back(n);
                    break;
                }
            }
        }

        for (unsigned sq = SQ_CF; sq < SQ_NUM; ++sq) {

            if (!bu_ready_next[sq].empty())
                bu_ready[sq].splice(bu_ready[sq].end(), bu_ready_next[sq]);

            cnt_ready[sq] = bu_ready[sq].size();

            if ((sq == SQ_TEX || sq == SQ_VTX) && live_count <= rp_threshold &&
                    cnt_ready[sq] < ctx.max_fetch/2 &&
                    !bu_ready_next[SQ_ALU].empty()) {
                sq = SQ_ALU;
                --sq;
                continue;
            }

            while (!bu_ready[sq].empty()) {

                if (last_inst_type != sq) {
                    clause = NULL;
                    last_count = 0;
                    last_inst_type = sq;
                }

                // simple heuristic to limit register pressure,
                if (sq == SQ_ALU && live_count > rp_threshold &&
                        (!bu_ready[SQ_TEX].empty() ||
                         !bu_ready[SQ_VTX].empty() ||
                         !bu_ready_next[SQ_TEX].empty() ||
                         !bu_ready_next[SQ_VTX].empty())) {
                    GCM_DUMP( cerr << "switching to fetch (regpressure)\n"; );
                    break;
                }

                n = bu_ready[sq].front();

                // real count (e.g. SAMPLE_G will be expanded to 3 instructions:
                // 2 SET_GRAD_ + 1 SAMPLE_G)
                unsigned ncnt = 1;
                if (n->is_fetch_inst() && n->src.size() == 12) {
                    ncnt = 3;
                }

                if ((sq == SQ_TEX || sq == SQ_VTX) &&
                        ((last_count >= ctx.max_fetch/2 &&
                          check_alu_ready_count(24)) ||
                         last_count + ncnt > ctx.max_fetch))
                    break;
                else if (sq == SQ_CF && last_count > 4 &&
                        check_alu_ready_count(24))
                    break;

                bu_ready[sq].pop_front();

                if (sq != SQ_CF) {
                    if (!clause) {
                        clause = sh.create_clause(sq == SQ_ALU ?
                                NST_ALU_CLAUSE :
                                sq == SQ_TEX ? NST_TEX_CLAUSE :
                                NST_VTX_CLAUSE);
                        bb->push_front(clause);
                    }
                } else {
                    clause = bb;
                }

                bu_schedule(clause, n);
                s = true;
                last_count += ncnt;
            }
        }
    }

    bu_bb = NULL;

    GCM_DUMP(
        cerr << "bu finished scheduling BB_" << bb->id << "\n";
    );
}

void gcm::bu_release_defs(vvec& v, bool src) {
    for (vvec::reverse_iterator I = v.rbegin(), E = v.rend(); I != E; ++I) {
        value *v = *I;
        if (!v || v->is_readonly())
            continue;

        if (v->is_rel()) {
            if (!v->rel->is_readonly())
                bu_release_val(v->rel);
            bu_release_defs(v->muse, true);
        } else if (src)
            bu_release_val(v);
        else {
            if (live.remove_val(v)) {
                --live_count;
            }
        }
    }
}

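// The use-count stack tracks, per nesting level of depart/repeat nodes, how
// many uses of each pending op have already been scheduled on the current
// path. push_uc_stack() opens a new frame; pop_uc_stack() merges it back
// into the parent frame.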
void gcm::push_uc_stack() {
    GCM_DUMP(
        cerr << "pushing use count stack prev_level " << ucs_level
            << " new level " << (ucs_level + 1) << "\n";
    );
    ++ucs_level;
    if (ucs_level == nuc_stk.size()) {
        nuc_stk.resize(ucs_level + 1);
    }
    else {
        nuc_stk[ucs_level].clear();
    }
}

bool gcm::bu_is_ready(node* n) {
    nuc_map &cm = nuc_stk[ucs_level];
    nuc_map::iterator F = cm.find(n);
    unsigned uc = (F == cm.end() ? 0 : F->second);
    return uc == uses[n];
}

void gcm::bu_schedule(container_node* c, node* n) {
    GCM_DUMP(
        cerr << "bu scheduling : ";
        dump::dump_op(n);
        cerr << "\n";
    );

    assert(op_map[n].bottom_bb == bu_bb);

    bu_release_defs(n->src, true);
    bu_release_defs(n->dst, false);

    c->push_front(n);
}

void gcm::dump_uc_stack() {
    cerr << "##### uc_stk start ####\n";
    for (unsigned l = 0; l <= ucs_level; ++l) {
        nuc_map &m = nuc_stk[l];

        cerr << "nuc_stk[" << l << "] : @" << &m << "\n";

        for (nuc_map::iterator I = m.begin(), E = m.end(); I != E; ++I) {
            cerr << " uc " << I->second << " for ";
            dump::dump_op(I->first);
            cerr << "\n";
        }
    }
    cerr << "##### uc_stk end ####\n";
}

void gcm::pop_uc_stack() {
    nuc_map &pm = nuc_stk[ucs_level];
    --ucs_level;
    nuc_map &cm = nuc_stk[ucs_level];

    GCM_DUMP(
        cerr << "merging use stack from level " << (ucs_level+1)
            << " to " << ucs_level << "\n";
    );

    for (nuc_map::iterator N, I = pm.begin(), E = pm.end(); I != E; ++I) {
        node *n = I->first;

        GCM_DUMP(
            cerr << " " << cm[n] << " += " << I->second << " for ";
            dump::dump_op(n);
            cerr << "\n";
        );

        unsigned uc = cm[n] += I->second;

        if (n->parent == &pending && uc == uses[n]) {
            cm.erase(n);
            pending_nodes.push_back(n);
            GCM_DUMP(
                cerr << "pushed pending_node due to stack pop ";
                dump::dump_op(n);
                cerr << "\n";
            );
        }
    }
}

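// Choose the final (bottom) block for an op. Starting from the block that
// is currently being scheduled, walk back towards the op's top_bb and
// remember the block with the smallest loop nesting level, so that ops are
// hoisted out of loops when possible. Ops flagged NF_DONT_HOIST (e.g.
// generated copies) stay in the current block.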
void gcm::bu_find_best_bb(node *n, op_info &oi) {

    GCM_DUMP(
        cerr << " find best bb : ";
        dump::dump_op(n);
        cerr << "\n";
    );

    if (oi.bottom_bb)
        return;

    // don't hoist generated copies
    if (n->flags & NF_DONT_HOIST) {
        oi.bottom_bb = bu_bb;
        return;
    }

    bb_node* best_bb = bu_bb;
    bb_node* top_bb = oi.top_bb;
    assert(oi.top_bb && !oi.bottom_bb);

    node *c = best_bb;

    // FIXME top_bb may be located inside a loop, so we'll never reach it in
    // the walk below, and the instruction will be incorrectly placed at the
    // beginning of the shader.
    // For now, just check whether top_bb's loop_level is higher than that of
    // the current bb and abort the search for a better bb in that case,
    // but this problem may require a more complete (and more expensive) fix.
    if (top_bb->loop_level <= best_bb->loop_level) {
        while (c && c != top_bb) {

            if (c->prev) {
                c = c->prev;
            } else {
                c = c->parent;
                if (!c)
                    break;
                continue;
            }

            if (c->subtype == NST_BB) {
                bb_node *bb = static_cast<bb_node*>(c);
                if (bb->loop_level < best_bb->loop_level)
                    best_bb = bb;
            }
        }
    }

    oi.bottom_bb = best_bb;
}

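// Queue a released op for scheduling: ops flagged NF_SCHEDULE_EARLY go to
// the early queue (consulted only when nothing else is ready), copy movs go
// to the front of the current ALU queue, and everything else waits in the
// "next" queue for the following scheduling round.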
void gcm::add_ready(node *n) {
    sched_queue_id sq = sh.get_queue_id(n);
    if (n->flags & NF_SCHEDULE_EARLY)
        bu_ready_early[sq].push_back(n);
    else if (sq == SQ_ALU && n->is_copy_mov())
        bu_ready[sq].push_front(n);
    else
        bu_ready_next[sq].push_back(n);
}

void gcm::bu_release_op(node * n) {
    op_info &oi = op_map[n];

    GCM_DUMP(
        cerr << " bu release op ";
        dump::dump_op(n);
    );

    nuc_stk[ucs_level].erase(n);
    pending.remove_node(n);

    bu_find_best_bb(n, oi);

    if (oi.bottom_bb == bu_bb) {
        GCM_DUMP( cerr << " ready\n";);
        add_ready(n);
    } else {
        GCM_DUMP( cerr << " ready_above\n";);
        ready_above.push_back(n);
    }
}

void gcm::bu_release_phi_defs(container_node* p, unsigned op)
{
    for (node_riterator I = p->rbegin(), E = p->rend(); I != E; ++I) {
        node *o = *I;
        value *v = o->src[op];
        if (v && !v->is_readonly())
            pending_defs.push_back(o->src[op]);

    }
}

unsigned gcm::get_uc_vec(vvec &vv) {
    unsigned c = 0;
    for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) {
        value *v = *I;
        if (!v)
            continue;

        if (v->is_rel())
            c += get_uc_vec(v->mdef);
        else
            c += v->use_count();
    }
    return c;
}

void gcm::init_use_count(nuc_map& m, container_node &s) {
    m.clear();
    for (node_iterator I = s.begin(), E = s.end(); I != E; ++I) {
        node *n = *I;
        unsigned uc = get_uc_vec(n->dst);
        GCM_DUMP(
            cerr << "uc " << uc << " ";
            dump::dump_op(n);
            cerr << "\n";
        );
        if (!uc) {
            pending_nodes.push_back(n);
            GCM_DUMP(
                cerr << "pushed pending_node in init ";
                dump::dump_op(n);
                cerr << "\n";
            );

        } else
            m[n] = uc;
    }
}

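// Called when a use of `v` has been scheduled. Bump the scheduled-use count
// of the defining op in the current stack frame and track liveness; once
// every use has been scheduled (the count reaches uses[n]) the op itself is
// released for scheduling.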
void gcm::bu_release_val(value* v) {
    node *n = v->any_def();

    if (n && n->parent == &pending) {
        nuc_map &m = nuc_stk[ucs_level];
        unsigned uc = ++m[n];
        unsigned uc2 = uses[n];

        if (live.add_val(v)) {
            ++live_count;
            GCM_DUMP ( cerr << "live_count: " << live_count << "\n"; );
        }

        GCM_DUMP(
            cerr << "release val ";
            dump::dump_val(v);
            cerr << " for node ";
            dump::dump_op(n);
            cerr << " new uc=" << uc << ", total " << uc2 << "\n";
        );

        if (uc == uc2)
            bu_release_op(n);
    }

}

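// For the early pass: the readiness counter of an op is the number of
// defining instructions feeding its source and relative-address operands,
// i.e. how many defs must be scheduled before the op itself can be placed.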
void gcm::init_def_count(nuc_map& m, container_node& s) {
    m.clear();
    for (node_iterator I = s.begin(), E = s.end(); I != E; ++I) {
        node *n = *I;
        unsigned dc = get_dc_vec(n->src, true) + get_dc_vec(n->dst, false);
        m[n] = dc;

        GCM_DUMP(
            cerr << "dc " << dc << " ";
            dump::dump_op(n);
            cerr << "\n";
        );
    }
}

unsigned gcm::get_dc_vec(vvec& vv, bool src) {
    unsigned c = 0;
    for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) {
        value *v = *I;
        if (!v || v->is_readonly())
            continue;

        if (v->is_rel()) {
            c += v->rel->def != NULL;
            c += get_dc_vec(v->muse, true);
        }
        else if (src) {
            c += v->def != NULL;
            c += v->adef != NULL;
        }
    }
    return c;
}

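// Count "real" ALU instructions in a ready queue, ignoring trivial copy
// movs from GPRs and expanding packed ALU groups to their member count.
// check_alu_ready_count() uses this to tell whether at least `threshold`
// ALU instructions are ready, which gates switching away from fetch/CF
// clauses in bu_sched_bb().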
unsigned gcm::real_alu_count(sched_queue& q, unsigned max) {
    sq_iterator I(q.begin()), E(q.end());
    unsigned c = 0;

    while (I != E && c < max) {
        node *n = *I;
        if (n->is_alu_inst()) {
            if (!n->is_copy_mov() || !n->src[0]->is_any_gpr())
                ++c;
        } else if (n->is_alu_packed()) {
            c += static_cast<container_node*>(n)->count();
        }
        ++I;
    }

    return c;
}

bool gcm::check_alu_ready_count(unsigned threshold) {
    unsigned r = real_alu_count(bu_ready[SQ_ALU], threshold);
    if (r >= threshold)
        return true;
    r += real_alu_count(bu_ready_next[SQ_ALU], threshold - r);
    return r >= threshold;
}

} // namespace r600_sb