2 * Copyright 2013 Vadim Girlin <vadimgirlin@gmail.com>
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
/* Debug-dump helper: executes its argument statement(s).
 * Wrapped in do { } while (0) WITHOUT a trailing semicolon so that
 * "GCM_DUMP(...);" expands to exactly one statement — the original trailing
 * ';' produced an extra empty statement, which breaks unbraced if/else uses
 * of the macro. */
#define GCM_DUMP(a) do { a } while (0)
38 #include "sb_shader.h"
40 #include "eg_sq.h" // V_SQ_CF_INDEX_NONE
46 GCM_DUMP( sblog
<< "==== GCM ==== \n"; sh
.dump_ir(); );
48 collect_instructions(sh
.root
, true);
50 init_def_count(uses
, pending
);
52 for (node_iterator N
, I
= pending
.begin(), E
= pending
.end();
59 sblog
<< "pending : ";
71 pending
.remove_node(o
);
79 if (!pending
.empty()) {
80 sblog
<< "##### gcm_sched_early_pass: unscheduled ops:\n";
81 dump::dump_op(pending
.front());
84 assert(pending
.empty());
86 GCM_DUMP( sh
.dump_ir(); );
88 GCM_DUMP( sblog
<< "\n\n ############## gcm late\n\n"; );
90 collect_instructions(sh
.root
, false);
92 init_use_count(uses
, pending
);
95 if (!pending
.empty()) {
96 sblog
<< "##### gcm_sched_late_pass: unscheduled ops:\n";
97 dump::dump_op(pending
.front());
100 assert(ucs_level
== 0);
101 assert(pending
.empty());
107 void gcm::collect_instructions(container_node
*c
, bool early_pass
) {
111 for (node_iterator I
= c
->begin(), E
= c
->end(); I
!= E
; ++I
) {
113 if (n
->flags
& NF_DONT_MOVE
) {
114 op_info
&o
= op_map
[n
];
115 o
.top_bb
= o
.bottom_bb
= static_cast<bb_node
*>(c
);
120 pending
.append_from(c
);
124 for (node_iterator I
= c
->begin(), E
= c
->end(); I
!= E
; ++I
) {
125 if (I
->is_container()) {
126 collect_instructions(static_cast<container_node
*>(*I
), early_pass
);
131 void gcm::sched_early(container_node
*n
) {
134 (n
->type
== NT_REGION
) ? static_cast<region_node
*>(n
) : NULL
;
136 if (r
&& r
->loop_phi
) {
137 sched_early(r
->loop_phi
);
140 for (node_iterator I
= n
->begin(), E
= n
->end(); I
!= E
; ++I
) {
141 if (I
->type
== NT_OP
) {
143 if (op
->subtype
== NST_PHI
) {
144 td_release_uses(op
->dst
);
146 } else if (I
->is_container()) {
147 if (I
->subtype
== NST_BB
) {
148 bb_node
* bb
= static_cast<bb_node
*>(*I
);
151 sched_early(static_cast<container_node
*>(*I
));
161 void gcm::td_schedule(bb_node
*bb
, node
*n
) {
163 sblog
<< "scheduling : ";
167 td_release_uses(n
->dst
);
171 op_map
[n
].top_bb
= bb
;
175 void gcm::td_sched_bb(bb_node
* bb
) {
177 sblog
<< "td scheduling BB_" << bb
->id
<< "\n";
180 while (!ready
.empty()) {
181 for (sq_iterator N
, I
= ready
.begin(), E
= ready
.end(); I
!= E
;
190 bool gcm::td_is_ready(node
* n
) {
194 void gcm::td_release_val(value
*v
) {
197 sblog
<< "td checking uses: ";
202 for (uselist::iterator I
= v
->uses
.begin(), E
= v
->uses
.end(); I
!= E
; ++I
) {
204 if (op
->parent
!= &pending
) {
209 sblog
<< "td used in ";
214 assert(uses
[op
] > 0);
215 if (--uses
[op
] == 0) {
217 sblog
<< "td released : ";
222 pending
.remove_node(op
);
229 void gcm::td_release_uses(vvec
& v
) {
230 for (vvec::iterator I
= v
.begin(), E
= v
.end(); I
!= E
; ++I
) {
236 td_release_uses(v
->mdef
);
242 void gcm::sched_late(container_node
*n
) {
244 bool stack_pushed
= false;
246 if (n
->is_depart()) {
247 depart_node
*d
= static_cast<depart_node
*>(n
);
250 bu_release_phi_defs(d
->target
->phi
, d
->dep_id
);
251 } else if (n
->is_repeat()) {
252 repeat_node
*r
= static_cast<repeat_node
*>(n
);
253 assert(r
->target
->loop_phi
);
256 bu_release_phi_defs(r
->target
->loop_phi
, r
->rep_id
);
259 for (node_riterator I
= n
->rbegin(), E
= n
->rend(); I
!= E
; ++I
) {
260 if (I
->is_container()) {
261 if (I
->subtype
== NST_BB
) {
262 bb_node
* bb
= static_cast<bb_node
*>(*I
);
265 sched_late(static_cast<container_node
*>(*I
));
270 if (n
->type
== NT_IF
) {
271 if_node
*f
= static_cast<if_node
*>(n
);
273 pending_defs
.push_back(f
->cond
);
274 } else if (n
->type
== NT_REGION
) {
275 region_node
*r
= static_cast<region_node
*>(n
);
277 bu_release_phi_defs(r
->loop_phi
, 0);
285 void gcm::bu_sched_bb(bb_node
* bb
) {
287 sblog
<< "bu scheduling BB_" << bb
->id
<< "\n";
292 if (!pending_nodes
.empty()) {
294 sblog
<< "pending nodes:\n";
297 // TODO consider sorting the exports by array_base,
298 // possibly it can improve performance
300 for (node_list::iterator I
= pending_nodes
.begin(),
301 E
= pending_nodes
.end(); I
!= E
; ++I
) {
304 pending_nodes
.clear();
306 sblog
<< "pending nodes processed...\n";
311 if (!pending_defs
.empty()) {
312 for (vvec::iterator I
= pending_defs
.begin(), E
= pending_defs
.end();
316 pending_defs
.clear();
319 for (sched_queue::iterator N
, I
= ready_above
.begin(), E
= ready_above
.end();
324 if (op_map
[n
].bottom_bb
== bb
) {
326 ready_above
.erase(I
);
330 unsigned cnt_ready
[SQ_NUM
];
332 container_node
*clause
= NULL
;
333 unsigned last_inst_type
= ~0;
334 unsigned last_count
= 0;
342 unsigned ready_mask
= 0;
344 for (unsigned sq
= SQ_CF
; sq
< SQ_NUM
; ++sq
) {
345 if (!bu_ready
[sq
].empty() || !bu_ready_next
[sq
].empty())
346 ready_mask
|= (1 << sq
);
350 for (unsigned sq
= SQ_CF
; sq
< SQ_NUM
; ++sq
) {
351 if (!bu_ready_early
[sq
].empty()) {
352 node
*n
= bu_ready_early
[sq
].front();
353 bu_ready_early
[sq
].pop_front();
354 bu_ready
[sq
].push_back(n
);
360 for (unsigned sq
= SQ_CF
; sq
< SQ_NUM
; ++sq
) {
362 if (sq
== SQ_CF
&& pending_exec_mask_update
) {
363 pending_exec_mask_update
= false;
369 if (!bu_ready_next
[sq
].empty())
370 bu_ready
[sq
].splice(bu_ready
[sq
].end(), bu_ready_next
[sq
]);
372 cnt_ready
[sq
] = bu_ready
[sq
].size();
374 if ((sq
== SQ_TEX
|| sq
== SQ_VTX
) && live_count
<= rp_threshold
&&
375 cnt_ready
[sq
] < ctx
.max_fetch
/2 &&
376 !bu_ready_next
[SQ_ALU
].empty()) {
382 while (!bu_ready
[sq
].empty()) {
384 if (last_inst_type
!= sq
) {
390 // simple heuristic to limit register pressure,
391 if (sq
== SQ_ALU
&& live_count
> rp_threshold
&&
392 (!bu_ready
[SQ_TEX
].empty() ||
393 !bu_ready
[SQ_VTX
].empty() ||
394 !bu_ready_next
[SQ_TEX
].empty() ||
395 !bu_ready_next
[SQ_VTX
].empty())) {
396 GCM_DUMP( sblog
<< "switching to fetch (regpressure)\n"; );
400 n
= bu_ready
[sq
].front();
402 // real count (e.g. SAMPLE_G will be expanded to 3 instructions,
403 // 2 SET_GRAD_ + 1 SAMPLE_G
405 if (n
->is_fetch_inst() && n
->src
.size() == 12) {
409 bool sampler_indexing
= false;
410 if (n
->is_fetch_inst() &&
411 static_cast<fetch_node
*>(n
)->bc
.sampler_index_mode
!= V_SQ_CF_INDEX_NONE
)
413 sampler_indexing
= true; // Give sampler indexed ops get their own clause
414 ncnt
= sh
.get_ctx().is_cayman() ? 2 : 3; // MOVA + SET_CF_IDX0/1
417 if ((sq
== SQ_TEX
|| sq
== SQ_VTX
) &&
418 ((last_count
>= ctx
.max_fetch
/2 &&
419 check_alu_ready_count(24)) ||
420 last_count
+ ncnt
> ctx
.max_fetch
))
422 else if (sq
== SQ_CF
&& last_count
> 4 &&
423 check_alu_ready_count(24))
426 bu_ready
[sq
].pop_front();
429 if (!clause
|| sampler_indexing
) {
430 clause
= sh
.create_clause(sq
== SQ_ALU
?
432 sq
== SQ_TEX
? NST_TEX_CLAUSE
:
434 bb
->push_front(clause
);
440 bu_schedule(clause
, n
);
450 sblog
<< "bu finished scheduling BB_" << bb
->id
<< "\n";
454 void gcm::bu_release_defs(vvec
& v
, bool src
) {
455 for (vvec::reverse_iterator I
= v
.rbegin(), E
= v
.rend(); I
!= E
; ++I
) {
457 if (!v
|| v
->is_readonly())
461 if (!v
->rel
->is_readonly())
462 bu_release_val(v
->rel
);
463 bu_release_defs(v
->muse
, true);
467 if (live
.remove_val(v
)) {
474 void gcm::push_uc_stack() {
476 sblog
<< "pushing use count stack prev_level " << ucs_level
477 << " new level " << (ucs_level
+ 1) << "\n";
480 if (ucs_level
== nuc_stk
.size()) {
481 nuc_stk
.resize(ucs_level
+ 1);
484 nuc_stk
[ucs_level
].clear();
488 bool gcm::bu_is_ready(node
* n
) {
489 nuc_map
&cm
= nuc_stk
[ucs_level
];
490 nuc_map::iterator F
= cm
.find(n
);
491 unsigned uc
= (F
== cm
.end() ? 0 : F
->second
);
492 return uc
== uses
[n
];
495 void gcm::bu_schedule(container_node
* c
, node
* n
) {
497 sblog
<< "bu scheduling : ";
502 assert(op_map
[n
].bottom_bb
== bu_bb
);
504 bu_release_defs(n
->src
, true);
505 bu_release_defs(n
->dst
, false);
510 void gcm::dump_uc_stack() {
511 sblog
<< "##### uc_stk start ####\n";
512 for (unsigned l
= 0; l
<= ucs_level
; ++l
) {
513 nuc_map
&m
= nuc_stk
[l
];
515 sblog
<< "nuc_stk[" << l
<< "] : @" << &m
<< "\n";
517 for (nuc_map::iterator I
= m
.begin(), E
= m
.end(); I
!= E
; ++I
) {
518 sblog
<< " uc " << I
->second
<< " for ";
519 dump::dump_op(I
->first
);
523 sblog
<< "##### uc_stk end ####\n";
526 void gcm::pop_uc_stack() {
527 nuc_map
&pm
= nuc_stk
[ucs_level
];
529 nuc_map
&cm
= nuc_stk
[ucs_level
];
532 sblog
<< "merging use stack from level " << (ucs_level
+1)
533 << " to " << ucs_level
<< "\n";
536 for (nuc_map::iterator N
, I
= pm
.begin(), E
= pm
.end(); I
!= E
; ++I
) {
540 sblog
<< " " << cm
[n
] << " += " << I
->second
<< " for ";
545 unsigned uc
= cm
[n
] += I
->second
;
547 if (n
->parent
== &pending
&& uc
== uses
[n
]) {
549 pending_nodes
.push_back(n
);
551 sblog
<< "pushed pending_node due to stack pop ";
559 void gcm::bu_find_best_bb(node
*n
, op_info
&oi
) {
562 sblog
<< " find best bb : ";
570 // don't hoist generated copies
571 if (n
->flags
& NF_DONT_HOIST
) {
572 oi
.bottom_bb
= bu_bb
;
576 bb_node
* best_bb
= bu_bb
;
577 bb_node
* top_bb
= oi
.top_bb
;
578 assert(oi
.top_bb
&& !oi
.bottom_bb
);
582 // FIXME top_bb may be located inside the loop so we'll never enter it
583 // in the loop below, and the instruction will be incorrectly placed at the
584 // beginning of the shader.
585 // For now just check if top_bb's loop_level is higher than of
586 // current bb and abort the search for better bb in such case,
587 // but this problem may require more complete (and more expensive) fix
588 if (top_bb
->loop_level
<= best_bb
->loop_level
) {
589 while (c
&& c
!= top_bb
) {
600 if (c
->subtype
== NST_BB
) {
601 bb_node
*bb
= static_cast<bb_node
*>(c
);
602 if (bb
->loop_level
< best_bb
->loop_level
)
608 oi
.bottom_bb
= best_bb
;
611 void gcm::add_ready(node
*n
) {
612 sched_queue_id sq
= sh
.get_queue_id(n
);
613 if (n
->flags
& NF_SCHEDULE_EARLY
)
614 bu_ready_early
[sq
].push_back(n
);
615 else if (sq
== SQ_ALU
&& n
->is_copy_mov())
616 bu_ready
[sq
].push_front(n
);
617 else if (n
->is_alu_inst()) {
618 alu_node
*a
= static_cast<alu_node
*>(n
);
619 if (a
->bc
.op_ptr
->flags
& AF_PRED
&& a
->dst
[2]) {
620 // PRED_SET instruction that updates exec mask
621 pending_exec_mask_update
= true;
623 bu_ready_next
[sq
].push_back(n
);
625 bu_ready_next
[sq
].push_back(n
);
628 void gcm::bu_release_op(node
* n
) {
629 op_info
&oi
= op_map
[n
];
632 sblog
<< " bu release op ";
636 nuc_stk
[ucs_level
].erase(n
);
637 pending
.remove_node(n
);
639 bu_find_best_bb(n
, oi
);
641 if (oi
.bottom_bb
== bu_bb
) {
642 GCM_DUMP( sblog
<< " ready\n";);
645 GCM_DUMP( sblog
<< " ready_above\n";);
646 ready_above
.push_back(n
);
650 void gcm::bu_release_phi_defs(container_node
* p
, unsigned op
)
652 for (node_riterator I
= p
->rbegin(), E
= p
->rend(); I
!= E
; ++I
) {
654 value
*v
= o
->src
[op
];
655 if (v
&& !v
->is_readonly())
656 pending_defs
.push_back(o
->src
[op
]);
661 unsigned gcm::get_uc_vec(vvec
&vv
) {
663 for (vvec::iterator I
= vv
.begin(), E
= vv
.end(); I
!= E
; ++I
) {
669 c
+= get_uc_vec(v
->mdef
);
676 void gcm::init_use_count(nuc_map
& m
, container_node
&s
) {
678 for (node_iterator I
= s
.begin(), E
= s
.end(); I
!= E
; ++I
) {
680 unsigned uc
= get_uc_vec(n
->dst
);
682 sblog
<< "uc " << uc
<< " ";
687 pending_nodes
.push_back(n
);
689 sblog
<< "pushed pending_node in init ";
699 void gcm::bu_release_val(value
* v
) {
700 node
*n
= v
->any_def();
702 if (n
&& n
->parent
== &pending
) {
703 nuc_map
&m
= nuc_stk
[ucs_level
];
704 unsigned uc
= ++m
[n
];
705 unsigned uc2
= uses
[n
];
707 if (live
.add_val(v
)) {
709 GCM_DUMP ( sblog
<< "live_count: " << live_count
<< "\n"; );
713 sblog
<< "release val ";
715 sblog
<< " for node ";
717 sblog
<< " new uc=" << uc
<< ", total " << uc2
<< "\n";
726 void gcm::init_def_count(nuc_map
& m
, container_node
& s
) {
728 for (node_iterator I
= s
.begin(), E
= s
.end(); I
!= E
; ++I
) {
730 unsigned dc
= get_dc_vec(n
->src
, true) + get_dc_vec(n
->dst
, false);
734 sblog
<< "dc " << dc
<< " ";
741 unsigned gcm::get_dc_vec(vvec
& vv
, bool src
) {
743 for (vvec::iterator I
= vv
.begin(), E
= vv
.end(); I
!= E
; ++I
) {
745 if (!v
|| v
->is_readonly())
749 c
+= v
->rel
->def
!= NULL
;
750 c
+= get_dc_vec(v
->muse
, true);
754 c
+= v
->adef
!= NULL
;
760 unsigned gcm::real_alu_count(sched_queue
& q
, unsigned max
) {
761 sq_iterator
I(q
.begin()), E(q
.end());
764 while (I
!= E
&& c
< max
) {
766 if (n
->is_alu_inst()) {
767 if (!n
->is_copy_mov() || !n
->src
[0]->is_any_gpr())
769 } else if (n
->is_alu_packed()) {
770 c
+= static_cast<container_node
*>(n
)->count();
778 bool gcm::check_alu_ready_count(unsigned threshold
) {
779 unsigned r
= real_alu_count(bu_ready
[SQ_ALU
], threshold
);
782 r
+= real_alu_count(bu_ready_next
[SQ_ALU
], threshold
- r
);
783 return r
>= threshold
;
786 } // namespace r600_sb