 * Copyright 2013 Vadim Girlin <vadimgirlin@gmail.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
// Debug-dump helper: wraps its statement list in do { } while(0) so that a
// GCM_DUMP(...) invocation behaves as exactly one statement.
// Fixes: dropped the extraction artifact (fused original line number) that
// made the #define invalid, and removed the trailing ';' after while(0) —
// with it, "if (x) GCM_DUMP(...); else ..." fails to parse because the
// extra ';' terminates the if-statement before the else.
#define GCM_DUMP(a) do { a } while(0)
38 #include "sb_shader.h"
40 #include "eg_sq.h" // V_SQ_CF_INDEX_NONE
// [fragment] gcm driver (the enclosing function header is elided in this
// extraction; presumably gcm::run — TODO confirm against full source).
// Visible flow: early pass (collect_instructions(..., true), init_def_count,
// iterate/drain `pending`), assert pending empty, then late pass
// (collect_instructions(..., false), init_use_count), assert ucs_level == 0
// and pending empty again.
// NOTE(review): garbled extraction — statements wrap across lines, interior
// source lines are elided, and the leading integers are original line
// numbers fused into the text. Code tokens left byte-identical.
46 GCM_DUMP( sblog
<< "==== GCM ==== \n"; sh
.dump_ir(); );
// Early (top-down) pass: gather movable ops into `pending`.
48 collect_instructions(sh
.root
, true);
50 init_def_count(uses
, pending
);
52 for (node_iterator N
, I
= pending
.begin(), E
= pending
.end();
59 sblog
<< "pending : ";
71 pending
.remove_node(o
);
// Any op still pending after the early pass is a scheduling failure.
79 if (!pending
.empty()) {
80 sblog
<< "##### gcm_sched_early_pass: unscheduled ops:\n";
81 dump::dump_op(pending
.front());
84 assert(pending
.empty());
86 GCM_DUMP( sh
.dump_ir(); );
88 GCM_DUMP( sblog
<< "\n\n ############## gcm late\n\n"; );
// Late (bottom-up) pass over the same IR, this time counting uses.
90 collect_instructions(sh
.root
, false);
92 init_use_count(uses
, pending
);
95 if (!pending
.empty()) {
96 sblog
<< "##### gcm_sched_late_pass: unscheduled ops:\n";
97 dump::dump_op(pending
.front());
100 assert(ucs_level
== 0);
101 assert(pending
.empty());
// [fragment] gcm::collect_instructions — walks the children of container c:
// ops flagged NF_DONT_MOVE are pinned (top_bb = bottom_bb = c in op_map),
// the container's nodes are appended to `pending`, and the walk recurses
// into nested containers with the same early_pass flag.
// NOTE(review): garbled extraction — wrapped statements, elided interior
// lines, leading integers are fused original line numbers. Tokens untouched.
107 void gcm::collect_instructions(container_node
*c
, bool early_pass
) {
111 for (node_iterator I
= c
->begin(), E
= c
->end(); I
!= E
; ++I
) {
// Pinned ops are never hoisted/sunk: both placement bounds are c itself.
113 if (n
->flags
& NF_DONT_MOVE
) {
114 op_info
&o
= op_map
[n
];
115 o
.top_bb
= o
.bottom_bb
= static_cast<bb_node
*>(c
);
120 pending
.append_from(c
);
124 for (node_iterator I
= c
->begin(), E
= c
->end(); I
!= E
; ++I
) {
125 if (I
->is_container()) {
126 collect_instructions(static_cast<container_node
*>(*I
), early_pass
);
// [fragment] gcm::sched_early — top-down scheduling walk. For a region node
// with a loop_phi container the phis are scheduled first; then each child is
// visited: NT_OP phis get td_release_uses(op->dst), containers are recursed
// into (NST_BB is special-cased — the elided code presumably does td
// scheduling of the bb; TODO confirm).
// NOTE(review): garbled extraction — wrapped statements, elided interior
// lines, leading integers are fused original line numbers. Tokens untouched.
131 void gcm::sched_early(container_node
*n
) {
134 (n
->type
== NT_REGION
) ? static_cast<region_node
*>(n
) : NULL
;
136 if (r
&& r
->loop_phi
) {
137 sched_early(r
->loop_phi
);
140 for (node_iterator I
= n
->begin(), E
= n
->end(); I
!= E
; ++I
) {
141 if (I
->type
== NT_OP
) {
143 if (op
->subtype
== NST_PHI
) {
144 td_release_uses(op
->dst
);
146 } else if (I
->is_container()) {
147 if (I
->subtype
== NST_BB
) {
148 bb_node
* bb
= static_cast<bb_node
*>(*I
);
151 sched_early(static_cast<container_node
*>(*I
));
// [fragment] gcm::td_schedule — places node n into bb during the top-down
// pass: releases the uses of n's destinations and records bb as the node's
// top placement bound (op_map[n].top_bb = bb).
// NOTE(review): garbled extraction — wrapped statements, elided interior
// lines, leading integers are fused original line numbers. Tokens untouched.
161 void gcm::td_schedule(bb_node
*bb
, node
*n
) {
163 sblog
<< "scheduling : ";
167 td_release_uses(n
->dst
);
171 op_map
[n
].top_bb
= bb
;
// [fragment] gcm::td_sched_bb — drains the `ready` queue for one basic
// block during the top-down pass (the loop body is elided in this
// extraction).
// NOTE(review): garbled extraction — wrapped statements, elided interior
// lines, leading integers are fused original line numbers. Tokens untouched.
175 void gcm::td_sched_bb(bb_node
* bb
) {
177 sblog
<< "td scheduling BB_" << bb
->id
<< "\n";
180 while (!ready
.empty()) {
181 for (sq_iterator N
, I
= ready
.begin(), E
= ready
.end(); I
!= E
;
// [fragment] gcm::td_is_ready — readiness predicate for the top-down pass;
// the body is elided in this extraction (presumably checks that all of n's
// inputs are already scheduled — TODO confirm against full source).
190 bool gcm::td_is_ready(node
* n
) {
// [fragment] gcm::td_release_val — for each use of value v, ops still in
// `pending` have their remaining-dependency count uses[op] decremented;
// when it reaches zero the op is removed from pending (and presumably moved
// to a ready queue in an elided line — TODO confirm).
// NOTE(review): garbled extraction — wrapped statements, elided interior
// lines, leading integers are fused original line numbers. Tokens untouched.
194 void gcm::td_release_val(value
*v
) {
197 sblog
<< "td checking uses: ";
202 for (uselist::iterator I
= v
->uses
.begin(), E
= v
->uses
.end(); I
!= E
; ++I
) {
// Ops no longer in `pending` have already been handled — skip them.
204 if (op
->parent
!= &pending
) {
209 sblog
<< "td used in ";
214 assert(uses
[op
] > 0);
215 if (--uses
[op
] == 0) {
217 sblog
<< "td released : ";
222 pending
.remove_node(op
);
// [fragment] gcm::td_release_uses — iterates a value vector and (in elided
// code) releases each value; visible tail recurses into v->mdef.
// NOTE(review): garbled extraction — wrapped statements, elided interior
// lines, leading integers are fused original line numbers. Tokens untouched.
229 void gcm::td_release_uses(vvec
& v
) {
230 for (vvec::iterator I
= v
.begin(), E
= v
.end(); I
!= E
; ++I
) {
236 td_release_uses(v
->mdef
);
// [fragment] gcm::sched_late — bottom-up scheduling walk. Depart/repeat
// nodes release the matching column (dep_id/rep_id) of their target's
// phi/loop_phi defs; children are visited in reverse; NT_IF pushes the
// condition onto pending_defs; NT_REGION releases column 0 of loop_phi.
// `stack_pushed` presumably pairs with elided push/pop of the use-count
// stack — TODO confirm against full source.
// NOTE(review): garbled extraction — wrapped statements, elided interior
// lines, leading integers are fused original line numbers. Tokens untouched.
242 void gcm::sched_late(container_node
*n
) {
244 bool stack_pushed
= false;
246 if (n
->is_depart()) {
247 depart_node
*d
= static_cast<depart_node
*>(n
);
250 bu_release_phi_defs(d
->target
->phi
, d
->dep_id
);
251 } else if (n
->is_repeat()) {
252 repeat_node
*r
= static_cast<repeat_node
*>(n
);
253 assert(r
->target
->loop_phi
);
256 bu_release_phi_defs(r
->target
->loop_phi
, r
->rep_id
);
// Bottom-up: visit children in reverse order.
259 for (node_riterator I
= n
->rbegin(), E
= n
->rend(); I
!= E
; ++I
) {
260 if (I
->is_container()) {
261 if (I
->subtype
== NST_BB
) {
262 bb_node
* bb
= static_cast<bb_node
*>(*I
);
265 sched_late(static_cast<container_node
*>(*I
));
270 if (n
->type
== NT_IF
) {
271 if_node
*f
= static_cast<if_node
*>(n
);
273 pending_defs
.push_back(f
->cond
);
274 } else if (n
->type
== NT_REGION
) {
275 region_node
*r
= static_cast<region_node
*>(n
);
277 bu_release_phi_defs(r
->loop_phi
, 0);
// [fragment] gcm::bu_sched_bb — bottom-up scheduling of one basic block.
// Visible structure: flush pending_nodes and pending_defs; pull entries from
// ready_above whose bottom_bb is this bb; then round-robin over the
// scheduling queues (SQ_CF .. SQ_NUM): promote bu_ready_early entries,
// splice bu_ready_next into bu_ready, and emit instructions into clauses
// (ALU/TEX/GDS/VTX) created via sh.create_clause and pushed to the front of
// the bb. Heuristics visible: cap fetch-clause size by ctx.max_fetch, prefer
// fetch when live_count exceeds rp_threshold, give sampler-indexed fetches
// their own clause, and break clauses around LDS-queue consumers.
// NOTE(review): garbled extraction — wrapped statements, elided interior
// lines, leading integers are fused original line numbers. Tokens untouched.
285 void gcm::bu_sched_bb(bb_node
* bb
) {
287 sblog
<< "bu scheduling BB_" << bb
->id
<< "\n";
292 if (!pending_nodes
.empty()) {
294 sblog
<< "pending nodes:\n";
297 // TODO consider sorting the exports by array_base,
298 // possibly it can improve performance
300 for (node_list::iterator I
= pending_nodes
.begin(),
301 E
= pending_nodes
.end(); I
!= E
; ++I
) {
304 pending_nodes
.clear();
306 sblog
<< "pending nodes processed...\n";
311 if (!pending_defs
.empty()) {
312 for (vvec::iterator I
= pending_defs
.begin(), E
= pending_defs
.end();
316 pending_defs
.clear();
// Claim ops hoisted from inner blocks whose best placement is this bb.
319 for (sched_queue::iterator N
, I
= ready_above
.begin(), E
= ready_above
.end();
324 if (op_map
[n
].bottom_bb
== bb
) {
326 ready_above
.erase(I
);
330 unsigned cnt_ready
[SQ_NUM
];
332 container_node
*clause
= NULL
;
333 unsigned last_inst_type
= ~0;
334 unsigned last_count
= 0;
// Bitmask of queues that still have work (ready or ready_next).
342 unsigned ready_mask
= 0;
344 for (unsigned sq
= SQ_CF
; sq
< SQ_NUM
; ++sq
) {
345 if (!bu_ready
[sq
].empty() || !bu_ready_next
[sq
].empty())
346 ready_mask
|= (1 << sq
);
// Promote "schedule early" candidates into the main ready queues.
350 for (unsigned sq
= SQ_CF
; sq
< SQ_NUM
; ++sq
) {
351 if (!bu_ready_early
[sq
].empty()) {
352 node
*n
= bu_ready_early
[sq
].front();
353 bu_ready_early
[sq
].pop_front();
354 bu_ready
[sq
].push_back(n
);
360 for (unsigned sq
= SQ_CF
; sq
< SQ_NUM
; ++sq
) {
362 if (sq
== SQ_CF
&& pending_exec_mask_update
) {
363 pending_exec_mask_update
= false;
369 if (sq
!= SQ_ALU
&& outstanding_lds_oq
)
372 if (!bu_ready_next
[sq
].empty())
373 bu_ready
[sq
].splice(bu_ready
[sq
].end(), bu_ready_next
[sq
]);
375 cnt_ready
[sq
] = bu_ready
[sq
].size();
// Defer small fetch batches while register pressure is low and ALU work
// is queued — lets fetch clauses grow closer to ctx.max_fetch/2.
377 if ((sq
== SQ_TEX
|| sq
== SQ_VTX
) && live_count
<= rp_threshold
&&
378 cnt_ready
[sq
] < ctx
.max_fetch
/2 &&
379 !bu_ready_next
[SQ_ALU
].empty()) {
385 while (!bu_ready
[sq
].empty()) {
387 if (last_inst_type
!= sq
) {
393 // simple heuristic to limit register pressure,
394 if (sq
== SQ_ALU
&& live_count
> rp_threshold
&& !outstanding_lds_oq
&&
395 (!bu_ready
[SQ_TEX
].empty() ||
396 !bu_ready
[SQ_VTX
].empty() ||
397 !bu_ready_next
[SQ_TEX
].empty() ||
398 !bu_ready_next
[SQ_VTX
].empty())) {
399 GCM_DUMP( sblog
<< "switching to fetch (regpressure)\n"; );
403 n
= bu_ready
[sq
].front();
405 // real count (e.g. SAMPLE_G will be expanded to 3 instructions,
406 // 2 SET_GRAD_ + 1 SAMPLE_G
408 if (n
->is_fetch_inst() && n
->src
.size() == 12) {
412 bool sampler_indexing
= false;
413 if (n
->is_fetch_inst() &&
414 static_cast<fetch_node
*>(n
)->bc
.sampler_index_mode
!= V_SQ_CF_INDEX_NONE
)
416 sampler_indexing
= true; // Give sampler indexed ops get their own clause
417 ncnt
= sh
.get_ctx().is_cayman() ? 2 : 3; // MOVA + SET_CF_IDX0/1
// Close the current fetch clause when it would overflow max_fetch, or
// when enough ALU work (>= 24 real slots) is ready to interleave.
420 if ((sq
== SQ_TEX
|| sq
== SQ_VTX
) &&
421 ((last_count
>= ctx
.max_fetch
/2 &&
422 check_alu_ready_count(24)) ||
423 last_count
+ ncnt
> ctx
.max_fetch
))
425 else if (sq
== SQ_CF
&& last_count
> 4 &&
426 check_alu_ready_count(24))
430 if (sq
== SQ_ALU
&& n
->consumes_lds_oq() &&
431 (bu_ready
[SQ_TEX
].size() || bu_ready
[SQ_VTX
].size() || bu_ready
[SQ_GDS
].size())) {
432 GCM_DUMP( sblog
<< "switching scheduling due to lds op\n"; );
435 bu_ready
[sq
].pop_front();
438 if (!clause
|| sampler_indexing
) {
442 nst
= NST_ALU_CLAUSE
;
445 nst
= NST_TEX_CLAUSE
;
448 nst
= NST_GDS_CLAUSE
;
451 nst
= NST_VTX_CLAUSE
;
454 clause
= sh
.create_clause(nst
);
455 bb
->push_front(clause
);
461 bu_schedule(clause
, n
);
471 sblog
<< "bu finished scheduling BB_" << bb
->id
<< "\n";
// [fragment] gcm::bu_release_defs — reverse-iterates a value vector,
// skipping null/readonly values; releases a non-readonly relative-addressing
// value (v->rel), recurses into v->muse, and drops v from the `live` set
// (live_count bookkeeping is presumably in elided lines — TODO confirm).
// NOTE(review): garbled extraction — wrapped statements, elided interior
// lines, leading integers are fused original line numbers. Tokens untouched.
475 void gcm::bu_release_defs(vvec
& v
, bool src
) {
476 for (vvec::reverse_iterator I
= v
.rbegin(), E
= v
.rend(); I
!= E
; ++I
) {
478 if (!v
|| v
->is_readonly())
482 if (!v
->rel
->is_readonly())
483 bu_release_val(v
->rel
);
484 bu_release_defs(v
->muse
, true);
488 if (live
.remove_val(v
)) {
// [fragment] gcm::push_uc_stack — opens a new use-count stack level: grows
// nuc_stk if ucs_level is at its current size, otherwise clears the map at
// the (reused) level. The ucs_level increment itself is presumably in an
// elided line — TODO confirm against full source.
// NOTE(review): garbled extraction — wrapped statements, elided interior
// lines, leading integers are fused original line numbers. Tokens untouched.
495 void gcm::push_uc_stack() {
497 sblog
<< "pushing use count stack prev_level " << ucs_level
498 << " new level " << (ucs_level
+ 1) << "\n";
501 if (ucs_level
== nuc_stk
.size()) {
502 nuc_stk
.resize(ucs_level
+ 1);
505 nuc_stk
[ucs_level
].clear();
509 bool gcm::bu_is_ready(node
* n
) {
510 nuc_map
&cm
= nuc_stk
[ucs_level
];
511 nuc_map::iterator F
= cm
.find(n
);
512 unsigned uc
= (F
== cm
.end() ? 0 : F
->second
);
513 return uc
== uses
[n
];
// [fragment] gcm::bu_schedule — emits node n into clause c (the actual
// insertion is in elided lines): asserts n's chosen bottom_bb is the block
// currently being scheduled (bu_bb), updates the outstanding LDS output
// queue balance (producer closes one, consumer opens one), then releases
// the defs reachable through n's sources and destinations.
// NOTE(review): garbled extraction — wrapped statements, elided interior
// lines, leading integers are fused original line numbers. Tokens untouched.
516 void gcm::bu_schedule(container_node
* c
, node
* n
) {
518 sblog
<< "bu scheduling : ";
523 assert(op_map
[n
].bottom_bb
== bu_bb
);
525 if (n
->produces_lds_oq())
526 outstanding_lds_oq
--;
527 if (n
->consumes_lds_oq())
528 outstanding_lds_oq
++;
529 bu_release_defs(n
->src
, true);
530 bu_release_defs(n
->dst
, false);
// [fragment] gcm::dump_uc_stack — debug helper: prints every level of the
// use-count stack (nuc_stk[0..ucs_level]) with each node's current count.
// NOTE(review): garbled extraction — wrapped statements, elided interior
// lines, leading integers are fused original line numbers. Tokens untouched.
535 void gcm::dump_uc_stack() {
536 sblog
<< "##### uc_stk start ####\n";
537 for (unsigned l
= 0; l
<= ucs_level
; ++l
) {
538 nuc_map
&m
= nuc_stk
[l
];
540 sblog
<< "nuc_stk[" << l
<< "] : @" << &m
<< "\n";
542 for (nuc_map::iterator I
= m
.begin(), E
= m
.end(); I
!= E
; ++I
) {
543 sblog
<< " uc " << I
->second
<< " for ";
544 dump::dump_op(I
->first
);
548 sblog
<< "##### uc_stk end ####\n";
// [fragment] gcm::pop_uc_stack — merges the popped use-count level into the
// one below it (cm[n] += I->second); ops still in `pending` whose merged
// count reaches uses[n] become schedulable and are queued on pending_nodes.
// NOTE(review): `pm` and `cm` both index nuc_stk[ucs_level] in this
// fragment; an elided line between the two bindings presumably decrements
// ucs_level so they reference different levels — confirm against full
// source before relying on this reading.
// NOTE(review): garbled extraction — wrapped statements, elided interior
// lines, leading integers are fused original line numbers. Tokens untouched.
551 void gcm::pop_uc_stack() {
552 nuc_map
&pm
= nuc_stk
[ucs_level
];
554 nuc_map
&cm
= nuc_stk
[ucs_level
];
557 sblog
<< "merging use stack from level " << (ucs_level
+1)
558 << " to " << ucs_level
<< "\n";
561 for (nuc_map::iterator N
, I
= pm
.begin(), E
= pm
.end(); I
!= E
; ++I
) {
565 sblog
<< " " << cm
[n
] << " += " << I
->second
<< " for ";
570 unsigned uc
= cm
[n
] += I
->second
;
// All uses now accounted for — the pending op becomes schedulable.
572 if (n
->parent
== &pending
&& uc
== uses
[n
]) {
574 pending_nodes
.push_back(n
);
576 sblog
<< "pushed pending_node due to stack pop ";
// [fragment] gcm::bu_find_best_bb — chooses the bottom placement block for
// node n between its legal bounds [oi.top_bb, bu_bb]: generated copies
// (NF_DONT_HOIST) stay in bu_bb; otherwise the walk from bu_bb toward
// top_bb prefers enclosing blocks with a lower loop_level (hoisting out of
// loops). Result is stored in oi.bottom_bb.
// NOTE(review): garbled extraction — wrapped statements, elided interior
// lines, leading integers are fused original line numbers. Tokens untouched.
584 void gcm::bu_find_best_bb(node
*n
, op_info
&oi
) {
587 sblog
<< " find best bb : ";
595 // don't hoist generated copies
596 if (n
->flags
& NF_DONT_HOIST
) {
597 oi
.bottom_bb
= bu_bb
;
601 bb_node
* best_bb
= bu_bb
;
602 bb_node
* top_bb
= oi
.top_bb
;
603 assert(oi
.top_bb
&& !oi
.bottom_bb
);
607 // FIXME top_bb may be located inside the loop so we'll never enter it
608 // in the loop below, and the instruction will be incorrectly placed at the
609 // beginning of the shader.
610 // For now just check if top_bb's loop_level is higher than of
611 // current bb and abort the search for better bb in such case,
612 // but this problem may require more complete (and more expensive) fix
613 if (top_bb
->loop_level
<= best_bb
->loop_level
) {
614 while (c
&& c
!= top_bb
) {
625 if (c
->subtype
== NST_BB
) {
626 bb_node
*bb
= static_cast<bb_node
*>(c
);
627 if (bb
->loop_level
< best_bb
->loop_level
)
633 oi
.bottom_bb
= best_bb
;
// [fragment] gcm::add_ready — routes a newly-ready node to the right queue
// for its scheduling class (sh.get_queue_id): NF_SCHEDULE_EARLY nodes go to
// bu_ready_early; ALU copy-movs jump the queue (push_front of bu_ready);
// PRED_SET ALU ops writing dst[2] flag a pending exec-mask update; all
// others go to bu_ready_next.
// NOTE(review): garbled extraction — wrapped statements, elided interior
// lines, leading integers are fused original line numbers. Tokens untouched.
636 void gcm::add_ready(node
*n
) {
637 sched_queue_id sq
= sh
.get_queue_id(n
);
638 if (n
->flags
& NF_SCHEDULE_EARLY
)
639 bu_ready_early
[sq
].push_back(n
);
640 else if (sq
== SQ_ALU
&& n
->is_copy_mov())
641 bu_ready
[sq
].push_front(n
);
642 else if (n
->is_alu_inst()) {
643 alu_node
*a
= static_cast<alu_node
*>(n
);
644 if (a
->bc
.op_ptr
->flags
& AF_PRED
&& a
->dst
[2]) {
645 // PRED_SET instruction that updates exec mask
646 pending_exec_mask_update
= true;
648 bu_ready_next
[sq
].push_back(n
);
650 bu_ready_next
[sq
].push_back(n
);
// [fragment] gcm::bu_release_op — removes node n from the current use-count
// level and from `pending`, computes its best placement block, then queues
// it: directly ready if its bottom_bb is the block being scheduled (bu_bb),
// otherwise onto ready_above to be claimed by an outer block.
// NOTE(review): garbled extraction — wrapped statements, elided interior
// lines, leading integers are fused original line numbers. Tokens untouched.
653 void gcm::bu_release_op(node
* n
) {
654 op_info
&oi
= op_map
[n
];
657 sblog
<< " bu release op ";
661 nuc_stk
[ucs_level
].erase(n
);
662 pending
.remove_node(n
);
664 bu_find_best_bb(n
, oi
);
666 if (oi
.bottom_bb
== bu_bb
) {
667 GCM_DUMP( sblog
<< " ready\n";);
670 GCM_DUMP( sblog
<< " ready_above\n";);
671 ready_above
.push_back(n
);
// [fragment] gcm::bu_release_phi_defs — for each phi in container p
// (reverse order), pushes the phi's op-th source value onto pending_defs,
// skipping null/readonly values.
// NOTE(review): garbled extraction — wrapped statements, elided interior
// lines, leading integers are fused original line numbers. Tokens untouched.
675 void gcm::bu_release_phi_defs(container_node
* p
, unsigned op
)
677 for (node_riterator I
= p
->rbegin(), E
= p
->rend(); I
!= E
; ++I
) {
679 value
*v
= o
->src
[op
];
680 if (v
&& !v
->is_readonly())
681 pending_defs
.push_back(o
->src
[op
]);
// [fragment] gcm::get_uc_vec — accumulates a use count over a value vector;
// the visible tail also recurses into v->mdef. Per-value counting is in
// elided lines — TODO confirm details against full source.
// NOTE(review): garbled extraction — wrapped statements, elided interior
// lines, leading integers are fused original line numbers. Tokens untouched.
686 unsigned gcm::get_uc_vec(vvec
&vv
) {
688 for (vvec::iterator I
= vv
.begin(), E
= vv
.end(); I
!= E
; ++I
) {
694 c
+= get_uc_vec(v
->mdef
);
// [fragment] gcm::init_use_count — walks container s computing each node's
// use count from its destinations (get_uc_vec(n->dst)); nodes whose count
// qualifies (presumably zero uses — elided condition, TODO confirm) are
// pushed onto pending_nodes for immediate scheduling.
// NOTE(review): garbled extraction — wrapped statements, elided interior
// lines, leading integers are fused original line numbers. Tokens untouched.
701 void gcm::init_use_count(nuc_map
& m
, container_node
&s
) {
703 for (node_iterator I
= s
.begin(), E
= s
.end(); I
!= E
; ++I
) {
705 unsigned uc
= get_uc_vec(n
->dst
);
707 sblog
<< "uc " << uc
<< " ";
712 pending_nodes
.push_back(n
);
714 sblog
<< "pushed pending_node in init ";
// [fragment] gcm::bu_release_val — when value v's defining node is still in
// `pending`, bumps its released-use counter in the current use-count level
// (uc = ++m[n]) and reads the total uses[n]; the compare/release step is in
// elided lines. Also adds v to the `live` set, with live_count logged.
// NOTE(review): garbled extraction — wrapped statements, elided interior
// lines, leading integers are fused original line numbers. Tokens untouched.
724 void gcm::bu_release_val(value
* v
) {
725 node
*n
= v
->any_def();
727 if (n
&& n
->parent
== &pending
) {
728 nuc_map
&m
= nuc_stk
[ucs_level
];
729 unsigned uc
= ++m
[n
];
730 unsigned uc2
= uses
[n
];
732 if (live
.add_val(v
)) {
734 GCM_DUMP ( sblog
<< "live_count: " << live_count
<< "\n"; );
738 sblog
<< "release val ";
740 sblog
<< " for node ";
742 sblog
<< " new uc=" << uc
<< ", total " << uc2
<< "\n";
// [fragment] gcm::init_def_count — walks container s computing each node's
// unscheduled-input count from its sources and destinations
// (get_dc_vec(n->src, true) + get_dc_vec(n->dst, false)); storage into the
// map and ready-queue handling are in elided lines.
// NOTE(review): garbled extraction — wrapped statements, elided interior
// lines, leading integers are fused original line numbers. Tokens untouched.
751 void gcm::init_def_count(nuc_map
& m
, container_node
& s
) {
753 for (node_iterator I
= s
.begin(), E
= s
.end(); I
!= E
; ++I
) {
755 unsigned dc
= get_dc_vec(n
->src
, true) + get_dc_vec(n
->dst
, false);
759 sblog
<< "dc " << dc
<< " ";
// [fragment] gcm::get_dc_vec — counts defining nodes reachable through a
// value vector: skips null/readonly values, counts a relative-addressing
// def (v->rel->def), recurses into v->muse, and counts an array def
// (v->adef). Some branches are elided.
// NOTE(review): garbled extraction — wrapped statements, elided interior
// lines, leading integers are fused original line numbers. Tokens untouched.
766 unsigned gcm::get_dc_vec(vvec
& vv
, bool src
) {
768 for (vvec::iterator I
= vv
.begin(), E
= vv
.end(); I
!= E
; ++I
) {
770 if (!v
|| v
->is_readonly())
774 c
+= v
->rel
->def
!= NULL
;
775 c
+= get_dc_vec(v
->muse
, true);
779 c
+= v
->adef
!= NULL
;
// [fragment] gcm::real_alu_count — counts "real" ALU slots in queue q, up
// to max: plain ALU instructions count unless they are copy-movs of a GPR
// (elided branch presumably skips those — TODO confirm); packed ALU nodes
// count their contained instruction count.
// NOTE(review): garbled extraction — wrapped statements, elided interior
// lines, leading integers are fused original line numbers. Tokens untouched.
785 unsigned gcm::real_alu_count(sched_queue
& q
, unsigned max
) {
786 sq_iterator
I(q
.begin()), E(q
.end());
789 while (I
!= E
&& c
< max
) {
791 if (n
->is_alu_inst()) {
792 if (!n
->is_copy_mov() || !n
->src
[0]->is_any_gpr())
794 } else if (n
->is_alu_packed()) {
795 c
+= static_cast<container_node
*>(n
)->count();
// [fragment] gcm::check_alu_ready_count — returns whether at least
// `threshold` real ALU slots are available across bu_ready[SQ_ALU] and
// bu_ready_next[SQ_ALU] (counting is capped at threshold; an elided early
// return presumably short-circuits when the first queue alone suffices —
// TODO confirm).
// NOTE(review): garbled extraction — wrapped statements, elided interior
// lines, leading integers are fused original line numbers. Tokens untouched.
803 bool gcm::check_alu_ready_count(unsigned threshold
) {
804 unsigned r
= real_alu_count(bu_ready
[SQ_ALU
], threshold
);
807 r
+= real_alu_count(bu_ready_next
[SQ_ALU
], threshold
- r
);
808 return r
>= threshold
;
811 } // namespace r600_sb