2 * Copyright 2013 Vadim Girlin <vadimgirlin@gmail.com>
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
// Debug dump helper: executes the statement(s) passed in `a`.
// The do { } while (0) wrapper — WITHOUT a trailing semicolon — makes the
// expansion a single statement, so call sites can write `GCM_DUMP(...);`
// safely inside a braceless if/else. The previous definition ended in `;`,
// which produced an empty extra statement and made
// `if (c) GCM_DUMP(x); else ...` a syntax error.
#define GCM_DUMP(a) do { a } while (0)
39 #include "sb_shader.h"
49 GCM_DUMP( cerr
<< "==== GCM ==== \n"; sh
.dump_ir(); );
51 collect_instructions(sh
.root
, true);
53 init_def_count(uses
, pending
);
55 for (node_iterator N
, I
= pending
.begin(), E
= pending
.end();
74 pending
.remove_node(o
);
82 if (!pending
.empty()) {
83 cerr
<< "##### gcm_sched_early_pass: unscheduled ops:\n";
84 dump::dump_op(pending
.front());
87 assert(pending
.empty());
89 GCM_DUMP( sh
.dump_ir(); );
91 GCM_DUMP( cerr
<< "\n\n ############## gcm late\n\n"; );
93 collect_instructions(sh
.root
, false);
95 init_use_count(uses
, pending
);
98 if (!pending
.empty()) {
99 cerr
<< "##### gcm_sched_late_pass: unscheduled ops:\n";
100 dump::dump_op(pending
.front());
103 assert(ucs_level
== 0);
104 assert(pending
.empty());
110 void gcm::collect_instructions(container_node
*c
, bool early_pass
) {
114 for (node_iterator I
= c
->begin(), E
= c
->end(); I
!= E
; ++I
) {
116 if (n
->flags
& NF_DONT_MOVE
) {
117 op_info
&o
= op_map
[n
];
118 o
.top_bb
= o
.bottom_bb
= static_cast<bb_node
*>(c
);
123 pending
.append_from(c
);
127 for (node_iterator I
= c
->begin(), E
= c
->end(); I
!= E
; ++I
) {
128 if (I
->is_container()) {
129 collect_instructions(static_cast<container_node
*>(*I
), early_pass
);
134 void gcm::sched_early(container_node
*n
) {
137 (n
->type
== NT_REGION
) ? static_cast<region_node
*>(n
) : NULL
;
139 if (r
&& r
->loop_phi
) {
140 sched_early(r
->loop_phi
);
143 for (node_iterator I
= n
->begin(), E
= n
->end(); I
!= E
; ++I
) {
144 if (I
->type
== NT_OP
) {
146 if (op
->subtype
== NST_PHI
) {
147 td_release_uses(op
->dst
);
149 } else if (I
->is_container()) {
150 if (I
->subtype
== NST_BB
) {
151 bb_node
* bb
= static_cast<bb_node
*>(*I
);
154 sched_early(static_cast<container_node
*>(*I
));
164 void gcm::td_schedule(bb_node
*bb
, node
*n
) {
166 cerr
<< "scheduling : ";
170 td_release_uses(n
->dst
);
174 op_map
[n
].top_bb
= bb
;
178 void gcm::td_sched_bb(bb_node
* bb
) {
180 cerr
<< "td scheduling BB_" << bb
->id
<< "\n";
183 while (!ready
.empty()) {
184 for (sq_iterator N
, I
= ready
.begin(), E
= ready
.end(); I
!= E
;
193 bool gcm::td_is_ready(node
* n
) {
197 void gcm::td_release_val(value
*v
) {
200 cerr
<< "td checking uses: ";
205 use_info
*u
= v
->uses
;
207 if (u
->op
->parent
!= &pending
) {
213 cerr
<< "td used in ";
214 dump::dump_op(u
->op
);
218 if (--uses
[u
->op
] == 0) {
220 cerr
<< "td released : ";
221 dump::dump_op(u
->op
);
225 pending
.remove_node(u
->op
);
226 ready
.push_back(u
->op
);
233 void gcm::td_release_uses(vvec
& v
) {
234 for (vvec::iterator I
= v
.begin(), E
= v
.end(); I
!= E
; ++I
) {
240 td_release_uses(v
->mdef
);
246 void gcm::sched_late(container_node
*n
) {
248 bool stack_pushed
= false;
250 if (n
->is_depart()) {
251 depart_node
*d
= static_cast<depart_node
*>(n
);
254 bu_release_phi_defs(d
->target
->phi
, d
->dep_id
);
255 } else if (n
->is_repeat()) {
256 repeat_node
*r
= static_cast<repeat_node
*>(n
);
257 assert(r
->target
->loop_phi
);
260 bu_release_phi_defs(r
->target
->loop_phi
, r
->rep_id
);
263 for (node_riterator I
= n
->rbegin(), E
= n
->rend(); I
!= E
; ++I
) {
264 if (I
->is_container()) {
265 if (I
->subtype
== NST_BB
) {
266 bb_node
* bb
= static_cast<bb_node
*>(*I
);
269 sched_late(static_cast<container_node
*>(*I
));
274 if (n
->type
== NT_IF
) {
275 if_node
*f
= static_cast<if_node
*>(n
);
277 pending_defs
.push_back(f
->cond
);
278 } else if (n
->type
== NT_REGION
) {
279 region_node
*r
= static_cast<region_node
*>(n
);
281 bu_release_phi_defs(r
->loop_phi
, 0);
289 void gcm::bu_sched_bb(bb_node
* bb
) {
291 cerr
<< "bu scheduling BB_" << bb
->id
<< "\n";
296 if (!pending_nodes
.empty()) {
298 cerr
<< "pending nodes:\n";
301 // TODO consider sorting the exports by array_base,
302 // possibly it can improve performance
304 for (node_list::iterator I
= pending_nodes
.begin(),
305 E
= pending_nodes
.end(); I
!= E
; ++I
) {
308 pending_nodes
.clear();
310 cerr
<< "pending nodes processed...\n";
315 if (!pending_defs
.empty()) {
316 for (vvec::iterator I
= pending_defs
.begin(), E
= pending_defs
.end();
320 pending_defs
.clear();
323 for (sched_queue::iterator N
, I
= ready_above
.begin(), E
= ready_above
.end();
328 if (op_map
[n
].bottom_bb
== bb
) {
330 ready_above
.erase(I
);
334 unsigned cnt_ready
[SQ_NUM
];
336 container_node
*clause
= NULL
;
337 unsigned last_inst_type
= ~0;
338 unsigned last_count
= 0;
346 unsigned ready_mask
= 0;
348 for (unsigned sq
= SQ_CF
; sq
< SQ_NUM
; ++sq
) {
349 if (!bu_ready
[sq
].empty() || !bu_ready_next
[sq
].empty())
350 ready_mask
|= (1 << sq
);
354 for (unsigned sq
= SQ_CF
; sq
< SQ_NUM
; ++sq
) {
355 if (!bu_ready_early
[sq
].empty()) {
356 node
*n
= bu_ready_early
[sq
].front();
357 bu_ready_early
[sq
].pop_front();
358 bu_ready
[sq
].push_back(n
);
364 for (unsigned sq
= SQ_CF
; sq
< SQ_NUM
; ++sq
) {
366 if (!bu_ready_next
[sq
].empty())
367 bu_ready
[sq
].splice(bu_ready
[sq
].end(), bu_ready_next
[sq
]);
369 cnt_ready
[sq
] = bu_ready
[sq
].size();
371 if ((sq
== SQ_TEX
|| sq
== SQ_VTX
) &&
372 cnt_ready
[sq
] < ctx
.max_fetch
/2 &&
373 !bu_ready_next
[SQ_ALU
].empty()) {
379 while (!bu_ready
[sq
].empty()) {
381 if (last_inst_type
!= sq
) {
387 n
= bu_ready
[sq
].front();
389 // real count (e.g. SAMPLE_G will be expanded to 3 instructions,
390 // 2 SET_GRAD_ + 1 SAMPLE_G
392 if (n
->is_fetch_inst() && n
->src
.size() == 12) {
396 if ((sq
== SQ_TEX
|| sq
== SQ_VTX
) &&
397 ((last_count
>= ctx
.max_fetch
/2 &&
398 check_alu_ready_count(24)) ||
399 last_count
+ ncnt
> ctx
.max_fetch
))
401 else if (sq
== SQ_CF
&& last_count
> 4 &&
402 check_alu_ready_count(24))
405 bu_ready
[sq
].pop_front();
409 clause
= sh
.create_clause(sq
== SQ_ALU
?
411 sq
== SQ_TEX
? NST_TEX_CLAUSE
:
413 bb
->push_front(clause
);
419 bu_schedule(clause
, n
);
429 cerr
<< "bu finished scheduling BB_" << bb
->id
<< "\n";
433 void gcm::bu_release_defs(vvec
& v
, bool src
) {
434 for (vvec::reverse_iterator I
= v
.rbegin(), E
= v
.rend(); I
!= E
; ++I
) {
436 if (!v
|| v
->is_readonly())
440 if (!v
->rel
->is_readonly())
441 bu_release_val(v
->rel
);
442 bu_release_defs(v
->muse
, true);
448 void gcm::push_uc_stack() {
450 cerr
<< "pushing use count stack prev_level " << ucs_level
451 << " new level " << (ucs_level
+ 1) << "\n";
454 if (ucs_level
== nuc_stk
.size()) {
455 nuc_stk
.resize(ucs_level
+ 1);
458 nuc_stk
[ucs_level
].clear();
462 bool gcm::bu_is_ready(node
* n
) {
463 nuc_map
&cm
= nuc_stk
[ucs_level
];
464 nuc_map::iterator F
= cm
.find(n
);
465 unsigned uc
= (F
== cm
.end() ? 0 : F
->second
);
466 return uc
== uses
[n
];
469 void gcm::bu_schedule(container_node
* c
, node
* n
) {
471 cerr
<< "bu scheduling : ";
476 assert(op_map
[n
].bottom_bb
== bu_bb
);
478 bu_release_defs(n
->src
, true);
479 bu_release_defs(n
->dst
, false);
484 void gcm::dump_uc_stack() {
485 cerr
<< "##### uc_stk start ####\n";
486 for (unsigned l
= 0; l
<= ucs_level
; ++l
) {
487 nuc_map
&m
= nuc_stk
[l
];
489 cerr
<< "nuc_stk[" << l
<< "] : @" << &m
<< "\n";
491 for (nuc_map::iterator I
= m
.begin(), E
= m
.end(); I
!= E
; ++I
) {
492 cerr
<< " uc " << I
->second
<< " for ";
493 dump::dump_op(I
->first
);
497 cerr
<< "##### uc_stk end ####\n";
500 void gcm::pop_uc_stack() {
501 nuc_map
&pm
= nuc_stk
[ucs_level
];
503 nuc_map
&cm
= nuc_stk
[ucs_level
];
506 cerr
<< "merging use stack from level " << (ucs_level
+1)
507 << " to " << ucs_level
<< "\n";
510 for (nuc_map::iterator N
, I
= pm
.begin(), E
= pm
.end(); I
!= E
; ++I
) {
514 cerr
<< " " << cm
[n
] << " += " << I
->second
<< " for ";
519 unsigned uc
= cm
[n
] += I
->second
;
521 if (n
->parent
== &pending
&& uc
== uses
[n
]) {
523 pending_nodes
.push_back(n
);
525 cerr
<< "pushed pending_node due to stack pop ";
533 void gcm::bu_find_best_bb(node
*n
, op_info
&oi
) {
536 cerr
<< " find best bb : ";
544 // don't hoist generated copies
545 if (n
->flags
& NF_DONT_HOIST
) {
546 oi
.bottom_bb
= bu_bb
;
550 bb_node
* best_bb
= bu_bb
;
551 bb_node
* top_bb
= oi
.top_bb
;
552 assert(oi
.top_bb
&& !oi
.bottom_bb
);
556 // FIXME top_bb may be located inside the loop so we'll never enter it
557 // in the loop below, and the instruction will be incorrectly placed at the
558 // beginning of the shader.
559 // For now just check if top_bb's loop_level is higher than of
560 // current bb and abort the search for better bb in such case,
561 // but this problem may require more complete (and more expensive) fix
562 if (top_bb
->loop_level
<= best_bb
->loop_level
) {
563 while (c
&& c
!= top_bb
) {
574 if (c
->subtype
== NST_BB
) {
575 bb_node
*bb
= static_cast<bb_node
*>(c
);
576 if (bb
->loop_level
< best_bb
->loop_level
)
582 oi
.bottom_bb
= best_bb
;
585 void gcm::add_ready(node
*n
) {
586 sched_queue_id sq
= sh
.get_queue_id(n
);
587 if (n
->flags
& NF_SCHEDULE_EARLY
)
588 bu_ready_early
[sq
].push_back(n
);
590 bu_ready_next
[sq
].push_back(n
);
593 void gcm::bu_release_op(node
* n
) {
594 op_info
&oi
= op_map
[n
];
597 cerr
<< " bu release op ";
601 nuc_stk
[ucs_level
].erase(n
);
602 pending
.remove_node(n
);
604 bu_find_best_bb(n
, oi
);
606 if (oi
.bottom_bb
== bu_bb
) {
607 GCM_DUMP( cerr
<< " ready\n";);
610 GCM_DUMP( cerr
<< " ready_above\n";);
611 ready_above
.push_back(n
);
615 void gcm::bu_release_phi_defs(container_node
* p
, unsigned op
)
617 for (node_riterator I
= p
->rbegin(), E
= p
->rend(); I
!= E
; ++I
) {
619 value
*v
= o
->src
[op
];
620 if (v
&& !v
->is_readonly())
621 pending_defs
.push_back(o
->src
[op
]);
626 unsigned gcm::get_uc_vec(vvec
&vv
) {
628 for (vvec::iterator I
= vv
.begin(), E
= vv
.end(); I
!= E
; ++I
) {
634 c
+= get_uc_vec(v
->mdef
);
641 void gcm::init_use_count(nuc_map
& m
, container_node
&s
) {
643 for (node_iterator I
= s
.begin(), E
= s
.end(); I
!= E
; ++I
) {
645 unsigned uc
= get_uc_vec(n
->dst
);
647 cerr
<< "uc " << uc
<< " ";
652 pending_nodes
.push_back(n
);
654 cerr
<< "pushed pending_node in init ";
664 void gcm::bu_release_val(value
* v
) {
665 node
*n
= v
->any_def();
667 if (n
&& n
->parent
== &pending
) {
668 unsigned uc
= ++nuc_stk
[ucs_level
][n
];
669 unsigned uc2
= uses
[n
];
672 cerr
<< "release val ";
674 cerr
<< " for node ";
676 cerr
<< " new uc=" << uc
<< ", total " << uc2
<< "\n";
685 void gcm::init_def_count(nuc_map
& m
, container_node
& s
) {
687 for (node_iterator I
= s
.begin(), E
= s
.end(); I
!= E
; ++I
) {
689 unsigned dc
= get_dc_vec(n
->src
, true) + get_dc_vec(n
->dst
, false);
693 cerr
<< "dc " << dc
<< " ";
700 unsigned gcm::get_dc_vec(vvec
& vv
, bool src
) {
702 for (vvec::iterator I
= vv
.begin(), E
= vv
.end(); I
!= E
; ++I
) {
704 if (!v
|| v
->is_readonly())
708 c
+= v
->rel
->def
!= NULL
;
709 c
+= get_dc_vec(v
->muse
, true);
713 c
+= v
->adef
!= NULL
;
719 unsigned gcm::real_alu_count(sched_queue
& q
, unsigned max
) {
720 sq_iterator
I(q
.begin()), E(q
.end());
723 while (I
!= E
&& c
< max
) {
725 if (n
->is_alu_inst()) {
726 if (!n
->is_copy_mov() || !n
->src
[0]->is_any_gpr())
728 } else if (n
->is_alu_packed()) {
729 c
+= static_cast<container_node
*>(n
)->count();
737 bool gcm::check_alu_ready_count(unsigned threshold
) {
738 unsigned r
= real_alu_count(bu_ready
[SQ_ALU
], threshold
);
741 r
+= real_alu_count(bu_ready_next
[SQ_ALU
], threshold
- r
);
742 return r
>= threshold
;
745 } // namespace r600_sb