2 * Copyright 2013 Vadim Girlin <vadimgirlin@gmail.com>
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
30 #define PSC_DUMP(a) do { a } while (0)
36 #include "sb_shader.h"
45 rp_kcache_tracker::rp_kcache_tracker(shader
&sh
) : rp(), uc(),
46 // FIXME: for now we'll use "two const pairs" limit for r600, same as
47 // for other chips, otherwise additional check in alu_group_tracker is
48 // required to make sure that all 4 consts in the group fit into 2
52 bool rp_kcache_tracker::try_reserve(sel_chan r
) {
53 unsigned sel
= kc_sel(r
);
55 for (unsigned i
= 0; i
< sel_count
; ++i
) {
69 bool rp_kcache_tracker::try_reserve(node
* n
) {
70 bool need_unreserve
= false;
71 vvec::iterator
I(n
->src
.begin()), E(n
->src
.end());
76 if (!try_reserve(v
->select
))
79 need_unreserve
= true;
85 if (need_unreserve
&& I
!= n
->src
.begin()) {
91 } while (I
!= n
->src
.begin());
97 void rp_kcache_tracker::unreserve(node
* n
) {
98 vvec::iterator
I(n
->src
.begin()), E(n
->src
.end());
102 unreserve(v
->select
);
106 void rp_kcache_tracker::unreserve(sel_chan r
) {
107 unsigned sel
= kc_sel(r
);
109 for (unsigned i
= 0; i
< sel_count
; ++i
)
119 bool literal_tracker::try_reserve(alu_node
* n
) {
120 bool need_unreserve
= false;
122 vvec::iterator
I(n
->src
.begin()), E(n
->src
.end());
124 for (; I
!= E
; ++I
) {
126 if (v
->is_literal()) {
127 if (!try_reserve(v
->literal_value
))
130 need_unreserve
= true;
136 if (need_unreserve
&& I
!= n
->src
.begin()) {
141 unreserve(v
->literal_value
);
142 } while (I
!= n
->src
.begin());
147 void literal_tracker::unreserve(alu_node
* n
) {
148 unsigned nsrc
= n
->bc
.op_ptr
->src_count
, i
;
150 for (i
= 0; i
< nsrc
; ++i
) {
151 value
*v
= n
->src
[i
];
153 unreserve(v
->literal_value
);
157 bool literal_tracker::try_reserve(literal l
) {
159 PSC_DUMP( cerr
<< "literal reserve " << l
.u
<< " " << l
.f
<< "\n"; );
161 for (unsigned i
= 0; i
< MAX_ALU_LITERALS
; ++i
) {
165 PSC_DUMP( cerr
<< " reserved new uc = " << uc
[i
] << "\n"; );
167 } else if (lt
[i
] == l
) {
169 PSC_DUMP( cerr
<< " reserved uc = " << uc
[i
] << "\n"; );
173 PSC_DUMP( cerr
<< " failed to reserve literal\n"; );
177 void literal_tracker::unreserve(literal l
) {
179 PSC_DUMP( cerr
<< "literal unreserve " << l
.u
<< " " << l
.f
<< "\n"; );
181 for (unsigned i
= 0; i
< MAX_ALU_LITERALS
; ++i
) {
192 static inline unsigned bs_cycle_vector(unsigned bs
, unsigned src
) {
193 static const unsigned swz
[VEC_NUM
][3] = {
194 {0, 1, 2}, {0, 2, 1}, {1, 2, 0}, {1, 0, 2}, {2, 0, 1}, {2, 1, 0}
196 assert(bs
< VEC_NUM
&& src
< 3);
200 static inline unsigned bs_cycle_scalar(unsigned bs
, unsigned src
) {
201 static const unsigned swz
[SCL_NUM
][3] = {
202 {2, 1, 0}, {1, 2, 2}, {2, 1, 2}, {2, 2, 1}
205 if (bs
>= SCL_NUM
|| src
>= 3) {
206 // this prevents gcc warning "array subscript is above array bounds"
207 // AFAICS we should never hit this path
213 static inline unsigned bs_cycle(bool trans
, unsigned bs
, unsigned src
) {
214 return trans
? bs_cycle_scalar(bs
, src
) : bs_cycle_vector(bs
, src
);
218 bool rp_gpr_tracker::try_reserve(unsigned cycle
, unsigned sel
, unsigned chan
) {
220 if (rp
[cycle
][chan
] == 0) {
221 rp
[cycle
][chan
] = sel
;
224 } else if (rp
[cycle
][chan
] == sel
) {
232 void rp_gpr_tracker::unreserve(alu_node
* n
) {
233 unsigned nsrc
= n
->bc
.op_ptr
->src_count
, i
;
234 unsigned trans
= n
->bc
.slot
== SLOT_TRANS
;
235 unsigned bs
= n
->bc
.bank_swizzle
;
236 unsigned opt
= !trans
237 && n
->bc
.src
[0].sel
== n
->bc
.src
[1].sel
238 && n
->bc
.src
[0].chan
== n
->bc
.src
[1].chan
;
240 for (i
= 0; i
< nsrc
; ++i
) {
241 value
*v
= n
->src
[i
];
242 if (v
->is_readonly())
246 unsigned cycle
= bs_cycle(trans
, bs
, i
);
247 unreserve(cycle
, n
->bc
.src
[i
].sel
, n
->bc
.src
[i
].chan
);
252 void rp_gpr_tracker::unreserve(unsigned cycle
, unsigned sel
, unsigned chan
) {
254 assert(rp
[cycle
][chan
] == sel
&& uc
[cycle
][chan
]);
255 if (--uc
[cycle
][chan
] == 0)
260 bool rp_gpr_tracker::try_reserve(alu_node
* n
) {
261 unsigned nsrc
= n
->bc
.op_ptr
->src_count
, i
;
262 unsigned trans
= n
->bc
.slot
== SLOT_TRANS
;
263 unsigned bs
= n
->bc
.bank_swizzle
;
264 unsigned opt
= !trans
&& nsrc
>= 2 &&
265 n
->src
[0] == n
->src
[1];
267 bool need_unreserve
= false;
268 unsigned const_count
= 0, min_gpr_cycle
= 3;
270 for (i
= 0; i
< nsrc
; ++i
) {
271 value
*v
= n
->src
[i
];
272 if (v
->is_readonly()) {
274 if (trans
&& const_count
== 3)
280 unsigned cycle
= bs_cycle(trans
, bs
, i
);
282 if (trans
&& cycle
< min_gpr_cycle
)
283 min_gpr_cycle
= cycle
;
285 if (const_count
&& cycle
< const_count
&& trans
)
288 if (!try_reserve(cycle
, n
->bc
.src
[i
].sel
, n
->bc
.src
[i
].chan
))
291 need_unreserve
= true;
295 if ((i
== nsrc
) && (min_gpr_cycle
+ 1 > const_count
))
298 if (need_unreserve
&& i
--) {
300 value
*v
= n
->src
[i
];
301 if (!v
->is_readonly()) {
304 unreserve(bs_cycle(trans
, bs
, i
), n
->bc
.src
[i
].sel
,
312 alu_group_tracker::alu_group_tracker(shader
&sh
)
314 gpr(), lt(), slots(),
315 max_slots(sh
.get_ctx().is_cayman() ? 4 : 5),
316 has_mova(), uses_ar(), has_predset(), has_kill(),
317 updates_exec_mask(), chan_count(), interp_param(), next_id() {
319 available_slots
= sh
.get_ctx().has_trans
? 0x1F : 0x0F;
323 sel_chan
alu_group_tracker::get_value_id(value
* v
) {
324 unsigned &id
= vmap
[v
];
327 return sel_chan(id
, v
->get_final_chan());
331 void alu_group_tracker::assign_slot(unsigned slot
, alu_node
* n
) {
334 available_slots
&= ~(1 << slot
);
336 unsigned param
= n
->interp_param();
339 assert(!interp_param
|| interp_param
== param
);
340 interp_param
= param
;
345 void alu_group_tracker::discard_all_slots(container_node
&removed_nodes
) {
346 PSC_DUMP( cerr
<< "agt::discard_all_slots\n"; );
347 discard_slots(~available_slots
& ((1 << max_slots
) - 1), removed_nodes
);
350 void alu_group_tracker::discard_slots(unsigned slot_mask
,
351 container_node
&removed_nodes
) {
354 cerr
<< "discard_slots : packed_ops : " << packed_ops
.size() << "\n";
357 for (node_vec::iterator N
, I
= packed_ops
.begin();
358 I
!= packed_ops
.end(); I
= N
) {
361 alu_packed_node
*n
= static_cast<alu_packed_node
*>(*I
);
362 unsigned pslots
= n
->get_slot_mask();
365 cerr
<< "discard_slots : packed slot_mask : " << pslots
<< "\n";
368 if (pslots
& slot_mask
) {
371 cerr
<< "discard_slots : discarding packed...\n";
374 removed_nodes
.push_back(n
);
375 slot_mask
&= ~pslots
;
376 N
= packed_ops
.erase(I
);
377 available_slots
|= pslots
;
378 for (unsigned k
= 0; k
< max_slots
; ++k
) {
379 if (pslots
& (1 << k
))
385 for (unsigned slot
= 0; slot
< max_slots
; ++slot
) {
386 unsigned slot_bit
= 1 << slot
;
388 if (slot_mask
& slot_bit
) {
389 assert(!(available_slots
& slot_bit
));
392 assert(!(slots
[slot
]->bc
.slot_flags
& AF_4SLOT
));
395 cerr
<< "discarding slot " << slot
<< " : ";
396 dump::dump_op(slots
[slot
]);
400 removed_nodes
.push_back(slots
[slot
]);
402 available_slots
|= slot_bit
;
406 alu_node
*t
= slots
[4];
407 if (t
&& (t
->bc
.slot_flags
& AF_V
)) {
408 unsigned chan
= t
->bc
.dst_chan
;
413 cerr
<< " from trans slot to free slot " << chan
<< "\n";
425 alu_group_node
* alu_group_tracker::emit() {
427 alu_group_node
*g
= sh
.create_alu_group();
429 lt
.init_group_literals(g
);
431 for (unsigned i
= 0; i
< max_slots
; ++i
) {
432 alu_node
*n
= slots
[i
];
440 bool alu_group_tracker::try_reserve(alu_node
* n
) {
441 unsigned nsrc
= n
->bc
.op_ptr
->src_count
;
442 unsigned slot
= n
->bc
.slot
;
443 bool trans
= slot
== 4;
448 unsigned flags
= n
->bc
.op_ptr
->flags
;
450 unsigned param
= n
->interp_param();
452 if (param
&& interp_param
&& interp_param
!= param
)
455 if ((flags
& AF_KILL
) && has_predset
)
457 if ((flags
& AF_ANY_PRED
) && (has_kill
|| has_predset
))
459 if ((flags
& AF_MOVA
) && (has_mova
|| uses_ar
))
462 if (n
->uses_ar() && has_mova
)
465 for (unsigned i
= 0; i
< nsrc
; ++i
) {
467 unsigned last_id
= next_id
;
469 value
*v
= n
->src
[i
];
470 if (!v
->is_any_gpr() && !v
->is_rel())
472 sel_chan vid
= get_value_id(n
->src
[i
]);
474 if (vid
> last_id
&& chan_count
[vid
.chan()] == 3) {
478 n
->bc
.src
[i
].sel
= vid
.sel();
479 n
->bc
.src
[i
].chan
= vid
.chan();
482 if (!lt
.try_reserve(n
))
485 if (!kc
.try_reserve(n
)) {
490 unsigned fbs
= n
->forced_bank_swizzle();
492 n
->bc
.bank_swizzle
= 0;
495 n
->bc
.bank_swizzle
= VEC_210
;
497 if (gpr
.try_reserve(n
)) {
498 assign_slot(slot
, n
);
503 unsigned swz_num
= trans
? SCL_NUM
: VEC_NUM
;
504 for (unsigned bs
= 0; bs
< swz_num
; ++bs
) {
505 n
->bc
.bank_swizzle
= bs
;
506 if (gpr
.try_reserve(n
)) {
507 assign_slot(slot
, n
);
516 unsigned forced_swz_slots
= 0;
517 int first_slot
= ~0, first_nf
= ~0, last_slot
= ~0;
520 for (unsigned i
= 0; i
< max_slots
; ++i
) {
521 alu_node
*a
= slots
[i
];
523 if (first_slot
== ~0)
526 save_bs
[i
] = a
->bc
.bank_swizzle
;
527 if (a
->forced_bank_swizzle()) {
528 assert(i
!= SLOT_TRANS
);
529 forced_swz_slots
|= (1 << i
);
530 a
->bc
.bank_swizzle
= VEC_210
;
531 if (!gpr
.try_reserve(a
))
532 assert("!internal reservation error");
537 a
->bc
.bank_swizzle
= 0;
542 if (first_nf
== ~0) {
543 assign_slot(slot
, n
);
547 assert(first_slot
!= ~0 && last_slot
!= ~0);
549 // silence "array subscript is above array bounds" with gcc 4.8
554 alu_node
*a
= slots
[i
];
555 bool backtrack
= false;
560 cerr
<< " bs: trying s" << i
<< " bs:" << a
->bc
.bank_swizzle
561 << " bt:" << backtrack
<< "\n";
564 if (!backtrack
&& gpr
.try_reserve(a
)) {
566 cerr
<< " bs: reserved s" << i
<< " bs:" << a
->bc
.bank_swizzle
570 while ((++i
<= last_slot
) && !slots
[i
]);
576 bool itrans
= i
== SLOT_TRANS
;
577 unsigned max_swz
= itrans
? SCL_221
: VEC_210
;
579 if (a
->bc
.bank_swizzle
< max_swz
) {
580 ++a
->bc
.bank_swizzle
;
583 cerr
<< " bs: inc s" << i
<< " bs:" << a
->bc
.bank_swizzle
589 a
->bc
.bank_swizzle
= 0;
590 while ((--i
>= first_nf
) && !slots
[i
]);
595 cerr
<< " bs: unreserve s" << i
<< " bs:" << a
->bc
.bank_swizzle
607 if (i
== last_slot
+ 1) {
608 assign_slot(slot
, n
);
612 // reservation failed, restore previous state
615 for (unsigned i
= 0; i
< max_slots
; ++i
) {
616 alu_node
*a
= slots
[i
];
618 a
->bc
.bank_swizzle
= save_bs
[i
];
619 bool b
= gpr
.try_reserve(a
);
629 bool alu_group_tracker::try_reserve(alu_packed_node
* p
) {
630 bool need_unreserve
= false;
631 node_iterator
I(p
->begin()), E(p
->end());
633 for (; I
!= E
; ++I
) {
634 alu_node
*n
= static_cast<alu_node
*>(*I
);
638 need_unreserve
= true;
642 packed_ops
.push_back(p
);
646 if (need_unreserve
) {
648 alu_node
*n
= static_cast<alu_node
*>(*I
);
649 slots
[n
->bc
.slot
] = NULL
;
656 void alu_group_tracker::reinit() {
658 memcpy(s
, slots
, sizeof(slots
));
662 for (int i
= max_slots
- 1; i
>= 0; --i
) {
663 if (s
[i
] && !try_reserve(s
[i
])) {
664 cerr
<< "alu_group_tracker: reinit error on slot " << i
<< "\n";
665 for (unsigned i
= 0; i
< max_slots
; ++i
) {
666 cerr
<< " slot " << i
<< " : ";
672 assert(!"alu_group_tracker: reinit error");
677 void alu_group_tracker::reset(bool keep_packed
) {
681 memset(slots
, 0, sizeof(slots
));
688 updates_exec_mask
= false;
689 available_slots
= sh
.get_ctx().has_trans
? 0x1F : 0x0F;
701 void alu_group_tracker::update_flags(alu_node
* n
) {
702 unsigned flags
= n
->bc
.op_ptr
->flags
;
703 has_kill
|= (flags
& AF_KILL
);
704 has_mova
|= (flags
& AF_MOVA
);
705 has_predset
|= (flags
& AF_ANY_PRED
);
706 uses_ar
|= n
->uses_ar();
708 if (flags
& AF_ANY_PRED
) {
709 if (n
->dst
[2] != NULL
)
710 updates_exec_mask
= true;
714 int post_scheduler::run() {
719 void post_scheduler::run_on(container_node
* n
) {
721 for (node_riterator I
= n
->rbegin(), E
= n
->rend(); I
!= E
; ++I
) {
722 if (I
->is_container()) {
723 if (I
->subtype
== NST_BB
) {
724 bb_node
* bb
= static_cast<bb_node
*>(*I
);
727 run_on(static_cast<container_node
*>(*I
));
733 void post_scheduler::init_uc_val(container_node
*c
, value
*v
) {
734 node
*d
= v
->any_def();
735 if (d
&& d
->parent
== c
)
739 void post_scheduler::init_uc_vec(container_node
*c
, vvec
&vv
, bool src
) {
740 for (vvec::iterator I
= vv
.begin(), E
= vv
.end(); I
!= E
; ++I
) {
742 if (!v
|| v
->is_readonly())
746 init_uc_val(c
, v
->rel
);
747 init_uc_vec(c
, v
->muse
, true);
754 unsigned post_scheduler::init_ucm(container_node
*c
, node
*n
) {
755 init_uc_vec(c
, n
->src
, true);
756 init_uc_vec(c
, n
->dst
, false);
758 uc_map::iterator F
= ucm
.find(n
);
759 return F
== ucm
.end() ? 0 : F
->second
;
762 void post_scheduler::schedule_bb(bb_node
* bb
) {
764 cerr
<< "scheduling BB " << bb
->id
<< "\n";
765 if (!pending
.empty())
766 dump::dump_op_list(&pending
);
769 assert(pending
.empty());
770 assert(bb_pending
.empty());
771 assert(ready
.empty());
773 bb_pending
.append_from(bb
);
778 while ((n
= bb_pending
.back())) {
781 cerr
<< "post_sched_bb ";
786 if (n
->subtype
== NST_ALU_CLAUSE
) {
788 process_alu(static_cast<container_node
*>(n
));
799 void post_scheduler::init_regmap() {
804 cerr
<< "init_regmap: live: ";
805 dump::dump_set(sh
, live
);
809 for (val_set::iterator I
= live
.begin(sh
), E
= live
.end(sh
); I
!= E
; ++I
) {
812 if (!v
->is_sgpr() || !v
->is_prealloc())
818 cerr
<< "init_regmap: " << r
<< " <= ";
828 void post_scheduler::process_alu(container_node
*c
) {
833 live
= c
->live_after
;
835 init_globals(c
->live_after
, true);
836 init_globals(c
->live_before
, true);
840 update_local_interferences();
842 for (node_riterator N
, I
= c
->rbegin(), E
= c
->rend(); I
!= E
; I
= N
) {
847 unsigned uc
= init_ucm(c
, n
);
850 cerr
<< "process_alu uc=" << uc
<< " ";
857 pending
.push_back(n
);
858 PSC_DUMP( cerr
<< "pending\n"; );
867 void post_scheduler::update_local_interferences() {
870 cerr
<< "update_local_interferences : ";
871 dump::dump_set(sh
, live
);
876 for (val_set::iterator I
= live
.begin(sh
), E
= live
.end(sh
); I
!= E
; ++I
) {
878 if (v
->is_prealloc())
881 v
->interferences
.add_set(live
);
885 void post_scheduler::update_live_src_vec(vvec
&vv
, val_set
*born
, bool src
) {
886 for (vvec::iterator I
= vv
.begin(), E
= vv
.end(); I
!= E
; ++I
) {
892 if (src
&& v
->is_any_gpr()) {
893 if (live
.add_val(v
)) {
894 if (!v
->is_prealloc()) {
895 if (!cleared_interf
.contains(v
)) {
897 cerr
<< "clearing interferences for " << *v
<< "\n";
899 v
->interferences
.clear();
900 cleared_interf
.add_val(v
);
906 } else if (v
->is_rel()) {
907 if (!v
->rel
->is_any_gpr())
908 live
.add_val(v
->rel
);
909 update_live_src_vec(v
->muse
, born
, true);
914 void post_scheduler::update_live_dst_vec(vvec
&vv
) {
915 for (vvec::iterator I
= vv
.begin(), E
= vv
.end(); I
!= E
; ++I
) {
921 update_live_dst_vec(v
->mdef
);
922 } else if (v
->is_any_gpr()) {
923 if (!live
.remove_val(v
)) {
925 cerr
<< "failed to remove ";
927 cerr
<< " from live : ";
928 dump::dump_set(sh
, live
);
936 void post_scheduler::update_live(node
*n
, val_set
*born
) {
937 update_live_dst_vec(n
->dst
);
938 update_live_src_vec(n
->src
, born
, true);
939 update_live_src_vec(n
->dst
, born
, false);
942 void post_scheduler::process_group() {
943 alu_group_tracker
&rt
= alu
.grp();
950 cerr
<< "process_group: live_before : ";
951 dump::dump_set(sh
, live
);
955 for (unsigned s
= 0; s
< ctx
.num_slots
; ++s
) {
956 alu_node
*n
= rt
.slot(s
);
960 update_live(n
, &vals_born
);
964 cerr
<< "process_group: live_after : ";
965 dump::dump_set(sh
, live
);
969 update_local_interferences();
971 for (unsigned i
= 0; i
< 5; ++i
) {
972 node
*n
= rt
.slot(i
);
973 if (n
&& !n
->is_mova()) {
974 release_src_values(n
);
979 void post_scheduler::init_globals(val_set
&s
, bool prealloc
) {
982 cerr
<< "init_globals: ";
983 dump::dump_set(sh
, s
);
987 for (val_set::iterator I
= s
.begin(sh
), E
= s
.end(sh
); I
!= E
; ++I
) {
989 if (v
->is_sgpr() && !v
->is_global()) {
992 if (prealloc
&& v
->is_fixed()) {
999 void post_scheduler::emit_clause() {
1001 if (alu
.current_ar
) {
1007 alu
.emit_clause(cur_bb
);
1010 void post_scheduler::schedule_alu(container_node
*c
) {
1012 assert(!ready
.empty() || !ready_copies
.empty());
1016 prev_regmap
= regmap
;
1018 if (!prepare_alu_group()) {
1019 if (alu
.current_ar
) {
1026 if (!alu
.check_clause_limits()) {
1027 regmap
= prev_regmap
;
1029 init_globals(live
, false);
1037 if (!alu
.is_empty()) {
1041 if (!ready
.empty()) {
1042 cerr
<< "##post_scheduler: unscheduled ready instructions :";
1043 dump::dump_op_list(&ready
);
1044 assert(!"unscheduled ready instructions");
1047 if (!pending
.empty()) {
1048 cerr
<< "##post_scheduler: unscheduled pending instructions :";
1049 dump::dump_op_list(&pending
);
1050 assert(!"unscheduled pending instructions");
1054 void post_scheduler::add_interferences(value
*v
, sb_bitset
&rb
, val_set
&vs
) {
1055 unsigned chan
= v
->gpr
.chan();
1057 for (val_set::iterator I
= vs
.begin(sh
), E
= vs
.end(sh
);
1060 sel_chan gpr
= vi
->get_final_gpr();
1062 if (vi
->is_any_gpr() && gpr
&& vi
!= v
&&
1063 (!v
->chunk
|| v
->chunk
!= vi
->chunk
) &&
1064 vi
->is_fixed() && gpr
.chan() == chan
) {
1066 unsigned r
= gpr
.sel();
1069 cerr
<< "\tadd_interferences: " << *vi
<< "\n";
1079 void post_scheduler::set_color_local_val(value
*v
, sel_chan color
) {
1083 cerr
<< " recolored: ";
1089 void post_scheduler::set_color_local(value
*v
, sel_chan color
) {
1091 vvec
&vv
= v
->chunk
->values
;
1092 for (vvec::iterator I
= vv
.begin(), E
= vv
.end(); I
!= E
; ++I
) {
1094 set_color_local_val(v2
, color
);
1098 set_color_local_val(v
, color
);
1103 bool post_scheduler::recolor_local(value
*v
) {
1107 assert(v
->is_sgpr());
1108 assert(!v
->is_prealloc());
1111 unsigned chan
= v
->gpr
.chan();
1114 cerr
<< "recolor_local: ";
1116 cerr
<< " interferences: ";
1117 dump::dump_set(sh
, v
->interferences
);
1120 cerr
<< " in chunk: ";
1121 coalescer::dump_chunk(v
->chunk
);
1127 for (vvec::iterator I
= v
->chunk
->values
.begin(),
1128 E
= v
->chunk
->values
.end(); I
!= E
; ++I
) {
1131 PSC_DUMP( cerr
<< " add_interferences for " << *v2
<< " :\n"; );
1133 add_interferences(v
, rb
, v2
->interferences
);
1136 add_interferences(v
, rb
, v
->interferences
);
1140 unsigned sz
= rb
.size();
1141 cerr
<< "registers bits: " << sz
;
1142 for (unsigned r
= 0; r
< sz
; ++r
) {
1144 cerr
<< "\n " << r
<< " ";
1145 cerr
<< (rb
.get(r
) ? 1 : 0);
1149 bool no_temp_gprs
= v
->is_global();
1150 unsigned rs
, re
, pass
= no_temp_gprs
? 1 : 0;
1155 rs
= sh
.first_temp_gpr();
1159 re
= sh
.num_nontemp_gpr();
1162 for (unsigned reg
= rs
; reg
< re
; ++reg
) {
1163 if (reg
>= rb
.size() || !rb
.get(reg
)) {
1165 set_color_local(v
, sel_chan(reg
, chan
));
1172 assert(!"recolor_local failed");
1176 void post_scheduler::emit_load_ar() {
1178 regmap
= prev_regmap
;
1179 alu
.discard_current_group();
1181 alu_group_tracker
&rt
= alu
.grp();
1182 alu_node
*a
= alu
.create_ar_load();
1184 if (!rt
.try_reserve(a
)) {
1185 cerr
<< "can't emit AR load : ";
1193 bool post_scheduler::unmap_dst_val(value
*d
) {
1195 if (d
== alu
.current_ar
) {
1200 if (d
->is_prealloc()) {
1201 sel_chan gpr
= d
->get_final_gpr();
1202 rv_map::iterator F
= regmap
.find(gpr
);
1204 if (F
!= regmap
.end())
1207 if (c
&& c
!=d
&& (!c
->chunk
|| c
->chunk
!= d
->chunk
)) {
1209 cerr
<< "dst value conflict : ";
1211 cerr
<< " regmap contains ";
1215 assert(!"scheduler error");
1224 bool post_scheduler::unmap_dst(alu_node
*n
) {
1225 value
*d
= n
->dst
.empty() ? NULL
: n
->dst
[0];
1231 if (d
&& d
->is_any_reg()) {
1234 if (alu
.current_ar
!= d
) {
1235 cerr
<< "loading wrong ar value\n";
1238 alu
.current_ar
= NULL
;
1241 } else if (d
->is_any_gpr()) {
1242 if (!unmap_dst_val(d
))
1247 for (vvec::iterator I
= d
->mdef
.begin(), E
= d
->mdef
.end();
1253 assert(d
->is_any_gpr());
1255 if (!unmap_dst_val(d
))
1262 bool post_scheduler::map_src_val(value
*v
) {
1264 if (!v
->is_prealloc())
1267 sel_chan gpr
= v
->get_final_gpr();
1268 rv_map::iterator F
= regmap
.find(gpr
);
1270 if (F
!= regmap
.end()) {
1272 if (!v
->v_equal(c
)) {
1274 cerr
<< "can't map src value ";
1276 cerr
<< ", regmap contains ";
1283 regmap
.insert(std::make_pair(gpr
, v
));
1288 bool post_scheduler::map_src_vec(vvec
&vv
, bool src
) {
1289 for (vvec::iterator I
= vv
.begin(), E
= vv
.end(); I
!= E
; ++I
) {
1294 if ((!v
->is_any_gpr() || !v
->is_fixed()) && !v
->is_rel())
1298 value
*rel
= v
->rel
;
1301 if (!rel
->is_const()) {
1302 if (!map_src_vec(v
->muse
, true))
1305 if (rel
!= alu
.current_ar
) {
1306 if (alu
.current_ar
) {
1308 cerr
<< " current_AR is " << *alu
.current_ar
1309 << " trying to use " << *rel
<< "\n";
1314 alu
.current_ar
= rel
;
1317 cerr
<< " new current_AR assigned: " << *alu
.current_ar
1324 if (!map_src_val(v
)) {
1332 bool post_scheduler::map_src(alu_node
*n
) {
1333 if (!map_src_vec(n
->dst
, false))
1336 if (!map_src_vec(n
->src
, true))
1342 void post_scheduler::dump_regmap() {
1344 cerr
<< "# REGMAP :\n";
1346 for(rv_map::iterator I
= regmap
.begin(), E
= regmap
.end(); I
!= E
; ++I
) {
1347 cerr
<< " # " << I
->first
<< " => " << *(I
->second
) << "\n";
1351 cerr
<< " current_AR: " << *alu
.current_ar
<< "\n";
1353 cerr
<< " current_PR: " << *alu
.current_pr
<< "\n";
1356 void post_scheduler::recolor_locals() {
1357 alu_group_tracker
&rt
= alu
.grp();
1359 for (unsigned s
= 0; s
< ctx
.num_slots
; ++s
) {
1360 alu_node
*n
= rt
.slot(s
);
1362 value
*d
= n
->dst
[0];
1363 if (d
&& d
->is_sgpr() && !d
->is_prealloc()) {
1370 // returns true if there are interferences
1371 bool post_scheduler::check_interferences() {
1373 alu_group_tracker
&rt
= alu
.grp();
1375 unsigned interf_slots
;
1377 bool discarded
= false;
1380 cerr
<< "check_interferences: before: \n";
1388 for (unsigned s
= 0; s
< ctx
.num_slots
; ++s
) {
1389 alu_node
*n
= rt
.slot(s
);
1391 if (!unmap_dst(n
)) {
1397 for (unsigned s
= 0; s
< ctx
.num_slots
; ++s
) {
1398 alu_node
*n
= rt
.slot(s
);
1401 interf_slots
|= (1 << s
);
1407 for (unsigned i
= 0; i
< 5; ++i
) {
1408 if (interf_slots
& (1 << i
)) {
1409 cerr
<< "!!!!!! interf slot: " << i
<< " : ";
1410 dump::dump_op(rt
.slot(i
));
1419 PSC_DUMP( cerr
<< "ci: discarding slots " << interf_slots
<< "\n"; );
1421 rt
.discard_slots(interf_slots
, alu
.conflict_nodes
);
1422 regmap
= prev_regmap
;
1428 cerr
<< "check_interferences: after: \n";
1435 // add instruction(s) (alu_node or contents of alu_packed_node) to current group
1436 // returns the number of added instructions on success
1437 unsigned post_scheduler::try_add_instruction(node
*n
) {
1439 alu_group_tracker
&rt
= alu
.grp();
1441 unsigned avail_slots
= rt
.avail_slots();
1443 if (n
->is_alu_packed()) {
1444 alu_packed_node
*p
= static_cast<alu_packed_node
*>(n
);
1445 unsigned slots
= p
->get_slot_mask();
1446 unsigned cnt
= __builtin_popcount(slots
);
1448 if ((slots
& avail_slots
) != slots
) {
1449 PSC_DUMP( cerr
<< " no slots \n"; );
1453 p
->update_packed_items(ctx
);
1455 if (!rt
.try_reserve(p
)) {
1456 PSC_DUMP( cerr
<< " reservation failed \n"; );
1464 alu_node
*a
= static_cast<alu_node
*>(n
);
1465 value
*d
= a
->dst
.empty() ? NULL
: a
->dst
[0];
1467 if (d
&& d
->is_special_reg()) {
1468 assert(a
->bc
.op_ptr
->flags
& AF_MOVA
);
1472 unsigned allowed_slots
= ctx
.alu_slots_mask(a
->bc
.op_ptr
);
1475 allowed_slots
&= avail_slots
;
1481 slot
= d
->get_final_chan();
1482 a
->bc
.dst_chan
= slot
;
1483 allowed_slots
&= (1 << slot
) | 0x10;
1485 if (a
->bc
.op_ptr
->flags
& AF_MOVA
) {
1486 if (a
->bc
.slot_flags
& AF_V
)
1487 allowed_slots
&= (1 << SLOT_X
);
1489 allowed_slots
&= (1 << SLOT_TRANS
);
1493 // FIXME workaround for some problems with MULADD in trans slot on r700,
1494 // (is it really needed on r600?)
1495 if (a
->bc
.op
== ALU_OP3_MULADD
&& !ctx
.is_egcm()) {
1496 allowed_slots
&= 0x0F;
1499 if (!allowed_slots
) {
1500 PSC_DUMP( cerr
<< " no suitable slots\n"; );
1504 slot
= __builtin_ctz(allowed_slots
);
1507 PSC_DUMP( cerr
<< "slot: " << slot
<< "\n"; );
1509 if (!rt
.try_reserve(a
)) {
1510 PSC_DUMP( cerr
<< " reservation failed\n"; );
1519 bool post_scheduler::check_copy(node
*n
) {
1520 if (!n
->is_copy_mov())
1523 value
*s
= n
->src
[0];
1524 value
*d
= n
->dst
[0];
1526 if (!s
->is_sgpr() || !d
->is_sgpr())
1529 if (!s
->is_prealloc()) {
1533 if (s
->gpr
== d
->gpr
) {
1536 cerr
<< "check_copy: ";
1541 rv_map::iterator F
= regmap
.find(d
->gpr
);
1542 bool gpr_free
= (F
== regmap
.end());
1544 if (d
->is_prealloc()) {
1546 PSC_DUMP( cerr
<< " copy not ready...\n";);
1550 value
*rv
= F
->second
;
1551 if (rv
!= d
&& (!rv
->chunk
|| rv
->chunk
!= d
->chunk
)) {
1552 PSC_DUMP( cerr
<< " copy not ready(2)...\n";);
1556 unmap_dst(static_cast<alu_node
*>(n
));
1559 if (s
->is_prealloc() && !map_src_val(s
))
1562 update_live(n
, NULL
);
1564 release_src_values(n
);
1566 PSC_DUMP( cerr
<< " copy coalesced...\n";);
1572 void post_scheduler::dump_group(alu_group_tracker
&rt
) {
1573 for (unsigned i
= 0; i
< 5; ++i
) {
1574 node
*n
= rt
.slot(i
);
1576 cerr
<< "slot " << i
<< " : ";
1583 void post_scheduler::process_ready_copies() {
1588 last
= ready_copies
.back();
1590 for (node_iterator N
, I
= ready_copies
.begin(), E
= ready_copies
.end();
1596 if (!check_copy(n
)) {
1601 } while (last
!= ready_copies
.back());
1603 update_local_interferences();
1607 bool post_scheduler::prepare_alu_group() {
1609 alu_group_tracker
&rt
= alu
.grp();
1614 cerr
<< "prepare_alu_group: starting...\n";
1618 ready
.append_from(&alu
.conflict_nodes
);
1620 // FIXME rework this loop
1624 process_ready_copies();
1628 for (node_iterator N
, I
= ready
.begin(), E
= ready
.end(); I
!= E
;
1640 unsigned cnt
= try_add_instruction(n
);
1646 cerr
<< "current group:\n";
1650 if (rt
.inst_count() == ctx
.num_slots
) {
1651 PSC_DUMP( cerr
<< " all slots used\n"; );
1656 if (!check_interferences())
1659 // don't try to add more instructions to the group with mova if this
1660 // can lead to breaking clause slot count limit - we don't want mova to
1661 // end up in the end of the new clause instead of beginning of the
1663 if (rt
.has_ar_load() && alu
.total_slots() > 121)
1666 if (rt
.inst_count() && i1
> 50)
1669 regmap
= prev_regmap
;
1674 cerr
<< " prepare_alu_group done, " << rt
.inst_count()
1677 cerr
<< "$$$$$$$$PAG i1=" << i1
1678 << " ready " << ready
.count()
1679 << " pending " << pending
.count()
1680 << " conflicting " << alu
.conflict_nodes
.count()
1685 return rt
.inst_count();
1688 void post_scheduler::release_src_values(node
* n
) {
1689 release_src_vec(n
->src
, true);
1690 release_src_vec(n
->dst
, false);
1693 void post_scheduler::release_op(node
*n
) {
1695 cerr
<< "release_op ";
1702 if (n
->is_copy_mov()) {
1703 ready_copies
.push_back(n
);
1704 } else if (n
->is_mova() || n
->is_pred_set()) {
1705 ready
.push_front(n
);
1711 void post_scheduler::release_src_val(value
*v
) {
1712 node
*d
= v
->any_def();
1719 void post_scheduler::release_src_vec(vvec
& vv
, bool src
) {
1721 for (vvec::iterator I
= vv
.begin(), E
= vv
.end(); I
!= E
; ++I
) {
1723 if (!v
|| v
->is_readonly())
1727 release_src_val(v
->rel
);
1728 release_src_vec(v
->muse
, true);
1736 void literal_tracker::reset() {
1737 memset(lt
, 0, sizeof(lt
));
1738 memset(uc
, 0, sizeof(uc
));
1741 void rp_gpr_tracker::reset() {
1742 memset(rp
, 0, sizeof(rp
));
1743 memset(uc
, 0, sizeof(uc
));
1746 void rp_kcache_tracker::reset() {
1747 memset(rp
, 0, sizeof(rp
));
1748 memset(uc
, 0, sizeof(uc
));
1751 void alu_kcache_tracker::reset() {
1752 memset(kc
, 0, sizeof(kc
));
1756 void alu_clause_tracker::reset() {
1763 alu_clause_tracker::alu_clause_tracker(shader
&sh
)
1764 : sh(sh
), kt(sh
.get_ctx().hw_class
), slot_count(),
1768 current_ar(), current_pr() {}
1770 void alu_clause_tracker::emit_group() {
1772 assert(grp().inst_count());
1774 alu_group_node
*g
= grp().emit();
1776 if (grp().has_update_exec_mask()) {
1777 assert(!push_exec_mask
);
1778 push_exec_mask
= true;
1784 clause
= sh
.create_clause(NST_ALU_CLAUSE
);
1787 clause
->push_front(g
);
1789 slot_count
+= grp().slot_count();
1793 PSC_DUMP( cerr
<< " #### group emitted\n"; );
1796 void alu_clause_tracker::emit_clause(container_node
*c
) {
1799 kt
.init_clause(clause
->bc
);
1801 assert(!current_ar
);
1802 assert(!current_pr
);
1805 clause
->bc
.set_op(CF_OP_ALU_PUSH_BEFORE
);
1807 c
->push_front(clause
);
1810 push_exec_mask
= false;
1814 PSC_DUMP( cerr
<< "######### ALU clause emitted\n"; );
1817 bool alu_clause_tracker::check_clause_limits() {
1819 alu_group_tracker
>
= grp();
1821 unsigned slots
= gt
.slot_count();
1823 // reserving slots to load AR and PR values
1824 unsigned reserve_slots
= (current_ar
? 1 : 0) + (current_pr
? 1 : 0);
1826 if (slot_count
+ slots
> MAX_ALU_SLOTS
- reserve_slots
)
1829 if (!kt
.try_reserve(gt
))
1835 void alu_clause_tracker::new_group() {
1840 bool alu_clause_tracker::is_empty() {
1841 return clause
== NULL
;
1844 void literal_tracker::init_group_literals(alu_group_node
* g
) {
1846 g
->literals
.clear();
1847 for (unsigned i
= 0; i
< 4; ++i
) {
1851 g
->literals
.push_back(lt
[i
]);
1854 cerr
<< "literal emitted: " << lt
[i
].f
1855 << " 0x" << std::hex
<< lt
[i
].u
1856 << std::dec
<< " " << lt
[i
].i
<< "\n";
1861 bool alu_kcache_tracker::try_reserve(alu_group_tracker
& gt
) {
1862 rp_kcache_tracker
&kt
= gt
.kcache();
1867 sb_set
<unsigned> group_lines
;
1869 unsigned nl
= kt
.get_lines(group_lines
);
1872 sb_set
<unsigned> clause_lines(lines
);
1873 lines
.add_set(group_lines
);
1875 if (clause_lines
.size() == lines
.size())
1881 lines
= clause_lines
;
1886 unsigned rp_kcache_tracker::get_lines(kc_lines
& lines
) {
1889 for (unsigned i
= 0; i
< sel_count
; ++i
) {
1890 unsigned line
= rp
[i
];
1896 line
= (sel_count
== 2) ? line
>> 5 : line
>> 6;
1898 if (lines
.insert(line
).second
)
1904 bool alu_kcache_tracker::update_kc() {
1907 bc_kcache old_kc
[4];
1908 memcpy(old_kc
, kc
, sizeof(kc
));
1910 for (kc_lines::iterator I
= lines
.begin(), E
= lines
.end(); I
!= E
; ++I
) {
1912 unsigned bank
= line
>> 8;
1916 if (c
&& (bank
== kc
[c
-1].bank
) && (kc
[c
-1].addr
+ 1 == line
))
1920 memcpy(kc
, old_kc
, sizeof(kc
));
1924 kc
[c
].mode
= KC_LOCK_1
;
1934 alu_node
* alu_clause_tracker::create_ar_load() {
1935 alu_node
*a
= sh
.create_alu();
1937 // FIXME use MOVA_GPR on R6xx
1939 if (sh
.get_ctx().uses_mova_gpr
) {
1940 a
->bc
.set_op(ALU_OP1_MOVA_GPR_INT
);
1941 a
->bc
.slot
= SLOT_TRANS
;
1943 a
->bc
.set_op(ALU_OP1_MOVA_INT
);
1944 a
->bc
.slot
= SLOT_X
;
1948 a
->src
.push_back(current_ar
);
1951 cerr
<< "created AR load: ";
1959 void alu_clause_tracker::discard_current_group() {
1960 PSC_DUMP( cerr
<< "act::discard_current_group\n"; );
1961 grp().discard_all_slots(conflict_nodes
);
1964 void rp_gpr_tracker::dump() {
1965 cerr
<< "=== gpr_tracker dump:\n";
1966 for (int c
= 0; c
< 3; ++c
) {
1967 cerr
<< "cycle " << c
<< " ";
1968 for (int h
= 0; h
< 4; ++h
) {
1969 cerr
<< rp
[c
][h
] << ":" << uc
[c
][h
] << " ";
1975 } // namespace r600_sb