f0e41f5863774fcda77c40d5cc0173b6af07eff1
2 * Copyright 2013 Vadim Girlin <vadimgirlin@gmail.com>
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
30 #define PSC_DUMP(a) do { a } while (0)
36 #include "sb_shader.h"
42 rp_kcache_tracker::rp_kcache_tracker(shader
&sh
) : rp(), uc(),
43 // FIXME: for now we'll use "two const pairs" limit for r600, same as
44 // for other chips, otherwise additional check in alu_group_tracker is
45 // required to make sure that all 4 consts in the group fit into 2
49 bool rp_kcache_tracker::try_reserve(sel_chan r
) {
50 unsigned sel
= kc_sel(r
);
52 for (unsigned i
= 0; i
< sel_count
; ++i
) {
66 bool rp_kcache_tracker::try_reserve(node
* n
) {
67 bool need_unreserve
= false;
68 vvec::iterator
I(n
->src
.begin()), E(n
->src
.end());
73 if (!try_reserve(v
->select
))
76 need_unreserve
= true;
82 if (need_unreserve
&& I
!= n
->src
.begin()) {
88 } while (I
!= n
->src
.begin());
94 void rp_kcache_tracker::unreserve(node
* n
) {
95 vvec::iterator
I(n
->src
.begin()), E(n
->src
.end());
103 void rp_kcache_tracker::unreserve(sel_chan r
) {
104 unsigned sel
= kc_sel(r
);
106 for (unsigned i
= 0; i
< sel_count
; ++i
)
116 bool literal_tracker::try_reserve(alu_node
* n
) {
117 bool need_unreserve
= false;
119 vvec::iterator
I(n
->src
.begin()), E(n
->src
.end());
121 for (; I
!= E
; ++I
) {
123 if (v
->is_literal()) {
124 if (!try_reserve(v
->literal_value
))
127 need_unreserve
= true;
133 if (need_unreserve
&& I
!= n
->src
.begin()) {
138 unreserve(v
->literal_value
);
139 } while (I
!= n
->src
.begin());
144 void literal_tracker::unreserve(alu_node
* n
) {
145 unsigned nsrc
= n
->bc
.op_ptr
->src_count
, i
;
147 for (i
= 0; i
< nsrc
; ++i
) {
148 value
*v
= n
->src
[i
];
150 unreserve(v
->literal_value
);
154 bool literal_tracker::try_reserve(literal l
) {
156 PSC_DUMP( sblog
<< "literal reserve " << l
.u
<< " " << l
.f
<< "\n"; );
158 for (unsigned i
= 0; i
< MAX_ALU_LITERALS
; ++i
) {
162 PSC_DUMP( sblog
<< " reserved new uc = " << uc
[i
] << "\n"; );
164 } else if (lt
[i
] == l
) {
166 PSC_DUMP( sblog
<< " reserved uc = " << uc
[i
] << "\n"; );
170 PSC_DUMP( sblog
<< " failed to reserve literal\n"; );
174 void literal_tracker::unreserve(literal l
) {
176 PSC_DUMP( sblog
<< "literal unreserve " << l
.u
<< " " << l
.f
<< "\n"; );
178 for (unsigned i
= 0; i
< MAX_ALU_LITERALS
; ++i
) {
189 static inline unsigned bs_cycle_vector(unsigned bs
, unsigned src
) {
190 static const unsigned swz
[VEC_NUM
][3] = {
191 {0, 1, 2}, {0, 2, 1}, {1, 2, 0}, {1, 0, 2}, {2, 0, 1}, {2, 1, 0}
193 assert(bs
< VEC_NUM
&& src
< 3);
197 static inline unsigned bs_cycle_scalar(unsigned bs
, unsigned src
) {
198 static const unsigned swz
[SCL_NUM
][3] = {
199 {2, 1, 0}, {1, 2, 2}, {2, 1, 2}, {2, 2, 1}
202 if (bs
>= SCL_NUM
|| src
>= 3) {
203 // this prevents gcc warning "array subscript is above array bounds"
204 // AFAICS we should never hit this path
210 static inline unsigned bs_cycle(bool trans
, unsigned bs
, unsigned src
) {
211 return trans
? bs_cycle_scalar(bs
, src
) : bs_cycle_vector(bs
, src
);
215 bool rp_gpr_tracker::try_reserve(unsigned cycle
, unsigned sel
, unsigned chan
) {
217 if (rp
[cycle
][chan
] == 0) {
218 rp
[cycle
][chan
] = sel
;
221 } else if (rp
[cycle
][chan
] == sel
) {
229 void rp_gpr_tracker::unreserve(alu_node
* n
) {
230 unsigned nsrc
= n
->bc
.op_ptr
->src_count
, i
;
231 unsigned trans
= n
->bc
.slot
== SLOT_TRANS
;
232 unsigned bs
= n
->bc
.bank_swizzle
;
233 unsigned opt
= !trans
234 && n
->bc
.src
[0].sel
== n
->bc
.src
[1].sel
235 && n
->bc
.src
[0].chan
== n
->bc
.src
[1].chan
;
237 for (i
= 0; i
< nsrc
; ++i
) {
238 value
*v
= n
->src
[i
];
239 if (v
->is_readonly())
243 unsigned cycle
= bs_cycle(trans
, bs
, i
);
244 unreserve(cycle
, n
->bc
.src
[i
].sel
, n
->bc
.src
[i
].chan
);
249 void rp_gpr_tracker::unreserve(unsigned cycle
, unsigned sel
, unsigned chan
) {
251 assert(rp
[cycle
][chan
] == sel
&& uc
[cycle
][chan
]);
252 if (--uc
[cycle
][chan
] == 0)
257 bool rp_gpr_tracker::try_reserve(alu_node
* n
) {
258 unsigned nsrc
= n
->bc
.op_ptr
->src_count
, i
;
259 unsigned trans
= n
->bc
.slot
== SLOT_TRANS
;
260 unsigned bs
= n
->bc
.bank_swizzle
;
261 unsigned opt
= !trans
&& nsrc
>= 2 &&
262 n
->src
[0] == n
->src
[1];
264 bool need_unreserve
= false;
265 unsigned const_count
= 0, min_gpr_cycle
= 3;
267 for (i
= 0; i
< nsrc
; ++i
) {
268 value
*v
= n
->src
[i
];
269 if (v
->is_readonly()) {
271 if (trans
&& const_count
== 3)
277 unsigned cycle
= bs_cycle(trans
, bs
, i
);
279 if (trans
&& cycle
< min_gpr_cycle
)
280 min_gpr_cycle
= cycle
;
282 if (const_count
&& cycle
< const_count
&& trans
)
285 if (!try_reserve(cycle
, n
->bc
.src
[i
].sel
, n
->bc
.src
[i
].chan
))
288 need_unreserve
= true;
292 if ((i
== nsrc
) && (min_gpr_cycle
+ 1 > const_count
))
295 if (need_unreserve
&& i
--) {
297 value
*v
= n
->src
[i
];
298 if (!v
->is_readonly()) {
301 unreserve(bs_cycle(trans
, bs
, i
), n
->bc
.src
[i
].sel
,
309 alu_group_tracker::alu_group_tracker(shader
&sh
)
311 gpr(), lt(), slots(),
312 max_slots(sh
.get_ctx().is_cayman() ? 4 : 5),
313 has_mova(), uses_ar(), has_predset(), has_kill(),
314 updates_exec_mask(), chan_count(), interp_param(), next_id() {
316 available_slots
= sh
.get_ctx().has_trans
? 0x1F : 0x0F;
320 sel_chan
alu_group_tracker::get_value_id(value
* v
) {
321 unsigned &id
= vmap
[v
];
324 return sel_chan(id
, v
->get_final_chan());
328 void alu_group_tracker::assign_slot(unsigned slot
, alu_node
* n
) {
331 available_slots
&= ~(1 << slot
);
333 unsigned param
= n
->interp_param();
336 assert(!interp_param
|| interp_param
== param
);
337 interp_param
= param
;
342 void alu_group_tracker::discard_all_slots(container_node
&removed_nodes
) {
343 PSC_DUMP( sblog
<< "agt::discard_all_slots\n"; );
344 discard_slots(~available_slots
& ((1 << max_slots
) - 1), removed_nodes
);
347 void alu_group_tracker::discard_slots(unsigned slot_mask
,
348 container_node
&removed_nodes
) {
351 sblog
<< "discard_slots : packed_ops : "
352 << (unsigned)packed_ops
.size() << "\n";
355 for (node_vec::iterator N
, I
= packed_ops
.begin();
356 I
!= packed_ops
.end(); I
= N
) {
359 alu_packed_node
*n
= static_cast<alu_packed_node
*>(*I
);
360 unsigned pslots
= n
->get_slot_mask();
363 sblog
<< "discard_slots : packed slot_mask : " << pslots
<< "\n";
366 if (pslots
& slot_mask
) {
369 sblog
<< "discard_slots : discarding packed...\n";
372 removed_nodes
.push_back(n
);
373 slot_mask
&= ~pslots
;
374 N
= packed_ops
.erase(I
);
375 available_slots
|= pslots
;
376 for (unsigned k
= 0; k
< max_slots
; ++k
) {
377 if (pslots
& (1 << k
))
383 for (unsigned slot
= 0; slot
< max_slots
; ++slot
) {
384 unsigned slot_bit
= 1 << slot
;
386 if (slot_mask
& slot_bit
) {
387 assert(!(available_slots
& slot_bit
));
390 assert(!(slots
[slot
]->bc
.slot_flags
& AF_4SLOT
));
393 sblog
<< "discarding slot " << slot
<< " : ";
394 dump::dump_op(slots
[slot
]);
398 removed_nodes
.push_back(slots
[slot
]);
400 available_slots
|= slot_bit
;
404 alu_node
*t
= slots
[4];
405 if (t
&& (t
->bc
.slot_flags
& AF_V
)) {
406 unsigned chan
= t
->bc
.dst_chan
;
411 sblog
<< " from trans slot to free slot " << chan
<< "\n";
423 alu_group_node
* alu_group_tracker::emit() {
425 alu_group_node
*g
= sh
.create_alu_group();
427 lt
.init_group_literals(g
);
429 for (unsigned i
= 0; i
< max_slots
; ++i
) {
430 alu_node
*n
= slots
[i
];
438 bool alu_group_tracker::try_reserve(alu_node
* n
) {
439 unsigned nsrc
= n
->bc
.op_ptr
->src_count
;
440 unsigned slot
= n
->bc
.slot
;
441 bool trans
= slot
== 4;
446 unsigned flags
= n
->bc
.op_ptr
->flags
;
448 unsigned param
= n
->interp_param();
450 if (param
&& interp_param
&& interp_param
!= param
)
453 if ((flags
& AF_KILL
) && has_predset
)
455 if ((flags
& AF_ANY_PRED
) && (has_kill
|| has_predset
))
457 if ((flags
& AF_MOVA
) && (has_mova
|| uses_ar
))
460 if (n
->uses_ar() && has_mova
)
463 for (unsigned i
= 0; i
< nsrc
; ++i
) {
465 unsigned last_id
= next_id
;
467 value
*v
= n
->src
[i
];
468 if (!v
->is_any_gpr() && !v
->is_rel())
470 sel_chan vid
= get_value_id(n
->src
[i
]);
472 if (vid
> last_id
&& chan_count
[vid
.chan()] == 3) {
476 n
->bc
.src
[i
].sel
= vid
.sel();
477 n
->bc
.src
[i
].chan
= vid
.chan();
480 if (!lt
.try_reserve(n
))
483 if (!kc
.try_reserve(n
)) {
488 unsigned fbs
= n
->forced_bank_swizzle();
490 n
->bc
.bank_swizzle
= 0;
493 n
->bc
.bank_swizzle
= VEC_210
;
495 if (gpr
.try_reserve(n
)) {
496 assign_slot(slot
, n
);
501 unsigned swz_num
= trans
? SCL_NUM
: VEC_NUM
;
502 for (unsigned bs
= 0; bs
< swz_num
; ++bs
) {
503 n
->bc
.bank_swizzle
= bs
;
504 if (gpr
.try_reserve(n
)) {
505 assign_slot(slot
, n
);
514 unsigned forced_swz_slots
= 0;
515 int first_slot
= ~0, first_nf
= ~0, last_slot
= ~0;
518 for (unsigned i
= 0; i
< max_slots
; ++i
) {
519 alu_node
*a
= slots
[i
];
521 if (first_slot
== ~0)
524 save_bs
[i
] = a
->bc
.bank_swizzle
;
525 if (a
->forced_bank_swizzle()) {
526 assert(i
!= SLOT_TRANS
);
527 forced_swz_slots
|= (1 << i
);
528 a
->bc
.bank_swizzle
= VEC_210
;
529 if (!gpr
.try_reserve(a
))
530 assert("!internal reservation error");
535 a
->bc
.bank_swizzle
= 0;
540 if (first_nf
== ~0) {
541 assign_slot(slot
, n
);
545 assert(first_slot
!= ~0 && last_slot
!= ~0);
547 // silence "array subscript is above array bounds" with gcc 4.8
552 alu_node
*a
= slots
[i
];
553 bool backtrack
= false;
558 sblog
<< " bs: trying s" << i
<< " bs:" << a
->bc
.bank_swizzle
559 << " bt:" << backtrack
<< "\n";
562 if (!backtrack
&& gpr
.try_reserve(a
)) {
564 sblog
<< " bs: reserved s" << i
<< " bs:" << a
->bc
.bank_swizzle
568 while ((++i
<= last_slot
) && !slots
[i
]);
574 bool itrans
= i
== SLOT_TRANS
;
575 unsigned max_swz
= itrans
? SCL_221
: VEC_210
;
577 if (a
->bc
.bank_swizzle
< max_swz
) {
578 ++a
->bc
.bank_swizzle
;
581 sblog
<< " bs: inc s" << i
<< " bs:" << a
->bc
.bank_swizzle
587 a
->bc
.bank_swizzle
= 0;
588 while ((--i
>= first_nf
) && !slots
[i
]);
593 sblog
<< " bs: unreserve s" << i
<< " bs:" << a
->bc
.bank_swizzle
605 if (i
== last_slot
+ 1) {
606 assign_slot(slot
, n
);
610 // reservation failed, restore previous state
613 for (unsigned i
= 0; i
< max_slots
; ++i
) {
614 alu_node
*a
= slots
[i
];
616 a
->bc
.bank_swizzle
= save_bs
[i
];
617 bool b
= gpr
.try_reserve(a
);
627 bool alu_group_tracker::try_reserve(alu_packed_node
* p
) {
628 bool need_unreserve
= false;
629 node_iterator
I(p
->begin()), E(p
->end());
631 for (; I
!= E
; ++I
) {
632 alu_node
*n
= static_cast<alu_node
*>(*I
);
636 need_unreserve
= true;
640 packed_ops
.push_back(p
);
644 if (need_unreserve
) {
646 alu_node
*n
= static_cast<alu_node
*>(*I
);
647 slots
[n
->bc
.slot
] = NULL
;
654 void alu_group_tracker::reinit() {
656 memcpy(s
, slots
, sizeof(slots
));
660 for (int i
= max_slots
- 1; i
>= 0; --i
) {
661 if (s
[i
] && !try_reserve(s
[i
])) {
662 sblog
<< "alu_group_tracker: reinit error on slot " << i
<< "\n";
663 for (unsigned i
= 0; i
< max_slots
; ++i
) {
664 sblog
<< " slot " << i
<< " : ";
670 assert(!"alu_group_tracker: reinit error");
675 void alu_group_tracker::reset(bool keep_packed
) {
679 memset(slots
, 0, sizeof(slots
));
686 updates_exec_mask
= false;
687 available_slots
= sh
.get_ctx().has_trans
? 0x1F : 0x0F;
699 void alu_group_tracker::update_flags(alu_node
* n
) {
700 unsigned flags
= n
->bc
.op_ptr
->flags
;
701 has_kill
|= (flags
& AF_KILL
);
702 has_mova
|= (flags
& AF_MOVA
);
703 has_predset
|= (flags
& AF_ANY_PRED
);
704 uses_ar
|= n
->uses_ar();
706 if (flags
& AF_ANY_PRED
) {
707 if (n
->dst
[2] != NULL
)
708 updates_exec_mask
= true;
712 int post_scheduler::run() {
717 void post_scheduler::run_on(container_node
* n
) {
719 for (node_riterator I
= n
->rbegin(), E
= n
->rend(); I
!= E
; ++I
) {
720 if (I
->is_container()) {
721 if (I
->subtype
== NST_BB
) {
722 bb_node
* bb
= static_cast<bb_node
*>(*I
);
725 run_on(static_cast<container_node
*>(*I
));
731 void post_scheduler::init_uc_val(container_node
*c
, value
*v
) {
732 node
*d
= v
->any_def();
733 if (d
&& d
->parent
== c
)
737 void post_scheduler::init_uc_vec(container_node
*c
, vvec
&vv
, bool src
) {
738 for (vvec::iterator I
= vv
.begin(), E
= vv
.end(); I
!= E
; ++I
) {
740 if (!v
|| v
->is_readonly())
744 init_uc_val(c
, v
->rel
);
745 init_uc_vec(c
, v
->muse
, true);
752 unsigned post_scheduler::init_ucm(container_node
*c
, node
*n
) {
753 init_uc_vec(c
, n
->src
, true);
754 init_uc_vec(c
, n
->dst
, false);
756 uc_map::iterator F
= ucm
.find(n
);
757 return F
== ucm
.end() ? 0 : F
->second
;
760 void post_scheduler::schedule_bb(bb_node
* bb
) {
762 sblog
<< "scheduling BB " << bb
->id
<< "\n";
763 if (!pending
.empty())
764 dump::dump_op_list(&pending
);
767 assert(pending
.empty());
768 assert(bb_pending
.empty());
769 assert(ready
.empty());
771 bb_pending
.append_from(bb
);
776 while ((n
= bb_pending
.back())) {
779 sblog
<< "post_sched_bb ";
784 if (n
->subtype
== NST_ALU_CLAUSE
) {
786 process_alu(static_cast<container_node
*>(n
));
797 void post_scheduler::init_regmap() {
802 sblog
<< "init_regmap: live: ";
803 dump::dump_set(sh
, live
);
807 for (val_set::iterator I
= live
.begin(sh
), E
= live
.end(sh
); I
!= E
; ++I
) {
810 if (!v
->is_sgpr() || !v
->is_prealloc())
816 sblog
<< "init_regmap: " << r
<< " <= ";
826 void post_scheduler::process_alu(container_node
*c
) {
831 live
= c
->live_after
;
833 init_globals(c
->live_after
, true);
834 init_globals(c
->live_before
, true);
838 update_local_interferences();
840 for (node_riterator N
, I
= c
->rbegin(), E
= c
->rend(); I
!= E
; I
= N
) {
845 unsigned uc
= init_ucm(c
, n
);
848 sblog
<< "process_alu uc=" << uc
<< " ";
855 pending
.push_back(n
);
856 PSC_DUMP( sblog
<< "pending\n"; );
865 void post_scheduler::update_local_interferences() {
868 sblog
<< "update_local_interferences : ";
869 dump::dump_set(sh
, live
);
874 for (val_set::iterator I
= live
.begin(sh
), E
= live
.end(sh
); I
!= E
; ++I
) {
876 if (v
->is_prealloc())
879 v
->interferences
.add_set(live
);
883 void post_scheduler::update_live_src_vec(vvec
&vv
, val_set
*born
, bool src
) {
884 for (vvec::iterator I
= vv
.begin(), E
= vv
.end(); I
!= E
; ++I
) {
890 if (src
&& v
->is_any_gpr()) {
891 if (live
.add_val(v
)) {
892 if (!v
->is_prealloc()) {
893 if (!cleared_interf
.contains(v
)) {
895 sblog
<< "clearing interferences for " << *v
<< "\n";
897 v
->interferences
.clear();
898 cleared_interf
.add_val(v
);
904 } else if (v
->is_rel()) {
905 if (!v
->rel
->is_any_gpr())
906 live
.add_val(v
->rel
);
907 update_live_src_vec(v
->muse
, born
, true);
912 void post_scheduler::update_live_dst_vec(vvec
&vv
) {
913 for (vvec::iterator I
= vv
.begin(), E
= vv
.end(); I
!= E
; ++I
) {
919 update_live_dst_vec(v
->mdef
);
920 } else if (v
->is_any_gpr()) {
921 if (!live
.remove_val(v
)) {
923 sblog
<< "failed to remove ";
925 sblog
<< " from live : ";
926 dump::dump_set(sh
, live
);
934 void post_scheduler::update_live(node
*n
, val_set
*born
) {
935 update_live_dst_vec(n
->dst
);
936 update_live_src_vec(n
->src
, born
, true);
937 update_live_src_vec(n
->dst
, born
, false);
940 void post_scheduler::process_group() {
941 alu_group_tracker
&rt
= alu
.grp();
948 sblog
<< "process_group: live_before : ";
949 dump::dump_set(sh
, live
);
953 for (unsigned s
= 0; s
< ctx
.num_slots
; ++s
) {
954 alu_node
*n
= rt
.slot(s
);
958 update_live(n
, &vals_born
);
962 sblog
<< "process_group: live_after : ";
963 dump::dump_set(sh
, live
);
967 update_local_interferences();
969 for (unsigned i
= 0; i
< 5; ++i
) {
970 node
*n
= rt
.slot(i
);
971 if (n
&& !n
->is_mova()) {
972 release_src_values(n
);
977 void post_scheduler::init_globals(val_set
&s
, bool prealloc
) {
980 sblog
<< "init_globals: ";
981 dump::dump_set(sh
, s
);
985 for (val_set::iterator I
= s
.begin(sh
), E
= s
.end(sh
); I
!= E
; ++I
) {
987 if (v
->is_sgpr() && !v
->is_global()) {
990 if (prealloc
&& v
->is_fixed()) {
997 void post_scheduler::emit_clause() {
999 if (alu
.current_ar
) {
1005 alu
.emit_clause(cur_bb
);
1008 void post_scheduler::schedule_alu(container_node
*c
) {
1010 assert(!ready
.empty() || !ready_copies
.empty());
1014 prev_regmap
= regmap
;
1016 if (!prepare_alu_group()) {
1017 if (alu
.current_ar
) {
1024 if (!alu
.check_clause_limits()) {
1025 regmap
= prev_regmap
;
1027 init_globals(live
, false);
1035 if (!alu
.is_empty()) {
1039 if (!ready
.empty()) {
1040 sblog
<< "##post_scheduler: unscheduled ready instructions :";
1041 dump::dump_op_list(&ready
);
1042 assert(!"unscheduled ready instructions");
1045 if (!pending
.empty()) {
1046 sblog
<< "##post_scheduler: unscheduled pending instructions :";
1047 dump::dump_op_list(&pending
);
1048 assert(!"unscheduled pending instructions");
1052 void post_scheduler::add_interferences(value
*v
, sb_bitset
&rb
, val_set
&vs
) {
1053 unsigned chan
= v
->gpr
.chan();
1055 for (val_set::iterator I
= vs
.begin(sh
), E
= vs
.end(sh
);
1058 sel_chan gpr
= vi
->get_final_gpr();
1060 if (vi
->is_any_gpr() && gpr
&& vi
!= v
&&
1061 (!v
->chunk
|| v
->chunk
!= vi
->chunk
) &&
1062 vi
->is_fixed() && gpr
.chan() == chan
) {
1064 unsigned r
= gpr
.sel();
1067 sblog
<< "\tadd_interferences: " << *vi
<< "\n";
1077 void post_scheduler::set_color_local_val(value
*v
, sel_chan color
) {
1081 sblog
<< " recolored: ";
1087 void post_scheduler::set_color_local(value
*v
, sel_chan color
) {
1089 vvec
&vv
= v
->chunk
->values
;
1090 for (vvec::iterator I
= vv
.begin(), E
= vv
.end(); I
!= E
; ++I
) {
1092 set_color_local_val(v2
, color
);
1096 set_color_local_val(v
, color
);
1101 bool post_scheduler::recolor_local(value
*v
) {
1105 assert(v
->is_sgpr());
1106 assert(!v
->is_prealloc());
1109 unsigned chan
= v
->gpr
.chan();
1112 sblog
<< "recolor_local: ";
1114 sblog
<< " interferences: ";
1115 dump::dump_set(sh
, v
->interferences
);
1118 sblog
<< " in chunk: ";
1119 coalescer::dump_chunk(v
->chunk
);
1125 for (vvec::iterator I
= v
->chunk
->values
.begin(),
1126 E
= v
->chunk
->values
.end(); I
!= E
; ++I
) {
1129 PSC_DUMP( sblog
<< " add_interferences for " << *v2
<< " :\n"; );
1131 add_interferences(v
, rb
, v2
->interferences
);
1134 add_interferences(v
, rb
, v
->interferences
);
1138 unsigned sz
= rb
.size();
1139 sblog
<< "registers bits: " << sz
;
1140 for (unsigned r
= 0; r
< sz
; ++r
) {
1142 sblog
<< "\n " << r
<< " ";
1143 sblog
<< (rb
.get(r
) ? 1 : 0);
1147 bool no_temp_gprs
= v
->is_global();
1148 unsigned rs
, re
, pass
= no_temp_gprs
? 1 : 0;
1153 rs
= sh
.first_temp_gpr();
1157 re
= sh
.num_nontemp_gpr();
1160 for (unsigned reg
= rs
; reg
< re
; ++reg
) {
1161 if (reg
>= rb
.size() || !rb
.get(reg
)) {
1163 set_color_local(v
, sel_chan(reg
, chan
));
1170 assert(!"recolor_local failed");
1174 void post_scheduler::emit_load_ar() {
1176 regmap
= prev_regmap
;
1177 alu
.discard_current_group();
1179 alu_group_tracker
&rt
= alu
.grp();
1180 alu_node
*a
= alu
.create_ar_load();
1182 if (!rt
.try_reserve(a
)) {
1183 sblog
<< "can't emit AR load : ";
1191 bool post_scheduler::unmap_dst_val(value
*d
) {
1193 if (d
== alu
.current_ar
) {
1198 if (d
->is_prealloc()) {
1199 sel_chan gpr
= d
->get_final_gpr();
1200 rv_map::iterator F
= regmap
.find(gpr
);
1202 if (F
!= regmap
.end())
1205 if (c
&& c
!=d
&& (!c
->chunk
|| c
->chunk
!= d
->chunk
)) {
1207 sblog
<< "dst value conflict : ";
1209 sblog
<< " regmap contains ";
1213 assert(!"scheduler error");
1222 bool post_scheduler::unmap_dst(alu_node
*n
) {
1223 value
*d
= n
->dst
.empty() ? NULL
: n
->dst
[0];
1229 if (d
&& d
->is_any_reg()) {
1232 if (alu
.current_ar
!= d
) {
1233 sblog
<< "loading wrong ar value\n";
1236 alu
.current_ar
= NULL
;
1239 } else if (d
->is_any_gpr()) {
1240 if (!unmap_dst_val(d
))
1245 for (vvec::iterator I
= d
->mdef
.begin(), E
= d
->mdef
.end();
1251 assert(d
->is_any_gpr());
1253 if (!unmap_dst_val(d
))
1260 bool post_scheduler::map_src_val(value
*v
) {
1262 if (!v
->is_prealloc())
1265 sel_chan gpr
= v
->get_final_gpr();
1266 rv_map::iterator F
= regmap
.find(gpr
);
1268 if (F
!= regmap
.end()) {
1270 if (!v
->v_equal(c
)) {
1272 sblog
<< "can't map src value ";
1274 sblog
<< ", regmap contains ";
1281 regmap
.insert(std::make_pair(gpr
, v
));
1286 bool post_scheduler::map_src_vec(vvec
&vv
, bool src
) {
1287 for (vvec::iterator I
= vv
.begin(), E
= vv
.end(); I
!= E
; ++I
) {
1292 if ((!v
->is_any_gpr() || !v
->is_fixed()) && !v
->is_rel())
1296 value
*rel
= v
->rel
;
1299 if (!rel
->is_const()) {
1300 if (!map_src_vec(v
->muse
, true))
1303 if (rel
!= alu
.current_ar
) {
1304 if (alu
.current_ar
) {
1306 sblog
<< " current_AR is " << *alu
.current_ar
1307 << " trying to use " << *rel
<< "\n";
1312 alu
.current_ar
= rel
;
1315 sblog
<< " new current_AR assigned: " << *alu
.current_ar
1322 if (!map_src_val(v
)) {
1330 bool post_scheduler::map_src(alu_node
*n
) {
1331 if (!map_src_vec(n
->dst
, false))
1334 if (!map_src_vec(n
->src
, true))
1340 void post_scheduler::dump_regmap() {
1342 sblog
<< "# REGMAP :\n";
1344 for(rv_map::iterator I
= regmap
.begin(), E
= regmap
.end(); I
!= E
; ++I
) {
1345 sblog
<< " # " << I
->first
<< " => " << *(I
->second
) << "\n";
1349 sblog
<< " current_AR: " << *alu
.current_ar
<< "\n";
1351 sblog
<< " current_PR: " << *alu
.current_pr
<< "\n";
1354 void post_scheduler::recolor_locals() {
1355 alu_group_tracker
&rt
= alu
.grp();
1357 for (unsigned s
= 0; s
< ctx
.num_slots
; ++s
) {
1358 alu_node
*n
= rt
.slot(s
);
1360 value
*d
= n
->dst
[0];
1361 if (d
&& d
->is_sgpr() && !d
->is_prealloc()) {
1368 // returns true if there are interferences
1369 bool post_scheduler::check_interferences() {
1371 alu_group_tracker
&rt
= alu
.grp();
1373 unsigned interf_slots
;
1375 bool discarded
= false;
1378 sblog
<< "check_interferences: before: \n";
1386 for (unsigned s
= 0; s
< ctx
.num_slots
; ++s
) {
1387 alu_node
*n
= rt
.slot(s
);
1389 if (!unmap_dst(n
)) {
1395 for (unsigned s
= 0; s
< ctx
.num_slots
; ++s
) {
1396 alu_node
*n
= rt
.slot(s
);
1399 interf_slots
|= (1 << s
);
1405 for (unsigned i
= 0; i
< 5; ++i
) {
1406 if (interf_slots
& (1 << i
)) {
1407 sblog
<< "!!!!!! interf slot: " << i
<< " : ";
1408 dump::dump_op(rt
.slot(i
));
1417 PSC_DUMP( sblog
<< "ci: discarding slots " << interf_slots
<< "\n"; );
1419 rt
.discard_slots(interf_slots
, alu
.conflict_nodes
);
1420 regmap
= prev_regmap
;
1426 sblog
<< "check_interferences: after: \n";
1433 // add instruction(s) (alu_node or contents of alu_packed_node) to current group
1434 // returns the number of added instructions on success
1435 unsigned post_scheduler::try_add_instruction(node
*n
) {
1437 alu_group_tracker
&rt
= alu
.grp();
1439 unsigned avail_slots
= rt
.avail_slots();
1441 if (n
->is_alu_packed()) {
1442 alu_packed_node
*p
= static_cast<alu_packed_node
*>(n
);
1443 unsigned slots
= p
->get_slot_mask();
1444 unsigned cnt
= __builtin_popcount(slots
);
1446 if ((slots
& avail_slots
) != slots
) {
1447 PSC_DUMP( sblog
<< " no slots \n"; );
1451 p
->update_packed_items(ctx
);
1453 if (!rt
.try_reserve(p
)) {
1454 PSC_DUMP( sblog
<< " reservation failed \n"; );
1462 alu_node
*a
= static_cast<alu_node
*>(n
);
1463 value
*d
= a
->dst
.empty() ? NULL
: a
->dst
[0];
1465 if (d
&& d
->is_special_reg()) {
1466 assert(a
->bc
.op_ptr
->flags
& AF_MOVA
);
1470 unsigned allowed_slots
= ctx
.alu_slots_mask(a
->bc
.op_ptr
);
1473 allowed_slots
&= avail_slots
;
1479 slot
= d
->get_final_chan();
1480 a
->bc
.dst_chan
= slot
;
1481 allowed_slots
&= (1 << slot
) | 0x10;
1483 if (a
->bc
.op_ptr
->flags
& AF_MOVA
) {
1484 if (a
->bc
.slot_flags
& AF_V
)
1485 allowed_slots
&= (1 << SLOT_X
);
1487 allowed_slots
&= (1 << SLOT_TRANS
);
1491 // FIXME workaround for some problems with MULADD in trans slot on r700,
1492 // (is it really needed on r600?)
1493 if (a
->bc
.op
== ALU_OP3_MULADD
&& !ctx
.is_egcm()) {
1494 allowed_slots
&= 0x0F;
1497 if (!allowed_slots
) {
1498 PSC_DUMP( sblog
<< " no suitable slots\n"; );
1502 slot
= __builtin_ctz(allowed_slots
);
1505 PSC_DUMP( sblog
<< "slot: " << slot
<< "\n"; );
1507 if (!rt
.try_reserve(a
)) {
1508 PSC_DUMP( sblog
<< " reservation failed\n"; );
1517 bool post_scheduler::check_copy(node
*n
) {
1518 if (!n
->is_copy_mov())
1521 value
*s
= n
->src
[0];
1522 value
*d
= n
->dst
[0];
1524 if (!s
->is_sgpr() || !d
->is_sgpr())
1527 if (!s
->is_prealloc()) {
1531 if (s
->gpr
== d
->gpr
) {
1534 sblog
<< "check_copy: ";
1539 rv_map::iterator F
= regmap
.find(d
->gpr
);
1540 bool gpr_free
= (F
== regmap
.end());
1542 if (d
->is_prealloc()) {
1544 PSC_DUMP( sblog
<< " copy not ready...\n";);
1548 value
*rv
= F
->second
;
1549 if (rv
!= d
&& (!rv
->chunk
|| rv
->chunk
!= d
->chunk
)) {
1550 PSC_DUMP( sblog
<< " copy not ready(2)...\n";);
1554 unmap_dst(static_cast<alu_node
*>(n
));
1557 if (s
->is_prealloc() && !map_src_val(s
))
1560 update_live(n
, NULL
);
1562 release_src_values(n
);
1564 PSC_DUMP( sblog
<< " copy coalesced...\n";);
1570 void post_scheduler::dump_group(alu_group_tracker
&rt
) {
1571 for (unsigned i
= 0; i
< 5; ++i
) {
1572 node
*n
= rt
.slot(i
);
1574 sblog
<< "slot " << i
<< " : ";
1581 void post_scheduler::process_ready_copies() {
1586 last
= ready_copies
.back();
1588 for (node_iterator N
, I
= ready_copies
.begin(), E
= ready_copies
.end();
1594 if (!check_copy(n
)) {
1599 } while (last
!= ready_copies
.back());
1601 update_local_interferences();
1605 bool post_scheduler::prepare_alu_group() {
1607 alu_group_tracker
&rt
= alu
.grp();
1612 sblog
<< "prepare_alu_group: starting...\n";
1616 ready
.append_from(&alu
.conflict_nodes
);
1618 // FIXME rework this loop
1622 process_ready_copies();
1626 for (node_iterator N
, I
= ready
.begin(), E
= ready
.end(); I
!= E
;
1638 unsigned cnt
= try_add_instruction(n
);
1644 sblog
<< "current group:\n";
1648 if (rt
.inst_count() == ctx
.num_slots
) {
1649 PSC_DUMP( sblog
<< " all slots used\n"; );
1654 if (!check_interferences())
1657 // don't try to add more instructions to the group with mova if this
1658 // can lead to breaking clause slot count limit - we don't want mova to
1659 // end up in the end of the new clause instead of beginning of the
1661 if (rt
.has_ar_load() && alu
.total_slots() > 121)
1664 if (rt
.inst_count() && i1
> 50)
1667 regmap
= prev_regmap
;
1672 sblog
<< " prepare_alu_group done, " << rt
.inst_count()
1675 sblog
<< "$$$$$$$$PAG i1=" << i1
1676 << " ready " << ready
.count()
1677 << " pending " << pending
.count()
1678 << " conflicting " << alu
.conflict_nodes
.count()
1683 return rt
.inst_count();
1686 void post_scheduler::release_src_values(node
* n
) {
1687 release_src_vec(n
->src
, true);
1688 release_src_vec(n
->dst
, false);
1691 void post_scheduler::release_op(node
*n
) {
1693 sblog
<< "release_op ";
1700 if (n
->is_copy_mov()) {
1701 ready_copies
.push_back(n
);
1702 } else if (n
->is_mova() || n
->is_pred_set()) {
1703 ready
.push_front(n
);
1709 void post_scheduler::release_src_val(value
*v
) {
1710 node
*d
= v
->any_def();
1717 void post_scheduler::release_src_vec(vvec
& vv
, bool src
) {
1719 for (vvec::iterator I
= vv
.begin(), E
= vv
.end(); I
!= E
; ++I
) {
1721 if (!v
|| v
->is_readonly())
1725 release_src_val(v
->rel
);
1726 release_src_vec(v
->muse
, true);
1734 void literal_tracker::reset() {
1735 memset(lt
, 0, sizeof(lt
));
1736 memset(uc
, 0, sizeof(uc
));
1739 void rp_gpr_tracker::reset() {
1740 memset(rp
, 0, sizeof(rp
));
1741 memset(uc
, 0, sizeof(uc
));
1744 void rp_kcache_tracker::reset() {
1745 memset(rp
, 0, sizeof(rp
));
1746 memset(uc
, 0, sizeof(uc
));
1749 void alu_kcache_tracker::reset() {
1750 memset(kc
, 0, sizeof(kc
));
1754 void alu_clause_tracker::reset() {
1761 alu_clause_tracker::alu_clause_tracker(shader
&sh
)
1762 : sh(sh
), kt(sh
.get_ctx().hw_class
), slot_count(),
1766 current_ar(), current_pr() {}
1768 void alu_clause_tracker::emit_group() {
1770 assert(grp().inst_count());
1772 alu_group_node
*g
= grp().emit();
1774 if (grp().has_update_exec_mask()) {
1775 assert(!push_exec_mask
);
1776 push_exec_mask
= true;
1782 clause
= sh
.create_clause(NST_ALU_CLAUSE
);
1785 clause
->push_front(g
);
1787 slot_count
+= grp().slot_count();
1791 PSC_DUMP( sblog
<< " #### group emitted\n"; );
1794 void alu_clause_tracker::emit_clause(container_node
*c
) {
1797 kt
.init_clause(clause
->bc
);
1799 assert(!current_ar
);
1800 assert(!current_pr
);
1803 clause
->bc
.set_op(CF_OP_ALU_PUSH_BEFORE
);
1805 c
->push_front(clause
);
1808 push_exec_mask
= false;
1812 PSC_DUMP( sblog
<< "######### ALU clause emitted\n"; );
1815 bool alu_clause_tracker::check_clause_limits() {
1817 alu_group_tracker
>
= grp();
1819 unsigned slots
= gt
.slot_count();
1821 // reserving slots to load AR and PR values
1822 unsigned reserve_slots
= (current_ar
? 1 : 0) + (current_pr
? 1 : 0);
1824 if (slot_count
+ slots
> MAX_ALU_SLOTS
- reserve_slots
)
1827 if (!kt
.try_reserve(gt
))
1833 void alu_clause_tracker::new_group() {
1838 bool alu_clause_tracker::is_empty() {
1839 return clause
== NULL
;
1842 void literal_tracker::init_group_literals(alu_group_node
* g
) {
1844 g
->literals
.clear();
1845 for (unsigned i
= 0; i
< 4; ++i
) {
1849 g
->literals
.push_back(lt
[i
]);
1852 sblog
<< "literal emitted: " << lt
[i
].f
;
1853 sblog
.print_zw_hex(lt
[i
].u
, 8);
1854 sblog
<< " " << lt
[i
].i
<< "\n";
1859 bool alu_kcache_tracker::try_reserve(alu_group_tracker
& gt
) {
1860 rp_kcache_tracker
&kt
= gt
.kcache();
1865 sb_set
<unsigned> group_lines
;
1867 unsigned nl
= kt
.get_lines(group_lines
);
1870 sb_set
<unsigned> clause_lines(lines
);
1871 lines
.add_set(group_lines
);
1873 if (clause_lines
.size() == lines
.size())
1879 lines
= clause_lines
;
1884 unsigned rp_kcache_tracker::get_lines(kc_lines
& lines
) {
1887 for (unsigned i
= 0; i
< sel_count
; ++i
) {
1888 unsigned line
= rp
[i
];
1894 line
= (sel_count
== 2) ? line
>> 5 : line
>> 6;
1896 if (lines
.insert(line
).second
)
1902 bool alu_kcache_tracker::update_kc() {
1905 bc_kcache old_kc
[4];
1906 memcpy(old_kc
, kc
, sizeof(kc
));
1908 for (kc_lines::iterator I
= lines
.begin(), E
= lines
.end(); I
!= E
; ++I
) {
1910 unsigned bank
= line
>> 8;
1914 if (c
&& (bank
== kc
[c
-1].bank
) && (kc
[c
-1].addr
+ 1 == line
))
1918 memcpy(kc
, old_kc
, sizeof(kc
));
1922 kc
[c
].mode
= KC_LOCK_1
;
1932 alu_node
* alu_clause_tracker::create_ar_load() {
1933 alu_node
*a
= sh
.create_alu();
1935 // FIXME use MOVA_GPR on R6xx
1937 if (sh
.get_ctx().uses_mova_gpr
) {
1938 a
->bc
.set_op(ALU_OP1_MOVA_GPR_INT
);
1939 a
->bc
.slot
= SLOT_TRANS
;
1941 a
->bc
.set_op(ALU_OP1_MOVA_INT
);
1942 a
->bc
.slot
= SLOT_X
;
1946 a
->src
.push_back(current_ar
);
1949 sblog
<< "created AR load: ";
1957 void alu_clause_tracker::discard_current_group() {
1958 PSC_DUMP( sblog
<< "act::discard_current_group\n"; );
1959 grp().discard_all_slots(conflict_nodes
);
1962 void rp_gpr_tracker::dump() {
1963 sblog
<< "=== gpr_tracker dump:\n";
1964 for (int c
= 0; c
< 3; ++c
) {
1965 sblog
<< "cycle " << c
<< " ";
1966 for (int h
= 0; h
< 4; ++h
) {
1967 sblog
<< rp
[c
][h
] << ":" << uc
[c
][h
] << " ";
1973 } // namespace r600_sb