2 * Copyright 2013 Vadim Girlin <vadimgirlin@gmail.com>
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
// Debug-output helper: the argument is expanded inside do { ... } while (0)
// so that a use of the macro behaves like a single statement.
30 #define RA_DUMP(q) do { q } while (0)
38 #include "sb_shader.h"
// Storage word of the register bitmap.
44 typedef uint32_t basetype
;
// Size of one storage word in bytes.
45 static const unsigned bt_bytes
= sizeof(basetype
);
// A bit index is split into a word index (index >> bt_index_shift) and a
// bit position inside the word (index & bt_index_mask); 1 << 5 == 32 bits.
46 static const unsigned bt_index_shift
= 5;
47 static const unsigned bt_index_mask
= (1u << bt_index_shift
) - 1;
// Number of bits in one storage word (bytes * 8).
48 static const unsigned bt_bits
= bt_bytes
<< 3;
// Number of storage words needed for MAX_GPR * 4 bits
// (presumably one bit per channel of every GPR - confirm against MAX_CHAN).
49 static const unsigned size
= MAX_GPR
* 4 / bt_bits
;
// Construct with value-initialized bit storage (dta()) and remember the
// number of temporary GPRs.
57 regbits(unsigned num_temps
) : dta(), num_temps(num_temps
) {}
// Construct with an explicit initial value; the constructor body is not
// visible in this listing.
58 regbits(unsigned num_temps
, unsigned value
) : num_temps(num_temps
)
// Construct for a shader: num_temps comes from the shader context's
// alu_temp_gprs, every bit is set first (set_all(1)) and then
// from_val_set() is applied for the values in `vs`.
61 regbits(shader
&sh
, val_set
&vs
) : num_temps(sh
.get_ctx().alu_temp_gprs
)
62 { set_all(1); from_val_set(sh
, vs
); }
// Fill the whole bitmap with ones (val != 0) or zeros (val == 0).
64 void set_all(unsigned val
);
// Update the bitmap from the GPR values contained in `vs`.
65 void from_val_set(shader
&sh
, val_set
&vs
);
// Single-bit accessors; `index` is a flat bit index into the bitmap.
67 void set(unsigned index
);
68 void clear(unsigned index
);
69 bool get(unsigned index
);
// Write bit `index` from `val`.
71 void set(unsigned index
, unsigned val
);
// Search helpers; a set bit marks a free register component for RA.
73 sel_chan
find_free_bit();
74 sel_chan
find_free_chans(unsigned mask
);
75 sel_chan
find_free_chan_by_mask(unsigned mask
);
76 sel_chan
find_free_array(unsigned size
, unsigned mask
);
81 // =======================================
// Debug dump of the bitmap through sblog: walks every bit index in
// [0, size * bt_bits), printing i / 4 with width 7 and each bit as 1 or 0.
// The conditions that decide when the index is printed are not visible in
// this listing.
83 void regbits::dump() {
84 for (unsigned i
= 0; i
< size
* bt_bits
; ++i
) {
90 sblog
.print_w(i
/ 4, 7);
94 sblog
<< (get(i
) ? 1 : 0);
// Put every bit of the bitmap into the same state: the storage is filled
// bytewise with 0xFF when `v` is non-zero and with 0x00 otherwise, over
// size * bt_bytes bytes.
99 void regbits::set_all(unsigned v
) {
100 memset(&dta
, v
? 0xFF : 0x00, size
* bt_bytes
);
// Walk a value set and, for every value that is any kind of GPR, look up its
// final GPR. What is then done with that GPR (presumably marking it as
// occupied in the bitmap - confirm) is not visible in this listing.
103 void regbits::from_val_set(shader
&sh
, val_set
& vs
) {
106 for (val_set::iterator I
= s
.begin(sh
), E
= s
.end(sh
); I
!= E
; ++I
) {
108 if (v
->is_any_gpr()) {
109 g
= v
->get_final_gpr();
// Set (mark as 1) the bit at flat position `index`.
122 void regbits::set(unsigned index
) {
// ih: index of the storage word, il: bit position inside that word.
123 unsigned ih
= index
>> bt_index_shift
;
124 unsigned il
= index
& bt_index_mask
;
125 dta
[ih
] |= ((basetype
)1u << il
);
// Clear (mark as 0) the bit at flat position `index`.
128 void regbits::clear(unsigned index
) {
// ih: index of the storage word, il: bit position inside that word.
129 unsigned ih
= index
>> bt_index_shift
;
130 unsigned il
= index
& bt_index_mask
;
132 dta
[ih
] &= ~((basetype
)1u << il
);
// Return true when the bit at flat position `index` is set.
135 bool regbits::get(unsigned index
) {
// ih: index of the storage word, il: bit position inside that word.
136 unsigned ih
= index
>> bt_index_shift
;
137 unsigned il
= index
& bt_index_mask
;
// The masked word converts to bool: any non-zero result means "set".
138 return dta
[ih
] & ((basetype
)1u << il
);
141 void regbits::set(unsigned index
, unsigned val
) {
142 unsigned ih
= index
>> bt_index_shift
;
143 unsigned il
= index
& bt_index_mask
;
144 basetype bm
= 1u << il
;
145 dta
[ih
] = (dta
[ih
] & ~bm
) | (val
<< il
);
148 // free register for ra means the bit is set
// Locate a set (free) bit: storage words that are zero are skipped, then the
// lowest set bit of the first non-zero word is found with __builtin_ctz and
// combined with the word's base index (elt << bt_index_shift). The assert
// requires the bit to lie below (MAX_GPR - num_temps) << 2. The handling of
// "no free bit" and the return statement are not visible in this listing.
149 sel_chan
regbits::find_free_bit() {
153 while (elt
< size
&& !dta
[elt
])
159 bit
= __builtin_ctz(dta
[elt
]) + (elt
<< bt_index_shift
);
161 assert(bit
< ((MAX_GPR
- num_temps
) << 2));
166 // find free gpr component to use as indirectly addressable array
// Scans registers a in [0, MAX_GPR - num_temps) and, for every channel c
// enabled in `mask`, tests bit (a << 2) | c. cc[c] counts free hits per
// channel; once it reaches `length` the result is the first register of the
// run (a - length + 1) in channel c. How cc[c] is reset when a bit is not
// free, and the failure return, are not visible in this listing.
167 sel_chan
regbits::find_free_array(unsigned length
, unsigned mask
) {
170 // FIXME optimize this. though hopefully we won't have a lot of arrays
171 for (unsigned a
= 0; a
< MAX_GPR
- num_temps
; ++a
) {
172 for(unsigned c
= 0; c
< MAX_CHAN
; ++c
) {
173 if (mask
& (1 << c
)) {
174 if (get((a
<< 2) | c
)) {
175 if (++cc
[c
] == length
)
176 return sel_chan(a
- length
+ 1, c
);
// Search for a register whose channels selected by `mask` are all free.
// `mask` may only use its low four bits (asserted). The visible code reads a
// storage word into cd, uses __builtin_ctz(cd) rounded down to a multiple of
// four to jump to the next 4-bit group containing a set bit, and succeeds
// when (cd & mask) == mask, returning the flat bit index of the group plus
// one. The loop structure around these statements is not visible here.
186 sel_chan
regbits::find_free_chans(unsigned mask
) {
190 assert (!(mask
& ~0xF));
191 basetype cd
= dta
[elt
];
// NOTE(review): __builtin_ctz is undefined for 0; the guard that keeps cd
// non-zero is not visible in this listing - confirm.
203 unsigned p
= __builtin_ctz(cd
) & ~(basetype
)3u;
205 assert (p
<= bt_bits
- bit
);
209 if ((cd
& mask
) == mask
) {
210 return ((elt
<< bt_index_shift
) | bit
) + 1;
// Search for a single free channel among those enabled in `mask` (low four
// bits only, asserted). The visible code reads a storage word into cd, jumps
// to the next 4-bit group containing a set bit (__builtin_ctz rounded down
// to a multiple of four), picks the lowest free channel that is also in
// `mask` (nb) and computes the flat bit index of the group (ofs). The loop
// structure and the return statements are not visible in this listing.
221 sel_chan
regbits::find_free_chan_by_mask(unsigned mask
) {
225 assert (!(mask
& ~0xF));
226 basetype cd
= dta
[elt
];
// NOTE(review): __builtin_ctz is undefined for 0; the guards that keep its
// arguments non-zero are not visible in this listing - confirm.
238 unsigned p
= __builtin_ctz(cd
) & ~(basetype
)3u;
240 assert (p
<= bt_bits
- bit
);
245 unsigned nb
= __builtin_ctz(cd
& mask
);
246 unsigned ofs
= ((elt
<< bt_index_shift
) | bit
);
258 // ================================
// Assign base registers to the shader's GPR arrays. For each array returned
// by sh.arrays() the visible code logs its size and base_gpr, reports
// preallocated ("FIXED") and dead ("DEAD") arrays, examines the array's
// interference set and asks a regbits instance for `array_size` free
// registers in the channel of the array's base_gpr. The statements that
// skip arrays and store the result are not visible in this listing.
260 void ra_init::alloc_arrays() {
262 gpr_array_vec
&ga
= sh
.arrays();
264 for(gpr_array_vec::iterator I
= ga
.begin(), E
= ga
.end(); I
!= E
; ++I
) {
268 sblog
<< "array [" << a
->array_size
<< "] at " << a
->base_gpr
<< "\n";
272 // skip preallocated arrays (e.g. with preloaded inputs)
274 RA_DUMP( sblog
<< " FIXED at " << a
->gpr
<< "\n"; );
278 bool dead
= a
->is_dead();
281 RA_DUMP( sblog
<< " DEAD\n"; );
285 val_set
&s
= a
->interferences
;
288 for (val_set::iterator I
= s
.begin(sh
), E
= s
.end(sh
); I
!= E
; ++I
) {
295 sblog
<< " interf: ";
296 dump::dump_set(sh
, s
);
// The search is restricted to the single channel of the array's base_gpr.
302 sel_chan base
= rb
.find_free_array(a
->array_size
,
303 (1 << a
->base_gpr
.chan()));
305 RA_DUMP( sblog
<< " found base: " << base
<< "\n"; );
// Recursive walk over the children of container `c`. Nodes of type NT_OP
// get per-op handling (the call itself is not visible in this listing);
// child containers that are not ALU-packed are descended into.
320 void ra_init::ra_node(container_node
* c
) {
322 for (node_iterator I
= c
->begin(), E
= c
->end(); I
!= E
; ++I
) {
324 if (n
->type
== NT_OP
) {
327 if (n
->is_container() && !n
->is_alu_packed()) {
328 ra_node(static_cast<container_node
*>(n
));
// Handle register assignment for a single op node. Visible steps:
//  - ALU-packed nodes: every SGPR source with a CK_PACKED_BS constraint is
//    coloured through color_bs_constraint();
//  - fetch and CF instructions: their SGPR sources are visited (the action
//    taken is not visible in this listing);
//  - destinations: for copy moves whose destination has no constraint, the
//    source at the same position is looked up and its gpr is passed to
//    assign_color() (any condition in between is not visible).
333 void ra_init::process_op(node
* n
) {
335 bool copy
= n
->is_copy_mov();
338 sblog
<< "ra_init: process_op : ";
343 if (n
->is_alu_packed()) {
344 for (vvec::iterator I
= n
->src
.begin(), E
= n
->src
.end(); I
!= E
; ++I
) {
346 if (v
&& v
->is_sgpr() && v
->constraint
&&
347 v
->constraint
->kind
== CK_PACKED_BS
) {
348 color_bs_constraint(v
->constraint
);
354 if (n
->is_fetch_inst() || n
->is_cf_inst()) {
355 for (vvec::iterator I
= n
->src
.begin(), E
= n
->src
.end(); I
!= E
; ++I
) {
357 if (v
&& v
->is_sgpr())
362 for (vvec::iterator I
= n
->dst
.begin(), E
= n
->dst
.end(); I
!= E
; ++I
) {
368 if (copy
&& !v
->constraint
) {
// Source operand at the same position as the current destination.
369 value
*s
= *(n
->src
.begin() + (I
- n
->dst
.begin()));
372 assign_color(v
, s
->gpr
);
// Colour the values of constraint `c` (at most eight, asserted). Visible
// steps per value: dead or null values are skipped; the interference set is
// taken from the value's chunk (via the coalescer) or from the value itself;
// a per-channel counter (chan_count, compared against 3) and a mask of still
// allowed channels steer the choice; candidate registers are tried while
// channels remain allowed and the selector stays below
// sh.num_nontemp_gpr(); on success the value is removed from its chunk's
// value list and assign_color() is called. If nothing fits, a failure is
// logged and an assert fires. Several connecting statements are not visible
// in this listing.
381 void ra_init::color_bs_constraint(ra_constraint
* c
) {
382 vvec
&vv
= c
->values
;
383 assert(vv
.size() <= 8);
386 sblog
<< "color_bs_constraint: ";
391 regbits
rb(ctx
.alu_temp_gprs
);
// chan_count: values placed per channel so far; allowed_chans: 4-bit mask of
// channels that may still be used.
393 unsigned chan_count
[4] = {};
394 unsigned allowed_chans
= 0x0F;
396 for (vvec::iterator I
= vv
.begin(), E
= vv
.end(); I
!= E
; ++I
) {
399 if (!v
|| v
->is_dead())
402 sel_chan gpr
= v
->get_final_gpr();
407 sh
.coal
.get_chunk_interferences(v
->chunk
, interf
);
409 interf
= v
->interferences
;
412 sblog
<< " processing " << *v
<< " interferences : ";
413 dump::dump_set(sh
, interf
);
418 unsigned chan
= gpr
.chan();
419 if (chan_count
[chan
] < 3) {
423 v
->flags
&= ~VLF_FIXED
;
424 allowed_chans
&= ~(1 << chan
);
425 assert(allowed_chans
);
435 rb
.from_val_set(sh
, interf
);
438 sblog
<< " regbits : ";
443 while (allowed_chans
&& gpr
.sel() < sh
.num_nontemp_gpr()) {
// Skip candidates whose bit is clear, i.e. not free.
445 while (rb
.get(gpr
- 1) == 0)
449 sblog
<< " trying " << gpr
<< "\n";
452 unsigned chan
= gpr
.chan();
453 if (chan_count
[chan
] < 3) {
457 vvec::iterator F
= std::find(v
->chunk
->values
.begin(),
458 v
->chunk
->values
.end(),
460 v
->chunk
->values
.erase(F
);
464 assign_color(v
, gpr
);
467 allowed_chans
&= ~(1 << chan
);
473 sblog
<< "color_bs_constraint: failed...\n";
474 assert(!"coloring failed");
// Choose a register/channel for value `v`. Visible steps:
//  - values with a CK_PACKED_BS constraint are delegated to
//    color_bs_constraint();
//  - values whose chunk is fixed are special-cased (action not visible);
//  - register-pinned values (which must also be channel-pinned) take their
//    pin_gpr directly;
//  - otherwise a regbits map is built from v's interferences and searched,
//    either in the pinned channel only or in the channels suggested by
//    get_preferable_chan_mask().
// The final assert requires a non-null result with a selector below
// 128 - ctx.alu_temp_gprs.
479 void ra_init::color(value
* v
) {
481 if (v
->constraint
&& v
->constraint
->kind
== CK_PACKED_BS
) {
482 color_bs_constraint(v
->constraint
);
486 if (v
->chunk
&& v
->chunk
->is_fixed())
490 sblog
<< "coloring ";
492 sblog
<< " interferences ";
493 dump::dump_set(sh
, v
->interferences
);
497 if (v
->is_reg_pinned()) {
498 assert(v
->is_chan_pinned());
499 assign_color(v
, v
->pin_gpr
);
503 regbits
rb(sh
, v
->interferences
);
506 if (v
->is_chan_pinned()) {
507 RA_DUMP( sblog
<< "chan_pinned = " << v
->pin_gpr
.chan() << " "; );
508 unsigned mask
= 1 << v
->pin_gpr
.chan();
509 c
= rb
.find_free_chans(mask
) + v
->pin_gpr
.chan();
511 unsigned cm
= get_preferable_chan_mask();
512 RA_DUMP( sblog
<< "pref chan mask: " << cm
<< "\n"; );
513 c
= rb
.find_free_chan_by_mask(cm
);
// NOTE(review): the literal 128 presumably equals MAX_GPR, which the regbits
// code uses for the same bound - confirm.
516 assert(c
&& c
.sel() < 128 - ctx
.alu_temp_gprs
&& "color failed");
// Record colour `c` for value `v`: the chosen channel is pushed into the
// channel history (add_prev_chan) and the assignment is logged. The
// statement that stores `c` in the value is not visible in this listing.
520 void ra_init::assign_color(value
* v
, sel_chan c
) {
521 add_prev_chan(c
.chan());
526 sblog
<< " to " << c
<< "\n";
530 // ===================================================
532 int ra_split::run() {
// For every phi node in container `c`, split operand `id`: when the phi's
// destination is an SGPR and the operand is not undefined, a temporary value
// is created and a copy move is placed at `loc` (before it, or appended to
// it - the visible code has both forms, presumably selected by `loop`;
// confirm), and a coalescer edge with phi cost is added. Statements
// connecting these steps are not visible in this listing.
537 void ra_split::split_phi_src(container_node
*loc
, container_node
*c
,
538 unsigned id
, bool loop
) {
539 for (node_iterator I
= c
->begin(), E
= c
->end(); I
!= E
; ++I
) {
// v is a reference to the operand slot, d is the phi's destination value.
541 value
* &v
= p
->src
[id
], *d
= p
->dst
[0];
544 if (!d
->is_sgpr() || v
->is_undef())
547 value
*t
= sh
.create_temp_value();
549 loc
->insert_before(sh
.create_copy_mov(t
, v
));
551 loc
->push_back(sh
.create_copy_mov(t
, v
));
554 sh
.coal
.add_edge(v
, d
, coalescer::phi_cost
);
// For every phi node in container `c`, split its destination: a temporary
// value is created together with a copy move from it into the original
// destination; the copy is placed either at the front of `loc` (treated as
// a container) or right after `loc` - presumably selected by `loop`;
// confirm. Statements connecting these steps are not visible in this
// listing.
558 void ra_split::split_phi_dst(node
* loc
, container_node
*c
, bool loop
) {
559 for (node_iterator I
= c
->begin(), E
= c
->end(); I
!= E
; ++I
) {
// v is a reference to the phi's destination slot.
561 value
* &v
= p
->dst
[0];
567 value
*t
= sh
.create_temp_value();
568 node
*cp
= sh
.create_copy_mov(v
, t
);
570 static_cast<container_node
*>(loc
)->push_front(cp
);
572 loc
->insert_after(cp
);
// For every phi node in container `c`, create a CK_PHI constraint that holds
// the phi's destination and source values (any filtering of the sources is
// not visible in this listing).
578 void ra_split::init_phi_constraints(container_node
*c
) {
579 for (node_iterator I
= c
->begin(), E
= c
->end(); I
!= E
; ++I
) {
581 ra_constraint
*cc
= sh
.coal
.create_constraint(CK_PHI
);
582 cc
->values
.push_back(p
->dst
[0]);
584 for (vvec::iterator I
= p
->src
.begin(), E
= p
->src
.end(); I
!= E
; ++I
) {
587 cc
->values
.push_back(v
);
// Recursive splitting pass over container `n`.
//  - NT_DEPART: splits operand dep_id of the target region's phis.
//  - NT_REPEAT: splits operand rep_id of the target region's loop phis, if
//    the region has any.
//  - NT_REGION: splits the destinations of the region's phis and loop phis
//    and operand 0 of the loop phis (the guarding conditions are not visible
//    in this listing).
// The children are then visited in reverse order, with N holding the next
// iterator: NT_OP nodes get per-op handling (call not visible) and
// containers are recursed into. Finally, for regions, phi constraints are
// created for phi and loop_phi.
594 void ra_split::split(container_node
* n
) {
596 if (n
->type
== NT_DEPART
) {
597 depart_node
*d
= static_cast<depart_node
*>(n
);
599 split_phi_src(d
, d
->target
->phi
, d
->dep_id
, false);
600 } else if (n
->type
== NT_REPEAT
) {
601 repeat_node
*r
= static_cast<repeat_node
*>(n
);
602 if (r
->target
->loop_phi
)
603 split_phi_src(r
, r
->target
->loop_phi
, r
->rep_id
, true);
604 } else if (n
->type
== NT_REGION
) {
605 region_node
*r
= static_cast<region_node
*>(n
);
607 split_phi_dst(r
, r
->phi
, false);
610 split_phi_dst(r
->get_entry_code_location(), r
->loop_phi
,
612 split_phi_src(r
, r
->loop_phi
, 0, true);
616 for (node_riterator N
, I
= n
->rbegin(), E
= n
->rend(); I
!= E
; I
= N
) {
620 if (o
->type
== NT_OP
) {
622 } else if (o
->is_container()) {
623 split(static_cast<container_node
*>(o
));
627 if (n
->type
== NT_REGION
) {
628 region_node
*r
= static_cast<region_node
*>(n
);
630 init_phi_constraints(r
->phi
);
632 init_phi_constraints(r
->loop_phi
);
// Dispatch splitting for a single op node: packed ALU instructions
// (NST_ALU_PACKED_INST) go to split_alu_packed(), and split_vector_inst()
// handles other cases whose labels are not visible in this listing.
636 void ra_split::split_op(node
* n
) {
638 case NST_ALU_PACKED_INST
:
639 split_alu_packed(static_cast<alu_packed_node
*>(n
));
643 split_vector_inst(n
);
// Split the inputs of a packed ALU node. For every GPR operand that is not
// undefined, the visible code looks the value up in `sv`; a hit reuses the
// entry at the same position in `dv`, otherwise a new temporary is created.
// A copy move dv[i] <- sv[i] is then inserted before the node for each pair
// and a CK_PACKED_BS constraint is created. How sv/dv are filled and how the
// constraint is populated is not visible in this listing.
649 void ra_split::split_packed_ins(alu_packed_node
*n
) {
653 for (vvec::iterator I
= vv
.begin(), E
= vv
.end(); I
!= E
; ++I
) {
657 if (v
&& v
->is_any_gpr() && !v
->is_undef()) {
659 vvec::iterator F
= std::find(sv
.begin(), sv
.end(), v
);
// Reuse the temporary paired with an already seen source value.
663 t
= *(dv
.begin() + (F
- sv
.begin()));
665 t
= sh
.create_temp_value();
673 unsigned cnt
= sv
.size();
677 for (vvec::iterator SI
= sv
.begin(), DI
= dv
.begin(), SE
= sv
.end();
678 SI
!= SE
; ++SI
, ++DI
) {
679 n
->insert_before(sh
.create_copy_mov(*DI
, *SI
));
682 ra_constraint
*c
= sh
.coal
.create_constraint(CK_PACKED_BS
);
688 // TODO handle other packed ops for cayman
689 void ra_split::split_alu_packed(alu_packed_node
* n
) {
// Walk the components of vector `vv` (ch counts the channel) and pair
// original values with temporaries; v1 and v2 are parallel vectors indexed
// in lock-step. Visible behaviour: dead values are not expected (assert);
// with allow_swz, values that are float 0 or 1 are special-cased (action not
// visible); with allow_swz an already seen value is looked up in v2 and its
// partner from v1 is reused, otherwise a new temporary is created;
// temporaries are channel-pinned to the current channel.
// Statements connecting these steps are not visible in this listing.
700 void ra_split::split_vec(vvec
&vv
, vvec
&v1
, vvec
&v2
, bool allow_swz
) {
702 for (vvec::iterator I
= vv
.begin(), E
= vv
.end(); I
!= E
; ++I
, ++ch
) {
708 assert(!o
->is_dead());
713 if (allow_swz
&& o
->is_float_0_or_1())
// Without swizzle support the lookup is disabled by using end() directly.
718 allow_swz
? std::find(v2
.begin(), v2
.end(), o
) : v2
.end();
721 t
= *(v1
.begin() + (F
- v2
.begin()));
723 t
= sh
.create_temp_value();
726 t
->flags
|= VLF_PIN_CHAN
;
727 t
->pin_gpr
= sel_chan(0, ch
);
// Split the vector operands of instruction `n`.
// Sources (skipped for CF_OP_CALL_FS or when there are none): processed in
// groups of four; each group is copied into a scratch vector, run through
// split_vec() (source swizzle is disallowed for CF instructions with the
// CF_MEM flag and for VFETCH/SEMFETCH), written back into n->src, copy moves
// tv[i] <- sv[i] are inserted before the node and a CK_SAME_REG constraint
// is created.
// Destinations: split_vec() is applied with swizzle allowed, copy moves
// sv[i] <- tv[i] are inserted after a location `lp`, and either values are
// pinned to an explicitly computed register (select.sel() plus a constant
// relative offset) or a CK_SAME_REG constraint is created.
// Several connecting statements and conditions are not visible in this
// listing.
738 void ra_split::split_vector_inst(node
* n
) {
741 bool call_fs
= n
->is_cf_op(CF_OP_CALL_FS
);
742 bool no_src_swizzle
= n
->is_cf_inst() && (n
->cf_op_flags() & CF_MEM
);
744 no_src_swizzle
|= n
->is_fetch_op(FETCH_OP_VFETCH
) ||
745 n
->is_fetch_op(FETCH_OP_SEMFETCH
);
747 if (!n
->src
.empty() && !call_fs
) {
749 // we may have more than one source vector -
750 // fetch instructions with FF_USEGRAD have gradient values in
751 // src vectors 1 (src[4-7]) and 2 (src[8-11])
// The source count must be a multiple of four (asserted below).
753 unsigned nvec
= n
->src
.size() >> 2;
754 assert(nvec
<< 2 == n
->src
.size());
756 for (unsigned nv
= 0; nv
< nvec
; ++nv
) {
757 vvec sv
, tv
, nsrc(4);
758 unsigned arg_start
= nv
<< 2;
760 std::copy(n
->src
.begin() + arg_start
,
761 n
->src
.begin() + arg_start
+ 4,
764 split_vec(nsrc
, tv
, sv
, !no_src_swizzle
);
766 unsigned cnt
= sv
.size();
768 if (no_src_swizzle
|| cnt
) {
770 std::copy(nsrc
.begin(), nsrc
.end(), n
->src
.begin() + arg_start
);
772 for(unsigned i
= 0, s
= tv
.size(); i
< s
; ++i
) {
773 n
->insert_before(sh
.create_copy_mov(tv
[i
], sv
[i
]));
776 c
= sh
.coal
.create_constraint(CK_SAME_REG
);
783 if (!n
->dst
.empty()) {
784 vvec sv
, tv
, ndst
= n
->dst
;
786 split_vec(ndst
, tv
, sv
, true);
792 for(unsigned i
= 0, s
= tv
.size(); i
< s
; ++i
) {
793 lp
->insert_after(sh
.create_copy_mov(sv
[i
], tv
[i
]));
798 for (unsigned i
= 0, cnt
= tv
.size(); i
< cnt
; ++i
) {
// The pin flags are set on v and removed from s.
804 v
->flags
|= VLF_PIN_REG
| VLF_PIN_CHAN
;
805 s
->flags
&= ~(VLF_PIN_REG
| VLF_PIN_CHAN
);
// Relative addressing is only supported with a constant offset here.
809 assert(s
->rel
->is_const());
810 sel
= sel_chan(s
->select
.sel() +
811 s
->rel
->get_const_value().u
,
816 v
->gpr
= v
->pin_gpr
= sel
;
820 c
= sh
.coal
.create_constraint(CK_SAME_REG
);
// Push channel `chan` into the channel history: prev_chans is shifted left
// by four bits and the low four bits receive a one-hot mask (1 << chan).
828 void ra_init::add_prev_chan(unsigned chan
) {
829 prev_chans
= (prev_chans
<< 4) | (1 << chan
);
// Return a 4-bit mask of the channels that are NOT in used_chans.
// used_chans starts at 0 and is built in a loop of ra_tune iterations over a
// copy of the channel history (prev_chans); the loop body is not visible in
// this listing.
832 unsigned ra_init::get_preferable_chan_mask() {
833 unsigned i
, used_chans
= 0;
834 unsigned chans
= prev_chans
;
836 for (i
= 0; i
< ra_tune
; ++i
) {
841 return (~used_chans
) & 0xF;
844 } // namespace r600_sb