public:
- ra_init(shader &sh) : pass(sh) {}
+ // Sets up the pass and selects the channel-distribution tuning value.
+ //
+ // ra_tune == N means a new allocation picks a channel that differs from
+ // the N most recently used channels; 0 effectively turns this off.
+ //
+ // Cayman (VLIW-4) uses 3 to spread channels uniformly. That gives
+ // significantly better ALU slot utilization at the expense of higher
+ // GPR usage; hopefully it improves performance, but this still has to
+ // be proven with real benchmarks.
+ // VLIW-5 uses 0. Uniform distribution could slightly improve its slot
+ // utilization as well, but the increased register pressure seems more
+ // significant and some benchmarks show a negative overall effect.
+ // VLIW-5 does not really need it anyway: the trans slot is unrestricted
+ // by the register write channel and can absorb most deviations from a
+ // uniform channel distribution.
+ ra_init(shader &sh)
+     : pass(sh),
+       prev_chans(0),
+       ra_tune(sh.get_ctx().is_cayman() ? 3 : 0) {}
virtual int run();
private:
+ unsigned prev_chans;
+ unsigned ra_tune;
+
+ void add_prev_chan(unsigned chan);
+ unsigned get_preferable_chan_mask();
+
void ra_node(container_node *c);
void process_op(node *n);
sel_chan find_free_bit();
sel_chan find_free_chans(unsigned mask);
+ sel_chan find_free_chan_by_mask(unsigned mask);
sel_chan find_free_array(unsigned size, unsigned mask);
void dump();
sblog << "\n";
if (!(i & 3)) {
- sblog.print_wl(i / 4, 7);
+ sblog.print_w(i / 4, 7);
sblog << " ";
}
unsigned elt = 0;
unsigned bit = 0;
- basetype cd = dta[elt] >> bit;
+ assert (!(mask & ~0xF));
+ basetype cd = dta[elt];
do {
-
if (!cd) {
- if (++elt < size)
+ if (++elt < size) {
cd = dta[elt];
- else
+ bit = 0;
+ continue;
+ } else
return 0;
-
- bit = 0;
}
unsigned p = __builtin_ctz(cd) & ~(basetype)3u;
- if (p > bt_bits - bit) {
- if (++elt < size)
+ assert (p <= bt_bits - bit);
+ bit += p;
+ cd >>= p;
+
+ if ((cd & mask) == mask) {
+ return ((elt << bt_index_shift) | bit) + 1;
+ }
+
+ bit += 4;
+ cd >>= 4;
+
+ } while (1);
+
+ return 0;
+}
+
+sel_chan regbits::find_free_chan_by_mask(unsigned mask) {
+ unsigned elt = 0;
+ unsigned bit = 0;
+
+ assert (!(mask & ~0xF));
+ basetype cd = dta[elt];
+
+ do {
+ if (!cd) {
+ if (++elt < size) {
cd = dta[elt];
- else
+ bit = 0;
+ continue;
+ } else
return 0;
- bit = 0;
}
+ unsigned p = __builtin_ctz(cd) & ~(basetype)3u;
+
+ assert (p <= bt_bits - bit);
bit += p;
cd >>= p;
- if ((cd & mask) == mask) {
- return ((elt << bt_index_shift) | bit) + 1;
+ if (cd & mask) {
+ unsigned nb = __builtin_ctz(cd & mask);
+ unsigned ofs = ((elt << bt_index_shift) | bit);
+ return nb + ofs + 1;
}
bit += 4;
unsigned mask = 1 << v->pin_gpr.chan();
c = rb.find_free_chans(mask) + v->pin_gpr.chan();
} else {
- c = rb.find_free_bit();
+ unsigned cm = get_preferable_chan_mask();
+ RA_DUMP( sblog << "pref chan mask: " << cm << "\n"; );
+ c = rb.find_free_chan_by_mask(cm);
}
assert(c && c.sel() < 128 - ctx.alu_temp_gprs && "color failed");
}
void ra_init::assign_color(value* v, sel_chan c) {
+ add_prev_chan(c.chan());
v->gpr = c;
RA_DUMP(
sblog << "colored ";
}
}
+// Records `chan` as the most recently used channel. The history is kept in
+// prev_chans as 4-bit groups, newest in the low nibble; each group holds a
+// one-hot bit (1 << chan). Older entries shift up and eventually fall off
+// the top of the word.
+void ra_init::add_prev_chan(unsigned chan) {
+    unsigned history = prev_chans << 4;
+    history |= (1 << chan);
+    prev_chans = history;
+}
+
+// Returns a 4-bit mask of the channels that do not appear among the last
+// ra_tune entries of the prev_chans history. With ra_tune == 0 nothing is
+// excluded and the result is 0xF.
+unsigned ra_init::get_preferable_chan_mask() {
+    unsigned recent = 0;
+    unsigned history = prev_chans;
+
+    // Fold the newest ra_tune nibbles together; only the low four bits of
+    // the accumulated value matter for the final mask.
+    for (unsigned remaining = ra_tune; remaining != 0; --remaining) {
+        recent |= history;
+        history >>= 4;
+    }
+
+    return ~recent & 0xF;
+}
+
} // namespace r600_sb