From: Vadim Girlin <vadimgirlin@gmail.com>
Date: Wed, 17 Jul 2013 14:29:56 +0000 (+0400)
Subject: r600g/sb: improve alu packing on cayman
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=07baf9cfd16b38872be952382ae5a705057cbec2;p=mesa.git

r600g/sb: improve alu packing on cayman

The scheduler/register allocator in r600-sb was developed and optimized
on evergreen (VLIW-5) hardware, so it is currently not optimal for
VLIW-4 chips.
This patch should improve performance on cayman gpus thanks to better alu
packing, but it also tends to increase register usage, so the overall
positive effect on performance has yet to be proven by real benchmarks.

Some results with bfgminer kernel on cayman:
source bytecode:       60 gprs, 3905 alu groups,
sbcl before the patch: 45 gprs, 4088 alu groups,
sbcl with this patch:  55 gprs, 3474 alu groups.

Signed-off-by: Vadim Girlin <vadimgirlin@gmail.com>
---

diff --git a/src/gallium/drivers/r600/sb/sb_pass.h b/src/gallium/drivers/r600/sb/sb_pass.h
index c3ea8734de3..95d2a203a60 100644
--- a/src/gallium/drivers/r600/sb/sb_pass.h
+++ b/src/gallium/drivers/r600/sb/sb_pass.h
@@ -507,12 +507,36 @@ class ra_init : public pass {
 
 public:
 
-	ra_init(shader &sh) : pass(sh) {}
+	ra_init(shader &sh) : pass(sh), prev_chans() {
+
+		// The parameter below affects register channels distribution.
+		// For cayman (VLIW-4) we're trying to distribute the channels
+		// uniformly, this means significantly better alu slots utilization
+		// at the expense of higher gpr usage. Hopefully this will improve
+		// performance, though it has to be proven with real benchmarks yet.
+		// For VLIW-5 this method could also slightly improve slots
+		// utilization, but increased register pressure seems more significant
+		// and overall performance effect is negative according to some
+		// benchmarks, so it's not used currently. Basically, VLIW-5 doesn't
+		// really need it because trans slot (unrestricted by register write
+		// channel) allows to consume most deviations from uniform channel
+		// distribution.
+		// Value 3 means that for new allocation we'll use channel that differs
+		// from 3 last used channels. 0 for VLIW-5 effectively turns this off.
+
+		ra_tune = sh.get_ctx().is_cayman() ? 3 : 0;
+	}
 
 	virtual int run();
 
 private:
 
+	unsigned prev_chans;
+	unsigned ra_tune;
+
+	void add_prev_chan(unsigned chan);
+	unsigned get_preferable_chan_mask();
+
 	void ra_node(container_node *c);
 	void process_op(node *n);
 
diff --git a/src/gallium/drivers/r600/sb/sb_ra_init.cpp b/src/gallium/drivers/r600/sb/sb_ra_init.cpp
index 24b24a0bde3..0b332a9847a 100644
--- a/src/gallium/drivers/r600/sb/sb_ra_init.cpp
+++ b/src/gallium/drivers/r600/sb/sb_ra_init.cpp
@@ -72,6 +72,7 @@ public:
 
 	sel_chan find_free_bit();
 	sel_chan find_free_chans(unsigned mask);
+	sel_chan find_free_chan_by_mask(unsigned mask);
 	sel_chan find_free_array(unsigned size, unsigned mask);
 
 	void dump();
@@ -86,7 +87,7 @@ void regbits::dump() {
 			sblog << "\n";
 
 		if (!(i & 3)) {
-			sblog.print_wl(i / 4, 7);
+			sblog.print_w(i / 4, 7);
 			sblog << " ";
 		}
 
@@ -186,34 +187,64 @@ sel_chan regbits::find_free_chans(unsigned mask) {
 	unsigned elt = 0;
 	unsigned bit = 0;
 
-	basetype cd = dta[elt] >> bit;
+	assert (!(mask & ~0xF));
+	basetype cd = dta[elt];
 
 	do {
-
 		if (!cd) {
-			if (++elt < size)
+			if (++elt < size) {
 				cd = dta[elt];
-			else
+				bit = 0;
+				continue;
+			} else
 				return 0;
-
-			bit = 0;
 		}
 
 		unsigned p = __builtin_ctz(cd) & ~(basetype)3u;
 
-		if (p > bt_bits - bit) {
-			if (++elt < size)
+		assert (p <= bt_bits - bit);
+		bit += p;
+		cd >>= p;
+
+		if ((cd & mask) == mask) {
+			return ((elt << bt_index_shift) | bit) + 1;
+		}
+
+		bit += 4;
+		cd >>= 4;
+
+	} while (1);
+
+	return 0;
+}
+
+sel_chan regbits::find_free_chan_by_mask(unsigned mask) {
+	unsigned elt = 0;
+	unsigned bit = 0;
+
+	assert (!(mask & ~0xF));
+	basetype cd = dta[elt];
+
+	do {
+		if (!cd) {
+			if (++elt < size) {
 				cd = dta[elt];
-			else
+				bit = 0;
+				continue;
+			} else
 				return 0;
-			bit = 0;
 		}
 
+		unsigned p = __builtin_ctz(cd) & ~(basetype)3u;
+
+		assert (p <= bt_bits - bit);
 		bit += p;
 		cd >>= p;
 
-		if ((cd & mask) == mask) {
-			return ((elt << bt_index_shift) | bit) + 1;
+		if (cd & mask) {
+			unsigned nb = __builtin_ctz(cd & mask);
+			unsigned ofs = ((elt << bt_index_shift) | bit);
+			return nb + ofs + 1;
 		}
 
 		bit += 4;
@@ -476,7 +507,9 @@ void ra_init::color(value* v) {
 		unsigned mask = 1 << v->pin_gpr.chan();
 		c = rb.find_free_chans(mask) + v->pin_gpr.chan();
 	} else {
-		c = rb.find_free_bit();
+		unsigned cm = get_preferable_chan_mask();
+		RA_DUMP( sblog << "pref chan mask: " << cm << "\n"; );
+		c = rb.find_free_chan_by_mask(cm);
 	}
 
 	assert(c && c.sel() < 128 - ctx.alu_temp_gprs && "color failed");
@@ -484,6 +517,7 @@ void ra_init::color(value* v) {
 }
 
 void ra_init::assign_color(value* v, sel_chan c) {
+	add_prev_chan(c.chan());
 	v->gpr = c;
 	RA_DUMP(
 		sblog << "colored ";
@@ -790,4 +824,20 @@ void ra_split::split_vector_inst(node* n) {
 	}
 }
 
+void ra_init::add_prev_chan(unsigned chan) {
+	prev_chans = (1 << chan) | (prev_chans << 4); // newest entry sits in the low nibble
+}
+
+unsigned ra_init::get_preferable_chan_mask() {
+	// prev_chans keeps a (1 << chan) mask per earlier allocation, 4 bits
+	// apart with the newest lowest; merge the ra_tune most recent entries.
+	unsigned recent = 0;
+	unsigned history = prev_chans;
+	for (unsigned n = ra_tune; n != 0; --n, history >>= 4)
+		recent |= history;
+
+	// Prefer the channels not seen recently; bits above the low 4 are dropped.
+	return ~recent & 0xF;
+}
+
 } // namespace r600_sb