}
}
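+/* Encapsulates a single register-allocation attempt on an fs_visitor:
+ * interference-graph construction, spill-candidate selection and the
+ * spill/unspill rewrite all live here instead of on the visitor.
+ */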
+class fs_reg_alloc {
+public:
+ fs_reg_alloc(fs_visitor *fs):
+ fs(fs), devinfo(fs->devinfo), compiler(fs->compiler), g(NULL)
+ {
+ mem_ctx = ralloc_context(NULL);
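+      /* fs_reg_sets[] is indexed by the log2 of the register width: set 0
+       * serves SIMD8 (one GRF per node), set 1 serves SIMD16 (GRF pairs).
+       */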
+ int reg_width = fs->dispatch_width / 8;
+ rsi = _mesa_logbase2(reg_width);
+ }
+
+ ~fs_reg_alloc()
+ {
+ ralloc_free(mem_ctx);
+ }
+
+ bool assign_regs(bool allow_spilling, bool spill_all);
+
+private:
+ void setup_payload_interference(int payload_node_count,
+ int first_payload_node);
+ void setup_mrf_hack_interference(int first_mrf_node,
+ int *first_used_mrf);
+ void build_interference_graph(bool allow_spilling);
+
+ int choose_spill_reg();
+ void spill_reg(unsigned spill_reg);
+
+ void *mem_ctx;
+ fs_visitor *fs;
+ const gen_device_info *devinfo;
+ const brw_compiler *compiler;
+
+ /* Which compiler->fs_reg_sets[] to use */
+ int rsi;
+
+ ra_graph *g;
+};
+
/**
* Sets up interference between thread payload registers and the virtual GRFs
* (note that in SIMD16, a node is two registers).
*/
void
-fs_visitor::setup_payload_interference(struct ra_graph *g,
- int payload_node_count,
- int first_payload_node)
+fs_reg_alloc::setup_payload_interference(int payload_node_count,
+ int first_payload_node)
{
int payload_last_use_ip[payload_node_count];
- calculate_payload_ranges(payload_node_count, payload_last_use_ip);
+ fs->calculate_payload_ranges(payload_node_count, payload_last_use_ip);
for (int i = 0; i < payload_node_count; i++) {
if (payload_last_use_ip[i] == -1)
* live between the start of the program and our last use of the payload
* node.
*/
- for (unsigned j = 0; j < this->alloc.count; j++) {
+ for (unsigned j = 0; j < fs->alloc.count; j++) {
/* Note that we use a <= comparison, unlike virtual_grf_interferes(),
* in order to not have to worry about the uniform issue described in
* calculate_live_intervals().
*/
- if (this->virtual_grf_start[j] <= payload_last_use_ip[i]) {
+ if (fs->virtual_grf_start[j] <= payload_last_use_ip[i]) {
ra_add_node_interference(g, first_payload_node + i, j);
}
}
* The alternative would be to have per-physical-register classes, which
* would just be silly.
*/
- if (devinfo->gen <= 5 && dispatch_width >= 16) {
+ if (devinfo->gen <= 5 && fs->dispatch_width >= 16) {
/* We have to divide by 2 here because we only have even numbered
* registers. Some of the payload registers will be odd, but
* that's ok because their physical register numbers have already
}
}
+namespace {
+ /**
+ * Maximum spill block size we expect to encounter in 32B units.
+ *
+ * This is somewhat arbitrary and doesn't necessarily limit the maximum
+ * variable size that can be spilled -- A higher value will allow a
+ * variable of a given size to be spilled more efficiently with a smaller
+ * number of scratch messages, but will increase the likelihood of a
+ * collision between the MRFs reserved for spilling and other MRFs used by
+ * the program (and possibly increase GRF register pressure on platforms
+    * without hardware MRFs), which could cause register allocation to fail.
+ *
+ * For the moment reserve just enough space so a register of 32 bit
+ * component type and natural region width can be spilled without splitting
+ * into multiple (force_writemask_all) scratch messages.
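+    * (i.e. dispatch_width / 8 GRFs: one in SIMD8, two in SIMD16).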
+ */
+ unsigned
+ spill_max_size(const backend_shader *s)
+ {
+ /* FINISHME - On Gen7+ it should be possible to avoid this limit
+ * altogether by spilling directly from the temporary GRF
+ * allocated to hold the result of the instruction (and the
+ * scratch write header).
+ */
+ /* FINISHME - The shader's dispatch width probably belongs in
+ * backend_shader (or some nonexistent fs_shader class?)
+ * rather than in the visitor class.
+ */
+ return static_cast<const fs_visitor *>(s)->dispatch_width / 8;
+ }
+
+ /**
+ * First MRF register available for spilling.
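+    *
+    * E.g. on Gen7 (BRW_MAX_MRF == 16) in SIMD16 this is 16 - 2 - 1 = m13,
+    * which lines up with the "MRFs 13-15" note in spill_reg() below.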
+ */
+ unsigned
+ spill_base_mrf(const backend_shader *s)
+ {
+ return BRW_MAX_MRF(s->devinfo->gen) - spill_max_size(s) - 1;
+ }
+}
+
/**
* Sets interference between virtual GRFs and usage of the high GRFs for SEND
* messages (treated as MRFs in code generation).
*/
-static void
-setup_mrf_hack_interference(fs_visitor *v, struct ra_graph *g,
- int first_mrf_node, int *first_used_mrf)
+void
+fs_reg_alloc::setup_mrf_hack_interference(int first_mrf_node,
+ int *first_used_mrf)
{
- bool mrf_used[BRW_MAX_MRF(v->devinfo->gen)];
- get_used_mrfs(v, mrf_used);
-
- *first_used_mrf = BRW_MAX_MRF(v->devinfo->gen);
- for (int i = 0; i < BRW_MAX_MRF(v->devinfo->gen); i++) {
+ *first_used_mrf = spill_base_mrf(fs);
+ for (int i = spill_base_mrf(fs); i < BRW_MAX_MRF(devinfo->gen); i++) {
/* Mark each MRF reg node as being allocated to its physical register.
*
* The alternative would be to have per-physical-register classes, which
*/
ra_set_node_reg(g, first_mrf_node + i, GEN7_MRF_HACK_START + i);
- /* Since we don't have any live/dead analysis on the MRFs, just mark all
- * that are used as conflicting with all virtual GRFs.
- */
- if (mrf_used[i]) {
- if (i < *first_used_mrf)
- *first_used_mrf = i;
-
- for (unsigned j = 0; j < v->alloc.count; j++) {
- ra_add_node_interference(g, first_mrf_node + i, j);
- }
- }
+ for (unsigned j = 0; j < fs->alloc.count; j++)
+ ra_add_node_interference(g, first_mrf_node + i, j);
}
}
-bool
-fs_visitor::assign_regs(bool allow_spilling, bool spill_all)
+void
+fs_reg_alloc::build_interference_graph(bool allow_spilling)
{
+ const gen_device_info *devinfo = fs->devinfo;
+ const brw_compiler *compiler = fs->compiler;
+
/* Most of this allocation was written for a reg_width of 1
* (dispatch_width == 8). In extending to SIMD16, the code was
* left in place and it was converted to have the hardware
* registers it's allocating be contiguous physical pairs of regs
* for reg_width == 2.
*/
- int reg_width = dispatch_width / 8;
- unsigned hw_reg_mapping[this->alloc.count];
- int payload_node_count = ALIGN(this->first_non_payload_grf, reg_width);
- int rsi = _mesa_logbase2(reg_width); /* Which compiler->fs_reg_sets[] to use */
- calculate_live_intervals();
+ int reg_width = fs->dispatch_width / 8;
+ int payload_node_count = ALIGN(fs->first_non_payload_grf, reg_width);
+
+ fs->calculate_live_intervals();
- int node_count = this->alloc.count;
+ int node_count = fs->alloc.count;
int first_payload_node = node_count;
node_count += payload_node_count;
int first_mrf_hack_node = node_count;
int grf127_send_hack_node = node_count;
if (devinfo->gen >= 8)
node_count ++;
- struct ra_graph *g =
- ra_alloc_interference_graph(compiler->fs_reg_sets[rsi].regs, node_count);
- for (unsigned i = 0; i < this->alloc.count; i++) {
- unsigned size = this->alloc.sizes[i];
+ assert(g == NULL);
+ g = ra_alloc_interference_graph(compiler->fs_reg_sets[rsi].regs, node_count);
+ ralloc_steal(mem_ctx, g);
+
+ for (unsigned i = 0; i < fs->alloc.count; i++) {
+ unsigned size = fs->alloc.sizes[i];
int c;
assert(size <= ARRAY_SIZE(compiler->fs_reg_sets[rsi].classes) &&
* second operand of a PLN instruction needs to be an
* even-numbered register, so we have a special register class
* wm_aligned_pairs_class to handle this case. pre-GEN6 always
- * uses this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL] as the
+ * uses fs->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL] as the
* second operand of a PLN instruction (since it doesn't support
* any other interpolation modes). So all we need to do is find
* that register and set it to the appropriate class.
*/
if (compiler->fs_reg_sets[rsi].aligned_pairs_class >= 0 &&
- this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL].file == VGRF &&
- this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL].nr == i) {
+ fs->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL].file == VGRF &&
+ fs->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL].nr == i) {
c = compiler->fs_reg_sets[rsi].aligned_pairs_class;
}
ra_set_node_class(g, i, c);
for (unsigned j = 0; j < i; j++) {
- if (virtual_grf_interferes(i, j)) {
+ if (fs->virtual_grf_interferes(i, j)) {
ra_add_node_interference(g, i, j);
}
}
/* Certain instructions can't safely use the same register for their
* sources and destination. Add interference.
*/
- foreach_block_and_inst(block, fs_inst, inst, cfg) {
+ foreach_block_and_inst(block, fs_inst, inst, fs->cfg) {
if (inst->dst.file == VGRF && inst->has_source_and_destination_hazard()) {
- for (unsigned i = 0; i < 3; i++) {
+ for (unsigned i = 0; i < inst->sources; i++) {
if (inst->src[i].file == VGRF) {
ra_add_node_interference(g, inst->dst.nr, inst->src[i].nr);
}
}
}
- setup_payload_interference(g, payload_node_count, first_payload_node);
+ setup_payload_interference(payload_node_count, first_payload_node);
if (devinfo->gen >= 7) {
int first_used_mrf = BRW_MAX_MRF(devinfo->gen);
- setup_mrf_hack_interference(this, g, first_mrf_hack_node,
- &first_used_mrf);
+ if (allow_spilling)
+ setup_mrf_hack_interference(first_mrf_hack_node, &first_used_mrf);
- foreach_block_and_inst(block, fs_inst, inst, cfg) {
+ foreach_block_and_inst(block, fs_inst, inst, fs->cfg) {
/* When we do send-from-GRF for FB writes, we need to ensure that
* the last write instruction sends from a high register. This is
* because the vertex fetcher wants to start filling the low
if (inst->eot) {
const int vgrf = inst->opcode == SHADER_OPCODE_SEND ?
inst->src[2].nr : inst->src[0].nr;
- int size = alloc.sizes[vgrf];
+ int size = fs->alloc.sizes[vgrf];
int reg = compiler->fs_reg_sets[rsi].class_to_ra_reg_range[size] - 1;
/* If something happened to spill, we want to push the EOT send
* about this level of granularity, we simply make the source and
* destination interfere.
*/
- foreach_block_and_inst(block, fs_inst, inst, cfg) {
+ foreach_block_and_inst(block, fs_inst, inst, fs->cfg) {
if (inst->exec_size < 16 || inst->dst.file != VGRF)
continue;
* any register overlap between sources and destination.
*/
ra_set_node_reg(g, grf127_send_hack_node, 127);
- foreach_block_and_inst(block, fs_inst, inst, cfg) {
+ foreach_block_and_inst(block, fs_inst, inst, fs->cfg) {
if (inst->exec_size < 16 && inst->is_send_from_grf() &&
inst->dst.file == VGRF)
ra_add_node_interference(g, inst->dst.nr, grf127_send_hack_node);
}
- if (spilled_any_registers) {
- foreach_block_and_inst(block, fs_inst, inst, cfg) {
+ if (fs->spilled_any_registers) {
+ foreach_block_and_inst(block, fs_inst, inst, fs->cfg) {
         /* Spilling instructions are generated as SEND messages from MRF,
          * but as Gen7+ supports sending from GRF the driver will map
          * these MRF registers to a GRF. Implementations reuse
* interference here.
*/
if (devinfo->gen >= 9) {
- foreach_block_and_inst(block, fs_inst, inst, cfg) {
+ foreach_block_and_inst(block, fs_inst, inst, fs->cfg) {
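+         /* ra_add_node_interference() takes node indices, and the first
+          * alloc.count nodes are whole VGRFs, so one interference between
+          * the two payload VGRFs is enough; offsetting the node index by
+          * mlen/ex_mlen as the old code did would hit unrelated nodes.
+          */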
if (inst->opcode == SHADER_OPCODE_SEND && inst->ex_mlen > 0 &&
inst->src[2].file == VGRF &&
inst->src[3].file == VGRF &&
- inst->src[2].nr != inst->src[3].nr) {
- for (unsigned i = 0; i < inst->mlen; i++) {
- for (unsigned j = 0; j < inst->ex_mlen; j++) {
- ra_add_node_interference(g, inst->src[2].nr + i,
- inst->src[3].nr + j);
- }
- }
- }
- }
- }
-
- /* Debug of register spilling: Go spill everything. */
- if (unlikely(spill_all)) {
- int reg = choose_spill_reg(g);
-
- if (reg != -1) {
- spill_reg(reg);
- ralloc_free(g);
- return false;
- }
- }
-
- if (!ra_allocate(g)) {
- /* Failed to allocate registers. Spill a reg, and the caller will
- * loop back into here to try again.
- */
- int reg = choose_spill_reg(g);
-
- if (reg == -1) {
- fail("no register to spill:\n");
- dump_instructions(NULL);
- } else if (allow_spilling) {
- spill_reg(reg);
- }
-
- ralloc_free(g);
-
- return false;
- }
-
- /* Get the chosen virtual registers for each node, and map virtual
- * regs in the register classes back down to real hardware reg
- * numbers.
- */
- this->grf_used = payload_node_count;
- for (unsigned i = 0; i < this->alloc.count; i++) {
- int reg = ra_get_node_reg(g, i);
-
- hw_reg_mapping[i] = compiler->fs_reg_sets[rsi].ra_reg_to_grf[reg];
- this->grf_used = MAX2(this->grf_used,
- hw_reg_mapping[i] + this->alloc.sizes[i]);
- }
-
- foreach_block_and_inst(block, fs_inst, inst, cfg) {
- assign_reg(hw_reg_mapping, &inst->dst);
- for (int i = 0; i < inst->sources; i++) {
- assign_reg(hw_reg_mapping, &inst->src[i]);
+ inst->src[2].nr != inst->src[3].nr)
+ ra_add_node_interference(g, inst->src[2].nr,
+ inst->src[3].nr);
}
}
-
- this->alloc.count = this->grf_used;
-
- ralloc_free(g);
-
- return true;
-}
-
-namespace {
- /**
- * Maximum spill block size we expect to encounter in 32B units.
- *
- * This is somewhat arbitrary and doesn't necessarily limit the maximum
- * variable size that can be spilled -- A higher value will allow a
- * variable of a given size to be spilled more efficiently with a smaller
- * number of scratch messages, but will increase the likelihood of a
- * collision between the MRFs reserved for spilling and other MRFs used by
- * the program (and possibly increase GRF register pressure on platforms
- * without hardware MRFs), what could cause register allocation to fail.
- *
- * For the moment reserve just enough space so a register of 32 bit
- * component type and natural region width can be spilled without splitting
- * into multiple (force_writemask_all) scratch messages.
- */
- unsigned
- spill_max_size(const backend_shader *s)
- {
- /* FINISHME - On Gen7+ it should be possible to avoid this limit
- * altogether by spilling directly from the temporary GRF
- * allocated to hold the result of the instruction (and the
- * scratch write header).
- */
- /* FINISHME - The shader's dispatch width probably belongs in
- * backend_shader (or some nonexistent fs_shader class?)
- * rather than in the visitor class.
- */
- return static_cast<const fs_visitor *>(s)->dispatch_width / 8;
- }
-
- /**
- * First MRF register available for spilling.
- */
- unsigned
- spill_base_mrf(const backend_shader *s)
- {
- return BRW_MAX_MRF(s->devinfo->gen) - spill_max_size(s) - 1;
- }
}
static void
}
int
-fs_visitor::choose_spill_reg(struct ra_graph *g)
+fs_reg_alloc::choose_spill_reg()
{
float block_scale = 1.0;
- float spill_costs[this->alloc.count];
- bool no_spill[this->alloc.count];
+ float spill_costs[fs->alloc.count];
+ bool no_spill[fs->alloc.count];
- for (unsigned i = 0; i < this->alloc.count; i++) {
+ for (unsigned i = 0; i < fs->alloc.count; i++) {
spill_costs[i] = 0.0;
no_spill[i] = false;
}
* spill/unspill we'll have to do, and guess that the insides of
* loops run 10 times.
*/
- foreach_block_and_inst(block, fs_inst, inst, cfg) {
+ foreach_block_and_inst(block, fs_inst, inst, fs->cfg) {
for (unsigned int i = 0; i < inst->sources; i++) {
if (inst->src[i].file == VGRF)
spill_costs[inst->src[i].nr] += regs_read(inst, i) * block_scale;
}
}
- for (unsigned i = 0; i < this->alloc.count; i++) {
+ for (unsigned i = 0; i < fs->alloc.count; i++) {
+ int live_length = fs->virtual_grf_end[i] - fs->virtual_grf_start[i];
+ if (live_length <= 0)
+ continue;
+
+ /* Divide the cost (in number of spills/fills) by the log of the length
+ * of the live range of the register. This will encourage spill logic
+ * to spill long-living things before spilling short-lived things where
+ * spilling is less likely to actually do us any good. We use the log
+ * of the length because it will fall off very quickly and not cause us
+ * to spill medium length registers with more uses.
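+       * (logf(10) is about 2.3 while logf(1000) is about 6.9, so a live
+       * range 100x longer only triples the divisor.)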
+ */
+ float adjusted_cost = spill_costs[i] / logf(live_length);
if (!no_spill[i])
- ra_set_node_spill_cost(g, i, spill_costs[i]);
+ ra_set_node_spill_cost(g, i, adjusted_cost);
}
return ra_get_best_spill_node(g);
}
void
-fs_visitor::spill_reg(unsigned spill_reg)
+fs_reg_alloc::spill_reg(unsigned spill_reg)
{
- int size = alloc.sizes[spill_reg];
- unsigned int spill_offset = last_scratch;
+ int size = fs->alloc.sizes[spill_reg];
+ unsigned int spill_offset = fs->last_scratch;
assert(ALIGN(spill_offset, 16) == spill_offset); /* oword read/write req. */
/* Spills may use MRFs 13-15 in the SIMD16 case. Our texturing is done
* depth), starting from m1. In summary: We may not be able to spill in
* SIMD16 mode, because we'd stomp the FB writes.
*/
- if (!spilled_any_registers) {
+ if (!fs->spilled_any_registers) {
bool mrf_used[BRW_MAX_MRF(devinfo->gen)];
- get_used_mrfs(this, mrf_used);
+ get_used_mrfs(fs, mrf_used);
- for (int i = spill_base_mrf(this); i < BRW_MAX_MRF(devinfo->gen); i++) {
+ for (int i = spill_base_mrf(fs); i < BRW_MAX_MRF(devinfo->gen); i++) {
if (mrf_used[i]) {
- fail("Register spilling not supported with m%d used", i);
+ fs->fail("Register spilling not supported with m%d used", i);
return;
}
}
- spilled_any_registers = true;
+ fs->spilled_any_registers = true;
}
- last_scratch += size * REG_SIZE;
+ fs->last_scratch += size * REG_SIZE;
/* Generate spill/unspill instructions for the objects being
* spilled. Right now, we spill or unspill the whole thing to a
* virtual grf of the same size. For most instructions, though, we
* could just spill/unspill the GRF being accessed.
*/
- foreach_block_and_inst (block, fs_inst, inst, cfg) {
- const fs_builder ibld = fs_builder(this, block, inst);
+ foreach_block_and_inst (block, fs_inst, inst, fs->cfg) {
+ const fs_builder ibld = fs_builder(fs, block, inst);
for (unsigned int i = 0; i < inst->sources; i++) {
if (inst->src[i].file == VGRF &&
int count = regs_read(inst, i);
int subset_spill_offset = spill_offset +
ROUND_DOWN_TO(inst->src[i].offset, REG_SIZE);
- fs_reg unspill_dst(VGRF, alloc.allocate(count));
+ fs_reg unspill_dst(VGRF, fs->alloc.allocate(count));
inst->src[i].nr = unspill_dst.nr;
inst->src[i].offset %= REG_SIZE;
inst->dst.nr == spill_reg) {
int subset_spill_offset = spill_offset +
ROUND_DOWN_TO(inst->dst.offset, REG_SIZE);
- fs_reg spill_src(VGRF, alloc.allocate(regs_written(inst)));
+ fs_reg spill_src(VGRF, fs->alloc.allocate(regs_written(inst)));
inst->dst.nr = spill_src.nr;
inst->dst.offset %= REG_SIZE;
*/
const unsigned width = 8 * MIN2(
DIV_ROUND_UP(inst->dst.component_size(inst->exec_size), REG_SIZE),
- spill_max_size(this));
+ spill_max_size(fs));
/* Spills should only write data initialized by the instruction for
       * whichever channels are enabled in the execution mask. If that's
* write, there should be no need for the unspill since the
* instruction will be overwriting the whole destination in any case.
*/
         if (inst->is_partial_reg_write() ||
+ if (inst->is_partial_write() ||
(!inst->force_writemask_all && !per_channel))
emit_unspill(ubld, spill_src, subset_spill_offset,
regs_written(inst));
}
}
- invalidate_live_intervals();
+ fs->invalidate_live_intervals();
+}
+
+bool
+fs_reg_alloc::assign_regs(bool allow_spilling, bool spill_all)
+{
+ while (1) {
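+      /* Rebuild the graph from scratch on every iteration: passing
+       * spilled_any_registers means the spill-MRF interference from
+       * setup_mrf_hack_interference() is only added once something has
+       * actually spilled.
+       */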
+ build_interference_graph(fs->spilled_any_registers);
+
+ /* Debug of register spilling: Go spill everything. */
+ if (unlikely(spill_all)) {
+ int reg = choose_spill_reg();
+ if (reg != -1) {
+ spill_reg(reg);
+ ralloc_free(g);
+ g = NULL;
+ continue;
+ }
+ }
+
+ if (ra_allocate(g))
+ break;
+
+ if (!allow_spilling)
+ return false;
+
+      /* Failed to allocate registers.  Spill a reg, and loop back around
+       * to try again.
+       */
+ int reg = choose_spill_reg();
+ if (reg == -1)
+ return false;
+
+ spill_reg(reg);
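+
+      /* Spilling rewrote the program and allocated fresh VGRFs, so the
+       * interference graph is stale; free it and rebuild next time around.
+       */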
+ ralloc_free(g);
+ g = NULL;
+ }
+
+ /* Get the chosen virtual registers for each node, and map virtual
+ * regs in the register classes back down to real hardware reg
+ * numbers.
+ */
+ unsigned hw_reg_mapping[fs->alloc.count];
+ fs->grf_used = fs->first_non_payload_grf;
+ for (unsigned i = 0; i < fs->alloc.count; i++) {
+ int reg = ra_get_node_reg(g, i);
+
+ hw_reg_mapping[i] = compiler->fs_reg_sets[rsi].ra_reg_to_grf[reg];
+ fs->grf_used = MAX2(fs->grf_used,
+ hw_reg_mapping[i] + fs->alloc.sizes[i]);
+ }
+
+ foreach_block_and_inst(block, fs_inst, inst, fs->cfg) {
+ assign_reg(hw_reg_mapping, &inst->dst);
+ for (int i = 0; i < inst->sources; i++) {
+ assign_reg(hw_reg_mapping, &inst->src[i]);
+ }
+ }
+
+ fs->alloc.count = fs->grf_used;
+
+ return true;
+}
+
+bool
+fs_visitor::assign_regs(bool allow_spilling, bool spill_all)
+{
+ fs_reg_alloc alloc(this);
+ bool success = alloc.assign_regs(allow_spilling, spill_all);
+ if (!success && allow_spilling) {
+ fail("no register to spill:\n");
+ dump_instructions(NULL);
+ }
+ return success;
}