#include "compiler.h"
#include "midgard_ops.h"
#include "util/u_memory.h"
-#include "util/register_allocate.h"
/* Scheduling for Midgard is complicated, to say the least. ALU instructions
* must be grouped into VLIW bundles according to the following model:
*
*/
-/* We create the dependency graph with per-component granularity */
+/* We create the dependency graph with per-byte granularity */
-#define COMPONENT_COUNT 8
+#define BYTE_COUNT 16
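+/* A 128-bit register is 16 bytes wide, and bit i of a bytemask covers byte i,
+ * so e.g. a 32-bit write to .xy corresponds to bytemask 0x00FF. Byte
+ * granularity keeps the dependencies exact when 8/16/32/64-bit accesses to
+ * the same node are mixed. */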
static void
-add_dependency(struct util_dynarray *table, unsigned index, unsigned mask, midgard_instruction **instructions, unsigned child)
+add_dependency(struct util_dynarray *table, unsigned index, uint16_t mask, midgard_instruction **instructions, unsigned child)
{
- for (unsigned i = 0; i < COMPONENT_COUNT; ++i) {
+ for (unsigned i = 0; i < BYTE_COUNT; ++i) {
if (!(mask & (1 << i)))
continue;
- struct util_dynarray *parents = &table[(COMPONENT_COUNT * index) + i];
+ struct util_dynarray *parents = &table[(BYTE_COUNT * index) + i];
util_dynarray_foreach(parents, unsigned, parent) {
BITSET_WORD *dependents = instructions[*parent]->dependents;
}
static void
-mark_access(struct util_dynarray *table, unsigned index, unsigned mask, unsigned parent)
+mark_access(struct util_dynarray *table, unsigned index, uint16_t mask, unsigned parent)
{
- for (unsigned i = 0; i < COMPONENT_COUNT; ++i) {
+ for (unsigned i = 0; i < BYTE_COUNT; ++i) {
if (!(mask & (1 << i)))
continue;
- util_dynarray_append(&table[(COMPONENT_COUNT * index) + i], unsigned, parent);
+ util_dynarray_append(&table[(BYTE_COUNT * index) + i], unsigned, parent);
}
}
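+/* Together these helpers implement the hazard tracking: mark_access records,
+ * per byte, which instructions last touched a node, and add_dependency folds
+ * the new instruction into the dependents set of every recorded toucher, so
+ * the scheduler cannot reorder across the hazard. */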
static void
mir_create_dependency_graph(midgard_instruction **instructions, unsigned count, unsigned node_count)
{
- size_t sz = node_count * COMPONENT_COUNT;
+ size_t sz = node_count * BYTE_COUNT;
struct util_dynarray *last_read = calloc(sizeof(struct util_dynarray), sz);
struct util_dynarray *last_write = calloc(sizeof(struct util_dynarray), sz);
continue;
unsigned dest = instructions[i]->dest;
- unsigned mask = instructions[i]->mask;
+ unsigned mask = mir_bytemask(instructions[i]);
mir_foreach_src((*instructions), s) {
unsigned src = instructions[i]->src[s];
if (src < node_count) {
- unsigned readmask = mir_mask_of_read_components(instructions[i], src);
+ unsigned readmask = mir_bytemask_of_read_components(instructions[i], src);
add_dependency(last_write, src, readmask, instructions, i);
}
}
unsigned src = instructions[i]->src[s];
if (src < node_count) {
- unsigned readmask = mir_mask_of_read_components(instructions[i], src);
+ unsigned readmask = mir_bytemask_of_read_components(instructions[i], src);
mark_access(last_read, src, readmask, i);
}
}
static bool
mir_is_scalar(midgard_instruction *ains)
{
- /* Does the op support scalar units? */
- if (!(alu_opcode_props[ains->alu.op].props & UNITS_SCALAR))
- return false;
-
/* Do we try to use it as a vector op? */
if (!is_single_component_mask(ains->mask))
return false;
/* Exclude this destination (if not ~0) */
unsigned exclude;
+
+ /* Don't schedule instructions consuming conditionals (since we already
+ * scheduled one). Excludes conditional branches and csel */
+ bool no_cond;
+
+ /* Require a minimal mask and (if nonzero) given destination. Used for
+ * writeout optimizations */
+
+ unsigned mask;
+ unsigned dest;
};
/* For an instruction that can fit, adjust it to fit and update the constants
if (!ins->has_constants)
return true;
- /* TODO: Deduplicate; permit multiple constants within a bundle */
+ if (ins->alu.reg_mode != midgard_reg_mode_32) {
+ /* TODO: 16-bit constant combining */
+ if (pred->constant_count)
+ return false;
- if (destructive && !pred->constant_count) {
- if (ins->alu.reg_mode == midgard_reg_mode_16) {
- /* TODO: Fix packing XXX */
- uint16_t *bundles = (uint16_t *) pred->constants;
- uint32_t *constants = (uint32_t *) ins->constants;
+ uint16_t *bundles = (uint16_t *) pred->constants;
+ uint32_t *constants = (uint32_t *) ins->constants;
- /* Copy them wholesale */
- for (unsigned i = 0; i < 4; ++i)
- bundles[i] = constants[i];
- } else {
- memcpy(pred->constants, ins->constants, 16);
- }
+ /* Copy them wholesale */
+ for (unsigned i = 0; i < 4; ++i)
+ bundles[i] = constants[i];
pred->constant_count = 16;
- return true;
+ } else {
+ /* Pack 32-bit constants */
+ uint32_t *bundles = (uint32_t *) pred->constants;
+ uint32_t *constants = (uint32_t *) ins->constants;
+ unsigned r_constant = SSA_FIXED_REGISTER(REGISTER_CONSTANT);
+ unsigned mask = mir_from_bytemask(mir_bytemask_of_read_components(ins, r_constant), midgard_reg_mode_32);
+
+ /* First, check if it fits */
+ unsigned count = DIV_ROUND_UP(pred->constant_count, sizeof(uint32_t));
+ unsigned existing_count = count;
+
+ for (unsigned i = 0; i < 4; ++i) {
+ if (!(mask & (1 << i)))
+ continue;
+
+ bool ok = false;
+
+ /* Look for existing constant */
+ for (unsigned j = 0; j < existing_count; ++j) {
+ if (bundles[j] == constants[i]) {
+ ok = true;
+ break;
+ }
+ }
+
+ if (ok)
+ continue;
+
+ /* If the constant is new, check ourselves */
+ for (unsigned j = 0; j < i; ++j) {
+ if (constants[j] == constants[i]) {
+ ok = true;
+ break;
+ }
+ }
+
+ if (ok)
+ continue;
+
+ /* Otherwise, this is a new constant */
+ count++;
+ }
+
+ /* Check if we have space */
+ if (count > 4)
+ return false;
+
+ /* If non-destructive, we're done */
+ if (!destructive)
+ return true;
+
+ /* If destructive, let's copy in the new constants and adjust
+ * swizzles to pack it in. */
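+
+ /* For example, if the bundle already holds { A, B } and this
+ * instruction reads { B, C }, only C is appended; indices[] then
+ * rewrites the constant-source swizzle so the reads hit slots 1 and 2
+ * of the shared bundle constants. */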
+
+ unsigned indices[16] = { 0 };
+
+ /* Reset count */
+ count = existing_count;
+
+ for (unsigned i = 0; i < 4; ++i) {
+ if (!(mask & (1 << i)))
+ continue;
+
+ uint32_t cons = constants[i];
+ bool constant_found = false;
+
+ /* Search for the constant */
+ for (unsigned j = 0; j < count; ++j) {
+ if (bundles[j] != cons)
+ continue;
+
+ /* We found it, reuse */
+ indices[i] = j;
+ constant_found = true;
+ break;
+ }
+
+ if (constant_found)
+ continue;
+
+ /* We didn't find it, so allocate it */
+ unsigned idx = count++;
+
+ /* We have space, copy it in! */
+ bundles[idx] = cons;
+ indices[i] = idx;
+ }
+
+ pred->constant_count = count * sizeof(uint32_t);
+
+ /* Use indices as a swizzle */
+
+ mir_foreach_src(ins, s) {
+ if (ins->src[s] == r_constant)
+ mir_compose_swizzle(ins->swizzle[s], indices, ins->swizzle[s]);
+ }
}
- return !pred->constant_count;
+ return true;
}
static midgard_instruction *
bool alu = tag == TAG_ALU_4;
unsigned unit = predicate->unit;
bool branch = alu && (unit == ALU_ENAB_BR_COMPACT);
+ bool scalar = (unit != ~0) && (unit & UNITS_SCALAR);
+ bool no_cond = predicate->no_cond;
+
+ unsigned mask = predicate->mask;
+ unsigned dest = predicate->dest;
+ bool needs_dest = mask & 0xF;
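+
+ /* mask/dest come from the writeout path: they request an instruction
+ * writing the given destination whose mask covers at least the
+ * requested components, so the bundle can fill r0 without an extra
+ * move. */
+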
/* Iterate to find the best instruction satisfying the predicate */
unsigned i;
BITSET_WORD tmp;
signed best_index = -1;
+ bool best_conditional = false;
/* Enforce a simple metric limiting distance to keep down register
* pressure. TODO: replace with liveness tracking for much better
if (branch && !instructions[i]->compact_branch)
continue;
+ if (alu && scalar && !mir_is_scalar(instructions[i]))
+ continue;
+
if (alu && !mir_adjust_constants(instructions[i], predicate, false))
continue;
+ if (needs_dest && instructions[i]->dest != dest)
+ continue;
+
+ if (mask && ((~instructions[i]->mask) & mask))
+ continue;
+
+ bool conditional = alu && !branch && OP_IS_CSEL(instructions[i]->alu.op);
+ conditional |= (branch && !instructions[i]->prepacked_branch && instructions[i]->branch.conditional);
+
+ if (conditional && no_cond)
+ continue;
+
/* Simulate in-order scheduling */
if ((signed) i < best_index)
continue;
best_index = i;
+ best_conditional = conditional;
}
if (alu)
mir_adjust_constants(instructions[best_index], predicate, true);
+
+ /* Once we schedule a conditional, we can't again */
+ predicate->no_cond |= best_conditional;
}
return instructions[best_index];
mir_comparison_mobile(
compiler_context *ctx,
midgard_instruction **instructions,
+ struct midgard_predicate *predicate,
unsigned count,
unsigned cond)
{
if (GET_CHANNEL_COUNT(alu_opcode_props[instructions[i]->alu.op].props))
return ~0;
- /* TODO: moving conditionals with constants */
+ /* Ensure it will fit with constants */
- if (instructions[i]->has_constants)
+ if (!mir_adjust_constants(instructions[i], predicate, false))
return ~0;
/* Ensure it is written only once */
ret = i;
}
+ /* Inject constants now that we are sure we want to */
+ if (ret != ~0)
+ mir_adjust_constants(instructions[ret], predicate, true);
+
return ret;
}
mir_schedule_comparison(
compiler_context *ctx,
midgard_instruction **instructions,
+ struct midgard_predicate *predicate,
BITSET_WORD *worklist, unsigned count,
- unsigned cond, bool vector, unsigned swizzle,
+ unsigned cond, bool vector, unsigned *swizzle,
midgard_instruction *user)
{
/* TODO: swizzle when scheduling */
unsigned comp_i =
- (!vector && (swizzle == 0)) ?
- mir_comparison_mobile(ctx, instructions, count, cond) : ~0;
+ (!vector && (swizzle[0] == 0)) ?
+ mir_comparison_mobile(ctx, instructions, predicate, count, cond) : ~0;
/* If we can, schedule the condition immediately */
if ((comp_i != ~0) && BITSET_TEST(worklist, comp_i)) {
}
/* Otherwise, we insert a move */
- midgard_vector_alu_src csel = {
- .swizzle = swizzle
- };
- midgard_instruction mov = v_mov(cond, csel, cond);
+ midgard_instruction mov = v_mov(cond, cond);
mov.mask = vector ? 0xF : 0x1;
+ memcpy(mov.swizzle[1], swizzle, sizeof(mov.swizzle[1]));
return mir_insert_instruction_before(ctx, user, mov);
}
/* Grab the conditional instruction */
midgard_instruction *cond = mir_schedule_comparison(
- ctx, instructions, worklist, count, last->src[condition_index],
- vector, last->cond_swizzle, last);
+ ctx, instructions, predicate, worklist, count, last->src[condition_index],
+ vector, last->swizzle[2], last);
/* We have exclusive reign over this (possibly move) conditional
* instruction. We can rewrite into a pipeline conditional register */
if (cond->src[s] == ~0)
continue;
- mir_set_swizzle(cond, s, (mir_get_swizzle(cond, s) << (2*3)) & 0xFF);
+ for (unsigned q = 0; q < 4; ++q)
+ cond->swizzle[s][q + COMPONENT_W] = cond->swizzle[s][q];
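+
+ /* Only component W of the pipeline register is consumed for a
+ * scalar condition, and entry W now holds whatever previously fed
+ * .x */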
}
}
unreachable("Bad condition");
}
+ mir_choose_alu(&smul, instructions, worklist, len, &predicate, UNIT_SMUL);
+
if (!writeout)
mir_choose_alu(&vlut, instructions, worklist, len, &predicate, UNIT_VLUT);
unreachable("Bad condition");
}
+ /* Stage 2, let's schedule sadd before vmul for writeout */
+ mir_choose_alu(&sadd, instructions, worklist, len, &predicate, UNIT_SADD);
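+
+ /* (vmul is deliberately left open here; the writeout path below may
+ * claim it to stage the missing components of r0) */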
+
/* Check if writeout reads its own register */
bool bad_writeout = false;
bad_writeout |= mir_has_arg(stages[i], branch->src[0]);
}
- /* Add a move if necessary */
+ /* It's possible we'll be able to schedule something into vmul
+ * to fill r0. Let's peek into the future, trying to schedule
+ * vmul specially with that in mind. */
+
+ if (!bad_writeout && writeout_mask != 0xF) {
+ predicate.unit = UNIT_VMUL;
+ predicate.dest = src;
+ predicate.mask = writeout_mask ^ 0xF;
+
+ struct midgard_instruction *peaked =
+ mir_choose_instruction(instructions, worklist, len, &predicate);
+
+ if (peaked) {
+ vmul = peaked;
+ vmul->unit = UNIT_VMUL;
+ writeout_mask |= predicate.mask;
+ assert(writeout_mask == 0xF);
+ }
+
+ /* Cleanup */
+ predicate.dest = predicate.mask = 0;
+ }
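+
+ /* For example, with writeout_mask == 0x7 (.xyz already staged), the
+ * predicate asks for an instruction writing .w of src; if one exists
+ * it becomes the vmul stage and r0 is completed without the copy
+ * below. */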
+
+ /* Finally, add a move if necessary */
if (bad_writeout || writeout_mask != 0xF) {
unsigned temp = (branch->src[0] == ~0) ? SSA_FIXED_REGISTER(0) : make_compiler_temp(ctx);
- midgard_instruction mov = v_mov(src, blank_alu_src, temp);
+ midgard_instruction mov = v_mov(src, temp);
vmul = mem_dup(&mov, sizeof(midgard_instruction));
vmul->unit = UNIT_VMUL;
vmul->mask = 0xF ^ writeout_mask;
/* Blend constant was backwards as well. blend_offset if set is
* strictly positive, as an offset of zero would imply constants before
- * any instructions which is invalid in Midgard */
+ * any instructions, which is invalid in Midgard. TODO: blend constants
+ * are broken if you spill, since quadword_count then becomes invalid
+ * XXX */
if (blend_offset)
ctx->blend_constant_offset = ((ctx->quadword_count + block->quadword_count) - blend_offset - 1) * 0x10;
return temp;
}
-/* Reassigns numbering to get rid of gaps in the indices */
+/* Reassigns numbering to get rid of gaps in the indices and to prioritize
+ * smaller register classes */
static void
mir_squeeze_index(compiler_context *ctx)
/* TODO don't leak old hash_to_temp */
ctx->hash_to_temp = _mesa_hash_table_u64_create(NULL);
+ /* We need to prioritize texture registers on older GPUs so we don't
+ * fail RA trying to assign to work registers r0/r1 when a work
+ * register is already there */
+
mir_foreach_instr_global(ctx, ins) {
- ins->dest = find_or_allocate_temp(ctx, ins->dest);
+ if (ins->type == TAG_TEXTURE_4)
+ ins->dest = find_or_allocate_temp(ctx, ins->dest);
+ }
+
+ mir_foreach_instr_global(ctx, ins) {
+ if (ins->type != TAG_TEXTURE_4)
+ ins->dest = find_or_allocate_temp(ctx, ins->dest);
for (unsigned i = 0; i < ARRAY_SIZE(ins->src); ++i)
ins->src[i] = find_or_allocate_temp(ctx, ins->src[i]);
.mask = mask,
.dest = ~0,
.src = { ~0, ~0, ~0 },
+ .swizzle = SWIZZLE_IDENTITY_4,
.load_store = {
.op = is_store ? midgard_op_st_int4 : midgard_op_ld_int4,
- .swizzle = SWIZZLE_XYZW,
/* For register spilling - to thread local storage */
.arg_1 = 0xEA,
.arg_2 = 0x1E,
-
- /* Splattered across, TODO combine logically */
- .varying_parameters = (byte & 0x1FF) << 1,
- .address = (byte >> 9)
},
/* If we spill an unspill, RA goes into an infinite loop */
.no_spill = true
};
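+ /* The TLS byte offset rides in constants[0]; it is split into the
+ * load/store word's varying_parameters/address fields at pack time
+ * instead of being encoded inline here. */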
+ ins.constants[0] = byte;
+
if (is_store) {
/* r0 = r26, r1 = r27 */
assert(srcdest == SSA_FIXED_REGISTER(26) || srcdest == SSA_FIXED_REGISTER(27));
static void mir_spill_register(
compiler_context *ctx,
- struct ra_graph *g,
+ struct lcra_state *l,
unsigned *spill_count)
{
unsigned spill_index = ctx->temp_count;
- * spill node. All nodes are equal in spill cost, but we can't spill
- * nodes written to from an unspill */
+ * spill node. A node's spill cost scales with its use count, but we
+ * can't spill nodes written to from an unspill */
- for (unsigned i = 0; i < ctx->temp_count; ++i) {
- ra_set_node_spill_cost(g, i, 1.0);
+ unsigned *cost = calloc(ctx->temp_count, sizeof(cost[0]));
+
+ mir_foreach_instr_global(ctx, ins) {
+ if (ins->dest < ctx->temp_count)
+ cost[ins->dest]++;
+
+ mir_foreach_src(ins, s) {
+ if (ins->src[s] < ctx->temp_count)
+ cost[ins->src[s]]++;
+ }
}
+ for (unsigned i = 0; i < ctx->temp_count; ++i)
+ lcra_set_node_spill_cost(l, i, cost[i]);
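+
+ /* Spill cost is the node's static use count, so rarely-touched values
+ * are preferred spill candidates; a negative cost marks a node as
+ * unspillable. */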
+
/* We can't spill any bundles that contain unspills. This could be
* optimized to allow use of r27 to spill twice per bundle, but if
- * you're at the point of optimizing spilling, it's too late. */
+ * you're at the point of optimizing spilling, it's too late.
+ *
+ * We also can't double-spill. */
mir_foreach_block(ctx, block) {
mir_foreach_bundle_in_block(block, bun) {
bool no_spill = false;
- for (unsigned i = 0; i < bun->instruction_count; ++i)
+ for (unsigned i = 0; i < bun->instruction_count; ++i) {
no_spill |= bun->instructions[i]->no_spill;
+ if (bun->instructions[i]->no_spill) {
+ mir_foreach_src(bun->instructions[i], s) {
+ unsigned src = bun->instructions[i]->src[s];
+
+ if (src < ctx->temp_count)
+ lcra_set_node_spill_cost(l, src, -1);
+ }
+ }
+ }
+
if (!no_spill)
continue;
for (unsigned i = 0; i < bun->instruction_count; ++i) {
unsigned dest = bun->instructions[i]->dest;
if (dest < ctx->temp_count)
- ra_set_node_spill_cost(g, dest, -1.0);
+ lcra_set_node_spill_cost(l, dest, -1);
}
}
}
- int spill_node = ra_get_best_spill_node(g);
+ int spill_node = lcra_get_best_spill_node(l);
if (spill_node < 0) {
mir_print_shader(ctx);
* legitimately spill to TLS, but special registers just spill to work
* registers */
- unsigned class = ra_get_node_class(g, spill_node);
- bool is_special = (class >> 2) != REG_CLASS_WORK;
- bool is_special_w = (class >> 2) == REG_CLASS_TEXW;
+ bool is_special = l->class[spill_node] != REG_CLASS_WORK;
+ bool is_special_w = l->class[spill_node] == REG_CLASS_TEXW;
/* Allocate TLS slot (maybe) */
unsigned spill_slot = !is_special ? (*spill_count)++ : 0;
midgard_instruction st;
if (is_special_w) {
- st = v_mov(spill_node, blank_alu_src, spill_slot);
+ st = v_mov(spill_node, spill_slot);
st.no_spill = true;
} else {
ins->dest = SSA_FIXED_REGISTER(26);
}
}
- /* For special reads, figure out how many components we need */
- unsigned read_mask = 0;
+ /* For special reads, figure out how many bytes we need */
+ unsigned read_bytemask = 0;
mir_foreach_instr_global_safe(ctx, ins) {
- read_mask |= mir_mask_of_read_components(ins, spill_node);
+ read_bytemask |= mir_bytemask_of_read_components(ins, spill_node);
}
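+
+ /* The union of bytes read anywhere in the shader sizes the fills:
+ * each unspill is masked (via mir_from_bytemask below) to load only
+ * the components actually consumed, which also prevents RA loops. */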
/* Insert a load from TLS before the first consecutive
midgard_instruction *before = ins;
- /* For a csel, go back one more not to break up the bundle */
+ /* TODO: Remove me I'm a fossil */
if (ins->type == TAG_ALU_4 && OP_IS_CSEL(ins->alu.op))
before = mir_prev_op(before);
if (is_special) {
/* Move */
- st = v_mov(spill_node, blank_alu_src, consecutive_index);
+ st = v_mov(spill_node, consecutive_index);
st.no_spill = true;
} else {
/* TLS load */
}
/* Mask the load based on the component count
- * actually needed to prvent RA loops */
+ * actually needed to prevent RA loops */
- st.mask = read_mask;
+ st.mask = mir_from_bytemask(read_bytemask, midgard_reg_mode_32);
mir_insert_instruction_before_scheduled(ctx, block, before, st);
// consecutive_skip = true;
void
schedule_program(compiler_context *ctx)
{
- struct ra_graph *g = NULL;
+ struct lcra_state *l = NULL;
bool spilled = false;
int iter_count = 1000; /* max iterations */
do {
if (spilled)
- mir_spill_register(ctx, g, &spill_count);
+ mir_spill_register(ctx, l, &spill_count);
mir_squeeze_index(ctx);
+ mir_invalidate_liveness(ctx);
- g = NULL;
- g = allocate_registers(ctx, &spilled);
+ l = NULL;
+ l = allocate_registers(ctx, &spilled);
} while(spilled && ((iter_count--) > 0));
if (iter_count <= 0) {
ctx->tls_size = spill_count * 16;
- install_registers(ctx, g);
+ install_registers(ctx, l);
}