static bool
can_run_concurrent_ssa(midgard_instruction *first, midgard_instruction *second)
{
+ /* Writeout has its own rules anyway */
+ if (first->compact_branch || second->compact_branch)
+ return true;
+
/* Each instruction reads some registers and writes to a register. See
* where the first writes */
- /* Figure out where exactly we wrote to */
- int source = first->ssa_args.dest;
+ int source = first->dest;
int source_mask = first->mask;
/* As long as the second doesn't read from the first, we're okay */
- if (second->ssa_args.src0 == source) {
- if (first->type == TAG_ALU_4) {
- /* Figure out which components we just read from */
-
- int q = second->alu.src1;
- midgard_vector_alu_src *m = (midgard_vector_alu_src *) &q;
+ for (unsigned i = 0; i < ARRAY_SIZE(second->src); ++i) {
+ if (second->src[i] != source)
+ continue;
- /* Check if there are components in common, and fail if so */
- if (swizzle_to_access_mask(m->swizzle) & source_mask)
- return false;
- } else
+ if (first->type != TAG_ALU_4)
return false;
- }
+ /* Figure out which components we just read from */
- if (second->ssa_args.src1 == source)
- return false;
+ int q = (i == 0) ? second->alu.src1 : second->alu.src2;
+ midgard_vector_alu_src *m = (midgard_vector_alu_src *) &q;
+
+ /* Check if there are components in common, and fail if so */
+ if (swizzle_to_access_mask(m->swizzle) & source_mask)
+ return false;
+ }
/* Otherwise, it's safe in that regard. Another data hazard is both
* writing to the same place, of course */
- if (second->ssa_args.dest == source) {
+ if (second->dest == source) {
/* ...but only if the components overlap */
if (second->mask & source_mask)
return false;
}
+/* Fragment writeout (of r0) is allowed when:
+ *
+ * - All components of r0 are written in the bundle
+ * - No components of r0 are written in VLUT
+ * - Non-pipelined dependencies of r0 are not written in the bundle
+ *
+ * This function checks if these requirements are satisfied given the content
+ * of a scheduled bundle.
+ */
+
+static bool
+can_writeout_fragment(compiler_context *ctx, midgard_instruction **bundle, unsigned count, unsigned node_count)
+{
+ /* First scan for which components of r0 are written out. Initially
+ * none are written */
+
+ uint8_t r0_written_mask = 0x0;
+
+ /* Simultaneously we scan for the set of dependencies */
+
+ size_t sz = sizeof(BITSET_WORD) * BITSET_WORDS(node_count);
+ BITSET_WORD *dependencies = alloca(sz);
+ memset(dependencies, 0, sz);
+
+ for (unsigned i = 0; i < count; ++i) {
+ midgard_instruction *ins = bundle[i];
+
+ if (ins->dest != SSA_FIXED_REGISTER(0))
+ continue;
+
+ /* Record written out mask */
+ r0_written_mask |= ins->mask;
+
+ /* Record dependencies, but only if they won't become pipeline
+ * registers. We know we can't be live after this, because
+ * writeout happens at the very end of the shader. So check if
+ * they were written before us. */
+
+ unsigned src0 = ins->src[0];
+ unsigned src1 = ins->src[1];
+
+ if (!mir_is_written_before(ctx, bundle[0], src0))
+ src0 = ~0;
+
+ if (!mir_is_written_before(ctx, bundle[0], src1))
+ src1 = ~0;
+
+ if (src0 < node_count)
+ BITSET_SET(dependencies, src0);
+
+ if (src1 < node_count)
+ BITSET_SET(dependencies, src1);
+
+ /* Requirement 2 */
+ if (ins->unit == UNIT_VLUT)
+ return false;
+ }
+
+ /* Requirement 1 */
+ if ((r0_written_mask & 0xF) != 0xF)
+ return false;
+
+ /* Requirement 3 */
+
+ for (unsigned i = 0; i < count; ++i) {
+ unsigned dest = bundle[i]->dest;
+
+ if (dest < node_count && BITSET_TEST(dependencies, dest))
+ return false;
+ }
+
+ /* Otherwise, we're good to go */
+ return true;
+}
+
/* Schedules, but does not emit, a single basic block. After scheduling, the
* final tag and size of the block are known, which are necessary for branching
* */
int instructions_emitted = 0, packed_idx = 0;
midgard_bundle bundle = { 0 };
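+ /* Track the instructions scheduled into this bundle so far, so
+ * writeout legality can be checked against the whole bundle */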
+ midgard_instruction *scheduled[5] = { NULL };
+
uint8_t tag = ins->type;
/* Default to the instruction's tag */
could_scalar &= !s1.half;
- if (!ains->ssa_args.inline_constant) {
- midgard_vector_alu_src s2 =
- vector_alu_from_unsigned(ains->alu.src2);
-
- could_scalar &= !s2.half;
- }
+ midgard_vector_alu_src s2 =
+ vector_alu_from_unsigned(ains->alu.src2);
+ could_scalar &= !s2.half;
}
bool scalar = could_scalar && scalarable;
else
break;
} else {
- if ((units & UNIT_SADD) && !(control & UNIT_SADD) && !midgard_has_hazard(segment, segment_size, ains))
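+ /* Prefer the vector multiplier while it is still reachable in
+ * pipeline order, then fall back through SADD, VADD, SMUL and
+ * finally VLUT */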
+ if ((units & UNIT_VMUL) && (last_unit < UNIT_VMUL))
+ unit = UNIT_VMUL;
+ else if ((units & UNIT_SADD) && !(control & UNIT_SADD) && !midgard_has_hazard(segment, segment_size, ains))
unit = UNIT_SADD;
- else if (units & UNIT_SMUL)
- unit = ((units & UNIT_VMUL) && !(control & UNIT_VMUL)) ? UNIT_VMUL : UNIT_SMUL;
- else if ((units & UNIT_VADD) && !(control & UNIT_VADD))
+ else if (units & UNIT_VADD)
unit = UNIT_VADD;
+ else if (units & UNIT_SMUL)
+ unit = UNIT_SMUL;
+ else if (units & UNIT_VLUT)
+ unit = UNIT_VLUT;
else
break;
}
unsigned swizzle = SWIZZLE_FROM_ARRAY(indices);
unsigned r_constant = SSA_FIXED_REGISTER(REGISTER_CONSTANT);
- if (ains->ssa_args.src0 == r_constant)
+ if (ains->src[0] == r_constant)
ains->alu.src1 = vector_alu_apply_swizzle(ains->alu.src1, swizzle);
- if (ains->ssa_args.src1 == r_constant)
+ if (ains->src[1] == r_constant)
ains->alu.src2 = vector_alu_apply_swizzle(ains->alu.src2, swizzle);
bundle.has_embedded_constants = true;
/* All of r0 has to be written out along with
* the branch writeout */
- if (ains->writeout) {
- /* The rules for when "bare" writeout
- * is safe are when all components are
- * r0 are written out in the final
- * bundle, earlier than VLUT, where any
- * register dependencies of r0 are from
- * an earlier bundle. We can't verify
- * this before RA, so we don't try. */
-
+ if (ains->writeout && !can_writeout_fragment(ctx, scheduled, index, ctx->temp_count)) {
+ /* We only work on full moves
+ * at the beginning. We could
+ * probably do better */
if (index != 0)
break;
}
/* Defer marking until after writing to allow for break */
+ scheduled[index] = ains;
control |= ains->unit;
last_unit = ains->unit;
++instructions_emitted;
if (c->type != TAG_LOAD_STORE_4) continue;
- /* Stores cannot be reordered, since they have
- * dependencies. For the same reason, indirect
- * loads cannot be reordered as their index is
- * loaded in r27.w */
+ /* We can only reorder if there are no sources */
- if (OP_IS_STORE(c->load_store.op)) continue;
+ bool deps = false;
- /* It appears the 0x800 bit is set whenever a
- * load is direct, unset when it is indirect.
- * Skip indirect loads. */
+ for (unsigned s = 0; s < ARRAY_SIZE(c->src); ++s)
+ deps |= (c->src[s] != ~0);
- if (!(c->load_store.unknown & 0x800)) continue;
+ if (deps)
+ continue;
/* We found one! Move it up to pair and remove it from the old location */
static unsigned
find_or_allocate_temp(compiler_context *ctx, unsigned hash)
{
- if ((hash < 0) || (hash >= SSA_FIXED_MINIMUM))
+ if (hash >= SSA_FIXED_MINIMUM)
return hash;
unsigned temp = (uintptr_t) _mesa_hash_table_u64_search(
ctx->hash_to_temp = _mesa_hash_table_u64_create(NULL);
mir_foreach_instr_global(ctx, ins) {
- if (ins->compact_branch) continue;
-
- ins->ssa_args.dest = find_or_allocate_temp(ctx, ins->ssa_args.dest);
- ins->ssa_args.src0 = find_or_allocate_temp(ctx, ins->ssa_args.src0);
-
- if (!ins->ssa_args.inline_constant)
- ins->ssa_args.src1 = find_or_allocate_temp(ctx, ins->ssa_args.src1);
+ ins->dest = find_or_allocate_temp(ctx, ins->dest);
+ for (unsigned i = 0; i < ARRAY_SIZE(ins->src); ++i)
+ ins->src[i] = find_or_allocate_temp(ctx, ins->src[i]);
}
}
midgard_instruction ins = {
.type = TAG_LOAD_STORE_4,
.mask = mask,
- .ssa_args = {
- .dest = -1,
- .src0 = -1,
- .src1 = -1
- },
+ .dest = ~0,
+ .src = { ~0, ~0, ~0 },
.load_store = {
.op = is_store ? midgard_op_st_int4 : midgard_op_ld_int4,
.swizzle = SWIZZLE_XYZW,
/* For register spilling - to thread local storage */
- .unknown = 0x1EEA,
+ .arg_1 = 0xEA,
+ .arg_2 = 0x1E,
/* Splattered across, TODO combine logically */
.varying_parameters = (byte & 0x1FF) << 1,
.address = (byte >> 9)
- }
+ },
+
+ /* If we spill an unspill, RA goes into an infinite loop */
+ .no_spill = true
};
if (is_store) {
/* r0 = r26, r1 = r27 */
assert(srcdest == SSA_FIXED_REGISTER(26) || srcdest == SSA_FIXED_REGISTER(27));
- ins.ssa_args.src0 = (srcdest == SSA_FIXED_REGISTER(27)) ? SSA_FIXED_REGISTER(1) : SSA_FIXED_REGISTER(0);
+ ins.src[0] = srcdest;
} else {
- ins.ssa_args.dest = srcdest;
+ ins.dest = srcdest;
}
return ins;
}
-void
-schedule_program(compiler_context *ctx)
+/* If register allocation fails, find the best spill node and spill it to fix
+ * whatever the issue was. This spill node could be a work register (spilling
+ * to thread local storage), but it could also simply be a special register
+ * that needs to spill to become a work register. */
+
+static void mir_spill_register(
+ compiler_context *ctx,
+ struct ra_graph *g,
+ unsigned *spill_count)
{
- struct ra_graph *g = NULL;
- bool spilled = false;
- int iter_count = 1000; /* max iterations */
+ unsigned spill_index = ctx->temp_count;
- /* Number of 128-bit slots in memory we've spilled into */
- unsigned spill_count = 0;
+ /* Our first step is to calculate spill cost to figure out the best
+ * spill node. All nodes are equal in spill cost, but we can't spill
+ * nodes written to from an unspill */
- midgard_promote_uniforms(ctx, 8);
+ for (unsigned i = 0; i < ctx->temp_count; ++i) {
+ ra_set_node_spill_cost(g, i, 1.0);
+ }
- mir_foreach_block(ctx, block) {
- midgard_pair_load_store(ctx, block);
+ mir_foreach_instr_global(ctx, ins) {
+ if (ins->no_spill &&
+ ins->dest >= 0 &&
+ ins->dest < ctx->temp_count)
+ ra_set_node_spill_cost(g, ins->dest, -1.0);
}
- do {
- /* If we spill, find the best spill node and spill it */
+ int spill_node = ra_get_best_spill_node(g);
- unsigned spill_index = ctx->temp_count;
- if (g && spilled) {
- /* All nodes are equal in spill cost, but we can't
- * spill nodes written to from an unspill */
+ if (spill_node < 0) {
+ mir_print_shader(ctx);
+ assert(0);
+ }
- for (unsigned i = 0; i < ctx->temp_count; ++i) {
- ra_set_node_spill_cost(g, i, 1.0);
- }
+ /* We have a spill node, so check the class. Work registers
+ * legitimately spill to TLS, but special registers just spill to work
+ * registers */
- mir_foreach_instr_global(ctx, ins) {
- if (ins->type != TAG_LOAD_STORE_4) continue;
- if (ins->load_store.op != midgard_op_ld_int4) continue;
- if (ins->load_store.unknown != 0x1EEA) continue;
- ra_set_node_spill_cost(g, ins->ssa_args.dest, -1.0);
- }
+ unsigned class = ra_get_node_class(g, spill_node);
+ bool is_special = (class >> 2) != REG_CLASS_WORK;
+ bool is_special_w = (class >> 2) == REG_CLASS_TEXW;
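+ /* Texture-write (TEXW) nodes spill on the write side: the value is
+ * copied out to a work register right after it is produced, rather
+ * than reloaded before each read */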
- int spill_node = ra_get_best_spill_node(g);
+ /* Allocate TLS slot (maybe) */
+ unsigned spill_slot = !is_special ? (*spill_count)++ : 0;
- if (spill_node < 0) {
- mir_print_shader(ctx);
- assert(0);
- }
+ /* For TLS, replace all stores to the spilled node. For
+ * special reads, just keep as-is; the class will be demoted
+ * implicitly. For special writes, spill to a work register */
- /* Allocate TLS slot */
- unsigned spill_slot = spill_count++;
+ if (!is_special || is_special_w) {
+ if (is_special_w)
+ spill_slot = spill_index++;
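+ /* For a special write, the "slot" is a fresh node (to be backed by
+ * a work register) rather than a TLS slot */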
- /* Replace all stores to the spilled node with stores
- * to TLS */
+ mir_foreach_instr_global_safe(ctx, ins) {
+ if (ins->dest != spill_node) continue;
- mir_foreach_instr_global_safe(ctx, ins) {
- if (ins->compact_branch) continue;
- if (ins->ssa_args.dest != spill_node) continue;
- ins->ssa_args.dest = SSA_FIXED_REGISTER(26);
+ midgard_instruction st;
- midgard_instruction st = v_load_store_scratch(ins->ssa_args.dest, spill_slot, true, ins->mask);
- mir_insert_instruction_before(mir_next_op(ins), st);
+ if (is_special_w) {
+ st = v_mov(spill_node, blank_alu_src, spill_slot);
+ st.no_spill = true;
+ } else {
+ ins->dest = SSA_FIXED_REGISTER(26);
+ st = v_load_store_scratch(ins->dest, spill_slot, true, ins->mask);
}
- /* Insert a load from TLS before the first consecutive
- * use of the node, rewriting to use spilled indices to
- * break up the live range */
+ /* Hint: don't rewrite this node */
+ st.hint = true;
- mir_foreach_block(ctx, block) {
+ mir_insert_instruction_before(mir_next_op(ins), st);
- bool consecutive_skip = false;
- unsigned consecutive_index = 0;
+ if (!is_special)
+ ctx->spills++;
+ }
+ }
- mir_foreach_instr_in_block(block, ins) {
- if (ins->compact_branch) continue;
-
- if (!mir_has_arg(ins, spill_node)) {
- consecutive_skip = false;
- continue;
- }
+ /* For special reads, figure out how many components we need */
+ unsigned read_mask = 0;
- if (consecutive_skip) {
- /* Rewrite */
- mir_rewrite_index_src_single(ins, spill_node, consecutive_index);
- continue;
- }
+ mir_foreach_instr_global_safe(ctx, ins) {
+ read_mask |= mir_mask_of_read_components(ins, spill_node);
+ }
+ /* Insert a load from TLS before the first consecutive
+ * use of the node, rewriting to use spilled indices to
+ * break up the live range. Or, for special, insert a
+ * move. Ironically the latter *increases* register
+ * pressure, but the two uses of the spilling mechanism
+ * are somewhat orthogonal. (special spilling is to use
+ * work registers to back special registers; TLS
+ * spilling is to use memory to back work registers) */
+
+ mir_foreach_block(ctx, block) {
+ bool consecutive_skip = false;
+ unsigned consecutive_index = 0;
+
+ mir_foreach_instr_in_block(block, ins) {
+ /* We can't rewrite the moves used to spill in the
+ * first place. These moves are hinted. */
+ if (ins->hint) continue;
+
+ if (!mir_has_arg(ins, spill_node)) {
+ consecutive_skip = false;
+ continue;
+ }
+
+ if (consecutive_skip) {
+ /* Rewrite */
+ mir_rewrite_index_src_single(ins, spill_node, consecutive_index);
+ continue;
+ }
+
+ if (!is_special_w) {
consecutive_index = ++spill_index;
- midgard_instruction st = v_load_store_scratch(consecutive_index, spill_slot, false, 0xF);
+
midgard_instruction *before = ins;
/* For a csel, go back one more not to break up the bundle */
if (ins->type == TAG_ALU_4 && OP_IS_CSEL(ins->alu.op))
before = mir_prev_op(before);
+ midgard_instruction st;
+
+ if (is_special) {
+ /* Move */
+ st = v_mov(spill_node, blank_alu_src, consecutive_index);
+ st.no_spill = true;
+ } else {
+ /* TLS load */
+ st = v_load_store_scratch(consecutive_index, spill_slot, false, 0xF);
+ }
+
+ /* Mask the load based on the component count
+ * actually needed to prevent RA loops */
+
+ st.mask = read_mask;
+
mir_insert_instruction_before(before, st);
// consecutive_skip = true;
+ } else {
+ /* Special writes already have their move spilled in */
+ consecutive_index = spill_slot;
+ }
- /* Rewrite to use */
- mir_rewrite_index_src_single(ins, spill_node, consecutive_index);
- }
- }
+ /* Rewrite to use */
+ mir_rewrite_index_src_single(ins, spill_node, consecutive_index);
+
+ if (!is_special)
+ ctx->fills++;
}
+ }
+
+ /* Reset hints */
+
+ mir_foreach_instr_global(ctx, ins) {
+ ins->hint = false;
+ }
+}
+
+void
+schedule_program(compiler_context *ctx)
+{
+ struct ra_graph *g = NULL;
+ bool spilled = false;
+ int iter_count = 1000; /* max iterations */
+
+ /* Number of 128-bit slots in memory we've spilled into */
+ unsigned spill_count = 0;
+
+ midgard_promote_uniforms(ctx, 16);
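+ /* Promote eligible uniforms into the register file before pairing
+ * and register allocation */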
+
+ mir_foreach_block(ctx, block) {
+ midgard_pair_load_store(ctx, block);
+ }
+
+ /* Must be lowered right before RA */
+ mir_squeeze_index(ctx);
+ mir_lower_special_reads(ctx);
+
+ /* Lowering can introduce some dead moves */
+
+ mir_foreach_block(ctx, block) {
+ midgard_opt_dead_move_eliminate(ctx, block);
+ }
+
+ do {
+ if (spilled)
+ mir_spill_register(ctx, g, &spill_count);
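+ /* Squeeze indices so temporaries added while spilling are counted
+ * before we retry allocation */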
mir_squeeze_index(ctx);
g = allocate_registers(ctx, &spilled);
} while(spilled && ((iter_count--) > 0));
- /* We would like to run RA after scheduling, but spilling can
- * complicate this */
+ /* We can simplify a bit after RA */
- mir_foreach_block(ctx, block) {
- schedule_block(ctx, block);
- }
-#if 0
+ mir_foreach_block(ctx, block) {
+ midgard_opt_post_move_eliminate(ctx, block, g);
+ }
+
+ /* After RA finishes, we schedule all at once */
- /* Pipeline registers creation is a prepass before RA */
- mir_create_pipeline_registers(ctx);
-#endif
+ mir_foreach_block(ctx, block) {
+ schedule_block(ctx, block);
+ }
+ /* Finally, we create pipeline registers as a peephole pass after
+ * scheduling. This isn't totally optimal, since there are cases where
+ * the usage of pipeline registers can eliminate spills, but it does
+ * save some power */
+ mir_create_pipeline_registers(ctx);
if (iter_count <= 0) {
fprintf(stderr, "panfrost: Gave up allocating registers, rendering will be incomplete\n");