X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fpanfrost%2Fmidgard%2Fmidgard_schedule.c;h=ff71e0dcd3f80c4a6258bb491a1d422ac19fa8eb;hb=34ff50cadd8157c5f41db301aa26d42af4c708b2;hp=862b9306c15ca069dcc011157be4db93d20fe83d;hpb=f0d0061b18aa39179552fe6f6c49e3f0ad63a9c1;p=mesa.git diff --git a/src/panfrost/midgard/midgard_schedule.c b/src/panfrost/midgard/midgard_schedule.c index 862b9306c15..ff71e0dcd3f 100644 --- a/src/panfrost/midgard/midgard_schedule.c +++ b/src/panfrost/midgard/midgard_schedule.c @@ -26,20 +26,146 @@ #include "util/u_memory.h" #include "util/register_allocate.h" -/* Create a mask of accessed components from a swizzle to figure out vector - * dependencies */ +/* Scheduling for Midgard is complicated, to say the least. ALU instructions + * must be grouped into VLIW bundles according to following model: + * + * [VMUL] [SADD] + * [VADD] [SMUL] [VLUT] + * + * A given instruction can execute on some subset of the units (or a few can + * execute on all). Instructions can be either vector or scalar; only scalar + * instructions can execute on SADD/SMUL units. Units on a given line execute + * in parallel. Subsequent lines execute separately and can pass results + * directly via pipeline registers r24/r25, bypassing the register file. + * + * A bundle can optionally have 128-bits of embedded constants, shared across + * all of the instructions within a bundle. + * + * Instructions consuming conditionals (branches and conditional selects) + * require their condition to be written into the conditional register (r31) + * within the same bundle they are consumed. + * + * Fragment writeout requires its argument to be written in full within the + * same bundle as the branch, with no hanging dependencies. + * + * Load/store instructions are also in bundles of simply two instructions, and + * texture instructions have no bundling. 
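+ *
+ * As a rough illustration of the bundle model (a sketch, not literal
+ * disassembly), a dependent multiply/add pair such as
+ *
+ *           vmul: r24 = fmul a, b
+ *           vadd: out = fadd r24, c
+ *
+ * can occupy the VMUL and VADD lines of a single bundle, with the
+ * intermediate value forwarded through the r24 pipeline register rather
+ * than the register file. Here a, b, c and out are placeholder operands.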
+ * + * ------------------------------------------------------------------------- + * + */ -static unsigned -swizzle_to_access_mask(unsigned swizzle) +/* We create the dependency graph with per-component granularity */ + +#define COMPONENT_COUNT 8 + +static void +add_dependency(struct util_dynarray *table, unsigned index, unsigned mask, midgard_instruction **instructions, unsigned child) { - unsigned component_mask = 0; + for (unsigned i = 0; i < COMPONENT_COUNT; ++i) { + if (!(mask & (1 << i))) + continue; + + struct util_dynarray *parents = &table[(COMPONENT_COUNT * index) + i]; + + util_dynarray_foreach(parents, unsigned, parent) { + BITSET_WORD *dependents = instructions[*parent]->dependents; - for (int i = 0; i < 4; ++i) { - unsigned c = (swizzle >> (2 * i)) & 3; - component_mask |= (1 << c); + /* Already have the dependency */ + if (BITSET_TEST(dependents, child)) + continue; + + BITSET_SET(dependents, child); + instructions[child]->nr_dependencies++; + } } +} - return component_mask; +static void +mark_access(struct util_dynarray *table, unsigned index, unsigned mask, unsigned parent) +{ + for (unsigned i = 0; i < COMPONENT_COUNT; ++i) { + if (!(mask & (1 << i))) + continue; + + util_dynarray_append(&table[(COMPONENT_COUNT * index) + i], unsigned, parent); + } +} + +static void +mir_create_dependency_graph(midgard_instruction **instructions, unsigned count, unsigned node_count) +{ + size_t sz = node_count * COMPONENT_COUNT; + + struct util_dynarray *last_read = calloc(sizeof(struct util_dynarray), sz); + struct util_dynarray *last_write = calloc(sizeof(struct util_dynarray), sz); + + for (unsigned i = 0; i < sz; ++i) { + util_dynarray_init(&last_read[i], NULL); + util_dynarray_init(&last_write[i], NULL); + } + + /* Initialize dependency graph */ + for (unsigned i = 0; i < count; ++i) { + instructions[i]->dependents = + calloc(BITSET_WORDS(count), sizeof(BITSET_WORD)); + + instructions[i]->nr_dependencies = 0; + } + + /* Populate dependency graph */ + for (signed i = count - 1; i >= 0; --i) { + if (instructions[i]->compact_branch) + continue; + + unsigned dest = instructions[i]->dest; + unsigned mask = instructions[i]->mask; + + mir_foreach_src((*instructions), s) { + unsigned src = instructions[i]->src[s]; + + if (src < node_count) { + unsigned readmask = mir_mask_of_read_components(instructions[i], src); + add_dependency(last_write, src, readmask, instructions, i); + } + } + + if (dest < node_count) { + add_dependency(last_read, dest, mask, instructions, i); + add_dependency(last_write, dest, mask, instructions, i); + mark_access(last_write, dest, mask, i); + } + + mir_foreach_src((*instructions), s) { + unsigned src = instructions[i]->src[s]; + + if (src < node_count) { + unsigned readmask = mir_mask_of_read_components(instructions[i], src); + mark_access(last_read, src, readmask, i); + } + } + } + + /* If there is a branch, all instructions depend on it, as interblock + * execution must be purely in-order */ + + if (instructions[count - 1]->compact_branch) { + BITSET_WORD *dependents = instructions[count - 1]->dependents; + + for (signed i = count - 2; i >= 0; --i) { + if (BITSET_TEST(dependents, i)) + continue; + + BITSET_SET(dependents, i); + instructions[i]->nr_dependencies++; + } + } + + /* Free the intermediate structures */ + for (unsigned i = 0; i < sz; ++i) { + util_dynarray_fini(&last_read[i]); + util_dynarray_fini(&last_write[i]); + } } /* Does the mask cover more than a scalar? 
*/ @@ -57,440 +183,696 @@ is_single_component_mask(unsigned mask) return components == 1; } -/* Checks for an SSA data hazard between two adjacent instructions, keeping in - * mind that we are a vector architecture and we can write to different - * components simultaneously */ +/* Helpers for scheudling */ static bool -can_run_concurrent_ssa(midgard_instruction *first, midgard_instruction *second) +mir_is_scalar(midgard_instruction *ains) { - /* Each instruction reads some registers and writes to a register. See - * where the first writes */ + /* Do we try to use it as a vector op? */ + if (!is_single_component_mask(ains->mask)) + return false; - /* Figure out where exactly we wrote to */ - int source = first->ssa_args.dest; - int source_mask = first->mask; + /* Otherwise, check mode hazards */ + bool could_scalar = true; - /* As long as the second doesn't read from the first, we're okay */ - if (second->ssa_args.src0 == source) { - if (first->type == TAG_ALU_4) { - /* Figure out which components we just read from */ + /* Only 16/32-bit can run on a scalar unit */ + could_scalar &= ains->alu.reg_mode != midgard_reg_mode_8; + could_scalar &= ains->alu.reg_mode != midgard_reg_mode_64; + could_scalar &= ains->alu.dest_override == midgard_dest_override_none; - int q = second->alu.src1; - midgard_vector_alu_src *m = (midgard_vector_alu_src *) &q; + if (ains->alu.reg_mode == midgard_reg_mode_16) { + /* If we're running in 16-bit mode, we + * can't have any 8-bit sources on the + * scalar unit (since the scalar unit + * doesn't understand 8-bit) */ - /* Check if there are components in common, and fail if so */ - if (swizzle_to_access_mask(m->swizzle) & source_mask) - return false; - } else - return false; + midgard_vector_alu_src s1 = + vector_alu_from_unsigned(ains->alu.src1); + + could_scalar &= !s1.half; + midgard_vector_alu_src s2 = + vector_alu_from_unsigned(ains->alu.src2); + + could_scalar &= !s2.half; } - if (second->ssa_args.src1 == source) - return false; + return could_scalar; +} + +/* How many bytes does this ALU instruction add to the bundle? */ + +static unsigned +bytes_for_instruction(midgard_instruction *ains) +{ + if (ains->unit & UNITS_ANY_VECTOR) + return sizeof(midgard_reg_info) + sizeof(midgard_vector_alu); + else if (ains->unit == ALU_ENAB_BRANCH) + return sizeof(midgard_branch_extended); + else if (ains->compact_branch) + return sizeof(ains->br_compact); + else + return sizeof(midgard_reg_info) + sizeof(midgard_scalar_alu); +} - /* Otherwise, it's safe in that regard. 
Another data hazard is both - * writing to the same place, of course */ +/* We would like to flatten the linked list of midgard_instructions in a bundle + * to an array of pointers on the heap for easy indexing */ - if (second->ssa_args.dest == source) { - /* ...but only if the components overlap */ +static midgard_instruction ** +flatten_mir(midgard_block *block, unsigned *len) +{ + *len = list_length(&block->instructions); - if (second->mask & source_mask) - return false; + if (!(*len)) + return NULL; + + midgard_instruction **instructions = + calloc(sizeof(midgard_instruction *), *len); + + unsigned i = 0; + + mir_foreach_instr_in_block(block, ins) + instructions[i++] = ins; + + return instructions; +} + +/* The worklist is the set of instructions that can be scheduled now; that is, + * the set of instructions with no remaining dependencies */ + +static void +mir_initialize_worklist(BITSET_WORD *worklist, midgard_instruction **instructions, unsigned count) +{ + for (unsigned i = 0; i < count; ++i) { + if (instructions[i]->nr_dependencies == 0) + BITSET_SET(worklist, i); } +} + +/* Update the worklist after an instruction terminates. Remove its edges from + * the graph and if that causes any node to have no dependencies, add it to the + * worklist */ + +static void +mir_update_worklist( + BITSET_WORD *worklist, unsigned count, + midgard_instruction **instructions, midgard_instruction *done) +{ + /* Sanity check: if no instruction terminated, there is nothing to do. + * If the instruction that terminated had dependencies, that makes no + * sense and means we messed up the worklist. Finally, as the purpose + * of this routine is to update dependents, we abort early if there are + * no dependents defined. */ + + if (!done) + return; + + assert(done->nr_dependencies == 0); + + if (!done->dependents) + return; + + /* We have an instruction with dependents. Iterate each dependent to + * remove one dependency (`done`), adding dependents to the worklist + * where possible. */ + + unsigned i; + BITSET_WORD tmp; + BITSET_FOREACH_SET(i, tmp, done->dependents, count) { + assert(instructions[i]->nr_dependencies); - /* ...That's it */ - return true; + if (!(--instructions[i]->nr_dependencies)) + BITSET_SET(worklist, i); + } + + free(done->dependents); } +/* While scheduling, we need to choose instructions satisfying certain + * criteria. As we schedule backwards, we choose the *last* instruction in the + * worklist to simulate in-order scheduling. Chosen instructions must satisfy a + * given predicate. */ + +struct midgard_predicate { + /* TAG or ~0 for dont-care */ + unsigned tag; + + /* True if we want to pop off the chosen instruction */ + bool destructive; + + /* For ALU, choose only this unit */ + unsigned unit; + + /* State for bundle constants. constants is the actual constants + * for the bundle. constant_count is the number of bytes (up to + * 16) currently in use for constants. When picking in destructive + * mode, the constants array will be updated, and the instruction + * will be adjusted to index into the constants array */ + + uint8_t *constants; + unsigned constant_count; + bool blend_constant; + + /* Exclude this destination (if not ~0) */ + unsigned exclude; + + /* Don't schedule instructions consuming conditionals (since we already + * scheduled one). Excludes conditional branches and csel */ + bool no_cond; +}; + +/* For an instruction that can fit, adjust it to fit and update the constants + * array, in destructive mode. Returns whether the fitting was successful. 
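+ *
+ * A typical non-destructive use is a feasibility check while picking an
+ * instruction for a bundle, e.g. (a sketch of the pattern used below):
+ *
+ *     if (!mir_adjust_constants(ins, predicate, false))
+ *             continue;
+ *
+ * followed by a destructive call once the instruction is actually chosen,
+ * so the bundle's constants are only updated for scheduled instructions.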
*/ + static bool -midgard_has_hazard( - midgard_instruction **segment, unsigned segment_size, - midgard_instruction *ains) +mir_adjust_constants(midgard_instruction *ins, + struct midgard_predicate *pred, + bool destructive) { - for (int s = 0; s < segment_size; ++s) - if (!can_run_concurrent_ssa(segment[s], ains)) + /* Blend constants dominate */ + if (ins->has_blend_constant) { + if (pred->constant_count) + return false; + else if (destructive) { + pred->blend_constant = true; + pred->constant_count = 16; return true; + } + } - return false; + /* No constant, nothing to adjust */ + if (!ins->has_constants) + return true; + /* TODO: Deduplicate; permit multiple constants within a bundle */ -} + if (destructive && !pred->constant_count) { + if (ins->alu.reg_mode == midgard_reg_mode_16) { + /* TODO: Fix packing XXX */ + uint16_t *bundles = (uint16_t *) pred->constants; + uint32_t *constants = (uint32_t *) ins->constants; -/* Schedules, but does not emit, a single basic block. After scheduling, the - * final tag and size of the block are known, which are necessary for branching - * */ + /* Copy them wholesale */ + for (unsigned i = 0; i < 4; ++i) + bundles[i] = constants[i]; + } else { + memcpy(pred->constants, ins->constants, 16); + } -static midgard_bundle -schedule_bundle(compiler_context *ctx, midgard_block *block, midgard_instruction *ins, int *skip) + pred->constant_count = 16; + return true; + } + + return !pred->constant_count; +} + +static midgard_instruction * +mir_choose_instruction( + midgard_instruction **instructions, + BITSET_WORD *worklist, unsigned count, + struct midgard_predicate *predicate) { - int instructions_emitted = 0, packed_idx = 0; - midgard_bundle bundle = { 0 }; + /* Parse the predicate */ + unsigned tag = predicate->tag; + bool alu = tag == TAG_ALU_4; + unsigned unit = predicate->unit; + bool branch = alu && (unit == ALU_ENAB_BR_COMPACT); + bool scalar = (unit != ~0) && (unit & UNITS_SCALAR); + bool no_cond = predicate->no_cond; + + /* Iterate to find the best instruction satisfying the predicate */ + unsigned i; + BITSET_WORD tmp; + + signed best_index = -1; + bool best_conditional = false; + + /* Enforce a simple metric limiting distance to keep down register + * pressure. 
TOOD: replace with liveness tracking for much better + * results */ + + unsigned max_active = 0; + unsigned max_distance = 6; + + BITSET_FOREACH_SET(i, tmp, worklist, count) { + max_active = MAX2(max_active, i); + } - uint8_t tag = ins->type; + BITSET_FOREACH_SET(i, tmp, worklist, count) { + if ((max_active - i) >= max_distance) + continue; - /* Default to the instruction's tag */ - bundle.tag = tag; + if (tag != ~0 && instructions[i]->type != tag) + continue; - switch (ins->type) { - case TAG_ALU_4: { - uint32_t control = 0; - size_t bytes_emitted = sizeof(control); + if (predicate->exclude != ~0 && instructions[i]->dest == predicate->exclude) + continue; - /* TODO: Constant combining */ - int index = 0, last_unit = 0; + if (alu && !branch && !(alu_opcode_props[instructions[i]->alu.op].props & unit)) + continue; - /* Previous instructions, for the purpose of parallelism */ - midgard_instruction *segment[4] = {0}; - int segment_size = 0; + if (branch && !instructions[i]->compact_branch) + continue; - instructions_emitted = -1; - midgard_instruction *pins = ins; + if (alu && scalar && !mir_is_scalar(instructions[i])) + continue; - unsigned constant_count = 0; + if (alu && !mir_adjust_constants(instructions[i], predicate, false)) + continue; - for (;;) { - midgard_instruction *ains = pins; + bool conditional = alu && !branch && OP_IS_CSEL(instructions[i]->alu.op); + conditional |= (branch && !instructions[i]->prepacked_branch && instructions[i]->branch.conditional); - /* Advance instruction pointer */ - if (index) { - ains = mir_next_op(pins); - pins = ains; - } + if (conditional && no_cond) + continue; - /* Out-of-work condition */ - if ((struct list_head *) ains == &block->instructions) - break; + /* Simulate in-order scheduling */ + if ((signed) i < best_index) + continue; - /* Ensure that the chain can continue */ - if (ains->type != TAG_ALU_4) break; + best_index = i; + best_conditional = conditional; + } - /* If there's already something in the bundle and we - * have weird scheduler constraints, break now */ - if (ains->precede_break && index) break; - /* According to the presentation "The ARM - * Mali-T880 Mobile GPU" from HotChips 27, - * there are two pipeline stages. Branching - * position determined experimentally. Lines - * are executed in parallel: - * - * [ VMUL ] [ SADD ] - * [ VADD ] [ SMUL ] [ LUT ] [ BRANCH ] - * - * Verify that there are no ordering dependencies here. - * - * TODO: Allow for parallelism!!! - */ + /* Did we find anything? 
*/ - /* Pick a unit for it if it doesn't force a particular unit */ + if (best_index < 0) + return NULL; - int unit = ains->unit; + /* If we found something, remove it from the worklist */ + assert(best_index < count); - if (!unit) { - int op = ains->alu.op; - int units = alu_opcode_props[op].props; + if (predicate->destructive) { + BITSET_CLEAR(worklist, best_index); - bool scalarable = units & UNITS_SCALAR; - bool could_scalar = is_single_component_mask(ains->mask); + if (alu) + mir_adjust_constants(instructions[best_index], predicate, true); - /* Only 16/32-bit can run on a scalar unit */ - could_scalar &= ains->alu.reg_mode != midgard_reg_mode_8; - could_scalar &= ains->alu.reg_mode != midgard_reg_mode_64; - could_scalar &= ains->alu.dest_override == midgard_dest_override_none; + /* Once we schedule a conditional, we can't again */ + predicate->no_cond |= best_conditional; + } - if (ains->alu.reg_mode == midgard_reg_mode_16) { - /* If we're running in 16-bit mode, we - * can't have any 8-bit sources on the - * scalar unit (since the scalar unit - * doesn't understand 8-bit) */ + return instructions[best_index]; +} - midgard_vector_alu_src s1 = - vector_alu_from_unsigned(ains->alu.src1); +/* Still, we don't choose instructions in a vacuum. We need a way to choose the + * best bundle type (ALU, load/store, texture). Nondestructive. */ - could_scalar &= !s1.half; +static unsigned +mir_choose_bundle( + midgard_instruction **instructions, + BITSET_WORD *worklist, unsigned count) +{ + /* At the moment, our algorithm is very simple - use the bundle of the + * best instruction, regardless of what else could be scheduled + * alongside it. This is not optimal but it works okay for in-order */ + + struct midgard_predicate predicate = { + .tag = ~0, + .destructive = false, + .exclude = ~0 + }; - if (!ains->ssa_args.inline_constant) { - midgard_vector_alu_src s2 = - vector_alu_from_unsigned(ains->alu.src2); + midgard_instruction *chosen = mir_choose_instruction(instructions, worklist, count, &predicate); - could_scalar &= !s2.half; - } + if (chosen) + return chosen->type; + else + return ~0; +} - } +/* We want to choose an ALU instruction filling a given unit */ +static void +mir_choose_alu(midgard_instruction **slot, + midgard_instruction **instructions, + BITSET_WORD *worklist, unsigned len, + struct midgard_predicate *predicate, + unsigned unit) +{ + /* Did we already schedule to this slot? */ + if ((*slot) != NULL) + return; - bool scalar = could_scalar && scalarable; - - /* TODO: Check ahead-of-time for other scalar - * hazards that otherwise get aborted out */ - - if (scalar) - assert(units & UNITS_SCALAR); - - if (!scalar) { - if (last_unit >= UNIT_VADD) { - if (units & UNIT_VLUT) - unit = UNIT_VLUT; - else - break; - } else { - if ((units & UNIT_VMUL) && last_unit < UNIT_VMUL) - unit = UNIT_VMUL; - else if ((units & UNIT_VADD) && !(control & UNIT_VADD)) - unit = UNIT_VADD; - else if (units & UNIT_VLUT) - unit = UNIT_VLUT; - else - break; - } - } else { - if (last_unit >= UNIT_VADD) { - if ((units & UNIT_SMUL) && !(control & UNIT_SMUL)) - unit = UNIT_SMUL; - else if (units & UNIT_VLUT) - unit = UNIT_VLUT; - else - break; - } else { - if ((units & UNIT_SADD) && !(control & UNIT_SADD) && !midgard_has_hazard(segment, segment_size, ains)) - unit = UNIT_SADD; - else if (units & UNIT_SMUL) - unit = ((units & UNIT_VMUL) && !(control & UNIT_VMUL)) ? 
UNIT_VMUL : UNIT_SMUL; - else if ((units & UNIT_VADD) && !(control & UNIT_VADD)) - unit = UNIT_VADD; - else - break; - } - } + /* Try to schedule something, if not */ + predicate->unit = unit; + *slot = mir_choose_instruction(instructions, worklist, len, predicate); - assert(unit & units); - } + /* Store unit upon scheduling */ + if (*slot && !((*slot)->compact_branch)) + (*slot)->unit = unit; +} - /* Late unit check, this time for encoding (not parallelism) */ - if (unit <= last_unit) break; +/* When we are scheduling a branch/csel, we need the consumed condition in the + * same block as a pipeline register. There are two options to enable this: + * + * - Move the conditional into the bundle. Preferred, but only works if the + * conditional is used only once and is from this block. + * - Copy the conditional. + * + * We search for the conditional. If it's in this block, single-use, and + * without embedded constants, we schedule it immediately. Otherwise, we + * schedule a move for it. + * + * mir_comparison_mobile is a helper to find the moveable condition. + */ - /* Clear the segment */ - if (last_unit < UNIT_VADD && unit >= UNIT_VADD) - segment_size = 0; +static unsigned +mir_comparison_mobile( + compiler_context *ctx, + midgard_instruction **instructions, + unsigned count, + unsigned cond) +{ + if (!mir_single_use(ctx, cond)) + return ~0; - if (midgard_has_hazard(segment, segment_size, ains)) - break; + unsigned ret = ~0; - /* We're good to go -- emit the instruction */ - ains->unit = unit; + for (unsigned i = 0; i < count; ++i) { + if (instructions[i]->dest != cond) + continue; - segment[segment_size++] = ains; + /* Must fit in an ALU bundle */ + if (instructions[i]->type != TAG_ALU_4) + return ~0; - /* We try to reuse constants if possible, by adjusting - * the swizzle */ + /* We'll need to rewrite to .w but that doesn't work for vector + * ops that don't replicate (ball/bany), so bail there */ - if (ains->has_blend_constant) { - /* Everything conflicts with the blend constant */ - if (bundle.has_embedded_constants) - break; + if (GET_CHANNEL_COUNT(alu_opcode_props[instructions[i]->alu.op].props)) + return ~0; - bundle.has_blend_constant = 1; - bundle.has_embedded_constants = 1; - } else if (ains->has_constants && ains->alu.reg_mode == midgard_reg_mode_16) { - /* TODO: DRY with the analysis pass */ + /* TODO: moving conditionals with constants */ - if (bundle.has_blend_constant) - break; + if (instructions[i]->has_constants) + return ~0; - if (constant_count) - break; + /* Ensure it is written only once */ - /* TODO: Fix packing XXX */ - uint16_t *bundles = (uint16_t *) bundle.constants; - uint32_t *constants = (uint32_t *) ains->constants; + if (ret != ~0) + return ~0; + else + ret = i; + } - /* Copy them wholesale */ - for (unsigned i = 0; i < 4; ++i) - bundles[i] = constants[i]; + return ret; +} + +/* Using the information about the moveable conditional itself, we either pop + * that condition off the worklist for use now, or create a move to + * artificially schedule instead as a fallback */ + +static midgard_instruction * +mir_schedule_comparison( + compiler_context *ctx, + midgard_instruction **instructions, + BITSET_WORD *worklist, unsigned count, + unsigned cond, bool vector, unsigned swizzle, + midgard_instruction *user) +{ + /* TODO: swizzle when scheduling */ + unsigned comp_i = + (!vector && (swizzle == 0)) ? 
+ mir_comparison_mobile(ctx, instructions, count, cond) : ~0; + + /* If we can, schedule the condition immediately */ + if ((comp_i != ~0) && BITSET_TEST(worklist, comp_i)) { + assert(comp_i < count); + BITSET_CLEAR(worklist, comp_i); + return instructions[comp_i]; + } - bundle.has_embedded_constants = true; - constant_count = 4; - } else if (ains->has_constants) { - /* By definition, blend constants conflict with - * everything, so if there are already - * constants we break the bundle *now* */ + /* Otherwise, we insert a move */ + midgard_vector_alu_src csel = { + .swizzle = swizzle + }; - if (bundle.has_blend_constant) - break; + midgard_instruction mov = v_mov(cond, csel, cond); + mov.mask = vector ? 0xF : 0x1; - /* For anything but blend constants, we can do - * proper analysis, however */ + return mir_insert_instruction_before(ctx, user, mov); +} - /* TODO: Mask by which are used */ - uint32_t *constants = (uint32_t *) ains->constants; - uint32_t *bundles = (uint32_t *) bundle.constants; +/* Most generally, we need instructions writing to r31 in the appropriate + * components */ - uint32_t indices[4] = { 0 }; - bool break_bundle = false; +static midgard_instruction * +mir_schedule_condition(compiler_context *ctx, + struct midgard_predicate *predicate, + BITSET_WORD *worklist, unsigned count, + midgard_instruction **instructions, + midgard_instruction *last) +{ + /* For a branch, the condition is the only argument; for csel, third */ + bool branch = last->compact_branch; + unsigned condition_index = branch ? 0 : 2; - for (unsigned i = 0; i < 4; ++i) { - uint32_t cons = constants[i]; - bool constant_found = false; + /* csel_v is vector; otherwise, conditions are scalar */ + bool vector = !branch && OP_IS_CSEL_V(last->alu.op); - /* Search for the constant */ - for (unsigned j = 0; j < constant_count; ++j) { - if (bundles[j] != cons) - continue; + /* Grab the conditional instruction */ - /* We found it, reuse */ - indices[i] = j; - constant_found = true; - break; - } + midgard_instruction *cond = mir_schedule_comparison( + ctx, instructions, worklist, count, last->src[condition_index], + vector, last->cond_swizzle, last); - if (constant_found) - continue; + /* We have exclusive reign over this (possibly move) conditional + * instruction. We can rewrite into a pipeline conditional register */ - /* We didn't find it, so allocate it */ - unsigned idx = constant_count++; + predicate->exclude = cond->dest; + cond->dest = SSA_FIXED_REGISTER(31); - if (idx >= 4) { - /* Uh-oh, out of space */ - break_bundle = true; - break; - } + if (!vector) { + cond->mask = (1 << COMPONENT_W); - /* We have space, copy it in! */ - bundles[idx] = cons; - indices[i] = idx; - } + mir_foreach_src(cond, s) { + if (cond->src[s] == ~0) + continue; - if (break_bundle) - break; + mir_set_swizzle(cond, s, (mir_get_swizzle(cond, s) << (2*3)) & 0xFF); + } + } - /* Cool, we have it in. So use indices as a - * swizzle */ + /* Schedule the unit: csel is always in the latter pipeline, so a csel + * condition must be in the former pipeline stage (vmul/sadd), + * depending on scalar/vector of the instruction itself. A branch must + * be written from the latter pipeline stage and a branch condition is + * always scalar, so it is always in smul (exception: ball/bany, which + * will be vadd) */ - unsigned swizzle = SWIZZLE_FROM_ARRAY(indices); - unsigned r_constant = SSA_FIXED_REGISTER(REGISTER_CONSTANT); + if (branch) + cond->unit = UNIT_SMUL; + else + cond->unit = vector ? 
UNIT_VMUL : UNIT_SADD; - if (ains->ssa_args.src0 == r_constant) - ains->alu.src1 = vector_alu_apply_swizzle(ains->alu.src1, swizzle); + return cond; +} - if (ains->ssa_args.src1 == r_constant) - ains->alu.src2 = vector_alu_apply_swizzle(ains->alu.src2, swizzle); +/* Schedules a single bundle of the given type */ - bundle.has_embedded_constants = true; - } +static midgard_bundle +mir_schedule_texture( + midgard_instruction **instructions, + BITSET_WORD *worklist, unsigned len) +{ + struct midgard_predicate predicate = { + .tag = TAG_TEXTURE_4, + .destructive = true, + .exclude = ~0 + }; - if (ains->unit & UNITS_ANY_VECTOR) { - bytes_emitted += sizeof(midgard_reg_info); - bytes_emitted += sizeof(midgard_vector_alu); - } else if (ains->compact_branch) { - /* All of r0 has to be written out along with - * the branch writeout */ - - if (ains->writeout) { - /* The rules for when "bare" writeout - * is safe are when all components are - * r0 are written out in the final - * bundle, earlier than VLUT, where any - * register dependencies of r0 are from - * an earlier bundle. We can't verify - * this before RA, so we don't try. */ - - if (index != 0) - break; - - /* Inject a move */ - midgard_instruction ins = v_mov(0, blank_alu_src, SSA_FIXED_REGISTER(0)); - ins.unit = UNIT_VMUL; - control |= ins.unit; - - /* TODO don't leak */ - midgard_instruction *move = - mem_dup(&ins, sizeof(midgard_instruction)); - bytes_emitted += sizeof(midgard_reg_info); - bytes_emitted += sizeof(midgard_vector_alu); - bundle.instructions[packed_idx++] = move; - } + midgard_instruction *ins = + mir_choose_instruction(instructions, worklist, len, &predicate); - if (ains->unit == ALU_ENAB_BRANCH) { - bytes_emitted += sizeof(midgard_branch_extended); - } else { - bytes_emitted += sizeof(ains->br_compact); - } - } else { - bytes_emitted += sizeof(midgard_reg_info); - bytes_emitted += sizeof(midgard_scalar_alu); - } + mir_update_worklist(worklist, len, instructions, ins); - /* Defer marking until after writing to allow for break */ - control |= ains->unit; - last_unit = ains->unit; - ++instructions_emitted; - ++index; - } + struct midgard_bundle out = { + .tag = TAG_TEXTURE_4, + .instruction_count = 1, + .instructions = { ins } + }; - int padding = 0; + return out; +} - /* Pad ALU op to nearest word */ +static midgard_bundle +mir_schedule_ldst( + midgard_instruction **instructions, + BITSET_WORD *worklist, unsigned len) +{ + struct midgard_predicate predicate = { + .tag = TAG_LOAD_STORE_4, + .destructive = true, + .exclude = ~0 + }; - if (bytes_emitted & 15) { - padding = 16 - (bytes_emitted & 15); - bytes_emitted += padding; - } + /* Try to pick two load/store ops. Second not gauranteed to exist */ + + midgard_instruction *ins = + mir_choose_instruction(instructions, worklist, len, &predicate); + + midgard_instruction *pair = + mir_choose_instruction(instructions, worklist, len, &predicate); + + struct midgard_bundle out = { + .tag = TAG_LOAD_STORE_4, + .instruction_count = pair ? 
2 : 1, + .instructions = { ins, pair } + }; + + /* We have to update the worklist atomically, since the two + * instructions run concurrently (TODO: verify it's not pipelined) */ - /* Constants must always be quadwords */ - if (bundle.has_embedded_constants) - bytes_emitted += 16; + mir_update_worklist(worklist, len, instructions, ins); + mir_update_worklist(worklist, len, instructions, pair); - /* Size ALU instruction for tag */ - bundle.tag = (TAG_ALU_4) + (bytes_emitted / 16) - 1; - bundle.padding = padding; - bundle.control = bundle.tag | control; + return out; +} + +static midgard_bundle +mir_schedule_alu( + compiler_context *ctx, + midgard_instruction **instructions, + BITSET_WORD *worklist, unsigned len) +{ + struct midgard_bundle bundle = {}; + + unsigned bytes_emitted = sizeof(bundle.control); - break; + struct midgard_predicate predicate = { + .tag = TAG_ALU_4, + .destructive = true, + .exclude = ~0, + .constants = (uint8_t *) bundle.constants + }; + + midgard_instruction *vmul = NULL; + midgard_instruction *vadd = NULL; + midgard_instruction *vlut = NULL; + midgard_instruction *smul = NULL; + midgard_instruction *sadd = NULL; + midgard_instruction *branch = NULL; + + mir_choose_alu(&branch, instructions, worklist, len, &predicate, ALU_ENAB_BR_COMPACT); + mir_update_worklist(worklist, len, instructions, branch); + bool writeout = branch && branch->writeout; + + if (branch && !branch->prepacked_branch && branch->branch.conditional) { + midgard_instruction *cond = mir_schedule_condition(ctx, &predicate, worklist, len, instructions, branch); + + if (cond->unit == UNIT_VADD) + vadd = cond; + else if (cond->unit == UNIT_SMUL) + smul = cond; + else + unreachable("Bad condition"); } - case TAG_LOAD_STORE_4: { - /* Load store instructions have two words at once. If - * we only have one queued up, we need to NOP pad. - * Otherwise, we store both in succession to save space - * and cycles -- letting them go in parallel -- skip - * the next. The usefulness of this optimisation is - * greatly dependent on the quality of the instruction - * scheduler. - */ + mir_choose_alu(&smul, instructions, worklist, len, &predicate, UNIT_SMUL); - midgard_instruction *next_op = mir_next_op(ins); + if (!writeout) + mir_choose_alu(&vlut, instructions, worklist, len, &predicate, UNIT_VLUT); - if ((struct list_head *) next_op != &block->instructions && next_op->type == TAG_LOAD_STORE_4) { - /* TODO: Concurrency check */ - instructions_emitted++; - } + mir_choose_alu(&vadd, instructions, worklist, len, &predicate, UNIT_VADD); + + mir_update_worklist(worklist, len, instructions, vlut); + mir_update_worklist(worklist, len, instructions, vadd); + mir_update_worklist(worklist, len, instructions, smul); + + bool vadd_csel = vadd && OP_IS_CSEL(vadd->alu.op); + bool smul_csel = smul && OP_IS_CSEL(smul->alu.op); + + if (vadd_csel || smul_csel) { + midgard_instruction *ins = vadd_csel ? vadd : smul; + midgard_instruction *cond = mir_schedule_condition(ctx, &predicate, worklist, len, instructions, ins); - break; + if (cond->unit == UNIT_VMUL) + vmul = cond; + else if (cond->unit == UNIT_SADD) + sadd = cond; + else + unreachable("Bad condition"); } - case TAG_TEXTURE_4: { - /* Which tag we use depends on the shader stage */ - bool in_frag = ctx->stage == MESA_SHADER_FRAGMENT; - bundle.tag = in_frag ? 
TAG_TEXTURE_4 : TAG_TEXTURE_4_VTX; - break; + /* Stage 2, let's schedule sadd before vmul for writeout */ + mir_choose_alu(&sadd, instructions, worklist, len, &predicate, UNIT_SADD); + + /* Check if writeout reads its own register */ + bool bad_writeout = false; + + if (branch && branch->writeout) { + midgard_instruction *stages[] = { sadd, vadd, smul }; + unsigned src = (branch->src[0] == ~0) ? SSA_FIXED_REGISTER(0) : branch->src[0]; + unsigned writeout_mask = 0x0; + + for (unsigned i = 0; i < ARRAY_SIZE(stages); ++i) { + if (!stages[i]) + continue; + + if (stages[i]->dest != src) + continue; + + writeout_mask |= stages[i]->mask; + bad_writeout |= mir_has_arg(stages[i], branch->src[0]); + } + + /* Add a move if necessary */ + if (bad_writeout || writeout_mask != 0xF) { + unsigned temp = (branch->src[0] == ~0) ? SSA_FIXED_REGISTER(0) : make_compiler_temp(ctx); + midgard_instruction mov = v_mov(src, blank_alu_src, temp); + vmul = mem_dup(&mov, sizeof(midgard_instruction)); + vmul->unit = UNIT_VMUL; + vmul->mask = 0xF ^ writeout_mask; + /* TODO: Don't leak */ + + /* Rewrite to use our temp */ + + for (unsigned i = 0; i < ARRAY_SIZE(stages); ++i) { + if (stages[i]) + mir_rewrite_index_dst_single(stages[i], src, temp); + } + + mir_rewrite_index_src_single(branch, src, temp); + } } - default: - unreachable("Unknown tag"); - break; + mir_choose_alu(&vmul, instructions, worklist, len, &predicate, UNIT_VMUL); + + mir_update_worklist(worklist, len, instructions, vmul); + mir_update_worklist(worklist, len, instructions, sadd); + + bundle.has_blend_constant = predicate.blend_constant; + bundle.has_embedded_constants = predicate.constant_count > 0; + + unsigned padding = 0; + + /* Now that we have finished scheduling, build up the bundle */ + midgard_instruction *stages[] = { vmul, sadd, vadd, smul, vlut, branch }; + + for (unsigned i = 0; i < ARRAY_SIZE(stages); ++i) { + if (stages[i]) { + bundle.control |= stages[i]->unit; + bytes_emitted += bytes_for_instruction(stages[i]); + bundle.instructions[bundle.instruction_count++] = stages[i]; + } } - /* Copy the instructions into the bundle */ - bundle.instruction_count = instructions_emitted + 1 + packed_idx; + /* Pad ALU op to nearest word */ - midgard_instruction *uins = ins; - for (; packed_idx < bundle.instruction_count; ++packed_idx) { - bundle.instructions[packed_idx] = uins; - uins = mir_next_op(uins); + if (bytes_emitted & 15) { + padding = 16 - (bytes_emitted & 15); + bytes_emitted += padding; } - *skip = instructions_emitted; + /* Constants must always be quadwords */ + if (bundle.has_embedded_constants) + bytes_emitted += 16; + + /* Size ALU instruction for tag */ + bundle.tag = (TAG_ALU_4) + (bytes_emitted / 16) - 1; + bundle.padding = padding; + bundle.control |= bundle.tag; return bundle; } @@ -498,81 +880,79 @@ schedule_bundle(compiler_context *ctx, midgard_block *block, midgard_instruction /* Schedule a single block by iterating its instruction to create bundles. * While we go, tally about the bundle sizes to compute the block size. 
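+ *
+ * The overall shape of the pass below is, roughly:
+ *
+ *     worklist = instructions with no remaining dependencies
+ *     loop:
+ *         tag    = mir_choose_bundle(...)
+ *         bundle = mir_schedule_texture/ldst/alu(...)
+ *         append bundle; block->quadword_count += quadword_size(tag)
+ *
+ * Since we schedule bottom-up, bundles are produced in reverse program
+ * order and flipped into block->bundles at the end.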
*/ + static void schedule_block(compiler_context *ctx, midgard_block *block) { - util_dynarray_init(&block->bundles, NULL); + /* Copy list to dynamic array */ + unsigned len = 0; + midgard_instruction **instructions = flatten_mir(block, &len); - block->quadword_count = 0; + if (!len) + return; - mir_foreach_instr_in_block(block, ins) { - int skip; - midgard_bundle bundle = schedule_bundle(ctx, block, ins, &skip); - util_dynarray_append(&block->bundles, midgard_bundle, bundle); + /* Calculate dependencies and initial worklist */ + unsigned node_count = ctx->temp_count + 1; + mir_create_dependency_graph(instructions, len, node_count); - if (bundle.has_blend_constant) { - /* TODO: Multiblock? */ - int quadwords_within_block = block->quadword_count + quadword_size(bundle.tag) - 1; - ctx->blend_constant_offset = quadwords_within_block * 0x10; - } + /* Allocate the worklist */ + size_t sz = BITSET_WORDS(len) * sizeof(BITSET_WORD); + BITSET_WORD *worklist = calloc(sz, 1); + mir_initialize_worklist(worklist, instructions, len); - while(skip--) - ins = mir_next_op(ins); + struct util_dynarray bundles; + util_dynarray_init(&bundles, NULL); - block->quadword_count += quadword_size(bundle.tag); - } + block->quadword_count = 0; + unsigned blend_offset = 0; - block->is_scheduled = true; -} + for (;;) { + unsigned tag = mir_choose_bundle(instructions, worklist, len); + midgard_bundle bundle; -/* The following passes reorder MIR instructions to enable better scheduling */ + if (tag == TAG_TEXTURE_4) + bundle = mir_schedule_texture(instructions, worklist, len); + else if (tag == TAG_LOAD_STORE_4) + bundle = mir_schedule_ldst(instructions, worklist, len); + else if (tag == TAG_ALU_4) + bundle = mir_schedule_alu(ctx, instructions, worklist, len); + else + break; -static void -midgard_pair_load_store(compiler_context *ctx, midgard_block *block) -{ - mir_foreach_instr_in_block_safe(block, ins) { - if (ins->type != TAG_LOAD_STORE_4) continue; - - /* We've found a load/store op. Check if next is also load/store. */ - midgard_instruction *next_op = mir_next_op(ins); - if (&next_op->link != &block->instructions) { - if (next_op->type == TAG_LOAD_STORE_4) { - /* If so, we're done since we're a pair */ - ins = mir_next_op(ins); - continue; - } + util_dynarray_append(&bundles, midgard_bundle, bundle); - /* Maximum search distance to pair, to avoid register pressure disasters */ - int search_distance = 8; + if (bundle.has_blend_constant) + blend_offset = block->quadword_count; - /* Otherwise, we have an orphaned load/store -- search for another load */ - mir_foreach_instr_in_block_from(block, c, mir_next_op(ins)) { - /* Terminate search if necessary */ - if (!(search_distance--)) break; + block->quadword_count += quadword_size(bundle.tag); + } - if (c->type != TAG_LOAD_STORE_4) continue; + /* We emitted bundles backwards; copy into the block in reverse-order */ - /* Stores cannot be reordered, since they have - * dependencies. For the same reason, indirect - * loads cannot be reordered as their index is - * loaded in r27.w */ + util_dynarray_init(&block->bundles, NULL); + util_dynarray_foreach_reverse(&bundles, midgard_bundle, bundle) { + util_dynarray_append(&block->bundles, midgard_bundle, *bundle); + } - if (OP_IS_STORE(c->load_store.op)) continue; + /* Blend constant was backwards as well. 
blend_offset if set is + * strictly positive, as an offset of zero would imply constants before + * any instructions which is invalid in Midgard */ - /* It appears the 0x800 bit is set whenever a - * load is direct, unset when it is indirect. - * Skip indirect loads. */ + if (blend_offset) + ctx->blend_constant_offset = ((ctx->quadword_count + block->quadword_count) - blend_offset - 1) * 0x10; - if (!(c->load_store.unknown & 0x800)) continue; + block->is_scheduled = true; + ctx->quadword_count += block->quadword_count; - /* We found one! Move it up to pair and remove it from the old location */ + /* Reorder instructions to match bundled. First remove existing + * instructions and then recreate the list */ - mir_insert_instruction_before(ins, *c); - mir_remove_instruction(c); + mir_foreach_instr_in_block_safe(block, ins) { + list_del(&ins->link); + } - break; - } - } + mir_foreach_instr_in_block_scheduled_rev(block, ins) { + list_add(&ins->link, &block->instructions); } } @@ -582,7 +962,7 @@ midgard_pair_load_store(compiler_context *ctx, midgard_block *block) static unsigned find_or_allocate_temp(compiler_context *ctx, unsigned hash) { - if ((hash < 0) || (hash >= SSA_FIXED_MINIMUM)) + if (hash >= SSA_FIXED_MINIMUM) return hash; unsigned temp = (uintptr_t) _mesa_hash_table_u64_search( @@ -612,14 +992,10 @@ mir_squeeze_index(compiler_context *ctx) ctx->hash_to_temp = _mesa_hash_table_u64_create(NULL); mir_foreach_instr_global(ctx, ins) { - if (ins->compact_branch) continue; - - ins->ssa_args.dest = find_or_allocate_temp(ctx, ins->ssa_args.dest); - ins->ssa_args.src0 = find_or_allocate_temp(ctx, ins->ssa_args.src0); - - if (!ins->ssa_args.inline_constant) - ins->ssa_args.src1 = find_or_allocate_temp(ctx, ins->ssa_args.src1); + ins->dest = find_or_allocate_temp(ctx, ins->dest); + for (unsigned i = 0; i < ARRAY_SIZE(ins->src); ++i) + ins->src[i] = find_or_allocate_temp(ctx, ins->src[i]); } } @@ -636,152 +1012,249 @@ v_load_store_scratch( midgard_instruction ins = { .type = TAG_LOAD_STORE_4, .mask = mask, - .ssa_args = { - .dest = -1, - .src0 = -1, - .src1 = -1 - }, + .dest = ~0, + .src = { ~0, ~0, ~0 }, .load_store = { .op = is_store ? midgard_op_st_int4 : midgard_op_ld_int4, .swizzle = SWIZZLE_XYZW, /* For register spilling - to thread local storage */ - .unknown = 0x1EEA, + .arg_1 = 0xEA, + .arg_2 = 0x1E, /* Splattered across, TODO combine logically */ .varying_parameters = (byte & 0x1FF) << 1, .address = (byte >> 9) - } + }, + + /* If we spill an unspill, RA goes into an infinite loop */ + .no_spill = true }; if (is_store) { /* r0 = r26, r1 = r27 */ assert(srcdest == SSA_FIXED_REGISTER(26) || srcdest == SSA_FIXED_REGISTER(27)); - ins.ssa_args.src0 = (srcdest == SSA_FIXED_REGISTER(27)) ? SSA_FIXED_REGISTER(1) : SSA_FIXED_REGISTER(0); + ins.src[0] = srcdest; } else { - ins.ssa_args.dest = srcdest; + ins.dest = srcdest; } return ins; } -void -schedule_program(compiler_context *ctx) +/* If register allocation fails, find the best spill node and spill it to fix + * whatever the issue was. This spill node could be a work register (spilling + * to thread local storage), but it could also simply be a special register + * that needs to spill to become a work register. 
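+ *
+ * Roughly, the two cases handled below are (a sketch):
+ *
+ *     work register    -> st_int4 out to Thread Local Storage, ld_int4
+ *                         back in before later uses
+ *     special register -> mov into a freshly allocated work register
+ *                         (e.g. a texture write, class REG_CLASS_TEXW)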
*/ + +static void mir_spill_register( + compiler_context *ctx, + struct ra_graph *g, + unsigned *spill_count) { - struct ra_graph *g = NULL; - bool spilled = false; - int iter_count = 1000; /* max iterations */ + unsigned spill_index = ctx->temp_count; - /* Number of 128-bit slots in memory we've spilled into */ - unsigned spill_count = 0; + /* Our first step is to calculate spill cost to figure out the best + * spill node. All nodes are equal in spill cost, but we can't spill + * nodes written to from an unspill */ + + for (unsigned i = 0; i < ctx->temp_count; ++i) { + ra_set_node_spill_cost(g, i, 1.0); + } - midgard_promote_uniforms(ctx, 8); + /* We can't spill any bundles that contain unspills. This could be + * optimized to allow use of r27 to spill twice per bundle, but if + * you're at the point of optimizing spilling, it's too late. */ mir_foreach_block(ctx, block) { - midgard_pair_load_store(ctx, block); - } + mir_foreach_bundle_in_block(block, bun) { + bool no_spill = false; - do { - /* If we spill, find the best spill node and spill it */ + for (unsigned i = 0; i < bun->instruction_count; ++i) + no_spill |= bun->instructions[i]->no_spill; - unsigned spill_index = ctx->temp_count; - if (g && spilled) { - /* All nodes are equal in spill cost, but we can't - * spill nodes written to from an unspill */ + if (!no_spill) + continue; - for (unsigned i = 0; i < ctx->temp_count; ++i) { - ra_set_node_spill_cost(g, i, 1.0); + for (unsigned i = 0; i < bun->instruction_count; ++i) { + unsigned dest = bun->instructions[i]->dest; + if (dest < ctx->temp_count) + ra_set_node_spill_cost(g, dest, -1.0); } + } + } - mir_foreach_instr_global(ctx, ins) { - if (ins->type != TAG_LOAD_STORE_4) continue; - if (ins->load_store.op != midgard_op_ld_int4) continue; - if (ins->load_store.unknown != 0x1EEA) continue; - ra_set_node_spill_cost(g, ins->ssa_args.dest, -1.0); - } + int spill_node = ra_get_best_spill_node(g); + + if (spill_node < 0) { + mir_print_shader(ctx); + assert(0); + } - int spill_node = ra_get_best_spill_node(g); + /* We have a spill node, so check the class. Work registers + * legitimately spill to TLS, but special registers just spill to work + * registers */ - if (spill_node < 0) { - mir_print_shader(ctx); - assert(0); - } + unsigned class = ra_get_node_class(g, spill_node); + bool is_special = (class >> 2) != REG_CLASS_WORK; + bool is_special_w = (class >> 2) == REG_CLASS_TEXW; + + /* Allocate TLS slot (maybe) */ + unsigned spill_slot = !is_special ? (*spill_count)++ : 0; + + /* For TLS, replace all stores to the spilled node. For + * special reads, just keep as-is; the class will be demoted + * implicitly. 
For special writes, spill to a work register */ - /* Allocate TLS slot */ - unsigned spill_slot = spill_count++; + if (!is_special || is_special_w) { + if (is_special_w) + spill_slot = spill_index++; - /* Replace all stores to the spilled node with stores - * to TLS */ + mir_foreach_block(ctx, block) { + mir_foreach_instr_in_block_safe(block, ins) { + if (ins->dest != spill_node) continue; - mir_foreach_instr_global_safe(ctx, ins) { - if (ins->compact_branch) continue; - if (ins->ssa_args.dest != spill_node) continue; - ins->ssa_args.dest = SSA_FIXED_REGISTER(26); + midgard_instruction st; - midgard_instruction st = v_load_store_scratch(ins->ssa_args.dest, spill_slot, true, ins->mask); - mir_insert_instruction_before(mir_next_op(ins), st); + if (is_special_w) { + st = v_mov(spill_node, blank_alu_src, spill_slot); + st.no_spill = true; + } else { + ins->dest = SSA_FIXED_REGISTER(26); + ins->no_spill = true; + st = v_load_store_scratch(ins->dest, spill_slot, true, ins->mask); } - /* Insert a load from TLS before the first consecutive - * use of the node, rewriting to use spilled indices to - * break up the live range */ + /* Hint: don't rewrite this node */ + st.hint = true; - mir_foreach_block(ctx, block) { + mir_insert_instruction_after_scheduled(ctx, block, ins, st); - bool consecutive_skip = false; - unsigned consecutive_index = 0; + if (!is_special) + ctx->spills++; + } + } + } - mir_foreach_instr_in_block(block, ins) { - if (ins->compact_branch) continue; - - if (!mir_has_arg(ins, spill_node)) { - consecutive_skip = false; - continue; - } + /* For special reads, figure out how many components we need */ + unsigned read_mask = 0; - if (consecutive_skip) { - /* Rewrite */ - mir_rewrite_index_src_single(ins, spill_node, consecutive_index); - continue; - } + mir_foreach_instr_global_safe(ctx, ins) { + read_mask |= mir_mask_of_read_components(ins, spill_node); + } + + /* Insert a load from TLS before the first consecutive + * use of the node, rewriting to use spilled indices to + * break up the live range. Or, for special, insert a + * move. Ironically the latter *increases* register + * pressure, but the two uses of the spilling mechanism + * are somewhat orthogonal. (special spilling is to use + * work registers to back special registers; TLS + * spilling is to use memory to back work registers) */ + + mir_foreach_block(ctx, block) { + bool consecutive_skip = false; + unsigned consecutive_index = 0; + + mir_foreach_instr_in_block(block, ins) { + /* We can't rewrite the moves used to spill in the + * first place. These moves are hinted. 
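+                        * For example, the hinted mov of spill_node into a work register
+                        * emitted above for a special write must keep reading spill_node,
+                        * so it is skipped here rather than rewritten to the new index.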
*/ + if (ins->hint) continue; + + if (!mir_has_arg(ins, spill_node)) { + consecutive_skip = false; + continue; + } + if (consecutive_skip) { + /* Rewrite */ + mir_rewrite_index_src_single(ins, spill_node, consecutive_index); + continue; + } + + if (!is_special_w) { consecutive_index = ++spill_index; - midgard_instruction st = v_load_store_scratch(consecutive_index, spill_slot, false, 0xF); + midgard_instruction *before = ins; /* For a csel, go back one more not to break up the bundle */ if (ins->type == TAG_ALU_4 && OP_IS_CSEL(ins->alu.op)) before = mir_prev_op(before); - mir_insert_instruction_before(before, st); - // consecutive_skip = true; + midgard_instruction st; + if (is_special) { + /* Move */ + st = v_mov(spill_node, blank_alu_src, consecutive_index); + st.no_spill = true; + } else { + /* TLS load */ + st = v_load_store_scratch(consecutive_index, spill_slot, false, 0xF); + } - /* Rewrite to use */ - mir_rewrite_index_src_single(ins, spill_node, consecutive_index); - } + /* Mask the load based on the component count + * actually needed to prvent RA loops */ + + st.mask = read_mask; + + mir_insert_instruction_before_scheduled(ctx, block, before, st); + // consecutive_skip = true; + } else { + /* Special writes already have their move spilled in */ + consecutive_index = spill_slot; } + + + /* Rewrite to use */ + mir_rewrite_index_src_single(ins, spill_node, consecutive_index); + + if (!is_special) + ctx->fills++; } + } - mir_squeeze_index(ctx); + /* Reset hints */ - g = NULL; - g = allocate_registers(ctx, &spilled); - } while(spilled && ((iter_count--) > 0)); + mir_foreach_instr_global(ctx, ins) { + ins->hint = false; + } +} + +void +schedule_program(compiler_context *ctx) +{ + struct ra_graph *g = NULL; + bool spilled = false; + int iter_count = 1000; /* max iterations */ - /* We would like to run RA after scheduling, but spilling can - * complicate this */ + /* Number of 128-bit slots in memory we've spilled into */ + unsigned spill_count = 0; - mir_foreach_block(ctx, block) { - schedule_block(ctx, block); - } -#if 0 + midgard_promote_uniforms(ctx, 16); - /* Pipeline registers creation is a prepass before RA */ - mir_create_pipeline_registers(ctx); -#endif + /* Must be lowered right before RA */ + mir_squeeze_index(ctx); + mir_lower_special_reads(ctx); + mir_squeeze_index(ctx); + /* Lowering can introduce some dead moves */ + mir_foreach_block(ctx, block) { + midgard_opt_dead_move_eliminate(ctx, block); + schedule_block(ctx, block); + } + + mir_create_pipeline_registers(ctx); + + do { + if (spilled) + mir_spill_register(ctx, g, &spill_count); + + mir_squeeze_index(ctx); + + g = NULL; + g = allocate_registers(ctx, &spilled); + } while(spilled && ((iter_count--) > 0)); if (iter_count <= 0) { fprintf(stderr, "panfrost: Gave up allocating registers, rendering will be incomplete\n");