X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fpanfrost%2Fmidgard%2Fmidgard_schedule.c;h=a365cfaf09ea76eb9db2006ab732468dcd48b8e9;hb=4e4c9f5f5ac4373dca5177cfcecc484a476cbf36;hp=86a77149c78f4859855ac81e375151249404bc93;hpb=6284f3ec257159aecb9493a71e9d311af3b28b0f;p=mesa.git diff --git a/src/panfrost/midgard/midgard_schedule.c b/src/panfrost/midgard/midgard_schedule.c index 86a77149c78..a365cfaf09e 100644 --- a/src/panfrost/midgard/midgard_schedule.c +++ b/src/panfrost/midgard/midgard_schedule.c @@ -23,8 +23,9 @@ #include "compiler.h" #include "midgard_ops.h" +#include "midgard_quirks.h" #include "util/u_memory.h" -#include "util/register_allocate.h" +#include "util/u_math.h" /* Scheduling for Midgard is complicated, to say the least. ALU instructions * must be grouped into VLIW bundles according to following model: @@ -55,18 +56,18 @@ * */ -/* We create the dependency graph with per-component granularity */ +/* We create the dependency graph with per-byte granularity */ -#define COMPONENT_COUNT 8 +#define BYTE_COUNT 16 static void -add_dependency(struct util_dynarray *table, unsigned index, unsigned mask, midgard_instruction **instructions, unsigned child) +add_dependency(struct util_dynarray *table, unsigned index, uint16_t mask, midgard_instruction **instructions, unsigned child) { - for (unsigned i = 0; i < COMPONENT_COUNT; ++i) { + for (unsigned i = 0; i < BYTE_COUNT; ++i) { if (!(mask & (1 << i))) continue; - struct util_dynarray *parents = &table[(COMPONENT_COUNT * index) + i]; + struct util_dynarray *parents = &table[(BYTE_COUNT * index) + i]; util_dynarray_foreach(parents, unsigned, parent) { BITSET_WORD *dependents = instructions[*parent]->dependents; @@ -82,20 +83,20 @@ add_dependency(struct util_dynarray *table, unsigned index, unsigned mask, midga } static void -mark_access(struct util_dynarray *table, unsigned index, unsigned mask, unsigned parent) +mark_access(struct util_dynarray *table, unsigned index, uint16_t mask, unsigned parent) { - for (unsigned i = 0; i < COMPONENT_COUNT; ++i) { + for (unsigned i = 0; i < BYTE_COUNT; ++i) { if (!(mask & (1 << i))) continue; - util_dynarray_append(&table[(COMPONENT_COUNT * index) + i], unsigned, parent); + util_dynarray_append(&table[(BYTE_COUNT * index) + i], unsigned, parent); } } static void mir_create_dependency_graph(midgard_instruction **instructions, unsigned count, unsigned node_count) { - size_t sz = node_count * COMPONENT_COUNT; + size_t sz = node_count * BYTE_COUNT; struct util_dynarray *last_read = calloc(sizeof(struct util_dynarray), sz); struct util_dynarray *last_write = calloc(sizeof(struct util_dynarray), sz); @@ -119,13 +120,13 @@ mir_create_dependency_graph(midgard_instruction **instructions, unsigned count, continue; unsigned dest = instructions[i]->dest; - unsigned mask = instructions[i]->mask; + unsigned mask = mir_bytemask(instructions[i]); mir_foreach_src((*instructions), s) { unsigned src = instructions[i]->src[s]; if (src < node_count) { - unsigned readmask = mir_mask_of_read_components(instructions[i], src); + unsigned readmask = mir_bytemask_of_read_components(instructions[i], src); add_dependency(last_write, src, readmask, instructions, i); } } @@ -140,7 +141,7 @@ mir_create_dependency_graph(midgard_instruction **instructions, unsigned count, unsigned src = instructions[i]->src[s]; if (src < node_count) { - unsigned readmask = mir_mask_of_read_components(instructions[i], src); + unsigned readmask = mir_bytemask_of_read_components(instructions[i], src); mark_access(last_read, src, readmask, 
i); } } @@ -166,22 +167,9 @@ mir_create_dependency_graph(midgard_instruction **instructions, unsigned count, util_dynarray_fini(&last_read[i]); util_dynarray_fini(&last_write[i]); } -} - -/* Create a mask of accessed components from a swizzle to figure out vector - * dependencies */ - -static unsigned -swizzle_to_access_mask(unsigned swizzle) -{ - unsigned component_mask = 0; - - for (int i = 0; i < 4; ++i) { - unsigned c = (swizzle >> (2 * i)) & 3; - component_mask |= (1 << c); - } - return component_mask; + free(last_read); + free(last_write); } /* Does the mask cover more than a scalar? */ @@ -199,187 +187,29 @@ is_single_component_mask(unsigned mask) return components == 1; } -/* Checks for an SSA data hazard between two adjacent instructions, keeping in - * mind that we are a vector architecture and we can write to different - * components simultaneously */ - -static bool -can_run_concurrent_ssa(midgard_instruction *first, midgard_instruction *second) -{ - /* Writeout has its own rules anyway */ - if (first->compact_branch || second->compact_branch) - return true; - - /* Each instruction reads some registers and writes to a register. See - * where the first writes */ - - int source = first->dest; - int source_mask = first->mask; - - /* As long as the second doesn't read from the first, we're okay */ - for (unsigned i = 0; i < ARRAY_SIZE(second->src); ++i) { - if (second->src[i] != source) - continue; - - if (first->type != TAG_ALU_4) - return false; - - /* Figure out which components we just read from */ - - int q = (i == 0) ? second->alu.src1 : second->alu.src2; - midgard_vector_alu_src *m = (midgard_vector_alu_src *) &q; - - /* Check if there are components in common, and fail if so */ - if (swizzle_to_access_mask(m->swizzle) & source_mask) - return false; - } - - /* Otherwise, it's safe in that regard. Another data hazard is both - * writing to the same place, of course */ - - if (second->dest == source) { - /* ...but only if the components overlap */ - - if (second->mask & source_mask) - return false; - } - - /* ...That's it */ - return true; -} - -static bool -midgard_has_hazard( - midgard_instruction **segment, unsigned segment_size, - midgard_instruction *ains) -{ - for (int s = 0; s < segment_size; ++s) - if (!can_run_concurrent_ssa(segment[s], ains)) - return true; - - return false; - - -} - -/* Fragment writeout (of r0) is allowed when: - * - * - All components of r0 are written in the bundle - * - No components of r0 are written in VLUT - * - Non-pipelined dependencies of r0 are not written in the bundle - * - * This function checks if these requirements are satisfied given the content - * of a scheduled bundle. - */ - -static bool -can_writeout_fragment(compiler_context *ctx, midgard_instruction **bundle, unsigned count, unsigned node_count, unsigned r0) -{ - /* First scan for which components of r0 are written out. Initially - * none are written */ - - uint8_t r0_written_mask = 0x0; - - /* Simultaneously we scan for the set of dependencies */ - - size_t sz = sizeof(BITSET_WORD) * BITSET_WORDS(node_count); - BITSET_WORD *dependencies = calloc(1, sz); - memset(dependencies, 0, sz); - - bool success = false; - - for (unsigned i = 0; i < count; ++i) { - midgard_instruction *ins = bundle[i]; - - if (ins->dest != r0) - continue; - - /* Record written out mask */ - r0_written_mask |= ins->mask; - - /* Record dependencies, but only if they won't become pipeline - * registers. We know we can't be live after this, because - * we're writeout at the very end of the shader. 
So check if - * they were written before us. */ - - unsigned src0 = ins->src[0]; - unsigned src1 = ins->src[1]; - - if (!mir_is_written_before(ctx, bundle[0], src0)) - src0 = ~0; - - if (!mir_is_written_before(ctx, bundle[0], src1)) - src1 = ~0; - - if (src0 < node_count) - BITSET_SET(dependencies, src0); - - if (src1 < node_count) - BITSET_SET(dependencies, src1); - - /* Requirement 2 */ - if (ins->unit == UNIT_VLUT) - goto done; - } - - /* Requirement 1 */ - if ((r0_written_mask & 0xF) != 0xF) - goto done; - - /* Requirement 3 */ - - for (unsigned i = 0; i < count; ++i) { - unsigned dest = bundle[i]->dest; - - if (dest < node_count && BITSET_TEST(dependencies, dest)) - goto done; - } - - /* Otherwise, we're good to go */ - success = true; - -done: - free(dependencies); - return success; -} - /* Helpers for scheudling */ static bool mir_is_scalar(midgard_instruction *ains) { - /* Does the op support scalar units? */ - if (!(alu_opcode_props[ains->alu.op].props & UNITS_SCALAR)) - return false; - /* Do we try to use it as a vector op? */ if (!is_single_component_mask(ains->mask)) return false; /* Otherwise, check mode hazards */ bool could_scalar = true; + unsigned sz0 = nir_alu_type_get_type_size(ains->src_types[0]); + unsigned sz1 = nir_alu_type_get_type_size(ains->src_types[1]); /* Only 16/32-bit can run on a scalar unit */ could_scalar &= ains->alu.reg_mode != midgard_reg_mode_8; could_scalar &= ains->alu.reg_mode != midgard_reg_mode_64; - could_scalar &= ains->alu.dest_override == midgard_dest_override_none; - - if (ains->alu.reg_mode == midgard_reg_mode_16) { - /* If we're running in 16-bit mode, we - * can't have any 8-bit sources on the - * scalar unit (since the scalar unit - * doesn't understand 8-bit) */ - - midgard_vector_alu_src s1 = - vector_alu_from_unsigned(ains->alu.src1); - could_scalar &= !s1.half; + if (ains->src[0] != ~0) + could_scalar &= (sz0 == 16) || (sz0 == 32); - midgard_vector_alu_src s2 = - vector_alu_from_unsigned(ains->alu.src2); - - could_scalar &= !s2.half; - } + if (ains->src[1] != ~0) + could_scalar &= (sz1 == 16) || (sz1 == 32); return could_scalar; } @@ -399,347 +229,13 @@ bytes_for_instruction(midgard_instruction *ains) return sizeof(midgard_reg_info) + sizeof(midgard_scalar_alu); } -/* Schedules, but does not emit, a single basic block. 
After scheduling, the - * final tag and size of the block are known, which are necessary for branching - * */ - -static midgard_bundle -schedule_bundle(compiler_context *ctx, midgard_block *block, midgard_instruction *ins, int *skip) -{ - int instructions_emitted = 0, packed_idx = 0; - midgard_bundle bundle = { 0 }; - - midgard_instruction *scheduled[5] = { NULL }; - - uint8_t tag = ins->type; - - /* Default to the instruction's tag */ - bundle.tag = tag; - - switch (ins->type) { - case TAG_ALU_4: { - uint32_t control = 0; - size_t bytes_emitted = sizeof(control); - - /* TODO: Constant combining */ - int index = 0, last_unit = 0; - - /* Previous instructions, for the purpose of parallelism */ - midgard_instruction *segment[4] = {0}; - int segment_size = 0; - - instructions_emitted = -1; - midgard_instruction *pins = ins; - - unsigned constant_count = 0; - - for (;;) { - midgard_instruction *ains = pins; - - /* Advance instruction pointer */ - if (index) { - ains = mir_next_op(pins); - pins = ains; - } - - /* Out-of-work condition */ - if ((struct list_head *) ains == &block->instructions) - break; - - /* Ensure that the chain can continue */ - if (ains->type != TAG_ALU_4) break; - - /* If there's already something in the bundle and we - * have weird scheduler constraints, break now */ - if (ains->precede_break && index) break; - - /* According to the presentation "The ARM - * Mali-T880 Mobile GPU" from HotChips 27, - * there are two pipeline stages. Branching - * position determined experimentally. Lines - * are executed in parallel: - * - * [ VMUL ] [ SADD ] - * [ VADD ] [ SMUL ] [ LUT ] [ BRANCH ] - * - * Verify that there are no ordering dependencies here. - * - * TODO: Allow for parallelism!!! - */ - - /* Pick a unit for it if it doesn't force a particular unit */ - - int unit = ains->unit; - - if (!unit) { - int op = ains->alu.op; - int units = alu_opcode_props[op].props; - bool scalar = mir_is_scalar(ains); - - if (!scalar) { - if (last_unit >= UNIT_VADD) { - if (units & UNIT_VLUT) - unit = UNIT_VLUT; - else - break; - } else { - if ((units & UNIT_VMUL) && last_unit < UNIT_VMUL) - unit = UNIT_VMUL; - else if ((units & UNIT_VADD) && !(control & UNIT_VADD)) - unit = UNIT_VADD; - else if (units & UNIT_VLUT) - unit = UNIT_VLUT; - else - break; - } - } else { - if (last_unit >= UNIT_VADD) { - if ((units & UNIT_SMUL) && !(control & UNIT_SMUL)) - unit = UNIT_SMUL; - else if (units & UNIT_VLUT) - unit = UNIT_VLUT; - else - break; - } else { - if ((units & UNIT_VMUL) && (last_unit < UNIT_VMUL)) - unit = UNIT_VMUL; - else if ((units & UNIT_SADD) && !(control & UNIT_SADD) && !midgard_has_hazard(segment, segment_size, ains)) - unit = UNIT_SADD; - else if (units & UNIT_VADD) - unit = UNIT_VADD; - else if (units & UNIT_SMUL) - unit = UNIT_SMUL; - else if (units & UNIT_VLUT) - unit = UNIT_VLUT; - else - break; - } - } - - assert(unit & units); - } - - /* Late unit check, this time for encoding (not parallelism) */ - if (unit <= last_unit) break; - - /* Clear the segment */ - if (last_unit < UNIT_VADD && unit >= UNIT_VADD) - segment_size = 0; - - if (midgard_has_hazard(segment, segment_size, ains)) - break; - - /* We're good to go -- emit the instruction */ - ains->unit = unit; - - segment[segment_size++] = ains; - - /* We try to reuse constants if possible, by adjusting - * the swizzle */ - - if (ains->has_blend_constant) { - /* Everything conflicts with the blend constant */ - if (bundle.has_embedded_constants) - break; - - bundle.has_blend_constant = 1; - bundle.has_embedded_constants = 1; - } 
else if (ains->has_constants && ains->alu.reg_mode == midgard_reg_mode_16) { - /* TODO: DRY with the analysis pass */ - - if (bundle.has_blend_constant) - break; - - if (constant_count) - break; - - /* TODO: Fix packing XXX */ - uint16_t *bundles = (uint16_t *) bundle.constants; - uint32_t *constants = (uint32_t *) ains->constants; - - /* Copy them wholesale */ - for (unsigned i = 0; i < 4; ++i) - bundles[i] = constants[i]; - - bundle.has_embedded_constants = true; - constant_count = 4; - } else if (ains->has_constants) { - /* By definition, blend constants conflict with - * everything, so if there are already - * constants we break the bundle *now* */ - - if (bundle.has_blend_constant) - break; - - /* For anything but blend constants, we can do - * proper analysis, however */ - - /* TODO: Mask by which are used */ - uint32_t *constants = (uint32_t *) ains->constants; - uint32_t *bundles = (uint32_t *) bundle.constants; - - uint32_t indices[4] = { 0 }; - bool break_bundle = false; - - for (unsigned i = 0; i < 4; ++i) { - uint32_t cons = constants[i]; - bool constant_found = false; - - /* Search for the constant */ - for (unsigned j = 0; j < constant_count; ++j) { - if (bundles[j] != cons) - continue; - - /* We found it, reuse */ - indices[i] = j; - constant_found = true; - break; - } - - if (constant_found) - continue; - - /* We didn't find it, so allocate it */ - unsigned idx = constant_count++; - - if (idx >= 4) { - /* Uh-oh, out of space */ - break_bundle = true; - break; - } - - /* We have space, copy it in! */ - bundles[idx] = cons; - indices[i] = idx; - } - - if (break_bundle) - break; - - /* Cool, we have it in. So use indices as a - * swizzle */ - - unsigned swizzle = SWIZZLE_FROM_ARRAY(indices); - unsigned r_constant = SSA_FIXED_REGISTER(REGISTER_CONSTANT); - - if (ains->src[0] == r_constant) - ains->alu.src1 = vector_alu_apply_swizzle(ains->alu.src1, swizzle); - - if (ains->src[1] == r_constant) - ains->alu.src2 = vector_alu_apply_swizzle(ains->alu.src2, swizzle); - - bundle.has_embedded_constants = true; - } - - if (ains->compact_branch) { - /* All of r0 has to be written out along with - * the branch writeout */ - - if (ains->writeout && !can_writeout_fragment(ctx, scheduled, index, ctx->temp_count, ains->src[0])) { - /* We only work on full moves - * at the beginning. We could - * probably do better */ - if (index != 0) - break; - - /* Inject a move */ - midgard_instruction ins = v_mov(0, blank_alu_src, SSA_FIXED_REGISTER(0)); - ins.unit = UNIT_VMUL; - control |= ins.unit; - - /* TODO don't leak */ - midgard_instruction *move = - mem_dup(&ins, sizeof(midgard_instruction)); - bytes_emitted += bytes_for_instruction(move); - bundle.instructions[packed_idx++] = move; - } - } - - bytes_emitted += bytes_for_instruction(ains); - - /* Defer marking until after writing to allow for break */ - scheduled[index] = ains; - control |= ains->unit; - last_unit = ains->unit; - ++instructions_emitted; - ++index; - } - - int padding = 0; - - /* Pad ALU op to nearest word */ - - if (bytes_emitted & 15) { - padding = 16 - (bytes_emitted & 15); - bytes_emitted += padding; - } - - /* Constants must always be quadwords */ - if (bundle.has_embedded_constants) - bytes_emitted += 16; - - /* Size ALU instruction for tag */ - bundle.tag = (TAG_ALU_4) + (bytes_emitted / 16) - 1; - bundle.padding = padding; - bundle.control = bundle.tag | control; - - break; - } - - case TAG_LOAD_STORE_4: { - /* Load store instructions have two words at once. If - * we only have one queued up, we need to NOP pad. 
- * Otherwise, we store both in succession to save space - * and cycles -- letting them go in parallel -- skip - * the next. The usefulness of this optimisation is - * greatly dependent on the quality of the instruction - * scheduler. - */ - - midgard_instruction *next_op = mir_next_op(ins); - - if ((struct list_head *) next_op != &block->instructions && next_op->type == TAG_LOAD_STORE_4) { - /* TODO: Concurrency check */ - instructions_emitted++; - } - - break; - } - - case TAG_TEXTURE_4: { - /* Which tag we use depends on the shader stage */ - bool in_frag = ctx->stage == MESA_SHADER_FRAGMENT; - bundle.tag = in_frag ? TAG_TEXTURE_4 : TAG_TEXTURE_4_VTX; - break; - } - - default: - unreachable("Unknown tag"); - break; - } - - /* Copy the instructions into the bundle */ - bundle.instruction_count = instructions_emitted + 1 + packed_idx; - - midgard_instruction *uins = ins; - for (; packed_idx < bundle.instruction_count; ++packed_idx) { - assert(&uins->link != &block->instructions); - bundle.instructions[packed_idx] = uins; - uins = mir_next_op(uins); - } - - *skip = instructions_emitted; - - return bundle; -} - /* We would like to flatten the linked list of midgard_instructions in a bundle * to an array of pointers on the heap for easy indexing */ static midgard_instruction ** flatten_mir(midgard_block *block, unsigned *len) { - *len = list_length(&block->instructions); + *len = list_length(&block->base.instructions); if (!(*len)) return NULL; @@ -795,8 +291,7 @@ mir_update_worklist( * where possible. */ unsigned i; - BITSET_WORD tmp; - BITSET_FOREACH_SET(i, tmp, done->dependents, count) { + BITSET_FOREACH_SET(i, done->dependents, count) { assert(instructions[i]->nr_dependencies); if (!(--instructions[i]->nr_dependencies)) @@ -818,17 +313,112 @@ struct midgard_predicate { /* True if we want to pop off the chosen instruction */ bool destructive; + /* For ALU, choose only this unit */ + unsigned unit; + /* State for bundle constants. constants is the actual constants * for the bundle. constant_count is the number of bytes (up to * 16) currently in use for constants. When picking in destructive * mode, the constants array will be updated, and the instruction * will be adjusted to index into the constants array */ - uint8_t *constants; - unsigned constant_count; + midgard_constants *constants; + unsigned constant_mask; bool blend_constant; + + /* Exclude this destination (if not ~0) */ + unsigned exclude; + + /* Don't schedule instructions consuming conditionals (since we already + * scheduled one). Excludes conditional branches and csel */ + bool no_cond; + + /* Require a minimal mask and (if nonzero) given destination. Used for + * writeout optimizations */ + + unsigned mask; + unsigned dest; + + /* For load/store: how many pipeline registers are in use? 
The two + * scheduled instructions cannot use more than the 256-bits of pipeline + * space available or RA will fail (as it would run out of pipeline + * registers and fail to spill without breaking the schedule) */ + + unsigned pipeline_count; }; +static bool +mir_adjust_constant(midgard_instruction *ins, unsigned src, + unsigned *bundle_constant_mask, + unsigned *comp_mapping, + uint8_t *bundle_constants, + bool upper) +{ + unsigned type_size = nir_alu_type_get_type_size(ins->src_types[src]) / 8; + unsigned max_comp = 16 / type_size; + unsigned comp_mask = mir_from_bytemask(mir_round_bytemask_up( + mir_bytemask_of_read_components_index(ins, src), + type_size * 8), + type_size * 8); + unsigned type_mask = (1 << type_size) - 1; + + /* Upper only makes sense for 16-bit */ + if (type_size != 16 && upper) + return false; + + /* For 16-bit, we need to stay on either upper or lower halves to avoid + * disrupting the swizzle */ + unsigned start = upper ? 8 : 0; + unsigned length = (type_size == 2) ? 8 : 16; + + for (unsigned comp = 0; comp < max_comp; comp++) { + if (!(comp_mask & (1 << comp))) + continue; + + uint8_t *constantp = ins->constants.u8 + (type_size * comp); + unsigned best_reuse_bytes = 0; + signed best_place = -1; + unsigned i, j; + + for (i = start; i < (start + length); i += type_size) { + unsigned reuse_bytes = 0; + + for (j = 0; j < type_size; j++) { + if (!(*bundle_constant_mask & (1 << (i + j)))) + continue; + if (constantp[j] != bundle_constants[i + j]) + break; + if ((i + j) > (start + length)) + break; + + reuse_bytes++; + } + + /* Select the place where existing bytes can be + * reused so we leave empty slots to others + */ + if (j == type_size && + (reuse_bytes > best_reuse_bytes || best_place < 0)) { + best_reuse_bytes = reuse_bytes; + best_place = i; + break; + } + } + + /* This component couldn't fit in the remaining constant slot, + * no need check the remaining components, bail out now + */ + if (best_place < 0) + return false; + + memcpy(&bundle_constants[i], constantp, type_size); + *bundle_constant_mask |= type_mask << best_place; + comp_mapping[comp] = best_place / type_size; + } + + return true; +} + /* For an instruction that can fit, adjust it to fit and update the constants * array, in destructive mode. Returns whether the fitting was successful. */ @@ -839,11 +429,11 @@ mir_adjust_constants(midgard_instruction *ins, { /* Blend constants dominate */ if (ins->has_blend_constant) { - if (pred->constant_count) + if (pred->constant_mask) return false; else if (destructive) { pred->blend_constant = true; - pred->constant_count = 16; + pred->constant_mask = 0xffff; return true; } } @@ -852,26 +442,69 @@ mir_adjust_constants(midgard_instruction *ins, if (!ins->has_constants) return true; - /* TODO: Deduplicate; permit multiple constants within a bundle */ + unsigned r_constant = SSA_FIXED_REGISTER(REGISTER_CONSTANT); + unsigned bundle_constant_mask = pred->constant_mask; + unsigned comp_mapping[2][16] = { }; + uint8_t bundle_constants[16]; - if (destructive && !pred->constant_count) { - if (ins->alu.reg_mode == midgard_reg_mode_16) { - /* TODO: Fix packing XXX */ - uint16_t *bundles = (uint16_t *) pred->constants; - uint32_t *constants = (uint32_t *) ins->constants; + memcpy(bundle_constants, pred->constants, 16); - /* Copy them wholesale */ - for (unsigned i = 0; i < 4; ++i) - bundles[i] = constants[i]; - } else { - memcpy(pred->constants, ins->constants, 16); - } + /* Let's try to find a place for each active component of the constant + * register. 
+ */ + for (unsigned src = 0; src < 2; ++src) { + if (ins->src[src] != SSA_FIXED_REGISTER(REGISTER_CONSTANT)) + continue; + + /* First, try lower half (or whole for !16) */ + if (mir_adjust_constant(ins, src, &bundle_constant_mask, + comp_mapping[src], bundle_constants, false)) + continue; + + /* Next, try upper half */ + if (mir_adjust_constant(ins, src, &bundle_constant_mask, + comp_mapping[src], bundle_constants, true)) + continue; + + /* Otherwise bail */ + return false; + } - pred->constant_count = 16; + /* If non-destructive, we're done */ + if (!destructive) return true; + + /* Otherwise update the constant_mask and constant values */ + pred->constant_mask = bundle_constant_mask; + memcpy(pred->constants, bundle_constants, 16); + + /* Use comp_mapping as a swizzle */ + mir_foreach_src(ins, s) { + if (ins->src[s] == r_constant) + mir_compose_swizzle(ins->swizzle[s], comp_mapping[s], ins->swizzle[s]); } - return !pred->constant_count; + return true; +} + +/* Conservative estimate of the pipeline registers required for load/store */ + +static unsigned +mir_pipeline_count(midgard_instruction *ins) +{ + unsigned bytecount = 0; + + mir_foreach_src(ins, i) { + /* Skip empty source */ + if (ins->src[i] == ~0) continue; + + unsigned bytemask = mir_bytemask_of_read_components_index(ins, i); + + unsigned max = util_logbase2(bytemask) + 1; + bytecount += max; + } + + return DIV_ROUND_UP(bytecount, 16); } static midgard_instruction * @@ -882,22 +515,77 @@ mir_choose_instruction( { /* Parse the predicate */ unsigned tag = predicate->tag; + bool alu = tag == TAG_ALU_4; + bool ldst = tag == TAG_LOAD_STORE_4; + unsigned unit = predicate->unit; + bool branch = alu && (unit == ALU_ENAB_BR_COMPACT); + bool scalar = (unit != ~0) && (unit & UNITS_SCALAR); + bool no_cond = predicate->no_cond; + + unsigned mask = predicate->mask; + unsigned dest = predicate->dest; + bool needs_dest = mask & 0xF; /* Iterate to find the best instruction satisfying the predicate */ unsigned i; - BITSET_WORD tmp; signed best_index = -1; + bool best_conditional = false; + + /* Enforce a simple metric limiting distance to keep down register + * pressure. 
TOOD: replace with liveness tracking for much better + * results */ + + unsigned max_active = 0; + unsigned max_distance = 6; + + BITSET_FOREACH_SET(i, worklist, count) { + max_active = MAX2(max_active, i); + } + + BITSET_FOREACH_SET(i, worklist, count) { + if ((max_active - i) >= max_distance) + continue; - BITSET_FOREACH_SET(i, tmp, worklist, count) { if (tag != ~0 && instructions[i]->type != tag) continue; + if (predicate->exclude != ~0 && instructions[i]->dest == predicate->exclude) + continue; + + if (alu && !branch && !(alu_opcode_props[instructions[i]->alu.op].props & unit)) + continue; + + if (branch && !instructions[i]->compact_branch) + continue; + + if (alu && scalar && !mir_is_scalar(instructions[i])) + continue; + + if (alu && !mir_adjust_constants(instructions[i], predicate, false)) + continue; + + if (needs_dest && instructions[i]->dest != dest) + continue; + + if (mask && ((~instructions[i]->mask) & mask)) + continue; + + if (ldst && mir_pipeline_count(instructions[i]) + predicate->pipeline_count > 2) + continue; + + bool conditional = alu && !branch && OP_IS_CSEL(instructions[i]->alu.op); + conditional |= (branch && instructions[i]->branch.conditional); + + if (conditional && no_cond) + continue; + /* Simulate in-order scheduling */ if ((signed) i < best_index) continue; best_index = i; + best_conditional = conditional; } @@ -911,6 +599,15 @@ mir_choose_instruction( if (predicate->destructive) { BITSET_CLEAR(worklist, best_index); + + if (alu) + mir_adjust_constants(instructions[best_index], predicate, true); + + if (ldst) + predicate->pipeline_count += mir_pipeline_count(instructions[best_index]); + + /* Once we schedule a conditional, we can't again */ + predicate->no_cond |= best_conditional; } return instructions[best_index]; @@ -930,7 +627,8 @@ mir_choose_bundle( struct midgard_predicate predicate = { .tag = ~0, - .destructive = false + .destructive = false, + .exclude = ~0 }; midgard_instruction *chosen = mir_choose_instruction(instructions, worklist, count, &predicate); @@ -941,6 +639,182 @@ mir_choose_bundle( return ~0; } +/* We want to choose an ALU instruction filling a given unit */ +static void +mir_choose_alu(midgard_instruction **slot, + midgard_instruction **instructions, + BITSET_WORD *worklist, unsigned len, + struct midgard_predicate *predicate, + unsigned unit) +{ + /* Did we already schedule to this slot? */ + if ((*slot) != NULL) + return; + + /* Try to schedule something, if not */ + predicate->unit = unit; + *slot = mir_choose_instruction(instructions, worklist, len, predicate); + + /* Store unit upon scheduling */ + if (*slot && !((*slot)->compact_branch)) + (*slot)->unit = unit; +} + +/* When we are scheduling a branch/csel, we need the consumed condition in the + * same block as a pipeline register. There are two options to enable this: + * + * - Move the conditional into the bundle. Preferred, but only works if the + * conditional is used only once and is from this block. + * - Copy the conditional. + * + * We search for the conditional. If it's in this block, single-use, and + * without embedded constants, we schedule it immediately. Otherwise, we + * schedule a move for it. + * + * mir_comparison_mobile is a helper to find the moveable condition. 
+ */ + +static unsigned +mir_comparison_mobile( + compiler_context *ctx, + midgard_instruction **instructions, + struct midgard_predicate *predicate, + unsigned count, + unsigned cond) +{ + if (!mir_single_use(ctx, cond)) + return ~0; + + unsigned ret = ~0; + + for (unsigned i = 0; i < count; ++i) { + if (instructions[i]->dest != cond) + continue; + + /* Must fit in an ALU bundle */ + if (instructions[i]->type != TAG_ALU_4) + return ~0; + + /* If it would itself require a condition, that's recursive */ + if (OP_IS_CSEL(instructions[i]->alu.op)) + return ~0; + + /* We'll need to rewrite to .w but that doesn't work for vector + * ops that don't replicate (ball/bany), so bail there */ + + if (GET_CHANNEL_COUNT(alu_opcode_props[instructions[i]->alu.op].props)) + return ~0; + + /* Ensure it will fit with constants */ + + if (!mir_adjust_constants(instructions[i], predicate, false)) + return ~0; + + /* Ensure it is written only once */ + + if (ret != ~0) + return ~0; + else + ret = i; + } + + /* Inject constants now that we are sure we want to */ + if (ret != ~0) + mir_adjust_constants(instructions[ret], predicate, true); + + return ret; +} + +/* Using the information about the moveable conditional itself, we either pop + * that condition off the worklist for use now, or create a move to + * artificially schedule instead as a fallback */ + +static midgard_instruction * +mir_schedule_comparison( + compiler_context *ctx, + midgard_instruction **instructions, + struct midgard_predicate *predicate, + BITSET_WORD *worklist, unsigned count, + unsigned cond, bool vector, unsigned *swizzle, + midgard_instruction *user) +{ + /* TODO: swizzle when scheduling */ + unsigned comp_i = + (!vector && (swizzle[0] == 0)) ? + mir_comparison_mobile(ctx, instructions, predicate, count, cond) : ~0; + + /* If we can, schedule the condition immediately */ + if ((comp_i != ~0) && BITSET_TEST(worklist, comp_i)) { + assert(comp_i < count); + BITSET_CLEAR(worklist, comp_i); + return instructions[comp_i]; + } + + /* Otherwise, we insert a move */ + + midgard_instruction mov = v_mov(cond, cond); + mov.mask = vector ? 0xF : 0x1; + memcpy(mov.swizzle[1], swizzle, sizeof(mov.swizzle[1])); + + return mir_insert_instruction_before(ctx, user, mov); +} + +/* Most generally, we need instructions writing to r31 in the appropriate + * components */ + +static midgard_instruction * +mir_schedule_condition(compiler_context *ctx, + struct midgard_predicate *predicate, + BITSET_WORD *worklist, unsigned count, + midgard_instruction **instructions, + midgard_instruction *last) +{ + /* For a branch, the condition is the only argument; for csel, third */ + bool branch = last->compact_branch; + unsigned condition_index = branch ? 0 : 2; + + /* csel_v is vector; otherwise, conditions are scalar */ + bool vector = !branch && OP_IS_CSEL_V(last->alu.op); + + /* Grab the conditional instruction */ + + midgard_instruction *cond = mir_schedule_comparison( + ctx, instructions, predicate, worklist, count, last->src[condition_index], + vector, last->swizzle[2], last); + + /* We have exclusive reign over this (possibly move) conditional + * instruction. 
We can rewrite into a pipeline conditional register */ + + predicate->exclude = cond->dest; + cond->dest = SSA_FIXED_REGISTER(31); + + if (!vector) { + cond->mask = (1 << COMPONENT_W); + + mir_foreach_src(cond, s) { + if (cond->src[s] == ~0) + continue; + + for (unsigned q = 0; q < 4; ++q) + cond->swizzle[s][q + COMPONENT_W] = cond->swizzle[s][q]; + } + } + + /* Schedule the unit: csel is always in the latter pipeline, so a csel + * condition must be in the former pipeline stage (vmul/sadd), + * depending on scalar/vector of the instruction itself. A branch must + * be written from the latter pipeline stage and a branch condition is + * always scalar, so it is always in smul (exception: ball/bany, which + * will be vadd) */ + + if (branch) + cond->unit = UNIT_SMUL; + else + cond->unit = vector ? UNIT_VMUL : UNIT_SADD; + + return cond; +} + /* Schedules a single bundle of the given type */ static midgard_bundle @@ -950,7 +824,8 @@ mir_schedule_texture( { struct midgard_predicate predicate = { .tag = TAG_TEXTURE_4, - .destructive = true + .destructive = true, + .exclude = ~0 }; midgard_instruction *ins = @@ -959,7 +834,8 @@ mir_schedule_texture( mir_update_worklist(worklist, len, instructions, ins); struct midgard_bundle out = { - .tag = TAG_TEXTURE_4, + .tag = ins->texture.op == TEXTURE_OP_BARRIER ? + TAG_TEXTURE_4_BARRIER : TAG_TEXTURE_4, .instruction_count = 1, .instructions = { ins } }; @@ -974,20 +850,30 @@ mir_schedule_ldst( { struct midgard_predicate predicate = { .tag = TAG_LOAD_STORE_4, - .destructive = true + .destructive = true, + .exclude = ~0 }; + /* Try to pick two load/store ops. Second not gauranteed to exist */ + midgard_instruction *ins = mir_choose_instruction(instructions, worklist, len, &predicate); - mir_update_worklist(worklist, len, instructions, ins); + midgard_instruction *pair = + mir_choose_instruction(instructions, worklist, len, &predicate); struct midgard_bundle out = { .tag = TAG_LOAD_STORE_4, - .instruction_count = 1, - .instructions = { ins } + .instruction_count = pair ? 
2 : 1, + .instructions = { ins, pair } }; + /* We have to update the worklist atomically, since the two + * instructions run concurrently (TODO: verify it's not pipelined) */ + + mir_update_worklist(worklist, len, instructions, ins); + mir_update_worklist(worklist, len, instructions, pair); + return out; } @@ -1003,12 +889,11 @@ mir_schedule_alu( struct midgard_predicate predicate = { .tag = TAG_ALU_4, - .destructive = true + .destructive = true, + .exclude = ~0, + .constants = &bundle.constants }; - midgard_instruction *ins = - mir_choose_instruction(instructions, worklist, len, &predicate); - midgard_instruction *vmul = NULL; midgard_instruction *vadd = NULL; midgard_instruction *vlut = NULL; @@ -1016,88 +901,176 @@ mir_schedule_alu( midgard_instruction *sadd = NULL; midgard_instruction *branch = NULL; - mir_update_worklist(worklist, len, instructions, ins); + mir_choose_alu(&branch, instructions, worklist, len, &predicate, ALU_ENAB_BR_COMPACT); + mir_update_worklist(worklist, len, instructions, branch); + bool writeout = branch && branch->writeout; + bool zs_writeout = writeout && (branch->writeout_depth | branch->writeout_stencil); - if (ins->compact_branch) { - branch = ins; - } else if (!ins->unit) { - unsigned units = alu_opcode_props[ins->alu.op].props; - - if (units & UNIT_VMUL) { - ins->unit = UNIT_VMUL; - vmul = ins; - } else if (units & UNIT_VADD) { - ins->unit = UNIT_VADD; - vadd = ins; - } else if (units & UNIT_VLUT) { - ins->unit = UNIT_VLUT; - vlut = ins; - } else - assert(0); + if (branch && branch->branch.conditional) { + midgard_instruction *cond = mir_schedule_condition(ctx, &predicate, worklist, len, instructions, branch); + + if (cond->unit == UNIT_VADD) + vadd = cond; + else if (cond->unit == UNIT_SMUL) + smul = cond; + else + unreachable("Bad condition"); } - bundle.has_embedded_constants = ins->has_constants; - bundle.has_blend_constant = ins->has_blend_constant; + /* If we have a render target reference, schedule a move for it. Since + * this will be in sadd, we boost this to prevent scheduling csel into + * smul */ + + if (writeout && (branch->constants.u32[0] || ctx->is_blend)) { + sadd = ralloc(ctx, midgard_instruction); + *sadd = v_mov(~0, make_compiler_temp(ctx)); + sadd->unit = UNIT_SADD; + sadd->mask = 0x1; + sadd->has_inline_constant = true; + sadd->inline_constant = branch->constants.u32[0]; + branch->src[1] = sadd->dest; + branch->src_types[1] = sadd->dest_type; + + /* Mask off any conditionals. Could be optimized to just scalar + * conditionals TODO */ + predicate.no_cond = true; + } - if (ins->alu.reg_mode == midgard_reg_mode_16) { - /* TODO: Fix packing XXX */ - uint16_t *bundles = (uint16_t *) bundle.constants; - uint32_t *constants = (uint32_t *) ins->constants; + mir_choose_alu(&smul, instructions, worklist, len, &predicate, UNIT_SMUL); - /* Copy them wholesale */ - for (unsigned i = 0; i < 4; ++i) - bundles[i] = constants[i]; + if (!writeout) { + mir_choose_alu(&vlut, instructions, worklist, len, &predicate, UNIT_VLUT); } else { - memcpy(bundle.constants, ins->constants, sizeof(bundle.constants)); + /* Propagate up */ + bundle.last_writeout = branch->last_writeout; } - if (ins->writeout) { - unsigned src = (branch->src[0] == ~0) ? SSA_FIXED_REGISTER(0) : branch->src[0]; - unsigned temp = (branch->src[0] == ~0) ? 
SSA_FIXED_REGISTER(0) : make_compiler_temp(ctx); - midgard_instruction mov = v_mov(src, blank_alu_src, temp); - vmul = mem_dup(&mov, sizeof(midgard_instruction)); - vmul->unit = UNIT_VMUL; - vmul->mask = 0xF; - /* TODO: Don't leak */ + if (writeout && !zs_writeout) { + vadd = ralloc(ctx, midgard_instruction); + *vadd = v_mov(~0, make_compiler_temp(ctx)); - /* Rewrite to use our temp */ - midgard_instruction *stages[] = { sadd, vadd, smul }; + if (!ctx->is_blend) { + vadd->alu.op = midgard_alu_op_iadd; + vadd->src[0] = SSA_FIXED_REGISTER(31); + vadd->src_types[0] = nir_type_uint32; - for (unsigned i = 0; i < ARRAY_SIZE(stages); ++i) { - if (stages[i]) - mir_rewrite_index_dst_single(stages[i], src, temp); + for (unsigned c = 0; c < 16; ++c) + vadd->swizzle[0][c] = COMPONENT_X; + + vadd->has_inline_constant = true; + vadd->inline_constant = 0; + } else { + vadd->src[1] = SSA_FIXED_REGISTER(1); + vadd->src_types[0] = nir_type_uint32; + + for (unsigned c = 0; c < 16; ++c) + vadd->swizzle[1][c] = COMPONENT_W; } - mir_rewrite_index_src_single(branch, src, temp); + vadd->unit = UNIT_VADD; + vadd->mask = 0x1; + branch->src[2] = vadd->dest; + branch->src_types[2] = vadd->dest_type; } - if ((vadd && OP_IS_CSEL(vadd->alu.op)) || (smul && OP_IS_CSEL(smul->alu.op)) || (ins->compact_branch && !ins->prepacked_branch && ins->branch.conditional)) { - midgard_instruction *cond = mir_choose_instruction(instructions, worklist, len, &predicate); - mir_update_worklist(worklist, len, instructions, cond); + mir_choose_alu(&vadd, instructions, worklist, len, &predicate, UNIT_VADD); + + mir_update_worklist(worklist, len, instructions, vlut); + mir_update_worklist(worklist, len, instructions, vadd); + mir_update_worklist(worklist, len, instructions, smul); - if (!cond->unit) { - unsigned units = alu_opcode_props[cond->alu.op].props; + bool vadd_csel = vadd && OP_IS_CSEL(vadd->alu.op); + bool smul_csel = smul && OP_IS_CSEL(smul->alu.op); - if (units & UNIT_VMUL) { - cond->unit = UNIT_VMUL; - } else if (units & UNIT_VADD) { - cond->unit = UNIT_VADD; - } else - assert(0); - } + if (vadd_csel || smul_csel) { + midgard_instruction *ins = vadd_csel ? vadd : smul; + midgard_instruction *cond = mir_schedule_condition(ctx, &predicate, worklist, len, instructions, ins); - if (cond->unit & UNIT_VMUL) + if (cond->unit == UNIT_VMUL) vmul = cond; - else if (cond->unit & UNIT_SADD) + else if (cond->unit == UNIT_SADD) sadd = cond; - else if (cond->unit & UNIT_VADD) - vadd = cond; - else if (cond->unit & UNIT_SMUL) - smul = cond; else unreachable("Bad condition"); } + /* Stage 2, let's schedule sadd before vmul for writeout */ + mir_choose_alu(&sadd, instructions, worklist, len, &predicate, UNIT_SADD); + + /* Check if writeout reads its own register */ + + if (writeout) { + midgard_instruction *stages[] = { sadd, vadd, smul }; + unsigned src = (branch->src[0] == ~0) ? SSA_FIXED_REGISTER(zs_writeout ? 1 : 0) : branch->src[0]; + unsigned writeout_mask = 0x0; + bool bad_writeout = false; + + for (unsigned i = 0; i < ARRAY_SIZE(stages); ++i) { + if (!stages[i]) + continue; + + if (stages[i]->dest != src) + continue; + + writeout_mask |= stages[i]->mask; + bad_writeout |= mir_has_arg(stages[i], branch->src[0]); + } + + /* It's possible we'll be able to schedule something into vmul + * to fill r0/r1. Let's peak into the future, trying to schedule + * vmul specially that way. */ + + unsigned full_mask = zs_writeout ? 
+ (1 << (branch->writeout_depth + branch->writeout_stencil)) - 1 : + 0xF; + + if (!bad_writeout && writeout_mask != full_mask) { + predicate.unit = UNIT_VMUL; + predicate.dest = src; + predicate.mask = writeout_mask ^ full_mask; + + struct midgard_instruction *peaked = + mir_choose_instruction(instructions, worklist, len, &predicate); + + if (peaked) { + vmul = peaked; + vmul->unit = UNIT_VMUL; + writeout_mask |= predicate.mask; + assert(writeout_mask == full_mask); + } + + /* Cleanup */ + predicate.dest = predicate.mask = 0; + } + + /* Finally, add a move if necessary */ + if (bad_writeout || writeout_mask != full_mask) { + unsigned temp = (branch->src[0] == ~0) ? SSA_FIXED_REGISTER(zs_writeout ? 1 : 0) : make_compiler_temp(ctx); + + vmul = ralloc(ctx, midgard_instruction); + *vmul = v_mov(src, temp); + vmul->unit = UNIT_VMUL; + vmul->mask = full_mask ^ writeout_mask; + + /* Rewrite to use our temp */ + + for (unsigned i = 0; i < ARRAY_SIZE(stages); ++i) { + if (stages[i]) + mir_rewrite_index_dst_single(stages[i], src, temp); + } + + mir_rewrite_index_src_single(branch, src, temp); + } + } + + mir_choose_alu(&vmul, instructions, worklist, len, &predicate, UNIT_VMUL); + + mir_update_worklist(worklist, len, instructions, vmul); + mir_update_worklist(worklist, len, instructions, sadd); + + bundle.has_blend_constant = predicate.blend_constant; + bundle.has_embedded_constants = predicate.constant_mask != 0; + unsigned padding = 0; /* Now that we have finished scheduling, build up the bundle */ @@ -1108,6 +1081,14 @@ mir_schedule_alu( bundle.control |= stages[i]->unit; bytes_emitted += bytes_for_instruction(stages[i]); bundle.instructions[bundle.instruction_count++] = stages[i]; + + /* If we branch, we can't spill to TLS since the store + * instruction will never get executed. We could try to + * break the bundle but this is probably easier for + * now. 
*/ + + if (branch) + stages[i]->no_spill |= (1 << REG_CLASS_WORK); } } @@ -1124,6 +1105,11 @@ mir_schedule_alu( /* Size ALU instruction for tag */ bundle.tag = (TAG_ALU_4) + (bytes_emitted / 16) - 1; + + /* MRT capable GPUs use a special writeout procedure */ + if (writeout && !(ctx->quirks & MIDGARD_NO_UPPER_ALU)) + bundle.tag += 4; + bundle.padding = padding; bundle.control |= bundle.tag; @@ -1141,6 +1127,9 @@ schedule_block(compiler_context *ctx, midgard_block *block) unsigned len = 0; midgard_instruction **instructions = flatten_mir(block, &len); + if (!len) + return; + /* Calculate dependencies and initial worklist */ unsigned node_count = ctx->temp_count + 1; mir_create_dependency_graph(instructions, len, node_count); @@ -1150,393 +1139,84 @@ schedule_block(compiler_context *ctx, midgard_block *block) BITSET_WORD *worklist = calloc(sz, 1); mir_initialize_worklist(worklist, instructions, len); - util_dynarray_init(&block->bundles, NULL); + struct util_dynarray bundles; + util_dynarray_init(&bundles, NULL); block->quadword_count = 0; + unsigned blend_offset = 0; + + for (;;) { + unsigned tag = mir_choose_bundle(instructions, worklist, len); + midgard_bundle bundle; + + if (tag == TAG_TEXTURE_4) + bundle = mir_schedule_texture(instructions, worklist, len); + else if (tag == TAG_LOAD_STORE_4) + bundle = mir_schedule_ldst(instructions, worklist, len); + else if (tag == TAG_ALU_4) + bundle = mir_schedule_alu(ctx, instructions, worklist, len); + else + break; - int skip = 0; - mir_foreach_instr_in_block(block, ins) { - if (skip) { - skip--; - continue; - } - - midgard_bundle bundle = schedule_bundle(ctx, block, ins, &skip); - util_dynarray_append(&block->bundles, midgard_bundle, bundle); - - if (bundle.has_blend_constant) { - unsigned offset = ctx->quadword_count + block->quadword_count + quadword_size(bundle.tag) - 1; - ctx->blend_constant_offset = offset * 0x10; - } - - block->quadword_count += quadword_size(bundle.tag); - } - - block->is_scheduled = true; - ctx->quadword_count += block->quadword_count; -} - -/* The following passes reorder MIR instructions to enable better scheduling */ - -static void -midgard_pair_load_store(compiler_context *ctx, midgard_block *block) -{ - mir_foreach_instr_in_block_safe(block, ins) { - if (ins->type != TAG_LOAD_STORE_4) continue; - - /* We've found a load/store op. Check if next is also load/store. */ - midgard_instruction *next_op = mir_next_op(ins); - if (&next_op->link != &block->instructions) { - if (next_op->type == TAG_LOAD_STORE_4) { - /* If so, we're done since we're a pair */ - ins = mir_next_op(ins); - continue; - } - - /* Maximum search distance to pair, to avoid register pressure disasters */ - int search_distance = 8; - - /* Otherwise, we have an orphaned load/store -- search for another load */ - mir_foreach_instr_in_block_from(block, c, mir_next_op(ins)) { - /* Terminate search if necessary */ - if (!(search_distance--)) break; - - if (c->type != TAG_LOAD_STORE_4) continue; - - /* We can only reorder if there are no sources */ - - bool deps = false; - - for (unsigned s = 0; s < ARRAY_SIZE(ins->src); ++s) - deps |= (c->src[s] != ~0); - - if (deps) - continue; - - /* We found one! 
Move it up to pair and remove it from the old location */ - - mir_insert_instruction_before(ctx, ins, *c); - mir_remove_instruction(c); - - break; - } - } - } -} - -/* When we're 'squeezing down' the values in the IR, we maintain a hash - * as such */ - -static unsigned -find_or_allocate_temp(compiler_context *ctx, unsigned hash) -{ - if (hash >= SSA_FIXED_MINIMUM) - return hash; - - unsigned temp = (uintptr_t) _mesa_hash_table_u64_search( - ctx->hash_to_temp, hash + 1); - - if (temp) - return temp - 1; - - /* If no temp is find, allocate one */ - temp = ctx->temp_count++; - ctx->max_hash = MAX2(ctx->max_hash, hash); - - _mesa_hash_table_u64_insert(ctx->hash_to_temp, - hash + 1, (void *) ((uintptr_t) temp + 1)); - - return temp; -} - -/* Reassigns numbering to get rid of gaps in the indices */ - -static void -mir_squeeze_index(compiler_context *ctx) -{ - /* Reset */ - ctx->temp_count = 0; - /* TODO don't leak old hash_to_temp */ - ctx->hash_to_temp = _mesa_hash_table_u64_create(NULL); - - mir_foreach_instr_global(ctx, ins) { - ins->dest = find_or_allocate_temp(ctx, ins->dest); - - for (unsigned i = 0; i < ARRAY_SIZE(ins->src); ++i) - ins->src[i] = find_or_allocate_temp(ctx, ins->src[i]); - } -} - -static midgard_instruction -v_load_store_scratch( - unsigned srcdest, - unsigned index, - bool is_store, - unsigned mask) -{ - /* We index by 32-bit vec4s */ - unsigned byte = (index * 4 * 4); - - midgard_instruction ins = { - .type = TAG_LOAD_STORE_4, - .mask = mask, - .dest = ~0, - .src = { ~0, ~0, ~0 }, - .load_store = { - .op = is_store ? midgard_op_st_int4 : midgard_op_ld_int4, - .swizzle = SWIZZLE_XYZW, - - /* For register spilling - to thread local storage */ - .arg_1 = 0xEA, - .arg_2 = 0x1E, - - /* Splattered across, TODO combine logically */ - .varying_parameters = (byte & 0x1FF) << 1, - .address = (byte >> 9) - }, - - /* If we spill an unspill, RA goes into an infinite loop */ - .no_spill = true - }; - - if (is_store) { - /* r0 = r26, r1 = r27 */ - assert(srcdest == SSA_FIXED_REGISTER(26) || srcdest == SSA_FIXED_REGISTER(27)); - ins.src[0] = srcdest; - } else { - ins.dest = srcdest; - } - - return ins; -} - -/* If register allocation fails, find the best spill node and spill it to fix - * whatever the issue was. This spill node could be a work register (spilling - * to thread local storage), but it could also simply be a special register - * that needs to spill to become a work register. */ - -static void mir_spill_register( - compiler_context *ctx, - struct ra_graph *g, - unsigned *spill_count) -{ - unsigned spill_index = ctx->temp_count; - - /* Our first step is to calculate spill cost to figure out the best - * spill node. All nodes are equal in spill cost, but we can't spill - * nodes written to from an unspill */ - - for (unsigned i = 0; i < ctx->temp_count; ++i) { - ra_set_node_spill_cost(g, i, 1.0); - } - - /* We can't spill any bundles that contain unspills. This could be - * optimized to allow use of r27 to spill twice per bundle, but if - * you're at the point of optimizing spilling, it's too late. 
*/ - - mir_foreach_block(ctx, block) { - mir_foreach_bundle_in_block(block, bun) { - bool no_spill = false; - - for (unsigned i = 0; i < bun->instruction_count; ++i) - no_spill |= bun->instructions[i]->no_spill; + util_dynarray_append(&bundles, midgard_bundle, bundle); - if (!no_spill) - continue; + if (bundle.has_blend_constant) + blend_offset = block->quadword_count; - for (unsigned i = 0; i < bun->instruction_count; ++i) { - unsigned dest = bun->instructions[i]->dest; - if (dest < ctx->temp_count) - ra_set_node_spill_cost(g, dest, -1.0); - } - } + block->quadword_count += midgard_tag_props[bundle.tag].size; } - int spill_node = ra_get_best_spill_node(g); + /* We emitted bundles backwards; copy into the block in reverse-order */ - if (spill_node < 0) { - mir_print_shader(ctx); - assert(0); + util_dynarray_init(&block->bundles, block); + util_dynarray_foreach_reverse(&bundles, midgard_bundle, bundle) { + util_dynarray_append(&block->bundles, midgard_bundle, *bundle); } + util_dynarray_fini(&bundles); - /* We have a spill node, so check the class. Work registers - * legitimately spill to TLS, but special registers just spill to work - * registers */ - - unsigned class = ra_get_node_class(g, spill_node); - bool is_special = (class >> 2) != REG_CLASS_WORK; - bool is_special_w = (class >> 2) == REG_CLASS_TEXW; - - /* Allocate TLS slot (maybe) */ - unsigned spill_slot = !is_special ? (*spill_count)++ : 0; - - /* For TLS, replace all stores to the spilled node. For - * special reads, just keep as-is; the class will be demoted - * implicitly. For special writes, spill to a work register */ + /* Blend constant was backwards as well. blend_offset if set is + * strictly positive, as an offset of zero would imply constants before + * any instructions which is invalid in Midgard. TODO: blend constants + * are broken if you spill since then quadword_count becomes invalid + * XXX */ - if (!is_special || is_special_w) { - if (is_special_w) - spill_slot = spill_index++; - - mir_foreach_block(ctx, block) { - mir_foreach_instr_in_block_safe(block, ins) { - if (ins->dest != spill_node) continue; - - midgard_instruction st; - - if (is_special_w) { - st = v_mov(spill_node, blank_alu_src, spill_slot); - st.no_spill = true; - } else { - ins->dest = SSA_FIXED_REGISTER(26); - ins->no_spill = true; - st = v_load_store_scratch(ins->dest, spill_slot, true, ins->mask); - } + if (blend_offset) + ctx->blend_constant_offset = ((ctx->quadword_count + block->quadword_count) - blend_offset - 1) * 0x10; - /* Hint: don't rewrite this node */ - st.hint = true; + block->scheduled = true; + ctx->quadword_count += block->quadword_count; - mir_insert_instruction_after_scheduled(ctx, block, ins, st); + /* Reorder instructions to match bundled. First remove existing + * instructions and then recreate the list */ - if (!is_special) - ctx->spills++; - } - } + mir_foreach_instr_in_block_safe(block, ins) { + list_del(&ins->link); } - /* For special reads, figure out how many components we need */ - unsigned read_mask = 0; - - mir_foreach_instr_global_safe(ctx, ins) { - read_mask |= mir_mask_of_read_components(ins, spill_node); + mir_foreach_instr_in_block_scheduled_rev(block, ins) { + list_add(&ins->link, &block->base.instructions); } - /* Insert a load from TLS before the first consecutive - * use of the node, rewriting to use spilled indices to - * break up the live range. Or, for special, insert a - * move. 
Ironically the latter *increases* register - * pressure, but the two uses of the spilling mechanism - * are somewhat orthogonal. (special spilling is to use - * work registers to back special registers; TLS - * spilling is to use memory to back work registers) */ - - mir_foreach_block(ctx, block) { - bool consecutive_skip = false; - unsigned consecutive_index = 0; - - mir_foreach_instr_in_block(block, ins) { - /* We can't rewrite the moves used to spill in the - * first place. These moves are hinted. */ - if (ins->hint) continue; - - if (!mir_has_arg(ins, spill_node)) { - consecutive_skip = false; - continue; - } - - if (consecutive_skip) { - /* Rewrite */ - mir_rewrite_index_src_single(ins, spill_node, consecutive_index); - continue; - } - - if (!is_special_w) { - consecutive_index = ++spill_index; - - midgard_instruction *before = ins; - - /* For a csel, go back one more not to break up the bundle */ - if (ins->type == TAG_ALU_4 && OP_IS_CSEL(ins->alu.op)) - before = mir_prev_op(before); - - midgard_instruction st; - - if (is_special) { - /* Move */ - st = v_mov(spill_node, blank_alu_src, consecutive_index); - st.no_spill = true; - } else { - /* TLS load */ - st = v_load_store_scratch(consecutive_index, spill_slot, false, 0xF); - } - - /* Mask the load based on the component count - * actually needed to prvent RA loops */ - - st.mask = read_mask; - - mir_insert_instruction_before_scheduled(ctx, block, before, st); - // consecutive_skip = true; - } else { - /* Special writes already have their move spilled in */ - consecutive_index = spill_slot; - } - - - /* Rewrite to use */ - mir_rewrite_index_src_single(ins, spill_node, consecutive_index); - - if (!is_special) - ctx->fills++; - } - } - - /* Reset hints */ - - mir_foreach_instr_global(ctx, ins) { - ins->hint = false; - } + free(instructions); /* Allocated by flatten_mir() */ + free(worklist); } void -schedule_program(compiler_context *ctx) +midgard_schedule_program(compiler_context *ctx) { - struct ra_graph *g = NULL; - bool spilled = false; - int iter_count = 1000; /* max iterations */ - - /* Number of 128-bit slots in memory we've spilled into */ - unsigned spill_count = 0; + midgard_promote_uniforms(ctx); - midgard_promote_uniforms(ctx, 16); - - mir_foreach_block(ctx, block) { - midgard_pair_load_store(ctx, block); - } - - /* Must be lowered right before RA */ + /* Must be lowered right before scheduling */ mir_squeeze_index(ctx); mir_lower_special_reads(ctx); mir_squeeze_index(ctx); /* Lowering can introduce some dead moves */ - mir_foreach_block(ctx, block) { + mir_foreach_block(ctx, _block) { + midgard_block *block = (midgard_block *) _block; midgard_opt_dead_move_eliminate(ctx, block); schedule_block(ctx, block); } - mir_create_pipeline_registers(ctx); - - do { - if (spilled) - mir_spill_register(ctx, g, &spill_count); - - mir_squeeze_index(ctx); - - g = NULL; - g = allocate_registers(ctx, &spilled); - } while(spilled && ((iter_count--) > 0)); - - if (iter_count <= 0) { - fprintf(stderr, "panfrost: Gave up allocating registers, rendering will be incomplete\n"); - assert(0); - } - - /* Report spilling information. spill_count is in 128-bit slots (vec4 x - * fp32), but tls_size is in bytes, so multiply by 16 */ - - ctx->tls_size = spill_count * 16; - - install_registers(ctx, g); }
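
The core idea behind the dependency-graph changes in this patch is the switch from per-component masks (COMPONENT_COUNT 8) to per-byte masks (BYTE_COUNT 16), so that writes to disjoint bytes of the same node do not serialize against each other. The following is a minimal standalone sketch of that idea, not part of the patch and not Midgard compiler code; all types and names here are illustrative only, and the real scheduler tracks readers and writers with util_dynarray tables rather than a fixed last-writer array.

#include <stdint.h>
#include <stdio.h>

#define BYTES_PER_NODE 16

/* Last writer of each byte of each node; -1 means "no writer yet".
 * Four nodes are enough for the demonstration. */
static int last_writer[4][BYTES_PER_NODE];

static void
mark_write(unsigned node, uint16_t bytemask, int instr)
{
        for (unsigned b = 0; b < BYTES_PER_NODE; ++b)
                if (bytemask & (1u << b))
                        last_writer[node][b] = instr;
}

/* Returns a bitmask of instruction indices the reader depends on:
 * every instruction that last wrote one of the bytes being read. */
static uint32_t
read_deps(unsigned node, uint16_t bytemask)
{
        uint32_t deps = 0;

        for (unsigned b = 0; b < BYTES_PER_NODE; ++b) {
                if (!(bytemask & (1u << b)))
                        continue;
                if (last_writer[node][b] >= 0)
                        deps |= 1u << last_writer[node][b];
        }

        return deps;
}

int
main(void)
{
        for (unsigned n = 0; n < 4; ++n)
                for (unsigned b = 0; b < BYTES_PER_NODE; ++b)
                        last_writer[n][b] = -1;

        mark_write(0, 0x00ff, 0); /* instr 0 writes bytes 0-7 of node 0  */
        mark_write(0, 0xff00, 1); /* instr 1 writes bytes 8-15 of node 0 */

        /* A reader of only the upper eight bytes depends on instr 1 alone,
         * so it could still be bundled alongside instr 0. With a coarser
         * per-register mask it would have depended on both writers. */
        printf("deps = 0x%x\n", read_deps(0, 0xff00)); /* prints deps = 0x2 */

        return 0;
}

The same byte-level granularity is what lets mir_adjust_constant() in the patch pack constants of mixed 16-bit and 32-bit sources into one 16-byte embedded-constant slot, reusing any bytes that already match.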