X-Git-Url: https://git.libre-soc.org/?p=mesa.git;a=blobdiff_plain;f=src%2Fpanfrost%2Fmidgard%2Fmidgard_ra.c;h=b06eb97ab8ec77f96e3126fc9bb0416a8fb9ca1a;hp=afee5b82745fdea1d98e1585e73f5258bba5b90f;hb=b4de9e035ac0afe64ebfd58cd0eb04c1a671afce;hpb=3450c013c5a90c1689287c69aaf0e41eae147bc3 diff --git a/src/panfrost/midgard/midgard_ra.c b/src/panfrost/midgard/midgard_ra.c index afee5b82745..b06eb97ab8e 100644 --- a/src/panfrost/midgard/midgard_ra.c +++ b/src/panfrost/midgard/midgard_ra.c @@ -24,112 +24,53 @@ #include "compiler.h" #include "midgard_ops.h" -#include "util/register_allocate.h" #include "util/u_math.h" #include "util/u_memory.h" - -/* For work registers, we can subdivide in various ways. So we create - * classes for the various sizes and conflict accordingly, keeping in - * mind that physical registers are divided along 128-bit boundaries. - * The important part is that 128-bit boundaries are not crossed. - * - * For each 128-bit register, we can subdivide to 32-bits 10 ways - * - * vec4: xyzw - * vec3: xyz, yzw - * vec2: xy, yz, zw, - * vec1: x, y, z, w - * - * For each 64-bit register, we can subdivide similarly to 16-bit - * (TODO: half-float RA, not that we support fp16 yet) - */ - -#define WORK_STRIDE 10 - -/* We have overlapping register classes for special registers, handled via - * shadows */ - -#define SHADOW_R0 17 -#define SHADOW_R28 18 -#define SHADOW_R29 19 - -/* Prepacked masks/swizzles for virtual register types */ -static unsigned reg_type_to_mask[WORK_STRIDE] = { - 0xF, /* xyzw */ - 0x7, 0x7 << 1, /* xyz */ - 0x3, 0x3 << 1, 0x3 << 2, /* xy */ - 0x1, 0x1 << 1, 0x1 << 2, 0x1 << 3 /* x */ -}; - -static unsigned reg_type_to_swizzle[WORK_STRIDE] = { - SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W), - - SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W), - SWIZZLE(COMPONENT_Y, COMPONENT_Z, COMPONENT_W, COMPONENT_W), - - SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W), - SWIZZLE(COMPONENT_Y, COMPONENT_Z, COMPONENT_Z, COMPONENT_W), - SWIZZLE(COMPONENT_Z, COMPONENT_W, COMPONENT_Z, COMPONENT_W), - - SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W), - SWIZZLE(COMPONENT_Y, COMPONENT_Y, COMPONENT_Z, COMPONENT_W), - SWIZZLE(COMPONENT_Z, COMPONENT_Y, COMPONENT_Z, COMPONENT_W), - SWIZZLE(COMPONENT_W, COMPONENT_Y, COMPONENT_Z, COMPONENT_W), -}; +#include "midgard_quirks.h" struct phys_reg { + /* Physical register: 0-31 */ unsigned reg; - unsigned mask; - unsigned swizzle; -}; -/* Given the mask/swizzle of both the register and the original source, - * compose to find the actual mask/swizzle to give the hardware */ + /* Byte offset into the physical register: 0-15 */ + unsigned offset; -static unsigned -compose_writemask(unsigned mask, struct phys_reg reg) -{ - /* Note: the reg mask is guaranteed to be contiguous. So we shift - * into the X place, compose via a simple AND, and shift back */ + /* log2(bytes per component) for fast mul/div */ + unsigned shift; +}; - unsigned shift = __builtin_ctz(reg.mask); - return ((reg.mask >> shift) & mask) << shift; -} +/* Shift up by reg_offset and horizontally by dst_offset. */ -static unsigned -compose_swizzle(unsigned swizzle, unsigned mask, - struct phys_reg reg, struct phys_reg dst) +static void +offset_swizzle(unsigned *swizzle, unsigned reg_offset, unsigned srcshift, unsigned dstshift, unsigned dst_offset) { - unsigned out = pan_compose_swizzle(swizzle, reg.swizzle); + unsigned out[MIR_VEC_COMPONENTS]; - /* Based on the register mask, we need to adjust over. 
E.g if we're - * writing to yz, a base swizzle of xy__ becomes _xy_. Save the - * original first component (x). But to prevent duplicate shifting - * (only applies to ALU -- mask param is set to xyzw out on L/S to - * prevent changes), we have to account for the shift inherent to the - * original writemask */ + signed reg_comp = reg_offset >> srcshift; + signed dst_comp = dst_offset >> dstshift; - unsigned rep = out & 0x3; - unsigned shift = __builtin_ctz(dst.mask) - __builtin_ctz(mask); - unsigned shifted = out << (2*shift); + unsigned max_component = (16 >> srcshift) - 1; - /* ..but we fill in the gaps so it appears to replicate */ + assert(reg_comp << srcshift == reg_offset); + assert(dst_comp << dstshift == dst_offset); - for (unsigned s = 0; s < shift; ++s) - shifted |= rep << (2*s); + for (signed c = 0; c < MIR_VEC_COMPONENTS; ++c) { + signed comp = MAX2(c - dst_comp, 0); + out[c] = MIN2(swizzle[comp] + reg_comp, max_component); + } - return shifted; + memcpy(swizzle, out, sizeof(out)); } /* Helper to return the default phys_reg for a given register */ static struct phys_reg -default_phys_reg(int reg) +default_phys_reg(int reg, unsigned shift) { struct phys_reg r = { .reg = reg, - .mask = 0xF, /* xyzw */ - .swizzle = 0xE4 /* xyzw */ + .offset = 0, + .shift = shift }; return r; @@ -139,206 +80,37 @@ default_phys_reg(int reg) * register corresponds to */ static struct phys_reg -index_to_reg(compiler_context *ctx, struct ra_graph *g, unsigned reg) +index_to_reg(compiler_context *ctx, struct lcra_state *l, unsigned reg, unsigned shift) { /* Check for special cases */ if (reg == ~0) - return default_phys_reg(REGISTER_UNUSED); + return default_phys_reg(REGISTER_UNUSED, shift); else if (reg >= SSA_FIXED_MINIMUM) - return default_phys_reg(SSA_REG_FROM_FIXED(reg)); - else if (!g) - return default_phys_reg(REGISTER_UNUSED); - - /* Special cases aside, we pick the underlying register */ - int virt = ra_get_node_reg(g, reg); - - /* Divide out the register and classification */ - int phys = virt / WORK_STRIDE; - int type = virt % WORK_STRIDE; - - /* Apply shadow registers */ - - if (phys >= SHADOW_R28 && phys <= SHADOW_R29) - phys += 28 - SHADOW_R28; - else if (phys == SHADOW_R0) - phys = 0; + return default_phys_reg(SSA_REG_FROM_FIXED(reg), shift); + else if (!l) + return default_phys_reg(REGISTER_UNUSED, shift); struct phys_reg r = { - .reg = phys, - .mask = reg_type_to_mask[type], - .swizzle = reg_type_to_swizzle[type] + .reg = l->solutions[reg] / 16, + .offset = l->solutions[reg] & 0xF, + .shift = shift }; /* Report that we actually use this register, and return it */ - if (phys < 16) - ctx->work_registers = MAX2(ctx->work_registers, phys); + if (r.reg < 16) + ctx->work_registers = MAX2(ctx->work_registers, r.reg); return r; } -/* This routine creates a register set. Should be called infrequently since - * it's slow and can be cached. 
For legibility, variables are named in terms of - * work registers, although it is also used to create the register set for - * special register allocation */ - -static void -add_shadow_conflicts (struct ra_regs *regs, unsigned base, unsigned shadow, unsigned shadow_count) -{ - for (unsigned a = 0; a < WORK_STRIDE; ++a) { - unsigned reg_a = (WORK_STRIDE * base) + a; - - for (unsigned b = 0; b < shadow_count; ++b) { - unsigned reg_b = (WORK_STRIDE * shadow) + b; - - ra_add_reg_conflict(regs, reg_a, reg_b); - ra_add_reg_conflict(regs, reg_b, reg_a); - } - } -} - -static struct ra_regs * -create_register_set(unsigned work_count, unsigned *classes) -{ - int virtual_count = 32 * WORK_STRIDE; - - /* First, initialize the RA */ - struct ra_regs *regs = ra_alloc_reg_set(NULL, virtual_count, true); - - for (unsigned c = 0; c < (NR_REG_CLASSES - 1); ++c) { - int work_vec4 = ra_alloc_reg_class(regs); - int work_vec3 = ra_alloc_reg_class(regs); - int work_vec2 = ra_alloc_reg_class(regs); - int work_vec1 = ra_alloc_reg_class(regs); - - classes[4*c + 0] = work_vec1; - classes[4*c + 1] = work_vec2; - classes[4*c + 2] = work_vec3; - classes[4*c + 3] = work_vec4; - - /* Special register classes have other register counts */ - unsigned count = - (c == REG_CLASS_WORK) ? work_count : 2; - - unsigned first_reg = - (c == REG_CLASS_LDST) ? 26 : - (c == REG_CLASS_TEXR) ? 28 : - (c == REG_CLASS_TEXW) ? SHADOW_R28 : - 0; - - /* Add the full set of work registers */ - for (unsigned i = first_reg; i < (first_reg + count); ++i) { - int base = WORK_STRIDE * i; - - /* Build a full set of subdivisions */ - ra_class_add_reg(regs, work_vec4, base); - ra_class_add_reg(regs, work_vec3, base + 1); - ra_class_add_reg(regs, work_vec3, base + 2); - ra_class_add_reg(regs, work_vec2, base + 3); - ra_class_add_reg(regs, work_vec2, base + 4); - ra_class_add_reg(regs, work_vec2, base + 5); - ra_class_add_reg(regs, work_vec1, base + 6); - ra_class_add_reg(regs, work_vec1, base + 7); - ra_class_add_reg(regs, work_vec1, base + 8); - ra_class_add_reg(regs, work_vec1, base + 9); - - for (unsigned a = 0; a < 10; ++a) { - unsigned mask1 = reg_type_to_mask[a]; - - for (unsigned b = 0; b < 10; ++b) { - unsigned mask2 = reg_type_to_mask[b]; - - if (mask1 & mask2) - ra_add_reg_conflict(regs, - base + a, base + b); - } - } - } - } - - int fragc = ra_alloc_reg_class(regs); - - classes[4*REG_CLASS_FRAGC + 0] = fragc; - classes[4*REG_CLASS_FRAGC + 1] = fragc; - classes[4*REG_CLASS_FRAGC + 2] = fragc; - classes[4*REG_CLASS_FRAGC + 3] = fragc; - ra_class_add_reg(regs, fragc, WORK_STRIDE * SHADOW_R0); - - /* We have duplicate classes */ - add_shadow_conflicts(regs, 0, SHADOW_R0, 1); - add_shadow_conflicts(regs, 28, SHADOW_R28, WORK_STRIDE); - add_shadow_conflicts(regs, 29, SHADOW_R29, WORK_STRIDE); - - /* We're done setting up */ - ra_set_finalize(regs, NULL); - - return regs; -} - -/* This routine gets a precomputed register set off the screen if it's able, or - * otherwise it computes one on the fly */ - -static struct ra_regs * -get_register_set(struct midgard_screen *screen, unsigned work_count, unsigned **classes) -{ - /* Bounds check */ - assert(work_count >= 8); - assert(work_count <= 16); - - /* Compute index */ - unsigned index = work_count - 8; - - /* Find the reg set */ - struct ra_regs *cached = screen->regs[index]; - - if (cached) { - assert(screen->reg_classes[index]); - *classes = screen->reg_classes[index]; - return cached; - } - - /* Otherwise, create one */ - struct ra_regs *created = create_register_set(work_count, 
screen->reg_classes[index]); - - /* Cache it and use it */ - screen->regs[index] = created; - - *classes = screen->reg_classes[index]; - return created; -} - -/* Assign a (special) class, ensuring that it is compatible with whatever class - * was already set */ - static void set_class(unsigned *classes, unsigned node, unsigned class) { - /* Check that we're even a node */ - if (node >= SSA_FIXED_MINIMUM) - return; - - /* First 4 are work, next 4 are load/store.. */ - unsigned current_class = classes[node] >> 2; - - /* Nothing to do */ - if (class == current_class) - return; - - /* If we're changing, we haven't assigned a special class */ - assert(current_class == REG_CLASS_WORK); - - classes[node] &= 0x3; - classes[node] |= (class << 2); -} - -static void -force_vec4(unsigned *classes, unsigned node) -{ - if (node >= SSA_FIXED_MINIMUM) - return; - - /* Force vec4 = 3 */ - classes[node] |= 0x3; + if (node < SSA_FIXED_MINIMUM && class != classes[node]) { + assert(classes[node] == REG_CLASS_WORK); + classes[node] = class; + } } /* Special register classes impose special constraints on who can read their @@ -351,9 +123,7 @@ check_read_class(unsigned *classes, unsigned tag, unsigned node) if (node >= SSA_FIXED_MINIMUM) return true; - unsigned current_class = classes[node] >> 2; - - switch (current_class) { + switch (classes[node]) { case REG_CLASS_LDST: return (tag == TAG_LOAD_STORE_4); case REG_CLASS_TEXR: @@ -374,9 +144,7 @@ check_write_class(unsigned *classes, unsigned tag, unsigned node) if (node >= SSA_FIXED_MINIMUM) return true; - unsigned current_class = classes[node] >> 2; - - switch (current_class) { + switch (classes[node]) { case REG_CLASS_TEXR: return true; case REG_CLASS_TEXW: @@ -498,10 +266,6 @@ mir_lower_special_reads(compiler_context *ctx) unsigned idx = spill_idx++; - midgard_instruction m = hazard_write ? 
- v_mov(idx, blank_alu_src, i) : - v_mov(i, blank_alu_src, idx); - /* Insert move before each read/write, depending on the * hazard we're trying to account for */ @@ -512,20 +276,25 @@ mir_lower_special_reads(compiler_context *ctx) if (hazard_write) { if (pre_use->dest != i) continue; - } else { - if (!mir_has_arg(pre_use, i)) - continue; - } - if (hazard_write) { + midgard_instruction m = v_mov(idx, i); + m.dest_type = pre_use->dest_type; + m.src_types[1] = m.dest_type; + m.mask = pre_use->mask; + midgard_instruction *use = mir_next_op(pre_use); assert(use); mir_insert_instruction_before(ctx, use, m); mir_rewrite_index_dst_single(pre_use, i, idx); } else { + if (!mir_has_arg(pre_use, i)) + continue; + idx = spill_idx++; - m = v_mov(i, blank_alu_src, idx); - m.mask = mir_mask_of_read_components(pre_use, i); + + midgard_instruction m = v_mov(i, idx); + m.mask = mir_from_bytemask(mir_round_bytemask_up( + mir_bytemask_of_read_components(pre_use, i), 32), 32); mir_insert_instruction_before(ctx, pre_use, m); mir_rewrite_index_src_single(pre_use, i, idx); } @@ -544,17 +313,40 @@ mir_lower_special_reads(compiler_context *ctx) static void mir_compute_interference( compiler_context *ctx, - struct ra_graph *g) + struct lcra_state *l) { /* First, we need liveness information to be computed per block */ mir_compute_liveness(ctx); + /* We need to force r1.w live throughout a blend shader */ + + if (ctx->is_blend) { + unsigned r1w = ~0; + + mir_foreach_block(ctx, _block) { + midgard_block *block = (midgard_block *) _block; + mir_foreach_instr_in_block_rev(block, ins) { + if (ins->writeout) + r1w = ins->dest; + } + + if (r1w != ~0) + break; + } + + mir_foreach_instr_global(ctx, ins) { + if (ins->dest < ctx->temp_count) + lcra_add_node_interference(l, ins->dest, mir_bytemask(ins), r1w, 0xF); + } + } + /* Now that every block has live_in/live_out computed, we can determine * interference by walking each block linearly. Take live_out at the * end of each block and walk the block backwards. */ - mir_foreach_block(ctx, blk) { - uint8_t *live = mem_dup(blk->live_out, ctx->temp_count * sizeof(uint8_t)); + mir_foreach_block(ctx, _blk) { + midgard_block *blk = (midgard_block *) _blk; + uint16_t *live = mem_dup(_blk->live_out, ctx->temp_count * sizeof(uint16_t)); mir_foreach_instr_in_block_rev(blk, ins) { /* Mark all registers live after the instruction as @@ -564,8 +356,10 @@ mir_compute_interference( if (dest < ctx->temp_count) { for (unsigned i = 0; i < ctx->temp_count; ++i) - if (live[i]) - ra_add_node_interference(g, dest, i); + if (live[i]) { + unsigned mask = mir_bytemask(ins); + lcra_add_node_interference(l, dest, mask, i, live[i]); + } } /* Update live_in */ @@ -576,50 +370,151 @@ mir_compute_interference( } } +static bool +mir_is_64(midgard_instruction *ins) +{ + if (nir_alu_type_get_type_size(ins->dest_type) == 64) + return true; + + mir_foreach_src(ins, v) { + if (nir_alu_type_get_type_size(ins->src_types[v]) == 64) + return true; + } + + return false; +} + /* This routine performs the actual register allocation. 
It should be succeeded * by install_registers */ -struct ra_graph * +static struct lcra_state * allocate_registers(compiler_context *ctx, bool *spilled) { /* The number of vec4 work registers available depends on when the - * uniforms start, so compute that first */ - int work_count = 16 - MAX2((ctx->uniform_cutoff - 8), 0); - unsigned *classes = NULL; - struct ra_regs *regs = get_register_set(ctx->screen, work_count, &classes); - - assert(regs != NULL); - assert(classes != NULL); + * uniforms start and the shader stage. By ABI we limit blend shaders + * to 8 registers, should be lower XXX */ + int work_count = ctx->is_blend ? 8 : + 16 - MAX2((ctx->uniform_cutoff - 8), 0); /* No register allocation to do with no SSA */ if (!ctx->temp_count) return NULL; - /* Let's actually do register allocation */ - int nodes = ctx->temp_count; - struct ra_graph *g = ra_alloc_interference_graph(regs, nodes); - - /* Register class (as known to the Mesa register allocator) is actually - * the product of both semantic class (work, load/store, texture..) and - * size (vec2/vec3..). First, we'll go through and determine the - * minimum size needed to hold values */ + /* Initialize LCRA. Allocate an extra node at the end for a precoloured + * r1 for interference */ + + struct lcra_state *l = lcra_alloc_equations(ctx->temp_count + 1, 5); + unsigned node_r1 = ctx->temp_count; + + /* Starts of classes, in bytes */ + l->class_start[REG_CLASS_WORK] = 16 * 0; + l->class_start[REG_CLASS_LDST] = 16 * 26; + l->class_start[REG_CLASS_TEXR] = 16 * 28; + l->class_start[REG_CLASS_TEXW] = 16 * 28; + + l->class_size[REG_CLASS_WORK] = 16 * work_count; + l->class_size[REG_CLASS_LDST] = 16 * 2; + l->class_size[REG_CLASS_TEXR] = 16 * 2; + l->class_size[REG_CLASS_TEXW] = 16 * 2; + + lcra_set_disjoint_class(l, REG_CLASS_TEXR, REG_CLASS_TEXW); + + /* To save space on T*20, we don't have real texture registers. + * Instead, tex inputs reuse the load/store pipeline registers, and + * tex outputs use work r0/r1. Note we still use TEXR/TEXW classes, + * noting that this handles interferences and sizes correctly. */ + + if (ctx->quirks & MIDGARD_INTERPIPE_REG_ALIASING) { + l->class_start[REG_CLASS_TEXR] = l->class_start[REG_CLASS_LDST]; + l->class_start[REG_CLASS_TEXW] = l->class_start[REG_CLASS_WORK]; + } unsigned *found_class = calloc(sizeof(unsigned), ctx->temp_count); + unsigned *min_alignment = calloc(sizeof(unsigned), ctx->temp_count); + unsigned *min_bound = calloc(sizeof(unsigned), ctx->temp_count); mir_foreach_instr_global(ctx, ins) { + /* Swizzles of 32-bit sources on 64-bit instructions need to be + * aligned to either bottom (xy) or top (zw). More general + * swizzle lowering should happen prior to scheduling (TODO), + * but once we get RA we shouldn't disrupt this further. Align + * sources of 64-bit instructions. */ + + if (ins->type == TAG_ALU_4 && mir_is_64(ins)) { + mir_foreach_src(ins, v) { + unsigned s = ins->src[v]; + + if (s < ctx->temp_count) + min_alignment[s] = 3; + } + } + + if (ins->type == TAG_LOAD_STORE_4 && OP_HAS_ADDRESS(ins->op)) { + mir_foreach_src(ins, v) { + unsigned s = ins->src[v]; + unsigned size = nir_alu_type_get_type_size(ins->src_types[v]); + + if (s < ctx->temp_count) + min_alignment[s] = (size == 64) ? 
3 : 2; + } + } + if (ins->dest >= SSA_FIXED_MINIMUM) continue; + unsigned size = nir_alu_type_get_type_size(ins->dest_type); + + if (ins->is_pack) + size = 32; + /* 0 for x, 1 for xy, 2 for xyz, 3 for xyzw */ - int class = util_logbase2(ins->mask); + int comps1 = util_logbase2(ins->mask); + + int bytes = (comps1 + 1) * (size / 8); /* Use the largest class if there's ambiguity, this * handles partial writes */ int dest = ins->dest; - found_class[dest] = MAX2(found_class[dest], class); + found_class[dest] = MAX2(found_class[dest], bytes); + + min_alignment[dest] = + (size == 16) ? 1 : /* (1 << 1) = 2-byte */ + (size == 32) ? 2 : /* (1 << 2) = 4-byte */ + (size == 64) ? 3 : /* (1 << 3) = 8-byte */ + 3; /* 8-bit todo */ + + /* We can't cross xy/zw boundaries. TODO: vec8 can */ + if (size == 16) + min_bound[dest] = 8; + + /* We don't have a swizzle for the conditional and we don't + * want to muck with the conditional itself, so just force + * alignment for now */ + + if (ins->type == TAG_ALU_4 && OP_IS_CSEL_V(ins->op)) { + min_alignment[dest] = 4; /* 1 << 4= 16-byte = vec4 */ + + /* LCRA assumes bound >= alignment */ + min_bound[dest] = 16; + } + + /* Since ld/st swizzles and masks are 32-bit only, we need them + * aligned to enable final packing */ + if (ins->type == TAG_LOAD_STORE_4) + min_alignment[dest] = MAX2(min_alignment[dest], 2); } + for (unsigned i = 0; i < ctx->temp_count; ++i) { + lcra_set_alignment(l, i, min_alignment[i] ? min_alignment[i] : 2, + min_bound[i] ? min_bound[i] : 16); + lcra_restrict_range(l, i, found_class[i]); + } + + free(found_class); + free(min_alignment); + free(min_bound); + /* Next, we'll determine semantic class. We default to zero (work). * But, if we're used with a special operation, that will force us to a * particular class. 
Each node must be assigned to exactly one class; a @@ -631,59 +526,126 @@ allocate_registers(compiler_context *ctx, bool *spilled) /* Check if this operation imposes any classes */ if (ins->type == TAG_LOAD_STORE_4) { - bool force_vec4_only = OP_IS_VEC4_ONLY(ins->load_store.op); - - set_class(found_class, ins->src[0], REG_CLASS_LDST); - set_class(found_class, ins->src[1], REG_CLASS_LDST); - set_class(found_class, ins->src[2], REG_CLASS_LDST); - - if (force_vec4_only) { - force_vec4(found_class, ins->dest); - force_vec4(found_class, ins->src[0]); - force_vec4(found_class, ins->src[1]); - force_vec4(found_class, ins->src[2]); + set_class(l->class, ins->src[0], REG_CLASS_LDST); + set_class(l->class, ins->src[1], REG_CLASS_LDST); + set_class(l->class, ins->src[2], REG_CLASS_LDST); + + if (OP_IS_VEC4_ONLY(ins->op)) { + lcra_restrict_range(l, ins->dest, 16); + lcra_restrict_range(l, ins->src[0], 16); + lcra_restrict_range(l, ins->src[1], 16); + lcra_restrict_range(l, ins->src[2], 16); } } else if (ins->type == TAG_TEXTURE_4) { - set_class(found_class, ins->dest, REG_CLASS_TEXW); - set_class(found_class, ins->src[0], REG_CLASS_TEXR); - set_class(found_class, ins->src[1], REG_CLASS_TEXR); - set_class(found_class, ins->src[2], REG_CLASS_TEXR); + set_class(l->class, ins->dest, REG_CLASS_TEXW); + set_class(l->class, ins->src[0], REG_CLASS_TEXR); + set_class(l->class, ins->src[1], REG_CLASS_TEXR); + set_class(l->class, ins->src[2], REG_CLASS_TEXR); + set_class(l->class, ins->src[3], REG_CLASS_TEXR); } } /* Check that the semantics of the class are respected */ mir_foreach_instr_global(ctx, ins) { - assert(check_write_class(found_class, ins->type, ins->dest)); - assert(check_read_class(found_class, ins->type, ins->src[0])); - assert(check_read_class(found_class, ins->type, ins->src[1])); - assert(check_read_class(found_class, ins->type, ins->src[2])); + assert(check_write_class(l->class, ins->type, ins->dest)); + assert(check_read_class(l->class, ins->type, ins->src[0])); + assert(check_read_class(l->class, ins->type, ins->src[1])); + assert(check_read_class(l->class, ins->type, ins->src[2])); } - /* Mark writeout to r0 */ + /* Mark writeout to r0, depth to r1.x, stencil to r1.y, + * render target to r1.z, unknown to r1.w */ mir_foreach_instr_global(ctx, ins) { - if (ins->compact_branch && ins->writeout) - set_class(found_class, ins->src[0], REG_CLASS_FRAGC); + if (!(ins->compact_branch && ins->writeout)) continue; + + if (ins->src[0] < ctx->temp_count) + l->solutions[ins->src[0]] = 0; + + if (ins->src[2] < ctx->temp_count) + l->solutions[ins->src[2]] = (16 * 1) + COMPONENT_X * 4; + + if (ins->src[3] < ctx->temp_count) + l->solutions[ins->src[3]] = (16 * 1) + COMPONENT_Y * 4; + + if (ins->src[1] < ctx->temp_count) + l->solutions[ins->src[1]] = (16 * 1) + COMPONENT_Z * 4; + + if (ins->dest < ctx->temp_count) + l->solutions[ins->dest] = (16 * 1) + COMPONENT_W * 4; } - for (unsigned i = 0; i < ctx->temp_count; ++i) { - unsigned class = found_class[i]; - ra_set_node_class(g, i, classes[class]); + /* Destinations of instructions in a writeout block cannot be assigned + * to r1 unless they are actually used as r1 from the writeout itself, + * since the writes to r1 are special. A code sequence like: + * + * sadd.fmov r1.x, [...] + * vadd.fadd r0, r1, r2 + * [writeout branch] + * + * will misbehave since the r1.x write will be interpreted as a + * gl_FragDepth write so it won't show up correctly when r1 is read in + * the following segment. We model this as interference. 
+ */ + + l->solutions[node_r1] = (16 * 1); + + mir_foreach_block(ctx, _blk) { + midgard_block *blk = (midgard_block *) _blk; + + mir_foreach_bundle_in_block(blk, v) { + /* We need at least a writeout and nonwriteout instruction */ + if (v->instruction_count < 2) + continue; + + /* Branches always come at the end */ + midgard_instruction *br = v->instructions[v->instruction_count - 1]; + + if (!br->writeout) + continue; + + for (signed i = v->instruction_count - 2; i >= 0; --i) { + midgard_instruction *ins = v->instructions[i]; + + if (ins->dest >= ctx->temp_count) + continue; + + bool used_as_r1 = (br->dest == ins->dest); + + mir_foreach_src(br, s) + used_as_r1 |= (s > 0) && (br->src[s] == ins->dest); + + if (!used_as_r1) + lcra_add_node_interference(l, ins->dest, mir_bytemask(ins), node_r1, 0xFFFF); + } + } } - mir_compute_interference(ctx, g); + /* Precolour blend input to r0. Note writeout is necessarily at the end + * and blend shaders are single-RT only so there is only a single + * writeout block, so this cannot conflict with the writeout r0 (there + * is no need to have an intermediate move) */ - if (!ra_allocate(g)) { - *spilled = true; - } else { - *spilled = false; + if (ctx->blend_input != ~0) { + assert(ctx->blend_input < ctx->temp_count); + l->solutions[ctx->blend_input] = 0; } - /* Whether we were successful or not, report the graph so we can - * compute spill nodes */ + /* Same for the dual-source blend input/output, except here we use r2, + * which is also set in the fragment shader. */ - return g; + if (ctx->blend_src1 != ~0) { + assert(ctx->blend_src1 < ctx->temp_count); + l->solutions[ctx->blend_src1] = (16 * 2); + ctx->work_registers = MAX2(ctx->work_registers, 2); + } + + mir_compute_interference(ctx, l); + + *spilled = !lcra_solve(l); + return l; } + /* Once registers have been decided via register allocation * (allocate_registers), we need to rewrite the MIR to use registers instead of * indices */ @@ -691,9 +653,19 @@ allocate_registers(compiler_context *ctx, bool *spilled) static void install_registers_instr( compiler_context *ctx, - struct ra_graph *g, + struct lcra_state *l, midgard_instruction *ins) { + unsigned src_shift[MIR_SRC_COUNT]; + + for (unsigned i = 0; i < MIR_SRC_COUNT; ++i) { + src_shift[i] = + util_logbase2(nir_alu_type_get_type_size(ins->src_types[i]) / 8); + } + + unsigned dest_shift = + util_logbase2(nir_alu_type_get_type_size(ins->dest_type) / 8); + switch (ins->type) { case TAG_ALU_4: case TAG_ALU_8: @@ -702,48 +674,25 @@ install_registers_instr( if (ins->compact_branch) return; - struct phys_reg src1 = index_to_reg(ctx, g, ins->src[0]); - struct phys_reg src2 = index_to_reg(ctx, g, ins->src[1]); - struct phys_reg dest = index_to_reg(ctx, g, ins->dest); - - unsigned uncomposed_mask = ins->mask; - ins->mask = compose_writemask(uncomposed_mask, dest); - - /* Adjust the dest mask if necessary. Mostly this is a no-op - * but it matters for dot products */ - dest.mask = effective_writemask(&ins->alu, ins->mask); - - midgard_vector_alu_src mod1 = - vector_alu_from_unsigned(ins->alu.src1); - mod1.swizzle = compose_swizzle(mod1.swizzle, uncomposed_mask, src1, dest); - ins->alu.src1 = vector_alu_srco_unsigned(mod1); - - ins->registers.src1_reg = src1.reg; - - ins->registers.src2_imm = ins->has_inline_constant; - - if (ins->has_inline_constant) { - /* Encode inline 16-bit constant. 
See disassembler for - * where the algorithm is from */ - - ins->registers.src2_reg = ins->inline_constant >> 11; - - int lower_11 = ins->inline_constant & ((1 << 12) - 1); - uint16_t imm = ((lower_11 >> 8) & 0x7) | - ((lower_11 & 0xFF) << 3); - - ins->alu.src2 = imm << 2; - } else { - midgard_vector_alu_src mod2 = - vector_alu_from_unsigned(ins->alu.src2); - mod2.swizzle = compose_swizzle( - mod2.swizzle, uncomposed_mask, src2, dest); - ins->alu.src2 = vector_alu_srco_unsigned(mod2); - - ins->registers.src2_reg = src2.reg; - } - - ins->registers.out_reg = dest.reg; + struct phys_reg src1 = index_to_reg(ctx, l, ins->src[0], src_shift[0]); + struct phys_reg src2 = index_to_reg(ctx, l, ins->src[1], src_shift[1]); + struct phys_reg dest = index_to_reg(ctx, l, ins->dest, dest_shift); + + mir_set_bytemask(ins, mir_bytemask(ins) << dest.offset); + + unsigned dest_offset = + GET_CHANNEL_COUNT(alu_opcode_props[ins->op].props) ? 0 : + dest.offset; + + offset_swizzle(ins->swizzle[0], src1.offset, src1.shift, dest.shift, dest_offset); + if (!ins->has_inline_constant) + offset_swizzle(ins->swizzle[1], src2.offset, src2.shift, dest.shift, dest_offset); + if (ins->src[0] != ~0) + ins->src[0] = SSA_FIXED_REGISTER(src1.reg); + if (ins->src[1] != ~0) + ins->src[1] = SSA_FIXED_REGISTER(src2.reg); + if (ins->dest != ~0) + ins->dest = SSA_FIXED_REGISTER(dest.reg); break; } @@ -752,99 +701,80 @@ install_registers_instr( * whether we are loading or storing -- think about the * logical dataflow */ - bool encodes_src = OP_IS_STORE(ins->load_store.op); + bool encodes_src = OP_IS_STORE(ins->op); if (encodes_src) { - struct phys_reg src = index_to_reg(ctx, g, ins->src[0]); + struct phys_reg src = index_to_reg(ctx, l, ins->src[0], src_shift[0]); assert(src.reg == 26 || src.reg == 27); - ins->load_store.reg = src.reg - 26; - - unsigned shift = __builtin_ctz(src.mask); - unsigned adjusted_mask = src.mask >> shift; - assert(((adjusted_mask + 1) & adjusted_mask) == 0); - - unsigned new_swizzle = 0; - for (unsigned q = 0; q < 4; ++q) { - unsigned c = (ins->load_store.swizzle >> (2*q)) & 3; - new_swizzle |= (c + shift) << (2*q); - } - - ins->load_store.swizzle = compose_swizzle( - new_swizzle, src.mask, - default_phys_reg(0), src); + ins->src[0] = SSA_FIXED_REGISTER(src.reg); + offset_swizzle(ins->swizzle[0], src.offset, src.shift, 0, 0); } else { - struct phys_reg src = index_to_reg(ctx, g, ins->dest); + struct phys_reg dst = index_to_reg(ctx, l, ins->dest, dest_shift); - ins->load_store.reg = src.reg; - - ins->load_store.swizzle = compose_swizzle( - ins->load_store.swizzle, 0xF, - default_phys_reg(0), src); - - ins->mask = compose_writemask( - ins->mask, src); + ins->dest = SSA_FIXED_REGISTER(dst.reg); + offset_swizzle(ins->swizzle[0], 0, 2, 2, dst.offset); + mir_set_bytemask(ins, mir_bytemask(ins) << dst.offset); } /* We also follow up by actual arguments */ - int src2 = - encodes_src ? ins->src[1] : ins->src[0]; + unsigned src2 = ins->src[1]; + unsigned src3 = ins->src[2]; - int src3 = - encodes_src ? 
ins->src[2] : ins->src[1]; - - if (src2 >= 0) { - struct phys_reg src = index_to_reg(ctx, g, src2); - unsigned component = __builtin_ctz(src.mask); - ins->load_store.arg_1 |= midgard_ldst_reg(src.reg, component); + if (src2 != ~0) { + struct phys_reg src = index_to_reg(ctx, l, src2, 2); + unsigned component = src.offset >> src.shift; + assert(component << src.shift == src.offset); + ins->src[1] = SSA_FIXED_REGISTER(src.reg); + ins->swizzle[1][0] = component; } - if (src3 >= 0) { - struct phys_reg src = index_to_reg(ctx, g, src3); - unsigned component = __builtin_ctz(src.mask); - ins->load_store.arg_2 |= midgard_ldst_reg(src.reg, component); + if (src3 != ~0) { + struct phys_reg src = index_to_reg(ctx, l, src3, 2); + unsigned component = src.offset >> src.shift; + assert(component << src.shift == src.offset); + ins->src[2] = SSA_FIXED_REGISTER(src.reg); + ins->swizzle[2][0] = component; } break; } case TAG_TEXTURE_4: { - /* Grab RA results */ - struct phys_reg dest = index_to_reg(ctx, g, ins->dest); - struct phys_reg coord = index_to_reg(ctx, g, ins->src[0]); - struct phys_reg lod = index_to_reg(ctx, g, ins->src[1]); + if (ins->op == TEXTURE_OP_BARRIER) + break; - assert(dest.reg == 28 || dest.reg == 29); - assert(coord.reg == 28 || coord.reg == 29); + /* Grab RA results */ + struct phys_reg dest = index_to_reg(ctx, l, ins->dest, dest_shift); + struct phys_reg coord = index_to_reg(ctx, l, ins->src[1], src_shift[1]); + struct phys_reg lod = index_to_reg(ctx, l, ins->src[2], src_shift[2]); + struct phys_reg offset = index_to_reg(ctx, l, ins->src[3], src_shift[3]); /* First, install the texture coordinate */ - ins->texture.in_reg_full = 1; - ins->texture.in_reg_upper = 0; - ins->texture.in_reg_select = coord.reg - 28; - ins->texture.in_reg_swizzle = - compose_swizzle(ins->texture.in_reg_swizzle, 0xF, coord, dest); + if (ins->src[1] != ~0) + ins->src[1] = SSA_FIXED_REGISTER(coord.reg); + offset_swizzle(ins->swizzle[1], coord.offset, coord.shift, dest.shift, 0); /* Next, install the destination */ - ins->texture.out_full = 1; - ins->texture.out_upper = 0; - ins->texture.out_reg_select = dest.reg - 28; - ins->texture.swizzle = - compose_swizzle(ins->texture.swizzle, dest.mask, dest, dest); - ins->mask = - compose_writemask(ins->mask, dest); + if (ins->dest != ~0) + ins->dest = SSA_FIXED_REGISTER(dest.reg); + offset_swizzle(ins->swizzle[0], 0, 2, dest.shift, + dest_shift == 1 ? 
dest.offset % 8 : + dest.offset); + mir_set_bytemask(ins, mir_bytemask(ins) << dest.offset); /* If there is a register LOD/bias, use it */ - if (ins->src[1] != ~0) { - midgard_tex_register_select sel = { - .select = lod.reg, - .full = 1, - .component = lod.swizzle & 3, - }; - - uint8_t packed; - memcpy(&packed, &sel, sizeof(packed)); - ins->texture.bias = packed; + if (ins->src[2] != ~0) { + assert(!(lod.offset & 3)); + ins->src[2] = SSA_FIXED_REGISTER(lod.reg); + ins->swizzle[2][0] = lod.offset / 4; + } + + /* If there is an offset register, install it */ + if (ins->src[3] != ~0) { + ins->src[3] = SSA_FIXED_REGISTER(offset.reg); + ins->swizzle[3][0] = offset.offset / 4; } break; @@ -855,9 +785,224 @@ install_registers_instr( } } -void -install_registers(compiler_context *ctx, struct ra_graph *g) +static void +install_registers(compiler_context *ctx, struct lcra_state *l) { mir_foreach_instr_global(ctx, ins) - install_registers_instr(ctx, g, ins); + install_registers_instr(ctx, l, ins); +} + + +/* If register allocation fails, find the best spill node */ + +static signed +mir_choose_spill_node( + compiler_context *ctx, + struct lcra_state *l) +{ + /* We can't spill a previously spilled value or an unspill */ + + mir_foreach_instr_global(ctx, ins) { + if (ins->no_spill & (1 << l->spill_class)) { + lcra_set_node_spill_cost(l, ins->dest, -1); + + if (l->spill_class != REG_CLASS_WORK) { + mir_foreach_src(ins, s) + lcra_set_node_spill_cost(l, ins->src[s], -1); + } + } + } + + return lcra_get_best_spill_node(l); +} + +/* Once we've chosen a spill node, spill it */ + +static void +mir_spill_register( + compiler_context *ctx, + unsigned spill_node, + unsigned spill_class, + unsigned *spill_count) +{ + if (spill_class == REG_CLASS_WORK && ctx->is_blend) + unreachable("Blend shader spilling is currently unimplemented"); + + unsigned spill_index = ctx->temp_count; + + /* We have a spill node, so check the class. Work registers + * legitimately spill to TLS, but special registers just spill to work + * registers */ + + bool is_special = spill_class != REG_CLASS_WORK; + bool is_special_w = spill_class == REG_CLASS_TEXW; + + /* Allocate TLS slot (maybe) */ + unsigned spill_slot = !is_special ? (*spill_count)++ : 0; + + /* For TLS, replace all stores to the spilled node. For + * special reads, just keep as-is; the class will be demoted + * implicitly. 
For special writes, spill to a work register */ + + if (!is_special || is_special_w) { + if (is_special_w) + spill_slot = spill_index++; + + mir_foreach_block(ctx, _block) { + midgard_block *block = (midgard_block *) _block; + mir_foreach_instr_in_block_safe(block, ins) { + if (ins->dest != spill_node) continue; + + midgard_instruction st; + + /* Note: it's important to match the mask of the spill + * with the mask of the instruction whose destination + * we're spilling, or otherwise we'll read invalid + * components and can fail RA in a subsequent iteration + */ + + if (is_special_w) { + st = v_mov(spill_node, spill_slot); + st.no_spill |= (1 << spill_class); + st.mask = ins->mask; + st.dest_type = st.src_types[0] = ins->dest_type; + } else { + ins->dest = spill_index++; + ins->no_spill |= (1 << spill_class); + st = v_load_store_scratch(ins->dest, spill_slot, true, ins->mask); + } + + /* Hint: don't rewrite this node */ + st.hint = true; + + mir_insert_instruction_after_scheduled(ctx, block, ins, st); + + if (!is_special) + ctx->spills++; + } + } + } + + /* For special reads, figure out how many bytes we need */ + unsigned read_bytemask = 0; + + mir_foreach_instr_global_safe(ctx, ins) { + read_bytemask |= mir_bytemask_of_read_components(ins, spill_node); + } + + /* Insert a load from TLS before the first consecutive + * use of the node, rewriting to use spilled indices to + * break up the live range. Or, for special, insert a + * move. Ironically the latter *increases* register + * pressure, but the two uses of the spilling mechanism + * are somewhat orthogonal. (special spilling is to use + * work registers to back special registers; TLS + * spilling is to use memory to back work registers) */ + + mir_foreach_block(ctx, _block) { + midgard_block *block = (midgard_block *) _block; + mir_foreach_instr_in_block(block, ins) { + /* We can't rewrite the moves used to spill in the + * first place. These moves are hinted. 
*/ + if (ins->hint) continue; + + /* If we don't use the spilled value, nothing to do */ + if (!mir_has_arg(ins, spill_node)) continue; + + unsigned index = 0; + + if (!is_special_w) { + index = ++spill_index; + + midgard_instruction *before = ins; + midgard_instruction st; + + if (is_special) { + /* Move */ + st = v_mov(spill_node, index); + st.no_spill |= (1 << spill_class); + } else { + /* TLS load */ + st = v_load_store_scratch(index, spill_slot, false, 0xF); + } + + /* Mask the load based on the component count + * actually needed to prevent RA loops */ + + st.mask = mir_from_bytemask(mir_round_bytemask_up( + read_bytemask, 32), 32); + + mir_insert_instruction_before_scheduled(ctx, block, before, st); + } else { + /* Special writes already have their move spilled in */ + index = spill_slot; + } + + + /* Rewrite to use */ + mir_rewrite_index_src_single(ins, spill_node, index); + + if (!is_special) + ctx->fills++; + } + } + + /* Reset hints */ + + mir_foreach_instr_global(ctx, ins) { + ins->hint = false; + } +} + +/* Run register allocation in a loop, spilling until we succeed */ + +void +mir_ra(compiler_context *ctx) +{ + struct lcra_state *l = NULL; + bool spilled = false; + int iter_count = 1000; /* max iterations */ + + /* Number of 128-bit slots in memory we've spilled into */ + unsigned spill_count = 0; + + + mir_create_pipeline_registers(ctx); + + do { + if (spilled) { + signed spill_node = mir_choose_spill_node(ctx, l); + + if (spill_node == -1) { + fprintf(stderr, "ERROR: Failed to choose spill node\n"); + return; + } + + mir_spill_register(ctx, spill_node, l->spill_class, &spill_count); + } + + mir_squeeze_index(ctx); + mir_invalidate_liveness(ctx); + + if (l) { + lcra_free(l); + l = NULL; + } + + l = allocate_registers(ctx, &spilled); + } while(spilled && ((iter_count--) > 0)); + + if (iter_count <= 0) { + fprintf(stderr, "panfrost: Gave up allocating registers, rendering will be incomplete\n"); + assert(0); + } + + /* Report spilling information. spill_count is in 128-bit slots (vec4 x + * fp32), but tls_size is in bytes, so multiply by 16 */ + + ctx->tls_size = spill_count * 16; + + install_registers(ctx, l); + + lcra_free(l); }
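
The hunks above replace the graph-colouring allocator and its WORK_STRIDE subdivision classes with LCRA, whose solutions are plain byte addresses into the register file. As a rough standalone sketch (not driver code; the demo_* names are hypothetical and only the 16-byte register width is taken from the diff), the program below shows how such a solution decomposes into the (reg, offset, shift) triple of struct phys_reg and how offset_swizzle() slides a component swizzle by that byte offset:

/* Standalone illustration only -- mirrors the mapping used by index_to_reg()
 * and offset_swizzle() in this diff, with everything else made up. */

#include <assert.h>
#include <stdio.h>

#define DEMO_VEC_COMPONENTS 16

struct demo_phys_reg {
        unsigned reg;    /* physical register, 0-31 */
        unsigned offset; /* byte offset into the 128-bit register, 0-15 */
        unsigned shift;  /* log2(bytes per component) */
};

/* An LCRA solution is a byte address into the register file: divide by the
 * 16-byte register width to get the register, keep the remainder as the
 * byte offset within it. */
static struct demo_phys_reg
demo_solution_to_reg(unsigned solution, unsigned shift)
{
        struct demo_phys_reg r = {
                .reg = solution / 16,
                .offset = solution & 0xF,
                .shift = shift,
        };

        return r;
}

/* Same idea as offset_swizzle(): convert byte offsets to component offsets
 * and slide the swizzle accordingly, clamping at the last valid component. */
static void
demo_offset_swizzle(unsigned *swizzle, unsigned reg_offset, unsigned srcshift,
                    unsigned dstshift, unsigned dst_offset)
{
        unsigned out[DEMO_VEC_COMPONENTS];

        unsigned reg_comp = reg_offset >> srcshift;
        unsigned dst_comp = dst_offset >> dstshift;
        unsigned max_component = (16 >> srcshift) - 1;

        for (unsigned c = 0; c < DEMO_VEC_COMPONENTS; ++c) {
                unsigned comp = (c > dst_comp) ? (c - dst_comp) : 0;
                unsigned s = swizzle[comp] + reg_comp;
                out[c] = (s > max_component) ? max_component : s;
        }

        for (unsigned c = 0; c < DEMO_VEC_COMPONENTS; ++c)
                swizzle[c] = out[c];
}

int
main(void)
{
        /* A 32-bit (shift = 2) value whose solution is byte 24 lands in r1
         * at byte offset 8, i.e. components zw. */
        struct demo_phys_reg r = demo_solution_to_reg(24, 2);
        assert(r.reg == 1 && r.offset == 8);

        /* An identity .xyzw swizzle of that value becomes .zwww once the
         * byte offset is folded in (clamped at component w). */
        unsigned swz[DEMO_VEC_COMPONENTS] = { 0, 1, 2, 3 };
        demo_offset_swizzle(swz, r.offset, r.shift, 2, 0);
        printf("%u %u %u %u\n", swz[0], swz[1], swz[2], swz[3]); /* 2 3 3 3 */

        return 0;
}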