#include "util/ralloc.h"
#include "util/register_allocate.h"
+#include "common/v3d_device_info.h"
#include "v3d_compiler.h"
#define QPU_R(i) { .magic = false, .index = i }
#define ACC_INDEX 0
-#define ACC_COUNT 5
+#define ACC_COUNT 6
#define PHYS_INDEX (ACC_INDEX + ACC_COUNT)
#define PHYS_COUNT 64
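+/* Returns true if the instruction writes one of the TMU's magic register
+ * addresses, i.e. it is part of a TMU setup sequence.
+ */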
+static inline bool
+qinst_writes_tmu(struct qinst *inst)
+{
+ return (inst->dst.file == QFILE_MAGIC &&
+ v3d_qpu_magic_waddr_is_tmu(inst->dst.index));
+}
+
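+/* Scans forward from a ldtmu to see whether it's the last ldtmu of its TMU
+ * operation: hitting another ldtmu before the next TMU setup means results
+ * are still outstanding.
+ */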
+static bool
+is_last_ldtmu(struct qinst *inst, struct qblock *block)
+{
+ list_for_each_entry_from(struct qinst, scan_inst, inst->link.next,
+ &block->instructions, link) {
+ if (scan_inst->qpu.sig.ldtmu)
+ return false;
+ if (qinst_writes_tmu(scan_inst))
+ return true;
+ }
+
+ return true;
+}
+
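+/* A temp defined by a ldunif can be rematerialized by re-emitting the
+ * uniform load at each use, which is much cheaper than a TMU spill/fill.
+ */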
+static bool
+vir_is_mov_uniform(struct v3d_compile *c, int temp)
+{
+ struct qinst *def = c->defs[temp];
+
+ return def && def->qpu.sig.ldunif;
+}
+
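+/* Computes spill costs for all spillable temps and returns RA's choice of
+ * the best node to spill, or -1 if nothing can be spilled.
+ */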
+static int
+v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g,
+ uint32_t *temp_to_node)
+{
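+ /* Uniforms respill as a single ldunif, while anything else costs a TMU
+ * store per def and a TMU fill per use, so non-uniform costs are scaled
+ * up, making uniform temps the preferred spill candidates.
+ */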
+ const float tmu_scale = 5;
+ float block_scale = 1.0;
+ float spill_costs[c->num_temps];
+ bool in_tmu_operation = false;
+ bool started_last_seg = false;
+
+ for (unsigned i = 0; i < c->num_temps; i++)
+ spill_costs[i] = 0.0;
+
+ /* XXX: Scale the cost up when inside of a loop. */
+ vir_for_each_block(block, c) {
+ vir_for_each_inst(inst, block) {
+ /* We can't insert a new TMU operation while currently
+ * in a TMU operation, and we can't insert new thread
+ * switches after starting output writes.
+ */
+ bool no_spilling =
+ (in_tmu_operation ||
+ (c->threads > 1 && started_last_seg));
+
+ for (int i = 0; i < vir_get_nsrc(inst); i++) {
+ if (inst->src[i].file != QFILE_TEMP)
+ continue;
+
+ int temp = inst->src[i].index;
+ if (vir_is_mov_uniform(c, temp)) {
+ spill_costs[temp] += block_scale;
+ } else if (!no_spilling) {
+ spill_costs[temp] += (block_scale *
+ tmu_scale);
+ } else {
+ BITSET_CLEAR(c->spillable, temp);
+ }
+ }
+
+ if (inst->dst.file == QFILE_TEMP) {
+ int temp = inst->dst.index;
+
+ if (vir_is_mov_uniform(c, temp)) {
+ /* We just rematerialize the uniform
+ * later.
+ */
+ } else if (!no_spilling) {
+ spill_costs[temp] += (block_scale *
+ tmu_scale);
+ } else {
+ BITSET_CLEAR(c->spillable, temp);
+ }
+ }
+
+ /* Refuse to spill a ldvary's dst, because that means
+ * that ldvary's r5 would end up being used across a
+ * thrsw.
+ */
+ if (inst->qpu.sig.ldvary) {
+ assert(inst->dst.file == QFILE_TEMP);
+ BITSET_CLEAR(c->spillable, inst->dst.index);
+ }
+
+ if (inst->is_last_thrsw)
+ started_last_seg = true;
+
+ if (v3d_qpu_writes_vpm(&inst->qpu) ||
+ v3d_qpu_uses_tlb(&inst->qpu))
+ started_last_seg = true;
+
+ /* Track when we're in between a TMU setup and the
+ * final LDTMU or TMUWT from that TMU setup. We can't
+ * spill/fill any temps during that time, because that
+ * involves inserting a new TMU setup/LDTMU sequence.
+ */
+ if (inst->qpu.sig.ldtmu &&
+ is_last_ldtmu(inst, block))
+ in_tmu_operation = false;
+
+ if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
+ inst->qpu.alu.add.op == V3D_QPU_A_TMUWT)
+ in_tmu_operation = false;
+
+ if (qinst_writes_tmu(inst))
+ in_tmu_operation = true;
+ }
+ }
+
+ for (unsigned i = 0; i < c->num_temps; i++) {
+ int node = temp_to_node[i];
+
+ if (BITSET_TEST(c->spillable, i))
+ ra_set_node_spill_cost(g, node, spill_costs[i]);
+ }
+
+ return ra_get_best_spill_node(g);
+}
+
+/* The spill offset for this thread takes a bit of setup, so do it once at
+ * program start.
+ */
+void
+v3d_setup_spill_base(struct v3d_compile *c)
+{
+ c->cursor = vir_before_block(vir_entry_block(c));
+
+ int start_num_temps = c->num_temps;
+
+ /* Each thread wants to be in a separate region of the scratch space
+ * so that the QPUs aren't fighting over cache lines. We have the
+ * driver keep a single global spill BO rather than
+ * per-spilling-program BOs, so we need a uniform from the driver for
+ * what the per-thread scale is.
+ */
+ struct qreg thread_offset =
+ vir_UMUL(c,
+ vir_TIDX(c),
+ vir_uniform(c, QUNIFORM_SPILL_SIZE_PER_THREAD, 0));
+
+ /* Each channel in a reg is 4 bytes, so scale them up by that. */
+ struct qreg element_offset = vir_SHL(c, vir_EIDX(c),
+ vir_uniform_ui(c, 2));
+
+ c->spill_base = vir_ADD(c,
+ vir_ADD(c, thread_offset, element_offset),
+ vir_uniform(c, QUNIFORM_SPILL_OFFSET, 0));
+
+ /* Make sure that we don't spill the spilling setup instructions. */
+ for (int i = start_num_temps; i < c->num_temps; i++)
+ BITSET_CLEAR(c->spillable, i);
+
+ c->cursor = vir_after_block(c->cur_block);
+}
+
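+/* Emits the TMUA write that starts a TMU general access at this thread's
+ * spill base plus the temp's scratch offset.
+ */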
+static void
+v3d_emit_spill_tmua(struct v3d_compile *c, uint32_t spill_offset)
+{
+ vir_ADD_dest(c, vir_reg(QFILE_MAGIC,
+ V3D_QPU_WADDR_TMUA),
+ c->spill_base,
+ vir_uniform_ui(c, spill_offset));
+}
+
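+/* Rewrites every def and use of spill_temp: uniforms are rematerialized
+ * with a fresh uniform load at each use, while anything else gets a TMU
+ * store after each def and a TMU fill before each use.
+ */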
+static void
+v3d_spill_reg(struct v3d_compile *c, int spill_temp)
+{
+ bool is_uniform = vir_is_mov_uniform(c, spill_temp);
+
+ uint32_t spill_offset = 0;
+
+ if (!is_uniform) {
+ spill_offset = c->spill_size;
+ c->spill_size += V3D_CHANNELS * sizeof(uint32_t);
+
+ if (spill_offset == 0)
+ v3d_setup_spill_base(c);
+ }
+
+ struct qinst *last_thrsw = c->last_thrsw;
+ assert(!last_thrsw || last_thrsw->is_last_thrsw);
+
+ int start_num_temps = c->num_temps;
+
+ int uniform_index = ~0;
+ if (is_uniform) {
+ struct qinst *orig_unif = c->defs[spill_temp];
+ uniform_index = orig_unif->uniform;
+ }
+
+ vir_for_each_inst_inorder_safe(inst, c) {
+ for (int i = 0; i < vir_get_nsrc(inst); i++) {
+ if (inst->src[i].file != QFILE_TEMP ||
+ inst->src[i].index != spill_temp) {
+ continue;
+ }
+
+ c->cursor = vir_before_inst(inst);
+
+ if (is_uniform) {
+ struct qreg unif =
+ vir_uniform(c,
+ c->uniform_contents[uniform_index],
+ c->uniform_data[uniform_index]);
+ inst->src[i] = unif;
+ } else {
+ v3d_emit_spill_tmua(c, spill_offset);
+ vir_emit_thrsw(c);
+ inst->src[i] = vir_LDTMU(c);
+ c->fills++;
+ }
+ }
+
+ if (inst->dst.file == QFILE_TEMP &&
+ inst->dst.index == spill_temp) {
+ if (is_uniform) {
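+ /* Make sure the cursor isn't left
+ * pointing at the instruction we're
+ * about to remove.
+ */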
+ c->cursor.link = NULL;
+ vir_remove_instruction(c, inst);
+ } else {
+ c->cursor = vir_after_inst(inst);
+
+ inst->dst.index = c->num_temps++;
+ vir_MOV_dest(c, vir_reg(QFILE_MAGIC,
+ V3D_QPU_WADDR_TMUD),
+ inst->dst);
+ v3d_emit_spill_tmua(c, spill_offset);
+ vir_emit_thrsw(c);
+ vir_TMUWT(c);
+ c->spills++;
+ }
+ }
+
+ /* If we didn't have a last-thrsw inserted by nir_to_vir and
+ * we've been inserting thrsws, then insert a new last_thrsw
+ * right before we start the vpm/tlb sequence for the last
+ * thread segment.
+ */
+ if (!is_uniform && !last_thrsw && c->last_thrsw &&
+ (v3d_qpu_writes_vpm(&inst->qpu) ||
+ v3d_qpu_uses_tlb(&inst->qpu))) {
+ c->cursor = vir_before_inst(inst);
+ vir_emit_thrsw(c);
+
+ last_thrsw = c->last_thrsw;
+ last_thrsw->is_last_thrsw = true;
+ }
+ }
+
+ /* Make sure c->last_thrsw is the actual last thrsw, not just one we
+ * inserted in our most recent unspill.
+ */
+ if (last_thrsw)
+ c->last_thrsw = last_thrsw;
+
+ /* Don't allow spilling of our spilling instructions. There's no way
+ * they can help get things colored.
+ */
+ for (int i = start_num_temps; i < c->num_temps; i++)
+ BITSET_CLEAR(c->spillable, i);
+}
+
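+/* Round-robin state for register selection, so that consecutive temps land
+ * in different registers and post-RA instruction selection has more options.
+ */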
+struct v3d_ra_select_callback_data {
+ uint32_t next_acc;
+ uint32_t next_phys;
+};
+
+static unsigned int
+v3d_ra_select_callback(struct ra_graph *g, BITSET_WORD *regs, void *data)
+{
+ struct v3d_ra_select_callback_data *v3d_ra = data;
+ int r5 = ACC_INDEX + 5;
+
+ /* Choose r5 for our ldunifs if possible (nobody else can load to that
+ * reg, and it keeps the QPU cond field free from being occupied by
+ * ldunifrf).
+ */
+ if (BITSET_TEST(regs, r5))
+ return r5;
+
+ /* Choose an accumulator if possible (I think it's lower power than
+ * phys regs), but round-robin through them to give post-RA
+ * instruction selection more options.
+ */
+ for (int i = 0; i < ACC_COUNT; i++) {
+ int acc_off = (v3d_ra->next_acc + i) % ACC_COUNT;
+ int acc = ACC_INDEX + acc_off;
+
+ if (BITSET_TEST(regs, acc)) {
+ v3d_ra->next_acc = acc_off + 1;
+ return acc;
+ }
+ }
+
+ for (int i = 0; i < PHYS_COUNT; i++) {
+ int phys_off = (v3d_ra->next_phys + i) % PHYS_COUNT;
+ int phys = PHYS_INDEX + phys_off;
+
+ if (BITSET_TEST(regs, phys)) {
+ v3d_ra->next_phys = phys_off + 1;
+ return phys;
+ }
+ }
+
+ unreachable("RA must pass us at least one possible reg.");
+}
+
bool
vir_init_reg_sets(struct v3d_compiler *compiler)
{
+ /* Allocate up to 3 regfile classes, for the ways the physical
+ * register file can be divided up for fragment shader threading.
+ */
+ int max_thread_index = (compiler->devinfo->ver >= 40 ? 2 : 3);
+
compiler->regs = ra_alloc_reg_set(compiler, PHYS_INDEX + PHYS_COUNT,
true);
if (!compiler->regs)
return false;
- /* Allocate 3 regfile classes, for the ways the physical register file
- * can be divided up for fragment shader threading.
- */
- for (int threads = 0; threads < 3; threads++) {
- compiler->reg_class[threads] =
+ for (int threads = 0; threads < max_thread_index; threads++) {
+ compiler->reg_class_any[threads] =
+ ra_alloc_reg_class(compiler->regs);
+ compiler->reg_class_r5[threads] =
+ ra_alloc_reg_class(compiler->regs);
+ compiler->reg_class_phys_or_acc[threads] =
+ ra_alloc_reg_class(compiler->regs);
+ compiler->reg_class_phys[threads] =
ra_alloc_reg_class(compiler->regs);
for (int i = PHYS_INDEX;
i < PHYS_INDEX + (PHYS_COUNT >> threads); i++) {
ra_class_add_reg(compiler->regs,
- compiler->reg_class[threads], i);
+ compiler->reg_class_phys_or_acc[threads], i);
+ ra_class_add_reg(compiler->regs,
+ compiler->reg_class_phys[threads], i);
+ ra_class_add_reg(compiler->regs,
+ compiler->reg_class_any[threads], i);
}
- for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT; i++) {
+ for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT - 1; i++) {
+ ra_class_add_reg(compiler->regs,
+ compiler->reg_class_phys_or_acc[threads], i);
ra_class_add_reg(compiler->regs,
- compiler->reg_class[threads], i);
+ compiler->reg_class_any[threads], i);
}
+ /* r5 can only store a single 32-bit value, so not much can
+ * use it.
+ */
+ ra_class_add_reg(compiler->regs,
+ compiler->reg_class_r5[threads],
+ ACC_INDEX + 5);
+ ra_class_add_reg(compiler->regs,
+ compiler->reg_class_any[threads],
+ ACC_INDEX + 5);
}
ra_set_finalize(compiler->regs, NULL);

return true;
}
#define CLASS_BIT_PHYS (1 << 0)
-#define CLASS_BIT_R0_R2 (1 << 1)
-#define CLASS_BIT_R3 (1 << 2)
-#define CLASS_BIT_R4 (1 << 3)
+#define CLASS_BIT_ACC (1 << 1)
+#define CLASS_BIT_R5 (1 << 4)
+#define CLASS_BITS_ANY (CLASS_BIT_PHYS | \
+ CLASS_BIT_ACC | \
+ CLASS_BIT_R5)
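+
+/* A temp's class bits start out as CLASS_BITS_ANY and get narrowed by the
+ * instructions that use it; the final mask selects one of the RA classes
+ * built in vir_init_reg_sets().
+ */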
/**
* Returns a mapping from QFILE_TEMP indices to struct qpu_regs.
* The return value should be freed by the caller.
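+ *
+ * On failure, returns NULL; if *spilled was also set, a temp was spilled
+ * and the caller should retry register allocation.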
*/
struct qpu_reg *
-v3d_register_allocate(struct v3d_compile *c)
+v3d_register_allocate(struct v3d_compile *c, bool *spilled)
{
struct node_to_temp_map map[c->num_temps];
uint32_t temp_to_node[c->num_temps];
uint8_t class_bits[c->num_temps];
- struct qpu_reg *temp_registers = calloc(c->num_temps,
- sizeof(*temp_registers));
int acc_nodes[ACC_COUNT];
+ struct v3d_ra_select_callback_data callback_data = {
+ .next_acc = 0,
+ /* Start at RF3, to try to keep the TLB writes from using
+ * RF0-2.
+ */
+ .next_phys = 3,
+ };
+
+ *spilled = false;
+
+ vir_calculate_live_intervals(c);
+
+ /* Convert 1, 2, 4 threads to 0, 1, 2 index.
+ *
+ * V3D 4.x has double the physical register space, so 64 physical regs
+ * are available at both 1x and 2x threading, and 4x has 32.
+ */
+ int thread_index = ffs(c->threads) - 1;
+ if (c->devinfo->ver >= 40) {
+ if (thread_index >= 1)
+ thread_index--;
+ }
struct ra_graph *g = ra_alloc_interference_graph(c->compiler->regs,
c->num_temps +
ARRAY_SIZE(acc_nodes));
+ ra_set_select_reg_callback(g, v3d_ra_select_callback, &callback_data);
/* Make some fixed nodes for the accumulators, which we will need to
* interfere with when ops have implied r3/r4 writes or for the thread
ra_set_node_reg(g, acc_nodes[i], ACC_INDEX + i);
}
- /* Compute the live ranges so we can figure out interference. */
- vir_calculate_live_intervals(c);
-
for (uint32_t i = 0; i < c->num_temps; i++) {
map[i].temp = i;
map[i].priority = c->temp_end[i] - c->temp_start[i];
* start with any temp being able to be in any file, then instructions
* incrementally remove bits that the temp definitely can't be in.
*/
- memset(class_bits,
- CLASS_BIT_PHYS | CLASS_BIT_R0_R2 | CLASS_BIT_R3 | CLASS_BIT_R4,
- sizeof(class_bits));
+ memset(class_bits, CLASS_BITS_ANY, sizeof(class_bits));
int ip = 0;
vir_for_each_inst_inorder(inst, c) {
* decides whether the LDVPM is in or out)
*/
assert(inst->dst.file == QFILE_TEMP);
- class_bits[temp_to_node[inst->dst.index]] &=
- CLASS_BIT_PHYS;
+ class_bits[inst->dst.index] &= CLASS_BIT_PHYS;
+ break;
+
+ case V3D_QPU_A_RECIP:
+ case V3D_QPU_A_RSQRT:
+ case V3D_QPU_A_EXP:
+ case V3D_QPU_A_LOG:
+ case V3D_QPU_A_SIN:
+ case V3D_QPU_A_RSQRT2:
+ /* The SFU instructions write directly to the
+ * phys regfile.
+ */
+ assert(inst->dst.file == QFILE_TEMP);
+ class_bits[inst->dst.index] &= CLASS_BIT_PHYS;
break;
default:
case 0:
case 1:
case 2:
+ case 3:
/* Payload setup instructions: Force allocate
* the dst to the given register (so the MOV
* will disappear).
}
}
-#if 0
- switch (inst->op) {
- case QOP_THRSW:
+ if (inst->dst.file == QFILE_TEMP) {
+ /* Only a ldunif gets to write to R5, which only has a
+ * single 32-bit channel of storage.
+ */
+ if (!inst->qpu.sig.ldunif) {
+ class_bits[inst->dst.index] &= ~CLASS_BIT_R5;
+ } else {
+ /* Until V3D 4.x, we could only load a uniform
+ * to r5, so we'll need to spill if uniform
+ * loads interfere with each other.
+ */
+ if (c->devinfo->ver < 40) {
+ class_bits[inst->dst.index] &=
+ CLASS_BIT_R5;
+ }
+ }
+ }
+
+ if (inst->qpu.sig.thrsw) {
/* All accumulators are invalidated across a thread
* switch.
*/
for (int i = 0; i < c->num_temps; i++) {
if (c->temp_start[i] < ip && c->temp_end[i] > ip)
- class_bits[i] &= ~(CLASS_BIT_R0_R3 |
- CLASS_BIT_R4);
+ class_bits[i] &= CLASS_BIT_PHYS;
}
- break;
-
- default:
- break;
}
-#endif
ip++;
}
for (uint32_t i = 0; i < c->num_temps; i++) {
- ra_set_node_class(g, temp_to_node[i],
- c->compiler->reg_class[c->fs_threaded]);
+ if (class_bits[i] == CLASS_BIT_PHYS) {
+ ra_set_node_class(g, temp_to_node[i],
+ c->compiler->reg_class_phys[thread_index]);
+ } else if (class_bits[i] == (CLASS_BIT_R5)) {
+ ra_set_node_class(g, temp_to_node[i],
+ c->compiler->reg_class_r5[thread_index]);
+ } else if (class_bits[i] == (CLASS_BIT_PHYS | CLASS_BIT_ACC)) {
+ ra_set_node_class(g, temp_to_node[i],
+ c->compiler->reg_class_phys_or_acc[thread_index]);
+ } else {
+ assert(class_bits[i] == CLASS_BITS_ANY);
+ ra_set_node_class(g, temp_to_node[i],
+ c->compiler->reg_class_any[thread_index]);
+ }
}
for (uint32_t i = 0; i < c->num_temps; i++) {
}
}
+ /* Debug code to force a bit of register spilling, for running across
+ * conformance tests to make sure that spilling works.
+ */
+ int force_register_spills = 0;
+ if (c->spill_size <
+ V3D_CHANNELS * sizeof(uint32_t) * force_register_spills) {
+ int node = v3d_choose_spill_node(c, g, temp_to_node);
+ if (node != -1) {
+ v3d_spill_reg(c, map[node].temp);
+ ralloc_free(g);
+ *spilled = true;
+ return NULL;
+ }
+ }
+
bool ok = ra_allocate(g);
if (!ok) {
- if (!c->fs_threaded) {
- fprintf(stderr, "Failed to register allocate:\n");
- vir_dump(c);
+ int node = v3d_choose_spill_node(c, g, temp_to_node);
+
+ /* Don't emit spills using the TMU until we've dropped thread
+ * count first.
+ */
+ if (node != -1 &&
+ (vir_is_mov_uniform(c, map[node].temp) ||
+ thread_index == 0)) {
+ v3d_spill_reg(c, map[node].temp);
+
+ /* Ask the outer loop to call back in. */
+ *spilled = true;
}
- c->failed = true;
- free(temp_registers);
+ ralloc_free(g);
return NULL;
}
+ struct qpu_reg *temp_registers = calloc(c->num_temps,
+ sizeof(*temp_registers));
+
for (uint32_t i = 0; i < c->num_temps; i++) {
int ra_reg = ra_get_node_reg(g, temp_to_node[i]);
if (ra_reg < PHYS_INDEX) {