* SOFTWARE.
*/
-/* #define NV50PC_DEBUG */
+#if NV50_DEBUG & NV50_DEBUG_PROG_RA
+# define NV50_RA_DEBUG_LIVEI
+# define NV50_RA_DEBUG_LIVE_SETS
+# define NV50_RA_DEBUG_JOIN
+#endif
#include "nv50_context.h"
#include "nv50_pc.h"
#include "util/u_simple_list.h"
#define NUM_REGISTER_FILES 4
+#define MAX_REGISTER_COUNT 256
struct register_set {
struct nv_pc *pc;
uint32_t last[NUM_REGISTER_FILES];
- uint32_t bits[NUM_REGISTER_FILES][8];
+ uint32_t bits[NUM_REGISTER_FILES][(MAX_REGISTER_COUNT + 31) / 32];
};
+/* using OR because a set bit means occupied/unavailable, aliasing is allowed */
+/* Combine two register sets into @dst: a register is available in @dst only
+ * if it is available in both @src1 and @src2 (hence OR of the occupied bits).
+ * @dst may alias either source.
+ */
+static void
+intersect_register_sets(struct register_set *dst,
+ struct register_set *src1, struct register_set *src2)
+{
+ int i, j;
+
+ for (i = 0; i < NUM_REGISTER_FILES; ++i) {
+ for (j = 0; j < (MAX_REGISTER_COUNT + 31) / 32; ++j)
+ dst->bits[i][j] = src1->bits[i][j] | src2->bits[i][j];
+ }
+}
+
+/* Apply bit masks to every occupancy word of @set:
+ * new = (old | mask) & umask — i.e. @mask marks registers occupied,
+ * and any bit clear in @umask is forced to 0 (marked available).
+ * Callers pass the same 32-bit pattern for each word of each file.
+ */
+static void
+mask_register_set(struct register_set *set, uint32_t mask, uint32_t umask)
+{
+ int i, j;
+
+ for (i = 0; i < NUM_REGISTER_FILES; ++i) {
+ for (j = 0; j < (MAX_REGISTER_COUNT + 31) / 32; ++j)
+ set->bits[i][j] = (set->bits[i][j] | mask) & umask;
+ }
+}
+
struct nv_pc_pass {
struct nv_pc *pc;
}
}
+/* @return: TRUE if @new_range can be freed (i.e. was not reused) */
static boolean
add_range_ex(struct nv_value *val, int bgn, int end, struct nv_range *new_range)
{
struct nv_range *range, **nextp = &val->livei;
+ if (bgn == end) /* [a, a) is invalid / empty */
+ return TRUE;
+
for (range = val->livei; range; range = range->next) {
if (end < range->bgn)
break; /* insert before */
add_range_ex(val, bgn, end, NULL);
}
-#ifdef NV50_RA_DEBUG_JOIN
+#if defined(NV50_RA_DEBUG_JOIN) || defined(NV50_RA_DEBUG_LIVEI)
static void
livei_print(struct nv_value *a)
{
id <<= s;
m = (1 << (1 << s)) - 1;
+ assert(s >= 0); /* XXX: remove me */
+
set->bits[f][id / 32] |= m << (id % 32);
if (set->pc->max_reg[f] < id)
if (a->join->reg.id == b->join->reg.id)
return TRUE;
-#if 1
/* either a or b or both have been assigned */
if (a->join->reg.id >= 0 && b->join->reg.id >= 0)
return FALSE;
else
if (b->join->reg.id >= 0) {
- if (a->join->reg.id >= 0)
- return FALSE;
val = a;
a = b;
b = val;
return FALSE;
}
return TRUE;
-#endif
- return FALSE;
}
static INLINE void
assert(b->join == a->join);
}
-static INLINE void
+static INLINE boolean
try_join_values(struct nv_pc_pass *ctx, struct nv_value *a, struct nv_value *b)
{
if (!join_allowed(ctx, a, b)) {
#ifdef NV50_RA_DEBUG_JOIN
debug_printf("cannot join %i to %i: not allowed\n", b->n, a->n);
#endif
- return;
+ return FALSE;
}
if (livei_have_overlap(a->join, b->join)) {
#ifdef NV50_RA_DEBUG_JOIN
livei_print(a);
livei_print(b);
#endif
- return;
+ return FALSE;
}
do_join_values(ctx, a, b);
+
+ return TRUE;
+}
+
+/* Coalesce @b into @a, treating failure as exceptional.
+ * If @type_only is TRUE only the (asserted) compatibility check is done and the
+ * join is performed unconditionally; otherwise a full try_join_values (which
+ * also checks live-interval overlap) is attempted and failure is reported
+ * via NOUVEAU_ERR but not propagated to the caller.
+ */
+static void
+join_values_nofail(struct nv_pc_pass *ctx,
+ struct nv_value *a, struct nv_value *b, boolean type_only)
+{
+ if (type_only) {
+ assert(join_allowed(ctx, a, b));
+ do_join_values(ctx, a, b);
+ } else {
+ boolean ok = try_join_values(ctx, a, b);
+ if (!ok) {
+ NOUVEAU_ERR("failed to coalesce values\n");
+ }
+ }
}
static INLINE boolean
int i = 0, n = 0;
for (; i < 2; ++i)
- if (p->out[i] && p->out_kind[i] != CFG_EDGE_LOOP_LEAVE)
+ if (p->out[i] && !IS_LOOP_EDGE(p->out_kind[i]))
++n;
return (b->num_in > 1) && (n == 2);
}
+/* Look for the @phi's operand whose definition reaches @b. */
+/* @return: the source index (0-based) of the chosen operand, or -1 if no
+ * operand's defining block reaches @b (or the PHI's own def wins, see below).
+ * Among multiple reaching operands, the one defined "latest" (not reachable
+ * by any other candidate's defining block) is preferred.
+ * NOTE(review): parameter @tb is not used in this body — TODO confirm intent.
+ */
+static int
+phi_opnd_for_bb(struct nv_instruction *phi, struct nv_basic_block *b,
+ struct nv_basic_block *tb)
+{
+ struct nv_ref *srci, *srcj;
+ int i, j;
+
+ for (j = -1, i = 0; i < 6 && phi->src[i]; ++i) {
+ srci = phi->src[i];
+ /* if already replaced, check with original source first */
+ if (srci->flags & NV_REF_FLAG_REGALLOC_PRIV)
+ srci = srci->value->insn->src[0];
+ if (!nvbb_reachable_by(b, srci->value->insn->bb, NULL))
+ continue;
+ /* NOTE: back-edges are ignored by the reachable-by check */
+ if (j < 0 || !nvbb_reachable_by(srcj->value->insn->bb,
+ srci->value->insn->bb, NULL)) {
+ j = i;
+ srcj = srci;
+ }
+ }
+ /* If the PHI's own def also reaches @b and is not reachable from the chosen
+ * operand's block, the def is the more recent dominating value: return -1.
+ */
+ if (j >= 0 && nvbb_reachable_by(b, phi->def[0]->insn->bb, NULL))
+ if (!nvbb_reachable_by(srcj->value->insn->bb,
+ phi->def[0]->insn->bb, NULL))
+ j = -1;
+ return j;
+}
+
/* For each operand of each PHI in b, generate a new value by inserting a MOV
* at the end of the block it is coming from and replace the operand with its
* result. This eliminates liveness conflicts and enables us to let values be
* copied to the right register if such a conflict exists nonetheless.
+ *
+ * These MOVs are also crucial in making sure the live intervals of phi srces
+ * are extended until the end of the loop, since they are not included in the
+ * live-in sets.
*/
static int
pass_generate_phi_movs(struct nv_pc_pass *ctx, struct nv_basic_block *b)
ctx->pc->current_block = pn;
for (i = b->phi; i && i->opcode == NV_OP_PHI; i = i->next) {
- for (j = 0; j < 4 && i->src[j]; ++j) {
- if (nvbb_reachable_by(p, i->src[j]->value->insn->bb, b))
- break;
+ j = phi_opnd_for_bb(i, p, b);
+
+ if (j < 0) {
+ val = i->def[0];
+ } else {
+ val = i->src[j]->value;
+ if (i->src[j]->flags & NV_REF_FLAG_REGALLOC_PRIV) {
+ j = -1;
+ /* use original value, we already encountered & replaced it */
+ val = val->insn->src[0]->value;
+ }
}
- if (j >= 4 || !i->src[j])
- continue;
- val = i->src[j]->value;
+ if (j < 0) /* need an additional source ? */
+ for (j = 0; j < 5 && i->src[j] && i->src[j]->value != val; ++j);
+ assert(j < 5);
ni = new_instruction(ctx->pc, NV_OP_MOV);
ni->src[0] = new_ref(ctx->pc, val);
nv_reference(ctx->pc, &i->src[j], ni->def[0]);
+
+ i->src[j]->flags |= NV_REF_FLAG_REGALLOC_PRIV;
}
if (pn != p && pn->exit) {
- ctx->pc->current_block = b->in[n ? 0 : 1];
+ assert(!b->in[!n]->exit || b->in[!n]->exit->is_terminator);
+ /* insert terminator (branch to ENDIF) in new else block */
+ ctx->pc->current_block = pn;
ni = new_instruction(ctx->pc, NV_OP_BRA);
ni->target = b;
ni->is_terminator = 1;
return 0;
}
+#define JOIN_MASK_PHI (1 << 0)
+#define JOIN_MASK_SELECT (1 << 1)
+#define JOIN_MASK_MOV (1 << 2)
+#define JOIN_MASK_TEX (1 << 3)
+
static int
-pass_join_values(struct nv_pc_pass *ctx, int iter)
+pass_join_values(struct nv_pc_pass *ctx, unsigned mask)
{
int c, n;
for (n = 0; n < ctx->num_insns; ++n) {
- struct nv_instruction *i = ctx->insns[n];
+ struct nv_instruction *nvi, *i = ctx->insns[n];
switch (i->opcode) {
case NV_OP_PHI:
- if (!iter)
- continue;
- try_join_values(ctx, i->src[0]->value, i->src[1]->value);
- try_join_values(ctx, i->def[0], i->src[0]->value);
+ if (!(mask & JOIN_MASK_PHI))
+ break;
+ for (c = 0; c < 5 && i->src[c]; ++c)
+ join_values_nofail(ctx, i->def[0], i->src[c]->value, FALSE);
break;
case NV_OP_MOV:
- if (iter && i->src[0]->value->insn &&
- !nv_is_vector_op(i->src[0]->value->join->insn->opcode))
+ if (!(mask & JOIN_MASK_MOV))
+ break;
+ nvi = i->src[0]->value->join->insn;
+ if (nvi && !nv_is_vector_op(nvi->opcode))
try_join_values(ctx, i->def[0], i->src[0]->value);
break;
case NV_OP_SELECT:
- if (!iter)
+ if (!(mask & JOIN_MASK_SELECT))
break;
- assert(join_allowed(ctx, i->def[0], i->src[0]->value));
- assert(join_allowed(ctx, i->def[0], i->src[1]->value));
- do_join_values(ctx, i->def[0], i->src[0]->value);
- do_join_values(ctx, i->def[0], i->src[1]->value);
+ for (c = 0; c < 5 && i->src[c]; ++c)
+ join_values_nofail(ctx, i->def[0], i->src[c]->value, TRUE);
break;
case NV_OP_TEX:
case NV_OP_TXB:
case NV_OP_TXL:
case NV_OP_TXQ:
- if (iter)
+ if (!(mask & JOIN_MASK_TEX))
break;
- for (c = 0; c < 4; ++c) {
- if (!i->src[c])
- break;
- do_join_values(ctx, i->def[c], i->src[c]->value);
- }
+ /* This should work without conflicts because we always generate
+ * extra MOVs for the sources of a TEX.
+ */
+ for (c = 0; c < 4 && i->src[c]; ++c)
+ join_values_nofail(ctx, i->def[c], i->src[c]->value, TRUE);
break;
default:
break;
for (j = 0; j < (ctx->pc->num_values + 31) / 32; ++j)
b->live_set[j] |= b->out[n]->live_set[j];
}
-
- /* Kick values out of our live set that are created in incoming
- * blocks of our successors that are not us.
- */
- for (i = b->out[n]->phi; i && i->opcode == NV_OP_PHI; i = i->next) {
- for (j = 0; j < 4; ++j) {
- if (!i->src[j])
- break;
- assert(i->src[j]->value->insn);
-
- if (nvbb_reachable_by(b, i->src[j]->value->insn->bb, b->out[n]))
- live_set_add(b, i->src[j]->value);
- else
- live_set_rem(b, i->src[j]->value);
- }
- }
}
if (!b->entry)
bb_live_set_print(ctx->pc, b);
- for (i = b->exit; i; i = i->prev) {
+ for (i = b->exit; i != b->entry->prev; i = i->prev) {
for (j = 0; j < 4; j++) {
if (!i->def[j])
break;
if (i->flags_src)
live_set_add(b, i->flags_src->value);
}
+ for (i = b->phi; i && i->opcode == NV_OP_PHI; i = i->next)
+ live_set_rem(b, i->def[0]);
+
bb_live_set_print(ctx->pc, b);
return 0;
{
int i;
- if (b->out[0]) {
- if (b->out[1]) { /* what to do about back-edges ? */
+ if (b->out[0] && b->out_kind[0] != CFG_EDGE_FAKE) {
+ if (b->out[1] && b->out_kind[1] != CFG_EDGE_FAKE) {
for (i = 0; i < n; ++i)
b->live_set[i] = b->out[0]->live_set[i] | b->out[1]->live_set[i];
} else {
memcpy(b->live_set, b->out[0]->live_set, n * sizeof(uint32_t));
}
} else
- if (b->out[1]) {
+ if (b->out[1] && b->out_kind[1] != CFG_EDGE_FAKE) {
memcpy(b->live_set, b->out[1]->live_set, n * sizeof(uint32_t));
} else {
memset(b->live_set, 0, n * sizeof(uint32_t));
for (j = 0; j < ctx->pc->num_values; ++j) {
if (!(b->live_set[j / 32] & (1 << (j % 32))))
continue;
+ add_range(&ctx->pc->values[j], b, b->exit->serial + 1);
#ifdef NV50_RA_DEBUG_LIVEI
- debug_printf("adding range for live value %i\n", j);
+ debug_printf("adding range for live value %i: ", j);
+ livei_print(&ctx->pc->values[j]);
#endif
- add_range(&ctx->pc->values[j], b, b->exit->serial + 1);
+
}
}
for (j = 0; j < 5; ++j) {
if (i->src[j] && !live_set_test(b, i->src[j])) {
live_set_add(b, i->src[j]->value);
+ add_range(i->src[j]->value, b, i->serial);
#ifdef NV50_RA_DEBUG_LIVEI
- debug_printf("adding range for source that ends living: %i\n",
+ debug_printf("adding range for source %i (ends living): ",
i->src[j]->value->n);
+ livei_print(i->src[j]->value);
#endif
- add_range(i->src[j]->value, b, i->serial);
}
}
if (i->flags_src && !live_set_test(b, i->flags_src)) {
live_set_add(b, i->flags_src->value);
+ add_range(i->flags_src->value, b, i->serial);
#ifdef NV50_RA_DEBUG_LIVEI
- debug_printf("adding range for source that ends living: %i\n",
+ debug_printf("adding range for source %i (ends living): ",
i->flags_src->value->n);
+ livei_print(i->flags_src->value);
#endif
- add_range(i->flags_src->value, b, i->serial);
}
}
static void
insert_ordered_tail(struct nv_value *list, struct nv_value *nval)
{
- struct nv_value *elem = list->prev;
+ struct nv_value *elem;
for (elem = list->prev;
- elem != list && elem->livei->bgn > nval->livei->bgn;
- elem = elem->prev);
+ elem != list && elem->livei->bgn > nval->livei->bgn;
+ elem = elem->prev);
/* now elem begins before or at the same time as val */
nval->prev = elem;
elem->next = nval;
}
-static int
-pass_linear_scan(struct nv_pc_pass *ctx, int iter)
+static void
+collect_register_values(struct nv_pc_pass *ctx, struct nv_value *head,
+ boolean assigned_only)
{
- struct nv_instruction *i;
- struct register_set f, free;
+ struct nv_value *val;
int k, n;
- struct nv_value *cur, *val, *tmp[2];
- struct nv_value active, inactive, handled, unhandled;
-
- make_empty_list(&active);
- make_empty_list(&inactive);
- make_empty_list(&handled);
- make_empty_list(&unhandled);
- nv50_ctor_register_set(ctx->pc, &free);
+ make_empty_list(head);
- /* joined values should have range = NULL and thus not be added;
- * also, fixed memory values won't be added because they're not
- * def'd, just used
- */
for (n = 0; n < ctx->num_insns; ++n) {
- i = ctx->insns[n];
+ struct nv_instruction *i = ctx->insns[n];
+ /* for joined values, only the representative will have livei != NULL */
for (k = 0; k < 4; ++k) {
if (i->def[k] && i->def[k]->livei)
- insert_ordered_tail(&unhandled, i->def[k]);
- else
- if (0 && i->def[k])
- debug_printf("skipping def'd value %i: no livei\n", i->def[k]->n);
+ if (!assigned_only || i->def[k]->reg.id >= 0)
+ insert_ordered_tail(head, i->def[k]);
}
if (i->flags_def && i->flags_def->livei)
- insert_ordered_tail(&unhandled, i->flags_def);
+ if (!assigned_only || i->flags_def->reg.id >= 0)
+ insert_ordered_tail(head, i->flags_def);
}
- for (val = unhandled.next; val != unhandled.prev; val = val->next) {
+ for (val = head->next; val != head->prev; val = val->next) {
assert(val->join == val);
assert(val->livei->bgn <= val->next->livei->bgn);
}
+}
+
+static int
+pass_linear_scan(struct nv_pc_pass *ctx, int iter)
+{
+ struct register_set f, free;
+ struct nv_value *cur, *val, *tmp[2];
+ struct nv_value active, inactive, handled, unhandled;
+
+ make_empty_list(&active);
+ make_empty_list(&inactive);
+ make_empty_list(&handled);
+
+ nv50_ctor_register_set(ctx->pc, &free);
+
+ collect_register_values(ctx, &unhandled, FALSE);
foreach_s(cur, tmp[0], &unhandled) {
remove_from_list(cur);
reg_occupy(&f, val);
if (cur->reg.id < 0) {
- boolean mem = FALSE;
-
- if (nv_is_vector_op(cur->insn->opcode))
- mem = !reg_assign(&f, &cur->insn->def[0], 4);
- else
- if (iter)
- mem = !reg_assign(&f, &cur, 1);
+ boolean mem = !reg_assign(&f, &cur, 1);
if (mem) {
NOUVEAU_ERR("out of registers\n");
return 0;
}
-int
-nv_pc_exec_pass1(struct nv_pc *pc)
+/* Allocate values defined by instructions such as TEX, which have to be
+ * assigned to consecutive registers.
+ * Linear scan doesn't really work here since the values can have different
+ * live intervals.
+ */
+/* @return: 0 on success; aborts on register exhaustion. */
+static int
+pass_allocate_constrained_values(struct nv_pc_pass *ctx)
+{
+ struct nv_value regvals, *val;
+ struct nv_instruction *i;
+ struct nv_value *defs[4];
+ struct register_set regs[4];
+ int n, vsize, c;
+ uint32_t mask;
+ boolean mem;
+
+ /* ordered list of already-assigned values, used for overlap tests below */
+ collect_register_values(ctx, &regvals, TRUE);
+
+ for (n = 0; n < ctx->num_insns; ++n) {
+ i = ctx->insns[n];
+ vsize = nvi_vector_size(i);
+ if (!(vsize > 1))
+ continue;
+ assert(vsize <= 4);
+ /* operate on the join representatives; components share reg.id state */
+ for (c = 0; c < vsize; ++c)
+ defs[c] = i->def[c]->join;
+
+ /* already allocated (then all components must be) — skip */
+ if (defs[0]->reg.id >= 0) {
+ for (c = 1; c < vsize; ++c)
+ assert(defs[c]->reg.id >= 0);
+ continue;
+ }
+
+ /* Compute registers available for this "vector" of consecutive registers.
+ * Each value (component) has its own independent live interval.
+ */
+ for (c = 0; c < vsize; ++c) {
+ nv50_ctor_register_set(ctx->pc, &regs[c]);
+
+ /* occupy every assigned register whose interval overlaps component c */
+ foreach(val, &regvals) {
+ if (val->reg.id >= 0 && livei_have_overlap(val, defs[c]))
+ reg_occupy(&regs[c], val);
+ }
+ /* Only 32 bit GPRs will be allocated here, but register set
+ * granularity for GPRs is 16 bit.
+ */
+ mask = 0x03030303;
+ if (vsize == 2) /* granularity is 2 and not 4 */
+ mask |= 0x03030303 << 4;
+ /* NOTE(review): mask << (c * 2) shifts component c's availability
+ * window so the per-component sets line up when intersected — confirm
+ * against reg_assign's alignment expectations.
+ */
+ mask_register_set(&regs[c], 0, mask << (c * 2));
+
+ if (defs[c]->livei)
+ insert_ordered_tail(&regvals, defs[c]);
+ }
+ /* a register base is usable only if free for every component */
+ for (c = 1; c < vsize; ++c)
+ intersect_register_sets(&regs[0], &regs[0], &regs[c]);
+
+ /* assign vsize consecutive registers starting from the combined set */
+ mem = !reg_assign(&regs[0], &defs[0], vsize);
+
+ if (mem) {
+ NOUVEAU_ERR("out of registers\n");
+ abort();
+ }
+ }
+ return 0;
+}
+
+static int
+nv_pc_pass1(struct nv_pc *pc, struct nv_basic_block *root)
{
struct nv_pc_pass *ctx;
int i, ret;
- NV50_DBGMSG("REGISTER ALLOCATION - entering\n");
+ NV50_DBGMSG(PROG_RA, "REGISTER ALLOCATION - entering\n");
ctx = CALLOC_STRUCT(nv_pc_pass);
if (!ctx)
ctx->pc = pc;
ctx->insns = CALLOC(NV_PC_MAX_INSTRUCTIONS, sizeof(struct nv_instruction *));
+ if (!ctx->insns) {
+ FREE(ctx);
+ return -1;
+ }
pc->pass_seq++;
- ret = pass_generate_phi_movs(ctx, pc->root);
+ ret = pass_generate_phi_movs(ctx, root);
assert(!ret);
for (i = 0; i < pc->loop_nesting_bound; ++i) {
pc->pass_seq++;
- ret = pass_build_live_sets(ctx, pc->root);
+ ret = pass_build_live_sets(ctx, root);
assert(!ret && "live sets");
if (ret) {
NOUVEAU_ERR("failed to build live sets (iteration %d)\n", i);
}
pc->pass_seq++;
- nv_pc_pass_in_order(pc->root, pass_order_instructions, ctx);
+ nv_pc_pass_in_order(root, pass_order_instructions, ctx);
pc->pass_seq++;
- ret = pass_build_intervals(ctx, pc->root);
+ ret = pass_build_intervals(ctx, root);
assert(!ret && "build intervals");
if (ret) {
NOUVEAU_ERR("failed to build live intervals\n");
livei_print(&pc->values[i]);
#endif
- for (i = 0; i < 2; ++i) {
- ret = pass_join_values(ctx, i);
- if (ret)
- goto out;
- ret = pass_linear_scan(ctx, i);
- if (ret)
- goto out;
- }
- assert(!ret && "joining");
+ ret = pass_join_values(ctx, JOIN_MASK_PHI);
+ if (ret)
+ goto out;
+ ret = pass_join_values(ctx, JOIN_MASK_SELECT | JOIN_MASK_TEX);
+ if (ret)
+ goto out;
+ ret = pass_join_values(ctx, JOIN_MASK_MOV);
+ if (ret)
+ goto out;
+ ret = pass_allocate_constrained_values(ctx);
+ if (ret)
+ goto out;
+ ret = pass_linear_scan(ctx, 1);
+ if (ret)
+ goto out;
for (i = 0; i < pc->num_values; ++i)
livei_release(&pc->values[i]);
- NV50_DBGMSG("REGISTER ALLOCATION - leaving\n");
+ NV50_DBGMSG(PROG_RA, "REGISTER ALLOCATION - leaving\n");
out:
+ FREE(ctx->insns);
FREE(ctx);
return ret;
}
+
+/* Run register allocation (pass 1) on the main program root and on each
+ * subroutine root. Stops and returns the first non-zero error; 0 on success.
+ */
+int
+nv_pc_exec_pass1(struct nv_pc *pc)
+{
+ int i, ret;
+
+ for (i = 0; i < pc->num_subroutines + 1; ++i)
+ if (pc->root[i] && (ret = nv_pc_pass1(pc, pc->root[i])))
+ return ret;
+ return 0;
+}