bool ready;
bool inserted;
bool max_node, next_max_node;
+ bool complex_allowed;
} sched;
struct {
int parent_index;
* (3) There is a store instruction scheduled, but not its child.
*
* The complex slot cannot be used for a move in case (1), since it only
- * has a FIFO depth of 1, but it can be used for (2) and (3). In order to
- * ensure that we have enough space for all three, we maintain the
- * following invariants:
+ * has a FIFO depth of 1, but it can be used for (2) as well as (3) as long
+ * as the uses aren't in certain slots. It turns out that we don't have to
+ * worry about nodes that can't use the complex slot for (2), since there
+ * are at most 4 uses 1 cycle ago that can't use the complex slot, but we
+ * do have to worry about (3). This means tracking stores whose children
+ * cannot be in the complex slot. In order to ensure that we have enough
+ * space for all three, we maintain the following invariants:
*
* (1) alu_num_slot_free >= alu_num_slot_needed_by_store +
* alu_num_slot_needed_by_max +
* alu_num_slot_needed_by_next_max
- * (2) alu_non_cplx_slot_free >= alu_num_slot_needed_by_max
+ * (2) alu_non_cplx_slot_free >= alu_num_slot_needed_by_max +
+ *       alu_num_slot_needed_by_non_cplx_store
*/
int alu_num_slot_needed_by_store;
+ int alu_num_slot_needed_by_non_cplx_store;
int alu_num_slot_needed_by_max;
int alu_num_slot_needed_by_next_max;
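/* A minimal standalone sketch of the two invariants above; the struct and
 * helper below are hypothetical, for illustration only, and exist nowhere
 * in the driver.
 */
#include <stdbool.h>

struct slot_counts {
   int alu_num_slot_free;
   int alu_non_cplx_slot_free;
   int alu_num_slot_needed_by_store;
   int alu_num_slot_needed_by_non_cplx_store;
   int alu_num_slot_needed_by_max;
   int alu_num_slot_needed_by_next_max;
};

static bool invariants_hold(const struct slot_counts *c)
{
   /* (1): pending store children, max nodes and next-max moves all fit */
   if (c->alu_num_slot_free < c->alu_num_slot_needed_by_store +
                              c->alu_num_slot_needed_by_max +
                              c->alu_num_slot_needed_by_next_max)
      return false;

   /* (2): max nodes and non-complex-capable store children also need
    * slots other than the complex one
    */
   return c->alu_non_cplx_slot_free >= c->alu_num_slot_needed_by_max +
                                       c->alu_num_slot_needed_by_non_cplx_store;
}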
if (!gpir_instr_check_acc_same_op(instr, node, node->sched.pos))
return false;
+ if (node->sched.next_max_node && !node->sched.complex_allowed &&
+ node->sched.pos == GPIR_INSTR_SLOT_COMPLEX)
+ return false;
+
int consume_slot = gpir_instr_get_consume_slot(instr, node);
int non_cplx_consume_slot =
node->sched.pos == GPIR_INSTR_SLOT_COMPLEX ? 0 : consume_slot;
int store_reduce_slot = 0;
+ int non_cplx_store_reduce_slot = 0;
int max_reduce_slot = node->sched.max_node ? 1 : 0;
int next_max_reduce_slot = node->sched.next_max_node ? 1 : 0;
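/* The *_reduce_slot values account for demand that this node itself
 * satisfies when placed: e.g. if the node is the pending child of a
 * scheduled store, one slot reserved for that store is freed, and a max or
 * next-max node placed now no longer needs a reserved move slot.
 */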
gpir_store_node *s = gpir_node_to_store(instr->slots[i]);
if (s && s->child == node) {
store_reduce_slot = 1;
+ if (node->sched.next_max_node && !node->sched.complex_allowed)
+ non_cplx_store_reduce_slot = 1;
break;
}
}
}
int non_cplx_slot_difference =
- instr->alu_num_slot_needed_by_max - max_reduce_slot -
+ instr->alu_num_slot_needed_by_max - max_reduce_slot +
+ instr->alu_num_slot_needed_by_non_cplx_store - non_cplx_store_reduce_slot -
(instr->alu_non_cplx_slot_free - non_cplx_consume_slot);
if (non_cplx_slot_difference > 0) {
gpir_debug("failed %d because of alu slot\n", node->index);
instr->non_cplx_slot_difference = non_cplx_slot_difference;
return false;
}

instr->alu_num_slot_free -= consume_slot;
instr->alu_non_cplx_slot_free -= non_cplx_consume_slot;
instr->alu_num_slot_needed_by_store -= store_reduce_slot;
+ instr->alu_num_slot_needed_by_non_cplx_store -= non_cplx_store_reduce_slot;
instr->alu_num_slot_needed_by_max -= max_reduce_slot;
instr->alu_num_slot_needed_by_next_max -= next_max_reduce_slot;
return true;
gpir_store_node *s = gpir_node_to_store(instr->slots[i]);
if (s && s->child == node) {
instr->alu_num_slot_needed_by_store++;
+ if (node->sched.next_max_node && !node->sched.complex_allowed)
+ instr->alu_num_slot_needed_by_non_cplx_store++;
break;
}
}
}
/* Check the invariants documented in gpir.h, similar to the ALU case.
- * Since the only thing that changes is alu_num_slot_needed_by_store, we
+ * When the only thing that changes is alu_num_slot_needed_by_store, we
* can get away with just checking the first one.
*/
int slot_difference = instr->alu_num_slot_needed_by_store + 1
return false;
}
+ if (store->child->sched.next_max_node &&
+ !store->child->sched.complex_allowed) {
+ /* The child of the store is already partially ready, and has a use one
+ * cycle ago that disqualifies it (or a move replacing it) from being
+ * put in the complex slot. Therefore we have to check the non-complex
+ * invariant.
+ */
+ int non_cplx_slot_difference =
+ instr->alu_num_slot_needed_by_max +
+ instr->alu_num_slot_needed_by_non_cplx_store + 1 -
+ instr->alu_non_cplx_slot_free;
+ if (non_cplx_slot_difference > 0) {
+ instr->non_cplx_slot_difference = non_cplx_slot_difference;
+ return false;
+ }
+
+ instr->alu_num_slot_needed_by_non_cplx_store++;
+ }
+
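/* Worked example with hypothetical numbers: if alu_num_slot_needed_by_max
 * is 2, alu_num_slot_needed_by_non_cplx_store is 1 and
 * alu_non_cplx_slot_free is 3, this store would give
 * 2 + 1 + 1 - 3 = 1 > 0, so it is rejected and the deficit is passed back
 * to the scheduler through non_cplx_slot_difference.
 */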
instr->alu_num_slot_needed_by_store++;
out:
instr->alu_num_slot_needed_by_store--;
+ if (store->child->sched.next_max_node &&
+ !store->child->sched.complex_allowed) {
+ instr->alu_num_slot_needed_by_non_cplx_store--;
+ }
+
out:
if (!instr->slots[other_slot])
instr->store_content[component >> 1] = GPIR_INSTR_STORE_NONE;
move->node.sched.dist = node->sched.dist;
move->node.sched.max_node = node->sched.max_node;
move->node.sched.next_max_node = node->sched.next_max_node;
+ move->node.sched.complex_allowed = node->sched.complex_allowed;
gpir_debug("create move %d for %d\n", move->node.index, node->index);
store->child = node;
store->node.sched.max_node = false;
store->node.sched.next_max_node = false;
+ store->node.sched.complex_allowed = false;
store->node.sched.pos = -1;
store->node.sched.instr = NULL;
store->node.sched.inserted = false;
return min;
}
+/* The second source for add0, add1, mul0, and mul1 units cannot be complex.
+ * The hardware overwrites the add second sources with 0 and mul second
+ * sources with 1. This can be a problem if we need to insert more next-max
+ * moves but we only have values that can't use the complex unit for moves.
+ *
+ * Fortunately, we only need to insert a next-max move if there are more than
+ * 5 next-max nodes, but there are only 4 sources in the previous instruction
+ * that make values not complex-capable, which means there can be at most 4
+ * non-complex-capable values. Hence there will always be at least two values
+ * that can be rewritten to use a move in the complex slot. However, we have
+ * to be careful not to waste those values by putting both of them in a
+ * non-complex slot. This is handled for us by gpir_instr, which will reject
+ * such instructions. We just need to tell it which nodes can use complex, and
+ * it will do the accounting to figure out what is safe.
+ */
+
+static bool can_use_complex(gpir_node *node)
+{
+ gpir_node_foreach_succ(node, dep) {
+ if (dep->type != GPIR_DEP_INPUT)
+ continue;
+
+ gpir_node *succ = dep->succ;
+ if (succ->type != gpir_node_type_alu)
+ continue;
+
+ gpir_alu_node *alu = gpir_node_to_alu(succ);
+ if (alu->num_child >= 2 && alu->children[1] == node)
+ return false;
+
+ /* complex1 puts its third source in the fourth slot */
+ if (alu->node.op == gpir_op_complex1 && alu->children[2] == node)
+ return false;
+ }
+
+ return true;
+}
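/* A sketch of the counting argument above; this helper is hypothetical and
 * only illustrates the bound. With more than 5 next-max nodes on the ready
 * list and at most 4 of them non-complex-capable, it would always return at
 * least 2.
 */
static int count_complex_capable(sched_ctx *ctx)
{
   int n = 0;
   list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
      if (node->sched.next_max_node && can_use_complex(node))
         n++;
   }
   return n;
}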
+
/* Initialize node->sched.max_node and node->sched.next_max_node for every
* input node on the ready list. We should only need to do this once per
* instruction, at the beginning, since we never add max nodes to the ready
* list.
*/
int min_end_move = gpir_get_min_end_as_move(node);
node->sched.max_node = (min_end_move == ctx->instr->index);
node->sched.next_max_node = (min_end_move == ctx->instr->index + 1);
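/* complex_allowed only matters for next-max nodes: a move for a max node
 * (case (1) in gpir.h) can never use the complex slot anyway, since that
 * slot only has a FIFO depth of 1.
 */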
+ if (node->sched.next_max_node)
+ node->sched.complex_allowed = can_use_complex(node);
if (node->sched.max_node)
ctx->instr->alu_num_slot_needed_by_max++;
int alu_num_slot_needed_by_max = 0;
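/* Starts at -5 because a next-max node only needs a reserved move slot once
 * there are more than 5 of them (see the comment above can_use_complex);
 * the assert below clamps it with MAX2 for the same reason.
 */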
int alu_num_slot_needed_by_next_max = -5;
int alu_num_slot_needed_by_store = 0;
+ int alu_num_slot_needed_by_non_cplx_store = 0;
list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
if (!gpir_is_input_node(node))
continue;

if (node->sched.max_node)
alu_num_slot_needed_by_max++;
if (node->sched.next_max_node)
alu_num_slot_needed_by_next_max++;
- if (used_by_store(node, ctx->instr))
+ if (used_by_store(node, ctx->instr)) {
alu_num_slot_needed_by_store++;
+ if (node->sched.next_max_node && !node->sched.complex_allowed)
+ alu_num_slot_needed_by_non_cplx_store++;
+ }
}
assert(ctx->instr->alu_num_slot_needed_by_max == alu_num_slot_needed_by_max);
assert(ctx->instr->alu_num_slot_needed_by_next_max == alu_num_slot_needed_by_next_max);
assert(ctx->instr->alu_num_slot_needed_by_store == alu_num_slot_needed_by_store);
- assert(ctx->instr->alu_num_slot_free >= alu_num_slot_needed_by_store + alu_num_slot_needed_by_max + alu_num_slot_needed_by_next_max);
- assert(ctx->instr->alu_non_cplx_slot_free >= alu_num_slot_needed_by_max);
+ assert(ctx->instr->alu_num_slot_needed_by_non_cplx_store ==
+ alu_num_slot_needed_by_non_cplx_store);
+ assert(ctx->instr->alu_num_slot_free >= alu_num_slot_needed_by_store + alu_num_slot_needed_by_max + MAX2(alu_num_slot_needed_by_next_max, 0));
+ assert(ctx->instr->alu_non_cplx_slot_free >= alu_num_slot_needed_by_max + alu_num_slot_needed_by_non_cplx_store);
}
static bool try_node(sched_ctx *ctx)
assert(score != INT_MIN);
}
+/* For next-max nodes, not every node can be offloaded to a move in the
+ * complex slot. If we run out of non-complex slots, then such nodes cannot
+ * have moves placed for them. There should always be sufficient
+ * complex-capable nodes so that this isn't a problem.
+ */
+static bool can_place_move(sched_ctx *ctx, gpir_node *node)
+{
+ if (!node->sched.next_max_node)
+ return true;
+
+ if (node->sched.complex_allowed)
+ return true;
+
+ return ctx->instr->alu_non_cplx_slot_free > 0;
+}
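/* For example, with alu_non_cplx_slot_free == 0 a move for a
 * non-complex-capable next-max node could only land in the complex slot,
 * which it must not use, so the loops below skip such nodes and try the
 * next candidate.
 */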
+
static bool sched_move(sched_ctx *ctx)
{
list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
*/
if (ctx->instr->alu_num_slot_free > 0) {
list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
+ if (!can_place_move(ctx, node))
+ continue;
+
if (node->sched.next_max_node && node->op == gpir_op_complex1 &&
node->sched.ready) {
bool skip = true;
*/
if (ctx->instr->alu_num_slot_free > 0) {
list_for_each_entry_rev(gpir_node, node, &ctx->ready_list, list) {
+ if (!can_place_move(ctx, node))
+ continue;
+
if (node->sched.next_max_node &&
!(node->op == gpir_op_complex1 && node->sched.ready)) {
place_move(ctx, node);
if (ctx->instr->alu_num_slot_needed_by_next_max > 0) {
list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
+ if (!can_place_move(ctx, node))
+ continue;
+
if (node->sched.next_max_node) {
place_move(ctx, node);
return true;
node->sched.physreg_store = NULL;
node->sched.ready = false;
node->sched.inserted = false;
+ node->sched.complex_allowed = false;
node->sched.max_node = false;
node->sched.next_max_node = false;
}