*/
#include "vc4_qir.h"
+#include "util/dag.h"
static bool debug;
struct schedule_node {
+ struct dag_node dag;
struct list_head link;
struct qinst *inst;
- struct schedule_node **children;
- uint32_t child_count;
- uint32_t child_array_size;
- uint32_t parent_count;
-
/* Length of the longest (latency) chain from a DAG head to the this
* instruction.
*/
};
struct schedule_state {
- /* List of struct schedule_node *. This starts out with all
- * instructions, and after dependency updates it's trimmed to be just
- * the DAG heads.
- */
- struct list_head worklist;
+ struct dag *dag;
uint32_t time;
after = t;
}
- for (int i = 0; i < after->child_count; i++) {
- if (after->children[i] == after)
- return;
- }
-
- if (after->child_array_size <= after->child_count) {
- after->child_array_size = MAX2(after->child_array_size * 2, 16);
- after->children = reralloc(after, after->children,
- struct schedule_node *,
- after->child_array_size);
- }
-
- after->children[after->child_count] = before;
- after->child_count++;
- before->parent_count++;
+ dag_add_edge(&after->dag, &before->dag, NULL);
}
static void
* ignore uniforms accesses, because qir_reorder_uniforms() happens
* after this.
*/
- for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
+ for (int i = 0; i < qir_get_nsrc(inst); i++) {
switch (inst->src[i].file) {
case QFILE_TEMP:
add_dep(dir,
add_dep(dir, state->last_vary_read, n);
break;
- case QOP_TEX_S:
- case QOP_TEX_T:
- case QOP_TEX_R:
- case QOP_TEX_B:
- case QOP_TEX_DIRECT:
- /* Texturing setup gets scheduled in order, because
- * the uniforms referenced by them have to land in a
- * specific order.
- */
- add_write_dep(dir, &state->last_tex_coord, n);
- break;
-
case QOP_TEX_RESULT:
/* Results have to be fetched in order. */
add_write_dep(dir, &state->last_tex_result, n);
break;
+ case QOP_THRSW:
+ /* After a new THRSW, one must collect all texture samples
+ * queued since the previous THRSW/program start. For now, we
+ * have one THRSW in between each texture setup and its
+ * results collection as our input, and we just make sure that
+ * that ordering is maintained.
+ */
+ add_write_dep(dir, &state->last_tex_coord, n);
+ add_write_dep(dir, &state->last_tex_result, n);
+
+ /* accumulators and flags are lost across thread switches. */
+ add_write_dep(dir, &state->last_sf, n);
+
+ /* Setup, like the varyings, will need to be drained before we
+ * thread switch.
+ */
+ add_write_dep(dir, &state->last_vary_read, n);
+
+ /* The TLB-locking operations have to stay after the last
+ * thread switch.
+ */
+ add_write_dep(dir, &state->last_tlb, n);
+ break;
+
case QOP_TLB_COLOR_READ:
case QOP_MS_MASK:
add_write_dep(dir, &state->last_tlb, n);
add_write_dep(dir, &state->last_tlb, n);
break;
+ case QFILE_TEX_S_DIRECT:
+ case QFILE_TEX_S:
+ case QFILE_TEX_T:
+ case QFILE_TEX_R:
+ case QFILE_TEX_B:
+ /* Texturing setup gets scheduled in order, because
+ * the uniforms referenced by them have to land in a
+ * specific order.
+ */
+ add_write_dep(dir, &state->last_tex_coord, n);
+ break;
+
default:
break;
}
calculate_deps(&state, n);
- for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
+ for (int i = 0; i < qir_get_nsrc(inst); i++) {
switch (inst->src[i].file) {
case QFILE_UNIF:
add_dep(state.dir, state.last_uniforms_reset, n);
}
}
- switch (inst->op) {
- case QOP_TEX_S:
- case QOP_TEX_T:
- case QOP_TEX_R:
- case QOP_TEX_B:
- case QOP_TEX_DIRECT:
+ switch (inst->dst.file) {
+ case QFILE_TEX_S_DIRECT:
+ case QFILE_TEX_S:
+ case QFILE_TEX_T:
+ case QFILE_TEX_R:
+ case QFILE_TEX_B:
/* From the VC4 spec:
*
* "The TFREQ input FIFO holds two full lots of s,
* If the texture coordinate fifo is full, block this
* on the last QOP_TEX_RESULT.
*/
- if (state.tfreq_count == 8) {
+ if (state.tfreq_count == (c->fs_threaded ? 4 : 8)) {
block_until_tex_result(&state, n);
}
* If the texture result fifo is full, block adding
* any more to it until the last QOP_TEX_RESULT.
*/
- if (inst->op == QOP_TEX_S ||
- inst->op == QOP_TEX_DIRECT) {
- if (state.tfrcv_count == 4)
+ if (inst->dst.file == QFILE_TEX_S ||
+ inst->dst.file == QFILE_TEX_S_DIRECT) {
+ if (state.tfrcv_count ==
+ (c->fs_threaded ? 2 : 4))
block_until_tex_result(&state, n);
state.tfrcv_count++;
}
state.tfreq_count++;
break;
+ default:
+ break;
+ }
+
+ switch (inst->op) {
case QOP_TEX_RESULT:
/* Results have to be fetched after the
* coordinate setup. Note that we're assuming
break;
default:
- assert(!qir_is_tex(inst));
break;
}
}
state->temp_writes[inst->dst.index] == 1)
cost--;
- for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
- if (inst->src[i].file == QFILE_TEMP &&
- !BITSET_TEST(state->temp_live, inst->src[i].index)) {
- cost++;
+ for (int i = 0; i < qir_get_nsrc(inst); i++) {
+ if (inst->src[i].file != QFILE_TEMP ||
+ BITSET_TEST(state->temp_live, inst->src[i].index)) {
+ continue;
}
+
+ bool already_counted = false;
+ for (int j = 0; j < i; j++) {
+ if (inst->src[i].file == inst->src[j].file &&
+ inst->src[i].index == inst->src[j].index) {
+ already_counted = true;
+ }
+ }
+ if (!already_counted)
+ cost++;
}
return cost;
{
struct schedule_node *chosen = NULL;
- list_for_each_entry(struct schedule_node, n, &state->worklist, link) {
+ list_for_each_entry(struct schedule_node, n, &state->dag->heads,
+ dag.link) {
/* The branches aren't being tracked as dependencies. Make
* sure that they stay scheduled as the last instruction of
* the block, which is to say the first one we choose to
dump_state(struct vc4_compile *c, struct schedule_state *state)
{
uint32_t i = 0;
- list_for_each_entry(struct schedule_node, n, &state->worklist, link) {
+ list_for_each_entry(struct schedule_node, n, &state->dag->heads,
+ dag.link) {
fprintf(stderr, "%3d: ", i++);
qir_dump_inst(c, n->inst);
fprintf(stderr, " (%d cost)\n",
get_register_pressure_cost(state, n->inst));
- for (int i = 0; i < n->child_count; i++) {
- struct schedule_node *child = n->children[i];
+ util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
+ struct schedule_node *child =
+ (struct schedule_node *)edge->child;
fprintf(stderr, " - ");
qir_dump_inst(c, child->inst);
- fprintf(stderr, " (%d parents)\n", child->parent_count);
+ fprintf(stderr, " (%d parents)\n",
+ child->dag.parent_count);
}
}
}
static uint32_t
latency_between(struct schedule_node *before, struct schedule_node *after)
{
- if ((before->inst->op == QOP_TEX_S ||
- before->inst->op == QOP_TEX_DIRECT) &&
+ if ((before->inst->dst.file == QFILE_TEX_S ||
+ before->inst->dst.file == QFILE_TEX_S_DIRECT) &&
after->inst->op == QOP_TEX_RESULT)
return 100;
+ switch (before->inst->op) {
+ case QOP_RCP:
+ case QOP_RSQ:
+ case QOP_EXP2:
+ case QOP_LOG2:
+ for (int i = 0; i < qir_get_nsrc(after->inst); i++) {
+ if (after->inst->src[i].file ==
+ before->inst->dst.file &&
+ after->inst->src[i].index ==
+ before->inst->dst.index) {
+ /* There are two QPU delay slots before we can
+ * read a math result, which could be up to 4
+ * QIR instructions if they packed well.
+ */
+ return 4;
+ }
+ }
+ break;
+ default:
+ break;
+ }
+
return 1;
}
/** Recursive computation of the delay member of a node. */
static void
-compute_delay(struct schedule_node *n)
+compute_delay(struct dag_node *node, void *state)
{
- if (!n->child_count) {
- /* The color read needs to be scheduled late, to avoid locking
- * the scoreboard early. This is our best tool for
- * encouraging that. The other scoreboard locking ops will
- * have this happen by default, since they are generally the
- * DAG heads or close to them.
- */
- if (n->inst->op == QOP_TLB_COLOR_READ)
- n->delay = 1000;
- else
- n->delay = 1;
- } else {
- for (int i = 0; i < n->child_count; i++) {
- if (!n->children[i]->delay)
- compute_delay(n->children[i]);
- n->delay = MAX2(n->delay,
- n->children[i]->delay +
- latency_between(n, n->children[i]));
- }
+ struct schedule_node *n = (struct schedule_node *)node;
+
+ /* The color read needs to be scheduled late, to avoid locking the
+ * scoreboard early. This is our best tool for encouraging that. The
+ * other scoreboard locking ops will have this happen by default,
+ * since they are generally the DAG heads or close to them.
+ */
+ if (n->inst->op == QOP_TLB_COLOR_READ)
+ n->delay = 1000;
+ else
+ n->delay = 1;
+
+ util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
+ struct schedule_node *child =
+ (struct schedule_node *)edge->child;
+
+ n->delay = MAX2(n->delay, (child->delay +
+ latency_between(child, n)));
}
}
dump_state(c, state);
}
- /* Remove non-DAG heads from the list. */
- list_for_each_entry_safe(struct schedule_node, n,
- &state->worklist, link) {
- if (n->parent_count != 0)
- list_del(&n->link);
- }
-
state->time = 0;
- while (!list_empty(&state->worklist)) {
+ while (!list_is_empty(&state->dag->heads)) {
struct schedule_node *chosen = choose_instruction(state);
struct qinst *inst = chosen->inst;
state->time = MAX2(state->time, chosen->unblocked_time);
/* Schedule this instruction back onto the QIR list. */
- list_del(&chosen->link);
list_add(&inst->link, &block->instructions);
/* Now that we've scheduled a new instruction, some of its
* be scheduled. Update the children's unblocked time for this
* DAG edge as we do so.
*/
- for (int i = chosen->child_count - 1; i >= 0; i--) {
- struct schedule_node *child = chosen->children[i];
+ util_dynarray_foreach(&chosen->dag.edges,
+ struct dag_edge, edge) {
+ struct schedule_node *child =
+ (struct schedule_node *)edge->child;
child->unblocked_time = MAX2(child->unblocked_time,
state->time +
- latency_between(chosen,
- child));
- child->parent_count--;
- if (child->parent_count == 0)
- list_add(&child->link, &state->worklist);
+ latency_between(child,
+ chosen));
}
+ dag_prune_head(state->dag, &chosen->dag);
/* Update our tracking of register pressure. */
- for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
+ for (int i = 0; i < qir_get_nsrc(inst); i++) {
if (inst->src[i].file == QFILE_TEMP)
BITSET_SET(state->temp_live, inst->src[i].index);
}
qir_schedule_instructions_block(struct vc4_compile *c,
struct qblock *block)
{
- void *mem_ctx = ralloc_context(NULL);
- struct schedule_state state = { { 0 } };
+ struct schedule_state *state = rzalloc(NULL, struct schedule_state);
+
+ state->temp_writes = rzalloc_array(state, uint32_t, c->num_temps);
+ state->temp_live = rzalloc_array(state, BITSET_WORD,
+ BITSET_WORDS(c->num_temps));
+ state->dag = dag_create(state);
- state.temp_writes = rzalloc_array(mem_ctx, uint32_t, c->num_temps);
- state.temp_live = rzalloc_array(mem_ctx, BITSET_WORD,
- BITSET_WORDS(c->num_temps));
- list_inithead(&state.worklist);
+ struct list_head setup_list;
+ list_inithead(&setup_list);
/* Wrap each instruction in a scheduler structure. */
qir_for_each_inst_safe(inst, block) {
- struct schedule_node *n = rzalloc(mem_ctx, struct schedule_node);
+ struct schedule_node *n = rzalloc(state, struct schedule_node);
n->inst = inst;
list_del(&inst->link);
- list_addtail(&n->link, &state.worklist);
+ list_addtail(&n->link, &setup_list);
+ dag_init_node(state->dag, &n->dag);
if (inst->dst.file == QFILE_TEMP)
- state.temp_writes[inst->dst.index]++;
+ state->temp_writes[inst->dst.index]++;
}
/* Dependencies tracked top-to-bottom. */
- calculate_forward_deps(c, mem_ctx, &state.worklist);
+ calculate_forward_deps(c, state, &setup_list);
/* Dependencies tracked bottom-to-top. */
- calculate_reverse_deps(c, mem_ctx, &state.worklist);
+ calculate_reverse_deps(c, state, &setup_list);
- list_for_each_entry(struct schedule_node, n, &state.worklist, link)
- compute_delay(n);
+ dag_traverse_bottom_up(state->dag, compute_delay, NULL);
- schedule_instructions(c, block, &state);
+ schedule_instructions(c, block, state);
- ralloc_free(mem_ctx);
+ ralloc_free(state);
}
void