X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fgallium%2Fdrivers%2Fvc4%2Fvc4_qpu_schedule.c;h=5f01f8c7093a58c0a4644580cfe493f03266be75;hb=4087b5343d8d85f46ead6656f58c94cec0289cb0;hp=45360f73410c01c81f47389912923dc9fa2954e6;hpb=1ee503c74dafe504acf1ca2751bf4fdb326da049;p=mesa.git

diff --git a/src/gallium/drivers/vc4/vc4_qpu_schedule.c b/src/gallium/drivers/vc4/vc4_qpu_schedule.c
index 45360f73410..5f01f8c7093 100644
--- a/src/gallium/drivers/vc4/vc4_qpu_schedule.c
+++ b/src/gallium/drivers/vc4/vc4_qpu_schedule.c
@@ -37,18 +37,16 @@
 #include "vc4_qir.h"
 #include "vc4_qpu.h"
 #include "util/ralloc.h"
+#include "util/dag.h"
 
 static bool debug;
 
 struct schedule_node_child;
 
 struct schedule_node {
+        struct dag_node dag;
         struct list_head link;
         struct queued_qpu_inst *inst;
-        struct schedule_node_child *children;
-        uint32_t child_count;
-        uint32_t child_array_size;
-        uint32_t parent_count;
 
         /* Longest cycles + instruction_latency() of any parent of this node. */
         uint32_t unblocked_time;
@@ -73,17 +71,13 @@ struct schedule_node {
         int uniform;
 };
 
-struct schedule_node_child {
-        struct schedule_node *node;
-        bool write_after_read;
-};
-
 /* When walking the instructions in reverse, we need to swap before/after in
  * add_dep().
  */
 enum direction { F, R };
 
 struct schedule_state {
+        struct dag *dag;
         struct schedule_node *last_r[6];
         struct schedule_node *last_ra[32];
         struct schedule_node *last_rb[32];
@@ -105,37 +99,17 @@ add_dep(struct schedule_state *state,
         bool write)
 {
         bool write_after_read = !write && state->dir == R;
+        void *edge_data = (void *)(uintptr_t)write_after_read;
 
         if (!before || !after)
                 return;
 
         assert(before != after);
 
-        if (state->dir == R) {
-                struct schedule_node *t = before;
-                before = after;
-                after = t;
-        }
-
-        for (int i = 0; i < before->child_count; i++) {
-                if (before->children[i].node == after &&
-                    (before->children[i].write_after_read == write_after_read)) {
-                        return;
-                }
-        }
-
-        if (before->child_array_size <= before->child_count) {
-                before->child_array_size = MAX2(before->child_array_size * 2, 16);
-                before->children = reralloc(before, before->children,
-                                            struct schedule_node_child,
-                                            before->child_array_size);
-        }
-
-        before->children[before->child_count].node = after;
-        before->children[before->child_count].write_after_read =
-                write_after_read;
-        before->child_count++;
-        after->parent_count++;
+        if (state->dir == F)
+                dag_add_edge(&before->dag, &after->dag, edge_data);
+        else
+                dag_add_edge(&after->dag, &before->dag, edge_data);
 }
 
 static void
@@ -438,11 +412,13 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n)
 }
 
 static void
-calculate_forward_deps(struct vc4_compile *c, struct list_head *schedule_list)
+calculate_forward_deps(struct vc4_compile *c, struct dag *dag,
+                       struct list_head *schedule_list)
 {
         struct schedule_state state;
 
         memset(&state, 0, sizeof(state));
+        state.dag = dag;
         state.dir = F;
 
         list_for_each_entry(struct schedule_node, node, schedule_list, link)
@@ -450,20 +426,23 @@ calculate_forward_deps(struct vc4_compile *c, struct list_head *schedule_list)
 }
 
 static void
-calculate_reverse_deps(struct vc4_compile *c, struct list_head *schedule_list)
+calculate_reverse_deps(struct vc4_compile *c, struct dag *dag,
+                       struct list_head *schedule_list)
 {
-        struct list_head *node;
         struct schedule_state state;
 
         memset(&state, 0, sizeof(state));
+        state.dag = dag;
         state.dir = R;
 
-        for (node = schedule_list->prev; schedule_list != node; node = node->prev) {
+        list_for_each_entry_rev(struct schedule_node, node, schedule_list,
+                                link) {
                 calculate_deps(&state, (struct schedule_node *)node);
         }
 }
 
 struct choose_scoreboard {
+        struct dag *dag;
         int tick;
         int last_sfu_write_tick;
         int last_uniforms_reset_tick;
@@ -477,6 +456,11 @@ reads_too_soon_after_write(struct choose_scoreboard *scoreboard, uint64_t inst)
         uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
         uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
         uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
+
+        /* Full immediate loads don't read any registers. */
+        if (sig == QPU_SIG_LOAD_IMM)
+                return false;
+
         uint32_t src_muxes[] = {
                 QPU_GET_FIELD(inst, QPU_ADD_A),
                 QPU_GET_FIELD(inst, QPU_ADD_B),
@@ -570,16 +554,30 @@ choose_instruction_to_schedule(struct choose_scoreboard *scoreboard,
         struct schedule_node *chosen = NULL;
         int chosen_prio = 0;
 
-        list_for_each_entry(struct schedule_node, n, schedule_list, link) {
+        /* Don't pair up anything with a thread switch signal -- emit_thrsw()
+         * will handle pairing it along with filling the delay slots.
+         */
+        if (prev_inst) {
+                uint32_t prev_sig = QPU_GET_FIELD(prev_inst->inst->inst,
+                                                  QPU_SIG);
+                if (prev_sig == QPU_SIG_THREAD_SWITCH ||
+                    prev_sig == QPU_SIG_LAST_THREAD_SWITCH) {
+                        return NULL;
+                }
+        }
+
+        list_for_each_entry(struct schedule_node, n, &scoreboard->dag->heads,
+                            dag.link) {
                 uint64_t inst = n->inst->inst;
+                uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
 
                 /* Don't choose the branch instruction until it's the last one
                  * left. XXX: We could potentially choose it before it's the
                  * last one, if the remaining instructions fit in the delay
                  * slots.
                  */
-                if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_BRANCH &&
-                    !list_is_singular(schedule_list)) {
+                if (sig == QPU_SIG_BRANCH &&
+                    !list_is_singular(&scoreboard->dag->heads)) {
                         continue;
                 }
 
@@ -602,6 +600,14 @@ choose_instruction_to_schedule(struct choose_scoreboard *scoreboard,
                  * that they're compatible.
                  */
                 if (prev_inst) {
+                        /* Don't pair up a thread switch signal -- we'll
+                         * handle pairing it when we pick it on its own.
+                         */
+                        if (sig == QPU_SIG_THREAD_SWITCH ||
+                            sig == QPU_SIG_LAST_THREAD_SWITCH) {
+                                continue;
+                        }
+
                         if (prev_inst->uniform != -1 && n->uniform != -1)
                                 continue;
 
@@ -677,23 +683,24 @@ update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
 }
 
 static void
-dump_state(struct list_head *schedule_list)
+dump_state(struct dag *dag)
 {
-        list_for_each_entry(struct schedule_node, n, schedule_list, link) {
+        list_for_each_entry(struct schedule_node, n, &dag->heads, dag.link) {
                 fprintf(stderr, "         t=%4d: ", n->unblocked_time);
                 vc4_qpu_disasm(&n->inst->inst, 1);
                 fprintf(stderr, "\n");
 
-                for (int i = 0; i < n->child_count; i++) {
-                        struct schedule_node *child = n->children[i].node;
+                util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
+                        struct schedule_node *child =
+                                (struct schedule_node *)edge->child;
                         if (!child)
                                 continue;
 
                         fprintf(stderr, "                 - ");
                         vc4_qpu_disasm(&child->inst->inst, 1);
                         fprintf(stderr, " (%d parents, %c)\n",
-                                child->parent_count,
-                                n->children[i].write_after_read ? 'w' : 'r');
+                                child->dag.parent_count,
+                                edge->data ? 'w' : 'r');
                 }
         }
 }
@@ -705,6 +712,26 @@ static uint32_t waddr_latency(uint32_t waddr, uint64_t after)
 
         /* Apply some huge latency between texture fetch requests and getting
          * their results back.
+         *
+         * FIXME: This is actually pretty bogus. If we do:
+         *
+         * mov tmu0_s, a
+         * <a bit of math between>
+         * mov tmu0_s, b
+         * load_tmu0
+         * <more math>
+         * load_tmu0
+         *
+         * we count that as worse than
+         *
+         * mov tmu0_s, a
+         * mov tmu0_s, b
+         * <lots of math>
+         * load_tmu0
+         * <more math>
+         * load_tmu0
+         *
+         * because we associate the first load_tmu0 with the *second* tmu0_s.
          */
         if (waddr == QPU_W_TMU0_S) {
                 if (QPU_GET_FIELD(after, QPU_SIG) == QPU_SIG_LOAD_TMU0)
@@ -740,58 +767,99 @@ instruction_latency(struct schedule_node *before, struct schedule_node *after)
 
 /** Recursive computation of the delay member of a node. */
 static void
-compute_delay(struct schedule_node *n)
+compute_delay(struct dag_node *node, void *state)
 {
-        if (!n->child_count) {
-                n->delay = 1;
-        } else {
-                for (int i = 0; i < n->child_count; i++) {
-                        if (!n->children[i].node->delay)
-                                compute_delay(n->children[i].node);
-                        n->delay = MAX2(n->delay,
-                                        n->children[i].node->delay +
-                                        instruction_latency(n, n->children[i].node));
-                }
+        struct schedule_node *n = (struct schedule_node *)node;
+
+        n->delay = 1;
+
+        util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
+                struct schedule_node *child =
+                        (struct schedule_node *)edge->child;
+                n->delay = MAX2(n->delay, (child->delay +
+                                           instruction_latency(n, child)));
         }
 }
 
+/* Removes a DAG head, but removing only the WAR edges. (dag_prune_head()
+ * should be called on it later to finish pruning the other edges).
+ */
 static void
-mark_instruction_scheduled(struct list_head *schedule_list,
+pre_remove_head(struct dag *dag, struct schedule_node *n)
+{
+        list_delinit(&n->dag.link);
+
+        util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
+                if (edge->data)
+                        dag_remove_edge(dag, edge);
+        }
+}
+
+static void
+mark_instruction_scheduled(struct dag *dag,
                            uint32_t time,
-                           struct schedule_node *node,
-                           bool war_only)
+                           struct schedule_node *node)
 {
         if (!node)
                 return;
 
-        for (int i = node->child_count - 1; i >= 0; i--) {
+        util_dynarray_foreach(&node->dag.edges, struct dag_edge, edge) {
                 struct schedule_node *child =
-                        node->children[i].node;
+                        (struct schedule_node *)edge->child;
 
                 if (!child)
                         continue;
 
-                if (war_only && !node->children[i].write_after_read)
-                        continue;
-
-                /* If the requirement is only that the node not appear before
-                 * the last read of its destination, then it can be scheduled
-                 * immediately after (or paired with!) the thing reading the
-                 * destination.
-                 */
-                uint32_t latency = 0;
-                if (!war_only) {
-                        latency = instruction_latency(node,
-                                                      node->children[i].node);
-                }
+                uint32_t latency = instruction_latency(node, child);
 
                 child->unblocked_time = MAX2(child->unblocked_time,
                                              time + latency);
-                child->parent_count--;
-                if (child->parent_count == 0)
-                        list_add(&child->link, schedule_list);
+        }
+        dag_prune_head(dag, &node->dag);
+}
 
-                node->children[i].node = NULL;
+/**
+ * Emits a THRSW/LTHRSW signal in the stream, trying to move it up to pair
+ * with another instruction.
+ */
+static void
+emit_thrsw(struct vc4_compile *c,
+           struct choose_scoreboard *scoreboard,
+           uint64_t inst)
+{
+        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
+
+        /* There should be nothing in a thrsw inst being scheduled other than
+         * the signal bits.
+         */
+        assert(QPU_GET_FIELD(inst, QPU_OP_ADD) == QPU_A_NOP);
+        assert(QPU_GET_FIELD(inst, QPU_OP_MUL) == QPU_M_NOP);
+
+        /* Try to find an earlier scheduled instruction that we can merge the
+         * thrsw into.
+         */
+        int thrsw_ip = c->qpu_inst_count;
+        for (int i = 1; i <= MIN2(c->qpu_inst_count, 3); i++) {
+                uint64_t prev_instr = c->qpu_insts[c->qpu_inst_count - i];
+                uint32_t prev_sig = QPU_GET_FIELD(prev_instr, QPU_SIG);
+
+                if (prev_sig == QPU_SIG_NONE)
+                        thrsw_ip = c->qpu_inst_count - i;
+        }
+
+        if (thrsw_ip != c->qpu_inst_count) {
+                /* Merge the thrsw into the existing instruction.
+                 */
+                c->qpu_insts[thrsw_ip] =
+                        QPU_UPDATE_FIELD(c->qpu_insts[thrsw_ip], sig, QPU_SIG);
+        } else {
+                qpu_serialize_one_inst(c, inst);
+                update_scoreboard_for_chosen(scoreboard, inst);
+        }
+
+        /* Fill the delay slots. */
+        while (c->qpu_inst_count < thrsw_ip + 3) {
+                update_scoreboard_for_chosen(scoreboard, qpu_NOP());
+                qpu_serialize_one_inst(c, qpu_NOP());
         }
 }
 
@@ -806,19 +874,7 @@ schedule_instructions(struct vc4_compile *c,
 {
         uint32_t time = 0;
 
-        if (debug) {
-                fprintf(stderr, "initial deps:\n");
-                dump_state(schedule_list);
-                fprintf(stderr, "\n");
-        }
-
-        /* Remove non-DAG heads from the list. */
-        list_for_each_entry_safe(struct schedule_node, n, schedule_list, link) {
-                if (n->parent_count != 0)
-                        list_del(&n->link);
-        }
-
-        while (!list_empty(schedule_list)) {
+        while (!list_is_empty(&scoreboard->dag->heads)) {
                 struct schedule_node *chosen =
                         choose_instruction_to_schedule(scoreboard,
                                                        schedule_list,
@@ -833,7 +889,7 @@ schedule_instructions(struct vc4_compile *c,
 
                 if (debug) {
                         fprintf(stderr, "t=%4d: current list:\n", time);
-                        dump_state(schedule_list);
+                        dump_state(scoreboard->dag);
                         fprintf(stderr, "t=%4d: chose: ", time);
                         vc4_qpu_disasm(&inst, 1);
                         fprintf(stderr, "\n");
@@ -844,9 +900,7 @@ schedule_instructions(struct vc4_compile *c,
                  */
                 if (chosen) {
                         time = MAX2(chosen->unblocked_time, time);
-                        list_del(&chosen->link);
-                        mark_instruction_scheduled(schedule_list, time,
-                                                   chosen, true);
+                        pre_remove_head(scoreboard->dag, chosen);
                         if (chosen->uniform != -1) {
                                 c->uniform_data[*next_uniform] =
                                         orig_uniform_data[chosen->uniform];
@@ -860,7 +914,6 @@ schedule_instructions(struct vc4_compile *c,
                                                       chosen);
                 if (merge) {
                         time = MAX2(merge->unblocked_time, time);
-                        list_del(&merge->link);
                         inst = qpu_merge_inst(inst, merge->inst->inst);
                         assert(inst != 0);
                         if (merge->uniform != -1) {
@@ -887,17 +940,21 @@ schedule_instructions(struct vc4_compile *c,
                         fprintf(stderr, "\n");
                 }
 
-                qpu_serialize_one_inst(c, inst);
-
-                update_scoreboard_for_chosen(scoreboard, inst);
-
                 /* Now that we've scheduled a new instruction, some of its
                  * children can be promoted to the list of instructions ready to
                  * be scheduled. Update the children's unblocked time for this
                  * DAG edge as we do so.
                  */
-                mark_instruction_scheduled(schedule_list, time, chosen, false);
-                mark_instruction_scheduled(schedule_list, time, merge, false);
+                mark_instruction_scheduled(scoreboard->dag, time, chosen);
+                mark_instruction_scheduled(scoreboard->dag, time, merge);
+
+                if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_THREAD_SWITCH ||
+                    QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_LAST_THREAD_SWITCH) {
+                        emit_thrsw(c, scoreboard, inst);
+                } else {
+                        qpu_serialize_one_inst(c, inst);
+                        update_scoreboard_for_chosen(scoreboard, inst);
+                }
 
                 scoreboard->tick++;
                 time++;
@@ -917,16 +974,6 @@ schedule_instructions(struct vc4_compile *c,
                         qpu_serialize_one_inst(c, inst);
                         qpu_serialize_one_inst(c, inst);
                         qpu_serialize_one_inst(c, inst);
-                } else if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_THREAD_SWITCH ||
-                           QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_LAST_THREAD_SWITCH) {
-                        /* The thread switch occurs after two delay slots. We
-                         * should fit things in these slots, but we don't
-                         * currently.
-                         */
-                        inst = qpu_NOP();
-                        update_scoreboard_for_chosen(scoreboard, inst);
-                        qpu_serialize_one_inst(c, inst);
-                        qpu_serialize_one_inst(c, inst);
                 }
         }
 
@@ -941,18 +988,20 @@ qpu_schedule_instructions_block(struct vc4_compile *c,
                                 uint32_t *orig_uniform_data,
                                 uint32_t *next_uniform)
 {
-        void *mem_ctx = ralloc_context(NULL);
-        struct list_head schedule_list;
+        scoreboard->dag = dag_create(NULL);
+        struct list_head setup_list;
 
-        list_inithead(&schedule_list);
+        list_inithead(&setup_list);
 
         /* Wrap each instruction in a scheduler structure. */
         uint32_t next_sched_uniform = *next_uniform;
-        while (!list_empty(&block->qpu_inst_list)) {
+        while (!list_is_empty(&block->qpu_inst_list)) {
                 struct queued_qpu_inst *inst =
                         (struct queued_qpu_inst *)block->qpu_inst_list.next;
-                struct schedule_node *n = rzalloc(mem_ctx, struct schedule_node);
+                struct schedule_node *n = rzalloc(scoreboard->dag,
                                                   struct schedule_node);
+                dag_init_node(scoreboard->dag, &n->dag);
 
                 n->inst = inst;
 
                 if (reads_uniform(inst->inst)) {
@@ -961,23 +1010,22 @@ qpu_schedule_instructions_block(struct vc4_compile *c,
                         n->uniform = -1;
                 }
                 list_del(&inst->link);
-                list_addtail(&n->link, &schedule_list);
+                list_addtail(&n->link, &setup_list);
         }
 
-        calculate_forward_deps(c, &schedule_list);
-        calculate_reverse_deps(c, &schedule_list);
+        calculate_forward_deps(c, scoreboard->dag, &setup_list);
+        calculate_reverse_deps(c, scoreboard->dag, &setup_list);
 
-        list_for_each_entry(struct schedule_node, n, &schedule_list, link) {
-                compute_delay(n);
-        }
+        dag_traverse_bottom_up(scoreboard->dag, compute_delay, NULL);
 
         uint32_t cycles = schedule_instructions(c, scoreboard, block,
-                                                &schedule_list,
+                                                &setup_list,
                                                 orig_uniform_contents,
                                                 orig_uniform_data,
                                                 next_uniform);
 
-        ralloc_free(mem_ctx);
+        ralloc_free(scoreboard->dag);
+        scoreboard->dag = NULL;
 
         return cycles;
 }
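
The hunks above replace the scheduler's hand-rolled child/parent arrays with the shared util/dag helper. What follows is an illustrative sketch only, not part of the patch: a minimal toy showing the same util/dag pattern, assuming Mesa's util headers. The toy_* names are hypothetical; the dag_*() and list_*() calls are the ones the diff itself uses. Each schedulable item embeds a struct dag_node, each dependency edge packs the write-after-read flag into its data pointer the way add_dep() does, dag_traverse_bottom_up() lets a callback fold delays from children into parents, and the scheduling loop drains dag->heads, pruning each scheduled head so its newly unblocked children become heads in turn.

#include <stdbool.h>
#include <stdint.h>

#include "util/dag.h"
#include "util/list.h"
#include "util/macros.h"
#include "util/ralloc.h"

struct toy_node {
        struct dag_node dag;
        int delay;
};

/* Called children-first by dag_traverse_bottom_up(), so every
 * edge->child already has its delay set, as in compute_delay() above.
 */
static void
toy_compute_delay(struct dag_node *node, void *state)
{
        struct toy_node *n = (struct toy_node *)node;

        n->delay = 1;
        util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
                struct toy_node *child = (struct toy_node *)edge->child;

                n->delay = MAX2(n->delay, child->delay + 1);
        }
}

static void
toy_schedule(void)
{
        struct dag *dag = dag_create(NULL);
        struct toy_node *a = rzalloc(dag, struct toy_node);
        struct toy_node *b = rzalloc(dag, struct toy_node);

        dag_init_node(dag, &a->dag);
        dag_init_node(dag, &b->dag);

        /* "a must come before b", flagged as a write-after-read
         * dependency by packing the bool into the edge data pointer,
         * the way add_dep() does in the patch.
         */
        dag_add_edge(&a->dag, &b->dag, (void *)(uintptr_t)true);

        dag_traverse_bottom_up(dag, toy_compute_delay, NULL);

        /* dag->heads lists the nodes with no unscheduled parents.
         * Pruning a head decrements its children's parent counts and
         * promotes the newly unblocked ones onto dag->heads.
         */
        while (!list_is_empty(&dag->heads)) {
                struct toy_node *n = list_first_entry(&dag->heads,
                                                      struct toy_node,
                                                      dag.link);

                /* ...emit n into the instruction stream here... */
                dag_prune_head(dag, &n->dag);
        }

        ralloc_free(dag);
}

Freeing the dag releases the wrapped nodes too, since they were rzalloc'ed against it, which is why the patch can drop its separate mem_ctx.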
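
One more hedged sketch, again not from the patch, isolating the trick emit_thrsw() relies on: a thread-switch instruction carries nothing but its signal bits (the patch asserts both ALU ops are NOPs), so the THRSW/LTHRSW signal can be folded into any already-emitted instruction whose signal slot is still QPU_SIG_NONE. fold_thrsw_into() is a hypothetical helper name; the QPU_* macros and signal values are the ones used above.

#include <assert.h>
#include <stdint.h>

#include "vc4_qpu.h"

static uint64_t
fold_thrsw_into(uint64_t host_inst, uint32_t thrsw_sig)
{
        assert(thrsw_sig == QPU_SIG_THREAD_SWITCH ||
               thrsw_sig == QPU_SIG_LAST_THREAD_SWITCH);

        /* The host instruction must not already be using its signal
         * slot, matching the QPU_SIG_NONE check in emit_thrsw().
         */
        assert(QPU_GET_FIELD(host_inst, QPU_SIG) == QPU_SIG_NONE);

        return QPU_UPDATE_FIELD(host_inst, thrsw_sig, QPU_SIG);
}

If no earlier host instruction is found, emit_thrsw() serializes the thrsw on its own, and its while loop then pads the two delay slots after the switch with NOPs.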