X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fbroadcom%2Fcompiler%2Fqpu_schedule.c;h=b8e04f6ea1303c07d8c7dd723c86ebc60bbc6590;hb=aff8885cf9922516f59391aab7e87170ae9cd906;hp=b404390a799879d8e4b359490eaf69de2d1be64c;hpb=4760040c0980a8921120d517d5e5809f7f0e488c;p=mesa.git diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c index b404390a799..b8e04f6ea13 100644 --- a/src/broadcom/compiler/qpu_schedule.c +++ b/src/broadcom/compiler/qpu_schedule.c @@ -37,18 +37,16 @@ #include "qpu/qpu_disasm.h" #include "v3d_compiler.h" #include "util/ralloc.h" +#include "util/dag.h" static bool debug; struct schedule_node_child; struct schedule_node { + struct dag_node dag; struct list_head link; struct qinst *inst; - struct schedule_node_child *children; - uint32_t child_count; - uint32_t child_array_size; - uint32_t parent_count; /* Longest cycles + instruction_latency() of any parent of this node. */ uint32_t unblocked_time; @@ -67,11 +65,6 @@ struct schedule_node { uint32_t latency; }; -struct schedule_node_child { - struct schedule_node *node; - bool write_after_read; -}; - /* When walking the instructions in reverse, we need to swap before/after in * add_dep(). */ @@ -79,6 +72,7 @@ enum direction { F, R }; struct schedule_state { const struct v3d_device_info *devinfo; + struct dag *dag; struct schedule_node *last_r[6]; struct schedule_node *last_rf[64]; struct schedule_node *last_sf; @@ -101,37 +95,17 @@ add_dep(struct schedule_state *state, bool write) { bool write_after_read = !write && state->dir == R; + void *edge_data = (void *)(uintptr_t)write_after_read; if (!before || !after) return; assert(before != after); - if (state->dir == R) { - struct schedule_node *t = before; - before = after; - after = t; - } - - for (int i = 0; i < before->child_count; i++) { - if (before->children[i].node == after && - (before->children[i].write_after_read == write_after_read)) { - return; - } - } - - if (before->child_array_size <= before->child_count) { - before->child_array_size = MAX2(before->child_array_size * 2, 16); - before->children = reralloc(before, before->children, - struct schedule_node_child, - before->child_array_size); - } - - before->children[before->child_count].node = after; - before->children[before->child_count].write_after_read = - write_after_read; - before->child_count++; - after->parent_count++; + if (state->dir == F) + dag_add_edge(&before->dag, &after->dag, edge_data); + else + dag_add_edge(&after->dag, &before->dag, edge_data); } static void @@ -154,6 +128,9 @@ add_write_dep(struct schedule_state *state, static bool qpu_inst_is_tlb(const struct v3d_qpu_instr *inst) { + if (inst->sig.ldtlb || inst->sig.ldtlbu) + return true; + if (inst->type != V3D_QPU_INSTR_TYPE_ALU) return false; @@ -179,7 +156,10 @@ process_mux_deps(struct schedule_state *state, struct schedule_node *n, add_read_dep(state, state->last_rf[n->inst->qpu.raddr_a], n); break; case V3D_QPU_MUX_B: - add_read_dep(state, state->last_rf[n->inst->qpu.raddr_b], n); + if (!n->inst->qpu.sig.small_imm) { + add_read_dep(state, + state->last_rf[n->inst->qpu.raddr_b], n); + } break; default: add_read_dep(state, state->last_r[mux - V3D_QPU_MUX_R0], n); @@ -195,6 +175,9 @@ process_waddr_deps(struct schedule_state *state, struct schedule_node *n, if (!magic) { add_write_dep(state, &state->last_rf[waddr], n); } else if (v3d_qpu_magic_waddr_is_tmu(waddr)) { + /* XXX perf: For V3D 4.x, we could reorder TMU writes other + * than the TMUS/TMUD/TMUA to improve scheduling flexibility. 
+ */ add_write_dep(state, &state->last_tmu_write, n); switch (waddr) { case V3D_QPU_WADDR_TMUS: @@ -233,6 +216,16 @@ process_waddr_deps(struct schedule_state *state, struct schedule_node *n, add_write_dep(state, &state->last_tlb, n); break; + case V3D_QPU_WADDR_SYNC: + case V3D_QPU_WADDR_SYNCB: + case V3D_QPU_WADDR_SYNCU: + /* For CS barrier(): Sync against any other memory + * accesses. There doesn't appear to be any need for + * barriers to affect ALU operations. + */ + add_write_dep(state, &state->last_tmu_write, n); + break; + case V3D_QPU_WADDR_NOP: break; @@ -243,30 +236,6 @@ process_waddr_deps(struct schedule_state *state, struct schedule_node *n, } } -static void -process_cond_deps(struct schedule_state *state, struct schedule_node *n, - enum v3d_qpu_cond cond) -{ - if (cond != V3D_QPU_COND_NONE) - add_read_dep(state, state->last_sf, n); -} - -static void -process_pf_deps(struct schedule_state *state, struct schedule_node *n, - enum v3d_qpu_pf pf) -{ - if (pf != V3D_QPU_PF_NONE) - add_write_dep(state, &state->last_sf, n); -} - -static void -process_uf_deps(struct schedule_state *state, struct schedule_node *n, - enum v3d_qpu_uf uf) -{ - if (uf != V3D_QPU_UF_NONE) - add_write_dep(state, &state->last_sf, n); -} - /** * Common code for dependencies that need to be tracked both forward and * backward. @@ -280,6 +249,11 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n) const struct v3d_device_info *devinfo = state->devinfo; struct qinst *qinst = n->inst; struct v3d_qpu_instr *inst = &qinst->qpu; + /* If the input and output segments are shared, then all VPM reads to + * a location need to happen before all writes. We handle this by + * serializing all VPM operations for now. + */ + bool separate_vpm_segment = false; if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) { if (inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS) @@ -321,6 +295,14 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n) add_write_dep(state, &state->last_vpm, n); break; + case V3D_QPU_A_LDVPMV_IN: + case V3D_QPU_A_LDVPMD_IN: + case V3D_QPU_A_LDVPMG_IN: + case V3D_QPU_A_LDVPMP: + if (!separate_vpm_segment) + add_write_dep(state, &state->last_vpm, n); + break; + case V3D_QPU_A_VPMWT: add_read_dep(state, state->last_vpm, n); break; @@ -334,19 +316,6 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n) add_write_dep(state, &state->last_tlb, n); break; - case V3D_QPU_A_FLAPUSH: - case V3D_QPU_A_FLBPUSH: - case V3D_QPU_A_VFLA: - case V3D_QPU_A_VFLNA: - case V3D_QPU_A_VFLB: - case V3D_QPU_A_VFLNB: - add_read_dep(state, state->last_sf, n); - break; - - case V3D_QPU_A_FLBPOP: - add_write_dep(state, &state->last_sf, n); - break; - default: break; } @@ -392,6 +361,7 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n) for (int i = 0; i < ARRAY_SIZE(state->last_r); i++) add_write_dep(state, &state->last_r[i], n); add_write_dep(state, &state->last_sf, n); + add_write_dep(state, &state->last_rtop, n); /* Scoreboard-locking operations have to stay after the last * thread switch. @@ -402,7 +372,7 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n) add_write_dep(state, &state->last_tmu_config, n); } - if (inst->sig.ldtmu) { + if (v3d_qpu_waits_on_tmu(inst)) { /* TMU loads are coming from a FIFO, so ordering is important. 
*/ add_write_dep(state, &state->last_tmu_write, n); @@ -412,29 +382,36 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n) add_write_dep(state, &state->last_tmu_config, n); if (inst->sig.ldtlb | inst->sig.ldtlbu) - add_read_dep(state, state->last_tlb, n); + add_write_dep(state, &state->last_tlb, n); - if (inst->sig.ldvpm) + if (inst->sig.ldvpm) { add_write_dep(state, &state->last_vpm_read, n); + /* At least for now, we're doing shared I/O segments, so queue + * all writes after all reads. + */ + if (!separate_vpm_segment) + add_write_dep(state, &state->last_vpm, n); + } + /* inst->sig.ldunif or sideband uniform read */ - if (qinst->uniform != ~0) + if (vir_has_uniform(qinst)) add_write_dep(state, &state->last_unif, n); - process_cond_deps(state, n, inst->flags.ac); - process_cond_deps(state, n, inst->flags.mc); - process_pf_deps(state, n, inst->flags.apf); - process_pf_deps(state, n, inst->flags.mpf); - process_uf_deps(state, n, inst->flags.auf); - process_uf_deps(state, n, inst->flags.muf); + if (v3d_qpu_reads_flags(inst)) + add_read_dep(state, state->last_sf, n); + if (v3d_qpu_writes_flags(inst)) + add_write_dep(state, &state->last_sf, n); } static void -calculate_forward_deps(struct v3d_compile *c, struct list_head *schedule_list) +calculate_forward_deps(struct v3d_compile *c, struct dag *dag, + struct list_head *schedule_list) { struct schedule_state state; memset(&state, 0, sizeof(state)); + state.dag = dag; state.devinfo = c->devinfo; state.dir = F; @@ -443,26 +420,29 @@ calculate_forward_deps(struct v3d_compile *c, struct list_head *schedule_list) } static void -calculate_reverse_deps(struct v3d_compile *c, struct list_head *schedule_list) +calculate_reverse_deps(struct v3d_compile *c, struct dag *dag, + struct list_head *schedule_list) { - struct list_head *node; struct schedule_state state; memset(&state, 0, sizeof(state)); + state.dag = dag; state.devinfo = c->devinfo; state.dir = R; - for (node = schedule_list->prev; schedule_list != node; node = node->prev) { + list_for_each_entry_rev(struct schedule_node, node, schedule_list, + link) { calculate_deps(&state, (struct schedule_node *)node); } } struct choose_scoreboard { + struct dag *dag; int tick; - int last_sfu_write_tick; + int last_magic_sfu_write_tick; int last_ldvary_tick; int last_uniforms_reset_tick; - uint32_t last_waddr_add, last_waddr_mul; + int last_thrsw_tick; bool tlb_locked; }; @@ -471,22 +451,8 @@ mux_reads_too_soon(struct choose_scoreboard *scoreboard, const struct v3d_qpu_instr *inst, enum v3d_qpu_mux mux) { switch (mux) { - case V3D_QPU_MUX_A: - if (scoreboard->last_waddr_add == inst->raddr_a || - scoreboard->last_waddr_mul == inst->raddr_a) { - return true; - } - break; - - case V3D_QPU_MUX_B: - if (scoreboard->last_waddr_add == inst->raddr_b || - scoreboard->last_waddr_mul == inst->raddr_b) { - return true; - } - break; - case V3D_QPU_MUX_R4: - if (scoreboard->tick - scoreboard->last_sfu_write_tick <= 2) + if (scoreboard->tick - scoreboard->last_magic_sfu_write_tick <= 2) return true; break; @@ -551,7 +517,7 @@ writes_too_soon_after_write(const struct v3d_device_info *devinfo, * This would normally be prevented by dependency tracking, but might * occur if a dead SFU computation makes it to scheduling. 
*/ - if (scoreboard->tick - scoreboard->last_sfu_write_tick < 2 && + if (scoreboard->tick - scoreboard->last_magic_sfu_write_tick < 2 && v3d_qpu_writes_r4(devinfo, inst)) return true; @@ -579,10 +545,14 @@ get_instruction_priority(const struct v3d_qpu_instr *inst) next_score++; /* Schedule texture read results collection late to hide latency. */ - if (inst->sig.ldtmu) + if (v3d_qpu_waits_on_tmu(inst)) return next_score; next_score++; + /* XXX perf: We should schedule SFU ALU ops so that the reader is 2 + * instructions after the producer if possible, not just 1. + */ + /* Default score for things that aren't otherwise special. */ baseline_score = next_score; next_score++; @@ -610,6 +580,8 @@ qpu_accesses_peripheral(const struct v3d_qpu_instr *inst) { if (v3d_qpu_uses_vpm(inst)) return true; + if (v3d_qpu_uses_sfu(inst)) + return true; if (inst->type == V3D_QPU_INSTR_TYPE_ALU) { if (inst->alu.add.op != V3D_QPU_A_NOP && @@ -618,6 +590,9 @@ qpu_accesses_peripheral(const struct v3d_qpu_instr *inst) return true; } + if (inst->alu.add.op == V3D_QPU_A_TMUWT) + return true; + if (inst->alu.mul.op != V3D_QPU_M_NOP && inst->alu.mul.magic_write && qpu_magic_waddr_is_periph(inst->alu.mul.waddr)) { @@ -632,6 +607,37 @@ qpu_accesses_peripheral(const struct v3d_qpu_instr *inst) inst->sig.wrtmuc); } +static bool +qpu_compatible_peripheral_access(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *a, + const struct v3d_qpu_instr *b) +{ + const bool a_uses_peripheral = qpu_accesses_peripheral(a); + const bool b_uses_peripheral = qpu_accesses_peripheral(b); + + /* We can always do one peripheral access per instruction. */ + if (!a_uses_peripheral || !b_uses_peripheral) + return true; + + if (devinfo->ver < 41) + return false; + + /* V3D 4.1 and later allow TMU read along with a VPM read or write, and + * WRTMUC with a TMU magic register write (other than tmuc). + */ + if ((a->sig.ldtmu && v3d_qpu_uses_vpm(b)) || + (b->sig.ldtmu && v3d_qpu_uses_vpm(a))) { + return true; + } + + if ((a->sig.wrtmuc && v3d_qpu_writes_tmu_not_tmuc(b)) || + (b->sig.wrtmuc && v3d_qpu_writes_tmu_not_tmuc(a))) { + return true; + } + + return false; +} + static bool qpu_merge_inst(const struct v3d_device_info *devinfo, struct v3d_qpu_instr *result, @@ -643,12 +649,7 @@ qpu_merge_inst(const struct v3d_device_info *devinfo, return false; } - /* Can't do more than one peripheral access in an instruction. - * - * XXX: V3D 4.1 allows TMU read along with a VPM read or write, and - * WRTMUC with a TMU magic register write (other than tmuc). 
- */ - if (qpu_accesses_peripheral(a) && qpu_accesses_peripheral(b)) + if (!qpu_compatible_peripheral_access(devinfo, a, b)) return false; struct v3d_qpu_instr merge = *a; @@ -683,7 +684,8 @@ qpu_merge_inst(const struct v3d_device_info *devinfo, if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_B)) { if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_B) && - a->raddr_b != b->raddr_b) { + (a->raddr_b != b->raddr_b || + a->sig.small_imm != b->sig.small_imm)) { return false; } merge.raddr_b = b->raddr_b; @@ -723,7 +725,6 @@ qpu_merge_inst(const struct v3d_device_info *devinfo, static struct schedule_node * choose_instruction_to_schedule(const struct v3d_device_info *devinfo, struct choose_scoreboard *scoreboard, - struct list_head *schedule_list, struct schedule_node *prev_inst) { struct schedule_node *chosen = NULL; @@ -737,7 +738,8 @@ choose_instruction_to_schedule(const struct v3d_device_info *devinfo, return NULL; } - list_for_each_entry(struct schedule_node, n, schedule_list, link) { + list_for_each_entry(struct schedule_node, n, &scoreboard->dag->heads, + dag.link) { const struct v3d_qpu_instr *inst = &n->inst->qpu; /* Don't choose the branch instruction until it's the last one @@ -745,7 +747,7 @@ choose_instruction_to_schedule(const struct v3d_device_info *devinfo, * choose it. */ if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH && - !list_is_singular(schedule_list)) { + !list_is_singular(&scoreboard->dag->heads)) { continue; } @@ -771,6 +773,12 @@ choose_instruction_to_schedule(const struct v3d_device_info *devinfo, * sooner. If the ldvary's r5 wasn't used, then ldunif might * otherwise get scheduled so ldunif and ldvary try to update * r5 in the same tick. + * + * XXX perf: To get good pipelining of a sequence of varying + * loads, we need to figure out how to pair the ldvary signal + * up to the instruction before the last r5 user in the + * previous ldvary sequence. Currently, it usually pairs with + * the last r5 user. 
*/ if ((inst->sig.ldunif || inst->sig.ldunifa) && scoreboard->tick == scoreboard->last_ldvary_tick + 1) { @@ -840,16 +848,13 @@ update_scoreboard_for_magic_waddr(struct choose_scoreboard *scoreboard, enum v3d_qpu_waddr waddr) { if (v3d_qpu_magic_waddr_is_sfu(waddr)) - scoreboard->last_sfu_write_tick = scoreboard->tick; + scoreboard->last_magic_sfu_write_tick = scoreboard->tick; } static void update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard, const struct v3d_qpu_instr *inst) { - scoreboard->last_waddr_add = ~0; - scoreboard->last_waddr_mul = ~0; - if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) return; @@ -859,8 +864,6 @@ update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard, if (inst->alu.add.magic_write) { update_scoreboard_for_magic_waddr(scoreboard, inst->alu.add.waddr); - } else { - scoreboard->last_waddr_add = inst->alu.add.waddr; } } @@ -868,8 +871,6 @@ update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard, if (inst->alu.mul.magic_write) { update_scoreboard_for_magic_waddr(scoreboard, inst->alu.mul.waddr); - } else { - scoreboard->last_waddr_mul = inst->alu.mul.waddr; } } @@ -881,24 +882,24 @@ update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard, } static void -dump_state(const struct v3d_device_info *devinfo, - struct list_head *schedule_list) +dump_state(const struct v3d_device_info *devinfo, struct dag *dag) { - list_for_each_entry(struct schedule_node, n, schedule_list, link) { + list_for_each_entry(struct schedule_node, n, &dag->heads, dag.link) { fprintf(stderr, " t=%4d: ", n->unblocked_time); v3d_qpu_dump(devinfo, &n->inst->qpu); fprintf(stderr, "\n"); - for (int i = 0; i < n->child_count; i++) { - struct schedule_node *child = n->children[i].node; + util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) { + struct schedule_node *child = + (struct schedule_node *)edge->child; if (!child) continue; fprintf(stderr, " - "); v3d_qpu_dump(devinfo, &child->inst->qpu); fprintf(stderr, " (%d parents, %c)\n", - child->parent_count, - n->children[i].write_after_read ? 'w' : 'r'); + child->dag.parent_count, + edge->data ? 'w' : 'r'); } } } @@ -929,7 +930,7 @@ static uint32_t magic_waddr_latency(enum v3d_qpu_waddr waddr, * * because we associate the first load_tmu0 with the *second* tmu0_s. */ - if (v3d_qpu_magic_waddr_is_tmu(waddr) && after->sig.ldtmu) + if (v3d_qpu_magic_waddr_is_tmu(waddr) && v3d_qpu_waits_on_tmu(after)) return 100; /* Assume that anything depending on us is consuming the SFU result. */ @@ -967,59 +968,56 @@ instruction_latency(struct schedule_node *before, struct schedule_node *after) /** Recursive computation of the delay member of a node. */ static void -compute_delay(struct schedule_node *n) +compute_delay(struct dag_node *node, void *state) { - if (!n->child_count) { - n->delay = 1; - } else { - for (int i = 0; i < n->child_count; i++) { - if (!n->children[i].node->delay) - compute_delay(n->children[i].node); - n->delay = MAX2(n->delay, - n->children[i].node->delay + - instruction_latency(n, n->children[i].node)); - } + struct schedule_node *n = (struct schedule_node *)node; + + n->delay = 1; + + util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) { + struct schedule_node *child = + (struct schedule_node *)edge->child; + + n->delay = MAX2(n->delay, (child->delay + + instruction_latency(n, child))); } } +/* Removes a DAG head, but removing only the WAR edges. (dag_prune_head() + * should be called on it later to finish pruning the other edges). 
+ */ static void -mark_instruction_scheduled(struct list_head *schedule_list, +pre_remove_head(struct dag *dag, struct schedule_node *n) +{ + list_delinit(&n->dag.link); + + util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) { + if (edge->data) + dag_remove_edge(dag, edge); + } +} + +static void +mark_instruction_scheduled(struct dag *dag, uint32_t time, - struct schedule_node *node, - bool war_only) + struct schedule_node *node) { if (!node) return; - for (int i = node->child_count - 1; i >= 0; i--) { + util_dynarray_foreach(&node->dag.edges, struct dag_edge, edge) { struct schedule_node *child = - node->children[i].node; + (struct schedule_node *)edge->child; if (!child) continue; - if (war_only && !node->children[i].write_after_read) - continue; - - /* If the requirement is only that the node not appear before - * the last read of its destination, then it can be scheduled - * immediately after (or paired with!) the thing reading the - * destination. - */ - uint32_t latency = 0; - if (!war_only) { - latency = instruction_latency(node, - node->children[i].node); - } + uint32_t latency = instruction_latency(node, child); child->unblocked_time = MAX2(child->unblocked_time, time + latency); - child->parent_count--; - if (child->parent_count == 0) - list_add(&child->link, schedule_list); - - node->children[i].node = NULL; } + dag_prune_head(dag, &node->dag); } static void @@ -1038,7 +1036,7 @@ insert_scheduled_instruction(struct v3d_compile *c, static struct qinst * vir_nop() { - struct qreg undef = { QFILE_NULL, 0 }; + struct qreg undef = vir_nop_reg(); struct qinst *qinst = vir_add_inst(V3D_QPU_A_NOP, undef, undef, undef); return qinst; @@ -1073,6 +1071,10 @@ qpu_instruction_valid_in_thrend_slot(struct v3d_compile *c, return false; if (inst->type == V3D_QPU_INSTR_TYPE_ALU) { + /* GFXH-1625: TMUWT not allowed in the final instruction. */ + if (slot == 2 && inst->alu.add.op == V3D_QPU_A_TMUWT) + return false; + /* No writing physical registers at the end. */ if (!inst->alu.add.magic_write || !inst->alu.mul.magic_write) { @@ -1107,10 +1109,16 @@ qpu_instruction_valid_in_thrend_slot(struct v3d_compile *c, } static bool -valid_thrsw_sequence(struct v3d_compile *c, +valid_thrsw_sequence(struct v3d_compile *c, struct choose_scoreboard *scoreboard, struct qinst *qinst, int instructions_in_sequence, bool is_thrend) { + /* No emitting our thrsw while the previous thrsw hasn't happened yet. */ + if (scoreboard->last_thrsw_tick + 3 > + scoreboard->tick - instructions_in_sequence) { + return false; + } + for (int slot = 0; slot < instructions_in_sequence; slot++) { /* No scheduling SFU when the result would land in the other * thread. 
                 * The simulator complains for safety, though it
@@ -1171,7 +1179,8 @@ emit_thrsw(struct v3d_compile *c,
                if (!v3d_qpu_sig_pack(c->devinfo, &sig, &packed_sig))
                        break;
 
-               if (!valid_thrsw_sequence(c, prev_inst, slots_filled + 1,
+               if (!valid_thrsw_sequence(c, scoreboard,
+                                         prev_inst, slots_filled + 1,
                                          is_thrend)) {
                        break;
                }
@@ -1185,7 +1194,9 @@
        if (merge_inst) {
                merge_inst->qpu.sig.thrsw = true;
                needs_free = true;
+               scoreboard->last_thrsw_tick = scoreboard->tick - slots_filled;
        } else {
+               scoreboard->last_thrsw_tick = scoreboard->tick;
                insert_scheduled_instruction(c, block, scoreboard, inst);
                time++;
                slots_filled++;
        }
@@ -1220,7 +1231,6 @@ static uint32_t
 schedule_instructions(struct v3d_compile *c,
                      struct choose_scoreboard *scoreboard,
                      struct qblock *block,
-                     struct list_head *schedule_list,
                      enum quniform_contents *orig_uniform_contents,
                      uint32_t *orig_uniform_data,
                      uint32_t *next_uniform)
@@ -1228,23 +1238,10 @@ schedule_instructions(struct v3d_compile *c,
        const struct v3d_device_info *devinfo = c->devinfo;
        uint32_t time = 0;
 
-       if (debug) {
-               fprintf(stderr, "initial deps:\n");
-               dump_state(devinfo, schedule_list);
-               fprintf(stderr, "\n");
-       }
-
-       /* Remove non-DAG heads from the list. */
-       list_for_each_entry_safe(struct schedule_node, n, schedule_list, link) {
-               if (n->parent_count != 0)
-                       list_del(&n->link);
-       }
-
-       while (!list_empty(schedule_list)) {
+       while (!list_empty(&scoreboard->dag->heads)) {
                struct schedule_node *chosen =
                        choose_instruction_to_schedule(devinfo,
                                                       scoreboard,
-                                                      schedule_list,
                                                       NULL);
 
                struct schedule_node *merge = NULL;
@@ -1257,7 +1254,7 @@ schedule_instructions(struct v3d_compile *c,
 
                if (debug) {
                        fprintf(stderr, "t=%4d: current list:\n", time);
-                       dump_state(devinfo, schedule_list);
+                       dump_state(devinfo, scoreboard->dag);
                        fprintf(stderr, "t=%4d: chose: ", time);
                        v3d_qpu_dump(devinfo, inst);
                        fprintf(stderr, "\n");
@@ -1275,17 +1272,14 @@ schedule_instructions(struct v3d_compile *c,
                 */
                if (chosen) {
                        time = MAX2(chosen->unblocked_time, time);
-                       list_del(&chosen->link);
-                       mark_instruction_scheduled(schedule_list, time,
-                                                  chosen, true);
+                       pre_remove_head(scoreboard->dag, chosen);
 
                        while ((merge =
                                choose_instruction_to_schedule(devinfo,
                                                               scoreboard,
-                                                              schedule_list,
                                                               chosen))) {
                                time = MAX2(merge->unblocked_time, time);
-                               list_del(&merge->link);
+                               pre_remove_head(scoreboard->dag, merge);
                                list_addtail(&merge->link, &merged_list);
                                (void)qpu_merge_inst(devinfo, inst,
                                                     inst, &merge->inst->qpu);
@@ -1331,11 +1325,10 @@ schedule_instructions(struct v3d_compile *c,
                 * be scheduled. Update the children's unblocked time for this
                 * DAG edge as we do so.
                 */
-               mark_instruction_scheduled(schedule_list, time, chosen, false);
+               mark_instruction_scheduled(scoreboard->dag, time, chosen);
 
                list_for_each_entry(struct schedule_node, merge, &merged_list,
                                    link) {
-                       mark_instruction_scheduled(schedule_list, time, merge,
-                                                  false);
+                       mark_instruction_scheduled(scoreboard->dag, time, merge);
 
                        /* The merged VIR instruction doesn't get re-added to the
                         * block, so free it now.
@@ -1377,9 +1370,10 @@ qpu_schedule_instructions_block(struct v3d_compile *c,
                                uint32_t *next_uniform)
 {
        void *mem_ctx = ralloc_context(NULL);
-       struct list_head schedule_list;
+       scoreboard->dag = dag_create(mem_ctx);
+       struct list_head setup_list;
 
-       list_inithead(&schedule_list);
+       list_inithead(&setup_list);
 
        /* Wrap each instruction in a scheduler structure. */
        while (!list_empty(&block->instructions)) {
                struct qinst *qinst = list_first_entry(&block->instructions,
                                                       struct qinst, link);
                struct schedule_node *n =
                        rzalloc(mem_ctx, struct schedule_node);
 
+               dag_init_node(scoreboard->dag, &n->dag);
                n->inst = qinst;
                list_del(&qinst->link);
-               list_addtail(&n->link, &schedule_list);
+               list_addtail(&n->link, &setup_list);
        }
 
-       calculate_forward_deps(c, &schedule_list);
-       calculate_reverse_deps(c, &schedule_list);
+       calculate_forward_deps(c, scoreboard->dag, &setup_list);
+       calculate_reverse_deps(c, scoreboard->dag, &setup_list);
 
-       list_for_each_entry(struct schedule_node, n, &schedule_list, link) {
-               compute_delay(n);
-       }
+       dag_traverse_bottom_up(scoreboard->dag, compute_delay, NULL);
 
        uint32_t cycles = schedule_instructions(c, scoreboard, block,
-                                               &schedule_list,
                                                orig_uniform_contents,
                                                orig_uniform_data,
                                                next_uniform);
 
        ralloc_free(mem_ctx);
+       scoreboard->dag = NULL;
 
        return cycles;
 }
@@ -1484,11 +1477,10 @@ v3d_qpu_schedule_instructions(struct v3d_compile *c)
 
        struct choose_scoreboard scoreboard;
        memset(&scoreboard, 0, sizeof(scoreboard));
-       scoreboard.last_waddr_add = ~0;
-       scoreboard.last_waddr_mul = ~0;
        scoreboard.last_ldvary_tick = -10;
-       scoreboard.last_sfu_write_tick = -10;
+       scoreboard.last_magic_sfu_write_tick = -10;
        scoreboard.last_uniforms_reset_tick = -10;
+       scoreboard.last_thrsw_tick = -10;
 
        if (debug) {
                fprintf(stderr, "Pre-schedule instructions\n");
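
For reference, the scheduling pattern this diff adopts from mesa's util/dag
helper reduces to a short standalone sketch: embed a struct dag_node in each
scheduler node, tag dependency edges with a write-after-read flag, compute
per-node delays bottom-up, then repeatedly pick from and prune the DAG heads.
This is a minimal illustration assuming the util/dag API exactly as used in
the diff above (dag_create(), dag_init_node(), dag_add_edge(),
dag_traverse_bottom_up(), dag_prune_head()); the toy_* names are hypothetical
and not part of the patch.

#include "util/dag.h"
#include "util/list.h"
#include "util/macros.h"
#include "util/ralloc.h"

struct toy_node {
        struct dag_node dag;    /* embedded first, so a dag_node * casts back */
        uint32_t delay;         /* longest-path estimate to any DAG leaf */
};

/* dag_traverse_bottom_up() visits children before parents, so every
 * child->delay is final by the time the callback runs for the parent.
 */
static void
toy_compute_delay(struct dag_node *node, void *state)
{
        struct toy_node *n = (struct toy_node *)node;

        n->delay = 1;
        util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
                struct toy_node *child = (struct toy_node *)edge->child;

                n->delay = MAX2(n->delay, child->delay + 1);
        }
}

static void
toy_schedule(struct toy_node *nodes, unsigned count)
{
        void *mem_ctx = ralloc_context(NULL);
        struct dag *dag = dag_create(mem_ctx);

        for (unsigned i = 0; i < count; i++)
                dag_init_node(dag, &nodes[i].dag);

        /* "nodes[0] must issue before nodes[1]".  The edge data is a
         * free-form tag: qpu_schedule.c stores its write_after_read flag
         * there so pre_remove_head() can strip just the WAR edges of a
         * chosen instruction before trying to pair others with it.
         */
        if (count >= 2) {
                dag_add_edge(&nodes[0].dag, &nodes[1].dag,
                             (void *)(uintptr_t)true);
        }

        dag_traverse_bottom_up(dag, toy_compute_delay, NULL);

        /* dag->heads holds the nodes with no unscheduled parents; pruning
         * an emitted head moves its newly unblocked children onto that
         * list, which is why schedule_instructions() above can simply loop
         * until the heads list is empty.
         */
        while (!list_empty(&dag->heads)) {
                struct toy_node *n =
                        list_first_entry(&dag->heads, struct toy_node,
                                         dag.link);

                /* ... emit the instruction wrapped by n here ... */
                dag_prune_head(dag, &n->dag);
        }

        ralloc_free(mem_ctx);
}

The WAR-only pre_remove_head() pass matters because a write-after-read
dependency only requires that the writer not land before the reader: once the
reader is chosen, the writer may be merged into the very same QPU instruction,
which is what the merge loop in schedule_instructions() relies on.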