From 3fe4d8e1e39b47c9c5c4bfdd87300abd0c336a7e Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Wed, 26 Nov 2014 12:44:19 -0800 Subject: [PATCH] vc4: Introduce scheduling of QPU instructions. This doesn't reschedule much currently, just tries to fit things into the regfile A/B write-versus-read slots (the cause of the improvements in shader-db), and hide texture fetch latency by scheduling setup early and results collection late (haven't performance tested it). This infrastructure will be important for doing instruction pairing, though. shader-db2 results: total instructions in shared programs: 61874 -> 59583 (-3.70%) instructions in affected programs: 50677 -> 48386 (-4.52%) --- src/gallium/drivers/vc4/Makefile.sources | 1 + src/gallium/drivers/vc4/vc4_qir.h | 7 + src/gallium/drivers/vc4/vc4_qpu.c | 12 + src/gallium/drivers/vc4/vc4_qpu.h | 3 + src/gallium/drivers/vc4/vc4_qpu_emit.c | 132 +--- src/gallium/drivers/vc4/vc4_qpu_schedule.c | 693 +++++++++++++++++++++ 6 files changed, 722 insertions(+), 126 deletions(-) create mode 100644 src/gallium/drivers/vc4/vc4_qpu_schedule.c diff --git a/src/gallium/drivers/vc4/Makefile.sources b/src/gallium/drivers/vc4/Makefile.sources index 6ec48ab36be..6bcb731d034 100644 --- a/src/gallium/drivers/vc4/Makefile.sources +++ b/src/gallium/drivers/vc4/Makefile.sources @@ -24,6 +24,7 @@ C_SOURCES := \ vc4_qpu_disasm.c \ vc4_qpu_emit.c \ vc4_qpu.h \ + vc4_qpu_schedule.c \ vc4_qpu_validate.c \ vc4_query.c \ vc4_register_allocate.c \ diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h index cb02db5272c..0b76a2f246e 100644 --- a/src/gallium/drivers/vc4/vc4_qir.h +++ b/src/gallium/drivers/vc4/vc4_qir.h @@ -148,6 +148,11 @@ struct simple_node { struct simple_node *prev; }; +struct queued_qpu_inst { + struct simple_node link; + uint64_t inst; +}; + struct qinst { struct simple_node link; @@ -368,6 +373,8 @@ bool qir_opt_copy_propagation(struct vc4_compile *c); bool qir_opt_cse(struct vc4_compile *c); bool qir_opt_dead_code(struct vc4_compile *c); +void qpu_schedule_instructions(struct vc4_compile *c); + #define QIR_ALU0(name) \ static inline struct qreg \ qir_##name(struct vc4_compile *c) \ diff --git a/src/gallium/drivers/vc4/vc4_qpu.c b/src/gallium/drivers/vc4/vc4_qpu.c index 093ca077e6d..723b3613665 100644 --- a/src/gallium/drivers/vc4/vc4_qpu.c +++ b/src/gallium/drivers/vc4/vc4_qpu.c @@ -22,6 +22,7 @@ */ #include +#include "vc4_qir.h" #include "vc4_qpu.h" static uint64_t @@ -267,3 +268,14 @@ qpu_inst_is_tlb(uint64_t inst) sig == QPU_SIG_COLOR_LOAD || sig == QPU_SIG_WAIT_FOR_SCOREBOARD); } + +void +qpu_serialize_one_inst(struct vc4_compile *c, uint64_t inst) +{ + if (c->qpu_inst_count >= c->qpu_inst_size) { + c->qpu_inst_size = MAX2(16, c->qpu_inst_size * 2); + c->qpu_insts = realloc(c->qpu_insts, + c->qpu_inst_size * sizeof(uint64_t)); + } + c->qpu_insts[c->qpu_inst_count++] = inst; +} diff --git a/src/gallium/drivers/vc4/vc4_qpu.h b/src/gallium/drivers/vc4/vc4_qpu.h index 5f4caab193e..bf41f72c34b 100644 --- a/src/gallium/drivers/vc4/vc4_qpu.h +++ b/src/gallium/drivers/vc4/vc4_qpu.h @@ -30,6 +30,8 @@ #include "vc4_qpu_defines.h" +struct vc4_compile; + struct qpu_reg { enum qpu_mux mux; uint8_t addr; @@ -135,6 +137,7 @@ uint64_t qpu_set_cond_mul(uint64_t inst, uint32_t cond); bool qpu_waddr_is_tlb(uint32_t waddr); bool qpu_inst_is_tlb(uint64_t inst); +void qpu_serialize_one_inst(struct vc4_compile *c, uint64_t inst); static inline uint64_t qpu_load_imm_f(struct qpu_reg dst, float val) diff --git 
a/src/gallium/drivers/vc4/vc4_qpu_emit.c b/src/gallium/drivers/vc4/vc4_qpu_emit.c index e6e97cce462..3cb709f11fe 100644 --- a/src/gallium/drivers/vc4/vc4_qpu_emit.c +++ b/src/gallium/drivers/vc4/vc4_qpu_emit.c @@ -41,11 +41,6 @@ vc4_dump_program(struct vc4_compile *c) } } -struct queued_qpu_inst { - struct simple_node link; - uint64_t inst; -}; - static void queue(struct vc4_compile *c, uint64_t inst) { @@ -115,121 +110,6 @@ fixup_raddr_conflict(struct vc4_compile *c, *src1 = qpu_r3(); } -static void -serialize_one_inst(struct vc4_compile *c, uint64_t inst) -{ - if (c->qpu_inst_count >= c->qpu_inst_size) { - c->qpu_inst_size = MAX2(16, c->qpu_inst_size * 2); - c->qpu_insts = realloc(c->qpu_insts, - c->qpu_inst_size * sizeof(uint64_t)); - } - c->qpu_insts[c->qpu_inst_count++] = inst; -} - -static void -serialize_insts(struct vc4_compile *c) -{ - int last_sfu_write = -10; - - while (!is_empty_list(&c->qpu_inst_list)) { - struct queued_qpu_inst *q = - (struct queued_qpu_inst *)first_elem(&c->qpu_inst_list); - uint32_t last_waddr_a = QPU_W_NOP, last_waddr_b = QPU_W_NOP; - uint32_t raddr_a = QPU_GET_FIELD(q->inst, QPU_RADDR_A); - uint32_t raddr_b = QPU_GET_FIELD(q->inst, QPU_RADDR_B); - - if (c->qpu_inst_count > 0) { - uint64_t last_inst = c->qpu_insts[c->qpu_inst_count - - 1]; - uint32_t last_waddr_add = QPU_GET_FIELD(last_inst, - QPU_WADDR_ADD); - uint32_t last_waddr_mul = QPU_GET_FIELD(last_inst, - QPU_WADDR_MUL); - - if (last_inst & QPU_WS) { - last_waddr_a = last_waddr_mul; - last_waddr_b = last_waddr_add; - } else { - last_waddr_a = last_waddr_add; - last_waddr_b = last_waddr_mul; - } - } - - uint32_t src_muxes[] = { - QPU_GET_FIELD(q->inst, QPU_ADD_A), - QPU_GET_FIELD(q->inst, QPU_ADD_B), - QPU_GET_FIELD(q->inst, QPU_MUL_A), - QPU_GET_FIELD(q->inst, QPU_MUL_B), - }; - - /* "An instruction must not read from a location in physical - * regfile A or B that was written to by the previous - * instruction." - */ - bool needs_raddr_vs_waddr_nop = false; - bool reads_r4 = false; - for (int i = 0; i < ARRAY_SIZE(src_muxes); i++) { - if ((raddr_a < 32 && - src_muxes[i] == QPU_MUX_A && - last_waddr_a == raddr_a) || - (raddr_b < 32 && - src_muxes[i] == QPU_MUX_B && - last_waddr_b == raddr_b)) { - needs_raddr_vs_waddr_nop = true; - } - if (src_muxes[i] == QPU_MUX_R4) - reads_r4 = true; - } - - if (needs_raddr_vs_waddr_nop) { - serialize_one_inst(c, qpu_NOP()); - } - - /* "After an SFU lookup instruction, accumulator r4 must not - * be read in the following two instructions. Any other - * instruction that results in r4 being written (that is, TMU - * read, TLB read, SFU lookup) cannot occur in the two - * instructions following an SFU lookup." - */ - if (reads_r4) { - while (c->qpu_inst_count - last_sfu_write < 3) { - serialize_one_inst(c, qpu_NOP()); - } - } - - uint32_t waddr_a = QPU_GET_FIELD(q->inst, QPU_WADDR_ADD); - uint32_t waddr_m = QPU_GET_FIELD(q->inst, QPU_WADDR_MUL); - if ((waddr_a >= QPU_W_SFU_RECIP && waddr_a <= QPU_W_SFU_LOG) || - (waddr_m >= QPU_W_SFU_RECIP && waddr_m <= QPU_W_SFU_LOG)) { - last_sfu_write = c->qpu_inst_count; - } - - /* "A scoreboard wait must not occur in the first two - * instructions of a fragment shader. This is either the - * explicit Wait for Scoreboard signal or an implicit wait - * with the first tile-buffer read or write instruction." 
- */ - if (waddr_a == QPU_W_TLB_Z || - waddr_m == QPU_W_TLB_Z || - waddr_a == QPU_W_TLB_COLOR_MS || - waddr_m == QPU_W_TLB_COLOR_MS || - waddr_a == QPU_W_TLB_COLOR_ALL || - waddr_m == QPU_W_TLB_COLOR_ALL || - QPU_GET_FIELD(q->inst, QPU_SIG) == QPU_SIG_COLOR_LOAD) { - while (c->qpu_inst_count < 3 || - QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1], - QPU_SIG) != QPU_SIG_NONE) { - serialize_one_inst(c, qpu_NOP()); - } - } - - serialize_one_inst(c, q->inst); - - remove_from_list(&q->link); - free(q); - } -} - void vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) { @@ -589,7 +469,7 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) } } - serialize_insts(c); + qpu_schedule_instructions(c); /* thread end can't have VPM write or read */ if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1], @@ -600,7 +480,7 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) QPU_RADDR_A) == QPU_R_VPM || QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1], QPU_RADDR_B) == QPU_R_VPM) { - serialize_one_inst(c, qpu_NOP()); + qpu_serialize_one_inst(c, qpu_NOP()); } /* thread end can't have uniform read */ @@ -608,18 +488,18 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) QPU_RADDR_A) == QPU_R_UNIF || QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1], QPU_RADDR_B) == QPU_R_UNIF) { - serialize_one_inst(c, qpu_NOP()); + qpu_serialize_one_inst(c, qpu_NOP()); } /* thread end can't have TLB operations */ if (qpu_inst_is_tlb(c->qpu_insts[c->qpu_inst_count - 1])) - serialize_one_inst(c, qpu_NOP()); + qpu_serialize_one_inst(c, qpu_NOP()); c->qpu_insts[c->qpu_inst_count - 1] = qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1], QPU_SIG_PROG_END); - serialize_one_inst(c, qpu_NOP()); - serialize_one_inst(c, qpu_NOP()); + qpu_serialize_one_inst(c, qpu_NOP()); + qpu_serialize_one_inst(c, qpu_NOP()); switch (c->stage) { case QSTAGE_VERT: diff --git a/src/gallium/drivers/vc4/vc4_qpu_schedule.c b/src/gallium/drivers/vc4/vc4_qpu_schedule.c new file mode 100644 index 00000000000..f309034fba7 --- /dev/null +++ b/src/gallium/drivers/vc4/vc4_qpu_schedule.c @@ -0,0 +1,693 @@ +/* + * Copyright © 2010 Intel Corporation + * Copyright © 2014 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** + * @file vc4_qpu_schedule.c + * + * The basic model of the list scheduler is to take a basic block, compute a + * DAG of the dependencies, and make a list of the DAG heads. 
Heuristically + * pick a DAG head, then put all the children that are now DAG heads into the + * list of things to schedule. + * + * The goal of scheduling here is to pack pairs of operations together in a + * single QPU instruction. + */ + +#include "vc4_qir.h" +#include "vc4_qpu.h" +#include "util/ralloc.h" + +static bool debug; + +struct schedule_node { + struct simple_node link; + struct queued_qpu_inst *inst; + struct schedule_node **children; + uint32_t child_count; + uint32_t child_array_size; + uint32_t parent_count; + uint32_t delay; +}; + +/* When walking the instructions in reverse, we need to swap before/after in + * add_dep(). + */ +enum direction { F, R }; + +struct schedule_state { + struct schedule_node *last_r[6]; + struct schedule_node *last_ra[32]; + struct schedule_node *last_rb[32]; + struct schedule_node *last_sf; + struct schedule_node *last_vpm_read; + struct schedule_node *last_unif_read; + struct schedule_node *last_tmu_write; + struct schedule_node *last_tlb; + struct schedule_node *last_vpm; + enum direction dir; +}; + +static void +add_dep(struct schedule_state *state, + struct schedule_node *before, + struct schedule_node *after) +{ + if (!before || !after) + return; + + assert(before != after); + + if (state->dir == R) { + struct schedule_node *t = before; + before = after; + after = t; + } + + for (int i = 0; i < before->child_count; i++) { + if (before->children[i] == after) + return; + } + + if (before->child_array_size <= before->child_count) { + before->child_array_size = MAX2(before->child_array_size * 2, 16); + before->children = reralloc(before, before->children, + struct schedule_node *, + before->child_array_size); + } + + before->children[before->child_count] = after; + before->child_count++; + after->parent_count++; +} + +static void +add_write_dep(struct schedule_state *state, + struct schedule_node **before, + struct schedule_node *after) +{ + add_dep(state, *before, after); + *before = after; +} + +static bool +qpu_writes_r4(uint64_t inst) +{ + uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG); + + switch(sig) { + case QPU_SIG_COLOR_LOAD: + case QPU_SIG_LOAD_TMU0: + case QPU_SIG_LOAD_TMU1: + case QPU_SIG_ALPHA_MASK_LOAD: + return true; + default: + return false; + } +} + +static void +process_raddr_deps(struct schedule_state *state, struct schedule_node *n, + uint32_t raddr, bool is_a) +{ + switch (raddr) { + case QPU_R_VARY: + add_write_dep(state, &state->last_r[5], n); + break; + + case QPU_R_VPM: + add_write_dep(state, &state->last_vpm_read, n); + break; + + case QPU_R_UNIF: + add_write_dep(state, &state->last_unif_read, n); + break; + + case QPU_R_NOP: + case QPU_R_ELEM_QPU: + case QPU_R_XY_PIXEL_COORD: + case QPU_R_MS_REV_FLAGS: + break; + + default: + if (raddr < 32) { + if (is_a) + add_dep(state, state->last_ra[raddr], n); + else + add_dep(state, state->last_rb[raddr], n); + } else { + fprintf(stderr, "unknown raddr %d\n", raddr); + abort(); + } + break; + } +} + +static bool +is_tmu_write(uint32_t waddr) +{ + switch (waddr) { + case QPU_W_TMU0_S: + case QPU_W_TMU0_T: + case QPU_W_TMU0_R: + case QPU_W_TMU0_B: + case QPU_W_TMU1_S: + case QPU_W_TMU1_T: + case QPU_W_TMU1_R: + case QPU_W_TMU1_B: + return true; + default: + return false; + } +} + +static void +process_mux_deps(struct schedule_state *state, struct schedule_node *n, + uint32_t mux) +{ + if (mux != QPU_MUX_A && mux != QPU_MUX_B) + add_dep(state, state->last_r[mux], n); +} + + +static bool +is_direct_tmu_read(uint64_t inst) +{ + /* If it's a direct read, we happen to structure the code 
such that + * there's an explicit uniform read in the instruction (for kernel + * texture reloc processing). + */ + return (QPU_GET_FIELD(inst, QPU_RADDR_A) == QPU_R_UNIF || + QPU_GET_FIELD(inst, QPU_RADDR_B) == QPU_R_UNIF); +} + +static void +process_waddr_deps(struct schedule_state *state, struct schedule_node *n, + uint32_t waddr, bool is_add) +{ + uint64_t inst = n->inst->inst; + bool is_a = is_add ^ ((inst & QPU_WS) != 0); + + if (waddr < 32) { + if (is_a) { + add_write_dep(state, &state->last_ra[waddr], n); + } else { + add_write_dep(state, &state->last_rb[waddr], n); + } + } else if (is_tmu_write(waddr)) { + add_write_dep(state, &state->last_tmu_write, n); + + /* There is an implicit uniform read in texture ops in + * hardware, unless this is a direct-addressed uniform read, + * so we need to keep it in the same order as the other + * uniforms. + */ + if (!is_direct_tmu_read(n->inst->inst)) + add_write_dep(state, &state->last_unif_read, n); + } else if (qpu_waddr_is_tlb(waddr)) { + add_write_dep(state, &state->last_tlb, n); + } else { + switch (waddr) { + case QPU_W_ACC0: + case QPU_W_ACC1: + case QPU_W_ACC2: + case QPU_W_ACC3: + case QPU_W_ACC5: + add_write_dep(state, &state->last_r[waddr - QPU_W_ACC0], + n); + break; + + case QPU_W_VPM: + case QPU_W_VPMVCD_SETUP: + add_write_dep(state, &state->last_vpm, n); + break; + + case QPU_W_SFU_RECIP: + case QPU_W_SFU_RECIPSQRT: + case QPU_W_SFU_EXP: + case QPU_W_SFU_LOG: + add_write_dep(state, &state->last_r[4], n); + break; + + case QPU_W_TLB_STENCIL_SETUP: + /* This isn't a TLB operation that does things like + * implicitly lock the scoreboard, but it does have to + * appear before TLB_Z, and each of the TLB_STENCILs + * have to schedule in the same order relative to each + * other. + */ + add_write_dep(state, &state->last_tlb, n); + break; + + case QPU_W_NOP: + break; + + default: + fprintf(stderr, "Unknown waddr %d\n", waddr); + abort(); + } + } +} + +static void +process_cond_deps(struct schedule_state *state, struct schedule_node *n, + uint32_t cond) +{ + switch (cond) { + case QPU_COND_NEVER: + case QPU_COND_ALWAYS: + break; + default: + add_dep(state, state->last_sf, n); + break; + } +} + +/** + * Common code for dependencies that need to be tracked both forward and + * backward. + * + * This is for things like "all reads of r4 have to happen between the r4 + * writes that surround them". 
+ */ +static void +calculate_deps(struct schedule_state *state, struct schedule_node *n) +{ + uint64_t inst = n->inst->inst; + uint32_t add_op = QPU_GET_FIELD(inst, QPU_OP_ADD); + uint32_t mul_op = QPU_GET_FIELD(inst, QPU_OP_MUL); + uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD); + uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL); + uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A); + uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B); + uint32_t add_a = QPU_GET_FIELD(inst, QPU_ADD_A); + uint32_t add_b = QPU_GET_FIELD(inst, QPU_ADD_B); + uint32_t mul_a = QPU_GET_FIELD(inst, QPU_MUL_A); + uint32_t mul_b = QPU_GET_FIELD(inst, QPU_MUL_B); + uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG); + + process_raddr_deps(state, n, raddr_a, true); + process_raddr_deps(state, n, raddr_b, false); + if (add_op != QPU_A_NOP) { + process_mux_deps(state, n, add_a); + process_mux_deps(state, n, add_b); + } + if (mul_op != QPU_M_NOP) { + process_mux_deps(state, n, mul_a); + process_mux_deps(state, n, mul_b); + } + + process_waddr_deps(state, n, waddr_add, true); + process_waddr_deps(state, n, waddr_mul, false); + if (qpu_writes_r4(inst)) + add_write_dep(state, &state->last_r[4], n); + + switch (sig) { + case QPU_SIG_SW_BREAKPOINT: + case QPU_SIG_NONE: + case QPU_SIG_THREAD_SWITCH: + case QPU_SIG_LAST_THREAD_SWITCH: + case QPU_SIG_SMALL_IMM: + case QPU_SIG_LOAD_IMM: + break; + + case QPU_SIG_LOAD_TMU0: + case QPU_SIG_LOAD_TMU1: + /* TMU loads are coming from a FIFO, so ordering is important. + */ + add_write_dep(state, &state->last_tmu_write, n); + break; + + case QPU_SIG_COLOR_LOAD: + add_dep(state, state->last_tlb, n); + break; + + case QPU_SIG_PROG_END: + case QPU_SIG_WAIT_FOR_SCOREBOARD: + case QPU_SIG_SCOREBOARD_UNLOCK: + case QPU_SIG_COVERAGE_LOAD: + case QPU_SIG_COLOR_LOAD_END: + case QPU_SIG_ALPHA_MASK_LOAD: + case QPU_SIG_BRANCH: + fprintf(stderr, "Unhandled signal bits %d\n", sig); + abort(); + } + + process_cond_deps(state, n, QPU_GET_FIELD(inst, QPU_COND_ADD)); + process_cond_deps(state, n, QPU_GET_FIELD(inst, QPU_COND_MUL)); + if (inst & QPU_SF) + add_write_dep(state, &state->last_sf, n); +} + +static void +calculate_forward_deps(struct vc4_compile *c, struct simple_node *schedule_list) +{ + struct simple_node *node; + struct schedule_state state; + + memset(&state, 0, sizeof(state)); + state.dir = F; + + foreach(node, schedule_list) + calculate_deps(&state, (struct schedule_node *)node); +} + +static void +calculate_reverse_deps(struct vc4_compile *c, struct simple_node *schedule_list) +{ + struct simple_node *node; + struct schedule_state state; + + memset(&state, 0, sizeof(state)); + state.dir = R; + + for (node = schedule_list->prev; schedule_list != node; node = node->prev) { + calculate_deps(&state, (struct schedule_node *)node); + } +} + +struct choose_scoreboard { + int tick; + int last_sfu_write_tick; + uint32_t last_waddr_a, last_waddr_b; +}; + +static bool +reads_too_soon_after_write(struct choose_scoreboard *scoreboard, uint64_t inst) +{ + uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A); + uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B); + uint32_t src_muxes[] = { + QPU_GET_FIELD(inst, QPU_ADD_A), + QPU_GET_FIELD(inst, QPU_ADD_B), + QPU_GET_FIELD(inst, QPU_MUL_A), + QPU_GET_FIELD(inst, QPU_MUL_B), + }; + for (int i = 0; i < ARRAY_SIZE(src_muxes); i++) { + if ((src_muxes[i] == QPU_MUX_A && + raddr_a < 32 && + scoreboard->last_waddr_a == raddr_a) || + (src_muxes[i] == QPU_MUX_B && + raddr_b < 32 && + scoreboard->last_waddr_b == raddr_b)) { + return true; + } + + if 
(src_muxes[i] == QPU_MUX_R4) { + if (scoreboard->tick - + scoreboard->last_sfu_write_tick <= 2) { + return true; + } + } + } + + return false; +} + +static bool +pixel_scoreboard_too_soon(struct choose_scoreboard *scoreboard, uint64_t inst) +{ + return (scoreboard->tick < 2 && qpu_inst_is_tlb(inst)); +} + +static int +get_instruction_priority(uint64_t inst) +{ + uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD); + uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL); + uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG); + uint32_t baseline_score; + uint32_t next_score = 0; + + /* Schedule TLB operations as late as possible, to get more + * parallelism between shaders. + */ + if (qpu_inst_is_tlb(inst)) + return next_score; + next_score++; + + /* Schedule texture read results collection late to hide latency. */ + if (sig == QPU_SIG_LOAD_TMU0 || sig == QPU_SIG_LOAD_TMU1) + return next_score; + next_score++; + + /* Default score for things that aren't otherwise special. */ + baseline_score = next_score; + next_score++; + + /* Schedule texture read setup early to hide their latency better. */ + if (is_tmu_write(waddr_add) || is_tmu_write(waddr_mul)) + return next_score; + next_score++; + + return baseline_score; +} + +static struct schedule_node * +choose_instruction_to_schedule(struct choose_scoreboard *scoreboard, + struct simple_node *schedule_list) +{ + struct schedule_node *chosen = NULL; + struct simple_node *node; + int chosen_prio = 0; + + foreach(node, schedule_list) { + struct schedule_node *n = (struct schedule_node *)node; + uint64_t inst = n->inst->inst; + + /* "An instruction must not read from a location in physical + * regfile A or B that was written to by the previous + * instruction." + */ + if (reads_too_soon_after_write(scoreboard, inst)) + continue; + + /* "A scoreboard wait must not occur in the first two + * instructions of a fragment shader. This is either the + * explicit Wait for Scoreboard signal or an implicit wait + * with the first tile-buffer read or write instruction." + */ + if (pixel_scoreboard_too_soon(scoreboard, inst)) + continue; + + int prio = get_instruction_priority(inst); + + /* Found a valid instruction. If nothing better comes along, + * this one works. 
+ */ + if (!chosen) { + chosen = n; + chosen_prio = prio; + continue; + } + + if (prio > chosen_prio) { + chosen = n; + chosen_prio = prio; + } else if (prio < chosen_prio) { + continue; + } + } + + return chosen; +} + +static void +update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard, + uint64_t inst) +{ + uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD); + uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL); + + if (!(inst & QPU_WS)) { + scoreboard->last_waddr_a = waddr_add; + scoreboard->last_waddr_b = waddr_mul; + } else { + scoreboard->last_waddr_b = waddr_add; + scoreboard->last_waddr_a = waddr_mul; + } + + if ((waddr_add >= QPU_W_SFU_RECIP && waddr_add <= QPU_W_SFU_LOG) || + (waddr_mul >= QPU_W_SFU_RECIP && waddr_mul <= QPU_W_SFU_LOG)) { + scoreboard->last_sfu_write_tick = scoreboard->tick; + } +} + +static void +dump_state(struct simple_node *schedule_list) +{ + struct simple_node *node; + + uint32_t i = 0; + foreach(node, schedule_list) { + struct schedule_node *n = (struct schedule_node *)node; + + fprintf(stderr, "%3d: ", i++); + vc4_qpu_disasm(&n->inst->inst, 1); + fprintf(stderr, "\n"); + + for (int i = 0; i < n->child_count; i++) { + struct schedule_node *child = n->children[i]; + fprintf(stderr, " - "); + vc4_qpu_disasm(&child->inst->inst, 1); + fprintf(stderr, " (%d parents)\n", child->parent_count); + } + } +} + +/** Recursive computation of the delay member of a node. */ +static void +compute_delay(struct schedule_node *n) +{ + if (!n->child_count) { + n->delay = 1; + } else { + for (int i = 0; i < n->child_count; i++) { + if (!n->children[i]->delay) + compute_delay(n->children[i]); + n->delay = MAX2(n->delay, n->children[i]->delay + 1); + } + } +} + +static void +schedule_instructions(struct vc4_compile *c, struct simple_node *schedule_list) +{ + struct simple_node *node, *t; + struct choose_scoreboard scoreboard; + + memset(&scoreboard, 0, sizeof(scoreboard)); + scoreboard.last_waddr_a = ~0; + scoreboard.last_waddr_b = ~0; + scoreboard.last_sfu_write_tick = -10; + + if (debug) { + fprintf(stderr, "initial deps:\n"); + dump_state(schedule_list); + fprintf(stderr, "\n"); + } + + /* Remove non-DAG heads from the list. */ + foreach_s(node, t, schedule_list) { + struct schedule_node *n = (struct schedule_node *)node; + + if (n->parent_count != 0) + remove_from_list(&n->link); + } + + while (!is_empty_list(schedule_list)) { + struct schedule_node *chosen = + choose_instruction_to_schedule(&scoreboard, + schedule_list); + + /* If there are no valid instructions to schedule, drop a NOP + * in. + */ + uint64_t inst = chosen ? chosen->inst->inst : qpu_NOP(); + + if (debug) { + fprintf(stderr, "current list:\n"); + dump_state(schedule_list); + fprintf(stderr, "chose: "); + vc4_qpu_disasm(&inst, 1); + fprintf(stderr, "\n\n"); + } + + /* Schedule this instruction onto the QPU list. */ + if (chosen) + remove_from_list(&chosen->link); + qpu_serialize_one_inst(c, inst); + + update_scoreboard_for_chosen(&scoreboard, inst); + + /* Now that we've scheduled a new instruction, some of its + * children can be promoted to the list of instructions ready to + * be scheduled. Update the children's unblocked time for this + * DAG edge as we do so. 
+ */ + if (chosen) { + for (int i = chosen->child_count - 1; i >= 0; i--) { + struct schedule_node *child = + chosen->children[i]; + + child->parent_count--; + if (child->parent_count == 0) { + insert_at_head(schedule_list, + &child->link); + } + } + } + + scoreboard.tick++; + } +} + +void +qpu_schedule_instructions(struct vc4_compile *c) +{ + void *mem_ctx = ralloc_context(NULL); + struct simple_node schedule_list; + struct simple_node *node; + + make_empty_list(&schedule_list); + + if (debug) { + fprintf(stderr, "Pre-schedule instructions\n"); + foreach(node, &c->qpu_inst_list) { + struct queued_qpu_inst *q = + (struct queued_qpu_inst *)node; + vc4_qpu_disasm(&q->inst, 1); + fprintf(stderr, "\n"); + } + fprintf(stderr, "\n"); + } + + /* Wrap each instruction in a scheduler structure. */ + while (!is_empty_list(&c->qpu_inst_list)) { + struct queued_qpu_inst *inst = + (struct queued_qpu_inst *)c->qpu_inst_list.next; + struct schedule_node *n = rzalloc(mem_ctx, struct schedule_node); + + n->inst = inst; + remove_from_list(&inst->link); + insert_at_tail(&schedule_list, &n->link); + } + + calculate_forward_deps(c, &schedule_list); + calculate_reverse_deps(c, &schedule_list); + + foreach(node, &schedule_list) { + struct schedule_node *n = (struct schedule_node *)node; + compute_delay(n); + } + + schedule_instructions(c, &schedule_list); + + if (debug) { + fprintf(stderr, "Post-schedule instructions\n"); + vc4_qpu_disasm(c->qpu_insts, c->qpu_inst_count); + fprintf(stderr, "\n"); + } + + ralloc_free(mem_ctx); +} -- 2.30.2
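
Illustration (not part of the patch): the pass above is a classic list scheduler. It builds a dependency DAG over the block, keeps the current DAG heads (nodes whose parent_count has dropped to zero) on a ready list, and repeatedly emits the ready node that get_instruction_priority() ranks highest, unblocking that node's children as it goes. The standalone toy program below sketches that loop under simplified assumptions: the node names, the fixed-size array DAG, and the priority values are invented for illustration and are not taken from the driver, which uses the simple_node lists, schedule_node, and choose_instruction_to_schedule() shown in the diff.

/* toy_list_schedule.c: standalone sketch of priority-driven list scheduling.
 *   cc -o toy_list_schedule toy_list_schedule.c && ./toy_list_schedule
 */
#include <stdbool.h>
#include <stdio.h>

#define N 5
#define MAX_CHILDREN 4

struct node {
        const char *name;               /* stand-in for a QPU instruction */
        int priority;                   /* higher = try to schedule earlier */
        int children[MAX_CHILDREN];     /* indices of dependent nodes */
        int child_count;
        int parent_count;               /* unscheduled parents; 0 == DAG head */
        bool scheduled;
};

int
main(void)
{
        /* Toy DAG: texture setup -> texture load -> use -> TLB write, plus
         * one independent ALU op.  Priorities imitate the pass: setup high
         * (early), result collection and TLB writes low (late).
         */
        struct node nodes[N] = {
                { "tex_setup", 3, { 1 }, 1, 0, false },
                { "tex_load",  1, { 2 }, 1, 0, false },
                { "use_tex",   2, { 3 }, 1, 0, false },
                { "tlb_write", 0, { 0 }, 0, 0, false },
                { "other_alu", 2, { 0 }, 0, 0, false },
        };

        /* Count parents so we know which nodes start out as DAG heads. */
        for (int i = 0; i < N; i++)
                for (int c = 0; c < nodes[i].child_count; c++)
                        nodes[nodes[i].children[c]].parent_count++;

        /* List scheduling: pick the ready node with the highest priority,
         * emit it, then unblock its children.
         */
        for (int emitted = 0; emitted < N; emitted++) {
                int chosen = -1;

                for (int i = 0; i < N; i++) {
                        if (nodes[i].scheduled || nodes[i].parent_count != 0)
                                continue;
                        if (chosen == -1 ||
                            nodes[i].priority > nodes[chosen].priority)
                                chosen = i;
                }

                /* An acyclic graph always has at least one ready node. */
                nodes[chosen].scheduled = true;
                printf("%d: %s\n", emitted, nodes[chosen].name);

                for (int c = 0; c < nodes[chosen].child_count; c++)
                        nodes[nodes[chosen].children[c]].parent_count--;
        }

        return 0;
}

Running it prints tex_setup, other_alu, tex_load, use_tex, tlb_write: the independent ALU op lands between the texture setup and the texture result read, which is the latency-hiding effect the commit message describes. The real pass differs in that newly unblocked children are inserted at the head of its ready list and a NOP is emitted whenever no candidate satisfies the hazard rules.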