diff --git a/src/gallium/drivers/vc4/vc4_qir_schedule.c b/src/gallium/drivers/vc4/vc4_qir_schedule.c
index 186e81be750..cbd993ceb32 100644
--- a/src/gallium/drivers/vc4/vc4_qir_schedule.c
+++ b/src/gallium/drivers/vc4/vc4_qir_schedule.c
@@ -37,18 +37,15 @@
  */
 
 #include "vc4_qir.h"
+#include "util/dag.h"
 
 static bool debug;
 
 struct schedule_node {
+        struct dag_node dag;
         struct list_head link;
         struct qinst *inst;
 
-        struct schedule_node **children;
-        uint32_t child_count;
-        uint32_t child_array_size;
-        uint32_t parent_count;
-
         /* Length of the longest (latency) chain from a DAG head to the this
          * instruction.
          */
@@ -61,11 +58,7 @@ struct schedule_node {
 };
 
 struct schedule_state {
-        /* List of struct schedule_node *. This starts out with all
-         * instructions, and after dependency updates it's trimmed to be just
-         * the DAG heads.
-         */
-        struct list_head worklist;
+        struct dag *dag;
 
         uint32_t time;
 
@@ -80,8 +73,8 @@ struct schedule_state {
 enum direction { F, R };
 
 /**
- * Marks a dependency between two intructions, that @after must appear after
- * @before.
+ * Marks a dependency between two instructions, that \p after must appear after
+ * \p before.
  *
  * Our dependencies are tracked as a DAG. Since we're scheduling bottom-up,
  * the latest instructions with nothing left to schedule are the DAG heads,
@@ -103,21 +96,7 @@ add_dep(enum direction dir,
                 after = t;
         }
 
-        for (int i = 0; i < after->child_count; i++) {
-                if (after->children[i] == after)
-                        return;
-        }
-
-        if (after->child_array_size <= after->child_count) {
-                after->child_array_size = MAX2(after->child_array_size * 2, 16);
-                after->children = reralloc(after, after->children,
-                                           struct schedule_node *,
-                                           after->child_array_size);
-        }
-
-        after->children[after->child_count] = before;
-        after->child_count++;
-        before->parent_count++;
+        dag_add_edge(&after->dag, &before->dag, NULL);
 }
 
 static void
@@ -138,6 +117,7 @@ struct schedule_setup_state {
         struct schedule_node *last_tex_coord;
         struct schedule_node *last_tex_result;
         struct schedule_node *last_tlb;
+        struct schedule_node *last_uniforms_reset;
         enum direction dir;
 
         /**
@@ -186,7 +166,7 @@ calculate_deps(struct schedule_setup_state *state, struct schedule_node *n)
          * ignore uniforms accesses, because qir_reorder_uniforms() happens
          * after this.
          */
-        for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
+        for (int i = 0; i < qir_get_nsrc(inst); i++) {
                 switch (inst->src[i].file) {
                 case QFILE_TEMP:
                         add_dep(dir,
@@ -211,27 +191,36 @@ calculate_deps(struct schedule_setup_state *state, struct schedule_node *n)
                 add_dep(dir, state->last_vary_read, n);
                 break;
 
-        case QOP_TEX_S:
-        case QOP_TEX_T:
-        case QOP_TEX_R:
-        case QOP_TEX_B:
-        case QOP_TEX_DIRECT:
-                /* Texturing setup gets scheduled in order, because
-                 * the uniforms referenced by them have to land in a
-                 * specific order.
-                 */
-                add_write_dep(dir, &state->last_tex_coord, n);
-                break;
-
         case QOP_TEX_RESULT:
                 /* Results have to be fetched in order. */
                 add_write_dep(dir, &state->last_tex_result, n);
                 break;
 
-        case QOP_TLB_COLOR_WRITE:
+        case QOP_THRSW:
+                /* After a new THRSW, one must collect all texture samples
+                 * queued since the previous THRSW/program start. For now, we
+                 * have one THRSW in between each texture setup and its
+                 * results collection as our input, and we just make sure that
+                 * that ordering is maintained.
+                 */
+                add_write_dep(dir, &state->last_tex_coord, n);
+                add_write_dep(dir, &state->last_tex_result, n);
+
+                /* accumulators and flags are lost across thread switches. */
+                add_write_dep(dir, &state->last_sf, n);
+
+                /* Setup, like the varyings, will need to be drained before we
+                 * thread switch.
+                 */
+                add_write_dep(dir, &state->last_vary_read, n);
+
+                /* The TLB-locking operations have to stay after the last
+                 * thread switch.
+                 */
+                add_write_dep(dir, &state->last_tlb, n);
+                break;
+
         case QOP_TLB_COLOR_READ:
-        case QOP_TLB_Z_WRITE:
-        case QOP_TLB_STENCIL_SETUP:
         case QOP_MS_MASK:
                 add_write_dep(dir, &state->last_tlb, n);
                 break;
@@ -240,10 +229,37 @@ calculate_deps(struct schedule_setup_state *state, struct schedule_node *n)
                 break;
         }
 
-        if (inst->dst.file == QFILE_VPM)
+        switch (inst->dst.file) {
+        case QFILE_VPM:
                 add_write_dep(dir, &state->last_vpm_write, n);
-        else if (inst->dst.file == QFILE_TEMP)
+                break;
+
+        case QFILE_TEMP:
                 add_write_dep(dir, &state->last_temp_write[inst->dst.index], n);
+                break;
+
+        case QFILE_TLB_COLOR_WRITE:
+        case QFILE_TLB_COLOR_WRITE_MS:
+        case QFILE_TLB_Z_WRITE:
+        case QFILE_TLB_STENCIL_SETUP:
+                add_write_dep(dir, &state->last_tlb, n);
+                break;
+
+        case QFILE_TEX_S_DIRECT:
+        case QFILE_TEX_S:
+        case QFILE_TEX_T:
+        case QFILE_TEX_R:
+        case QFILE_TEX_B:
+                /* Texturing setup gets scheduled in order, because
+                 * the uniforms referenced by them have to land in a
+                 * specific order.
+                 */
+                add_write_dep(dir, &state->last_tex_coord, n);
+                break;
+
+        default:
+                break;
+        }
 
         if (qir_depends_on_flags(inst))
                 add_dep(dir, state->last_sf, n);
@@ -268,26 +284,69 @@ calculate_forward_deps(struct vc4_compile *c, void *mem_ctx,
 
                 calculate_deps(&state, n);
 
-                switch (inst->op) {
-                case QOP_TEX_S:
-                case QOP_TEX_T:
-                case QOP_TEX_R:
-                case QOP_TEX_B:
-                case QOP_TEX_DIRECT:
-                        /* If the texture coordinate fifo is full,
-                         * block this on the last QOP_TEX_RESULT.
+                for (int i = 0; i < qir_get_nsrc(inst); i++) {
+                        switch (inst->src[i].file) {
+                        case QFILE_UNIF:
+                                add_dep(state.dir, state.last_uniforms_reset, n);
+                                break;
+                        default:
+                                break;
+                        }
+                }
+
+                switch (inst->dst.file) {
+                case QFILE_TEX_S_DIRECT:
+                case QFILE_TEX_S:
+                case QFILE_TEX_T:
+                case QFILE_TEX_R:
+                case QFILE_TEX_B:
+                        /* From the VC4 spec:
+                         *
+                         *    "The TFREQ input FIFO holds two full lots of s,
+                         *     t, r, b data, plus associated setup data, per
+                         *     QPU, that is, there are eight data slots. For
+                         *     each texture request, slots are only consumed
+                         *     for the components of s, t, r, and b actually
+                         *     written. Thus the FIFO can hold four requests
+                         *     of just (s, t) data, or eight requests of just
+                         *     s data (for direct addressed data lookups).
+                         *
+                         *     Note that there is one FIFO per QPU, and the
+                         *     FIFO has no concept of threads - that is,
+                         *     multi-threaded shaders must be careful to use
+                         *     only 1/2 the FIFO depth before reading
+                         *     back. Multi-threaded programs must also
+                         *     therefore always thread switch on texture
+                         *     fetch as the other thread may have data
+                         *     waiting in the FIFO."
+                         *
+                         * If the texture coordinate fifo is full, block this
+                         * on the last QOP_TEX_RESULT.
                          */
-                        if (state.tfreq_count == 8) {
+                        if (state.tfreq_count == (c->fs_threaded ? 4 : 8)) {
                                 block_until_tex_result(&state, n);
                         }
 
-                        /* If the texture result fifo is full, block
-                         * adding any more to it until the last
-                         * QOP_TEX_RESULT.
+                        /* From the VC4 spec:
+                         *
+                         *    "Since the maximum number of texture requests
+                         *     in the input (TFREQ) FIFO is four lots of (s,
+                         *     t) data, the output (TFRCV) FIFO is sized to
+                         *     hold four lots of max-size color data per
+                         *     QPU. For non-float color, reads are packed
+                         *     RGBA8888 data (one read per pixel). For 16-bit
+                         *     float color, two reads are necessary per
+                         *     pixel, with reads packed as RG1616 then
+                         *     BA1616. So per QPU there are eight color slots
+                         *     in the TFRCV FIFO."
+                         *
+                         * If the texture result fifo is full, block adding
+                         * any more to it until the last QOP_TEX_RESULT.
                          */
-                        if (inst->op == QOP_TEX_S ||
-                            inst->op == QOP_TEX_DIRECT) {
-                                if (state.tfrcv_count == 4)
+                        if (inst->dst.file == QFILE_TEX_S ||
+                            inst->dst.file == QFILE_TEX_S_DIRECT) {
+                                if (state.tfrcv_count ==
+                                    (c->fs_threaded ? 2 : 4))
                                         block_until_tex_result(&state, n);
                                 state.tfrcv_count++;
                         }
@@ -296,6 +355,11 @@ calculate_forward_deps(struct vc4_compile *c, void *mem_ctx,
                         state.tfreq_count++;
                         break;
 
+                default:
+                        break;
+                }
+
+                switch (inst->op) {
                 case QOP_TEX_RESULT:
                         /* Results have to be fetched after the
                          * coordinate setup. Note that we're assuming
@@ -312,8 +376,12 @@ calculate_forward_deps(struct vc4_compile *c, void *mem_ctx,
                         memset(&state.tex_fifo[state.tex_fifo_pos], 0,
                                sizeof(state.tex_fifo[0]));
                         break;
+
+                case QOP_UNIFORMS_RESET:
+                        add_write_dep(state.dir, &state.last_uniforms_reset, n);
+                        break;
+
                 default:
-                        assert(!qir_is_tex(inst));
                         break;
                 }
         }
@@ -344,11 +412,21 @@ get_register_pressure_cost(struct schedule_state *state, struct qinst *inst)
             state->temp_writes[inst->dst.index] == 1)
                 cost--;
 
-        for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
-                if (inst->src[i].file == QFILE_TEMP &&
-                    !BITSET_TEST(state->temp_live, inst->src[i].index)) {
-                        cost++;
+        for (int i = 0; i < qir_get_nsrc(inst); i++) {
+                if (inst->src[i].file != QFILE_TEMP ||
+                    BITSET_TEST(state->temp_live, inst->src[i].index)) {
+                        continue;
                 }
+
+                bool already_counted = false;
+                for (int j = 0; j < i; j++) {
+                        if (inst->src[i].file == inst->src[j].file &&
+                            inst->src[i].index == inst->src[j].index) {
+                                already_counted = true;
+                        }
+                }
+                if (!already_counted)
+                        cost++;
         }
 
         return cost;
@@ -357,11 +435,13 @@ get_register_pressure_cost(struct schedule_state *state, struct qinst *inst)
 static bool
 locks_scoreboard(struct qinst *inst)
 {
-        switch (inst->op) {
-        case QOP_TLB_Z_WRITE:
-        case QOP_TLB_COLOR_WRITE:
-        case QOP_TLB_COLOR_WRITE_MS:
-        case QOP_TLB_COLOR_READ:
+        if (inst->op == QOP_TLB_COLOR_READ)
+                return true;
+
+        switch (inst->dst.file) {
+        case QFILE_TLB_Z_WRITE:
+        case QFILE_TLB_COLOR_WRITE:
+        case QFILE_TLB_COLOR_WRITE_MS:
                 return true;
         default:
                 return false;
@@ -373,7 +453,16 @@ choose_instruction(struct schedule_state *state)
 {
         struct schedule_node *chosen = NULL;
 
-        list_for_each_entry(struct schedule_node, n, &state->worklist, link) {
+        list_for_each_entry(struct schedule_node, n, &state->dag->heads,
+                            dag.link) {
+                /* The branches aren't being tracked as dependencies. Make
+                 * sure that they stay scheduled as the last instruction of
+                 * the block, which is to say the first one we choose to
+                 * schedule.
+                 */
+                if (n->inst->op == QOP_BRANCH)
+                        return n;
+
                 if (!chosen) {
                         chosen = n;
                         continue;
@@ -393,7 +482,7 @@ choose_instruction(struct schedule_state *state)
                 }
 
                 /* If we would block on the previously chosen node, but would
-                 * block less on this one, then then prefer it.
+                 * block less on this one, then prefer it.
                  */
                 if (chosen->unblocked_time > state->time &&
                     n->unblocked_time < chosen->unblocked_time) {
@@ -440,17 +529,20 @@ static void
 dump_state(struct vc4_compile *c, struct schedule_state *state)
 {
         uint32_t i = 0;
-        list_for_each_entry(struct schedule_node, n, &state->worklist, link) {
+        list_for_each_entry(struct schedule_node, n, &state->dag->heads,
+                            dag.link) {
                 fprintf(stderr, "%3d: ", i++);
                 qir_dump_inst(c, n->inst);
                 fprintf(stderr, " (%d cost)\n",
                         get_register_pressure_cost(state, n->inst));
 
-                for (int i = 0; i < n->child_count; i++) {
-                        struct schedule_node *child = n->children[i];
+                util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
+                        struct schedule_node *child =
+                                (struct schedule_node *)edge->child;
                         fprintf(stderr, "   - ");
                         qir_dump_inst(c, child->inst);
-                        fprintf(stderr, " (%d parents)\n", child->parent_count);
+                        fprintf(stderr, " (%d parents)\n",
+                                child->dag.parent_count);
                 }
         }
 }
@@ -465,57 +557,72 @@ dump_state(struct vc4_compile *c, struct schedule_state *state)
 static uint32_t
 latency_between(struct schedule_node *before, struct schedule_node *after)
 {
-        if ((before->inst->op == QOP_TEX_S ||
-             before->inst->op == QOP_TEX_DIRECT) &&
+        if ((before->inst->dst.file == QFILE_TEX_S ||
+             before->inst->dst.file == QFILE_TEX_S_DIRECT) &&
             after->inst->op == QOP_TEX_RESULT)
                 return 100;
 
+        switch (before->inst->op) {
+        case QOP_RCP:
+        case QOP_RSQ:
+        case QOP_EXP2:
+        case QOP_LOG2:
+                for (int i = 0; i < qir_get_nsrc(after->inst); i++) {
+                        if (after->inst->src[i].file ==
+                            before->inst->dst.file &&
+                            after->inst->src[i].index ==
+                            before->inst->dst.index) {
+                                /* There are two QPU delay slots before we can
+                                 * read a math result, which could be up to 4
+                                 * QIR instructions if they packed well.
+                                 */
+                                return 4;
+                        }
+                }
+                break;
+        default:
+                break;
+        }
+
         return 1;
 }
 
 /** Recursive computation of the delay member of a node. */
 static void
-compute_delay(struct schedule_node *n)
+compute_delay(struct dag_node *node, void *state)
 {
-        if (!n->child_count) {
-                /* The color read needs to be scheduled late, to avoid locking
-                 * the scoreboard early. This is our best tool for
-                 * encouraging that. The other scoreboard locking ops will
-                 * have this happen by default, since they are generally the
-                 * DAG heads or close to them.
-                 */
-                if (n->inst->op == QOP_TLB_COLOR_READ)
-                        n->delay = 1000;
-                else
-                        n->delay = 1;
-        } else {
-                for (int i = 0; i < n->child_count; i++) {
-                        if (!n->children[i]->delay)
-                                compute_delay(n->children[i]);
-                        n->delay = MAX2(n->delay,
-                                        n->children[i]->delay +
-                                        latency_between(n, n->children[i]));
-                }
+        struct schedule_node *n = (struct schedule_node *)node;
+
+        /* The color read needs to be scheduled late, to avoid locking the
+         * scoreboard early. This is our best tool for encouraging that. The
+         * other scoreboard locking ops will have this happen by default,
+         * since they are generally the DAG heads or close to them.
+         */
+        if (n->inst->op == QOP_TLB_COLOR_READ)
+                n->delay = 1000;
+        else
+                n->delay = 1;
+
+        util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
+                struct schedule_node *child =
+                        (struct schedule_node *)edge->child;
+
+                n->delay = MAX2(n->delay, (child->delay +
+                                           latency_between(child, n)));
         }
 }
 
 static void
-schedule_instructions(struct vc4_compile *c, struct schedule_state *state)
+schedule_instructions(struct vc4_compile *c,
+                      struct qblock *block, struct schedule_state *state)
 {
         if (debug) {
                 fprintf(stderr, "initial deps:\n");
                 dump_state(c, state);
         }
 
-        /* Remove non-DAG heads from the list. */
-        list_for_each_entry_safe(struct schedule_node, n,
-                                 &state->worklist, link) {
-                if (n->parent_count != 0)
-                        list_del(&n->link);
-        }
-
         state->time = 0;
-        while (!list_empty(&state->worklist)) {
+        while (!list_empty(&state->dag->heads)) {
                 struct schedule_node *chosen = choose_instruction(state);
                 struct qinst *inst = chosen->inst;
 
@@ -531,28 +638,27 @@ schedule_instructions(struct vc4_compile *c, struct schedule_state *state)
                 state->time = MAX2(state->time, chosen->unblocked_time);
 
                 /* Schedule this instruction back onto the QIR list. */
-                list_del(&chosen->link);
-                list_add(&inst->link, &c->instructions);
+                list_add(&inst->link, &block->instructions);
 
                 /* Now that we've scheduled a new instruction, some of its
                  * children can be promoted to the list of instructions ready to
                  * be scheduled. Update the children's unblocked time for this
                  * DAG edge as we do so.
                  */
-                for (int i = chosen->child_count - 1; i >= 0; i--) {
-                        struct schedule_node *child = chosen->children[i];
+                util_dynarray_foreach(&chosen->dag.edges,
+                                      struct dag_edge, edge) {
+                        struct schedule_node *child =
+                                (struct schedule_node *)edge->child;
 
                         child->unblocked_time = MAX2(child->unblocked_time,
                                                      state->time +
-                                                     latency_between(chosen,
-                                                                     child));
-                        child->parent_count--;
-                        if (child->parent_count == 0)
-                                list_add(&child->link, &state->worklist);
+                                                     latency_between(child,
+                                                                     chosen));
                 }
+                dag_prune_head(state->dag, &chosen->dag);
 
                 /* Update our tracking of register pressure. */
-                for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
+                for (int i = 0; i < qir_get_nsrc(inst); i++) {
                         if (inst->src[i].file == QFILE_TEMP)
                                 BITSET_SET(state->temp_live, inst->src[i].index);
                 }
@@ -566,48 +672,59 @@ schedule_instructions(struct vc4_compile *c, struct schedule_state *state)
         }
 }
 
-void
-qir_schedule_instructions(struct vc4_compile *c)
+static void
+qir_schedule_instructions_block(struct vc4_compile *c,
+                                struct qblock *block)
 {
-        void *mem_ctx = ralloc_context(NULL);
-        struct schedule_state state = { { 0 } };
+        struct schedule_state *state = rzalloc(NULL, struct schedule_state);
 
-        if (debug) {
-                fprintf(stderr, "Pre-schedule instructions\n");
-                qir_dump(c);
-        }
+        state->temp_writes = rzalloc_array(state, uint32_t, c->num_temps);
+        state->temp_live = rzalloc_array(state, BITSET_WORD,
+                                         BITSET_WORDS(c->num_temps));
+        state->dag = dag_create(state);
 
-        state.temp_writes = rzalloc_array(mem_ctx, uint32_t, c->num_temps);
-        state.temp_live = rzalloc_array(mem_ctx, BITSET_WORD,
-                                        BITSET_WORDS(c->num_temps));
-        list_inithead(&state.worklist);
+        struct list_head setup_list;
+        list_inithead(&setup_list);
 
         /* Wrap each instruction in a scheduler structure. */
-        list_for_each_entry_safe(struct qinst, inst, &c->instructions, link) {
-                struct schedule_node *n = rzalloc(mem_ctx, struct schedule_node);
+        qir_for_each_inst_safe(inst, block) {
+                struct schedule_node *n = rzalloc(state, struct schedule_node);
 
                 n->inst = inst;
                 list_del(&inst->link);
-                list_addtail(&n->link, &state.worklist);
+                list_addtail(&n->link, &setup_list);
+                dag_init_node(state->dag, &n->dag);
 
                 if (inst->dst.file == QFILE_TEMP)
-                        state.temp_writes[inst->dst.index]++;
+                        state->temp_writes[inst->dst.index]++;
         }
 
         /* Dependencies tracked top-to-bottom. */
-        calculate_forward_deps(c, mem_ctx, &state.worklist);
+        calculate_forward_deps(c, state, &setup_list);
+
         /* Dependencies tracked bottom-to-top. */
-        calculate_reverse_deps(c, mem_ctx, &state.worklist);
+        calculate_reverse_deps(c, state, &setup_list);
 
-        list_for_each_entry(struct schedule_node, n, &state.worklist, link)
-                compute_delay(n);
+        dag_traverse_bottom_up(state->dag, compute_delay, NULL);
 
-        schedule_instructions(c, &state);
+        schedule_instructions(c, block, state);
 
+        ralloc_free(state);
+}
+
+void
+qir_schedule_instructions(struct vc4_compile *c)
+{
         if (debug) {
-                fprintf(stderr, "Post-schedule instructions\n");
+                fprintf(stderr, "Pre-schedule instructions\n");
                 qir_dump(c);
         }
 
-        ralloc_free(mem_ctx);
+        qir_for_each_block(block, c)
+                qir_schedule_instructions_block(c, block);
+
+        if (debug) {
+                fprintf(stderr, "Post-schedule instructions\n");
+                qir_dump(c);
+        }
 }
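
The scheme this pass implements is compact enough to demonstrate outside the driver. What follows is a minimal, self-contained C sketch of its two core pieces: the bottom-up critical-path ("delay") computation done by compute_delay(), and the greedy emit-the-longest-chain loop of choose_instruction()/schedule_instructions(). It is not mesa code and deliberately avoids the util/dag API; the node names, edge latencies, and fixed-size child arrays are invented for the illustration, and the real pass layers register-pressure and scoreboard heuristics on top of the plain delay comparison shown here.

/* schedule_sketch.c: standalone sketch of bottom-up list scheduling.
 * Build with: cc -std=c99 -o schedule_sketch schedule_sketch.c
 */
#include <stdio.h>

#define MAX_CHILDREN 4

struct node {
        const char *name;
        int delay;                /* longest latency chain up to a DAG head */
        int parent_count;         /* unscheduled parents still blocking us */
        int child_count;
        struct node *child[MAX_CHILDREN];
        int edge_latency[MAX_CHILDREN];
};

/* "parent" must be scheduled before "child".  Since scheduling is bottom-up,
 * a parent is an instruction that appears *later* in the program.
 */
static void
add_edge(struct node *parent, struct node *child, int latency)
{
        parent->child[parent->child_count] = child;
        parent->edge_latency[parent->child_count] = latency;
        parent->child_count++;
        child->parent_count++;
}

/* Critical-path computation, like compute_delay(): a node's delay is at
 * least 1, and at least (child delay + edge latency) for every child.
 */
static void
compute_delay(struct node *n)
{
        n->delay = 1;
        for (int i = 0; i < n->child_count; i++) {
                if (!n->child[i]->delay)
                        compute_delay(n->child[i]);
                if (n->child[i]->delay + n->edge_latency[i] > n->delay)
                        n->delay = n->child[i]->delay + n->edge_latency[i];
        }
}

int
main(void)
{
        /* A texture-fetch chain plus one independent cheap op; the 100
         * mimics the QFILE_TEX_S -> QOP_TEX_RESULT latency above.
         */
        struct node tex_s      = { "tex_s" };
        struct node tex_result = { "tex_result" };
        struct node fmul       = { "fmul" };
        struct node fadd       = { "fadd" };
        struct node *all[] = { &tex_s, &tex_result, &fmul, &fadd };
        const int n_nodes = (int)(sizeof(all) / sizeof(all[0]));

        add_edge(&tex_result, &tex_s, 100);  /* result waits on setup */
        add_edge(&fmul, &tex_result, 1);     /* fmul consumes the result */

        for (int i = 0; i < n_nodes; i++)
                if (!all[i]->delay)
                        compute_delay(all[i]);

        /* Greedy loop, like schedule_instructions(): emit the DAG head
         * with the longest remaining chain, then prune its edges the way
         * dag_prune_head() does so its children can become heads.
         */
        for (int emitted = 0; emitted < n_nodes; emitted++) {
                struct node *chosen = NULL;
                for (int i = 0; i < n_nodes; i++) {
                        struct node *n = all[i];
                        if (n->parent_count != 0 || n->delay == 0)
                                continue;  /* blocked, or already emitted */
                        if (!chosen || n->delay > chosen->delay)
                                chosen = n;
                }
                if (!chosen)
                        break;  /* unreachable while the graph is a DAG */
                printf("emit %-10s (delay %3d)\n", chosen->name, chosen->delay);
                chosen->delay = 0;  /* mark as scheduled */
                for (int i = 0; i < chosen->child_count; i++)
                        chosen->child[i]->parent_count--;
        }
        return 0;
}

Run standalone, this prints fmul, tex_result, tex_s, fadd. Because scheduling is bottom-up, reading the emission order in reverse gives the program order: the independent fadd lands at the top of the block and the 100-cycle texture chain sits intact below it, with its consumer last. Note that the sketch does not track unblocked_time, so unlike the real choose_instruction() it never slips the cheap fadd into the texture-latency gap; that refinement, along with the register-pressure cost and the scoreboard-locking rules, is what the heuristics in the pass above add.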