From 88b41239f98c2d9b3e6cf9185b74876bb77362eb Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Tue, 27 Dec 2016 13:52:54 -0800 Subject: [PATCH] vc4: Rework scheduling of thread switch to cut one more NOP. Jonas's patch got us most of the benefit of scheduling instructions into the delay slots of thread switch, but if there had been nothing to pair the thrsw with, it would move the thrsw up and leave a NOP where the thrsw was. Instead, don't pair anything with thrsw through the normal scheduling path, and have a separate helper function that inserts the thrsw earlier if possible and inserts any necessary NOPs. total instructions in shared programs: 93027 -> 92643 (-0.41%) instructions in affected programs: 14952 -> 14568 (-2.57%) --- src/gallium/drivers/vc4/vc4_qpu_schedule.c | 121 +++++++++++++-------- 1 file changed, 75 insertions(+), 46 deletions(-) diff --git a/src/gallium/drivers/vc4/vc4_qpu_schedule.c b/src/gallium/drivers/vc4/vc4_qpu_schedule.c index cf916198334..9141396c872 100644 --- a/src/gallium/drivers/vc4/vc4_qpu_schedule.c +++ b/src/gallium/drivers/vc4/vc4_qpu_schedule.c @@ -575,15 +575,28 @@ choose_instruction_to_schedule(struct choose_scoreboard *scoreboard, struct schedule_node *chosen = NULL; int chosen_prio = 0; + /* Don't pair up anything with a thread switch signal -- emit_thrsw() + * will handle pairing it along with filling the delay slots. + */ + if (prev_inst) { + uint32_t prev_sig = QPU_GET_FIELD(prev_inst->inst->inst, + QPU_SIG); + if (prev_sig == QPU_SIG_THREAD_SWITCH || + prev_sig == QPU_SIG_LAST_THREAD_SWITCH) { + return NULL; + } + } + list_for_each_entry(struct schedule_node, n, schedule_list, link) { uint64_t inst = n->inst->inst; + uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG); /* Don't choose the branch instruction until it's the last one * left. XXX: We could potentially choose it before it's the * last one, if the remaining instructions fit in the delay * slots. */ - if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_BRANCH && + if (sig == QPU_SIG_BRANCH && !list_is_singular(schedule_list)) { continue; } @@ -607,6 +620,14 @@ choose_instruction_to_schedule(struct choose_scoreboard *scoreboard, * that they're compatible. */ if (prev_inst) { + /* Don't pair up a thread switch signal -- we'll + * handle pairing it when we pick it on its own. + */ + if (sig == QPU_SIG_THREAD_SWITCH || + sig == QPU_SIG_LAST_THREAD_SWITCH) { + continue; + } + if (prev_inst->uniform != -1 && n->uniform != -1) continue; @@ -820,6 +841,51 @@ mark_instruction_scheduled(struct list_head *schedule_list, } } +/** + * Emits a THRSW/LTHRSW signal in the stream, trying to move it up to pair + * with another instruction. + */ +static void +emit_thrsw(struct vc4_compile *c, + struct choose_scoreboard *scoreboard, + uint64_t inst) +{ + uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG); + + /* There should be nothing in a thrsw inst being scheduled other than + * the signal bits. + */ + assert(QPU_GET_FIELD(inst, QPU_OP_ADD) == QPU_A_NOP); + assert(QPU_GET_FIELD(inst, QPU_OP_MUL) == QPU_M_NOP); + + /* Try to find an earlier scheduled instruction that we can merge the + * thrsw into. + */ + int thrsw_ip = c->qpu_inst_count; + for (int i = 1; i <= MIN2(c->qpu_inst_count, 3); i++) { + uint64_t prev_instr = c->qpu_insts[c->qpu_inst_count - i]; + uint32_t prev_sig = QPU_GET_FIELD(prev_instr, QPU_SIG); + + if (prev_sig == QPU_SIG_NONE) + thrsw_ip = c->qpu_inst_count - i; + } + + if (thrsw_ip != c->qpu_inst_count) { + /* Merge the thrsw into the existing instruction. */ + c->qpu_insts[thrsw_ip] = + QPU_UPDATE_FIELD(c->qpu_insts[thrsw_ip], sig, QPU_SIG); + } else { + qpu_serialize_one_inst(c, inst); + update_scoreboard_for_chosen(scoreboard, inst); + } + + /* Fill the delay slots. */ + while (c->qpu_inst_count < thrsw_ip + 3) { + update_scoreboard_for_chosen(scoreboard, qpu_NOP()); + qpu_serialize_one_inst(c, qpu_NOP()); + } +} + static uint32_t schedule_instructions(struct vc4_compile *c, struct choose_scoreboard *scoreboard, @@ -830,7 +896,6 @@ schedule_instructions(struct vc4_compile *c, uint32_t *next_uniform) { uint32_t time = 0; - uint32_t last_thread_switch = 0; if (debug) { fprintf(stderr, "initial deps:\n"); @@ -913,10 +978,6 @@ schedule_instructions(struct vc4_compile *c, fprintf(stderr, "\n"); } - qpu_serialize_one_inst(c, inst); - - update_scoreboard_for_chosen(scoreboard, inst); - /* Now that we've scheduled a new instruction, some of its * children can be promoted to the list of instructions ready to * be scheduled. Update the children's unblocked time for this @@ -925,6 +986,14 @@ schedule_instructions(struct vc4_compile *c, mark_instruction_scheduled(schedule_list, time, chosen, false); mark_instruction_scheduled(schedule_list, time, merge, false); + if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_THREAD_SWITCH || + QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_LAST_THREAD_SWITCH) { + emit_thrsw(c, scoreboard, inst); + } else { + qpu_serialize_one_inst(c, inst); + update_scoreboard_for_chosen(scoreboard, inst); + } + scoreboard->tick++; time++; @@ -943,46 +1012,6 @@ schedule_instructions(struct vc4_compile *c, qpu_serialize_one_inst(c, inst); qpu_serialize_one_inst(c, inst); qpu_serialize_one_inst(c, inst); - } else if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_THREAD_SWITCH || - QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_LAST_THREAD_SWITCH) { - int last = c->qpu_inst_count - 1; - - /* The thread switch occurs after two delay slots. - * Shift the signal upwards, if there is an - * instruction without a signal there. Watch out for - * the last thread switch as theoretically it could be - * only two instructions away. - */ - - /* Remove sig from the instruction */ - enum qpu_sig_bits sig = QPU_GET_FIELD(inst, QPU_SIG); - c->qpu_insts[last] = QPU_UPDATE_FIELD(c->qpu_insts[last], - QPU_SIG_NONE, - QPU_SIG); - /* Compute how far we can shift */ - int max_shift = MIN2(last - last_thread_switch, 2); - /* If both instructions in front have a signal set, - * reset the signal on the current instruction.*/ - int shift; - for (shift = max_shift; shift >= 0; --shift) { - int ip = last - shift; - if (QPU_GET_FIELD(c->qpu_insts[ip], - QPU_SIG) == QPU_SIG_NONE) { - c->qpu_insts[ip] = - QPU_UPDATE_FIELD( - c->qpu_insts[ip], - sig, QPU_SIG); - break; - } - } - /* If necessarry, add filling NOPs*/ - for (int i = 0; i < 2 - shift; ++i) { - update_scoreboard_for_chosen(scoreboard, - qpu_NOP()); - qpu_serialize_one_inst(c, qpu_NOP()); - } - /* Avoid branching in a thread switch*/ - last_thread_switch = c->qpu_inst_count - 1; } } -- 2.30.2