From e887341d3f4a3b13b2bf56b4a931afb78ca0526e Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Mon, 7 Nov 2016 10:52:32 -0800 Subject: [PATCH] vc4: Don't pair up TLB scoreboard locking instructions early in QPU sched. Jonas Pfeil noticed that we were putting passthrough tlb_z writes early in the shader, despite QIR and QPU scheduling both trying to delay scoreboard locking for as long as possible. The problem was that when trying to pair up QPU instructions, at some point the passthrough tlb_z would be the last one available and it would get paired, even if the other half would open up other instructions to be scheduled and we could have paired tlb_z with something later in the program. Also, since passthrough z is just a mov, it pairs up really easily. The proper fix would probably be to flip the order of scheduling instructions so we went from bottom to top (also relevant for branch delay slot scheduling). However, we can do a quick fix here to just not schedule a TLB lock until there's nothing but TLB left in the program, at a slight instruction cost (est .61% cycle count in shader-db) but a major fragment shader parallelism win. 
glmark2 results: texture:texture-filter=linear: +1.24481% +/- 0.626117% (n=15) bump:bump-render=height: 1.24991% +/- 0.154793% (n=136,133 -- screensaver outliers removed) --- src/gallium/drivers/vc4/vc4_qpu_schedule.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/gallium/drivers/vc4/vc4_qpu_schedule.c b/src/gallium/drivers/vc4/vc4_qpu_schedule.c index 25adbe67103..680191542b8 100644 --- a/src/gallium/drivers/vc4/vc4_qpu_schedule.c +++ b/src/gallium/drivers/vc4/vc4_qpu_schedule.c @@ -453,6 +453,7 @@ struct choose_scoreboard { int last_sfu_write_tick; int last_uniforms_reset_tick; uint32_t last_waddr_a, last_waddr_b; + bool tlb_locked; }; static bool @@ -589,6 +590,14 @@ choose_instruction_to_schedule(struct choose_scoreboard *scoreboard, if (prev_inst->uniform != -1 && n->uniform != -1) continue; + /* Don't merge in something that will lock the TLB. + * Hopefully what we have in inst will release some + * other instructions, allowing us to delay the + * TLB-locking instruction until later. + */ + if (!scoreboard->tlb_locked && qpu_inst_is_tlb(inst)) + continue; + inst = qpu_merge_inst(prev_inst->inst->inst, inst); if (!inst) continue; @@ -647,6 +656,9 @@ update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard, waddr_mul == QPU_W_UNIFORMS_ADDRESS) { scoreboard->last_uniforms_reset_tick = scoreboard->tick; } + + if (qpu_inst_is_tlb(inst)) + scoreboard->tlb_locked = true; } static void -- 2.30.2