From 20dbeadd83ffca2345c4ba1f1ac27c19bade0d4a Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Mon, 28 Oct 2013 15:17:07 -0700 Subject: [PATCH] i965/fs: Prefer more-critical instructions of the same age in LIFO scheduling. When faced with a million instructions that all became candidates at the same time (none of which individually reduce register pressure), the ones on the critical path are more likely to be the ones that will free up some candidates soon. shader-db: total instructions in shared programs: 1681070 -> 1681070 (0.00%) instructions in affected programs: 0 -> 0 GAINED: 40 LOST: 74 Fixes indistinguishable-from-hanging behavior in GLES3conform's uniform_buffer_object_max_uniform_block_size test, regressed by c3c9a8c85758796a26b48e484286e6b6f5a5299a. Given that 93bd627d5a6c485948b94488e6cd53a06b7ebdcf was unlocked by that commit, the net effect on 16-wide program count is still quite positive, and I think this should give us more stable scheduling (less dependency on original instruction emit order). v2: Comment suggestions by Paul Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=70943 Reviewed-by: Paul Berry --- .../dri/i965/brw_schedule_instructions.cpp | 82 +++++++++++++++---- 1 file changed, 67 insertions(+), 15 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp index 8437a4e02f4..5093dd59782 100644 --- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp +++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp @@ -68,6 +68,7 @@ public: this->child_count = 0; this->parent_count = 0; this->unblocked_time = 0; + this->cand_generation = 0; /* We can't measure Gen6 timings directly but expect them to be much * closer to Gen7 than Gen4. @@ -90,6 +91,12 @@ public: int unblocked_time; int latency; + /** + * Which iteration of pushing groups of children onto the candidates list + * this node was a part of. + */ + unsigned cand_generation; + /** * This is the sum of the instruction's latency plus the maximum delay of * its children, or just the issue_time if it's a leaf node. @@ -973,26 +980,68 @@ fs_instruction_scheduler::choose_instruction_to_schedule() * variables so that we can avoid register spilling, or get 16-wide * shaders which naturally do a better job of hiding instruction * latency. - * - * To do so, schedule our instructions in a roughly LIFO/depth-first - * order: when new instructions become available as a result of - * scheduling something, choose those first so that our result - * hopefully is consumed quickly. - * - * The exception is messages that generate more than one result - * register (AKA texturing). In those cases, the LIFO search would - * normally tend to choose them quickly (because scheduling the - * previous message not only unblocked the children using its result, - * but also the MRF setup for the next sampler message, which in turn - * unblocks the next sampler message). */ foreach_list(node, &instructions) { schedule_node *n = (schedule_node *)node; fs_inst *inst = (fs_inst *)n->inst; - chosen = n; - if (v->brw->gen >= 7 || inst->regs_written <= 1) - break; + if (!chosen) { + chosen = n; + continue; + } + + /* Prefer instructions that recently became available for scheduling. + * These are the things that are most likely to (eventually) make a + * variable dead and reduce register pressure. Typical register + * pressure estimates don't work for us because most of our pressure + * comes from texturing, where no single instruction to schedule will + * make a vec4 value dead. + */ + if (n->cand_generation > chosen->cand_generation) { + chosen = n; + continue; + } else if (n->cand_generation < chosen->cand_generation) { + continue; + } + + /* On MRF-using chips, prefer non-SEND instructions. If we don't do + * this, then because we prefer instructions that just became + * candidates, we'll end up in a pattern of scheduling a SEND, then + * the MRFs for the next SEND, then the next SEND, then the MRFs, + * etc., without ever consuming the results of a send. + */ + if (v->brw->gen < 7) { + fs_inst *chosen_inst = (fs_inst *)chosen->inst; + + /* We use regs_written > 1 as our test for the kind of send + * instruction to avoid -- only sends generate many regs, and a + * single-result send is probably actually reducing register + * pressure. + */ + if (inst->regs_written <= 1 && chosen_inst->regs_written > 1) { + chosen = n; + continue; + } else if (inst->regs_written > chosen_inst->regs_written) { + continue; + } + } + + /* For instructions pushed on the cands list at the same time, prefer + * the one with the highest delay to the end of the program. This is + * most likely to have its values able to be consumed first (such as + * for a large tree of lowered ubo loads, which appear reversed in + * the instruction stream with respect to when they can be consumed). + */ + if (n->delay > chosen->delay) { + chosen = n; + continue; + } else if (n->delay < chosen->delay) { + continue; + } + + /* If all other metrics are equal, we prefer the first instruction in + * the list (program execution). + */ } } @@ -1048,6 +1097,7 @@ instruction_scheduler::schedule_instructions(backend_instruction *next_block_hea n->remove(); } + unsigned cand_generation = 1; while (!instructions.is_empty()) { schedule_node *chosen = choose_instruction_to_schedule(); @@ -1090,6 +1140,7 @@ instruction_scheduler::schedule_instructions(backend_instruction *next_block_hea bv->dump_instruction(child->inst); } + child->cand_generation = cand_generation; child->parent_count--; if (child->parent_count == 0) { if (debug) { @@ -1098,6 +1149,7 @@ instruction_scheduler::schedule_instructions(backend_instruction *next_block_hea instructions.push_head(child); } } + cand_generation++; /* Shared resource: the mathbox. There's one mathbox per EU on Gen6+ * but it's more limited pre-gen6, so if we send something off to it then -- 2.30.2