From 73caa26e4319f4e0bbc0bf1d5d455ab0d53d20a3 Mon Sep 17 00:00:00 2001 From: Connor Abbott Date: Tue, 9 Jun 2015 10:26:53 -0700 Subject: [PATCH] i965/sched: use liveness analysis for computing register pressure Previously, we were using some heuristics to try and detect when a write was about to begin a live range, or when a read was about to end a live range. We never used the liveness analysis information used by the register allocator, though, which meant that the scheduler's and the allocator's ideas of when a live range began and ended were different. Not only did this make our estimate of the register pressure benefit of scheduling an instruction wrong in some cases, but it was preventing us from knowing the actual register pressure when scheduling each instruction, which we want to have in order to switch to register pressure scheduling only when the register pressure is too high. This commit rewrites the register pressure tracking code to use the same model as our register allocator currently uses. We use the results of liveness analysis, as well as the compute_payload_ranges() function that we split out in the last commit. This means that we compute live ranges twice on each round through the register allocator, although we could speed it up by only recomputing the ranges and not the live in/live out sets after scheduling, since we only shuffle around instructions within a single basic block when we schedule. Shader-db results on bdw: total instructions in shared programs: 7130187 -> 7129880 (-0.00%) instructions in affected programs: 1744 -> 1437 (-17.60%) helped: 1 HURT: 1 total cycles in shared programs: 172535126 -> 172473226 (-0.04%) cycles in affected programs: 11338636 -> 11276736 (-0.55%) helped: 876 HURT: 873 LOST: 8 GAINED: 0 v2: use regs_read() in more places. Reviewed-by: Jason Ekstrand --- .../dri/i965/brw_schedule_instructions.cpp | 285 ++++++++++++++---- 1 file changed, 229 insertions(+), 56 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp index 46da3251ea7..2698399641a 100644 --- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp +++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp @@ -26,6 +26,7 @@ */ #include "brw_fs.h" +#include "brw_fs_live_variables.h" #include "brw_vec4.h" #include "brw_cfg.h" #include "brw_shader.h" @@ -400,22 +401,49 @@ schedule_node::set_latency_gen7(bool is_haswell) class instruction_scheduler { public: instruction_scheduler(backend_shader *s, int grf_count, + int hw_reg_count, int block_count, instruction_scheduler_mode mode) { this->bs = s; this->mem_ctx = ralloc_context(NULL); this->grf_count = grf_count; + this->hw_reg_count = hw_reg_count; this->instructions.make_empty(); this->instructions_to_schedule = 0; this->post_reg_alloc = (mode == SCHEDULE_POST); this->mode = mode; this->time = 0; if (!post_reg_alloc) { - this->remaining_grf_uses = rzalloc_array(mem_ctx, int, grf_count); - this->grf_active = rzalloc_array(mem_ctx, bool, grf_count); + this->reg_pressure_in = rzalloc_array(mem_ctx, int, block_count); + + this->livein = ralloc_array(mem_ctx, BITSET_WORD *, block_count); + for (int i = 0; i < block_count; i++) + this->livein[i] = rzalloc_array(mem_ctx, BITSET_WORD, + BITSET_WORDS(grf_count)); + + this->liveout = ralloc_array(mem_ctx, BITSET_WORD *, block_count); + for (int i = 0; i < block_count; i++) + this->liveout[i] = rzalloc_array(mem_ctx, BITSET_WORD, + BITSET_WORDS(grf_count)); + + this->hw_liveout = ralloc_array(mem_ctx, BITSET_WORD *, block_count); + for (int i = 0; i < block_count; i++) + this->hw_liveout[i] = rzalloc_array(mem_ctx, BITSET_WORD, + BITSET_WORDS(hw_reg_count)); + + this->written = rzalloc_array(mem_ctx, bool, grf_count); + + this->reads_remaining = rzalloc_array(mem_ctx, int, grf_count); + + this->hw_reads_remaining = rzalloc_array(mem_ctx, int, hw_reg_count); } else { - this->remaining_grf_uses = NULL; - this->grf_active = NULL; + this->reg_pressure_in = NULL; + this->livein = NULL; + this->liveout = NULL; + this->hw_liveout = NULL; + this->written = NULL; + this->reads_remaining = NULL; + this->hw_reads_remaining = NULL; } } @@ -442,7 +470,8 @@ public: */ virtual int issue_time(backend_instruction *inst) = 0; - virtual void count_remaining_grf_uses(backend_instruction *inst) = 0; + virtual void count_reads_remaining(backend_instruction *inst) = 0; + virtual void setup_liveness(cfg_t *cfg) = 0; virtual void update_register_pressure(backend_instruction *inst) = 0; virtual int get_register_pressure_benefit(backend_instruction *inst) = 0; @@ -453,33 +482,63 @@ public: bool post_reg_alloc; int instructions_to_schedule; int grf_count; + int hw_reg_count; int time; + int reg_pressure; + int block_idx; exec_list instructions; backend_shader *bs; instruction_scheduler_mode mode; - /** - * Number of instructions left to schedule that reference each vgrf. - * - * Used so that we can prefer scheduling instructions that will end the - * live intervals of multiple variables, to reduce register pressure. + /* + * The register pressure at the beginning of each basic block. */ - int *remaining_grf_uses; - /** - * Tracks whether each VGRF has had an instruction scheduled that uses it. - * - * This is used to estimate whether scheduling a new instruction will - * increase register pressure. + int *reg_pressure_in; + + /* + * The virtual GRF's whose range overlaps the beginning of each basic block. + */ + + BITSET_WORD **livein; + + /* + * The virtual GRF's whose range overlaps the end of each basic block. + */ + + BITSET_WORD **liveout; + + /* + * The hardware GRF's whose range overlaps the end of each basic block. + */ + + BITSET_WORD **hw_liveout; + + /* + * Whether we've scheduled a write for this virtual GRF yet. */ - bool *grf_active; + + bool *written; + + /* + * How many reads we haven't scheduled for this virtual GRF yet. + */ + + int *reads_remaining; + + /* + * How many reads we haven't scheduled for this hardware GRF yet. + */ + + int *hw_reads_remaining; }; class fs_instruction_scheduler : public instruction_scheduler { public: - fs_instruction_scheduler(fs_visitor *v, int grf_count, + fs_instruction_scheduler(fs_visitor *v, int grf_count, int hw_reg_count, + int block_count, instruction_scheduler_mode mode); void calculate_deps(); bool is_compressed(fs_inst *inst); @@ -487,35 +546,109 @@ public: int issue_time(backend_instruction *inst); fs_visitor *v; - void count_remaining_grf_uses(backend_instruction *inst); + void count_reads_remaining(backend_instruction *inst); + void setup_liveness(cfg_t *cfg); void update_register_pressure(backend_instruction *inst); int get_register_pressure_benefit(backend_instruction *inst); }; fs_instruction_scheduler::fs_instruction_scheduler(fs_visitor *v, - int grf_count, + int grf_count, int hw_reg_count, + int block_count, instruction_scheduler_mode mode) - : instruction_scheduler(v, grf_count, mode), + : instruction_scheduler(v, grf_count, hw_reg_count, block_count, mode), v(v) { } +static bool +is_src_duplicate(fs_inst *inst, int src) +{ + for (int i = 0; i < src; i++) + if (inst->src[i].equals(inst->src[src])) + return true; + + return false; +} + void -fs_instruction_scheduler::count_remaining_grf_uses(backend_instruction *be) +fs_instruction_scheduler::count_reads_remaining(backend_instruction *be) { fs_inst *inst = (fs_inst *)be; - if (!remaining_grf_uses) + if (!reads_remaining) return; - if (inst->dst.file == GRF) - remaining_grf_uses[inst->dst.reg]++; - for (int i = 0; i < inst->sources; i++) { - if (inst->src[i].file != GRF) + if (is_src_duplicate(inst, i)) + continue; + + if (inst->src[i].file == GRF) { + reads_remaining[inst->src[i].reg]++; + } else if (inst->src[i].file == HW_REG && + inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) { + if (inst->src[i].fixed_hw_reg.nr >= hw_reg_count) + continue; + + for (int j = 0; j < inst->regs_read(i); j++) + hw_reads_remaining[inst->src[i].fixed_hw_reg.nr + j]++; + } + } +} + +void +fs_instruction_scheduler::setup_liveness(cfg_t *cfg) +{ + /* First, compute liveness on a per-GRF level using the in/out sets from + * liveness calculation. + */ + for (int block = 0; block < cfg->num_blocks; block++) { + for (int i = 0; i < v->live_intervals->num_vars; i++) { + if (BITSET_TEST(v->live_intervals->block_data[block].livein, i)) { + int vgrf = v->live_intervals->vgrf_from_var[i]; + if (!BITSET_TEST(livein[block], vgrf)) { + reg_pressure_in[block] += v->alloc.sizes[vgrf]; + BITSET_SET(livein[block], vgrf); + } + } + + if (BITSET_TEST(v->live_intervals->block_data[block].liveout, i)) + BITSET_SET(liveout[block], v->live_intervals->vgrf_from_var[i]); + } + } + + /* Now, extend the live in/live out sets for when a range crosses a block + * boundary, which matches what our register allocator/interference code + * does to account for force_writemask_all and incompatible exec_mask's. + */ + for (int block = 0; block < cfg->num_blocks - 1; block++) { + for (int i = 0; i < grf_count; i++) { + if (v->virtual_grf_start[i] <= cfg->blocks[block]->end_ip && + v->virtual_grf_end[i] >= cfg->blocks[block + 1]->start_ip) { + if (!BITSET_TEST(livein[block + 1], i)) { + reg_pressure_in[block + 1] += v->alloc.sizes[i]; + BITSET_SET(livein[block + 1], i); + } + + BITSET_SET(liveout[block], i); + } + } + } + + int payload_last_use_ip[hw_reg_count]; + v->calculate_payload_ranges(hw_reg_count, payload_last_use_ip); + + for (int i = 0; i < hw_reg_count; i++) { + if (payload_last_use_ip[i] == -1) continue; - remaining_grf_uses[inst->src[i].reg]++; + for (int block = 0; block < cfg->num_blocks; block++) { + if (cfg->blocks[block]->start_ip <= payload_last_use_ip[i]) + reg_pressure_in[block]++; + + if (cfg->blocks[block]->end_ip <= payload_last_use_ip[i]) + BITSET_SET(hw_liveout[block], i); + } } } @@ -524,18 +657,24 @@ fs_instruction_scheduler::update_register_pressure(backend_instruction *be) { fs_inst *inst = (fs_inst *)be; - if (!remaining_grf_uses) + if (!reads_remaining) return; if (inst->dst.file == GRF) { - remaining_grf_uses[inst->dst.reg]--; - grf_active[inst->dst.reg] = true; + written[inst->dst.reg] = true; } for (int i = 0; i < inst->sources; i++) { + if (is_src_duplicate(inst, i)) + continue; + if (inst->src[i].file == GRF) { - remaining_grf_uses[inst->src[i].reg]--; - grf_active[inst->src[i].reg] = true; + reads_remaining[inst->src[i].reg]--; + } else if (inst->src[i].file == HW_REG && + inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE && + inst->src[i].fixed_hw_reg.nr < hw_reg_count) { + for (int off = 0; off < inst->regs_read(i); off++) + hw_reads_remaining[inst->src[i].fixed_hw_reg.nr + off]--; } } } @@ -547,20 +686,31 @@ fs_instruction_scheduler::get_register_pressure_benefit(backend_instruction *be) int benefit = 0; if (inst->dst.file == GRF) { - if (remaining_grf_uses[inst->dst.reg] == 1) - benefit += v->alloc.sizes[inst->dst.reg]; - if (!grf_active[inst->dst.reg]) + if (!BITSET_TEST(livein[block_idx], inst->dst.reg) && + !written[inst->dst.reg]) benefit -= v->alloc.sizes[inst->dst.reg]; } for (int i = 0; i < inst->sources; i++) { - if (inst->src[i].file != GRF) + if (is_src_duplicate(inst, i)) continue; - if (remaining_grf_uses[inst->src[i].reg] == 1) + if (inst->src[i].file == GRF && + !BITSET_TEST(liveout[block_idx], inst->src[i].reg) && + reads_remaining[inst->src[i].reg] == 1) benefit += v->alloc.sizes[inst->src[i].reg]; - if (!grf_active[inst->src[i].reg]) - benefit -= v->alloc.sizes[inst->src[i].reg]; + + if (inst->src[i].file == HW_REG && + inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE && + inst->src[i].fixed_hw_reg.nr < hw_reg_count) { + for (int off = 0; off < inst->regs_read(i); off++) { + int reg = inst->src[i].fixed_hw_reg.nr + off; + if (!BITSET_TEST(hw_liveout[block_idx], reg) && + hw_reads_remaining[reg] == 1) { + benefit++; + } + } + } } return benefit; @@ -575,20 +725,26 @@ public: int issue_time(backend_instruction *inst); vec4_visitor *v; - void count_remaining_grf_uses(backend_instruction *inst); + void count_reads_remaining(backend_instruction *inst); + void setup_liveness(cfg_t *cfg); void update_register_pressure(backend_instruction *inst); int get_register_pressure_benefit(backend_instruction *inst); }; vec4_instruction_scheduler::vec4_instruction_scheduler(vec4_visitor *v, int grf_count) - : instruction_scheduler(v, grf_count, SCHEDULE_POST), + : instruction_scheduler(v, grf_count, 0, 0, SCHEDULE_POST), v(v) { } void -vec4_instruction_scheduler::count_remaining_grf_uses(backend_instruction *be) +vec4_instruction_scheduler::count_reads_remaining(backend_instruction *be) +{ +} + +void +vec4_instruction_scheduler::setup_liveness(cfg_t *cfg) { } @@ -1387,6 +1543,9 @@ instruction_scheduler::schedule_instructions(bblock_t *block) const struct brw_device_info *devinfo = bs->devinfo; backend_instruction *inst = block->end(); time = 0; + if (!post_reg_alloc) + reg_pressure = reg_pressure_in[block->num]; + block_idx = block->num; /* Remove non-DAG heads from the list. */ foreach_in_list_safe(schedule_node, n, &instructions) { @@ -1403,7 +1562,11 @@ instruction_scheduler::schedule_instructions(bblock_t *block) chosen->remove(); inst->insert_before(block, chosen->inst); instructions_to_schedule--; - update_register_pressure(chosen->inst); + + if (!post_reg_alloc) { + reg_pressure -= get_register_pressure_benefit(chosen->inst); + update_register_pressure(chosen->inst); + } /* If we expected a delay for scheduling, then bump the clock to reflect * that. In reality, the hardware will switch to another hyperthread @@ -1421,6 +1584,8 @@ instruction_scheduler::schedule_instructions(bblock_t *block) if (debug) { fprintf(stderr, "clock %4d, scheduled: ", time); bs->dump_instruction(chosen->inst); + if (!post_reg_alloc) + fprintf(stderr, "(register pressure %d)\n", reg_pressure); } /* Now that we've scheduled a new instruction, some of its @@ -1490,25 +1655,30 @@ static unsigned get_cycle_count(cfg_t *cfg) void instruction_scheduler::run(cfg_t *cfg) { - if (debug) { + if (debug && !post_reg_alloc) { fprintf(stderr, "\nInstructions before scheduling (reg_alloc %d)\n", post_reg_alloc); - bs->dump_instructions(); + bs->dump_instructions(); } - /* Populate the remaining GRF uses array to improve the pre-regalloc - * scheduling. - */ - if (remaining_grf_uses) { - foreach_block_and_inst(block, backend_instruction, inst, cfg) { - count_remaining_grf_uses(inst); - } - } + if (!post_reg_alloc) + setup_liveness(cfg); foreach_block(block, cfg) { if (block->end_ip - block->start_ip <= 1) continue; + if (reads_remaining) { + memset(reads_remaining, 0, + grf_count * sizeof(*reads_remaining)); + memset(hw_reads_remaining, 0, + hw_reg_count * sizeof(*hw_reads_remaining)); + memset(written, 0, grf_count * sizeof(*written)); + + foreach_inst_in_block(fs_inst, inst, block) + count_reads_remaining(inst); + } + add_insts_from_block(block); calculate_deps(); @@ -1520,7 +1690,7 @@ instruction_scheduler::run(cfg_t *cfg) schedule_instructions(block); } - if (debug) { + if (debug && !post_reg_alloc) { fprintf(stderr, "\nInstructions after scheduling (reg_alloc %d)\n", post_reg_alloc); bs->dump_instructions(); @@ -1532,13 +1702,16 @@ instruction_scheduler::run(cfg_t *cfg) void fs_visitor::schedule_instructions(instruction_scheduler_mode mode) { + calculate_live_intervals(); + int grf_count; if (mode == SCHEDULE_POST) grf_count = grf_used; else grf_count = alloc.count; - fs_instruction_scheduler sched(this, grf_count, mode); + fs_instruction_scheduler sched(this, grf_count, first_non_payload_grf, + cfg->num_blocks, mode); sched.run(cfg); if (unlikely(debug_enabled) && mode == SCHEDULE_POST) { -- 2.30.2