From: Eric Anholt Date: Tue, 30 Oct 2012 18:09:59 +0000 (-0700) Subject: i965/fs: Add support for global copy propagation. X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=177c82555b24a80c15c34315ff17437cc39d1ba5;p=mesa.git i965/fs: Add support for global copy propagation. It is common for complicated shaders, particularly code-generated ones, to have a big array of uniforms or attributes, and a prologue in the shader that dereferences from the big array to more informatively-named local variables. Then there will be some small control flow operation (like a ? : statement), and then use of those informatively-named variables. We were emitting extra MOVs in these cases, because copy propagation couldn't reach across control flow. Instead, implement dataflow analysis on the output of the first copy propagation pass and re-run it to propagate those extra MOVs out. On one future Steam release, reduces VS+FS instruction count from 42837 to 41437. No statistically significant performance difference (n=48), though, at least at the low resolution I'm running it at. shader-db results: total instructions in shared programs: 722170 -> 702545 (-2.72%) instructions in affected programs: 260618 -> 240993 (-7.53%) Some shaders do get hurt by up to 2 instructions, because a choice to copy propagate instead of coalesce or something like that results in a dead write sticking around. Given that we already have instances of those instructions in the affected programs (particularly unigine), we should just improve dead code elimination to fix the problem. --- diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h index 13662bb8836..48451767c32 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.h +++ b/src/mesa/drivers/dri/i965/brw_fs.h @@ -249,7 +249,8 @@ public: bool opt_copy_propagate(); bool try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry); bool try_constant_propagate(fs_inst *inst, acp_entry *entry); - bool opt_copy_propagate_local(void *mem_ctx, bblock_t *block); + bool opt_copy_propagate_local(void *mem_ctx, bblock_t *block, + exec_list *acp); bool register_coalesce(); bool register_coalesce_2(); bool compute_to_mrf(); diff --git a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp index cc488a1ec70..dec3dcaef5b 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp @@ -21,6 +21,19 @@ * IN THE SOFTWARE. */ +/** @file brw_fs_copy_propagation.cpp + * + * Support for global copy propagation in two passes: A local pass that does + * intra-block copy (and constant) propagation, and a global pass that uses + * dataflow analysis on the copies available at the end of each block to re-do + * local copy propagation with more copies available. + * + * See Muchnik's Advanced Compiler Design and Implementation, section + * 12.5 (p356). + */ + +#define ACP_HASH_SIZE 16 + #include "brw_fs.h" #include "brw_cfg.h" @@ -29,6 +42,158 @@ struct acp_entry : public exec_node { fs_reg dst; fs_reg src; }; + +struct block_data { + /** + * Which entries in the fs_copy_prop_dataflow acp table are live at the + * start of this block. This is the useful output of the analysis, since + * it lets us plug those into the local copy propagation on the second + * pass. + */ + bool *livein; + + /** + * Which entries in the fs_copy_prop_dataflow acp table are live at the end + * of this block. This is done in initial setup from the per-block acps + * returned by the first local copy prop pass. + */ + bool *liveout; + + /** + * Which entries in the fs_copy_prop_dataflow acp table are killed over the + * course of this block. + */ + bool *kill; +}; + +class fs_copy_prop_dataflow +{ +public: + fs_copy_prop_dataflow(void *mem_ctx, cfg_t *cfg, + exec_list out_acp[][ACP_HASH_SIZE]); + + void setup_kills(); + void run(); + + void *mem_ctx; + cfg_t *cfg; + + acp_entry **acp; + int num_acp; + + struct block_data *bd; +}; +} /* anonymous namespace */ + +fs_copy_prop_dataflow::fs_copy_prop_dataflow(void *mem_ctx, cfg_t *cfg, + exec_list out_acp[][ACP_HASH_SIZE]) + : mem_ctx(mem_ctx), cfg(cfg) +{ + bd = rzalloc_array(mem_ctx, struct block_data, cfg->num_blocks); + + num_acp = 0; + for (int b = 0; b < cfg->num_blocks; b++) { + for (int i = 0; i < ACP_HASH_SIZE; i++) { + foreach_list(entry_node, &out_acp[b][i]) { + num_acp++; + } + } + } + + acp = rzalloc_array(mem_ctx, struct acp_entry *, num_acp); + + int next_acp = 0; + for (int b = 0; b < cfg->num_blocks; b++) { + bd[b].livein = rzalloc_array(bd, bool, num_acp); + bd[b].liveout = rzalloc_array(bd, bool, num_acp); + bd[b].kill = rzalloc_array(bd, bool, num_acp); + + for (int i = 0; i < ACP_HASH_SIZE; i++) { + foreach_list(entry_node, &out_acp[b][i]) { + acp_entry *entry = (acp_entry *)entry_node; + + acp[next_acp] = entry; + bd[b].liveout[next_acp] = true; + next_acp++; + } + } + } + + assert(next_acp == num_acp); + + setup_kills(); + run(); +} + +/** + * Walk the set of instructions in the block, marking which entries in the acp + * are killed by the block. + */ +void +fs_copy_prop_dataflow::setup_kills() +{ + for (int b = 0; b < cfg->num_blocks; b++) { + bblock_t *block = cfg->blocks[b]; + + for (fs_inst *inst = (fs_inst *)block->start; + inst != block->end->next; + inst = (fs_inst *)inst->next) { + if (inst->dst.file != GRF) + continue; + + for (int i = 0; i < num_acp; i++) { + if (inst->overwrites_reg(acp[i]->dst) || + inst->overwrites_reg(acp[i]->src)) { + bd[b].kill[i] = true; + } + } + } + } +} + +/** + * Walk the set of instructions in the block, marking which entries in the acp + * are killed by the block. + */ +void +fs_copy_prop_dataflow::run() +{ + bool cont = true; + + while (cont) { + cont = false; + + for (int b = 0; b < cfg->num_blocks; b++) { + for (int i = 0; i < num_acp; i++) { + if (!bd[b].liveout[i]) { + /* Update liveout */ + if (bd[b].livein[i] && !bd[b].kill[i]) { + bd[b].liveout[i] = true; + cont = true; + } + } + + if (!bd[b].livein[i]) { + /* Update livein: if it's live at the end of all parents, it's + * live at our start. + */ + bool add = true; + foreach_list(block_node, &cfg->blocks[b]->parents) { + bblock_link *link = (bblock_link *)block_node; + bblock_t *block = link->block; + if (!bd[block->block_num].liveout[i]) { + add = false; + break; + } + } + if (add) { + bd[b].livein[i] = true; + cont = true; + } + } + } + } + } } bool @@ -183,25 +348,14 @@ fs_visitor::try_constant_propagate(fs_inst *inst, acp_entry *entry) return progress; } - -/** @file brw_fs_copy_propagation.cpp - * - * Support for local copy propagation by walking the list of instructions - * and maintaining the ACP table of available copies for propagation. - * - * See Muchnik's Advanced Compiler Design and Implementation, section - * 12.5 (p356). - */ - /* Walks a basic block and does copy propagation on it using the acp * list. */ bool -fs_visitor::opt_copy_propagate_local(void *mem_ctx, bblock_t *block) +fs_visitor::opt_copy_propagate_local(void *mem_ctx, bblock_t *block, + exec_list *acp) { bool progress = false; - int acp_count = 16; - exec_list acp[acp_count]; for (fs_inst *inst = (fs_inst *)block->start; inst != block->end->next; @@ -212,7 +366,7 @@ fs_visitor::opt_copy_propagate_local(void *mem_ctx, bblock_t *block) if (inst->src[i].file != GRF) continue; - foreach_list(entry_node, &acp[inst->src[i].reg % acp_count]) { + foreach_list(entry_node, &acp[inst->src[i].reg % ACP_HASH_SIZE]) { acp_entry *entry = (acp_entry *)entry_node; if (try_constant_propagate(inst, entry)) @@ -225,7 +379,7 @@ fs_visitor::opt_copy_propagate_local(void *mem_ctx, bblock_t *block) /* kill the destination from the ACP */ if (inst->dst.file == GRF) { - foreach_list_safe(entry_node, &acp[inst->dst.reg % acp_count]) { + foreach_list_safe(entry_node, &acp[inst->dst.reg % ACP_HASH_SIZE]) { acp_entry *entry = (acp_entry *)entry_node; if (inst->overwrites_reg(entry->dst)) { @@ -236,7 +390,7 @@ fs_visitor::opt_copy_propagate_local(void *mem_ctx, bblock_t *block) /* Oops, we only have the chaining hash based on the destination, not * the source, so walk across the entire table. */ - for (int i = 0; i < acp_count; i++) { + for (int i = 0; i < ACP_HASH_SIZE; i++) { foreach_list_safe(entry_node, &acp[i]) { acp_entry *entry = (acp_entry *)entry_node; if (inst->overwrites_reg(entry->src)) @@ -263,7 +417,7 @@ fs_visitor::opt_copy_propagate_local(void *mem_ctx, bblock_t *block) acp_entry *entry = ralloc(mem_ctx, acp_entry); entry->dst = inst->dst; entry->src = inst->src[0]; - acp[entry->dst.reg % acp_count].push_tail(entry); + acp[entry->dst.reg % ACP_HASH_SIZE].push_tail(entry); } } @@ -275,13 +429,37 @@ fs_visitor::opt_copy_propagate() { bool progress = false; void *mem_ctx = ralloc_context(this->mem_ctx); - cfg_t cfg(this); + exec_list out_acp[cfg.num_blocks][ACP_HASH_SIZE]; + /* First, walk through each block doing local copy propagation and getting + * the set of copies available at the end of the block. + */ for (int b = 0; b < cfg.num_blocks; b++) { bblock_t *block = cfg.blocks[b]; - progress = opt_copy_propagate_local(mem_ctx, block) || progress; + progress = opt_copy_propagate_local(mem_ctx, block, + out_acp[b]) || progress; + } + + /* Do dataflow analysis for those available copies. */ + fs_copy_prop_dataflow dataflow(mem_ctx, &cfg, out_acp); + + /* Next, re-run local copy propagation, this time with the set of copies + * provided by the dataflow analysis available at the start of a block. + */ + for (int b = 0; b < cfg.num_blocks; b++) { + bblock_t *block = cfg.blocks[b]; + exec_list in_acp[ACP_HASH_SIZE]; + + for (int i = 0; i < dataflow.num_acp; i++) { + if (dataflow.bd[b].livein[i]) { + struct acp_entry *entry = dataflow.acp[i]; + in_acp[entry->dst.reg % ACP_HASH_SIZE].push_tail(entry); + } + } + + progress = opt_copy_propagate_local(mem_ctx, block, in_acp) || progress; } ralloc_free(mem_ctx);