From af9296b8c075d109426ecd1686211088e618103e Mon Sep 17 00:00:00 2001 From: Connor Abbott Date: Wed, 25 Sep 2019 14:02:48 +0200 Subject: [PATCH] nir/sink: Rewrite loop handling logic MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Previously, for code like: loop { loop { a = load_ubo() } use(a) } adjust_block_for_loops() would return the block before the first loop. Now we compute the range of allowed blocks and then walk the dominance tree directly, guaranteeing directly that we always choose a block that dominates all the uses and is dominated by the definition. Reviewed-by: Daniel Schürmann --- src/compiler/nir/nir_opt_sink.c | 75 ++++++++++++++++++--------------- 1 file changed, 40 insertions(+), 35 deletions(-) diff --git a/src/compiler/nir/nir_opt_sink.c b/src/compiler/nir/nir_opt_sink.c index bdeedde5f48..f6c7093a0d3 100644 --- a/src/compiler/nir/nir_opt_sink.c +++ b/src/compiler/nir/nir_opt_sink.c @@ -76,23 +76,36 @@ get_innermost_loop(nir_cf_node *node) return NULL; } -/* return last block not after use_block with def_loop as it's innermost loop */ -static nir_block * -adjust_block_for_loops(nir_block *use_block, nir_loop *def_loop) +static bool +loop_contains_block(nir_loop *loop, nir_block *block) { - nir_loop *use_loop = NULL; + nir_block *before = nir_cf_node_as_block(nir_cf_node_prev(&loop->cf_node)); + nir_block *after = nir_cf_node_as_block(nir_cf_node_next(&loop->cf_node)); - for (nir_cf_node *node = &use_block->cf_node; node != NULL; node = node->parent) { - if (def_loop && node == &def_loop->cf_node) - break; - if (node->type == nir_cf_node_loop) - use_loop = nir_cf_node_as_loop(node); - } - if (use_loop) { - return nir_block_cf_tree_prev(nir_loop_first_block(use_loop)); - } else { - return use_block; + return block->index > before->index && block->index < after->index; +} + +/* Given the LCA of all uses and the definition, find a block on the path + * between them in the dominance tree that is outside of as many loops as + * possible. + */ + +static nir_block * +adjust_block_for_loops(nir_block *use_block, nir_block *def_block) +{ + for (nir_block *cur_block = use_block; cur_block != def_block->imm_dom; + cur_block = cur_block->imm_dom) { + nir_cf_node *next = nir_cf_node_next(&cur_block->cf_node); + if (next && next->type == nir_cf_node_loop) { + nir_loop *following_loop = nir_cf_node_as_loop(next); + if (loop_contains_block(following_loop, use_block)) { + use_block = cur_block; + continue; + } + } } + + return use_block; } /* iterate a ssa def's use's and try to find a more optimal block to @@ -106,10 +119,6 @@ get_preferred_block(nir_ssa_def *def, bool sink_into_loops) { nir_block *lca = NULL; - nir_loop *def_loop = NULL; - if (!sink_into_loops) - def_loop = get_innermost_loop(&def->parent_instr->block->cf_node); - nir_foreach_use(use, def) { nir_instr *instr = use->parent_instr; nir_block *use_block = instr->block; @@ -131,18 +140,6 @@ get_preferred_block(nir_ssa_def *def, bool sink_into_loops) use_block = phi_lca; } - /* If we're moving a load_ubo or load_interpolated_input, we don't want to - * sink it down into loops, which may result in accessing memory or shared - * functions multiple times. Sink it just above the start of the loop - * where it's used. For load_consts, undefs, and comparisons, we expect - * the driver to be able to emit them as simple ALU ops, so sinking as far - * in as we can go is probably worth it for register pressure. - */ - if (!sink_into_loops) { - use_block = adjust_block_for_loops(use_block, def_loop); - assert(nir_block_dominates(def->parent_instr->block, use_block)); - } - lca = nir_dominance_lca(lca, use_block); } @@ -150,14 +147,22 @@ get_preferred_block(nir_ssa_def *def, bool sink_into_loops) nir_block *use_block = nir_cf_node_as_block(nir_cf_node_prev(&use->parent_if->cf_node)); - if (!sink_into_loops) { - use_block = adjust_block_for_loops(use_block, def_loop); - assert(nir_block_dominates(def->parent_instr->block, use_block)); - } - lca = nir_dominance_lca(lca, use_block); } + /* If we're moving a load_ubo or load_interpolated_input, we don't want to + * sink it down into loops, which may result in accessing memory or shared + * functions multiple times. Sink it just above the start of the loop + * where it's used. For load_consts, undefs, and comparisons, we expect + * the driver to be able to emit them as simple ALU ops, so sinking as far + * in as we can go is probably worth it for register pressure. + */ + if (!sink_into_loops) { + lca = adjust_block_for_loops(lca, def->parent_instr->block); + assert(nir_block_dominates(def->parent_instr->block, lca)); + } + + return lca; } -- 2.30.2