From 5e6c3861182cbf481ce3d25d0bbbcad4916c92dd Mon Sep 17 00:00:00 2001 From: Erico Nunes Date: Mon, 13 Apr 2020 15:30:25 +0200 Subject: [PATCH] lima/ppir: duplicate intrinsics in nir Move the duplicate uniform and varying steps to a nir pass, along with some changes in the duplicating strategy. Node duplication is now done per user of the varying/uniform. This is inspired by what the offline shader compiler seems to usually do, and as usual aims to reduce register pressure and better utilize the ld_uni and ld_var instruction slots. It is worth noting that due to a bug/feature, ppir was already duplicating uniforms per successor in ppir_node_add_src even if the comment indicated it was meant to be per-block. Additionally, ppir was duplicating load uniform nodes twice for nodes that use the same uniform in more than one source, resulting in one unnecessary (and unpipelineable) load. This new implementation in nir only creates one load in that case. Signed-off-by: Erico Nunes Reviewed-by: Vasily Khoruzhick Part-of: --- src/gallium/drivers/lima/Android.mk | 1 + src/gallium/drivers/lima/ir/lima_ir.h | 3 + .../lima/ir/lima_nir_duplicate_intrinsic.c | 163 ++++++++++++++++++ src/gallium/drivers/lima/ir/pp/nir.c | 37 +--- src/gallium/drivers/lima/lima_program.c | 3 + src/gallium/drivers/lima/meson.build | 1 + 6 files changed, 175 insertions(+), 33 deletions(-) create mode 100644 src/gallium/drivers/lima/ir/lima_nir_duplicate_intrinsic.c diff --git a/src/gallium/drivers/lima/Android.mk b/src/gallium/drivers/lima/Android.mk index df2531c8eaa..e0a4eda56eb 100644 --- a/src/gallium/drivers/lima/Android.mk +++ b/src/gallium/drivers/lima/Android.mk @@ -36,6 +36,7 @@ LOCAL_SRC_FILES := \ ir/gp/reduce_scheduler.c \ ir/gp/scheduler.c \ ir/lima_ir.h \ + ir/lima_nir_duplicate_intrinsic.c \ ir/lima_nir_lower_uniform_to_scalar.c \ ir/lima_nir_split_load_input.c \ ir/pp/codegen.c \ diff --git a/src/gallium/drivers/lima/ir/lima_ir.h b/src/gallium/drivers/lima/ir/lima_ir.h index 8bd72c55abe..d84bf969276 100644 --- a/src/gallium/drivers/lima/ir/lima_ir.h +++ b/src/gallium/drivers/lima/ir/lima_ir.h @@ -68,4 +68,7 @@ bool lima_nir_scale_trig(nir_shader *shader); bool lima_nir_lower_ftrunc(nir_shader *shader); bool lima_nir_split_load_input(nir_shader *shader); +void lima_nir_duplicate_load_inputs(nir_shader *shader); +void lima_nir_duplicate_load_uniforms(nir_shader *shader); + #endif diff --git a/src/gallium/drivers/lima/ir/lima_nir_duplicate_intrinsic.c b/src/gallium/drivers/lima/ir/lima_nir_duplicate_intrinsic.c new file mode 100644 index 00000000000..8d1c7dc5c32 --- /dev/null +++ b/src/gallium/drivers/lima/ir/lima_nir_duplicate_intrinsic.c @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2020 Lima Project + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "nir.h" +#include "nir_builder.h" +#include "lima_ir.h" + +static bool +lima_nir_duplicate_intrinsic(nir_builder *b, nir_intrinsic_instr *itr, + nir_intrinsic_op op) +{ + nir_intrinsic_instr *last_dupl = NULL; + nir_instr *last_parent_instr = NULL; + + nir_foreach_use_safe(use_src, &itr->dest.ssa) { + nir_intrinsic_instr *dupl; + + if (last_parent_instr != use_src->parent_instr) { + /* if ssa use, clone for the target block */ + b->cursor = nir_before_instr(use_src->parent_instr); + dupl = nir_intrinsic_instr_create(b->shader, op); + dupl->num_components = itr->num_components; + memcpy(dupl->const_index, itr->const_index, sizeof(itr->const_index)); + dupl->src[0].is_ssa = itr->src[0].is_ssa; + if (itr->src[0].is_ssa) + dupl->src[0].ssa = itr->src[0].ssa; + else + dupl->src[0].reg = itr->src[0].reg; + + nir_ssa_dest_init(&dupl->instr, &dupl->dest, + dupl->num_components, itr->dest.ssa.bit_size, + itr->dest.ssa.name); + + dupl->instr.pass_flags = 1; + nir_builder_instr_insert(b, &dupl->instr); + } + else { + dupl = last_dupl; + } + + nir_instr_rewrite_src(use_src->parent_instr, use_src, nir_src_for_ssa(&dupl->dest.ssa)); + last_parent_instr = use_src->parent_instr; + last_dupl = dupl; + } + + last_dupl = NULL; + last_parent_instr = NULL; + + nir_foreach_if_use_safe(use_src, &itr->dest.ssa) { + nir_intrinsic_instr *dupl; + + if (last_parent_instr != use_src->parent_instr) { + /* if 'if use', clone where it is */ + b->cursor = nir_before_instr(&itr->instr); + dupl = nir_intrinsic_instr_create(b->shader, op); + dupl->num_components = itr->num_components; + memcpy(dupl->const_index, itr->const_index, sizeof(itr->const_index)); + dupl->src[0].is_ssa = itr->src[0].is_ssa; + if (itr->src[0].is_ssa) + dupl->src[0].ssa = itr->src[0].ssa; + else + dupl->src[0].reg = itr->src[0].reg; + + nir_ssa_dest_init(&dupl->instr, &dupl->dest, + dupl->num_components, itr->dest.ssa.bit_size, + itr->dest.ssa.name); + + dupl->instr.pass_flags = 1; + nir_builder_instr_insert(b, &dupl->instr); + } + else { + dupl = last_dupl; + } + + nir_if_rewrite_condition(use_src->parent_if, nir_src_for_ssa(&dupl->dest.ssa)); + last_parent_instr = use_src->parent_instr; + last_dupl = dupl; + } + + nir_instr_remove(&itr->instr); + return true; +} + +static void +lima_nir_duplicate_intrinsic_impl(nir_shader *shader, nir_function_impl *impl, + nir_intrinsic_op op) +{ + nir_builder builder; + nir_builder_init(&builder, impl); + + nir_foreach_block(block, impl) { + nir_foreach_instr(instr, block) { + instr->pass_flags = 0; + } + + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *itr = nir_instr_as_intrinsic(instr); + + if (itr->intrinsic != op) + continue; + + if (itr->instr.pass_flags) + continue; + + if (!itr->dest.is_ssa) + continue; + + lima_nir_duplicate_intrinsic(&builder, itr, op); + } + } + + nir_metadata_preserve(impl, nir_metadata_block_index | + nir_metadata_dominance); +} + +/* Duplicate load uniforms for every user. + * Helps by utilizing the load uniform instruction slots that would + * otherwise stay empty, and reduces register pressure. */ +void +lima_nir_duplicate_load_uniforms(nir_shader *shader) +{ + nir_foreach_function(function, shader) { + if (function->impl) { + lima_nir_duplicate_intrinsic_impl(shader, function->impl, nir_intrinsic_load_uniform); + } + } +} + +/* Duplicate load inputs for every user. + * Helps by utilizing the load input instruction slots that would + * otherwise stay empty, and reduces register pressure. */ +void +lima_nir_duplicate_load_inputs(nir_shader *shader) +{ + nir_foreach_function(function, shader) { + if (function->impl) { + lima_nir_duplicate_intrinsic_impl(shader, function->impl, nir_intrinsic_load_input); + } + } +} diff --git a/src/gallium/drivers/lima/ir/pp/nir.c b/src/gallium/drivers/lima/ir/pp/nir.c index 61664a02941..91f4be151b3 100644 --- a/src/gallium/drivers/lima/ir/pp/nir.c +++ b/src/gallium/drivers/lima/ir/pp/nir.c @@ -104,42 +104,13 @@ static void ppir_node_add_src(ppir_compiler *comp, ppir_node *node, case ppir_op_const: child = ppir_node_clone(node->block, child); break; - case ppir_op_load_varying: { - bool is_load_coords = false; + case ppir_op_load_varying: + /* If at least one successor is load_texture, promote it to + * load_coords to ensure that is has exactly one successor */ if (node->op == ppir_op_load_texture) { nir_tex_src *nts = (nir_tex_src *)ns; if (nts->src_type == nir_tex_src_coord) - is_load_coords = true; - } - - if (!is_load_coords) { - /* Clone varying loads for each block */ - if (child->block != node->block) { - ppir_node *new = ppir_node_clone(node->block, child); - /* If we clone it for every block and there is no user of - * the original load left, delete the original one. */ - ppir_delete_if_orphan(node->block, child); - child = new; - comp->var_nodes[ns->ssa->index] = child; - } - break; - } - /* At least one successor is load_texture, promote it to load_coords - * to ensure that is has exactly one successor */ - child->op = ppir_op_load_coords; - } - /* Fallthrough */ - case ppir_op_load_uniform: - case ppir_op_load_coords: - case ppir_op_load_coords_reg: - /* Clone uniform and texture coord loads for each block. - * Also ensure that each load has a single successor. - * Let's do a fetch each time and hope for a cache hit instead - * of increasing reg pressure. - */ - if (child->block != node->block || !ppir_node_is_root(child)) { - child = ppir_node_clone(node->block, child); - comp->var_nodes[ns->ssa->index] = child; + child->op = ppir_op_load_coords; } break; default: diff --git a/src/gallium/drivers/lima/lima_program.c b/src/gallium/drivers/lima/lima_program.c index 19eef27f320..bd7c40f7f6e 100644 --- a/src/gallium/drivers/lima/lima_program.c +++ b/src/gallium/drivers/lima/lima_program.c @@ -248,6 +248,9 @@ lima_program_optimize_fs_nir(struct nir_shader *s, NIR_PASS_V(s, nir_move_vec_src_uses_to_dest); NIR_PASS_V(s, nir_lower_vec_to_movs); + NIR_PASS_V(s, lima_nir_duplicate_load_uniforms); + NIR_PASS_V(s, lima_nir_duplicate_load_inputs); + nir_sweep(s); } diff --git a/src/gallium/drivers/lima/meson.build b/src/gallium/drivers/lima/meson.build index 23766a8a4f3..aae30917bad 100644 --- a/src/gallium/drivers/lima/meson.build +++ b/src/gallium/drivers/lima/meson.build @@ -46,6 +46,7 @@ files_lima = files( 'ir/pp/node_to_instr.c', 'ir/pp/disasm.c', + 'ir/lima_nir_duplicate_intrinsic.c', 'ir/lima_nir_lower_uniform_to_scalar.c', 'ir/lima_nir_split_load_input.c', -- 2.30.2