lima/ppir: duplicate intrinsics in nir
author: Erico Nunes <nunes.erico@gmail.com>
Mon, 13 Apr 2020 13:30:25 +0000 (15:30 +0200)
committer: Marge Bot <eric+marge@anholt.net>
Sat, 9 May 2020 11:30:07 +0000 (11:30 +0000)
Move the duplicate uniform and varying steps to a nir pass, along with
some changes in the duplicating strategy.

Node duplication is now done per user of the varying/uniform. This is
inspired by what the offline shader compiler seems to usually do, and as
usual aims to reduce register pressure and better utilize the ld_uni and
ld_var instruction slots.
It is worth noting that due to a bug/feature, ppir was already
duplicating uniforms per successor in ppir_node_add_src even if the
comment indicated it was meant to be per-block.
Additionally, ppir was duplicating load uniform nodes twice for nodes
that use the same uniform in more than one source, resulting in one
unnecessary (and unpipelineable) load. This new implementation in nir
only creates one load in that case.

Signed-off-by: Erico Nunes <nunes.erico@gmail.com>
Reviewed-by: Vasily Khoruzhick <anarsoul@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4535>

src/gallium/drivers/lima/Android.mk
src/gallium/drivers/lima/ir/lima_ir.h
src/gallium/drivers/lima/ir/lima_nir_duplicate_intrinsic.c [new file with mode: 0644]
src/gallium/drivers/lima/ir/pp/nir.c
src/gallium/drivers/lima/lima_program.c
src/gallium/drivers/lima/meson.build

index df2531c8eaa837be93411f17788e5cfa27846be0..e0a4eda56eb5bfaf935c1544686f1dbf9558ae9b 100644 (file)
@@ -36,6 +36,7 @@ LOCAL_SRC_FILES := \
        ir/gp/reduce_scheduler.c \
        ir/gp/scheduler.c \
        ir/lima_ir.h \
+       ir/lima_nir_duplicate_intrinsic.c \
        ir/lima_nir_lower_uniform_to_scalar.c \
        ir/lima_nir_split_load_input.c \
        ir/pp/codegen.c \
index 8bd72c55abea6c35e81c89fa609fc1f070ff8093..d84bf96927677d74272040057a1e18eb714d4d2e 100644 (file)
@@ -68,4 +68,7 @@ bool lima_nir_scale_trig(nir_shader *shader);
 bool lima_nir_lower_ftrunc(nir_shader *shader);
 bool lima_nir_split_load_input(nir_shader *shader);
 
+void lima_nir_duplicate_load_inputs(nir_shader *shader);
+void lima_nir_duplicate_load_uniforms(nir_shader *shader);
+
 #endif
diff --git a/src/gallium/drivers/lima/ir/lima_nir_duplicate_intrinsic.c b/src/gallium/drivers/lima/ir/lima_nir_duplicate_intrinsic.c
new file mode 100644 (file)
index 0000000..8d1c7dc
--- /dev/null
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2020 Lima Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "nir.h"
+#include "nir_builder.h"
+#include "lima_ir.h"
+
+/* Replace one intrinsic load with a separate copy for each instruction
+ * (or if-condition) that uses its result, then remove the original.
+ *
+ * b:   builder used to create and insert the copies
+ * itr: intrinsic whose SSA destination is being split up
+ * op:  intrinsic opcode given to every copy
+ *
+ * Always returns true. */
+static bool
+lima_nir_duplicate_intrinsic(nir_builder *b, nir_intrinsic_instr *itr,
+                             nir_intrinsic_op op)
+{
+   /* Copy created on the previous loop iteration and the user it was
+    * created for; used to share one copy between consecutive uses that
+    * belong to the same user. */
+   nir_intrinsic_instr *last_dupl = NULL;
+   nir_instr *last_parent_instr = NULL;
+
+   /* First pass: regular SSA uses by other instructions. The _safe variant
+    * is used because each use is rewritten while iterating. */
+   nir_foreach_use_safe(use_src, &itr->dest.ssa) {
+      nir_intrinsic_instr *dupl;
+
+      /* Only create a new copy when this use belongs to a different
+       * instruction than the previous one.
+       * NOTE(review): this only detects repeated users that are adjacent in
+       * the use list -- confirm that ordering is guaranteed. */
+      if (last_parent_instr != use_src->parent_instr) {
+         /* Place the copy right before the instruction that consumes it */
+         b->cursor = nir_before_instr(use_src->parent_instr);
+         dupl = nir_intrinsic_instr_create(b->shader, op);
+         /* Carry over the component count, all const indices and the first
+          * source; no other sources are copied. */
+         dupl->num_components = itr->num_components;
+         memcpy(dupl->const_index, itr->const_index, sizeof(itr->const_index));
+         dupl->src[0].is_ssa = itr->src[0].is_ssa;
+         if (itr->src[0].is_ssa)
+            dupl->src[0].ssa = itr->src[0].ssa;
+         else
+            dupl->src[0].reg = itr->src[0].reg;
+
+         /* New SSA destination with the same size, bit size and name as
+          * the original */
+         nir_ssa_dest_init(&dupl->instr, &dupl->dest,
+               dupl->num_components, itr->dest.ssa.bit_size,
+               itr->dest.ssa.name);
+
+         /* Mark the copy so that lima_nir_duplicate_intrinsic_impl skips it
+          * instead of duplicating it again */
+         dupl->instr.pass_flags = 1;
+         nir_builder_instr_insert(b, &dupl->instr);
+      }
+      else {
+         dupl = last_dupl;
+      }
+
+      /* Point this use at the copy instead of the original */
+      nir_instr_rewrite_src(use_src->parent_instr, use_src, nir_src_for_ssa(&dupl->dest.ssa));
+      last_parent_instr = use_src->parent_instr;
+      last_dupl = dupl;
+   }
+
+   /* Start the second pass with no remembered copy */
+   last_dupl = NULL;
+   last_parent_instr = NULL;
+
+   /* Second pass: uses as the condition of an if statement */
+   nir_foreach_if_use_safe(use_src, &itr->dest.ssa) {
+      nir_intrinsic_instr *dupl;
+
+      /* NOTE(review): this compares parent_instr although the use is
+       * rewritten through parent_if below -- confirm against the nir_src
+       * definition that reading parent_instr here is intended. */
+      if (last_parent_instr != use_src->parent_instr) {
+         /* For an if-condition use, the copy is inserted at the position of
+          * the original instruction rather than next to the user */
+         b->cursor = nir_before_instr(&itr->instr);
+         dupl = nir_intrinsic_instr_create(b->shader, op);
+         /* Same fields carried over as in the first pass */
+         dupl->num_components = itr->num_components;
+         memcpy(dupl->const_index, itr->const_index, sizeof(itr->const_index));
+         dupl->src[0].is_ssa = itr->src[0].is_ssa;
+         if (itr->src[0].is_ssa)
+            dupl->src[0].ssa = itr->src[0].ssa;
+         else
+            dupl->src[0].reg = itr->src[0].reg;
+
+         nir_ssa_dest_init(&dupl->instr, &dupl->dest,
+               dupl->num_components, itr->dest.ssa.bit_size,
+               itr->dest.ssa.name);
+
+         /* Mark as a copy, see the first pass */
+         dupl->instr.pass_flags = 1;
+         nir_builder_instr_insert(b, &dupl->instr);
+      }
+      else {
+         dupl = last_dupl;
+      }
+
+      /* Make the if statement test the copy instead of the original */
+      nir_if_rewrite_condition(use_src->parent_if, nir_src_for_ssa(&dupl->dest.ssa));
+      last_parent_instr = use_src->parent_instr;
+      last_dupl = dupl;
+   }
+
+   /* The original is removed unconditionally once both passes are done */
+   nir_instr_remove(&itr->instr);
+   return true;
+}
+
+/* Run lima_nir_duplicate_intrinsic on every intrinsic of type 'op' with an
+ * SSA destination found in 'impl'.
+ *
+ * shader: not referenced in this function
+ * impl:   function implementation to process
+ * op:     intrinsic opcode to look for and to duplicate */
+static void
+lima_nir_duplicate_intrinsic_impl(nir_shader *shader, nir_function_impl *impl,
+                                  nir_intrinsic_op op)
+{
+   nir_builder builder;
+   nir_builder_init(&builder, impl);
+
+   nir_foreach_block(block, impl) {
+      /* Clear the marker on every instruction of this block; a non-zero
+       * pass_flags is used below to recognize copies created by this pass.
+       * NOTE(review): flags are cleared one block at a time, so a copy that
+       * was inserted into a block visited later gets its marker cleared
+       * before that block is scanned -- confirm this is acceptable. */
+      nir_foreach_instr(instr, block) {
+         instr->pass_flags = 0;
+      }
+
+      /* _safe iteration: instructions are inserted and removed in the loop */
+      nir_foreach_instr_safe(instr, block) {
+         if (instr->type != nir_instr_type_intrinsic)
+            continue;
+
+         nir_intrinsic_instr *itr = nir_instr_as_intrinsic(instr);
+
+         /* Only the requested intrinsic is duplicated */
+         if (itr->intrinsic != op)
+            continue;
+
+         /* Skip copies that were marked by lima_nir_duplicate_intrinsic */
+         if (itr->instr.pass_flags)
+            continue;
+
+         /* Intrinsics without an SSA destination are left alone */
+         if (!itr->dest.is_ssa)
+            continue;
+
+         lima_nir_duplicate_intrinsic(&builder, itr, op);
+      }
+   }
+
+   /* Declare block indices and dominance information as still valid */
+   nir_metadata_preserve(impl, nir_metadata_block_index |
+                               nir_metadata_dominance);
+}
+
+/* Duplicate load uniforms for every user.
+ * Helps by utilizing the load uniform instruction slots that would
+ * otherwise stay empty, and reduces register pressure.
+ *
+ * Processes nir_intrinsic_load_uniform in every function of the shader
+ * that has an implementation. */
+void
+lima_nir_duplicate_load_uniforms(nir_shader *shader)
+{
+   nir_foreach_function(function, shader) {
+      /* Functions without an implementation are skipped */
+      if (function->impl) {
+         lima_nir_duplicate_intrinsic_impl(shader, function->impl, nir_intrinsic_load_uniform);
+      }
+   }
+}
+
+/* Duplicate load inputs for every user.
+ * Helps by utilizing the load input instruction slots that would
+ * otherwise stay empty, and reduces register pressure.
+ *
+ * Processes nir_intrinsic_load_input in every function of the shader
+ * that has an implementation. */
+void
+lima_nir_duplicate_load_inputs(nir_shader *shader)
+{
+   nir_foreach_function(function, shader) {
+      /* Functions without an implementation are skipped */
+      if (function->impl) {
+         lima_nir_duplicate_intrinsic_impl(shader, function->impl, nir_intrinsic_load_input);
+      }
+   }
+}
index 61664a02941b1527c2615d1f0ffd6edbb61c0ba5..91f4be151b3c31f99fd39bff61b3f4de63dd73b0 100644 (file)
@@ -104,42 +104,13 @@ static void ppir_node_add_src(ppir_compiler *comp, ppir_node *node,
       case ppir_op_const:
          child = ppir_node_clone(node->block, child);
          break;
-      case ppir_op_load_varying: {
-         bool is_load_coords = false;
+      case ppir_op_load_varying:
+         /* If at least one successor is load_texture, promote it to
+          * load_coords to ensure that it has exactly one successor */
          if (node->op == ppir_op_load_texture) {
             nir_tex_src *nts = (nir_tex_src *)ns;
             if (nts->src_type == nir_tex_src_coord)
-               is_load_coords = true;
-         }
-
-         if (!is_load_coords) {
-            /* Clone varying loads for each block */
-            if (child->block != node->block) {
-               ppir_node *new = ppir_node_clone(node->block, child);
-               /* If we clone it for every block and there is no user of
-                * the original load left, delete the original one. */
-               ppir_delete_if_orphan(node->block, child);
-               child = new;
-               comp->var_nodes[ns->ssa->index] = child;
-            }
-            break;
-         }
-         /* At least one successor is load_texture, promote it to load_coords
-          * to ensure that is has exactly one successor */
-         child->op = ppir_op_load_coords;
-      }
-         /* Fallthrough */
-      case ppir_op_load_uniform:
-      case ppir_op_load_coords:
-      case ppir_op_load_coords_reg:
-         /* Clone uniform and texture coord loads for each block.
-          * Also ensure that each load has a single successor.
-          * Let's do a fetch each time and hope for a cache hit instead
-          * of increasing reg pressure.
-          */
-         if (child->block != node->block || !ppir_node_is_root(child)) {
-            child = ppir_node_clone(node->block, child);
-            comp->var_nodes[ns->ssa->index] = child;
+               child->op = ppir_op_load_coords;
          }
          break;
       default:
index 19eef27f32028cd823842d77f9d03685f704098c..bd7c40f7f6e59afba2fdc520281694fba853988b 100644 (file)
@@ -248,6 +248,9 @@ lima_program_optimize_fs_nir(struct nir_shader *s,
    NIR_PASS_V(s, nir_move_vec_src_uses_to_dest);
    NIR_PASS_V(s, nir_lower_vec_to_movs);
 
+   NIR_PASS_V(s, lima_nir_duplicate_load_uniforms);
+   NIR_PASS_V(s, lima_nir_duplicate_load_inputs);
+
    nir_sweep(s);
 }
 
index 23766a8a4f392c4f4292480ed794732949d0314a..aae30917bad40194ec2f0767d1fddf0990ab37b0 100644 (file)
@@ -46,6 +46,7 @@ files_lima = files(
   'ir/pp/node_to_instr.c',
   'ir/pp/disasm.c',
 
+  'ir/lima_nir_duplicate_intrinsic.c',
   'ir/lima_nir_lower_uniform_to_scalar.c',
   'ir/lima_nir_split_load_input.c',