nir/algebraic: trivially enable existing 32-bit patterns for all bit sizes

[mesa.git] / src / compiler / nir / nir_from_ssa.c
diff --git a/src/compiler/nir/nir_from_ssa.c b/src/compiler/nir/nir_from_ssa.c

index a8d3e648974447306f30cc9553beebe3ef95ea7e..52c3a2cb33bb9237cd67ba74837dc122b4c1f71d 100644 (file)
--- a/src/compiler/nir/nir_from_ssa.c
+++ b/src/compiler/nir/nir_from_ssa.c
@@ -32,7 +32,7 @@
  /*
   * This file implements an out-of-SSA pass as described in "Revisiting
   * Out-of-SSA Translation for Correctness, Code Quality, and Efficiency" by
- * Boissinot et. al.
+ * Boissinot et al.
   */
  
  struct from_ssa_state {
@@ -41,6 +41,7 @@ struct from_ssa_state {
     bool phi_webs_only;
     struct hash_table *merge_node_table;
     nir_instr *instr;
+   bool progress;
  };
  
  /* Returns true if a dominates b */
@@ -63,16 +64,16 @@ ssa_def_dominates(nir_ssa_def *a, nir_ssa_def *b)
  
  /* The following data structure, which I have named merge_set is a way of
   * representing a set registers of non-interfering registers.  This is
- * based on the concept of a "dominence forest" presented in "Fast Copy
- * Coalescing and Live-Range Identification" by Budimlic et. al. but the
+ * based on the concept of a "dominance forest" presented in "Fast Copy
+ * Coalescing and Live-Range Identification" by Budimlic et al. but the
   * implementation concept is taken from  "Revisiting Out-of-SSA Translation
- * for Correctness, Code Quality, and Efficiency" by Boissinot et. al..
+ * for Correctness, Code Quality, and Efficiency" by Boissinot et al.
   *
   * Each SSA definition is associated with a merge_node and the association
   * is represented by a combination of a hash table and the "def" parameter
   * in the merge_node structure.  The merge_set stores a linked list of
- * merge_node's in dominence order of the ssa definitions.  (Since the
- * liveness analysis pass indexes the SSA values in dominence order for us,
+ * merge_nodes in dominance order of the ssa definitions.  (Since the
+ * liveness analysis pass indexes the SSA values in dominance order for us,
   * this is an easy thing to keep up.)  It is assumed that no pair of the
   * nodes in a given set interfere.  Merging two sets or checking for
   * interference can be done in a single linear-time merge-sort walk of the
@@ -177,7 +178,7 @@ merge_merge_sets(merge_set *a, merge_set *b)
   *
   * This is an implementation of Algorithm 2 in "Revisiting Out-of-SSA
   * Translation for Correctness, Code Quality, and Efficiency" by
- * Boissinot et. al.
+ * Boissinot et al.
   */
  static bool
  merge_sets_interfere(merge_set *a, merge_set *b)
@@ -313,7 +314,7 @@ isolate_phi_nodes_block(nir_block *block, void *dead_ctx)
        last_phi_instr = instr;
     }
  
-   /* If we don't have any phi's, then there's nothing for us to do. */
+   /* If we don't have any phis, then there's nothing for us to do. */
     if (last_phi_instr == NULL)
        return true;
  
@@ -494,7 +495,7 @@ rewrite_ssa_def(nir_ssa_def *def, void *void_state)
     }
  
     nir_ssa_def_rewrite_uses(def, nir_src_for_reg(reg));
-   assert(list_empty(&def->uses) && list_empty(&def->if_uses));
+   assert(list_is_empty(&def->uses) && list_is_empty(&def->if_uses));
  
     if (def->parent_instr->type == nir_instr_type_ssa_undef) {
        /* If it's an ssa_undef instruction, remove it since we know we just got
@@ -503,6 +504,7 @@ rewrite_ssa_def(nir_ssa_def *def, void *void_state)
        nir_instr *parent_instr = def->parent_instr;
        nir_instr_remove(parent_instr);
        ralloc_steal(state->dead_ctx, parent_instr);
+      state->progress = true;
        return true;
     }
  
@@ -514,14 +516,14 @@ rewrite_ssa_def(nir_ssa_def *def, void *void_state)
     nir_dest *dest = exec_node_data(nir_dest, def, ssa);
  
     nir_instr_rewrite_dest(state->instr, dest, nir_dest_for_reg(reg));
-
+   state->progress = true;
     return true;
  }
  
  /* Resolves ssa definitions to registers.  While we're at it, we also
   * remove phi nodes.
   */
-static bool
+static void
  resolve_registers_block(nir_block *block, struct from_ssa_state *state)
  {
     nir_foreach_instr_safe(instr, block) {
@@ -531,11 +533,10 @@ resolve_registers_block(nir_block *block, struct from_ssa_state *state)
        if (instr->type == nir_instr_type_phi) {
           nir_instr_remove(instr);
           ralloc_steal(state->dead_ctx, instr);
+         state->progress = true;
        }
     }
     state->instr = NULL;
-
-   return true;
  }
  
  static void
@@ -550,7 +551,7 @@ emit_copy(nir_builder *b, nir_src src, nir_src dest_src)
     else
        assert(src.reg.reg->num_components >= dest_src.reg.reg->num_components);
  
-   nir_alu_instr *mov = nir_alu_instr_create(b->shader, nir_op_imov);
+   nir_alu_instr *mov = nir_alu_instr_create(b->shader, nir_op_mov);
     nir_src_copy(&mov->src[0].src, &src, mov);
     mov->dest.dest = nir_dest_for_reg(dest_src.reg.reg);
     mov->dest.write_mask = (1 << dest_src.reg.reg->num_components) - 1;
@@ -558,10 +559,10 @@ emit_copy(nir_builder *b, nir_src src, nir_src dest_src)
     nir_builder_instr_insert(b, &mov->instr);
  }
  
-/* Resolves a single parallel copy operation into a sequence of mov's
+/* Resolves a single parallel copy operation into a sequence of movs
   *
   * This is based on Algorithm 1 from "Revisiting Out-of-SSA Translation for
- * Correctness, Code Quality, and Efficiency" by Boissinot et. al..
+ * Correctness, Code Quality, and Efficiency" by Boissinot et al.
   * However, I never got the algorithm to work as written, so this version
   * is slightly modified.
   *
@@ -679,15 +680,17 @@ resolve_parallel_copy(nir_parallel_copy_instr *pcopy,
           int a = pred[b];
           emit_copy(&state->builder, values[loc[a]], values[b]);
  
-         /* If any other copies want a they can find it at b */
-         loc[a] = b;
-
           /* b has been filled, mark it as not needing to be copied */
           pred[b] = -1;
  
-         /* If a needs to be filled, it's ready for copying now */
-         if (pred[a] != -1)
+         /* If a needs to be filled... */
+         if (pred[a] != -1) {
+            /* If any other copies want a they can find it at b */
+            loc[a] = b;
+
+            /* It's ready for copying now */
              ready[++ready_idx] = a;
+         }
        }
        int b = to_do[to_do_idx--];
        if (pred[b] == -1)
@@ -706,10 +709,13 @@ resolve_parallel_copy(nir_parallel_copy_instr *pcopy,
        nir_register *reg = nir_local_reg_create(state->builder.impl);
        reg->name = "copy_temp";
        reg->num_array_elems = 0;
-      if (values[b].is_ssa)
+      if (values[b].is_ssa) {
           reg->num_components = values[b].ssa->num_components;
-      else
+         reg->bit_size = values[b].ssa->bit_size;
+      } else {
           reg->num_components = values[b].reg.reg->num_components;
+         reg->bit_size = values[b].reg.reg->bit_size;
+      }
        values[num_vals].is_ssa = false;
        values[num_vals].reg.reg = reg;
  
@@ -756,7 +762,7 @@ resolve_parallel_copies_block(nir_block *block, struct from_ssa_state *state)
     return true;
  }
  
-static void
+static bool
  nir_convert_from_ssa_impl(nir_function_impl *impl, bool phi_webs_only)
  {
     struct from_ssa_state state;
@@ -764,8 +770,8 @@ nir_convert_from_ssa_impl(nir_function_impl *impl, bool phi_webs_only)
     nir_builder_init(&state.builder, impl);
     state.dead_ctx = ralloc_context(NULL);
     state.phi_webs_only = phi_webs_only;
-   state.merge_node_table = _mesa_hash_table_create(NULL, _mesa_hash_pointer,
-                                                    _mesa_key_pointer_equal);
+   state.merge_node_table = _mesa_pointer_hash_table_create(NULL);
+   state.progress = false;
  
     nir_foreach_block(block, impl) {
        add_parallel_copy_to_end_of_block(block, state.dead_ctx);
@@ -804,26 +810,30 @@ nir_convert_from_ssa_impl(nir_function_impl *impl, bool phi_webs_only)
     /* Clean up dead instructions and the hash tables */
     _mesa_hash_table_destroy(state.merge_node_table, NULL);
     ralloc_free(state.dead_ctx);
+   return state.progress;
  }
  
-void
+bool
  nir_convert_from_ssa(nir_shader *shader, bool phi_webs_only)
  {
+   bool progress = false;
+
     nir_foreach_function(function, shader) {
        if (function->impl)
-         nir_convert_from_ssa_impl(function->impl, phi_webs_only);
+         progress |= nir_convert_from_ssa_impl(function->impl, phi_webs_only);
     }
+
+   return progress;
  }
  
  
  static void
-place_phi_read(nir_shader *shader, nir_register *reg,
-               nir_ssa_def *def, nir_block *block)
+place_phi_read(nir_builder *b, nir_register *reg,
+               nir_ssa_def *def, nir_block *block, unsigned depth)
  {
     if (block != def->parent_instr->block) {
        /* Try to go up the single-successor tree */
        bool all_single_successors = true;
-      struct set_entry *entry;
        set_foreach(block->predecessors, entry) {
           nir_block *pred = (nir_block *)entry->key;
           if (pred->successors[0] && pred->successors[1]) {
@@ -832,29 +842,35 @@ place_phi_read(nir_shader *shader, nir_register *reg,
           }
        }
  
-      if (all_single_successors) {
+      if (all_single_successors && depth < 32) {
           /* All predecessors of this block have exactly one successor and it
            * is this block so they must eventually lead here without
            * intersecting each other.  Place the reads in the predecessors
            * instead of this block.
+          *
+          * We only let this function recurse 32 times because it can recurse
+          * indefinitely in the presence of infinite loops.  Because we're
+          * crawling a single-successor chain, it doesn't matter where we
+          * place it so it's ok to stop at an arbitrary distance.
+          *
+          * TODO: One day, we could detect back edges and avoid the recursion
+          * that way.
            */
-         set_foreach(block->predecessors, entry)
-            place_phi_read(shader, reg, def, (nir_block *)entry->key);
+         set_foreach(block->predecessors, entry) {
+            place_phi_read(b, reg, def, (nir_block *)entry->key, depth + 1);
+         }
           return;
        }
     }
  
-   nir_alu_instr *mov = nir_alu_instr_create(shader, nir_op_imov);
-   mov->src[0].src = nir_src_for_ssa(def);
-   mov->dest.dest = nir_dest_for_reg(reg);
-   mov->dest.write_mask = (1 << reg->num_components) - 1;
-   nir_instr_insert(nir_after_block_before_jump(block), &mov->instr);
+   b->cursor = nir_after_block_before_jump(block);
+   nir_store_reg(b, reg, def, ~0);
  }
  
-/** Lower all of the phi nodes in a block to imov's to and from a register
+/** Lower all of the phi nodes in a block to imovs to and from a register
   *
   * This provides a very quick-and-dirty out-of-SSA pass that you can run on a
- * single block to convert all of it's phis to a register and some imov's.
+ * single block to convert all of its phis to a register and some imovs.
   * The code that is generated, while not optimal for actual codegen in a
   * back-end, is easy to generate, correct, and will turn into the same set of
   * phis after you call regs_to_ssa and do some copy propagation.
@@ -868,8 +884,8 @@ place_phi_read(nir_shader *shader, nir_register *reg,
  bool
  nir_lower_phis_to_regs_block(nir_block *block)
  {
-   nir_function_impl *impl = nir_cf_node_get_function(&block->cf_node);
-   nir_shader *shader = impl->function->shader;
+   nir_builder b;
+   nir_builder_init(&b, nir_cf_node_get_function(&block->cf_node));
  
     bool progress = false;
     nir_foreach_instr_safe(instr, block) {
@@ -879,22 +895,16 @@ nir_lower_phis_to_regs_block(nir_block *block)
        nir_phi_instr *phi = nir_instr_as_phi(instr);
        assert(phi->dest.is_ssa);
  
-      nir_register *reg = create_reg_for_ssa_def(&phi->dest.ssa, impl);
+      nir_register *reg = create_reg_for_ssa_def(&phi->dest.ssa, b.impl);
  
-      nir_alu_instr *mov = nir_alu_instr_create(shader, nir_op_imov);
-      mov->src[0].src = nir_src_for_reg(reg);
-      mov->dest.write_mask = (1 << phi->dest.ssa.num_components) - 1;
-      nir_ssa_dest_init(&mov->instr, &mov->dest.dest,
-                        phi->dest.ssa.num_components, phi->dest.ssa.bit_size,
-                        phi->dest.ssa.name);
-      nir_instr_insert(nir_after_instr(&phi->instr), &mov->instr);
+      b.cursor = nir_after_instr(&phi->instr);
+      nir_ssa_def *def = nir_load_reg(&b, reg);
  
-      nir_ssa_def_rewrite_uses(&phi->dest.ssa,
-                               nir_src_for_ssa(&mov->dest.dest.ssa));
+      nir_ssa_def_rewrite_uses(&phi->dest.ssa, nir_src_for_ssa(def));
  
        nir_foreach_phi_src(src, phi) {
           assert(src->src.is_ssa);
-         place_phi_read(shader, reg, src->src.ssa, src->pred);
+         place_phi_read(&b, reg, src->src.ssa, src->pred, 0);
        }
  
        nir_instr_remove(&phi->instr);
@@ -932,6 +942,23 @@ dest_replace_ssa_with_reg(nir_dest *dest, void *void_state)
     return true;
  }
  
+static bool
+ssa_def_is_local_to_block(nir_ssa_def *def, UNUSED void *state)
+{
+   nir_block *block = def->parent_instr->block;
+   nir_foreach_use(use_src, def) {
+      if (use_src->parent_instr->block != block ||
+          use_src->parent_instr->type == nir_instr_type_phi) {
+         return false;
+      }
+   }
+
+   if (!list_is_empty(&def->if_uses))
+      return false;
+
+   return true;
+}
+
  /** Lower all of the SSA defs in a block to registers
   *
   * This performs the very simple operation of blindly replacing all of the SSA
@@ -962,11 +989,16 @@ nir_lower_ssa_defs_to_regs_block(nir_block *block)
           nir_register *reg = create_reg_for_ssa_def(&load->def, state.impl);
           nir_ssa_def_rewrite_uses(&load->def, nir_src_for_reg(reg));
  
-         nir_alu_instr *mov = nir_alu_instr_create(shader, nir_op_imov);
+         nir_alu_instr *mov = nir_alu_instr_create(shader, nir_op_mov);
           mov->src[0].src = nir_src_for_ssa(&load->def);
           mov->dest.dest = nir_dest_for_reg(reg);
           mov->dest.write_mask = (1 << reg->num_components) - 1;
           nir_instr_insert(nir_after_instr(&load->instr), &mov->instr);
+      } else if (nir_foreach_ssa_def(instr, ssa_def_is_local_to_block, NULL)) {
+         /* If the SSA def produced by this instruction is only in the block
+          * in which it is defined and is not used by ifs or phis, then we
+          * don't have a reason to convert it to a register.
+          */
        } else {
           nir_foreach_dest(instr, dest_replace_ssa_with_reg, &state);
        }