glsl_to_nir: fix crashes with int16 shifts
[mesa.git] / src/compiler/nir/nir_opt_gcm.c
index aa20ab1667f69705c1a4fe75f2a1b9090f788250..6129eacd079253d60788e1b80561c7d89443f43f 100644
@@ -54,10 +54,11 @@ struct gcm_instr_info {
 
 /* Flags used in the instr->pass_flags field for various instruction states */
 enum {
-   GCM_INSTR_PINNED =            (1 << 0),
-   GCM_INSTR_SCHEDULED_EARLY =   (1 << 1),
-   GCM_INSTR_SCHEDULED_LATE =    (1 << 2),
-   GCM_INSTR_PLACED =            (1 << 3),
+   GCM_INSTR_PINNED =                (1 << 0),
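+   /* Allow this instruction to be scheduled into an earlier block, but
+    * never into a later one (used for derivative-dependent instructions).
+    */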
+   GCM_INSTR_SCHEDULE_EARLIER_ONLY = (1 << 1),
+   GCM_INSTR_SCHEDULED_EARLY =       (1 << 2),
+   GCM_INSTR_SCHEDULED_LATE =        (1 << 3),
+   GCM_INSTR_PLACED =                (1 << 4),
 };
 
 struct gcm_state {
@@ -108,6 +109,70 @@ gcm_build_block_info(struct exec_list *cf_list, struct gcm_state *state,
    }
 }
 
+static bool
+is_src_scalarizable(nir_src *src)
+{
+   assert(src->is_ssa);
+
+   nir_instr *src_instr = src->ssa->parent_instr;
+   switch (src_instr->type) {
+   case nir_instr_type_alu: {
+      nir_alu_instr *src_alu = nir_instr_as_alu(src_instr);
+
+      /* ALU operations with output_size == 0 should be scalarized.  We
+       * will also see a bunch of vecN operations from scalarizing ALU
+       * operations and, since they can easily be copy-propagated, they
+       * are ok too.
+       */
+      return nir_op_infos[src_alu->op].output_size == 0 ||
+             src_alu->op == nir_op_vec2 ||
+             src_alu->op == nir_op_vec3 ||
+             src_alu->op == nir_op_vec4;
+   }
+
+   case nir_instr_type_load_const:
+      /* These are trivially scalarizable */
+      return true;
+
+   case nir_instr_type_ssa_undef:
+      return true;
+
+   case nir_instr_type_intrinsic: {
+      nir_intrinsic_instr *src_intrin = nir_instr_as_intrinsic(src_instr);
+
+      switch (src_intrin->intrinsic) {
+      case nir_intrinsic_load_deref: {
+         nir_deref_instr *deref = nir_src_as_deref(src_intrin->src[0]);
+         return deref->mode == nir_var_shader_in ||
+                deref->mode == nir_var_uniform ||
+                deref->mode == nir_var_mem_ubo ||
+                deref->mode == nir_var_mem_ssbo ||
+                deref->mode == nir_var_mem_global;
+      }
+
+      case nir_intrinsic_interp_deref_at_centroid:
+      case nir_intrinsic_interp_deref_at_sample:
+      case nir_intrinsic_interp_deref_at_offset:
+      case nir_intrinsic_load_uniform:
+      case nir_intrinsic_load_ubo:
+      case nir_intrinsic_load_ssbo:
+      case nir_intrinsic_load_global:
+      case nir_intrinsic_load_global_constant:
+      case nir_intrinsic_load_input:
+         return true;
+      default:
+         break;
+      }
+
+      return false;
+   }
+
+   default:
+      /* We can't scalarize this type of instruction */
+      return false;
+   }
+}
+
 /* Walks the instruction list and marks immovable instructions as pinned
  *
  * This function also serves to initialize the instr->pass_flags field.
@@ -133,10 +198,17 @@ gcm_pin_instructions(nir_function_impl *impl, struct gcm_state *state)
             case nir_op_fddy_fine:
             case nir_op_fddx_coarse:
             case nir_op_fddy_coarse:
-               /* These can only go in uniform control flow; pin them for now */
-               instr->pass_flags = GCM_INSTR_PINNED;
+               /* These can only go in uniform control flow */
+               instr->pass_flags = GCM_INSTR_SCHEDULE_EARLIER_ONLY;
                break;
 
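+            /* Only move a mov if we expect later passes to be able to fold
+             * it into its source; otherwise keep it pinned in place.
+             */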
+            case nir_op_mov:
+               if (!is_src_scalarizable(&(nir_instr_as_alu(instr)->src[0].src))) {
+                  instr->pass_flags = GCM_INSTR_PINNED;
+                  break;
+               }
+               /* fallthrough */
+
             default:
                instr->pass_flags = 0;
                break;
@@ -145,7 +217,7 @@ gcm_pin_instructions(nir_function_impl *impl, struct gcm_state *state)
 
          case nir_instr_type_tex:
             if (nir_tex_instr_has_implicit_derivative(nir_instr_as_tex(instr)))
-               instr->pass_flags = GCM_INSTR_PINNED;
+               instr->pass_flags = GCM_INSTR_SCHEDULE_EARLIER_ONLY;
             break;
 
          case nir_instr_type_deref:
@@ -276,8 +348,24 @@ gcm_choose_block_for_instr(nir_instr *instr, nir_block *early_block,
 
    nir_block *best = late_block;
    for (nir_block *block = late_block; block != NULL; block = block->imm_dom) {
+      /* Being too aggressive with how we pull instructions out of loops can
+       * result in extra register pressure and spilling. For example, it's
+       * fairly common for loops in compute shaders to calculate SSBO offsets
+       * using the workgroup id, subgroup id and subgroup invocation; pulling
+       * all of these calculations outside the loop increases register
+       * pressure.
+       *
+       * To work around these issues, for now we only allow constant and
+       * texture instructions to be moved outside their original loops.
+       *
+       * TODO: figure out some heuristics to allow more to be moved out of loops.
+       */
       if (state->blocks[block->index].loop_depth <
-          state->blocks[best->index].loop_depth)
+          state->blocks[best->index].loop_depth &&
+          (nir_block_dominates(instr->block, block) ||
+           instr->type == nir_instr_type_load_const ||
+           instr->type == nir_instr_type_tex))
+         best = block;
+      else if (block == instr->block)
          best = block;
 
       if (block == early_block)
@@ -343,14 +431,20 @@ gcm_schedule_late_def(nir_ssa_def *def, void *void_state)
    nir_block *early_block =
       state->instr_infos[def->parent_instr->index].early_block;
 
-   /* Some instructions may never be used.  We'll just schedule them early and
-    * let dead code clean them up.
+   /* Some instructions may never be used.  Flag them, and the instruction
+    * placement code will get rid of them for us.
     */
    if (lca == NULL) {
-      def->parent_instr->block = early_block;
+      def->parent_instr->block = NULL;
       return true;
    }
 
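+   /* Instructions that may only be scheduled earlier (e.g. derivative
+    * operations) must not move past the block they currently live in, so
+    * clamp the LCA back to that block when it dominates the LCA.
+    */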
+   if (def->parent_instr->pass_flags & GCM_INSTR_SCHEDULE_EARLIER_ONLY &&
+       lca != def->parent_instr->block &&
+       nir_block_dominates(def->parent_instr->block, lca)) {
+      lca = def->parent_instr->block;
+   }
+
    /* We now have the LCA of all of the uses.  If our invariants hold,
     * this is dominated by the block that we chose when scheduling early.
     * We now walk up the dominance tree and pick the lowest block that is
@@ -410,6 +504,23 @@ gcm_place_instr_def(nir_ssa_def *def, void *state)
    return false;
 }
 
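+/* Rewrites any remaining uses of a dead instruction's def to a fresh undef
+ * so the instruction itself can be safely removed.
+ */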
+static bool
+gcm_replace_def_with_undef(nir_ssa_def *def, void *void_state)
+{
+   struct gcm_state *state = void_state;
+
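+   /* Nothing to rewrite if the def is already unused. */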
+   if (list_is_empty(&def->uses) && list_is_empty(&def->if_uses))
+      return true;
+
+   nir_ssa_undef_instr *undef =
+      nir_ssa_undef_instr_create(state->impl->function->shader,
+                                 def->num_components, def->bit_size);
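+   /* Insert the undef at the very start of the function so it dominates
+    * any remaining uses.
+    */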
+   nir_instr_insert(nir_before_cf_list(&state->impl->body), &undef->instr);
+   nir_ssa_def_rewrite_uses(def, nir_src_for_ssa(&undef->def));
+
+   return true;
+}
+
 /** Places an instruction back into the program
  *
  * The earlier passes of GCM simply choose blocks for each instruction and
@@ -433,6 +544,12 @@ gcm_place_instr(nir_instr *instr, struct gcm_state *state)
 
    instr->pass_flags |= GCM_INSTR_PLACED;
 
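+   /* Instructions that never got assigned a block during late scheduling
+    * are dead; replace any remaining uses of their defs with undefs and
+    * remove them.
+    */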
+   if (instr->block == NULL) {
+      nir_foreach_ssa_def(instr, gcm_replace_def_with_undef, state);
+      nir_instr_remove(instr);
+      return;
+   }
+
    /* Phi nodes are our one source of back-edges.  Since right now we are
     * only doing scheduling within blocks, we don't need to worry about
     * them since they are always at the top.  Just skip them completely.