X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fglsl%2Fnir%2Fnir_lower_vec_to_movs.c;h=2cb0457f9ba6ee8a8be3caeaa56c00c29309c613;hb=aecbc93f2d1ff9de4e03a2b216e86dcb9a4ce414;hp=602853ea665a89817ce4c41ce261f2de5d25cb25;hpb=dd4d9a4e62ecf44011cfa2e020d08029299dd7e0;p=mesa.git

diff --git a/src/glsl/nir/nir_lower_vec_to_movs.c b/src/glsl/nir/nir_lower_vec_to_movs.c
index 602853ea665..2cb0457f9ba 100644
--- a/src/glsl/nir/nir_lower_vec_to_movs.c
+++ b/src/glsl/nir/nir_lower_vec_to_movs.c
@@ -53,29 +53,25 @@ src_matches_dest_reg(nir_dest *dest, nir_src *src)
  * which ones have been processed.
  */
 static unsigned
-insert_mov(nir_alu_instr *vec, unsigned start_channel,
-           unsigned start_src_idx, void *mem_ctx)
+insert_mov(nir_alu_instr *vec, unsigned start_idx, nir_shader *shader)
 {
-   unsigned src_idx = start_src_idx;
-   assert(src_idx < nir_op_infos[vec->op].num_inputs);
+   assert(start_idx < nir_op_infos[vec->op].num_inputs);
 
-   nir_alu_instr *mov = nir_alu_instr_create(mem_ctx, nir_op_imov);
-   nir_alu_src_copy(&mov->src[0], &vec->src[src_idx], mem_ctx);
-   nir_alu_dest_copy(&mov->dest, &vec->dest, mem_ctx);
+   nir_alu_instr *mov = nir_alu_instr_create(shader, nir_op_imov);
+   nir_alu_src_copy(&mov->src[0], &vec->src[start_idx], mov);
+   nir_alu_dest_copy(&mov->dest, &vec->dest, mov);
 
-   mov->dest.write_mask = (1u << start_channel);
-   mov->src[0].swizzle[start_channel] = vec->src[src_idx].swizzle[0];
-   src_idx++;
+   mov->dest.write_mask = (1u << start_idx);
+   mov->src[0].swizzle[start_idx] = vec->src[start_idx].swizzle[0];
 
-   for (unsigned i = start_channel + 1; i < 4; i++) {
+   for (unsigned i = start_idx + 1; i < 4; i++) {
       if (!(vec->dest.write_mask & (1 << i)))
          continue;
 
-      if (nir_srcs_equal(vec->src[src_idx].src, vec->src[start_src_idx].src)) {
+      if (nir_srcs_equal(vec->src[i].src, vec->src[start_idx].src)) {
          mov->dest.write_mask |= (1 << i);
-         mov->src[0].swizzle[i] = vec->src[src_idx].swizzle[0];
+         mov->src[0].swizzle[i] = vec->src[i].swizzle[0];
       }
-      src_idx++;
    }
 
    nir_instr_insert_before(&vec->instr, &mov->instr);
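The rewritten insert_mov takes a single start_idx because, for the vecN opcodes, each source feeds exactly one destination channel, so the channel index and the source index always coincide; the helper then folds every later channel that reads the same value into one MOV by widening the write mask. Below is a minimal standalone sketch of that grouping idea; the names and the plain-array representation are illustrative only (not the NIR API), and it uses GCC/Clang's __builtin_ctz:

#include <stdio.h>

int main(void)
{
   /* One source id per destination channel, e.g. vec4(a.x, a.y, b.x, a.w). */
   const int src_of_chan[4] = { 0, 0, 1, 0 };
   unsigned remaining = 0xf; /* write mask of channels still to emit */

   while (remaining) {
      const int first = __builtin_ctz(remaining); /* lowest pending channel */
      unsigned mask = 0;

      /* Fold every later channel reading the same source into this MOV. */
      for (int i = first; i < 4; i++) {
         if ((remaining & (1u << i)) && src_of_chan[i] == src_of_chan[first])
            mask |= 1u << i;
      }

      printf("mov src%d -> dest.%c%c%c%c\n", src_of_chan[first],
             (mask & 1) ? 'x' : '_', (mask & 2) ? 'y' : '_',
             (mask & 4) ? 'z' : '_', (mask & 8) ? 'w' : '_');

      remaining &= ~mask; /* those channels are done */
   }
   return 0;
}

For the example inputs this emits two MOVs, "src0 -> dest.xy_w" and "src1 -> dest.__z_", rather than one MOV per written channel.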
@@ -84,13 +80,121 @@
 }
 
 static bool
-lower_vec_to_movs_block(nir_block *block, void *mem_ctx)
+has_replicated_dest(nir_alu_instr *alu)
 {
+   return alu->op == nir_op_fdot_replicated2 ||
+          alu->op == nir_op_fdot_replicated3 ||
+          alu->op == nir_op_fdot_replicated4;
+}
+
+/* Attempts to coalesce the "move" from the given source of the vec to the
+ * destination of the instruction generating the value.  If, for whatever
+ * reason, we cannot coalesce the mov, it does nothing and returns 0.  We
+ * can then call insert_mov as normal.
+ */
+static unsigned
+try_coalesce(nir_alu_instr *vec, unsigned start_idx, nir_shader *shader)
+{
+   assert(start_idx < nir_op_infos[vec->op].num_inputs);
+
+   /* We only even try if the source is SSA. */
+   if (!vec->src[start_idx].src.is_ssa)
+      return 0;
+
+   assert(vec->src[start_idx].src.ssa);
+
+   /* If we are going to do a reswizzle, then the vecN operation must be the
+    * only use of the source value.  We also can't have any source modifiers.
+    */
+   nir_foreach_use(vec->src[start_idx].src.ssa, src) {
+      if (src->parent_instr != &vec->instr)
+         return 0;
+
+      nir_alu_src *alu_src = exec_node_data(nir_alu_src, src, src);
+      if (alu_src->abs || alu_src->negate)
+         return 0;
+   }
+
+   if (!list_empty(&vec->src[start_idx].src.ssa->if_uses))
+      return 0;
+
+   if (vec->src[start_idx].src.ssa->parent_instr->type != nir_instr_type_alu)
+      return 0;
+
+   nir_alu_instr *src_alu =
+      nir_instr_as_alu(vec->src[start_idx].src.ssa->parent_instr);
+
+   if (has_replicated_dest(src_alu)) {
+      /* The fdot instruction is special: it replicates its result to all
+       * components.  This means that we can always rewrite its destination
+       * and we don't need to swizzle anything.
+       */
+   } else {
+      /* We can only reswizzle an instruction that acts per-component; the
+       * fdotN instructions, which implicitly splat their result out to all
+       * channels, are handled above.
+       */
+      if (nir_op_infos[src_alu->op].output_size != 0)
+         return 0;
+
+      /* If we are going to reswizzle the instruction, we can't have any
+       * non-per-component sources either.
+       */
+      for (unsigned j = 0; j < nir_op_infos[src_alu->op].num_inputs; j++)
+         if (nir_op_infos[src_alu->op].input_sizes[j] != 0)
+            return 0;
+   }
+
+   /* Stash off all of the ALU instruction's swizzles. */
+   uint8_t swizzles[4][4];
+   for (unsigned j = 0; j < nir_op_infos[src_alu->op].num_inputs; j++)
+      for (unsigned i = 0; i < 4; i++)
+         swizzles[j][i] = src_alu->src[j].swizzle[i];
+
+   unsigned write_mask = 0;
+   for (unsigned i = start_idx; i < 4; i++) {
+      if (!(vec->dest.write_mask & (1 << i)))
+         continue;
+
+      if (!vec->src[i].src.is_ssa ||
+          vec->src[i].src.ssa != &src_alu->dest.dest.ssa)
+         continue;
+
+      /* At this point, the given vec source matches up with the ALU
+       * instruction, so we can reswizzle that component to match.
+       */
+      write_mask |= 1 << i;
+      if (has_replicated_dest(src_alu)) {
+         /* Since the destination is a single replicated value, we don't
+          * need to do any reswizzling.
+          */
+      } else {
+         for (unsigned j = 0; j < nir_op_infos[src_alu->op].num_inputs; j++)
+            src_alu->src[j].swizzle[i] = swizzles[j][vec->src[i].swizzle[0]];
+      }
+
+      /* Clear the vec source now that it is no longer needed. */
+      nir_instr_rewrite_src(&vec->instr, &vec->src[i].src, NIR_SRC_INIT);
+   }
+
+   nir_instr_rewrite_dest(&src_alu->instr, &src_alu->dest.dest, vec->dest.dest);
+   src_alu->dest.write_mask = write_mask;
+
+   return write_mask;
+}
+
+static bool
+lower_vec_to_movs_block(nir_block *block, void *void_impl)
+{
+   nir_function_impl *impl = void_impl;
+   nir_shader *shader = impl->overload->function->shader;
+
    nir_foreach_instr_safe(block, instr) {
       if (instr->type != nir_instr_type_alu)
          continue;
 
-      nir_alu_instr *vec = (nir_alu_instr *)instr;
+      nir_alu_instr *vec = nir_instr_as_alu(instr);
 
       switch (vec->op) {
       case nir_op_vec2:
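The new try_coalesce is the heart of this patch: when the vec is the only user of an SSA value produced by a per-component ALU instruction, the MOV can be skipped entirely by rewriting that instruction to write the vec's destination directly. The subtle step is the swizzle composition near the end: if the producer now writes channel i, each of its sources must select, in lane i, whatever component it used to select in the lane the vec's swizzle was reading. A self-contained sketch of just that composition, with plain arrays standing in for nir_alu_src swizzles (illustrative, not NIR code):

#include <stdio.h>

int main(void)
{
   /* The producer's source swizzle before the rewrite, e.g. reading .yxwz. */
   unsigned char swz[4] = { 1, 0, 3, 2 };

   /* Stash the old swizzle first, as try_coalesce does, because the rewrite
    * overwrites entries it may still need to read. */
   unsigned char saved[4];
   for (int i = 0; i < 4; i++)
      saved[i] = swz[i];

   /* Suppose the vec's channel 2 selected component 0 of the producer: the
    * producer must now read, in lane 2, what it used to read in lane 0. */
   unsigned vec_chan = 2, vec_swz = 0;
   swz[vec_chan] = saved[vec_swz];

   printf("lane %u now reads component %u\n", vec_chan, swz[vec_chan]);
   return 0;
}

This mirrors the line src_alu->src[j].swizzle[i] = swizzles[j][vec->src[i].swizzle[0]] above, and the stash explains why the swizzles[4][4] copy is taken before any lane is rewritten.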
@@ -101,8 +205,16 @@ lower_vec_to_movs_block(nir_block *block, void *mem_ctx)
          continue; /* The loop */
       }
 
-      /* Since we insert multiple MOVs, we have to be non-SSA. */
-      assert(!vec->dest.dest.is_ssa);
+      if (vec->dest.dest.is_ssa) {
+         /* Since we insert multiple MOVs, we need a register destination. */
+         nir_register *reg = nir_local_reg_create(impl);
+         reg->num_components = vec->dest.dest.ssa.num_components;
+
+         nir_ssa_def_rewrite_uses(&vec->dest.dest.ssa, nir_src_for_reg(reg));
+
+         nir_instr_rewrite_dest(&vec->instr, &vec->dest.dest,
+                                nir_dest_for_reg(reg));
+      }
 
       unsigned finished_write_mask = 0;
 
@@ -110,26 +222,26 @@ lower_vec_to_movs_block(nir_block *block, void *mem_ctx)
        * destination reg, in case other values we're populating in the dest
        * might overwrite them.
        */
-      for (unsigned i = 0, src_idx = 0; i < 4; i++) {
+      for (unsigned i = 0; i < 4; i++) {
         if (!(vec->dest.write_mask & (1 << i)))
            continue;
 
-         if (src_matches_dest_reg(&vec->dest.dest, &vec->src[src_idx].src)) {
-            finished_write_mask |= insert_mov(vec, i, src_idx, mem_ctx);
+         if (src_matches_dest_reg(&vec->dest.dest, &vec->src[i].src)) {
+            finished_write_mask |= insert_mov(vec, i, shader);
             break;
          }
-         src_idx++;
       }
 
       /* Now, emit MOVs for all the other src channels. */
-      for (unsigned i = 0, src_idx = 0; i < 4; i++) {
+      for (unsigned i = 0; i < 4; i++) {
         if (!(vec->dest.write_mask & (1 << i)))
            continue;
 
         if (!(finished_write_mask & (1 << i)))
-            finished_write_mask |= insert_mov(vec, i, src_idx, mem_ctx);
+            finished_write_mask |= try_coalesce(vec, i, shader);
 
-         src_idx++;
+         if (!(finished_write_mask & (1 << i)))
+            finished_write_mask |= insert_mov(vec, i, shader);
       }
 
       nir_instr_remove(&vec->instr);
@@ -142,7 +254,7 @@
 static void
 nir_lower_vec_to_movs_impl(nir_function_impl *impl)
 {
-   nir_foreach_block(impl, lower_vec_to_movs_block, ralloc_parent(impl));
+   nir_foreach_block(impl, lower_vec_to_movs_block, impl);
 }
 
 void
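One subtlety in lower_vec_to_movs_block deserves a note: once the vec's SSA destination has been turned into a register, a source channel may read the very register being written, so the first loop emits the one MOV whose source aliases the destination before the second loop fills in the remaining channels. A small sketch of why that order matters, with plain floats standing in for register channels (illustrative only):

#include <stdio.h>

int main(void)
{
   /* Lowering r0.xy = vec2(r1.x, r0.x): channel y reads the destination
    * register, so its MOV must come first. */
   float r0_x = 5.0f, r0_y = 0.0f, r1_x = 9.0f;

   r0_y = r0_x; /* mov r0.y, r0.x -- aliasing source, emitted first */
   r0_x = r1_x; /* mov r0.x, r1.x -- safe now; nothing still reads r0.x */

   printf("r0 = (%g, %g)\n", r0_x, r0_y); /* (9, 5), as the vec2 intended */
   return 0;
}

Had the MOVs been emitted in plain channel order, r0.x would have been overwritten with 9 before r0.y read it, and both channels would have ended up 9.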