src/compiler/nir/nir_lower_vec_to_movs.c

   1 /*
   2  * Copyright © 2014 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  *
  23  * Authors:
  24  *    Jason Ekstrand (jason@jlekstrand.net)
  25  *
  26  */
  27
  28 #include "nir.h"
  29
  30 /*
  31  * Implements a simple pass that lowers vecN instructions to a series of
  32  * moves with partial writes.
  33  */
  34
/* State handed to the per-block callback while lowering one function. */
struct vec_to_movs_state {
   /* Function implementation currently being processed; the callback uses
    * it to create local registers and to reach the owning shader.
    */
   nir_function_impl *impl;
   /* Set to true once at least one vecN instruction has been lowered. */
   bool progress;
};
  39
  40 static bool
  41 src_matches_dest_reg(nir_dest *dest, nir_src *src)
  42 {
  43    if (dest->is_ssa || src->is_ssa)
  44       return false;
  45
  46    return (dest->reg.reg == src->reg.reg &&
  47            dest->reg.base_offset == src->reg.base_offset &&
  48            !dest->reg.indirect &&
  49            !src->reg.indirect);
  50 }
  51
  52 /**
  53  * For a given starting writemask channel and corresponding source index in
  54  * the vec instruction, insert a MOV to the vec instruction's dest of all the
  55  * writemask channels that get read from the same src reg.
  56  *
  57  * Returns the writemask of our MOV, so the parent loop calling this knows
  58  * which ones have been processed.
  59  */
  60 static unsigned
  61 insert_mov(nir_alu_instr *vec, unsigned start_idx, nir_shader *shader)
  62 {
  63    assert(start_idx < nir_op_infos[vec->op].num_inputs);
  64
  65    nir_alu_instr *mov = nir_alu_instr_create(shader, nir_op_imov);
  66    nir_alu_src_copy(&mov->src[0], &vec->src[start_idx], mov);
  67    nir_alu_dest_copy(&mov->dest, &vec->dest, mov);
  68
  69    mov->dest.write_mask = (1u << start_idx);
  70    mov->src[0].swizzle[start_idx] = vec->src[start_idx].swizzle[0];
  71    mov->src[0].negate = vec->src[start_idx].negate;
  72    mov->src[0].abs = vec->src[start_idx].abs;
  73
  74    for (unsigned i = start_idx + 1; i < 4; i++) {
  75       if (!(vec->dest.write_mask & (1 << i)))
  76          continue;
  77
  78       if (nir_srcs_equal(vec->src[i].src, vec->src[start_idx].src) &&
  79           vec->src[i].negate == vec->src[start_idx].negate &&
  80           vec->src[i].abs == vec->src[start_idx].abs) {
  81          mov->dest.write_mask |= (1 << i);
  82          mov->src[0].swizzle[i] = vec->src[i].swizzle[0];
  83       }
  84    }
  85
  86    /* In some situations (if the vecN is involved in a phi-web), we can end
  87     * up with a mov from a register to itself.  Some of those channels may end
  88     * up doing nothing and there's no reason to have them as part of the mov.
  89     */
  90    if (src_matches_dest_reg(&mov->dest.dest, &mov->src[0].src) &&
  91        !mov->src[0].abs && !mov->src[0].negate) {
  92       for (unsigned i = 0; i < 4; i++) {
  93          if (mov->src[0].swizzle[i] == i) {
  94             mov->dest.write_mask &= ~(1 << i);
  95          }
  96       }
  97    }
  98
  99    /* Only emit the instruction if it actually does something */
 100    if (mov->dest.write_mask) {
 101       nir_instr_insert_before(&vec->instr, &mov->instr);
 102    } else {
 103       ralloc_free(mov);
 104    }
 105
 106    return mov->dest.write_mask;
 107 }
 108
 109 static bool
 110 has_replicated_dest(nir_alu_instr *alu)
 111 {
 112    return alu->op == nir_op_fdot_replicated2 ||
 113           alu->op == nir_op_fdot_replicated3 ||
 114           alu->op == nir_op_fdot_replicated4 ||
 115           alu->op == nir_op_fdph_replicated;
 116 }
 117
 118 /* Attempts to coalesce the "move" from the given source of the vec to the
 119  * destination of the instruction generating the value. If, for whatever
 120  * reason, we cannot coalesce the mmove, it does nothing and returns 0.  We
 121  * can then call insert_mov as normal.
 122  */
 123 static unsigned
 124 try_coalesce(nir_alu_instr *vec, unsigned start_idx, nir_shader *shader)
 125 {
 126    assert(start_idx < nir_op_infos[vec->op].num_inputs);
 127
 128    /* We will only even try if the source is SSA */
 129    if (!vec->src[start_idx].src.is_ssa)
 130       return 0;
 131
 132    assert(vec->src[start_idx].src.ssa);
 133
 134    /* If we are going to do a reswizzle, then the vecN operation must be the
 135     * only use of the source value.  We also can't have any source modifiers.
 136     */
 137    nir_foreach_use(vec->src[start_idx].src.ssa, src) {
 138       if (src->parent_instr != &vec->instr)
 139          return 0;
 140
 141       nir_alu_src *alu_src = exec_node_data(nir_alu_src, src, src);
 142       if (alu_src->abs || alu_src->negate)
 143          return 0;
 144    }
 145
 146    if (!list_empty(&vec->src[start_idx].src.ssa->if_uses))
 147       return 0;
 148
 149    if (vec->src[start_idx].src.ssa->parent_instr->type != nir_instr_type_alu)
 150       return 0;
 151
 152    nir_alu_instr *src_alu =
 153       nir_instr_as_alu(vec->src[start_idx].src.ssa->parent_instr);
 154
 155    if (has_replicated_dest(src_alu)) {
 156       /* The fdot instruction is special: It replicates its result to all
 157        * components.  This means that we can always rewrite its destination
 158        * and we don't need to swizzle anything.
 159        */
 160    } else {
 161       /* We only care about being able to re-swizzle the instruction if it is
 162        * something that we can reswizzle.  It must be per-component.  The one
 163        * exception to this is the fdotN instructions which implicitly splat
 164        * their result out to all channels.
 165        */
 166       if (nir_op_infos[src_alu->op].output_size != 0)
 167          return 0;
 168
 169       /* If we are going to reswizzle the instruction, we can't have any
 170        * non-per-component sources either.
 171        */
 172       for (unsigned j = 0; j < nir_op_infos[src_alu->op].num_inputs; j++)
 173          if (nir_op_infos[src_alu->op].input_sizes[j] != 0)
 174             return 0;
 175    }
 176
 177    /* Stash off all of the ALU instruction's swizzles. */
 178    uint8_t swizzles[4][4];
 179    for (unsigned j = 0; j < nir_op_infos[src_alu->op].num_inputs; j++)
 180       for (unsigned i = 0; i < 4; i++)
 181          swizzles[j][i] = src_alu->src[j].swizzle[i];
 182
 183    unsigned write_mask = 0;
 184    for (unsigned i = start_idx; i < 4; i++) {
 185       if (!(vec->dest.write_mask & (1 << i)))
 186          continue;
 187
 188       if (!vec->src[i].src.is_ssa ||
 189           vec->src[i].src.ssa != &src_alu->dest.dest.ssa)
 190          continue;
 191
 192       /* At this point, the give vec source matchese up with the ALU
 193        * instruction so we can re-swizzle that component to match.
 194        */
 195       write_mask |= 1 << i;
 196       if (has_replicated_dest(src_alu)) {
 197          /* Since the destination is a single replicated value, we don't need
 198           * to do any reswizzling
 199           */
 200       } else {
 201          for (unsigned j = 0; j < nir_op_infos[src_alu->op].num_inputs; j++)
 202             src_alu->src[j].swizzle[i] = swizzles[j][vec->src[i].swizzle[0]];
 203       }
 204
 205       /* Clear the no longer needed vec source */
 206       nir_instr_rewrite_src(&vec->instr, &vec->src[i].src, NIR_SRC_INIT);
 207    }
 208
 209    nir_instr_rewrite_dest(&src_alu->instr, &src_alu->dest.dest, vec->dest.dest);
 210    src_alu->dest.write_mask = write_mask;
 211
 212    return write_mask;
 213 }
 214
 215 static bool
 216 lower_vec_to_movs_block(nir_block *block, void *void_state)
 217 {
 218    struct vec_to_movs_state *state = void_state;
 219    nir_function_impl *impl = state->impl;
 220    nir_shader *shader = impl->function->shader;
 221
 222    nir_foreach_instr_safe(block, instr) {
 223       if (instr->type != nir_instr_type_alu)
 224          continue;
 225
 226       nir_alu_instr *vec = nir_instr_as_alu(instr);
 227
 228       switch (vec->op) {
 229       case nir_op_vec2:
 230       case nir_op_vec3:
 231       case nir_op_vec4:
 232          break;
 233       default:
 234          continue; /* The loop */
 235       }
 236
 237       if (vec->dest.dest.is_ssa) {
 238          /* Since we insert multiple MOVs, we have a register destination. */
 239          nir_register *reg = nir_local_reg_create(impl);
 240          reg->num_components = vec->dest.dest.ssa.num_components;
 241
 242          nir_ssa_def_rewrite_uses(&vec->dest.dest.ssa, nir_src_for_reg(reg));
 243
 244          nir_instr_rewrite_dest(&vec->instr, &vec->dest.dest,
 245                                 nir_dest_for_reg(reg));
 246       }
 247
 248       unsigned finished_write_mask = 0;
 249
 250       /* First, emit a MOV for all the src channels that are in the
 251        * destination reg, in case other values we're populating in the dest
 252        * might overwrite them.
 253        */
 254       for (unsigned i = 0; i < 4; i++) {
 255          if (!(vec->dest.write_mask & (1 << i)))
 256             continue;
 257
 258          if (src_matches_dest_reg(&vec->dest.dest, &vec->src[i].src)) {
 259             finished_write_mask |= insert_mov(vec, i, shader);
 260             break;
 261          }
 262       }
 263
 264       /* Now, emit MOVs for all the other src channels. */
 265       for (unsigned i = 0; i < 4; i++) {
 266          if (!(vec->dest.write_mask & (1 << i)))
 267             continue;
 268
 269          if (!(finished_write_mask & (1 << i)))
 270             finished_write_mask |= try_coalesce(vec, i, shader);
 271
 272          if (!(finished_write_mask & (1 << i)))
 273             finished_write_mask |= insert_mov(vec, i, shader);
 274       }
 275
 276       nir_instr_remove(&vec->instr);
 277       ralloc_free(vec);
 278       state->progress = true;
 279    }
 280
 281    return true;
 282 }
 283
 284 static bool
 285 nir_lower_vec_to_movs_impl(nir_function_impl *impl)
 286 {
 287    struct vec_to_movs_state state = { impl, false };
 288
 289    nir_foreach_block(impl, lower_vec_to_movs_block, &state);
 290
 291    if (state.progress) {
 292       nir_metadata_preserve(impl, nir_metadata_block_index |
 293                                   nir_metadata_dominance);
 294    }
 295
 296    return state.progress;
 297 }
 298
 299 bool
 300 nir_lower_vec_to_movs(nir_shader *shader)
 301 {
 302    bool progress = false;
 303
 304    nir_foreach_function(shader, function) {
 305       if (function->impl)
 306          progress = nir_lower_vec_to_movs_impl(function->impl) || progress;
 307    }
 308
 309    return progress;
 310 }