src/compiler/nir/nir_linking_helpers.c

   1 /*
   2  * Copyright © 2015 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include "nir.h"
  25 #include "util/set.h"
  26 #include "util/hash_table.h"
  27
/* This file contains various little helpers for doing simple linking in
 * NIR.  Eventually, we'll probably want a full-blown varying packing
 * implementation in here.  Right now, it deletes unused varyings and
 * compacts scalar varying components into partially used slots.
 */
  32
  33 /**
  34  * Returns the bits in the inputs_read, outputs_written, or
  35  * system_values_read bitfield corresponding to this variable.
  36  */
  37 static uint64_t
  38 get_variable_io_mask(nir_variable *var, gl_shader_stage stage)
  39 {
  40    if (var->data.location < 0)
  41       return 0;
  42
  43    unsigned location = var->data.patch ?
  44       var->data.location - VARYING_SLOT_PATCH0 : var->data.location;
  45
  46    assert(var->data.mode == nir_var_shader_in ||
  47           var->data.mode == nir_var_shader_out ||
  48           var->data.mode == nir_var_system_value);
  49    assert(var->data.location >= 0);
  50
  51    const struct glsl_type *type = var->type;
  52    if (nir_is_per_vertex_io(var, stage)) {
  53       assert(glsl_type_is_array(type));
  54       type = glsl_get_array_element(type);
  55    }
  56
  57    unsigned slots = glsl_count_attribute_slots(type, false);
  58    return ((1ull << slots) - 1) << location;
  59 }
  60
  61 static void
  62 tcs_add_output_reads(nir_shader *shader, uint64_t *read, uint64_t *patches_read)
  63 {
  64    nir_foreach_function(function, shader) {
  65       if (!function->impl)
  66          continue;
  67
  68       nir_foreach_block(block, function->impl) {
  69          nir_foreach_instr(instr, block) {
  70             if (instr->type != nir_instr_type_intrinsic)
  71                continue;
  72
  73             nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
  74             nir_variable *var;
  75             if (intrin->intrinsic == nir_intrinsic_load_var) {
  76                var = intrin->variables[0]->var;
  77             } else if (intrin->intrinsic == nir_intrinsic_load_deref) {
  78                var = nir_deref_instr_get_variable(nir_src_as_deref(intrin->src[0]));
  79             } else {
  80                continue;
  81             }
  82
  83             if (var->data.mode != nir_var_shader_out)
  84                continue;
  85
  86             if (var->data.patch) {
  87                patches_read[var->data.location_frac] |=
  88                   get_variable_io_mask(var, shader->info.stage);
  89             } else {
  90                read[var->data.location_frac] |=
  91                   get_variable_io_mask(var, shader->info.stage);
  92             }
  93          }
  94       }
  95    }
  96 }
  97
  98 static bool
  99 remove_unused_io_vars(nir_shader *shader, struct exec_list *var_list,
 100                       uint64_t *used_by_other_stage,
 101                       uint64_t *used_by_other_stage_patches)
 102 {
 103    bool progress = false;
 104    uint64_t *used;
 105
 106    nir_foreach_variable_safe(var, var_list) {
 107       if (var->data.patch)
 108          used = used_by_other_stage_patches;
 109       else
 110          used = used_by_other_stage;
 111
 112       if (var->data.location < VARYING_SLOT_VAR0 && var->data.location >= 0)
 113          continue;
 114
 115       if (var->data.always_active_io)
 116          continue;
 117
 118       uint64_t other_stage = used[var->data.location_frac];
 119
 120       if (!(other_stage & get_variable_io_mask(var, shader->info.stage))) {
 121          /* This one is invalid, make it a global variable instead */
 122          var->data.location = 0;
 123          var->data.mode = nir_var_global;
 124
 125          exec_node_remove(&var->node);
 126          exec_list_push_tail(&shader->globals, &var->node);
 127
 128          progress = true;
 129       }
 130    }
 131
 132    return progress;
 133 }
 134
 135 bool
 136 nir_remove_unused_varyings(nir_shader *producer, nir_shader *consumer)
 137 {
 138    assert(producer->info.stage != MESA_SHADER_FRAGMENT);
 139    assert(consumer->info.stage != MESA_SHADER_VERTEX);
 140
 141    uint64_t read[4] = { 0 }, written[4] = { 0 };
 142    uint64_t patches_read[4] = { 0 }, patches_written[4] = { 0 };
 143
 144    nir_foreach_variable(var, &producer->outputs) {
 145       if (var->data.patch) {
 146          patches_written[var->data.location_frac] |=
 147             get_variable_io_mask(var, producer->info.stage);
 148       } else {
 149          written[var->data.location_frac] |=
 150             get_variable_io_mask(var, producer->info.stage);
 151       }
 152    }
 153
 154    nir_foreach_variable(var, &consumer->inputs) {
 155       if (var->data.patch) {
 156          patches_read[var->data.location_frac] |=
 157             get_variable_io_mask(var, consumer->info.stage);
 158       } else {
 159          read[var->data.location_frac] |=
 160             get_variable_io_mask(var, consumer->info.stage);
 161       }
 162    }
 163
 164    /* Each TCS invocation can read data written by other TCS invocations,
 165     * so even if the outputs are not used by the TES we must also make
 166     * sure they are not read by the TCS before demoting them to globals.
 167     */
 168    if (producer->info.stage == MESA_SHADER_TESS_CTRL)
 169       tcs_add_output_reads(producer, read, patches_read);
 170
 171    bool progress = false;
 172    progress = remove_unused_io_vars(producer, &producer->outputs, read,
 173                                     patches_read);
 174
 175    progress = remove_unused_io_vars(consumer, &consumer->inputs, written,
 176                                     patches_written) || progress;
 177
 178    return progress;
 179 }
 180
 181 static uint8_t
 182 get_interp_type(nir_variable *var, bool default_to_smooth_interp)
 183 {
 184    if (var->data.interpolation != INTERP_MODE_NONE)
 185       return var->data.interpolation;
 186    else if (default_to_smooth_interp)
 187       return INTERP_MODE_SMOOTH;
 188    else
 189       return INTERP_MODE_NONE;
 190 }
 191
 192 #define INTERPOLATE_LOC_SAMPLE 0
 193 #define INTERPOLATE_LOC_CENTROID 1
 194 #define INTERPOLATE_LOC_CENTER 2
 195
 196 static uint8_t
 197 get_interp_loc(nir_variable *var)
 198 {
 199    if (var->data.sample)
 200       return INTERPOLATE_LOC_SAMPLE;
 201    else if (var->data.centroid)
 202       return INTERPOLATE_LOC_CENTROID;
 203    else
 204       return INTERPOLATE_LOC_CENTER;
 205 }
 206
 207 static void
 208 get_slot_component_masks_and_interp_types(struct exec_list *var_list,
 209                                           uint8_t *comps,
 210                                           uint8_t *interp_type,
 211                                           uint8_t *interp_loc,
 212                                           gl_shader_stage stage,
 213                                           bool default_to_smooth_interp)
 214 {
 215    nir_foreach_variable_safe(var, var_list) {
 216       assert(var->data.location >= 0);
 217
 218       /* Only remap things that aren't built-ins.
 219        * TODO: add TES patch support.
 220        */
 221       if (var->data.location >= VARYING_SLOT_VAR0 &&
 222           var->data.location - VARYING_SLOT_VAR0 < 32) {
 223
 224          const struct glsl_type *type = var->type;
 225          if (nir_is_per_vertex_io(var, stage)) {
 226             assert(glsl_type_is_array(type));
 227             type = glsl_get_array_element(type);
 228          }
 229
 230          unsigned location = var->data.location - VARYING_SLOT_VAR0;
 231          unsigned elements =
 232             glsl_get_vector_elements(glsl_without_array(type));
 233
 234          bool dual_slot = glsl_type_is_dual_slot(glsl_without_array(type));
 235          unsigned slots = glsl_count_attribute_slots(type, false);
 236          unsigned comps_slot2 = 0;
 237          for (unsigned i = 0; i < slots; i++) {
 238             interp_type[location + i] =
 239                get_interp_type(var, default_to_smooth_interp);
 240             interp_loc[location + i] = get_interp_loc(var);
 241
 242             if (dual_slot) {
 243                if (i & 1) {
 244                   comps[location + i] |= ((1 << comps_slot2) - 1);
 245                } else {
 246                   unsigned num_comps = 4 - var->data.location_frac;
 247                   comps_slot2 = (elements * 2) - num_comps;
 248
 249                   /* Assume ARB_enhanced_layouts packing rules for doubles */
 250                   assert(var->data.location_frac == 0 ||
 251                          var->data.location_frac == 2);
 252                   assert(comps_slot2 <= 4);
 253
 254                   comps[location + i] |=
 255                      ((1 << num_comps) - 1) << var->data.location_frac;
 256                }
 257             } else {
 258                comps[location + i] |=
 259                   ((1 << elements) - 1) << var->data.location_frac;
 260             }
 261          }
 262       }
 263    }
 264 }
 265
 266 struct varying_loc
 267 {
 268    uint8_t component;
 269    uint32_t location;
 270 };
 271
 272 static void
 273 remap_slots_and_components(struct exec_list *var_list, gl_shader_stage stage,
 274                            struct varying_loc (*remap)[4],
 275                            uint64_t *slots_used, uint64_t *out_slots_read)
 276  {
 277    uint64_t out_slots_read_tmp = 0;
 278
 279    /* We don't touch builtins so just copy the bitmask */
 280    uint64_t slots_used_tmp =
 281       *slots_used & (((uint64_t)1 << (VARYING_SLOT_VAR0 - 1)) - 1);
 282
 283    nir_foreach_variable(var, var_list) {
 284       assert(var->data.location >= 0);
 285
 286       /* Only remap things that aren't built-ins */
 287       if (var->data.location >= VARYING_SLOT_VAR0 &&
 288           var->data.location - VARYING_SLOT_VAR0 < 32) {
 289          assert(var->data.location - VARYING_SLOT_VAR0 < 32);
 290
 291          const struct glsl_type *type = var->type;
 292          if (nir_is_per_vertex_io(var, stage)) {
 293             assert(glsl_type_is_array(type));
 294             type = glsl_get_array_element(type);
 295          }
 296
 297          unsigned num_slots = glsl_count_attribute_slots(type, false);
 298          bool used_across_stages = false;
 299          bool outputs_read = false;
 300
 301          unsigned location = var->data.location - VARYING_SLOT_VAR0;
 302          struct varying_loc *new_loc = &remap[location][var->data.location_frac];
 303
 304          uint64_t slots = (((uint64_t)1 << num_slots) - 1) << var->data.location;
 305          if (slots & *slots_used)
 306             used_across_stages = true;
 307
 308          if (slots & *out_slots_read)
 309             outputs_read = true;
 310
 311          if (new_loc->location) {
 312             var->data.location = new_loc->location;
 313             var->data.location_frac = new_loc->component;
 314          }
 315
 316          if (var->data.always_active_io) {
 317             /* We can't apply link time optimisations (specifically array
 318              * splitting) to these so we need to copy the existing mask
 319              * otherwise we will mess up the mask for things like partially
 320              * marked arrays.
 321              */
 322             if (used_across_stages) {
 323                slots_used_tmp |=
 324                   *slots_used & (((uint64_t)1 << num_slots) - 1) << var->data.location;
 325             }
 326
 327             if (outputs_read) {
 328                out_slots_read_tmp |=
 329                   *out_slots_read & (((uint64_t)1 << num_slots) - 1) << var->data.location;
 330             }
 331
 332          } else {
 333             for (unsigned i = 0; i < num_slots; i++) {
 334                if (used_across_stages)
 335                   slots_used_tmp |= (uint64_t)1 << (var->data.location + i);
 336
 337                if (outputs_read)
 338                   out_slots_read_tmp |= (uint64_t)1 << (var->data.location + i);
 339             }
 340          }
 341       }
 342    }
 343
 344    *slots_used = slots_used_tmp;
 345    *out_slots_read = out_slots_read_tmp;
 346 }
 347
 348 /* If there are empty components in the slot compact the remaining components
 349  * as close to component 0 as possible. This will make it easier to fill the
 350  * empty components with components from a different slot in a following pass.
 351  */
 352 static void
 353 compact_components(nir_shader *producer, nir_shader *consumer, uint8_t *comps,
 354                    uint8_t *interp_type, uint8_t *interp_loc,
 355                    bool default_to_smooth_interp)
 356 {
 357    struct exec_list *input_list = &consumer->inputs;
 358    struct exec_list *output_list = &producer->outputs;
 359    struct varying_loc remap[32][4] = {{{0}, {0}}};
 360
 361    /* Create a cursor for each interpolation type */
 362    unsigned cursor[4] = {0};
 363
 364    /* We only need to pass over one stage and we choose the consumer as it seems
 365     * to cause a larger reduction in instruction counts (tested on i965).
 366     */
 367    nir_foreach_variable(var, input_list) {
 368
 369       /* Only remap things that aren't builtins.
 370        * TODO: add TES patch support.
 371        */
 372       if (var->data.location >= VARYING_SLOT_VAR0 &&
 373           var->data.location - VARYING_SLOT_VAR0 < 32) {
 374
 375          /* We can't repack xfb varyings. */
 376          if (var->data.always_active_io)
 377             continue;
 378
 379          const struct glsl_type *type = var->type;
 380          if (nir_is_per_vertex_io(var, consumer->info.stage)) {
 381             assert(glsl_type_is_array(type));
 382             type = glsl_get_array_element(type);
 383          }
 384
 385          /* Skip types that require more complex packing handling.
 386           * TODO: add support for these types.
 387           */
 388          if (glsl_type_is_array(type) ||
 389              glsl_type_is_dual_slot(type) ||
 390              glsl_type_is_matrix(type) ||
 391              glsl_type_is_struct(type) ||
 392              glsl_type_is_64bit(type))
 393             continue;
 394
 395          /* We ignore complex types above and all other vector types should
 396           * have been split into scalar variables by the lower_io_to_scalar
 397           * pass. The only exeption should by OpenGL xfb varyings.
 398           */
 399          if (glsl_get_vector_elements(type) != 1)
 400             continue;
 401
 402          unsigned location = var->data.location - VARYING_SLOT_VAR0;
 403          uint8_t used_comps = comps[location];
 404
 405          /* If there are no empty components there is nothing more for us to do.
 406           */
 407          if (used_comps == 0xf)
 408             continue;
 409
 410          bool found_new_offset = false;
 411          uint8_t interp = get_interp_type(var, default_to_smooth_interp);
 412          for (; cursor[interp] < 32; cursor[interp]++) {
 413             uint8_t cursor_used_comps = comps[cursor[interp]];
 414
 415             /* We couldn't find anywhere to pack the varying continue on. */
 416             if (cursor[interp] == location &&
 417                 (var->data.location_frac == 0 ||
 418                  cursor_used_comps & ((1 << (var->data.location_frac)) - 1)))
 419                break;
 420
 421             /* We can only pack varyings with matching interpolation types */
 422             if (interp_type[cursor[interp]] != interp)
 423                continue;
 424
 425             /* Interpolation loc must match also.
 426              * TODO: i965 can handle these if they don't match, but the
 427              * radeonsi nir backend handles everything as vec4s and so expects
 428              * this to be the same for all components. We could make this
 429              * check driver specfific or drop it if NIR ever become the only
 430              * radeonsi backend.
 431              */
 432             if (interp_loc[cursor[interp]] != get_interp_loc(var))
 433                continue;
 434
 435             /* If the slot is empty just skip it for now, compact_var_list()
 436              * can be called after this function to remove empty slots for us.
 437              * TODO: finish implementing compact_var_list() requires array and
 438              * matrix splitting.
 439              */
 440             if (!cursor_used_comps)
 441                continue;
 442
 443             uint8_t unused_comps = ~cursor_used_comps;
 444
 445             for (unsigned i = 0; i < 4; i++) {
 446                uint8_t new_var_comps = 1 << i;
 447                if (unused_comps & new_var_comps) {
 448                   remap[location][var->data.location_frac].component = i;
 449                   remap[location][var->data.location_frac].location =
 450                      cursor[interp] + VARYING_SLOT_VAR0;
 451
 452                   found_new_offset = true;
 453
 454                   /* Turn off the mask for the component we are remapping */
 455                   if (comps[location] & 1 << var->data.location_frac) {
 456                      comps[location] ^= 1 << var->data.location_frac;
 457                      comps[cursor[interp]] |= new_var_comps;
 458                   }
 459                   break;
 460                }
 461             }
 462
 463             if (found_new_offset)
 464                break;
 465          }
 466       }
 467    }
 468
 469    uint64_t zero = 0;
 470    remap_slots_and_components(input_list, consumer->info.stage, remap,
 471                               &consumer->info.inputs_read, &zero);
 472    remap_slots_and_components(output_list, producer->info.stage, remap,
 473                               &producer->info.outputs_written,
 474                               &producer->info.outputs_read);
 475 }
 476
 477 /* We assume that this has been called more-or-less directly after
 478  * remove_unused_varyings.  At this point, all of the varyings that we
 479  * aren't going to be using have been completely removed and the
 480  * inputs_read and outputs_written fields in nir_shader_info reflect
 481  * this.  Therefore, the total set of valid slots is the OR of the two
 482  * sets of varyings;  this accounts for varyings which one side may need
 483  * to read/write even if the other doesn't.  This can happen if, for
 484  * instance, an array is used indirectly from one side causing it to be
 485  * unsplittable but directly from the other.
 486  */
 487 void
 488 nir_compact_varyings(nir_shader *producer, nir_shader *consumer,
 489                      bool default_to_smooth_interp)
 490 {
 491    assert(producer->info.stage != MESA_SHADER_FRAGMENT);
 492    assert(consumer->info.stage != MESA_SHADER_VERTEX);
 493
 494    uint8_t comps[32] = {0};
 495    uint8_t interp_type[32] = {0};
 496    uint8_t interp_loc[32] = {0};
 497
 498    get_slot_component_masks_and_interp_types(&producer->outputs, comps,
 499                                              interp_type, interp_loc,
 500                                              producer->info.stage,
 501                                              default_to_smooth_interp);
 502    get_slot_component_masks_and_interp_types(&consumer->inputs, comps,
 503                                              interp_type, interp_loc,
 504                                              consumer->info.stage,
 505                                              default_to_smooth_interp);
 506
 507    compact_components(producer, consumer, comps, interp_type, interp_loc,
 508                       default_to_smooth_interp);
 509 }