src/compiler/nir/nir_lower_amul.c
/*
 * Copyright © 2019 Google, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "nir.h"
#include "nir_vla.h"

/* Lowering for amul instructions, for drivers that support imul24.
 * This pass will analyze indirect derefs, and convert the
 * corresponding amul instructions to either imul or imul24,
 * depending on the required range:
 *
 * 1) Analyze the uniform variables and build a table of UBOs and SSBOs
 *    that are either too large, or might be too large (unknown size),
 *    for imul24
 *
 * 2) Loop through all the intrinsics, finding dereferences of large
 *    variables, and recursively replacing the amul instructions
 *    involved with imul
 *
 * 3) Finally loop again through all instructions, replacing any
 *    remaining amul with imul24.  At this point any remaining amul
 *    instructions are not involved in calculating an offset into a
 *    large variable, thanks to the 2nd step, so they can safely be
 *    replaced with imul24.
 *
 * Using two passes over all the instructions lets us handle the case
 * where, due to CSE, a single amul is used to calculate an offset
 * into both a large and a small variable.
 */

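/* As a rough illustration (the SSA names here are hypothetical), an
 * indirect UBO load whose offset is computed with amul:
 *
 *    vec1 32 ssa_3 = amul ssa_1, ssa_2
 *    vec1 32 ssa_4 = intrinsic load_ubo (ssa_0, ssa_3) (...)
 *
 * has its amul lowered to imul if ssa_0 indexes a UBO that is (or
 * might be) too large for imul24; otherwise the final pass lowers
 * it to imul24.
 */
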
typedef struct {
   int (*type_size)(const struct glsl_type *, bool);

   /* Tables of UBOs and SSBOs mapping driver_location/base to
    * whether they are too large to use imul24:
    */
   bool *large_ubos;
   bool *large_ssbos;

   /* For cases where we cannot determine the UBO/SSBO index, track
    * whether *any* UBO/SSBO is too large for imul24:
    */
   bool has_large_ubo;
   bool has_large_ssbo;
} lower_state;

/* Lower 'amul's in the offset src of large variables to 'imul': */
static bool
lower_large_src(nir_src *src, void *s)
{
   lower_state *state = s;

   assert(src->is_ssa);

   nir_instr *parent = src->ssa->parent_instr;

   /* No need to visit instructions we've already visited; this also
    * avoids infinite recursion when phis are involved:
    */
   if (parent->pass_flags)
      return false;

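   /* Recurse up the use-def chain first, so that every instruction
    * feeding into this source is also visited:
    */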
   bool progress = nir_foreach_src(parent, lower_large_src, state);

   if (parent->type == nir_instr_type_alu) {
      nir_alu_instr *alu = nir_instr_as_alu(parent);
      if (alu->op == nir_op_amul) {
         alu->op = nir_op_imul;
         progress = true;
      }
   }

   parent->pass_flags = 1;

   return progress;
}

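/* Return whether a UBO/SSBO index refers to a large buffer.  If the
 * index is not a compile-time constant, be conservative and report
 * whether *any* buffer of that type is large:
 */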
static bool
large_ubo(lower_state *state, nir_src src)
{
   if (!nir_src_is_const(src))
      return state->has_large_ubo;
   return state->large_ubos[nir_src_as_uint(src)];
}

static bool
large_ssbo(lower_state *state, nir_src src)
{
   if (!nir_src_is_const(src))
      return state->has_large_ssbo;
   return state->large_ssbos[nir_src_as_uint(src)];
}

static bool
lower_intrinsic(lower_state *state, nir_intrinsic_instr *intr)
{
   switch (intr->intrinsic) {
   case nir_intrinsic_load_ubo:
      /* src[] = { buffer_index, offset }. */
      if (large_ubo(state, intr->src[0]))
         return lower_large_src(&intr->src[1], state);
      return false;

   case nir_intrinsic_load_ssbo:
      /* src[] = { buffer_index, offset }. */
      if (large_ssbo(state, intr->src[0]))
         return lower_large_src(&intr->src[1], state);
      return false;

   case nir_intrinsic_store_ssbo:
      /* src[] = { value, block_index, offset } */
      if (large_ssbo(state, intr->src[1]))
         return lower_large_src(&intr->src[2], state);
      return false;

   case nir_intrinsic_ssbo_atomic_add:
   case nir_intrinsic_ssbo_atomic_imin:
   case nir_intrinsic_ssbo_atomic_umin:
   case nir_intrinsic_ssbo_atomic_imax:
   case nir_intrinsic_ssbo_atomic_umax:
   case nir_intrinsic_ssbo_atomic_and:
   case nir_intrinsic_ssbo_atomic_or:
   case nir_intrinsic_ssbo_atomic_xor:
   case nir_intrinsic_ssbo_atomic_exchange:
   case nir_intrinsic_ssbo_atomic_comp_swap:
   case nir_intrinsic_ssbo_atomic_fadd:
   case nir_intrinsic_ssbo_atomic_fmin:
   case nir_intrinsic_ssbo_atomic_fmax:
   case nir_intrinsic_ssbo_atomic_fcomp_swap:
      /* 0: SSBO index
       * 1: offset
       */
      if (large_ssbo(state, intr->src[0]))
         return lower_large_src(&intr->src[1], state);
      return false;

   case nir_intrinsic_global_atomic_add:
   case nir_intrinsic_global_atomic_imin:
   case nir_intrinsic_global_atomic_umin:
   case nir_intrinsic_global_atomic_imax:
   case nir_intrinsic_global_atomic_umax:
   case nir_intrinsic_global_atomic_and:
   case nir_intrinsic_global_atomic_or:
   case nir_intrinsic_global_atomic_xor:
   case nir_intrinsic_global_atomic_exchange:
   case nir_intrinsic_global_atomic_comp_swap:
   case nir_intrinsic_global_atomic_fadd:
   case nir_intrinsic_global_atomic_fmin:
   case nir_intrinsic_global_atomic_fmax:
   case nir_intrinsic_global_atomic_fcomp_swap:
      /* Just assume that 24b is not sufficient: */
      return lower_large_src(&intr->src[0], state);

   /* These should all be small enough to unconditionally use imul24: */
   case nir_intrinsic_shared_atomic_add:
   case nir_intrinsic_shared_atomic_imin:
   case nir_intrinsic_shared_atomic_umin:
   case nir_intrinsic_shared_atomic_imax:
   case nir_intrinsic_shared_atomic_umax:
   case nir_intrinsic_shared_atomic_and:
   case nir_intrinsic_shared_atomic_or:
   case nir_intrinsic_shared_atomic_xor:
   case nir_intrinsic_shared_atomic_exchange:
   case nir_intrinsic_shared_atomic_comp_swap:
   case nir_intrinsic_shared_atomic_fadd:
   case nir_intrinsic_shared_atomic_fmin:
   case nir_intrinsic_shared_atomic_fmax:
   case nir_intrinsic_shared_atomic_fcomp_swap:
   case nir_intrinsic_load_uniform:
   case nir_intrinsic_load_input:
   case nir_intrinsic_load_output:
   case nir_intrinsic_store_output:
   default:
      return false;
   }
}

static bool
lower_instr(lower_state *state, nir_instr *instr)
{
   bool progress = false;

   if (instr->type == nir_instr_type_intrinsic) {
      progress |= lower_intrinsic(state, nir_instr_as_intrinsic(instr));
   }

   return progress;
}

static bool
is_large(lower_state *state, nir_variable *var)
{
   unsigned size = state->type_size(var->type, false);

   /* If size is not known (i.e. a VLA) then assume the worst: */
   if (!size)
      return true;

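   /* imul24 multiplies signed 24b values, so the largest offset that
    * can safely be computed with it is 2^23 - 1; treat any variable
    * at least that large as "large":
    */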
   return size >= (1 << 23);
}

bool
nir_lower_amul(nir_shader *shader,
               int (*type_size)(const struct glsl_type *, bool))
{
   assert(shader->options->has_imul24);
   assert(type_size);

   /* The uniforms list actually includes UBOs and SSBOs: */
   int num_uniforms = exec_list_length(&shader->uniforms);

   NIR_VLA_FILL(bool, large_ubos, num_uniforms, 0);
   NIR_VLA_FILL(bool, large_ssbos, num_uniforms, 0);

   lower_state state = {
      .type_size = type_size,
      .large_ubos = large_ubos,
      .large_ssbos = large_ssbos,
   };

   /* Figure out which UBOs or SSBOs are large enough to be
    * disqualified from imul24:
    */
   nir_foreach_variable(var, &shader->uniforms) {
      if (var->data.mode == nir_var_mem_ubo) {
         assert(var->data.driver_location < num_uniforms);
         if (is_large(&state, var)) {
            state.has_large_ubo = true;
            state.large_ubos[var->data.driver_location] = true;
         }
      } else if (var->data.mode == nir_var_mem_ssbo) {
         assert(var->data.driver_location < num_uniforms);
         if (is_large(&state, var)) {
            state.has_large_ssbo = true;
            state.large_ssbos[var->data.driver_location] = true;
         }
      }
   }

   /* Clear pass flags, which lower_large_src() uses to mark
    * already-visited instructions:
    */
   nir_foreach_function(function, shader) {
      nir_function_impl *impl = function->impl;
      if (!impl)
         continue;

      nir_foreach_block(block, impl) {
         nir_foreach_instr(instr, block) {
            instr->pass_flags = 0;
         }
      }
   }

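   /* Step 2 (see the comment at the top of the file): find accesses
    * to large variables and lower the 'amul's feeding their offsets
    * to 'imul':
    */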
   bool progress = false;
   nir_foreach_function(function, shader) {
      nir_function_impl *impl = function->impl;

      if (!impl)
         continue;

      nir_foreach_block(block, impl) {
         nir_foreach_instr(instr, block) {
            progress |= lower_instr(&state, instr);
         }
      }
   }

   /* At this point, all 'amul's used in calculating an offset into
    * a large variable have been replaced with 'imul'.  So remaining
    * 'amul's can be replaced with 'imul24':
    */
   nir_foreach_function(function, shader) {
      nir_function_impl *impl = function->impl;

      if (!impl)
         continue;

      nir_foreach_block(block, impl) {
         nir_foreach_instr(instr, block) {
            if (instr->type != nir_instr_type_alu)
               continue;

            nir_alu_instr *alu = nir_instr_as_alu(instr);
            if (alu->op != nir_op_amul)
               continue;

            alu->op = nir_op_imul24;
            progress = true;
         }
      }

      nir_metadata_preserve(impl, nir_metadata_block_index |
                                  nir_metadata_dominance);
   }

   return progress;
}