src/intel/compiler/brw_nir_lower_mem_access_bit_sizes.c

   1 /*
   2  * Copyright © 2018 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include "brw_nir.h"
  25 #include "compiler/nir/nir_builder.h"
  26 #include "util/u_math.h"
  27 #include "util/bitscan.h"
  28
  29 static nir_ssa_def *
  30 dup_mem_intrinsic(nir_builder *b, nir_intrinsic_instr *intrin,
  31                   nir_ssa_def *store_src, int offset,
  32                   unsigned num_components, unsigned bit_size,
  33                   unsigned align)
  34 {
  35    const nir_intrinsic_info *info = &nir_intrinsic_infos[intrin->intrinsic];
  36
  37    nir_intrinsic_instr *dup =
  38       nir_intrinsic_instr_create(b->shader, intrin->intrinsic);
  39
  40    nir_src *intrin_offset_src = nir_get_io_offset_src(intrin);
  41    for (unsigned i = 0; i < info->num_srcs; i++) {
  42       assert(intrin->src[i].is_ssa);
  43       if (i == 0 && store_src) {
  44          assert(!info->has_dest);
  45          assert(&intrin->src[i] != intrin_offset_src);
  46          dup->src[i] = nir_src_for_ssa(store_src);
  47       } else if (&intrin->src[i] == intrin_offset_src) {
  48          dup->src[i] = nir_src_for_ssa(nir_iadd_imm(b, intrin->src[i].ssa,
  49                                                        offset));
  50       } else {
  51          dup->src[i] = nir_src_for_ssa(intrin->src[i].ssa);
  52       }
  53    }
  54
  55    dup->num_components = num_components;
  56
  57    for (unsigned i = 0; i < info->num_indices; i++)
  58       dup->const_index[i] = intrin->const_index[i];
  59
  60    nir_intrinsic_set_align(dup, align, 0);
  61
  62    if (info->has_dest) {
  63       assert(intrin->dest.is_ssa);
  64       nir_ssa_dest_init(&dup->instr, &dup->dest,
  65                         num_components, bit_size,
  66                         intrin->dest.ssa.name);
  67    } else {
  68       nir_intrinsic_set_write_mask(dup, (1 << num_components) - 1);
  69    }
  70
  71    nir_builder_instr_insert(b, &dup->instr);
  72
  73    return info->has_dest ? &dup->dest.ssa : NULL;
  74 }
  75
  76 static bool
  77 lower_mem_load_bit_size(nir_builder *b, nir_intrinsic_instr *intrin,
  78                         const struct gen_device_info *devinfo)
  79 {
  80    const bool needs_scalar =
  81       intrin->intrinsic == nir_intrinsic_load_scratch;
  82
  83    assert(intrin->dest.is_ssa);
  84    const unsigned bit_size = intrin->dest.ssa.bit_size;
  85    const unsigned num_components = intrin->dest.ssa.num_components;
  86    const unsigned bytes_read = num_components * (bit_size / 8);
  87    const unsigned align = nir_intrinsic_align(intrin);
  88
  89    if (bit_size == 32 && align >= 32 && intrin->num_components <= 4 &&
  90        (!needs_scalar || intrin->num_components == 1))
  91       return false;
  92
  93    nir_ssa_def *result;
  94    nir_src *offset_src = nir_get_io_offset_src(intrin);
  95    if (bit_size < 32 && nir_src_is_const(*offset_src)) {
  96       /* The offset is constant so we can use a 32-bit load and just shift it
  97        * around as needed.
  98        */
  99       const int load_offset = nir_src_as_uint(*offset_src) % 4;
 100       assert(load_offset % (bit_size / 8) == 0);
 101       const unsigned load_comps32 = DIV_ROUND_UP(bytes_read + load_offset, 4);
 102       /* A 16-bit vec4 is a 32-bit vec2.  We add an extra component in case
 103        * we offset into a component with load_offset.
 104        */
 105       assert(load_comps32 <= 3);
 106
 107       nir_ssa_def *load = dup_mem_intrinsic(b, intrin, NULL, -load_offset,
 108                                             load_comps32, 32, 4);
 109       result = nir_extract_bits(b, &load, 1, load_offset * 8,
 110                                 num_components, bit_size);
 111    } else {
 112       /* Otherwise, we have to break it into smaller loads.  We could end up
 113        * with as many as 32 loads if we're loading a u64vec16 from scratch.
 114        */
 115       nir_ssa_def *loads[32];
 116       unsigned num_loads = 0;
 117       int load_offset = 0;
 118       while (load_offset < bytes_read) {
 119          const unsigned bytes_left = bytes_read - load_offset;
 120          unsigned load_bit_size, load_comps;
 121          if (align < 4) {
 122             load_comps = 1;
 123             /* Choose a byte, word, or dword */
 124             load_bit_size = util_next_power_of_two(MIN2(bytes_left, 4)) * 8;
 125          } else {
 126             assert(load_offset % 4 == 0);
 127             load_bit_size = 32;
 128             load_comps = needs_scalar ? 1 :
 129                          DIV_ROUND_UP(MIN2(bytes_left, 16), 4);
 130          }
 131
 132          loads[num_loads++] = dup_mem_intrinsic(b, intrin, NULL, load_offset,
 133                                                 load_comps, load_bit_size,
 134                                                 align);
 135
 136          load_offset += load_comps * (load_bit_size / 8);
 137       }
 138       assert(num_loads <= ARRAY_SIZE(loads));
 139       result = nir_extract_bits(b, loads, num_loads, 0,
 140                                 num_components, bit_size);
 141    }
 142
 143    nir_ssa_def_rewrite_uses(&intrin->dest.ssa,
 144                             nir_src_for_ssa(result));
 145    nir_instr_remove(&intrin->instr);
 146
 147    return true;
 148 }
 149
 150 static bool
 151 lower_mem_store_bit_size(nir_builder *b, nir_intrinsic_instr *intrin,
 152                          const struct gen_device_info *devinfo)
 153 {
 154    const bool needs_scalar =
 155       intrin->intrinsic == nir_intrinsic_store_scratch;
 156
 157    assert(intrin->src[0].is_ssa);
 158    nir_ssa_def *value = intrin->src[0].ssa;
 159
 160    assert(intrin->num_components == value->num_components);
 161    const unsigned bit_size = value->bit_size;
 162    const unsigned num_components = intrin->num_components;
 163    const unsigned bytes_written = num_components * (bit_size / 8);
 164    const unsigned align_mul = nir_intrinsic_align_mul(intrin);
 165    const unsigned align_offset = nir_intrinsic_align_offset(intrin);
 166    const unsigned align = nir_intrinsic_align(intrin);
 167
 168    nir_component_mask_t writemask = nir_intrinsic_write_mask(intrin);
 169    assert(writemask < (1 << num_components));
 170
 171    if ((value->bit_size <= 32 && num_components == 1) ||
 172        (value->bit_size == 32 && num_components <= 4 && align >= 32 &&
 173         writemask == (1 << num_components) - 1 &&
 174         !needs_scalar))
 175       return false;
 176
 177    nir_src *offset_src = nir_get_io_offset_src(intrin);
 178    const bool offset_is_const = nir_src_is_const(*offset_src);
 179    const unsigned const_offset =
 180       offset_is_const ? nir_src_as_uint(*offset_src) : 0;
 181
 182    const unsigned byte_size = bit_size / 8;
 183    assert(byte_size <= sizeof(uint64_t));
 184
 185    BITSET_DECLARE(mask, NIR_MAX_VEC_COMPONENTS * sizeof(uint64_t));
 186    BITSET_ZERO(mask);
 187
 188    for (unsigned i = 0; i < num_components; i++) {
 189       if (writemask & (1u << i))
 190          BITSET_SET_RANGE(mask, i * byte_size, ((i + 1) * byte_size) - 1);
 191    }
 192
 193    while (BITSET_FFS(mask) != 0) {
 194       const int start = BITSET_FFS(mask) - 1;
 195
 196       int end;
 197       for (end = start + 1; end < bytes_written; end++) {
 198          if (!(BITSET_TEST(mask, end)))
 199             break;
 200       }
 201       /* The size of the current contiguous chunk in bytes */
 202       const unsigned chunk_bytes = end - start;
 203
 204       const bool is_dword_aligned =
 205          (align_mul >= 4 && (align_offset + start) % 4 == 0) ||
 206          (offset_is_const && (start + const_offset) % 4 == 0);
 207
 208       unsigned store_comps, store_bit_size, store_align;
 209       if (chunk_bytes >= 4 && is_dword_aligned) {
 210          store_align = MAX2(align, 4);
 211          store_bit_size = 32;
 212          store_comps = needs_scalar ? 1 : MIN2(chunk_bytes, 16) / 4;
 213       } else {
 214          store_align = align;
 215          store_comps = 1;
 216          store_bit_size = MIN2(chunk_bytes, 4) * 8;
 217          /* The bit size must be a power of two */
 218          if (store_bit_size == 24)
 219             store_bit_size = 16;
 220       }
 221       const unsigned store_bytes = store_comps * (store_bit_size / 8);
 222
 223       nir_ssa_def *packed = nir_extract_bits(b, &value, 1, start * 8,
 224                                              store_comps, store_bit_size);
 225
 226       dup_mem_intrinsic(b, intrin, packed, start,
 227                         store_comps, store_bit_size, store_align);
 228
 229       BITSET_CLEAR_RANGE(mask, start, (start + store_bytes - 1));
 230    }
 231
 232    nir_instr_remove(&intrin->instr);
 233
 234    return true;
 235 }
 236
 237 static bool
 238 lower_mem_access_bit_sizes_impl(nir_function_impl *impl,
 239                                 const struct gen_device_info *devinfo)
 240 {
 241    bool progress = false;
 242
 243    nir_builder b;
 244    nir_builder_init(&b, impl);
 245
 246    nir_foreach_block(block, impl) {
 247       nir_foreach_instr_safe(instr, block) {
 248          if (instr->type != nir_instr_type_intrinsic)
 249             continue;
 250
 251          b.cursor = nir_after_instr(instr);
 252
 253          nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
 254          switch (intrin->intrinsic) {
 255          case nir_intrinsic_load_global:
 256          case nir_intrinsic_load_ssbo:
 257          case nir_intrinsic_load_shared:
 258          case nir_intrinsic_load_scratch:
 259             if (lower_mem_load_bit_size(&b, intrin, devinfo))
 260                progress = true;
 261             break;
 262
 263          case nir_intrinsic_store_global:
 264          case nir_intrinsic_store_ssbo:
 265          case nir_intrinsic_store_shared:
 266          case nir_intrinsic_store_scratch:
 267             if (lower_mem_store_bit_size(&b, intrin, devinfo))
 268                progress = true;
 269             break;
 270
 271          default:
 272             break;
 273          }
 274       }
 275    }
 276
 277    if (progress) {
 278       nir_metadata_preserve(impl, nir_metadata_block_index |
 279                                   nir_metadata_dominance);
 280    } else {
 281       nir_metadata_preserve(impl, nir_metadata_all);
 282    }
 283
 284    return progress;
 285 }
 286
 287 /**
 288  * This pass loads arbitrary SSBO and shared memory load/store operations to
 289  * intrinsics which are natively handleable by GEN hardware.  In particular,
 290  * we have two general types of memory load/store messages:
 291  *
 292  *  - Untyped surface read/write:  These can load/store between one and four
 293  *    dword components to/from a dword-aligned offset.
 294  *
 295  *  - Byte scattered read/write:  These can load/store a single byte, word, or
 296  *    dword scalar to/from an unaligned byte offset.
 297  *
 298  * Neither type of message can do a write-masked store.  This pass converts
 299  * all nir load/store intrinsics into a series of either 8 or 32-bit
 300  * load/store intrinsics with a number of components that we can directly
 301  * handle in hardware and with a trivial write-mask.
 302  *
 303  * For scratch access, additional consideration has to be made due to the way
 304  * that we swizzle the memory addresses to achieve decent cache locality.  In
 305  * particular, even though untyped surface read/write messages exist and work,
 306  * we can't use them to load multiple components in a single SEND.  For more
 307  * detail on the scratch swizzle, see fs_visitor::swizzle_nir_scratch_addr.
 308  */
 309 bool
 310 brw_nir_lower_mem_access_bit_sizes(nir_shader *shader,
 311                                    const struct gen_device_info *devinfo)
 312 {
 313    bool progress = false;
 314
 315    nir_foreach_function(func, shader) {
 316       if (func->impl && lower_mem_access_bit_sizes_impl(func->impl, devinfo))
 317          progress = true;
 318    }
 319
 320    return progress;
 321 }