nir/load_store_vectorize: Fix shared atomic info
[mesa.git] src/compiler/nir/nir_opt_load_store_vectorize.c
1 /*
2 * Copyright © 2019 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /**
25 * Although it's called a load/store "vectorization" pass, this also combines
26 * intersecting and identical loads/stores. It currently supports derefs, ubo,
27 * ssbo, shared and push constant loads/stores.
28 *
29 * This doesn't handle copy_deref intrinsics and assumes that
30 * nir_lower_alu_to_scalar() has been called and that the IR is free from ALU
31 * modifiers. It also assumes that derefs have explicitly laid out types.
32 *
33 * After vectorization, the backend may want to call nir_lower_alu_to_scalar()
34 * and nir_lower_pack(). This pass also creates cast instructions with derefs
35 * as sources, which some parts of NIR may not be able to handle well.
36 *
37 * There are a few situations where this doesn't vectorize as well as it could:
38 * - It won't turn four consecutive vec3 loads into 3 vec4 loads.
39 * - It doesn't do global vectorization.
40 * Handling these cases probably wouldn't provide much benefit though.
41 *
42 * This probably doesn't handle big-endian GPUs correctly.
43 */
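/*
 * A rough usage sketch (illustrative only; the callback name and the 128-bit
 * limit are assumptions, and the parameter order simply mirrors the callback
 * invocation in new_bitsize_acceptable() below):
 *
 *    static bool
 *    mem_vectorize_cb(unsigned align, unsigned bit_size,
 *                     unsigned num_components, unsigned high_offset,
 *                     nir_intrinsic_instr *low, nir_intrinsic_instr *high)
 *    {
 *       return num_components * bit_size <= 128 &&
 *              align % (bit_size / 8u) == 0;
 *    }
 *
 *    nir_opt_load_store_vectorize(shader, nir_var_mem_ubo | nir_var_mem_ssbo |
 *                                 nir_var_mem_push_const | nir_var_mem_shared,
 *                                 mem_vectorize_cb);
 */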
44
45 #include "nir.h"
46 #include "nir_deref.h"
47 #include "nir_builder.h"
48 #include "nir_worklist.h"
49 #include "util/u_dynarray.h"
50
51 #include <stdlib.h>
52
53 struct intrinsic_info {
54 nir_variable_mode mode; /* 0 if the mode is obtained from the deref. */
55 nir_intrinsic_op op;
56 bool is_atomic;
57 /* Indices into nir_intrinsic::src[] or -1 if not applicable. */
58 int resource_src; /* resource (e.g. from vulkan_resource_index) */
59 int base_src; /* the offset source it loads/stores from */
60 int deref_src; /* the deref it loads/stores from */
61 int value_src; /* the data it is storing */
62 };
63
64 static const struct intrinsic_info *
65 get_info(nir_intrinsic_op op) {
66 switch (op) {
67 #define INFO(mode, op, atomic, res, base, deref, val) \
68 case nir_intrinsic_##op: {\
69 static const struct intrinsic_info op##_info = {mode, nir_intrinsic_##op, atomic, res, base, deref, val};\
70 return &op##_info;\
71 }
72 #define LOAD(mode, op, res, base, deref) INFO(mode, load_##op, false, res, base, deref, -1)
73 #define STORE(mode, op, res, base, deref, val) INFO(mode, store_##op, false, res, base, deref, val)
74 #define ATOMIC(mode, type, op, res, base, deref, val) INFO(mode, type##_atomic_##op, true, res, base, deref, val)
75 LOAD(nir_var_mem_push_const, push_constant, -1, 0, -1)
76 LOAD(nir_var_mem_ubo, ubo, 0, 1, -1)
77 LOAD(nir_var_mem_ssbo, ssbo, 0, 1, -1)
78 STORE(nir_var_mem_ssbo, ssbo, 1, 2, -1, 0)
79 LOAD(0, deref, -1, -1, 0)
80 STORE(0, deref, -1, -1, 0, 1)
81 LOAD(nir_var_mem_shared, shared, -1, 0, -1)
82 STORE(nir_var_mem_shared, shared, -1, 1, -1, 0)
83 ATOMIC(nir_var_mem_ssbo, ssbo, add, 0, 1, -1, 2)
84 ATOMIC(nir_var_mem_ssbo, ssbo, imin, 0, 1, -1, 2)
85 ATOMIC(nir_var_mem_ssbo, ssbo, umin, 0, 1, -1, 2)
86 ATOMIC(nir_var_mem_ssbo, ssbo, imax, 0, 1, -1, 2)
87 ATOMIC(nir_var_mem_ssbo, ssbo, umax, 0, 1, -1, 2)
88 ATOMIC(nir_var_mem_ssbo, ssbo, and, 0, 1, -1, 2)
89 ATOMIC(nir_var_mem_ssbo, ssbo, or, 0, 1, -1, 2)
90 ATOMIC(nir_var_mem_ssbo, ssbo, xor, 0, 1, -1, 2)
91 ATOMIC(nir_var_mem_ssbo, ssbo, exchange, 0, 1, -1, 2)
92 ATOMIC(nir_var_mem_ssbo, ssbo, comp_swap, 0, 1, -1, 2)
93 ATOMIC(nir_var_mem_ssbo, ssbo, fadd, 0, 1, -1, 2)
94 ATOMIC(nir_var_mem_ssbo, ssbo, fmin, 0, 1, -1, 2)
95 ATOMIC(nir_var_mem_ssbo, ssbo, fmax, 0, 1, -1, 2)
96 ATOMIC(nir_var_mem_ssbo, ssbo, fcomp_swap, 0, 1, -1, 2)
97 ATOMIC(0, deref, add, -1, -1, 0, 1)
98 ATOMIC(0, deref, imin, -1, -1, 0, 1)
99 ATOMIC(0, deref, umin, -1, -1, 0, 1)
100 ATOMIC(0, deref, imax, -1, -1, 0, 1)
101 ATOMIC(0, deref, umax, -1, -1, 0, 1)
102 ATOMIC(0, deref, and, -1, -1, 0, 1)
103 ATOMIC(0, deref, or, -1, -1, 0, 1)
104 ATOMIC(0, deref, xor, -1, -1, 0, 1)
105 ATOMIC(0, deref, exchange, -1, -1, 0, 1)
106 ATOMIC(0, deref, comp_swap, -1, -1, 0, 1)
107 ATOMIC(0, deref, fadd, -1, -1, 0, 1)
108 ATOMIC(0, deref, fmin, -1, -1, 0, 1)
109 ATOMIC(0, deref, fmax, -1, -1, 0, 1)
110 ATOMIC(0, deref, fcomp_swap, -1, -1, 0, 1)
111 ATOMIC(nir_var_mem_shared, shared, add, -1, 0, -1, 1)
112 ATOMIC(nir_var_mem_shared, shared, imin, -1, 0, -1, 1)
113 ATOMIC(nir_var_mem_shared, shared, umin, -1, 0, -1, 1)
114 ATOMIC(nir_var_mem_shared, shared, imax, -1, 0, -1, 1)
115 ATOMIC(nir_var_mem_shared, shared, umax, -1, 0, -1, 1)
116 ATOMIC(nir_var_mem_shared, shared, and, -1, 0, -1, 1)
117 ATOMIC(nir_var_mem_shared, shared, or, -1, 0, -1, 1)
118 ATOMIC(nir_var_mem_shared, shared, xor, -1, 0, -1, 1)
119 ATOMIC(nir_var_mem_shared, shared, exchange, -1, 0, -1, 1)
120 ATOMIC(nir_var_mem_shared, shared, comp_swap, -1, 0, -1, 1)
121 ATOMIC(nir_var_mem_shared, shared, fadd, -1, 0, -1, 1)
122 ATOMIC(nir_var_mem_shared, shared, fmin, -1, 0, -1, 1)
123 ATOMIC(nir_var_mem_shared, shared, fmax, -1, 0, -1, 1)
124 ATOMIC(nir_var_mem_shared, shared, fcomp_swap, -1, 0, -1, 1)
125 default:
126 break;
127 #undef ATOMIC
128 #undef STORE
129 #undef LOAD
130 #undef INFO
131 }
132 return NULL;
133 }
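/*
 * Reading one row of the table above as an example: the
 * STORE(nir_var_mem_ssbo, ssbo, 1, 2, -1, 0) entry says that for store_ssbo
 * the value is src[0], the resource is src[1] and the offset is src[2], with
 * no deref source. The shared atomics likewise take their offset from src[0]
 * and their data from src[1], which is what their -1, 0, -1, 1 columns encode.
 */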
134
135 /*
136 * Information used to compare memory operations.
137 * It canonically represents an offset as:
138 * `offset_defs[0]*offset_defs_mul[0] + offset_defs[1]*offset_defs_mul[1] + ...`
139 * "offset_defs" is sorted in ascenting order by the ssa definition's index.
140 * "resource" or "var" may be NULL.
141 */
142 struct entry_key {
143 nir_ssa_def *resource;
144 nir_variable *var;
145 unsigned offset_def_count;
146 nir_ssa_def **offset_defs;
147 uint64_t *offset_defs_mul;
148 };
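/*
 * Worked example (with hypothetical SSA defs a and b): an access whose offset
 * is computed as "a*16 + b*4 + 8" gets offset_defs = {a, b} and
 * offset_defs_mul = {16, 4}, while the constant 8 is folded into
 * struct entry::offset below. Accesses with equal keys can then be ordered
 * and merged purely by comparing their constant offsets.
 */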
149
150 /* Information on a single memory operation. */
151 struct entry {
152 struct list_head head;
153 unsigned index;
154
155 struct entry_key *key;
156 union {
157 uint64_t offset; /* sign-extended */
158 int64_t offset_signed;
159 };
160 uint32_t best_align;
161
162 nir_instr *instr;
163 nir_intrinsic_instr *intrin;
164 const struct intrinsic_info *info;
165 enum gl_access_qualifier access;
166 bool is_store;
167
168 nir_deref_instr *deref;
169 };
170
171 struct vectorize_ctx {
172 nir_variable_mode modes;
173 nir_should_vectorize_mem_func callback;
174 struct list_head entries[nir_num_variable_modes];
175 struct hash_table *loads[nir_num_variable_modes];
176 struct hash_table *stores[nir_num_variable_modes];
177 };
178
179 static uint32_t hash_entry_key(const void *key_)
180 {
181 /* this is careful to not include pointers in the hash calculation so that
182 * the order of the hash table walk is deterministic */
183 struct entry_key *key = (struct entry_key*)key_;
184
185 uint32_t hash = _mesa_fnv32_1a_offset_bias;
186 if (key->resource)
187 hash = _mesa_fnv32_1a_accumulate(hash, key->resource->index);
188 if (key->var) {
189 hash = _mesa_fnv32_1a_accumulate(hash, key->var->index);
190 unsigned mode = key->var->data.mode;
191 hash = _mesa_fnv32_1a_accumulate(hash, mode);
192 }
193
194 for (unsigned i = 0; i < key->offset_def_count; i++)
195 hash = _mesa_fnv32_1a_accumulate(hash, key->offset_defs[i]->index);
196
197 hash = _mesa_fnv32_1a_accumulate_block(
198 hash, key->offset_defs_mul, key->offset_def_count * sizeof(uint64_t));
199
200 return hash;
201 }
202
203 static bool entry_key_equals(const void *a_, const void *b_)
204 {
205 struct entry_key *a = (struct entry_key*)a_;
206 struct entry_key *b = (struct entry_key*)b_;
207
208 if (a->var != b->var || a->resource != b->resource)
209 return false;
210
211 if (a->offset_def_count != b->offset_def_count)
212 return false;
213
214 size_t offset_def_size = a->offset_def_count * sizeof(nir_ssa_def *);
215 size_t offset_def_mul_size = a->offset_def_count * sizeof(uint64_t);
216 if (a->offset_def_count &&
217 (memcmp(a->offset_defs, b->offset_defs, offset_def_size) ||
218 memcmp(a->offset_defs_mul, b->offset_defs_mul, offset_def_mul_size)))
219 return false;
220
221 return true;
222 }
223
224 static void delete_entry_dynarray(struct hash_entry *entry)
225 {
226 struct util_dynarray *arr = (struct util_dynarray *)entry->data;
227 ralloc_free(arr);
228 }
229
230 static int sort_entries(const void *a_, const void *b_)
231 {
232 struct entry *a = *(struct entry*const*)a_;
233 struct entry *b = *(struct entry*const*)b_;
234
235 if (a->offset_signed > b->offset_signed)
236 return 1;
237 else if (a->offset_signed < b->offset_signed)
238 return -1;
239 else
240 return 0;
241 }
242
243 static unsigned
244 get_bit_size(struct entry *entry)
245 {
246 unsigned size = entry->is_store ?
247 entry->intrin->src[entry->info->value_src].ssa->bit_size :
248 entry->intrin->dest.ssa.bit_size;
249 return size == 1 ? 32u : size;
250 }
251
252 /* If "def" is from an alu instruction with the opcode "op" and one of it's
253 * sources is a constant, update "def" to be the non-constant source, fill "c"
254 * with the constant and return true. */
255 static bool
256 parse_alu(nir_ssa_def **def, nir_op op, uint64_t *c)
257 {
258 nir_ssa_scalar scalar;
259 scalar.def = *def;
260 scalar.comp = 0;
261
262 if (!nir_ssa_scalar_is_alu(scalar) || nir_ssa_scalar_alu_op(scalar) != op)
263 return false;
264
265 nir_ssa_scalar src0 = nir_ssa_scalar_chase_alu_src(scalar, 0);
266 nir_ssa_scalar src1 = nir_ssa_scalar_chase_alu_src(scalar, 1);
267 if (op != nir_op_ishl && nir_ssa_scalar_is_const(src0) && src1.comp == 0) {
268 *c = nir_ssa_scalar_as_uint(src0);
269 *def = src1.def;
270 } else if (nir_ssa_scalar_is_const(src1) && src0.comp == 0) {
271 *c = nir_ssa_scalar_as_uint(src1);
272 *def = src0.def;
273 } else {
274 return false;
275 }
276 return true;
277 }
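/*
 * For example, if *def is the result of "iadd(x, 4)" and op is nir_op_iadd,
 * this sets *c = 4, updates *def to x and returns true. For nir_op_ishl only
 * a constant second source is accepted because the operation isn't
 * commutative.
 */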
278
279 /* Parses an offset expression such as "a * 16 + 4" or "(a * 16 + 4) * 64 + 32". */
280 static void
281 parse_offset(nir_ssa_def **base, uint64_t *base_mul, uint64_t *offset)
282 {
283 if ((*base)->parent_instr->type == nir_instr_type_load_const) {
284 *offset = nir_src_comp_as_uint(nir_src_for_ssa(*base), 0);
285 *base = NULL;
286 return;
287 }
288
289 uint64_t mul = 1;
290 uint64_t add = 0;
291 bool progress = false;
292 do {
293 uint64_t mul2 = 1, add2 = 0;
294
295 progress = parse_alu(base, nir_op_imul, &mul2);
296 mul *= mul2;
297
298 mul2 = 0;
299 progress |= parse_alu(base, nir_op_ishl, &mul2);
300 mul <<= mul2;
301
302 progress |= parse_alu(base, nir_op_iadd, &add2);
303 add += add2 * mul;
304 } while (progress);
305
306 *base_mul = mul;
307 *offset = add;
308 }
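/*
 * Working through the second example from the comment above,
 * "(a * 16 + 4) * 64 + 32": the loop strips the "+ 32", then the "* 64", then
 * the inner "+ 4" and "* 16", leaving *base = a, *base_mul = 16 * 64 = 1024
 * and *offset = 32 + 4 * 64 = 288.
 */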
309
310 static unsigned
311 type_scalar_size_bytes(const struct glsl_type *type)
312 {
313 assert(glsl_type_is_vector_or_scalar(type) ||
314 glsl_type_is_matrix(type));
315 return glsl_type_is_boolean(type) ? 4u : glsl_get_bit_size(type) / 8u;
316 }
317
318 static int
319 get_array_stride(const struct glsl_type *type)
320 {
321 unsigned explicit_stride = glsl_get_explicit_stride(type);
322 if ((glsl_type_is_matrix(type) &&
323 glsl_matrix_type_is_row_major(type)) ||
324 (glsl_type_is_vector(type) && explicit_stride == 0))
325 return type_scalar_size_bytes(type);
326 return explicit_stride;
327 }
328
329 static uint64_t
330 mask_sign_extend(uint64_t val, unsigned bit_size)
331 {
332 return (int64_t)(val << (64 - bit_size)) >> (64 - bit_size);
333 }
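/* e.g. mask_sign_extend(0xff, 8) == UINT64_MAX (i.e. -1), so constants taken
 * from narrow sources keep their signed meaning in 64-bit arithmetic. */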
334
335 static unsigned
336 add_to_entry_key(nir_ssa_def **offset_defs, uint64_t *offset_defs_mul,
337 unsigned offset_def_count, nir_ssa_def *def, uint64_t mul)
338 {
339 mul = mask_sign_extend(mul, def->bit_size);
340
341 for (unsigned i = 0; i <= offset_def_count; i++) {
342 if (i == offset_def_count || def->index > offset_defs[i]->index) {
343 /* insert before i */
344 memmove(offset_defs + i + 1, offset_defs + i,
345 (offset_def_count - i) * sizeof(nir_ssa_def *));
346 memmove(offset_defs_mul + i + 1, offset_defs_mul + i,
347 (offset_def_count - i) * sizeof(uint64_t));
348 offset_defs[i] = def;
349 offset_defs_mul[i] = mul;
350 return 1;
351 } else if (def->index == offset_defs[i]->index) {
352 /* merge with offset_def at i */
353 offset_defs_mul[i] += mul;
354 return 0;
355 }
356 }
357 unreachable("Unreachable.");
358 return 0;
359 }
360
361 static struct entry_key *
362 create_entry_key_from_deref(void *mem_ctx,
363 struct vectorize_ctx *ctx,
364 nir_deref_path *path,
365 uint64_t *offset_base)
366 {
367 unsigned path_len = 0;
368 while (path->path[path_len])
369 path_len++;
370
371 nir_ssa_def *offset_defs_stack[32];
372 uint64_t offset_defs_mul_stack[32];
373 nir_ssa_def **offset_defs = offset_defs_stack;
374 uint64_t *offset_defs_mul = offset_defs_mul_stack;
375 if (path_len > 32) {
376 offset_defs = malloc(path_len * sizeof(nir_ssa_def *));
377 offset_defs_mul = malloc(path_len * sizeof(uint64_t));
378 }
379 unsigned offset_def_count = 0;
380
381 struct entry_key* key = ralloc(mem_ctx, struct entry_key);
382 key->resource = NULL;
383 key->var = NULL;
384 *offset_base = 0;
385
386 for (unsigned i = 0; i < path_len; i++) {
387 nir_deref_instr *parent = i ? path->path[i - 1] : NULL;
388 nir_deref_instr *deref = path->path[i];
389
390 switch (deref->deref_type) {
391 case nir_deref_type_var: {
392 assert(!parent);
393 key->var = deref->var;
394 break;
395 }
396 case nir_deref_type_array:
397 case nir_deref_type_ptr_as_array: {
398 assert(parent);
399 nir_ssa_def *index = deref->arr.index.ssa;
400 uint32_t stride;
401 if (deref->deref_type == nir_deref_type_ptr_as_array)
402 stride = nir_deref_instr_ptr_as_array_stride(deref);
403 else
404 stride = get_array_stride(parent->type);
405
406 nir_ssa_def *base = index;
407 uint64_t offset = 0, base_mul = 1;
408 parse_offset(&base, &base_mul, &offset);
409 offset = mask_sign_extend(offset, index->bit_size);
410
411 *offset_base += offset * stride;
412 if (base) {
413 offset_def_count += add_to_entry_key(offset_defs, offset_defs_mul,
414 offset_def_count,
415 base, base_mul * stride);
416 }
417 break;
418 }
419 case nir_deref_type_struct: {
420 assert(parent);
421 int offset = glsl_get_struct_field_offset(parent->type, deref->strct.index);
422 *offset_base += offset;
423 break;
424 }
425 case nir_deref_type_cast: {
426 if (!parent)
427 key->resource = deref->parent.ssa;
428 break;
429 }
430 default:
431 unreachable("Unhandled deref type");
432 }
433 }
434
435 key->offset_def_count = offset_def_count;
436 key->offset_defs = ralloc_array(mem_ctx, nir_ssa_def *, offset_def_count);
437 key->offset_defs_mul = ralloc_array(mem_ctx, uint64_t, offset_def_count);
438 memcpy(key->offset_defs, offset_defs, offset_def_count * sizeof(nir_ssa_def *));
439 memcpy(key->offset_defs_mul, offset_defs_mul, offset_def_count * sizeof(uint64_t));
440
441 if (offset_defs != offset_defs_stack)
442 free(offset_defs);
443 if (offset_defs_mul != offset_defs_mul_stack)
444 free(offset_defs_mul);
445
446 return key;
447 }
448
449 static unsigned
450 parse_entry_key_from_offset(struct entry_key *key, unsigned size, unsigned left,
451 nir_ssa_def *base, uint64_t base_mul, uint64_t *offset)
452 {
453 uint64_t new_mul;
454 uint64_t new_offset;
455 parse_offset(&base, &new_mul, &new_offset);
456 *offset += new_offset * base_mul;
457
458 if (!base)
459 return 0;
460
461 base_mul *= new_mul;
462
463 assert(left >= 1);
464
465 if (left >= 2) {
466 nir_ssa_scalar scalar;
467 scalar.def = base;
468 scalar.comp = 0;
469 if (nir_ssa_scalar_is_alu(scalar) && nir_ssa_scalar_alu_op(scalar) == nir_op_iadd) {
470 nir_ssa_scalar src0 = nir_ssa_scalar_chase_alu_src(scalar, 0);
471 nir_ssa_scalar src1 = nir_ssa_scalar_chase_alu_src(scalar, 1);
472 if (src0.comp == 0 && src1.comp == 0) {
473 unsigned amount = parse_entry_key_from_offset(key, size, left - 1, src0.def, base_mul, offset);
474 amount += parse_entry_key_from_offset(key, size + amount, left - amount, src1.def, base_mul, offset);
475 return amount;
476 }
477 }
478 }
479
480 return add_to_entry_key(key->offset_defs, key->offset_defs_mul, size, base, base_mul);
481 }
482
483 static struct entry_key *
484 create_entry_key_from_offset(void *mem_ctx, nir_ssa_def *base, uint64_t base_mul, uint64_t *offset)
485 {
486 struct entry_key *key = ralloc(mem_ctx, struct entry_key);
487 key->resource = NULL;
488 key->var = NULL;
489 if (base) {
490 nir_ssa_def *offset_defs[32];
491 uint64_t offset_defs_mul[32];
492 key->offset_defs = offset_defs;
493 key->offset_defs_mul = offset_defs_mul;
494
495 key->offset_def_count = parse_entry_key_from_offset(key, 0, 32, base, base_mul, offset);
496
497 key->offset_defs = ralloc_array(mem_ctx, nir_ssa_def *, key->offset_def_count);
498 key->offset_defs_mul = ralloc_array(mem_ctx, uint64_t, key->offset_def_count);
499 memcpy(key->offset_defs, offset_defs, key->offset_def_count * sizeof(nir_ssa_def *));
500 memcpy(key->offset_defs_mul, offset_defs_mul, key->offset_def_count * sizeof(uint64_t));
501 } else {
502 key->offset_def_count = 0;
503 key->offset_defs = NULL;
504 key->offset_defs_mul = NULL;
505 }
506 return key;
507 }
508
509 static nir_variable_mode
510 get_variable_mode(struct entry *entry)
511 {
512 if (entry->info->mode)
513 return entry->info->mode;
514 assert(entry->deref);
515 return entry->deref->mode;
516 }
517
518 static struct entry *
519 create_entry(struct vectorize_ctx *ctx,
520 const struct intrinsic_info *info,
521 nir_intrinsic_instr *intrin)
522 {
523 struct entry *entry = rzalloc(ctx, struct entry);
524 entry->intrin = intrin;
525 entry->instr = &intrin->instr;
526 entry->info = info;
527 entry->best_align = UINT32_MAX;
528 entry->is_store = entry->info->value_src >= 0;
529
530 if (entry->info->deref_src >= 0) {
531 entry->deref = nir_src_as_deref(intrin->src[entry->info->deref_src]);
532 nir_deref_path path;
533 nir_deref_path_init(&path, entry->deref, NULL);
534 entry->key = create_entry_key_from_deref(entry, ctx, &path, &entry->offset);
535 nir_deref_path_finish(&path);
536 } else {
537 nir_ssa_def *base = entry->info->base_src >= 0 ?
538 intrin->src[entry->info->base_src].ssa : NULL;
539 uint64_t offset = 0;
540 if (nir_intrinsic_infos[intrin->intrinsic].index_map[NIR_INTRINSIC_BASE])
541 offset += nir_intrinsic_base(intrin);
542 entry->key = create_entry_key_from_offset(entry, base, 1, &offset);
543 entry->offset = offset;
544
545 if (base)
546 entry->offset = mask_sign_extend(entry->offset, base->bit_size);
547 }
548
549 if (entry->info->resource_src >= 0)
550 entry->key->resource = intrin->src[entry->info->resource_src].ssa;
551
552 if (nir_intrinsic_infos[intrin->intrinsic].index_map[NIR_INTRINSIC_ACCESS])
553 entry->access = nir_intrinsic_access(intrin);
554 else if (entry->key->var)
555 entry->access = entry->key->var->data.access;
556
557 uint32_t restrict_modes = nir_var_shader_in | nir_var_shader_out;
558 restrict_modes |= nir_var_shader_temp | nir_var_function_temp;
559 restrict_modes |= nir_var_uniform | nir_var_mem_push_const;
560 restrict_modes |= nir_var_system_value | nir_var_mem_shared;
561 if (get_variable_mode(entry) & restrict_modes)
562 entry->access |= ACCESS_RESTRICT;
563
564 return entry;
565 }
566
567 static nir_deref_instr *
568 cast_deref(nir_builder *b, unsigned num_components, unsigned bit_size, nir_deref_instr *deref)
569 {
570 if (glsl_get_components(deref->type) == num_components &&
571 type_scalar_size_bytes(deref->type)*8u == bit_size)
572 return deref;
573
574 enum glsl_base_type types[] = {
575 GLSL_TYPE_UINT8, GLSL_TYPE_UINT16, GLSL_TYPE_UINT, GLSL_TYPE_UINT64};
576 enum glsl_base_type base = types[ffs(bit_size / 8u) - 1u];
577 const struct glsl_type *type = glsl_vector_type(base, num_components);
578
579 if (deref->type == type)
580 return deref;
581
582 return nir_build_deref_cast(b, &deref->dest.ssa, deref->mode, type, 0);
583 }
584
585 /* Return true if the write mask "write_mask" of a store with "old_bit_size"
586 * bits per element can be represented for a store with "new_bit_size" bits per
587 * element. */
588 static bool
589 writemask_representable(unsigned write_mask, unsigned old_bit_size, unsigned new_bit_size)
590 {
591 while (write_mask) {
592 int start, count;
593 u_bit_scan_consecutive_range(&write_mask, &start, &count);
594 start *= old_bit_size;
595 count *= old_bit_size;
596 if (start % new_bit_size != 0)
597 return false;
598 if (count % new_bit_size != 0)
599 return false;
600 }
601 return true;
602 }
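/*
 * For example, a 32-bit store with write_mask 0b0011 maps to a single enabled
 * 64-bit component (bits 0..63), but write_mask 0b0010 does not: its enabled
 * range starts at bit 32, which isn't 64-bit aligned.
 */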
603
604 static uint64_t
605 gcd(uint64_t a, uint64_t b)
606 {
607 while (b) {
608 uint64_t old_b = b;
609 b = a % b;
610 a = old_b;
611 }
612 return a;
613 }
614
615 static uint32_t
616 get_best_align(struct entry *entry)
617 {
618 if (entry->best_align != UINT32_MAX)
619 return entry->best_align;
620
621 uint64_t best_align = entry->offset;
622 for (unsigned i = 0; i < entry->key->offset_def_count; i++) {
623 if (!best_align)
624 best_align = entry->key->offset_defs_mul[i];
625 else if (entry->key->offset_defs_mul[i])
626 best_align = gcd(best_align, entry->key->offset_defs_mul[i]);
627 }
628
629 if (nir_intrinsic_infos[entry->intrin->intrinsic].index_map[NIR_INTRINSIC_ALIGN_MUL])
630 best_align = MAX2(best_align, nir_intrinsic_align(entry->intrin));
631
632 /* ensure the result is a power of two that fits in an int32_t */
633 entry->best_align = gcd(best_align, 1u << 30);
634
635 return entry->best_align;
636 }
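/*
 * For example, an entry with a constant offset of 24 and a single
 * offset_defs_mul of 16 gets gcd(24, 16) = 8, i.e. the access is known to be
 * 8-byte aligned whatever the value of the non-constant part of the offset.
 */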
637
638 /* Return true if "new_bit_size" is a usable bit size for a vectorized load/store
639 * of "low" and "high". */
640 static bool
641 new_bitsize_acceptable(struct vectorize_ctx *ctx, unsigned new_bit_size,
642 struct entry *low, struct entry *high, unsigned size)
643 {
644 if (size % new_bit_size != 0)
645 return false;
646
647 unsigned new_num_components = size / new_bit_size;
648 if (!nir_num_components_valid(new_num_components))
649 return false;
650
651 unsigned high_offset = high->offset_signed - low->offset_signed;
652
653 /* check nir_extract_bits limitations */
654 unsigned common_bit_size = MIN2(get_bit_size(low), get_bit_size(high));
655 common_bit_size = MIN2(common_bit_size, new_bit_size);
656 if (high_offset > 0)
657 common_bit_size = MIN2(common_bit_size, (1u << (ffs(high_offset * 8) - 1)));
658 if (new_bit_size / common_bit_size > NIR_MAX_VEC_COMPONENTS)
659 return false;
660
661 if (!ctx->callback(get_best_align(low), new_bit_size, new_num_components,
662 high_offset, low->intrin, high->intrin))
663 return false;
664
665 if (low->is_store) {
666 unsigned low_size = low->intrin->num_components * get_bit_size(low);
667 unsigned high_size = high->intrin->num_components * get_bit_size(high);
668
669 if (low_size % new_bit_size != 0)
670 return false;
671 if (high_size % new_bit_size != 0)
672 return false;
673
674 unsigned write_mask = nir_intrinsic_write_mask(low->intrin);
675 if (!writemask_representable(write_mask, low_size, new_bit_size))
676 return false;
677
678 write_mask = nir_intrinsic_write_mask(high->intrin);
679 if (!writemask_representable(write_mask, high_size, new_bit_size))
680 return false;
681 }
682
683 return true;
684 }
685
686 /* Updates a write mask, "write_mask", so that it can be used with a
687 * "new_bit_size"-bit store instead of a "old_bit_size"-bit store. */
688 static uint32_t
689 update_writemask(unsigned write_mask, unsigned old_bit_size, unsigned new_bit_size)
690 {
691 uint32_t res = 0;
692 while (write_mask) {
693 int start, count;
694 u_bit_scan_consecutive_range(&write_mask, &start, &count);
695 start = start * old_bit_size / new_bit_size;
696 count = count * old_bit_size / new_bit_size;
697 res |= ((1 << count) - 1) << start;
698 }
699 return res;
700 }
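/* e.g. converting write_mask 0b01 from a 64-bit store to a 32-bit store gives
 * 0b11: the single enabled 64-bit component covers two 32-bit components. */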
701
702 static nir_deref_instr *subtract_deref(nir_builder *b, nir_deref_instr *deref, int64_t offset)
703 {
704 /* avoid adding another deref to the path */
705 if (deref->deref_type == nir_deref_type_ptr_as_array &&
706 nir_src_is_const(deref->arr.index) &&
707 offset % nir_deref_instr_ptr_as_array_stride(deref) == 0) {
708 unsigned stride = nir_deref_instr_ptr_as_array_stride(deref);
709 nir_ssa_def *index = nir_imm_intN_t(b, nir_src_as_int(deref->arr.index) - offset / stride,
710 deref->dest.ssa.bit_size);
711 return nir_build_deref_ptr_as_array(b, nir_deref_instr_parent(deref), index);
712 }
713
714 if (deref->deref_type == nir_deref_type_array &&
715 nir_src_is_const(deref->arr.index)) {
716 nir_deref_instr *parent = nir_deref_instr_parent(deref);
717 unsigned stride = glsl_get_explicit_stride(parent->type);
718 if (offset % stride == 0)
719 return nir_build_deref_array_imm(
720 b, parent, nir_src_as_int(deref->arr.index) - offset / stride);
721 }
722
723
724 deref = nir_build_deref_cast(b, &deref->dest.ssa, deref->mode,
725 glsl_scalar_type(GLSL_TYPE_UINT8), 1);
726 return nir_build_deref_ptr_as_array(
727 b, deref, nir_imm_intN_t(b, -offset, deref->dest.ssa.bit_size));
728 }
729
730 static bool update_align(struct entry *entry)
731 {
732 bool has_align_index =
733 nir_intrinsic_infos[entry->intrin->intrinsic].index_map[NIR_INTRINSIC_ALIGN_MUL];
734 if (has_align_index) {
735 unsigned align = get_best_align(entry);
736 if (align != nir_intrinsic_align(entry->intrin)) {
737 nir_intrinsic_set_align(entry->intrin, align, 0);
738 return true;
739 }
740 }
741 return false;
742 }
743
744 static void
745 vectorize_loads(nir_builder *b, struct vectorize_ctx *ctx,
746 struct entry *low, struct entry *high,
747 struct entry *first, struct entry *second,
748 unsigned new_bit_size, unsigned new_num_components,
749 unsigned high_start)
750 {
751 unsigned low_bit_size = get_bit_size(low);
752 unsigned high_bit_size = get_bit_size(high);
753 bool low_bool = low->intrin->dest.ssa.bit_size == 1;
754 bool high_bool = high->intrin->dest.ssa.bit_size == 1;
755 nir_ssa_def *data = &first->intrin->dest.ssa;
756
757 b->cursor = nir_after_instr(first->instr);
758
759 /* update the load's destination size and extract data for each of the original loads */
760 data->num_components = new_num_components;
761 data->bit_size = new_bit_size;
762
763 nir_ssa_def *low_def = nir_extract_bits(
764 b, &data, 1, 0, low->intrin->num_components, low_bit_size);
765 nir_ssa_def *high_def = nir_extract_bits(
766 b, &data, 1, high_start, high->intrin->num_components, high_bit_size);
767
768 /* convert booleans */
769 low_def = low_bool ? nir_i2b(b, low_def) : nir_mov(b, low_def);
770 high_def = high_bool ? nir_i2b(b, high_def) : nir_mov(b, high_def);
771
772 /* update uses */
773 if (first == low) {
774 nir_ssa_def_rewrite_uses_after(&low->intrin->dest.ssa, nir_src_for_ssa(low_def),
775 high_def->parent_instr);
776 nir_ssa_def_rewrite_uses(&high->intrin->dest.ssa, nir_src_for_ssa(high_def));
777 } else {
778 nir_ssa_def_rewrite_uses(&low->intrin->dest.ssa, nir_src_for_ssa(low_def));
779 nir_ssa_def_rewrite_uses_after(&high->intrin->dest.ssa, nir_src_for_ssa(high_def),
780 high_def->parent_instr);
781 }
782
783 /* update the intrinsic */
784 first->intrin->num_components = new_num_components;
785
786 const struct intrinsic_info *info = first->info;
787
788 /* update the offset */
789 if (first != low && info->base_src >= 0) {
790 /* let nir_opt_algebraic() remove this addition. Subtracting 16 from
791 * expressions like "(i + 1) * 16" doesn't cause many issues because
792 * nir_opt_algebraic() turns them into "i * 16 + 16" */
793 b->cursor = nir_before_instr(first->instr);
794
795 nir_ssa_def *new_base = first->intrin->src[info->base_src].ssa;
796 new_base = nir_iadd(b, new_base, nir_imm_int(b, -(high_start / 8u)));
797
798 nir_instr_rewrite_src(first->instr, &first->intrin->src[info->base_src],
799 nir_src_for_ssa(new_base));
800 }
801
802 /* update the deref */
803 if (info->deref_src >= 0) {
804 b->cursor = nir_before_instr(first->instr);
805
806 nir_deref_instr *deref = nir_src_as_deref(first->intrin->src[info->deref_src]);
807 if (first != low && high_start != 0)
808 deref = subtract_deref(b, deref, high_start / 8u);
809 first->deref = cast_deref(b, new_num_components, new_bit_size, deref);
810
811 nir_instr_rewrite_src(first->instr, &first->intrin->src[info->deref_src],
812 nir_src_for_ssa(&first->deref->dest.ssa));
813 }
814
815 /* update base/align */
816 bool has_base_index =
817 nir_intrinsic_infos[first->intrin->intrinsic].index_map[NIR_INTRINSIC_BASE];
818
819 if (first != low && has_base_index)
820 nir_intrinsic_set_base(first->intrin, nir_intrinsic_base(low->intrin));
821
822 first->key = low->key;
823 first->offset = low->offset;
824 first->best_align = get_best_align(low);
825
826 update_align(first);
827
828 nir_instr_remove(second->instr);
829 }
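/*
 * Rough sketch of the load case (illustrative pseudo-NIR, the real output
 * differs in details):
 *
 *    a = load_ssbo(res, 0)   ; 2x32
 *    b = load_ssbo(res, 8)   ; 2x32
 * becomes
 *    v = load_ssbo(res, 0)   ; 4x32
 *    a = bits [0, 64)   of v  (via nir_extract_bits)
 *    b = bits [64, 128) of v
 */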
830
831 static void
832 vectorize_stores(nir_builder *b, struct vectorize_ctx *ctx,
833 struct entry *low, struct entry *high,
834 struct entry *first, struct entry *second,
835 unsigned new_bit_size, unsigned new_num_components,
836 unsigned high_start)
837 {
838 ASSERTED unsigned low_size = low->intrin->num_components * get_bit_size(low);
839 assert(low_size % new_bit_size == 0);
840
841 b->cursor = nir_before_instr(second->instr);
842
843 /* get new writemasks */
844 uint32_t low_write_mask = nir_intrinsic_write_mask(low->intrin);
845 uint32_t high_write_mask = nir_intrinsic_write_mask(high->intrin);
846 low_write_mask = update_writemask(low_write_mask, get_bit_size(low), new_bit_size);
847 high_write_mask = update_writemask(high_write_mask, get_bit_size(high), new_bit_size);
848 high_write_mask <<= high_start / new_bit_size;
849
850 uint32_t write_mask = low_write_mask | high_write_mask;
851
852 /* convert booleans */
853 nir_ssa_def *low_val = low->intrin->src[low->info->value_src].ssa;
854 nir_ssa_def *high_val = high->intrin->src[high->info->value_src].ssa;
855 low_val = low_val->bit_size == 1 ? nir_b2i(b, low_val, 32) : low_val;
856 high_val = high_val->bit_size == 1 ? nir_b2i(b, high_val, 32) : high_val;
857
858 /* combine the data */
859 nir_ssa_def *data_channels[NIR_MAX_VEC_COMPONENTS];
860 for (unsigned i = 0; i < new_num_components; i++) {
861 bool set_low = low_write_mask & (1 << i);
862 bool set_high = high_write_mask & (1 << i);
863
864 if (set_low && (!set_high || low == second)) {
865 unsigned offset = i * new_bit_size;
866 data_channels[i] = nir_extract_bits(b, &low_val, 1, offset, 1, new_bit_size);
867 } else if (set_high) {
868 assert(!set_low || high == second);
869 unsigned offset = i * new_bit_size - high_start;
870 data_channels[i] = nir_extract_bits(b, &high_val, 1, offset, 1, new_bit_size);
871 } else {
872 data_channels[i] = nir_ssa_undef(b, 1, new_bit_size);
873 }
874 }
875 nir_ssa_def *data = nir_vec(b, data_channels, new_num_components);
876
877 /* update the intrinsic */
878 nir_intrinsic_set_write_mask(second->intrin, write_mask);
879 second->intrin->num_components = data->num_components;
880
881 const struct intrinsic_info *info = second->info;
882 assert(info->value_src >= 0);
883 nir_instr_rewrite_src(second->instr, &second->intrin->src[info->value_src],
884 nir_src_for_ssa(data));
885
886 /* update the offset */
887 if (second != low && info->base_src >= 0)
888 nir_instr_rewrite_src(second->instr, &second->intrin->src[info->base_src],
889 low->intrin->src[info->base_src]);
890
891 /* update the deref */
892 if (info->deref_src >= 0) {
893 b->cursor = nir_before_instr(second->instr);
894 second->deref = cast_deref(b, new_num_components, new_bit_size,
895 nir_src_as_deref(low->intrin->src[info->deref_src]));
896 nir_instr_rewrite_src(second->instr, &second->intrin->src[info->deref_src],
897 nir_src_for_ssa(&second->deref->dest.ssa));
898 }
899
900 /* update base/align */
901 bool has_base_index =
902 nir_intrinsic_infos[second->intrin->intrinsic].index_map[NIR_INTRINSIC_BASE];
903
904 if (second != low && has_base_index)
905 nir_intrinsic_set_base(second->intrin, nir_intrinsic_base(low->intrin));
906
907 second->key = low->key;
908 second->offset = low->offset;
909 second->best_align = get_best_align(low);
910
911 update_align(second);
912
913 list_del(&first->head);
914 nir_instr_remove(first->instr);
915 }
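/*
 * Rough sketch of the store case (illustrative): two 1x32 store_ssbo at
 * offsets 0 and 4 become a single 2x32 store with write_mask 0x3 at offset 0,
 * kept in place of the "second" store; the "first" store is removed and
 * channels neither store wrote stay undef but masked out.
 */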
916
917 /* Returns true if it can prove that "a" and "b" point to different resources. */
918 static bool
919 resources_different(nir_ssa_def *a, nir_ssa_def *b)
920 {
921 if (!a || !b)
922 return false;
923
924 if (a->parent_instr->type == nir_instr_type_load_const &&
925 b->parent_instr->type == nir_instr_type_load_const) {
926 return nir_src_as_uint(nir_src_for_ssa(a)) != nir_src_as_uint(nir_src_for_ssa(b));
927 }
928
929 if (a->parent_instr->type == nir_instr_type_intrinsic &&
930 b->parent_instr->type == nir_instr_type_intrinsic) {
931 nir_intrinsic_instr *aintrin = nir_instr_as_intrinsic(a->parent_instr);
932 nir_intrinsic_instr *bintrin = nir_instr_as_intrinsic(b->parent_instr);
933 if (aintrin->intrinsic == nir_intrinsic_vulkan_resource_index &&
934 bintrin->intrinsic == nir_intrinsic_vulkan_resource_index) {
935 return nir_intrinsic_desc_set(aintrin) != nir_intrinsic_desc_set(bintrin) ||
936 nir_intrinsic_binding(aintrin) != nir_intrinsic_binding(bintrin) ||
937 resources_different(aintrin->src[0].ssa, bintrin->src[0].ssa);
938 }
939 }
940
941 return false;
942 }
943
944 static int64_t
945 compare_entries(struct entry *a, struct entry *b)
946 {
947 if (!entry_key_equals(a->key, b->key))
948 return INT64_MAX;
949 return b->offset_signed - a->offset_signed;
950 }
951
952 static bool
953 may_alias(struct entry *a, struct entry *b)
954 {
955 assert(get_variable_mode(a) == get_variable_mode(b));
956
957 /* if the resources/variables are definitively different and both have
958 * ACCESS_RESTRICT, we can assume they do not alias. */
959 bool res_different = a->key->var != b->key->var ||
960 resources_different(a->key->resource, b->key->resource);
961 if (res_different && (a->access & ACCESS_RESTRICT) && (b->access & ACCESS_RESTRICT))
962 return false;
963
964 /* we can't compare offsets if the resources/variables might be different */
965 if (a->key->var != b->key->var || a->key->resource != b->key->resource)
966 return true;
967
968 /* use adjacency information */
969 /* TODO: we can look closer at the entry keys */
970 int64_t diff = compare_entries(a, b);
971 if (diff != INT64_MAX) {
972 /* with atomics, intrin->num_components can be 0 */
973 if (diff < 0)
974 return llabs(diff) < MAX2(b->intrin->num_components, 1u) * (get_bit_size(b) / 8u);
975 else
976 return diff < MAX2(a->intrin->num_components, 1u) * (get_bit_size(a) / 8u);
977 }
978
979 /* TODO: we can use deref information */
980
981 return true;
982 }
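/*
 * For example, with equal keys a 1x32 access at offset 0 and another access
 * at offset 4 don't overlap (the distance of 4 bytes is not smaller than the
 * 4 bytes accessed), while a 2x32 access at offset 0 would alias it.
 */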
983
984 static bool
985 check_for_aliasing(struct vectorize_ctx *ctx, struct entry *first, struct entry *second)
986 {
987 nir_variable_mode mode = get_variable_mode(first);
988 if (mode & (nir_var_uniform | nir_var_system_value |
989 nir_var_mem_push_const | nir_var_mem_ubo))
990 return false;
991
992 unsigned mode_index = ffs(mode) - 1;
993 if (first->is_store) {
994 /* find first entry that aliases "first" */
995 list_for_each_entry_from(struct entry, next, first, &ctx->entries[mode_index], head) {
996 if (next == first)
997 continue;
998 if (next == second)
999 return false;
1000 if (may_alias(first, next))
1001 return true;
1002 }
1003 } else {
1004 /* find previous store that aliases this load */
1005 list_for_each_entry_from_rev(struct entry, prev, second, &ctx->entries[mode_index], head) {
1006 if (prev == second)
1007 continue;
1008 if (prev == first)
1009 return false;
1010 if (prev->is_store && may_alias(second, prev))
1011 return true;
1012 }
1013 }
1014
1015 return false;
1016 }
1017
1018 static bool
1019 is_strided_vector(const struct glsl_type *type)
1020 {
1021 if (glsl_type_is_vector(type)) {
1022 return glsl_get_explicit_stride(type) !=
1023 type_scalar_size_bytes(glsl_get_array_element(type));
1024 } else {
1025 return false;
1026 }
1027 }
1028
1029 static bool
1030 try_vectorize(nir_function_impl *impl, struct vectorize_ctx *ctx,
1031 struct entry *low, struct entry *high,
1032 struct entry *first, struct entry *second)
1033 {
1034 if (check_for_aliasing(ctx, first, second))
1035 return false;
1036
1037 /* we can only vectorize non-volatile loads/stores of the same type and with
1038 * the same access */
1039 if (first->info != second->info || first->access != second->access ||
1040 (first->access & ACCESS_VOLATILE) || first->info->is_atomic)
1041 return false;
1042
1043 /* don't attempt to vectorize accesses of row-major matrix columns */
1044 if (first->deref) {
1045 const struct glsl_type *first_type = first->deref->type;
1046 const struct glsl_type *second_type = second->deref->type;
1047 if (is_strided_vector(first_type) || is_strided_vector(second_type))
1048 return false;
1049 }
1050
1051 /* gather information */
1052 uint64_t diff = high->offset_signed - low->offset_signed;
1053 unsigned low_bit_size = get_bit_size(low);
1054 unsigned high_bit_size = get_bit_size(high);
1055 unsigned low_size = low->intrin->num_components * low_bit_size;
1056 unsigned high_size = high->intrin->num_components * high_bit_size;
1057 unsigned new_size = MAX2(diff * 8u + high_size, low_size);
1058
1059 /* find a good bit size for the new load/store */
1060 unsigned new_bit_size = 0;
1061 if (new_bitsize_acceptable(ctx, low_bit_size, low, high, new_size)) {
1062 new_bit_size = low_bit_size;
1063 } else if (low_bit_size != high_bit_size &&
1064 new_bitsize_acceptable(ctx, high_bit_size, low, high, new_size)) {
1065 new_bit_size = high_bit_size;
1066 } else {
1067 new_bit_size = 64;
1068 for (; new_bit_size >= 8; new_bit_size /= 2) {
1069 /* don't retry bit sizes we have already tried */
1070 if (new_bit_size == low_bit_size || new_bit_size == high_bit_size)
1071 continue;
1072 if (new_bitsize_acceptable(ctx, new_bit_size, low, high, new_size))
1073 break;
1074 }
1075 if (new_bit_size < 8)
1076 return false;
1077 }
1078 unsigned new_num_components = new_size / new_bit_size;
1079
1080 /* vectorize the loads/stores */
1081 nir_builder b;
1082 nir_builder_init(&b, impl);
1083
1084 if (first->is_store)
1085 vectorize_stores(&b, ctx, low, high, first, second,
1086 new_bit_size, new_num_components, diff * 8u);
1087 else
1088 vectorize_loads(&b, ctx, low, high, first, second,
1089 new_bit_size, new_num_components, diff * 8u);
1090
1091 return true;
1092 }
1093
1094 static bool
1095 vectorize_entries(struct vectorize_ctx *ctx, nir_function_impl *impl, struct hash_table *ht)
1096 {
1097 if (!ht)
1098 return false;
1099
1100 bool progress = false;
1101 hash_table_foreach(ht, entry) {
1102 struct util_dynarray *arr = entry->data;
1103 if (!arr->size)
1104 continue;
1105
1106 qsort(util_dynarray_begin(arr),
1107 util_dynarray_num_elements(arr, struct entry *),
1108 sizeof(struct entry *), &sort_entries);
1109
1110 unsigned i = 0;
1111 for (; i < util_dynarray_num_elements(arr, struct entry*) - 1; i++) {
1112 struct entry *low = *util_dynarray_element(arr, struct entry *, i);
1113 struct entry *high = *util_dynarray_element(arr, struct entry *, i + 1);
1114
1115 uint64_t diff = high->offset_signed - low->offset_signed;
1116 if (diff > get_bit_size(low) / 8u * low->intrin->num_components) {
1117 progress |= update_align(low);
1118 continue;
1119 }
1120
1121 struct entry *first = low->index < high->index ? low : high;
1122 struct entry *second = low->index < high->index ? high : low;
1123
1124 if (try_vectorize(impl, ctx, low, high, first, second)) {
1125 *util_dynarray_element(arr, struct entry *, i) = NULL;
1126 *util_dynarray_element(arr, struct entry *, i + 1) = low->is_store ? second : first;
1127 progress = true;
1128 } else {
1129 progress |= update_align(low);
1130 }
1131 }
1132
1133 struct entry *last = *util_dynarray_element(arr, struct entry *, i);
1134 progress |= update_align(last);
1135 }
1136
1137 _mesa_hash_table_clear(ht, delete_entry_dynarray);
1138
1139 return progress;
1140 }
1141
1142 static bool
1143 handle_barrier(struct vectorize_ctx *ctx, bool *progress, nir_function_impl *impl, nir_instr *instr)
1144 {
1145 unsigned modes = 0;
1146 bool acquire = true;
1147 bool release = true;
1148 if (instr->type == nir_instr_type_intrinsic) {
1149 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
1150 switch (intrin->intrinsic) {
1151 case nir_intrinsic_group_memory_barrier:
1152 case nir_intrinsic_memory_barrier:
1153 modes = nir_var_mem_ssbo | nir_var_mem_shared | nir_var_mem_global;
1154 break;
1155 /* prevent speculative loads/stores */
1156 case nir_intrinsic_discard_if:
1157 case nir_intrinsic_discard:
1158 modes = nir_var_all;
1159 break;
1160 case nir_intrinsic_memory_barrier_buffer:
1161 modes = nir_var_mem_ssbo | nir_var_mem_global;
1162 break;
1163 case nir_intrinsic_memory_barrier_shared:
1164 modes = nir_var_mem_shared;
1165 break;
1166 case nir_intrinsic_scoped_memory_barrier:
1167 modes = nir_intrinsic_memory_modes(intrin);
1168 acquire = nir_intrinsic_memory_semantics(intrin) & NIR_MEMORY_ACQUIRE;
1169 release = nir_intrinsic_memory_semantics(intrin) & NIR_MEMORY_RELEASE;
1170 switch (nir_intrinsic_memory_scope(intrin)) {
1171 case NIR_SCOPE_INVOCATION:
1172 case NIR_SCOPE_SUBGROUP:
1173 /* a barrier should never be required for correctness with these scopes */
1174 modes = 0;
1175 break;
1176 default:
1177 break;
1178 }
1179 break;
1180 default:
1181 return false;
1182 }
1183 } else if (instr->type == nir_instr_type_call) {
1184 modes = nir_var_all;
1185 } else {
1186 return false;
1187 }
1188
1189 while (modes) {
1190 unsigned mode_index = u_bit_scan(&modes);
1191
1192 if (acquire)
1193 *progress |= vectorize_entries(ctx, impl, ctx->loads[mode_index]);
1194 if (release)
1195 *progress |= vectorize_entries(ctx, impl, ctx->stores[mode_index]);
1196 }
1197
1198 return true;
1199 }
1200
1201 static bool
1202 process_block(nir_function_impl *impl, struct vectorize_ctx *ctx, nir_block *block)
1203 {
1204 bool progress = false;
1205
1206 for (unsigned i = 0; i < nir_num_variable_modes; i++) {
1207 list_inithead(&ctx->entries[i]);
1208 if (ctx->loads[i])
1209 _mesa_hash_table_clear(ctx->loads[i], delete_entry_dynarray);
1210 if (ctx->stores[i])
1211 _mesa_hash_table_clear(ctx->stores[i], delete_entry_dynarray);
1212 }
1213
1214 /* create entries */
1215 unsigned next_index = 0;
1216
1217 nir_foreach_instr_safe(instr, block) {
1218 if (handle_barrier(ctx, &progress, impl, instr))
1219 continue;
1220
1221 /* gather information */
1222 if (instr->type != nir_instr_type_intrinsic)
1223 continue;
1224 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
1225
1226 const struct intrinsic_info *info = get_info(intrin->intrinsic);
1227 if (!info)
1228 continue;
1229
1230 nir_variable_mode mode = info->mode;
1231 if (!mode)
1232 mode = nir_src_as_deref(intrin->src[info->deref_src])->mode;
1233 if (!(mode & ctx->modes))
1234 continue;
1235 unsigned mode_index = ffs(mode) - 1;
1236
1237 /* create entry */
1238 struct entry *entry = create_entry(ctx, info, intrin);
1239 entry->index = next_index++;
1240
1241 list_addtail(&entry->head, &ctx->entries[mode_index]);
1242
1243 /* add the entry to a hash table */
1244
1245 struct hash_table *adj_ht = NULL;
1246 if (entry->is_store) {
1247 if (!ctx->stores[mode_index])
1248 ctx->stores[mode_index] = _mesa_hash_table_create(ctx, &hash_entry_key, &entry_key_equals);
1249 adj_ht = ctx->stores[mode_index];
1250 } else {
1251 if (!ctx->loads[mode_index])
1252 ctx->loads[mode_index] = _mesa_hash_table_create(ctx, &hash_entry_key, &entry_key_equals);
1253 adj_ht = ctx->loads[mode_index];
1254 }
1255
1256 uint32_t key_hash = hash_entry_key(entry->key);
1257 struct hash_entry *adj_entry = _mesa_hash_table_search_pre_hashed(adj_ht, key_hash, entry->key);
1258 struct util_dynarray *arr;
1259 if (adj_entry && adj_entry->data) {
1260 arr = (struct util_dynarray *)adj_entry->data;
1261 } else {
1262 arr = ralloc(ctx, struct util_dynarray);
1263 util_dynarray_init(arr, arr);
1264 _mesa_hash_table_insert_pre_hashed(adj_ht, key_hash, entry->key, arr);
1265 }
1266 util_dynarray_append(arr, struct entry *, entry);
1267 }
1268
1269 /* sort and combine entries */
1270 for (unsigned i = 0; i < nir_num_variable_modes; i++) {
1271 progress |= vectorize_entries(ctx, impl, ctx->loads[i]);
1272 progress |= vectorize_entries(ctx, impl, ctx->stores[i]);
1273 }
1274
1275 return progress;
1276 }
1277
1278 bool
1279 nir_opt_load_store_vectorize(nir_shader *shader, nir_variable_mode modes,
1280 nir_should_vectorize_mem_func callback)
1281 {
1282 bool progress = false;
1283
1284 struct vectorize_ctx *ctx = rzalloc(NULL, struct vectorize_ctx);
1285 ctx->modes = modes;
1286 ctx->callback = callback;
1287
1288 nir_index_vars(shader, NULL, modes);
1289
1290 nir_foreach_function(function, shader) {
1291 if (function->impl) {
1292 if (modes & nir_var_function_temp)
1293 nir_index_vars(shader, function->impl, nir_var_function_temp);
1294
1295 nir_foreach_block(block, function->impl)
1296 progress |= process_block(function->impl, ctx, block);
1297
1298 nir_metadata_preserve(function->impl,
1299 nir_metadata_block_index |
1300 nir_metadata_dominance |
1301 nir_metadata_live_ssa_defs);
1302 }
1303 }
1304
1305 ralloc_free(ctx);
1306 return progress;
1307 }