src/compiler/nir/nir_opt_load_store_vectorize.c

   1 /*
   2  * Copyright © 2019 Valve Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 /**
  25  * Although it's called a load/store "vectorization" pass, this also combines
 * intersecting and identical loads/stores. It currently supports derefs, ubo,
 * ssbo, push constant, shared and global loads/stores.
  28  *
  29  * This doesn't handle copy_deref intrinsics and assumes that
  30  * nir_lower_alu_to_scalar() has been called and that the IR is free from ALU
  31  * modifiers. It also assumes that derefs have explicitly laid out types.
  32  *
  33  * After vectorization, the backend may want to call nir_lower_alu_to_scalar()
  34  * and nir_lower_pack(). Also this creates cast instructions taking derefs as a
  35  * source and some parts of NIR may not be able to handle that well.
  36  *
  37  * There are a few situations where this doesn't vectorize as well as it could:
  38  * - It won't turn four consecutive vec3 loads into 3 vec4 loads.
  39  * - It doesn't do global vectorization.
  40  * Handling these cases probably wouldn't provide much benefit though.
  41  *
  42  * This probably doesn't handle big-endian GPUs correctly.
  43 */
  44
  45 #include "nir.h"
  46 #include "nir_deref.h"
  47 #include "nir_builder.h"
  48 #include "nir_worklist.h"
  49 #include "util/u_dynarray.h"
  50
  51 #include <stdlib.h>
  52
/* Static description of one memory intrinsic handled by this pass. The
 * fields are filled positionally by the INFO() macro in get_info(), so their
 * order must not change. */
struct intrinsic_info {
   nir_variable_mode mode; /* 0 if the mode is obtained from the deref. */
   nir_intrinsic_op op;
   bool is_atomic;
   /* Indices into nir_intrinsic::src[] or -1 if not applicable. */
   int resource_src; /* resource (e.g. from vulkan_resource_index) */
   int base_src; /* offset which it loads/stores from */
   int deref_src; /* deref which it loads/stores from */
   int value_src; /* the data it is storing */
};
  63
/* Returns the static intrinsic_info describing "op", or NULL if the intrinsic
 * is not one of the loads, stores or atomics listed below. */
static const struct intrinsic_info *
get_info(nir_intrinsic_op op) {
   switch (op) {
/* INFO() expands to a case label that returns a function-local static table
 * entry. Its arguments map, in order, onto the fields of
 * struct intrinsic_info: mode, op, is_atomic, resource_src, base_src,
 * deref_src and value_src (source indices, -1 when not applicable). */
#define INFO(mode, op, atomic, res, base, deref, val) \
case nir_intrinsic_##op: {\
   static const struct intrinsic_info op##_info = {mode, nir_intrinsic_##op, atomic, res, base, deref, val};\
   return &op##_info;\
}
/* Convenience wrappers: loads have no value source, atomics set is_atomic. */
#define LOAD(mode, op, res, base, deref) INFO(mode, load_##op, false, res, base, deref, -1)
#define STORE(mode, op, res, base, deref, val) INFO(mode, store_##op, false, res, base, deref, val)
#define ATOMIC(mode, type, op, res, base, deref, val) INFO(mode, type##_atomic_##op, true, res, base, deref, val)
   /* plain loads and stores */
   LOAD(nir_var_mem_push_const, push_constant, -1, 0, -1)
   LOAD(nir_var_mem_ubo, ubo, 0, 1, -1)
   LOAD(nir_var_mem_ssbo, ssbo, 0, 1, -1)
   STORE(nir_var_mem_ssbo, ssbo, 1, 2, -1, 0)
   LOAD(0, deref, -1, -1, 0)
   STORE(0, deref, -1, -1, 0, 1)
   LOAD(nir_var_mem_shared, shared, -1, 0, -1)
   STORE(nir_var_mem_shared, shared, -1, 1, -1, 0)
   LOAD(nir_var_mem_global, global, -1, 0, -1)
   STORE(nir_var_mem_global, global, -1, 1, -1, 0)
   /* ssbo atomics */
   ATOMIC(nir_var_mem_ssbo, ssbo, add, 0, 1, -1, 2)
   ATOMIC(nir_var_mem_ssbo, ssbo, imin, 0, 1, -1, 2)
   ATOMIC(nir_var_mem_ssbo, ssbo, umin, 0, 1, -1, 2)
   ATOMIC(nir_var_mem_ssbo, ssbo, imax, 0, 1, -1, 2)
   ATOMIC(nir_var_mem_ssbo, ssbo, umax, 0, 1, -1, 2)
   ATOMIC(nir_var_mem_ssbo, ssbo, and, 0, 1, -1, 2)
   ATOMIC(nir_var_mem_ssbo, ssbo, or, 0, 1, -1, 2)
   ATOMIC(nir_var_mem_ssbo, ssbo, xor, 0, 1, -1, 2)
   ATOMIC(nir_var_mem_ssbo, ssbo, exchange, 0, 1, -1, 2)
   ATOMIC(nir_var_mem_ssbo, ssbo, comp_swap, 0, 1, -1, 2)
   ATOMIC(nir_var_mem_ssbo, ssbo, fadd, 0, 1, -1, 2)
   ATOMIC(nir_var_mem_ssbo, ssbo, fmin, 0, 1, -1, 2)
   ATOMIC(nir_var_mem_ssbo, ssbo, fmax, 0, 1, -1, 2)
   ATOMIC(nir_var_mem_ssbo, ssbo, fcomp_swap, 0, 1, -1, 2)
   /* deref atomics (mode comes from the deref) */
   ATOMIC(0, deref, add, -1, -1, 0, 1)
   ATOMIC(0, deref, imin, -1, -1, 0, 1)
   ATOMIC(0, deref, umin, -1, -1, 0, 1)
   ATOMIC(0, deref, imax, -1, -1, 0, 1)
   ATOMIC(0, deref, umax, -1, -1, 0, 1)
   ATOMIC(0, deref, and, -1, -1, 0, 1)
   ATOMIC(0, deref, or, -1, -1, 0, 1)
   ATOMIC(0, deref, xor, -1, -1, 0, 1)
   ATOMIC(0, deref, exchange, -1, -1, 0, 1)
   ATOMIC(0, deref, comp_swap, -1, -1, 0, 1)
   ATOMIC(0, deref, fadd, -1, -1, 0, 1)
   ATOMIC(0, deref, fmin, -1, -1, 0, 1)
   ATOMIC(0, deref, fmax, -1, -1, 0, 1)
   ATOMIC(0, deref, fcomp_swap, -1, -1, 0, 1)
   /* shared memory atomics */
   ATOMIC(nir_var_mem_shared, shared, add, -1, 0, -1, 1)
   ATOMIC(nir_var_mem_shared, shared, imin, -1, 0, -1, 1)
   ATOMIC(nir_var_mem_shared, shared, umin, -1, 0, -1, 1)
   ATOMIC(nir_var_mem_shared, shared, imax, -1, 0, -1, 1)
   ATOMIC(nir_var_mem_shared, shared, umax, -1, 0, -1, 1)
   ATOMIC(nir_var_mem_shared, shared, and, -1, 0, -1, 1)
   ATOMIC(nir_var_mem_shared, shared, or, -1, 0, -1, 1)
   ATOMIC(nir_var_mem_shared, shared, xor, -1, 0, -1, 1)
   ATOMIC(nir_var_mem_shared, shared, exchange, -1, 0, -1, 1)
   ATOMIC(nir_var_mem_shared, shared, comp_swap, -1, 0, -1, 1)
   ATOMIC(nir_var_mem_shared, shared, fadd, -1, 0, -1, 1)
   ATOMIC(nir_var_mem_shared, shared, fmin, -1, 0, -1, 1)
   ATOMIC(nir_var_mem_shared, shared, fmax, -1, 0, -1, 1)
   ATOMIC(nir_var_mem_shared, shared, fcomp_swap, -1, 0, -1, 1)
   /* global memory atomics */
   ATOMIC(nir_var_mem_global, global, add, -1, 0, -1, 1)
   ATOMIC(nir_var_mem_global, global, imin, -1, 0, -1, 1)
   ATOMIC(nir_var_mem_global, global, umin, -1, 0, -1, 1)
   ATOMIC(nir_var_mem_global, global, imax, -1, 0, -1, 1)
   ATOMIC(nir_var_mem_global, global, umax, -1, 0, -1, 1)
   ATOMIC(nir_var_mem_global, global, and, -1, 0, -1, 1)
   ATOMIC(nir_var_mem_global, global, or, -1, 0, -1, 1)
   ATOMIC(nir_var_mem_global, global, xor, -1, 0, -1, 1)
   ATOMIC(nir_var_mem_global, global, exchange, -1, 0, -1, 1)
   ATOMIC(nir_var_mem_global, global, comp_swap, -1, 0, -1, 1)
   ATOMIC(nir_var_mem_global, global, fadd, -1, 0, -1, 1)
   ATOMIC(nir_var_mem_global, global, fmin, -1, 0, -1, 1)
   ATOMIC(nir_var_mem_global, global, fmax, -1, 0, -1, 1)
   ATOMIC(nir_var_mem_global, global, fcomp_swap, -1, 0, -1, 1)
   default:
      break;
#undef ATOMIC
#undef STORE
#undef LOAD
#undef INFO
   }
   /* not an intrinsic this pass knows about */
   return NULL;
}
 150
/*
 * Information used to compare memory operations.
 * It canonically represents an offset as:
 * `offset_defs[0]*offset_defs_mul[0] + offset_defs[1]*offset_defs_mul[1] + ...`
 * "offset_defs" is kept sorted by the ssa definition's index so that equal
 * expressions produce identical arrays; add_to_entry_key() places definitions
 * with larger indices first and merges duplicates by summing their multipliers.
 * "resource" or "var" may be NULL.
 */
struct entry_key {
   nir_ssa_def *resource; /* resource source or parent of a root cast deref */
   nir_variable *var; /* variable at the root of the deref path */
   unsigned offset_def_count; /* length of the two arrays below */
   nir_ssa_def **offset_defs;
   uint64_t *offset_defs_mul;
};
 165
/* Information on a single memory operation. */
struct entry {
   struct list_head head; /* list link; presumably chains into vectorize_ctx::entries - confirm */
   unsigned index; /* not written by create_entry(); NOTE(review): set elsewhere - confirm meaning */

   struct entry_key *key; /* non-constant part of the address */
   /* constant part of the address, viewable as unsigned or signed */
   union {
      uint64_t offset; /* sign-extended */
      int64_t offset_signed;
   };
   uint32_t best_align; /* cache for get_best_align(); UINT32_MAX means "not computed yet" */

   nir_instr *instr; /* &intrin->instr */
   nir_intrinsic_instr *intrin;
   const struct intrinsic_info *info;
   enum gl_access_qualifier access; /* from the intrinsic or the variable, plus ACCESS_RESTRICT for some modes */
   bool is_store; /* true when info->value_src >= 0 */

   nir_deref_instr *deref; /* only set when info->deref_src >= 0 */
};
 186
/* State of one run of the pass. It is also used as the ralloc parent of the
 * entries created by create_entry(). */
struct vectorize_ctx {
   nir_variable_mode modes; /* NOTE(review): presumably the modes the caller asked to vectorize - confirm */
   nir_should_vectorize_mem_func callback; /* consulted by new_bitsize_acceptable() before combining two accesses */
   nir_variable_mode robust_modes; /* NOTE(review): use is outside this chunk - confirm */
   /* one slot per variable mode; presumably indexed with mode_to_index() - confirm */
   struct list_head entries[nir_num_variable_modes];
   struct hash_table *loads[nir_num_variable_modes];
   struct hash_table *stores[nir_num_variable_modes];
};
 195
 196 static uint32_t hash_entry_key(const void *key_)
 197 {
 198    /* this is careful to not include pointers in the hash calculation so that
 199     * the order of the hash table walk is deterministic */
 200    struct entry_key *key = (struct entry_key*)key_;
 201
 202    uint32_t hash = 0;
 203    if (key->resource)
 204       hash = XXH32(&key->resource->index, sizeof(key->resource->index), hash);
 205    if (key->var) {
 206       hash = XXH32(&key->var->index, sizeof(key->var->index), hash);
 207       unsigned mode = key->var->data.mode;
 208       hash = XXH32(&mode, sizeof(mode), hash);
 209    }
 210
 211    for (unsigned i = 0; i < key->offset_def_count; i++)
 212       hash = XXH32(&key->offset_defs[i]->index, sizeof(key->offset_defs[i]->index), hash);
 213
 214    hash = XXH32(key->offset_defs_mul, key->offset_def_count * sizeof(uint64_t), hash);
 215
 216    return hash;
 217 }
 218
 219 static bool entry_key_equals(const void *a_, const void *b_)
 220 {
 221    struct entry_key *a = (struct entry_key*)a_;
 222    struct entry_key *b = (struct entry_key*)b_;
 223
 224    if (a->var != b->var || a->resource != b->resource)
 225       return false;
 226
 227    if (a->offset_def_count != b->offset_def_count)
 228       return false;
 229
 230    size_t offset_def_size = a->offset_def_count * sizeof(nir_ssa_def *);
 231    size_t offset_def_mul_size = a->offset_def_count * sizeof(uint64_t);
 232    if (a->offset_def_count &&
 233        (memcmp(a->offset_defs, b->offset_defs, offset_def_size) ||
 234         memcmp(a->offset_defs_mul, b->offset_defs_mul, offset_def_mul_size)))
 235       return false;
 236
 237    return true;
 238 }
 239
 240 static void delete_entry_dynarray(struct hash_entry *entry)
 241 {
 242    struct util_dynarray *arr = (struct util_dynarray *)entry->data;
 243    ralloc_free(arr);
 244 }
 245
 246 static int sort_entries(const void *a_, const void *b_)
 247 {
 248    struct entry *a = *(struct entry*const*)a_;
 249    struct entry *b = *(struct entry*const*)b_;
 250
 251    if (a->offset_signed > b->offset_signed)
 252       return 1;
 253    else if (a->offset_signed < b->offset_signed)
 254       return -1;
 255    else
 256       return 0;
 257 }
 258
 259 static unsigned
 260 get_bit_size(struct entry *entry)
 261 {
 262    unsigned size = entry->is_store ?
 263                    entry->intrin->src[entry->info->value_src].ssa->bit_size :
 264                    entry->intrin->dest.ssa.bit_size;
 265    return size == 1 ? 32u : size;
 266 }
 267
 268 /* If "def" is from an alu instruction with the opcode "op" and one of it's
 269  * sources is a constant, update "def" to be the non-constant source, fill "c"
 270  * with the constant and return true. */
 271 static bool
 272 parse_alu(nir_ssa_def **def, nir_op op, uint64_t *c)
 273 {
 274    nir_ssa_scalar scalar;
 275    scalar.def = *def;
 276    scalar.comp = 0;
 277
 278    if (!nir_ssa_scalar_is_alu(scalar) || nir_ssa_scalar_alu_op(scalar) != op)
 279       return false;
 280
 281    nir_ssa_scalar src0 = nir_ssa_scalar_chase_alu_src(scalar, 0);
 282    nir_ssa_scalar src1 = nir_ssa_scalar_chase_alu_src(scalar, 1);
 283    if (op != nir_op_ishl && nir_ssa_scalar_is_const(src0) && src1.comp == 0) {
 284       *c = nir_ssa_scalar_as_uint(src0);
 285       *def = src1.def;
 286    } else if (nir_ssa_scalar_is_const(src1) && src0.comp == 0) {
 287       *c = nir_ssa_scalar_as_uint(src1);
 288       *def = src0.def;
 289    } else {
 290       return false;
 291    }
 292    return true;
 293 }
 294
 295 /* Parses an offset expression such as "a * 16 + 4" and "(a * 16 + 4) * 64 + 32". */
 296 static void
 297 parse_offset(nir_ssa_def **base, uint64_t *base_mul, uint64_t *offset)
 298 {
 299    if ((*base)->parent_instr->type == nir_instr_type_load_const) {
 300       *offset = nir_src_comp_as_uint(nir_src_for_ssa(*base), 0);
 301       *base = NULL;
 302       return;
 303    }
 304
 305    uint64_t mul = 1;
 306    uint64_t add = 0;
 307    bool progress = false;
 308    do {
 309       uint64_t mul2 = 1, add2 = 0;
 310
 311       progress = parse_alu(base, nir_op_imul, &mul2);
 312       mul *= mul2;
 313
 314       mul2 = 0;
 315       progress |= parse_alu(base, nir_op_ishl, &mul2);
 316       mul <<= mul2;
 317
 318       progress |= parse_alu(base, nir_op_iadd, &add2);
 319       add += add2 * mul;
 320    } while (progress);
 321
 322    *base_mul = mul;
 323    *offset = add;
 324 }
 325
 326 static unsigned
 327 type_scalar_size_bytes(const struct glsl_type *type)
 328 {
 329    assert(glsl_type_is_vector_or_scalar(type) ||
 330           glsl_type_is_matrix(type));
 331    return glsl_type_is_boolean(type) ? 4u : glsl_get_bit_size(type) / 8u;
 332 }
 333
 334 static int
 335 get_array_stride(const struct glsl_type *type)
 336 {
 337    unsigned explicit_stride = glsl_get_explicit_stride(type);
 338    if ((glsl_type_is_matrix(type) &&
 339         glsl_matrix_type_is_row_major(type)) ||
 340        (glsl_type_is_vector(type) && explicit_stride == 0))
 341       return type_scalar_size_bytes(type);
 342    return explicit_stride;
 343 }
 344
 345 static uint64_t
 346 mask_sign_extend(uint64_t val, unsigned bit_size)
 347 {
 348    return (int64_t)(val << (64 - bit_size)) >> (64 - bit_size);
 349 }
 350
 351 static unsigned
 352 add_to_entry_key(nir_ssa_def **offset_defs, uint64_t *offset_defs_mul,
 353                  unsigned offset_def_count, nir_ssa_def *def, uint64_t mul)
 354 {
 355    mul = mask_sign_extend(mul, def->bit_size);
 356
 357    for (unsigned i = 0; i <= offset_def_count; i++) {
 358       if (i == offset_def_count || def->index > offset_defs[i]->index) {
 359          /* insert before i */
 360          memmove(offset_defs + i + 1, offset_defs + i,
 361                  (offset_def_count - i) * sizeof(nir_ssa_def *));
 362          memmove(offset_defs_mul + i + 1, offset_defs_mul + i,
 363                  (offset_def_count - i) * sizeof(uint64_t));
 364          offset_defs[i] = def;
 365          offset_defs_mul[i] = mul;
 366          return 1;
 367       } else if (def->index == offset_defs[i]->index) {
 368          /* merge with offset_def at i */
 369          offset_defs_mul[i] += mul;
 370          return 0;
 371       }
 372    }
 373    unreachable("Unreachable.");
 374    return 0;
 375 }
 376
 377 static struct entry_key *
 378 create_entry_key_from_deref(void *mem_ctx,
 379                             struct vectorize_ctx *ctx,
 380                             nir_deref_path *path,
 381                             uint64_t *offset_base)
 382 {
 383    unsigned path_len = 0;
 384    while (path->path[path_len])
 385       path_len++;
 386
 387    nir_ssa_def *offset_defs_stack[32];
 388    uint64_t offset_defs_mul_stack[32];
 389    nir_ssa_def **offset_defs = offset_defs_stack;
 390    uint64_t *offset_defs_mul = offset_defs_mul_stack;
 391    if (path_len > 32) {
 392       offset_defs = malloc(path_len * sizeof(nir_ssa_def *));
 393       offset_defs_mul = malloc(path_len * sizeof(uint64_t));
 394    }
 395    unsigned offset_def_count = 0;
 396
 397    struct entry_key* key = ralloc(mem_ctx, struct entry_key);
 398    key->resource = NULL;
 399    key->var = NULL;
 400    *offset_base = 0;
 401
 402    for (unsigned i = 0; i < path_len; i++) {
 403       nir_deref_instr *parent = i ? path->path[i - 1] : NULL;
 404       nir_deref_instr *deref = path->path[i];
 405
 406       switch (deref->deref_type) {
 407       case nir_deref_type_var: {
 408          assert(!parent);
 409          key->var = deref->var;
 410          break;
 411       }
 412       case nir_deref_type_array:
 413       case nir_deref_type_ptr_as_array: {
 414          assert(parent);
 415          nir_ssa_def *index = deref->arr.index.ssa;
 416          uint32_t stride;
 417          if (deref->deref_type == nir_deref_type_ptr_as_array)
 418             stride = nir_deref_instr_ptr_as_array_stride(deref);
 419          else
 420             stride = get_array_stride(parent->type);
 421
 422          nir_ssa_def *base = index;
 423          uint64_t offset = 0, base_mul = 1;
 424          parse_offset(&base, &base_mul, &offset);
 425          offset = mask_sign_extend(offset, index->bit_size);
 426
 427          *offset_base += offset * stride;
 428          if (base) {
 429             offset_def_count += add_to_entry_key(offset_defs, offset_defs_mul,
 430                                                  offset_def_count,
 431                                                  base, base_mul * stride);
 432          }
 433          break;
 434       }
 435       case nir_deref_type_struct: {
 436          assert(parent);
 437          int offset = glsl_get_struct_field_offset(parent->type, deref->strct.index);
 438          *offset_base += offset;
 439          break;
 440       }
 441       case nir_deref_type_cast: {
 442          if (!parent)
 443             key->resource = deref->parent.ssa;
 444          break;
 445       }
 446       default:
 447          unreachable("Unhandled deref type");
 448       }
 449    }
 450
 451    key->offset_def_count = offset_def_count;
 452    key->offset_defs = ralloc_array(mem_ctx, nir_ssa_def *, offset_def_count);
 453    key->offset_defs_mul = ralloc_array(mem_ctx, uint64_t, offset_def_count);
 454    memcpy(key->offset_defs, offset_defs, offset_def_count * sizeof(nir_ssa_def *));
 455    memcpy(key->offset_defs_mul, offset_defs_mul, offset_def_count * sizeof(uint64_t));
 456
 457    if (offset_defs != offset_defs_stack)
 458       free(offset_defs);
 459    if (offset_defs_mul != offset_defs_mul_stack)
 460       free(offset_defs_mul);
 461
 462    return key;
 463 }
 464
 465 static unsigned
 466 parse_entry_key_from_offset(struct entry_key *key, unsigned size, unsigned left,
 467                             nir_ssa_def *base, uint64_t base_mul, uint64_t *offset)
 468 {
 469    uint64_t new_mul;
 470    uint64_t new_offset;
 471    parse_offset(&base, &new_mul, &new_offset);
 472    *offset += new_offset * base_mul;
 473
 474    if (!base)
 475       return 0;
 476
 477    base_mul *= new_mul;
 478
 479    assert(left >= 1);
 480
 481    if (left >= 2) {
 482       nir_ssa_scalar scalar;
 483       scalar.def = base;
 484       scalar.comp = 0;
 485       if (nir_ssa_scalar_is_alu(scalar) && nir_ssa_scalar_alu_op(scalar) == nir_op_iadd) {
 486          nir_ssa_scalar src0 = nir_ssa_scalar_chase_alu_src(scalar, 0);
 487          nir_ssa_scalar src1 = nir_ssa_scalar_chase_alu_src(scalar, 1);
 488          if (src0.comp == 0 && src1.comp == 0) {
 489             unsigned amount = parse_entry_key_from_offset(key, size, left - 1, src0.def, base_mul, offset);
 490             amount += parse_entry_key_from_offset(key, size + amount, left - amount, src1.def, base_mul, offset);
 491             return amount;
 492          }
 493       }
 494    }
 495
 496    return add_to_entry_key(key->offset_defs, key->offset_defs_mul, size, base, base_mul);
 497 }
 498
 499 static struct entry_key *
 500 create_entry_key_from_offset(void *mem_ctx, nir_ssa_def *base, uint64_t base_mul, uint64_t *offset)
 501 {
 502    struct entry_key *key = ralloc(mem_ctx, struct entry_key);
 503    key->resource = NULL;
 504    key->var = NULL;
 505    if (base) {
 506       nir_ssa_def *offset_defs[32];
 507       uint64_t offset_defs_mul[32];
 508       key->offset_defs = offset_defs;
 509       key->offset_defs_mul = offset_defs_mul;
 510
 511       key->offset_def_count = parse_entry_key_from_offset(key, 0, 32, base, base_mul, offset);
 512
 513       key->offset_defs = ralloc_array(mem_ctx, nir_ssa_def *, key->offset_def_count);
 514       key->offset_defs_mul = ralloc_array(mem_ctx, uint64_t, key->offset_def_count);
 515       memcpy(key->offset_defs, offset_defs, key->offset_def_count * sizeof(nir_ssa_def *));
 516       memcpy(key->offset_defs_mul, offset_defs_mul, key->offset_def_count * sizeof(uint64_t));
 517    } else {
 518       key->offset_def_count = 0;
 519       key->offset_defs = NULL;
 520       key->offset_defs_mul = NULL;
 521    }
 522    return key;
 523 }
 524
 525 static nir_variable_mode
 526 get_variable_mode(struct entry *entry)
 527 {
 528    if (entry->info->mode)
 529       return entry->info->mode;
 530    assert(entry->deref);
 531    return entry->deref->mode;
 532 }
 533
 534 static unsigned
 535 mode_to_index(nir_variable_mode mode)
 536 {
 537    assert(util_bitcount(mode) == 1);
 538
 539    /* Globals and SSBOs should be tracked together */
 540    if (mode == nir_var_mem_global)
 541       mode = nir_var_mem_ssbo;
 542
 543    return ffs(mode) - 1;
 544 }
 545
 546 static nir_variable_mode
 547 aliasing_modes(nir_variable_mode modes)
 548 {
 549    /* Global and SSBO can alias */
 550    if (modes & (nir_var_mem_ssbo | nir_var_mem_global))
 551       modes |= nir_var_mem_ssbo | nir_var_mem_global;
 552    return modes;
 553 }
 554
 555 static struct entry *
 556 create_entry(struct vectorize_ctx *ctx,
 557              const struct intrinsic_info *info,
 558              nir_intrinsic_instr *intrin)
 559 {
 560    struct entry *entry = rzalloc(ctx, struct entry);
 561    entry->intrin = intrin;
 562    entry->instr = &intrin->instr;
 563    entry->info = info;
 564    entry->best_align = UINT32_MAX;
 565    entry->is_store = entry->info->value_src >= 0;
 566
 567    if (entry->info->deref_src >= 0) {
 568       entry->deref = nir_src_as_deref(intrin->src[entry->info->deref_src]);
 569       nir_deref_path path;
 570       nir_deref_path_init(&path, entry->deref, NULL);
 571       entry->key = create_entry_key_from_deref(entry, ctx, &path, &entry->offset);
 572       nir_deref_path_finish(&path);
 573    } else {
 574       nir_ssa_def *base = entry->info->base_src >= 0 ?
 575                           intrin->src[entry->info->base_src].ssa : NULL;
 576       uint64_t offset = 0;
 577       if (nir_intrinsic_infos[intrin->intrinsic].index_map[NIR_INTRINSIC_BASE])
 578          offset += nir_intrinsic_base(intrin);
 579       entry->key = create_entry_key_from_offset(entry, base, 1, &offset);
 580       entry->offset = offset;
 581
 582       if (base)
 583          entry->offset = mask_sign_extend(entry->offset, base->bit_size);
 584    }
 585
 586    if (entry->info->resource_src >= 0)
 587       entry->key->resource = intrin->src[entry->info->resource_src].ssa;
 588
 589    if (nir_intrinsic_infos[intrin->intrinsic].index_map[NIR_INTRINSIC_ACCESS])
 590       entry->access = nir_intrinsic_access(intrin);
 591    else if (entry->key->var)
 592       entry->access = entry->key->var->data.access;
 593
 594    uint32_t restrict_modes = nir_var_shader_in | nir_var_shader_out;
 595    restrict_modes |= nir_var_shader_temp | nir_var_function_temp;
 596    restrict_modes |= nir_var_uniform | nir_var_mem_push_const;
 597    restrict_modes |= nir_var_system_value | nir_var_mem_shared;
 598    if (get_variable_mode(entry) & restrict_modes)
 599       entry->access |= ACCESS_RESTRICT;
 600
 601    return entry;
 602 }
 603
 604 static nir_deref_instr *
 605 cast_deref(nir_builder *b, unsigned num_components, unsigned bit_size, nir_deref_instr *deref)
 606 {
 607    if (glsl_get_components(deref->type) == num_components &&
 608        type_scalar_size_bytes(deref->type)*8u == bit_size)
 609       return deref;
 610
 611    enum glsl_base_type types[] = {
 612       GLSL_TYPE_UINT8, GLSL_TYPE_UINT16, GLSL_TYPE_UINT, GLSL_TYPE_UINT64};
 613    enum glsl_base_type base = types[ffs(bit_size / 8u) - 1u];
 614    const struct glsl_type *type = glsl_vector_type(base, num_components);
 615
 616    if (deref->type == type)
 617       return deref;
 618
 619    return nir_build_deref_cast(b, &deref->dest.ssa, deref->mode, type, 0);
 620 }
 621
 622 /* Return true if the write mask "write_mask" of a store with "old_bit_size"
 623  * bits per element can be represented for a store with "new_bit_size" bits per
 624  * element. */
 625 static bool
 626 writemask_representable(unsigned write_mask, unsigned old_bit_size, unsigned new_bit_size)
 627 {
 628    while (write_mask) {
 629       int start, count;
 630       u_bit_scan_consecutive_range(&write_mask, &start, &count);
 631       start *= old_bit_size;
 632       count *= old_bit_size;
 633       if (start % new_bit_size != 0)
 634          return false;
 635       if (count % new_bit_size != 0)
 636          return false;
 637    }
 638    return true;
 639 }
 640
 641 static uint64_t
 642 gcd(uint64_t a, uint64_t b)
 643 {
 644    while (b) {
 645       uint64_t old_b = b;
 646       b = a % b;
 647       a = old_b;
 648    }
 649    return a;
 650 }
 651
 652 static uint32_t
 653 get_best_align(struct entry *entry)
 654 {
 655    if (entry->best_align != UINT32_MAX)
 656       return entry->best_align;
 657
 658    uint64_t best_align = entry->offset;
 659    for (unsigned i = 0; i < entry->key->offset_def_count; i++) {
 660       if (!best_align)
 661          best_align = entry->key->offset_defs_mul[i];
 662       else if (entry->key->offset_defs_mul[i])
 663          best_align = gcd(best_align, entry->key->offset_defs_mul[i]);
 664    }
 665
 666    if (nir_intrinsic_infos[entry->intrin->intrinsic].index_map[NIR_INTRINSIC_ALIGN_MUL])
 667       best_align = MAX2(best_align, nir_intrinsic_align(entry->intrin));
 668
 669    /* ensure the result is a power of two that fits in a int32_t */
 670    entry->best_align = gcd(best_align, 1u << 30);
 671
 672    return entry->best_align;
 673 }
 674
 675 /* Return true if "new_bit_size" is a usable bit size for a vectorized load/store
 676  * of "low" and "high". */
 677 static bool
 678 new_bitsize_acceptable(struct vectorize_ctx *ctx, unsigned new_bit_size,
 679                        struct entry *low, struct entry *high, unsigned size)
 680 {
 681    if (size % new_bit_size != 0)
 682       return false;
 683
 684    unsigned new_num_components = size / new_bit_size;
 685    if (!nir_num_components_valid(new_num_components))
 686       return false;
 687
 688    unsigned high_offset = high->offset_signed - low->offset_signed;
 689
 690    /* check nir_extract_bits limitations */
 691    unsigned common_bit_size = MIN2(get_bit_size(low), get_bit_size(high));
 692    common_bit_size = MIN2(common_bit_size, new_bit_size);
 693    if (high_offset > 0)
 694       common_bit_size = MIN2(common_bit_size, (1u << (ffs(high_offset * 8) - 1)));
 695    if (new_bit_size / common_bit_size > NIR_MAX_VEC_COMPONENTS)
 696       return false;
 697
 698    if (!ctx->callback(get_best_align(low), new_bit_size, new_num_components,
 699                       high_offset, low->intrin, high->intrin))
 700       return false;
 701
 702    if (low->is_store) {
 703       unsigned low_size = low->intrin->num_components * get_bit_size(low);
 704       unsigned high_size = high->intrin->num_components * get_bit_size(high);
 705
 706       if (low_size % new_bit_size != 0)
 707          return false;
 708       if (high_size % new_bit_size != 0)
 709          return false;
 710
 711       unsigned write_mask = nir_intrinsic_write_mask(low->intrin);
 712       if (!writemask_representable(write_mask, low_size, new_bit_size))
 713          return false;
 714
 715       write_mask = nir_intrinsic_write_mask(high->intrin);
 716       if (!writemask_representable(write_mask, high_size, new_bit_size))
 717          return false;
 718    }
 719
 720    return true;
 721 }
 722
 723 /* Updates a write mask, "write_mask", so that it can be used with a
 724  * "new_bit_size"-bit store instead of a "old_bit_size"-bit store. */
 725 static uint32_t
 726 update_writemask(unsigned write_mask, unsigned old_bit_size, unsigned new_bit_size)
 727 {
 728    uint32_t res = 0;
 729    while (write_mask) {
 730       int start, count;
 731       u_bit_scan_consecutive_range(&write_mask, &start, &count);
 732       start = start * old_bit_size / new_bit_size;
 733       count = count * old_bit_size / new_bit_size;
 734       res |= ((1 << count) - 1) << start;
 735    }
 736    return res;
 737 }
 738
 739 static nir_deref_instr *subtract_deref(nir_builder *b, nir_deref_instr *deref, int64_t offset)
 740 {
 741    /* avoid adding another deref to the path */
 742    if (deref->deref_type == nir_deref_type_ptr_as_array &&
 743        nir_src_is_const(deref->arr.index) &&
 744        offset % nir_deref_instr_ptr_as_array_stride(deref) == 0) {
 745       unsigned stride = nir_deref_instr_ptr_as_array_stride(deref);
 746       nir_ssa_def *index = nir_imm_intN_t(b, nir_src_as_int(deref->arr.index) - offset / stride,
 747                                           deref->dest.ssa.bit_size);
 748       return nir_build_deref_ptr_as_array(b, nir_deref_instr_parent(deref), index);
 749    }
 750
 751    if (deref->deref_type == nir_deref_type_array &&
 752        nir_src_is_const(deref->arr.index)) {
 753       nir_deref_instr *parent = nir_deref_instr_parent(deref);
 754       unsigned stride = glsl_get_explicit_stride(parent->type);
 755       if (offset % stride == 0)
 756          return nir_build_deref_array_imm(
 757             b, parent, nir_src_as_int(deref->arr.index) - offset / stride);
 758    }
 759
 760
 761    deref = nir_build_deref_cast(b, &deref->dest.ssa, deref->mode,
 762                                 glsl_scalar_type(GLSL_TYPE_UINT8), 1);
 763    return nir_build_deref_ptr_as_array(
 764       b, deref, nir_imm_intN_t(b, -offset, deref->dest.ssa.bit_size));
 765 }
 766
 767 static bool update_align(struct entry *entry)
 768 {
 769    bool has_align_index =
 770       nir_intrinsic_infos[entry->intrin->intrinsic].index_map[NIR_INTRINSIC_ALIGN_MUL];
 771    if (has_align_index) {
 772       unsigned align = get_best_align(entry);
 773       if (align != nir_intrinsic_align(entry->intrin)) {
 774          nir_intrinsic_set_align(entry->intrin, align, 0);
 775          return true;
 776       }
 777    }
 778    return false;
 779 }
 780
 781 static void
 782 vectorize_loads(nir_builder *b, struct vectorize_ctx *ctx,
 783                 struct entry *low, struct entry *high,
 784                 struct entry *first, struct entry *second,
 785                 unsigned new_bit_size, unsigned new_num_components,
 786                 unsigned high_start)
 787 {
 788    unsigned low_bit_size = get_bit_size(low);
 789    unsigned high_bit_size = get_bit_size(high);
 790    bool low_bool = low->intrin->dest.ssa.bit_size == 1;
 791    bool high_bool = high->intrin->dest.ssa.bit_size == 1;
 792    nir_ssa_def *data = &first->intrin->dest.ssa;
 793
 794    b->cursor = nir_after_instr(first->instr);
 795
 796    /* update the load's destination size and extract data for each of the original loads */
 797    data->num_components = new_num_components;
 798    data->bit_size = new_bit_size;
 799
 800    nir_ssa_def *low_def = nir_extract_bits(
 801       b, &data, 1, 0, low->intrin->num_components, low_bit_size);
 802    nir_ssa_def *high_def = nir_extract_bits(
 803       b, &data, 1, high_start, high->intrin->num_components, high_bit_size);
 804
 805    /* convert booleans */
 806    low_def = low_bool ? nir_i2b(b, low_def) : nir_mov(b, low_def);
 807    high_def = high_bool ? nir_i2b(b, high_def) : nir_mov(b, high_def);
 808
 809    /* update uses */
 810    if (first == low) {
 811       nir_ssa_def_rewrite_uses_after(&low->intrin->dest.ssa, nir_src_for_ssa(low_def),
 812                                      high_def->parent_instr);
 813       nir_ssa_def_rewrite_uses(&high->intrin->dest.ssa, nir_src_for_ssa(high_def));
 814    } else {
 815       nir_ssa_def_rewrite_uses(&low->intrin->dest.ssa, nir_src_for_ssa(low_def));
 816       nir_ssa_def_rewrite_uses_after(&high->intrin->dest.ssa, nir_src_for_ssa(high_def),
 817                                      high_def->parent_instr);
 818    }
 819
 820    /* update the intrinsic */
 821    first->intrin->num_components = new_num_components;
 822
 823    const struct intrinsic_info *info = first->info;
 824
 825    /* update the offset */
 826    if (first != low && info->base_src >= 0) {
 827       /* let nir_opt_algebraic() remove this addition. this doesn't have much
 828        * issues with subtracting 16 from expressions like "(i + 1) * 16" because
 829        * nir_opt_algebraic() turns them into "i * 16 + 16" */
 830       b->cursor = nir_before_instr(first->instr);
 831
 832       nir_ssa_def *new_base = first->intrin->src[info->base_src].ssa;
 833       new_base = nir_iadd_imm(b, new_base, -(int)(high_start / 8u));
 834
 835       nir_instr_rewrite_src(first->instr, &first->intrin->src[info->base_src],
 836                             nir_src_for_ssa(new_base));
 837    }
 838
 839    /* update the deref */
 840    if (info->deref_src >= 0) {
 841       b->cursor = nir_before_instr(first->instr);
 842
 843       nir_deref_instr *deref = nir_src_as_deref(first->intrin->src[info->deref_src]);
 844       if (first != low && high_start != 0)
 845          deref = subtract_deref(b, deref, high_start / 8u);
 846       first->deref = cast_deref(b, new_num_components, new_bit_size, deref);
 847
 848       nir_instr_rewrite_src(first->instr, &first->intrin->src[info->deref_src],
 849                             nir_src_for_ssa(&first->deref->dest.ssa));
 850    }
 851
 852    /* update base/align */
 853    bool has_base_index =
 854       nir_intrinsic_infos[first->intrin->intrinsic].index_map[NIR_INTRINSIC_BASE];
 855
 856    if (first != low && has_base_index)
 857       nir_intrinsic_set_base(first->intrin, nir_intrinsic_base(low->intrin));
 858
 859    first->key = low->key;
 860    first->offset = low->offset;
 861    first->best_align = get_best_align(low);
 862
 863    update_align(first);
 864
 865    nir_instr_remove(second->instr);
 866 }
 867
 868 static void
 869 vectorize_stores(nir_builder *b, struct vectorize_ctx *ctx,
 870                  struct entry *low, struct entry *high,
 871                  struct entry *first, struct entry *second,
 872                  unsigned new_bit_size, unsigned new_num_components,
 873                  unsigned high_start)
 874 {
 875    ASSERTED unsigned low_size = low->intrin->num_components * get_bit_size(low);
 876    assert(low_size % new_bit_size == 0);
 877
 878    b->cursor = nir_before_instr(second->instr);
 879
 880    /* get new writemasks */
 881    uint32_t low_write_mask = nir_intrinsic_write_mask(low->intrin);
 882    uint32_t high_write_mask = nir_intrinsic_write_mask(high->intrin);
 883    low_write_mask = update_writemask(low_write_mask, get_bit_size(low), new_bit_size);
 884    high_write_mask = update_writemask(high_write_mask, get_bit_size(high), new_bit_size);
 885    high_write_mask <<= high_start / new_bit_size;
 886
 887    uint32_t write_mask = low_write_mask | high_write_mask;
 888
 889    /* convert booleans */
 890    nir_ssa_def *low_val = low->intrin->src[low->info->value_src].ssa;
 891    nir_ssa_def *high_val = high->intrin->src[high->info->value_src].ssa;
 892    low_val = low_val->bit_size == 1 ? nir_b2i(b, low_val, 32) : low_val;
 893    high_val = high_val->bit_size == 1 ? nir_b2i(b, high_val, 32) : high_val;
 894
 895    /* combine the data */
 896    nir_ssa_def *data_channels[NIR_MAX_VEC_COMPONENTS];
 897    for (unsigned i = 0; i < new_num_components; i++) {
 898       bool set_low = low_write_mask & (1 << i);
 899       bool set_high = high_write_mask & (1 << i);
 900
 901       if (set_low && (!set_high || low == second)) {
 902          unsigned offset = i * new_bit_size;
 903          data_channels[i] = nir_extract_bits(b, &low_val, 1, offset, 1, new_bit_size);
 904       } else if (set_high) {
 905          assert(!set_low || high == second);
 906          unsigned offset = i * new_bit_size - high_start;
 907          data_channels[i] = nir_extract_bits(b, &high_val, 1, offset, 1, new_bit_size);
 908       } else {
 909          data_channels[i] = nir_ssa_undef(b, 1, new_bit_size);
 910       }
 911    }
 912    nir_ssa_def *data = nir_vec(b, data_channels, new_num_components);
 913
 914    /* update the intrinsic */
 915    nir_intrinsic_set_write_mask(second->intrin, write_mask);
 916    second->intrin->num_components = data->num_components;
 917
 918    const struct intrinsic_info *info = second->info;
 919    assert(info->value_src >= 0);
 920    nir_instr_rewrite_src(second->instr, &second->intrin->src[info->value_src],
 921                          nir_src_for_ssa(data));
 922
 923    /* update the offset */
 924    if (second != low && info->base_src >= 0)
 925       nir_instr_rewrite_src(second->instr, &second->intrin->src[info->base_src],
 926                             low->intrin->src[info->base_src]);
 927
 928    /* update the deref */
 929    if (info->deref_src >= 0) {
 930       b->cursor = nir_before_instr(second->instr);
 931       second->deref = cast_deref(b, new_num_components, new_bit_size,
 932                                  nir_src_as_deref(low->intrin->src[info->deref_src]));
 933       nir_instr_rewrite_src(second->instr, &second->intrin->src[info->deref_src],
 934                             nir_src_for_ssa(&second->deref->dest.ssa));
 935    }
 936
 937    /* update base/align */
 938    bool has_base_index =
 939       nir_intrinsic_infos[second->intrin->intrinsic].index_map[NIR_INTRINSIC_BASE];
 940
 941    if (second != low && has_base_index)
 942       nir_intrinsic_set_base(second->intrin, nir_intrinsic_base(low->intrin));
 943
 944    second->key = low->key;
 945    second->offset = low->offset;
 946    second->best_align = get_best_align(low);
 947
 948    update_align(second);
 949
 950    list_del(&first->head);
 951    nir_instr_remove(first->instr);
 952 }
 953
 954 /* Returns true if it can prove that "a" and "b" point to different resources. */
 955 static bool
 956 resources_different(nir_ssa_def *a, nir_ssa_def *b)
 957 {
 958    if (!a || !b)
 959       return false;
 960
 961    if (a->parent_instr->type == nir_instr_type_load_const &&
 962        b->parent_instr->type == nir_instr_type_load_const) {
 963       return nir_src_as_uint(nir_src_for_ssa(a)) != nir_src_as_uint(nir_src_for_ssa(b));
 964    }
 965
 966    if (a->parent_instr->type == nir_instr_type_intrinsic &&
 967        b->parent_instr->type == nir_instr_type_intrinsic) {
 968       nir_intrinsic_instr *aintrin = nir_instr_as_intrinsic(a->parent_instr);
 969       nir_intrinsic_instr *bintrin = nir_instr_as_intrinsic(b->parent_instr);
 970       if (aintrin->intrinsic == nir_intrinsic_vulkan_resource_index &&
 971           bintrin->intrinsic == nir_intrinsic_vulkan_resource_index) {
 972          return nir_intrinsic_desc_set(aintrin) != nir_intrinsic_desc_set(bintrin) ||
 973                 nir_intrinsic_binding(aintrin) != nir_intrinsic_binding(bintrin) ||
 974                 resources_different(aintrin->src[0].ssa, bintrin->src[0].ssa);
 975       }
 976    }
 977
 978    return false;
 979 }
 980
 981 static int64_t
 982 compare_entries(struct entry *a, struct entry *b)
 983 {
 984    if (!entry_key_equals(a->key, b->key))
 985       return INT64_MAX;
 986    return b->offset_signed - a->offset_signed;
 987 }
 988
 989 static bool
 990 may_alias(struct entry *a, struct entry *b)
 991 {
 992    assert(mode_to_index(get_variable_mode(a)) ==
 993           mode_to_index(get_variable_mode(b)));
 994
 995    /* if the resources/variables are definitively different and both have
 996     * ACCESS_RESTRICT, we can assume they do not alias. */
 997    bool res_different = a->key->var != b->key->var ||
 998                         resources_different(a->key->resource, b->key->resource);
 999    if (res_different && (a->access & ACCESS_RESTRICT) && (b->access & ACCESS_RESTRICT))
1000       return false;
1001
1002    /* we can't compare offsets if the resources/variables might be different */
1003    if (a->key->var != b->key->var || a->key->resource != b->key->resource)
1004       return true;
1005
1006    /* use adjacency information */
1007    /* TODO: we can look closer at the entry keys */
1008    int64_t diff = compare_entries(a, b);
1009    if (diff != INT64_MAX) {
1010       /* with atomics, intrin->num_components can be 0 */
1011       if (diff < 0)
1012          return llabs(diff) < MAX2(b->intrin->num_components, 1u) * (get_bit_size(b) / 8u);
1013       else
1014          return diff < MAX2(a->intrin->num_components, 1u) * (get_bit_size(a) / 8u);
1015    }
1016
1017    /* TODO: we can use deref information */
1018
1019    return true;
1020 }
1021
1022 static bool
1023 check_for_aliasing(struct vectorize_ctx *ctx, struct entry *first, struct entry *second)
1024 {
1025    nir_variable_mode mode = get_variable_mode(first);
1026    if (mode & (nir_var_uniform | nir_var_system_value |
1027                nir_var_mem_push_const | nir_var_mem_ubo))
1028       return false;
1029
1030    unsigned mode_index = mode_to_index(mode);
1031    if (first->is_store) {
1032       /* find first entry that aliases "first" */
1033       list_for_each_entry_from(struct entry, next, first, &ctx->entries[mode_index], head) {
1034          if (next == first)
1035             continue;
1036          if (next == second)
1037             return false;
1038          if (may_alias(first, next))
1039             return true;
1040       }
1041    } else {
1042       /* find previous store that aliases this load */
1043       list_for_each_entry_from_rev(struct entry, prev, second, &ctx->entries[mode_index], head) {
1044          if (prev == second)
1045             continue;
1046          if (prev == first)
1047             return false;
1048          if (prev->is_store && may_alias(second, prev))
1049             return true;
1050       }
1051    }
1052
1053    return false;
1054 }
1055
1056 static bool
1057 check_for_robustness(struct vectorize_ctx *ctx, struct entry *low)
1058 {
1059    nir_variable_mode mode = get_variable_mode(low);
1060    if (mode & ctx->robust_modes) {
1061       unsigned low_bit_size = get_bit_size(low);
1062       unsigned low_size = low->intrin->num_components * low_bit_size;
1063
1064       /* don't attempt to vectorize accesses if the offset can overflow. */
1065       /* TODO: handle indirect accesses. */
1066       return low->offset_signed < 0 && low->offset_signed + low_size >= 0;
1067    }
1068
1069    return false;
1070 }
1071
/* Returns true for vector types that carry a non-zero explicit stride which
 * differs from the byte size of their scalar element. */
static bool
is_strided_vector(const struct glsl_type *type)
{
   if (!glsl_type_is_vector(type))
      return false;

   unsigned stride = glsl_get_explicit_stride(type);
   if (stride == 0)
      return false;

   return stride != type_scalar_size_bytes(glsl_get_array_element(type));
}
1083
1084 static bool
1085 try_vectorize(nir_function_impl *impl, struct vectorize_ctx *ctx,
1086               struct entry *low, struct entry *high,
1087               struct entry *first, struct entry *second)
1088 {
1089    if (!(get_variable_mode(first) & ctx->modes) ||
1090        !(get_variable_mode(second) & ctx->modes))
1091       return false;
1092
1093    if (check_for_aliasing(ctx, first, second))
1094       return false;
1095
1096    if (check_for_robustness(ctx, low))
1097       return false;
1098
1099    /* we can only vectorize non-volatile loads/stores of the same type and with
1100     * the same access */
1101    if (first->info != second->info || first->access != second->access ||
1102        (first->access & ACCESS_VOLATILE) || first->info->is_atomic)
1103       return false;
1104
1105    /* don't attempt to vectorize accesses of row-major matrix columns */
1106    if (first->deref) {
1107       const struct glsl_type *first_type = first->deref->type;
1108       const struct glsl_type *second_type = second->deref->type;
1109       if (is_strided_vector(first_type) || is_strided_vector(second_type))
1110          return false;
1111    }
1112
1113    /* gather information */
1114    uint64_t diff = high->offset_signed - low->offset_signed;
1115    unsigned low_bit_size = get_bit_size(low);
1116    unsigned high_bit_size = get_bit_size(high);
1117    unsigned low_size = low->intrin->num_components * low_bit_size;
1118    unsigned high_size = high->intrin->num_components * high_bit_size;
1119    unsigned new_size = MAX2(diff * 8u + high_size, low_size);
1120
1121    /* find a good bit size for the new load/store */
1122    unsigned new_bit_size = 0;
1123    if (new_bitsize_acceptable(ctx, low_bit_size, low, high, new_size)) {
1124       new_bit_size = low_bit_size;
1125    } else if (low_bit_size != high_bit_size &&
1126               new_bitsize_acceptable(ctx, high_bit_size, low, high, new_size)) {
1127       new_bit_size = high_bit_size;
1128    } else {
1129       new_bit_size = 64;
1130       for (; new_bit_size >= 8; new_bit_size /= 2) {
1131          /* don't repeat trying out bitsizes */
1132          if (new_bit_size == low_bit_size || new_bit_size == high_bit_size)
1133             continue;
1134          if (new_bitsize_acceptable(ctx, new_bit_size, low, high, new_size))
1135             break;
1136       }
1137       if (new_bit_size < 8)
1138          return false;
1139    }
1140    unsigned new_num_components = new_size / new_bit_size;
1141
1142    /* vectorize the loads/stores */
1143    nir_builder b;
1144    nir_builder_init(&b, impl);
1145
1146    if (first->is_store)
1147       vectorize_stores(&b, ctx, low, high, first, second,
1148                        new_bit_size, new_num_components, diff * 8u);
1149    else
1150       vectorize_loads(&b, ctx, low, high, first, second,
1151                       new_bit_size, new_num_components, diff * 8u);
1152
1153    return true;
1154 }
1155
1156 static bool
1157 vectorize_entries(struct vectorize_ctx *ctx, nir_function_impl *impl, struct hash_table *ht)
1158 {
1159    if (!ht)
1160       return false;
1161
1162    bool progress = false;
1163    hash_table_foreach(ht, entry) {
1164       struct util_dynarray *arr = entry->data;
1165       if (!arr->size)
1166          continue;
1167
1168       qsort(util_dynarray_begin(arr),
1169             util_dynarray_num_elements(arr, struct entry *),
1170             sizeof(struct entry *), &sort_entries);
1171
1172       unsigned i = 0;
1173       for (; i < util_dynarray_num_elements(arr, struct entry*) - 1; i++) {
1174          struct entry *low = *util_dynarray_element(arr, struct entry *, i);
1175          struct entry *high = *util_dynarray_element(arr, struct entry *, i + 1);
1176
1177          uint64_t diff = high->offset_signed - low->offset_signed;
1178          if (diff > get_bit_size(low) / 8u * low->intrin->num_components) {
1179             progress |= update_align(low);
1180             continue;
1181          }
1182
1183          struct entry *first = low->index < high->index ? low : high;
1184          struct entry *second = low->index < high->index ? high : low;
1185
1186          if (try_vectorize(impl, ctx, low, high, first, second)) {
1187             *util_dynarray_element(arr, struct entry *, i) = NULL;
1188             *util_dynarray_element(arr, struct entry *, i + 1) = low->is_store ? second : first;
1189             progress = true;
1190          } else {
1191             progress |= update_align(low);
1192          }
1193       }
1194
1195       struct entry *last = *util_dynarray_element(arr, struct entry *, i);
1196       progress |= update_align(last);
1197    }
1198
1199    _mesa_hash_table_clear(ht, delete_entry_dynarray);
1200
1201    return progress;
1202 }
1203
1204 static bool
1205 handle_barrier(struct vectorize_ctx *ctx, bool *progress, nir_function_impl *impl, nir_instr *instr)
1206 {
1207    unsigned modes = 0;
1208    bool acquire = true;
1209    bool release = true;
1210    if (instr->type == nir_instr_type_intrinsic) {
1211       nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
1212       switch (intrin->intrinsic) {
1213       case nir_intrinsic_group_memory_barrier:
1214       case nir_intrinsic_memory_barrier:
1215          modes = nir_var_mem_ssbo | nir_var_mem_shared | nir_var_mem_global;
1216          break;
1217       /* prevent speculative loads/stores */
1218       case nir_intrinsic_discard_if:
1219       case nir_intrinsic_discard:
1220          modes = nir_var_all;
1221          break;
1222       case nir_intrinsic_memory_barrier_buffer:
1223          modes = nir_var_mem_ssbo | nir_var_mem_global;
1224          break;
1225       case nir_intrinsic_memory_barrier_shared:
1226          modes = nir_var_mem_shared;
1227          break;
1228       case nir_intrinsic_scoped_barrier:
1229          if (nir_intrinsic_memory_scope(intrin) == NIR_SCOPE_NONE)
1230             break;
1231
1232          modes = nir_intrinsic_memory_modes(intrin);
1233          acquire = nir_intrinsic_memory_semantics(intrin) & NIR_MEMORY_ACQUIRE;
1234          release = nir_intrinsic_memory_semantics(intrin) & NIR_MEMORY_RELEASE;
1235          switch (nir_intrinsic_memory_scope(intrin)) {
1236          case NIR_SCOPE_INVOCATION:
1237          case NIR_SCOPE_SUBGROUP:
1238             /* a barier should never be required for correctness with these scopes */
1239             modes = 0;
1240             break;
1241          default:
1242             break;
1243          }
1244          break;
1245       default:
1246          return false;
1247       }
1248    } else if (instr->type == nir_instr_type_call) {
1249       modes = nir_var_all;
1250    } else {
1251       return false;
1252    }
1253
1254    while (modes) {
1255       unsigned mode_index = u_bit_scan(&modes);
1256       if ((1 << mode_index) == nir_var_mem_global) {
1257          /* Global should be rolled in with SSBO */
1258          assert(list_is_empty(&ctx->entries[mode_index]));
1259          assert(ctx->loads[mode_index] == NULL);
1260          assert(ctx->stores[mode_index] == NULL);
1261          continue;
1262       }
1263
1264       if (acquire)
1265          *progress |= vectorize_entries(ctx, impl, ctx->loads[mode_index]);
1266       if (release)
1267          *progress |= vectorize_entries(ctx, impl, ctx->stores[mode_index]);
1268    }
1269
1270    return true;
1271 }
1272
1273 static bool
1274 process_block(nir_function_impl *impl, struct vectorize_ctx *ctx, nir_block *block)
1275 {
1276    bool progress = false;
1277
1278    for (unsigned i = 0; i < nir_num_variable_modes; i++) {
1279       list_inithead(&ctx->entries[i]);
1280       if (ctx->loads[i])
1281          _mesa_hash_table_clear(ctx->loads[i], delete_entry_dynarray);
1282       if (ctx->stores[i])
1283          _mesa_hash_table_clear(ctx->stores[i], delete_entry_dynarray);
1284    }
1285
1286    /* create entries */
1287    unsigned next_index = 0;
1288
1289    nir_foreach_instr_safe(instr, block) {
1290       if (handle_barrier(ctx, &progress, impl, instr))
1291          continue;
1292
1293       /* gather information */
1294       if (instr->type != nir_instr_type_intrinsic)
1295          continue;
1296       nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
1297
1298       const struct intrinsic_info *info = get_info(intrin->intrinsic);
1299       if (!info)
1300          continue;
1301
1302       nir_variable_mode mode = info->mode;
1303       if (!mode)
1304          mode = nir_src_as_deref(intrin->src[info->deref_src])->mode;
1305       if (!(mode & aliasing_modes(ctx->modes)))
1306          continue;
1307       unsigned mode_index = mode_to_index(mode);
1308
1309       /* create entry */
1310       struct entry *entry = create_entry(ctx, info, intrin);
1311       entry->index = next_index++;
1312
1313       list_addtail(&entry->head, &ctx->entries[mode_index]);
1314
1315       /* add the entry to a hash table */
1316
1317       struct hash_table *adj_ht = NULL;
1318       if (entry->is_store) {
1319          if (!ctx->stores[mode_index])
1320             ctx->stores[mode_index] = _mesa_hash_table_create(ctx, &hash_entry_key, &entry_key_equals);
1321          adj_ht = ctx->stores[mode_index];
1322       } else {
1323          if (!ctx->loads[mode_index])
1324             ctx->loads[mode_index] = _mesa_hash_table_create(ctx, &hash_entry_key, &entry_key_equals);
1325          adj_ht = ctx->loads[mode_index];
1326       }
1327
1328       uint32_t key_hash = hash_entry_key(entry->key);
1329       struct hash_entry *adj_entry = _mesa_hash_table_search_pre_hashed(adj_ht, key_hash, entry->key);
1330       struct util_dynarray *arr;
1331       if (adj_entry && adj_entry->data) {
1332          arr = (struct util_dynarray *)adj_entry->data;
1333       } else {
1334          arr = ralloc(ctx, struct util_dynarray);
1335          util_dynarray_init(arr, arr);
1336          _mesa_hash_table_insert_pre_hashed(adj_ht, key_hash, entry->key, arr);
1337       }
1338       util_dynarray_append(arr, struct entry *, entry);
1339    }
1340
1341    /* sort and combine entries */
1342    for (unsigned i = 0; i < nir_num_variable_modes; i++) {
1343       progress |= vectorize_entries(ctx, impl, ctx->loads[i]);
1344       progress |= vectorize_entries(ctx, impl, ctx->stores[i]);
1345    }
1346
1347    return progress;
1348 }
1349
1350 bool
1351 nir_opt_load_store_vectorize(nir_shader *shader, nir_variable_mode modes,
1352                              nir_should_vectorize_mem_func callback,
1353                              nir_variable_mode robust_modes)
1354 {
1355    bool progress = false;
1356
1357    struct vectorize_ctx *ctx = rzalloc(NULL, struct vectorize_ctx);
1358    ctx->modes = modes;
1359    ctx->callback = callback;
1360    ctx->robust_modes = robust_modes;
1361
1362    nir_shader_index_vars(shader, modes);
1363
1364    nir_foreach_function(function, shader) {
1365       if (function->impl) {
1366          if (modes & nir_var_function_temp)
1367             nir_function_impl_index_vars(function->impl);
1368
1369          nir_foreach_block(block, function->impl)
1370             progress |= process_block(function->impl, ctx, block);
1371
1372          nir_metadata_preserve(function->impl,
1373                                nir_metadata_block_index |
1374                                nir_metadata_dominance |
1375                                nir_metadata_live_ssa_defs);
1376       }
1377    }
1378
1379    ralloc_free(ctx);
1380    return progress;
1381 }