nir: Split nir_index_vars into two functions
[mesa.git] / src / compiler / nir / nir_opt_load_store_vectorize.c
1 /*
2 * Copyright © 2019 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /**
25 * Although it's called a load/store "vectorization" pass, this also combines
26 * intersecting and identical loads/stores. It currently supports derefs, ubo,
27 * ssbo, shared, global and push constant loads/stores.
28 *
29 * This doesn't handle copy_deref intrinsics and assumes that
30 * nir_lower_alu_to_scalar() has been called and that the IR is free from ALU
31 * modifiers. It also assumes that derefs have explicitly laid out types.
32 *
33 * After vectorization, the backend may want to call nir_lower_alu_to_scalar()
34 * and nir_lower_pack(). Also this creates cast instructions taking derefs as a
35 * source and some parts of NIR may not be able to handle that well.
36 *
37 * There are a few situations where this doesn't vectorize as well as it could:
38 * - It won't turn four consecutive vec3 loads into 3 vec4 loads.
39 * - It doesn't do global vectorization.
40 * Handling these cases probably wouldn't provide much benefit though.
41 *
42 * This probably doesn't handle big-endian GPUs correctly.
43 */
44
45 #include "nir.h"
46 #include "nir_deref.h"
47 #include "nir_builder.h"
48 #include "nir_worklist.h"
49 #include "util/u_dynarray.h"
50
51 #include <stdlib.h>
52
53 struct intrinsic_info {
54 nir_variable_mode mode; /* 0 if the mode is obtained from the deref. */
55 nir_intrinsic_op op;
56 bool is_atomic;
57 /* Indices into nir_intrinsic::src[] or -1 if not applicable. */
58 int resource_src; /* resource (e.g. from vulkan_resource_index) */
59 int base_src; /* offset which it loads/stores from */
60 int deref_src; /* deref which is loads/stores from */
61 int value_src; /* the data it is storing */
62 };
63
64 static const struct intrinsic_info *
65 get_info(nir_intrinsic_op op) {
66 switch (op) {
67 #define INFO(mode, op, atomic, res, base, deref, val) \
68 case nir_intrinsic_##op: {\
69 static const struct intrinsic_info op##_info = {mode, nir_intrinsic_##op, atomic, res, base, deref, val};\
70 return &op##_info;\
71 }
72 #define LOAD(mode, op, res, base, deref) INFO(mode, load_##op, false, res, base, deref, -1)
73 #define STORE(mode, op, res, base, deref, val) INFO(mode, store_##op, false, res, base, deref, val)
74 #define ATOMIC(mode, type, op, res, base, deref, val) INFO(mode, type##_atomic_##op, true, res, base, deref, val)
75 LOAD(nir_var_mem_push_const, push_constant, -1, 0, -1)
76 LOAD(nir_var_mem_ubo, ubo, 0, 1, -1)
77 LOAD(nir_var_mem_ssbo, ssbo, 0, 1, -1)
78 STORE(nir_var_mem_ssbo, ssbo, 1, 2, -1, 0)
79 LOAD(0, deref, -1, -1, 0)
80 STORE(0, deref, -1, -1, 0, 1)
81 LOAD(nir_var_mem_shared, shared, -1, 0, -1)
82 STORE(nir_var_mem_shared, shared, -1, 1, -1, 0)
83 LOAD(nir_var_mem_global, global, -1, 0, -1)
84 STORE(nir_var_mem_global, global, -1, 1, -1, 0)
85 ATOMIC(nir_var_mem_ssbo, ssbo, add, 0, 1, -1, 2)
86 ATOMIC(nir_var_mem_ssbo, ssbo, imin, 0, 1, -1, 2)
87 ATOMIC(nir_var_mem_ssbo, ssbo, umin, 0, 1, -1, 2)
88 ATOMIC(nir_var_mem_ssbo, ssbo, imax, 0, 1, -1, 2)
89 ATOMIC(nir_var_mem_ssbo, ssbo, umax, 0, 1, -1, 2)
90 ATOMIC(nir_var_mem_ssbo, ssbo, and, 0, 1, -1, 2)
91 ATOMIC(nir_var_mem_ssbo, ssbo, or, 0, 1, -1, 2)
92 ATOMIC(nir_var_mem_ssbo, ssbo, xor, 0, 1, -1, 2)
93 ATOMIC(nir_var_mem_ssbo, ssbo, exchange, 0, 1, -1, 2)
94 ATOMIC(nir_var_mem_ssbo, ssbo, comp_swap, 0, 1, -1, 2)
95 ATOMIC(nir_var_mem_ssbo, ssbo, fadd, 0, 1, -1, 2)
96 ATOMIC(nir_var_mem_ssbo, ssbo, fmin, 0, 1, -1, 2)
97 ATOMIC(nir_var_mem_ssbo, ssbo, fmax, 0, 1, -1, 2)
98 ATOMIC(nir_var_mem_ssbo, ssbo, fcomp_swap, 0, 1, -1, 2)
99 ATOMIC(0, deref, add, -1, -1, 0, 1)
100 ATOMIC(0, deref, imin, -1, -1, 0, 1)
101 ATOMIC(0, deref, umin, -1, -1, 0, 1)
102 ATOMIC(0, deref, imax, -1, -1, 0, 1)
103 ATOMIC(0, deref, umax, -1, -1, 0, 1)
104 ATOMIC(0, deref, and, -1, -1, 0, 1)
105 ATOMIC(0, deref, or, -1, -1, 0, 1)
106 ATOMIC(0, deref, xor, -1, -1, 0, 1)
107 ATOMIC(0, deref, exchange, -1, -1, 0, 1)
108 ATOMIC(0, deref, comp_swap, -1, -1, 0, 1)
109 ATOMIC(0, deref, fadd, -1, -1, 0, 1)
110 ATOMIC(0, deref, fmin, -1, -1, 0, 1)
111 ATOMIC(0, deref, fmax, -1, -1, 0, 1)
112 ATOMIC(0, deref, fcomp_swap, -1, -1, 0, 1)
113 ATOMIC(nir_var_mem_shared, shared, add, -1, 0, -1, 1)
114 ATOMIC(nir_var_mem_shared, shared, imin, -1, 0, -1, 1)
115 ATOMIC(nir_var_mem_shared, shared, umin, -1, 0, -1, 1)
116 ATOMIC(nir_var_mem_shared, shared, imax, -1, 0, -1, 1)
117 ATOMIC(nir_var_mem_shared, shared, umax, -1, 0, -1, 1)
118 ATOMIC(nir_var_mem_shared, shared, and, -1, 0, -1, 1)
119 ATOMIC(nir_var_mem_shared, shared, or, -1, 0, -1, 1)
120 ATOMIC(nir_var_mem_shared, shared, xor, -1, 0, -1, 1)
121 ATOMIC(nir_var_mem_shared, shared, exchange, -1, 0, -1, 1)
122 ATOMIC(nir_var_mem_shared, shared, comp_swap, -1, 0, -1, 1)
123 ATOMIC(nir_var_mem_shared, shared, fadd, -1, 0, -1, 1)
124 ATOMIC(nir_var_mem_shared, shared, fmin, -1, 0, -1, 1)
125 ATOMIC(nir_var_mem_shared, shared, fmax, -1, 0, -1, 1)
126 ATOMIC(nir_var_mem_shared, shared, fcomp_swap, -1, 0, -1, 1)
127 ATOMIC(nir_var_mem_global, global, add, -1, 0, -1, 1)
128 ATOMIC(nir_var_mem_global, global, imin, -1, 0, -1, 1)
129 ATOMIC(nir_var_mem_global, global, umin, -1, 0, -1, 1)
130 ATOMIC(nir_var_mem_global, global, imax, -1, 0, -1, 1)
131 ATOMIC(nir_var_mem_global, global, umax, -1, 0, -1, 1)
132 ATOMIC(nir_var_mem_global, global, and, -1, 0, -1, 1)
133 ATOMIC(nir_var_mem_global, global, or, -1, 0, -1, 1)
134 ATOMIC(nir_var_mem_global, global, xor, -1, 0, -1, 1)
135 ATOMIC(nir_var_mem_global, global, exchange, -1, 0, -1, 1)
136 ATOMIC(nir_var_mem_global, global, comp_swap, -1, 0, -1, 1)
137 ATOMIC(nir_var_mem_global, global, fadd, -1, 0, -1, 1)
138 ATOMIC(nir_var_mem_global, global, fmin, -1, 0, -1, 1)
139 ATOMIC(nir_var_mem_global, global, fmax, -1, 0, -1, 1)
140 ATOMIC(nir_var_mem_global, global, fcomp_swap, -1, 0, -1, 1)
141 default:
142 break;
143 #undef ATOMIC
144 #undef STORE
145 #undef LOAD
146 #undef INFO
147 }
148 return NULL;
149 }
150
151 /*
152 * Information used to compare memory operations.
153 * It canonically represents an offset as:
154 * `offset_defs[0]*offset_defs_mul[0] + offset_defs[1]*offset_defs_mul[1] + ...`
155 * "offset_defs" is sorted in ascenting order by the ssa definition's index.
156 * "resource" or "var" may be NULL.
157 */
158 struct entry_key {
159 nir_ssa_def *resource;
160 nir_variable *var;
161 unsigned offset_def_count;
162 nir_ssa_def **offset_defs;
163 uint64_t *offset_defs_mul;
164 };
165
166 /* Information on a single memory operation. */
167 struct entry {
168 struct list_head head;
169 unsigned index;
170
171 struct entry_key *key;
172 union {
173 uint64_t offset; /* sign-extended */
174 int64_t offset_signed;
175 };
176 uint32_t best_align;
177
178 nir_instr *instr;
179 nir_intrinsic_instr *intrin;
180 const struct intrinsic_info *info;
181 enum gl_access_qualifier access;
182 bool is_store;
183
184 nir_deref_instr *deref;
185 };
186
187 struct vectorize_ctx {
188 nir_variable_mode modes;
189 nir_should_vectorize_mem_func callback;
190 nir_variable_mode robust_modes;
191 struct list_head entries[nir_num_variable_modes];
192 struct hash_table *loads[nir_num_variable_modes];
193 struct hash_table *stores[nir_num_variable_modes];
194 };
195
196 static uint32_t hash_entry_key(const void *key_)
197 {
198 /* this is careful to not include pointers in the hash calculation so that
199 * the order of the hash table walk is deterministic */
200 struct entry_key *key = (struct entry_key*)key_;
201
202 uint32_t hash = 0;
203 if (key->resource)
204 hash = XXH32(&key->resource->index, sizeof(key->resource->index), hash);
205 if (key->var) {
206 hash = XXH32(&key->var->index, sizeof(key->var->index), hash);
207 unsigned mode = key->var->data.mode;
208 hash = XXH32(&mode, sizeof(mode), hash);
209 }
210
211 for (unsigned i = 0; i < key->offset_def_count; i++)
212 hash = XXH32(&key->offset_defs[i]->index, sizeof(key->offset_defs[i]->index), hash);
213
214 hash = XXH32(key->offset_defs_mul, key->offset_def_count * sizeof(uint64_t), hash);
215
216 return hash;
217 }
218
219 static bool entry_key_equals(const void *a_, const void *b_)
220 {
221 struct entry_key *a = (struct entry_key*)a_;
222 struct entry_key *b = (struct entry_key*)b_;
223
224 if (a->var != b->var || a->resource != b->resource)
225 return false;
226
227 if (a->offset_def_count != b->offset_def_count)
228 return false;
229
230 size_t offset_def_size = a->offset_def_count * sizeof(nir_ssa_def *);
231 size_t offset_def_mul_size = a->offset_def_count * sizeof(uint64_t);
232 if (a->offset_def_count &&
233 (memcmp(a->offset_defs, b->offset_defs, offset_def_size) ||
234 memcmp(a->offset_defs_mul, b->offset_defs_mul, offset_def_mul_size)))
235 return false;
236
237 return true;
238 }
239
240 static void delete_entry_dynarray(struct hash_entry *entry)
241 {
242 struct util_dynarray *arr = (struct util_dynarray *)entry->data;
243 ralloc_free(arr);
244 }
245
246 static int sort_entries(const void *a_, const void *b_)
247 {
248 struct entry *a = *(struct entry*const*)a_;
249 struct entry *b = *(struct entry*const*)b_;
250
251 if (a->offset_signed > b->offset_signed)
252 return 1;
253 else if (a->offset_signed < b->offset_signed)
254 return -1;
255 else
256 return 0;
257 }
258
259 static unsigned
260 get_bit_size(struct entry *entry)
261 {
262 unsigned size = entry->is_store ?
263 entry->intrin->src[entry->info->value_src].ssa->bit_size :
264 entry->intrin->dest.ssa.bit_size;
265 return size == 1 ? 32u : size;
266 }
267
268 /* If "def" is from an alu instruction with the opcode "op" and one of it's
269 * sources is a constant, update "def" to be the non-constant source, fill "c"
270 * with the constant and return true. */
271 static bool
272 parse_alu(nir_ssa_def **def, nir_op op, uint64_t *c)
273 {
274 nir_ssa_scalar scalar;
275 scalar.def = *def;
276 scalar.comp = 0;
277
278 if (!nir_ssa_scalar_is_alu(scalar) || nir_ssa_scalar_alu_op(scalar) != op)
279 return false;
280
281 nir_ssa_scalar src0 = nir_ssa_scalar_chase_alu_src(scalar, 0);
282 nir_ssa_scalar src1 = nir_ssa_scalar_chase_alu_src(scalar, 1);
283 if (op != nir_op_ishl && nir_ssa_scalar_is_const(src0) && src1.comp == 0) {
284 *c = nir_ssa_scalar_as_uint(src0);
285 *def = src1.def;
286 } else if (nir_ssa_scalar_is_const(src1) && src0.comp == 0) {
287 *c = nir_ssa_scalar_as_uint(src1);
288 *def = src0.def;
289 } else {
290 return false;
291 }
292 return true;
293 }
294
295 /* Parses an offset expression such as "a * 16 + 4" and "(a * 16 + 4) * 64 + 32". */
296 static void
297 parse_offset(nir_ssa_def **base, uint64_t *base_mul, uint64_t *offset)
298 {
299 if ((*base)->parent_instr->type == nir_instr_type_load_const) {
300 *offset = nir_src_comp_as_uint(nir_src_for_ssa(*base), 0);
301 *base = NULL;
302 return;
303 }
304
305 uint64_t mul = 1;
306 uint64_t add = 0;
307 bool progress = false;
308 do {
309 uint64_t mul2 = 1, add2 = 0;
310
311 progress = parse_alu(base, nir_op_imul, &mul2);
312 mul *= mul2;
313
314 mul2 = 0;
315 progress |= parse_alu(base, nir_op_ishl, &mul2);
316 mul <<= mul2;
317
318 progress |= parse_alu(base, nir_op_iadd, &add2);
319 add += add2 * mul;
320 } while (progress);
321
322 *base_mul = mul;
323 *offset = add;
324 }
325
/* Returns the size in bytes of one scalar of a scalar, vector or matrix
 * type. Booleans are counted as 4 bytes. */
static unsigned
type_scalar_size_bytes(const struct glsl_type *type)
{
   assert(glsl_type_is_vector_or_scalar(type) ||
          glsl_type_is_matrix(type));

   if (glsl_type_is_boolean(type))
      return 4u;

   return glsl_get_bit_size(type) / 8u;
}
333
/*
 * Returns the distance in bytes between two consecutive elements selected by
 * an array deref into "type": the scalar size for row-major matrices and for
 * vectors without an explicit stride, the type's explicit stride otherwise.
 */
static int
get_array_stride(const struct glsl_type *type)
{
   const unsigned explicit_stride = glsl_get_explicit_stride(type);

   const bool row_major_matrix =
      glsl_type_is_matrix(type) && glsl_matrix_type_is_row_major(type);
   const bool unstrided_vector =
      glsl_type_is_vector(type) && explicit_stride == 0;

   if (row_major_matrix || unstrided_vector)
      return type_scalar_size_bytes(type);

   return explicit_stride;
}
344
/*
 * Keeps the low "bit_size" bits of "val" and sign-extends them to 64 bits.
 *
 * A "bit_size" of 64 or more returns "val" unchanged and a "bit_size" of 0
 * returns 0. The computation uses unsigned arithmetic only, so it relies
 * neither on an out-of-range shift count nor on the implementation-defined
 * result of right-shifting a negative value.
 */
static uint64_t
mask_sign_extend(uint64_t val, unsigned bit_size)
{
   if (bit_size == 0)
      return 0;
   if (bit_size >= 64)
      return val;

   const uint64_t sign_bit = UINT64_C(1) << (bit_size - 1);
   const uint64_t low_bits = val & ((sign_bit << 1) - 1);

   /* Flipping the sign bit and then subtracting it leaves non-negative
    * values alone and fills the upper bits with ones for negative values. */
   return (low_bits ^ sign_bit) - sign_bit;
}
350
351 static unsigned
352 add_to_entry_key(nir_ssa_def **offset_defs, uint64_t *offset_defs_mul,
353 unsigned offset_def_count, nir_ssa_def *def, uint64_t mul)
354 {
355 mul = mask_sign_extend(mul, def->bit_size);
356
357 for (unsigned i = 0; i <= offset_def_count; i++) {
358 if (i == offset_def_count || def->index > offset_defs[i]->index) {
359 /* insert before i */
360 memmove(offset_defs + i + 1, offset_defs + i,
361 (offset_def_count - i) * sizeof(nir_ssa_def *));
362 memmove(offset_defs_mul + i + 1, offset_defs_mul + i,
363 (offset_def_count - i) * sizeof(uint64_t));
364 offset_defs[i] = def;
365 offset_defs_mul[i] = mul;
366 return 1;
367 } else if (def->index == offset_defs[i]->index) {
368 /* merge with offset_def at i */
369 offset_defs_mul[i] += mul;
370 return 0;
371 }
372 }
373 unreachable("Unreachable.");
374 return 0;
375 }
376
377 static struct entry_key *
378 create_entry_key_from_deref(void *mem_ctx,
379 struct vectorize_ctx *ctx,
380 nir_deref_path *path,
381 uint64_t *offset_base)
382 {
383 unsigned path_len = 0;
384 while (path->path[path_len])
385 path_len++;
386
387 nir_ssa_def *offset_defs_stack[32];
388 uint64_t offset_defs_mul_stack[32];
389 nir_ssa_def **offset_defs = offset_defs_stack;
390 uint64_t *offset_defs_mul = offset_defs_mul_stack;
391 if (path_len > 32) {
392 offset_defs = malloc(path_len * sizeof(nir_ssa_def *));
393 offset_defs_mul = malloc(path_len * sizeof(uint64_t));
394 }
395 unsigned offset_def_count = 0;
396
397 struct entry_key* key = ralloc(mem_ctx, struct entry_key);
398 key->resource = NULL;
399 key->var = NULL;
400 *offset_base = 0;
401
402 for (unsigned i = 0; i < path_len; i++) {
403 nir_deref_instr *parent = i ? path->path[i - 1] : NULL;
404 nir_deref_instr *deref = path->path[i];
405
406 switch (deref->deref_type) {
407 case nir_deref_type_var: {
408 assert(!parent);
409 key->var = deref->var;
410 break;
411 }
412 case nir_deref_type_array:
413 case nir_deref_type_ptr_as_array: {
414 assert(parent);
415 nir_ssa_def *index = deref->arr.index.ssa;
416 uint32_t stride;
417 if (deref->deref_type == nir_deref_type_ptr_as_array)
418 stride = nir_deref_instr_ptr_as_array_stride(deref);
419 else
420 stride = get_array_stride(parent->type);
421
422 nir_ssa_def *base = index;
423 uint64_t offset = 0, base_mul = 1;
424 parse_offset(&base, &base_mul, &offset);
425 offset = mask_sign_extend(offset, index->bit_size);
426
427 *offset_base += offset * stride;
428 if (base) {
429 offset_def_count += add_to_entry_key(offset_defs, offset_defs_mul,
430 offset_def_count,
431 base, base_mul * stride);
432 }
433 break;
434 }
435 case nir_deref_type_struct: {
436 assert(parent);
437 int offset = glsl_get_struct_field_offset(parent->type, deref->strct.index);
438 *offset_base += offset;
439 break;
440 }
441 case nir_deref_type_cast: {
442 if (!parent)
443 key->resource = deref->parent.ssa;
444 break;
445 }
446 default:
447 unreachable("Unhandled deref type");
448 }
449 }
450
451 key->offset_def_count = offset_def_count;
452 key->offset_defs = ralloc_array(mem_ctx, nir_ssa_def *, offset_def_count);
453 key->offset_defs_mul = ralloc_array(mem_ctx, uint64_t, offset_def_count);
454 memcpy(key->offset_defs, offset_defs, offset_def_count * sizeof(nir_ssa_def *));
455 memcpy(key->offset_defs_mul, offset_defs_mul, offset_def_count * sizeof(uint64_t));
456
457 if (offset_defs != offset_defs_stack)
458 free(offset_defs);
459 if (offset_defs_mul != offset_defs_mul_stack)
460 free(offset_defs_mul);
461
462 return key;
463 }
464
465 static unsigned
466 parse_entry_key_from_offset(struct entry_key *key, unsigned size, unsigned left,
467 nir_ssa_def *base, uint64_t base_mul, uint64_t *offset)
468 {
469 uint64_t new_mul;
470 uint64_t new_offset;
471 parse_offset(&base, &new_mul, &new_offset);
472 *offset += new_offset * base_mul;
473
474 if (!base)
475 return 0;
476
477 base_mul *= new_mul;
478
479 assert(left >= 1);
480
481 if (left >= 2) {
482 nir_ssa_scalar scalar;
483 scalar.def = base;
484 scalar.comp = 0;
485 if (nir_ssa_scalar_is_alu(scalar) && nir_ssa_scalar_alu_op(scalar) == nir_op_iadd) {
486 nir_ssa_scalar src0 = nir_ssa_scalar_chase_alu_src(scalar, 0);
487 nir_ssa_scalar src1 = nir_ssa_scalar_chase_alu_src(scalar, 1);
488 if (src0.comp == 0 && src1.comp == 0) {
489 unsigned amount = parse_entry_key_from_offset(key, size, left - 1, src0.def, base_mul, offset);
490 amount += parse_entry_key_from_offset(key, size + amount, left - amount, src1.def, base_mul, offset);
491 return amount;
492 }
493 }
494 }
495
496 return add_to_entry_key(key->offset_defs, key->offset_defs_mul, size, base, base_mul);
497 }
498
499 static struct entry_key *
500 create_entry_key_from_offset(void *mem_ctx, nir_ssa_def *base, uint64_t base_mul, uint64_t *offset)
501 {
502 struct entry_key *key = ralloc(mem_ctx, struct entry_key);
503 key->resource = NULL;
504 key->var = NULL;
505 if (base) {
506 nir_ssa_def *offset_defs[32];
507 uint64_t offset_defs_mul[32];
508 key->offset_defs = offset_defs;
509 key->offset_defs_mul = offset_defs_mul;
510
511 key->offset_def_count = parse_entry_key_from_offset(key, 0, 32, base, base_mul, offset);
512
513 key->offset_defs = ralloc_array(mem_ctx, nir_ssa_def *, key->offset_def_count);
514 key->offset_defs_mul = ralloc_array(mem_ctx, uint64_t, key->offset_def_count);
515 memcpy(key->offset_defs, offset_defs, key->offset_def_count * sizeof(nir_ssa_def *));
516 memcpy(key->offset_defs_mul, offset_defs_mul, key->offset_def_count * sizeof(uint64_t));
517 } else {
518 key->offset_def_count = 0;
519 key->offset_defs = NULL;
520 key->offset_defs_mul = NULL;
521 }
522 return key;
523 }
524
525 static nir_variable_mode
526 get_variable_mode(struct entry *entry)
527 {
528 if (entry->info->mode)
529 return entry->info->mode;
530 assert(entry->deref);
531 return entry->deref->mode;
532 }
533
534 static unsigned
535 mode_to_index(nir_variable_mode mode)
536 {
537 assert(util_bitcount(mode) == 1);
538
539 /* Globals and SSBOs should be tracked together */
540 if (mode == nir_var_mem_global)
541 mode = nir_var_mem_ssbo;
542
543 return ffs(mode) - 1;
544 }
545
546 static nir_variable_mode
547 aliasing_modes(nir_variable_mode modes)
548 {
549 /* Global and SSBO can alias */
550 if (modes & (nir_var_mem_ssbo | nir_var_mem_global))
551 modes |= nir_var_mem_ssbo | nir_var_mem_global;
552 return modes;
553 }
554
555 static struct entry *
556 create_entry(struct vectorize_ctx *ctx,
557 const struct intrinsic_info *info,
558 nir_intrinsic_instr *intrin)
559 {
560 struct entry *entry = rzalloc(ctx, struct entry);
561 entry->intrin = intrin;
562 entry->instr = &intrin->instr;
563 entry->info = info;
564 entry->best_align = UINT32_MAX;
565 entry->is_store = entry->info->value_src >= 0;
566
567 if (entry->info->deref_src >= 0) {
568 entry->deref = nir_src_as_deref(intrin->src[entry->info->deref_src]);
569 nir_deref_path path;
570 nir_deref_path_init(&path, entry->deref, NULL);
571 entry->key = create_entry_key_from_deref(entry, ctx, &path, &entry->offset);
572 nir_deref_path_finish(&path);
573 } else {
574 nir_ssa_def *base = entry->info->base_src >= 0 ?
575 intrin->src[entry->info->base_src].ssa : NULL;
576 uint64_t offset = 0;
577 if (nir_intrinsic_infos[intrin->intrinsic].index_map[NIR_INTRINSIC_BASE])
578 offset += nir_intrinsic_base(intrin);
579 entry->key = create_entry_key_from_offset(entry, base, 1, &offset);
580 entry->offset = offset;
581
582 if (base)
583 entry->offset = mask_sign_extend(entry->offset, base->bit_size);
584 }
585
586 if (entry->info->resource_src >= 0)
587 entry->key->resource = intrin->src[entry->info->resource_src].ssa;
588
589 if (nir_intrinsic_infos[intrin->intrinsic].index_map[NIR_INTRINSIC_ACCESS])
590 entry->access = nir_intrinsic_access(intrin);
591 else if (entry->key->var)
592 entry->access = entry->key->var->data.access;
593
594 uint32_t restrict_modes = nir_var_shader_in | nir_var_shader_out;
595 restrict_modes |= nir_var_shader_temp | nir_var_function_temp;
596 restrict_modes |= nir_var_uniform | nir_var_mem_push_const;
597 restrict_modes |= nir_var_system_value | nir_var_mem_shared;
598 if (get_variable_mode(entry) & restrict_modes)
599 entry->access |= ACCESS_RESTRICT;
600
601 return entry;
602 }
603
604 static nir_deref_instr *
605 cast_deref(nir_builder *b, unsigned num_components, unsigned bit_size, nir_deref_instr *deref)
606 {
607 if (glsl_get_components(deref->type) == num_components &&
608 type_scalar_size_bytes(deref->type)*8u == bit_size)
609 return deref;
610
611 enum glsl_base_type types[] = {
612 GLSL_TYPE_UINT8, GLSL_TYPE_UINT16, GLSL_TYPE_UINT, GLSL_TYPE_UINT64};
613 enum glsl_base_type base = types[ffs(bit_size / 8u) - 1u];
614 const struct glsl_type *type = glsl_vector_type(base, num_components);
615
616 if (deref->type == type)
617 return deref;
618
619 return nir_build_deref_cast(b, &deref->dest.ssa, deref->mode, type, 0);
620 }
621
/* Return true if the write mask "write_mask" of a store with "old_bit_size"
 * bits per element can be represented for a store with "new_bit_size" bits per
 * element, i.e. if every run of consecutive set bits starts at and spans a
 * multiple of "new_bit_size" bits. */
static bool
writemask_representable(unsigned write_mask, unsigned old_bit_size, unsigned new_bit_size)
{
   while (write_mask) {
      /* takes the lowest run of consecutive set bits out of the mask */
      int start, count;
      u_bit_scan_consecutive_range(&write_mask, &start, &count);

      const unsigned first_bit = start * old_bit_size;
      const unsigned num_bits = count * old_bit_size;
      if (first_bit % new_bit_size != 0 || num_bits % new_bit_size != 0)
         return false;
   }
   return true;
}
640
/* Greatest common divisor (Euclid's algorithm). gcd(x, 0) and gcd(0, x) are
 * both x, so gcd(0, 0) is 0. */
static uint64_t
gcd(uint64_t a, uint64_t b)
{
   for (uint64_t rem; b != 0; a = b, b = rem)
      rem = a % b;
   return a;
}
651
652 static uint32_t
653 get_best_align(struct entry *entry)
654 {
655 if (entry->best_align != UINT32_MAX)
656 return entry->best_align;
657
658 uint64_t best_align = entry->offset;
659 for (unsigned i = 0; i < entry->key->offset_def_count; i++) {
660 if (!best_align)
661 best_align = entry->key->offset_defs_mul[i];
662 else if (entry->key->offset_defs_mul[i])
663 best_align = gcd(best_align, entry->key->offset_defs_mul[i]);
664 }
665
666 if (nir_intrinsic_infos[entry->intrin->intrinsic].index_map[NIR_INTRINSIC_ALIGN_MUL])
667 best_align = MAX2(best_align, nir_intrinsic_align(entry->intrin));
668
669 /* ensure the result is a power of two that fits in a int32_t */
670 entry->best_align = gcd(best_align, 1u << 30);
671
672 return entry->best_align;
673 }
674
675 /* Return true if "new_bit_size" is a usable bit size for a vectorized load/store
676 * of "low" and "high". */
677 static bool
678 new_bitsize_acceptable(struct vectorize_ctx *ctx, unsigned new_bit_size,
679 struct entry *low, struct entry *high, unsigned size)
680 {
681 if (size % new_bit_size != 0)
682 return false;
683
684 unsigned new_num_components = size / new_bit_size;
685 if (!nir_num_components_valid(new_num_components))
686 return false;
687
688 unsigned high_offset = high->offset_signed - low->offset_signed;
689
690 /* check nir_extract_bits limitations */
691 unsigned common_bit_size = MIN2(get_bit_size(low), get_bit_size(high));
692 common_bit_size = MIN2(common_bit_size, new_bit_size);
693 if (high_offset > 0)
694 common_bit_size = MIN2(common_bit_size, (1u << (ffs(high_offset * 8) - 1)));
695 if (new_bit_size / common_bit_size > NIR_MAX_VEC_COMPONENTS)
696 return false;
697
698 if (!ctx->callback(get_best_align(low), new_bit_size, new_num_components,
699 high_offset, low->intrin, high->intrin))
700 return false;
701
702 if (low->is_store) {
703 unsigned low_size = low->intrin->num_components * get_bit_size(low);
704 unsigned high_size = high->intrin->num_components * get_bit_size(high);
705
706 if (low_size % new_bit_size != 0)
707 return false;
708 if (high_size % new_bit_size != 0)
709 return false;
710
711 unsigned write_mask = nir_intrinsic_write_mask(low->intrin);
712 if (!writemask_representable(write_mask, low_size, new_bit_size))
713 return false;
714
715 write_mask = nir_intrinsic_write_mask(high->intrin);
716 if (!writemask_representable(write_mask, high_size, new_bit_size))
717 return false;
718 }
719
720 return true;
721 }
722
/* Updates a write mask, "write_mask", so that it can be used with a
 * "new_bit_size"-bit store instead of a "old_bit_size"-bit store. The
 * conversion rounds down, so it is only exact for masks for which
 * writemask_representable() returns true. */
static uint32_t
update_writemask(unsigned write_mask, unsigned old_bit_size, unsigned new_bit_size)
{
   uint32_t new_mask = 0;
   while (write_mask) {
      /* takes the lowest run of consecutive set bits out of the mask */
      int start, count;
      u_bit_scan_consecutive_range(&write_mask, &start, &count);

      /* rescale the run from old to new components */
      start = start * old_bit_size / new_bit_size;
      count = count * old_bit_size / new_bit_size;

      new_mask |= ((1 << count) - 1) << start;
   }
   return new_mask;
}
738
739 static nir_deref_instr *subtract_deref(nir_builder *b, nir_deref_instr *deref, int64_t offset)
740 {
741 /* avoid adding another deref to the path */
742 if (deref->deref_type == nir_deref_type_ptr_as_array &&
743 nir_src_is_const(deref->arr.index) &&
744 offset % nir_deref_instr_ptr_as_array_stride(deref) == 0) {
745 unsigned stride = nir_deref_instr_ptr_as_array_stride(deref);
746 nir_ssa_def *index = nir_imm_intN_t(b, nir_src_as_int(deref->arr.index) - offset / stride,
747 deref->dest.ssa.bit_size);
748 return nir_build_deref_ptr_as_array(b, nir_deref_instr_parent(deref), index);
749 }
750
751 if (deref->deref_type == nir_deref_type_array &&
752 nir_src_is_const(deref->arr.index)) {
753 nir_deref_instr *parent = nir_deref_instr_parent(deref);
754 unsigned stride = glsl_get_explicit_stride(parent->type);
755 if (offset % stride == 0)
756 return nir_build_deref_array_imm(
757 b, parent, nir_src_as_int(deref->arr.index) - offset / stride);
758 }
759
760
761 deref = nir_build_deref_cast(b, &deref->dest.ssa, deref->mode,
762 glsl_scalar_type(GLSL_TYPE_UINT8), 1);
763 return nir_build_deref_ptr_as_array(
764 b, deref, nir_imm_intN_t(b, -offset, deref->dest.ssa.bit_size));
765 }
766
767 static bool update_align(struct entry *entry)
768 {
769 bool has_align_index =
770 nir_intrinsic_infos[entry->intrin->intrinsic].index_map[NIR_INTRINSIC_ALIGN_MUL];
771 if (has_align_index) {
772 unsigned align = get_best_align(entry);
773 if (align != nir_intrinsic_align(entry->intrin)) {
774 nir_intrinsic_set_align(entry->intrin, align, 0);
775 return true;
776 }
777 }
778 return false;
779 }
780
781 static void
782 vectorize_loads(nir_builder *b, struct vectorize_ctx *ctx,
783 struct entry *low, struct entry *high,
784 struct entry *first, struct entry *second,
785 unsigned new_bit_size, unsigned new_num_components,
786 unsigned high_start)
787 {
788 unsigned low_bit_size = get_bit_size(low);
789 unsigned high_bit_size = get_bit_size(high);
790 bool low_bool = low->intrin->dest.ssa.bit_size == 1;
791 bool high_bool = high->intrin->dest.ssa.bit_size == 1;
792 nir_ssa_def *data = &first->intrin->dest.ssa;
793
794 b->cursor = nir_after_instr(first->instr);
795
796 /* update the load's destination size and extract data for each of the original loads */
797 data->num_components = new_num_components;
798 data->bit_size = new_bit_size;
799
800 nir_ssa_def *low_def = nir_extract_bits(
801 b, &data, 1, 0, low->intrin->num_components, low_bit_size);
802 nir_ssa_def *high_def = nir_extract_bits(
803 b, &data, 1, high_start, high->intrin->num_components, high_bit_size);
804
805 /* convert booleans */
806 low_def = low_bool ? nir_i2b(b, low_def) : nir_mov(b, low_def);
807 high_def = high_bool ? nir_i2b(b, high_def) : nir_mov(b, high_def);
808
809 /* update uses */
810 if (first == low) {
811 nir_ssa_def_rewrite_uses_after(&low->intrin->dest.ssa, nir_src_for_ssa(low_def),
812 high_def->parent_instr);
813 nir_ssa_def_rewrite_uses(&high->intrin->dest.ssa, nir_src_for_ssa(high_def));
814 } else {
815 nir_ssa_def_rewrite_uses(&low->intrin->dest.ssa, nir_src_for_ssa(low_def));
816 nir_ssa_def_rewrite_uses_after(&high->intrin->dest.ssa, nir_src_for_ssa(high_def),
817 high_def->parent_instr);
818 }
819
820 /* update the intrinsic */
821 first->intrin->num_components = new_num_components;
822
823 const struct intrinsic_info *info = first->info;
824
825 /* update the offset */
826 if (first != low && info->base_src >= 0) {
827 /* let nir_opt_algebraic() remove this addition. this doesn't have much
828 * issues with subtracting 16 from expressions like "(i + 1) * 16" because
829 * nir_opt_algebraic() turns them into "i * 16 + 16" */
830 b->cursor = nir_before_instr(first->instr);
831
832 nir_ssa_def *new_base = first->intrin->src[info->base_src].ssa;
833 new_base = nir_iadd_imm(b, new_base, -(int)(high_start / 8u));
834
835 nir_instr_rewrite_src(first->instr, &first->intrin->src[info->base_src],
836 nir_src_for_ssa(new_base));
837 }
838
839 /* update the deref */
840 if (info->deref_src >= 0) {
841 b->cursor = nir_before_instr(first->instr);
842
843 nir_deref_instr *deref = nir_src_as_deref(first->intrin->src[info->deref_src]);
844 if (first != low && high_start != 0)
845 deref = subtract_deref(b, deref, high_start / 8u);
846 first->deref = cast_deref(b, new_num_components, new_bit_size, deref);
847
848 nir_instr_rewrite_src(first->instr, &first->intrin->src[info->deref_src],
849 nir_src_for_ssa(&first->deref->dest.ssa));
850 }
851
852 /* update base/align */
853 bool has_base_index =
854 nir_intrinsic_infos[first->intrin->intrinsic].index_map[NIR_INTRINSIC_BASE];
855
856 if (first != low && has_base_index)
857 nir_intrinsic_set_base(first->intrin, nir_intrinsic_base(low->intrin));
858
859 first->key = low->key;
860 first->offset = low->offset;
861 first->best_align = get_best_align(low);
862
863 update_align(first);
864
865 nir_instr_remove(second->instr);
866 }
867
868 static void
869 vectorize_stores(nir_builder *b, struct vectorize_ctx *ctx,
870 struct entry *low, struct entry *high,
871 struct entry *first, struct entry *second,
872 unsigned new_bit_size, unsigned new_num_components,
873 unsigned high_start)
874 {
875 ASSERTED unsigned low_size = low->intrin->num_components * get_bit_size(low);
876 assert(low_size % new_bit_size == 0);
877
878 b->cursor = nir_before_instr(second->instr);
879
880 /* get new writemasks */
881 uint32_t low_write_mask = nir_intrinsic_write_mask(low->intrin);
882 uint32_t high_write_mask = nir_intrinsic_write_mask(high->intrin);
883 low_write_mask = update_writemask(low_write_mask, get_bit_size(low), new_bit_size);
884 high_write_mask = update_writemask(high_write_mask, get_bit_size(high), new_bit_size);
885 high_write_mask <<= high_start / new_bit_size;
886
887 uint32_t write_mask = low_write_mask | high_write_mask;
888
889 /* convert booleans */
890 nir_ssa_def *low_val = low->intrin->src[low->info->value_src].ssa;
891 nir_ssa_def *high_val = high->intrin->src[high->info->value_src].ssa;
892 low_val = low_val->bit_size == 1 ? nir_b2i(b, low_val, 32) : low_val;
893 high_val = high_val->bit_size == 1 ? nir_b2i(b, high_val, 32) : high_val;
894
895 /* combine the data */
896 nir_ssa_def *data_channels[NIR_MAX_VEC_COMPONENTS];
897 for (unsigned i = 0; i < new_num_components; i++) {
898 bool set_low = low_write_mask & (1 << i);
899 bool set_high = high_write_mask & (1 << i);
900
901 if (set_low && (!set_high || low == second)) {
902 unsigned offset = i * new_bit_size;
903 data_channels[i] = nir_extract_bits(b, &low_val, 1, offset, 1, new_bit_size);
904 } else if (set_high) {
905 assert(!set_low || high == second);
906 unsigned offset = i * new_bit_size - high_start;
907 data_channels[i] = nir_extract_bits(b, &high_val, 1, offset, 1, new_bit_size);
908 } else {
909 data_channels[i] = nir_ssa_undef(b, 1, new_bit_size);
910 }
911 }
912 nir_ssa_def *data = nir_vec(b, data_channels, new_num_components);
913
914 /* update the intrinsic */
915 nir_intrinsic_set_write_mask(second->intrin, write_mask);
916 second->intrin->num_components = data->num_components;
917
918 const struct intrinsic_info *info = second->info;
919 assert(info->value_src >= 0);
920 nir_instr_rewrite_src(second->instr, &second->intrin->src[info->value_src],
921 nir_src_for_ssa(data));
922
923 /* update the offset */
924 if (second != low && info->base_src >= 0)
925 nir_instr_rewrite_src(second->instr, &second->intrin->src[info->base_src],
926 low->intrin->src[info->base_src]);
927
928 /* update the deref */
929 if (info->deref_src >= 0) {
930 b->cursor = nir_before_instr(second->instr);
931 second->deref = cast_deref(b, new_num_components, new_bit_size,
932 nir_src_as_deref(low->intrin->src[info->deref_src]));
933 nir_instr_rewrite_src(second->instr, &second->intrin->src[info->deref_src],
934 nir_src_for_ssa(&second->deref->dest.ssa));
935 }
936
937 /* update base/align */
938 bool has_base_index =
939 nir_intrinsic_infos[second->intrin->intrinsic].index_map[NIR_INTRINSIC_BASE];
940
941 if (second != low && has_base_index)
942 nir_intrinsic_set_base(second->intrin, nir_intrinsic_base(low->intrin));
943
944 second->key = low->key;
945 second->offset = low->offset;
946 second->best_align = get_best_align(low);
947
948 update_align(second);
949
950 list_del(&first->head);
951 nir_instr_remove(first->instr);
952 }
953
954 /* Returns true if it can prove that "a" and "b" point to different resources. */
955 static bool
956 resources_different(nir_ssa_def *a, nir_ssa_def *b)
957 {
958 if (!a || !b)
959 return false;
960
961 if (a->parent_instr->type == nir_instr_type_load_const &&
962 b->parent_instr->type == nir_instr_type_load_const) {
963 return nir_src_as_uint(nir_src_for_ssa(a)) != nir_src_as_uint(nir_src_for_ssa(b));
964 }
965
966 if (a->parent_instr->type == nir_instr_type_intrinsic &&
967 b->parent_instr->type == nir_instr_type_intrinsic) {
968 nir_intrinsic_instr *aintrin = nir_instr_as_intrinsic(a->parent_instr);
969 nir_intrinsic_instr *bintrin = nir_instr_as_intrinsic(b->parent_instr);
970 if (aintrin->intrinsic == nir_intrinsic_vulkan_resource_index &&
971 bintrin->intrinsic == nir_intrinsic_vulkan_resource_index) {
972 return nir_intrinsic_desc_set(aintrin) != nir_intrinsic_desc_set(bintrin) ||
973 nir_intrinsic_binding(aintrin) != nir_intrinsic_binding(bintrin) ||
974 resources_different(aintrin->src[0].ssa, bintrin->src[0].ssa);
975 }
976 }
977
978 return false;
979 }
980
981 static int64_t
982 compare_entries(struct entry *a, struct entry *b)
983 {
984 if (!entry_key_equals(a->key, b->key))
985 return INT64_MAX;
986 return b->offset_signed - a->offset_signed;
987 }
988
989 static bool
990 may_alias(struct entry *a, struct entry *b)
991 {
992 assert(mode_to_index(get_variable_mode(a)) ==
993 mode_to_index(get_variable_mode(b)));
994
995 /* if the resources/variables are definitively different and both have
996 * ACCESS_RESTRICT, we can assume they do not alias. */
997 bool res_different = a->key->var != b->key->var ||
998 resources_different(a->key->resource, b->key->resource);
999 if (res_different && (a->access & ACCESS_RESTRICT) && (b->access & ACCESS_RESTRICT))
1000 return false;
1001
1002 /* we can't compare offsets if the resources/variables might be different */
1003 if (a->key->var != b->key->var || a->key->resource != b->key->resource)
1004 return true;
1005
1006 /* use adjacency information */
1007 /* TODO: we can look closer at the entry keys */
1008 int64_t diff = compare_entries(a, b);
1009 if (diff != INT64_MAX) {
1010 /* with atomics, intrin->num_components can be 0 */
1011 if (diff < 0)
1012 return llabs(diff) < MAX2(b->intrin->num_components, 1u) * (get_bit_size(b) / 8u);
1013 else
1014 return diff < MAX2(a->intrin->num_components, 1u) * (get_bit_size(a) / 8u);
1015 }
1016
1017 /* TODO: we can use deref information */
1018
1019 return true;
1020 }
1021
1022 static bool
1023 check_for_aliasing(struct vectorize_ctx *ctx, struct entry *first, struct entry *second)
1024 {
1025 nir_variable_mode mode = get_variable_mode(first);
1026 if (mode & (nir_var_uniform | nir_var_system_value |
1027 nir_var_mem_push_const | nir_var_mem_ubo))
1028 return false;
1029
1030 unsigned mode_index = mode_to_index(mode);
1031 if (first->is_store) {
1032 /* find first entry that aliases "first" */
1033 list_for_each_entry_from(struct entry, next, first, &ctx->entries[mode_index], head) {
1034 if (next == first)
1035 continue;
1036 if (next == second)
1037 return false;
1038 if (may_alias(first, next))
1039 return true;
1040 }
1041 } else {
1042 /* find previous store that aliases this load */
1043 list_for_each_entry_from_rev(struct entry, prev, second, &ctx->entries[mode_index], head) {
1044 if (prev == second)
1045 continue;
1046 if (prev == first)
1047 return false;
1048 if (prev->is_store && may_alias(second, prev))
1049 return true;
1050 }
1051 }
1052
1053 return false;
1054 }
1055
1056 static bool
1057 check_for_robustness(struct vectorize_ctx *ctx, struct entry *low)
1058 {
1059 nir_variable_mode mode = get_variable_mode(low);
1060 if (mode & ctx->robust_modes) {
1061 unsigned low_bit_size = get_bit_size(low);
1062 unsigned low_size = low->intrin->num_components * low_bit_size;
1063
1064 /* don't attempt to vectorize accesses if the offset can overflow. */
1065 /* TODO: handle indirect accesses. */
1066 return low->offset_signed < 0 && low->offset_signed + low_size >= 0;
1067 }
1068
1069 return false;
1070 }
1071
/* True for a vector type that has a non-zero explicit stride which differs
 * from the byte size of its scalar element. */
static bool
is_strided_vector(const struct glsl_type *type)
{
   if (!glsl_type_is_vector(type))
      return false;

   const unsigned stride = glsl_get_explicit_stride(type);
   if (stride == 0)
      return false;

   return stride != type_scalar_size_bytes(glsl_get_array_element(type));
}
1083
1084 static bool
1085 try_vectorize(nir_function_impl *impl, struct vectorize_ctx *ctx,
1086 struct entry *low, struct entry *high,
1087 struct entry *first, struct entry *second)
1088 {
1089 if (!(get_variable_mode(first) & ctx->modes) ||
1090 !(get_variable_mode(second) & ctx->modes))
1091 return false;
1092
1093 if (check_for_aliasing(ctx, first, second))
1094 return false;
1095
1096 if (check_for_robustness(ctx, low))
1097 return false;
1098
1099 /* we can only vectorize non-volatile loads/stores of the same type and with
1100 * the same access */
1101 if (first->info != second->info || first->access != second->access ||
1102 (first->access & ACCESS_VOLATILE) || first->info->is_atomic)
1103 return false;
1104
1105 /* don't attempt to vectorize accesses of row-major matrix columns */
1106 if (first->deref) {
1107 const struct glsl_type *first_type = first->deref->type;
1108 const struct glsl_type *second_type = second->deref->type;
1109 if (is_strided_vector(first_type) || is_strided_vector(second_type))
1110 return false;
1111 }
1112
1113 /* gather information */
1114 uint64_t diff = high->offset_signed - low->offset_signed;
1115 unsigned low_bit_size = get_bit_size(low);
1116 unsigned high_bit_size = get_bit_size(high);
1117 unsigned low_size = low->intrin->num_components * low_bit_size;
1118 unsigned high_size = high->intrin->num_components * high_bit_size;
1119 unsigned new_size = MAX2(diff * 8u + high_size, low_size);
1120
1121 /* find a good bit size for the new load/store */
1122 unsigned new_bit_size = 0;
1123 if (new_bitsize_acceptable(ctx, low_bit_size, low, high, new_size)) {
1124 new_bit_size = low_bit_size;
1125 } else if (low_bit_size != high_bit_size &&
1126 new_bitsize_acceptable(ctx, high_bit_size, low, high, new_size)) {
1127 new_bit_size = high_bit_size;
1128 } else {
1129 new_bit_size = 64;
1130 for (; new_bit_size >= 8; new_bit_size /= 2) {
1131 /* don't repeat trying out bitsizes */
1132 if (new_bit_size == low_bit_size || new_bit_size == high_bit_size)
1133 continue;
1134 if (new_bitsize_acceptable(ctx, new_bit_size, low, high, new_size))
1135 break;
1136 }
1137 if (new_bit_size < 8)
1138 return false;
1139 }
1140 unsigned new_num_components = new_size / new_bit_size;
1141
1142 /* vectorize the loads/stores */
1143 nir_builder b;
1144 nir_builder_init(&b, impl);
1145
1146 if (first->is_store)
1147 vectorize_stores(&b, ctx, low, high, first, second,
1148 new_bit_size, new_num_components, diff * 8u);
1149 else
1150 vectorize_loads(&b, ctx, low, high, first, second,
1151 new_bit_size, new_num_components, diff * 8u);
1152
1153 return true;
1154 }
1155
1156 static bool
1157 vectorize_entries(struct vectorize_ctx *ctx, nir_function_impl *impl, struct hash_table *ht)
1158 {
1159 if (!ht)
1160 return false;
1161
1162 bool progress = false;
1163 hash_table_foreach(ht, entry) {
1164 struct util_dynarray *arr = entry->data;
1165 if (!arr->size)
1166 continue;
1167
1168 qsort(util_dynarray_begin(arr),
1169 util_dynarray_num_elements(arr, struct entry *),
1170 sizeof(struct entry *), &sort_entries);
1171
1172 unsigned i = 0;
1173 for (; i < util_dynarray_num_elements(arr, struct entry*) - 1; i++) {
1174 struct entry *low = *util_dynarray_element(arr, struct entry *, i);
1175 struct entry *high = *util_dynarray_element(arr, struct entry *, i + 1);
1176
1177 uint64_t diff = high->offset_signed - low->offset_signed;
1178 if (diff > get_bit_size(low) / 8u * low->intrin->num_components) {
1179 progress |= update_align(low);
1180 continue;
1181 }
1182
1183 struct entry *first = low->index < high->index ? low : high;
1184 struct entry *second = low->index < high->index ? high : low;
1185
1186 if (try_vectorize(impl, ctx, low, high, first, second)) {
1187 *util_dynarray_element(arr, struct entry *, i) = NULL;
1188 *util_dynarray_element(arr, struct entry *, i + 1) = low->is_store ? second : first;
1189 progress = true;
1190 } else {
1191 progress |= update_align(low);
1192 }
1193 }
1194
1195 struct entry *last = *util_dynarray_element(arr, struct entry *, i);
1196 progress |= update_align(last);
1197 }
1198
1199 _mesa_hash_table_clear(ht, delete_entry_dynarray);
1200
1201 return progress;
1202 }
1203
1204 static bool
1205 handle_barrier(struct vectorize_ctx *ctx, bool *progress, nir_function_impl *impl, nir_instr *instr)
1206 {
1207 unsigned modes = 0;
1208 bool acquire = true;
1209 bool release = true;
1210 if (instr->type == nir_instr_type_intrinsic) {
1211 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
1212 switch (intrin->intrinsic) {
1213 case nir_intrinsic_group_memory_barrier:
1214 case nir_intrinsic_memory_barrier:
1215 modes = nir_var_mem_ssbo | nir_var_mem_shared | nir_var_mem_global;
1216 break;
1217 /* prevent speculative loads/stores */
1218 case nir_intrinsic_discard_if:
1219 case nir_intrinsic_discard:
1220 modes = nir_var_all;
1221 break;
1222 case nir_intrinsic_memory_barrier_buffer:
1223 modes = nir_var_mem_ssbo | nir_var_mem_global;
1224 break;
1225 case nir_intrinsic_memory_barrier_shared:
1226 modes = nir_var_mem_shared;
1227 break;
1228 case nir_intrinsic_scoped_barrier:
1229 if (nir_intrinsic_memory_scope(intrin) == NIR_SCOPE_NONE)
1230 break;
1231
1232 modes = nir_intrinsic_memory_modes(intrin);
1233 acquire = nir_intrinsic_memory_semantics(intrin) & NIR_MEMORY_ACQUIRE;
1234 release = nir_intrinsic_memory_semantics(intrin) & NIR_MEMORY_RELEASE;
1235 switch (nir_intrinsic_memory_scope(intrin)) {
1236 case NIR_SCOPE_INVOCATION:
1237 case NIR_SCOPE_SUBGROUP:
1238 /* a barier should never be required for correctness with these scopes */
1239 modes = 0;
1240 break;
1241 default:
1242 break;
1243 }
1244 break;
1245 default:
1246 return false;
1247 }
1248 } else if (instr->type == nir_instr_type_call) {
1249 modes = nir_var_all;
1250 } else {
1251 return false;
1252 }
1253
1254 while (modes) {
1255 unsigned mode_index = u_bit_scan(&modes);
1256 if ((1 << mode_index) == nir_var_mem_global) {
1257 /* Global should be rolled in with SSBO */
1258 assert(list_is_empty(&ctx->entries[mode_index]));
1259 assert(ctx->loads[mode_index] == NULL);
1260 assert(ctx->stores[mode_index] == NULL);
1261 continue;
1262 }
1263
1264 if (acquire)
1265 *progress |= vectorize_entries(ctx, impl, ctx->loads[mode_index]);
1266 if (release)
1267 *progress |= vectorize_entries(ctx, impl, ctx->stores[mode_index]);
1268 }
1269
1270 return true;
1271 }
1272
1273 static bool
1274 process_block(nir_function_impl *impl, struct vectorize_ctx *ctx, nir_block *block)
1275 {
1276 bool progress = false;
1277
1278 for (unsigned i = 0; i < nir_num_variable_modes; i++) {
1279 list_inithead(&ctx->entries[i]);
1280 if (ctx->loads[i])
1281 _mesa_hash_table_clear(ctx->loads[i], delete_entry_dynarray);
1282 if (ctx->stores[i])
1283 _mesa_hash_table_clear(ctx->stores[i], delete_entry_dynarray);
1284 }
1285
1286 /* create entries */
1287 unsigned next_index = 0;
1288
1289 nir_foreach_instr_safe(instr, block) {
1290 if (handle_barrier(ctx, &progress, impl, instr))
1291 continue;
1292
1293 /* gather information */
1294 if (instr->type != nir_instr_type_intrinsic)
1295 continue;
1296 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
1297
1298 const struct intrinsic_info *info = get_info(intrin->intrinsic);
1299 if (!info)
1300 continue;
1301
1302 nir_variable_mode mode = info->mode;
1303 if (!mode)
1304 mode = nir_src_as_deref(intrin->src[info->deref_src])->mode;
1305 if (!(mode & aliasing_modes(ctx->modes)))
1306 continue;
1307 unsigned mode_index = mode_to_index(mode);
1308
1309 /* create entry */
1310 struct entry *entry = create_entry(ctx, info, intrin);
1311 entry->index = next_index++;
1312
1313 list_addtail(&entry->head, &ctx->entries[mode_index]);
1314
1315 /* add the entry to a hash table */
1316
1317 struct hash_table *adj_ht = NULL;
1318 if (entry->is_store) {
1319 if (!ctx->stores[mode_index])
1320 ctx->stores[mode_index] = _mesa_hash_table_create(ctx, &hash_entry_key, &entry_key_equals);
1321 adj_ht = ctx->stores[mode_index];
1322 } else {
1323 if (!ctx->loads[mode_index])
1324 ctx->loads[mode_index] = _mesa_hash_table_create(ctx, &hash_entry_key, &entry_key_equals);
1325 adj_ht = ctx->loads[mode_index];
1326 }
1327
1328 uint32_t key_hash = hash_entry_key(entry->key);
1329 struct hash_entry *adj_entry = _mesa_hash_table_search_pre_hashed(adj_ht, key_hash, entry->key);
1330 struct util_dynarray *arr;
1331 if (adj_entry && adj_entry->data) {
1332 arr = (struct util_dynarray *)adj_entry->data;
1333 } else {
1334 arr = ralloc(ctx, struct util_dynarray);
1335 util_dynarray_init(arr, arr);
1336 _mesa_hash_table_insert_pre_hashed(adj_ht, key_hash, entry->key, arr);
1337 }
1338 util_dynarray_append(arr, struct entry *, entry);
1339 }
1340
1341 /* sort and combine entries */
1342 for (unsigned i = 0; i < nir_num_variable_modes; i++) {
1343 progress |= vectorize_entries(ctx, impl, ctx->loads[i]);
1344 progress |= vectorize_entries(ctx, impl, ctx->stores[i]);
1345 }
1346
1347 return progress;
1348 }
1349
1350 bool
1351 nir_opt_load_store_vectorize(nir_shader *shader, nir_variable_mode modes,
1352 nir_should_vectorize_mem_func callback,
1353 nir_variable_mode robust_modes)
1354 {
1355 bool progress = false;
1356
1357 struct vectorize_ctx *ctx = rzalloc(NULL, struct vectorize_ctx);
1358 ctx->modes = modes;
1359 ctx->callback = callback;
1360 ctx->robust_modes = robust_modes;
1361
1362 nir_shader_index_vars(shader, modes);
1363
1364 nir_foreach_function(function, shader) {
1365 if (function->impl) {
1366 if (modes & nir_var_function_temp)
1367 nir_function_impl_index_vars(function->impl);
1368
1369 nir_foreach_block(block, function->impl)
1370 progress |= process_block(function->impl, ctx, block);
1371
1372 nir_metadata_preserve(function->impl,
1373 nir_metadata_block_index |
1374 nir_metadata_dominance |
1375 nir_metadata_live_ssa_defs);
1376 }
1377 }
1378
1379 ralloc_free(ctx);
1380 return progress;
1381 }