nir/load_store_vectorize: Fix shared atomic info
[mesa.git] src/compiler/nir/nir_opt_load_store_vectorize.c
1 /*
2 * Copyright © 2019 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /**
25 * Although it's called a load/store "vectorization" pass, this also combines
26 * intersecting and identical loads/stores. It currently supports derefs, ubo,
27 * ssbo, shared and push constant loads/stores.
28 *
29 * This doesn't handle copy_deref intrinsics and assumes that
30 * nir_lower_alu_to_scalar() has been called and that the IR is free from ALU
31 * modifiers. It also assumes that derefs have explicitly laid out types.
32 *
33 * After vectorization, the backend may want to call nir_lower_alu_to_scalar()
34 * and nir_lower_pack(). This pass also creates cast instructions with derefs
35 * as sources, which some parts of NIR may not be able to handle well.
36 *
37 * There are a few situations where this doesn't vectorize as well as it could:
38 * - It won't turn four consecutive vec3 loads into 3 vec4 loads.
39 * - It doesn't do global vectorization.
40 * Handling these cases probably wouldn't provide much benefit though.
41 *
42 * This probably doesn't handle big-endian GPUs correctly.
43 */
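/*
 * A rough usage sketch (illustrative only; the callback name and the 128-bit
 * limit are assumptions, and the parameter order simply mirrors the callback
 * invocation in new_bitsize_acceptable() below):
 *
 *    static bool
 *    mem_vectorize_cb(unsigned align, unsigned bit_size,
 *                     unsigned num_components, unsigned high_offset,
 *                     nir_intrinsic_instr *low, nir_intrinsic_instr *high)
 *    {
 *       return num_components * bit_size <= 128 &&
 *              align % (bit_size / 8u) == 0;
 *    }
 *
 *    nir_opt_load_store_vectorize(shader, nir_var_mem_ubo | nir_var_mem_ssbo |
 *                                 nir_var_mem_push_const | nir_var_mem_shared,
 *                                 mem_vectorize_cb);
 */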
44
45 #include "nir.h"
46 #include "nir_deref.h"
47 #include "nir_builder.h"
48 #include "nir_worklist.h"
49 #include "util/u_dynarray.h"
50
51 #include <stdlib.h>
52
53 struct intrinsic_info {
54 nir_variable_mode mode; /* 0 if the mode is obtained from the deref. */
55 nir_intrinsic_op op;
56 bool is_atomic;
57 /* Indices into nir_intrinsic::src[] or -1 if not applicable. */
58 int resource_src; /* resource (e.g. from vulkan_resource_index) */
59 int base_src; /* the offset source it loads/stores from */
60 int deref_src; /* the deref it loads/stores from */
61 int value_src; /* the data it is storing */
62 };
63
64 static const struct intrinsic_info *
65 get_info(nir_intrinsic_op op) {
66 switch (op) {
67 #define INFO(mode, op, atomic, res, base, deref, val) \
68 case nir_intrinsic_##op: {\
69 static const struct intrinsic_info op##_info = {mode, nir_intrinsic_##op, atomic, res, base, deref, val};\
70 return &op##_info;\
71 }
72 #define LOAD(mode, op, res, base, deref) INFO(mode, load_##op, false, res, base, deref, -1)
73 #define STORE(mode, op, res, base, deref, val) INFO(mode, store_##op, false, res, base, deref, val)
74 #define ATOMIC(mode, type, op, res, base, deref, val) INFO(mode, type##_atomic_##op, true, res, base, deref, val)
75 LOAD(nir_var_mem_push_const, push_constant, -1, 0, -1)
76 LOAD(nir_var_mem_ubo, ubo, 0, 1, -1)
77 LOAD(nir_var_mem_ssbo, ssbo, 0, 1, -1)
78 STORE(nir_var_mem_ssbo, ssbo, 1, 2, -1, 0)
79 LOAD(0, deref, -1, -1, 0)
80 STORE(0, deref, -1, -1, 0, 1)
81 LOAD(nir_var_mem_shared, shared, -1, 0, -1)
82 STORE(nir_var_mem_shared, shared, -1, 1, -1, 0)
83 ATOMIC(nir_var_mem_ssbo, ssbo, add, 0, 1, -1, 2)
84 ATOMIC(nir_var_mem_ssbo, ssbo, imin, 0, 1, -1, 2)
85 ATOMIC(nir_var_mem_ssbo, ssbo, umin, 0, 1, -1, 2)
86 ATOMIC(nir_var_mem_ssbo, ssbo, imax, 0, 1, -1, 2)
87 ATOMIC(nir_var_mem_ssbo, ssbo, umax, 0, 1, -1, 2)
88 ATOMIC(nir_var_mem_ssbo, ssbo, and, 0, 1, -1, 2)
89 ATOMIC(nir_var_mem_ssbo, ssbo, or, 0, 1, -1, 2)
90 ATOMIC(nir_var_mem_ssbo, ssbo, xor, 0, 1, -1, 2)
91 ATOMIC(nir_var_mem_ssbo, ssbo, exchange, 0, 1, -1, 2)
92 ATOMIC(nir_var_mem_ssbo, ssbo, comp_swap, 0, 1, -1, 2)
93 ATOMIC(nir_var_mem_ssbo, ssbo, fadd, 0, 1, -1, 2)
94 ATOMIC(nir_var_mem_ssbo, ssbo, fmin, 0, 1, -1, 2)
95 ATOMIC(nir_var_mem_ssbo, ssbo, fmax, 0, 1, -1, 2)
96 ATOMIC(nir_var_mem_ssbo, ssbo, fcomp_swap, 0, 1, -1, 2)
97 ATOMIC(0, deref, add, -1, -1, 0, 1)
98 ATOMIC(0, deref, imin, -1, -1, 0, 1)
99 ATOMIC(0, deref, umin, -1, -1, 0, 1)
100 ATOMIC(0, deref, imax, -1, -1, 0, 1)
101 ATOMIC(0, deref, umax, -1, -1, 0, 1)
102 ATOMIC(0, deref, and, -1, -1, 0, 1)
103 ATOMIC(0, deref, or, -1, -1, 0, 1)
104 ATOMIC(0, deref, xor, -1, -1, 0, 1)
105 ATOMIC(0, deref, exchange, -1, -1, 0, 1)
106 ATOMIC(0, deref, comp_swap, -1, -1, 0, 1)
107 ATOMIC(0, deref, fadd, -1, -1, 0, 1)
108 ATOMIC(0, deref, fmin, -1, -1, 0, 1)
109 ATOMIC(0, deref, fmax, -1, -1, 0, 1)
110 ATOMIC(0, deref, fcomp_swap, -1, -1, 0, 1)
111 ATOMIC(nir_var_mem_shared, shared, add, -1, 0, -1, 1)
112 ATOMIC(nir_var_mem_shared, shared, imin, -1, 0, -1, 1)
113 ATOMIC(nir_var_mem_shared, shared, umin, -1, 0, -1, 1)
114 ATOMIC(nir_var_mem_shared, shared, imax, -1, 0, -1, 1)
115 ATOMIC(nir_var_mem_shared, shared, umax, -1, 0, -1, 1)
116 ATOMIC(nir_var_mem_shared, shared, and, -1, 0, -1, 1)
117 ATOMIC(nir_var_mem_shared, shared, or, -1, 0, -1, 1)
118 ATOMIC(nir_var_mem_shared, shared, xor, -1, 0, -1, 1)
119 ATOMIC(nir_var_mem_shared, shared, exchange, -1, 0, -1, 1)
120 ATOMIC(nir_var_mem_shared, shared, comp_swap, -1, 0, -1, 1)
121 ATOMIC(nir_var_mem_shared, shared, fadd, -1, 0, -1, 1)
122 ATOMIC(nir_var_mem_shared, shared, fmin, -1, 0, -1, 1)
123 ATOMIC(nir_var_mem_shared, shared, fmax, -1, 0, -1, 1)
124 ATOMIC(nir_var_mem_shared, shared, fcomp_swap, -1, 0, -1, 1)
125 default:
126 break;
127 #undef ATOMIC
128 #undef STORE
129 #undef LOAD
130 #undef INFO
131 }
132 return NULL;
133 }
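/*
 * Reading one row of the table above as an example: the
 * STORE(nir_var_mem_ssbo, ssbo, 1, 2, -1, 0) entry says that for store_ssbo
 * the value is src[0], the resource is src[1] and the offset is src[2], with
 * no deref source. The shared atomics likewise take their offset from src[0]
 * and their data from src[1], which is what their -1, 0, -1, 1 columns encode.
 */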
134
135 /*
136 * Information used to compare memory operations.
137 * It canonically represents an offset as:
138 * `offset_defs[0]*offset_defs_mul[0] + offset_defs[1]*offset_defs_mul[1] + ...`
139 * "offset_defs" is sorted in ascenting order by the ssa definition's index.
140 * "resource" or "var" may be NULL.
141 */
142 struct entry_key {
143 nir_ssa_def *resource;
144 nir_variable *var;
145 unsigned offset_def_count;
146 nir_ssa_def **offset_defs;
147 uint64_t *offset_defs_mul;
148 };
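/*
 * Worked example (with hypothetical SSA defs a and b): an access whose offset
 * is computed as "a*16 + b*4 + 8" gets offset_defs = {a, b} and
 * offset_defs_mul = {16, 4}, while the constant 8 is folded into
 * struct entry::offset below. Accesses with equal keys can then be ordered
 * and merged purely by comparing their constant offsets.
 */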
149
150 /* Information on a single memory operation. */
151 struct entry {
152 struct list_head head;
153 unsigned index;
154
155 struct entry_key *key;
156 union {
157 uint64_t offset; /* sign-extended */
158 int64_t offset_signed;
159 };
160 uint32_t best_align;
161
162 nir_instr *instr;
163 nir_intrinsic_instr *intrin;
164 const struct intrinsic_info *info;
165 enum gl_access_qualifier access;
166 bool is_store;
167
168 nir_deref_instr *deref;
169 };
170
171 struct vectorize_ctx {
172 nir_variable_mode modes;
173 nir_should_vectorize_mem_func callback;
174 struct list_head entries[nir_num_variable_modes];
175 struct hash_table *loads[nir_num_variable_modes];
176 struct hash_table *stores[nir_num_variable_modes];
177 };
178
179 static uint32_t hash_entry_key(const void *key_)
180 {
181 /* this is careful to not include pointers in the hash calculation so that
182 * the order of the hash table walk is deterministic */
183 struct entry_key *key = (struct entry_key*)key_;
184
185 uint32_t hash = _mesa_fnv32_1a_offset_bias;
186 if (key->resource)
187 hash = _mesa_fnv32_1a_accumulate(hash, key->resource->index);
188 if (key->var) {
189 hash = _mesa_fnv32_1a_accumulate(hash, key->var->index);
190 unsigned mode = key->var->data.mode;
191 hash = _mesa_fnv32_1a_accumulate(hash, mode);
192 }
193
194 for (unsigned i = 0; i < key->offset_def_count; i++)
195 hash = _mesa_fnv32_1a_accumulate(hash, key->offset_defs[i]->index);
196
197 hash = _mesa_fnv32_1a_accumulate_block(
198 hash, key->offset_defs_mul, key->offset_def_count * sizeof(uint64_t));
199
200 return hash;
201 }
202
203 static bool entry_key_equals(const void *a_, const void *b_)
204 {
205 struct entry_key *a = (struct entry_key*)a_;
206 struct entry_key *b = (struct entry_key*)b_;
207
208 if (a->var != b->var || a->resource != b->resource)
209 return false;
210
211 if (a->offset_def_count != b->offset_def_count)
212 return false;
213
214 size_t offset_def_size = a->offset_def_count * sizeof(nir_ssa_def *);
215 size_t offset_def_mul_size = a->offset_def_count * sizeof(uint64_t);
216 if (a->offset_def_count &&
217 (memcmp(a->offset_defs, b->offset_defs, offset_def_size) ||
218 memcmp(a->offset_defs_mul, b->offset_defs_mul, offset_def_mul_size)))
219 return false;
220
221 return true;
222 }
223
224 static void delete_entry_dynarray(struct hash_entry *entry)
225 {
226 struct util_dynarray *arr = (struct util_dynarray *)entry->data;
227 ralloc_free(arr);
228 }
229
230 static int sort_entries(const void *a_, const void *b_)
231 {
232 struct entry *a = *(struct entry*const*)a_;
233 struct entry *b = *(struct entry*const*)b_;
234
235 if (a->offset_signed > b->offset_signed)
236 return 1;
237 else if (a->offset_signed < b->offset_signed)
238 return -1;
239 else
240 return 0;
241 }
242
243 static unsigned
244 get_bit_size(struct entry *entry)
245 {
246 unsigned size = entry->is_store ?
247 entry->intrin->src[entry->info->value_src].ssa->bit_size :
248 entry->intrin->dest.ssa.bit_size;
249 return size == 1 ? 32u : size;
250 }
251
252 /* If "def" is from an alu instruction with the opcode "op" and one of it's
253 * sources is a constant, update "def" to be the non-constant source, fill "c"
254 * with the constant and return true. */
255 static bool
256 parse_alu(nir_ssa_def **def, nir_op op, uint64_t *c)
257 {
258 nir_ssa_scalar scalar;
259 scalar.def = *def;
260 scalar.comp = 0;
261
262 if (!nir_ssa_scalar_is_alu(scalar) || nir_ssa_scalar_alu_op(scalar) != op)
263 return false;
264
265 nir_ssa_scalar src0 = nir_ssa_scalar_chase_alu_src(scalar, 0);
266 nir_ssa_scalar src1 = nir_ssa_scalar_chase_alu_src(scalar, 1);
267 if (op != nir_op_ishl && nir_ssa_scalar_is_const(src0) && src1.comp == 0) {
268 *c = nir_ssa_scalar_as_uint(src0);
269 *def = src1.def;
270 } else if (nir_ssa_scalar_is_const(src1) && src0.comp == 0) {
271 *c = nir_ssa_scalar_as_uint(src1);
272 *def = src0.def;
273 } else {
274 return false;
275 }
276 return true;
277 }
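/*
 * For example, if *def is the result of "iadd(x, 4)" and op is nir_op_iadd,
 * this sets *c = 4, updates *def to x and returns true. For nir_op_ishl only
 * a constant second source is accepted because the operation isn't
 * commutative.
 */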
278
279 /* Parses an offset expression such as "a * 16 + 4" or "(a * 16 + 4) * 64 + 32". */
280 static void
281 parse_offset(nir_ssa_def **base, uint64_t *base_mul, uint64_t *offset)
282 {
283 if ((*base)->parent_instr->type == nir_instr_type_load_const) {
284 *offset = nir_src_comp_as_uint(nir_src_for_ssa(*base), 0);
285 *base = NULL;
286 return;
287 }
288
289 uint64_t mul = 1;
290 uint64_t add = 0;
291 bool progress = false;
292 do {
293 uint64_t mul2 = 1, add2 = 0;
294
295 progress = parse_alu(base, nir_op_imul, &mul2);
296 mul *= mul2;
297
298 mul2 = 0;
299 progress |= parse_alu(base, nir_op_ishl, &mul2);
300 mul <<= mul2;
301
302 progress |= parse_alu(base, nir_op_iadd, &add2);
303 add += add2 * mul;
304 } while (progress);
305
306 *base_mul = mul;
307 *offset = add;
308 }
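/*
 * Working through the second example from the comment above,
 * "(a * 16 + 4) * 64 + 32": the loop strips the "+ 32", then the "* 64", then
 * the inner "+ 4" and "* 16", leaving *base = a, *base_mul = 16 * 64 = 1024
 * and *offset = 32 + 4 * 64 = 288.
 */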
309
310 static unsigned
311 type_scalar_size_bytes(const struct glsl_type *type)
312 {
313 assert(glsl_type_is_vector_or_scalar(type) ||
314 glsl_type_is_matrix(type));
315 return glsl_type_is_boolean(type) ? 4u : glsl_get_bit_size(type) / 8u;
316 }
317
318 static int
319 get_array_stride(const struct glsl_type *type)
320 {
321 unsigned explicit_stride = glsl_get_explicit_stride(type);
322 if ((glsl_type_is_matrix(type) &&
323 glsl_matrix_type_is_row_major(type)) ||
324 (glsl_type_is_vector(type) && explicit_stride == 0))
325 return type_scalar_size_bytes(type);
326 return explicit_stride;
327 }
328
329 static uint64_t
330 mask_sign_extend(uint64_t val, unsigned bit_size)
331 {
332 return (int64_t)(val << (64 - bit_size)) >> (64 - bit_size);
333 }
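/* e.g. mask_sign_extend(0xff, 8) == UINT64_MAX (i.e. -1), so constants taken
 * from narrow sources keep their signed meaning in 64-bit arithmetic. */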
334
335 static unsigned
336 add_to_entry_key(nir_ssa_def **offset_defs, uint64_t *offset_defs_mul,
337 unsigned offset_def_count, nir_ssa_def *def, uint64_t mul)
338 {
339 mul = mask_sign_extend(mul, def->bit_size);
340
341 for (unsigned i = 0; i <= offset_def_count; i++) {
342 if (i == offset_def_count || def->index > offset_defs[i]->index) {
343 /* insert before i */
344 memmove(offset_defs + i + 1, offset_defs + i,
345 (offset_def_count - i) * sizeof(nir_ssa_def *));
346 memmove(offset_defs_mul + i + 1, offset_defs_mul + i,
347 (offset_def_count - i) * sizeof(uint64_t));
348 offset_defs[i] = def;
349 offset_defs_mul[i] = mul;
350 return 1;
351 } else if (def->index == offset_defs[i]->index) {
352 /* merge with offset_def at i */
353 offset_defs_mul[i] += mul;
354 return 0;
355 }
356 }
357 unreachable("Unreachable.");
358 return 0;
359 }
360
361 static struct entry_key *
362 create_entry_key_from_deref(void *mem_ctx,
363 struct vectorize_ctx *ctx,
364 nir_deref_path *path,
365 uint64_t *offset_base)
366 {
367 unsigned path_len = 0;
368 while (path->path[path_len])
369 path_len++;
370
371 nir_ssa_def *offset_defs_stack[32];
372 uint64_t offset_defs_mul_stack[32];
373 nir_ssa_def **offset_defs = offset_defs_stack;
374 uint64_t *offset_defs_mul = offset_defs_mul_stack;
375 if (path_len > 32) {
376 offset_defs = malloc(path_len * sizeof(nir_ssa_def *));
377 offset_defs_mul = malloc(path_len * sizeof(uint64_t));
378 }
379 unsigned offset_def_count = 0;
380
381 struct entry_key* key = ralloc(mem_ctx, struct entry_key);
382 key->resource = NULL;
383 key->var = NULL;
384 *offset_base = 0;
385
386 for (unsigned i = 0; i < path_len; i++) {
387 nir_deref_instr *parent = i ? path->path[i - 1] : NULL;
388 nir_deref_instr *deref = path->path[i];
389
390 switch (deref->deref_type) {
391 case nir_deref_type_var: {
392 assert(!parent);
393 key->var = deref->var;
394 break;
395 }
396 case nir_deref_type_array:
397 case nir_deref_type_ptr_as_array: {
398 assert(parent);
399 nir_ssa_def *index = deref->arr.index.ssa;
400 uint32_t stride;
401 if (deref->deref_type == nir_deref_type_ptr_as_array)
402 stride = nir_deref_instr_ptr_as_array_stride(deref);
403 else
404 stride = get_array_stride(parent->type);
405
406 nir_ssa_def *base = index;
407 uint64_t offset = 0, base_mul = 1;
408 parse_offset(&base, &base_mul, &offset);
409 offset = mask_sign_extend(offset, index->bit_size);
410
411 *offset_base += offset * stride;
412 if (base) {
413 offset_def_count += add_to_entry_key(offset_defs, offset_defs_mul,
414 offset_def_count,
415 base, base_mul * stride);
416 }
417 break;
418 }
419 case nir_deref_type_struct: {
420 assert(parent);
421 int offset = glsl_get_struct_field_offset(parent->type, deref->strct.index);
422 *offset_base += offset;
423 break;
424 }
425 case nir_deref_type_cast: {
426 if (!parent)
427 key->resource = deref->parent.ssa;
428 break;
429 }
430 default:
431 unreachable("Unhandled deref type");
432 }
433 }
434
435 key->offset_def_count = offset_def_count;
436 key->offset_defs = ralloc_array(mem_ctx, nir_ssa_def *, offset_def_count);
437 key->offset_defs_mul = ralloc_array(mem_ctx, uint64_t, offset_def_count);
438 memcpy(key->offset_defs, offset_defs, offset_def_count * sizeof(nir_ssa_def *));
439 memcpy(key->offset_defs_mul, offset_defs_mul, offset_def_count * sizeof(uint64_t));
440
441 if (offset_defs != offset_defs_stack)
442 free(offset_defs);
443 if (offset_defs_mul != offset_defs_mul_stack)
444 free(offset_defs_mul);
445
446 return key;
447 }
448
449 static unsigned
450 parse_entry_key_from_offset(struct entry_key *key, unsigned size, unsigned left,
451 nir_ssa_def *base, uint64_t base_mul, uint64_t *offset)
452 {
453 uint64_t new_mul;
454 uint64_t new_offset;
455 parse_offset(&base, &new_mul, &new_offset);
456 *offset += new_offset * base_mul;
457
458 if (!base)
459 return 0;
460
461 base_mul *= new_mul;
462
463 assert(left >= 1);
464
465 if (left >= 2) {
466 nir_ssa_scalar scalar;
467 scalar.def = base;
468 scalar.comp = 0;
469 if (nir_ssa_scalar_is_alu(scalar) && nir_ssa_scalar_alu_op(scalar) == nir_op_iadd) {
470 nir_ssa_scalar src0 = nir_ssa_scalar_chase_alu_src(scalar, 0);
471 nir_ssa_scalar src1 = nir_ssa_scalar_chase_alu_src(scalar, 1);
472 if (src0.comp == 0 && src1.comp == 0) {
473 unsigned amount = parse_entry_key_from_offset(key, size, left - 1, src0.def, base_mul, offset);
474 amount += parse_entry_key_from_offset(key, size + amount, left - amount, src1.def, base_mul, offset);
475 return amount;
476 }
477 }
478 }
479
480 return add_to_entry_key(key->offset_defs, key->offset_defs_mul, size, base, base_mul);
481 }
482
483 static struct entry_key *
484 create_entry_key_from_offset(void *mem_ctx, nir_ssa_def *base, uint64_t base_mul, uint64_t *offset)
485 {
486 struct entry_key *key = ralloc(mem_ctx, struct entry_key);
487 key->resource = NULL;
488 key->var = NULL;
489 if (base) {
490 nir_ssa_def *offset_defs[32];
491 uint64_t offset_defs_mul[32];
492 key->offset_defs = offset_defs;
493 key->offset_defs_mul = offset_defs_mul;
494
495 key->offset_def_count = parse_entry_key_from_offset(key, 0, 32, base, base_mul, offset);
496
497 key->offset_defs = ralloc_array(mem_ctx, nir_ssa_def *, key->offset_def_count);
498 key->offset_defs_mul = ralloc_array(mem_ctx, uint64_t, key->offset_def_count);
499 memcpy(key->offset_defs, offset_defs, key->offset_def_count * sizeof(nir_ssa_def *));
500 memcpy(key->offset_defs_mul, offset_defs_mul, key->offset_def_count * sizeof(uint64_t));
501 } else {
502 key->offset_def_count = 0;
503 key->offset_defs = NULL;
504 key->offset_defs_mul = NULL;
505 }
506 return key;
507 }
508
509 static nir_variable_mode
510 get_variable_mode(struct entry *entry)
511 {
512 if (entry->info->mode)
513 return entry->info->mode;
514 assert(entry->deref);
515 return entry->deref->mode;
516 }
517
518 static struct entry *
519 create_entry(struct vectorize_ctx *ctx,
520 const struct intrinsic_info *info,
521 nir_intrinsic_instr *intrin)
522 {
523 struct entry *entry = rzalloc(ctx, struct entry);
524 entry->intrin = intrin;
525 entry->instr = &intrin->instr;
526 entry->info = info;
527 entry->best_align = UINT32_MAX;
528 entry->is_store = entry->info->value_src >= 0;
529
530 if (entry->info->deref_src >= 0) {
531 entry->deref = nir_src_as_deref(intrin->src[entry->info->deref_src]);
532 nir_deref_path path;
533 nir_deref_path_init(&path, entry->deref, NULL);
534 entry->key = create_entry_key_from_deref(entry, ctx, &path, &entry->offset);
535 nir_deref_path_finish(&path);
536 } else {
537 nir_ssa_def *base = entry->info->base_src >= 0 ?
538 intrin->src[entry->info->base_src].ssa : NULL;
539 uint64_t offset = 0;
540 if (nir_intrinsic_infos[intrin->intrinsic].index_map[NIR_INTRINSIC_BASE])
541 offset += nir_intrinsic_base(intrin);
542 entry->key = create_entry_key_from_offset(entry, base, 1, &offset);
543 entry->offset = offset;
544
545 if (base)
546 entry->offset = mask_sign_extend(entry->offset, base->bit_size);
547 }
548
549 if (entry->info->resource_src >= 0)
550 entry->key->resource = intrin->src[entry->info->resource_src].ssa;
551
552 if (nir_intrinsic_infos[intrin->intrinsic].index_map[NIR_INTRINSIC_ACCESS])
553 entry->access = nir_intrinsic_access(intrin);
554 else if (entry->key->var)
555 entry->access = entry->key->var->data.access;
556
557 uint32_t restrict_modes = nir_var_shader_in | nir_var_shader_out;
558 restrict_modes |= nir_var_shader_temp | nir_var_function_temp;
559 restrict_modes |= nir_var_uniform | nir_var_mem_push_const;
560 restrict_modes |= nir_var_system_value | nir_var_mem_shared;
561 if (get_variable_mode(entry) & restrict_modes)
562 entry->access |= ACCESS_RESTRICT;
563
564 return entry;
565 }
566
567 static nir_deref_instr *
568 cast_deref(nir_builder *b, unsigned num_components, unsigned bit_size, nir_deref_instr *deref)
569 {
570 if (glsl_get_components(deref->type) == num_components &&
571 type_scalar_size_bytes(deref->type)*8u == bit_size)
572 return deref;
573
574 enum glsl_base_type types[] = {
575 GLSL_TYPE_UINT8, GLSL_TYPE_UINT16, GLSL_TYPE_UINT, GLSL_TYPE_UINT64};
576 enum glsl_base_type base = types[ffs(bit_size / 8u) - 1u];
577 const struct glsl_type *type = glsl_vector_type(base, num_components);
578
579 if (deref->type == type)
580 return deref;
581
582 return nir_build_deref_cast(b, &deref->dest.ssa, deref->mode, type, 0);
583 }
584
585 /* Return true if the write mask "write_mask" of a store with "old_bit_size"
586 * bits per element can be represented for a store with "new_bit_size" bits per
587 * element. */
588 static bool
589 writemask_representable(unsigned write_mask, unsigned old_bit_size, unsigned new_bit_size)
590 {
591 while (write_mask) {
592 int start, count;
593 u_bit_scan_consecutive_range(&write_mask, &start, &count);
594 start *= old_bit_size;
595 count *= old_bit_size;
596 if (start % new_bit_size != 0)
597 return false;
598 if (count % new_bit_size != 0)
599 return false;
600 }
601 return true;
602 }
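/*
 * For example, a 32-bit store with write_mask 0b0011 maps to a single enabled
 * 64-bit component (bits 0..63), but write_mask 0b0010 does not: its enabled
 * range starts at bit 32, which isn't 64-bit aligned.
 */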
603
604 static uint64_t
605 gcd(uint64_t a, uint64_t b)
606 {
607 while (b) {
608 uint64_t old_b = b;
609 b = a % b;
610 a = old_b;
611 }
612 return a;
613 }
614
615 static uint32_t
616 get_best_align(struct entry *entry)
617 {
618 if (entry->best_align != UINT32_MAX)
619 return entry->best_align;
620
621 uint64_t best_align = entry->offset;
622 for (unsigned i = 0; i < entry->key->offset_def_count; i++) {
623 if (!best_align)
624 best_align = entry->key->offset_defs_mul[i];
625 else if (entry->key->offset_defs_mul[i])
626 best_align = gcd(best_align, entry->key->offset_defs_mul[i]);
627 }
628
629 if (nir_intrinsic_infos[entry->intrin->intrinsic].index_map[NIR_INTRINSIC_ALIGN_MUL])
630 best_align = MAX2(best_align, nir_intrinsic_align(entry->intrin));
631
632 /* ensure the result is a power of two that fits in an int32_t */
633 entry->best_align = gcd(best_align, 1u << 30);
634
635 return entry->best_align;
636 }
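/*
 * For example, an entry with a constant offset of 24 and a single
 * offset_defs_mul of 16 gets gcd(24, 16) = 8, i.e. the access is known to be
 * 8-byte aligned whatever the value of the non-constant part of the offset.
 */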
637
638 /* Return true if "new_bit_size" is a usable bit size for a vectorized load/store
639 * of "low" and "high". */
640 static bool
641 new_bitsize_acceptable(struct vectorize_ctx *ctx, unsigned new_bit_size,
642 struct entry *low, struct entry *high, unsigned size)
643 {
644 if (size % new_bit_size != 0)
645 return false;
646
647 unsigned new_num_components = size / new_bit_size;
648 if (!nir_num_components_valid(new_num_components))
649 return false;
650
651 unsigned high_offset = high->offset_signed - low->offset_signed;
652
653 /* check nir_extract_bits limitations */
654 unsigned common_bit_size = MIN2(get_bit_size(low), get_bit_size(high));
655 common_bit_size = MIN2(common_bit_size, new_bit_size);
656 if (high_offset > 0)
657 common_bit_size = MIN2(common_bit_size, (1u << (ffs(high_offset * 8) - 1)));
658 if (new_bit_size / common_bit_size > NIR_MAX_VEC_COMPONENTS)
659 return false;
660
661 if (!ctx->callback(get_best_align(low), new_bit_size, new_num_components,
662 high_offset, low->intrin, high->intrin))
663 return false;
664
665 if (low->is_store) {
666 unsigned low_size = low->intrin->num_components * get_bit_size(low);
667 unsigned high_size = high->intrin->num_components * get_bit_size(high);
668
669 if (low_size % new_bit_size != 0)
670 return false;
671 if (high_size % new_bit_size != 0)
672 return false;
673
674 unsigned write_mask = nir_intrinsic_write_mask(low->intrin);
675 if (!writemask_representable(write_mask, low_size, new_bit_size))
676 return false;
677
678 write_mask = nir_intrinsic_write_mask(high->intrin);
679 if (!writemask_representable(write_mask, high_size, new_bit_size))
680 return false;
681 }
682
683 return true;
684 }
685
686 /* Updates a write mask, "write_mask", so that it can be used with a
687 * "new_bit_size"-bit store instead of a "old_bit_size"-bit store. */
688 static uint32_t
689 update_writemask(unsigned write_mask, unsigned old_bit_size, unsigned new_bit_size)
690 {
691 uint32_t res = 0;
692 while (write_mask) {
693 int start, count;
694 u_bit_scan_consecutive_range(&write_mask, &start, &count);
695 start = start * old_bit_size / new_bit_size;
696 count = count * old_bit_size / new_bit_size;
697 res |= ((1 << count) - 1) << start;
698 }
699 return res;
700 }
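/* e.g. converting write_mask 0b01 from a 64-bit store to a 32-bit store gives
 * 0b11: the single enabled 64-bit component covers two 32-bit components. */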
701
702 static nir_deref_instr *subtract_deref(nir_builder *b, nir_deref_instr *deref, int64_t offset)
703 {
704 /* avoid adding another deref to the path */
705 if (deref->deref_type == nir_deref_type_ptr_as_array &&
706 nir_src_is_const(deref->arr.index) &&
707 offset % nir_deref_instr_ptr_as_array_stride(deref) == 0) {
708 unsigned stride = nir_deref_instr_ptr_as_array_stride(deref);
709 nir_ssa_def *index = nir_imm_intN_t(b, nir_src_as_int(deref->arr.index) - offset / stride,
710 deref->dest.ssa.bit_size);
711 return nir_build_deref_ptr_as_array(b, nir_deref_instr_parent(deref), index);
712 }
713
714 if (deref->deref_type == nir_deref_type_array &&
715 nir_src_is_const(deref->arr.index)) {
716 nir_deref_instr *parent = nir_deref_instr_parent(deref);
717 unsigned stride = glsl_get_explicit_stride(parent->type);
718 if (offset % stride == 0)
719 return nir_build_deref_array_imm(
720 b, parent, nir_src_as_int(deref->arr.index) - offset / stride);
721 }
722
723
724 deref = nir_build_deref_cast(b, &deref->dest.ssa, deref->mode,
725 glsl_scalar_type(GLSL_TYPE_UINT8), 1);
726 return nir_build_deref_ptr_as_array(
727 b, deref, nir_imm_intN_t(b, -offset, deref->dest.ssa.bit_size));
728 }
729
730 static bool update_align(struct entry *entry)
731 {
732 bool has_align_index =
733 nir_intrinsic_infos[entry->intrin->intrinsic].index_map[NIR_INTRINSIC_ALIGN_MUL];
734 if (has_align_index) {
735 unsigned align = get_best_align(entry);
736 if (align != nir_intrinsic_align(entry->intrin)) {
737 nir_intrinsic_set_align(entry->intrin, align, 0);
738 return true;
739 }
740 }
741 return false;
742 }
743
744 static void
745 vectorize_loads(nir_builder *b, struct vectorize_ctx *ctx,
746 struct entry *low, struct entry *high,
747 struct entry *first, struct entry *second,
748 unsigned new_bit_size, unsigned new_num_components,
749 unsigned high_start)
750 {
751 unsigned low_bit_size = get_bit_size(low);
752 unsigned high_bit_size = get_bit_size(high);
753 bool low_bool = low->intrin->dest.ssa.bit_size == 1;
754 bool high_bool = high->intrin->dest.ssa.bit_size == 1;
755 nir_ssa_def *data = &first->intrin->dest.ssa;
756
757 b->cursor = nir_after_instr(first->instr);
758
759 /* update the load's destination size and extract data for each of the original loads */
760 data->num_components = new_num_components;
761 data->bit_size = new_bit_size;
762
763 nir_ssa_def *low_def = nir_extract_bits(
764 b, &data, 1, 0, low->intrin->num_components, low_bit_size);
765 nir_ssa_def *high_def = nir_extract_bits(
766 b, &data, 1, high_start, high->intrin->num_components, high_bit_size);
767
768 /* convert booleans */
769 low_def = low_bool ? nir_i2b(b, low_def) : nir_mov(b, low_def);
770 high_def = high_bool ? nir_i2b(b, high_def) : nir_mov(b, high_def);
771
772 /* update uses */
773 if (first == low) {
774 nir_ssa_def_rewrite_uses_after(&low->intrin->dest.ssa, nir_src_for_ssa(low_def),
775 high_def->parent_instr);
776 nir_ssa_def_rewrite_uses(&high->intrin->dest.ssa, nir_src_for_ssa(high_def));
777 } else {
778 nir_ssa_def_rewrite_uses(&low->intrin->dest.ssa, nir_src_for_ssa(low_def));
779 nir_ssa_def_rewrite_uses_after(&high->intrin->dest.ssa, nir_src_for_ssa(high_def),
780 high_def->parent_instr);
781 }
782
783 /* update the intrinsic */
784 first->intrin->num_components = new_num_components;
785
786 const struct intrinsic_info *info = first->info;
787
788 /* update the offset */
789 if (first != low && info->base_src >= 0) {
790 /* let nir_opt_algebraic() remove this addition. Subtracting 16 from
791 * expressions like "(i + 1) * 16" doesn't cause many issues because
792 * nir_opt_algebraic() turns them into "i * 16 + 16" */
793 b->cursor = nir_before_instr(first->instr);
794
795 nir_ssa_def *new_base = first->intrin->src[info->base_src].ssa;
796 new_base = nir_iadd(b, new_base, nir_imm_int(b, -(high_start / 8u)));
797
798 nir_instr_rewrite_src(first->instr, &first->intrin->src[info->base_src],
799 nir_src_for_ssa(new_base));
800 }
801
802 /* update the deref */
803 if (info->deref_src >= 0) {
804 b->cursor = nir_before_instr(first->instr);
805
806 nir_deref_instr *deref = nir_src_as_deref(first->intrin->src[info->deref_src]);
807 if (first != low && high_start != 0)
808 deref = subtract_deref(b, deref, high_start / 8u);
809 first->deref = cast_deref(b, new_num_components, new_bit_size, deref);
810
811 nir_instr_rewrite_src(first->instr, &first->intrin->src[info->deref_src],
812 nir_src_for_ssa(&first->deref->dest.ssa));
813 }
814
815 /* update base/align */
816 bool has_base_index =
817 nir_intrinsic_infos[first->intrin->intrinsic].index_map[NIR_INTRINSIC_BASE];
818
819 if (first != low && has_base_index)
820 nir_intrinsic_set_base(first->intrin, nir_intrinsic_base(low->intrin));
821
822 first->key = low->key;
823 first->offset = low->offset;
824 first->best_align = get_best_align(low);
825
826 update_align(first);
827
828 nir_instr_remove(second->instr);
829 }
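/*
 * Rough sketch of the load case (illustrative pseudo-NIR, the real output
 * differs in details):
 *
 *    a = load_ssbo(res, 0)   ; 2x32
 *    b = load_ssbo(res, 8)   ; 2x32
 * becomes
 *    v = load_ssbo(res, 0)   ; 4x32
 *    a = bits [0, 64)   of v  (via nir_extract_bits)
 *    b = bits [64, 128) of v
 */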
830
831 static void
832 vectorize_stores(nir_builder *b, struct vectorize_ctx *ctx,
833 struct entry *low, struct entry *high,
834 struct entry *first, struct entry *second,
835 unsigned new_bit_size, unsigned new_num_components,
836 unsigned high_start)
837 {
838 ASSERTED unsigned low_size = low->intrin->num_components * get_bit_size(low);
839 assert(low_size % new_bit_size == 0);
840
841 b->cursor = nir_before_instr(second->instr);
842
843 /* get new writemasks */
844 uint32_t low_write_mask = nir_intrinsic_write_mask(low->intrin);
845 uint32_t high_write_mask = nir_intrinsic_write_mask(high->intrin);
846 low_write_mask = update_writemask(low_write_mask, get_bit_size(low), new_bit_size);
847 high_write_mask = update_writemask(high_write_mask, get_bit_size(high), new_bit_size);
848 high_write_mask <<= high_start / new_bit_size;
849
850 uint32_t write_mask = low_write_mask | high_write_mask;
851
852 /* convert booleans */
853 nir_ssa_def *low_val = low->intrin->src[low->info->value_src].ssa;
854 nir_ssa_def *high_val = high->intrin->src[high->info->value_src].ssa;
855 low_val = low_val->bit_size == 1 ? nir_b2i(b, low_val, 32) : low_val;
856 high_val = high_val->bit_size == 1 ? nir_b2i(b, high_val, 32) : high_val;
857
858 /* combine the data */
859 nir_ssa_def *data_channels[NIR_MAX_VEC_COMPONENTS];
860 for (unsigned i = 0; i < new_num_components; i++) {
861 bool set_low = low_write_mask & (1 << i);
862 bool set_high = high_write_mask & (1 << i);
863
864 if (set_low && (!set_high || low == second)) {
865 unsigned offset = i * new_bit_size;
866 data_channels[i] = nir_extract_bits(b, &low_val, 1, offset, 1, new_bit_size);
867 } else if (set_high) {
868 assert(!set_low || high == second);
869 unsigned offset = i * new_bit_size - high_start;
870 data_channels[i] = nir_extract_bits(b, &high_val, 1, offset, 1, new_bit_size);
871 } else {
872 data_channels[i] = nir_ssa_undef(b, 1, new_bit_size);
873 }
874 }
875 nir_ssa_def *data = nir_vec(b, data_channels, new_num_components);
876
877 /* update the intrinsic */
878 nir_intrinsic_set_write_mask(second->intrin, write_mask);
879 second->intrin->num_components = data->num_components;
880
881 const struct intrinsic_info *info = second->info;
882 assert(info->value_src >= 0);
883 nir_instr_rewrite_src(second->instr, &second->intrin->src[info->value_src],
884 nir_src_for_ssa(data));
885
886 /* update the offset */
887 if (second != low && info->base_src >= 0)
888 nir_instr_rewrite_src(second->instr, &second->intrin->src[info->base_src],
889 low->intrin->src[info->base_src]);
890
891 /* update the deref */
892 if (info->deref_src >= 0) {
893 b->cursor = nir_before_instr(second->instr);
894 second->deref = cast_deref(b, new_num_components, new_bit_size,
895 nir_src_as_deref(low->intrin->src[info->deref_src]));
896 nir_instr_rewrite_src(second->instr, &second->intrin->src[info->deref_src],
897 nir_src_for_ssa(&second->deref->dest.ssa));
898 }
899
900 /* update base/align */
901 bool has_base_index =
902 nir_intrinsic_infos[second->intrin->intrinsic].index_map[NIR_INTRINSIC_BASE];
903
904 if (second != low && has_base_index)
905 nir_intrinsic_set_base(second->intrin, nir_intrinsic_base(low->intrin));
906
907 second->key = low->key;
908 second->offset = low->offset;
909 second->best_align = get_best_align(low);
910
911 update_align(second);
912
913 list_del(&first->head);
914 nir_instr_remove(first->instr);
915 }
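/*
 * Rough sketch of the store case (illustrative): two 1x32 store_ssbo at
 * offsets 0 and 4 become a single 2x32 store with write_mask 0x3 at offset 0,
 * kept in place of the "second" store; the "first" store is removed and
 * channels neither store wrote stay undef but masked out.
 */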
916
917 /* Returns true if it can prove that "a" and "b" point to different resources. */
918 static bool
919 resources_different(nir_ssa_def *a, nir_ssa_def *b)
920 {
921 if (!a || !b)
922 return false;
923
924 if (a->parent_instr->type == nir_instr_type_load_const &&
925 b->parent_instr->type == nir_instr_type_load_const) {
926 return nir_src_as_uint(nir_src_for_ssa(a)) != nir_src_as_uint(nir_src_for_ssa(b));
927 }
928
929 if (a->parent_instr->type == nir_instr_type_intrinsic &&
930 b->parent_instr->type == nir_instr_type_intrinsic) {
931 nir_intrinsic_instr *aintrin = nir_instr_as_intrinsic(a->parent_instr);
932 nir_intrinsic_instr *bintrin = nir_instr_as_intrinsic(b->parent_instr);
933 if (aintrin->intrinsic == nir_intrinsic_vulkan_resource_index &&
934 bintrin->intrinsic == nir_intrinsic_vulkan_resource_index) {
935 return nir_intrinsic_desc_set(aintrin) != nir_intrinsic_desc_set(bintrin) ||
936 nir_intrinsic_binding(aintrin) != nir_intrinsic_binding(bintrin) ||
937 resources_different(aintrin->src[0].ssa, bintrin->src[0].ssa);
938 }
939 }
940
941 return false;
942 }
943
944 static int64_t
945 compare_entries(struct entry *a, struct entry *b)
946 {
947 if (!entry_key_equals(a->key, b->key))
948 return INT64_MAX;
949 return b->offset_signed - a->offset_signed;
950 }
951
952 static bool
953 may_alias(struct entry *a, struct entry *b)
954 {
955 assert(get_variable_mode(a) == get_variable_mode(b));
956
957 /* if the resources/variables are definitively different and both have
958 * ACCESS_RESTRICT, we can assume they do not alias. */
959 bool res_different = a->key->var != b->key->var ||
960 resources_different(a->key->resource, b->key->resource);
961 if (res_different && (a->access & ACCESS_RESTRICT) && (b->access & ACCESS_RESTRICT))
962 return false;
963
964 /* we can't compare offsets if the resources/variables might be different */
965 if (a->key->var != b->key->var || a->key->resource != b->key->resource)
966 return true;
967
968 /* use adjacency information */
969 /* TODO: we can look closer at the entry keys */
970 int64_t diff = compare_entries(a, b);
971 if (diff != INT64_MAX) {
972 /* with atomics, intrin->num_components can be 0 */
973 if (diff < 0)
974 return llabs(diff) < MAX2(b->intrin->num_components, 1u) * (get_bit_size(b) / 8u);
975 else
976 return diff < MAX2(a->intrin->num_components, 1u) * (get_bit_size(a) / 8u);
977 }
978
979 /* TODO: we can use deref information */
980
981 return true;
982 }
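/*
 * For example, with equal keys a 1x32 access at offset 0 and another access
 * at offset 4 don't overlap (the distance of 4 bytes is not smaller than the
 * 4 bytes accessed), while a 2x32 access at offset 0 would alias it.
 */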
983
984 static bool
985 check_for_aliasing(struct vectorize_ctx *ctx, struct entry *first, struct entry *second)
986 {
987 nir_variable_mode mode = get_variable_mode(first);
988 if (mode & (nir_var_uniform | nir_var_system_value |
989 nir_var_mem_push_const | nir_var_mem_ubo))
990 return false;
991
992 unsigned mode_index = ffs(mode) - 1;
993 if (first->is_store) {
994 /* find first entry that aliases "first" */
995 list_for_each_entry_from(struct entry, next, first, &ctx->entries[mode_index], head) {
996 if (next == first)
997 continue;
998 if (next == second)
999 return false;
1000 if (may_alias(first, next))
1001 return true;
1002 }
1003 } else {
1004 /* find previous store that aliases this load */
1005 list_for_each_entry_from_rev(struct entry, prev, second, &ctx->entries[mode_index], head) {
1006 if (prev == second)
1007 continue;
1008 if (prev == first)
1009 return false;
1010 if (prev->is_store && may_alias(second, prev))
1011 return true;
1012 }
1013 }
1014
1015 return false;
1016 }
1017
1018 static bool
1019 is_strided_vector(const struct glsl_type *type)
1020 {
1021 if (glsl_type_is_vector(type)) {
1022 return glsl_get_explicit_stride(type) !=
1023 type_scalar_size_bytes(glsl_get_array_element(type));
1024 } else {
1025 return false;
1026 }
1027 }
1028
1029 static bool
1030 try_vectorize(nir_function_impl *impl, struct vectorize_ctx *ctx,
1031 struct entry *low, struct entry *high,
1032 struct entry *first, struct entry *second)
1033 {
1034 if (check_for_aliasing(ctx, first, second))
1035 return false;
1036
1037 /* we can only vectorize non-volatile loads/stores of the same type and with
1038 * the same access */
1039 if (first->info != second->info || first->access != second->access ||
1040 (first->access & ACCESS_VOLATILE) || first->info->is_atomic)
1041 return false;
1042
1043 /* don't attempt to vectorize accesses of row-major matrix columns */
1044 if (first->deref) {
1045 const struct glsl_type *first_type = first->deref->type;
1046 const struct glsl_type *second_type = second->deref->type;
1047 if (is_strided_vector(first_type) || is_strided_vector(second_type))
1048 return false;
1049 }
1050
1051 /* gather information */
1052 uint64_t diff = high->offset_signed - low->offset_signed;
1053 unsigned low_bit_size = get_bit_size(low);
1054 unsigned high_bit_size = get_bit_size(high);
1055 unsigned low_size = low->intrin->num_components * low_bit_size;
1056 unsigned high_size = high->intrin->num_components * high_bit_size;
1057 unsigned new_size = MAX2(diff * 8u + high_size, low_size);
1058
1059 /* find a good bit size for the new load/store */
1060 unsigned new_bit_size = 0;
1061 if (new_bitsize_acceptable(ctx, low_bit_size, low, high, new_size)) {
1062 new_bit_size = low_bit_size;
1063 } else if (low_bit_size != high_bit_size &&
1064 new_bitsize_acceptable(ctx, high_bit_size, low, high, new_size)) {
1065 new_bit_size = high_bit_size;
1066 } else {
1067 new_bit_size = 64;
1068 for (; new_bit_size >= 8; new_bit_size /= 2) {
1069 /* don't retry bit sizes we have already tried */
1070 if (new_bit_size == low_bit_size || new_bit_size == high_bit_size)
1071 continue;
1072 if (new_bitsize_acceptable(ctx, new_bit_size, low, high, new_size))
1073 break;
1074 }
1075 if (new_bit_size < 8)
1076 return false;
1077 }
1078 unsigned new_num_components = new_size / new_bit_size;
1079
1080 /* vectorize the loads/stores */
1081 nir_builder b;
1082 nir_builder_init(&b, impl);
1083
1084 if (first->is_store)
1085 vectorize_stores(&b, ctx, low, high, first, second,
1086 new_bit_size, new_num_components, diff * 8u);
1087 else
1088 vectorize_loads(&b, ctx, low, high, first, second,
1089 new_bit_size, new_num_components, diff * 8u);
1090
1091 return true;
1092 }
1093
1094 static bool
1095 vectorize_entries(struct vectorize_ctx *ctx, nir_function_impl *impl, struct hash_table *ht)
1096 {
1097 if (!ht)
1098 return false;
1099
1100 bool progress = false;
1101 hash_table_foreach(ht, entry) {
1102 struct util_dynarray *arr = entry->data;
1103 if (!arr->size)
1104 continue;
1105
1106 qsort(util_dynarray_begin(arr),
1107 util_dynarray_num_elements(arr, struct entry *),
1108 sizeof(struct entry *), &sort_entries);
1109
1110 unsigned i = 0;
1111 for (; i < util_dynarray_num_elements(arr, struct entry*) - 1; i++) {
1112 struct entry *low = *util_dynarray_element(arr, struct entry *, i);
1113 struct entry *high = *util_dynarray_element(arr, struct entry *, i + 1);
1114
1115 uint64_t diff = high->offset_signed - low->offset_signed;
1116 if (diff > get_bit_size(low) / 8u * low->intrin->num_components) {
1117 progress |= update_align(low);
1118 continue;
1119 }
1120
1121 struct entry *first = low->index < high->index ? low : high;
1122 struct entry *second = low->index < high->index ? high : low;
1123
1124 if (try_vectorize(impl, ctx, low, high, first, second)) {
1125 *util_dynarray_element(arr, struct entry *, i) = NULL;
1126 *util_dynarray_element(arr, struct entry *, i + 1) = low->is_store ? second : first;
1127 progress = true;
1128 } else {
1129 progress |= update_align(low);
1130 }
1131 }
1132
1133 struct entry *last = *util_dynarray_element(arr, struct entry *, i);
1134 progress |= update_align(last);
1135 }
1136
1137 _mesa_hash_table_clear(ht, delete_entry_dynarray);
1138
1139 return progress;
1140 }
1141
1142 static bool
1143 handle_barrier(struct vectorize_ctx *ctx, bool *progress, nir_function_impl *impl, nir_instr *instr)
1144 {
1145 unsigned modes = 0;
1146 bool acquire = true;
1147 bool release = true;
1148 if (instr->type == nir_instr_type_intrinsic) {
1149 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
1150 switch (intrin->intrinsic) {
1151 case nir_intrinsic_group_memory_barrier:
1152 case nir_intrinsic_memory_barrier:
1153 modes = nir_var_mem_ssbo | nir_var_mem_shared | nir_var_mem_global;
1154 break;
1155 /* prevent speculative loads/stores */
1156 case nir_intrinsic_discard_if:
1157 case nir_intrinsic_discard:
1158 modes = nir_var_all;
1159 break;
1160 case nir_intrinsic_memory_barrier_buffer:
1161 modes = nir_var_mem_ssbo | nir_var_mem_global;
1162 break;
1163 case nir_intrinsic_memory_barrier_shared:
1164 modes = nir_var_mem_shared;
1165 break;
1166 case nir_intrinsic_scoped_memory_barrier:
1167 modes = nir_intrinsic_memory_modes(intrin);
1168 acquire = nir_intrinsic_memory_semantics(intrin) & NIR_MEMORY_ACQUIRE;
1169 release = nir_intrinsic_memory_semantics(intrin) & NIR_MEMORY_RELEASE;
1170 switch (nir_intrinsic_memory_scope(intrin)) {
1171 case NIR_SCOPE_INVOCATION:
1172 case NIR_SCOPE_SUBGROUP:
1173 /* a barrier should never be required for correctness with these scopes */
1174 modes = 0;
1175 break;
1176 default:
1177 break;
1178 }
1179 break;
1180 default:
1181 return false;
1182 }
1183 } else if (instr->type == nir_instr_type_call) {
1184 modes = nir_var_all;
1185 } else {
1186 return false;
1187 }
1188
1189 while (modes) {
1190 unsigned mode_index = u_bit_scan(&modes);
1191
1192 if (acquire)
1193 *progress |= vectorize_entries(ctx, impl, ctx->loads[mode_index]);
1194 if (release)
1195 *progress |= vectorize_entries(ctx, impl, ctx->stores[mode_index]);
1196 }
1197
1198 return true;
1199 }
1200
1201 static bool
1202 process_block(nir_function_impl *impl, struct vectorize_ctx *ctx, nir_block *block)
1203 {
1204 bool progress = false;
1205
1206 for (unsigned i = 0; i < nir_num_variable_modes; i++) {
1207 list_inithead(&ctx->entries[i]);
1208 if (ctx->loads[i])
1209 _mesa_hash_table_clear(ctx->loads[i], delete_entry_dynarray);
1210 if (ctx->stores[i])
1211 _mesa_hash_table_clear(ctx->stores[i], delete_entry_dynarray);
1212 }
1213
1214 /* create entries */
1215 unsigned next_index = 0;
1216
1217 nir_foreach_instr_safe(instr, block) {
1218 if (handle_barrier(ctx, &progress, impl, instr))
1219 continue;
1220
1221 /* gather information */
1222 if (instr->type != nir_instr_type_intrinsic)
1223 continue;
1224 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
1225
1226 const struct intrinsic_info *info = get_info(intrin->intrinsic);
1227 if (!info)
1228 continue;
1229
1230 nir_variable_mode mode = info->mode;
1231 if (!mode)
1232 mode = nir_src_as_deref(intrin->src[info->deref_src])->mode;
1233 if (!(mode & ctx->modes))
1234 continue;
1235 unsigned mode_index = ffs(mode) - 1;
1236
1237 /* create entry */
1238 struct entry *entry = create_entry(ctx, info, intrin);
1239 entry->index = next_index++;
1240
1241 list_addtail(&entry->head, &ctx->entries[mode_index]);
1242
1243 /* add the entry to a hash table */
1244
1245 struct hash_table *adj_ht = NULL;
1246 if (entry->is_store) {
1247 if (!ctx->stores[mode_index])
1248 ctx->stores[mode_index] = _mesa_hash_table_create(ctx, &hash_entry_key, &entry_key_equals);
1249 adj_ht = ctx->stores[mode_index];
1250 } else {
1251 if (!ctx->loads[mode_index])
1252 ctx->loads[mode_index] = _mesa_hash_table_create(ctx, &hash_entry_key, &entry_key_equals);
1253 adj_ht = ctx->loads[mode_index];
1254 }
1255
1256 uint32_t key_hash = hash_entry_key(entry->key);
1257 struct hash_entry *adj_entry = _mesa_hash_table_search_pre_hashed(adj_ht, key_hash, entry->key);
1258 struct util_dynarray *arr;
1259 if (adj_entry && adj_entry->data) {
1260 arr = (struct util_dynarray *)adj_entry->data;
1261 } else {
1262 arr = ralloc(ctx, struct util_dynarray);
1263 util_dynarray_init(arr, arr);
1264 _mesa_hash_table_insert_pre_hashed(adj_ht, key_hash, entry->key, arr);
1265 }
1266 util_dynarray_append(arr, struct entry *, entry);
1267 }
1268
1269 /* sort and combine entries */
1270 for (unsigned i = 0; i < nir_num_variable_modes; i++) {
1271 progress |= vectorize_entries(ctx, impl, ctx->loads[i]);
1272 progress |= vectorize_entries(ctx, impl, ctx->stores[i]);
1273 }
1274
1275 return progress;
1276 }
1277
1278 bool
1279 nir_opt_load_store_vectorize(nir_shader *shader, nir_variable_mode modes,
1280 nir_should_vectorize_mem_func callback)
1281 {
1282 bool progress = false;
1283
1284 struct vectorize_ctx *ctx = rzalloc(NULL, struct vectorize_ctx);
1285 ctx->modes = modes;
1286 ctx->callback = callback;
1287
1288 nir_index_vars(shader, NULL, modes);
1289
1290 nir_foreach_function(function, shader) {
1291 if (function->impl) {
1292 if (modes & nir_var_function_temp)
1293 nir_index_vars(shader, function->impl, nir_var_function_temp);
1294
1295 nir_foreach_block(block, function->impl)
1296 progress |= process_block(function->impl, ctx, block);
1297
1298 nir_metadata_preserve(function->impl,
1299 nir_metadata_block_index |
1300 nir_metadata_dominance |
1301 nir_metadata_live_ssa_defs);
1302 }
1303 }
1304
1305 ralloc_free(ctx);
1306 return progress;
1307 }