/*
 * Copyright © 2019 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
/**
 * Although it's called a load/store "vectorization" pass, this also combines
 * intersecting and identical loads/stores. It currently supports derefs, ubo,
 * ssbo and push constant loads/stores.
 *
 * This doesn't handle copy_deref intrinsics and assumes that
 * nir_lower_alu_to_scalar() has been called and that the IR is free from ALU
 * modifiers. It also assumes that derefs have explicitly laid out types.
 *
 * After vectorization, the backend may want to call nir_lower_alu_to_scalar()
 * and nir_lower_pack(). Also this creates cast instructions taking derefs as a
 * source and some parts of NIR may not be able to handle that well.
 *
 * There are a few situations where this doesn't vectorize as well as it could:
 * - It won't turn four consecutive vec3 loads into 3 vec4 loads.
 * - It doesn't do global vectorization.
 * Handling these cases probably wouldn't provide much benefit though.
 *
 * This probably doesn't handle big-endian GPUs correctly.
 */
46 #include "nir_deref.h"
47 #include "nir_builder.h"
48 #include "nir_worklist.h"
49 #include "util/u_dynarray.h"
53 struct intrinsic_info
{
54 nir_variable_mode mode
; /* 0 if the mode is obtained from the deref. */
57 /* Indices into nir_intrinsic::src[] or -1 if not applicable. */
58 int resource_src
; /* resource (e.g. from vulkan_resource_index) */
59 int base_src
; /* offset which it loads/stores from */
60 int deref_src
; /* deref which is loads/stores from */
61 int value_src
; /* the data it is storing */
64 static const struct intrinsic_info
*
65 get_info(nir_intrinsic_op op
) {
67 #define INFO(mode, op, atomic, res, base, deref, val) \
68 case nir_intrinsic_##op: {\
69 static const struct intrinsic_info op##_info = {mode, nir_intrinsic_##op, atomic, res, base, deref, val};\
72 #define LOAD(mode, op, res, base, deref) INFO(mode, load_##op, false, res, base, deref, -1)
73 #define STORE(mode, op, res, base, deref, val) INFO(mode, store_##op, false, res, base, deref, val)
74 #define ATOMIC(mode, type, op, res, base, deref, val) INFO(mode, type##_atomic_##op, true, res, base, deref, val)
75 LOAD(nir_var_mem_push_const
, push_constant
, -1, 0, -1)
76 LOAD(nir_var_mem_ubo
, ubo
, 0, 1, -1)
77 LOAD(nir_var_mem_ssbo
, ssbo
, 0, 1, -1)
78 STORE(nir_var_mem_ssbo
, ssbo
, 1, 2, -1, 0)
79 LOAD(0, deref
, -1, -1, 0)
80 STORE(0, deref
, -1, -1, 0, 1)
81 LOAD(nir_var_mem_shared
, shared
, -1, 0, -1)
82 STORE(nir_var_mem_shared
, shared
, -1, 1, -1, 0)
83 LOAD(nir_var_mem_global
, global
, -1, 0, -1)
84 STORE(nir_var_mem_global
, global
, -1, 1, -1, 0)
85 ATOMIC(nir_var_mem_ssbo
, ssbo
, add
, 0, 1, -1, 2)
86 ATOMIC(nir_var_mem_ssbo
, ssbo
, imin
, 0, 1, -1, 2)
87 ATOMIC(nir_var_mem_ssbo
, ssbo
, umin
, 0, 1, -1, 2)
88 ATOMIC(nir_var_mem_ssbo
, ssbo
, imax
, 0, 1, -1, 2)
89 ATOMIC(nir_var_mem_ssbo
, ssbo
, umax
, 0, 1, -1, 2)
90 ATOMIC(nir_var_mem_ssbo
, ssbo
, and, 0, 1, -1, 2)
91 ATOMIC(nir_var_mem_ssbo
, ssbo
, or, 0, 1, -1, 2)
92 ATOMIC(nir_var_mem_ssbo
, ssbo
, xor, 0, 1, -1, 2)
93 ATOMIC(nir_var_mem_ssbo
, ssbo
, exchange
, 0, 1, -1, 2)
94 ATOMIC(nir_var_mem_ssbo
, ssbo
, comp_swap
, 0, 1, -1, 2)
95 ATOMIC(nir_var_mem_ssbo
, ssbo
, fadd
, 0, 1, -1, 2)
96 ATOMIC(nir_var_mem_ssbo
, ssbo
, fmin
, 0, 1, -1, 2)
97 ATOMIC(nir_var_mem_ssbo
, ssbo
, fmax
, 0, 1, -1, 2)
98 ATOMIC(nir_var_mem_ssbo
, ssbo
, fcomp_swap
, 0, 1, -1, 2)
99 ATOMIC(0, deref
, add
, -1, -1, 0, 1)
100 ATOMIC(0, deref
, imin
, -1, -1, 0, 1)
101 ATOMIC(0, deref
, umin
, -1, -1, 0, 1)
102 ATOMIC(0, deref
, imax
, -1, -1, 0, 1)
103 ATOMIC(0, deref
, umax
, -1, -1, 0, 1)
104 ATOMIC(0, deref
, and, -1, -1, 0, 1)
105 ATOMIC(0, deref
, or, -1, -1, 0, 1)
106 ATOMIC(0, deref
, xor, -1, -1, 0, 1)
107 ATOMIC(0, deref
, exchange
, -1, -1, 0, 1)
108 ATOMIC(0, deref
, comp_swap
, -1, -1, 0, 1)
109 ATOMIC(0, deref
, fadd
, -1, -1, 0, 1)
110 ATOMIC(0, deref
, fmin
, -1, -1, 0, 1)
111 ATOMIC(0, deref
, fmax
, -1, -1, 0, 1)
112 ATOMIC(0, deref
, fcomp_swap
, -1, -1, 0, 1)
113 ATOMIC(nir_var_mem_shared
, shared
, add
, -1, 0, -1, 1)
114 ATOMIC(nir_var_mem_shared
, shared
, imin
, -1, 0, -1, 1)
115 ATOMIC(nir_var_mem_shared
, shared
, umin
, -1, 0, -1, 1)
116 ATOMIC(nir_var_mem_shared
, shared
, imax
, -1, 0, -1, 1)
117 ATOMIC(nir_var_mem_shared
, shared
, umax
, -1, 0, -1, 1)
118 ATOMIC(nir_var_mem_shared
, shared
, and, -1, 0, -1, 1)
119 ATOMIC(nir_var_mem_shared
, shared
, or, -1, 0, -1, 1)
120 ATOMIC(nir_var_mem_shared
, shared
, xor, -1, 0, -1, 1)
121 ATOMIC(nir_var_mem_shared
, shared
, exchange
, -1, 0, -1, 1)
122 ATOMIC(nir_var_mem_shared
, shared
, comp_swap
, -1, 0, -1, 1)
123 ATOMIC(nir_var_mem_shared
, shared
, fadd
, -1, 0, -1, 1)
124 ATOMIC(nir_var_mem_shared
, shared
, fmin
, -1, 0, -1, 1)
125 ATOMIC(nir_var_mem_shared
, shared
, fmax
, -1, 0, -1, 1)
126 ATOMIC(nir_var_mem_shared
, shared
, fcomp_swap
, -1, 0, -1, 1)
127 ATOMIC(nir_var_mem_global
, global
, add
, -1, 0, -1, 1)
128 ATOMIC(nir_var_mem_global
, global
, imin
, -1, 0, -1, 1)
129 ATOMIC(nir_var_mem_global
, global
, umin
, -1, 0, -1, 1)
130 ATOMIC(nir_var_mem_global
, global
, imax
, -1, 0, -1, 1)
131 ATOMIC(nir_var_mem_global
, global
, umax
, -1, 0, -1, 1)
132 ATOMIC(nir_var_mem_global
, global
, and, -1, 0, -1, 1)
133 ATOMIC(nir_var_mem_global
, global
, or, -1, 0, -1, 1)
134 ATOMIC(nir_var_mem_global
, global
, xor, -1, 0, -1, 1)
135 ATOMIC(nir_var_mem_global
, global
, exchange
, -1, 0, -1, 1)
136 ATOMIC(nir_var_mem_global
, global
, comp_swap
, -1, 0, -1, 1)
137 ATOMIC(nir_var_mem_global
, global
, fadd
, -1, 0, -1, 1)
138 ATOMIC(nir_var_mem_global
, global
, fmin
, -1, 0, -1, 1)
139 ATOMIC(nir_var_mem_global
, global
, fmax
, -1, 0, -1, 1)
140 ATOMIC(nir_var_mem_global
, global
, fcomp_swap
, -1, 0, -1, 1)
152 * Information used to compare memory operations.
153 * It canonically represents an offset as:
154 * `offset_defs[0]*offset_defs_mul[0] + offset_defs[1]*offset_defs_mul[1] + ...`
155 * "offset_defs" is sorted in ascenting order by the ssa definition's index.
156 * "resource" or "var" may be NULL.
159 nir_ssa_def
*resource
;
161 unsigned offset_def_count
;
162 nir_ssa_def
**offset_defs
;
163 uint64_t *offset_defs_mul
;
166 /* Information on a single memory operation. */
168 struct list_head head
;
171 struct entry_key
*key
;
173 uint64_t offset
; /* sign-extended */
174 int64_t offset_signed
;
179 nir_intrinsic_instr
*intrin
;
180 const struct intrinsic_info
*info
;
181 enum gl_access_qualifier access
;
184 nir_deref_instr
*deref
;
187 struct vectorize_ctx
{
188 nir_variable_mode modes
;
189 nir_should_vectorize_mem_func callback
;
190 nir_variable_mode robust_modes
;
191 struct list_head entries
[nir_num_variable_modes
];
192 struct hash_table
*loads
[nir_num_variable_modes
];
193 struct hash_table
*stores
[nir_num_variable_modes
];
196 static uint32_t hash_entry_key(const void *key_
)
198 /* this is careful to not include pointers in the hash calculation so that
199 * the order of the hash table walk is deterministic */
200 struct entry_key
*key
= (struct entry_key
*)key_
;
204 hash
= XXH32(&key
->resource
->index
, sizeof(key
->resource
->index
), hash
);
206 hash
= XXH32(&key
->var
->index
, sizeof(key
->var
->index
), hash
);
207 unsigned mode
= key
->var
->data
.mode
;
208 hash
= XXH32(&mode
, sizeof(mode
), hash
);
211 for (unsigned i
= 0; i
< key
->offset_def_count
; i
++)
212 hash
= XXH32(&key
->offset_defs
[i
]->index
, sizeof(key
->offset_defs
[i
]->index
), hash
);
214 hash
= XXH32(key
->offset_defs_mul
, key
->offset_def_count
* sizeof(uint64_t), hash
);
219 static bool entry_key_equals(const void *a_
, const void *b_
)
221 struct entry_key
*a
= (struct entry_key
*)a_
;
222 struct entry_key
*b
= (struct entry_key
*)b_
;
224 if (a
->var
!= b
->var
|| a
->resource
!= b
->resource
)
227 if (a
->offset_def_count
!= b
->offset_def_count
)
230 size_t offset_def_size
= a
->offset_def_count
* sizeof(nir_ssa_def
*);
231 size_t offset_def_mul_size
= a
->offset_def_count
* sizeof(uint64_t);
232 if (a
->offset_def_count
&&
233 (memcmp(a
->offset_defs
, b
->offset_defs
, offset_def_size
) ||
234 memcmp(a
->offset_defs_mul
, b
->offset_defs_mul
, offset_def_mul_size
)))
240 static void delete_entry_dynarray(struct hash_entry
*entry
)
242 struct util_dynarray
*arr
= (struct util_dynarray
*)entry
->data
;
246 static int sort_entries(const void *a_
, const void *b_
)
248 struct entry
*a
= *(struct entry
*const*)a_
;
249 struct entry
*b
= *(struct entry
*const*)b_
;
251 if (a
->offset_signed
> b
->offset_signed
)
253 else if (a
->offset_signed
< b
->offset_signed
)
260 get_bit_size(struct entry
*entry
)
262 unsigned size
= entry
->is_store
?
263 entry
->intrin
->src
[entry
->info
->value_src
].ssa
->bit_size
:
264 entry
->intrin
->dest
.ssa
.bit_size
;
265 return size
== 1 ? 32u : size
;
268 /* If "def" is from an alu instruction with the opcode "op" and one of it's
269 * sources is a constant, update "def" to be the non-constant source, fill "c"
270 * with the constant and return true. */
272 parse_alu(nir_ssa_def
**def
, nir_op op
, uint64_t *c
)
274 nir_ssa_scalar scalar
;
278 if (!nir_ssa_scalar_is_alu(scalar
) || nir_ssa_scalar_alu_op(scalar
) != op
)
281 nir_ssa_scalar src0
= nir_ssa_scalar_chase_alu_src(scalar
, 0);
282 nir_ssa_scalar src1
= nir_ssa_scalar_chase_alu_src(scalar
, 1);
283 if (op
!= nir_op_ishl
&& nir_ssa_scalar_is_const(src0
) && src1
.comp
== 0) {
284 *c
= nir_ssa_scalar_as_uint(src0
);
286 } else if (nir_ssa_scalar_is_const(src1
) && src0
.comp
== 0) {
287 *c
= nir_ssa_scalar_as_uint(src1
);
295 /* Parses an offset expression such as "a * 16 + 4" and "(a * 16 + 4) * 64 + 32". */
297 parse_offset(nir_ssa_def
**base
, uint64_t *base_mul
, uint64_t *offset
)
299 if ((*base
)->parent_instr
->type
== nir_instr_type_load_const
) {
300 *offset
= nir_src_comp_as_uint(nir_src_for_ssa(*base
), 0);
307 bool progress
= false;
309 uint64_t mul2
= 1, add2
= 0;
311 progress
= parse_alu(base
, nir_op_imul
, &mul2
);
315 progress
|= parse_alu(base
, nir_op_ishl
, &mul2
);
318 progress
|= parse_alu(base
, nir_op_iadd
, &add2
);
327 type_scalar_size_bytes(const struct glsl_type
*type
)
329 assert(glsl_type_is_vector_or_scalar(type
) ||
330 glsl_type_is_matrix(type
));
331 return glsl_type_is_boolean(type
) ? 4u : glsl_get_bit_size(type
) / 8u;
335 get_array_stride(const struct glsl_type
*type
)
337 unsigned explicit_stride
= glsl_get_explicit_stride(type
);
338 if ((glsl_type_is_matrix(type
) &&
339 glsl_matrix_type_is_row_major(type
)) ||
340 (glsl_type_is_vector(type
) && explicit_stride
== 0))
341 return type_scalar_size_bytes(type
);
342 return explicit_stride
;
346 mask_sign_extend(uint64_t val
, unsigned bit_size
)
348 return (int64_t)(val
<< (64 - bit_size
)) >> (64 - bit_size
);
352 add_to_entry_key(nir_ssa_def
**offset_defs
, uint64_t *offset_defs_mul
,
353 unsigned offset_def_count
, nir_ssa_def
*def
, uint64_t mul
)
355 mul
= mask_sign_extend(mul
, def
->bit_size
);
357 for (unsigned i
= 0; i
<= offset_def_count
; i
++) {
358 if (i
== offset_def_count
|| def
->index
> offset_defs
[i
]->index
) {
359 /* insert before i */
360 memmove(offset_defs
+ i
+ 1, offset_defs
+ i
,
361 (offset_def_count
- i
) * sizeof(nir_ssa_def
*));
362 memmove(offset_defs_mul
+ i
+ 1, offset_defs_mul
+ i
,
363 (offset_def_count
- i
) * sizeof(uint64_t));
364 offset_defs
[i
] = def
;
365 offset_defs_mul
[i
] = mul
;
367 } else if (def
->index
== offset_defs
[i
]->index
) {
368 /* merge with offset_def at i */
369 offset_defs_mul
[i
] += mul
;
373 unreachable("Unreachable.");
377 static struct entry_key
*
378 create_entry_key_from_deref(void *mem_ctx
,
379 struct vectorize_ctx
*ctx
,
380 nir_deref_path
*path
,
381 uint64_t *offset_base
)
383 unsigned path_len
= 0;
384 while (path
->path
[path_len
])
387 nir_ssa_def
*offset_defs_stack
[32];
388 uint64_t offset_defs_mul_stack
[32];
389 nir_ssa_def
**offset_defs
= offset_defs_stack
;
390 uint64_t *offset_defs_mul
= offset_defs_mul_stack
;
392 offset_defs
= malloc(path_len
* sizeof(nir_ssa_def
*));
393 offset_defs_mul
= malloc(path_len
* sizeof(uint64_t));
395 unsigned offset_def_count
= 0;
397 struct entry_key
* key
= ralloc(mem_ctx
, struct entry_key
);
398 key
->resource
= NULL
;
402 for (unsigned i
= 0; i
< path_len
; i
++) {
403 nir_deref_instr
*parent
= i
? path
->path
[i
- 1] : NULL
;
404 nir_deref_instr
*deref
= path
->path
[i
];
406 switch (deref
->deref_type
) {
407 case nir_deref_type_var
: {
409 key
->var
= deref
->var
;
412 case nir_deref_type_array
:
413 case nir_deref_type_ptr_as_array
: {
415 nir_ssa_def
*index
= deref
->arr
.index
.ssa
;
417 if (deref
->deref_type
== nir_deref_type_ptr_as_array
)
418 stride
= nir_deref_instr_ptr_as_array_stride(deref
);
420 stride
= get_array_stride(parent
->type
);
422 nir_ssa_def
*base
= index
;
423 uint64_t offset
= 0, base_mul
= 1;
424 parse_offset(&base
, &base_mul
, &offset
);
425 offset
= mask_sign_extend(offset
, index
->bit_size
);
427 *offset_base
+= offset
* stride
;
429 offset_def_count
+= add_to_entry_key(offset_defs
, offset_defs_mul
,
431 base
, base_mul
* stride
);
435 case nir_deref_type_struct
: {
437 int offset
= glsl_get_struct_field_offset(parent
->type
, deref
->strct
.index
);
438 *offset_base
+= offset
;
441 case nir_deref_type_cast
: {
443 key
->resource
= deref
->parent
.ssa
;
447 unreachable("Unhandled deref type");
451 key
->offset_def_count
= offset_def_count
;
452 key
->offset_defs
= ralloc_array(mem_ctx
, nir_ssa_def
*, offset_def_count
);
453 key
->offset_defs_mul
= ralloc_array(mem_ctx
, uint64_t, offset_def_count
);
454 memcpy(key
->offset_defs
, offset_defs
, offset_def_count
* sizeof(nir_ssa_def
*));
455 memcpy(key
->offset_defs_mul
, offset_defs_mul
, offset_def_count
* sizeof(uint64_t));
457 if (offset_defs
!= offset_defs_stack
)
459 if (offset_defs_mul
!= offset_defs_mul_stack
)
460 free(offset_defs_mul
);
466 parse_entry_key_from_offset(struct entry_key
*key
, unsigned size
, unsigned left
,
467 nir_ssa_def
*base
, uint64_t base_mul
, uint64_t *offset
)
471 parse_offset(&base
, &new_mul
, &new_offset
);
472 *offset
+= new_offset
* base_mul
;
482 nir_ssa_scalar scalar
;
485 if (nir_ssa_scalar_is_alu(scalar
) && nir_ssa_scalar_alu_op(scalar
) == nir_op_iadd
) {
486 nir_ssa_scalar src0
= nir_ssa_scalar_chase_alu_src(scalar
, 0);
487 nir_ssa_scalar src1
= nir_ssa_scalar_chase_alu_src(scalar
, 1);
488 if (src0
.comp
== 0 && src1
.comp
== 0) {
489 unsigned amount
= parse_entry_key_from_offset(key
, size
, left
- 1, src0
.def
, base_mul
, offset
);
490 amount
+= parse_entry_key_from_offset(key
, size
+ amount
, left
- amount
, src1
.def
, base_mul
, offset
);
496 return add_to_entry_key(key
->offset_defs
, key
->offset_defs_mul
, size
, base
, base_mul
);
499 static struct entry_key
*
500 create_entry_key_from_offset(void *mem_ctx
, nir_ssa_def
*base
, uint64_t base_mul
, uint64_t *offset
)
502 struct entry_key
*key
= ralloc(mem_ctx
, struct entry_key
);
503 key
->resource
= NULL
;
506 nir_ssa_def
*offset_defs
[32];
507 uint64_t offset_defs_mul
[32];
508 key
->offset_defs
= offset_defs
;
509 key
->offset_defs_mul
= offset_defs_mul
;
511 key
->offset_def_count
= parse_entry_key_from_offset(key
, 0, 32, base
, base_mul
, offset
);
513 key
->offset_defs
= ralloc_array(mem_ctx
, nir_ssa_def
*, key
->offset_def_count
);
514 key
->offset_defs_mul
= ralloc_array(mem_ctx
, uint64_t, key
->offset_def_count
);
515 memcpy(key
->offset_defs
, offset_defs
, key
->offset_def_count
* sizeof(nir_ssa_def
*));
516 memcpy(key
->offset_defs_mul
, offset_defs_mul
, key
->offset_def_count
* sizeof(uint64_t));
518 key
->offset_def_count
= 0;
519 key
->offset_defs
= NULL
;
520 key
->offset_defs_mul
= NULL
;
525 static nir_variable_mode
526 get_variable_mode(struct entry
*entry
)
528 if (entry
->info
->mode
)
529 return entry
->info
->mode
;
530 assert(entry
->deref
);
531 return entry
->deref
->mode
;
535 mode_to_index(nir_variable_mode mode
)
537 assert(util_bitcount(mode
) == 1);
539 /* Globals and SSBOs should be tracked together */
540 if (mode
== nir_var_mem_global
)
541 mode
= nir_var_mem_ssbo
;
543 return ffs(mode
) - 1;
546 static nir_variable_mode
547 aliasing_modes(nir_variable_mode modes
)
549 /* Global and SSBO can alias */
550 if (modes
& (nir_var_mem_ssbo
| nir_var_mem_global
))
551 modes
|= nir_var_mem_ssbo
| nir_var_mem_global
;
555 static struct entry
*
556 create_entry(struct vectorize_ctx
*ctx
,
557 const struct intrinsic_info
*info
,
558 nir_intrinsic_instr
*intrin
)
560 struct entry
*entry
= rzalloc(ctx
, struct entry
);
561 entry
->intrin
= intrin
;
562 entry
->instr
= &intrin
->instr
;
564 entry
->best_align
= UINT32_MAX
;
565 entry
->is_store
= entry
->info
->value_src
>= 0;
567 if (entry
->info
->deref_src
>= 0) {
568 entry
->deref
= nir_src_as_deref(intrin
->src
[entry
->info
->deref_src
]);
570 nir_deref_path_init(&path
, entry
->deref
, NULL
);
571 entry
->key
= create_entry_key_from_deref(entry
, ctx
, &path
, &entry
->offset
);
572 nir_deref_path_finish(&path
);
574 nir_ssa_def
*base
= entry
->info
->base_src
>= 0 ?
575 intrin
->src
[entry
->info
->base_src
].ssa
: NULL
;
577 if (nir_intrinsic_infos
[intrin
->intrinsic
].index_map
[NIR_INTRINSIC_BASE
])
578 offset
+= nir_intrinsic_base(intrin
);
579 entry
->key
= create_entry_key_from_offset(entry
, base
, 1, &offset
);
580 entry
->offset
= offset
;
583 entry
->offset
= mask_sign_extend(entry
->offset
, base
->bit_size
);
586 if (entry
->info
->resource_src
>= 0)
587 entry
->key
->resource
= intrin
->src
[entry
->info
->resource_src
].ssa
;
589 if (nir_intrinsic_infos
[intrin
->intrinsic
].index_map
[NIR_INTRINSIC_ACCESS
])
590 entry
->access
= nir_intrinsic_access(intrin
);
591 else if (entry
->key
->var
)
592 entry
->access
= entry
->key
->var
->data
.access
;
594 uint32_t restrict_modes
= nir_var_shader_in
| nir_var_shader_out
;
595 restrict_modes
|= nir_var_shader_temp
| nir_var_function_temp
;
596 restrict_modes
|= nir_var_uniform
| nir_var_mem_push_const
;
597 restrict_modes
|= nir_var_system_value
| nir_var_mem_shared
;
598 if (get_variable_mode(entry
) & restrict_modes
)
599 entry
->access
|= ACCESS_RESTRICT
;
604 static nir_deref_instr
*
605 cast_deref(nir_builder
*b
, unsigned num_components
, unsigned bit_size
, nir_deref_instr
*deref
)
607 if (glsl_get_components(deref
->type
) == num_components
&&
608 type_scalar_size_bytes(deref
->type
)*8u == bit_size
)
611 enum glsl_base_type types
[] = {
612 GLSL_TYPE_UINT8
, GLSL_TYPE_UINT16
, GLSL_TYPE_UINT
, GLSL_TYPE_UINT64
};
613 enum glsl_base_type base
= types
[ffs(bit_size
/ 8u) - 1u];
614 const struct glsl_type
*type
= glsl_vector_type(base
, num_components
);
616 if (deref
->type
== type
)
619 return nir_build_deref_cast(b
, &deref
->dest
.ssa
, deref
->mode
, type
, 0);
622 /* Return true if the write mask "write_mask" of a store with "old_bit_size"
623 * bits per element can be represented for a store with "new_bit_size" bits per
626 writemask_representable(unsigned write_mask
, unsigned old_bit_size
, unsigned new_bit_size
)
630 u_bit_scan_consecutive_range(&write_mask
, &start
, &count
);
631 start
*= old_bit_size
;
632 count
*= old_bit_size
;
633 if (start
% new_bit_size
!= 0)
635 if (count
% new_bit_size
!= 0)
642 gcd(uint64_t a
, uint64_t b
)
653 get_best_align(struct entry
*entry
)
655 if (entry
->best_align
!= UINT32_MAX
)
656 return entry
->best_align
;
658 uint64_t best_align
= entry
->offset
;
659 for (unsigned i
= 0; i
< entry
->key
->offset_def_count
; i
++) {
661 best_align
= entry
->key
->offset_defs_mul
[i
];
662 else if (entry
->key
->offset_defs_mul
[i
])
663 best_align
= gcd(best_align
, entry
->key
->offset_defs_mul
[i
]);
666 if (nir_intrinsic_infos
[entry
->intrin
->intrinsic
].index_map
[NIR_INTRINSIC_ALIGN_MUL
])
667 best_align
= MAX2(best_align
, nir_intrinsic_align(entry
->intrin
));
669 /* ensure the result is a power of two that fits in a int32_t */
670 entry
->best_align
= gcd(best_align
, 1u << 30);
672 return entry
->best_align
;
675 /* Return true if "new_bit_size" is a usable bit size for a vectorized load/store
676 * of "low" and "high". */
678 new_bitsize_acceptable(struct vectorize_ctx
*ctx
, unsigned new_bit_size
,
679 struct entry
*low
, struct entry
*high
, unsigned size
)
681 if (size
% new_bit_size
!= 0)
684 unsigned new_num_components
= size
/ new_bit_size
;
685 if (!nir_num_components_valid(new_num_components
))
688 unsigned high_offset
= high
->offset_signed
- low
->offset_signed
;
690 /* check nir_extract_bits limitations */
691 unsigned common_bit_size
= MIN2(get_bit_size(low
), get_bit_size(high
));
692 common_bit_size
= MIN2(common_bit_size
, new_bit_size
);
694 common_bit_size
= MIN2(common_bit_size
, (1u << (ffs(high_offset
* 8) - 1)));
695 if (new_bit_size
/ common_bit_size
> NIR_MAX_VEC_COMPONENTS
)
698 if (!ctx
->callback(get_best_align(low
), new_bit_size
, new_num_components
,
699 high_offset
, low
->intrin
, high
->intrin
))
703 unsigned low_size
= low
->intrin
->num_components
* get_bit_size(low
);
704 unsigned high_size
= high
->intrin
->num_components
* get_bit_size(high
);
706 if (low_size
% new_bit_size
!= 0)
708 if (high_size
% new_bit_size
!= 0)
711 unsigned write_mask
= nir_intrinsic_write_mask(low
->intrin
);
712 if (!writemask_representable(write_mask
, low_size
, new_bit_size
))
715 write_mask
= nir_intrinsic_write_mask(high
->intrin
);
716 if (!writemask_representable(write_mask
, high_size
, new_bit_size
))
723 /* Updates a write mask, "write_mask", so that it can be used with a
724 * "new_bit_size"-bit store instead of a "old_bit_size"-bit store. */
726 update_writemask(unsigned write_mask
, unsigned old_bit_size
, unsigned new_bit_size
)
731 u_bit_scan_consecutive_range(&write_mask
, &start
, &count
);
732 start
= start
* old_bit_size
/ new_bit_size
;
733 count
= count
* old_bit_size
/ new_bit_size
;
734 res
|= ((1 << count
) - 1) << start
;
739 static nir_deref_instr
*subtract_deref(nir_builder
*b
, nir_deref_instr
*deref
, int64_t offset
)
741 /* avoid adding another deref to the path */
742 if (deref
->deref_type
== nir_deref_type_ptr_as_array
&&
743 nir_src_is_const(deref
->arr
.index
) &&
744 offset
% nir_deref_instr_ptr_as_array_stride(deref
) == 0) {
745 unsigned stride
= nir_deref_instr_ptr_as_array_stride(deref
);
746 nir_ssa_def
*index
= nir_imm_intN_t(b
, nir_src_as_int(deref
->arr
.index
) - offset
/ stride
,
747 deref
->dest
.ssa
.bit_size
);
748 return nir_build_deref_ptr_as_array(b
, nir_deref_instr_parent(deref
), index
);
751 if (deref
->deref_type
== nir_deref_type_array
&&
752 nir_src_is_const(deref
->arr
.index
)) {
753 nir_deref_instr
*parent
= nir_deref_instr_parent(deref
);
754 unsigned stride
= glsl_get_explicit_stride(parent
->type
);
755 if (offset
% stride
== 0)
756 return nir_build_deref_array_imm(
757 b
, parent
, nir_src_as_int(deref
->arr
.index
) - offset
/ stride
);
761 deref
= nir_build_deref_cast(b
, &deref
->dest
.ssa
, deref
->mode
,
762 glsl_scalar_type(GLSL_TYPE_UINT8
), 1);
763 return nir_build_deref_ptr_as_array(
764 b
, deref
, nir_imm_intN_t(b
, -offset
, deref
->dest
.ssa
.bit_size
));
767 static bool update_align(struct entry
*entry
)
769 bool has_align_index
=
770 nir_intrinsic_infos
[entry
->intrin
->intrinsic
].index_map
[NIR_INTRINSIC_ALIGN_MUL
];
771 if (has_align_index
) {
772 unsigned align
= get_best_align(entry
);
773 if (align
!= nir_intrinsic_align(entry
->intrin
)) {
774 nir_intrinsic_set_align(entry
->intrin
, align
, 0);
782 vectorize_loads(nir_builder
*b
, struct vectorize_ctx
*ctx
,
783 struct entry
*low
, struct entry
*high
,
784 struct entry
*first
, struct entry
*second
,
785 unsigned new_bit_size
, unsigned new_num_components
,
788 unsigned low_bit_size
= get_bit_size(low
);
789 unsigned high_bit_size
= get_bit_size(high
);
790 bool low_bool
= low
->intrin
->dest
.ssa
.bit_size
== 1;
791 bool high_bool
= high
->intrin
->dest
.ssa
.bit_size
== 1;
792 nir_ssa_def
*data
= &first
->intrin
->dest
.ssa
;
794 b
->cursor
= nir_after_instr(first
->instr
);
796 /* update the load's destination size and extract data for each of the original loads */
797 data
->num_components
= new_num_components
;
798 data
->bit_size
= new_bit_size
;
800 nir_ssa_def
*low_def
= nir_extract_bits(
801 b
, &data
, 1, 0, low
->intrin
->num_components
, low_bit_size
);
802 nir_ssa_def
*high_def
= nir_extract_bits(
803 b
, &data
, 1, high_start
, high
->intrin
->num_components
, high_bit_size
);
805 /* convert booleans */
806 low_def
= low_bool
? nir_i2b(b
, low_def
) : nir_mov(b
, low_def
);
807 high_def
= high_bool
? nir_i2b(b
, high_def
) : nir_mov(b
, high_def
);
811 nir_ssa_def_rewrite_uses_after(&low
->intrin
->dest
.ssa
, nir_src_for_ssa(low_def
),
812 high_def
->parent_instr
);
813 nir_ssa_def_rewrite_uses(&high
->intrin
->dest
.ssa
, nir_src_for_ssa(high_def
));
815 nir_ssa_def_rewrite_uses(&low
->intrin
->dest
.ssa
, nir_src_for_ssa(low_def
));
816 nir_ssa_def_rewrite_uses_after(&high
->intrin
->dest
.ssa
, nir_src_for_ssa(high_def
),
817 high_def
->parent_instr
);
820 /* update the intrinsic */
821 first
->intrin
->num_components
= new_num_components
;
823 const struct intrinsic_info
*info
= first
->info
;
825 /* update the offset */
826 if (first
!= low
&& info
->base_src
>= 0) {
827 /* let nir_opt_algebraic() remove this addition. this doesn't have much
828 * issues with subtracting 16 from expressions like "(i + 1) * 16" because
829 * nir_opt_algebraic() turns them into "i * 16 + 16" */
830 b
->cursor
= nir_before_instr(first
->instr
);
832 nir_ssa_def
*new_base
= first
->intrin
->src
[info
->base_src
].ssa
;
833 new_base
= nir_iadd_imm(b
, new_base
, -(int)(high_start
/ 8u));
835 nir_instr_rewrite_src(first
->instr
, &first
->intrin
->src
[info
->base_src
],
836 nir_src_for_ssa(new_base
));
839 /* update the deref */
840 if (info
->deref_src
>= 0) {
841 b
->cursor
= nir_before_instr(first
->instr
);
843 nir_deref_instr
*deref
= nir_src_as_deref(first
->intrin
->src
[info
->deref_src
]);
844 if (first
!= low
&& high_start
!= 0)
845 deref
= subtract_deref(b
, deref
, high_start
/ 8u);
846 first
->deref
= cast_deref(b
, new_num_components
, new_bit_size
, deref
);
848 nir_instr_rewrite_src(first
->instr
, &first
->intrin
->src
[info
->deref_src
],
849 nir_src_for_ssa(&first
->deref
->dest
.ssa
));
852 /* update base/align */
853 bool has_base_index
=
854 nir_intrinsic_infos
[first
->intrin
->intrinsic
].index_map
[NIR_INTRINSIC_BASE
];
856 if (first
!= low
&& has_base_index
)
857 nir_intrinsic_set_base(first
->intrin
, nir_intrinsic_base(low
->intrin
));
859 first
->key
= low
->key
;
860 first
->offset
= low
->offset
;
861 first
->best_align
= get_best_align(low
);
865 nir_instr_remove(second
->instr
);
/*
 * Combine the stores "low" and "high" (which write adjacent/overlapping ranges
 * of the same resource) into one vectorized store.
 *
 * The combined store is emitted at the position of "second" (the later of the
 * two in program order) so any intervening accesses are still ordered
 * correctly; "first" is removed. "high_start" is the offset of "high"'s data
 * relative to "low"'s, in bits.
 */
static void
vectorize_stores(nir_builder *b, struct vectorize_ctx *ctx,
                 struct entry *low, struct entry *high,
                 struct entry *first, struct entry *second,
                 unsigned new_bit_size, unsigned new_num_components,
                 unsigned high_start)
{
   ASSERTED unsigned low_size = low->intrin->num_components * get_bit_size(low);
   assert(low_size % new_bit_size == 0);

   b->cursor = nir_before_instr(second->instr);

   /* get new writemasks: rescale each mask to new_bit_size-sized components,
    * then shift "high"'s mask to its position within the combined value */
   uint32_t low_write_mask = nir_intrinsic_write_mask(low->intrin);
   uint32_t high_write_mask = nir_intrinsic_write_mask(high->intrin);
   low_write_mask = update_writemask(low_write_mask, get_bit_size(low), new_bit_size);
   high_write_mask = update_writemask(high_write_mask, get_bit_size(high), new_bit_size);
   high_write_mask <<= high_start / new_bit_size;

   uint32_t write_mask = low_write_mask | high_write_mask;

   /* convert booleans */
   nir_ssa_def *low_val = low->intrin->src[low->info->value_src].ssa;
   nir_ssa_def *high_val = high->intrin->src[high->info->value_src].ssa;
   low_val = low_val->bit_size == 1 ? nir_b2i(b, low_val, 32) : low_val;
   high_val = high_val->bit_size == 1 ? nir_b2i(b, high_val, 32) : high_val;

   /* combine the data: where both stores write a component, the one that is
    * later in program order ("second") wins */
   nir_ssa_def *data_channels[NIR_MAX_VEC_COMPONENTS];
   for (unsigned i = 0; i < new_num_components; i++) {
      bool set_low = low_write_mask & (1 << i);
      bool set_high = high_write_mask & (1 << i);

      if (set_low && (!set_high || low == second)) {
         unsigned offset = i * new_bit_size;
         data_channels[i] = nir_extract_bits(b, &low_val, 1, offset, 1, new_bit_size);
      } else if (set_high) {
         assert(!set_low || high == second);
         unsigned offset = i * new_bit_size - high_start;
         data_channels[i] = nir_extract_bits(b, &high_val, 1, offset, 1, new_bit_size);
      } else {
         /* component written by neither store; masked off via write_mask */
         data_channels[i] = nir_ssa_undef(b, 1, new_bit_size);
      }
   }
   nir_ssa_def *data = nir_vec(b, data_channels, new_num_components);

   /* update the intrinsic */
   nir_intrinsic_set_write_mask(second->intrin, write_mask);
   second->intrin->num_components = data->num_components;

   const struct intrinsic_info *info = second->info;
   assert(info->value_src >= 0);
   nir_instr_rewrite_src(second->instr, &second->intrin->src[info->value_src],
                         nir_src_for_ssa(data));

   /* update the offset: the combined store starts at "low"'s offset */
   if (second != low && info->base_src >= 0)
      nir_instr_rewrite_src(second->instr, &second->intrin->src[info->base_src],
                            low->intrin->src[info->base_src]);

   /* update the deref */
   if (info->deref_src >= 0) {
      b->cursor = nir_before_instr(second->instr);
      second->deref = cast_deref(b, new_num_components, new_bit_size,
                                 nir_src_as_deref(low->intrin->src[info->deref_src]));
      nir_instr_rewrite_src(second->instr, &second->intrin->src[info->deref_src],
                            nir_src_for_ssa(&second->deref->dest.ssa));
   }

   /* update base/align */
   bool has_base_index =
      nir_intrinsic_infos[second->intrin->intrinsic].index_map[NIR_INTRINSIC_BASE];

   if (second != low && has_base_index)
      nir_intrinsic_set_base(second->intrin, nir_intrinsic_base(low->intrin));

   /* the surviving entry now represents the merged access at "low"'s offset */
   second->key = low->key;
   second->offset = low->offset;
   second->best_align = get_best_align(low);

   update_align(second);

   list_del(&first->head);
   nir_instr_remove(first->instr);
}
/* Returns true if it can prove that "a" and "b" point to different resources. */
static bool
resources_different(nir_ssa_def *a, nir_ssa_def *b)
{
   /* NOTE(review): reconstructed guard — an entry without a resource (NULL)
    * can never be proven different. */
   if (!a || !b)
      return false;

   /* two distinct constants (e.g. SSBO block indices) are different resources */
   if (a->parent_instr->type == nir_instr_type_load_const &&
       b->parent_instr->type == nir_instr_type_load_const) {
      return nir_src_as_uint(nir_src_for_ssa(a)) != nir_src_as_uint(nir_src_for_ssa(b));
   }

   /* vulkan_resource_index results differ if set, binding or (recursively,
    * provably) the dynamic index differ */
   if (a->parent_instr->type == nir_instr_type_intrinsic &&
       b->parent_instr->type == nir_instr_type_intrinsic) {
      nir_intrinsic_instr *aintrin = nir_instr_as_intrinsic(a->parent_instr);
      nir_intrinsic_instr *bintrin = nir_instr_as_intrinsic(b->parent_instr);
      if (aintrin->intrinsic == nir_intrinsic_vulkan_resource_index &&
          bintrin->intrinsic == nir_intrinsic_vulkan_resource_index) {
         return nir_intrinsic_desc_set(aintrin) != nir_intrinsic_desc_set(bintrin) ||
                nir_intrinsic_binding(aintrin) != nir_intrinsic_binding(bintrin) ||
                resources_different(aintrin->src[0].ssa, bintrin->src[0].ssa);
      }
   }

   /* can't prove anything */
   return false;
}
982 compare_entries(struct entry
*a
, struct entry
*b
)
984 if (!entry_key_equals(a
->key
, b
->key
))
986 return b
->offset_signed
- a
->offset_signed
;
/* Conservatively returns whether the memory ranges accessed by "a" and "b"
 * might overlap. */
static bool
may_alias(struct entry *a, struct entry *b)
{
   assert(mode_to_index(get_variable_mode(a)) ==
          mode_to_index(get_variable_mode(b)));

   /* if the resources/variables are definitively different and both have
    * ACCESS_RESTRICT, we can assume they do not alias. */
   bool res_different = a->key->var != b->key->var ||
                        resources_different(a->key->resource, b->key->resource);
   if (res_different && (a->access & ACCESS_RESTRICT) && (b->access & ACCESS_RESTRICT))
      return false;

   /* we can't compare offsets if the resources/variables might be different */
   if (a->key->var != b->key->var || a->key->resource != b->key->resource)
      return true;

   /* use adjacency information */
   /* TODO: we can look closer at the entry keys */
   int64_t diff = compare_entries(a, b);
   if (diff != INT64_MAX) {
      /* same key: the accesses alias iff the later one starts before the
       * earlier one ends */
      /* with atomics, intrin->num_components can be 0 */
      if (diff < 0)
         return llabs(diff) < MAX2(b->intrin->num_components, 1u) * (get_bit_size(b) / 8u);
      else
         return diff < MAX2(a->intrin->num_components, 1u) * (get_bit_size(a) / 8u);
   }

   /* TODO: we can use deref information */
   return true;
}
/* Returns true if merging "first" and "second" (which moves one of them to the
 * other's position in the instruction stream) could be observed through an
 * aliasing access ordered between them. */
static bool
check_for_aliasing(struct vectorize_ctx *ctx, struct entry *first, struct entry *second)
{
   nir_variable_mode mode = get_variable_mode(first);
   /* read-only modes can never be written between the two accesses */
   if (mode & (nir_var_uniform | nir_var_system_value |
               nir_var_mem_push_const | nir_var_mem_ubo))
      return false;

   unsigned mode_index = mode_to_index(mode);
   if (first->is_store) {
      /* find first entry that aliases "first" */
      list_for_each_entry_from(struct entry, next, first, &ctx->entries[mode_index], head) {
         /* NOTE(review): reconstructed from dropped lines — skip "first"
          * itself; reaching "second" without an aliasing hit means the merge
          * is safe. Confirm against upstream. */
         if (next == first)
            continue;
         if (next == second)
            return false;
         if (may_alias(first, next))
            return true;
      }
   } else {
      /* find previous store that aliases this load */
      list_for_each_entry_from_rev(struct entry, prev, second, &ctx->entries[mode_index], head) {
         if (prev == second)
            continue;
         if (prev == first)
            return false;
         if (prev->is_store && may_alias(second, prev))
            return true;
      }
   }

   return false;
}
/* Returns true if we must NOT vectorize with "low" as the lower entry because
 * of robust buffer access: with robustness enabled, an access whose offset
 * wraps around must remain out-of-bounds instead of being folded into a
 * neighbouring in-bounds access. */
static bool
check_for_robustness(struct vectorize_ctx *ctx, struct entry *low)
{
   nir_variable_mode mode = get_variable_mode(low);
   if (mode & ctx->robust_modes) {
      unsigned low_bit_size = get_bit_size(low);
      unsigned low_size = low->intrin->num_components * low_bit_size;
      /* NOTE(review): low_size is in bits while offset_signed is in bytes, so
       * this rejects ~8x more than a byte-based check would — conservative and
       * therefore safe, but confirm whether bytes were intended. */

      /* don't attempt to vectorize accesses if the offset can overflow. */
      /* TODO: handle indirect accesses. */
      return low->offset_signed < 0 && low->offset_signed + low_size >= 0;
   }

   return false;
}
/* Returns whether "type" is a vector whose explicit stride differs from the
 * natural, tightly packed stride of its scalar elements. */
static bool
is_strided_vector(const struct glsl_type *type)
{
   if (!glsl_type_is_vector(type))
      return false;

   unsigned stride = glsl_get_explicit_stride(type);
   if (stride == 0)
      return false;

   return stride != type_scalar_size_bytes(glsl_get_array_element(type));
}
/* Tries to replace the loads/stores "low"/"high" (the pair sorted by offset)
 * with a single vectorized access. "first"/"second" are the same two entries
 * sorted by position in the instruction stream. Returns true on success. */
static bool
try_vectorize(nir_function_impl *impl, struct vectorize_ctx *ctx,
              struct entry *low, struct entry *high,
              struct entry *first, struct entry *second)
{
   if (!(get_variable_mode(first) & ctx->modes) ||
       !(get_variable_mode(second) & ctx->modes))
      return false;

   /* an aliasing access between the two would observe the merge */
   if (check_for_aliasing(ctx, first, second))
      return false;

   if (check_for_robustness(ctx, low))
      return false;

   /* we can only vectorize non-volatile loads/stores of the same type and with
    * the same access */
   if (first->info != second->info || first->access != second->access ||
       (first->access & ACCESS_VOLATILE) || first->info->is_atomic)
      return false;

   /* don't attempt to vectorize accesses of row-major matrix columns */
   /* NOTE(review): guard reconstructed — only deref-based accesses carry a
    * GLSL type to inspect. */
   if (first->deref) {
      const struct glsl_type *first_type = first->deref->type;
      const struct glsl_type *second_type = second->deref->type;
      if (is_strided_vector(first_type) || is_strided_vector(second_type))
         return false;
   }

   /* gather information */
   uint64_t diff = high->offset_signed - low->offset_signed;
   unsigned low_bit_size = get_bit_size(low);
   unsigned high_bit_size = get_bit_size(high);
   unsigned low_size = low->intrin->num_components * low_bit_size;
   unsigned high_size = high->intrin->num_components * high_bit_size;
   /* total span of the combined access, in bits */
   unsigned new_size = MAX2(diff * 8u + high_size, low_size);

   /* find a good bit size for the new load/store */
   unsigned new_bit_size = 0;
   if (new_bitsize_acceptable(ctx, low_bit_size, low, high, new_size)) {
      new_bit_size = low_bit_size;
   } else if (low_bit_size != high_bit_size &&
              new_bitsize_acceptable(ctx, high_bit_size, low, high, new_size)) {
      new_bit_size = high_bit_size;
   } else {
      /* NOTE(review): loop seed reconstructed — scan the remaining candidate
       * sizes from the largest NIR bit size downwards. */
      new_bit_size = 64;
      for (; new_bit_size >= 8; new_bit_size /= 2) {
         /* don't repeat trying out bitsizes */
         if (new_bit_size == low_bit_size || new_bit_size == high_bit_size)
            continue;
         if (new_bitsize_acceptable(ctx, new_bit_size, low, high, new_size))
            break;
      }
      if (new_bit_size < 8)
         return false;
   }
   unsigned new_num_components = new_size / new_bit_size;

   /* vectorize the loads/stores */
   nir_builder b;
   nir_builder_init(&b, impl);

   if (first->is_store)
      vectorize_stores(&b, ctx, low, high, first, second,
                       new_bit_size, new_num_components, diff * 8u);
   else
      vectorize_loads(&b, ctx, low, high, first, second,
                      new_bit_size, new_num_components, diff * 8u);

   return true;
}
/* For one hash table (loads or stores of one mode): sorts each bucket of
 * same-key entries by offset, greedily merges touching/overlapping neighbours
 * and refreshes alignment info on whatever remains. Clears the table when
 * done. Returns true on any change. */
static bool
vectorize_entries(struct vectorize_ctx *ctx, nir_function_impl *impl, struct hash_table *ht)
{
   if (!ht)
      return false;

   bool progress = false;
   hash_table_foreach(ht, entry) {
      struct util_dynarray *arr = entry->data;
      /* NOTE(review): reconstructed guard — skip emptied buckets. */
      if (!arr->size)
         continue;

      qsort(util_dynarray_begin(arr),
            util_dynarray_num_elements(arr, struct entry *),
            sizeof(struct entry *), &sort_entries);

      unsigned i = 0;
      for (; i < util_dynarray_num_elements(arr, struct entry *) - 1; i++) {
         struct entry *low = *util_dynarray_element(arr, struct entry *, i);
         struct entry *high = *util_dynarray_element(arr, struct entry *, i + 1);

         /* only neighbours that touch or overlap can be merged */
         uint64_t diff = high->offset_signed - low->offset_signed;
         if (diff > get_bit_size(low) / 8u * low->intrin->num_components) {
            progress |= update_align(low);
            continue;
         }

         struct entry *first = low->index < high->index ? low : high;
         struct entry *second = low->index < high->index ? high : low;

         if (try_vectorize(impl, ctx, low, high, first, second)) {
            /* put the surviving entry in "high"'s slot so it can merge with
             * the next neighbour on the following iteration */
            *util_dynarray_element(arr, struct entry *, i) = NULL;
            *util_dynarray_element(arr, struct entry *, i + 1) = low->is_store ? second : first;
            progress = true;
         } else {
            progress |= update_align(low);
         }
      }

      struct entry *last = *util_dynarray_element(arr, struct entry *, i);
      progress |= update_align(last);
   }

   _mesa_hash_table_clear(ht, delete_entry_dynarray);

   return progress;
}
/* If "instr" acts as a memory barrier, flush (vectorize) the entries gathered
 * so far for every mode the barrier affects and return true; returns false for
 * non-barrier instructions. Acquire semantics flush loads, release semantics
 * flush stores, so accesses are never vectorized across the barrier. */
static bool
handle_barrier(struct vectorize_ctx *ctx, bool *progress, nir_function_impl *impl, nir_instr *instr)
{
   nir_variable_mode modes = 0;
   bool acquire = true;
   bool release = true;
   if (instr->type == nir_instr_type_intrinsic) {
      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
      switch (intrin->intrinsic) {
      case nir_intrinsic_group_memory_barrier:
      case nir_intrinsic_memory_barrier:
         modes = nir_var_mem_ssbo | nir_var_mem_shared | nir_var_mem_global;
         break;
      /* prevent speculative loads/stores */
      case nir_intrinsic_discard_if:
      case nir_intrinsic_discard:
         modes = nir_var_all;
         break;
      case nir_intrinsic_memory_barrier_buffer:
         modes = nir_var_mem_ssbo | nir_var_mem_global;
         break;
      case nir_intrinsic_memory_barrier_shared:
         modes = nir_var_mem_shared;
         break;
      case nir_intrinsic_scoped_barrier:
         /* NOTE(review): break/default lines in this case reconstructed from
          * dropped lines — confirm against upstream. */
         if (nir_intrinsic_memory_scope(intrin) == NIR_SCOPE_NONE)
            break;

         modes = nir_intrinsic_memory_modes(intrin) & (nir_var_mem_ssbo |
                                                       nir_var_mem_shared |
                                                       nir_var_mem_global);
         acquire = nir_intrinsic_memory_semantics(intrin) & NIR_MEMORY_ACQUIRE;
         release = nir_intrinsic_memory_semantics(intrin) & NIR_MEMORY_RELEASE;
         switch (nir_intrinsic_memory_scope(intrin)) {
         case NIR_SCOPE_INVOCATION:
         case NIR_SCOPE_SUBGROUP:
            /* a barrier should never be required for correctness with these scopes */
            modes = 0;
            break;
         default:
            break;
         }
         break;
      default:
         return false;
      }
   } else if (instr->type == nir_instr_type_call) {
      /* a call may contain arbitrary memory accesses */
      modes = nir_var_all;
   } else {
      return false;
   }

   while (modes) {
      unsigned mode_index = u_bit_scan(&modes);
      if ((1 << mode_index) == nir_var_mem_global) {
         /* Global should be rolled in with SSBO */
         assert(list_is_empty(&ctx->entries[mode_index]));
         assert(ctx->loads[mode_index] == NULL);
         assert(ctx->stores[mode_index] == NULL);
         continue;
      }

      if (acquire)
         *progress |= vectorize_entries(ctx, impl, ctx->loads[mode_index]);
      if (release)
         *progress |= vectorize_entries(ctx, impl, ctx->stores[mode_index]);
   }

   return true;
}
/* Scans one basic block: gathers load/store entries per variable mode into
 * hash tables keyed by resource/variable, flushing at barriers, then sorts
 * and combines the gathered entries. Returns true on any change. */
static bool
process_block(nir_function_impl *impl, struct vectorize_ctx *ctx, nir_block *block)
{
   bool progress = false;

   /* reset per-block state */
   for (unsigned i = 0; i < nir_num_variable_modes; i++) {
      list_inithead(&ctx->entries[i]);
      /* NOTE(review): NULL guards reconstructed — the tables are created
       * lazily below. */
      if (ctx->loads[i])
         _mesa_hash_table_clear(ctx->loads[i], delete_entry_dynarray);
      if (ctx->stores[i])
         _mesa_hash_table_clear(ctx->stores[i], delete_entry_dynarray);
   }

   /* create entries */
   unsigned next_index = 0;

   nir_foreach_instr_safe(instr, block) {
      /* barriers flush (vectorize) everything gathered so far */
      if (handle_barrier(ctx, &progress, impl, instr))
         continue;

      /* gather information */
      if (instr->type != nir_instr_type_intrinsic)
         continue;
      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

      const struct intrinsic_info *info = get_info(intrin->intrinsic);
      if (!info)
         continue;

      nir_variable_mode mode = info->mode;
      if (!mode)
         mode = nir_src_as_deref(intrin->src[info->deref_src])->mode;
      if (!(mode & aliasing_modes(ctx->modes)))
         continue;
      unsigned mode_index = mode_to_index(mode);

      /* create entry */
      struct entry *entry = create_entry(ctx, info, intrin);
      /* index records program order so the merge can tell which of two
       * entries comes first in the instruction stream */
      entry->index = next_index++;

      list_addtail(&entry->head, &ctx->entries[mode_index]);

      /* add the entry to a hash table */

      struct hash_table *adj_ht = NULL;
      if (entry->is_store) {
         if (!ctx->stores[mode_index])
            ctx->stores[mode_index] = _mesa_hash_table_create(ctx, &hash_entry_key, &entry_key_equals);
         adj_ht = ctx->stores[mode_index];
      } else {
         if (!ctx->loads[mode_index])
            ctx->loads[mode_index] = _mesa_hash_table_create(ctx, &hash_entry_key, &entry_key_equals);
         adj_ht = ctx->loads[mode_index];
      }

      uint32_t key_hash = hash_entry_key(entry->key);
      struct hash_entry *adj_entry = _mesa_hash_table_search_pre_hashed(adj_ht, key_hash, entry->key);
      struct util_dynarray *arr;
      if (adj_entry && adj_entry->data) {
         arr = (struct util_dynarray *)adj_entry->data;
      } else {
         arr = ralloc(ctx, struct util_dynarray);
         util_dynarray_init(arr, arr);
         _mesa_hash_table_insert_pre_hashed(adj_ht, key_hash, entry->key, arr);
      }
      util_dynarray_append(arr, struct entry *, entry);
   }

   /* sort and combine entries */
   for (unsigned i = 0; i < nir_num_variable_modes; i++) {
      progress |= vectorize_entries(ctx, impl, ctx->loads[i]);
      progress |= vectorize_entries(ctx, impl, ctx->stores[i]);
   }

   return progress;
}
1353 nir_opt_load_store_vectorize(nir_shader
*shader
, nir_variable_mode modes
,
1354 nir_should_vectorize_mem_func callback
,
1355 nir_variable_mode robust_modes
)
1357 bool progress
= false;
1359 struct vectorize_ctx
*ctx
= rzalloc(NULL
, struct vectorize_ctx
);
1361 ctx
->callback
= callback
;
1362 ctx
->robust_modes
= robust_modes
;
1364 nir_shader_index_vars(shader
, modes
);
1366 nir_foreach_function(function
, shader
) {
1367 if (function
->impl
) {
1368 if (modes
& nir_var_function_temp
)
1369 nir_function_impl_index_vars(function
->impl
);
1371 nir_foreach_block(block
, function
->impl
)
1372 progress
|= process_block(function
->impl
, ctx
, block
);
1374 nir_metadata_preserve(function
->impl
,
1375 nir_metadata_block_index
|
1376 nir_metadata_dominance
|
1377 nir_metadata_live_ssa_defs
);