/*
 * Copyright © 2019 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

25 #include "nir_builder.h"
26 #include "nir_deref.h"
28 #include "util/bitscan.h"
29 #include "util/list.h"
30 #include "util/u_math.h"
/* Combine stores of vectors to the same deref into a single store.
 *
 * This per-block pass keeps track of stores of vectors to the same
 * destination and combines them into the last store of the sequence.  Dead
 * stores (or parts of the store) found during the process are removed.
 *
 * A pending combination becomes an actual combination in various situations:
 * at the end of the block, when another instruction uses the memory or due to
 * barriers.
 *
 * Besides vectors, the pass also looks at array derefs of vectors.  For direct
 * array derefs, it works like a write mask access to the given component.
 * For indirect access there's no way to know beforehand what component it
 * will overlap with, so the combination is finished -- the indirect remains
 * unmodified.
 */

49 /* Keep track of a group of stores that can be combined. All stores share the
52 struct combined_store
{
53 struct list_head link
;
55 nir_component_mask_t write_mask
;
58 /* Latest store added. It is reused when combining. */
59 nir_intrinsic_instr
*latest
;
61 /* Original store for each component. The number of times a store appear
62 * in this array is kept in the store's pass_flags.
64 nir_intrinsic_instr
*stores
[NIR_MAX_VEC_COMPONENTS
];
67 struct combine_stores_state
{
68 nir_variable_mode modes
;
70 /* Pending store combinations. */
71 struct list_head pending
;
73 /* Per function impl state. */
78 /* Allocator and freelist to reuse structs between functions. */
80 struct list_head freelist
;
83 static struct combined_store
*
84 alloc_combined_store(struct combine_stores_state
*state
)
86 struct combined_store
*result
;
87 if (list_is_empty(&state
->freelist
)) {
88 result
= linear_zalloc_child(state
->lin_ctx
, sizeof(*result
));
90 result
= list_first_entry(&state
->freelist
,
91 struct combined_store
,
93 list_del(&result
->link
);
94 memset(result
, 0, sizeof(*result
));
100 free_combined_store(struct combine_stores_state
*state
,
101 struct combined_store
*combo
)
103 list_del(&combo
->link
);
104 combo
->write_mask
= 0;
105 list_add(&combo
->link
, &state
->freelist
);
109 combine_stores(struct combine_stores_state
*state
,
110 struct combined_store
*combo
)
112 assert(combo
->latest
);
113 assert(combo
->latest
->intrinsic
== nir_intrinsic_store_deref
);
115 /* If the combined writemask is the same as the latest store, we know there
116 * is only one store in the combination, so nothing to combine.
118 if ((combo
->write_mask
& nir_intrinsic_write_mask(combo
->latest
)) ==
122 state
->b
.cursor
= nir_before_instr(&combo
->latest
->instr
);
124 /* Build a new vec, to be used as source for the combined store. As it
125 * gets build, remove previous stores that are not needed anymore.
127 nir_ssa_def
*comps
[NIR_MAX_VEC_COMPONENTS
] = {0};
128 unsigned num_components
= glsl_get_vector_elements(combo
->dst
->type
);
129 unsigned bit_size
= combo
->latest
->src
[1].ssa
->bit_size
;
130 for (unsigned i
= 0; i
< num_components
; i
++) {
131 nir_intrinsic_instr
*store
= combo
->stores
[i
];
132 if (combo
->write_mask
& (1 << i
)) {
134 assert(store
->src
[1].is_ssa
);
136 /* If store->num_components == 1 then we are in the deref-of-vec case
137 * and store->src[1] is a scalar. Otherwise, we're a regular vector
138 * load and we have to pick off a component.
140 comps
[i
] = store
->num_components
== 1 ?
142 nir_channel(&state
->b
, store
->src
[1].ssa
, i
);
144 assert(store
->instr
.pass_flags
> 0);
145 if (--store
->instr
.pass_flags
== 0 && store
!= combo
->latest
)
146 nir_instr_remove(&store
->instr
);
148 comps
[i
] = nir_ssa_undef(&state
->b
, 1, bit_size
);
151 assert(combo
->latest
->instr
.pass_flags
== 0);
152 nir_ssa_def
*vec
= nir_vec(&state
->b
, comps
, num_components
);
154 /* Fix the latest store with the combined information. */
155 nir_intrinsic_instr
*store
= combo
->latest
;
157 /* In this case, our store is as an array deref of a vector so we need to
158 * rewrite it to use a deref to the whole vector.
160 if (store
->num_components
== 1) {
161 store
->num_components
= num_components
;
162 nir_instr_rewrite_src(&store
->instr
, &store
->src
[0],
163 nir_src_for_ssa(&combo
->dst
->dest
.ssa
));
166 assert(store
->num_components
== num_components
);
167 nir_intrinsic_set_write_mask(store
, combo
->write_mask
);
168 nir_instr_rewrite_src(&store
->instr
, &store
->src
[1],
169 nir_src_for_ssa(vec
));
170 state
->progress
= true;
174 combine_stores_with_deref(struct combine_stores_state
*state
,
175 nir_deref_instr
*deref
)
177 if ((state
->modes
& deref
->mode
) == 0)
180 list_for_each_entry_safe(struct combined_store
, combo
, &state
->pending
, link
) {
181 if (nir_compare_derefs(combo
->dst
, deref
) & nir_derefs_may_alias_bit
) {
182 combine_stores(state
, combo
);
183 free_combined_store(state
, combo
);
189 combine_stores_with_modes(struct combine_stores_state
*state
,
190 nir_variable_mode modes
)
192 if ((state
->modes
& modes
) == 0)
195 list_for_each_entry_safe(struct combined_store
, combo
, &state
->pending
, link
) {
196 if (combo
->dst
->mode
& modes
) {
197 combine_stores(state
, combo
);
198 free_combined_store(state
, combo
);
203 static struct combined_store
*
204 find_matching_combined_store(struct combine_stores_state
*state
,
205 nir_deref_instr
*deref
)
207 list_for_each_entry(struct combined_store
, combo
, &state
->pending
, link
) {
208 if (nir_compare_derefs(combo
->dst
, deref
) & nir_derefs_equal_bit
)
215 update_combined_store(struct combine_stores_state
*state
,
216 nir_intrinsic_instr
*intrin
)
218 nir_deref_instr
*dst
= nir_src_as_deref(intrin
->src
[0]);
219 if ((dst
->mode
& state
->modes
) == 0)
223 nir_deref_instr
*vec_dst
;
225 if (glsl_type_is_vector(dst
->type
)) {
226 vec_mask
= nir_intrinsic_write_mask(intrin
);
229 /* Besides vectors, only direct array derefs of vectors are handled. */
230 if (dst
->deref_type
!= nir_deref_type_array
||
231 !nir_src_is_const(dst
->arr
.index
) ||
232 !glsl_type_is_vector(nir_deref_instr_parent(dst
)->type
)) {
233 combine_stores_with_deref(state
, dst
);
237 uint64_t index
= nir_src_as_uint(dst
->arr
.index
);
238 vec_dst
= nir_deref_instr_parent(dst
);
240 if (index
>= glsl_get_vector_elements(vec_dst
->type
)) {
241 /* Storing to an invalid index is a no-op. */
242 nir_instr_remove(&intrin
->instr
);
243 state
->progress
= true;
247 vec_mask
= 1 << index
;
250 struct combined_store
*combo
= find_matching_combined_store(state
, vec_dst
);
252 combo
= alloc_combined_store(state
);
253 combo
->dst
= vec_dst
;
254 list_add(&combo
->link
, &state
->pending
);
257 /* Use pass_flags to reference count the store based on how many
258 * components are still used by the combination.
260 intrin
->instr
.pass_flags
= util_bitcount(vec_mask
);
261 combo
->latest
= intrin
;
263 /* Update the combined_store, clearing up older overlapping references. */
264 combo
->write_mask
|= vec_mask
;
266 unsigned i
= u_bit_scan(&vec_mask
);
267 nir_intrinsic_instr
*prev_store
= combo
->stores
[i
];
270 if (--prev_store
->instr
.pass_flags
== 0) {
271 nir_instr_remove(&prev_store
->instr
);
273 assert(glsl_type_is_vector(
274 nir_src_as_deref(prev_store
->src
[0])->type
));
275 nir_component_mask_t prev_mask
= nir_intrinsic_write_mask(prev_store
);
276 nir_intrinsic_set_write_mask(prev_store
, prev_mask
& ~(1 << i
));
278 state
->progress
= true;
280 combo
->stores
[i
] = combo
->latest
;
285 combine_stores_block(struct combine_stores_state
*state
, nir_block
*block
)
287 nir_foreach_instr_safe(instr
, block
) {
288 if (instr
->type
== nir_instr_type_call
) {
289 combine_stores_with_modes(state
, nir_var_shader_out
|
290 nir_var_shader_temp
|
291 nir_var_function_temp
|
298 if (instr
->type
!= nir_instr_type_intrinsic
)
301 nir_intrinsic_instr
*intrin
= nir_instr_as_intrinsic(instr
);
302 switch (intrin
->intrinsic
) {
303 case nir_intrinsic_store_deref
:
304 if (nir_intrinsic_access(intrin
) & ACCESS_VOLATILE
) {
305 nir_deref_instr
*dst
= nir_src_as_deref(intrin
->src
[0]);
306 /* When we see a volatile store, we go ahead and combine all
307 * previous non-volatile stores which touch that address and
308 * specifically don't add the volatile store to the list. This
309 * way we guarantee that the volatile store isn't combined with
310 * anything and no non-volatile stores are combined across a
313 combine_stores_with_deref(state
, dst
);
315 update_combined_store(state
, intrin
);
319 case nir_intrinsic_control_barrier
:
320 case nir_intrinsic_group_memory_barrier
:
321 case nir_intrinsic_memory_barrier
:
322 combine_stores_with_modes(state
, nir_var_shader_out
|
328 case nir_intrinsic_memory_barrier_buffer
:
329 combine_stores_with_modes(state
, nir_var_mem_ssbo
|
333 case nir_intrinsic_memory_barrier_shared
:
334 combine_stores_with_modes(state
, nir_var_mem_shared
);
337 case nir_intrinsic_memory_barrier_tcs_patch
:
338 combine_stores_with_modes(state
, nir_var_shader_out
);
341 case nir_intrinsic_scoped_barrier
:
342 if (nir_intrinsic_memory_semantics(intrin
) & NIR_MEMORY_RELEASE
) {
343 combine_stores_with_modes(state
,
344 nir_intrinsic_memory_modes(intrin
));
348 case nir_intrinsic_emit_vertex
:
349 case nir_intrinsic_emit_vertex_with_counter
:
350 combine_stores_with_modes(state
, nir_var_shader_out
);
353 case nir_intrinsic_load_deref
: {
354 nir_deref_instr
*src
= nir_src_as_deref(intrin
->src
[0]);
355 combine_stores_with_deref(state
, src
);
359 case nir_intrinsic_copy_deref
: {
360 nir_deref_instr
*dst
= nir_src_as_deref(intrin
->src
[0]);
361 nir_deref_instr
*src
= nir_src_as_deref(intrin
->src
[1]);
362 combine_stores_with_deref(state
, dst
);
363 combine_stores_with_deref(state
, src
);
367 case nir_intrinsic_deref_atomic_add
:
368 case nir_intrinsic_deref_atomic_imin
:
369 case nir_intrinsic_deref_atomic_umin
:
370 case nir_intrinsic_deref_atomic_imax
:
371 case nir_intrinsic_deref_atomic_umax
:
372 case nir_intrinsic_deref_atomic_and
:
373 case nir_intrinsic_deref_atomic_or
:
374 case nir_intrinsic_deref_atomic_xor
:
375 case nir_intrinsic_deref_atomic_exchange
:
376 case nir_intrinsic_deref_atomic_comp_swap
: {
377 nir_deref_instr
*dst
= nir_src_as_deref(intrin
->src
[0]);
378 combine_stores_with_deref(state
, dst
);
387 /* At the end of the block, try all the remaining combinations. */
388 combine_stores_with_modes(state
, state
->modes
);
392 combine_stores_impl(struct combine_stores_state
*state
, nir_function_impl
*impl
)
394 state
->progress
= false;
395 nir_builder_init(&state
->b
, impl
);
397 nir_foreach_block(block
, impl
)
398 combine_stores_block(state
, block
);
400 if (state
->progress
) {
401 nir_metadata_preserve(impl
, nir_metadata_block_index
|
402 nir_metadata_dominance
);
404 nir_metadata_preserve(impl
, nir_metadata_all
);
407 return state
->progress
;
411 nir_opt_combine_stores(nir_shader
*shader
, nir_variable_mode modes
)
413 void *mem_ctx
= ralloc_context(NULL
);
414 struct combine_stores_state state
= {
416 .lin_ctx
= linear_zalloc_parent(mem_ctx
, 0),
419 list_inithead(&state
.pending
);
420 list_inithead(&state
.freelist
);
422 bool progress
= false;
424 nir_foreach_function(function
, shader
) {
427 progress
|= combine_stores_impl(&state
, function
->impl
);
430 ralloc_free(mem_ctx
);