2 * Copyright © 2015 Intel Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 #include "compiler/nir/nir.h"
26 #include "util/u_dynarray.h"
29 * \file brw_nir_analyze_ubo_ranges.c
31 * This pass decides which portions of UBOs to upload as push constants,
32 * so shaders can access them as part of the thread payload, rather than
33 * having to issue expensive memory reads to pull the data.
35 * The 3DSTATE_CONSTANT_* mechanism can push data from up to 4 different
36 * buffers, in GRF (256-bit/32-byte) units.
38 * To do this, we examine NIR load_ubo intrinsics, recording the number of
39 * loads at each offset. We track offsets at a 32-byte granularity, so even
40 * fields with a bit of padding between them tend to fall into contiguous
41 * ranges. We build a list of these ranges, tracking their "cost" (number
42 * of registers required) and "benefit" (number of pull loads eliminated
43 * by pushing the range). We then sort the list to obtain the four best
44 * ranges (most benefit for the least cost).
47 struct ubo_range_entry
49 struct brw_ubo_range range
;
54 score(const struct ubo_range_entry
*entry
)
56 return 2 * entry
->benefit
- entry
->range
.length
;
60 * Compares score for two UBO range entries.
62 * For a descending qsort().
65 cmp_ubo_range_entry(const void *va
, const void *vb
)
67 const struct ubo_range_entry
*a
= va
;
68 const struct ubo_range_entry
*b
= vb
;
70 /* Rank based on scores */
71 int delta
= score(b
) - score(a
);
73 /* Then use the UBO block index as a tie-breaker */
75 delta
= b
->range
.block
- a
->range
.block
;
77 /* Finally use the UBO offset as a second tie-breaker */
79 delta
= b
->range
.block
- a
->range
.block
;
86 /* Each bit in the offsets bitfield represents a 32-byte section of data.
87 * If it's set to one, there is interesting UBO data at that offset. If
88 * not, there's a "hole" - padding between data - or just nothing at all.
94 struct ubo_analysis_state
96 struct hash_table
*blocks
;
97 bool uses_regular_uniforms
;
100 static struct ubo_block_info
*
101 get_block_info(struct ubo_analysis_state
*state
, int block
)
103 uint32_t hash
= block
+ 1;
104 void *key
= (void *) (uintptr_t) hash
;
106 struct hash_entry
*entry
=
107 _mesa_hash_table_search_pre_hashed(state
->blocks
, hash
, key
);
110 return (struct ubo_block_info
*) entry
->data
;
112 struct ubo_block_info
*info
=
113 rzalloc(state
->blocks
, struct ubo_block_info
);
114 _mesa_hash_table_insert_pre_hashed(state
->blocks
, hash
, key
, info
);
120 analyze_ubos_block(struct ubo_analysis_state
*state
, nir_block
*block
)
122 nir_foreach_instr(instr
, block
) {
123 if (instr
->type
!= nir_instr_type_intrinsic
)
126 nir_intrinsic_instr
*intrin
= nir_instr_as_intrinsic(instr
);
127 switch (intrin
->intrinsic
) {
128 case nir_intrinsic_load_uniform
:
129 case nir_intrinsic_image_deref_load
:
130 case nir_intrinsic_image_deref_store
:
131 case nir_intrinsic_image_deref_atomic_add
:
132 case nir_intrinsic_image_deref_atomic_imin
:
133 case nir_intrinsic_image_deref_atomic_umin
:
134 case nir_intrinsic_image_deref_atomic_imax
:
135 case nir_intrinsic_image_deref_atomic_umax
:
136 case nir_intrinsic_image_deref_atomic_and
:
137 case nir_intrinsic_image_deref_atomic_or
:
138 case nir_intrinsic_image_deref_atomic_xor
:
139 case nir_intrinsic_image_deref_atomic_exchange
:
140 case nir_intrinsic_image_deref_atomic_comp_swap
:
141 case nir_intrinsic_image_deref_size
:
142 state
->uses_regular_uniforms
= true;
145 case nir_intrinsic_load_ubo
:
146 break; /* Fall through to the analysis below */
149 continue; /* Not a uniform or UBO intrinsic */
152 if (nir_src_is_const(intrin
->src
[0]) &&
153 nir_src_is_const(intrin
->src
[1])) {
154 const int block
= nir_src_as_uint(intrin
->src
[0]);
155 const unsigned byte_offset
= nir_src_as_uint(intrin
->src
[1]);
156 const int offset
= byte_offset
/ 32;
158 /* Avoid shifting by larger than the width of our bitfield, as this
159 * is undefined in C. Even if we require multiple bits to represent
160 * the entire value, it's OK to record a partial value - the backend
161 * is capable of falling back to pull loads for later components of
162 * vectors, as it has to shrink ranges for other reasons anyway.
167 /* The value might span multiple 32-byte chunks. */
168 const int bytes
= nir_intrinsic_dest_components(intrin
) *
169 (nir_dest_bit_size(intrin
->dest
) / 8);
170 const int start
= ROUND_DOWN_TO(byte_offset
, 32);
171 const int end
= ALIGN(byte_offset
+ bytes
, 32);
172 const int chunks
= (end
- start
) / 32;
174 /* TODO: should we count uses in loops as higher benefit? */
176 struct ubo_block_info
*info
= get_block_info(state
, block
);
177 info
->offsets
|= ((1ull << chunks
) - 1) << offset
;
178 info
->uses
[offset
]++;
184 print_ubo_entry(FILE *file
,
185 const struct ubo_range_entry
*entry
,
186 struct ubo_analysis_state
*state
)
188 struct ubo_block_info
*info
= get_block_info(state
, entry
->range
.block
);
191 "block %2d, start %2d, length %2d, bits = %"PRIx64
", "
192 "benefit %2d, cost %2d, score = %2d\n",
193 entry
->range
.block
, entry
->range
.start
, entry
->range
.length
,
194 info
->offsets
, entry
->benefit
, entry
->range
.length
, score(entry
));
198 brw_nir_analyze_ubo_ranges(const struct brw_compiler
*compiler
,
200 const struct brw_vs_prog_key
*vs_key
,
201 struct brw_ubo_range out_ranges
[4])
203 const struct gen_device_info
*devinfo
= compiler
->devinfo
;
205 if ((devinfo
->gen
<= 7 && !devinfo
->is_haswell
) ||
206 !compiler
->scalar_stage
[nir
->info
.stage
]) {
207 memset(out_ranges
, 0, 4 * sizeof(struct brw_ubo_range
));
211 void *mem_ctx
= ralloc_context(NULL
);
213 struct ubo_analysis_state state
= {
214 .uses_regular_uniforms
= false,
216 _mesa_hash_table_create(mem_ctx
, NULL
, _mesa_key_pointer_equal
),
219 switch (nir
->info
.stage
) {
220 case MESA_SHADER_VERTEX
:
221 if (vs_key
&& vs_key
->nr_userclip_plane_consts
> 0)
222 state
.uses_regular_uniforms
= true;
225 case MESA_SHADER_COMPUTE
:
226 /* Compute shaders use push constants to get the subgroup ID so it's
227 * best to just assume some system values are pushed.
229 state
.uses_regular_uniforms
= true;
236 /* Walk the IR, recording how many times each UBO block/offset is used. */
237 nir_foreach_function(function
, nir
) {
238 if (function
->impl
) {
239 nir_foreach_block(block
, function
->impl
) {
240 analyze_ubos_block(&state
, block
);
245 /* Find ranges: a block, starting 32-byte offset, and length. */
246 struct util_dynarray ranges
;
247 util_dynarray_init(&ranges
, mem_ctx
);
249 hash_table_foreach(state
.blocks
, entry
) {
250 const int b
= entry
->hash
- 1;
251 const struct ubo_block_info
*info
= entry
->data
;
252 uint64_t offsets
= info
->offsets
;
254 /* Walk through the offsets bitfield, finding contiguous regions of
257 * 0000000001111111111111000000000000111111111111110000000011111100
258 * ^^^^^^^^^^^^^ ^^^^^^^^^^^^^^ ^^^^^^
260 * Each of these will become a UBO range.
262 while (offsets
!= 0) {
263 /* Find the first 1 in the offsets bitfield. This represents the
264 * start of a range of interesting UBO data. Make it zero-indexed.
266 int first_bit
= ffsll(offsets
) - 1;
268 /* Find the first 0 bit in offsets beyond first_bit. To find the
269 * first zero bit, we find the first 1 bit in the complement. In
270 * order to ignore bits before first_bit, we mask off those bits.
272 int first_hole
= ffsll(~offsets
& ~((1ull << first_bit
) - 1)) - 1;
274 if (first_hole
== -1) {
275 /* If we didn't find a hole, then set it to the end of the
276 * bitfield. There are no more ranges to process.
281 /* We've processed all bits before first_hole. Mask them off. */
282 offsets
&= ~((1ull << first_hole
) - 1);
285 struct ubo_range_entry
*entry
=
286 util_dynarray_grow(&ranges
, struct ubo_range_entry
, 1);
288 entry
->range
.block
= b
;
289 entry
->range
.start
= first_bit
;
290 /* first_hole is one beyond the end, so we don't need to add 1 */
291 entry
->range
.length
= first_hole
- first_bit
;
294 for (int i
= 0; i
< entry
->range
.length
; i
++)
295 entry
->benefit
+= info
->uses
[first_bit
+ i
];
299 int nr_entries
= ranges
.size
/ sizeof(struct ubo_range_entry
);
302 util_dynarray_foreach(&ranges
, struct ubo_range_entry
, entry
) {
303 print_ubo_entry(stderr
, entry
, &state
);
307 /* TODO: Consider combining ranges.
309 * We can only push 3-4 ranges via 3DSTATE_CONSTANT_XS. If there are
310 * more ranges, and two are close by with only a small hole, it may be
311 * worth combining them. The holes will waste register space, but the
312 * benefit of removing pulls may outweigh that cost.
315 /* Sort the list so the most beneficial ranges are at the front. */
316 qsort(ranges
.data
, nr_entries
, sizeof(struct ubo_range_entry
),
317 cmp_ubo_range_entry
);
319 struct ubo_range_entry
*entries
= ranges
.data
;
321 /* Return the top 4 or so. We drop by one if regular uniforms are in
322 * use, assuming one push buffer will be dedicated to those. We may
323 * also only get 3 on Haswell if we can't write INSTPM.
325 * The backend may need to shrink these ranges to ensure that they
326 * don't exceed the maximum push constant limits. It can simply drop
327 * the tail of the list, as that's the least valuable portion. We
328 * unfortunately can't truncate it here, because we don't know what
329 * the backend is planning to do with regular uniforms.
331 const int max_ubos
= (compiler
->constant_buffer_0_is_relative
? 3 : 4) -
332 state
.uses_regular_uniforms
;
333 nr_entries
= MIN2(nr_entries
, max_ubos
);
335 for (int i
= 0; i
< nr_entries
; i
++) {
336 out_ranges
[i
] = entries
[i
].range
;
338 for (int i
= nr_entries
; i
< 4; i
++) {
339 out_ranges
[i
].block
= 0;
340 out_ranges
[i
].start
= 0;
341 out_ranges
[i
].length
= 0;
344 ralloc_free(ranges
.mem_ctx
);