intel/compiler: Properly consider UBO loads that cross 32B boundaries.
[mesa.git] / src / intel / compiler / brw_nir_analyze_ubo_ranges.c
1 /*
2 * Copyright © 2015 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_nir.h"
25 #include "compiler/nir/nir.h"
26 #include "util/u_dynarray.h"
27
28 /**
29 * \file brw_nir_analyze_ubo_ranges.c
30 *
31 * This pass decides which portions of UBOs to upload as push constants,
32 * so shaders can access them as part of the thread payload, rather than
33 * having to issue expensive memory reads to pull the data.
34 *
35 * The 3DSTATE_CONSTANT_* mechanism can push data from up to 4 different
36 * buffers, in GRF (256-bit/32-byte) units.
37 *
38 * To do this, we examine NIR load_ubo intrinsics, recording the number of
39 * loads at each offset. We track offsets at a 32-byte granularity, so even
40 * fields with a bit of padding between them tend to fall into contiguous
41 * ranges. We build a list of these ranges, tracking their "cost" (number
42 * of registers required) and "benefit" (number of pull loads eliminated
43 * by pushing the range). We then sort the list to obtain the four best
44 * ranges (most benefit for the least cost).
45 */
46
47 struct ubo_range_entry
48 {
49 struct brw_ubo_range range;
50 int benefit;
51 };
52
53 static int
54 score(const struct ubo_range_entry *entry)
55 {
56 return 2 * entry->benefit - entry->range.length;
57 }
58
59 /**
60 * Compares score for two UBO range entries.
61 *
62 * For a descending qsort().
63 */
64 static int
65 cmp_ubo_range_entry(const void *va, const void *vb)
66 {
67 const struct ubo_range_entry *a = va;
68 const struct ubo_range_entry *b = vb;
69
70 /* Rank based on scores */
71 int delta = score(b) - score(a);
72
73 /* Then use the UBO block index as a tie-breaker */
74 if (delta == 0)
75 delta = b->range.block - a->range.block;
76
77 /* Finally use the UBO offset as a second tie-breaker */
78 if (delta == 0)
79 delta = b->range.block - a->range.block;
80
81 return delta;
82 }
83
84 struct ubo_block_info
85 {
86 /* Each bit in the offsets bitfield represents a 32-byte section of data.
87 * If it's set to one, there is interesting UBO data at that offset. If
88 * not, there's a "hole" - padding between data - or just nothing at all.
89 */
90 uint64_t offsets;
91 uint8_t uses[64];
92 };
93
94 struct ubo_analysis_state
95 {
96 struct hash_table *blocks;
97 bool uses_regular_uniforms;
98 };
99
100 static struct ubo_block_info *
101 get_block_info(struct ubo_analysis_state *state, int block)
102 {
103 uint32_t hash = block + 1;
104 void *key = (void *) (uintptr_t) hash;
105
106 struct hash_entry *entry =
107 _mesa_hash_table_search_pre_hashed(state->blocks, hash, key);
108
109 if (entry)
110 return (struct ubo_block_info *) entry->data;
111
112 struct ubo_block_info *info =
113 rzalloc(state->blocks, struct ubo_block_info);
114 _mesa_hash_table_insert_pre_hashed(state->blocks, hash, key, info);
115
116 return info;
117 }
118
119 static void
120 analyze_ubos_block(struct ubo_analysis_state *state, nir_block *block)
121 {
122 nir_foreach_instr(instr, block) {
123 if (instr->type != nir_instr_type_intrinsic)
124 continue;
125
126 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
127 if (intrin->intrinsic == nir_intrinsic_load_uniform)
128 state->uses_regular_uniforms = true;
129
130 if (intrin->intrinsic != nir_intrinsic_load_ubo)
131 continue;
132
133 nir_const_value *block_const = nir_src_as_const_value(intrin->src[0]);
134 nir_const_value *offset_const = nir_src_as_const_value(intrin->src[1]);
135
136 if (block_const && offset_const) {
137 const int block = block_const->u32[0];
138 const int offset = offset_const->u32[0] / 32;
139
140 /* Avoid shifting by larger than the width of our bitfield, as this
141 * is undefined in C. Even if we require multiple bits to represent
142 * the entire value, it's OK to record a partial value - the backend
143 * is capable of falling back to pull loads for later components of
144 * vectors, as it has to shrink ranges for other reasons anyway.
145 */
146 if (offset >= 64)
147 continue;
148
149 /* The value might span multiple 32-byte chunks. */
150 const int bytes = nir_intrinsic_dest_components(intrin) *
151 (nir_dest_bit_size(intrin->dest) / 8);
152 const int start = ROUND_DOWN_TO(offset_const->u32[0], 32);
153 const int end = ALIGN(offset_const->u32[0] + bytes, 32);
154 const int chunks = (end - start) / 32;
155
156 /* TODO: should we count uses in loops as higher benefit? */
157
158 struct ubo_block_info *info = get_block_info(state, block);
159 info->offsets |= ((1ull << chunks) - 1) << offset;
160 info->uses[offset]++;
161 }
162 }
163 }
164
165 static void
166 print_ubo_entry(FILE *file,
167 const struct ubo_range_entry *entry,
168 struct ubo_analysis_state *state)
169 {
170 struct ubo_block_info *info = get_block_info(state, entry->range.block);
171
172 fprintf(file,
173 "block %2d, start %2d, length %2d, bits = %"PRIx64", "
174 "benefit %2d, cost %2d, score = %2d\n",
175 entry->range.block, entry->range.start, entry->range.length,
176 info->offsets, entry->benefit, entry->range.length, score(entry));
177 }
178
179 void
180 brw_nir_analyze_ubo_ranges(const struct brw_compiler *compiler,
181 nir_shader *nir,
182 struct brw_ubo_range out_ranges[4])
183 {
184 const struct gen_device_info *devinfo = compiler->devinfo;
185
186 if ((devinfo->gen <= 7 && !devinfo->is_haswell) ||
187 !compiler->scalar_stage[nir->info.stage]) {
188 memset(out_ranges, 0, 4 * sizeof(struct brw_ubo_range));
189 return;
190 }
191
192 void *mem_ctx = ralloc_context(NULL);
193
194 struct ubo_analysis_state state = {
195 .uses_regular_uniforms = false,
196 .blocks =
197 _mesa_hash_table_create(mem_ctx, NULL, _mesa_key_pointer_equal),
198 };
199
200 /* Walk the IR, recording how many times each UBO block/offset is used. */
201 nir_foreach_function(function, nir) {
202 if (function->impl) {
203 nir_foreach_block(block, function->impl) {
204 analyze_ubos_block(&state, block);
205 }
206 }
207 }
208
209 /* Find ranges: a block, starting 32-byte offset, and length. */
210 struct util_dynarray ranges;
211 util_dynarray_init(&ranges, mem_ctx);
212
213 struct hash_entry *entry;
214 hash_table_foreach(state.blocks, entry) {
215 const int b = entry->hash - 1;
216 const struct ubo_block_info *info = entry->data;
217 uint64_t offsets = info->offsets;
218
219 /* Walk through the offsets bitfield, finding contiguous regions of
220 * set bits:
221 *
222 * 0000000001111111111111000000000000111111111111110000000011111100
223 * ^^^^^^^^^^^^^ ^^^^^^^^^^^^^^ ^^^^^^
224 *
225 * Each of these will become a UBO range.
226 */
227 while (offsets != 0) {
228 /* Find the first 1 in the offsets bitfield. This represents the
229 * start of a range of interesting UBO data. Make it zero-indexed.
230 */
231 int first_bit = ffsll(offsets) - 1;
232
233 /* Find the first 0 bit in offsets beyond first_bit. To find the
234 * first zero bit, we find the first 1 bit in the complement. In
235 * order to ignore bits before first_bit, we mask off those bits.
236 */
237 int first_hole = ffsll(~offsets & ~((1ull << first_bit) - 1)) - 1;
238
239 if (first_hole == -1) {
240 /* If we didn't find a hole, then set it to the end of the
241 * bitfield. There are no more ranges to process.
242 */
243 first_hole = 64;
244 offsets = 0;
245 } else {
246 /* We've processed all bits before first_hole. Mask them off. */
247 offsets &= ~((1ull << first_hole) - 1);
248 }
249
250 struct ubo_range_entry *entry =
251 util_dynarray_grow(&ranges, sizeof(struct ubo_range_entry));
252
253 entry->range.block = b;
254 entry->range.start = first_bit;
255 /* first_hole is one beyond the end, so we don't need to add 1 */
256 entry->range.length = first_hole - first_bit;
257 entry->benefit = 0;
258
259 for (int i = 0; i < entry->range.length; i++)
260 entry->benefit += info->uses[first_bit + i];
261 }
262 }
263
264 int nr_entries = ranges.size / sizeof(struct ubo_range_entry);
265
266 if (0) {
267 util_dynarray_foreach(&ranges, struct ubo_range_entry, entry) {
268 print_ubo_entry(stderr, entry, &state);
269 }
270 }
271
272 /* TODO: Consider combining ranges.
273 *
274 * We can only push 3-4 ranges via 3DSTATE_CONSTANT_XS. If there are
275 * more ranges, and two are close by with only a small hole, it may be
276 * worth combining them. The holes will waste register space, but the
277 * benefit of removing pulls may outweigh that cost.
278 */
279
280 /* Sort the list so the most beneficial ranges are at the front. */
281 qsort(ranges.data, nr_entries, sizeof(struct ubo_range_entry),
282 cmp_ubo_range_entry);
283
284 struct ubo_range_entry *entries = ranges.data;
285
286 /* Return the top 4 or so. We drop by one if regular uniforms are in
287 * use, assuming one push buffer will be dedicated to those. We may
288 * also only get 3 on Haswell if we can't write INSTPM.
289 *
290 * The backend may need to shrink these ranges to ensure that they
291 * don't exceed the maximum push constant limits. It can simply drop
292 * the tail of the list, as that's the least valuable portion. We
293 * unfortunately can't truncate it here, because we don't know what
294 * the backend is planning to do with regular uniforms.
295 */
296 const int max_ubos = (compiler->constant_buffer_0_is_relative ? 3 : 4) -
297 state.uses_regular_uniforms;
298 nr_entries = MIN2(nr_entries, max_ubos);
299
300 for (int i = 0; i < nr_entries; i++) {
301 out_ranges[i] = entries[i].range;
302 }
303 for (int i = nr_entries; i < 4; i++) {
304 out_ranges[i].block = 0;
305 out_ranges[i].start = 0;
306 out_ranges[i].length = 0;
307 }
308
309 ralloc_free(ranges.mem_ctx);
310 }