nir: Expose nir_remove_unused_io_vars().
[mesa.git] / src / compiler / nir / nir_linking_helpers.c
1 /*
2 * Copyright © 2015 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "nir.h"
25 #include "util/set.h"
26 #include "util/hash_table.h"
27
28 /* This file contains various little helpers for doing simple linking in
29 * NIR. Eventually, we'll probably want a full-blown varying packing
30 * implementation in here. Right now, it just deletes unused things.
31 */
32
33 /**
34 * Returns the bits in the inputs_read, outputs_written, or
35 * system_values_read bitfield corresponding to this variable.
36 */
37 static uint64_t
38 get_variable_io_mask(nir_variable *var, gl_shader_stage stage)
39 {
40 if (var->data.location < 0)
41 return 0;
42
43 unsigned location = var->data.patch ?
44 var->data.location - VARYING_SLOT_PATCH0 : var->data.location;
45
46 assert(var->data.mode == nir_var_shader_in ||
47 var->data.mode == nir_var_shader_out ||
48 var->data.mode == nir_var_system_value);
49 assert(var->data.location >= 0);
50
51 const struct glsl_type *type = var->type;
52 if (nir_is_per_vertex_io(var, stage)) {
53 assert(glsl_type_is_array(type));
54 type = glsl_get_array_element(type);
55 }
56
57 unsigned slots = glsl_count_attribute_slots(type, false);
58 return ((1ull << slots) - 1) << location;
59 }
60
61 static void
62 tcs_add_output_reads(nir_shader *shader, uint64_t *read, uint64_t *patches_read)
63 {
64 nir_foreach_function(function, shader) {
65 if (!function->impl)
66 continue;
67
68 nir_foreach_block(block, function->impl) {
69 nir_foreach_instr(instr, block) {
70 if (instr->type != nir_instr_type_intrinsic)
71 continue;
72
73 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
74 if (intrin->intrinsic != nir_intrinsic_load_deref)
75 continue;
76
77 nir_variable *var =
78 nir_deref_instr_get_variable(nir_src_as_deref(intrin->src[0]));
79
80 if (var->data.mode != nir_var_shader_out)
81 continue;
82
83 if (var->data.patch) {
84 patches_read[var->data.location_frac] |=
85 get_variable_io_mask(var, shader->info.stage);
86 } else {
87 read[var->data.location_frac] |=
88 get_variable_io_mask(var, shader->info.stage);
89 }
90 }
91 }
92 }
93 }
94
95 /**
96 * Helper for removing unused shader I/O variables, by demoting them to global
97 * variables (which may then be dead code eliminated).
98 *
99 * Example usage is:
100 *
101 * progress = nir_remove_unused_io_vars(producer,
102 * &producer->outputs,
103 * read, patches_read) ||
104 * progress;
105 *
106 * The "used" should be an array of 4 uint64_ts (probably of VARYING_BIT_*)
107 * representing each .location_frac used. Note that for vector variables,
108 * only the first channel (.location_frac) is examined for deciding if the
109 * variable is used!
110 */
111 bool
112 nir_remove_unused_io_vars(nir_shader *shader, struct exec_list *var_list,
113 uint64_t *used_by_other_stage,
114 uint64_t *used_by_other_stage_patches)
115 {
116 bool progress = false;
117 uint64_t *used;
118
119 nir_foreach_variable_safe(var, var_list) {
120 if (var->data.patch)
121 used = used_by_other_stage_patches;
122 else
123 used = used_by_other_stage;
124
125 if (var->data.location < VARYING_SLOT_VAR0 && var->data.location >= 0)
126 continue;
127
128 if (var->data.always_active_io)
129 continue;
130
131 uint64_t other_stage = used[var->data.location_frac];
132
133 if (!(other_stage & get_variable_io_mask(var, shader->info.stage))) {
134 /* This one is invalid, make it a global variable instead */
135 var->data.location = 0;
136 var->data.mode = nir_var_global;
137
138 exec_node_remove(&var->node);
139 exec_list_push_tail(&shader->globals, &var->node);
140
141 progress = true;
142 }
143 }
144
145 if (progress)
146 nir_fixup_deref_modes(shader);
147
148 return progress;
149 }
150
151 bool
152 nir_remove_unused_varyings(nir_shader *producer, nir_shader *consumer)
153 {
154 assert(producer->info.stage != MESA_SHADER_FRAGMENT);
155 assert(consumer->info.stage != MESA_SHADER_VERTEX);
156
157 uint64_t read[4] = { 0 }, written[4] = { 0 };
158 uint64_t patches_read[4] = { 0 }, patches_written[4] = { 0 };
159
160 nir_foreach_variable(var, &producer->outputs) {
161 if (var->data.patch) {
162 patches_written[var->data.location_frac] |=
163 get_variable_io_mask(var, producer->info.stage);
164 } else {
165 written[var->data.location_frac] |=
166 get_variable_io_mask(var, producer->info.stage);
167 }
168 }
169
170 nir_foreach_variable(var, &consumer->inputs) {
171 if (var->data.patch) {
172 patches_read[var->data.location_frac] |=
173 get_variable_io_mask(var, consumer->info.stage);
174 } else {
175 read[var->data.location_frac] |=
176 get_variable_io_mask(var, consumer->info.stage);
177 }
178 }
179
180 /* Each TCS invocation can read data written by other TCS invocations,
181 * so even if the outputs are not used by the TES we must also make
182 * sure they are not read by the TCS before demoting them to globals.
183 */
184 if (producer->info.stage == MESA_SHADER_TESS_CTRL)
185 tcs_add_output_reads(producer, read, patches_read);
186
187 bool progress = false;
188 progress = nir_remove_unused_io_vars(producer, &producer->outputs, read,
189 patches_read);
190
191 progress = nir_remove_unused_io_vars(consumer, &consumer->inputs, written,
192 patches_written) || progress;
193
194 return progress;
195 }
196
197 static uint8_t
198 get_interp_type(nir_variable *var, bool default_to_smooth_interp)
199 {
200 if (var->data.interpolation != INTERP_MODE_NONE)
201 return var->data.interpolation;
202 else if (default_to_smooth_interp)
203 return INTERP_MODE_SMOOTH;
204 else
205 return INTERP_MODE_NONE;
206 }
207
208 #define INTERPOLATE_LOC_SAMPLE 0
209 #define INTERPOLATE_LOC_CENTROID 1
210 #define INTERPOLATE_LOC_CENTER 2
211
212 static uint8_t
213 get_interp_loc(nir_variable *var)
214 {
215 if (var->data.sample)
216 return INTERPOLATE_LOC_SAMPLE;
217 else if (var->data.centroid)
218 return INTERPOLATE_LOC_CENTROID;
219 else
220 return INTERPOLATE_LOC_CENTER;
221 }
222
/* Record per-slot packing info for the generic varyings
 * (VARYING_SLOT_VAR0..VAR0+31) in var_list: the components each slot
 * occupies (comps) and the slot's interpolation type/location
 * (interp_type / interp_loc).  All three arrays are indexed by
 * (location - VARYING_SLOT_VAR0).  Builtins are skipped.
 */
static void
get_slot_component_masks_and_interp_types(struct exec_list *var_list,
                                          uint8_t *comps,
                                          uint8_t *interp_type,
                                          uint8_t *interp_loc,
                                          gl_shader_stage stage,
                                          bool default_to_smooth_interp)
{
   nir_foreach_variable_safe(var, var_list) {
      assert(var->data.location >= 0);

      /* Only remap things that aren't built-ins.
       * TODO: add TES patch support.
       */
      if (var->data.location >= VARYING_SLOT_VAR0 &&
          var->data.location - VARYING_SLOT_VAR0 < 32) {

         /* Per-vertex I/O: strip the outer per-vertex array before
          * counting slots/elements.
          */
         const struct glsl_type *type = var->type;
         if (nir_is_per_vertex_io(var, stage)) {
            assert(glsl_type_is_array(type));
            type = glsl_get_array_element(type);
         }

         unsigned location = var->data.location - VARYING_SLOT_VAR0;
         unsigned elements =
            glsl_get_vector_elements(glsl_without_array(type));

         /* Dual-slot (64-bit) types spill into the next slot; comps_slot2
          * carries the spill-over component count from each even loop
          * iteration into the following odd one, so the two cases below
          * are order-dependent.
          */
         bool dual_slot = glsl_type_is_dual_slot(glsl_without_array(type));
         unsigned slots = glsl_count_attribute_slots(type, false);
         unsigned comps_slot2 = 0;
         for (unsigned i = 0; i < slots; i++) {
            interp_type[location + i] =
               get_interp_type(var, default_to_smooth_interp);
            interp_loc[location + i] = get_interp_loc(var);

            if (dual_slot) {
               if (i & 1) {
                  /* Odd slot: holds the remainder of the previous slot's
                   * dual-slot value, starting at component 0.
                   */
                  comps[location + i] |= ((1 << comps_slot2) - 1);
               } else {
                  /* Even slot: the value starts at location_frac and fills
                   * the rest of the vec4; whatever doesn't fit spills over.
                   */
                  unsigned num_comps = 4 - var->data.location_frac;
                  comps_slot2 = (elements * 2) - num_comps;

                  /* Assume ARB_enhanced_layouts packing rules for doubles */
                  assert(var->data.location_frac == 0 ||
                         var->data.location_frac == 2);
                  assert(comps_slot2 <= 4);

                  comps[location + i] |=
                     ((1 << num_comps) - 1) << var->data.location_frac;
               }
            } else {
               comps[location + i] |=
                  ((1 << elements) - 1) << var->data.location_frac;
            }
         }
      }
   }
}
281
/* New location/component assigned to a varying by the compaction pass.
 * location == 0 means "not remapped"; remap targets are always at or
 * above VARYING_SLOT_VAR0, so 0 is never a valid destination.
 */
struct varying_loc
{
   uint8_t component;
   uint32_t location;
};
287
/* Apply the remap table produced by compact_components() to every variable
 * in var_list and rebuild the shader-info slot bitmasks (*slots_used and,
 * for outputs read back by the same stage, *out_slots_read) so they match
 * the new locations.
 */
static void
remap_slots_and_components(struct exec_list *var_list, gl_shader_stage stage,
                           struct varying_loc (*remap)[4],
                           uint64_t *slots_used, uint64_t *out_slots_read)
 {
   uint64_t out_slots_read_tmp = 0;

   /* We don't touch builtins so just copy the bitmask */
   /* NOTE(review): ((1 << (VARYING_SLOT_VAR0 - 1)) - 1) keeps bits
    * 0..VARYING_SLOT_VAR0-2 only, dropping bit VARYING_SLOT_VAR0-1 which
    * is also a builtin slot — looks like a possible off-by-one; confirm
    * against the VARYING_SLOT_* layout.
    */
   uint64_t slots_used_tmp =
      *slots_used & (((uint64_t)1 << (VARYING_SLOT_VAR0 - 1)) - 1);

   nir_foreach_variable(var, var_list) {
      assert(var->data.location >= 0);

      /* Only remap things that aren't built-ins */
      if (var->data.location >= VARYING_SLOT_VAR0 &&
          var->data.location - VARYING_SLOT_VAR0 < 32) {
         assert(var->data.location - VARYING_SLOT_VAR0 < 32);

         /* Per-vertex I/O: strip the outer per-vertex array before
          * counting slots.
          */
         const struct glsl_type *type = var->type;
         if (nir_is_per_vertex_io(var, stage)) {
            assert(glsl_type_is_array(type));
            type = glsl_get_array_element(type);
         }

         unsigned num_slots = glsl_count_attribute_slots(type, false);
         bool used_across_stages = false;
         bool outputs_read = false;

         unsigned location = var->data.location - VARYING_SLOT_VAR0;
         struct varying_loc *new_loc = &remap[location][var->data.location_frac];

         /* Sample the old masks at the OLD location, before the remap
          * below rewrites var->data.location.
          */
         uint64_t slots = (((uint64_t)1 << num_slots) - 1) << var->data.location;
         if (slots & *slots_used)
            used_across_stages = true;

         if (slots & *out_slots_read)
            outputs_read = true;

         /* location == 0 in the remap table means "no remap recorded". */
         if (new_loc->location) {
            var->data.location = new_loc->location;
            var->data.location_frac = new_loc->component;
         }

         if (var->data.always_active_io) {
            /* We can't apply link time optimisations (specifically array
             * splitting) to these so we need to copy the existing mask
             * otherwise we will mess up the mask for things like partially
             * marked arrays.
             */
            if (used_across_stages) {
               slots_used_tmp |=
                  *slots_used & (((uint64_t)1 << num_slots) - 1) << var->data.location;
            }

            if (outputs_read) {
               out_slots_read_tmp |=
                  *out_slots_read & (((uint64_t)1 << num_slots) - 1) << var->data.location;
            }

         } else {
            /* Mark each slot at its (possibly remapped) new location. */
            for (unsigned i = 0; i < num_slots; i++) {
               if (used_across_stages)
                  slots_used_tmp |= (uint64_t)1 << (var->data.location + i);

               if (outputs_read)
                  out_slots_read_tmp |= (uint64_t)1 << (var->data.location + i);
            }
         }
      }
   }

   *slots_used = slots_used_tmp;
   *out_slots_read = out_slots_read_tmp;
}
363
364 /* If there are empty components in the slot compact the remaining components
365 * as close to component 0 as possible. This will make it easier to fill the
366 * empty components with components from a different slot in a following pass.
367 */
/* For each scalar consumer input in the generic varying range, try to move
 * it into an unused component of an earlier (or same) slot with matching
 * interpolation, then apply the resulting remap table to both stages'
 * variables and shader-info bitmasks.
 */
static void
compact_components(nir_shader *producer, nir_shader *consumer, uint8_t *comps,
                   uint8_t *interp_type, uint8_t *interp_loc,
                   bool default_to_smooth_interp)
{
   struct exec_list *input_list = &consumer->inputs;
   struct exec_list *output_list = &producer->outputs;
   /* remap[slot][component]: new location/component; location 0 = unset. */
   struct varying_loc remap[32][4] = {{{0}, {0}}};

   /* Create a cursor for each interpolation type.  Cursors only ever move
    * forward, so each interpolation class is packed left-to-right.
    */
   unsigned cursor[4] = {0};

   /* We only need to pass over one stage and we choose the consumer as it seems
    * to cause a larger reduction in instruction counts (tested on i965).
    */
   nir_foreach_variable(var, input_list) {

      /* Only remap things that aren't builtins.
       * TODO: add TES patch support.
       */
      if (var->data.location >= VARYING_SLOT_VAR0 &&
          var->data.location - VARYING_SLOT_VAR0 < 32) {

         /* We can't repack xfb varyings. */
         if (var->data.always_active_io)
            continue;

         const struct glsl_type *type = var->type;
         if (nir_is_per_vertex_io(var, consumer->info.stage)) {
            assert(glsl_type_is_array(type));
            type = glsl_get_array_element(type);
         }

         /* Skip types that require more complex packing handling.
          * TODO: add support for these types.
          */
         if (glsl_type_is_array(type) ||
             glsl_type_is_dual_slot(type) ||
             glsl_type_is_matrix(type) ||
             glsl_type_is_struct(type) ||
             glsl_type_is_64bit(type))
            continue;

         /* We ignore complex types above and all other vector types should
          * have been split into scalar variables by the lower_io_to_scalar
          * pass. The only exception should be OpenGL xfb varyings.
          */
         if (glsl_get_vector_elements(type) != 1)
            continue;

         unsigned location = var->data.location - VARYING_SLOT_VAR0;
         uint8_t used_comps = comps[location];

         /* If there are no empty components there is nothing more for us to do.
          */
         if (used_comps == 0xf)
            continue;

         bool found_new_offset = false;
         uint8_t interp = get_interp_type(var, default_to_smooth_interp);
         for (; cursor[interp] < 32; cursor[interp]++) {
            uint8_t cursor_used_comps = comps[cursor[interp]];

            /* We couldn't find anywhere to pack the varying continue on.
             * (The cursor reached the varying's own slot, and either it is
             * already at component 0 or the components below it are taken.)
             */
            if (cursor[interp] == location &&
                (var->data.location_frac == 0 ||
                 cursor_used_comps & ((1 << (var->data.location_frac)) - 1)))
               break;

            /* We can only pack varyings with matching interpolation types */
            if (interp_type[cursor[interp]] != interp)
               continue;

            /* Interpolation loc must match also.
             * TODO: i965 can handle these if they don't match, but the
             * radeonsi nir backend handles everything as vec4s and so expects
             * this to be the same for all components. We could make this
             * check driver specific or drop it if NIR ever becomes the only
             * radeonsi backend.
             */
            if (interp_loc[cursor[interp]] != get_interp_loc(var))
               continue;

            /* If the slot is empty just skip it for now, compact_var_list()
             * can be called after this function to remove empty slots for us.
             * TODO: finish implementing compact_var_list() requires array and
             * matrix splitting.
             */
            if (!cursor_used_comps)
               continue;

            uint8_t unused_comps = ~cursor_used_comps;

            /* Take the lowest free component in the cursor slot. */
            for (unsigned i = 0; i < 4; i++) {
               uint8_t new_var_comps = 1 << i;
               if (unused_comps & new_var_comps) {
                  remap[location][var->data.location_frac].component = i;
                  remap[location][var->data.location_frac].location =
                     cursor[interp] + VARYING_SLOT_VAR0;

                  found_new_offset = true;

                  /* Turn off the mask for the component we are remapping */
                  if (comps[location] & 1 << var->data.location_frac) {
                     comps[location] ^= 1 << var->data.location_frac;
                     comps[cursor[interp]] |= new_var_comps;
                  }
                  break;
               }
            }

            if (found_new_offset)
               break;
         }
      }
   }

   /* Inputs have no outputs_read equivalent, so pass a dummy mask. */
   uint64_t zero = 0;
   remap_slots_and_components(input_list, consumer->info.stage, remap,
                              &consumer->info.inputs_read, &zero);
   remap_slots_and_components(output_list, producer->info.stage, remap,
                              &producer->info.outputs_written,
                              &producer->info.outputs_read);
}
492
493 /* We assume that this has been called more-or-less directly after
494 * remove_unused_varyings. At this point, all of the varyings that we
495 * aren't going to be using have been completely removed and the
496 * inputs_read and outputs_written fields in nir_shader_info reflect
497 * this. Therefore, the total set of valid slots is the OR of the two
498 * sets of varyings; this accounts for varyings which one side may need
499 * to read/write even if the other doesn't. This can happen if, for
500 * instance, an array is used indirectly from one side causing it to be
501 * unsplittable but directly from the other.
502 */
503 void
504 nir_compact_varyings(nir_shader *producer, nir_shader *consumer,
505 bool default_to_smooth_interp)
506 {
507 assert(producer->info.stage != MESA_SHADER_FRAGMENT);
508 assert(consumer->info.stage != MESA_SHADER_VERTEX);
509
510 uint8_t comps[32] = {0};
511 uint8_t interp_type[32] = {0};
512 uint8_t interp_loc[32] = {0};
513
514 get_slot_component_masks_and_interp_types(&producer->outputs, comps,
515 interp_type, interp_loc,
516 producer->info.stage,
517 default_to_smooth_interp);
518 get_slot_component_masks_and_interp_types(&consumer->inputs, comps,
519 interp_type, interp_loc,
520 consumer->info.stage,
521 default_to_smooth_interp);
522
523 compact_components(producer, consumer, comps, interp_type, interp_loc,
524 default_to_smooth_interp);
525 }