nir: rework nir_link_opt_varyings()
mesa.git: src/compiler/nir/nir_linking_helpers.c
/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "nir.h"
#include "nir_builder.h"
#include "util/set.h"
#include "util/hash_table.h"

/* This file contains various little helpers for doing simple linking in
 * NIR. Eventually, we'll probably want a full-blown varying packing
 * implementation in here. Right now, it just deletes unused things.
 */

/**
 * Returns the bits in the inputs_read, outputs_written, or
 * system_values_read bitfield corresponding to this variable.
 */
static uint64_t
get_variable_io_mask(nir_variable *var, gl_shader_stage stage)
{
   if (var->data.location < 0)
      return 0;

   unsigned location = var->data.patch ?
      var->data.location - VARYING_SLOT_PATCH0 : var->data.location;

   assert(var->data.mode == nir_var_shader_in ||
          var->data.mode == nir_var_shader_out ||
          var->data.mode == nir_var_system_value);
   assert(var->data.location >= 0);

   const struct glsl_type *type = var->type;
   if (nir_is_per_vertex_io(var, stage)) {
      assert(glsl_type_is_array(type));
      type = glsl_get_array_element(type);
   }

   unsigned slots = glsl_count_attribute_slots(type, false);
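   /* For example (illustrative): a two-slot float[2] output at
    * VARYING_SLOT_VAR3 gives slots == 2 and location == VARYING_SLOT_VAR3,
    * so the mask below covers the bits for VAR3 and VAR4.
    */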
   return ((1ull << slots) - 1) << location;
}

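/* Accumulates, into "read" and "patches_read", the output slots that the TCS
 * reads back itself.  TCS invocations may read outputs written by other
 * invocations, so such outputs must stay live even if the TES never reads
 * them.
 */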
static void
tcs_add_output_reads(nir_shader *shader, uint64_t *read, uint64_t *patches_read)
{
   nir_foreach_function(function, shader) {
      if (!function->impl)
         continue;

      nir_foreach_block(block, function->impl) {
         nir_foreach_instr(instr, block) {
            if (instr->type != nir_instr_type_intrinsic)
               continue;

            nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
            if (intrin->intrinsic != nir_intrinsic_load_deref)
               continue;

            nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
            if (deref->mode != nir_var_shader_out)
               continue;

            nir_variable *var = nir_deref_instr_get_variable(deref);
            if (var->data.patch) {
               patches_read[var->data.location_frac] |=
                  get_variable_io_mask(var, shader->info.stage);
            } else {
               read[var->data.location_frac] |=
                  get_variable_io_mask(var, shader->info.stage);
            }
         }
      }
   }
}

/**
 * Helper for removing unused shader I/O variables, by demoting them to global
 * variables (which may then be dead code eliminated).
 *
 * Example usage is:
 *
 * progress = nir_remove_unused_io_vars(producer,
 *                                      &producer->outputs,
 *                                      read, patches_read) ||
 *                                      progress;
 *
 * The "used" arguments should each be an array of 4 uint64_t bitmasks
 * (probably of VARYING_BIT_*), one per .location_frac.  Note that for vector
 * variables, only the first channel (.location_frac) is examined for deciding
 * if the variable is used!
 */
bool
nir_remove_unused_io_vars(nir_shader *shader, struct exec_list *var_list,
                          uint64_t *used_by_other_stage,
                          uint64_t *used_by_other_stage_patches)
{
   bool progress = false;
   uint64_t *used;

   nir_foreach_variable_safe(var, var_list) {
      if (var->data.patch)
         used = used_by_other_stage_patches;
      else
         used = used_by_other_stage;

      if (var->data.location < VARYING_SLOT_VAR0 && var->data.location >= 0)
         continue;

      if (var->data.always_active_io)
         continue;

      uint64_t other_stage = used[var->data.location_frac];

      if (!(other_stage & get_variable_io_mask(var, shader->info.stage))) {
         /* This one is invalid, make it a global variable instead */
         var->data.location = 0;
         var->data.mode = nir_var_global;

         exec_node_remove(&var->node);
         exec_list_push_tail(&shader->globals, &var->node);

         progress = true;
      }
   }

   if (progress)
      nir_fixup_deref_modes(shader);

   return progress;
}

bool
nir_remove_unused_varyings(nir_shader *producer, nir_shader *consumer)
{
   assert(producer->info.stage != MESA_SHADER_FRAGMENT);
   assert(consumer->info.stage != MESA_SHADER_VERTEX);

   uint64_t read[4] = { 0 }, written[4] = { 0 };
   uint64_t patches_read[4] = { 0 }, patches_written[4] = { 0 };

   nir_foreach_variable(var, &producer->outputs) {
      if (var->data.patch) {
         patches_written[var->data.location_frac] |=
            get_variable_io_mask(var, producer->info.stage);
      } else {
         written[var->data.location_frac] |=
            get_variable_io_mask(var, producer->info.stage);
      }
   }

   nir_foreach_variable(var, &consumer->inputs) {
      if (var->data.patch) {
         patches_read[var->data.location_frac] |=
            get_variable_io_mask(var, consumer->info.stage);
      } else {
         read[var->data.location_frac] |=
            get_variable_io_mask(var, consumer->info.stage);
      }
   }

   /* Each TCS invocation can read data written by other TCS invocations,
    * so even if the outputs are not used by the TES we must also make
    * sure they are not read by the TCS before demoting them to globals.
    */
   if (producer->info.stage == MESA_SHADER_TESS_CTRL)
      tcs_add_output_reads(producer, read, patches_read);

   bool progress = false;
   progress = nir_remove_unused_io_vars(producer, &producer->outputs, read,
                                        patches_read);

   progress = nir_remove_unused_io_vars(consumer, &consumer->inputs, written,
                                        patches_written) || progress;

   return progress;
}

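/* Returns the effective interpolation mode for a varying: integer types are
 * always flat, an explicit qualifier wins otherwise, and unqualified varyings
 * fall back to smooth or "none" depending on default_to_smooth_interp.
 */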
static uint8_t
get_interp_type(nir_variable *var, const struct glsl_type *type,
                bool default_to_smooth_interp)
{
   if (glsl_type_is_integer(type))
      return INTERP_MODE_FLAT;
   else if (var->data.interpolation != INTERP_MODE_NONE)
      return var->data.interpolation;
   else if (default_to_smooth_interp)
      return INTERP_MODE_SMOOTH;
   else
      return INTERP_MODE_NONE;
}

#define INTERPOLATE_LOC_SAMPLE 0
#define INTERPOLATE_LOC_CENTROID 1
#define INTERPOLATE_LOC_CENTER 2

static uint8_t
get_interp_loc(nir_variable *var)
{
   if (var->data.sample)
      return INTERPOLATE_LOC_SAMPLE;
   else if (var->data.centroid)
      return INTERPOLATE_LOC_CENTROID;
   else
      return INTERPOLATE_LOC_CENTER;
}

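/* Walks a variable list and records, for each generic varying slot
 * (VARYING_SLOT_VAR0..VAR31), which components are in use along with the
 * interpolation type and location required for that slot.
 */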
static void
get_slot_component_masks_and_interp_types(struct exec_list *var_list,
                                          uint8_t *comps,
                                          uint8_t *interp_type,
                                          uint8_t *interp_loc,
                                          gl_shader_stage stage,
                                          bool default_to_smooth_interp)
{
   nir_foreach_variable_safe(var, var_list) {
      assert(var->data.location >= 0);

      /* Only remap things that aren't built-ins.
       * TODO: add TES patch support.
       */
      if (var->data.location >= VARYING_SLOT_VAR0 &&
          var->data.location - VARYING_SLOT_VAR0 < 32) {

         const struct glsl_type *type = var->type;
         if (nir_is_per_vertex_io(var, stage)) {
            assert(glsl_type_is_array(type));
            type = glsl_get_array_element(type);
         }

         unsigned location = var->data.location - VARYING_SLOT_VAR0;
         unsigned elements =
            glsl_get_vector_elements(glsl_without_array(type));

         bool dual_slot = glsl_type_is_dual_slot(glsl_without_array(type));
         unsigned slots = glsl_count_attribute_slots(type, false);
         unsigned comps_slot2 = 0;
         for (unsigned i = 0; i < slots; i++) {
            interp_type[location + i] =
               get_interp_type(var, type, default_to_smooth_interp);
            interp_loc[location + i] = get_interp_loc(var);

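            /* Dual-slot (64-bit) types spill over into a second varying
             * slot; comps_slot2 tracks how many components land in that
             * second slot.  For example, a dvec3 at location_frac 0 fills
             * all 4 components of the first slot and 2 of the second.
             */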
            if (dual_slot) {
               if (i & 1) {
                  comps[location + i] |= ((1 << comps_slot2) - 1);
               } else {
                  unsigned num_comps = 4 - var->data.location_frac;
                  comps_slot2 = (elements * 2) - num_comps;

                  /* Assume ARB_enhanced_layouts packing rules for doubles */
                  assert(var->data.location_frac == 0 ||
                         var->data.location_frac == 2);
                  assert(comps_slot2 <= 4);

                  comps[location + i] |=
                     ((1 << num_comps) - 1) << var->data.location_frac;
               }
            } else {
               comps[location + i] |=
                  ((1 << elements) - 1) << var->data.location_frac;
            }
         }
      }
   }
}

struct varying_loc
{
   uint8_t component;
   uint32_t location;
};

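/* Applies a remap table (as built by compact_components()) to every generic
 * varying in the list and rebuilds the given slot bitmasks (e.g. the
 * shader_info inputs_read/outputs_written/outputs_read fields) to match the
 * new locations.
 */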
static void
remap_slots_and_components(struct exec_list *var_list, gl_shader_stage stage,
                           struct varying_loc (*remap)[4],
                           uint64_t *slots_used, uint64_t *out_slots_read)
{
   uint64_t out_slots_read_tmp = 0;

   /* We don't touch builtins so just copy the bitmask */
   uint64_t slots_used_tmp =
      *slots_used & (((uint64_t)1 << (VARYING_SLOT_VAR0 - 1)) - 1);

   nir_foreach_variable(var, var_list) {
      assert(var->data.location >= 0);

      /* Only remap things that aren't built-ins */
      if (var->data.location >= VARYING_SLOT_VAR0 &&
          var->data.location - VARYING_SLOT_VAR0 < 32) {
         assert(var->data.location - VARYING_SLOT_VAR0 < 32);

         const struct glsl_type *type = var->type;
         if (nir_is_per_vertex_io(var, stage)) {
            assert(glsl_type_is_array(type));
            type = glsl_get_array_element(type);
         }

         unsigned num_slots = glsl_count_attribute_slots(type, false);
         bool used_across_stages = false;
         bool outputs_read = false;

         unsigned location = var->data.location - VARYING_SLOT_VAR0;
         struct varying_loc *new_loc = &remap[location][var->data.location_frac];

         uint64_t slots = (((uint64_t)1 << num_slots) - 1) << var->data.location;
         if (slots & *slots_used)
            used_across_stages = true;

         if (slots & *out_slots_read)
            outputs_read = true;

         if (new_loc->location) {
            var->data.location = new_loc->location;
            var->data.location_frac = new_loc->component;
         }

         if (var->data.always_active_io) {
            /* We can't apply link time optimisations (specifically array
             * splitting) to these, so we need to copy the existing mask;
             * otherwise we will mess up the mask for things like partially
             * marked arrays.
             */
            if (used_across_stages) {
               slots_used_tmp |=
                  *slots_used & (((uint64_t)1 << num_slots) - 1) << var->data.location;
            }

            if (outputs_read) {
               out_slots_read_tmp |=
                  *out_slots_read & (((uint64_t)1 << num_slots) - 1) << var->data.location;
            }

         } else {
            for (unsigned i = 0; i < num_slots; i++) {
               if (used_across_stages)
                  slots_used_tmp |= (uint64_t)1 << (var->data.location + i);

               if (outputs_read)
                  out_slots_read_tmp |= (uint64_t)1 << (var->data.location + i);
            }
         }
      }
   }

   *slots_used = slots_used_tmp;
   *out_slots_read = out_slots_read_tmp;
}

/* If there are empty components in a slot, compact the remaining components
 * as close to component 0 as possible.  This will make it easier to fill the
 * empty components with components from a different slot in a following pass.
 */
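/* For example (illustrative): if only VAR0.x and VAR1.x are in use and both
 * have matching interpolation settings, the VAR1.x varying can be remapped to
 * VAR0.y, leaving VAR1 completely empty.
 */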
static void
compact_components(nir_shader *producer, nir_shader *consumer, uint8_t *comps,
                   uint8_t *interp_type, uint8_t *interp_loc,
                   bool default_to_smooth_interp)
{
   struct exec_list *input_list = &consumer->inputs;
   struct exec_list *output_list = &producer->outputs;
   struct varying_loc remap[32][4] = {{{0}, {0}}};

   /* Create a cursor for each interpolation type */
   unsigned cursor[4] = {0};

   /* We only need to pass over one stage and we choose the consumer as it
    * seems to cause a larger reduction in instruction counts (tested on i965).
    */
   nir_foreach_variable(var, input_list) {

      /* Only remap things that aren't builtins.
       * TODO: add TES patch support.
       */
      if (var->data.location >= VARYING_SLOT_VAR0 &&
          var->data.location - VARYING_SLOT_VAR0 < 32) {

         /* We can't repack xfb varyings. */
         if (var->data.always_active_io)
            continue;

         const struct glsl_type *type = var->type;
         if (nir_is_per_vertex_io(var, consumer->info.stage)) {
            assert(glsl_type_is_array(type));
            type = glsl_get_array_element(type);
         }

         /* Skip types that require more complex packing handling.
          * TODO: add support for these types.
          */
         if (glsl_type_is_array(type) ||
             glsl_type_is_dual_slot(type) ||
             glsl_type_is_matrix(type) ||
             glsl_type_is_struct(type) ||
             glsl_type_is_64bit(type))
            continue;

         /* We ignore complex types above and all other vector types should
          * have been split into scalar variables by the lower_io_to_scalar
          * pass.  The only exception should be OpenGL xfb varyings.
          */
         if (glsl_get_vector_elements(type) != 1)
            continue;

         unsigned location = var->data.location - VARYING_SLOT_VAR0;
         uint8_t used_comps = comps[location];

         /* If there are no empty components there is nothing more for us to
          * do.
          */
         if (used_comps == 0xf)
            continue;

         bool found_new_offset = false;
         uint8_t interp = get_interp_type(var, type, default_to_smooth_interp);
         for (; cursor[interp] < 32; cursor[interp]++) {
            uint8_t cursor_used_comps = comps[cursor[interp]];

            /* We couldn't find anywhere to pack this varying, so leave it
             * where it is.
             */
            if (cursor[interp] == location &&
                (var->data.location_frac == 0 ||
                 cursor_used_comps & ((1 << (var->data.location_frac)) - 1)))
               break;

            /* We can only pack varyings with matching interpolation types */
            if (interp_type[cursor[interp]] != interp)
               continue;

            /* Interpolation loc must match also.
             * TODO: i965 can handle these if they don't match, but the
             * radeonsi nir backend handles everything as vec4s and so expects
             * this to be the same for all components.  We could make this
             * check driver specific or drop it if NIR ever becomes the only
             * radeonsi backend.
             */
            if (interp_loc[cursor[interp]] != get_interp_loc(var))
               continue;

            /* If the slot is empty just skip it for now; compact_var_list()
             * can be called after this function to remove empty slots for us.
             * TODO: finishing compact_var_list() requires array and matrix
             * splitting.
             */
            if (!cursor_used_comps)
               continue;

            uint8_t unused_comps = ~cursor_used_comps;

            for (unsigned i = 0; i < 4; i++) {
               uint8_t new_var_comps = 1 << i;
               if (unused_comps & new_var_comps) {
                  remap[location][var->data.location_frac].component = i;
                  remap[location][var->data.location_frac].location =
                     cursor[interp] + VARYING_SLOT_VAR0;

                  found_new_offset = true;

                  /* Turn off the mask for the component we are remapping */
                  if (comps[location] & 1 << var->data.location_frac) {
                     comps[location] ^= 1 << var->data.location_frac;
                     comps[cursor[interp]] |= new_var_comps;
                  }
                  break;
               }
            }

            if (found_new_offset)
               break;
         }
      }
   }

   uint64_t zero = 0;
   remap_slots_and_components(input_list, consumer->info.stage, remap,
                              &consumer->info.inputs_read, &zero);
   remap_slots_and_components(output_list, producer->info.stage, remap,
                              &producer->info.outputs_written,
                              &producer->info.outputs_read);
}

/* We assume that this has been called more-or-less directly after
 * remove_unused_varyings.  At this point, all of the varyings that we
 * aren't going to be using have been completely removed and the
 * inputs_read and outputs_written fields in nir_shader_info reflect
 * this.  Therefore, the total set of valid slots is the OR of the two
 * sets of varyings; this accounts for varyings which one side may need
 * to read/write even if the other doesn't.  This can happen if, for
 * instance, an array is used indirectly from one side causing it to be
 * unsplittable but directly from the other.
 */
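/* A typical linker sequence (illustrative) would be:
 *
 *    nir_remove_unused_varyings(producer, consumer);
 *    nir_compact_varyings(producer, consumer, default_to_smooth_interp);
 */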
void
nir_compact_varyings(nir_shader *producer, nir_shader *consumer,
                     bool default_to_smooth_interp)
{
   assert(producer->info.stage != MESA_SHADER_FRAGMENT);
   assert(consumer->info.stage != MESA_SHADER_VERTEX);

   uint8_t comps[32] = {0};
   uint8_t interp_type[32] = {0};
   uint8_t interp_loc[32] = {0};

   get_slot_component_masks_and_interp_types(&producer->outputs, comps,
                                             interp_type, interp_loc,
                                             producer->info.stage,
                                             default_to_smooth_interp);
   get_slot_component_masks_and_interp_types(&consumer->inputs, comps,
                                             interp_type, interp_loc,
                                             consumer->info.stage,
                                             default_to_smooth_interp);

   compact_components(producer, consumer, comps, interp_type, interp_loc,
                      default_to_smooth_interp);
}

/*
 * Mark XFB varyings as always_active_io in the consumer so the linking opts
 * don't touch them.
 */
void
nir_link_xfb_varyings(nir_shader *producer, nir_shader *consumer)
{
   nir_variable *input_vars[MAX_VARYING] = { 0 };

   nir_foreach_variable(var, &consumer->inputs) {
      if (var->data.location >= VARYING_SLOT_VAR0 &&
          var->data.location - VARYING_SLOT_VAR0 < MAX_VARYING) {

         unsigned location = var->data.location - VARYING_SLOT_VAR0;
         input_vars[location] = var;
      }
   }

   nir_foreach_variable(var, &producer->outputs) {
      if (var->data.location >= VARYING_SLOT_VAR0 &&
          var->data.location - VARYING_SLOT_VAR0 < MAX_VARYING) {

         if (!var->data.always_active_io)
            continue;

         unsigned location = var->data.location - VARYING_SLOT_VAR0;
         if (input_vars[location]) {
            input_vars[location]->data.always_active_io = true;
         }
      }
   }
}

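/* Returns true if the output variable is simple enough for the link-time
 * replacement below: a scalar, generic (VARYING_SLOT_VAR*) varying.  More
 * complex types are skipped for now (see the TODO).
 */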
static bool
can_replace_varying(nir_variable *out_var)
{
   /* Skip types that require more complex handling.
    * TODO: add support for these types.
    */
   if (glsl_type_is_array(out_var->type) ||
       glsl_type_is_dual_slot(out_var->type) ||
       glsl_type_is_matrix(out_var->type) ||
       glsl_type_is_struct(out_var->type))
      return false;

   /* Limit this pass to scalars for now to keep things simple. Most varyings
    * should have been lowered to scalars at this point anyway.
    */
   if (!glsl_type_is_scalar(out_var->type))
      return false;

   if (out_var->data.location < VARYING_SLOT_VAR0 ||
       out_var->data.location - VARYING_SLOT_VAR0 >= MAX_VARYING)
      return false;

   return true;
}

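/* Given a store of a constant to an output in the producer, rewrite every
 * load of the matching input (same location and location_frac) in the
 * consumer to use that constant value directly.
 */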
static bool
replace_constant_input(nir_shader *shader, nir_intrinsic_instr *store_intr)
{
   nir_function_impl *impl = nir_shader_get_entrypoint(shader);

   nir_builder b;
   nir_builder_init(&b, impl);

   nir_variable *out_var =
      nir_deref_instr_get_variable(nir_src_as_deref(store_intr->src[0]));

   bool progress = false;
   nir_foreach_block(block, impl) {
      nir_foreach_instr(instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
         if (intr->intrinsic != nir_intrinsic_load_deref)
            continue;

         nir_deref_instr *in_deref = nir_src_as_deref(intr->src[0]);
         if (in_deref->mode != nir_var_shader_in)
            continue;

         nir_variable *in_var = nir_deref_instr_get_variable(in_deref);

         if (in_var->data.location != out_var->data.location ||
             in_var->data.location_frac != out_var->data.location_frac)
            continue;

         b.cursor = nir_before_instr(instr);

         nir_load_const_instr *out_const =
            nir_instr_as_load_const(store_intr->src[1].ssa->parent_instr);

         /* Add new const to replace the input */
         nir_ssa_def *nconst = nir_build_imm(&b, store_intr->num_components,
                                             intr->dest.ssa.bit_size,
                                             out_const->value);

         nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(nconst));

         progress = true;
      }
   }

   return progress;
}

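/* Performs simple link-time optimisation of varyings: constant stores to
 * scalar outputs in the last block of the producer are forwarded to the
 * matching input loads in the consumer, which can let later passes remove
 * the varying entirely.
 */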
bool
nir_link_opt_varyings(nir_shader *producer, nir_shader *consumer)
{
   /* TODO: Add support for more shader stage combinations */
   if (consumer->info.stage != MESA_SHADER_FRAGMENT ||
       (producer->info.stage != MESA_SHADER_VERTEX &&
        producer->info.stage != MESA_SHADER_TESS_EVAL))
      return false;

   bool progress = false;

   nir_function_impl *impl = nir_shader_get_entrypoint(producer);

   /* If we find a store in the last block of the producer we can be sure this
    * is the only possible value for this output.
    */
   nir_block *last_block = nir_impl_last_block(impl);
   nir_foreach_instr_reverse(instr, last_block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

      if (intr->intrinsic != nir_intrinsic_store_deref)
         continue;

      nir_deref_instr *out_deref = nir_src_as_deref(intr->src[0]);
      if (out_deref->mode != nir_var_shader_out)
         continue;

      nir_variable *out_var = nir_deref_instr_get_variable(out_deref);
      if (!can_replace_varying(out_var))
         continue;

      if (intr->src[1].ssa->parent_instr->type == nir_instr_type_load_const) {
         progress |= replace_constant_input(consumer, intr);
      }
   }

   return progress;
}