nir: don't pack varyings ints with floats unless flat
src/compiler/nir/nir_linking_helpers.c
1 /*
2 * Copyright © 2015 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "nir.h"
25 #include "nir_builder.h"
26 #include "util/set.h"
27 #include "util/hash_table.h"
28
29 /* This file contains various little helpers for doing simple linking in
30 * NIR. Eventually, we'll probably want a full-blown varying packing
31 * implementation in here. Right now, it just deletes unused things.
32 */
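
/* The entry points currently implemented below are:
 *
 *  - nir_remove_unused_io_vars() / nir_remove_unused_varyings(): demote I/O
 *    variables that the other stage never reads or writes to globals so they
 *    can be dead-code eliminated.
 *  - nir_compact_varyings(): pack scalar varyings into unused components of
 *    other slots with a matching interpolation mode.
 *  - nir_link_xfb_varyings(): pin transform-feedback varyings so the passes
 *    above leave them alone.
 *  - nir_link_constant_varyings(): replace fragment shader inputs with
 *    constants when the producer writes a compile-time constant.
 */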
33
34 /**
35 * Returns the bits in the inputs_read, outputs_written, or
36 * system_values_read bitfield corresponding to this variable.
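 *
 * For example, a variable occupying two slots (slots == 2) at location L
 * yields ((1ull << 2) - 1) << L, i.e. bits L and L + 1 set.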
37 */
38 static uint64_t
39 get_variable_io_mask(nir_variable *var, gl_shader_stage stage)
40 {
41 if (var->data.location < 0)
42 return 0;
43
44 unsigned location = var->data.patch ?
45 var->data.location - VARYING_SLOT_PATCH0 : var->data.location;
46
47 assert(var->data.mode == nir_var_shader_in ||
48 var->data.mode == nir_var_shader_out ||
49 var->data.mode == nir_var_system_value);
50 assert(var->data.location >= 0);
51
52 const struct glsl_type *type = var->type;
53 if (nir_is_per_vertex_io(var, stage)) {
54 assert(glsl_type_is_array(type));
55 type = glsl_get_array_element(type);
56 }
57
58 unsigned slots = glsl_count_attribute_slots(type, false);
59 return ((1ull << slots) - 1) << location;
60 }
61
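/* Accumulate, per location_frac, the slot bits for every output that the TCS
 * itself loads. TCS outputs can be read back by other invocations in the same
 * patch, so such outputs must be treated as "read" even when the TES never
 * consumes them.
 */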
62 static void
63 tcs_add_output_reads(nir_shader *shader, uint64_t *read, uint64_t *patches_read)
64 {
65 nir_foreach_function(function, shader) {
66 if (!function->impl)
67 continue;
68
69 nir_foreach_block(block, function->impl) {
70 nir_foreach_instr(instr, block) {
71 if (instr->type != nir_instr_type_intrinsic)
72 continue;
73
74 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
75 if (intrin->intrinsic != nir_intrinsic_load_deref)
76 continue;
77
78 nir_variable *var =
79 nir_deref_instr_get_variable(nir_src_as_deref(intrin->src[0]));
80
81 if (var->data.mode != nir_var_shader_out)
82 continue;
83
84 if (var->data.patch) {
85 patches_read[var->data.location_frac] |=
86 get_variable_io_mask(var, shader->info.stage);
87 } else {
88 read[var->data.location_frac] |=
89 get_variable_io_mask(var, shader->info.stage);
90 }
91 }
92 }
93 }
94 }
95
96 /**
97 * Helper for removing unused shader I/O variables, by demoting them to global
 98  * variables (which may then be dead code eliminated).
99 *
100 * Example usage is:
101 *
102 * progress = nir_remove_unused_io_vars(producer,
103 * &producer->outputs,
104 * read, patches_read) ||
105 * progress;
106 *
107 * The "used" should be an array of 4 uint64_ts (probably of VARYING_BIT_*)
108 * representing each .location_frac used. Note that for vector variables,
109 * only the first channel (.location_frac) is examined for deciding if the
110 * variable is used!
111 */
112 bool
113 nir_remove_unused_io_vars(nir_shader *shader, struct exec_list *var_list,
114 uint64_t *used_by_other_stage,
115 uint64_t *used_by_other_stage_patches)
116 {
117 bool progress = false;
118 uint64_t *used;
119
120 nir_foreach_variable_safe(var, var_list) {
121 if (var->data.patch)
122 used = used_by_other_stage_patches;
123 else
124 used = used_by_other_stage;
125
126 if (var->data.location < VARYING_SLOT_VAR0 && var->data.location >= 0)
127 continue;
128
129 if (var->data.always_active_io)
130 continue;
131
132 uint64_t other_stage = used[var->data.location_frac];
133
134 if (!(other_stage & get_variable_io_mask(var, shader->info.stage))) {
 135           /* This one is not used by the other stage; demote it to a global variable instead. */
136 var->data.location = 0;
137 var->data.mode = nir_var_global;
138
139 exec_node_remove(&var->node);
140 exec_list_push_tail(&shader->globals, &var->node);
141
142 progress = true;
143 }
144 }
145
146 if (progress)
147 nir_fixup_deref_modes(shader);
148
149 return progress;
150 }
151
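/* A typical link-time sequence (a sketch, not something this helper runs for
 * you) is to call this for each adjacent stage pair and, whenever it reports
 * progress, re-run the per-shader cleanup/optimization passes so the demoted
 * globals are actually dead-code eliminated:
 *
 *    if (nir_remove_unused_varyings(producer, consumer)) {
 *       my_driver_optimize(producer);   // hypothetical driver helper
 *       my_driver_optimize(consumer);
 *    }
 */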
152 bool
153 nir_remove_unused_varyings(nir_shader *producer, nir_shader *consumer)
154 {
155 assert(producer->info.stage != MESA_SHADER_FRAGMENT);
156 assert(consumer->info.stage != MESA_SHADER_VERTEX);
157
158 uint64_t read[4] = { 0 }, written[4] = { 0 };
159 uint64_t patches_read[4] = { 0 }, patches_written[4] = { 0 };
160
161 nir_foreach_variable(var, &producer->outputs) {
162 if (var->data.patch) {
163 patches_written[var->data.location_frac] |=
164 get_variable_io_mask(var, producer->info.stage);
165 } else {
166 written[var->data.location_frac] |=
167 get_variable_io_mask(var, producer->info.stage);
168 }
169 }
170
171 nir_foreach_variable(var, &consumer->inputs) {
172 if (var->data.patch) {
173 patches_read[var->data.location_frac] |=
174 get_variable_io_mask(var, consumer->info.stage);
175 } else {
176 read[var->data.location_frac] |=
177 get_variable_io_mask(var, consumer->info.stage);
178 }
179 }
180
181 /* Each TCS invocation can read data written by other TCS invocations,
182 * so even if the outputs are not used by the TES we must also make
183 * sure they are not read by the TCS before demoting them to globals.
184 */
185 if (producer->info.stage == MESA_SHADER_TESS_CTRL)
186 tcs_add_output_reads(producer, read, patches_read);
187
188 bool progress = false;
189 progress = nir_remove_unused_io_vars(producer, &producer->outputs, read,
190 patches_read);
191
192 progress = nir_remove_unused_io_vars(consumer, &consumer->inputs, written,
193 patches_written) || progress;
194
195 return progress;
196 }
197
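/* Integer varyings cannot be interpolated, so GLSL requires them to be flat
 * shaded. Classifying every integer type as INTERP_MODE_FLAT here keeps the
 * packing pass from mixing integers into slots that use smooth/noperspective
 * interpolation (see the interp_type match in compact_components()).
 */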
198 static uint8_t
199 get_interp_type(nir_variable *var, const struct glsl_type *type,
200 bool default_to_smooth_interp)
201 {
202 if (glsl_type_is_integer(type))
203 return INTERP_MODE_FLAT;
204 else if (var->data.interpolation != INTERP_MODE_NONE)
205 return var->data.interpolation;
206 else if (default_to_smooth_interp)
207 return INTERP_MODE_SMOOTH;
208 else
209 return INTERP_MODE_NONE;
210 }
211
212 #define INTERPOLATE_LOC_SAMPLE 0
213 #define INTERPOLATE_LOC_CENTROID 1
214 #define INTERPOLATE_LOC_CENTER 2
215
216 static uint8_t
217 get_interp_loc(nir_variable *var)
218 {
219 if (var->data.sample)
220 return INTERPOLATE_LOC_SAMPLE;
221 else if (var->data.centroid)
222 return INTERPOLATE_LOC_CENTROID;
223 else
224 return INTERPOLATE_LOC_CENTER;
225 }
226
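/* For every generic slot (relative to VARYING_SLOT_VAR0) used by a variable
 * in var_list, record which components are occupied and the interpolation
 * type/location assigned to that slot.
 */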
227 static void
228 get_slot_component_masks_and_interp_types(struct exec_list *var_list,
229 uint8_t *comps,
230 uint8_t *interp_type,
231 uint8_t *interp_loc,
232 gl_shader_stage stage,
233 bool default_to_smooth_interp)
234 {
235 nir_foreach_variable_safe(var, var_list) {
236 assert(var->data.location >= 0);
237
238 /* Only remap things that aren't built-ins.
239 * TODO: add TES patch support.
240 */
241 if (var->data.location >= VARYING_SLOT_VAR0 &&
242 var->data.location - VARYING_SLOT_VAR0 < 32) {
243
244 const struct glsl_type *type = var->type;
245 if (nir_is_per_vertex_io(var, stage)) {
246 assert(glsl_type_is_array(type));
247 type = glsl_get_array_element(type);
248 }
249
250 unsigned location = var->data.location - VARYING_SLOT_VAR0;
251 unsigned elements =
252 glsl_get_vector_elements(glsl_without_array(type));
253
254 bool dual_slot = glsl_type_is_dual_slot(glsl_without_array(type));
255 unsigned slots = glsl_count_attribute_slots(type, false);
256 unsigned comps_slot2 = 0;
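         /* For dual-slot (64-bit) types the component mask is spread over two
          * consecutive slots. E.g. a dvec3 with location_frac == 0: slot 0
          * holds 4 components (the first two doubles), so
          * comps_slot2 = 3 * 2 - 4 = 2 and slot 1 holds components x/y (the
          * third double).
          */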
257 for (unsigned i = 0; i < slots; i++) {
258 interp_type[location + i] =
259 get_interp_type(var, type, default_to_smooth_interp);
260 interp_loc[location + i] = get_interp_loc(var);
261
262 if (dual_slot) {
263 if (i & 1) {
264 comps[location + i] |= ((1 << comps_slot2) - 1);
265 } else {
266 unsigned num_comps = 4 - var->data.location_frac;
267 comps_slot2 = (elements * 2) - num_comps;
268
269 /* Assume ARB_enhanced_layouts packing rules for doubles */
270 assert(var->data.location_frac == 0 ||
271 var->data.location_frac == 2);
272 assert(comps_slot2 <= 4);
273
274 comps[location + i] |=
275 ((1 << num_comps) - 1) << var->data.location_frac;
276 }
277 } else {
278 comps[location + i] |=
279 ((1 << elements) - 1) << var->data.location_frac;
280 }
281 }
282 }
283 }
284 }
285
286 struct varying_loc
287 {
288 uint8_t component;
289 uint32_t location;
290 };
291
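/* Apply the remapping chosen by compact_components() to every variable in
 * var_list and rebuild the caller's slots_used / out_slots_read masks so they
 * only contain slots that are still live after the remap.
 */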
292 static void
293 remap_slots_and_components(struct exec_list *var_list, gl_shader_stage stage,
294 struct varying_loc (*remap)[4],
295 uint64_t *slots_used, uint64_t *out_slots_read)
296 {
297 uint64_t out_slots_read_tmp = 0;
298
299 /* We don't touch builtins so just copy the bitmask */
300 uint64_t slots_used_tmp =
301 *slots_used & (((uint64_t)1 << (VARYING_SLOT_VAR0 - 1)) - 1);
302
303 nir_foreach_variable(var, var_list) {
304 assert(var->data.location >= 0);
305
306 /* Only remap things that aren't built-ins */
307 if (var->data.location >= VARYING_SLOT_VAR0 &&
308 var->data.location - VARYING_SLOT_VAR0 < 32) {
309 assert(var->data.location - VARYING_SLOT_VAR0 < 32);
310
311 const struct glsl_type *type = var->type;
312 if (nir_is_per_vertex_io(var, stage)) {
313 assert(glsl_type_is_array(type));
314 type = glsl_get_array_element(type);
315 }
316
317 unsigned num_slots = glsl_count_attribute_slots(type, false);
318 bool used_across_stages = false;
319 bool outputs_read = false;
320
321 unsigned location = var->data.location - VARYING_SLOT_VAR0;
322 struct varying_loc *new_loc = &remap[location][var->data.location_frac];
323
324 uint64_t slots = (((uint64_t)1 << num_slots) - 1) << var->data.location;
325 if (slots & *slots_used)
326 used_across_stages = true;
327
328 if (slots & *out_slots_read)
329 outputs_read = true;
330
331 if (new_loc->location) {
332 var->data.location = new_loc->location;
333 var->data.location_frac = new_loc->component;
334 }
335
336 if (var->data.always_active_io) {
 337          /* We can't apply link-time optimisations (specifically array
 338           * splitting) to these, so we need to copy the existing mask;
 339           * otherwise we would mess up the mask for things like partially
 340           * marked arrays.
341 */
342 if (used_across_stages) {
343 slots_used_tmp |=
344 *slots_used & (((uint64_t)1 << num_slots) - 1) << var->data.location;
345 }
346
347 if (outputs_read) {
348 out_slots_read_tmp |=
349 *out_slots_read & (((uint64_t)1 << num_slots) - 1) << var->data.location;
350 }
351
352 } else {
353 for (unsigned i = 0; i < num_slots; i++) {
354 if (used_across_stages)
355 slots_used_tmp |= (uint64_t)1 << (var->data.location + i);
356
357 if (outputs_read)
358 out_slots_read_tmp |= (uint64_t)1 << (var->data.location + i);
359 }
360 }
361 }
362 }
363
364 *slots_used = slots_used_tmp;
365 *out_slots_read = out_slots_read_tmp;
366 }
367
 368 /* If there are empty components in a slot, compact the remaining components
369 * as close to component 0 as possible. This will make it easier to fill the
370 * empty components with components from a different slot in a following pass.
371 */
372 static void
373 compact_components(nir_shader *producer, nir_shader *consumer, uint8_t *comps,
374 uint8_t *interp_type, uint8_t *interp_loc,
375 bool default_to_smooth_interp)
376 {
377 struct exec_list *input_list = &consumer->inputs;
378 struct exec_list *output_list = &producer->outputs;
379 struct varying_loc remap[32][4] = {{{0}, {0}}};
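   /* remap[slot][component] is the new location/component picked for the
    * scalar currently living at VARYING_SLOT_VAR0 + slot; a location of 0
    * means "leave it where it is".
    */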
380
381 /* Create a cursor for each interpolation type */
382 unsigned cursor[4] = {0};
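   /* The cursors only ever advance, so each interpolation class is packed
    * greedily into the lowest-numbered slots that already hold varyings of
    * that class.
    */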
383
384 /* We only need to pass over one stage and we choose the consumer as it seems
385 * to cause a larger reduction in instruction counts (tested on i965).
386 */
387 nir_foreach_variable(var, input_list) {
388
389 /* Only remap things that aren't builtins.
390 * TODO: add TES patch support.
391 */
392 if (var->data.location >= VARYING_SLOT_VAR0 &&
393 var->data.location - VARYING_SLOT_VAR0 < 32) {
394
395 /* We can't repack xfb varyings. */
396 if (var->data.always_active_io)
397 continue;
398
399 const struct glsl_type *type = var->type;
400 if (nir_is_per_vertex_io(var, consumer->info.stage)) {
401 assert(glsl_type_is_array(type));
402 type = glsl_get_array_element(type);
403 }
404
405 /* Skip types that require more complex packing handling.
406 * TODO: add support for these types.
407 */
408 if (glsl_type_is_array(type) ||
409 glsl_type_is_dual_slot(type) ||
410 glsl_type_is_matrix(type) ||
411 glsl_type_is_struct(type) ||
412 glsl_type_is_64bit(type))
413 continue;
414
 415          /* We ignore complex types above, and all other vector types should
 416           * have been split into scalar variables by the lower_io_to_scalar
 417           * pass. The only exception should be OpenGL xfb varyings.
418 */
419 if (glsl_get_vector_elements(type) != 1)
420 continue;
421
422 unsigned location = var->data.location - VARYING_SLOT_VAR0;
423 uint8_t used_comps = comps[location];
424
 425          /* If there are no empty components, there is nothing more for us to do.
426 */
427 if (used_comps == 0xf)
428 continue;
429
430 bool found_new_offset = false;
431 uint8_t interp = get_interp_type(var, type, default_to_smooth_interp);
432 for (; cursor[interp] < 32; cursor[interp]++) {
433 uint8_t cursor_used_comps = comps[cursor[interp]];
434
 435             /* We couldn't find anywhere to pack the varying; continue on. */
436 if (cursor[interp] == location &&
437 (var->data.location_frac == 0 ||
438 cursor_used_comps & ((1 << (var->data.location_frac)) - 1)))
439 break;
440
441 /* We can only pack varyings with matching interpolation types */
442 if (interp_type[cursor[interp]] != interp)
443 continue;
444
445 /* Interpolation loc must match also.
446 * TODO: i965 can handle these if they don't match, but the
447 * radeonsi nir backend handles everything as vec4s and so expects
448 * this to be the same for all components. We could make this
 449              * check driver specific or drop it if NIR ever becomes the only
450 * radeonsi backend.
451 */
452 if (interp_loc[cursor[interp]] != get_interp_loc(var))
453 continue;
454
 455             /* If the slot is empty, just skip it for now; compact_var_list()
 456              * can be called after this function to remove empty slots for us.
 457              * TODO: finish implementing compact_var_list(); it requires array
 458              * and matrix splitting.
459 */
460 if (!cursor_used_comps)
461 continue;
462
463 uint8_t unused_comps = ~cursor_used_comps;
464
465 for (unsigned i = 0; i < 4; i++) {
466 uint8_t new_var_comps = 1 << i;
467 if (unused_comps & new_var_comps) {
468 remap[location][var->data.location_frac].component = i;
469 remap[location][var->data.location_frac].location =
470 cursor[interp] + VARYING_SLOT_VAR0;
471
472 found_new_offset = true;
473
474 /* Turn off the mask for the component we are remapping */
475 if (comps[location] & 1 << var->data.location_frac) {
476 comps[location] ^= 1 << var->data.location_frac;
477 comps[cursor[interp]] |= new_var_comps;
478 }
479 break;
480 }
481 }
482
483 if (found_new_offset)
484 break;
485 }
486 }
487 }
488
489 uint64_t zero = 0;
490 remap_slots_and_components(input_list, consumer->info.stage, remap,
491 &consumer->info.inputs_read, &zero);
492 remap_slots_and_components(output_list, producer->info.stage, remap,
493 &producer->info.outputs_written,
494 &producer->info.outputs_read);
495 }
496
497 /* We assume that this has been called more-or-less directly after
498 * remove_unused_varyings. At this point, all of the varyings that we
499 * aren't going to be using have been completely removed and the
500 * inputs_read and outputs_written fields in nir_shader_info reflect
501 * this. Therefore, the total set of valid slots is the OR of the two
502 * sets of varyings; this accounts for varyings which one side may need
503 * to read/write even if the other doesn't. This can happen if, for
504 * instance, an array is used indirectly from one side causing it to be
505 * unsplittable but directly from the other.
506 */
507 void
508 nir_compact_varyings(nir_shader *producer, nir_shader *consumer,
509 bool default_to_smooth_interp)
510 {
511 assert(producer->info.stage != MESA_SHADER_FRAGMENT);
512 assert(consumer->info.stage != MESA_SHADER_VERTEX);
513
514 uint8_t comps[32] = {0};
515 uint8_t interp_type[32] = {0};
516 uint8_t interp_loc[32] = {0};
517
518 get_slot_component_masks_and_interp_types(&producer->outputs, comps,
519 interp_type, interp_loc,
520 producer->info.stage,
521 default_to_smooth_interp);
522 get_slot_component_masks_and_interp_types(&consumer->inputs, comps,
523 interp_type, interp_loc,
524 consumer->info.stage,
525 default_to_smooth_interp);
526
527 compact_components(producer, consumer, comps, interp_type, interp_loc,
528 default_to_smooth_interp);
529 }
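
/* Note: as the comments above assume, varyings should already have been split
 * to scalars (the lower_io_to_scalar pass) and nir_remove_unused_varyings()
 * should have been run first. A rough link-time ordering (a sketch; drivers
 * differ) is:
 *
 *    scalarize varyings -> nir_remove_unused_varyings() ->
 *    nir_compact_varyings() -> per-shader optimization loop
 */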
530
531 /*
532 * Mark XFB varyings as always_active_io in the consumer so the linking opts
533 * don't touch them.
534 */
535 void
536 nir_link_xfb_varyings(nir_shader *producer, nir_shader *consumer)
537 {
538 nir_variable *input_vars[MAX_VARYING] = { 0 };
539
540 nir_foreach_variable(var, &consumer->inputs) {
541 if (var->data.location >= VARYING_SLOT_VAR0 &&
542 var->data.location - VARYING_SLOT_VAR0 < MAX_VARYING) {
543
544 unsigned location = var->data.location - VARYING_SLOT_VAR0;
545 input_vars[location] = var;
546 }
547 }
548
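   /* Any producer output that is captured for XFB (always_active_io) marks the
    * consumer input at the same generic location, so neither side gets
    * repacked or removed.
    */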
549 nir_foreach_variable(var, &producer->outputs) {
550 if (var->data.location >= VARYING_SLOT_VAR0 &&
551 var->data.location - VARYING_SLOT_VAR0 < MAX_VARYING) {
552
553 if (!var->data.always_active_io)
554 continue;
555
556 unsigned location = var->data.location - VARYING_SLOT_VAR0;
557 if (input_vars[location]) {
558 input_vars[location]->data.always_active_io = true;
559 }
560 }
561 }
562 }
563
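/* Given a store of a constant to a producer output (store_intr), rewrite every
 * load of the matching input in "shader" (the consumer) to use that constant
 * value directly.
 */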
564 static bool
565 try_replace_constant_input(nir_shader *shader,
566 nir_intrinsic_instr *store_intr)
567 {
568 nir_variable *out_var =
569 nir_deref_instr_get_variable(nir_src_as_deref(store_intr->src[0]));
570
571 if (out_var->data.mode != nir_var_shader_out)
572 return false;
573
574 /* Skip types that require more complex handling.
575 * TODO: add support for these types.
576 */
577 if (glsl_type_is_array(out_var->type) ||
578 glsl_type_is_dual_slot(out_var->type) ||
579 glsl_type_is_matrix(out_var->type) ||
580 glsl_type_is_struct(out_var->type))
581 return false;
582
583 /* Limit this pass to scalars for now to keep things simple. Most varyings
584 * should have been lowered to scalars at this point anyway.
585 */
586 if (store_intr->num_components != 1)
587 return false;
588
589 if (out_var->data.location < VARYING_SLOT_VAR0 ||
590 out_var->data.location - VARYING_SLOT_VAR0 >= MAX_VARYING)
591 return false;
592
593 nir_function_impl *impl = nir_shader_get_entrypoint(shader);
594
595 nir_builder b;
596 nir_builder_init(&b, impl);
597
598 bool progress = false;
599 nir_foreach_block(block, impl) {
600 nir_foreach_instr(instr, block) {
601 if (instr->type != nir_instr_type_intrinsic)
602 continue;
603
604 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
605 if (intr->intrinsic != nir_intrinsic_load_deref)
606 continue;
607
608 nir_variable *in_var =
609 nir_deref_instr_get_variable(nir_src_as_deref(intr->src[0]));
610
611 if (in_var->data.mode != nir_var_shader_in)
612 continue;
613
614 if (in_var->data.location != out_var->data.location ||
615 in_var->data.location_frac != out_var->data.location_frac)
616 continue;
617
618 b.cursor = nir_before_instr(instr);
619
620 nir_load_const_instr *out_const =
621 nir_instr_as_load_const(store_intr->src[1].ssa->parent_instr);
622
623 /* Add new const to replace the input */
624 nir_ssa_def *nconst = nir_build_imm(&b, store_intr->num_components,
625 intr->dest.ssa.bit_size,
626 out_const->value);
627
628 nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(nconst));
629
630 progress = true;
631 }
632 }
633
634 return progress;
635 }
636
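/* Propagate constant outputs from the producer into the consumer. Any outputs
 * that become unused as a result can then be removed with a follow-up
 * nir_remove_unused_varyings() call (a suggested ordering, not something this
 * pass does itself).
 */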
637 bool
638 nir_link_constant_varyings(nir_shader *producer, nir_shader *consumer)
639 {
640 /* TODO: Add support for more shader stage combinations */
641 if (consumer->info.stage != MESA_SHADER_FRAGMENT ||
642 (producer->info.stage != MESA_SHADER_VERTEX &&
643 producer->info.stage != MESA_SHADER_TESS_EVAL))
644 return false;
645
646 bool progress = false;
647
648 nir_function_impl *impl = nir_shader_get_entrypoint(producer);
649
650 /* If we find a store in the last block of the producer we can be sure this
651 * is the only possible value for this output.
652 */
653 nir_block *last_block = nir_impl_last_block(impl);
654 nir_foreach_instr_reverse(instr, last_block) {
655 if (instr->type != nir_instr_type_intrinsic)
656 continue;
657
658 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
659
660 if (intr->intrinsic != nir_intrinsic_store_deref)
661 continue;
662
663 if (intr->src[1].ssa->parent_instr->type != nir_instr_type_load_const) {
664 continue;
665 }
666
667 progress |= try_replace_constant_input(consumer, intr);
668 }
669
670 return progress;
671 }