/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "nir.h"
#include "nir_builder.h"
#include "util/set.h"
#include "util/hash_table.h"

/* This file contains various little helpers for doing simple linking in
 * NIR. Eventually, we'll probably want a full-blown varying packing
 * implementation in here. Right now, it just deletes unused things.
 */
/**
 * Returns the bits in the inputs_read, outputs_written, or
 * system_values_read bitfield corresponding to this variable.
 */
static uint64_t
get_variable_io_mask(nir_variable *var, gl_shader_stage stage)
{
   if (var->data.location < 0)
      return 0;

   unsigned location = var->data.patch ?
      var->data.location - VARYING_SLOT_PATCH0 : var->data.location;

   assert(var->data.mode == nir_var_shader_in ||
          var->data.mode == nir_var_shader_out ||
          var->data.mode == nir_var_system_value);
   assert(var->data.location >= 0);

   const struct glsl_type *type = var->type;
   if (nir_is_per_vertex_io(var, stage)) {
      assert(glsl_type_is_array(type));
      type = glsl_get_array_element(type);
   }

   unsigned slots = glsl_count_attribute_slots(type, false);
   return ((1ull << slots) - 1) << location;
}

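/* Example for get_variable_io_mask() (illustrative, not from the original
 * source): a non-patch `vec4 foo[3]` covers three attribute slots, giving
 * the mask 0b111 << var->data.location.
 */

/* Accumulate the output slots that the tessellation control shader reads
 * back itself; TCS invocations may read outputs written by other
 * invocations, so such outputs count as used even if the TES ignores them.
 */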
static void
tcs_add_output_reads(nir_shader *shader, uint64_t *read, uint64_t *patches_read)
{
   nir_foreach_function(function, shader) {
      if (!function->impl)
         continue;

      nir_foreach_block(block, function->impl) {
         nir_foreach_instr(instr, block) {
            if (instr->type != nir_instr_type_intrinsic)
               continue;

            nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
            if (intrin->intrinsic != nir_intrinsic_load_deref)
               continue;

            nir_variable *var =
               nir_deref_instr_get_variable(nir_src_as_deref(intrin->src[0]));

            if (var->data.mode != nir_var_shader_out)
               continue;

            if (var->data.patch) {
               patches_read[var->data.location_frac] |=
                  get_variable_io_mask(var, shader->info.stage);
            } else {
               read[var->data.location_frac] |=
                  get_variable_io_mask(var, shader->info.stage);
            }
         }
      }
   }
}

/**
 * Helper for removing unused shader I/O variables, by demoting them to global
 * variables (which may then be dead code eliminated).
 *
 * Example usage is:
 *
 * progress = nir_remove_unused_io_vars(producer,
 *                                      &producer->outputs,
 *                                      read, patches_read) ||
 *                                      progress;
 *
 * The "used" parameters should each be an array of 4 uint64_t bitfields
 * (probably of VARYING_BIT_* values), one mask per possible .location_frac.
 * Note that for vector variables, only the first channel (.location_frac)
 * is examined for deciding if the variable is used!
 */
bool
nir_remove_unused_io_vars(nir_shader *shader, struct exec_list *var_list,
                          uint64_t *used_by_other_stage,
                          uint64_t *used_by_other_stage_patches)
{
   bool progress = false;
   uint64_t *used;

   nir_foreach_variable_safe(var, var_list) {
      if (var->data.patch)
         used = used_by_other_stage_patches;
      else
         used = used_by_other_stage;

      if (var->data.location < VARYING_SLOT_VAR0 && var->data.location >= 0)
         continue;

      if (var->data.always_active_io)
         continue;

      uint64_t other_stage = used[var->data.location_frac];

      if (!(other_stage & get_variable_io_mask(var, shader->info.stage))) {
         /* This one is invalid, make it a global variable instead */
         var->data.location = 0;
         var->data.mode = nir_var_global;

         exec_node_remove(&var->node);
         exec_list_push_tail(&shader->globals, &var->node);

         progress = true;
      }
   }

   if (progress)
      nir_fixup_deref_modes(shader);

   return progress;
}

bool
nir_remove_unused_varyings(nir_shader *producer, nir_shader *consumer)
{
   assert(producer->info.stage != MESA_SHADER_FRAGMENT);
   assert(consumer->info.stage != MESA_SHADER_VERTEX);

   uint64_t read[4] = { 0 }, written[4] = { 0 };
   uint64_t patches_read[4] = { 0 }, patches_written[4] = { 0 };

   nir_foreach_variable(var, &producer->outputs) {
      if (var->data.patch) {
         patches_written[var->data.location_frac] |=
            get_variable_io_mask(var, producer->info.stage);
      } else {
         written[var->data.location_frac] |=
            get_variable_io_mask(var, producer->info.stage);
      }
   }

   nir_foreach_variable(var, &consumer->inputs) {
      if (var->data.patch) {
         patches_read[var->data.location_frac] |=
            get_variable_io_mask(var, consumer->info.stage);
      } else {
         read[var->data.location_frac] |=
            get_variable_io_mask(var, consumer->info.stage);
      }
   }

   /* Each TCS invocation can read data written by other TCS invocations,
    * so even if the outputs are not used by the TES we must also make
    * sure they are not read by the TCS before demoting them to globals.
    */
   if (producer->info.stage == MESA_SHADER_TESS_CTRL)
      tcs_add_output_reads(producer, read, patches_read);

   bool progress = false;
   progress = nir_remove_unused_io_vars(producer, &producer->outputs, read,
                                        patches_read);

   progress = nir_remove_unused_io_vars(consumer, &consumer->inputs, written,
                                        patches_written) || progress;

   return progress;
}
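
/* Typical driver usage of the above (an illustrative sketch, not from this
 * file): demoted variables become globals, so a driver will typically follow
 * this pass with something like nir_lower_global_vars_to_local() and
 * dead-code elimination to actually delete them:
 *
 *    if (nir_remove_unused_varyings(producer, consumer)) {
 *       nir_lower_global_vars_to_local(producer);
 *       nir_lower_global_vars_to_local(consumer);
 *    }
 */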

static uint8_t
get_interp_type(nir_variable *var, bool default_to_smooth_interp)
{
   if (var->data.interpolation != INTERP_MODE_NONE)
      return var->data.interpolation;
   else if (default_to_smooth_interp)
      return INTERP_MODE_SMOOTH;
   else
      return INTERP_MODE_NONE;
}

#define INTERPOLATE_LOC_SAMPLE 0
#define INTERPOLATE_LOC_CENTROID 1
#define INTERPOLATE_LOC_CENTER 2

static uint8_t
get_interp_loc(nir_variable *var)
{
   if (var->data.sample)
      return INTERPOLATE_LOC_SAMPLE;
   else if (var->data.centroid)
      return INTERPOLATE_LOC_CENTROID;
   else
      return INTERPOLATE_LOC_CENTER;
}
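
/* Gather, for each of the 32 generic varying slots, the mask of components
 * in use together with the interpolation type and location, so that
 * compact_components() below can later pack scalar varyings into the gaps.
 */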
static void
get_slot_component_masks_and_interp_types(struct exec_list *var_list,
                                          uint8_t *comps,
                                          uint8_t *interp_type,
                                          uint8_t *interp_loc,
                                          gl_shader_stage stage,
                                          bool default_to_smooth_interp)
{
   nir_foreach_variable_safe(var, var_list) {
      assert(var->data.location >= 0);

      /* Only remap things that aren't built-ins.
       * TODO: add TES patch support.
       */
      if (var->data.location >= VARYING_SLOT_VAR0 &&
          var->data.location - VARYING_SLOT_VAR0 < 32) {

         const struct glsl_type *type = var->type;
         if (nir_is_per_vertex_io(var, stage)) {
            assert(glsl_type_is_array(type));
            type = glsl_get_array_element(type);
         }

         unsigned location = var->data.location - VARYING_SLOT_VAR0;
         unsigned elements =
            glsl_get_vector_elements(glsl_without_array(type));

         bool dual_slot = glsl_type_is_dual_slot(glsl_without_array(type));
         unsigned slots = glsl_count_attribute_slots(type, false);
         unsigned comps_slot2 = 0;
         for (unsigned i = 0; i < slots; i++) {
            interp_type[location + i] =
               get_interp_type(var, default_to_smooth_interp);
            interp_loc[location + i] = get_interp_loc(var);

            if (dual_slot) {
               if (i & 1) {
                  comps[location + i] |= ((1 << comps_slot2) - 1);
               } else {
                  unsigned num_comps = 4 - var->data.location_frac;
                  comps_slot2 = (elements * 2) - num_comps;

                  /* Assume ARB_enhanced_layouts packing rules for doubles */
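                  /* Illustrative example (ours): a dvec3 at location_frac 0
                   * occupies six 32-bit components; num_comps = 4 of them
                   * fill the first slot and the remaining comps_slot2 = 2
                   * land in the second.
                   */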
                  assert(var->data.location_frac == 0 ||
                         var->data.location_frac == 2);
                  assert(comps_slot2 <= 4);

                  comps[location + i] |=
                     ((1 << num_comps) - 1) << var->data.location_frac;
               }
            } else {
               comps[location + i] |=
                  ((1 << elements) - 1) << var->data.location_frac;
            }
         }
      }
   }
}

struct varying_loc
{
   uint8_t component;
   uint32_t location;
};
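
/* Apply the remap table built by compact_components() to a variable list and
 * recompute the shader-info slot bitmasks. Illustrative example (ours):
 * remap[loc][frac] = { .component = 2, .location = VARYING_SLOT_VAR3 } moves
 * the scalar living at (VARYING_SLOT_VAR0 + loc, channel frac) to component 2
 * of VARYING_SLOT_VAR3.
 */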
static void
remap_slots_and_components(struct exec_list *var_list, gl_shader_stage stage,
                           struct varying_loc (*remap)[4],
                           uint64_t *slots_used, uint64_t *out_slots_read)
{
   uint64_t out_slots_read_tmp = 0;

   /* We don't touch builtins so just copy the bitmask */
   uint64_t slots_used_tmp =
      *slots_used & (((uint64_t)1 << (VARYING_SLOT_VAR0 - 1)) - 1);

   nir_foreach_variable(var, var_list) {
      assert(var->data.location >= 0);

      /* Only remap things that aren't built-ins */
      if (var->data.location >= VARYING_SLOT_VAR0 &&
          var->data.location - VARYING_SLOT_VAR0 < 32) {
         assert(var->data.location - VARYING_SLOT_VAR0 < 32);

         const struct glsl_type *type = var->type;
         if (nir_is_per_vertex_io(var, stage)) {
            assert(glsl_type_is_array(type));
            type = glsl_get_array_element(type);
         }

         unsigned num_slots = glsl_count_attribute_slots(type, false);
         bool used_across_stages = false;
         bool outputs_read = false;

         unsigned location = var->data.location - VARYING_SLOT_VAR0;
         struct varying_loc *new_loc = &remap[location][var->data.location_frac];

         uint64_t slots = (((uint64_t)1 << num_slots) - 1) << var->data.location;
         if (slots & *slots_used)
            used_across_stages = true;

         if (slots & *out_slots_read)
            outputs_read = true;

         if (new_loc->location) {
            var->data.location = new_loc->location;
            var->data.location_frac = new_loc->component;
         }

         if (var->data.always_active_io) {
            /* We can't apply link-time optimisations (specifically array
             * splitting) to these, so we need to copy the existing mask;
             * otherwise we will mess up the mask for things like partially
             * marked arrays.
             */
            if (used_across_stages) {
               slots_used_tmp |=
                  *slots_used & (((uint64_t)1 << num_slots) - 1) << var->data.location;
            }

            if (outputs_read) {
               out_slots_read_tmp |=
                  *out_slots_read & (((uint64_t)1 << num_slots) - 1) << var->data.location;
            }

         } else {
            for (unsigned i = 0; i < num_slots; i++) {
               if (used_across_stages)
                  slots_used_tmp |= (uint64_t)1 << (var->data.location + i);

               if (outputs_read)
                  out_slots_read_tmp |= (uint64_t)1 << (var->data.location + i);
            }
         }
      }
   }

   *slots_used = slots_used_tmp;
   *out_slots_read = out_slots_read_tmp;
}

/* If there are empty components in the slot, compact the remaining components
 * as close to component 0 as possible. This will make it easier to fill the
 * empty components with components from a different slot in a following pass.
 */
static void
compact_components(nir_shader *producer, nir_shader *consumer, uint8_t *comps,
                   uint8_t *interp_type, uint8_t *interp_loc,
                   bool default_to_smooth_interp)
{
   struct exec_list *input_list = &consumer->inputs;
   struct exec_list *output_list = &producer->outputs;
   struct varying_loc remap[32][4] = {{{0}, {0}}};

   /* Create a cursor for each interpolation type */
   unsigned cursor[4] = {0};

   /* We only need to pass over one stage and we choose the consumer as it
    * seems to cause a larger reduction in instruction counts (tested on i965).
    */
   nir_foreach_variable(var, input_list) {

      /* Only remap things that aren't builtins.
       * TODO: add TES patch support.
       */
      if (var->data.location >= VARYING_SLOT_VAR0 &&
          var->data.location - VARYING_SLOT_VAR0 < 32) {

         /* We can't repack xfb varyings. */
         if (var->data.always_active_io)
            continue;

         const struct glsl_type *type = var->type;
         if (nir_is_per_vertex_io(var, consumer->info.stage)) {
            assert(glsl_type_is_array(type));
            type = glsl_get_array_element(type);
         }

         /* Skip types that require more complex packing handling.
          * TODO: add support for these types.
          */
         if (glsl_type_is_array(type) ||
             glsl_type_is_dual_slot(type) ||
             glsl_type_is_matrix(type) ||
             glsl_type_is_struct(type) ||
             glsl_type_is_64bit(type))
            continue;

         /* We ignore complex types above and all other vector types should
          * have been split into scalar variables by the lower_io_to_scalar
          * pass. The only exception should be OpenGL xfb varyings.
          */
         if (glsl_get_vector_elements(type) != 1)
            continue;

         unsigned location = var->data.location - VARYING_SLOT_VAR0;
         uint8_t used_comps = comps[location];

         /* If there are no empty components there is nothing more for us to
          * do.
          */
         if (used_comps == 0xf)
            continue;

         bool found_new_offset = false;
         uint8_t interp = get_interp_type(var, default_to_smooth_interp);
         for (; cursor[interp] < 32; cursor[interp]++) {
            uint8_t cursor_used_comps = comps[cursor[interp]];

            /* We have reached the varying's own slot without finding space
             * any lower, so give up and continue on.
             */
            if (cursor[interp] == location &&
                (var->data.location_frac == 0 ||
                 cursor_used_comps & ((1 << (var->data.location_frac)) - 1)))
               break;

            /* We can only pack varyings with matching interpolation types */
            if (interp_type[cursor[interp]] != interp)
               continue;

            /* Interpolation loc must match also.
             * TODO: i965 can handle these if they don't match, but the
             * radeonsi nir backend handles everything as vec4s and so expects
             * this to be the same for all components. We could make this
             * check driver specific or drop it if NIR ever becomes the only
             * radeonsi backend.
             */
            if (interp_loc[cursor[interp]] != get_interp_loc(var))
               continue;

            /* If the slot is empty just skip it for now, compact_var_list()
             * can be called after this function to remove empty slots for us.
             * TODO: finish implementing compact_var_list(); it requires array
             * and matrix splitting.
             */
            if (!cursor_used_comps)
               continue;

            uint8_t unused_comps = ~cursor_used_comps;

            for (unsigned i = 0; i < 4; i++) {
               uint8_t new_var_comps = 1 << i;
               if (unused_comps & new_var_comps) {
                  remap[location][var->data.location_frac].component = i;
                  remap[location][var->data.location_frac].location =
                     cursor[interp] + VARYING_SLOT_VAR0;

                  found_new_offset = true;

                  /* Turn off the mask for the component we are remapping */
                  if (comps[location] & 1 << var->data.location_frac) {
                     comps[location] ^= 1 << var->data.location_frac;
                     comps[cursor[interp]] |= new_var_comps;
                  }
                  break;
               }
            }

            if (found_new_offset)
               break;
         }
      }
   }

   uint64_t zero = 0;
   remap_slots_and_components(input_list, consumer->info.stage, remap,
                              &consumer->info.inputs_read, &zero);
   remap_slots_and_components(output_list, producer->info.stage, remap,
                              &producer->info.outputs_written,
                              &producer->info.outputs_read);
}

/* We assume that this has been called more-or-less directly after
 * remove_unused_varyings. At this point, all of the varyings that we
 * aren't going to be using have been completely removed and the
 * inputs_read and outputs_written fields in nir_shader_info reflect
 * this. Therefore, the total set of valid slots is the OR of the two
 * sets of varyings; this accounts for varyings which one side may need
 * to read/write even if the other doesn't. This can happen if, for
 * instance, an array is used indirectly from one side causing it to be
 * unsplittable but directly from the other.
 */
void
nir_compact_varyings(nir_shader *producer, nir_shader *consumer,
                     bool default_to_smooth_interp)
{
   assert(producer->info.stage != MESA_SHADER_FRAGMENT);
   assert(consumer->info.stage != MESA_SHADER_VERTEX);

   uint8_t comps[32] = {0};
   uint8_t interp_type[32] = {0};
   uint8_t interp_loc[32] = {0};

   get_slot_component_masks_and_interp_types(&producer->outputs, comps,
                                             interp_type, interp_loc,
                                             producer->info.stage,
                                             default_to_smooth_interp);
   get_slot_component_masks_and_interp_types(&consumer->inputs, comps,
                                             interp_type, interp_loc,
                                             consumer->info.stage,
                                             default_to_smooth_interp);

   compact_components(producer, consumer, comps, interp_type, interp_loc,
                      default_to_smooth_interp);
}
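
/* Illustrative linker sequence (a sketch, not from this file; exact ordering
 * varies by driver): mark xfb varyings first so the passes above leave them
 * alone, then remove unused varyings and compact what remains:
 *
 *    nir_link_xfb_varyings(producer, consumer);
 *    nir_remove_unused_varyings(producer, consumer);
 *    nir_compact_varyings(producer, consumer, default_to_smooth_interp);
 */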

/*
 * Mark XFB varyings as always_active_io in the consumer so the linking opts
 * don't touch them.
 */
void
nir_link_xfb_varyings(nir_shader *producer, nir_shader *consumer)
{
   nir_variable *input_vars[MAX_VARYING] = { 0 };

   nir_foreach_variable(var, &consumer->inputs) {
      if (var->data.location >= VARYING_SLOT_VAR0 &&
          var->data.location - VARYING_SLOT_VAR0 < MAX_VARYING) {

         unsigned location = var->data.location - VARYING_SLOT_VAR0;
         input_vars[location] = var;
      }
   }

   nir_foreach_variable(var, &producer->outputs) {
      if (var->data.location >= VARYING_SLOT_VAR0 &&
          var->data.location - VARYING_SLOT_VAR0 < MAX_VARYING) {

         if (!var->data.always_active_io)
            continue;

         unsigned location = var->data.location - VARYING_SLOT_VAR0;
         if (input_vars[location]) {
            input_vars[location]->data.always_active_io = true;
         }
      }
   }
}

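/* Given a store of a constant to an output varying in the producer, rewrite
 * all loads of the matching input in the consumer (the "shader" parameter)
 * to use the constant value directly.
 */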
static bool
try_replace_constant_input(nir_shader *shader,
                           nir_intrinsic_instr *store_intr)
{
   nir_variable *out_var =
      nir_deref_instr_get_variable(nir_src_as_deref(store_intr->src[0]));

   if (out_var->data.mode != nir_var_shader_out)
      return false;

   /* Skip types that require more complex handling.
    * TODO: add support for these types.
    */
   if (glsl_type_is_array(out_var->type) ||
       glsl_type_is_dual_slot(out_var->type) ||
       glsl_type_is_matrix(out_var->type) ||
       glsl_type_is_struct(out_var->type))
      return false;

   /* Limit this pass to scalars for now to keep things simple. Most varyings
    * should have been lowered to scalars at this point anyway.
    */
   if (store_intr->num_components != 1)
      return false;

   if (out_var->data.location < VARYING_SLOT_VAR0 ||
       out_var->data.location - VARYING_SLOT_VAR0 >= MAX_VARYING)
      return false;

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);

   nir_builder b;
   nir_builder_init(&b, impl);

   bool progress = false;
   nir_foreach_block(block, impl) {
      nir_foreach_instr(instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
         if (intr->intrinsic != nir_intrinsic_load_deref)
            continue;

         nir_variable *in_var =
            nir_deref_instr_get_variable(nir_src_as_deref(intr->src[0]));

         if (in_var->data.mode != nir_var_shader_in)
            continue;

         if (in_var->data.location != out_var->data.location ||
             in_var->data.location_frac != out_var->data.location_frac)
            continue;

         b.cursor = nir_before_instr(instr);

         nir_load_const_instr *out_const =
            nir_instr_as_load_const(store_intr->src[1].ssa->parent_instr);

         /* Add new const to replace the input */
         nir_ssa_def *nconst = nir_build_imm(&b, store_intr->num_components,
                                             intr->dest.ssa.bit_size,
                                             out_const->value);

         nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(nconst));

         progress = true;
      }
   }

   return progress;
}

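/* Fold constant outputs across a stage boundary: if the producer's last
 * block unconditionally stores a constant to an output, replace the
 * consumer's reads of the matching input with that constant.
 */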
bool
nir_link_constant_varyings(nir_shader *producer, nir_shader *consumer)
{
   /* TODO: Add support for more shader stage combinations */
   if (consumer->info.stage != MESA_SHADER_FRAGMENT ||
       (producer->info.stage != MESA_SHADER_VERTEX &&
        producer->info.stage != MESA_SHADER_TESS_EVAL))
      return false;

   bool progress = false;

   nir_function_impl *impl = nir_shader_get_entrypoint(producer);

   /* If we find a store in the last block of the producer we can be sure this
    * is the only possible value for this output.
    */
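   /* Illustrative example (ours, not from the original source): a vertex
    * shader whose main() ends with
    *
    *    fog_factor = 0.0;
    *
    * stores a load_const in its last block, so the fragment shader's loads
    * of fog_factor can be replaced with 0.0 by try_replace_constant_input().
    */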
   nir_block *last_block = nir_impl_last_block(impl);
   nir_foreach_instr_reverse(instr, last_block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

      if (intr->intrinsic != nir_intrinsic_store_deref)
         continue;

      if (intr->src[1].ssa->parent_instr->type != nir_instr_type_load_const)
         continue;

      progress |= try_replace_constant_input(consumer, intr);
   }

   return progress;
}