nir: Add nir_foreach_shader_in/out_variable helpers
[mesa.git] src/compiler/nir/nir_linking_helpers.c
1 /*
2 * Copyright © 2015 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "nir.h"
25 #include "nir_builder.h"
26 #include "util/set.h"
27 #include "util/hash_table.h"
28
29 /* This file contains various little helpers for doing simple linking in
30 * NIR. Eventually, we'll probably want a full-blown varying packing
31 * implementation in here. Right now, it just deletes unused things.
32 */
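
/* A rough sketch of how a driver's linker might chain these helpers when
 * linking two adjacent stages.  This is illustrative only; the exact ordering
 * and the lowering/dead-code passes run in between are driver specific:
 *
 *    nir_link_opt_varyings(producer, consumer);
 *    nir_remove_unused_varyings(producer, consumer);
 *    (run dead-variable/DCE passes over both shaders here)
 *    nir_compact_varyings(producer, consumer, default_to_smooth_interp);
 *    nir_link_xfb_varyings(producer, consumer);
 */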
33
34 /**
35 * Returns the bits in the inputs_read, outputs_written, or
36 * system_values_read bitfield corresponding to this variable.
37 */
38 static uint64_t
39 get_variable_io_mask(nir_variable *var, gl_shader_stage stage)
40 {
41 if (var->data.location < 0)
42 return 0;
43
44 unsigned location = var->data.patch ?
45 var->data.location - VARYING_SLOT_PATCH0 : var->data.location;
46
47 assert(var->data.mode == nir_var_shader_in ||
48 var->data.mode == nir_var_shader_out ||
49 var->data.mode == nir_var_system_value);
50 assert(var->data.location >= 0);
51
52 const struct glsl_type *type = var->type;
53 if (nir_is_per_vertex_io(var, stage) || var->data.per_view) {
54 assert(glsl_type_is_array(type));
55 type = glsl_get_array_element(type);
56 }
57
58 unsigned slots = glsl_count_attribute_slots(type, false);
59 return ((1ull << slots) - 1) << location;
60 }
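
/* For example, a non-patch mat4 output at VARYING_SLOT_VAR2 covers four
 * slots, so the helper above returns 0xf << VARYING_SLOT_VAR2 (a purely
 * illustrative case; patch locations are first made relative to
 * VARYING_SLOT_PATCH0).
 */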
61
62 static uint8_t
63 get_num_components(nir_variable *var)
64 {
65 if (glsl_type_is_struct_or_ifc(glsl_without_array(var->type)))
66 return 4;
67
68 return glsl_get_vector_elements(glsl_without_array(var->type));
69 }
70
71 static void
72 tcs_add_output_reads(nir_shader *shader, uint64_t *read, uint64_t *patches_read)
73 {
74 nir_foreach_function(function, shader) {
75 if (!function->impl)
76 continue;
77
78 nir_foreach_block(block, function->impl) {
79 nir_foreach_instr(instr, block) {
80 if (instr->type != nir_instr_type_intrinsic)
81 continue;
82
83 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
84 if (intrin->intrinsic != nir_intrinsic_load_deref)
85 continue;
86
87 nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
88 if (deref->mode != nir_var_shader_out)
89 continue;
90
91 nir_variable *var = nir_deref_instr_get_variable(deref);
92 for (unsigned i = 0; i < get_num_components(var); i++) {
93 if (var->data.patch) {
94 patches_read[var->data.location_frac + i] |=
95 get_variable_io_mask(var, shader->info.stage);
96 } else {
97 read[var->data.location_frac + i] |=
98 get_variable_io_mask(var, shader->info.stage);
99 }
100 }
101 }
102 }
103 }
104 }
105
106 /**
107 * Helper for removing unused shader I/O variables, by demoting them to global
108 * variables (which may then be dead-code eliminated).
109 *
110 * Example usage is:
111 *
112 * progress = nir_remove_unused_io_vars(producer, nir_var_shader_out,
113 * read, patches_read) ||
114 * progress;
115 *
116 * The "used" should be an array of 4 uint64_ts (probably of VARYING_BIT_*)
117 * representing each .location_frac used. Note that for vector variables,
118 * only the first channel (.location_frac) is examined for deciding if the
119 * variable is used!
120 */
121 bool
122 nir_remove_unused_io_vars(nir_shader *shader,
123 nir_variable_mode mode,
124 uint64_t *used_by_other_stage,
125 uint64_t *used_by_other_stage_patches)
126 {
127 bool progress = false;
128 uint64_t *used;
129
130 assert(mode == nir_var_shader_in || mode == nir_var_shader_out);
131 struct exec_list *var_list =
132 mode == nir_var_shader_in ? &shader->inputs : &shader->outputs;
133
134 nir_foreach_variable_safe(var, var_list) {
135 if (var->data.patch)
136 used = used_by_other_stage_patches;
137 else
138 used = used_by_other_stage;
139
140 if (var->data.location < VARYING_SLOT_VAR0 && var->data.location >= 0)
141 continue;
142
143 if (var->data.always_active_io)
144 continue;
145
146 if (var->data.explicit_xfb_buffer)
147 continue;
148
149 uint64_t other_stage = used[var->data.location_frac];
150
151 if (!(other_stage & get_variable_io_mask(var, shader->info.stage))) {
152 /* This one is unused, so make it a global variable instead */
153 var->data.location = 0;
154 var->data.mode = nir_var_shader_temp;
155
156 exec_node_remove(&var->node);
157 exec_list_push_tail(&shader->globals, &var->node);
158
159 progress = true;
160 }
161 }
162
163 if (progress)
164 nir_fixup_deref_modes(shader);
165
166 return progress;
167 }
168
169 bool
170 nir_remove_unused_varyings(nir_shader *producer, nir_shader *consumer)
171 {
172 assert(producer->info.stage != MESA_SHADER_FRAGMENT);
173 assert(consumer->info.stage != MESA_SHADER_VERTEX);
174
175 uint64_t read[4] = { 0 }, written[4] = { 0 };
176 uint64_t patches_read[4] = { 0 }, patches_written[4] = { 0 };
177
178 nir_foreach_shader_out_variable(var, producer) {
179 for (unsigned i = 0; i < get_num_components(var); i++) {
180 if (var->data.patch) {
181 patches_written[var->data.location_frac + i] |=
182 get_variable_io_mask(var, producer->info.stage);
183 } else {
184 written[var->data.location_frac + i] |=
185 get_variable_io_mask(var, producer->info.stage);
186 }
187 }
188 }
189
190 nir_foreach_shader_in_variable(var, consumer) {
191 for (unsigned i = 0; i < get_num_components(var); i++) {
192 if (var->data.patch) {
193 patches_read[var->data.location_frac + i] |=
194 get_variable_io_mask(var, consumer->info.stage);
195 } else {
196 read[var->data.location_frac + i] |=
197 get_variable_io_mask(var, consumer->info.stage);
198 }
199 }
200 }
201
202 /* Each TCS invocation can read data written by other TCS invocations,
203 * so even if the outputs are not used by the TES we must also make
204 * sure they are not read by the TCS before demoting them to globals.
205 */
206 if (producer->info.stage == MESA_SHADER_TESS_CTRL)
207 tcs_add_output_reads(producer, read, patches_read);
208
209 bool progress = false;
210 progress = nir_remove_unused_io_vars(producer, nir_var_shader_out, read,
211 patches_read);
212
213 progress = nir_remove_unused_io_vars(consumer, nir_var_shader_in, written,
214 patches_written) || progress;
215
216 return progress;
217 }
218
219 static uint8_t
220 get_interp_type(nir_variable *var, const struct glsl_type *type,
221 bool default_to_smooth_interp)
222 {
223 if (glsl_type_is_integer(type))
224 return INTERP_MODE_FLAT;
225 else if (var->data.interpolation != INTERP_MODE_NONE)
226 return var->data.interpolation;
227 else if (default_to_smooth_interp)
228 return INTERP_MODE_SMOOTH;
229 else
230 return INTERP_MODE_NONE;
231 }
232
233 #define INTERPOLATE_LOC_SAMPLE 0
234 #define INTERPOLATE_LOC_CENTROID 1
235 #define INTERPOLATE_LOC_CENTER 2
236
237 static uint8_t
238 get_interp_loc(nir_variable *var)
239 {
240 if (var->data.sample)
241 return INTERPOLATE_LOC_SAMPLE;
242 else if (var->data.centroid)
243 return INTERPOLATE_LOC_CENTROID;
244 else
245 return INTERPOLATE_LOC_CENTER;
246 }
247
248 static bool
249 is_packing_supported_for_type(const struct glsl_type *type)
250 {
251 /* We ignore complex types such as arrays, matrices, structs and bit sizes
252 * other than 32-bit. All other vector types should have been split into
253 * scalar variables by the lower_io_to_scalar pass. The only exception
254 * should be OpenGL xfb varyings.
255 * TODO: add support for more complex types?
256 */
257 return glsl_type_is_scalar(type) && glsl_type_is_32bit(type);
258 }
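
/* For example, a lone "float" or "int" varying passes the check above, while
 * a vec2, a double, or a float[2] does not and will keep its original
 * location.
 */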
259
260 struct assigned_comps
261 {
262 uint8_t comps;
263 uint8_t interp_type;
264 uint8_t interp_loc;
265 bool is_32bit;
266 };
267
268 /* Packing arrays and dual-slot varyings is difficult, so to avoid complex
269 * algorithms this function just assigns them their existing locations for now.
270 * TODO: allow better packing of complex types.
271 */
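/* For example, under those rules a dvec3 with location_frac 0 is recorded
 * below as using all four components of its first slot and components x/y of
 * its second slot.
 */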
272 static void
273 get_unmoveable_components_masks(struct exec_list *var_list,
274 struct assigned_comps *comps,
275 gl_shader_stage stage,
276 bool default_to_smooth_interp)
277 {
278 nir_foreach_variable_safe(var, var_list) {
279 assert(var->data.location >= 0);
280
281 /* Only remap things that aren't built-ins. */
282 if (var->data.location >= VARYING_SLOT_VAR0 &&
283 var->data.location - VARYING_SLOT_VAR0 < MAX_VARYINGS_INCL_PATCH) {
284
285 const struct glsl_type *type = var->type;
286 if (nir_is_per_vertex_io(var, stage) || var->data.per_view) {
287 assert(glsl_type_is_array(type));
288 type = glsl_get_array_element(type);
289 }
290
291 /* If we can pack this varying then don't mark the components as
292 * used.
293 */
294 if (is_packing_supported_for_type(type))
295 continue;
296
297 unsigned location = var->data.location - VARYING_SLOT_VAR0;
298
299 unsigned elements =
300 glsl_type_is_vector_or_scalar(glsl_without_array(type)) ?
301 glsl_get_vector_elements(glsl_without_array(type)) : 4;
302
303 bool dual_slot = glsl_type_is_dual_slot(glsl_without_array(type));
304 unsigned slots = glsl_count_attribute_slots(type, false);
305 unsigned dmul = glsl_type_is_64bit(glsl_without_array(type)) ? 2 : 1;
306 unsigned comps_slot2 = 0;
307 for (unsigned i = 0; i < slots; i++) {
308 if (dual_slot) {
309 if (i & 1) {
310 comps[location + i].comps |= ((1 << comps_slot2) - 1);
311 } else {
312 unsigned num_comps = 4 - var->data.location_frac;
313 comps_slot2 = (elements * dmul) - num_comps;
314
315 /* Assume ARB_enhanced_layouts packing rules for doubles */
316 assert(var->data.location_frac == 0 ||
317 var->data.location_frac == 2);
318 assert(comps_slot2 <= 4);
319
320 comps[location + i].comps |=
321 ((1 << num_comps) - 1) << var->data.location_frac;
322 }
323 } else {
324 comps[location + i].comps |=
325 ((1 << (elements * dmul)) - 1) << var->data.location_frac;
326 }
327
328 comps[location + i].interp_type =
329 get_interp_type(var, type, default_to_smooth_interp);
330 comps[location + i].interp_loc = get_interp_loc(var);
331 comps[location + i].is_32bit =
332 glsl_type_is_32bit(glsl_without_array(type));
333 }
334 }
335 }
336 }
337
338 struct varying_loc
339 {
340 uint8_t component;
341 uint32_t location;
342 };
343
344 static void
345 mark_all_used_slots(nir_variable *var, uint64_t *slots_used,
346 uint64_t slots_used_mask, unsigned num_slots)
347 {
348 unsigned loc_offset = var->data.patch ? VARYING_SLOT_PATCH0 : 0;
349
350 slots_used[var->data.patch ? 1 : 0] |= slots_used_mask &
351 BITFIELD64_RANGE(var->data.location - loc_offset, num_slots);
352 }
353
354 static void
355 mark_used_slot(nir_variable *var, uint64_t *slots_used, unsigned offset)
356 {
357 unsigned loc_offset = var->data.patch ? VARYING_SLOT_PATCH0 : 0;
358
359 slots_used[var->data.patch ? 1 : 0] |=
360 BITFIELD64_BIT(var->data.location - loc_offset + offset);
361 }
362
363 static void
364 remap_slots_and_components(struct exec_list *var_list, gl_shader_stage stage,
365 struct varying_loc (*remap)[4],
366 uint64_t *slots_used, uint64_t *out_slots_read,
367 uint32_t *p_slots_used, uint32_t *p_out_slots_read)
368 {
369 uint64_t out_slots_read_tmp[2] = {0};
370 uint64_t slots_used_tmp[2] = {0};
371
372 /* We don't touch builtins so just copy the bitmask */
373 slots_used_tmp[0] = *slots_used & BITFIELD64_RANGE(0, VARYING_SLOT_VAR0);
374
375 nir_foreach_variable(var, var_list) {
376 assert(var->data.location >= 0);
377
378 /* Only remap things that aren't built-ins */
379 if (var->data.location >= VARYING_SLOT_VAR0 &&
380 var->data.location - VARYING_SLOT_VAR0 < MAX_VARYINGS_INCL_PATCH) {
381
382 const struct glsl_type *type = var->type;
383 if (nir_is_per_vertex_io(var, stage) || var->data.per_view) {
384 assert(glsl_type_is_array(type));
385 type = glsl_get_array_element(type);
386 }
387
388 unsigned num_slots = glsl_count_attribute_slots(type, false);
389 bool used_across_stages = false;
390 bool outputs_read = false;
391
392 unsigned location = var->data.location - VARYING_SLOT_VAR0;
393 struct varying_loc *new_loc = &remap[location][var->data.location_frac];
394
395 unsigned loc_offset = var->data.patch ? VARYING_SLOT_PATCH0 : 0;
396 uint64_t used = var->data.patch ? *p_slots_used : *slots_used;
397 uint64_t outs_used =
398 var->data.patch ? *p_out_slots_read : *out_slots_read;
399 uint64_t slots =
400 BITFIELD64_RANGE(var->data.location - loc_offset, num_slots);
401
402 if (slots & used)
403 used_across_stages = true;
404
405 if (slots & outs_used)
406 outputs_read = true;
407
408 if (new_loc->location) {
409 var->data.location = new_loc->location;
410 var->data.location_frac = new_loc->component;
411 }
412
413 if (var->data.always_active_io) {
414 /* We can't apply link-time optimisations (specifically array
415 * splitting) to these, so we need to copy the existing mask;
416 * otherwise we will mess up the mask for things like partially
417 * marked arrays.
418 */
419 if (used_across_stages)
420 mark_all_used_slots(var, slots_used_tmp, used, num_slots);
421
422 if (outputs_read) {
423 mark_all_used_slots(var, out_slots_read_tmp, outs_used,
424 num_slots);
425 }
426 } else {
427 for (unsigned i = 0; i < num_slots; i++) {
428 if (used_across_stages)
429 mark_used_slot(var, slots_used_tmp, i);
430
431 if (outputs_read)
432 mark_used_slot(var, out_slots_read_tmp, i);
433 }
434 }
435 }
436 }
437
438 *slots_used = slots_used_tmp[0];
439 *out_slots_read = out_slots_read_tmp[0];
440 *p_slots_used = slots_used_tmp[1];
441 *p_out_slots_read = out_slots_read_tmp[1];
442 }
443
444 struct varying_component {
445 nir_variable *var;
446 uint8_t interp_type;
447 uint8_t interp_loc;
448 bool is_32bit;
449 bool is_patch;
450 bool is_intra_stage_only;
451 bool initialised;
452 };
453
454 static int
455 cmp_varying_component(const void *comp1_v, const void *comp2_v)
456 {
457 struct varying_component *comp1 = (struct varying_component *) comp1_v;
458 struct varying_component *comp2 = (struct varying_component *) comp2_v;
459
460 /* We want patches to be ordered at the end of the array */
461 if (comp1->is_patch != comp2->is_patch)
462 return comp1->is_patch ? 1 : -1;
463
464 /* We want to try to group together TCS outputs that are only read by other
465 * TCS invocations and not consumed by the following stage.
466 */
467 if (comp1->is_intra_stage_only != comp2->is_intra_stage_only)
468 return comp1->is_intra_stage_only ? 1 : -1;
469
470 /* We can only pack varyings with matching interpolation types so group
471 * them together.
472 */
473 if (comp1->interp_type != comp2->interp_type)
474 return comp1->interp_type - comp2->interp_type;
475
476 /* Interpolation loc must match also. */
477 if (comp1->interp_loc != comp2->interp_loc)
478 return comp1->interp_loc - comp2->interp_loc;
479
480 /* If everything else matches just use the original location to sort */
481 return comp1->var->data.location - comp2->var->data.location;
482 }
483
484 static void
485 gather_varying_component_info(nir_shader *producer, nir_shader *consumer,
486 struct varying_component **varying_comp_info,
487 unsigned *varying_comp_info_size,
488 bool default_to_smooth_interp)
489 {
490 unsigned store_varying_info_idx[MAX_VARYINGS_INCL_PATCH][4] = {{0}};
491 unsigned num_of_comps_to_pack = 0;
492
493 /* Count the number of varyings that can be packed and create a mapping
494 * of those varyings to the array we will pass to qsort.
495 */
496 nir_foreach_shader_out_variable(var, producer) {
497
498 /* Only remap things that aren't builtins. */
499 if (var->data.location >= VARYING_SLOT_VAR0 &&
500 var->data.location - VARYING_SLOT_VAR0 < MAX_VARYINGS_INCL_PATCH) {
501
502 /* We can't repack xfb varyings. */
503 if (var->data.always_active_io)
504 continue;
505
506 const struct glsl_type *type = var->type;
507 if (nir_is_per_vertex_io(var, producer->info.stage) || var->data.per_view) {
508 assert(glsl_type_is_array(type));
509 type = glsl_get_array_element(type);
510 }
511
512 if (!is_packing_supported_for_type(type))
513 continue;
514
515 unsigned loc = var->data.location - VARYING_SLOT_VAR0;
516 store_varying_info_idx[loc][var->data.location_frac] =
517 ++num_of_comps_to_pack;
518 }
519 }
520
521 *varying_comp_info_size = num_of_comps_to_pack;
522 *varying_comp_info = rzalloc_array(NULL, struct varying_component,
523 num_of_comps_to_pack);
524
525 nir_function_impl *impl = nir_shader_get_entrypoint(consumer);
526
527 /* Walk over the shader and populate the varying component info array */
528 nir_foreach_block(block, impl) {
529 nir_foreach_instr(instr, block) {
530 if (instr->type != nir_instr_type_intrinsic)
531 continue;
532
533 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
534 if (intr->intrinsic != nir_intrinsic_load_deref &&
535 intr->intrinsic != nir_intrinsic_interp_deref_at_centroid &&
536 intr->intrinsic != nir_intrinsic_interp_deref_at_sample &&
537 intr->intrinsic != nir_intrinsic_interp_deref_at_offset &&
538 intr->intrinsic != nir_intrinsic_interp_deref_at_vertex)
539 continue;
540
541 nir_deref_instr *deref = nir_src_as_deref(intr->src[0]);
542 if (deref->mode != nir_var_shader_in)
543 continue;
544
545 /* We only remap things that aren't builtins. */
546 nir_variable *in_var = nir_deref_instr_get_variable(deref);
547 if (in_var->data.location < VARYING_SLOT_VAR0)
548 continue;
549
550 unsigned location = in_var->data.location - VARYING_SLOT_VAR0;
551 if (location >= MAX_VARYINGS_INCL_PATCH)
552 continue;
553
554 unsigned var_info_idx =
555 store_varying_info_idx[location][in_var->data.location_frac];
556 if (!var_info_idx)
557 continue;
558
559 struct varying_component *vc_info =
560 &(*varying_comp_info)[var_info_idx-1];
561
562 if (!vc_info->initialised) {
563 const struct glsl_type *type = in_var->type;
564 if (nir_is_per_vertex_io(in_var, consumer->info.stage) ||
565 in_var->data.per_view) {
566 assert(glsl_type_is_array(type));
567 type = glsl_get_array_element(type);
568 }
569
570 vc_info->var = in_var;
571 vc_info->interp_type =
572 get_interp_type(in_var, type, default_to_smooth_interp);
573 vc_info->interp_loc = get_interp_loc(in_var);
574 vc_info->is_32bit = glsl_type_is_32bit(type);
575 vc_info->is_patch = in_var->data.patch;
576 vc_info->is_intra_stage_only = false;
577 vc_info->initialised = true;
578 }
579 }
580 }
581
582 /* Walk over the shader and populate the varying component info array
583 * for varyings which are read by other TCS invocations but are not consumed
584 * by the TES.
585 */
586 if (producer->info.stage == MESA_SHADER_TESS_CTRL) {
587 impl = nir_shader_get_entrypoint(producer);
588
589 nir_foreach_block(block, impl) {
590 nir_foreach_instr(instr, block) {
591 if (instr->type != nir_instr_type_intrinsic)
592 continue;
593
594 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
595 if (intr->intrinsic != nir_intrinsic_load_deref)
596 continue;
597
598 nir_deref_instr *deref = nir_src_as_deref(intr->src[0]);
599 if (deref->mode != nir_var_shader_out)
600 continue;
601
602 /* We only remap things that aren't builtins. */
603 nir_variable *out_var = nir_deref_instr_get_variable(deref);
604 if (out_var->data.location < VARYING_SLOT_VAR0)
605 continue;
606
607 unsigned location = out_var->data.location - VARYING_SLOT_VAR0;
608 if (location >= MAX_VARYINGS_INCL_PATCH)
609 continue;
610
611 unsigned var_info_idx =
612 store_varying_info_idx[location][out_var->data.location_frac];
613 if (!var_info_idx) {
614 /* Something went wrong: the shader interfaces didn't match, so
615 * abandon packing. This can happen for example when the
616 * inputs are scalars but the outputs are struct members.
617 */
618 *varying_comp_info_size = 0;
619 break;
620 }
621
622 struct varying_component *vc_info =
623 &(*varying_comp_info)[var_info_idx-1];
624
625 if (!vc_info->initialised) {
626 const struct glsl_type *type = out_var->type;
627 if (nir_is_per_vertex_io(out_var, producer->info.stage)) {
628 assert(glsl_type_is_array(type));
629 type = glsl_get_array_element(type);
630 }
631
632 vc_info->var = out_var;
633 vc_info->interp_type =
634 get_interp_type(out_var, type, default_to_smooth_interp);
635 vc_info->interp_loc = get_interp_loc(out_var);
636 vc_info->is_32bit = glsl_type_is_32bit(type);
637 vc_info->is_patch = out_var->data.patch;
638 vc_info->is_intra_stage_only = true;
639 vc_info->initialised = true;
640 }
641 }
642 }
643 }
644
645 for (unsigned i = 0; i < *varying_comp_info_size; i++ ) {
646 struct varying_component *vc_info = &(*varying_comp_info)[i];
647 if (!vc_info->initialised) {
648 /* Something went wrong: the shader interfaces didn't match, so
649 * abandon packing. This can happen for example when the outputs are
650 * scalars but the inputs are struct members.
651 */
652 *varying_comp_info_size = 0;
653 break;
654 }
655 }
656 }
657
658 static void
659 assign_remap_locations(struct varying_loc (*remap)[4],
660 struct assigned_comps *assigned_comps,
661 struct varying_component *info,
662 unsigned *cursor, unsigned *comp,
663 unsigned max_location)
664 {
665 unsigned tmp_cursor = *cursor;
666 unsigned tmp_comp = *comp;
667
668 for (; tmp_cursor < max_location; tmp_cursor++) {
669
670 if (assigned_comps[tmp_cursor].comps) {
671 /* We can only pack varyings with matching interpolation types, and
672 * the interpolation loc must match as well.
673 * TODO: i965 can handle interpolation locations that don't match,
674 * but the radeonsi nir backend handles everything as vec4s and so
675 * expects this to be the same for all components. We could make this
676 * check driver specific or drop it if NIR ever becomes the only
677 * radeonsi backend.
678 */
679 if (assigned_comps[tmp_cursor].interp_type != info->interp_type ||
680 assigned_comps[tmp_cursor].interp_loc != info->interp_loc) {
681 tmp_comp = 0;
682 continue;
683 }
684
685 /* We can only pack varyings with matching types, and the current
686 * algorithm only supports packing 32-bit.
687 */
688 if (!assigned_comps[tmp_cursor].is_32bit) {
689 tmp_comp = 0;
690 continue;
691 }
692
693 while (tmp_comp < 4 &&
694 (assigned_comps[tmp_cursor].comps & (1 << tmp_comp))) {
695 tmp_comp++;
696 }
697 }
698
699 if (tmp_comp == 4) {
700 tmp_comp = 0;
701 continue;
702 }
703
704 unsigned location = info->var->data.location - VARYING_SLOT_VAR0;
705
706 /* Once we have assigned a location mark it as used */
707 assigned_comps[tmp_cursor].comps |= (1 << tmp_comp);
708 assigned_comps[tmp_cursor].interp_type = info->interp_type;
709 assigned_comps[tmp_cursor].interp_loc = info->interp_loc;
710 assigned_comps[tmp_cursor].is_32bit = info->is_32bit;
711
712 /* Assign remap location */
713 remap[location][info->var->data.location_frac].component = tmp_comp++;
714 remap[location][info->var->data.location_frac].location =
715 tmp_cursor + VARYING_SLOT_VAR0;
716
717 break;
718 }
719
720 *cursor = tmp_cursor;
721 *comp = tmp_comp;
722 }
723
724 /* If there are empty components in the slot, compact the remaining components
725 * as close to component 0 as possible. This will make it easier to fill the
726 * empty components with components from a different slot in a following pass.
727 */
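/* For example, assuming two packable scalar varyings originally living in
 * VARYING_SLOT_VAR1.x and VARYING_SLOT_VAR3.y with matching interpolation,
 * both would typically end up remapped into VARYING_SLOT_VAR0 as components
 * x and y.
 */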
728 static void
729 compact_components(nir_shader *producer, nir_shader *consumer,
730 struct assigned_comps *assigned_comps,
731 bool default_to_smooth_interp)
732 {
733 struct exec_list *input_list = &consumer->inputs;
734 struct exec_list *output_list = &producer->outputs;
735 struct varying_loc remap[MAX_VARYINGS_INCL_PATCH][4] = {{{0}, {0}}};
736 struct varying_component *varying_comp_info;
737 unsigned varying_comp_info_size;
738
739 /* Gather varying component info */
740 gather_varying_component_info(producer, consumer, &varying_comp_info,
741 &varying_comp_info_size,
742 default_to_smooth_interp);
743
744 /* Sort varying components. */
745 qsort(varying_comp_info, varying_comp_info_size,
746 sizeof(struct varying_component), cmp_varying_component);
747
748 unsigned cursor = 0;
749 unsigned comp = 0;
750
751 /* Set the remap array based on the sorted components */
752 for (unsigned i = 0; i < varying_comp_info_size; i++ ) {
753 struct varying_component *info = &varying_comp_info[i];
754
755 assert(info->is_patch || cursor < MAX_VARYING);
756 if (info->is_patch) {
757 /* The list should be sorted with all non-patch inputs first followed
758 * by patch inputs. When we hit our first patch input, we need to
759 * reset the cursor to MAX_VARYING so we put them in the right slot.
760 */
761 if (cursor < MAX_VARYING) {
762 cursor = MAX_VARYING;
763 comp = 0;
764 }
765
766 assign_remap_locations(remap, assigned_comps, info,
767 &cursor, &comp, MAX_VARYINGS_INCL_PATCH);
768 } else {
769 assign_remap_locations(remap, assigned_comps, info,
770 &cursor, &comp, MAX_VARYING);
771
772 /* Check if we failed to assign a remap location. This can happen if,
773 * for example, there are a bunch of unmovable components with
774 * mismatching interpolation types, causing us to skip over locations
775 * that would have been useful for packing later components.
776 * The solution is to iterate over the locations again (this should
777 * happen very rarely in practice).
778 */
779 if (cursor == MAX_VARYING) {
780 cursor = 0;
781 comp = 0;
782 assign_remap_locations(remap, assigned_comps, info,
783 &cursor, &comp, MAX_VARYING);
784 }
785 }
786 }
787
788 ralloc_free(varying_comp_info);
789
790 uint64_t zero = 0;
791 uint32_t zero32 = 0;
792 remap_slots_and_components(input_list, consumer->info.stage, remap,
793 &consumer->info.inputs_read, &zero,
794 &consumer->info.patch_inputs_read, &zero32);
795 remap_slots_and_components(output_list, producer->info.stage, remap,
796 &producer->info.outputs_written,
797 &producer->info.outputs_read,
798 &producer->info.patch_outputs_written,
799 &producer->info.patch_outputs_read);
800 }
801
802 /* We assume that this has been called more-or-less directly after
803 * remove_unused_varyings. At this point, all of the varyings that we
804 * aren't going to be using have been completely removed and the
805 * inputs_read and outputs_written fields in nir_shader_info reflect
806 * this. Therefore, the total set of valid slots is the OR of the two
807 * sets of varyings; this accounts for varyings which one side may need
808 * to read/write even if the other doesn't. This can happen if, for
809 * instance, an array is used indirectly from one side (making it
810 * unsplittable) but directly from the other.
811 */
812 void
813 nir_compact_varyings(nir_shader *producer, nir_shader *consumer,
814 bool default_to_smooth_interp)
815 {
816 assert(producer->info.stage != MESA_SHADER_FRAGMENT);
817 assert(consumer->info.stage != MESA_SHADER_VERTEX);
818
819 struct assigned_comps assigned_comps[MAX_VARYINGS_INCL_PATCH] = {{0}};
820
821 get_unmoveable_components_masks(&producer->outputs, assigned_comps,
822 producer->info.stage,
823 default_to_smooth_interp);
824 get_unmoveable_components_masks(&consumer->inputs, assigned_comps,
825 consumer->info.stage,
826 default_to_smooth_interp);
827
828 compact_components(producer, consumer, assigned_comps,
829 default_to_smooth_interp);
830 }
831
832 /*
833 * Mark XFB varyings as always_active_io in the consumer so the linking opts
834 * don't touch them.
835 */
836 void
837 nir_link_xfb_varyings(nir_shader *producer, nir_shader *consumer)
838 {
839 nir_variable *input_vars[MAX_VARYING] = { 0 };
840
841 nir_foreach_shader_in_variable(var, consumer) {
842 if (var->data.location >= VARYING_SLOT_VAR0 &&
843 var->data.location - VARYING_SLOT_VAR0 < MAX_VARYING) {
844
845 unsigned location = var->data.location - VARYING_SLOT_VAR0;
846 input_vars[location] = var;
847 }
848 }
849
850 nir_foreach_shader_out_variable(var, producer) {
851 if (var->data.location >= VARYING_SLOT_VAR0 &&
852 var->data.location - VARYING_SLOT_VAR0 < MAX_VARYING) {
853
854 if (!var->data.always_active_io)
855 continue;
856
857 unsigned location = var->data.location - VARYING_SLOT_VAR0;
858 if (input_vars[location]) {
859 input_vars[location]->data.always_active_io = true;
860 }
861 }
862 }
863 }
864
865 static bool
866 does_varying_match(nir_variable *out_var, nir_variable *in_var)
867 {
868 return in_var->data.location == out_var->data.location &&
869 in_var->data.location_frac == out_var->data.location_frac;
870 }
871
872 static nir_variable *
873 get_matching_input_var(nir_shader *consumer, nir_variable *out_var)
874 {
875 nir_foreach_shader_in_variable(var, consumer) {
876 if (does_varying_match(out_var, var))
877 return var;
878 }
879
880 return NULL;
881 }
882
883 static bool
884 can_replace_varying(nir_variable *out_var)
885 {
886 /* Skip types that require more complex handling.
887 * TODO: add support for these types.
888 */
889 if (glsl_type_is_array(out_var->type) ||
890 glsl_type_is_dual_slot(out_var->type) ||
891 glsl_type_is_matrix(out_var->type) ||
892 glsl_type_is_struct_or_ifc(out_var->type))
893 return false;
894
895 /* Limit this pass to scalars for now to keep things simple. Most varyings
896 * should have been lowered to scalars at this point anyway.
897 */
898 if (!glsl_type_is_scalar(out_var->type))
899 return false;
900
901 if (out_var->data.location < VARYING_SLOT_VAR0 ||
902 out_var->data.location - VARYING_SLOT_VAR0 >= MAX_VARYING)
903 return false;
904
905 return true;
906 }
907
908 static bool
909 replace_constant_input(nir_shader *shader, nir_intrinsic_instr *store_intr)
910 {
911 nir_function_impl *impl = nir_shader_get_entrypoint(shader);
912
913 nir_builder b;
914 nir_builder_init(&b, impl);
915
916 nir_variable *out_var =
917 nir_deref_instr_get_variable(nir_src_as_deref(store_intr->src[0]));
918
919 bool progress = false;
920 nir_foreach_block(block, impl) {
921 nir_foreach_instr(instr, block) {
922 if (instr->type != nir_instr_type_intrinsic)
923 continue;
924
925 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
926 if (intr->intrinsic != nir_intrinsic_load_deref)
927 continue;
928
929 nir_deref_instr *in_deref = nir_src_as_deref(intr->src[0]);
930 if (in_deref->mode != nir_var_shader_in)
931 continue;
932
933 nir_variable *in_var = nir_deref_instr_get_variable(in_deref);
934
935 if (!does_varying_match(out_var, in_var))
936 continue;
937
938 b.cursor = nir_before_instr(instr);
939
940 nir_load_const_instr *out_const =
941 nir_instr_as_load_const(store_intr->src[1].ssa->parent_instr);
942
943 /* Add new const to replace the input */
944 nir_ssa_def *nconst = nir_build_imm(&b, store_intr->num_components,
945 intr->dest.ssa.bit_size,
946 out_const->value);
947
948 nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(nconst));
949
950 progress = true;
951 }
952 }
953
954 return progress;
955 }
956
957 static bool
958 replace_duplicate_input(nir_shader *shader, nir_variable *input_var,
959 nir_intrinsic_instr *dup_store_intr)
960 {
961 assert(input_var);
962
963 nir_function_impl *impl = nir_shader_get_entrypoint(shader);
964
965 nir_builder b;
966 nir_builder_init(&b, impl);
967
968 nir_variable *dup_out_var =
969 nir_deref_instr_get_variable(nir_src_as_deref(dup_store_intr->src[0]));
970
971 bool progress = false;
972 nir_foreach_block(block, impl) {
973 nir_foreach_instr(instr, block) {
974 if (instr->type != nir_instr_type_intrinsic)
975 continue;
976
977 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
978 if (intr->intrinsic != nir_intrinsic_load_deref)
979 continue;
980
981 nir_deref_instr *in_deref = nir_src_as_deref(intr->src[0]);
982 if (in_deref->mode != nir_var_shader_in)
983 continue;
984
985 nir_variable *in_var = nir_deref_instr_get_variable(in_deref);
986
987 if (!does_varying_match(dup_out_var, in_var) ||
988 in_var->data.interpolation != input_var->data.interpolation ||
989 get_interp_loc(in_var) != get_interp_loc(input_var))
990 continue;
991
992 b.cursor = nir_before_instr(instr);
993
994 nir_ssa_def *load = nir_load_var(&b, input_var);
995 nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(load));
996
997 progress = true;
998 }
999 }
1000
1001 return progress;
1002 }
1003
1004 bool
1005 nir_link_opt_varyings(nir_shader *producer, nir_shader *consumer)
1006 {
1007 /* TODO: Add support for more shader stage combinations */
1008 if (consumer->info.stage != MESA_SHADER_FRAGMENT ||
1009 (producer->info.stage != MESA_SHADER_VERTEX &&
1010 producer->info.stage != MESA_SHADER_TESS_EVAL))
1011 return false;
1012
1013 bool progress = false;
1014
1015 nir_function_impl *impl = nir_shader_get_entrypoint(producer);
1016
1017 struct hash_table *varying_values = _mesa_pointer_hash_table_create(NULL);
1018
1019 /* If we find a store in the last block of the producer we can be sure this
1020 * is the only possible value for this output.
1021 */
1022 nir_block *last_block = nir_impl_last_block(impl);
1023 nir_foreach_instr_reverse(instr, last_block) {
1024 if (instr->type != nir_instr_type_intrinsic)
1025 continue;
1026
1027 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
1028
1029 if (intr->intrinsic != nir_intrinsic_store_deref)
1030 continue;
1031
1032 nir_deref_instr *out_deref = nir_src_as_deref(intr->src[0]);
1033 if (out_deref->mode != nir_var_shader_out)
1034 continue;
1035
1036 nir_variable *out_var = nir_deref_instr_get_variable(out_deref);
1037 if (!can_replace_varying(out_var))
1038 continue;
1039
1040 if (intr->src[1].ssa->parent_instr->type == nir_instr_type_load_const) {
1041 progress |= replace_constant_input(consumer, intr);
1042 } else {
1043 struct hash_entry *entry =
1044 _mesa_hash_table_search(varying_values, intr->src[1].ssa);
1045 if (entry) {
1046 progress |= replace_duplicate_input(consumer,
1047 (nir_variable *) entry->data,
1048 intr);
1049 } else {
1050 nir_variable *in_var = get_matching_input_var(consumer, out_var);
1051 if (in_var) {
1052 _mesa_hash_table_insert(varying_values, intr->src[1].ssa,
1053 in_var);
1054 }
1055 }
1056 }
1057 }
1058
1059 _mesa_hash_table_destroy(varying_values, NULL);
1060
1061 return progress;
1062 }
1063
1064 /* TODO: is there a better helper somewhere to sort a list? */
1065
1066 static void
1067 insert_sorted(struct exec_list *var_list, nir_variable *new_var)
1068 {
1069 nir_foreach_variable(var, var_list) {
1070 if (var->data.location > new_var->data.location) {
1071 exec_node_insert_node_before(&var->node, &new_var->node);
1072 return;
1073 }
1074 }
1075 exec_list_push_tail(var_list, &new_var->node);
1076 }
1077
1078 static void
1079 sort_varyings(struct exec_list *var_list)
1080 {
1081 struct exec_list new_list;
1082 exec_list_make_empty(&new_list);
1083 nir_foreach_variable_safe(var, var_list) {
1084 exec_node_remove(&var->node);
1085 insert_sorted(&new_list, var);
1086 }
1087 exec_list_move_nodes_to(&new_list, var_list);
1088 }
1089
1090 void
1091 nir_assign_io_var_locations(struct exec_list *var_list, unsigned *size,
1092 gl_shader_stage stage)
1093 {
1094 unsigned location = 0;
1095 unsigned assigned_locations[VARYING_SLOT_TESS_MAX];
1096 uint64_t processed_locs[2] = {0};
1097
1098 sort_varyings(var_list);
1099
1100 int UNUSED last_loc = 0;
1101 bool last_partial = false;
1102 nir_foreach_variable(var, var_list) {
1103 const struct glsl_type *type = var->type;
1104 if (nir_is_per_vertex_io(var, stage) || var->data.per_view) {
1105 assert(glsl_type_is_array(type));
1106 type = glsl_get_array_element(type);
1107 }
1108
1109 int base;
1110 if (var->data.mode == nir_var_shader_in && stage == MESA_SHADER_VERTEX)
1111 base = VERT_ATTRIB_GENERIC0;
1112 else if (var->data.mode == nir_var_shader_out &&
1113 stage == MESA_SHADER_FRAGMENT)
1114 base = FRAG_RESULT_DATA0;
1115 else
1116 base = VARYING_SLOT_VAR0;
1117
1118 unsigned var_size;
1119 if (var->data.compact) {
1120 /* If we are inside a partial compact,
1121 * don't allow another compact to be in this slot
1122 * if it starts at component 0.
1123 */
1124 if (last_partial && var->data.location_frac == 0) {
1125 location++;
1126 }
1127
1128 /* compact variables must be arrays of scalars */
1129 assert(glsl_type_is_array(type));
1130 assert(glsl_type_is_scalar(glsl_get_array_element(type)));
1131 unsigned start = 4 * location + var->data.location_frac;
1132 unsigned end = start + glsl_get_length(type);
1133 var_size = end / 4 - location;
1134 last_partial = end % 4 != 0;
1135 } else {
1136 /* Compact variables bypass the normal varying compacting pass,
1137 * which means they cannot be in the same vec4 slot as a normal
1138 * variable. If part of the current slot is taken up by a compact
1139 * variable, we need to go to the next one.
1140 */
1141 if (last_partial) {
1142 location++;
1143 last_partial = false;
1144 }
1145 var_size = glsl_count_attribute_slots(type, false);
1146 }
1147
1148 /* Builtins don't allow component packing, so we only need to worry about
1149 * user-defined varyings sharing the same location.
1150 */
1151 bool processed = false;
1152 if (var->data.location >= base) {
1153 unsigned glsl_location = var->data.location - base;
1154
1155 for (unsigned i = 0; i < var_size; i++) {
1156 if (processed_locs[var->data.index] &
1157 ((uint64_t)1 << (glsl_location + i)))
1158 processed = true;
1159 else
1160 processed_locs[var->data.index] |=
1161 ((uint64_t)1 << (glsl_location + i));
1162 }
1163 }
1164
1165 /* Because component packing allows varyings to share the same location,
1166 * we may already have processed this location.
1167 */
1168 if (processed) {
1169 unsigned driver_location = assigned_locations[var->data.location];
1170 var->data.driver_location = driver_location;
1171
1172 /* An array may be packed such that it crosses multiple other arrays
1173 * or variables, so we need to make sure we have allocated the elements
1174 * consecutively if the previously processed var was shorter than
1175 * the current array we are processing.
1176 *
1177 * NOTE: The code below assumes the var list is ordered in ascending
1178 * location order.
1179 */
1180 assert(last_loc <= var->data.location);
1181 last_loc = var->data.location;
1182 unsigned last_slot_location = driver_location + var_size;
1183 if (last_slot_location > location) {
1184 unsigned num_unallocated_slots = last_slot_location - location;
1185 unsigned first_unallocated_slot = var_size - num_unallocated_slots;
1186 for (unsigned i = first_unallocated_slot; i < var_size; i++) {
1187 assigned_locations[var->data.location + i] = location;
1188 location++;
1189 }
1190 }
1191 continue;
1192 }
1193
1194 for (unsigned i = 0; i < var_size; i++) {
1195 assigned_locations[var->data.location + i] = location + i;
1196 }
1197
1198 var->data.driver_location = location;
1199 location += var_size;
1200 }
1201
1202 if (last_partial)
1203 location++;
1204
1205 *size = location;
1206 }
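
/* A minimal usage sketch (illustrative only; which lists a driver assigns,
 * and when, varies):
 *
 *    nir_assign_io_var_locations(&nir->inputs, &nir->num_inputs,
 *                                nir->info.stage);
 *    nir_assign_io_var_locations(&nir->outputs, &nir->num_outputs,
 *                                nir->info.stage);
 */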
1207
1208 static uint64_t
1209 get_linked_variable_location(unsigned location, bool patch)
1210 {
1211 if (!patch)
1212 return location;
1213
1214 /* Reserve locations 0...3 for special patch variables
1215 * like tess factors and bounding boxes, and the generic patch
1216 * variables will come after them.
1217 */
1218 if (location >= VARYING_SLOT_PATCH0)
1219 return location - VARYING_SLOT_PATCH0 + 4;
1220 else if (location >= VARYING_SLOT_TESS_LEVEL_OUTER &&
1221 location <= VARYING_SLOT_BOUNDING_BOX1)
1222 return location - VARYING_SLOT_TESS_LEVEL_OUTER;
1223 else
1224 unreachable("Unsupported variable in get_linked_variable_location.");
1225 }
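
/* With the mapping above, VARYING_SLOT_TESS_LEVEL_OUTER and
 * VARYING_SLOT_TESS_LEVEL_INNER map to patch locations 0 and 1, the
 * bounding-box slots to 2 and 3, and VARYING_SLOT_PATCH0 onwards to 4, 5, ...
 */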
1226
1227 static uint64_t
1228 get_linked_variable_io_mask(nir_variable *variable, gl_shader_stage stage)
1229 {
1230 const struct glsl_type *type = variable->type;
1231
1232 if (nir_is_per_vertex_io(variable, stage)) {
1233 assert(glsl_type_is_array(type));
1234 type = glsl_get_array_element(type);
1235 }
1236
1237 unsigned slots = glsl_count_attribute_slots(type, false);
1238 if (variable->data.compact) {
1239 unsigned component_count = variable->data.location_frac + glsl_get_length(type);
1240 slots = DIV_ROUND_UP(component_count, 4);
1241 }
1242
1243 uint64_t mask = u_bit_consecutive64(0, slots);
1244 return mask;
1245 }
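
/* For example, a compact float[6] (clip-distance style) array with
 * location_frac 0 has a component_count of 6 above and therefore occupies
 * DIV_ROUND_UP(6, 4) = 2 slots.
 */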
1246
1247 nir_linked_io_var_info
1248 nir_assign_linked_io_var_locations(nir_shader *producer, nir_shader *consumer)
1249 {
1250 assert(producer);
1251 assert(consumer);
1252
1253 uint64_t producer_output_mask = 0;
1254 uint64_t producer_patch_output_mask = 0;
1255
1256 nir_foreach_shader_out_variable(variable, producer) {
1257 uint64_t mask = get_linked_variable_io_mask(variable, producer->info.stage);
1258 uint64_t loc = get_linked_variable_location(variable->data.location, variable->data.patch);
1259
1260 if (variable->data.patch)
1261 producer_patch_output_mask |= mask << loc;
1262 else
1263 producer_output_mask |= mask << loc;
1264 }
1265
1266 uint64_t consumer_input_mask = 0;
1267 uint64_t consumer_patch_input_mask = 0;
1268
1269 nir_foreach_shader_in_variable(variable, consumer) {
1270 uint64_t mask = get_linked_variable_io_mask(variable, consumer->info.stage);
1271 uint64_t loc = get_linked_variable_location(variable->data.location, variable->data.patch);
1272
1273 if (variable->data.patch)
1274 consumer_patch_input_mask |= mask << loc;
1275 else
1276 consumer_input_mask |= mask << loc;
1277 }
1278
1279 uint64_t io_mask = producer_output_mask | consumer_input_mask;
1280 uint64_t patch_io_mask = producer_patch_output_mask | consumer_patch_input_mask;
1281
1282 nir_foreach_shader_out_variable(variable, producer) {
1283 uint64_t loc = get_linked_variable_location(variable->data.location, variable->data.patch);
1284
1285 if (variable->data.patch)
1286 variable->data.driver_location = util_bitcount64(patch_io_mask & u_bit_consecutive64(0, loc)) * 4;
1287 else
1288 variable->data.driver_location = util_bitcount64(io_mask & u_bit_consecutive64(0, loc)) * 4;
1289 }
1290
1291 nir_foreach_shader_in_variable(variable, consumer) {
1292 uint64_t loc = get_linked_variable_location(variable->data.location, variable->data.patch);
1293
1294 if (variable->data.patch)
1295 variable->data.driver_location = util_bitcount64(patch_io_mask & u_bit_consecutive64(0, loc)) * 4;
1296 else
1297 variable->data.driver_location = util_bitcount64(io_mask & u_bit_consecutive64(0, loc)) * 4;
1298 }
1299
1300 nir_linked_io_var_info result = {
1301 .num_linked_io_vars = util_bitcount64(io_mask),
1302 .num_linked_patch_io_vars = util_bitcount64(patch_io_mask),
1303 };
1304
1305 return result;
1306 }