src/amd/compiler/aco_instruction_selection_setup.cpp
1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #include <array>
26 #include <unordered_map>
27 #include "aco_ir.h"
28 #include "nir.h"
29 #include "vulkan/radv_shader.h"
30 #include "vulkan/radv_descriptor_set.h"
31 #include "sid.h"
32 #include "ac_exp_param.h"
33 #include "ac_shader_util.h"
34
35 #include "util/u_math.h"
36
37 #define MAX_INLINE_PUSH_CONSTS 8
38
39 namespace aco {
40
41 enum fs_input {
42 persp_sample_p1,
43 persp_sample_p2,
44 persp_center_p1,
45 persp_center_p2,
46 persp_centroid_p1,
47 persp_centroid_p2,
48 persp_pull_model,
49 linear_sample_p1,
50 linear_sample_p2,
51 linear_center_p1,
52 linear_center_p2,
53 linear_centroid_p1,
54 linear_centroid_p2,
55 line_stipple,
56 frag_pos_0,
57 frag_pos_1,
58 frag_pos_2,
59 frag_pos_3,
60 front_face,
61 ancillary,
62 sample_coverage,
63 fixed_pt,
64 max_inputs,
65 };
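/* A rough sketch of how these slots are used (based on the code below, not on
 * the original comments): each *_p1/*_p2 pair holds the two VGPRs of one
 * barycentric (i, j) coordinate set (see the enable_next handling in
 * add_fs_arg()), frag_pos_0..3 are the POS_{X,Y,Z,W}_FLOAT position values,
 * and fixed_pt is the fixed-point position. isel_context::fs_vgpr_args[]
 * records which of these slots the fragment shader actually needs.
 */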
66
67 struct vs_output_state {
68 uint8_t mask[VARYING_SLOT_VAR31 + 1];
69 Temp outputs[VARYING_SLOT_VAR31 + 1][4];
70 };
71
72 struct isel_context {
73 const struct radv_nir_compiler_options *options;
74 Program *program;
75 nir_shader *shader;
76 uint32_t constant_data_offset;
77 Block *block;
78 bool *divergent_vals;
79 std::unique_ptr<Temp[]> allocated;
80 std::unordered_map<unsigned, std::array<Temp,4>> allocated_vec;
 81           Stage stage; /* combined software/hardware stage bits */
82 bool has_gfx10_wave64_bpermute = false;
83 struct {
84 bool has_branch;
85 uint16_t loop_nest_depth = 0;
86 struct {
87 unsigned header_idx;
88 Block* exit;
89 bool has_divergent_continue = false;
90 bool has_divergent_branch = false;
91 } parent_loop;
92 struct {
93 bool is_divergent = false;
94 } parent_if;
95 bool exec_potentially_empty = false;
96 } cf_info;
97
98 /* inputs common for merged stages */
99 Temp merged_wave_info = Temp(0, s1);
100
101 /* FS inputs */
102 bool fs_vgpr_args[fs_input::max_inputs];
103 Temp fs_inputs[fs_input::max_inputs];
104 Temp prim_mask = Temp(0, s1);
105 Temp descriptor_sets[MAX_SETS];
106 Temp push_constants = Temp(0, s1);
107 Temp inline_push_consts[MAX_INLINE_PUSH_CONSTS];
108 unsigned num_inline_push_consts = 0;
109 unsigned base_inline_push_consts = 0;
110
111 /* VS inputs */
112 Temp vertex_buffers = Temp(0, s1);
113 Temp base_vertex = Temp(0, s1);
114 Temp start_instance = Temp(0, s1);
115 Temp draw_id = Temp(0, s1);
116 Temp view_index = Temp(0, s1);
117 Temp es2gs_offset = Temp(0, s1);
118 Temp vertex_id = Temp(0, v1);
119 Temp rel_auto_id = Temp(0, v1);
120 Temp instance_id = Temp(0, v1);
121 Temp vs_prim_id = Temp(0, v1);
122 bool needs_instance_id;
123
124 /* CS inputs */
125 Temp num_workgroups = Temp(0, s3);
126 Temp workgroup_ids[3] = {Temp(0, s1), Temp(0, s1), Temp(0, s1)};
127 Temp tg_size = Temp(0, s1);
128 Temp local_invocation_ids = Temp(0, v3);
129
130 /* VS output information */
131 unsigned num_clip_distances;
132 unsigned num_cull_distances;
133 vs_output_state vs_output;
134
135 /* Streamout */
136 Temp streamout_buffers = Temp(0, s1);
137 Temp streamout_write_idx = Temp(0, s1);
138 Temp streamout_config = Temp(0, s1);
139 Temp streamout_offset[4] = {Temp(0, s1), Temp(0, s1), Temp(0, s1), Temp(0, s1)};
140 };
141
142 fs_input get_interp_input(nir_intrinsic_op intrin, enum glsl_interp_mode interp)
143 {
144 switch (interp) {
145 case INTERP_MODE_SMOOTH:
146 case INTERP_MODE_NONE:
147 if (intrin == nir_intrinsic_load_barycentric_pixel ||
148 intrin == nir_intrinsic_load_barycentric_at_sample ||
149 intrin == nir_intrinsic_load_barycentric_at_offset)
150 return fs_input::persp_center_p1;
151 else if (intrin == nir_intrinsic_load_barycentric_centroid)
152 return fs_input::persp_centroid_p1;
153 else if (intrin == nir_intrinsic_load_barycentric_sample)
154 return fs_input::persp_sample_p1;
155 break;
156 case INTERP_MODE_NOPERSPECTIVE:
157 if (intrin == nir_intrinsic_load_barycentric_pixel)
158 return fs_input::linear_center_p1;
159 else if (intrin == nir_intrinsic_load_barycentric_centroid)
160 return fs_input::linear_centroid_p1;
161 else if (intrin == nir_intrinsic_load_barycentric_sample)
162 return fs_input::linear_sample_p1;
163 break;
164 default:
165 break;
166 }
167 return fs_input::max_inputs;
168 }
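/* Illustrative usage sketch: for a noperspective centroid barycentric load the
 * function picks the first VGPR of the matching pair, and max_inputs acts as
 * the "no interpolated FS input" sentinel:
 *
 *   fs_input in = get_interp_input(nir_intrinsic_load_barycentric_centroid,
 *                                  INTERP_MODE_NOPERSPECTIVE);
 *   // in == fs_input::linear_centroid_p1
 *
 * The intrinsic scan in init_context() uses the result to set
 * ctx->fs_vgpr_args[in].
 */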
169
170 void init_context(isel_context *ctx, nir_shader *shader)
171 {
172 nir_function_impl *impl = nir_shader_get_entrypoint(shader);
173
174 ctx->shader = shader;
175 ctx->divergent_vals = nir_divergence_analysis(shader, nir_divergence_view_index_uniform);
176
177 std::unique_ptr<Temp[]> allocated{new Temp[impl->ssa_alloc]()};
 178    memset(&ctx->fs_vgpr_args, 0, sizeof(ctx->fs_vgpr_args));
179
180 bool done = false;
181 while (!done) {
182 done = true;
183 nir_foreach_block(block, impl) {
184 nir_foreach_instr(instr, block) {
185 switch(instr->type) {
186 case nir_instr_type_alu: {
187 nir_alu_instr *alu_instr = nir_instr_as_alu(instr);
188 unsigned size = alu_instr->dest.dest.ssa.num_components;
189 if (alu_instr->dest.dest.ssa.bit_size == 64)
190 size *= 2;
191 RegType type = RegType::sgpr;
192 switch(alu_instr->op) {
193 case nir_op_fmul:
194 case nir_op_fadd:
195 case nir_op_fsub:
196 case nir_op_fmax:
197 case nir_op_fmin:
198 case nir_op_fmax3:
199 case nir_op_fmin3:
200 case nir_op_fmed3:
201 case nir_op_fneg:
202 case nir_op_fabs:
203 case nir_op_fsat:
204 case nir_op_fsign:
205 case nir_op_frcp:
206 case nir_op_frsq:
207 case nir_op_fsqrt:
208 case nir_op_fexp2:
209 case nir_op_flog2:
210 case nir_op_ffract:
211 case nir_op_ffloor:
212 case nir_op_fceil:
213 case nir_op_ftrunc:
214 case nir_op_fround_even:
215 case nir_op_fsin:
216 case nir_op_fcos:
217 case nir_op_f2f32:
218 case nir_op_f2f64:
219 case nir_op_u2f32:
220 case nir_op_u2f64:
221 case nir_op_i2f32:
222 case nir_op_i2f64:
223 case nir_op_pack_half_2x16:
224 case nir_op_unpack_half_2x16_split_x:
225 case nir_op_unpack_half_2x16_split_y:
226 case nir_op_fddx:
227 case nir_op_fddy:
228 case nir_op_fddx_fine:
229 case nir_op_fddy_fine:
230 case nir_op_fddx_coarse:
231 case nir_op_fddy_coarse:
232 case nir_op_fquantize2f16:
233 case nir_op_ldexp:
234 case nir_op_frexp_sig:
235 case nir_op_frexp_exp:
236 case nir_op_cube_face_index:
237 case nir_op_cube_face_coord:
238 type = RegType::vgpr;
239 break;
240 case nir_op_flt:
241 case nir_op_fge:
242 case nir_op_feq:
243 case nir_op_fne:
244 case nir_op_ilt:
245 case nir_op_ige:
246 case nir_op_ult:
247 case nir_op_uge:
248 case nir_op_ieq:
249 case nir_op_ine:
250 case nir_op_i2b1:
251 size = 2;
252 break;
253 case nir_op_f2i64:
254 case nir_op_f2u64:
255 case nir_op_b2i32:
256 case nir_op_b2f32:
257 case nir_op_f2i32:
258 case nir_op_f2u32:
259 type = ctx->divergent_vals[alu_instr->dest.dest.ssa.index] ? RegType::vgpr : RegType::sgpr;
260 break;
261 case nir_op_bcsel:
262 if (alu_instr->dest.dest.ssa.bit_size == 1) {
263 size = 2;
264 } else {
265 if (ctx->divergent_vals[alu_instr->dest.dest.ssa.index]) {
266 type = RegType::vgpr;
267 } else {
268 if (allocated[alu_instr->src[1].src.ssa->index].type() == RegType::vgpr ||
269 allocated[alu_instr->src[2].src.ssa->index].type() == RegType::vgpr) {
270 type = RegType::vgpr;
271 }
272 }
273 if (alu_instr->src[1].src.ssa->num_components == 1 && alu_instr->src[2].src.ssa->num_components == 1) {
274 assert(allocated[alu_instr->src[1].src.ssa->index].size() == allocated[alu_instr->src[2].src.ssa->index].size());
275 size = allocated[alu_instr->src[1].src.ssa->index].size();
276 }
277 }
278 break;
279 case nir_op_mov:
280 if (alu_instr->dest.dest.ssa.bit_size == 1) {
281 size = 2;
282 } else {
283 type = ctx->divergent_vals[alu_instr->dest.dest.ssa.index] ? RegType::vgpr : RegType::sgpr;
284 }
285 break;
286 default:
287 if (alu_instr->dest.dest.ssa.bit_size == 1) {
288 size = 2;
289 } else {
290 for (unsigned i = 0; i < nir_op_infos[alu_instr->op].num_inputs; i++) {
291 if (allocated[alu_instr->src[i].src.ssa->index].type() == RegType::vgpr)
292 type = RegType::vgpr;
293 }
294 }
295 break;
296 }
297 allocated[alu_instr->dest.dest.ssa.index] = Temp(0, RegClass(type, size));
298 break;
299 }
300 case nir_instr_type_load_const: {
301 unsigned size = nir_instr_as_load_const(instr)->def.num_components;
302 if (nir_instr_as_load_const(instr)->def.bit_size == 64)
303 size *= 2;
304 else if (nir_instr_as_load_const(instr)->def.bit_size == 1)
305 size *= 2;
306 allocated[nir_instr_as_load_const(instr)->def.index] = Temp(0, RegClass(RegType::sgpr, size));
307 break;
308 }
309 case nir_instr_type_intrinsic: {
310 nir_intrinsic_instr *intrinsic = nir_instr_as_intrinsic(instr);
311 if (!nir_intrinsic_infos[intrinsic->intrinsic].has_dest)
312 break;
313 unsigned size = intrinsic->dest.ssa.num_components;
314 if (intrinsic->dest.ssa.bit_size == 64)
315 size *= 2;
316 RegType type = RegType::sgpr;
317 switch(intrinsic->intrinsic) {
318 case nir_intrinsic_load_push_constant:
319 case nir_intrinsic_load_work_group_id:
320 case nir_intrinsic_load_num_work_groups:
321 case nir_intrinsic_load_subgroup_id:
322 case nir_intrinsic_load_num_subgroups:
323 case nir_intrinsic_load_first_vertex:
324 case nir_intrinsic_load_base_instance:
325 case nir_intrinsic_get_buffer_size:
326 case nir_intrinsic_vote_all:
327 case nir_intrinsic_vote_any:
328 case nir_intrinsic_read_first_invocation:
329 case nir_intrinsic_read_invocation:
330 case nir_intrinsic_first_invocation:
331 type = RegType::sgpr;
332 if (intrinsic->dest.ssa.bit_size == 1)
333 size = 2;
334 break;
335 case nir_intrinsic_ballot:
336 type = RegType::sgpr;
337 size = 2;
338 break;
339 case nir_intrinsic_load_sample_id:
340 case nir_intrinsic_load_sample_mask_in:
341 case nir_intrinsic_load_input:
342 case nir_intrinsic_load_vertex_id:
343 case nir_intrinsic_load_vertex_id_zero_base:
344 case nir_intrinsic_load_barycentric_sample:
345 case nir_intrinsic_load_barycentric_pixel:
346 case nir_intrinsic_load_barycentric_centroid:
347 case nir_intrinsic_load_barycentric_at_sample:
348 case nir_intrinsic_load_barycentric_at_offset:
349 case nir_intrinsic_load_interpolated_input:
350 case nir_intrinsic_load_frag_coord:
351 case nir_intrinsic_load_sample_pos:
352 case nir_intrinsic_load_layer_id:
353 case nir_intrinsic_load_local_invocation_id:
354 case nir_intrinsic_load_local_invocation_index:
355 case nir_intrinsic_load_subgroup_invocation:
356 case nir_intrinsic_write_invocation_amd:
357 case nir_intrinsic_mbcnt_amd:
358 case nir_intrinsic_load_instance_id:
359 case nir_intrinsic_ssbo_atomic_add:
360 case nir_intrinsic_ssbo_atomic_imin:
361 case nir_intrinsic_ssbo_atomic_umin:
362 case nir_intrinsic_ssbo_atomic_imax:
363 case nir_intrinsic_ssbo_atomic_umax:
364 case nir_intrinsic_ssbo_atomic_and:
365 case nir_intrinsic_ssbo_atomic_or:
366 case nir_intrinsic_ssbo_atomic_xor:
367 case nir_intrinsic_ssbo_atomic_exchange:
368 case nir_intrinsic_ssbo_atomic_comp_swap:
369 case nir_intrinsic_image_deref_atomic_add:
370 case nir_intrinsic_image_deref_atomic_umin:
371 case nir_intrinsic_image_deref_atomic_imin:
372 case nir_intrinsic_image_deref_atomic_umax:
373 case nir_intrinsic_image_deref_atomic_imax:
374 case nir_intrinsic_image_deref_atomic_and:
375 case nir_intrinsic_image_deref_atomic_or:
376 case nir_intrinsic_image_deref_atomic_xor:
377 case nir_intrinsic_image_deref_atomic_exchange:
378 case nir_intrinsic_image_deref_atomic_comp_swap:
379 case nir_intrinsic_image_deref_size:
380 case nir_intrinsic_shared_atomic_add:
381 case nir_intrinsic_shared_atomic_imin:
382 case nir_intrinsic_shared_atomic_umin:
383 case nir_intrinsic_shared_atomic_imax:
384 case nir_intrinsic_shared_atomic_umax:
385 case nir_intrinsic_shared_atomic_and:
386 case nir_intrinsic_shared_atomic_or:
387 case nir_intrinsic_shared_atomic_xor:
388 case nir_intrinsic_shared_atomic_exchange:
389 case nir_intrinsic_shared_atomic_comp_swap:
390 case nir_intrinsic_load_scratch:
391 type = RegType::vgpr;
392 break;
393 case nir_intrinsic_shuffle:
394 case nir_intrinsic_quad_broadcast:
395 case nir_intrinsic_quad_swap_horizontal:
396 case nir_intrinsic_quad_swap_vertical:
397 case nir_intrinsic_quad_swap_diagonal:
398 case nir_intrinsic_quad_swizzle_amd:
399 case nir_intrinsic_masked_swizzle_amd:
400 case nir_intrinsic_inclusive_scan:
401 case nir_intrinsic_exclusive_scan:
402 if (intrinsic->dest.ssa.bit_size == 1) {
403 size = 2;
404 type = RegType::sgpr;
405 } else if (!ctx->divergent_vals[intrinsic->dest.ssa.index]) {
406 type = RegType::sgpr;
407 } else {
408 type = RegType::vgpr;
409 }
410 break;
411 case nir_intrinsic_load_view_index:
412 type = ctx->stage == fragment_fs ? RegType::vgpr : RegType::sgpr;
413 break;
414 case nir_intrinsic_load_front_face:
415 case nir_intrinsic_load_helper_invocation:
416 case nir_intrinsic_is_helper_invocation:
417 type = RegType::sgpr;
418 size = 2;
419 break;
420 case nir_intrinsic_reduce:
421 if (intrinsic->dest.ssa.bit_size == 1) {
422 size = 2;
423 type = RegType::sgpr;
424 } else if (nir_intrinsic_cluster_size(intrinsic) == 0 ||
425 !ctx->divergent_vals[intrinsic->dest.ssa.index]) {
426 type = RegType::sgpr;
427 } else {
428 type = RegType::vgpr;
429 }
430 break;
431 case nir_intrinsic_load_ubo:
432 case nir_intrinsic_load_ssbo:
433 case nir_intrinsic_load_global:
434 case nir_intrinsic_vulkan_resource_index:
435 type = ctx->divergent_vals[intrinsic->dest.ssa.index] ? RegType::vgpr : RegType::sgpr;
436 break;
437 /* due to copy propagation, the swizzled imov is removed if num dest components == 1 */
438 case nir_intrinsic_load_shared:
439 if (ctx->divergent_vals[intrinsic->dest.ssa.index])
440 type = RegType::vgpr;
441 else
442 type = RegType::sgpr;
443 break;
444 default:
445 for (unsigned i = 0; i < nir_intrinsic_infos[intrinsic->intrinsic].num_srcs; i++) {
446 if (allocated[intrinsic->src[i].ssa->index].type() == RegType::vgpr)
447 type = RegType::vgpr;
448 }
449 break;
450 }
451 allocated[intrinsic->dest.ssa.index] = Temp(0, RegClass(type, size));
452
453 switch(intrinsic->intrinsic) {
454 case nir_intrinsic_load_barycentric_sample:
455 case nir_intrinsic_load_barycentric_pixel:
456 case nir_intrinsic_load_barycentric_centroid:
457 case nir_intrinsic_load_barycentric_at_sample:
458 case nir_intrinsic_load_barycentric_at_offset: {
459 glsl_interp_mode mode = (glsl_interp_mode)nir_intrinsic_interp_mode(intrinsic);
460 ctx->fs_vgpr_args[get_interp_input(intrinsic->intrinsic, mode)] = true;
461 break;
462 }
463 case nir_intrinsic_load_front_face:
464 ctx->fs_vgpr_args[fs_input::front_face] = true;
465 break;
466 case nir_intrinsic_load_frag_coord:
467 case nir_intrinsic_load_sample_pos: {
468 uint8_t mask = nir_ssa_def_components_read(&intrinsic->dest.ssa);
469 for (unsigned i = 0; i < 4; i++) {
470 if (mask & (1 << i))
471 ctx->fs_vgpr_args[fs_input::frag_pos_0 + i] = true;
472
473 }
474 break;
475 }
476 case nir_intrinsic_load_sample_id:
477 ctx->fs_vgpr_args[fs_input::ancillary] = true;
478 break;
479 case nir_intrinsic_load_sample_mask_in:
480 ctx->fs_vgpr_args[fs_input::ancillary] = true;
481 ctx->fs_vgpr_args[fs_input::sample_coverage] = true;
482 break;
483 default:
484 break;
485 }
486 break;
487 }
488 case nir_instr_type_tex: {
489 nir_tex_instr* tex = nir_instr_as_tex(instr);
490 unsigned size = tex->dest.ssa.num_components;
491
492 if (tex->dest.ssa.bit_size == 64)
493 size *= 2;
494 if (tex->op == nir_texop_texture_samples)
495 assert(!ctx->divergent_vals[tex->dest.ssa.index]);
496 if (ctx->divergent_vals[tex->dest.ssa.index])
497 allocated[tex->dest.ssa.index] = Temp(0, RegClass(RegType::vgpr, size));
498 else
499 allocated[tex->dest.ssa.index] = Temp(0, RegClass(RegType::sgpr, size));
500 break;
501 }
502 case nir_instr_type_parallel_copy: {
503 nir_foreach_parallel_copy_entry(entry, nir_instr_as_parallel_copy(instr)) {
504 allocated[entry->dest.ssa.index] = allocated[entry->src.ssa->index];
505 }
506 break;
507 }
508 case nir_instr_type_ssa_undef: {
509 unsigned size = nir_instr_as_ssa_undef(instr)->def.num_components;
510 if (nir_instr_as_ssa_undef(instr)->def.bit_size == 64)
511 size *= 2;
512 allocated[nir_instr_as_ssa_undef(instr)->def.index] = Temp(0, RegClass(RegType::sgpr, size));
513 break;
514 }
515 case nir_instr_type_phi: {
516 nir_phi_instr* phi = nir_instr_as_phi(instr);
517 RegType type;
518 unsigned size = phi->dest.ssa.num_components;
519
520 if (phi->dest.ssa.bit_size == 1) {
521 assert(size == 1 && "multiple components not yet supported on boolean phis.");
522 type = RegType::sgpr;
523 size *= 2;
524 allocated[phi->dest.ssa.index] = Temp(0, RegClass(type, size));
525 break;
526 }
527
528 if (ctx->divergent_vals[phi->dest.ssa.index]) {
529 type = RegType::vgpr;
530 } else {
531 type = RegType::sgpr;
532 nir_foreach_phi_src (src, phi) {
533 if (allocated[src->src.ssa->index].type() == RegType::vgpr)
534 type = RegType::vgpr;
535 if (allocated[src->src.ssa->index].type() == RegType::none)
536 done = false;
537 }
538 }
539
540 size *= phi->dest.ssa.bit_size == 64 ? 2 : 1;
541 RegClass rc = RegClass(type, size);
542 if (rc != allocated[phi->dest.ssa.index].regClass()) {
543 done = false;
544 } else {
545 nir_foreach_phi_src(src, phi)
546 assert(allocated[src->src.ssa->index].size() == rc.size());
547 }
548 allocated[phi->dest.ssa.index] = Temp(0, rc);
549 break;
550 }
551 default:
552 break;
553 }
554 }
555 }
556 }
557
558 for (unsigned i = 0; i < impl->ssa_alloc; i++)
559 allocated[i] = Temp(ctx->program->allocateId(), allocated[i].regClass());
560
561 ctx->allocated.reset(allocated.release());
562 }
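/* Note on the loop above: register classes are computed with a simple
 * fixed-point iteration. A phi may reference values whose class has not been
 * decided yet (RegType::none) or may need to be widened to VGPR once a
 * divergent/VGPR source shows up, so `done` is cleared and the whole block
 * walk repeats until no phi changes its register class anymore. Only after
 * that do the temporaries get their final ids allocated.
 */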
563
564 struct user_sgpr_info {
565 uint8_t num_sgpr;
566 uint8_t remaining_sgprs;
567 uint8_t user_sgpr_idx;
568 bool need_ring_offsets;
569 bool indirect_all_descriptor_sets;
570 };
571
572 static void allocate_inline_push_consts(isel_context *ctx,
573 user_sgpr_info& user_sgpr_info)
574 {
575 uint8_t remaining_sgprs = user_sgpr_info.remaining_sgprs;
576
577 /* Only supported if shaders use push constants. */
578 if (ctx->program->info->min_push_constant_used == UINT8_MAX)
579 return;
580
581 /* Only supported if shaders don't have indirect push constants. */
582 if (ctx->program->info->has_indirect_push_constants)
583 return;
584
585 /* Only supported for 32-bit push constants. */
586 //TODO: it's possible that some day, the load/store vectorization could make this inaccurate
587 if (!ctx->program->info->has_only_32bit_push_constants)
588 return;
589
590 uint8_t num_push_consts =
591 (ctx->program->info->max_push_constant_used -
592 ctx->program->info->min_push_constant_used) / 4;
593
594 /* Check if the number of user SGPRs is large enough. */
595 if (num_push_consts < remaining_sgprs) {
596 ctx->program->info->num_inline_push_consts = num_push_consts;
597 } else {
598 ctx->program->info->num_inline_push_consts = remaining_sgprs;
599 }
600
601 /* Clamp to the maximum number of allowed inlined push constants. */
602 if (ctx->program->info->num_inline_push_consts > MAX_INLINE_PUSH_CONSTS)
603 ctx->program->info->num_inline_push_consts = MAX_INLINE_PUSH_CONSTS;
604
605 if (ctx->program->info->num_inline_push_consts == num_push_consts &&
606 !ctx->program->info->loads_dynamic_offsets) {
607 /* Disable the default push constants path if all constants are
608 * inlined and if shaders don't use dynamic descriptors.
609 */
610 ctx->program->info->loads_push_constants = false;
611 user_sgpr_info.num_sgpr--;
612 user_sgpr_info.remaining_sgprs++;
613 }
614
615 ctx->program->info->base_inline_push_consts =
616 ctx->program->info->min_push_constant_used / 4;
617
618 user_sgpr_info.num_sgpr += ctx->program->info->num_inline_push_consts;
619 user_sgpr_info.remaining_sgprs -= ctx->program->info->num_inline_push_consts;
620 }
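/* Rough arithmetic sketch (assuming min/max_push_constant_used are byte
 * offsets bounding the accessed range): a shader that only reads push
 * constants in bytes [8, 24) gets
 *
 *   num_push_consts         = (24 - 8) / 4 = 4   32-bit inline constants
 *   base_inline_push_consts = 8 / 4        = 2   first inlined dword index
 *
 * clamped to MAX_INLINE_PUSH_CONSTS and to the user SGPRs still available.
 * If everything fits inline and no dynamic offsets are loaded, the regular
 * push-constant pointer SGPR is dropped entirely.
 */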
621
622 static void allocate_user_sgprs(isel_context *ctx,
623 bool needs_view_index, user_sgpr_info& user_sgpr_info)
624 {
625 memset(&user_sgpr_info, 0, sizeof(struct user_sgpr_info));
626 uint32_t user_sgpr_count = 0;
627
 628    /* until we sort out scratch/global buffers, always assign ring offsets for gs/vs/es */
629 if (ctx->stage != fragment_fs &&
630 ctx->stage != compute_cs
631 /*|| ctx->is_gs_copy_shader */)
632 user_sgpr_info.need_ring_offsets = true;
633
634 if (ctx->stage == fragment_fs &&
635 ctx->program->info->ps.needs_sample_positions)
636 user_sgpr_info.need_ring_offsets = true;
637
638 /* 2 user sgprs will nearly always be allocated for scratch/rings */
639 user_sgpr_count += 2;
640
641 switch (ctx->stage) {
642 case vertex_vs:
643 /* if (!ctx->is_gs_copy_shader) */ {
644 if (ctx->program->info->vs.has_vertex_buffers)
645 user_sgpr_count++;
646 user_sgpr_count += ctx->program->info->vs.needs_draw_id ? 3 : 2;
647 }
648 break;
649 case fragment_fs:
650 //user_sgpr_count += ctx->program->info->ps.needs_sample_positions;
651 break;
652 case compute_cs:
653 if (ctx->program->info->cs.uses_grid_size)
654 user_sgpr_count += 3;
655 break;
656 default:
657 unreachable("Shader stage not implemented");
658 }
659
660 if (needs_view_index)
661 user_sgpr_count++;
662
663 if (ctx->program->info->loads_push_constants)
664 user_sgpr_count += 1; /* we use 32bit pointers */
665
666 if (ctx->program->info->so.num_outputs)
667 user_sgpr_count += 1; /* we use 32bit pointers */
668
669 uint32_t available_sgprs = ctx->options->chip_class >= GFX9 && !(ctx->stage & hw_cs) ? 32 : 16;
670 uint32_t remaining_sgprs = available_sgprs - user_sgpr_count;
671 uint32_t num_desc_set = util_bitcount(ctx->program->info->desc_set_used_mask);
672
673 if (available_sgprs < user_sgpr_count + num_desc_set) {
674 user_sgpr_info.indirect_all_descriptor_sets = true;
675 user_sgpr_info.num_sgpr = user_sgpr_count + 1;
676 user_sgpr_info.remaining_sgprs = remaining_sgprs - 1;
677 } else {
678 user_sgpr_info.num_sgpr = user_sgpr_count + num_desc_set;
679 user_sgpr_info.remaining_sgprs = remaining_sgprs - num_desc_set;
680 }
681
682 allocate_inline_push_consts(ctx, user_sgpr_info);
683 }
684
685 #define MAX_ARGS 64
686 struct arg_info {
687 RegClass types[MAX_ARGS];
688 Temp *assign[MAX_ARGS];
689 PhysReg reg[MAX_ARGS];
690 unsigned array_params_mask;
691 uint8_t count;
692 uint8_t sgpr_count;
693 uint8_t num_sgprs_used;
694 uint8_t num_vgprs_used;
695 };
696
697 static void
698 add_arg(arg_info *info, RegClass rc, Temp *param_ptr, unsigned reg)
699 {
700 assert(info->count < MAX_ARGS);
701
702 info->assign[info->count] = param_ptr;
703 info->types[info->count] = rc;
704
705 if (rc.type() == RegType::sgpr) {
706 info->num_sgprs_used += rc.size();
707 info->sgpr_count++;
708 info->reg[info->count] = PhysReg{reg};
709 } else {
710 assert(rc.type() == RegType::vgpr);
711 info->num_vgprs_used += rc.size();
712 info->reg[info->count] = PhysReg{reg + 256};
713 }
714 info->count++;
715 }
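/* Illustrative use, mirroring the calls in add_startpgm() below: SGPR
 * arguments are given their plain register index, while VGPR arguments are
 * encoded with the usual +256 PhysReg offset, e.g.
 *
 *   add_arg(&args, s2, &ctx->program->private_segment_buffer, 0); // s[0:1]
 *   add_arg(&args, v1, &ctx->vertex_id, 0);                       // v0
 *
 * num_sgprs_used/num_vgprs_used accumulate the register counts so that
 * add_startpgm() can report the final input SGPR/VGPR totals.
 */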
716
717 static void
718 set_loc(struct radv_userdata_info *ud_info, uint8_t *sgpr_idx, uint8_t num_sgprs)
719 {
720 ud_info->sgpr_idx = *sgpr_idx;
721 ud_info->num_sgprs = num_sgprs;
722 *sgpr_idx += num_sgprs;
723 }
724
725 static void
726 set_loc_shader(isel_context *ctx, int idx, uint8_t *sgpr_idx,
727 uint8_t num_sgprs)
728 {
729 struct radv_userdata_info *ud_info = &ctx->program->info->user_sgprs_locs.shader_data[idx];
730 assert(ud_info);
731
732 set_loc(ud_info, sgpr_idx, num_sgprs);
733 }
734
735 static void
736 set_loc_shader_ptr(isel_context *ctx, int idx, uint8_t *sgpr_idx)
737 {
738 bool use_32bit_pointers = idx != AC_UD_SCRATCH_RING_OFFSETS;
739
740 set_loc_shader(ctx, idx, sgpr_idx, use_32bit_pointers ? 1 : 2);
741 }
742
743 static void
744 set_loc_desc(isel_context *ctx, int idx, uint8_t *sgpr_idx)
745 {
746 struct radv_userdata_locations *locs = &ctx->program->info->user_sgprs_locs;
747 struct radv_userdata_info *ud_info = &locs->descriptor_sets[idx];
748 assert(ud_info);
749
750 set_loc(ud_info, sgpr_idx, 1);
751 locs->descriptor_sets_enabled |= 1 << idx;
752 }
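/* The set_loc* helpers only record in radv_userdata_info which user SGPR
 * index a piece of shader data or a descriptor set occupies (and how many
 * SGPRs it spans) so the driver knows where to upload each value; the values
 * themselves are declared as arguments via add_arg(). 32-bit pointers take a
 * single SGPR, except the scratch ring offsets, which remain a full 64-bit
 * (two SGPR) pointer.
 */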
753
754 static void
755 declare_global_input_sgprs(isel_context *ctx,
756 /* bool has_previous_stage, gl_shader_stage previous_stage, */
757 user_sgpr_info *user_sgpr_info,
758 struct arg_info *args,
759 Temp *desc_sets)
760 {
761 /* 1 for each descriptor set */
762 if (!user_sgpr_info->indirect_all_descriptor_sets) {
763 uint32_t mask = ctx->program->info->desc_set_used_mask;
764 while (mask) {
765 int i = u_bit_scan(&mask);
766 add_arg(args, s1, &desc_sets[i], user_sgpr_info->user_sgpr_idx);
767 set_loc_desc(ctx, i, &user_sgpr_info->user_sgpr_idx);
768 }
769 /* NIR->LLVM might have set this to true if RADV_DEBUG=compiletime */
770 ctx->program->info->need_indirect_descriptor_sets = false;
771 } else {
772 add_arg(args, s1, desc_sets, user_sgpr_info->user_sgpr_idx);
773 set_loc_shader_ptr(ctx, AC_UD_INDIRECT_DESCRIPTOR_SETS, &user_sgpr_info->user_sgpr_idx);
774 ctx->program->info->need_indirect_descriptor_sets = true;
775 }
776
777 if (ctx->program->info->loads_push_constants) {
778 /* 1 for push constants and dynamic descriptors */
779 add_arg(args, s1, &ctx->push_constants, user_sgpr_info->user_sgpr_idx);
780 set_loc_shader_ptr(ctx, AC_UD_PUSH_CONSTANTS, &user_sgpr_info->user_sgpr_idx);
781 }
782
783 if (ctx->program->info->num_inline_push_consts) {
784 unsigned count = ctx->program->info->num_inline_push_consts;
785 for (unsigned i = 0; i < count; i++)
786 add_arg(args, s1, &ctx->inline_push_consts[i], user_sgpr_info->user_sgpr_idx + i);
787 set_loc_shader(ctx, AC_UD_INLINE_PUSH_CONSTANTS, &user_sgpr_info->user_sgpr_idx, count);
788
789 ctx->num_inline_push_consts = ctx->program->info->num_inline_push_consts;
790 ctx->base_inline_push_consts = ctx->program->info->base_inline_push_consts;
791 }
792
793 if (ctx->program->info->so.num_outputs) {
794 add_arg(args, s1, &ctx->streamout_buffers, user_sgpr_info->user_sgpr_idx);
795 set_loc_shader_ptr(ctx, AC_UD_STREAMOUT_BUFFERS, &user_sgpr_info->user_sgpr_idx);
796 }
797 }
798
799 static void
800 declare_vs_input_vgprs(isel_context *ctx, struct arg_info *args)
801 {
802 unsigned vgpr_idx = 0;
803 add_arg(args, v1, &ctx->vertex_id, vgpr_idx++);
804 if (ctx->options->chip_class >= GFX10) {
805 add_arg(args, v1, NULL, vgpr_idx++); /* unused */
806 add_arg(args, v1, &ctx->vs_prim_id, vgpr_idx++);
807 add_arg(args, v1, &ctx->instance_id, vgpr_idx++);
808 } else {
809 if (ctx->options->key.vs.out.as_ls) {
810 add_arg(args, v1, &ctx->rel_auto_id, vgpr_idx++);
811 add_arg(args, v1, &ctx->instance_id, vgpr_idx++);
812 } else {
813 add_arg(args, v1, &ctx->instance_id, vgpr_idx++);
814 add_arg(args, v1, &ctx->vs_prim_id, vgpr_idx++);
815 }
816 add_arg(args, v1, NULL, vgpr_idx); /* unused */
817 }
818 }
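/* The VS input VGPR order differs per generation: on GFX10 the code declares
 * vertex id, an unused slot, primitive id and instance id in v0-v3, while on
 * older chips the second and third slots depend on whether the shader runs as
 * LS (rel_auto_id before instance id) or as a plain VS (instance id before
 * primitive id), with the unused slot last.
 */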
819
820 static void
821 declare_streamout_sgprs(isel_context *ctx, struct arg_info *args, unsigned *idx)
822 {
823 /* Streamout SGPRs. */
824 if (ctx->program->info->so.num_outputs) {
825 assert(ctx->stage & hw_vs);
826
827 if (ctx->stage != tess_eval_vs) {
828 add_arg(args, s1, &ctx->streamout_config, (*idx)++);
829 } else {
830 args->assign[args->count - 1] = &ctx->streamout_config;
831 args->types[args->count - 1] = s1;
832 }
833
834 add_arg(args, s1, &ctx->streamout_write_idx, (*idx)++);
835 }
836
837 /* A streamout buffer offset is loaded if the stride is non-zero. */
838 for (unsigned i = 0; i < 4; i++) {
839 if (!ctx->program->info->so.strides[i])
840 continue;
841
842 add_arg(args, s1, &ctx->streamout_offset[i], (*idx)++);
843 }
844 }
845
846 static bool needs_view_index_sgpr(isel_context *ctx)
847 {
848 switch (ctx->stage) {
849 case vertex_vs:
850 return ctx->program->info->needs_multiview_view_index || ctx->options->key.has_multiview_view_index;
851 case tess_eval_vs:
852 return ctx->program->info->needs_multiview_view_index && ctx->options->key.has_multiview_view_index;
853 case vertex_ls:
854 case vertex_es:
855 case vertex_tess_control_hs:
856 case vertex_geometry_gs:
857 case tess_control_hs:
858 case tess_eval_es:
859 case tess_eval_geometry_gs:
860 case geometry_gs:
861 return ctx->program->info->needs_multiview_view_index;
862 default:
863 return false;
864 }
865 }
866
867 static inline bool
868 add_fs_arg(isel_context *ctx, arg_info *args, unsigned &vgpr_idx, fs_input input, unsigned value, bool enable_next = false, RegClass rc = v1)
869 {
870 if (!ctx->fs_vgpr_args[input])
871 return false;
872
873 add_arg(args, rc, &ctx->fs_inputs[input], vgpr_idx);
874 vgpr_idx += rc.size();
875
876 if (enable_next) {
877 add_arg(args, rc, &ctx->fs_inputs[input + 1], vgpr_idx);
878 vgpr_idx += rc.size();
879 }
880
881 ctx->program->config->spi_ps_input_addr |= value;
882 ctx->program->config->spi_ps_input_ena |= value;
883 return true;
884 }
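/* Sketch of how the SPI bits accumulate (values taken from the calls in
 * add_startpgm() below): a fragment shader that only reads an input
 * interpolated at the pixel center plus gl_FragCoord.w would end up with
 *
 *   spi_ps_input_ena = S_0286CC_PERSP_CENTER_ENA(1) |
 *                      S_0286CC_POS_W_FLOAT_ENA(1);
 *
 * i.e. two VGPRs for the i/j barycentrics followed by one VGPR for pos.w, in
 * the same order in which add_fs_arg() is called.
 */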
885
886 Pseudo_instruction *add_startpgm(struct isel_context *ctx)
887 {
888 user_sgpr_info user_sgpr_info;
889 bool needs_view_index = needs_view_index_sgpr(ctx);
890 allocate_user_sgprs(ctx, needs_view_index, user_sgpr_info);
891 arg_info args = {};
892
893 /* this needs to be in sgprs 0 and 1 */
894 add_arg(&args, s2, &ctx->program->private_segment_buffer, 0);
895 set_loc_shader_ptr(ctx, AC_UD_SCRATCH_RING_OFFSETS, &user_sgpr_info.user_sgpr_idx);
896
897 unsigned vgpr_idx = 0;
898 switch (ctx->stage) {
899 case vertex_vs: {
900 declare_global_input_sgprs(ctx, &user_sgpr_info, &args, ctx->descriptor_sets);
901 if (ctx->program->info->vs.has_vertex_buffers) {
902 add_arg(&args, s1, &ctx->vertex_buffers, user_sgpr_info.user_sgpr_idx);
903 set_loc_shader_ptr(ctx, AC_UD_VS_VERTEX_BUFFERS, &user_sgpr_info.user_sgpr_idx);
904 }
905 add_arg(&args, s1, &ctx->base_vertex, user_sgpr_info.user_sgpr_idx);
906 add_arg(&args, s1, &ctx->start_instance, user_sgpr_info.user_sgpr_idx + 1);
907 if (ctx->program->info->vs.needs_draw_id) {
908 add_arg(&args, s1, &ctx->draw_id, user_sgpr_info.user_sgpr_idx + 2);
909 set_loc_shader(ctx, AC_UD_VS_BASE_VERTEX_START_INSTANCE, &user_sgpr_info.user_sgpr_idx, 3);
910 } else
911 set_loc_shader(ctx, AC_UD_VS_BASE_VERTEX_START_INSTANCE, &user_sgpr_info.user_sgpr_idx, 2);
912
913 if (needs_view_index) {
914 add_arg(&args, s1, &ctx->view_index, user_sgpr_info.user_sgpr_idx);
915 set_loc_shader(ctx, AC_UD_VIEW_INDEX, &user_sgpr_info.user_sgpr_idx, 1);
916 }
917
918 assert(user_sgpr_info.user_sgpr_idx == user_sgpr_info.num_sgpr);
919 unsigned idx = user_sgpr_info.user_sgpr_idx;
920 if (ctx->options->key.vs.out.as_es)
921 add_arg(&args, s1, &ctx->es2gs_offset, idx++);
922 else
923 declare_streamout_sgprs(ctx, &args, &idx);
924
925 add_arg(&args, s1, &ctx->program->scratch_offset, idx++);
926
927 declare_vs_input_vgprs(ctx, &args);
928 break;
929 }
930 case fragment_fs: {
931 declare_global_input_sgprs(ctx, &user_sgpr_info, &args, ctx->descriptor_sets);
932
933 assert(user_sgpr_info.user_sgpr_idx == user_sgpr_info.num_sgpr);
934 add_arg(&args, s1, &ctx->prim_mask, user_sgpr_info.user_sgpr_idx);
935
936 add_arg(&args, s1, &ctx->program->scratch_offset, user_sgpr_info.user_sgpr_idx + 1);
937
938 ctx->program->config->spi_ps_input_addr = 0;
939 ctx->program->config->spi_ps_input_ena = 0;
940
941 bool has_interp_mode = false;
942
943 has_interp_mode |= add_fs_arg(ctx, &args, vgpr_idx, fs_input::persp_sample_p1, S_0286CC_PERSP_SAMPLE_ENA(1), true);
944 has_interp_mode |= add_fs_arg(ctx, &args, vgpr_idx, fs_input::persp_center_p1, S_0286CC_PERSP_CENTER_ENA(1), true);
945 has_interp_mode |= add_fs_arg(ctx, &args, vgpr_idx, fs_input::persp_centroid_p1, S_0286CC_PERSP_CENTROID_ENA(1), true);
946 has_interp_mode |= add_fs_arg(ctx, &args, vgpr_idx, fs_input::persp_pull_model, S_0286CC_PERSP_PULL_MODEL_ENA(1), false, v3);
947
948 if (!has_interp_mode && ctx->fs_vgpr_args[fs_input::frag_pos_3]) {
949 /* If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be enabled too */
950 ctx->fs_vgpr_args[fs_input::persp_center_p1] = true;
951 has_interp_mode = add_fs_arg(ctx, &args, vgpr_idx, fs_input::persp_center_p1, S_0286CC_PERSP_CENTER_ENA(1), true);
952 }
953
954 has_interp_mode |= add_fs_arg(ctx, &args, vgpr_idx, fs_input::linear_sample_p1, S_0286CC_LINEAR_SAMPLE_ENA(1), true);
955 has_interp_mode |= add_fs_arg(ctx, &args, vgpr_idx, fs_input::linear_center_p1, S_0286CC_LINEAR_CENTER_ENA(1), true);
956 has_interp_mode |= add_fs_arg(ctx, &args, vgpr_idx, fs_input::linear_centroid_p1, S_0286CC_LINEAR_CENTROID_ENA(1), true);
957 has_interp_mode |= add_fs_arg(ctx, &args, vgpr_idx, fs_input::line_stipple, S_0286CC_LINE_STIPPLE_TEX_ENA(1));
958
959 if (!has_interp_mode) {
960 /* At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled */
961 ctx->fs_vgpr_args[fs_input::persp_center_p1] = true;
962 has_interp_mode = add_fs_arg(ctx, &args, vgpr_idx, fs_input::persp_center_p1, S_0286CC_PERSP_CENTER_ENA(1), true);
963 }
964
965 add_fs_arg(ctx, &args, vgpr_idx, fs_input::frag_pos_0, S_0286CC_POS_X_FLOAT_ENA(1));
966 add_fs_arg(ctx, &args, vgpr_idx, fs_input::frag_pos_1, S_0286CC_POS_Y_FLOAT_ENA(1));
967 add_fs_arg(ctx, &args, vgpr_idx, fs_input::frag_pos_2, S_0286CC_POS_Z_FLOAT_ENA(1));
968 add_fs_arg(ctx, &args, vgpr_idx, fs_input::frag_pos_3, S_0286CC_POS_W_FLOAT_ENA(1));
969
970 add_fs_arg(ctx, &args, vgpr_idx, fs_input::front_face, S_0286CC_FRONT_FACE_ENA(1));
971 add_fs_arg(ctx, &args, vgpr_idx, fs_input::ancillary, S_0286CC_ANCILLARY_ENA(1));
972 add_fs_arg(ctx, &args, vgpr_idx, fs_input::sample_coverage, S_0286CC_SAMPLE_COVERAGE_ENA(1));
973 add_fs_arg(ctx, &args, vgpr_idx, fs_input::fixed_pt, S_0286CC_POS_FIXED_PT_ENA(1));
974
975 ASSERTED bool unset_interp_mode = !(ctx->program->config->spi_ps_input_addr & 0x7F) ||
976 (G_0286CC_POS_W_FLOAT_ENA(ctx->program->config->spi_ps_input_addr)
977 && !(ctx->program->config->spi_ps_input_addr & 0xF));
978
979 assert(has_interp_mode);
980 assert(!unset_interp_mode);
981 break;
982 }
983 case compute_cs: {
984 declare_global_input_sgprs(ctx, &user_sgpr_info, &args, ctx->descriptor_sets);
985
986 if (ctx->program->info->cs.uses_grid_size) {
987 add_arg(&args, s3, &ctx->num_workgroups, user_sgpr_info.user_sgpr_idx);
988 set_loc_shader(ctx, AC_UD_CS_GRID_SIZE, &user_sgpr_info.user_sgpr_idx, 3);
989 }
990 assert(user_sgpr_info.user_sgpr_idx == user_sgpr_info.num_sgpr);
991 unsigned idx = user_sgpr_info.user_sgpr_idx;
992 for (unsigned i = 0; i < 3; i++) {
993 if (ctx->program->info->cs.uses_block_id[i])
994 add_arg(&args, s1, &ctx->workgroup_ids[i], idx++);
995 }
996
997 if (ctx->program->info->cs.uses_local_invocation_idx)
998 add_arg(&args, s1, &ctx->tg_size, idx++);
999 add_arg(&args, s1, &ctx->program->scratch_offset, idx++);
1000
1001 add_arg(&args, v3, &ctx->local_invocation_ids, vgpr_idx++);
1002 break;
1003 }
1004 default:
1005 unreachable("Shader stage not implemented");
1006 }
1007
1008 ctx->program->info->num_input_vgprs = 0;
1009 ctx->program->info->num_input_sgprs = args.num_sgprs_used;
1010 ctx->program->info->num_user_sgprs = user_sgpr_info.num_sgpr;
1011 ctx->program->info->num_input_vgprs = args.num_vgprs_used;
1012
1013 if (ctx->stage == fragment_fs) {
1014 /* Verify that we have a correct assumption about input VGPR count */
1015 ASSERTED unsigned input_vgpr_cnt = ac_get_fs_input_vgpr_cnt(ctx->program->config, nullptr, nullptr);
1016 assert(input_vgpr_cnt == ctx->program->info->num_input_vgprs);
1017 }
1018
1019 aco_ptr<Pseudo_instruction> startpgm{create_instruction<Pseudo_instruction>(aco_opcode::p_startpgm, Format::PSEUDO, 0, args.count + 1)};
1020 for (unsigned i = 0; i < args.count; i++) {
1021 if (args.assign[i]) {
1022 *args.assign[i] = Temp{ctx->program->allocateId(), args.types[i]};
1023 startpgm->definitions[i] = Definition(*args.assign[i]);
1024 startpgm->definitions[i].setFixed(args.reg[i]);
1025 }
1026 }
1027 startpgm->definitions[args.count] = Definition{ctx->program->allocateId(), exec, s2};
1028 Pseudo_instruction *instr = startpgm.get();
1029 ctx->block->instructions.push_back(std::move(startpgm));
1030
1031 return instr;
1032 }
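/* add_startpgm() turns the collected arguments into definitions of a single
 * p_startpgm pseudo instruction, each fixed to the physical register chosen
 * above, plus one final s2 definition fixed to exec. Later passes then treat
 * these fixed definitions like ordinary temporaries.
 */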
1033
1034 int
1035 type_size(const struct glsl_type *type, bool bindless)
1036 {
1037 // TODO: don't we need type->std430_base_alignment() here?
1038 return glsl_count_attribute_slots(type, false);
1039 }
1040
1041 void
1042 shared_var_info(const struct glsl_type *type, unsigned *size, unsigned *align)
1043 {
1044 assert(glsl_type_is_vector_or_scalar(type));
1045
1046 uint32_t comp_size = glsl_type_is_boolean(type)
1047 ? 4 : glsl_get_bit_size(type) / 8;
1048 unsigned length = glsl_get_vector_elements(type);
 1049    *size = comp_size * length;
1050 *align = comp_size;
1051 }
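/* For example, a vec3 of 32-bit floats reports size 12 and alignment 4, and a
 * bvec2 reports size 8 and alignment 4, since booleans are stored as 32-bit
 * values in shared memory here.
 */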
1052
1053 int
1054 get_align(nir_variable_mode mode, bool is_store, unsigned bit_size, unsigned num_components)
1055 {
1056 /* TODO: ACO doesn't have good support for non-32-bit reads/writes yet */
1057 if (bit_size != 32)
1058 return -1;
1059
1060 switch (mode) {
1061 case nir_var_mem_ubo:
1062 case nir_var_mem_ssbo:
1063 //case nir_var_mem_push_const: enable with 1240!
1064 case nir_var_mem_shared:
1065 /* TODO: what are the alignment requirements for LDS? */
1066 return num_components <= 4 ? 4 : -1;
1067 default:
1068 return -1;
1069 }
1070 }
1071
1072 void
1073 setup_vs_variables(isel_context *ctx, nir_shader *nir)
1074 {
1075 nir_foreach_variable(variable, &nir->inputs)
1076 {
1077 variable->data.driver_location = variable->data.location * 4;
1078 }
1079 nir_foreach_variable(variable, &nir->outputs)
1080 {
1081 variable->data.driver_location = variable->data.location * 4;
1082 }
1083
1084 radv_vs_output_info *outinfo = &ctx->program->info->vs.outinfo;
1085
1086 memset(outinfo->vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
1087 sizeof(outinfo->vs_output_param_offset));
1088
1089 ctx->needs_instance_id = ctx->program->info->vs.needs_instance_id;
1090
1091 bool export_clip_dists = ctx->options->key.vs_common_out.export_clip_dists;
1092
1093 outinfo->param_exports = 0;
1094 int pos_written = 0x1;
1095 if (outinfo->writes_pointsize || outinfo->writes_viewport_index || outinfo->writes_layer)
1096 pos_written |= 1 << 1;
1097
1098 nir_foreach_variable(variable, &nir->outputs)
1099 {
1100 int idx = variable->data.location;
1101 unsigned slots = variable->type->count_attribute_slots(false);
1102 if (variable->data.compact) {
1103 unsigned component_count = variable->data.location_frac + variable->type->length;
1104 slots = (component_count + 3) / 4;
1105 }
1106
1107 if (idx >= VARYING_SLOT_VAR0 || idx == VARYING_SLOT_LAYER || idx == VARYING_SLOT_PRIMITIVE_ID ||
1108 ((idx == VARYING_SLOT_CLIP_DIST0 || idx == VARYING_SLOT_CLIP_DIST1) && export_clip_dists)) {
1109 for (unsigned i = 0; i < slots; i++) {
1110 if (outinfo->vs_output_param_offset[idx + i] == AC_EXP_PARAM_UNDEFINED)
1111 outinfo->vs_output_param_offset[idx + i] = outinfo->param_exports++;
1112 }
1113 }
1114 }
1115 if (outinfo->writes_layer &&
1116 outinfo->vs_output_param_offset[VARYING_SLOT_LAYER] == AC_EXP_PARAM_UNDEFINED) {
1117 /* when ctx->options->key.has_multiview_view_index = true, the layer
1118 * variable isn't declared in NIR and it's isel's job to get the layer */
1119 outinfo->vs_output_param_offset[VARYING_SLOT_LAYER] = outinfo->param_exports++;
1120 }
1121
1122 if (outinfo->export_prim_id) {
1123 assert(outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] == AC_EXP_PARAM_UNDEFINED);
1124 outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] = outinfo->param_exports++;
1125 }
1126
1127 ctx->num_clip_distances = util_bitcount(outinfo->clip_dist_mask);
1128 ctx->num_cull_distances = util_bitcount(outinfo->cull_dist_mask);
1129
1130 assert(ctx->num_clip_distances + ctx->num_cull_distances <= 8);
1131
1132 if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
1133 pos_written |= 1 << 2;
1134 if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
1135 pos_written |= 1 << 3;
1136
1137 outinfo->pos_exports = util_bitcount(pos_written);
1138 }
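/* Worked example for the position export mask: a VS that writes gl_PointSize
 * and four clip distances (no cull distances) sets
 *
 *   pos_written = 0x1 | (1 << 1) | (1 << 2) = 0x7
 *
 * i.e. the position itself, the psize/layer/viewport word and the first
 * clip-distance vector, so outinfo->pos_exports == 3.
 */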
1139
1140 void
1141 setup_variables(isel_context *ctx, nir_shader *nir)
1142 {
1143 switch (nir->info.stage) {
1144 case MESA_SHADER_FRAGMENT: {
1145 nir_foreach_variable(variable, &nir->outputs)
1146 {
1147 int idx = variable->data.location + variable->data.index;
1148 variable->data.driver_location = idx * 4;
1149 }
1150 break;
1151 }
1152 case MESA_SHADER_COMPUTE: {
1153 ctx->program->config->lds_size = (nir->info.cs.shared_size + ctx->program->lds_alloc_granule - 1) /
1154 ctx->program->lds_alloc_granule;
1155 break;
1156 }
1157 case MESA_SHADER_VERTEX: {
1158 setup_vs_variables(ctx, nir);
1159 break;
1160 }
1161 default:
1162 unreachable("Unhandled shader stage.");
1163 }
1164 }
1165
1166 isel_context
1167 setup_isel_context(Program* program,
1168 unsigned shader_count,
1169 struct nir_shader *const *shaders,
1170 ac_shader_config* config,
1171 radv_shader_info *info,
1172 const radv_nir_compiler_options *options)
1173 {
1174 program->stage = 0;
1175 for (unsigned i = 0; i < shader_count; i++) {
1176 switch (shaders[i]->info.stage) {
1177 case MESA_SHADER_VERTEX:
1178 program->stage |= sw_vs;
1179 break;
1180 case MESA_SHADER_TESS_CTRL:
1181 program->stage |= sw_tcs;
1182 break;
1183 case MESA_SHADER_TESS_EVAL:
1184 program->stage |= sw_tes;
1185 break;
1186 case MESA_SHADER_GEOMETRY:
1187 program->stage |= sw_gs;
1188 break;
1189 case MESA_SHADER_FRAGMENT:
1190 program->stage |= sw_fs;
1191 break;
1192 case MESA_SHADER_COMPUTE:
1193 program->stage |= sw_cs;
1194 break;
1195 default:
1196 unreachable("Shader stage not implemented");
1197 }
1198 }
1199 if (program->stage == sw_vs)
1200 program->stage |= hw_vs;
1201 else if (program->stage == sw_fs)
1202 program->stage |= hw_fs;
1203 else if (program->stage == sw_cs)
1204 program->stage |= hw_cs;
1205 else
1206 unreachable("Shader stage not implemented");
1207
1208 program->config = config;
1209 program->info = info;
1210 program->chip_class = options->chip_class;
1211 program->family = options->family;
1212 program->wave_size = info->wave_size;
1213
1214 program->lds_alloc_granule = options->chip_class >= GFX7 ? 512 : 256;
1215 program->lds_limit = options->chip_class >= GFX7 ? 65536 : 32768;
1216 program->vgpr_limit = 256;
1217
1218 if (options->chip_class >= GFX10) {
1219 program->physical_sgprs = 2560; /* doesn't matter as long as it's at least 128 * 20 */
1220 program->sgpr_alloc_granule = 127;
1221 program->sgpr_limit = 106;
1222 } else if (program->chip_class >= GFX8) {
1223 program->physical_sgprs = 800;
1224 program->sgpr_alloc_granule = 15;
1225 if (options->family == CHIP_TONGA || options->family == CHIP_ICELAND)
1226 program->sgpr_limit = 94; /* workaround hardware bug */
1227 else
1228 program->sgpr_limit = 102;
1229 } else {
1230 program->physical_sgprs = 512;
1231 program->sgpr_alloc_granule = 7;
1232 program->sgpr_limit = 104;
1233 }
1234 /* TODO: we don't have to allocate VCC if we don't need it */
1235 program->needs_vcc = true;
1236
1237 for (unsigned i = 0; i < MAX_SETS; ++i)
1238 program->info->user_sgprs_locs.descriptor_sets[i].sgpr_idx = -1;
1239 for (unsigned i = 0; i < AC_UD_MAX_UD; ++i)
1240 program->info->user_sgprs_locs.shader_data[i].sgpr_idx = -1;
1241
1242 isel_context ctx = {};
1243 ctx.program = program;
1244 ctx.options = options;
1245 ctx.stage = program->stage;
1246
1247 for (unsigned i = 0; i < fs_input::max_inputs; ++i)
1248 ctx.fs_inputs[i] = Temp(0, v1);
1249 ctx.fs_inputs[fs_input::persp_pull_model] = Temp(0, v3);
1250 for (unsigned i = 0; i < MAX_SETS; ++i)
1251 ctx.descriptor_sets[i] = Temp(0, s1);
1252 for (unsigned i = 0; i < MAX_INLINE_PUSH_CONSTS; ++i)
1253 ctx.inline_push_consts[i] = Temp(0, s1);
1254 for (unsigned i = 0; i <= VARYING_SLOT_VAR31; ++i) {
1255 for (unsigned j = 0; j < 4; ++j)
1256 ctx.vs_output.outputs[i][j] = Temp(0, v1);
1257 }
1258
1259 for (unsigned i = 0; i < shader_count; i++) {
1260 nir_shader *nir = shaders[i];
1261
1262 /* align and copy constant data */
1263 while (program->constant_data.size() % 4u)
1264 program->constant_data.push_back(0);
1265 ctx.constant_data_offset = program->constant_data.size();
1266 program->constant_data.insert(program->constant_data.end(),
1267 (uint8_t*)nir->constant_data,
1268 (uint8_t*)nir->constant_data + nir->constant_data_size);
1269
1270 /* the variable setup has to be done before lower_io / CSE */
1271 if (nir->info.stage == MESA_SHADER_COMPUTE)
1272 nir_lower_vars_to_explicit_types(nir, nir_var_mem_shared, shared_var_info);
1273 setup_variables(&ctx, nir);
1274
1275 /* optimize and lower memory operations */
1276 bool lower_to_scalar = false;
1277 bool lower_pack = false;
1278 // TODO: uncomment this once !1240 is merged
1279 /*if (nir_opt_load_store_vectorize(nir,
1280 (nir_variable_mode)(nir_var_mem_ssbo | nir_var_mem_ubo |
1281 nir_var_mem_push_const | nir_var_mem_shared),
1282 get_align)) {
1283 lower_to_scalar = true;
1284 lower_pack = true;
1285 }*/
1286 if (nir->info.stage == MESA_SHADER_COMPUTE)
1287 lower_to_scalar |= nir_lower_explicit_io(nir, nir_var_mem_shared, nir_address_format_32bit_offset);
1288 else
1289 nir_lower_io(nir, (nir_variable_mode)(nir_var_shader_in | nir_var_shader_out), type_size, (nir_lower_io_options)0);
1290 nir_lower_explicit_io(nir, nir_var_mem_global, nir_address_format_64bit_global);
1291
1292 if (lower_to_scalar)
1293 nir_lower_alu_to_scalar(nir, NULL, NULL);
1294 if (lower_pack)
1295 nir_lower_pack(nir);
1296
1297 /* lower ALU operations */
1298 // TODO: implement logic64 in aco, it's more effective for sgprs
1299 nir_lower_int64(nir, nir->options->lower_int64_options);
1300
1301 nir_opt_idiv_const(nir, 32);
1302 nir_lower_idiv(nir, nir_lower_idiv_precise);
1303
1304 /* optimize the lowered ALU operations */
1305 bool more_algebraic = true;
1306 while (more_algebraic) {
1307 more_algebraic = false;
1308 NIR_PASS_V(nir, nir_copy_prop);
1309 NIR_PASS_V(nir, nir_opt_dce);
1310 NIR_PASS_V(nir, nir_opt_constant_folding);
1311 NIR_PASS(more_algebraic, nir, nir_opt_algebraic);
1312 }
1313
1314 /* Do late algebraic optimization to turn add(a, neg(b)) back into
1315 * subs, then the mandatory cleanup after algebraic. Note that it may
1316 * produce fnegs, and if so then we need to keep running to squash
1317 * fneg(fneg(a)).
1318 */
1319 bool more_late_algebraic = true;
1320 while (more_late_algebraic) {
1321 more_late_algebraic = false;
1322 NIR_PASS(more_late_algebraic, nir, nir_opt_algebraic_late);
1323 NIR_PASS_V(nir, nir_opt_constant_folding);
1324 NIR_PASS_V(nir, nir_copy_prop);
1325 NIR_PASS_V(nir, nir_opt_dce);
1326 NIR_PASS_V(nir, nir_opt_cse);
1327 }
1328
1329 /* cleanup passes */
1330 nir_lower_load_const_to_scalar(nir);
1331 nir_opt_shrink_load(nir);
1332 nir_move_options move_opts = (nir_move_options)(
1333 nir_move_const_undef | nir_move_load_ubo | nir_move_load_input | nir_move_comparisons);
1334 nir_opt_sink(nir, move_opts);
1335 nir_opt_move(nir, move_opts);
1336 nir_convert_to_lcssa(nir, true, false);
1337 nir_lower_phis_to_scalar(nir);
1338
1339 nir_function_impl *func = nir_shader_get_entrypoint(nir);
1340 nir_index_ssa_defs(func);
1341
1342 if (options->dump_preoptir) {
1343 fprintf(stderr, "NIR shader before instruction selection:\n");
1344 nir_print_shader(nir, stderr);
1345 }
1346 }
1347
1348 unsigned scratch_size = 0;
1349 for (unsigned i = 0; i < shader_count; i++)
1350 scratch_size = std::max(scratch_size, shaders[i]->scratch_size);
1351 ctx.program->config->scratch_bytes_per_wave = align(scratch_size * ctx.program->wave_size, 1024);
1352
1353 ctx.block = ctx.program->create_and_insert_block();
1354 ctx.block->loop_nest_depth = 0;
1355 ctx.block->kind = block_kind_top_level;
1356
1357 return ctx;
1358 }
1359
1360 }