2 * Copyright © 2018 Valve Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
26 #include <unordered_map>
29 #include "vulkan/radv_shader.h"
30 #include "vulkan/radv_descriptor_set.h"
32 #include "ac_exp_param.h"
33 #include "ac_shader_util.h"
35 #include "util/u_math.h"
37 #define MAX_INLINE_PUSH_CONSTS 8
67 struct vs_output_state
{
68 uint8_t mask
[VARYING_SLOT_VAR31
+ 1];
69 Temp outputs
[VARYING_SLOT_VAR31
+ 1][4];
73 struct radv_nir_compiler_options
*options
;
76 uint32_t constant_data_offset
;
79 std::unique_ptr
<Temp
[]> allocated
;
80 std::unordered_map
<unsigned, std::array
<Temp
,4>> allocated_vec
;
81 Stage stage
; /* Stage */
82 bool has_gfx10_wave64_bpermute
= false;
85 uint16_t loop_nest_depth
= 0;
89 bool has_divergent_continue
= false;
90 bool has_divergent_branch
= false;
93 bool is_divergent
= false;
95 bool exec_potentially_empty
= false;
99 bool scratch_enabled
= false;
100 Temp private_segment_buffer
= Temp(0, s2
); /* also the part of the scratch descriptor on compute */
101 Temp scratch_offset
= Temp(0, s1
);
103 /* inputs common for merged stages */
104 Temp merged_wave_info
= Temp(0, s1
);
107 bool fs_vgpr_args
[fs_input::max_inputs
];
108 Temp fs_inputs
[fs_input::max_inputs
];
109 Temp prim_mask
= Temp(0, s1
);
110 Temp descriptor_sets
[MAX_SETS
];
111 Temp push_constants
= Temp(0, s1
);
112 Temp inline_push_consts
[MAX_INLINE_PUSH_CONSTS
];
113 unsigned num_inline_push_consts
= 0;
114 unsigned base_inline_push_consts
= 0;
117 Temp vertex_buffers
= Temp(0, s1
);
118 Temp base_vertex
= Temp(0, s1
);
119 Temp start_instance
= Temp(0, s1
);
120 Temp draw_id
= Temp(0, s1
);
121 Temp view_index
= Temp(0, s1
);
122 Temp es2gs_offset
= Temp(0, s1
);
123 Temp vertex_id
= Temp(0, v1
);
124 Temp rel_auto_id
= Temp(0, v1
);
125 Temp instance_id
= Temp(0, v1
);
126 Temp vs_prim_id
= Temp(0, v1
);
127 bool needs_instance_id
;
130 Temp num_workgroups
[3] = {Temp(0, s1
), Temp(0, s1
), Temp(0, s1
)};
131 Temp workgroup_ids
[3] = {Temp(0, s1
), Temp(0, s1
), Temp(0, s1
)};
132 Temp tg_size
= Temp(0, s1
);
133 Temp local_invocation_ids
[3] = {Temp(0, v1
), Temp(0, v1
), Temp(0, v1
)};
135 /* VS output information */
136 unsigned num_clip_distances
;
137 unsigned num_cull_distances
;
138 vs_output_state vs_output
;
141 Temp streamout_buffers
= Temp(0, s1
);
142 Temp streamout_write_idx
= Temp(0, s1
);
143 Temp streamout_config
= Temp(0, s1
);
144 Temp streamout_offset
[4] = {Temp(0, s1
), Temp(0, s1
), Temp(0, s1
), Temp(0, s1
)};
147 fs_input
get_interp_input(nir_intrinsic_op intrin
, enum glsl_interp_mode interp
)
150 case INTERP_MODE_SMOOTH
:
151 case INTERP_MODE_NONE
:
152 if (intrin
== nir_intrinsic_load_barycentric_pixel
||
153 intrin
== nir_intrinsic_load_barycentric_at_sample
||
154 intrin
== nir_intrinsic_load_barycentric_at_offset
)
155 return fs_input::persp_center_p1
;
156 else if (intrin
== nir_intrinsic_load_barycentric_centroid
)
157 return fs_input::persp_centroid_p1
;
158 else if (intrin
== nir_intrinsic_load_barycentric_sample
)
159 return fs_input::persp_sample_p1
;
161 case INTERP_MODE_NOPERSPECTIVE
:
162 if (intrin
== nir_intrinsic_load_barycentric_pixel
)
163 return fs_input::linear_center_p1
;
164 else if (intrin
== nir_intrinsic_load_barycentric_centroid
)
165 return fs_input::linear_centroid_p1
;
166 else if (intrin
== nir_intrinsic_load_barycentric_sample
)
167 return fs_input::linear_sample_p1
;
172 return fs_input::max_inputs
;
175 void init_context(isel_context
*ctx
, nir_shader
*shader
)
177 nir_function_impl
*impl
= nir_shader_get_entrypoint(shader
);
179 ctx
->shader
= shader
;
180 ctx
->divergent_vals
= nir_divergence_analysis(shader
, nir_divergence_view_index_uniform
);
182 std::unique_ptr
<Temp
[]> allocated
{new Temp
[impl
->ssa_alloc
]()};
183 memset(&ctx
->fs_vgpr_args
, false, sizeof(ctx
->fs_vgpr_args
));
188 nir_foreach_block(block
, impl
) {
189 nir_foreach_instr(instr
, block
) {
190 switch(instr
->type
) {
191 case nir_instr_type_alu
: {
192 nir_alu_instr
*alu_instr
= nir_instr_as_alu(instr
);
193 unsigned size
= alu_instr
->dest
.dest
.ssa
.num_components
;
194 if (alu_instr
->dest
.dest
.ssa
.bit_size
== 64)
196 RegType type
= RegType::sgpr
;
197 switch(alu_instr
->op
) {
219 case nir_op_fround_even
:
228 case nir_op_pack_half_2x16
:
229 case nir_op_unpack_half_2x16_split_x
:
230 case nir_op_unpack_half_2x16_split_y
:
233 case nir_op_fddx_fine
:
234 case nir_op_fddy_fine
:
235 case nir_op_fddx_coarse
:
236 case nir_op_fddy_coarse
:
237 case nir_op_fquantize2f16
:
239 case nir_op_frexp_sig
:
240 case nir_op_frexp_exp
:
241 case nir_op_cube_face_index
:
242 case nir_op_cube_face_coord
:
243 type
= RegType::vgpr
;
255 size
= alu_instr
->src
[0].src
.ssa
->bit_size
== 64 ? 2 : 1;
260 if (ctx
->divergent_vals
[alu_instr
->dest
.dest
.ssa
.index
]) {
263 for (unsigned i
= 0; i
< nir_op_infos
[alu_instr
->op
].num_inputs
; i
++) {
264 if (allocated
[alu_instr
->src
[i
].src
.ssa
->index
].type() == RegType::vgpr
)
275 type
= ctx
->divergent_vals
[alu_instr
->dest
.dest
.ssa
.index
] ? RegType::vgpr
: RegType::sgpr
;
278 if (alu_instr
->dest
.dest
.ssa
.bit_size
== 1) {
279 if (ctx
->divergent_vals
[alu_instr
->dest
.dest
.ssa
.index
])
281 else if (allocated
[alu_instr
->src
[1].src
.ssa
->index
].regClass() == s2
&&
282 allocated
[alu_instr
->src
[2].src
.ssa
->index
].regClass() == s2
)
287 if (ctx
->divergent_vals
[alu_instr
->dest
.dest
.ssa
.index
]) {
288 type
= RegType::vgpr
;
290 if (allocated
[alu_instr
->src
[1].src
.ssa
->index
].type() == RegType::vgpr
||
291 allocated
[alu_instr
->src
[2].src
.ssa
->index
].type() == RegType::vgpr
) {
292 type
= RegType::vgpr
;
295 if (alu_instr
->src
[1].src
.ssa
->num_components
== 1 && alu_instr
->src
[2].src
.ssa
->num_components
== 1) {
296 assert(allocated
[alu_instr
->src
[1].src
.ssa
->index
].size() == allocated
[alu_instr
->src
[2].src
.ssa
->index
].size());
297 size
= allocated
[alu_instr
->src
[1].src
.ssa
->index
].size();
302 if (alu_instr
->dest
.dest
.ssa
.bit_size
== 1) {
303 size
= allocated
[alu_instr
->src
[0].src
.ssa
->index
].size();
305 type
= ctx
->divergent_vals
[alu_instr
->dest
.dest
.ssa
.index
] ? RegType::vgpr
: RegType::sgpr
;
310 if (alu_instr
->dest
.dest
.ssa
.bit_size
== 1) {
311 size
= ctx
->divergent_vals
[alu_instr
->dest
.dest
.ssa
.index
] ? 2 : 1;
317 if (alu_instr
->dest
.dest
.ssa
.bit_size
== 1) {
318 if (ctx
->divergent_vals
[alu_instr
->dest
.dest
.ssa
.index
]) {
322 for (unsigned i
= 0; i
< nir_op_infos
[alu_instr
->op
].num_inputs
; i
++) {
323 if (allocated
[alu_instr
->src
[i
].src
.ssa
->index
].regClass() == s1
) {
330 for (unsigned i
= 0; i
< nir_op_infos
[alu_instr
->op
].num_inputs
; i
++) {
331 if (allocated
[alu_instr
->src
[i
].src
.ssa
->index
].type() == RegType::vgpr
)
332 type
= RegType::vgpr
;
337 allocated
[alu_instr
->dest
.dest
.ssa
.index
] = Temp(0, RegClass(type
, size
));
340 case nir_instr_type_load_const
: {
341 unsigned size
= nir_instr_as_load_const(instr
)->def
.num_components
;
342 if (nir_instr_as_load_const(instr
)->def
.bit_size
== 64)
344 allocated
[nir_instr_as_load_const(instr
)->def
.index
] = Temp(0, RegClass(RegType::sgpr
, size
));
347 case nir_instr_type_intrinsic
: {
348 nir_intrinsic_instr
*intrinsic
= nir_instr_as_intrinsic(instr
);
349 if (!nir_intrinsic_infos
[intrinsic
->intrinsic
].has_dest
)
351 unsigned size
= intrinsic
->dest
.ssa
.num_components
;
352 if (intrinsic
->dest
.ssa
.bit_size
== 64)
354 RegType type
= RegType::sgpr
;
355 switch(intrinsic
->intrinsic
) {
356 case nir_intrinsic_load_push_constant
:
357 case nir_intrinsic_load_work_group_id
:
358 case nir_intrinsic_load_num_work_groups
:
359 case nir_intrinsic_load_subgroup_id
:
360 case nir_intrinsic_load_num_subgroups
:
361 case nir_intrinsic_load_first_vertex
:
362 case nir_intrinsic_load_base_instance
:
363 case nir_intrinsic_get_buffer_size
:
364 case nir_intrinsic_vote_all
:
365 case nir_intrinsic_vote_any
:
366 case nir_intrinsic_read_first_invocation
:
367 case nir_intrinsic_read_invocation
:
368 case nir_intrinsic_first_invocation
:
369 type
= RegType::sgpr
;
371 case nir_intrinsic_ballot
:
372 type
= RegType::sgpr
;
375 case nir_intrinsic_load_sample_id
:
376 case nir_intrinsic_load_sample_mask_in
:
377 case nir_intrinsic_load_input
:
378 case nir_intrinsic_load_vertex_id
:
379 case nir_intrinsic_load_vertex_id_zero_base
:
380 case nir_intrinsic_load_barycentric_sample
:
381 case nir_intrinsic_load_barycentric_pixel
:
382 case nir_intrinsic_load_barycentric_centroid
:
383 case nir_intrinsic_load_barycentric_at_sample
:
384 case nir_intrinsic_load_barycentric_at_offset
:
385 case nir_intrinsic_load_interpolated_input
:
386 case nir_intrinsic_load_frag_coord
:
387 case nir_intrinsic_load_sample_pos
:
388 case nir_intrinsic_load_layer_id
:
389 case nir_intrinsic_load_local_invocation_id
:
390 case nir_intrinsic_load_local_invocation_index
:
391 case nir_intrinsic_load_subgroup_invocation
:
392 case nir_intrinsic_write_invocation_amd
:
393 case nir_intrinsic_mbcnt_amd
:
394 case nir_intrinsic_load_instance_id
:
395 case nir_intrinsic_ssbo_atomic_add
:
396 case nir_intrinsic_ssbo_atomic_imin
:
397 case nir_intrinsic_ssbo_atomic_umin
:
398 case nir_intrinsic_ssbo_atomic_imax
:
399 case nir_intrinsic_ssbo_atomic_umax
:
400 case nir_intrinsic_ssbo_atomic_and
:
401 case nir_intrinsic_ssbo_atomic_or
:
402 case nir_intrinsic_ssbo_atomic_xor
:
403 case nir_intrinsic_ssbo_atomic_exchange
:
404 case nir_intrinsic_ssbo_atomic_comp_swap
:
405 case nir_intrinsic_image_deref_atomic_add
:
406 case nir_intrinsic_image_deref_atomic_umin
:
407 case nir_intrinsic_image_deref_atomic_imin
:
408 case nir_intrinsic_image_deref_atomic_umax
:
409 case nir_intrinsic_image_deref_atomic_imax
:
410 case nir_intrinsic_image_deref_atomic_and
:
411 case nir_intrinsic_image_deref_atomic_or
:
412 case nir_intrinsic_image_deref_atomic_xor
:
413 case nir_intrinsic_image_deref_atomic_exchange
:
414 case nir_intrinsic_image_deref_atomic_comp_swap
:
415 case nir_intrinsic_image_deref_size
:
416 case nir_intrinsic_shared_atomic_add
:
417 case nir_intrinsic_shared_atomic_imin
:
418 case nir_intrinsic_shared_atomic_umin
:
419 case nir_intrinsic_shared_atomic_imax
:
420 case nir_intrinsic_shared_atomic_umax
:
421 case nir_intrinsic_shared_atomic_and
:
422 case nir_intrinsic_shared_atomic_or
:
423 case nir_intrinsic_shared_atomic_xor
:
424 case nir_intrinsic_shared_atomic_exchange
:
425 case nir_intrinsic_shared_atomic_comp_swap
:
426 case nir_intrinsic_load_scratch
:
427 type
= RegType::vgpr
;
429 case nir_intrinsic_shuffle
:
430 case nir_intrinsic_quad_broadcast
:
431 case nir_intrinsic_quad_swap_horizontal
:
432 case nir_intrinsic_quad_swap_vertical
:
433 case nir_intrinsic_quad_swap_diagonal
:
434 case nir_intrinsic_quad_swizzle_amd
:
435 case nir_intrinsic_masked_swizzle_amd
:
436 case nir_intrinsic_inclusive_scan
:
437 case nir_intrinsic_exclusive_scan
:
438 if (!ctx
->divergent_vals
[intrinsic
->dest
.ssa
.index
]) {
439 type
= RegType::sgpr
;
440 } else if (intrinsic
->src
[0].ssa
->bit_size
== 1) {
441 type
= RegType::sgpr
;
444 type
= RegType::vgpr
;
447 case nir_intrinsic_load_view_index
:
448 type
= ctx
->stage
== fragment_fs
? RegType::vgpr
: RegType::sgpr
;
450 case nir_intrinsic_load_front_face
:
451 case nir_intrinsic_load_helper_invocation
:
452 case nir_intrinsic_is_helper_invocation
:
453 type
= RegType::sgpr
;
456 case nir_intrinsic_reduce
:
457 if (nir_intrinsic_cluster_size(intrinsic
) == 0 ||
458 !ctx
->divergent_vals
[intrinsic
->dest
.ssa
.index
]) {
459 type
= RegType::sgpr
;
460 } else if (intrinsic
->src
[0].ssa
->bit_size
== 1) {
461 type
= RegType::sgpr
;
464 type
= RegType::vgpr
;
467 case nir_intrinsic_load_ubo
:
468 case nir_intrinsic_load_ssbo
:
469 case nir_intrinsic_load_global
:
470 case nir_intrinsic_vulkan_resource_index
:
471 type
= ctx
->divergent_vals
[intrinsic
->dest
.ssa
.index
] ? RegType::vgpr
: RegType::sgpr
;
473 /* due to copy propagation, the swizzled imov is removed if num dest components == 1 */
474 case nir_intrinsic_load_shared
:
475 if (ctx
->divergent_vals
[intrinsic
->dest
.ssa
.index
])
476 type
= RegType::vgpr
;
478 type
= RegType::sgpr
;
481 for (unsigned i
= 0; i
< nir_intrinsic_infos
[intrinsic
->intrinsic
].num_srcs
; i
++) {
482 if (allocated
[intrinsic
->src
[i
].ssa
->index
].type() == RegType::vgpr
)
483 type
= RegType::vgpr
;
487 allocated
[intrinsic
->dest
.ssa
.index
] = Temp(0, RegClass(type
, size
));
489 switch(intrinsic
->intrinsic
) {
490 case nir_intrinsic_load_barycentric_sample
:
491 case nir_intrinsic_load_barycentric_pixel
:
492 case nir_intrinsic_load_barycentric_centroid
:
493 case nir_intrinsic_load_barycentric_at_sample
:
494 case nir_intrinsic_load_barycentric_at_offset
: {
495 glsl_interp_mode mode
= (glsl_interp_mode
)nir_intrinsic_interp_mode(intrinsic
);
496 ctx
->fs_vgpr_args
[get_interp_input(intrinsic
->intrinsic
, mode
)] = true;
499 case nir_intrinsic_load_front_face
:
500 ctx
->fs_vgpr_args
[fs_input::front_face
] = true;
502 case nir_intrinsic_load_frag_coord
:
503 case nir_intrinsic_load_sample_pos
: {
504 uint8_t mask
= nir_ssa_def_components_read(&intrinsic
->dest
.ssa
);
505 for (unsigned i
= 0; i
< 4; i
++) {
507 ctx
->fs_vgpr_args
[fs_input::frag_pos_0
+ i
] = true;
512 case nir_intrinsic_load_sample_id
:
513 ctx
->fs_vgpr_args
[fs_input::ancillary
] = true;
515 case nir_intrinsic_load_sample_mask_in
:
516 ctx
->fs_vgpr_args
[fs_input::ancillary
] = true;
517 ctx
->fs_vgpr_args
[fs_input::sample_coverage
] = true;
524 case nir_instr_type_tex
: {
525 nir_tex_instr
* tex
= nir_instr_as_tex(instr
);
526 unsigned size
= tex
->dest
.ssa
.num_components
;
528 if (tex
->dest
.ssa
.bit_size
== 64)
530 if (tex
->op
== nir_texop_texture_samples
)
531 assert(!ctx
->divergent_vals
[tex
->dest
.ssa
.index
]);
532 if (ctx
->divergent_vals
[tex
->dest
.ssa
.index
])
533 allocated
[tex
->dest
.ssa
.index
] = Temp(0, RegClass(RegType::vgpr
, size
));
535 allocated
[tex
->dest
.ssa
.index
] = Temp(0, RegClass(RegType::sgpr
, size
));
538 case nir_instr_type_parallel_copy
: {
539 nir_foreach_parallel_copy_entry(entry
, nir_instr_as_parallel_copy(instr
)) {
540 allocated
[entry
->dest
.ssa
.index
] = allocated
[entry
->src
.ssa
->index
];
544 case nir_instr_type_ssa_undef
: {
545 unsigned size
= nir_instr_as_ssa_undef(instr
)->def
.num_components
;
546 if (nir_instr_as_ssa_undef(instr
)->def
.bit_size
== 64)
548 allocated
[nir_instr_as_ssa_undef(instr
)->def
.index
] = Temp(0, RegClass(RegType::sgpr
, size
));
551 case nir_instr_type_phi
: {
552 nir_phi_instr
* phi
= nir_instr_as_phi(instr
);
554 unsigned size
= phi
->dest
.ssa
.num_components
;
556 if (phi
->dest
.ssa
.bit_size
== 1) {
557 assert(size
== 1 && "multiple components not yet supported on boolean phis.");
558 type
= RegType::sgpr
;
559 size
*= ctx
->divergent_vals
[phi
->dest
.ssa
.index
] ? 2 : 1;
560 allocated
[phi
->dest
.ssa
.index
] = Temp(0, RegClass(type
, size
));
564 if (ctx
->divergent_vals
[phi
->dest
.ssa
.index
]) {
565 type
= RegType::vgpr
;
567 type
= RegType::sgpr
;
568 nir_foreach_phi_src (src
, phi
) {
569 if (allocated
[src
->src
.ssa
->index
].type() == RegType::vgpr
)
570 type
= RegType::vgpr
;
571 if (allocated
[src
->src
.ssa
->index
].type() == RegType::none
)
576 size
*= phi
->dest
.ssa
.bit_size
== 64 ? 2 : 1;
577 RegClass rc
= RegClass(type
, size
);
578 if (rc
!= allocated
[phi
->dest
.ssa
.index
].regClass()) {
581 nir_foreach_phi_src(src
, phi
)
582 assert(allocated
[src
->src
.ssa
->index
].size() == rc
.size());
584 allocated
[phi
->dest
.ssa
.index
] = Temp(0, rc
);
594 for (unsigned i
= 0; i
< impl
->ssa_alloc
; i
++)
595 allocated
[i
] = Temp(ctx
->program
->allocateId(), allocated
[i
].regClass());
597 ctx
->allocated
.reset(allocated
.release());
/* Bookkeeping for user-SGPR assignment: how many were allocated, how many
 * remain free, and the running index used while laying arguments out. */
struct user_sgpr_info {
   uint8_t num_sgpr;                  /* total user SGPRs allocated; read/written by allocate_user_sgprs,
                                       * allocate_inline_push_consts and add_startpgm */
   uint8_t remaining_sgprs;           /* user SGPRs still available for inlining */
   uint8_t user_sgpr_idx;             /* next user SGPR index to hand out */
   bool need_ring_offsets;            /* scratch/ring descriptor pair must be passed */
   bool indirect_all_descriptor_sets; /* descriptor sets passed via one indirect pointer */
};
608 static void allocate_inline_push_consts(isel_context
*ctx
,
609 user_sgpr_info
& user_sgpr_info
)
611 uint8_t remaining_sgprs
= user_sgpr_info
.remaining_sgprs
;
613 /* Only supported if shaders use push constants. */
614 if (ctx
->program
->info
->min_push_constant_used
== UINT8_MAX
)
617 /* Only supported if shaders don't have indirect push constants. */
618 if (ctx
->program
->info
->has_indirect_push_constants
)
621 /* Only supported for 32-bit push constants. */
622 //TODO: it's possible that some day, the load/store vectorization could make this inaccurate
623 if (!ctx
->program
->info
->has_only_32bit_push_constants
)
626 uint8_t num_push_consts
=
627 (ctx
->program
->info
->max_push_constant_used
-
628 ctx
->program
->info
->min_push_constant_used
) / 4;
630 /* Check if the number of user SGPRs is large enough. */
631 if (num_push_consts
< remaining_sgprs
) {
632 ctx
->program
->info
->num_inline_push_consts
= num_push_consts
;
634 ctx
->program
->info
->num_inline_push_consts
= remaining_sgprs
;
637 /* Clamp to the maximum number of allowed inlined push constants. */
638 if (ctx
->program
->info
->num_inline_push_consts
> MAX_INLINE_PUSH_CONSTS
)
639 ctx
->program
->info
->num_inline_push_consts
= MAX_INLINE_PUSH_CONSTS
;
641 if (ctx
->program
->info
->num_inline_push_consts
== num_push_consts
&&
642 !ctx
->program
->info
->loads_dynamic_offsets
) {
643 /* Disable the default push constants path if all constants are
644 * inlined and if shaders don't use dynamic descriptors.
646 ctx
->program
->info
->loads_push_constants
= false;
647 user_sgpr_info
.num_sgpr
--;
648 user_sgpr_info
.remaining_sgprs
++;
651 ctx
->program
->info
->base_inline_push_consts
=
652 ctx
->program
->info
->min_push_constant_used
/ 4;
654 user_sgpr_info
.num_sgpr
+= ctx
->program
->info
->num_inline_push_consts
;
655 user_sgpr_info
.remaining_sgprs
-= ctx
->program
->info
->num_inline_push_consts
;
658 static void allocate_user_sgprs(isel_context
*ctx
,
659 bool needs_view_index
, user_sgpr_info
& user_sgpr_info
)
661 memset(&user_sgpr_info
, 0, sizeof(struct user_sgpr_info
));
662 uint32_t user_sgpr_count
= 0;
664 /* until we sort out scratch/global buffers always assign ring offsets for gs/vs/es */
665 if (ctx
->stage
!= fragment_fs
&&
666 ctx
->stage
!= compute_cs
667 /*|| ctx->is_gs_copy_shader */)
668 user_sgpr_info
.need_ring_offsets
= true;
670 if (ctx
->stage
== fragment_fs
&&
671 ctx
->program
->info
->ps
.needs_sample_positions
)
672 user_sgpr_info
.need_ring_offsets
= true;
674 /* 2 user sgprs will nearly always be allocated for scratch/rings */
675 if (ctx
->options
->supports_spill
|| user_sgpr_info
.need_ring_offsets
|| ctx
->scratch_enabled
)
676 user_sgpr_count
+= 2;
678 switch (ctx
->stage
) {
680 /* if (!ctx->is_gs_copy_shader) */ {
681 if (ctx
->program
->info
->vs
.has_vertex_buffers
)
683 user_sgpr_count
+= ctx
->program
->info
->vs
.needs_draw_id
? 3 : 2;
687 //user_sgpr_count += ctx->program->info->ps.needs_sample_positions;
690 if (ctx
->program
->info
->cs
.uses_grid_size
)
691 user_sgpr_count
+= 3;
694 unreachable("Shader stage not implemented");
697 if (needs_view_index
)
700 if (ctx
->program
->info
->loads_push_constants
)
701 user_sgpr_count
+= 1; /* we use 32bit pointers */
703 if (ctx
->program
->info
->so
.num_outputs
)
704 user_sgpr_count
+= 1; /* we use 32bit pointers */
706 uint32_t available_sgprs
= ctx
->options
->chip_class
>= GFX9
&& !(ctx
->stage
& hw_cs
) ? 32 : 16;
707 uint32_t remaining_sgprs
= available_sgprs
- user_sgpr_count
;
708 uint32_t num_desc_set
= util_bitcount(ctx
->program
->info
->desc_set_used_mask
);
710 if (available_sgprs
< user_sgpr_count
+ num_desc_set
) {
711 user_sgpr_info
.indirect_all_descriptor_sets
= true;
712 user_sgpr_info
.num_sgpr
= user_sgpr_count
+ 1;
713 user_sgpr_info
.remaining_sgprs
= remaining_sgprs
- 1;
715 user_sgpr_info
.num_sgpr
= user_sgpr_count
+ num_desc_set
;
716 user_sgpr_info
.remaining_sgprs
= remaining_sgprs
- num_desc_set
;
719 allocate_inline_push_consts(ctx
, user_sgpr_info
);
724 RegClass types
[MAX_ARGS
];
725 Temp
*assign
[MAX_ARGS
];
726 PhysReg reg
[MAX_ARGS
];
727 unsigned array_params_mask
;
730 uint8_t num_sgprs_used
;
731 uint8_t num_vgprs_used
;
735 add_arg(arg_info
*info
, RegClass rc
, Temp
*param_ptr
, unsigned reg
)
737 assert(info
->count
< MAX_ARGS
);
739 info
->assign
[info
->count
] = param_ptr
;
740 info
->types
[info
->count
] = rc
;
742 if (rc
.type() == RegType::sgpr
) {
743 info
->num_sgprs_used
+= rc
.size();
745 info
->reg
[info
->count
] = PhysReg
{reg
};
747 assert(rc
.type() == RegType::vgpr
);
748 info
->num_vgprs_used
+= rc
.size();
749 info
->reg
[info
->count
] = PhysReg
{reg
+ 256};
755 set_loc(struct radv_userdata_info
*ud_info
, uint8_t *sgpr_idx
, uint8_t num_sgprs
)
757 ud_info
->sgpr_idx
= *sgpr_idx
;
758 ud_info
->num_sgprs
= num_sgprs
;
759 *sgpr_idx
+= num_sgprs
;
763 set_loc_shader(isel_context
*ctx
, int idx
, uint8_t *sgpr_idx
,
766 struct radv_userdata_info
*ud_info
= &ctx
->program
->info
->user_sgprs_locs
.shader_data
[idx
];
769 set_loc(ud_info
, sgpr_idx
, num_sgprs
);
773 set_loc_shader_ptr(isel_context
*ctx
, int idx
, uint8_t *sgpr_idx
)
775 bool use_32bit_pointers
= idx
!= AC_UD_SCRATCH_RING_OFFSETS
;
777 set_loc_shader(ctx
, idx
, sgpr_idx
, use_32bit_pointers
? 1 : 2);
781 set_loc_desc(isel_context
*ctx
, int idx
, uint8_t *sgpr_idx
)
783 struct radv_userdata_locations
*locs
= &ctx
->program
->info
->user_sgprs_locs
;
784 struct radv_userdata_info
*ud_info
= &locs
->descriptor_sets
[idx
];
787 set_loc(ud_info
, sgpr_idx
, 1);
788 locs
->descriptor_sets_enabled
|= 1 << idx
;
792 declare_global_input_sgprs(isel_context
*ctx
,
793 /* bool has_previous_stage, gl_shader_stage previous_stage, */
794 user_sgpr_info
*user_sgpr_info
,
795 struct arg_info
*args
,
798 /* 1 for each descriptor set */
799 if (!user_sgpr_info
->indirect_all_descriptor_sets
) {
800 uint32_t mask
= ctx
->program
->info
->desc_set_used_mask
;
802 int i
= u_bit_scan(&mask
);
803 add_arg(args
, s1
, &desc_sets
[i
], user_sgpr_info
->user_sgpr_idx
);
804 set_loc_desc(ctx
, i
, &user_sgpr_info
->user_sgpr_idx
);
806 /* NIR->LLVM might have set this to true if RADV_DEBUG=compiletime */
807 ctx
->program
->info
->need_indirect_descriptor_sets
= false;
809 add_arg(args
, s1
, desc_sets
, user_sgpr_info
->user_sgpr_idx
);
810 set_loc_shader_ptr(ctx
, AC_UD_INDIRECT_DESCRIPTOR_SETS
, &user_sgpr_info
->user_sgpr_idx
);
811 ctx
->program
->info
->need_indirect_descriptor_sets
= true;
814 if (ctx
->program
->info
->loads_push_constants
) {
815 /* 1 for push constants and dynamic descriptors */
816 add_arg(args
, s1
, &ctx
->push_constants
, user_sgpr_info
->user_sgpr_idx
);
817 set_loc_shader_ptr(ctx
, AC_UD_PUSH_CONSTANTS
, &user_sgpr_info
->user_sgpr_idx
);
820 if (ctx
->program
->info
->num_inline_push_consts
) {
821 unsigned count
= ctx
->program
->info
->num_inline_push_consts
;
822 for (unsigned i
= 0; i
< count
; i
++)
823 add_arg(args
, s1
, &ctx
->inline_push_consts
[i
], user_sgpr_info
->user_sgpr_idx
+ i
);
824 set_loc_shader(ctx
, AC_UD_INLINE_PUSH_CONSTANTS
, &user_sgpr_info
->user_sgpr_idx
, count
);
826 ctx
->num_inline_push_consts
= ctx
->program
->info
->num_inline_push_consts
;
827 ctx
->base_inline_push_consts
= ctx
->program
->info
->base_inline_push_consts
;
830 if (ctx
->program
->info
->so
.num_outputs
) {
831 add_arg(args
, s1
, &ctx
->streamout_buffers
, user_sgpr_info
->user_sgpr_idx
);
832 set_loc_shader_ptr(ctx
, AC_UD_STREAMOUT_BUFFERS
, &user_sgpr_info
->user_sgpr_idx
);
837 declare_vs_input_vgprs(isel_context
*ctx
, struct arg_info
*args
)
839 unsigned vgpr_idx
= 0;
840 add_arg(args
, v1
, &ctx
->vertex_id
, vgpr_idx
++);
841 if (ctx
->options
->chip_class
>= GFX10
) {
842 add_arg(args
, v1
, NULL
, vgpr_idx
++); /* unused */
843 add_arg(args
, v1
, &ctx
->vs_prim_id
, vgpr_idx
++);
844 add_arg(args
, v1
, &ctx
->instance_id
, vgpr_idx
++);
846 if (ctx
->options
->key
.vs
.out
.as_ls
) {
847 add_arg(args
, v1
, &ctx
->rel_auto_id
, vgpr_idx
++);
848 add_arg(args
, v1
, &ctx
->instance_id
, vgpr_idx
++);
850 add_arg(args
, v1
, &ctx
->instance_id
, vgpr_idx
++);
851 add_arg(args
, v1
, &ctx
->vs_prim_id
, vgpr_idx
++);
853 add_arg(args
, v1
, NULL
, vgpr_idx
); /* unused */
858 declare_streamout_sgprs(isel_context
*ctx
, struct arg_info
*args
, unsigned *idx
)
860 /* Streamout SGPRs. */
861 if (ctx
->program
->info
->so
.num_outputs
) {
862 assert(ctx
->stage
& hw_vs
);
864 if (ctx
->stage
!= tess_eval_vs
) {
865 add_arg(args
, s1
, &ctx
->streamout_config
, (*idx
)++);
867 args
->assign
[args
->count
- 1] = &ctx
->streamout_config
;
868 args
->types
[args
->count
- 1] = s1
;
871 add_arg(args
, s1
, &ctx
->streamout_write_idx
, (*idx
)++);
874 /* A streamout buffer offset is loaded if the stride is non-zero. */
875 for (unsigned i
= 0; i
< 4; i
++) {
876 if (!ctx
->program
->info
->so
.strides
[i
])
879 add_arg(args
, s1
, &ctx
->streamout_offset
[i
], (*idx
)++);
883 static bool needs_view_index_sgpr(isel_context
*ctx
)
885 switch (ctx
->stage
) {
887 return ctx
->program
->info
->needs_multiview_view_index
|| ctx
->options
->key
.has_multiview_view_index
;
889 return ctx
->program
->info
->needs_multiview_view_index
&& ctx
->options
->key
.has_multiview_view_index
;
892 case vertex_tess_control_hs
:
893 case vertex_geometry_gs
:
894 case tess_control_hs
:
896 case tess_eval_geometry_gs
:
898 return ctx
->program
->info
->needs_multiview_view_index
;
905 add_fs_arg(isel_context
*ctx
, arg_info
*args
, unsigned &vgpr_idx
, fs_input input
, unsigned value
, bool enable_next
= false, RegClass rc
= v1
)
907 if (!ctx
->fs_vgpr_args
[input
])
910 add_arg(args
, rc
, &ctx
->fs_inputs
[input
], vgpr_idx
);
911 vgpr_idx
+= rc
.size();
914 add_arg(args
, rc
, &ctx
->fs_inputs
[input
+ 1], vgpr_idx
);
915 vgpr_idx
+= rc
.size();
918 ctx
->program
->config
->spi_ps_input_addr
|= value
;
919 ctx
->program
->config
->spi_ps_input_ena
|= value
;
923 void add_startpgm(struct isel_context
*ctx
)
925 user_sgpr_info user_sgpr_info
;
926 bool needs_view_index
= needs_view_index_sgpr(ctx
);
927 allocate_user_sgprs(ctx
, needs_view_index
, user_sgpr_info
);
930 /* this needs to be in sgprs 0 and 1 */
931 if (ctx
->options
->supports_spill
|| user_sgpr_info
.need_ring_offsets
|| ctx
->scratch_enabled
) {
932 add_arg(&args
, s2
, &ctx
->private_segment_buffer
, 0);
933 set_loc_shader_ptr(ctx
, AC_UD_SCRATCH_RING_OFFSETS
, &user_sgpr_info
.user_sgpr_idx
);
936 unsigned vgpr_idx
= 0;
937 switch (ctx
->stage
) {
939 declare_global_input_sgprs(ctx
, &user_sgpr_info
, &args
, ctx
->descriptor_sets
);
940 if (ctx
->program
->info
->vs
.has_vertex_buffers
) {
941 add_arg(&args
, s1
, &ctx
->vertex_buffers
, user_sgpr_info
.user_sgpr_idx
);
942 set_loc_shader_ptr(ctx
, AC_UD_VS_VERTEX_BUFFERS
, &user_sgpr_info
.user_sgpr_idx
);
944 add_arg(&args
, s1
, &ctx
->base_vertex
, user_sgpr_info
.user_sgpr_idx
);
945 add_arg(&args
, s1
, &ctx
->start_instance
, user_sgpr_info
.user_sgpr_idx
+ 1);
946 if (ctx
->program
->info
->vs
.needs_draw_id
) {
947 add_arg(&args
, s1
, &ctx
->draw_id
, user_sgpr_info
.user_sgpr_idx
+ 2);
948 set_loc_shader(ctx
, AC_UD_VS_BASE_VERTEX_START_INSTANCE
, &user_sgpr_info
.user_sgpr_idx
, 3);
950 set_loc_shader(ctx
, AC_UD_VS_BASE_VERTEX_START_INSTANCE
, &user_sgpr_info
.user_sgpr_idx
, 2);
952 if (needs_view_index
) {
953 add_arg(&args
, s1
, &ctx
->view_index
, user_sgpr_info
.user_sgpr_idx
);
954 set_loc_shader(ctx
, AC_UD_VIEW_INDEX
, &user_sgpr_info
.user_sgpr_idx
, 1);
957 assert(user_sgpr_info
.user_sgpr_idx
== user_sgpr_info
.num_sgpr
);
958 unsigned idx
= user_sgpr_info
.user_sgpr_idx
;
959 if (ctx
->options
->key
.vs
.out
.as_es
)
960 add_arg(&args
, s1
, &ctx
->es2gs_offset
, idx
++);
962 declare_streamout_sgprs(ctx
, &args
, &idx
);
964 if (ctx
->scratch_enabled
)
965 add_arg(&args
, s1
, &ctx
->scratch_offset
, idx
++);
967 declare_vs_input_vgprs(ctx
, &args
);
971 declare_global_input_sgprs(ctx
, &user_sgpr_info
, &args
, ctx
->descriptor_sets
);
973 assert(user_sgpr_info
.user_sgpr_idx
== user_sgpr_info
.num_sgpr
);
974 add_arg(&args
, s1
, &ctx
->prim_mask
, user_sgpr_info
.user_sgpr_idx
);
976 if (ctx
->scratch_enabled
)
977 add_arg(&args
, s1
, &ctx
->scratch_offset
, user_sgpr_info
.user_sgpr_idx
+ 1);
979 ctx
->program
->config
->spi_ps_input_addr
= 0;
980 ctx
->program
->config
->spi_ps_input_ena
= 0;
982 bool has_interp_mode
= false;
984 has_interp_mode
|= add_fs_arg(ctx
, &args
, vgpr_idx
, fs_input::persp_sample_p1
, S_0286CC_PERSP_SAMPLE_ENA(1), true);
985 has_interp_mode
|= add_fs_arg(ctx
, &args
, vgpr_idx
, fs_input::persp_center_p1
, S_0286CC_PERSP_CENTER_ENA(1), true);
986 has_interp_mode
|= add_fs_arg(ctx
, &args
, vgpr_idx
, fs_input::persp_centroid_p1
, S_0286CC_PERSP_CENTROID_ENA(1), true);
987 has_interp_mode
|= add_fs_arg(ctx
, &args
, vgpr_idx
, fs_input::persp_pull_model
, S_0286CC_PERSP_PULL_MODEL_ENA(1), false, v3
);
989 if (!has_interp_mode
&& ctx
->fs_vgpr_args
[fs_input::frag_pos_3
]) {
990 /* If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be enabled too */
991 ctx
->fs_vgpr_args
[fs_input::persp_center_p1
] = true;
992 has_interp_mode
= add_fs_arg(ctx
, &args
, vgpr_idx
, fs_input::persp_center_p1
, S_0286CC_PERSP_CENTER_ENA(1), true);
995 has_interp_mode
|= add_fs_arg(ctx
, &args
, vgpr_idx
, fs_input::linear_sample_p1
, S_0286CC_LINEAR_SAMPLE_ENA(1), true);
996 has_interp_mode
|= add_fs_arg(ctx
, &args
, vgpr_idx
, fs_input::linear_center_p1
, S_0286CC_LINEAR_CENTER_ENA(1), true);
997 has_interp_mode
|= add_fs_arg(ctx
, &args
, vgpr_idx
, fs_input::linear_centroid_p1
, S_0286CC_LINEAR_CENTROID_ENA(1), true);
998 has_interp_mode
|= add_fs_arg(ctx
, &args
, vgpr_idx
, fs_input::line_stipple
, S_0286CC_LINE_STIPPLE_TEX_ENA(1));
1000 if (!has_interp_mode
) {
1001 /* At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled */
1002 ctx
->fs_vgpr_args
[fs_input::persp_center_p1
] = true;
1003 has_interp_mode
= add_fs_arg(ctx
, &args
, vgpr_idx
, fs_input::persp_center_p1
, S_0286CC_PERSP_CENTER_ENA(1), true);
1006 add_fs_arg(ctx
, &args
, vgpr_idx
, fs_input::frag_pos_0
, S_0286CC_POS_X_FLOAT_ENA(1));
1007 add_fs_arg(ctx
, &args
, vgpr_idx
, fs_input::frag_pos_1
, S_0286CC_POS_Y_FLOAT_ENA(1));
1008 add_fs_arg(ctx
, &args
, vgpr_idx
, fs_input::frag_pos_2
, S_0286CC_POS_Z_FLOAT_ENA(1));
1009 add_fs_arg(ctx
, &args
, vgpr_idx
, fs_input::frag_pos_3
, S_0286CC_POS_W_FLOAT_ENA(1));
1011 add_fs_arg(ctx
, &args
, vgpr_idx
, fs_input::front_face
, S_0286CC_FRONT_FACE_ENA(1));
1012 add_fs_arg(ctx
, &args
, vgpr_idx
, fs_input::ancillary
, S_0286CC_ANCILLARY_ENA(1));
1013 add_fs_arg(ctx
, &args
, vgpr_idx
, fs_input::sample_coverage
, S_0286CC_SAMPLE_COVERAGE_ENA(1));
1014 add_fs_arg(ctx
, &args
, vgpr_idx
, fs_input::fixed_pt
, S_0286CC_POS_FIXED_PT_ENA(1));
1016 ASSERTED
bool unset_interp_mode
= !(ctx
->program
->config
->spi_ps_input_addr
& 0x7F) ||
1017 (G_0286CC_POS_W_FLOAT_ENA(ctx
->program
->config
->spi_ps_input_addr
)
1018 && !(ctx
->program
->config
->spi_ps_input_addr
& 0xF));
1020 assert(has_interp_mode
);
1021 assert(!unset_interp_mode
);
1025 declare_global_input_sgprs(ctx
, &user_sgpr_info
, &args
, ctx
->descriptor_sets
);
1027 if (ctx
->program
->info
->cs
.uses_grid_size
) {
1028 add_arg(&args
, s1
, &ctx
->num_workgroups
[0], user_sgpr_info
.user_sgpr_idx
);
1029 add_arg(&args
, s1
, &ctx
->num_workgroups
[1], user_sgpr_info
.user_sgpr_idx
+ 1);
1030 add_arg(&args
, s1
, &ctx
->num_workgroups
[2], user_sgpr_info
.user_sgpr_idx
+ 2);
1031 set_loc_shader(ctx
, AC_UD_CS_GRID_SIZE
, &user_sgpr_info
.user_sgpr_idx
, 3);
1033 assert(user_sgpr_info
.user_sgpr_idx
== user_sgpr_info
.num_sgpr
);
1034 unsigned idx
= user_sgpr_info
.user_sgpr_idx
;
1035 for (unsigned i
= 0; i
< 3; i
++) {
1036 if (ctx
->program
->info
->cs
.uses_block_id
[i
])
1037 add_arg(&args
, s1
, &ctx
->workgroup_ids
[i
], idx
++);
1040 if (ctx
->program
->info
->cs
.uses_local_invocation_idx
)
1041 add_arg(&args
, s1
, &ctx
->tg_size
, idx
++);
1042 if (ctx
->scratch_enabled
)
1043 add_arg(&args
, s1
, &ctx
->scratch_offset
, idx
++);
1045 add_arg(&args
, v1
, &ctx
->local_invocation_ids
[0], vgpr_idx
++);
1046 add_arg(&args
, v1
, &ctx
->local_invocation_ids
[1], vgpr_idx
++);
1047 add_arg(&args
, v1
, &ctx
->local_invocation_ids
[2], vgpr_idx
++);
1051 unreachable("Shader stage not implemented");
1054 ctx
->program
->info
->num_input_vgprs
= 0;
1055 ctx
->program
->info
->num_input_sgprs
= args
.num_sgprs_used
;
1056 ctx
->program
->info
->num_user_sgprs
= user_sgpr_info
.num_sgpr
;
1057 ctx
->program
->info
->num_input_vgprs
= args
.num_vgprs_used
;
1059 if (ctx
->stage
== fragment_fs
) {
1060 /* Verify that we have a correct assumption about input VGPR count */
1061 ASSERTED
unsigned input_vgpr_cnt
= ac_get_fs_input_vgpr_cnt(ctx
->program
->config
, nullptr, nullptr);
1062 assert(input_vgpr_cnt
== ctx
->program
->info
->num_input_vgprs
);
1065 aco_ptr
<Pseudo_instruction
> startpgm
{create_instruction
<Pseudo_instruction
>(aco_opcode::p_startpgm
, Format::PSEUDO
, 0, args
.count
+ 1)};
1066 for (unsigned i
= 0; i
< args
.count
; i
++) {
1067 if (args
.assign
[i
]) {
1068 *args
.assign
[i
] = Temp
{ctx
->program
->allocateId(), args
.types
[i
]};
1069 startpgm
->definitions
[i
] = Definition(*args
.assign
[i
]);
1070 startpgm
->definitions
[i
].setFixed(args
.reg
[i
]);
1073 startpgm
->definitions
[args
.count
] = Definition
{ctx
->program
->allocateId(), exec
, s2
};
1074 ctx
->block
->instructions
.push_back(std::move(startpgm
));
1078 type_size(const struct glsl_type
*type
, bool bindless
)
1080 // TODO: don't we need type->std430_base_alignment() here?
1081 return glsl_count_attribute_slots(type
, false);
1085 shared_var_info(const struct glsl_type
*type
, unsigned *size
, unsigned *align
)
1087 assert(glsl_type_is_vector_or_scalar(type
));
1089 uint32_t comp_size
= glsl_type_is_boolean(type
)
1090 ? 4 : glsl_get_bit_size(type
) / 8;
1091 unsigned length
= glsl_get_vector_elements(type
);
1092 *size
= comp_size
* length
,
1097 get_align(nir_variable_mode mode
, bool is_store
, unsigned bit_size
, unsigned num_components
)
1099 /* TODO: ACO doesn't have good support for non-32-bit reads/writes yet */
1104 case nir_var_mem_ubo
:
1105 case nir_var_mem_ssbo
:
1106 //case nir_var_mem_push_const: enable with 1240!
1107 case nir_var_mem_shared
:
1108 /* TODO: what are the alignment requirements for LDS? */
1109 return num_components
<= 4 ? 4 : -1;
1116 setup_vs_variables(isel_context
*ctx
, nir_shader
*nir
)
1118 nir_foreach_variable(variable
, &nir
->inputs
)
1120 variable
->data
.driver_location
= variable
->data
.location
* 4;
1122 nir_foreach_variable(variable
, &nir
->outputs
)
1124 variable
->data
.driver_location
= variable
->data
.location
* 4;
1127 radv_vs_output_info
*outinfo
= &ctx
->program
->info
->vs
.outinfo
;
1129 memset(outinfo
->vs_output_param_offset
, AC_EXP_PARAM_UNDEFINED
,
1130 sizeof(outinfo
->vs_output_param_offset
));
1132 ctx
->needs_instance_id
= ctx
->program
->info
->vs
.needs_instance_id
;
1134 bool export_clip_dists
= ctx
->options
->key
.vs_common_out
.export_clip_dists
;
1136 outinfo
->param_exports
= 0;
1137 int pos_written
= 0x1;
1138 if (outinfo
->writes_pointsize
|| outinfo
->writes_viewport_index
|| outinfo
->writes_layer
)
1139 pos_written
|= 1 << 1;
1141 nir_foreach_variable(variable
, &nir
->outputs
)
1143 int idx
= variable
->data
.location
;
1144 unsigned slots
= variable
->type
->count_attribute_slots(false);
1145 if (variable
->data
.compact
) {
1146 unsigned component_count
= variable
->data
.location_frac
+ variable
->type
->length
;
1147 slots
= (component_count
+ 3) / 4;
1150 if (idx
>= VARYING_SLOT_VAR0
|| idx
== VARYING_SLOT_LAYER
|| idx
== VARYING_SLOT_PRIMITIVE_ID
||
1151 ((idx
== VARYING_SLOT_CLIP_DIST0
|| idx
== VARYING_SLOT_CLIP_DIST1
) && export_clip_dists
)) {
1152 for (unsigned i
= 0; i
< slots
; i
++) {
1153 if (outinfo
->vs_output_param_offset
[idx
+ i
] == AC_EXP_PARAM_UNDEFINED
)
1154 outinfo
->vs_output_param_offset
[idx
+ i
] = outinfo
->param_exports
++;
1158 if (outinfo
->writes_layer
&&
1159 outinfo
->vs_output_param_offset
[VARYING_SLOT_LAYER
] == AC_EXP_PARAM_UNDEFINED
) {
1160 /* when ctx->options->key.has_multiview_view_index = true, the layer
1161 * variable isn't declared in NIR and it's isel's job to get the layer */
1162 outinfo
->vs_output_param_offset
[VARYING_SLOT_LAYER
] = outinfo
->param_exports
++;
1165 if (outinfo
->export_prim_id
) {
1166 assert(outinfo
->vs_output_param_offset
[VARYING_SLOT_PRIMITIVE_ID
] == AC_EXP_PARAM_UNDEFINED
);
1167 outinfo
->vs_output_param_offset
[VARYING_SLOT_PRIMITIVE_ID
] = outinfo
->param_exports
++;
1170 ctx
->num_clip_distances
= util_bitcount(outinfo
->clip_dist_mask
);
1171 ctx
->num_cull_distances
= util_bitcount(outinfo
->cull_dist_mask
);
1173 assert(ctx
->num_clip_distances
+ ctx
->num_cull_distances
<= 8);
1175 if (ctx
->num_clip_distances
+ ctx
->num_cull_distances
> 0)
1176 pos_written
|= 1 << 2;
1177 if (ctx
->num_clip_distances
+ ctx
->num_cull_distances
> 4)
1178 pos_written
|= 1 << 3;
1180 outinfo
->pos_exports
= util_bitcount(pos_written
);
1184 setup_variables(isel_context
*ctx
, nir_shader
*nir
)
1186 switch (nir
->info
.stage
) {
1187 case MESA_SHADER_FRAGMENT
: {
1188 nir_foreach_variable(variable
, &nir
->outputs
)
1190 int idx
= variable
->data
.location
+ variable
->data
.index
;
1191 variable
->data
.driver_location
= idx
* 4;
1195 case MESA_SHADER_COMPUTE
: {
1196 ctx
->program
->config
->lds_size
= (nir
->info
.cs
.shared_size
+ ctx
->program
->lds_alloc_granule
- 1) /
1197 ctx
->program
->lds_alloc_granule
;
1200 case MESA_SHADER_VERTEX
: {
1201 setup_vs_variables(ctx
, nir
);
1205 unreachable("Unhandled shader stage.");
1210 setup_isel_context(Program
* program
,
1211 unsigned shader_count
,
1212 struct nir_shader
*const *shaders
,
1213 ac_shader_config
* config
,
1214 radv_shader_info
*info
,
1215 radv_nir_compiler_options
*options
)
1218 for (unsigned i
= 0; i
< shader_count
; i
++) {
1219 switch (shaders
[i
]->info
.stage
) {
1220 case MESA_SHADER_VERTEX
:
1221 program
->stage
|= sw_vs
;
1223 case MESA_SHADER_TESS_CTRL
:
1224 program
->stage
|= sw_tcs
;
1226 case MESA_SHADER_TESS_EVAL
:
1227 program
->stage
|= sw_tes
;
1229 case MESA_SHADER_GEOMETRY
:
1230 program
->stage
|= sw_gs
;
1232 case MESA_SHADER_FRAGMENT
:
1233 program
->stage
|= sw_fs
;
1235 case MESA_SHADER_COMPUTE
:
1236 program
->stage
|= sw_cs
;
1239 unreachable("Shader stage not implemented");
1242 if (program
->stage
== sw_vs
)
1243 program
->stage
|= hw_vs
;
1244 else if (program
->stage
== sw_fs
)
1245 program
->stage
|= hw_fs
;
1246 else if (program
->stage
== sw_cs
)
1247 program
->stage
|= hw_cs
;
1249 unreachable("Shader stage not implemented");
1251 program
->config
= config
;
1252 program
->info
= info
;
1253 program
->chip_class
= options
->chip_class
;
1254 program
->family
= options
->family
;
1255 program
->wave_size
= options
->wave_size
;
1257 program
->lds_alloc_granule
= options
->chip_class
>= GFX7
? 512 : 256;
1258 program
->lds_limit
= options
->chip_class
>= GFX7
? 65536 : 32768;
1259 program
->vgpr_limit
= 256;
1261 if (options
->chip_class
>= GFX10
) {
1262 program
->physical_sgprs
= 2560; /* doesn't matter as long as it's at least 128 * 20 */
1263 program
->sgpr_alloc_granule
= 127;
1264 program
->sgpr_limit
= 106;
1265 } else if (program
->chip_class
>= GFX8
) {
1266 program
->physical_sgprs
= 800;
1267 program
->sgpr_alloc_granule
= 15;
1268 program
->sgpr_limit
= 102;
1270 program
->physical_sgprs
= 512;
1271 program
->sgpr_alloc_granule
= 7;
1272 if (options
->family
== CHIP_TONGA
|| options
->family
== CHIP_ICELAND
)
1273 program
->sgpr_limit
= 94; /* workaround hardware bug */
1275 program
->sgpr_limit
= 104;
1277 /* TODO: we don't have to allocate VCC if we don't need it */
1278 program
->needs_vcc
= true;
1280 for (unsigned i
= 0; i
< MAX_SETS
; ++i
)
1281 program
->info
->user_sgprs_locs
.descriptor_sets
[i
].sgpr_idx
= -1;
1282 for (unsigned i
= 0; i
< AC_UD_MAX_UD
; ++i
)
1283 program
->info
->user_sgprs_locs
.shader_data
[i
].sgpr_idx
= -1;
1285 isel_context ctx
= {};
1286 ctx
.program
= program
;
1287 ctx
.options
= options
;
1288 ctx
.stage
= program
->stage
;
1290 for (unsigned i
= 0; i
< fs_input::max_inputs
; ++i
)
1291 ctx
.fs_inputs
[i
] = Temp(0, v1
);
1292 ctx
.fs_inputs
[fs_input::persp_pull_model
] = Temp(0, v3
);
1293 for (unsigned i
= 0; i
< MAX_SETS
; ++i
)
1294 ctx
.descriptor_sets
[i
] = Temp(0, s1
);
1295 for (unsigned i
= 0; i
< MAX_INLINE_PUSH_CONSTS
; ++i
)
1296 ctx
.inline_push_consts
[i
] = Temp(0, s1
);
1297 for (unsigned i
= 0; i
<= VARYING_SLOT_VAR31
; ++i
) {
1298 for (unsigned j
= 0; j
< 4; ++j
)
1299 ctx
.vs_output
.outputs
[i
][j
] = Temp(0, v1
);
1302 for (unsigned i
= 0; i
< shader_count
; i
++) {
1303 nir_shader
*nir
= shaders
[i
];
1305 /* align and copy constant data */
1306 while (program
->constant_data
.size() % 4u)
1307 program
->constant_data
.push_back(0);
1308 ctx
.constant_data_offset
= program
->constant_data
.size();
1309 program
->constant_data
.insert(program
->constant_data
.end(),
1310 (uint8_t*)nir
->constant_data
,
1311 (uint8_t*)nir
->constant_data
+ nir
->constant_data_size
);
1313 /* the variable setup has to be done before lower_io / CSE */
1314 if (nir
->info
.stage
== MESA_SHADER_COMPUTE
)
1315 nir_lower_vars_to_explicit_types(nir
, nir_var_mem_shared
, shared_var_info
);
1316 setup_variables(&ctx
, nir
);
1318 /* optimize and lower memory operations */
1319 bool lower_to_scalar
= false;
1320 bool lower_pack
= false;
1321 // TODO: uncomment this once !1240 is merged
1322 /*if (nir_opt_load_store_vectorize(nir,
1323 (nir_variable_mode)(nir_var_mem_ssbo | nir_var_mem_ubo |
1324 nir_var_mem_push_const | nir_var_mem_shared),
1326 lower_to_scalar = true;
1329 if (nir
->info
.stage
== MESA_SHADER_COMPUTE
)
1330 lower_to_scalar
|= nir_lower_explicit_io(nir
, nir_var_mem_shared
, nir_address_format_32bit_offset
);
1332 nir_lower_io(nir
, (nir_variable_mode
)(nir_var_shader_in
| nir_var_shader_out
), type_size
, (nir_lower_io_options
)0);
1333 nir_lower_explicit_io(nir
, nir_var_mem_global
, nir_address_format_64bit_global
);
1335 if (lower_to_scalar
)
1336 nir_lower_alu_to_scalar(nir
, NULL
, NULL
);
1338 nir_lower_pack(nir
);
1340 /* lower ALU operations */
1341 // TODO: implement logic64 in aco, it's more effective for sgprs
1342 nir_lower_int64(nir
, (nir_lower_int64_options
) (nir_lower_imul64
|
1343 nir_lower_imul_high64
|
1344 nir_lower_imul_2x32_64
|
1345 nir_lower_divmod64
|
1347 nir_lower_minmax64
|
1350 nir_opt_idiv_const(nir
, 32);
1351 nir_lower_idiv(nir
, nir_lower_idiv_precise
);
1353 /* optimize the lowered ALU operations */
1354 bool more_algebraic
= true;
1355 while (more_algebraic
) {
1356 more_algebraic
= false;
1357 NIR_PASS_V(nir
, nir_copy_prop
);
1358 NIR_PASS_V(nir
, nir_opt_dce
);
1359 NIR_PASS_V(nir
, nir_opt_constant_folding
);
1360 NIR_PASS(more_algebraic
, nir
, nir_opt_algebraic
);
1363 /* Do late algebraic optimization to turn add(a, neg(b)) back into
1364 * subs, then the mandatory cleanup after algebraic. Note that it may
1365 * produce fnegs, and if so then we need to keep running to squash
1368 bool more_late_algebraic
= true;
1369 while (more_late_algebraic
) {
1370 more_late_algebraic
= false;
1371 NIR_PASS(more_late_algebraic
, nir
, nir_opt_algebraic_late
);
1372 NIR_PASS_V(nir
, nir_opt_constant_folding
);
1373 NIR_PASS_V(nir
, nir_copy_prop
);
1374 NIR_PASS_V(nir
, nir_opt_dce
);
1375 NIR_PASS_V(nir
, nir_opt_cse
);
1378 /* cleanup passes */
1379 nir_lower_load_const_to_scalar(nir
);
1380 nir_opt_shrink_load(nir
);
1381 nir_move_options move_opts
= (nir_move_options
)(
1382 nir_move_const_undef
| nir_move_load_ubo
| nir_move_load_input
| nir_move_comparisons
);
1383 nir_opt_sink(nir
, move_opts
);
1384 nir_opt_move(nir
, move_opts
);
1385 nir_convert_to_lcssa(nir
, true, false);
1386 nir_lower_phis_to_scalar(nir
);
1388 nir_function_impl
*func
= nir_shader_get_entrypoint(nir
);
1389 nir_index_ssa_defs(func
);
1391 if (options
->dump_preoptir
) {
1392 fprintf(stderr
, "NIR shader before instruction selection:\n");
1393 nir_print_shader(nir
, stderr
);
1397 unsigned scratch_size
= 0;
1398 for (unsigned i
= 0; i
< shader_count
; i
++)
1399 scratch_size
= std::max(scratch_size
, shaders
[i
]->scratch_size
);
1400 ctx
.scratch_enabled
= scratch_size
> 0;
1401 ctx
.program
->config
->scratch_bytes_per_wave
= align(scratch_size
* ctx
.options
->wave_size
, 1024);
1402 ctx
.program
->config
->float_mode
= V_00B028_FP_64_DENORMS
;
1403 ctx
.program
->info
->wave_size
= ctx
.options
->wave_size
;
1405 ctx
.block
= ctx
.program
->create_and_insert_block();
1406 ctx
.block
->loop_nest_depth
= 0;
1407 ctx
.block
->kind
= block_kind_top_level
;