/*
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 *
 * based in part on anv driver which is:
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
28 #include "radv_private.h"
29 #include "radv_shader.h"
30 #include "radv_shader_helper.h"
33 #include <llvm-c/Core.h>
34 #include <llvm-c/TargetMachine.h>
35 #include <llvm-c/Transforms/Scalar.h>
36 #include <llvm-c/Transforms/Utils.h>
39 #include "ac_binary.h"
40 #include "ac_llvm_util.h"
41 #include "ac_llvm_build.h"
42 #include "ac_shader_abi.h"
43 #include "ac_shader_util.h"
44 #include "ac_exp_param.h"
46 #define RADEON_LLVM_MAX_INPUTS (VARYING_SLOT_VAR31 + 1)
/* Per-compilation state used while translating a RADV NIR shader to LLVM IR.
 * NOTE(review): this extraction appears to have dropped some members and the
 * closing "};" of this struct — fields referenced later in the file (e.g.
 * oc_lds, tes_u, tes_v) are not visible here; confirm against the full source.
 */
struct radv_shader_context {
	struct ac_llvm_context ac;	/* shared ac LLVM builder/context helpers */
	const struct radv_nir_compiler_options *options;
	struct radv_shader_variant_info *shader_info;
	struct ac_shader_abi abi;	/* common shader ABI callbacks and values */

	unsigned max_workgroup_size;
	LLVMContextRef context;
	LLVMValueRef main_function;

	/* One descriptor-set pointer argument per bound set. */
	LLVMValueRef descriptor_sets[MAX_SETS];
	LLVMValueRef ring_offsets;

	/* Vertex-shader input SGPRs/VGPRs. */
	LLVMValueRef vertex_buffers;
	LLVMValueRef rel_auto_id;
	LLVMValueRef vs_prim_id;
	LLVMValueRef es2gs_offset;

	/* Merged-shader / tessellation SGPRs. */
	LLVMValueRef merged_wave_info;
	LLVMValueRef tess_factor_offset;
	LLVMValueRef tes_rel_patch_id;

	/* Packed threadgroup info (NGG):
	 * - bits 0..10: ordered_wave_id
	 * - bits 12..20: number of vertices in group
	 * - bits 22..30: number of primitives in group
	 */
	LLVMValueRef gs_tg_info;
	LLVMValueRef gs2vs_offset;
	LLVMValueRef gs_wave_id;
	LLVMValueRef gs_vtx_offset[6];

	/* Internal rings for ES->GS / GS->VS / tess factors. */
	LLVMValueRef esgs_ring;
	LLVMValueRef gsvs_ring[4];
	LLVMValueRef hs_ring_tess_offchip;
	LLVMValueRef hs_ring_tess_factor;

	/* Transform-feedback (streamout) state. */
	LLVMValueRef streamout_buffers;
	LLVMValueRef streamout_write_idx;
	LLVMValueRef streamout_config;
	LLVMValueRef streamout_offset[4];

	gl_shader_stage stage;

	/* 4 channels per varying slot. */
	LLVMValueRef inputs[RADEON_LLVM_MAX_INPUTS * 4];

	bool is_gs_copy_shader;
	LLVMValueRef gs_next_vertex[4];
	LLVMValueRef gs_curprim_verts[4];
	LLVMValueRef gs_generated_prims[4];
	LLVMValueRef gs_ngg_emit;
	LLVMValueRef gs_ngg_scratch;
	unsigned gs_max_out_vertices;
	unsigned gs_output_prim;

	unsigned tes_primitive_mode;

	/* TCS bookkeeping used by the LDS layout helpers below. */
	uint32_t tcs_patch_outputs_read;
	uint64_t tcs_outputs_read;
	uint32_t tcs_vertices_per_patch;
	uint32_t tcs_num_inputs;
	uint32_t tcs_num_patches;
	uint32_t max_gsvs_emit_size;
	uint32_t gsvs_vertex_size;

	LLVMValueRef vertexptr; /* GFX10 only */
/* One shader output to export: up to 4 channel values.
 * NOTE(review): the remaining members and the closing "};" of this struct
 * are not visible in this extraction — confirm against the full source. */
struct radv_shader_output_values {
	LLVMValueRef values[4];
/* LLVM AMDGPU calling-convention IDs (must match the LLVM backend). */
enum radeon_llvm_calling_convention {
	RADEON_LLVM_AMDGPU_VS = 87,
	RADEON_LLVM_AMDGPU_GS = 88,
	RADEON_LLVM_AMDGPU_PS = 89,
	RADEON_LLVM_AMDGPU_CS = 90,
	RADEON_LLVM_AMDGPU_HS = 93,
};
138 static inline struct radv_shader_context
*
139 radv_shader_context_from_abi(struct ac_shader_abi
*abi
)
141 struct radv_shader_context
*ctx
= NULL
;
142 return container_of(abi
, ctx
, abi
);
145 struct ac_build_if_state
147 struct radv_shader_context
*ctx
;
148 LLVMValueRef condition
;
149 LLVMBasicBlockRef entry_block
;
150 LLVMBasicBlockRef true_block
;
151 LLVMBasicBlockRef false_block
;
152 LLVMBasicBlockRef merge_block
;
155 static LLVMBasicBlockRef
156 ac_build_insert_new_block(struct radv_shader_context
*ctx
, const char *name
)
158 LLVMBasicBlockRef current_block
;
159 LLVMBasicBlockRef next_block
;
160 LLVMBasicBlockRef new_block
;
162 /* get current basic block */
163 current_block
= LLVMGetInsertBlock(ctx
->ac
.builder
);
165 /* chqeck if there's another block after this one */
166 next_block
= LLVMGetNextBasicBlock(current_block
);
168 /* insert the new block before the next block */
169 new_block
= LLVMInsertBasicBlockInContext(ctx
->context
, next_block
, name
);
172 /* append new block after current block */
173 LLVMValueRef function
= LLVMGetBasicBlockParent(current_block
);
174 new_block
= LLVMAppendBasicBlockInContext(ctx
->context
, function
, name
);
180 ac_nir_build_if(struct ac_build_if_state
*ifthen
,
181 struct radv_shader_context
*ctx
,
182 LLVMValueRef condition
)
184 LLVMBasicBlockRef block
= LLVMGetInsertBlock(ctx
->ac
.builder
);
186 memset(ifthen
, 0, sizeof *ifthen
);
188 ifthen
->condition
= condition
;
189 ifthen
->entry_block
= block
;
191 /* create endif/merge basic block for the phi functions */
192 ifthen
->merge_block
= ac_build_insert_new_block(ctx
, "endif-block");
194 /* create/insert true_block before merge_block */
196 LLVMInsertBasicBlockInContext(ctx
->context
,
200 /* successive code goes into the true block */
201 LLVMPositionBuilderAtEnd(ctx
->ac
.builder
, ifthen
->true_block
);
208 ac_nir_build_endif(struct ac_build_if_state
*ifthen
)
210 LLVMBuilderRef builder
= ifthen
->ctx
->ac
.builder
;
212 /* Insert branch to the merge block from current block */
213 LLVMBuildBr(builder
, ifthen
->merge_block
);
216 * Now patch in the various branch instructions.
219 /* Insert the conditional branch instruction at the end of entry_block */
220 LLVMPositionBuilderAtEnd(builder
, ifthen
->entry_block
);
221 if (ifthen
->false_block
) {
222 /* we have an else clause */
223 LLVMBuildCondBr(builder
, ifthen
->condition
,
224 ifthen
->true_block
, ifthen
->false_block
);
228 LLVMBuildCondBr(builder
, ifthen
->condition
,
229 ifthen
->true_block
, ifthen
->merge_block
);
232 /* Resume building code at end of the ifthen->merge_block */
233 LLVMPositionBuilderAtEnd(builder
, ifthen
->merge_block
);
237 static LLVMValueRef
get_rel_patch_id(struct radv_shader_context
*ctx
)
239 switch (ctx
->stage
) {
240 case MESA_SHADER_TESS_CTRL
:
241 return ac_unpack_param(&ctx
->ac
, ctx
->abi
.tcs_rel_ids
, 0, 8);
242 case MESA_SHADER_TESS_EVAL
:
243 return ctx
->tes_rel_patch_id
;
246 unreachable("Illegal stage");
251 get_tcs_num_patches(struct radv_shader_context
*ctx
)
253 unsigned num_tcs_input_cp
= ctx
->options
->key
.tcs
.input_vertices
;
254 unsigned num_tcs_output_cp
= ctx
->tcs_vertices_per_patch
;
255 uint32_t input_vertex_size
= ctx
->tcs_num_inputs
* 16;
256 uint32_t input_patch_size
= ctx
->options
->key
.tcs
.input_vertices
* input_vertex_size
;
257 uint32_t num_tcs_outputs
= util_last_bit64(ctx
->shader_info
->info
.tcs
.outputs_written
);
258 uint32_t num_tcs_patch_outputs
= util_last_bit64(ctx
->shader_info
->info
.tcs
.patch_outputs_written
);
259 uint32_t output_vertex_size
= num_tcs_outputs
* 16;
260 uint32_t pervertex_output_patch_size
= ctx
->tcs_vertices_per_patch
* output_vertex_size
;
261 uint32_t output_patch_size
= pervertex_output_patch_size
+ num_tcs_patch_outputs
* 16;
262 unsigned num_patches
;
263 unsigned hardware_lds_size
;
265 /* Ensure that we only need one wave per SIMD so we don't need to check
266 * resource usage. Also ensures that the number of tcs in and out
267 * vertices per threadgroup are at most 256.
269 num_patches
= 64 / MAX2(num_tcs_input_cp
, num_tcs_output_cp
) * 4;
270 /* Make sure that the data fits in LDS. This assumes the shaders only
271 * use LDS for the inputs and outputs.
273 hardware_lds_size
= 32768;
275 /* Looks like STONEY hangs if we use more than 32 KiB LDS in a single
276 * threadgroup, even though there is more than 32 KiB LDS.
278 * Test: dEQP-VK.tessellation.shader_input_output.barrier
280 if (ctx
->options
->chip_class
>= GFX7
&& ctx
->options
->family
!= CHIP_STONEY
)
281 hardware_lds_size
= 65536;
283 num_patches
= MIN2(num_patches
, hardware_lds_size
/ (input_patch_size
+ output_patch_size
));
284 /* Make sure the output data fits in the offchip buffer */
285 num_patches
= MIN2(num_patches
, (ctx
->options
->tess_offchip_block_dw_size
* 4) / output_patch_size
);
286 /* Not necessary for correctness, but improves performance. The
287 * specific value is taken from the proprietary driver.
289 num_patches
= MIN2(num_patches
, 40);
291 /* GFX6 bug workaround - limit LS-HS threadgroups to only one wave. */
292 if (ctx
->options
->chip_class
== GFX6
) {
293 unsigned one_wave
= ctx
->options
->wave_size
/ MAX2(num_tcs_input_cp
, num_tcs_output_cp
);
294 num_patches
= MIN2(num_patches
, one_wave
);
/* Compute the LDS footprint of the LS/HS threadgroup: per-patch TCS inputs
 * followed by per-patch TCS outputs (vertex outputs then patch outputs).
 * NOTE(review): this extraction is missing the function's return type,
 * the declaration of lds_size, and the tail of the body (whatever consumes
 * lds_size) — confirm against the full source. */
calculate_tess_lds_size(struct radv_shader_context *ctx)
	unsigned num_tcs_input_cp = ctx->options->key.tcs.input_vertices;
	unsigned num_tcs_output_cp;
	unsigned num_tcs_outputs, num_tcs_patch_outputs;
	unsigned input_vertex_size, output_vertex_size;
	unsigned input_patch_size, output_patch_size;
	unsigned pervertex_output_patch_size;
	unsigned output_patch0_offset;
	unsigned num_patches;

	num_tcs_output_cp = ctx->tcs_vertices_per_patch;
	/* util_last_bit64 turns the written-slot masks into slot counts. */
	num_tcs_outputs = util_last_bit64(ctx->shader_info->info.tcs.outputs_written);
	num_tcs_patch_outputs = util_last_bit64(ctx->shader_info->info.tcs.patch_outputs_written);

	/* 16 bytes per vec4 slot. */
	input_vertex_size = ctx->tcs_num_inputs * 16;
	output_vertex_size = num_tcs_outputs * 16;

	input_patch_size = num_tcs_input_cp * input_vertex_size;

	pervertex_output_patch_size = num_tcs_output_cp * output_vertex_size;
	output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16;

	num_patches = ctx->tcs_num_patches;
	/* Outputs start after all patches' inputs. */
	output_patch0_offset = input_patch_size * num_patches;

	lds_size = output_patch0_offset + output_patch_size * num_patches;
/* Tessellation shaders pass outputs to the next shader using LDS.
 *
 * LS outputs = TCS inputs
 * TCS outputs = TES inputs
 *
 * The LDS layout is:
 * - TCS inputs for patch 0
 * - TCS inputs for patch 1
 * - TCS inputs for patch 2 = get_tcs_in_current_patch_offset (if RelPatchID==2)
 * - ...
 * - TCS outputs for patch 0 = get_tcs_out_patch0_offset
 * - Per-patch TCS outputs for patch 0 = get_tcs_out_patch0_patch_data_offset
 * - TCS outputs for patch 1
 * - Per-patch TCS outputs for patch 1
 * - TCS outputs for patch 2 = get_tcs_out_current_patch_offset (if RelPatchID==2)
 * - Per-patch TCS outputs for patch 2 = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
 *
 * All three shaders VS(LS), TCS, TES share the same LDS space.
 */
352 get_tcs_in_patch_stride(struct radv_shader_context
*ctx
)
354 assert (ctx
->stage
== MESA_SHADER_TESS_CTRL
);
355 uint32_t input_vertex_size
= ctx
->tcs_num_inputs
* 16;
356 uint32_t input_patch_size
= ctx
->options
->key
.tcs
.input_vertices
* input_vertex_size
;
358 input_patch_size
/= 4;
359 return LLVMConstInt(ctx
->ac
.i32
, input_patch_size
, false);
363 get_tcs_out_patch_stride(struct radv_shader_context
*ctx
)
365 uint32_t num_tcs_outputs
= util_last_bit64(ctx
->shader_info
->info
.tcs
.outputs_written
);
366 uint32_t num_tcs_patch_outputs
= util_last_bit64(ctx
->shader_info
->info
.tcs
.patch_outputs_written
);
367 uint32_t output_vertex_size
= num_tcs_outputs
* 16;
368 uint32_t pervertex_output_patch_size
= ctx
->tcs_vertices_per_patch
* output_vertex_size
;
369 uint32_t output_patch_size
= pervertex_output_patch_size
+ num_tcs_patch_outputs
* 16;
370 output_patch_size
/= 4;
371 return LLVMConstInt(ctx
->ac
.i32
, output_patch_size
, false);
375 get_tcs_out_vertex_stride(struct radv_shader_context
*ctx
)
377 uint32_t num_tcs_outputs
= util_last_bit64(ctx
->shader_info
->info
.tcs
.outputs_written
);
378 uint32_t output_vertex_size
= num_tcs_outputs
* 16;
379 output_vertex_size
/= 4;
380 return LLVMConstInt(ctx
->ac
.i32
, output_vertex_size
, false);
384 get_tcs_out_patch0_offset(struct radv_shader_context
*ctx
)
386 assert (ctx
->stage
== MESA_SHADER_TESS_CTRL
);
387 uint32_t input_vertex_size
= ctx
->tcs_num_inputs
* 16;
388 uint32_t input_patch_size
= ctx
->options
->key
.tcs
.input_vertices
* input_vertex_size
;
389 uint32_t output_patch0_offset
= input_patch_size
;
390 unsigned num_patches
= ctx
->tcs_num_patches
;
392 output_patch0_offset
*= num_patches
;
393 output_patch0_offset
/= 4;
394 return LLVMConstInt(ctx
->ac
.i32
, output_patch0_offset
, false);
398 get_tcs_out_patch0_patch_data_offset(struct radv_shader_context
*ctx
)
400 assert (ctx
->stage
== MESA_SHADER_TESS_CTRL
);
401 uint32_t input_vertex_size
= ctx
->tcs_num_inputs
* 16;
402 uint32_t input_patch_size
= ctx
->options
->key
.tcs
.input_vertices
* input_vertex_size
;
403 uint32_t output_patch0_offset
= input_patch_size
;
405 uint32_t num_tcs_outputs
= util_last_bit64(ctx
->shader_info
->info
.tcs
.outputs_written
);
406 uint32_t output_vertex_size
= num_tcs_outputs
* 16;
407 uint32_t pervertex_output_patch_size
= ctx
->tcs_vertices_per_patch
* output_vertex_size
;
408 unsigned num_patches
= ctx
->tcs_num_patches
;
410 output_patch0_offset
*= num_patches
;
411 output_patch0_offset
+= pervertex_output_patch_size
;
412 output_patch0_offset
/= 4;
413 return LLVMConstInt(ctx
->ac
.i32
, output_patch0_offset
, false);
417 get_tcs_in_current_patch_offset(struct radv_shader_context
*ctx
)
419 LLVMValueRef patch_stride
= get_tcs_in_patch_stride(ctx
);
420 LLVMValueRef rel_patch_id
= get_rel_patch_id(ctx
);
422 return LLVMBuildMul(ctx
->ac
.builder
, patch_stride
, rel_patch_id
, "");
426 get_tcs_out_current_patch_offset(struct radv_shader_context
*ctx
)
428 LLVMValueRef patch0_offset
= get_tcs_out_patch0_offset(ctx
);
429 LLVMValueRef patch_stride
= get_tcs_out_patch_stride(ctx
);
430 LLVMValueRef rel_patch_id
= get_rel_patch_id(ctx
);
432 return ac_build_imad(&ctx
->ac
, patch_stride
, rel_patch_id
,
437 get_tcs_out_current_patch_data_offset(struct radv_shader_context
*ctx
)
439 LLVMValueRef patch0_patch_data_offset
=
440 get_tcs_out_patch0_patch_data_offset(ctx
);
441 LLVMValueRef patch_stride
= get_tcs_out_patch_stride(ctx
);
442 LLVMValueRef rel_patch_id
= get_rel_patch_id(ctx
);
444 return ac_build_imad(&ctx
->ac
, patch_stride
, rel_patch_id
,
445 patch0_patch_data_offset
);
450 LLVMTypeRef types
[MAX_ARGS
];
451 LLVMValueRef
*assign
[MAX_ARGS
];
454 uint8_t num_sgprs_used
;
455 uint8_t num_vgprs_used
;
/* Register file an argument is passed in. */
enum ac_arg_regfile {
	ARG_SGPR,
	ARG_VGPR,
};
464 add_arg(struct arg_info
*info
, enum ac_arg_regfile regfile
, LLVMTypeRef type
,
465 LLVMValueRef
*param_ptr
)
467 assert(info
->count
< MAX_ARGS
);
469 info
->assign
[info
->count
] = param_ptr
;
470 info
->types
[info
->count
] = type
;
473 if (regfile
== ARG_SGPR
) {
474 info
->num_sgprs_used
+= ac_get_type_size(type
) / 4;
477 assert(regfile
== ARG_VGPR
);
478 info
->num_vgprs_used
+= ac_get_type_size(type
) / 4;
482 static void assign_arguments(LLVMValueRef main_function
,
483 struct arg_info
*info
)
486 for (i
= 0; i
< info
->count
; i
++) {
488 *info
->assign
[i
] = LLVMGetParam(main_function
, i
);
493 create_llvm_function(LLVMContextRef ctx
, LLVMModuleRef module
,
494 LLVMBuilderRef builder
, LLVMTypeRef
*return_types
,
495 unsigned num_return_elems
,
496 struct arg_info
*args
,
497 unsigned max_workgroup_size
,
498 const struct radv_nir_compiler_options
*options
)
500 LLVMTypeRef main_function_type
, ret_type
;
501 LLVMBasicBlockRef main_function_body
;
503 if (num_return_elems
)
504 ret_type
= LLVMStructTypeInContext(ctx
, return_types
,
505 num_return_elems
, true);
507 ret_type
= LLVMVoidTypeInContext(ctx
);
509 /* Setup the function */
511 LLVMFunctionType(ret_type
, args
->types
, args
->count
, 0);
512 LLVMValueRef main_function
=
513 LLVMAddFunction(module
, "main", main_function_type
);
515 LLVMAppendBasicBlockInContext(ctx
, main_function
, "main_body");
516 LLVMPositionBuilderAtEnd(builder
, main_function_body
);
518 LLVMSetFunctionCallConv(main_function
, RADEON_LLVM_AMDGPU_CS
);
519 for (unsigned i
= 0; i
< args
->sgpr_count
; ++i
) {
520 LLVMValueRef P
= LLVMGetParam(main_function
, i
);
522 ac_add_function_attr(ctx
, main_function
, i
+ 1, AC_FUNC_ATTR_INREG
);
524 if (LLVMGetTypeKind(LLVMTypeOf(P
)) == LLVMPointerTypeKind
) {
525 ac_add_function_attr(ctx
, main_function
, i
+ 1, AC_FUNC_ATTR_NOALIAS
);
526 ac_add_attr_dereferenceable(P
, UINT64_MAX
);
530 if (options
->address32_hi
) {
531 ac_llvm_add_target_dep_function_attr(main_function
,
532 "amdgpu-32bit-address-high-bits",
533 options
->address32_hi
);
536 ac_llvm_set_workgroup_size(main_function
, max_workgroup_size
);
538 if (options
->unsafe_math
) {
539 /* These were copied from some LLVM test. */
540 LLVMAddTargetDependentFunctionAttr(main_function
,
541 "less-precise-fpmad",
543 LLVMAddTargetDependentFunctionAttr(main_function
,
546 LLVMAddTargetDependentFunctionAttr(main_function
,
549 LLVMAddTargetDependentFunctionAttr(main_function
,
552 LLVMAddTargetDependentFunctionAttr(main_function
,
553 "no-signed-zeros-fp-math",
556 return main_function
;
561 set_loc(struct radv_userdata_info
*ud_info
, uint8_t *sgpr_idx
,
564 ud_info
->sgpr_idx
= *sgpr_idx
;
565 ud_info
->num_sgprs
= num_sgprs
;
566 *sgpr_idx
+= num_sgprs
;
570 set_loc_shader(struct radv_shader_context
*ctx
, int idx
, uint8_t *sgpr_idx
,
573 struct radv_userdata_info
*ud_info
=
574 &ctx
->shader_info
->user_sgprs_locs
.shader_data
[idx
];
577 set_loc(ud_info
, sgpr_idx
, num_sgprs
);
581 set_loc_shader_ptr(struct radv_shader_context
*ctx
, int idx
, uint8_t *sgpr_idx
)
583 bool use_32bit_pointers
= idx
!= AC_UD_SCRATCH_RING_OFFSETS
;
585 set_loc_shader(ctx
, idx
, sgpr_idx
, use_32bit_pointers
? 1 : 2);
589 set_loc_desc(struct radv_shader_context
*ctx
, int idx
, uint8_t *sgpr_idx
)
591 struct radv_userdata_locations
*locs
=
592 &ctx
->shader_info
->user_sgprs_locs
;
593 struct radv_userdata_info
*ud_info
= &locs
->descriptor_sets
[idx
];
596 set_loc(ud_info
, sgpr_idx
, 1);
598 locs
->descriptor_sets_enabled
|= 1 << idx
;
/* Result of user-SGPR budgeting for one shader stage. */
struct user_sgpr_info {
	bool need_ring_offsets;			/* scratch/ring offsets SGPR pair needed */
	bool indirect_all_descriptor_sets;	/* sets don't fit; load them indirectly */
	uint8_t remaining_sgprs;		/* SGPRs left for inline push constants */
};
607 static bool needs_view_index_sgpr(struct radv_shader_context
*ctx
,
608 gl_shader_stage stage
)
611 case MESA_SHADER_VERTEX
:
612 if (ctx
->shader_info
->info
.needs_multiview_view_index
||
613 (!ctx
->options
->key
.vs_common_out
.as_es
&& !ctx
->options
->key
.vs_common_out
.as_ls
&& ctx
->options
->key
.has_multiview_view_index
))
616 case MESA_SHADER_TESS_EVAL
:
617 if (ctx
->shader_info
->info
.needs_multiview_view_index
|| (!ctx
->options
->key
.vs_common_out
.as_es
&& ctx
->options
->key
.has_multiview_view_index
))
620 case MESA_SHADER_GEOMETRY
:
621 case MESA_SHADER_TESS_CTRL
:
622 if (ctx
->shader_info
->info
.needs_multiview_view_index
)
632 count_vs_user_sgprs(struct radv_shader_context
*ctx
)
636 if (ctx
->shader_info
->info
.vs
.has_vertex_buffers
)
638 count
+= ctx
->shader_info
->info
.vs
.needs_draw_id
? 3 : 2;
643 static void allocate_inline_push_consts(struct radv_shader_context
*ctx
,
644 struct user_sgpr_info
*user_sgpr_info
)
646 uint8_t remaining_sgprs
= user_sgpr_info
->remaining_sgprs
;
648 /* Only supported if shaders use push constants. */
649 if (ctx
->shader_info
->info
.min_push_constant_used
== UINT8_MAX
)
652 /* Only supported if shaders don't have indirect push constants. */
653 if (ctx
->shader_info
->info
.has_indirect_push_constants
)
656 /* Only supported for 32-bit push constants. */
657 if (!ctx
->shader_info
->info
.has_only_32bit_push_constants
)
660 uint8_t num_push_consts
=
661 (ctx
->shader_info
->info
.max_push_constant_used
-
662 ctx
->shader_info
->info
.min_push_constant_used
) / 4;
664 /* Check if the number of user SGPRs is large enough. */
665 if (num_push_consts
< remaining_sgprs
) {
666 ctx
->shader_info
->info
.num_inline_push_consts
= num_push_consts
;
668 ctx
->shader_info
->info
.num_inline_push_consts
= remaining_sgprs
;
671 /* Clamp to the maximum number of allowed inlined push constants. */
672 if (ctx
->shader_info
->info
.num_inline_push_consts
> AC_MAX_INLINE_PUSH_CONSTS
)
673 ctx
->shader_info
->info
.num_inline_push_consts
= AC_MAX_INLINE_PUSH_CONSTS
;
675 if (ctx
->shader_info
->info
.num_inline_push_consts
== num_push_consts
&&
676 !ctx
->shader_info
->info
.loads_dynamic_offsets
) {
677 /* Disable the default push constants path if all constants are
678 * inlined and if shaders don't use dynamic descriptors.
680 ctx
->shader_info
->info
.loads_push_constants
= false;
683 ctx
->shader_info
->info
.base_inline_push_consts
=
684 ctx
->shader_info
->info
.min_push_constant_used
/ 4;
687 static void allocate_user_sgprs(struct radv_shader_context
*ctx
,
688 gl_shader_stage stage
,
689 bool has_previous_stage
,
690 gl_shader_stage previous_stage
,
691 bool needs_view_index
,
692 struct user_sgpr_info
*user_sgpr_info
)
694 uint8_t user_sgpr_count
= 0;
696 memset(user_sgpr_info
, 0, sizeof(struct user_sgpr_info
));
698 /* until we sort out scratch/global buffers always assign ring offsets for gs/vs/es */
699 if (stage
== MESA_SHADER_GEOMETRY
||
700 stage
== MESA_SHADER_VERTEX
||
701 stage
== MESA_SHADER_TESS_CTRL
||
702 stage
== MESA_SHADER_TESS_EVAL
||
703 ctx
->is_gs_copy_shader
)
704 user_sgpr_info
->need_ring_offsets
= true;
706 if (stage
== MESA_SHADER_FRAGMENT
&&
707 ctx
->shader_info
->info
.ps
.needs_sample_positions
)
708 user_sgpr_info
->need_ring_offsets
= true;
710 /* 2 user sgprs will nearly always be allocated for scratch/rings */
711 if (ctx
->options
->supports_spill
|| user_sgpr_info
->need_ring_offsets
) {
712 user_sgpr_count
+= 2;
716 case MESA_SHADER_COMPUTE
:
717 if (ctx
->shader_info
->info
.cs
.uses_grid_size
)
718 user_sgpr_count
+= 3;
720 case MESA_SHADER_FRAGMENT
:
721 user_sgpr_count
+= ctx
->shader_info
->info
.ps
.needs_sample_positions
;
723 case MESA_SHADER_VERTEX
:
724 if (!ctx
->is_gs_copy_shader
)
725 user_sgpr_count
+= count_vs_user_sgprs(ctx
);
727 case MESA_SHADER_TESS_CTRL
:
728 if (has_previous_stage
) {
729 if (previous_stage
== MESA_SHADER_VERTEX
)
730 user_sgpr_count
+= count_vs_user_sgprs(ctx
);
733 case MESA_SHADER_TESS_EVAL
:
735 case MESA_SHADER_GEOMETRY
:
736 if (has_previous_stage
) {
737 if (previous_stage
== MESA_SHADER_VERTEX
) {
738 user_sgpr_count
+= count_vs_user_sgprs(ctx
);
746 if (needs_view_index
)
749 if (ctx
->shader_info
->info
.loads_push_constants
)
752 if (ctx
->streamout_buffers
)
755 uint32_t available_sgprs
= ctx
->options
->chip_class
>= GFX9
&& stage
!= MESA_SHADER_COMPUTE
? 32 : 16;
756 uint32_t remaining_sgprs
= available_sgprs
- user_sgpr_count
;
757 uint32_t num_desc_set
=
758 util_bitcount(ctx
->shader_info
->info
.desc_set_used_mask
);
760 if (remaining_sgprs
< num_desc_set
) {
761 user_sgpr_info
->indirect_all_descriptor_sets
= true;
762 user_sgpr_info
->remaining_sgprs
= remaining_sgprs
- 1;
764 user_sgpr_info
->remaining_sgprs
= remaining_sgprs
- num_desc_set
;
767 allocate_inline_push_consts(ctx
, user_sgpr_info
);
771 declare_global_input_sgprs(struct radv_shader_context
*ctx
,
772 const struct user_sgpr_info
*user_sgpr_info
,
773 struct arg_info
*args
,
774 LLVMValueRef
*desc_sets
)
776 LLVMTypeRef type
= ac_array_in_const32_addr_space(ctx
->ac
.i8
);
778 /* 1 for each descriptor set */
779 if (!user_sgpr_info
->indirect_all_descriptor_sets
) {
780 uint32_t mask
= ctx
->shader_info
->info
.desc_set_used_mask
;
783 int i
= u_bit_scan(&mask
);
785 add_arg(args
, ARG_SGPR
, type
, &ctx
->descriptor_sets
[i
]);
788 add_arg(args
, ARG_SGPR
, ac_array_in_const32_addr_space(type
),
792 if (ctx
->shader_info
->info
.loads_push_constants
) {
793 /* 1 for push constants and dynamic descriptors */
794 add_arg(args
, ARG_SGPR
, type
, &ctx
->abi
.push_constants
);
797 for (unsigned i
= 0; i
< ctx
->shader_info
->info
.num_inline_push_consts
; i
++) {
798 add_arg(args
, ARG_SGPR
, ctx
->ac
.i32
,
799 &ctx
->abi
.inline_push_consts
[i
]);
801 ctx
->abi
.num_inline_push_consts
= ctx
->shader_info
->info
.num_inline_push_consts
;
802 ctx
->abi
.base_inline_push_consts
= ctx
->shader_info
->info
.base_inline_push_consts
;
804 if (ctx
->shader_info
->info
.so
.num_outputs
) {
805 add_arg(args
, ARG_SGPR
,
806 ac_array_in_const32_addr_space(ctx
->ac
.v4i32
),
807 &ctx
->streamout_buffers
);
812 declare_vs_specific_input_sgprs(struct radv_shader_context
*ctx
,
813 gl_shader_stage stage
,
814 bool has_previous_stage
,
815 gl_shader_stage previous_stage
,
816 struct arg_info
*args
)
818 if (!ctx
->is_gs_copy_shader
&&
819 (stage
== MESA_SHADER_VERTEX
||
820 (has_previous_stage
&& previous_stage
== MESA_SHADER_VERTEX
))) {
821 if (ctx
->shader_info
->info
.vs
.has_vertex_buffers
) {
822 add_arg(args
, ARG_SGPR
,
823 ac_array_in_const32_addr_space(ctx
->ac
.v4i32
),
824 &ctx
->vertex_buffers
);
826 add_arg(args
, ARG_SGPR
, ctx
->ac
.i32
, &ctx
->abi
.base_vertex
);
827 add_arg(args
, ARG_SGPR
, ctx
->ac
.i32
, &ctx
->abi
.start_instance
);
828 if (ctx
->shader_info
->info
.vs
.needs_draw_id
) {
829 add_arg(args
, ARG_SGPR
, ctx
->ac
.i32
, &ctx
->abi
.draw_id
);
835 declare_vs_input_vgprs(struct radv_shader_context
*ctx
, struct arg_info
*args
)
837 add_arg(args
, ARG_VGPR
, ctx
->ac
.i32
, &ctx
->abi
.vertex_id
);
838 if (!ctx
->is_gs_copy_shader
) {
839 if (ctx
->options
->key
.vs_common_out
.as_ls
) {
840 add_arg(args
, ARG_VGPR
, ctx
->ac
.i32
, &ctx
->rel_auto_id
);
841 if (ctx
->ac
.chip_class
>= GFX10
) {
842 add_arg(args
, ARG_VGPR
, ctx
->ac
.i32
, NULL
); /* user vgpr */
843 add_arg(args
, ARG_VGPR
, ctx
->ac
.i32
, &ctx
->abi
.instance_id
);
845 add_arg(args
, ARG_VGPR
, ctx
->ac
.i32
, &ctx
->abi
.instance_id
);
846 add_arg(args
, ARG_VGPR
, ctx
->ac
.i32
, NULL
); /* unused */
849 if (ctx
->ac
.chip_class
>= GFX10
) {
850 if (ctx
->options
->key
.vs_common_out
.as_ngg
) {
851 add_arg(args
, ARG_VGPR
, ctx
->ac
.i32
, NULL
); /* user vgpr */
852 add_arg(args
, ARG_VGPR
, ctx
->ac
.i32
, NULL
); /* user vgpr */
853 add_arg(args
, ARG_VGPR
, ctx
->ac
.i32
, &ctx
->abi
.instance_id
);
855 add_arg(args
, ARG_VGPR
, ctx
->ac
.i32
, NULL
); /* unused */
856 add_arg(args
, ARG_VGPR
, ctx
->ac
.i32
, &ctx
->vs_prim_id
);
857 add_arg(args
, ARG_VGPR
, ctx
->ac
.i32
, &ctx
->abi
.instance_id
);
860 add_arg(args
, ARG_VGPR
, ctx
->ac
.i32
, &ctx
->abi
.instance_id
);
861 add_arg(args
, ARG_VGPR
, ctx
->ac
.i32
, &ctx
->vs_prim_id
);
862 add_arg(args
, ARG_VGPR
, ctx
->ac
.i32
, NULL
); /* unused */
869 declare_streamout_sgprs(struct radv_shader_context
*ctx
, gl_shader_stage stage
,
870 struct arg_info
*args
)
874 /* Streamout SGPRs. */
875 if (ctx
->shader_info
->info
.so
.num_outputs
) {
876 assert(stage
== MESA_SHADER_VERTEX
||
877 stage
== MESA_SHADER_TESS_EVAL
);
879 if (stage
!= MESA_SHADER_TESS_EVAL
) {
880 add_arg(args
, ARG_SGPR
, ctx
->ac
.i32
, &ctx
->streamout_config
);
882 args
->assign
[args
->count
- 1] = &ctx
->streamout_config
;
883 args
->types
[args
->count
- 1] = ctx
->ac
.i32
;
886 add_arg(args
, ARG_SGPR
, ctx
->ac
.i32
, &ctx
->streamout_write_idx
);
889 /* A streamout buffer offset is loaded if the stride is non-zero. */
890 for (i
= 0; i
< 4; i
++) {
891 if (!ctx
->shader_info
->info
.so
.strides
[i
])
894 add_arg(args
, ARG_SGPR
, ctx
->ac
.i32
, &ctx
->streamout_offset
[i
]);
899 declare_tes_input_vgprs(struct radv_shader_context
*ctx
, struct arg_info
*args
)
901 add_arg(args
, ARG_VGPR
, ctx
->ac
.f32
, &ctx
->tes_u
);
902 add_arg(args
, ARG_VGPR
, ctx
->ac
.f32
, &ctx
->tes_v
);
903 add_arg(args
, ARG_VGPR
, ctx
->ac
.i32
, &ctx
->tes_rel_patch_id
);
904 add_arg(args
, ARG_VGPR
, ctx
->ac
.i32
, &ctx
->abi
.tes_patch_id
);
908 set_global_input_locs(struct radv_shader_context
*ctx
,
909 const struct user_sgpr_info
*user_sgpr_info
,
910 LLVMValueRef desc_sets
, uint8_t *user_sgpr_idx
)
912 uint32_t mask
= ctx
->shader_info
->info
.desc_set_used_mask
;
914 if (!user_sgpr_info
->indirect_all_descriptor_sets
) {
916 int i
= u_bit_scan(&mask
);
918 set_loc_desc(ctx
, i
, user_sgpr_idx
);
921 set_loc_shader_ptr(ctx
, AC_UD_INDIRECT_DESCRIPTOR_SETS
,
925 int i
= u_bit_scan(&mask
);
927 ctx
->descriptor_sets
[i
] =
928 ac_build_load_to_sgpr(&ctx
->ac
, desc_sets
,
929 LLVMConstInt(ctx
->ac
.i32
, i
, false));
933 ctx
->shader_info
->need_indirect_descriptor_sets
= true;
936 if (ctx
->shader_info
->info
.loads_push_constants
) {
937 set_loc_shader_ptr(ctx
, AC_UD_PUSH_CONSTANTS
, user_sgpr_idx
);
940 if (ctx
->shader_info
->info
.num_inline_push_consts
) {
941 set_loc_shader(ctx
, AC_UD_INLINE_PUSH_CONSTANTS
, user_sgpr_idx
,
942 ctx
->shader_info
->info
.num_inline_push_consts
);
945 if (ctx
->streamout_buffers
) {
946 set_loc_shader_ptr(ctx
, AC_UD_STREAMOUT_BUFFERS
,
952 set_vs_specific_input_locs(struct radv_shader_context
*ctx
,
953 gl_shader_stage stage
, bool has_previous_stage
,
954 gl_shader_stage previous_stage
,
955 uint8_t *user_sgpr_idx
)
957 if (!ctx
->is_gs_copy_shader
&&
958 (stage
== MESA_SHADER_VERTEX
||
959 (has_previous_stage
&& previous_stage
== MESA_SHADER_VERTEX
))) {
960 if (ctx
->shader_info
->info
.vs
.has_vertex_buffers
) {
961 set_loc_shader_ptr(ctx
, AC_UD_VS_VERTEX_BUFFERS
,
966 if (ctx
->shader_info
->info
.vs
.needs_draw_id
)
969 set_loc_shader(ctx
, AC_UD_VS_BASE_VERTEX_START_INSTANCE
,
970 user_sgpr_idx
, vs_num
);
974 static void set_llvm_calling_convention(LLVMValueRef func
,
975 gl_shader_stage stage
)
977 enum radeon_llvm_calling_convention calling_conv
;
980 case MESA_SHADER_VERTEX
:
981 case MESA_SHADER_TESS_EVAL
:
982 calling_conv
= RADEON_LLVM_AMDGPU_VS
;
984 case MESA_SHADER_GEOMETRY
:
985 calling_conv
= RADEON_LLVM_AMDGPU_GS
;
987 case MESA_SHADER_TESS_CTRL
:
988 calling_conv
= RADEON_LLVM_AMDGPU_HS
;
990 case MESA_SHADER_FRAGMENT
:
991 calling_conv
= RADEON_LLVM_AMDGPU_PS
;
993 case MESA_SHADER_COMPUTE
:
994 calling_conv
= RADEON_LLVM_AMDGPU_CS
;
997 unreachable("Unhandle shader type");
1000 LLVMSetFunctionCallConv(func
, calling_conv
);
1003 /* Returns whether the stage is a stage that can be directly before the GS */
1004 static bool is_pre_gs_stage(gl_shader_stage stage
)
1006 return stage
== MESA_SHADER_VERTEX
|| stage
== MESA_SHADER_TESS_EVAL
;
1009 static void create_function(struct radv_shader_context
*ctx
,
1010 gl_shader_stage stage
,
1011 bool has_previous_stage
,
1012 gl_shader_stage previous_stage
)
1014 uint8_t user_sgpr_idx
;
1015 struct user_sgpr_info user_sgpr_info
;
1016 struct arg_info args
= {};
1017 LLVMValueRef desc_sets
;
1018 bool needs_view_index
= needs_view_index_sgpr(ctx
, stage
);
1020 if (ctx
->ac
.chip_class
>= GFX10
) {
1021 if (is_pre_gs_stage(stage
) && ctx
->options
->key
.vs_common_out
.as_ngg
) {
1022 /* On GFX10, VS is merged into GS for NGG. */
1023 previous_stage
= stage
;
1024 stage
= MESA_SHADER_GEOMETRY
;
1025 has_previous_stage
= true;
1029 allocate_user_sgprs(ctx
, stage
, has_previous_stage
,
1030 previous_stage
, needs_view_index
, &user_sgpr_info
);
1032 if (user_sgpr_info
.need_ring_offsets
&& !ctx
->options
->supports_spill
) {
1033 add_arg(&args
, ARG_SGPR
, ac_array_in_const_addr_space(ctx
->ac
.v4i32
),
1034 &ctx
->ring_offsets
);
1038 case MESA_SHADER_COMPUTE
:
1039 declare_global_input_sgprs(ctx
, &user_sgpr_info
, &args
,
1042 if (ctx
->shader_info
->info
.cs
.uses_grid_size
) {
1043 add_arg(&args
, ARG_SGPR
, ctx
->ac
.v3i32
,
1044 &ctx
->abi
.num_work_groups
);
1047 for (int i
= 0; i
< 3; i
++) {
1048 ctx
->abi
.workgroup_ids
[i
] = NULL
;
1049 if (ctx
->shader_info
->info
.cs
.uses_block_id
[i
]) {
1050 add_arg(&args
, ARG_SGPR
, ctx
->ac
.i32
,
1051 &ctx
->abi
.workgroup_ids
[i
]);
1055 if (ctx
->shader_info
->info
.cs
.uses_local_invocation_idx
)
1056 add_arg(&args
, ARG_SGPR
, ctx
->ac
.i32
, &ctx
->abi
.tg_size
);
1057 add_arg(&args
, ARG_VGPR
, ctx
->ac
.v3i32
,
1058 &ctx
->abi
.local_invocation_ids
);
1060 case MESA_SHADER_VERTEX
:
1061 declare_global_input_sgprs(ctx
, &user_sgpr_info
, &args
,
1064 declare_vs_specific_input_sgprs(ctx
, stage
, has_previous_stage
,
1065 previous_stage
, &args
);
1067 if (needs_view_index
)
1068 add_arg(&args
, ARG_SGPR
, ctx
->ac
.i32
,
1069 &ctx
->abi
.view_index
);
1070 if (ctx
->options
->key
.vs_common_out
.as_es
) {
1071 add_arg(&args
, ARG_SGPR
, ctx
->ac
.i32
,
1072 &ctx
->es2gs_offset
);
1073 } else if (ctx
->options
->key
.vs_common_out
.as_ls
) {
1074 /* no extra parameters */
1076 declare_streamout_sgprs(ctx
, stage
, &args
);
1079 declare_vs_input_vgprs(ctx
, &args
);
1081 case MESA_SHADER_TESS_CTRL
:
1082 if (has_previous_stage
) {
1083 // First 6 system regs
1084 add_arg(&args
, ARG_SGPR
, ctx
->ac
.i32
, &ctx
->oc_lds
);
1085 add_arg(&args
, ARG_SGPR
, ctx
->ac
.i32
,
1086 &ctx
->merged_wave_info
);
1087 add_arg(&args
, ARG_SGPR
, ctx
->ac
.i32
,
1088 &ctx
->tess_factor_offset
);
1090 add_arg(&args
, ARG_SGPR
, ctx
->ac
.i32
, NULL
); // scratch offset
1091 add_arg(&args
, ARG_SGPR
, ctx
->ac
.i32
, NULL
); // unknown
1092 add_arg(&args
, ARG_SGPR
, ctx
->ac
.i32
, NULL
); // unknown
1094 declare_global_input_sgprs(ctx
, &user_sgpr_info
, &args
,
1097 declare_vs_specific_input_sgprs(ctx
, stage
,
1099 previous_stage
, &args
);
1101 if (needs_view_index
)
1102 add_arg(&args
, ARG_SGPR
, ctx
->ac
.i32
,
1103 &ctx
->abi
.view_index
);
1105 add_arg(&args
, ARG_VGPR
, ctx
->ac
.i32
,
1106 &ctx
->abi
.tcs_patch_id
);
1107 add_arg(&args
, ARG_VGPR
, ctx
->ac
.i32
,
1108 &ctx
->abi
.tcs_rel_ids
);
1110 declare_vs_input_vgprs(ctx
, &args
);
1112 declare_global_input_sgprs(ctx
, &user_sgpr_info
, &args
,
1115 if (needs_view_index
)
1116 add_arg(&args
, ARG_SGPR
, ctx
->ac
.i32
,
1117 &ctx
->abi
.view_index
);
1119 add_arg(&args
, ARG_SGPR
, ctx
->ac
.i32
, &ctx
->oc_lds
);
1120 add_arg(&args
, ARG_SGPR
, ctx
->ac
.i32
,
1121 &ctx
->tess_factor_offset
);
1122 add_arg(&args
, ARG_VGPR
, ctx
->ac
.i32
,
1123 &ctx
->abi
.tcs_patch_id
);
1124 add_arg(&args
, ARG_VGPR
, ctx
->ac
.i32
,
1125 &ctx
->abi
.tcs_rel_ids
);
1128 case MESA_SHADER_TESS_EVAL
:
1129 declare_global_input_sgprs(ctx
, &user_sgpr_info
, &args
,
1132 if (needs_view_index
)
1133 add_arg(&args
, ARG_SGPR
, ctx
->ac
.i32
,
1134 &ctx
->abi
.view_index
);
1136 if (ctx
->options
->key
.vs_common_out
.as_es
) {
1137 add_arg(&args
, ARG_SGPR
, ctx
->ac
.i32
, &ctx
->oc_lds
);
1138 add_arg(&args
, ARG_SGPR
, ctx
->ac
.i32
, NULL
);
1139 add_arg(&args
, ARG_SGPR
, ctx
->ac
.i32
,
1140 &ctx
->es2gs_offset
);
1142 add_arg(&args
, ARG_SGPR
, ctx
->ac
.i32
, NULL
);
1143 declare_streamout_sgprs(ctx
, stage
, &args
);
1144 add_arg(&args
, ARG_SGPR
, ctx
->ac
.i32
, &ctx
->oc_lds
);
1146 declare_tes_input_vgprs(ctx
, &args
);
1148 case MESA_SHADER_GEOMETRY
:
1149 if (has_previous_stage
) {
1150 // First 6 system regs
1151 if (ctx
->options
->key
.vs_common_out
.as_ngg
) {
1152 add_arg(&args
, ARG_SGPR
, ctx
->ac
.i32
,
1155 add_arg(&args
, ARG_SGPR
, ctx
->ac
.i32
,
1156 &ctx
->gs2vs_offset
);
1159 add_arg(&args
, ARG_SGPR
, ctx
->ac
.i32
,
1160 &ctx
->merged_wave_info
);
1161 add_arg(&args
, ARG_SGPR
, ctx
->ac
.i32
, &ctx
->oc_lds
);
1163 add_arg(&args
, ARG_SGPR
, ctx
->ac
.i32
, NULL
); // scratch offset
1164 add_arg(&args
, ARG_SGPR
, ctx
->ac
.i32
, NULL
); // unknown
1165 add_arg(&args
, ARG_SGPR
, ctx
->ac
.i32
, NULL
); // unknown
1167 declare_global_input_sgprs(ctx
, &user_sgpr_info
, &args
,
1170 if (previous_stage
!= MESA_SHADER_TESS_EVAL
) {
1171 declare_vs_specific_input_sgprs(ctx
, stage
,
1177 if (needs_view_index
)
1178 add_arg(&args
, ARG_SGPR
, ctx
->ac
.i32
,
1179 &ctx
->abi
.view_index
);
1181 add_arg(&args
, ARG_VGPR
, ctx
->ac
.i32
,
1182 &ctx
->gs_vtx_offset
[0]);
1183 add_arg(&args
, ARG_VGPR
, ctx
->ac
.i32
,
1184 &ctx
->gs_vtx_offset
[2]);
1185 add_arg(&args
, ARG_VGPR
, ctx
->ac
.i32
,
1186 &ctx
->abi
.gs_prim_id
);
1187 add_arg(&args
, ARG_VGPR
, ctx
->ac
.i32
,
1188 &ctx
->abi
.gs_invocation_id
);
1189 add_arg(&args
, ARG_VGPR
, ctx
->ac
.i32
,
1190 &ctx
->gs_vtx_offset
[4]);
1192 if (previous_stage
== MESA_SHADER_VERTEX
) {
1193 declare_vs_input_vgprs(ctx
, &args
);
1195 declare_tes_input_vgprs(ctx
, &args
);
1198 declare_global_input_sgprs(ctx
, &user_sgpr_info
, &args
,
1201 if (needs_view_index
)
1202 add_arg(&args
, ARG_SGPR
, ctx
->ac
.i32
,
1203 &ctx
->abi
.view_index
);
1205 add_arg(&args
, ARG_SGPR
, ctx
->ac
.i32
, &ctx
->gs2vs_offset
);
1206 add_arg(&args
, ARG_SGPR
, ctx
->ac
.i32
, &ctx
->gs_wave_id
);
1207 add_arg(&args
, ARG_VGPR
, ctx
->ac
.i32
,
1208 &ctx
->gs_vtx_offset
[0]);
1209 add_arg(&args
, ARG_VGPR
, ctx
->ac
.i32
,
1210 &ctx
->gs_vtx_offset
[1]);
1211 add_arg(&args
, ARG_VGPR
, ctx
->ac
.i32
,
1212 &ctx
->abi
.gs_prim_id
);
1213 add_arg(&args
, ARG_VGPR
, ctx
->ac
.i32
,
1214 &ctx
->gs_vtx_offset
[2]);
1215 add_arg(&args
, ARG_VGPR
, ctx
->ac
.i32
,
1216 &ctx
->gs_vtx_offset
[3]);
1217 add_arg(&args
, ARG_VGPR
, ctx
->ac
.i32
,
1218 &ctx
->gs_vtx_offset
[4]);
1219 add_arg(&args
, ARG_VGPR
, ctx
->ac
.i32
,
1220 &ctx
->gs_vtx_offset
[5]);
1221 add_arg(&args
, ARG_VGPR
, ctx
->ac
.i32
,
1222 &ctx
->abi
.gs_invocation_id
);
1225 case MESA_SHADER_FRAGMENT
:
1226 declare_global_input_sgprs(ctx
, &user_sgpr_info
, &args
,
1229 add_arg(&args
, ARG_SGPR
, ctx
->ac
.i32
, &ctx
->abi
.prim_mask
);
1230 add_arg(&args
, ARG_VGPR
, ctx
->ac
.v2i32
, &ctx
->abi
.persp_sample
);
1231 add_arg(&args
, ARG_VGPR
, ctx
->ac
.v2i32
, &ctx
->abi
.persp_center
);
1232 add_arg(&args
, ARG_VGPR
, ctx
->ac
.v2i32
, &ctx
->abi
.persp_centroid
);
1233 add_arg(&args
, ARG_VGPR
, ctx
->ac
.v3i32
, NULL
); /* persp pull model */
1234 add_arg(&args
, ARG_VGPR
, ctx
->ac
.v2i32
, &ctx
->abi
.linear_sample
);
1235 add_arg(&args
, ARG_VGPR
, ctx
->ac
.v2i32
, &ctx
->abi
.linear_center
);
1236 add_arg(&args
, ARG_VGPR
, ctx
->ac
.v2i32
, &ctx
->abi
.linear_centroid
);
1237 add_arg(&args
, ARG_VGPR
, ctx
->ac
.f32
, NULL
); /* line stipple tex */
1238 add_arg(&args
, ARG_VGPR
, ctx
->ac
.f32
, &ctx
->abi
.frag_pos
[0]);
1239 add_arg(&args
, ARG_VGPR
, ctx
->ac
.f32
, &ctx
->abi
.frag_pos
[1]);
1240 add_arg(&args
, ARG_VGPR
, ctx
->ac
.f32
, &ctx
->abi
.frag_pos
[2]);
1241 add_arg(&args
, ARG_VGPR
, ctx
->ac
.f32
, &ctx
->abi
.frag_pos
[3]);
1242 add_arg(&args
, ARG_VGPR
, ctx
->ac
.i32
, &ctx
->abi
.front_face
);
1243 add_arg(&args
, ARG_VGPR
, ctx
->ac
.i32
, &ctx
->abi
.ancillary
);
1244 add_arg(&args
, ARG_VGPR
, ctx
->ac
.i32
, &ctx
->abi
.sample_coverage
);
1245 add_arg(&args
, ARG_VGPR
, ctx
->ac
.i32
, NULL
); /* fixed pt */
1248 unreachable("Shader stage not implemented");
1251 ctx
->main_function
= create_llvm_function(
1252 ctx
->context
, ctx
->ac
.module
, ctx
->ac
.builder
, NULL
, 0, &args
,
1253 ctx
->max_workgroup_size
, ctx
->options
);
1254 set_llvm_calling_convention(ctx
->main_function
, stage
);
1257 ctx
->shader_info
->num_input_vgprs
= 0;
1258 ctx
->shader_info
->num_input_sgprs
= ctx
->options
->supports_spill
? 2 : 0;
1260 ctx
->shader_info
->num_input_sgprs
+= args
.num_sgprs_used
;
1262 if (ctx
->stage
!= MESA_SHADER_FRAGMENT
)
1263 ctx
->shader_info
->num_input_vgprs
= args
.num_vgprs_used
;
1265 assign_arguments(ctx
->main_function
, &args
);
1269 if (ctx
->options
->supports_spill
|| user_sgpr_info
.need_ring_offsets
) {
1270 set_loc_shader_ptr(ctx
, AC_UD_SCRATCH_RING_OFFSETS
,
1272 if (ctx
->options
->supports_spill
) {
1273 ctx
->ring_offsets
= ac_build_intrinsic(&ctx
->ac
, "llvm.amdgcn.implicit.buffer.ptr",
1274 LLVMPointerType(ctx
->ac
.i8
, AC_ADDR_SPACE_CONST
),
1275 NULL
, 0, AC_FUNC_ATTR_READNONE
);
1276 ctx
->ring_offsets
= LLVMBuildBitCast(ctx
->ac
.builder
, ctx
->ring_offsets
,
1277 ac_array_in_const_addr_space(ctx
->ac
.v4i32
), "");
1281 /* For merged shaders the user SGPRs start at 8, with 8 system SGPRs in front (including
1282 * the rw_buffers at s0/s1. With user SGPR0 = s8, lets restart the count from 0 */
1283 if (has_previous_stage
)
1286 set_global_input_locs(ctx
, &user_sgpr_info
, desc_sets
, &user_sgpr_idx
);
1289 case MESA_SHADER_COMPUTE
:
1290 if (ctx
->shader_info
->info
.cs
.uses_grid_size
) {
1291 set_loc_shader(ctx
, AC_UD_CS_GRID_SIZE
,
1295 case MESA_SHADER_VERTEX
:
1296 set_vs_specific_input_locs(ctx
, stage
, has_previous_stage
,
1297 previous_stage
, &user_sgpr_idx
);
1298 if (ctx
->abi
.view_index
)
1299 set_loc_shader(ctx
, AC_UD_VIEW_INDEX
, &user_sgpr_idx
, 1);
1301 case MESA_SHADER_TESS_CTRL
:
1302 set_vs_specific_input_locs(ctx
, stage
, has_previous_stage
,
1303 previous_stage
, &user_sgpr_idx
);
1304 if (ctx
->abi
.view_index
)
1305 set_loc_shader(ctx
, AC_UD_VIEW_INDEX
, &user_sgpr_idx
, 1);
1307 case MESA_SHADER_TESS_EVAL
:
1308 if (ctx
->abi
.view_index
)
1309 set_loc_shader(ctx
, AC_UD_VIEW_INDEX
, &user_sgpr_idx
, 1);
1311 case MESA_SHADER_GEOMETRY
:
1312 if (has_previous_stage
) {
1313 if (previous_stage
== MESA_SHADER_VERTEX
)
1314 set_vs_specific_input_locs(ctx
, stage
,
1319 if (ctx
->abi
.view_index
)
1320 set_loc_shader(ctx
, AC_UD_VIEW_INDEX
, &user_sgpr_idx
, 1);
1322 case MESA_SHADER_FRAGMENT
:
1325 unreachable("Shader stage not implemented");
1328 if (stage
== MESA_SHADER_TESS_CTRL
||
1329 (stage
== MESA_SHADER_VERTEX
&& ctx
->options
->key
.vs_common_out
.as_ls
) ||
1330 /* GFX9 has the ESGS ring buffer in LDS. */
1331 (stage
== MESA_SHADER_GEOMETRY
&& has_previous_stage
)) {
1332 ac_declare_lds_as_pointer(&ctx
->ac
);
1335 ctx
->shader_info
->num_user_sgprs
= user_sgpr_idx
;
1340 radv_load_resource(struct ac_shader_abi
*abi
, LLVMValueRef index
,
1341 unsigned desc_set
, unsigned binding
)
1343 struct radv_shader_context
*ctx
= radv_shader_context_from_abi(abi
);
1344 LLVMValueRef desc_ptr
= ctx
->descriptor_sets
[desc_set
];
1345 struct radv_pipeline_layout
*pipeline_layout
= ctx
->options
->layout
;
1346 struct radv_descriptor_set_layout
*layout
= pipeline_layout
->set
[desc_set
].layout
;
1347 unsigned base_offset
= layout
->binding
[binding
].offset
;
1348 LLVMValueRef offset
, stride
;
1350 if (layout
->binding
[binding
].type
== VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC
||
1351 layout
->binding
[binding
].type
== VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC
) {
1352 unsigned idx
= pipeline_layout
->set
[desc_set
].dynamic_offset_start
+
1353 layout
->binding
[binding
].dynamic_offset_offset
;
1354 desc_ptr
= ctx
->abi
.push_constants
;
1355 base_offset
= pipeline_layout
->push_constant_size
+ 16 * idx
;
1356 stride
= LLVMConstInt(ctx
->ac
.i32
, 16, false);
1358 stride
= LLVMConstInt(ctx
->ac
.i32
, layout
->binding
[binding
].size
, false);
1360 offset
= LLVMConstInt(ctx
->ac
.i32
, base_offset
, false);
1362 if (layout
->binding
[binding
].type
!= VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT
) {
1363 offset
= ac_build_imad(&ctx
->ac
, index
, stride
, offset
);
1366 desc_ptr
= LLVMBuildGEP(ctx
->ac
.builder
, desc_ptr
, &offset
, 1, "");
1367 desc_ptr
= ac_cast_ptr(&ctx
->ac
, desc_ptr
, ctx
->ac
.v4i32
);
1368 LLVMSetMetadata(desc_ptr
, ctx
->ac
.uniform_md_kind
, ctx
->ac
.empty_md
);
1370 if (layout
->binding
[binding
].type
== VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT
) {
1371 uint32_t desc_type
= S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X
) |
1372 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y
) |
1373 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z
) |
1374 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W
);
1376 if (ctx
->ac
.chip_class
>= GFX10
) {
1377 desc_type
|= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT
) |
1378 S_008F0C_OOB_SELECT(3) |
1379 S_008F0C_RESOURCE_LEVEL(1);
1381 desc_type
|= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT
) |
1382 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32
);
1385 LLVMValueRef desc_components
[4] = {
1386 LLVMBuildPtrToInt(ctx
->ac
.builder
, desc_ptr
, ctx
->ac
.intptr
, ""),
1387 LLVMConstInt(ctx
->ac
.i32
, S_008F04_BASE_ADDRESS_HI(ctx
->options
->address32_hi
), false),
1388 /* High limit to support variable sizes. */
1389 LLVMConstInt(ctx
->ac
.i32
, 0xffffffff, false),
1390 LLVMConstInt(ctx
->ac
.i32
, desc_type
, false),
1393 return ac_build_gather_values(&ctx
->ac
, desc_components
, 4);
/* The offchip buffer layout for TCS->TES is
 *
 * - attribute 0 of patch 0 vertex 0
 * - attribute 0 of patch 0 vertex 1
 * - attribute 0 of patch 0 vertex 2
 *   ...
 * - attribute 0 of patch 1 vertex 0
 * - attribute 0 of patch 1 vertex 1
 *   ...
 * - attribute 1 of patch 0 vertex 0
 * - attribute 1 of patch 0 vertex 1
 *   ...
 * - per patch attribute 0 of patch 0
 * - per patch attribute 0 of patch 1
 *   ...
 *
 * Note that every attribute has 4 components.
 */
1418 static LLVMValueRef
get_non_vertex_index_offset(struct radv_shader_context
*ctx
)
1420 uint32_t num_patches
= ctx
->tcs_num_patches
;
1421 uint32_t num_tcs_outputs
;
1422 if (ctx
->stage
== MESA_SHADER_TESS_CTRL
)
1423 num_tcs_outputs
= util_last_bit64(ctx
->shader_info
->info
.tcs
.outputs_written
);
1425 num_tcs_outputs
= ctx
->options
->key
.tes
.tcs_num_outputs
;
1427 uint32_t output_vertex_size
= num_tcs_outputs
* 16;
1428 uint32_t pervertex_output_patch_size
= ctx
->tcs_vertices_per_patch
* output_vertex_size
;
1430 return LLVMConstInt(ctx
->ac
.i32
, pervertex_output_patch_size
* num_patches
, false);
1433 static LLVMValueRef
calc_param_stride(struct radv_shader_context
*ctx
,
1434 LLVMValueRef vertex_index
)
1436 LLVMValueRef param_stride
;
1438 param_stride
= LLVMConstInt(ctx
->ac
.i32
, ctx
->tcs_vertices_per_patch
* ctx
->tcs_num_patches
, false);
1440 param_stride
= LLVMConstInt(ctx
->ac
.i32
, ctx
->tcs_num_patches
, false);
1441 return param_stride
;
1444 static LLVMValueRef
get_tcs_tes_buffer_address(struct radv_shader_context
*ctx
,
1445 LLVMValueRef vertex_index
,
1446 LLVMValueRef param_index
)
1448 LLVMValueRef base_addr
;
1449 LLVMValueRef param_stride
, constant16
;
1450 LLVMValueRef rel_patch_id
= get_rel_patch_id(ctx
);
1451 LLVMValueRef vertices_per_patch
= LLVMConstInt(ctx
->ac
.i32
, ctx
->tcs_vertices_per_patch
, false);
1452 constant16
= LLVMConstInt(ctx
->ac
.i32
, 16, false);
1453 param_stride
= calc_param_stride(ctx
, vertex_index
);
1455 base_addr
= ac_build_imad(&ctx
->ac
, rel_patch_id
,
1456 vertices_per_patch
, vertex_index
);
1458 base_addr
= rel_patch_id
;
1461 base_addr
= LLVMBuildAdd(ctx
->ac
.builder
, base_addr
,
1462 LLVMBuildMul(ctx
->ac
.builder
, param_index
,
1463 param_stride
, ""), "");
1465 base_addr
= LLVMBuildMul(ctx
->ac
.builder
, base_addr
, constant16
, "");
1467 if (!vertex_index
) {
1468 LLVMValueRef patch_data_offset
= get_non_vertex_index_offset(ctx
);
1470 base_addr
= LLVMBuildAdd(ctx
->ac
.builder
, base_addr
,
1471 patch_data_offset
, "");
1476 static LLVMValueRef
get_tcs_tes_buffer_address_params(struct radv_shader_context
*ctx
,
1478 unsigned const_index
,
1480 LLVMValueRef vertex_index
,
1481 LLVMValueRef indir_index
)
1483 LLVMValueRef param_index
;
1486 param_index
= LLVMBuildAdd(ctx
->ac
.builder
, LLVMConstInt(ctx
->ac
.i32
, param
, false),
1489 if (const_index
&& !is_compact
)
1490 param
+= const_index
;
1491 param_index
= LLVMConstInt(ctx
->ac
.i32
, param
, false);
1493 return get_tcs_tes_buffer_address(ctx
, vertex_index
, param_index
);
1497 get_dw_address(struct radv_shader_context
*ctx
,
1498 LLVMValueRef dw_addr
,
1500 unsigned const_index
,
1501 bool compact_const_index
,
1502 LLVMValueRef vertex_index
,
1503 LLVMValueRef stride
,
1504 LLVMValueRef indir_index
)
1509 dw_addr
= LLVMBuildAdd(ctx
->ac
.builder
, dw_addr
,
1510 LLVMBuildMul(ctx
->ac
.builder
,
1516 dw_addr
= LLVMBuildAdd(ctx
->ac
.builder
, dw_addr
,
1517 LLVMBuildMul(ctx
->ac
.builder
, indir_index
,
1518 LLVMConstInt(ctx
->ac
.i32
, 4, false), ""), "");
1519 else if (const_index
&& !compact_const_index
)
1520 dw_addr
= LLVMBuildAdd(ctx
->ac
.builder
, dw_addr
,
1521 LLVMConstInt(ctx
->ac
.i32
, const_index
* 4, false), "");
1523 dw_addr
= LLVMBuildAdd(ctx
->ac
.builder
, dw_addr
,
1524 LLVMConstInt(ctx
->ac
.i32
, param
* 4, false), "");
1526 if (const_index
&& compact_const_index
)
1527 dw_addr
= LLVMBuildAdd(ctx
->ac
.builder
, dw_addr
,
1528 LLVMConstInt(ctx
->ac
.i32
, const_index
, false), "");
1533 load_tcs_varyings(struct ac_shader_abi
*abi
,
1535 LLVMValueRef vertex_index
,
1536 LLVMValueRef indir_index
,
1537 unsigned const_index
,
1539 unsigned driver_location
,
1541 unsigned num_components
,
1546 struct radv_shader_context
*ctx
= radv_shader_context_from_abi(abi
);
1547 LLVMValueRef dw_addr
, stride
;
1548 LLVMValueRef value
[4], result
;
1549 unsigned param
= shader_io_get_unique_index(location
);
1552 uint32_t input_vertex_size
= (ctx
->tcs_num_inputs
* 16) / 4;
1553 stride
= LLVMConstInt(ctx
->ac
.i32
, input_vertex_size
, false);
1554 dw_addr
= get_tcs_in_current_patch_offset(ctx
);
1557 stride
= get_tcs_out_vertex_stride(ctx
);
1558 dw_addr
= get_tcs_out_current_patch_offset(ctx
);
1560 dw_addr
= get_tcs_out_current_patch_data_offset(ctx
);
1565 dw_addr
= get_dw_address(ctx
, dw_addr
, param
, const_index
, is_compact
, vertex_index
, stride
,
1568 for (unsigned i
= 0; i
< num_components
+ component
; i
++) {
1569 value
[i
] = ac_lds_load(&ctx
->ac
, dw_addr
);
1570 dw_addr
= LLVMBuildAdd(ctx
->ac
.builder
, dw_addr
,
1573 result
= ac_build_varying_gather_values(&ctx
->ac
, value
, num_components
, component
);
1578 store_tcs_output(struct ac_shader_abi
*abi
,
1579 const nir_variable
*var
,
1580 LLVMValueRef vertex_index
,
1581 LLVMValueRef param_index
,
1582 unsigned const_index
,
1586 struct radv_shader_context
*ctx
= radv_shader_context_from_abi(abi
);
1587 const unsigned location
= var
->data
.location
;
1588 unsigned component
= var
->data
.location_frac
;
1589 const bool is_patch
= var
->data
.patch
;
1590 const bool is_compact
= var
->data
.compact
;
1591 LLVMValueRef dw_addr
;
1592 LLVMValueRef stride
= NULL
;
1593 LLVMValueRef buf_addr
= NULL
;
1595 bool store_lds
= true;
1598 if (!(ctx
->tcs_patch_outputs_read
& (1U << (location
- VARYING_SLOT_PATCH0
))))
1601 if (!(ctx
->tcs_outputs_read
& (1ULL << location
)))
1605 param
= shader_io_get_unique_index(location
);
1606 if ((location
== VARYING_SLOT_CLIP_DIST0
|| location
== VARYING_SLOT_CLIP_DIST1
) && is_compact
) {
1607 const_index
+= component
;
1610 if (const_index
>= 4) {
1617 stride
= get_tcs_out_vertex_stride(ctx
);
1618 dw_addr
= get_tcs_out_current_patch_offset(ctx
);
1620 dw_addr
= get_tcs_out_current_patch_data_offset(ctx
);
1623 dw_addr
= get_dw_address(ctx
, dw_addr
, param
, const_index
, is_compact
, vertex_index
, stride
,
1625 buf_addr
= get_tcs_tes_buffer_address_params(ctx
, param
, const_index
, is_compact
,
1626 vertex_index
, param_index
);
1628 bool is_tess_factor
= false;
1629 if (location
== VARYING_SLOT_TESS_LEVEL_INNER
||
1630 location
== VARYING_SLOT_TESS_LEVEL_OUTER
)
1631 is_tess_factor
= true;
1633 unsigned base
= is_compact
? const_index
: 0;
1634 for (unsigned chan
= 0; chan
< 8; chan
++) {
1635 if (!(writemask
& (1 << chan
)))
1637 LLVMValueRef value
= ac_llvm_extract_elem(&ctx
->ac
, src
, chan
- component
);
1638 value
= ac_to_integer(&ctx
->ac
, value
);
1639 value
= LLVMBuildZExtOrBitCast(ctx
->ac
.builder
, value
, ctx
->ac
.i32
, "");
1641 if (store_lds
|| is_tess_factor
) {
1642 LLVMValueRef dw_addr_chan
=
1643 LLVMBuildAdd(ctx
->ac
.builder
, dw_addr
,
1644 LLVMConstInt(ctx
->ac
.i32
, chan
, false), "");
1645 ac_lds_store(&ctx
->ac
, dw_addr_chan
, value
);
1648 if (!is_tess_factor
&& writemask
!= 0xF)
1649 ac_build_buffer_store_dword(&ctx
->ac
, ctx
->hs_ring_tess_offchip
, value
, 1,
1650 buf_addr
, ctx
->oc_lds
,
1651 4 * (base
+ chan
), ac_glc
, false);
1654 if (writemask
== 0xF) {
1655 ac_build_buffer_store_dword(&ctx
->ac
, ctx
->hs_ring_tess_offchip
, src
, 4,
1656 buf_addr
, ctx
->oc_lds
,
1657 (base
* 4), ac_glc
, false);
1662 load_tes_input(struct ac_shader_abi
*abi
,
1664 LLVMValueRef vertex_index
,
1665 LLVMValueRef param_index
,
1666 unsigned const_index
,
1668 unsigned driver_location
,
1670 unsigned num_components
,
1675 struct radv_shader_context
*ctx
= radv_shader_context_from_abi(abi
);
1676 LLVMValueRef buf_addr
;
1677 LLVMValueRef result
;
1678 unsigned param
= shader_io_get_unique_index(location
);
1680 if ((location
== VARYING_SLOT_CLIP_DIST0
|| location
== VARYING_SLOT_CLIP_DIST1
) && is_compact
) {
1681 const_index
+= component
;
1683 if (const_index
>= 4) {
1689 buf_addr
= get_tcs_tes_buffer_address_params(ctx
, param
, const_index
,
1690 is_compact
, vertex_index
, param_index
);
1692 LLVMValueRef comp_offset
= LLVMConstInt(ctx
->ac
.i32
, component
* 4, false);
1693 buf_addr
= LLVMBuildAdd(ctx
->ac
.builder
, buf_addr
, comp_offset
, "");
1695 result
= ac_build_buffer_load(&ctx
->ac
, ctx
->hs_ring_tess_offchip
, num_components
, NULL
,
1696 buf_addr
, ctx
->oc_lds
, is_compact
? (4 * const_index
) : 0, ac_glc
, true, false);
1697 result
= ac_trim_vector(&ctx
->ac
, result
, num_components
);
1702 load_gs_input(struct ac_shader_abi
*abi
,
1704 unsigned driver_location
,
1706 unsigned num_components
,
1707 unsigned vertex_index
,
1708 unsigned const_index
,
1711 struct radv_shader_context
*ctx
= radv_shader_context_from_abi(abi
);
1712 LLVMValueRef vtx_offset
;
1713 unsigned param
, vtx_offset_param
;
1714 LLVMValueRef value
[4], result
;
1716 vtx_offset_param
= vertex_index
;
1717 assert(vtx_offset_param
< 6);
1718 vtx_offset
= LLVMBuildMul(ctx
->ac
.builder
, ctx
->gs_vtx_offset
[vtx_offset_param
],
1719 LLVMConstInt(ctx
->ac
.i32
, 4, false), "");
1721 param
= shader_io_get_unique_index(location
);
1723 for (unsigned i
= component
; i
< num_components
+ component
; i
++) {
1724 if (ctx
->ac
.chip_class
>= GFX9
) {
1725 LLVMValueRef dw_addr
= ctx
->gs_vtx_offset
[vtx_offset_param
];
1726 dw_addr
= LLVMBuildAdd(ctx
->ac
.builder
, dw_addr
,
1727 LLVMConstInt(ctx
->ac
.i32
, param
* 4 + i
+ const_index
, 0), "");
1728 value
[i
] = ac_lds_load(&ctx
->ac
, dw_addr
);
1730 LLVMValueRef soffset
=
1731 LLVMConstInt(ctx
->ac
.i32
,
1732 (param
* 4 + i
+ const_index
) * 256,
1735 value
[i
] = ac_build_buffer_load(&ctx
->ac
,
1738 vtx_offset
, soffset
,
1739 0, ac_glc
, true, false);
1742 if (ac_get_type_size(type
) == 2) {
1743 value
[i
] = LLVMBuildBitCast(ctx
->ac
.builder
, value
[i
], ctx
->ac
.i32
, "");
1744 value
[i
] = LLVMBuildTrunc(ctx
->ac
.builder
, value
[i
], ctx
->ac
.i16
, "");
1746 value
[i
] = LLVMBuildBitCast(ctx
->ac
.builder
, value
[i
], type
, "");
1748 result
= ac_build_varying_gather_values(&ctx
->ac
, value
, num_components
, component
);
1749 result
= ac_to_integer(&ctx
->ac
, result
);
1754 static void radv_emit_kill(struct ac_shader_abi
*abi
, LLVMValueRef visible
)
1756 struct radv_shader_context
*ctx
= radv_shader_context_from_abi(abi
);
1757 ac_build_kill_if_false(&ctx
->ac
, visible
);
/* Returns the starting slot of the sample-position table for the given
 * sample count inside the RING_PS_SAMPLE_POSITIONS layout. The tables are
 * stored back to back: 1x at slot 0, 2x at 1, 4x at 3, 8x at 7.
 * NOTE(review): the switch labels were lost in extraction and were
 * reconstructed from the offsets that are present — confirm against
 * upstream. */
static uint32_t
radv_get_sample_pos_offset(uint32_t num_samples)
{
	uint32_t sample_pos_offset = 0;

	switch (num_samples) {
	case 2:
		sample_pos_offset = 1;
		break;
	case 4:
		sample_pos_offset = 3;
		break;
	case 8:
		sample_pos_offset = 7;
		break;
	default:
		break;
	}
	return sample_pos_offset;
}
1781 static LLVMValueRef
load_sample_position(struct ac_shader_abi
*abi
,
1782 LLVMValueRef sample_id
)
1784 struct radv_shader_context
*ctx
= radv_shader_context_from_abi(abi
);
1786 LLVMValueRef result
;
1787 LLVMValueRef index
= LLVMConstInt(ctx
->ac
.i32
, RING_PS_SAMPLE_POSITIONS
, false);
1788 LLVMValueRef ptr
= LLVMBuildGEP(ctx
->ac
.builder
, ctx
->ring_offsets
, &index
, 1, "");
1790 ptr
= LLVMBuildBitCast(ctx
->ac
.builder
, ptr
,
1791 ac_array_in_const_addr_space(ctx
->ac
.v2f32
), "");
1793 uint32_t sample_pos_offset
=
1794 radv_get_sample_pos_offset(ctx
->options
->key
.fs
.num_samples
);
1797 LLVMBuildAdd(ctx
->ac
.builder
, sample_id
,
1798 LLVMConstInt(ctx
->ac
.i32
, sample_pos_offset
, false), "");
1799 result
= ac_build_load_invariant(&ctx
->ac
, ptr
, sample_id
);
1805 static LLVMValueRef
load_sample_mask_in(struct ac_shader_abi
*abi
)
1807 struct radv_shader_context
*ctx
= radv_shader_context_from_abi(abi
);
1808 uint8_t log2_ps_iter_samples
;
1810 if (ctx
->shader_info
->info
.ps
.force_persample
) {
1811 log2_ps_iter_samples
=
1812 util_logbase2(ctx
->options
->key
.fs
.num_samples
);
1814 log2_ps_iter_samples
= ctx
->options
->key
.fs
.log2_ps_iter_samples
;
1817 /* The bit pattern matches that used by fixed function fragment
1819 static const uint16_t ps_iter_masks
[] = {
1820 0xffff, /* not used */
1826 assert(log2_ps_iter_samples
< ARRAY_SIZE(ps_iter_masks
));
1828 uint32_t ps_iter_mask
= ps_iter_masks
[log2_ps_iter_samples
];
1830 LLVMValueRef result
, sample_id
;
1831 sample_id
= ac_unpack_param(&ctx
->ac
, abi
->ancillary
, 8, 4);
1832 sample_id
= LLVMBuildShl(ctx
->ac
.builder
, LLVMConstInt(ctx
->ac
.i32
, ps_iter_mask
, false), sample_id
, "");
1833 result
= LLVMBuildAnd(ctx
->ac
.builder
, sample_id
, abi
->sample_coverage
, "");
1838 static void gfx10_ngg_gs_emit_vertex(struct radv_shader_context
*ctx
,
1840 LLVMValueRef
*addrs
);
1843 visit_emit_vertex(struct ac_shader_abi
*abi
, unsigned stream
, LLVMValueRef
*addrs
)
1845 LLVMValueRef gs_next_vertex
;
1846 LLVMValueRef can_emit
;
1847 unsigned offset
= 0;
1848 struct radv_shader_context
*ctx
= radv_shader_context_from_abi(abi
);
1850 if (ctx
->options
->key
.vs_common_out
.as_ngg
) {
1851 gfx10_ngg_gs_emit_vertex(ctx
, stream
, addrs
);
1855 /* Write vertex attribute values to GSVS ring */
1856 gs_next_vertex
= LLVMBuildLoad(ctx
->ac
.builder
,
1857 ctx
->gs_next_vertex
[stream
],
1860 /* If this thread has already emitted the declared maximum number of
1861 * vertices, kill it: excessive vertex emissions are not supposed to
1862 * have any effect, and GS threads have no externally observable
1863 * effects other than emitting vertices.
1865 can_emit
= LLVMBuildICmp(ctx
->ac
.builder
, LLVMIntULT
, gs_next_vertex
,
1866 LLVMConstInt(ctx
->ac
.i32
, ctx
->gs_max_out_vertices
, false), "");
1867 ac_build_kill_if_false(&ctx
->ac
, can_emit
);
1869 for (unsigned i
= 0; i
< AC_LLVM_MAX_OUTPUTS
; ++i
) {
1870 unsigned output_usage_mask
=
1871 ctx
->shader_info
->info
.gs
.output_usage_mask
[i
];
1872 uint8_t output_stream
=
1873 ctx
->shader_info
->info
.gs
.output_streams
[i
];
1874 LLVMValueRef
*out_ptr
= &addrs
[i
* 4];
1875 int length
= util_last_bit(output_usage_mask
);
1877 if (!(ctx
->output_mask
& (1ull << i
)) ||
1878 output_stream
!= stream
)
1881 for (unsigned j
= 0; j
< length
; j
++) {
1882 if (!(output_usage_mask
& (1 << j
)))
1885 LLVMValueRef out_val
= LLVMBuildLoad(ctx
->ac
.builder
,
1887 LLVMValueRef voffset
=
1888 LLVMConstInt(ctx
->ac
.i32
, offset
*
1889 ctx
->gs_max_out_vertices
, false);
1893 voffset
= LLVMBuildAdd(ctx
->ac
.builder
, voffset
, gs_next_vertex
, "");
1894 voffset
= LLVMBuildMul(ctx
->ac
.builder
, voffset
, LLVMConstInt(ctx
->ac
.i32
, 4, false), "");
1896 out_val
= ac_to_integer(&ctx
->ac
, out_val
);
1897 out_val
= LLVMBuildZExtOrBitCast(ctx
->ac
.builder
, out_val
, ctx
->ac
.i32
, "");
1899 ac_build_buffer_store_dword(&ctx
->ac
,
1900 ctx
->gsvs_ring
[stream
],
1902 voffset
, ctx
->gs2vs_offset
, 0,
1903 ac_glc
| ac_slc
, true);
1907 gs_next_vertex
= LLVMBuildAdd(ctx
->ac
.builder
, gs_next_vertex
,
1909 LLVMBuildStore(ctx
->ac
.builder
, gs_next_vertex
, ctx
->gs_next_vertex
[stream
]);
1911 ac_build_sendmsg(&ctx
->ac
,
1912 AC_SENDMSG_GS_OP_EMIT
| AC_SENDMSG_GS
| (stream
<< 8),
1917 visit_end_primitive(struct ac_shader_abi
*abi
, unsigned stream
)
1919 struct radv_shader_context
*ctx
= radv_shader_context_from_abi(abi
);
1921 if (ctx
->options
->key
.vs_common_out
.as_ngg
) {
1922 LLVMBuildStore(ctx
->ac
.builder
, ctx
->ac
.i32_0
, ctx
->gs_curprim_verts
[stream
]);
1926 ac_build_sendmsg(&ctx
->ac
, AC_SENDMSG_GS_OP_CUT
| AC_SENDMSG_GS
| (stream
<< 8), ctx
->gs_wave_id
);
1930 load_tess_coord(struct ac_shader_abi
*abi
)
1932 struct radv_shader_context
*ctx
= radv_shader_context_from_abi(abi
);
1934 LLVMValueRef coord
[4] = {
1941 if (ctx
->tes_primitive_mode
== GL_TRIANGLES
)
1942 coord
[2] = LLVMBuildFSub(ctx
->ac
.builder
, ctx
->ac
.f32_1
,
1943 LLVMBuildFAdd(ctx
->ac
.builder
, coord
[0], coord
[1], ""), "");
1945 return ac_build_gather_values(&ctx
->ac
, coord
, 3);
1949 load_patch_vertices_in(struct ac_shader_abi
*abi
)
1951 struct radv_shader_context
*ctx
= radv_shader_context_from_abi(abi
);
1952 return LLVMConstInt(ctx
->ac
.i32
, ctx
->options
->key
.tcs
.input_vertices
, false);
1956 static LLVMValueRef
radv_load_base_vertex(struct ac_shader_abi
*abi
)
1958 return abi
->base_vertex
;
1961 static LLVMValueRef
radv_load_ssbo(struct ac_shader_abi
*abi
,
1962 LLVMValueRef buffer_ptr
, bool write
)
1964 struct radv_shader_context
*ctx
= radv_shader_context_from_abi(abi
);
1965 LLVMValueRef result
;
1967 LLVMSetMetadata(buffer_ptr
, ctx
->ac
.uniform_md_kind
, ctx
->ac
.empty_md
);
1969 result
= LLVMBuildLoad(ctx
->ac
.builder
, buffer_ptr
, "");
1970 LLVMSetMetadata(result
, ctx
->ac
.invariant_load_md_kind
, ctx
->ac
.empty_md
);
1975 static LLVMValueRef
radv_load_ubo(struct ac_shader_abi
*abi
, LLVMValueRef buffer_ptr
)
1977 struct radv_shader_context
*ctx
= radv_shader_context_from_abi(abi
);
1978 LLVMValueRef result
;
1980 if (LLVMGetTypeKind(LLVMTypeOf(buffer_ptr
)) != LLVMPointerTypeKind
) {
1981 /* Do not load the descriptor for inlined uniform blocks. */
1985 LLVMSetMetadata(buffer_ptr
, ctx
->ac
.uniform_md_kind
, ctx
->ac
.empty_md
);
1987 result
= LLVMBuildLoad(ctx
->ac
.builder
, buffer_ptr
, "");
1988 LLVMSetMetadata(result
, ctx
->ac
.invariant_load_md_kind
, ctx
->ac
.empty_md
);
1993 static LLVMValueRef
radv_get_sampler_desc(struct ac_shader_abi
*abi
,
1994 unsigned descriptor_set
,
1995 unsigned base_index
,
1996 unsigned constant_index
,
1998 enum ac_descriptor_type desc_type
,
1999 bool image
, bool write
,
2002 struct radv_shader_context
*ctx
= radv_shader_context_from_abi(abi
);
2003 LLVMValueRef list
= ctx
->descriptor_sets
[descriptor_set
];
2004 struct radv_descriptor_set_layout
*layout
= ctx
->options
->layout
->set
[descriptor_set
].layout
;
2005 struct radv_descriptor_set_binding_layout
*binding
= layout
->binding
+ base_index
;
2006 unsigned offset
= binding
->offset
;
2007 unsigned stride
= binding
->size
;
2009 LLVMBuilderRef builder
= ctx
->ac
.builder
;
2012 assert(base_index
< layout
->binding_count
);
2014 switch (desc_type
) {
2016 type
= ctx
->ac
.v8i32
;
2020 type
= ctx
->ac
.v8i32
;
2024 case AC_DESC_SAMPLER
:
2025 type
= ctx
->ac
.v4i32
;
2026 if (binding
->type
== VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER
) {
2027 offset
+= radv_combined_image_descriptor_sampler_offset(binding
);
2032 case AC_DESC_BUFFER
:
2033 type
= ctx
->ac
.v4i32
;
2036 case AC_DESC_PLANE_0
:
2037 case AC_DESC_PLANE_1
:
2038 case AC_DESC_PLANE_2
:
2039 type
= ctx
->ac
.v8i32
;
2041 offset
+= 32 * (desc_type
- AC_DESC_PLANE_0
);
2044 unreachable("invalid desc_type\n");
2047 offset
+= constant_index
* stride
;
2049 if (desc_type
== AC_DESC_SAMPLER
&& binding
->immutable_samplers_offset
&&
2050 (!index
|| binding
->immutable_samplers_equal
)) {
2051 if (binding
->immutable_samplers_equal
)
2054 const uint32_t *samplers
= radv_immutable_samplers(layout
, binding
);
2056 LLVMValueRef constants
[] = {
2057 LLVMConstInt(ctx
->ac
.i32
, samplers
[constant_index
* 4 + 0], 0),
2058 LLVMConstInt(ctx
->ac
.i32
, samplers
[constant_index
* 4 + 1], 0),
2059 LLVMConstInt(ctx
->ac
.i32
, samplers
[constant_index
* 4 + 2], 0),
2060 LLVMConstInt(ctx
->ac
.i32
, samplers
[constant_index
* 4 + 3], 0),
2062 return ac_build_gather_values(&ctx
->ac
, constants
, 4);
2065 assert(stride
% type_size
== 0);
2067 LLVMValueRef adjusted_index
= index
;
2068 if (!adjusted_index
)
2069 adjusted_index
= ctx
->ac
.i32_0
;
2071 adjusted_index
= LLVMBuildMul(builder
, adjusted_index
, LLVMConstInt(ctx
->ac
.i32
, stride
/ type_size
, 0), "");
2073 LLVMValueRef val_offset
= LLVMConstInt(ctx
->ac
.i32
, offset
, 0);
2074 list
= LLVMBuildGEP(builder
, list
, &val_offset
, 1, "");
2075 list
= LLVMBuildPointerCast(builder
, list
,
2076 ac_array_in_const32_addr_space(type
), "");
2078 LLVMValueRef descriptor
= ac_build_load_to_sgpr(&ctx
->ac
, list
, adjusted_index
);
2080 /* 3 plane formats always have same size and format for plane 1 & 2, so
2081 * use the tail from plane 1 so that we can store only the first 16 bytes
2082 * of the last plane. */
2083 if (desc_type
== AC_DESC_PLANE_2
) {
2084 LLVMValueRef descriptor2
= radv_get_sampler_desc(abi
, descriptor_set
, base_index
, constant_index
, index
, AC_DESC_PLANE_1
,image
, write
, bindless
);
2086 LLVMValueRef components
[8];
2087 for (unsigned i
= 0; i
< 4; ++i
)
2088 components
[i
] = ac_llvm_extract_elem(&ctx
->ac
, descriptor
, i
);
2090 for (unsigned i
= 4; i
< 8; ++i
)
2091 components
[i
] = ac_llvm_extract_elem(&ctx
->ac
, descriptor2
, i
);
2092 descriptor
= ac_build_gather_values(&ctx
->ac
, components
, 8);
2098 /* For 2_10_10_10 formats the alpha is handled as unsigned by pre-vega HW.
2099 * so we may need to fix it up. */
/* Fix up the alpha channel of a fetched 2_10_10_10 vertex attribute according
 * to the requested adjustment (NONE / SNORM / SSCALED / integer-like).
 * NOTE(review): extraction dropped several original lines in this function
 * (e.g. 2100 return type, 2103-2104 the alpha parameter and opening brace,
 * 2106-2107 the early return); the text below is annotated as-is, not
 * reconstructed. */
2101 adjust_vertex_fetch_alpha(struct radv_shader_context
*ctx
,
2102 unsigned adjustment
,
/* No fixup requested: presumably returns alpha unchanged (the return on the
 * dropped line 2106-2107 is not visible here — TODO confirm). */
2105 if (adjustment
== RADV_ALPHA_ADJUST_NONE
)
/* Shift amount used below for the sign-extension trick (30 = 32 - 2 bits). */
2108 LLVMValueRef c30
= LLVMConstInt(ctx
->ac
.i32
, 30, 0);
/* View the incoming value as f32 so the FP conversions below are well typed. */
2110 alpha
= LLVMBuildBitCast(ctx
->ac
.builder
, alpha
, ctx
->ac
.f32
, "");
/* SSCALED values arrive as floats; convert to an unsigned integer first so
 * the bit manipulation below operates on the raw 2-bit alpha. */
2112 if (adjustment
== RADV_ALPHA_ADJUST_SSCALED
)
2113 alpha
= LLVMBuildFPToUI(ctx
->ac
.builder
, alpha
, ctx
->ac
.i32
, "");
2115 alpha
= ac_to_integer(&ctx
->ac
, alpha
);
2117 /* For the integer-like cases, do a natural sign extension.
2119 * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
2120 * and happen to contain 0, 1, 2, 3 as the two LSBs of the
/* Place the 2-bit alpha in the top bits (SNORM uses a shift of 7+... — see
 * the ternary), then arithmetic-shift right by 30 to sign-extend. */
2123 alpha
= LLVMBuildShl(ctx
->ac
.builder
, alpha
,
2124 adjustment
== RADV_ALPHA_ADJUST_SNORM
?
2125 LLVMConstInt(ctx
->ac
.i32
, 7, 0) : c30
, "");
2126 alpha
= LLVMBuildAShr(ctx
->ac
.builder
, alpha
, c30
, "");
2128 /* Convert back to the right type. */
2129 if (adjustment
== RADV_ALPHA_ADJUST_SNORM
) {
/* SNORM: convert to float and clamp to [-1.0, ...] — values below -1.0
 * (unordered compare also catches NaN) are replaced by -1.0. */
2131 LLVMValueRef neg_one
= LLVMConstReal(ctx
->ac
.f32
, -1.0);
2132 alpha
= LLVMBuildSIToFP(ctx
->ac
.builder
, alpha
, ctx
->ac
.f32
, "");
2133 clamp
= LLVMBuildFCmp(ctx
->ac
.builder
, LLVMRealULT
, alpha
, neg_one
, "");
2134 alpha
= LLVMBuildSelect(ctx
->ac
.builder
, clamp
, neg_one
, alpha
, "");
2135 } else if (adjustment
== RADV_ALPHA_ADJUST_SSCALED
) {
/* SSCALED: the sign-extended integer becomes a float again. */
2136 alpha
= LLVMBuildSIToFP(ctx
->ac
.builder
, alpha
, ctx
->ac
.f32
, "");
/* Hand the result back bitcast to i32, matching the caller's channel type. */
2139 return LLVMBuildBitCast(ctx
->ac
.builder
, alpha
, ctx
->ac
.i32
, "");
/* Map a hardware buffer data format (V_008F0C_BUF_DATA_FORMAT_*) to its
 * channel count: the cases are grouped 1-, 2-, 3- and 4-channel formats.
 * NOTE(review): the extraction dropped the return statements between the
 * case groups (original lines 2149, 2153, 2157, 2163 are missing), as well
 * as the default case; only the case labels survive below. */
2143 get_num_channels_from_data_format(unsigned data_format
)
2145 switch (data_format
) {
/* 1-channel formats. */
2146 case V_008F0C_BUF_DATA_FORMAT_8
:
2147 case V_008F0C_BUF_DATA_FORMAT_16
:
2148 case V_008F0C_BUF_DATA_FORMAT_32
:
/* 2-channel formats. */
2150 case V_008F0C_BUF_DATA_FORMAT_8_8
:
2151 case V_008F0C_BUF_DATA_FORMAT_16_16
:
2152 case V_008F0C_BUF_DATA_FORMAT_32_32
:
/* 3-channel formats (packed 10/11-bit variants included). */
2154 case V_008F0C_BUF_DATA_FORMAT_10_11_11
:
2155 case V_008F0C_BUF_DATA_FORMAT_11_11_10
:
2156 case V_008F0C_BUF_DATA_FORMAT_32_32_32
:
/* 4-channel formats. */
2158 case V_008F0C_BUF_DATA_FORMAT_8_8_8_8
:
2159 case V_008F0C_BUF_DATA_FORMAT_10_10_10_2
:
2160 case V_008F0C_BUF_DATA_FORMAT_2_10_10_10
:
2161 case V_008F0C_BUF_DATA_FORMAT_16_16_16_16
:
2162 case V_008F0C_BUF_DATA_FORMAT_32_32_32_32
:
/* Expand a fetched vertex-attribute value to a 4-component vector, filling
 * the channels the format does not provide with 0 and channel 3 (alpha)
 * with 1 (float or integer constants depending on is_float).
 * NOTE(review): extraction dropped interior lines (e.g. the `value`/`is_float`
 * parameters at 2173/2175, the early return at 2185, the scalar else-branch
 * around 2192-2194); annotated as-is. */
2172 radv_fixup_vertex_input_fetches(struct radv_shader_context
*ctx
,
2174 unsigned num_channels
,
/* Fill constants chosen per the destination interpretation. */
2177 LLVMValueRef zero
= is_float
? ctx
->ac
.f32_0
: ctx
->ac
.i32_0
;
2178 LLVMValueRef one
= is_float
? ctx
->ac
.f32_1
: ctx
->ac
.i32_1
;
2179 LLVMValueRef chan
[4];
/* Vector input: pull out the channels the format actually supplied. */
2181 if (LLVMGetTypeKind(LLVMTypeOf(value
)) == LLVMVectorTypeKind
) {
2182 unsigned vec_size
= LLVMGetVectorSize(LLVMTypeOf(value
));
/* Already a full 4-wide vector: nothing to fix up (the return on the
 * dropped line 2185 is not visible here — TODO confirm). */
2184 if (num_channels
== 4 && num_channels
== vec_size
)
2187 num_channels
= MIN2(num_channels
, vec_size
);
2189 for (unsigned i
= 0; i
< num_channels
; i
++)
2190 chan
[i
] = ac_llvm_extract_elem(&ctx
->ac
, value
, i
);
/* Scalar input implies a single-channel format. */
2193 assert(num_channels
== 1);
/* Pad the remaining channels: (0, 0, 0, 1) pattern, as integers. */
2198 for (unsigned i
= num_channels
; i
< 4; i
++) {
2199 chan
[i
] = i
== 3 ? one
: zero
;
2200 chan
[i
] = ac_to_integer(&ctx
->ac
, chan
[i
]);
/* Rebuild the 4-component vector. */
2203 return ac_build_gather_values(&ctx
->ac
, chan
, 4);
2207 handle_vs_input_decl(struct radv_shader_context
*ctx
,
2208 struct nir_variable
*variable
)
2210 LLVMValueRef t_list_ptr
= ctx
->vertex_buffers
;
2211 LLVMValueRef t_offset
;
2212 LLVMValueRef t_list
;
2214 LLVMValueRef buffer_index
;
2215 unsigned attrib_count
= glsl_count_attribute_slots(variable
->type
, true);
2216 uint8_t input_usage_mask
=
2217 ctx
->shader_info
->info
.vs
.input_usage_mask
[variable
->data
.location
];
2218 unsigned num_input_channels
= util_last_bit(input_usage_mask
);
2220 variable
->data
.driver_location
= variable
->data
.location
* 4;
2222 enum glsl_base_type type
= glsl_get_base_type(variable
->type
);
2223 for (unsigned i
= 0; i
< attrib_count
; ++i
) {
2224 LLVMValueRef output
[4];
2225 unsigned attrib_index
= variable
->data
.location
+ i
- VERT_ATTRIB_GENERIC0
;
2226 unsigned attrib_format
= ctx
->options
->key
.vs
.vertex_attribute_formats
[attrib_index
];
2227 unsigned data_format
= attrib_format
& 0x0f;
2228 unsigned num_format
= (attrib_format
>> 4) & 0x07;
2229 bool is_float
= num_format
!= V_008F0C_BUF_NUM_FORMAT_UINT
&&
2230 num_format
!= V_008F0C_BUF_NUM_FORMAT_SINT
;
2232 if (ctx
->options
->key
.vs
.instance_rate_inputs
& (1u << attrib_index
)) {
2233 uint32_t divisor
= ctx
->options
->key
.vs
.instance_rate_divisors
[attrib_index
];
2236 buffer_index
= ctx
->abi
.instance_id
;
2239 buffer_index
= LLVMBuildUDiv(ctx
->ac
.builder
, buffer_index
,
2240 LLVMConstInt(ctx
->ac
.i32
, divisor
, 0), "");
2243 buffer_index
= ctx
->ac
.i32_0
;
2246 buffer_index
= LLVMBuildAdd(ctx
->ac
.builder
, ctx
->abi
.start_instance
, buffer_index
, "");
2248 buffer_index
= LLVMBuildAdd(ctx
->ac
.builder
, ctx
->abi
.vertex_id
,
2249 ctx
->abi
.base_vertex
, "");
2251 /* Adjust the number of channels to load based on the vertex
2254 unsigned num_format_channels
= get_num_channels_from_data_format(data_format
);
2255 unsigned num_channels
= MIN2(num_input_channels
, num_format_channels
);
2256 unsigned attrib_binding
= ctx
->options
->key
.vs
.vertex_attribute_bindings
[attrib_index
];
2257 unsigned attrib_offset
= ctx
->options
->key
.vs
.vertex_attribute_offsets
[attrib_index
];
2258 unsigned attrib_stride
= ctx
->options
->key
.vs
.vertex_attribute_strides
[attrib_index
];
2260 if (ctx
->options
->key
.vs
.post_shuffle
& (1 << attrib_index
)) {
2261 /* Always load, at least, 3 channels for formats that
2262 * need to be shuffled because X<->Z.
2264 num_channels
= MAX2(num_channels
, 3);
2267 if (attrib_stride
!= 0 && attrib_offset
> attrib_stride
) {
2268 LLVMValueRef buffer_offset
=
2269 LLVMConstInt(ctx
->ac
.i32
,
2270 attrib_offset
/ attrib_stride
, false);
2272 buffer_index
= LLVMBuildAdd(ctx
->ac
.builder
,
2276 attrib_offset
= attrib_offset
% attrib_stride
;
2279 t_offset
= LLVMConstInt(ctx
->ac
.i32
, attrib_binding
, false);
2280 t_list
= ac_build_load_to_sgpr(&ctx
->ac
, t_list_ptr
, t_offset
);
2282 input
= ac_build_struct_tbuffer_load(&ctx
->ac
, t_list
,
2284 LLVMConstInt(ctx
->ac
.i32
, attrib_offset
, false),
2285 ctx
->ac
.i32_0
, ctx
->ac
.i32_0
,
2287 data_format
, num_format
, 0, true);
2289 if (ctx
->options
->key
.vs
.post_shuffle
& (1 << attrib_index
)) {
2291 c
[0] = ac_llvm_extract_elem(&ctx
->ac
, input
, 2);
2292 c
[1] = ac_llvm_extract_elem(&ctx
->ac
, input
, 1);
2293 c
[2] = ac_llvm_extract_elem(&ctx
->ac
, input
, 0);
2294 c
[3] = ac_llvm_extract_elem(&ctx
->ac
, input
, 3);
2296 input
= ac_build_gather_values(&ctx
->ac
, c
, 4);
2299 input
= radv_fixup_vertex_input_fetches(ctx
, input
, num_channels
,
2302 for (unsigned chan
= 0; chan
< 4; chan
++) {
2303 LLVMValueRef llvm_chan
= LLVMConstInt(ctx
->ac
.i32
, chan
, false);
2304 output
[chan
] = LLVMBuildExtractElement(ctx
->ac
.builder
, input
, llvm_chan
, "");
2305 if (type
== GLSL_TYPE_FLOAT16
) {
2306 output
[chan
] = LLVMBuildBitCast(ctx
->ac
.builder
, output
[chan
], ctx
->ac
.f32
, "");
2307 output
[chan
] = LLVMBuildFPTrunc(ctx
->ac
.builder
, output
[chan
], ctx
->ac
.f16
, "");
2311 unsigned alpha_adjust
= (ctx
->options
->key
.vs
.alpha_adjust
>> (attrib_index
* 2)) & 3;
2312 output
[3] = adjust_vertex_fetch_alpha(ctx
, alpha_adjust
, output
[3]);
2314 for (unsigned chan
= 0; chan
< 4; chan
++) {
2315 output
[chan
] = ac_to_integer(&ctx
->ac
, output
[chan
]);
2316 if (type
== GLSL_TYPE_UINT16
|| type
== GLSL_TYPE_INT16
)
2317 output
[chan
] = LLVMBuildTrunc(ctx
->ac
.builder
, output
[chan
], ctx
->ac
.i16
, "");
2319 ctx
->inputs
[ac_llvm_reg_index_soa(variable
->data
.location
+ i
, chan
)] = output
[chan
];
/* Process every vertex-shader input variable of the NIR shader by forwarding
 * it to handle_vs_input_decl(), which emits the vertex fetch code. */
2325 handle_vs_inputs(struct radv_shader_context
*ctx
,
2326 struct nir_shader
*nir
) {
2327 nir_foreach_variable(variable
, &nir
->inputs
)
2328 handle_vs_input_decl(ctx
, variable
);
/* Fragment-shader interpolation optimization: when inputs use both center and
 * centroid interpolation, select per-wave between the centroid and center
 * IJ values based on the sign of prim_mask, so fully-covered waves can reuse
 * the center values for centroid inputs.
 * NOTE(review): extraction dropped interior lines (the `continue` around
 * 2340, the uses_center assignment around 2344-2345); annotated as-is. */
2332 prepare_interp_optimize(struct radv_shader_context
*ctx
,
2333 struct nir_shader
*nir
)
2335 bool uses_center
= false;
2336 bool uses_centroid
= false;
/* Scan float (non-sample-shaded) inputs to see which barycentrics are used. */
2337 nir_foreach_variable(variable
, &nir
->inputs
) {
2338 if (glsl_get_base_type(glsl_without_array(variable
->type
)) != GLSL_TYPE_FLOAT
||
2339 variable
->data
.sample
)
2342 if (variable
->data
.centroid
)
2343 uses_centroid
= true;
/* Only worth selecting when both interpolation modes are present. */
2348 if (uses_center
&& uses_centroid
) {
/* sel is true when prim_mask is negative — presumably the HW's signal that
 * centroid can be replaced by center for this wave (TODO confirm). */
2349 LLVMValueRef sel
= LLVMBuildICmp(ctx
->ac
.builder
, LLVMIntSLT
, ctx
->abi
.prim_mask
, ctx
->ac
.i32_0
, "");
/* Replace both the perspective and linear centroid IJ pairs. */
2350 ctx
->abi
.persp_centroid
= LLVMBuildSelect(ctx
->ac
.builder
, sel
, ctx
->abi
.persp_center
, ctx
->abi
.persp_centroid
, "");
2351 ctx
->abi
.linear_centroid
= LLVMBuildSelect(ctx
->ac
.builder
, sel
, ctx
->abi
.linear_center
, ctx
->abi
.linear_centroid
, "");
2356 scan_shader_output_decl(struct radv_shader_context
*ctx
,
2357 struct nir_variable
*variable
,
2358 struct nir_shader
*shader
,
2359 gl_shader_stage stage
)
2361 int idx
= variable
->data
.location
+ variable
->data
.index
;
2362 unsigned attrib_count
= glsl_count_attribute_slots(variable
->type
, false);
2363 uint64_t mask_attribs
;
2365 variable
->data
.driver_location
= idx
* 4;
2367 /* tess ctrl has it's own load/store paths for outputs */
2368 if (stage
== MESA_SHADER_TESS_CTRL
)
2371 if (variable
->data
.compact
) {
2372 unsigned component_count
= variable
->data
.location_frac
+
2373 glsl_get_length(variable
->type
);
2374 attrib_count
= (component_count
+ 3) / 4;
2377 mask_attribs
= ((1ull << attrib_count
) - 1) << idx
;
2378 if (stage
== MESA_SHADER_VERTEX
||
2379 stage
== MESA_SHADER_TESS_EVAL
||
2380 stage
== MESA_SHADER_GEOMETRY
) {
2381 if (idx
== VARYING_SLOT_CLIP_DIST0
) {
2382 if (stage
== MESA_SHADER_VERTEX
) {
2383 ctx
->shader_info
->vs
.outinfo
.clip_dist_mask
= (1 << shader
->info
.clip_distance_array_size
) - 1;
2384 ctx
->shader_info
->vs
.outinfo
.cull_dist_mask
= (1 << shader
->info
.cull_distance_array_size
) - 1;
2385 ctx
->shader_info
->vs
.outinfo
.cull_dist_mask
<<= shader
->info
.clip_distance_array_size
;
2387 if (stage
== MESA_SHADER_TESS_EVAL
) {
2388 ctx
->shader_info
->tes
.outinfo
.clip_dist_mask
= (1 << shader
->info
.clip_distance_array_size
) - 1;
2389 ctx
->shader_info
->tes
.outinfo
.cull_dist_mask
= (1 << shader
->info
.cull_distance_array_size
) - 1;
2390 ctx
->shader_info
->tes
.outinfo
.cull_dist_mask
<<= shader
->info
.clip_distance_array_size
;
2392 if (stage
== MESA_SHADER_GEOMETRY
) {
2393 ctx
->shader_info
->vs
.outinfo
.clip_dist_mask
= (1 << shader
->info
.clip_distance_array_size
) - 1;
2394 ctx
->shader_info
->vs
.outinfo
.cull_dist_mask
= (1 << shader
->info
.cull_distance_array_size
) - 1;
2395 ctx
->shader_info
->vs
.outinfo
.cull_dist_mask
<<= shader
->info
.clip_distance_array_size
;
2400 ctx
->output_mask
|= mask_attribs
;
2404 /* Initialize arguments for the shader export intrinsic */
2406 si_llvm_init_export_args(struct radv_shader_context
*ctx
,
2407 LLVMValueRef
*values
,
2408 unsigned enabled_channels
,
2410 struct ac_export_args
*args
)
2412 /* Specify the channels that are enabled. */
2413 args
->enabled_channels
= enabled_channels
;
2415 /* Specify whether the EXEC mask represents the valid mask */
2416 args
->valid_mask
= 0;
2418 /* Specify whether this is the last export */
2421 /* Specify the target we are exporting */
2422 args
->target
= target
;
2424 args
->compr
= false;
2425 args
->out
[0] = LLVMGetUndef(ctx
->ac
.f32
);
2426 args
->out
[1] = LLVMGetUndef(ctx
->ac
.f32
);
2427 args
->out
[2] = LLVMGetUndef(ctx
->ac
.f32
);
2428 args
->out
[3] = LLVMGetUndef(ctx
->ac
.f32
);
2433 bool is_16bit
= ac_get_type_size(LLVMTypeOf(values
[0])) == 2;
2434 if (ctx
->stage
== MESA_SHADER_FRAGMENT
) {
2435 unsigned index
= target
- V_008DFC_SQ_EXP_MRT
;
2436 unsigned col_format
= (ctx
->options
->key
.fs
.col_format
>> (4 * index
)) & 0xf;
2437 bool is_int8
= (ctx
->options
->key
.fs
.is_int8
>> index
) & 1;
2438 bool is_int10
= (ctx
->options
->key
.fs
.is_int10
>> index
) & 1;
2441 LLVMValueRef (*packf
)(struct ac_llvm_context
*ctx
, LLVMValueRef args
[2]) = NULL
;
2442 LLVMValueRef (*packi
)(struct ac_llvm_context
*ctx
, LLVMValueRef args
[2],
2443 unsigned bits
, bool hi
) = NULL
;
2445 switch(col_format
) {
2446 case V_028714_SPI_SHADER_ZERO
:
2447 args
->enabled_channels
= 0; /* writemask */
2448 args
->target
= V_008DFC_SQ_EXP_NULL
;
2451 case V_028714_SPI_SHADER_32_R
:
2452 args
->enabled_channels
= 1;
2453 args
->out
[0] = values
[0];
2456 case V_028714_SPI_SHADER_32_GR
:
2457 args
->enabled_channels
= 0x3;
2458 args
->out
[0] = values
[0];
2459 args
->out
[1] = values
[1];
2462 case V_028714_SPI_SHADER_32_AR
:
2463 if (ctx
->ac
.chip_class
>= GFX10
) {
2464 args
->enabled_channels
= 0x3;
2465 args
->out
[0] = values
[0];
2466 args
->out
[1] = values
[3];
2468 args
->enabled_channels
= 0x9;
2469 args
->out
[0] = values
[0];
2470 args
->out
[3] = values
[3];
2474 case V_028714_SPI_SHADER_FP16_ABGR
:
2475 args
->enabled_channels
= 0x5;
2476 packf
= ac_build_cvt_pkrtz_f16
;
2478 for (unsigned chan
= 0; chan
< 4; chan
++)
2479 values
[chan
] = LLVMBuildFPExt(ctx
->ac
.builder
,
2485 case V_028714_SPI_SHADER_UNORM16_ABGR
:
2486 args
->enabled_channels
= 0x5;
2487 packf
= ac_build_cvt_pknorm_u16
;
2490 case V_028714_SPI_SHADER_SNORM16_ABGR
:
2491 args
->enabled_channels
= 0x5;
2492 packf
= ac_build_cvt_pknorm_i16
;
2495 case V_028714_SPI_SHADER_UINT16_ABGR
:
2496 args
->enabled_channels
= 0x5;
2497 packi
= ac_build_cvt_pk_u16
;
2499 for (unsigned chan
= 0; chan
< 4; chan
++)
2500 values
[chan
] = LLVMBuildZExt(ctx
->ac
.builder
,
2501 ac_to_integer(&ctx
->ac
, values
[chan
]),
2506 case V_028714_SPI_SHADER_SINT16_ABGR
:
2507 args
->enabled_channels
= 0x5;
2508 packi
= ac_build_cvt_pk_i16
;
2510 for (unsigned chan
= 0; chan
< 4; chan
++)
2511 values
[chan
] = LLVMBuildSExt(ctx
->ac
.builder
,
2512 ac_to_integer(&ctx
->ac
, values
[chan
]),
2518 case V_028714_SPI_SHADER_32_ABGR
:
2519 memcpy(&args
->out
[0], values
, sizeof(values
[0]) * 4);
2523 /* Pack f16 or norm_i16/u16. */
2525 for (chan
= 0; chan
< 2; chan
++) {
2526 LLVMValueRef pack_args
[2] = {
2528 values
[2 * chan
+ 1]
2530 LLVMValueRef packed
;
2532 packed
= packf(&ctx
->ac
, pack_args
);
2533 args
->out
[chan
] = ac_to_float(&ctx
->ac
, packed
);
2535 args
->compr
= 1; /* COMPR flag */
2540 for (chan
= 0; chan
< 2; chan
++) {
2541 LLVMValueRef pack_args
[2] = {
2542 ac_to_integer(&ctx
->ac
, values
[2 * chan
]),
2543 ac_to_integer(&ctx
->ac
, values
[2 * chan
+ 1])
2545 LLVMValueRef packed
;
2547 packed
= packi(&ctx
->ac
, pack_args
,
2548 is_int8
? 8 : is_int10
? 10 : 16,
2550 args
->out
[chan
] = ac_to_float(&ctx
->ac
, packed
);
2552 args
->compr
= 1; /* COMPR flag */
2558 for (unsigned chan
= 0; chan
< 4; chan
++) {
2559 values
[chan
] = LLVMBuildBitCast(ctx
->ac
.builder
, values
[chan
], ctx
->ac
.i16
, "");
2560 args
->out
[chan
] = LLVMBuildZExt(ctx
->ac
.builder
, values
[chan
], ctx
->ac
.i32
, "");
2563 memcpy(&args
->out
[0], values
, sizeof(values
[0]) * 4);
2565 for (unsigned i
= 0; i
< 4; ++i
)
2566 args
->out
[i
] = ac_to_float(&ctx
->ac
, args
->out
[i
]);
/* Emit a single parameter export (PARAM0 + index) for the given 4-component
 * value set, with the given channel-enable mask. Thin wrapper that fills an
 * ac_export_args via si_llvm_init_export_args and emits it. */
2570 radv_export_param(struct radv_shader_context
*ctx
, unsigned index
,
2571 LLVMValueRef
*values
, unsigned enabled_channels
)
2573 struct ac_export_args args
;
2575 si_llvm_init_export_args(ctx
, values
, enabled_channels
,
2576 V_008DFC_SQ_EXP_PARAM
+ index
, &args
);
2577 ac_build_export(&ctx
->ac
, &args
);
/* Load one channel of a shader output from its alloca in ctx->abi.outputs,
 * addressed by the SoA register index for (output slot, channel). */
2581 radv_load_output(struct radv_shader_context
*ctx
, unsigned index
, unsigned chan
)
2583 LLVMValueRef output
= ctx
->abi
.outputs
[ac_llvm_reg_index_soa(index
, chan
)];
2584 return LLVMBuildLoad(ctx
->ac
.builder
, output
, "");
/* Store one transform-feedback (streamout) output to its streamout buffer:
 * gather the written components from shader_out->values, pack them into an
 * i32 vector, and write them with a GLC|SLC buffer store at the precomputed
 * per-buffer write offset.
 * NOTE(review): extraction dropped interior lines (the `start` declaration
 * around 2597, the early `return` after the num_comps check around 2602,
 * the case-1/2 vdata assignments and `break`s in the switch); annotated
 * as-is. */
2588 radv_emit_stream_output(struct radv_shader_context
*ctx
,
2589 LLVMValueRef
const *so_buffers
,
2590 LLVMValueRef
const *so_write_offsets
,
2591 const struct radv_stream_output
*output
,
2592 struct radv_shader_output_values
*shader_out
)
2594 unsigned num_comps
= util_bitcount(output
->component_mask
);
2595 unsigned buf
= output
->buffer
;
2596 unsigned offset
= output
->offset
;
2598 LLVMValueRef out
[4];
/* Defensive double check: assert in debug, bail in release builds. */
2600 assert(num_comps
&& num_comps
<= 4);
2601 if (!num_comps
|| num_comps
> 4)
2604 /* Get the first component. */
2605 start
= ffs(output
->component_mask
) - 1;
2607 /* Load the output as int. */
2608 for (int i
= 0; i
< num_comps
; i
++) {
2609 out
[i
] = ac_to_integer(&ctx
->ac
, shader_out
->values
[start
+ i
]);
2612 /* Pack the output. */
2613 LLVMValueRef vdata
= NULL
;
2615 switch (num_comps
) {
2616 case 1: /* as i32 */
2619 case 2: /* as v2i32 */
2620 case 3: /* as v4i32 (aligned to 4) */
/* 3-component stores are padded to 4 on chips without vec3 support. */
2621 out
[3] = LLVMGetUndef(ctx
->ac
.i32
);
2623 case 4: /* as v4i32 */
2624 vdata
= ac_build_gather_values(&ctx
->ac
, out
,
2625 !ac_has_vec3_support(ctx
->ac
.chip_class
, false) ?
2626 util_next_power_of_two(num_comps
) :
/* GLC|SLC bypasses caches so the CPU/other stages see the streamout data. */
2631 ac_build_buffer_store_dword(&ctx
->ac
, so_buffers
[buf
],
2632 vdata
, num_comps
, so_write_offsets
[buf
],
2633 ctx
->ac
.i32_0
, offset
,
2634 ac_glc
| ac_slc
, false);
2638 radv_emit_streamout(struct radv_shader_context
*ctx
, unsigned stream
)
2640 struct ac_build_if_state if_ctx
;
2643 /* Get bits [22:16], i.e. (so_param >> 16) & 127; */
2644 assert(ctx
->streamout_config
);
2645 LLVMValueRef so_vtx_count
=
2646 ac_build_bfe(&ctx
->ac
, ctx
->streamout_config
,
2647 LLVMConstInt(ctx
->ac
.i32
, 16, false),
2648 LLVMConstInt(ctx
->ac
.i32
, 7, false), false);
2650 LLVMValueRef tid
= ac_get_thread_id(&ctx
->ac
);
2652 /* can_emit = tid < so_vtx_count; */
2653 LLVMValueRef can_emit
= LLVMBuildICmp(ctx
->ac
.builder
, LLVMIntULT
,
2654 tid
, so_vtx_count
, "");
2656 /* Emit the streamout code conditionally. This actually avoids
2657 * out-of-bounds buffer access. The hw tells us via the SGPR
2658 * (so_vtx_count) which threads are allowed to emit streamout data.
2660 ac_nir_build_if(&if_ctx
, ctx
, can_emit
);
2662 /* The buffer offset is computed as follows:
2663 * ByteOffset = streamout_offset[buffer_id]*4 +
2664 * (streamout_write_index + thread_id)*stride[buffer_id] +
2667 LLVMValueRef so_write_index
= ctx
->streamout_write_idx
;
2669 /* Compute (streamout_write_index + thread_id). */
2671 LLVMBuildAdd(ctx
->ac
.builder
, so_write_index
, tid
, "");
2673 /* Load the descriptor and compute the write offset for each
2676 LLVMValueRef so_write_offset
[4] = {};
2677 LLVMValueRef so_buffers
[4] = {};
2678 LLVMValueRef buf_ptr
= ctx
->streamout_buffers
;
2680 for (i
= 0; i
< 4; i
++) {
2681 uint16_t stride
= ctx
->shader_info
->info
.so
.strides
[i
];
2686 LLVMValueRef offset
=
2687 LLVMConstInt(ctx
->ac
.i32
, i
, false);
2689 so_buffers
[i
] = ac_build_load_to_sgpr(&ctx
->ac
,
2692 LLVMValueRef so_offset
= ctx
->streamout_offset
[i
];
2694 so_offset
= LLVMBuildMul(ctx
->ac
.builder
, so_offset
,
2695 LLVMConstInt(ctx
->ac
.i32
, 4, false), "");
2697 so_write_offset
[i
] =
2698 ac_build_imad(&ctx
->ac
, so_write_index
,
2699 LLVMConstInt(ctx
->ac
.i32
,
2704 /* Write streamout data. */
2705 for (i
= 0; i
< ctx
->shader_info
->info
.so
.num_outputs
; i
++) {
2706 struct radv_shader_output_values shader_out
= {};
2707 struct radv_stream_output
*output
=
2708 &ctx
->shader_info
->info
.so
.outputs
[i
];
2710 if (stream
!= output
->stream
)
2713 for (int j
= 0; j
< 4; j
++) {
2714 shader_out
.values
[j
] =
2715 radv_load_output(ctx
, output
->location
, j
);
2718 radv_emit_stream_output(ctx
, so_buffers
,so_write_offset
,
2719 output
, &shader_out
);
2722 ac_nir_build_endif(&if_ctx
);
/* Emit parameter exports for all generic varyings (VAR0+, plus LAYER,
 * PRIMITIVE_ID and — when export_clip_dists — the clip/cull distance slots),
 * recording each slot's export index in outinfo->vs_output_param_offset and
 * the total in outinfo->param_exports.
 * NOTE(review): extraction dropped interior lines (the `noutput` parameter
 * around 2728, the `continue` statements after the two skip-conditions);
 * annotated as-is. */
2726 radv_build_param_exports(struct radv_shader_context
*ctx
,
2727 struct radv_shader_output_values
*outputs
,
2729 struct radv_vs_output_info
*outinfo
,
2730 bool export_clip_dists
)
2732 unsigned param_count
= 0;
2734 for (unsigned i
= 0; i
< noutput
; i
++) {
2735 unsigned slot_name
= outputs
[i
].slot_name
;
2736 unsigned usage_mask
= outputs
[i
].usage_mask
;
/* Skip builtin slots below VAR0 except the few that are passed to the FS
 * as parameters (layer, primitive id, clip distances). */
2738 if (slot_name
!= VARYING_SLOT_LAYER
&&
2739 slot_name
!= VARYING_SLOT_PRIMITIVE_ID
&&
2740 slot_name
!= VARYING_SLOT_CLIP_DIST0
&&
2741 slot_name
!= VARYING_SLOT_CLIP_DIST1
&&
2742 slot_name
< VARYING_SLOT_VAR0
)
/* Clip distances are only exported as params when the FS reads them. */
2745 if ((slot_name
== VARYING_SLOT_CLIP_DIST0
||
2746 slot_name
== VARYING_SLOT_CLIP_DIST1
) && !export_clip_dists
)
2749 radv_export_param(ctx
, param_count
, outputs
[i
].values
, usage_mask
);
/* Remember which export index this slot landed in for the FS input map. */
2751 assert(i
< ARRAY_SIZE(outinfo
->vs_output_param_offset
));
2752 outinfo
->vs_output_param_offset
[slot_name
] = param_count
++;
2755 outinfo
->param_exports
= param_count
;
2758 /* Generate export instructions for hardware VS shader stage or NGG GS stage
2759 * (position and parameter data only).
2762 radv_llvm_export_vs(struct radv_shader_context
*ctx
,
2763 struct radv_shader_output_values
*outputs
,
2765 struct radv_vs_output_info
*outinfo
,
2766 bool export_clip_dists
)
2768 LLVMValueRef psize_value
= NULL
, layer_value
= NULL
, viewport_value
= NULL
;
2769 struct ac_export_args pos_args
[4] = {};
2770 unsigned pos_idx
, index
;
2773 /* Build position exports */
2774 for (i
= 0; i
< noutput
; i
++) {
2775 switch (outputs
[i
].slot_name
) {
2776 case VARYING_SLOT_POS
:
2777 si_llvm_init_export_args(ctx
, outputs
[i
].values
, 0xf,
2778 V_008DFC_SQ_EXP_POS
, &pos_args
[0]);
2780 case VARYING_SLOT_PSIZ
:
2781 psize_value
= outputs
[i
].values
[0];
2783 case VARYING_SLOT_LAYER
:
2784 layer_value
= outputs
[i
].values
[0];
2786 case VARYING_SLOT_VIEWPORT
:
2787 viewport_value
= outputs
[i
].values
[0];
2789 case VARYING_SLOT_CLIP_DIST0
:
2790 case VARYING_SLOT_CLIP_DIST1
:
2791 index
= 2 + outputs
[i
].slot_index
;
2792 si_llvm_init_export_args(ctx
, outputs
[i
].values
, 0xf,
2793 V_008DFC_SQ_EXP_POS
+ index
,
2801 /* We need to add the position output manually if it's missing. */
2802 if (!pos_args
[0].out
[0]) {
2803 pos_args
[0].enabled_channels
= 0xf; /* writemask */
2804 pos_args
[0].valid_mask
= 0; /* EXEC mask */
2805 pos_args
[0].done
= 0; /* last export? */
2806 pos_args
[0].target
= V_008DFC_SQ_EXP_POS
;
2807 pos_args
[0].compr
= 0; /* COMPR flag */
2808 pos_args
[0].out
[0] = ctx
->ac
.f32_0
; /* X */
2809 pos_args
[0].out
[1] = ctx
->ac
.f32_0
; /* Y */
2810 pos_args
[0].out
[2] = ctx
->ac
.f32_0
; /* Z */
2811 pos_args
[0].out
[3] = ctx
->ac
.f32_1
; /* W */
2814 if (outinfo
->writes_pointsize
||
2815 outinfo
->writes_layer
||
2816 outinfo
->writes_viewport_index
) {
2817 pos_args
[1].enabled_channels
= ((outinfo
->writes_pointsize
== true ? 1 : 0) |
2818 (outinfo
->writes_layer
== true ? 4 : 0));
2819 pos_args
[1].valid_mask
= 0;
2820 pos_args
[1].done
= 0;
2821 pos_args
[1].target
= V_008DFC_SQ_EXP_POS
+ 1;
2822 pos_args
[1].compr
= 0;
2823 pos_args
[1].out
[0] = ctx
->ac
.f32_0
; /* X */
2824 pos_args
[1].out
[1] = ctx
->ac
.f32_0
; /* Y */
2825 pos_args
[1].out
[2] = ctx
->ac
.f32_0
; /* Z */
2826 pos_args
[1].out
[3] = ctx
->ac
.f32_0
; /* W */
2828 if (outinfo
->writes_pointsize
== true)
2829 pos_args
[1].out
[0] = psize_value
;
2830 if (outinfo
->writes_layer
== true)
2831 pos_args
[1].out
[2] = layer_value
;
2832 if (outinfo
->writes_viewport_index
== true) {
2833 if (ctx
->options
->chip_class
>= GFX9
) {
2834 /* GFX9 has the layer in out.z[10:0] and the viewport
2835 * index in out.z[19:16].
2837 LLVMValueRef v
= viewport_value
;
2838 v
= ac_to_integer(&ctx
->ac
, v
);
2839 v
= LLVMBuildShl(ctx
->ac
.builder
, v
,
2840 LLVMConstInt(ctx
->ac
.i32
, 16, false),
2842 v
= LLVMBuildOr(ctx
->ac
.builder
, v
,
2843 ac_to_integer(&ctx
->ac
, pos_args
[1].out
[2]), "");
2845 pos_args
[1].out
[2] = ac_to_float(&ctx
->ac
, v
);
2846 pos_args
[1].enabled_channels
|= 1 << 2;
2848 pos_args
[1].out
[3] = viewport_value
;
2849 pos_args
[1].enabled_channels
|= 1 << 3;
2854 for (i
= 0; i
< 4; i
++) {
2855 if (pos_args
[i
].out
[0])
2856 outinfo
->pos_exports
++;
2859 /* Navi10-14 skip POS0 exports if EXEC=0 and DONE=0, causing a hang.
2860 * Setting valid_mask=1 prevents it and has no other effect.
2862 if (ctx
->ac
.family
== CHIP_NAVI10
||
2863 ctx
->ac
.family
== CHIP_NAVI12
||
2864 ctx
->ac
.family
== CHIP_NAVI14
)
2865 pos_args
[0].valid_mask
= 1;
2868 for (i
= 0; i
< 4; i
++) {
2869 if (!pos_args
[i
].out
[0])
2872 /* Specify the target we are exporting */
2873 pos_args
[i
].target
= V_008DFC_SQ_EXP_POS
+ pos_idx
++;
2875 if (pos_idx
== outinfo
->pos_exports
)
2876 /* Specify that this is the last export */
2877 pos_args
[i
].done
= 1;
2879 ac_build_export(&ctx
->ac
, &pos_args
[i
]);
2882 /* Build parameter exports */
2883 radv_build_param_exports(ctx
, outputs
, noutput
, outinfo
, export_clip_dists
);
2887 handle_vs_outputs_post(struct radv_shader_context
*ctx
,
2888 bool export_prim_id
,
2889 bool export_clip_dists
,
2890 struct radv_vs_output_info
*outinfo
)
2892 struct radv_shader_output_values
*outputs
;
2893 unsigned noutput
= 0;
2895 if (ctx
->options
->key
.has_multiview_view_index
) {
2896 LLVMValueRef
* tmp_out
= &ctx
->abi
.outputs
[ac_llvm_reg_index_soa(VARYING_SLOT_LAYER
, 0)];
2898 for(unsigned i
= 0; i
< 4; ++i
)
2899 ctx
->abi
.outputs
[ac_llvm_reg_index_soa(VARYING_SLOT_LAYER
, i
)] =
2900 ac_build_alloca_undef(&ctx
->ac
, ctx
->ac
.f32
, "");
2903 LLVMBuildStore(ctx
->ac
.builder
, ac_to_float(&ctx
->ac
, ctx
->abi
.view_index
), *tmp_out
);
2904 ctx
->output_mask
|= 1ull << VARYING_SLOT_LAYER
;
2907 memset(outinfo
->vs_output_param_offset
, AC_EXP_PARAM_UNDEFINED
,
2908 sizeof(outinfo
->vs_output_param_offset
));
2909 outinfo
->pos_exports
= 0;
2911 if (ctx
->output_mask
& (1ull << VARYING_SLOT_PSIZ
)) {
2912 outinfo
->writes_pointsize
= true;
2915 if (ctx
->output_mask
& (1ull << VARYING_SLOT_LAYER
)) {
2916 outinfo
->writes_layer
= true;
2919 if (ctx
->output_mask
& (1ull << VARYING_SLOT_VIEWPORT
)) {
2920 outinfo
->writes_viewport_index
= true;
2923 if (ctx
->shader_info
->info
.so
.num_outputs
&&
2924 !ctx
->is_gs_copy_shader
) {
2925 /* The GS copy shader emission already emits streamout. */
2926 radv_emit_streamout(ctx
, 0);
2929 /* Allocate a temporary array for the output values. */
2930 unsigned num_outputs
= util_bitcount64(ctx
->output_mask
) + export_prim_id
;
2931 outputs
= malloc(num_outputs
* sizeof(outputs
[0]));
2933 for (unsigned i
= 0; i
< AC_LLVM_MAX_OUTPUTS
; ++i
) {
2934 if (!(ctx
->output_mask
& (1ull << i
)))
2937 outputs
[noutput
].slot_name
= i
;
2938 outputs
[noutput
].slot_index
= i
== VARYING_SLOT_CLIP_DIST1
;
2940 if (ctx
->stage
== MESA_SHADER_VERTEX
&&
2941 !ctx
->is_gs_copy_shader
) {
2942 outputs
[noutput
].usage_mask
=
2943 ctx
->shader_info
->info
.vs
.output_usage_mask
[i
];
2944 } else if (ctx
->stage
== MESA_SHADER_TESS_EVAL
) {
2945 outputs
[noutput
].usage_mask
=
2946 ctx
->shader_info
->info
.tes
.output_usage_mask
[i
];
2948 assert(ctx
->is_gs_copy_shader
);
2949 outputs
[noutput
].usage_mask
=
2950 ctx
->shader_info
->info
.gs
.output_usage_mask
[i
];
2953 for (unsigned j
= 0; j
< 4; j
++) {
2954 outputs
[noutput
].values
[j
] =
2955 ac_to_float(&ctx
->ac
, radv_load_output(ctx
, i
, j
));
2961 /* Export PrimitiveID. */
2962 if (export_prim_id
) {
2963 outinfo
->export_prim_id
= true;
2965 outputs
[noutput
].slot_name
= VARYING_SLOT_PRIMITIVE_ID
;
2966 outputs
[noutput
].slot_index
= 0;
2967 outputs
[noutput
].usage_mask
= 0x1;
2968 outputs
[noutput
].values
[0] = ctx
->vs_prim_id
;
2969 for (unsigned j
= 1; j
< 4; j
++)
2970 outputs
[noutput
].values
[j
] = ctx
->ac
.f32_0
;
2974 radv_llvm_export_vs(ctx
, outputs
, noutput
, outinfo
, export_clip_dists
);
2980 handle_es_outputs_post(struct radv_shader_context
*ctx
,
2981 struct radv_es_output_info
*outinfo
)
2984 uint64_t max_output_written
= 0;
2985 LLVMValueRef lds_base
= NULL
;
2987 for (unsigned i
= 0; i
< AC_LLVM_MAX_OUTPUTS
; ++i
) {
2990 if (!(ctx
->output_mask
& (1ull << i
)))
2993 param_index
= shader_io_get_unique_index(i
);
2995 max_output_written
= MAX2(param_index
, max_output_written
);
2998 outinfo
->esgs_itemsize
= (max_output_written
+ 1) * 16;
3000 if (ctx
->ac
.chip_class
>= GFX9
) {
3001 unsigned itemsize_dw
= outinfo
->esgs_itemsize
/ 4;
3002 LLVMValueRef vertex_idx
= ac_get_thread_id(&ctx
->ac
);
3003 LLVMValueRef wave_idx
= ac_unpack_param(&ctx
->ac
, ctx
->merged_wave_info
, 24, 4);
3004 vertex_idx
= LLVMBuildOr(ctx
->ac
.builder
, vertex_idx
,
3005 LLVMBuildMul(ctx
->ac
.builder
, wave_idx
,
3006 LLVMConstInt(ctx
->ac
.i32
,
3007 ctx
->ac
.wave_size
, false), ""), "");
3008 lds_base
= LLVMBuildMul(ctx
->ac
.builder
, vertex_idx
,
3009 LLVMConstInt(ctx
->ac
.i32
, itemsize_dw
, 0), "");
3012 for (unsigned i
= 0; i
< AC_LLVM_MAX_OUTPUTS
; ++i
) {
3013 LLVMValueRef dw_addr
= NULL
;
3014 LLVMValueRef
*out_ptr
= &ctx
->abi
.outputs
[i
* 4];
3015 unsigned output_usage_mask
;
3018 if (!(ctx
->output_mask
& (1ull << i
)))
3021 if (ctx
->stage
== MESA_SHADER_VERTEX
) {
3023 ctx
->shader_info
->info
.vs
.output_usage_mask
[i
];
3025 assert(ctx
->stage
== MESA_SHADER_TESS_EVAL
);
3027 ctx
->shader_info
->info
.tes
.output_usage_mask
[i
];
3030 param_index
= shader_io_get_unique_index(i
);
3033 dw_addr
= LLVMBuildAdd(ctx
->ac
.builder
, lds_base
,
3034 LLVMConstInt(ctx
->ac
.i32
, param_index
* 4, false),
3038 for (j
= 0; j
< 4; j
++) {
3039 if (!(output_usage_mask
& (1 << j
)))
3042 LLVMValueRef out_val
= LLVMBuildLoad(ctx
->ac
.builder
, out_ptr
[j
], "");
3043 out_val
= ac_to_integer(&ctx
->ac
, out_val
);
3044 out_val
= LLVMBuildZExtOrBitCast(ctx
->ac
.builder
, out_val
, ctx
->ac
.i32
, "");
3046 if (ctx
->ac
.chip_class
>= GFX9
) {
3047 LLVMValueRef dw_addr_offset
=
3048 LLVMBuildAdd(ctx
->ac
.builder
, dw_addr
,
3049 LLVMConstInt(ctx
->ac
.i32
,
3052 ac_lds_store(&ctx
->ac
, dw_addr_offset
, out_val
);
3054 ac_build_buffer_store_dword(&ctx
->ac
,
3057 NULL
, ctx
->es2gs_offset
,
3058 (4 * param_index
+ j
) * 4,
3059 ac_glc
| ac_slc
, true);
/* LS (vertex shader feeding tess control) epilogue: write all written outputs
 * into LDS at rel_auto_id * vertex_dw_stride, one dword per channel, so the
 * TCS can read them back. The stride is 4 dwords per written TCS input slot.
 * NOTE(review): extraction dropped interior lines (the `continue` after the
 * output_mask test around 3078, the trailing "" argument of the dw_addr add
 * around 3083, closing braces); annotated as-is. */
3066 handle_ls_outputs_post(struct radv_shader_context
*ctx
)
3068 LLVMValueRef vertex_id
= ctx
->rel_auto_id
;
/* Per-vertex LDS stride in dwords: 4 dwords per written LS output slot. */
3069 uint32_t num_tcs_inputs
= util_last_bit64(ctx
->shader_info
->info
.vs
.ls_outputs_written
);
3070 LLVMValueRef vertex_dw_stride
= LLVMConstInt(ctx
->ac
.i32
, num_tcs_inputs
* 4, false);
3071 LLVMValueRef base_dw_addr
= LLVMBuildMul(ctx
->ac
.builder
, vertex_id
,
3072 vertex_dw_stride
, "");
/* Walk every potentially-written output slot. */
3074 for (unsigned i
= 0; i
< AC_LLVM_MAX_OUTPUTS
; ++i
) {
3075 LLVMValueRef
*out_ptr
= &ctx
->abi
.outputs
[i
* 4];
3077 if (!(ctx
->output_mask
& (1ull << i
)))
/* Slot's dword address = base + unique_index * 4. */
3080 int param
= shader_io_get_unique_index(i
);
3081 LLVMValueRef dw_addr
= LLVMBuildAdd(ctx
->ac
.builder
, base_dw_addr
,
3082 LLVMConstInt(ctx
->ac
.i32
, param
* 4, false),
/* Store the four channels as consecutive dwords. */
3084 for (unsigned j
= 0; j
< 4; j
++) {
3085 LLVMValueRef value
= LLVMBuildLoad(ctx
->ac
.builder
, out_ptr
[j
], "");
3086 value
= ac_to_integer(&ctx
->ac
, value
);
3087 value
= LLVMBuildZExtOrBitCast(ctx
->ac
.builder
, value
, ctx
->ac
.i32
, "");
3088 ac_lds_store(&ctx
->ac
, dw_addr
, value
);
3089 dw_addr
= LLVMBuildAdd(ctx
->ac
.builder
, dw_addr
, ctx
->ac
.i32_1
, "");
3094 static LLVMValueRef
/* Wave index within the threadgroup: bits [27:24] of merged_wave_info. */
get_wave_id_in_tg(struct radv_shader_context
*ctx
)
3096 return ac_unpack_param(&ctx
->ac
, ctx
->merged_wave_info
, 24, 4);
3099 static LLVMValueRef
/* Threadgroup size (in waves): bits [31:28] of merged_wave_info. */
get_tgsize(struct radv_shader_context
*ctx
)
3101 return ac_unpack_param(&ctx
->ac
, ctx
->merged_wave_info
, 28, 4);
3104 static LLVMValueRef
/* Flat thread index within the threadgroup:
 * wave_id * wave_size + lane_id.
 * NOTE(review): the declaration of `tmp` (original line ~3107) was dropped
 * by extraction. */
get_thread_id_in_tg(struct radv_shader_context
*ctx
)
3106 LLVMBuilderRef builder
= ctx
->ac
.builder
;
3108 tmp
= LLVMBuildMul(builder
, get_wave_id_in_tg(ctx
),
3109 LLVMConstInt(ctx
->ac
.i32
, ctx
->ac
.wave_size
, false), "");
3110 return LLVMBuildAdd(builder
, tmp
, ac_get_thread_id(&ctx
->ac
), "");
3113 static LLVMValueRef
/* NGG: vertex count for the threadgroup, a 9-bit field at bit 12 of
 * gs_tg_info (extracted with ac_build_bfe).
 * NOTE(review): the closing argument/paren line of the call (original ~3118)
 * was dropped by extraction. */
ngg_get_vtx_cnt(struct radv_shader_context
*ctx
)
3115 return ac_build_bfe(&ctx
->ac
, ctx
->gs_tg_info
,
3116 LLVMConstInt(ctx
->ac
.i32
, 12, false),
3117 LLVMConstInt(ctx
->ac
.i32
, 9, false),
/* Primitive count for the whole NGG threadgroup, extracted from the
 * gs_tg_info SGPR (9-bit field starting at bit 22).
 */
static LLVMValueRef
ngg_get_prim_cnt(struct radv_shader_context *ctx)
{
	return ac_build_bfe(&ctx->ac, ctx->gs_tg_info,
			    LLVMConstInt(ctx->ac.i32, 22, false),
			    LLVMConstInt(ctx->ac.i32, 9, false),
			    false);
}
/* Reinterpret the NGG GS emit area in LDS as an unbounded array of
 * per-vertex records: { [4*num_outputs x i32] data, [4 x i8] primflags },
 * one i8 primitive flag per output stream.
 */
static LLVMValueRef
ngg_gs_get_vertex_storage(struct radv_shader_context *ctx)
{
	unsigned num_outputs = util_bitcount64(ctx->output_mask);

	LLVMTypeRef elements[2] = {
		LLVMArrayType(ctx->ac.i32, 4 * num_outputs),
		LLVMArrayType(ctx->ac.i8, 4),
	};
	LLVMTypeRef type = LLVMStructTypeInContext(ctx->ac.context, elements, 2, false);
	/* Unsized outer array: the actual vertex count is only known at
	 * runtime; indexing is done via ac_build_gep0.
	 */
	type = LLVMPointerType(LLVMArrayType(type, 0), AC_ADDR_SPACE_LDS);
	return LLVMBuildBitCast(ctx->ac.builder, ctx->gs_ngg_emit, type, "");
}
3144 * Return a pointer to the LDS storage reserved for the N'th vertex, where N
3145 * is in emit order; that is:
3146 * - during the epilogue, N is the threadidx (relative to the entire threadgroup)
3147 * - during vertex emit, i.e. while the API GS shader invocation is running,
3148 * N = threadidx * gs_max_out_vertices + emitidx
3150 * Goals of the LDS memory layout:
3151 * 1. Eliminate bank conflicts on write for geometry shaders that have all emits
3152 * in uniform control flow
 3153 * 2. Eliminate bank conflicts on read for export if, additionally, there is no
 * vertex reuse.
3155 * 3. Agnostic to the number of waves (since we don't know it before compiling)
3156 * 4. Allow coalescing of LDS instructions (ds_write_b128 etc.)
3157 * 5. Avoid wasting memory.
3159 * We use an AoS layout due to point 4 (this also helps point 3). In an AoS
3160 * layout, elimination of bank conflicts requires that each vertex occupy an
3161 * odd number of dwords. We use the additional dword to store the output stream
3162 * index as well as a flag to indicate whether this vertex ends a primitive
3163 * for rasterization.
3165 * Swizzling is required to satisfy points 1 and 2 simultaneously.
3167 * Vertices are stored in export order (gsthread * gs_max_out_vertices + emitidx).
3168 * Indices are swizzled in groups of 32, which ensures point 1 without
3169 * disturbing point 2.
3171 * \return an LDS pointer to type {[N x i32], [4 x i8]}
static LLVMValueRef
ngg_gs_vertex_ptr(struct radv_shader_context *ctx, LLVMValueRef vertexidx)
{
	LLVMBuilderRef builder = ctx->ac.builder;
	LLVMValueRef storage = ngg_gs_get_vertex_storage(ctx);

	/* gs_max_out_vertices = 2^(write_stride_2exp) * some odd number */
	unsigned write_stride_2exp = ffs(ctx->gs_max_out_vertices) - 1;
	if (write_stride_2exp) {
		/* Swizzle indices in groups of 32 (XOR with the group number
		 * masked by the stride) to avoid LDS bank conflicts; see the
		 * layout comment above this function.
		 */
		LLVMValueRef row =
			LLVMBuildLShr(builder, vertexidx,
				      LLVMConstInt(ctx->ac.i32, 5, false), "");
		LLVMValueRef swizzle =
			LLVMBuildAnd(builder, row,
				     LLVMConstInt(ctx->ac.i32, (1u << write_stride_2exp) - 1,
						  false), "");
		vertexidx = LLVMBuildXor(builder, vertexidx, swizzle, "");
	}

	return ac_build_gep0(&ctx->ac, storage, vertexidx);
}
3196 ngg_gs_emit_vertex_ptr(struct radv_shader_context
*ctx
, LLVMValueRef gsthread
,
3197 LLVMValueRef emitidx
)
3199 LLVMBuilderRef builder
= ctx
->ac
.builder
;
3202 tmp
= LLVMConstInt(ctx
->ac
.i32
, ctx
->gs_max_out_vertices
, false);
3203 tmp
= LLVMBuildMul(builder
, tmp
, gsthread
, "");
3204 const LLVMValueRef vertexidx
= LLVMBuildAdd(builder
, tmp
, emitidx
, "");
3205 return ngg_gs_vertex_ptr(ctx
, vertexidx
);
/* Send GS Alloc Req message from the first wave of the group to SPI.
 * Message payload is:
 * - bits 0..10: vertices in group
 * - bits 12..22: primitives in group
 */
static void build_sendmsg_gs_alloc_req(struct radv_shader_context *ctx,
				       LLVMValueRef vtx_cnt,
				       LLVMValueRef prim_cnt)
{
	LLVMBuilderRef builder = ctx->ac.builder;
	LLVMValueRef tmp;

	/* Only wave 0 of the threadgroup sends the message. */
	tmp = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->ac.i32_0, "");
	ac_build_ifcc(&ctx->ac, tmp, 5020);

	tmp = LLVMBuildShl(builder, prim_cnt, LLVMConstInt(ctx->ac.i32, 12, false),"");
	tmp = LLVMBuildOr(builder, tmp, vtx_cnt, "");
	ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_ALLOC_REQ, tmp);

	ac_build_endif(&ctx->ac, 5020);
}
/* Description of one primitive to be exported via exp prim
 * (see build_export_prim for the packed encoding).
 */
struct ngg_prim {
	unsigned num_vertices;     /* vertices per primitive (up to 3) */
	LLVMValueRef isnull;       /* i1: null primitive, skipped by the PA */
	LLVMValueRef index[3];     /* i32 vertex indices */
	LLVMValueRef edgeflag[3];  /* i1 per-vertex edge flags */
};
/* Export one packed primitive descriptor (exp prim):
 * bit 31 = null primitive, bits 10*i..10*i+8 = index of vertex i,
 * bit 10*i+9 = edge flag of vertex i.
 */
static void build_export_prim(struct radv_shader_context *ctx,
			      const struct ngg_prim *prim)
{
	LLVMBuilderRef builder = ctx->ac.builder;
	struct ac_export_args args;
	LLVMValueRef tmp;

	tmp = LLVMBuildZExt(builder, prim->isnull, ctx->ac.i32, "");
	args.out[0] = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->ac.i32, 31, false), "");

	for (unsigned i = 0; i < prim->num_vertices; ++i) {
		tmp = LLVMBuildShl(builder, prim->index[i],
				   LLVMConstInt(ctx->ac.i32, 10 * i, false), "");
		args.out[0] = LLVMBuildOr(builder, args.out[0], tmp, "");
		tmp = LLVMBuildZExt(builder, prim->edgeflag[i], ctx->ac.i32, "");
		tmp = LLVMBuildShl(builder, tmp,
				   LLVMConstInt(ctx->ac.i32, 10 * i + 9, false), "");
		args.out[0] = LLVMBuildOr(builder, args.out[0], tmp, "");
	}

	/* The descriptor goes out through channel 0 of a single export;
	 * the remaining channels are unused.
	 */
	args.out[0] = LLVMBuildBitCast(builder, args.out[0], ctx->ac.f32, "");
	args.out[1] = LLVMGetUndef(ctx->ac.f32);
	args.out[2] = LLVMGetUndef(ctx->ac.f32);
	args.out[3] = LLVMGetUndef(ctx->ac.f32);

	args.target = V_008DFC_SQ_EXP_PRIM;
	args.enabled_channels = 1;
	args.done = true;
	args.valid_mask = false;
	args.compr = false;

	ac_build_export(&ctx->ac, &args);
}
/* NGG epilogue for non-GS stages (VS or TES): export primitive descriptors
 * from GS threads and vertex position/parameter data from ES threads.
 */
static void
handle_ngg_outputs_post(struct radv_shader_context *ctx)
{
	LLVMBuilderRef builder = ctx->ac.builder;
	struct ac_build_if_state if_state;
	unsigned num_vertices = 3;
	LLVMValueRef tmp;

	assert((ctx->stage == MESA_SHADER_VERTEX ||
	        ctx->stage == MESA_SHADER_TESS_EVAL) && !ctx->is_gs_copy_shader);

	/* Per-wave primitive/vertex counts packed into merged_wave_info. */
	LLVMValueRef prims_in_wave = ac_unpack_param(&ctx->ac, ctx->merged_wave_info, 8, 8);
	LLVMValueRef vtx_in_wave = ac_unpack_param(&ctx->ac, ctx->merged_wave_info, 0, 8);
	LLVMValueRef is_gs_thread = LLVMBuildICmp(builder, LLVMIntULT,
						  ac_get_thread_id(&ctx->ac), prims_in_wave, "");
	LLVMValueRef is_es_thread = LLVMBuildICmp(builder, LLVMIntULT,
						  ac_get_thread_id(&ctx->ac), vtx_in_wave, "");
	/* Three vertex indices, two packed per gs_vtx_offset register. */
	LLVMValueRef vtxindex[] = {
		ac_unpack_param(&ctx->ac, ctx->gs_vtx_offset[0], 0, 16),
		ac_unpack_param(&ctx->ac, ctx->gs_vtx_offset[0], 16, 16),
		ac_unpack_param(&ctx->ac, ctx->gs_vtx_offset[2], 0, 16),
	};

	/* TODO: streamout */

	/* Copy Primitive IDs from GS threads to the LDS address corresponding
	 * to the ES thread of the provoking vertex.
	 */
	if (ctx->stage == MESA_SHADER_VERTEX &&
	    ctx->options->key.vs_common_out.export_prim_id) {
		/* TODO: streamout */

		ac_build_ifcc(&ctx->ac, is_gs_thread, 5400);
		/* Extract the PROVOKING_VTX_INDEX field. */
		LLVMValueRef provoking_vtx_in_prim =
			LLVMConstInt(ctx->ac.i32, 0, false);

		/* provoking_vtx_index = vtxindex[provoking_vtx_in_prim]; */
		LLVMValueRef indices = ac_build_gather_values(&ctx->ac, vtxindex, 3);
		LLVMValueRef provoking_vtx_index =
			LLVMBuildExtractElement(builder, indices, provoking_vtx_in_prim, "");

		LLVMBuildStore(builder, ctx->abi.gs_prim_id,
			       ac_build_gep0(&ctx->ac, ctx->esgs_ring, provoking_vtx_index));
		ac_build_endif(&ctx->ac, 5400);
	}

	/* TODO: primitive culling */

	build_sendmsg_gs_alloc_req(ctx, ngg_get_vtx_cnt(ctx), ngg_get_prim_cnt(ctx));

	/* TODO: streamout queries */

	/* Export primitive data to the index buffer. Format is:
	 * - bits 0..8: index 0
	 * - bit 9: edge flag 0
	 * - bits 10..18: index 1
	 * - bit 19: edge flag 1
	 * - bits 20..28: index 2
	 * - bit 29: edge flag 2
	 * - bit 31: null primitive (skip)
	 *
	 * For the first version, we will always build up all three indices
	 * independent of the primitive type. The additional garbage data
	 * shouldn't hurt.
	 *
	 * TODO: culling depends on the primitive type, so can have some
	 * interaction here.
	 */
	ac_nir_build_if(&if_state, ctx, is_gs_thread);
	{
		struct ngg_prim prim = {};

		prim.num_vertices = num_vertices;
		prim.isnull = ctx->ac.i1false;
		memcpy(prim.index, vtxindex, sizeof(vtxindex[0]) * 3);

		for (unsigned i = 0; i < num_vertices; ++i) {
			/* Edge flags arrive in bits 8..10 of gs_invocation_id. */
			tmp = LLVMBuildLShr(builder, ctx->abi.gs_invocation_id,
					    LLVMConstInt(ctx->ac.i32, 8 + i, false), "");
			prim.edgeflag[i] = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
		}

		build_export_prim(ctx, &prim);
	}
	ac_nir_build_endif(&if_state);

	/* Export per-vertex data (positions and parameters). */
	ac_nir_build_if(&if_state, ctx, is_es_thread);
	{
		struct radv_vs_output_info *outinfo =
			ctx->stage == MESA_SHADER_TESS_EVAL ? &ctx->shader_info->tes.outinfo : &ctx->shader_info->vs.outinfo;

		/* Exporting the primitive ID is handled below. */
		/* TODO: use the new VS export path */
		handle_vs_outputs_post(ctx, false,
				       ctx->options->key.vs_common_out.export_clip_dists,
				       outinfo);

		if (ctx->options->key.vs_common_out.export_prim_id) {
			unsigned param_count = outinfo->param_exports;
			LLVMValueRef values[4];

			if (ctx->stage == MESA_SHADER_VERTEX) {
				/* Wait for GS stores to finish. */
				ac_build_s_barrier(&ctx->ac);

				/* Read back the prim ID written by the GS
				 * thread of the provoking vertex above.
				 */
				tmp = ac_build_gep0(&ctx->ac, ctx->esgs_ring,
						    get_thread_id_in_tg(ctx));
				values[0] = LLVMBuildLoad(builder, tmp, "");
			} else {
				assert(ctx->stage == MESA_SHADER_TESS_EVAL);
				values[0] = ctx->abi.tes_patch_id;
			}

			values[0] = ac_to_float(&ctx->ac, values[0]);
			for (unsigned j = 1; j < 4; j++)
				values[j] = ctx->ac.f32_0;

			radv_export_param(ctx, param_count, values, 0x1);

			outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] = param_count++;
			outinfo->export_prim_id = true;
			outinfo->param_exports = param_count;
		}
	}
	ac_nir_build_endif(&if_state);
}
static void gfx10_ngg_gs_emit_prologue(struct radv_shader_context *ctx)
{
	/* Zero out the part of LDS scratch that is used to accumulate the
	 * per-stream generated primitive count.
	 */
	LLVMBuilderRef builder = ctx->ac.builder;
	LLVMValueRef scratchptr = ctx->gs_ngg_scratch;
	LLVMValueRef tid = get_thread_id_in_tg(ctx);
	LLVMBasicBlockRef merge_block;
	LLVMValueRef cond;

	LLVMValueRef fn = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx->ac.builder));
	LLVMBasicBlockRef then_block = LLVMAppendBasicBlockInContext(ctx->ac.context, fn, "");
	merge_block = LLVMAppendBasicBlockInContext(ctx->ac.context, fn, "");

	/* Only the first 4 threads clear the 4 scratch dwords
	 * (one per output stream).
	 */
	cond = LLVMBuildICmp(builder, LLVMIntULT, tid, LLVMConstInt(ctx->ac.i32, 4, false), "");
	LLVMBuildCondBr(ctx->ac.builder, cond, then_block, merge_block);
	LLVMPositionBuilderAtEnd(ctx->ac.builder, then_block);

	LLVMValueRef ptr = ac_build_gep0(&ctx->ac, scratchptr, tid);
	LLVMBuildStore(builder, ctx->ac.i32_0, ptr);

	LLVMBuildBr(ctx->ac.builder, merge_block);
	LLVMPositionBuilderAtEnd(ctx->ac.builder, merge_block);

	/* Make the clears visible to the whole threadgroup before any
	 * thread starts accumulating.
	 */
	ac_build_s_barrier(&ctx->ac);
}
/* First half of the NGG GS epilogue, run by each GS thread after its last
 * EmitVertex: clear the primitive flags of all vertex slots this thread
 * never emitted, so the export pass can rely on them being zero.
 */
static void gfx10_ngg_gs_emit_epilogue_1(struct radv_shader_context *ctx)
{
	LLVMBuilderRef builder = ctx->ac.builder;
	LLVMValueRef i8_0 = LLVMConstInt(ctx->ac.i8, 0, false);
	LLVMValueRef tmp;

	/* Zero out remaining (non-emitted) primitive flags.
	 *
	 * Note: Alternatively, we could pass the relevant gs_next_vertex to
	 * the emit threads via LDS. This is likely worse in the expected
	 * typical case where each GS thread emits the full set of
	 * vertices.
	 */
	for (unsigned stream = 0; stream < 4; ++stream) {
		unsigned num_components;

		num_components =
			ctx->shader_info->info.gs.num_stream_output_components[stream];
		if (!num_components)
			continue;

		const LLVMValueRef gsthread = get_thread_id_in_tg(ctx);

		ac_build_bgnloop(&ctx->ac, 5100);

		/* Loop from the first un-emitted vertex up to the maximum. */
		const LLVMValueRef vertexidx =
			LLVMBuildLoad(builder, ctx->gs_next_vertex[stream], "");
		tmp = LLVMBuildICmp(builder, LLVMIntUGE, vertexidx,
			LLVMConstInt(ctx->ac.i32, ctx->gs_max_out_vertices, false), "");
		ac_build_ifcc(&ctx->ac, tmp, 5101);
		ac_build_break(&ctx->ac);
		ac_build_endif(&ctx->ac, 5101);

		tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, "");
		LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]);

		/* Zero this stream's primflag byte of the vertex record. */
		tmp = ngg_gs_emit_vertex_ptr(ctx, gsthread, vertexidx);
		LLVMValueRef gep_idx[3] = {
			ctx->ac.i32_0, /* implied C-style array */
			ctx->ac.i32_1, /* second entry of struct */
			LLVMConstInt(ctx->ac.i32, stream, false),
		};
		tmp = LLVMBuildGEP(builder, tmp, gep_idx, 3, "");
		LLVMBuildStore(builder, i8_0, tmp);

		ac_build_endloop(&ctx->ac, 5100);
	}
}
/* Second half of the NGG GS epilogue, run once per threadgroup after all
 * waves finished emitting: compact live vertices, allocate export space,
 * then export primitive and per-vertex data.
 */
static void gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx)
{
	const unsigned verts_per_prim = si_conv_gl_prim_to_vertices(ctx->gs_output_prim);
	LLVMBuilderRef builder = ctx->ac.builder;
	LLVMValueRef tmp, tmp2;

	/* Wait for all GS threads' epilogue_1 LDS writes. */
	ac_build_s_barrier(&ctx->ac);

	const LLVMValueRef tid = get_thread_id_in_tg(ctx);
	LLVMValueRef num_emit_threads = ngg_get_prim_cnt(ctx);

	/* TODO: streamout */

	/* Determine vertex liveness: a vertex is live if any of the
	 * verts_per_prim primitives it can complete is a real primitive.
	 */
	LLVMValueRef vertliveptr = ac_build_alloca(&ctx->ac, ctx->ac.i1, "vertexlive");

	tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, "");
	ac_build_ifcc(&ctx->ac, tmp, 5120);
	{
		for (unsigned i = 0; i < verts_per_prim; ++i) {
			const LLVMValueRef primidx =
				LLVMBuildAdd(builder, tid,
					     LLVMConstInt(ctx->ac.i32, i, false), "");

			if (i > 0) {
				tmp = LLVMBuildICmp(builder, LLVMIntULT, primidx, num_emit_threads, "");
				ac_build_ifcc(&ctx->ac, tmp, 5121 + i);
			}

			/* Load primitive liveness */
			tmp = ngg_gs_vertex_ptr(ctx, primidx);
			LLVMValueRef gep_idx[3] = {
				ctx->ac.i32_0, /* implicit C-style array */
				ctx->ac.i32_1, /* second value of struct */
				ctx->ac.i32_0, /* stream 0 */
			};
			tmp = LLVMBuildGEP(builder, tmp, gep_idx, 3, "");
			tmp = LLVMBuildLoad(builder, tmp, "");
			const LLVMValueRef primlive =
				LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");

			tmp = LLVMBuildLoad(builder, vertliveptr, "");
			tmp = LLVMBuildOr(builder, tmp, primlive, ""),
			LLVMBuildStore(builder, tmp, vertliveptr);

			if (i > 0)
				ac_build_endif(&ctx->ac, 5121 + i);
		}
	}
	ac_build_endif(&ctx->ac, 5120);

	/* Inclusive scan addition across the current wave. */
	LLVMValueRef vertlive = LLVMBuildLoad(builder, vertliveptr, "");
	struct ac_wg_scan vertlive_scan = {};
	vertlive_scan.op = nir_op_iadd;
	vertlive_scan.enable_reduce = true;
	vertlive_scan.enable_exclusive = true;
	vertlive_scan.src = vertlive;
	vertlive_scan.scratch = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, ctx->ac.i32_0);
	vertlive_scan.waveidx = get_wave_id_in_tg(ctx);
	vertlive_scan.numwaves = get_tgsize(ctx);
	vertlive_scan.maxwaves = 8;

	ac_build_wg_scan(&ctx->ac, &vertlive_scan);

	/* Skip all exports (including index exports) when possible. At least on
	 * early gfx10 revisions this is also to avoid hangs.
	 */
	LLVMValueRef have_exports =
		LLVMBuildICmp(builder, LLVMIntNE, vertlive_scan.result_reduce, ctx->ac.i32_0, "");
	num_emit_threads =
		LLVMBuildSelect(builder, have_exports, num_emit_threads, ctx->ac.i32_0, "");

	/* Allocate export space. Send this message as early as possible, to
	 * hide the latency of the SQ <-> SPI roundtrip.
	 *
	 * Note: We could consider compacting primitives for export as well.
	 * PA processes 1 non-null prim / clock, but it fetches 4 DW of
	 * prim data per clock and skips null primitives at no additional
	 * cost. So compacting primitives can only be beneficial when
	 * there are 4 or more contiguous null primitives in the export
	 * (in the common case of single-dword prim exports).
	 */
	build_sendmsg_gs_alloc_req(ctx, vertlive_scan.result_reduce, num_emit_threads);

	/* Setup the reverse vertex compaction permutation. We re-use stream 1
	 * of the primitive liveness flags, relying on the fact that each
	 * threadgroup can have at most 256 threads. */
	ac_build_ifcc(&ctx->ac, vertlive, 5130);
	{
		tmp = ngg_gs_vertex_ptr(ctx, vertlive_scan.result_exclusive);
		LLVMValueRef gep_idx[3] = {
			ctx->ac.i32_0, /* implicit C-style array */
			ctx->ac.i32_1, /* second value of struct */
			ctx->ac.i32_1, /* stream 1 */
		};
		tmp = LLVMBuildGEP(builder, tmp, gep_idx, 3, "");
		tmp2 = LLVMBuildTrunc(builder, tid, ctx->ac.i8, "");
		LLVMBuildStore(builder, tmp2, tmp);
	}
	ac_build_endif(&ctx->ac, 5130);

	ac_build_s_barrier(&ctx->ac);

	/* Export primitive data */
	tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, "");
	ac_build_ifcc(&ctx->ac, tmp, 5140);
	{
		struct ngg_prim prim = {};
		prim.num_vertices = verts_per_prim;

		tmp = ngg_gs_vertex_ptr(ctx, tid);
		LLVMValueRef gep_idx[3] = {
			ctx->ac.i32_0, /* implicit C-style array */
			ctx->ac.i32_1, /* second value of struct */
			ctx->ac.i32_0, /* primflag */
		};
		tmp = LLVMBuildGEP(builder, tmp, gep_idx, 3, "");
		tmp = LLVMBuildLoad(builder, tmp, "");
		prim.isnull = LLVMBuildICmp(builder, LLVMIntEQ, tmp,
					    LLVMConstInt(ctx->ac.i8, 0, false), "");

		for (unsigned i = 0; i < verts_per_prim; ++i) {
			prim.index[i] = LLVMBuildSub(builder, vertlive_scan.result_exclusive,
				LLVMConstInt(ctx->ac.i32, verts_per_prim - i - 1, false), "");
			prim.edgeflag[i] = ctx->ac.i1false;
		}

		build_export_prim(ctx, &prim);
	}
	ac_build_endif(&ctx->ac, 5140);

	/* Export position and parameter data */
	tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, vertlive_scan.result_reduce, "");
	ac_build_ifcc(&ctx->ac, tmp, 5145);
	{
		struct radv_vs_output_info *outinfo = &ctx->shader_info->vs.outinfo;
		bool export_view_index = ctx->options->key.has_multiview_view_index;
		struct radv_shader_output_values *outputs;
		unsigned noutput = 0;

		/* Allocate a temporary array for the output values. */
		unsigned num_outputs = util_bitcount64(ctx->output_mask) + export_view_index;
		outputs = calloc(num_outputs, sizeof(outputs[0]));

		memset(outinfo->vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
		       sizeof(outinfo->vs_output_param_offset));
		outinfo->pos_exports = 0;

		/* Follow the compaction permutation stored in stream 1 to
		 * find which source vertex this thread exports.
		 */
		tmp = ngg_gs_vertex_ptr(ctx, tid);
		LLVMValueRef gep_idx[3] = {
			ctx->ac.i32_0, /* implicit C-style array */
			ctx->ac.i32_1, /* second value of struct */
			ctx->ac.i32_1, /* stream 1: source data index */
		};
		tmp = LLVMBuildGEP(builder, tmp, gep_idx, 3, "");
		tmp = LLVMBuildLoad(builder, tmp, "");
		tmp = LLVMBuildZExt(builder, tmp, ctx->ac.i32, "");
		const LLVMValueRef vertexptr = ngg_gs_vertex_ptr(ctx, tmp);

		if (ctx->output_mask & (1ull << VARYING_SLOT_PSIZ)) {
			outinfo->writes_pointsize = true;
		}

		if (ctx->output_mask & (1ull << VARYING_SLOT_LAYER)) {
			outinfo->writes_layer = true;
		}

		if (ctx->output_mask & (1ull << VARYING_SLOT_VIEWPORT)) {
			outinfo->writes_viewport_index = true;
		}

		unsigned out_idx = 0;
		gep_idx[1] = ctx->ac.i32_0;
		for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
			if (!(ctx->output_mask & (1ull << i)))
				continue;

			outputs[noutput].slot_name = i;
			outputs[noutput].slot_index = i == VARYING_SLOT_CLIP_DIST1;

			outputs[noutput].usage_mask = ctx->shader_info->info.gs.output_usage_mask[i];
			int length = util_last_bit(outputs[noutput].usage_mask);

			for (unsigned j = 0; j < length; j++, out_idx++) {
				gep_idx[2] = LLVMConstInt(ctx->ac.i32, out_idx, false);
				tmp = LLVMBuildGEP(builder, vertexptr, gep_idx, 3, "");
				tmp = LLVMBuildLoad(builder, tmp, "");

				/* 16-bit outputs were widened to a dword on
				 * store; narrow them back before export.
				 */
				LLVMTypeRef type = LLVMGetAllocatedType(ctx->abi.outputs[ac_llvm_reg_index_soa(i, j)]);
				if (ac_get_type_size(type) == 2) {
					tmp = ac_to_integer(&ctx->ac, tmp);
					tmp = LLVMBuildTrunc(ctx->ac.builder, tmp, ctx->ac.i16, "");
				}

				outputs[noutput].values[j] = ac_to_float(&ctx->ac, tmp);
			}

			for (unsigned j = length; j < 4; j++)
				outputs[noutput].values[j] = LLVMGetUndef(ctx->ac.f32);

			noutput++;
		}

		/* Export ViewIndex. */
		if (export_view_index) {
			outinfo->writes_layer = true;

			outputs[noutput].slot_name = VARYING_SLOT_LAYER;
			outputs[noutput].slot_index = 0;
			outputs[noutput].usage_mask = 0x1;
			outputs[noutput].values[0] = ac_to_float(&ctx->ac, ctx->abi.view_index);
			for (unsigned j = 1; j < 4; j++)
				outputs[noutput].values[j] = ctx->ac.f32_0;

			noutput++;
		}

		radv_llvm_export_vs(ctx, outputs, noutput, outinfo,
				    ctx->options->key.vs_common_out.export_clip_dists);
		free(outputs);
	}
	ac_build_endif(&ctx->ac, 5145);
}
/* NGG GS EmitVertex: copy the current output values from their allocas into
 * this thread's per-vertex LDS record and update the primitive bookkeeping.
 */
static void gfx10_ngg_gs_emit_vertex(struct radv_shader_context *ctx,
				     unsigned stream,
				     LLVMValueRef *addrs)
{
	LLVMBuilderRef builder = ctx->ac.builder;
	LLVMValueRef tmp;

	const LLVMValueRef vertexidx =
		LLVMBuildLoad(builder, ctx->gs_next_vertex[stream], "");

	/* If this thread has already emitted the declared maximum number of
	 * vertices, skip the write: excessive vertex emissions are not
	 * supposed to have any effect.
	 */
	const LLVMValueRef can_emit =
		LLVMBuildICmp(builder, LLVMIntULT, vertexidx,
			      LLVMConstInt(ctx->ac.i32, ctx->gs_max_out_vertices, false), "");
	ac_build_kill_if_false(&ctx->ac, can_emit);

	tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, "");
	tmp = LLVMBuildSelect(builder, can_emit, tmp, vertexidx, "");
	LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]);

	const LLVMValueRef vertexptr =
		ngg_gs_emit_vertex_ptr(ctx, get_thread_id_in_tg(ctx), vertexidx);
	unsigned out_idx = 0;
	for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
		unsigned output_usage_mask =
			ctx->shader_info->info.gs.output_usage_mask[i];
		uint8_t output_stream =
			ctx->shader_info->info.gs.output_streams[i];
		LLVMValueRef *out_ptr = &addrs[i * 4];
		int length = util_last_bit(output_usage_mask);

		if (!(ctx->output_mask & (1ull << i)) ||
		    output_stream != stream)
			continue;

		for (unsigned j = 0; j < length; j++, out_idx++) {
			if (!(output_usage_mask & (1 << j)))
				continue;

			LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder,
							     out_ptr[j], "");
			LLVMValueRef gep_idx[3] = {
				ctx->ac.i32_0, /* implied C-style array */
				ctx->ac.i32_0, /* first entry of struct */
				LLVMConstInt(ctx->ac.i32, out_idx, false),
			};
			LLVMValueRef ptr = LLVMBuildGEP(builder, vertexptr, gep_idx, 3, "");

			/* Widen to a full dword so every component occupies
			 * one i32 slot of the vertex record.
			 */
			out_val = ac_to_integer(&ctx->ac, out_val);
			out_val = LLVMBuildZExtOrBitCast(ctx->ac.builder, out_val, ctx->ac.i32, "");

			LLVMBuildStore(builder, out_val, ptr);
		}
	}
	assert(out_idx * 4 <= ctx->gsvs_vertex_size);

	/* Determine and store whether this vertex completed a primitive. */
	const LLVMValueRef curverts = LLVMBuildLoad(builder, ctx->gs_curprim_verts[stream], "");

	tmp = LLVMConstInt(ctx->ac.i32, si_conv_gl_prim_to_vertices(ctx->gs_output_prim) - 1, false);
	const LLVMValueRef iscompleteprim =
		LLVMBuildICmp(builder, LLVMIntUGE, curverts, tmp, "");

	tmp = LLVMBuildAdd(builder, curverts, ctx->ac.i32_1, "");
	LLVMBuildStore(builder, tmp, ctx->gs_curprim_verts[stream]);

	LLVMValueRef gep_idx[3] = {
		ctx->ac.i32_0, /* implied C-style array */
		ctx->ac.i32_1, /* second struct entry */
		LLVMConstInt(ctx->ac.i32, stream, false),
	};
	const LLVMValueRef primflagptr =
		LLVMBuildGEP(builder, vertexptr, gep_idx, 3, "");

	tmp = LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i8, "");
	LLVMBuildStore(builder, tmp, primflagptr);

	/* Count generated primitives for pipeline statistics. */
	tmp = LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], "");
	tmp = LLVMBuildAdd(builder, tmp, LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i32, ""), "");
	LLVMBuildStore(builder, tmp, ctx->gs_generated_prims[stream]);
}
/* TCS epilogue: read the tessellation levels back from LDS and write them to
 * the tess-factor ring buffer (and optionally the off-chip ring for TES).
 */
static void
write_tess_factors(struct radv_shader_context *ctx)
{
	unsigned stride, outer_comps, inner_comps;
	struct ac_build_if_state if_ctx, inner_if_ctx;
	LLVMValueRef invocation_id = ac_unpack_param(&ctx->ac, ctx->abi.tcs_rel_ids, 8, 5);
	LLVMValueRef rel_patch_id = ac_unpack_param(&ctx->ac, ctx->abi.tcs_rel_ids, 0, 8);
	unsigned tess_inner_index = 0, tess_outer_index;
	LLVMValueRef lds_base, lds_inner = NULL, lds_outer, byteoffset, buffer;
	LLVMValueRef out[6], vec0, vec1, tf_base, inner[4], outer[4];
	int i;

	/* All invocations must have finished writing the patch outputs. */
	ac_emit_barrier(&ctx->ac, ctx->stage);

	/* Number of tess factors depends on the domain.
	 * NOTE(review): the switch body was lost in extraction; the values
	 * below are the standard isolines/triangles/quads factor counts —
	 * confirm against the original file.
	 */
	switch (ctx->options->key.tcs.primitive_mode) {
	case GL_ISOLINES:
		stride = 2;
		outer_comps = 2;
		inner_comps = 0;
		break;
	case GL_TRIANGLES:
		stride = 4;
		outer_comps = 3;
		inner_comps = 1;
		break;
	case GL_QUADS:
		stride = 6;
		outer_comps = 4;
		inner_comps = 2;
		break;
	default:
		return;
	}

	/* Only invocation 0 of each patch writes the factors. */
	ac_nir_build_if(&if_ctx, ctx,
			LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ,
				      invocation_id, ctx->ac.i32_0, ""));

	lds_base = get_tcs_out_current_patch_data_offset(ctx);

	if (inner_comps) {
		tess_inner_index = shader_io_get_unique_index(VARYING_SLOT_TESS_LEVEL_INNER);
		lds_inner = LLVMBuildAdd(ctx->ac.builder, lds_base,
					 LLVMConstInt(ctx->ac.i32, tess_inner_index * 4, false), "");
	}

	tess_outer_index = shader_io_get_unique_index(VARYING_SLOT_TESS_LEVEL_OUTER);
	lds_outer = LLVMBuildAdd(ctx->ac.builder, lds_base,
				 LLVMConstInt(ctx->ac.i32, tess_outer_index * 4, false), "");

	for (i = 0; i < 4; i++) {
		inner[i] = LLVMGetUndef(ctx->ac.i32);
		outer[i] = LLVMGetUndef(ctx->ac.i32);
	}

	if (ctx->options->key.tcs.primitive_mode == GL_ISOLINES) {
		/* Isolines: the two outer factors are stored swapped. */
		outer[0] = out[1] = ac_lds_load(&ctx->ac, lds_outer);
		lds_outer = LLVMBuildAdd(ctx->ac.builder, lds_outer,
					 ctx->ac.i32_1, "");
		outer[1] = out[0] = ac_lds_load(&ctx->ac, lds_outer);
	} else {
		for (i = 0; i < outer_comps; i++) {
			outer[i] = out[i] =
				ac_lds_load(&ctx->ac, lds_outer);
			lds_outer = LLVMBuildAdd(ctx->ac.builder, lds_outer,
						 ctx->ac.i32_1, "");
		}
		for (i = 0; i < inner_comps; i++) {
			inner[i] = out[outer_comps+i] =
				ac_lds_load(&ctx->ac, lds_inner);
			lds_inner = LLVMBuildAdd(ctx->ac.builder, lds_inner,
						 ctx->ac.i32_1, "");
		}
	}

	/* Convert the outputs to vectors for stores. */
	vec0 = ac_build_gather_values(&ctx->ac, out, MIN2(stride, 4));
	vec1 = NULL;

	if (stride > 4)
		vec1 = ac_build_gather_values(&ctx->ac, out + 4, stride - 4);

	buffer = ctx->hs_ring_tess_factor;
	tf_base = ctx->tess_factor_offset;
	byteoffset = LLVMBuildMul(ctx->ac.builder, rel_patch_id,
				  LLVMConstInt(ctx->ac.i32, 4 * stride, false), "");
	unsigned tf_offset = 0;

	if (ctx->options->chip_class <= GFX8) {
		ac_nir_build_if(&inner_if_ctx, ctx,
				LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ,
					      rel_patch_id, ctx->ac.i32_0, ""));

		/* Store the dynamic HS control word. */
		ac_build_buffer_store_dword(&ctx->ac, buffer,
					    LLVMConstInt(ctx->ac.i32, 0x80000000, false),
					    1, ctx->ac.i32_0, tf_base,
					    0, ac_glc, false);
		tf_offset += 4;

		ac_nir_build_endif(&inner_if_ctx);
	}

	/* Store the tessellation factors. */
	ac_build_buffer_store_dword(&ctx->ac, buffer, vec0,
				    MIN2(stride, 4), byteoffset, tf_base,
				    tf_offset, ac_glc, false);
	if (vec1)
		ac_build_buffer_store_dword(&ctx->ac, buffer, vec1,
					    stride - 4, byteoffset, tf_base,
					    16 + tf_offset, ac_glc, false);

	//store to offchip for TES to read - only if TES reads them
	if (ctx->options->key.tcs.tes_reads_tess_factors) {
		LLVMValueRef inner_vec, outer_vec, tf_outer_offset;
		LLVMValueRef tf_inner_offset;
		unsigned param_outer, param_inner;

		param_outer = shader_io_get_unique_index(VARYING_SLOT_TESS_LEVEL_OUTER);
		tf_outer_offset = get_tcs_tes_buffer_address(ctx, NULL,
							     LLVMConstInt(ctx->ac.i32, param_outer, 0));

		outer_vec = ac_build_gather_values(&ctx->ac, outer,
						   util_next_power_of_two(outer_comps));

		ac_build_buffer_store_dword(&ctx->ac, ctx->hs_ring_tess_offchip, outer_vec,
					    outer_comps, tf_outer_offset,
					    ctx->oc_lds, 0, ac_glc, false);
		if (inner_comps) {
			param_inner = shader_io_get_unique_index(VARYING_SLOT_TESS_LEVEL_INNER);
			tf_inner_offset = get_tcs_tes_buffer_address(ctx, NULL,
								     LLVMConstInt(ctx->ac.i32, param_inner, 0));

			inner_vec = inner_comps == 1 ? inner[0] :
				ac_build_gather_values(&ctx->ac, inner, inner_comps);
			ac_build_buffer_store_dword(&ctx->ac, ctx->hs_ring_tess_offchip, inner_vec,
						    inner_comps, tf_inner_offset,
						    ctx->oc_lds, 0, ac_glc, false);
		}
	}

	ac_nir_build_endif(&if_ctx);
}
/* TCS epilogue: the only post-processing needed is writing the tess
 * factors (per-vertex/per-patch outputs already live in LDS/ring memory).
 */
static void
handle_tcs_outputs_post(struct radv_shader_context *ctx)
{
	write_tess_factors(ctx);
}
/* Fill in the export arguments for one color MRT.
 * Returns false when the export turned out to be a no-op (no enabled
 * channels), so the caller can skip it.
 */
static bool
si_export_mrt_color(struct radv_shader_context *ctx,
		    LLVMValueRef *color, unsigned index,
		    struct ac_export_args *args)
{
	/* Export */
	si_llvm_init_export_args(ctx, color, 0xf,
				 V_008DFC_SQ_EXP_MRT + index, args);
	if (!args->enabled_channels)
		return false; /* unnecessary NULL export */

	return true;
}
/* Export depth/stencil/sample-mask to the MRTZ target. Any of the three
 * values may be NULL; ac_export_mrt_z sets the channel mask accordingly.
 */
static void
radv_export_mrt_z(struct radv_shader_context *ctx,
		  LLVMValueRef depth, LLVMValueRef stencil,
		  LLVMValueRef samplemask)
{
	struct ac_export_args args;

	ac_export_mrt_z(&ctx->ac, depth, stencil, samplemask, &args);

	ac_build_export(&ctx->ac, &args);
}
/* Fragment shader epilogue: export all written color outputs, then
 * depth/stencil/sample-mask, emitting a null export if nothing was written.
 */
static void
handle_fs_outputs_post(struct radv_shader_context *ctx)
{
	unsigned index = 0;
	LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
	struct ac_export_args color_args[8];

	for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
		LLVMValueRef values[4];

		if (!(ctx->output_mask & (1ull << i)))
			continue;

		/* Depth/stencil/sample-mask slots are handled below. */
		if (i < FRAG_RESULT_DATA0)
			continue;

		for (unsigned j = 0; j < 4; j++)
			values[j] = ac_to_float(&ctx->ac,
						radv_load_output(ctx, i, j));

		bool ret = si_export_mrt_color(ctx, values,
					       i - FRAG_RESULT_DATA0,
					       &color_args[index]);
		if (ret)
			index++;
	}

	/* Process depth, stencil, samplemask. */
	if (ctx->shader_info->info.ps.writes_z) {
		depth = ac_to_float(&ctx->ac,
				    radv_load_output(ctx, FRAG_RESULT_DEPTH, 0));
	}
	if (ctx->shader_info->info.ps.writes_stencil) {
		stencil = ac_to_float(&ctx->ac,
				      radv_load_output(ctx, FRAG_RESULT_STENCIL, 0));
	}
	if (ctx->shader_info->info.ps.writes_sample_mask) {
		samplemask = ac_to_float(&ctx->ac,
					 radv_load_output(ctx, FRAG_RESULT_SAMPLE_MASK, 0));
	}

	/* Set the DONE bit on last non-null color export only if Z isn't
	 * exported.
	 */
	if (index > 0 &&
	    !ctx->shader_info->info.ps.writes_z &&
	    !ctx->shader_info->info.ps.writes_stencil &&
	    !ctx->shader_info->info.ps.writes_sample_mask) {
		unsigned last = index - 1;

		color_args[last].valid_mask = 1; /* whether the EXEC mask is valid */
		color_args[last].done = 1; /* DONE bit */
	}

	/* Export PS outputs. */
	for (unsigned i = 0; i < index; i++)
		ac_build_export(&ctx->ac, &color_args[i]);

	if (depth || stencil || samplemask)
		radv_export_mrt_z(ctx, depth, stencil, samplemask);
	else if (!index)
		/* NOTE(review): a shader must perform at least one export;
		 * the extraction dropped this condition — confirm it is
		 * `!index` in the original file.
		 */
		ac_build_export_null(&ctx->ac);
}
/* GS epilogue: NGG path defers to the LDS-based epilogue; the legacy path
 * signals GS_DONE to the SPI (with a release fence first on GFX10+).
 */
static void
emit_gs_epilogue(struct radv_shader_context *ctx)
{
	if (ctx->options->key.vs_common_out.as_ngg) {
		gfx10_ngg_gs_emit_epilogue_1(ctx);
		return;
	}

	/* GFX10 requires the LDS/memory writes to be visible before the
	 * done message.
	 */
	if (ctx->ac.chip_class >= GFX10)
		LLVMBuildFence(ctx->ac.builder, LLVMAtomicOrderingRelease, false, "");

	ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE, ctx->gs_wave_id);
}
/* ac_shader_abi callback: dispatch the per-stage output epilogue once the
 * NIR body has been translated. `max_outputs` and `addrs` are part of the
 * ABI callback signature; this implementation reads outputs from ctx->abi.
 */
static void
handle_shader_outputs_post(struct ac_shader_abi *abi, unsigned max_outputs,
			   LLVMValueRef *addrs)
{
	struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);

	switch (ctx->stage) {
	case MESA_SHADER_VERTEX:
		if (ctx->options->key.vs_common_out.as_ls)
			handle_ls_outputs_post(ctx);
		else if (ctx->options->key.vs_common_out.as_es)
			handle_es_outputs_post(ctx, &ctx->shader_info->vs.es_info);
		else if (ctx->options->key.vs_common_out.as_ngg)
			break; /* handled outside of the shader body */
		else
			handle_vs_outputs_post(ctx, ctx->options->key.vs_common_out.export_prim_id,
					       ctx->options->key.vs_common_out.export_clip_dists,
					       &ctx->shader_info->vs.outinfo);
		break;
	case MESA_SHADER_FRAGMENT:
		handle_fs_outputs_post(ctx);
		break;
	case MESA_SHADER_GEOMETRY:
		emit_gs_epilogue(ctx);
		break;
	case MESA_SHADER_TESS_CTRL:
		handle_tcs_outputs_post(ctx);
		break;
	case MESA_SHADER_TESS_EVAL:
		if (ctx->options->key.vs_common_out.as_es)
			handle_es_outputs_post(ctx, &ctx->shader_info->tes.es_info);
		else if (ctx->options->key.vs_common_out.as_ngg)
			break; /* handled outside of the shader body */
		else
			handle_vs_outputs_post(ctx, ctx->options->key.vs_common_out.export_prim_id,
					       ctx->options->key.vs_common_out.export_clip_dists,
					       &ctx->shader_info->tes.outinfo);
		break;
	default:
		break;
	}
}
4083 static void ac_llvm_finalize_module(struct radv_shader_context
*ctx
,
4084 LLVMPassManagerRef passmgr
,
4085 const struct radv_nir_compiler_options
*options
)
4087 LLVMRunPassManager(passmgr
, ctx
->ac
.module
);
4088 LLVMDisposeBuilder(ctx
->ac
.builder
);
4090 ac_llvm_context_dispose(&ctx
->ac
);
4094 ac_nir_eliminate_const_vs_outputs(struct radv_shader_context
*ctx
)
4096 struct radv_vs_output_info
*outinfo
;
4098 switch (ctx
->stage
) {
4099 case MESA_SHADER_FRAGMENT
:
4100 case MESA_SHADER_COMPUTE
:
4101 case MESA_SHADER_TESS_CTRL
:
4102 case MESA_SHADER_GEOMETRY
:
4104 case MESA_SHADER_VERTEX
:
4105 if (ctx
->options
->key
.vs_common_out
.as_ls
||
4106 ctx
->options
->key
.vs_common_out
.as_es
)
4108 outinfo
= &ctx
->shader_info
->vs
.outinfo
;
4110 case MESA_SHADER_TESS_EVAL
:
4111 if (ctx
->options
->key
.vs_common_out
.as_es
)
4113 outinfo
= &ctx
->shader_info
->tes
.outinfo
;
4116 unreachable("Unhandled shader type");
4119 ac_optimize_vs_outputs(&ctx
->ac
,
4121 outinfo
->vs_output_param_offset
,
4123 &outinfo
->param_exports
);
4127 ac_setup_rings(struct radv_shader_context
*ctx
)
4129 if (ctx
->options
->chip_class
<= GFX8
&&
4130 (ctx
->stage
== MESA_SHADER_GEOMETRY
||
4131 ctx
->options
->key
.vs_common_out
.as_es
|| ctx
->options
->key
.vs_common_out
.as_es
)) {
4132 unsigned ring
= ctx
->stage
== MESA_SHADER_GEOMETRY
? RING_ESGS_GS
4134 LLVMValueRef offset
= LLVMConstInt(ctx
->ac
.i32
, ring
, false);
4136 ctx
->esgs_ring
= ac_build_load_to_sgpr(&ctx
->ac
,
4141 if (ctx
->is_gs_copy_shader
) {
4143 ac_build_load_to_sgpr(&ctx
->ac
, ctx
->ring_offsets
,
4144 LLVMConstInt(ctx
->ac
.i32
,
4145 RING_GSVS_VS
, false));
4148 if (ctx
->stage
== MESA_SHADER_GEOMETRY
) {
4149 /* The conceptual layout of the GSVS ring is
4150 * v0c0 .. vLv0 v0c1 .. vLc1 ..
4151 * but the real memory layout is swizzled across
4153 * t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
4155 * Override the buffer descriptor accordingly.
4157 LLVMTypeRef v2i64
= LLVMVectorType(ctx
->ac
.i64
, 2);
4158 uint64_t stream_offset
= 0;
4159 unsigned num_records
= ctx
->ac
.wave_size
;
4160 LLVMValueRef base_ring
;
4163 ac_build_load_to_sgpr(&ctx
->ac
, ctx
->ring_offsets
,
4164 LLVMConstInt(ctx
->ac
.i32
,
4165 RING_GSVS_GS
, false));
4167 for (unsigned stream
= 0; stream
< 4; stream
++) {
4168 unsigned num_components
, stride
;
4169 LLVMValueRef ring
, tmp
;
4172 ctx
->shader_info
->info
.gs
.num_stream_output_components
[stream
];
4174 if (!num_components
)
4177 stride
= 4 * num_components
* ctx
->gs_max_out_vertices
;
4179 /* Limit on the stride field for <= GFX7. */
4180 assert(stride
< (1 << 14));
4182 ring
= LLVMBuildBitCast(ctx
->ac
.builder
,
4183 base_ring
, v2i64
, "");
4184 tmp
= LLVMBuildExtractElement(ctx
->ac
.builder
,
4185 ring
, ctx
->ac
.i32_0
, "");
4186 tmp
= LLVMBuildAdd(ctx
->ac
.builder
, tmp
,
4187 LLVMConstInt(ctx
->ac
.i64
,
4188 stream_offset
, 0), "");
4189 ring
= LLVMBuildInsertElement(ctx
->ac
.builder
,
4190 ring
, tmp
, ctx
->ac
.i32_0
, "");
4192 stream_offset
+= stride
* ctx
->ac
.wave_size
;
4194 ring
= LLVMBuildBitCast(ctx
->ac
.builder
, ring
,
4197 tmp
= LLVMBuildExtractElement(ctx
->ac
.builder
, ring
,
4199 tmp
= LLVMBuildOr(ctx
->ac
.builder
, tmp
,
4200 LLVMConstInt(ctx
->ac
.i32
,
4201 S_008F04_STRIDE(stride
), false), "");
4202 ring
= LLVMBuildInsertElement(ctx
->ac
.builder
, ring
, tmp
,
4205 ring
= LLVMBuildInsertElement(ctx
->ac
.builder
, ring
,
4206 LLVMConstInt(ctx
->ac
.i32
,
4207 num_records
, false),
4208 LLVMConstInt(ctx
->ac
.i32
, 2, false), "");
4210 ctx
->gsvs_ring
[stream
] = ring
;
4214 if (ctx
->stage
== MESA_SHADER_TESS_CTRL
||
4215 ctx
->stage
== MESA_SHADER_TESS_EVAL
) {
4216 ctx
->hs_ring_tess_offchip
= ac_build_load_to_sgpr(&ctx
->ac
, ctx
->ring_offsets
, LLVMConstInt(ctx
->ac
.i32
, RING_HS_TESS_OFFCHIP
, false));
4217 ctx
->hs_ring_tess_factor
= ac_build_load_to_sgpr(&ctx
->ac
, ctx
->ring_offsets
, LLVMConstInt(ctx
->ac
.i32
, RING_HS_TESS_FACTOR
, false));
4222 radv_nir_get_max_workgroup_size(enum chip_class chip_class
,
4223 gl_shader_stage stage
,
4224 const struct nir_shader
*nir
)
4226 const unsigned backup_sizes
[] = {chip_class
>= GFX9
? 128 : 64, 1, 1};
4227 return radv_get_max_workgroup_size(chip_class
, stage
, nir
? nir
->info
.cs
.local_size
: backup_sizes
);
4230 /* Fixup the HW not emitting the TCS regs if there are no HS threads. */
4231 static void ac_nir_fixup_ls_hs_input_vgprs(struct radv_shader_context
*ctx
)
4233 LLVMValueRef count
= ac_unpack_param(&ctx
->ac
, ctx
->merged_wave_info
, 8, 8);
4234 LLVMValueRef hs_empty
= LLVMBuildICmp(ctx
->ac
.builder
, LLVMIntEQ
, count
,
4236 ctx
->abi
.instance_id
= LLVMBuildSelect(ctx
->ac
.builder
, hs_empty
, ctx
->rel_auto_id
, ctx
->abi
.instance_id
, "");
4237 ctx
->rel_auto_id
= LLVMBuildSelect(ctx
->ac
.builder
, hs_empty
, ctx
->abi
.tcs_rel_ids
, ctx
->rel_auto_id
, "");
4238 ctx
->abi
.vertex_id
= LLVMBuildSelect(ctx
->ac
.builder
, hs_empty
, ctx
->abi
.tcs_patch_id
, ctx
->abi
.vertex_id
, "");
4241 static void prepare_gs_input_vgprs(struct radv_shader_context
*ctx
)
4243 for(int i
= 5; i
>= 0; --i
) {
4244 ctx
->gs_vtx_offset
[i
] = ac_unpack_param(&ctx
->ac
, ctx
->gs_vtx_offset
[i
& ~1],
4248 ctx
->gs_wave_id
= ac_unpack_param(&ctx
->ac
, ctx
->merged_wave_info
, 16, 8);
4251 /* Ensure that the esgs ring is declared.
4253 * We declare it with 64KB alignment as a hint that the
4254 * pointer value will always be 0.
4256 static void declare_esgs_ring(struct radv_shader_context
*ctx
)
4261 assert(!LLVMGetNamedGlobal(ctx
->ac
.module
, "esgs_ring"));
4263 ctx
->esgs_ring
= LLVMAddGlobalInAddressSpace(
4264 ctx
->ac
.module
, LLVMArrayType(ctx
->ac
.i32
, 0),
4267 LLVMSetLinkage(ctx
->esgs_ring
, LLVMExternalLinkage
);
4268 LLVMSetAlignment(ctx
->esgs_ring
, 64 * 1024);
4272 LLVMModuleRef
ac_translate_nir_to_llvm(struct ac_llvm_compiler
*ac_llvm
,
4273 struct nir_shader
*const *shaders
,
4275 struct radv_shader_variant_info
*shader_info
,
4276 const struct radv_nir_compiler_options
*options
)
4278 struct radv_shader_context ctx
= {0};
4280 ctx
.options
= options
;
4281 ctx
.shader_info
= shader_info
;
4283 enum ac_float_mode float_mode
=
4284 options
->unsafe_math
? AC_FLOAT_MODE_UNSAFE_FP_MATH
:
4285 AC_FLOAT_MODE_DEFAULT
;
4287 ac_llvm_context_init(&ctx
.ac
, ac_llvm
, options
->chip_class
,
4288 options
->family
, float_mode
, options
->wave_size
,
4289 options
->wave_size
);
4290 ctx
.context
= ctx
.ac
.context
;
4292 radv_nir_shader_info_init(&shader_info
->info
);
4294 for(int i
= 0; i
< shader_count
; ++i
)
4295 radv_nir_shader_info_pass(shaders
[i
], options
, &shader_info
->info
);
4297 for (i
= 0; i
< MAX_SETS
; i
++)
4298 shader_info
->user_sgprs_locs
.descriptor_sets
[i
].sgpr_idx
= -1;
4299 for (i
= 0; i
< AC_UD_MAX_UD
; i
++)
4300 shader_info
->user_sgprs_locs
.shader_data
[i
].sgpr_idx
= -1;
4302 ctx
.max_workgroup_size
= 0;
4303 for (int i
= 0; i
< shader_count
; ++i
) {
4304 ctx
.max_workgroup_size
= MAX2(ctx
.max_workgroup_size
,
4305 radv_nir_get_max_workgroup_size(ctx
.options
->chip_class
,
4306 shaders
[i
]->info
.stage
,
4310 if (ctx
.ac
.chip_class
>= GFX10
) {
4311 if (is_pre_gs_stage(shaders
[0]->info
.stage
) &&
4312 options
->key
.vs_common_out
.as_ngg
) {
4313 ctx
.max_workgroup_size
= 128;
4317 create_function(&ctx
, shaders
[shader_count
- 1]->info
.stage
, shader_count
>= 2,
4318 shader_count
>= 2 ? shaders
[shader_count
- 2]->info
.stage
: MESA_SHADER_VERTEX
);
4320 ctx
.abi
.inputs
= &ctx
.inputs
[0];
4321 ctx
.abi
.emit_outputs
= handle_shader_outputs_post
;
4322 ctx
.abi
.emit_vertex
= visit_emit_vertex
;
4323 ctx
.abi
.load_ubo
= radv_load_ubo
;
4324 ctx
.abi
.load_ssbo
= radv_load_ssbo
;
4325 ctx
.abi
.load_sampler_desc
= radv_get_sampler_desc
;
4326 ctx
.abi
.load_resource
= radv_load_resource
;
4327 ctx
.abi
.clamp_shadow_reference
= false;
4328 ctx
.abi
.robust_buffer_access
= options
->robust_buffer_access
;
4330 bool is_ngg
= is_pre_gs_stage(shaders
[0]->info
.stage
) && ctx
.options
->key
.vs_common_out
.as_ngg
;
4331 if (shader_count
>= 2 || is_ngg
)
4332 ac_init_exec_full_mask(&ctx
.ac
);
4334 if (options
->has_ls_vgpr_init_bug
&&
4335 shaders
[shader_count
- 1]->info
.stage
== MESA_SHADER_TESS_CTRL
)
4336 ac_nir_fixup_ls_hs_input_vgprs(&ctx
);
4338 for(int i
= 0; i
< shader_count
; ++i
) {
4339 ctx
.stage
= shaders
[i
]->info
.stage
;
4340 ctx
.output_mask
= 0;
4342 if (shaders
[i
]->info
.stage
== MESA_SHADER_GEOMETRY
) {
4343 for (int i
= 0; i
< 4; i
++) {
4344 ctx
.gs_next_vertex
[i
] =
4345 ac_build_alloca(&ctx
.ac
, ctx
.ac
.i32
, "");
4347 if (ctx
.options
->key
.vs_common_out
.as_ngg
) {
4348 for (unsigned i
= 0; i
< 4; ++i
) {
4349 ctx
.gs_curprim_verts
[i
] =
4350 ac_build_alloca(&ctx
.ac
, ctx
.ac
.i32
, "");
4351 ctx
.gs_generated_prims
[i
] =
4352 ac_build_alloca(&ctx
.ac
, ctx
.ac
.i32
, "");
4355 /* TODO: streamout */
4357 LLVMTypeRef ai32
= LLVMArrayType(ctx
.ac
.i32
, 8);
4358 ctx
.gs_ngg_scratch
=
4359 LLVMAddGlobalInAddressSpace(ctx
.ac
.module
,
4360 ai32
, "ngg_scratch", AC_ADDR_SPACE_LDS
);
4361 LLVMSetInitializer(ctx
.gs_ngg_scratch
, LLVMGetUndef(ai32
));
4362 LLVMSetAlignment(ctx
.gs_ngg_scratch
, 4);
4364 ctx
.gs_ngg_emit
= LLVMBuildIntToPtr(ctx
.ac
.builder
, ctx
.ac
.i32_0
,
4365 LLVMPointerType(LLVMArrayType(ctx
.ac
.i32
, 0), AC_ADDR_SPACE_LDS
),
4369 ctx
.gs_max_out_vertices
= shaders
[i
]->info
.gs
.vertices_out
;
4370 ctx
.gs_output_prim
= shaders
[i
]->info
.gs
.output_primitive
;
4371 ctx
.abi
.load_inputs
= load_gs_input
;
4372 ctx
.abi
.emit_primitive
= visit_end_primitive
;
4373 } else if (shaders
[i
]->info
.stage
== MESA_SHADER_TESS_CTRL
) {
4374 ctx
.tcs_outputs_read
= shaders
[i
]->info
.outputs_read
;
4375 ctx
.tcs_patch_outputs_read
= shaders
[i
]->info
.patch_outputs_read
;
4376 ctx
.abi
.load_tess_varyings
= load_tcs_varyings
;
4377 ctx
.abi
.load_patch_vertices_in
= load_patch_vertices_in
;
4378 ctx
.abi
.store_tcs_outputs
= store_tcs_output
;
4379 ctx
.tcs_vertices_per_patch
= shaders
[i
]->info
.tess
.tcs_vertices_out
;
4380 if (shader_count
== 1)
4381 ctx
.tcs_num_inputs
= ctx
.options
->key
.tcs
.num_inputs
;
4383 ctx
.tcs_num_inputs
= util_last_bit64(shader_info
->info
.vs
.ls_outputs_written
);
4384 ctx
.tcs_num_patches
= get_tcs_num_patches(&ctx
);
4385 } else if (shaders
[i
]->info
.stage
== MESA_SHADER_TESS_EVAL
) {
4386 ctx
.tes_primitive_mode
= shaders
[i
]->info
.tess
.primitive_mode
;
4387 ctx
.abi
.load_tess_varyings
= load_tes_input
;
4388 ctx
.abi
.load_tess_coord
= load_tess_coord
;
4389 ctx
.abi
.load_patch_vertices_in
= load_patch_vertices_in
;
4390 ctx
.tcs_vertices_per_patch
= shaders
[i
]->info
.tess
.tcs_vertices_out
;
4391 ctx
.tcs_num_patches
= ctx
.options
->key
.tes
.num_patches
;
4392 } else if (shaders
[i
]->info
.stage
== MESA_SHADER_VERTEX
) {
4393 ctx
.abi
.load_base_vertex
= radv_load_base_vertex
;
4394 } else if (shaders
[i
]->info
.stage
== MESA_SHADER_FRAGMENT
) {
4395 shader_info
->fs
.can_discard
= shaders
[i
]->info
.fs
.uses_discard
;
4396 ctx
.abi
.load_sample_position
= load_sample_position
;
4397 ctx
.abi
.load_sample_mask_in
= load_sample_mask_in
;
4398 ctx
.abi
.emit_kill
= radv_emit_kill
;
4401 if (shaders
[i
]->info
.stage
== MESA_SHADER_VERTEX
&&
4402 ctx
.options
->key
.vs_common_out
.as_ngg
&&
4403 ctx
.options
->key
.vs_common_out
.export_prim_id
) {
4404 declare_esgs_ring(&ctx
);
4407 bool nested_barrier
= false;
4410 if (shaders
[i
]->info
.stage
== MESA_SHADER_GEOMETRY
&&
4411 ctx
.options
->key
.vs_common_out
.as_ngg
) {
4412 gfx10_ngg_gs_emit_prologue(&ctx
);
4413 nested_barrier
= false;
4415 nested_barrier
= true;
4419 if (nested_barrier
) {
4420 /* Execute a barrier before the second shader in
4423 * Execute the barrier inside the conditional block,
4424 * so that empty waves can jump directly to s_endpgm,
4425 * which will also signal the barrier.
4427 * This is possible in gfx9, because an empty wave
4428 * for the second shader does not participate in
4429 * the epilogue. With NGG, empty waves may still
4430 * be required to export data (e.g. GS output vertices),
4431 * so we cannot let them exit early.
4433 * If the shader is TCS and the TCS epilog is present
4434 * and contains a barrier, it will wait there and then
4437 ac_emit_barrier(&ctx
.ac
, ctx
.stage
);
4440 nir_foreach_variable(variable
, &shaders
[i
]->outputs
)
4441 scan_shader_output_decl(&ctx
, variable
, shaders
[i
], shaders
[i
]->info
.stage
);
4443 if (shaders
[i
]->info
.stage
== MESA_SHADER_GEOMETRY
) {
4444 unsigned addclip
= shaders
[i
]->info
.clip_distance_array_size
+
4445 shaders
[i
]->info
.cull_distance_array_size
> 4;
4446 ctx
.gsvs_vertex_size
= (util_bitcount64(ctx
.output_mask
) + addclip
) * 16;
4447 ctx
.max_gsvs_emit_size
= ctx
.gsvs_vertex_size
*
4448 shaders
[i
]->info
.gs
.vertices_out
;
4451 ac_setup_rings(&ctx
);
4453 LLVMBasicBlockRef merge_block
;
4454 if (shader_count
>= 2 || is_ngg
) {
4455 LLVMValueRef fn
= LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx
.ac
.builder
));
4456 LLVMBasicBlockRef then_block
= LLVMAppendBasicBlockInContext(ctx
.ac
.context
, fn
, "");
4457 merge_block
= LLVMAppendBasicBlockInContext(ctx
.ac
.context
, fn
, "");
4459 LLVMValueRef count
= ac_unpack_param(&ctx
.ac
, ctx
.merged_wave_info
, 8 * i
, 8);
4460 LLVMValueRef thread_id
= ac_get_thread_id(&ctx
.ac
);
4461 LLVMValueRef cond
= LLVMBuildICmp(ctx
.ac
.builder
, LLVMIntULT
,
4462 thread_id
, count
, "");
4463 LLVMBuildCondBr(ctx
.ac
.builder
, cond
, then_block
, merge_block
);
4465 LLVMPositionBuilderAtEnd(ctx
.ac
.builder
, then_block
);
4468 if (shaders
[i
]->info
.stage
== MESA_SHADER_FRAGMENT
)
4469 prepare_interp_optimize(&ctx
, shaders
[i
]);
4470 else if(shaders
[i
]->info
.stage
== MESA_SHADER_VERTEX
)
4471 handle_vs_inputs(&ctx
, shaders
[i
]);
4472 else if(shader_count
>= 2 && shaders
[i
]->info
.stage
== MESA_SHADER_GEOMETRY
)
4473 prepare_gs_input_vgprs(&ctx
);
4475 ac_nir_translate(&ctx
.ac
, &ctx
.abi
, shaders
[i
]);
4477 if (shader_count
>= 2 || is_ngg
) {
4478 LLVMBuildBr(ctx
.ac
.builder
, merge_block
);
4479 LLVMPositionBuilderAtEnd(ctx
.ac
.builder
, merge_block
);
4482 /* This needs to be outside the if wrapping the shader body, as sometimes
4483 * the HW generates waves with 0 es/vs threads. */
4484 if (is_pre_gs_stage(shaders
[i
]->info
.stage
) &&
4485 ctx
.options
->key
.vs_common_out
.as_ngg
&&
4486 i
== shader_count
- 1) {
4487 handle_ngg_outputs_post(&ctx
);
4488 } else if (shaders
[i
]->info
.stage
== MESA_SHADER_GEOMETRY
&&
4489 ctx
.options
->key
.vs_common_out
.as_ngg
) {
4490 gfx10_ngg_gs_emit_epilogue_2(&ctx
);
4493 if (shaders
[i
]->info
.stage
== MESA_SHADER_GEOMETRY
) {
4494 shader_info
->gs
.gsvs_vertex_size
= ctx
.gsvs_vertex_size
;
4495 shader_info
->gs
.max_gsvs_emit_size
= ctx
.max_gsvs_emit_size
;
4496 } else if (shaders
[i
]->info
.stage
== MESA_SHADER_TESS_CTRL
) {
4497 shader_info
->tcs
.num_patches
= ctx
.tcs_num_patches
;
4498 shader_info
->tcs
.lds_size
= calculate_tess_lds_size(&ctx
);
4502 LLVMBuildRetVoid(ctx
.ac
.builder
);
4504 if (options
->dump_preoptir
) {
4505 fprintf(stderr
, "%s LLVM IR:\n\n",
4506 radv_get_shader_name(shader_info
,
4507 shaders
[shader_count
- 1]->info
.stage
));
4508 ac_dump_module(ctx
.ac
.module
);
4509 fprintf(stderr
, "\n");
4512 ac_llvm_finalize_module(&ctx
, ac_llvm
->passmgr
, options
);
4514 if (shader_count
== 1)
4515 ac_nir_eliminate_const_vs_outputs(&ctx
);
4517 if (options
->dump_shader
) {
4518 ctx
.shader_info
->private_mem_vgprs
=
4519 ac_count_scratch_private_memory(ctx
.main_function
);
4522 return ctx
.ac
.module
;
4525 static void ac_diagnostic_handler(LLVMDiagnosticInfoRef di
, void *context
)
4527 unsigned *retval
= (unsigned *)context
;
4528 LLVMDiagnosticSeverity severity
= LLVMGetDiagInfoSeverity(di
);
4529 char *description
= LLVMGetDiagInfoDescription(di
);
4531 if (severity
== LLVMDSError
) {
4533 fprintf(stderr
, "LLVM triggered Diagnostic Handler: %s\n",
4537 LLVMDisposeMessage(description
);
4540 static unsigned radv_llvm_compile(LLVMModuleRef M
,
4541 char **pelf_buffer
, size_t *pelf_size
,
4542 struct ac_llvm_compiler
*ac_llvm
)
4544 unsigned retval
= 0;
4545 LLVMContextRef llvm_ctx
;
4547 /* Setup Diagnostic Handler*/
4548 llvm_ctx
= LLVMGetModuleContext(M
);
4550 LLVMContextSetDiagnosticHandler(llvm_ctx
, ac_diagnostic_handler
,
4554 if (!radv_compile_to_elf(ac_llvm
, M
, pelf_buffer
, pelf_size
))
4559 static void ac_compile_llvm_module(struct ac_llvm_compiler
*ac_llvm
,
4560 LLVMModuleRef llvm_module
,
4561 struct radv_shader_binary
**rbinary
,
4562 struct radv_shader_variant_info
*shader_info
,
4563 gl_shader_stage stage
,
4565 const struct radv_nir_compiler_options
*options
)
4567 char *elf_buffer
= NULL
;
4568 size_t elf_size
= 0;
4569 char *llvm_ir_string
= NULL
;
4571 if (options
->dump_shader
) {
4572 fprintf(stderr
, "%s LLVM IR:\n\n", name
);
4573 ac_dump_module(llvm_module
);
4574 fprintf(stderr
, "\n");
4577 if (options
->record_llvm_ir
) {
4578 char *llvm_ir
= LLVMPrintModuleToString(llvm_module
);
4579 llvm_ir_string
= strdup(llvm_ir
);
4580 LLVMDisposeMessage(llvm_ir
);
4583 int v
= radv_llvm_compile(llvm_module
, &elf_buffer
, &elf_size
, ac_llvm
);
4585 fprintf(stderr
, "compile failed\n");
4588 LLVMContextRef ctx
= LLVMGetModuleContext(llvm_module
);
4589 LLVMDisposeModule(llvm_module
);
4590 LLVMContextDispose(ctx
);
4592 size_t llvm_ir_size
= llvm_ir_string
? strlen(llvm_ir_string
) : 0;
4593 size_t alloc_size
= sizeof(struct radv_shader_binary_rtld
) + elf_size
+ llvm_ir_size
+ 1;
4594 struct radv_shader_binary_rtld
*rbin
= calloc(1, alloc_size
);
4595 memcpy(rbin
->data
, elf_buffer
, elf_size
);
4597 memcpy(rbin
->data
+ elf_size
, llvm_ir_string
, llvm_ir_size
+ 1);
4599 rbin
->base
.type
= RADV_BINARY_TYPE_RTLD
;
4600 rbin
->base
.stage
= stage
;
4601 rbin
->base
.total_size
= alloc_size
;
4602 rbin
->elf_size
= elf_size
;
4603 rbin
->llvm_ir_size
= llvm_ir_size
;
4604 *rbinary
= &rbin
->base
;
4606 free(llvm_ir_string
);
4611 ac_fill_shader_info(struct radv_shader_variant_info
*shader_info
, struct nir_shader
*nir
, const struct radv_nir_compiler_options
*options
)
4613 switch (nir
->info
.stage
) {
4614 case MESA_SHADER_COMPUTE
:
4615 for (int i
= 0; i
< 3; ++i
)
4616 shader_info
->cs
.block_size
[i
] = nir
->info
.cs
.local_size
[i
];
4618 case MESA_SHADER_FRAGMENT
:
4619 shader_info
->fs
.early_fragment_test
= nir
->info
.fs
.early_fragment_tests
;
4620 shader_info
->fs
.post_depth_coverage
= nir
->info
.fs
.post_depth_coverage
;
4622 case MESA_SHADER_GEOMETRY
:
4623 shader_info
->gs
.vertices_in
= nir
->info
.gs
.vertices_in
;
4624 shader_info
->gs
.vertices_out
= nir
->info
.gs
.vertices_out
;
4625 shader_info
->gs
.output_prim
= nir
->info
.gs
.output_primitive
;
4626 shader_info
->gs
.invocations
= nir
->info
.gs
.invocations
;
4628 case MESA_SHADER_TESS_EVAL
:
4629 shader_info
->tes
.primitive_mode
= nir
->info
.tess
.primitive_mode
;
4630 shader_info
->tes
.spacing
= nir
->info
.tess
.spacing
;
4631 shader_info
->tes
.ccw
= nir
->info
.tess
.ccw
;
4632 shader_info
->tes
.point_mode
= nir
->info
.tess
.point_mode
;
4633 shader_info
->tes
.as_es
= options
->key
.vs_common_out
.as_es
;
4634 shader_info
->tes
.export_prim_id
= options
->key
.vs_common_out
.export_prim_id
;
4635 shader_info
->is_ngg
= options
->key
.vs_common_out
.as_ngg
;
4637 case MESA_SHADER_TESS_CTRL
:
4638 shader_info
->tcs
.tcs_vertices_out
= nir
->info
.tess
.tcs_vertices_out
;
4640 case MESA_SHADER_VERTEX
:
4641 shader_info
->vs
.as_es
= options
->key
.vs_common_out
.as_es
;
4642 shader_info
->vs
.as_ls
= options
->key
.vs_common_out
.as_ls
;
4643 shader_info
->vs
.export_prim_id
= options
->key
.vs_common_out
.export_prim_id
;
4644 shader_info
->is_ngg
= options
->key
.vs_common_out
.as_ngg
;
4652 radv_compile_nir_shader(struct ac_llvm_compiler
*ac_llvm
,
4653 struct radv_shader_binary
**rbinary
,
4654 struct radv_shader_variant_info
*shader_info
,
4655 struct nir_shader
*const *nir
,
4657 const struct radv_nir_compiler_options
*options
)
4660 LLVMModuleRef llvm_module
;
4662 llvm_module
= ac_translate_nir_to_llvm(ac_llvm
, nir
, nir_count
, shader_info
,
4665 ac_compile_llvm_module(ac_llvm
, llvm_module
, rbinary
, shader_info
,
4666 nir
[nir_count
- 1]->info
.stage
,
4667 radv_get_shader_name(shader_info
,
4668 nir
[nir_count
- 1]->info
.stage
),
4671 for (int i
= 0; i
< nir_count
; ++i
)
4672 ac_fill_shader_info(shader_info
, nir
[i
], options
);
4674 /* Determine the ES type (VS or TES) for the GS on GFX9. */
4675 if (options
->chip_class
>= GFX9
) {
4676 if (nir_count
== 2 &&
4677 nir
[1]->info
.stage
== MESA_SHADER_GEOMETRY
) {
4678 shader_info
->gs
.es_type
= nir
[0]->info
.stage
;
4681 shader_info
->info
.wave_size
= options
->wave_size
;
4685 ac_gs_copy_shader_emit(struct radv_shader_context
*ctx
)
4687 LLVMValueRef vtx_offset
=
4688 LLVMBuildMul(ctx
->ac
.builder
, ctx
->abi
.vertex_id
,
4689 LLVMConstInt(ctx
->ac
.i32
, 4, false), "");
4690 LLVMValueRef stream_id
;
4692 /* Fetch the vertex stream ID. */
4693 if (ctx
->shader_info
->info
.so
.num_outputs
) {
4695 ac_unpack_param(&ctx
->ac
, ctx
->streamout_config
, 24, 2);
4697 stream_id
= ctx
->ac
.i32_0
;
4700 LLVMBasicBlockRef end_bb
;
4701 LLVMValueRef switch_inst
;
4703 end_bb
= LLVMAppendBasicBlockInContext(ctx
->ac
.context
,
4704 ctx
->main_function
, "end");
4705 switch_inst
= LLVMBuildSwitch(ctx
->ac
.builder
, stream_id
, end_bb
, 4);
4707 for (unsigned stream
= 0; stream
< 4; stream
++) {
4708 unsigned num_components
=
4709 ctx
->shader_info
->info
.gs
.num_stream_output_components
[stream
];
4710 LLVMBasicBlockRef bb
;
4713 if (!num_components
)
4716 if (stream
> 0 && !ctx
->shader_info
->info
.so
.num_outputs
)
4719 bb
= LLVMInsertBasicBlockInContext(ctx
->ac
.context
, end_bb
, "out");
4720 LLVMAddCase(switch_inst
, LLVMConstInt(ctx
->ac
.i32
, stream
, 0), bb
);
4721 LLVMPositionBuilderAtEnd(ctx
->ac
.builder
, bb
);
4724 for (unsigned i
= 0; i
< AC_LLVM_MAX_OUTPUTS
; ++i
) {
4725 unsigned output_usage_mask
=
4726 ctx
->shader_info
->info
.gs
.output_usage_mask
[i
];
4727 unsigned output_stream
=
4728 ctx
->shader_info
->info
.gs
.output_streams
[i
];
4729 int length
= util_last_bit(output_usage_mask
);
4731 if (!(ctx
->output_mask
& (1ull << i
)) ||
4732 output_stream
!= stream
)
4735 for (unsigned j
= 0; j
< length
; j
++) {
4736 LLVMValueRef value
, soffset
;
4738 if (!(output_usage_mask
& (1 << j
)))
4741 soffset
= LLVMConstInt(ctx
->ac
.i32
,
4743 ctx
->gs_max_out_vertices
* 16 * 4, false);
4747 value
= ac_build_buffer_load(&ctx
->ac
,
4750 vtx_offset
, soffset
,
4751 0, ac_glc
| ac_slc
, true, false);
4753 LLVMTypeRef type
= LLVMGetAllocatedType(ctx
->abi
.outputs
[ac_llvm_reg_index_soa(i
, j
)]);
4754 if (ac_get_type_size(type
) == 2) {
4755 value
= LLVMBuildBitCast(ctx
->ac
.builder
, value
, ctx
->ac
.i32
, "");
4756 value
= LLVMBuildTrunc(ctx
->ac
.builder
, value
, ctx
->ac
.i16
, "");
4759 LLVMBuildStore(ctx
->ac
.builder
,
4760 ac_to_float(&ctx
->ac
, value
), ctx
->abi
.outputs
[ac_llvm_reg_index_soa(i
, j
)]);
4764 if (ctx
->shader_info
->info
.so
.num_outputs
)
4765 radv_emit_streamout(ctx
, stream
);
4768 handle_vs_outputs_post(ctx
, false, true,
4769 &ctx
->shader_info
->vs
.outinfo
);
4772 LLVMBuildBr(ctx
->ac
.builder
, end_bb
);
4775 LLVMPositionBuilderAtEnd(ctx
->ac
.builder
, end_bb
);
4779 radv_compile_gs_copy_shader(struct ac_llvm_compiler
*ac_llvm
,
4780 struct nir_shader
*geom_shader
,
4781 struct radv_shader_binary
**rbinary
,
4782 struct radv_shader_variant_info
*shader_info
,
4783 const struct radv_nir_compiler_options
*options
)
4785 struct radv_shader_context ctx
= {0};
4786 ctx
.options
= options
;
4787 ctx
.shader_info
= shader_info
;
4789 enum ac_float_mode float_mode
=
4790 options
->unsafe_math
? AC_FLOAT_MODE_UNSAFE_FP_MATH
:
4791 AC_FLOAT_MODE_DEFAULT
;
4793 ac_llvm_context_init(&ctx
.ac
, ac_llvm
, options
->chip_class
,
4794 options
->family
, float_mode
, 64, 64);
4795 ctx
.context
= ctx
.ac
.context
;
4797 ctx
.is_gs_copy_shader
= true;
4798 ctx
.stage
= MESA_SHADER_VERTEX
;
4800 radv_nir_shader_info_pass(geom_shader
, options
, &shader_info
->info
);
4802 create_function(&ctx
, MESA_SHADER_VERTEX
, false, MESA_SHADER_VERTEX
);
4804 ctx
.gs_max_out_vertices
= geom_shader
->info
.gs
.vertices_out
;
4805 ac_setup_rings(&ctx
);
4807 nir_foreach_variable(variable
, &geom_shader
->outputs
) {
4808 scan_shader_output_decl(&ctx
, variable
, geom_shader
, MESA_SHADER_VERTEX
);
4809 ac_handle_shader_output_decl(&ctx
.ac
, &ctx
.abi
, geom_shader
,
4810 variable
, MESA_SHADER_VERTEX
);
4813 ac_gs_copy_shader_emit(&ctx
);
4815 LLVMBuildRetVoid(ctx
.ac
.builder
);
4817 ac_llvm_finalize_module(&ctx
, ac_llvm
->passmgr
, options
);
4819 ac_compile_llvm_module(ac_llvm
, ctx
.ac
.module
, rbinary
, shader_info
,
4820 MESA_SHADER_VERTEX
, "GS Copy Shader", options
);
4821 (*rbinary
)->is_gs_copy_shader
= true;