/*
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 *
 * based in part on anv driver which is:
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
#include "radv_private.h"
#include "radv_shader.h"
#include "radv_shader_helper.h"

#include <llvm-c/Core.h>
#include <llvm-c/TargetMachine.h>
#include <llvm-c/Transforms/Scalar.h>
#include <llvm-c/Transforms/Utils.h>

#include "ac_binary.h"
#include "ac_llvm_util.h"
#include "ac_llvm_build.h"
#include "ac_shader_abi.h"
#include "ac_shader_util.h"
#include "ac_exp_param.h"
#define RADEON_LLVM_MAX_INPUTS (VARYING_SLOT_VAR31 + 1)

struct radv_shader_context {
	struct ac_llvm_context ac;
	const struct radv_nir_compiler_options *options;
	struct radv_shader_variant_info *shader_info;
	struct ac_shader_abi abi;

	unsigned max_workgroup_size;
	LLVMContextRef context;
	LLVMValueRef main_function;

	LLVMValueRef descriptor_sets[RADV_UD_MAX_SETS];
	LLVMValueRef ring_offsets;

	LLVMValueRef vertex_buffers;
	LLVMValueRef rel_auto_id;
	LLVMValueRef vs_prim_id;
	LLVMValueRef es2gs_offset;

	LLVMValueRef oc_lds;
	LLVMValueRef merged_wave_info;
	LLVMValueRef tess_factor_offset;
	LLVMValueRef tes_rel_patch_id;
	LLVMValueRef tes_u;
	LLVMValueRef tes_v;

	/* HW GS */
	/* On GFX10:
	 * - bits 0..10: ordered_wave_id
	 * - bits 12..20: number of vertices in group
	 * - bits 22..30: number of primitives in group
	 */
	LLVMValueRef gs_tg_info;
	LLVMValueRef gs2vs_offset;
	LLVMValueRef gs_wave_id;
	LLVMValueRef gs_vtx_offset[6];

	LLVMValueRef esgs_ring;
	LLVMValueRef gsvs_ring[4];
	LLVMValueRef hs_ring_tess_offchip;
	LLVMValueRef hs_ring_tess_factor;

	LLVMValueRef persp_sample, persp_center, persp_centroid;
	LLVMValueRef linear_sample, linear_center, linear_centroid;

	/* Streamout */
	LLVMValueRef streamout_buffers;
	LLVMValueRef streamout_write_idx;
	LLVMValueRef streamout_config;
	LLVMValueRef streamout_offset[4];

	gl_shader_stage stage;

	LLVMValueRef inputs[RADEON_LLVM_MAX_INPUTS * 4];
	uint64_t float16_shaded_mask;

	uint64_t output_mask;

	bool is_gs_copy_shader;
	LLVMValueRef gs_next_vertex[4];
	LLVMValueRef gs_curprim_verts[4];
	LLVMValueRef gs_generated_prims[4];
	LLVMValueRef gs_ngg_emit;
	LLVMValueRef gs_ngg_scratch;
	unsigned gs_max_out_vertices;
	unsigned gs_output_prim;

	unsigned tes_primitive_mode;

	uint32_t tcs_patch_outputs_read;
	uint64_t tcs_outputs_read;
	uint32_t tcs_vertices_per_patch;
	uint32_t tcs_num_inputs;
	uint32_t tcs_num_patches;
	uint32_t max_gsvs_emit_size;
	uint32_t gsvs_vertex_size;

	LLVMValueRef vertexptr; /* GFX10 only */
};

struct radv_shader_output_values {
	LLVMValueRef values[4];
};
enum radeon_llvm_calling_convention {
	RADEON_LLVM_AMDGPU_VS = 87,
	RADEON_LLVM_AMDGPU_GS = 88,
	RADEON_LLVM_AMDGPU_PS = 89,
	RADEON_LLVM_AMDGPU_CS = 90,
	RADEON_LLVM_AMDGPU_HS = 93,
};
static inline struct radv_shader_context *
radv_shader_context_from_abi(struct ac_shader_abi *abi)
{
	struct radv_shader_context *ctx = NULL;
	return container_of(abi, ctx, abi);
}
struct ac_build_if_state {
	struct radv_shader_context *ctx;
	LLVMValueRef condition;
	LLVMBasicBlockRef entry_block;
	LLVMBasicBlockRef true_block;
	LLVMBasicBlockRef false_block;
	LLVMBasicBlockRef merge_block;
};
static LLVMBasicBlockRef
ac_build_insert_new_block(struct radv_shader_context *ctx, const char *name)
{
	LLVMBasicBlockRef current_block;
	LLVMBasicBlockRef next_block;
	LLVMBasicBlockRef new_block;

	/* get current basic block */
	current_block = LLVMGetInsertBlock(ctx->ac.builder);

	/* check if there's another block after this one */
	next_block = LLVMGetNextBasicBlock(current_block);
	if (next_block) {
		/* insert the new block before the next block */
		new_block = LLVMInsertBasicBlockInContext(ctx->context, next_block, name);
	} else {
		/* append new block after current block */
		LLVMValueRef function = LLVMGetBasicBlockParent(current_block);
		new_block = LLVMAppendBasicBlockInContext(ctx->context, function, name);
	}
	return new_block;
}
static void
ac_nir_build_if(struct ac_build_if_state *ifthen,
		struct radv_shader_context *ctx,
		LLVMValueRef condition)
{
	LLVMBasicBlockRef block = LLVMGetInsertBlock(ctx->ac.builder);

	memset(ifthen, 0, sizeof *ifthen);
	ifthen->ctx = ctx;
	ifthen->condition = condition;
	ifthen->entry_block = block;

	/* create endif/merge basic block for the phi functions */
	ifthen->merge_block = ac_build_insert_new_block(ctx, "endif-block");

	/* create/insert true_block before merge_block */
	ifthen->true_block =
		LLVMInsertBasicBlockInContext(ctx->context,
					      ifthen->merge_block,
					      "if-true-block");

	/* successive code goes into the true block */
	LLVMPositionBuilderAtEnd(ctx->ac.builder, ifthen->true_block);
}
static void
ac_nir_build_endif(struct ac_build_if_state *ifthen)
{
	LLVMBuilderRef builder = ifthen->ctx->ac.builder;

	/* Insert branch to the merge block from current block */
	LLVMBuildBr(builder, ifthen->merge_block);

	/*
	 * Now patch in the various branch instructions.
	 */

	/* Insert the conditional branch instruction at the end of entry_block */
	LLVMPositionBuilderAtEnd(builder, ifthen->entry_block);
	if (ifthen->false_block) {
		/* we have an else clause */
		LLVMBuildCondBr(builder, ifthen->condition,
				ifthen->true_block, ifthen->false_block);
	} else {
		/* no else clause: branch straight to the merge block */
		LLVMBuildCondBr(builder, ifthen->condition,
				ifthen->true_block, ifthen->merge_block);
	}

	/* Resume building code at end of the ifthen->merge_block */
	LLVMPositionBuilderAtEnd(builder, ifthen->merge_block);
}
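/* A minimal usage sketch of the if-state helpers above (illustration only,
 * not a call site from this file):
 *
 *	struct ac_build_if_state ifthen;
 *	ac_nir_build_if(&ifthen, ctx, condition);
 *	... instructions emitted with ctx->ac.builder land in true_block ...
 *	ac_nir_build_endif(&ifthen);
 *	... the builder then continues in merge_block ...
 */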
static LLVMValueRef
get_rel_patch_id(struct radv_shader_context *ctx)
{
	switch (ctx->stage) {
	case MESA_SHADER_TESS_CTRL:
		return ac_unpack_param(&ctx->ac, ctx->abi.tcs_rel_ids, 0, 8);
	case MESA_SHADER_TESS_EVAL:
		return ctx->tes_rel_patch_id;
	default:
		unreachable("Illegal stage");
	}
}
static unsigned
get_tcs_num_patches(struct radv_shader_context *ctx)
{
	unsigned num_tcs_input_cp = ctx->options->key.tcs.input_vertices;
	unsigned num_tcs_output_cp = ctx->tcs_vertices_per_patch;
	uint32_t input_vertex_size = ctx->tcs_num_inputs * 16;
	uint32_t input_patch_size = ctx->options->key.tcs.input_vertices * input_vertex_size;
	uint32_t num_tcs_outputs = util_last_bit64(ctx->shader_info->info.tcs.outputs_written);
	uint32_t num_tcs_patch_outputs = util_last_bit64(ctx->shader_info->info.tcs.patch_outputs_written);
	uint32_t output_vertex_size = num_tcs_outputs * 16;
	uint32_t pervertex_output_patch_size = ctx->tcs_vertices_per_patch * output_vertex_size;
	uint32_t output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16;
	unsigned num_patches;
	unsigned hardware_lds_size;

	/* Ensure that we only need one wave per SIMD so we don't need to check
	 * resource usage. Also ensures that the number of tcs in and out
	 * vertices per threadgroup are at most 256.
	 */
	num_patches = 64 / MAX2(num_tcs_input_cp, num_tcs_output_cp) * 4;
	/* Make sure that the data fits in LDS. This assumes the shaders only
	 * use LDS for the inputs and outputs.
	 */
	hardware_lds_size = 32768;

	/* Looks like STONEY hangs if we use more than 32 KiB LDS in a single
	 * threadgroup, even though there is more than 32 KiB LDS.
	 *
	 * Test: dEQP-VK.tessellation.shader_input_output.barrier
	 */
	if (ctx->options->chip_class >= GFX7 && ctx->options->family != CHIP_STONEY)
		hardware_lds_size = 65536;

	num_patches = MIN2(num_patches, hardware_lds_size / (input_patch_size + output_patch_size));
	/* Make sure the output data fits in the offchip buffer */
	num_patches = MIN2(num_patches, (ctx->options->tess_offchip_block_dw_size * 4) / output_patch_size);
	/* Not necessary for correctness, but improves performance. The
	 * specific value is taken from the proprietary driver.
	 */
	num_patches = MIN2(num_patches, 40);

	/* GFX6 bug workaround - limit LS-HS threadgroups to only one wave. */
	if (ctx->options->chip_class == GFX6) {
		unsigned one_wave = 64 / MAX2(num_tcs_input_cp, num_tcs_output_cp);
		num_patches = MIN2(num_patches, one_wave);
	}
	return num_patches;
}
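/* Worked example for the sizing above (assumed numbers, for illustration):
 * with 3 input and 3 output control points, 8 vec4 inputs/outputs per vertex
 * and 2 per-patch vec4s:
 *   input_patch_size  = 3 * 8 * 16          = 384 bytes
 *   output_patch_size = 3 * 8 * 16 + 2 * 16 = 416 bytes
 *   num_patches = 64 / 3 * 4 = 84              (one wave per SIMD)
 *     -> MIN2(84, 65536 / (384 + 416)) = 81    (LDS limit on GFX7+)
 *     -> further capped by the offchip buffer, then clamped to 40.
 */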
static void
calculate_tess_lds_size(struct radv_shader_context *ctx)
{
	unsigned num_tcs_input_cp = ctx->options->key.tcs.input_vertices;
	unsigned num_tcs_output_cp;
	unsigned num_tcs_outputs, num_tcs_patch_outputs;
	unsigned input_vertex_size, output_vertex_size;
	unsigned input_patch_size, output_patch_size;
	unsigned pervertex_output_patch_size;
	unsigned output_patch0_offset;
	unsigned num_patches;
	unsigned lds_size;

	num_tcs_output_cp = ctx->tcs_vertices_per_patch;
	num_tcs_outputs = util_last_bit64(ctx->shader_info->info.tcs.outputs_written);
	num_tcs_patch_outputs = util_last_bit64(ctx->shader_info->info.tcs.patch_outputs_written);

	input_vertex_size = ctx->tcs_num_inputs * 16;
	output_vertex_size = num_tcs_outputs * 16;

	input_patch_size = num_tcs_input_cp * input_vertex_size;

	pervertex_output_patch_size = num_tcs_output_cp * output_vertex_size;
	output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16;

	num_patches = ctx->tcs_num_patches;
	output_patch0_offset = input_patch_size * num_patches;

	lds_size = output_patch0_offset + output_patch_size * num_patches;

	ctx->shader_info->tcs.lds_size = lds_size;
}
/* Tessellation shaders pass outputs to the next shader using LDS.
 *
 * LS outputs = TCS inputs
 * TCS outputs = TES inputs
 *
 * The LDS layout is:
 * - TCS inputs for patch 0
 * - TCS inputs for patch 1
 * - TCS inputs for patch 2 = get_tcs_in_current_patch_offset (if RelPatchID==2)
 * - ...
 * - TCS outputs for patch 0 = get_tcs_out_patch0_offset
 * - Per-patch TCS outputs for patch 0 = get_tcs_out_patch0_patch_data_offset
 * - TCS outputs for patch 1
 * - Per-patch TCS outputs for patch 1
 * - TCS outputs for patch 2 = get_tcs_out_current_patch_offset (if RelPatchID==2)
 * - Per-patch TCS outputs for patch 2 = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
 * - ...
 *
 * All three shaders VS(LS), TCS, TES share the same LDS space.
 */
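/* Illustration of the layout above (assumed numbers, continuing the earlier
 * example): with input_patch_size = 384, output_patch_size = 416 and
 * num_patches = 40, TCS inputs occupy bytes [0, 384 * 40) of LDS, the
 * per-vertex outputs of patch 0 start at output_patch0_offset = 384 * 40
 * = 15360, and patch 0's per-patch outputs follow at
 * 15360 + pervertex_output_patch_size. The helpers below return these
 * quantities scaled to dwords (divided by 4).
 */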
static LLVMValueRef
get_tcs_in_patch_stride(struct radv_shader_context *ctx)
{
	assert(ctx->stage == MESA_SHADER_TESS_CTRL);
	uint32_t input_vertex_size = ctx->tcs_num_inputs * 16;
	uint32_t input_patch_size = ctx->options->key.tcs.input_vertices * input_vertex_size;

	input_patch_size /= 4;
	return LLVMConstInt(ctx->ac.i32, input_patch_size, false);
}
static LLVMValueRef
get_tcs_out_patch_stride(struct radv_shader_context *ctx)
{
	uint32_t num_tcs_outputs = util_last_bit64(ctx->shader_info->info.tcs.outputs_written);
	uint32_t num_tcs_patch_outputs = util_last_bit64(ctx->shader_info->info.tcs.patch_outputs_written);
	uint32_t output_vertex_size = num_tcs_outputs * 16;
	uint32_t pervertex_output_patch_size = ctx->tcs_vertices_per_patch * output_vertex_size;
	uint32_t output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16;
	output_patch_size /= 4;
	return LLVMConstInt(ctx->ac.i32, output_patch_size, false);
}
static LLVMValueRef
get_tcs_out_vertex_stride(struct radv_shader_context *ctx)
{
	uint32_t num_tcs_outputs = util_last_bit64(ctx->shader_info->info.tcs.outputs_written);
	uint32_t output_vertex_size = num_tcs_outputs * 16;
	output_vertex_size /= 4;
	return LLVMConstInt(ctx->ac.i32, output_vertex_size, false);
}
static LLVMValueRef
get_tcs_out_patch0_offset(struct radv_shader_context *ctx)
{
	assert(ctx->stage == MESA_SHADER_TESS_CTRL);
	uint32_t input_vertex_size = ctx->tcs_num_inputs * 16;
	uint32_t input_patch_size = ctx->options->key.tcs.input_vertices * input_vertex_size;
	uint32_t output_patch0_offset = input_patch_size;
	unsigned num_patches = ctx->tcs_num_patches;

	output_patch0_offset *= num_patches;
	output_patch0_offset /= 4;
	return LLVMConstInt(ctx->ac.i32, output_patch0_offset, false);
}
static LLVMValueRef
get_tcs_out_patch0_patch_data_offset(struct radv_shader_context *ctx)
{
	assert(ctx->stage == MESA_SHADER_TESS_CTRL);
	uint32_t input_vertex_size = ctx->tcs_num_inputs * 16;
	uint32_t input_patch_size = ctx->options->key.tcs.input_vertices * input_vertex_size;
	uint32_t output_patch0_offset = input_patch_size;

	uint32_t num_tcs_outputs = util_last_bit64(ctx->shader_info->info.tcs.outputs_written);
	uint32_t output_vertex_size = num_tcs_outputs * 16;
	uint32_t pervertex_output_patch_size = ctx->tcs_vertices_per_patch * output_vertex_size;
	unsigned num_patches = ctx->tcs_num_patches;

	output_patch0_offset *= num_patches;
	output_patch0_offset += pervertex_output_patch_size;
	output_patch0_offset /= 4;
	return LLVMConstInt(ctx->ac.i32, output_patch0_offset, false);
}
static LLVMValueRef
get_tcs_in_current_patch_offset(struct radv_shader_context *ctx)
{
	LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx);
	LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);

	return LLVMBuildMul(ctx->ac.builder, patch_stride, rel_patch_id, "");
}
static LLVMValueRef
get_tcs_out_current_patch_offset(struct radv_shader_context *ctx)
{
	LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx);
	LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
	LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);

	return ac_build_imad(&ctx->ac, patch_stride, rel_patch_id,
			     patch0_offset);
}
static LLVMValueRef
get_tcs_out_current_patch_data_offset(struct radv_shader_context *ctx)
{
	LLVMValueRef patch0_patch_data_offset =
		get_tcs_out_patch0_patch_data_offset(ctx);
	LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
	LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);

	return ac_build_imad(&ctx->ac, patch_stride, rel_patch_id,
			     patch0_patch_data_offset);
}
#define MAX_ARGS 64

struct arg_info {
	LLVMTypeRef types[MAX_ARGS];
	LLVMValueRef *assign[MAX_ARGS];
	uint8_t count;
	uint8_t sgpr_count;
	uint8_t num_sgprs_used;
	uint8_t num_vgprs_used;
};

enum ac_arg_regfile {
	ARG_SGPR,
	ARG_VGPR,
};

static void
add_arg(struct arg_info *info, enum ac_arg_regfile regfile, LLVMTypeRef type,
	LLVMValueRef *param_ptr)
{
	assert(info->count < MAX_ARGS);

	info->assign[info->count] = param_ptr;
	info->types[info->count] = type;
	info->count++;

	if (regfile == ARG_SGPR) {
		info->num_sgprs_used += ac_get_type_size(type) / 4;
		info->sgpr_count++;
	} else {
		assert(regfile == ARG_VGPR);
		info->num_vgprs_used += ac_get_type_size(type) / 4;
	}
}
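/* Typical usage of the argument collector (sketch, not a call site from
 * this file):
 *
 *	struct arg_info args = {};
 *	add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->abi.view_index);
 *	add_arg(&args, ARG_VGPR, ctx->ac.i32, NULL); // reserve a slot whose value is unused
 *
 * SGPR arguments are declared before VGPR arguments, since
 * create_llvm_function() marks the first sgpr_count parameters as "inreg".
 */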
static void assign_arguments(LLVMValueRef main_function,
			     struct arg_info *info)
{
	unsigned i;
	for (i = 0; i < info->count; i++) {
		if (info->assign[i])
			*info->assign[i] = LLVMGetParam(main_function, i);
	}
}
static LLVMValueRef
create_llvm_function(LLVMContextRef ctx, LLVMModuleRef module,
		     LLVMBuilderRef builder, LLVMTypeRef *return_types,
		     unsigned num_return_elems,
		     struct arg_info *args,
		     unsigned max_workgroup_size,
		     const struct radv_nir_compiler_options *options)
{
	LLVMTypeRef main_function_type, ret_type;
	LLVMBasicBlockRef main_function_body;

	if (num_return_elems)
		ret_type = LLVMStructTypeInContext(ctx, return_types,
						   num_return_elems, true);
	else
		ret_type = LLVMVoidTypeInContext(ctx);

	/* Setup the function */
	main_function_type =
		LLVMFunctionType(ret_type, args->types, args->count, 0);
	LLVMValueRef main_function =
		LLVMAddFunction(module, "main", main_function_type);
	main_function_body =
		LLVMAppendBasicBlockInContext(ctx, main_function, "main_body");
	LLVMPositionBuilderAtEnd(builder, main_function_body);

	LLVMSetFunctionCallConv(main_function, RADEON_LLVM_AMDGPU_CS);
	for (unsigned i = 0; i < args->sgpr_count; ++i) {
		LLVMValueRef P = LLVMGetParam(main_function, i);

		ac_add_function_attr(ctx, main_function, i + 1, AC_FUNC_ATTR_INREG);

		if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) {
			ac_add_function_attr(ctx, main_function, i + 1, AC_FUNC_ATTR_NOALIAS);
			ac_add_attr_dereferenceable(P, UINT64_MAX);
		}
	}

	if (options->address32_hi) {
		ac_llvm_add_target_dep_function_attr(main_function,
						     "amdgpu-32bit-address-high-bits",
						     options->address32_hi);
	}

	ac_llvm_set_workgroup_size(main_function, max_workgroup_size);

	if (options->unsafe_math) {
		/* These were copied from some LLVM test. */
		LLVMAddTargetDependentFunctionAttr(main_function,
						   "less-precise-fpmad",
						   "true");
		LLVMAddTargetDependentFunctionAttr(main_function,
						   "no-infs-fp-math",
						   "true");
		LLVMAddTargetDependentFunctionAttr(main_function,
						   "no-nans-fp-math",
						   "true");
		LLVMAddTargetDependentFunctionAttr(main_function,
						   "unsafe-fp-math",
						   "true");
		LLVMAddTargetDependentFunctionAttr(main_function,
						   "no-signed-zeros-fp-math",
						   "true");
	}
	return main_function;
}
static void
set_loc(struct radv_userdata_info *ud_info, uint8_t *sgpr_idx,
	uint8_t num_sgprs)
{
	ud_info->sgpr_idx = *sgpr_idx;
	ud_info->num_sgprs = num_sgprs;
	*sgpr_idx += num_sgprs;
}
static void
set_loc_shader(struct radv_shader_context *ctx, int idx, uint8_t *sgpr_idx,
	       uint8_t num_sgprs)
{
	struct radv_userdata_info *ud_info =
		&ctx->shader_info->user_sgprs_locs.shader_data[idx];
	assert(ud_info);

	set_loc(ud_info, sgpr_idx, num_sgprs);
}
static void
set_loc_shader_ptr(struct radv_shader_context *ctx, int idx, uint8_t *sgpr_idx)
{
	bool use_32bit_pointers = idx != AC_UD_SCRATCH_RING_OFFSETS;

	set_loc_shader(ctx, idx, sgpr_idx, use_32bit_pointers ? 1 : 2);
}
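/* Note on the 1-vs-2 split above: user-SGPR pointers are 32-bit (one SGPR)
 * for everything except the scratch/ring-offsets pointer, which is a full
 * 64-bit address and therefore occupies two SGPRs.
 */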
static void
set_loc_desc(struct radv_shader_context *ctx, int idx, uint8_t *sgpr_idx)
{
	struct radv_userdata_locations *locs =
		&ctx->shader_info->user_sgprs_locs;
	struct radv_userdata_info *ud_info = &locs->descriptor_sets[idx];
	assert(ud_info);

	set_loc(ud_info, sgpr_idx, 1);

	locs->descriptor_sets_enabled |= 1 << idx;
}
struct user_sgpr_info {
	bool need_ring_offsets;
	bool indirect_all_descriptor_sets;
	uint8_t remaining_sgprs;
};
static bool needs_view_index_sgpr(struct radv_shader_context *ctx,
				  gl_shader_stage stage)
{
	switch (stage) {
	case MESA_SHADER_VERTEX:
		if (ctx->shader_info->info.needs_multiview_view_index ||
		    (!ctx->options->key.vs_common_out.as_es && !ctx->options->key.vs_common_out.as_ls && ctx->options->key.has_multiview_view_index))
			return true;
		break;
	case MESA_SHADER_TESS_EVAL:
		if (ctx->shader_info->info.needs_multiview_view_index || (!ctx->options->key.vs_common_out.as_es && ctx->options->key.has_multiview_view_index))
			return true;
		break;
	case MESA_SHADER_GEOMETRY:
	case MESA_SHADER_TESS_CTRL:
		if (ctx->shader_info->info.needs_multiview_view_index)
			return true;
		break;
	default:
		break;
	}
	return false;
}
static uint8_t
count_vs_user_sgprs(struct radv_shader_context *ctx)
{
	uint8_t count = 0;

	if (ctx->shader_info->info.vs.has_vertex_buffers)
		count += 1;
	count += ctx->shader_info->info.vs.needs_draw_id ? 3 : 2;

	return count;
}
static void allocate_inline_push_consts(struct radv_shader_context *ctx,
					struct user_sgpr_info *user_sgpr_info)
{
	uint8_t remaining_sgprs = user_sgpr_info->remaining_sgprs;

	/* Only supported if shaders use push constants. */
	if (ctx->shader_info->info.min_push_constant_used == UINT8_MAX)
		return;

	/* Only supported if shaders don't have indirect push constants. */
	if (ctx->shader_info->info.has_indirect_push_constants)
		return;

	/* Only supported for 32-bit push constants. */
	if (!ctx->shader_info->info.has_only_32bit_push_constants)
		return;

	uint8_t num_push_consts =
		(ctx->shader_info->info.max_push_constant_used -
		 ctx->shader_info->info.min_push_constant_used) / 4;

	/* Check if the number of user SGPRs is large enough. */
	if (num_push_consts < remaining_sgprs) {
		ctx->shader_info->info.num_inline_push_consts = num_push_consts;
	} else {
		ctx->shader_info->info.num_inline_push_consts = remaining_sgprs;
	}

	/* Clamp to the maximum number of allowed inlined push constants. */
	if (ctx->shader_info->info.num_inline_push_consts > AC_MAX_INLINE_PUSH_CONSTS)
		ctx->shader_info->info.num_inline_push_consts = AC_MAX_INLINE_PUSH_CONSTS;

	if (ctx->shader_info->info.num_inline_push_consts == num_push_consts &&
	    !ctx->shader_info->info.loads_dynamic_offsets) {
		/* Disable the default push constants path if all constants are
		 * inlined and if shaders don't use dynamic descriptors.
		 */
		ctx->shader_info->info.loads_push_constants = false;
	}

	ctx->shader_info->info.base_inline_push_consts =
		ctx->shader_info->info.min_push_constant_used / 4;
}
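/* Worked example (assumed values): if a shader only reads push-constant
 * bytes [8, 24), then min_push_constant_used = 8, max_push_constant_used = 24,
 * num_push_consts = (24 - 8) / 4 = 4 dwords, and
 * base_inline_push_consts = 8 / 4 = 2. If more than 4 user SGPRs remain and
 * no dynamic offsets are loaded, the push-constant pointer is dropped
 * entirely and the 4 dwords ride in SGPRs instead.
 */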
static void allocate_user_sgprs(struct radv_shader_context *ctx,
				gl_shader_stage stage,
				bool has_previous_stage,
				gl_shader_stage previous_stage,
				bool needs_view_index,
				struct user_sgpr_info *user_sgpr_info)
{
	uint8_t user_sgpr_count = 0;

	memset(user_sgpr_info, 0, sizeof(struct user_sgpr_info));

	/* until we sort out scratch/global buffers always assign ring offsets for gs/vs/es */
	if (stage == MESA_SHADER_GEOMETRY ||
	    stage == MESA_SHADER_VERTEX ||
	    stage == MESA_SHADER_TESS_CTRL ||
	    stage == MESA_SHADER_TESS_EVAL ||
	    ctx->is_gs_copy_shader)
		user_sgpr_info->need_ring_offsets = true;

	if (stage == MESA_SHADER_FRAGMENT &&
	    ctx->shader_info->info.ps.needs_sample_positions)
		user_sgpr_info->need_ring_offsets = true;

	/* 2 user sgprs will nearly always be allocated for scratch/rings */
	if (ctx->options->supports_spill || user_sgpr_info->need_ring_offsets) {
		user_sgpr_count += 2;
	}

	switch (stage) {
	case MESA_SHADER_COMPUTE:
		if (ctx->shader_info->info.cs.uses_grid_size)
			user_sgpr_count += 3;
		break;
	case MESA_SHADER_FRAGMENT:
		user_sgpr_count += ctx->shader_info->info.ps.needs_sample_positions;
		break;
	case MESA_SHADER_VERTEX:
		if (!ctx->is_gs_copy_shader)
			user_sgpr_count += count_vs_user_sgprs(ctx);
		break;
	case MESA_SHADER_TESS_CTRL:
		if (has_previous_stage) {
			if (previous_stage == MESA_SHADER_VERTEX)
				user_sgpr_count += count_vs_user_sgprs(ctx);
		}
		break;
	case MESA_SHADER_TESS_EVAL:
		break;
	case MESA_SHADER_GEOMETRY:
		if (has_previous_stage) {
			if (previous_stage == MESA_SHADER_VERTEX) {
				user_sgpr_count += count_vs_user_sgprs(ctx);
			}
		}
		break;
	default:
		break;
	}

	if (needs_view_index)
		user_sgpr_count++;

	if (ctx->shader_info->info.loads_push_constants)
		user_sgpr_count++;

	if (ctx->streamout_buffers)
		user_sgpr_count++;

	uint32_t available_sgprs = ctx->options->chip_class >= GFX9 && stage != MESA_SHADER_COMPUTE ? 32 : 16;
	uint32_t remaining_sgprs = available_sgprs - user_sgpr_count;
	uint32_t num_desc_set =
		util_bitcount(ctx->shader_info->info.desc_set_used_mask);

	if (remaining_sgprs < num_desc_set) {
		user_sgpr_info->indirect_all_descriptor_sets = true;
		user_sgpr_info->remaining_sgprs = remaining_sgprs - 1;
	} else {
		user_sgpr_info->remaining_sgprs = remaining_sgprs - num_desc_set;
	}

	allocate_inline_push_consts(ctx, user_sgpr_info);
}
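/* Budget illustration (assumed shader): a GFX9 vertex shader gets 32 user
 * SGPRs. With 2 for scratch/rings, 1 for the vertex-buffer pointer, 3 for
 * base_vertex/start_instance/draw_id, 1 for push constants and 4 descriptor
 * sets, 32 - 7 - 4 = 21 SGPRs remain for inlined push constants. If the
 * sets did not fit, they would instead be reached indirectly through a
 * single pointer.
 */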
static void
declare_global_input_sgprs(struct radv_shader_context *ctx,
			   const struct user_sgpr_info *user_sgpr_info,
			   struct arg_info *args,
			   LLVMValueRef *desc_sets)
{
	LLVMTypeRef type = ac_array_in_const32_addr_space(ctx->ac.i8);

	/* 1 for each descriptor set */
	if (!user_sgpr_info->indirect_all_descriptor_sets) {
		uint32_t mask = ctx->shader_info->info.desc_set_used_mask;

		while (mask) {
			int i = u_bit_scan(&mask);

			add_arg(args, ARG_SGPR, type, &ctx->descriptor_sets[i]);
		}
	} else {
		add_arg(args, ARG_SGPR, ac_array_in_const32_addr_space(type),
			desc_sets);
	}

	if (ctx->shader_info->info.loads_push_constants) {
		/* 1 for push constants and dynamic descriptors */
		add_arg(args, ARG_SGPR, type, &ctx->abi.push_constants);
	}

	for (unsigned i = 0; i < ctx->shader_info->info.num_inline_push_consts; i++) {
		add_arg(args, ARG_SGPR, ctx->ac.i32,
			&ctx->abi.inline_push_consts[i]);
	}
	ctx->abi.num_inline_push_consts = ctx->shader_info->info.num_inline_push_consts;
	ctx->abi.base_inline_push_consts = ctx->shader_info->info.base_inline_push_consts;

	if (ctx->shader_info->info.so.num_outputs) {
		add_arg(args, ARG_SGPR,
			ac_array_in_const32_addr_space(ctx->ac.v4i32),
			&ctx->streamout_buffers);
	}
}
static void
declare_vs_specific_input_sgprs(struct radv_shader_context *ctx,
				gl_shader_stage stage,
				bool has_previous_stage,
				gl_shader_stage previous_stage,
				struct arg_info *args)
{
	if (!ctx->is_gs_copy_shader &&
	    (stage == MESA_SHADER_VERTEX ||
	     (has_previous_stage && previous_stage == MESA_SHADER_VERTEX))) {
		if (ctx->shader_info->info.vs.has_vertex_buffers) {
			add_arg(args, ARG_SGPR,
				ac_array_in_const32_addr_space(ctx->ac.v4i32),
				&ctx->vertex_buffers);
		}
		add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->abi.base_vertex);
		add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->abi.start_instance);
		if (ctx->shader_info->info.vs.needs_draw_id) {
			add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->abi.draw_id);
		}
	}
}
static void
declare_vs_input_vgprs(struct radv_shader_context *ctx, struct arg_info *args)
{
	add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.vertex_id);
	if (!ctx->is_gs_copy_shader) {
		if (ctx->options->key.vs_common_out.as_ls) {
			add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->rel_auto_id);
			if (ctx->ac.chip_class >= GFX10) {
				add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* user vgpr */
				add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.instance_id);
			} else {
				add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.instance_id);
				add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* unused */
			}
		} else {
			if (ctx->ac.chip_class >= GFX10) {
				add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* user vgpr */
				add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* user vgpr */
				add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.instance_id);
			} else {
				add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.instance_id);
				add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->vs_prim_id);
				add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* unused */
			}
		}
	}
}
static void
declare_streamout_sgprs(struct radv_shader_context *ctx, gl_shader_stage stage,
			struct arg_info *args)
{
	int i;

	if (ctx->ac.chip_class >= GFX10)
		return;

	/* Streamout SGPRs. */
	if (ctx->shader_info->info.so.num_outputs) {
		assert(stage == MESA_SHADER_VERTEX ||
		       stage == MESA_SHADER_TESS_EVAL);

		if (stage != MESA_SHADER_TESS_EVAL) {
			add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->streamout_config);
		} else {
			args->assign[args->count - 1] = &ctx->streamout_config;
			args->types[args->count - 1] = ctx->ac.i32;
		}

		add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->streamout_write_idx);
	}

	/* A streamout buffer offset is loaded if the stride is non-zero. */
	for (i = 0; i < 4; i++) {
		if (!ctx->shader_info->info.so.strides[i])
			continue;

		add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->streamout_offset[i]);
	}
}
static void
declare_tes_input_vgprs(struct radv_shader_context *ctx, struct arg_info *args)
{
	add_arg(args, ARG_VGPR, ctx->ac.f32, &ctx->tes_u);
	add_arg(args, ARG_VGPR, ctx->ac.f32, &ctx->tes_v);
	add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->tes_rel_patch_id);
	add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.tes_patch_id);
}
static void
set_global_input_locs(struct radv_shader_context *ctx,
		      const struct user_sgpr_info *user_sgpr_info,
		      LLVMValueRef desc_sets, uint8_t *user_sgpr_idx)
{
	uint32_t mask = ctx->shader_info->info.desc_set_used_mask;

	if (!user_sgpr_info->indirect_all_descriptor_sets) {
		while (mask) {
			int i = u_bit_scan(&mask);

			set_loc_desc(ctx, i, user_sgpr_idx);
		}
	} else {
		set_loc_shader_ptr(ctx, AC_UD_INDIRECT_DESCRIPTOR_SETS,
				   user_sgpr_idx);

		while (mask) {
			int i = u_bit_scan(&mask);

			ctx->descriptor_sets[i] =
				ac_build_load_to_sgpr(&ctx->ac, desc_sets,
						      LLVMConstInt(ctx->ac.i32, i, false));
		}

		ctx->shader_info->need_indirect_descriptor_sets = true;
	}

	if (ctx->shader_info->info.loads_push_constants) {
		set_loc_shader_ptr(ctx, AC_UD_PUSH_CONSTANTS, user_sgpr_idx);
	}

	if (ctx->shader_info->info.num_inline_push_consts) {
		set_loc_shader(ctx, AC_UD_INLINE_PUSH_CONSTANTS, user_sgpr_idx,
			       ctx->shader_info->info.num_inline_push_consts);
	}

	if (ctx->streamout_buffers) {
		set_loc_shader_ptr(ctx, AC_UD_STREAMOUT_BUFFERS,
				   user_sgpr_idx);
	}
}
static void
set_vs_specific_input_locs(struct radv_shader_context *ctx,
			   gl_shader_stage stage, bool has_previous_stage,
			   gl_shader_stage previous_stage,
			   uint8_t *user_sgpr_idx)
{
	if (!ctx->is_gs_copy_shader &&
	    (stage == MESA_SHADER_VERTEX ||
	     (has_previous_stage && previous_stage == MESA_SHADER_VERTEX))) {
		if (ctx->shader_info->info.vs.has_vertex_buffers) {
			set_loc_shader_ptr(ctx, AC_UD_VS_VERTEX_BUFFERS,
					   user_sgpr_idx);
		}

		unsigned vs_num = 2;
		if (ctx->shader_info->info.vs.needs_draw_id)
			vs_num++;

		set_loc_shader(ctx, AC_UD_VS_BASE_VERTEX_START_INSTANCE,
			       user_sgpr_idx, vs_num);
	}
}
static void set_llvm_calling_convention(LLVMValueRef func,
					gl_shader_stage stage)
{
	enum radeon_llvm_calling_convention calling_conv;

	switch (stage) {
	case MESA_SHADER_VERTEX:
	case MESA_SHADER_TESS_EVAL:
		calling_conv = RADEON_LLVM_AMDGPU_VS;
		break;
	case MESA_SHADER_GEOMETRY:
		calling_conv = RADEON_LLVM_AMDGPU_GS;
		break;
	case MESA_SHADER_TESS_CTRL:
		calling_conv = RADEON_LLVM_AMDGPU_HS;
		break;
	case MESA_SHADER_FRAGMENT:
		calling_conv = RADEON_LLVM_AMDGPU_PS;
		break;
	case MESA_SHADER_COMPUTE:
		calling_conv = RADEON_LLVM_AMDGPU_CS;
		break;
	default:
		unreachable("Unhandled shader type");
	}

	LLVMSetFunctionCallConv(func, calling_conv);
}
/* Returns whether the stage is a stage that can be directly before the GS */
static bool is_pre_gs_stage(gl_shader_stage stage)
{
	return stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_TESS_EVAL;
}
static void create_function(struct radv_shader_context *ctx,
			    gl_shader_stage stage,
			    bool has_previous_stage,
			    gl_shader_stage previous_stage)
{
	uint8_t user_sgpr_idx;
	struct user_sgpr_info user_sgpr_info;
	struct arg_info args = {};
	LLVMValueRef desc_sets;
	bool needs_view_index = needs_view_index_sgpr(ctx, stage);

	if (ctx->ac.chip_class >= GFX10) {
		if (is_pre_gs_stage(stage) && ctx->options->key.vs_common_out.as_ngg) {
			/* On GFX10, VS is merged into GS for NGG. */
			previous_stage = stage;
			stage = MESA_SHADER_GEOMETRY;
			has_previous_stage = true;
		}
	}

	allocate_user_sgprs(ctx, stage, has_previous_stage,
			    previous_stage, needs_view_index, &user_sgpr_info);

	if (user_sgpr_info.need_ring_offsets && !ctx->options->supports_spill) {
		add_arg(&args, ARG_SGPR, ac_array_in_const_addr_space(ctx->ac.v4i32),
			&ctx->ring_offsets);
	}

	switch (stage) {
	case MESA_SHADER_COMPUTE:
		declare_global_input_sgprs(ctx, &user_sgpr_info, &args,
					   &desc_sets);

		if (ctx->shader_info->info.cs.uses_grid_size) {
			add_arg(&args, ARG_SGPR, ctx->ac.v3i32,
				&ctx->abi.num_work_groups);
		}

		for (int i = 0; i < 3; i++) {
			ctx->abi.workgroup_ids[i] = NULL;
			if (ctx->shader_info->info.cs.uses_block_id[i]) {
				add_arg(&args, ARG_SGPR, ctx->ac.i32,
					&ctx->abi.workgroup_ids[i]);
			}
		}

		if (ctx->shader_info->info.cs.uses_local_invocation_idx)
			add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->abi.tg_size);
		add_arg(&args, ARG_VGPR, ctx->ac.v3i32,
			&ctx->abi.local_invocation_ids);
		break;
	case MESA_SHADER_VERTEX:
		declare_global_input_sgprs(ctx, &user_sgpr_info, &args,
					   &desc_sets);

		declare_vs_specific_input_sgprs(ctx, stage, has_previous_stage,
						previous_stage, &args);

		if (needs_view_index)
			add_arg(&args, ARG_SGPR, ctx->ac.i32,
				&ctx->abi.view_index);
		if (ctx->options->key.vs_common_out.as_es) {
			add_arg(&args, ARG_SGPR, ctx->ac.i32,
				&ctx->es2gs_offset);
		} else if (ctx->options->key.vs_common_out.as_ls) {
			/* no extra parameters */
		} else {
			declare_streamout_sgprs(ctx, stage, &args);
		}

		declare_vs_input_vgprs(ctx, &args);
		break;
	case MESA_SHADER_TESS_CTRL:
		if (has_previous_stage) {
			// First 6 system regs
			add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->oc_lds);
			add_arg(&args, ARG_SGPR, ctx->ac.i32,
				&ctx->merged_wave_info);
			add_arg(&args, ARG_SGPR, ctx->ac.i32,
				&ctx->tess_factor_offset);

			add_arg(&args, ARG_SGPR, ctx->ac.i32, NULL); // scratch offset
			add_arg(&args, ARG_SGPR, ctx->ac.i32, NULL); // unknown
			add_arg(&args, ARG_SGPR, ctx->ac.i32, NULL); // unknown

			declare_global_input_sgprs(ctx, &user_sgpr_info, &args,
						   &desc_sets);

			declare_vs_specific_input_sgprs(ctx, stage,
							has_previous_stage,
							previous_stage, &args);

			if (needs_view_index)
				add_arg(&args, ARG_SGPR, ctx->ac.i32,
					&ctx->abi.view_index);

			add_arg(&args, ARG_VGPR, ctx->ac.i32,
				&ctx->abi.tcs_patch_id);
			add_arg(&args, ARG_VGPR, ctx->ac.i32,
				&ctx->abi.tcs_rel_ids);

			declare_vs_input_vgprs(ctx, &args);
		} else {
			declare_global_input_sgprs(ctx, &user_sgpr_info, &args,
						   &desc_sets);

			if (needs_view_index)
				add_arg(&args, ARG_SGPR, ctx->ac.i32,
					&ctx->abi.view_index);

			add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->oc_lds);
			add_arg(&args, ARG_SGPR, ctx->ac.i32,
				&ctx->tess_factor_offset);
			add_arg(&args, ARG_VGPR, ctx->ac.i32,
				&ctx->abi.tcs_patch_id);
			add_arg(&args, ARG_VGPR, ctx->ac.i32,
				&ctx->abi.tcs_rel_ids);
		}
		break;
	case MESA_SHADER_TESS_EVAL:
		declare_global_input_sgprs(ctx, &user_sgpr_info, &args,
					   &desc_sets);

		if (needs_view_index)
			add_arg(&args, ARG_SGPR, ctx->ac.i32,
				&ctx->abi.view_index);

		if (ctx->options->key.vs_common_out.as_es) {
			add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->oc_lds);
			add_arg(&args, ARG_SGPR, ctx->ac.i32, NULL);
			add_arg(&args, ARG_SGPR, ctx->ac.i32,
				&ctx->es2gs_offset);
		} else {
			add_arg(&args, ARG_SGPR, ctx->ac.i32, NULL);
			declare_streamout_sgprs(ctx, stage, &args);
			add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->oc_lds);
		}
		declare_tes_input_vgprs(ctx, &args);
		break;
	case MESA_SHADER_GEOMETRY:
		if (has_previous_stage) {
			// First 6 system regs
			if (ctx->options->key.vs_common_out.as_ngg) {
				add_arg(&args, ARG_SGPR, ctx->ac.i32,
					&ctx->gs_tg_info);
			} else {
				add_arg(&args, ARG_SGPR, ctx->ac.i32,
					&ctx->gs2vs_offset);
			}

			add_arg(&args, ARG_SGPR, ctx->ac.i32,
				&ctx->merged_wave_info);
			add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->oc_lds);

			add_arg(&args, ARG_SGPR, ctx->ac.i32, NULL); // scratch offset
			add_arg(&args, ARG_SGPR, ctx->ac.i32, NULL); // unknown
			add_arg(&args, ARG_SGPR, ctx->ac.i32, NULL); // unknown

			declare_global_input_sgprs(ctx, &user_sgpr_info, &args,
						   &desc_sets);

			if (previous_stage != MESA_SHADER_TESS_EVAL) {
				declare_vs_specific_input_sgprs(ctx, stage,
								has_previous_stage,
								previous_stage,
								&args);
			}

			if (needs_view_index)
				add_arg(&args, ARG_SGPR, ctx->ac.i32,
					&ctx->abi.view_index);

			add_arg(&args, ARG_VGPR, ctx->ac.i32,
				&ctx->gs_vtx_offset[0]);
			add_arg(&args, ARG_VGPR, ctx->ac.i32,
				&ctx->gs_vtx_offset[2]);
			add_arg(&args, ARG_VGPR, ctx->ac.i32,
				&ctx->abi.gs_prim_id);
			add_arg(&args, ARG_VGPR, ctx->ac.i32,
				&ctx->abi.gs_invocation_id);
			add_arg(&args, ARG_VGPR, ctx->ac.i32,
				&ctx->gs_vtx_offset[4]);

			if (previous_stage == MESA_SHADER_VERTEX) {
				declare_vs_input_vgprs(ctx, &args);
			} else {
				declare_tes_input_vgprs(ctx, &args);
			}
		} else {
			declare_global_input_sgprs(ctx, &user_sgpr_info, &args,
						   &desc_sets);

			if (needs_view_index)
				add_arg(&args, ARG_SGPR, ctx->ac.i32,
					&ctx->abi.view_index);

			add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->gs2vs_offset);
			add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->gs_wave_id);
			add_arg(&args, ARG_VGPR, ctx->ac.i32,
				&ctx->gs_vtx_offset[0]);
			add_arg(&args, ARG_VGPR, ctx->ac.i32,
				&ctx->gs_vtx_offset[1]);
			add_arg(&args, ARG_VGPR, ctx->ac.i32,
				&ctx->abi.gs_prim_id);
			add_arg(&args, ARG_VGPR, ctx->ac.i32,
				&ctx->gs_vtx_offset[2]);
			add_arg(&args, ARG_VGPR, ctx->ac.i32,
				&ctx->gs_vtx_offset[3]);
			add_arg(&args, ARG_VGPR, ctx->ac.i32,
				&ctx->gs_vtx_offset[4]);
			add_arg(&args, ARG_VGPR, ctx->ac.i32,
				&ctx->gs_vtx_offset[5]);
			add_arg(&args, ARG_VGPR, ctx->ac.i32,
				&ctx->abi.gs_invocation_id);
		}
		break;
	case MESA_SHADER_FRAGMENT:
		declare_global_input_sgprs(ctx, &user_sgpr_info, &args,
					   &desc_sets);

		add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->abi.prim_mask);
		add_arg(&args, ARG_VGPR, ctx->ac.v2i32, &ctx->persp_sample);
		add_arg(&args, ARG_VGPR, ctx->ac.v2i32, &ctx->persp_center);
		add_arg(&args, ARG_VGPR, ctx->ac.v2i32, &ctx->persp_centroid);
		add_arg(&args, ARG_VGPR, ctx->ac.v3i32, NULL); /* persp pull model */
		add_arg(&args, ARG_VGPR, ctx->ac.v2i32, &ctx->linear_sample);
		add_arg(&args, ARG_VGPR, ctx->ac.v2i32, &ctx->linear_center);
		add_arg(&args, ARG_VGPR, ctx->ac.v2i32, &ctx->linear_centroid);
		add_arg(&args, ARG_VGPR, ctx->ac.f32, NULL); /* line stipple tex */
		add_arg(&args, ARG_VGPR, ctx->ac.f32, &ctx->abi.frag_pos[0]);
		add_arg(&args, ARG_VGPR, ctx->ac.f32, &ctx->abi.frag_pos[1]);
		add_arg(&args, ARG_VGPR, ctx->ac.f32, &ctx->abi.frag_pos[2]);
		add_arg(&args, ARG_VGPR, ctx->ac.f32, &ctx->abi.frag_pos[3]);
		add_arg(&args, ARG_VGPR, ctx->ac.i32, &ctx->abi.front_face);
		add_arg(&args, ARG_VGPR, ctx->ac.i32, &ctx->abi.ancillary);
		add_arg(&args, ARG_VGPR, ctx->ac.i32, &ctx->abi.sample_coverage);
		add_arg(&args, ARG_VGPR, ctx->ac.i32, NULL); /* fixed pt */
		break;
	default:
		unreachable("Shader stage not implemented");
	}

	ctx->main_function = create_llvm_function(
	    ctx->context, ctx->ac.module, ctx->ac.builder, NULL, 0, &args,
	    ctx->max_workgroup_size, ctx->options);
	set_llvm_calling_convention(ctx->main_function, stage);

	ctx->shader_info->num_input_vgprs = 0;
	ctx->shader_info->num_input_sgprs = ctx->options->supports_spill ? 2 : 0;

	ctx->shader_info->num_input_sgprs += args.num_sgprs_used;

	if (ctx->stage != MESA_SHADER_FRAGMENT)
		ctx->shader_info->num_input_vgprs = args.num_vgprs_used;

	assign_arguments(ctx->main_function, &args);

	user_sgpr_idx = 0;

	if (ctx->options->supports_spill || user_sgpr_info.need_ring_offsets) {
		set_loc_shader_ptr(ctx, AC_UD_SCRATCH_RING_OFFSETS,
				   &user_sgpr_idx);
		if (ctx->options->supports_spill) {
			ctx->ring_offsets = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.implicit.buffer.ptr",
							       LLVMPointerType(ctx->ac.i8, AC_ADDR_SPACE_CONST),
							       NULL, 0, AC_FUNC_ATTR_READNONE);
			ctx->ring_offsets = LLVMBuildBitCast(ctx->ac.builder, ctx->ring_offsets,
							     ac_array_in_const_addr_space(ctx->ac.v4i32), "");
		}
	}

	/* For merged shaders the user SGPRs start at 8, with 8 system SGPRs in front (including
	 * the rw_buffers at s0/s1. With user SGPR0 = s8, lets restart the count from 0 */
	if (has_previous_stage)
		user_sgpr_idx = 0;

	set_global_input_locs(ctx, &user_sgpr_info, desc_sets, &user_sgpr_idx);

	switch (stage) {
	case MESA_SHADER_COMPUTE:
		if (ctx->shader_info->info.cs.uses_grid_size) {
			set_loc_shader(ctx, AC_UD_CS_GRID_SIZE,
				       &user_sgpr_idx, 3);
		}
		break;
	case MESA_SHADER_VERTEX:
		set_vs_specific_input_locs(ctx, stage, has_previous_stage,
					   previous_stage, &user_sgpr_idx);
		if (ctx->abi.view_index)
			set_loc_shader(ctx, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1);
		break;
	case MESA_SHADER_TESS_CTRL:
		set_vs_specific_input_locs(ctx, stage, has_previous_stage,
					   previous_stage, &user_sgpr_idx);
		if (ctx->abi.view_index)
			set_loc_shader(ctx, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1);
		break;
	case MESA_SHADER_TESS_EVAL:
		if (ctx->abi.view_index)
			set_loc_shader(ctx, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1);
		break;
	case MESA_SHADER_GEOMETRY:
		if (has_previous_stage) {
			if (previous_stage == MESA_SHADER_VERTEX)
				set_vs_specific_input_locs(ctx, stage,
							   has_previous_stage,
							   previous_stage,
							   &user_sgpr_idx);
		}
		if (ctx->abi.view_index)
			set_loc_shader(ctx, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1);
		break;
	case MESA_SHADER_FRAGMENT:
		break;
	default:
		unreachable("Shader stage not implemented");
	}

	if (stage == MESA_SHADER_TESS_CTRL ||
	    (stage == MESA_SHADER_VERTEX && ctx->options->key.vs_common_out.as_ls) ||
	    /* GFX9 has the ESGS ring buffer in LDS. */
	    (stage == MESA_SHADER_GEOMETRY && has_previous_stage)) {
		ac_declare_lds_as_pointer(&ctx->ac);
	}

	ctx->shader_info->num_user_sgprs = user_sgpr_idx;
}
static LLVMValueRef
radv_load_resource(struct ac_shader_abi *abi, LLVMValueRef index,
		   unsigned desc_set, unsigned binding)
{
	struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
	LLVMValueRef desc_ptr = ctx->descriptor_sets[desc_set];
	struct radv_pipeline_layout *pipeline_layout = ctx->options->layout;
	struct radv_descriptor_set_layout *layout = pipeline_layout->set[desc_set].layout;
	unsigned base_offset = layout->binding[binding].offset;
	LLVMValueRef offset, stride;

	if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
	    layout->binding[binding].type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) {
		unsigned idx = pipeline_layout->set[desc_set].dynamic_offset_start +
			layout->binding[binding].dynamic_offset_offset;
		desc_ptr = ctx->abi.push_constants;
		base_offset = pipeline_layout->push_constant_size + 16 * idx;
		stride = LLVMConstInt(ctx->ac.i32, 16, false);
	} else
		stride = LLVMConstInt(ctx->ac.i32, layout->binding[binding].size, false);

	offset = LLVMConstInt(ctx->ac.i32, base_offset, false);

	if (layout->binding[binding].type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
		offset = ac_build_imad(&ctx->ac, index, stride, offset);
	}

	desc_ptr = LLVMBuildGEP(ctx->ac.builder, desc_ptr, &offset, 1, "");
	desc_ptr = ac_cast_ptr(&ctx->ac, desc_ptr, ctx->ac.v4i32);
	LLVMSetMetadata(desc_ptr, ctx->ac.uniform_md_kind, ctx->ac.empty_md);

	if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
		uint32_t desc_type = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
			S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
			S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
			S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
			S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
			S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);

		LLVMValueRef desc_components[4] = {
			LLVMBuildPtrToInt(ctx->ac.builder, desc_ptr, ctx->ac.intptr, ""),
			LLVMConstInt(ctx->ac.i32, S_008F04_BASE_ADDRESS_HI(ctx->options->address32_hi), false),
			/* High limit to support variable sizes. */
			LLVMConstInt(ctx->ac.i32, 0xffffffff, false),
			LLVMConstInt(ctx->ac.i32, desc_type, false),
		};

		return ac_build_gather_values(&ctx->ac, desc_components, 4);
	}

	return desc_ptr;
}
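/* Address illustration (assumed binding layout): for a non-dynamic binding
 * with offset 64 and descriptor size (stride) 32, descriptor `index`
 * resolves to set_base + 64 + index * 32; the result is cast to a v4i32
 * pointer so the caller can load the 16-byte descriptor words (e.g. a
 * buffer descriptor) from it.
 */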
/* The offchip buffer layout for TCS->TES is
 *
 * - attribute 0 of patch 0 vertex 0
 * - attribute 0 of patch 0 vertex 1
 * - attribute 0 of patch 0 vertex 2
 *   ...
 * - attribute 0 of patch 1 vertex 0
 * - attribute 0 of patch 1 vertex 1
 *   ...
 * - attribute 1 of patch 0 vertex 0
 * - attribute 1 of patch 0 vertex 1
 *   ...
 * - per patch attribute 0 of patch 0
 * - per patch attribute 0 of patch 1
 *   ...
 *
 * Note that every attribute has 4 components.
 */
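/* Worked address example for the layout above (assumed sizes): with 3
 * vertices per patch and 40 patches, get_tcs_tes_buffer_address() below
 * computes for a per-vertex attribute
 *   addr = 16 * (rel_patch_id * 3 + vertex_index + param_index * 3 * 40)
 * while for per-patch attributes (vertex_index == NULL) the patch stride
 * drops to num_patches and the size of the whole per-vertex region is added
 * on top, matching the ordering described above.
 */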
static LLVMValueRef
get_non_vertex_index_offset(struct radv_shader_context *ctx)
{
	uint32_t num_patches = ctx->tcs_num_patches;
	uint32_t num_tcs_outputs;
	if (ctx->stage == MESA_SHADER_TESS_CTRL)
		num_tcs_outputs = util_last_bit64(ctx->shader_info->info.tcs.outputs_written);
	else
		num_tcs_outputs = ctx->options->key.tes.tcs_num_outputs;

	uint32_t output_vertex_size = num_tcs_outputs * 16;
	uint32_t pervertex_output_patch_size = ctx->tcs_vertices_per_patch * output_vertex_size;

	return LLVMConstInt(ctx->ac.i32, pervertex_output_patch_size * num_patches, false);
}
static LLVMValueRef
calc_param_stride(struct radv_shader_context *ctx,
		  LLVMValueRef vertex_index)
{
	LLVMValueRef param_stride;
	if (vertex_index)
		param_stride = LLVMConstInt(ctx->ac.i32, ctx->tcs_vertices_per_patch * ctx->tcs_num_patches, false);
	else
		param_stride = LLVMConstInt(ctx->ac.i32, ctx->tcs_num_patches, false);
	return param_stride;
}
static LLVMValueRef
get_tcs_tes_buffer_address(struct radv_shader_context *ctx,
			   LLVMValueRef vertex_index,
			   LLVMValueRef param_index)
{
	LLVMValueRef base_addr;
	LLVMValueRef param_stride, constant16;
	LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
	LLVMValueRef vertices_per_patch = LLVMConstInt(ctx->ac.i32, ctx->tcs_vertices_per_patch, false);
	constant16 = LLVMConstInt(ctx->ac.i32, 16, false);
	param_stride = calc_param_stride(ctx, vertex_index);
	if (vertex_index) {
		base_addr = ac_build_imad(&ctx->ac, rel_patch_id,
					  vertices_per_patch, vertex_index);
	} else {
		base_addr = rel_patch_id;
	}

	base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr,
				 LLVMBuildMul(ctx->ac.builder, param_index,
					      param_stride, ""), "");

	base_addr = LLVMBuildMul(ctx->ac.builder, base_addr, constant16, "");

	if (!vertex_index) {
		LLVMValueRef patch_data_offset = get_non_vertex_index_offset(ctx);

		base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr,
					 patch_data_offset, "");
	}
	return base_addr;
}
static LLVMValueRef
get_tcs_tes_buffer_address_params(struct radv_shader_context *ctx,
				  unsigned param,
				  unsigned const_index,
				  bool is_compact,
				  LLVMValueRef vertex_index,
				  LLVMValueRef indir_index)
{
	LLVMValueRef param_index;

	if (indir_index)
		param_index = LLVMBuildAdd(ctx->ac.builder, LLVMConstInt(ctx->ac.i32, param, false),
					   indir_index, "");
	else {
		if (const_index && !is_compact)
			param += const_index;
		param_index = LLVMConstInt(ctx->ac.i32, param, false);
	}
	return get_tcs_tes_buffer_address(ctx, vertex_index, param_index);
}
static LLVMValueRef
get_dw_address(struct radv_shader_context *ctx,
	       LLVMValueRef dw_addr,
	       unsigned param,
	       unsigned const_index,
	       bool compact_const_index,
	       LLVMValueRef vertex_index,
	       LLVMValueRef stride,
	       LLVMValueRef indir_index)
{
	if (vertex_index) {
		dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr,
				       LLVMBuildMul(ctx->ac.builder,
						    vertex_index, stride, ""), "");
	}

	if (indir_index)
		dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr,
				       LLVMBuildMul(ctx->ac.builder, indir_index,
						    LLVMConstInt(ctx->ac.i32, 4, false), ""), "");
	else if (const_index && !compact_const_index)
		dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr,
				       LLVMConstInt(ctx->ac.i32, const_index * 4, false), "");

	dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr,
			       LLVMConstInt(ctx->ac.i32, param * 4, false), "");

	if (const_index && compact_const_index)
		dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr,
				       LLVMConstInt(ctx->ac.i32, const_index, false), "");
	return dw_addr;
}
static LLVMValueRef
load_tcs_varyings(struct ac_shader_abi *abi,
		  LLVMTypeRef type,
		  LLVMValueRef vertex_index,
		  LLVMValueRef indir_index,
		  unsigned const_index,
		  unsigned location,
		  unsigned driver_location,
		  unsigned component,
		  unsigned num_components,
		  bool is_patch,
		  bool is_compact,
		  bool load_input)
{
	struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
	LLVMValueRef dw_addr, stride;
	LLVMValueRef value[4], result;
	unsigned param = shader_io_get_unique_index(location);

	if (load_input) {
		uint32_t input_vertex_size = (ctx->tcs_num_inputs * 16) / 4;
		stride = LLVMConstInt(ctx->ac.i32, input_vertex_size, false);
		dw_addr = get_tcs_in_current_patch_offset(ctx);
	} else {
		if (!is_patch) {
			stride = get_tcs_out_vertex_stride(ctx);
			dw_addr = get_tcs_out_current_patch_offset(ctx);
		} else {
			dw_addr = get_tcs_out_current_patch_data_offset(ctx);
			stride = NULL;
		}
	}

	dw_addr = get_dw_address(ctx, dw_addr, param, const_index, is_compact, vertex_index, stride,
				 indir_index);

	for (unsigned i = 0; i < num_components + component; i++) {
		value[i] = ac_lds_load(&ctx->ac, dw_addr);
		dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr,
				       ctx->ac.i32_1, "");
	}
	result = ac_build_varying_gather_values(&ctx->ac, value, num_components, component);
	return result;
}
static void
store_tcs_output(struct ac_shader_abi *abi,
		 const nir_variable *var,
		 LLVMValueRef vertex_index,
		 LLVMValueRef param_index,
		 unsigned const_index,
		 LLVMValueRef src,
		 unsigned writemask)
{
	struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
	const unsigned location = var->data.location;
	unsigned component = var->data.location_frac;
	const bool is_patch = var->data.patch;
	const bool is_compact = var->data.compact;
	LLVMValueRef dw_addr;
	LLVMValueRef stride = NULL;
	LLVMValueRef buf_addr = NULL;
	unsigned param;
	bool store_lds = true;

	if (is_patch) {
		if (!(ctx->tcs_patch_outputs_read & (1U << (location - VARYING_SLOT_PATCH0))))
			store_lds = false;
	} else {
		if (!(ctx->tcs_outputs_read & (1ULL << location)))
			store_lds = false;
	}

	param = shader_io_get_unique_index(location);
	if ((location == VARYING_SLOT_CLIP_DIST0 || location == VARYING_SLOT_CLIP_DIST1) && is_compact) {
		const_index += component;
		component = 0;

		if (const_index >= 4) {
			const_index -= 4;
			param++;
		}
	}

	if (!is_patch) {
		stride = get_tcs_out_vertex_stride(ctx);
		dw_addr = get_tcs_out_current_patch_offset(ctx);
	} else {
		dw_addr = get_tcs_out_current_patch_data_offset(ctx);
	}

	dw_addr = get_dw_address(ctx, dw_addr, param, const_index, is_compact, vertex_index, stride,
				 param_index);
	buf_addr = get_tcs_tes_buffer_address_params(ctx, param, const_index, is_compact,
						     vertex_index, param_index);

	bool is_tess_factor = false;
	if (location == VARYING_SLOT_TESS_LEVEL_INNER ||
	    location == VARYING_SLOT_TESS_LEVEL_OUTER)
		is_tess_factor = true;

	unsigned base = is_compact ? const_index : 0;
	for (unsigned chan = 0; chan < 8; chan++) {
		if (!(writemask & (1 << chan)))
			continue;
		LLVMValueRef value = ac_llvm_extract_elem(&ctx->ac, src, chan - component);
		value = ac_to_integer(&ctx->ac, value);
		value = LLVMBuildZExtOrBitCast(ctx->ac.builder, value, ctx->ac.i32, "");

		if (store_lds || is_tess_factor) {
			LLVMValueRef dw_addr_chan =
				LLVMBuildAdd(ctx->ac.builder, dw_addr,
					     LLVMConstInt(ctx->ac.i32, chan, false), "");
			ac_lds_store(&ctx->ac, dw_addr_chan, value);
		}

		if (!is_tess_factor && writemask != 0xF)
			ac_build_buffer_store_dword(&ctx->ac, ctx->hs_ring_tess_offchip, value, 1,
						    buf_addr, ctx->oc_lds,
						    4 * (base + chan), ac_glc, false);
	}

	if (writemask == 0xF) {
		ac_build_buffer_store_dword(&ctx->ac, ctx->hs_ring_tess_offchip, src, 4,
					    buf_addr, ctx->oc_lds,
					    (base * 4), ac_glc, false);
	}
}
static LLVMValueRef
load_tes_input(struct ac_shader_abi *abi,
	       LLVMTypeRef type,
	       LLVMValueRef vertex_index,
	       LLVMValueRef param_index,
	       unsigned const_index,
	       unsigned location,
	       unsigned driver_location,
	       unsigned component,
	       unsigned num_components,
	       bool is_patch,
	       bool is_compact,
	       bool load_input)
{
	struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
	LLVMValueRef buf_addr;
	LLVMValueRef result;
	unsigned param = shader_io_get_unique_index(location);

	if ((location == VARYING_SLOT_CLIP_DIST0 || location == VARYING_SLOT_CLIP_DIST1) && is_compact) {
		const_index += component;
		component = 0;
		if (const_index >= 4) {
			const_index -= 4;
			param++;
		}
	}

	buf_addr = get_tcs_tes_buffer_address_params(ctx, param, const_index,
						     is_compact, vertex_index, param_index);

	LLVMValueRef comp_offset = LLVMConstInt(ctx->ac.i32, component * 4, false);
	buf_addr = LLVMBuildAdd(ctx->ac.builder, buf_addr, comp_offset, "");

	result = ac_build_buffer_load(&ctx->ac, ctx->hs_ring_tess_offchip, num_components, NULL,
				      buf_addr, ctx->oc_lds, is_compact ? (4 * const_index) : 0, ac_glc, true, false);
	result = ac_trim_vector(&ctx->ac, result, num_components);
	return result;
}
static LLVMValueRef
load_gs_input(struct ac_shader_abi *abi,
	      unsigned location,
	      unsigned driver_location,
	      unsigned component,
	      unsigned num_components,
	      unsigned vertex_index,
	      unsigned const_index,
	      LLVMTypeRef type)
{
	struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
	LLVMValueRef vtx_offset;
	unsigned param, vtx_offset_param;
	LLVMValueRef value[4], result;

	vtx_offset_param = vertex_index;
	assert(vtx_offset_param < 6);
	vtx_offset = LLVMBuildMul(ctx->ac.builder, ctx->gs_vtx_offset[vtx_offset_param],
				  LLVMConstInt(ctx->ac.i32, 4, false), "");

	param = shader_io_get_unique_index(location);

	for (unsigned i = component; i < num_components + component; i++) {
		if (ctx->ac.chip_class >= GFX9) {
			LLVMValueRef dw_addr = ctx->gs_vtx_offset[vtx_offset_param];
			dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr,
					       LLVMConstInt(ctx->ac.i32, param * 4 + i + const_index, 0), "");
			value[i] = ac_lds_load(&ctx->ac, dw_addr);
		} else {
			LLVMValueRef soffset =
				LLVMConstInt(ctx->ac.i32,
					     (param * 4 + i + const_index) * 256,
					     false);

			value[i] = ac_build_buffer_load(&ctx->ac,
							ctx->esgs_ring, 1,
							ctx->ac.i32_0,
							vtx_offset, soffset,
							0, ac_glc, true, false);
		}

		if (ac_get_type_size(type) == 2) {
			value[i] = LLVMBuildBitCast(ctx->ac.builder, value[i], ctx->ac.i32, "");
			value[i] = LLVMBuildTrunc(ctx->ac.builder, value[i], ctx->ac.i16, "");
		}
		value[i] = LLVMBuildBitCast(ctx->ac.builder, value[i], type, "");
	}
	result = ac_build_varying_gather_values(&ctx->ac, value, num_components, component);
	result = ac_to_integer(&ctx->ac, result);
	return result;
}
static void radv_emit_kill(struct ac_shader_abi *abi, LLVMValueRef visible)
{
	struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
	ac_build_kill_if_false(&ctx->ac, visible);
}
static LLVMValueRef
lookup_interp_param(struct ac_shader_abi *abi,
		    enum glsl_interp_mode interp, unsigned location)
{
	struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);

	switch (interp) {
	case INTERP_MODE_FLAT:
	default:
		return NULL;
	case INTERP_MODE_SMOOTH:
	case INTERP_MODE_NONE:
		if (location == INTERP_CENTER)
			return ctx->persp_center;
		else if (location == INTERP_CENTROID)
			return ctx->persp_centroid;
		else if (location == INTERP_SAMPLE)
			return ctx->persp_sample;
		break;
	case INTERP_MODE_NOPERSPECTIVE:
		if (location == INTERP_CENTER)
			return ctx->linear_center;
		else if (location == INTERP_CENTROID)
			return ctx->linear_centroid;
		else if (location == INTERP_SAMPLE)
			return ctx->linear_sample;
		break;
	}
	return NULL;
}
static uint32_t
radv_get_sample_pos_offset(uint32_t num_samples)
{
	uint32_t sample_pos_offset = 0;

	switch (num_samples) {
	case 2:
		sample_pos_offset = 1;
		break;
	case 4:
		sample_pos_offset = 3;
		break;
	case 8:
		sample_pos_offset = 7;
		break;
	default:
		break;
	}
	return sample_pos_offset;
}
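/* The offsets above reflect the sample-positions ring layout: the 1x
 * position sits at index 0, the 2x pair at indices 1-2, the 4x set at 3-6
 * and the 8x set at 7-14. E.g. for num_samples == 4, sample i is read at
 * ring index 3 + i.
 */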
static LLVMValueRef
load_sample_position(struct ac_shader_abi *abi,
		     LLVMValueRef sample_id)
{
	struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);

	LLVMValueRef result;
	LLVMValueRef index = LLVMConstInt(ctx->ac.i32, RING_PS_SAMPLE_POSITIONS, false);
	LLVMValueRef ptr = LLVMBuildGEP(ctx->ac.builder, ctx->ring_offsets, &index, 1, "");

	ptr = LLVMBuildBitCast(ctx->ac.builder, ptr,
			       ac_array_in_const_addr_space(ctx->ac.v2f32), "");

	uint32_t sample_pos_offset =
		radv_get_sample_pos_offset(ctx->options->key.fs.num_samples);

	sample_id =
		LLVMBuildAdd(ctx->ac.builder, sample_id,
			     LLVMConstInt(ctx->ac.i32, sample_pos_offset, false), "");
	result = ac_build_load_invariant(&ctx->ac, ptr, sample_id);

	return result;
}
static LLVMValueRef load_sample_mask_in(struct ac_shader_abi *abi)
{
	struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
	uint8_t log2_ps_iter_samples;

	if (ctx->shader_info->info.ps.force_persample) {
		log2_ps_iter_samples =
			util_logbase2(ctx->options->key.fs.num_samples);
	} else {
		log2_ps_iter_samples = ctx->options->key.fs.log2_ps_iter_samples;
	}

	/* The bit pattern matches that used by fixed function fragment
	 * processing. */
	static const uint16_t ps_iter_masks[] = {
		0xffff, /* not used */
		0x5555,
		0x1111,
		0x0101,
		0x0001,
	};
	assert(log2_ps_iter_samples < ARRAY_SIZE(ps_iter_masks));

	uint32_t ps_iter_mask = ps_iter_masks[log2_ps_iter_samples];

	LLVMValueRef result, sample_id;
	sample_id = ac_unpack_param(&ctx->ac, abi->ancillary, 8, 4);
	sample_id = LLVMBuildShl(ctx->ac.builder, LLVMConstInt(ctx->ac.i32, ps_iter_mask, false), sample_id, "");
	result = LLVMBuildAnd(ctx->ac.builder, sample_id, abi->sample_coverage, "");

	return result;
}

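/* Illustrative sketch, not part of the driver: the same mask computation done
 * on the CPU. The helper name is hypothetical and the block is not compiled;
 * it only mirrors the shl/and sequence built above. E.g. with the 4x-shading
 * mask 0x1111 and sample_id == 1 the per-sample mask is 0x2222 before it is
 * ANDed with the hardware coverage.
 */
#if 0
static uint32_t example_sample_mask_in(uint32_t ps_iter_mask, uint32_t sample_id,
				       uint32_t sample_coverage)
{
	return (ps_iter_mask << sample_id) & sample_coverage;
}
#endif
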
static void gfx10_ngg_gs_emit_vertex(struct radv_shader_context *ctx,
				     unsigned stream,
				     LLVMValueRef *addrs);

static void
visit_emit_vertex(struct ac_shader_abi *abi, unsigned stream, LLVMValueRef *addrs)
{
	LLVMValueRef gs_next_vertex;
	LLVMValueRef can_emit;
	unsigned offset = 0;
	struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);

	if (ctx->options->key.vs_common_out.as_ngg) {
		gfx10_ngg_gs_emit_vertex(ctx, stream, addrs);
		return;
	}

	/* Write vertex attribute values to GSVS ring */
	gs_next_vertex = LLVMBuildLoad(ctx->ac.builder,
				       ctx->gs_next_vertex[stream],
				       "");

	/* If this thread has already emitted the declared maximum number of
	 * vertices, kill it: excessive vertex emissions are not supposed to
	 * have any effect, and GS threads have no externally observable
	 * effects other than emitting vertices.
	 */
	can_emit = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, gs_next_vertex,
				 LLVMConstInt(ctx->ac.i32, ctx->gs_max_out_vertices, false), "");
	ac_build_kill_if_false(&ctx->ac, can_emit);

	for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
		unsigned output_usage_mask =
			ctx->shader_info->info.gs.output_usage_mask[i];
		uint8_t output_stream =
			ctx->shader_info->info.gs.output_streams[i];
		LLVMValueRef *out_ptr = &addrs[i * 4];
		int length = util_last_bit(output_usage_mask);

		if (!(ctx->output_mask & (1ull << i)) ||
		    output_stream != stream)
			continue;

		for (unsigned j = 0; j < length; j++) {
			if (!(output_usage_mask & (1 << j)))
				continue;

			LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder,
							     out_ptr[j], "");
			LLVMValueRef voffset =
				LLVMConstInt(ctx->ac.i32, offset *
					     ctx->gs_max_out_vertices, false);

			offset++;

			voffset = LLVMBuildAdd(ctx->ac.builder, voffset, gs_next_vertex, "");
			voffset = LLVMBuildMul(ctx->ac.builder, voffset, LLVMConstInt(ctx->ac.i32, 4, false), "");

			out_val = ac_to_integer(&ctx->ac, out_val);
			out_val = LLVMBuildZExtOrBitCast(ctx->ac.builder, out_val, ctx->ac.i32, "");

			ac_build_buffer_store_dword(&ctx->ac,
						    ctx->gsvs_ring[stream],
						    out_val, 1,
						    voffset, ctx->gs2vs_offset, 0,
						    ac_glc | ac_slc, true);
		}
	}

	gs_next_vertex = LLVMBuildAdd(ctx->ac.builder, gs_next_vertex,
				      ctx->ac.i32_1, "");
	LLVMBuildStore(ctx->ac.builder, gs_next_vertex, ctx->gs_next_vertex[stream]);

	ac_build_sendmsg(&ctx->ac,
			 AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
			 ctx->gs_wave_id);
}

static void
visit_end_primitive(struct ac_shader_abi *abi, unsigned stream)
{
	struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);

	if (ctx->options->key.vs_common_out.as_ngg) {
		LLVMBuildStore(ctx->ac.builder, ctx->ac.i32_0, ctx->gs_curprim_verts[stream]);
		return;
	}

	ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8), ctx->gs_wave_id);
}

static LLVMValueRef
load_tess_coord(struct ac_shader_abi *abi)
{
	struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);

	LLVMValueRef coord[4] = {
		ctx->tes_u,
		ctx->tes_v,
		ctx->ac.f32_0,
		ctx->ac.f32_0,
	};

	if (ctx->tes_primitive_mode == GL_TRIANGLES)
		coord[2] = LLVMBuildFSub(ctx->ac.builder, ctx->ac.f32_1,
					 LLVMBuildFAdd(ctx->ac.builder, coord[0], coord[1], ""), "");

	return ac_build_gather_values(&ctx->ac, coord, 3);
}

static LLVMValueRef
load_patch_vertices_in(struct ac_shader_abi *abi)
{
	struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
	return LLVMConstInt(ctx->ac.i32, ctx->options->key.tcs.input_vertices, false);
}

static LLVMValueRef radv_load_base_vertex(struct ac_shader_abi *abi)
{
	return abi->base_vertex;
}

static LLVMValueRef radv_load_ssbo(struct ac_shader_abi *abi,
				   LLVMValueRef buffer_ptr, bool write)
{
	struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
	LLVMValueRef result;

	LLVMSetMetadata(buffer_ptr, ctx->ac.uniform_md_kind, ctx->ac.empty_md);

	result = LLVMBuildLoad(ctx->ac.builder, buffer_ptr, "");
	LLVMSetMetadata(result, ctx->ac.invariant_load_md_kind, ctx->ac.empty_md);

	return result;
}

static LLVMValueRef radv_load_ubo(struct ac_shader_abi *abi, LLVMValueRef buffer_ptr)
{
	struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
	LLVMValueRef result;

	if (LLVMGetTypeKind(LLVMTypeOf(buffer_ptr)) != LLVMPointerTypeKind) {
		/* Do not load the descriptor for inlined uniform blocks. */
		return buffer_ptr;
	}

	LLVMSetMetadata(buffer_ptr, ctx->ac.uniform_md_kind, ctx->ac.empty_md);

	result = LLVMBuildLoad(ctx->ac.builder, buffer_ptr, "");
	LLVMSetMetadata(result, ctx->ac.invariant_load_md_kind, ctx->ac.empty_md);

	return result;
}

static LLVMValueRef radv_get_sampler_desc(struct ac_shader_abi *abi,
					   unsigned descriptor_set,
					   unsigned base_index,
					   unsigned constant_index,
					   LLVMValueRef index,
					   enum ac_descriptor_type desc_type,
					   bool image, bool write,
					   bool bindless)
{
	struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
	LLVMValueRef list = ctx->descriptor_sets[descriptor_set];
	struct radv_descriptor_set_layout *layout = ctx->options->layout->set[descriptor_set].layout;
	struct radv_descriptor_set_binding_layout *binding = layout->binding + base_index;
	unsigned offset = binding->offset;
	unsigned stride = binding->size;
	unsigned type_size;
	LLVMBuilderRef builder = ctx->ac.builder;
	LLVMTypeRef type;

	assert(base_index < layout->binding_count);

	switch (desc_type) {
	case AC_DESC_IMAGE:
		type = ctx->ac.v8i32;
		type_size = 32;
		break;
	case AC_DESC_FMASK:
		type = ctx->ac.v8i32;
		offset += 32;
		type_size = 32;
		break;
	case AC_DESC_SAMPLER:
		type = ctx->ac.v4i32;
		if (binding->type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) {
			offset += radv_combined_image_descriptor_sampler_offset(binding);
		}
		type_size = 16;
		break;
	case AC_DESC_BUFFER:
		type = ctx->ac.v4i32;
		type_size = 16;
		break;
	case AC_DESC_PLANE_0:
	case AC_DESC_PLANE_1:
	case AC_DESC_PLANE_2:
		type = ctx->ac.v8i32;
		type_size = 32;
		offset += 32 * (desc_type - AC_DESC_PLANE_0);
		break;
	default:
		unreachable("invalid desc_type\n");
	}

	offset += constant_index * stride;

	if (desc_type == AC_DESC_SAMPLER && binding->immutable_samplers_offset &&
	    (!index || binding->immutable_samplers_equal)) {
		if (binding->immutable_samplers_equal)
			constant_index = 0;

		const uint32_t *samplers = radv_immutable_samplers(layout, binding);

		LLVMValueRef constants[] = {
			LLVMConstInt(ctx->ac.i32, samplers[constant_index * 4 + 0], 0),
			LLVMConstInt(ctx->ac.i32, samplers[constant_index * 4 + 1], 0),
			LLVMConstInt(ctx->ac.i32, samplers[constant_index * 4 + 2], 0),
			LLVMConstInt(ctx->ac.i32, samplers[constant_index * 4 + 3], 0),
		};
		return ac_build_gather_values(&ctx->ac, constants, 4);
	}

	assert(stride % type_size == 0);

	LLVMValueRef adjusted_index = index;
	if (!adjusted_index)
		adjusted_index = ctx->ac.i32_0;

	adjusted_index = LLVMBuildMul(builder, adjusted_index, LLVMConstInt(ctx->ac.i32, stride / type_size, 0), "");

	LLVMValueRef val_offset = LLVMConstInt(ctx->ac.i32, offset, 0);
	list = LLVMBuildGEP(builder, list, &val_offset, 1, "");
	list = LLVMBuildPointerCast(builder, list,
				    ac_array_in_const32_addr_space(type), "");

	LLVMValueRef descriptor = ac_build_load_to_sgpr(&ctx->ac, list, adjusted_index);

	/* 3 plane formats always have same size and format for plane 1 & 2, so
	 * use the tail from plane 1 so that we can store only the first 16 bytes
	 * of the last plane. */
	if (desc_type == AC_DESC_PLANE_2) {
		LLVMValueRef descriptor2 = radv_get_sampler_desc(abi, descriptor_set, base_index, constant_index, index, AC_DESC_PLANE_1, image, write, bindless);

		LLVMValueRef components[8];
		for (unsigned i = 0; i < 4; ++i)
			components[i] = ac_llvm_extract_elem(&ctx->ac, descriptor, i);

		for (unsigned i = 4; i < 8; ++i)
			components[i] = ac_llvm_extract_elem(&ctx->ac, descriptor2, i);
		descriptor = ac_build_gather_values(&ctx->ac, components, 8);
	}

	return descriptor;
}

/* For 2_10_10_10 formats the alpha is handled as unsigned by pre-vega HW.
 * so we may need to fix it up. */
static LLVMValueRef
adjust_vertex_fetch_alpha(struct radv_shader_context *ctx,
			  unsigned adjustment,
			  LLVMValueRef alpha)
{
	if (adjustment == RADV_ALPHA_ADJUST_NONE)
		return alpha;

	LLVMValueRef c30 = LLVMConstInt(ctx->ac.i32, 30, 0);

	alpha = LLVMBuildBitCast(ctx->ac.builder, alpha, ctx->ac.f32, "");

	if (adjustment == RADV_ALPHA_ADJUST_SSCALED)
		alpha = LLVMBuildFPToUI(ctx->ac.builder, alpha, ctx->ac.i32, "");
	else
		alpha = ac_to_integer(&ctx->ac, alpha);

	/* For the integer-like cases, do a natural sign extension.
	 *
	 * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
	 * and happen to contain 0, 1, 2, 3 as the two LSBs of the
	 * exponent.
	 */
	alpha = LLVMBuildShl(ctx->ac.builder, alpha,
			     adjustment == RADV_ALPHA_ADJUST_SNORM ?
			     LLVMConstInt(ctx->ac.i32, 7, 0) : c30, "");
	alpha = LLVMBuildAShr(ctx->ac.builder, alpha, c30, "");

	/* Convert back to the right type. */
	if (adjustment == RADV_ALPHA_ADJUST_SNORM) {
		LLVMValueRef clamp;
		LLVMValueRef neg_one = LLVMConstReal(ctx->ac.f32, -1.0);
		alpha = LLVMBuildSIToFP(ctx->ac.builder, alpha, ctx->ac.f32, "");
		clamp = LLVMBuildFCmp(ctx->ac.builder, LLVMRealULT, alpha, neg_one, "");
		alpha = LLVMBuildSelect(ctx->ac.builder, clamp, neg_one, alpha, "");
	} else if (adjustment == RADV_ALPHA_ADJUST_SSCALED) {
		alpha = LLVMBuildSIToFP(ctx->ac.builder, alpha, ctx->ac.f32, "");
	}

	return LLVMBuildBitCast(ctx->ac.builder, alpha, ctx->ac.i32, "");
}

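/* Illustrative sketch, not part of the driver: the shift-based sign extension
 * used above for the integer-like alpha cases, written out on the CPU. The
 * helper is hypothetical, is not compiled, and assumes arithmetic right shift
 * of signed values, which is what the generated AShr provides. E.g. the 2-bit
 * value 0b11 maps to -1.
 */
#if 0
static int32_t example_sign_extend_2bit_alpha(uint32_t alpha_bits)
{
	return ((int32_t)(alpha_bits << 30)) >> 30;
}
#endif
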
static unsigned
get_num_channels_from_data_format(unsigned data_format)
{
	switch (data_format) {
	case V_008F0C_BUF_DATA_FORMAT_8:
	case V_008F0C_BUF_DATA_FORMAT_16:
	case V_008F0C_BUF_DATA_FORMAT_32:
		return 1;
	case V_008F0C_BUF_DATA_FORMAT_8_8:
	case V_008F0C_BUF_DATA_FORMAT_16_16:
	case V_008F0C_BUF_DATA_FORMAT_32_32:
		return 2;
	case V_008F0C_BUF_DATA_FORMAT_10_11_11:
	case V_008F0C_BUF_DATA_FORMAT_11_11_10:
	case V_008F0C_BUF_DATA_FORMAT_32_32_32:
		return 3;
	case V_008F0C_BUF_DATA_FORMAT_8_8_8_8:
	case V_008F0C_BUF_DATA_FORMAT_10_10_10_2:
	case V_008F0C_BUF_DATA_FORMAT_2_10_10_10:
	case V_008F0C_BUF_DATA_FORMAT_16_16_16_16:
	case V_008F0C_BUF_DATA_FORMAT_32_32_32_32:
		return 4;
	default:
		break;
	}

	return 0;
}

static LLVMValueRef
radv_fixup_vertex_input_fetches(struct radv_shader_context *ctx,
				LLVMValueRef value,
				unsigned num_channels,
				bool is_float)
{
	LLVMValueRef zero = is_float ? ctx->ac.f32_0 : ctx->ac.i32_0;
	LLVMValueRef one = is_float ? ctx->ac.f32_1 : ctx->ac.i32_1;
	LLVMValueRef chan[4];

	if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) {
		unsigned vec_size = LLVMGetVectorSize(LLVMTypeOf(value));

		if (num_channels == 4 && num_channels == vec_size)
			return value;

		num_channels = MIN2(num_channels, vec_size);

		for (unsigned i = 0; i < num_channels; i++)
			chan[i] = ac_llvm_extract_elem(&ctx->ac, value, i);
	} else {
		assert(num_channels == 1);
		chan[0] = value;
	}

	for (unsigned i = num_channels; i < 4; i++) {
		chan[i] = i == 3 ? one : zero;
		chan[i] = ac_to_integer(&ctx->ac, chan[i]);
	}

	return ac_build_gather_values(&ctx->ac, chan, 4);
}

2232 handle_vs_input_decl(struct radv_shader_context
*ctx
,
2233 struct nir_variable
*variable
)
2235 LLVMValueRef t_list_ptr
= ctx
->vertex_buffers
;
2236 LLVMValueRef t_offset
;
2237 LLVMValueRef t_list
;
2239 LLVMValueRef buffer_index
;
2240 unsigned attrib_count
= glsl_count_attribute_slots(variable
->type
, true);
2241 uint8_t input_usage_mask
=
2242 ctx
->shader_info
->info
.vs
.input_usage_mask
[variable
->data
.location
];
2243 unsigned num_input_channels
= util_last_bit(input_usage_mask
);
2245 variable
->data
.driver_location
= variable
->data
.location
* 4;
2247 enum glsl_base_type type
= glsl_get_base_type(variable
->type
);
2248 for (unsigned i
= 0; i
< attrib_count
; ++i
) {
2249 LLVMValueRef output
[4];
2250 unsigned attrib_index
= variable
->data
.location
+ i
- VERT_ATTRIB_GENERIC0
;
2251 unsigned attrib_format
= ctx
->options
->key
.vs
.vertex_attribute_formats
[attrib_index
];
2252 unsigned data_format
= attrib_format
& 0x0f;
2253 unsigned num_format
= (attrib_format
>> 4) & 0x07;
2254 bool is_float
= num_format
!= V_008F0C_BUF_NUM_FORMAT_UINT
&&
2255 num_format
!= V_008F0C_BUF_NUM_FORMAT_SINT
;
2257 if (ctx
->options
->key
.vs
.instance_rate_inputs
& (1u << attrib_index
)) {
2258 uint32_t divisor
= ctx
->options
->key
.vs
.instance_rate_divisors
[attrib_index
];
2261 buffer_index
= ctx
->abi
.instance_id
;
2264 buffer_index
= LLVMBuildUDiv(ctx
->ac
.builder
, buffer_index
,
2265 LLVMConstInt(ctx
->ac
.i32
, divisor
, 0), "");
2268 buffer_index
= ctx
->ac
.i32_0
;
2271 buffer_index
= LLVMBuildAdd(ctx
->ac
.builder
, ctx
->abi
.start_instance
, buffer_index
, "");
2273 buffer_index
= LLVMBuildAdd(ctx
->ac
.builder
, ctx
->abi
.vertex_id
,
2274 ctx
->abi
.base_vertex
, "");
2276 /* Adjust the number of channels to load based on the vertex
2279 unsigned num_format_channels
= get_num_channels_from_data_format(data_format
);
2280 unsigned num_channels
= MIN2(num_input_channels
, num_format_channels
);
2281 unsigned attrib_binding
= ctx
->options
->key
.vs
.vertex_attribute_bindings
[attrib_index
];
2282 unsigned attrib_offset
= ctx
->options
->key
.vs
.vertex_attribute_offsets
[attrib_index
];
2283 unsigned attrib_stride
= ctx
->options
->key
.vs
.vertex_attribute_strides
[attrib_index
];
2285 if (ctx
->options
->key
.vs
.post_shuffle
& (1 << attrib_index
)) {
2286 /* Always load, at least, 3 channels for formats that
2287 * need to be shuffled because X<->Z.
2289 num_channels
= MAX2(num_channels
, 3);
2292 if (attrib_stride
!= 0 && attrib_offset
> attrib_stride
) {
2293 LLVMValueRef buffer_offset
=
2294 LLVMConstInt(ctx
->ac
.i32
,
2295 attrib_offset
/ attrib_stride
, false);
2297 buffer_index
= LLVMBuildAdd(ctx
->ac
.builder
,
2301 attrib_offset
= attrib_offset
% attrib_stride
;
2304 t_offset
= LLVMConstInt(ctx
->ac
.i32
, attrib_binding
, false);
2305 t_list
= ac_build_load_to_sgpr(&ctx
->ac
, t_list_ptr
, t_offset
);
2307 input
= ac_build_struct_tbuffer_load(&ctx
->ac
, t_list
,
2309 LLVMConstInt(ctx
->ac
.i32
, attrib_offset
, false),
2310 ctx
->ac
.i32_0
, ctx
->ac
.i32_0
,
2312 data_format
, num_format
, 0, true);
2314 if (ctx
->options
->key
.vs
.post_shuffle
& (1 << attrib_index
)) {
2316 c
[0] = ac_llvm_extract_elem(&ctx
->ac
, input
, 2);
2317 c
[1] = ac_llvm_extract_elem(&ctx
->ac
, input
, 1);
2318 c
[2] = ac_llvm_extract_elem(&ctx
->ac
, input
, 0);
2319 c
[3] = ac_llvm_extract_elem(&ctx
->ac
, input
, 3);
2321 input
= ac_build_gather_values(&ctx
->ac
, c
, 4);
2324 input
= radv_fixup_vertex_input_fetches(ctx
, input
, num_channels
,
2327 for (unsigned chan
= 0; chan
< 4; chan
++) {
2328 LLVMValueRef llvm_chan
= LLVMConstInt(ctx
->ac
.i32
, chan
, false);
2329 output
[chan
] = LLVMBuildExtractElement(ctx
->ac
.builder
, input
, llvm_chan
, "");
2330 if (type
== GLSL_TYPE_FLOAT16
) {
2331 output
[chan
] = LLVMBuildBitCast(ctx
->ac
.builder
, output
[chan
], ctx
->ac
.f32
, "");
2332 output
[chan
] = LLVMBuildFPTrunc(ctx
->ac
.builder
, output
[chan
], ctx
->ac
.f16
, "");
2336 unsigned alpha_adjust
= (ctx
->options
->key
.vs
.alpha_adjust
>> (attrib_index
* 2)) & 3;
2337 output
[3] = adjust_vertex_fetch_alpha(ctx
, alpha_adjust
, output
[3]);
2339 for (unsigned chan
= 0; chan
< 4; chan
++) {
2340 output
[chan
] = ac_to_integer(&ctx
->ac
, output
[chan
]);
2341 if (type
== GLSL_TYPE_UINT16
|| type
== GLSL_TYPE_INT16
)
2342 output
[chan
] = LLVMBuildTrunc(ctx
->ac
.builder
, output
[chan
], ctx
->ac
.i16
, "");
2344 ctx
->inputs
[ac_llvm_reg_index_soa(variable
->data
.location
+ i
, chan
)] = output
[chan
];
static void
handle_vs_inputs(struct radv_shader_context *ctx,
		 struct nir_shader *nir) {
	nir_foreach_variable(variable, &nir->inputs)
		handle_vs_input_decl(ctx, variable);
}

static void
prepare_interp_optimize(struct radv_shader_context *ctx,
			struct nir_shader *nir)
{
	bool uses_center = false;
	bool uses_centroid = false;
	nir_foreach_variable(variable, &nir->inputs) {
		if (glsl_get_base_type(glsl_without_array(variable->type)) != GLSL_TYPE_FLOAT ||
		    variable->data.sample)
			continue;

		if (variable->data.centroid)
			uses_centroid = true;
		else
			uses_center = true;
	}

	if (uses_center && uses_centroid) {
		LLVMValueRef sel = LLVMBuildICmp(ctx->ac.builder, LLVMIntSLT, ctx->abi.prim_mask, ctx->ac.i32_0, "");
		ctx->persp_centroid = LLVMBuildSelect(ctx->ac.builder, sel, ctx->persp_center, ctx->persp_centroid, "");
		ctx->linear_centroid = LLVMBuildSelect(ctx->ac.builder, sel, ctx->linear_center, ctx->linear_centroid, "");
	}
}

2381 scan_shader_output_decl(struct radv_shader_context
*ctx
,
2382 struct nir_variable
*variable
,
2383 struct nir_shader
*shader
,
2384 gl_shader_stage stage
)
2386 int idx
= variable
->data
.location
+ variable
->data
.index
;
2387 unsigned attrib_count
= glsl_count_attribute_slots(variable
->type
, false);
2388 uint64_t mask_attribs
;
2390 variable
->data
.driver_location
= idx
* 4;
2392 /* tess ctrl has it's own load/store paths for outputs */
2393 if (stage
== MESA_SHADER_TESS_CTRL
)
2396 if (variable
->data
.compact
) {
2397 unsigned component_count
= variable
->data
.location_frac
+
2398 glsl_get_length(variable
->type
);
2399 attrib_count
= (component_count
+ 3) / 4;
2402 mask_attribs
= ((1ull << attrib_count
) - 1) << idx
;
2403 if (stage
== MESA_SHADER_VERTEX
||
2404 stage
== MESA_SHADER_TESS_EVAL
||
2405 stage
== MESA_SHADER_GEOMETRY
) {
2406 if (idx
== VARYING_SLOT_CLIP_DIST0
) {
2407 if (stage
== MESA_SHADER_VERTEX
) {
2408 ctx
->shader_info
->vs
.outinfo
.clip_dist_mask
= (1 << shader
->info
.clip_distance_array_size
) - 1;
2409 ctx
->shader_info
->vs
.outinfo
.cull_dist_mask
= (1 << shader
->info
.cull_distance_array_size
) - 1;
2410 ctx
->shader_info
->vs
.outinfo
.cull_dist_mask
<<= shader
->info
.clip_distance_array_size
;
2412 if (stage
== MESA_SHADER_TESS_EVAL
) {
2413 ctx
->shader_info
->tes
.outinfo
.clip_dist_mask
= (1 << shader
->info
.clip_distance_array_size
) - 1;
2414 ctx
->shader_info
->tes
.outinfo
.cull_dist_mask
= (1 << shader
->info
.cull_distance_array_size
) - 1;
2415 ctx
->shader_info
->tes
.outinfo
.cull_dist_mask
<<= shader
->info
.clip_distance_array_size
;
2417 if (stage
== MESA_SHADER_GEOMETRY
) {
2418 ctx
->shader_info
->vs
.outinfo
.clip_dist_mask
= (1 << shader
->info
.clip_distance_array_size
) - 1;
2419 ctx
->shader_info
->vs
.outinfo
.cull_dist_mask
= (1 << shader
->info
.cull_distance_array_size
) - 1;
2420 ctx
->shader_info
->vs
.outinfo
.cull_dist_mask
<<= shader
->info
.clip_distance_array_size
;
2425 ctx
->output_mask
|= mask_attribs
;
2429 /* Initialize arguments for the shader export intrinsic */
2431 si_llvm_init_export_args(struct radv_shader_context
*ctx
,
2432 LLVMValueRef
*values
,
2433 unsigned enabled_channels
,
2435 struct ac_export_args
*args
)
2437 /* Specify the channels that are enabled. */
2438 args
->enabled_channels
= enabled_channels
;
2440 /* Specify whether the EXEC mask represents the valid mask */
2441 args
->valid_mask
= 0;
2443 /* Specify whether this is the last export */
2446 /* Specify the target we are exporting */
2447 args
->target
= target
;
2449 args
->compr
= false;
2450 args
->out
[0] = LLVMGetUndef(ctx
->ac
.f32
);
2451 args
->out
[1] = LLVMGetUndef(ctx
->ac
.f32
);
2452 args
->out
[2] = LLVMGetUndef(ctx
->ac
.f32
);
2453 args
->out
[3] = LLVMGetUndef(ctx
->ac
.f32
);
2458 bool is_16bit
= ac_get_type_size(LLVMTypeOf(values
[0])) == 2;
2459 if (ctx
->stage
== MESA_SHADER_FRAGMENT
) {
2460 unsigned index
= target
- V_008DFC_SQ_EXP_MRT
;
2461 unsigned col_format
= (ctx
->options
->key
.fs
.col_format
>> (4 * index
)) & 0xf;
2462 bool is_int8
= (ctx
->options
->key
.fs
.is_int8
>> index
) & 1;
2463 bool is_int10
= (ctx
->options
->key
.fs
.is_int10
>> index
) & 1;
2466 LLVMValueRef (*packf
)(struct ac_llvm_context
*ctx
, LLVMValueRef args
[2]) = NULL
;
2467 LLVMValueRef (*packi
)(struct ac_llvm_context
*ctx
, LLVMValueRef args
[2],
2468 unsigned bits
, bool hi
) = NULL
;
2470 switch(col_format
) {
2471 case V_028714_SPI_SHADER_ZERO
:
2472 args
->enabled_channels
= 0; /* writemask */
2473 args
->target
= V_008DFC_SQ_EXP_NULL
;
2476 case V_028714_SPI_SHADER_32_R
:
2477 args
->enabled_channels
= 1;
2478 args
->out
[0] = values
[0];
2481 case V_028714_SPI_SHADER_32_GR
:
2482 args
->enabled_channels
= 0x3;
2483 args
->out
[0] = values
[0];
2484 args
->out
[1] = values
[1];
2487 case V_028714_SPI_SHADER_32_AR
:
2488 if (ctx
->ac
.chip_class
>= GFX10
) {
2489 args
->enabled_channels
= 0x3;
2490 args
->out
[0] = values
[0];
2491 args
->out
[1] = values
[3];
2493 args
->enabled_channels
= 0x9;
2494 args
->out
[0] = values
[0];
2495 args
->out
[3] = values
[3];
2499 case V_028714_SPI_SHADER_FP16_ABGR
:
2500 args
->enabled_channels
= 0x5;
2501 packf
= ac_build_cvt_pkrtz_f16
;
2503 for (unsigned chan
= 0; chan
< 4; chan
++)
2504 values
[chan
] = LLVMBuildFPExt(ctx
->ac
.builder
,
2510 case V_028714_SPI_SHADER_UNORM16_ABGR
:
2511 args
->enabled_channels
= 0x5;
2512 packf
= ac_build_cvt_pknorm_u16
;
2515 case V_028714_SPI_SHADER_SNORM16_ABGR
:
2516 args
->enabled_channels
= 0x5;
2517 packf
= ac_build_cvt_pknorm_i16
;
2520 case V_028714_SPI_SHADER_UINT16_ABGR
:
2521 args
->enabled_channels
= 0x5;
2522 packi
= ac_build_cvt_pk_u16
;
2524 for (unsigned chan
= 0; chan
< 4; chan
++)
2525 values
[chan
] = LLVMBuildZExt(ctx
->ac
.builder
,
2526 ac_to_integer(&ctx
->ac
, values
[chan
]),
2531 case V_028714_SPI_SHADER_SINT16_ABGR
:
2532 args
->enabled_channels
= 0x5;
2533 packi
= ac_build_cvt_pk_i16
;
2535 for (unsigned chan
= 0; chan
< 4; chan
++)
2536 values
[chan
] = LLVMBuildSExt(ctx
->ac
.builder
,
2537 ac_to_integer(&ctx
->ac
, values
[chan
]),
2543 case V_028714_SPI_SHADER_32_ABGR
:
2544 memcpy(&args
->out
[0], values
, sizeof(values
[0]) * 4);
2548 /* Pack f16 or norm_i16/u16. */
2550 for (chan
= 0; chan
< 2; chan
++) {
2551 LLVMValueRef pack_args
[2] = {
2553 values
[2 * chan
+ 1]
2555 LLVMValueRef packed
;
2557 packed
= packf(&ctx
->ac
, pack_args
);
2558 args
->out
[chan
] = ac_to_float(&ctx
->ac
, packed
);
2560 args
->compr
= 1; /* COMPR flag */
2565 for (chan
= 0; chan
< 2; chan
++) {
2566 LLVMValueRef pack_args
[2] = {
2567 ac_to_integer(&ctx
->ac
, values
[2 * chan
]),
2568 ac_to_integer(&ctx
->ac
, values
[2 * chan
+ 1])
2570 LLVMValueRef packed
;
2572 packed
= packi(&ctx
->ac
, pack_args
,
2573 is_int8
? 8 : is_int10
? 10 : 16,
2575 args
->out
[chan
] = ac_to_float(&ctx
->ac
, packed
);
2577 args
->compr
= 1; /* COMPR flag */
2583 for (unsigned chan
= 0; chan
< 4; chan
++) {
2584 values
[chan
] = LLVMBuildBitCast(ctx
->ac
.builder
, values
[chan
], ctx
->ac
.i16
, "");
2585 args
->out
[chan
] = LLVMBuildZExt(ctx
->ac
.builder
, values
[chan
], ctx
->ac
.i32
, "");
2588 memcpy(&args
->out
[0], values
, sizeof(values
[0]) * 4);
2590 for (unsigned i
= 0; i
< 4; ++i
)
2591 args
->out
[i
] = ac_to_float(&ctx
->ac
, args
->out
[i
]);
static void
radv_export_param(struct radv_shader_context *ctx, unsigned index,
		  LLVMValueRef *values, unsigned enabled_channels)
{
	struct ac_export_args args;

	si_llvm_init_export_args(ctx, values, enabled_channels,
				 V_008DFC_SQ_EXP_PARAM + index, &args);
	ac_build_export(&ctx->ac, &args);
}

static LLVMValueRef
radv_load_output(struct radv_shader_context *ctx, unsigned index, unsigned chan)
{
	LLVMValueRef output = ctx->abi.outputs[ac_llvm_reg_index_soa(index, chan)];
	return LLVMBuildLoad(ctx->ac.builder, output, "");
}

2613 radv_emit_stream_output(struct radv_shader_context
*ctx
,
2614 LLVMValueRef
const *so_buffers
,
2615 LLVMValueRef
const *so_write_offsets
,
2616 const struct radv_stream_output
*output
,
2617 struct radv_shader_output_values
*shader_out
)
2619 unsigned num_comps
= util_bitcount(output
->component_mask
);
2620 unsigned buf
= output
->buffer
;
2621 unsigned offset
= output
->offset
;
2623 LLVMValueRef out
[4];
2625 assert(num_comps
&& num_comps
<= 4);
2626 if (!num_comps
|| num_comps
> 4)
2629 /* Get the first component. */
2630 start
= ffs(output
->component_mask
) - 1;
2632 /* Load the output as int. */
2633 for (int i
= 0; i
< num_comps
; i
++) {
2634 out
[i
] = ac_to_integer(&ctx
->ac
, shader_out
->values
[start
+ i
]);
2637 /* Pack the output. */
2638 LLVMValueRef vdata
= NULL
;
2640 switch (num_comps
) {
2641 case 1: /* as i32 */
2644 case 2: /* as v2i32 */
2645 case 3: /* as v4i32 (aligned to 4) */
2646 out
[3] = LLVMGetUndef(ctx
->ac
.i32
);
2648 case 4: /* as v4i32 */
2649 vdata
= ac_build_gather_values(&ctx
->ac
, out
,
2650 !ac_has_vec3_support(ctx
->ac
.chip_class
, false) ?
2651 util_next_power_of_two(num_comps
) :
2656 ac_build_buffer_store_dword(&ctx
->ac
, so_buffers
[buf
],
2657 vdata
, num_comps
, so_write_offsets
[buf
],
2658 ctx
->ac
.i32_0
, offset
,
2659 ac_glc
| ac_slc
, false);
2663 radv_emit_streamout(struct radv_shader_context
*ctx
, unsigned stream
)
2665 struct ac_build_if_state if_ctx
;
2668 /* Get bits [22:16], i.e. (so_param >> 16) & 127; */
2669 assert(ctx
->streamout_config
);
2670 LLVMValueRef so_vtx_count
=
2671 ac_build_bfe(&ctx
->ac
, ctx
->streamout_config
,
2672 LLVMConstInt(ctx
->ac
.i32
, 16, false),
2673 LLVMConstInt(ctx
->ac
.i32
, 7, false), false);
2675 LLVMValueRef tid
= ac_get_thread_id(&ctx
->ac
);
2677 /* can_emit = tid < so_vtx_count; */
2678 LLVMValueRef can_emit
= LLVMBuildICmp(ctx
->ac
.builder
, LLVMIntULT
,
2679 tid
, so_vtx_count
, "");
2681 /* Emit the streamout code conditionally. This actually avoids
2682 * out-of-bounds buffer access. The hw tells us via the SGPR
2683 * (so_vtx_count) which threads are allowed to emit streamout data.
2685 ac_nir_build_if(&if_ctx
, ctx
, can_emit
);
2687 /* The buffer offset is computed as follows:
2688 * ByteOffset = streamout_offset[buffer_id]*4 +
2689 * (streamout_write_index + thread_id)*stride[buffer_id] +
2692 LLVMValueRef so_write_index
= ctx
->streamout_write_idx
;
2694 /* Compute (streamout_write_index + thread_id). */
2696 LLVMBuildAdd(ctx
->ac
.builder
, so_write_index
, tid
, "");
2698 /* Load the descriptor and compute the write offset for each
2701 LLVMValueRef so_write_offset
[4] = {};
2702 LLVMValueRef so_buffers
[4] = {};
2703 LLVMValueRef buf_ptr
= ctx
->streamout_buffers
;
2705 for (i
= 0; i
< 4; i
++) {
2706 uint16_t stride
= ctx
->shader_info
->info
.so
.strides
[i
];
2711 LLVMValueRef offset
=
2712 LLVMConstInt(ctx
->ac
.i32
, i
, false);
2714 so_buffers
[i
] = ac_build_load_to_sgpr(&ctx
->ac
,
2717 LLVMValueRef so_offset
= ctx
->streamout_offset
[i
];
2719 so_offset
= LLVMBuildMul(ctx
->ac
.builder
, so_offset
,
2720 LLVMConstInt(ctx
->ac
.i32
, 4, false), "");
2722 so_write_offset
[i
] =
2723 ac_build_imad(&ctx
->ac
, so_write_index
,
2724 LLVMConstInt(ctx
->ac
.i32
,
2729 /* Write streamout data. */
2730 for (i
= 0; i
< ctx
->shader_info
->info
.so
.num_outputs
; i
++) {
2731 struct radv_shader_output_values shader_out
= {};
2732 struct radv_stream_output
*output
=
2733 &ctx
->shader_info
->info
.so
.outputs
[i
];
2735 if (stream
!= output
->stream
)
2738 for (int j
= 0; j
< 4; j
++) {
2739 shader_out
.values
[j
] =
2740 radv_load_output(ctx
, output
->location
, j
);
2743 radv_emit_stream_output(ctx
, so_buffers
,so_write_offset
,
2744 output
, &shader_out
);
2747 ac_nir_build_endif(&if_ctx
);
static void
radv_build_param_exports(struct radv_shader_context *ctx,
			 struct radv_shader_output_values *outputs,
			 unsigned noutput,
			 struct radv_vs_output_info *outinfo,
			 bool export_clip_dists)
{
	unsigned param_count = 0;

	for (unsigned i = 0; i < noutput; i++) {
		unsigned slot_name = outputs[i].slot_name;
		unsigned usage_mask = outputs[i].usage_mask;

		if (slot_name != VARYING_SLOT_LAYER &&
		    slot_name != VARYING_SLOT_PRIMITIVE_ID &&
		    slot_name != VARYING_SLOT_CLIP_DIST0 &&
		    slot_name != VARYING_SLOT_CLIP_DIST1 &&
		    slot_name < VARYING_SLOT_VAR0)
			continue;

		if ((slot_name == VARYING_SLOT_CLIP_DIST0 ||
		     slot_name == VARYING_SLOT_CLIP_DIST1) && !export_clip_dists)
			continue;

		radv_export_param(ctx, param_count, outputs[i].values, usage_mask);

		assert(i < ARRAY_SIZE(outinfo->vs_output_param_offset));
		outinfo->vs_output_param_offset[slot_name] = param_count++;
	}

	outinfo->param_exports = param_count;
}

2783 /* Generate export instructions for hardware VS shader stage or NGG GS stage
2784 * (position and parameter data only).
2787 radv_llvm_export_vs(struct radv_shader_context
*ctx
,
2788 struct radv_shader_output_values
*outputs
,
2790 struct radv_vs_output_info
*outinfo
,
2791 bool export_clip_dists
)
2793 LLVMValueRef psize_value
= NULL
, layer_value
= NULL
, viewport_value
= NULL
;
2794 struct ac_export_args pos_args
[4] = {};
2795 unsigned pos_idx
, index
;
2798 /* Build position exports */
2799 for (i
= 0; i
< noutput
; i
++) {
2800 switch (outputs
[i
].slot_name
) {
2801 case VARYING_SLOT_POS
:
2802 si_llvm_init_export_args(ctx
, outputs
[i
].values
, 0xf,
2803 V_008DFC_SQ_EXP_POS
, &pos_args
[0]);
2805 case VARYING_SLOT_PSIZ
:
2806 psize_value
= outputs
[i
].values
[0];
2808 case VARYING_SLOT_LAYER
:
2809 layer_value
= outputs
[i
].values
[0];
2811 case VARYING_SLOT_VIEWPORT
:
2812 viewport_value
= outputs
[i
].values
[0];
2814 case VARYING_SLOT_CLIP_DIST0
:
2815 case VARYING_SLOT_CLIP_DIST1
:
2816 index
= 2 + outputs
[i
].slot_index
;
2817 si_llvm_init_export_args(ctx
, outputs
[i
].values
, 0xf,
2818 V_008DFC_SQ_EXP_POS
+ index
,
2826 /* We need to add the position output manually if it's missing. */
2827 if (!pos_args
[0].out
[0]) {
2828 pos_args
[0].enabled_channels
= 0xf; /* writemask */
2829 pos_args
[0].valid_mask
= 0; /* EXEC mask */
2830 pos_args
[0].done
= 0; /* last export? */
2831 pos_args
[0].target
= V_008DFC_SQ_EXP_POS
;
2832 pos_args
[0].compr
= 0; /* COMPR flag */
2833 pos_args
[0].out
[0] = ctx
->ac
.f32_0
; /* X */
2834 pos_args
[0].out
[1] = ctx
->ac
.f32_0
; /* Y */
2835 pos_args
[0].out
[2] = ctx
->ac
.f32_0
; /* Z */
2836 pos_args
[0].out
[3] = ctx
->ac
.f32_1
; /* W */
2839 if (outinfo
->writes_pointsize
||
2840 outinfo
->writes_layer
||
2841 outinfo
->writes_viewport_index
) {
2842 pos_args
[1].enabled_channels
= ((outinfo
->writes_pointsize
== true ? 1 : 0) |
2843 (outinfo
->writes_layer
== true ? 4 : 0));
2844 pos_args
[1].valid_mask
= 0;
2845 pos_args
[1].done
= 0;
2846 pos_args
[1].target
= V_008DFC_SQ_EXP_POS
+ 1;
2847 pos_args
[1].compr
= 0;
2848 pos_args
[1].out
[0] = ctx
->ac
.f32_0
; /* X */
2849 pos_args
[1].out
[1] = ctx
->ac
.f32_0
; /* Y */
2850 pos_args
[1].out
[2] = ctx
->ac
.f32_0
; /* Z */
2851 pos_args
[1].out
[3] = ctx
->ac
.f32_0
; /* W */
2853 if (outinfo
->writes_pointsize
== true)
2854 pos_args
[1].out
[0] = psize_value
;
2855 if (outinfo
->writes_layer
== true)
2856 pos_args
[1].out
[2] = layer_value
;
2857 if (outinfo
->writes_viewport_index
== true) {
2858 if (ctx
->options
->chip_class
>= GFX9
) {
2859 /* GFX9 has the layer in out.z[10:0] and the viewport
2860 * index in out.z[19:16].
2862 LLVMValueRef v
= viewport_value
;
2863 v
= ac_to_integer(&ctx
->ac
, v
);
2864 v
= LLVMBuildShl(ctx
->ac
.builder
, v
,
2865 LLVMConstInt(ctx
->ac
.i32
, 16, false),
2867 v
= LLVMBuildOr(ctx
->ac
.builder
, v
,
2868 ac_to_integer(&ctx
->ac
, pos_args
[1].out
[2]), "");
2870 pos_args
[1].out
[2] = ac_to_float(&ctx
->ac
, v
);
2871 pos_args
[1].enabled_channels
|= 1 << 2;
2873 pos_args
[1].out
[3] = viewport_value
;
2874 pos_args
[1].enabled_channels
|= 1 << 3;
2879 for (i
= 0; i
< 4; i
++) {
2880 if (pos_args
[i
].out
[0])
2881 outinfo
->pos_exports
++;
2884 /* Navi10-14 skip POS0 exports if EXEC=0 and DONE=0, causing a hang.
2885 * Setting valid_mask=1 prevents it and has no other effect.
2887 if (ctx
->ac
.family
== CHIP_NAVI10
||
2888 ctx
->ac
.family
== CHIP_NAVI12
||
2889 ctx
->ac
.family
== CHIP_NAVI14
)
2890 pos_args
[0].valid_mask
= 1;
2893 for (i
= 0; i
< 4; i
++) {
2894 if (!pos_args
[i
].out
[0])
2897 /* Specify the target we are exporting */
2898 pos_args
[i
].target
= V_008DFC_SQ_EXP_POS
+ pos_idx
++;
2900 if (pos_idx
== outinfo
->pos_exports
)
2901 /* Specify that this is the last export */
2902 pos_args
[i
].done
= 1;
2904 ac_build_export(&ctx
->ac
, &pos_args
[i
]);
2907 /* Build parameter exports */
2908 radv_build_param_exports(ctx
, outputs
, noutput
, outinfo
, export_clip_dists
);
2912 handle_vs_outputs_post(struct radv_shader_context
*ctx
,
2913 bool export_prim_id
,
2914 bool export_clip_dists
,
2915 struct radv_vs_output_info
*outinfo
)
2917 struct radv_shader_output_values
*outputs
;
2918 unsigned noutput
= 0;
2920 if (ctx
->options
->key
.has_multiview_view_index
) {
2921 LLVMValueRef
* tmp_out
= &ctx
->abi
.outputs
[ac_llvm_reg_index_soa(VARYING_SLOT_LAYER
, 0)];
2923 for(unsigned i
= 0; i
< 4; ++i
)
2924 ctx
->abi
.outputs
[ac_llvm_reg_index_soa(VARYING_SLOT_LAYER
, i
)] =
2925 ac_build_alloca_undef(&ctx
->ac
, ctx
->ac
.f32
, "");
2928 LLVMBuildStore(ctx
->ac
.builder
, ac_to_float(&ctx
->ac
, ctx
->abi
.view_index
), *tmp_out
);
2929 ctx
->output_mask
|= 1ull << VARYING_SLOT_LAYER
;
2932 memset(outinfo
->vs_output_param_offset
, AC_EXP_PARAM_UNDEFINED
,
2933 sizeof(outinfo
->vs_output_param_offset
));
2934 outinfo
->pos_exports
= 0;
2936 if (ctx
->output_mask
& (1ull << VARYING_SLOT_PSIZ
)) {
2937 outinfo
->writes_pointsize
= true;
2940 if (ctx
->output_mask
& (1ull << VARYING_SLOT_LAYER
)) {
2941 outinfo
->writes_layer
= true;
2944 if (ctx
->output_mask
& (1ull << VARYING_SLOT_VIEWPORT
)) {
2945 outinfo
->writes_viewport_index
= true;
2948 if (ctx
->shader_info
->info
.so
.num_outputs
&&
2949 !ctx
->is_gs_copy_shader
) {
2950 /* The GS copy shader emission already emits streamout. */
2951 radv_emit_streamout(ctx
, 0);
2954 /* Allocate a temporary array for the output values. */
2955 unsigned num_outputs
= util_bitcount64(ctx
->output_mask
) + export_prim_id
;
2956 outputs
= malloc(num_outputs
* sizeof(outputs
[0]));
2958 for (unsigned i
= 0; i
< AC_LLVM_MAX_OUTPUTS
; ++i
) {
2959 if (!(ctx
->output_mask
& (1ull << i
)))
2962 outputs
[noutput
].slot_name
= i
;
2963 outputs
[noutput
].slot_index
= i
== VARYING_SLOT_CLIP_DIST1
;
2965 if (ctx
->stage
== MESA_SHADER_VERTEX
&&
2966 !ctx
->is_gs_copy_shader
) {
2967 outputs
[noutput
].usage_mask
=
2968 ctx
->shader_info
->info
.vs
.output_usage_mask
[i
];
2969 } else if (ctx
->stage
== MESA_SHADER_TESS_EVAL
) {
2970 outputs
[noutput
].usage_mask
=
2971 ctx
->shader_info
->info
.tes
.output_usage_mask
[i
];
2973 assert(ctx
->is_gs_copy_shader
);
2974 outputs
[noutput
].usage_mask
=
2975 ctx
->shader_info
->info
.gs
.output_usage_mask
[i
];
2978 for (unsigned j
= 0; j
< 4; j
++) {
2979 outputs
[noutput
].values
[j
] =
2980 ac_to_float(&ctx
->ac
, radv_load_output(ctx
, i
, j
));
2986 /* Export PrimitiveID. */
2987 if (export_prim_id
) {
2988 outinfo
->export_prim_id
= true;
2990 outputs
[noutput
].slot_name
= VARYING_SLOT_PRIMITIVE_ID
;
2991 outputs
[noutput
].slot_index
= 0;
2992 outputs
[noutput
].usage_mask
= 0x1;
2993 outputs
[noutput
].values
[0] = ctx
->vs_prim_id
;
2994 for (unsigned j
= 1; j
< 4; j
++)
2995 outputs
[noutput
].values
[j
] = ctx
->ac
.f32_0
;
2999 radv_llvm_export_vs(ctx
, outputs
, noutput
, outinfo
, export_clip_dists
);
3005 handle_es_outputs_post(struct radv_shader_context
*ctx
,
3006 struct radv_es_output_info
*outinfo
)
3009 uint64_t max_output_written
= 0;
3010 LLVMValueRef lds_base
= NULL
;
3012 for (unsigned i
= 0; i
< AC_LLVM_MAX_OUTPUTS
; ++i
) {
3015 if (!(ctx
->output_mask
& (1ull << i
)))
3018 param_index
= shader_io_get_unique_index(i
);
3020 max_output_written
= MAX2(param_index
, max_output_written
);
3023 outinfo
->esgs_itemsize
= (max_output_written
+ 1) * 16;
3025 if (ctx
->ac
.chip_class
>= GFX9
) {
3026 unsigned itemsize_dw
= outinfo
->esgs_itemsize
/ 4;
3027 LLVMValueRef vertex_idx
= ac_get_thread_id(&ctx
->ac
);
3028 LLVMValueRef wave_idx
= ac_unpack_param(&ctx
->ac
, ctx
->merged_wave_info
, 24, 4);
3029 vertex_idx
= LLVMBuildOr(ctx
->ac
.builder
, vertex_idx
,
3030 LLVMBuildMul(ctx
->ac
.builder
, wave_idx
,
3031 LLVMConstInt(ctx
->ac
.i32
, 64, false), ""), "");
3032 lds_base
= LLVMBuildMul(ctx
->ac
.builder
, vertex_idx
,
3033 LLVMConstInt(ctx
->ac
.i32
, itemsize_dw
, 0), "");
3036 for (unsigned i
= 0; i
< AC_LLVM_MAX_OUTPUTS
; ++i
) {
3037 LLVMValueRef dw_addr
= NULL
;
3038 LLVMValueRef
*out_ptr
= &ctx
->abi
.outputs
[i
* 4];
3039 unsigned output_usage_mask
;
3042 if (!(ctx
->output_mask
& (1ull << i
)))
3045 if (ctx
->stage
== MESA_SHADER_VERTEX
) {
3047 ctx
->shader_info
->info
.vs
.output_usage_mask
[i
];
3049 assert(ctx
->stage
== MESA_SHADER_TESS_EVAL
);
3051 ctx
->shader_info
->info
.tes
.output_usage_mask
[i
];
3054 param_index
= shader_io_get_unique_index(i
);
3057 dw_addr
= LLVMBuildAdd(ctx
->ac
.builder
, lds_base
,
3058 LLVMConstInt(ctx
->ac
.i32
, param_index
* 4, false),
3062 for (j
= 0; j
< 4; j
++) {
3063 if (!(output_usage_mask
& (1 << j
)))
3066 LLVMValueRef out_val
= LLVMBuildLoad(ctx
->ac
.builder
, out_ptr
[j
], "");
3067 out_val
= ac_to_integer(&ctx
->ac
, out_val
);
3068 out_val
= LLVMBuildZExtOrBitCast(ctx
->ac
.builder
, out_val
, ctx
->ac
.i32
, "");
3070 if (ctx
->ac
.chip_class
>= GFX9
) {
3071 LLVMValueRef dw_addr_offset
=
3072 LLVMBuildAdd(ctx
->ac
.builder
, dw_addr
,
3073 LLVMConstInt(ctx
->ac
.i32
,
3076 ac_lds_store(&ctx
->ac
, dw_addr_offset
, out_val
);
3078 ac_build_buffer_store_dword(&ctx
->ac
,
3081 NULL
, ctx
->es2gs_offset
,
3082 (4 * param_index
+ j
) * 4,
3083 ac_glc
| ac_slc
, true);
static void
handle_ls_outputs_post(struct radv_shader_context *ctx)
{
	LLVMValueRef vertex_id = ctx->rel_auto_id;
	uint32_t num_tcs_inputs = util_last_bit64(ctx->shader_info->info.vs.ls_outputs_written);
	LLVMValueRef vertex_dw_stride = LLVMConstInt(ctx->ac.i32, num_tcs_inputs * 4, false);
	LLVMValueRef base_dw_addr = LLVMBuildMul(ctx->ac.builder, vertex_id,
						 vertex_dw_stride, "");

	for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
		LLVMValueRef *out_ptr = &ctx->abi.outputs[i * 4];

		if (!(ctx->output_mask & (1ull << i)))
			continue;

		int param = shader_io_get_unique_index(i);
		LLVMValueRef dw_addr = LLVMBuildAdd(ctx->ac.builder, base_dw_addr,
						    LLVMConstInt(ctx->ac.i32, param * 4, false),
						    "");
		for (unsigned j = 0; j < 4; j++) {
			LLVMValueRef value = LLVMBuildLoad(ctx->ac.builder, out_ptr[j], "");
			value = ac_to_integer(&ctx->ac, value);
			value = LLVMBuildZExtOrBitCast(ctx->ac.builder, value, ctx->ac.i32, "");
			ac_lds_store(&ctx->ac, dw_addr, value);
			dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr, ctx->ac.i32_1, "");
		}
	}
}

static LLVMValueRef get_wave_id_in_tg(struct radv_shader_context *ctx)
{
	return ac_unpack_param(&ctx->ac, ctx->merged_wave_info, 24, 4);
}

static LLVMValueRef get_tgsize(struct radv_shader_context *ctx)
{
	return ac_unpack_param(&ctx->ac, ctx->merged_wave_info, 28, 4);
}

static LLVMValueRef get_thread_id_in_tg(struct radv_shader_context *ctx)
{
	LLVMBuilderRef builder = ctx->ac.builder;
	LLVMValueRef tmp;
	tmp = LLVMBuildMul(builder, get_wave_id_in_tg(ctx),
			   LLVMConstInt(ctx->ac.i32, 64, false), "");
	return LLVMBuildAdd(builder, tmp, ac_get_thread_id(&ctx->ac), "");
}

static LLVMValueRef ngg_get_vtx_cnt(struct radv_shader_context *ctx)
{
	return ac_build_bfe(&ctx->ac, ctx->gs_tg_info,
			    LLVMConstInt(ctx->ac.i32, 12, false),
			    LLVMConstInt(ctx->ac.i32, 9, false),
			    false);
}

static LLVMValueRef ngg_get_prim_cnt(struct radv_shader_context *ctx)
{
	return ac_build_bfe(&ctx->ac, ctx->gs_tg_info,
			    LLVMConstInt(ctx->ac.i32, 22, false),
			    LLVMConstInt(ctx->ac.i32, 9, false),
			    false);
}

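/* Illustrative sketch, not part of the driver: CPU-side equivalent of the two
 * BFEs above, assuming the same bit positions inside gs_tg_info. The helper
 * names are hypothetical and the block is not compiled.
 */
#if 0
static uint32_t example_tg_vertex_count(uint32_t gs_tg_info)
{
	return (gs_tg_info >> 12) & 0x1ff; /* 9 bits starting at bit 12 */
}

static uint32_t example_tg_primitive_count(uint32_t gs_tg_info)
{
	return (gs_tg_info >> 22) & 0x1ff; /* 9 bits starting at bit 22 */
}
#endif
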
static LLVMValueRef
ngg_gs_get_vertex_storage(struct radv_shader_context *ctx)
{
	unsigned num_outputs = util_bitcount64(ctx->output_mask);

	LLVMTypeRef elements[2] = {
		LLVMArrayType(ctx->ac.i32, 4 * num_outputs),
		LLVMArrayType(ctx->ac.i8, 4),
	};
	LLVMTypeRef type = LLVMStructTypeInContext(ctx->ac.context, elements, 2, false);
	type = LLVMPointerType(LLVMArrayType(type, 0), AC_ADDR_SPACE_LDS);
	return LLVMBuildBitCast(ctx->ac.builder, ctx->gs_ngg_emit, type, "");
}

/**
 * Return a pointer to the LDS storage reserved for the N'th vertex, where N
 * is in emit order; that is:
 * - during the epilogue, N is the threadidx (relative to the entire threadgroup)
 * - during vertex emit, i.e. while the API GS shader invocation is running,
 *   N = threadidx * gs_max_out_vertices + emitidx
 *
 * Goals of the LDS memory layout:
 * 1. Eliminate bank conflicts on write for geometry shaders that have all emits
 *    in uniform control flow
 * 2. Eliminate bank conflicts on read for export if, additionally, there is no
 *    culling
 * 3. Agnostic to the number of waves (since we don't know it before compiling)
 * 4. Allow coalescing of LDS instructions (ds_write_b128 etc.)
 * 5. Avoid wasting memory.
 *
 * We use an AoS layout due to point 4 (this also helps point 3). In an AoS
 * layout, elimination of bank conflicts requires that each vertex occupy an
 * odd number of dwords. We use the additional dword to store the output stream
 * index as well as a flag to indicate whether this vertex ends a primitive
 * for rasterization.
 *
 * Swizzling is required to satisfy points 1 and 2 simultaneously.
 *
 * Vertices are stored in export order (gsthread * gs_max_out_vertices + emitidx).
 * Indices are swizzled in groups of 32, which ensures point 1 without
 * disturbing point 2.
 *
 * \return an LDS pointer to type {[N x i32], [4 x i8]}
 */
static LLVMValueRef
ngg_gs_vertex_ptr(struct radv_shader_context *ctx, LLVMValueRef vertexidx)
{
	LLVMBuilderRef builder = ctx->ac.builder;
	LLVMValueRef storage = ngg_gs_get_vertex_storage(ctx);

	/* gs_max_out_vertices = 2^(write_stride_2exp) * some odd number */
	unsigned write_stride_2exp = ffs(ctx->gs_max_out_vertices) - 1;
	if (write_stride_2exp) {
		LLVMValueRef row =
			LLVMBuildLShr(builder, vertexidx,
				      LLVMConstInt(ctx->ac.i32, 5, false), "");
		LLVMValueRef swizzle =
			LLVMBuildAnd(builder, row,
				     LLVMConstInt(ctx->ac.i32, (1u << write_stride_2exp) - 1,
						  false), "");
		vertexidx = LLVMBuildXor(builder, vertexidx, swizzle, "");
	}

	return ac_build_gep0(&ctx->ac, storage, vertexidx);
}

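/* Illustrative sketch, not part of the driver: the vertex index swizzle
 * performed by ngg_gs_vertex_ptr, evaluated on the CPU. The helper is
 * hypothetical and not compiled; it XORs the index with the low
 * write_stride_2exp bits of its 32-vertex row, which is what the
 * LShr/And/Xor sequence above computes.
 */
#if 0
static uint32_t example_swizzle_vertex_index(uint32_t vertexidx,
					     unsigned write_stride_2exp)
{
	uint32_t row = vertexidx >> 5; /* group of 32 vertices */
	uint32_t swizzle = row & ((1u << write_stride_2exp) - 1);
	return vertexidx ^ swizzle;
}
#endif
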
static LLVMValueRef
ngg_gs_emit_vertex_ptr(struct radv_shader_context *ctx, LLVMValueRef gsthread,
		       LLVMValueRef emitidx)
{
	LLVMBuilderRef builder = ctx->ac.builder;
	LLVMValueRef tmp;

	tmp = LLVMConstInt(ctx->ac.i32, ctx->gs_max_out_vertices, false);
	tmp = LLVMBuildMul(builder, tmp, gsthread, "");
	const LLVMValueRef vertexidx = LLVMBuildAdd(builder, tmp, emitidx, "");
	return ngg_gs_vertex_ptr(ctx, vertexidx);
}

/* Send GS Alloc Req message from the first wave of the group to SPI.
 * Message payload is:
 * - bits 0..10: vertices in group
 * - bits 12..22: primitives in group
 */
static void build_sendmsg_gs_alloc_req(struct radv_shader_context *ctx,
				       LLVMValueRef vtx_cnt,
				       LLVMValueRef prim_cnt)
{
	LLVMBuilderRef builder = ctx->ac.builder;
	LLVMValueRef tmp;

	tmp = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->ac.i32_0, "");
	ac_build_ifcc(&ctx->ac, tmp, 5020);

	tmp = LLVMBuildShl(builder, prim_cnt, LLVMConstInt(ctx->ac.i32, 12, false),"");
	tmp = LLVMBuildOr(builder, tmp, vtx_cnt, "");
	ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_ALLOC_REQ, tmp);

	ac_build_endif(&ctx->ac, 5020);
}

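/* Illustrative sketch, not part of the driver: packing the GS_ALLOC_REQ
 * payload on the CPU with the same shift/or as above (vertex count in the
 * low bits, primitive count starting at bit 12). The helper name is
 * hypothetical and the block is not compiled.
 */
#if 0
static uint32_t example_gs_alloc_req_payload(uint32_t vtx_cnt, uint32_t prim_cnt)
{
	return vtx_cnt | (prim_cnt << 12);
}
#endif
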
struct ngg_prim {
	unsigned num_vertices;
	LLVMValueRef isnull;
	LLVMValueRef index[3];
	LLVMValueRef edgeflag[3];
};

static void build_export_prim(struct radv_shader_context *ctx,
			      const struct ngg_prim *prim)
{
	LLVMBuilderRef builder = ctx->ac.builder;
	struct ac_export_args args;
	LLVMValueRef tmp;

	tmp = LLVMBuildZExt(builder, prim->isnull, ctx->ac.i32, "");
	args.out[0] = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->ac.i32, 31, false), "");

	for (unsigned i = 0; i < prim->num_vertices; ++i) {
		tmp = LLVMBuildShl(builder, prim->index[i],
				   LLVMConstInt(ctx->ac.i32, 10 * i, false), "");
		args.out[0] = LLVMBuildOr(builder, args.out[0], tmp, "");
		tmp = LLVMBuildZExt(builder, prim->edgeflag[i], ctx->ac.i32, "");
		tmp = LLVMBuildShl(builder, tmp,
				   LLVMConstInt(ctx->ac.i32, 10 * i + 9, false), "");
		args.out[0] = LLVMBuildOr(builder, args.out[0], tmp, "");
	}

	args.out[0] = LLVMBuildBitCast(builder, args.out[0], ctx->ac.f32, "");
	args.out[1] = LLVMGetUndef(ctx->ac.f32);
	args.out[2] = LLVMGetUndef(ctx->ac.f32);
	args.out[3] = LLVMGetUndef(ctx->ac.f32);

	args.target = V_008DFC_SQ_EXP_PRIM;
	args.enabled_channels = 1;
	args.done = true;
	args.valid_mask = false;
	args.compr = false;

	ac_build_export(&ctx->ac, &args);
}

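/* Illustrative sketch, not part of the driver: packing one primitive export
 * dword on the CPU with the same shifts as build_export_prim. The helper is
 * hypothetical and not compiled; a triangle with vertex indices 0, 1, 2 and
 * no edge flags packs to 0x00200400.
 */
#if 0
static uint32_t example_prim_export_dword(const uint32_t index[3],
					  const bool edgeflag[3], bool isnull)
{
	uint32_t dw = (uint32_t)isnull << 31;
	for (unsigned i = 0; i < 3; i++) {
		dw |= index[i] << (10 * i);
		dw |= (uint32_t)edgeflag[i] << (10 * i + 9);
	}
	return dw;
}
#endif
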
3296 handle_ngg_outputs_post(struct radv_shader_context
*ctx
)
3298 LLVMBuilderRef builder
= ctx
->ac
.builder
;
3299 struct ac_build_if_state if_state
;
3300 unsigned num_vertices
= 3;
3303 assert((ctx
->stage
== MESA_SHADER_VERTEX
||
3304 ctx
->stage
== MESA_SHADER_TESS_EVAL
) && !ctx
->is_gs_copy_shader
);
3306 LLVMValueRef prims_in_wave
= ac_unpack_param(&ctx
->ac
, ctx
->merged_wave_info
, 8, 8);
3307 LLVMValueRef vtx_in_wave
= ac_unpack_param(&ctx
->ac
, ctx
->merged_wave_info
, 0, 8);
3308 LLVMValueRef is_gs_thread
= LLVMBuildICmp(builder
, LLVMIntULT
,
3309 ac_get_thread_id(&ctx
->ac
), prims_in_wave
, "");
3310 LLVMValueRef is_es_thread
= LLVMBuildICmp(builder
, LLVMIntULT
,
3311 ac_get_thread_id(&ctx
->ac
), vtx_in_wave
, "");
3312 LLVMValueRef vtxindex
[] = {
3313 ac_unpack_param(&ctx
->ac
, ctx
->gs_vtx_offset
[0], 0, 16),
3314 ac_unpack_param(&ctx
->ac
, ctx
->gs_vtx_offset
[0], 16, 16),
3315 ac_unpack_param(&ctx
->ac
, ctx
->gs_vtx_offset
[2], 0, 16),
3318 /* TODO: streamout */
3320 /* Copy Primitive IDs from GS threads to the LDS address corresponding
3321 * to the ES thread of the provoking vertex.
3323 if (ctx
->stage
== MESA_SHADER_VERTEX
&&
3324 ctx
->options
->key
.vs_common_out
.export_prim_id
) {
3325 /* TODO: streamout */
3327 ac_build_ifcc(&ctx
->ac
, is_gs_thread
, 5400);
3328 /* Extract the PROVOKING_VTX_INDEX field. */
3329 LLVMValueRef provoking_vtx_in_prim
=
3330 LLVMConstInt(ctx
->ac
.i32
, 0, false);
3332 /* provoking_vtx_index = vtxindex[provoking_vtx_in_prim]; */
3333 LLVMValueRef indices
= ac_build_gather_values(&ctx
->ac
, vtxindex
, 3);
3334 LLVMValueRef provoking_vtx_index
=
3335 LLVMBuildExtractElement(builder
, indices
, provoking_vtx_in_prim
, "");
3337 LLVMBuildStore(builder
, ctx
->abi
.gs_prim_id
,
3338 ac_build_gep0(&ctx
->ac
, ctx
->esgs_ring
, provoking_vtx_index
));
3339 ac_build_endif(&ctx
->ac
, 5400);
3342 /* TODO: primitive culling */
3344 build_sendmsg_gs_alloc_req(ctx
, ngg_get_vtx_cnt(ctx
), ngg_get_prim_cnt(ctx
));
3346 /* TODO: streamout queries */
3347 /* Export primitive data to the index buffer. Format is:
3348 * - bits 0..8: index 0
3349 * - bit 9: edge flag 0
3350 * - bits 10..18: index 1
3351 * - bit 19: edge flag 1
3352 * - bits 20..28: index 2
3353 * - bit 29: edge flag 2
3354 * - bit 31: null primitive (skip)
3356 * For the first version, we will always build up all three indices
3357 * independent of the primitive type. The additional garbage data
3360 * TODO: culling depends on the primitive type, so can have some
3363 ac_nir_build_if(&if_state
, ctx
, is_gs_thread
);
3365 struct ngg_prim prim
= {};
3367 prim
.num_vertices
= num_vertices
;
3368 prim
.isnull
= ctx
->ac
.i1false
;
3369 memcpy(prim
.index
, vtxindex
, sizeof(vtxindex
[0]) * 3);
3371 for (unsigned i
= 0; i
< num_vertices
; ++i
) {
3372 tmp
= LLVMBuildLShr(builder
, ctx
->abi
.gs_invocation_id
,
3373 LLVMConstInt(ctx
->ac
.i32
, 8 + i
, false), "");
3374 prim
.edgeflag
[i
] = LLVMBuildTrunc(builder
, tmp
, ctx
->ac
.i1
, "");
3377 build_export_prim(ctx
, &prim
);
3379 ac_nir_build_endif(&if_state
);
3381 /* Export per-vertex data (positions and parameters). */
3382 ac_nir_build_if(&if_state
, ctx
, is_es_thread
);
3384 struct radv_vs_output_info
*outinfo
=
3385 ctx
->stage
== MESA_SHADER_TESS_EVAL
? &ctx
->shader_info
->tes
.outinfo
: &ctx
->shader_info
->vs
.outinfo
;
3387 /* Exporting the primitive ID is handled below. */
3388 /* TODO: use the new VS export path */
3389 handle_vs_outputs_post(ctx
, false,
3390 ctx
->options
->key
.vs_common_out
.export_clip_dists
,
3393 if (ctx
->options
->key
.vs_common_out
.export_prim_id
) {
3394 unsigned param_count
= outinfo
->param_exports
;
3395 LLVMValueRef values
[4];
3397 if (ctx
->stage
== MESA_SHADER_VERTEX
) {
3398 /* Wait for GS stores to finish. */
3399 ac_build_s_barrier(&ctx
->ac
);
3401 tmp
= ac_build_gep0(&ctx
->ac
, ctx
->esgs_ring
,
3402 get_thread_id_in_tg(ctx
));
3403 values
[0] = LLVMBuildLoad(builder
, tmp
, "");
3405 assert(ctx
->stage
== MESA_SHADER_TESS_EVAL
);
3406 values
[0] = ctx
->abi
.tes_patch_id
;
3409 values
[0] = ac_to_float(&ctx
->ac
, values
[0]);
3410 for (unsigned j
= 1; j
< 4; j
++)
3411 values
[j
] = ctx
->ac
.f32_0
;
3413 radv_export_param(ctx
, param_count
, values
, 0x1);
3415 outinfo
->vs_output_param_offset
[VARYING_SLOT_PRIMITIVE_ID
] = param_count
++;
3416 outinfo
->export_prim_id
= true;
3417 outinfo
->param_exports
= param_count
;
3420 ac_nir_build_endif(&if_state
);
static void gfx10_ngg_gs_emit_prologue(struct radv_shader_context *ctx)
{
	/* Zero out the part of LDS scratch that is used to accumulate the
	 * per-stream generated primitive count.
	 */
	LLVMBuilderRef builder = ctx->ac.builder;
	LLVMValueRef scratchptr = ctx->gs_ngg_scratch;
	LLVMValueRef tid = get_thread_id_in_tg(ctx);
	LLVMBasicBlockRef merge_block;
	LLVMValueRef cond;

	LLVMValueRef fn = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx->ac.builder));
	LLVMBasicBlockRef then_block = LLVMAppendBasicBlockInContext(ctx->ac.context, fn, "");
	merge_block = LLVMAppendBasicBlockInContext(ctx->ac.context, fn, "");

	cond = LLVMBuildICmp(builder, LLVMIntULT, tid, LLVMConstInt(ctx->ac.i32, 4, false), "");
	LLVMBuildCondBr(ctx->ac.builder, cond, then_block, merge_block);
	LLVMPositionBuilderAtEnd(ctx->ac.builder, then_block);

	LLVMValueRef ptr = ac_build_gep0(&ctx->ac, scratchptr, tid);
	LLVMBuildStore(builder, ctx->ac.i32_0, ptr);

	LLVMBuildBr(ctx->ac.builder, merge_block);
	LLVMPositionBuilderAtEnd(ctx->ac.builder, merge_block);

	ac_build_s_barrier(&ctx->ac);
}

static void gfx10_ngg_gs_emit_epilogue_1(struct radv_shader_context *ctx)
{
	LLVMBuilderRef builder = ctx->ac.builder;
	LLVMValueRef i8_0 = LLVMConstInt(ctx->ac.i8, 0, false);
	LLVMValueRef tmp;

	/* Zero out remaining (non-emitted) primitive flags.
	 *
	 * Note: Alternatively, we could pass the relevant gs_next_vertex to
	 *       the emit threads via LDS. This is likely worse in the expected
	 *       typical case where each GS thread emits the full set of
	 *       vertices.
	 */
	for (unsigned stream = 0; stream < 4; ++stream) {
		unsigned num_components;

		num_components =
			ctx->shader_info->info.gs.num_stream_output_components[stream];
		if (!num_components)
			continue;

		const LLVMValueRef gsthread = get_thread_id_in_tg(ctx);

		ac_build_bgnloop(&ctx->ac, 5100);

		const LLVMValueRef vertexidx =
			LLVMBuildLoad(builder, ctx->gs_next_vertex[stream], "");
		tmp = LLVMBuildICmp(builder, LLVMIntUGE, vertexidx,
			LLVMConstInt(ctx->ac.i32, ctx->gs_max_out_vertices, false), "");
		ac_build_ifcc(&ctx->ac, tmp, 5101);
		ac_build_break(&ctx->ac);
		ac_build_endif(&ctx->ac, 5101);

		tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, "");
		LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]);

		tmp = ngg_gs_emit_vertex_ptr(ctx, gsthread, vertexidx);
		LLVMValueRef gep_idx[3] = {
			ctx->ac.i32_0, /* implied C-style array */
			ctx->ac.i32_1, /* second entry of struct */
			LLVMConstInt(ctx->ac.i32, stream, false),
		};
		tmp = LLVMBuildGEP(builder, tmp, gep_idx, 3, "");
		LLVMBuildStore(builder, i8_0, tmp);

		ac_build_endloop(&ctx->ac, 5100);
	}
}

3500 static void gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context
*ctx
)
3502 const unsigned verts_per_prim
= si_conv_gl_prim_to_vertices(ctx
->gs_output_prim
);
3503 LLVMBuilderRef builder
= ctx
->ac
.builder
;
3504 LLVMValueRef tmp
, tmp2
;
3506 ac_build_s_barrier(&ctx
->ac
);
3508 const LLVMValueRef tid
= get_thread_id_in_tg(ctx
);
3509 LLVMValueRef num_emit_threads
= ngg_get_prim_cnt(ctx
);
3511 /* TODO: streamout */
3515 /* Determine vertex liveness. */
3516 LLVMValueRef vertliveptr
= ac_build_alloca(&ctx
->ac
, ctx
->ac
.i1
, "vertexlive");
3518 tmp
= LLVMBuildICmp(builder
, LLVMIntULT
, tid
, num_emit_threads
, "");
3519 ac_build_ifcc(&ctx
->ac
, tmp
, 5120);
3521 for (unsigned i
= 0; i
< verts_per_prim
; ++i
) {
3522 const LLVMValueRef primidx
=
3523 LLVMBuildAdd(builder
, tid
,
3524 LLVMConstInt(ctx
->ac
.i32
, i
, false), "");
3527 tmp
= LLVMBuildICmp(builder
, LLVMIntULT
, primidx
, num_emit_threads
, "");
3528 ac_build_ifcc(&ctx
->ac
, tmp
, 5121 + i
);
3531 /* Load primitive liveness */
3532 tmp
= ngg_gs_vertex_ptr(ctx
, primidx
);
3533 LLVMValueRef gep_idx
[3] = {
3534 ctx
->ac
.i32_0
, /* implicit C-style array */
3535 ctx
->ac
.i32_1
, /* second value of struct */
3536 ctx
->ac
.i32_0
, /* stream 0 */
3538 tmp
= LLVMBuildGEP(builder
, tmp
, gep_idx
, 3, "");
3539 tmp
= LLVMBuildLoad(builder
, tmp
, "");
3540 const LLVMValueRef primlive
=
3541 LLVMBuildTrunc(builder
, tmp
, ctx
->ac
.i1
, "");
3543 tmp
= LLVMBuildLoad(builder
, vertliveptr
, "");
3544 tmp
= LLVMBuildOr(builder
, tmp
, primlive
, ""),
3545 LLVMBuildStore(builder
, tmp
, vertliveptr
);
3548 ac_build_endif(&ctx
->ac
, 5121 + i
);
3551 ac_build_endif(&ctx
->ac
, 5120);
3553 /* Inclusive scan addition across the current wave. */
3554 LLVMValueRef vertlive
= LLVMBuildLoad(builder
, vertliveptr
, "");
3555 struct ac_wg_scan vertlive_scan
= {};
3556 vertlive_scan
.op
= nir_op_iadd
;
3557 vertlive_scan
.enable_reduce
= true;
3558 vertlive_scan
.enable_exclusive
= true;
3559 vertlive_scan
.src
= vertlive
;
3560 vertlive_scan
.scratch
= ac_build_gep0(&ctx
->ac
, ctx
->gs_ngg_scratch
, ctx
->ac
.i32_0
);
3561 vertlive_scan
.waveidx
= get_wave_id_in_tg(ctx
);
3562 vertlive_scan
.numwaves
= get_tgsize(ctx
);
3563 vertlive_scan
.maxwaves
= 8;
3565 ac_build_wg_scan(&ctx
->ac
, &vertlive_scan
);
3567 /* Skip all exports (including index exports) when possible. At least on
3568 * early gfx10 revisions this is also to avoid hangs.
3570 LLVMValueRef have_exports
=
3571 LLVMBuildICmp(builder
, LLVMIntNE
, vertlive_scan
.result_reduce
, ctx
->ac
.i32_0
, "");
3573 LLVMBuildSelect(builder
, have_exports
, num_emit_threads
, ctx
->ac
.i32_0
, "");
3575 /* Allocate export space. Send this message as early as possible, to
3576 * hide the latency of the SQ <-> SPI roundtrip.
3578 * Note: We could consider compacting primitives for export as well.
3579 * PA processes 1 non-null prim / clock, but it fetches 4 DW of
3580 * prim data per clock and skips null primitives at no additional
3581 * cost. So compacting primitives can only be beneficial when
3582 * there are 4 or more contiguous null primitives in the export
3583 * (in the common case of single-dword prim exports).
3585 build_sendmsg_gs_alloc_req(ctx
, vertlive_scan
.result_reduce
, num_emit_threads
);
3587 /* Setup the reverse vertex compaction permutation. We re-use stream 1
3588 * of the primitive liveness flags, relying on the fact that each
3589 * threadgroup can have at most 256 threads. */
3590 ac_build_ifcc(&ctx
->ac
, vertlive
, 5130);
3592 tmp
= ngg_gs_vertex_ptr(ctx
, vertlive_scan
.result_exclusive
);
3593 LLVMValueRef gep_idx
[3] = {
3594 ctx
->ac
.i32_0
, /* implicit C-style array */
3595 ctx
->ac
.i32_1
, /* second value of struct */
3596 ctx
->ac
.i32_1
, /* stream 1 */
3598 tmp
= LLVMBuildGEP(builder
, tmp
, gep_idx
, 3, "");
3599 tmp2
= LLVMBuildTrunc(builder
, tid
, ctx
->ac
.i8
, "");
3600 LLVMBuildStore(builder
, tmp2
, tmp
);
3602 ac_build_endif(&ctx
->ac
, 5130);
3604 ac_build_s_barrier(&ctx
->ac
);
	/* Export primitive data */
	tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, "");
	ac_build_ifcc(&ctx->ac, tmp, 5140);
	{
		struct ngg_prim prim = {};
		prim.num_vertices = verts_per_prim;

		tmp = ngg_gs_vertex_ptr(ctx, tid);
		LLVMValueRef gep_idx[3] = {
			ctx->ac.i32_0, /* implicit C-style array */
			ctx->ac.i32_1, /* second value of struct */
			ctx->ac.i32_0, /* primflag */
		};
		tmp = LLVMBuildGEP(builder, tmp, gep_idx, 3, "");
		tmp = LLVMBuildLoad(builder, tmp, "");
		prim.isnull = LLVMBuildICmp(builder, LLVMIntEQ, tmp,
					    LLVMConstInt(ctx->ac.i8, 0, false), "");

		for (unsigned i = 0; i < verts_per_prim; ++i) {
			prim.index[i] = LLVMBuildSub(builder, vertlive_scan.result_exclusive,
						     LLVMConstInt(ctx->ac.i32, verts_per_prim - i - 1, false), "");
			prim.edgeflag[i] = ctx->ac.i1false;
		}

		build_export_prim(ctx, &prim);
	}
	ac_build_endif(&ctx->ac, 5140);
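	/* Each surviving vertex thread now looks up which source vertex it represents
	 * (the original thread id stored in the stream-1 slot during compaction) and
	 * exports that vertex's position and parameters. */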
	/* Export position and parameter data */
	tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, vertlive_scan.result_reduce, "");
	ac_build_ifcc(&ctx->ac, tmp, 5145);
	{
		struct radv_vs_output_info *outinfo = &ctx->shader_info->vs.outinfo;
		bool export_view_index = ctx->options->key.has_multiview_view_index;
		struct radv_shader_output_values *outputs;
		unsigned noutput = 0;

		/* Allocate a temporary array for the output values. */
		unsigned num_outputs = util_bitcount64(ctx->output_mask) + export_view_index;
		outputs = calloc(num_outputs, sizeof(outputs[0]));

		memset(outinfo->vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
		       sizeof(outinfo->vs_output_param_offset));
		outinfo->pos_exports = 0;

		tmp = ngg_gs_vertex_ptr(ctx, tid);
		LLVMValueRef gep_idx[3] = {
			ctx->ac.i32_0, /* implicit C-style array */
			ctx->ac.i32_1, /* second value of struct */
			ctx->ac.i32_1, /* stream 1: source data index */
		};
		tmp = LLVMBuildGEP(builder, tmp, gep_idx, 3, "");
		tmp = LLVMBuildLoad(builder, tmp, "");
		tmp = LLVMBuildZExt(builder, tmp, ctx->ac.i32, "");
		const LLVMValueRef vertexptr = ngg_gs_vertex_ptr(ctx, tmp);

		if (ctx->output_mask & (1ull << VARYING_SLOT_PSIZ)) {
			outinfo->writes_pointsize = true;
		}

		if (ctx->output_mask & (1ull << VARYING_SLOT_LAYER)) {
			outinfo->writes_layer = true;
		}

		if (ctx->output_mask & (1ull << VARYING_SLOT_VIEWPORT)) {
			outinfo->writes_viewport_index = true;
		}

		unsigned out_idx = 0;
		gep_idx[1] = ctx->ac.i32_0;
		for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
			if (!(ctx->output_mask & (1ull << i)))
				continue;

			outputs[noutput].slot_name = i;
			outputs[noutput].slot_index = i == VARYING_SLOT_CLIP_DIST1;

			outputs[noutput].usage_mask =
				ctx->shader_info->info.gs.output_usage_mask[i];
			int length = util_last_bit(outputs[noutput].usage_mask);

			for (unsigned j = 0; j < length; j++, out_idx++) {
				gep_idx[2] = LLVMConstInt(ctx->ac.i32, out_idx, false);
				tmp = LLVMBuildGEP(builder, vertexptr, gep_idx, 3, "");
				tmp = LLVMBuildLoad(builder, tmp, "");

				LLVMTypeRef type = LLVMGetAllocatedType(ctx->abi.outputs[ac_llvm_reg_index_soa(i, j)]);
				if (ac_get_type_size(type) == 2) {
					tmp = ac_to_integer(&ctx->ac, tmp);
					tmp = LLVMBuildTrunc(ctx->ac.builder, tmp, ctx->ac.i16, "");
				}

				outputs[noutput].values[j] = ac_to_float(&ctx->ac, tmp);
			}

			for (unsigned j = length; j < 4; j++)
				outputs[noutput].values[j] = LLVMGetUndef(ctx->ac.f32);

			noutput++;
		}

		/* Export ViewIndex. */
		if (export_view_index) {
			outinfo->writes_layer = true;

			outputs[noutput].slot_name = VARYING_SLOT_LAYER;
			outputs[noutput].slot_index = 0;
			outputs[noutput].usage_mask = 0x1;
			outputs[noutput].values[0] = ac_to_float(&ctx->ac, ctx->abi.view_index);
			for (unsigned j = 1; j < 4; j++)
				outputs[noutput].values[j] = ctx->ac.f32_0;

			noutput++;
		}

		radv_llvm_export_vs(ctx, outputs, noutput, outinfo,
				    ctx->options->key.vs_common_out.export_clip_dists);

		free(outputs);
	}
	ac_build_endif(&ctx->ac, 5145);
}
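/* NGG GS vertex emission: each EmitVertex() writes the vertex's outputs into the
 * emitting thread's LDS vertex slot and records, per stream, whether the vertex
 * completed a primitive; the epilogue above later turns those flags into exports. */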
static void gfx10_ngg_gs_emit_vertex(struct radv_shader_context *ctx,
				     unsigned stream,
				     LLVMValueRef *addrs)
{
	LLVMBuilderRef builder = ctx->ac.builder;
	LLVMValueRef tmp;

	const LLVMValueRef vertexidx =
		LLVMBuildLoad(builder, ctx->gs_next_vertex[stream], "");

	/* If this thread has already emitted the declared maximum number of
	 * vertices, skip the write: excessive vertex emissions are not
	 * supposed to have any effect.
	 */
	const LLVMValueRef can_emit =
		LLVMBuildICmp(builder, LLVMIntULT, vertexidx,
			      LLVMConstInt(ctx->ac.i32, ctx->gs_max_out_vertices, false), "");
	ac_build_kill_if_false(&ctx->ac, can_emit);

	tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, "");
	tmp = LLVMBuildSelect(builder, can_emit, tmp, vertexidx, "");
	LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]);

	const LLVMValueRef vertexptr =
		ngg_gs_emit_vertex_ptr(ctx, get_thread_id_in_tg(ctx), vertexidx);
	unsigned out_idx = 0;
	for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
		unsigned output_usage_mask =
			ctx->shader_info->info.gs.output_usage_mask[i];
		uint8_t output_stream =
			ctx->shader_info->info.gs.output_streams[i];
		LLVMValueRef *out_ptr = &addrs[i * 4];
		int length = util_last_bit(output_usage_mask);

		if (!(ctx->output_mask & (1ull << i)) ||
		    output_stream != stream)
			continue;

		for (unsigned j = 0; j < length; j++, out_idx++) {
			if (!(output_usage_mask & (1 << j)))
				continue;

			LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder,
							     out_ptr[j], "");
			LLVMValueRef gep_idx[3] = {
				ctx->ac.i32_0, /* implied C-style array */
				ctx->ac.i32_0, /* first entry of struct */
				LLVMConstInt(ctx->ac.i32, out_idx, false),
			};
			LLVMValueRef ptr = LLVMBuildGEP(builder, vertexptr, gep_idx, 3, "");

			out_val = ac_to_integer(&ctx->ac, out_val);
			out_val = LLVMBuildZExtOrBitCast(ctx->ac.builder, out_val, ctx->ac.i32, "");

			LLVMBuildStore(builder, out_val, ptr);
		}
	}
	assert(out_idx * 4 <= ctx->gsvs_vertex_size);

	/* Determine and store whether this vertex completed a primitive. */
	const LLVMValueRef curverts = LLVMBuildLoad(builder, ctx->gs_curprim_verts[stream], "");

	tmp = LLVMConstInt(ctx->ac.i32, si_conv_gl_prim_to_vertices(ctx->gs_output_prim) - 1, false);
	const LLVMValueRef iscompleteprim =
		LLVMBuildICmp(builder, LLVMIntUGE, curverts, tmp, "");

	tmp = LLVMBuildAdd(builder, curverts, ctx->ac.i32_1, "");
	LLVMBuildStore(builder, tmp, ctx->gs_curprim_verts[stream]);

	LLVMValueRef gep_idx[3] = {
		ctx->ac.i32_0, /* implied C-style array */
		ctx->ac.i32_1, /* second struct entry */
		LLVMConstInt(ctx->ac.i32, stream, false),
	};
	const LLVMValueRef primflagptr =
		LLVMBuildGEP(builder, vertexptr, gep_idx, 3, "");

	tmp = LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i8, "");
	LLVMBuildStore(builder, tmp, primflagptr);

	tmp = LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], "");
	tmp = LLVMBuildAdd(builder, tmp, LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i32, ""), "");
	LLVMBuildStore(builder, tmp, ctx->gs_generated_prims[stream]);
}
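/* TCS epilogue: invocation 0 of each patch reads the tessellation levels back from
 * LDS and writes them to the tess-factor ring, and additionally to the off-chip
 * ring when the TES actually reads the factors. */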
static void
write_tess_factors(struct radv_shader_context *ctx)
{
	unsigned stride, outer_comps, inner_comps;
	struct ac_build_if_state if_ctx, inner_if_ctx;
	LLVMValueRef invocation_id = ac_unpack_param(&ctx->ac, ctx->abi.tcs_rel_ids, 8, 5);
	LLVMValueRef rel_patch_id = ac_unpack_param(&ctx->ac, ctx->abi.tcs_rel_ids, 0, 8);
	unsigned tess_inner_index = 0, tess_outer_index;
	LLVMValueRef lds_base, lds_inner = NULL, lds_outer, byteoffset, buffer;
	LLVMValueRef out[6], vec0, vec1, tf_base, inner[4], outer[4];
	int i;

	ac_emit_barrier(&ctx->ac, ctx->stage);

	switch (ctx->options->key.tcs.primitive_mode) {
	case GL_ISOLINES:
		stride = 2;
		outer_comps = 2;
		inner_comps = 0;
		break;
	case GL_TRIANGLES:
		stride = 4;
		outer_comps = 3;
		inner_comps = 1;
		break;
	case GL_QUADS:
		stride = 6;
		outer_comps = 4;
		inner_comps = 2;
		break;
	default:
		return;
	}

	ac_nir_build_if(&if_ctx, ctx,
			LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ,
				      invocation_id, ctx->ac.i32_0, ""));

	lds_base = get_tcs_out_current_patch_data_offset(ctx);

	if (inner_comps) {
		tess_inner_index = shader_io_get_unique_index(VARYING_SLOT_TESS_LEVEL_INNER);
		lds_inner = LLVMBuildAdd(ctx->ac.builder, lds_base,
					 LLVMConstInt(ctx->ac.i32, tess_inner_index * 4, false), "");
	}

	tess_outer_index = shader_io_get_unique_index(VARYING_SLOT_TESS_LEVEL_OUTER);
	lds_outer = LLVMBuildAdd(ctx->ac.builder, lds_base,
				 LLVMConstInt(ctx->ac.i32, tess_outer_index * 4, false), "");

	for (i = 0; i < 4; i++) {
		inner[i] = LLVMGetUndef(ctx->ac.i32);
		outer[i] = LLVMGetUndef(ctx->ac.i32);
	}

	if (ctx->options->key.tcs.primitive_mode == GL_ISOLINES) {
		/* Isolines: the two outer factors are stored in swapped order. */
		outer[0] = out[1] = ac_lds_load(&ctx->ac, lds_outer);
		lds_outer = LLVMBuildAdd(ctx->ac.builder, lds_outer,
					 ctx->ac.i32_1, "");
		outer[1] = out[0] = ac_lds_load(&ctx->ac, lds_outer);
	} else {
		for (i = 0; i < outer_comps; i++) {
			outer[i] = out[i] =
				ac_lds_load(&ctx->ac, lds_outer);
			lds_outer = LLVMBuildAdd(ctx->ac.builder, lds_outer,
						 ctx->ac.i32_1, "");
		}
		for (i = 0; i < inner_comps; i++) {
			inner[i] = out[outer_comps + i] =
				ac_lds_load(&ctx->ac, lds_inner);
			lds_inner = LLVMBuildAdd(ctx->ac.builder, lds_inner,
						 ctx->ac.i32_1, "");
		}
	}

	/* Convert the outputs to vectors for stores. */
	vec0 = ac_build_gather_values(&ctx->ac, out, MIN2(stride, 4));
	vec1 = NULL;
	if (stride > 4)
		vec1 = ac_build_gather_values(&ctx->ac, out + 4, stride - 4);

	buffer = ctx->hs_ring_tess_factor;
	tf_base = ctx->tess_factor_offset;
	byteoffset = LLVMBuildMul(ctx->ac.builder, rel_patch_id,
				  LLVMConstInt(ctx->ac.i32, 4 * stride, false), "");
	unsigned tf_offset = 0;

	if (ctx->options->chip_class <= GFX8) {
		ac_nir_build_if(&inner_if_ctx, ctx,
				LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ,
					      rel_patch_id, ctx->ac.i32_0, ""));

		/* Store the dynamic HS control word. */
		ac_build_buffer_store_dword(&ctx->ac, buffer,
					    LLVMConstInt(ctx->ac.i32, 0x80000000, false),
					    1, ctx->ac.i32_0, tf_base,
					    0, ac_glc, false);
		tf_offset += 4;

		ac_nir_build_endif(&inner_if_ctx);
	}

	/* Store the tessellation factors. */
	ac_build_buffer_store_dword(&ctx->ac, buffer, vec0,
				    MIN2(stride, 4), byteoffset, tf_base,
				    tf_offset, ac_glc, false);
	if (vec1)
		ac_build_buffer_store_dword(&ctx->ac, buffer, vec1,
					    stride - 4, byteoffset, tf_base,
					    16 + tf_offset, ac_glc, false);

	/* Store to offchip for TES to read - only if TES reads them. */
	if (ctx->options->key.tcs.tes_reads_tess_factors) {
		LLVMValueRef inner_vec, outer_vec, tf_outer_offset;
		LLVMValueRef tf_inner_offset;
		unsigned param_outer, param_inner;

		param_outer = shader_io_get_unique_index(VARYING_SLOT_TESS_LEVEL_OUTER);
		tf_outer_offset = get_tcs_tes_buffer_address(ctx, NULL,
							     LLVMConstInt(ctx->ac.i32, param_outer, 0));

		outer_vec = ac_build_gather_values(&ctx->ac, outer,
						   util_next_power_of_two(outer_comps));

		ac_build_buffer_store_dword(&ctx->ac, ctx->hs_ring_tess_offchip, outer_vec,
					    outer_comps, tf_outer_offset,
					    ctx->oc_lds, 0, ac_glc, false);

		param_inner = shader_io_get_unique_index(VARYING_SLOT_TESS_LEVEL_INNER);
		tf_inner_offset = get_tcs_tes_buffer_address(ctx, NULL,
							     LLVMConstInt(ctx->ac.i32, param_inner, 0));

		inner_vec = inner_comps == 1 ? inner[0] :
			ac_build_gather_values(&ctx->ac, inner, inner_comps);
		ac_build_buffer_store_dword(&ctx->ac, ctx->hs_ring_tess_offchip, inner_vec,
					    inner_comps, tf_inner_offset,
					    ctx->oc_lds, 0, ac_glc, false);
	}

	ac_nir_build_endif(&if_ctx);
}

static void
handle_tcs_outputs_post(struct radv_shader_context *ctx)
{
	write_tess_factors(ctx);
}
static bool
si_export_mrt_color(struct radv_shader_context *ctx,
		    LLVMValueRef *color, unsigned index,
		    struct ac_export_args *args)
{
	si_llvm_init_export_args(ctx, color, 0xf,
				 V_008DFC_SQ_EXP_MRT + index, args);
	if (!args->enabled_channels)
		return false; /* unnecessary NULL export */

	return true;
}

static void
radv_export_mrt_z(struct radv_shader_context *ctx,
		  LLVMValueRef depth, LLVMValueRef stencil,
		  LLVMValueRef samplemask)
{
	struct ac_export_args args;

	ac_export_mrt_z(&ctx->ac, depth, stencil, samplemask, &args);

	ac_build_export(&ctx->ac, &args);
}
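/* Fragment shader epilogue: collect the written color outputs into MRT exports,
 * then handle depth, stencil and sample-mask writes. */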
static void
handle_fs_outputs_post(struct radv_shader_context *ctx)
{
	unsigned index = 0;
	LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
	struct ac_export_args color_args[8];

	for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
		LLVMValueRef values[4];

		if (!(ctx->output_mask & (1ull << i)))
			continue;

		if (i < FRAG_RESULT_DATA0)
			continue;

		for (unsigned j = 0; j < 4; j++)
			values[j] = ac_to_float(&ctx->ac,
						radv_load_output(ctx, i, j));

		bool ret = si_export_mrt_color(ctx, values,
					       i - FRAG_RESULT_DATA0,
					       &color_args[index]);
		if (ret)
			index++;
	}

	/* Process depth, stencil, samplemask. */
	if (ctx->shader_info->info.ps.writes_z) {
		depth = ac_to_float(&ctx->ac,
				    radv_load_output(ctx, FRAG_RESULT_DEPTH, 0));
	}
	if (ctx->shader_info->info.ps.writes_stencil) {
		stencil = ac_to_float(&ctx->ac,
				      radv_load_output(ctx, FRAG_RESULT_STENCIL, 0));
	}
	if (ctx->shader_info->info.ps.writes_sample_mask) {
		samplemask = ac_to_float(&ctx->ac,
					 radv_load_output(ctx, FRAG_RESULT_SAMPLE_MASK, 0));
	}

	/* Set the DONE bit on last non-null color export only if Z isn't
	 * exported.
	 */
	if (index > 0 &&
	    !ctx->shader_info->info.ps.writes_z &&
	    !ctx->shader_info->info.ps.writes_stencil &&
	    !ctx->shader_info->info.ps.writes_sample_mask) {
		unsigned last = index - 1;

		color_args[last].valid_mask = 1; /* whether the EXEC mask is valid */
		color_args[last].done = 1; /* DONE bit */
	}

	/* Export PS outputs. */
	for (unsigned i = 0; i < index; i++)
		ac_build_export(&ctx->ac, &color_args[i]);

	if (depth || stencil || samplemask)
		radv_export_mrt_z(ctx, depth, stencil, samplemask);
	else if (!index)
		ac_build_export_null(&ctx->ac);
}
static void
emit_gs_epilogue(struct radv_shader_context *ctx)
{
	if (ctx->options->key.vs_common_out.as_ngg) {
		gfx10_ngg_gs_emit_epilogue_1(ctx);
		return;
	}

	if (ctx->ac.chip_class >= GFX10)
		LLVMBuildFence(ctx->ac.builder, LLVMAtomicOrderingRelease, false, "");

	ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE, ctx->gs_wave_id);
}
static void
handle_shader_outputs_post(struct ac_shader_abi *abi, unsigned max_outputs,
			   LLVMValueRef *addrs)
{
	struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);

	switch (ctx->stage) {
	case MESA_SHADER_VERTEX:
		if (ctx->options->key.vs_common_out.as_ls)
			handle_ls_outputs_post(ctx);
		else if (ctx->options->key.vs_common_out.as_es)
			handle_es_outputs_post(ctx, &ctx->shader_info->vs.es_info);
		else if (ctx->options->key.vs_common_out.as_ngg)
			break; /* handled outside of the shader body */
		else
			handle_vs_outputs_post(ctx, ctx->options->key.vs_common_out.export_prim_id,
					       ctx->options->key.vs_common_out.export_clip_dists,
					       &ctx->shader_info->vs.outinfo);
		break;
	case MESA_SHADER_FRAGMENT:
		handle_fs_outputs_post(ctx);
		break;
	case MESA_SHADER_GEOMETRY:
		emit_gs_epilogue(ctx);
		break;
	case MESA_SHADER_TESS_CTRL:
		handle_tcs_outputs_post(ctx);
		break;
	case MESA_SHADER_TESS_EVAL:
		if (ctx->options->key.vs_common_out.as_es)
			handle_es_outputs_post(ctx, &ctx->shader_info->tes.es_info);
		else if (ctx->options->key.vs_common_out.as_ngg)
			break; /* handled outside of the shader body */
		else
			handle_vs_outputs_post(ctx, ctx->options->key.vs_common_out.export_prim_id,
					       ctx->options->key.vs_common_out.export_clip_dists,
					       &ctx->shader_info->tes.outinfo);
		break;
	default:
		break;
	}
}
static void ac_llvm_finalize_module(struct radv_shader_context *ctx,
				    LLVMPassManagerRef passmgr,
				    const struct radv_nir_compiler_options *options)
{
	LLVMRunPassManager(passmgr, ctx->ac.module);
	LLVMDisposeBuilder(ctx->ac.builder);

	ac_llvm_context_dispose(&ctx->ac);
}
static void
ac_nir_eliminate_const_vs_outputs(struct radv_shader_context *ctx)
{
	struct radv_vs_output_info *outinfo;

	switch (ctx->stage) {
	case MESA_SHADER_FRAGMENT:
	case MESA_SHADER_COMPUTE:
	case MESA_SHADER_TESS_CTRL:
	case MESA_SHADER_GEOMETRY:
		return;
	case MESA_SHADER_VERTEX:
		if (ctx->options->key.vs_common_out.as_ls ||
		    ctx->options->key.vs_common_out.as_es)
			return;
		outinfo = &ctx->shader_info->vs.outinfo;
		break;
	case MESA_SHADER_TESS_EVAL:
		if (ctx->options->key.vs_common_out.as_es)
			return;
		outinfo = &ctx->shader_info->tes.outinfo;
		break;
	default:
		unreachable("Unhandled shader type");
	}

	ac_optimize_vs_outputs(&ctx->ac,
			       ctx->main_function,
			       outinfo->vs_output_param_offset,
			       VARYING_SLOT_MAX,
			       &outinfo->param_exports);
}
static void
ac_setup_rings(struct radv_shader_context *ctx)
{
	if (ctx->options->chip_class <= GFX8 &&
	    (ctx->stage == MESA_SHADER_GEOMETRY ||
	     ctx->options->key.vs_common_out.as_es || ctx->options->key.vs_common_out.as_es)) {
		unsigned ring = ctx->stage == MESA_SHADER_GEOMETRY ? RING_ESGS_GS
								   : RING_ESGS_VS;
		LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, ring, false);

		ctx->esgs_ring = ac_build_load_to_sgpr(&ctx->ac,
						       ctx->ring_offsets,
						       offset);
	}

	if (ctx->is_gs_copy_shader) {
		ctx->gsvs_ring[0] =
			ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets,
					      LLVMConstInt(ctx->ac.i32,
							   RING_GSVS_VS, false));
	}

	if (ctx->stage == MESA_SHADER_GEOMETRY) {
		/* The conceptual layout of the GSVS ring is
		 *   v0c0 .. vLv0 v0c1 .. vLc1 ..
		 * but the real memory layout is swizzled across
		 * threads:
		 *   t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
		 * Override the buffer descriptor accordingly.
		 */
		LLVMTypeRef v2i64 = LLVMVectorType(ctx->ac.i64, 2);
		uint64_t stream_offset = 0;
		unsigned num_records = 64;
		LLVMValueRef base_ring;

		base_ring =
			ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets,
					      LLVMConstInt(ctx->ac.i32,
							   RING_GSVS_GS, false));

		for (unsigned stream = 0; stream < 4; stream++) {
			unsigned num_components, stride;
			LLVMValueRef ring, tmp;

			num_components =
				ctx->shader_info->info.gs.num_stream_output_components[stream];

			if (!num_components)
				continue;

			stride = 4 * num_components * ctx->gs_max_out_vertices;

			/* Limit on the stride field for <= GFX7. */
			assert(stride < (1 << 14));

			ring = LLVMBuildBitCast(ctx->ac.builder,
						base_ring, v2i64, "");
			tmp = LLVMBuildExtractElement(ctx->ac.builder,
						      ring, ctx->ac.i32_0, "");
			tmp = LLVMBuildAdd(ctx->ac.builder, tmp,
					   LLVMConstInt(ctx->ac.i64,
							stream_offset, 0), "");
			ring = LLVMBuildInsertElement(ctx->ac.builder,
						      ring, tmp, ctx->ac.i32_0, "");

			stream_offset += stride * 64;

			ring = LLVMBuildBitCast(ctx->ac.builder, ring,
						ctx->ac.v4i32, "");

			tmp = LLVMBuildExtractElement(ctx->ac.builder, ring,
						      ctx->ac.i32_1, "");
			tmp = LLVMBuildOr(ctx->ac.builder, tmp,
					  LLVMConstInt(ctx->ac.i32,
						       S_008F04_STRIDE(stride), false), "");
			ring = LLVMBuildInsertElement(ctx->ac.builder, ring, tmp,
						      ctx->ac.i32_1, "");

			ring = LLVMBuildInsertElement(ctx->ac.builder, ring,
						      LLVMConstInt(ctx->ac.i32,
								   num_records, false),
						      LLVMConstInt(ctx->ac.i32, 2, false), "");

			ctx->gsvs_ring[stream] = ring;
		}
	}

	if (ctx->stage == MESA_SHADER_TESS_CTRL ||
	    ctx->stage == MESA_SHADER_TESS_EVAL) {
		ctx->hs_ring_tess_offchip = ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->ac.i32, RING_HS_TESS_OFFCHIP, false));
		ctx->hs_ring_tess_factor = ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->ac.i32, RING_HS_TESS_FACTOR, false));
	}
}
unsigned
radv_nir_get_max_workgroup_size(enum chip_class chip_class,
				const struct nir_shader *nir)
{
	switch (nir->info.stage) {
	case MESA_SHADER_TESS_CTRL:
		return chip_class >= GFX7 ? 128 : 64;
	case MESA_SHADER_GEOMETRY:
		return chip_class >= GFX9 ? 128 : 64;
	case MESA_SHADER_COMPUTE:
		break;
	default:
		return 0;
	}

	unsigned max_workgroup_size = nir->info.cs.local_size[0] *
		nir->info.cs.local_size[1] *
		nir->info.cs.local_size[2];
	return max_workgroup_size;
}
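/* Example: a compute shader with local_size = 8x8x1 yields a maximum workgroup
 * size of 64 here, while TCS returns 128 on GFX7+ and GS returns 128 on GFX9+
 * (64 on older chips). */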
/* Fixup the HW not emitting the TCS regs if there are no HS threads. */
static void ac_nir_fixup_ls_hs_input_vgprs(struct radv_shader_context *ctx)
{
	LLVMValueRef count = ac_unpack_param(&ctx->ac, ctx->merged_wave_info, 8, 8);
	LLVMValueRef hs_empty = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, count,
					      ctx->ac.i32_0, "");
	ctx->abi.instance_id = LLVMBuildSelect(ctx->ac.builder, hs_empty, ctx->rel_auto_id, ctx->abi.instance_id, "");
	ctx->rel_auto_id = LLVMBuildSelect(ctx->ac.builder, hs_empty, ctx->abi.tcs_rel_ids, ctx->rel_auto_id, "");
	ctx->abi.vertex_id = LLVMBuildSelect(ctx->ac.builder, hs_empty, ctx->abi.tcs_patch_id, ctx->abi.vertex_id, "");
}
static void prepare_gs_input_vgprs(struct radv_shader_context *ctx)
{
	for(int i = 5; i >= 0; --i) {
		ctx->gs_vtx_offset[i] = ac_unpack_param(&ctx->ac, ctx->gs_vtx_offset[i & ~1],
							(i & 1) * 16, 16);
	}

	ctx->gs_wave_id = ac_unpack_param(&ctx->ac, ctx->merged_wave_info, 16, 8);
}
/* Ensure that the esgs ring is declared.
 *
 * We declare it with 64KB alignment as a hint that the
 * pointer value will always be 0.
 */
static void declare_esgs_ring(struct radv_shader_context *ctx)
{
	if (ctx->esgs_ring)
		return;

	assert(!LLVMGetNamedGlobal(ctx->ac.module, "esgs_ring"));

	ctx->esgs_ring = LLVMAddGlobalInAddressSpace(
		ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0),
		"esgs_ring", AC_ADDR_SPACE_LDS);
	LLVMSetLinkage(ctx->esgs_ring, LLVMExternalLinkage);
	LLVMSetAlignment(ctx->esgs_ring, 64 * 1024);
}
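/* Main translation entry point: build one LLVM module for the (possibly merged)
 * set of NIR shaders, wire up the radv ABI callbacks, wrap each merged stage's
 * body in a wave-count guard, and emit the per-stage epilogues. */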
static LLVMModuleRef
ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
			 struct nir_shader *const *shaders,
			 int shader_count,
			 struct radv_shader_variant_info *shader_info,
			 const struct radv_nir_compiler_options *options)
{
	struct radv_shader_context ctx = {0};
	unsigned i;

	ctx.options = options;
	ctx.shader_info = shader_info;

	ac_llvm_context_init(&ctx.ac, options->chip_class, options->family);
	ctx.context = ctx.ac.context;
	ctx.ac.module = ac_create_module(ac_llvm->tm, ctx.context);

	enum ac_float_mode float_mode =
		options->unsafe_math ? AC_FLOAT_MODE_UNSAFE_FP_MATH :
				       AC_FLOAT_MODE_DEFAULT;

	ctx.ac.builder = ac_create_builder(ctx.context, float_mode);

	radv_nir_shader_info_init(&shader_info->info);

	for(int i = 0; i < shader_count; ++i)
		radv_nir_shader_info_pass(shaders[i], options, &shader_info->info);

	for (i = 0; i < RADV_UD_MAX_SETS; i++)
		shader_info->user_sgprs_locs.descriptor_sets[i].sgpr_idx = -1;
	for (i = 0; i < AC_UD_MAX_UD; i++)
		shader_info->user_sgprs_locs.shader_data[i].sgpr_idx = -1;

	ctx.max_workgroup_size = 0;
	for (int i = 0; i < shader_count; ++i) {
		ctx.max_workgroup_size = MAX2(ctx.max_workgroup_size,
					      radv_nir_get_max_workgroup_size(ctx.options->chip_class,
									      shaders[i]));
	}

	if (ctx.ac.chip_class >= GFX10) {
		if (is_pre_gs_stage(shaders[0]->info.stage) &&
		    options->key.vs_common_out.as_ngg) {
			ctx.max_workgroup_size = 128;
		}
	}

	create_function(&ctx, shaders[shader_count - 1]->info.stage, shader_count >= 2,
			shader_count >= 2 ? shaders[shader_count - 2]->info.stage : MESA_SHADER_VERTEX);

	ctx.abi.inputs = &ctx.inputs[0];
	ctx.abi.emit_outputs = handle_shader_outputs_post;
	ctx.abi.emit_vertex = visit_emit_vertex;
	ctx.abi.load_ubo = radv_load_ubo;
	ctx.abi.load_ssbo = radv_load_ssbo;
	ctx.abi.load_sampler_desc = radv_get_sampler_desc;
	ctx.abi.load_resource = radv_load_resource;
	ctx.abi.clamp_shadow_reference = false;
	ctx.abi.gfx9_stride_size_workaround = ctx.ac.chip_class == GFX9 && HAVE_LLVM < 0x800;

	/* Because the new raw/struct atomic intrinsics are buggy with LLVM 8,
	 * we fallback to the old intrinsics for atomic buffer image operations
	 * and thus we need to apply the indexing workaround...
	 */
	ctx.abi.gfx9_stride_size_workaround_for_atomic = ctx.ac.chip_class == GFX9 && HAVE_LLVM < 0x900;

	bool is_ngg = is_pre_gs_stage(shaders[0]->info.stage) && ctx.options->key.vs_common_out.as_ngg;
	if (shader_count >= 2 || is_ngg)
		ac_init_exec_full_mask(&ctx.ac);

	if ((ctx.ac.family == CHIP_VEGA10 ||
	     ctx.ac.family == CHIP_RAVEN) &&
	    shaders[shader_count - 1]->info.stage == MESA_SHADER_TESS_CTRL)
		ac_nir_fixup_ls_hs_input_vgprs(&ctx);

	for(int i = 0; i < shader_count; ++i) {
		ctx.stage = shaders[i]->info.stage;
		ctx.output_mask = 0;

		if (shaders[i]->info.stage == MESA_SHADER_GEOMETRY) {
			for (int i = 0; i < 4; i++) {
				ctx.gs_next_vertex[i] =
					ac_build_alloca(&ctx.ac, ctx.ac.i32, "");
			}
			if (ctx.options->key.vs_common_out.as_ngg) {
				for (unsigned i = 0; i < 4; ++i) {
					ctx.gs_curprim_verts[i] =
						ac_build_alloca(&ctx.ac, ctx.ac.i32, "");
					ctx.gs_generated_prims[i] =
						ac_build_alloca(&ctx.ac, ctx.ac.i32, "");
				}

				/* TODO: streamout */

				LLVMTypeRef ai32 = LLVMArrayType(ctx.ac.i32, 8);
				ctx.gs_ngg_scratch =
					LLVMAddGlobalInAddressSpace(ctx.ac.module,
								    ai32, "ngg_scratch", AC_ADDR_SPACE_LDS);
				LLVMSetInitializer(ctx.gs_ngg_scratch, LLVMGetUndef(ai32));
				LLVMSetAlignment(ctx.gs_ngg_scratch, 4);

				ctx.gs_ngg_emit = LLVMBuildIntToPtr(ctx.ac.builder, ctx.ac.i32_0,
								    LLVMPointerType(LLVMArrayType(ctx.ac.i32, 0), AC_ADDR_SPACE_LDS),
								    "ngg_emit");
			}

			ctx.gs_max_out_vertices = shaders[i]->info.gs.vertices_out;
			ctx.gs_output_prim = shaders[i]->info.gs.output_primitive;
			ctx.abi.load_inputs = load_gs_input;
			ctx.abi.emit_primitive = visit_end_primitive;
		} else if (shaders[i]->info.stage == MESA_SHADER_TESS_CTRL) {
			ctx.tcs_outputs_read = shaders[i]->info.outputs_read;
			ctx.tcs_patch_outputs_read = shaders[i]->info.patch_outputs_read;
			ctx.abi.load_tess_varyings = load_tcs_varyings;
			ctx.abi.load_patch_vertices_in = load_patch_vertices_in;
			ctx.abi.store_tcs_outputs = store_tcs_output;
			ctx.tcs_vertices_per_patch = shaders[i]->info.tess.tcs_vertices_out;
			if (shader_count == 1)
				ctx.tcs_num_inputs = ctx.options->key.tcs.num_inputs;
			else
				ctx.tcs_num_inputs = util_last_bit64(shader_info->info.vs.ls_outputs_written);
			ctx.tcs_num_patches = get_tcs_num_patches(&ctx);
		} else if (shaders[i]->info.stage == MESA_SHADER_TESS_EVAL) {
			ctx.tes_primitive_mode = shaders[i]->info.tess.primitive_mode;
			ctx.abi.load_tess_varyings = load_tes_input;
			ctx.abi.load_tess_coord = load_tess_coord;
			ctx.abi.load_patch_vertices_in = load_patch_vertices_in;
			ctx.tcs_vertices_per_patch = shaders[i]->info.tess.tcs_vertices_out;
			ctx.tcs_num_patches = ctx.options->key.tes.num_patches;
		} else if (shaders[i]->info.stage == MESA_SHADER_VERTEX) {
			ctx.abi.load_base_vertex = radv_load_base_vertex;
		} else if (shaders[i]->info.stage == MESA_SHADER_FRAGMENT) {
			shader_info->fs.can_discard = shaders[i]->info.fs.uses_discard;
			ctx.abi.lookup_interp_param = lookup_interp_param;
			ctx.abi.load_sample_position = load_sample_position;
			ctx.abi.load_sample_mask_in = load_sample_mask_in;
			ctx.abi.emit_kill = radv_emit_kill;
		}

		if (shaders[i]->info.stage == MESA_SHADER_VERTEX &&
		    ctx.options->key.vs_common_out.as_ngg &&
		    ctx.options->key.vs_common_out.export_prim_id) {
			declare_esgs_ring(&ctx);
		}

		if (i)
			ac_emit_barrier(&ctx.ac, ctx.stage);

		nir_foreach_variable(variable, &shaders[i]->outputs)
			scan_shader_output_decl(&ctx, variable, shaders[i], shaders[i]->info.stage);

		if (shaders[i]->info.stage == MESA_SHADER_GEOMETRY) {
			unsigned addclip = shaders[i]->info.clip_distance_array_size +
					   shaders[i]->info.cull_distance_array_size > 4;
			ctx.gsvs_vertex_size = (util_bitcount64(ctx.output_mask) + addclip) * 16;
			ctx.max_gsvs_emit_size = ctx.gsvs_vertex_size *
						 shaders[i]->info.gs.vertices_out;
		}

		ac_setup_rings(&ctx);

		LLVMBasicBlockRef merge_block;
		if (shader_count >= 2 || is_ngg) {
			if (shaders[i]->info.stage == MESA_SHADER_GEOMETRY &&
			    ctx.options->key.vs_common_out.as_ngg) {
				gfx10_ngg_gs_emit_prologue(&ctx);
			}

			LLVMValueRef fn = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx.ac.builder));
			LLVMBasicBlockRef then_block = LLVMAppendBasicBlockInContext(ctx.ac.context, fn, "");
			merge_block = LLVMAppendBasicBlockInContext(ctx.ac.context, fn, "");

			LLVMValueRef count = ac_unpack_param(&ctx.ac, ctx.merged_wave_info, 8 * i, 8);
			LLVMValueRef thread_id = ac_get_thread_id(&ctx.ac);
			LLVMValueRef cond = LLVMBuildICmp(ctx.ac.builder, LLVMIntULT,
							  thread_id, count, "");
			LLVMBuildCondBr(ctx.ac.builder, cond, then_block, merge_block);

			LLVMPositionBuilderAtEnd(ctx.ac.builder, then_block);
		}

		if (shaders[i]->info.stage == MESA_SHADER_FRAGMENT)
			prepare_interp_optimize(&ctx, shaders[i]);
		else if(shaders[i]->info.stage == MESA_SHADER_VERTEX)
			handle_vs_inputs(&ctx, shaders[i]);
		else if(shader_count >= 2 && shaders[i]->info.stage == MESA_SHADER_GEOMETRY)
			prepare_gs_input_vgprs(&ctx);

		ac_nir_translate(&ctx.ac, &ctx.abi, shaders[i]);

		if (shader_count >= 2 || is_ngg) {
			LLVMBuildBr(ctx.ac.builder, merge_block);
			LLVMPositionBuilderAtEnd(ctx.ac.builder, merge_block);
		}

		/* This needs to be outside the if wrapping the shader body, as sometimes
		 * the HW generates waves with 0 es/vs threads. */
		if (is_pre_gs_stage(shaders[i]->info.stage) &&
		    ctx.options->key.vs_common_out.as_ngg &&
		    i == shader_count - 1) {
			handle_ngg_outputs_post(&ctx);
		} else if (shaders[i]->info.stage == MESA_SHADER_GEOMETRY &&
			   ctx.options->key.vs_common_out.as_ngg) {
			gfx10_ngg_gs_emit_epilogue_2(&ctx);
		}

		if (shaders[i]->info.stage == MESA_SHADER_GEOMETRY) {
			shader_info->gs.gsvs_vertex_size = ctx.gsvs_vertex_size;
			shader_info->gs.max_gsvs_emit_size = ctx.max_gsvs_emit_size;
		} else if (shaders[i]->info.stage == MESA_SHADER_TESS_CTRL) {
			shader_info->tcs.num_patches = ctx.tcs_num_patches;
			shader_info->tcs.lds_size = calculate_tess_lds_size(&ctx);
		}
	}

	LLVMBuildRetVoid(ctx.ac.builder);

	if (options->dump_preoptir) {
		fprintf(stderr, "%s LLVM IR:\n\n",
			radv_get_shader_name(shader_info,
					     shaders[shader_count - 1]->info.stage));
		ac_dump_module(ctx.ac.module);
		fprintf(stderr, "\n");
	}

	ac_llvm_finalize_module(&ctx, ac_llvm->passmgr, options);

	if (shader_count == 1)
		ac_nir_eliminate_const_vs_outputs(&ctx);

	if (options->dump_shader) {
		ctx.shader_info->private_mem_vgprs =
			ac_count_scratch_private_memory(ctx.main_function);
	}

	return ctx.ac.module;
}
static void ac_diagnostic_handler(LLVMDiagnosticInfoRef di, void *context)
{
	unsigned *retval = (unsigned *)context;
	LLVMDiagnosticSeverity severity = LLVMGetDiagInfoSeverity(di);
	char *description = LLVMGetDiagInfoDescription(di);

	if (severity == LLVMDSError) {
		*retval = 1;
		fprintf(stderr, "LLVM triggered Diagnostic Handler: %s\n",
			description);
	}

	LLVMDisposeMessage(description);
}
static unsigned radv_llvm_compile(LLVMModuleRef M,
				  char **pelf_buffer, size_t *pelf_size,
				  struct ac_llvm_compiler *ac_llvm)
{
	unsigned retval = 0;
	LLVMContextRef llvm_ctx;

	/* Setup Diagnostic Handler */
	llvm_ctx = LLVMGetModuleContext(M);

	LLVMContextSetDiagnosticHandler(llvm_ctx, ac_diagnostic_handler,
					&retval);

	if (!radv_compile_to_elf(ac_llvm, M, pelf_buffer, pelf_size))
		retval = 1;

	return retval;
}
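/* Compile the LLVM module to an ELF image and wrap it, together with the
 * optionally recorded LLVM IR text, in a radv_shader_binary_rtld: the ELF bytes
 * are stored first in data[], followed by the NUL-terminated IR string. */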
static void ac_compile_llvm_module(struct ac_llvm_compiler *ac_llvm,
				   LLVMModuleRef llvm_module,
				   struct radv_shader_binary **rbinary,
				   struct radv_shader_variant_info *shader_info,
				   gl_shader_stage stage,
				   const char *name,
				   const struct radv_nir_compiler_options *options)
{
	char *elf_buffer = NULL;
	size_t elf_size = 0;
	char *llvm_ir_string = NULL;

	if (options->dump_shader) {
		fprintf(stderr, "%s LLVM IR:\n\n", name);
		ac_dump_module(llvm_module);
		fprintf(stderr, "\n");
	}

	if (options->record_llvm_ir) {
		char *llvm_ir = LLVMPrintModuleToString(llvm_module);
		llvm_ir_string = strdup(llvm_ir);
		LLVMDisposeMessage(llvm_ir);
	}

	int v = radv_llvm_compile(llvm_module, &elf_buffer, &elf_size, ac_llvm);
	if (v) {
		fprintf(stderr, "compile failed\n");
	}

	LLVMContextRef ctx = LLVMGetModuleContext(llvm_module);
	LLVMDisposeModule(llvm_module);
	LLVMContextDispose(ctx);

	size_t llvm_ir_size = llvm_ir_string ? strlen(llvm_ir_string) : 0;
	size_t alloc_size = sizeof(struct radv_shader_binary_rtld) + elf_size + llvm_ir_size + 1;
	struct radv_shader_binary_rtld *rbin = calloc(1, alloc_size);
	memcpy(rbin->data, elf_buffer, elf_size);
	if (llvm_ir_string)
		memcpy(rbin->data + elf_size, llvm_ir_string, llvm_ir_size + 1);

	rbin->base.type = RADV_BINARY_TYPE_RTLD;
	rbin->base.stage = stage;
	rbin->base.total_size = alloc_size;
	rbin->elf_size = elf_size;
	rbin->llvm_ir_size = llvm_ir_size;
	*rbinary = &rbin->base;

	free(llvm_ir_string);
	free(elf_buffer);
}
static void
ac_fill_shader_info(struct radv_shader_variant_info *shader_info, struct nir_shader *nir, const struct radv_nir_compiler_options *options)
{
	switch (nir->info.stage) {
	case MESA_SHADER_COMPUTE:
		for (int i = 0; i < 3; ++i)
			shader_info->cs.block_size[i] = nir->info.cs.local_size[i];
		break;
	case MESA_SHADER_FRAGMENT:
		shader_info->fs.early_fragment_test = nir->info.fs.early_fragment_tests;
		shader_info->fs.post_depth_coverage = nir->info.fs.post_depth_coverage;
		break;
	case MESA_SHADER_GEOMETRY:
		shader_info->gs.vertices_in = nir->info.gs.vertices_in;
		shader_info->gs.vertices_out = nir->info.gs.vertices_out;
		shader_info->gs.output_prim = nir->info.gs.output_primitive;
		shader_info->gs.invocations = nir->info.gs.invocations;
		break;
	case MESA_SHADER_TESS_EVAL:
		shader_info->tes.primitive_mode = nir->info.tess.primitive_mode;
		shader_info->tes.spacing = nir->info.tess.spacing;
		shader_info->tes.ccw = nir->info.tess.ccw;
		shader_info->tes.point_mode = nir->info.tess.point_mode;
		shader_info->tes.as_es = options->key.vs_common_out.as_es;
		shader_info->tes.export_prim_id = options->key.vs_common_out.export_prim_id;
		shader_info->is_ngg = options->key.vs_common_out.as_ngg;
		break;
	case MESA_SHADER_TESS_CTRL:
		shader_info->tcs.tcs_vertices_out = nir->info.tess.tcs_vertices_out;
		break;
	case MESA_SHADER_VERTEX:
		shader_info->vs.as_es = options->key.vs_common_out.as_es;
		shader_info->vs.as_ls = options->key.vs_common_out.as_ls;
		shader_info->vs.export_prim_id = options->key.vs_common_out.export_prim_id;
		shader_info->is_ngg = options->key.vs_common_out.as_ngg;
		break;
	default:
		break;
	}
}
void
radv_compile_nir_shader(struct ac_llvm_compiler *ac_llvm,
			struct radv_shader_binary **rbinary,
			struct radv_shader_variant_info *shader_info,
			struct nir_shader *const *nir,
			int nir_count,
			const struct radv_nir_compiler_options *options)
{
	LLVMModuleRef llvm_module;

	llvm_module = ac_translate_nir_to_llvm(ac_llvm, nir, nir_count, shader_info,
					       options);

	ac_compile_llvm_module(ac_llvm, llvm_module, rbinary, shader_info,
			       nir[nir_count - 1]->info.stage,
			       radv_get_shader_name(shader_info,
						    nir[nir_count - 1]->info.stage),
			       options);

	for (int i = 0; i < nir_count; ++i)
		ac_fill_shader_info(shader_info, nir[i], options);

	/* Determine the ES type (VS or TES) for the GS on GFX9. */
	if (options->chip_class >= GFX9) {
		if (nir_count == 2 &&
		    nir[1]->info.stage == MESA_SHADER_GEOMETRY) {
			shader_info->gs.es_type = nir[0]->info.stage;
		}
	}
}
static void
ac_gs_copy_shader_emit(struct radv_shader_context *ctx)
{
	LLVMValueRef vtx_offset =
		LLVMBuildMul(ctx->ac.builder, ctx->abi.vertex_id,
			     LLVMConstInt(ctx->ac.i32, 4, false), "");
	LLVMValueRef stream_id;

	/* Fetch the vertex stream ID. */
	if (ctx->shader_info->info.so.num_outputs) {
		stream_id =
			ac_unpack_param(&ctx->ac, ctx->streamout_config, 24, 2);
	} else {
		stream_id = ctx->ac.i32_0;
	}

	LLVMBasicBlockRef end_bb;
	LLVMValueRef switch_inst;

	end_bb = LLVMAppendBasicBlockInContext(ctx->ac.context,
					       ctx->main_function, "end");
	switch_inst = LLVMBuildSwitch(ctx->ac.builder, stream_id, end_bb, 4);

	for (unsigned stream = 0; stream < 4; stream++) {
		unsigned num_components =
			ctx->shader_info->info.gs.num_stream_output_components[stream];
		LLVMBasicBlockRef bb;
		unsigned offset = 0;

		if (!num_components)
			continue;

		if (stream > 0 && !ctx->shader_info->info.so.num_outputs)
			continue;

		bb = LLVMInsertBasicBlockInContext(ctx->ac.context, end_bb, "out");
		LLVMAddCase(switch_inst, LLVMConstInt(ctx->ac.i32, stream, 0), bb);
		LLVMPositionBuilderAtEnd(ctx->ac.builder, bb);

		for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
			unsigned output_usage_mask =
				ctx->shader_info->info.gs.output_usage_mask[i];
			unsigned output_stream =
				ctx->shader_info->info.gs.output_streams[i];
			int length = util_last_bit(output_usage_mask);

			if (!(ctx->output_mask & (1ull << i)) ||
			    output_stream != stream)
				continue;

			for (unsigned j = 0; j < length; j++) {
				LLVMValueRef value, soffset;

				if (!(output_usage_mask & (1 << j)))
					continue;

				soffset = LLVMConstInt(ctx->ac.i32,
						       offset *
						       ctx->gs_max_out_vertices * 16 * 4, false);

				offset++;

				value = ac_build_buffer_load(&ctx->ac,
							     ctx->gsvs_ring[0],
							     1, ctx->ac.i32_0,
							     vtx_offset, soffset,
							     0, ac_glc | ac_slc, true, false);

				LLVMTypeRef type = LLVMGetAllocatedType(ctx->abi.outputs[ac_llvm_reg_index_soa(i, j)]);
				if (ac_get_type_size(type) == 2) {
					value = LLVMBuildBitCast(ctx->ac.builder, value, ctx->ac.i32, "");
					value = LLVMBuildTrunc(ctx->ac.builder, value, ctx->ac.i16, "");
				}

				LLVMBuildStore(ctx->ac.builder,
					       ac_to_float(&ctx->ac, value), ctx->abi.outputs[ac_llvm_reg_index_soa(i, j)]);
			}
		}

		if (ctx->shader_info->info.so.num_outputs)
			radv_emit_streamout(ctx, stream);

		if (stream == 0) {
			handle_vs_outputs_post(ctx, false, true,
					       &ctx->shader_info->vs.outinfo);
		}

		LLVMBuildBr(ctx->ac.builder, end_bb);
	}

	LLVMPositionBuilderAtEnd(ctx->ac.builder, end_bb);
}
void
radv_compile_gs_copy_shader(struct ac_llvm_compiler *ac_llvm,
			    struct nir_shader *geom_shader,
			    struct radv_shader_binary **rbinary,
			    struct radv_shader_variant_info *shader_info,
			    const struct radv_nir_compiler_options *options)
{
	struct radv_shader_context ctx = {0};

	ctx.options = options;
	ctx.shader_info = shader_info;

	ac_llvm_context_init(&ctx.ac, options->chip_class, options->family);
	ctx.context = ctx.ac.context;
	ctx.ac.module = ac_create_module(ac_llvm->tm, ctx.context);

	ctx.is_gs_copy_shader = true;

	enum ac_float_mode float_mode =
		options->unsafe_math ? AC_FLOAT_MODE_UNSAFE_FP_MATH :
				       AC_FLOAT_MODE_DEFAULT;

	ctx.ac.builder = ac_create_builder(ctx.context, float_mode);
	ctx.stage = MESA_SHADER_VERTEX;

	radv_nir_shader_info_pass(geom_shader, options, &shader_info->info);

	create_function(&ctx, MESA_SHADER_VERTEX, false, MESA_SHADER_VERTEX);

	ctx.gs_max_out_vertices = geom_shader->info.gs.vertices_out;
	ac_setup_rings(&ctx);

	nir_foreach_variable(variable, &geom_shader->outputs) {
		scan_shader_output_decl(&ctx, variable, geom_shader, MESA_SHADER_VERTEX);
		ac_handle_shader_output_decl(&ctx.ac, &ctx.abi, geom_shader,
					     variable, MESA_SHADER_VERTEX);
	}

	ac_gs_copy_shader_emit(&ctx);

	LLVMBuildRetVoid(ctx.ac.builder);

	ac_llvm_finalize_module(&ctx, ac_llvm->passmgr, options);

	ac_compile_llvm_module(ac_llvm, ctx.ac.module, rbinary, shader_info,
			       MESA_SHADER_VERTEX, "GS Copy Shader", options);
	(*rbinary)->is_gs_copy_shader = true;
}