ac/nir, radv, radeonsi: Switch to using ac_shader_args
[mesa.git] / src / amd / vulkan / radv_nir_to_llvm.c
1 /*
2 * Copyright © 2016 Red Hat.
3 * Copyright © 2016 Bas Nieuwenhuizen
4 *
5 * based in part on anv driver which is:
6 * Copyright © 2015 Intel Corporation
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a
9 * copy of this software and associated documentation files (the "Software"),
10 * to deal in the Software without restriction, including without limitation
11 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 * and/or sell copies of the Software, and to permit persons to whom the
13 * Software is furnished to do so, subject to the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the next
16 * paragraph) shall be included in all copies or substantial portions of the
17 * Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * IN THE SOFTWARE.
26 */
27
28 #include "radv_private.h"
29 #include "radv_shader.h"
30 #include "radv_shader_helper.h"
31 #include "radv_shader_args.h"
32 #include "nir/nir.h"
33
34 #include <llvm-c/Core.h>
35 #include <llvm-c/TargetMachine.h>
36 #include <llvm-c/Transforms/Scalar.h>
37 #include <llvm-c/Transforms/Utils.h>
38
39 #include "sid.h"
40 #include "ac_binary.h"
41 #include "ac_llvm_util.h"
42 #include "ac_llvm_build.h"
43 #include "ac_shader_abi.h"
44 #include "ac_shader_util.h"
45 #include "ac_exp_param.h"
46
47 #define RADEON_LLVM_MAX_INPUTS (VARYING_SLOT_VAR31 + 1)
48
/* Per-compilation state for translating one RADV shader stage from NIR to
 * LLVM IR. Embeds the common ac_llvm_context/ac_shader_abi state and caches
 * LLVM values for arguments and rings that are loaded once up front. */
struct radv_shader_context {
	struct ac_llvm_context ac;
	const struct nir_shader *shader;
	struct ac_shader_abi abi;
	const struct radv_shader_args *args;

	gl_shader_stage stage;

	unsigned max_workgroup_size;
	LLVMContextRef context;
	LLVMValueRef main_function;

	/* Resolved descriptor set pointers, either taken directly from user
	 * SGPRs or loaded through the indirect table (see
	 * load_descriptor_sets()). */
	LLVMValueRef descriptor_sets[MAX_SETS];

	/* Scratch/ring descriptor table (see user_sgpr_info.need_ring_offsets). */
	LLVMValueRef ring_offsets;

	LLVMValueRef rel_auto_id;

	LLVMValueRef gs_wave_id;
	/* GS per-vertex offsets; only even slots are separate args in merged
	 * mode (two offsets are packed per VGPR). */
	LLVMValueRef gs_vtx_offset[6];

	/* Ring buffer descriptors for ES->GS / GS->VS and tessellation. */
	LLVMValueRef esgs_ring;
	LLVMValueRef gsvs_ring[4];
	LLVMValueRef hs_ring_tess_offchip;
	LLVMValueRef hs_ring_tess_factor;

	/* Flattened [slot][channel] storage for shader inputs. */
	LLVMValueRef inputs[RADEON_LLVM_MAX_INPUTS * 4];

	uint64_t output_mask;

	/* Per-stream GS emission state (4 entries — presumably one per GS
	 * vertex stream; confirm against GS lowering code). */
	LLVMValueRef gs_next_vertex[4];
	LLVMValueRef gs_curprim_verts[4];
	LLVMValueRef gs_generated_prims[4];
	LLVMValueRef gs_ngg_emit;
	LLVMValueRef gs_ngg_scratch;

	uint32_t tcs_num_inputs;
	uint32_t tcs_num_patches;

	LLVMValueRef vertexptr; /* GFX10 only */
};
90
/* One output varying: up to four channel values plus the slot it lives in
 * and the mask of channels actually written. */
struct radv_shader_output_values {
	LLVMValueRef values[4];
	unsigned slot_name;
	unsigned slot_index;
	unsigned usage_mask;
};
97
/* Recover the enclosing radv_shader_context from its embedded
 * ac_shader_abi member (used by the common ABI callbacks). */
static inline struct radv_shader_context *
radv_shader_context_from_abi(struct ac_shader_abi *abi)
{
	struct radv_shader_context *ctx = NULL;
	return container_of(abi, ctx, abi);
}
104
105 static LLVMValueRef get_rel_patch_id(struct radv_shader_context *ctx)
106 {
107 switch (ctx->stage) {
108 case MESA_SHADER_TESS_CTRL:
109 return ac_unpack_param(&ctx->ac,
110 ac_get_arg(&ctx->ac, ctx->args->ac.tcs_rel_ids),
111 0, 8);
112 case MESA_SHADER_TESS_EVAL:
113 return ac_get_arg(&ctx->ac, ctx->args->tes_rel_patch_id);
114 break;
115 default:
116 unreachable("Illegal stage");
117 }
118 }
119
/* Compute how many tessellation patches to pack into one TCS threadgroup.
 *
 * The count is successively clamped by: wave occupancy, LDS capacity for
 * the input+output patch data, the off-chip tess buffer size, a fixed
 * performance cap of 40, and a GFX6 one-wave workaround. All sizes below
 * are in bytes; each attribute slot is one vec4 (16 bytes).
 */
static unsigned
get_tcs_num_patches(struct radv_shader_context *ctx)
{
	unsigned num_tcs_input_cp = ctx->args->options->key.tcs.input_vertices;
	unsigned num_tcs_output_cp = ctx->shader->info.tess.tcs_vertices_out;
	uint32_t input_vertex_size = ctx->tcs_num_inputs * 16;
	uint32_t input_patch_size = ctx->args->options->key.tcs.input_vertices * input_vertex_size;
	uint32_t num_tcs_outputs = util_last_bit64(ctx->args->shader_info->tcs.outputs_written);
	uint32_t num_tcs_patch_outputs = util_last_bit64(ctx->args->shader_info->tcs.patch_outputs_written);
	uint32_t output_vertex_size = num_tcs_outputs * 16;
	uint32_t pervertex_output_patch_size = ctx->shader->info.tess.tcs_vertices_out * output_vertex_size;
	uint32_t output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16;
	unsigned num_patches;
	unsigned hardware_lds_size;

	/* Ensure that we only need one wave per SIMD so we don't need to check
	 * resource usage. Also ensures that the number of tcs in and out
	 * vertices per threadgroup are at most 256.
	 */
	num_patches = 64 / MAX2(num_tcs_input_cp, num_tcs_output_cp) * 4;
	/* Make sure that the data fits in LDS. This assumes the shaders only
	 * use LDS for the inputs and outputs.
	 */
	hardware_lds_size = 32768;

	/* Looks like STONEY hangs if we use more than 32 KiB LDS in a single
	 * threadgroup, even though there is more than 32 KiB LDS.
	 *
	 * Test: dEQP-VK.tessellation.shader_input_output.barrier
	 */
	if (ctx->args->options->chip_class >= GFX7 && ctx->args->options->family != CHIP_STONEY)
		hardware_lds_size = 65536;

	num_patches = MIN2(num_patches, hardware_lds_size / (input_patch_size + output_patch_size));
	/* Make sure the output data fits in the offchip buffer */
	num_patches = MIN2(num_patches, (ctx->args->options->tess_offchip_block_dw_size * 4) / output_patch_size);
	/* Not necessary for correctness, but improves performance. The
	 * specific value is taken from the proprietary driver.
	 */
	num_patches = MIN2(num_patches, 40);

	/* GFX6 bug workaround - limit LS-HS threadgroups to only one wave. */
	if (ctx->args->options->chip_class == GFX6) {
		unsigned one_wave = 64 / MAX2(num_tcs_input_cp, num_tcs_output_cp);
		num_patches = MIN2(num_patches, one_wave);
	}
	return num_patches;
}
168
169 static unsigned
170 calculate_tess_lds_size(struct radv_shader_context *ctx)
171 {
172 unsigned num_tcs_input_cp = ctx->args->options->key.tcs.input_vertices;
173 unsigned num_tcs_output_cp;
174 unsigned num_tcs_outputs, num_tcs_patch_outputs;
175 unsigned input_vertex_size, output_vertex_size;
176 unsigned input_patch_size, output_patch_size;
177 unsigned pervertex_output_patch_size;
178 unsigned output_patch0_offset;
179 unsigned num_patches;
180 unsigned lds_size;
181
182 num_tcs_output_cp = ctx->shader->info.tess.tcs_vertices_out;
183 num_tcs_outputs = util_last_bit64(ctx->args->shader_info->tcs.outputs_written);
184 num_tcs_patch_outputs = util_last_bit64(ctx->args->shader_info->tcs.patch_outputs_written);
185
186 input_vertex_size = ctx->tcs_num_inputs * 16;
187 output_vertex_size = num_tcs_outputs * 16;
188
189 input_patch_size = num_tcs_input_cp * input_vertex_size;
190
191 pervertex_output_patch_size = num_tcs_output_cp * output_vertex_size;
192 output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16;
193
194 num_patches = ctx->tcs_num_patches;
195 output_patch0_offset = input_patch_size * num_patches;
196
197 lds_size = output_patch0_offset + output_patch_size * num_patches;
198 return lds_size;
199 }
200
201 /* Tessellation shaders pass outputs to the next shader using LDS.
202 *
203 * LS outputs = TCS inputs
204 * TCS outputs = TES inputs
205 *
206 * The LDS layout is:
207 * - TCS inputs for patch 0
208 * - TCS inputs for patch 1
209 * - TCS inputs for patch 2 = get_tcs_in_current_patch_offset (if RelPatchID==2)
210 * - ...
211 * - TCS outputs for patch 0 = get_tcs_out_patch0_offset
212 * - Per-patch TCS outputs for patch 0 = get_tcs_out_patch0_patch_data_offset
213 * - TCS outputs for patch 1
214 * - Per-patch TCS outputs for patch 1
215 * - TCS outputs for patch 2 = get_tcs_out_current_patch_offset (if RelPatchID==2)
216 * - Per-patch TCS outputs for patch 2 = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
217 * - ...
218 *
219 * All three shaders VS(LS), TCS, TES share the same LDS space.
220 */
221 static LLVMValueRef
222 get_tcs_in_patch_stride(struct radv_shader_context *ctx)
223 {
224 assert(ctx->stage == MESA_SHADER_TESS_CTRL);
225 uint32_t input_vertex_size = ctx->tcs_num_inputs * 16;
226 uint32_t input_patch_size = ctx->args->options->key.tcs.input_vertices * input_vertex_size;
227
228 input_patch_size /= 4;
229 return LLVMConstInt(ctx->ac.i32, input_patch_size, false);
230 }
231
232 static LLVMValueRef
233 get_tcs_out_patch_stride(struct radv_shader_context *ctx)
234 {
235 uint32_t num_tcs_outputs = util_last_bit64(ctx->args->shader_info->tcs.outputs_written);
236 uint32_t num_tcs_patch_outputs = util_last_bit64(ctx->args->shader_info->tcs.patch_outputs_written);
237 uint32_t output_vertex_size = num_tcs_outputs * 16;
238 uint32_t pervertex_output_patch_size = ctx->shader->info.tess.tcs_vertices_out * output_vertex_size;
239 uint32_t output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16;
240 output_patch_size /= 4;
241 return LLVMConstInt(ctx->ac.i32, output_patch_size, false);
242 }
243
244 static LLVMValueRef
245 get_tcs_out_vertex_stride(struct radv_shader_context *ctx)
246 {
247 uint32_t num_tcs_outputs = util_last_bit64(ctx->args->shader_info->tcs.outputs_written);
248 uint32_t output_vertex_size = num_tcs_outputs * 16;
249 output_vertex_size /= 4;
250 return LLVMConstInt(ctx->ac.i32, output_vertex_size, false);
251 }
252
253 static LLVMValueRef
254 get_tcs_out_patch0_offset(struct radv_shader_context *ctx)
255 {
256 assert (ctx->stage == MESA_SHADER_TESS_CTRL);
257 uint32_t input_vertex_size = ctx->tcs_num_inputs * 16;
258 uint32_t input_patch_size = ctx->args->options->key.tcs.input_vertices * input_vertex_size;
259 uint32_t output_patch0_offset = input_patch_size;
260 unsigned num_patches = ctx->tcs_num_patches;
261
262 output_patch0_offset *= num_patches;
263 output_patch0_offset /= 4;
264 return LLVMConstInt(ctx->ac.i32, output_patch0_offset, false);
265 }
266
267 static LLVMValueRef
268 get_tcs_out_patch0_patch_data_offset(struct radv_shader_context *ctx)
269 {
270 assert (ctx->stage == MESA_SHADER_TESS_CTRL);
271 uint32_t input_vertex_size = ctx->tcs_num_inputs * 16;
272 uint32_t input_patch_size = ctx->args->options->key.tcs.input_vertices * input_vertex_size;
273 uint32_t output_patch0_offset = input_patch_size;
274
275 uint32_t num_tcs_outputs = util_last_bit64(ctx->args->shader_info->tcs.outputs_written);
276 uint32_t output_vertex_size = num_tcs_outputs * 16;
277 uint32_t pervertex_output_patch_size = ctx->shader->info.tess.tcs_vertices_out * output_vertex_size;
278 unsigned num_patches = ctx->tcs_num_patches;
279
280 output_patch0_offset *= num_patches;
281 output_patch0_offset += pervertex_output_patch_size;
282 output_patch0_offset /= 4;
283 return LLVMConstInt(ctx->ac.i32, output_patch0_offset, false);
284 }
285
286 static LLVMValueRef
287 get_tcs_in_current_patch_offset(struct radv_shader_context *ctx)
288 {
289 LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx);
290 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
291
292 return LLVMBuildMul(ctx->ac.builder, patch_stride, rel_patch_id, "");
293 }
294
295 static LLVMValueRef
296 get_tcs_out_current_patch_offset(struct radv_shader_context *ctx)
297 {
298 LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx);
299 LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
300 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
301
302 return ac_build_imad(&ctx->ac, patch_stride, rel_patch_id,
303 patch0_offset);
304 }
305
306 static LLVMValueRef
307 get_tcs_out_current_patch_data_offset(struct radv_shader_context *ctx)
308 {
309 LLVMValueRef patch0_patch_data_offset =
310 get_tcs_out_patch0_patch_data_offset(ctx);
311 LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
312 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
313
314 return ac_build_imad(&ctx->ac, patch_stride, rel_patch_id,
315 patch0_patch_data_offset);
316 }
317
318 static LLVMValueRef
319 create_llvm_function(struct ac_llvm_context *ctx, LLVMModuleRef module,
320 LLVMBuilderRef builder,
321 struct ac_shader_args *args,
322 enum ac_llvm_calling_convention convention,
323 unsigned max_workgroup_size,
324 const struct radv_nir_compiler_options *options)
325 {
326 LLVMValueRef main_function =
327 ac_build_main(args, ctx, convention, "main", ctx->voidt, module);
328
329 if (options->address32_hi) {
330 ac_llvm_add_target_dep_function_attr(main_function,
331 "amdgpu-32bit-address-high-bits",
332 options->address32_hi);
333 }
334
335 ac_llvm_set_workgroup_size(main_function, max_workgroup_size);
336
337 return main_function;
338 }
339
340
341 static void
342 set_loc(struct radv_userdata_info *ud_info, uint8_t *sgpr_idx,
343 uint8_t num_sgprs)
344 {
345 ud_info->sgpr_idx = *sgpr_idx;
346 ud_info->num_sgprs = num_sgprs;
347 *sgpr_idx += num_sgprs;
348 }
349
350 static void
351 set_loc_shader(struct radv_shader_args *args, int idx, uint8_t *sgpr_idx,
352 uint8_t num_sgprs)
353 {
354 struct radv_userdata_info *ud_info =
355 &args->shader_info->user_sgprs_locs.shader_data[idx];
356 assert(ud_info);
357
358 set_loc(ud_info, sgpr_idx, num_sgprs);
359 }
360
361 static void
362 set_loc_shader_ptr(struct radv_shader_args *args, int idx, uint8_t *sgpr_idx)
363 {
364 bool use_32bit_pointers = idx != AC_UD_SCRATCH_RING_OFFSETS;
365
366 set_loc_shader(args, idx, sgpr_idx, use_32bit_pointers ? 1 : 2);
367 }
368
369 static void
370 set_loc_desc(struct radv_shader_args *args, int idx, uint8_t *sgpr_idx)
371 {
372 struct radv_userdata_locations *locs =
373 &args->shader_info->user_sgprs_locs;
374 struct radv_userdata_info *ud_info = &locs->descriptor_sets[idx];
375 assert(ud_info);
376
377 set_loc(ud_info, sgpr_idx, 1);
378
379 locs->descriptor_sets_enabled |= 1 << idx;
380 }
381
/* Result of user SGPR budgeting for one stage (see allocate_user_sgprs). */
struct user_sgpr_info {
	bool need_ring_offsets;            /* scratch/ring descriptor table needed */
	bool indirect_all_descriptor_sets; /* sets don't fit; use one indirect pointer */
	uint8_t remaining_sgprs;           /* SGPRs left over for inline push constants */
};
387
388 static bool needs_view_index_sgpr(struct radv_shader_args *args,
389 gl_shader_stage stage)
390 {
391 switch (stage) {
392 case MESA_SHADER_VERTEX:
393 if (args->shader_info->needs_multiview_view_index ||
394 (!args->options->key.vs_common_out.as_es && !args->options->key.vs_common_out.as_ls && args->options->key.has_multiview_view_index))
395 return true;
396 break;
397 case MESA_SHADER_TESS_EVAL:
398 if (args->shader_info->needs_multiview_view_index || (!args->options->key.vs_common_out.as_es && args->options->key.has_multiview_view_index))
399 return true;
400 break;
401 case MESA_SHADER_GEOMETRY:
402 case MESA_SHADER_TESS_CTRL:
403 if (args->shader_info->needs_multiview_view_index)
404 return true;
405 break;
406 default:
407 break;
408 }
409 return false;
410 }
411
412 static uint8_t
413 count_vs_user_sgprs(struct radv_shader_args *args)
414 {
415 uint8_t count = 0;
416
417 if (args->shader_info->vs.has_vertex_buffers)
418 count++;
419 count += args->shader_info->vs.needs_draw_id ? 3 : 2;
420
421 return count;
422 }
423
424 static void allocate_inline_push_consts(struct radv_shader_args *args,
425 struct user_sgpr_info *user_sgpr_info)
426 {
427 uint8_t remaining_sgprs = user_sgpr_info->remaining_sgprs;
428
429 /* Only supported if shaders use push constants. */
430 if (args->shader_info->min_push_constant_used == UINT8_MAX)
431 return;
432
433 /* Only supported if shaders don't have indirect push constants. */
434 if (args->shader_info->has_indirect_push_constants)
435 return;
436
437 /* Only supported for 32-bit push constants. */
438 if (!args->shader_info->has_only_32bit_push_constants)
439 return;
440
441 uint8_t num_push_consts =
442 (args->shader_info->max_push_constant_used -
443 args->shader_info->min_push_constant_used) / 4;
444
445 /* Check if the number of user SGPRs is large enough. */
446 if (num_push_consts < remaining_sgprs) {
447 args->shader_info->num_inline_push_consts = num_push_consts;
448 } else {
449 args->shader_info->num_inline_push_consts = remaining_sgprs;
450 }
451
452 /* Clamp to the maximum number of allowed inlined push constants. */
453 if (args->shader_info->num_inline_push_consts > AC_MAX_INLINE_PUSH_CONSTS)
454 args->shader_info->num_inline_push_consts = AC_MAX_INLINE_PUSH_CONSTS;
455
456 if (args->shader_info->num_inline_push_consts == num_push_consts &&
457 !args->shader_info->loads_dynamic_offsets) {
458 /* Disable the default push constants path if all constants are
459 * inlined and if shaders don't use dynamic descriptors.
460 */
461 args->shader_info->loads_push_constants = false;
462 }
463
464 args->shader_info->base_inline_push_consts =
465 args->shader_info->min_push_constant_used / 4;
466 }
467
/* Budget the user SGPRs for one stage: count everything the stage needs
 * (ring offsets, stage-specific inputs, view index, push constants,
 * streamout), decide whether all descriptor sets fit directly or must go
 * behind a single indirect pointer, and record how many SGPRs remain for
 * inlined push constants. */
static void allocate_user_sgprs(struct radv_shader_args *args,
				gl_shader_stage stage,
				bool has_previous_stage,
				gl_shader_stage previous_stage,
				bool needs_view_index,
				struct user_sgpr_info *user_sgpr_info)
{
	uint8_t user_sgpr_count = 0;

	memset(user_sgpr_info, 0, sizeof(struct user_sgpr_info));

	/* until we sort out scratch/global buffers always assign ring offsets for gs/vs/es */
	if (stage == MESA_SHADER_GEOMETRY ||
	    stage == MESA_SHADER_VERTEX ||
	    stage == MESA_SHADER_TESS_CTRL ||
	    stage == MESA_SHADER_TESS_EVAL ||
	    args->is_gs_copy_shader)
		user_sgpr_info->need_ring_offsets = true;

	if (stage == MESA_SHADER_FRAGMENT &&
	    args->shader_info->ps.needs_sample_positions)
		user_sgpr_info->need_ring_offsets = true;

	/* 2 user sgprs will nearly always be allocated for scratch/rings */
	if (args->options->supports_spill || user_sgpr_info->need_ring_offsets) {
		user_sgpr_count += 2;
	}

	/* Stage-specific user SGPRs. */
	switch (stage) {
	case MESA_SHADER_COMPUTE:
		if (args->shader_info->cs.uses_grid_size)
			user_sgpr_count += 3;
		break;
	case MESA_SHADER_FRAGMENT:
		user_sgpr_count += args->shader_info->ps.needs_sample_positions;
		break;
	case MESA_SHADER_VERTEX:
		if (!args->is_gs_copy_shader)
			user_sgpr_count += count_vs_user_sgprs(args);
		break;
	case MESA_SHADER_TESS_CTRL:
		/* Merged VS+TCS carries the VS inputs too. */
		if (has_previous_stage) {
			if (previous_stage == MESA_SHADER_VERTEX)
				user_sgpr_count += count_vs_user_sgprs(args);
		}
		break;
	case MESA_SHADER_TESS_EVAL:
		break;
	case MESA_SHADER_GEOMETRY:
		/* Merged VS+GS carries the VS inputs too. */
		if (has_previous_stage) {
			if (previous_stage == MESA_SHADER_VERTEX) {
				user_sgpr_count += count_vs_user_sgprs(args);
			}
		}
		break;
	default:
		break;
	}

	if (needs_view_index)
		user_sgpr_count++;

	if (args->shader_info->loads_push_constants)
		user_sgpr_count++;

	if (args->shader_info->so.num_outputs)
		user_sgpr_count++;

	/* 32 user SGPRs on GFX9+ non-compute stages, 16 otherwise. */
	uint32_t available_sgprs = args->options->chip_class >= GFX9 && stage != MESA_SHADER_COMPUTE ? 32 : 16;
	uint32_t remaining_sgprs = available_sgprs - user_sgpr_count;
	uint32_t num_desc_set =
		util_bitcount(args->shader_info->desc_set_used_mask);

	if (remaining_sgprs < num_desc_set) {
		/* Not enough room for every set: spill them all behind one
		 * indirect pointer, which itself costs one SGPR. */
		user_sgpr_info->indirect_all_descriptor_sets = true;
		user_sgpr_info->remaining_sgprs = remaining_sgprs - 1;
	} else {
		user_sgpr_info->remaining_sgprs = remaining_sgprs - num_desc_set;
	}

	allocate_inline_push_consts(args, user_sgpr_info);
}
550
/* Declare the SGPR arguments common to every stage: descriptor set
 * pointers (direct, or one indirect table pointer), the push constant
 * pointer, the inlined push constant dwords, and the streamout buffer
 * descriptor pointer. The declaration order defines the hardware argument
 * layout and must match set_global_input_locs(). */
static void
declare_global_input_sgprs(struct radv_shader_args *args,
			   const struct user_sgpr_info *user_sgpr_info)
{
	/* 1 for each descriptor set */
	if (!user_sgpr_info->indirect_all_descriptor_sets) {
		uint32_t mask = args->shader_info->desc_set_used_mask;

		while (mask) {
			int i = u_bit_scan(&mask);

			ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_PTR,
				   &args->descriptor_sets[i]);
		}
	} else {
		/* All sets behind a single pointer-to-pointers. */
		ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_PTR_PTR,
			   &args->descriptor_sets[0]);
	}

	if (args->shader_info->loads_push_constants) {
		/* 1 for push constants and dynamic descriptors */
		ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_PTR,
			   &args->ac.push_constants);
	}

	/* Push constants promoted to user SGPRs, one dword each. */
	for (unsigned i = 0; i < args->shader_info->num_inline_push_consts; i++) {
		ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT,
			   &args->ac.inline_push_consts[i]);
	}
	args->ac.num_inline_push_consts = args->shader_info->num_inline_push_consts;
	args->ac.base_inline_push_consts = args->shader_info->base_inline_push_consts;

	if (args->shader_info->so.num_outputs) {
		ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR,
			   &args->streamout_buffers);
	}
}
588
/* Declare the VS-only SGPR arguments (vertex-buffer table pointer,
 * base_vertex, start_instance, optional draw_id) for a VS stage or for a
 * merged stage whose first half is a VS. GS copy shaders take none of
 * these. Must match count_vs_user_sgprs()/set_vs_specific_input_locs(). */
static void
declare_vs_specific_input_sgprs(struct radv_shader_args *args,
				gl_shader_stage stage,
				bool has_previous_stage,
				gl_shader_stage previous_stage)
{
	if (!args->is_gs_copy_shader &&
	    (stage == MESA_SHADER_VERTEX ||
	     (has_previous_stage && previous_stage == MESA_SHADER_VERTEX))) {
		if (args->shader_info->vs.has_vertex_buffers) {
			ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR,
				   &args->vertex_buffers);
		}
		ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.base_vertex);
		ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.start_instance);
		if (args->shader_info->vs.needs_draw_id) {
			ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.draw_id);
		}
	}
}
609
/* Declare the VS input VGPRs. The slot order and count are fixed by the
 * hardware stage configuration, which is why unused slots must still be
 * declared (passing NULL); the layout differs between LS, NGG and legacy
 * paths and between pre-GFX10 and GFX10+. */
static void
declare_vs_input_vgprs(struct radv_shader_args *args)
{
	ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.vertex_id);
	if (!args->is_gs_copy_shader) {
		if (args->options->key.vs_common_out.as_ls) {
			ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->rel_auto_id);
			if (args->options->chip_class >= GFX10) {
				ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user vgpr */
				ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id);
			} else {
				ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id);
				ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */
			}
		} else {
			if (args->options->chip_class >= GFX10) {
				if (args->options->key.vs_common_out.as_ngg) {
					ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user vgpr */
					ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user vgpr */
					ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id);
				} else {
					ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */
					ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->vs_prim_id);
					ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id);
				}
			} else {
				ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id);
				ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->vs_prim_id);
				ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */
			}
		}
	}
}
643
/* Declare the streamout SGPRs for VS/TES: the config and write-index
 * registers, plus one buffer offset per streamout buffer with a non-zero
 * stride. With NGG streamout these registers are unused; only a TES
 * placeholder slot is declared to keep the argument layout. */
static void
declare_streamout_sgprs(struct radv_shader_args *args, gl_shader_stage stage)
{
	int i;

	if (args->options->use_ngg_streamout) {
		if (stage == MESA_SHADER_TESS_EVAL)
			ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
		return;
	}

	/* Streamout SGPRs. */
	if (args->shader_info->so.num_outputs) {
		assert(stage == MESA_SHADER_VERTEX ||
		       stage == MESA_SHADER_TESS_EVAL);

		ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->streamout_config);
		ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->streamout_write_idx);
	} else if (stage == MESA_SHADER_TESS_EVAL) {
		/* Placeholder to keep the TES argument layout stable. */
		ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
	}

	/* A streamout buffer offset is loaded if the stride is non-zero. */
	for (i = 0; i < 4; i++) {
		if (!args->shader_info->so.strides[i])
			continue;

		ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->streamout_offset[i]);
	}
}
674
/* Declare the TES input VGPRs: the tessellation coordinates (u, v), the
 * relative patch id, and the patch id. */
static void
declare_tes_input_vgprs(struct radv_shader_args *args)
{
	ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->tes_u);
	ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->tes_v);
	ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->tes_rel_patch_id);
	ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tes_patch_id);
}
683
/* Assign user SGPR indices for the global arguments declared by
 * declare_global_input_sgprs(), in the same order: descriptor sets (or the
 * indirect table pointer), push constants, inlined push constants, and the
 * streamout buffers pointer. */
static void
set_global_input_locs(struct radv_shader_args *args,
		      const struct user_sgpr_info *user_sgpr_info,
		      uint8_t *user_sgpr_idx)
{
	uint32_t mask = args->shader_info->desc_set_used_mask;

	if (!user_sgpr_info->indirect_all_descriptor_sets) {
		/* One SGPR per directly-bound descriptor set. */
		while (mask) {
			int i = u_bit_scan(&mask);

			set_loc_desc(args, i, user_sgpr_idx);
		}
	} else {
		/* One pointer to a table holding all the sets. */
		set_loc_shader_ptr(args, AC_UD_INDIRECT_DESCRIPTOR_SETS,
				   user_sgpr_idx);

		args->shader_info->need_indirect_descriptor_sets = true;
	}

	if (args->shader_info->loads_push_constants) {
		set_loc_shader_ptr(args, AC_UD_PUSH_CONSTANTS, user_sgpr_idx);
	}

	if (args->shader_info->num_inline_push_consts) {
		set_loc_shader(args, AC_UD_INLINE_PUSH_CONSTANTS, user_sgpr_idx,
			       args->shader_info->num_inline_push_consts);
	}

	if (args->streamout_buffers.used) {
		set_loc_shader_ptr(args, AC_UD_STREAMOUT_BUFFERS,
				   user_sgpr_idx);
	}
}
718
719 static void
720 load_descriptor_sets(struct radv_shader_context *ctx)
721 {
722 uint32_t mask = ctx->args->shader_info->desc_set_used_mask;
723 if (ctx->args->shader_info->need_indirect_descriptor_sets) {
724 LLVMValueRef desc_sets =
725 ac_get_arg(&ctx->ac, ctx->args->descriptor_sets[0]);
726 while (mask) {
727 int i = u_bit_scan(&mask);
728
729 ctx->descriptor_sets[i] =
730 ac_build_load_to_sgpr(&ctx->ac, desc_sets,
731 LLVMConstInt(ctx->ac.i32, i, false));
732
733 }
734 } else {
735 while (mask) {
736 int i = u_bit_scan(&mask);
737
738 ctx->descriptor_sets[i] =
739 ac_get_arg(&ctx->ac, ctx->args->descriptor_sets[i]);
740 }
741 }
742 }
743
744
745 static void
746 set_vs_specific_input_locs(struct radv_shader_args *args,
747 gl_shader_stage stage, bool has_previous_stage,
748 gl_shader_stage previous_stage,
749 uint8_t *user_sgpr_idx)
750 {
751 if (!args->is_gs_copy_shader &&
752 (stage == MESA_SHADER_VERTEX ||
753 (has_previous_stage && previous_stage == MESA_SHADER_VERTEX))) {
754 if (args->shader_info->vs.has_vertex_buffers) {
755 set_loc_shader_ptr(args, AC_UD_VS_VERTEX_BUFFERS,
756 user_sgpr_idx);
757 }
758
759 unsigned vs_num = 2;
760 if (args->shader_info->vs.needs_draw_id)
761 vs_num++;
762
763 set_loc_shader(args, AC_UD_VS_BASE_VERTEX_START_INSTANCE,
764 user_sgpr_idx, vs_num);
765 }
766 }
767
768 static enum ac_llvm_calling_convention
769 get_llvm_calling_convention(LLVMValueRef func, gl_shader_stage stage)
770 {
771 switch (stage) {
772 case MESA_SHADER_VERTEX:
773 case MESA_SHADER_TESS_EVAL:
774 return AC_LLVM_AMDGPU_VS;
775 break;
776 case MESA_SHADER_GEOMETRY:
777 return AC_LLVM_AMDGPU_GS;
778 break;
779 case MESA_SHADER_TESS_CTRL:
780 return AC_LLVM_AMDGPU_HS;
781 break;
782 case MESA_SHADER_FRAGMENT:
783 return AC_LLVM_AMDGPU_PS;
784 break;
785 case MESA_SHADER_COMPUTE:
786 return AC_LLVM_AMDGPU_CS;
787 break;
788 default:
789 unreachable("Unhandle shader type");
790 }
791 }
792
793 /* Returns whether the stage is a stage that can be directly before the GS */
794 static bool is_pre_gs_stage(gl_shader_stage stage)
795 {
796 return stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_TESS_EVAL;
797 }
798
799 static void declare_inputs(struct radv_shader_args *args,
800 gl_shader_stage stage,
801 bool has_previous_stage,
802 gl_shader_stage previous_stage)
803 {
804 struct user_sgpr_info user_sgpr_info;
805 bool needs_view_index = needs_view_index_sgpr(args, stage);
806
807 if (args->options->chip_class >= GFX10) {
808 if (is_pre_gs_stage(stage) && args->options->key.vs_common_out.as_ngg) {
809 /* On GFX10, VS is merged into GS for NGG. */
810 previous_stage = stage;
811 stage = MESA_SHADER_GEOMETRY;
812 has_previous_stage = true;
813 }
814 }
815
816 for (int i = 0; i < MAX_SETS; i++)
817 args->shader_info->user_sgprs_locs.descriptor_sets[i].sgpr_idx = -1;
818 for (int i = 0; i < AC_UD_MAX_UD; i++)
819 args->shader_info->user_sgprs_locs.shader_data[i].sgpr_idx = -1;
820
821
822 allocate_user_sgprs(args, stage, has_previous_stage,
823 previous_stage, needs_view_index, &user_sgpr_info);
824
825 if (user_sgpr_info.need_ring_offsets && !args->options->supports_spill) {
826 ac_add_arg(&args->ac, AC_ARG_SGPR, 2, AC_ARG_CONST_DESC_PTR,
827 &args->ring_offsets);
828 }
829
830 switch (stage) {
831 case MESA_SHADER_COMPUTE:
832 declare_global_input_sgprs(args, &user_sgpr_info);
833
834 if (args->shader_info->cs.uses_grid_size) {
835 ac_add_arg(&args->ac, AC_ARG_SGPR, 3, AC_ARG_INT,
836 &args->ac.num_work_groups);
837 }
838
839 for (int i = 0; i < 3; i++) {
840 if (args->shader_info->cs.uses_block_id[i]) {
841 ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT,
842 &args->ac.workgroup_ids[i]);
843 }
844 }
845
846 if (args->shader_info->cs.uses_local_invocation_idx) {
847 ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT,
848 &args->ac.tg_size);
849 }
850
851 ac_add_arg(&args->ac, AC_ARG_VGPR, 3, AC_ARG_INT,
852 &args->ac.local_invocation_ids);
853 break;
854 case MESA_SHADER_VERTEX:
855 declare_global_input_sgprs(args, &user_sgpr_info);
856
857 declare_vs_specific_input_sgprs(args, stage, has_previous_stage,
858 previous_stage);
859
860 if (needs_view_index) {
861 ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT,
862 &args->ac.view_index);
863 }
864
865 if (args->options->key.vs_common_out.as_es) {
866 ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT,
867 &args->es2gs_offset);
868 } else if (args->options->key.vs_common_out.as_ls) {
869 /* no extra parameters */
870 } else {
871 declare_streamout_sgprs(args, stage);
872 }
873
874 declare_vs_input_vgprs(args);
875 break;
876 case MESA_SHADER_TESS_CTRL:
877 if (has_previous_stage) {
878 // First 6 system regs
879 ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->oc_lds);
880 ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT,
881 &args->merged_wave_info);
882 ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT,
883 &args->tess_factor_offset);
884
885 ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); // scratch offset
886 ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); // unknown
887 ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); // unknown
888
889 declare_global_input_sgprs(args, &user_sgpr_info);
890
891 declare_vs_specific_input_sgprs(args, stage,
892 has_previous_stage,
893 previous_stage);
894
895 if (needs_view_index) {
896 ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT,
897 &args->ac.view_index);
898 }
899
900 ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT,
901 &args->ac.tcs_patch_id);
902 ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT,
903 &args->ac.tcs_rel_ids);
904
905 declare_vs_input_vgprs(args);
906 } else {
907 declare_global_input_sgprs(args, &user_sgpr_info);
908
909 if (needs_view_index) {
910 ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT,
911 &args->ac.view_index);
912 }
913
914 ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->oc_lds);
915 ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT,
916 &args->tess_factor_offset);
917 ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT,
918 &args->ac.tcs_patch_id);
919 ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT,
920 &args->ac.tcs_rel_ids);
921 }
922 break;
923 case MESA_SHADER_TESS_EVAL:
924 declare_global_input_sgprs(args, &user_sgpr_info);
925
926 if (needs_view_index)
927 ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT,
928 &args->ac.view_index);
929
930 if (args->options->key.vs_common_out.as_es) {
931 ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->oc_lds);
932 ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
933 ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT,
934 &args->es2gs_offset);
935 } else {
936 declare_streamout_sgprs(args, stage);
937 ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->oc_lds);
938 }
939 declare_tes_input_vgprs(args);
940 break;
941 case MESA_SHADER_GEOMETRY:
942 if (has_previous_stage) {
943 // First 6 system regs
944 if (args->options->key.vs_common_out.as_ngg) {
945 ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT,
946 &args->gs_tg_info);
947 } else {
948 ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT,
949 &args->gs2vs_offset);
950 }
951
952 ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT,
953 &args->merged_wave_info);
954 ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->oc_lds);
955
956 ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); // scratch offset
957 ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); // unknown
958 ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); // unknown
959
960 declare_global_input_sgprs(args, &user_sgpr_info);
961
962 if (previous_stage != MESA_SHADER_TESS_EVAL) {
963 declare_vs_specific_input_sgprs(args, stage,
964 has_previous_stage,
965 previous_stage);
966 }
967
968 if (needs_view_index) {
969 ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT,
970 &args->ac.view_index);
971 }
972
973 ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT,
974 &args->gs_vtx_offset[0]);
975 ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT,
976 &args->gs_vtx_offset[2]);
977 ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT,
978 &args->ac.gs_prim_id);
979 ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT,
980 &args->ac.gs_invocation_id);
981 ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT,
982 &args->gs_vtx_offset[4]);
983
984 if (previous_stage == MESA_SHADER_VERTEX) {
985 declare_vs_input_vgprs(args);
986 } else {
987 declare_tes_input_vgprs(args);
988 }
989 } else {
990 declare_global_input_sgprs(args, &user_sgpr_info);
991
992 if (needs_view_index) {
993 ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT,
994 &args->ac.view_index);
995 }
996
997 ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->gs2vs_offset);
998 ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->gs_wave_id);
999 ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT,
1000 &args->gs_vtx_offset[0]);
1001 ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT,
1002 &args->gs_vtx_offset[1]);
1003 ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT,
1004 &args->ac.gs_prim_id);
1005 ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT,
1006 &args->gs_vtx_offset[2]);
1007 ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT,
1008 &args->gs_vtx_offset[3]);
1009 ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT,
1010 &args->gs_vtx_offset[4]);
1011 ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT,
1012 &args->gs_vtx_offset[5]);
1013 ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT,
1014 &args->ac.gs_invocation_id);
1015 }
1016 break;
1017 case MESA_SHADER_FRAGMENT:
1018 declare_global_input_sgprs(args, &user_sgpr_info);
1019
1020 ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.prim_mask);
1021 ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.persp_sample);
1022 ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.persp_center);
1023 ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.persp_centroid);
1024 ac_add_arg(&args->ac, AC_ARG_VGPR, 3, AC_ARG_INT, NULL); /* persp pull model */
1025 ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.linear_sample);
1026 ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.linear_center);
1027 ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.linear_centroid);
1028 ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL); /* line stipple tex */
1029 ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.frag_pos[0]);
1030 ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.frag_pos[1]);
1031 ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.frag_pos[2]);
1032 ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.frag_pos[3]);
1033 ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.front_face);
1034 ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.ancillary);
1035 ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.sample_coverage);
1036 ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* fixed pt */
1037 break;
1038 default:
1039 unreachable("Shader stage not implemented");
1040 }
1041
1042 args->shader_info->num_input_vgprs = 0;
1043 args->shader_info->num_input_sgprs = args->options->supports_spill ? 2 : 0;
1044 args->shader_info->num_input_sgprs += args->ac.num_sgprs_used;
1045
1046 if (stage != MESA_SHADER_FRAGMENT)
1047 args->shader_info->num_input_vgprs = args->ac.num_vgprs_used;
1048
1049 uint8_t user_sgpr_idx = 0;
1050
1051 if (args->options->supports_spill || user_sgpr_info.need_ring_offsets) {
1052 set_loc_shader_ptr(args, AC_UD_SCRATCH_RING_OFFSETS,
1053 &user_sgpr_idx);
1054 }
1055
1056 /* For merged shaders the user SGPRs start at 8, with 8 system SGPRs in front (including
1057 * the rw_buffers at s0/s1. With user SGPR0 = s8, lets restart the count from 0 */
1058 if (has_previous_stage)
1059 user_sgpr_idx = 0;
1060
1061 set_global_input_locs(args, &user_sgpr_info, &user_sgpr_idx);
1062
1063 switch (stage) {
1064 case MESA_SHADER_COMPUTE:
1065 if (args->shader_info->cs.uses_grid_size) {
1066 set_loc_shader(args, AC_UD_CS_GRID_SIZE,
1067 &user_sgpr_idx, 3);
1068 }
1069 break;
1070 case MESA_SHADER_VERTEX:
1071 set_vs_specific_input_locs(args, stage, has_previous_stage,
1072 previous_stage, &user_sgpr_idx);
1073 if (args->ac.view_index.used)
1074 set_loc_shader(args, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1);
1075 break;
1076 case MESA_SHADER_TESS_CTRL:
1077 set_vs_specific_input_locs(args, stage, has_previous_stage,
1078 previous_stage, &user_sgpr_idx);
1079 if (args->ac.view_index.used)
1080 set_loc_shader(args, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1);
1081 break;
1082 case MESA_SHADER_TESS_EVAL:
1083 if (args->ac.view_index.used)
1084 set_loc_shader(args, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1);
1085 break;
1086 case MESA_SHADER_GEOMETRY:
1087 if (has_previous_stage) {
1088 if (previous_stage == MESA_SHADER_VERTEX)
1089 set_vs_specific_input_locs(args, stage,
1090 has_previous_stage,
1091 previous_stage,
1092 &user_sgpr_idx);
1093 }
1094 if (args->ac.view_index.used)
1095 set_loc_shader(args, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1);
1096 break;
1097 case MESA_SHADER_FRAGMENT:
1098 break;
1099 default:
1100 unreachable("Shader stage not implemented");
1101 }
1102
1103 args->shader_info->num_user_sgprs = user_sgpr_idx;
1104 }
1105
/* Create the LLVM entry point for the current stage and set up
 * function-level state: the scratch/ring-offsets descriptor, the
 * descriptor-set pointers and, for stages that need it, LDS.
 */
static void create_function(struct radv_shader_context *ctx,
			    gl_shader_stage stage,
			    bool has_previous_stage)
{
	if (ctx->ac.chip_class >= GFX10) {
		if (is_pre_gs_stage(stage) && ctx->args->options->key.vs_common_out.as_ngg) {
			/* On GFX10, VS is merged into GS for NGG. */
			stage = MESA_SHADER_GEOMETRY;
			has_previous_stage = true;
		}
	}

	/* NOTE(review): ctx->main_function is passed to
	 * get_llvm_calling_convention() before being assigned below —
	 * presumably the callee ignores the function value; confirm. */
	ctx->main_function = create_llvm_function(
		&ctx->ac, ctx->ac.module, ctx->ac.builder, &ctx->args->ac,
		get_llvm_calling_convention(ctx->main_function, stage),
		ctx->max_workgroup_size,
		ctx->args->options);

	if (ctx->args->options->supports_spill) {
		/* With spilling, the ring-offsets descriptor is obtained via
		 * the implicit buffer pointer intrinsic instead of an SGPR
		 * argument. */
		ctx->ring_offsets = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.implicit.buffer.ptr",
						       LLVMPointerType(ctx->ac.i8, AC_ADDR_SPACE_CONST),
						       NULL, 0, AC_FUNC_ATTR_READNONE);
		ctx->ring_offsets = LLVMBuildBitCast(ctx->ac.builder, ctx->ring_offsets,
						     ac_array_in_const_addr_space(ctx->ac.v4i32), "");
	} else if (ctx->args->ring_offsets.used) {
		ctx->ring_offsets = ac_get_arg(&ctx->ac, ctx->args->ring_offsets);
	}

	load_descriptor_sets(ctx);

	if (stage == MESA_SHADER_TESS_CTRL ||
	    (stage == MESA_SHADER_VERTEX && ctx->args->options->key.vs_common_out.as_ls) ||
	    /* GFX9 has the ESGS ring buffer in LDS. */
	    (stage == MESA_SHADER_GEOMETRY && has_previous_stage)) {
		ac_declare_lds_as_pointer(&ctx->ac);
	}

}
1144
1145
/* ac_shader_abi hook: return a pointer to (or, for inline uniform
 * blocks, a full buffer descriptor for) the descriptor at
 * (desc_set, binding), indexed by 'index'.
 *
 * Dynamic UBO/SSBO descriptors live after the push constants, so the
 * push-constant pointer is used as the base instead of the descriptor
 * set pointer.
 */
static LLVMValueRef
radv_load_resource(struct ac_shader_abi *abi, LLVMValueRef index,
                   unsigned desc_set, unsigned binding)
{
	struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
	LLVMValueRef desc_ptr = ctx->descriptor_sets[desc_set];
	struct radv_pipeline_layout *pipeline_layout = ctx->args->options->layout;
	struct radv_descriptor_set_layout *layout = pipeline_layout->set[desc_set].layout;
	unsigned base_offset = layout->binding[binding].offset;
	LLVMValueRef offset, stride;

	if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
	    layout->binding[binding].type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) {
		/* Dynamic descriptors: 16 bytes each, stored right after the
		 * push constants. */
		unsigned idx = pipeline_layout->set[desc_set].dynamic_offset_start +
			layout->binding[binding].dynamic_offset_offset;
		desc_ptr = ac_get_arg(&ctx->ac, ctx->args->ac.push_constants);
		base_offset = pipeline_layout->push_constant_size + 16 * idx;
		stride = LLVMConstInt(ctx->ac.i32, 16, false);
	} else
		stride = LLVMConstInt(ctx->ac.i32, layout->binding[binding].size, false);

	offset = LLVMConstInt(ctx->ac.i32, base_offset, false);

	if (layout->binding[binding].type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
		/* offset = index * stride + base_offset */
		offset = ac_build_imad(&ctx->ac, index, stride, offset);
	}

	desc_ptr = LLVMBuildGEP(ctx->ac.builder, desc_ptr, &offset, 1, "");
	desc_ptr = ac_cast_ptr(&ctx->ac, desc_ptr, ctx->ac.v4i32);
	LLVMSetMetadata(desc_ptr, ctx->ac.uniform_md_kind, ctx->ac.empty_md);

	if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
		/* Inline uniform block: build a raw buffer descriptor that
		 * points directly at the inlined data. */
		uint32_t desc_type = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
			S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
			S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
			S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);

		/* The format/OOB fields moved around on GFX10. */
		if (ctx->ac.chip_class >= GFX10) {
			desc_type |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
				     S_008F0C_OOB_SELECT(3) |
				     S_008F0C_RESOURCE_LEVEL(1);
		} else {
			desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
				     S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
		}

		LLVMValueRef desc_components[4] = {
			LLVMBuildPtrToInt(ctx->ac.builder, desc_ptr, ctx->ac.intptr, ""),
			LLVMConstInt(ctx->ac.i32, S_008F04_BASE_ADDRESS_HI(ctx->args->options->address32_hi), false),
			/* High limit to support variable sizes. */
			LLVMConstInt(ctx->ac.i32, 0xffffffff, false),
			LLVMConstInt(ctx->ac.i32, desc_type, false),
		};

		return ac_build_gather_values(&ctx->ac, desc_components, 4);
	}

	return desc_ptr;
}
1205
1206
1207 /* The offchip buffer layout for TCS->TES is
1208 *
1209 * - attribute 0 of patch 0 vertex 0
1210 * - attribute 0 of patch 0 vertex 1
1211 * - attribute 0 of patch 0 vertex 2
1212 * ...
1213 * - attribute 0 of patch 1 vertex 0
1214 * - attribute 0 of patch 1 vertex 1
1215 * ...
1216 * - attribute 1 of patch 0 vertex 0
1217 * - attribute 1 of patch 0 vertex 1
1218 * ...
1219 * - per patch attribute 0 of patch 0
1220 * - per patch attribute 0 of patch 1
1221 * ...
1222 *
1223 * Note that every attribute has 4 components.
1224 */
1225 static LLVMValueRef get_non_vertex_index_offset(struct radv_shader_context *ctx)
1226 {
1227 uint32_t num_patches = ctx->tcs_num_patches;
1228 uint32_t num_tcs_outputs;
1229 if (ctx->stage == MESA_SHADER_TESS_CTRL)
1230 num_tcs_outputs = util_last_bit64(ctx->args->shader_info->tcs.outputs_written);
1231 else
1232 num_tcs_outputs = ctx->args->options->key.tes.tcs_num_outputs;
1233
1234 uint32_t output_vertex_size = num_tcs_outputs * 16;
1235 uint32_t pervertex_output_patch_size = ctx->shader->info.tess.tcs_vertices_out * output_vertex_size;
1236
1237 return LLVMConstInt(ctx->ac.i32, pervertex_output_patch_size * num_patches, false);
1238 }
1239
1240 static LLVMValueRef calc_param_stride(struct radv_shader_context *ctx,
1241 LLVMValueRef vertex_index)
1242 {
1243 LLVMValueRef param_stride;
1244 if (vertex_index)
1245 param_stride = LLVMConstInt(ctx->ac.i32, ctx->shader->info.tess.tcs_vertices_out * ctx->tcs_num_patches, false);
1246 else
1247 param_stride = LLVMConstInt(ctx->ac.i32, ctx->tcs_num_patches, false);
1248 return param_stride;
1249 }
1250
/* Compute the byte address into the TCS->TES offchip buffer for
 * (rel_patch_id, vertex_index, param_index).  With a vertex_index the
 * address targets the per-vertex region; without one it targets the
 * per-patch region, which starts after all per-vertex data.
 */
static LLVMValueRef get_tcs_tes_buffer_address(struct radv_shader_context *ctx,
					       LLVMValueRef vertex_index,
					       LLVMValueRef param_index)
{
	LLVMValueRef base_addr;
	LLVMValueRef param_stride, constant16;
	LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
	LLVMValueRef vertices_per_patch = LLVMConstInt(ctx->ac.i32, ctx->shader->info.tess.tcs_vertices_out, false);
	constant16 = LLVMConstInt(ctx->ac.i32, 16, false);
	param_stride = calc_param_stride(ctx, vertex_index);
	if (vertex_index) {
		/* base = rel_patch_id * vertices_per_patch + vertex_index */
		base_addr = ac_build_imad(&ctx->ac, rel_patch_id,
					  vertices_per_patch, vertex_index);
	} else {
		base_addr = rel_patch_id;
	}

	/* base += param_index * param_stride (still in vec4 units). */
	base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr,
				 LLVMBuildMul(ctx->ac.builder, param_index,
					      param_stride, ""), "");

	/* Scale to bytes: each attribute slot is a 16-byte vec4. */
	base_addr = LLVMBuildMul(ctx->ac.builder, base_addr, constant16, "");

	if (!vertex_index) {
		/* Per-patch attributes live after all per-vertex data. */
		LLVMValueRef patch_data_offset = get_non_vertex_index_offset(ctx);

		base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr,
					 patch_data_offset, "");
	}
	return base_addr;
}
1282
1283 static LLVMValueRef get_tcs_tes_buffer_address_params(struct radv_shader_context *ctx,
1284 unsigned param,
1285 unsigned const_index,
1286 bool is_compact,
1287 LLVMValueRef vertex_index,
1288 LLVMValueRef indir_index)
1289 {
1290 LLVMValueRef param_index;
1291
1292 if (indir_index)
1293 param_index = LLVMBuildAdd(ctx->ac.builder, LLVMConstInt(ctx->ac.i32, param, false),
1294 indir_index, "");
1295 else {
1296 if (const_index && !is_compact)
1297 param += const_index;
1298 param_index = LLVMConstInt(ctx->ac.i32, param, false);
1299 }
1300 return get_tcs_tes_buffer_address(ctx, vertex_index, param_index);
1301 }
1302
/* Compute an LDS dword address for a TCS input/output:
 *
 *   dw_addr + vertex_index * stride      (per-vertex data)
 *           + indir_index * 4            (dynamic index, in vec4 slots)
 *     or    + const_index * 4            (constant index, non-compact)
 *           + param * 4                  (attribute slot, vec4-sized)
 *           + const_index                (compact: component-sized index)
 */
static LLVMValueRef
get_dw_address(struct radv_shader_context *ctx,
	       LLVMValueRef dw_addr,
	       unsigned param,
	       unsigned const_index,
	       bool compact_const_index,
	       LLVMValueRef vertex_index,
	       LLVMValueRef stride,
	       LLVMValueRef indir_index)

{

	if (vertex_index) {
		dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr,
				       LLVMBuildMul(ctx->ac.builder,
						    vertex_index,
						    stride, ""), "");
	}

	if (indir_index)
		/* Each indirect step skips a whole vec4 (4 dwords). */
		dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr,
				       LLVMBuildMul(ctx->ac.builder, indir_index,
						    LLVMConstInt(ctx->ac.i32, 4, false), ""), "");
	else if (const_index && !compact_const_index)
		dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr,
				       LLVMConstInt(ctx->ac.i32, const_index * 4, false), "");

	dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr,
			       LLVMConstInt(ctx->ac.i32, param * 4, false), "");

	/* Compact (e.g. clip/cull distance) indices address individual
	 * dwords within the slot. */
	if (const_index && compact_const_index)
		dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr,
				       LLVMConstInt(ctx->ac.i32, const_index, false), "");
	return dw_addr;
}
1338
/* ac_shader_abi hook: load TCS inputs or previously-written TCS
 * outputs from LDS.  Inputs come from the current input patch; outputs
 * from the per-vertex or per-patch output region of the patch.
 */
static LLVMValueRef
load_tcs_varyings(struct ac_shader_abi *abi,
		  LLVMTypeRef type,
		  LLVMValueRef vertex_index,
		  LLVMValueRef indir_index,
		  unsigned const_index,
		  unsigned location,
		  unsigned driver_location,
		  unsigned component,
		  unsigned num_components,
		  bool is_patch,
		  bool is_compact,
		  bool load_input)
{
	struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
	LLVMValueRef dw_addr, stride;
	LLVMValueRef value[4], result;
	unsigned param = shader_io_get_unique_index(location);

	if (load_input) {
		/* Inputs: stride is the input vertex size in dwords. */
		uint32_t input_vertex_size = (ctx->tcs_num_inputs * 16) / 4;
		stride = LLVMConstInt(ctx->ac.i32, input_vertex_size, false);
		dw_addr = get_tcs_in_current_patch_offset(ctx);
	} else {
		if (!is_patch) {
			stride = get_tcs_out_vertex_stride(ctx);
			dw_addr = get_tcs_out_current_patch_offset(ctx);
		} else {
			/* Per-patch outputs: no per-vertex stride. */
			dw_addr = get_tcs_out_current_patch_data_offset(ctx);
			stride = NULL;
		}
	}

	dw_addr = get_dw_address(ctx, dw_addr, param, const_index, is_compact, vertex_index, stride,
				 indir_index);

	/* Loads start at dword 0 of the slot; the gather below picks the
	 * requested components.  NOTE(review): assumes
	 * component + num_components <= 4 (value[] has 4 slots) — confirm
	 * against how the NIR-to-LLVM pass splits wide varyings. */
	for (unsigned i = 0; i < num_components + component; i++) {
		value[i] = ac_lds_load(&ctx->ac, dw_addr);
		dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr,
				       ctx->ac.i32_1, "");
	}
	result = ac_build_varying_gather_values(&ctx->ac, value, num_components, component);
	return result;
}
1383
/* ac_shader_abi hook: store a TCS output.  The value goes to LDS when
 * the TCS reads it back (or it is a tess factor), and to the offchip
 * buffer for consumption by the TES.
 */
static void
store_tcs_output(struct ac_shader_abi *abi,
		 const nir_variable *var,
		 LLVMValueRef vertex_index,
		 LLVMValueRef param_index,
		 unsigned const_index,
		 LLVMValueRef src,
		 unsigned writemask)
{
	struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
	const unsigned location = var->data.location;
	unsigned component = var->data.location_frac;
	const bool is_patch = var->data.patch;
	const bool is_compact = var->data.compact;
	LLVMValueRef dw_addr;
	LLVMValueRef stride = NULL;
	LLVMValueRef buf_addr = NULL;
	LLVMValueRef oc_lds = ac_get_arg(&ctx->ac, ctx->args->oc_lds);
	unsigned param;
	bool store_lds = true;

	/* Only mirror to LDS when the shader itself reads the output. */
	if (is_patch) {
		if (!(ctx->shader->info.patch_outputs_read & (1U << (location - VARYING_SLOT_PATCH0))))
			store_lds = false;
	} else {
		if (!(ctx->shader->info.outputs_read & (1ULL << location)))
			store_lds = false;
	}

	param = shader_io_get_unique_index(location);
	/* Compact clip/cull distances: fold the component into the
	 * constant index and roll over into the next slot past 4. */
	if ((location == VARYING_SLOT_CLIP_DIST0 || location == VARYING_SLOT_CLIP_DIST1) && is_compact) {
		const_index += component;
		component = 0;

		if (const_index >= 4) {
			const_index -= 4;
			param++;
		}
	}

	if (!is_patch) {
		stride = get_tcs_out_vertex_stride(ctx);
		dw_addr = get_tcs_out_current_patch_offset(ctx);
	} else {
		dw_addr = get_tcs_out_current_patch_data_offset(ctx);
	}

	dw_addr = get_dw_address(ctx, dw_addr, param, const_index, is_compact, vertex_index, stride,
				 param_index);
	buf_addr = get_tcs_tes_buffer_address_params(ctx, param, const_index, is_compact,
						     vertex_index, param_index);

	bool is_tess_factor = false;
	if (location == VARYING_SLOT_TESS_LEVEL_INNER ||
	    location == VARYING_SLOT_TESS_LEVEL_OUTER)
		is_tess_factor = true;

	unsigned base = is_compact ? const_index : 0;
	for (unsigned chan = 0; chan < 8; chan++) {
		if (!(writemask & (1 << chan)))
			continue;
		LLVMValueRef value = ac_llvm_extract_elem(&ctx->ac, src, chan - component);
		value = ac_to_integer(&ctx->ac, value);
		value = LLVMBuildZExtOrBitCast(ctx->ac.builder, value, ctx->ac.i32, "");

		/* Tess factors always go to LDS; they are written to the
		 * tess factor ring elsewhere. */
		if (store_lds || is_tess_factor) {
			LLVMValueRef dw_addr_chan =
				LLVMBuildAdd(ctx->ac.builder, dw_addr,
				             LLVMConstInt(ctx->ac.i32, chan, false), "");
			ac_lds_store(&ctx->ac, dw_addr_chan, value);
		}

		/* Partial writemasks are stored dword-by-dword; a full vec4
		 * is stored in one go below. */
		if (!is_tess_factor && writemask != 0xF)
			ac_build_buffer_store_dword(&ctx->ac, ctx->hs_ring_tess_offchip, value, 1,
						    buf_addr, oc_lds,
						    4 * (base + chan), ac_glc, false);
	}

	if (writemask == 0xF) {
		ac_build_buffer_store_dword(&ctx->ac, ctx->hs_ring_tess_offchip, src, 4,
					    buf_addr, oc_lds,
					    (base * 4), ac_glc, false);
	}
}
1468
/* ac_shader_abi hook: load a TES input from the TCS->TES offchip
 * buffer (layout documented above get_non_vertex_index_offset()).
 */
static LLVMValueRef
load_tes_input(struct ac_shader_abi *abi,
	       LLVMTypeRef type,
	       LLVMValueRef vertex_index,
	       LLVMValueRef param_index,
	       unsigned const_index,
	       unsigned location,
	       unsigned driver_location,
	       unsigned component,
	       unsigned num_components,
	       bool is_patch,
	       bool is_compact,
	       bool load_input)
{
	struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
	LLVMValueRef buf_addr;
	LLVMValueRef result;
	LLVMValueRef oc_lds = ac_get_arg(&ctx->ac, ctx->args->oc_lds);
	unsigned param = shader_io_get_unique_index(location);

	/* Compact clip/cull distances: fold the component into the
	 * constant index, rolling over into the next slot past 4
	 * (mirrors store_tcs_output()). */
	if ((location == VARYING_SLOT_CLIP_DIST0 || location == VARYING_SLOT_CLIP_DIST1) && is_compact) {
		const_index += component;
		component = 0;
		if (const_index >= 4) {
			const_index -= 4;
			param++;
		}
	}

	buf_addr = get_tcs_tes_buffer_address_params(ctx, param, const_index,
						     is_compact, vertex_index, param_index);

	/* Skip to the first requested component (4 bytes each). */
	LLVMValueRef comp_offset = LLVMConstInt(ctx->ac.i32, component * 4, false);
	buf_addr = LLVMBuildAdd(ctx->ac.builder, buf_addr, comp_offset, "");

	result = ac_build_buffer_load(&ctx->ac, ctx->hs_ring_tess_offchip, num_components, NULL,
				      buf_addr, oc_lds, is_compact ? (4 * const_index) : 0, ac_glc, true, false);
	result = ac_trim_vector(&ctx->ac, result, num_components);
	return result;
}
1509
1510 static LLVMValueRef
1511 radv_emit_fetch_64bit(struct radv_shader_context *ctx,
1512 LLVMTypeRef type, LLVMValueRef a, LLVMValueRef b)
1513 {
1514 LLVMValueRef values[2] = {
1515 ac_to_integer(&ctx->ac, a),
1516 ac_to_integer(&ctx->ac, b),
1517 };
1518 LLVMValueRef result = ac_build_gather_values(&ctx->ac, values, 2);
1519 return LLVMBuildBitCast(ctx->ac.builder, result, type, "");
1520 }
1521
/* ac_shader_abi hook: load a GS input.  On GFX9+ (merged ES/GS) the ES
 * outputs live in LDS and gs_vtx_offset is a dword address; on older
 * chips they are read from the ESGS ring buffer, with soffset selecting
 * the dword slot (slot index scaled by 256 — see the computation below).
 */
static LLVMValueRef
load_gs_input(struct ac_shader_abi *abi,
	      unsigned location,
	      unsigned driver_location,
	      unsigned component,
	      unsigned num_components,
	      unsigned vertex_index,
	      unsigned const_index,
	      LLVMTypeRef type)
{
	struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
	LLVMValueRef vtx_offset;
	unsigned param, vtx_offset_param;
	LLVMValueRef value[4], result;

	vtx_offset_param = vertex_index;
	assert(vtx_offset_param < 6);
	/* Pre-GFX9 path: gs_vtx_offset is in 16-bit units, convert to
	 * bytes for the ring load below. */
	vtx_offset = LLVMBuildMul(ctx->ac.builder, ctx->gs_vtx_offset[vtx_offset_param],
	                          LLVMConstInt(ctx->ac.i32, 4, false), "");

	param = shader_io_get_unique_index(location);

	for (unsigned i = component; i < num_components + component; i++) {
		if (ctx->ac.chip_class >= GFX9) {
			/* Merged shaders: read the ES output from LDS. */
			LLVMValueRef dw_addr = ctx->gs_vtx_offset[vtx_offset_param];
			dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr,
			                       LLVMConstInt(ctx->ac.i32, param * 4 + i + const_index, 0), "");
			value[i] = ac_lds_load(&ctx->ac, dw_addr);

			/* 64-bit values take two consecutive dwords. */
			if (ac_get_type_size(type) == 8) {
				dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr,
				                       LLVMConstInt(ctx->ac.i32, param * 4 + i + const_index + 1, 0), "");
				LLVMValueRef tmp = ac_lds_load(&ctx->ac, dw_addr);

				value[i] = radv_emit_fetch_64bit(ctx, type, value[i], tmp);
			}
		} else {
			/* ESGS ring: each dword slot is strided by 256 bytes. */
			LLVMValueRef soffset =
				LLVMConstInt(ctx->ac.i32,
					     (param * 4 + i + const_index) * 256,
					     false);

			value[i] = ac_build_buffer_load(&ctx->ac,
			                                ctx->esgs_ring, 1,
			                                ctx->ac.i32_0,
			                                vtx_offset, soffset,
			                                0, ac_glc, true, false);

			if (ac_get_type_size(type) == 8) {
				soffset = LLVMConstInt(ctx->ac.i32,
						       (param * 4 + i + const_index + 1) * 256,
						       false);

				LLVMValueRef tmp =
					ac_build_buffer_load(&ctx->ac,
							     ctx->esgs_ring, 1,
							     ctx->ac.i32_0,
							     vtx_offset, soffset,
							     0, ac_glc, true, false);

				value[i] = radv_emit_fetch_64bit(ctx, type, value[i], tmp);
			}
		}

		/* 16-bit types are stored widened to 32 bits; truncate back. */
		if (ac_get_type_size(type) == 2) {
			value[i] = LLVMBuildBitCast(ctx->ac.builder, value[i], ctx->ac.i32, "");
			value[i] = LLVMBuildTrunc(ctx->ac.builder, value[i], ctx->ac.i16, "");
		}
		value[i] = LLVMBuildBitCast(ctx->ac.builder, value[i], type, "");
	}
	result = ac_build_varying_gather_values(&ctx->ac, value, num_components, component);
	result = ac_to_integer(&ctx->ac, result);
	return result;
}
1596
1597
1598 static void radv_emit_kill(struct ac_shader_abi *abi, LLVMValueRef visible)
1599 {
1600 struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
1601 ac_build_kill_if_false(&ctx->ac, visible);
1602 }
1603
/* Return the index into the sample-positions table where the entries
 * for the given sample count begin: 1x at 0, 2x at 1, 4x at 3, 8x at 7.
 */
static uint32_t
radv_get_sample_pos_offset(uint32_t num_samples)
{
	switch (num_samples) {
	case 2:
	case 4:
	case 8:
		/* Offsets are 1, 3 and 7 respectively: num_samples - 1. */
		return num_samples - 1;
	default:
		/* 1x (and anything unexpected) starts at the table base. */
		return 0;
	}
}
1624
/* ac_shader_abi hook: return the position of the given sample, read
 * from the driver-provided sample-positions ring.  The table packs
 * positions for all sample counts back to back;
 * radv_get_sample_pos_offset() gives the start index for the current
 * sample count.
 */
static LLVMValueRef load_sample_position(struct ac_shader_abi *abi,
					 LLVMValueRef sample_id)
{
	struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);

	LLVMValueRef result;
	LLVMValueRef index = LLVMConstInt(ctx->ac.i32, RING_PS_SAMPLE_POSITIONS, false);
	LLVMValueRef ptr = LLVMBuildGEP(ctx->ac.builder, ctx->ring_offsets, &index, 1, "");

	/* Each entry is an (x, y) pair of floats. */
	ptr = LLVMBuildBitCast(ctx->ac.builder, ptr,
			       ac_array_in_const_addr_space(ctx->ac.v2f32), "");

	uint32_t sample_pos_offset =
		radv_get_sample_pos_offset(ctx->args->options->key.fs.num_samples);

	sample_id =
		LLVMBuildAdd(ctx->ac.builder, sample_id,
			     LLVMConstInt(ctx->ac.i32, sample_pos_offset, false), "");
	result = ac_build_load_invariant(&ctx->ac, ptr, sample_id);

	return result;
}
1647
1648
/* ac_shader_abi hook: compute gl_SampleMaskIn.  When shading per
 * sample, the hardware coverage is ANDed with a mask that keeps only
 * the bits belonging to the current sample's group of samples.
 */
static LLVMValueRef load_sample_mask_in(struct ac_shader_abi *abi)
{
	struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
	uint8_t log2_ps_iter_samples;

	if (ctx->args->shader_info->ps.force_persample) {
		/* Forced per-sample shading: iterate over every sample. */
		log2_ps_iter_samples =
			util_logbase2(ctx->args->options->key.fs.num_samples);
	} else {
		log2_ps_iter_samples = ctx->args->options->key.fs.log2_ps_iter_samples;
	}

	/* The bit pattern matches that used by fixed function fragment
	 * processing. */
	static const uint16_t ps_iter_masks[] = {
		0xffff, /* not used */
		0x5555,
		0x1111,
		0x0101,
		0x0001,
	};
	assert(log2_ps_iter_samples < ARRAY_SIZE(ps_iter_masks));

	uint32_t ps_iter_mask = ps_iter_masks[log2_ps_iter_samples];

	LLVMValueRef result, sample_id;
	/* Sample ID lives in bits [11:8] of the ancillary VGPR. */
	sample_id = ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.ancillary), 8, 4);
	/* mask << sample_id selects this sample's bits. */
	sample_id = LLVMBuildShl(ctx->ac.builder, LLVMConstInt(ctx->ac.i32, ps_iter_mask, false), sample_id, "");
	result = LLVMBuildAnd(ctx->ac.builder, sample_id,
			      ac_get_arg(&ctx->ac, ctx->args->ac.sample_coverage), "");
	return result;
}
1681
1682
1683 static void gfx10_ngg_gs_emit_vertex(struct radv_shader_context *ctx,
1684 unsigned stream,
1685 LLVMValueRef *addrs);
1686
/* ac_shader_abi hook for EmitVertex(): write all outputs of the current
 * vertex for the given stream to the GSVS ring, bump the per-stream
 * vertex counter and send the EMIT message.  NGG uses a separate path.
 */
static void
visit_emit_vertex(struct ac_shader_abi *abi, unsigned stream, LLVMValueRef *addrs)
{
	LLVMValueRef gs_next_vertex;
	LLVMValueRef can_emit;
	unsigned offset = 0;
	struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);

	if (ctx->args->options->key.vs_common_out.as_ngg) {
		gfx10_ngg_gs_emit_vertex(ctx, stream, addrs);
		return;
	}

	/* Write vertex attribute values to GSVS ring */
	gs_next_vertex = LLVMBuildLoad(ctx->ac.builder,
				       ctx->gs_next_vertex[stream],
				       "");

	/* If this thread has already emitted the declared maximum number of
	 * vertices, don't emit any more: excessive vertex emissions are not
	 * supposed to have any effect.
	 */
	can_emit = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, gs_next_vertex,
				 LLVMConstInt(ctx->ac.i32, ctx->shader->info.gs.vertices_out, false), "");

	/* Killing the thread is only safe when the GS has no memory side
	 * effects; otherwise guard the stores with a branch instead. */
	bool use_kill = !ctx->args->shader_info->gs.writes_memory;
	if (use_kill)
		ac_build_kill_if_false(&ctx->ac, can_emit);
	else
		ac_build_ifcc(&ctx->ac, can_emit, 6505);

	for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
		unsigned output_usage_mask =
			ctx->args->shader_info->gs.output_usage_mask[i];
		uint8_t output_stream =
			ctx->args->shader_info->gs.output_streams[i];
		LLVMValueRef *out_ptr = &addrs[i * 4];
		int length = util_last_bit(output_usage_mask);

		/* Skip unwritten outputs and outputs of other streams. */
		if (!(ctx->output_mask & (1ull << i)) ||
		    output_stream != stream)
			continue;

		for (unsigned j = 0; j < length; j++) {
			if (!(output_usage_mask & (1 << j)))
				continue;

			LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder,
							     out_ptr[j], "");
			/* Ring layout: each output dword slot holds one value
			 * per emitted vertex, so slots are strided by
			 * vertices_out. */
			LLVMValueRef voffset =
				LLVMConstInt(ctx->ac.i32, offset *
					     ctx->shader->info.gs.vertices_out, false);

			offset++;

			voffset = LLVMBuildAdd(ctx->ac.builder, voffset, gs_next_vertex, "");
			voffset = LLVMBuildMul(ctx->ac.builder, voffset, LLVMConstInt(ctx->ac.i32, 4, false), "");

			out_val = ac_to_integer(&ctx->ac, out_val);
			out_val = LLVMBuildZExtOrBitCast(ctx->ac.builder, out_val, ctx->ac.i32, "");

			ac_build_buffer_store_dword(&ctx->ac,
						    ctx->gsvs_ring[stream],
						    out_val, 1,
						    voffset,
						    ac_get_arg(&ctx->ac,
							       ctx->args->gs2vs_offset),
						    0, ac_glc | ac_slc, true);
		}
	}

	gs_next_vertex = LLVMBuildAdd(ctx->ac.builder, gs_next_vertex,
				      ctx->ac.i32_1, "");
	LLVMBuildStore(ctx->ac.builder, gs_next_vertex, ctx->gs_next_vertex[stream]);

	ac_build_sendmsg(&ctx->ac,
			 AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
			 ctx->gs_wave_id);

	/* Close the branch opened when kill was not usable. */
	if (!use_kill)
		ac_build_endif(&ctx->ac, 6505);
}
1769
1770 static void
1771 visit_end_primitive(struct ac_shader_abi *abi, unsigned stream)
1772 {
1773 struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
1774
1775 if (ctx->args->options->key.vs_common_out.as_ngg) {
1776 LLVMBuildStore(ctx->ac.builder, ctx->ac.i32_0, ctx->gs_curprim_verts[stream]);
1777 return;
1778 }
1779
1780 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8), ctx->gs_wave_id);
1781 }
1782
1783 static LLVMValueRef
1784 load_tess_coord(struct ac_shader_abi *abi)
1785 {
1786 struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
1787
1788 LLVMValueRef coord[4] = {
1789 ac_get_arg(&ctx->ac, ctx->args->tes_u),
1790 ac_get_arg(&ctx->ac, ctx->args->tes_v),
1791 ctx->ac.f32_0,
1792 ctx->ac.f32_0,
1793 };
1794
1795 if (ctx->shader->info.tess.primitive_mode == GL_TRIANGLES)
1796 coord[2] = LLVMBuildFSub(ctx->ac.builder, ctx->ac.f32_1,
1797 LLVMBuildFAdd(ctx->ac.builder, coord[0], coord[1], ""), "");
1798
1799 return ac_build_gather_values(&ctx->ac, coord, 3);
1800 }
1801
1802 static LLVMValueRef
1803 load_patch_vertices_in(struct ac_shader_abi *abi)
1804 {
1805 struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
1806 return LLVMConstInt(ctx->ac.i32, ctx->args->options->key.tcs.input_vertices, false);
1807 }
1808
1809
1810 static LLVMValueRef radv_load_base_vertex(struct ac_shader_abi *abi)
1811 {
1812 struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
1813 return ac_get_arg(&ctx->ac, ctx->args->ac.base_vertex);
1814 }
1815
1816 static LLVMValueRef radv_load_ssbo(struct ac_shader_abi *abi,
1817 LLVMValueRef buffer_ptr, bool write)
1818 {
1819 struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
1820 LLVMValueRef result;
1821
1822 LLVMSetMetadata(buffer_ptr, ctx->ac.uniform_md_kind, ctx->ac.empty_md);
1823
1824 result = LLVMBuildLoad(ctx->ac.builder, buffer_ptr, "");
1825 LLVMSetMetadata(result, ctx->ac.invariant_load_md_kind, ctx->ac.empty_md);
1826
1827 return result;
1828 }
1829
1830 static LLVMValueRef radv_load_ubo(struct ac_shader_abi *abi, LLVMValueRef buffer_ptr)
1831 {
1832 struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
1833 LLVMValueRef result;
1834
1835 if (LLVMGetTypeKind(LLVMTypeOf(buffer_ptr)) != LLVMPointerTypeKind) {
1836 /* Do not load the descriptor for inlined uniform blocks. */
1837 return buffer_ptr;
1838 }
1839
1840 LLVMSetMetadata(buffer_ptr, ctx->ac.uniform_md_kind, ctx->ac.empty_md);
1841
1842 result = LLVMBuildLoad(ctx->ac.builder, buffer_ptr, "");
1843 LLVMSetMetadata(result, ctx->ac.invariant_load_md_kind, ctx->ac.empty_md);
1844
1845 return result;
1846 }
1847
1848 static LLVMValueRef radv_get_sampler_desc(struct ac_shader_abi *abi,
1849 unsigned descriptor_set,
1850 unsigned base_index,
1851 unsigned constant_index,
1852 LLVMValueRef index,
1853 enum ac_descriptor_type desc_type,
1854 bool image, bool write,
1855 bool bindless)
1856 {
1857 struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
1858 LLVMValueRef list = ctx->descriptor_sets[descriptor_set];
1859 struct radv_descriptor_set_layout *layout = ctx->args->options->layout->set[descriptor_set].layout;
1860 struct radv_descriptor_set_binding_layout *binding = layout->binding + base_index;
1861 unsigned offset = binding->offset;
1862 unsigned stride = binding->size;
1863 unsigned type_size;
1864 LLVMBuilderRef builder = ctx->ac.builder;
1865 LLVMTypeRef type;
1866
1867 assert(base_index < layout->binding_count);
1868
1869 switch (desc_type) {
1870 case AC_DESC_IMAGE:
1871 type = ctx->ac.v8i32;
1872 type_size = 32;
1873 break;
1874 case AC_DESC_FMASK:
1875 type = ctx->ac.v8i32;
1876 offset += 32;
1877 type_size = 32;
1878 break;
1879 case AC_DESC_SAMPLER:
1880 type = ctx->ac.v4i32;
1881 if (binding->type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) {
1882 offset += radv_combined_image_descriptor_sampler_offset(binding);
1883 }
1884
1885 type_size = 16;
1886 break;
1887 case AC_DESC_BUFFER:
1888 type = ctx->ac.v4i32;
1889 type_size = 16;
1890 break;
1891 case AC_DESC_PLANE_0:
1892 case AC_DESC_PLANE_1:
1893 case AC_DESC_PLANE_2:
1894 type = ctx->ac.v8i32;
1895 type_size = 32;
1896 offset += 32 * (desc_type - AC_DESC_PLANE_0);
1897 break;
1898 default:
1899 unreachable("invalid desc_type\n");
1900 }
1901
1902 offset += constant_index * stride;
1903
1904 if (desc_type == AC_DESC_SAMPLER && binding->immutable_samplers_offset &&
1905 (!index || binding->immutable_samplers_equal)) {
1906 if (binding->immutable_samplers_equal)
1907 constant_index = 0;
1908
1909 const uint32_t *samplers = radv_immutable_samplers(layout, binding);
1910
1911 LLVMValueRef constants[] = {
1912 LLVMConstInt(ctx->ac.i32, samplers[constant_index * 4 + 0], 0),
1913 LLVMConstInt(ctx->ac.i32, samplers[constant_index * 4 + 1], 0),
1914 LLVMConstInt(ctx->ac.i32, samplers[constant_index * 4 + 2], 0),
1915 LLVMConstInt(ctx->ac.i32, samplers[constant_index * 4 + 3], 0),
1916 };
1917 return ac_build_gather_values(&ctx->ac, constants, 4);
1918 }
1919
1920 assert(stride % type_size == 0);
1921
1922 LLVMValueRef adjusted_index = index;
1923 if (!adjusted_index)
1924 adjusted_index = ctx->ac.i32_0;
1925
1926 adjusted_index = LLVMBuildMul(builder, adjusted_index, LLVMConstInt(ctx->ac.i32, stride / type_size, 0), "");
1927
1928 LLVMValueRef val_offset = LLVMConstInt(ctx->ac.i32, offset, 0);
1929 list = LLVMBuildGEP(builder, list, &val_offset, 1, "");
1930 list = LLVMBuildPointerCast(builder, list,
1931 ac_array_in_const32_addr_space(type), "");
1932
1933 LLVMValueRef descriptor = ac_build_load_to_sgpr(&ctx->ac, list, adjusted_index);
1934
1935 /* 3 plane formats always have same size and format for plane 1 & 2, so
1936 * use the tail from plane 1 so that we can store only the first 16 bytes
1937 * of the last plane. */
1938 if (desc_type == AC_DESC_PLANE_2) {
1939 LLVMValueRef descriptor2 = radv_get_sampler_desc(abi, descriptor_set, base_index, constant_index, index, AC_DESC_PLANE_1,image, write, bindless);
1940
1941 LLVMValueRef components[8];
1942 for (unsigned i = 0; i < 4; ++i)
1943 components[i] = ac_llvm_extract_elem(&ctx->ac, descriptor, i);
1944
1945 for (unsigned i = 4; i < 8; ++i)
1946 components[i] = ac_llvm_extract_elem(&ctx->ac, descriptor2, i);
1947 descriptor = ac_build_gather_values(&ctx->ac, components, 8);
1948 }
1949
1950 return descriptor;
1951 }
1952
1953 /* For 2_10_10_10 formats the alpha is handled as unsigned by pre-vega HW.
1954 * so we may need to fix it up. */
1955 static LLVMValueRef
1956 adjust_vertex_fetch_alpha(struct radv_shader_context *ctx,
1957 unsigned adjustment,
1958 LLVMValueRef alpha)
1959 {
1960 if (adjustment == RADV_ALPHA_ADJUST_NONE)
1961 return alpha;
1962
1963 LLVMValueRef c30 = LLVMConstInt(ctx->ac.i32, 30, 0);
1964
1965 alpha = LLVMBuildBitCast(ctx->ac.builder, alpha, ctx->ac.f32, "");
1966
1967 if (adjustment == RADV_ALPHA_ADJUST_SSCALED)
1968 alpha = LLVMBuildFPToUI(ctx->ac.builder, alpha, ctx->ac.i32, "");
1969 else
1970 alpha = ac_to_integer(&ctx->ac, alpha);
1971
1972 /* For the integer-like cases, do a natural sign extension.
1973 *
1974 * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
1975 * and happen to contain 0, 1, 2, 3 as the two LSBs of the
1976 * exponent.
1977 */
1978 alpha = LLVMBuildShl(ctx->ac.builder, alpha,
1979 adjustment == RADV_ALPHA_ADJUST_SNORM ?
1980 LLVMConstInt(ctx->ac.i32, 7, 0) : c30, "");
1981 alpha = LLVMBuildAShr(ctx->ac.builder, alpha, c30, "");
1982
1983 /* Convert back to the right type. */
1984 if (adjustment == RADV_ALPHA_ADJUST_SNORM) {
1985 LLVMValueRef clamp;
1986 LLVMValueRef neg_one = LLVMConstReal(ctx->ac.f32, -1.0);
1987 alpha = LLVMBuildSIToFP(ctx->ac.builder, alpha, ctx->ac.f32, "");
1988 clamp = LLVMBuildFCmp(ctx->ac.builder, LLVMRealULT, alpha, neg_one, "");
1989 alpha = LLVMBuildSelect(ctx->ac.builder, clamp, neg_one, alpha, "");
1990 } else if (adjustment == RADV_ALPHA_ADJUST_SSCALED) {
1991 alpha = LLVMBuildSIToFP(ctx->ac.builder, alpha, ctx->ac.f32, "");
1992 }
1993
1994 return LLVMBuildBitCast(ctx->ac.builder, alpha, ctx->ac.i32, "");
1995 }
1996
1997 static unsigned
1998 get_num_channels_from_data_format(unsigned data_format)
1999 {
2000 switch (data_format) {
2001 case V_008F0C_BUF_DATA_FORMAT_8:
2002 case V_008F0C_BUF_DATA_FORMAT_16:
2003 case V_008F0C_BUF_DATA_FORMAT_32:
2004 return 1;
2005 case V_008F0C_BUF_DATA_FORMAT_8_8:
2006 case V_008F0C_BUF_DATA_FORMAT_16_16:
2007 case V_008F0C_BUF_DATA_FORMAT_32_32:
2008 return 2;
2009 case V_008F0C_BUF_DATA_FORMAT_10_11_11:
2010 case V_008F0C_BUF_DATA_FORMAT_11_11_10:
2011 case V_008F0C_BUF_DATA_FORMAT_32_32_32:
2012 return 3;
2013 case V_008F0C_BUF_DATA_FORMAT_8_8_8_8:
2014 case V_008F0C_BUF_DATA_FORMAT_10_10_10_2:
2015 case V_008F0C_BUF_DATA_FORMAT_2_10_10_10:
2016 case V_008F0C_BUF_DATA_FORMAT_16_16_16_16:
2017 case V_008F0C_BUF_DATA_FORMAT_32_32_32_32:
2018 return 4;
2019 default:
2020 break;
2021 }
2022
2023 return 4;
2024 }
2025
2026 static LLVMValueRef
2027 radv_fixup_vertex_input_fetches(struct radv_shader_context *ctx,
2028 LLVMValueRef value,
2029 unsigned num_channels,
2030 bool is_float)
2031 {
2032 LLVMValueRef zero = is_float ? ctx->ac.f32_0 : ctx->ac.i32_0;
2033 LLVMValueRef one = is_float ? ctx->ac.f32_1 : ctx->ac.i32_1;
2034 LLVMValueRef chan[4];
2035
2036 if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) {
2037 unsigned vec_size = LLVMGetVectorSize(LLVMTypeOf(value));
2038
2039 if (num_channels == 4 && num_channels == vec_size)
2040 return value;
2041
2042 num_channels = MIN2(num_channels, vec_size);
2043
2044 for (unsigned i = 0; i < num_channels; i++)
2045 chan[i] = ac_llvm_extract_elem(&ctx->ac, value, i);
2046 } else {
2047 if (num_channels) {
2048 assert(num_channels == 1);
2049 chan[0] = value;
2050 }
2051 }
2052
2053 for (unsigned i = num_channels; i < 4; i++) {
2054 chan[i] = i == 3 ? one : zero;
2055 chan[i] = ac_to_integer(&ctx->ac, chan[i]);
2056 }
2057
2058 return ac_build_gather_values(&ctx->ac, chan, 4);
2059 }
2060
/* Lower one vertex-shader input variable: for every attribute slot it
 * occupies, compute the fetch index (per-vertex or per-instance), load the
 * attribute from its vertex buffer with a typed buffer load, apply the
 * post-shuffle / alpha-adjust / 16-bit fixups, and store the resulting
 * channels into ctx->inputs.
 */
static void
handle_vs_input_decl(struct radv_shader_context *ctx,
		     struct nir_variable *variable)
{
	LLVMValueRef t_list_ptr = ac_get_arg(&ctx->ac, ctx->args->vertex_buffers);
	LLVMValueRef t_offset;
	LLVMValueRef t_list;
	LLVMValueRef input;
	LLVMValueRef buffer_index;
	unsigned attrib_count = glsl_count_attribute_slots(variable->type, true);
	uint8_t input_usage_mask =
		ctx->args->shader_info->vs.input_usage_mask[variable->data.location];
	/* Highest channel actually read by the shader. */
	unsigned num_input_channels = util_last_bit(input_usage_mask);

	variable->data.driver_location = variable->data.location * 4;

	enum glsl_base_type type = glsl_get_base_type(variable->type);
	for (unsigned i = 0; i < attrib_count; ++i) {
		LLVMValueRef output[4];
		unsigned attrib_index = variable->data.location + i - VERT_ATTRIB_GENERIC0;
		unsigned attrib_format = ctx->args->options->key.vs.vertex_attribute_formats[attrib_index];
		/* The packed format key: data format in bits [3:0], numeric
		 * format in bits [6:4]. */
		unsigned data_format = attrib_format & 0x0f;
		unsigned num_format = (attrib_format >> 4) & 0x07;
		bool is_float = num_format != V_008F0C_BUF_NUM_FORMAT_UINT &&
			num_format != V_008F0C_BUF_NUM_FORMAT_SINT;

		if (ctx->args->options->key.vs.instance_rate_inputs & (1u << attrib_index)) {
			/* Per-instance attribute:
			 * index = start_instance + instance_id / divisor
			 * (a divisor of 0 makes every instance read element 0).
			 */
			uint32_t divisor = ctx->args->options->key.vs.instance_rate_divisors[attrib_index];

			if (divisor) {
				buffer_index = ctx->abi.instance_id;

				if (divisor != 1) {
					buffer_index = LLVMBuildUDiv(ctx->ac.builder, buffer_index,
					                             LLVMConstInt(ctx->ac.i32, divisor, 0), "");
				}
			} else {
				buffer_index = ctx->ac.i32_0;
			}

			buffer_index = LLVMBuildAdd(ctx->ac.builder,
						    ac_get_arg(&ctx->ac,
							       ctx->args->ac.start_instance),\
						    buffer_index, "");
		} else {
			/* Per-vertex attribute: index = base_vertex + vertex_id. */
			buffer_index = LLVMBuildAdd(ctx->ac.builder,
						    ctx->abi.vertex_id,
						    ac_get_arg(&ctx->ac,
							       ctx->args->ac.base_vertex), "");
		}

		/* Adjust the number of channels to load based on the vertex
		 * attribute format.
		 */
		unsigned num_format_channels = get_num_channels_from_data_format(data_format);
		unsigned num_channels = MIN2(num_input_channels, num_format_channels);
		unsigned attrib_binding = ctx->args->options->key.vs.vertex_attribute_bindings[attrib_index];
		unsigned attrib_offset = ctx->args->options->key.vs.vertex_attribute_offsets[attrib_index];
		unsigned attrib_stride = ctx->args->options->key.vs.vertex_attribute_strides[attrib_index];

		if (ctx->args->options->key.vs.post_shuffle & (1 << attrib_index)) {
			/* Always load, at least, 3 channels for formats that
			 * need to be shuffled because X<->Z.
			 */
			num_channels = MAX2(num_channels, 3);
		}

		if (attrib_stride != 0 && attrib_offset > attrib_stride) {
			/* Fold the whole-stride part of a large offset into
			 * the element index so the constant offset stays
			 * smaller than the stride. */
			LLVMValueRef buffer_offset =
				LLVMConstInt(ctx->ac.i32,
					     attrib_offset / attrib_stride, false);

			buffer_index = LLVMBuildAdd(ctx->ac.builder,
						    buffer_index,
						    buffer_offset, "");

			attrib_offset = attrib_offset % attrib_stride;
		}

		/* Fetch the vertex-buffer descriptor for this binding. */
		t_offset = LLVMConstInt(ctx->ac.i32, attrib_binding, false);
		t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset);

		input = ac_build_struct_tbuffer_load(&ctx->ac, t_list,
						     buffer_index,
						     LLVMConstInt(ctx->ac.i32, attrib_offset, false),
						     ctx->ac.i32_0, ctx->ac.i32_0,
						     num_channels,
						     data_format, num_format, 0, true);

		if (ctx->args->options->key.vs.post_shuffle & (1 << attrib_index)) {
			/* Swap the X and Z channels (post-shuffle formats). */
			LLVMValueRef c[4];
			c[0] = ac_llvm_extract_elem(&ctx->ac, input, 2);
			c[1] = ac_llvm_extract_elem(&ctx->ac, input, 1);
			c[2] = ac_llvm_extract_elem(&ctx->ac, input, 0);
			c[3] = ac_llvm_extract_elem(&ctx->ac, input, 3);

			input = ac_build_gather_values(&ctx->ac, c, 4);
		}

		/* Pad the fetched value out to a vec4. */
		input = radv_fixup_vertex_input_fetches(ctx, input, num_channels,
							is_float);

		for (unsigned chan = 0; chan < 4; chan++) {
			LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, chan, false);
			output[chan] = LLVMBuildExtractElement(ctx->ac.builder, input, llvm_chan, "");
			if (type == GLSL_TYPE_FLOAT16) {
				output[chan] = LLVMBuildBitCast(ctx->ac.builder, output[chan], ctx->ac.f32, "");
				output[chan] = LLVMBuildFPTrunc(ctx->ac.builder, output[chan], ctx->ac.f16, "");
			}
		}

		/* Fix up the alpha sign-ness for 2_10_10_10 formats on older
		 * HW (two key bits per attribute). */
		unsigned alpha_adjust = (ctx->args->options->key.vs.alpha_adjust >> (attrib_index * 2)) & 3;
		output[3] = adjust_vertex_fetch_alpha(ctx, alpha_adjust, output[3]);

		for (unsigned chan = 0; chan < 4; chan++) {
			output[chan] = ac_to_integer(&ctx->ac, output[chan]);
			if (type == GLSL_TYPE_UINT16 || type == GLSL_TYPE_INT16)
				output[chan] = LLVMBuildTrunc(ctx->ac.builder, output[chan], ctx->ac.i16, "");

			ctx->inputs[ac_llvm_reg_index_soa(variable->data.location + i, chan)] = output[chan];
		}
	}
}
2184
2185 static void
2186 handle_vs_inputs(struct radv_shader_context *ctx,
2187 struct nir_shader *nir) {
2188 nir_foreach_variable(variable, &nir->inputs)
2189 handle_vs_input_decl(ctx, variable);
2190 }
2191
2192 static void
2193 prepare_interp_optimize(struct radv_shader_context *ctx,
2194 struct nir_shader *nir)
2195 {
2196 bool uses_center = false;
2197 bool uses_centroid = false;
2198 nir_foreach_variable(variable, &nir->inputs) {
2199 if (glsl_get_base_type(glsl_without_array(variable->type)) != GLSL_TYPE_FLOAT ||
2200 variable->data.sample)
2201 continue;
2202
2203 if (variable->data.centroid)
2204 uses_centroid = true;
2205 else
2206 uses_center = true;
2207 }
2208
2209 ctx->abi.persp_centroid = ac_get_arg(&ctx->ac, ctx->args->ac.persp_centroid);
2210 ctx->abi.linear_centroid = ac_get_arg(&ctx->ac, ctx->args->ac.linear_centroid);
2211
2212 if (uses_center && uses_centroid) {
2213 LLVMValueRef sel = LLVMBuildICmp(ctx->ac.builder, LLVMIntSLT,
2214 ac_get_arg(&ctx->ac, ctx->args->ac.prim_mask),
2215 ctx->ac.i32_0, "");
2216 ctx->abi.persp_centroid =
2217 LLVMBuildSelect(ctx->ac.builder, sel,
2218 ac_get_arg(&ctx->ac, ctx->args->ac.persp_center),
2219 ctx->abi.persp_centroid, "");
2220 ctx->abi.linear_centroid =
2221 LLVMBuildSelect(ctx->ac.builder, sel,
2222 ac_get_arg(&ctx->ac, ctx->args->ac.linear_center),
2223 ctx->abi.linear_centroid, "");
2224 }
2225 }
2226
2227 static void
2228 scan_shader_output_decl(struct radv_shader_context *ctx,
2229 struct nir_variable *variable,
2230 struct nir_shader *shader,
2231 gl_shader_stage stage)
2232 {
2233 int idx = variable->data.location + variable->data.index;
2234 unsigned attrib_count = glsl_count_attribute_slots(variable->type, false);
2235 uint64_t mask_attribs;
2236
2237 variable->data.driver_location = idx * 4;
2238
2239 /* tess ctrl has it's own load/store paths for outputs */
2240 if (stage == MESA_SHADER_TESS_CTRL)
2241 return;
2242
2243 if (variable->data.compact) {
2244 unsigned component_count = variable->data.location_frac +
2245 glsl_get_length(variable->type);
2246 attrib_count = (component_count + 3) / 4;
2247 }
2248
2249 mask_attribs = ((1ull << attrib_count) - 1) << idx;
2250
2251 ctx->output_mask |= mask_attribs;
2252 }
2253
2254
/* Initialize arguments for the shader export intrinsic.
 *
 * For fragment-shader color targets the values are packed according to the
 * per-MRT export format from the pipeline key (possibly using the
 * compressed 2x16 formats); for all other targets the four values are
 * exported as-is (16-bit values are zero-extended into 32-bit lanes).
 */
static void
si_llvm_init_export_args(struct radv_shader_context *ctx,
			 LLVMValueRef *values,
			 unsigned enabled_channels,
			 unsigned target,
			 struct ac_export_args *args)
{
	/* Specify the channels that are enabled. */
	args->enabled_channels = enabled_channels;

	/* Specify whether the EXEC mask represents the valid mask */
	args->valid_mask = 0;

	/* Specify whether this is the last export */
	args->done = 0;

	/* Specify the target we are exporting */
	args->target = target;

	args->compr = false;
	args->out[0] = LLVMGetUndef(ctx->ac.f32);
	args->out[1] = LLVMGetUndef(ctx->ac.f32);
	args->out[2] = LLVMGetUndef(ctx->ac.f32);
	args->out[3] = LLVMGetUndef(ctx->ac.f32);

	if (!values)
		return;

	bool is_16bit = ac_get_type_size(LLVMTypeOf(values[0])) == 2;
	if (ctx->stage == MESA_SHADER_FRAGMENT) {
		/* 4 format bits per MRT in the pipeline key. */
		unsigned index = target - V_008DFC_SQ_EXP_MRT;
		unsigned col_format = (ctx->args->options->key.fs.col_format >> (4 * index)) & 0xf;
		bool is_int8 = (ctx->args->options->key.fs.is_int8 >> index) & 1;
		bool is_int10 = (ctx->args->options->key.fs.is_int10 >> index) & 1;
		unsigned chan;

		/* One of these pairwise 2x16 packing helpers is selected by
		 * the compressed formats below (float resp. integer flavor). */
		LLVMValueRef (*packf)(struct ac_llvm_context *ctx, LLVMValueRef args[2]) = NULL;
		LLVMValueRef (*packi)(struct ac_llvm_context *ctx, LLVMValueRef args[2],
				      unsigned bits, bool hi) = NULL;

		switch(col_format) {
		case V_028714_SPI_SHADER_ZERO:
			/* Nothing is written: export to the NULL target. */
			args->enabled_channels = 0; /* writemask */
			args->target = V_008DFC_SQ_EXP_NULL;
			break;

		case V_028714_SPI_SHADER_32_R:
			args->enabled_channels = 1;
			args->out[0] = values[0];
			break;

		case V_028714_SPI_SHADER_32_GR:
			args->enabled_channels = 0x3;
			args->out[0] = values[0];
			args->out[1] = values[1];
			break;

		case V_028714_SPI_SHADER_32_AR:
			if (ctx->ac.chip_class >= GFX10) {
				/* GFX10 packs R and A into the first two
				 * channels instead of channels 0 and 3. */
				args->enabled_channels = 0x3;
				args->out[0] = values[0];
				args->out[1] = values[3];
			} else {
				args->enabled_channels = 0x9;
				args->out[0] = values[0];
				args->out[3] = values[3];
			}
			break;

		case V_028714_SPI_SHADER_FP16_ABGR:
			args->enabled_channels = 0x5;
			packf = ac_build_cvt_pkrtz_f16;
			if (is_16bit) {
				/* The packing helper takes f32 inputs. */
				for (unsigned chan = 0; chan < 4; chan++)
					values[chan] = LLVMBuildFPExt(ctx->ac.builder,
								      values[chan],
								      ctx->ac.f32, "");
			}
			break;

		case V_028714_SPI_SHADER_UNORM16_ABGR:
			args->enabled_channels = 0x5;
			packf = ac_build_cvt_pknorm_u16;
			break;

		case V_028714_SPI_SHADER_SNORM16_ABGR:
			args->enabled_channels = 0x5;
			packf = ac_build_cvt_pknorm_i16;
			break;

		case V_028714_SPI_SHADER_UINT16_ABGR:
			args->enabled_channels = 0x5;
			packi = ac_build_cvt_pk_u16;
			if (is_16bit) {
				/* The packing helper takes i32 inputs. */
				for (unsigned chan = 0; chan < 4; chan++)
					values[chan] = LLVMBuildZExt(ctx->ac.builder,
								     ac_to_integer(&ctx->ac, values[chan]),
								     ctx->ac.i32, "");
			}
			break;

		case V_028714_SPI_SHADER_SINT16_ABGR:
			args->enabled_channels = 0x5;
			packi = ac_build_cvt_pk_i16;
			if (is_16bit) {
				for (unsigned chan = 0; chan < 4; chan++)
					values[chan] = LLVMBuildSExt(ctx->ac.builder,
								     ac_to_integer(&ctx->ac, values[chan]),
								     ctx->ac.i32, "");
			}
			break;

		default:
		case V_028714_SPI_SHADER_32_ABGR:
			memcpy(&args->out[0], values, sizeof(values[0]) * 4);
			break;
		}

		/* Pack f16 or norm_i16/u16. */
		if (packf) {
			for (chan = 0; chan < 2; chan++) {
				LLVMValueRef pack_args[2] = {
					values[2 * chan],
					values[2 * chan + 1]
				};
				LLVMValueRef packed;

				packed = packf(&ctx->ac, pack_args);
				args->out[chan] = ac_to_float(&ctx->ac, packed);
			}
			args->compr = 1; /* COMPR flag */
		}

		/* Pack i16/u16. */
		if (packi) {
			for (chan = 0; chan < 2; chan++) {
				LLVMValueRef pack_args[2] = {
					ac_to_integer(&ctx->ac, values[2 * chan]),
					ac_to_integer(&ctx->ac, values[2 * chan + 1])
				};
				LLVMValueRef packed;

				packed = packi(&ctx->ac, pack_args,
					       is_int8 ? 8 : is_int10 ? 10 : 16,
					       chan == 1);
				args->out[chan] = ac_to_float(&ctx->ac, packed);
			}
			args->compr = 1; /* COMPR flag */
		}
		return;
	}

	/* Non-color targets: export the raw values.  16-bit values are
	 * zero-extended (bit-exact) into the 32-bit export lanes. */
	if (is_16bit) {
		for (unsigned chan = 0; chan < 4; chan++) {
			values[chan] = LLVMBuildBitCast(ctx->ac.builder, values[chan], ctx->ac.i16, "");
			args->out[chan] = LLVMBuildZExt(ctx->ac.builder, values[chan], ctx->ac.i32, "");
		}
	} else
		memcpy(&args->out[0], values, sizeof(values[0]) * 4);

	for (unsigned i = 0; i < 4; ++i)
		args->out[i] = ac_to_float(&ctx->ac, args->out[i]);
}
2419
2420 static void
2421 radv_export_param(struct radv_shader_context *ctx, unsigned index,
2422 LLVMValueRef *values, unsigned enabled_channels)
2423 {
2424 struct ac_export_args args;
2425
2426 si_llvm_init_export_args(ctx, values, enabled_channels,
2427 V_008DFC_SQ_EXP_PARAM + index, &args);
2428 ac_build_export(&ctx->ac, &args);
2429 }
2430
2431 static LLVMValueRef
2432 radv_load_output(struct radv_shader_context *ctx, unsigned index, unsigned chan)
2433 {
2434 LLVMValueRef output = ctx->abi.outputs[ac_llvm_reg_index_soa(index, chan)];
2435 return LLVMBuildLoad(ctx->ac.builder, output, "");
2436 }
2437
2438 static void
2439 radv_emit_stream_output(struct radv_shader_context *ctx,
2440 LLVMValueRef const *so_buffers,
2441 LLVMValueRef const *so_write_offsets,
2442 const struct radv_stream_output *output,
2443 struct radv_shader_output_values *shader_out)
2444 {
2445 unsigned num_comps = util_bitcount(output->component_mask);
2446 unsigned buf = output->buffer;
2447 unsigned offset = output->offset;
2448 unsigned start;
2449 LLVMValueRef out[4];
2450
2451 assert(num_comps && num_comps <= 4);
2452 if (!num_comps || num_comps > 4)
2453 return;
2454
2455 /* Get the first component. */
2456 start = ffs(output->component_mask) - 1;
2457
2458 /* Load the output as int. */
2459 for (int i = 0; i < num_comps; i++) {
2460 out[i] = ac_to_integer(&ctx->ac, shader_out->values[start + i]);
2461 }
2462
2463 /* Pack the output. */
2464 LLVMValueRef vdata = NULL;
2465
2466 switch (num_comps) {
2467 case 1: /* as i32 */
2468 vdata = out[0];
2469 break;
2470 case 2: /* as v2i32 */
2471 case 3: /* as v4i32 (aligned to 4) */
2472 out[3] = LLVMGetUndef(ctx->ac.i32);
2473 /* fall through */
2474 case 4: /* as v4i32 */
2475 vdata = ac_build_gather_values(&ctx->ac, out,
2476 !ac_has_vec3_support(ctx->ac.chip_class, false) ?
2477 util_next_power_of_two(num_comps) :
2478 num_comps);
2479 break;
2480 }
2481
2482 ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf],
2483 vdata, num_comps, so_write_offsets[buf],
2484 ctx->ac.i32_0, offset,
2485 ac_glc | ac_slc, false);
2486 }
2487
/* Emit transform-feedback stores for all outputs of the given vertex
 * stream.  Only the first so_vtx_count lanes of the wave store anything,
 * which keeps all writes inside the bound streamout buffers.
 */
static void
radv_emit_streamout(struct radv_shader_context *ctx, unsigned stream)
{
	int i;

	/* Get bits [22:16], i.e. (so_param >> 16) & 127; */
	assert(ctx->args->streamout_config.used);
	LLVMValueRef so_vtx_count =
		ac_build_bfe(&ctx->ac,
			     ac_get_arg(&ctx->ac, ctx->args->streamout_config),
			     LLVMConstInt(ctx->ac.i32, 16, false),
			     LLVMConstInt(ctx->ac.i32, 7, false), false);

	LLVMValueRef tid = ac_get_thread_id(&ctx->ac);

	/* can_emit = tid < so_vtx_count; */
	LLVMValueRef can_emit = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,
					      tid, so_vtx_count, "");

	/* Emit the streamout code conditionally. This actually avoids
	 * out-of-bounds buffer access. The hw tells us via the SGPR
	 * (so_vtx_count) which threads are allowed to emit streamout data.
	 */
	ac_build_ifcc(&ctx->ac, can_emit, 6501);
	{
		/* The buffer offset is computed as follows:
		 *   ByteOffset = streamout_offset[buffer_id]*4 +
		 *                (streamout_write_index + thread_id)*stride[buffer_id] +
		 *                attrib_offset
		 */
		LLVMValueRef so_write_index =
			ac_get_arg(&ctx->ac, ctx->args->streamout_write_idx);

		/* Compute (streamout_write_index + thread_id). */
		so_write_index =
			LLVMBuildAdd(ctx->ac.builder, so_write_index, tid, "");

		/* Load the descriptor and compute the write offset for each
		 * enabled buffer.
		 */
		LLVMValueRef so_write_offset[4] = {};
		LLVMValueRef so_buffers[4] = {};
		LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->args->streamout_buffers);

		for (i = 0; i < 4; i++) {
			uint16_t stride = ctx->args->shader_info->so.strides[i];

			/* A stride of 0 means the buffer is unused. */
			if (!stride)
				continue;

			LLVMValueRef offset =
				LLVMConstInt(ctx->ac.i32, i, false);

			so_buffers[i] = ac_build_load_to_sgpr(&ctx->ac,
							      buf_ptr, offset);

			LLVMValueRef so_offset =
				ac_get_arg(&ctx->ac, ctx->args->streamout_offset[i]);

			/* Both the offset and the stride are scaled by 4 —
			 * they appear to be stored in DWORD units. */
			so_offset = LLVMBuildMul(ctx->ac.builder, so_offset,
						 LLVMConstInt(ctx->ac.i32, 4, false), "");

			so_write_offset[i] =
				ac_build_imad(&ctx->ac, so_write_index,
					      LLVMConstInt(ctx->ac.i32,
							   stride * 4, false),
					      so_offset);
		}

		/* Write streamout data. */
		for (i = 0; i < ctx->args->shader_info->so.num_outputs; i++) {
			struct radv_shader_output_values shader_out = {};
			struct radv_stream_output *output =
				&ctx->args->shader_info->so.outputs[i];

			if (stream != output->stream)
				continue;

			for (int j = 0; j < 4; j++) {
				shader_out.values[j] =
					radv_load_output(ctx, output->location, j);
			}

			radv_emit_stream_output(ctx, so_buffers,so_write_offset,
						output, &shader_out);
		}
	}
	ac_build_endif(&ctx->ac, 6501);
}
2577
2578 static void
2579 radv_build_param_exports(struct radv_shader_context *ctx,
2580 struct radv_shader_output_values *outputs,
2581 unsigned noutput,
2582 struct radv_vs_output_info *outinfo,
2583 bool export_clip_dists)
2584 {
2585 unsigned param_count = 0;
2586
2587 for (unsigned i = 0; i < noutput; i++) {
2588 unsigned slot_name = outputs[i].slot_name;
2589 unsigned usage_mask = outputs[i].usage_mask;
2590
2591 if (slot_name != VARYING_SLOT_LAYER &&
2592 slot_name != VARYING_SLOT_PRIMITIVE_ID &&
2593 slot_name != VARYING_SLOT_CLIP_DIST0 &&
2594 slot_name != VARYING_SLOT_CLIP_DIST1 &&
2595 slot_name < VARYING_SLOT_VAR0)
2596 continue;
2597
2598 if ((slot_name == VARYING_SLOT_CLIP_DIST0 ||
2599 slot_name == VARYING_SLOT_CLIP_DIST1) && !export_clip_dists)
2600 continue;
2601
2602 radv_export_param(ctx, param_count, outputs[i].values, usage_mask);
2603
2604 assert(i < ARRAY_SIZE(outinfo->vs_output_param_offset));
2605 outinfo->vs_output_param_offset[slot_name] = param_count++;
2606 }
2607
2608 outinfo->param_exports = param_count;
2609 }
2610
/* Generate export instructions for hardware VS shader stage or NGG GS stage
 * (position and parameter data only).
 *
 * Positions go to POS0; point size / layer / viewport index are merged into
 * POS1; clip distances occupy POS2 (and POS3 for the second vec4).  The
 * last emitted position export is marked DONE.
 */
static void
radv_llvm_export_vs(struct radv_shader_context *ctx,
		    struct radv_shader_output_values *outputs,
		    unsigned noutput,
		    struct radv_vs_output_info *outinfo,
		    bool export_clip_dists)
{
	LLVMValueRef psize_value = NULL, layer_value = NULL, viewport_value = NULL;
	struct ac_export_args pos_args[4] = {};
	unsigned pos_idx, index;
	int i;

	/* Build position exports */
	for (i = 0; i < noutput; i++) {
		switch (outputs[i].slot_name) {
		case VARYING_SLOT_POS:
			si_llvm_init_export_args(ctx, outputs[i].values, 0xf,
						 V_008DFC_SQ_EXP_POS, &pos_args[0]);
			break;
		case VARYING_SLOT_PSIZ:
			psize_value = outputs[i].values[0];
			break;
		case VARYING_SLOT_LAYER:
			layer_value = outputs[i].values[0];
			break;
		case VARYING_SLOT_VIEWPORT:
			viewport_value = outputs[i].values[0];
			break;
		case VARYING_SLOT_CLIP_DIST0:
		case VARYING_SLOT_CLIP_DIST1:
			/* slot_index selects the second clip-distance vec4. */
			index = 2 + outputs[i].slot_index;
			si_llvm_init_export_args(ctx, outputs[i].values, 0xf,
						 V_008DFC_SQ_EXP_POS + index,
						 &pos_args[index]);
			break;
		default:
			break;
		}
	}

	/* We need to add the position output manually if it's missing. */
	if (!pos_args[0].out[0]) {
		pos_args[0].enabled_channels = 0xf; /* writemask */
		pos_args[0].valid_mask = 0; /* EXEC mask */
		pos_args[0].done = 0; /* last export? */
		pos_args[0].target = V_008DFC_SQ_EXP_POS;
		pos_args[0].compr = 0; /* COMPR flag */
		pos_args[0].out[0] = ctx->ac.f32_0; /* X */
		pos_args[0].out[1] = ctx->ac.f32_0; /* Y */
		pos_args[0].out[2] = ctx->ac.f32_0; /* Z */
		pos_args[0].out[3] = ctx->ac.f32_1; /* W */
	}

	/* Point size (X), layer (Z) and viewport index (Z or W, see below)
	 * share the POS+1 export. */
	if (outinfo->writes_pointsize ||
	    outinfo->writes_layer ||
	    outinfo->writes_viewport_index) {
		pos_args[1].enabled_channels = ((outinfo->writes_pointsize == true ? 1 : 0) |
						(outinfo->writes_layer == true ? 4 : 0));
		pos_args[1].valid_mask = 0;
		pos_args[1].done = 0;
		pos_args[1].target = V_008DFC_SQ_EXP_POS + 1;
		pos_args[1].compr = 0;
		pos_args[1].out[0] = ctx->ac.f32_0; /* X */
		pos_args[1].out[1] = ctx->ac.f32_0; /* Y */
		pos_args[1].out[2] = ctx->ac.f32_0; /* Z */
		pos_args[1].out[3] = ctx->ac.f32_0; /* W */

		if (outinfo->writes_pointsize == true)
			pos_args[1].out[0] = psize_value;
		if (outinfo->writes_layer == true)
			pos_args[1].out[2] = layer_value;
		if (outinfo->writes_viewport_index == true) {
			if (ctx->args->options->chip_class >= GFX9) {
				/* GFX9 has the layer in out.z[10:0] and the viewport
				 * index in out.z[19:16].
				 */
				LLVMValueRef v = viewport_value;
				v = ac_to_integer(&ctx->ac, v);
				v = LLVMBuildShl(ctx->ac.builder, v,
						 LLVMConstInt(ctx->ac.i32, 16, false),
						 "");
				v = LLVMBuildOr(ctx->ac.builder, v,
						ac_to_integer(&ctx->ac, pos_args[1].out[2]), "");

				pos_args[1].out[2] = ac_to_float(&ctx->ac, v);
				pos_args[1].enabled_channels |= 1 << 2;
			} else {
				pos_args[1].out[3] = viewport_value;
				pos_args[1].enabled_channels |= 1 << 3;
			}
		}
	}

	/* Count the position exports that will actually be emitted. */
	for (i = 0; i < 4; i++) {
		if (pos_args[i].out[0])
			outinfo->pos_exports++;
	}

	/* Navi10-14 skip POS0 exports if EXEC=0 and DONE=0, causing a hang.
	 * Setting valid_mask=1 prevents it and has no other effect.
	 */
	if (ctx->ac.family == CHIP_NAVI10 ||
	    ctx->ac.family == CHIP_NAVI12 ||
	    ctx->ac.family == CHIP_NAVI14)
		pos_args[0].valid_mask = 1;

	pos_idx = 0;
	for (i = 0; i < 4; i++) {
		if (!pos_args[i].out[0])
			continue;

		/* Specify the target we are exporting */
		pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++;

		if (pos_idx == outinfo->pos_exports)
			/* Specify that this is the last export */
			pos_args[i].done = 1;

		ac_build_export(&ctx->ac, &pos_args[i]);
	}

	/* Build parameter exports */
	radv_build_param_exports(ctx, outputs, noutput, outinfo, export_clip_dists);
}
2738
/* Emit the end-of-shader exports for a hardware VS (vertex, tess eval or
 * GS copy shader): optional streamout, then position and parameter exports.
 */
static void
handle_vs_outputs_post(struct radv_shader_context *ctx,
		       bool export_prim_id,
		       bool export_clip_dists,
		       struct radv_vs_output_info *outinfo)
{
	struct radv_shader_output_values *outputs;
	unsigned noutput = 0;

	if (ctx->args->options->key.has_multiview_view_index) {
		/* Multiview: route the view index through the LAYER output,
		 * allocating the output storage if the shader did not write
		 * gl_Layer itself. */
		LLVMValueRef* tmp_out = &ctx->abi.outputs[ac_llvm_reg_index_soa(VARYING_SLOT_LAYER, 0)];
		if(!*tmp_out) {
			for(unsigned i = 0; i < 4; ++i)
				ctx->abi.outputs[ac_llvm_reg_index_soa(VARYING_SLOT_LAYER, i)] =
					ac_build_alloca_undef(&ctx->ac, ctx->ac.f32, "");
		}

		LLVMValueRef view_index = ac_get_arg(&ctx->ac, ctx->args->ac.view_index);
		LLVMBuildStore(ctx->ac.builder, ac_to_float(&ctx->ac, view_index), *tmp_out);
		ctx->output_mask |= 1ull << VARYING_SLOT_LAYER;
	}

	memset(outinfo->vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
	       sizeof(outinfo->vs_output_param_offset));
	outinfo->pos_exports = 0;

	if (!ctx->args->options->use_ngg_streamout &&
	    ctx->args->shader_info->so.num_outputs &&
	    !ctx->args->is_gs_copy_shader) {
		/* The GS copy shader emission already emits streamout. */
		radv_emit_streamout(ctx, 0);
	}

	/* Allocate a temporary array for the output values. */
	unsigned num_outputs = util_bitcount64(ctx->output_mask) + export_prim_id;
	outputs = malloc(num_outputs * sizeof(outputs[0]));

	/* Gather the current value of every written output channel. */
	for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
		if (!(ctx->output_mask & (1ull << i)))
			continue;

		outputs[noutput].slot_name = i;
		/* CLIP_DIST1 gets slot_index 1 (second clip-distance vec4). */
		outputs[noutput].slot_index = i == VARYING_SLOT_CLIP_DIST1;

		/* The usage mask comes from whichever stage this hardware
		 * VS actually is. */
		if (ctx->stage == MESA_SHADER_VERTEX &&
		    !ctx->args->is_gs_copy_shader) {
			outputs[noutput].usage_mask =
				ctx->args->shader_info->vs.output_usage_mask[i];
		} else if (ctx->stage == MESA_SHADER_TESS_EVAL) {
			outputs[noutput].usage_mask =
				ctx->args->shader_info->tes.output_usage_mask[i];
		} else {
			assert(ctx->args->is_gs_copy_shader);
			outputs[noutput].usage_mask =
				ctx->args->shader_info->gs.output_usage_mask[i];
		}

		for (unsigned j = 0; j < 4; j++) {
			outputs[noutput].values[j] =
				ac_to_float(&ctx->ac, radv_load_output(ctx, i, j));
		}

		noutput++;
	}

	/* Export PrimitiveID. */
	if (export_prim_id) {
		outputs[noutput].slot_name = VARYING_SLOT_PRIMITIVE_ID;
		outputs[noutput].slot_index = 0;
		outputs[noutput].usage_mask = 0x1;
		outputs[noutput].values[0] =
			ac_get_arg(&ctx->ac, ctx->args->vs_prim_id);
		for (unsigned j = 1; j < 4; j++)
			outputs[noutput].values[j] = ctx->ac.f32_0;
		noutput++;
	}

	radv_llvm_export_vs(ctx, outputs, noutput, outinfo, export_clip_dists);

	free(outputs);
}
2820
2821 static void
2822 handle_es_outputs_post(struct radv_shader_context *ctx,
2823 struct radv_es_output_info *outinfo)
2824 {
2825 int j;
2826 LLVMValueRef lds_base = NULL;
2827
2828 if (ctx->ac.chip_class >= GFX9) {
2829 unsigned itemsize_dw = outinfo->esgs_itemsize / 4;
2830 LLVMValueRef vertex_idx = ac_get_thread_id(&ctx->ac);
2831 LLVMValueRef wave_idx =
2832 ac_unpack_param(&ctx->ac,
2833 ac_get_arg(&ctx->ac, ctx->args->merged_wave_info), 24, 4);
2834 vertex_idx = LLVMBuildOr(ctx->ac.builder, vertex_idx,
2835 LLVMBuildMul(ctx->ac.builder, wave_idx,
2836 LLVMConstInt(ctx->ac.i32,
2837 ctx->ac.wave_size, false), ""), "");
2838 lds_base = LLVMBuildMul(ctx->ac.builder, vertex_idx,
2839 LLVMConstInt(ctx->ac.i32, itemsize_dw, 0), "");
2840 }
2841
2842 for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
2843 LLVMValueRef dw_addr = NULL;
2844 LLVMValueRef *out_ptr = &ctx->abi.outputs[i * 4];
2845 unsigned output_usage_mask;
2846 int param_index;
2847
2848 if (!(ctx->output_mask & (1ull << i)))
2849 continue;
2850
2851 if (ctx->stage == MESA_SHADER_VERTEX) {
2852 output_usage_mask =
2853 ctx->args->shader_info->vs.output_usage_mask[i];
2854 } else {
2855 assert(ctx->stage == MESA_SHADER_TESS_EVAL);
2856 output_usage_mask =
2857 ctx->args->shader_info->tes.output_usage_mask[i];
2858 }
2859
2860 param_index = shader_io_get_unique_index(i);
2861
2862 if (lds_base) {
2863 dw_addr = LLVMBuildAdd(ctx->ac.builder, lds_base,
2864 LLVMConstInt(ctx->ac.i32, param_index * 4, false),
2865 "");
2866 }
2867
2868 for (j = 0; j < 4; j++) {
2869 if (!(output_usage_mask & (1 << j)))
2870 continue;
2871
2872 LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, out_ptr[j], "");
2873 out_val = ac_to_integer(&ctx->ac, out_val);
2874 out_val = LLVMBuildZExtOrBitCast(ctx->ac.builder, out_val, ctx->ac.i32, "");
2875
2876 if (ctx->ac.chip_class >= GFX9) {
2877 LLVMValueRef dw_addr_offset =
2878 LLVMBuildAdd(ctx->ac.builder, dw_addr,
2879 LLVMConstInt(ctx->ac.i32,
2880 j, false), "");
2881
2882 ac_lds_store(&ctx->ac, dw_addr_offset, out_val);
2883 } else {
2884 ac_build_buffer_store_dword(&ctx->ac,
2885 ctx->esgs_ring,
2886 out_val, 1,
2887 NULL,
2888 ac_get_arg(&ctx->ac, ctx->args->es2gs_offset),
2889 (4 * param_index + j) * 4,
2890 ac_glc | ac_slc, true);
2891 }
2892 }
2893 }
2894 }
2895
2896 static void
2897 handle_ls_outputs_post(struct radv_shader_context *ctx)
2898 {
2899 LLVMValueRef vertex_id = ctx->rel_auto_id;
2900 uint32_t num_tcs_inputs = util_last_bit64(ctx->args->shader_info->vs.ls_outputs_written);
2901 LLVMValueRef vertex_dw_stride = LLVMConstInt(ctx->ac.i32, num_tcs_inputs * 4, false);
2902 LLVMValueRef base_dw_addr = LLVMBuildMul(ctx->ac.builder, vertex_id,
2903 vertex_dw_stride, "");
2904
2905 for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {