Added few more stubs so that control reaches to DestroyDevice().
[mesa.git] / src / amd / vulkan / radv_nir_to_llvm.c
1 /*
2 * Copyright © 2016 Red Hat.
3 * Copyright © 2016 Bas Nieuwenhuizen
4 *
5 * based in part on anv driver which is:
6 * Copyright © 2015 Intel Corporation
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a
9 * copy of this software and associated documentation files (the "Software"),
10 * to deal in the Software without restriction, including without limitation
11 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 * and/or sell copies of the Software, and to permit persons to whom the
13 * Software is furnished to do so, subject to the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the next
16 * paragraph) shall be included in all copies or substantial portions of the
17 * Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * IN THE SOFTWARE.
26 */
27
28 #include "radv_private.h"
29 #include "radv_shader.h"
30 #include "radv_shader_helper.h"
31 #include "radv_shader_args.h"
32 #include "radv_debug.h"
33 #include "nir/nir.h"
34
35 #include "sid.h"
36 #include "ac_binary.h"
37 #include "ac_llvm_util.h"
38 #include "ac_llvm_build.h"
39 #include "ac_shader_abi.h"
40 #include "ac_shader_util.h"
41 #include "ac_exp_param.h"
42
43 #define RADEON_LLVM_MAX_INPUTS (VARYING_SLOT_VAR31 + 1)
44
45 struct radv_shader_context {
46 struct ac_llvm_context ac;
47 const struct nir_shader *shader;
48 struct ac_shader_abi abi;
49 const struct radv_shader_args *args;
50
51 gl_shader_stage stage;
52
53 unsigned max_workgroup_size;
54 LLVMContextRef context;
55 LLVMValueRef main_function;
56
57 LLVMValueRef descriptor_sets[MAX_SETS];
58
59 LLVMValueRef ring_offsets;
60
61 LLVMValueRef rel_auto_id;
62
63 LLVMValueRef gs_wave_id;
64 LLVMValueRef gs_vtx_offset[6];
65
66 LLVMValueRef esgs_ring;
67 LLVMValueRef gsvs_ring[4];
68 LLVMValueRef hs_ring_tess_offchip;
69 LLVMValueRef hs_ring_tess_factor;
70
71 LLVMValueRef inputs[RADEON_LLVM_MAX_INPUTS * 4];
72
73 uint64_t output_mask;
74
75 LLVMValueRef gs_next_vertex[4];
76 LLVMValueRef gs_curprim_verts[4];
77 LLVMValueRef gs_generated_prims[4];
78 LLVMValueRef gs_ngg_emit;
79 LLVMValueRef gs_ngg_scratch;
80
81 uint32_t tcs_num_inputs;
82 uint32_t tcs_num_patches;
83
84 LLVMValueRef vertexptr; /* GFX10 only */
85 };
86
87 struct radv_shader_output_values {
88 LLVMValueRef values[4];
89 unsigned slot_name;
90 unsigned slot_index;
91 unsigned usage_mask;
92 };
93
94 static inline struct radv_shader_context *
95 radv_shader_context_from_abi(struct ac_shader_abi *abi)
96 {
97 struct radv_shader_context *ctx = NULL;
98 return container_of(abi, ctx, abi);
99 }
100
101 static LLVMValueRef get_rel_patch_id(struct radv_shader_context *ctx)
102 {
103 switch (ctx->stage) {
104 case MESA_SHADER_TESS_CTRL:
105 return ac_unpack_param(&ctx->ac,
106 ac_get_arg(&ctx->ac, ctx->args->ac.tcs_rel_ids),
107 0, 8);
108 case MESA_SHADER_TESS_EVAL:
109 return ac_get_arg(&ctx->ac, ctx->args->tes_rel_patch_id);
110 break;
111 default:
112 unreachable("Illegal stage");
113 }
114 }
115
116 /* Tessellation shaders pass outputs to the next shader using LDS.
117 *
118 * LS outputs = TCS inputs
119 * TCS outputs = TES inputs
120 *
121 * The LDS layout is:
122 * - TCS inputs for patch 0
123 * - TCS inputs for patch 1
124 * - TCS inputs for patch 2 = get_tcs_in_current_patch_offset (if RelPatchID==2)
125 * - ...
126 * - TCS outputs for patch 0 = get_tcs_out_patch0_offset
127 * - Per-patch TCS outputs for patch 0 = get_tcs_out_patch0_patch_data_offset
128 * - TCS outputs for patch 1
129 * - Per-patch TCS outputs for patch 1
130 * - TCS outputs for patch 2 = get_tcs_out_current_patch_offset (if RelPatchID==2)
131 * - Per-patch TCS outputs for patch 2 = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
132 * - ...
133 *
134 * All three shaders VS(LS), TCS, TES share the same LDS space.
135 */
136 static LLVMValueRef
137 get_tcs_in_patch_stride(struct radv_shader_context *ctx)
138 {
139 assert(ctx->stage == MESA_SHADER_TESS_CTRL);
140 uint32_t input_vertex_size = ctx->tcs_num_inputs * 16;
141 uint32_t input_patch_size = ctx->args->options->key.tcs.input_vertices * input_vertex_size;
142
143 input_patch_size /= 4;
144 return LLVMConstInt(ctx->ac.i32, input_patch_size, false);
145 }
146
147 static LLVMValueRef
148 get_tcs_out_patch_stride(struct radv_shader_context *ctx)
149 {
150 uint32_t num_tcs_outputs = util_last_bit64(ctx->args->shader_info->tcs.outputs_written);
151 uint32_t num_tcs_patch_outputs = util_last_bit64(ctx->args->shader_info->tcs.patch_outputs_written);
152 uint32_t output_vertex_size = num_tcs_outputs * 16;
153 uint32_t pervertex_output_patch_size = ctx->shader->info.tess.tcs_vertices_out * output_vertex_size;
154 uint32_t output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16;
155 output_patch_size /= 4;
156 return LLVMConstInt(ctx->ac.i32, output_patch_size, false);
157 }
158
159 static LLVMValueRef
160 get_tcs_out_vertex_stride(struct radv_shader_context *ctx)
161 {
162 uint32_t num_tcs_outputs = util_last_bit64(ctx->args->shader_info->tcs.outputs_written);
163 uint32_t output_vertex_size = num_tcs_outputs * 16;
164 output_vertex_size /= 4;
165 return LLVMConstInt(ctx->ac.i32, output_vertex_size, false);
166 }
167
168 static LLVMValueRef
169 get_tcs_out_patch0_offset(struct radv_shader_context *ctx)
170 {
171 assert (ctx->stage == MESA_SHADER_TESS_CTRL);
172 uint32_t input_vertex_size = ctx->tcs_num_inputs * 16;
173 uint32_t input_patch_size = ctx->args->options->key.tcs.input_vertices * input_vertex_size;
174 uint32_t output_patch0_offset = input_patch_size;
175 unsigned num_patches = ctx->tcs_num_patches;
176
177 output_patch0_offset *= num_patches;
178 output_patch0_offset /= 4;
179 return LLVMConstInt(ctx->ac.i32, output_patch0_offset, false);
180 }
181
182 static LLVMValueRef
183 get_tcs_out_patch0_patch_data_offset(struct radv_shader_context *ctx)
184 {
185 assert (ctx->stage == MESA_SHADER_TESS_CTRL);
186 uint32_t input_vertex_size = ctx->tcs_num_inputs * 16;
187 uint32_t input_patch_size = ctx->args->options->key.tcs.input_vertices * input_vertex_size;
188 uint32_t output_patch0_offset = input_patch_size;
189
190 uint32_t num_tcs_outputs = util_last_bit64(ctx->args->shader_info->tcs.outputs_written);
191 uint32_t output_vertex_size = num_tcs_outputs * 16;
192 uint32_t pervertex_output_patch_size = ctx->shader->info.tess.tcs_vertices_out * output_vertex_size;
193 unsigned num_patches = ctx->tcs_num_patches;
194
195 output_patch0_offset *= num_patches;
196 output_patch0_offset += pervertex_output_patch_size;
197 output_patch0_offset /= 4;
198 return LLVMConstInt(ctx->ac.i32, output_patch0_offset, false);
199 }
200
201 static LLVMValueRef
202 get_tcs_in_current_patch_offset(struct radv_shader_context *ctx)
203 {
204 LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx);
205 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
206
207 return LLVMBuildMul(ctx->ac.builder, patch_stride, rel_patch_id, "");
208 }
209
210 static LLVMValueRef
211 get_tcs_out_current_patch_offset(struct radv_shader_context *ctx)
212 {
213 LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx);
214 LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
215 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
216
217 return ac_build_imad(&ctx->ac, patch_stride, rel_patch_id,
218 patch0_offset);
219 }
220
221 static LLVMValueRef
222 get_tcs_out_current_patch_data_offset(struct radv_shader_context *ctx)
223 {
224 LLVMValueRef patch0_patch_data_offset =
225 get_tcs_out_patch0_patch_data_offset(ctx);
226 LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
227 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
228
229 return ac_build_imad(&ctx->ac, patch_stride, rel_patch_id,
230 patch0_patch_data_offset);
231 }
232
233 static LLVMValueRef
234 create_llvm_function(struct ac_llvm_context *ctx, LLVMModuleRef module,
235 LLVMBuilderRef builder,
236 const struct ac_shader_args *args,
237 enum ac_llvm_calling_convention convention,
238 unsigned max_workgroup_size,
239 const struct radv_nir_compiler_options *options)
240 {
241 LLVMValueRef main_function =
242 ac_build_main(args, ctx, convention, "main", ctx->voidt, module);
243
244 if (options->address32_hi) {
245 ac_llvm_add_target_dep_function_attr(main_function,
246 "amdgpu-32bit-address-high-bits",
247 options->address32_hi);
248 }
249
250 ac_llvm_set_workgroup_size(main_function, max_workgroup_size);
251
252 return main_function;
253 }
254
255 static void
256 load_descriptor_sets(struct radv_shader_context *ctx)
257 {
258 uint32_t mask = ctx->args->shader_info->desc_set_used_mask;
259 if (ctx->args->shader_info->need_indirect_descriptor_sets) {
260 LLVMValueRef desc_sets =
261 ac_get_arg(&ctx->ac, ctx->args->descriptor_sets[0]);
262 while (mask) {
263 int i = u_bit_scan(&mask);
264
265 ctx->descriptor_sets[i] =
266 ac_build_load_to_sgpr(&ctx->ac, desc_sets,
267 LLVMConstInt(ctx->ac.i32, i, false));
268
269 }
270 } else {
271 while (mask) {
272 int i = u_bit_scan(&mask);
273
274 ctx->descriptor_sets[i] =
275 ac_get_arg(&ctx->ac, ctx->args->descriptor_sets[i]);
276 }
277 }
278 }
279
280 static enum ac_llvm_calling_convention
281 get_llvm_calling_convention(LLVMValueRef func, gl_shader_stage stage)
282 {
283 switch (stage) {
284 case MESA_SHADER_VERTEX:
285 case MESA_SHADER_TESS_EVAL:
286 return AC_LLVM_AMDGPU_VS;
287 break;
288 case MESA_SHADER_GEOMETRY:
289 return AC_LLVM_AMDGPU_GS;
290 break;
291 case MESA_SHADER_TESS_CTRL:
292 return AC_LLVM_AMDGPU_HS;
293 break;
294 case MESA_SHADER_FRAGMENT:
295 return AC_LLVM_AMDGPU_PS;
296 break;
297 case MESA_SHADER_COMPUTE:
298 return AC_LLVM_AMDGPU_CS;
299 break;
300 default:
301 unreachable("Unhandle shader type");
302 }
303 }
304
305 /* Returns whether the stage is a stage that can be directly before the GS */
306 static bool is_pre_gs_stage(gl_shader_stage stage)
307 {
308 return stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_TESS_EVAL;
309 }
310
311 static void create_function(struct radv_shader_context *ctx,
312 gl_shader_stage stage,
313 bool has_previous_stage)
314 {
315 if (ctx->ac.chip_class >= GFX10) {
316 if (is_pre_gs_stage(stage) && ctx->args->options->key.vs_common_out.as_ngg) {
317 /* On GFX10, VS is merged into GS for NGG. */
318 stage = MESA_SHADER_GEOMETRY;
319 has_previous_stage = true;
320 }
321 }
322
323 ctx->main_function = create_llvm_function(
324 &ctx->ac, ctx->ac.module, ctx->ac.builder, &ctx->args->ac,
325 get_llvm_calling_convention(ctx->main_function, stage),
326 ctx->max_workgroup_size,
327 ctx->args->options);
328
329 ctx->ring_offsets = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.implicit.buffer.ptr",
330 LLVMPointerType(ctx->ac.i8, AC_ADDR_SPACE_CONST),
331 NULL, 0, AC_FUNC_ATTR_READNONE);
332 ctx->ring_offsets = LLVMBuildBitCast(ctx->ac.builder, ctx->ring_offsets,
333 ac_array_in_const_addr_space(ctx->ac.v4i32), "");
334
335 load_descriptor_sets(ctx);
336
337 if (stage == MESA_SHADER_TESS_CTRL ||
338 (stage == MESA_SHADER_VERTEX && ctx->args->options->key.vs_common_out.as_ls) ||
339 /* GFX9 has the ESGS ring buffer in LDS. */
340 (stage == MESA_SHADER_GEOMETRY && has_previous_stage)) {
341 ac_declare_lds_as_pointer(&ctx->ac);
342 }
343
344 }
345
346
347 static LLVMValueRef
348 radv_load_resource(struct ac_shader_abi *abi, LLVMValueRef index,
349 unsigned desc_set, unsigned binding)
350 {
351 struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
352 LLVMValueRef desc_ptr = ctx->descriptor_sets[desc_set];
353 struct radv_pipeline_layout *pipeline_layout = ctx->args->options->layout;
354 struct radv_descriptor_set_layout *layout = pipeline_layout->set[desc_set].layout;
355 unsigned base_offset = layout->binding[binding].offset;
356 LLVMValueRef offset, stride;
357
358 if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
359 layout->binding[binding].type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) {
360 unsigned idx = pipeline_layout->set[desc_set].dynamic_offset_start +
361 layout->binding[binding].dynamic_offset_offset;
362 desc_ptr = ac_get_arg(&ctx->ac, ctx->args->ac.push_constants);
363 base_offset = pipeline_layout->push_constant_size + 16 * idx;
364 stride = LLVMConstInt(ctx->ac.i32, 16, false);
365 } else
366 stride = LLVMConstInt(ctx->ac.i32, layout->binding[binding].size, false);
367
368 offset = LLVMConstInt(ctx->ac.i32, base_offset, false);
369
370 if (layout->binding[binding].type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
371 offset = ac_build_imad(&ctx->ac, index, stride, offset);
372 }
373
374 desc_ptr = LLVMBuildGEP(ctx->ac.builder, desc_ptr, &offset, 1, "");
375 desc_ptr = ac_cast_ptr(&ctx->ac, desc_ptr, ctx->ac.v4i32);
376 LLVMSetMetadata(desc_ptr, ctx->ac.uniform_md_kind, ctx->ac.empty_md);
377
378 if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
379 uint32_t desc_type = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
380 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
381 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
382 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
383
384 if (ctx->ac.chip_class >= GFX10) {
385 desc_type |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
386 S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) |
387 S_008F0C_RESOURCE_LEVEL(1);
388 } else {
389 desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
390 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
391 }
392
393 LLVMValueRef desc_components[4] = {
394 LLVMBuildPtrToInt(ctx->ac.builder, desc_ptr, ctx->ac.intptr, ""),
395 LLVMConstInt(ctx->ac.i32, S_008F04_BASE_ADDRESS_HI(ctx->args->options->address32_hi), false),
396 /* High limit to support variable sizes. */
397 LLVMConstInt(ctx->ac.i32, 0xffffffff, false),
398 LLVMConstInt(ctx->ac.i32, desc_type, false),
399 };
400
401 return ac_build_gather_values(&ctx->ac, desc_components, 4);
402 }
403
404 return desc_ptr;
405 }
406
407
408 /* The offchip buffer layout for TCS->TES is
409 *
410 * - attribute 0 of patch 0 vertex 0
411 * - attribute 0 of patch 0 vertex 1
412 * - attribute 0 of patch 0 vertex 2
413 * ...
414 * - attribute 0 of patch 1 vertex 0
415 * - attribute 0 of patch 1 vertex 1
416 * ...
417 * - attribute 1 of patch 0 vertex 0
418 * - attribute 1 of patch 0 vertex 1
419 * ...
420 * - per patch attribute 0 of patch 0
421 * - per patch attribute 0 of patch 1
422 * ...
423 *
424 * Note that every attribute has 4 components.
425 */
426 static LLVMValueRef get_non_vertex_index_offset(struct radv_shader_context *ctx)
427 {
428 uint32_t num_patches = ctx->tcs_num_patches;
429 uint32_t num_tcs_outputs;
430 if (ctx->stage == MESA_SHADER_TESS_CTRL)
431 num_tcs_outputs = util_last_bit64(ctx->args->shader_info->tcs.outputs_written);
432 else
433 num_tcs_outputs = ctx->args->options->key.tes.tcs_num_outputs;
434
435 uint32_t output_vertex_size = num_tcs_outputs * 16;
436 uint32_t pervertex_output_patch_size = ctx->shader->info.tess.tcs_vertices_out * output_vertex_size;
437
438 return LLVMConstInt(ctx->ac.i32, pervertex_output_patch_size * num_patches, false);
439 }
440
441 static LLVMValueRef calc_param_stride(struct radv_shader_context *ctx,
442 LLVMValueRef vertex_index)
443 {
444 LLVMValueRef param_stride;
445 if (vertex_index)
446 param_stride = LLVMConstInt(ctx->ac.i32, ctx->shader->info.tess.tcs_vertices_out * ctx->tcs_num_patches, false);
447 else
448 param_stride = LLVMConstInt(ctx->ac.i32, ctx->tcs_num_patches, false);
449 return param_stride;
450 }
451
452 static LLVMValueRef get_tcs_tes_buffer_address(struct radv_shader_context *ctx,
453 LLVMValueRef vertex_index,
454 LLVMValueRef param_index)
455 {
456 LLVMValueRef base_addr;
457 LLVMValueRef param_stride, constant16;
458 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
459 LLVMValueRef vertices_per_patch = LLVMConstInt(ctx->ac.i32, ctx->shader->info.tess.tcs_vertices_out, false);
460 constant16 = LLVMConstInt(ctx->ac.i32, 16, false);
461 param_stride = calc_param_stride(ctx, vertex_index);
462 if (vertex_index) {
463 base_addr = ac_build_imad(&ctx->ac, rel_patch_id,
464 vertices_per_patch, vertex_index);
465 } else {
466 base_addr = rel_patch_id;
467 }
468
469 base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr,
470 LLVMBuildMul(ctx->ac.builder, param_index,
471 param_stride, ""), "");
472
473 base_addr = LLVMBuildMul(ctx->ac.builder, base_addr, constant16, "");
474
475 if (!vertex_index) {
476 LLVMValueRef patch_data_offset = get_non_vertex_index_offset(ctx);
477
478 base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr,
479 patch_data_offset, "");
480 }
481 return base_addr;
482 }
483
484 static LLVMValueRef get_tcs_tes_buffer_address_params(struct radv_shader_context *ctx,
485 unsigned param,
486 unsigned const_index,
487 bool is_compact,
488 LLVMValueRef vertex_index,
489 LLVMValueRef indir_index)
490 {
491 LLVMValueRef param_index;
492
493 if (indir_index)
494 param_index = LLVMBuildAdd(ctx->ac.builder, LLVMConstInt(ctx->ac.i32, param, false),
495 indir_index, "");
496 else {
497 if (const_index && !is_compact)
498 param += const_index;
499 param_index = LLVMConstInt(ctx->ac.i32, param, false);
500 }
501 return get_tcs_tes_buffer_address(ctx, vertex_index, param_index);
502 }
503
504 static LLVMValueRef
505 get_dw_address(struct radv_shader_context *ctx,
506 LLVMValueRef dw_addr,
507 unsigned param,
508 unsigned const_index,
509 bool compact_const_index,
510 LLVMValueRef vertex_index,
511 LLVMValueRef stride,
512 LLVMValueRef indir_index)
513
514 {
515
516 if (vertex_index) {
517 dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr,
518 LLVMBuildMul(ctx->ac.builder,
519 vertex_index,
520 stride, ""), "");
521 }
522
523 if (indir_index)
524 dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr,
525 LLVMBuildMul(ctx->ac.builder, indir_index,
526 LLVMConstInt(ctx->ac.i32, 4, false), ""), "");
527 else if (const_index && !compact_const_index)
528 dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr,
529 LLVMConstInt(ctx->ac.i32, const_index * 4, false), "");
530
531 dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr,
532 LLVMConstInt(ctx->ac.i32, param * 4, false), "");
533
534 if (const_index && compact_const_index)
535 dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr,
536 LLVMConstInt(ctx->ac.i32, const_index, false), "");
537 return dw_addr;
538 }
539
540 static LLVMValueRef
541 load_tcs_varyings(struct ac_shader_abi *abi,
542 LLVMTypeRef type,
543 LLVMValueRef vertex_index,
544 LLVMValueRef indir_index,
545 unsigned const_index,
546 unsigned location,
547 unsigned driver_location,
548 unsigned component,
549 unsigned num_components,
550 bool is_patch,
551 bool is_compact,
552 bool load_input)
553 {
554 struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
555 LLVMValueRef dw_addr, stride;
556 LLVMValueRef value[4], result;
557 unsigned param = shader_io_get_unique_index(location);
558
559 if (load_input) {
560 uint32_t input_vertex_size = (ctx->tcs_num_inputs * 16) / 4;
561 stride = LLVMConstInt(ctx->ac.i32, input_vertex_size, false);
562 dw_addr = get_tcs_in_current_patch_offset(ctx);
563 } else {
564 if (!is_patch) {
565 stride = get_tcs_out_vertex_stride(ctx);
566 dw_addr = get_tcs_out_current_patch_offset(ctx);
567 } else {
568 dw_addr = get_tcs_out_current_patch_data_offset(ctx);
569 stride = NULL;
570 }
571 }
572
573 dw_addr = get_dw_address(ctx, dw_addr, param, const_index, is_compact, vertex_index, stride,
574 indir_index);
575
576 for (unsigned i = 0; i < num_components + component; i++) {
577 value[i] = ac_lds_load(&ctx->ac, dw_addr);
578 dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr,
579 ctx->ac.i32_1, "");
580 }
581 result = ac_build_varying_gather_values(&ctx->ac, value, num_components, component);
582 return result;
583 }
584
585 static void
586 store_tcs_output(struct ac_shader_abi *abi,
587 const nir_variable *var,
588 LLVMValueRef vertex_index,
589 LLVMValueRef param_index,
590 unsigned const_index,
591 LLVMValueRef src,
592 unsigned writemask,
593 unsigned component,
594 unsigned driver_location)
595 {
596 struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
597 const unsigned location = var->data.location;
598 const bool is_patch = var->data.patch;
599 const bool is_compact = var->data.compact;
600 LLVMValueRef dw_addr;
601 LLVMValueRef stride = NULL;
602 LLVMValueRef buf_addr = NULL;
603 LLVMValueRef oc_lds = ac_get_arg(&ctx->ac, ctx->args->oc_lds);
604 unsigned param;
605 bool store_lds = true;
606
607 if (is_patch) {
608 if (!(ctx->shader->info.patch_outputs_read & (1U << (location - VARYING_SLOT_PATCH0))))
609 store_lds = false;
610 } else {
611 if (!(ctx->shader->info.outputs_read & (1ULL << location)))
612 store_lds = false;
613 }
614
615 param = shader_io_get_unique_index(location);
616 if ((location == VARYING_SLOT_CLIP_DIST0 || location == VARYING_SLOT_CLIP_DIST1) && is_compact) {
617 const_index += component;
618 component = 0;
619
620 if (const_index >= 4) {
621 const_index -= 4;
622 param++;
623 }
624 }
625
626 if (!is_patch) {
627 stride = get_tcs_out_vertex_stride(ctx);
628 dw_addr = get_tcs_out_current_patch_offset(ctx);
629 } else {
630 dw_addr = get_tcs_out_current_patch_data_offset(ctx);
631 }
632
633 dw_addr = get_dw_address(ctx, dw_addr, param, const_index, is_compact, vertex_index, stride,
634 param_index);
635 buf_addr = get_tcs_tes_buffer_address_params(ctx, param, const_index, is_compact,
636 vertex_index, param_index);
637
638 bool is_tess_factor = false;
639 if (location == VARYING_SLOT_TESS_LEVEL_INNER ||
640 location == VARYING_SLOT_TESS_LEVEL_OUTER)
641 is_tess_factor = true;
642
643 unsigned base = is_compact ? const_index : 0;
644 for (unsigned chan = 0; chan < 8; chan++) {
645 if (!(writemask & (1 << chan)))
646 continue;
647 LLVMValueRef value = ac_llvm_extract_elem(&ctx->ac, src, chan - component);
648 value = ac_to_integer(&ctx->ac, value);
649 value = LLVMBuildZExtOrBitCast(ctx->ac.builder, value, ctx->ac.i32, "");
650
651 if (store_lds || is_tess_factor) {
652 LLVMValueRef dw_addr_chan =
653 LLVMBuildAdd(ctx->ac.builder, dw_addr,
654 LLVMConstInt(ctx->ac.i32, chan, false), "");
655 ac_lds_store(&ctx->ac, dw_addr_chan, value);
656 }
657
658 if (!is_tess_factor && writemask != 0xF)
659 ac_build_buffer_store_dword(&ctx->ac, ctx->hs_ring_tess_offchip, value, 1,
660 buf_addr, oc_lds,
661 4 * (base + chan), ac_glc);
662 }
663
664 if (writemask == 0xF) {
665 ac_build_buffer_store_dword(&ctx->ac, ctx->hs_ring_tess_offchip, src, 4,
666 buf_addr, oc_lds,
667 (base * 4), ac_glc);
668 }
669 }
670
671 static LLVMValueRef
672 load_tes_input(struct ac_shader_abi *abi,
673 LLVMTypeRef type,
674 LLVMValueRef vertex_index,
675 LLVMValueRef param_index,
676 unsigned const_index,
677 unsigned location,
678 unsigned driver_location,
679 unsigned component,
680 unsigned num_components,
681 bool is_patch,
682 bool is_compact,
683 bool load_input)
684 {
685 struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
686 LLVMValueRef buf_addr;
687 LLVMValueRef result;
688 LLVMValueRef oc_lds = ac_get_arg(&ctx->ac, ctx->args->oc_lds);
689 unsigned param = shader_io_get_unique_index(location);
690
691 if ((location == VARYING_SLOT_CLIP_DIST0 || location == VARYING_SLOT_CLIP_DIST1) && is_compact) {
692 const_index += component;
693 component = 0;
694 if (const_index >= 4) {
695 const_index -= 4;
696 param++;
697 }
698 }
699
700 buf_addr = get_tcs_tes_buffer_address_params(ctx, param, const_index,
701 is_compact, vertex_index, param_index);
702
703 LLVMValueRef comp_offset = LLVMConstInt(ctx->ac.i32, component * 4, false);
704 buf_addr = LLVMBuildAdd(ctx->ac.builder, buf_addr, comp_offset, "");
705
706 result = ac_build_buffer_load(&ctx->ac, ctx->hs_ring_tess_offchip, num_components, NULL,
707 buf_addr, oc_lds, is_compact ? (4 * const_index) : 0, ac_glc, true, false);
708 result = ac_trim_vector(&ctx->ac, result, num_components);
709 return result;
710 }
711
712 static LLVMValueRef
713 radv_emit_fetch_64bit(struct radv_shader_context *ctx,
714 LLVMTypeRef type, LLVMValueRef a, LLVMValueRef b)
715 {
716 LLVMValueRef values[2] = {
717 ac_to_integer(&ctx->ac, a),
718 ac_to_integer(&ctx->ac, b),
719 };
720 LLVMValueRef result = ac_build_gather_values(&ctx->ac, values, 2);
721 return LLVMBuildBitCast(ctx->ac.builder, result, type, "");
722 }
723
724 static LLVMValueRef
725 load_gs_input(struct ac_shader_abi *abi,
726 unsigned location,
727 unsigned driver_location,
728 unsigned component,
729 unsigned num_components,
730 unsigned vertex_index,
731 unsigned const_index,
732 LLVMTypeRef type)
733 {
734 struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
735 LLVMValueRef vtx_offset;
736 unsigned param, vtx_offset_param;
737 LLVMValueRef value[4], result;
738
739 vtx_offset_param = vertex_index;
740 assert(vtx_offset_param < 6);
741 vtx_offset = LLVMBuildMul(ctx->ac.builder, ctx->gs_vtx_offset[vtx_offset_param],
742 LLVMConstInt(ctx->ac.i32, 4, false), "");
743
744 param = shader_io_get_unique_index(location);
745
746 for (unsigned i = component; i < num_components + component; i++) {
747 if (ctx->ac.chip_class >= GFX9) {
748 LLVMValueRef dw_addr = ctx->gs_vtx_offset[vtx_offset_param];
749 dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr,
750 LLVMConstInt(ctx->ac.i32, param * 4 + i + const_index, 0), "");
751 value[i] = ac_lds_load(&ctx->ac, dw_addr);
752
753 if (ac_get_type_size(type) == 8) {
754 dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr,
755 LLVMConstInt(ctx->ac.i32, param * 4 + i + const_index + 1, 0), "");
756 LLVMValueRef tmp = ac_lds_load(&ctx->ac, dw_addr);
757
758 value[i] = radv_emit_fetch_64bit(ctx, type, value[i], tmp);
759 }
760 } else {
761 LLVMValueRef soffset =
762 LLVMConstInt(ctx->ac.i32,
763 (param * 4 + i + const_index) * 256,
764 false);
765
766 value[i] = ac_build_buffer_load(&ctx->ac,
767 ctx->esgs_ring, 1,
768 ctx->ac.i32_0,
769 vtx_offset, soffset,
770 0, ac_glc, true, false);
771
772 if (ac_get_type_size(type) == 8) {
773 soffset = LLVMConstInt(ctx->ac.i32,
774 (param * 4 + i + const_index + 1) * 256,
775 false);
776
777 LLVMValueRef tmp =
778 ac_build_buffer_load(&ctx->ac,
779 ctx->esgs_ring, 1,
780 ctx->ac.i32_0,
781 vtx_offset, soffset,
782 0, ac_glc, true, false);
783
784 value[i] = radv_emit_fetch_64bit(ctx, type, value[i], tmp);
785 }
786 }
787
788 if (ac_get_type_size(type) == 2) {
789 value[i] = LLVMBuildBitCast(ctx->ac.builder, value[i], ctx->ac.i32, "");
790 value[i] = LLVMBuildTrunc(ctx->ac.builder, value[i], ctx->ac.i16, "");
791 }
792 value[i] = LLVMBuildBitCast(ctx->ac.builder, value[i], type, "");
793 }
794 result = ac_build_varying_gather_values(&ctx->ac, value, num_components, component);
795 result = ac_to_integer(&ctx->ac, result);
796 return result;
797 }
798
799 static uint32_t
800 radv_get_sample_pos_offset(uint32_t num_samples)
801 {
802 uint32_t sample_pos_offset = 0;
803
804 switch (num_samples) {
805 case 2:
806 sample_pos_offset = 1;
807 break;
808 case 4:
809 sample_pos_offset = 3;
810 break;
811 case 8:
812 sample_pos_offset = 7;
813 break;
814 default:
815 break;
816 }
817 return sample_pos_offset;
818 }
819
820 static LLVMValueRef load_sample_position(struct ac_shader_abi *abi,
821 LLVMValueRef sample_id)
822 {
823 struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
824
825 LLVMValueRef result;
826 LLVMValueRef index = LLVMConstInt(ctx->ac.i32, RING_PS_SAMPLE_POSITIONS, false);
827 LLVMValueRef ptr = LLVMBuildGEP(ctx->ac.builder, ctx->ring_offsets, &index, 1, "");
828
829 ptr = LLVMBuildBitCast(ctx->ac.builder, ptr,
830 ac_array_in_const_addr_space(ctx->ac.v2f32), "");
831
832 uint32_t sample_pos_offset =
833 radv_get_sample_pos_offset(ctx->args->options->key.fs.num_samples);
834
835 sample_id =
836 LLVMBuildAdd(ctx->ac.builder, sample_id,
837 LLVMConstInt(ctx->ac.i32, sample_pos_offset, false), "");
838 result = ac_build_load_invariant(&ctx->ac, ptr, sample_id);
839
840 return result;
841 }
842
843
844 static LLVMValueRef load_sample_mask_in(struct ac_shader_abi *abi)
845 {
846 struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
847 uint8_t log2_ps_iter_samples;
848
849 if (ctx->args->shader_info->ps.force_persample) {
850 log2_ps_iter_samples =
851 util_logbase2(ctx->args->options->key.fs.num_samples);
852 } else {
853 log2_ps_iter_samples = ctx->args->options->key.fs.log2_ps_iter_samples;
854 }
855
856 /* The bit pattern matches that used by fixed function fragment
857 * processing. */
858 static const uint16_t ps_iter_masks[] = {
859 0xffff, /* not used */
860 0x5555,
861 0x1111,
862 0x0101,
863 0x0001,
864 };
865 assert(log2_ps_iter_samples < ARRAY_SIZE(ps_iter_masks));
866
867 uint32_t ps_iter_mask = ps_iter_masks[log2_ps_iter_samples];
868
869 LLVMValueRef result, sample_id;
870 sample_id = ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.ancillary), 8, 4);
871 sample_id = LLVMBuildShl(ctx->ac.builder, LLVMConstInt(ctx->ac.i32, ps_iter_mask, false), sample_id, "");
872 result = LLVMBuildAnd(ctx->ac.builder, sample_id,
873 ac_get_arg(&ctx->ac, ctx->args->ac.sample_coverage), "");
874 return result;
875 }
876
877
878 static void gfx10_ngg_gs_emit_vertex(struct radv_shader_context *ctx,
879 unsigned stream,
880 LLVMValueRef vertexidx,
881 LLVMValueRef *addrs);
882
883 static void
884 visit_emit_vertex_with_counter(struct ac_shader_abi *abi, unsigned stream,
885 LLVMValueRef vertexidx, LLVMValueRef *addrs)
886 {
887 unsigned offset = 0;
888 struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
889
890 if (ctx->args->options->key.vs_common_out.as_ngg) {
891 gfx10_ngg_gs_emit_vertex(ctx, stream, vertexidx, addrs);
892 return;
893 }
894
895 for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
896 unsigned output_usage_mask =
897 ctx->args->shader_info->gs.output_usage_mask[i];
898 uint8_t output_stream =
899 ctx->args->shader_info->gs.output_streams[i];
900 LLVMValueRef *out_ptr = &addrs[i * 4];
901 int length = util_last_bit(output_usage_mask);
902
903 if (!(ctx->output_mask & (1ull << i)) ||
904 output_stream != stream)
905 continue;
906
907 for (unsigned j = 0; j < length; j++) {
908 if (!(output_usage_mask & (1 << j)))
909 continue;
910
911 LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder,
912 out_ptr[j], "");
913 LLVMValueRef voffset =
914 LLVMConstInt(ctx->ac.i32, offset *
915 ctx->shader->info.gs.vertices_out, false);
916
917 offset++;
918
919 voffset = LLVMBuildAdd(ctx->ac.builder, voffset, vertexidx, "");
920 voffset = LLVMBuildMul(ctx->ac.builder, voffset, LLVMConstInt(ctx->ac.i32, 4, false), "");
921
922 out_val = ac_to_integer(&ctx->ac, out_val);
923 out_val = LLVMBuildZExtOrBitCast(ctx->ac.builder, out_val, ctx->ac.i32, "");
924
925 ac_build_buffer_store_dword(&ctx->ac,
926 ctx->gsvs_ring[stream],
927 out_val, 1,
928 voffset,
929 ac_get_arg(&ctx->ac,
930 ctx->args->gs2vs_offset),
931 0, ac_glc | ac_slc | ac_swizzled);
932 }
933 }
934
935 ac_build_sendmsg(&ctx->ac,
936 AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
937 ctx->gs_wave_id);
938 }
939
940 static void
941 visit_end_primitive(struct ac_shader_abi *abi, unsigned stream)
942 {
943 struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
944
945 if (ctx->args->options->key.vs_common_out.as_ngg) {
946 LLVMBuildStore(ctx->ac.builder, ctx->ac.i32_0, ctx->gs_curprim_verts[stream]);
947 return;
948 }
949
950 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8), ctx->gs_wave_id);
951 }
952
953 static LLVMValueRef
954 load_tess_coord(struct ac_shader_abi *abi)
955 {
956 struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
957
958 LLVMValueRef coord[4] = {
959 ac_get_arg(&ctx->ac, ctx->args->tes_u),
960 ac_get_arg(&ctx->ac, ctx->args->tes_v),
961 ctx->ac.f32_0,
962 ctx->ac.f32_0,
963 };
964
965 if (ctx->shader->info.tess.primitive_mode == GL_TRIANGLES)
966 coord[2] = LLVMBuildFSub(ctx->ac.builder, ctx->ac.f32_1,
967 LLVMBuildFAdd(ctx->ac.builder, coord[0], coord[1], ""), "");
968
969 return ac_build_gather_values(&ctx->ac, coord, 3);
970 }
971
972 static LLVMValueRef
973 load_patch_vertices_in(struct ac_shader_abi *abi)
974 {
975 struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
976 return LLVMConstInt(ctx->ac.i32, ctx->args->options->key.tcs.input_vertices, false);
977 }
978
979
980 static LLVMValueRef radv_load_base_vertex(struct ac_shader_abi *abi)
981 {
982 struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
983 return ac_get_arg(&ctx->ac, ctx->args->ac.base_vertex);
984 }
985
986 static LLVMValueRef radv_load_ssbo(struct ac_shader_abi *abi,
987 LLVMValueRef buffer_ptr, bool write)
988 {
989 struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
990 LLVMValueRef result;
991
992 LLVMSetMetadata(buffer_ptr, ctx->ac.uniform_md_kind, ctx->ac.empty_md);
993
994 result = LLVMBuildLoad(ctx->ac.builder, buffer_ptr, "");
995 LLVMSetMetadata(result, ctx->ac.invariant_load_md_kind, ctx->ac.empty_md);
996
997 return result;
998 }
999
1000 static LLVMValueRef radv_load_ubo(struct ac_shader_abi *abi, LLVMValueRef buffer_ptr)
1001 {
1002 struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
1003 LLVMValueRef result;
1004
1005 if (LLVMGetTypeKind(LLVMTypeOf(buffer_ptr)) != LLVMPointerTypeKind) {
1006 /* Do not load the descriptor for inlined uniform blocks. */
1007 return buffer_ptr;
1008 }
1009
1010 LLVMSetMetadata(buffer_ptr, ctx->ac.uniform_md_kind, ctx->ac.empty_md);
1011
1012 result = LLVMBuildLoad(ctx->ac.builder, buffer_ptr, "");
1013 LLVMSetMetadata(result, ctx->ac.invariant_load_md_kind, ctx->ac.empty_md);
1014
1015 return result;
1016 }
1017
1018 static LLVMValueRef radv_get_sampler_desc(struct ac_shader_abi *abi,
1019 unsigned descriptor_set,
1020 unsigned base_index,
1021 unsigned constant_index,
1022 LLVMValueRef index,
1023 enum ac_descriptor_type desc_type,
1024 bool image, bool write,
1025 bool bindless)
1026 {
1027 struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
1028 LLVMValueRef list = ctx->descriptor_sets[descriptor_set];
1029 struct radv_descriptor_set_layout *layout = ctx->args->options->layout->set[descriptor_set].layout;
1030 struct radv_descriptor_set_binding_layout *binding = layout->binding + base_index;
1031 unsigned offset = binding->offset;
1032 unsigned stride = binding->size;
1033 unsigned type_size;
1034 LLVMBuilderRef builder = ctx->ac.builder;
1035 LLVMTypeRef type;
1036
1037 assert(base_index < layout->binding_count);
1038
1039 switch (desc_type) {
1040 case AC_DESC_IMAGE:
1041 type = ctx->ac.v8i32;
1042 type_size = 32;
1043 break;
1044 case AC_DESC_FMASK:
1045 type = ctx->ac.v8i32;
1046 offset += 32;
1047 type_size = 32;
1048 break;
1049 case AC_DESC_SAMPLER:
1050 type = ctx->ac.v4i32;
1051 if (binding->type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) {
1052 offset += radv_combined_image_descriptor_sampler_offset(binding);
1053 }
1054
1055 type_size = 16;
1056 break;
1057 case AC_DESC_BUFFER:
1058 type = ctx->ac.v4i32;
1059 type_size = 16;
1060 break;
1061 case AC_DESC_PLANE_0:
1062 case AC_DESC_PLANE_1:
1063 case AC_DESC_PLANE_2:
1064 type = ctx->ac.v8i32;
1065 type_size = 32;
1066 offset += 32 * (desc_type - AC_DESC_PLANE_0);
1067 break;
1068 default:
1069 unreachable("invalid desc_type\n");
1070 }
1071
1072 offset += constant_index * stride;
1073
1074 if (desc_type == AC_DESC_SAMPLER && binding->immutable_samplers_offset &&
1075 (!index || binding->immutable_samplers_equal)) {
1076 if (binding->immutable_samplers_equal)
1077 constant_index = 0;
1078
1079 const uint32_t *samplers = radv_immutable_samplers(layout, binding);
1080
1081 LLVMValueRef constants[] = {
1082 LLVMConstInt(ctx->ac.i32, samplers[constant_index * 4 + 0], 0),
1083 LLVMConstInt(ctx->ac.i32, samplers[constant_index * 4 + 1], 0),
1084 LLVMConstInt(ctx->ac.i32, samplers[constant_index * 4 + 2], 0),
1085 LLVMConstInt(ctx->ac.i32, samplers[constant_index * 4 + 3], 0),
1086 };
1087 return ac_build_gather_values(&ctx->ac, constants, 4);
1088 }
1089
1090 assert(stride % type_size == 0);
1091
1092 LLVMValueRef adjusted_index = index;
1093 if (!adjusted_index)
1094 adjusted_index = ctx->ac.i32_0;
1095
1096 adjusted_index = LLVMBuildMul(builder, adjusted_index, LLVMConstInt(ctx->ac.i32, stride / type_size, 0), "");
1097
1098 LLVMValueRef val_offset = LLVMConstInt(ctx->ac.i32, offset, 0);
1099 list = LLVMBuildGEP(builder, list, &val_offset, 1, "");
1100 list = LLVMBuildPointerCast(builder, list,
1101 ac_array_in_const32_addr_space(type), "");
1102
1103 LLVMValueRef descriptor = ac_build_load_to_sgpr(&ctx->ac, list, adjusted_index);
1104
1105 /* 3 plane formats always have same size and format for plane 1 & 2, so
1106 * use the tail from plane 1 so that we can store only the first 16 bytes
1107 * of the last plane. */
1108 if (desc_type == AC_DESC_PLANE_2) {
1109 LLVMValueRef descriptor2 = radv_get_sampler_desc(abi, descriptor_set, base_index, constant_index, index, AC_DESC_PLANE_1,image, write, bindless);
1110
1111 LLVMValueRef components[8];
1112 for (unsigned i = 0; i < 4; ++i)
1113 components[i] = ac_llvm_extract_elem(&ctx->ac, descriptor, i);
1114
1115 for (unsigned i = 4; i < 8; ++i)
1116 components[i] = ac_llvm_extract_elem(&ctx->ac, descriptor2, i);
1117 descriptor = ac_build_gather_values(&ctx->ac, components, 8);
1118 }
1119
1120 return descriptor;
1121 }
1122
1123 /* For 2_10_10_10 formats the alpha is handled as unsigned by pre-vega HW.
1124 * so we may need to fix it up. */
1125 static LLVMValueRef
1126 adjust_vertex_fetch_alpha(struct radv_shader_context *ctx,
1127 unsigned adjustment,
1128 LLVMValueRef alpha)
1129 {
1130 if (adjustment == RADV_ALPHA_ADJUST_NONE)
1131 return alpha;
1132
1133 LLVMValueRef c30 = LLVMConstInt(ctx->ac.i32, 30, 0);
1134
1135 alpha = LLVMBuildBitCast(ctx->ac.builder, alpha, ctx->ac.f32, "");
1136
1137 if (adjustment == RADV_ALPHA_ADJUST_SSCALED)
1138 alpha = LLVMBuildFPToUI(ctx->ac.builder, alpha, ctx->ac.i32, "");
1139 else
1140 alpha = ac_to_integer(&ctx->ac, alpha);
1141
1142 /* For the integer-like cases, do a natural sign extension.
1143 *
1144 * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
1145 * and happen to contain 0, 1, 2, 3 as the two LSBs of the
1146 * exponent.
1147 */
1148 alpha = LLVMBuildShl(ctx->ac.builder, alpha,
1149 adjustment == RADV_ALPHA_ADJUST_SNORM ?
1150 LLVMConstInt(ctx->ac.i32, 7, 0) : c30, "");
1151 alpha = LLVMBuildAShr(ctx->ac.builder, alpha, c30, "");
1152
1153 /* Convert back to the right type. */
1154 if (adjustment == RADV_ALPHA_ADJUST_SNORM) {
1155 LLVMValueRef clamp;
1156 LLVMValueRef neg_one = LLVMConstReal(ctx->ac.f32, -1.0);
1157 alpha = LLVMBuildSIToFP(ctx->ac.builder, alpha, ctx->ac.f32, "");
1158 clamp = LLVMBuildFCmp(ctx->ac.builder, LLVMRealULT, alpha, neg_one, "");
1159 alpha = LLVMBuildSelect(ctx->ac.builder, clamp, neg_one, alpha, "");
1160 } else if (adjustment == RADV_ALPHA_ADJUST_SSCALED) {
1161 alpha = LLVMBuildSIToFP(ctx->ac.builder, alpha, ctx->ac.f32, "");
1162 }
1163
1164 return LLVMBuildBitCast(ctx->ac.builder, alpha, ctx->ac.i32, "");
1165 }
1166
1167 static LLVMValueRef
1168 radv_fixup_vertex_input_fetches(struct radv_shader_context *ctx,
1169 LLVMValueRef value,
1170 unsigned num_channels,
1171 bool is_float)
1172 {
1173 LLVMValueRef zero = is_float ? ctx->ac.f32_0 : ctx->ac.i32_0;
1174 LLVMValueRef one = is_float ? ctx->ac.f32_1 : ctx->ac.i32_1;
1175 LLVMValueRef chan[4];
1176
1177 if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) {
1178 unsigned vec_size = LLVMGetVectorSize(LLVMTypeOf(value));
1179
1180 if (num_channels == 4 && num_channels == vec_size)
1181 return value;
1182
1183 num_channels = MIN2(num_channels, vec_size);
1184
1185 for (unsigned i = 0; i < num_channels; i++)
1186 chan[i] = ac_llvm_extract_elem(&ctx->ac, value, i);
1187 } else {
1188 assert(num_channels == 1);
1189 chan[0] = value;
1190 }
1191
1192 for (unsigned i = num_channels; i < 4; i++) {
1193 chan[i] = i == 3 ? one : zero;
1194 chan[i] = ac_to_integer(&ctx->ac, chan[i]);
1195 }
1196
1197 return ac_build_gather_values(&ctx->ac, chan, 4);
1198 }
1199
1200 static void
1201 handle_vs_input_decl(struct radv_shader_context *ctx,
1202 struct nir_variable *variable)
1203 {
1204 LLVMValueRef t_list_ptr = ac_get_arg(&ctx->ac, ctx->args->vertex_buffers);
1205 LLVMValueRef t_offset;
1206 LLVMValueRef t_list;
1207 LLVMValueRef input;
1208 LLVMValueRef buffer_index;
1209 unsigned attrib_count = glsl_count_attribute_slots(variable->type, true);
1210 uint8_t input_usage_mask =
1211 ctx->args->shader_info->vs.input_usage_mask[variable->data.location];
1212 unsigned num_input_channels = util_last_bit(input_usage_mask);
1213
1214 variable->data.driver_location = variable->data.location * 4;
1215
1216 enum glsl_base_type type = glsl_get_base_type(variable->type);
1217 for (unsigned i = 0; i < attrib_count; ++i) {
1218 LLVMValueRef output[4];
1219 unsigned attrib_index = variable->data.location + i - VERT_ATTRIB_GENERIC0;
1220 unsigned attrib_format = ctx->args->options->key.vs.vertex_attribute_formats[attrib_index];
1221 unsigned data_format = attrib_format & 0x0f;
1222 unsigned num_format = (attrib_format >> 4) & 0x07;
1223 bool is_float = num_format != V_008F0C_BUF_NUM_FORMAT_UINT &&
1224 num_format != V_008F0C_BUF_NUM_FORMAT_SINT;
1225
1226 if (ctx->args->options->key.vs.instance_rate_inputs & (1u << attrib_index)) {
1227 uint32_t divisor = ctx->args->options->key.vs.instance_rate_divisors[attrib_index];
1228
1229 if (divisor) {
1230 buffer_index = ctx->abi.instance_id;
1231
1232 if (divisor != 1) {
1233 buffer_index = LLVMBuildUDiv(ctx->ac.builder, buffer_index,
1234 LLVMConstInt(ctx->ac.i32, divisor, 0), "");
1235 }
1236 } else {
1237 buffer_index = ctx->ac.i32_0;
1238 }
1239
1240 buffer_index = LLVMBuildAdd(ctx->ac.builder,
1241 ac_get_arg(&ctx->ac,
1242 ctx->args->ac.start_instance),\
1243 buffer_index, "");
1244 } else {
1245 buffer_index = LLVMBuildAdd(ctx->ac.builder,
1246 ctx->abi.vertex_id,
1247 ac_get_arg(&ctx->ac,
1248 ctx->args->ac.base_vertex), "");
1249 }
1250
1251 const struct ac_data_format_info *vtx_info = ac_get_data_format_info(data_format);
1252
1253 /* Adjust the number of channels to load based on the vertex
1254 * attribute format.
1255 */
1256 unsigned num_channels = MIN2(num_input_channels, vtx_info->num_channels);
1257 unsigned attrib_binding = ctx->args->options->key.vs.vertex_attribute_bindings[attrib_index];
1258 unsigned attrib_offset = ctx->args->options->key.vs.vertex_attribute_offsets[attrib_index];
1259 unsigned attrib_stride = ctx->args->options->key.vs.vertex_attribute_strides[attrib_index];
1260
1261 if (ctx->args->options->key.vs.post_shuffle & (1 << attrib_index)) {
1262 /* Always load, at least, 3 channels for formats that
1263 * need to be shuffled because X<->Z.
1264 */
1265 num_channels = MAX2(num_channels, 3);
1266 }
1267
1268 t_offset = LLVMConstInt(ctx->ac.i32, attrib_binding, false);
1269 t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset);
1270
1271 /* Perform per-channel vertex fetch operations if unaligned
1272 * access are detected. Only GFX6 and GFX10 are affected.
1273 */
1274 bool unaligned_vertex_fetches = false;
1275 if ((ctx->ac.chip_class == GFX6 || ctx->ac.chip_class >= GFX10) &&
1276 vtx_info->chan_format != data_format &&
1277 ((attrib_offset % vtx_info->element_size) ||
1278 (attrib_stride % vtx_info->element_size)))
1279 unaligned_vertex_fetches = true;
1280
1281 if (unaligned_vertex_fetches) {
1282 unsigned chan_format = vtx_info->chan_format;
1283 LLVMValueRef values[4];
1284
1285 assert(ctx->ac.chip_class == GFX6 ||
1286 ctx->ac.chip_class >= GFX10);
1287
1288 for (unsigned chan = 0; chan < num_channels; chan++) {
1289 unsigned chan_offset = attrib_offset + chan * vtx_info->chan_byte_size;
1290 LLVMValueRef chan_index = buffer_index;
1291
1292 if (attrib_stride != 0 && chan_offset > attrib_stride) {
1293 LLVMValueRef buffer_offset =
1294 LLVMConstInt(ctx->ac.i32,
1295 chan_offset / attrib_stride, false);
1296
1297 chan_index = LLVMBuildAdd(ctx->ac.builder,
1298 buffer_index,
1299 buffer_offset, "");
1300
1301 chan_offset = chan_offset % attrib_stride;
1302 }
1303
1304 values[chan] = ac_build_struct_tbuffer_load(&ctx->ac, t_list,
1305 chan_index,
1306 LLVMConstInt(ctx->ac.i32, chan_offset, false),
1307 ctx->ac.i32_0, ctx->ac.i32_0, 1,
1308 chan_format, num_format, 0, true);
1309 }
1310
1311 input = ac_build_gather_values(&ctx->ac, values, num_channels);
1312 } else {
1313 if (attrib_stride != 0 && attrib_offset > attrib_stride) {
1314 LLVMValueRef buffer_offset =
1315 LLVMConstInt(ctx->ac.i32,
1316 attrib_offset / attrib_stride, false);
1317
1318 buffer_index = LLVMBuildAdd(ctx->ac.builder,
1319 buffer_index,
1320 buffer_offset, "");
1321
1322 attrib_offset = attrib_offset % attrib_stride;
1323 }
1324
1325 input = ac_build_struct_tbuffer_load(&ctx->ac, t_list,
1326 buffer_index,
1327 LLVMConstInt(ctx->ac.i32, attrib_offset, false),
1328 ctx->ac.i32_0, ctx->ac.i32_0,
1329 num_channels,
1330 data_format, num_format, 0, true);
1331 }
1332
1333 if (ctx->args->options->key.vs.post_shuffle & (1 << attrib_index)) {
1334 LLVMValueRef c[4];
1335 c[0] = ac_llvm_extract_elem(&ctx->ac, input, 2);
1336 c[1] = ac_llvm_extract_elem(&ctx->ac, input, 1);
1337 c[2] = ac_llvm_extract_elem(&ctx->ac, input, 0);
1338 c[3] = ac_llvm_extract_elem(&ctx->ac, input, 3);
1339
1340 input = ac_build_gather_values(&ctx->ac, c, 4);
1341 }
1342
1343 input = radv_fixup_vertex_input_fetches(ctx, input, num_channels,
1344 is_float);
1345
1346 for (unsigned chan = 0; chan < 4; chan++) {
1347 LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, chan, false);
1348 output[chan] = LLVMBuildExtractElement(ctx->ac.builder, input, llvm_chan, "");
1349 if (type == GLSL_TYPE_FLOAT16) {
1350 output[chan] = LLVMBuildBitCast(ctx->ac.builder, output[chan], ctx->ac.f32, "");
1351 output[chan] = LLVMBuildFPTrunc(ctx->ac.builder, output[chan], ctx->ac.f16, "");
1352 }
1353 }
1354
1355 unsigned alpha_adjust = (ctx->args->options->key.vs.alpha_adjust >> (attrib_index * 2)) & 3;
1356 output[3] = adjust_vertex_fetch_alpha(ctx, alpha_adjust, output[3]);
1357
1358 for (unsigned chan = 0; chan < 4; chan++) {
1359 output[chan] = ac_to_integer(&ctx->ac, output[chan]);
1360 if (type == GLSL_TYPE_UINT16 || type == GLSL_TYPE_INT16)
1361 output[chan] = LLVMBuildTrunc(ctx->ac.builder, output[chan], ctx->ac.i16, "");
1362
1363 ctx->inputs[ac_llvm_reg_index_soa(variable->data.location + i, chan)] = output[chan];
1364 }
1365 }
1366 }
1367
1368 static void
1369 handle_vs_inputs(struct radv_shader_context *ctx,
1370 struct nir_shader *nir) {
1371 nir_foreach_shader_in_variable(variable, nir)
1372 handle_vs_input_decl(ctx, variable);
1373 }
1374
1375 static void
1376 prepare_interp_optimize(struct radv_shader_context *ctx,
1377 struct nir_shader *nir)
1378 {
1379 bool uses_center = false;
1380 bool uses_centroid = false;
1381 nir_foreach_shader_in_variable(variable, nir) {
1382 if (glsl_get_base_type(glsl_without_array(variable->type)) != GLSL_TYPE_FLOAT ||
1383 variable->data.sample)
1384 continue;
1385
1386 if (variable->data.centroid)
1387 uses_centroid = true;
1388 else
1389 uses_center = true;
1390 }
1391
1392 ctx->abi.persp_centroid = ac_get_arg(&ctx->ac, ctx->args->ac.persp_centroid);
1393 ctx->abi.linear_centroid = ac_get_arg(&ctx->ac, ctx->args->ac.linear_centroid);
1394
1395 if (uses_center && uses_centroid) {
1396 LLVMValueRef sel = LLVMBuildICmp(ctx->ac.builder, LLVMIntSLT,
1397 ac_get_arg(&ctx->ac, ctx->args->ac.prim_mask),
1398 ctx->ac.i32_0, "");
1399 ctx->abi.persp_centroid =
1400 LLVMBuildSelect(ctx->ac.builder, sel,
1401 ac_get_arg(&ctx->ac, ctx->args->ac.persp_center),
1402 ctx->abi.persp_centroid, "");
1403 ctx->abi.linear_centroid =
1404 LLVMBuildSelect(ctx->ac.builder, sel,
1405 ac_get_arg(&ctx->ac, ctx->args->ac.linear_center),
1406 ctx->abi.linear_centroid, "");
1407 }
1408 }
1409
1410 static void
1411 scan_shader_output_decl(struct radv_shader_context *ctx,
1412 struct nir_variable *variable,
1413 struct nir_shader *shader,
1414 gl_shader_stage stage)
1415 {
1416 int idx = variable->data.location + variable->data.index;
1417 unsigned attrib_count = glsl_count_attribute_slots(variable->type, false);
1418 uint64_t mask_attribs;
1419
1420 variable->data.driver_location = idx * 4;
1421
1422 /* tess ctrl has it's own load/store paths for outputs */
1423 if (stage == MESA_SHADER_TESS_CTRL)
1424 return;
1425
1426 if (variable->data.compact) {
1427 unsigned component_count = variable->data.location_frac +
1428 glsl_get_length(variable->type);
1429 attrib_count = (component_count + 3) / 4;
1430 }
1431
1432 mask_attribs = ((1ull << attrib_count) - 1) << idx;
1433
1434 ctx->output_mask |= mask_attribs;
1435 }
1436
1437
1438 /* Initialize arguments for the shader export intrinsic */
1439 static void
1440 si_llvm_init_export_args(struct radv_shader_context *ctx,
1441 LLVMValueRef *values,
1442 unsigned enabled_channels,
1443 unsigned target,
1444 struct ac_export_args *args)
1445 {
1446 /* Specify the channels that are enabled. */
1447 args->enabled_channels = enabled_channels;
1448
1449 /* Specify whether the EXEC mask represents the valid mask */
1450 args->valid_mask = 0;
1451
1452 /* Specify whether this is the last export */
1453 args->done = 0;
1454
1455 /* Specify the target we are exporting */
1456 args->target = target;
1457
1458 args->compr = false;
1459 args->out[0] = LLVMGetUndef(ctx->ac.f32);
1460 args->out[1] = LLVMGetUndef(ctx->ac.f32);
1461 args->out[2] = LLVMGetUndef(ctx->ac.f32);
1462 args->out[3] = LLVMGetUndef(ctx->ac.f32);
1463
1464 if (!values)
1465 return;
1466
1467 bool is_16bit = ac_get_type_size(LLVMTypeOf(values[0])) == 2;
1468 if (ctx->stage == MESA_SHADER_FRAGMENT) {
1469 unsigned index = target - V_008DFC_SQ_EXP_MRT;
1470 unsigned col_format = (ctx->args->options->key.fs.col_format >> (4 * index)) & 0xf;
1471 bool is_int8 = (ctx->args->options->key.fs.is_int8 >> index) & 1;
1472 bool is_int10 = (ctx->args->options->key.fs.is_int10 >> index) & 1;
1473 unsigned chan;
1474
1475 LLVMValueRef (*packf)(struct ac_llvm_context *ctx, LLVMValueRef args[2]) = NULL;
1476 LLVMValueRef (*packi)(struct ac_llvm_context *ctx, LLVMValueRef args[2],
1477 unsigned bits, bool hi) = NULL;
1478
1479 switch(col_format) {
1480 case V_028714_SPI_SHADER_ZERO:
1481 args->enabled_channels = 0; /* writemask */
1482 args->target = V_008DFC_SQ_EXP_NULL;
1483 break;
1484
1485 case V_028714_SPI_SHADER_32_R:
1486 args->enabled_channels = 1;
1487 args->out[0] = values[0];
1488 break;
1489
1490 case V_028714_SPI_SHADER_32_GR:
1491 args->enabled_channels = 0x3;
1492 args->out[0] = values[0];
1493 args->out[1] = values[1];
1494 break;
1495
1496 case V_028714_SPI_SHADER_32_AR:
1497 if (ctx->ac.chip_class >= GFX10) {
1498 args->enabled_channels = 0x3;
1499 args->out[0] = values[0];
1500 args->out[1] = values[3];
1501 } else {
1502 args->enabled_channels = 0x9;
1503 args->out[0] = values[0];
1504 args->out[3] = values[3];
1505 }
1506 break;
1507
1508 case V_028714_SPI_SHADER_FP16_ABGR:
1509 args->enabled_channels = 0x5;
1510 packf = ac_build_cvt_pkrtz_f16;
1511 if (is_16bit) {
1512 for (unsigned chan = 0; chan < 4; chan++)
1513 values[chan] = LLVMBuildFPExt(ctx->ac.builder,
1514 values[chan],
1515 ctx->ac.f32, "");
1516 }
1517 break;
1518
1519 case V_028714_SPI_SHADER_UNORM16_ABGR:
1520 args->enabled_channels = 0x5;
1521 packf = ac_build_cvt_pknorm_u16;
1522 break;
1523
1524 case V_028714_SPI_SHADER_SNORM16_ABGR:
1525 args->enabled_channels = 0x5;
1526 packf = ac_build_cvt_pknorm_i16;
1527 break;
1528
1529 case V_028714_SPI_SHADER_UINT16_ABGR:
1530 args->enabled_channels = 0x5;
1531 packi = ac_build_cvt_pk_u16;
1532 if (is_16bit) {
1533 for (unsigned chan = 0; chan < 4; chan++)
1534 values[chan] = LLVMBuildZExt(ctx->ac.builder,
1535 ac_to_integer(&ctx->ac, values[chan]),
1536 ctx->ac.i32, "");
1537 }
1538 break;
1539
1540 case V_028714_SPI_SHADER_SINT16_ABGR:
1541 args->enabled_channels = 0x5;
1542 packi = ac_build_cvt_pk_i16;
1543 if (is_16bit) {
1544 for (unsigned chan = 0; chan < 4; chan++)
1545 values[chan] = LLVMBuildSExt(ctx->ac.builder,
1546 ac_to_integer(&ctx->ac, values[chan]),
1547 ctx->ac.i32, "");
1548 }
1549 break;
1550
1551 default:
1552 case V_028714_SPI_SHADER_32_ABGR:
1553 memcpy(&args->out[0], values, sizeof(values[0]) * 4);
1554 break;
1555 }
1556
1557 /* Replace NaN by zero (only 32-bit) to fix game bugs if
1558 * requested.
1559 */
1560 if (ctx->args->options->enable_mrt_output_nan_fixup &&
1561 !is_16bit &&
1562 (col_format == V_028714_SPI_SHADER_32_R ||
1563 col_format == V_028714_SPI_SHADER_32_GR ||
1564 col_format == V_028714_SPI_SHADER_32_AR ||
1565 col_format == V_028714_SPI_SHADER_32_ABGR ||
1566 col_format == V_028714_SPI_SHADER_FP16_ABGR)) {
1567 for (unsigned i = 0; i < 4; i++) {
1568 LLVMValueRef args[2] = {
1569 values[i],
1570 LLVMConstInt(ctx->ac.i32, S_NAN | Q_NAN, false)
1571 };
1572 LLVMValueRef isnan =
1573 ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.class.f32", ctx->ac.i1,
1574 args, 2, AC_FUNC_ATTR_READNONE);
1575 values[i] = LLVMBuildSelect(ctx->ac.builder, isnan,
1576 ctx->ac.f32_0,
1577 values[i], "");
1578 }
1579 }
1580
1581 /* Pack f16 or norm_i16/u16. */
1582 if (packf) {
1583 for (chan = 0; chan < 2; chan++) {
1584 LLVMValueRef pack_args[2] = {
1585 values[2 * chan],
1586 values[2 * chan + 1]
1587 };
1588 LLVMValueRef packed;
1589
1590 packed = packf(&ctx->ac, pack_args);
1591 args->out[chan] = ac_to_float(&ctx->ac, packed);
1592 }
1593 args->compr = 1; /* COMPR flag */
1594 }
1595
1596 /* Pack i16/u16. */
1597 if (packi) {
1598 for (chan = 0; chan < 2; chan++) {
1599 LLVMValueRef pack_args[2] = {
1600 ac_to_integer(&ctx->ac, values[2 * chan]),
1601 ac_to_integer(&ctx->ac, values[2 * chan + 1])
1602 };
1603 LLVMValueRef packed;
1604
1605 packed = packi(&ctx->ac, pack_args,
1606 is_int8 ? 8 : is_int10 ? 10 : 16,
1607 chan == 1);
1608 args->out[chan] = ac_to_float(&ctx->ac, packed);
1609 }
1610 args->compr = 1; /* COMPR flag */
1611 }
1612 return;
1613 }
1614
1615 if (is_16bit) {
1616 for (unsigned chan = 0; chan < 4; chan++) {
1617 values[chan] = LLVMBuildBitCast(ctx->ac.builder, values[chan], ctx->ac.i16, "");
1618 args->out[chan] = LLVMBuildZExt(ctx->ac.builder, values[chan], ctx->ac.i32, "");
1619 }
1620 } else
1621 memcpy(&args->out[0], values, sizeof(values[0]) * 4);
1622
1623 for (unsigned i = 0; i < 4; ++i)
1624 args->out[i] = ac_to_float(&ctx->ac, args->out[i]);
1625 }
1626
1627 static void
1628 radv_export_param(struct radv_shader_context *ctx, unsigned index,
1629 LLVMValueRef *values, unsigned enabled_channels)
1630 {
1631 struct ac_export_args args;
1632
1633 si_llvm_init_export_args(ctx, values, enabled_channels,
1634 V_008DFC_SQ_EXP_PARAM + index, &args);
1635 ac_build_export(&ctx->ac, &args);
1636 }
1637
1638 static LLVMValueRef
1639 radv_load_output(struct radv_shader_context *ctx, unsigned index, unsigned chan)
1640 {
1641 LLVMValueRef output = ctx->abi.outputs[ac_llvm_reg_index_soa(index, chan)];
1642 return LLVMBuildLoad(ctx->ac.builder, output, "");
1643 }
1644
1645 static void
1646 radv_emit_stream_output(struct radv_shader_context *ctx,
1647 LLVMValueRef const *so_buffers,
1648 LLVMValueRef const *so_write_offsets,
1649 const struct radv_stream_output *output,
1650 struct radv_shader_output_values *shader_out)
1651 {
1652 unsigned num_comps = util_bitcount(output->component_mask);
1653 unsigned buf = output->buffer;
1654 unsigned offset = output->offset;
1655 unsigned start;
1656 LLVMValueRef out[4];
1657
1658 assert(num_comps && num_comps <= 4);
1659 if (!num_comps || num_comps > 4)
1660 return;
1661
1662 /* Get the first component. */
1663 start = ffs(output->component_mask) - 1;
1664
1665 /* Load the output as int. */
1666 for (int i = 0; i < num_comps; i++) {
1667 out[i] = ac_to_integer(&ctx->ac, shader_out->values[start + i]);
1668 }
1669
1670 /* Pack the output. */
1671 LLVMValueRef vdata = NULL;
1672
1673 switch (num_comps) {
1674 case 1: /* as i32 */
1675 vdata = out[0];
1676 break;
1677 case 2: /* as v2i32 */
1678 case 3: /* as v4i32 (aligned to 4) */
1679 out[3] = LLVMGetUndef(ctx->ac.i32);
1680 /* fall through */
1681 case 4: /* as v4i32 */
1682 vdata = ac_build_gather_values(&ctx->ac, out,
1683 !ac_has_vec3_support(ctx->ac.chip_class, false) ?
1684 util_next_power_of_two(num_comps) :
1685 num_comps);
1686 break;
1687 }
1688
1689 ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf],
1690 vdata, num_comps, so_write_offsets[buf],
1691 ctx->ac.i32_0, offset,
1692 ac_glc | ac_slc);
1693 }
1694
1695 static void
1696 radv_emit_streamout(struct radv_shader_context *ctx, unsigned stream)
1697 {
1698 int i;
1699
1700 /* Get bits [22:16], i.e. (so_param >> 16) & 127; */
1701 assert(ctx->args->streamout_config.used);
1702 LLVMValueRef so_vtx_count =
1703 ac_build_bfe(&ctx->ac,
1704 ac_get_arg(&ctx->ac, ctx->args->streamout_config),
1705 LLVMConstInt(ctx->ac.i32, 16, false),
1706 LLVMConstInt(ctx->ac.i32, 7, false), false);
1707
1708 LLVMValueRef tid = ac_get_thread_id(&ctx->ac);
1709
1710 /* can_emit = tid < so_vtx_count; */
1711 LLVMValueRef can_emit = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,
1712 tid, so_vtx_count, "");
1713
1714 /* Emit the streamout code conditionally. This actually avoids
1715 * out-of-bounds buffer access. The hw tells us via the SGPR
1716 * (so_vtx_count) which threads are allowed to emit streamout data.
1717 */
1718 ac_build_ifcc(&ctx->ac, can_emit, 6501);
1719 {
1720 /* The buffer offset is computed as follows:
1721 * ByteOffset = streamout_offset[buffer_id]*4 +
1722 * (streamout_write_index + thread_id)*stride[buffer_id] +
1723 * attrib_offset
1724 */
1725 LLVMValueRef so_write_index =
1726 ac_get_arg(&ctx->ac, ctx->args->streamout_write_idx);
1727
1728 /* Compute (streamout_write_index + thread_id). */
1729 so_write_index =
1730 LLVMBuildAdd(ctx->ac.builder, so_write_index, tid, "");
1731
1732 /* Load the descriptor and compute the write offset for each
1733 * enabled buffer.
1734 */
1735 LLVMValueRef so_write_offset[4] = {};
1736 LLVMValueRef so_buffers[4] = {};
1737 LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->args->streamout_buffers);
1738
1739 for (i = 0; i < 4; i++) {
1740 uint16_t stride = ctx->args->shader_info->so.strides[i];
1741
1742 if (!stride)
1743 continue;
1744
1745 LLVMValueRef offset =
1746 LLVMConstInt(ctx->ac.i32, i, false);
1747
1748 so_buffers[i] = ac_build_load_to_sgpr(&ctx->ac,
1749 buf_ptr, offset);
1750
1751 LLVMValueRef so_offset =
1752 ac_get_arg(&ctx->ac, ctx->args->streamout_offset[i]);
1753
1754 so_offset = LLVMBuildMul(ctx->ac.builder, so_offset,
1755 LLVMConstInt(ctx->ac.i32, 4, false), "");
1756
1757 so_write_offset[i] =
1758 ac_build_imad(&ctx->ac, so_write_index,
1759 LLVMConstInt(ctx->ac.i32,
1760 stride * 4, false),
1761 so_offset);
1762 }
1763
1764 /* Write streamout data. */
1765 for (i = 0; i < ctx->args->shader_info->so.num_outputs; i++) {
1766 struct radv_shader_output_values shader_out = {};
1767 struct radv_stream_output *output =
1768 &ctx->args->shader_info->so.outputs[i];
1769
1770 if (stream != output->stream)
1771 continue;
1772
1773 for (int j = 0; j < 4; j++) {
1774 shader_out.values[j] =
1775 radv_load_output(ctx, output->location, j);
1776 }
1777
1778 radv_emit_stream_output(ctx, so_buffers,so_write_offset,
1779 output, &shader_out);
1780 }
1781 }
1782 ac_build_endif(&ctx->ac, 6501);
1783 }
1784
1785 static void
1786 radv_build_param_exports(struct radv_shader_context *ctx,
1787 struct radv_shader_output_values *outputs,
1788 unsigned noutput,
1789 struct radv_vs_output_info *outinfo,
1790 bool export_clip_dists)
1791 {
1792 unsigned param_count = 0;
1793
1794 for (unsigned i = 0; i < noutput; i++) {
1795 unsigned slot_name = outputs[i].slot_name;
1796 unsigned usage_mask = outputs[i].usage_mask;
1797
1798 if (slot_name != VARYING_SLOT_LAYER &&
1799 slot_name != VARYING_SLOT_PRIMITIVE_ID &&
1800 slot_name != VARYING_SLOT_VIEWPORT &&
1801 slot_name != VARYING_SLOT_CLIP_DIST0 &&
1802 slot_name != VARYING_SLOT_CLIP_DIST1 &&
1803 slot_name < VARYING_SLOT_VAR0)
1804 continue;
1805
1806 if ((slot_name == VARYING_SLOT_CLIP_DIST0 ||
1807 slot_name == VARYING_SLOT_CLIP_DIST1) && !export_clip_dists)
1808 continue;
1809
1810 radv_export_param(ctx, param_count, outputs[i].values, usage_mask);
1811
1812 assert(i < ARRAY_SIZE(outinfo->vs_output_param_offset));
1813 outinfo->vs_output_param_offset[slot_name] = param_count++;
1814 }
1815
1816 outinfo->param_exports = param_count;
1817 }
1818
1819 /* Generate export instructions for hardware VS shader stage or NGG GS stage
1820 * (position and parameter data only).
1821 */
1822 static void
1823 radv_llvm_export_vs(struct radv_shader_context *ctx,
1824 struct radv_shader_output_values *outputs,
1825 unsigned noutput,
1826 struct radv_vs_output_info *outinfo,
1827 bool export_clip_dists)
1828 {
1829 LLVMValueRef psize_value = NULL, layer_value = NULL, viewport_value = NULL;
1830 struct ac_export_args pos_args[4] = {};
1831 unsigned pos_idx, index;
1832 int i;
1833
1834 /* Build position exports */
1835 for (i = 0; i < noutput; i++) {
1836 switch (outputs[i].slot_name) {
1837 case VARYING_SLOT_POS:
1838 si_llvm_init_export_args(ctx, outputs[i].values, 0xf,
1839 V_008DFC_SQ_EXP_POS, &pos_args[0]);
1840 break;
1841 case VARYING_SLOT_PSIZ:
1842 psize_value = outputs[i].values[0];
1843 break;
1844 case VARYING_SLOT_LAYER:
1845 layer_value = outputs[i].values[0];
1846 break;
1847 case VARYING_SLOT_VIEWPORT:
1848 viewport_value = outputs[i].values[0];
1849 break;
1850 case VARYING_SLOT_CLIP_DIST0:
1851 case VARYING_SLOT_CLIP_DIST1:
1852 index = 2 + outputs[i].slot_index;
1853 si_llvm_init_export_args(ctx, outputs[i].values, 0xf,
1854 V_008DFC_SQ_EXP_POS + index,
1855 &pos_args[index]);
1856 break;
1857 default:
1858 break;
1859 }
1860 }
1861
1862 /* We need to add the position output manually if it's missing. */
1863 if (!pos_args[0].out[0]) {
1864 pos_args[0].enabled_channels = 0xf; /* writemask */
1865 pos_args[0].valid_mask = 0; /* EXEC mask */
1866 pos_args[0].done = 0; /* last export? */
1867 pos_args[0].target = V_008DFC_SQ_EXP_POS;
1868 pos_args[0].compr = 0; /* COMPR flag */
1869 pos_args[0].out[0] = ctx->ac.f32_0; /* X */
1870 pos_args[0].out[1] = ctx->ac.f32_0; /* Y */
1871 pos_args[0].out[2] = ctx->ac.f32_0; /* Z */
1872 pos_args[0].out[3] = ctx->ac.f32_1; /* W */
1873 }
1874
1875 if (outinfo->writes_pointsize ||
1876 outinfo->writes_layer ||
1877 outinfo->writes_viewport_index) {
1878 pos_args[1].enabled_channels = ((outinfo->writes_pointsize == true ? 1 : 0) |
1879 (outinfo->writes_layer == true ? 4 : 0));
1880 pos_args[1].valid_mask = 0;
1881 pos_args[1].done = 0;
1882 pos_args[1].target = V_008DFC_SQ_EXP_POS + 1;
1883 pos_args[1].compr = 0;
1884 pos_args[1].out[0] = ctx->ac.f32_0; /* X */
1885 pos_args[1].out[1] = ctx->ac.f32_0; /* Y */
1886 pos_args[1].out[2] = ctx->ac.f32_0; /* Z */
1887 pos_args[1].out[3] = ctx->ac.f32_0; /* W */
1888
1889 if (outinfo->writes_pointsize == true)
1890 pos_args[1].out[0] = psize_value;
1891 if (outinfo->writes_layer == true)
1892 pos_args[1].out[2] = layer_value;
1893 if (outinfo->writes_viewport_index == true) {
1894 if (ctx->args->options->chip_class >= GFX9) {
1895 /* GFX9 has the layer in out.z[10:0] and the viewport
1896 * index in out.z[19:16].
1897 */
1898 LLVMValueRef v = viewport_value;
1899 v = ac_to_integer(&ctx->ac, v);
1900 v = LLVMBuildShl(ctx->ac.builder, v,
1901 LLVMConstInt(ctx->ac.i32, 16, false),
1902 "");
1903 v = LLVMBuildOr(ctx->ac.builder, v,
1904 ac_to_integer(&ctx->ac, pos_args[1].out[2]), "");
1905
1906 pos_args[1].out[2] = ac_to_float(&ctx->ac, v);
1907 pos_args[1].enabled_channels |= 1 << 2;
1908 } else {
1909 pos_args[1].out[3] = viewport_value;
1910 pos_args[1].enabled_channels |= 1 << 3;
1911 }
1912 }
1913 }
1914
1915 for (i = 0; i < 4; i++) {
1916 if (pos_args[i].out[0])
1917 outinfo->pos_exports++;
1918 }
1919
1920 /* GFX10 skip POS0 exports if EXEC=0 and DONE=0, causing a hang.
1921 * Setting valid_mask=1 prevents it and has no other effect.
1922 */
1923 if (ctx->ac.chip_class == GFX10)
1924 pos_args[0].valid_mask = 1;
1925
1926 pos_idx = 0;
1927 for (i = 0; i < 4; i++) {
1928 if (!pos_args[i].out[0])
1929 continue;
1930
1931 /* Specify the target we are exporting */
1932 pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++;
1933
1934 if (pos_idx == outinfo->pos_exports)
1935 /* Specify that this is the last export */
1936 pos_args[i].done = 1;
1937
1938 ac_build_export(&ctx->ac, &pos_args[i]);
1939 }
1940
1941 /* Build parameter exports */
1942 radv_build_param_exports(ctx, outputs, noutput, outinfo, export_clip_dists);
1943 }
1944
1945 static void
1946 handle_vs_outputs_post(struct radv_shader_context *ctx,
1947 bool export_prim_id,
1948 bool export_clip_dists,
1949 struct radv_vs_output_info *outinfo)
1950 {
1951 struct radv_shader_output_values *outputs;
1952 unsigned noutput = 0;
1953
1954 if (ctx->args->options->key.has_multiview_view_index) {
1955 LLVMValueRef* tmp_out = &ctx->abi.outputs[ac_llvm_reg_index_soa(VARYING_SLOT_LAYER, 0)];
1956 if(!*tmp_out) {
1957 for(unsigned i = 0; i < 4; ++i)
1958 ctx->abi.outputs[ac_llvm_reg_index_soa(VARYING_SLOT_LAYER, i)] =
1959 ac_build_alloca_undef(&ctx->ac, ctx->ac.f32, "");
1960 }
1961
1962 LLVMValueRef view_index = ac_get_arg(&ctx->ac, ctx->args->ac.view_index);
1963 LLVMBuildStore(ctx->ac.builder, ac_to_float(&ctx->ac, view_index), *tmp_out);
1964 ctx->output_mask |= 1ull << VARYING_SLOT_LAYER;
1965 }
1966
1967 memset(outinfo->vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
1968 sizeof(outinfo->vs_output_param_offset));
1969 outinfo->pos_exports = 0;
1970
1971 if (!ctx->args->options->use_ngg_streamout &&
1972 ctx->args->shader_info->so.num_outputs &&
1973 !ctx->args->is_gs_copy_shader) {
1974 /* The GS copy shader emission already emits streamout. */
1975 radv_emit_streamout(ctx, 0);
1976 }
1977
1978 /* Allocate a temporary array for the output values. */
1979 unsigned num_outputs = util_bitcount64(ctx->output_mask) + export_prim_id;
1980 outputs = malloc(num_outputs * sizeof(outputs[0]));
1981
1982 for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
1983 if (!(ctx->output_mask & (1ull << i)))
1984 continue;
1985
1986 outputs[noutput].slot_name = i;
1987 outputs[noutput].slot_index = i == VARYING_SLOT_CLIP_DIST1;
1988
1989 if (ctx->stage == MESA_SHADER_VERTEX &&
1990 !ctx->args->is_gs_copy_shader) {
1991 outputs[noutput].usage_mask =
1992 ctx->args->shader_info->vs.output_usage_mask[i];
1993 } else if (ctx->stage == MESA_SHADER_TESS_EVAL) {
1994 outputs[noutput].usage_mask =
1995 ctx->args->shader_info->tes.output_usage_mask[i];
1996 } else {
1997 assert(ctx->args->is_gs_copy_shader);
1998 outputs[noutput].usage_mask =
1999 ctx->args->shader_info->gs.output_usage_mask[i];
2000 }
2001
2002 for (unsigned j = 0; j < 4; j++) {
2003 outputs[noutput].values[j] =
2004 ac_to_float(&ctx->ac, radv_load_output(ctx, i, j));
2005 }
2006
2007 noutput++;
2008 }
2009
2010 /* Export PrimitiveID. */
2011 if (export_prim_id) {
2012 outputs[noutput].slot_name = VARYING_SLOT_PRIMITIVE_ID;
2013 outputs[noutput].slot_index = 0;
2014 outputs[noutput].usage_mask = 0x1;
2015 outputs[noutput].values[0] =
2016 ac_get_arg(&ctx->ac, ctx->args->vs_prim_id);
2017 for (unsigned j = 1; j < 4; j++)
2018 outputs[noutput].values[j] = ctx->ac.f32_0;
2019 noutput++;
2020 }
2021
2022 radv_llvm_export_vs(ctx, outputs, noutput, outinfo, export_clip_dists);
2023
2024 free(outputs);
2025 }
2026
2027 static void
2028 handle_es_outputs_post(struct radv_shader_context *ctx,
2029 struct radv_es_output_info *outinfo)
2030 {
2031 int j;
2032 LLVMValueRef lds_base = NULL;
2033
2034 if (ctx->ac.chip_class >= GFX9) {
2035 unsigned itemsize_dw = outinfo->esgs_itemsize / 4;
2036 LLVMValueRef vertex_idx = ac_get_thread_id(&ctx->ac);
2037 LLVMValueRef wave_idx =
2038 ac_unpack_param(&ctx->ac,
2039 ac_get_arg(&ctx->ac, ctx->args->merged_wave_info), 24, 4);
2040 vertex_idx = LLVMBuildOr(ctx->ac.builder, vertex_idx,
2041 LLVMBuildMul(ctx->ac.builder, wave_idx,
2042 LLVMConstInt(ctx->ac.i32,
2043 ctx->ac.wave_size, false), ""), "");
2044 lds_base = LLVMBuildMul(ctx->ac.builder, vertex_idx,
2045 LLVMConstInt(ctx->ac.i32, itemsize_dw, 0), "");
2046 }
2047
2048 for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
2049 LLVMValueRef dw_addr = NULL;
2050 LLVMValueRef *out_ptr = &ctx->abi.outputs[i * 4];
2051 unsigned output_usage_mask;
2052 int param_index;
2053
2054 if (!(ctx->output_mask & (1ull << i)))
2055 continue;
2056
2057 if (ctx->stage == MESA_SHADER_VERTEX) {
2058 output_usage_mask =
2059 ctx->args->shader_info->vs.output_usage_mask[i];
2060 } else {
2061 assert(ctx->stage == MESA_SHADER_TESS_EVAL);
2062 output_usage_mask =
2063 ctx->args->shader_info->tes.output_usage_mask[i];
2064 }
2065
2066 param_index = shader_io_get_unique_index(i);
2067
2068 if (lds_base) {
2069 dw_addr = LLVMBuildAdd(ctx->ac.builder, lds_base,
2070 LLVMConstInt(ctx->ac.i32, param_index * 4, false),
2071 "");
2072 }
2073
2074 for (j = 0; j < 4; j++) {
2075 if (!(output_usage_mask & (1 << j)))
2076 continue;
2077
2078 LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, out_ptr[j], "");
2079 out_val = ac_to_integer(&ctx->ac, out_val);
2080 out_val = LLVMBuildZExtOrBitCast(ctx->ac.builder, out_val, ctx->ac.i32, "");
2081
2082 if (ctx->ac.chip_class >= GFX9) {
2083 LLVMValueRef dw_addr_offset =
2084 LLVMBuildAdd(ctx->ac.builder, dw_addr,
2085 LLVMConstInt(ctx->ac.i32,
2086 j, false), "");
2087
2088 ac_lds_store(&ctx->ac, dw_addr_offset, out_val);
2089 } else {
2090 ac_build_buffer_store_dword(&ctx->ac,
2091 ctx->esgs_ring,
2092 out_val, 1,
2093 NULL,
2094 ac_get_arg(&ctx->ac, ctx->args->es2gs_offset),
2095 (4 * param_index + j) * 4,
2096 ac_glc | ac_slc | ac_swizzled);
2097 }
2098 }
2099 }
2100 }
2101
2102 static void
2103 handle_ls_outputs_post(struct radv_shader_context *ctx)
2104 {
2105 LLVMValueRef vertex_id = ctx->rel_auto_id;
2106 uint32_t num_tcs_inputs = util_last_bit64(ctx->args->shader_info->vs.ls_outputs_written);
2107 LLVMValueRef vertex_dw_stride = LLVMConstInt(ctx->ac.i32, num_tcs_inputs * 4, false);
2108 LLVMValueRef base_dw_addr = LLVMBuildMul(ctx->ac.builder, vertex_id,
2109 vertex_dw_stride, "");
2110
2111 for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
2112 LLVMValueRef *out_ptr = &ctx->abi.outputs[i * 4];
2113
2114 if (!(ctx->output_mask & (1ull << i)))
2115 continue;
2116
2117 int param = shader_io_get_unique_index(i);
2118 LLVMValueRef dw_addr = LLVMBuildAdd(ctx->ac.builder, base_dw_addr,
2119 LLVMConstInt(ctx->ac.i32, param * 4, false),
2120 "");
2121 for (unsigned j = 0; j < 4; j++) {
2122 LLVMValueRef value = LLVMBuildLoad(ctx->ac.builder, out_ptr[j], "");
2123 value = ac_to_integer(&ctx->ac, value);
2124 value = LLVMBuildZExtOrBitCast(ctx->ac.builder, value, ctx->ac.i32, "");
2125 ac_lds_store(&ctx->ac, dw_addr, value);
2126 dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr, ctx->ac.i32_1, "");
2127 }
2128 }
2129 }
2130
2131 static LLVMValueRef get_wave_id_in_tg(struct radv_shader_context *ctx)
2132 {
2133 return ac_unpack_param(&ctx->ac,
2134 ac_get_arg(&ctx->ac, ctx->args->merged_wave_info), 24, 4);
2135 }
2136
2137 static LLVMValueRef get_tgsize(struct radv_shader_context *ctx)
2138 {
2139 return ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->merged_wave_info), 28, 4);
2140 }
2141
2142 static LLVMValueRef get_thread_id_in_tg(struct radv_shader_context *ctx)
2143 {
2144 LLVMBuilderRef builder = ctx->ac.builder;
2145 LLVMValueRef tmp;
2146 tmp = LLVMBuildMul(builder, get_wave_id_in_tg(ctx),
2147 LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false), "");
2148 return LLVMBuildAdd(builder, tmp, ac_get_thread_id(&ctx->ac), "");
2149 }
2150
2151 static LLVMValueRef ngg_get_vtx_cnt(struct radv_shader_context *ctx)
2152 {
2153 return ac_build_bfe(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->gs_tg_info),
2154 LLVMConstInt(ctx->ac.i32, 12, false),
2155 LLVMConstInt(ctx->ac.i32, 9, false),
2156 false);
2157 }
2158
2159 static LLVMValueRef ngg_get_prim_cnt(struct radv_shader_context *ctx)
2160 {
2161 return ac_build_bfe(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->gs_tg_info),
2162 LLVMConstInt(ctx->ac.i32, 22, false),
2163 LLVMConstInt(ctx->ac.i32, 9, false),
2164 false);
2165 }
2166
2167 static LLVMValueRef ngg_get_ordered_id(struct radv_shader_context *ctx)
2168 {
2169 return ac_build_bfe(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->gs_tg_info),
2170 ctx->ac.i32_0,
2171 LLVMConstInt(ctx->ac.i32, 12, false),
2172 false);
2173 }
2174
2175 static LLVMValueRef
2176 ngg_gs_get_vertex_storage(struct radv_shader_context *ctx)
2177 {
2178 unsigned num_outputs = util_bitcount64(ctx->output_mask);
2179
2180 if (ctx->args->options->key.has_multiview_view_index)
2181 num_outputs++;
2182
2183 LLVMTypeRef elements[2] = {
2184 LLVMArrayType(ctx->ac.i32, 4 * num_outputs),
2185 LLVMArrayType(ctx->ac.i8, 4),
2186 };
2187 LLVMTypeRef type = LLVMStructTypeInContext(ctx->ac.context, elements, 2, false);
2188 type = LLVMPointerType(LLVMArrayType(type, 0), AC_ADDR_SPACE_LDS);
2189 return LLVMBuildBitCast(ctx->ac.builder, ctx->gs_ngg_emit, type, "");
2190 }
2191
2192 /**
2193 * Return a pointer to the LDS storage reserved for the N'th vertex, where N
2194 * is in emit order; that is:
2195 * - during the epilogue, N is the threadidx (relative to the entire threadgroup)
2196 * - during vertex emit, i.e. while the API GS shader invocation is running,
2197 * N = threadidx * gs_max_out_vertices + emitidx
2198 *
2199 * Goals of the LDS memory layout:
2200 * 1. Eliminate bank conflicts on write for geometry shaders that have all emits
2201 * in uniform control flow
2202 * 2. Eliminate bank conflicts on read for export if, additionally, there is no
2203 * culling
2204 * 3. Agnostic to the number of waves (since we don't know it before compiling)
2205 * 4. Allow coalescing of LDS instructions (ds_write_b128 etc.)
2206 * 5. Avoid wasting memory.
2207 *
2208 * We use an AoS layout due to point 4 (this also helps point 3). In an AoS
2209 * layout, elimination of bank conflicts requires that each vertex occupy an
2210 * odd number of dwords. We use the additional dword to store the output stream
2211 * index as well as a flag to indicate whether this vertex ends a primitive
2212 * for rasterization.
2213 *
2214 * Swizzling is required to satisfy points 1 and 2 simultaneously.
2215 *
2216 * Vertices are stored in export order (gsthread * gs_max_out_vertices + emitidx).
2217 * Indices are swizzled in groups of 32, which ensures point 1 without
2218 * disturbing point 2.
2219 *
2220 * \return an LDS pointer to type {[N x i32], [4 x i8]}
2221 */
2222 static LLVMValueRef
2223 ngg_gs_vertex_ptr(struct radv_shader_context *ctx, LLVMValueRef vertexidx)
2224 {
2225 LLVMBuilderRef builder = ctx->ac.builder;
2226 LLVMValueRef storage = ngg_gs_get_vertex_storage(ctx);
2227
2228 /* gs_max_out_vertices = 2^(write_stride_2exp) * some odd number */
2229 unsigned write_stride_2exp = ffs(ctx->shader->info.gs.vertices_out) - 1;
2230 if (write_stride_2exp) {
2231 LLVMValueRef row =
2232 LLVMBuildLShr(builder, vertexidx,
2233 LLVMConstInt(ctx->ac.i32, 5, false), "");
2234 LLVMValueRef swizzle =
2235 LLVMBuildAnd(builder, row,
2236 LLVMConstInt(ctx->ac.i32, (1u << write_stride_2exp) - 1,
2237 false), "");
2238 vertexidx = LLVMBuildXor(builder, vertexidx, swizzle, "");
2239 }
2240
2241 return ac_build_gep0(&ctx->ac, storage, vertexidx);
2242 }
2243
2244 static LLVMValueRef
2245 ngg_gs_emit_vertex_ptr(struct radv_shader_context *ctx, LLVMValueRef gsthread,
2246 LLVMValueRef emitidx)
2247 {
2248 LLVMBuilderRef builder = ctx->ac.builder;
2249 LLVMValueRef tmp;
2250
2251 tmp = LLVMConstInt(ctx->ac.i32, ctx->shader->info.gs.vertices_out, false);
2252 tmp = LLVMBuildMul(builder, tmp, gsthread, "");
2253 const LLVMValueRef vertexidx = LLVMBuildAdd(builder, tmp, emitidx, "");
2254 return ngg_gs_vertex_ptr(ctx, vertexidx);
2255 }
2256
2257 static LLVMValueRef
2258 ngg_gs_get_emit_output_ptr(struct radv_shader_context *ctx, LLVMValueRef vertexptr,
2259 unsigned out_idx)
2260 {
2261 LLVMValueRef gep_idx[3] = {
2262 ctx->ac.i32_0, /* implied C-style array */
2263 ctx->ac.i32_0, /* first struct entry */
2264 LLVMConstInt(ctx->ac.i32, out_idx, false),
2265 };
2266 return LLVMBuildGEP(ctx->ac.builder, vertexptr, gep_idx, 3, "");
2267 }
2268
2269 static LLVMValueRef
2270 ngg_gs_get_emit_primflag_ptr(struct radv_shader_context *ctx, LLVMValueRef vertexptr,
2271 unsigned stream)
2272 {
2273 LLVMValueRef gep_idx[3] = {
2274 ctx->ac.i32_0, /* implied C-style array */
2275 ctx->ac.i32_1, /* second struct entry */
2276 LLVMConstInt(ctx->ac.i32, stream, false),
2277 };
2278 return LLVMBuildGEP(ctx->ac.builder, vertexptr, gep_idx, 3, "");
2279 }
2280
2281 static struct radv_stream_output *
2282 radv_get_stream_output_by_loc(struct radv_streamout_info *so, unsigned location)
2283 {
2284 for (unsigned i = 0; i < so->num_outputs; ++i) {
2285 if (so->outputs[i].location == location)
2286 return &so->outputs[i];
2287 }
2288
2289 return NULL;
2290 }
2291
2292 static void build_streamout_vertex(struct radv_shader_context *ctx,
2293 LLVMValueRef *so_buffer, LLVMValueRef *wg_offset_dw,
2294 unsigned stream, LLVMValueRef offset_vtx,
2295 LLVMValueRef vertexptr)
2296 {
2297 struct radv_streamout_info *so = &ctx->args->shader_info->so;
2298 LLVMBuilderRef builder = ctx->ac.builder;
2299 LLVMValueRef offset[4] = {};
2300 LLVMValueRef tmp;
2301
2302 for (unsigned buffer = 0; buffer < 4; ++buffer) {
2303 if (!wg_offset_dw[buffer])
2304 continue;
2305
2306 tmp = LLVMBuildMul(builder, offset_vtx,
2307 LLVMConstInt(ctx->ac.i32, so->strides[buffer], false), "");
2308 tmp = LLVMBuildAdd(builder, wg_offset_dw[buffer], tmp, "");
2309 offset[buffer] = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->ac.i32, 2, false), "");
2310 }
2311
2312 if (ctx->stage == MESA_SHADER_GEOMETRY) {
2313 struct radv_shader_output_values outputs[AC_LLVM_MAX_OUTPUTS];
2314 unsigned noutput = 0;
2315 unsigned out_idx = 0;
2316
2317 for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
2318 unsigned output_usage_mask =
2319 ctx->args->shader_info->gs.output_usage_mask[i];
2320 uint8_t output_stream = ctx->args->shader_info->gs.output_streams[i];
2321
2322 if (!(ctx->output_mask & (1ull << i)) ||
2323 output_stream != stream)
2324 continue;
2325
2326 outputs[noutput].slot_name = i;
2327 outputs[noutput].slot_index = i == VARYING_SLOT_CLIP_DIST1;
2328 outputs[noutput].usage_mask = output_usage_mask;
2329
2330 int length = util_last_bit(output_usage_mask);
2331
2332 for (unsigned j = 0; j < length; j++, out_idx++) {
2333 if (!(output_usage_mask & (1 << j)))
2334 continue;
2335
2336 tmp = ac_build_gep0(&ctx->ac, vertexptr,
2337 LLVMConstInt(ctx->ac.i32, out_idx, false));
2338 outputs[noutput].values[j] = LLVMBuildLoad(builder, tmp, "");
2339 }
2340
2341 for (unsigned j = length; j < 4; j++)
2342 outputs[noutput].values[j] = LLVMGetUndef(ctx->ac.f32);
2343
2344 noutput++;
2345 }
2346
2347 for (unsigned i = 0; i < noutput; i++) {
2348 struct radv_stream_output *output =
2349 radv_get_stream_output_by_loc(so, outputs[i].slot_name);
2350
2351 if (!output ||
2352 output->stream != stream)
2353 continue;
2354
2355 struct radv_shader_output_values out = {};
2356
2357 for (unsigned j = 0; j < 4; j++) {
2358 out.values[j] = outputs[i].values[j];
2359 }
2360
2361 radv_emit_stream_output(ctx, so_buffer, offset, output, &out);
2362 }
2363 } else {
2364 for (unsigned i = 0; i < so->num_outputs; ++i) {
2365 struct radv_stream_output *output =
2366 &ctx->args->shader_info->so.outputs[i];
2367
2368 if (stream != output->stream)
2369 continue;
2370
2371 struct radv_shader_output_values out = {};
2372
2373 for (unsigned comp = 0; comp < 4; comp++) {
2374 if (!(output->component_mask & (1 << comp)))
2375 continue;
2376
2377 tmp = ac_build_gep0(&ctx->ac, vertexptr,
2378 LLVMConstInt(ctx->ac.i32, 4 * i + comp, false));
2379 out.values[comp] = LLVMBuildLoad(builder, tmp, "");
2380 }
2381
2382 radv_emit_stream_output(ctx, so_buffer, offset, output, &out);
2383 }
2384 }
2385 }
2386
2387 struct ngg_streamout {
2388 LLVMValueRef num_vertices;
2389
2390 /* per-thread data */
2391 LLVMValueRef prim_enable[4]; /* i1 per stream */
2392 LLVMValueRef vertices[3]; /* [N x i32] addrspace(LDS)* */
2393
2394 /* Output */
2395 LLVMValueRef emit[4]; /* per-stream emitted primitives (only valid for used streams) */
2396 };
2397
2398 /**
2399 * Build streamout logic.
2400 *
2401 * Implies a barrier.
2402 *
2403 * Writes number of emitted primitives to gs_ngg_scratch[4:7].
2404 *
2405 * Clobbers gs_ngg_scratch[8:].
2406 */
2407 static void build_streamout(struct radv_shader_context *ctx,
2408 struct ngg_streamout *nggso)
2409 {
2410 struct radv_streamout_info *so = &ctx->args->shader_info->so;
2411 LLVMBuilderRef builder = ctx->ac.builder;
2412 LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->args->streamout_buffers);
2413 LLVMValueRef tid = get_thread_id_in_tg(ctx);
2414 LLVMValueRef cond, tmp, tmp2;
2415 LLVMValueRef i32_2 = LLVMConstInt(ctx->ac.i32, 2, false);
2416 LLVMValueRef i32_4 = LLVMConstInt(ctx->ac.i32, 4, false);
2417 LLVMValueRef i32_8 = LLVMConstInt(ctx->ac.i32, 8, false);
2418 LLVMValueRef so_buffer[4] = {};
2419 unsigned max_num_vertices = 1 + (nggso->vertices[1] ? 1 : 0) +
2420 (nggso->vertices[2] ? 1 : 0);
2421 LLVMValueRef prim_stride_dw[4] = {};
2422 LLVMValueRef prim_stride_dw_vgpr = LLVMGetUndef(ctx->ac.i32);
2423 int stream_for_buffer[4] = { -1, -1, -1, -1 };
2424 unsigned bufmask_for_stream[4] = {};
2425 bool isgs = ctx->stage == MESA_SHADER_GEOMETRY;
2426 unsigned scratch_emit_base = isgs ? 4 : 0;
2427 LLVMValueRef scratch_emit_basev = isgs ? i32_4 : ctx->ac.i32_0;
2428 unsigned scratch_offset_base = isgs ? 8 : 4;
2429 LLVMValueRef scratch_offset_basev = isgs ? i32_8 : i32_4;
2430
2431 ac_llvm_add_target_dep_function_attr(ctx->main_function,
2432 "amdgpu-gds-size", 256);
2433
2434 /* Determine the mapping of streamout buffers to vertex streams. */
2435 for (unsigned i = 0; i < so->num_outputs; ++i) {
2436 unsigned buf = so->outputs[i].buffer;
2437 unsigned stream = so->outputs[i].stream;
2438 assert(stream_for_buffer[buf] < 0 || stream_for_buffer[buf] == stream);
2439 stream_for_buffer[buf] = stream;
2440 bufmask_for_stream[stream] |= 1 << buf;
2441 }
2442
2443 for (unsigned buffer = 0; buffer < 4; ++buffer) {
2444 if (stream_for_buffer[buffer] == -1)
2445 continue;
2446
2447 assert(so->strides[buffer]);
2448
2449 LLVMValueRef stride_for_buffer =
2450 LLVMConstInt(ctx->ac.i32, so->strides[buffer], false);
2451 prim_stride_dw[buffer] =
2452 LLVMBuildMul(builder, stride_for_buffer,
2453 nggso->num_vertices, "");
2454 prim_stride_dw_vgpr = ac_build_writelane(
2455 &ctx->ac, prim_stride_dw_vgpr, prim_stride_dw[buffer],
2456 LLVMConstInt(ctx->ac.i32, buffer, false));
2457
2458 LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, buffer, false);
2459 so_buffer[buffer] = ac_build_load_to_sgpr(&ctx->ac, buf_ptr,
2460 offset);
2461 }
2462
2463 cond = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->ac.i32_0, "");
2464 ac_build_ifcc(&ctx->ac, cond, 5200);
2465 {
2466 LLVMTypeRef gdsptr = LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS);
2467 LLVMValueRef gdsbase = LLVMBuildIntToPtr(builder, ctx->ac.i32_0, gdsptr, "");
2468
2469 /* Advance the streamout offsets in GDS. */
2470 LLVMValueRef offsets_vgpr = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
2471 LLVMValueRef generated_by_stream_vgpr = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
2472
2473 cond = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), i32_4, "");
2474 ac_build_ifcc(&ctx->ac, cond, 5210);
2475 {
2476 /* Fetch the number of generated primitives and store
2477 * it in GDS for later use.
2478 */
2479 if (isgs) {
2480 tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid);
2481 tmp = LLVMBuildLoad(builder, tmp, "");
2482 } else {
2483 tmp = ac_build_writelane(&ctx->ac, ctx->ac.i32_0,
2484 ngg_get_prim_cnt(ctx), ctx->ac.i32_0);
2485 }
2486 LLVMBuildStore(builder, tmp, generated_by_stream_vgpr);
2487
2488 unsigned swizzle[4];
2489 int unused_stream = -1;
2490 for (unsigned stream = 0; stream < 4; ++stream) {
2491 if (!ctx->args->shader_info->gs.num_stream_output_components[stream]) {
2492 unused_stream = stream;
2493 break;
2494 }
2495 }
2496 for (unsigned buffer = 0; buffer < 4; ++buffer) {
2497 if (stream_for_buffer[buffer] >= 0) {
2498 swizzle[buffer] = stream_for_buffer[buffer];
2499 } else {
2500 assert(unused_stream >= 0);
2501 swizzle[buffer] = unused_stream;
2502 }
2503 }
2504
2505 tmp = ac_build_quad_swizzle(&ctx->ac, tmp,
2506 swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2507 tmp = LLVMBuildMul(builder, tmp, prim_stride_dw_vgpr, "");
2508
2509 LLVMValueRef args[] = {
2510 LLVMBuildIntToPtr(builder, ngg_get_ordered_id(ctx), gdsptr, ""),
2511 tmp,
2512 ctx->ac.i32_0, // ordering
2513 ctx->ac.i32_0, // scope
2514 ctx->ac.i1false, // isVolatile
2515 LLVMConstInt(ctx->ac.i32, 4 << 24, false), // OA index
2516 ctx->ac.i1true, // wave release
2517 ctx->ac.i1true, // wave done
2518 };
2519
2520 tmp = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ds.ordered.add",
2521 ctx->ac.i32, args, ARRAY_SIZE(args), 0);
2522
2523 /* Keep offsets in a VGPR for quick retrieval via readlane by
2524 * the first wave for bounds checking, and also store in LDS
2525 * for retrieval by all waves later. */
2526 LLVMBuildStore(builder, tmp, offsets_vgpr);
2527
2528 tmp2 = LLVMBuildAdd(builder, ac_get_thread_id(&ctx->ac),
2529 scratch_offset_basev, "");
2530 tmp2 = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tmp2);
2531 LLVMBuildStore(builder, tmp, tmp2);
2532 }
2533 ac_build_endif(&ctx->ac, 5210);
2534
2535 /* Determine the max emit per buffer. This is done via the SALU, in part
2536 * because LLVM can't generate divide-by-multiply if we try to do this
2537 * via VALU with one lane per buffer.
2538 */
2539 LLVMValueRef max_emit[4] = {};
2540 for (unsigned buffer = 0; buffer < 4; ++buffer) {
2541 if (stream_for_buffer[buffer] == -1)
2542 continue;
2543
2544 /* Compute the streamout buffer size in DWORD. */
2545 LLVMValueRef bufsize_dw =
2546 LLVMBuildLShr(builder,
2547 LLVMBuildExtractElement(builder, so_buffer[buffer], i32_2, ""),
2548 i32_2, "");
2549
2550 /* Load the streamout buffer offset from GDS. */
2551 tmp = LLVMBuildLoad(builder, offsets_vgpr, "");
2552 LLVMValueRef offset_dw =
2553 ac_build_readlane(&ctx->ac, tmp,
2554 LLVMConstInt(ctx->ac.i32, buffer, false));
2555
2556 /* Compute the remaining size to emit. */
2557 LLVMValueRef remaining_dw =
2558 LLVMBuildSub(builder, bufsize_dw, offset_dw, "");
2559 tmp = LLVMBuildUDiv(builder, remaining_dw,
2560 prim_stride_dw[buffer], "");
2561
2562 cond = LLVMBuildICmp(builder, LLVMIntULT,
2563 bufsize_dw, offset_dw, "");
2564 max_emit[buffer] = LLVMBuildSelect(builder, cond,
2565 ctx->ac.i32_0, tmp, "");
2566 }
2567
2568 /* Determine the number of emitted primitives per stream and fixup the
2569 * GDS counter if necessary.
2570 *
2571 * This is complicated by the fact that a single stream can emit to
2572 * multiple buffers (but luckily not vice versa).
2573 */
2574 LLVMValueRef emit_vgpr = ctx->ac.i32_0;
2575
2576 for (unsigned stream = 0; stream < 4; ++stream) {
2577 if (!ctx->args->shader_info->gs.num_stream_output_components[stream])
2578 continue;
2579
2580 /* Load the number of generated primitives from GDS and
2581 * determine that number for the given stream.
2582 */
2583 tmp = LLVMBuildLoad(builder, generated_by_stream_vgpr, "");
2584 LLVMValueRef generated =
2585 ac_build_readlane(&ctx->ac, tmp,
2586 LLVMConstInt(ctx->ac.i32, stream, false));
2587
2588
2589 /* Compute the number of emitted primitives. */
2590 LLVMValueRef emit = generated;
2591 for (unsigned buffer = 0; buffer < 4; ++buffer) {
2592 if (stream_for_buffer[buffer] == stream)
2593 emit = ac_build_umin(&ctx->ac, emit, max_emit[buffer]);
2594 }
2595
2596 /* Store the number of emitted primitives for that
2597 * stream.
2598 */
2599 emit_vgpr = ac_build_writelane(&ctx->ac, emit_vgpr, emit,
2600 LLVMConstInt(ctx->ac.i32, stream, false));
2601
2602 /* Fixup the offset using a plain GDS atomic if we overflowed. */
2603 cond = LLVMBuildICmp(builder, LLVMIntULT, emit, generated, "");
2604 ac_build_ifcc(&ctx->ac, cond, 5221); /* scalar branch */
2605 tmp = LLVMBuildLShr(builder,
2606 LLVMConstInt(ctx->ac.i32, bufmask_for_stream[stream], false),
2607 ac_get_thread_id(&ctx->ac), "");
2608 tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
2609 ac_build_ifcc(&ctx->ac, tmp, 5222);
2610 {
2611 tmp = LLVMBuildSub(builder, generated, emit, "");
2612 tmp = LLVMBuildMul(builder, tmp, prim_stride_dw_vgpr, "");
2613 tmp2 = LLVMBuildGEP(builder, gdsbase, &tid, 1, "");
2614 LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpSub, tmp2, tmp,
2615 LLVMAtomicOrderingMonotonic, false);
2616 }
2617 ac_build_endif(&ctx->ac, 5222);
2618 ac_build_endif(&ctx->ac, 5221);
2619 }
2620
2621 /* Store the number of emitted primitives to LDS for later use. */
2622 cond = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), i32_4, "");
2623 ac_build_ifcc(&ctx->ac, cond, 5225);
2624 {
2625 tmp = LLVMBuildAdd(builder, ac_get_thread_id(&ctx->ac),
2626 scratch_emit_basev, "");
2627 tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tmp);
2628 LLVMBuildStore(builder, emit_vgpr, tmp);
2629 }
2630 ac_build_endif(&ctx->ac, 5225);
2631 }
2632 ac_build_endif(&ctx->ac, 5200);
2633
2634 /* Determine the workgroup-relative per-thread / primitive offset into
2635 * the streamout buffers */
2636 struct ac_wg_scan primemit_scan[4] = {};
2637
2638 if (isgs) {
2639 for (unsigned stream = 0; stream < 4; ++stream) {
2640 if (!ctx->args->shader_info->gs.num_stream_output_components[stream])
2641 continue;
2642
2643 primemit_scan[stream].enable_exclusive = true;
2644 primemit_scan[stream].op = nir_op_iadd;
2645 primemit_scan[stream].src = nggso->prim_enable[stream];
2646 primemit_scan[stream].scratch =
2647 ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch,
2648 LLVMConstInt(ctx->ac.i32, 12 + 8 * stream, false));
2649 primemit_scan[stream].waveidx = get_wave_id_in_tg(ctx);
2650 primemit_scan[stream].numwaves = get_tgsize(ctx);
2651 primemit_scan[stream].maxwaves = 8;
2652 ac_build_wg_scan_top(&ctx->ac, &primemit_scan[stream]);
2653 }
2654 }
2655
2656 ac_build_s_barrier(&ctx->ac);
2657
2658 /* Fetch the per-buffer offsets and per-stream emit counts in all waves. */
2659 LLVMValueRef wgoffset_dw[4] = {};
2660
2661 {
2662 LLVMValueRef scratch_vgpr;
2663
2664 tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, ac_get_thread_id(&ctx->ac));
2665 scratch_vgpr = LLVMBuildLoad(builder, tmp, "");
2666
2667 for (unsigned buffer = 0; buffer < 4; ++buffer) {
2668 if (stream_for_buffer[buffer] >= 0) {
2669 wgoffset_dw[buffer] = ac_build_readlane(
2670 &ctx->ac, scratch_vgpr,
2671 LLVMConstInt(ctx->ac.i32, scratch_offset_base + buffer, false));
2672 }
2673 }
2674
2675 for (unsigned stream = 0; stream < 4; ++stream) {
2676 if (ctx->args->shader_info->gs.num_stream_output_components[stream]) {
2677 nggso->emit[stream] = ac_build_readlane(
2678 &ctx->ac, scratch_vgpr,
2679 LLVMConstInt(ctx->ac.i32, scratch_emit_base + stream, false));
2680 }
2681 }
2682 }
2683
2684 /* Write out primitive data */
2685 for (unsigned stream = 0; stream < 4; ++stream) {
2686 if (!ctx->args->shader_info->gs.num_stream_output_components[stream])
2687 continue;
2688
2689 if (isgs) {
2690 ac_build_wg_scan_bottom(&ctx->ac, &primemit_scan[stream]);
2691 } else {
2692 primemit_scan[stream].result_exclusive = tid;
2693 }
2694
2695 cond = LLVMBuildICmp(builder, LLVMIntULT,
2696 primemit_scan[stream].result_exclusive,
2697 nggso->emit[stream], "");
2698 cond = LLVMBuildAnd(builder, cond, nggso->prim_enable[stream], "");
2699 ac_build_ifcc(&ctx->ac, cond, 5240);
2700 {
2701 LLVMValueRef offset_vtx =
2702 LLVMBuildMul(builder, primemit_scan[stream].result_exclusive,
2703 nggso->num_vertices, "");
2704
2705 for (unsigned i = 0; i < max_num_vertices; ++i) {
2706 cond = LLVMBuildICmp(builder, LLVMIntULT,
2707 LLVMConstInt(ctx->ac.i32, i, false),
2708 nggso->num_vertices, "");
2709 ac_build_ifcc(&ctx->ac, cond, 5241);
2710 build_streamout_vertex(ctx, so_buffer, wgoffset_dw,
2711 stream, offset_vtx, nggso->vertices[i]);
2712 ac_build_endif(&ctx->ac, 5241);
2713 offset_vtx = LLVMBuildAdd(builder, offset_vtx, ctx->ac.i32_1, "");
2714 }
2715 }
2716 ac_build_endif(&ctx->ac, 5240);
2717 }
2718 }
2719
2720 static unsigned ngg_nogs_vertex_size(struct radv_shader_context *ctx)
2721 {
2722 unsigned lds_vertex_size = 0;
2723
2724 if (ctx->args->shader_info->so.num_outputs)
2725 lds_vertex_size = 4 * ctx->args->shader_info->so.num_outputs + 1;
2726
2727 return lds_vertex_size;
2728 }
2729
2730 /**
2731 * Returns an `[N x i32] addrspace(LDS)*` pointing at contiguous LDS storage
2732 * for the vertex outputs.
2733 */
2734 static LLVMValueRef ngg_nogs_vertex_ptr(struct radv_shader_context *ctx,
2735 LLVMValueRef vtxid)
2736 {
2737 /* The extra dword is used to avoid LDS bank conflicts. */
2738 unsigned vertex_size = ngg_nogs_vertex_size(ctx);
2739 LLVMTypeRef ai32 = LLVMArrayType(ctx->ac.i32, vertex_size);
2740 LLVMTypeRef pai32 = LLVMPointerType(ai32, AC_ADDR_SPACE_LDS);
2741 LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, ctx->esgs_ring, pai32, "");
2742 return LLVMBuildGEP(ctx->ac.builder, tmp, &vtxid, 1, "");
2743 }
2744
2745 static void
2746 handle_ngg_outputs_post_1(struct radv_shader_context *ctx)
2747 {
2748 struct radv_streamout_info *so = &ctx->args->shader_info->so;
2749 LLVMBuilderRef builder = ctx->ac.builder;
2750 LLVMValueRef vertex_ptr = NULL;
2751 LLVMValueRef tmp, tmp2;
2752
2753 assert((ctx->stage == MESA_SHADER_VERTEX ||
2754 ctx->stage == MESA_SHADER_TESS_EVAL) && !ctx->args->is_gs_copy_shader);
2755
2756 if (!ctx->args->shader_info->so.num_outputs)
2757 return;
2758
2759 vertex_ptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx));
2760
2761 for (unsigned i = 0; i < so->num_outputs; ++i) {
2762 struct radv_stream_output *output =
2763 &ctx->args->shader_info->so.outputs[i];
2764
2765 unsigned loc = output->location;
2766
2767 for (unsigned comp = 0; comp < 4; comp++) {
2768 if (!(output->component_mask & (1 << comp)))
2769 continue;
2770
2771 tmp = ac_build_gep0(&ctx->ac, vertex_ptr,
2772 LLVMConstInt(ctx->ac.i32, 4 * i + comp, false));
2773 tmp2 = LLVMBuildLoad(builder,
2774 ctx->abi.outputs[4 * loc + comp], "");
2775 tmp2 = ac_to_integer(&ctx->ac, tmp2);
2776 LLVMBuildStore(builder, tmp2, tmp);
2777 }
2778 }
2779 }
2780
2781 static void
2782 handle_ngg_outputs_post_2(struct radv_shader_context *ctx)
2783 {
2784 LLVMBuilderRef builder = ctx->ac.builder;
2785 LLVMValueRef tmp;
2786
2787 assert((ctx->stage == MESA_SHADER_VERTEX ||
2788 ctx->stage == MESA_SHADER_TESS_EVAL) && !ctx->args->is_gs_copy_shader);
2789
2790 LLVMValueRef prims_in_wave = ac_unpack_param(&ctx->ac,
2791 ac_get_arg(&ctx->ac, ctx->args->merged_wave_info), 8, 8);
2792 LLVMValueRef vtx_in_wave = ac_unpack_param(&ctx->ac,
2793 ac_get_arg(&ctx->ac, ctx->args->merged_wave_info), 0, 8);
2794 LLVMValueRef is_gs_thread = LLVMBuildICmp(builder, LLVMIntULT,
2795 ac_get_thread_id(&ctx->ac), prims_in_wave, "");
2796 LLVMValueRef is_es_thread = LLVMBuildICmp(builder, LLVMIntULT,
2797 ac_get_thread_id(&ctx->ac), vtx_in_wave, "");
2798 LLVMValueRef vtxindex[] = {
2799 ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->gs_vtx_offset[0]), 0, 16),
2800 ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->gs_vtx_offset[0]), 16, 16),
2801 ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->gs_vtx_offset[2]), 0, 16),
2802 };
2803
2804 /* Determine the number of vertices per primitive. */
2805 unsigned num_vertices;
2806 LLVMValueRef num_vertices_val;
2807
2808 if (ctx->stage == MESA_SHADER_VERTEX) {
2809 LLVMValueRef outprim_val =
2810 LLVMConstInt(ctx->ac.i32,
2811 ctx->args->options->key.vs.outprim, false);
2812 num_vertices_val = LLVMBuildAdd(builder, outprim_val,
2813 ctx->ac.i32_1, "");
2814 num_vertices = 3; /* TODO: optimize for points & lines */
2815 } else {
2816 assert(ctx->stage == MESA_SHADER_TESS_EVAL);
2817
2818 if (ctx->shader->info.tess.point_mode)
2819 num_vertices = 1;
2820 else if (ctx->shader->info.tess.primitive_mode == GL_ISOLINES)
2821 num_vertices = 2;
2822 else
2823 num_vertices = 3;
2824
2825 num_vertices_val = LLVMConstInt(ctx->ac.i32, num_vertices, false);
2826 }
2827
2828 /* Streamout */
2829 if (ctx->args->shader_info->so.num_outputs) {
2830 struct ngg_streamout nggso = {};
2831
2832 nggso.num_vertices = num_vertices_val;
2833 nggso.prim_enable[0] = is_gs_thread;
2834
2835 for (unsigned i = 0; i < num_vertices; ++i)
2836 nggso.vertices[i] = ngg_nogs_vertex_ptr(ctx, vtxindex[i]);
2837
2838 build_streamout(ctx, &nggso);
2839 }
2840
2841 /* Copy Primitive IDs from GS threads to the LDS address corresponding
2842 * to the ES thread of the provoking vertex.
2843 */
2844 if (ctx->stage == MESA_SHADER_VERTEX &&
2845 ctx->args->options->key.vs_common_out.export_prim_id) {
2846 if (ctx->args->shader_info->so.num_outputs)
2847 ac_build_s_barrier(&ctx->ac);
2848
2849 ac_build_ifcc(&ctx->ac, is_gs_thread, 5400);
2850 /* Extract the PROVOKING_VTX_INDEX field. */
2851 LLVMValueRef provoking_vtx_in_prim =
2852 LLVMConstInt(ctx->ac.i32, 0, false);
2853
2854 /* provoking_vtx_index = vtxindex[provoking_vtx_in_prim]; */
2855 LLVMValueRef indices = ac_build_gather_values(&ctx->ac, vtxindex, 3);
2856 LLVMValueRef provoking_vtx_index =
2857 LLVMBuildExtractElement(builder, indices, provoking_vtx_in_prim, "");
2858
2859 LLVMBuildStore(builder, ac_get_arg(&ctx->ac, ctx->args->ac.gs_prim_id),
2860 ac_build_gep0(&ctx->ac, ctx->esgs_ring, provoking_vtx_index));
2861 ac_build_endif(&ctx->ac, 5400);
2862 }
2863
2864 /* TODO: primitive culling */
2865
2866 ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx),
2867 ngg_get_vtx_cnt(ctx), ngg_get_prim_cnt(ctx));
2868
2869 /* TODO: streamout queries */
2870 /* Export primitive data to the index buffer.
2871 *
2872 * For the first version, we will always build up all three indices
2873 * independent of the primitive type. The additional garbage data
2874 * shouldn't hurt.
2875 *
2876 * TODO: culling depends on the primitive type, so can have some
2877 * interaction here.
2878 */
2879 ac_build_ifcc(&ctx->ac, is_gs_thread, 6001);
2880 {
2881 struct ac_ngg_prim prim = {};
2882
2883 if (ctx->args->options->key.vs_common_out.as_ngg_passthrough) {
2884 prim.passthrough = ac_get_arg(&ctx->ac, ctx->args->gs_vtx_offset[0]);
2885 } else {
2886 prim.num_vertices = num_vertices;
2887 prim.isnull = ctx->ac.i1false;
2888 memcpy(prim.index, vtxindex, sizeof(vtxindex[0]) * 3);
2889
2890 for (unsigned i = 0; i < num_vertices; ++i) {
2891 tmp = LLVMBuildLShr(builder,
2892 ac_get_arg(&ctx->ac, ctx->args->ac.gs_invocation_id),
2893 LLVMConstInt(ctx->ac.i32, 8 + i, false), "");
2894 prim.edgeflag[i] = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
2895 }
2896 }
2897
2898 ac_build_export_prim(&ctx->ac, &prim);
2899 }
2900 ac_build_endif(&ctx->ac, 6001);
2901
2902 /* Export per-vertex data (positions and parameters). */
2903 ac_build_ifcc(&ctx->ac, is_es_thread, 6002);
2904 {
2905 struct radv_vs_output_info *outinfo =
2906 ctx->stage == MESA_SHADER_TESS_EVAL ?
2907 &ctx->args->shader_info->tes.outinfo : &ctx->args->shader_info->vs.outinfo;
2908
2909 /* Exporting the primitive ID is handled below. */
2910 /* TODO: use the new VS export path */
2911 handle_vs_outputs_post(ctx, false,
2912 ctx->args->options->key.vs_common_out.export_clip_dists,
2913 outinfo);
2914
2915 if (ctx->args->options->key.vs_common_out.export_prim_id) {
2916 unsigned param_count = outinfo->param_exports;
2917 LLVMValueRef values[4];
2918
2919 if (ctx->stage == MESA_SHADER_VERTEX) {
2920 /* Wait for GS stores to finish. */
2921 ac_build_s_barrier(&ctx->ac);
2922
2923 tmp = ac_build_gep0(&ctx->ac, ctx->esgs_ring,
2924 get_thread_id_in_tg(ctx));
2925 values[0] = LLVMBuildLoad(builder, tmp, "");
2926 } else {
2927 assert(ctx->stage == MESA_SHADER_TESS_EVAL);
2928 values[0] = ac_get_arg(&ctx->ac, ctx->args->ac.tes_patch_id);
2929 }
2930
2931 values[0] = ac_to_float(&ctx->ac, values[0]);
2932 for (unsigned j = 1; j < 4; j++)
2933 values[j] = ctx->ac.f32_0;
2934
2935 radv_export_param(ctx, param_count, values, 0x1);
2936
2937 outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] = param_count++;
2938 outinfo->param_exports = param_count;
2939 }
2940 }
2941 ac_build_endif(&ctx->ac, 6002);
2942 }
2943
2944 static void gfx10_ngg_gs_emit_prologue(struct radv_shader_context *ctx)
2945 {
2946 /* Zero out the part of LDS scratch that is used to accumulate the
2947 * per-stream generated primitive count.
2948 */
2949 LLVMBuilderRef builder = ctx->ac.builder;
2950 LLVMValueRef scratchptr = ctx->gs_ngg_scratch;
2951 LLVMValueRef tid = get_thread_id_in_tg(ctx);
2952 LLVMBasicBlockRef merge_block;
2953 LLVMValueRef cond;
2954
2955 LLVMValueRef fn = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx->ac.builder));
2956 LLVMBasicBlockRef then_block = LLVMAppendBasicBlockInContext(ctx->ac.context, fn, "");
2957 merge_block = LLVMAppendBasicBlockInContext(ctx->ac.context, fn, "");
2958
2959 cond = LLVMBuildICmp(builder, LLVMIntULT, tid, LLVMConstInt(ctx->ac.i32, 4, false), "");
2960 LLVMBuildCondBr(ctx->ac.builder, cond, then_block, merge_block);
2961 LLVMPositionBuilderAtEnd(ctx->ac.builder, then_block);
2962
2963 LLVMValueRef ptr = ac_build_gep0(&ctx->ac, scratchptr, tid);
2964 LLVMBuildStore(builder, ctx->ac.i32_0, ptr);
2965
2966 LLVMBuildBr(ctx->ac.builder, merge_block);
2967 LLVMPositionBuilderAtEnd(ctx->ac.builder, merge_block);
2968
2969 ac_build_s_barrier(&ctx->ac);
2970 }
2971
2972 static void gfx10_ngg_gs_emit_epilogue_1(struct radv_shader_context *ctx)
2973 {
2974 LLVMBuilderRef builder = ctx->ac.builder;
2975 LLVMValueRef i8_0 = LLVMConstInt(ctx->ac.i8, 0, false);
2976 LLVMValueRef tmp;
2977
2978 /* Zero out remaining (non-emitted) primitive flags.
2979 *
2980 * Note: Alternatively, we could pass the relevant gs_next_vertex to
2981 * the emit threads via LDS. This is likely worse in the expected
2982 * typical case where each GS thread emits the full set of
2983 * vertices.
2984 */
2985 for (unsigned stream = 0; stream < 4; ++stream) {
2986 unsigned num_components;
2987
2988 num_components =
2989 ctx->args->shader_info->gs.num_stream_output_components[stream];
2990 if (!num_components)
2991 continue;
2992
2993 const LLVMValueRef gsthread = get_thread_id_in_tg(ctx);
2994
2995 ac_build_bgnloop(&ctx->ac, 5100);
2996
2997 const LLVMValueRef vertexidx =
2998 LLVMBuildLoad(builder, ctx->gs_next_vertex[stream], "");
2999 tmp = LLVMBuildICmp(builder, LLVMIntUGE, vertexidx,
3000 LLVMConstInt(ctx->ac.i32, ctx->shader->info.gs.vertices_out, false), "");
3001 ac_build_ifcc(&ctx->ac, tmp, 5101);
3002 ac_build_break(&ctx->ac);
3003 ac_build_endif(&ctx->ac, 5101);
3004
3005 tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, "");
3006 LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]);
3007
3008 tmp = ngg_gs_emit_vertex_ptr(ctx, gsthread, vertexidx);
3009 LLVMBuildStore(builder, i8_0,
3010 ngg_gs_get_emit_primflag_ptr(ctx, tmp, stream));
3011
3012 ac_build_endloop(&ctx->ac, 5100);
3013 }
3014
3015 /* Accumulate generated primitives counts across the entire threadgroup. */
3016 for (unsigned stream = 0; stream < 4; ++stream) {
3017 unsigned num_components;
3018
3019 num_components =
3020 ctx->args->shader_info->gs.num_stream_output_components[stream];
3021 if (!num_components)
3022 continue;
3023
3024 LLVMValueRef numprims =
3025 LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], "");
3026 numprims = ac_build_reduce(&ctx->ac, numprims, nir_op_iadd, ctx->ac.wave_size);
3027
3028 tmp = LLVMBuildICmp(builder, LLVMIntEQ, ac_get_thread_id(&ctx->ac), ctx->ac.i32_0, "");
3029 ac_build_ifcc(&ctx->ac, tmp, 5105);
3030 {
3031 LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd,
3032 ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch,
3033 LLVMConstInt(ctx->ac.i32, stream, false)),
3034 numprims, LLVMAtomicOrderingMonotonic, false);
3035 }
3036 ac_build_endif(&ctx->ac, 5105);
3037 }
3038 }
3039
3040 static void gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx)
3041 {
3042 const unsigned verts_per_prim = si_conv_gl_prim_to_vertices(ctx->shader->info.gs.output_primitive);
3043 LLVMBuilderRef builder = ctx->ac.builder;
3044 LLVMValueRef tmp, tmp2;
3045
3046 ac_build_s_barrier(&ctx->ac);
3047
3048 const LLVMValueRef tid = get_thread_id_in_tg(ctx);
3049 LLVMValueRef num_emit_threads = ngg_get_prim_cnt(ctx);
3050
3051 /* Streamout */
3052 if (ctx->args->shader_info->so.num_outputs) {
3053 struct ngg_streamout nggso = {};
3054
3055 nggso.num_vertices = LLVMConstInt(ctx->ac.i32, verts_per_prim, false);
3056
3057 LLVMValueRef vertexptr = ngg_gs_vertex_ptr(ctx, tid);
3058 for (unsigned stream = 0; stream < 4; ++stream) {
3059 if (!ctx->args->shader_info->gs.num_stream_output_components[stream])
3060 continue;
3061
3062 tmp = LLVMBuildLoad(builder,
3063 ngg_gs_get_emit_primflag_ptr(ctx, vertexptr, stream), "");
3064 tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
3065 tmp2 = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, "");
3066 nggso.prim_enable[stream] = LLVMBuildAnd(builder, tmp, tmp2, "");
3067 }
3068
3069 for (unsigned i = 0; i < verts_per_prim; ++i) {
3070 tmp = LLVMBuildSub(builder, tid,
3071 LLVMConstInt(ctx->ac.i32, verts_per_prim - i - 1, false), "");
3072 tmp = ngg_gs_vertex_ptr(ctx, tmp);
3073 nggso.vertices[i] = ac_build_gep0(&ctx->ac, tmp, ctx->ac.i32_0);
3074 }
3075
3076 build_streamout(ctx, &nggso);
3077 }
3078
3079 /* Write shader query data. */
3080 tmp = ac_get_arg(&ctx->ac, ctx->args->ngg_gs_state);
3081 tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
3082 ac_build_ifcc(&ctx->ac, tmp, 5109);
3083 tmp = LLVMBuildICmp(builder, LLVMIntULT, tid,
3084 LLVMConstInt(ctx->ac.i32, 4, false), "");
3085 ac_build_ifcc(&ctx->ac, tmp, 5110);
3086 {
3087 tmp = LLVMBuildLoad(builder, ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid), "");
3088
3089 ac_llvm_add_target_dep_function_attr(ctx->main_function,
3090 "amdgpu-gds-size", 256);
3091
3092 LLVMTypeRef gdsptr = LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS);
3093 LLVMValueRef gdsbase = LLVMBuildIntToPtr(builder, ctx->ac.i32_0, gdsptr, "");
3094
3095 const char *sync_scope = LLVM_VERSION_MAJOR >= 9 ? "workgroup-one-as" : "workgroup";
3096
3097 /* Use a plain GDS atomic to accumulate the number of generated
3098 * primitives.
3099 */
3100 ac_build_atomic_rmw(&ctx->ac, LLVMAtomicRMWBinOpAdd, gdsbase,
3101 tmp, sync_scope);
3102 }
3103 ac_build_endif(&ctx->ac, 5110);
3104 ac_build_endif(&ctx->ac, 5109);
3105
3106 /* TODO: culling */
3107
3108 /* Determine vertex liveness. */
3109 LLVMValueRef vertliveptr = ac_build_alloca(&ctx->ac, ctx->ac.i1, "vertexlive");
3110
3111 tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, "");
3112 ac_build_ifcc(&ctx->ac, tmp, 5120);
3113 {
3114 for (unsigned i = 0; i < verts_per_prim; ++i) {
3115 const LLVMValueRef primidx =
3116 LLVMBuildAdd(builder, tid,
3117 LLVMConstInt(ctx->ac.i32, i, false), "");
3118
3119 if (i > 0) {
3120 tmp = LLVMBuildICmp(builder, LLVMIntULT, primidx, num_emit_threads, "");
3121 ac_build_ifcc(&ctx->ac, tmp, 5121 + i);
3122 }
3123
3124 /* Load primitive liveness */
3125 tmp = ngg_gs_vertex_ptr(ctx, primidx);
3126 tmp = LLVMBuildLoad(builder,
3127 ngg_gs_get_emit_primflag_ptr(ctx, tmp, 0), "");
3128 const LLVMValueRef primlive =
3129 LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
3130
3131 tmp = LLVMBuildLoad(builder, vertliveptr, "");
3132 tmp = LLVMBuildOr(builder, tmp, primlive, ""),
3133 LLVMBuildStore(builder, tmp, vertliveptr);
3134
3135 if (i > 0)
3136 ac_build_endif(&ctx->ac, 5121 + i);
3137 }
3138 }
3139 ac_build_endif(&ctx->ac, 5120);
3140
3141 /* Inclusive scan addition across the current wave. */
3142 LLVMValueRef vertlive = LLVMBuildLoad(builder, vertliveptr, "");
3143 struct ac_wg_scan vertlive_scan = {};
3144 vertlive_scan.op = nir_op_iadd;
3145 vertlive_scan.enable_reduce = true;
3146 vertlive_scan.enable_exclusive = true;
3147 vertlive_scan.src = vertlive;
3148 vertlive_scan.scratch = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, ctx->ac.i32_0);
3149 vertlive_scan.waveidx = get_wave_id_in_tg(ctx);
3150 vertlive_scan.numwaves = get_tgsize(ctx);
3151 vertlive_scan.maxwaves = 8;
3152
3153 ac_build_wg_scan(&ctx->ac, &vertlive_scan);
3154
3155 /* Skip all exports (including index exports) when possible. At least on
3156 * early gfx10 revisions this is also to avoid hangs.
3157 */
3158 LLVMValueRef have_exports =
3159 LLVMBuildICmp(builder, LLVMIntNE, vertlive_scan.result_reduce, ctx->ac.i32_0, "");
3160 num_emit_threads =
3161 LLVMBuildSelect(builder, have_exports, num_emit_threads, ctx->ac.i32_0, "");
3162
3163 /* Allocate export space. Send this message as early as possible, to
3164 * hide the latency of the SQ <-> SPI roundtrip.
3165 *
3166 * Note: We could consider compacting primitives for export as well.
3167 * PA processes 1 non-null prim / clock, but it fetches 4 DW of
3168 * prim data per clock and skips null primitives at no additional
3169 * cost. So compacting primitives can only be beneficial when
3170 * there are 4 or more contiguous null primitives in the export
3171 * (in the common case of single-dword prim exports).
3172 */
3173 ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx),
3174 vertlive_scan.result_reduce, num_emit_threads);
3175
3176 /* Setup the reverse vertex compaction permutation. We re-use stream 1
3177 * of the primitive liveness flags, relying on the fact that each
3178 * threadgroup can have at most 256 threads. */
3179 ac_build_ifcc(&ctx->ac, vertlive, 5130);
3180 {
3181 tmp = ngg_gs_vertex_ptr(ctx, vertlive_scan.result_exclusive);
3182 tmp2 = LLVMBuildTrunc(builder, tid, ctx->ac.i8, "");
3183 LLVMBuildStore(builder, tmp2,
3184 ngg_gs_get_emit_primflag_ptr(ctx, tmp, 1));
3185 }
3186 ac_build_endif(&ctx->ac, 5130);
3187
3188 ac_build_s_barrier(&ctx->ac);
3189
3190 /* Export primitive data */
3191 tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, "");
3192 ac_build_ifcc(&ctx->ac, tmp, 5140);
3193 {
3194 LLVMValueRef flags;
3195 struct ac_ngg_prim prim = {};
3196 prim.num_vertices = verts_per_prim;
3197
3198 tmp = ngg_gs_vertex_ptr(ctx, tid);
3199 flags = LLVMBuildLoad(builder,
3200 ngg_gs_get_emit_primflag_ptr(ctx, tmp, 0), "");
3201 prim.isnull = LLVMBuildNot(builder, LLVMBuildTrunc(builder, flags, ctx->ac.i1, ""), "");
3202
3203 for (unsigned i = 0; i < verts_per_prim; ++i) {
3204 prim.index[i] = LLVMBuildSub(builder, vertlive_scan.result_exclusive,
3205 LLVMConstInt(ctx->ac.i32, verts_per_prim - i - 1, false), "");
3206 prim.edgeflag[i] = ctx->ac.i1false;
3207 }
3208
3209 /* Geometry shaders output triangle strips, but NGG expects
3210 * triangles. We need to change the vertex order for odd
3211 * triangles to get correct front/back facing by swapping 2
3212 * vertex indices, but we also have to keep the provoking
3213 * vertex in the same place.
3214 */
3215 if (verts_per_prim == 3) {
3216 LLVMValueRef is_odd = LLVMBuildLShr(builder, flags, ctx->ac.i8_1, "");
3217 is_odd = LLVMBuildTrunc(builder, is_odd, ctx->ac.i1, "");
3218
3219 struct ac_ngg_prim in = prim;
3220 prim.index[0] = in.index[0];
3221 prim.index[1] = LLVMBuildSelect(builder, is_odd,
3222 in.index[2], in.index[1], "");
3223 prim.index[2] = LLVMBuildSelect(builder, is_odd,
3224 in.index[1], in.index[2], "");
3225 }
3226
3227 ac_build_export_prim(&ctx->ac, &prim);
3228 }
3229 ac_build_endif(&ctx->ac, 5140);
3230
3231 /* Export position and parameter data */
3232 tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, vertlive_scan.result_reduce, "");
3233 ac_build_ifcc(&ctx->ac, tmp, 5145);
3234 {
3235 struct radv_vs_output_info *outinfo = &ctx->args->shader_info->vs.outinfo;
3236 bool export_view_index = ctx->args->options->key.has_multiview_view_index;
3237 struct radv_shader_output_values *outputs;
3238 unsigned noutput = 0;
3239
3240 /* Allocate a temporary array for the output values. */
3241 unsigned num_outputs = util_bitcount64(ctx->output_mask) + export_view_index;
3242 outputs = calloc(num_outputs, sizeof(outputs[0]));
3243
3244 memset(outinfo->vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
3245 sizeof(outinfo->vs_output_param_offset));
3246 outinfo->pos_exports = 0;
3247
3248 tmp = ngg_gs_vertex_ptr(ctx, tid);
3249 tmp = LLVMBuildLoad(builder,
3250 ngg_gs_get_emit_primflag_ptr(ctx, tmp, 1), "");
3251 tmp = LLVMBuildZExt(builder, tmp, ctx->ac.i32, "");
3252 const LLVMValueRef vertexptr = ngg_gs_vertex_ptr(ctx, tmp);
3253
3254 unsigned out_idx = 0;
3255 for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
3256 unsigned output_usage_mask =
3257 ctx->args->shader_info->gs.output_usage_mask[i];
3258 int length = util_last_bit(output_usage_mask);
3259
3260 if (!(ctx->output_mask & (1ull << i)))
3261 continue;
3262
3263 outputs[noutput].slot_name = i;
3264 outputs[noutput].slot_index = i == VARYING_SLOT_CLIP_DIST1;
3265 outputs[noutput].usage_mask = output_usage_mask;
3266
3267 for (unsigned j = 0; j < length; j++, out_idx++) {
3268 if (!(output_usage_mask & (1 << j)))
3269 continue;
3270
3271 tmp = ngg_gs_get_emit_output_ptr(ctx, vertexptr, out_idx);
3272 tmp = LLVMBuildLoad(builder, tmp, "");
3273
3274 LLVMTypeRef type = LLVMGetAllocatedType(ctx->abi.outputs[ac_llvm_reg_index_soa(i, j)]);
3275 if (ac_get_type_size(type) == 2) {
3276 tmp = ac_to_integer(&ctx->ac, tmp);
3277 tmp = LLVMBuildTrunc(ctx->ac.builder, tmp, ctx->ac.i16, "");
3278 }
3279
3280 outputs[noutput].values[j] = ac_to_float(&ctx->ac, tmp);
3281 }
3282
3283 for (unsigned j = length; j < 4; j++)
3284 outputs[noutput].values[j] = LLVMGetUndef(ctx->ac.f32);
3285
3286 noutput++;
3287 }
3288
3289 /* Export ViewIndex. */
3290 if (export_view_index) {
3291 outputs[noutput].slot_name = VARYING_SLOT_LAYER;
3292 outputs[noutput].slot_index = 0;
3293 outputs[noutput].usage_mask = 0x1;
3294 outputs[noutput].values[0] =
3295 ac_to_float(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.view_index));
3296 for (unsigned j = 1; j < 4; j++)
3297 outputs[noutput].values[j] = ctx->ac.f32_0;
3298 noutput++;
3299 }
3300
3301 radv_llvm_export_vs(ctx, outputs, noutput, outinfo,
3302 ctx->args->options->key.vs_common_out.export_clip_dists);
3303 FREE(outputs);
3304 }
3305 ac_build_endif(&ctx->ac, 5145);
3306 }
3307
3308 static void gfx10_ngg_gs_emit_vertex(struct radv_shader_context *ctx,
3309 unsigned stream,
3310 LLVMValueRef vertexidx,
3311 LLVMValueRef *addrs)
3312 {
3313 LLVMBuilderRef builder = ctx->ac.builder;
3314 LLVMValueRef tmp;
3315
3316 const LLVMValueRef vertexptr =
3317 ngg_gs_emit_vertex_ptr(ctx, get_thread_id_in_tg(ctx), vertexidx);
3318 unsigned out_idx = 0;
3319 for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
3320 unsigned output_usage_mask =
3321 ctx->args->shader_info->gs.output_usage_mask[i];
3322 uint8_t output_stream =
3323 ctx->args->shader_info->gs.output_streams[i];
3324 LLVMValueRef *out_ptr = &addrs[i * 4];
3325 int length = util_last_bit(output_usage_mask);
3326
3327 if (!(ctx->output_mask & (1ull << i)) ||
3328 output_stream != stream)
3329 continue;
3330
3331 for (unsigned j = 0; j < length; j++, out_idx++) {
3332 if (!(output_usage_mask & (1 << j)))
3333 continue;
3334
3335 LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder,
3336 out_ptr[j], "");
3337 out_val = ac_to_integer(&ctx->ac, out_val);
3338 out_val = LLVMBuildZExtOrBitCast(ctx->ac.builder, out_val, ctx->ac.i32, "");
3339
3340 LLVMBuildStore(builder, out_val,
3341 ngg_gs_get_emit_output_ptr(ctx, vertexptr, out_idx));
3342 }
3343 }
3344 assert(out_idx * 4 <= ctx->args->shader_info->gs.gsvs_vertex_size);
3345
3346 	/* Store the current number of emitted vertices so that the remaining
3347 	 * primitive flags can be zeroed out later, in case the geometry
3348 	 * shader doesn't emit the maximum number of vertices.
3349 	 */
3350 tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, "");
3351 LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]);
3352
3353 /* Determine and store whether this vertex completed a primitive. */
3354 const LLVMValueRef curverts = LLVMBuildLoad(builder, ctx->gs_curprim_verts[stream], "");
3355
3356 tmp = LLVMConstInt(ctx->ac.i32, si_conv_gl_prim_to_vertices(ctx->shader->info.gs.output_primitive) - 1, false);
3357 const LLVMValueRef iscompleteprim =
3358 LLVMBuildICmp(builder, LLVMIntUGE, curverts, tmp, "");
3359
3360 /* Since the geometry shader emits triangle strips, we need to
3361 * track which primitive is odd and swap vertex indices to get
3362 * the correct vertex order.
3363 */
3364 LLVMValueRef is_odd = ctx->ac.i1false;
3365 if (stream == 0 &&
3366 si_conv_gl_prim_to_vertices(ctx->shader->info.gs.output_primitive) == 3) {
3367 tmp = LLVMBuildAnd(builder, curverts, ctx->ac.i32_1, "");
3368 is_odd = LLVMBuildICmp(builder, LLVMIntEQ, tmp, ctx->ac.i32_1, "");
3369 }
3370
3371 tmp = LLVMBuildAdd(builder, curverts, ctx->ac.i32_1, "");
3372 LLVMBuildStore(builder, tmp, ctx->gs_curprim_verts[stream]);
3373
3374 /* The per-vertex primitive flag encoding:
3375 * bit 0: whether this vertex finishes a primitive
3376 * bit 1: whether the primitive is odd (if we are emitting triangle strips)
3377 */
3378 tmp = LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i8, "");
3379 tmp = LLVMBuildOr(builder, tmp,
3380 LLVMBuildShl(builder,
3381 LLVMBuildZExt(builder, is_odd, ctx->ac.i8, ""),
3382 ctx->ac.i8_1, ""), "");
3383 LLVMBuildStore(builder, tmp,
3384 ngg_gs_get_emit_primflag_ptr(ctx, vertexptr, stream));
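	/* Worked example for a triangle strip: the vertex emitted with
	 * curverts == 2 completes the first (even) triangle and stores
	 * flags 0b01; the vertex with curverts == 3 completes an odd
	 * triangle and stores 0b11.
	 */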
3385
3386 tmp = LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], "");
3387 tmp = LLVMBuildAdd(builder, tmp, LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i32, ""), "");
3388 LLVMBuildStore(builder, tmp, ctx->gs_generated_prims[stream]);
3389 }
3390
3391 static void
3392 write_tess_factors(struct radv_shader_context *ctx)
3393 {
3394 unsigned stride, outer_comps, inner_comps;
3395 LLVMValueRef tcs_rel_ids = ac_get_arg(&ctx->ac, ctx->args->ac.tcs_rel_ids);
3396 LLVMValueRef invocation_id = ac_unpack_param(&ctx->ac, tcs_rel_ids, 8, 5);
3397 LLVMValueRef rel_patch_id = ac_unpack_param(&ctx->ac, tcs_rel_ids, 0, 8);
3398 unsigned tess_inner_index = 0, tess_outer_index;
3399 LLVMValueRef lds_base, lds_inner = NULL, lds_outer, byteoffset, buffer;
3400 LLVMValueRef out[6], vec0, vec1, tf_base, inner[4], outer[4];
3401 int i;
3402 ac_emit_barrier(&ctx->ac, ctx->stage);
3403
3404 switch (ctx->args->options->key.tcs.primitive_mode) {
3405 case GL_ISOLINES:
3406 stride = 2;
3407 outer_comps = 2;
3408 inner_comps = 0;
3409 break;
3410 case GL_TRIANGLES:
3411 stride = 4;
3412 outer_comps = 3;
3413 inner_comps = 1;
3414 break;
3415 case GL_QUADS:
3416 stride = 6;
3417 outer_comps = 4;
3418 inner_comps = 2;
3419 break;
3420 default:
3421 return;
3422 }
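	/* Resulting TF ring entry per patch, one dword per factor:
	 * isolines -> outer[0..1]; triangles -> outer[0..2], inner[0];
	 * quads -> outer[0..3], inner[0..1]. The stride above is exactly
	 * that dword count.
	 */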
3423
3424 ac_build_ifcc(&ctx->ac,
3425 LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ,
3426 invocation_id, ctx->ac.i32_0, ""), 6503);
3427
3428 lds_base = get_tcs_out_current_patch_data_offset(ctx);
3429
3430 if (inner_comps) {
3431 tess_inner_index = shader_io_get_unique_index(VARYING_SLOT_TESS_LEVEL_INNER);
3432 lds_inner = LLVMBuildAdd(ctx->ac.builder, lds_base,
3433 LLVMConstInt(ctx->ac.i32, tess_inner_index * 4, false), "");
3434 }
3435
3436 tess_outer_index = shader_io_get_unique_index(VARYING_SLOT_TESS_LEVEL_OUTER);
3437 lds_outer = LLVMBuildAdd(ctx->ac.builder, lds_base,
3438 LLVMConstInt(ctx->ac.i32, tess_outer_index * 4, false), "");
3439
3440 for (i = 0; i < 4; i++) {
3441 inner[i] = LLVMGetUndef(ctx->ac.i32);
3442 outer[i] = LLVMGetUndef(ctx->ac.i32);
3443 }
3444
3445 	/* LINES reversal: the HW expects the two outer isoline factors in reversed order, so they are swapped below. */
3446 if (ctx->args->options->key.tcs.primitive_mode == GL_ISOLINES) {
3447 outer[0] = out[1] = ac_lds_load(&ctx->ac, lds_outer);
3448 lds_outer = LLVMBuildAdd(ctx->ac.builder, lds_outer,
3449 ctx->ac.i32_1, "");
3450 outer[1] = out[0] = ac_lds_load(&ctx->ac, lds_outer);
3451 } else {
3452 for (i = 0; i < outer_comps; i++) {
3453 outer[i] = out[i] =
3454 ac_lds_load(&ctx->ac, lds_outer);
3455 lds_outer = LLVMBuildAdd(ctx->ac.builder, lds_outer,
3456 ctx->ac.i32_1, "");
3457 }
3458 for (i = 0; i < inner_comps; i++) {
3459 inner[i] = out[outer_comps+i] =
3460 ac_lds_load(&ctx->ac, lds_inner);
3461 lds_inner = LLVMBuildAdd(ctx->ac.builder, lds_inner,
3462 ctx->ac.i32_1, "");
3463 }
3464 }
3465
3466 /* Convert the outputs to vectors for stores. */
3467 vec0 = ac_build_gather_values(&ctx->ac, out, MIN2(stride, 4));
3468 vec1 = NULL;
3469
3470 if (stride > 4)
3471 vec1 = ac_build_gather_values(&ctx->ac, out + 4, stride - 4);
3472
3473
3474 buffer = ctx->hs_ring_tess_factor;
3475 tf_base = ac_get_arg(&ctx->ac, ctx->args->tess_factor_offset);
3476 byteoffset = LLVMBuildMul(ctx->ac.builder, rel_patch_id,
3477 LLVMConstInt(ctx->ac.i32, 4 * stride, false), "");
3478 unsigned tf_offset = 0;
3479
3480 if (ctx->ac.chip_class <= GFX8) {
3481 ac_build_ifcc(&ctx->ac,
3482 LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ,
3483 rel_patch_id, ctx->ac.i32_0, ""), 6504);
3484
3485 /* Store the dynamic HS control word. */
3486 ac_build_buffer_store_dword(&ctx->ac, buffer,
3487 LLVMConstInt(ctx->ac.i32, 0x80000000, false),
3488 1, ctx->ac.i32_0, tf_base,
3489 0, ac_glc);
3490 tf_offset += 4;
3491
3492 ac_build_endif(&ctx->ac, 6504);
3493 }
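	/* On GFX8 and older, every tess factor store below is shifted by
	 * tf_offset (4 bytes) so the factors land after the control word
	 * at dword 0 of the ring.
	 */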
3494
3495 /* Store the tessellation factors. */
3496 ac_build_buffer_store_dword(&ctx->ac, buffer, vec0,
3497 MIN2(stride, 4), byteoffset, tf_base,
3498 tf_offset, ac_glc);
3499 if (vec1)
3500 ac_build_buffer_store_dword(&ctx->ac, buffer, vec1,
3501 stride - 4, byteoffset, tf_base,
3502 16 + tf_offset, ac_glc);
3503
3504 	/* Store the factors to the offchip buffer for TES to read, but only if TES actually reads them. */
3505 if (ctx->args->options->key.tcs.tes_reads_tess_factors) {
3506 LLVMValueRef inner_vec, outer_vec, tf_outer_offset;
3507 LLVMValueRef tf_inner_offset;
3508 unsigned param_outer, param_inner;
3509
3510 param_outer = shader_io_get_unique_index(VARYING_SLOT_TESS_LEVEL_OUTER);
3511 tf_outer_offset = get_tcs_tes_buffer_address(ctx, NULL,
3512 LLVMConstInt(ctx->ac.i32, param_outer, 0));
3513
3514 outer_vec = ac_build_gather_values(&ctx->ac, outer,
3515 util_next_power_of_two(outer_comps));
3516
3517 ac_build_buffer_store_dword(&ctx->ac, ctx->hs_ring_tess_offchip, outer_vec,
3518 outer_comps, tf_outer_offset,
3519 ac_get_arg(&ctx->ac, ctx->args->oc_lds),
3520 0, ac_glc);
3521 if (inner_comps) {
3522 param_inner = shader_io_get_unique_index(VARYING_SLOT_TESS_LEVEL_INNER);
3523 tf_inner_offset = get_tcs_tes_buffer_address(ctx, NULL,
3524 LLVMConstInt(ctx->ac.i32, param_inner, 0));
3525
3526 inner_vec = inner_comps == 1 ? inner[0] :
3527 ac_build_gather_values(&ctx->ac, inner, inner_comps);
3528 ac_build_buffer_store_dword(&ctx->ac, ctx->hs_ring_tess_offchip, inner_vec,
3529 inner_comps, tf_inner_offset,
3530 ac_get_arg(&ctx->ac, ctx->args->oc_lds),
3531 0, ac_glc);
3532 }
3533 }
3534
3535 ac_build_endif(&ctx->ac, 6503);
3536 }
3537
3538 static void
3539 handle_tcs_outputs_post(struct radv_shader_context *ctx)
3540 {
3541 write_tess_factors(ctx);
3542 }
3543
3544 static bool
3545 si_export_mrt_color(struct radv_shader_context *ctx,
3546 LLVMValueRef *color, unsigned index,
3547 struct ac_export_args *args)
3548 {
3549 /* Export */
3550 si_llvm_init_export_args(ctx, color, 0xf,
3551 V_008DFC_SQ_EXP_MRT + index, args);
3552 if (!args->enabled_channels)
3553 return false; /* unnecessary NULL export */
3554
3555 return true;
3556 }
3557
3558 static void
3559 radv_export_mrt_z(struct radv_shader_context *ctx,
3560 LLVMValueRef depth, LLVMValueRef stencil,
3561 LLVMValueRef samplemask)
3562 {
3563 struct ac_export_args args;
3564
3565 ac_export_mrt_z(&ctx->ac, depth, stencil, samplemask, &args);
3566
3567 ac_build_export(&ctx->ac, &args);
3568 }
3569
3570 static void
3571 handle_fs_outputs_post(struct radv_shader_context *ctx)
3572 {
3573 unsigned index = 0;
3574 LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
3575 struct ac_export_args color_args[8];
3576
3577 for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
3578 LLVMValueRef values[4];
3579
3580 if (!(ctx->output_mask & (1ull << i)))
3581 continue;
3582
3583 if (i < FRAG_RESULT_DATA0)
3584 continue;
3585
3586 for (unsigned j = 0; j < 4; j++)
3587 values[j] = ac_to_float(&ctx->ac,
3588 radv_load_output(ctx, i, j));
3589
3590 bool ret = si_export_mrt_color(ctx, values,
3591 i - FRAG_RESULT_DATA0,
3592 &color_args[index]);
3593 if (ret)
3594 index++;
3595 }
3596
3597 /* Process depth, stencil, samplemask. */
3598 if (ctx->args->shader_info->ps.writes_z) {
3599 depth = ac_to_float(&ctx->ac,
3600 radv_load_output(ctx, FRAG_RESULT_DEPTH, 0));
3601 }
3602 if (ctx->args->shader_info->ps.writes_stencil) {
3603 stencil = ac_to_float(&ctx->ac,
3604 radv_load_output(ctx, FRAG_RESULT_STENCIL, 0));
3605 }
3606 if (ctx->args->shader_info->ps.writes_sample_mask) {
3607 samplemask = ac_to_float(&ctx->ac,
3608 radv_load_output(ctx, FRAG_RESULT_SAMPLE_MASK, 0));
3609 }
3610
3611 	/* Set the DONE bit on the last non-null color export only if Z isn't
3612 	 * exported.
3613 	 */
3614 if (index > 0 &&
3615 !ctx->args->shader_info->ps.writes_z &&
3616 !ctx->args->shader_info->ps.writes_stencil &&
3617 !ctx->args->shader_info->ps.writes_sample_mask) {
3618 unsigned last = index - 1;
3619
3620 color_args[last].valid_mask = 1; /* whether the EXEC mask is valid */
3621 color_args[last].done = 1; /* DONE bit */
3622 }
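	/* E.g. a shader writing only MRT0 and MRT1: the MRT1 export gets
	 * valid_mask/done set here; if depth, stencil or sample mask is
	 * written, the Z export built below carries the DONE bit instead
	 * (cf. the condition above).
	 */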
3623
3624 /* Export PS outputs. */
3625 for (unsigned i = 0; i < index; i++)
3626 ac_build_export(&ctx->ac, &color_args[i]);
3627
3628 if (depth || stencil || samplemask)
3629 radv_export_mrt_z(ctx, depth, stencil, samplemask);
3630 else if (!index)
3631 ac_build_export_null(&ctx->ac);
3632 }
3633
3634 static void
3635 emit_gs_epilogue(struct radv_shader_context *ctx)
3636 {
3637 if (ctx->args->options->key.vs_common_out.as_ngg) {
3638 gfx10_ngg_gs_emit_epilogue_1(ctx);
3639 return;
3640 }
3641
3642 if (ctx->ac.chip_class >= GFX10)
3643 LLVMBuildFence(ctx->ac.builder, LLVMAtomicOrderingRelease, false, "");
3644
3645 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE, ctx->gs_wave_id);
3646 }
3647
3648 static void
3649 handle_shader_outputs_post(struct ac_shader_abi *abi, unsigned max_outputs,
3650 LLVMValueRef *addrs)
3651 {
3652 struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
3653
3654 switch (ctx->stage) {
3655 case MESA_SHADER_VERTEX:
3656 if (ctx->args->options->key.vs_common_out.as_ls)
3657 handle_ls_outputs_post(ctx);
3658 else if (ctx->args->options->key.vs_common_out.as_es)
3659 handle_es_outputs_post(ctx, &ctx->args->shader_info->vs.es_info);
3660 else if (ctx->args->options->key.vs_common_out.as_ngg)
3661 handle_ngg_outputs_post_1(ctx);
3662 else
3663 handle_vs_outputs_post(ctx, ctx->args->options->key.vs_common_out.export_prim_id,
3664 ctx->args->options->key.vs_common_out.export_clip_dists,
3665 &ctx->args->shader_info->vs.outinfo);
3666 break;
3667 case MESA_SHADER_FRAGMENT:
3668 handle_fs_outputs_post(ctx);
3669 break;
3670 case MESA_SHADER_GEOMETRY:
3671 emit_gs_epilogue(ctx);
3672 break;
3673 case MESA_SHADER_TESS_CTRL:
3674 handle_tcs_outputs_post(ctx);
3675 break;
3676 case MESA_SHADER_TESS_EVAL:
3677 if (ctx->args->options->key.vs_common_out.as_es)
3678 handle_es_outputs_post(ctx, &ctx->args->shader_info->tes.es_info);
3679 else if (ctx->args->options->key.vs_common_out.as_ngg)
3680 handle_ngg_outputs_post_1(ctx);
3681 else
3682 handle_vs_outputs_post(ctx, ctx->args->options->key.vs_common_out.export_prim_id,
3683 ctx->args->options->key.vs_common_out.export_clip_dists,
3684 &ctx->args->shader_info->tes.outinfo);
3685 break;
3686 default:
3687 break;
3688 }
3689 }
3690
3691 static void ac_llvm_finalize_module(struct radv_shader_context *ctx,
3692 LLVMPassManagerRef passmgr,
3693 const struct radv_nir_compiler_options *options)
3694 {
3695 LLVMRunPassManager(passmgr, ctx->ac.module);
3696 LLVMDisposeBuilder(ctx->ac.builder);
3697
3698 ac_llvm_context_dispose(&ctx->ac);
3699 }
3700
3701 static void
3702 ac_nir_eliminate_const_vs_outputs(struct radv_shader_context *ctx)
3703 {
3704 struct radv_vs_output_info *outinfo;
3705
3706 switch (ctx->stage) {
3707 case MESA_SHADER_FRAGMENT:
3708 case MESA_SHADER_COMPUTE:
3709 case MESA_SHADER_TESS_CTRL:
3710 case MESA_SHADER_GEOMETRY:
3711 return;
3712 case MESA_SHADER_VERTEX:
3713 if (ctx->args->options->key.vs_common_out.as_ls ||
3714 ctx->args->options->key.vs_common_out.as_es)
3715 return;
3716 outinfo = &ctx->args->shader_info->vs.outinfo;
3717 break;
3718 case MESA_SHADER_TESS_EVAL:
3719 if (ctx->args->options->key.vs_common_out.as_es)
3720 return;
3721 outinfo = &ctx->args->shader_info->tes.outinfo;
3722 break;
3723 default:
3724 unreachable("Unhandled shader type");
3725 }
3726
3727 ac_optimize_vs_outputs(&ctx->ac,
3728 ctx->main_function,
3729 outinfo->vs_output_param_offset,
3730 VARYING_SLOT_MAX, 0,
3731 &outinfo->param_exports);
3732 }
3733
3734 static void
3735 ac_setup_rings(struct radv_shader_context *ctx)
3736 {
3737 if (ctx->args->options->chip_class <= GFX8 &&
3738 (ctx->stage == MESA_SHADER_GEOMETRY ||
3739 ctx->args->options->key.vs_common_out.as_es)) {
3740 unsigned ring = ctx->stage == MESA_SHADER_GEOMETRY ? RING_ESGS_GS
3741 : RING_ESGS_VS;
3742 LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, ring, false);
3743
3744 ctx->esgs_ring = ac_build_load_to_sgpr(&ctx->ac,
3745 ctx->ring_offsets,
3746 offset);
3747 }
3748
3749 if (ctx->args->is_gs_copy_shader) {
3750 ctx->gsvs_ring[0] =
3751 ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets,
3752 LLVMConstInt(ctx->ac.i32,
3753 RING_GSVS_VS, false));
3754 }
3755
3756 if (ctx->stage == MESA_SHADER_GEOMETRY) {
3757 /* The conceptual layout of the GSVS ring is
3758 	 *   v0c0 .. vLc0 v0c1 .. vLc1 ..
3759 * but the real memory layout is swizzled across
3760 * threads:
3761 * t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
3762 * t16v0c0 ..
3763 * Override the buffer descriptor accordingly.
3764 */
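		/* Illustrative numbers, not taken from the code below: with
		 * num_components == 4 and vertices_out == 3, the swizzled
		 * stride is 4 * 4 * 3 = 48 bytes per thread, and each
		 * stream's base address is advanced by the previous stream's
		 * stride * wave_size bytes.
		 */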
3765 LLVMTypeRef v2i64 = LLVMVectorType(ctx->ac.i64, 2);
3766 uint64_t stream_offset = 0;
3767 unsigned num_records = ctx->ac.wave_size;
3768 LLVMValueRef base_ring;
3769
3770 base_ring =
3771 ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets,
3772 LLVMConstInt(ctx->ac.i32,
3773 RING_GSVS_GS, false));
3774
3775 for (unsigned stream = 0; stream < 4; stream++) {
3776 unsigned num_components, stride;
3777 LLVMValueRef ring, tmp;
3778
3779 num_components =
3780 ctx->args->shader_info->gs.num_stream_output_components[stream];
3781
3782 if (!num_components)
3783 continue;
3784
3785 stride = 4 * num_components * ctx->shader->info.gs.vertices_out;
3786
3787 /* Limit on the stride field for <= GFX7. */
3788 assert(stride < (1 << 14));
3789
3790 ring = LLVMBuildBitCast(ctx->ac.builder,
3791 base_ring, v2i64, "");
3792 tmp = LLVMBuildExtractElement(ctx->ac.builder,
3793 ring, ctx->ac.i32_0, "");
3794 tmp = LLVMBuildAdd(ctx->ac.builder, tmp,
3795 LLVMConstInt(ctx->ac.i64,
3796 stream_offset, 0), "");
3797 ring = LLVMBuildInsertElement(ctx->ac.builder,
3798 ring, tmp, ctx->ac.i32_0, "");
3799
3800 stream_offset += stride * ctx->ac.wave_size;
3801
3802 ring = LLVMBuildBitCast(ctx->ac.builder, ring,
3803 ctx->ac.v4i32, "");
3804
3805 tmp = LLVMBuildExtractElement(ctx->ac.builder, ring,
3806 ctx->ac.i32_1, "");
3807 tmp = LLVMBuildOr(ctx->ac.builder, tmp,
3808 LLVMConstInt(ctx->ac.i32,
3809 S_008F04_STRIDE(stride), false), "");
3810 ring = LLVMBuildInsertElement(ctx->ac.builder, ring, tmp,
3811 ctx->ac.i32_1, "");
3812
3813 ring = LLVMBuildInsertElement(ctx->ac.builder, ring,
3814 LLVMConstInt(ctx->ac.i32,
3815 num_records, false),
3816 LLVMConstInt(ctx->ac.i32, 2, false), "");
3817
3818 ctx->gsvs_ring[stream] = ring;
3819 }
3820 }
3821
3822 if (ctx->stage == MESA_SHADER_TESS_CTRL ||
3823 ctx->stage == MESA_SHADER_TESS_EVAL) {
3824 ctx->hs_ring_tess_offchip = ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->ac.i32, RING_HS_TESS_OFFCHIP, false));
3825 ctx->hs_ring_tess_factor = ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->ac.i32, RING_HS_TESS_FACTOR, false));
3826 }
3827 }
3828
3829 unsigned
3830 radv_nir_get_max_workgroup_size(enum chip_class chip_class,
3831 gl_shader_stage stage,
3832 const struct nir_shader *nir)
3833 {
3834 const unsigned backup_sizes[] = {chip_class >= GFX9 ? 128 : 64, 1, 1};
3835 unsigned sizes[3];
3836 for (unsigned i = 0; i < 3; i++)
3837 sizes[i] = nir ? nir->info.cs.local_size[i] : backup_sizes[i];
3838 return radv_get_max_workgroup_size(chip_class, stage, sizes);
3839 }
3840
3841 /* Work around the HW not emitting the LS/HS input VGPRs when there are no HS threads. */
3842 static void ac_nir_fixup_ls_hs_input_vgprs(struct radv_shader_context *ctx)
3843 {
3844 LLVMValueRef count =
3845 ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->merged_wave_info), 8, 8);
3846 LLVMValueRef hs_empty = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, count,
3847 ctx->ac.i32_0, "");
3848 ctx->abi.instance_id = LLVMBuildSelect(ctx->ac.builder, hs_empty,
3849 ac_get_arg(&ctx->ac, ctx->args->rel_auto_id),
3850 ctx->abi.instance_id, "");
3851 ctx->rel_auto_id = LLVMBuildSelect(ctx->ac.builder, hs_empty,
3852 ac_get_arg(&ctx->ac, ctx->args->ac.tcs_rel_ids),
3853 ctx->rel_auto_id,
3854 "");
3855 ctx->abi.vertex_id = LLVMBuildSelect(ctx->ac.builder, hs_empty,
3856 ac_get_arg(&ctx->ac, ctx->args->ac.tcs_patch_id),
3857 ctx->abi.vertex_id, "");
3858 }
3859
3860 static void prepare_gs_input_vgprs(struct radv_shader_context *ctx, bool merged)
3861 {
3862 if (merged) {
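		/* The six ES vertex offsets arrive packed two per SGPR,
		 * 16 bits each; e.g. offset 3 is read from bits 16..31 of
		 * gs_vtx_offset[2] (an illustrative reading of the unpack
		 * below).
		 */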
3863 for(int i = 5; i >= 0; --i) {
3864 ctx->gs_vtx_offset[i] =
3865 ac_unpack_param(&ctx->ac,
3866 ac_get_arg(&ctx->ac, ctx->args->gs_vtx_offset[i & ~1]),
3867 (i & 1) * 16, 16);
3868 }
3869
3870 ctx->gs_wave_id = ac_unpack_param(&ctx->ac,
3871 ac_get_arg(&ctx->ac, ctx->args->merged_wave_info),
3872 16, 8);
3873 } else {
3874 for (int i = 0; i < 6; i++)
3875 ctx->gs_vtx_offset[i] = ac_get_arg(&ctx->ac, ctx->args->gs_vtx_offset[i]);
3876 ctx->gs_wave_id = ac_get_arg(&ctx->ac, ctx->args->gs_wave_id);
3877 }
3878 }
3879
3880 /* Ensure that the esgs ring is declared.
3881 *
3882 * We declare it with 64KB alignment as a hint that the
3883 * pointer value will always be 0.
3884 */
3885 static void declare_esgs_ring(struct radv_shader_context *ctx)
3886 {
3887 if (ctx->esgs_ring)
3888 return;
3889
3890 assert(!LLVMGetNamedGlobal(ctx->ac.module, "esgs_ring"));
3891
3892 ctx->esgs_ring = LLVMAddGlobalInAddressSpace(
3893 ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0),
3894 "esgs_ring",
3895 AC_ADDR_SPACE_LDS);
3896 LLVMSetLinkage(ctx->esgs_ring, LLVMExternalLinkage);
3897 LLVMSetAlignment(ctx->esgs_ring, 64 * 1024);
3898 }
3899
3900 static
3901 LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
3902 struct nir_shader *const *shaders,
3903 int shader_count,
3904 const struct radv_shader_args *args)
3905 {
3906 struct radv_shader_context ctx = {0};
3907 ctx.args = args;
3908
3909 enum ac_float_mode float_mode = AC_FLOAT_MODE_DEFAULT;
3910
3911 if (args->shader_info->float_controls_mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32) {
3912 float_mode = AC_FLOAT_MODE_DENORM_FLUSH_TO_ZERO;
3913 }
3914
3915 ac_llvm_context_init(&ctx.ac, ac_llvm, args->options->chip_class,
3916 args->options->family, float_mode,
3917 args->shader_info->wave_size,
3918 args->shader_info->ballot_bit_size);
3919 ctx.context = ctx.ac.context;
3920
3921 ctx.max_workgroup_size = 0;
3922 for (int i = 0; i < shader_count; ++i) {
3923 ctx.max_workgroup_size = MAX2(ctx.max_workgroup_size,
3924 radv_nir_get_max_workgroup_size(args->options->chip_class,
3925 shaders[i]->info.stage,
3926 shaders[i]));
3927 }
3928
3929 if (ctx.ac.chip_class >= GFX10) {
3930 if (is_pre_gs_stage(shaders[0]->info.stage) &&
3931 args->options->key.vs_common_out.as_ngg) {
3932 ctx.max_workgroup_size = 128;
3933 }
3934 }
3935
3936 create_function(&ctx, shaders[shader_count - 1]->info.stage, shader_count >= 2);
3937
3938 ctx.abi.inputs = &ctx.inputs[0];
3939 ctx.abi.emit_outputs = handle_shader_outputs_post;
3940 ctx.abi.emit_vertex_with_counter = visit_emit_vertex_with_counter;
3941 ctx.abi.load_ubo = radv_load_ubo;
3942 ctx.abi.load_ssbo = radv_load_ssbo;
3943 ctx.abi.load_sampler_desc = radv_get_sampler_desc;
3944 ctx.abi.load_resource = radv_load_resource;
3945 ctx.abi.clamp_shadow_reference = false;
3946 ctx.abi.robust_buffer_access = args->options->robust_buffer_access;
3947
3948 bool is_ngg = is_pre_gs_stage(shaders[0]->info.stage) && args->options->key.vs_common_out.as_ngg;
3949 if (shader_count >= 2 || is_ngg)
3950 ac_init_exec_full_mask(&ctx.ac);
3951
3952 if (args->ac.vertex_id.used)
3953 ctx.abi.vertex_id = ac_get_arg(&ctx.ac, args->ac.vertex_id);
3954 if (args->rel_auto_id.used)
3955 ctx.rel_auto_id = ac_get_arg(&ctx.ac, args->rel_auto_id);
3956 if (args->ac.instance_id.used)
3957 ctx.abi.instance_id = ac_get_arg(&ctx.ac, args->ac.instance_id);
3958
3959 if (args->options->has_ls_vgpr_init_bug &&
3960 shaders[shader_count - 1]->info.stage == MESA_SHADER_TESS_CTRL)
3961 ac_nir_fixup_ls_hs_input_vgprs(&ctx);
3962
3963 if (is_ngg) {
3964 /* Declare scratch space base for streamout and vertex
3965 * compaction. Whether space is actually allocated is
3966 * determined during linking / PM4 creation.
3967 *
3968 * Add an extra dword per vertex to ensure an odd stride, which
3969 * avoids bank conflicts for SoA accesses.
3970 */
3971 if (!args->options->key.vs_common_out.as_ngg_passthrough)
3972 declare_esgs_ring(&ctx);
3973
3974 /* This is really only needed when streamout and / or vertex
3975 * compaction is enabled.
3976 */
3977 if (args->shader_info->so.num_outputs) {
3978 LLVMTypeRef asi32 = LLVMArrayType(ctx.ac.i32, 8);
3979 ctx.gs_ngg_scratch = LLVMAddGlobalInAddressSpace(ctx.ac.module,
3980 asi32, "ngg_scratch", AC_ADDR_SPACE_LDS);
3981 LLVMSetInitializer(ctx.gs_ngg_scratch, LLVMGetUndef(asi32));
3982 LLVMSetAlignment(ctx.gs_ngg_scratch, 4);
3983 }
3984 }
3985
3986 for(int i = 0; i < shader_count; ++i) {
3987 ctx.stage = shaders[i]->info.stage;
3988 ctx.shader = shaders[i];
3989 ctx.output_mask = 0;
3990
3991 if (shaders[i]->info.stage == MESA_SHADER_GEOMETRY) {
3992 			for (int j = 0; j < 4; j++) {
3993 				ctx.gs_next_vertex[j] =
3994 					ac_build_alloca(&ctx.ac, ctx.ac.i32, "");
3995 			}
3996 if (args->options->key.vs_common_out.as_ngg) {
3997 				for (unsigned j = 0; j < 4; ++j) {
3998 					ctx.gs_curprim_verts[j] =
3999 						ac_build_alloca(&ctx.ac, ctx.ac.i32, "");
4000 					ctx.gs_generated_prims[j] =
4001 						ac_build_alloca(&ctx.ac, ctx.ac.i32, "");
4002 				}
4003
4004 unsigned scratch_size = 8;
4005 if (args->shader_info->so.num_outputs)
4006 scratch_size = 44;
4007
4008 LLVMTypeRef ai32 = LLVMArrayType(ctx.ac.i32, scratch_size);
4009 ctx.gs_ngg_scratch =
4010 LLVMAddGlobalInAddressSpace(ctx.ac.module,
4011 ai32, "ngg_scratch", AC_ADDR_SPACE_LDS);
4012 LLVMSetInitializer(ctx.gs_ngg_scratch, LLVMGetUndef(ai32));
4013 LLVMSetAlignment(ctx.gs_ngg_scratch, 4);
4014
4015 ctx.gs_ngg_emit = LLVMAddGlobalInAddressSpace(ctx.ac.module,
4016 LLVMArrayType(ctx.ac.i32, 0), "ngg_emit", AC_ADDR_SPACE_LDS);
4017 LLVMSetLinkage(ctx.gs_ngg_emit, LLVMExternalLinkage);
4018 LLVMSetAlignment(ctx.gs_ngg_emit, 4);
4019 }
4020
4021 ctx.abi.load_inputs = load_gs_input;
4022 ctx.abi.emit_primitive = visit_end_primitive;
4023 } else if (shaders[i]->info.stage == MESA_SHADER_TESS_CTRL) {
4024 ctx.abi.load_tess_varyings = load_tcs_varyings;
4025 ctx.abi.load_patch_vertices_in = load_patch_vertices_in;
4026 ctx.abi.store_tcs_outputs = store_tcs_output;
4027 if (shader_count == 1)
4028 ctx.tcs_num_inputs = args->options->key.tcs.num_inputs;
4029 else
4030 ctx.tcs_num_inputs = util_last_bit64(args->shader_info->vs.ls_outputs_written);
4031 unsigned tcs_num_outputs = util_last_bit64(ctx.args->shader_info->tcs.outputs_written);
4032 unsigned tcs_num_patch_outputs = util_last_bit64(ctx.args->shader_info->tcs.patch_outputs_written);
4033 ctx.tcs_num_patches =
4034 get_tcs_num_patches(
4035 ctx.args->options->key.tcs.input_vertices,
4036 ctx.shader->info.tess.tcs_vertices_out,
4037 ctx.tcs_num_inputs,
4038 tcs_num_outputs,
4039 tcs_num_patch_outputs,
4040 ctx.args->options->tess_offchip_block_dw_size,
4041 ctx.args->options->chip_class,
4042 ctx.args->options->family);
4043 } else if (shaders[i]->info.stage == MESA_SHADER_TESS_EVAL) {
4044 ctx.abi.load_tess_varyings = load_tes_input;
4045 ctx.abi.load_tess_coord = load_tess_coord;
4046 ctx.abi.load_patch_vertices_in = load_patch_vertices_in;
4047 ctx.tcs_num_patches = args->options->key.tes.num_patches;
4048 } else if (shaders[i]->info.stage == MESA_SHADER_VERTEX) {
4049 ctx.abi.load_base_vertex = radv_load_base_vertex;
4050 } else if (shaders[i]->info.stage == MESA_SHADER_FRAGMENT) {
4051 ctx.abi.load_sample_position = load_sample_position;
4052 ctx.abi.load_sample_mask_in = load_sample_mask_in;
4053 }
4054
4055 if (shaders[i]->info.stage == MESA_SHADER_VERTEX &&
4056 args->options->key.vs_common_out.as_ngg &&
4057 args->options->key.vs_common_out.export_prim_id) {
4058 declare_esgs_ring(&ctx);
4059 }
4060
4061 bool nested_barrier = false;
4062
4063 if (i) {
4064 if (shaders[i]->info.stage == MESA_SHADER_GEOMETRY &&
4065 args->options->key.vs_common_out.as_ngg) {
4066 gfx10_ngg_gs_emit_prologue(&ctx);
4067 nested_barrier = false;
4068 } else {
4069 nested_barrier = true;
4070 }
4071 }
4072
4073 if (nested_barrier) {
4074 /* Execute a barrier before the second shader in
4075 * a merged shader.
4076 *
4077 * Execute the barrier inside the conditional block,
4078 * so that empty waves can jump directly to s_endpgm,
4079 * which will also signal the barrier.
4080 *
4081 * This is possible in gfx9, because an empty wave
4082 * for the second shader does not participate in
4083 * the epilogue. With NGG, empty waves may still
4084 * be required to export data (e.g. GS output vertices),
4085 * so we cannot let them exit early.
4086 *
4087 * If the shader is TCS and the TCS epilog is present
4088 * and contains a barrier, it will wait there and then
4089 * reach s_endpgm.
4090 */
4091 ac_emit_barrier(&ctx.ac, ctx.stage);
4092 }
4093
4094 nir_foreach_shader_out_variable(variable, shaders[i])
4095 scan_shader_output_decl(&ctx, variable, shaders[i], shaders[i]->info.stage);
4096
4097 ac_setup_rings(&ctx);
4098
4099 LLVMBasicBlockRef merge_block = NULL;
4100 if (shader_count >= 2 || is_ngg) {
4101 LLVMValueRef fn = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx.ac.builder));
4102 LLVMBasicBlockRef then_block = LLVMAppendBasicBlockInContext(ctx.ac.context, fn, "");
4103 merge_block = LLVMAppendBasicBlockInContext(ctx.ac.context, fn, "");
4104
4105 LLVMValueRef count =
4106 ac_unpack_param(&ctx.ac,
4107 ac_get_arg(&ctx.ac, args->merged_wave_info),
4108 8 * i, 8);
4109 LLVMValueRef thread_id = ac_get_thread_id(&ctx.ac);
4110 LLVMValueRef cond = LLVMBuildICmp(ctx.ac.builder, LLVMIntULT,
4111 thread_id, count, "");
4112 LLVMBuildCondBr(ctx.ac.builder, cond, then_block, merge_block);
4113
4114 LLVMPositionBuilderAtEnd(ctx.ac.builder, then_block);
4115 }
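		/* merged_wave_info packs one byte of live-lane count per
		 * merged stage; 8 * i above selects the byte for shader i
		 * (byte 1, for instance, is the HS count tested by
		 * ac_nir_fixup_ls_hs_input_vgprs).
		 */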
4116
4117 if (shaders[i]->info.stage == MESA_SHADER_FRAGMENT)
4118 prepare_interp_optimize(&ctx, shaders[i]);
4119 else if(shaders[i]->info.stage == MESA_SHADER_VERTEX)
4120 handle_vs_inputs(&ctx, shaders[i]);
4121 else if(shaders[i]->info.stage == MESA_SHADER_GEOMETRY)
4122 prepare_gs_input_vgprs(&ctx, shader_count >= 2);
4123
4124 ac_nir_translate(&ctx.ac, &ctx.abi, &args->ac, shaders[i]);
4125
4126 if (shader_count >= 2 || is_ngg) {
4127 LLVMBuildBr(ctx.ac.builder, merge_block);
4128 LLVMPositionBuilderAtEnd(ctx.ac.builder, merge_block);
4129 }
4130
4131 /* This needs to be outside the if wrapping the shader body, as sometimes
4132 * the HW generates waves with 0 es/vs threads. */
4133 if (is_pre_gs_stage(shaders[i]->info.stage) &&
4134 args->options->key.vs_common_out.as_ngg &&
4135 i == shader_count - 1) {
4136 handle_ngg_outputs_post_2(&ctx);
4137 } else if (shaders[i]->info.stage == MESA_SHADER_GEOMETRY &&
4138 args->options->key.vs_common_out.as_ngg) {
4139 gfx10_ngg_gs_emit_epilogue_2(&ctx);
4140 }
4141
4142 if (shaders[i]->info.stage == MESA_SHADER_TESS_CTRL) {
4143 unsigned tcs_num_outputs = util_last_bit64(ctx.args->shader_info->tcs.outputs_written);
4144 unsigned tcs_num_patch_outputs = util_last_bit64(ctx.args->shader_info->tcs.patch_outputs_written);
4145 args->shader_info->tcs.num_patches = ctx.tcs_num_patches;
4146 args->shader_info->tcs.num_lds_blocks =
4147 calculate_tess_lds_size(
4148 ctx.args->options->chip_class,
4149 ctx.args->options->key.tcs.input_vertices,
4150 ctx.shader->info.tess.tcs_vertices_out,
4151 ctx.tcs_num_inputs,
4152 ctx.tcs_num_patches,
4153 tcs_num_outputs,
4154 tcs_num_patch_outputs);
4155 }
4156 }
4157
4158 LLVMBuildRetVoid(ctx.ac.builder);
4159
4160 if (args->options->dump_preoptir) {
4161 fprintf(stderr, "%s LLVM IR:\n\n",
4162 radv_get_shader_name(args->shader_info,
4163 shaders[shader_count - 1]->info.stage));
4164 ac_dump_module(ctx.ac.module);
4165 fprintf(stderr, "\n");
4166 }
4167
4168 ac_llvm_finalize_module(&ctx, ac_llvm->passmgr, args->options);
4169
4170 if (shader_count == 1)
4171 ac_nir_eliminate_const_vs_outputs(&ctx);
4172
4173 if (args->options->dump_shader) {
4174 args->shader_info->private_mem_vgprs =
4175 ac_count_scratch_private_memory(ctx.main_function);
4176 }
4177
4178 return ctx.ac.module;
4179 }
4180
4181 static void ac_diagnostic_handler(LLVMDiagnosticInfoRef di, void *context)
4182 {
4183 unsigned *retval = (unsigned *)context;
4184 LLVMDiagnosticSeverity severity = LLVMGetDiagInfoSeverity(di);
4185 char *description = LLVMGetDiagInfoDescription(di);
4186
4187 if (severity == LLVMDSError) {
4188 *retval = 1;
4189 fprintf(stderr, "LLVM triggered Diagnostic Handler: %s\n",
4190 description);
4191 }
4192
4193 LLVMDisposeMessage(description);
4194 }
4195
4196 static unsigned radv_llvm_compile(LLVMModuleRef M,
4197 char **pelf_buffer, size_t *pelf_size,
4198 struct ac_llvm_compiler *ac_llvm)
4199 {
4200 unsigned retval = 0;
4201 LLVMContextRef llvm_ctx;
4202
4203 	/* Set up the diagnostic handler. */
4204 llvm_ctx = LLVMGetModuleContext(M);
4205
4206 LLVMContextSetDiagnosticHandler(llvm_ctx, ac_diagnostic_handler,
4207 &retval);
4208
4209 	/* Compile the IR. */
4210 if (!radv_compile_to_elf(ac_llvm, M, pelf_buffer, pelf_size))
4211 retval = 1;
4212 return retval;
4213 }
4214
4215 static void ac_compile_llvm_module(struct ac_llvm_compiler *ac_llvm,
4216 LLVMModuleRef llvm_module,
4217 struct radv_shader_binary **rbinary,
4218 gl_shader_stage stage,
4219 const char *name,
4220 const struct radv_nir_compiler_options *options)
4221 {
4222 char *elf_buffer = NULL;
4223 size_t elf_size = 0;
4224 char *llvm_ir_string = NULL;
4225
4226 if (options->dump_shader) {
4227 fprintf(stderr, "%s LLVM IR:\n\n", name);
4228 ac_dump_module(llvm_module);
4229 fprintf(stderr, "\n");
4230 }
4231
4232 if (options->record_ir) {
4233 char *llvm_ir = LLVMPrintModuleToString(llvm_module);
4234 llvm_ir_string = strdup(llvm_ir);
4235 LLVMDisposeMessage(llvm_ir);
4236 }
4237
4238 int v = radv_llvm_compile(llvm_module, &elf_buffer, &elf_size, ac_llvm);
4239 if (v) {
4240 fprintf(stderr, "compile failed\n");
4241 }
4242
4243 LLVMContextRef ctx = LLVMGetModuleContext(llvm_module);
4244 LLVMDisposeModule(llvm_module);
4245 LLVMContextDispose(ctx);
4246
4247 size_t llvm_ir_size = llvm_ir_string ? strlen(llvm_ir_string) : 0;
4248 size_t alloc_size = sizeof(struct radv_shader_binary_rtld) + elf_size + llvm_ir_size + 1;
4249 struct radv_shader_binary_rtld *rbin = calloc(1, alloc_size);
4250 memcpy(rbin->data, elf_buffer, elf_size);
4251 if (llvm_ir_string)
4252 memcpy(rbin->data + elf_size, llvm_ir_string, llvm_ir_size + 1);
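	/* Resulting layout: [radv_shader_binary_rtld header | ELF image |
	 * optional NUL-terminated LLVM IR]; the sizes recorded below let
	 * consumers slice rbin->data back apart.
	 */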
4253
4254 rbin->base.type = RADV_BINARY_TYPE_RTLD;
4255 rbin->base.stage = stage;
4256 rbin->base.total_size = alloc_size;
4257 rbin->elf_size = elf_size;
4258 rbin->llvm_ir_size = llvm_ir_size;
4259 *rbinary = &rbin->base;
4260
4261 free(llvm_ir_string);
4262 free(elf_buffer);
4263 }
4264
4265 static void
4266 radv_compile_nir_shader(struct ac_llvm_compiler *ac_llvm,
4267 struct radv_shader_binary **rbinary,
4268 const struct radv_shader_args *args,
4269 struct nir_shader *const *nir,
4270 int nir_count)
4271 {
4272
4273 LLVMModuleRef llvm_module;
4274
4275 llvm_module = ac_translate_nir_to_llvm(ac_llvm, nir, nir_count, args);
4276
4277 ac_compile_llvm_module(ac_llvm, llvm_module, rbinary,
4278 nir[nir_count - 1]->info.stage,
4279 radv_get_shader_name(args->shader_info,
4280 nir[nir_count - 1]->info.stage),
4281 args->options);
4282
4283 /* Determine the ES type (VS or TES) for the GS on GFX9. */
4284 if (args->options->chip_class >= GFX9) {
4285 if (nir_count == 2 &&
4286 nir[1]->info.stage == MESA_SHADER_GEOMETRY) {
4287 args->shader_info->gs.es_type = nir[0]->info.stage;
4288 }
4289 }
4290 }
4291
4292 static void
4293 ac_gs_copy_shader_emit(struct radv_shader_context *ctx)
4294 {
4295 LLVMValueRef vtx_offset =
4296 LLVMBuildMul(ctx->ac.builder, ac_get_arg(&ctx->ac, ctx->args->ac.vertex_id),
4297 LLVMConstInt(ctx->ac.i32, 4, false), "");
4298 LLVMValueRef stream_id;
4299
4300 /* Fetch the vertex stream ID. */
4301 if (!ctx->args->options->use_ngg_streamout &&
4302 ctx->args->shader_info->so.num_outputs) {
4303 stream_id =
4304 ac_unpack_param(&ctx->ac,
4305 ac_get_arg(&ctx->ac,
4306 ctx->args->streamout_config),
4307 24, 2);
4308 } else {
4309 stream_id = ctx->ac.i32_0;
4310 }
4311
4312 LLVMBasicBlockRef end_bb;
4313 LLVMValueRef switch_inst;
4314
4315 end_bb = LLVMAppendBasicBlockInContext(ctx->ac.context,
4316 ctx->main_function, "end");
4317 switch_inst = LLVMBuildSwitch(ctx->ac.builder, stream_id, end_bb, 4);
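	/* The copy shader is structured as a switch over the vertex's
	 * stream ID: when streamout is active each stream's case runs
	 * radv_emit_streamout, but only stream 0 also feeds the
	 * position/parameter exports.
	 */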
4318
4319 for (unsigned stream = 0; stream < 4; stream++) {
4320 unsigned num_components =
4321 ctx->args->shader_info->gs.num_stream_output_components[stream];
4322 LLVMBasicBlockRef bb;
4323 unsigned offset;
4324
4325 if (stream > 0 && !num_components)
4326 continue;
4327
4328 if (stream > 0 && !ctx->args->shader_info->so.num_outputs)
4329 continue;
4330
4331 bb = LLVMInsertBasicBlockInContext(ctx->ac.context, end_bb, "out");
4332 LLVMAddCase(switch_inst, LLVMConstInt(ctx->ac.i32, stream, 0), bb);
4333 LLVMPositionBuilderAtEnd(ctx->ac.builder, bb);
4334
4335 offset = 0;
4336 for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
4337 unsigned output_usage_mask =
4338 ctx->args->shader_info->gs.output_usage_mask[i];
4339 unsigned output_stream =
4340 ctx->args->shader_info->gs.output_streams[i];
4341 int length = util_last_bit(output_usage_mask);
4342
4343 if (!(ctx->output_mask & (1ull << i)) ||
4344 output_stream != stream)
4345 continue;
4346
4347 for (unsigned j = 0; j < length; j++) {
4348 LLVMValueRef value, soffset;
4349
4350 if (!(output_usage_mask & (1 << j)))
4351 continue;
4352
4353 soffset = LLVMConstInt(ctx->ac.i32,
4354 offset *
4355 ctx->shader->info.gs.vertices_out * 16 * 4, false);
4356
4357 offset++;
4358
4359 value = ac_build_buffer_load(&ctx->ac,
4360 ctx->gsvs_ring[0],
4361 1, ctx->ac.i32_0,
4362 vtx_offset, soffset,
4363 0, ac_glc | ac_slc, true, false);
4364
4365 LLVMTypeRef type = LLVMGetAllocatedType(ctx->abi.outputs[ac_llvm_reg_index_soa(i, j)]);
4366 if (ac_get_type_size(type) == 2) {
4367 value = LLVMBuildBitCast(ctx->ac.builder, value, ctx->ac.i32, "");
4368 value = LLVMBuildTrunc(ctx->ac.builder, value, ctx->ac.i16, "");
4369 }
4370
4371 LLVMBuildStore(ctx->ac.builder,
4372 ac_to_float(&ctx->ac, value), ctx->abi.outputs[ac_llvm_reg_index_soa(i, j)]);
4373 }
4374 }
4375
4376 if (!ctx->args->options->use_ngg_streamout &&
4377 ctx->args->shader_info->so.num_outputs)
4378 radv_emit_streamout(ctx, stream);
4379
4380 if (stream == 0) {
4381 handle_vs_outputs_post(ctx, false, true,
4382 &ctx->args->shader_info->vs.outinfo);
4383 }
4384
4385 LLVMBuildBr(ctx->ac.builder, end_bb);
4386 }
4387
4388 LLVMPositionBuilderAtEnd(ctx->ac.builder, end_bb);
4389 }
4390
4391 static void
4392 radv_compile_gs_copy_shader(struct ac_llvm_compiler *ac_llvm,
4393 struct nir_shader *geom_shader,
4394 struct radv_shader_binary **rbinary,
4395 const struct radv_shader_args *args)
4396 {
4397 struct radv_shader_context ctx = {0};
4398 ctx.args = args;
4399
4400 assert(args->is_gs_copy_shader);
4401
4402 ac_llvm_context_init(&ctx.ac, ac_llvm, args->options->chip_class,
4403 args->options->family, AC_FLOAT_MODE_DEFAULT, 64, 64);
4404 ctx.context = ctx.ac.context;
4405
4406 ctx.stage = MESA_SHADER_VERTEX;
4407 ctx.shader = geom_shader;
4408
4409 create_function(&ctx, MESA_SHADER_VERTEX, false);
4410
4411 ac_setup_rings(&ctx);
4412
4413 nir_foreach_shader_out_variable(variable, geom_shader) {
4414 scan_shader_output_decl(&ctx, variable, geom_shader, MESA_SHADER_VERTEX);
4415 ac_handle_shader_output_decl(&ctx.ac, &ctx.abi, geom_shader,
4416 variable, MESA_SHADER_VERTEX);
4417 }
4418
4419 ac_gs_copy_shader_emit(&ctx);
4420
4421 LLVMBuildRetVoid(ctx.ac.builder);
4422
4423 ac_llvm_finalize_module(&ctx, ac_llvm->passmgr, args->options);
4424
4425 ac_compile_llvm_module(ac_llvm, ctx.ac.module, rbinary,
4426 MESA_SHADER_VERTEX, "GS Copy Shader", args->options);
4427 (*rbinary)->is_gs_copy_shader = true;
4428
4429 }
4430
4431 void
4432 llvm_compile_shader(struct radv_device *device,
4433 unsigned shader_count,
4434 struct nir_shader *const *shaders,
4435 struct radv_shader_binary **binary,
4436 struct radv_shader_args *args)
4437 {
4438 enum ac_target_machine_options tm_options = 0;
4439 struct ac_llvm_compiler ac_llvm;
4440 bool thread_compiler;
4441
4442 tm_options |= AC_TM_SUPPORTS_SPILL;
4443 if (args->options->check_ir)
4444 tm_options |= AC_TM_CHECK_IR;
4445
4446 thread_compiler = !(device->instance->debug_flags & RADV_DEBUG_NOTHREADLLVM);
4447
4448 radv_init_llvm_compiler(&ac_llvm, thread_compiler,
4449 args->options->family, tm_options,
4450 args->shader_info->wave_size);
4451
4452 if (args->is_gs_copy_shader) {
4453 radv_compile_gs_copy_shader(&ac_llvm, *shaders, binary, args);
4454 } else {
4455 radv_compile_nir_shader(&ac_llvm, binary, args,
4456 shaders, shader_count);
4457 }
4458
4459 radv_destroy_llvm_compiler(&ac_llvm, thread_compiler);
4460 }