radeonsi: move VS shader code into si_shader_llvm_vs.c
[mesa.git] / src / gallium / drivers / radeonsi / si_shader.c
1 /*
2 * Copyright 2012 Advanced Micro Devices, Inc.
3 * All Rights Reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
23 */
24
25 #include "util/u_memory.h"
26 #include "tgsi/tgsi_strings.h"
27 #include "tgsi/tgsi_from_mesa.h"
28
29 #include "ac_exp_param.h"
30 #include "ac_rtld.h"
31 #include "si_shader_internal.h"
32 #include "si_pipe.h"
33 #include "sid.h"
34
35 #include "compiler/nir/nir.h"
36 #include "compiler/nir/nir_serialize.h"
37
38 static const char scratch_rsrc_dword0_symbol[] =
39 "SCRATCH_RSRC_DWORD0";
40
41 static const char scratch_rsrc_dword1_symbol[] =
42 "SCRATCH_RSRC_DWORD1";
43
44 static void si_dump_shader_key(const struct si_shader *shader, FILE *f);
45
46 /** Whether the shader runs as a combination of multiple API shaders */
47 static bool is_multi_part_shader(struct si_shader_context *ctx)
48 {
49 if (ctx->screen->info.chip_class <= GFX8)
50 return false;
51
52 return ctx->shader->key.as_ls ||
53 ctx->shader->key.as_es ||
54 ctx->type == PIPE_SHADER_TESS_CTRL ||
55 ctx->type == PIPE_SHADER_GEOMETRY;
56 }
57
58 /** Whether the shader runs on a merged HW stage (LSHS or ESGS) */
59 bool si_is_merged_shader(struct si_shader_context *ctx)
60 {
61 return ctx->shader->key.as_ngg || is_multi_part_shader(ctx);
62 }
63
64 /**
65 * Returns a unique index for a per-patch semantic name and index. The index
66 * must be less than 32, so that a 32-bit bitmask of used inputs or outputs
67 * can be calculated.
68 */
69 unsigned si_shader_io_get_unique_index_patch(unsigned semantic_name, unsigned index)
70 {
71 switch (semantic_name) {
72 case TGSI_SEMANTIC_TESSOUTER:
73 return 0;
74 case TGSI_SEMANTIC_TESSINNER:
75 return 1;
76 case TGSI_SEMANTIC_PATCH:
77 assert(index < 30);
78 return 2 + index;
79
80 default:
81 assert(!"invalid semantic name");
82 return 0;
83 }
84 }
85
86 /**
87 * Returns a unique index for a semantic name and index. The index must be
88 * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
89 * calculated.
90 */
91 unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index,
92 unsigned is_varying)
93 {
94 switch (semantic_name) {
95 case TGSI_SEMANTIC_POSITION:
96 return 0;
97 case TGSI_SEMANTIC_GENERIC:
98 /* Since some shader stages use the the highest used IO index
99 * to determine the size to allocate for inputs/outputs
100 * (in LDS, tess and GS rings). GENERIC should be placed right
101 * after POSITION to make that size as small as possible.
102 */
103 if (index < SI_MAX_IO_GENERIC)
104 return 1 + index;
105
106 assert(!"invalid generic index");
107 return 0;
108 case TGSI_SEMANTIC_FOG:
109 return SI_MAX_IO_GENERIC + 1;
110 case TGSI_SEMANTIC_COLOR:
111 assert(index < 2);
112 return SI_MAX_IO_GENERIC + 2 + index;
113 case TGSI_SEMANTIC_BCOLOR:
114 assert(index < 2);
115 /* If it's a varying, COLOR and BCOLOR alias. */
116 if (is_varying)
117 return SI_MAX_IO_GENERIC + 2 + index;
118 else
119 return SI_MAX_IO_GENERIC + 4 + index;
120 case TGSI_SEMANTIC_TEXCOORD:
121 assert(index < 8);
122 return SI_MAX_IO_GENERIC + 6 + index;
123
124 /* These are rarely used between LS and HS or ES and GS. */
125 case TGSI_SEMANTIC_CLIPDIST:
126 assert(index < 2);
127 return SI_MAX_IO_GENERIC + 6 + 8 + index;
128 case TGSI_SEMANTIC_CLIPVERTEX:
129 return SI_MAX_IO_GENERIC + 6 + 8 + 2;
130 case TGSI_SEMANTIC_PSIZE:
131 return SI_MAX_IO_GENERIC + 6 + 8 + 3;
132
133 /* These can't be written by LS, HS, and ES. */
134 case TGSI_SEMANTIC_LAYER:
135 return SI_MAX_IO_GENERIC + 6 + 8 + 4;
136 case TGSI_SEMANTIC_VIEWPORT_INDEX:
137 return SI_MAX_IO_GENERIC + 6 + 8 + 5;
138 case TGSI_SEMANTIC_PRIMID:
139 STATIC_ASSERT(SI_MAX_IO_GENERIC + 6 + 8 + 6 <= 63);
140 return SI_MAX_IO_GENERIC + 6 + 8 + 6;
141 default:
142 fprintf(stderr, "invalid semantic name = %u\n", semantic_name);
143 assert(!"invalid semantic name");
144 return 0;
145 }
146 }
147
148 /**
149 * Get the value of a shader input parameter and extract a bitfield.
150 */
151 static LLVMValueRef unpack_llvm_param(struct si_shader_context *ctx,
152 LLVMValueRef value, unsigned rshift,
153 unsigned bitwidth)
154 {
155 if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind)
156 value = ac_to_integer(&ctx->ac, value);
157
158 if (rshift)
159 value = LLVMBuildLShr(ctx->ac.builder, value,
160 LLVMConstInt(ctx->ac.i32, rshift, 0), "");
161
162 if (rshift + bitwidth < 32) {
163 unsigned mask = (1 << bitwidth) - 1;
164 value = LLVMBuildAnd(ctx->ac.builder, value,
165 LLVMConstInt(ctx->ac.i32, mask, 0), "");
166 }
167
168 return value;
169 }
170
171 LLVMValueRef si_unpack_param(struct si_shader_context *ctx,
172 struct ac_arg param, unsigned rshift,
173 unsigned bitwidth)
174 {
175 LLVMValueRef value = ac_get_arg(&ctx->ac, param);
176
177 return unpack_llvm_param(ctx, value, rshift, bitwidth);
178 }
179
180 LLVMValueRef si_get_primitive_id(struct si_shader_context *ctx,
181 unsigned swizzle)
182 {
183 if (swizzle > 0)
184 return ctx->ac.i32_0;
185
186 switch (ctx->type) {
187 case PIPE_SHADER_VERTEX:
188 return ac_get_arg(&ctx->ac, ctx->vs_prim_id);
189 case PIPE_SHADER_TESS_CTRL:
190 return ac_get_arg(&ctx->ac, ctx->args.tcs_patch_id);
191 case PIPE_SHADER_TESS_EVAL:
192 return ac_get_arg(&ctx->ac, ctx->args.tes_patch_id);
193 case PIPE_SHADER_GEOMETRY:
194 return ac_get_arg(&ctx->ac, ctx->args.gs_prim_id);
195 default:
196 assert(0);
197 return ctx->ac.i32_0;
198 }
199 }
200
201 static LLVMValueRef get_block_size(struct ac_shader_abi *abi)
202 {
203 struct si_shader_context *ctx = si_shader_context_from_abi(abi);
204
205 LLVMValueRef values[3];
206 LLVMValueRef result;
207 unsigned i;
208 unsigned *properties = ctx->shader->selector->info.properties;
209
210 if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] != 0) {
211 unsigned sizes[3] = {
212 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
213 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
214 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
215 };
216
217 for (i = 0; i < 3; ++i)
218 values[i] = LLVMConstInt(ctx->ac.i32, sizes[i], 0);
219
220 result = ac_build_gather_values(&ctx->ac, values, 3);
221 } else {
222 result = ac_get_arg(&ctx->ac, ctx->block_size);
223 }
224
225 return result;
226 }
227
228 void si_declare_compute_memory(struct si_shader_context *ctx)
229 {
230 struct si_shader_selector *sel = ctx->shader->selector;
231 unsigned lds_size = sel->info.properties[TGSI_PROPERTY_CS_LOCAL_SIZE];
232
233 LLVMTypeRef i8p = LLVMPointerType(ctx->ac.i8, AC_ADDR_SPACE_LDS);
234 LLVMValueRef var;
235
236 assert(!ctx->ac.lds);
237
238 var = LLVMAddGlobalInAddressSpace(ctx->ac.module,
239 LLVMArrayType(ctx->ac.i8, lds_size),
240 "compute_lds",
241 AC_ADDR_SPACE_LDS);
242 LLVMSetAlignment(var, 64 * 1024);
243
244 ctx->ac.lds = LLVMBuildBitCast(ctx->ac.builder, var, i8p, "");
245 }
246
247 static void si_dump_streamout(struct pipe_stream_output_info *so)
248 {
249 unsigned i;
250
251 if (so->num_outputs)
252 fprintf(stderr, "STREAMOUT\n");
253
254 for (i = 0; i < so->num_outputs; i++) {
255 unsigned mask = ((1 << so->output[i].num_components) - 1) <<
256 so->output[i].start_component;
257 fprintf(stderr, " %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n",
258 i, so->output[i].output_buffer,
259 so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
260 so->output[i].register_index,
261 mask & 1 ? "x" : "",
262 mask & 2 ? "y" : "",
263 mask & 4 ? "z" : "",
264 mask & 8 ? "w" : "");
265 }
266 }
267
268 static void declare_streamout_params(struct si_shader_context *ctx,
269 struct pipe_stream_output_info *so)
270 {
271 if (ctx->screen->use_ngg_streamout) {
272 if (ctx->type == PIPE_SHADER_TESS_EVAL)
273 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
274 return;
275 }
276
277 /* Streamout SGPRs. */
278 if (so->num_outputs) {
279 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->streamout_config);
280 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->streamout_write_index);
281 } else if (ctx->type == PIPE_SHADER_TESS_EVAL) {
282 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
283 }
284
285 /* A streamout buffer offset is loaded if the stride is non-zero. */
286 for (int i = 0; i < 4; i++) {
287 if (!so->stride[i])
288 continue;
289
290 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->streamout_offset[i]);
291 }
292 }
293
294 static unsigned si_get_max_workgroup_size(const struct si_shader *shader)
295 {
296 switch (shader->selector->type) {
297 case PIPE_SHADER_VERTEX:
298 case PIPE_SHADER_TESS_EVAL:
299 return shader->key.as_ngg ? 128 : 0;
300
301 case PIPE_SHADER_TESS_CTRL:
302 /* Return this so that LLVM doesn't remove s_barrier
303 * instructions on chips where we use s_barrier. */
304 return shader->selector->screen->info.chip_class >= GFX7 ? 128 : 0;
305
306 case PIPE_SHADER_GEOMETRY:
307 return shader->selector->screen->info.chip_class >= GFX9 ? 128 : 0;
308
309 case PIPE_SHADER_COMPUTE:
310 break; /* see below */
311
312 default:
313 return 0;
314 }
315
316 const unsigned *properties = shader->selector->info.properties;
317 unsigned max_work_group_size =
318 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
319 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
320 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
321
322 if (!max_work_group_size) {
323 /* This is a variable group size compute shader,
324 * compile it for the maximum possible group size.
325 */
326 max_work_group_size = SI_MAX_VARIABLE_THREADS_PER_BLOCK;
327 }
328 return max_work_group_size;
329 }
330
331 static void declare_const_and_shader_buffers(struct si_shader_context *ctx,
332 bool assign_params)
333 {
334 enum ac_arg_type const_shader_buf_type;
335
336 if (ctx->shader->selector->info.const_buffers_declared == 1 &&
337 ctx->shader->selector->info.shader_buffers_declared == 0)
338 const_shader_buf_type = AC_ARG_CONST_FLOAT_PTR;
339 else
340 const_shader_buf_type = AC_ARG_CONST_DESC_PTR;
341
342 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, const_shader_buf_type,
343 assign_params ? &ctx->const_and_shader_buffers :
344 &ctx->other_const_and_shader_buffers);
345 }
346
347 static void declare_samplers_and_images(struct si_shader_context *ctx,
348 bool assign_params)
349 {
350 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR,
351 assign_params ? &ctx->samplers_and_images :
352 &ctx->other_samplers_and_images);
353 }
354
355 static void declare_per_stage_desc_pointers(struct si_shader_context *ctx,
356 bool assign_params)
357 {
358 declare_const_and_shader_buffers(ctx, assign_params);
359 declare_samplers_and_images(ctx, assign_params);
360 }
361
362 static void declare_global_desc_pointers(struct si_shader_context *ctx)
363 {
364 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR,
365 &ctx->rw_buffers);
366 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR,
367 &ctx->bindless_samplers_and_images);
368 }
369
370 static void declare_vs_specific_input_sgprs(struct si_shader_context *ctx)
371 {
372 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
373 if (!ctx->shader->is_gs_copy_shader) {
374 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.base_vertex);
375 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.start_instance);
376 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.draw_id);
377 }
378 }
379
380 static void declare_vb_descriptor_input_sgprs(struct si_shader_context *ctx)
381 {
382 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &ctx->vertex_buffers);
383
384 unsigned num_vbos_in_user_sgprs = ctx->shader->selector->num_vbos_in_user_sgprs;
385 if (num_vbos_in_user_sgprs) {
386 unsigned user_sgprs = ctx->args.num_sgprs_used;
387
388 if (si_is_merged_shader(ctx))
389 user_sgprs -= 8;
390 assert(user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST);
391
392 /* Declare unused SGPRs to align VB descriptors to 4 SGPRs (hw requirement). */
393 for (unsigned i = user_sgprs; i < SI_SGPR_VS_VB_DESCRIPTOR_FIRST; i++)
394 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
395
396 assert(num_vbos_in_user_sgprs <= ARRAY_SIZE(ctx->vb_descriptors));
397 for (unsigned i = 0; i < num_vbos_in_user_sgprs; i++)
398 ac_add_arg(&ctx->args, AC_ARG_SGPR, 4, AC_ARG_INT, &ctx->vb_descriptors[i]);
399 }
400 }
401
402 static void declare_vs_input_vgprs(struct si_shader_context *ctx,
403 unsigned *num_prolog_vgprs,
404 bool ngg_cull_shader)
405 {
406 struct si_shader *shader = ctx->shader;
407
408 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.vertex_id);
409 if (shader->key.as_ls) {
410 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->rel_auto_id);
411 if (ctx->screen->info.chip_class >= GFX10) {
412 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
413 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
414 } else {
415 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
416 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */
417 }
418 } else if (ctx->screen->info.chip_class >= GFX10) {
419 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
420 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
421 &ctx->vs_prim_id); /* user vgpr or PrimID (legacy) */
422 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
423 } else {
424 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
425 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->vs_prim_id);
426 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */
427 }
428
429 if (!shader->is_gs_copy_shader) {
430 if (shader->key.opt.ngg_culling && !ngg_cull_shader) {
431 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
432 &ctx->ngg_old_thread_id);
433 }
434
435 /* Vertex load indices. */
436 if (shader->selector->info.num_inputs) {
437 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
438 &ctx->vertex_index0);
439 for (unsigned i = 1; i < shader->selector->info.num_inputs; i++)
440 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL);
441 }
442 *num_prolog_vgprs += shader->selector->info.num_inputs;
443 }
444 }
445
446 static void declare_vs_blit_inputs(struct si_shader_context *ctx,
447 unsigned vs_blit_property)
448 {
449 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
450 &ctx->vs_blit_inputs); /* i16 x1, y1 */
451 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* i16 x1, y1 */
452 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* depth */
453
454 if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) {
455 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color0 */
456 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color1 */
457 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color2 */
458 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color3 */
459 } else if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD) {
460 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x1 */
461 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y1 */
462 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x2 */
463 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y2 */
464 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.z */
465 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.w */
466 }
467 }
468
469 static void declare_tes_input_vgprs(struct si_shader_context *ctx, bool ngg_cull_shader)
470 {
471 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->tes_u);
472 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->tes_v);
473 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->tes_rel_patch_id);
474 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tes_patch_id);
475
476 if (ctx->shader->key.opt.ngg_culling && !ngg_cull_shader) {
477 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
478 &ctx->ngg_old_thread_id);
479 }
480 }
481
482 enum {
483 /* Convenient merged shader definitions. */
484 SI_SHADER_MERGED_VERTEX_TESSCTRL = PIPE_SHADER_TYPES,
485 SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY,
486 };
487
488 void si_add_arg_checked(struct ac_shader_args *args,
489 enum ac_arg_regfile file,
490 unsigned registers, enum ac_arg_type type,
491 struct ac_arg *arg,
492 unsigned idx)
493 {
494 assert(args->arg_count == idx);
495 ac_add_arg(args, file, registers, type, arg);
496 }
497
498 void si_create_function(struct si_shader_context *ctx, bool ngg_cull_shader)
499 {
500 struct si_shader *shader = ctx->shader;
501 LLVMTypeRef returns[AC_MAX_ARGS];
502 unsigned i, num_return_sgprs;
503 unsigned num_returns = 0;
504 unsigned num_prolog_vgprs = 0;
505 unsigned type = ctx->type;
506 unsigned vs_blit_property =
507 shader->selector->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD];
508
509 memset(&ctx->args, 0, sizeof(ctx->args));
510
511 /* Set MERGED shaders. */
512 if (ctx->screen->info.chip_class >= GFX9) {
513 if (shader->key.as_ls || type == PIPE_SHADER_TESS_CTRL)
514 type = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */
515 else if (shader->key.as_es || shader->key.as_ngg || type == PIPE_SHADER_GEOMETRY)
516 type = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY;
517 }
518
519 switch (type) {
520 case PIPE_SHADER_VERTEX:
521 declare_global_desc_pointers(ctx);
522
523 if (vs_blit_property) {
524 declare_vs_blit_inputs(ctx, vs_blit_property);
525
526 /* VGPRs */
527 declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader);
528 break;
529 }
530
531 declare_per_stage_desc_pointers(ctx, true);
532 declare_vs_specific_input_sgprs(ctx);
533 if (!shader->is_gs_copy_shader)
534 declare_vb_descriptor_input_sgprs(ctx);
535
536 if (shader->key.as_es) {
537 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
538 &ctx->es2gs_offset);
539 } else if (shader->key.as_ls) {
540 /* no extra parameters */
541 } else {
542 /* The locations of the other parameters are assigned dynamically. */
543 declare_streamout_params(ctx, &shader->selector->so);
544 }
545
546 /* VGPRs */
547 declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader);
548
549 /* Return values */
550 if (shader->key.opt.vs_as_prim_discard_cs) {
551 for (i = 0; i < 4; i++)
552 returns[num_returns++] = ctx->ac.f32; /* VGPRs */
553 }
554 break;
555
556 case PIPE_SHADER_TESS_CTRL: /* GFX6-GFX8 */
557 declare_global_desc_pointers(ctx);
558 declare_per_stage_desc_pointers(ctx, true);
559 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
560 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_offsets);
561 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout);
562 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
563 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
564 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_factor_offset);
565
566 /* VGPRs */
567 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_patch_id);
568 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_rel_ids);
569
570 /* param_tcs_offchip_offset and param_tcs_factor_offset are
571 * placed after the user SGPRs.
572 */
573 for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++)
574 returns[num_returns++] = ctx->ac.i32; /* SGPRs */
575 for (i = 0; i < 11; i++)
576 returns[num_returns++] = ctx->ac.f32; /* VGPRs */
577 break;
578
579 case SI_SHADER_MERGED_VERTEX_TESSCTRL:
580 /* Merged stages have 8 system SGPRs at the beginning. */
581 /* SPI_SHADER_USER_DATA_ADDR_LO/HI_HS */
582 declare_per_stage_desc_pointers(ctx,
583 ctx->type == PIPE_SHADER_TESS_CTRL);
584 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
585 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_wave_info);
586 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_factor_offset);
587 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_scratch_offset);
588 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
589 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
590
591 declare_global_desc_pointers(ctx);
592 declare_per_stage_desc_pointers(ctx,
593 ctx->type == PIPE_SHADER_VERTEX);
594 declare_vs_specific_input_sgprs(ctx);
595
596 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
597 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_offsets);
598 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout);
599 declare_vb_descriptor_input_sgprs(ctx);
600
601 /* VGPRs (first TCS, then VS) */
602 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_patch_id);
603 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_rel_ids);
604
605 if (ctx->type == PIPE_SHADER_VERTEX) {
606 declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader);
607
608 /* LS return values are inputs to the TCS main shader part. */
609 for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++)
610 returns[num_returns++] = ctx->ac.i32; /* SGPRs */
611 for (i = 0; i < 2; i++)
612 returns[num_returns++] = ctx->ac.f32; /* VGPRs */
613 } else {
614 /* TCS return values are inputs to the TCS epilog.
615 *
616 * param_tcs_offchip_offset, param_tcs_factor_offset,
617 * param_tcs_offchip_layout, and param_rw_buffers
618 * should be passed to the epilog.
619 */
620 for (i = 0; i <= 8 + GFX9_SGPR_TCS_OUT_LAYOUT; i++)
621 returns[num_returns++] = ctx->ac.i32; /* SGPRs */
622 for (i = 0; i < 11; i++)
623 returns[num_returns++] = ctx->ac.f32; /* VGPRs */
624 }
625 break;
626
627 case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY:
628 /* Merged stages have 8 system SGPRs at the beginning. */
629 /* SPI_SHADER_USER_DATA_ADDR_LO/HI_GS */
630 declare_per_stage_desc_pointers(ctx,
631 ctx->type == PIPE_SHADER_GEOMETRY);
632
633 if (ctx->shader->key.as_ngg)
634 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs_tg_info);
635 else
636 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs2vs_offset);
637
638 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_wave_info);
639 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
640 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_scratch_offset);
641 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR,
642 &ctx->small_prim_cull_info); /* SPI_SHADER_PGM_LO_GS << 8 */
643 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */
644
645 declare_global_desc_pointers(ctx);
646 if (ctx->type != PIPE_SHADER_VERTEX || !vs_blit_property) {
647 declare_per_stage_desc_pointers(ctx,
648 (ctx->type == PIPE_SHADER_VERTEX ||
649 ctx->type == PIPE_SHADER_TESS_EVAL));
650 }
651
652 if (ctx->type == PIPE_SHADER_VERTEX) {
653 if (vs_blit_property)
654 declare_vs_blit_inputs(ctx, vs_blit_property);
655 else
656 declare_vs_specific_input_sgprs(ctx);
657 } else {
658 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
659 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
660 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tes_offchip_addr);
661 /* Declare as many input SGPRs as the VS has. */
662 }
663
664 if (ctx->type == PIPE_SHADER_VERTEX)
665 declare_vb_descriptor_input_sgprs(ctx);
666
667 /* VGPRs (first GS, then VS/TES) */
668 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx01_offset);
669 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx23_offset);
670 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_prim_id);
671 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_invocation_id);
672 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx45_offset);
673
674 if (ctx->type == PIPE_SHADER_VERTEX) {
675 declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader);
676 } else if (ctx->type == PIPE_SHADER_TESS_EVAL) {
677 declare_tes_input_vgprs(ctx, ngg_cull_shader);
678 }
679
680 if ((ctx->shader->key.as_es || ngg_cull_shader) &&
681 (ctx->type == PIPE_SHADER_VERTEX ||
682 ctx->type == PIPE_SHADER_TESS_EVAL)) {
683 unsigned num_user_sgprs, num_vgprs;
684
685 if (ctx->type == PIPE_SHADER_VERTEX) {
686 /* For the NGG cull shader, add 1 SGPR to hold
687 * the vertex buffer pointer.
688 */
689 num_user_sgprs = GFX9_VSGS_NUM_USER_SGPR + ngg_cull_shader;
690
691 if (ngg_cull_shader && shader->selector->num_vbos_in_user_sgprs) {
692 assert(num_user_sgprs <= 8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST);
693 num_user_sgprs = SI_SGPR_VS_VB_DESCRIPTOR_FIRST +
694 shader->selector->num_vbos_in_user_sgprs * 4;
695 }
696 } else {
697 num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR;
698 }
699
700 /* The NGG cull shader has to return all 9 VGPRs + the old thread ID.
701 *
702 * The normal merged ESGS shader only has to return the 5 VGPRs
703 * for the GS stage.
704 */
705 num_vgprs = ngg_cull_shader ? 10 : 5;
706
707 /* ES return values are inputs to GS. */
708 for (i = 0; i < 8 + num_user_sgprs; i++)
709 returns[num_returns++] = ctx->ac.i32; /* SGPRs */
710 for (i = 0; i < num_vgprs; i++)
711 returns[num_returns++] = ctx->ac.f32; /* VGPRs */
712 }
713 break;
714
715 case PIPE_SHADER_TESS_EVAL:
716 declare_global_desc_pointers(ctx);
717 declare_per_stage_desc_pointers(ctx, true);
718 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
719 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
720 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tes_offchip_addr);
721
722 if (shader->key.as_es) {
723 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
724 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
725 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->es2gs_offset);
726 } else {
727 declare_streamout_params(ctx, &shader->selector->so);
728 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
729 }
730
731 /* VGPRs */
732 declare_tes_input_vgprs(ctx, ngg_cull_shader);
733 break;
734
735 case PIPE_SHADER_GEOMETRY:
736 declare_global_desc_pointers(ctx);
737 declare_per_stage_desc_pointers(ctx, true);
738 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs2vs_offset);
739 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs_wave_id);
740
741 /* VGPRs */
742 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[0]);
743 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[1]);
744 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_prim_id);
745 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[2]);
746 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[3]);
747 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[4]);
748 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[5]);
749 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_invocation_id);
750 break;
751
752 case PIPE_SHADER_FRAGMENT:
753 declare_global_desc_pointers(ctx);
754 declare_per_stage_desc_pointers(ctx, true);
755 si_add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL,
756 SI_PARAM_ALPHA_REF);
757 si_add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
758 &ctx->args.prim_mask, SI_PARAM_PRIM_MASK);
759
760 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.persp_sample,
761 SI_PARAM_PERSP_SAMPLE);
762 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT,
763 &ctx->args.persp_center, SI_PARAM_PERSP_CENTER);
764 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT,
765 &ctx->args.persp_centroid, SI_PARAM_PERSP_CENTROID);
766 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_INT,
767 NULL, SI_PARAM_PERSP_PULL_MODEL);
768 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT,
769 &ctx->args.linear_sample, SI_PARAM_LINEAR_SAMPLE);
770 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT,
771 &ctx->args.linear_center, SI_PARAM_LINEAR_CENTER);
772 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT,
773 &ctx->args.linear_centroid, SI_PARAM_LINEAR_CENTROID);
774 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_FLOAT,
775 NULL, SI_PARAM_LINE_STIPPLE_TEX);
776 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT,
777 &ctx->args.frag_pos[0], SI_PARAM_POS_X_FLOAT);
778 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT,
779 &ctx->args.frag_pos[1], SI_PARAM_POS_Y_FLOAT);
780 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT,
781 &ctx->args.frag_pos[2], SI_PARAM_POS_Z_FLOAT);
782 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT,
783 &ctx->args.frag_pos[3], SI_PARAM_POS_W_FLOAT);
784 shader->info.face_vgpr_index = ctx->args.num_vgprs_used;
785 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
786 &ctx->args.front_face, SI_PARAM_FRONT_FACE);
787 shader->info.ancillary_vgpr_index = ctx->args.num_vgprs_used;
788 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
789 &ctx->args.ancillary, SI_PARAM_ANCILLARY);
790 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT,
791 &ctx->args.sample_coverage, SI_PARAM_SAMPLE_COVERAGE);
792 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
793 &ctx->pos_fixed_pt, SI_PARAM_POS_FIXED_PT);
794
795 /* Color inputs from the prolog. */
796 if (shader->selector->info.colors_read) {
797 unsigned num_color_elements =
798 util_bitcount(shader->selector->info.colors_read);
799
800 for (i = 0; i < num_color_elements; i++)
801 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL);
802
803 num_prolog_vgprs += num_color_elements;
804 }
805
806 /* Outputs for the epilog. */
807 num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
808 num_returns =
809 num_return_sgprs +
810 util_bitcount(shader->selector->info.colors_written) * 4 +
811 shader->selector->info.writes_z +
812 shader->selector->info.writes_stencil +
813 shader->selector->info.writes_samplemask +
814 1 /* SampleMaskIn */;
815
816 num_returns = MAX2(num_returns,
817 num_return_sgprs +
818 PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
819
820 for (i = 0; i < num_return_sgprs; i++)
821 returns[i] = ctx->ac.i32;
822 for (; i < num_returns; i++)
823 returns[i] = ctx->ac.f32;
824 break;
825
826 case PIPE_SHADER_COMPUTE:
827 declare_global_desc_pointers(ctx);
828 declare_per_stage_desc_pointers(ctx, true);
829 if (shader->selector->info.uses_grid_size)
830 ac_add_arg(&ctx->args, AC_ARG_SGPR, 3, AC_ARG_INT,
831 &ctx->args.num_work_groups);
832 if (shader->selector->info.uses_block_size &&
833 shader->selector->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0)
834 ac_add_arg(&ctx->args, AC_ARG_SGPR, 3, AC_ARG_INT, &ctx->block_size);
835
836 unsigned cs_user_data_dwords =
837 shader->selector->info.properties[TGSI_PROPERTY_CS_USER_DATA_COMPONENTS_AMD];
838 if (cs_user_data_dwords) {
839 ac_add_arg(&ctx->args, AC_ARG_SGPR, cs_user_data_dwords, AC_ARG_INT,
840 &ctx->cs_user_data);
841 }
842
843 /* Hardware SGPRs. */
844 for (i = 0; i < 3; i++) {
845 if (shader->selector->info.uses_block_id[i]) {
846 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
847 &ctx->args.workgroup_ids[i]);
848 }
849 }
850 if (shader->selector->info.uses_subgroup_info)
851 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tg_size);
852
853 /* Hardware VGPRs. */
854 ac_add_arg(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_INT,
855 &ctx->args.local_invocation_ids);
856 break;
857 default:
858 assert(0 && "unimplemented shader");
859 return;
860 }
861
862 si_llvm_create_func(ctx, ngg_cull_shader ? "ngg_cull_main" : "main",
863 returns, num_returns, si_get_max_workgroup_size(shader));
864
865 /* Reserve register locations for VGPR inputs the PS prolog may need. */
866 if (ctx->type == PIPE_SHADER_FRAGMENT && !ctx->shader->is_monolithic) {
867 ac_llvm_add_target_dep_function_attr(ctx->main_fn,
868 "InitialPSInputAddr",
869 S_0286D0_PERSP_SAMPLE_ENA(1) |
870 S_0286D0_PERSP_CENTER_ENA(1) |
871 S_0286D0_PERSP_CENTROID_ENA(1) |
872 S_0286D0_LINEAR_SAMPLE_ENA(1) |
873 S_0286D0_LINEAR_CENTER_ENA(1) |
874 S_0286D0_LINEAR_CENTROID_ENA(1) |
875 S_0286D0_FRONT_FACE_ENA(1) |
876 S_0286D0_ANCILLARY_ENA(1) |
877 S_0286D0_POS_FIXED_PT_ENA(1));
878 }
879
880 shader->info.num_input_sgprs = ctx->args.num_sgprs_used;
881 shader->info.num_input_vgprs = ctx->args.num_vgprs_used;
882
883 assert(shader->info.num_input_vgprs >= num_prolog_vgprs);
884 shader->info.num_input_vgprs -= num_prolog_vgprs;
885
886 if (shader->key.as_ls || ctx->type == PIPE_SHADER_TESS_CTRL) {
887 if (USE_LDS_SYMBOLS && LLVM_VERSION_MAJOR >= 9) {
888 /* The LSHS size is not known until draw time, so we append it
889 * at the end of whatever LDS use there may be in the rest of
890 * the shader (currently none, unless LLVM decides to do its
891 * own LDS-based lowering).
892 */
893 ctx->ac.lds = LLVMAddGlobalInAddressSpace(
894 ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0),
895 "__lds_end", AC_ADDR_SPACE_LDS);
896 LLVMSetAlignment(ctx->ac.lds, 256);
897 } else {
898 ac_declare_lds_as_pointer(&ctx->ac);
899 }
900 }
901
902 /* Unlike radv, we override these arguments in the prolog, so to the
903 * API shader they appear as normal arguments.
904 */
905 if (ctx->type == PIPE_SHADER_VERTEX) {
906 ctx->abi.vertex_id = ac_get_arg(&ctx->ac, ctx->args.vertex_id);
907 ctx->abi.instance_id = ac_get_arg(&ctx->ac, ctx->args.instance_id);
908 } else if (ctx->type == PIPE_SHADER_FRAGMENT) {
909 ctx->abi.persp_centroid = ac_get_arg(&ctx->ac, ctx->args.persp_centroid);
910 ctx->abi.linear_centroid = ac_get_arg(&ctx->ac, ctx->args.linear_centroid);
911 }
912 }
913
914 /* For the UMR disassembler. */
915 #define DEBUGGER_END_OF_CODE_MARKER 0xbf9f0000 /* invalid instruction */
916 #define DEBUGGER_NUM_MARKERS 5
917
918 static bool si_shader_binary_open(struct si_screen *screen,
919 struct si_shader *shader,
920 struct ac_rtld_binary *rtld)
921 {
922 const struct si_shader_selector *sel = shader->selector;
923 const char *part_elfs[5];
924 size_t part_sizes[5];
925 unsigned num_parts = 0;
926
927 #define add_part(shader_or_part) \
928 if (shader_or_part) { \
929 part_elfs[num_parts] = (shader_or_part)->binary.elf_buffer; \
930 part_sizes[num_parts] = (shader_or_part)->binary.elf_size; \
931 num_parts++; \
932 }
933
934 add_part(shader->prolog);
935 add_part(shader->previous_stage);
936 add_part(shader->prolog2);
937 add_part(shader);
938 add_part(shader->epilog);
939
940 #undef add_part
941
942 struct ac_rtld_symbol lds_symbols[2];
943 unsigned num_lds_symbols = 0;
944
945 if (sel && screen->info.chip_class >= GFX9 && !shader->is_gs_copy_shader &&
946 (sel->type == PIPE_SHADER_GEOMETRY || shader->key.as_ngg)) {
947 /* We add this symbol even on LLVM <= 8 to ensure that
948 * shader->config.lds_size is set correctly below.
949 */
950 struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
951 sym->name = "esgs_ring";
952 sym->size = shader->gs_info.esgs_ring_size;
953 sym->align = 64 * 1024;
954 }
955
956 if (shader->key.as_ngg && sel->type == PIPE_SHADER_GEOMETRY) {
957 struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
958 sym->name = "ngg_emit";
959 sym->size = shader->ngg.ngg_emit_size * 4;
960 sym->align = 4;
961 }
962
963 bool ok = ac_rtld_open(rtld, (struct ac_rtld_open_info){
964 .info = &screen->info,
965 .options = {
966 .halt_at_entry = screen->options.halt_shaders,
967 },
968 .shader_type = tgsi_processor_to_shader_stage(sel->type),
969 .wave_size = si_get_shader_wave_size(shader),
970 .num_parts = num_parts,
971 .elf_ptrs = part_elfs,
972 .elf_sizes = part_sizes,
973 .num_shared_lds_symbols = num_lds_symbols,
974 .shared_lds_symbols = lds_symbols });
975
976 if (rtld->lds_size > 0) {
977 unsigned alloc_granularity = screen->info.chip_class >= GFX7 ? 512 : 256;
978 shader->config.lds_size =
979 align(rtld->lds_size, alloc_granularity) / alloc_granularity;
980 }
981
982 return ok;
983 }
984
985 static unsigned si_get_shader_binary_size(struct si_screen *screen, struct si_shader *shader)
986 {
987 struct ac_rtld_binary rtld;
988 si_shader_binary_open(screen, shader, &rtld);
989 return rtld.exec_size;
990 }
991
992 static bool si_get_external_symbol(void *data, const char *name, uint64_t *value)
993 {
994 uint64_t *scratch_va = data;
995
996 if (!strcmp(scratch_rsrc_dword0_symbol, name)) {
997 *value = (uint32_t)*scratch_va;
998 return true;
999 }
1000 if (!strcmp(scratch_rsrc_dword1_symbol, name)) {
1001 /* Enable scratch coalescing. */
1002 *value = S_008F04_BASE_ADDRESS_HI(*scratch_va >> 32) |
1003 S_008F04_SWIZZLE_ENABLE(1);
1004 return true;
1005 }
1006
1007 return false;
1008 }
1009
1010 bool si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader,
1011 uint64_t scratch_va)
1012 {
1013 struct ac_rtld_binary binary;
1014 if (!si_shader_binary_open(sscreen, shader, &binary))
1015 return false;
1016
1017 si_resource_reference(&shader->bo, NULL);
1018 shader->bo = si_aligned_buffer_create(&sscreen->b,
1019 sscreen->info.cpdma_prefetch_writes_memory ?
1020 0 : SI_RESOURCE_FLAG_READ_ONLY,
1021 PIPE_USAGE_IMMUTABLE,
1022 align(binary.rx_size, SI_CPDMA_ALIGNMENT),
1023 256);
1024 if (!shader->bo)
1025 return false;
1026
1027 /* Upload. */
1028 struct ac_rtld_upload_info u = {};
1029 u.binary = &binary;
1030 u.get_external_symbol = si_get_external_symbol;
1031 u.cb_data = &scratch_va;
1032 u.rx_va = shader->bo->gpu_address;
1033 u.rx_ptr = sscreen->ws->buffer_map(shader->bo->buf, NULL,
1034 PIPE_TRANSFER_READ_WRITE |
1035 PIPE_TRANSFER_UNSYNCHRONIZED |
1036 RADEON_TRANSFER_TEMPORARY);
1037 if (!u.rx_ptr)
1038 return false;
1039
1040 bool ok = ac_rtld_upload(&u);
1041
1042 sscreen->ws->buffer_unmap(shader->bo->buf);
1043 ac_rtld_close(&binary);
1044
1045 return ok;
1046 }
1047
1048 static void si_shader_dump_disassembly(struct si_screen *screen,
1049 const struct si_shader_binary *binary,
1050 enum pipe_shader_type shader_type,
1051 unsigned wave_size,
1052 struct pipe_debug_callback *debug,
1053 const char *name, FILE *file)
1054 {
1055 struct ac_rtld_binary rtld_binary;
1056
1057 if (!ac_rtld_open(&rtld_binary, (struct ac_rtld_open_info){
1058 .info = &screen->info,
1059 .shader_type = tgsi_processor_to_shader_stage(shader_type),
1060 .wave_size = wave_size,
1061 .num_parts = 1,
1062 .elf_ptrs = &binary->elf_buffer,
1063 .elf_sizes = &binary->elf_size }))
1064 return;
1065
1066 const char *disasm;
1067 size_t nbytes;
1068
1069 if (!ac_rtld_get_section_by_name(&rtld_binary, ".AMDGPU.disasm", &disasm, &nbytes))
1070 goto out;
1071
1072 if (nbytes > INT_MAX)
1073 goto out;
1074
1075 if (debug && debug->debug_message) {
1076 /* Very long debug messages are cut off, so send the
1077 * disassembly one line at a time. This causes more
1078 * overhead, but on the plus side it simplifies
1079 * parsing of resulting logs.
1080 */
1081 pipe_debug_message(debug, SHADER_INFO,
1082 "Shader Disassembly Begin");
1083
1084 uint64_t line = 0;
1085 while (line < nbytes) {
1086 int count = nbytes - line;
1087 const char *nl = memchr(disasm + line, '\n', nbytes - line);
1088 if (nl)
1089 count = nl - (disasm + line);
1090
1091 if (count) {
1092 pipe_debug_message(debug, SHADER_INFO,
1093 "%.*s", count, disasm + line);
1094 }
1095
1096 line += count + 1;
1097 }
1098
1099 pipe_debug_message(debug, SHADER_INFO,
1100 "Shader Disassembly End");
1101 }
1102
1103 if (file) {
1104 fprintf(file, "Shader %s disassembly:\n", name);
1105 fprintf(file, "%*s", (int)nbytes, disasm);
1106 }
1107
1108 out:
1109 ac_rtld_close(&rtld_binary);
1110 }
1111
1112 static void si_calculate_max_simd_waves(struct si_shader *shader)
1113 {
1114 struct si_screen *sscreen = shader->selector->screen;
1115 struct ac_shader_config *conf = &shader->config;
1116 unsigned num_inputs = shader->selector->info.num_inputs;
1117 unsigned lds_increment = sscreen->info.chip_class >= GFX7 ? 512 : 256;
1118 unsigned lds_per_wave = 0;
1119 unsigned max_simd_waves;
1120
1121 max_simd_waves = sscreen->info.max_wave64_per_simd;
1122
1123 /* Compute LDS usage for PS. */
1124 switch (shader->selector->type) {
1125 case PIPE_SHADER_FRAGMENT:
1126 /* The minimum usage per wave is (num_inputs * 48). The maximum
1127 * usage is (num_inputs * 48 * 16).
1128 * We can get anything in between and it varies between waves.
1129 *
1130 * The 48 bytes per input for a single primitive is equal to
1131 * 4 bytes/component * 4 components/input * 3 points.
1132 *
1133 * Other stages don't know the size at compile time or don't
1134 * allocate LDS per wave, but instead they do it per thread group.
1135 */
1136 lds_per_wave = conf->lds_size * lds_increment +
1137 align(num_inputs * 48, lds_increment);
1138 break;
1139 case PIPE_SHADER_COMPUTE:
1140 if (shader->selector) {
1141 unsigned max_workgroup_size =
1142 si_get_max_workgroup_size(shader);
1143 lds_per_wave = (conf->lds_size * lds_increment) /
1144 DIV_ROUND_UP(max_workgroup_size,
1145 sscreen->compute_wave_size);
1146 }
1147 break;
1148 default:;
1149 }
1150
1151 /* Compute the per-SIMD wave counts. */
1152 if (conf->num_sgprs) {
1153 max_simd_waves =
1154 MIN2(max_simd_waves,
1155 sscreen->info.num_physical_sgprs_per_simd / conf->num_sgprs);
1156 }
1157
1158 if (conf->num_vgprs) {
1159 /* Always print wave limits as Wave64, so that we can compare
1160 * Wave32 and Wave64 with shader-db fairly. */
1161 unsigned max_vgprs = sscreen->info.num_physical_wave64_vgprs_per_simd;
1162 max_simd_waves = MIN2(max_simd_waves, max_vgprs / conf->num_vgprs);
1163 }
1164
1165 /* LDS is 64KB per CU (4 SIMDs) on GFX6-9, which is 16KB per SIMD (usage above
1166 * 16KB makes some SIMDs unoccupied).
1167 *
1168 * LDS is 128KB in WGP mode and 64KB in CU mode. Assume the WGP mode is used.
1169 */
1170 unsigned max_lds_size = sscreen->info.chip_class >= GFX10 ? 128*1024 : 64*1024;
1171 unsigned max_lds_per_simd = max_lds_size / 4;
1172 if (lds_per_wave)
1173 max_simd_waves = MIN2(max_simd_waves, max_lds_per_simd / lds_per_wave);
1174
1175 shader->info.max_simd_waves = max_simd_waves;
1176 }
1177
1178 void si_shader_dump_stats_for_shader_db(struct si_screen *screen,
1179 struct si_shader *shader,
1180 struct pipe_debug_callback *debug)
1181 {
1182 const struct ac_shader_config *conf = &shader->config;
1183
1184 if (screen->options.debug_disassembly)
1185 si_shader_dump_disassembly(screen, &shader->binary,
1186 shader->selector->type,
1187 si_get_shader_wave_size(shader),
1188 debug, "main", NULL);
1189
1190 pipe_debug_message(debug, SHADER_INFO,
1191 "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
1192 "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "
1193 "Spilled VGPRs: %d PrivMem VGPRs: %d",
1194 conf->num_sgprs, conf->num_vgprs,
1195 si_get_shader_binary_size(screen, shader),
1196 conf->lds_size, conf->scratch_bytes_per_wave,
1197 shader->info.max_simd_waves, conf->spilled_sgprs,
1198 conf->spilled_vgprs, shader->info.private_mem_vgprs);
1199 }
1200
1201 static void si_shader_dump_stats(struct si_screen *sscreen,
1202 struct si_shader *shader,
1203 FILE *file,
1204 bool check_debug_option)
1205 {
1206 const struct ac_shader_config *conf = &shader->config;
1207
1208 if (!check_debug_option ||
1209 si_can_dump_shader(sscreen, shader->selector->type)) {
1210 if (shader->selector->type == PIPE_SHADER_FRAGMENT) {
1211 fprintf(file, "*** SHADER CONFIG ***\n"
1212 "SPI_PS_INPUT_ADDR = 0x%04x\n"
1213 "SPI_PS_INPUT_ENA = 0x%04x\n",
1214 conf->spi_ps_input_addr, conf->spi_ps_input_ena);
1215 }
1216
1217 fprintf(file, "*** SHADER STATS ***\n"
1218 "SGPRS: %d\n"
1219 "VGPRS: %d\n"
1220 "Spilled SGPRs: %d\n"
1221 "Spilled VGPRs: %d\n"
1222 "Private memory VGPRs: %d\n"
1223 "Code Size: %d bytes\n"
1224 "LDS: %d blocks\n"
1225 "Scratch: %d bytes per wave\n"
1226 "Max Waves: %d\n"
1227 "********************\n\n\n",
1228 conf->num_sgprs, conf->num_vgprs,
1229 conf->spilled_sgprs, conf->spilled_vgprs,
1230 shader->info.private_mem_vgprs,
1231 si_get_shader_binary_size(sscreen, shader),
1232 conf->lds_size, conf->scratch_bytes_per_wave,
1233 shader->info.max_simd_waves);
1234 }
1235 }
1236
1237 const char *si_get_shader_name(const struct si_shader *shader)
1238 {
1239 switch (shader->selector->type) {
1240 case PIPE_SHADER_VERTEX:
1241 if (shader->key.as_es)
1242 return "Vertex Shader as ES";
1243 else if (shader->key.as_ls)
1244 return "Vertex Shader as LS";
1245 else if (shader->key.opt.vs_as_prim_discard_cs)
1246 return "Vertex Shader as Primitive Discard CS";
1247 else if (shader->key.as_ngg)
1248 return "Vertex Shader as ESGS";
1249 else
1250 return "Vertex Shader as VS";
1251 case PIPE_SHADER_TESS_CTRL:
1252 return "Tessellation Control Shader";
1253 case PIPE_SHADER_TESS_EVAL:
1254 if (shader->key.as_es)
1255 return "Tessellation Evaluation Shader as ES";
1256 else if (shader->key.as_ngg)
1257 return "Tessellation Evaluation Shader as ESGS";
1258 else
1259 return "Tessellation Evaluation Shader as VS";
1260 case PIPE_SHADER_GEOMETRY:
1261 if (shader->is_gs_copy_shader)
1262 return "GS Copy Shader as VS";
1263 else
1264 return "Geometry Shader";
1265 case PIPE_SHADER_FRAGMENT:
1266 return "Pixel Shader";
1267 case PIPE_SHADER_COMPUTE:
1268 return "Compute Shader";
1269 default:
1270 return "Unknown Shader";
1271 }
1272 }
1273
1274 void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
1275 struct pipe_debug_callback *debug,
1276 FILE *file, bool check_debug_option)
1277 {
1278 enum pipe_shader_type shader_type = shader->selector->type;
1279
1280 if (!check_debug_option ||
1281 si_can_dump_shader(sscreen, shader_type))
1282 si_dump_shader_key(shader, file);
1283
1284 if (!check_debug_option && shader->binary.llvm_ir_string) {
1285 if (shader->previous_stage &&
1286 shader->previous_stage->binary.llvm_ir_string) {
1287 fprintf(file, "\n%s - previous stage - LLVM IR:\n\n",
1288 si_get_shader_name(shader));
1289 fprintf(file, "%s\n", shader->previous_stage->binary.llvm_ir_string);
1290 }
1291
1292 fprintf(file, "\n%s - main shader part - LLVM IR:\n\n",
1293 si_get_shader_name(shader));
1294 fprintf(file, "%s\n", shader->binary.llvm_ir_string);
1295 }
1296
1297 if (!check_debug_option ||
1298 (si_can_dump_shader(sscreen, shader_type) &&
1299 !(sscreen->debug_flags & DBG(NO_ASM)))) {
1300 unsigned wave_size = si_get_shader_wave_size(shader);
1301
1302 fprintf(file, "\n%s:\n", si_get_shader_name(shader));
1303
1304 if (shader->prolog)
1305 si_shader_dump_disassembly(sscreen, &shader->prolog->binary,
1306 shader_type, wave_size, debug, "prolog", file);
1307 if (shader->previous_stage)
1308 si_shader_dump_disassembly(sscreen, &shader->previous_stage->binary,
1309 shader_type, wave_size, debug, "previous stage", file);
1310 if (shader->prolog2)
1311 si_shader_dump_disassembly(sscreen, &shader->prolog2->binary,
1312 shader_type, wave_size, debug, "prolog2", file);
1313
1314 si_shader_dump_disassembly(sscreen, &shader->binary, shader_type,
1315 wave_size, debug, "main", file);
1316
1317 if (shader->epilog)
1318 si_shader_dump_disassembly(sscreen, &shader->epilog->binary,
1319 shader_type, wave_size, debug, "epilog", file);
1320 fprintf(file, "\n");
1321 }
1322
1323 si_shader_dump_stats(sscreen, shader, file, check_debug_option);
1324 }
1325
1326 static void si_dump_shader_key_vs(const struct si_shader_key *key,
1327 const struct si_vs_prolog_bits *prolog,
1328 const char *prefix, FILE *f)
1329 {
1330 fprintf(f, " %s.instance_divisor_is_one = %u\n",
1331 prefix, prolog->instance_divisor_is_one);
1332 fprintf(f, " %s.instance_divisor_is_fetched = %u\n",
1333 prefix, prolog->instance_divisor_is_fetched);
1334 fprintf(f, " %s.unpack_instance_id_from_vertex_id = %u\n",
1335 prefix, prolog->unpack_instance_id_from_vertex_id);
1336 fprintf(f, " %s.ls_vgpr_fix = %u\n",
1337 prefix, prolog->ls_vgpr_fix);
1338
1339 fprintf(f, " mono.vs.fetch_opencode = %x\n", key->mono.vs_fetch_opencode);
1340 fprintf(f, " mono.vs.fix_fetch = {");
1341 for (int i = 0; i < SI_MAX_ATTRIBS; i++) {
1342 union si_vs_fix_fetch fix = key->mono.vs_fix_fetch[i];
1343 if (i)
1344 fprintf(f, ", ");
1345 if (!fix.bits)
1346 fprintf(f, "0");
1347 else
1348 fprintf(f, "%u.%u.%u.%u", fix.u.reverse, fix.u.log_size,
1349 fix.u.num_channels_m1, fix.u.format);
1350 }
1351 fprintf(f, "}\n");
1352 }
1353
1354 static void si_dump_shader_key(const struct si_shader *shader, FILE *f)
1355 {
1356 const struct si_shader_key *key = &shader->key;
1357 enum pipe_shader_type shader_type = shader->selector->type;
1358
1359 fprintf(f, "SHADER KEY\n");
1360
1361 switch (shader_type) {
1362 case PIPE_SHADER_VERTEX:
1363 si_dump_shader_key_vs(key, &key->part.vs.prolog,
1364 "part.vs.prolog", f);
1365 fprintf(f, " as_es = %u\n", key->as_es);
1366 fprintf(f, " as_ls = %u\n", key->as_ls);
1367 fprintf(f, " as_ngg = %u\n", key->as_ngg);
1368 fprintf(f, " mono.u.vs_export_prim_id = %u\n",
1369 key->mono.u.vs_export_prim_id);
1370 fprintf(f, " opt.vs_as_prim_discard_cs = %u\n",
1371 key->opt.vs_as_prim_discard_cs);
1372 fprintf(f, " opt.cs_prim_type = %s\n",
1373 tgsi_primitive_names[key->opt.cs_prim_type]);
1374 fprintf(f, " opt.cs_indexed = %u\n",
1375 key->opt.cs_indexed);
1376 fprintf(f, " opt.cs_instancing = %u\n",
1377 key->opt.cs_instancing);
1378 fprintf(f, " opt.cs_primitive_restart = %u\n",
1379 key->opt.cs_primitive_restart);
1380 fprintf(f, " opt.cs_provoking_vertex_first = %u\n",
1381 key->opt.cs_provoking_vertex_first);
1382 fprintf(f, " opt.cs_need_correct_orientation = %u\n",
1383 key->opt.cs_need_correct_orientation);
1384 fprintf(f, " opt.cs_cull_front = %u\n",
1385 key->opt.cs_cull_front);
1386 fprintf(f, " opt.cs_cull_back = %u\n",
1387 key->opt.cs_cull_back);
1388 fprintf(f, " opt.cs_cull_z = %u\n",
1389 key->opt.cs_cull_z);
1390 fprintf(f, " opt.cs_halfz_clip_space = %u\n",
1391 key->opt.cs_halfz_clip_space);
1392 break;
1393
1394 case PIPE_SHADER_TESS_CTRL:
1395 if (shader->selector->screen->info.chip_class >= GFX9) {
1396 si_dump_shader_key_vs(key, &key->part.tcs.ls_prolog,
1397 "part.tcs.ls_prolog", f);
1398 }
1399 fprintf(f, " part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode);
1400 fprintf(f, " mono.u.ff_tcs_inputs_to_copy = 0x%"PRIx64"\n", key->mono.u.ff_tcs_inputs_to_copy);
1401 break;
1402
1403 case PIPE_SHADER_TESS_EVAL:
1404 fprintf(f, " as_es = %u\n", key->as_es);
1405 fprintf(f, " as_ngg = %u\n", key->as_ngg);
1406 fprintf(f, " mono.u.vs_export_prim_id = %u\n",
1407 key->mono.u.vs_export_prim_id);
1408 break;
1409
1410 case PIPE_SHADER_GEOMETRY:
1411 if (shader->is_gs_copy_shader)
1412 break;
1413
1414 if (shader->selector->screen->info.chip_class >= GFX9 &&
1415 key->part.gs.es->type == PIPE_SHADER_VERTEX) {
1416 si_dump_shader_key_vs(key, &key->part.gs.vs_prolog,
1417 "part.gs.vs_prolog", f);
1418 }
1419 fprintf(f, " part.gs.prolog.tri_strip_adj_fix = %u\n", key->part.gs.prolog.tri_strip_adj_fix);
1420 fprintf(f, " part.gs.prolog.gfx9_prev_is_vs = %u\n", key->part.gs.prolog.gfx9_prev_is_vs);
1421 fprintf(f, " as_ngg = %u\n", key->as_ngg);
1422 break;
1423
1424 case PIPE_SHADER_COMPUTE:
1425 break;
1426
1427 case PIPE_SHADER_FRAGMENT:
1428 fprintf(f, " part.ps.prolog.color_two_side = %u\n", key->part.ps.prolog.color_two_side);
1429 fprintf(f, " part.ps.prolog.flatshade_colors = %u\n", key->part.ps.prolog.flatshade_colors);
1430 fprintf(f, " part.ps.prolog.poly_stipple = %u\n", key->part.ps.prolog.poly_stipple);
1431 fprintf(f, " part.ps.prolog.force_persp_sample_interp = %u\n", key->part.ps.prolog.force_persp_sample_interp);
1432 fprintf(f, " part.ps.prolog.force_linear_sample_interp = %u\n", key->part.ps.prolog.force_linear_sample_interp);
1433 fprintf(f, " part.ps.prolog.force_persp_center_interp = %u\n", key->part.ps.prolog.force_persp_center_interp);
1434 fprintf(f, " part.ps.prolog.force_linear_center_interp = %u\n", key->part.ps.prolog.force_linear_center_interp);
1435 fprintf(f, " part.ps.prolog.bc_optimize_for_persp = %u\n", key->part.ps.prolog.bc_optimize_for_persp);
1436 fprintf(f, " part.ps.prolog.bc_optimize_for_linear = %u\n", key->part.ps.prolog.bc_optimize_for_linear);
1437 fprintf(f, " part.ps.prolog.samplemask_log_ps_iter = %u\n", key->part.ps.prolog.samplemask_log_ps_iter);
1438 fprintf(f, " part.ps.epilog.spi_shader_col_format = 0x%x\n", key->part.ps.epilog.spi_shader_col_format);
1439 fprintf(f, " part.ps.epilog.color_is_int8 = 0x%X\n", key->part.ps.epilog.color_is_int8);
1440 fprintf(f, " part.ps.epilog.color_is_int10 = 0x%X\n", key->part.ps.epilog.color_is_int10);
1441 fprintf(f, " part.ps.epilog.last_cbuf = %u\n", key->part.ps.epilog.last_cbuf);
1442 fprintf(f, " part.ps.epilog.alpha_func = %u\n", key->part.ps.epilog.alpha_func);
1443 fprintf(f, " part.ps.epilog.alpha_to_one = %u\n", key->part.ps.epilog.alpha_to_one);
1444 fprintf(f, " part.ps.epilog.poly_line_smoothing = %u\n", key->part.ps.epilog.poly_line_smoothing);
1445 fprintf(f, " part.ps.epilog.clamp_color = %u\n", key->part.ps.epilog.clamp_color);
1446 fprintf(f, " mono.u.ps.interpolate_at_sample_force_center = %u\n", key->mono.u.ps.interpolate_at_sample_force_center);
1447 fprintf(f, " mono.u.ps.fbfetch_msaa = %u\n", key->mono.u.ps.fbfetch_msaa);
1448 fprintf(f, " mono.u.ps.fbfetch_is_1D = %u\n", key->mono.u.ps.fbfetch_is_1D);
1449 fprintf(f, " mono.u.ps.fbfetch_layered = %u\n", key->mono.u.ps.fbfetch_layered);
1450 break;
1451
1452 default:
1453 assert(0);
1454 }
1455
1456 if ((shader_type == PIPE_SHADER_GEOMETRY ||
1457 shader_type == PIPE_SHADER_TESS_EVAL ||
1458 shader_type == PIPE_SHADER_VERTEX) &&
1459 !key->as_es && !key->as_ls) {
1460 fprintf(f, " opt.kill_outputs = 0x%"PRIx64"\n", key->opt.kill_outputs);
1461 fprintf(f, " opt.clip_disable = %u\n", key->opt.clip_disable);
1462 if (shader_type != PIPE_SHADER_GEOMETRY)
1463 fprintf(f, " opt.ngg_culling = 0x%x\n", key->opt.ngg_culling);
1464 }
1465 }
1466
1467 static void si_optimize_vs_outputs(struct si_shader_context *ctx)
1468 {
1469 struct si_shader *shader = ctx->shader;
1470 struct si_shader_info *info = &shader->selector->info;
1471
1472 if ((ctx->type != PIPE_SHADER_VERTEX &&
1473 ctx->type != PIPE_SHADER_TESS_EVAL) ||
1474 shader->key.as_ls ||
1475 shader->key.as_es)
1476 return;
1477
1478 ac_optimize_vs_outputs(&ctx->ac,
1479 ctx->main_fn,
1480 shader->info.vs_output_param_offset,
1481 info->num_outputs,
1482 &shader->info.nr_param_exports);
1483 }
1484
1485 static bool si_vs_needs_prolog(const struct si_shader_selector *sel,
1486 const struct si_vs_prolog_bits *prolog_key,
1487 const struct si_shader_key *key,
1488 bool ngg_cull_shader)
1489 {
1490 /* VGPR initialization fixup for Vega10 and Raven is always done in the
1491 * VS prolog. */
1492 return sel->vs_needs_prolog ||
1493 prolog_key->ls_vgpr_fix ||
1494 prolog_key->unpack_instance_id_from_vertex_id ||
1495 (ngg_cull_shader && key->opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL);
1496 }
1497
1498 static bool si_build_main_function(struct si_shader_context *ctx,
1499 struct nir_shader *nir, bool free_nir,
1500 bool ngg_cull_shader)
1501 {
1502 struct si_shader *shader = ctx->shader;
1503 struct si_shader_selector *sel = shader->selector;
1504
1505 si_llvm_init_resource_callbacks(ctx);
1506
1507 switch (ctx->type) {
1508 case PIPE_SHADER_VERTEX:
1509 si_llvm_init_vs_callbacks(ctx, ngg_cull_shader);
1510 break;
1511 case PIPE_SHADER_TESS_CTRL:
1512 si_llvm_init_tcs_callbacks(ctx);
1513 break;
1514 case PIPE_SHADER_TESS_EVAL:
1515 si_llvm_init_tes_callbacks(ctx, ngg_cull_shader);
1516 break;
1517 case PIPE_SHADER_GEOMETRY:
1518 si_llvm_init_gs_callbacks(ctx);
1519 break;
1520 case PIPE_SHADER_FRAGMENT:
1521 si_llvm_init_ps_callbacks(ctx);
1522 break;
1523 case PIPE_SHADER_COMPUTE:
1524 ctx->abi.load_local_group_size = get_block_size;
1525 break;
1526 default:
1527 assert(!"Unsupported shader type");
1528 return false;
1529 }
1530
1531 si_create_function(ctx, ngg_cull_shader);
1532
1533 if (ctx->shader->key.as_es || ctx->type == PIPE_SHADER_GEOMETRY)
1534 si_preload_esgs_ring(ctx);
1535
1536 if (ctx->type == PIPE_SHADER_GEOMETRY)
1537 si_preload_gs_rings(ctx);
1538 else if (ctx->type == PIPE_SHADER_TESS_EVAL)
1539 si_llvm_preload_tes_rings(ctx);
1540
1541 if (ctx->type == PIPE_SHADER_TESS_CTRL &&
1542 sel->info.tessfactors_are_def_in_all_invocs) {
1543 for (unsigned i = 0; i < 6; i++) {
1544 ctx->invoc0_tess_factors[i] =
1545 ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
1546 }
1547 }
1548
1549 if (ctx->type == PIPE_SHADER_GEOMETRY) {
1550 for (unsigned i = 0; i < 4; i++) {
1551 ctx->gs_next_vertex[i] =
1552 ac_build_alloca(&ctx->ac, ctx->ac.i32, "");
1553 }
1554 if (shader->key.as_ngg) {
1555 for (unsigned i = 0; i < 4; ++i) {
1556 ctx->gs_curprim_verts[i] =
1557 ac_build_alloca(&ctx->ac, ctx->ac.i32, "");
1558 ctx->gs_generated_prims[i] =
1559 ac_build_alloca(&ctx->ac, ctx->ac.i32, "");
1560 }
1561
1562 unsigned scratch_size = 8;
1563 if (sel->so.num_outputs)
1564 scratch_size = 44;
1565
1566 assert(!ctx->gs_ngg_scratch);
1567 LLVMTypeRef ai32 = LLVMArrayType(ctx->ac.i32, scratch_size);
1568 ctx->gs_ngg_scratch = LLVMAddGlobalInAddressSpace(ctx->ac.module,
1569 ai32, "ngg_scratch", AC_ADDR_SPACE_LDS);
1570 LLVMSetInitializer(ctx->gs_ngg_scratch, LLVMGetUndef(ai32));
1571 LLVMSetAlignment(ctx->gs_ngg_scratch, 4);
1572
1573 ctx->gs_ngg_emit = LLVMAddGlobalInAddressSpace(ctx->ac.module,
1574 LLVMArrayType(ctx->ac.i32, 0), "ngg_emit", AC_ADDR_SPACE_LDS);
1575 LLVMSetLinkage(ctx->gs_ngg_emit, LLVMExternalLinkage);
1576 LLVMSetAlignment(ctx->gs_ngg_emit, 4);
1577 }
1578 }
1579
1580 if (ctx->type != PIPE_SHADER_GEOMETRY &&
1581 (shader->key.as_ngg && !shader->key.as_es)) {
1582 /* Unconditionally declare scratch space base for streamout and
1583 * vertex compaction. Whether space is actually allocated is
1584 * determined during linking / PM4 creation.
1585 *
1586 * Add an extra dword per vertex to ensure an odd stride, which
1587 * avoids bank conflicts for SoA accesses.
1588 */
1589 if (!gfx10_is_ngg_passthrough(shader))
1590 si_llvm_declare_esgs_ring(ctx);
1591
1592 /* This is really only needed when streamout and / or vertex
1593 * compaction is enabled.
1594 */
1595 if (!ctx->gs_ngg_scratch &&
1596 (sel->so.num_outputs || shader->key.opt.ngg_culling)) {
1597 LLVMTypeRef asi32 = LLVMArrayType(ctx->ac.i32, 8);
1598 ctx->gs_ngg_scratch = LLVMAddGlobalInAddressSpace(ctx->ac.module,
1599 asi32, "ngg_scratch", AC_ADDR_SPACE_LDS);
1600 LLVMSetInitializer(ctx->gs_ngg_scratch, LLVMGetUndef(asi32));
1601 LLVMSetAlignment(ctx->gs_ngg_scratch, 4);
1602 }
1603 }
1604
1605 /* For GFX9 merged shaders:
1606 * - Set EXEC for the first shader. If the prolog is present, set
1607 * EXEC there instead.
1608 * - Add a barrier before the second shader.
1609 * - In the second shader, reset EXEC to ~0 and wrap the main part in
1610 * an if-statement. This is required for correctness in geometry
1611 * shaders, to ensure that empty GS waves do not send GS_EMIT and
1612 * GS_CUT messages.
1613 *
1614 * For monolithic merged shaders, the first shader is wrapped in an
1615 * if-block together with its prolog in si_build_wrapper_function.
1616 *
1617 * NGG vertex and tess eval shaders running as the last
1618 * vertex/geometry stage handle execution explicitly using
1619 * if-statements.
1620 */
1621 if (ctx->screen->info.chip_class >= GFX9) {
1622 if (!shader->is_monolithic &&
1623 (shader->key.as_es || shader->key.as_ls) &&
1624 (ctx->type == PIPE_SHADER_TESS_EVAL ||
1625 (ctx->type == PIPE_SHADER_VERTEX &&
1626 !si_vs_needs_prolog(sel, &shader->key.part.vs.prolog,
1627 &shader->key, ngg_cull_shader)))) {
1628 si_init_exec_from_input(ctx,
1629 ctx->merged_wave_info, 0);
1630 } else if (ctx->type == PIPE_SHADER_TESS_CTRL ||
1631 ctx->type == PIPE_SHADER_GEOMETRY ||
1632 (shader->key.as_ngg && !shader->key.as_es)) {
1633 LLVMValueRef thread_enabled;
1634 bool nested_barrier;
1635
1636 if (!shader->is_monolithic ||
1637 (ctx->type == PIPE_SHADER_TESS_EVAL &&
1638 shader->key.as_ngg && !shader->key.as_es &&
1639 !shader->key.opt.ngg_culling))
1640 ac_init_exec_full_mask(&ctx->ac);
1641
1642 if ((ctx->type == PIPE_SHADER_VERTEX ||
1643 ctx->type == PIPE_SHADER_TESS_EVAL) &&
1644 shader->key.as_ngg && !shader->key.as_es &&
1645 !shader->key.opt.ngg_culling) {
1646 gfx10_ngg_build_sendmsg_gs_alloc_req(ctx);
1647
1648 /* Build the primitive export at the beginning
1649 * of the shader if possible.
1650 */
1651 if (gfx10_ngg_export_prim_early(shader))
1652 gfx10_ngg_build_export_prim(ctx, NULL, NULL);
1653 }
1654
1655 if (ctx->type == PIPE_SHADER_TESS_CTRL ||
1656 ctx->type == PIPE_SHADER_GEOMETRY) {
1657 if (ctx->type == PIPE_SHADER_GEOMETRY && shader->key.as_ngg) {
1658 gfx10_ngg_gs_emit_prologue(ctx);
1659 nested_barrier = false;
1660 } else {
1661 nested_barrier = true;
1662 }
1663
1664 thread_enabled = si_is_gs_thread(ctx);
1665 } else {
1666 thread_enabled = si_is_es_thread(ctx);
1667 nested_barrier = false;
1668 }
1669
1670 ctx->merged_wrap_if_entry_block = LLVMGetInsertBlock(ctx->ac.builder);
1671 ctx->merged_wrap_if_label = 11500;
1672 ac_build_ifcc(&ctx->ac, thread_enabled, ctx->merged_wrap_if_label);
1673
1674 if (nested_barrier) {
1675 /* Execute a barrier before the second shader in
1676 * a merged shader.
1677 *
1678 * Execute the barrier inside the conditional block,
1679 * so that empty waves can jump directly to s_endpgm,
1680 * which will also signal the barrier.
1681 *
1682 * This is possible in gfx9, because an empty wave
1683 * for the second shader does not participate in
1684 * the epilogue. With NGG, empty waves may still
1685 * be required to export data (e.g. GS output vertices),
1686 * so we cannot let them exit early.
1687 *
1688 * If the shader is TCS and the TCS epilog is present
1689 * and contains a barrier, it will wait there and then
1690 * reach s_endpgm.
1691 */
1692 si_llvm_emit_barrier(ctx);
1693 }
1694 }
1695 }
1696
1697 if (sel->force_correct_derivs_after_kill) {
1698 ctx->postponed_kill = ac_build_alloca_undef(&ctx->ac, ctx->ac.i1, "");
1699 /* true = don't kill. */
1700 LLVMBuildStore(ctx->ac.builder, ctx->ac.i1true,
1701 ctx->postponed_kill);
1702 }
1703
1704 bool success = si_nir_build_llvm(ctx, nir);
1705 if (free_nir)
1706 ralloc_free(nir);
1707 if (!success) {
1708 fprintf(stderr, "Failed to translate shader from NIR to LLVM\n");
1709 return false;
1710 }
1711
1712 si_llvm_build_ret(ctx, ctx->return_value);
1713 return true;
1714 }
1715
1716 /**
1717 * Compute the VS prolog key, which contains all the information needed to
1718 * build the VS prolog function, and set shader->info bits where needed.
1719 *
1720 * \param info Shader info of the vertex shader.
1721 * \param num_input_sgprs Number of input SGPRs for the vertex shader.
1722 * \param has_old_ Whether the preceding shader part is the NGG cull shader.
1723 * \param prolog_key Key of the VS prolog
1724 * \param shader_out The vertex shader, or the next shader if merging LS+HS or ES+GS.
1725 * \param key Output shader part key.
1726 */
1727 static void si_get_vs_prolog_key(const struct si_shader_info *info,
1728 unsigned num_input_sgprs,
1729 bool ngg_cull_shader,
1730 const struct si_vs_prolog_bits *prolog_key,
1731 struct si_shader *shader_out,
1732 union si_shader_part_key *key)
1733 {
1734 memset(key, 0, sizeof(*key));
1735 key->vs_prolog.states = *prolog_key;
1736 key->vs_prolog.num_input_sgprs = num_input_sgprs;
1737 key->vs_prolog.num_inputs = info->num_inputs;
1738 key->vs_prolog.as_ls = shader_out->key.as_ls;
1739 key->vs_prolog.as_es = shader_out->key.as_es;
1740 key->vs_prolog.as_ngg = shader_out->key.as_ngg;
1741
1742 if (ngg_cull_shader) {
1743 key->vs_prolog.gs_fast_launch_tri_list = !!(shader_out->key.opt.ngg_culling &
1744 SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST);
1745 key->vs_prolog.gs_fast_launch_tri_strip = !!(shader_out->key.opt.ngg_culling &
1746 SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP);
1747 } else {
1748 key->vs_prolog.has_ngg_cull_inputs = !!shader_out->key.opt.ngg_culling;
1749 }
1750
1751 if (shader_out->selector->type == PIPE_SHADER_TESS_CTRL) {
1752 key->vs_prolog.as_ls = 1;
1753 key->vs_prolog.num_merged_next_stage_vgprs = 2;
1754 } else if (shader_out->selector->type == PIPE_SHADER_GEOMETRY) {
1755 key->vs_prolog.as_es = 1;
1756 key->vs_prolog.num_merged_next_stage_vgprs = 5;
1757 } else if (shader_out->key.as_ngg) {
1758 key->vs_prolog.num_merged_next_stage_vgprs = 5;
1759 }
1760
1761 /* Enable loading the InstanceID VGPR. */
1762 uint16_t input_mask = u_bit_consecutive(0, info->num_inputs);
1763
1764 if ((key->vs_prolog.states.instance_divisor_is_one |
1765 key->vs_prolog.states.instance_divisor_is_fetched) & input_mask)
1766 shader_out->info.uses_instanceid = true;
1767 }
1768
1769 /**
1770 * Given a list of shader part functions, build a wrapper function that
1771 * runs them in sequence to form a monolithic shader.
1772 */
1773 void si_build_wrapper_function(struct si_shader_context *ctx, LLVMValueRef *parts,
1774 unsigned num_parts, unsigned main_part,
1775 unsigned next_shader_first_part)
1776 {
1777 LLVMBuilderRef builder = ctx->ac.builder;
1778 /* PS epilog has one arg per color component; gfx9 merged shader
1779 * prologs need to forward 40 SGPRs.
1780 */
1781 LLVMValueRef initial[AC_MAX_ARGS], out[AC_MAX_ARGS];
1782 LLVMTypeRef function_type;
1783 unsigned num_first_params;
1784 unsigned num_out, initial_num_out;
1785 ASSERTED unsigned num_out_sgpr; /* used in debug checks */
1786 ASSERTED unsigned initial_num_out_sgpr; /* used in debug checks */
1787 unsigned num_sgprs, num_vgprs;
1788 unsigned gprs;
1789
1790 memset(&ctx->args, 0, sizeof(ctx->args));
1791
1792 for (unsigned i = 0; i < num_parts; ++i) {
1793 ac_add_function_attr(ctx->ac.context, parts[i], -1,
1794 AC_FUNC_ATTR_ALWAYSINLINE);
1795 LLVMSetLinkage(parts[i], LLVMPrivateLinkage);
1796 }
1797
1798 /* The parameters of the wrapper function correspond to those of the
1799 * first part in terms of SGPRs and VGPRs, but we use the types of the
1800 * main part to get the right types. This is relevant for the
1801 * dereferenceable attribute on descriptor table pointers.
1802 */
1803 num_sgprs = 0;
1804 num_vgprs = 0;
1805
1806 function_type = LLVMGetElementType(LLVMTypeOf(parts[0]));
1807 num_first_params = LLVMCountParamTypes(function_type);
1808
1809 for (unsigned i = 0; i < num_first_params; ++i) {
1810 LLVMValueRef param = LLVMGetParam(parts[0], i);
1811
1812 if (ac_is_sgpr_param(param)) {
1813 assert(num_vgprs == 0);
1814 num_sgprs += ac_get_type_size(LLVMTypeOf(param)) / 4;
1815 } else {
1816 num_vgprs += ac_get_type_size(LLVMTypeOf(param)) / 4;
1817 }
1818 }
1819
1820 gprs = 0;
1821 while (gprs < num_sgprs + num_vgprs) {
1822 LLVMValueRef param = LLVMGetParam(parts[main_part], ctx->args.arg_count);
1823 LLVMTypeRef type = LLVMTypeOf(param);
1824 unsigned size = ac_get_type_size(type) / 4;
1825
1826 /* This is going to get casted anyways, so we don't have to
1827 * have the exact same type. But we do have to preserve the
1828 * pointer-ness so that LLVM knows about it.
1829 */
1830 enum ac_arg_type arg_type = AC_ARG_INT;
1831 if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) {
1832 type = LLVMGetElementType(type);
1833
1834 if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) {
1835 if (LLVMGetVectorSize(type) == 4)
1836 arg_type = AC_ARG_CONST_DESC_PTR;
1837 else if (LLVMGetVectorSize(type) == 8)
1838 arg_type = AC_ARG_CONST_IMAGE_PTR;
1839 else
1840 assert(0);
1841 } else if (type == ctx->ac.f32) {
1842 arg_type = AC_ARG_CONST_FLOAT_PTR;
1843 } else {
1844 assert(0);
1845 }
1846 }
1847
1848 ac_add_arg(&ctx->args, gprs < num_sgprs ? AC_ARG_SGPR : AC_ARG_VGPR,
1849 size, arg_type, NULL);
1850
1851 assert(ac_is_sgpr_param(param) == (gprs < num_sgprs));
1852 assert(gprs + size <= num_sgprs + num_vgprs &&
1853 (gprs >= num_sgprs || gprs + size <= num_sgprs));
1854
1855 gprs += size;
1856 }
1857
1858 /* Prepare the return type. */
1859 unsigned num_returns = 0;
1860 LLVMTypeRef returns[AC_MAX_ARGS], last_func_type, return_type;
1861
1862 last_func_type = LLVMGetElementType(LLVMTypeOf(parts[num_parts - 1]));
1863 return_type = LLVMGetReturnType(last_func_type);
1864
1865 switch (LLVMGetTypeKind(return_type)) {
1866 case LLVMStructTypeKind:
1867 num_returns = LLVMCountStructElementTypes(return_type);
1868 assert(num_returns <= ARRAY_SIZE(returns));
1869 LLVMGetStructElementTypes(return_type, returns);
1870 break;
1871 case LLVMVoidTypeKind:
1872 break;
1873 default:
1874 unreachable("unexpected type");
1875 }
1876
1877 si_llvm_create_func(ctx, "wrapper", returns, num_returns,
1878 si_get_max_workgroup_size(ctx->shader));
1879
1880 if (si_is_merged_shader(ctx))
1881 ac_init_exec_full_mask(&ctx->ac);
1882
1883 /* Record the arguments of the function as if they were an output of
1884 * a previous part.
1885 */
1886 num_out = 0;
1887 num_out_sgpr = 0;
1888
1889 for (unsigned i = 0; i < ctx->args.arg_count; ++i) {
1890 LLVMValueRef param = LLVMGetParam(ctx->main_fn, i);
1891 LLVMTypeRef param_type = LLVMTypeOf(param);
1892 LLVMTypeRef out_type = ctx->args.args[i].file == AC_ARG_SGPR ? ctx->ac.i32 : ctx->ac.f32;
1893 unsigned size = ac_get_type_size(param_type) / 4;
1894
1895 if (size == 1) {
1896 if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
1897 param = LLVMBuildPtrToInt(builder, param, ctx->ac.i32, "");
1898 param_type = ctx->ac.i32;
1899 }
1900
1901 if (param_type != out_type)
1902 param = LLVMBuildBitCast(builder, param, out_type, "");
1903 out[num_out++] = param;
1904 } else {
1905 LLVMTypeRef vector_type = LLVMVectorType(out_type, size);
1906
1907 if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
1908 param = LLVMBuildPtrToInt(builder, param, ctx->ac.i64, "");
1909 param_type = ctx->ac.i64;
1910 }
1911
1912 if (param_type != vector_type)
1913 param = LLVMBuildBitCast(builder, param, vector_type, "");
1914
1915 for (unsigned j = 0; j < size; ++j)
1916 out[num_out++] = LLVMBuildExtractElement(
1917 builder, param, LLVMConstInt(ctx->ac.i32, j, 0), "");
1918 }
1919
1920 if (ctx->args.args[i].file == AC_ARG_SGPR)
1921 num_out_sgpr = num_out;
1922 }
1923
1924 memcpy(initial, out, sizeof(out));
1925 initial_num_out = num_out;
1926 initial_num_out_sgpr = num_out_sgpr;
1927
1928 /* Now chain the parts. */
1929 LLVMValueRef ret = NULL;
1930 for (unsigned part = 0; part < num_parts; ++part) {
1931 LLVMValueRef in[AC_MAX_ARGS];
1932 LLVMTypeRef ret_type;
1933 unsigned out_idx = 0;
1934 unsigned num_params = LLVMCountParams(parts[part]);
1935
1936 /* Merged shaders are executed conditionally depending
1937 * on the number of enabled threads passed in the input SGPRs. */
1938 if (is_multi_part_shader(ctx) && part == 0) {
1939 LLVMValueRef ena, count = initial[3];
1940
1941 count = LLVMBuildAnd(builder, count,
1942 LLVMConstInt(ctx->ac.i32, 0x7f, 0), "");
1943 ena = LLVMBuildICmp(builder, LLVMIntULT,
1944 ac_get_thread_id(&ctx->ac), count, "");
1945 ac_build_ifcc(&ctx->ac, ena, 6506);
1946 }
1947
1948 /* Derive arguments for the next part from outputs of the
1949 * previous one.
1950 */
1951 for (unsigned param_idx = 0; param_idx < num_params; ++param_idx) {
1952 LLVMValueRef param;
1953 LLVMTypeRef param_type;
1954 bool is_sgpr;
1955 unsigned param_size;
1956 LLVMValueRef arg = NULL;
1957
1958 param = LLVMGetParam(parts[part], param_idx);
1959 param_type = LLVMTypeOf(param);
1960 param_size = ac_get_type_size(param_type) / 4;
1961 is_sgpr = ac_is_sgpr_param(param);
1962
1963 if (is_sgpr) {
1964 ac_add_function_attr(ctx->ac.context, parts[part],
1965 param_idx + 1, AC_FUNC_ATTR_INREG);
1966 } else if (out_idx < num_out_sgpr) {
1967 /* Skip returned SGPRs the current part doesn't
1968 * declare on the input. */
1969 out_idx = num_out_sgpr;
1970 }
1971
1972 assert(out_idx + param_size <= (is_sgpr ? num_out_sgpr : num_out));
1973
1974 if (param_size == 1)
1975 arg = out[out_idx];
1976 else
1977 arg = ac_build_gather_values(&ctx->ac, &out[out_idx], param_size);
1978
1979 if (LLVMTypeOf(arg) != param_type) {
1980 if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
1981 if (LLVMGetPointerAddressSpace(param_type) ==
1982 AC_ADDR_SPACE_CONST_32BIT) {
1983 arg = LLVMBuildBitCast(builder, arg, ctx->ac.i32, "");
1984 arg = LLVMBuildIntToPtr(builder, arg, param_type, "");
1985 } else {
1986 arg = LLVMBuildBitCast(builder, arg, ctx->ac.i64, "");
1987 arg = LLVMBuildIntToPtr(builder, arg, param_type, "");
1988 }
1989 } else {
1990 arg = LLVMBuildBitCast(builder, arg, param_type, "");
1991 }
1992 }
1993
1994 in[param_idx] = arg;
1995 out_idx += param_size;
1996 }
1997
1998 ret = ac_build_call(&ctx->ac, parts[part], in, num_params);
1999
2000 if (is_multi_part_shader(ctx) &&
2001 part + 1 == next_shader_first_part) {
2002 ac_build_endif(&ctx->ac, 6506);
2003
2004 /* The second half of the merged shader should use
2005 * the inputs from the toplevel (wrapper) function,
2006 * not the return value from the last call.
2007 *
2008 * That's because the last call was executed condi-
2009 * tionally, so we can't consume it in the main
2010 * block.
2011 */
2012 memcpy(out, initial, sizeof(initial));
2013 num_out = initial_num_out;
2014 num_out_sgpr = initial_num_out_sgpr;
2015 continue;
2016 }
2017
2018 /* Extract the returned GPRs. */
2019 ret_type = LLVMTypeOf(ret);
2020 num_out = 0;
2021 num_out_sgpr = 0;
2022
2023 if (LLVMGetTypeKind(ret_type) != LLVMVoidTypeKind) {
2024 assert(LLVMGetTypeKind(ret_type) == LLVMStructTypeKind);
2025
2026 unsigned ret_size = LLVMCountStructElementTypes(ret_type);
2027
2028 for (unsigned i = 0; i < ret_size; ++i) {
2029 LLVMValueRef val =
2030 LLVMBuildExtractValue(builder, ret, i, "");
2031
2032 assert(num_out < ARRAY_SIZE(out));
2033 out[num_out++] = val;
2034
2035 if (LLVMTypeOf(val) == ctx->ac.i32) {
2036 assert(num_out_sgpr + 1 == num_out);
2037 num_out_sgpr = num_out;
2038 }
2039 }
2040 }
2041 }
2042
2043 /* Return the value from the last part. */
2044 if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind)
2045 LLVMBuildRetVoid(builder);
2046 else
2047 LLVMBuildRet(builder, ret);
2048 }
2049
2050 static bool si_should_optimize_less(struct ac_llvm_compiler *compiler,
2051 struct si_shader_selector *sel)
2052 {
2053 if (!compiler->low_opt_passes)
2054 return false;
2055
2056 /* Assume a slow CPU. */
2057 assert(!sel->screen->info.has_dedicated_vram &&
2058 sel->screen->info.chip_class <= GFX8);
2059
2060 /* For a crazy dEQP test containing 2597 memory opcodes, mostly
2061 * buffer stores. */
2062 return sel->type == PIPE_SHADER_COMPUTE &&
2063 sel->info.num_memory_instructions > 1000;
2064 }
2065
2066 static struct nir_shader *get_nir_shader(struct si_shader_selector *sel,
2067 bool *free_nir)
2068 {
2069 *free_nir = false;
2070
2071 if (sel->nir) {
2072 return sel->nir;
2073 } else if (sel->nir_binary) {
2074 struct pipe_screen *screen = &sel->screen->b;
2075 const void *options =
2076 screen->get_compiler_options(screen, PIPE_SHADER_IR_NIR,
2077 sel->type);
2078
2079 struct blob_reader blob_reader;
2080 blob_reader_init(&blob_reader, sel->nir_binary, sel->nir_size);
2081 *free_nir = true;
2082 return nir_deserialize(NULL, options, &blob_reader);
2083 }
2084 return NULL;
2085 }
2086
2087 /* Set the context to a certain shader. Can be called repeatedly
2088 * to change the shader. */
2089 static void si_shader_context_set_ir(struct si_shader_context *ctx,
2090 struct si_shader *shader)
2091 {
2092 struct si_shader_selector *sel = shader->selector;
2093 const struct si_shader_info *info = &sel->info;
2094
2095 ctx->shader = shader;
2096 ctx->type = sel->type;
2097
2098 ctx->num_const_buffers = util_last_bit(info->const_buffers_declared);
2099 ctx->num_shader_buffers = util_last_bit(info->shader_buffers_declared);
2100
2101 ctx->num_samplers = util_last_bit(info->samplers_declared);
2102 ctx->num_images = util_last_bit(info->images_declared);
2103 }
2104
2105 int si_compile_shader(struct si_screen *sscreen,
2106 struct ac_llvm_compiler *compiler,
2107 struct si_shader *shader,
2108 struct pipe_debug_callback *debug)
2109 {
2110 struct si_shader_selector *sel = shader->selector;
2111 struct si_shader_context ctx;
2112 bool free_nir;
2113 struct nir_shader *nir = get_nir_shader(sel, &free_nir);
2114 int r = -1;
2115
2116 /* Dump NIR before doing NIR->LLVM conversion in case the
2117 * conversion fails. */
2118 if (si_can_dump_shader(sscreen, sel->type) &&
2119 !(sscreen->debug_flags & DBG(NO_NIR))) {
2120 nir_print_shader(nir, stderr);
2121 si_dump_streamout(&sel->so);
2122 }
2123
2124 si_llvm_context_init(&ctx, sscreen, compiler, si_get_shader_wave_size(shader));
2125 si_shader_context_set_ir(&ctx, shader);
2126
2127 memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
2128 sizeof(shader->info.vs_output_param_offset));
2129
2130 shader->info.uses_instanceid = sel->info.uses_instanceid;
2131
2132 LLVMValueRef ngg_cull_main_fn = NULL;
2133 if (ctx.shader->key.opt.ngg_culling) {
2134 if (!si_build_main_function(&ctx, nir, false, true)) {
2135 si_llvm_dispose(&ctx);
2136 return -1;
2137 }
2138 ngg_cull_main_fn = ctx.main_fn;
2139 ctx.main_fn = NULL;
2140 /* Re-set the IR. */
2141 si_shader_context_set_ir(&ctx, shader);
2142 }
2143
2144 if (!si_build_main_function(&ctx, nir, free_nir, false)) {
2145 si_llvm_dispose(&ctx);
2146 return -1;
2147 }
2148
2149 if (shader->is_monolithic && ctx.type == PIPE_SHADER_VERTEX) {
2150 LLVMValueRef parts[4];
2151 unsigned num_parts = 0;
2152 bool has_prolog = false;
2153 LLVMValueRef main_fn = ctx.main_fn;
2154
2155 if (ngg_cull_main_fn) {
2156 if (si_vs_needs_prolog(sel, &shader->key.part.vs.prolog,
2157 &shader->key, true)) {
2158 union si_shader_part_key prolog_key;
2159 si_get_vs_prolog_key(&sel->info,
2160 shader->info.num_input_sgprs,
2161 true,
2162 &shader->key.part.vs.prolog,
2163 shader, &prolog_key);
2164 prolog_key.vs_prolog.is_monolithic = true;
2165 si_llvm_build_vs_prolog(&ctx, &prolog_key);
2166 parts[num_parts++] = ctx.main_fn;
2167 has_prolog = true;
2168 }
2169 parts[num_parts++] = ngg_cull_main_fn;
2170 }
2171
2172 if (si_vs_needs_prolog(sel, &shader->key.part.vs.prolog,
2173 &shader->key, false)) {
2174 union si_shader_part_key prolog_key;
2175 si_get_vs_prolog_key(&sel->info,
2176 shader->info.num_input_sgprs,
2177 false,
2178 &shader->key.part.vs.prolog,
2179 shader, &prolog_key);
2180 prolog_key.vs_prolog.is_monolithic = true;
2181 si_llvm_build_vs_prolog(&ctx, &prolog_key);
2182 parts[num_parts++] = ctx.main_fn;
2183 has_prolog = true;
2184 }
2185 parts[num_parts++] = main_fn;
2186
2187 si_build_wrapper_function(&ctx, parts, num_parts,
2188 has_prolog ? 1 : 0, 0);
2189
2190 if (ctx.shader->key.opt.vs_as_prim_discard_cs)
2191 si_build_prim_discard_compute_shader(&ctx);
2192 } else if (shader->is_monolithic && ctx.type == PIPE_SHADER_TESS_EVAL &&
2193 ngg_cull_main_fn) {
2194 LLVMValueRef parts[2];
2195
2196 parts[0] = ngg_cull_main_fn;
2197 parts[1] = ctx.main_fn;
2198
2199 si_build_wrapper_function(&ctx, parts, 2, 0, 0);
2200 } else if (shader->is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) {
2201 if (sscreen->info.chip_class >= GFX9) {
2202 struct si_shader_selector *ls = shader->key.part.tcs.ls;
2203 LLVMValueRef parts[4];
2204 bool vs_needs_prolog =
2205 si_vs_needs_prolog(ls, &shader->key.part.tcs.ls_prolog,
2206 &shader->key, false);
2207
2208 /* TCS main part */
2209 parts[2] = ctx.main_fn;
2210
2211 /* TCS epilog */
2212 union si_shader_part_key tcs_epilog_key;
2213 memset(&tcs_epilog_key, 0, sizeof(tcs_epilog_key));
2214 tcs_epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
2215 si_llvm_build_tcs_epilog(&ctx, &tcs_epilog_key);
2216 parts[3] = ctx.main_fn;
2217
2218 /* VS as LS main part */
2219 nir = get_nir_shader(ls, &free_nir);
2220 struct si_shader shader_ls = {};
2221 shader_ls.selector = ls;
2222 shader_ls.key.as_ls = 1;
2223 shader_ls.key.mono = shader->key.mono;
2224 shader_ls.key.opt = shader->key.opt;
2225 shader_ls.is_monolithic = true;
2226 si_shader_context_set_ir(&ctx, &shader_ls);
2227
2228 if (!si_build_main_function(&ctx, nir, free_nir, false)) {
2229 si_llvm_dispose(&ctx);
2230 return -1;
2231 }
2232 shader->info.uses_instanceid |= ls->info.uses_instanceid;
2233 parts[1] = ctx.main_fn;
2234
2235 /* LS prolog */
2236 if (vs_needs_prolog) {
2237 union si_shader_part_key vs_prolog_key;
2238 si_get_vs_prolog_key(&ls->info,
2239 shader_ls.info.num_input_sgprs,
2240 false,
2241 &shader->key.part.tcs.ls_prolog,
2242 shader, &vs_prolog_key);
2243 vs_prolog_key.vs_prolog.is_monolithic = true;
2244 si_llvm_build_vs_prolog(&ctx, &vs_prolog_key);
2245 parts[0] = ctx.main_fn;
2246 }
2247
2248 /* Reset the shader context. */
2249 ctx.shader = shader;
2250 ctx.type = PIPE_SHADER_TESS_CTRL;
2251
2252 si_build_wrapper_function(&ctx,
2253 parts + !vs_needs_prolog,
2254 4 - !vs_needs_prolog, vs_needs_prolog,
2255 vs_needs_prolog ? 2 : 1);
2256 } else {
2257 LLVMValueRef parts[2];
2258 union si_shader_part_key epilog_key;
2259
2260 parts[0] = ctx.main_fn;
2261
2262 memset(&epilog_key, 0, sizeof(epilog_key));
2263 epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
2264 si_llvm_build_tcs_epilog(&ctx, &epilog_key);
2265 parts[1] = ctx.main_fn;
2266
2267 si_build_wrapper_function(&ctx, parts, 2, 0, 0);
2268 }
2269 } else if (shader->is_monolithic && ctx.type == PIPE_SHADER_GEOMETRY) {
2270 if (ctx.screen->info.chip_class >= GFX9) {
2271 struct si_shader_selector *es = shader->key.part.gs.es;
2272 LLVMValueRef es_prolog = NULL;
2273 LLVMValueRef es_main = NULL;
2274 LLVMValueRef gs_prolog = NULL;
2275 LLVMValueRef gs_main = ctx.main_fn;
2276
2277 /* GS prolog */
2278 union si_shader_part_key gs_prolog_key;
2279 memset(&gs_prolog_key, 0, sizeof(gs_prolog_key));
2280 gs_prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
2281 gs_prolog_key.gs_prolog.is_monolithic = true;
2282 gs_prolog_key.gs_prolog.as_ngg = shader->key.as_ngg;
2283 si_llvm_build_gs_prolog(&ctx, &gs_prolog_key);
2284 gs_prolog = ctx.main_fn;
2285
2286 /* ES main part */
2287 nir = get_nir_shader(es, &free_nir);
2288 struct si_shader shader_es = {};
2289 shader_es.selector = es;
2290 shader_es.key.as_es = 1;
2291 shader_es.key.as_ngg = shader->key.as_ngg;
2292 shader_es.key.mono = shader->key.mono;
2293 shader_es.key.opt = shader->key.opt;
2294 shader_es.is_monolithic = true;
2295 si_shader_context_set_ir(&ctx, &shader_es);
2296
2297 if (!si_build_main_function(&ctx, nir, free_nir, false)) {
2298 si_llvm_dispose(&ctx);
2299 return -1;
2300 }
2301 shader->info.uses_instanceid |= es->info.uses_instanceid;
2302 es_main = ctx.main_fn;
2303
2304 /* ES prolog */
2305 if (es->type == PIPE_SHADER_VERTEX &&
2306 si_vs_needs_prolog(es, &shader->key.part.gs.vs_prolog,
2307 &shader->key, false)) {
2308 union si_shader_part_key vs_prolog_key;
2309 si_get_vs_prolog_key(&es->info,
2310 shader_es.info.num_input_sgprs,
2311 false,
2312 &shader->key.part.gs.vs_prolog,
2313 shader, &vs_prolog_key);
2314 vs_prolog_key.vs_prolog.is_monolithic = true;
2315 si_llvm_build_vs_prolog(&ctx, &vs_prolog_key);
2316 es_prolog = ctx.main_fn;
2317 }
2318
2319 /* Reset the shader context. */
2320 ctx.shader = shader;
2321 ctx.type = PIPE_SHADER_GEOMETRY;
2322
2323 /* Prepare the array of shader parts. */
2324 LLVMValueRef parts[4];
2325 unsigned num_parts = 0, main_part, next_first_part;
2326
2327 if (es_prolog)
2328 parts[num_parts++] = es_prolog;
2329
2330 parts[main_part = num_parts++] = es_main;
2331 parts[next_first_part = num_parts++] = gs_prolog;
2332 parts[num_parts++] = gs_main;
2333
2334 si_build_wrapper_function(&ctx, parts, num_parts,
2335 main_part, next_first_part);
2336 } else {
2337 LLVMValueRef parts[2];
2338 union si_shader_part_key prolog_key;
2339
2340 parts[1] = ctx.main_fn;
2341
2342 memset(&prolog_key, 0, sizeof(prolog_key));
2343 prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
2344 si_llvm_build_gs_prolog(&ctx, &prolog_key);
2345 parts[0] = ctx.main_fn;
2346
2347 si_build_wrapper_function(&ctx, parts, 2, 1, 0);
2348 }
2349 } else if (shader->is_monolithic && ctx.type == PIPE_SHADER_FRAGMENT) {
2350 si_llvm_build_monolithic_ps(&ctx, shader);
2351 }
2352
2353 si_llvm_optimize_module(&ctx);
2354
2355 /* Post-optimization transformations and analysis. */
2356 si_optimize_vs_outputs(&ctx);
2357
2358 if ((debug && debug->debug_message) ||
2359 si_can_dump_shader(sscreen, ctx.type)) {
2360 ctx.shader->info.private_mem_vgprs =
2361 ac_count_scratch_private_memory(ctx.main_fn);
2362 }
2363
2364 /* Make sure the input is a pointer and not integer followed by inttoptr. */
2365 assert(LLVMGetTypeKind(LLVMTypeOf(LLVMGetParam(ctx.main_fn, 0))) ==
2366 LLVMPointerTypeKind);
2367
2368 /* Compile to bytecode. */
2369 r = si_compile_llvm(sscreen, &shader->binary, &shader->config, compiler,
2370 &ctx.ac, debug, ctx.type, si_get_shader_name(shader),
2371 si_should_optimize_less(compiler, shader->selector));
2372 si_llvm_dispose(&ctx);
2373 if (r) {
2374 fprintf(stderr, "LLVM failed to compile shader\n");
2375 return r;
2376 }
2377
2378 /* Validate SGPR and VGPR usage for compute to detect compiler bugs.
2379 * LLVM 3.9svn has this bug.
2380 */
2381 if (sel->type == PIPE_SHADER_COMPUTE) {
2382 unsigned wave_size = sscreen->compute_wave_size;
2383 unsigned max_vgprs = sscreen->info.num_physical_wave64_vgprs_per_simd *
2384 (wave_size == 32 ? 2 : 1);
2385 unsigned max_sgprs = sscreen->info.num_physical_sgprs_per_simd;
2386 unsigned max_sgprs_per_wave = 128;
2387 unsigned simds_per_tg = 4; /* assuming WGP mode on gfx10 */
2388 unsigned threads_per_tg = si_get_max_workgroup_size(shader);
2389 unsigned waves_per_tg = DIV_ROUND_UP(threads_per_tg, wave_size);
2390 unsigned waves_per_simd = DIV_ROUND_UP(waves_per_tg, simds_per_tg);
2391
2392 max_vgprs = max_vgprs / waves_per_simd;
2393 max_sgprs = MIN2(max_sgprs / waves_per_simd, max_sgprs_per_wave);
2394
2395 if (shader->config.num_sgprs > max_sgprs ||
2396 shader->config.num_vgprs > max_vgprs) {
2397 fprintf(stderr, "LLVM failed to compile a shader correctly: "
2398 "SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n",
2399 shader->config.num_sgprs, shader->config.num_vgprs,
2400 max_sgprs, max_vgprs);
2401
2402 /* Just terminate the process, because dependent
2403 * shaders can hang due to bad input data, but use
2404 * the env var to allow shader-db to work.
2405 */
2406 if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false))
2407 abort();
2408 }
2409 }
2410
2411 /* Add the scratch offset to input SGPRs. */
2412 if (shader->config.scratch_bytes_per_wave && !si_is_merged_shader(&ctx))
2413 shader->info.num_input_sgprs += 1; /* scratch byte offset */
2414
2415 /* Calculate the number of fragment input VGPRs. */
2416 if (ctx.type == PIPE_SHADER_FRAGMENT) {
2417 shader->info.num_input_vgprs = ac_get_fs_input_vgpr_cnt(&shader->config,
2418 &shader->info.face_vgpr_index,
2419 &shader->info.ancillary_vgpr_index);
2420 }
2421
2422 si_calculate_max_simd_waves(shader);
2423 si_shader_dump_stats_for_shader_db(sscreen, shader, debug);
2424 return 0;
2425 }
2426
2427 /**
2428 * Create, compile and return a shader part (prolog or epilog).
2429 *
2430 * \param sscreen screen
2431 * \param list list of shader parts of the same category
2432 * \param type shader type
2433 * \param key shader part key
2434 * \param prolog whether the part being requested is a prolog
2435 * \param tm LLVM target machine
2436 * \param debug debug callback
2437 * \param build the callback responsible for building the main function
2438 * \return non-NULL on success
2439 */
2440 static struct si_shader_part *
2441 si_get_shader_part(struct si_screen *sscreen,
2442 struct si_shader_part **list,
2443 enum pipe_shader_type type,
2444 bool prolog,
2445 union si_shader_part_key *key,
2446 struct ac_llvm_compiler *compiler,
2447 struct pipe_debug_callback *debug,
2448 void (*build)(struct si_shader_context *,
2449 union si_shader_part_key *),
2450 const char *name)
2451 {
2452 struct si_shader_part *result;
2453
2454 simple_mtx_lock(&sscreen->shader_parts_mutex);
2455
2456 /* Find existing. */
2457 for (result = *list; result; result = result->next) {
2458 if (memcmp(&result->key, key, sizeof(*key)) == 0) {
2459 simple_mtx_unlock(&sscreen->shader_parts_mutex);
2460 return result;
2461 }
2462 }
2463
2464 /* Compile a new one. */
2465 result = CALLOC_STRUCT(si_shader_part);
2466 result->key = *key;
2467
2468 struct si_shader shader = {};
2469
2470 switch (type) {
2471 case PIPE_SHADER_VERTEX:
2472 shader.key.as_ls = key->vs_prolog.as_ls;
2473 shader.key.as_es = key->vs_prolog.as_es;
2474 shader.key.as_ngg = key->vs_prolog.as_ngg;
2475 break;
2476 case PIPE_SHADER_TESS_CTRL:
2477 assert(!prolog);
2478 shader.key.part.tcs.epilog = key->tcs_epilog.states;
2479 break;
2480 case PIPE_SHADER_GEOMETRY:
2481 assert(prolog);
2482 shader.key.as_ngg = key->gs_prolog.as_ngg;
2483 break;
2484 case PIPE_SHADER_FRAGMENT:
2485 if (prolog)
2486 shader.key.part.ps.prolog = key->ps_prolog.states;
2487 else
2488 shader.key.part.ps.epilog = key->ps_epilog.states;
2489 break;
2490 default:
2491 unreachable("bad shader part");
2492 }
2493
2494 struct si_shader_context ctx;
2495 si_llvm_context_init(&ctx, sscreen, compiler,
2496 si_get_wave_size(sscreen, type, shader.key.as_ngg,
2497 shader.key.as_es));
2498 ctx.shader = &shader;
2499 ctx.type = type;
2500
2501 build(&ctx, key);
2502
2503 /* Compile. */
2504 si_llvm_optimize_module(&ctx);
2505
2506 if (si_compile_llvm(sscreen, &result->binary, &result->config, compiler,
2507 &ctx.ac, debug, ctx.type, name, false)) {
2508 FREE(result);
2509 result = NULL;
2510 goto out;
2511 }
2512
2513 result->next = *list;
2514 *list = result;
2515
2516 out:
2517 si_llvm_dispose(&ctx);
2518 simple_mtx_unlock(&sscreen->shader_parts_mutex);
2519 return result;
2520 }
2521
2522 static bool si_get_vs_prolog(struct si_screen *sscreen,
2523 struct ac_llvm_compiler *compiler,
2524 struct si_shader *shader,
2525 struct pipe_debug_callback *debug,
2526 struct si_shader *main_part,
2527 const struct si_vs_prolog_bits *key)
2528 {
2529 struct si_shader_selector *vs = main_part->selector;
2530
2531 if (!si_vs_needs_prolog(vs, key, &shader->key, false))
2532 return true;
2533
2534 /* Get the prolog. */
2535 union si_shader_part_key prolog_key;
2536 si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs, false,
2537 key, shader, &prolog_key);
2538
2539 shader->prolog =
2540 si_get_shader_part(sscreen, &sscreen->vs_prologs,
2541 PIPE_SHADER_VERTEX, true, &prolog_key, compiler,
2542 debug, si_llvm_build_vs_prolog,
2543 "Vertex Shader Prolog");
2544 return shader->prolog != NULL;
2545 }
2546
2547 /**
2548 * Select and compile (or reuse) vertex shader parts (prolog & epilog).
2549 */
2550 static bool si_shader_select_vs_parts(struct si_screen *sscreen,
2551 struct ac_llvm_compiler *compiler,
2552 struct si_shader *shader,
2553 struct pipe_debug_callback *debug)
2554 {
2555 return si_get_vs_prolog(sscreen, compiler, shader, debug, shader,
2556 &shader->key.part.vs.prolog);
2557 }
2558
2559 /**
2560 * Select and compile (or reuse) TCS parts (epilog).
2561 */
2562 static bool si_shader_select_tcs_parts(struct si_screen *sscreen,
2563 struct ac_llvm_compiler *compiler,
2564 struct si_shader *shader,
2565 struct pipe_debug_callback *debug)
2566 {
2567 if (sscreen->info.chip_class >= GFX9) {
2568 struct si_shader *ls_main_part =
2569 shader->key.part.tcs.ls->main_shader_part_ls;
2570
2571 if (!si_get_vs_prolog(sscreen, compiler, shader, debug, ls_main_part,
2572 &shader->key.part.tcs.ls_prolog))
2573 return false;
2574
2575 shader->previous_stage = ls_main_part;
2576 }
2577
2578 /* Get the epilog. */
2579 union si_shader_part_key epilog_key;
2580 memset(&epilog_key, 0, sizeof(epilog_key));
2581 epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
2582
2583 shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs,
2584 PIPE_SHADER_TESS_CTRL, false,
2585 &epilog_key, compiler, debug,
2586 si_llvm_build_tcs_epilog,
2587 "Tessellation Control Shader Epilog");
2588 return shader->epilog != NULL;
2589 }
2590
2591 /**
2592 * Select and compile (or reuse) GS parts (prolog).
2593 */
2594 static bool si_shader_select_gs_parts(struct si_screen *sscreen,
2595 struct ac_llvm_compiler *compiler,
2596 struct si_shader *shader,
2597 struct pipe_debug_callback *debug)
2598 {
2599 if (sscreen->info.chip_class >= GFX9) {
2600 struct si_shader *es_main_part;
2601 enum pipe_shader_type es_type = shader->key.part.gs.es->type;
2602
2603 if (shader->key.as_ngg)
2604 es_main_part = shader->key.part.gs.es->main_shader_part_ngg_es;
2605 else
2606 es_main_part = shader->key.part.gs.es->main_shader_part_es;
2607
2608 if (es_type == PIPE_SHADER_VERTEX &&
2609 !si_get_vs_prolog(sscreen, compiler, shader, debug, es_main_part,
2610 &shader->key.part.gs.vs_prolog))
2611 return false;
2612
2613 shader->previous_stage = es_main_part;
2614 }
2615
2616 if (!shader->key.part.gs.prolog.tri_strip_adj_fix)
2617 return true;
2618
2619 union si_shader_part_key prolog_key;
2620 memset(&prolog_key, 0, sizeof(prolog_key));
2621 prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
2622 prolog_key.gs_prolog.as_ngg = shader->key.as_ngg;
2623
2624 shader->prolog2 = si_get_shader_part(sscreen, &sscreen->gs_prologs,
2625 PIPE_SHADER_GEOMETRY, true,
2626 &prolog_key, compiler, debug,
2627 si_llvm_build_gs_prolog,
2628 "Geometry Shader Prolog");
2629 return shader->prolog2 != NULL;
2630 }
2631
2632 /**
2633 * Compute the PS prolog key, which contains all the information needed to
2634 * build the PS prolog function, and set related bits in shader->config.
2635 */
2636 void si_get_ps_prolog_key(struct si_shader *shader,
2637 union si_shader_part_key *key,
2638 bool separate_prolog)
2639 {
2640 struct si_shader_info *info = &shader->selector->info;
2641
2642 memset(key, 0, sizeof(*key));
2643 key->ps_prolog.states = shader->key.part.ps.prolog;
2644 key->ps_prolog.colors_read = info->colors_read;
2645 key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
2646 key->ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;
2647 key->ps_prolog.wqm = info->uses_derivatives &&
2648 (key->ps_prolog.colors_read ||
2649 key->ps_prolog.states.force_persp_sample_interp ||
2650 key->ps_prolog.states.force_linear_sample_interp ||
2651 key->ps_prolog.states.force_persp_center_interp ||
2652 key->ps_prolog.states.force_linear_center_interp ||
2653 key->ps_prolog.states.bc_optimize_for_persp ||
2654 key->ps_prolog.states.bc_optimize_for_linear);
2655 key->ps_prolog.ancillary_vgpr_index = shader->info.ancillary_vgpr_index;
2656
2657 if (info->colors_read) {
2658 unsigned *color = shader->selector->color_attr_index;
2659
2660 if (shader->key.part.ps.prolog.color_two_side) {
2661 /* BCOLORs are stored after the last input. */
2662 key->ps_prolog.num_interp_inputs = info->num_inputs;
2663 key->ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;
2664 if (separate_prolog)
2665 shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
2666 }
2667
2668 for (unsigned i = 0; i < 2; i++) {
2669 unsigned interp = info->input_interpolate[color[i]];
2670 unsigned location = info->input_interpolate_loc[color[i]];
2671
2672 if (!(info->colors_read & (0xf << i*4)))
2673 continue;
2674
2675 key->ps_prolog.color_attr_index[i] = color[i];
2676
2677 if (shader->key.part.ps.prolog.flatshade_colors &&
2678 interp == TGSI_INTERPOLATE_COLOR)
2679 interp = TGSI_INTERPOLATE_CONSTANT;
2680
2681 switch (interp) {
2682 case TGSI_INTERPOLATE_CONSTANT:
2683 key->ps_prolog.color_interp_vgpr_index[i] = -1;
2684 break;
2685 case TGSI_INTERPOLATE_PERSPECTIVE:
2686 case TGSI_INTERPOLATE_COLOR:
2687 /* Force the interpolation location for colors here. */
2688 if (shader->key.part.ps.prolog.force_persp_sample_interp)
2689 location = TGSI_INTERPOLATE_LOC_SAMPLE;
2690 if (shader->key.part.ps.prolog.force_persp_center_interp)
2691 location = TGSI_INTERPOLATE_LOC_CENTER;
2692
2693 switch (location) {
2694 case TGSI_INTERPOLATE_LOC_SAMPLE:
2695 key->ps_prolog.color_interp_vgpr_index[i] = 0;
2696 if (separate_prolog) {
2697 shader->config.spi_ps_input_ena |=
2698 S_0286CC_PERSP_SAMPLE_ENA(1);
2699 }
2700 break;
2701 case TGSI_INTERPOLATE_LOC_CENTER:
2702 key->ps_prolog.color_interp_vgpr_index[i] = 2;
2703 if (separate_prolog) {
2704 shader->config.spi_ps_input_ena |=
2705 S_0286CC_PERSP_CENTER_ENA(1);
2706 }
2707 break;
2708 case TGSI_INTERPOLATE_LOC_CENTROID:
2709 key->ps_prolog.color_interp_vgpr_index[i] = 4;
2710 if (separate_prolog) {
2711 shader->config.spi_ps_input_ena |=
2712 S_0286CC_PERSP_CENTROID_ENA(1);
2713 }
2714 break;
2715 default:
2716 assert(0);
2717 }
2718 break;
2719 case TGSI_INTERPOLATE_LINEAR:
2720 /* Force the interpolation location for colors here. */
2721 if (shader->key.part.ps.prolog.force_linear_sample_interp)
2722 location = TGSI_INTERPOLATE_LOC_SAMPLE;
2723 if (shader->key.part.ps.prolog.force_linear_center_interp)
2724 location = TGSI_INTERPOLATE_LOC_CENTER;
2725
2726 /* The VGPR assignment for non-monolithic shaders
2727 * works because InitialPSInputAddr is set on the
2728 * main shader and PERSP_PULL_MODEL is never used.
2729 */
2730 switch (location) {
2731 case TGSI_INTERPOLATE_LOC_SAMPLE:
2732 key->ps_prolog.color_interp_vgpr_index[i] =
2733 separate_prolog ? 6 : 9;
2734 if (separate_prolog) {
2735 shader->config.spi_ps_input_ena |=
2736 S_0286CC_LINEAR_SAMPLE_ENA(1);
2737 }
2738 break;
2739 case TGSI_INTERPOLATE_LOC_CENTER:
2740 key->ps_prolog.color_interp_vgpr_index[i] =
2741 separate_prolog ? 8 : 11;
2742 if (separate_prolog) {
2743 shader->config.spi_ps_input_ena |=
2744 S_0286CC_LINEAR_CENTER_ENA(1);
2745 }
2746 break;
2747 case TGSI_INTERPOLATE_LOC_CENTROID:
2748 key->ps_prolog.color_interp_vgpr_index[i] =
2749 separate_prolog ? 10 : 13;
2750 if (separate_prolog) {
2751 shader->config.spi_ps_input_ena |=
2752 S_0286CC_LINEAR_CENTROID_ENA(1);
2753 }
2754 break;
2755 default:
2756 assert(0);
2757 }
2758 break;
2759 default:
2760 assert(0);
2761 }
2762 }
2763 }
2764 }
2765
2766 /**
2767 * Check whether a PS prolog is required based on the key.
2768 */
2769 bool si_need_ps_prolog(const union si_shader_part_key *key)
2770 {
2771 return key->ps_prolog.colors_read ||
2772 key->ps_prolog.states.force_persp_sample_interp ||
2773 key->ps_prolog.states.force_linear_sample_interp ||
2774 key->ps_prolog.states.force_persp_center_interp ||
2775 key->ps_prolog.states.force_linear_center_interp ||
2776 key->ps_prolog.states.bc_optimize_for_persp ||
2777 key->ps_prolog.states.bc_optimize_for_linear ||
2778 key->ps_prolog.states.poly_stipple ||
2779 key->ps_prolog.states.samplemask_log_ps_iter;
2780 }
2781
2782 /**
2783 * Compute the PS epilog key, which contains all the information needed to
2784 * build the PS epilog function.
2785 */
2786 void si_get_ps_epilog_key(struct si_shader *shader,
2787 union si_shader_part_key *key)
2788 {
2789 struct si_shader_info *info = &shader->selector->info;
2790 memset(key, 0, sizeof(*key));
2791 key->ps_epilog.colors_written = info->colors_written;
2792 key->ps_epilog.writes_z = info->writes_z;
2793 key->ps_epilog.writes_stencil = info->writes_stencil;
2794 key->ps_epilog.writes_samplemask = info->writes_samplemask;
2795 key->ps_epilog.states = shader->key.part.ps.epilog;
2796 }
2797
2798 /**
2799 * Select and compile (or reuse) pixel shader parts (prolog & epilog).
2800 */
2801 static bool si_shader_select_ps_parts(struct si_screen *sscreen,
2802 struct ac_llvm_compiler *compiler,
2803 struct si_shader *shader,
2804 struct pipe_debug_callback *debug)
2805 {
2806 union si_shader_part_key prolog_key;
2807 union si_shader_part_key epilog_key;
2808
2809 /* Get the prolog. */
2810 si_get_ps_prolog_key(shader, &prolog_key, true);
2811
2812 /* The prolog is a no-op if these aren't set. */
2813 if (si_need_ps_prolog(&prolog_key)) {
2814 shader->prolog =
2815 si_get_shader_part(sscreen, &sscreen->ps_prologs,
2816 PIPE_SHADER_FRAGMENT, true,
2817 &am