radeonsi: fold si_shader_context_set_ir into si_build_main_function
[mesa.git] / src / gallium / drivers / radeonsi / si_shader.c
1 /*
2 * Copyright 2012 Advanced Micro Devices, Inc.
3 * All Rights Reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
23 */
24
25 #include "util/u_memory.h"
26 #include "tgsi/tgsi_strings.h"
27 #include "tgsi/tgsi_from_mesa.h"
28
29 #include "ac_exp_param.h"
30 #include "ac_rtld.h"
31 #include "si_shader_internal.h"
32 #include "si_pipe.h"
33 #include "sid.h"
34
35 #include "compiler/nir/nir.h"
36 #include "compiler/nir/nir_serialize.h"
37
38 static const char scratch_rsrc_dword0_symbol[] =
39 "SCRATCH_RSRC_DWORD0";
40
41 static const char scratch_rsrc_dword1_symbol[] =
42 "SCRATCH_RSRC_DWORD1";
43
44 static void si_dump_shader_key(const struct si_shader *shader, FILE *f);
45
46 /** Whether the shader runs as a combination of multiple API shaders */
47 static bool is_multi_part_shader(struct si_shader_context *ctx)
48 {
49 if (ctx->screen->info.chip_class <= GFX8)
50 return false;
51
52 return ctx->shader->key.as_ls ||
53 ctx->shader->key.as_es ||
54 ctx->type == PIPE_SHADER_TESS_CTRL ||
55 ctx->type == PIPE_SHADER_GEOMETRY;
56 }
57
58 /** Whether the shader runs on a merged HW stage (LSHS or ESGS) */
59 bool si_is_merged_shader(struct si_shader_context *ctx)
60 {
61 return ctx->shader->key.as_ngg || is_multi_part_shader(ctx);
62 }
63
64 /**
65 * Returns a unique index for a per-patch semantic name and index. The index
66 * must be less than 32, so that a 32-bit bitmask of used inputs or outputs
67 * can be calculated.
68 */
69 unsigned si_shader_io_get_unique_index_patch(unsigned semantic_name, unsigned index)
70 {
71 switch (semantic_name) {
72 case TGSI_SEMANTIC_TESSOUTER:
73 return 0;
74 case TGSI_SEMANTIC_TESSINNER:
75 return 1;
76 case TGSI_SEMANTIC_PATCH:
77 assert(index < 30);
78 return 2 + index;
79
80 default:
81 assert(!"invalid semantic name");
82 return 0;
83 }
84 }
85
86 /**
87 * Returns a unique index for a semantic name and index. The index must be
88 * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
89 * calculated.
90 */
91 unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index,
92 unsigned is_varying)
93 {
94 switch (semantic_name) {
95 case TGSI_SEMANTIC_POSITION:
96 return 0;
97 case TGSI_SEMANTIC_GENERIC:
98 /* Since some shader stages use the the highest used IO index
99 * to determine the size to allocate for inputs/outputs
100 * (in LDS, tess and GS rings). GENERIC should be placed right
101 * after POSITION to make that size as small as possible.
102 */
103 if (index < SI_MAX_IO_GENERIC)
104 return 1 + index;
105
106 assert(!"invalid generic index");
107 return 0;
108 case TGSI_SEMANTIC_FOG:
109 return SI_MAX_IO_GENERIC + 1;
110 case TGSI_SEMANTIC_COLOR:
111 assert(index < 2);
112 return SI_MAX_IO_GENERIC + 2 + index;
113 case TGSI_SEMANTIC_BCOLOR:
114 assert(index < 2);
115 /* If it's a varying, COLOR and BCOLOR alias. */
116 if (is_varying)
117 return SI_MAX_IO_GENERIC + 2 + index;
118 else
119 return SI_MAX_IO_GENERIC + 4 + index;
120 case TGSI_SEMANTIC_TEXCOORD:
121 assert(index < 8);
122 return SI_MAX_IO_GENERIC + 6 + index;
123
124 /* These are rarely used between LS and HS or ES and GS. */
125 case TGSI_SEMANTIC_CLIPDIST:
126 assert(index < 2);
127 return SI_MAX_IO_GENERIC + 6 + 8 + index;
128 case TGSI_SEMANTIC_CLIPVERTEX:
129 return SI_MAX_IO_GENERIC + 6 + 8 + 2;
130 case TGSI_SEMANTIC_PSIZE:
131 return SI_MAX_IO_GENERIC + 6 + 8 + 3;
132
133 /* These can't be written by LS, HS, and ES. */
134 case TGSI_SEMANTIC_LAYER:
135 return SI_MAX_IO_GENERIC + 6 + 8 + 4;
136 case TGSI_SEMANTIC_VIEWPORT_INDEX:
137 return SI_MAX_IO_GENERIC + 6 + 8 + 5;
138 case TGSI_SEMANTIC_PRIMID:
139 STATIC_ASSERT(SI_MAX_IO_GENERIC + 6 + 8 + 6 <= 63);
140 return SI_MAX_IO_GENERIC + 6 + 8 + 6;
141 default:
142 fprintf(stderr, "invalid semantic name = %u\n", semantic_name);
143 assert(!"invalid semantic name");
144 return 0;
145 }
146 }
147
148 /**
149 * Get the value of a shader input parameter and extract a bitfield.
150 */
151 static LLVMValueRef unpack_llvm_param(struct si_shader_context *ctx,
152 LLVMValueRef value, unsigned rshift,
153 unsigned bitwidth)
154 {
155 if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind)
156 value = ac_to_integer(&ctx->ac, value);
157
158 if (rshift)
159 value = LLVMBuildLShr(ctx->ac.builder, value,
160 LLVMConstInt(ctx->ac.i32, rshift, 0), "");
161
162 if (rshift + bitwidth < 32) {
163 unsigned mask = (1 << bitwidth) - 1;
164 value = LLVMBuildAnd(ctx->ac.builder, value,
165 LLVMConstInt(ctx->ac.i32, mask, 0), "");
166 }
167
168 return value;
169 }
170
171 LLVMValueRef si_unpack_param(struct si_shader_context *ctx,
172 struct ac_arg param, unsigned rshift,
173 unsigned bitwidth)
174 {
175 LLVMValueRef value = ac_get_arg(&ctx->ac, param);
176
177 return unpack_llvm_param(ctx, value, rshift, bitwidth);
178 }
179
180 LLVMValueRef si_get_primitive_id(struct si_shader_context *ctx,
181 unsigned swizzle)
182 {
183 if (swizzle > 0)
184 return ctx->ac.i32_0;
185
186 switch (ctx->type) {
187 case PIPE_SHADER_VERTEX:
188 return ac_get_arg(&ctx->ac, ctx->vs_prim_id);
189 case PIPE_SHADER_TESS_CTRL:
190 return ac_get_arg(&ctx->ac, ctx->args.tcs_patch_id);
191 case PIPE_SHADER_TESS_EVAL:
192 return ac_get_arg(&ctx->ac, ctx->args.tes_patch_id);
193 case PIPE_SHADER_GEOMETRY:
194 return ac_get_arg(&ctx->ac, ctx->args.gs_prim_id);
195 default:
196 assert(0);
197 return ctx->ac.i32_0;
198 }
199 }
200
201 static LLVMValueRef get_block_size(struct ac_shader_abi *abi)
202 {
203 struct si_shader_context *ctx = si_shader_context_from_abi(abi);
204
205 LLVMValueRef values[3];
206 LLVMValueRef result;
207 unsigned i;
208 unsigned *properties = ctx->shader->selector->info.properties;
209
210 if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] != 0) {
211 unsigned sizes[3] = {
212 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
213 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
214 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
215 };
216
217 for (i = 0; i < 3; ++i)
218 values[i] = LLVMConstInt(ctx->ac.i32, sizes[i], 0);
219
220 result = ac_build_gather_values(&ctx->ac, values, 3);
221 } else {
222 result = ac_get_arg(&ctx->ac, ctx->block_size);
223 }
224
225 return result;
226 }
227
228 void si_declare_compute_memory(struct si_shader_context *ctx)
229 {
230 struct si_shader_selector *sel = ctx->shader->selector;
231 unsigned lds_size = sel->info.properties[TGSI_PROPERTY_CS_LOCAL_SIZE];
232
233 LLVMTypeRef i8p = LLVMPointerType(ctx->ac.i8, AC_ADDR_SPACE_LDS);
234 LLVMValueRef var;
235
236 assert(!ctx->ac.lds);
237
238 var = LLVMAddGlobalInAddressSpace(ctx->ac.module,
239 LLVMArrayType(ctx->ac.i8, lds_size),
240 "compute_lds",
241 AC_ADDR_SPACE_LDS);
242 LLVMSetAlignment(var, 64 * 1024);
243
244 ctx->ac.lds = LLVMBuildBitCast(ctx->ac.builder, var, i8p, "");
245 }
246
247 static void si_dump_streamout(struct pipe_stream_output_info *so)
248 {
249 unsigned i;
250
251 if (so->num_outputs)
252 fprintf(stderr, "STREAMOUT\n");
253
254 for (i = 0; i < so->num_outputs; i++) {
255 unsigned mask = ((1 << so->output[i].num_components) - 1) <<
256 so->output[i].start_component;
257 fprintf(stderr, " %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n",
258 i, so->output[i].output_buffer,
259 so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
260 so->output[i].register_index,
261 mask & 1 ? "x" : "",
262 mask & 2 ? "y" : "",
263 mask & 4 ? "z" : "",
264 mask & 8 ? "w" : "");
265 }
266 }
267
/* Declare the streamout (transform feedback) SGPR inputs.
 * The declaration order defines the hardware SGPR layout — do not reorder. */
static void declare_streamout_params(struct si_shader_context *ctx,
				     struct pipe_stream_output_info *so)
{
	/* NGG streamout doesn't use the legacy streamout SGPRs; only TES
	 * still declares one (placeholder to keep its SGPR layout fixed —
	 * NOTE(review): confirm against the TES input layout). */
	if (ctx->screen->use_ngg_streamout) {
		if (ctx->type == PIPE_SHADER_TESS_EVAL)
			ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
		return;
	}

	/* Streamout SGPRs. */
	if (so->num_outputs) {
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->streamout_config);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->streamout_write_index);
	} else if (ctx->type == PIPE_SHADER_TESS_EVAL) {
		/* Placeholder so the TES SGPR layout is the same with and
		 * without streamout. */
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
	}

	/* A streamout buffer offset is loaded if the stride is non-zero. */
	for (int i = 0; i < 4; i++) {
		if (!so->stride[i])
			continue;

		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->streamout_offset[i]);
	}
}
293
294 static unsigned si_get_max_workgroup_size(const struct si_shader *shader)
295 {
296 switch (shader->selector->type) {
297 case PIPE_SHADER_VERTEX:
298 case PIPE_SHADER_TESS_EVAL:
299 return shader->key.as_ngg ? 128 : 0;
300
301 case PIPE_SHADER_TESS_CTRL:
302 /* Return this so that LLVM doesn't remove s_barrier
303 * instructions on chips where we use s_barrier. */
304 return shader->selector->screen->info.chip_class >= GFX7 ? 128 : 0;
305
306 case PIPE_SHADER_GEOMETRY:
307 return shader->selector->screen->info.chip_class >= GFX9 ? 128 : 0;
308
309 case PIPE_SHADER_COMPUTE:
310 break; /* see below */
311
312 default:
313 return 0;
314 }
315
316 const unsigned *properties = shader->selector->info.properties;
317 unsigned max_work_group_size =
318 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
319 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
320 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
321
322 if (!max_work_group_size) {
323 /* This is a variable group size compute shader,
324 * compile it for the maximum possible group size.
325 */
326 max_work_group_size = SI_MAX_VARIABLE_THREADS_PER_BLOCK;
327 }
328 return max_work_group_size;
329 }
330
331 static void declare_const_and_shader_buffers(struct si_shader_context *ctx,
332 bool assign_params)
333 {
334 enum ac_arg_type const_shader_buf_type;
335
336 if (ctx->shader->selector->info.const_buffers_declared == 1 &&
337 ctx->shader->selector->info.shader_buffers_declared == 0)
338 const_shader_buf_type = AC_ARG_CONST_FLOAT_PTR;
339 else
340 const_shader_buf_type = AC_ARG_CONST_DESC_PTR;
341
342 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, const_shader_buf_type,
343 assign_params ? &ctx->const_and_shader_buffers :
344 &ctx->other_const_and_shader_buffers);
345 }
346
/* Declare the sampler+image descriptor-array pointer for one stage.
 * With assign_params == false (the other half of a merged shader), the
 * argument is recorded in other_samplers_and_images instead. */
static void declare_samplers_and_images(struct si_shader_context *ctx,
					bool assign_params)
{
	ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR,
		   assign_params ? &ctx->samplers_and_images :
		   &ctx->other_samplers_and_images);
}
354
/* Declare the per-stage descriptor pointers: const/shader buffers followed
 * by samplers and images. */
static void declare_per_stage_desc_pointers(struct si_shader_context *ctx,
					    bool assign_params)
{
	declare_const_and_shader_buffers(ctx, assign_params);
	declare_samplers_and_images(ctx, assign_params);
}
361
/* Declare descriptor pointers shared by all stages: the driver-internal
 * RW buffers and the bindless sampler/image array. */
static void declare_global_desc_pointers(struct si_shader_context *ctx)
{
	ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR,
		   &ctx->rw_buffers);
	ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR,
		   &ctx->bindless_samplers_and_images);
}
369
/* Declare VS-specific input SGPRs: the VS state bits, then the draw
 * parameters. The GS copy shader has no API draw, so it omits the latter. */
static void declare_vs_specific_input_sgprs(struct si_shader_context *ctx)
{
	ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
	if (!ctx->shader->is_gs_copy_shader) {
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.base_vertex);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.start_instance);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.draw_id);
	}
}
379
/* Declare vertex-buffer descriptor inputs: a pointer to the descriptor
 * list, and optionally some descriptors passed inline in user SGPRs. */
static void declare_vb_descriptor_input_sgprs(struct si_shader_context *ctx)
{
	ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &ctx->vertex_buffers);

	unsigned num_vbos_in_user_sgprs = ctx->shader->selector->num_vbos_in_user_sgprs;
	if (num_vbos_in_user_sgprs) {
		unsigned user_sgprs = ctx->args.num_sgprs_used;

		/* Merged shaders have 8 system SGPRs before the user SGPRs;
		 * SI_SGPR_VS_VB_DESCRIPTOR_FIRST counts user SGPRs only. */
		if (si_is_merged_shader(ctx))
			user_sgprs -= 8;
		assert(user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST);

		/* Declare unused SGPRs to align VB descriptors to 4 SGPRs (hw requirement). */
		for (unsigned i = user_sgprs; i < SI_SGPR_VS_VB_DESCRIPTOR_FIRST; i++)
			ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */

		assert(num_vbos_in_user_sgprs <= ARRAY_SIZE(ctx->vb_descriptors));
		/* Each VB descriptor is a v4i32 (4 SGPRs). */
		for (unsigned i = 0; i < num_vbos_in_user_sgprs; i++)
			ac_add_arg(&ctx->args, AC_ARG_SGPR, 4, AC_ARG_INT, &ctx->vb_descriptors[i]);
	}
}
401
/* Declare the input VGPRs of a vertex shader (also when it runs as LS/ES)
 * plus the prolog-supplied vertex load indices. The VGPR order is fixed by
 * the hardware stage and chip generation — do not reorder. */
static void declare_vs_input_vgprs(struct si_shader_context *ctx,
				   unsigned *num_prolog_vgprs,
				   bool ngg_cull_shader)
{
	struct si_shader *shader = ctx->shader;

	ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.vertex_id);
	if (shader->key.as_ls) {
		/* VS-as-LS: GFX10 moved InstanceID to the 4th VGPR. */
		ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->rel_auto_id);
		if (ctx->screen->info.chip_class >= GFX10) {
			ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
			ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
		} else {
			ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
			ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */
		}
	} else if (ctx->screen->info.chip_class >= GFX10) {
		ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
		ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
			   &ctx->vs_prim_id); /* user vgpr or PrimID (legacy) */
		ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
	} else {
		ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
		ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->vs_prim_id);
		ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */
	}

	if (!shader->is_gs_copy_shader) {
		/* The main part of an NGG culling pipeline receives the thread
		 * ID it had in the cull shader as an extra VGPR. */
		if (shader->key.opt.ngg_culling && !ngg_cull_shader) {
			ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
				   &ctx->ngg_old_thread_id);
		}

		/* Vertex load indices. */
		if (shader->selector->info.num_inputs) {
			ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
				   &ctx->vertex_index0);
			for (unsigned i = 1; i < shader->selector->info.num_inputs; i++)
				ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL);
		}
		/* These VGPRs are produced by the VS prolog, not by the API,
		 * so they don't count as main-part inputs. */
		*num_prolog_vgprs += shader->selector->info.num_inputs;
	}
}
445
/* Declare the SGPR inputs of the internal blit vertex shader: packed
 * destination rectangle, depth, and then either a constant color or
 * texture coordinates, selected by vs_blit_property. */
static void declare_vs_blit_inputs(struct si_shader_context *ctx,
				   unsigned vs_blit_property)
{
	ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
		   &ctx->vs_blit_inputs); /* i16 x1, y1 */
	ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* i16 x2, y2 */
	ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* depth */

	if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) {
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color0 */
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color1 */
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color2 */
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color3 */
	} else if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD) {
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x1 */
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y1 */
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x2 */
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y2 */
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.z */
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.w */
	}
}
468
/* Declare the TES input VGPRs: tess coordinates (u, v), the relative patch
 * ID, the patch ID, and — for the main part of an NGG culling pipeline —
 * the thread ID it had in the cull shader. */
static void declare_tes_input_vgprs(struct si_shader_context *ctx, bool ngg_cull_shader)
{
	ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->tes_u);
	ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->tes_v);
	ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->tes_rel_patch_id);
	ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tes_patch_id);

	if (ctx->shader->key.opt.ngg_culling && !ngg_cull_shader) {
		ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
			   &ctx->ngg_old_thread_id);
	}
}
481
enum {
	/* Convenient merged shader definitions — extra values beyond
	 * PIPE_SHADER_TYPES for the merged HW stages used on GFX9+. */
	SI_SHADER_MERGED_VERTEX_TESSCTRL = PIPE_SHADER_TYPES,
	SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY,
};
487
488 void si_add_arg_checked(struct ac_shader_args *args,
489 enum ac_arg_regfile file,
490 unsigned registers, enum ac_arg_type type,
491 struct ac_arg *arg,
492 unsigned idx)
493 {
494 assert(args->arg_count == idx);
495 ac_add_arg(args, file, registers, type, arg);
496 }
497
498 void si_create_function(struct si_shader_context *ctx, bool ngg_cull_shader)
499 {
500 struct si_shader *shader = ctx->shader;
501 LLVMTypeRef returns[AC_MAX_ARGS];
502 unsigned i, num_return_sgprs;
503 unsigned num_returns = 0;
504 unsigned num_prolog_vgprs = 0;
505 unsigned type = ctx->type;
506 unsigned vs_blit_property =
507 shader->selector->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD];
508
509 memset(&ctx->args, 0, sizeof(ctx->args));
510
511 /* Set MERGED shaders. */
512 if (ctx->screen->info.chip_class >= GFX9) {
513 if (shader->key.as_ls || type == PIPE_SHADER_TESS_CTRL)
514 type = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */
515 else if (shader->key.as_es || shader->key.as_ngg || type == PIPE_SHADER_GEOMETRY)
516 type = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY;
517 }
518
519 switch (type) {
520 case PIPE_SHADER_VERTEX:
521 declare_global_desc_pointers(ctx);
522
523 if (vs_blit_property) {
524 declare_vs_blit_inputs(ctx, vs_blit_property);
525
526 /* VGPRs */
527 declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader);
528 break;
529 }
530
531 declare_per_stage_desc_pointers(ctx, true);
532 declare_vs_specific_input_sgprs(ctx);
533 if (!shader->is_gs_copy_shader)
534 declare_vb_descriptor_input_sgprs(ctx);
535
536 if (shader->key.as_es) {
537 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
538 &ctx->es2gs_offset);
539 } else if (shader->key.as_ls) {
540 /* no extra parameters */
541 } else {
542 /* The locations of the other parameters are assigned dynamically. */
543 declare_streamout_params(ctx, &shader->selector->so);
544 }
545
546 /* VGPRs */
547 declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader);
548
549 /* Return values */
550 if (shader->key.opt.vs_as_prim_discard_cs) {
551 for (i = 0; i < 4; i++)
552 returns[num_returns++] = ctx->ac.f32; /* VGPRs */
553 }
554 break;
555
556 case PIPE_SHADER_TESS_CTRL: /* GFX6-GFX8 */
557 declare_global_desc_pointers(ctx);
558 declare_per_stage_desc_pointers(ctx, true);
559 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
560 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_offsets);
561 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout);
562 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
563 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
564 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_factor_offset);
565
566 /* VGPRs */
567 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_patch_id);
568 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_rel_ids);
569
570 /* param_tcs_offchip_offset and param_tcs_factor_offset are
571 * placed after the user SGPRs.
572 */
573 for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++)
574 returns[num_returns++] = ctx->ac.i32; /* SGPRs */
575 for (i = 0; i < 11; i++)
576 returns[num_returns++] = ctx->ac.f32; /* VGPRs */
577 break;
578
579 case SI_SHADER_MERGED_VERTEX_TESSCTRL:
580 /* Merged stages have 8 system SGPRs at the beginning. */
581 /* SPI_SHADER_USER_DATA_ADDR_LO/HI_HS */
582 declare_per_stage_desc_pointers(ctx,
583 ctx->type == PIPE_SHADER_TESS_CTRL);
584 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
585 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_wave_info);
586 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_factor_offset);
587 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_scratch_offset);
588 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
589 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
590
591 declare_global_desc_pointers(ctx);
592 declare_per_stage_desc_pointers(ctx,
593 ctx->type == PIPE_SHADER_VERTEX);
594 declare_vs_specific_input_sgprs(ctx);
595
596 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
597 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_offsets);
598 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout);
599 declare_vb_descriptor_input_sgprs(ctx);
600
601 /* VGPRs (first TCS, then VS) */
602 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_patch_id);
603 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_rel_ids);
604
605 if (ctx->type == PIPE_SHADER_VERTEX) {
606 declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader);
607
608 /* LS return values are inputs to the TCS main shader part. */
609 for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++)
610 returns[num_returns++] = ctx->ac.i32; /* SGPRs */
611 for (i = 0; i < 2; i++)
612 returns[num_returns++] = ctx->ac.f32; /* VGPRs */
613 } else {
614 /* TCS return values are inputs to the TCS epilog.
615 *
616 * param_tcs_offchip_offset, param_tcs_factor_offset,
617 * param_tcs_offchip_layout, and param_rw_buffers
618 * should be passed to the epilog.
619 */
620 for (i = 0; i <= 8 + GFX9_SGPR_TCS_OUT_LAYOUT; i++)
621 returns[num_returns++] = ctx->ac.i32; /* SGPRs */
622 for (i = 0; i < 11; i++)
623 returns[num_returns++] = ctx->ac.f32; /* VGPRs */
624 }
625 break;
626
627 case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY:
628 /* Merged stages have 8 system SGPRs at the beginning. */
629 /* SPI_SHADER_USER_DATA_ADDR_LO/HI_GS */
630 declare_per_stage_desc_pointers(ctx,
631 ctx->type == PIPE_SHADER_GEOMETRY);
632
633 if (ctx->shader->key.as_ngg)
634 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs_tg_info);
635 else
636 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs2vs_offset);
637
638 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_wave_info);
639 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
640 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_scratch_offset);
641 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR,
642 &ctx->small_prim_cull_info); /* SPI_SHADER_PGM_LO_GS << 8 */
643 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */
644
645 declare_global_desc_pointers(ctx);
646 if (ctx->type != PIPE_SHADER_VERTEX || !vs_blit_property) {
647 declare_per_stage_desc_pointers(ctx,
648 (ctx->type == PIPE_SHADER_VERTEX ||
649 ctx->type == PIPE_SHADER_TESS_EVAL));
650 }
651
652 if (ctx->type == PIPE_SHADER_VERTEX) {
653 if (vs_blit_property)
654 declare_vs_blit_inputs(ctx, vs_blit_property);
655 else
656 declare_vs_specific_input_sgprs(ctx);
657 } else {
658 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
659 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
660 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tes_offchip_addr);
661 /* Declare as many input SGPRs as the VS has. */
662 }
663
664 if (ctx->type == PIPE_SHADER_VERTEX)
665 declare_vb_descriptor_input_sgprs(ctx);
666
667 /* VGPRs (first GS, then VS/TES) */
668 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx01_offset);
669 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx23_offset);
670 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_prim_id);
671 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_invocation_id);
672 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx45_offset);
673
674 if (ctx->type == PIPE_SHADER_VERTEX) {
675 declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader);
676 } else if (ctx->type == PIPE_SHADER_TESS_EVAL) {
677 declare_tes_input_vgprs(ctx, ngg_cull_shader);
678 }
679
680 if ((ctx->shader->key.as_es || ngg_cull_shader) &&
681 (ctx->type == PIPE_SHADER_VERTEX ||
682 ctx->type == PIPE_SHADER_TESS_EVAL)) {
683 unsigned num_user_sgprs, num_vgprs;
684
685 if (ctx->type == PIPE_SHADER_VERTEX) {
686 /* For the NGG cull shader, add 1 SGPR to hold
687 * the vertex buffer pointer.
688 */
689 num_user_sgprs = GFX9_VSGS_NUM_USER_SGPR + ngg_cull_shader;
690
691 if (ngg_cull_shader && shader->selector->num_vbos_in_user_sgprs) {
692 assert(num_user_sgprs <= 8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST);
693 num_user_sgprs = SI_SGPR_VS_VB_DESCRIPTOR_FIRST +
694 shader->selector->num_vbos_in_user_sgprs * 4;
695 }
696 } else {
697 num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR;
698 }
699
700 /* The NGG cull shader has to return all 9 VGPRs + the old thread ID.
701 *
702 * The normal merged ESGS shader only has to return the 5 VGPRs
703 * for the GS stage.
704 */
705 num_vgprs = ngg_cull_shader ? 10 : 5;
706
707 /* ES return values are inputs to GS. */
708 for (i = 0; i < 8 + num_user_sgprs; i++)
709 returns[num_returns++] = ctx->ac.i32; /* SGPRs */
710 for (i = 0; i < num_vgprs; i++)
711 returns[num_returns++] = ctx->ac.f32; /* VGPRs */
712 }
713 break;
714
715 case PIPE_SHADER_TESS_EVAL:
716 declare_global_desc_pointers(ctx);
717 declare_per_stage_desc_pointers(ctx, true);
718 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
719 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
720 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tes_offchip_addr);
721
722 if (shader->key.as_es) {
723 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
724 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
725 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->es2gs_offset);
726 } else {
727 declare_streamout_params(ctx, &shader->selector->so);
728 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
729 }
730
731 /* VGPRs */
732 declare_tes_input_vgprs(ctx, ngg_cull_shader);
733 break;
734
735 case PIPE_SHADER_GEOMETRY:
736 declare_global_desc_pointers(ctx);
737 declare_per_stage_desc_pointers(ctx, true);
738 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs2vs_offset);
739 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs_wave_id);
740
741 /* VGPRs */
742 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[0]);
743 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[1]);
744 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_prim_id);
745 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[2]);
746 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[3]);
747 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[4]);
748 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[5]);
749 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_invocation_id);
750 break;
751
752 case PIPE_SHADER_FRAGMENT:
753 declare_global_desc_pointers(ctx);
754 declare_per_stage_desc_pointers(ctx, true);
755 si_add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL,
756 SI_PARAM_ALPHA_REF);
757 si_add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
758 &ctx->args.prim_mask, SI_PARAM_PRIM_MASK);
759
760 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.persp_sample,
761 SI_PARAM_PERSP_SAMPLE);
762 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT,
763 &ctx->args.persp_center, SI_PARAM_PERSP_CENTER);
764 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT,
765 &ctx->args.persp_centroid, SI_PARAM_PERSP_CENTROID);
766 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_INT,
767 NULL, SI_PARAM_PERSP_PULL_MODEL);
768 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT,
769 &ctx->args.linear_sample, SI_PARAM_LINEAR_SAMPLE);
770 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT,
771 &ctx->args.linear_center, SI_PARAM_LINEAR_CENTER);
772 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT,
773 &ctx->args.linear_centroid, SI_PARAM_LINEAR_CENTROID);
774 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_FLOAT,
775 NULL, SI_PARAM_LINE_STIPPLE_TEX);
776 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT,
777 &ctx->args.frag_pos[0], SI_PARAM_POS_X_FLOAT);
778 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT,
779 &ctx->args.frag_pos[1], SI_PARAM_POS_Y_FLOAT);
780 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT,
781 &ctx->args.frag_pos[2], SI_PARAM_POS_Z_FLOAT);
782 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT,
783 &ctx->args.frag_pos[3], SI_PARAM_POS_W_FLOAT);
784 shader->info.face_vgpr_index = ctx->args.num_vgprs_used;
785 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
786 &ctx->args.front_face, SI_PARAM_FRONT_FACE);
787 shader->info.ancillary_vgpr_index = ctx->args.num_vgprs_used;
788 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
789 &ctx->args.ancillary, SI_PARAM_ANCILLARY);
790 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT,
791 &ctx->args.sample_coverage, SI_PARAM_SAMPLE_COVERAGE);
792 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
793 &ctx->pos_fixed_pt, SI_PARAM_POS_FIXED_PT);
794
795 /* Color inputs from the prolog. */
796 if (shader->selector->info.colors_read) {
797 unsigned num_color_elements =
798 util_bitcount(shader->selector->info.colors_read);
799
800 for (i = 0; i < num_color_elements; i++)
801 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL);
802
803 num_prolog_vgprs += num_color_elements;
804 }
805
806 /* Outputs for the epilog. */
807 num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
808 num_returns =
809 num_return_sgprs +
810 util_bitcount(shader->selector->info.colors_written) * 4 +
811 shader->selector->info.writes_z +
812 shader->selector->info.writes_stencil +
813 shader->selector->info.writes_samplemask +
814 1 /* SampleMaskIn */;
815
816 num_returns = MAX2(num_returns,
817 num_return_sgprs +
818 PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
819
820 for (i = 0; i < num_return_sgprs; i++)
821 returns[i] = ctx->ac.i32;
822 for (; i < num_returns; i++)
823 returns[i] = ctx->ac.f32;
824 break;
825
826 case PIPE_SHADER_COMPUTE:
827 declare_global_desc_pointers(ctx);
828 declare_per_stage_desc_pointers(ctx, true);
829 if (shader->selector->info.uses_grid_size)
830 ac_add_arg(&ctx->args, AC_ARG_SGPR, 3, AC_ARG_INT,
831 &ctx->args.num_work_groups);
832 if (shader->selector->info.uses_block_size &&
833 shader->selector->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0)
834 ac_add_arg(&ctx->args, AC_ARG_SGPR, 3, AC_ARG_INT, &ctx->block_size);
835
836 unsigned cs_user_data_dwords =
837 shader->selector->info.properties[TGSI_PROPERTY_CS_USER_DATA_COMPONENTS_AMD];
838 if (cs_user_data_dwords) {
839 ac_add_arg(&ctx->args, AC_ARG_SGPR, cs_user_data_dwords, AC_ARG_INT,
840 &ctx->cs_user_data);
841 }
842
843 /* Hardware SGPRs. */
844 for (i = 0; i < 3; i++) {
845 if (shader->selector->info.uses_block_id[i]) {
846 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
847 &ctx->args.workgroup_ids[i]);
848 }
849 }
850 if (shader->selector->info.uses_subgroup_info)
851 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tg_size);
852
853 /* Hardware VGPRs. */
854 ac_add_arg(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_INT,
855 &ctx->args.local_invocation_ids);
856 break;
857 default:
858 assert(0 && "unimplemented shader");
859 return;
860 }
861
862 si_llvm_create_func(ctx, ngg_cull_shader ? "ngg_cull_main" : "main",
863 returns, num_returns, si_get_max_workgroup_size(shader));
864
865 /* Reserve register locations for VGPR inputs the PS prolog may need. */
866 if (ctx->type == PIPE_SHADER_FRAGMENT && !ctx->shader->is_monolithic) {
867 ac_llvm_add_target_dep_function_attr(ctx->main_fn,
868 "InitialPSInputAddr",
869 S_0286D0_PERSP_SAMPLE_ENA(1) |
870 S_0286D0_PERSP_CENTER_ENA(1) |
871 S_0286D0_PERSP_CENTROID_ENA(1) |
872 S_0286D0_LINEAR_SAMPLE_ENA(1) |
873 S_0286D0_LINEAR_CENTER_ENA(1) |
874 S_0286D0_LINEAR_CENTROID_ENA(1) |
875 S_0286D0_FRONT_FACE_ENA(1) |
876 S_0286D0_ANCILLARY_ENA(1) |
877 S_0286D0_POS_FIXED_PT_ENA(1));
878 }
879
880 shader->info.num_input_sgprs = ctx->args.num_sgprs_used;
881 shader->info.num_input_vgprs = ctx->args.num_vgprs_used;
882
883 assert(shader->info.num_input_vgprs >= num_prolog_vgprs);
884 shader->info.num_input_vgprs -= num_prolog_vgprs;
885
886 if (shader->key.as_ls || ctx->type == PIPE_SHADER_TESS_CTRL) {
887 if (USE_LDS_SYMBOLS && LLVM_VERSION_MAJOR >= 9) {
888 /* The LSHS size is not known until draw time, so we append it
889 * at the end of whatever LDS use there may be in the rest of
890 * the shader (currently none, unless LLVM decides to do its
891 * own LDS-based lowering).
892 */
893 ctx->ac.lds = LLVMAddGlobalInAddressSpace(
894 ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0),
895 "__lds_end", AC_ADDR_SPACE_LDS);
896 LLVMSetAlignment(ctx->ac.lds, 256);
897 } else {
898 ac_declare_lds_as_pointer(&ctx->ac);
899 }
900 }
901
902 /* Unlike radv, we override these arguments in the prolog, so to the
903 * API shader they appear as normal arguments.
904 */
905 if (ctx->type == PIPE_SHADER_VERTEX) {
906 ctx->abi.vertex_id = ac_get_arg(&ctx->ac, ctx->args.vertex_id);
907 ctx->abi.instance_id = ac_get_arg(&ctx->ac, ctx->args.instance_id);
908 } else if (ctx->type == PIPE_SHADER_FRAGMENT) {
909 ctx->abi.persp_centroid = ac_get_arg(&ctx->ac, ctx->args.persp_centroid);
910 ctx->abi.linear_centroid = ac_get_arg(&ctx->ac, ctx->args.linear_centroid);
911 }
912 }
913
914 /* For the UMR disassembler. */
915 #define DEBUGGER_END_OF_CODE_MARKER 0xbf9f0000 /* invalid instruction */
916 #define DEBUGGER_NUM_MARKERS 5
917
/* Gather the ELF binaries of all shader parts (prolog(s), previous merged
 * stage, main part, epilog) and open them as one relocatable binary via
 * ac_rtld. Declares the shared LDS symbols ("esgs_ring", "ngg_emit") whose
 * sizes become known here, and derives shader->config.lds_size from the
 * total LDS usage reported by the linker.
 *
 * Returns true on success. The caller must close *rtld with ac_rtld_close.
 */
static bool si_shader_binary_open(struct si_screen *screen,
				  struct si_shader *shader,
				  struct ac_rtld_binary *rtld)
{
	const struct si_shader_selector *sel = shader->selector;
	const char *part_elfs[5];
	size_t part_sizes[5];
	unsigned num_parts = 0;

/* Append one shader part's ELF to the list, skipping absent parts. */
#define add_part(shader_or_part) \
	if (shader_or_part) { \
		part_elfs[num_parts] = (shader_or_part)->binary.elf_buffer; \
		part_sizes[num_parts] = (shader_or_part)->binary.elf_size; \
		num_parts++; \
	}

	add_part(shader->prolog);
	add_part(shader->previous_stage);
	add_part(shader->prolog2);
	add_part(shader);
	add_part(shader->epilog);

#undef add_part

	struct ac_rtld_symbol lds_symbols[2];
	unsigned num_lds_symbols = 0;

	/* GFX9+ merged GS / NGG shaders keep the ES->GS ring in LDS. */
	if (sel && screen->info.chip_class >= GFX9 && !shader->is_gs_copy_shader &&
	    (sel->type == PIPE_SHADER_GEOMETRY || shader->key.as_ngg)) {
		/* We add this symbol even on LLVM <= 8 to ensure that
		 * shader->config.lds_size is set correctly below.
		 */
		struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
		sym->name = "esgs_ring";
		sym->size = shader->gs_info.esgs_ring_size;
		sym->align = 64 * 1024;
	}

	/* NGG GS additionally emits vertices into an LDS area. */
	if (shader->key.as_ngg && sel->type == PIPE_SHADER_GEOMETRY) {
		struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
		sym->name = "ngg_emit";
		sym->size = shader->ngg.ngg_emit_size * 4;
		sym->align = 4;
	}

	bool ok = ac_rtld_open(rtld, (struct ac_rtld_open_info){
			.info = &screen->info,
			.options = {
				.halt_at_entry = screen->options.halt_shaders,
			},
			.shader_type = tgsi_processor_to_shader_stage(sel->type),
			.wave_size = si_get_shader_wave_size(shader),
			.num_parts = num_parts,
			.elf_ptrs = part_elfs,
			.elf_sizes = part_sizes,
			.num_shared_lds_symbols = num_lds_symbols,
			.shared_lds_symbols = lds_symbols });

	if (rtld->lds_size > 0) {
		/* Convert the byte size to the HW's LDS allocation granule
		 * (512 bytes on GFX7+, 256 before). */
		unsigned alloc_granularity = screen->info.chip_class >= GFX7 ? 512 : 256;
		shader->config.lds_size =
			align(rtld->lds_size, alloc_granularity) / alloc_granularity;
	}

	return ok;
}
984
985 static unsigned si_get_shader_binary_size(struct si_screen *screen, struct si_shader *shader)
986 {
987 struct ac_rtld_binary rtld;
988 si_shader_binary_open(screen, shader, &rtld);
989 return rtld.exec_size;
990 }
991
992 static bool si_get_external_symbol(void *data, const char *name, uint64_t *value)
993 {
994 uint64_t *scratch_va = data;
995
996 if (!strcmp(scratch_rsrc_dword0_symbol, name)) {
997 *value = (uint32_t)*scratch_va;
998 return true;
999 }
1000 if (!strcmp(scratch_rsrc_dword1_symbol, name)) {
1001 /* Enable scratch coalescing. */
1002 *value = S_008F04_BASE_ADDRESS_HI(*scratch_va >> 32) |
1003 S_008F04_SWIZZLE_ENABLE(1);
1004 return true;
1005 }
1006
1007 return false;
1008 }
1009
1010 bool si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader,
1011 uint64_t scratch_va)
1012 {
1013 struct ac_rtld_binary binary;
1014 if (!si_shader_binary_open(sscreen, shader, &binary))
1015 return false;
1016
1017 si_resource_reference(&shader->bo, NULL);
1018 shader->bo = si_aligned_buffer_create(&sscreen->b,
1019 sscreen->info.cpdma_prefetch_writes_memory ?
1020 0 : SI_RESOURCE_FLAG_READ_ONLY,
1021 PIPE_USAGE_IMMUTABLE,
1022 align(binary.rx_size, SI_CPDMA_ALIGNMENT),
1023 256);
1024 if (!shader->bo)
1025 return false;
1026
1027 /* Upload. */
1028 struct ac_rtld_upload_info u = {};
1029 u.binary = &binary;
1030 u.get_external_symbol = si_get_external_symbol;
1031 u.cb_data = &scratch_va;
1032 u.rx_va = shader->bo->gpu_address;
1033 u.rx_ptr = sscreen->ws->buffer_map(shader->bo->buf, NULL,
1034 PIPE_TRANSFER_READ_WRITE |
1035 PIPE_TRANSFER_UNSYNCHRONIZED |
1036 RADEON_TRANSFER_TEMPORARY);
1037 if (!u.rx_ptr)
1038 return false;
1039
1040 bool ok = ac_rtld_upload(&u);
1041
1042 sscreen->ws->buffer_unmap(shader->bo->buf);
1043 ac_rtld_close(&binary);
1044
1045 return ok;
1046 }
1047
1048 static void si_shader_dump_disassembly(struct si_screen *screen,
1049 const struct si_shader_binary *binary,
1050 enum pipe_shader_type shader_type,
1051 unsigned wave_size,
1052 struct pipe_debug_callback *debug,
1053 const char *name, FILE *file)
1054 {
1055 struct ac_rtld_binary rtld_binary;
1056
1057 if (!ac_rtld_open(&rtld_binary, (struct ac_rtld_open_info){
1058 .info = &screen->info,
1059 .shader_type = tgsi_processor_to_shader_stage(shader_type),
1060 .wave_size = wave_size,
1061 .num_parts = 1,
1062 .elf_ptrs = &binary->elf_buffer,
1063 .elf_sizes = &binary->elf_size }))
1064 return;
1065
1066 const char *disasm;
1067 size_t nbytes;
1068
1069 if (!ac_rtld_get_section_by_name(&rtld_binary, ".AMDGPU.disasm", &disasm, &nbytes))
1070 goto out;
1071
1072 if (nbytes > INT_MAX)
1073 goto out;
1074
1075 if (debug && debug->debug_message) {
1076 /* Very long debug messages are cut off, so send the
1077 * disassembly one line at a time. This causes more
1078 * overhead, but on the plus side it simplifies
1079 * parsing of resulting logs.
1080 */
1081 pipe_debug_message(debug, SHADER_INFO,
1082 "Shader Disassembly Begin");
1083
1084 uint64_t line = 0;
1085 while (line < nbytes) {
1086 int count = nbytes - line;
1087 const char *nl = memchr(disasm + line, '\n', nbytes - line);
1088 if (nl)
1089 count = nl - (disasm + line);
1090
1091 if (count) {
1092 pipe_debug_message(debug, SHADER_INFO,
1093 "%.*s", count, disasm + line);
1094 }
1095
1096 line += count + 1;
1097 }
1098
1099 pipe_debug_message(debug, SHADER_INFO,
1100 "Shader Disassembly End");
1101 }
1102
1103 if (file) {
1104 fprintf(file, "Shader %s disassembly:\n", name);
1105 fprintf(file, "%*s", (int)nbytes, disasm);
1106 }
1107
1108 out:
1109 ac_rtld_close(&rtld_binary);
1110 }
1111
/* Estimate how many waves can be resident on one SIMD for this shader,
 * limited by its SGPR count, VGPR count and (for PS/CS) LDS usage, and
 * store the result in shader->info.max_simd_waves. The count is expressed
 * as Wave64 waves so that Wave32 and Wave64 compiles are comparable in
 * shader-db.
 */
static void si_calculate_max_simd_waves(struct si_shader *shader)
{
	struct si_screen *sscreen = shader->selector->screen;
	struct ac_shader_config *conf = &shader->config;
	unsigned num_inputs = shader->selector->info.num_inputs;
	/* LDS allocation granule: 512 bytes on GFX7+, 256 before. */
	unsigned lds_increment = sscreen->info.chip_class >= GFX7 ? 512 : 256;
	unsigned lds_per_wave = 0;
	unsigned max_simd_waves;

	max_simd_waves = sscreen->info.max_wave64_per_simd;

	/* Compute LDS usage for PS. */
	switch (shader->selector->type) {
	case PIPE_SHADER_FRAGMENT:
		/* The minimum usage per wave is (num_inputs * 48). The maximum
		 * usage is (num_inputs * 48 * 16).
		 * We can get anything in between and it varies between waves.
		 *
		 * The 48 bytes per input for a single primitive is equal to
		 * 4 bytes/component * 4 components/input * 3 points.
		 *
		 * Other stages don't know the size at compile time or don't
		 * allocate LDS per wave, but instead they do it per thread group.
		 */
		lds_per_wave = conf->lds_size * lds_increment +
			       align(num_inputs * 48, lds_increment);
		break;
	case PIPE_SHADER_COMPUTE:
		if (shader->selector) {
			/* Divide the workgroup's LDS allocation evenly among
			 * its waves. */
			unsigned max_workgroup_size =
				si_get_max_workgroup_size(shader);
			lds_per_wave = (conf->lds_size * lds_increment) /
				       DIV_ROUND_UP(max_workgroup_size,
						    sscreen->compute_wave_size);
		}
		break;
	default:;
	}

	/* Compute the per-SIMD wave counts. */
	if (conf->num_sgprs) {
		max_simd_waves =
			MIN2(max_simd_waves,
			     sscreen->info.num_physical_sgprs_per_simd / conf->num_sgprs);
	}

	if (conf->num_vgprs) {
		/* Always print wave limits as Wave64, so that we can compare
		 * Wave32 and Wave64 with shader-db fairly. */
		unsigned max_vgprs = sscreen->info.num_physical_wave64_vgprs_per_simd;
		max_simd_waves = MIN2(max_simd_waves, max_vgprs / conf->num_vgprs);
	}

	/* LDS is 64KB per CU (4 SIMDs) on GFX6-9, which is 16KB per SIMD (usage above
	 * 16KB makes some SIMDs unoccupied).
	 *
	 * LDS is 128KB in WGP mode and 64KB in CU mode. Assume the WGP mode is used.
	 */
	unsigned max_lds_size = sscreen->info.chip_class >= GFX10 ? 128*1024 : 64*1024;
	unsigned max_lds_per_simd = max_lds_size / 4;
	if (lds_per_wave)
		max_simd_waves = MIN2(max_simd_waves, max_lds_per_simd / lds_per_wave);

	shader->info.max_simd_waves = max_simd_waves;
}
1177
1178 void si_shader_dump_stats_for_shader_db(struct si_screen *screen,
1179 struct si_shader *shader,
1180 struct pipe_debug_callback *debug)
1181 {
1182 const struct ac_shader_config *conf = &shader->config;
1183
1184 if (screen->options.debug_disassembly)
1185 si_shader_dump_disassembly(screen, &shader->binary,
1186 shader->selector->type,
1187 si_get_shader_wave_size(shader),
1188 debug, "main", NULL);
1189
1190 pipe_debug_message(debug, SHADER_INFO,
1191 "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
1192 "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "
1193 "Spilled VGPRs: %d PrivMem VGPRs: %d",
1194 conf->num_sgprs, conf->num_vgprs,
1195 si_get_shader_binary_size(screen, shader),
1196 conf->lds_size, conf->scratch_bytes_per_wave,
1197 shader->info.max_simd_waves, conf->spilled_sgprs,
1198 conf->spilled_vgprs, shader->info.private_mem_vgprs);
1199 }
1200
1201 static void si_shader_dump_stats(struct si_screen *sscreen,
1202 struct si_shader *shader,
1203 FILE *file,
1204 bool check_debug_option)
1205 {
1206 const struct ac_shader_config *conf = &shader->config;
1207
1208 if (!check_debug_option ||
1209 si_can_dump_shader(sscreen, shader->selector->type)) {
1210 if (shader->selector->type == PIPE_SHADER_FRAGMENT) {
1211 fprintf(file, "*** SHADER CONFIG ***\n"
1212 "SPI_PS_INPUT_ADDR = 0x%04x\n"
1213 "SPI_PS_INPUT_ENA = 0x%04x\n",
1214 conf->spi_ps_input_addr, conf->spi_ps_input_ena);
1215 }
1216
1217 fprintf(file, "*** SHADER STATS ***\n"
1218 "SGPRS: %d\n"
1219 "VGPRS: %d\n"
1220 "Spilled SGPRs: %d\n"
1221 "Spilled VGPRs: %d\n"
1222 "Private memory VGPRs: %d\n"
1223 "Code Size: %d bytes\n"
1224 "LDS: %d blocks\n"
1225 "Scratch: %d bytes per wave\n"
1226 "Max Waves: %d\n"
1227 "********************\n\n\n",
1228 conf->num_sgprs, conf->num_vgprs,
1229 conf->spilled_sgprs, conf->spilled_vgprs,
1230 shader->info.private_mem_vgprs,
1231 si_get_shader_binary_size(sscreen, shader),
1232 conf->lds_size, conf->scratch_bytes_per_wave,
1233 shader->info.max_simd_waves);
1234 }
1235 }
1236
1237 const char *si_get_shader_name(const struct si_shader *shader)
1238 {
1239 switch (shader->selector->type) {
1240 case PIPE_SHADER_VERTEX:
1241 if (shader->key.as_es)
1242 return "Vertex Shader as ES";
1243 else if (shader->key.as_ls)
1244 return "Vertex Shader as LS";
1245 else if (shader->key.opt.vs_as_prim_discard_cs)
1246 return "Vertex Shader as Primitive Discard CS";
1247 else if (shader->key.as_ngg)
1248 return "Vertex Shader as ESGS";
1249 else
1250 return "Vertex Shader as VS";
1251 case PIPE_SHADER_TESS_CTRL:
1252 return "Tessellation Control Shader";
1253 case PIPE_SHADER_TESS_EVAL:
1254 if (shader->key.as_es)
1255 return "Tessellation Evaluation Shader as ES";
1256 else if (shader->key.as_ngg)
1257 return "Tessellation Evaluation Shader as ESGS";
1258 else
1259 return "Tessellation Evaluation Shader as VS";
1260 case PIPE_SHADER_GEOMETRY:
1261 if (shader->is_gs_copy_shader)
1262 return "GS Copy Shader as VS";
1263 else
1264 return "Geometry Shader";
1265 case PIPE_SHADER_FRAGMENT:
1266 return "Pixel Shader";
1267 case PIPE_SHADER_COMPUTE:
1268 return "Compute Shader";
1269 default:
1270 return "Unknown Shader";
1271 }
1272 }
1273
/* Dump everything known about the shader: the shader key, the saved LLVM
 * IR (only for unconditional dumps), the disassembly of every shader part
 * (prolog(s), previous merged stage, main, epilog) and the statistics.
 *
 * \param check_debug_option if true, each section is only printed when the
 *        corresponding shader-dump debug flags allow it
 */
void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
		    struct pipe_debug_callback *debug,
		    FILE *file, bool check_debug_option)
{
	enum pipe_shader_type shader_type = shader->selector->type;

	if (!check_debug_option ||
	    si_can_dump_shader(sscreen, shader_type))
		si_dump_shader_key(shader, file);

	/* The LLVM IR is only kept when dumps were requested at compile time,
	 * and is only printed for unconditional dumps. */
	if (!check_debug_option && shader->binary.llvm_ir_string) {
		if (shader->previous_stage &&
		    shader->previous_stage->binary.llvm_ir_string) {
			fprintf(file, "\n%s - previous stage - LLVM IR:\n\n",
				si_get_shader_name(shader));
			fprintf(file, "%s\n", shader->previous_stage->binary.llvm_ir_string);
		}

		fprintf(file, "\n%s - main shader part - LLVM IR:\n\n",
			si_get_shader_name(shader));
		fprintf(file, "%s\n", shader->binary.llvm_ir_string);
	}

	if (!check_debug_option ||
	    (si_can_dump_shader(sscreen, shader_type) &&
	     !(sscreen->debug_flags & DBG(NO_ASM)))) {
		unsigned wave_size = si_get_shader_wave_size(shader);

		fprintf(file, "\n%s:\n", si_get_shader_name(shader));

		/* Dump all present parts in execution order. */
		if (shader->prolog)
			si_shader_dump_disassembly(sscreen, &shader->prolog->binary,
						   shader_type, wave_size, debug, "prolog", file);
		if (shader->previous_stage)
			si_shader_dump_disassembly(sscreen, &shader->previous_stage->binary,
						   shader_type, wave_size, debug, "previous stage", file);
		if (shader->prolog2)
			si_shader_dump_disassembly(sscreen, &shader->prolog2->binary,
						   shader_type, wave_size, debug, "prolog2", file);

		si_shader_dump_disassembly(sscreen, &shader->binary, shader_type,
					   wave_size, debug, "main", file);

		if (shader->epilog)
			si_shader_dump_disassembly(sscreen, &shader->epilog->binary,
						   shader_type, wave_size, debug, "epilog", file);
		fprintf(file, "\n");
	}

	si_shader_dump_stats(sscreen, shader, file, check_debug_option);
}
1325
1326 static void si_dump_shader_key_vs(const struct si_shader_key *key,
1327 const struct si_vs_prolog_bits *prolog,
1328 const char *prefix, FILE *f)
1329 {
1330 fprintf(f, " %s.instance_divisor_is_one = %u\n",
1331 prefix, prolog->instance_divisor_is_one);
1332 fprintf(f, " %s.instance_divisor_is_fetched = %u\n",
1333 prefix, prolog->instance_divisor_is_fetched);
1334 fprintf(f, " %s.unpack_instance_id_from_vertex_id = %u\n",
1335 prefix, prolog->unpack_instance_id_from_vertex_id);
1336 fprintf(f, " %s.ls_vgpr_fix = %u\n",
1337 prefix, prolog->ls_vgpr_fix);
1338
1339 fprintf(f, " mono.vs.fetch_opencode = %x\n", key->mono.vs_fetch_opencode);
1340 fprintf(f, " mono.vs.fix_fetch = {");
1341 for (int i = 0; i < SI_MAX_ATTRIBS; i++) {
1342 union si_vs_fix_fetch fix = key->mono.vs_fix_fetch[i];
1343 if (i)
1344 fprintf(f, ", ");
1345 if (!fix.bits)
1346 fprintf(f, "0");
1347 else
1348 fprintf(f, "%u.%u.%u.%u", fix.u.reverse, fix.u.log_size,
1349 fix.u.num_channels_m1, fix.u.format);
1350 }
1351 fprintf(f, "}\n");
1352 }
1353
/* Print the entire shader key to "f" for debugging, one "  name = value"
 * line per field, with a stage-specific section followed by the fields
 * shared by all hardware-VS stages.
 */
static void si_dump_shader_key(const struct si_shader *shader, FILE *f)
{
	const struct si_shader_key *key = &shader->key;
	enum pipe_shader_type shader_type = shader->selector->type;

	fprintf(f, "SHADER KEY\n");

	switch (shader_type) {
	case PIPE_SHADER_VERTEX:
		si_dump_shader_key_vs(key, &key->part.vs.prolog,
				      "part.vs.prolog", f);
		fprintf(f, "  as_es = %u\n", key->as_es);
		fprintf(f, "  as_ls = %u\n", key->as_ls);
		fprintf(f, "  as_ngg = %u\n", key->as_ngg);
		fprintf(f, "  mono.u.vs_export_prim_id = %u\n",
			key->mono.u.vs_export_prim_id);
		/* Primitive-discard compute shader variant of the VS. */
		fprintf(f, "  opt.vs_as_prim_discard_cs = %u\n",
			key->opt.vs_as_prim_discard_cs);
		fprintf(f, "  opt.cs_prim_type = %s\n",
			tgsi_primitive_names[key->opt.cs_prim_type]);
		fprintf(f, "  opt.cs_indexed = %u\n",
			key->opt.cs_indexed);
		fprintf(f, "  opt.cs_instancing = %u\n",
			key->opt.cs_instancing);
		fprintf(f, "  opt.cs_primitive_restart = %u\n",
			key->opt.cs_primitive_restart);
		fprintf(f, "  opt.cs_provoking_vertex_first = %u\n",
			key->opt.cs_provoking_vertex_first);
		fprintf(f, "  opt.cs_need_correct_orientation = %u\n",
			key->opt.cs_need_correct_orientation);
		fprintf(f, "  opt.cs_cull_front = %u\n",
			key->opt.cs_cull_front);
		fprintf(f, "  opt.cs_cull_back = %u\n",
			key->opt.cs_cull_back);
		fprintf(f, "  opt.cs_cull_z = %u\n",
			key->opt.cs_cull_z);
		fprintf(f, "  opt.cs_halfz_clip_space = %u\n",
			key->opt.cs_halfz_clip_space);
		break;

	case PIPE_SHADER_TESS_CTRL:
		/* On GFX9 the TCS is merged with an LS whose prolog bits live
		 * in the TCS key. */
		if (shader->selector->screen->info.chip_class >= GFX9) {
			si_dump_shader_key_vs(key, &key->part.tcs.ls_prolog,
					      "part.tcs.ls_prolog", f);
		}
		fprintf(f, "  part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode);
		fprintf(f, "  mono.u.ff_tcs_inputs_to_copy = 0x%"PRIx64"\n", key->mono.u.ff_tcs_inputs_to_copy);
		break;

	case PIPE_SHADER_TESS_EVAL:
		fprintf(f, "  as_es = %u\n", key->as_es);
		fprintf(f, "  as_ngg = %u\n", key->as_ngg);
		fprintf(f, "  mono.u.vs_export_prim_id = %u\n",
			key->mono.u.vs_export_prim_id);
		break;

	case PIPE_SHADER_GEOMETRY:
		if (shader->is_gs_copy_shader)
			break;

		/* On GFX9 the GS is merged with an ES; dump the ES prolog
		 * bits when the preceding stage is a VS. */
		if (shader->selector->screen->info.chip_class >= GFX9 &&
		    key->part.gs.es->type == PIPE_SHADER_VERTEX) {
			si_dump_shader_key_vs(key, &key->part.gs.vs_prolog,
					      "part.gs.vs_prolog", f);
		}
		fprintf(f, "  part.gs.prolog.tri_strip_adj_fix = %u\n", key->part.gs.prolog.tri_strip_adj_fix);
		fprintf(f, "  part.gs.prolog.gfx9_prev_is_vs = %u\n", key->part.gs.prolog.gfx9_prev_is_vs);
		fprintf(f, "  as_ngg = %u\n", key->as_ngg);
		break;

	case PIPE_SHADER_COMPUTE:
		break;

	case PIPE_SHADER_FRAGMENT:
		fprintf(f, "  part.ps.prolog.color_two_side = %u\n", key->part.ps.prolog.color_two_side);
		fprintf(f, "  part.ps.prolog.flatshade_colors = %u\n", key->part.ps.prolog.flatshade_colors);
		fprintf(f, "  part.ps.prolog.poly_stipple = %u\n", key->part.ps.prolog.poly_stipple);
		fprintf(f, "  part.ps.prolog.force_persp_sample_interp = %u\n", key->part.ps.prolog.force_persp_sample_interp);
		fprintf(f, "  part.ps.prolog.force_linear_sample_interp = %u\n", key->part.ps.prolog.force_linear_sample_interp);
		fprintf(f, "  part.ps.prolog.force_persp_center_interp = %u\n", key->part.ps.prolog.force_persp_center_interp);
		fprintf(f, "  part.ps.prolog.force_linear_center_interp = %u\n", key->part.ps.prolog.force_linear_center_interp);
		fprintf(f, "  part.ps.prolog.bc_optimize_for_persp = %u\n", key->part.ps.prolog.bc_optimize_for_persp);
		fprintf(f, "  part.ps.prolog.bc_optimize_for_linear = %u\n", key->part.ps.prolog.bc_optimize_for_linear);
		fprintf(f, "  part.ps.prolog.samplemask_log_ps_iter = %u\n", key->part.ps.prolog.samplemask_log_ps_iter);
		fprintf(f, "  part.ps.epilog.spi_shader_col_format = 0x%x\n", key->part.ps.epilog.spi_shader_col_format);
		fprintf(f, "  part.ps.epilog.color_is_int8 = 0x%X\n", key->part.ps.epilog.color_is_int8);
		fprintf(f, "  part.ps.epilog.color_is_int10 = 0x%X\n", key->part.ps.epilog.color_is_int10);
		fprintf(f, "  part.ps.epilog.last_cbuf = %u\n", key->part.ps.epilog.last_cbuf);
		fprintf(f, "  part.ps.epilog.alpha_func = %u\n", key->part.ps.epilog.alpha_func);
		fprintf(f, "  part.ps.epilog.alpha_to_one = %u\n", key->part.ps.epilog.alpha_to_one);
		fprintf(f, "  part.ps.epilog.poly_line_smoothing = %u\n", key->part.ps.epilog.poly_line_smoothing);
		fprintf(f, "  part.ps.epilog.clamp_color = %u\n", key->part.ps.epilog.clamp_color);
		fprintf(f, "  mono.u.ps.interpolate_at_sample_force_center = %u\n", key->mono.u.ps.interpolate_at_sample_force_center);
		fprintf(f, "  mono.u.ps.fbfetch_msaa = %u\n", key->mono.u.ps.fbfetch_msaa);
		fprintf(f, "  mono.u.ps.fbfetch_is_1D = %u\n", key->mono.u.ps.fbfetch_is_1D);
		fprintf(f, "  mono.u.ps.fbfetch_layered = %u\n", key->mono.u.ps.fbfetch_layered);
		break;

	default:
		assert(0);
	}

	/* Fields shared by all stages that can run as the hardware VS. */
	if ((shader_type == PIPE_SHADER_GEOMETRY ||
	     shader_type == PIPE_SHADER_TESS_EVAL ||
	     shader_type == PIPE_SHADER_VERTEX) &&
	    !key->as_es && !key->as_ls) {
		fprintf(f, "  opt.kill_outputs = 0x%"PRIx64"\n", key->opt.kill_outputs);
		fprintf(f, "  opt.clip_disable = %u\n", key->opt.clip_disable);
		if (shader_type != PIPE_SHADER_GEOMETRY)
			fprintf(f, "  opt.ngg_culling = 0x%x\n", key->opt.ngg_culling);
	}
}
1466
1467 static void si_optimize_vs_outputs(struct si_shader_context *ctx)
1468 {
1469 struct si_shader *shader = ctx->shader;
1470 struct si_shader_info *info = &shader->selector->info;
1471
1472 if ((ctx->type != PIPE_SHADER_VERTEX &&
1473 ctx->type != PIPE_SHADER_TESS_EVAL) ||
1474 shader->key.as_ls ||
1475 shader->key.as_es)
1476 return;
1477
1478 ac_optimize_vs_outputs(&ctx->ac,
1479 ctx->main_fn,
1480 shader->info.vs_output_param_offset,
1481 info->num_outputs,
1482 &shader->info.nr_param_exports);
1483 }
1484
1485 static bool si_vs_needs_prolog(const struct si_shader_selector *sel,
1486 const struct si_vs_prolog_bits *prolog_key,
1487 const struct si_shader_key *key,
1488 bool ngg_cull_shader)
1489 {
1490 /* VGPR initialization fixup for Vega10 and Raven is always done in the
1491 * VS prolog. */
1492 return sel->vs_needs_prolog ||
1493 prolog_key->ls_vgpr_fix ||
1494 prolog_key->unpack_instance_id_from_vertex_id ||
1495 (ngg_cull_shader && key->opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL);
1496 }
1497
/* Build the LLVM IR of the main shader part:
 * - sets up the context, resource limits and per-stage ABI callbacks
 * - declares the function signature (si_create_function)
 * - preloads rings and declares the LDS objects needed by GS/NGG
 * - emits the GFX9 merged-shader EXEC setup, wrap-if and barrier
 * - translates the NIR shader (si_nir_build_llvm) and emits the return
 *
 * \param free_nir  if true, ralloc_free(nir) once translation is done
 * \return true on success
 */
static bool si_build_main_function(struct si_shader_context *ctx,
				   struct si_shader *shader,
				   struct nir_shader *nir, bool free_nir,
				   bool ngg_cull_shader)
{
	struct si_shader_selector *sel = shader->selector;
	const struct si_shader_info *info = &sel->info;

	ctx->shader = shader;
	ctx->type = sel->type;

	ctx->num_const_buffers = util_last_bit(info->const_buffers_declared);
	ctx->num_shader_buffers = util_last_bit(info->shader_buffers_declared);

	ctx->num_samplers = util_last_bit(info->samplers_declared);
	ctx->num_images = util_last_bit(info->images_declared);

	si_llvm_init_resource_callbacks(ctx);

	/* Install the stage-specific ABI callbacks. */
	switch (ctx->type) {
	case PIPE_SHADER_VERTEX:
		si_llvm_init_vs_callbacks(ctx, ngg_cull_shader);
		break;
	case PIPE_SHADER_TESS_CTRL:
		si_llvm_init_tcs_callbacks(ctx);
		break;
	case PIPE_SHADER_TESS_EVAL:
		si_llvm_init_tes_callbacks(ctx, ngg_cull_shader);
		break;
	case PIPE_SHADER_GEOMETRY:
		si_llvm_init_gs_callbacks(ctx);
		break;
	case PIPE_SHADER_FRAGMENT:
		si_llvm_init_ps_callbacks(ctx);
		break;
	case PIPE_SHADER_COMPUTE:
		ctx->abi.load_local_group_size = get_block_size;
		break;
	default:
		assert(!"Unsupported shader type");
		return false;
	}

	si_create_function(ctx, ngg_cull_shader);

	if (ctx->shader->key.as_es || ctx->type == PIPE_SHADER_GEOMETRY)
		si_preload_esgs_ring(ctx);

	if (ctx->type == PIPE_SHADER_GEOMETRY)
		si_preload_gs_rings(ctx);
	else if (ctx->type == PIPE_SHADER_TESS_EVAL)
		si_llvm_preload_tes_rings(ctx);

	/* Allocas for invocation-0 tess factors, used when all invocations
	 * define the tess factors. */
	if (ctx->type == PIPE_SHADER_TESS_CTRL &&
	    sel->info.tessfactors_are_def_in_all_invocs) {
		for (unsigned i = 0; i < 6; i++) {
			ctx->invoc0_tess_factors[i] =
				ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
		}
	}

	if (ctx->type == PIPE_SHADER_GEOMETRY) {
		/* Per-stream vertex counters (4 GS streams). */
		for (unsigned i = 0; i < 4; i++) {
			ctx->gs_next_vertex[i] =
				ac_build_alloca(&ctx->ac, ctx->ac.i32, "");
		}
		if (shader->key.as_ngg) {
			for (unsigned i = 0; i < 4; ++i) {
				ctx->gs_curprim_verts[i] =
					ac_build_alloca(&ctx->ac, ctx->ac.i32, "");
				ctx->gs_generated_prims[i] =
					ac_build_alloca(&ctx->ac, ctx->ac.i32, "");
			}

			/* LDS scratch: 8 dwords, or 44 when streamout needs it. */
			unsigned scratch_size = 8;
			if (sel->so.num_outputs)
				scratch_size = 44;

			assert(!ctx->gs_ngg_scratch);
			LLVMTypeRef ai32 = LLVMArrayType(ctx->ac.i32, scratch_size);
			ctx->gs_ngg_scratch = LLVMAddGlobalInAddressSpace(ctx->ac.module,
				ai32, "ngg_scratch", AC_ADDR_SPACE_LDS);
			LLVMSetInitializer(ctx->gs_ngg_scratch, LLVMGetUndef(ai32));
			LLVMSetAlignment(ctx->gs_ngg_scratch, 4);

			/* Unsized LDS array for emitted vertices; its actual
			 * size is resolved at link time (see "ngg_emit" in
			 * si_shader_binary_open). */
			ctx->gs_ngg_emit = LLVMAddGlobalInAddressSpace(ctx->ac.module,
				LLVMArrayType(ctx->ac.i32, 0), "ngg_emit", AC_ADDR_SPACE_LDS);
			LLVMSetLinkage(ctx->gs_ngg_emit, LLVMExternalLinkage);
			LLVMSetAlignment(ctx->gs_ngg_emit, 4);
		}
	}

	if (ctx->type != PIPE_SHADER_GEOMETRY &&
	    (shader->key.as_ngg && !shader->key.as_es)) {
		/* Unconditionally declare scratch space base for streamout and
		 * vertex compaction. Whether space is actually allocated is
		 * determined during linking / PM4 creation.
		 *
		 * Add an extra dword per vertex to ensure an odd stride, which
		 * avoids bank conflicts for SoA accesses.
		 */
		if (!gfx10_is_ngg_passthrough(shader))
			si_llvm_declare_esgs_ring(ctx);

		/* This is really only needed when streamout and / or vertex
		 * compaction is enabled.
		 */
		if (!ctx->gs_ngg_scratch &&
		    (sel->so.num_outputs || shader->key.opt.ngg_culling)) {
			LLVMTypeRef asi32 = LLVMArrayType(ctx->ac.i32, 8);
			ctx->gs_ngg_scratch = LLVMAddGlobalInAddressSpace(ctx->ac.module,
				asi32, "ngg_scratch", AC_ADDR_SPACE_LDS);
			LLVMSetInitializer(ctx->gs_ngg_scratch, LLVMGetUndef(asi32));
			LLVMSetAlignment(ctx->gs_ngg_scratch, 4);
		}
	}

	/* For GFX9 merged shaders:
	 * - Set EXEC for the first shader. If the prolog is present, set
	 *   EXEC there instead.
	 * - Add a barrier before the second shader.
	 * - In the second shader, reset EXEC to ~0 and wrap the main part in
	 *   an if-statement. This is required for correctness in geometry
	 *   shaders, to ensure that empty GS waves do not send GS_EMIT and
	 *   GS_CUT messages.
	 *
	 * For monolithic merged shaders, the first shader is wrapped in an
	 * if-block together with its prolog in si_build_wrapper_function.
	 *
	 * NGG vertex and tess eval shaders running as the last
	 * vertex/geometry stage handle execution explicitly using
	 * if-statements.
	 */
	if (ctx->screen->info.chip_class >= GFX9) {
		if (!shader->is_monolithic &&
		    (shader->key.as_es || shader->key.as_ls) &&
		    (ctx->type == PIPE_SHADER_TESS_EVAL ||
		     (ctx->type == PIPE_SHADER_VERTEX &&
		      !si_vs_needs_prolog(sel, &shader->key.part.vs.prolog,
					  &shader->key, ngg_cull_shader)))) {
			si_init_exec_from_input(ctx,
						ctx->merged_wave_info, 0);
		} else if (ctx->type == PIPE_SHADER_TESS_CTRL ||
			   ctx->type == PIPE_SHADER_GEOMETRY ||
			   (shader->key.as_ngg && !shader->key.as_es)) {
			LLVMValueRef thread_enabled;
			bool nested_barrier;

			if (!shader->is_monolithic ||
			    (ctx->type == PIPE_SHADER_TESS_EVAL &&
			     shader->key.as_ngg && !shader->key.as_es &&
			     !shader->key.opt.ngg_culling))
				ac_init_exec_full_mask(&ctx->ac);

			if ((ctx->type == PIPE_SHADER_VERTEX ||
			     ctx->type == PIPE_SHADER_TESS_EVAL) &&
			    shader->key.as_ngg && !shader->key.as_es &&
			    !shader->key.opt.ngg_culling) {
				gfx10_ngg_build_sendmsg_gs_alloc_req(ctx);

				/* Build the primitive export at the beginning
				 * of the shader if possible.
				 */
				if (gfx10_ngg_export_prim_early(shader))
					gfx10_ngg_build_export_prim(ctx, NULL, NULL);
			}

			if (ctx->type == PIPE_SHADER_TESS_CTRL ||
			    ctx->type == PIPE_SHADER_GEOMETRY) {
				if (ctx->type == PIPE_SHADER_GEOMETRY && shader->key.as_ngg) {
					gfx10_ngg_gs_emit_prologue(ctx);
					nested_barrier = false;
				} else {
					nested_barrier = true;
				}

				thread_enabled = si_is_gs_thread(ctx);
			} else {
				thread_enabled = si_is_es_thread(ctx);
				nested_barrier = false;
			}

			ctx->merged_wrap_if_entry_block = LLVMGetInsertBlock(ctx->ac.builder);
			ctx->merged_wrap_if_label = 11500;
			ac_build_ifcc(&ctx->ac, thread_enabled, ctx->merged_wrap_if_label);

			if (nested_barrier) {
				/* Execute a barrier before the second shader in
				 * a merged shader.
				 *
				 * Execute the barrier inside the conditional block,
				 * so that empty waves can jump directly to s_endpgm,
				 * which will also signal the barrier.
				 *
				 * This is possible in gfx9, because an empty wave
				 * for the second shader does not participate in
				 * the epilogue. With NGG, empty waves may still
				 * be required to export data (e.g. GS output vertices),
				 * so we cannot let them exit early.
				 *
				 * If the shader is TCS and the TCS epilog is present
				 * and contains a barrier, it will wait there and then
				 * reach s_endpgm.
				 */
				si_llvm_emit_barrier(ctx);
			}
		}
	}

	if (sel->force_correct_derivs_after_kill) {
		ctx->postponed_kill = ac_build_alloca_undef(&ctx->ac, ctx->ac.i1, "");
		/* true = don't kill. */
		LLVMBuildStore(ctx->ac.builder, ctx->ac.i1true,
			       ctx->postponed_kill);
	}

	bool success = si_nir_build_llvm(ctx, nir);
	if (free_nir)
		ralloc_free(nir);
	if (!success) {
		fprintf(stderr, "Failed to translate shader from NIR to LLVM\n");
		return false;
	}

	si_llvm_build_ret(ctx, ctx->return_value);
	return true;
}
1725
1726 /**
1727 * Compute the VS prolog key, which contains all the information needed to
1728 * build the VS prolog function, and set shader->info bits where needed.
1729 *
1730 * \param info Shader info of the vertex shader.
1731 * \param num_input_sgprs Number of input SGPRs for the vertex shader.
1732 * \param has_old_ Whether the preceding shader part is the NGG cull shader.
1733 * \param prolog_key Key of the VS prolog
1734 * \param shader_out The vertex shader, or the next shader if merging LS+HS or ES+GS.
1735 * \param key Output shader part key.
1736 */
1737 static void si_get_vs_prolog_key(const struct si_shader_info *info,
1738 unsigned num_input_sgprs,
1739 bool ngg_cull_shader,
1740 const struct si_vs_prolog_bits *prolog_key,
1741 struct si_shader *shader_out,
1742 union si_shader_part_key *key)
1743 {
1744 memset(key, 0, sizeof(*key));
1745 key->vs_prolog.states = *prolog_key;
1746 key->vs_prolog.num_input_sgprs = num_input_sgprs;
1747 key->vs_prolog.num_inputs = info->num_inputs;
1748 key->vs_prolog.as_ls = shader_out->key.as_ls;
1749 key->vs_prolog.as_es = shader_out->key.as_es;
1750 key->vs_prolog.as_ngg = shader_out->key.as_ngg;
1751
1752 if (ngg_cull_shader) {
1753 key->vs_prolog.gs_fast_launch_tri_list = !!(shader_out->key.opt.ngg_culling &
1754 SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST);
1755 key->vs_prolog.gs_fast_launch_tri_strip = !!(shader_out->key.opt.ngg_culling &
1756 SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP);
1757 } else {
1758 key->vs_prolog.has_ngg_cull_inputs = !!shader_out->key.opt.ngg_culling;
1759 }
1760
1761 if (shader_out->selector->type == PIPE_SHADER_TESS_CTRL) {
1762 key->vs_prolog.as_ls = 1;
1763 key->vs_prolog.num_merged_next_stage_vgprs = 2;
1764 } else if (shader_out->selector->type == PIPE_SHADER_GEOMETRY) {
1765 key->vs_prolog.as_es = 1;
1766 key->vs_prolog.num_merged_next_stage_vgprs = 5;
1767 } else if (shader_out->key.as_ngg) {
1768 key->vs_prolog.num_merged_next_stage_vgprs = 5;
1769 }
1770
1771 /* Enable loading the InstanceID VGPR. */
1772 uint16_t input_mask = u_bit_consecutive(0, info->num_inputs);
1773
1774 if ((key->vs_prolog.states.instance_divisor_is_one |
1775 key->vs_prolog.states.instance_divisor_is_fetched) & input_mask)
1776 shader_out->info.uses_instanceid = true;
1777 }
1778
/**
 * Given a list of shader part functions, build a wrapper function that
 * runs them in sequence to form a monolithic shader.
 *
 * \param ctx                     shader context; ctx->args is rebuilt here
 * \param parts                   LLVM functions (prologs/main/epilogs) in
 *                                execution order
 * \param num_parts               number of entries in \p parts
 * \param main_part               index of the main shader part; its parameter
 *                                types define the wrapper's argument types
 * \param next_shader_first_part  index of the first part of the second shader
 *                                in a merged (multi-part) shader; the parts
 *                                before it run under a thread-count condition
 */
void si_build_wrapper_function(struct si_shader_context *ctx, LLVMValueRef *parts,
			       unsigned num_parts, unsigned main_part,
			       unsigned next_shader_first_part)
{
	LLVMBuilderRef builder = ctx->ac.builder;
	/* PS epilog has one arg per color component; gfx9 merged shader
	 * prologs need to forward 40 SGPRs.
	 */
	LLVMValueRef initial[AC_MAX_ARGS], out[AC_MAX_ARGS];
	LLVMTypeRef function_type;
	unsigned num_first_params;
	unsigned num_out, initial_num_out;
	ASSERTED unsigned num_out_sgpr; /* used in debug checks */
	ASSERTED unsigned initial_num_out_sgpr; /* used in debug checks */
	unsigned num_sgprs, num_vgprs;
	unsigned gprs;

	memset(&ctx->args, 0, sizeof(ctx->args));

	/* Force all parts to be inlined into the wrapper and hidden from
	 * the final binary's symbol table. */
	for (unsigned i = 0; i < num_parts; ++i) {
		ac_add_function_attr(ctx->ac.context, parts[i], -1,
				     AC_FUNC_ATTR_ALWAYSINLINE);
		LLVMSetLinkage(parts[i], LLVMPrivateLinkage);
	}

	/* The parameters of the wrapper function correspond to those of the
	 * first part in terms of SGPRs and VGPRs, but we use the types of the
	 * main part to get the right types. This is relevant for the
	 * dereferenceable attribute on descriptor table pointers.
	 */
	num_sgprs = 0;
	num_vgprs = 0;

	/* Count the SGPR/VGPR dwords consumed by the first part's params. */
	function_type = LLVMGetElementType(LLVMTypeOf(parts[0]));
	num_first_params = LLVMCountParamTypes(function_type);

	for (unsigned i = 0; i < num_first_params; ++i) {
		LLVMValueRef param = LLVMGetParam(parts[0], i);

		if (ac_is_sgpr_param(param)) {
			assert(num_vgprs == 0); /* SGPRs always precede VGPRs */
			num_sgprs += ac_get_type_size(LLVMTypeOf(param)) / 4;
		} else {
			num_vgprs += ac_get_type_size(LLVMTypeOf(param)) / 4;
		}
	}

	/* Declare the wrapper's arguments, walking the main part's params
	 * until the first part's dword count is covered. */
	gprs = 0;
	while (gprs < num_sgprs + num_vgprs) {
		LLVMValueRef param = LLVMGetParam(parts[main_part], ctx->args.arg_count);
		LLVMTypeRef type = LLVMTypeOf(param);
		unsigned size = ac_get_type_size(type) / 4;

		/* This is going to get casted anyways, so we don't have to
		 * have the exact same type. But we do have to preserve the
		 * pointer-ness so that LLVM knows about it.
		 */
		enum ac_arg_type arg_type = AC_ARG_INT;
		if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) {
			type = LLVMGetElementType(type);

			if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) {
				/* v4i32 = buffer descriptor, v8i32 = image descriptor */
				if (LLVMGetVectorSize(type) == 4)
					arg_type = AC_ARG_CONST_DESC_PTR;
				else if (LLVMGetVectorSize(type) == 8)
					arg_type = AC_ARG_CONST_IMAGE_PTR;
				else
					assert(0);
			} else if (type == ctx->ac.f32) {
				arg_type = AC_ARG_CONST_FLOAT_PTR;
			} else {
				assert(0);
			}
		}

		ac_add_arg(&ctx->args, gprs < num_sgprs ? AC_ARG_SGPR : AC_ARG_VGPR,
			   size, arg_type, NULL);

		assert(ac_is_sgpr_param(param) == (gprs < num_sgprs));
		/* An argument must not straddle the SGPR/VGPR boundary. */
		assert(gprs + size <= num_sgprs + num_vgprs &&
		       (gprs >= num_sgprs || gprs + size <= num_sgprs));

		gprs += size;
	}

	/* Prepare the return type. */
	unsigned num_returns = 0;
	LLVMTypeRef returns[AC_MAX_ARGS], last_func_type, return_type;

	/* The wrapper returns whatever the last part returns. */
	last_func_type = LLVMGetElementType(LLVMTypeOf(parts[num_parts - 1]));
	return_type = LLVMGetReturnType(last_func_type);

	switch (LLVMGetTypeKind(return_type)) {
	case LLVMStructTypeKind:
		num_returns = LLVMCountStructElementTypes(return_type);
		assert(num_returns <= ARRAY_SIZE(returns));
		LLVMGetStructElementTypes(return_type, returns);
		break;
	case LLVMVoidTypeKind:
		break;
	default:
		unreachable("unexpected type");
	}

	si_llvm_create_func(ctx, "wrapper", returns, num_returns,
			    si_get_max_workgroup_size(ctx->shader));

	if (si_is_merged_shader(ctx))
		ac_init_exec_full_mask(&ctx->ac);

	/* Record the arguments of the function as if they were an output of
	 * a previous part.
	 */
	num_out = 0;
	num_out_sgpr = 0;

	/* Flatten every wrapper argument into 32-bit scalars in out[]:
	 * SGPRs as i32, VGPRs as f32, pointers via ptrtoint. */
	for (unsigned i = 0; i < ctx->args.arg_count; ++i) {
		LLVMValueRef param = LLVMGetParam(ctx->main_fn, i);
		LLVMTypeRef param_type = LLVMTypeOf(param);
		LLVMTypeRef out_type = ctx->args.args[i].file == AC_ARG_SGPR ? ctx->ac.i32 : ctx->ac.f32;
		unsigned size = ac_get_type_size(param_type) / 4;

		if (size == 1) {
			if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
				param = LLVMBuildPtrToInt(builder, param, ctx->ac.i32, "");
				param_type = ctx->ac.i32;
			}

			if (param_type != out_type)
				param = LLVMBuildBitCast(builder, param, out_type, "");
			out[num_out++] = param;
		} else {
			LLVMTypeRef vector_type = LLVMVectorType(out_type, size);

			if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
				param = LLVMBuildPtrToInt(builder, param, ctx->ac.i64, "");
				param_type = ctx->ac.i64;
			}

			if (param_type != vector_type)
				param = LLVMBuildBitCast(builder, param, vector_type, "");

			for (unsigned j = 0; j < size; ++j)
				out[num_out++] = LLVMBuildExtractElement(
					builder, param, LLVMConstInt(ctx->ac.i32, j, 0), "");
		}

		if (ctx->args.args[i].file == AC_ARG_SGPR)
			num_out_sgpr = num_out;
	}

	/* Snapshot the toplevel inputs; the second half of a merged shader
	 * must restart from these, not from a conditional call's result. */
	memcpy(initial, out, sizeof(out));
	initial_num_out = num_out;
	initial_num_out_sgpr = num_out_sgpr;

	/* Now chain the parts. */
	LLVMValueRef ret = NULL;
	for (unsigned part = 0; part < num_parts; ++part) {
		LLVMValueRef in[AC_MAX_ARGS];
		LLVMTypeRef ret_type;
		unsigned out_idx = 0;
		unsigned num_params = LLVMCountParams(parts[part]);

		/* Merged shaders are executed conditionally depending
		 * on the number of enabled threads passed in the input SGPRs. */
		if (is_multi_part_shader(ctx) && part == 0) {
			LLVMValueRef ena, count = initial[3];

			count = LLVMBuildAnd(builder, count,
					     LLVMConstInt(ctx->ac.i32, 0x7f, 0), "");
			ena = LLVMBuildICmp(builder, LLVMIntULT,
					    ac_get_thread_id(&ctx->ac), count, "");
			ac_build_ifcc(&ctx->ac, ena, 6506);
		}

		/* Derive arguments for the next part from outputs of the
		 * previous one.
		 */
		for (unsigned param_idx = 0; param_idx < num_params; ++param_idx) {
			LLVMValueRef param;
			LLVMTypeRef param_type;
			bool is_sgpr;
			unsigned param_size;
			LLVMValueRef arg = NULL;

			param = LLVMGetParam(parts[part], param_idx);
			param_type = LLVMTypeOf(param);
			param_size = ac_get_type_size(param_type) / 4;
			is_sgpr = ac_is_sgpr_param(param);

			if (is_sgpr) {
				ac_add_function_attr(ctx->ac.context, parts[part],
						     param_idx + 1, AC_FUNC_ATTR_INREG);
			} else if (out_idx < num_out_sgpr) {
				/* Skip returned SGPRs the current part doesn't
				 * declare on the input. */
				out_idx = num_out_sgpr;
			}

			assert(out_idx + param_size <= (is_sgpr ? num_out_sgpr : num_out));

			/* Re-pack the 32-bit scalars into this part's param type. */
			if (param_size == 1)
				arg = out[out_idx];
			else
				arg = ac_build_gather_values(&ctx->ac, &out[out_idx], param_size);

			if (LLVMTypeOf(arg) != param_type) {
				if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
					if (LLVMGetPointerAddressSpace(param_type) ==
					    AC_ADDR_SPACE_CONST_32BIT) {
						arg = LLVMBuildBitCast(builder, arg, ctx->ac.i32, "");
						arg = LLVMBuildIntToPtr(builder, arg, param_type, "");
					} else {
						arg = LLVMBuildBitCast(builder, arg, ctx->ac.i64, "");
						arg = LLVMBuildIntToPtr(builder, arg, param_type, "");
					}
				} else {
					arg = LLVMBuildBitCast(builder, arg, param_type, "");
				}
			}

			in[param_idx] = arg;
			out_idx += param_size;
		}

		ret = ac_build_call(&ctx->ac, parts[part], in, num_params);

		if (is_multi_part_shader(ctx) &&
		    part + 1 == next_shader_first_part) {
			ac_build_endif(&ctx->ac, 6506);

			/* The second half of the merged shader should use
			 * the inputs from the toplevel (wrapper) function,
			 * not the return value from the last call.
			 *
			 * That's because the last call was executed condi-
			 * tionally, so we can't consume it in the main
			 * block.
			 */
			memcpy(out, initial, sizeof(initial));
			num_out = initial_num_out;
			num_out_sgpr = initial_num_out_sgpr;
			continue;
		}

		/* Extract the returned GPRs. */
		ret_type = LLVMTypeOf(ret);
		num_out = 0;
		num_out_sgpr = 0;

		if (LLVMGetTypeKind(ret_type) != LLVMVoidTypeKind) {
			assert(LLVMGetTypeKind(ret_type) == LLVMStructTypeKind);

			unsigned ret_size = LLVMCountStructElementTypes(ret_type);

			for (unsigned i = 0; i < ret_size; ++i) {
				LLVMValueRef val =
					LLVMBuildExtractValue(builder, ret, i, "");

				assert(num_out < ARRAY_SIZE(out));
				out[num_out++] = val;

				/* i32 elements are SGPRs and come first by convention. */
				if (LLVMTypeOf(val) == ctx->ac.i32) {
					assert(num_out_sgpr + 1 == num_out);
					num_out_sgpr = num_out;
				}
			}
		}
	}

	/* Return the value from the last part. */
	if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind)
		LLVMBuildRetVoid(builder);
	else
		LLVMBuildRet(builder, ret);
}
2059
2060 static bool si_should_optimize_less(struct ac_llvm_compiler *compiler,
2061 struct si_shader_selector *sel)
2062 {
2063 if (!compiler->low_opt_passes)
2064 return false;
2065
2066 /* Assume a slow CPU. */
2067 assert(!sel->screen->info.has_dedicated_vram &&
2068 sel->screen->info.chip_class <= GFX8);
2069
2070 /* For a crazy dEQP test containing 2597 memory opcodes, mostly
2071 * buffer stores. */
2072 return sel->type == PIPE_SHADER_COMPUTE &&
2073 sel->info.num_memory_instructions > 1000;
2074 }
2075
2076 static struct nir_shader *get_nir_shader(struct si_shader_selector *sel,
2077 bool *free_nir)
2078 {
2079 *free_nir = false;
2080
2081 if (sel->nir) {
2082 return sel->nir;
2083 } else if (sel->nir_binary) {
2084 struct pipe_screen *screen = &sel->screen->b;
2085 const void *options =
2086 screen->get_compiler_options(screen, PIPE_SHADER_IR_NIR,
2087 sel->type);
2088
2089 struct blob_reader blob_reader;
2090 blob_reader_init(&blob_reader, sel->nir_binary, sel->nir_size);
2091 *free_nir = true;
2092 return nir_deserialize(NULL, options, &blob_reader);
2093 }
2094 return NULL;
2095 }
2096
/**
 * Compile a shader (and, for monolithic shaders, all of its parts) to
 * ISA via LLVM.
 *
 * Builds the main function from NIR, optionally builds prologs/epilogs and
 * an NGG culling variant, stitches them with a wrapper function for
 * monolithic shaders, runs LLVM optimizations, and compiles to bytecode.
 *
 * \param sscreen   screen
 * \param compiler  LLVM compiler instance
 * \param shader    shader to compile; binary/config/info are filled in
 * \param debug     debug callback (may be NULL)
 * \return 0 on success, negative on failure
 */
int si_compile_shader(struct si_screen *sscreen,
		      struct ac_llvm_compiler *compiler,
		      struct si_shader *shader,
		      struct pipe_debug_callback *debug)
{
	struct si_shader_selector *sel = shader->selector;
	struct si_shader_context ctx;
	bool free_nir;
	struct nir_shader *nir = get_nir_shader(sel, &free_nir);
	int r = -1;

	/* Dump NIR before doing NIR->LLVM conversion in case the
	 * conversion fails. */
	if (si_can_dump_shader(sscreen, sel->type) &&
	    !(sscreen->debug_flags & DBG(NO_NIR))) {
		nir_print_shader(nir, stderr);
		si_dump_streamout(&sel->so);
	}

	si_llvm_context_init(&ctx, sscreen, compiler, si_get_shader_wave_size(shader));

	/* Mark all VS output params as unexported until proven otherwise. */
	memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
	       sizeof(shader->info.vs_output_param_offset));

	shader->info.uses_instanceid = sel->info.uses_instanceid;

	/* If NGG culling is enabled, build a second main function variant
	 * (the cull shader) first; it runs before the regular main part. */
	LLVMValueRef ngg_cull_main_fn = NULL;
	if (shader->key.opt.ngg_culling) {
		if (!si_build_main_function(&ctx, shader, nir, false, true)) {
			si_llvm_dispose(&ctx);
			return -1;
		}
		ngg_cull_main_fn = ctx.main_fn;
		ctx.main_fn = NULL;
	}

	/* Build the regular main function; this may free "nir". */
	if (!si_build_main_function(&ctx, shader, nir, free_nir, false)) {
		si_llvm_dispose(&ctx);
		return -1;
	}

	if (shader->is_monolithic && ctx.type == PIPE_SHADER_VERTEX) {
		/* Monolithic VS: optionally [prolog, cull shader,] [prolog,] main. */
		LLVMValueRef parts[4];
		unsigned num_parts = 0;
		bool has_prolog = false;
		LLVMValueRef main_fn = ctx.main_fn;

		if (ngg_cull_main_fn) {
			/* Prolog for the NGG cull shader, if it needs one. */
			if (si_vs_needs_prolog(sel, &shader->key.part.vs.prolog,
					       &shader->key, true)) {
				union si_shader_part_key prolog_key;
				si_get_vs_prolog_key(&sel->info,
						     shader->info.num_input_sgprs,
						     true,
						     &shader->key.part.vs.prolog,
						     shader, &prolog_key);
				prolog_key.vs_prolog.is_monolithic = true;
				si_llvm_build_vs_prolog(&ctx, &prolog_key);
				parts[num_parts++] = ctx.main_fn;
				has_prolog = true;
			}
			parts[num_parts++] = ngg_cull_main_fn;
		}

		/* Prolog for the regular main part, if it needs one. */
		if (si_vs_needs_prolog(sel, &shader->key.part.vs.prolog,
				       &shader->key, false)) {
			union si_shader_part_key prolog_key;
			si_get_vs_prolog_key(&sel->info,
					     shader->info.num_input_sgprs,
					     false,
					     &shader->key.part.vs.prolog,
					     shader, &prolog_key);
			prolog_key.vs_prolog.is_monolithic = true;
			si_llvm_build_vs_prolog(&ctx, &prolog_key);
			parts[num_parts++] = ctx.main_fn;
			has_prolog = true;
		}
		parts[num_parts++] = main_fn;

		si_build_wrapper_function(&ctx, parts, num_parts,
					  has_prolog ? 1 : 0, 0);

		if (ctx.shader->key.opt.vs_as_prim_discard_cs)
			si_build_prim_discard_compute_shader(&ctx);
	} else if (shader->is_monolithic && ctx.type == PIPE_SHADER_TESS_EVAL &&
		   ngg_cull_main_fn) {
		/* Monolithic TES with NGG culling: cull shader + main. */
		LLVMValueRef parts[2];

		parts[0] = ngg_cull_main_fn;
		parts[1] = ctx.main_fn;

		si_build_wrapper_function(&ctx, parts, 2, 0, 0);
	} else if (shader->is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) {
		if (sscreen->info.chip_class >= GFX9) {
			/* GFX9+ merged LS+HS: [VS prolog,] VS-as-LS main,
			 * TCS main, TCS epilog. */
			struct si_shader_selector *ls = shader->key.part.tcs.ls;
			LLVMValueRef parts[4];
			bool vs_needs_prolog =
				si_vs_needs_prolog(ls, &shader->key.part.tcs.ls_prolog,
						   &shader->key, false);

			/* TCS main part */
			parts[2] = ctx.main_fn;

			/* TCS epilog */
			union si_shader_part_key tcs_epilog_key;
			memset(&tcs_epilog_key, 0, sizeof(tcs_epilog_key));
			tcs_epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
			si_llvm_build_tcs_epilog(&ctx, &tcs_epilog_key);
			parts[3] = ctx.main_fn;

			/* VS as LS main part */
			nir = get_nir_shader(ls, &free_nir);
			struct si_shader shader_ls = {};
			shader_ls.selector = ls;
			shader_ls.key.as_ls = 1;
			shader_ls.key.mono = shader->key.mono;
			shader_ls.key.opt = shader->key.opt;
			shader_ls.is_monolithic = true;

			if (!si_build_main_function(&ctx, &shader_ls, nir, free_nir, false)) {
				si_llvm_dispose(&ctx);
				return -1;
			}
			shader->info.uses_instanceid |= ls->info.uses_instanceid;
			parts[1] = ctx.main_fn;

			/* LS prolog */
			if (vs_needs_prolog) {
				union si_shader_part_key vs_prolog_key;
				si_get_vs_prolog_key(&ls->info,
						     shader_ls.info.num_input_sgprs,
						     false,
						     &shader->key.part.tcs.ls_prolog,
						     shader, &vs_prolog_key);
				vs_prolog_key.vs_prolog.is_monolithic = true;
				si_llvm_build_vs_prolog(&ctx, &vs_prolog_key);
				parts[0] = ctx.main_fn;
			}

			/* Reset the shader context. */
			ctx.shader = shader;
			ctx.type = PIPE_SHADER_TESS_CTRL;

			si_build_wrapper_function(&ctx,
						  parts + !vs_needs_prolog,
						  4 - !vs_needs_prolog, vs_needs_prolog,
						  vs_needs_prolog ? 2 : 1);
		} else {
			/* Pre-GFX9: TCS main + epilog only. */
			LLVMValueRef parts[2];
			union si_shader_part_key epilog_key;

			parts[0] = ctx.main_fn;

			memset(&epilog_key, 0, sizeof(epilog_key));
			epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
			si_llvm_build_tcs_epilog(&ctx, &epilog_key);
			parts[1] = ctx.main_fn;

			si_build_wrapper_function(&ctx, parts, 2, 0, 0);
		}
	} else if (shader->is_monolithic && ctx.type == PIPE_SHADER_GEOMETRY) {
		if (ctx.screen->info.chip_class >= GFX9) {
			/* GFX9+ merged ES+GS: [ES prolog,] ES main,
			 * GS prolog, GS main. */
			struct si_shader_selector *es = shader->key.part.gs.es;
			LLVMValueRef es_prolog = NULL;
			LLVMValueRef es_main = NULL;
			LLVMValueRef gs_prolog = NULL;
			LLVMValueRef gs_main = ctx.main_fn;

			/* GS prolog */
			union si_shader_part_key gs_prolog_key;
			memset(&gs_prolog_key, 0, sizeof(gs_prolog_key));
			gs_prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
			gs_prolog_key.gs_prolog.is_monolithic = true;
			gs_prolog_key.gs_prolog.as_ngg = shader->key.as_ngg;
			si_llvm_build_gs_prolog(&ctx, &gs_prolog_key);
			gs_prolog = ctx.main_fn;

			/* ES main part */
			nir = get_nir_shader(es, &free_nir);
			struct si_shader shader_es = {};
			shader_es.selector = es;
			shader_es.key.as_es = 1;
			shader_es.key.as_ngg = shader->key.as_ngg;
			shader_es.key.mono = shader->key.mono;
			shader_es.key.opt = shader->key.opt;
			shader_es.is_monolithic = true;

			if (!si_build_main_function(&ctx, &shader_es, nir, free_nir, false)) {
				si_llvm_dispose(&ctx);
				return -1;
			}
			shader->info.uses_instanceid |= es->info.uses_instanceid;
			es_main = ctx.main_fn;

			/* ES prolog */
			if (es->type == PIPE_SHADER_VERTEX &&
			    si_vs_needs_prolog(es, &shader->key.part.gs.vs_prolog,
					       &shader->key, false)) {
				union si_shader_part_key vs_prolog_key;
				si_get_vs_prolog_key(&es->info,
						     shader_es.info.num_input_sgprs,
						     false,
						     &shader->key.part.gs.vs_prolog,
						     shader, &vs_prolog_key);
				vs_prolog_key.vs_prolog.is_monolithic = true;
				si_llvm_build_vs_prolog(&ctx, &vs_prolog_key);
				es_prolog = ctx.main_fn;
			}

			/* Reset the shader context. */
			ctx.shader = shader;
			ctx.type = PIPE_SHADER_GEOMETRY;

			/* Prepare the array of shader parts. */
			LLVMValueRef parts[4];
			unsigned num_parts = 0, main_part, next_first_part;

			if (es_prolog)
				parts[num_parts++] = es_prolog;

			parts[main_part = num_parts++] = es_main;
			parts[next_first_part = num_parts++] = gs_prolog;
			parts[num_parts++] = gs_main;

			si_build_wrapper_function(&ctx, parts, num_parts,
						  main_part, next_first_part);
		} else {
			/* Pre-GFX9: GS prolog + main only. */
			LLVMValueRef parts[2];
			union si_shader_part_key prolog_key;

			parts[1] = ctx.main_fn;

			memset(&prolog_key, 0, sizeof(prolog_key));
			prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
			si_llvm_build_gs_prolog(&ctx, &prolog_key);
			parts[0] = ctx.main_fn;

			si_build_wrapper_function(&ctx, parts, 2, 1, 0);
		}
	} else if (shader->is_monolithic && ctx.type == PIPE_SHADER_FRAGMENT) {
		si_llvm_build_monolithic_ps(&ctx, shader);
	}

	si_llvm_optimize_module(&ctx);

	/* Post-optimization transformations and analysis. */
	si_optimize_vs_outputs(&ctx);

	if ((debug && debug->debug_message) ||
	    si_can_dump_shader(sscreen, ctx.type)) {
		ctx.shader->info.private_mem_vgprs =
			ac_count_scratch_private_memory(ctx.main_fn);
	}

	/* Make sure the input is a pointer and not integer followed by inttoptr. */
	assert(LLVMGetTypeKind(LLVMTypeOf(LLVMGetParam(ctx.main_fn, 0))) ==
	       LLVMPointerTypeKind);

	/* Compile to bytecode. */
	r = si_compile_llvm(sscreen, &shader->binary, &shader->config, compiler,
			    &ctx.ac, debug, ctx.type, si_get_shader_name(shader),
			    si_should_optimize_less(compiler, shader->selector));
	si_llvm_dispose(&ctx);
	if (r) {
		fprintf(stderr, "LLVM failed to compile shader\n");
		return r;
	}

	/* Validate SGPR and VGPR usage for compute to detect compiler bugs.
	 * LLVM 3.9svn has this bug.
	 */
	if (sel->type == PIPE_SHADER_COMPUTE) {
		unsigned wave_size = sscreen->compute_wave_size;
		unsigned max_vgprs = sscreen->info.num_physical_wave64_vgprs_per_simd *
				     (wave_size == 32 ? 2 : 1);
		unsigned max_sgprs = sscreen->info.num_physical_sgprs_per_simd;
		unsigned max_sgprs_per_wave = 128;
		unsigned simds_per_tg = 4; /* assuming WGP mode on gfx10 */
		unsigned threads_per_tg = si_get_max_workgroup_size(shader);
		unsigned waves_per_tg = DIV_ROUND_UP(threads_per_tg, wave_size);
		unsigned waves_per_simd = DIV_ROUND_UP(waves_per_tg, simds_per_tg);

		max_vgprs = max_vgprs / waves_per_simd;
		max_sgprs = MIN2(max_sgprs / waves_per_simd, max_sgprs_per_wave);

		if (shader->config.num_sgprs > max_sgprs ||
		    shader->config.num_vgprs > max_vgprs) {
			fprintf(stderr, "LLVM failed to compile a shader correctly: "
				"SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n",
				shader->config.num_sgprs, shader->config.num_vgprs,
				max_sgprs, max_vgprs);

			/* Just terminate the process, because dependent
			 * shaders can hang due to bad input data, but use
			 * the env var to allow shader-db to work.
			 */
			if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false))
				abort();
		}
	}

	/* Add the scratch offset to input SGPRs. */
	if (shader->config.scratch_bytes_per_wave && !si_is_merged_shader(&ctx))
		shader->info.num_input_sgprs += 1; /* scratch byte offset */

	/* Calculate the number of fragment input VGPRs. */
	if (ctx.type == PIPE_SHADER_FRAGMENT) {
		shader->info.num_input_vgprs = ac_get_fs_input_vgpr_cnt(&shader->config,
							&shader->info.face_vgpr_index,
							&shader->info.ancillary_vgpr_index);
	}

	si_calculate_max_simd_waves(shader);
	si_shader_dump_stats_for_shader_db(sscreen, shader, debug);
	return 0;
}
2413
2414 /**
2415 * Create, compile and return a shader part (prolog or epilog).
2416 *
2417 * \param sscreen screen
2418 * \param list list of shader parts of the same category
2419 * \param type shader type
2420 * \param key shader part key
2421 * \param prolog whether the part being requested is a prolog
2422 * \param tm LLVM target machine
2423 * \param debug debug callback
2424 * \param build the callback responsible for building the main function
2425 * \return non-NULL on success
2426 */
2427 static struct si_shader_part *
2428 si_get_shader_part(struct si_screen *sscreen,
2429 struct si_shader_part **list,
2430 enum pipe_shader_type type,
2431 bool prolog,
2432 union si_shader_part_key *key,
2433 struct ac_llvm_compiler *compiler,
2434 struct pipe_debug_callback *debug,
2435 void (*build)(struct si_shader_context *,
2436 union si_shader_part_key *),
2437 const char *name)
2438 {
2439 struct si_shader_part *result;
2440
2441 simple_mtx_lock(&sscreen->shader_parts_mutex);
2442
2443 /* Find existing. */
2444 for (result = *list; result; result = result->next) {
2445 if (memcmp(&result->key, key, sizeof(*key)) == 0) {
2446 simple_mtx_unlock(&sscreen->shader_parts_mutex);
2447 return result;
2448 }
2449 }
2450
2451 /* Compile a new one. */
2452 result = CALLOC_STRUCT(si_shader_part);
2453 result->key = *key;
2454
2455 struct si_shader shader = {};
2456
2457 switch (type) {
2458 case PIPE_SHADER_VERTEX:
2459 shader.key.as_ls = key->vs_prolog.as_ls;
2460 shader.key.as_es = key->vs_prolog.as_es;
2461 shader.key.as_ngg = key->vs_prolog.as_ngg;
2462 break;
2463 case PIPE_SHADER_TESS_CTRL:
2464 assert(!prolog);
2465 shader.key.part.tcs.epilog = key->tcs_epilog.states;
2466 break;
2467 case PIPE_SHADER_GEOMETRY:
2468 assert(prolog);
2469 shader.key.as_ngg = key->gs_prolog.as_ngg;
2470 break;
2471 case PIPE_SHADER_FRAGMENT:
2472 if (prolog)
2473 shader.key.part.ps.prolog = key->ps_prolog.states;
2474 else
2475 shader.key.part.ps.epilog = key->ps_epilog.states;
2476 break;
2477 default:
2478 unreachable("bad shader part");
2479 }
2480
2481 struct si_shader_context ctx;
2482 si_llvm_context_init(&ctx, sscreen, compiler,
2483 si_get_wave_size(sscreen, type, shader.key.as_ngg,
2484 shader.key.as_es));
2485 ctx.shader = &shader;
2486 ctx.type = type;
2487
2488 build(&ctx, key);
2489
2490 /* Compile. */
2491 si_llvm_optimize_module(&ctx);
2492
2493 if (si_compile_llvm(sscreen, &result->binary, &result->config, compiler,
2494 &ctx.ac, debug, ctx.type, name, false)) {
2495 FREE(result);
2496 result = NULL;
2497 goto out;
2498 }
2499
2500 result->next = *list;
2501 *list = result;
2502
2503 out:
2504 si_llvm_dispose(&ctx);
2505 simple_mtx_unlock(&sscreen->shader_parts_mutex);
2506 return result;
2507 }
2508
2509 static bool si_get_vs_prolog(struct si_screen *sscreen,
2510 struct ac_llvm_compiler *compiler,
2511 struct si_shader *shader,
2512 struct pipe_debug_callback *debug,
2513 struct si_shader *main_part,
2514 const struct si_vs_prolog_bits *key)
2515 {
2516 struct si_shader_selector *vs = main_part->selector;
2517
2518 if (!si_vs_needs_prolog(vs, key, &shader->key, false))
2519 return true;
2520
2521 /* Get the prolog. */
2522 union si_shader_part_key prolog_key;
2523 si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs, false,
2524 key, shader, &prolog_key);
2525
2526 shader->prolog =
2527 si_get_shader_part(sscreen, &sscreen->vs_prologs,
2528 PIPE_SHADER_VERTEX, true, &prolog_key, compiler,
2529 debug, si_llvm_build_vs_prolog,
2530 "Vertex Shader Prolog");
2531 return shader->prolog != NULL;
2532 }
2533
2534 /**
2535 * Select and compile (or reuse) vertex shader parts (prolog & epilog).
2536 */
2537 static bool si_shader_select_vs_parts(struct si_screen *sscreen,
2538 struct ac_llvm_compiler *compiler,
2539 struct si_shader *shader,
2540 struct pipe_debug_callback *debug)
2541 {
2542 return si_get_vs_prolog(sscreen, compiler, shader, debug, shader,
2543 &shader->key.part.vs.prolog);
2544 }
2545
2546 /**
2547 * Select and compile (or reuse) TCS parts (epilog).
2548 */
2549 static bool si_shader_select_tcs_parts(struct si_screen *sscreen,
2550 struct ac_llvm_compiler *compiler,
2551 struct si_shader *shader,
2552 struct pipe_debug_callback *debug)
2553 {
2554 if (sscreen->info.chip_class >= GFX9) {
2555 struct si_shader *ls_main_part =
2556 shader->key.part.tcs.ls->main_shader_part_ls;
2557
2558 if (!si_get_vs_prolog(sscreen, compiler, shader, debug, ls_main_part,
2559 &shader->key.part.tcs.ls_prolog))
2560 return false;
2561
2562 shader->previous_stage = ls_main_part;
2563 }
2564
2565 /* Get the epilog. */
2566 union si_shader_part_key epilog_key;
2567 memset(&epilog_key, 0, sizeof(epilog_key));
2568 epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
2569
2570 shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs,
2571 PIPE_SHADER_TESS_CTRL, false,
2572 &epilog_key, compiler, debug,
2573 si_llvm_build_tcs_epilog,
2574 "Tessellation Control Shader Epilog");
2575 return shader->epilog != NULL;
2576 }
2577
2578 /**
2579 * Select and compile (or reuse) GS parts (prolog).
2580 */
2581 static bool si_shader_select_gs_parts(struct si_screen *sscreen,
2582 struct ac_llvm_compiler *compiler,
2583 struct si_shader *shader,
2584 struct pipe_debug_callback *debug)
2585 {
2586 if (sscreen->info.chip_class >= GFX9) {
2587 struct si_shader *es_main_part;
2588 enum pipe_shader_type es_type = shader->key.part.gs.es->type;
2589
2590 if (shader->key.as_ngg)
2591 es_main_part = shader->key.part.gs.es->main_shader_part_ngg_es;
2592 else
2593 es_main_part = shader->key.part.gs.es->main_shader_part_es;
2594
2595 if (es_type == PIPE_SHADER_VERTEX &&
2596 !si_get_vs_prolog(sscreen, compiler, shader, debug, es_main_part,
2597 &shader->key.part.gs.vs_prolog))
2598 return false;
2599
2600 shader->previous_stage = es_main_part;
2601 }
2602
2603 if (!shader->key.part.gs.prolog.tri_strip_adj_fix)
2604 return true;
2605
2606 union si_shader_part_key prolog_key;
2607 memset(&prolog_key, 0, sizeof(prolog_key));
2608 prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
2609 prolog_key.gs_prolog.as_ngg = shader->key.as_ngg;
2610
2611 shader->prolog2 = si_get_shader_part(sscreen, &sscreen->gs_prologs,
2612 PIPE_SHADER_GEOMETRY, true,
2613 &prolog_key, compiler, debug,
2614 si_llvm_build_gs_prolog,
2615 "Geometry Shader Prolog");
2616 return shader->prolog2 != NULL;
2617 }
2618
2619 /**
2620 * Compute the PS prolog key, which contains all the information needed to
2621 * build the PS prolog function, and set related bits in shader->config.
2622 */
2623 void si_get_ps_prolog_key(struct si_shader *shader,
2624 union si_shader_part_key *key,
2625 bool separate_prolog)
2626 {
2627 struct si_shader_info *info = &shader->selector->info;
2628
2629 memset(key, 0, sizeof(*key));
2630 key->ps_prolog.states = shader->key.part.ps.prolog;
2631 key->ps_prolog.colors_read = info->colors_read;
2632 key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
2633 key->ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;
2634 key->ps_prolog.wqm = info->uses_derivatives &&
2635 (key->ps_prolog.colors_read ||
2636 key->ps_prolog.states.force_persp_sample_interp ||
2637 key->ps_prolog.states.force_linear_sample_interp ||
2638 key->ps_prolog.states.force_persp_center_interp ||
2639 key->ps_prolog.states.force_linear_center_interp ||
2640 key->ps_prolog.states.bc_optimize_for_persp ||
2641 key->ps_prolog.states.bc_optimize_for_linear);
2642 key->ps_prolog.ancillary_vgpr_index = shader->info.ancillary_vgpr_index;
2643
2644 if (info->colors_read) {
2645 unsigned *color = shader->selector->color_attr_index;
2646
2647 if (shader->key.part.ps.prolog.color_two_side) {
2648 /* BCOLORs are stored after the last input. */
2649 key->ps_prolog.num_interp_inputs = info->num_inputs;
2650 key->ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;
2651 if (separate_prolog)
2652 shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
2653 }
2654
2655 for (unsigned i = 0; i < 2; i++) {
2656 unsigned interp = info->input_interpolate[color[i]];
2657 unsigned location = info->input_interpolate_loc[color[i]];
2658
2659 if (!(info->colors_read & (0xf << i*4)))
2660 continue;
2661
2662 key->ps_prolog.color_attr_index[i] = color[i];
2663
2664 if (shader->key.part.ps.prolog.flatshade_colors &&
2665 interp == TGSI_INTERPOLATE_COLOR)
2666 interp = TGSI_INTERPOLATE_CONSTANT;
2667
2668 switch (interp) {
2669 case TGSI_INTERPOLATE_CONSTANT:
2670 key->ps_prolog.color_interp_vgpr_index[i] = -1;
2671 break;
2672 case TGSI_INTERPOLATE_PERSPECTIVE:
2673 case TGSI_INTERPOLATE_COLOR:
2674 /* Force the interpolation location for colors here. */
2675 if (shader->key.part.ps.prolog.force_persp_sample_interp)
2676 location = TGSI_INTERPOLATE_LOC_SAMPLE;
2677 if (shader->key.part.ps.prolog.force_persp_center_interp)
2678 location = TGSI_INTERPOLATE_LOC_CENTER;
2679
2680 switch (location) {
2681 case TGSI_INTERPOLATE_LOC_SAMPLE:
2682 key->ps_prolog.color_interp_vgpr_index[i] = 0;
2683 if (separate_prolog) {
2684 shader->config.spi_ps_input_ena |=
2685 S_0286CC_PERSP_SAMPLE_ENA(1);
2686 }
2687 break;
2688 case TGSI_INTERPOLATE_LOC_CENTER:
2689 key->ps_prolog.color_interp_vgpr_index[i] = 2;
2690 if (separate_prolog) {
2691 shader->config.spi_ps_input_ena |=
2692 S_0286CC_PERSP_CENTER_ENA(1);
2693 }
2694 break;
2695 case TGSI_INTERPOLATE_LOC_CENTROID:
2696 key->ps_prolog.color_interp_vgpr_index[i] = 4;
2697 if (separate_prolog) {
2698 shader->config.spi_ps_input_ena |=
2699 S_0286CC_PERSP_CENTROID_ENA(1);
2700 }
2701 break;
2702 default:
2703 assert(0);
2704 }
2705 break;
2706 case TGSI_INTERPOLATE_LINEAR:
2707 /* Force the interpolation location for colors here. */
2708 if (shader->key.part.ps.prolog.force_linear_sample_interp)
2709 location = TGSI_INTERPOLATE_LOC_SAMPLE;
2710 if (shader->key.part.ps.prolog.force_linear_center_interp)
2711 location = TGSI_INTERPOLATE_LOC_CENTER;
2712
2713 /* The VGPR assignment for non-monolithic shaders
2714 * works because InitialPSInputAddr is set on the
2715 * main shader and PERSP_PULL_MODEL is never used.
2716 */
2717 switch (location) {
2718 case TGSI_INTERPOLATE_LOC_SAMPLE:
2719 key->ps_prolog.color_interp_vgpr_index[i] =
2720 separate_prolog ? 6 : 9;
2721 if (separate_prolog) {
2722 shader->config.spi_ps_input_ena |=
2723 S_0286CC_LINEAR_SAMPLE_ENA(1);
2724 }
2725 break;
2726 case TGSI_INTERPOLATE_LOC_CENTER:
2727 key->ps_prolog.color_interp_vgpr_index[i] =
2728 separate_prolog ? 8 : 11;
2729 if (separate_prolog) {
2730 shader->config.spi_ps_input_ena |=
2731 S_0286CC_LINEAR_CENTER_ENA(1);
2732 }
2733 break;
2734 case TGSI_INTERPOLATE_LOC_CENTROID:
2735 key->ps_prolog.color_interp_vgpr_index[i] =
2736 separate_prolog ? 10 : 13;
2737 if (separate_prolog) {
2738 shader->config.spi_ps_input_ena |=
2739 S_0286CC_LINEAR_CENTROID_ENA(1);
2740 }
2741 break;
2742 default:
2743 assert(0);
2744 }
2745 break;
2746 default:
2747 assert(0);
2748 }
2749 }
2750 }
2751 }
2752
2753 /**
2754 * Check whether a PS prolog is required based on the key.
2755 */
2756 bool si_need_ps_prolog(const union si_shader_part_key *key)
2757 {
2758 return key->ps_prolog.colors_read ||
2759 key->ps_prolog.states.force_persp_sample_interp ||
2760 key->ps_prolog.states.force_linear_sample_interp ||
2761 key->ps_prolog.states.force_persp_center_interp ||
2762 key->ps_prolog.states.force_linear_center_interp ||
2763 key->ps_prolog.states.bc_optimize_for_persp ||
2764 key->ps_prolog.states.bc_optimize_for_linear ||
2765 key->ps_prolog.states.poly_stipple ||
2766 key->ps_prolog.states.samplemask_log_ps_iter;
2767 }
2768
2769 /**
2770 * Compute the PS epilog key, which contains all the information needed to
2771 * build the PS epilog function.
2772 */
2773 void si_get_ps_epilog_key(struct si_shader *shader,
2774 union si_shader_part_key *key)
2775 {
2776 struct si_shader_info *info = &shader->selector->info;
2777 memset(key, 0, sizeof(*key));
2778 key->ps_epilog.colors_written = info->colors_written;
2779 key->ps_epilog.writes_z = info->writes_z;
2780 key->ps_epilog.writes_stencil = info->writes_stencil;
2781 key->ps_epilog.writes_samplemask = info->writes_samplemask;
2782 key->ps_epilog.states = shader->key.part.ps.epilog;
2783 }
2784
/**
 * Select and compile (or reuse) pixel shader parts (prolog & epilog).
 *
 * Looks up (or builds) the prolog and epilog shader parts from the screen's
 * caches, then fixes up SPI_PS_INPUT_ENA so the enabled input bits match
 * what the selected parts actually consume.
 *
 * \param sscreen   screen holding the ps_prologs/ps_epilogs part caches
 * \param compiler  LLVM compiler used if a part must be built
 * \param shader    the shader variant being assembled
 * \param debug     debug callback passed through to part compilation
 * \return          true on success, false if a required part failed to build
 */
static bool si_shader_select_ps_parts(struct si_screen *sscreen,
				      struct ac_llvm_compiler *compiler,
				      struct si_shader *shader,
				      struct pipe_debug_callback *debug)
{
	union si_shader_part_key prolog_key;
	union si_shader_part_key epilog_key;

	/* Get the prolog. */
	si_get_ps_prolog_key(shader, &prolog_key, true);

	/* The prolog is a no-op if these aren't set. */
	if (si_need_ps_prolog(&prolog_key)) {
		shader->prolog =
			si_get_shader_part(sscreen, &sscreen->ps_prologs,
					   PIPE_SHADER_FRAGMENT, true,
					   &prolog_key, compiler, debug,
					   si_llvm_build_ps_prolog,
					   "Fragment Shader Prolog");
		if (!shader->prolog)
			return false;
	}

	/* Get the epilog. The epilog is always required (it performs the
	 * color/Z/stencil exports), so failure here is fatal. */
	si_get_ps_epilog_key(shader, &epilog_key);

	shader->epilog =
		si_get_shader_part(sscreen, &sscreen->ps_epilogs,
				   PIPE_SHADER_FRAGMENT, false,
				   &epilog_key, compiler, debug,
				   si_llvm_build_ps_epilog,
				   "Fragment Shader Epilog");
	if (!shader->epilog)
		return false;

	/* Enable POS_FIXED_PT if polygon stippling is enabled. */
	if (shader->key.part.ps.prolog.poly_stipple) {
		shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
		assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr));
	}

	/* Set up the enable bits for per-sample shading if needed.
	 *
	 * Each of the four fixups below replaces a center/centroid (or
	 * sample/centroid) enable with the single location forced by the key,
	 * for perspective and linear interpolation separately. */
	if (shader->key.part.ps.prolog.force_persp_sample_interp &&
	    (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
	     G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
		shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
		shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
		shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
	}
	if (shader->key.part.ps.prolog.force_linear_sample_interp &&
	    (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
	     G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
		shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
		shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
		shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
	}
	if (shader->key.part.ps.prolog.force_persp_center_interp &&
	    (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
	     G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
		shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA;
		shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
		shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
	}
	if (shader->key.part.ps.prolog.force_linear_center_interp &&
	    (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
	     G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
		shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA;
		shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
		shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
	}

	/* POW_W_FLOAT requires that one of the perspective weights is enabled. */
	if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
	    !(shader->config.spi_ps_input_ena & 0xf)) {
		shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
		assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr));
	}

	/* At least one pair of interpolation weights must be enabled. */
	if (!(shader->config.spi_ps_input_ena & 0x7f)) {
		shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
		assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr));
	}

	/* Samplemask fixup requires the sample ID. */
	if (shader->key.part.ps.prolog.samplemask_log_ps_iter) {
		shader->config.spi_ps_input_ena |= S_0286CC_ANCILLARY_ENA(1);
		assert(G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr));
	}

	/* The sample mask input is always enabled, because the API shader always
	 * passes it through to the epilog. Disable it here if it's unused.
	 */
	if (!shader->key.part.ps.epilog.poly_line_smoothing &&
	    !shader->selector->info.reads_samplemask)
		shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;

	return true;
}
2887
/* Workaround for an SPI barrier management hardware bug on Bonaire/Kabini.
 *
 * NOTE(review): the unconditional return below deliberately disables the
 * fixup; the code after it is intentionally kept (dead) in case the
 * workaround needs to be re-enabled. Do not "clean up" the unreachable code
 * without confirming the workaround is permanently unnecessary.
 */
void si_multiwave_lds_size_workaround(struct si_screen *sscreen,
				      unsigned *lds_size)
{
	/* If tessellation is all offchip and on-chip GS isn't used, this
	 * workaround is not needed.
	 */
	return;

	/* SPI barrier management bug:
	 * Make sure we have at least 4k of LDS in use to avoid the bug.
	 * It applies to workgroup sizes of more than one wavefront.
	 */
	if (sscreen->info.family == CHIP_BONAIRE ||
	    sscreen->info.family == CHIP_KABINI)
		*lds_size = MAX2(*lds_size, 8);
}
2904
2905 void si_fix_resource_usage(struct si_screen *sscreen, struct si_shader *shader)
2906 {
2907 unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */
2908
2909 shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);
2910
2911 if (shader->selector->type == PIPE_SHADER_COMPUTE &&
2912 si_get_max_workgroup_size(shader) > sscreen->compute_wave_size) {
2913 si_multiwave_lds_size_workaround(sscreen,
2914 &shader->config.lds_size);
2915 }
2916 }
2917
/* Create a shader variant: either compile it monolithically, or assemble it
 * from the precompiled main part plus selected prolog/epilog parts, then
 * fix up resource usage, dump it for debugging, and upload the binary.
 *
 * Returns false on any compilation or upload failure.
 */
bool si_create_shader_variant(struct si_screen *sscreen,
			      struct ac_llvm_compiler *compiler,
			      struct si_shader *shader,
			      struct pipe_debug_callback *debug)
{
	struct si_shader_selector *sel = shader->selector;
	/* The main (middle) part compiled at selector-creation time, if any. */
	struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key);
	int r;

	/* LS, ES, VS are compiled on demand if the main part hasn't been
	 * compiled for that stage.
	 *
	 * GS are compiled on demand if the main part hasn't been compiled
	 * for the chosen NGG-ness.
	 *
	 * Vertex shaders are compiled on demand when a vertex fetch
	 * workaround must be applied.
	 */
	if (shader->is_monolithic) {
		/* Monolithic shader (compiled as a whole, has many variants,
		 * may take a long time to compile).
		 */
		r = si_compile_shader(sscreen, compiler, shader, debug);
		if (r)
			return false;
	} else {
		/* The shader consists of several parts:
		 *
		 * - the middle part is the user shader, it has 1 variant only
		 *   and it was compiled during the creation of the shader
		 *   selector
		 * - the prolog part is inserted at the beginning
		 * - the epilog part is inserted at the end
		 *
		 * The prolog and epilog have many (but simple) variants.
		 *
		 * Starting with gfx9, geometry and tessellation control
		 * shaders also contain the prolog and user shader parts of
		 * the previous shader stage.
		 */

		if (!mainp)
			return false;

		/* Copy the compiled shader data over. The binary is shared
		 * with the main part, so this variant must not free it
		 * (see is_binary_shared in si_shader_destroy). */
		shader->is_binary_shared = true;
		shader->binary = mainp->binary;
		shader->config = mainp->config;
		shader->info.num_input_sgprs = mainp->info.num_input_sgprs;
		shader->info.num_input_vgprs = mainp->info.num_input_vgprs;
		shader->info.face_vgpr_index = mainp->info.face_vgpr_index;
		shader->info.ancillary_vgpr_index = mainp->info.ancillary_vgpr_index;
		memcpy(shader->info.vs_output_param_offset,
		       mainp->info.vs_output_param_offset,
		       sizeof(mainp->info.vs_output_param_offset));
		shader->info.uses_instanceid = mainp->info.uses_instanceid;
		shader->info.nr_pos_exports = mainp->info.nr_pos_exports;
		shader->info.nr_param_exports = mainp->info.nr_param_exports;

		/* Select prologs and/or epilogs. */
		switch (sel->type) {
		case PIPE_SHADER_VERTEX:
			if (!si_shader_select_vs_parts(sscreen, compiler, shader, debug))
				return false;
			break;
		case PIPE_SHADER_TESS_CTRL:
			if (!si_shader_select_tcs_parts(sscreen, compiler, shader, debug))
				return false;
			break;
		case PIPE_SHADER_TESS_EVAL:
			/* TES needs no parts. */
			break;
		case PIPE_SHADER_GEOMETRY:
			if (!si_shader_select_gs_parts(sscreen, compiler, shader, debug))
				return false;
			break;
		case PIPE_SHADER_FRAGMENT:
			if (!si_shader_select_ps_parts(sscreen, compiler, shader, debug))
				return false;

			/* Make sure we have at least as many VGPRs as there
			 * are allocated inputs.
			 */
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->info.num_input_vgprs);
			break;
		default:;
		}

		/* Update SGPR and VGPR counts: the combined shader must fit
		 * the register demands of every part it contains. */
		if (shader->prolog) {
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->prolog->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->prolog->config.num_vgprs);
		}
		if (shader->previous_stage) {
			/* gfx9+ merged shaders: also account for the previous
			 * stage's registers, spills, scratch and instance-ID use. */
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->previous_stage->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->previous_stage->config.num_vgprs);
			shader->config.spilled_sgprs =
				MAX2(shader->config.spilled_sgprs,
				     shader->previous_stage->config.spilled_sgprs);
			shader->config.spilled_vgprs =
				MAX2(shader->config.spilled_vgprs,
				     shader->previous_stage->config.spilled_vgprs);
			shader->info.private_mem_vgprs =
				MAX2(shader->info.private_mem_vgprs,
				     shader->previous_stage->info.private_mem_vgprs);
			shader->config.scratch_bytes_per_wave =
				MAX2(shader->config.scratch_bytes_per_wave,
				     shader->previous_stage->config.scratch_bytes_per_wave);
			shader->info.uses_instanceid |=
				shader->previous_stage->info.uses_instanceid;
		}
		if (shader->prolog2) {
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->prolog2->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->prolog2->config.num_vgprs);
		}
		if (shader->epilog) {
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->epilog->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->epilog->config.num_vgprs);
		}
		si_calculate_max_simd_waves(shader);
	}

	if (shader->key.as_ngg) {
		assert(!shader->key.as_es && !shader->key.as_ls);
		gfx10_ngg_calculate_subgroup_info(shader);
	} else if (sscreen->info.chip_class >= GFX9 && sel->type == PIPE_SHADER_GEOMETRY) {
		gfx9_get_gs_info(shader->previous_stage_sel, sel, &shader->gs_info);
	}

	si_fix_resource_usage(sscreen, shader);
	si_shader_dump(sscreen, shader, debug, stderr, true);

	/* Upload. */
	if (!si_shader_binary_upload(sscreen, shader, 0)) {
		fprintf(stderr, "LLVM failed to upload shader\n");
		return false;
	}

	return true;
}
3066
3067 void si_shader_binary_clean(struct si_shader_binary *binary)
3068 {
3069 free((void *)binary->elf_buffer);
3070 binary->elf_buffer = NULL;
3071
3072 free(binary->llvm_ir_string);
3073 binary->llvm_ir_string = NULL;
3074 }
3075
3076 void si_shader_destroy(struct si_shader *shader)
3077 {
3078 if (shader->scratch_bo)
3079 si_resource_reference(&shader->scratch_bo, NULL);
3080
3081 si_resource_reference(&shader->bo, NULL);
3082
3083 if (!shader->is_binary_shared)
3084 si_shader_binary_clean(&shader->binary);
3085
3086 free(shader->shader_log);
3087 }