radeonsi: replace TGSI_SEMANTIC with VARYING_SLOT and FRAG_RESULT
[mesa.git] / src / gallium / drivers / radeonsi / si_shader.c
1 /*
2 * Copyright 2012 Advanced Micro Devices, Inc.
3 * All Rights Reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
23 */
24
25 #include "ac_exp_param.h"
26 #include "ac_rtld.h"
27 #include "compiler/nir/nir.h"
28 #include "compiler/nir/nir_serialize.h"
29 #include "si_pipe.h"
30 #include "si_shader_internal.h"
31 #include "sid.h"
32 #include "tgsi/tgsi_from_mesa.h"
33 #include "tgsi/tgsi_strings.h"
34 #include "util/u_memory.h"
35
36 static const char scratch_rsrc_dword0_symbol[] = "SCRATCH_RSRC_DWORD0";
37
38 static const char scratch_rsrc_dword1_symbol[] = "SCRATCH_RSRC_DWORD1";
39
40 static void si_dump_shader_key(const struct si_shader *shader, FILE *f);
41
42 /** Whether the shader runs as a combination of multiple API shaders */
43 bool si_is_multi_part_shader(struct si_shader *shader)
44 {
45 if (shader->selector->screen->info.chip_class <= GFX8)
46 return false;
47
48 return shader->key.as_ls || shader->key.as_es ||
49 shader->selector->info.stage == MESA_SHADER_TESS_CTRL ||
50 shader->selector->info.stage == MESA_SHADER_GEOMETRY;
51 }
52
53 /** Whether the shader runs on a merged HW stage (LSHS or ESGS) */
54 bool si_is_merged_shader(struct si_shader *shader)
55 {
56 return shader->key.as_ngg || si_is_multi_part_shader(shader);
57 }
58
59 /**
60 * Returns a unique index for a per-patch semantic name and index. The index
61 * must be less than 32, so that a 32-bit bitmask of used inputs or outputs
62 * can be calculated.
63 */
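/* For illustration (mirroring the switch below): TESS_LEVEL_OUTER -> 0,
 * TESS_LEVEL_INNER -> 1, PATCH0..PATCH29 -> 2..31, so every per-patch slot
 * fits in a 32-bit mask. */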
64 unsigned si_shader_io_get_unique_index_patch(unsigned semantic)
65 {
66 switch (semantic) {
67 case VARYING_SLOT_TESS_LEVEL_OUTER:
68 return 0;
69 case VARYING_SLOT_TESS_LEVEL_INNER:
70 return 1;
71 default:
72 if (semantic >= VARYING_SLOT_PATCH0 && semantic < VARYING_SLOT_PATCH0 + 30)
73 return 2 + (semantic - VARYING_SLOT_PATCH0);
74
75 assert(!"invalid semantic");
76 return 0;
77 }
78 }
79
80 /**
81 * Returns a unique index for a semantic name and index. The index must be
82 * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
83 * calculated.
84 */
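/* A sketch of the resulting layout, assuming SI_MAX_IO_GENERIC is 32 (its value
 * is defined elsewhere): POS -> 0, VAR0..VAR31 -> 1..32, FOGC -> 33,
 * COL0/COL1 -> 34/35, TEX0..TEX7 -> 38..45, CLIP_DIST0/1 -> 46/47, ...,
 * PRIMITIVE_ID -> 52, which the STATIC_ASSERT below keeps within a 64-bit mask. */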
85 unsigned si_shader_io_get_unique_index(unsigned semantic, bool is_varying)
86 {
87 switch (semantic) {
88 case VARYING_SLOT_POS:
89 return 0;
90 default:
 91          /* Since some shader stages use the highest used IO index
 92           * to determine the size to allocate for inputs/outputs
 93           * (in LDS, tess and GS rings), GENERIC should be placed right
 94           * after POSITION to make that size as small as possible.
95 */
96 if (semantic >= VARYING_SLOT_VAR0 &&
97 semantic < VARYING_SLOT_VAR0 + SI_MAX_IO_GENERIC)
98 return 1 + (semantic - VARYING_SLOT_VAR0);
99
100 assert(!"invalid generic index");
101 return 0;
102 case VARYING_SLOT_FOGC:
103 return SI_MAX_IO_GENERIC + 1;
104 case VARYING_SLOT_COL0:
105 return SI_MAX_IO_GENERIC + 2;
106 case VARYING_SLOT_COL1:
107 return SI_MAX_IO_GENERIC + 3;
108 case VARYING_SLOT_BFC0:
109 /* If it's a varying, COLOR and BCOLOR alias. */
110 if (is_varying)
111 return SI_MAX_IO_GENERIC + 2;
112 else
113 return SI_MAX_IO_GENERIC + 4;
114 case VARYING_SLOT_BFC1:
115 if (is_varying)
116 return SI_MAX_IO_GENERIC + 3;
117 else
118 return SI_MAX_IO_GENERIC + 5;
119 case VARYING_SLOT_TEX0:
120 case VARYING_SLOT_TEX1:
121 case VARYING_SLOT_TEX2:
122 case VARYING_SLOT_TEX3:
123 case VARYING_SLOT_TEX4:
124 case VARYING_SLOT_TEX5:
125 case VARYING_SLOT_TEX6:
126 case VARYING_SLOT_TEX7:
127 return SI_MAX_IO_GENERIC + 6 + (semantic - VARYING_SLOT_TEX0);
128
129 /* These are rarely used between LS and HS or ES and GS. */
130 case VARYING_SLOT_CLIP_DIST0:
131 return SI_MAX_IO_GENERIC + 6 + 8;
132 case VARYING_SLOT_CLIP_DIST1:
133 return SI_MAX_IO_GENERIC + 6 + 8 + 1;
134 case VARYING_SLOT_CLIP_VERTEX:
135 return SI_MAX_IO_GENERIC + 6 + 8 + 2;
136 case VARYING_SLOT_PSIZ:
137 return SI_MAX_IO_GENERIC + 6 + 8 + 3;
138
139 /* These can't be written by LS, HS, and ES. */
140 case VARYING_SLOT_LAYER:
141 return SI_MAX_IO_GENERIC + 6 + 8 + 4;
142 case VARYING_SLOT_VIEWPORT:
143 return SI_MAX_IO_GENERIC + 6 + 8 + 5;
144 case VARYING_SLOT_PRIMITIVE_ID:
145 STATIC_ASSERT(SI_MAX_IO_GENERIC + 6 + 8 + 6 <= 63);
146 return SI_MAX_IO_GENERIC + 6 + 8 + 6;
147 }
148 }
149
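/* Debug helper: prints one line per streamout output, e.g.
 * " 0: BUF0[4..7] <- OUT[2].xyzw" (illustrative values). */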
150 static void si_dump_streamout(struct pipe_stream_output_info *so)
151 {
152 unsigned i;
153
154 if (so->num_outputs)
155 fprintf(stderr, "STREAMOUT\n");
156
157 for (i = 0; i < so->num_outputs; i++) {
158 unsigned mask = ((1 << so->output[i].num_components) - 1) << so->output[i].start_component;
159 fprintf(stderr, " %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n", i, so->output[i].output_buffer,
160 so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
161 so->output[i].register_index, mask & 1 ? "x" : "", mask & 2 ? "y" : "",
162 mask & 4 ? "z" : "", mask & 8 ? "w" : "");
163 }
164 }
165
166 static void declare_streamout_params(struct si_shader_context *ctx,
167 struct pipe_stream_output_info *so)
168 {
169 if (ctx->screen->use_ngg_streamout) {
170 if (ctx->stage == MESA_SHADER_TESS_EVAL)
171 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
172 return;
173 }
174
175 /* Streamout SGPRs. */
176 if (so->num_outputs) {
177 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->streamout_config);
178 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->streamout_write_index);
179 } else if (ctx->stage == MESA_SHADER_TESS_EVAL) {
180 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
181 }
182
183 /* A streamout buffer offset is loaded if the stride is non-zero. */
184 for (int i = 0; i < 4; i++) {
185 if (!so->stride[i])
186 continue;
187
188 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->streamout_offset[i]);
189 }
190 }
191
192 unsigned si_get_max_workgroup_size(const struct si_shader *shader)
193 {
194 switch (shader->selector->info.stage) {
195 case MESA_SHADER_VERTEX:
196 case MESA_SHADER_TESS_EVAL:
197 return shader->key.as_ngg ? 128 : 0;
198
199 case MESA_SHADER_TESS_CTRL:
200 /* Return this so that LLVM doesn't remove s_barrier
201 * instructions on chips where we use s_barrier. */
202 return shader->selector->screen->info.chip_class >= GFX7 ? 128 : 0;
203
204 case MESA_SHADER_GEOMETRY:
205 return shader->selector->screen->info.chip_class >= GFX9 ? 128 : 0;
206
207 case MESA_SHADER_COMPUTE:
208 break; /* see below */
209
210 default:
211 return 0;
212 }
213
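/* For compute shaders, a fixed block size declared by the shader determines the
 * group size directly, e.g. an 8x8x1 block gives 64 threads; a product of 0
 * means the block size is only known at dispatch time. */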
214 const unsigned *properties = shader->selector->info.properties;
215 unsigned max_work_group_size = properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
216 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
217 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
218
219 if (!max_work_group_size) {
 220        /* This is a variable group size compute shader;
 221         * compile it for the maximum possible group size.
222 */
223 max_work_group_size = SI_MAX_VARIABLE_THREADS_PER_BLOCK;
224 }
225 return max_work_group_size;
226 }
227
228 static void declare_const_and_shader_buffers(struct si_shader_context *ctx, bool assign_params)
229 {
230 enum ac_arg_type const_shader_buf_type;
231
232 if (ctx->shader->selector->info.const_buffers_declared == 1 &&
233 ctx->shader->selector->info.shader_buffers_declared == 0)
234 const_shader_buf_type = AC_ARG_CONST_FLOAT_PTR;
235 else
236 const_shader_buf_type = AC_ARG_CONST_DESC_PTR;
237
238 ac_add_arg(
239 &ctx->args, AC_ARG_SGPR, 1, const_shader_buf_type,
240 assign_params ? &ctx->const_and_shader_buffers : &ctx->other_const_and_shader_buffers);
241 }
242
243 static void declare_samplers_and_images(struct si_shader_context *ctx, bool assign_params)
244 {
245 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR,
246 assign_params ? &ctx->samplers_and_images : &ctx->other_samplers_and_images);
247 }
248
249 static void declare_per_stage_desc_pointers(struct si_shader_context *ctx, bool assign_params)
250 {
251 declare_const_and_shader_buffers(ctx, assign_params);
252 declare_samplers_and_images(ctx, assign_params);
253 }
254
255 static void declare_global_desc_pointers(struct si_shader_context *ctx)
256 {
257 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &ctx->rw_buffers);
258 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR,
259 &ctx->bindless_samplers_and_images);
260 }
261
262 static void declare_vs_specific_input_sgprs(struct si_shader_context *ctx)
263 {
264 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
265 if (!ctx->shader->is_gs_copy_shader) {
266 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.base_vertex);
267 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.start_instance);
268 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.draw_id);
269 }
270 }
271
272 static void declare_vb_descriptor_input_sgprs(struct si_shader_context *ctx)
273 {
274 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &ctx->vertex_buffers);
275
276 unsigned num_vbos_in_user_sgprs = ctx->shader->selector->num_vbos_in_user_sgprs;
277 if (num_vbos_in_user_sgprs) {
278 unsigned user_sgprs = ctx->args.num_sgprs_used;
279
280 if (si_is_merged_shader(ctx->shader))
281 user_sgprs -= 8;
282 assert(user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST);
283
284 /* Declare unused SGPRs to align VB descriptors to 4 SGPRs (hw requirement). */
285 for (unsigned i = user_sgprs; i < SI_SGPR_VS_VB_DESCRIPTOR_FIRST; i++)
286 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
287
288 assert(num_vbos_in_user_sgprs <= ARRAY_SIZE(ctx->vb_descriptors));
289 for (unsigned i = 0; i < num_vbos_in_user_sgprs; i++)
290 ac_add_arg(&ctx->args, AC_ARG_SGPR, 4, AC_ARG_INT, &ctx->vb_descriptors[i]);
291 }
292 }
293
294 static void declare_vs_input_vgprs(struct si_shader_context *ctx, unsigned *num_prolog_vgprs,
295 bool ngg_cull_shader)
296 {
297 struct si_shader *shader = ctx->shader;
298
299 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.vertex_id);
300 if (shader->key.as_ls) {
301 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->rel_auto_id);
302 if (ctx->screen->info.chip_class >= GFX10) {
303 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
304 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
305 } else {
306 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
307 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */
308 }
309 } else if (ctx->screen->info.chip_class >= GFX10) {
310 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
311 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
312 &ctx->vs_prim_id); /* user vgpr or PrimID (legacy) */
313 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
314 } else {
315 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
316 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->vs_prim_id);
317 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */
318 }
319
320 if (!shader->is_gs_copy_shader) {
321 if (shader->key.opt.ngg_culling && !ngg_cull_shader) {
322 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->ngg_old_thread_id);
323 }
324
325 /* Vertex load indices. */
326 if (shader->selector->info.num_inputs) {
327 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->vertex_index0);
328 for (unsigned i = 1; i < shader->selector->info.num_inputs; i++)
329 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL);
330 }
331 *num_prolog_vgprs += shader->selector->info.num_inputs;
332 }
333 }
334
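/* The driver's internal blit vertex shaders receive their inputs packed in SGPRs
 * instead of vertex buffers: 3 SGPRs for the position rectangle and depth, plus
 * 4 for a constant color or 6 for texture coordinates, depending on the mode. */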
335 static void declare_vs_blit_inputs(struct si_shader_context *ctx, unsigned vs_blit_property)
336 {
337 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_blit_inputs); /* i16 x1, y1 */
 338    ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* i16 x2, y2 */
339 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* depth */
340
341 if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) {
342 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color0 */
343 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color1 */
344 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color2 */
345 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color3 */
346 } else if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD) {
347 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x1 */
348 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y1 */
349 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x2 */
350 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y2 */
351 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.z */
352 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.w */
353 }
354 }
355
356 static void declare_tes_input_vgprs(struct si_shader_context *ctx, bool ngg_cull_shader)
357 {
358 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->tes_u);
359 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->tes_v);
360 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->tes_rel_patch_id);
361 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tes_patch_id);
362
363 if (ctx->shader->key.opt.ngg_culling && !ngg_cull_shader) {
364 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->ngg_old_thread_id);
365 }
366 }
367
368 enum
369 {
370 /* Convenient merged shader definitions. */
371 SI_SHADER_MERGED_VERTEX_TESSCTRL = MESA_ALL_SHADER_STAGES,
372 SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY,
373 };
374
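/* Like ac_add_arg, but assert that the argument lands at the fixed index "idx",
 * so that code which refers to the fixed SI_PARAM_* locations stays in sync
 * with this declaration order. */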
375 void si_add_arg_checked(struct ac_shader_args *args, enum ac_arg_regfile file, unsigned registers,
376 enum ac_arg_type type, struct ac_arg *arg, unsigned idx)
377 {
378 assert(args->arg_count == idx);
379 ac_add_arg(args, file, registers, type, arg);
380 }
381
382 void si_create_function(struct si_shader_context *ctx, bool ngg_cull_shader)
383 {
384 struct si_shader *shader = ctx->shader;
385 LLVMTypeRef returns[AC_MAX_ARGS];
386 unsigned i, num_return_sgprs;
387 unsigned num_returns = 0;
388 unsigned num_prolog_vgprs = 0;
389 unsigned stage = ctx->stage;
390 unsigned vs_blit_property = shader->selector->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD];
391
392 memset(&ctx->args, 0, sizeof(ctx->args));
393
394 /* Set MERGED shaders. */
395 if (ctx->screen->info.chip_class >= GFX9) {
396 if (shader->key.as_ls || stage == MESA_SHADER_TESS_CTRL)
397 stage = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */
398 else if (shader->key.as_es || shader->key.as_ngg || stage == MESA_SHADER_GEOMETRY)
399 stage = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY;
400 }
401
402 switch (stage) {
403 case MESA_SHADER_VERTEX:
404 declare_global_desc_pointers(ctx);
405
406 if (vs_blit_property) {
407 declare_vs_blit_inputs(ctx, vs_blit_property);
408
409 /* VGPRs */
410 declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader);
411 break;
412 }
413
414 declare_per_stage_desc_pointers(ctx, true);
415 declare_vs_specific_input_sgprs(ctx);
416 if (!shader->is_gs_copy_shader)
417 declare_vb_descriptor_input_sgprs(ctx);
418
419 if (shader->key.as_es) {
420 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->es2gs_offset);
421 } else if (shader->key.as_ls) {
422 /* no extra parameters */
423 } else {
424 /* The locations of the other parameters are assigned dynamically. */
425 declare_streamout_params(ctx, &shader->selector->so);
426 }
427
428 /* VGPRs */
429 declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader);
430
431 /* Return values */
432 if (shader->key.opt.vs_as_prim_discard_cs) {
433 for (i = 0; i < 4; i++)
434 returns[num_returns++] = ctx->ac.f32; /* VGPRs */
435 }
436 break;
437
438 case MESA_SHADER_TESS_CTRL: /* GFX6-GFX8 */
439 declare_global_desc_pointers(ctx);
440 declare_per_stage_desc_pointers(ctx, true);
441 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
442 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_offsets);
443 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout);
444 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
445 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
446 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_factor_offset);
447
448 /* VGPRs */
449 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_patch_id);
450 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_rel_ids);
451
452 /* param_tcs_offchip_offset and param_tcs_factor_offset are
453 * placed after the user SGPRs.
454 */
455 for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++)
456 returns[num_returns++] = ctx->ac.i32; /* SGPRs */
457 for (i = 0; i < 11; i++)
458 returns[num_returns++] = ctx->ac.f32; /* VGPRs */
459 break;
460
461 case SI_SHADER_MERGED_VERTEX_TESSCTRL:
462 /* Merged stages have 8 system SGPRs at the beginning. */
463 /* SPI_SHADER_USER_DATA_ADDR_LO/HI_HS */
464 declare_per_stage_desc_pointers(ctx, ctx->stage == MESA_SHADER_TESS_CTRL);
465 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
466 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_wave_info);
467 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_factor_offset);
468 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_scratch_offset);
469 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
470 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
471
472 declare_global_desc_pointers(ctx);
473 declare_per_stage_desc_pointers(ctx, ctx->stage == MESA_SHADER_VERTEX);
474 declare_vs_specific_input_sgprs(ctx);
475
476 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
477 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_offsets);
478 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout);
479 declare_vb_descriptor_input_sgprs(ctx);
480
481 /* VGPRs (first TCS, then VS) */
482 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_patch_id);
483 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_rel_ids);
484
485 if (ctx->stage == MESA_SHADER_VERTEX) {
486 declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader);
487
488 /* LS return values are inputs to the TCS main shader part. */
489 for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++)
490 returns[num_returns++] = ctx->ac.i32; /* SGPRs */
491 for (i = 0; i < 2; i++)
492 returns[num_returns++] = ctx->ac.f32; /* VGPRs */
493 } else {
494 /* TCS return values are inputs to the TCS epilog.
495 *
496 * param_tcs_offchip_offset, param_tcs_factor_offset,
497 * param_tcs_offchip_layout, and param_rw_buffers
498 * should be passed to the epilog.
499 */
500 for (i = 0; i <= 8 + GFX9_SGPR_TCS_OUT_LAYOUT; i++)
501 returns[num_returns++] = ctx->ac.i32; /* SGPRs */
502 for (i = 0; i < 11; i++)
503 returns[num_returns++] = ctx->ac.f32; /* VGPRs */
504 }
505 break;
506
507 case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY:
508 /* Merged stages have 8 system SGPRs at the beginning. */
509 /* SPI_SHADER_USER_DATA_ADDR_LO/HI_GS */
510 declare_per_stage_desc_pointers(ctx, ctx->stage == MESA_SHADER_GEOMETRY);
511
512 if (ctx->shader->key.as_ngg)
513 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs_tg_info);
514 else
515 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs2vs_offset);
516
517 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_wave_info);
518 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
519 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_scratch_offset);
520 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR,
521 &ctx->small_prim_cull_info); /* SPI_SHADER_PGM_LO_GS << 8 */
522 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
523 NULL); /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */
524
525 declare_global_desc_pointers(ctx);
526 if (ctx->stage != MESA_SHADER_VERTEX || !vs_blit_property) {
527 declare_per_stage_desc_pointers(
528 ctx, (ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL));
529 }
530
531 if (ctx->stage == MESA_SHADER_VERTEX) {
532 if (vs_blit_property)
533 declare_vs_blit_inputs(ctx, vs_blit_property);
534 else
535 declare_vs_specific_input_sgprs(ctx);
536 } else {
537 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
538 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
539 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tes_offchip_addr);
540 /* Declare as many input SGPRs as the VS has. */
541 }
542
543 if (ctx->stage == MESA_SHADER_VERTEX)
544 declare_vb_descriptor_input_sgprs(ctx);
545
546 /* VGPRs (first GS, then VS/TES) */
547 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx01_offset);
548 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx23_offset);
549 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_prim_id);
550 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_invocation_id);
551 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx45_offset);
552
553 if (ctx->stage == MESA_SHADER_VERTEX) {
554 declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader);
555 } else if (ctx->stage == MESA_SHADER_TESS_EVAL) {
556 declare_tes_input_vgprs(ctx, ngg_cull_shader);
557 }
558
559 if ((ctx->shader->key.as_es || ngg_cull_shader) &&
560 (ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL)) {
561 unsigned num_user_sgprs, num_vgprs;
562
563 if (ctx->stage == MESA_SHADER_VERTEX) {
564 /* For the NGG cull shader, add 1 SGPR to hold
565 * the vertex buffer pointer.
566 */
567 num_user_sgprs = GFX9_VSGS_NUM_USER_SGPR + ngg_cull_shader;
568
569 if (ngg_cull_shader && shader->selector->num_vbos_in_user_sgprs) {
570 assert(num_user_sgprs <= 8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST);
571 num_user_sgprs =
572 SI_SGPR_VS_VB_DESCRIPTOR_FIRST + shader->selector->num_vbos_in_user_sgprs * 4;
573 }
574 } else {
575 num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR;
576 }
577
578 /* The NGG cull shader has to return all 9 VGPRs + the old thread ID.
579 *
580 * The normal merged ESGS shader only has to return the 5 VGPRs
581 * for the GS stage.
582 */
583 num_vgprs = ngg_cull_shader ? 10 : 5;
584
585 /* ES return values are inputs to GS. */
586 for (i = 0; i < 8 + num_user_sgprs; i++)
587 returns[num_returns++] = ctx->ac.i32; /* SGPRs */
588 for (i = 0; i < num_vgprs; i++)
589 returns[num_returns++] = ctx->ac.f32; /* VGPRs */
590 }
591 break;
592
593 case MESA_SHADER_TESS_EVAL:
594 declare_global_desc_pointers(ctx);
595 declare_per_stage_desc_pointers(ctx, true);
596 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
597 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
598 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tes_offchip_addr);
599
600 if (shader->key.as_es) {
601 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
602 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
603 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->es2gs_offset);
604 } else {
605 declare_streamout_params(ctx, &shader->selector->so);
606 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
607 }
608
609 /* VGPRs */
610 declare_tes_input_vgprs(ctx, ngg_cull_shader);
611 break;
612
613 case MESA_SHADER_GEOMETRY:
614 declare_global_desc_pointers(ctx);
615 declare_per_stage_desc_pointers(ctx, true);
616 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs2vs_offset);
617 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs_wave_id);
618
619 /* VGPRs */
620 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[0]);
621 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[1]);
622 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_prim_id);
623 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[2]);
624 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[3]);
625 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[4]);
626 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[5]);
627 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_invocation_id);
628 break;
629
630 case MESA_SHADER_FRAGMENT:
631 declare_global_desc_pointers(ctx);
632 declare_per_stage_desc_pointers(ctx, true);
633 si_add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL, SI_PARAM_ALPHA_REF);
634 si_add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.prim_mask,
635 SI_PARAM_PRIM_MASK);
636
637 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.persp_sample,
638 SI_PARAM_PERSP_SAMPLE);
639 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.persp_center,
640 SI_PARAM_PERSP_CENTER);
641 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.persp_centroid,
642 SI_PARAM_PERSP_CENTROID);
643 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_INT, NULL, SI_PARAM_PERSP_PULL_MODEL);
644 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.linear_sample,
645 SI_PARAM_LINEAR_SAMPLE);
646 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.linear_center,
647 SI_PARAM_LINEAR_CENTER);
648 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.linear_centroid,
649 SI_PARAM_LINEAR_CENTROID);
650 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_FLOAT, NULL, SI_PARAM_LINE_STIPPLE_TEX);
651 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[0],
652 SI_PARAM_POS_X_FLOAT);
653 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[1],
654 SI_PARAM_POS_Y_FLOAT);
655 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[2],
656 SI_PARAM_POS_Z_FLOAT);
657 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[3],
658 SI_PARAM_POS_W_FLOAT);
659 shader->info.face_vgpr_index = ctx->args.num_vgprs_used;
660 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.front_face,
661 SI_PARAM_FRONT_FACE);
662 shader->info.ancillary_vgpr_index = ctx->args.num_vgprs_used;
663 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.ancillary,
664 SI_PARAM_ANCILLARY);
665 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.sample_coverage,
666 SI_PARAM_SAMPLE_COVERAGE);
667 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->pos_fixed_pt,
668 SI_PARAM_POS_FIXED_PT);
669
670 /* Color inputs from the prolog. */
671 if (shader->selector->info.colors_read) {
672 unsigned num_color_elements = util_bitcount(shader->selector->info.colors_read);
673
674 for (i = 0; i < num_color_elements; i++)
675 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL);
676
677 num_prolog_vgprs += num_color_elements;
678 }
679
680 /* Outputs for the epilog. */
681 num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
682 num_returns = num_return_sgprs + util_bitcount(shader->selector->info.colors_written) * 4 +
683 shader->selector->info.writes_z + shader->selector->info.writes_stencil +
684 shader->selector->info.writes_samplemask + 1 /* SampleMaskIn */;
685
686 num_returns = MAX2(num_returns, num_return_sgprs + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
687
688 for (i = 0; i < num_return_sgprs; i++)
689 returns[i] = ctx->ac.i32;
690 for (; i < num_returns; i++)
691 returns[i] = ctx->ac.f32;
692 break;
693
694 case MESA_SHADER_COMPUTE:
695 declare_global_desc_pointers(ctx);
696 declare_per_stage_desc_pointers(ctx, true);
697 if (shader->selector->info.uses_grid_size)
698 ac_add_arg(&ctx->args, AC_ARG_SGPR, 3, AC_ARG_INT, &ctx->args.num_work_groups);
699 if (shader->selector->info.uses_block_size &&
700 shader->selector->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0)
701 ac_add_arg(&ctx->args, AC_ARG_SGPR, 3, AC_ARG_INT, &ctx->block_size);
702
703 unsigned cs_user_data_dwords =
704 shader->selector->info.properties[TGSI_PROPERTY_CS_USER_DATA_COMPONENTS_AMD];
705 if (cs_user_data_dwords) {
706 ac_add_arg(&ctx->args, AC_ARG_SGPR, cs_user_data_dwords, AC_ARG_INT, &ctx->cs_user_data);
707 }
708
709 /* Some descriptors can be in user SGPRs. */
710 /* Shader buffers in user SGPRs. */
711 for (unsigned i = 0; i < shader->selector->cs_num_shaderbufs_in_user_sgprs; i++) {
712 while (ctx->args.num_sgprs_used % 4 != 0)
713 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
714
715 ac_add_arg(&ctx->args, AC_ARG_SGPR, 4, AC_ARG_INT, &ctx->cs_shaderbuf[i]);
716 }
717 /* Images in user SGPRs. */
718 for (unsigned i = 0; i < shader->selector->cs_num_images_in_user_sgprs; i++) {
719 unsigned num_sgprs = shader->selector->info.image_buffers & (1 << i) ? 4 : 8;
720
721 while (ctx->args.num_sgprs_used % num_sgprs != 0)
722 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
723
724 ac_add_arg(&ctx->args, AC_ARG_SGPR, num_sgprs, AC_ARG_INT, &ctx->cs_image[i]);
725 }
726
727 /* Hardware SGPRs. */
728 for (i = 0; i < 3; i++) {
729 if (shader->selector->info.uses_block_id[i]) {
730 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.workgroup_ids[i]);
731 }
732 }
733 if (shader->selector->info.uses_subgroup_info)
734 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tg_size);
735
736 /* Hardware VGPRs. */
737 ac_add_arg(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_INT, &ctx->args.local_invocation_ids);
738 break;
739 default:
740 assert(0 && "unimplemented shader");
741 return;
742 }
743
744 si_llvm_create_func(ctx, ngg_cull_shader ? "ngg_cull_main" : "main", returns, num_returns,
745 si_get_max_workgroup_size(shader));
746
747 /* Reserve register locations for VGPR inputs the PS prolog may need. */
748 if (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->shader->is_monolithic) {
749 ac_llvm_add_target_dep_function_attr(
750 ctx->main_fn, "InitialPSInputAddr",
751 S_0286D0_PERSP_SAMPLE_ENA(1) | S_0286D0_PERSP_CENTER_ENA(1) |
752 S_0286D0_PERSP_CENTROID_ENA(1) | S_0286D0_LINEAR_SAMPLE_ENA(1) |
753 S_0286D0_LINEAR_CENTER_ENA(1) | S_0286D0_LINEAR_CENTROID_ENA(1) |
754 S_0286D0_FRONT_FACE_ENA(1) | S_0286D0_ANCILLARY_ENA(1) | S_0286D0_POS_FIXED_PT_ENA(1));
755 }
756
757 shader->info.num_input_sgprs = ctx->args.num_sgprs_used;
758 shader->info.num_input_vgprs = ctx->args.num_vgprs_used;
759
760 assert(shader->info.num_input_vgprs >= num_prolog_vgprs);
761 shader->info.num_input_vgprs -= num_prolog_vgprs;
762
763 if (shader->key.as_ls || ctx->stage == MESA_SHADER_TESS_CTRL) {
764 if (USE_LDS_SYMBOLS && LLVM_VERSION_MAJOR >= 9) {
765 /* The LSHS size is not known until draw time, so we append it
766 * at the end of whatever LDS use there may be in the rest of
767 * the shader (currently none, unless LLVM decides to do its
768 * own LDS-based lowering).
769 */
770 ctx->ac.lds = LLVMAddGlobalInAddressSpace(ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0),
771 "__lds_end", AC_ADDR_SPACE_LDS);
772 LLVMSetAlignment(ctx->ac.lds, 256);
773 } else {
774 ac_declare_lds_as_pointer(&ctx->ac);
775 }
776 }
777
 778    /* Unlike radv, we override these arguments in the prolog, so they
 779     * appear to the API shader as normal arguments.
780 */
781 if (ctx->stage == MESA_SHADER_VERTEX) {
782 ctx->abi.vertex_id = ac_get_arg(&ctx->ac, ctx->args.vertex_id);
783 ctx->abi.instance_id = ac_get_arg(&ctx->ac, ctx->args.instance_id);
784 } else if (ctx->stage == MESA_SHADER_FRAGMENT) {
785 ctx->abi.persp_centroid = ac_get_arg(&ctx->ac, ctx->args.persp_centroid);
786 ctx->abi.linear_centroid = ac_get_arg(&ctx->ac, ctx->args.linear_centroid);
787 }
788 }
789
790 /* For the UMR disassembler. */
791 #define DEBUGGER_END_OF_CODE_MARKER 0xbf9f0000 /* invalid instruction */
792 #define DEBUGGER_NUM_MARKERS 5
793
794 static bool si_shader_binary_open(struct si_screen *screen, struct si_shader *shader,
795 struct ac_rtld_binary *rtld)
796 {
797 const struct si_shader_selector *sel = shader->selector;
798 const char *part_elfs[5];
799 size_t part_sizes[5];
800 unsigned num_parts = 0;
801
802 #define add_part(shader_or_part) \
803 if (shader_or_part) { \
804 part_elfs[num_parts] = (shader_or_part)->binary.elf_buffer; \
805 part_sizes[num_parts] = (shader_or_part)->binary.elf_size; \
806 num_parts++; \
807 }
808
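/* Gather the ELF parts in execution order: prolog, previous merged stage,
 * second prolog, main part, epilog; missing parts are simply skipped. */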
809 add_part(shader->prolog);
810 add_part(shader->previous_stage);
811 add_part(shader->prolog2);
812 add_part(shader);
813 add_part(shader->epilog);
814
815 #undef add_part
816
817 struct ac_rtld_symbol lds_symbols[2];
818 unsigned num_lds_symbols = 0;
819
820 if (sel && screen->info.chip_class >= GFX9 && !shader->is_gs_copy_shader &&
821 (sel->info.stage == MESA_SHADER_GEOMETRY || shader->key.as_ngg)) {
822 /* We add this symbol even on LLVM <= 8 to ensure that
823 * shader->config.lds_size is set correctly below.
824 */
825 struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
826 sym->name = "esgs_ring";
827 sym->size = shader->gs_info.esgs_ring_size * 4;
828 sym->align = 64 * 1024;
829 }
830
831 if (shader->key.as_ngg && sel->info.stage == MESA_SHADER_GEOMETRY) {
832 struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
833 sym->name = "ngg_emit";
834 sym->size = shader->ngg.ngg_emit_size * 4;
835 sym->align = 4;
836 }
837
838 bool ok = ac_rtld_open(
839 rtld, (struct ac_rtld_open_info){.info = &screen->info,
840 .options =
841 {
842 .halt_at_entry = screen->options.halt_shaders,
843 },
844 .shader_type = sel->info.stage,
845 .wave_size = si_get_shader_wave_size(shader),
846 .num_parts = num_parts,
847 .elf_ptrs = part_elfs,
848 .elf_sizes = part_sizes,
849 .num_shared_lds_symbols = num_lds_symbols,
850 .shared_lds_symbols = lds_symbols});
851
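/* Convert the linked LDS size to hardware allocation units, e.g. (illustrative)
 * an rtld lds_size of 1300 on GFX9 rounds up to 1536 and is stored as 3 units
 * of 512. */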
852 if (rtld->lds_size > 0) {
853 unsigned alloc_granularity = screen->info.chip_class >= GFX7 ? 512 : 256;
854 shader->config.lds_size = align(rtld->lds_size, alloc_granularity) / alloc_granularity;
855 }
856
857 return ok;
858 }
859
860 static unsigned si_get_shader_binary_size(struct si_screen *screen, struct si_shader *shader)
861 {
862 struct ac_rtld_binary rtld;
863 si_shader_binary_open(screen, shader, &rtld);
864 return rtld.exec_size;
865 }
866
867 static bool si_get_external_symbol(void *data, const char *name, uint64_t *value)
868 {
869 uint64_t *scratch_va = data;
870
871 if (!strcmp(scratch_rsrc_dword0_symbol, name)) {
872 *value = (uint32_t)*scratch_va;
873 return true;
874 }
875 if (!strcmp(scratch_rsrc_dword1_symbol, name)) {
876 /* Enable scratch coalescing. */
877 *value = S_008F04_BASE_ADDRESS_HI(*scratch_va >> 32) | S_008F04_SWIZZLE_ENABLE(1);
878 return true;
879 }
880
881 return false;
882 }
883
884 bool si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader,
885 uint64_t scratch_va)
886 {
887 struct ac_rtld_binary binary;
888 if (!si_shader_binary_open(sscreen, shader, &binary))
889 return false;
890
891 si_resource_reference(&shader->bo, NULL);
892 shader->bo = si_aligned_buffer_create(
893 &sscreen->b, sscreen->info.cpdma_prefetch_writes_memory ? 0 : SI_RESOURCE_FLAG_READ_ONLY,
894 PIPE_USAGE_IMMUTABLE, align(binary.rx_size, SI_CPDMA_ALIGNMENT), 256);
895 if (!shader->bo)
896 return false;
897
898 /* Upload. */
899 struct ac_rtld_upload_info u = {};
900 u.binary = &binary;
901 u.get_external_symbol = si_get_external_symbol;
902 u.cb_data = &scratch_va;
903 u.rx_va = shader->bo->gpu_address;
904 u.rx_ptr = sscreen->ws->buffer_map(
905 shader->bo->buf, NULL,
906 PIPE_TRANSFER_READ_WRITE | PIPE_TRANSFER_UNSYNCHRONIZED | RADEON_TRANSFER_TEMPORARY);
907 if (!u.rx_ptr)
908 return false;
909
910 bool ok = ac_rtld_upload(&u);
911
912 sscreen->ws->buffer_unmap(shader->bo->buf);
913 ac_rtld_close(&binary);
914
915 return ok;
916 }
917
918 static void si_shader_dump_disassembly(struct si_screen *screen,
919 const struct si_shader_binary *binary,
920 gl_shader_stage stage, unsigned wave_size,
921 struct pipe_debug_callback *debug, const char *name,
922 FILE *file)
923 {
924 struct ac_rtld_binary rtld_binary;
925
926 if (!ac_rtld_open(&rtld_binary, (struct ac_rtld_open_info){
927 .info = &screen->info,
928 .shader_type = stage,
929 .wave_size = wave_size,
930 .num_parts = 1,
931 .elf_ptrs = &binary->elf_buffer,
932 .elf_sizes = &binary->elf_size}))
933 return;
934
935 const char *disasm;
936 size_t nbytes;
937
938 if (!ac_rtld_get_section_by_name(&rtld_binary, ".AMDGPU.disasm", &disasm, &nbytes))
939 goto out;
940
941 if (nbytes > INT_MAX)
942 goto out;
943
944 if (debug && debug->debug_message) {
945 /* Very long debug messages are cut off, so send the
946 * disassembly one line at a time. This causes more
947 * overhead, but on the plus side it simplifies
948 * parsing of resulting logs.
949 */
950 pipe_debug_message(debug, SHADER_INFO, "Shader Disassembly Begin");
951
952 uint64_t line = 0;
953 while (line < nbytes) {
954 int count = nbytes - line;
955 const char *nl = memchr(disasm + line, '\n', nbytes - line);
956 if (nl)
957 count = nl - (disasm + line);
958
959 if (count) {
960 pipe_debug_message(debug, SHADER_INFO, "%.*s", count, disasm + line);
961 }
962
963 line += count + 1;
964 }
965
966 pipe_debug_message(debug, SHADER_INFO, "Shader Disassembly End");
967 }
968
969 if (file) {
970 fprintf(file, "Shader %s disassembly:\n", name);
971 fprintf(file, "%*s", (int)nbytes, disasm);
972 }
973
974 out:
975 ac_rtld_close(&rtld_binary);
976 }
977
978 static void si_calculate_max_simd_waves(struct si_shader *shader)
979 {
980 struct si_screen *sscreen = shader->selector->screen;
981 struct ac_shader_config *conf = &shader->config;
982 unsigned num_inputs = shader->selector->info.num_inputs;
983 unsigned lds_increment = sscreen->info.chip_class >= GFX7 ? 512 : 256;
984 unsigned lds_per_wave = 0;
985 unsigned max_simd_waves;
986
987 max_simd_waves = sscreen->info.max_wave64_per_simd;
988
989 /* Compute LDS usage for PS. */
990 switch (shader->selector->info.stage) {
991 case MESA_SHADER_FRAGMENT:
992 /* The minimum usage per wave is (num_inputs * 48). The maximum
993 * usage is (num_inputs * 48 * 16).
994 * We can get anything in between and it varies between waves.
995 *
996 * The 48 bytes per input for a single primitive is equal to
997 * 4 bytes/component * 4 components/input * 3 points.
998 *
999 * Other stages don't know the size at compile time or don't
1000 * allocate LDS per wave, but instead they do it per thread group.
1001 */
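/* Example: 8 interpolated inputs need at least 8 * 48 = 384 bytes per wave and
 * at most 8 * 48 * 16 = 6144 bytes, on top of the LDS declared by the config. */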
1002 lds_per_wave = conf->lds_size * lds_increment + align(num_inputs * 48, lds_increment);
1003 break;
1004 case MESA_SHADER_COMPUTE:
1005 if (shader->selector) {
1006 unsigned max_workgroup_size = si_get_max_workgroup_size(shader);
1007 lds_per_wave = (conf->lds_size * lds_increment) /
1008 DIV_ROUND_UP(max_workgroup_size, sscreen->compute_wave_size);
1009 }
1010 break;
1011 default:;
1012 }
1013
1014 /* Compute the per-SIMD wave counts. */
1015 if (conf->num_sgprs) {
1016 max_simd_waves =
1017 MIN2(max_simd_waves, sscreen->info.num_physical_sgprs_per_simd / conf->num_sgprs);
1018 }
1019
1020 if (conf->num_vgprs) {
1021 /* Always print wave limits as Wave64, so that we can compare
1022 * Wave32 and Wave64 with shader-db fairly. */
1023 unsigned max_vgprs = sscreen->info.num_physical_wave64_vgprs_per_simd;
1024 max_simd_waves = MIN2(max_simd_waves, max_vgprs / conf->num_vgprs);
1025 }
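/* Illustrative only (hypothetical register counts): with 1024 physical Wave64
 * VGPRs per SIMD, a shader using 128 VGPRs is limited to 8 waves per SIMD by
 * VGPRs alone; the final value also honors the SGPR and LDS limits here. */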
1026
1027 unsigned max_lds_per_simd = sscreen->info.lds_size_per_workgroup / 4;
1028 if (lds_per_wave)
1029 max_simd_waves = MIN2(max_simd_waves, max_lds_per_simd / lds_per_wave);
1030
1031 shader->info.max_simd_waves = max_simd_waves;
1032 }
1033
1034 void si_shader_dump_stats_for_shader_db(struct si_screen *screen, struct si_shader *shader,
1035 struct pipe_debug_callback *debug)
1036 {
1037 const struct ac_shader_config *conf = &shader->config;
1038
1039 if (screen->options.debug_disassembly)
1040 si_shader_dump_disassembly(screen, &shader->binary, shader->selector->info.stage,
1041 si_get_shader_wave_size(shader), debug, "main", NULL);
1042
1043 pipe_debug_message(debug, SHADER_INFO,
1044 "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
1045 "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "
1046 "Spilled VGPRs: %d PrivMem VGPRs: %d",
1047 conf->num_sgprs, conf->num_vgprs, si_get_shader_binary_size(screen, shader),
1048 conf->lds_size, conf->scratch_bytes_per_wave, shader->info.max_simd_waves,
1049 conf->spilled_sgprs, conf->spilled_vgprs, shader->info.private_mem_vgprs);
1050 }
1051
1052 static void si_shader_dump_stats(struct si_screen *sscreen, struct si_shader *shader, FILE *file,
1053 bool check_debug_option)
1054 {
1055 const struct ac_shader_config *conf = &shader->config;
1056
1057 if (!check_debug_option || si_can_dump_shader(sscreen, shader->selector->info.stage)) {
1058 if (shader->selector->info.stage == MESA_SHADER_FRAGMENT) {
1059 fprintf(file,
1060 "*** SHADER CONFIG ***\n"
1061 "SPI_PS_INPUT_ADDR = 0x%04x\n"
1062 "SPI_PS_INPUT_ENA = 0x%04x\n",
1063 conf->spi_ps_input_addr, conf->spi_ps_input_ena);
1064 }
1065
1066 fprintf(file,
1067 "*** SHADER STATS ***\n"
1068 "SGPRS: %d\n"
1069 "VGPRS: %d\n"
1070 "Spilled SGPRs: %d\n"
1071 "Spilled VGPRs: %d\n"
1072 "Private memory VGPRs: %d\n"
1073 "Code Size: %d bytes\n"
1074 "LDS: %d blocks\n"
1075 "Scratch: %d bytes per wave\n"
1076 "Max Waves: %d\n"
1077 "********************\n\n\n",
1078 conf->num_sgprs, conf->num_vgprs, conf->spilled_sgprs, conf->spilled_vgprs,
1079 shader->info.private_mem_vgprs, si_get_shader_binary_size(sscreen, shader),
1080 conf->lds_size, conf->scratch_bytes_per_wave, shader->info.max_simd_waves);
1081 }
1082 }
1083
1084 const char *si_get_shader_name(const struct si_shader *shader)
1085 {
1086 switch (shader->selector->info.stage) {
1087 case MESA_SHADER_VERTEX:
1088 if (shader->key.as_es)
1089 return "Vertex Shader as ES";
1090 else if (shader->key.as_ls)
1091 return "Vertex Shader as LS";
1092 else if (shader->key.opt.vs_as_prim_discard_cs)
1093 return "Vertex Shader as Primitive Discard CS";
1094 else if (shader->key.as_ngg)
1095 return "Vertex Shader as ESGS";
1096 else
1097 return "Vertex Shader as VS";
1098 case MESA_SHADER_TESS_CTRL:
1099 return "Tessellation Control Shader";
1100 case MESA_SHADER_TESS_EVAL:
1101 if (shader->key.as_es)
1102 return "Tessellation Evaluation Shader as ES";
1103 else if (shader->key.as_ngg)
1104 return "Tessellation Evaluation Shader as ESGS";
1105 else
1106 return "Tessellation Evaluation Shader as VS";
1107 case MESA_SHADER_GEOMETRY:
1108 if (shader->is_gs_copy_shader)
1109 return "GS Copy Shader as VS";
1110 else
1111 return "Geometry Shader";
1112 case MESA_SHADER_FRAGMENT:
1113 return "Pixel Shader";
1114 case MESA_SHADER_COMPUTE:
1115 return "Compute Shader";
1116 default:
1117 return "Unknown Shader";
1118 }
1119 }
1120
1121 void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
1122 struct pipe_debug_callback *debug, FILE *file, bool check_debug_option)
1123 {
1124 gl_shader_stage stage = shader->selector->info.stage;
1125
1126 if (!check_debug_option || si_can_dump_shader(sscreen, stage))
1127 si_dump_shader_key(shader, file);
1128
1129 if (!check_debug_option && shader->binary.llvm_ir_string) {
1130 if (shader->previous_stage && shader->previous_stage->binary.llvm_ir_string) {
1131 fprintf(file, "\n%s - previous stage - LLVM IR:\n\n", si_get_shader_name(shader));
1132 fprintf(file, "%s\n", shader->previous_stage->binary.llvm_ir_string);
1133 }
1134
1135 fprintf(file, "\n%s - main shader part - LLVM IR:\n\n", si_get_shader_name(shader));
1136 fprintf(file, "%s\n", shader->binary.llvm_ir_string);
1137 }
1138
1139 if (!check_debug_option ||
1140 (si_can_dump_shader(sscreen, stage) && !(sscreen->debug_flags & DBG(NO_ASM)))) {
1141 unsigned wave_size = si_get_shader_wave_size(shader);
1142
1143 fprintf(file, "\n%s:\n", si_get_shader_name(shader));
1144
1145 if (shader->prolog)
1146 si_shader_dump_disassembly(sscreen, &shader->prolog->binary, stage, wave_size, debug,
1147 "prolog", file);
1148 if (shader->previous_stage)
1149 si_shader_dump_disassembly(sscreen, &shader->previous_stage->binary, stage,
1150 wave_size, debug, "previous stage", file);
1151 if (shader->prolog2)
1152 si_shader_dump_disassembly(sscreen, &shader->prolog2->binary, stage, wave_size,
1153 debug, "prolog2", file);
1154
1155 si_shader_dump_disassembly(sscreen, &shader->binary, stage, wave_size, debug, "main",
1156 file);
1157
1158 if (shader->epilog)
1159 si_shader_dump_disassembly(sscreen, &shader->epilog->binary, stage, wave_size, debug,
1160 "epilog", file);
1161 fprintf(file, "\n");
1162 }
1163
1164 si_shader_dump_stats(sscreen, shader, file, check_debug_option);
1165 }
1166
1167 static void si_dump_shader_key_vs(const struct si_shader_key *key,
1168 const struct si_vs_prolog_bits *prolog, const char *prefix,
1169 FILE *f)
1170 {
1171 fprintf(f, " %s.instance_divisor_is_one = %u\n", prefix, prolog->instance_divisor_is_one);
1172 fprintf(f, " %s.instance_divisor_is_fetched = %u\n", prefix,
1173 prolog->instance_divisor_is_fetched);
1174 fprintf(f, " %s.unpack_instance_id_from_vertex_id = %u\n", prefix,
1175 prolog->unpack_instance_id_from_vertex_id);
1176 fprintf(f, " %s.ls_vgpr_fix = %u\n", prefix, prolog->ls_vgpr_fix);
1177
1178 fprintf(f, " mono.vs.fetch_opencode = %x\n", key->mono.vs_fetch_opencode);
1179 fprintf(f, " mono.vs.fix_fetch = {");
1180 for (int i = 0; i < SI_MAX_ATTRIBS; i++) {
1181 union si_vs_fix_fetch fix = key->mono.vs_fix_fetch[i];
1182 if (i)
1183 fprintf(f, ", ");
1184 if (!fix.bits)
1185 fprintf(f, "0");
1186 else
1187 fprintf(f, "%u.%u.%u.%u", fix.u.reverse, fix.u.log_size, fix.u.num_channels_m1,
1188 fix.u.format);
1189 }
1190 fprintf(f, "}\n");
1191 }
1192
1193 static void si_dump_shader_key(const struct si_shader *shader, FILE *f)
1194 {
1195 const struct si_shader_key *key = &shader->key;
1196 gl_shader_stage stage = shader->selector->info.stage;
1197
1198 fprintf(f, "SHADER KEY\n");
1199
1200 switch (stage) {
1201 case MESA_SHADER_VERTEX:
1202 si_dump_shader_key_vs(key, &key->part.vs.prolog, "part.vs.prolog", f);
1203 fprintf(f, " as_es = %u\n", key->as_es);
1204 fprintf(f, " as_ls = %u\n", key->as_ls);
1205 fprintf(f, " as_ngg = %u\n", key->as_ngg);
1206 fprintf(f, " mono.u.vs_export_prim_id = %u\n", key->mono.u.vs_export_prim_id);
1207 fprintf(f, " opt.vs_as_prim_discard_cs = %u\n", key->opt.vs_as_prim_discard_cs);
1208 fprintf(f, " opt.cs_prim_type = %s\n", tgsi_primitive_names[key->opt.cs_prim_type]);
1209 fprintf(f, " opt.cs_indexed = %u\n", key->opt.cs_indexed);
1210 fprintf(f, " opt.cs_instancing = %u\n", key->opt.cs_instancing);
1211 fprintf(f, " opt.cs_primitive_restart = %u\n", key->opt.cs_primitive_restart);
1212 fprintf(f, " opt.cs_provoking_vertex_first = %u\n", key->opt.cs_provoking_vertex_first);
1213 fprintf(f, " opt.cs_need_correct_orientation = %u\n", key->opt.cs_need_correct_orientation);
1214 fprintf(f, " opt.cs_cull_front = %u\n", key->opt.cs_cull_front);
1215 fprintf(f, " opt.cs_cull_back = %u\n", key->opt.cs_cull_back);
1216 fprintf(f, " opt.cs_cull_z = %u\n", key->opt.cs_cull_z);
1217 fprintf(f, " opt.cs_halfz_clip_space = %u\n", key->opt.cs_halfz_clip_space);
1218 break;
1219
1220 case MESA_SHADER_TESS_CTRL:
1221 if (shader->selector->screen->info.chip_class >= GFX9) {
1222 si_dump_shader_key_vs(key, &key->part.tcs.ls_prolog, "part.tcs.ls_prolog", f);
1223 }
1224 fprintf(f, " part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode);
1225 fprintf(f, " mono.u.ff_tcs_inputs_to_copy = 0x%" PRIx64 "\n",
1226 key->mono.u.ff_tcs_inputs_to_copy);
1227 break;
1228
1229 case MESA_SHADER_TESS_EVAL:
1230 fprintf(f, " as_es = %u\n", key->as_es);
1231 fprintf(f, " as_ngg = %u\n", key->as_ngg);
1232 fprintf(f, " mono.u.vs_export_prim_id = %u\n", key->mono.u.vs_export_prim_id);
1233 break;
1234
1235 case MESA_SHADER_GEOMETRY:
1236 if (shader->is_gs_copy_shader)
1237 break;
1238
1239 if (shader->selector->screen->info.chip_class >= GFX9 &&
1240 key->part.gs.es->info.stage == MESA_SHADER_VERTEX) {
1241 si_dump_shader_key_vs(key, &key->part.gs.vs_prolog, "part.gs.vs_prolog", f);
1242 }
1243 fprintf(f, " part.gs.prolog.tri_strip_adj_fix = %u\n",
1244 key->part.gs.prolog.tri_strip_adj_fix);
1245 fprintf(f, " part.gs.prolog.gfx9_prev_is_vs = %u\n", key->part.gs.prolog.gfx9_prev_is_vs);
1246 fprintf(f, " as_ngg = %u\n", key->as_ngg);
1247 break;
1248
1249 case MESA_SHADER_COMPUTE:
1250 break;
1251
1252 case MESA_SHADER_FRAGMENT:
1253 fprintf(f, " part.ps.prolog.color_two_side = %u\n", key->part.ps.prolog.color_two_side);
1254 fprintf(f, " part.ps.prolog.flatshade_colors = %u\n", key->part.ps.prolog.flatshade_colors);
1255 fprintf(f, " part.ps.prolog.poly_stipple = %u\n", key->part.ps.prolog.poly_stipple);
1256 fprintf(f, " part.ps.prolog.force_persp_sample_interp = %u\n",
1257 key->part.ps.prolog.force_persp_sample_interp);
1258 fprintf(f, " part.ps.prolog.force_linear_sample_interp = %u\n",
1259 key->part.ps.prolog.force_linear_sample_interp);
1260 fprintf(f, " part.ps.prolog.force_persp_center_interp = %u\n",
1261 key->part.ps.prolog.force_persp_center_interp);
1262 fprintf(f, " part.ps.prolog.force_linear_center_interp = %u\n",
1263 key->part.ps.prolog.force_linear_center_interp);
1264 fprintf(f, " part.ps.prolog.bc_optimize_for_persp = %u\n",
1265 key->part.ps.prolog.bc_optimize_for_persp);
1266 fprintf(f, " part.ps.prolog.bc_optimize_for_linear = %u\n",
1267 key->part.ps.prolog.bc_optimize_for_linear);
1268 fprintf(f, " part.ps.prolog.samplemask_log_ps_iter = %u\n",
1269 key->part.ps.prolog.samplemask_log_ps_iter);
1270 fprintf(f, " part.ps.epilog.spi_shader_col_format = 0x%x\n",
1271 key->part.ps.epilog.spi_shader_col_format);
1272 fprintf(f, " part.ps.epilog.color_is_int8 = 0x%X\n", key->part.ps.epilog.color_is_int8);
1273 fprintf(f, " part.ps.epilog.color_is_int10 = 0x%X\n", key->part.ps.epilog.color_is_int10);
1274 fprintf(f, " part.ps.epilog.last_cbuf = %u\n", key->part.ps.epilog.last_cbuf);
1275 fprintf(f, " part.ps.epilog.alpha_func = %u\n", key->part.ps.epilog.alpha_func);
1276 fprintf(f, " part.ps.epilog.alpha_to_one = %u\n", key->part.ps.epilog.alpha_to_one);
1277 fprintf(f, " part.ps.epilog.poly_line_smoothing = %u\n",
1278 key->part.ps.epilog.poly_line_smoothing);
1279 fprintf(f, " part.ps.epilog.clamp_color = %u\n", key->part.ps.epilog.clamp_color);
1280 fprintf(f, " mono.u.ps.interpolate_at_sample_force_center = %u\n",
1281 key->mono.u.ps.interpolate_at_sample_force_center);
1282 fprintf(f, " mono.u.ps.fbfetch_msaa = %u\n", key->mono.u.ps.fbfetch_msaa);
1283 fprintf(f, " mono.u.ps.fbfetch_is_1D = %u\n", key->mono.u.ps.fbfetch_is_1D);
1284 fprintf(f, " mono.u.ps.fbfetch_layered = %u\n", key->mono.u.ps.fbfetch_layered);
1285 break;
1286
1287 default:
1288 assert(0);
1289 }
1290
1291 if ((stage == MESA_SHADER_GEOMETRY || stage == MESA_SHADER_TESS_EVAL ||
1292 stage == MESA_SHADER_VERTEX) &&
1293 !key->as_es && !key->as_ls) {
1294 fprintf(f, " opt.kill_outputs = 0x%" PRIx64 "\n", key->opt.kill_outputs);
1295 fprintf(f, " opt.clip_disable = %u\n", key->opt.clip_disable);
1296 if (stage != MESA_SHADER_GEOMETRY)
1297 fprintf(f, " opt.ngg_culling = 0x%x\n", key->opt.ngg_culling);
1298 }
1299 }
1300
1301 static void si_optimize_vs_outputs(struct si_shader_context *ctx)
1302 {
1303 struct si_shader *shader = ctx->shader;
1304 struct si_shader_info *info = &shader->selector->info;
1305 unsigned skip_vs_optim_mask = 0;
1306
1307 if ((ctx->stage != MESA_SHADER_VERTEX && ctx->stage != MESA_SHADER_TESS_EVAL) ||
1308 shader->key.as_ls || shader->key.as_es)
1309 return;
1310
 1311    /* Optimizing these outputs is not possible, since they might be overridden
1312 * at runtime with S_028644_PT_SPRITE_TEX. */
1313 for (int i = 0; i < info->num_outputs; i++) {
1314 if (info->output_semantic[i] == VARYING_SLOT_PNTC ||
1315 (info->output_semantic[i] >= VARYING_SLOT_TEX0 &&
1316 info->output_semantic[i] <= VARYING_SLOT_TEX7)) {
1317 skip_vs_optim_mask |= 1u << shader->info.vs_output_param_offset[i];
1318 }
1319 }
1320
1321 ac_optimize_vs_outputs(&ctx->ac, ctx->main_fn, shader->info.vs_output_param_offset,
1322 info->num_outputs, skip_vs_optim_mask,
1323 &shader->info.nr_param_exports);
1324 }
1325
1326 static bool si_vs_needs_prolog(const struct si_shader_selector *sel,
1327 const struct si_vs_prolog_bits *prolog_key,
1328 const struct si_shader_key *key, bool ngg_cull_shader)
1329 {
1330 /* VGPR initialization fixup for Vega10 and Raven is always done in the
1331 * VS prolog. */
1332 return sel->vs_needs_prolog || prolog_key->ls_vgpr_fix ||
1333 prolog_key->unpack_instance_id_from_vertex_id ||
1334 (ngg_cull_shader && key->opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL);
1335 }
1336
1337 static bool si_build_main_function(struct si_shader_context *ctx, struct si_shader *shader,
1338 struct nir_shader *nir, bool free_nir, bool ngg_cull_shader)
1339 {
1340 struct si_shader_selector *sel = shader->selector;
1341 const struct si_shader_info *info = &sel->info;
1342
1343 ctx->shader = shader;
1344 ctx->stage = sel->info.stage;
1345
1346 ctx->num_const_buffers = util_last_bit(info->const_buffers_declared);
1347 ctx->num_shader_buffers = util_last_bit(info->shader_buffers_declared);
1348
1349 ctx->num_samplers = util_last_bit(info->samplers_declared);
1350 ctx->num_images = util_last_bit(info->images_declared);
1351
1352 si_llvm_init_resource_callbacks(ctx);
1353
1354 switch (ctx->stage) {
1355 case MESA_SHADER_VERTEX:
1356 si_llvm_init_vs_callbacks(ctx, ngg_cull_shader);
1357 break;
1358 case MESA_SHADER_TESS_CTRL:
1359 si_llvm_init_tcs_callbacks(ctx);
1360 break;
1361 case MESA_SHADER_TESS_EVAL:
1362 si_llvm_init_tes_callbacks(ctx, ngg_cull_shader);
1363 break;
1364 case MESA_SHADER_GEOMETRY:
1365 si_llvm_init_gs_callbacks(ctx);
1366 break;
1367 case MESA_SHADER_FRAGMENT:
1368 si_llvm_init_ps_callbacks(ctx);
1369 break;
1370 case MESA_SHADER_COMPUTE:
1371 ctx->abi.load_local_group_size = si_llvm_get_block_size;
1372 break;
1373 default:
1374 assert(!"Unsupported shader type");
1375 return false;
1376 }
1377
1378 si_create_function(ctx, ngg_cull_shader);
1379
1380 if (ctx->shader->key.as_es || ctx->stage == MESA_SHADER_GEOMETRY)
1381 si_preload_esgs_ring(ctx);
1382
1383 if (ctx->stage == MESA_SHADER_GEOMETRY)
1384 si_preload_gs_rings(ctx);
1385 else if (ctx->stage == MESA_SHADER_TESS_EVAL)
1386 si_llvm_preload_tes_rings(ctx);
1387
1388 if (ctx->stage == MESA_SHADER_TESS_CTRL && sel->info.tessfactors_are_def_in_all_invocs) {
1389 for (unsigned i = 0; i < 6; i++) {
1390 ctx->invoc0_tess_factors[i] = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
1391 }
1392 }
1393
1394 if (ctx->stage == MESA_SHADER_GEOMETRY) {
1395 for (unsigned i = 0; i < 4; i++) {
1396 ctx->gs_next_vertex[i] = ac_build_alloca(&ctx->ac, ctx->ac.i32, "");
1397 }
1398 if (shader->key.as_ngg) {
1399 for (unsigned i = 0; i < 4; ++i) {
1400 ctx->gs_curprim_verts[i] = ac_build_alloca(&ctx->ac, ctx->ac.i32, "");
1401 ctx->gs_generated_prims[i] = ac_build_alloca(&ctx->ac, ctx->ac.i32, "");
1402 }
1403
1404 assert(!ctx->gs_ngg_scratch);
1405 LLVMTypeRef ai32 = LLVMArrayType(ctx->ac.i32, gfx10_ngg_get_scratch_dw_size(shader));
1406 ctx->gs_ngg_scratch =
1407 LLVMAddGlobalInAddressSpace(ctx->ac.module, ai32, "ngg_scratch", AC_ADDR_SPACE_LDS);
1408 LLVMSetInitializer(ctx->gs_ngg_scratch, LLVMGetUndef(ai32));
1409 LLVMSetAlignment(ctx->gs_ngg_scratch, 4);
1410
1411 ctx->gs_ngg_emit = LLVMAddGlobalInAddressSpace(
1412 ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0), "ngg_emit", AC_ADDR_SPACE_LDS);
1413 LLVMSetLinkage(ctx->gs_ngg_emit, LLVMExternalLinkage);
1414 LLVMSetAlignment(ctx->gs_ngg_emit, 4);
1415 }
1416 }
1417
1418 if (ctx->stage != MESA_SHADER_GEOMETRY && (shader->key.as_ngg && !shader->key.as_es)) {
1419 /* Unconditionally declare scratch space base for streamout and
1420 * vertex compaction. Whether space is actually allocated is
1421 * determined during linking / PM4 creation.
1422 *
1423 * Add an extra dword per vertex to ensure an odd stride, which
1424 * avoids bank conflicts for SoA accesses.
1425 */
1426 if (!gfx10_is_ngg_passthrough(shader))
1427 si_llvm_declare_esgs_ring(ctx);
1428
1429 /* This is really only needed when streamout and / or vertex
1430 * compaction is enabled.
1431 */
1432 if (!ctx->gs_ngg_scratch && (sel->so.num_outputs || shader->key.opt.ngg_culling)) {
1433 LLVMTypeRef asi32 = LLVMArrayType(ctx->ac.i32, gfx10_ngg_get_scratch_dw_size(shader));
1434 ctx->gs_ngg_scratch =
1435 LLVMAddGlobalInAddressSpace(ctx->ac.module, asi32, "ngg_scratch", AC_ADDR_SPACE_LDS);
1436 LLVMSetInitializer(ctx->gs_ngg_scratch, LLVMGetUndef(asi32));
1437 LLVMSetAlignment(ctx->gs_ngg_scratch, 4);
1438 }
1439 }
1440
1441 /* For GFX9 merged shaders:
1442 * - Set EXEC for the first shader. If the prolog is present, set
1443 * EXEC there instead.
1444 * - Add a barrier before the second shader.
1445 * - In the second shader, reset EXEC to ~0 and wrap the main part in
1446 * an if-statement. This is required for correctness in geometry
1447 * shaders, to ensure that empty GS waves do not send GS_EMIT and
1448 * GS_CUT messages.
1449 *
1450 * For monolithic merged shaders, the first shader is wrapped in an
1451 * if-block together with its prolog in si_build_wrapper_function.
1452 *
1453 * NGG vertex and tess eval shaders running as the last
1454 * vertex/geometry stage handle execution explicitly using
1455 * if-statements.
1456 */
1457 if (ctx->screen->info.chip_class >= GFX9) {
1458 if (!shader->is_monolithic && (shader->key.as_es || shader->key.as_ls) &&
1459 (ctx->stage == MESA_SHADER_TESS_EVAL ||
1460 (ctx->stage == MESA_SHADER_VERTEX &&
1461 !si_vs_needs_prolog(sel, &shader->key.part.vs.prolog, &shader->key, ngg_cull_shader)))) {
1462 si_init_exec_from_input(ctx, ctx->merged_wave_info, 0);
1463 } else if (ctx->stage == MESA_SHADER_TESS_CTRL || ctx->stage == MESA_SHADER_GEOMETRY ||
1464 (shader->key.as_ngg && !shader->key.as_es)) {
1465 LLVMValueRef thread_enabled;
1466 bool nested_barrier;
1467
1468 if (!shader->is_monolithic || (ctx->stage == MESA_SHADER_TESS_EVAL && shader->key.as_ngg &&
1469 !shader->key.as_es && !shader->key.opt.ngg_culling))
1470 ac_init_exec_full_mask(&ctx->ac);
1471
1472 if ((ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL) &&
1473 shader->key.as_ngg && !shader->key.as_es && !shader->key.opt.ngg_culling) {
1474 gfx10_ngg_build_sendmsg_gs_alloc_req(ctx);
1475
1476 /* Build the primitive export at the beginning
1477 * of the shader if possible.
1478 */
1479 if (gfx10_ngg_export_prim_early(shader))
1480 gfx10_ngg_build_export_prim(ctx, NULL, NULL);
1481 }
1482
1483 if (ctx->stage == MESA_SHADER_TESS_CTRL || ctx->stage == MESA_SHADER_GEOMETRY) {
1484 if (ctx->stage == MESA_SHADER_GEOMETRY && shader->key.as_ngg) {
1485 gfx10_ngg_gs_emit_prologue(ctx);
1486 nested_barrier = false;
1487 } else {
1488 nested_barrier = true;
1489 }
1490
1491 thread_enabled = si_is_gs_thread(ctx);
1492 } else {
1493 thread_enabled = si_is_es_thread(ctx);
1494 nested_barrier = false;
1495 }
1496
1497 ctx->merged_wrap_if_entry_block = LLVMGetInsertBlock(ctx->ac.builder);
1498 ctx->merged_wrap_if_label = 11500;
1499 ac_build_ifcc(&ctx->ac, thread_enabled, ctx->merged_wrap_if_label);
1500
1501 if (nested_barrier) {
1502 /* Execute a barrier before the second shader in
1503 * a merged shader.
1504 *
1505 * Execute the barrier inside the conditional block,
1506 * so that empty waves can jump directly to s_endpgm,
1507 * which will also signal the barrier.
1508 *
1509 * This is possible in gfx9, because an empty wave
1510 * for the second shader does not participate in
1511 * the epilogue. With NGG, empty waves may still
1512 * be required to export data (e.g. GS output vertices),
1513 * so we cannot let them exit early.
1514 *
1515 * If the shader is TCS and the TCS epilog is present
1516 * and contains a barrier, it will wait there and then
1517 * reach s_endpgm.
1518 */
1519 si_llvm_emit_barrier(ctx);
1520 }
1521 }
1522 }
1523
1524 bool success = si_nir_build_llvm(ctx, nir);
1525 if (free_nir)
1526 ralloc_free(nir);
1527 if (!success) {
1528 fprintf(stderr, "Failed to translate shader from NIR to LLVM\n");
1529 return false;
1530 }
1531
1532 si_llvm_build_ret(ctx, ctx->return_value);
1533 return true;
1534 }
1535
1536 /**
1537 * Compute the VS prolog key, which contains all the information needed to
1538 * build the VS prolog function, and set shader->info bits where needed.
1539 *
1540 * \param info Shader info of the vertex shader.
1541 * \param num_input_sgprs Number of input SGPRs for the vertex shader.
1542  * \param ngg_cull_shader Whether the preceding shader part is the NGG cull shader.
1543  * \param prolog_key Key of the VS prolog.
1544 * \param shader_out The vertex shader, or the next shader if merging LS+HS or ES+GS.
1545 * \param key Output shader part key.
1546 */
1547 static void si_get_vs_prolog_key(const struct si_shader_info *info, unsigned num_input_sgprs,
1548 bool ngg_cull_shader, const struct si_vs_prolog_bits *prolog_key,
1549 struct si_shader *shader_out, union si_shader_part_key *key)
1550 {
1551 memset(key, 0, sizeof(*key));
1552 key->vs_prolog.states = *prolog_key;
1553 key->vs_prolog.num_input_sgprs = num_input_sgprs;
1554 key->vs_prolog.num_inputs = info->num_inputs;
1555 key->vs_prolog.as_ls = shader_out->key.as_ls;
1556 key->vs_prolog.as_es = shader_out->key.as_es;
1557 key->vs_prolog.as_ngg = shader_out->key.as_ngg;
1558 key->vs_prolog.as_prim_discard_cs = shader_out->key.opt.vs_as_prim_discard_cs;
1559
1560 if (ngg_cull_shader) {
1561 key->vs_prolog.gs_fast_launch_tri_list =
1562 !!(shader_out->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST);
1563 key->vs_prolog.gs_fast_launch_tri_strip =
1564 !!(shader_out->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP);
1565 } else {
1566 key->vs_prolog.has_ngg_cull_inputs = !!shader_out->key.opt.ngg_culling;
1567 }
1568
1569 if (shader_out->selector->info.stage == MESA_SHADER_TESS_CTRL) {
1570 key->vs_prolog.as_ls = 1;
1571 key->vs_prolog.num_merged_next_stage_vgprs = 2;
1572 } else if (shader_out->selector->info.stage == MESA_SHADER_GEOMETRY) {
1573 key->vs_prolog.as_es = 1;
1574 key->vs_prolog.num_merged_next_stage_vgprs = 5;
1575 } else if (shader_out->key.as_ngg) {
1576 key->vs_prolog.num_merged_next_stage_vgprs = 5;
1577 }
1578
1579 /* Only one of these combinations can be set. as_ngg can be set with as_es. */
1580 assert(key->vs_prolog.as_ls + key->vs_prolog.as_ngg +
1581 (key->vs_prolog.as_es && !key->vs_prolog.as_ngg) + key->vs_prolog.as_prim_discard_cs <=
1582 1);
1583
1584 /* Enable loading the InstanceID VGPR. */
1585 uint16_t input_mask = u_bit_consecutive(0, info->num_inputs);
1586
1587 if ((key->vs_prolog.states.instance_divisor_is_one |
1588 key->vs_prolog.states.instance_divisor_is_fetched) &
1589 input_mask)
1590 shader_out->info.uses_instanceid = true;
1591 }
1592
1593 static bool si_should_optimize_less(struct ac_llvm_compiler *compiler,
1594 struct si_shader_selector *sel)
1595 {
1596 if (!compiler->low_opt_passes)
1597 return false;
1598
1599 /* Assume a slow CPU. */
1600 assert(!sel->screen->info.has_dedicated_vram && sel->screen->info.chip_class <= GFX8);
1601
1602 /* For a crazy dEQP test containing 2597 memory opcodes, mostly
1603 * buffer stores. */
1604 return sel->info.stage == MESA_SHADER_COMPUTE && sel->info.num_memory_instructions > 1000;
1605 }
1606
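/* Return the selector's NIR. If only the serialized blob is stored,
 * deserialize it and set *free_nir so the caller knows it owns the result.
 */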
1607 static struct nir_shader *get_nir_shader(struct si_shader_selector *sel, bool *free_nir)
1608 {
1609 *free_nir = false;
1610
1611 if (sel->nir) {
1612 return sel->nir;
1613 } else if (sel->nir_binary) {
1614 struct pipe_screen *screen = &sel->screen->b;
1615 const void *options = screen->get_compiler_options(screen, PIPE_SHADER_IR_NIR,
1616 pipe_shader_type_from_mesa(sel->info.stage));
1617
1618 struct blob_reader blob_reader;
1619 blob_reader_init(&blob_reader, sel->nir_binary, sel->nir_size);
1620 *free_nir = true;
1621 return nir_deserialize(NULL, options, &blob_reader);
1622 }
1623 return NULL;
1624 }
1625
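/* Build the LLVM module for one shader variant (including the optional NGG
 * culling part and, for monolithic shaders, the prologs/epilogs and the main
 * part of the previous stage) and compile it to a shader binary.
 */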
1626 static bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
1627 struct si_shader *shader, struct pipe_debug_callback *debug,
1628 struct nir_shader *nir, bool free_nir)
1629 {
1630 struct si_shader_selector *sel = shader->selector;
1631 struct si_shader_context ctx;
1632
1633 si_llvm_context_init(&ctx, sscreen, compiler, si_get_shader_wave_size(shader));
1634
1635 LLVMValueRef ngg_cull_main_fn = NULL;
1636 if (shader->key.opt.ngg_culling) {
1637 if (!si_build_main_function(&ctx, shader, nir, false, true)) {
1638 si_llvm_dispose(&ctx);
1639 return false;
1640 }
1641 ngg_cull_main_fn = ctx.main_fn;
1642 ctx.main_fn = NULL;
1643 }
1644
1645 if (!si_build_main_function(&ctx, shader, nir, free_nir, false)) {
1646 si_llvm_dispose(&ctx);
1647 return false;
1648 }
1649
1650 if (shader->is_monolithic && ctx.stage == MESA_SHADER_VERTEX) {
1651 LLVMValueRef parts[4];
1652 unsigned num_parts = 0;
1653 bool has_prolog = false;
1654 LLVMValueRef main_fn = ctx.main_fn;
1655
1656 if (ngg_cull_main_fn) {
1657 if (si_vs_needs_prolog(sel, &shader->key.part.vs.prolog, &shader->key, true)) {
1658 union si_shader_part_key prolog_key;
1659 si_get_vs_prolog_key(&sel->info, shader->info.num_input_sgprs, true,
1660 &shader->key.part.vs.prolog, shader, &prolog_key);
1661 prolog_key.vs_prolog.is_monolithic = true;
1662 si_llvm_build_vs_prolog(&ctx, &prolog_key);
1663 parts[num_parts++] = ctx.main_fn;
1664 has_prolog = true;
1665 }
1666 parts[num_parts++] = ngg_cull_main_fn;
1667 }
1668
1669 if (si_vs_needs_prolog(sel, &shader->key.part.vs.prolog, &shader->key, false)) {
1670 union si_shader_part_key prolog_key;
1671 si_get_vs_prolog_key(&sel->info, shader->info.num_input_sgprs, false,
1672 &shader->key.part.vs.prolog, shader, &prolog_key);
1673 prolog_key.vs_prolog.is_monolithic = true;
1674 si_llvm_build_vs_prolog(&ctx, &prolog_key);
1675 parts[num_parts++] = ctx.main_fn;
1676 has_prolog = true;
1677 }
1678 parts[num_parts++] = main_fn;
1679
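      /* parts[] now holds, in order: the optional VS prolog of the NGG cull
       * shader, the NGG cull shader, the optional VS prolog, and the VS main
       * part. si_build_wrapper_function concatenates them into one function.
       */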
1680 si_build_wrapper_function(&ctx, parts, num_parts, has_prolog ? 1 : 0, 0);
1681
1682 if (ctx.shader->key.opt.vs_as_prim_discard_cs)
1683 si_build_prim_discard_compute_shader(&ctx);
1684 } else if (shader->is_monolithic && ctx.stage == MESA_SHADER_TESS_EVAL && ngg_cull_main_fn) {
1685 LLVMValueRef parts[2];
1686
1687 parts[0] = ngg_cull_main_fn;
1688 parts[1] = ctx.main_fn;
1689
1690 si_build_wrapper_function(&ctx, parts, 2, 0, 0);
1691 } else if (shader->is_monolithic && ctx.stage == MESA_SHADER_TESS_CTRL) {
1692 if (sscreen->info.chip_class >= GFX9) {
1693 struct si_shader_selector *ls = shader->key.part.tcs.ls;
1694 LLVMValueRef parts[4];
1695 bool vs_needs_prolog =
1696 si_vs_needs_prolog(ls, &shader->key.part.tcs.ls_prolog, &shader->key, false);
1697
1698 /* TCS main part */
1699 parts[2] = ctx.main_fn;
1700
1701 /* TCS epilog */
1702 union si_shader_part_key tcs_epilog_key;
1703 memset(&tcs_epilog_key, 0, sizeof(tcs_epilog_key));
1704 tcs_epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
1705 si_llvm_build_tcs_epilog(&ctx, &tcs_epilog_key);
1706 parts[3] = ctx.main_fn;
1707
1708 /* VS as LS main part */
1709 nir = get_nir_shader(ls, &free_nir);
1710 struct si_shader shader_ls = {};
1711 shader_ls.selector = ls;
1712 shader_ls.key.as_ls = 1;
1713 shader_ls.key.mono = shader->key.mono;
1714 shader_ls.key.opt = shader->key.opt;
1715 shader_ls.is_monolithic = true;
1716
1717 if (!si_build_main_function(&ctx, &shader_ls, nir, free_nir, false)) {
1718 si_llvm_dispose(&ctx);
1719 return false;
1720 }
1721 shader->info.uses_instanceid |= ls->info.uses_instanceid;
1722 parts[1] = ctx.main_fn;
1723
1724 /* LS prolog */
1725 if (vs_needs_prolog) {
1726 union si_shader_part_key vs_prolog_key;
1727 si_get_vs_prolog_key(&ls->info, shader_ls.info.num_input_sgprs, false,
1728 &shader->key.part.tcs.ls_prolog, shader, &vs_prolog_key);
1729 vs_prolog_key.vs_prolog.is_monolithic = true;
1730 si_llvm_build_vs_prolog(&ctx, &vs_prolog_key);
1731 parts[0] = ctx.main_fn;
1732 }
1733
1734 /* Reset the shader context. */
1735 ctx.shader = shader;
1736 ctx.stage = MESA_SHADER_TESS_CTRL;
1737
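         /* Part layout for the merged LS+HS shader on GFX9:
          *   parts[0] = LS (VS) prolog (optional)
          *   parts[1] = LS (VS) main
          *   parts[2] = TCS main
          *   parts[3] = TCS epilog
          * The 4th argument of si_build_wrapper_function selects the first
          * shader's main part; the 5th marks where the second shader (TCS)
          * begins.
          */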
1738 si_build_wrapper_function(&ctx, parts + !vs_needs_prolog, 4 - !vs_needs_prolog,
1739 vs_needs_prolog, vs_needs_prolog ? 2 : 1);
1740 } else {
1741 LLVMValueRef parts[2];
1742 union si_shader_part_key epilog_key;
1743
1744 parts[0] = ctx.main_fn;
1745
1746 memset(&epilog_key, 0, sizeof(epilog_key));
1747 epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
1748 si_llvm_build_tcs_epilog(&ctx, &epilog_key);
1749 parts[1] = ctx.main_fn;
1750
1751 si_build_wrapper_function(&ctx, parts, 2, 0, 0);
1752 }
1753 } else if (shader->is_monolithic && ctx.stage == MESA_SHADER_GEOMETRY) {
1754 if (ctx.screen->info.chip_class >= GFX9) {
1755 struct si_shader_selector *es = shader->key.part.gs.es;
1756 LLVMValueRef es_prolog = NULL;
1757 LLVMValueRef es_main = NULL;
1758 LLVMValueRef gs_prolog = NULL;
1759 LLVMValueRef gs_main = ctx.main_fn;
1760
1761 /* GS prolog */
1762 union si_shader_part_key gs_prolog_key;
1763 memset(&gs_prolog_key, 0, sizeof(gs_prolog_key));
1764 gs_prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
1765 gs_prolog_key.gs_prolog.is_monolithic = true;
1766 gs_prolog_key.gs_prolog.as_ngg = shader->key.as_ngg;
1767 si_llvm_build_gs_prolog(&ctx, &gs_prolog_key);
1768 gs_prolog = ctx.main_fn;
1769
1770 /* ES main part */
1771 nir = get_nir_shader(es, &free_nir);
1772 struct si_shader shader_es = {};
1773 shader_es.selector = es;
1774 shader_es.key.as_es = 1;
1775 shader_es.key.as_ngg = shader->key.as_ngg;
1776 shader_es.key.mono = shader->key.mono;
1777 shader_es.key.opt = shader->key.opt;
1778 shader_es.is_monolithic = true;
1779
1780 if (!si_build_main_function(&ctx, &shader_es, nir, free_nir, false)) {
1781 si_llvm_dispose(&ctx);
1782 return false;
1783 }
1784 shader->info.uses_instanceid |= es->info.uses_instanceid;
1785 es_main = ctx.main_fn;
1786
1787 /* ES prolog */
1788 if (es->info.stage == MESA_SHADER_VERTEX &&
1789 si_vs_needs_prolog(es, &shader->key.part.gs.vs_prolog, &shader->key, false)) {
1790 union si_shader_part_key vs_prolog_key;
1791 si_get_vs_prolog_key(&es->info, shader_es.info.num_input_sgprs, false,
1792 &shader->key.part.gs.vs_prolog, shader, &vs_prolog_key);
1793 vs_prolog_key.vs_prolog.is_monolithic = true;
1794 si_llvm_build_vs_prolog(&ctx, &vs_prolog_key);
1795 es_prolog = ctx.main_fn;
1796 }
1797
1798 /* Reset the shader context. */
1799 ctx.shader = shader;
1800 ctx.stage = MESA_SHADER_GEOMETRY;
1801
1802 /* Prepare the array of shader parts. */
1803 LLVMValueRef parts[4];
1804 unsigned num_parts = 0, main_part, next_first_part;
1805
1806 if (es_prolog)
1807 parts[num_parts++] = es_prolog;
1808
1809 parts[main_part = num_parts++] = es_main;
1810 parts[next_first_part = num_parts++] = gs_prolog;
1811 parts[num_parts++] = gs_main;
1812
1813 si_build_wrapper_function(&ctx, parts, num_parts, main_part, next_first_part);
1814 } else {
1815 LLVMValueRef parts[2];
1816 union si_shader_part_key prolog_key;
1817
1818 parts[1] = ctx.main_fn;
1819
1820 memset(&prolog_key, 0, sizeof(prolog_key));
1821 prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
1822 si_llvm_build_gs_prolog(&ctx, &prolog_key);
1823 parts[0] = ctx.main_fn;
1824
1825 si_build_wrapper_function(&ctx, parts, 2, 1, 0);
1826 }
1827 } else if (shader->is_monolithic && ctx.stage == MESA_SHADER_FRAGMENT) {
1828 si_llvm_build_monolithic_ps(&ctx, shader);
1829 }
1830
1831 si_llvm_optimize_module(&ctx);
1832
1833 /* Post-optimization transformations and analysis. */
1834 si_optimize_vs_outputs(&ctx);
1835
1836 if ((debug && debug->debug_message) || si_can_dump_shader(sscreen, ctx.stage)) {
1837 ctx.shader->info.private_mem_vgprs = ac_count_scratch_private_memory(ctx.main_fn);
1838 }
1839
1840    /* Make sure the input is a pointer and not an integer followed by inttoptr. */
1841 assert(LLVMGetTypeKind(LLVMTypeOf(LLVMGetParam(ctx.main_fn, 0))) == LLVMPointerTypeKind);
1842
1843 /* Compile to bytecode. */
1844 if (!si_compile_llvm(sscreen, &shader->binary, &shader->config, compiler, &ctx.ac, debug,
1845 ctx.stage, si_get_shader_name(shader),
1846 si_should_optimize_less(compiler, shader->selector))) {
1847 si_llvm_dispose(&ctx);
1848 fprintf(stderr, "LLVM failed to compile shader\n");
1849 return false;
1850 }
1851
1852 si_llvm_dispose(&ctx);
1853 return true;
1854 }
1855
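/* Compile the main part of one shader variant (or the whole shader if it is
 * monolithic) and fill in shader->binary, shader->config and shader->info.
 */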
1856 bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
1857 struct si_shader *shader, struct pipe_debug_callback *debug)
1858 {
1859 struct si_shader_selector *sel = shader->selector;
1860 bool free_nir;
1861 struct nir_shader *nir = get_nir_shader(sel, &free_nir);
1862
1863 /* Dump NIR before doing NIR->LLVM conversion in case the
1864 * conversion fails. */
1865 if (si_can_dump_shader(sscreen, sel->info.stage) &&
1866 !(sscreen->debug_flags & DBG(NO_NIR))) {
1867 nir_print_shader(nir, stderr);
1868 si_dump_streamout(&sel->so);
1869 }
1870
1871 memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
1872 sizeof(shader->info.vs_output_param_offset));
1873
1874 shader->info.uses_instanceid = sel->info.uses_instanceid;
1875
1876 /* TODO: ACO could compile non-monolithic shaders here (starting
1877 * with PS and NGG VS), but monolithic shaders should be compiled
1878 * by LLVM due to more complicated compilation.
1879 */
1880 if (!si_llvm_compile_shader(sscreen, compiler, shader, debug, nir, free_nir))
1881 return false;
1882
1883    /* Validate SGPR and VGPR usage for compute shaders to detect compiler bugs
1884     * (LLVM 3.9svn had such a bug).
1885     */
1886 if (sel->info.stage == MESA_SHADER_COMPUTE) {
1887 unsigned wave_size = sscreen->compute_wave_size;
1888 unsigned max_vgprs =
1889 sscreen->info.num_physical_wave64_vgprs_per_simd * (wave_size == 32 ? 2 : 1);
1890 unsigned max_sgprs = sscreen->info.num_physical_sgprs_per_simd;
1891 unsigned max_sgprs_per_wave = 128;
1892 unsigned simds_per_tg = 4; /* assuming WGP mode on gfx10 */
1893 unsigned threads_per_tg = si_get_max_workgroup_size(shader);
1894 unsigned waves_per_tg = DIV_ROUND_UP(threads_per_tg, wave_size);
1895 unsigned waves_per_simd = DIV_ROUND_UP(waves_per_tg, simds_per_tg);
1896
1897 max_vgprs = max_vgprs / waves_per_simd;
1898 max_sgprs = MIN2(max_sgprs / waves_per_simd, max_sgprs_per_wave);
1899
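      /* Illustrative example (hypothetical numbers): with wave_size = 64,
       * 256 physical VGPRs per SIMD, 1024 threads per threadgroup and 4 SIMDs
       * per threadgroup, waves_per_tg = 16 and waves_per_simd = 4, so a wave
       * may use at most 256 / 4 = 64 VGPRs before the check below fails.
       */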
1900 if (shader->config.num_sgprs > max_sgprs || shader->config.num_vgprs > max_vgprs) {
1901 fprintf(stderr,
1902 "LLVM failed to compile a shader correctly: "
1903 "SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n",
1904 shader->config.num_sgprs, shader->config.num_vgprs, max_sgprs, max_vgprs);
1905
1906 /* Just terminate the process, because dependent
1907 * shaders can hang due to bad input data, but use
1908 * the env var to allow shader-db to work.
1909 */
1910 if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false))
1911 abort();
1912 }
1913 }
1914
1915 /* Add the scratch offset to input SGPRs. */
1916 if (shader->config.scratch_bytes_per_wave && !si_is_merged_shader(shader))
1917 shader->info.num_input_sgprs += 1; /* scratch byte offset */
1918
1919 /* Calculate the number of fragment input VGPRs. */
1920 if (sel->info.stage == MESA_SHADER_FRAGMENT) {
1921 shader->info.num_input_vgprs = ac_get_fs_input_vgpr_cnt(
1922 &shader->config, &shader->info.face_vgpr_index, &shader->info.ancillary_vgpr_index);
1923 }
1924
1925 si_calculate_max_simd_waves(shader);
1926 si_shader_dump_stats_for_shader_db(sscreen, shader, debug);
1927 return true;
1928 }
1929
1930 /**
1931 * Create, compile and return a shader part (prolog or epilog).
1932 *
1933 * \param sscreen screen
1934 * \param list list of shader parts of the same category
1935  * \param stage     shader stage
1936  * \param prolog    whether the part being requested is a prolog
1937  * \param key       shader part key
1938  * \param compiler  LLVM compiler
1939 * \param debug debug callback
1940 * \param build the callback responsible for building the main function
1941 * \return non-NULL on success
1942 */
1943 static struct si_shader_part *
1944 si_get_shader_part(struct si_screen *sscreen, struct si_shader_part **list,
1945 gl_shader_stage stage, bool prolog, union si_shader_part_key *key,
1946 struct ac_llvm_compiler *compiler, struct pipe_debug_callback *debug,
1947 void (*build)(struct si_shader_context *, union si_shader_part_key *),
1948 const char *name)
1949 {
1950 struct si_shader_part *result;
1951
1952 simple_mtx_lock(&sscreen->shader_parts_mutex);
1953
1954 /* Find existing. */
1955 for (result = *list; result; result = result->next) {
1956 if (memcmp(&result->key, key, sizeof(*key)) == 0) {
1957 simple_mtx_unlock(&sscreen->shader_parts_mutex);
1958 return result;
1959 }
1960 }
1961
1962 /* Compile a new one. */
1963 result = CALLOC_STRUCT(si_shader_part);
1964 result->key = *key;
1965
1966 struct si_shader_selector sel = {};
1967 sel.screen = sscreen;
1968
1969 struct si_shader shader = {};
1970 shader.selector = &sel;
1971
1972 switch (stage) {
1973 case MESA_SHADER_VERTEX:
1974 shader.key.as_ls = key->vs_prolog.as_ls;
1975 shader.key.as_es = key->vs_prolog.as_es;
1976 shader.key.as_ngg = key->vs_prolog.as_ngg;
1977 shader.key.opt.ngg_culling =
1978 (key->vs_prolog.gs_fast_launch_tri_list ? SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST : 0) |
1979 (key->vs_prolog.gs_fast_launch_tri_strip ? SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP : 0);
1980 shader.key.opt.vs_as_prim_discard_cs = key->vs_prolog.as_prim_discard_cs;
1981 break;
1982 case MESA_SHADER_TESS_CTRL:
1983 assert(!prolog);
1984 shader.key.part.tcs.epilog = key->tcs_epilog.states;
1985 break;
1986 case MESA_SHADER_GEOMETRY:
1987 assert(prolog);
1988 shader.key.as_ngg = key->gs_prolog.as_ngg;
1989 break;
1990 case MESA_SHADER_FRAGMENT:
1991 if (prolog)
1992 shader.key.part.ps.prolog = key->ps_prolog.states;
1993 else
1994 shader.key.part.ps.epilog = key->ps_epilog.states;
1995 break;
1996 default:
1997 unreachable("bad shader part");
1998 }
1999
2000 struct si_shader_context ctx;
2001 si_llvm_context_init(&ctx, sscreen, compiler,
2002 si_get_wave_size(sscreen, stage,
2003 shader.key.as_ngg, shader.key.as_es,
2004 shader.key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL,
2005 shader.key.opt.vs_as_prim_discard_cs));
2006 ctx.shader = &shader;
2007 ctx.stage = stage;
2008
2009 build(&ctx, key);
2010
2011 /* Compile. */
2012 si_llvm_optimize_module(&ctx);
2013
2014 if (!si_compile_llvm(sscreen, &result->binary, &result->config, compiler, &ctx.ac, debug,
2015 ctx.stage, name, false)) {
2016 FREE(result);
2017 result = NULL;
2018 goto out;
2019 }
2020
2021 result->next = *list;
2022 *list = result;
2023
2024 out:
2025 si_llvm_dispose(&ctx);
2026 simple_mtx_unlock(&sscreen->shader_parts_mutex);
2027 return result;
2028 }
2029
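/**
 * Select and compile (or reuse) the VS prolog for the given main part,
 * if one is needed.
 */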
2030 static bool si_get_vs_prolog(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
2031 struct si_shader *shader, struct pipe_debug_callback *debug,
2032 struct si_shader *main_part, const struct si_vs_prolog_bits *key)
2033 {
2034 struct si_shader_selector *vs = main_part->selector;
2035
2036 if (!si_vs_needs_prolog(vs, key, &shader->key, false))
2037 return true;
2038
2039 /* Get the prolog. */
2040 union si_shader_part_key prolog_key;
2041 si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs, false, key, shader,
2042 &prolog_key);
2043
2044 shader->prolog =
2045 si_get_shader_part(sscreen, &sscreen->vs_prologs, MESA_SHADER_VERTEX, true, &prolog_key,
2046 compiler, debug, si_llvm_build_vs_prolog, "Vertex Shader Prolog");
2047 return shader->prolog != NULL;
2048 }
2049
2050 /**
2051 * Select and compile (or reuse) vertex shader parts (prolog & epilog).
2052 */
2053 static bool si_shader_select_vs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
2054 struct si_shader *shader, struct pipe_debug_callback *debug)
2055 {
2056 return si_get_vs_prolog(sscreen, compiler, shader, debug, shader, &shader->key.part.vs.prolog);
2057 }
2058
2059 /**
2060 * Select and compile (or reuse) TCS parts (epilog).
2061 */
2062 static bool si_shader_select_tcs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
2063 struct si_shader *shader, struct pipe_debug_callback *debug)
2064 {
2065 if (sscreen->info.chip_class >= GFX9) {
2066 struct si_shader *ls_main_part = shader->key.part.tcs.ls->main_shader_part_ls;
2067
2068 if (!si_get_vs_prolog(sscreen, compiler, shader, debug, ls_main_part,
2069 &shader->key.part.tcs.ls_prolog))
2070 return false;
2071
2072 shader->previous_stage = ls_main_part;
2073 }
2074
2075 /* Get the epilog. */
2076 union si_shader_part_key epilog_key;
2077 memset(&epilog_key, 0, sizeof(epilog_key));
2078 epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
2079
2080 shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs, MESA_SHADER_TESS_CTRL, false,
2081 &epilog_key, compiler, debug, si_llvm_build_tcs_epilog,
2082 "Tessellation Control Shader Epilog");
2083 return shader->epilog != NULL;
2084 }
2085
2086 /**
2087 * Select and compile (or reuse) GS parts (prolog).
2088 */
2089 static bool si_shader_select_gs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
2090 struct si_shader *shader, struct pipe_debug_callback *debug)
2091 {
2092 if (sscreen->info.chip_class >= GFX9) {
2093 struct si_shader *es_main_part;
2094
2095 if (shader->key.as_ngg)
2096 es_main_part = shader->key.part.gs.es->main_shader_part_ngg_es;
2097 else
2098 es_main_part = shader->key.part.gs.es->main_shader_part_es;
2099
2100 if (shader->key.part.gs.es->info.stage == MESA_SHADER_VERTEX &&
2101 !si_get_vs_prolog(sscreen, compiler, shader, debug, es_main_part,
2102 &shader->key.part.gs.vs_prolog))
2103 return false;
2104
2105 shader->previous_stage = es_main_part;
2106 }
2107
2108 if (!shader->key.part.gs.prolog.tri_strip_adj_fix)
2109 return true;
2110
2111 union si_shader_part_key prolog_key;
2112 memset(&prolog_key, 0, sizeof(prolog_key));
2113 prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
2114 prolog_key.gs_prolog.as_ngg = shader->key.as_ngg;
2115
2116 shader->prolog2 =
2117 si_get_shader_part(sscreen, &sscreen->gs_prologs, MESA_SHADER_GEOMETRY, true, &prolog_key,
2118 compiler, debug, si_llvm_build_gs_prolog, "Geometry Shader Prolog");
2119 return shader->prolog2 != NULL;
2120 }
2121
2122 /**
2123 * Compute the PS prolog key, which contains all the information needed to
2124 * build the PS prolog function, and set related bits in shader->config.
2125 */
2126 void si_get_ps_prolog_key(struct si_shader *shader, union si_shader_part_key *key,
2127 bool separate_prolog)
2128 {
2129 struct si_shader_info *info = &shader->selector->info;
2130
2131 memset(key, 0, sizeof(*key));
2132 key->ps_prolog.states = shader->key.part.ps.prolog;
2133 key->ps_prolog.colors_read = info->colors_read;
2134 key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
2135 key->ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;
2136 key->ps_prolog.wqm =
2137 info->uses_derivatives &&
2138 (key->ps_prolog.colors_read || key->ps_prolog.states.force_persp_sample_interp ||
2139 key->ps_prolog.states.force_linear_sample_interp ||
2140 key->ps_prolog.states.force_persp_center_interp ||
2141 key->ps_prolog.states.force_linear_center_interp ||
2142 key->ps_prolog.states.bc_optimize_for_persp || key->ps_prolog.states.bc_optimize_for_linear);
2143 key->ps_prolog.ancillary_vgpr_index = shader->info.ancillary_vgpr_index;
2144
2145 if (info->colors_read) {
2146 unsigned *color = shader->selector->color_attr_index;
2147
2148 if (shader->key.part.ps.prolog.color_two_side) {
2149 /* BCOLORs are stored after the last input. */
2150 key->ps_prolog.num_interp_inputs = info->num_inputs;
2151 key->ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;
2152 if (separate_prolog)
2153 shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
2154 }
2155
2156 for (unsigned i = 0; i < 2; i++) {
2157 unsigned interp = info->color_interpolate[i];
2158 unsigned location = info->color_interpolate_loc[i];
2159
2160 if (!(info->colors_read & (0xf << i * 4)))
2161 continue;
2162
2163 key->ps_prolog.color_attr_index[i] = color[i];
2164
2165 if (shader->key.part.ps.prolog.flatshade_colors && interp == INTERP_MODE_COLOR)
2166 interp = INTERP_MODE_FLAT;
2167
2168 switch (interp) {
2169 case INTERP_MODE_FLAT:
2170 key->ps_prolog.color_interp_vgpr_index[i] = -1;
2171 break;
2172 case INTERP_MODE_SMOOTH:
2173 case INTERP_MODE_COLOR:
2174 /* Force the interpolation location for colors here. */
2175 if (shader->key.part.ps.prolog.force_persp_sample_interp)
2176 location = TGSI_INTERPOLATE_LOC_SAMPLE;
2177 if (shader->key.part.ps.prolog.force_persp_center_interp)
2178 location = TGSI_INTERPOLATE_LOC_CENTER;
2179
2180 switch (location) {
2181 case TGSI_INTERPOLATE_LOC_SAMPLE:
2182 key->ps_prolog.color_interp_vgpr_index[i] = 0;
2183 if (separate_prolog) {
2184 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
2185 }
2186 break;
2187 case TGSI_INTERPOLATE_LOC_CENTER:
2188 key->ps_prolog.color_interp_vgpr_index[i] = 2;
2189 if (separate_prolog) {
2190 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
2191 }
2192 break;
2193 case TGSI_INTERPOLATE_LOC_CENTROID:
2194 key->ps_prolog.color_interp_vgpr_index[i] = 4;
2195 if (separate_prolog) {
2196 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTROID_ENA(1);
2197 }
2198 break;
2199 default:
2200 assert(0);
2201 }
2202 break;
2203 case INTERP_MODE_NOPERSPECTIVE:
2204 /* Force the interpolation location for colors here. */
2205 if (shader->key.part.ps.prolog.force_linear_sample_interp)
2206 location = TGSI_INTERPOLATE_LOC_SAMPLE;
2207 if (shader->key.part.ps.prolog.force_linear_center_interp)
2208 location = TGSI_INTERPOLATE_LOC_CENTER;
2209
2210 /* The VGPR assignment for non-monolithic shaders
2211 * works because InitialPSInputAddr is set on the
2212 * main shader and PERSP_PULL_MODEL is never used.
2213 */
2214 switch (location) {
2215 case TGSI_INTERPOLATE_LOC_SAMPLE:
2216 key->ps_prolog.color_interp_vgpr_index[i] = separate_prolog ? 6 : 9;
2217 if (separate_prolog) {
2218 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
2219 }
2220 break;
2221 case TGSI_INTERPOLATE_LOC_CENTER:
2222 key->ps_prolog.color_interp_vgpr_index[i] = separate_prolog ? 8 : 11;
2223 if (separate_prolog) {
2224 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
2225 }
2226 break;
2227 case TGSI_INTERPOLATE_LOC_CENTROID:
2228 key->ps_prolog.color_interp_vgpr_index[i] = separate_prolog ? 10 : 13;
2229 if (separate_prolog) {
2230 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTROID_ENA(1);
2231 }
2232 break;
2233 default:
2234 assert(0);
2235 }
2236 break;
2237 default:
2238 assert(0);
2239 }
2240 }
2241 }
2242 }
2243
2244 /**
2245 * Check whether a PS prolog is required based on the key.
2246 */
2247 bool si_need_ps_prolog(const union si_shader_part_key *key)
2248 {
2249 return key->ps_prolog.colors_read || key->ps_prolog.states.force_persp_sample_interp ||
2250 key->ps_prolog.states.force_linear_sample_interp ||
2251 key->ps_prolog.states.force_persp_center_interp ||
2252 key->ps_prolog.states.force_linear_center_interp ||
2253 key->ps_prolog.states.bc_optimize_for_persp ||
2254 key->ps_prolog.states.bc_optimize_for_linear || key->ps_prolog.states.poly_stipple ||
2255 key->ps_prolog.states.samplemask_log_ps_iter;
2256 }
2257
2258 /**
2259 * Compute the PS epilog key, which contains all the information needed to
2260 * build the PS epilog function.
2261 */
2262 void si_get_ps_epilog_key(struct si_shader *shader, union si_shader_part_key *key)
2263 {
2264 struct si_shader_info *info = &shader->selector->info;
2265 memset(key, 0, sizeof(*key));
2266 key->ps_epilog.colors_written = info->colors_written;
2267 key->ps_epilog.writes_z = info->writes_z;
2268 key->ps_epilog.writes_stencil = info->writes_stencil;
2269 key->ps_epilog.writes_samplemask = info->writes_samplemask;
2270 key->ps_epilog.states = shader->key.part.ps.epilog;
2271 }
2272
2273 /**
2274 * Select and compile (or reuse) pixel shader parts (prolog & epilog).
2275 */
2276 static bool si_shader_select_ps_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
2277 struct si_shader *shader, struct pipe_debug_callback *debug)
2278 {
2279 union si_shader_part_key prolog_key;
2280 union si_shader_part_key epilog_key;
2281
2282 /* Get the prolog. */
2283 si_get_ps_prolog_key(shader, &prolog_key, true);
2284
2285 /* The prolog is a no-op if these aren't set. */
2286 if (si_need_ps_prolog(&prolog_key)) {
2287 shader->prolog =
2288 si_get_shader_part(sscreen, &sscreen->ps_prologs, MESA_SHADER_FRAGMENT, true, &prolog_key,
2289 compiler, debug, si_llvm_build_ps_prolog, "Fragment Shader Prolog");
2290 if (!shader->prolog)
2291 return false;
2292 }
2293
2294 /* Get the epilog. */
2295 si_get_ps_epilog_key(shader, &epilog_key);
2296
2297 shader->epilog =
2298 si_get_shader_part(sscreen, &sscreen->ps_epilogs, MESA_SHADER_FRAGMENT, false, &epilog_key,
2299 compiler, debug, si_llvm_build_ps_epilog, "Fragment Shader Epilog");
2300 if (!shader->epilog)
2301 return false;
2302
2303 /* Enable POS_FIXED_PT if polygon stippling is enabled. */
2304 if (shader->key.part.ps.prolog.poly_stipple) {
2305 shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
2306 assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr));
2307 }
2308
2309 /* Set up the enable bits for per-sample shading if needed. */
2310 if (shader->key.part.ps.prolog.force_persp_sample_interp &&
2311 (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
2312 G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
2313 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
2314 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
2315 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
2316 }
2317 if (shader->key.part.ps.prolog.force_linear_sample_interp &&
2318 (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
2319 G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
2320 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
2321 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
2322 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
2323 }
2324 if (shader->key.part.ps.prolog.force_persp_center_interp &&
2325 (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
2326 G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
2327 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA;
2328 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
2329 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
2330 }
2331 if (shader->key.part.ps.prolog.force_linear_center_interp &&
2332 (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
2333 G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
2334 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA;
2335 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
2336 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
2337 }
2338
2339    /* POS_W_FLOAT requires that one of the perspective weights is enabled. */
2340 if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
2341 !(shader->config.spi_ps_input_ena & 0xf)) {
2342 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
2343 assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr));
2344 }
2345
2346 /* At least one pair of interpolation weights must be enabled. */
2347 if (!(shader->config.spi_ps_input_ena & 0x7f)) {
2348 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
2349 assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr));
2350 }
2351
2352 /* Samplemask fixup requires the sample ID. */
2353 if (shader->key.part.ps.prolog.samplemask_log_ps_iter) {
2354 shader->config.spi_ps_input_ena |= S_0286CC_ANCILLARY_ENA(1);
2355 assert(G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr));
2356 }
2357
2358 /* The sample mask input is always enabled, because the API shader always
2359 * passes it through to the epilog. Disable it here if it's unused.
2360 */
2361 if (!shader->key.part.ps.epilog.poly_line_smoothing && !shader->selector->info.reads_samplemask)
2362 shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;
2363
2364 return true;
2365 }
2366
2367 void si_multiwave_lds_size_workaround(struct si_screen *sscreen, unsigned *lds_size)
2368 {
2369    /* Tessellation is all offchip and on-chip GS isn't used, so this
2370     * workaround is not needed; the original workaround is kept below for reference.
2371     */
2372    return;
2373
2374 /* SPI barrier management bug:
2375 * Make sure we have at least 4k of LDS in use to avoid the bug.
2376 * It applies to workgroup sizes of more than one wavefront.
2377 */
2378 if (sscreen->info.family == CHIP_BONAIRE || sscreen->info.family == CHIP_KABINI)
2379 *lds_size = MAX2(*lds_size, 8);
2380 }
2381
2382 void si_fix_resource_usage(struct si_screen *sscreen, struct si_shader *shader)
2383 {
2384 unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */
2385
2386 shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);
2387
2388 if (shader->selector->info.stage == MESA_SHADER_COMPUTE &&
2389 si_get_max_workgroup_size(shader) > sscreen->compute_wave_size) {
2390 si_multiwave_lds_size_workaround(sscreen, &shader->config.lds_size);
2391 }
2392 }
2393
2394 bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
2395 struct si_shader *shader, struct pipe_debug_callback *debug)
2396 {
2397 struct si_shader_selector *sel = shader->selector;
2398 struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key);
2399
2400 /* LS, ES, VS are compiled on demand if the main part hasn't been
2401 * compiled for that stage.
2402 *
2403     * GS is compiled on demand if the main part hasn't been compiled
2404 * for the chosen NGG-ness.
2405 *
2406 * Vertex shaders are compiled on demand when a vertex fetch
2407 * workaround must be applied.
2408 */
2409 if (shader->is_monolithic) {
2410 /* Monolithic shader (compiled as a whole, has many variants,
2411 * may take a long time to compile).
2412 */
2413 if (!si_compile_shader(sscreen, compiler, shader, debug))
2414 return false;
2415 } else {
2416 /* The shader consists of several parts:
2417 *
2418 * - the middle part is the user shader, it has 1 variant only
2419 * and it was compiled during the creation of the shader
2420 * selector
2421 * - the prolog part is inserted at the beginning
2422 * - the epilog part is inserted at the end
2423 *
2424 * The prolog and epilog have many (but simple) variants.
2425 *
2426 * Starting with gfx9, geometry and tessellation control
2427 * shaders also contain the prolog and user shader parts of
2428 * the previous shader stage.
2429 */
2430
2431 if (!mainp)
2432 return false;
2433
2434 /* Copy the compiled shader data over. */
2435 shader->is_binary_shared = true;
2436 shader->binary = mainp->binary;
2437 shader->config = mainp->config;
2438 shader->info.num_input_sgprs = mainp->info.num_input_sgprs;
2439 shader->info.num_input_vgprs = mainp->info.num_input_vgprs;
2440 shader->info.face_vgpr_index = mainp->info.face_vgpr_index;
2441 shader->info.ancillary_vgpr_index = mainp->info.ancillary_vgpr_index;
2442 memcpy(shader->info.vs_output_param_offset, mainp->info.vs_output_param_offset,
2443 sizeof(mainp->info.vs_output_param_offset));
2444 shader->info.uses_instanceid = mainp->info.uses_instanceid;
2445 shader->info.nr_pos_exports = mainp->info.nr_pos_exports;
2446 shader->info.nr_param_exports = mainp->info.nr_param_exports;
2447
2448 /* Select prologs and/or epilogs. */
2449 switch (sel->info.stage) {
2450 case MESA_SHADER_VERTEX:
2451 if (!si_shader_select_vs_parts(sscreen, compiler, shader, debug))
2452 return false;
2453 break;
2454 case MESA_SHADER_TESS_CTRL:
2455 if (!si_shader_select_tcs_parts(sscreen, compiler, shader, debug))
2456 return false;
2457 break;
2458 case MESA_SHADER_TESS_EVAL:
2459 break;
2460 case MESA_SHADER_GEOMETRY:
2461 if (!si_shader_select_gs_parts(sscreen, compiler, shader, debug))
2462 return false;
2463 break;
2464 case MESA_SHADER_FRAGMENT:
2465 if (!si_shader_select_ps_parts(sscreen, compiler, shader, debug))
2466 return false;
2467
2468 /* Make sure we have at least as many VGPRs as there
2469 * are allocated inputs.
2470 */
2471 shader->config.num_vgprs = MAX2(shader->config.num_vgprs, shader->info.num_input_vgprs);
2472 break;
2473 default:;
2474 }
2475
2476 /* Update SGPR and VGPR counts. */
2477 if (shader->prolog) {
2478 shader->config.num_sgprs =
2479 MAX2(shader->config.num_sgprs, shader->prolog->config.num_sgprs);
2480 shader->config.num_vgprs =
2481 MAX2(shader->config.num_vgprs, shader->prolog->config.num_vgprs);
2482 }
2483 if (shader->previous_stage) {
2484 shader->config.num_sgprs =
2485 MAX2(shader->config.num_sgprs, shader->previous_stage->config.num_sgprs);
2486 shader->config.num_vgprs =
2487 MAX2(shader->config.num_vgprs, shader->previous_stage->config.num_vgprs);
2488 shader->config.spilled_sgprs =
2489 MAX2(shader->config.spilled_sgprs, shader->previous_stage->config.spilled_sgprs);
2490 shader->config.spilled_vgprs =
2491 MAX2(shader->config.spilled_vgprs, shader->previous_stage->config.spilled_vgprs);
2492 shader->info.private_mem_vgprs =
2493 MAX2(shader->info.private_mem_vgprs, shader->previous_stage->info.private_mem_vgprs);
2494 shader->config.scratch_bytes_per_wave =
2495 MAX2(shader->config.scratch_bytes_per_wave,
2496 shader->previous_stage->config.scratch_bytes_per_wave);
2497 shader->info.uses_instanceid |= shader->previous_stage->info.uses_instanceid;
2498 }
2499 if (shader->prolog2) {
2500 shader->config.num_sgprs =
2501 MAX2(shader->config.num_sgprs, shader->prolog2->config.num_sgprs);
2502 shader->config.num_vgprs =
2503 MAX2(shader->config.num_vgprs, shader->prolog2->config.num_vgprs);
2504 }
2505 if (shader->epilog) {
2506 shader->config.num_sgprs =
2507 MAX2(shader->config.num_sgprs, shader->epilog->config.num_sgprs);
2508 shader->config.num_vgprs =
2509 MAX2(shader->config.num_vgprs, shader->epilog->config.num_vgprs);
2510 }
2511 si_calculate_max_simd_waves(shader);
2512 }
2513
2514 if (shader->key.as_ngg) {
2515 assert(!shader->key.as_es && !shader->key.as_ls);
2516 if (!gfx10_ngg_calculate_subgroup_info(shader)) {
2517 fprintf(stderr, "Failed to compute subgroup info\n");
2518 return false;
2519 }
2520 } else if (sscreen->info.chip_class >= GFX9 && sel->info.stage == MESA_SHADER_GEOMETRY) {
2521 gfx9_get_gs_info(shader->previous_stage_sel, sel, &shader->gs_info);
2522 }
2523
2524 si_fix_resource_usage(sscreen, shader);
2525 si_shader_dump(sscreen, shader, debug, stderr, true);
2526
2527 /* Upload. */
2528 if (!si_shader_binary_upload(sscreen, shader, 0)) {
2529 fprintf(stderr, "LLVM failed to upload shader\n");
2530 return false;
2531 }
2532
2533 return true;
2534 }
2535
2536 void si_shader_binary_clean(struct si_shader_binary *binary)
2537 {
2538 free((void *)binary->elf_buffer);
2539 binary->elf_buffer = NULL;
2540
2541 free(binary->llvm_ir_string);
2542 binary->llvm_ir_string = NULL;
2543 }
2544
2545 void si_shader_destroy(struct si_shader *shader)
2546 {
2547 if (shader->scratch_bo)
2548 si_resource_reference(&shader->scratch_bo, NULL);
2549
2550 si_resource_reference(&shader->bo, NULL);
2551
2552 if (!shader->is_binary_shared)
2553 si_shader_binary_clean(&shader->binary);
2554
2555 free(shader->shader_log);
2556 }